libsmatrix 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 5ed4aa20301e6e40e3a9b6b422c081c9b4b00e07
4
+ data.tar.gz: 4b43c976b08df58841171aaa39919a885f7650f1
5
+ SHA512:
6
+ metadata.gz: 7d647cd4bdfd5159f38e31955471418ec7a32df1e2d674de13a6c60a2f62ab2857becc4918fbe5d47a1b7f61bc8dcf4e15f6e11729f125f361c6a91e9de4a5dc
7
+ data.tar.gz: 0700cc4e0b013a5a86be04590e581165e7c8cd3ac4789e16e6f690ed422253308ec6368a8933a2d10009692b04962ce399922831c2e169cbed0ddfca00d9a863
@@ -0,0 +1,52 @@
1
+ # This file is part of the "libsmatrix" project
2
+ # (c) 2011-2013 Paul Asmuth <paul@paulasmuth.com>
3
+ #
4
+ # Licensed under the MIT License (the "License"); you may not use this
5
+ # file except in compliance with the License. You may obtain a copy of
6
+ # the License at: http://opensource.org/licenses/MIT
7
+
8
+ include src/Makefile.in
9
+
10
# Toolchain and installation settings.
# NOTE(review): LIBEXT is not defined in this file; presumably it is provided
# by the included src/Makefile.in -- confirm.
SHELL = /bin/sh
CC = clang
CFLAGS_ = $(CFLAGS) -Wall -Wextra -O3 -march=native -mtune=native -D NDEBUG -fPIC
LDFLAGS = -lpthread -lm -lruby
PREFIX = $(DESTDIR)/usr/local
LIBDIR = $(PREFIX)/lib
UNAME = $(shell uname)
SOURCES = src/smatrix.c src/smatrix_jni.c src/smatrix_ruby.c

# None of these targets produce a file of the same name; declaring them phony
# keeps them working even if such a file or directory exists.
.PHONY: all install clean ruby publish_ruby java publish_java benchmark test

# Default target: build the shared library.
all: src/smatrix.$(LIBEXT)

# Delegate to src/Makefile. $(MAKE) (instead of a literal "make") forwards
# flags such as -j and -n to the sub-make.
src/smatrix.$(LIBEXT):
	$(MAKE) -C src

# Copy the shared library into $(LIBDIR), creating the directory if needed.
install:
	mkdir -p $(LIBDIR)
	cp src/smatrix.$(LIBEXT) $(LIBDIR)

# Remove build artifacts. "rm -f" keeps the target from failing when find
# matches nothing.
clean:
	find . -name "*.o" -o -name "*.class" -o -name "*.so" -o -name "*.dylib" -o -name "*.bundle" | xargs rm -f
	rm -rf src/java/target src/config.h src/smatrix_benchmark *.gem

# Build the ruby binding.
ruby:
	cd src/ruby && ruby extconf.rb
	$(MAKE) -C src/ruby

# Build the gem and move it into src/ruby/.
publish_ruby:
	gem build src/ruby/libsmatrix.gemspec
	mv *.gem src/ruby/

# Build the java binding.
java:
	$(MAKE) -C src/java

# Build the java binding and deploy it via maven.
publish_java: java
	cd src/java && mvn deploy

# Build (if missing) and run the benchmark binary.
benchmark: src/smatrix_benchmark
	src/smatrix_benchmark full

src/smatrix_benchmark:
	$(MAKE) -C src smatrix_benchmark

# Run the test target of src/java/Makefile.
test:
	$(MAKE) -C src/java test
@@ -0,0 +1,188 @@
1
+ libsmatrix
2
+ ==========
3
+
4
+ A thread-safe two-dimensional sparse matrix data structure with C, Java and Ruby bindings.
5
+ It was created to make loading and accessing medium-sized (10GB+) matrices in boxed languages
6
+ like Java/Scala or Ruby easier.
7
+
8
+ While the chosen internal storage format (nested hashmaps) is neither the most memory-efficient
9
+ nor extremely fast in terms of access/insert time it seems to be a good tradeoff between these
10
+ two goals.
11
+
12
+ A libsmatrix sparse matrix features two modes of operation; a memory-only mode in which all data
13
+ is kept in main memory and a mode in which the data is stored on disk and only a pool of recently
14
+ used rows is kept in memory. In this mode the data is persisted across program restarts. It also
15
+ allows you to handle datasets larger than your available main memory.
16
+
17
+ #### Documentation
18
+
19
+ + [Getting Started](#getting-started-building)
20
+ + [C API](#c-api)
21
+ + [Java/Scala API](#java--scala-api)
22
+ + [Ruby API](#ruby-api)
23
+ + [Internals](#internals)
24
+ + [Benchmarks](#benchmarks)
25
+ + [Examples](#examples)
26
+ + [License](#license)
27
+
28
+
29
+ Getting Started (Building)
30
+ --------------------------
31
+
32
+ There are multiple ways to install libsmatrix:
33
+
34
+ ### Compile from source
35
+
36
+ This will produce a single shared object "smatrix.so" file that exports all calls documented
37
+ in "C API".
38
+
39
+ $ make
40
+ $ make install
41
+
42
+ To run the tests/benchmarks (optional, requires java and ruby)
43
+
44
+ $ make test
45
+ $ make benchmark
46
+
47
+ To build the MRI ruby and Java JNI bindings (optional), run:
48
+
49
+ $ make ruby
50
+ $ make java
51
+
52
+ This will produce the respective shared objects and bundles in:
53
+
54
+ src/ruby/smatrix_ruby.so
55
+ src/ruby/smatrix_X.X.X.gem
56
+
57
+ src/java/smatrix_java.so
58
+ src/java/target/libsmatrix-X.X-SNAPSHOT.jar
59
+
60
+ ### Import artifact via Maven/sbt (java/scala)
61
+
62
+ Currently the maven artifact only contains the binding glue code and doesn't actually build
63
+ the native shared object. You need to compile & install "libsmatrix.so" yourself on the target
64
+ host, otherwise you'll get an "UnsatisfiedLinkError".
65
+
66
+ Import artifact via sbt:
67
+
68
+ resolvers += "sbt-libsmatrix-repo" at "https://raw.github.com/paulasmuth/libsmatrix/mvn-repo/"
69
+
70
+ libraryDependencies += "com.paulasmuth.libsmatrix" % "libsmatrix" % "0.2-SNAPSHOT"
71
+
72
+ Import artifact via Maven2 (put this into your pom.xml):
73
+
74
+ <repository>
75
+ <id>libsmatrix-mvn-repo</id>
76
+ <url>https://raw.github.com/paulasmuth/libsmatrix/mvn-repo/</url>
77
+ <snapshots>
78
+ <enabled>true</enabled>
79
+ <updatePolicy>always</updatePolicy>
80
+ </snapshots>
81
+ </repository>
82
+
83
+ To publish the maven artifact from source, check out libsmatrix and run this:
84
+
85
+ $ make publish_java
86
+
87
+
88
+ ### Import gem via rubygems (ruby only)
89
+
90
+ This will install the ruby bindings and compile the native shared object:
91
+
92
+ $ gem install libsmatrix
93
+
94
+ To use libsmatrix in your project, require it like this:
95
+
96
+ require "libsmatrix"
97
+
98
+ To build and publish the ruby gem run:
99
+
100
+ $ make publish_ruby
101
+
102
+ C API
103
+ -----
104
+
105
+ Open a smatrix (if filename is NULL, use in memory only mode; otherwise open or create file)
106
+
107
+ smatrix_t* smatrix_open(const char* fname);
108
+
109
+ Close a smatrix:
110
+
111
+ void smatrix_close(smatrix_t* self);
112
+
113
+ Get, Set, Increment, Decrement a (x,y) position. _All of the methods are threadsafe_
114
+
115
+ uint32_t smatrix_get(smatrix_t* self, uint32_t x, uint32_t y);
116
+ uint32_t smatrix_set(smatrix_t* self, uint32_t x, uint32_t y, uint32_t value);
117
+ uint32_t smatrix_incr(smatrix_t* self, uint32_t x, uint32_t y, uint32_t value);
118
+ uint32_t smatrix_decr(smatrix_t* self, uint32_t x, uint32_t y, uint32_t value);
119
+
120
+ Get a whole "row" of the matrix by row coordinate x. _All of the methods are threadsafe_
121
+
122
+ uint32_t smatrix_rowlen(smatrix_t* self, uint32_t x);
123
+ uint32_t smatrix_getrow(smatrix_t* self, uint32_t x, uint32_t* ret, size_t ret_len);
124
+
125
+
126
+ Java / Scala API
127
+ ----------------
128
+
129
+ here be dragons
130
+
131
+
132
+ Ruby API
133
+ --------
134
+
135
+
136
+ Require the gem:
137
+
138
+ $ require 'libsmatrix'
139
+
140
+
141
+ Create a new smatrix instance:
142
+
143
+ $ smatrix = SparseMatrix.new("/path/to/smatrix.smx")
144
+
145
+ Get, Set, Increment, Decrement a (x,y) position
146
+
147
+ $ smatrix.set(x, y, 5)
148
+ => 5
149
+ $ smatrix.get(x, y)
150
+ => 5
151
+ $ smatrix.incr(x, y, 1)
152
+ => 6
153
+ $ smatrix.decr(x, y, 1)
154
+ => 5
155
+
156
+ Close and free the matrix (data is persisted to disk):
157
+
158
+ $ smatrix = nil
159
+
160
+
161
+ Benchmarks
162
+ ----------
163
+
164
+ **No big-data disclaimer:** We are using this code to run a Collaborative Filtering
165
+ recommendation engine for one of Germany's largest ecommerce sites. It is tested on "small-data"
166
+ datasets with up to 40GB per matrix (1.5 billion values in 13 million rows). If your data is
167
+ actually much bigger (measured in terabytes, not gigabytes) this library is not for you.
168
+
169
+ here be dragons
170
+
171
+
172
+ Examples
173
+ -------
174
+
175
+ + There is a simple example in src/smatrix_example.c
176
+ + There is a simple Collaborative Filtering based recommendation engine in src/smatrix_example_recommender.c
177
+
178
+
179
+ License
180
+ -------
181
+
182
+ Copyright (c) 2011 Paul Asmuth
183
+
184
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to use, copy and modify copies of the Software, subject to the following conditions:
185
+
186
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
187
+
188
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,2 @@
1
+ *.o
2
+ *.class
@@ -0,0 +1,24 @@
1
+ /**
2
+ * This file is part of the "libsmatrix" project
3
+ * (c) 2011-2013 Paul Asmuth <paul@paulasmuth.com>
4
+ *
5
+ * Licensed under the MIT License (the "License"); you may not use this
6
+ * file except in compliance with the License. You may obtain a copy of
7
+ * the License at: http://opensource.org/licenses/MIT
8
+ */
9
+ import com.paulasmuth.libsmatrix.SparseMatrix;
10
+
11
+ /**
12
+ * Compile & run this example:
13
+ *
14
+ * $ javac CFRecommender.java && java CFRecommender
15
+ *
16
+ */
17
+ class CFRecommender {
18
+
19
+ public static void main(String[] opts) {
20
+ SparseMatrix smx = new SparseMatrix();
21
+ smx.test();
22
+ }
23
+
24
+ }
@@ -0,0 +1,87 @@
1
+ // This file is part of the "libsmatrix" project
2
+ // (c) 2011-2013 Paul Asmuth <paul@paulasmuth.com>
3
+ //
4
+ // Licensed under the MIT License (the "License"); you may not use this
5
+ // file except in compliance with the License. You may obtain a copy of
6
+ // the License at: http://opensource.org/licenses/MIT
7
+
8
#include <math.h>
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "smatrix.h"
14
+
15
+ smatrix_t* my_smatrix;
16
+
17
+ // libsmatrix example: simple CF based recommendation engine
18
+ int main(int argc, char **argv) {
19
+ my_smatrix = smatrix_open(NULL);
20
+
21
+ // one preference set = list of items in one session
22
+ // e.g. list of viewed items by the same user
23
+ // e.g. list of bought items in the same checkout
24
+ uint32_t input_ids[5] = {12,52,63,76,43};
25
+ import_preference_set(input_ids, 5);
26
+
27
+ // generate recommendations (similar items) for item #76
28
+ void neighbors_for_item(76);
29
+
30
+ smatrix_close(my_smatrix);
31
+ return 0;
32
+ }
33
+
34
+ // train / add a preference set (list of items in one session)
35
+ void import_preference_set(uint32_t* ids, uint32_t num_ids) {
36
+ uint32_t i, n;
37
+
38
+ for (n = 0; n < num_ids; n++) {
39
+ smatrix_incr(my_smatrix, ids[n], 0, 1);
40
+
41
+ for (i = 0; i < pset->len; i++) {
42
+ if (i != n) {
43
+ smatrix_incr(my_smatrix, ids[n], ids[i], 1);
44
+ }
45
+ }
46
+ }
47
+ }
48
+
49
+ // get recommendations for item with id "item_id"
50
+ void neighbors_for_item(uint32_t item_id)
51
+ uint32_t neighbors, *row, total;
52
+
53
+ total = smatrix_get(my_smatrix, item_id, 0);
54
+ neighbors = smatrix_getrow(my_smatrix, item_id, row, 8192);
55
+
56
+ for (pos = 0; pos < neighbors; pos++) {
57
+ uint32_t cur_id = row[pos * 2];
58
+
59
+ printf("found neighbor for item %u: item %u with distance %f\n",
60
+ item_id, cf_cosine(smatrix, cur_id, row[pos * 2 + 1], total));
61
+ }
62
+
63
+ free(row);
64
+ }
65
+
66
+ // calculates the cosine vector distance between two items
67
+ double cf_cosine(smatrix_t* smatrix, uint32_t b_id, uint32_t cc_count, uint32_t a_total) {
68
+ uint32_t b_total;
69
+ double num, den;
70
+
71
+ b_total = smatrix_get(smatrix, b_id, 0);
72
+
73
+ if (b_total == 0)
74
+ b_total = 1;
75
+
76
+ num = cc_count;
77
+ den = sqrt((double) a_total) * sqrt((double) b_total);
78
+
79
+ if (den == 0.0)
80
+ return 0.0;
81
+
82
+ if (num > den)
83
+ return 0.0;
84
+
85
+ return (num / den);
86
+ }
87
+
@@ -0,0 +1,75 @@
1
+ // This file is part of the "libsmatrix" project
2
+ // (c) 2011-2013 Paul Asmuth <paul@paulasmuth.com>
3
+ //
4
+ // Licensed under the MIT License (the "License"); you may not use this
5
+ // file except in compliance with the License. You may obtain a copy of
6
+ // the License at: http://opensource.org/licenses/MIT
7
+
8
+ #include <stdio.h>
9
+ #include <string.h>
10
+ #include <stdlib.h>
11
+ #include <pthread.h>
12
+
13
+ #include "smatrix.h"
14
+
15
+ smatrix_t* db;
16
+
17
+ void* test(void* fnord) {
18
+ uint64_t i, n, m;
19
+
20
+ for (m = 0; m < 100; m++) {
21
+ for (n = 1; n < 30; n++) {
22
+ for (i = 1; i < 50; i++) {
23
+ smatrix_incr(db, n, i, 1);
24
+ }
25
+ }
26
+ }
27
+
28
+ return NULL;
29
+ }
30
+
31
+ int main(int argc, char **argv) {
32
+ int i,n,m,l,x=0, num_threads = 4;
33
+ pthread_t threads[num_threads];
34
+
35
+ printf("\nloading\n");
36
+ db = smatrix_open("/var/tmp/reco.db");
37
+ //db = smatrix_open(NULL); // in-memory only mode
38
+
39
+ if (db == NULL)
40
+ abort();
41
+
42
+ printf("\nstarting\n");
43
+
44
+ for (n = 0; n < num_threads; n++)
45
+ pthread_create(&threads[n], NULL, test, NULL);
46
+
47
+ for (n = 0; n < num_threads; n++)
48
+ pthread_join(threads[n], NULL);
49
+
50
+ printf("\ndone\n");
51
+
52
+ for (n = 1; n < 30; n++) {
53
+ for (i = 1; i < 50; i++) {
54
+ printf("(%u,%u) => %u, ", n, i, smatrix_get(db, n, i));
55
+ if (x++ % 5 == 0) printf("\n");
56
+ }
57
+ }
58
+
59
+ printf("rowlen: %u\n", l = smatrix_rowlen(db, 23));
60
+ size_t bytes = sizeof(uint32_t) * l * 2;
61
+ uint32_t* data = malloc(bytes);
62
+
63
+ l = smatrix_getrow(db, 23, data, bytes);
64
+
65
+ for (i = 0; i < l; i++) {
66
+ printf("%u => %u, ", data[i * 2], data[i * 2 + 1]);
67
+ }
68
+
69
+ printf("\n");
70
+
71
+ smatrix_close(db);
72
+ printf("in use at exit: %lu\n", db->mem);
73
+
74
+ return 0;
75
+ }
@@ -0,0 +1,28 @@
1
+ # This file is part of the "libsmatrix" project
2
+ # (c) 2011-2013 Paul Asmuth <paul@paulasmuth.com>
3
+ #
4
+ # Licensed under the MIT License (the "License"); you may not use this
5
+ # file except in compliance with the License. You may obtain a copy of
6
+ # the License at: http://opensource.org/licenses/MIT
7
+
8
+ include Makefile.in
9
+
10
# Name of the shared library produced by this Makefile.
# NOTE(review): LIBEXT and LIBFLAGS are not defined in this file; presumably
# they come from the included Makefile.in -- confirm.
TARGET = smatrix.$(LIBEXT)

# "all" does not name a file; keep it working even if a file called "all"
# exists in this directory.
.PHONY: all

all: $(TARGET)

# Link the shared library from the single object file.
$(TARGET): smatrix.o
	$(CC) $(LIBFLAGS) smatrix.o -o $(TARGET) $(LDFLAGS)

# Compile the library; rebuilt whenever one of its headers changes.
smatrix.o: config.h smatrix.c smatrix.h smatrix_private.h
	$(CC) -c $(CFLAGS) smatrix.c -o smatrix.o

# Create an empty config.h if none exists.
config.h:
	touch config.h

# Generate the JNI header from the compiled SparseMatrix class.
smatrix_jni.h:
	javac com/paulasmuth/libsmatrix/SparseMatrix.java
	javah -o smatrix_jni.h -classpath . com.paulasmuth.libsmatrix.SparseMatrix

# Build the benchmark binary, linked directly against the object file.
smatrix_benchmark: smatrix.o smatrix_benchmark.c
	$(CC) $(CFLAGS) smatrix_benchmark.c smatrix.o -o smatrix_benchmark $(LDFLAGS)