libsmatrix 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/Makefile +52 -0
- data/README.md +188 -0
- data/examples/.gitignore +2 -0
- data/examples/CFRecommender.java +24 -0
- data/examples/cf_recommender.c +87 -0
- data/examples/smatrix_example.c +75 -0
- data/src/Makefile +28 -0
- data/src/Makefile.in +22 -0
- data/src/java/Makefile +35 -0
- data/src/java/com/paulasmuth/libsmatrix/SparseMatrix.java +146 -0
- data/src/java/pom.xml +68 -0
- data/src/java/test/TestSparseMatrix.java +207 -0
- data/src/ruby/.gitignore +1 -0
- data/src/ruby/Makefile +21 -0
- data/src/ruby/extconf.rb +18 -0
- data/src/ruby/libsmatrix.gemspec +20 -0
- data/src/ruby/libsmatrix.rb +6 -0
- data/src/smatrix.c +960 -0
- data/src/smatrix.h +96 -0
- data/src/smatrix_benchmark.c +236 -0
- data/src/smatrix_jni.c +161 -0
- data/src/smatrix_jni.h +77 -0
- data/src/smatrix_private.h +51 -0
- data/src/smatrix_ruby.c +178 -0
- data/src/smatrix_ruby.h +29 -0
- metadata +90 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 5ed4aa20301e6e40e3a9b6b422c081c9b4b00e07
|
4
|
+
data.tar.gz: 4b43c976b08df58841171aaa39919a885f7650f1
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 7d647cd4bdfd5159f38e31955471418ec7a32df1e2d674de13a6c60a2f62ab2857becc4918fbe5d47a1b7f61bc8dcf4e15f6e11729f125f361c6a91e9de4a5dc
|
7
|
+
data.tar.gz: 0700cc4e0b013a5a86be04590e581165e7c8cd3ac4789e16e6f690ed422253308ec6368a8933a2d10009692b04962ce399922831c2e169cbed0ddfca00d9a863
|
data/Makefile
ADDED
@@ -0,0 +1,52 @@
|
|
1
|
+
# This file is part of the "libsmatrix" project
|
2
|
+
# (c) 2011-2013 Paul Asmuth <paul@paulasmuth.com>
|
3
|
+
#
|
4
|
+
# Licensed under the MIT License (the "License"); you may not use this
|
5
|
+
# file except in compliance with the License. You may obtain a copy of
|
6
|
+
# the License at: http://opensource.org/licenses/MIT
|
7
|
+
|
8
|
+
# Top-level build driver: delegates the actual compilation to the Makefiles
# in src/, src/ruby and src/java.
#
# NOTE(review): LIBEXT is not defined in this file; presumably it comes from
# src/Makefile.in -- confirm.
include src/Makefile.in

SHELL = /bin/sh
CC = clang
CFLAGS_ = $(CFLAGS) -Wall -Wextra -O3 -march=native -mtune=native -D NDEBUG -fPIC
LDFLAGS = -lpthread -lm -lruby
PREFIX = $(DESTDIR)/usr/local
LIBDIR = $(PREFIX)/lib
UNAME = $(shell uname)
SOURCES = src/smatrix.c src/smatrix_jni.c src/smatrix_ruby.c

# These targets name actions, not files. Declaring them phony keeps a stray
# file or directory called e.g. "test" or "install" from silently disabling
# the rule.
.PHONY: all install clean ruby publish_ruby java publish_java benchmark test

all: src/smatrix.$(LIBEXT)

# Use $(MAKE) for every recursive invocation so that command line flags
# (-j, -n, -k, variable overrides) are passed down to the sub-make.
src/smatrix.$(LIBEXT):
	cd src && $(MAKE)

install:
	cp src/smatrix.$(LIBEXT) $(LIBDIR)

# "rm -f" keeps clean from failing when find matches nothing (xargs would
# otherwise run a bare "rm" that exits with an error).
clean:
	find . -name "*.o" -o -name "*.class" -o -name "*.so" -o -name "*.dylib" -o -name "*.bundle" | xargs rm -f
	rm -rf src/java/target src/config.h src/smatrix_benchmark *.gem

ruby:
	cd src/ruby && ruby extconf.rb
	cd src/ruby && $(MAKE)

publish_ruby:
	gem build src/ruby/libsmatrix.gemspec
	mv *.gem src/ruby/

java:
	cd src/java && $(MAKE)

publish_java: java
	cd src/java && mvn deploy

benchmark: src/smatrix_benchmark
	src/smatrix_benchmark full

src/smatrix_benchmark:
	cd src && $(MAKE) smatrix_benchmark

test:
	cd src/java && $(MAKE) test
|
data/README.md
ADDED
@@ -0,0 +1,188 @@
|
|
1
|
+
libsmatrix
|
2
|
+
==========
|
3
|
+
|
4
|
+
A thread-safe two dimensional sparse matrix data structure with C, Java and Ruby bindings.
|
5
|
+
It was created to make loading and accessing medium sized (10GB+) matrices in boxed languages
|
6
|
+
like Java/Scala or Ruby easier.
|
7
|
+
|
8
|
+
While the chosen internal storage format (nested hashmaps) is neither the most memory-efficient
|
9
|
+
nor extremely fast in terms of access/insert time it seems to be a good tradeoff between these
|
10
|
+
two goals.
|
11
|
+
|
12
|
+
A libsmatrix sparse matrix features two modes of operation: a memory-only mode in which all data
|
13
|
+
is kept in main memory and a mode in which the data is stored on disk and only a pool of recently
|
14
|
+
used rows is kept in memory. In this mode the data is persisted across program restarts. It also
|
15
|
+
allows you to handle datasets larger than your available main memory.
|
16
|
+
|
17
|
+
#### Documentation
|
18
|
+
|
19
|
+
+ [Getting Started](#getting-started-building)
|
20
|
+
+ [C API](#c-api)
|
21
|
+
+ [Java/Scala API](#java--scala-api)
|
22
|
+
+ [Ruby API](#ruby-api)
|
23
|
+
+ [Internals](#internals)
|
24
|
+
+ [Benchmarks](#benchmarks)
|
25
|
+
+ [Examples](#examples)
|
26
|
+
+ [License](#license)
|
27
|
+
|
28
|
+
|
29
|
+
Getting Started (Building)
|
30
|
+
--------------------------
|
31
|
+
|
32
|
+
There are multiple ways to install libsmatrix:
|
33
|
+
|
34
|
+
### Compile from source
|
35
|
+
|
36
|
+
This will produce a single shared object "smatrix.so" file that exports all calls documented
|
37
|
+
in "C API".
|
38
|
+
|
39
|
+
$ make
|
40
|
+
$ make install
|
41
|
+
|
42
|
+
To run the tests/benchmarks (optional, requires java and ruby)
|
43
|
+
|
44
|
+
$ make test
|
45
|
+
$ make benchmark
|
46
|
+
|
47
|
+
To build the MRI ruby and Java JNI bindings (optional), run:
|
48
|
+
|
49
|
+
$ make ruby
|
50
|
+
$ make java
|
51
|
+
|
52
|
+
This will produce the respective shared objects and bundles in:
|
53
|
+
|
54
|
+
src/ruby/smatrix_ruby.so
|
55
|
+
src/ruby/smatrix_X.X.X.gem
|
56
|
+
|
57
|
+
src/java/smatrix_java.so
|
58
|
+
src/java/target/libsmatrix-X.X-SNAPSHOT.jar
|
59
|
+
|
60
|
+
### Import artifact via Maven/sbt (java/scala)
|
61
|
+
|
62
|
+
Currently the maven artifact only contains the binding glue code and doesn't actually build
|
63
|
+
the native shared object. You need to compile & install "libsmatrix.so" yourself on the target
|
64
|
+
host, otherwise you'll get an "UnsatisfiedLinkError".
|
65
|
+
|
66
|
+
Import artifact via sbt:
|
67
|
+
|
68
|
+
resolvers += "sbt-libsmatrix-repo" at "https://raw.github.com/paulasmuth/libsmatrix/mvn-repo/"
|
69
|
+
|
70
|
+
libraryDependencies += "com.paulasmuth.libsmatrix" % "libsmatrix" % "0.2-SNAPSHOT"
|
71
|
+
|
72
|
+
Import artifact via Maven2 (put this into your pom.xml):
|
73
|
+
|
74
|
+
<repository>
|
75
|
+
<id>libsmatrix-mvn-repo</id>
|
76
|
+
<url>https://raw.github.com/paulasmuth/libsmatrix/mvn-repo/</url>
|
77
|
+
<snapshots>
|
78
|
+
<enabled>true</enabled>
|
79
|
+
<updatePolicy>always</updatePolicy>
|
80
|
+
</snapshots>
|
81
|
+
</repository>
|
82
|
+
|
83
|
+
To publish the maven artifact from source, check out libsmatrix and run this:
|
84
|
+
|
85
|
+
$ make publish_java
|
86
|
+
|
87
|
+
|
88
|
+
### Import gem via rubygems (ruby only)
|
89
|
+
|
90
|
+
This will install the ruby bindings and compile the native shared object:
|
91
|
+
|
92
|
+
$ gem install libsmatrix
|
93
|
+
|
94
|
+
To use libsmatrix in your project, require it like this:
|
95
|
+
|
96
|
+
require "libsmatrix"
|
97
|
+
|
98
|
+
To build and publish the ruby gem run:
|
99
|
+
|
100
|
+
$ make publish_ruby
|
101
|
+
|
102
|
+
C API
|
103
|
+
-----
|
104
|
+
|
105
|
+
Open a smatrix (if filename is NULL, use in memory only mode; otherwise open or create file)
|
106
|
+
|
107
|
+
smatrix_t* smatrix_open(const char* fname);
|
108
|
+
|
109
|
+
Close a smatrix:
|
110
|
+
|
111
|
+
void smatrix_close(smatrix_t* self);
|
112
|
+
|
113
|
+
Get, Set, Increment, Decrement a (x,y) position. _All of the methods are threadsafe_
|
114
|
+
|
115
|
+
uint32_t smatrix_get(smatrix_t* self, uint32_t x, uint32_t y);
|
116
|
+
uint32_t smatrix_set(smatrix_t* self, uint32_t x, uint32_t y, uint32_t value);
|
117
|
+
uint32_t smatrix_incr(smatrix_t* self, uint32_t x, uint32_t y, uint32_t value);
|
118
|
+
uint32_t smatrix_decr(smatrix_t* self, uint32_t x, uint32_t y, uint32_t value);
|
119
|
+
|
120
|
+
Get a whole "row" of the matrix by row coordinate x. _All of the methods are threadsafe_
|
121
|
+
|
122
|
+
uint32_t smatrix_rowlen(smatrix_t* self, uint32_t x);
|
123
|
+
uint32_t smatrix_getrow(smatrix_t* self, uint32_t x, uint32_t* ret, size_t ret_len);
|
124
|
+
|
125
|
+
|
126
|
+
Java / Scala API
|
127
|
+
----------------
|
128
|
+
|
129
|
+
here be dragons
|
130
|
+
|
131
|
+
|
132
|
+
Ruby API
|
133
|
+
--------
|
134
|
+
|
135
|
+
|
136
|
+
Require the gem:
|
137
|
+
|
138
|
+
$ require 'libsmatrix'
|
139
|
+
|
140
|
+
|
141
|
+
Create a new smatrix instance:
|
142
|
+
|
143
|
+
$ smatrix = SparseMatrix.new("/path/to/smatrix.smx")
|
144
|
+
|
145
|
+
Get, Set, Increment, Decrement a (x,y) position
|
146
|
+
|
147
|
+
$ smatrix.set(x, y, 5)
|
148
|
+
=> 5
|
149
|
+
$ smatrix.get(x, y)
|
150
|
+
=> 5
|
151
|
+
$ smatrix.incr(x, y, 1)
|
152
|
+
=> 6
|
153
|
+
$ smatrix.decr(x, y, 1)
|
154
|
+
=> 5
|
155
|
+
|
156
|
+
Close and free the matrix (data is persisted to disk):
|
157
|
+
|
158
|
+
$ smatrix = nil
|
159
|
+
|
160
|
+
|
161
|
+
Benchmarks
|
162
|
+
----------
|
163
|
+
|
164
|
+
**No big-data disclaimer:** We are using this code to run a Collaborative Filtering
|
165
|
+
recommendation engine for one of Germany's largest ecommerce sites. It is tested on "small-data"
|
166
|
+
datasets with up to 40GB per matrix (1.5 billion values in 13 million rows). If your data is
|
167
|
+
actually much bigger (measured in terabytes, not gigabytes) this library is not for you.
|
168
|
+
|
169
|
+
here be dragons
|
170
|
+
|
171
|
+
|
172
|
+
Examples
|
173
|
+
-------
|
174
|
+
|
175
|
+
+ There is a simple example in examples/smatrix_example.c
|
176
|
+
+ There is a simple Collaborative Filtering based recommendation engine in examples/cf_recommender.c
|
177
|
+
|
178
|
+
|
179
|
+
License
|
180
|
+
-------
|
181
|
+
|
182
|
+
Copyright (c) 2011 Paul Asmuth
|
183
|
+
|
184
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to use, copy and modify copies of the Software, subject to the following conditions:
|
185
|
+
|
186
|
+
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
187
|
+
|
188
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/examples/.gitignore
ADDED
@@ -0,0 +1,24 @@
|
|
1
|
+
/**
|
2
|
+
* This file is part of the "libsmatrix" project
|
3
|
+
* (c) 2011-2013 Paul Asmuth <paul@paulasmuth.com>
|
4
|
+
*
|
5
|
+
* Licensed under the MIT License (the "License"); you may not use this
|
6
|
+
* file except in compliance with the License. You may obtain a copy of
|
7
|
+
* the License at: http://opensource.org/licenses/MIT
|
8
|
+
*/
|
9
|
+
import com.paulasmuth.libsmatrix.SparseMatrix;
|
10
|
+
|
11
|
+
/**
|
12
|
+
* Compile & run this example:
|
13
|
+
*
|
14
|
+
* $ javac CFRecommender.java && java CFRecommender
|
15
|
+
*
|
16
|
+
*/
|
17
|
+
class CFRecommender {
|
18
|
+
|
19
|
+
public static void main(String[] opts) {
|
20
|
+
SparseMatrix smx = new SparseMatrix();
|
21
|
+
smx.test();
|
22
|
+
}
|
23
|
+
|
24
|
+
}
|
@@ -0,0 +1,87 @@
|
|
1
|
+
// This file is part of the "libsmatrix" project
|
2
|
+
// (c) 2011-2013 Paul Asmuth <paul@paulasmuth.com>
|
3
|
+
//
|
4
|
+
// Licensed under the MIT License (the "License"); you may not use this
|
5
|
+
// file except in compliance with the License. You may obtain a copy of
|
6
|
+
// the License at: http://opensource.org/licenses/MIT
|
7
|
+
|
8
|
+
#include <math.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "smatrix.h"
|
14
|
+
|
15
|
+
smatrix_t* my_smatrix;
|
16
|
+
|
17
|
+
// libsmatrix example: simple CF based recommendation engine
|
18
|
+
int main(int argc, char **argv) {
|
19
|
+
my_smatrix = smatrix_open(NULL);
|
20
|
+
|
21
|
+
// one preference set = list of items in one session
|
22
|
+
// e.g. list of viewed items by the same user
|
23
|
+
// e.g. list of bought items in the same checkout
|
24
|
+
uint32_t input_ids[5] = {12,52,63,76,43};
|
25
|
+
import_preference_set(input_ids, 5);
|
26
|
+
|
27
|
+
// generate recommendations (similar items) for item #76
|
28
|
+
void neighbors_for_item(76);
|
29
|
+
|
30
|
+
smatrix_close(my_smatrix);
|
31
|
+
return 0;
|
32
|
+
}
|
33
|
+
|
34
|
+
// train / add a preference set (list of items in one session)
|
35
|
+
void import_preference_set(uint32_t* ids, uint32_t num_ids) {
|
36
|
+
uint32_t i, n;
|
37
|
+
|
38
|
+
for (n = 0; n < num_ids; n++) {
|
39
|
+
smatrix_incr(my_smatrix, ids[n], 0, 1);
|
40
|
+
|
41
|
+
for (i = 0; i < pset->len; i++) {
|
42
|
+
if (i != n) {
|
43
|
+
smatrix_incr(my_smatrix, ids[n], ids[i], 1);
|
44
|
+
}
|
45
|
+
}
|
46
|
+
}
|
47
|
+
}
|
48
|
+
|
49
|
+
// get recommendations for item with id "item_id"
|
50
|
+
void neighbors_for_item(uint32_t item_id)
|
51
|
+
uint32_t neighbors, *row, total;
|
52
|
+
|
53
|
+
total = smatrix_get(my_smatrix, item_id, 0);
|
54
|
+
neighbors = smatrix_getrow(my_smatrix, item_id, row, 8192);
|
55
|
+
|
56
|
+
for (pos = 0; pos < neighbors; pos++) {
|
57
|
+
uint32_t cur_id = row[pos * 2];
|
58
|
+
|
59
|
+
printf("found neighbor for item %u: item %u with distance %f\n",
|
60
|
+
item_id, cf_cosine(smatrix, cur_id, row[pos * 2 + 1], total));
|
61
|
+
}
|
62
|
+
|
63
|
+
free(row);
|
64
|
+
}
|
65
|
+
|
66
|
+
// calculates the cosine vector distance between two items
|
67
|
+
// Computes cc_count / (sqrt(a_total) * sqrt(b_total)) for a pair of items.
//
//   smatrix   matrix to read item b's total from
//   b_id      id of the second item; its total is read from cell (b_id, 0)
//   cc_count  co-occurrence count of the two items
//   a_total   total of the first item
//
// A b_total of zero is treated as one. Returns 0.0 when the denominator is
// zero or when the numerator exceeds the denominator, otherwise the quotient.
//
// Requires <math.h> for sqrt(); without the declaration the call would be an
// implicit declaration.
double cf_cosine(smatrix_t* smatrix, uint32_t b_id, uint32_t cc_count, uint32_t a_total) {
  uint32_t b_total = smatrix_get(smatrix, b_id, 0);
  double num, den;

  if (b_total == 0) {
    b_total = 1;
  }

  num = (double) cc_count;
  den = sqrt((double) a_total) * sqrt((double) b_total);

  if (den == 0.0 || num > den) {
    return 0.0;
  }

  return num / den;
}
|
87
|
+
|
@@ -0,0 +1,75 @@
|
|
1
|
+
// This file is part of the "libsmatrix" project
|
2
|
+
// (c) 2011-2013 Paul Asmuth <paul@paulasmuth.com>
|
3
|
+
//
|
4
|
+
// Licensed under the MIT License (the "License"); you may not use this
|
5
|
+
// file except in compliance with the License. You may obtain a copy of
|
6
|
+
// the License at: http://opensource.org/licenses/MIT
|
7
|
+
|
8
|
+
#include <stdio.h>
|
9
|
+
#include <string.h>
|
10
|
+
#include <stdlib.h>
|
11
|
+
#include <pthread.h>
|
12
|
+
|
13
|
+
#include "smatrix.h"
|
14
|
+
|
15
|
+
smatrix_t* db;
|
16
|
+
|
17
|
+
void* test(void* fnord) {
|
18
|
+
uint64_t i, n, m;
|
19
|
+
|
20
|
+
for (m = 0; m < 100; m++) {
|
21
|
+
for (n = 1; n < 30; n++) {
|
22
|
+
for (i = 1; i < 50; i++) {
|
23
|
+
smatrix_incr(db, n, i, 1);
|
24
|
+
}
|
25
|
+
}
|
26
|
+
}
|
27
|
+
|
28
|
+
return NULL;
|
29
|
+
}
|
30
|
+
|
31
|
+
int main(int argc, char **argv) {
|
32
|
+
int i,n,m,l,x=0, num_threads = 4;
|
33
|
+
pthread_t threads[num_threads];
|
34
|
+
|
35
|
+
printf("\nloading\n");
|
36
|
+
db = smatrix_open("/var/tmp/reco.db");
|
37
|
+
//db = smatrix_open(NULL); // in-memory only mode
|
38
|
+
|
39
|
+
if (db == NULL)
|
40
|
+
abort();
|
41
|
+
|
42
|
+
printf("\nstarting\n");
|
43
|
+
|
44
|
+
for (n = 0; n < num_threads; n++)
|
45
|
+
pthread_create(&threads[n], NULL, test, NULL);
|
46
|
+
|
47
|
+
for (n = 0; n < num_threads; n++)
|
48
|
+
pthread_join(threads[n], NULL);
|
49
|
+
|
50
|
+
printf("\ndone\n");
|
51
|
+
|
52
|
+
for (n = 1; n < 30; n++) {
|
53
|
+
for (i = 1; i < 50; i++) {
|
54
|
+
printf("(%u,%u) => %u, ", n, i, smatrix_get(db, n, i));
|
55
|
+
if (x++ % 5 == 0) printf("\n");
|
56
|
+
}
|
57
|
+
}
|
58
|
+
|
59
|
+
printf("rowlen: %u\n", l = smatrix_rowlen(db, 23));
|
60
|
+
size_t bytes = sizeof(uint32_t) * l * 2;
|
61
|
+
uint32_t* data = malloc(bytes);
|
62
|
+
|
63
|
+
l = smatrix_getrow(db, 23, data, bytes);
|
64
|
+
|
65
|
+
for (i = 0; i < l; i++) {
|
66
|
+
printf("%u => %u, ", data[i * 2], data[i * 2 + 1]);
|
67
|
+
}
|
68
|
+
|
69
|
+
printf("\n");
|
70
|
+
|
71
|
+
smatrix_close(db);
|
72
|
+
printf("in use at exit: %lu\n", db->mem);
|
73
|
+
|
74
|
+
return 0;
|
75
|
+
}
|
data/src/Makefile
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
# This file is part of the "libsmatrix" project
|
2
|
+
# (c) 2011-2013 Paul Asmuth <paul@paulasmuth.com>
|
3
|
+
#
|
4
|
+
# Licensed under the MIT License (the "License"); you may not use this
|
5
|
+
# file except in compliance with the License. You may obtain a copy of
|
6
|
+
# the License at: http://opensource.org/licenses/MIT
|
7
|
+
|
8
|
+
# Builds the shared library, the JNI header and the benchmark binary.
#
# NOTE(review): CC, CFLAGS, LDFLAGS, LIBFLAGS and LIBEXT are not set in this
# file; presumably they come from Makefile.in -- confirm.
include Makefile.in

TARGET = smatrix.$(LIBEXT)

# "all" names an action, not a file.
.PHONY: all

all: $(TARGET)

$(TARGET): smatrix.o
	$(CC) $(LIBFLAGS) smatrix.o -o $(TARGET) $(LDFLAGS)

smatrix.o: config.h smatrix.c smatrix.h smatrix_private.h
	$(CC) -c $(CFLAGS) smatrix.c -o smatrix.o

# An empty config.h is enough for the build.
config.h:
	touch config.h

# NOTE(review): the javac path is relative to this directory, while the
# package listing places SparseMatrix.java under java/com/... -- confirm the
# intended working directory.
smatrix_jni.h:
	javac com/paulasmuth/libsmatrix/SparseMatrix.java
	javah -o smatrix_jni.h -classpath . com.paulasmuth.libsmatrix.SparseMatrix

smatrix_benchmark: smatrix.o smatrix_benchmark.c
	$(CC) $(CFLAGS) smatrix_benchmark.c smatrix.o -o smatrix_benchmark $(LDFLAGS)
|