recommendify-ruby 0.3.8
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +9 -0
- data/README.md +154 -0
- data/Rakefile +18 -0
- data/doc/example.png +0 -0
- data/doc/example.rb +87 -0
- data/doc/example_data.csv +120048 -0
- data/ext/cc_item.h +8 -0
- data/ext/cosine.c +3 -0
- data/ext/extconf.rb +18 -0
- data/ext/iikey.c +18 -0
- data/ext/jaccard.c +19 -0
- data/ext/output.c +22 -0
- data/ext/recommendify.c +214 -0
- data/ext/sort.c +23 -0
- data/ext/version.h +17 -0
- data/lib/recommendify.rb +9 -0
- data/lib/recommendify/base.rb +86 -0
- data/lib/recommendify/cc_matrix.rb +51 -0
- data/lib/recommendify/cosine_input_matrix.rb +7 -0
- data/lib/recommendify/input_matrix.rb +52 -0
- data/lib/recommendify/jaccard_input_matrix.rb +62 -0
- data/lib/recommendify/neighbor.rb +19 -0
- data/lib/recommendify/recommendify.rb +25 -0
- data/lib/recommendify/similarity_matrix.rb +62 -0
- data/lib/recommendify/sparse_matrix.rb +53 -0
- data/recommendify.gemspec +25 -0
- data/spec/base_spec.rb +188 -0
- data/spec/cc_matrix_shared.rb +89 -0
- data/spec/cosine_input_matrix_spec.rb +18 -0
- data/spec/input_matrix_shared.rb +27 -0
- data/spec/input_matrix_spec.rb +29 -0
- data/spec/jaccard_input_matrix_spec.rb +95 -0
- data/spec/recommendify_spec.rb +28 -0
- data/spec/similarity_matrix_spec.rb +93 -0
- data/spec/sparse_matrix_spec.rb +78 -0
- data/spec/spec_helper.rb +42 -0
- metadata +128 -0
data/ext/cc_item.h
ADDED
data/ext/cosine.c
ADDED
data/ext/extconf.rb
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
# Generates the Makefile that builds the native `recommendify` binary and
# writes it next to this script.
#
# make requires every recipe line to start with a literal tab character.
# The tabs are spelled as "\t" escapes (the heredoc is interpolated like a
# double-quoted string) so the generated file stays valid even when an editor
# or tool rewrites the whitespace of this script.
makefile = <<-MAKEFILE
all: prepare build

build:
\tgcc -Wall recommendify.c -lhiredis -o ../bin/recommendify

prepare:
\tmkdir -p ../bin

clean:
\trm -f *.o

install: prepare
MAKEFILE

# Replace any previously generated Makefile.
File.open(File.expand_path("../Makefile", __FILE__), "w") do |f|
  f.write(makefile)
end
|
data/ext/iikey.c
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
/*
 * Builds the key for an item pair: both ids joined with ':', with the id
 * that rb_strcmp() orders first (or item1 when they compare equal) placed
 * in front.
 *
 * Returns a newly malloc()ed, NUL-terminated string that the caller has to
 * free(), or 0 after printing "cannot allocate" when the allocation fails.
 */
char* item_item_key(char *item1, char *item2){
  char *first = item1;
  char *second = item2;

  /* both ids, the ':' separator and the terminating NUL */
  int keylen = strlen(item1) + strlen(item2) + 2;
  char *key = (char *)malloc(keylen * sizeof(char));

  if(key == 0){
    printf("cannot allocate\n");
    return 0;
  }

  // FIXPAUL: make sure this does exactly the same as ruby sort
  if(rb_strcmp(item1, item2) > 0){
    first = item2;
    second = item1;
  }

  snprintf(key, keylen, "%s:%s", first, second);
  return key;
}
|
data/ext/jaccard.c
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
void calculate_jaccard(char *item_id, int itemCount, struct cc_item *cc_items, int cc_items_size){
|
2
|
+
int j, n;
|
3
|
+
|
4
|
+
for(j = 0; j < cc_items_size; j++){
|
5
|
+
n = cc_items[j].coconcurrency_count;
|
6
|
+
if(n>0){
|
7
|
+
cc_items[j].similarity = (
|
8
|
+
(float)n / (
|
9
|
+
(float)itemCount +
|
10
|
+
(float)cc_items[j].total_count -
|
11
|
+
(float)n
|
12
|
+
)
|
13
|
+
);
|
14
|
+
} else {
|
15
|
+
cc_items[j].similarity = 0.0;
|
16
|
+
}
|
17
|
+
}
|
18
|
+
|
19
|
+
}
|
data/ext/output.c
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
int print_version(){
|
2
|
+
printf(
|
3
|
+
VERSION_STRING,
|
4
|
+
VERSION_MAJOR,
|
5
|
+
VERSION_MINOR,
|
6
|
+
VERSION_MICRO
|
7
|
+
);
|
8
|
+
return 0;
|
9
|
+
}
|
10
|
+
|
11
|
+
int print_usage(char *bin){
|
12
|
+
printf(USAGE_STRING, bin);
|
13
|
+
return 1;
|
14
|
+
}
|
15
|
+
|
16
|
+
/*
 * Prints one result line of the form "OUT: (<item id>) (<similarity>)" to
 * stdout, with the similarity rounded to four decimal places.
 */
void print_item(struct cc_item item){
  const char *id = item.item_id;
  double score = item.similarity;

  printf("OUT: (%s) (%.4f)\n", id, score);
}
|
data/ext/recommendify.c
ADDED
@@ -0,0 +1,214 @@
|
|
1
|
+
#include <stdio.h>
|
2
|
+
#include <string.h>
|
3
|
+
#include <stdlib.h>
|
4
|
+
#include <hiredis/hiredis.h>
|
5
|
+
|
6
|
+
#include "version.h"
|
7
|
+
#include "cc_item.h"
|
8
|
+
#include "jaccard.c"
|
9
|
+
#include "cosine.c"
|
10
|
+
#include "output.c"
|
11
|
+
#include "sort.c"
|
12
|
+
#include "iikey.c"
|
13
|
+
|
14
|
+
|
15
|
+
int main(int argc, char **argv){
|
16
|
+
int i, j, n, similarityFunc = 0;
|
17
|
+
int itemCount = 0;
|
18
|
+
char *itemID;
|
19
|
+
char *redisPrefix;
|
20
|
+
redisContext *c;
|
21
|
+
redisReply *all_items;
|
22
|
+
redisReply *reply;
|
23
|
+
int cur_batch_size;
|
24
|
+
char* cur_batch;
|
25
|
+
char *iikey;
|
26
|
+
|
27
|
+
int batch_size = 200; /* FIXPAUL: make option */
|
28
|
+
int maxItems = 50; /* FIXPAUL: make option */
|
29
|
+
|
30
|
+
struct {
|
31
|
+
char host[1024];
|
32
|
+
int port;
|
33
|
+
} redis_addr;
|
34
|
+
|
35
|
+
/* option parsing */
|
36
|
+
if(argc < 2)
|
37
|
+
return print_usage(argv[0]);
|
38
|
+
|
39
|
+
if(!strcmp(argv[1], "--version"))
|
40
|
+
return print_version();
|
41
|
+
|
42
|
+
if(!strcmp(argv[1], "--jaccard"))
|
43
|
+
similarityFunc = 1;
|
44
|
+
|
45
|
+
if(!strcmp(argv[1], "--cosine"))
|
46
|
+
similarityFunc = 2;
|
47
|
+
|
48
|
+
if(!similarityFunc){
|
49
|
+
printf("invalid option: %s\n", argv[1]);
|
50
|
+
return 1;
|
51
|
+
} else if(argc < 4 || argc > 5){
|
52
|
+
printf("wrong number of arguments\n");
|
53
|
+
print_usage(argv[0]);
|
54
|
+
return 1;
|
55
|
+
}
|
56
|
+
|
57
|
+
redisPrefix = argv[2];
|
58
|
+
itemID = argv[3];
|
59
|
+
redis_addr.host[0] = 0;
|
60
|
+
redis_addr.port = 0;
|
61
|
+
|
62
|
+
/* configure redis location */
|
63
|
+
if(argc > 4){
|
64
|
+
char* has_port = strchr(argv[4], ':');
|
65
|
+
if(has_port){
|
66
|
+
strncpy(redis_addr.host, argv[4], strlen(argv[4]) - strlen(has_port));
|
67
|
+
redis_addr.host[strlen(argv[4]) - strlen(has_port)] = 0;
|
68
|
+
redis_addr.port = atoi(has_port + 1);
|
69
|
+
} else {
|
70
|
+
strncpy(redis_addr.host, argv[4], sizeof(redis_addr.host));
|
71
|
+
}
|
72
|
+
}
|
73
|
+
|
74
|
+
/* default redis location */
|
75
|
+
if(strlen(redis_addr.host) == 0)
|
76
|
+
strcpy(redis_addr.host, "localhost");
|
77
|
+
|
78
|
+
if(!redis_addr.port)
|
79
|
+
redis_addr.port = 6379;
|
80
|
+
|
81
|
+
/* connect to redis */
|
82
|
+
struct timeval timeout = { 1, 500000 };
|
83
|
+
c = redisConnectWithTimeout(redis_addr.host, redis_addr.port, timeout);
|
84
|
+
|
85
|
+
if(c->err){
|
86
|
+
printf("connection to redis failed: %s\n", c->errstr);
|
87
|
+
return 1;
|
88
|
+
}
|
89
|
+
|
90
|
+
|
91
|
+
/* get item count */
|
92
|
+
reply = redisCommand(c,"HGET %s:items %s", redisPrefix, itemID);
|
93
|
+
|
94
|
+
if(reply->str){
|
95
|
+
itemCount = atoi(reply->str);
|
96
|
+
} else {
|
97
|
+
itemCount = 0;
|
98
|
+
}
|
99
|
+
|
100
|
+
freeReplyObject(reply);
|
101
|
+
|
102
|
+
if(itemCount < 2){
|
103
|
+
printf("exit: item count is zero or one\n");
|
104
|
+
return 0;
|
105
|
+
}
|
106
|
+
|
107
|
+
|
108
|
+
/* get all items_ids and the total counts */
|
109
|
+
all_items = redisCommand(c,"HGETALL %s:items", redisPrefix);
|
110
|
+
|
111
|
+
if(all_items->type != REDIS_REPLY_ARRAY)
|
112
|
+
return 1;
|
113
|
+
|
114
|
+
|
115
|
+
/* populate the cc_items array */
|
116
|
+
int cc_items_size = all_items->elements / 2;
|
117
|
+
int cc_items_mem = cc_items_size * sizeof(struct cc_item);
|
118
|
+
struct cc_item *cc_items = malloc(cc_items_mem);
|
119
|
+
cc_items_size--;
|
120
|
+
|
121
|
+
if(!cc_items){
|
122
|
+
printf("cannot allocate memory: %i", cc_items_mem);
|
123
|
+
return 1;
|
124
|
+
}
|
125
|
+
|
126
|
+
i = 0;
|
127
|
+
for (j = 0; j < all_items->elements/2; j++){
|
128
|
+
if(strcmp(itemID, all_items->element[j*2]->str) != 0){
|
129
|
+
strncpy(cc_items[i].item_id, all_items->element[j*2]->str, ITEM_ID_SIZE);
|
130
|
+
cc_items[i].total_count = atoi(all_items->element[j*2+1]->str);
|
131
|
+
i++;
|
132
|
+
}
|
133
|
+
}
|
134
|
+
|
135
|
+
freeReplyObject(all_items);
|
136
|
+
|
137
|
+
|
138
|
+
// batched redis hmgets on the ccmatrix
|
139
|
+
cur_batch = (char *)malloc(((batch_size * (ITEM_ID_SIZE + 4) * 2) + 100) * sizeof(char));
|
140
|
+
|
141
|
+
if(!cur_batch){
|
142
|
+
printf("cannot allocate memory");
|
143
|
+
return 1;
|
144
|
+
}
|
145
|
+
|
146
|
+
n = cc_items_size;
|
147
|
+
while(n >= 0){
|
148
|
+
cur_batch_size = ((n-1 < batch_size) ? n-1 : batch_size);
|
149
|
+
sprintf(cur_batch, "HMGET %s:ccmatrix ", redisPrefix);
|
150
|
+
|
151
|
+
for(i = 0; i < cur_batch_size; i++){
|
152
|
+
iikey = item_item_key(itemID, cc_items[n-i].item_id);
|
153
|
+
|
154
|
+
strcat(cur_batch, iikey);
|
155
|
+
strcat(cur_batch, " ");
|
156
|
+
|
157
|
+
if(iikey)
|
158
|
+
free(iikey);
|
159
|
+
}
|
160
|
+
|
161
|
+
redisAppendCommand(c, cur_batch);
|
162
|
+
redisGetReply(c, (void**)&reply);
|
163
|
+
|
164
|
+
for(j = 0; j < reply->elements; j++){
|
165
|
+
if(reply->element[j]->str){
|
166
|
+
cc_items[n-j].coconcurrency_count = atoi(reply->element[j]->str);
|
167
|
+
} else {
|
168
|
+
cc_items[n-j].coconcurrency_count = 0;
|
169
|
+
}
|
170
|
+
}
|
171
|
+
|
172
|
+
freeReplyObject(reply);
|
173
|
+
n -= batch_size;
|
174
|
+
}
|
175
|
+
|
176
|
+
free(cur_batch);
|
177
|
+
|
178
|
+
|
179
|
+
|
180
|
+
/* calculate similarities */
|
181
|
+
if(similarityFunc == 1)
|
182
|
+
calculate_jaccard(itemID, itemCount, cc_items, cc_items_size);
|
183
|
+
|
184
|
+
if(similarityFunc == 2)
|
185
|
+
calculate_cosine(itemID, itemCount, cc_items, cc_items_size);
|
186
|
+
|
187
|
+
|
188
|
+
/* find the top x items with simple bubble sort */
|
189
|
+
for(i = 0; i < maxItems - 1; ++i){
|
190
|
+
for (j = 0; j < cc_items_size - i - 1; ++j){
|
191
|
+
if (cc_items[j].similarity > cc_items[j + 1].similarity){
|
192
|
+
struct cc_item tmp = cc_items[j];
|
193
|
+
cc_items[j] = cc_items[j + 1];
|
194
|
+
cc_items[j + 1] = tmp;
|
195
|
+
}
|
196
|
+
}
|
197
|
+
}
|
198
|
+
|
199
|
+
|
200
|
+
/* print top k items */
|
201
|
+
n = ((cc_items_size < maxItems) ? cc_items_size : maxItems);
|
202
|
+
for(j = 0; j < n; j++){
|
203
|
+
i = cc_items_size-j-1;
|
204
|
+
if(cc_items[i].similarity > 0){
|
205
|
+
print_item(cc_items[i]);
|
206
|
+
}
|
207
|
+
}
|
208
|
+
|
209
|
+
|
210
|
+
free(cc_items);
|
211
|
+
return 0;
|
212
|
+
}
|
213
|
+
|
214
|
+
|
data/ext/sort.c
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
/* Returns the smaller of the two integers (i1 when both are equal). */
int lesser(int i1, int i2){
  return (i1 > i2) ? i2 : i1;
}
|
8
|
+
|
9
|
+
/*
 * Three-way comparison of two NUL-terminated strings: the common prefix is
 * compared bytewise with memcmp(); when it is identical the shorter string
 * orders first.
 *
 * Returns -1, 0 or 1 when str1 orders before, equal to or after str2.
 */
int rb_strcmp(char *str1, char *str2){
  /* measure each string once and keep the full size_t range */
  size_t len1 = strlen(str1);
  size_t len2 = strlen(str2);
  size_t common = (len1 < len2) ? len1 : len2;
  int retval = memcmp(str1, str2, common);

  if(retval == 0){
    if(len1 == len2) return 0;
    return (len1 > len2) ? 1 : -1;
  }

  return (retval > 0) ? 1 : -1;
}
|
data/ext/version.h
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
#ifndef VERSION_H
#define VERSION_H

/* version of the native binary, substituted into VERSION_STRING */
#define VERSION_MAJOR 0
#define VERSION_MINOR 0
#define VERSION_MICRO 1

/* printf format: expects the major, minor and micro version as %i */
#define VERSION_STRING "recommendify_native %i.%i.%i\n" \
                       "\n" \
                       "Copyright © 2012\n" \
                       "  Paul Asmuth <paul@paulasmuth.com>\n"

/* printf format: expects the name of the executable as %s. The redis key
   and item id are required; the redis location is optional. */
#define USAGE_STRING "usage: %s " \
                     "{--version|--jaccard|--cosine} " \
                     "<redis_key> <item_id> [redis_host[:port]]\n"

#endif
|
data/lib/recommendify.rb
ADDED
@@ -0,0 +1,9 @@
|
|
1
|
+
require "recommendify/recommendify"
|
2
|
+
require "recommendify/sparse_matrix"
|
3
|
+
require "recommendify/cc_matrix"
|
4
|
+
require "recommendify/similarity_matrix"
|
5
|
+
require "recommendify/input_matrix"
|
6
|
+
require "recommendify/jaccard_input_matrix"
|
7
|
+
require "recommendify/cosine_input_matrix"
|
8
|
+
require "recommendify/base"
|
9
|
+
require "recommendify/neighbor"
|
@@ -0,0 +1,86 @@
|
|
1
|
+
# Base class of a recommender.
#
# Input matrices are declared at class level with +input_matrix+ and the
# neighbor limit with +max_neighbors+. Each instance builds one
# Recommendify::InputMatrix per declaration plus a
# Recommendify::SimilarityMatrix that receives the weighted similarities.
# The input matrices are also reachable as methods named after their key.
class Recommendify::Base

  attr_reader :similarity_matrix, :input_matrices

  # NOTE: class variables are shared with every subclass.
  @@max_neighbors = nil
  @@input_matrices = {}

  # Returns the configured neighbor limit, or sets it when +n+ is given.
  def self.max_neighbors(n=nil)
    return @@max_neighbors unless n
    @@max_neighbors = n
  end

  # Declares an input matrix named +key+ that is created with +opts+.
  def self.input_matrix(key, opts)
    @@input_matrices[key] = opts
  end

  # Returns the declared input matrix options, indexed by key.
  def self.input_matrices
    @@input_matrices
  end

  def initialize
    @input_matrices = Hash[self.class.input_matrices.map{ |key, opts|
      # merge instead of merge!: the class-level option hashes are shared by
      # all instances and must not be modified while building one of them
      matrix_opts = opts.merge(:key => key, :redis_prefix => redis_prefix)
      [ key, Recommendify::InputMatrix.create(matrix_opts) ]
    }]
    @similarity_matrix = Recommendify::SimilarityMatrix.new(
      :max_neighbors => max_neighbors,
      :key => :similarities,
      :redis_prefix => redis_prefix
    )
  end

  # Prefix passed to all matrices as :redis_prefix.
  def redis_prefix
    "recommendify"
  end

  # Neighbor limit for the similarity matrix: the class-level setting, or
  # Recommendify::DEFAULT_MAX_NEIGHBORS when none was configured.
  def max_neighbors
    self.class.max_neighbors || Recommendify::DEFAULT_MAX_NEIGHBORS
  end

  # Exposes every input matrix as a method named after its key; any other
  # unknown method raises NoMethodError through the default implementation.
  def method_missing(method, *args, &block)
    if @input_matrices.has_key?(method)
      @input_matrices[method]
    else
      super
    end
  end

  # Keeps respond_to? and method(...) consistent with method_missing.
  def respond_to_missing?(method, include_private = false)
    @input_matrices.has_key?(method) || super
  end

  # Accepts the optional second argument of Object#respond_to?. The explicit
  # override is kept for rubies that do not consult respond_to_missing?.
  def respond_to?(method, include_private = false)
    @input_matrices.has_key?(method) ? true : super
  end

  # Returns the unique item ids known to any of the input matrices.
  def all_items
    @input_matrices.map{ |_key, matrix| matrix.all_items }.flatten.uniq
  end

  # Returns the entries stored for +item_id+ in the similarity matrix as a
  # sorted list of Recommendify::Neighbor objects.
  def for(item_id)
    similarity_matrix[item_id].map do |neighbor_id, similarity|
      Recommendify::Neighbor.new(
        :item_id => neighbor_id,
        :similarity => similarity
      )
    end.sort
  end

  # Recomputes the similarities of every known item.
  def process!
    all_items.each{ |item_id| process_item!(item_id) }
  end

  # Recomputes the similarities of +item_id+: the similarities reported by
  # each input matrix are multiplied by that matrix' weight, handed to the
  # similarity matrix and finally committed.
  def process_item!(item_id)
    input_matrices.each do |_key, matrix|
      neighbors = matrix.similarities_for(item_id).map do |neighbor_id, similarity|
        [neighbor_id, similarity * matrix.weight]
      end
      similarity_matrix.update(item_id, neighbors)
    end
    similarity_matrix.commit_item!(item_id)
  end

  # Deletes +item_id+ from every input matrix and returns the results of the
  # individual delete calls.
  def delete_item!(item_id)
    input_matrices.map do |_key, matrix|
      matrix.delete_item(item_id)
    end
  end

end
|