recommendify-ruby 0.3.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +9 -0
- data/README.md +154 -0
- data/Rakefile +18 -0
- data/doc/example.png +0 -0
- data/doc/example.rb +87 -0
- data/doc/example_data.csv +120048 -0
- data/ext/cc_item.h +8 -0
- data/ext/cosine.c +3 -0
- data/ext/extconf.rb +18 -0
- data/ext/iikey.c +18 -0
- data/ext/jaccard.c +19 -0
- data/ext/output.c +22 -0
- data/ext/recommendify.c +214 -0
- data/ext/sort.c +23 -0
- data/ext/version.h +17 -0
- data/lib/recommendify.rb +9 -0
- data/lib/recommendify/base.rb +86 -0
- data/lib/recommendify/cc_matrix.rb +51 -0
- data/lib/recommendify/cosine_input_matrix.rb +7 -0
- data/lib/recommendify/input_matrix.rb +52 -0
- data/lib/recommendify/jaccard_input_matrix.rb +62 -0
- data/lib/recommendify/neighbor.rb +19 -0
- data/lib/recommendify/recommendify.rb +25 -0
- data/lib/recommendify/similarity_matrix.rb +62 -0
- data/lib/recommendify/sparse_matrix.rb +53 -0
- data/recommendify.gemspec +25 -0
- data/spec/base_spec.rb +188 -0
- data/spec/cc_matrix_shared.rb +89 -0
- data/spec/cosine_input_matrix_spec.rb +18 -0
- data/spec/input_matrix_shared.rb +27 -0
- data/spec/input_matrix_spec.rb +29 -0
- data/spec/jaccard_input_matrix_spec.rb +95 -0
- data/spec/recommendify_spec.rb +28 -0
- data/spec/similarity_matrix_spec.rb +93 -0
- data/spec/sparse_matrix_spec.rb +78 -0
- data/spec/spec_helper.rb +42 -0
- metadata +128 -0
data/ext/cc_item.h
ADDED
data/ext/cosine.c
ADDED
data/ext/extconf.rb
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
# Writes a Makefile next to this script. The generated Makefile compiles
# recommendify.c against hiredis into ../bin/recommendify; the `install`
# target only makes sure that ../bin exists.
makefile_path = ::File.expand_path("../Makefile", __FILE__)

makefile = <<-MAKEFILE
all: prepare build

build:
	gcc -Wall recommendify.c -lhiredis -o ../bin/recommendify

prepare:
	mkdir -p ../bin

clean:
	rm -f *.o

install: prepare
MAKEFILE

File.open(makefile_path, "w+") { |f| f.write(makefile) }
|
data/ext/iikey.c
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
/*
 * Builds the "<a>:<b>" key for a pair of item ids, where <a> is the id
 * that rb_strcmp() orders first (item1 when both compare equal).
 *
 * Returns a malloc'ed, NUL-terminated string that the caller must free,
 * or 0 (after printing "cannot allocate") when the allocation fails.
 */
char* item_item_key(char *item1, char *item2){
  /* both ids + ':' separator + terminating NUL */
  int keylen = strlen(item1) + strlen(item2) + 2;
  char *first = item1;
  char *second = item2;
  char *key = (char *)malloc(keylen * sizeof(char));

  if(key == NULL){
    printf("cannot allocate\n");
    return 0;
  }

  // FIXPAUL: make sure this does exactly the same as ruby sort
  if(rb_strcmp(item1, item2) > 0){
    first = item2;
    second = item1;
  }

  snprintf(key, keylen, "%s:%s", first, second);
  return key;
}
|
data/ext/jaccard.c
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
void calculate_jaccard(char *item_id, int itemCount, struct cc_item *cc_items, int cc_items_size){
|
2
|
+
int j, n;
|
3
|
+
|
4
|
+
for(j = 0; j < cc_items_size; j++){
|
5
|
+
n = cc_items[j].coconcurrency_count;
|
6
|
+
if(n>0){
|
7
|
+
cc_items[j].similarity = (
|
8
|
+
(float)n / (
|
9
|
+
(float)itemCount +
|
10
|
+
(float)cc_items[j].total_count -
|
11
|
+
(float)n
|
12
|
+
)
|
13
|
+
);
|
14
|
+
} else {
|
15
|
+
cc_items[j].similarity = 0.0;
|
16
|
+
}
|
17
|
+
}
|
18
|
+
|
19
|
+
}
|
data/ext/output.c
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
int print_version(){
|
2
|
+
printf(
|
3
|
+
VERSION_STRING,
|
4
|
+
VERSION_MAJOR,
|
5
|
+
VERSION_MINOR,
|
6
|
+
VERSION_MICRO
|
7
|
+
);
|
8
|
+
return 0;
|
9
|
+
}
|
10
|
+
|
11
|
+
int print_usage(char *bin){
|
12
|
+
printf(USAGE_STRING, bin);
|
13
|
+
return 1;
|
14
|
+
}
|
15
|
+
|
16
|
+
/*
 * Prints one result line to stdout in the form "OUT: (<item_id>) (<similarity>)",
 * with the similarity formatted to four decimal places.
 */
void print_item(struct cc_item item){
  printf("OUT: (%s) (%.4f)\n", item.item_id, item.similarity);
}
|
data/ext/recommendify.c
ADDED
@@ -0,0 +1,214 @@
|
|
1
|
+
#include <stdio.h>
|
2
|
+
#include <string.h>
|
3
|
+
#include <stdlib.h>
|
4
|
+
#include <hiredis/hiredis.h>
|
5
|
+
|
6
|
+
#include "version.h"
|
7
|
+
#include "cc_item.h"
|
8
|
+
#include "jaccard.c"
|
9
|
+
#include "cosine.c"
|
10
|
+
#include "output.c"
|
11
|
+
#include "sort.c"
|
12
|
+
#include "iikey.c"
|
13
|
+
|
14
|
+
|
15
|
+
int main(int argc, char **argv){
|
16
|
+
int i, j, n, similarityFunc = 0;
|
17
|
+
int itemCount = 0;
|
18
|
+
char *itemID;
|
19
|
+
char *redisPrefix;
|
20
|
+
redisContext *c;
|
21
|
+
redisReply *all_items;
|
22
|
+
redisReply *reply;
|
23
|
+
int cur_batch_size;
|
24
|
+
char* cur_batch;
|
25
|
+
char *iikey;
|
26
|
+
|
27
|
+
int batch_size = 200; /* FIXPAUL: make option */
|
28
|
+
int maxItems = 50; /* FIXPAUL: make option */
|
29
|
+
|
30
|
+
struct {
|
31
|
+
char host[1024];
|
32
|
+
int port;
|
33
|
+
} redis_addr;
|
34
|
+
|
35
|
+
/* option parsing */
|
36
|
+
if(argc < 2)
|
37
|
+
return print_usage(argv[0]);
|
38
|
+
|
39
|
+
if(!strcmp(argv[1], "--version"))
|
40
|
+
return print_version();
|
41
|
+
|
42
|
+
if(!strcmp(argv[1], "--jaccard"))
|
43
|
+
similarityFunc = 1;
|
44
|
+
|
45
|
+
if(!strcmp(argv[1], "--cosine"))
|
46
|
+
similarityFunc = 2;
|
47
|
+
|
48
|
+
if(!similarityFunc){
|
49
|
+
printf("invalid option: %s\n", argv[1]);
|
50
|
+
return 1;
|
51
|
+
} else if(argc < 4 || argc > 5){
|
52
|
+
printf("wrong number of arguments\n");
|
53
|
+
print_usage(argv[0]);
|
54
|
+
return 1;
|
55
|
+
}
|
56
|
+
|
57
|
+
redisPrefix = argv[2];
|
58
|
+
itemID = argv[3];
|
59
|
+
redis_addr.host[0] = 0;
|
60
|
+
redis_addr.port = 0;
|
61
|
+
|
62
|
+
/* configure redis location */
|
63
|
+
if(argc > 4){
|
64
|
+
char* has_port = strchr(argv[4], ':');
|
65
|
+
if(has_port){
|
66
|
+
strncpy(redis_addr.host, argv[4], strlen(argv[4]) - strlen(has_port));
|
67
|
+
redis_addr.host[strlen(argv[4]) - strlen(has_port)] = 0;
|
68
|
+
redis_addr.port = atoi(has_port + 1);
|
69
|
+
} else {
|
70
|
+
strncpy(redis_addr.host, argv[4], sizeof(redis_addr.host));
|
71
|
+
}
|
72
|
+
}
|
73
|
+
|
74
|
+
/* default redis location */
|
75
|
+
if(strlen(redis_addr.host) == 0)
|
76
|
+
strcpy(redis_addr.host, "localhost");
|
77
|
+
|
78
|
+
if(!redis_addr.port)
|
79
|
+
redis_addr.port = 6379;
|
80
|
+
|
81
|
+
/* connect to redis */
|
82
|
+
struct timeval timeout = { 1, 500000 };
|
83
|
+
c = redisConnectWithTimeout(redis_addr.host, redis_addr.port, timeout);
|
84
|
+
|
85
|
+
if(c->err){
|
86
|
+
printf("connection to redis failed: %s\n", c->errstr);
|
87
|
+
return 1;
|
88
|
+
}
|
89
|
+
|
90
|
+
|
91
|
+
/* get item count */
|
92
|
+
reply = redisCommand(c,"HGET %s:items %s", redisPrefix, itemID);
|
93
|
+
|
94
|
+
if(reply->str){
|
95
|
+
itemCount = atoi(reply->str);
|
96
|
+
} else {
|
97
|
+
itemCount = 0;
|
98
|
+
}
|
99
|
+
|
100
|
+
freeReplyObject(reply);
|
101
|
+
|
102
|
+
if(itemCount < 2){
|
103
|
+
printf("exit: item count is zero or one\n");
|
104
|
+
return 0;
|
105
|
+
}
|
106
|
+
|
107
|
+
|
108
|
+
/* get all items_ids and the total counts */
|
109
|
+
all_items = redisCommand(c,"HGETALL %s:items", redisPrefix);
|
110
|
+
|
111
|
+
if(all_items->type != REDIS_REPLY_ARRAY)
|
112
|
+
return 1;
|
113
|
+
|
114
|
+
|
115
|
+
/* populate the cc_items array */
|
116
|
+
int cc_items_size = all_items->elements / 2;
|
117
|
+
int cc_items_mem = cc_items_size * sizeof(struct cc_item);
|
118
|
+
struct cc_item *cc_items = malloc(cc_items_mem);
|
119
|
+
cc_items_size--;
|
120
|
+
|
121
|
+
if(!cc_items){
|
122
|
+
printf("cannot allocate memory: %i", cc_items_mem);
|
123
|
+
return 1;
|
124
|
+
}
|
125
|
+
|
126
|
+
i = 0;
|
127
|
+
for (j = 0; j < all_items->elements/2; j++){
|
128
|
+
if(strcmp(itemID, all_items->element[j*2]->str) != 0){
|
129
|
+
strncpy(cc_items[i].item_id, all_items->element[j*2]->str, ITEM_ID_SIZE);
|
130
|
+
cc_items[i].total_count = atoi(all_items->element[j*2+1]->str);
|
131
|
+
i++;
|
132
|
+
}
|
133
|
+
}
|
134
|
+
|
135
|
+
freeReplyObject(all_items);
|
136
|
+
|
137
|
+
|
138
|
+
// batched redis hmgets on the ccmatrix
|
139
|
+
cur_batch = (char *)malloc(((batch_size * (ITEM_ID_SIZE + 4) * 2) + 100) * sizeof(char));
|
140
|
+
|
141
|
+
if(!cur_batch){
|
142
|
+
printf("cannot allocate memory");
|
143
|
+
return 1;
|
144
|
+
}
|
145
|
+
|
146
|
+
n = cc_items_size;
|
147
|
+
while(n >= 0){
|
148
|
+
cur_batch_size = ((n-1 < batch_size) ? n-1 : batch_size);
|
149
|
+
sprintf(cur_batch, "HMGET %s:ccmatrix ", redisPrefix);
|
150
|
+
|
151
|
+
for(i = 0; i < cur_batch_size; i++){
|
152
|
+
iikey = item_item_key(itemID, cc_items[n-i].item_id);
|
153
|
+
|
154
|
+
strcat(cur_batch, iikey);
|
155
|
+
strcat(cur_batch, " ");
|
156
|
+
|
157
|
+
if(iikey)
|
158
|
+
free(iikey);
|
159
|
+
}
|
160
|
+
|
161
|
+
redisAppendCommand(c, cur_batch);
|
162
|
+
redisGetReply(c, (void**)&reply);
|
163
|
+
|
164
|
+
for(j = 0; j < reply->elements; j++){
|
165
|
+
if(reply->element[j]->str){
|
166
|
+
cc_items[n-j].coconcurrency_count = atoi(reply->element[j]->str);
|
167
|
+
} else {
|
168
|
+
cc_items[n-j].coconcurrency_count = 0;
|
169
|
+
}
|
170
|
+
}
|
171
|
+
|
172
|
+
freeReplyObject(reply);
|
173
|
+
n -= batch_size;
|
174
|
+
}
|
175
|
+
|
176
|
+
free(cur_batch);
|
177
|
+
|
178
|
+
|
179
|
+
|
180
|
+
/* calculate similarities */
|
181
|
+
if(similarityFunc == 1)
|
182
|
+
calculate_jaccard(itemID, itemCount, cc_items, cc_items_size);
|
183
|
+
|
184
|
+
if(similarityFunc == 2)
|
185
|
+
calculate_cosine(itemID, itemCount, cc_items, cc_items_size);
|
186
|
+
|
187
|
+
|
188
|
+
/* find the top x items with simple bubble sort */
|
189
|
+
for(i = 0; i < maxItems - 1; ++i){
|
190
|
+
for (j = 0; j < cc_items_size - i - 1; ++j){
|
191
|
+
if (cc_items[j].similarity > cc_items[j + 1].similarity){
|
192
|
+
struct cc_item tmp = cc_items[j];
|
193
|
+
cc_items[j] = cc_items[j + 1];
|
194
|
+
cc_items[j + 1] = tmp;
|
195
|
+
}
|
196
|
+
}
|
197
|
+
}
|
198
|
+
|
199
|
+
|
200
|
+
/* print top k items */
|
201
|
+
n = ((cc_items_size < maxItems) ? cc_items_size : maxItems);
|
202
|
+
for(j = 0; j < n; j++){
|
203
|
+
i = cc_items_size-j-1;
|
204
|
+
if(cc_items[i].similarity > 0){
|
205
|
+
print_item(cc_items[i]);
|
206
|
+
}
|
207
|
+
}
|
208
|
+
|
209
|
+
|
210
|
+
free(cc_items);
|
211
|
+
return 0;
|
212
|
+
}
|
213
|
+
|
214
|
+
|
data/ext/sort.c
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
/* Returns the smaller of the two integers (i1 when they are equal). */
int lesser(int i1, int i2){
  return (i2 < i1) ? i2 : i1;
}
|
8
|
+
|
9
|
+
/*
 * Three-way comparison of two NUL-terminated strings: the bytes of the
 * common prefix are compared with memcmp(); when they are all equal the
 * longer string sorts last.
 *
 * Returns -1, 0 or 1.
 */
int rb_strcmp(char *str1, char *str2){
  size_t len1 = strlen(str1);
  size_t len2 = strlen(str2);
  long shared = lesser(len1, len2);
  int order = memcmp(str1, str2, shared);

  if (order != 0)
    return (order > 0) ? 1 : -1;

  if (len1 == len2)
    return 0;

  return (len1 > len2) ? 1 : -1;
}
|
data/ext/version.h
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
#ifndef VERSION_H
#define VERSION_H

/* Version numbers substituted into VERSION_STRING by print_version(). */
#define VERSION_MAJOR 0
#define VERSION_MINOR 0
#define VERSION_MICRO 1

/* printf format for the --version banner; takes major, minor, micro. */
#define VERSION_STRING "recommendify_native %i.%i.%i\n" \
                       "\n" \
                       "Copyright © 2012\n" \
                       "  Paul Asmuth <paul@paulasmuth.com>\n"

/*
 * printf format for the usage line; takes the program name. main() needs
 * the redis key and the item id and accepts an optional redis location as
 * fourth argument.
 */
#define USAGE_STRING "usage: %s " \
                     "{--version|--jaccard|--cosine} " \
                     "<redis_key> <item_id> [redis_host[:port]]\n"

#endif
|
data/lib/recommendify.rb
ADDED
@@ -0,0 +1,9 @@
|
|
1
|
+
require "recommendify/recommendify"
|
2
|
+
require "recommendify/sparse_matrix"
|
3
|
+
require "recommendify/cc_matrix"
|
4
|
+
require "recommendify/similarity_matrix"
|
5
|
+
require "recommendify/input_matrix"
|
6
|
+
require "recommendify/jaccard_input_matrix"
|
7
|
+
require "recommendify/cosine_input_matrix"
|
8
|
+
require "recommendify/base"
|
9
|
+
require "recommendify/neighbor"
|
@@ -0,0 +1,86 @@
|
|
1
|
+
# Base class for recommenders. Input matrices are declared on the class
# with +input_matrix+; every instance builds one Recommendify::InputMatrix
# per declaration plus a Recommendify::SimilarityMatrix stored under the
# :similarities key. Each input matrix is reachable through a reader named
# after its key (see +method_missing+).
class Recommendify::Base

  attr_reader :similarity_matrix, :input_matrices

  # NOTE: these are class variables and therefore shared by Base and all
  # of its subclasses.
  @@max_neighbors = nil
  @@input_matrices = {}

  # Without an argument returns the configured neighbor limit (nil when
  # none was set); with an argument stores +n+ as the new limit.
  def self.max_neighbors(n=nil)
    return @@max_neighbors unless n
    @@max_neighbors = n
  end

  # Declares an input matrix named +key+, created with the options +opts+.
  def self.input_matrix(key, opts)
    @@input_matrices[key] = opts
  end

  # Returns the declared input matrices as a { key => opts } hash.
  def self.input_matrices
    @@input_matrices
  end

  def initialize
    @input_matrices = Hash[self.class.input_matrices.map{ |key, opts|
      # merge into a copy: the declared options live on the class and must
      # not be modified by creating an instance
      matrix_opts = opts.merge(:key => key, :redis_prefix => redis_prefix)
      [ key, Recommendify::InputMatrix.create(matrix_opts) ]
    }]
    @similarity_matrix = Recommendify::SimilarityMatrix.new(
      :max_neighbors => max_neighbors,
      :key => :similarities,
      :redis_prefix => redis_prefix
    )
  end

  # Prefix handed to every matrix as :redis_prefix.
  def redis_prefix
    "recommendify"
  end

  # Neighbor limit handed to the similarity matrix: the class-level setting
  # or Recommendify::DEFAULT_MAX_NEIGHBORS when none was configured.
  def max_neighbors
    self.class.max_neighbors || Recommendify::DEFAULT_MAX_NEIGHBORS
  end

  # Exposes every input matrix through a reader named after its key; all
  # other unknown methods raise the regular NoMethodError via +super+.
  def method_missing(method, *args, &block)
    if @input_matrices && @input_matrices.has_key?(method)
      @input_matrices[method]
    else
      super
    end
  end

  # Also answers true for the input matrix readers served by
  # +method_missing+. Accepts the optional second argument of
  # Object#respond_to? so two-argument calls keep working.
  def respond_to?(method, include_all=false)
    return true if @input_matrices && @input_matrices.has_key?(method)
    super
  end

  # Returns the unique item ids known to any of the input matrices.
  def all_items
    @input_matrices.map{ |_key, matrix| matrix.all_items }.flatten.uniq
  end

  # Returns the stored neighbors of +item_id+ as a sorted array of
  # Recommendify::Neighbor objects.
  def for(item_id)
    similarity_matrix[item_id].map do |neighbor_id, similarity|
      Recommendify::Neighbor.new(
        :item_id => neighbor_id,
        :similarity => similarity
      )
    end.sort
  end

  # Runs +process_item!+ for every known item.
  def process!
    all_items.each{ |item_id| process_item!(item_id) }
  end

  # Feeds the similarities of +item_id+ from every input matrix, scaled by
  # that matrix's weight, into the similarity matrix and then commits the
  # item. Returns the result of +commit_item!+.
  def process_item!(item_id)
    input_matrices.each do |_key, matrix|
      neighbors = matrix.similarities_for(item_id).map do |neighbor_id, weight|
        [neighbor_id, weight * matrix.weight]
      end
      similarity_matrix.update(item_id, neighbors)
    end
    similarity_matrix.commit_item!(item_id)
  end

  # Deletes +item_id+ from every input matrix and returns the array of the
  # individual +delete_item+ results.
  def delete_item!(item_id)
    input_matrices.map do |_key, matrix|
      matrix.delete_item(item_id)
    end
  end

end
|