recommendify_whosv 0.5.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/Gemfile +9 -0
- data/README.md +154 -0
- data/Rakefile +18 -0
- data/doc/example.png +0 -0
- data/doc/example.rb +87 -0
- data/doc/example_data.csv +120048 -0
- data/ext/cc_item.h +8 -0
- data/ext/cosine.c +3 -0
- data/ext/extconf.rb +18 -0
- data/ext/iikey.c +18 -0
- data/ext/jaccard.c +19 -0
- data/ext/output.c +22 -0
- data/ext/recommendify.c +214 -0
- data/ext/sort.c +23 -0
- data/ext/version.h +17 -0
- data/lib/recommendify/base.rb +88 -0
- data/lib/recommendify/cc_matrix.rb +51 -0
- data/lib/recommendify/cosine_input_matrix.rb +7 -0
- data/lib/recommendify/input_matrix.rb +52 -0
- data/lib/recommendify/jaccard_input_matrix.rb +62 -0
- data/lib/recommendify/neighbor.rb +19 -0
- data/lib/recommendify/recommendify.rb +25 -0
- data/lib/recommendify/similarity_matrix.rb +63 -0
- data/lib/recommendify/sparse_matrix.rb +53 -0
- data/lib/recommendify.rb +9 -0
- data/recommendify.gemspec +25 -0
- data/spec/base_spec.rb +188 -0
- data/spec/cc_matrix_shared.rb +89 -0
- data/spec/cosine_input_matrix_spec.rb +18 -0
- data/spec/input_matrix_shared.rb +27 -0
- data/spec/input_matrix_spec.rb +29 -0
- data/spec/jaccard_input_matrix_spec.rb +95 -0
- data/spec/recommendify_spec.rb +28 -0
- data/spec/similarity_matrix_spec.rb +93 -0
- data/spec/sparse_matrix_spec.rb +78 -0
- data/spec/spec_helper.rb +42 -0
- metadata +122 -0
data/ext/cc_item.h
ADDED
data/ext/cosine.c
ADDED
data/ext/extconf.rb
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
# Writes a Makefile next to this script. The generated Makefile compiles
# recommendify.c with gcc (linking hiredis) into ../bin/recommendify.
# NOTE(review): recipe lines below are written with a leading tab, as make
# requires; confirm against the packaged file.
makefile_path = ::File.expand_path("../Makefile", __FILE__)

makefile = <<-MAKEFILE
all: prepare build

build:
	gcc -Wall recommendify.c -lhiredis -o ../bin/recommendify

prepare:
	mkdir -p ../bin

clean:
	rm -f *.o

install: prepare
MAKEFILE

File.open(makefile_path, "w+") { |f| f.write(makefile) }
|
data/ext/iikey.c
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
/*
 * Build the "<a>:<b>" key for a pair of item ids, where <a> is whichever id
 * rb_strcmp() orders first (item1 wins ties).
 *
 * Returns a freshly malloc'd, NUL-terminated string that the caller must
 * free(). If the allocation fails, prints "cannot allocate" and returns 0.
 */
char* item_item_key(char *item1, char *item2){
  char *first = item1;
  char *second = item2;
  /* both ids, the ':' separator and the terminating NUL */
  int keylen = strlen(item1) + strlen(item2) + 2;
  char *key = (char *)malloc(keylen * sizeof(char));

  if(key == 0){
    printf("cannot allocate\n");
    return 0;
  }

  // FIXPAUL: make sure this does exactly the same as ruby sort
  if(rb_strcmp(item1, item2) > 0){
    first = item2;
    second = item1;
  }

  snprintf(key, keylen, "%s:%s", first, second);
  return key;
}
|
data/ext/jaccard.c
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
void calculate_jaccard(char *item_id, int itemCount, struct cc_item *cc_items, int cc_items_size){
|
2
|
+
int j, n;
|
3
|
+
|
4
|
+
for(j = 0; j < cc_items_size; j++){
|
5
|
+
n = cc_items[j].coconcurrency_count;
|
6
|
+
if(n>0){
|
7
|
+
cc_items[j].similarity = (
|
8
|
+
(float)n / (
|
9
|
+
(float)itemCount +
|
10
|
+
(float)cc_items[j].total_count -
|
11
|
+
(float)n
|
12
|
+
)
|
13
|
+
);
|
14
|
+
} else {
|
15
|
+
cc_items[j].similarity = 0.0;
|
16
|
+
}
|
17
|
+
}
|
18
|
+
|
19
|
+
}
|
data/ext/output.c
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
int print_version(){
|
2
|
+
printf(
|
3
|
+
VERSION_STRING,
|
4
|
+
VERSION_MAJOR,
|
5
|
+
VERSION_MINOR,
|
6
|
+
VERSION_MICRO
|
7
|
+
);
|
8
|
+
return 0;
|
9
|
+
}
|
10
|
+
|
11
|
+
int print_usage(char *bin){
|
12
|
+
printf(USAGE_STRING, bin);
|
13
|
+
return 1;
|
14
|
+
}
|
15
|
+
|
16
|
+
/*
 * Emit one result line on stdout in the form
 *   OUT: (<item_id>) (<similarity with 4 decimals>)
 */
void print_item(struct cc_item item){
  printf("OUT: (%s) (%.4f)\n", item.item_id, item.similarity);
}
|
data/ext/recommendify.c
ADDED
@@ -0,0 +1,214 @@
|
|
1
|
+
#include <stdio.h>
|
2
|
+
#include <string.h>
|
3
|
+
#include <stdlib.h>
|
4
|
+
#include <hiredis/hiredis.h>
|
5
|
+
|
6
|
+
#include "version.h"
|
7
|
+
#include "cc_item.h"
|
8
|
+
#include "jaccard.c"
|
9
|
+
#include "cosine.c"
|
10
|
+
#include "output.c"
|
11
|
+
#include "sort.c"
|
12
|
+
#include "iikey.c"
|
13
|
+
|
14
|
+
|
15
|
+
int main(int argc, char **argv){
|
16
|
+
int i, j, n, similarityFunc = 0;
|
17
|
+
int itemCount = 0;
|
18
|
+
char *itemID;
|
19
|
+
char *redisPrefix;
|
20
|
+
redisContext *c;
|
21
|
+
redisReply *all_items;
|
22
|
+
redisReply *reply;
|
23
|
+
int cur_batch_size;
|
24
|
+
char* cur_batch;
|
25
|
+
char *iikey;
|
26
|
+
|
27
|
+
int batch_size = 200; /* FIXPAUL: make option */
|
28
|
+
int maxItems = 50; /* FIXPAUL: make option */
|
29
|
+
|
30
|
+
struct {
|
31
|
+
char host[1024];
|
32
|
+
int port;
|
33
|
+
} redis_addr;
|
34
|
+
|
35
|
+
/* option parsing */
|
36
|
+
if(argc < 2)
|
37
|
+
return print_usage(argv[0]);
|
38
|
+
|
39
|
+
if(!strcmp(argv[1], "--version"))
|
40
|
+
return print_version();
|
41
|
+
|
42
|
+
if(!strcmp(argv[1], "--jaccard"))
|
43
|
+
similarityFunc = 1;
|
44
|
+
|
45
|
+
if(!strcmp(argv[1], "--cosine"))
|
46
|
+
similarityFunc = 2;
|
47
|
+
|
48
|
+
if(!similarityFunc){
|
49
|
+
printf("invalid option: %s\n", argv[1]);
|
50
|
+
return 1;
|
51
|
+
} else if(argc < 4 || argc > 5){
|
52
|
+
printf("wrong number of arguments\n");
|
53
|
+
print_usage(argv[0]);
|
54
|
+
return 1;
|
55
|
+
}
|
56
|
+
|
57
|
+
redisPrefix = argv[2];
|
58
|
+
itemID = argv[3];
|
59
|
+
redis_addr.host[0] = 0;
|
60
|
+
redis_addr.port = 0;
|
61
|
+
|
62
|
+
/* configure redis location */
|
63
|
+
if(argc > 4){
|
64
|
+
char* has_port = strchr(argv[4], ':');
|
65
|
+
if(has_port){
|
66
|
+
strncpy(redis_addr.host, argv[4], strlen(argv[4]) - strlen(has_port));
|
67
|
+
redis_addr.host[strlen(argv[4]) - strlen(has_port)] = 0;
|
68
|
+
redis_addr.port = atoi(has_port + 1);
|
69
|
+
} else {
|
70
|
+
strncpy(redis_addr.host, argv[4], sizeof(redis_addr.host));
|
71
|
+
}
|
72
|
+
}
|
73
|
+
|
74
|
+
/* default redis location */
|
75
|
+
if(strlen(redis_addr.host) == 0)
|
76
|
+
strcpy(redis_addr.host, "localhost");
|
77
|
+
|
78
|
+
if(!redis_addr.port)
|
79
|
+
redis_addr.port = 6379;
|
80
|
+
|
81
|
+
/* connect to redis */
|
82
|
+
struct timeval timeout = { 1, 500000 };
|
83
|
+
c = redisConnectWithTimeout(redis_addr.host, redis_addr.port, timeout);
|
84
|
+
|
85
|
+
if(c->err){
|
86
|
+
printf("connection to redis failed: %s\n", c->errstr);
|
87
|
+
return 1;
|
88
|
+
}
|
89
|
+
|
90
|
+
|
91
|
+
/* get item count */
|
92
|
+
reply = redisCommand(c,"HGET %s:items %s", redisPrefix, itemID);
|
93
|
+
|
94
|
+
if(reply->str){
|
95
|
+
itemCount = atoi(reply->str);
|
96
|
+
} else {
|
97
|
+
itemCount = 0;
|
98
|
+
}
|
99
|
+
|
100
|
+
freeReplyObject(reply);
|
101
|
+
|
102
|
+
if(itemCount < 2){
|
103
|
+
printf("exit: item count is zero or one\n");
|
104
|
+
return 0;
|
105
|
+
}
|
106
|
+
|
107
|
+
|
108
|
+
/* get all items_ids and the total counts */
|
109
|
+
all_items = redisCommand(c,"HGETALL %s:items", redisPrefix);
|
110
|
+
|
111
|
+
if(all_items->type != REDIS_REPLY_ARRAY)
|
112
|
+
return 1;
|
113
|
+
|
114
|
+
|
115
|
+
/* populate the cc_items array */
|
116
|
+
int cc_items_size = all_items->elements / 2;
|
117
|
+
int cc_items_mem = cc_items_size * sizeof(struct cc_item);
|
118
|
+
struct cc_item *cc_items = malloc(cc_items_mem);
|
119
|
+
cc_items_size--;
|
120
|
+
|
121
|
+
if(!cc_items){
|
122
|
+
printf("cannot allocate memory: %i", cc_items_mem);
|
123
|
+
return 1;
|
124
|
+
}
|
125
|
+
|
126
|
+
i = 0;
|
127
|
+
for (j = 0; j < all_items->elements/2; j++){
|
128
|
+
if(strcmp(itemID, all_items->element[j*2]->str) != 0){
|
129
|
+
strncpy(cc_items[i].item_id, all_items->element[j*2]->str, ITEM_ID_SIZE);
|
130
|
+
cc_items[i].total_count = atoi(all_items->element[j*2+1]->str);
|
131
|
+
i++;
|
132
|
+
}
|
133
|
+
}
|
134
|
+
|
135
|
+
freeReplyObject(all_items);
|
136
|
+
|
137
|
+
|
138
|
+
// batched redis hmgets on the ccmatrix
|
139
|
+
cur_batch = (char *)malloc(((batch_size * (ITEM_ID_SIZE + 4) * 2) + 100) * sizeof(char));
|
140
|
+
|
141
|
+
if(!cur_batch){
|
142
|
+
printf("cannot allocate memory");
|
143
|
+
return 1;
|
144
|
+
}
|
145
|
+
|
146
|
+
n = cc_items_size;
|
147
|
+
while(n >= 0){
|
148
|
+
cur_batch_size = ((n-1 < batch_size) ? n-1 : batch_size);
|
149
|
+
sprintf(cur_batch, "HMGET %s:ccmatrix ", redisPrefix);
|
150
|
+
|
151
|
+
for(i = 0; i < cur_batch_size; i++){
|
152
|
+
iikey = item_item_key(itemID, cc_items[n-i].item_id);
|
153
|
+
|
154
|
+
strcat(cur_batch, iikey);
|
155
|
+
strcat(cur_batch, " ");
|
156
|
+
|
157
|
+
if(iikey)
|
158
|
+
free(iikey);
|
159
|
+
}
|
160
|
+
|
161
|
+
redisAppendCommand(c, cur_batch);
|
162
|
+
redisGetReply(c, (void**)&reply);
|
163
|
+
|
164
|
+
for(j = 0; j < reply->elements; j++){
|
165
|
+
if(reply->element[j]->str){
|
166
|
+
cc_items[n-j].coconcurrency_count = atoi(reply->element[j]->str);
|
167
|
+
} else {
|
168
|
+
cc_items[n-j].coconcurrency_count = 0;
|
169
|
+
}
|
170
|
+
}
|
171
|
+
|
172
|
+
freeReplyObject(reply);
|
173
|
+
n -= batch_size;
|
174
|
+
}
|
175
|
+
|
176
|
+
free(cur_batch);
|
177
|
+
|
178
|
+
|
179
|
+
|
180
|
+
/* calculate similarities */
|
181
|
+
if(similarityFunc == 1)
|
182
|
+
calculate_jaccard(itemID, itemCount, cc_items, cc_items_size);
|
183
|
+
|
184
|
+
if(similarityFunc == 2)
|
185
|
+
calculate_cosine(itemID, itemCount, cc_items, cc_items_size);
|
186
|
+
|
187
|
+
|
188
|
+
/* find the top x items with simple bubble sort */
|
189
|
+
for(i = 0; i < maxItems - 1; ++i){
|
190
|
+
for (j = 0; j < cc_items_size - i - 1; ++j){
|
191
|
+
if (cc_items[j].similarity > cc_items[j + 1].similarity){
|
192
|
+
struct cc_item tmp = cc_items[j];
|
193
|
+
cc_items[j] = cc_items[j + 1];
|
194
|
+
cc_items[j + 1] = tmp;
|
195
|
+
}
|
196
|
+
}
|
197
|
+
}
|
198
|
+
|
199
|
+
|
200
|
+
/* print top k items */
|
201
|
+
n = ((cc_items_size < maxItems) ? cc_items_size : maxItems);
|
202
|
+
for(j = 0; j < n; j++){
|
203
|
+
i = cc_items_size-j-1;
|
204
|
+
if(cc_items[i].similarity > 0){
|
205
|
+
print_item(cc_items[i]);
|
206
|
+
}
|
207
|
+
}
|
208
|
+
|
209
|
+
|
210
|
+
free(cc_items);
|
211
|
+
return 0;
|
212
|
+
}
|
213
|
+
|
214
|
+
|
data/ext/sort.c
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
/* Return the smaller of the two arguments (i1 when they are equal). */
int lesser(int i1, int i2){
  return (i1 > i2) ? i2 : i1;
}

/*
 * Three-way string comparison returning exactly -1, 0 or 1.
 *
 * The bytes of the common prefix are compared with memcmp(); if they are all
 * equal, the shorter string orders first and equal lengths compare as 0.
 */
int rb_strcmp(char *str1, char *str2){
  size_t len1 = strlen(str1);
  size_t len2 = strlen(str2);
  long common = lesser(len1, len2);
  int cmp = memcmp(str1, str2, common);

  if(cmp != 0)
    return (cmp > 0) ? 1 : -1;

  if(len1 == len2)
    return 0;

  return (len1 > len2) ? 1 : -1;
}
|
data/ext/version.h
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
#ifndef VERSION_H
#define VERSION_H

/* version of the native helper binary, printed by --version */
#define VERSION_MAJOR 0
#define VERSION_MINOR 0
#define VERSION_MICRO 1

/* printf format: takes the major, minor and micro numbers (three %i) */
#define VERSION_STRING "recommendify_native %i.%i.%i\n" \
"\n" \
"Copyright © 2012\n" \
"  Paul Asmuth <paul@paulasmuth.com>\n"

/*
 * printf format: takes the program name (one %s).
 * redis_key and item_id are required; the redis location is optional
 * (main() accepts it as a fourth argument, "host" or "host:port").
 */
#define USAGE_STRING "usage: %s " \
"{--version|--jaccard|--cosine} " \
"<redis_key> <item_id> [redis_host[:port]]\n"

#endif
|
@@ -0,0 +1,88 @@
|
|
1
|
+
# Base class for a recommender.
#
# Input matrices are declared at class level with +input_matrix+ and are
# instantiated per object in +initialize+; each one is then reachable as a
# method named after its key. Similarities computed from all input matrices
# are stored in a single Recommendify::SimilarityMatrix.
class Recommendify::Base

  attr_reader :similarity_matrix, :input_matrices
  attr_accessor :redis_prefix_self

  @@max_neighbors = nil
  @@input_matrices = {}

  # Without an argument returns the configured maximum number of neighbors
  # (nil if none was set); with an argument stores and returns it.
  def self.max_neighbors(n=nil)
    return @@max_neighbors unless n
    @@max_neighbors = n
  end

  # Registers the options for the input matrix named +key+.
  def self.input_matrix(key, opts)
    @@input_matrices[key] = opts
  end

  # Returns the registered input matrix options, keyed by matrix name.
  def self.input_matrices
    @@input_matrices
  end

  # opts may contain :redis_prefix to override the default key prefix.
  def initialize(opts = nil)
    @redis_prefix_self = opts[:redis_prefix] if opts && opts[:redis_prefix]
    @input_matrices = Hash[self.class.input_matrices.map{ |key, matrix_opts|
      # merge into a copy: the registered options hash is shared by every
      # instance, so mutating it would leak this instance's redis prefix
      # into matrices that were created earlier from the same hash
      instance_opts = matrix_opts.merge(:key => key, :redis_prefix => redis_prefix)
      [ key, Recommendify::InputMatrix.create(instance_opts) ]
    }]
    @similarity_matrix = Recommendify::SimilarityMatrix.new(
      :max_neighbors => max_neighbors,
      :key => :similarities,
      :redis_prefix => redis_prefix
    )
  end

  # The redis key prefix: the per-instance override or "recommendify".
  def redis_prefix
    @redis_prefix_self || "recommendify"
  end

  # The class-level setting, falling back to the library default.
  def max_neighbors
    self.class.max_neighbors || Recommendify::DEFAULT_MAX_NEIGHBORS
  end

  # Exposes each input matrix as a method named after its key; anything else
  # raises NoMethodError through the default implementation.
  def method_missing(method, *args, &block)
    if @input_matrices && @input_matrices.has_key?(method)
      @input_matrices[method]
    else
      super
    end
  end

  # Keeps respond_to? (with or without the include_private argument) in sync
  # with method_missing.
  def respond_to_missing?(method, include_private = false)
    (@input_matrices && @input_matrices.has_key?(method)) ? true : super
  end

  # Returns the unique item ids known to any of the input matrices.
  def all_items
    @input_matrices.map{ |k,m| m.all_items }.flatten.uniq
  end

  # Returns the stored neighbors of item_id as sorted
  # Recommendify::Neighbor objects.
  def for(item_id)
    similarity_matrix[item_id].map do |neighbor_id, similarity|
      Recommendify::Neighbor.new(
        :item_id => neighbor_id,
        :similarity => similarity
      )
    end.sort
  end

  # Recomputes the similarities of every known item.
  def process!
    all_items.each{ |item_id| process_item!(item_id) }
  end

  # Recomputes the similarities of item_id: the weighted similarities from
  # each input matrix are pushed into the similarity matrix, then committed.
  def process_item!(item_id)
    input_matrices.each do |k,m|
      neighbors = m.similarities_for(item_id).map do |i,w|
        [i,w*m.weight]
      end
      similarity_matrix.update(item_id, neighbors)
    end
    similarity_matrix.commit_item!(item_id)
  end

  # Removes item_id from every input matrix.
  def delete_item!(item_id)
    input_matrices.map do |k,m|
      m.delete_item(item_id)
    end
  end

end
|
@@ -0,0 +1,51 @@
|
|
1
|
+
# Mixin for input matrices that keep per-item counters in the redis hash
# redis_key(:items) and pairwise counters in a Recommendify::SparseMatrix.
# The including class must provide @opts and redis_key.
module Recommendify::CCMatrix

  # The lazily created sparse matrix holding the pairwise counters; its key
  # is "<key>:ccmatrix" under the configured redis prefix.
  def ccmatrix
    @ccmatrix ||= begin
      prefix = @opts.fetch(:redis_prefix)
      matrix_key = [@opts.fetch(:key), :ccmatrix].join(":")
      Recommendify::SparseMatrix.new(:redis_prefix => prefix, :key => matrix_key)
    end
  end

  # Counts every item of the set once and every distinct pair of items once.
  def add_set(set_id, item_ids)
    # FIXPAUL: forbid | and : in item_ids
    item_ids.each { |item_id| item_count_incr(item_id) }

    all_pairs(item_ids).map do |pair|
      left, right = pair.split(":")
      ccmatrix.incr(left, right)
    end
  end

  # Counts item_id once and its pairing with each of other_item_ids once.
  def add_single(set_id, item_id, other_item_ids)
    item_count_incr(item_id)
    other_item_ids.each { |other_item| ccmatrix.incr(item_id, other_item) }
  end

  # All item ids that have a counter in the items hash.
  def all_items
    items_key = redis_key(:items)
    Recommendify.redis.hkeys(items_key)
  end

  # Drops the item's counter and its entries in the pairwise matrix.
  def delete_item(item_id)
    items_key = redis_key(:items)
    Recommendify.redis.hdel(items_key, item_id)
    ccmatrix.send(:k_delall, item_id)
  end

private

  # Every distinct unordered pair of keys as a "a:b" string whose two parts
  # are in sorted order.
  def all_pairs(keys)
    keys.uniq.combination(2).map { |pair| pair.sort.join(":") }.uniq
  end

  def item_count_incr(key)
    items_key = redis_key(:items)
    Recommendify.redis.hincrby(items_key, key, 1)
  end

  def item_count(key)
    items_key = redis_key(:items)
    Recommendify.redis.hget(items_key, key).to_i
  end

end
|
@@ -0,0 +1,52 @@
|
|
1
|
+
# Abstract input matrix. Concrete subclasses are named
# "<SimilarityFunc>InputMatrix" and are instantiated through +create+.
class Recommendify::InputMatrix

  # Instantiates the subclass selected by opts[:similarity_func].
  def self.create(opts)
    func_name = Recommendify.capitalize(opts[:similarity_func])
    matrix_class = Recommendify.constantize("#{func_name}InputMatrix".intern)
    matrix_class.new(opts)
  end

  def initialize(opts)
    @opts = opts
  end

  # The redis key of this matrix: prefix, matrix key and the optional
  # suffix(es), joined with ":".
  def redis_key(append=nil)
    parts = [@opts.fetch(:redis_prefix), @opts.fetch(:key), append]
    parts.flatten.compact.join(":")
  end

  # The configured :weight as a float (1.0 when not configured).
  def weight
    configured = @opts[:weight]
    (configured || 1).to_f
  end

  # add a set of item_ids to the matrix
  def add_set(set_id, item_ids)
    raise "implemented in subclass"
  end

  # add a single item to a set of item_ids to the matrix
  def add_single(set_id, item_id, other_item_ids)
    raise "implemented in subclass"
  end

  # calculate the similarity between item1 and item2 (0.0-1.0)
  def similarity(item1, item2)
    raise "implemented in subclass"
  end

  # calculate all similarities to other items in the matrix for item1
  # return => [ ["item23", 0.6], ["item42", 0.23], (...) ]
  def similarities_for(item1)
    raise "implemented in subclass"
  end

  # retrieve all item_ids in the matrix
  # return => [ "item23", "item42", "item17", (...) ]
  def all_items
    raise "implemented in subclass"
  end

  # delete item_id from the matrix
  def delete_item(item_id)
    raise "implemented in subclass"
  end

end
|
@@ -0,0 +1,62 @@
|
|
1
|
+
# Input matrix that scores item pairs with the Jaccard index, computed from
# the counters maintained by Recommendify::CCMatrix. With opts[:native] set,
# similarities_for delegates to the compiled helper binary instead.
class Recommendify::JaccardInputMatrix < Recommendify::InputMatrix

  include Recommendify::CCMatrix

  # Raises if :native is requested but the helper binary is missing.
  def initialize(opts={})
    check_native if opts[:native]
    super(opts)
  end

  # Jaccard similarity of item1 and item2.
  def similarity(item1, item2)
    calculate_jaccard_cached(item1, item2)
  end

  # Returns [[other_item_id, similarity], ...] for item1.
  def similarities_for(item1)
    return run_native(item1) if @opts[:native]
    calculate_similarities(item1)
  end

private

  def calculate_similarities(item1)
    (all_items - [item1]).map do |item2|
      [item2, similarity(item1, item2)]
    end
  end

  # co-occurrences / (count(item1) + count(item2) - co-occurrences)
  def calculate_jaccard_cached(item1, item2)
    val = ccmatrix[item1, item2]
    union = item_count(item1) + item_count(item2) - val
    # an empty union would otherwise produce NaN
    return 0.0 if union == 0
    val.to_f / union.to_f
  end

  def calculate_jaccard(set1, set2)
    union = (set1 + set2).uniq
    # two empty sets have nothing in common; avoid 0/0
    return 0.0 if union.empty?
    (set1&set2).length.to_f / union.length.to_f
  end

  # Runs the helper binary for item_id and parses its "OUT: (id) (sim)"
  # lines. Raises if the process fails or prints an unexpected line that is
  # not accompanied by an "exit:" notice.
  def run_native(item_id)
    # pass the arguments as an argv array so no shell is involved and item
    # ids cannot be interpreted as shell syntax
    command = [native_path, "--jaccard", redis_key.to_s, item_id.to_s, redis_url.to_s]
    res = IO.popen(command) { |io| io.read }
    raise "error: dirty exit (#{$?})" unless $?.success?
    res.split("\n").map do |line|
      sim = line.match(/OUT: \(([^\)]*)\) \(([^\)]*)\)/)
      if sim
        [sim[1], sim[2].to_f]
      else
        raise "error: #{res}" unless (res||"").include?('exit:')
      end
    end.compact
  end

  def check_native
    return true if ::File.exist?(native_path)
    raise "recommendify_native not found - you need to run rake build_native first"
  end

  def native_path
    ::File.expand_path('../../../bin/recommendify', __FILE__)
  end

  def redis_url
    Recommendify.redis.client.location
  end

end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
# A recommended item together with its similarity score. Wraps a hash with
# the keys :item_id and :similarity.
class Recommendify::Neighbor

  def initialize(data)
    @data = data
  end

  # The neighbor's item id, always as a string.
  def item_id
    raw_id = @data.fetch(:item_id)
    raw_id.to_s
  end

  # The similarity score as stored in the wrapped hash.
  def similarity
    @data.fetch(:similarity)
  end

  # Orders neighbors by descending similarity: the operands are swapped, so
  # sorting puts the most similar neighbor first.
  def <=>(other)
    theirs = other.similarity
    theirs <=> similarity
  end

end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
# Top-level namespace: holds the shared redis connection and two small
# string helpers used to resolve input matrix classes.
module Recommendify

  DEFAULT_MAX_NEIGHBORS = 50

  @@redis = nil

  # Sets the redis connection used by all matrices.
  def self.redis=(redis)
    @@redis = redis
  end

  # Returns the configured redis connection; raises if none was set.
  def self.redis
    return @@redis unless @@redis.nil?
    raise "redis not configured! - Recommendify.redis = Redis.new"
  end

  # Returns the string form of str_or_sym with the first character upcased
  # and the rest downcased ("" for empty input).
  def self.capitalize(str_or_sym)
    str = str_or_sym.to_s
    return "" if str.empty?
    str[0].upcase + str[1..-1].downcase
  end

  # Looks up the constant named klass (String or Symbol) directly inside the
  # Recommendify namespace. Raises NameError if there is no such constant or
  # the name is not a valid constant name; the name is never evaluated as
  # code.
  def self.constantize(klass)
    const_get(klass, false)
  end

end
|