recommendify-ruby 0.3.8

Sign up to get free protection for your applications and to get access to all the features.
data/ext/cc_item.h ADDED
@@ -0,0 +1,8 @@
1
#ifndef CC_ITEM_H
#define CC_ITEM_H

/* Size in bytes of the item_id buffer, including the terminating NUL. */
#define ITEM_ID_SIZE 64

/*
 * One candidate neighbor of the item being processed.
 *
 * item_id             - NUL-terminated id of the candidate item
 * coconcurrency_count - value read from the "<prefix>:ccmatrix" hash for the
 *                       (processed item, candidate) pair
 * total_count         - value read from the "<prefix>:items" hash for the
 *                       candidate
 * similarity          - score written by the calculate_* functions
 */
struct cc_item {
  char  item_id[ITEM_ID_SIZE];
  int   coconcurrency_count;
  int   total_count;
  float similarity;
};

#endif
data/ext/cosine.c ADDED
@@ -0,0 +1,3 @@
1
+ void calculate_cosine(char *item_id, int itemCount, struct cc_item *cc_items, int cc_items_size){
2
+ /* here be dragons */
3
+ }
data/ext/extconf.rb ADDED
@@ -0,0 +1,18 @@
1
# Writes a Makefile next to this script. The generated Makefile compiles
# recommendify.c against hiredis into ../bin/recommendify.
# Recipe lines inside the heredoc must start with a tab character.
makefile_path = ::File.expand_path("../Makefile", __FILE__)

makefile_source = <<-MAKEFILE
all: prepare build

build:
	gcc -Wall recommendify.c -lhiredis -o ../bin/recommendify

prepare:
	mkdir -p ../bin

clean:
	rm -f *.o

install: prepare
MAKEFILE

File.open(makefile_path, "w+") { |io| io.write(makefile_source) }
data/ext/iikey.c ADDED
@@ -0,0 +1,18 @@
1
/*
 * Builds the key for a pair of item ids: "<a>:<b>", where <a> is whichever id
 * rb_strcmp orders first, so both argument orders produce the same key.
 *
 * Returns a malloc'd, NUL-terminated string that the caller must free, or 0
 * (after printing a message) when the allocation fails.
 */
char* item_item_key(char *item1, char *item2){
  /* +2: one byte for the ':' separator, one for the terminating NUL */
  int size = strlen(item1) + strlen(item2) + 2;
  char *first = item1;
  char *second = item2;
  char *buf = (char *)malloc(size * sizeof(char));

  if(buf == 0){
    printf("cannot allocate\n");
    return 0;
  }

  // FIXPAUL: make sure this does exactly the same as ruby sort
  if(rb_strcmp(item1, item2) > 0){
    first = item2;
    second = item1;
  }

  snprintf(buf, size, "%s:%s", first, second);
  return buf;
}
data/ext/jaccard.c ADDED
@@ -0,0 +1,19 @@
1
+ void calculate_jaccard(char *item_id, int itemCount, struct cc_item *cc_items, int cc_items_size){
2
+ int j, n;
3
+
4
+ for(j = 0; j < cc_items_size; j++){
5
+ n = cc_items[j].coconcurrency_count;
6
+ if(n>0){
7
+ cc_items[j].similarity = (
8
+ (float)n / (
9
+ (float)itemCount +
10
+ (float)cc_items[j].total_count -
11
+ (float)n
12
+ )
13
+ );
14
+ } else {
15
+ cc_items[j].similarity = 0.0;
16
+ }
17
+ }
18
+
19
+ }
data/ext/output.c ADDED
@@ -0,0 +1,22 @@
1
+ int print_version(){
2
+ printf(
3
+ VERSION_STRING,
4
+ VERSION_MAJOR,
5
+ VERSION_MINOR,
6
+ VERSION_MICRO
7
+ );
8
+ return 0;
9
+ }
10
+
11
+ int print_usage(char *bin){
12
+ printf(USAGE_STRING, bin);
13
+ return 1;
14
+ }
15
+
16
+ void print_item(struct cc_item item){
17
+ printf(
18
+ "OUT: (%s) (%.4f)\n",
19
+ item.item_id,
20
+ item.similarity
21
+ );
22
+ }
@@ -0,0 +1,214 @@
1
+ #include <stdio.h>
2
+ #include <string.h>
3
+ #include <stdlib.h>
4
+ #include <hiredis/hiredis.h>
5
+
6
+ #include "version.h"
7
+ #include "cc_item.h"
8
+ #include "jaccard.c"
9
+ #include "cosine.c"
10
+ #include "output.c"
11
+ #include "sort.c"
12
+ #include "iikey.c"
13
+
14
+
15
+ int main(int argc, char **argv){
16
+ int i, j, n, similarityFunc = 0;
17
+ int itemCount = 0;
18
+ char *itemID;
19
+ char *redisPrefix;
20
+ redisContext *c;
21
+ redisReply *all_items;
22
+ redisReply *reply;
23
+ int cur_batch_size;
24
+ char* cur_batch;
25
+ char *iikey;
26
+
27
+ int batch_size = 200; /* FIXPAUL: make option */
28
+ int maxItems = 50; /* FIXPAUL: make option */
29
+
30
+ struct {
31
+ char host[1024];
32
+ int port;
33
+ } redis_addr;
34
+
35
+ /* option parsing */
36
+ if(argc < 2)
37
+ return print_usage(argv[0]);
38
+
39
+ if(!strcmp(argv[1], "--version"))
40
+ return print_version();
41
+
42
+ if(!strcmp(argv[1], "--jaccard"))
43
+ similarityFunc = 1;
44
+
45
+ if(!strcmp(argv[1], "--cosine"))
46
+ similarityFunc = 2;
47
+
48
+ if(!similarityFunc){
49
+ printf("invalid option: %s\n", argv[1]);
50
+ return 1;
51
+ } else if(argc < 4 || argc > 5){
52
+ printf("wrong number of arguments\n");
53
+ print_usage(argv[0]);
54
+ return 1;
55
+ }
56
+
57
+ redisPrefix = argv[2];
58
+ itemID = argv[3];
59
+ redis_addr.host[0] = 0;
60
+ redis_addr.port = 0;
61
+
62
+ /* configure redis location */
63
+ if(argc > 4){
64
+ char* has_port = strchr(argv[4], ':');
65
+ if(has_port){
66
+ strncpy(redis_addr.host, argv[4], strlen(argv[4]) - strlen(has_port));
67
+ redis_addr.host[strlen(argv[4]) - strlen(has_port)] = 0;
68
+ redis_addr.port = atoi(has_port + 1);
69
+ } else {
70
+ strncpy(redis_addr.host, argv[4], sizeof(redis_addr.host));
71
+ }
72
+ }
73
+
74
+ /* default redis location */
75
+ if(strlen(redis_addr.host) == 0)
76
+ strcpy(redis_addr.host, "localhost");
77
+
78
+ if(!redis_addr.port)
79
+ redis_addr.port = 6379;
80
+
81
+ /* connect to redis */
82
+ struct timeval timeout = { 1, 500000 };
83
+ c = redisConnectWithTimeout(redis_addr.host, redis_addr.port, timeout);
84
+
85
+ if(c->err){
86
+ printf("connection to redis failed: %s\n", c->errstr);
87
+ return 1;
88
+ }
89
+
90
+
91
+ /* get item count */
92
+ reply = redisCommand(c,"HGET %s:items %s", redisPrefix, itemID);
93
+
94
+ if(reply->str){
95
+ itemCount = atoi(reply->str);
96
+ } else {
97
+ itemCount = 0;
98
+ }
99
+
100
+ freeReplyObject(reply);
101
+
102
+ if(itemCount < 2){
103
+ printf("exit: item count is zero or one\n");
104
+ return 0;
105
+ }
106
+
107
+
108
+ /* get all items_ids and the total counts */
109
+ all_items = redisCommand(c,"HGETALL %s:items", redisPrefix);
110
+
111
+ if(all_items->type != REDIS_REPLY_ARRAY)
112
+ return 1;
113
+
114
+
115
+ /* populate the cc_items array */
116
+ int cc_items_size = all_items->elements / 2;
117
+ int cc_items_mem = cc_items_size * sizeof(struct cc_item);
118
+ struct cc_item *cc_items = malloc(cc_items_mem);
119
+ cc_items_size--;
120
+
121
+ if(!cc_items){
122
+ printf("cannot allocate memory: %i", cc_items_mem);
123
+ return 1;
124
+ }
125
+
126
+ i = 0;
127
+ for (j = 0; j < all_items->elements/2; j++){
128
+ if(strcmp(itemID, all_items->element[j*2]->str) != 0){
129
+ strncpy(cc_items[i].item_id, all_items->element[j*2]->str, ITEM_ID_SIZE);
130
+ cc_items[i].total_count = atoi(all_items->element[j*2+1]->str);
131
+ i++;
132
+ }
133
+ }
134
+
135
+ freeReplyObject(all_items);
136
+
137
+
138
+ // batched redis hmgets on the ccmatrix
139
+ cur_batch = (char *)malloc(((batch_size * (ITEM_ID_SIZE + 4) * 2) + 100) * sizeof(char));
140
+
141
+ if(!cur_batch){
142
+ printf("cannot allocate memory");
143
+ return 1;
144
+ }
145
+
146
+ n = cc_items_size;
147
+ while(n >= 0){
148
+ cur_batch_size = ((n-1 < batch_size) ? n-1 : batch_size);
149
+ sprintf(cur_batch, "HMGET %s:ccmatrix ", redisPrefix);
150
+
151
+ for(i = 0; i < cur_batch_size; i++){
152
+ iikey = item_item_key(itemID, cc_items[n-i].item_id);
153
+
154
+ strcat(cur_batch, iikey);
155
+ strcat(cur_batch, " ");
156
+
157
+ if(iikey)
158
+ free(iikey);
159
+ }
160
+
161
+ redisAppendCommand(c, cur_batch);
162
+ redisGetReply(c, (void**)&reply);
163
+
164
+ for(j = 0; j < reply->elements; j++){
165
+ if(reply->element[j]->str){
166
+ cc_items[n-j].coconcurrency_count = atoi(reply->element[j]->str);
167
+ } else {
168
+ cc_items[n-j].coconcurrency_count = 0;
169
+ }
170
+ }
171
+
172
+ freeReplyObject(reply);
173
+ n -= batch_size;
174
+ }
175
+
176
+ free(cur_batch);
177
+
178
+
179
+
180
+ /* calculate similarities */
181
+ if(similarityFunc == 1)
182
+ calculate_jaccard(itemID, itemCount, cc_items, cc_items_size);
183
+
184
+ if(similarityFunc == 2)
185
+ calculate_cosine(itemID, itemCount, cc_items, cc_items_size);
186
+
187
+
188
+ /* find the top x items with simple bubble sort */
189
+ for(i = 0; i < maxItems - 1; ++i){
190
+ for (j = 0; j < cc_items_size - i - 1; ++j){
191
+ if (cc_items[j].similarity > cc_items[j + 1].similarity){
192
+ struct cc_item tmp = cc_items[j];
193
+ cc_items[j] = cc_items[j + 1];
194
+ cc_items[j + 1] = tmp;
195
+ }
196
+ }
197
+ }
198
+
199
+
200
+ /* print top k items */
201
+ n = ((cc_items_size < maxItems) ? cc_items_size : maxItems);
202
+ for(j = 0; j < n; j++){
203
+ i = cc_items_size-j-1;
204
+ if(cc_items[i].similarity > 0){
205
+ print_item(cc_items[i]);
206
+ }
207
+ }
208
+
209
+
210
+ free(cc_items);
211
+ return 0;
212
+ }
213
+
214
+
data/ext/sort.c ADDED
@@ -0,0 +1,23 @@
1
/* Returns the smaller of the two integers (i1 when they are equal). */
int lesser(int i1, int i2){
  return (i1 > i2) ? i2 : i1;
}
8
+
9
/*
 * Three-way comparison of two NUL-terminated strings.
 *
 * The bytes of the common prefix are compared with memcmp; if they are all
 * equal, the shorter string orders first.
 *
 * Returns -1, 0 or 1 when str1 orders before, equal to, or after str2.
 */
int rb_strcmp(char *str1, char *str2){
  /* lengths are computed once and kept as size_t to avoid narrowing */
  size_t len1 = strlen(str1);
  size_t len2 = strlen(str2);
  size_t common = (len1 < len2) ? len1 : len2;
  int retval = memcmp(str1, str2, common);

  if(retval != 0)
    return (retval > 0) ? 1 : -1;

  if(len1 == len2)
    return 0;

  return (len1 > len2) ? 1 : -1;
}
data/ext/version.h ADDED
@@ -0,0 +1,17 @@
1
#ifndef VERSION_H
#define VERSION_H

/* Version components, substituted into VERSION_STRING as %i.%i.%i. */
#define VERSION_MAJOR 0
#define VERSION_MINOR 0
#define VERSION_MICRO 1

/* printf format for --version; expects the three version components. */
#define VERSION_STRING "recommendify_native %i.%i.%i\n" \
  "\n" \
  "Copyright © 2012\n" \
  " Paul Asmuth <paul@paulasmuth.com>\n"

/*
 * printf format for the usage line; expects the program name.
 * The trailing redis address is the optional fourth argument.
 */
#define USAGE_STRING "usage: %s " \
  "{--version|--jaccard|--cosine} " \
  "[redis_key] [item_id] [redis_host[:port]]\n"

#endif
@@ -0,0 +1,9 @@
1
+ require "recommendify/recommendify"
2
+ require "recommendify/sparse_matrix"
3
+ require "recommendify/cc_matrix"
4
+ require "recommendify/similarity_matrix"
5
+ require "recommendify/input_matrix"
6
+ require "recommendify/jaccard_input_matrix"
7
+ require "recommendify/cosine_input_matrix"
8
+ require "recommendify/base"
9
+ require "recommendify/neighbor"
@@ -0,0 +1,86 @@
1
# Base class for a recommender. Subclasses declare their input matrices with
# `input_matrix` and optionally cap the neighbor count with `max_neighbors`.
# Each instance builds one InputMatrix per declaration plus one
# SimilarityMatrix, and exposes every input matrix as a method named after
# its key.
class Recommendify::Base

  attr_reader :similarity_matrix, :input_matrices

  # NOTE: class variables, so these are shared by Base and all subclasses.
  @@max_neighbors = nil
  @@input_matrices = {}

  # Without an argument returns the configured maximum; with one, sets it.
  def self.max_neighbors(n=nil)
    return @@max_neighbors unless n
    @@max_neighbors = n
  end

  # Registers the options for the input matrix named +key+.
  def self.input_matrix(key, opts)
    @@input_matrices[key] = opts
  end

  # Returns the registered { key => opts } declarations.
  def self.input_matrices
    @@input_matrices
  end

  def initialize
    @input_matrices = Hash[self.class.input_matrices.map{ |key, opts|
      # non-destructive merge: the declared opts are shared at class level and
      # must not pick up this instance's key / redis_prefix
      matrix_opts = opts.merge(:key => key, :redis_prefix => redis_prefix)
      [ key, Recommendify::InputMatrix.create(matrix_opts) ]
    }]
    @similarity_matrix = Recommendify::SimilarityMatrix.new(
      :max_neighbors => max_neighbors,
      :key => :similarities,
      :redis_prefix => redis_prefix
    )
  end

  # Prefix passed to every matrix as :redis_prefix.
  def redis_prefix
    "recommendify"
  end

  # Class-level setting, falling back to Recommendify::DEFAULT_MAX_NEIGHBORS.
  def max_neighbors
    self.class.max_neighbors || Recommendify::DEFAULT_MAX_NEIGHBORS
  end

  # Exposes each input matrix as a reader named after its key; anything else
  # goes to the default handler, which raises NoMethodError.
  def method_missing(method, *args, &block)
    if @input_matrices.has_key?(method)
      @input_matrices[method]
    else
      super
    end
  end

  # Accepts the standard optional second argument so that
  # respond_to?(name, true) keeps working.
  def respond_to?(method, include_private = false)
    @input_matrices.has_key?(method) || super
  end

  # Unique item ids across all input matrices.
  def all_items
    @input_matrices.map{ |k,m| m.all_items }.flatten.uniq
  end

  # Returns the entries of similarity_matrix[item_id] wrapped in
  # Recommendify::Neighbor objects and sorted.
  def for(item_id)
    similarity_matrix[item_id].map do |neighbor_id, similarity|
      Recommendify::Neighbor.new(
        :item_id => neighbor_id,
        :similarity => similarity
      )
    end.sort
  end

  # Runs process_item! for every known item.
  def process!
    all_items.each{ |item_id| process_item!(item_id) }
  end

  # Collects the similarities for +item_id+ from every input matrix, scaled
  # by that matrix's weight, into the similarity matrix, then commits the item.
  def process_item!(item_id)
    input_matrices.each do |k,m|
      neighbors = m.similarities_for(item_id).map do |i,w|
        [i,w*m.weight]
      end
      similarity_matrix.update(item_id, neighbors)
    end
    similarity_matrix.commit_item!(item_id)
  end

  # Calls delete_item on every input matrix and returns the array of results.
  def delete_item!(item_id)
    input_matrices.map do |k,m|
      m.delete_item(item_id)
    end
  end

end