recommendify-ruby 0.3.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/ext/cc_item.h ADDED
@@ -0,0 +1,8 @@
1
#ifndef CC_ITEM_H
#define CC_ITEM_H

/* Size in bytes of the fixed item_id buffer inside struct cc_item. */
#define ITEM_ID_SIZE 64

/*
 * One candidate item that is compared against the item being processed.
 *
 *   item_id             - id of the candidate item, copied from the redis
 *                         "<prefix>:items" hash by main()
 *   coconcurrency_count - value read from the "<prefix>:ccmatrix" hash for
 *                         the (processed item, candidate item) key
 *   total_count         - value stored for the candidate in "<prefix>:items"
 *   similarity          - score written by calculate_jaccard() /
 *                         calculate_cosine()
 */
struct cc_item {
  char  item_id[ITEM_ID_SIZE];
  int   coconcurrency_count;
  int   total_count;
  float similarity;
};

#endif
data/ext/cosine.c ADDED
@@ -0,0 +1,3 @@
1
+ void calculate_cosine(char *item_id, int itemCount, struct cc_item *cc_items, int cc_items_size){
2
+ /* here be dragons */
3
+ }
data/ext/extconf.rb ADDED
@@ -0,0 +1,18 @@
1
# extconf.rb: writes the heredoc below verbatim to a file named "Makefile"
# in the same directory as this script (File.expand_path("../Makefile",
# __FILE__)). The generated "build" target compiles recommendify.c with gcc,
# links against hiredis and places the binary at ../bin/recommendify; the
# "prepare" target creates ../bin first.
# NOTE(review): make requires recipe lines to start with a tab; the original
# indentation is not visible in this rendering, so verify it in the real file.
+ makefile = <<-MAKEFILE
2
+ all: prepare build
3
+
4
+ build:
5
+ gcc -Wall recommendify.c -lhiredis -o ../bin/recommendify
6
+
7
+ prepare:
8
+ mkdir -p ../bin
9
+
10
+ clean:
11
+ rm -f *.o
12
+
13
+ install: prepare
14
+ MAKEFILE
15
+
16
+ File.open(::File.expand_path("../Makefile", __FILE__), "w+") do |f|
17
+ f.write(makefile)
18
+ end
data/ext/iikey.c ADDED
@@ -0,0 +1,18 @@
1
/*
 * Builds the "<a>:<b>" key for a pair of item ids, where <a> is whichever
 * id rb_strcmp() orders first, so both argument orders yield the same key.
 *
 * Returns a freshly malloc'd string that the caller must free(), or 0 when
 * the allocation fails (after printing "cannot allocate").
 */
char* item_item_key(char *item1, char *item2){
  char *first = item1;
  char *second = item2;
  /* both ids, the ':' separator and the terminating NUL */
  int size = strlen(item1) + strlen(item2) + 2;
  char *buf = (char *)malloc(size * sizeof(char));

  if(buf == 0){
    printf("cannot allocate\n");
    return 0;
  }

  // FIXPAUL: make sure this does exactly the same as ruby sort
  if(rb_strcmp(item1, item2) > 0){
    first = item2;
    second = item1;
  }

  snprintf(buf, size, "%s:%s", first, second);
  return buf;
}
data/ext/jaccard.c ADDED
@@ -0,0 +1,19 @@
1
+ void calculate_jaccard(char *item_id, int itemCount, struct cc_item *cc_items, int cc_items_size){
2
+ int j, n;
3
+
4
+ for(j = 0; j < cc_items_size; j++){
5
+ n = cc_items[j].coconcurrency_count;
6
+ if(n>0){
7
+ cc_items[j].similarity = (
8
+ (float)n / (
9
+ (float)itemCount +
10
+ (float)cc_items[j].total_count -
11
+ (float)n
12
+ )
13
+ );
14
+ } else {
15
+ cc_items[j].similarity = 0.0;
16
+ }
17
+ }
18
+
19
+ }
data/ext/output.c ADDED
@@ -0,0 +1,22 @@
1
+ int print_version(){
2
+ printf(
3
+ VERSION_STRING,
4
+ VERSION_MAJOR,
5
+ VERSION_MINOR,
6
+ VERSION_MICRO
7
+ );
8
+ return 0;
9
+ }
10
+
11
+ int print_usage(char *bin){
12
+ printf(USAGE_STRING, bin);
13
+ return 1;
14
+ }
15
+
16
+ void print_item(struct cc_item item){
17
+ printf(
18
+ "OUT: (%s) (%.4f)\n",
19
+ item.item_id,
20
+ item.similarity
21
+ );
22
+ }
@@ -0,0 +1,214 @@
1
+ #include <stdio.h>
2
+ #include <string.h>
3
+ #include <stdlib.h>
4
+ #include <hiredis/hiredis.h>
5
+
6
+ #include "version.h"
7
+ #include "cc_item.h"
8
+ #include "jaccard.c"
9
+ #include "cosine.c"
10
+ #include "output.c"
11
+ #include "sort.c"
12
+ #include "iikey.c"
13
+
14
+
15
+ int main(int argc, char **argv){
16
+ int i, j, n, similarityFunc = 0;
17
+ int itemCount = 0;
18
+ char *itemID;
19
+ char *redisPrefix;
20
+ redisContext *c;
21
+ redisReply *all_items;
22
+ redisReply *reply;
23
+ int cur_batch_size;
24
+ char* cur_batch;
25
+ char *iikey;
26
+
27
+ int batch_size = 200; /* FIXPAUL: make option */
28
+ int maxItems = 50; /* FIXPAUL: make option */
29
+
30
+ struct {
31
+ char host[1024];
32
+ int port;
33
+ } redis_addr;
34
+
35
+ /* option parsing */
36
+ if(argc < 2)
37
+ return print_usage(argv[0]);
38
+
39
+ if(!strcmp(argv[1], "--version"))
40
+ return print_version();
41
+
42
+ if(!strcmp(argv[1], "--jaccard"))
43
+ similarityFunc = 1;
44
+
45
+ if(!strcmp(argv[1], "--cosine"))
46
+ similarityFunc = 2;
47
+
48
+ if(!similarityFunc){
49
+ printf("invalid option: %s\n", argv[1]);
50
+ return 1;
51
+ } else if(argc < 4 || argc > 5){
52
+ printf("wrong number of arguments\n");
53
+ print_usage(argv[0]);
54
+ return 1;
55
+ }
56
+
57
+ redisPrefix = argv[2];
58
+ itemID = argv[3];
59
+ redis_addr.host[0] = 0;
60
+ redis_addr.port = 0;
61
+
62
+ /* configure redis location */
63
+ if(argc > 4){
64
+ char* has_port = strchr(argv[4], ':');
65
+ if(has_port){
66
+ strncpy(redis_addr.host, argv[4], strlen(argv[4]) - strlen(has_port));
67
+ redis_addr.host[strlen(argv[4]) - strlen(has_port)] = 0;
68
+ redis_addr.port = atoi(has_port + 1);
69
+ } else {
70
+ strncpy(redis_addr.host, argv[4], sizeof(redis_addr.host));
71
+ }
72
+ }
73
+
74
+ /* default redis location */
75
+ if(strlen(redis_addr.host) == 0)
76
+ strcpy(redis_addr.host, "localhost");
77
+
78
+ if(!redis_addr.port)
79
+ redis_addr.port = 6379;
80
+
81
+ /* connect to redis */
82
+ struct timeval timeout = { 1, 500000 };
83
+ c = redisConnectWithTimeout(redis_addr.host, redis_addr.port, timeout);
84
+
85
+ if(c->err){
86
+ printf("connection to redis failed: %s\n", c->errstr);
87
+ return 1;
88
+ }
89
+
90
+
91
+ /* get item count */
92
+ reply = redisCommand(c,"HGET %s:items %s", redisPrefix, itemID);
93
+
94
+ if(reply->str){
95
+ itemCount = atoi(reply->str);
96
+ } else {
97
+ itemCount = 0;
98
+ }
99
+
100
+ freeReplyObject(reply);
101
+
102
+ if(itemCount < 2){
103
+ printf("exit: item count is zero or one\n");
104
+ return 0;
105
+ }
106
+
107
+
108
+ /* get all items_ids and the total counts */
109
+ all_items = redisCommand(c,"HGETALL %s:items", redisPrefix);
110
+
111
+ if(all_items->type != REDIS_REPLY_ARRAY)
112
+ return 1;
113
+
114
+
115
+ /* populate the cc_items array */
116
+ int cc_items_size = all_items->elements / 2;
117
+ int cc_items_mem = cc_items_size * sizeof(struct cc_item);
118
+ struct cc_item *cc_items = malloc(cc_items_mem);
119
+ cc_items_size--;
120
+
121
+ if(!cc_items){
122
+ printf("cannot allocate memory: %i", cc_items_mem);
123
+ return 1;
124
+ }
125
+
126
+ i = 0;
127
+ for (j = 0; j < all_items->elements/2; j++){
128
+ if(strcmp(itemID, all_items->element[j*2]->str) != 0){
129
+ strncpy(cc_items[i].item_id, all_items->element[j*2]->str, ITEM_ID_SIZE);
130
+ cc_items[i].total_count = atoi(all_items->element[j*2+1]->str);
131
+ i++;
132
+ }
133
+ }
134
+
135
+ freeReplyObject(all_items);
136
+
137
+
138
+ // batched redis hmgets on the ccmatrix
139
+ cur_batch = (char *)malloc(((batch_size * (ITEM_ID_SIZE + 4) * 2) + 100) * sizeof(char));
140
+
141
+ if(!cur_batch){
142
+ printf("cannot allocate memory");
143
+ return 1;
144
+ }
145
+
146
+ n = cc_items_size;
147
+ while(n >= 0){
148
+ cur_batch_size = ((n-1 < batch_size) ? n-1 : batch_size);
149
+ sprintf(cur_batch, "HMGET %s:ccmatrix ", redisPrefix);
150
+
151
+ for(i = 0; i < cur_batch_size; i++){
152
+ iikey = item_item_key(itemID, cc_items[n-i].item_id);
153
+
154
+ strcat(cur_batch, iikey);
155
+ strcat(cur_batch, " ");
156
+
157
+ if(iikey)
158
+ free(iikey);
159
+ }
160
+
161
+ redisAppendCommand(c, cur_batch);
162
+ redisGetReply(c, (void**)&reply);
163
+
164
+ for(j = 0; j < reply->elements; j++){
165
+ if(reply->element[j]->str){
166
+ cc_items[n-j].coconcurrency_count = atoi(reply->element[j]->str);
167
+ } else {
168
+ cc_items[n-j].coconcurrency_count = 0;
169
+ }
170
+ }
171
+
172
+ freeReplyObject(reply);
173
+ n -= batch_size;
174
+ }
175
+
176
+ free(cur_batch);
177
+
178
+
179
+
180
+ /* calculate similarities */
181
+ if(similarityFunc == 1)
182
+ calculate_jaccard(itemID, itemCount, cc_items, cc_items_size);
183
+
184
+ if(similarityFunc == 2)
185
+ calculate_cosine(itemID, itemCount, cc_items, cc_items_size);
186
+
187
+
188
+ /* find the top x items with simple bubble sort */
189
+ for(i = 0; i < maxItems - 1; ++i){
190
+ for (j = 0; j < cc_items_size - i - 1; ++j){
191
+ if (cc_items[j].similarity > cc_items[j + 1].similarity){
192
+ struct cc_item tmp = cc_items[j];
193
+ cc_items[j] = cc_items[j + 1];
194
+ cc_items[j + 1] = tmp;
195
+ }
196
+ }
197
+ }
198
+
199
+
200
+ /* print top k items */
201
+ n = ((cc_items_size < maxItems) ? cc_items_size : maxItems);
202
+ for(j = 0; j < n; j++){
203
+ i = cc_items_size-j-1;
204
+ if(cc_items[i].similarity > 0){
205
+ print_item(cc_items[i]);
206
+ }
207
+ }
208
+
209
+
210
+ free(cc_items);
211
+ return 0;
212
+ }
213
+
214
+
data/ext/sort.c ADDED
@@ -0,0 +1,23 @@
1
/* Returns the smaller of the two integers (i1 when they are equal). */
int lesser(int i1, int i2){
  return (i1 > i2) ? i2 : i1;
}
8
+
9
/*
 * Three-way comparison of two NUL-terminated strings: the bytes of the
 * common prefix are compared with memcmp(); if they are equal, the shorter
 * string orders first.
 *
 * Returns -1, 0 or 1.
 */
int rb_strcmp(char *str1, char *str2){
  size_t len1 = strlen(str1);
  size_t len2 = strlen(str2);
  long shared = lesser(len1, len2);
  int cmp = memcmp(str1, str2, shared);

  if(cmp != 0)
    return (cmp > 0) ? 1 : -1;

  /* identical common prefix: decide by length */
  if(len1 == len2)
    return 0;

  return (len1 > len2) ? 1 : -1;
}
data/ext/version.h ADDED
@@ -0,0 +1,17 @@
1
#ifndef VERSION_H
#define VERSION_H

/* Version of the native binary, printed by --version. */
#define VERSION_MAJOR 0
#define VERSION_MINOR 0
#define VERSION_MICRO 1

/* printf format for the version banner; takes major, minor, micro (%i each). */
#define VERSION_STRING "recommendify_native %i.%i.%i\n" \
  "\n" \
  "Copyright © 2012\n" \
  " Paul Asmuth <paul@paulasmuth.com>\n"

/* printf format for the usage line; takes the program name (%s).
   The trailing redis_host[:port] argument is optional. */
#define USAGE_STRING "usage: %s " \
  "{--version|--jaccard|--cosine} " \
  "[redis_key] [item_id] [redis_host[:port]]\n"

#endif
@@ -0,0 +1,9 @@
1
+ require "recommendify/recommendify"
2
+ require "recommendify/sparse_matrix"
3
+ require "recommendify/cc_matrix"
4
+ require "recommendify/similarity_matrix"
5
+ require "recommendify/input_matrix"
6
+ require "recommendify/jaccard_input_matrix"
7
+ require "recommendify/cosine_input_matrix"
8
+ require "recommendify/base"
9
+ require "recommendify/neighbor"
@@ -0,0 +1,86 @@
1
# Base class for recommenders. Input matrices are declared at class level
# with +input_matrix+; every instance builds one Recommendify::InputMatrix
# per declaration plus a Recommendify::SimilarityMatrix into which the
# weighted similarities are merged.
class Recommendify::Base

  attr_reader :similarity_matrix, :input_matrices

  @@max_neighbors = nil
  @@input_matrices = {}

  # Without an argument returns the configured maximum number of neighbors
  # (nil if never set); with an argument stores it.
  def self.max_neighbors(n=nil)
    return @@max_neighbors unless n
    @@max_neighbors = n
  end

  # Declares an input matrix under +key+ with the given option hash.
  def self.input_matrix(key, opts)
    @@input_matrices[key] = opts
  end

  # Returns the hash of declared input matrices (key => opts).
  def self.input_matrices
    @@input_matrices
  end

  # Instantiates every declared input matrix and the similarity matrix.
  def initialize
    @input_matrices = Hash[self.class.input_matrices.map{ |key, opts|
      # merge (not merge!): the declared option hash is shared class-level
      # state and must not be modified by creating an instance
      matrix_opts = opts.merge(:key => key, :redis_prefix => redis_prefix)
      [ key, Recommendify::InputMatrix.create(matrix_opts) ]
    }]
    @similarity_matrix = Recommendify::SimilarityMatrix.new(
      :max_neighbors => max_neighbors,
      :key => :similarities,
      :redis_prefix => redis_prefix
    )
  end

  # Prefix handed to all matrices as :redis_prefix.
  def redis_prefix
    "recommendify"
  end

  # Class-level max_neighbors, or Recommendify::DEFAULT_MAX_NEIGHBORS if unset.
  def max_neighbors
    self.class.max_neighbors || Recommendify::DEFAULT_MAX_NEIGHBORS
  end

  # Exposes each input matrix as a reader named after its key. Any other
  # message is passed to super, which raises the regular NoMethodError.
  def method_missing(method, *args, &block)
    if @input_matrices.has_key?(method)
      @input_matrices[method]
    else
      super
    end
  end

  # Accepts the optional second argument of Object#respond_to? so the
  # standard two-argument call form keeps working.
  def respond_to?(method, include_private = false)
    @input_matrices.has_key?(method) ? true : super
  end

  # Keeps Object#method and friends consistent with method_missing.
  def respond_to_missing?(method, include_private = false)
    @input_matrices.has_key?(method) || super
  end

  # Unique item ids across all input matrices.
  def all_items
    @input_matrices.map{ |k,m| m.all_items }.flatten.uniq
  end

  # Returns the neighbors stored in the similarity matrix for +item_id+ as
  # sorted Recommendify::Neighbor objects (order is defined by Neighbor).
  def for(item_id)
    similarity_matrix[item_id].map do |neighbor_id, similarity|
      Recommendify::Neighbor.new(
        :item_id => neighbor_id,
        :similarity => similarity
      )
    end.sort
  end

  # Runs process_item! for every item returned by all_items.
  def process!
    all_items.each{ |item_id| process_item!(item_id) }
  end

  # Feeds the similarities of +item_id+ from every input matrix, each
  # multiplied by that matrix's weight, into the similarity matrix and then
  # commits the item.
  def process_item!(item_id)
    input_matrices.each do |k,m|
      neighbors = m.similarities_for(item_id).map do |i,w|
        [i,w*m.weight]
      end
      similarity_matrix.update(item_id, neighbors)
    end
    similarity_matrix.commit_item!(item_id)
  end

  # Calls delete_item on every input matrix and returns the results.
  def delete_item!(item_id)
    input_matrices.map do |k,m|
      m.delete_item(item_id)
    end
  end

end