recommendify_whosv 0.5.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/ext/cc_item.h ADDED
@@ -0,0 +1,8 @@
1
#ifndef CC_ITEM_H
#define CC_ITEM_H

/* Size in bytes of the fixed item_id buffer, including the terminating NUL. */
#define ITEM_ID_SIZE 64

/*
 * One candidate neighbour of the item being processed.
 *
 *   item_id             - id of the other item (fixed-size buffer)
 *   coconcurrency_count - count stored for the pair (this item, other item)
 *   total_count         - count stored for the other item on its own
 *   similarity          - score written by the calculate_* functions
 */
struct cc_item {
  char item_id[ITEM_ID_SIZE];
  int coconcurrency_count;
  int total_count;
  float similarity;
};

#endif
data/ext/cosine.c ADDED
@@ -0,0 +1,3 @@
1
/*
 * calculate_cosine: placeholder for a cosine similarity calculation.
 *
 * Takes the same arguments as calculate_jaccard, but the body is empty:
 * nothing is read from or written to cc_items, so the similarity field of
 * every entry keeps whatever value it had before the call.
 */
+ void calculate_cosine(char *item_id, int itemCount, struct cc_item *cc_items, int cc_items_size){
2
+ /* here be dragons: not implemented, no field of cc_items is touched */
3
+ }
data/ext/extconf.rb ADDED
@@ -0,0 +1,18 @@
1
# Generates the Makefile used to build the native helper.
#
# The Makefile text is held in a heredoc and written to a file named
# "Makefile" located next to this script (the path is resolved relative to
# __FILE__). Targets contained in the generated text:
#   all     - runs prepare, then build
#   build   - compiles recommendify.c with gcc, links hiredis and writes the
#             binary to ../bin/recommendify
#   prepare - creates ../bin
#   clean   - removes *.o files
#   install - depends on prepare only and has no recipe of its own
+ makefile = <<-MAKEFILE
2
+ all: prepare build
3
+
4
+ build:
5
+ gcc -Wall recommendify.c -lhiredis -o ../bin/recommendify
6
+
7
+ prepare:
8
+ mkdir -p ../bin
9
+
10
+ clean:
11
+ rm -f *.o
12
+
13
+ install: prepare
14
+ MAKEFILE
15
+
16
+ File.open(::File.expand_path("../Makefile", __FILE__), "w+") do |f|
17
+ f.write(makefile)
18
+ end
data/ext/iikey.c ADDED
@@ -0,0 +1,18 @@
1
/*
 * Builds the "<a>:<b>" key for a pair of item ids, where <a> is the id that
 * rb_strcmp orders first (or item1 when both compare equal).
 *
 * Returns a malloc'd, NUL-terminated string that the caller must free, or 0
 * when the allocation fails (after printing "cannot allocate").
 */
char* item_item_key(char *item1, char *item2){
  char *first = item1;
  char *second = item2;
  /* both ids, the ':' separator and the terminating NUL */
  int keylen = strlen(item1) + strlen(item2) + 2;
  char *key = (char *)malloc(keylen * sizeof(char));

  if(key == 0){
    printf("cannot allocate\n");
    return 0;
  }

  // FIXPAUL: make sure this does exactly the same as ruby sort
  if(rb_strcmp(item1, item2) > 0){
    first = item2;
    second = item1;
  }

  snprintf(key, keylen, "%s:%s", first, second);
  return key;
}
data/ext/jaccard.c ADDED
@@ -0,0 +1,19 @@
1
+ void calculate_jaccard(char *item_id, int itemCount, struct cc_item *cc_items, int cc_items_size){
2
+ int j, n;
3
+
4
+ for(j = 0; j < cc_items_size; j++){
5
+ n = cc_items[j].coconcurrency_count;
6
+ if(n>0){
7
+ cc_items[j].similarity = (
8
+ (float)n / (
9
+ (float)itemCount +
10
+ (float)cc_items[j].total_count -
11
+ (float)n
12
+ )
13
+ );
14
+ } else {
15
+ cc_items[j].similarity = 0.0;
16
+ }
17
+ }
18
+
19
+ }
data/ext/output.c ADDED
@@ -0,0 +1,22 @@
1
+ int print_version(){
2
+ printf(
3
+ VERSION_STRING,
4
+ VERSION_MAJOR,
5
+ VERSION_MINOR,
6
+ VERSION_MICRO
7
+ );
8
+ return 0;
9
+ }
10
+
11
+ int print_usage(char *bin){
12
+ printf(USAGE_STRING, bin);
13
+ return 1;
14
+ }
15
+
16
+ void print_item(struct cc_item item){
17
+ printf(
18
+ "OUT: (%s) (%.4f)\n",
19
+ item.item_id,
20
+ item.similarity
21
+ );
22
+ }
@@ -0,0 +1,214 @@
1
+ #include <stdio.h>
2
+ #include <string.h>
3
+ #include <stdlib.h>
4
+ #include <hiredis/hiredis.h>
5
+
6
+ #include "version.h"
7
+ #include "cc_item.h"
8
+ #include "jaccard.c"
9
+ #include "cosine.c"
10
+ #include "output.c"
11
+ #include "sort.c"
12
+ #include "iikey.c"
13
+
14
+
15
+ int main(int argc, char **argv){
16
+ int i, j, n, similarityFunc = 0;
17
+ int itemCount = 0;
18
+ char *itemID;
19
+ char *redisPrefix;
20
+ redisContext *c;
21
+ redisReply *all_items;
22
+ redisReply *reply;
23
+ int cur_batch_size;
24
+ char* cur_batch;
25
+ char *iikey;
26
+
27
+ int batch_size = 200; /* FIXPAUL: make option */
28
+ int maxItems = 50; /* FIXPAUL: make option */
29
+
30
+ struct {
31
+ char host[1024];
32
+ int port;
33
+ } redis_addr;
34
+
35
+ /* option parsing */
36
+ if(argc < 2)
37
+ return print_usage(argv[0]);
38
+
39
+ if(!strcmp(argv[1], "--version"))
40
+ return print_version();
41
+
42
+ if(!strcmp(argv[1], "--jaccard"))
43
+ similarityFunc = 1;
44
+
45
+ if(!strcmp(argv[1], "--cosine"))
46
+ similarityFunc = 2;
47
+
48
+ if(!similarityFunc){
49
+ printf("invalid option: %s\n", argv[1]);
50
+ return 1;
51
+ } else if(argc < 4 || argc > 5){
52
+ printf("wrong number of arguments\n");
53
+ print_usage(argv[0]);
54
+ return 1;
55
+ }
56
+
57
+ redisPrefix = argv[2];
58
+ itemID = argv[3];
59
+ redis_addr.host[0] = 0;
60
+ redis_addr.port = 0;
61
+
62
+ /* configure redis location */
63
+ if(argc > 4){
64
+ char* has_port = strchr(argv[4], ':');
65
+ if(has_port){
66
+ strncpy(redis_addr.host, argv[4], strlen(argv[4]) - strlen(has_port));
67
+ redis_addr.host[strlen(argv[4]) - strlen(has_port)] = 0;
68
+ redis_addr.port = atoi(has_port + 1);
69
+ } else {
70
+ strncpy(redis_addr.host, argv[4], sizeof(redis_addr.host));
71
+ }
72
+ }
73
+
74
+ /* default redis location */
75
+ if(strlen(redis_addr.host) == 0)
76
+ strcpy(redis_addr.host, "localhost");
77
+
78
+ if(!redis_addr.port)
79
+ redis_addr.port = 6379;
80
+
81
+ /* connect to redis */
82
+ struct timeval timeout = { 1, 500000 };
83
+ c = redisConnectWithTimeout(redis_addr.host, redis_addr.port, timeout);
84
+
85
+ if(c->err){
86
+ printf("connection to redis failed: %s\n", c->errstr);
87
+ return 1;
88
+ }
89
+
90
+
91
+ /* get item count */
92
+ reply = redisCommand(c,"HGET %s:items %s", redisPrefix, itemID);
93
+
94
+ if(reply->str){
95
+ itemCount = atoi(reply->str);
96
+ } else {
97
+ itemCount = 0;
98
+ }
99
+
100
+ freeReplyObject(reply);
101
+
102
+ if(itemCount < 2){
103
+ printf("exit: item count is zero or one\n");
104
+ return 0;
105
+ }
106
+
107
+
108
+ /* get all items_ids and the total counts */
109
+ all_items = redisCommand(c,"HGETALL %s:items", redisPrefix);
110
+
111
+ if(all_items->type != REDIS_REPLY_ARRAY)
112
+ return 1;
113
+
114
+
115
+ /* populate the cc_items array */
116
+ int cc_items_size = all_items->elements / 2;
117
+ int cc_items_mem = cc_items_size * sizeof(struct cc_item);
118
+ struct cc_item *cc_items = malloc(cc_items_mem);
119
+ cc_items_size--;
120
+
121
+ if(!cc_items){
122
+ printf("cannot allocate memory: %i", cc_items_mem);
123
+ return 1;
124
+ }
125
+
126
+ i = 0;
127
+ for (j = 0; j < all_items->elements/2; j++){
128
+ if(strcmp(itemID, all_items->element[j*2]->str) != 0){
129
+ strncpy(cc_items[i].item_id, all_items->element[j*2]->str, ITEM_ID_SIZE);
130
+ cc_items[i].total_count = atoi(all_items->element[j*2+1]->str);
131
+ i++;
132
+ }
133
+ }
134
+
135
+ freeReplyObject(all_items);
136
+
137
+
138
+ // batched redis hmgets on the ccmatrix
139
+ cur_batch = (char *)malloc(((batch_size * (ITEM_ID_SIZE + 4) * 2) + 100) * sizeof(char));
140
+
141
+ if(!cur_batch){
142
+ printf("cannot allocate memory");
143
+ return 1;
144
+ }
145
+
146
+ n = cc_items_size;
147
+ while(n >= 0){
148
+ cur_batch_size = ((n-1 < batch_size) ? n-1 : batch_size);
149
+ sprintf(cur_batch, "HMGET %s:ccmatrix ", redisPrefix);
150
+
151
+ for(i = 0; i < cur_batch_size; i++){
152
+ iikey = item_item_key(itemID, cc_items[n-i].item_id);
153
+
154
+ strcat(cur_batch, iikey);
155
+ strcat(cur_batch, " ");
156
+
157
+ if(iikey)
158
+ free(iikey);
159
+ }
160
+
161
+ redisAppendCommand(c, cur_batch);
162
+ redisGetReply(c, (void**)&reply);
163
+
164
+ for(j = 0; j < reply->elements; j++){
165
+ if(reply->element[j]->str){
166
+ cc_items[n-j].coconcurrency_count = atoi(reply->element[j]->str);
167
+ } else {
168
+ cc_items[n-j].coconcurrency_count = 0;
169
+ }
170
+ }
171
+
172
+ freeReplyObject(reply);
173
+ n -= batch_size;
174
+ }
175
+
176
+ free(cur_batch);
177
+
178
+
179
+
180
+ /* calculate similarities */
181
+ if(similarityFunc == 1)
182
+ calculate_jaccard(itemID, itemCount, cc_items, cc_items_size);
183
+
184
+ if(similarityFunc == 2)
185
+ calculate_cosine(itemID, itemCount, cc_items, cc_items_size);
186
+
187
+
188
+ /* find the top x items with simple bubble sort */
189
+ for(i = 0; i < maxItems - 1; ++i){
190
+ for (j = 0; j < cc_items_size - i - 1; ++j){
191
+ if (cc_items[j].similarity > cc_items[j + 1].similarity){
192
+ struct cc_item tmp = cc_items[j];
193
+ cc_items[j] = cc_items[j + 1];
194
+ cc_items[j + 1] = tmp;
195
+ }
196
+ }
197
+ }
198
+
199
+
200
+ /* print top k items */
201
+ n = ((cc_items_size < maxItems) ? cc_items_size : maxItems);
202
+ for(j = 0; j < n; j++){
203
+ i = cc_items_size-j-1;
204
+ if(cc_items[i].similarity > 0){
205
+ print_item(cc_items[i]);
206
+ }
207
+ }
208
+
209
+
210
+ free(cc_items);
211
+ return 0;
212
+ }
213
+
214
+
data/ext/sort.c ADDED
@@ -0,0 +1,23 @@
1
/* Returns the smaller of the two integers (i1 when they are equal). */
int lesser(int i1, int i2){
  return (i1 > i2) ? i2 : i1;
}

/*
 * Three-way string comparison: compares the bytes of the common prefix with
 * memcmp and, when those are equal, orders the shorter string first.
 *
 * Returns -1, 0 or 1. Lengths are kept as size_t and computed once, so they
 * are not narrowed to int on the way to memcmp.
 */
int rb_strcmp(char *str1, char *str2){
  size_t len1 = strlen(str1);
  size_t len2 = strlen(str2);
  size_t common = (len1 < len2) ? len1 : len2;
  int retval = memcmp(str1, str2, common);

  if (retval != 0)
    return (retval > 0) ? 1 : -1;

  if (len1 == len2)
    return 0;

  return (len1 > len2) ? 1 : -1;
}
data/ext/version.h ADDED
@@ -0,0 +1,17 @@
1
#ifndef VERSION_H
#define VERSION_H

/* Version numbers substituted into the three %i fields of VERSION_STRING. */
#define VERSION_MAJOR 0
#define VERSION_MINOR 0
#define VERSION_MICRO 1

/* printf format of the --version banner; takes major, minor, micro. */
#define VERSION_STRING "recommendify_native %i.%i.%i\n" \
  "\n" \
  "Copyright © 2012\n" \
  "  Paul Asmuth <paul@paulasmuth.com>\n"

/*
 * printf format of the usage line; takes the program name. The trailing
 * optional argument is the redis address accepted as the fourth argument.
 */
#define USAGE_STRING "usage: %s " \
  "{--version|--jaccard|--cosine} " \
  "[redis_key] [item_id] [redis_host[:port]]\n"

#endif
@@ -0,0 +1,88 @@
1
# Base class for a recommender: holds one InputMatrix per configured input
# and a SimilarityMatrix that stores the combined neighbours per item.
class Recommendify::Base

  attr_reader :similarity_matrix, :input_matrices
  attr_accessor :redis_prefix_self

  # NOTE(review): these are class variables, so the configuration appears to
  # be shared by Base and all of its subclasses -- confirm this is intended.
  @@max_neighbors = nil
  @@input_matrices = {}

  # Reads the configured neighbour limit, or sets it when n is given.
  def self.max_neighbors(n=nil)
    return @@max_neighbors unless n
    @@max_neighbors = n
  end

  # Registers the options for the input matrix named key.
  def self.input_matrix(key, opts)
    @@input_matrices[key] = opts
  end

  def self.input_matrices
    @@input_matrices
  end

  # opts may carry :redis_prefix to override the default key prefix.
  def initialize(opts = nil)
    @redis_prefix_self = opts[:redis_prefix] if opts && opts[:redis_prefix]
    @input_matrices = Hash[self.class.input_matrices.map{ |key, matrix_opts|
      # merge (not merge!) so the class-level option hashes stay untouched;
      # otherwise every instance would rewrite the hash that the matrices of
      # earlier instances still hold a reference to
      instance_opts = matrix_opts.merge(:key => key, :redis_prefix => redis_prefix)
      [ key, Recommendify::InputMatrix.create(instance_opts) ]
    }]
    @similarity_matrix = Recommendify::SimilarityMatrix.new(
      :max_neighbors => max_neighbors,
      :key => :similarities,
      :redis_prefix => redis_prefix
    )
  end

  def redis_prefix
    @redis_prefix_self || "recommendify"
  end

  def max_neighbors
    self.class.max_neighbors || Recommendify::DEFAULT_MAX_NEIGHBORS
  end

  # Exposes every configured input matrix as a reader named after its key;
  # anything else falls through to the default NoMethodError.
  def method_missing(method, *args, &block)
    if @input_matrices && @input_matrices.has_key?(method)
      @input_matrices[method]
    else
      super
    end
  end

  # Accepts the optional second argument of Object#respond_to?.
  def respond_to?(method, include_private = false)
    return true if @input_matrices && @input_matrices.has_key?(method)
    super(method, include_private)
  end

  def respond_to_missing?(method, include_private = false)
    (@input_matrices && @input_matrices.has_key?(method)) || super
  end

  # All item ids known to any input matrix, without duplicates.
  def all_items
    @input_matrices.map{ |k,m| m.all_items }.flatten.uniq
  end

  # Returns the stored neighbours of item_id as sorted Neighbor objects.
  def for(item_id)
    similarity_matrix[item_id].map do |neighbor_id, similarity|
      Recommendify::Neighbor.new(
        :item_id => neighbor_id,
        :similarity => similarity
      )
    end.sort
  end

  def process!
    all_items.each{ |item_id| process_item!(item_id) }
  end

  # Recomputes the neighbours of item_id from every input matrix, scaling
  # each similarity by the matrix weight, then commits the result.
  def process_item!(item_id)
    input_matrices.each do |k,m|
      neighbors = m.similarities_for(item_id).map do |i,w|
        [i,w*m.weight]
      end
      similarity_matrix.update(item_id, neighbors)
    end
    similarity_matrix.commit_item!(item_id)
  end

  # Removes item_id from every input matrix.
  def delete_item!(item_id)
    input_matrices.map do |k,m|
      m.delete_item(item_id)
    end
  end

end
@@ -0,0 +1,51 @@
1
# Mixin that stores per-item counts in a redis hash and pair counts in a
# SparseMatrix. Expects the including class to provide @opts and redis_key.
module Recommendify::CCMatrix

  def ccmatrix
    @ccmatrix ||= Recommendify::SparseMatrix.new(
      :redis_prefix => @opts.fetch(:redis_prefix),
      :key => [@opts.fetch(:key), :ccmatrix].join(":")
    )
  end

  # Increments the count of every item in item_ids and the pair count of
  # every distinct pair of items.
  def add_set(set_id, item_ids)
    # FIXPAUL: forbid | and : in item_ids
    item_ids.each do |item_id|
      item_count_incr(item_id)
    end
    # pairs are kept as arrays; joining them with ":" and splitting again
    # would cut an id that itself contains ":" in the wrong place
    unique_pairs(item_ids).map do |i1, i2|
      ccmatrix.incr(i1.to_s, i2.to_s)
    end
  end

  # Increments the count of item_id and its pair count with each other item.
  def add_single(set_id, item_id, other_item_ids)
    item_count_incr(item_id)
    other_item_ids.each do |other_item|
      ccmatrix.incr(item_id, other_item)
    end
  end

  def all_items
    Recommendify.redis.hkeys(redis_key(:items))
  end

  def delete_item(item_id)
    Recommendify.redis.hdel(redis_key(:items), item_id)
    ccmatrix.send(:k_delall, item_id)
  end

private

  # Every distinct pair of keys as a ":"-joined string of the sorted pair.
  def all_pairs(keys)
    unique_pairs(keys).map{ |pair| pair.join(":") }.uniq
  end

  # Every unordered pair of distinct keys, each pair sorted.
  def unique_pairs(keys)
    keys.uniq.combination(2).map{ |pair| pair.sort }
  end

  def item_count_incr(key)
    Recommendify.redis.hincrby(redis_key(:items), key, 1)
  end

  def item_count(key)
    Recommendify.redis.hget(redis_key(:items), key).to_i
  end

end
@@ -0,0 +1,7 @@
1
# Input matrix subclass that mixes in Recommendify::CCMatrix.
# The class body defines no methods of its own.
+ class Recommendify::CosineInputMatrix < Recommendify::InputMatrix
2
+
3
+ include Recommendify::CCMatrix
4
+
5
+ # here be dragons ;) -- no similarity or similarities_for is defined in this class body
6
+
7
+ end
@@ -0,0 +1,52 @@
1
# Abstract input matrix. Subclasses provide the storage and the similarity
# calculation; this class only keeps the options and builds redis keys.
class Recommendify::InputMatrix

  # Instantiates "<SimilarityFunc>InputMatrix" for opts[:similarity_func].
  def self.create(opts)
    class_name = "#{Recommendify.capitalize(opts[:similarity_func])}InputMatrix"
    matrix_class = Recommendify.constantize(class_name.intern)
    matrix_class.new(opts)
  end

  def initialize(opts)
    @opts = opts
  end

  # Joins the redis prefix, the matrix key and the optional suffix with ":".
  def redis_key(append=nil)
    parts = [@opts.fetch(:redis_prefix), @opts.fetch(:key), append]
    parts.flatten.compact.join(":")
  end

  # Configured weight as a float; 1.0 when none is set.
  def weight
    configured = @opts[:weight]
    configured = 1 unless configured
    configured.to_f
  end

  # add a set of item_ids to the matrix
  def add_set(set_id, item_ids)
    raise "implemented in subclass"
  end

  # add a single item to a set of item_ids to the matrix
  def add_single(set_id, item_id, other_item_ids)
    raise "implemented in subclass"
  end

  # calculate the similarity between item1 and item2 (0.0-1.0)
  def similarity(item1, item2)
    raise "implemented in subclass"
  end

  # calculate all similarities to other items in the matrix for item1
  def similarities_for(item1)
    # return => [ ["item23", 0.6], ["item42", 0.23], (...) ]
    raise "implemented in subclass"
  end

  # retrieve all item_ids in the matrix
  def all_items
    # return => [ "item23", "item42", "item17", (...) ]
    raise "implemented in subclass"
  end

  # delete item_id from the matrix
  def delete_item(item_id)
    raise "implemented in subclass"
  end

end
@@ -0,0 +1,62 @@
1
# Input matrix that scores item pairs with the Jaccard index, either in Ruby
# or, with the :native option, by running the bundled native binary.
class Recommendify::JaccardInputMatrix < Recommendify::InputMatrix

  include Recommendify::CCMatrix

  # Raises when :native is requested but the binary has not been built.
  def initialize(opts={})
    check_native if opts[:native]
    super(opts)
  end

  def similarity(item1, item2)
    calculate_jaccard_cached(item1, item2)
  end

  # Returns [[item_id, similarity], ...] for item1.
  def similarities_for(item1)
    return run_native(item1) if @opts[:native]
    calculate_similarities(item1)
  end

private

  def calculate_similarities(item1)
    (all_items - [item1]).map do |item2|
      [item2, similarity(item1, item2)]
    end
  end

  # pair count / (count1 + count2 - pair count), from the stored counters.
  def calculate_jaccard_cached(item1, item2)
    val = ccmatrix[item1, item2]
    denominator = (item_count(item1)+item_count(item2)-val).to_f
    # two unknown items would otherwise produce 0.0 / 0.0 = NaN
    return 0.0 if denominator.zero?
    val.to_f / denominator
  end

  def calculate_jaccard(set1, set2)
    (set1&set2).length.to_f / (set1 + set2).uniq.length.to_f
  end

  # Runs the native binary and parses its "OUT: (<id>) (<similarity>)" lines.
  # Raises when the process exits unsuccessfully, or when a line cannot be
  # parsed and the output does not contain an "exit:" notice.
  def run_native(item_id)
    # argument array form: no shell is involved, so ids and prefixes cannot
    # inject shell syntax
    cmd = [native_path, "--jaccard", redis_key.to_s, item_id.to_s, redis_url.to_s]
    res = IO.popen(cmd) { |io| io.read }
    raise "error: dirty exit (#{$?})" unless $?.success?
    res.split("\n").map do |line|
      sim = line.match(/OUT: \(([^\)]*)\) \(([^\)]*)\)/)
      if sim
        [sim[1], sim[2].to_f]
      else
        raise "error: #{res}" unless (res||"").include?('exit:')
      end
    end.compact
  end

  def check_native
    return true if ::File.exist?(native_path)
    raise "recommendify_native not found - you need to run rake build_native first"
  end

  def native_path
    ::File.expand_path('../../../bin/recommendify', __FILE__)
  end

  def redis_url
    Recommendify.redis.client.location
  end

end
@@ -0,0 +1,19 @@
1
# Value object for one neighbour: an item id and its similarity score.
class Recommendify::Neighbor

  # data must provide the keys :item_id and :similarity.
  def initialize(data)
    @data = data
  end

  # The neighbour's id, always as a String.
  def item_id
    raw_id = @data.fetch(:item_id)
    raw_id.to_s
  end

  def similarity
    @data.fetch(:similarity)
  end

  # Reversed comparison, so sorting puts the highest similarity first.
  def <=>(other)
    theirs = other.similarity
    mine = self.similarity
    theirs <=> mine
  end

end
@@ -0,0 +1,25 @@
1
# Top-level namespace: holds the shared redis connection and small helpers.
module Recommendify

  DEFAULT_MAX_NEIGHBORS = 50

  @@redis = nil

  def self.redis=(redis)
    @@redis = redis
  end

  # Returns the configured connection; raises when none has been set.
  def self.redis
    return @@redis unless @@redis.nil?
    raise "redis not configured! - Recommendify.redis = Redis.new"
  end

  # Upcases the first character and downcases the rest; "" for empty input.
  def self.capitalize(str_or_sym)
    str_or_sym.to_s.capitalize
  end

  # Looks up the constant named klass directly inside Recommendify.
  # Uses const_get instead of evaluating an interpolated string as code.
  def self.constantize(klass)
    const_get(klass.to_s, false)
  end

end