disco 0.2.5 → 0.2.6
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -0
- data/README.md +3 -3
- data/lib/disco/recommender.rb +52 -30
- data/lib/disco/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a7823dbe0e68967c39a59f8cdc2fe577f4366b492e0559487606b74a7de1cc84
|
4
|
+
data.tar.gz: ba40e46b203e424eccb811c6b042c9a283356c42585b7e00123b4bb2f232b1e2
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ee43326933ac019b0bae631631ba79a7b1e03d1e9669361ef7722aa5a43b7bf2a2f49ccf8b098ab23539392fd09b83224c3cb9d340b80483179fabb45d62ee30
|
7
|
+
data.tar.gz: 9733820cc4e81b22cca51dbf89a02aa87e96cbbc1add753b2799878b5b50b549f2a27886dcfae387ad4cc158ce4bd651354f8bbd2514460ac07a60560ad5c455
|
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,10 @@
|
|
1
|
+
## 0.2.6 (2021-02-24)
|
2
|
+
|
3
|
+
- Improved performance
|
4
|
+
- Improved `inspect` method
|
5
|
+
- Fixed issue with `similar_users` and `item_recs` returning the original user/item
|
6
|
+
- Fixed error with `fit` after loading
|
7
|
+
|
1
8
|
## 0.2.5 (2021-02-20)
|
2
9
|
|
3
10
|
- Added `top_items` method
|
data/README.md
CHANGED
@@ -44,7 +44,7 @@ recommender.fit([
|
|
44
44
|
])
|
45
45
|
```
|
46
46
|
|
47
|
-
> Use `value` instead of rating for implicit feedback
|
47
|
+
> Use `value` instead of `rating` for implicit feedback
|
48
48
|
|
49
49
|
Get user-based recommendations - “users like you also liked”
|
50
50
|
|
@@ -247,7 +247,7 @@ recommender.fit(data)
|
|
247
247
|
recommender.top_items
|
248
248
|
```
|
249
249
|
|
250
|
-
This uses [Wilson score](https://www.evanmiller.org/how-not-to-sort-by-average-rating.html) for explicit feedback (add [wilson_score](https://github.com/instacart/wilson_score) your application’s Gemfile) and item frequency for implicit feedback.
|
250
|
+
This uses [Wilson score](https://www.evanmiller.org/how-not-to-sort-by-average-rating.html) for explicit feedback (add [wilson_score](https://github.com/instacart/wilson_score) to your application’s Gemfile) and item frequency for implicit feedback.
|
251
251
|
|
252
252
|
## Data
|
253
253
|
|
@@ -269,7 +269,7 @@ Or a Daru data frame
|
|
269
269
|
Daru::DataFrame.from_csv("ratings.csv")
|
270
270
|
```
|
271
271
|
|
272
|
-
## Performance
|
272
|
+
## Performance
|
273
273
|
|
274
274
|
If you have a large number of users or items, you can use an approximate nearest neighbors library like [Faiss](https://github.com/ankane/faiss) to improve the performance of certain methods.
|
275
275
|
|
data/lib/disco/recommender.rb
CHANGED
@@ -17,24 +17,28 @@ module Disco
|
|
17
17
|
|
18
18
|
check_training_set(train_set)
|
19
19
|
|
20
|
+
# TODO option to set in initializer to avoid pass
|
21
|
+
# could also just check first few values
|
22
|
+
# but may be confusing if they are all missing and later ones aren't
|
20
23
|
@implicit = !train_set.any? { |v| v[:rating] }
|
24
|
+
|
25
|
+
# TODO improve performance
|
26
|
+
# (catch exception instead of checking ahead of time)
|
21
27
|
unless @implicit
|
22
28
|
check_ratings(train_set)
|
23
|
-
@min_rating, @max_rating = train_set.minmax_by { |o| o[:rating] }.map { |o| o[:rating] }
|
24
29
|
|
25
30
|
if validation_set
|
26
31
|
check_ratings(validation_set)
|
27
32
|
end
|
28
33
|
end
|
29
34
|
|
30
|
-
update_maps(train_set)
|
31
|
-
|
32
35
|
@rated = Hash.new { |hash, key| hash[key] = {} }
|
33
36
|
input = []
|
34
37
|
value_key = @implicit ? :value : :rating
|
35
38
|
train_set.each do |v|
|
36
|
-
|
37
|
-
|
39
|
+
# update maps and build matrix in single pass
|
40
|
+
u = (@user_map[v[:user_id]] ||= @user_map.size)
|
41
|
+
i = (@item_map[v[:item_id]] ||= @item_map.size)
|
38
42
|
@rated[u][i] = true
|
39
43
|
|
40
44
|
# explicit will always have a value due to check_ratings
|
@@ -42,6 +46,15 @@ module Disco
|
|
42
46
|
end
|
43
47
|
@rated.default = nil
|
44
48
|
|
49
|
+
# much more efficient than checking every value in another pass
|
50
|
+
raise ArgumentError, "Missing user_id" if @user_map.key?(nil)
|
51
|
+
raise ArgumentError, "Missing item_id" if @item_map.key?(nil)
|
52
|
+
|
53
|
+
# TODO improve performance
|
54
|
+
unless @implicit
|
55
|
+
@min_rating, @max_rating = train_set.minmax_by { |o| o[:rating] }.map { |o| o[:rating] }
|
56
|
+
end
|
57
|
+
|
45
58
|
if @top_items
|
46
59
|
@item_count = [0] * @item_map.size
|
47
60
|
@item_sum = [0.0] * @item_map.size
|
@@ -78,6 +91,9 @@ module Disco
|
|
78
91
|
@user_factors = model.p_factors(format: :numo)
|
79
92
|
@item_factors = model.q_factors(format: :numo)
|
80
93
|
|
94
|
+
@normalized_user_factors = nil
|
95
|
+
@normalized_item_factors = nil
|
96
|
+
|
81
97
|
@user_recs_index = nil
|
82
98
|
@similar_users_index = nil
|
83
99
|
@similar_items_index = nil
|
@@ -149,13 +165,13 @@ module Disco
|
|
149
165
|
|
150
166
|
def similar_items(item_id, count: 5)
|
151
167
|
check_fit
|
152
|
-
similar(item_id, @item_map,
|
168
|
+
similar(item_id, @item_map, normalized_item_factors, count, @similar_items_index)
|
153
169
|
end
|
154
170
|
alias_method :item_recs, :similar_items
|
155
171
|
|
156
172
|
def similar_users(user_id, count: 5)
|
157
173
|
check_fit
|
158
|
-
similar(user_id, @user_map,
|
174
|
+
similar(user_id, @user_map, normalized_user_factors, count, @similar_users_index)
|
159
175
|
end
|
160
176
|
|
161
177
|
def top_items(count: 5)
|
@@ -212,13 +228,17 @@ module Disco
|
|
212
228
|
|
213
229
|
def optimize_similar_items(library: nil)
|
214
230
|
check_fit
|
215
|
-
@similar_items_index = create_index(
|
231
|
+
@similar_items_index = create_index(normalized_item_factors, library: library)
|
216
232
|
end
|
217
233
|
alias_method :optimize_item_recs, :optimize_similar_items
|
218
234
|
|
219
235
|
def optimize_similar_users(library: nil)
|
220
236
|
check_fit
|
221
|
-
@similar_users_index = create_index(
|
237
|
+
@similar_users_index = create_index(normalized_user_factors, library: library)
|
238
|
+
end
|
239
|
+
|
240
|
+
def inspect
|
241
|
+
to_s # for now
|
222
242
|
end
|
223
243
|
|
224
244
|
private
|
@@ -251,7 +271,7 @@ module Disco
|
|
251
271
|
# https://github.com/yahoojapan/NGT/issues/36
|
252
272
|
index = Ngt::Index.new(factors.shape[1], distance_type: "Cosine")
|
253
273
|
|
254
|
-
# NGT normalizes so could call create_index
|
274
|
+
# NGT normalizes so could call create_index without normalized factors
|
255
275
|
# but keep code simple for now
|
256
276
|
ids = index.batch_insert(factors)
|
257
277
|
raise "Unexpected ids. Please report a bug." if ids.first != 1 || ids.last != factors.shape[0]
|
@@ -262,15 +282,15 @@ module Disco
|
|
262
282
|
end
|
263
283
|
end
|
264
284
|
|
265
|
-
def
|
266
|
-
@
|
285
|
+
def normalized_user_factors
|
286
|
+
@normalized_user_factors ||= normalize(@user_factors)
|
267
287
|
end
|
268
288
|
|
269
|
-
def
|
270
|
-
@
|
289
|
+
def normalized_item_factors
|
290
|
+
@normalized_item_factors ||= normalize(@item_factors)
|
271
291
|
end
|
272
292
|
|
273
|
-
def
|
293
|
+
def normalize(factors)
|
274
294
|
norms = Numo::SFloat::Math.sqrt((factors * factors).sum(axis: 1))
|
275
295
|
norms[norms.eq(0)] = 1e-10 # no zeros
|
276
296
|
factors / norms.expand_dims(1)
|
@@ -303,30 +323,26 @@ module Disco
|
|
303
323
|
# TODO use user_id for similar_users in 0.3.0
|
304
324
|
key = :item_id
|
305
325
|
|
306
|
-
|
307
|
-
|
326
|
+
result = []
|
327
|
+
# items can have the same score
|
328
|
+
# so original item may not be at index 0
|
329
|
+
ids.each_with_index do |id, j|
|
330
|
+
next if id == i
|
331
|
+
|
332
|
+
result << {key => keys[id], score: predictions[j]}
|
308
333
|
end
|
334
|
+
result
|
309
335
|
else
|
310
336
|
[]
|
311
337
|
end
|
312
338
|
end
|
313
339
|
|
314
|
-
def update_maps(train_set)
|
315
|
-
raise ArgumentError, "Missing user_id" if train_set.any? { |v| v[:user_id].nil? }
|
316
|
-
raise ArgumentError, "Missing item_id" if train_set.any? { |v| v[:item_id].nil? }
|
317
|
-
|
318
|
-
train_set.each do |v|
|
319
|
-
@user_map[v[:user_id]] ||= @user_map.size
|
320
|
-
@item_map[v[:item_id]] ||= @item_map.size
|
321
|
-
end
|
322
|
-
end
|
323
|
-
|
324
340
|
def check_ratings(ratings)
|
325
341
|
unless ratings.all? { |r| !r[:rating].nil? }
|
326
|
-
raise ArgumentError, "Missing
|
342
|
+
raise ArgumentError, "Missing rating"
|
327
343
|
end
|
328
344
|
unless ratings.all? { |r| r[:rating].is_a?(Numeric) }
|
329
|
-
raise ArgumentError, "
|
345
|
+
raise ArgumentError, "Rating must be numeric"
|
330
346
|
end
|
331
347
|
end
|
332
348
|
|
@@ -365,7 +381,10 @@ module Disco
|
|
365
381
|
rated: @rated,
|
366
382
|
global_mean: @global_mean,
|
367
383
|
user_factors: @user_factors,
|
368
|
-
item_factors: @item_factors
|
384
|
+
item_factors: @item_factors,
|
385
|
+
factors: @factors,
|
386
|
+
epochs: @epochs,
|
387
|
+
verbose: @verbose
|
369
388
|
}
|
370
389
|
|
371
390
|
unless @implicit
|
@@ -389,6 +408,9 @@ module Disco
|
|
389
408
|
@global_mean = obj[:global_mean]
|
390
409
|
@user_factors = obj[:user_factors]
|
391
410
|
@item_factors = obj[:item_factors]
|
411
|
+
@factors = obj[:factors]
|
412
|
+
@epochs = obj[:epochs]
|
413
|
+
@verbose = obj[:verbose]
|
392
414
|
|
393
415
|
unless @implicit
|
394
416
|
@min_rating = obj[:min_rating]
|
data/lib/disco/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: disco
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.5
|
4
|
+
version: 0.2.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Kane
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2021-02-20 00:00:00.000000000 Z
|
11
|
+
date: 2021-02-24 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: libmf
|