disco 0.2.5 → 0.2.6
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -0
- data/README.md +3 -3
- data/lib/disco/recommender.rb +52 -30
- data/lib/disco/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a7823dbe0e68967c39a59f8cdc2fe577f4366b492e0559487606b74a7de1cc84
|
4
|
+
data.tar.gz: ba40e46b203e424eccb811c6b042c9a283356c42585b7e00123b4bb2f232b1e2
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ee43326933ac019b0bae631631ba79a7b1e03d1e9669361ef7722aa5a43b7bf2a2f49ccf8b098ab23539392fd09b83224c3cb9d340b80483179fabb45d62ee30
|
7
|
+
data.tar.gz: 9733820cc4e81b22cca51dbf89a02aa87e96cbbc1add753b2799878b5b50b549f2a27886dcfae387ad4cc158ce4bd651354f8bbd2514460ac07a60560ad5c455
|
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,10 @@
|
|
1
|
+
## 0.2.6 (2021-02-24)
|
2
|
+
|
3
|
+
- Improved performance
|
4
|
+
- Improved `inspect` method
|
5
|
+
- Fixed issue with `similar_users` and `item_recs` returning the original user/item
|
6
|
+
- Fixed error with `fit` after loading
|
7
|
+
|
1
8
|
## 0.2.5 (2021-02-20)
|
2
9
|
|
3
10
|
- Added `top_items` method
|
data/README.md
CHANGED
@@ -44,7 +44,7 @@ recommender.fit([
|
|
44
44
|
])
|
45
45
|
```
|
46
46
|
|
47
|
-
> Use `value` instead of rating for implicit feedback
|
47
|
+
> Use `value` instead of `rating` for implicit feedback
|
48
48
|
|
49
49
|
Get user-based recommendations - “users like you also liked”
|
50
50
|
|
@@ -247,7 +247,7 @@ recommender.fit(data)
|
|
247
247
|
recommender.top_items
|
248
248
|
```
|
249
249
|
|
250
|
-
This uses [Wilson score](https://www.evanmiller.org/how-not-to-sort-by-average-rating.html) for explicit feedback (add [wilson_score](https://github.com/instacart/wilson_score) your application’s Gemfile) and item frequency for implicit feedback.
|
250
|
+
This uses [Wilson score](https://www.evanmiller.org/how-not-to-sort-by-average-rating.html) for explicit feedback (add [wilson_score](https://github.com/instacart/wilson_score) to your application’s Gemfile) and item frequency for implicit feedback.
|
251
251
|
|
252
252
|
## Data
|
253
253
|
|
@@ -269,7 +269,7 @@ Or a Daru data frame
|
|
269
269
|
Daru::DataFrame.from_csv("ratings.csv")
|
270
270
|
```
|
271
271
|
|
272
|
-
## Performance
|
272
|
+
## Performance
|
273
273
|
|
274
274
|
If you have a large number of users or items, you can use an approximate nearest neighbors library like [Faiss](https://github.com/ankane/faiss) to improve the performance of certain methods.
|
275
275
|
|
data/lib/disco/recommender.rb
CHANGED
@@ -17,24 +17,28 @@ module Disco
|
|
17
17
|
|
18
18
|
check_training_set(train_set)
|
19
19
|
|
20
|
+
# TODO option to set in initializer to avoid pass
|
21
|
+
# could also just check first few values
|
22
|
+
# but may be confusing if they are all missing and later ones aren't
|
20
23
|
@implicit = !train_set.any? { |v| v[:rating] }
|
24
|
+
|
25
|
+
# TODO improve performance
|
26
|
+
# (catch exception instead of checking ahead of time)
|
21
27
|
unless @implicit
|
22
28
|
check_ratings(train_set)
|
23
|
-
@min_rating, @max_rating = train_set.minmax_by { |o| o[:rating] }.map { |o| o[:rating] }
|
24
29
|
|
25
30
|
if validation_set
|
26
31
|
check_ratings(validation_set)
|
27
32
|
end
|
28
33
|
end
|
29
34
|
|
30
|
-
update_maps(train_set)
|
31
|
-
|
32
35
|
@rated = Hash.new { |hash, key| hash[key] = {} }
|
33
36
|
input = []
|
34
37
|
value_key = @implicit ? :value : :rating
|
35
38
|
train_set.each do |v|
|
36
|
-
|
37
|
-
|
39
|
+
# update maps and build matrix in single pass
|
40
|
+
u = (@user_map[v[:user_id]] ||= @user_map.size)
|
41
|
+
i = (@item_map[v[:item_id]] ||= @item_map.size)
|
38
42
|
@rated[u][i] = true
|
39
43
|
|
40
44
|
# explicit will always have a value due to check_ratings
|
@@ -42,6 +46,15 @@ module Disco
|
|
42
46
|
end
|
43
47
|
@rated.default = nil
|
44
48
|
|
49
|
+
# much more efficient than checking every value in another pass
|
50
|
+
raise ArgumentError, "Missing user_id" if @user_map.key?(nil)
|
51
|
+
raise ArgumentError, "Missing item_id" if @item_map.key?(nil)
|
52
|
+
|
53
|
+
# TODO improve performance
|
54
|
+
unless @implicit
|
55
|
+
@min_rating, @max_rating = train_set.minmax_by { |o| o[:rating] }.map { |o| o[:rating] }
|
56
|
+
end
|
57
|
+
|
45
58
|
if @top_items
|
46
59
|
@item_count = [0] * @item_map.size
|
47
60
|
@item_sum = [0.0] * @item_map.size
|
@@ -78,6 +91,9 @@ module Disco
|
|
78
91
|
@user_factors = model.p_factors(format: :numo)
|
79
92
|
@item_factors = model.q_factors(format: :numo)
|
80
93
|
|
94
|
+
@normalized_user_factors = nil
|
95
|
+
@normalized_item_factors = nil
|
96
|
+
|
81
97
|
@user_recs_index = nil
|
82
98
|
@similar_users_index = nil
|
83
99
|
@similar_items_index = nil
|
@@ -149,13 +165,13 @@ module Disco
|
|
149
165
|
|
150
166
|
def similar_items(item_id, count: 5)
|
151
167
|
check_fit
|
152
|
-
similar(item_id, @item_map,
|
168
|
+
similar(item_id, @item_map, normalized_item_factors, count, @similar_items_index)
|
153
169
|
end
|
154
170
|
alias_method :item_recs, :similar_items
|
155
171
|
|
156
172
|
def similar_users(user_id, count: 5)
|
157
173
|
check_fit
|
158
|
-
similar(user_id, @user_map,
|
174
|
+
similar(user_id, @user_map, normalized_user_factors, count, @similar_users_index)
|
159
175
|
end
|
160
176
|
|
161
177
|
def top_items(count: 5)
|
@@ -212,13 +228,17 @@ module Disco
|
|
212
228
|
|
213
229
|
def optimize_similar_items(library: nil)
|
214
230
|
check_fit
|
215
|
-
@similar_items_index = create_index(
|
231
|
+
@similar_items_index = create_index(normalized_item_factors, library: library)
|
216
232
|
end
|
217
233
|
alias_method :optimize_item_recs, :optimize_similar_items
|
218
234
|
|
219
235
|
def optimize_similar_users(library: nil)
|
220
236
|
check_fit
|
221
|
-
@similar_users_index = create_index(
|
237
|
+
@similar_users_index = create_index(normalized_user_factors, library: library)
|
238
|
+
end
|
239
|
+
|
240
|
+
def inspect
|
241
|
+
to_s # for now
|
222
242
|
end
|
223
243
|
|
224
244
|
private
|
@@ -251,7 +271,7 @@ module Disco
|
|
251
271
|
# https://github.com/yahoojapan/NGT/issues/36
|
252
272
|
index = Ngt::Index.new(factors.shape[1], distance_type: "Cosine")
|
253
273
|
|
254
|
-
# NGT normalizes so could call create_index
|
274
|
+
# NGT normalizes so could call create_index without normalized factors
|
255
275
|
# but keep code simple for now
|
256
276
|
ids = index.batch_insert(factors)
|
257
277
|
raise "Unexpected ids. Please report a bug." if ids.first != 1 || ids.last != factors.shape[0]
|
@@ -262,15 +282,15 @@ module Disco
|
|
262
282
|
end
|
263
283
|
end
|
264
284
|
|
265
|
-
def
|
266
|
-
@
|
285
|
+
def normalized_user_factors
|
286
|
+
@normalized_user_factors ||= normalize(@user_factors)
|
267
287
|
end
|
268
288
|
|
269
|
-
def
|
270
|
-
@
|
289
|
+
def normalized_item_factors
|
290
|
+
@normalized_item_factors ||= normalize(@item_factors)
|
271
291
|
end
|
272
292
|
|
273
|
-
def
|
293
|
+
def normalize(factors)
|
274
294
|
norms = Numo::SFloat::Math.sqrt((factors * factors).sum(axis: 1))
|
275
295
|
norms[norms.eq(0)] = 1e-10 # no zeros
|
276
296
|
factors / norms.expand_dims(1)
|
@@ -303,30 +323,26 @@ module Disco
|
|
303
323
|
# TODO use user_id for similar_users in 0.3.0
|
304
324
|
key = :item_id
|
305
325
|
|
306
|
-
|
307
|
-
|
326
|
+
result = []
|
327
|
+
# items can have the same score
|
328
|
+
# so original item may not be at index 0
|
329
|
+
ids.each_with_index do |id, j|
|
330
|
+
next if id == i
|
331
|
+
|
332
|
+
result << {key => keys[id], score: predictions[j]}
|
308
333
|
end
|
334
|
+
result
|
309
335
|
else
|
310
336
|
[]
|
311
337
|
end
|
312
338
|
end
|
313
339
|
|
314
|
-
def update_maps(train_set)
|
315
|
-
raise ArgumentError, "Missing user_id" if train_set.any? { |v| v[:user_id].nil? }
|
316
|
-
raise ArgumentError, "Missing item_id" if train_set.any? { |v| v[:item_id].nil? }
|
317
|
-
|
318
|
-
train_set.each do |v|
|
319
|
-
@user_map[v[:user_id]] ||= @user_map.size
|
320
|
-
@item_map[v[:item_id]] ||= @item_map.size
|
321
|
-
end
|
322
|
-
end
|
323
|
-
|
324
340
|
def check_ratings(ratings)
|
325
341
|
unless ratings.all? { |r| !r[:rating].nil? }
|
326
|
-
raise ArgumentError, "Missing
|
342
|
+
raise ArgumentError, "Missing rating"
|
327
343
|
end
|
328
344
|
unless ratings.all? { |r| r[:rating].is_a?(Numeric) }
|
329
|
-
raise ArgumentError, "
|
345
|
+
raise ArgumentError, "Rating must be numeric"
|
330
346
|
end
|
331
347
|
end
|
332
348
|
|
@@ -365,7 +381,10 @@ module Disco
|
|
365
381
|
rated: @rated,
|
366
382
|
global_mean: @global_mean,
|
367
383
|
user_factors: @user_factors,
|
368
|
-
item_factors: @item_factors
|
384
|
+
item_factors: @item_factors,
|
385
|
+
factors: @factors,
|
386
|
+
epochs: @epochs,
|
387
|
+
verbose: @verbose
|
369
388
|
}
|
370
389
|
|
371
390
|
unless @implicit
|
@@ -389,6 +408,9 @@ module Disco
|
|
389
408
|
@global_mean = obj[:global_mean]
|
390
409
|
@user_factors = obj[:user_factors]
|
391
410
|
@item_factors = obj[:item_factors]
|
411
|
+
@factors = obj[:factors]
|
412
|
+
@epochs = obj[:epochs]
|
413
|
+
@verbose = obj[:verbose]
|
392
414
|
|
393
415
|
unless @implicit
|
394
416
|
@min_rating = obj[:min_rating]
|
data/lib/disco/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: disco
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.5
|
4
|
+
version: 0.2.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Kane
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2021-02-20 00:00:00.000000000 Z
|
11
|
+
date: 2021-02-24 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: libmf
|