disco 0.2.5 → 0.2.6

This diff compares the contents of two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 8fbecb858b316ed39a9cb726263e182561cba6df498e6253d88c79ebec5cab05
4
- data.tar.gz: 42eb38a6e4e0b3fc5a9452deae5a48676ae9a53e78eeb6197718a0c94bd02b6b
3
+ metadata.gz: a7823dbe0e68967c39a59f8cdc2fe577f4366b492e0559487606b74a7de1cc84
4
+ data.tar.gz: ba40e46b203e424eccb811c6b042c9a283356c42585b7e00123b4bb2f232b1e2
5
5
  SHA512:
6
- metadata.gz: d0250346d75fba75064a29578f6bfd39f09ecf712ba2e505b97a4952b5ff8b31af307eb1b912e9b25cc3dc28dee0d096bea44b47bb2ef268859bb4171f0ef8b2
7
- data.tar.gz: 7b341328c12885efd0ffece4201036bb9457caee80a48a99ba110af9a81bcf832bbc1e8f8f5f14e7fddffef2dd3f4643837e0d569c997ab0c2d9ae85e12422f7
6
+ metadata.gz: ee43326933ac019b0bae631631ba79a7b1e03d1e9669361ef7722aa5a43b7bf2a2f49ccf8b098ab23539392fd09b83224c3cb9d340b80483179fabb45d62ee30
7
+ data.tar.gz: 9733820cc4e81b22cca51dbf89a02aa87e96cbbc1add753b2799878b5b50b549f2a27886dcfae387ad4cc158ce4bd651354f8bbd2514460ac07a60560ad5c455
data/CHANGELOG.md CHANGED
@@ -1,3 +1,10 @@
1
+ ## 0.2.6 (2021-02-24)
2
+
3
+ - Improved performance
4
+ - Improved `inspect` method
5
+ - Fixed issue with `similar_users` and `item_recs` returning the original user/item
6
+ - Fixed error with `fit` after loading
7
+
1
8
  ## 0.2.5 (2021-02-20)
2
9
 
3
10
  - Added `top_items` method
data/README.md CHANGED
@@ -44,7 +44,7 @@ recommender.fit([
44
44
  ])
45
45
  ```
46
46
 
47
- > Use `value` instead of rating for implicit feedback
47
+ > Use `value` instead of `rating` for implicit feedback
48
48
 
49
49
  Get user-based recommendations - “users like you also liked”
50
50
 
@@ -247,7 +247,7 @@ recommender.fit(data)
247
247
  recommender.top_items
248
248
  ```
249
249
 
250
- This uses [Wilson score](https://www.evanmiller.org/how-not-to-sort-by-average-rating.html) for explicit feedback (add [wilson_score](https://github.com/instacart/wilson_score) your application’s Gemfile) and item frequency for implicit feedback.
250
+ This uses [Wilson score](https://www.evanmiller.org/how-not-to-sort-by-average-rating.html) for explicit feedback (add [wilson_score](https://github.com/instacart/wilson_score) to your application’s Gemfile) and item frequency for implicit feedback.
251
251
 
252
252
  ## Data
253
253
 
@@ -269,7 +269,7 @@ Or a Daru data frame
269
269
  Daru::DataFrame.from_csv("ratings.csv")
270
270
  ```
271
271
 
272
- ## Performance [master]
272
+ ## Performance
273
273
 
274
274
  If you have a large number of users or items, you can use an approximate nearest neighbors library like [Faiss](https://github.com/ankane/faiss) to improve the performance of certain methods.
275
275
 
@@ -17,24 +17,28 @@ module Disco
17
17
 
18
18
  check_training_set(train_set)
19
19
 
20
+ # TODO option to set in initializer to avoid pass
21
+ # could also just check first few values
22
+ # but may be confusing if they are all missing and later ones aren't
20
23
  @implicit = !train_set.any? { |v| v[:rating] }
24
+
25
+ # TODO improve performance
26
+ # (catch exception instead of checking ahead of time)
21
27
  unless @implicit
22
28
  check_ratings(train_set)
23
- @min_rating, @max_rating = train_set.minmax_by { |o| o[:rating] }.map { |o| o[:rating] }
24
29
 
25
30
  if validation_set
26
31
  check_ratings(validation_set)
27
32
  end
28
33
  end
29
34
 
30
- update_maps(train_set)
31
-
32
35
  @rated = Hash.new { |hash, key| hash[key] = {} }
33
36
  input = []
34
37
  value_key = @implicit ? :value : :rating
35
38
  train_set.each do |v|
36
- u = @user_map[v[:user_id]]
37
- i = @item_map[v[:item_id]]
39
+ # update maps and build matrix in single pass
40
+ u = (@user_map[v[:user_id]] ||= @user_map.size)
41
+ i = (@item_map[v[:item_id]] ||= @item_map.size)
38
42
  @rated[u][i] = true
39
43
 
40
44
  # explicit will always have a value due to check_ratings
@@ -42,6 +46,15 @@ module Disco
42
46
  end
43
47
  @rated.default = nil
44
48
 
49
+ # much more efficient than checking every value in another pass
50
+ raise ArgumentError, "Missing user_id" if @user_map.key?(nil)
51
+ raise ArgumentError, "Missing item_id" if @item_map.key?(nil)
52
+
53
+ # TODO improve performance
54
+ unless @implicit
55
+ @min_rating, @max_rating = train_set.minmax_by { |o| o[:rating] }.map { |o| o[:rating] }
56
+ end
57
+
45
58
  if @top_items
46
59
  @item_count = [0] * @item_map.size
47
60
  @item_sum = [0.0] * @item_map.size
@@ -78,6 +91,9 @@ module Disco
78
91
  @user_factors = model.p_factors(format: :numo)
79
92
  @item_factors = model.q_factors(format: :numo)
80
93
 
94
+ @normalized_user_factors = nil
95
+ @normalized_item_factors = nil
96
+
81
97
  @user_recs_index = nil
82
98
  @similar_users_index = nil
83
99
  @similar_items_index = nil
@@ -149,13 +165,13 @@ module Disco
149
165
 
150
166
  def similar_items(item_id, count: 5)
151
167
  check_fit
152
- similar(item_id, @item_map, item_norms, count, @similar_items_index)
168
+ similar(item_id, @item_map, normalized_item_factors, count, @similar_items_index)
153
169
  end
154
170
  alias_method :item_recs, :similar_items
155
171
 
156
172
  def similar_users(user_id, count: 5)
157
173
  check_fit
158
- similar(user_id, @user_map, user_norms, count, @similar_users_index)
174
+ similar(user_id, @user_map, normalized_user_factors, count, @similar_users_index)
159
175
  end
160
176
 
161
177
  def top_items(count: 5)
@@ -212,13 +228,17 @@ module Disco
212
228
 
213
229
  def optimize_similar_items(library: nil)
214
230
  check_fit
215
- @similar_items_index = create_index(item_norms, library: library)
231
+ @similar_items_index = create_index(normalized_item_factors, library: library)
216
232
  end
217
233
  alias_method :optimize_item_recs, :optimize_similar_items
218
234
 
219
235
  def optimize_similar_users(library: nil)
220
236
  check_fit
221
- @similar_users_index = create_index(user_norms, library: library)
237
+ @similar_users_index = create_index(normalized_user_factors, library: library)
238
+ end
239
+
240
+ def inspect
241
+ to_s # for now
222
242
  end
223
243
 
224
244
  private
@@ -251,7 +271,7 @@ module Disco
251
271
  # https://github.com/yahoojapan/NGT/issues/36
252
272
  index = Ngt::Index.new(factors.shape[1], distance_type: "Cosine")
253
273
 
254
- # NGT normalizes so could call create_index with factors instead of norms
274
+ # NGT normalizes so could call create_index without normalized factors
255
275
  # but keep code simple for now
256
276
  ids = index.batch_insert(factors)
257
277
  raise "Unexpected ids. Please report a bug." if ids.first != 1 || ids.last != factors.shape[0]
@@ -262,15 +282,15 @@ module Disco
262
282
  end
263
283
  end
264
284
 
265
- def user_norms
266
- @user_norms ||= norms(@user_factors)
285
+ def normalized_user_factors
286
+ @normalized_user_factors ||= normalize(@user_factors)
267
287
  end
268
288
 
269
- def item_norms
270
- @item_norms ||= norms(@item_factors)
289
+ def normalized_item_factors
290
+ @normalized_item_factors ||= normalize(@item_factors)
271
291
  end
272
292
 
273
- def norms(factors)
293
+ def normalize(factors)
274
294
  norms = Numo::SFloat::Math.sqrt((factors * factors).sum(axis: 1))
275
295
  norms[norms.eq(0)] = 1e-10 # no zeros
276
296
  factors / norms.expand_dims(1)
@@ -303,30 +323,26 @@ module Disco
303
323
  # TODO use user_id for similar_users in 0.3.0
304
324
  key = :item_id
305
325
 
306
- (1...ids.size).map do |i|
307
- {key => keys[ids[i]], score: predictions[i]}
326
+ result = []
327
+ # items can have the same score
328
+ # so original item may not be at index 0
329
+ ids.each_with_index do |id, j|
330
+ next if id == i
331
+
332
+ result << {key => keys[id], score: predictions[j]}
308
333
  end
334
+ result
309
335
  else
310
336
  []
311
337
  end
312
338
  end
313
339
 
314
- def update_maps(train_set)
315
- raise ArgumentError, "Missing user_id" if train_set.any? { |v| v[:user_id].nil? }
316
- raise ArgumentError, "Missing item_id" if train_set.any? { |v| v[:item_id].nil? }
317
-
318
- train_set.each do |v|
319
- @user_map[v[:user_id]] ||= @user_map.size
320
- @item_map[v[:item_id]] ||= @item_map.size
321
- end
322
- end
323
-
324
340
  def check_ratings(ratings)
325
341
  unless ratings.all? { |r| !r[:rating].nil? }
326
- raise ArgumentError, "Missing ratings"
342
+ raise ArgumentError, "Missing rating"
327
343
  end
328
344
  unless ratings.all? { |r| r[:rating].is_a?(Numeric) }
329
- raise ArgumentError, "Ratings must be numeric"
345
+ raise ArgumentError, "Rating must be numeric"
330
346
  end
331
347
  end
332
348
 
@@ -365,7 +381,10 @@ module Disco
365
381
  rated: @rated,
366
382
  global_mean: @global_mean,
367
383
  user_factors: @user_factors,
368
- item_factors: @item_factors
384
+ item_factors: @item_factors,
385
+ factors: @factors,
386
+ epochs: @epochs,
387
+ verbose: @verbose
369
388
  }
370
389
 
371
390
  unless @implicit
@@ -389,6 +408,9 @@ module Disco
389
408
  @global_mean = obj[:global_mean]
390
409
  @user_factors = obj[:user_factors]
391
410
  @item_factors = obj[:item_factors]
411
+ @factors = obj[:factors]
412
+ @epochs = obj[:epochs]
413
+ @verbose = obj[:verbose]
392
414
 
393
415
  unless @implicit
394
416
  @min_rating = obj[:min_rating]
data/lib/disco/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Disco
2
- VERSION = "0.2.5"
2
+ VERSION = "0.2.6"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: disco
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.5
4
+ version: 0.2.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-02-20 00:00:00.000000000 Z
11
+ date: 2021-02-24 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: libmf