disco 0.2.5 → 0.2.6

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 8fbecb858b316ed39a9cb726263e182561cba6df498e6253d88c79ebec5cab05
4
- data.tar.gz: 42eb38a6e4e0b3fc5a9452deae5a48676ae9a53e78eeb6197718a0c94bd02b6b
3
+ metadata.gz: a7823dbe0e68967c39a59f8cdc2fe577f4366b492e0559487606b74a7de1cc84
4
+ data.tar.gz: ba40e46b203e424eccb811c6b042c9a283356c42585b7e00123b4bb2f232b1e2
5
5
  SHA512:
6
- metadata.gz: d0250346d75fba75064a29578f6bfd39f09ecf712ba2e505b97a4952b5ff8b31af307eb1b912e9b25cc3dc28dee0d096bea44b47bb2ef268859bb4171f0ef8b2
7
- data.tar.gz: 7b341328c12885efd0ffece4201036bb9457caee80a48a99ba110af9a81bcf832bbc1e8f8f5f14e7fddffef2dd3f4643837e0d569c997ab0c2d9ae85e12422f7
6
+ metadata.gz: ee43326933ac019b0bae631631ba79a7b1e03d1e9669361ef7722aa5a43b7bf2a2f49ccf8b098ab23539392fd09b83224c3cb9d340b80483179fabb45d62ee30
7
+ data.tar.gz: 9733820cc4e81b22cca51dbf89a02aa87e96cbbc1add753b2799878b5b50b549f2a27886dcfae387ad4cc158ce4bd651354f8bbd2514460ac07a60560ad5c455
data/CHANGELOG.md CHANGED
@@ -1,3 +1,10 @@
1
+ ## 0.2.6 (2021-02-24)
2
+
3
+ - Improved performance
4
+ - Improved `inspect` method
5
+ - Fixed issue with `similar_users` and `item_recs` returning the original user/item
6
+ - Fixed error with `fit` after loading
7
+
1
8
  ## 0.2.5 (2021-02-20)
2
9
 
3
10
  - Added `top_items` method
data/README.md CHANGED
@@ -44,7 +44,7 @@ recommender.fit([
44
44
  ])
45
45
  ```
46
46
 
47
- > Use `value` instead of rating for implicit feedback
47
+ > Use `value` instead of `rating` for implicit feedback
48
48
 
49
49
  Get user-based recommendations - “users like you also liked”
50
50
 
@@ -247,7 +247,7 @@ recommender.fit(data)
247
247
  recommender.top_items
248
248
  ```
249
249
 
250
- This uses [Wilson score](https://www.evanmiller.org/how-not-to-sort-by-average-rating.html) for explicit feedback (add [wilson_score](https://github.com/instacart/wilson_score) your application’s Gemfile) and item frequency for implicit feedback.
250
+ This uses [Wilson score](https://www.evanmiller.org/how-not-to-sort-by-average-rating.html) for explicit feedback (add [wilson_score](https://github.com/instacart/wilson_score) to your application’s Gemfile) and item frequency for implicit feedback.
251
251
 
252
252
  ## Data
253
253
 
@@ -269,7 +269,7 @@ Or a Daru data frame
269
269
  Daru::DataFrame.from_csv("ratings.csv")
270
270
  ```
271
271
 
272
- ## Performance [master]
272
+ ## Performance
273
273
 
274
274
  If you have a large number of users or items, you can use an approximate nearest neighbors library like [Faiss](https://github.com/ankane/faiss) to improve the performance of certain methods.
275
275
 
data/lib/disco/recommender.rb CHANGED
@@ -17,24 +17,28 @@ module Disco
17
17
 
18
18
  check_training_set(train_set)
19
19
 
20
+ # TODO option to set in initializer to avoid pass
21
+ # could also just check first few values
22
+ # but may be confusing if they are all missing and later ones aren't
20
23
  @implicit = !train_set.any? { |v| v[:rating] }
24
+
25
+ # TODO improve performance
26
+ # (catch exception instead of checking ahead of time)
21
27
  unless @implicit
22
28
  check_ratings(train_set)
23
- @min_rating, @max_rating = train_set.minmax_by { |o| o[:rating] }.map { |o| o[:rating] }
24
29
 
25
30
  if validation_set
26
31
  check_ratings(validation_set)
27
32
  end
28
33
  end
29
34
 
30
- update_maps(train_set)
31
-
32
35
  @rated = Hash.new { |hash, key| hash[key] = {} }
33
36
  input = []
34
37
  value_key = @implicit ? :value : :rating
35
38
  train_set.each do |v|
36
- u = @user_map[v[:user_id]]
37
- i = @item_map[v[:item_id]]
39
+ # update maps and build matrix in single pass
40
+ u = (@user_map[v[:user_id]] ||= @user_map.size)
41
+ i = (@item_map[v[:item_id]] ||= @item_map.size)
38
42
  @rated[u][i] = true
39
43
 
40
44
  # explicit will always have a value due to check_ratings
@@ -42,6 +46,15 @@ module Disco
42
46
  end
43
47
  @rated.default = nil
44
48
 
49
+ # much more efficient than checking every value in another pass
50
+ raise ArgumentError, "Missing user_id" if @user_map.key?(nil)
51
+ raise ArgumentError, "Missing item_id" if @item_map.key?(nil)
52
+
53
+ # TODO improve performance
54
+ unless @implicit
55
+ @min_rating, @max_rating = train_set.minmax_by { |o| o[:rating] }.map { |o| o[:rating] }
56
+ end
57
+
45
58
  if @top_items
46
59
  @item_count = [0] * @item_map.size
47
60
  @item_sum = [0.0] * @item_map.size
@@ -78,6 +91,9 @@ module Disco
78
91
  @user_factors = model.p_factors(format: :numo)
79
92
  @item_factors = model.q_factors(format: :numo)
80
93
 
94
+ @normalized_user_factors = nil
95
+ @normalized_item_factors = nil
96
+
81
97
  @user_recs_index = nil
82
98
  @similar_users_index = nil
83
99
  @similar_items_index = nil
@@ -149,13 +165,13 @@ module Disco
149
165
 
150
166
  def similar_items(item_id, count: 5)
151
167
  check_fit
152
- similar(item_id, @item_map, item_norms, count, @similar_items_index)
168
+ similar(item_id, @item_map, normalized_item_factors, count, @similar_items_index)
153
169
  end
154
170
  alias_method :item_recs, :similar_items
155
171
 
156
172
  def similar_users(user_id, count: 5)
157
173
  check_fit
158
- similar(user_id, @user_map, user_norms, count, @similar_users_index)
174
+ similar(user_id, @user_map, normalized_user_factors, count, @similar_users_index)
159
175
  end
160
176
 
161
177
  def top_items(count: 5)
@@ -212,13 +228,17 @@ module Disco
212
228
 
213
229
  def optimize_similar_items(library: nil)
214
230
  check_fit
215
- @similar_items_index = create_index(item_norms, library: library)
231
+ @similar_items_index = create_index(normalized_item_factors, library: library)
216
232
  end
217
233
  alias_method :optimize_item_recs, :optimize_similar_items
218
234
 
219
235
  def optimize_similar_users(library: nil)
220
236
  check_fit
221
- @similar_users_index = create_index(user_norms, library: library)
237
+ @similar_users_index = create_index(normalized_user_factors, library: library)
238
+ end
239
+
240
+ def inspect
241
+ to_s # for now
222
242
  end
223
243
 
224
244
  private
@@ -251,7 +271,7 @@ module Disco
251
271
  # https://github.com/yahoojapan/NGT/issues/36
252
272
  index = Ngt::Index.new(factors.shape[1], distance_type: "Cosine")
253
273
 
254
- # NGT normalizes so could call create_index with factors instead of norms
274
+ # NGT normalizes so could call create_index without normalized factors
255
275
  # but keep code simple for now
256
276
  ids = index.batch_insert(factors)
257
277
  raise "Unexpected ids. Please report a bug." if ids.first != 1 || ids.last != factors.shape[0]
@@ -262,15 +282,15 @@ module Disco
262
282
  end
263
283
  end
264
284
 
265
- def user_norms
266
- @user_norms ||= norms(@user_factors)
285
+ def normalized_user_factors
286
+ @normalized_user_factors ||= normalize(@user_factors)
267
287
  end
268
288
 
269
- def item_norms
270
- @item_norms ||= norms(@item_factors)
289
+ def normalized_item_factors
290
+ @normalized_item_factors ||= normalize(@item_factors)
271
291
  end
272
292
 
273
- def norms(factors)
293
+ def normalize(factors)
274
294
  norms = Numo::SFloat::Math.sqrt((factors * factors).sum(axis: 1))
275
295
  norms[norms.eq(0)] = 1e-10 # no zeros
276
296
  factors / norms.expand_dims(1)
@@ -303,30 +323,26 @@ module Disco
303
323
  # TODO use user_id for similar_users in 0.3.0
304
324
  key = :item_id
305
325
 
306
- (1...ids.size).map do |i|
307
- {key => keys[ids[i]], score: predictions[i]}
326
+ result = []
327
+ # items can have the same score
328
+ # so original item may not be at index 0
329
+ ids.each_with_index do |id, j|
330
+ next if id == i
331
+
332
+ result << {key => keys[id], score: predictions[j]}
308
333
  end
334
+ result
309
335
  else
310
336
  []
311
337
  end
312
338
  end
313
339
 
314
- def update_maps(train_set)
315
- raise ArgumentError, "Missing user_id" if train_set.any? { |v| v[:user_id].nil? }
316
- raise ArgumentError, "Missing item_id" if train_set.any? { |v| v[:item_id].nil? }
317
-
318
- train_set.each do |v|
319
- @user_map[v[:user_id]] ||= @user_map.size
320
- @item_map[v[:item_id]] ||= @item_map.size
321
- end
322
- end
323
-
324
340
  def check_ratings(ratings)
325
341
  unless ratings.all? { |r| !r[:rating].nil? }
326
- raise ArgumentError, "Missing ratings"
342
+ raise ArgumentError, "Missing rating"
327
343
  end
328
344
  unless ratings.all? { |r| r[:rating].is_a?(Numeric) }
329
- raise ArgumentError, "Ratings must be numeric"
345
+ raise ArgumentError, "Rating must be numeric"
330
346
  end
331
347
  end
332
348
 
@@ -365,7 +381,10 @@ module Disco
365
381
  rated: @rated,
366
382
  global_mean: @global_mean,
367
383
  user_factors: @user_factors,
368
- item_factors: @item_factors
384
+ item_factors: @item_factors,
385
+ factors: @factors,
386
+ epochs: @epochs,
387
+ verbose: @verbose
369
388
  }
370
389
 
371
390
  unless @implicit
@@ -389,6 +408,9 @@ module Disco
389
408
  @global_mean = obj[:global_mean]
390
409
  @user_factors = obj[:user_factors]
391
410
  @item_factors = obj[:item_factors]
411
+ @factors = obj[:factors]
412
+ @epochs = obj[:epochs]
413
+ @verbose = obj[:verbose]
392
414
 
393
415
  unless @implicit
394
416
  @min_rating = obj[:min_rating]
data/lib/disco/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Disco
2
- VERSION = "0.2.5"
2
+ VERSION = "0.2.6"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: disco
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.5
4
+ version: 0.2.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-02-20 00:00:00.000000000 Z
11
+ date: 2021-02-24 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: libmf