disco 0.2.7 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 5f400f07839587b574ddcfa4c88335bfe20fcd876164b943e8094a35c3c1cfef
4
- data.tar.gz: e2426b283146837d14be154ff0e67eb2505fd6587958b39212bf2dfe3bfccd80
3
+ metadata.gz: 815bc7de802959be7093d9e0478d83a0cf49a522e72a2df928de86223799d83d
4
+ data.tar.gz: cbfacf86f1e0507abe4df07b45f20bc3d06d682617c482419a05935186a61c15
5
5
  SHA512:
6
- metadata.gz: 2be9f24184036ec5b093de55640aebb60887ac59c566f37698fcba7a18daa15cf586566708def0060f80fc0747a50447538cf42fdf36024ae19ddac0de8b415c
7
- data.tar.gz: 4682a5524a8cad4a247ec53f99c78e317d56ee55433bb2ad7806af4f2a9854bc016fd23564003f009dc69d0fdcf81949dc88c64d3cbe824a8e76fc5cae8abc7d
6
+ metadata.gz: d0f3285b53cb8fe7e7d5ef30a970c632c52112c6b0503b8c81155f6cdb37583f036107b052c37019671355d0838512f904a61aeff5b69b7f6f8a2c1f4fabe785
7
+ data.tar.gz: f1b9c5759d77c1f497a0ac09ccf455beda29417024c4cd8ba6c0f8fcbac3347ab233c9e8c558a75382ef3b41495b5b693495ab0533c3a084a416c1f75a38313b
data/CHANGELOG.md CHANGED
@@ -1,3 +1,19 @@
1
+ ## 0.3.0 (2022-03-22)
2
+
3
+ - Changed `item_id` to `user_id` for `similar_users`
4
+ - Changed warning to an error when `value` is passed to `fit`
5
+ - Changed to use Faiss over NGT for `optimize_item_recs` and `optimize_similar_users` when both are installed
6
+ - Removed dependency on `wilson_score` gem for `top_items`
7
+ - Dropped support for Ruby < 2.6
8
+
9
+ ## 0.2.9 (2022-03-22)
10
+
11
+ - Fixed error with `load_movielens`
12
+
13
+ ## 0.2.8 (2022-03-13)
14
+
15
+ - Fixed error with `top_items` when all ratings are the same
16
+
1
17
  ## 0.2.7 (2021-08-06)
2
18
 
3
19
  - Added warning for `value`
data/LICENSE.txt CHANGED
@@ -1,4 +1,4 @@
1
- Copyright (c) 2019-2021 Andrew Kane
1
+ Copyright (c) 2019-2022 Andrew Kane
2
2
 
3
3
  MIT License
4
4
 
data/README.md CHANGED
@@ -13,7 +13,7 @@
13
13
  Add this line to your application’s Gemfile:
14
14
 
15
15
  ```ruby
16
- gem 'disco'
16
+ gem "disco"
17
17
  ```
18
18
 
19
19
  ## Getting Started
@@ -44,6 +44,8 @@ recommender.fit([
44
44
  ])
45
45
  ```
46
46
 
47
+ > Each `user_id`/`item_id` combination should only appear once
48
+
47
49
  Get user-based recommendations - “users like you also liked”
48
50
 
49
51
  ```ruby
@@ -97,11 +99,7 @@ recommender.item_recs("Star Wars (1977)")
97
99
  [Ahoy](https://github.com/ankane/ahoy) is a great source for implicit feedback
98
100
 
99
101
  ```ruby
100
- views = Ahoy::Event.
101
- where(name: "Viewed post").
102
- group(:user_id).
103
- group("properties->>'post_id'"). # postgres syntax
104
- count
102
+ views = Ahoy::Event.where(name: "Viewed post").group(:user_id).group_prop(:post_id).count
105
103
 
106
104
  data =
107
105
  views.map do |(user_id, post_id), _|
@@ -244,7 +242,7 @@ recommender.fit(data)
244
242
  recommender.top_items
245
243
  ```
246
244
 
247
- This uses [Wilson score](https://www.evanmiller.org/how-not-to-sort-by-average-rating.html) for explicit feedback (add [wilson_score](https://github.com/instacart/wilson_score) to your application’s Gemfile) and item frequency for implicit feedback.
245
+ This uses [Wilson score](https://www.evanmiller.org/how-not-to-sort-by-average-rating.html) for explicit feedback and item frequency for implicit feedback.
248
246
 
249
247
  ## Data
250
248
 
@@ -273,7 +271,7 @@ If you have a large number of users or items, you can use an approximate nearest
273
271
  Add this line to your application’s Gemfile:
274
272
 
275
273
  ```ruby
276
- gem 'faiss'
274
+ gem "faiss"
277
275
  ```
278
276
 
279
277
  Speed up the `user_recs` method with:
data/lib/disco/data.rb CHANGED
@@ -1,9 +1,11 @@
1
1
  module Disco
2
2
  module Data
3
3
  def load_movielens
4
- item_path = download_file("ml-100k/u.item", "http://files.grouplens.org/datasets/movielens/ml-100k/u.item",
4
+ require "csv"
5
+
6
+ item_path = download_file("ml-100k/u.item", "https://files.grouplens.org/datasets/movielens/ml-100k/u.item",
5
7
  file_hash: "553841ebc7de3a0fd0d6b62a204ea30c1e651aacfb2814c7a6584ac52f2c5701")
6
- data_path = download_file("ml-100k/u.data", "http://files.grouplens.org/datasets/movielens/ml-100k/u.data",
8
+ data_path = download_file("ml-100k/u.data", "https://files.grouplens.org/datasets/movielens/ml-100k/u.data",
7
9
  file_hash: "06416e597f82b7342361e41163890c81036900f418ad91315590814211dca490")
8
10
 
9
11
  # convert u.item to utf-8
@@ -29,6 +31,11 @@ module Disco
29
31
  private
30
32
 
31
33
  def download_file(fname, origin, file_hash:)
34
+ require "digest"
35
+ require "fileutils"
36
+ require "net/http"
37
+ require "tmpdir"
38
+
32
39
  # TODO handle this better
33
40
  raise "No HOME" unless ENV["HOME"]
34
41
  dest = "#{ENV["HOME"]}/.disco/#{fname}"
data/lib/disco/model.rb CHANGED
@@ -12,7 +12,7 @@ module Disco
12
12
 
13
13
  define_method("update_recommended_#{name}") do |items|
14
14
  now = Time.now
15
- items = items.map { |item| {subject_type: model_name.name, subject_id: id, item_type: class_name, item_id: item[:item_id], context: name, score: item[:score], created_at: now, updated_at: now} }
15
+ items = items.map { |item| {subject_type: model_name.name, subject_id: id, item_type: class_name, item_id: item.fetch(:item_id), context: name, score: item.fetch(:score), created_at: now, updated_at: now} }
16
16
 
17
17
  self.class.transaction do
18
18
  recommendations.where(context: name).delete_all
@@ -23,7 +23,7 @@ module Disco
23
23
  @implicit = !train_set.any? { |v| v[:rating] }
24
24
 
25
25
  if @implicit && train_set.any? { |v| v[:value] }
26
- warn "[disco] WARNING: Passing `:value` with implicit feedback has no effect on recommendations and can be removed. Earlier versions of the library incorrectly stated this was used."
26
+ raise ArgumentError, "Passing `:value` with implicit feedback has no effect on recommendations and should be removed. Earlier versions of the library incorrectly stated this was used."
27
27
  end
28
28
 
29
29
  # TODO improve performance
@@ -167,13 +167,13 @@ module Disco
167
167
 
168
168
  def similar_items(item_id, count: 5)
169
169
  check_fit
170
- similar(item_id, @item_map, normalized_item_factors, count, @similar_items_index)
170
+ similar(item_id, :item_id, @item_map, normalized_item_factors, count, @similar_items_index)
171
171
  end
172
172
  alias_method :item_recs, :similar_items
173
173
 
174
174
  def similar_users(user_id, count: 5)
175
175
  check_fit
176
- similar(user_id, @user_map, normalized_user_factors, count, @similar_users_index)
176
+ similar(user_id, :user_id, @user_map, normalized_user_factors, count, @similar_users_index)
177
177
  end
178
178
 
179
179
  def top_items(count: 5)
@@ -183,21 +183,20 @@ module Disco
183
183
  if @implicit
184
184
  scores = Numo::UInt64.cast(@item_count)
185
185
  else
186
- require "wilson_score"
186
+ min_rating = @min_rating
187
187
 
188
- range = @min_rating..@max_rating
189
- scores = Numo::DFloat.cast(@item_sum.zip(@item_count).map { |s, c| WilsonScore.rating_lower_bound(s / c, c, range) })
188
+ # TODO remove temp fix
189
+ min_rating -= 1 if @min_rating == @max_rating
190
190
 
191
- # TODO uncomment in 0.3.0
192
191
  # wilson score with continuity correction
193
192
  # https://en.wikipedia.org/wiki/Binomial_proportion_confidence_interval#Wilson_score_interval_with_continuity_correction
194
- # z = 1.96 # 95% confidence
195
- # range = @max_rating - @min_rating
196
- # n = Numo::DFloat.cast(@item_count)
197
- # phat = (Numo::DFloat.cast(@item_sum) - (@min_rating * n)) / range / n
198
- # phat = (phat - (1 / 2 * n)).clip(0, 100) # continuity correction
199
- # scores = (phat + z**2 / (2 * n) - z * Numo::DFloat::Math.sqrt((phat * (1 - phat) + z**2 / (4 * n)) / n)) / (1 + z**2 / n)
200
- # scores = scores * range + @min_rating
193
+ z = 1.96 # 95% confidence
194
+ range = @max_rating - @min_rating
195
+ n = Numo::DFloat.cast(@item_count)
196
+ phat = (Numo::DFloat.cast(@item_sum) - (min_rating * n)) / range / n
197
+ phat = (phat - (1 / (2 * n))).clip(0, nil) # continuity correction
198
+ scores = (phat + z**2 / (2 * n) - z * Numo::DFloat::Math.sqrt((phat * (1 - phat) + z**2 / (4 * n)) / n)) / (1 + z**2 / n)
199
+ scores = scores * range + min_rating
201
200
  end
202
201
 
203
202
  indexes = scores.sort_index.reverse
@@ -260,8 +259,7 @@ module Disco
260
259
 
261
260
  # factors should already be normalized for similar users/items
262
261
  def create_index(factors, library:)
263
- # TODO make Faiss the default in 0.3.0
264
- library ||= defined?(Faiss) && !defined?(Ngt) ? "faiss" : "ngt"
262
+ library ||= defined?(Ngt) && !defined?(Faiss) ? "ngt" : "faiss"
265
263
 
266
264
  case library
267
265
  when "faiss"
@@ -270,7 +268,7 @@ module Disco
270
268
  # inner product is cosine similarity with normalized vectors
271
269
  # https://github.com/facebookresearch/faiss/issues/95
272
270
  #
273
- # TODO use non-exact index in 0.3.0
271
+ # TODO add option for index type
274
272
  # https://github.com/facebookresearch/faiss/wiki/Faiss-indexes
275
273
  # index = Faiss::IndexHNSWFlat.new(factors.shape[1], 32, :inner_product)
276
274
  index = Faiss::IndexFlatIP.new(factors.shape[1])
@@ -312,7 +310,7 @@ module Disco
312
310
  factors / norms.expand_dims(1)
313
311
  end
314
312
 
315
- def similar(id, map, norm_factors, count, index)
313
+ def similar(id, key, map, norm_factors, count, index)
316
314
  i = map[id]
317
315
 
318
316
  if i && norm_factors.shape[0] > 1
@@ -336,9 +334,6 @@ module Disco
336
334
 
337
335
  keys = map.keys
338
336
 
339
- # TODO use user_id for similar_users in 0.3.0
340
- key = :item_id
341
-
342
337
  result = []
343
338
  # items can have the same score
344
339
  # so original item may not be at index 0
data/lib/disco/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Disco
2
- VERSION = "0.2.7"
2
+ VERSION = "0.3.0"
3
3
  end
data/lib/disco.rb CHANGED
@@ -2,11 +2,6 @@
2
2
  require "libmf"
3
3
  require "numo/narray"
4
4
 
5
- # stdlib
6
- require "csv"
7
- require "fileutils"
8
- require "net/http"
9
-
10
5
  # modules
11
6
  require "disco/data"
12
7
  require "disco/metrics"
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: disco
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.7
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-08-06 00:00:00.000000000 Z
11
+ date: 2022-03-22 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: libmf
@@ -69,14 +69,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
69
69
  requirements:
70
70
  - - ">="
71
71
  - !ruby/object:Gem::Version
72
- version: '2.4'
72
+ version: '2.6'
73
73
  required_rubygems_version: !ruby/object:Gem::Requirement
74
74
  requirements:
75
75
  - - ">="
76
76
  - !ruby/object:Gem::Version
77
77
  version: '0'
78
78
  requirements: []
79
- rubygems_version: 3.2.22
79
+ rubygems_version: 3.3.7
80
80
  signing_key:
81
81
  specification_version: 4
82
82
  summary: Recommendations for Ruby and Rails using collaborative filtering