disco 0.2.8 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +16 -0
- data/LICENSE.txt +1 -1
- data/README.md +10 -8
- data/lib/disco/data.rb +9 -2
- data/lib/disco/model.rb +1 -2
- data/lib/disco/recommender.rb +82 -27
- data/lib/disco/version.rb +1 -1
- data/lib/disco.rb +0 -5
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a4af4d7df56f884618557fd98f97da2686cecbfaf3ce1f1f52b6ba1a3a9155f5
|
4
|
+
data.tar.gz: ddbc7551c3534c41284e958042a94d988cdd5c52248f3ca7e4a8d8a72c6b168e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5f40f125fe4096dcf09eaf1d1295f23e68fef9bbe1e5651a1ecfa1ed06748df7a0f4c9ea048b767b429b67b4d0add1f32663388707616571a6936a55ffd1d6b7
|
7
|
+
data.tar.gz: baf3caa4deec5422bd85e9372bc717f683d4cbb067f468eb7d4c964556885f7bb273fa0b6339201aef917c8f046346c20356358462fc73103243608a5245ebe6
|
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,19 @@
|
|
1
|
+
## 0.3.1 (2022-07-10)
|
2
|
+
|
3
|
+
- Added support for JSON serialization
|
4
|
+
|
5
|
+
## 0.3.0 (2022-03-22)
|
6
|
+
|
7
|
+
- Changed `item_id` to `user_id` for `similar_users`
|
8
|
+
- Changed warning to an error when `value` passed to `fit`
|
9
|
+
- Changed to use Faiss over NGT for `optimize_item_recs` and `optimize_similar_users` when both are installed
|
10
|
+
- Removed dependency on `wilson_score` gem for `top_items`
|
11
|
+
- Dropped support for Ruby < 2.6
|
12
|
+
|
13
|
+
## 0.2.9 (2022-03-22)
|
14
|
+
|
15
|
+
- Fixed error with `load_movielens`
|
16
|
+
|
1
17
|
## 0.2.8 (2022-03-13)
|
2
18
|
|
3
19
|
- Fixed error with `top_items` with all same rating
|
data/LICENSE.txt
CHANGED
data/README.md
CHANGED
@@ -13,7 +13,7 @@
|
|
13
13
|
Add this line to your application’s Gemfile:
|
14
14
|
|
15
15
|
```ruby
|
16
|
-
gem
|
16
|
+
gem "disco"
|
17
17
|
```
|
18
18
|
|
19
19
|
## Getting Started
|
@@ -183,17 +183,19 @@ For Rails < 6, speed up inserts by adding [activerecord-import](https://github.c
|
|
183
183
|
If you’d prefer to perform recommendations on-the-fly, store the recommender
|
184
184
|
|
185
185
|
```ruby
|
186
|
-
|
187
|
-
File.
|
186
|
+
json = recommender.to_json
|
187
|
+
File.write("recommender.json", json)
|
188
188
|
```
|
189
189
|
|
190
|
-
> You can save it to a file, database, or any other storage system
|
190
|
+
> You can save it to a file, database, or any other storage system. Also, user and item IDs should be integers or strings for this.
|
191
|
+
|
192
|
+
The serialized recommender includes user activity from the training data (to avoid recommending previously rated items), so be sure to protect it.
|
191
193
|
|
192
194
|
Load a recommender
|
193
195
|
|
194
196
|
```ruby
|
195
|
-
|
196
|
-
recommender =
|
197
|
+
json = File.read("recommender.json")
|
198
|
+
recommender = Disco::Recommender.load_json(json)
|
197
199
|
```
|
198
200
|
|
199
201
|
Alternatively, you can store only the factors and use a library like [Neighbor](https://github.com/ankane/neighbor). See the [examples](https://github.com/ankane/neighbor/tree/master/examples).
|
@@ -242,7 +244,7 @@ recommender.fit(data)
|
|
242
244
|
recommender.top_items
|
243
245
|
```
|
244
246
|
|
245
|
-
This uses [Wilson score](https://www.evanmiller.org/how-not-to-sort-by-average-rating.html) for explicit feedback
|
247
|
+
This uses [Wilson score](https://www.evanmiller.org/how-not-to-sort-by-average-rating.html) for explicit feedback and item frequency for implicit feedback.
|
246
248
|
|
247
249
|
## Data
|
248
250
|
|
@@ -271,7 +273,7 @@ If you have a large number of users or items, you can use an approximate nearest
|
|
271
273
|
Add this line to your application’s Gemfile:
|
272
274
|
|
273
275
|
```ruby
|
274
|
-
gem
|
276
|
+
gem "faiss"
|
275
277
|
```
|
276
278
|
|
277
279
|
Speed up the `user_recs` method with:
|
data/lib/disco/data.rb
CHANGED
@@ -1,9 +1,11 @@
|
|
1
1
|
module Disco
|
2
2
|
module Data
|
3
3
|
def load_movielens
|
4
|
-
|
4
|
+
require "csv"
|
5
|
+
|
6
|
+
item_path = download_file("ml-100k/u.item", "https://files.grouplens.org/datasets/movielens/ml-100k/u.item",
|
5
7
|
file_hash: "553841ebc7de3a0fd0d6b62a204ea30c1e651aacfb2814c7a6584ac52f2c5701")
|
6
|
-
data_path = download_file("ml-100k/u.data", "
|
8
|
+
data_path = download_file("ml-100k/u.data", "https://files.grouplens.org/datasets/movielens/ml-100k/u.data",
|
7
9
|
file_hash: "06416e597f82b7342361e41163890c81036900f418ad91315590814211dca490")
|
8
10
|
|
9
11
|
# convert u.item to utf-8
|
@@ -29,6 +31,11 @@ module Disco
|
|
29
31
|
private
|
30
32
|
|
31
33
|
def download_file(fname, origin, file_hash:)
|
34
|
+
require "digest"
|
35
|
+
require "fileutils"
|
36
|
+
require "net/http"
|
37
|
+
require "tmpdir"
|
38
|
+
|
32
39
|
# TODO handle this better
|
33
40
|
raise "No HOME" unless ENV["HOME"]
|
34
41
|
dest = "#{ENV["HOME"]}/.disco/#{fname}"
|
data/lib/disco/model.rb
CHANGED
@@ -10,10 +10,9 @@ module Disco
|
|
10
10
|
|
11
11
|
has_many :"recommended_#{name}", -> { where("disco_recommendations.context = ?", name).order("disco_recommendations.score DESC") }, through: :recommendations, source: :item, source_type: class_name
|
12
12
|
|
13
|
-
# TODO use fetch for item_id and score in 0.3.0
|
14
13
|
define_method("update_recommended_#{name}") do |items|
|
15
14
|
now = Time.now
|
16
|
-
items = items.map { |item| {subject_type: model_name.name, subject_id: id, item_type: class_name, item_id: item
|
15
|
+
items = items.map { |item| {subject_type: model_name.name, subject_id: id, item_type: class_name, item_id: item.fetch(:item_id), context: name, score: item.fetch(:score), created_at: now, updated_at: now} }
|
17
16
|
|
18
17
|
self.class.transaction do
|
19
18
|
recommendations.where(context: name).delete_all
|
data/lib/disco/recommender.rb
CHANGED
@@ -23,7 +23,7 @@ module Disco
|
|
23
23
|
@implicit = !train_set.any? { |v| v[:rating] }
|
24
24
|
|
25
25
|
if @implicit && train_set.any? { |v| v[:value] }
|
26
|
-
|
26
|
+
raise ArgumentError, "Passing `:value` with implicit feedback has no effect on recommendations and should be removed. Earlier versions of the library incorrectly stated this was used."
|
27
27
|
end
|
28
28
|
|
29
29
|
# TODO improve performance
|
@@ -167,13 +167,13 @@ module Disco
|
|
167
167
|
|
168
168
|
def similar_items(item_id, count: 5)
|
169
169
|
check_fit
|
170
|
-
similar(item_id, @item_map, normalized_item_factors, count, @similar_items_index)
|
170
|
+
similar(item_id, :item_id, @item_map, normalized_item_factors, count, @similar_items_index)
|
171
171
|
end
|
172
172
|
alias_method :item_recs, :similar_items
|
173
173
|
|
174
174
|
def similar_users(user_id, count: 5)
|
175
175
|
check_fit
|
176
|
-
similar(user_id, @user_map, normalized_user_factors, count, @similar_users_index)
|
176
|
+
similar(user_id, :user_id, @user_map, normalized_user_factors, count, @similar_users_index)
|
177
177
|
end
|
178
178
|
|
179
179
|
def top_items(count: 5)
|
@@ -183,27 +183,20 @@ module Disco
|
|
183
183
|
if @implicit
|
184
184
|
scores = Numo::UInt64.cast(@item_count)
|
185
185
|
else
|
186
|
-
|
186
|
+
min_rating = @min_rating
|
187
187
|
|
188
|
-
|
189
|
-
|
190
|
-
# TODO remove temp fix
|
191
|
-
(@min_rating - 1)..@max_rating
|
192
|
-
else
|
193
|
-
@min_rating..@max_rating
|
194
|
-
end
|
195
|
-
scores = Numo::DFloat.cast(@item_sum.zip(@item_count).map { |s, c| WilsonScore.rating_lower_bound(s / c, c, range) })
|
188
|
+
# TODO remove temp fix
|
189
|
+
min_rating -= 1 if @min_rating == @max_rating
|
196
190
|
|
197
|
-
# TODO uncomment in 0.3.0
|
198
191
|
# wilson score with continuity correction
|
199
192
|
# https://en.wikipedia.org/wiki/Binomial_proportion_confidence_interval#Wilson_score_interval_with_continuity_correction
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
193
|
+
z = 1.96 # 95% confidence
|
194
|
+
range = @max_rating - @min_rating
|
195
|
+
n = Numo::DFloat.cast(@item_count)
|
196
|
+
phat = (Numo::DFloat.cast(@item_sum) - (min_rating * n)) / range / n
|
197
|
+
phat = (phat - (1 / (2 * n))).clip(0, nil) # continuity correction
|
198
|
+
scores = (phat + z**2 / (2 * n) - z * Numo::DFloat::Math.sqrt((phat * (1 - phat) + z**2 / (4 * n)) / n)) / (1 + z**2 / n)
|
199
|
+
scores = scores * range + min_rating
|
207
200
|
end
|
208
201
|
|
209
202
|
indexes = scores.sort_index.reverse
|
@@ -262,12 +255,51 @@ module Disco
|
|
262
255
|
to_s # for now
|
263
256
|
end
|
264
257
|
|
258
|
+
def to_json
|
259
|
+
require "base64"
|
260
|
+
require "json"
|
261
|
+
|
262
|
+
obj = {
|
263
|
+
implicit: @implicit,
|
264
|
+
user_ids: @user_map.keys,
|
265
|
+
item_ids: @item_map.keys,
|
266
|
+
rated: @user_map.map { |_, u| (@rated[u] || {}).keys },
|
267
|
+
global_mean: @global_mean,
|
268
|
+
user_factors: Base64.strict_encode64(@user_factors.to_binary),
|
269
|
+
item_factors: Base64.strict_encode64(@item_factors.to_binary),
|
270
|
+
factors: @factors,
|
271
|
+
epochs: @epochs,
|
272
|
+
verbose: @verbose
|
273
|
+
}
|
274
|
+
|
275
|
+
unless @implicit
|
276
|
+
obj[:min_rating] = @min_rating
|
277
|
+
obj[:max_rating] = @max_rating
|
278
|
+
end
|
279
|
+
|
280
|
+
if @top_items
|
281
|
+
obj[:item_count] = @item_count
|
282
|
+
obj[:item_sum] = @item_sum
|
283
|
+
end
|
284
|
+
|
285
|
+
JSON.generate(obj)
|
286
|
+
end
|
287
|
+
|
288
|
+
def self.load_json(json)
|
289
|
+
require "json"
|
290
|
+
|
291
|
+
obj = JSON.parse(json)
|
292
|
+
|
293
|
+
recommender = new
|
294
|
+
recommender.send(:json_load, obj)
|
295
|
+
recommender
|
296
|
+
end
|
297
|
+
|
265
298
|
private
|
266
299
|
|
267
300
|
# factors should already be normalized for similar users/items
|
268
301
|
def create_index(factors, library:)
|
269
|
-
|
270
|
-
library ||= defined?(Faiss) && !defined?(Ngt) ? "faiss" : "ngt"
|
302
|
+
library ||= defined?(Ngt) && !defined?(Faiss) ? "ngt" : "faiss"
|
271
303
|
|
272
304
|
case library
|
273
305
|
when "faiss"
|
@@ -276,7 +308,7 @@ module Disco
|
|
276
308
|
# inner product is cosine similarity with normalized vectors
|
277
309
|
# https://github.com/facebookresearch/faiss/issues/95
|
278
310
|
#
|
279
|
-
# TODO
|
311
|
+
# TODO add option for index type
|
280
312
|
# https://github.com/facebookresearch/faiss/wiki/Faiss-indexes
|
281
313
|
# index = Faiss::IndexHNSWFlat.new(factors.shape[1], 32, :inner_product)
|
282
314
|
index = Faiss::IndexFlatIP.new(factors.shape[1])
|
@@ -318,7 +350,7 @@ module Disco
|
|
318
350
|
factors / norms.expand_dims(1)
|
319
351
|
end
|
320
352
|
|
321
|
-
def similar(id, map, norm_factors, count, index)
|
353
|
+
def similar(id, key, map, norm_factors, count, index)
|
322
354
|
i = map[id]
|
323
355
|
|
324
356
|
if i && norm_factors.shape[0] > 1
|
@@ -342,9 +374,6 @@ module Disco
|
|
342
374
|
|
343
375
|
keys = map.keys
|
344
376
|
|
345
|
-
# TODO use user_id for similar_users in 0.3.0
|
346
|
-
key = :item_id
|
347
|
-
|
348
377
|
result = []
|
349
378
|
# items can have the same score
|
350
379
|
# so original item may not be at index 0
|
@@ -445,5 +474,31 @@ module Disco
|
|
445
474
|
@item_sum = obj[:item_sum]
|
446
475
|
end
|
447
476
|
end
|
477
|
+
|
478
|
+
def json_load(obj)
|
479
|
+
require "base64"
|
480
|
+
|
481
|
+
@implicit = obj["implicit"]
|
482
|
+
@user_map = obj["user_ids"].map.with_index.to_h
|
483
|
+
@item_map = obj["item_ids"].map.with_index.to_h
|
484
|
+
@rated = obj["rated"].map.with_index.to_h { |r, i| [i, r.to_h { |v| [v, true] }] }
|
485
|
+
@global_mean = obj["global_mean"].to_f
|
486
|
+
@factors = obj["factors"].to_i
|
487
|
+
@user_factors = Numo::SFloat.from_binary(Base64.strict_decode64(obj["user_factors"]), [@user_map.size, @factors])
|
488
|
+
@item_factors = Numo::SFloat.from_binary(Base64.strict_decode64(obj["item_factors"]), [@item_map.size, @factors])
|
489
|
+
@epochs = obj["epochs"].to_i
|
490
|
+
@verbose = obj["verbose"]
|
491
|
+
|
492
|
+
unless @implicit
|
493
|
+
@min_rating = obj["min_rating"]
|
494
|
+
@max_rating = obj["max_rating"]
|
495
|
+
end
|
496
|
+
|
497
|
+
@top_items = obj.key?("item_count")
|
498
|
+
if @top_items
|
499
|
+
@item_count = obj["item_count"]
|
500
|
+
@item_sum = obj["item_sum"]
|
501
|
+
end
|
502
|
+
end
|
448
503
|
end
|
449
504
|
end
|
data/lib/disco/version.rb
CHANGED
data/lib/disco.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: disco
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.8
|
4
|
+
version: 0.3.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Kane
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2022-03-13 00:00:00.000000000 Z
|
11
|
+
date: 2022-07-10 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: libmf
|
@@ -69,7 +69,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
69
69
|
requirements:
|
70
70
|
- - ">="
|
71
71
|
- !ruby/object:Gem::Version
|
72
|
-
version: '2.
|
72
|
+
version: '2.6'
|
73
73
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
74
74
|
requirements:
|
75
75
|
- - ">="
|