disco 0.2.8 → 0.3.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +16 -0
- data/LICENSE.txt +1 -1
- data/README.md +10 -8
- data/lib/disco/data.rb +9 -2
- data/lib/disco/model.rb +1 -2
- data/lib/disco/recommender.rb +82 -27
- data/lib/disco/version.rb +1 -1
- data/lib/disco.rb +0 -5
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a4af4d7df56f884618557fd98f97da2686cecbfaf3ce1f1f52b6ba1a3a9155f5
|
4
|
+
data.tar.gz: ddbc7551c3534c41284e958042a94d988cdd5c52248f3ca7e4a8d8a72c6b168e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5f40f125fe4096dcf09eaf1d1295f23e68fef9bbe1e5651a1ecfa1ed06748df7a0f4c9ea048b767b429b67b4d0add1f32663388707616571a6936a55ffd1d6b7
|
7
|
+
data.tar.gz: baf3caa4deec5422bd85e9372bc717f683d4cbb067f468eb7d4c964556885f7bb273fa0b6339201aef917c8f046346c20356358462fc73103243608a5245ebe6
|
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,19 @@
|
|
1
|
+
## 0.3.1 (2022-07-10)
|
2
|
+
|
3
|
+
- Added support for JSON serialization
|
4
|
+
|
5
|
+
## 0.3.0 (2022-03-22)
|
6
|
+
|
7
|
+
- Changed `item_id` to `user_id` for `similar_users`
|
8
|
+
- Changed warning to an error when `value` is passed to `fit`
|
9
|
+
- Changed to use Faiss over NGT for `optimize_item_recs` and `optimize_similar_users` when both are installed
|
10
|
+
- Removed dependency on `wilson_score` gem for `top_items`
|
11
|
+
- Dropped support for Ruby < 2.6
|
12
|
+
|
13
|
+
## 0.2.9 (2022-03-22)
|
14
|
+
|
15
|
+
- Fixed error with `load_movielens`
|
16
|
+
|
1
17
|
## 0.2.8 (2022-03-13)
|
2
18
|
|
3
19
|
- Fixed error with `top_items` when all ratings are the same
|
data/LICENSE.txt
CHANGED
data/README.md
CHANGED
@@ -13,7 +13,7 @@
|
|
13
13
|
Add this line to your application’s Gemfile:
|
14
14
|
|
15
15
|
```ruby
|
16
|
-
gem
|
16
|
+
gem "disco"
|
17
17
|
```
|
18
18
|
|
19
19
|
## Getting Started
|
@@ -183,17 +183,19 @@ For Rails < 6, speed up inserts by adding [activerecord-import](https://github.c
|
|
183
183
|
If you’d prefer to perform recommendations on-the-fly, store the recommender
|
184
184
|
|
185
185
|
```ruby
|
186
|
-
|
187
|
-
File.
|
186
|
+
json = recommender.to_json
|
187
|
+
File.write("recommender.json", json)
|
188
188
|
```
|
189
189
|
|
190
|
-
> You can save it to a file, database, or any other storage system
|
190
|
+
> You can save it to a file, database, or any other storage system. Note that user and item IDs should be integers or strings so they can be serialized to JSON.
|
191
|
+
|
192
|
+
The serialized recommender includes user activity from the training data (to avoid recommending previously rated items), so be sure to protect it.
|
191
193
|
|
192
194
|
Load a recommender
|
193
195
|
|
194
196
|
```ruby
|
195
|
-
|
196
|
-
recommender =
|
197
|
+
json = File.read("recommender.json")
|
198
|
+
recommender = Disco::Recommender.load_json(json)
|
197
199
|
```
|
198
200
|
|
199
201
|
Alternatively, you can store only the factors and use a library like [Neighbor](https://github.com/ankane/neighbor). See the [examples](https://github.com/ankane/neighbor/tree/master/examples).
|
@@ -242,7 +244,7 @@ recommender.fit(data)
|
|
242
244
|
recommender.top_items
|
243
245
|
```
|
244
246
|
|
245
|
-
This uses [Wilson score](https://www.evanmiller.org/how-not-to-sort-by-average-rating.html) for explicit feedback
|
247
|
+
This uses [Wilson score](https://www.evanmiller.org/how-not-to-sort-by-average-rating.html) for explicit feedback and item frequency for implicit feedback.
|
246
248
|
|
247
249
|
## Data
|
248
250
|
|
@@ -271,7 +273,7 @@ If you have a large number of users or items, you can use an approximate nearest
|
|
271
273
|
Add this line to your application’s Gemfile:
|
272
274
|
|
273
275
|
```ruby
|
274
|
-
gem
|
276
|
+
gem "faiss"
|
275
277
|
```
|
276
278
|
|
277
279
|
Speed up the `user_recs` method with:
|
data/lib/disco/data.rb
CHANGED
@@ -1,9 +1,11 @@
|
|
1
1
|
module Disco
|
2
2
|
module Data
|
3
3
|
def load_movielens
|
4
|
-
|
4
|
+
require "csv"
|
5
|
+
|
6
|
+
item_path = download_file("ml-100k/u.item", "https://files.grouplens.org/datasets/movielens/ml-100k/u.item",
|
5
7
|
file_hash: "553841ebc7de3a0fd0d6b62a204ea30c1e651aacfb2814c7a6584ac52f2c5701")
|
6
|
-
data_path = download_file("ml-100k/u.data", "
|
8
|
+
data_path = download_file("ml-100k/u.data", "https://files.grouplens.org/datasets/movielens/ml-100k/u.data",
|
7
9
|
file_hash: "06416e597f82b7342361e41163890c81036900f418ad91315590814211dca490")
|
8
10
|
|
9
11
|
# convert u.item to utf-8
|
@@ -29,6 +31,11 @@ module Disco
|
|
29
31
|
private
|
30
32
|
|
31
33
|
def download_file(fname, origin, file_hash:)
|
34
|
+
require "digest"
|
35
|
+
require "fileutils"
|
36
|
+
require "net/http"
|
37
|
+
require "tmpdir"
|
38
|
+
|
32
39
|
# TODO handle this better
|
33
40
|
raise "No HOME" unless ENV["HOME"]
|
34
41
|
dest = "#{ENV["HOME"]}/.disco/#{fname}"
|
data/lib/disco/model.rb
CHANGED
@@ -10,10 +10,9 @@ module Disco
|
|
10
10
|
|
11
11
|
has_many :"recommended_#{name}", -> { where("disco_recommendations.context = ?", name).order("disco_recommendations.score DESC") }, through: :recommendations, source: :item, source_type: class_name
|
12
12
|
|
13
|
-
# TODO use fetch for item_id and score in 0.3.0
|
14
13
|
define_method("update_recommended_#{name}") do |items|
|
15
14
|
now = Time.now
|
16
|
-
items = items.map { |item| {subject_type: model_name.name, subject_id: id, item_type: class_name, item_id: item
|
15
|
+
items = items.map { |item| {subject_type: model_name.name, subject_id: id, item_type: class_name, item_id: item.fetch(:item_id), context: name, score: item.fetch(:score), created_at: now, updated_at: now} }
|
17
16
|
|
18
17
|
self.class.transaction do
|
19
18
|
recommendations.where(context: name).delete_all
|
data/lib/disco/recommender.rb
CHANGED
@@ -23,7 +23,7 @@ module Disco
|
|
23
23
|
@implicit = !train_set.any? { |v| v[:rating] }
|
24
24
|
|
25
25
|
if @implicit && train_set.any? { |v| v[:value] }
|
26
|
-
|
26
|
+
raise ArgumentError, "Passing `:value` with implicit feedback has no effect on recommendations and should be removed. Earlier versions of the library incorrectly stated this was used."
|
27
27
|
end
|
28
28
|
|
29
29
|
# TODO improve performance
|
@@ -167,13 +167,13 @@ module Disco
|
|
167
167
|
|
168
168
|
def similar_items(item_id, count: 5)
|
169
169
|
check_fit
|
170
|
-
similar(item_id, @item_map, normalized_item_factors, count, @similar_items_index)
|
170
|
+
similar(item_id, :item_id, @item_map, normalized_item_factors, count, @similar_items_index)
|
171
171
|
end
|
172
172
|
alias_method :item_recs, :similar_items
|
173
173
|
|
174
174
|
def similar_users(user_id, count: 5)
|
175
175
|
check_fit
|
176
|
-
similar(user_id, @user_map, normalized_user_factors, count, @similar_users_index)
|
176
|
+
similar(user_id, :user_id, @user_map, normalized_user_factors, count, @similar_users_index)
|
177
177
|
end
|
178
178
|
|
179
179
|
def top_items(count: 5)
|
@@ -183,27 +183,20 @@ module Disco
|
|
183
183
|
if @implicit
|
184
184
|
scores = Numo::UInt64.cast(@item_count)
|
185
185
|
else
|
186
|
-
|
186
|
+
min_rating = @min_rating
|
187
187
|
|
188
|
-
|
189
|
-
|
190
|
-
# TODO remove temp fix
|
191
|
-
(@min_rating - 1)..@max_rating
|
192
|
-
else
|
193
|
-
@min_rating..@max_rating
|
194
|
-
end
|
195
|
-
scores = Numo::DFloat.cast(@item_sum.zip(@item_count).map { |s, c| WilsonScore.rating_lower_bound(s / c, c, range) })
|
188
|
+
# TODO remove temp fix
|
189
|
+
min_rating -= 1 if @min_rating == @max_rating
|
196
190
|
|
197
|
-
# TODO uncomment in 0.3.0
|
198
191
|
# wilson score with continuity correction
|
199
192
|
# https://en.wikipedia.org/wiki/Binomial_proportion_confidence_interval#Wilson_score_interval_with_continuity_correction
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
193
|
+
z = 1.96 # 95% confidence
|
194
|
+
range = @max_rating - @min_rating
|
195
|
+
n = Numo::DFloat.cast(@item_count)
|
196
|
+
phat = (Numo::DFloat.cast(@item_sum) - (min_rating * n)) / range / n
|
197
|
+
phat = (phat - (1 / (2 * n))).clip(0, nil) # continuity correction
|
198
|
+
scores = (phat + z**2 / (2 * n) - z * Numo::DFloat::Math.sqrt((phat * (1 - phat) + z**2 / (4 * n)) / n)) / (1 + z**2 / n)
|
199
|
+
scores = scores * range + min_rating
|
207
200
|
end
|
208
201
|
|
209
202
|
indexes = scores.sort_index.reverse
|
@@ -262,12 +255,51 @@ module Disco
|
|
262
255
|
to_s # for now
|
263
256
|
end
|
264
257
|
|
258
|
+
def to_json
|
259
|
+
require "base64"
|
260
|
+
require "json"
|
261
|
+
|
262
|
+
obj = {
|
263
|
+
implicit: @implicit,
|
264
|
+
user_ids: @user_map.keys,
|
265
|
+
item_ids: @item_map.keys,
|
266
|
+
rated: @user_map.map { |_, u| (@rated[u] || {}).keys },
|
267
|
+
global_mean: @global_mean,
|
268
|
+
user_factors: Base64.strict_encode64(@user_factors.to_binary),
|
269
|
+
item_factors: Base64.strict_encode64(@item_factors.to_binary),
|
270
|
+
factors: @factors,
|
271
|
+
epochs: @epochs,
|
272
|
+
verbose: @verbose
|
273
|
+
}
|
274
|
+
|
275
|
+
unless @implicit
|
276
|
+
obj[:min_rating] = @min_rating
|
277
|
+
obj[:max_rating] = @max_rating
|
278
|
+
end
|
279
|
+
|
280
|
+
if @top_items
|
281
|
+
obj[:item_count] = @item_count
|
282
|
+
obj[:item_sum] = @item_sum
|
283
|
+
end
|
284
|
+
|
285
|
+
JSON.generate(obj)
|
286
|
+
end
|
287
|
+
|
288
|
+
def self.load_json(json)
|
289
|
+
require "json"
|
290
|
+
|
291
|
+
obj = JSON.parse(json)
|
292
|
+
|
293
|
+
recommender = new
|
294
|
+
recommender.send(:json_load, obj)
|
295
|
+
recommender
|
296
|
+
end
|
297
|
+
|
265
298
|
private
|
266
299
|
|
267
300
|
# factors should already be normalized for similar users/items
|
268
301
|
def create_index(factors, library:)
|
269
|
-
|
270
|
-
library ||= defined?(Faiss) && !defined?(Ngt) ? "faiss" : "ngt"
|
302
|
+
library ||= defined?(Ngt) && !defined?(Faiss) ? "ngt" : "faiss"
|
271
303
|
|
272
304
|
case library
|
273
305
|
when "faiss"
|
@@ -276,7 +308,7 @@ module Disco
|
|
276
308
|
# inner product is cosine similarity with normalized vectors
|
277
309
|
# https://github.com/facebookresearch/faiss/issues/95
|
278
310
|
#
|
279
|
-
# TODO
|
311
|
+
# TODO add option for index type
|
280
312
|
# https://github.com/facebookresearch/faiss/wiki/Faiss-indexes
|
281
313
|
# index = Faiss::IndexHNSWFlat.new(factors.shape[1], 32, :inner_product)
|
282
314
|
index = Faiss::IndexFlatIP.new(factors.shape[1])
|
@@ -318,7 +350,7 @@ module Disco
|
|
318
350
|
factors / norms.expand_dims(1)
|
319
351
|
end
|
320
352
|
|
321
|
-
def similar(id, map, norm_factors, count, index)
|
353
|
+
def similar(id, key, map, norm_factors, count, index)
|
322
354
|
i = map[id]
|
323
355
|
|
324
356
|
if i && norm_factors.shape[0] > 1
|
@@ -342,9 +374,6 @@ module Disco
|
|
342
374
|
|
343
375
|
keys = map.keys
|
344
376
|
|
345
|
-
# TODO use user_id for similar_users in 0.3.0
|
346
|
-
key = :item_id
|
347
|
-
|
348
377
|
result = []
|
349
378
|
# items can have the same score
|
350
379
|
# so original item may not be at index 0
|
@@ -445,5 +474,31 @@ module Disco
|
|
445
474
|
@item_sum = obj[:item_sum]
|
446
475
|
end
|
447
476
|
end
|
477
|
+
|
478
|
+
def json_load(obj)
|
479
|
+
require "base64"
|
480
|
+
|
481
|
+
@implicit = obj["implicit"]
|
482
|
+
@user_map = obj["user_ids"].map.with_index.to_h
|
483
|
+
@item_map = obj["item_ids"].map.with_index.to_h
|
484
|
+
@rated = obj["rated"].map.with_index.to_h { |r, i| [i, r.to_h { |v| [v, true] }] }
|
485
|
+
@global_mean = obj["global_mean"].to_f
|
486
|
+
@factors = obj["factors"].to_i
|
487
|
+
@user_factors = Numo::SFloat.from_binary(Base64.strict_decode64(obj["user_factors"]), [@user_map.size, @factors])
|
488
|
+
@item_factors = Numo::SFloat.from_binary(Base64.strict_decode64(obj["item_factors"]), [@item_map.size, @factors])
|
489
|
+
@epochs = obj["epochs"].to_i
|
490
|
+
@verbose = obj["verbose"]
|
491
|
+
|
492
|
+
unless @implicit
|
493
|
+
@min_rating = obj["min_rating"]
|
494
|
+
@max_rating = obj["max_rating"]
|
495
|
+
end
|
496
|
+
|
497
|
+
@top_items = obj.key?("item_count")
|
498
|
+
if @top_items
|
499
|
+
@item_count = obj["item_count"]
|
500
|
+
@item_sum = obj["item_sum"]
|
501
|
+
end
|
502
|
+
end
|
448
503
|
end
|
449
504
|
end
|
data/lib/disco/version.rb
CHANGED
data/lib/disco.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: disco
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.3.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Kane
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2022-
|
11
|
+
date: 2022-07-10 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: libmf
|
@@ -69,7 +69,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
69
69
|
requirements:
|
70
70
|
- - ">="
|
71
71
|
- !ruby/object:Gem::Version
|
72
|
-
version: '2.
|
72
|
+
version: '2.6'
|
73
73
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
74
74
|
requirements:
|
75
75
|
- - ">="
|