disco 0.2.6 → 0.2.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +12 -0
- data/README.md +36 -19
- data/lib/disco/data.rb +9 -2
- data/lib/disco/model.rb +1 -0
- data/lib/disco/recommender.rb +37 -15
- data/lib/disco/version.rb +1 -1
- data/lib/disco.rb +0 -5
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d677e14bfb504669dd4f36cc00997128bbb0f7282c428ec29f88bf072587d82f
|
4
|
+
data.tar.gz: c797b4f1eb39aff5596b5e10b346260f6701df43e1c84559fb2395b23ed2c8f3
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: b2f0d889ac2c3dbed66642a15460c87e04eb81731f3a729b111522220159e7c52927aed5238413e323b83901fc490e1dce2cd12d6005b196f91eca69c9023277
|
7
|
+
data.tar.gz: a57abc84399c0cbc57f2997f341d8fe192e2835887df63fc75c13bc6a5d2aed5808aaf555b48c25902a65375df5cfd11990f1e987dfef77a7eef98d63f19914e
|
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,15 @@
|
|
1
|
+
## 0.2.9 (2022-03-22)
|
2
|
+
|
3
|
+
- Fixed error with `load_movielens`
|
4
|
+
|
5
|
+
## 0.2.8 (2022-03-13)
|
6
|
+
|
7
|
+
- Fixed error with `top_items` with all same rating
|
8
|
+
|
9
|
+
## 0.2.7 (2021-08-06)
|
10
|
+
|
11
|
+
- Added warning for `value`
|
12
|
+
|
1
13
|
## 0.2.6 (2021-02-24)
|
2
14
|
|
3
15
|
- Improved performance
|
data/README.md
CHANGED
@@ -13,7 +13,7 @@
|
|
13
13
|
Add this line to your application’s Gemfile:
|
14
14
|
|
15
15
|
```ruby
|
16
|
-
gem
|
16
|
+
gem "disco"
|
17
17
|
```
|
18
18
|
|
19
19
|
## Getting Started
|
@@ -35,16 +35,16 @@ recommender.fit([
|
|
35
35
|
|
36
36
|
> IDs can be integers, strings, or any other data type
|
37
37
|
|
38
|
-
If users don’t rate items directly (for instance, they’re purchasing items or reading posts), this is known as implicit feedback. Leave out the rating
|
38
|
+
If users don’t rate items directly (for instance, they’re purchasing items or reading posts), this is known as implicit feedback. Leave out the rating.
|
39
39
|
|
40
40
|
```ruby
|
41
41
|
recommender.fit([
|
42
|
-
{user_id: 1, item_id: 1
|
43
|
-
{user_id: 2, item_id: 1
|
42
|
+
{user_id: 1, item_id: 1},
|
43
|
+
{user_id: 2, item_id: 1}
|
44
44
|
])
|
45
45
|
```
|
46
46
|
|
47
|
-
>
|
47
|
+
> Each `user_id`/`item_id` combination should only appear once
|
48
48
|
|
49
49
|
Get user-based recommendations - “users like you also liked”
|
50
50
|
|
@@ -99,18 +99,13 @@ recommender.item_recs("Star Wars (1977)")
|
|
99
99
|
[Ahoy](https://github.com/ankane/ahoy) is a great source for implicit feedback
|
100
100
|
|
101
101
|
```ruby
|
102
|
-
views = Ahoy::Event.
|
103
|
-
where(name: "Viewed post").
|
104
|
-
group(:user_id).
|
105
|
-
group("properties->>'post_id'"). # postgres syntax
|
106
|
-
count
|
102
|
+
views = Ahoy::Event.where(name: "Viewed post").group(:user_id).group_prop(:post_id).count
|
107
103
|
|
108
104
|
data =
|
109
|
-
views.map do |(user_id, post_id),
|
105
|
+
views.map do |(user_id, post_id), _|
|
110
106
|
{
|
111
107
|
user_id: user_id,
|
112
|
-
item_id: post_id
|
113
|
-
value: count
|
108
|
+
item_id: post_id
|
114
109
|
}
|
115
110
|
end
|
116
111
|
```
|
@@ -201,7 +196,7 @@ bin = File.binread("recommender.bin")
|
|
201
196
|
recommender = Marshal.load(bin)
|
202
197
|
```
|
203
198
|
|
204
|
-
Alternatively, you can store only the factors and use a library like [Neighbor](https://github.com/ankane/neighbor)
|
199
|
+
Alternatively, you can store only the factors and use a library like [Neighbor](https://github.com/ankane/neighbor). See the [examples](https://github.com/ankane/neighbor/tree/master/examples).
|
205
200
|
|
206
201
|
## Algorithms
|
207
202
|
|
@@ -276,28 +271,28 @@ If you have a large number of users or items, you can use an approximate nearest
|
|
276
271
|
Add this line to your application’s Gemfile:
|
277
272
|
|
278
273
|
```ruby
|
279
|
-
gem
|
274
|
+
gem "faiss"
|
280
275
|
```
|
281
276
|
|
282
277
|
Speed up the `user_recs` method with:
|
283
278
|
|
284
279
|
```ruby
|
285
|
-
|
280
|
+
recommender.optimize_user_recs
|
286
281
|
```
|
287
282
|
|
288
283
|
Speed up the `item_recs` method with:
|
289
284
|
|
290
285
|
```ruby
|
291
|
-
|
286
|
+
recommender.optimize_item_recs
|
292
287
|
```
|
293
288
|
|
294
289
|
Speed up the `similar_users` method with:
|
295
290
|
|
296
291
|
```ruby
|
297
|
-
|
292
|
+
recommender.optimize_similar_users
|
298
293
|
```
|
299
294
|
|
300
|
-
This should be called after fitting or loading the
|
295
|
+
This should be called after fitting or loading the recommender.
|
301
296
|
|
302
297
|
## Reference
|
303
298
|
|
@@ -336,6 +331,28 @@ Thanks to:
|
|
336
331
|
- [Implicit](https://github.com/benfred/implicit/) for serving as an initial reference for user and item similarity
|
337
332
|
- [@dasch](https://github.com/dasch) for the gem name
|
338
333
|
|
334
|
+
## Upgrading
|
335
|
+
|
336
|
+
### 0.2.7
|
337
|
+
|
338
|
+
There’s now a warning when passing `:value` with implicit feedback, as this has no effect on recommendations and can be removed. Earlier versions of the library incorrectly stated this was used.
|
339
|
+
|
340
|
+
```ruby
|
341
|
+
recommender.fit([
|
342
|
+
{user_id: 1, item_id: 1, value: 1},
|
343
|
+
{user_id: 2, item_id: 1, value: 3}
|
344
|
+
])
|
345
|
+
```
|
346
|
+
|
347
|
+
to:
|
348
|
+
|
349
|
+
```ruby
|
350
|
+
recommender.fit([
|
351
|
+
{user_id: 1, item_id: 1},
|
352
|
+
{user_id: 2, item_id: 1}
|
353
|
+
])
|
354
|
+
```
|
355
|
+
|
339
356
|
## History
|
340
357
|
|
341
358
|
View the [changelog](https://github.com/ankane/disco/blob/master/CHANGELOG.md)
|
data/lib/disco/data.rb
CHANGED
@@ -1,9 +1,11 @@
|
|
1
1
|
module Disco
|
2
2
|
module Data
|
3
3
|
def load_movielens
|
4
|
-
|
4
|
+
require "csv"
|
5
|
+
|
6
|
+
item_path = download_file("ml-100k/u.item", "https://files.grouplens.org/datasets/movielens/ml-100k/u.item",
|
5
7
|
file_hash: "553841ebc7de3a0fd0d6b62a204ea30c1e651aacfb2814c7a6584ac52f2c5701")
|
6
|
-
data_path = download_file("ml-100k/u.data", "
|
8
|
+
data_path = download_file("ml-100k/u.data", "https://files.grouplens.org/datasets/movielens/ml-100k/u.data",
|
7
9
|
file_hash: "06416e597f82b7342361e41163890c81036900f418ad91315590814211dca490")
|
8
10
|
|
9
11
|
# convert u.item to utf-8
|
@@ -29,6 +31,11 @@ module Disco
|
|
29
31
|
private
|
30
32
|
|
31
33
|
def download_file(fname, origin, file_hash:)
|
34
|
+
require "digest"
|
35
|
+
require "fileutils"
|
36
|
+
require "net/http"
|
37
|
+
require "tmpdir"
|
38
|
+
|
32
39
|
# TODO handle this better
|
33
40
|
raise "No HOME" unless ENV["HOME"]
|
34
41
|
dest = "#{ENV["HOME"]}/.disco/#{fname}"
|
data/lib/disco/model.rb
CHANGED
@@ -10,6 +10,7 @@ module Disco
|
|
10
10
|
|
11
11
|
has_many :"recommended_#{name}", -> { where("disco_recommendations.context = ?", name).order("disco_recommendations.score DESC") }, through: :recommendations, source: :item, source_type: class_name
|
12
12
|
|
13
|
+
# TODO use fetch for item_id and score in 0.3.0
|
13
14
|
define_method("update_recommended_#{name}") do |items|
|
14
15
|
now = Time.now
|
15
16
|
items = items.map { |item| {subject_type: model_name.name, subject_id: id, item_type: class_name, item_id: item[:item_id], context: name, score: item[:score], created_at: now, updated_at: now} }
|
data/lib/disco/recommender.rb
CHANGED
@@ -22,6 +22,10 @@ module Disco
|
|
22
22
|
# but may be confusing if they are all missing and later ones aren't
|
23
23
|
@implicit = !train_set.any? { |v| v[:rating] }
|
24
24
|
|
25
|
+
if @implicit && train_set.any? { |v| v[:value] }
|
26
|
+
warn "[disco] WARNING: Passing `:value` with implicit feedback has no effect on recommendations and can be removed. Earlier versions of the library incorrectly stated this was used."
|
27
|
+
end
|
28
|
+
|
25
29
|
# TODO improve performance
|
26
30
|
# (catch exception instead of checking ahead of time)
|
27
31
|
unless @implicit
|
@@ -34,7 +38,6 @@ module Disco
|
|
34
38
|
|
35
39
|
@rated = Hash.new { |hash, key| hash[key] = {} }
|
36
40
|
input = []
|
37
|
-
value_key = @implicit ? :value : :rating
|
38
41
|
train_set.each do |v|
|
39
42
|
# update maps and build matrix in single pass
|
40
43
|
u = (@user_map[v[:user_id]] ||= @user_map.size)
|
@@ -42,7 +45,7 @@ module Disco
|
|
42
45
|
@rated[u][i] = true
|
43
46
|
|
44
47
|
# explicit will always have a value due to check_ratings
|
45
|
-
input << [u, i,
|
48
|
+
input << [u, i, @implicit ? 1 : v[:rating]]
|
46
49
|
end
|
47
50
|
@rated.default = nil
|
48
51
|
|
@@ -61,7 +64,7 @@ module Disco
|
|
61
64
|
train_set.each do |v|
|
62
65
|
i = @item_map[v[:item_id]]
|
63
66
|
@item_count[i] += 1
|
64
|
-
@item_sum[i] += (v[
|
67
|
+
@item_sum[i] += (@implicit ? 1 : v[:rating])
|
65
68
|
end
|
66
69
|
end
|
67
70
|
|
@@ -76,7 +79,7 @@ module Disco
|
|
76
79
|
u ||= -1
|
77
80
|
i ||= -1
|
78
81
|
|
79
|
-
eval_set << [u, i,
|
82
|
+
eval_set << [u, i, @implicit ? 1 : v[:rating]]
|
80
83
|
end
|
81
84
|
end
|
82
85
|
|
@@ -138,8 +141,7 @@ module Disco
|
|
138
141
|
predictions, ids = @user_recs_index.search(@user_factors[u, true].expand_dims(0), count + rated.size).map { |v| v[0, true] }
|
139
142
|
else
|
140
143
|
predictions = @item_factors.inner(@user_factors[u, true])
|
141
|
-
|
142
|
-
indexes = predictions.sort_index.reverse
|
144
|
+
indexes = predictions.sort_index.reverse # reverse just creates view
|
143
145
|
indexes = indexes[0...[count + rated.size, indexes.size].min] if count
|
144
146
|
predictions = predictions[indexes]
|
145
147
|
ids = indexes
|
@@ -179,19 +181,38 @@ module Disco
|
|
179
181
|
raise "top_items not computed" unless @top_items
|
180
182
|
|
181
183
|
if @implicit
|
182
|
-
scores = @item_count
|
184
|
+
scores = Numo::UInt64.cast(@item_count)
|
183
185
|
else
|
184
186
|
require "wilson_score"
|
185
187
|
|
186
|
-
range =
|
187
|
-
|
188
|
+
range =
|
189
|
+
if @min_rating == @max_rating
|
190
|
+
# TODO remove temp fix
|
191
|
+
(@min_rating - 1)..@max_rating
|
192
|
+
else
|
193
|
+
@min_rating..@max_rating
|
194
|
+
end
|
195
|
+
scores = Numo::DFloat.cast(@item_sum.zip(@item_count).map { |s, c| WilsonScore.rating_lower_bound(s / c, c, range) })
|
196
|
+
|
197
|
+
# TODO uncomment in 0.3.0
|
198
|
+
# wilson score with continuity correction
|
199
|
+
# https://en.wikipedia.org/wiki/Binomial_proportion_confidence_interval#Wilson_score_interval_with_continuity_correction
|
200
|
+
# z = 1.96 # 95% confidence
|
201
|
+
# range = @max_rating - @min_rating
|
202
|
+
# n = Numo::DFloat.cast(@item_count)
|
203
|
+
# phat = (Numo::DFloat.cast(@item_sum) - (@min_rating * n)) / range / n
|
204
|
+
# phat = (phat - (1 / (2 * n))).clip(0, nil) # continuity correction
|
205
|
+
# scores = (phat + z**2 / (2 * n) - z * Numo::DFloat::Math.sqrt((phat * (1 - phat) + z**2 / (4 * n)) / n)) / (1 + z**2 / n)
|
206
|
+
# scores = scores * range + @min_rating
|
188
207
|
end
|
189
208
|
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
209
|
+
indexes = scores.sort_index.reverse
|
210
|
+
indexes = indexes[0...[count, indexes.size].min] if count
|
211
|
+
scores = scores[indexes]
|
212
|
+
|
213
|
+
keys = @item_map.keys
|
214
|
+
indexes.size.times.map do |i|
|
215
|
+
{item_id: keys[indexes[i]], score: scores[i]}
|
195
216
|
end
|
196
217
|
end
|
197
218
|
|
@@ -255,8 +276,9 @@ module Disco
|
|
255
276
|
# inner product is cosine similarity with normalized vectors
|
256
277
|
# https://github.com/facebookresearch/faiss/issues/95
|
257
278
|
#
|
258
|
-
# TODO use non-exact index
|
279
|
+
# TODO use non-exact index in 0.3.0
|
259
280
|
# https://github.com/facebookresearch/faiss/wiki/Faiss-indexes
|
281
|
+
# index = Faiss::IndexHNSWFlat.new(factors.shape[1], 32, :inner_product)
|
260
282
|
index = Faiss::IndexFlatIP.new(factors.shape[1])
|
261
283
|
|
262
284
|
# ids are from 0...total
|
data/lib/disco/version.rb
CHANGED
data/lib/disco.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: disco
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.
|
4
|
+
version: 0.2.9
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Kane
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2022-03-22 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: libmf
|
@@ -76,7 +76,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
76
76
|
- !ruby/object:Gem::Version
|
77
77
|
version: '0'
|
78
78
|
requirements: []
|
79
|
-
rubygems_version: 3.
|
79
|
+
rubygems_version: 3.3.7
|
80
80
|
signing_key:
|
81
81
|
specification_version: 4
|
82
82
|
summary: Recommendations for Ruby and Rails using collaborative filtering
|