disco 0.2.4 → 0.2.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/README.md +23 -5
- data/lib/disco.rb +1 -0
- data/lib/disco/metrics.rb +10 -0
- data/lib/disco/recommender.rb +154 -74
- data/lib/disco/version.rb +1 -1
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 8fbecb858b316ed39a9cb726263e182561cba6df498e6253d88c79ebec5cab05
|
4
|
+
data.tar.gz: 42eb38a6e4e0b3fc5a9452deae5a48676ae9a53e78eeb6197718a0c94bd02b6b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d0250346d75fba75064a29578f6bfd39f09ecf712ba2e505b97a4952b5ff8b31af307eb1b912e9b25cc3dc28dee0d096bea44b47bb2ef268859bb4171f0ef8b2
|
7
|
+
data.tar.gz: 7b341328c12885efd0ffece4201036bb9457caee80a48a99ba110af9a81bcf832bbc1e8f8f5f14e7fddffef2dd3f4643837e0d569c997ab0c2d9ae85e12422f7
|
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,11 @@
|
|
1
|
+
## 0.2.5 (2021-02-20)
|
2
|
+
|
3
|
+
- Added `top_items` method
|
4
|
+
- Added `optimize_similar_users` method
|
5
|
+
- Added support for Faiss for `optimize_item_recs` and `optimize_similar_users` methods
|
6
|
+
- Added `rmse` method
|
7
|
+
- Improved performance
|
8
|
+
|
1
9
|
## 0.2.4 (2021-02-15)
|
2
10
|
|
3
11
|
- Added `user_ids` and `item_ids` methods
|
data/README.md
CHANGED
@@ -201,6 +201,8 @@ bin = File.binread("recommender.bin")
|
|
201
201
|
recommender = Marshal.load(bin)
|
202
202
|
```
|
203
203
|
|
204
|
+
Alternatively, you can store only the factors and use a library like [Neighbor](https://github.com/ankane/neighbor).
|
205
|
+
|
204
206
|
## Algorithms
|
205
207
|
|
206
208
|
Disco uses high-performance matrix factorization.
|
@@ -237,6 +239,16 @@ There are a number of ways to deal with this, but here are some common ones:
|
|
237
239
|
- For user-based recommendations, show new users the most popular items.
|
238
240
|
- For item-based recommendations, make content-based recommendations with a gem like [tf-idf-similarity](https://github.com/jpmckinney/tf-idf-similarity).
|
239
241
|
|
242
|
+
Get top items with:
|
243
|
+
|
244
|
+
```ruby
|
245
|
+
recommender = Disco::Recommender.new(top_items: true)
|
246
|
+
recommender.fit(data)
|
247
|
+
recommender.top_items
|
248
|
+
```
|
249
|
+
|
250
|
+
This uses [Wilson score](https://www.evanmiller.org/how-not-to-sort-by-average-rating.html) for explicit feedback (add [wilson_score](https://github.com/instacart/wilson_score) to your application’s Gemfile) and item frequency for implicit feedback.
|
251
|
+
|
240
252
|
## Data
|
241
253
|
|
242
254
|
Data can be an array of hashes
|
@@ -257,23 +269,29 @@ Or a Daru data frame
|
|
257
269
|
Daru::DataFrame.from_csv("ratings.csv")
|
258
270
|
```
|
259
271
|
|
260
|
-
##
|
272
|
+
## Performance [master]
|
261
273
|
|
262
|
-
If you have a large number of users
|
274
|
+
If you have a large number of users or items, you can use an approximate nearest neighbors library like [Faiss](https://github.com/ankane/faiss) to improve the performance of certain methods.
|
263
275
|
|
264
276
|
Add this line to your application’s Gemfile:
|
265
277
|
|
266
278
|
```ruby
|
267
|
-
gem '
|
279
|
+
gem 'faiss'
|
280
|
+
```
|
281
|
+
|
282
|
+
Speed up the `user_recs` method with:
|
283
|
+
|
284
|
+
```ruby
|
285
|
+
model.optimize_user_recs
|
268
286
|
```
|
269
287
|
|
270
|
-
Speed up
|
288
|
+
Speed up the `item_recs` method with:
|
271
289
|
|
272
290
|
```ruby
|
273
291
|
model.optimize_item_recs
|
274
292
|
```
|
275
293
|
|
276
|
-
Speed up
|
294
|
+
Speed up the `similar_users` method with:
|
277
295
|
|
278
296
|
```ruby
|
279
297
|
model.optimize_similar_users
|
data/lib/disco.rb
CHANGED
data/lib/disco/recommender.rb
CHANGED
@@ -2,12 +2,13 @@ module Disco
|
|
2
2
|
class Recommender
|
3
3
|
attr_reader :global_mean
|
4
4
|
|
5
|
-
def initialize(factors: 8, epochs: 20, verbose: nil)
|
5
|
+
def initialize(factors: 8, epochs: 20, verbose: nil, top_items: false)
|
6
6
|
@factors = factors
|
7
7
|
@epochs = epochs
|
8
8
|
@verbose = verbose
|
9
9
|
@user_map = {}
|
10
10
|
@item_map = {}
|
11
|
+
@top_items = top_items
|
11
12
|
end
|
12
13
|
|
13
14
|
def fit(train_set, validation_set: nil)
|
@@ -41,6 +42,16 @@ module Disco
|
|
41
42
|
end
|
42
43
|
@rated.default = nil
|
43
44
|
|
45
|
+
if @top_items
|
46
|
+
@item_count = [0] * @item_map.size
|
47
|
+
@item_sum = [0.0] * @item_map.size
|
48
|
+
train_set.each do |v|
|
49
|
+
i = @item_map[v[:item_id]]
|
50
|
+
@item_count[i] += 1
|
51
|
+
@item_sum[i] += (v[value_key] || 1)
|
52
|
+
end
|
53
|
+
end
|
54
|
+
|
44
55
|
eval_set = nil
|
45
56
|
if validation_set
|
46
57
|
eval_set = []
|
@@ -67,8 +78,9 @@ module Disco
|
|
67
78
|
@user_factors = model.p_factors(format: :numo)
|
68
79
|
@item_factors = model.q_factors(format: :numo)
|
69
80
|
|
70
|
-
@
|
71
|
-
@
|
81
|
+
@user_recs_index = nil
|
82
|
+
@similar_users_index = nil
|
83
|
+
@similar_items_index = nil
|
72
84
|
end
|
73
85
|
|
74
86
|
# generates a prediction even if a user has already rated the item
|
@@ -95,61 +107,76 @@ module Disco
|
|
95
107
|
u = @user_map[user_id]
|
96
108
|
|
97
109
|
if u
|
98
|
-
|
99
|
-
|
100
|
-
predictions =
|
101
|
-
@item_map.keys.zip(predictions).map do |item_id, pred|
|
102
|
-
{item_id: item_id, score: pred}
|
103
|
-
end
|
110
|
+
rated = item_ids ? {} : @rated[u]
|
104
111
|
|
105
112
|
if item_ids
|
106
|
-
|
107
|
-
|
113
|
+
ids = Numo::NArray.cast(item_ids.map { |i| @item_map[i] }.compact)
|
114
|
+
return [] if ids.size == 0
|
115
|
+
|
116
|
+
predictions = @item_factors[ids, true].inner(@user_factors[u, true])
|
117
|
+
indexes = predictions.sort_index.reverse
|
118
|
+
indexes = indexes[0...[count + rated.size, indexes.size].min] if count
|
119
|
+
predictions = predictions[indexes]
|
120
|
+
ids = ids[indexes]
|
121
|
+
elsif @user_recs_index && count
|
122
|
+
predictions, ids = @user_recs_index.search(@user_factors[u, true].expand_dims(0), count + rated.size).map { |v| v[0, true] }
|
108
123
|
else
|
109
|
-
@
|
110
|
-
|
111
|
-
|
124
|
+
predictions = @item_factors.inner(@user_factors[u, true])
|
125
|
+
# TODO make sure reverse isn't hurting performance
|
126
|
+
indexes = predictions.sort_index.reverse
|
127
|
+
indexes = indexes[0...[count + rated.size, indexes.size].min] if count
|
128
|
+
predictions = predictions[indexes]
|
129
|
+
ids = indexes
|
112
130
|
end
|
113
131
|
|
114
|
-
predictions.
|
115
|
-
predictions = predictions.first(count) if count && !item_ids
|
132
|
+
predictions.inplace.clip(@min_rating, @max_rating) if @min_rating
|
116
133
|
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
pred[:score] = pred[:score].clamp(@min_rating, @max_rating)
|
122
|
-
end
|
123
|
-
end
|
134
|
+
keys = @item_map.keys
|
135
|
+
result = []
|
136
|
+
ids.each_with_index do |item_id, i|
|
137
|
+
next if rated[item_id]
|
124
138
|
|
125
|
-
|
139
|
+
result << {item_id: keys[item_id], score: predictions[i]}
|
140
|
+
break if result.size == count
|
141
|
+
end
|
142
|
+
result
|
143
|
+
elsif @top_items
|
144
|
+
top_items(count: count)
|
126
145
|
else
|
127
|
-
# no items if user is unknown
|
128
|
-
# TODO maybe most popular items
|
129
146
|
[]
|
130
147
|
end
|
131
148
|
end
|
132
149
|
|
133
|
-
def
|
150
|
+
def similar_items(item_id, count: 5)
|
134
151
|
check_fit
|
135
|
-
@
|
152
|
+
similar(item_id, @item_map, item_norms, count, @similar_items_index)
|
136
153
|
end
|
137
|
-
alias_method :
|
154
|
+
alias_method :item_recs, :similar_items
|
138
155
|
|
139
|
-
def
|
156
|
+
def similar_users(user_id, count: 5)
|
140
157
|
check_fit
|
141
|
-
@
|
158
|
+
similar(user_id, @user_map, user_norms, count, @similar_users_index)
|
142
159
|
end
|
143
160
|
|
144
|
-
def
|
161
|
+
def top_items(count: 5)
|
145
162
|
check_fit
|
146
|
-
|
147
|
-
end
|
148
|
-
alias_method :item_recs, :similar_items
|
163
|
+
raise "top_items not computed" unless @top_items
|
149
164
|
|
150
|
-
|
151
|
-
|
152
|
-
|
165
|
+
if @implicit
|
166
|
+
scores = @item_count
|
167
|
+
else
|
168
|
+
require "wilson_score"
|
169
|
+
|
170
|
+
range = @min_rating..@max_rating
|
171
|
+
scores = @item_sum.zip(@item_count).map { |s, c| WilsonScore.rating_lower_bound(s / c, c, range) }
|
172
|
+
end
|
173
|
+
|
174
|
+
scores = scores.map.with_index.sort_by { |s, _| -s }
|
175
|
+
scores = scores.first(count) if count
|
176
|
+
item_ids = item_ids()
|
177
|
+
scores.map do |s, i|
|
178
|
+
{item_id: item_ids[i], score: s}
|
179
|
+
end
|
153
180
|
end
|
154
181
|
|
155
182
|
def user_ids
|
@@ -178,17 +205,61 @@ module Disco
|
|
178
205
|
end
|
179
206
|
end
|
180
207
|
|
208
|
+
def optimize_user_recs
|
209
|
+
check_fit
|
210
|
+
@user_recs_index = create_index(item_factors, library: "faiss")
|
211
|
+
end
|
212
|
+
|
213
|
+
def optimize_similar_items(library: nil)
|
214
|
+
check_fit
|
215
|
+
@similar_items_index = create_index(item_norms, library: library)
|
216
|
+
end
|
217
|
+
alias_method :optimize_item_recs, :optimize_similar_items
|
218
|
+
|
219
|
+
def optimize_similar_users(library: nil)
|
220
|
+
check_fit
|
221
|
+
@similar_users_index = create_index(user_norms, library: library)
|
222
|
+
end
|
223
|
+
|
181
224
|
private
|
182
225
|
|
183
|
-
|
184
|
-
|
226
|
+
# factors should already be normalized for similar users/items
|
227
|
+
def create_index(factors, library:)
|
228
|
+
# TODO make Faiss the default in 0.3.0
|
229
|
+
library ||= defined?(Faiss) && !defined?(Ngt) ? "faiss" : "ngt"
|
230
|
+
|
231
|
+
case library
|
232
|
+
when "faiss"
|
233
|
+
require "faiss"
|
234
|
+
|
235
|
+
# inner product is cosine similarity with normalized vectors
|
236
|
+
# https://github.com/facebookresearch/faiss/issues/95
|
237
|
+
#
|
238
|
+
# TODO use non-exact index
|
239
|
+
# https://github.com/facebookresearch/faiss/wiki/Faiss-indexes
|
240
|
+
index = Faiss::IndexFlatIP.new(factors.shape[1])
|
241
|
+
|
242
|
+
# ids are from 0...total
|
243
|
+
# https://github.com/facebookresearch/faiss/blob/96b740abedffc8f67389f29c2a180913941534c6/faiss/Index.h#L89
|
244
|
+
index.add(factors)
|
245
|
+
|
246
|
+
index
|
247
|
+
when "ngt"
|
248
|
+
require "ngt"
|
249
|
+
|
250
|
+
# could speed up search with normalized cosine
|
251
|
+
# https://github.com/yahoojapan/NGT/issues/36
|
252
|
+
index = Ngt::Index.new(factors.shape[1], distance_type: "Cosine")
|
185
253
|
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
254
|
+
# NGT normalizes so could call create_index with factors instead of norms
|
255
|
+
# but keep code simple for now
|
256
|
+
ids = index.batch_insert(factors)
|
257
|
+
raise "Unexpected ids. Please report a bug." if ids.first != 1 || ids.last != factors.shape[0]
|
258
|
+
|
259
|
+
index
|
260
|
+
else
|
261
|
+
raise ArgumentError, "Invalid library: #{library}"
|
262
|
+
end
|
192
263
|
end
|
193
264
|
|
194
265
|
def user_norms
|
@@ -202,40 +273,38 @@ module Disco
|
|
202
273
|
def norms(factors)
|
203
274
|
norms = Numo::SFloat::Math.sqrt((factors * factors).sum(axis: 1))
|
204
275
|
norms[norms.eq(0)] = 1e-10 # no zeros
|
205
|
-
norms
|
276
|
+
factors / norms.expand_dims(1)
|
206
277
|
end
|
207
278
|
|
208
|
-
def similar(id, map,
|
279
|
+
def similar(id, map, norm_factors, count, index)
|
209
280
|
i = map[id]
|
210
|
-
|
281
|
+
|
282
|
+
if i && norm_factors.shape[0] > 1
|
211
283
|
if index && count
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
}
|
284
|
+
if defined?(Faiss) && index.is_a?(Faiss::Index)
|
285
|
+
predictions, ids = index.search(norm_factors[i, true].expand_dims(0), count + 1).map { |v| v.to_a[0] }
|
286
|
+
else
|
287
|
+
result = index.search(norm_factors[i, true], size: count + 1)
|
288
|
+
# ids from batch_insert start at 1 instead of 0
|
289
|
+
ids = result.map { |v| v[:id] - 1 }
|
290
|
+
# convert cosine distance to cosine similarity
|
291
|
+
predictions = result.map { |v| 1 - v[:distance] }
|
221
292
|
end
|
222
293
|
else
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
predictions.each { |pred| pred[:score] /= norms[i] }
|
238
|
-
predictions
|
294
|
+
predictions = norm_factors.inner(norm_factors[i, true])
|
295
|
+
indexes = predictions.sort_index.reverse
|
296
|
+
indexes = indexes[0...[count + 1, indexes.size].min] if count
|
297
|
+
predictions = predictions[indexes]
|
298
|
+
ids = indexes
|
299
|
+
end
|
300
|
+
|
301
|
+
keys = map.keys
|
302
|
+
|
303
|
+
# TODO use user_id for similar_users in 0.3.0
|
304
|
+
key = :item_id
|
305
|
+
|
306
|
+
(1...ids.size).map do |i|
|
307
|
+
{key => keys[ids[i]], score: predictions[i]}
|
239
308
|
end
|
240
309
|
else
|
241
310
|
[]
|
@@ -304,6 +373,11 @@ module Disco
|
|
304
373
|
obj[:max_rating] = @max_rating
|
305
374
|
end
|
306
375
|
|
376
|
+
if @top_items
|
377
|
+
obj[:item_count] = @item_count
|
378
|
+
obj[:item_sum] = @item_sum
|
379
|
+
end
|
380
|
+
|
307
381
|
obj
|
308
382
|
end
|
309
383
|
|
@@ -320,6 +394,12 @@ module Disco
|
|
320
394
|
@min_rating = obj[:min_rating]
|
321
395
|
@max_rating = obj[:max_rating]
|
322
396
|
end
|
397
|
+
|
398
|
+
@top_items = obj.key?(:item_count)
|
399
|
+
if @top_items
|
400
|
+
@item_count = obj[:item_count]
|
401
|
+
@item_sum = obj[:item_sum]
|
402
|
+
end
|
323
403
|
end
|
324
404
|
end
|
325
405
|
end
|
data/lib/disco/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: disco
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.4
|
4
|
+
version: 0.2.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Kane
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2021-02-15 00:00:00.000000000 Z
|
11
|
+
date: 2021-02-20 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: libmf
|
@@ -51,6 +51,7 @@ files:
|
|
51
51
|
- lib/disco.rb
|
52
52
|
- lib/disco/data.rb
|
53
53
|
- lib/disco/engine.rb
|
54
|
+
- lib/disco/metrics.rb
|
54
55
|
- lib/disco/model.rb
|
55
56
|
- lib/disco/recommender.rb
|
56
57
|
- lib/disco/version.rb
|