disco 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/README.md +35 -6
- data/lib/disco/recommender.rb +52 -16
- data/lib/disco/version.rb +1 -1
- metadata +19 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz: '
|
3
|
+
metadata.gz: 04d278a7daf8187ac8a5eadaa279c98a0a51a8cf0ad596e793198dcc9141233a
|
4
|
+
data.tar.gz: '0916f7cfb91d5bf48ce1186502f15647c102eba54e07bdc33eb042b75e1fb0c6'
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a8e977bcf2988e8e4cb85b13959446d068e3a41feeca26f3789ff7aa0a454258340bc81fb3adb470e0143cc6027cd803ef034900cc29db4648b01f855f6cb011
|
7
|
+
data.tar.gz: defc71dd93461a114338f0737cfa3eccae47605e2922aaf12d960a0cb6309131dbba497f7c7d125e962edd055ff7df898cd406544971ed75906cb8c1db6004cf
|
data/CHANGELOG.md
CHANGED
data/README.md
CHANGED
@@ -4,7 +4,7 @@
|
|
4
4
|
|
5
5
|
- Supports user-based and item-based recommendations
|
6
6
|
- Works with explicit and implicit feedback
|
7
|
-
- Uses matrix factorization
|
7
|
+
- Uses high-performance matrix factorization
|
8
8
|
|
9
9
|
[Build Status](https://travis-ci.org/ankane/disco)
|
10
10
|
|
@@ -202,7 +202,7 @@ recommender = Marshal.load(bin)
|
|
202
202
|
|
203
203
|
## Algorithms
|
204
204
|
|
205
|
-
Disco uses matrix factorization.
|
205
|
+
Disco uses high-performance matrix factorization.
|
206
206
|
|
207
207
|
- For explicit feedback, it uses [stochastic gradient descent](https://www.csie.ntu.edu.tw/~cjlin/papers/libmf/libmf_journal.pdf)
|
208
208
|
- For implicit feedback, it uses [coordinate descent](https://www.csie.ntu.edu.tw/~cjlin/papers/one-class-mf/biased-mf-sdm-with-supp.pdf)
|
@@ -236,15 +236,44 @@ There are a number of ways to deal with this, but here are some common ones:
|
|
236
236
|
- For user-based recommendations, show new users the most popular items.
|
237
237
|
- For item-based recommendations, make content-based recommendations with a gem like [tf-idf-similarity](https://github.com/jpmckinney/tf-idf-similarity).
|
238
238
|
|
239
|
-
##
|
239
|
+
## Data
|
240
240
|
|
241
|
-
|
241
|
+
Data can be an array of hashes
|
242
242
|
|
243
243
|
```ruby
|
244
|
-
|
245
|
-
|
244
|
+
[{user_id: 1, item_id: 1, rating: 5}, {user_id: 2, item_id: 1, rating: 3}]
|
245
|
+
```
|
246
|
+
|
247
|
+
Or a Daru data frame
|
248
|
+
|
249
|
+
```ruby
|
250
|
+
Daru::DataFrame.from_csv("ratings.csv")
|
246
251
|
```
|
247
252
|
|
253
|
+
## Faster Similarity [experimental]
|
254
|
+
|
255
|
+
If you have a large number of users/items, you can use an approximate nearest neighbors library like [NGT](https://github.com/ankane/ngt) to speed up item-based recommendations and similar users.
|
256
|
+
|
257
|
+
Add this line to your application’s Gemfile:
|
258
|
+
|
259
|
+
```ruby
|
260
|
+
gem 'ngt', '>= 0.2.3'
|
261
|
+
```
|
262
|
+
|
263
|
+
Speed up item-based recommendations with:
|
264
|
+
|
265
|
+
```ruby
|
266
|
+
model.optimize_item_recs
|
267
|
+
```
|
268
|
+
|
269
|
+
Speed up similar users with:
|
270
|
+
|
271
|
+
```ruby
|
272
|
+
model.optimize_similar_users
|
273
|
+
```
|
274
|
+
|
275
|
+
This should be called after fitting or loading the model.
|
276
|
+
|
248
277
|
## Reference
|
249
278
|
|
250
279
|
Get the global mean
|
data/lib/disco/recommender.rb
CHANGED
@@ -70,9 +70,11 @@ module Disco
|
|
70
70
|
|
71
71
|
@global_mean = model.bias
|
72
72
|
|
73
|
-
|
74
|
-
@
|
75
|
-
|
73
|
+
@user_factors = model.p_factors(format: :numo)
|
74
|
+
@item_factors = model.q_factors(format: :numo)
|
75
|
+
|
76
|
+
@user_index = nil
|
77
|
+
@item_index = nil
|
76
78
|
end
|
77
79
|
|
78
80
|
def user_recs(user_id, count: 5, item_ids: nil)
|
@@ -106,17 +108,34 @@ module Disco
|
|
106
108
|
end
|
107
109
|
end
|
108
110
|
|
111
|
+
def optimize_similar_items
|
112
|
+
@item_index = create_index(@item_factors)
|
113
|
+
end
|
114
|
+
alias_method :optimize_item_recs, :optimize_similar_items
|
115
|
+
|
116
|
+
def optimize_similar_users
|
117
|
+
@user_index = create_index(@user_factors)
|
118
|
+
end
|
119
|
+
|
109
120
|
def similar_items(item_id, count: 5)
|
110
|
-
similar(item_id, @item_map, @item_factors, item_norms, count)
|
121
|
+
similar(item_id, @item_map, @item_factors, item_norms, count, @item_index)
|
111
122
|
end
|
112
123
|
alias_method :item_recs, :similar_items
|
113
124
|
|
114
125
|
def similar_users(user_id, count: 5)
|
115
|
-
similar(user_id, @user_map, @user_factors, user_norms, count)
|
126
|
+
similar(user_id, @user_map, @user_factors, user_norms, count, @user_index)
|
116
127
|
end
|
117
128
|
|
118
129
|
private
|
119
130
|
|
131
|
+
def create_index(factors)
|
132
|
+
require "ngt"
|
133
|
+
|
134
|
+
index = Ngt::Index.new(factors.shape[1], distance_type: "Cosine")
|
135
|
+
index.batch_insert(factors)
|
136
|
+
index
|
137
|
+
end
|
138
|
+
|
120
139
|
def user_norms
|
121
140
|
@user_norms ||= norms(@user_factors)
|
122
141
|
end
|
@@ -131,20 +150,37 @@ module Disco
|
|
131
150
|
norms
|
132
151
|
end
|
133
152
|
|
134
|
-
def similar(id, map, factors, norms, count)
|
153
|
+
def similar(id, map, factors, norms, count, index)
|
135
154
|
i = map[id]
|
136
155
|
if i
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
{
|
156
|
+
if index && count
|
157
|
+
keys = map.keys
|
158
|
+
result = index.search(factors[i, true], size: count + 1)[1..-1]
|
159
|
+
result.map do |v|
|
160
|
+
{
|
161
|
+
# ids from batch_insert start at 1 instead of 0
|
162
|
+
item_id: keys[v[:id] - 1],
|
163
|
+
# convert cosine distance to cosine similarity
|
164
|
+
score: 1 - v[:distance]
|
165
|
+
}
|
142
166
|
end
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
167
|
+
else
|
168
|
+
predictions = factors.dot(factors[i, true]) / norms
|
169
|
+
|
170
|
+
predictions =
|
171
|
+
map.keys.zip(predictions).map do |item_id, pred|
|
172
|
+
{item_id: item_id, score: pred}
|
173
|
+
end
|
174
|
+
|
175
|
+
max_score = predictions.delete_at(i)[:score]
|
176
|
+
predictions.sort_by! { |pred| -pred[:score] } # already sorted by id
|
177
|
+
predictions = predictions.first(count) if count
|
178
|
+
# divide by max score to get cosine similarity
|
179
|
+
# only need to do for returned records
|
180
|
+
# could alternatively do cosine distance = 1 - cosine similarity
|
181
|
+
# predictions.each { |pred| pred[:score] /= max_score }
|
182
|
+
predictions
|
183
|
+
end
|
148
184
|
else
|
149
185
|
[]
|
150
186
|
end
|
data/lib/disco/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: disco
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.1
|
4
|
+
version: 0.1.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Kane
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2020-03-26 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: libmf
|
@@ -16,14 +16,14 @@ dependencies:
|
|
16
16
|
requirements:
|
17
17
|
- - ">="
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version: 0.
|
19
|
+
version: 0.2.0
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
24
|
- - ">="
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version: 0.
|
26
|
+
version: 0.2.0
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: numo-narray
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
@@ -122,6 +122,20 @@ dependencies:
|
|
122
122
|
- - ">="
|
123
123
|
- !ruby/object:Gem::Version
|
124
124
|
version: '0'
|
125
|
+
- !ruby/object:Gem::Dependency
|
126
|
+
name: ngt
|
127
|
+
requirement: !ruby/object:Gem::Requirement
|
128
|
+
requirements:
|
129
|
+
- - ">="
|
130
|
+
- !ruby/object:Gem::Version
|
131
|
+
version: 0.2.3
|
132
|
+
type: :development
|
133
|
+
prerelease: false
|
134
|
+
version_requirements: !ruby/object:Gem::Requirement
|
135
|
+
requirements:
|
136
|
+
- - ">="
|
137
|
+
- !ruby/object:Gem::Version
|
138
|
+
version: 0.2.3
|
125
139
|
description:
|
126
140
|
email: andrew@chartkick.com
|
127
141
|
executables: []
|
@@ -159,7 +173,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
159
173
|
- !ruby/object:Gem::Version
|
160
174
|
version: '0'
|
161
175
|
requirements: []
|
162
|
-
rubygems_version: 3.
|
176
|
+
rubygems_version: 3.1.2
|
163
177
|
signing_key:
|
164
178
|
specification_version: 4
|
165
179
|
summary: Collaborative filtering for Ruby
|