disco 0.1.1 → 0.1.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/README.md +35 -6
- data/lib/disco/recommender.rb +52 -16
- data/lib/disco/version.rb +1 -1
- metadata +19 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz: '
|
3
|
+
metadata.gz: 04d278a7daf8187ac8a5eadaa279c98a0a51a8cf0ad596e793198dcc9141233a
|
4
|
+
data.tar.gz: '0916f7cfb91d5bf48ce1186502f15647c102eba54e07bdc33eb042b75e1fb0c6'
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a8e977bcf2988e8e4cb85b13959446d068e3a41feeca26f3789ff7aa0a454258340bc81fb3adb470e0143cc6027cd803ef034900cc29db4648b01f855f6cb011
|
7
|
+
data.tar.gz: defc71dd93461a114338f0737cfa3eccae47605e2922aaf12d960a0cb6309131dbba497f7c7d125e962edd055ff7df898cd406544971ed75906cb8c1db6004cf
|
data/CHANGELOG.md
CHANGED
data/README.md
CHANGED
@@ -4,7 +4,7 @@
|
|
4
4
|
|
5
5
|
- Supports user-based and item-based recommendations
|
6
6
|
- Works with explicit and implicit feedback
|
7
|
-
- Uses matrix factorization
|
7
|
+
- Uses high-performance matrix factorization
|
8
8
|
|
9
9
|
[![Build Status](https://travis-ci.org/ankane/disco.svg?branch=master)](https://travis-ci.org/ankane/disco)
|
10
10
|
|
@@ -202,7 +202,7 @@ recommender = Marshal.load(bin)
|
|
202
202
|
|
203
203
|
## Algorithms
|
204
204
|
|
205
|
-
Disco uses matrix factorization.
|
205
|
+
Disco uses high-performance matrix factorization.
|
206
206
|
|
207
207
|
- For explicit feedback, it uses [stochastic gradient descent](https://www.csie.ntu.edu.tw/~cjlin/papers/libmf/libmf_journal.pdf)
|
208
208
|
- For implicit feedback, it uses [coordinate descent](https://www.csie.ntu.edu.tw/~cjlin/papers/one-class-mf/biased-mf-sdm-with-supp.pdf)
|
@@ -236,15 +236,44 @@ There are a number of ways to deal with this, but here are some common ones:
|
|
236
236
|
- For user-based recommendations, show new users the most popular items.
|
237
237
|
- For item-based recommendations, make content-based recommendations with a gem like [tf-idf-similarity](https://github.com/jpmckinney/tf-idf-similarity).
|
238
238
|
|
239
|
-
##
|
239
|
+
## Data
|
240
240
|
|
241
|
-
|
241
|
+
Data can be an array of hashes
|
242
242
|
|
243
243
|
```ruby
|
244
|
-
|
245
|
-
|
244
|
+
[{user_id: 1, item_id: 1, rating: 5}, {user_id: 2, item_id: 1, rating: 3}]
|
245
|
+
```
|
246
|
+
|
247
|
+
Or a Daru data frame
|
248
|
+
|
249
|
+
```ruby
|
250
|
+
Daru::DataFrame.from_csv("ratings.csv")
|
246
251
|
```
|
247
252
|
|
253
|
+
## Faster Similarity [experimental]
|
254
|
+
|
255
|
+
If you have a large number of users/items, you can use an approximate nearest neighbors library like [NGT](https://github.com/ankane/ngt) to speed up item-based recommendations and similar users.
|
256
|
+
|
257
|
+
Add this line to your application’s Gemfile:
|
258
|
+
|
259
|
+
```ruby
|
260
|
+
gem 'ngt', '>= 0.2.3'
|
261
|
+
```
|
262
|
+
|
263
|
+
Speed up item-based recommendations with:
|
264
|
+
|
265
|
+
```ruby
|
266
|
+
recommender.optimize_item_recs
|
267
|
+
```
|
268
|
+
|
269
|
+
Speed up similar users with:
|
270
|
+
|
271
|
+
```ruby
|
272
|
+
recommender.optimize_similar_users
|
273
|
+
```
|
274
|
+
|
275
|
+
These methods should be called after fitting or loading the model.
|
276
|
+
|
248
277
|
## Reference
|
249
278
|
|
250
279
|
Get the global mean
|
data/lib/disco/recommender.rb
CHANGED
@@ -70,9 +70,11 @@ module Disco
|
|
70
70
|
|
71
71
|
@global_mean = model.bias
|
72
72
|
|
73
|
-
|
74
|
-
@
|
75
|
-
|
73
|
+
@user_factors = model.p_factors(format: :numo)
|
74
|
+
@item_factors = model.q_factors(format: :numo)
|
75
|
+
|
76
|
+
@user_index = nil
|
77
|
+
@item_index = nil
|
76
78
|
end
|
77
79
|
|
78
80
|
def user_recs(user_id, count: 5, item_ids: nil)
|
@@ -106,17 +108,34 @@ module Disco
|
|
106
108
|
end
|
107
109
|
end
|
108
110
|
|
111
|
+
def optimize_similar_items
|
112
|
+
@item_index = create_index(@item_factors)
|
113
|
+
end
|
114
|
+
alias_method :optimize_item_recs, :optimize_similar_items
|
115
|
+
|
116
|
+
def optimize_similar_users
|
117
|
+
@user_index = create_index(@user_factors)
|
118
|
+
end
|
119
|
+
|
109
120
|
def similar_items(item_id, count: 5)
|
110
|
-
similar(item_id, @item_map, @item_factors, item_norms, count)
|
121
|
+
similar(item_id, @item_map, @item_factors, item_norms, count, @item_index)
|
111
122
|
end
|
112
123
|
alias_method :item_recs, :similar_items
|
113
124
|
|
114
125
|
def similar_users(user_id, count: 5)
|
115
|
-
similar(user_id, @user_map, @user_factors, user_norms, count)
|
126
|
+
similar(user_id, @user_map, @user_factors, user_norms, count, @user_index)
|
116
127
|
end
|
117
128
|
|
118
129
|
private
|
119
130
|
|
131
|
+
def create_index(factors)
|
132
|
+
require "ngt"
|
133
|
+
|
134
|
+
index = Ngt::Index.new(factors.shape[1], distance_type: "Cosine")
|
135
|
+
index.batch_insert(factors)
|
136
|
+
index
|
137
|
+
end
|
138
|
+
|
120
139
|
def user_norms
|
121
140
|
@user_norms ||= norms(@user_factors)
|
122
141
|
end
|
@@ -131,20 +150,37 @@ module Disco
|
|
131
150
|
norms
|
132
151
|
end
|
133
152
|
|
134
|
-
def similar(id, map, factors, norms, count)
|
153
|
+
def similar(id, map, factors, norms, count, index)
|
135
154
|
i = map[id]
|
136
155
|
if i
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
{
|
156
|
+
if index && count
|
157
|
+
keys = map.keys
|
158
|
+
result = index.search(factors[i, true], size: count + 1)[1..-1]
|
159
|
+
result.map do |v|
|
160
|
+
{
|
161
|
+
# ids from batch_insert start at 1 instead of 0
|
162
|
+
item_id: keys[v[:id] - 1],
|
163
|
+
# convert cosine distance to cosine similarity
|
164
|
+
score: 1 - v[:distance]
|
165
|
+
}
|
142
166
|
end
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
167
|
+
else
|
168
|
+
predictions = factors.dot(factors[i, true]) / norms
|
169
|
+
|
170
|
+
predictions =
|
171
|
+
map.keys.zip(predictions).map do |item_id, pred|
|
172
|
+
{item_id: item_id, score: pred}
|
173
|
+
end
|
174
|
+
|
175
|
+
max_score = predictions.delete_at(i)[:score]
|
176
|
+
predictions.sort_by! { |pred| -pred[:score] } # already sorted by id
|
177
|
+
predictions = predictions.first(count) if count
|
178
|
+
# divide by max score to get cosine similarity
|
179
|
+
# only needs to be done for the returned records
|
180
|
+
# could alternatively do cosine distance = 1 - cosine similarity
|
181
|
+
# predictions.each { |pred| pred[:score] /= max_score }
|
182
|
+
predictions
|
183
|
+
end
|
148
184
|
else
|
149
185
|
[]
|
150
186
|
end
|
data/lib/disco/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: disco
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Kane
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2020-03-26 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: libmf
|
@@ -16,14 +16,14 @@ dependencies:
|
|
16
16
|
requirements:
|
17
17
|
- - ">="
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version: 0.
|
19
|
+
version: 0.2.0
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
24
|
- - ">="
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version: 0.
|
26
|
+
version: 0.2.0
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: numo-narray
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
@@ -122,6 +122,20 @@ dependencies:
|
|
122
122
|
- - ">="
|
123
123
|
- !ruby/object:Gem::Version
|
124
124
|
version: '0'
|
125
|
+
- !ruby/object:Gem::Dependency
|
126
|
+
name: ngt
|
127
|
+
requirement: !ruby/object:Gem::Requirement
|
128
|
+
requirements:
|
129
|
+
- - ">="
|
130
|
+
- !ruby/object:Gem::Version
|
131
|
+
version: 0.2.3
|
132
|
+
type: :development
|
133
|
+
prerelease: false
|
134
|
+
version_requirements: !ruby/object:Gem::Requirement
|
135
|
+
requirements:
|
136
|
+
- - ">="
|
137
|
+
- !ruby/object:Gem::Version
|
138
|
+
version: 0.2.3
|
125
139
|
description:
|
126
140
|
email: andrew@chartkick.com
|
127
141
|
executables: []
|
@@ -159,7 +173,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
159
173
|
- !ruby/object:Gem::Version
|
160
174
|
version: '0'
|
161
175
|
requirements: []
|
162
|
-
rubygems_version: 3.
|
176
|
+
rubygems_version: 3.1.2
|
163
177
|
signing_key:
|
164
178
|
specification_version: 4
|
165
179
|
summary: Collaborative filtering for Ruby
|