disco 0.1.1 → 0.1.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: e16ea9f41bc910a0966c4f16f0e48df98f40abb70d2d3b3bd2e8ba2080e57599
4
- data.tar.gz: '05490bda394fa0edf02ab33cb16cc65093d5b66b6317671cbdbfc429c3bd196c'
3
+ metadata.gz: 04d278a7daf8187ac8a5eadaa279c98a0a51a8cf0ad596e793198dcc9141233a
4
+ data.tar.gz: '0916f7cfb91d5bf48ce1186502f15647c102eba54e07bdc33eb042b75e1fb0c6'
5
5
  SHA512:
6
- metadata.gz: b12681372323e4bc323915923f91b5883ea97161e2f2a2846548657b25932ddc7ce364e09a6b932506e15e3cb45ff03c8c2cc22022f3a2cfa62d64bbe77e988d
7
- data.tar.gz: ea2d4200bfb4d4aed3481ebda8de4133c7af5bb4c4bb7c65685a726bbf661b2285579a507ea464ed49b3609d5350b45404dd6cf99401664e013870a6dee54b7f
6
+ metadata.gz: a8e977bcf2988e8e4cb85b13959446d068e3a41feeca26f3789ff7aa0a454258340bc81fb3adb470e0143cc6027cd803ef034900cc29db4648b01f855f6cb011
7
+ data.tar.gz: defc71dd93461a114338f0737cfa3eccae47605e2922aaf12d960a0cb6309131dbba497f7c7d125e962edd055ff7df898cd406544971ed75906cb8c1db6004cf
data/CHANGELOG.md CHANGED
@@ -1,3 +1,7 @@
1
+ ## 0.1.2 (2020-03-26)
2
+
3
+ - Added experimental `optimize_item_recs` and `optimize_similar_users` methods
4
+
1
5
  ## 0.1.1 (2019-11-14)
2
6
 
3
7
  - Fixed Rails integration
data/README.md CHANGED
@@ -4,7 +4,7 @@
4
4
 
5
5
  - Supports user-based and item-based recommendations
6
6
  - Works with explicit and implicit feedback
7
- - Uses matrix factorization
7
+ - Uses high-performance matrix factorization
8
8
 
9
9
  [![Build Status](https://travis-ci.org/ankane/disco.svg?branch=master)](https://travis-ci.org/ankane/disco)
10
10
 
@@ -202,7 +202,7 @@ recommender = Marshal.load(bin)
202
202
 
203
203
  ## Algorithms
204
204
 
205
- Disco uses matrix factorization.
205
+ Disco uses high-performance matrix factorization.
206
206
 
207
207
  - For explicit feedback, it uses [stochastic gradient descent](https://www.csie.ntu.edu.tw/~cjlin/papers/libmf/libmf_journal.pdf)
208
208
  - For implicit feedback, it uses [coordinate descent](https://www.csie.ntu.edu.tw/~cjlin/papers/one-class-mf/biased-mf-sdm-with-supp.pdf)
@@ -236,15 +236,44 @@ There are a number of ways to deal with this, but here are some common ones:
236
236
  - For user-based recommendations, show new users the most popular items.
237
237
  - For item-based recommendations, make content-based recommendations with a gem like [tf-idf-similarity](https://github.com/jpmckinney/tf-idf-similarity).
238
238
 
239
- ## Daru
239
+ ## Data
240
240
 
241
- Disco works with Daru data frames
241
+ Data can be an array of hashes
242
242
 
243
243
  ```ruby
244
- data = Daru::DataFrame.from_csv("ratings.csv")
245
- recommender.fit(data)
244
+ [{user_id: 1, item_id: 1, rating: 5}, {user_id: 2, item_id: 1, rating: 3}]
245
+ ```
246
+
247
+ Or a Daru data frame
248
+
249
+ ```ruby
250
+ Daru::DataFrame.from_csv("ratings.csv")
246
251
  ```
247
252
 
253
+ ## Faster Similarity [experimental]
254
+
255
+ If you have a large number of users/items, you can use an approximate nearest neighbors library like [NGT](https://github.com/ankane/ngt) to speed up item-based recommendations and similar users.
256
+
257
+ Add this line to your application’s Gemfile:
258
+
259
+ ```ruby
260
+ gem 'ngt', '>= 0.2.3'
261
+ ```
262
+
263
+ Speed up item-based recommendations with:
264
+
265
+ ```ruby
266
+ model.optimize_item_recs
267
+ ```
268
+
269
+ Speed up similar users with:
270
+
271
+ ```ruby
272
+ model.optimize_similar_users
273
+ ```
274
+
275
+ This should be called after fitting or loading the model.
276
+
248
277
  ## Reference
249
278
 
250
279
  Get the global mean
data/lib/disco/recommender.rb CHANGED
@@ -70,9 +70,11 @@ module Disco
70
70
 
71
71
  @global_mean = model.bias
72
72
 
73
- # TODO read from LIBMF directly to Numo for performance
74
- @user_factors = Numo::DFloat.cast(model.p_factors)
75
- @item_factors = Numo::DFloat.cast(model.q_factors)
73
+ @user_factors = model.p_factors(format: :numo)
74
+ @item_factors = model.q_factors(format: :numo)
75
+
76
+ @user_index = nil
77
+ @item_index = nil
76
78
  end
77
79
 
78
80
  def user_recs(user_id, count: 5, item_ids: nil)
@@ -106,17 +108,34 @@ module Disco
106
108
  end
107
109
  end
108
110
 
111
+ def optimize_similar_items
112
+ @item_index = create_index(@item_factors)
113
+ end
114
+ alias_method :optimize_item_recs, :optimize_similar_items
115
+
116
+ def optimize_similar_users
117
+ @user_index = create_index(@user_factors)
118
+ end
119
+
109
120
  def similar_items(item_id, count: 5)
110
- similar(item_id, @item_map, @item_factors, item_norms, count)
121
+ similar(item_id, @item_map, @item_factors, item_norms, count, @item_index)
111
122
  end
112
123
  alias_method :item_recs, :similar_items
113
124
 
114
125
  def similar_users(user_id, count: 5)
115
- similar(user_id, @user_map, @user_factors, user_norms, count)
126
+ similar(user_id, @user_map, @user_factors, user_norms, count, @user_index)
116
127
  end
117
128
 
118
129
  private
119
130
 
131
+ def create_index(factors)
132
+ require "ngt"
133
+
134
+ index = Ngt::Index.new(factors.shape[1], distance_type: "Cosine")
135
+ index.batch_insert(factors)
136
+ index
137
+ end
138
+
120
139
  def user_norms
121
140
  @user_norms ||= norms(@user_factors)
122
141
  end
@@ -131,20 +150,37 @@ module Disco
131
150
  norms
132
151
  end
133
152
 
134
- def similar(id, map, factors, norms, count)
153
+ def similar(id, map, factors, norms, count, index)
135
154
  i = map[id]
136
155
  if i
137
- predictions = factors.dot(factors[i, true]) / norms
138
-
139
- predictions =
140
- map.keys.zip(predictions).map do |item_id, pred|
141
- {item_id: item_id, score: pred}
156
+ if index && count
157
+ keys = map.keys
158
+ result = index.search(factors[i, true], size: count + 1)[1..-1]
159
+ result.map do |v|
160
+ {
161
+ # ids from batch_insert start at 1 instead of 0
162
+ item_id: keys[v[:id] - 1],
163
+ # convert cosine distance to cosine similarity
164
+ score: 1 - v[:distance]
165
+ }
142
166
  end
143
-
144
- predictions.delete_at(i)
145
- predictions.sort_by! { |pred| -pred[:score] } # already sorted by id
146
- predictions = predictions.first(count) if count
147
- predictions
167
+ else
168
+ predictions = factors.dot(factors[i, true]) / norms
169
+
170
+ predictions =
171
+ map.keys.zip(predictions).map do |item_id, pred|
172
+ {item_id: item_id, score: pred}
173
+ end
174
+
175
+ max_score = predictions.delete_at(i)[:score]
176
+ predictions.sort_by! { |pred| -pred[:score] } # already sorted by id
177
+ predictions = predictions.first(count) if count
178
+ # divide by max score to get cosine similarity
179
+ # only need to do for returned records
180
+ # could alternatively do cosine distance = 1 - cosine similarity
181
+ # predictions.each { |pred| pred[:score] /= max_score }
182
+ predictions
183
+ end
148
184
  else
149
185
  []
150
186
  end
data/lib/disco/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Disco
2
- VERSION = "0.1.1"
2
+ VERSION = "0.1.2"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: disco
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.1.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2019-11-14 00:00:00.000000000 Z
11
+ date: 2020-03-26 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: libmf
@@ -16,14 +16,14 @@ dependencies:
16
16
  requirements:
17
17
  - - ">="
18
18
  - !ruby/object:Gem::Version
19
- version: 0.1.3
19
+ version: 0.2.0
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - ">="
25
25
  - !ruby/object:Gem::Version
26
- version: 0.1.3
26
+ version: 0.2.0
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: numo-narray
29
29
  requirement: !ruby/object:Gem::Requirement
@@ -122,6 +122,20 @@ dependencies:
122
122
  - - ">="
123
123
  - !ruby/object:Gem::Version
124
124
  version: '0'
125
+ - !ruby/object:Gem::Dependency
126
+ name: ngt
127
+ requirement: !ruby/object:Gem::Requirement
128
+ requirements:
129
+ - - ">="
130
+ - !ruby/object:Gem::Version
131
+ version: 0.2.3
132
+ type: :development
133
+ prerelease: false
134
+ version_requirements: !ruby/object:Gem::Requirement
135
+ requirements:
136
+ - - ">="
137
+ - !ruby/object:Gem::Version
138
+ version: 0.2.3
125
139
  description:
126
140
  email: andrew@chartkick.com
127
141
  executables: []
@@ -159,7 +173,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
159
173
  - !ruby/object:Gem::Version
160
174
  version: '0'
161
175
  requirements: []
162
- rubygems_version: 3.0.6
176
+ rubygems_version: 3.1.2
163
177
  signing_key:
164
178
  specification_version: 4
165
179
  summary: Collaborative filtering for Ruby