disco 0.2.3 → 0.2.4

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: e9b8792d465e2bd894ce9aaa5dabf79dd89e93337d838917c709ac7747b85772
4
- data.tar.gz: 9d34a5124dc26f8a2ecb7e2ed3cbf524fe586c37c693d885e668974e24dfaf0a
3
+ metadata.gz: e4a978d2eec39ca280142c49fb4ef4be2e1ad4f35dfa4d977941f46d5d34b466
4
+ data.tar.gz: 8a29a54bba5ac8b715294e2fce4e34fa1b11442b1800c388807c60b9520ced23
5
5
  SHA512:
6
- metadata.gz: 658b48b75994a295382eb22908d4a5f1825b01bfc26f52428e993802c42f7ebb435a59e7f1262d17400d65eda886f1a4f38edff82cdeda96c2a0ce280602742f
7
- data.tar.gz: c9acce77cae8a575c5814456247600367d5fea4eb85a52309e26cac643d107bc03c42c99ce094c2f9ec46a329fd2dfa97d2f58f0020b337feb88b56346630942
6
+ metadata.gz: 99376dd48cce340a4fdcb0d76c93b03af494d88167e2caaca0d186fcf5d2303f2524884e0c712c2f8e3d7be79a92b029a8d5fa726bb94826315f283afea0f74b
7
+ data.tar.gz: eeb8c480098616f93d6c7e39a1bb57e2feefa6af3696c407791ff6f052450eb035f1d1659ded70d7b5fbbbe8cff9f7309118828a454b1d4f9d459321b90035cf
data/CHANGELOG.md CHANGED
@@ -1,3 +1,9 @@
1
+ ## 0.2.4 (2021-02-15)
2
+
3
+ - Added `user_ids` and `item_ids` methods
4
+ - Added `user_id` argument to `user_factors`
5
+ - Added `item_id` argument to `item_factors`
6
+
1
7
  ## 0.2.3 (2020-11-28)
2
8
 
3
9
  - Added `predict` method
data/LICENSE.txt CHANGED
@@ -1,4 +1,4 @@
1
- Copyright (c) 2019-2020 Andrew Kane
1
+ Copyright (c) 2019-2021 Andrew Kane
2
2
 
3
3
  MIT License
4
4
 
data/README.md CHANGED
@@ -46,13 +46,13 @@ recommender.fit([
46
46
 
47
47
  > Use `value` instead of rating for implicit feedback
48
48
 
49
- Get user-based (user-item) recommendations - “users like you also liked”
49
+ Get user-based recommendations - “users like you also liked”
50
50
 
51
51
  ```ruby
52
52
  recommender.user_recs(user_id)
53
53
  ```
54
54
 
55
- Get item-based (item-item) recommendations - “users who liked this item also liked”
55
+ Get item-based recommendations - “users who liked this item also liked”
56
56
 
57
57
  ```ruby
58
58
  recommender.item_recs(item_id)
@@ -283,19 +283,33 @@ This should be called after fitting or loading the model.
283
283
 
284
284
  ## Reference
285
285
 
286
+ Get ids
287
+
288
+ ```ruby
289
+ recommender.user_ids
290
+ recommender.item_ids
291
+ ```
292
+
286
293
  Get the global mean
287
294
 
288
295
  ```ruby
289
296
  recommender.global_mean
290
297
  ```
291
298
 
292
- Get the factors
299
+ Get factors
293
300
 
294
301
  ```ruby
295
302
  recommender.user_factors
296
303
  recommender.item_factors
297
304
  ```
298
305
 
306
+ Get factors for specific users and items
307
+
308
+ ```ruby
309
+ recommender.user_factors(user_id)
310
+ recommender.item_factors(item_id)
311
+ ```
312
+
299
313
  ## Credits
300
314
 
301
315
  Thanks to:
data/lib/disco/recommender.rb CHANGED
@@ -1,32 +1,32 @@
1
1
  module Disco
2
2
  class Recommender
3
- attr_reader :global_mean, :item_factors, :user_factors
3
+ attr_reader :global_mean
4
4
 
5
5
  def initialize(factors: 8, epochs: 20, verbose: nil)
6
6
  @factors = factors
7
7
  @epochs = epochs
8
8
  @verbose = verbose
9
+ @user_map = {}
10
+ @item_map = {}
9
11
  end
10
12
 
11
13
  def fit(train_set, validation_set: nil)
12
14
  train_set = to_dataset(train_set)
13
15
  validation_set = to_dataset(validation_set) if validation_set
14
16
 
15
- @implicit = !train_set.any? { |v| v[:rating] }
17
+ check_training_set(train_set)
16
18
 
19
+ @implicit = !train_set.any? { |v| v[:rating] }
17
20
  unless @implicit
18
- ratings = train_set.map { |o| o[:rating] }
19
- check_ratings(ratings)
20
- @min_rating = ratings.min
21
- @max_rating = ratings.max
21
+ check_ratings(train_set)
22
+ @min_rating, @max_rating = train_set.minmax_by { |o| o[:rating] }.map { |o| o[:rating] }
22
23
 
23
24
  if validation_set
24
- check_ratings(validation_set.map { |o| o[:rating] })
25
+ check_ratings(validation_set)
25
26
  end
26
27
  end
27
28
 
28
- check_training_set(train_set)
29
- create_maps(train_set)
29
+ update_maps(train_set)
30
30
 
31
31
  @rated = Hash.new { |hash, key| hash[key] = {} }
32
32
  input = []
@@ -143,13 +143,39 @@ module Disco
143
143
 
144
144
  def similar_items(item_id, count: 5)
145
145
  check_fit
146
- similar(item_id, @item_map, @item_factors, item_norms, count, @item_index)
146
+ similar(item_id, @item_map, @item_factors, @item_index ? nil : item_norms, count, @item_index)
147
147
  end
148
148
  alias_method :item_recs, :similar_items
149
149
 
150
150
  def similar_users(user_id, count: 5)
151
151
  check_fit
152
- similar(user_id, @user_map, @user_factors, user_norms, count, @user_index)
152
+ similar(user_id, @user_map, @user_factors, @user_index ? nil : user_norms, count, @user_index)
153
+ end
154
+
155
+ def user_ids
156
+ @user_map.keys
157
+ end
158
+
159
+ def item_ids
160
+ @item_map.keys
161
+ end
162
+
163
+ def user_factors(user_id = nil)
164
+ if user_id
165
+ u = @user_map[user_id]
166
+ @user_factors[u, true] if u
167
+ else
168
+ @user_factors
169
+ end
170
+ end
171
+
172
+ def item_factors(item_id = nil)
173
+ if item_id
174
+ i = @item_map[item_id]
175
+ @item_factors[i, true] if i
176
+ else
177
+ @item_factors
178
+ end
153
179
  end
154
180
 
155
181
  private
@@ -157,8 +183,11 @@ module Disco
157
183
  def create_index(factors)
158
184
  require "ngt"
159
185
 
186
+ # could speed up search with normalized cosine
187
+ # https://github.com/yahoojapan/NGT/issues/36
160
188
  index = Ngt::Index.new(factors.shape[1], distance_type: "Cosine")
161
- index.batch_insert(factors)
189
+ ids = index.batch_insert(factors)
190
+ raise "Unexpected ids. Please report a bug." if ids.first != 1 || ids.last != factors.shape[0]
162
191
  index
163
192
  end
164
193
 
@@ -191,19 +220,21 @@ module Disco
191
220
  }
192
221
  end
193
222
  else
194
- predictions = factors.dot(factors[i, true]) / norms
223
+ # cosine similarity without norms[i]
224
+ # otherwise, denominator would be (norms[i] * norms)
225
+ predictions = factors.inner(factors[i, true]) / norms
195
226
 
196
227
  predictions =
197
228
  map.keys.zip(predictions).map do |item_id, pred|
198
229
  {item_id: item_id, score: pred}
199
230
  end
200
231
 
201
- max_score = predictions.delete_at(i)[:score]
232
+ predictions.delete_at(i)
202
233
  predictions.sort_by! { |pred| -pred[:score] } # already sorted by id
203
234
  predictions = predictions.first(count) if count
204
- # divide by max score to get cosine similarity
235
+ # divide by norms[i] to get cosine similarity
205
236
  # only need to do for returned records
206
- predictions.each { |pred| pred[:score] /= max_score }
237
+ predictions.each { |pred| pred[:score] /= norms[i] }
207
238
  predictions
208
239
  end
209
240
  else
@@ -211,22 +242,21 @@ module Disco
211
242
  end
212
243
  end
213
244
 
214
- def create_maps(train_set)
215
- user_ids = train_set.map { |v| v[:user_id] }.uniq.sort
216
- item_ids = train_set.map { |v| v[:item_id] }.uniq.sort
217
-
218
- raise ArgumentError, "Missing user_id" if user_ids.any?(&:nil?)
219
- raise ArgumentError, "Missing item_id" if item_ids.any?(&:nil?)
245
+ def update_maps(train_set)
246
+ raise ArgumentError, "Missing user_id" if train_set.any? { |v| v[:user_id].nil? }
247
+ raise ArgumentError, "Missing item_id" if train_set.any? { |v| v[:item_id].nil? }
220
248
 
221
- @user_map = user_ids.zip(user_ids.size.times).to_h
222
- @item_map = item_ids.zip(item_ids.size.times).to_h
249
+ train_set.each do |v|
250
+ @user_map[v[:user_id]] ||= @user_map.size
251
+ @item_map[v[:item_id]] ||= @item_map.size
252
+ end
223
253
  end
224
254
 
225
255
  def check_ratings(ratings)
226
- unless ratings.all? { |r| !r.nil? }
256
+ unless ratings.all? { |r| !r[:rating].nil? }
227
257
  raise ArgumentError, "Missing ratings"
228
258
  end
229
- unless ratings.all? { |r| r.is_a?(Numeric) }
259
+ unless ratings.all? { |r| r[:rating].is_a?(Numeric) }
230
260
  raise ArgumentError, "Ratings must be numeric"
231
261
  end
232
262
  end
data/lib/disco/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Disco
2
- VERSION = "0.2.3"
2
+ VERSION = "0.2.4"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: disco
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.3
4
+ version: 0.2.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-11-28 00:00:00.000000000 Z
11
+ date: 2021-02-16 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: libmf
@@ -39,7 +39,7 @@ dependencies:
39
39
  - !ruby/object:Gem::Version
40
40
  version: '0'
41
41
  description:
42
- email: andrew@chartkick.com
42
+ email: andrew@ankane.org
43
43
  executables: []
44
44
  extensions: []
45
45
  extra_rdoc_files: []
@@ -75,7 +75,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
75
75
  - !ruby/object:Gem::Version
76
76
  version: '0'
77
77
  requirements: []
78
- rubygems_version: 3.1.4
78
+ rubygems_version: 3.2.3
79
79
  signing_key:
80
80
  specification_version: 4
81
81
  summary: Recommendations for Ruby and Rails using collaborative filtering