disco 0.2.3 → 0.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: e9b8792d465e2bd894ce9aaa5dabf79dd89e93337d838917c709ac7747b85772
4
- data.tar.gz: 9d34a5124dc26f8a2ecb7e2ed3cbf524fe586c37c693d885e668974e24dfaf0a
3
+ metadata.gz: e4a978d2eec39ca280142c49fb4ef4be2e1ad4f35dfa4d977941f46d5d34b466
4
+ data.tar.gz: 8a29a54bba5ac8b715294e2fce4e34fa1b11442b1800c388807c60b9520ced23
5
5
  SHA512:
6
- metadata.gz: 658b48b75994a295382eb22908d4a5f1825b01bfc26f52428e993802c42f7ebb435a59e7f1262d17400d65eda886f1a4f38edff82cdeda96c2a0ce280602742f
7
- data.tar.gz: c9acce77cae8a575c5814456247600367d5fea4eb85a52309e26cac643d107bc03c42c99ce094c2f9ec46a329fd2dfa97d2f58f0020b337feb88b56346630942
6
+ metadata.gz: 99376dd48cce340a4fdcb0d76c93b03af494d88167e2caaca0d186fcf5d2303f2524884e0c712c2f8e3d7be79a92b029a8d5fa726bb94826315f283afea0f74b
7
+ data.tar.gz: eeb8c480098616f93d6c7e39a1bb57e2feefa6af3696c407791ff6f052450eb035f1d1659ded70d7b5fbbbe8cff9f7309118828a454b1d4f9d459321b90035cf
data/CHANGELOG.md CHANGED
@@ -1,3 +1,9 @@
1
+ ## 0.2.4 (2021-02-15)
2
+
3
+ - Added `user_ids` and `item_ids` methods
4
+ - Added `user_id` argument to `user_factors`
5
+ - Added `item_id` argument to `item_factors`
6
+
1
7
  ## 0.2.3 (2020-11-28)
2
8
 
3
9
  - Added `predict` method
data/LICENSE.txt CHANGED
@@ -1,4 +1,4 @@
1
- Copyright (c) 2019-2020 Andrew Kane
1
+ Copyright (c) 2019-2021 Andrew Kane
2
2
 
3
3
  MIT License
4
4
 
data/README.md CHANGED
@@ -46,13 +46,13 @@ recommender.fit([
46
46
 
47
47
  > Use `value` instead of rating for implicit feedback
48
48
 
49
- Get user-based (user-item) recommendations - “users like you also liked”
49
+ Get user-based recommendations - “users like you also liked”
50
50
 
51
51
  ```ruby
52
52
  recommender.user_recs(user_id)
53
53
  ```
54
54
 
55
- Get item-based (item-item) recommendations - “users who liked this item also liked”
55
+ Get item-based recommendations - “users who liked this item also liked”
56
56
 
57
57
  ```ruby
58
58
  recommender.item_recs(item_id)
@@ -283,19 +283,33 @@ This should be called after fitting or loading the model.
283
283
 
284
284
  ## Reference
285
285
 
286
+ Get ids
287
+
288
+ ```ruby
289
+ recommender.user_ids
290
+ recommender.item_ids
291
+ ```
292
+
286
293
  Get the global mean
287
294
 
288
295
  ```ruby
289
296
  recommender.global_mean
290
297
  ```
291
298
 
292
- Get the factors
299
+ Get factors
293
300
 
294
301
  ```ruby
295
302
  recommender.user_factors
296
303
  recommender.item_factors
297
304
  ```
298
305
 
306
+ Get factors for specific users and items
307
+
308
+ ```ruby
309
+ recommender.user_factors(user_id)
310
+ recommender.item_factors(item_id)
311
+ ```
312
+
299
313
  ## Credits
300
314
 
301
315
  Thanks to:
@@ -1,32 +1,32 @@
1
1
  module Disco
2
2
  class Recommender
3
- attr_reader :global_mean, :item_factors, :user_factors
3
+ attr_reader :global_mean
4
4
 
5
5
  def initialize(factors: 8, epochs: 20, verbose: nil)
6
6
  @factors = factors
7
7
  @epochs = epochs
8
8
  @verbose = verbose
9
+ @user_map = {}
10
+ @item_map = {}
9
11
  end
10
12
 
11
13
  def fit(train_set, validation_set: nil)
12
14
  train_set = to_dataset(train_set)
13
15
  validation_set = to_dataset(validation_set) if validation_set
14
16
 
15
- @implicit = !train_set.any? { |v| v[:rating] }
17
+ check_training_set(train_set)
16
18
 
19
+ @implicit = !train_set.any? { |v| v[:rating] }
17
20
  unless @implicit
18
- ratings = train_set.map { |o| o[:rating] }
19
- check_ratings(ratings)
20
- @min_rating = ratings.min
21
- @max_rating = ratings.max
21
+ check_ratings(train_set)
22
+ @min_rating, @max_rating = train_set.minmax_by { |o| o[:rating] }.map { |o| o[:rating] }
22
23
 
23
24
  if validation_set
24
- check_ratings(validation_set.map { |o| o[:rating] })
25
+ check_ratings(validation_set)
25
26
  end
26
27
  end
27
28
 
28
- check_training_set(train_set)
29
- create_maps(train_set)
29
+ update_maps(train_set)
30
30
 
31
31
  @rated = Hash.new { |hash, key| hash[key] = {} }
32
32
  input = []
@@ -143,13 +143,39 @@ module Disco
143
143
 
144
144
  def similar_items(item_id, count: 5)
145
145
  check_fit
146
- similar(item_id, @item_map, @item_factors, item_norms, count, @item_index)
146
+ similar(item_id, @item_map, @item_factors, @item_index ? nil : item_norms, count, @item_index)
147
147
  end
148
148
  alias_method :item_recs, :similar_items
149
149
 
150
150
  def similar_users(user_id, count: 5)
151
151
  check_fit
152
- similar(user_id, @user_map, @user_factors, user_norms, count, @user_index)
152
+ similar(user_id, @user_map, @user_factors, @user_index ? nil : user_norms, count, @user_index)
153
+ end
154
+
155
+ def user_ids
156
+ @user_map.keys
157
+ end
158
+
159
+ def item_ids
160
+ @item_map.keys
161
+ end
162
+
163
+ def user_factors(user_id = nil)
164
+ if user_id
165
+ u = @user_map[user_id]
166
+ @user_factors[u, true] if u
167
+ else
168
+ @user_factors
169
+ end
170
+ end
171
+
172
+ def item_factors(item_id = nil)
173
+ if item_id
174
+ i = @item_map[item_id]
175
+ @item_factors[i, true] if i
176
+ else
177
+ @item_factors
178
+ end
153
179
  end
154
180
 
155
181
  private
@@ -157,8 +183,11 @@ module Disco
157
183
  def create_index(factors)
158
184
  require "ngt"
159
185
 
186
+ # could speed up search with normalized cosine
187
+ # https://github.com/yahoojapan/NGT/issues/36
160
188
  index = Ngt::Index.new(factors.shape[1], distance_type: "Cosine")
161
- index.batch_insert(factors)
189
+ ids = index.batch_insert(factors)
190
+ raise "Unexpected ids. Please report a bug." if ids.first != 1 || ids.last != factors.shape[0]
162
191
  index
163
192
  end
164
193
 
@@ -191,19 +220,21 @@ module Disco
191
220
  }
192
221
  end
193
222
  else
194
- predictions = factors.dot(factors[i, true]) / norms
223
+ # cosine similarity without norms[i]
224
+ # otherwise, denominator would be (norms[i] * norms)
225
+ predictions = factors.inner(factors[i, true]) / norms
195
226
 
196
227
  predictions =
197
228
  map.keys.zip(predictions).map do |item_id, pred|
198
229
  {item_id: item_id, score: pred}
199
230
  end
200
231
 
201
- max_score = predictions.delete_at(i)[:score]
232
+ predictions.delete_at(i)
202
233
  predictions.sort_by! { |pred| -pred[:score] } # already sorted by id
203
234
  predictions = predictions.first(count) if count
204
- # divide by max score to get cosine similarity
235
+ # divide by norms[i] to get cosine similarity
205
236
  # only need to do for returned records
206
- predictions.each { |pred| pred[:score] /= max_score }
237
+ predictions.each { |pred| pred[:score] /= norms[i] }
207
238
  predictions
208
239
  end
209
240
  else
@@ -211,22 +242,21 @@ module Disco
211
242
  end
212
243
  end
213
244
 
214
- def create_maps(train_set)
215
- user_ids = train_set.map { |v| v[:user_id] }.uniq.sort
216
- item_ids = train_set.map { |v| v[:item_id] }.uniq.sort
217
-
218
- raise ArgumentError, "Missing user_id" if user_ids.any?(&:nil?)
219
- raise ArgumentError, "Missing item_id" if item_ids.any?(&:nil?)
245
+ def update_maps(train_set)
246
+ raise ArgumentError, "Missing user_id" if train_set.any? { |v| v[:user_id].nil? }
247
+ raise ArgumentError, "Missing item_id" if train_set.any? { |v| v[:item_id].nil? }
220
248
 
221
- @user_map = user_ids.zip(user_ids.size.times).to_h
222
- @item_map = item_ids.zip(item_ids.size.times).to_h
249
+ train_set.each do |v|
250
+ @user_map[v[:user_id]] ||= @user_map.size
251
+ @item_map[v[:item_id]] ||= @item_map.size
252
+ end
223
253
  end
224
254
 
225
255
  def check_ratings(ratings)
226
- unless ratings.all? { |r| !r.nil? }
256
+ unless ratings.all? { |r| !r[:rating].nil? }
227
257
  raise ArgumentError, "Missing ratings"
228
258
  end
229
- unless ratings.all? { |r| r.is_a?(Numeric) }
259
+ unless ratings.all? { |r| r[:rating].is_a?(Numeric) }
230
260
  raise ArgumentError, "Ratings must be numeric"
231
261
  end
232
262
  end
data/lib/disco/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Disco
2
- VERSION = "0.2.3"
2
+ VERSION = "0.2.4"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: disco
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.3
4
+ version: 0.2.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-11-28 00:00:00.000000000 Z
11
+ date: 2021-02-16 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: libmf
@@ -39,7 +39,7 @@ dependencies:
39
39
  - !ruby/object:Gem::Version
40
40
  version: '0'
41
41
  description:
42
- email: andrew@chartkick.com
42
+ email: andrew@ankane.org
43
43
  executables: []
44
44
  extensions: []
45
45
  extra_rdoc_files: []
@@ -75,7 +75,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
75
75
  - !ruby/object:Gem::Version
76
76
  version: '0'
77
77
  requirements: []
78
- rubygems_version: 3.1.4
78
+ rubygems_version: 3.2.3
79
79
  signing_key:
80
80
  specification_version: 4
81
81
  summary: Recommendations for Ruby and Rails using collaborative filtering