disco 0.5.1 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: b1d7b56a9c40088874bbd79ff7fa5bf8783a4406a9e5dadf1dddb04303ca7fee
4
- data.tar.gz: 0d843da83fc246b5da2aa097eba3534d729b51d9b03c368226b884fe33e61795
3
+ metadata.gz: bb67e295f1ad63fed7441947934c3bf4c3b527cfddb1f1d9620d551c9e409223
4
+ data.tar.gz: 6c86546e3890792c89008fe9850a242e894b097eed2621bb9dba3bef6a213296
5
5
  SHA512:
6
- metadata.gz: 49341b30e2ffa7348c19af2c0073f8f9e15afe3dcd856922fb5ae3fd3cd6e5ab555731c38bf3678f5f4efb064ecfce994d24b6238aa93fbc62e4621d7de036ab
7
- data.tar.gz: 9bb387230927cf55e94c3614c4a68881031206c9ee2dee2d48c06f6053134cdff0af4c4d24eba825acf3429394556368378bd51860f4dfcddf0dd1742309794f
6
+ metadata.gz: f277e589e5a833a834874c5cf25dba32d7908b197047fe0b6e44ae6b8950a55204dc235de8d5c36c6b5ec6a78c53c4ba63c1607c64b8da4c1555eba4b0dcd7bd
7
+ data.tar.gz: 2abee5543ba9a600dc4692e15af30d63505dc87eb6ccc3dd42b530c4760aacef562c597882b26a9e1c586b813ae1f91c79307627d7511321298e879205d1a631
data/CHANGELOG.md CHANGED
@@ -1,3 +1,15 @@
1
+ ## 1.0.0 (2026-04-17)
2
+
3
+ - Switched to `numo-narray-alt`
4
+ - Fixed handling of new users and items in validation set
5
+ - Dropped support for Daru
6
+ - Dropped support for Ruby < 3.3 and Rails < 7.2
7
+
8
+ ## 0.5.2 (2025-09-12)
9
+
10
+ - Fixed recommendations when numo-narray compiled with GCC 13+
11
+ - Fixed error with Rover 0.5+
12
+
1
13
  ## 0.5.1 (2024-12-29)
2
14
 
3
15
  - Removed dependency on `base64` gem for serialization
data/LICENSE.txt CHANGED
@@ -1,4 +1,4 @@
1
- Copyright (c) 2019-2024 Andrew Kane
1
+ Copyright (c) 2019-2026 Andrew Kane
2
2
 
3
3
  MIT License
4
4
 
data/README.md CHANGED
@@ -256,12 +256,6 @@ Or a Rover data frame
256
256
  Rover.read_csv("ratings.csv")
257
257
  ```
258
258
 
259
- Or a Daru data frame
260
-
261
- ```ruby
262
- Daru::DataFrame.from_csv("ratings.csv")
263
- ```
264
-
265
259
  ## Performance
266
260
 
267
261
  If you have a large number of users or items, you can use an approximate nearest neighbors library like [Faiss](https://github.com/ankane/faiss) to improve the performance of certain methods.
data/lib/disco/data.rb CHANGED
@@ -29,9 +29,7 @@ module Disco
29
29
 
30
30
  def download_file(fname, origin, file_hash:)
31
31
  require "digest"
32
- require "fileutils"
33
- require "net/http"
34
- require "tmpdir"
32
+ require "open-uri"
35
33
 
36
34
  cache_home = ENV["XDG_CACHE_HOME"] || "#{ENV.fetch("HOME")}/.cache"
37
35
  dest = "#{cache_home}/disco/#{fname}"
@@ -39,38 +37,24 @@ module Disco
39
37
 
40
38
  return dest if File.exist?(dest)
41
39
 
42
- temp_path = "#{Dir.tmpdir}/disco-#{Time.now.to_f}" # TODO better name
43
-
44
- digest = Digest::SHA2.new
45
-
46
- uri = URI(origin)
47
-
48
- # Net::HTTP automatically adds Accept-Encoding for compression
49
- # of response bodies and automatically decompresses gzip
50
- # and deflateresponses unless a Range header was sent.
51
- # https://ruby-doc.org/stdlib-2.6.4/libdoc/net/http/rdoc/Net/HTTP.html
52
- Net::HTTP.start(uri.host, uri.port, use_ssl: uri.scheme == "https") do |http|
53
- request = Net::HTTP::Get.new(uri)
54
-
55
- puts "Downloading data from #{origin}"
56
- File.open(temp_path, "wb") do |f|
57
- http.request(request) do |response|
58
- response.read_body do |chunk|
59
- f.write(chunk)
60
- digest.update(chunk)
61
- end
40
+ puts "Downloading data from #{origin}"
41
+ URI.parse(origin).open(redirect: false) do |download|
42
+ digest =
43
+ if download.respond_to?(:path)
44
+ download.flush
45
+ Digest::SHA256.file(download.path).hexdigest
46
+ else
47
+ Digest::SHA256.hexdigest(download.string)
62
48
  end
49
+
50
+ if digest != file_hash
51
+ raise Error, "Bad hash: #{digest}"
63
52
  end
64
- end
53
+ puts "Hash verified: #{file_hash}"
65
54
 
66
- if digest.hexdigest != file_hash
67
- raise Error, "Bad hash: #{digest.hexdigest}"
55
+ IO.copy_stream(download, dest)
68
56
  end
69
57
 
70
- puts "Hash verified: #{file_hash}"
71
-
72
- FileUtils.mv(temp_path, dest)
73
-
74
58
  dest
75
59
  end
76
60
  end
data/lib/disco/model.rb CHANGED
@@ -1,8 +1,8 @@
1
1
  module Disco
2
2
  module Model
3
3
  def has_recommended(name, class_name: nil)
4
- if ActiveRecord::VERSION::MAJOR < 7
5
- raise Disco::Error, "Requires Active Record 7+"
4
+ if ActiveRecord::VERSION::STRING.to_f < 7.2
5
+ raise Disco::Error, "Requires Active Record 7.2+"
6
6
  end
7
7
 
8
8
  class_name ||= name.to_s.singularize.camelize
@@ -38,18 +38,17 @@ module Disco
38
38
 
39
39
  @user_map = {}
40
40
  @item_map = {}
41
- @rated = Hash.new { |hash, key| hash[key] = {} }
41
+ @rated = []
42
42
  input = []
43
43
  train_set.each do |v|
44
44
  # update maps and build matrix in single pass
45
45
  u = (@user_map[v[:user_id]] ||= @user_map.size)
46
46
  i = (@item_map[v[:item_id]] ||= @item_map.size)
47
- @rated[u][i] = true
47
+ (@rated[u] ||= Set.new) << i
48
48
 
49
49
  # explicit will always have a value due to check_ratings
50
50
  input << [u, i, @implicit ? 1 : v[:rating]]
51
51
  end
52
- @rated.default = nil
53
52
 
54
53
  # much more efficient than checking every value in another pass
55
54
  raise ArgumentError, "Missing user_id" if @user_map.key?(nil)
@@ -74,15 +73,24 @@ module Disco
74
73
  end
75
74
 
76
75
  eval_set = nil
77
- if validation_set
76
+ if validation_set&.any?
78
77
  eval_set = []
79
78
  validation_set.each do |v|
80
79
  u = @user_map[v[:user_id]]
81
80
  i = @item_map[v[:item_id]]
82
81
 
83
- # set to non-existent item
84
- u ||= -1
85
- i ||= -1
82
+ if @implicit
83
+ if u.nil?
84
+ raise ArgumentError, "Validation set cannot have new users for implicit feedback"
85
+ end
86
+
87
+ if i.nil?
88
+ raise ArgumentError, "Validation set cannot have new items for implicit feedback"
89
+ end
90
+ else
91
+ u ||= @user_map.size
92
+ i ||= @item_map.size
93
+ end
86
94
 
87
95
  eval_set << [u, i, @implicit ? 1 : v[:rating]]
88
96
  end
@@ -134,21 +142,17 @@ module Disco
134
142
  rated = item_ids ? {} : @rated[u]
135
143
 
136
144
  if item_ids
137
- ids = Numo::NArray.cast(item_ids.map { |i| @item_map[i] }.compact)
145
+ ids = Numo::NArray.cast(item_ids.filter_map { |i| @item_map[i] })
138
146
  return [] if ids.size == 0
139
147
 
140
148
  predictions = @item_factors[ids, true].inner(@user_factors[u, true])
141
- indexes = predictions.sort_index.reverse
142
- indexes = indexes[0...[count + rated.size, indexes.size].min] if count
143
- predictions = predictions[indexes]
149
+ predictions, indexes = top_k(predictions, count ? count + rated.size : nil)
144
150
  ids = ids[indexes]
145
151
  elsif @user_recs_index && count
146
152
  predictions, ids = @user_recs_index.search(@user_factors[u, true].expand_dims(0), count + rated.size).map { |v| v[0, true] }
147
153
  else
148
154
  predictions = @item_factors.inner(@user_factors[u, true])
149
- indexes = predictions.sort_index.reverse # reverse just creates view
150
- indexes = indexes[0...[count + rated.size, indexes.size].min] if count
151
- predictions = predictions[indexes]
155
+ predictions, indexes = top_k(predictions, count ? count + rated.size : nil)
152
156
  ids = indexes
153
157
  end
154
158
 
@@ -157,7 +161,7 @@ module Disco
157
161
  keys = @item_map.keys
158
162
  result = []
159
163
  ids.each_with_index do |item_id, i|
160
- next if rated[item_id]
164
+ next if rated.include?(item_id)
161
165
 
162
166
  result << {item_id: keys[item_id], score: predictions[i]}
163
167
  break if result.size == count
@@ -196,7 +200,7 @@ module Disco
196
200
  # wilson score with continuity correction
197
201
  # https://en.wikipedia.org/wiki/Binomial_proportion_confidence_interval#Wilson_score_interval_with_continuity_correction
198
202
  z = 1.96 # 95% confidence
199
- range = @max_rating - @min_rating
203
+ range = @max_rating - min_rating
200
204
  n = Numo::DFloat.cast(@item_count)
201
205
  phat = (Numo::DFloat.cast(@item_sum) - (min_rating * n)) / range / n
202
206
  phat = (phat - (1 / (2 * n))).clip(0, nil) # continuity correction
@@ -204,9 +208,7 @@ module Disco
204
208
  scores = scores * range + min_rating
205
209
  end
206
210
 
207
- indexes = scores.sort_index.reverse
208
- indexes = indexes[0...[count, indexes.size].min] if count
209
- scores = scores[indexes]
211
+ scores, indexes = top_k(scores, count)
210
212
 
211
213
  keys = @item_map.keys
212
214
  indexes.size.times.map do |i|
@@ -267,7 +269,7 @@ module Disco
267
269
  implicit: @implicit,
268
270
  user_ids: @user_map.keys,
269
271
  item_ids: @item_map.keys,
270
- rated: @user_map.map { |_, u| (@rated[u] || {}).keys },
272
+ rated: @rated.map { |v| v.to_a.sort },
271
273
  global_mean: @global_mean,
272
274
  user_factors: [@user_factors.to_binary].pack("m0"),
273
275
  item_factors: [@item_factors.to_binary].pack("m0"),
@@ -371,9 +373,7 @@ module Disco
371
373
  end
372
374
  else
373
375
  predictions = factors.inner(factors[i, true]) / (norms * norms[i])
374
- indexes = predictions.sort_index.reverse
375
- indexes = indexes[0...[count + 1, indexes.size].min] if count
376
- predictions = predictions[indexes]
376
+ predictions, indexes = top_k(predictions, count ? count + 1 : nil)
377
377
  ids = indexes
378
378
  end
379
379
 
@@ -394,6 +394,12 @@ module Disco
394
394
  end
395
395
  end
396
396
 
397
+ def top_k(values, count)
398
+ indexes = values.sort_index.reverse
399
+ indexes = indexes[0...[count, indexes.size].min] if count
400
+ [values[indexes], indexes]
401
+ end
402
+
397
403
  def check_ratings(ratings)
398
404
  unless ratings.all? { |r| !r[:rating].nil? }
399
405
  raise ArgumentError, "Missing rating"
@@ -414,17 +420,7 @@ module Disco
414
420
  def to_dataset(dataset)
415
421
  if defined?(Rover::DataFrame) && dataset.is_a?(Rover::DataFrame)
416
422
  # convert keys to symbols
417
- dataset = dataset.dup
418
- dataset.keys.each do |k, v|
419
- dataset[k.to_sym] ||= dataset.delete(k)
420
- end
421
- dataset.to_a
422
- elsif defined?(Daru::DataFrame) && dataset.is_a?(Daru::DataFrame)
423
- # convert keys to symbols
424
- dataset = dataset.dup
425
- new_names = dataset.vectors.to_a.map { |k| [k, k.to_sym] }.to_h
426
- dataset.rename_vectors!(new_names)
427
- dataset.to_a[0]
423
+ dataset.each_row.map { |v| v.transform_keys(&:to_sym) }
428
424
  else
429
425
  dataset
430
426
  end
@@ -434,7 +430,7 @@ module Disco
434
430
  @implicit = obj["implicit"]
435
431
  @user_map = obj["user_ids"].map.with_index.to_h
436
432
  @item_map = obj["item_ids"].map.with_index.to_h
437
- @rated = obj["rated"].map.with_index.to_h { |r, i| [i, r.to_h { |v| [v, true] }] }
433
+ @rated = obj["rated"].map { |r| Set.new(r) }
438
434
  @global_mean = obj["global_mean"].to_f
439
435
  @factors = obj["factors"].to_i
440
436
  @user_factors = Numo::SFloat.from_binary(obj["user_factors"].unpack1("m0"), [@user_map.size, @factors])
data/lib/disco/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Disco
2
- VERSION = "0.5.1"
2
+ VERSION = "1.0.0"
3
3
  end
data/lib/disco.rb CHANGED
@@ -1,6 +1,9 @@
1
1
  # dependencies
2
2
  require "libmf"
3
- require "numo/narray"
3
+ require "numo/narray/alt"
4
+
5
+ # stdlib
6
+ require "set"
4
7
 
5
8
  # modules
6
9
  require_relative "disco/data"
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: disco
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.1
4
+ version: 1.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
8
8
  bindir: bin
9
9
  cert_chain: []
10
- date: 2024-12-30 00:00:00.000000000 Z
10
+ date: 1980-01-02 00:00:00.000000000 Z
11
11
  dependencies:
12
12
  - !ruby/object:Gem::Dependency
13
13
  name: libmf
@@ -15,28 +15,28 @@ dependencies:
15
15
  requirements:
16
16
  - - ">="
17
17
  - !ruby/object:Gem::Version
18
- version: '0.4'
18
+ version: '0.5'
19
19
  type: :runtime
20
20
  prerelease: false
21
21
  version_requirements: !ruby/object:Gem::Requirement
22
22
  requirements:
23
23
  - - ">="
24
24
  - !ruby/object:Gem::Version
25
- version: '0.4'
25
+ version: '0.5'
26
26
  - !ruby/object:Gem::Dependency
27
- name: numo-narray
27
+ name: numo-narray-alt
28
28
  requirement: !ruby/object:Gem::Requirement
29
29
  requirements:
30
30
  - - ">="
31
31
  - !ruby/object:Gem::Version
32
- version: 0.9.2
32
+ version: '0.10'
33
33
  type: :runtime
34
34
  prerelease: false
35
35
  version_requirements: !ruby/object:Gem::Requirement
36
36
  requirements:
37
37
  - - ">="
38
38
  - !ruby/object:Gem::Version
39
- version: 0.9.2
39
+ version: '0.10'
40
40
  email: andrew@ankane.org
41
41
  executables: []
42
42
  extensions: []
@@ -66,14 +66,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
66
66
  requirements:
67
67
  - - ">="
68
68
  - !ruby/object:Gem::Version
69
- version: '3.1'
69
+ version: '3.3'
70
70
  required_rubygems_version: !ruby/object:Gem::Requirement
71
71
  requirements:
72
72
  - - ">="
73
73
  - !ruby/object:Gem::Version
74
74
  version: '0'
75
75
  requirements: []
76
- rubygems_version: 3.6.2
76
+ rubygems_version: 4.0.6
77
77
  specification_version: 4
78
78
  summary: Recommendations for Ruby and Rails using collaborative filtering
79
79
  test_files: []