disco 0.5.2 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -0
- data/LICENSE.txt +1 -1
- data/README.md +0 -6
- data/lib/disco/data.rb +14 -30
- data/lib/disco/model.rb +2 -2
- data/lib/disco/recommender.rb +21 -34
- data/lib/disco/version.rb +1 -1
- data/lib/disco.rb +4 -1
- metadata +8 -8
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: bb67e295f1ad63fed7441947934c3bf4c3b527cfddb1f1d9620d551c9e409223
|
|
4
|
+
data.tar.gz: 6c86546e3890792c89008fe9850a242e894b097eed2621bb9dba3bef6a213296
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: f277e589e5a833a834874c5cf25dba32d7908b197047fe0b6e44ae6b8950a55204dc235de8d5c36c6b5ec6a78c53c4ba63c1607c64b8da4c1555eba4b0dcd7bd
|
|
7
|
+
data.tar.gz: 2abee5543ba9a600dc4692e15af30d63505dc87eb6ccc3dd42b530c4760aacef562c597882b26a9e1c586b813ae1f91c79307627d7511321298e879205d1a631
|
data/CHANGELOG.md
CHANGED
|
@@ -1,3 +1,10 @@
|
|
|
1
|
+
## 1.0.0 (2026-04-17)
|
|
2
|
+
|
|
3
|
+
- Switched to `numo-narray-alt`
|
|
4
|
+
- Fixed handling of new users and items in validation set
|
|
5
|
+
- Dropped support for Daru
|
|
6
|
+
- Dropped support for Ruby < 3.3 and Rails < 7.2
|
|
7
|
+
|
|
1
8
|
## 0.5.2 (2025-09-12)
|
|
2
9
|
|
|
3
10
|
- Fixed recommendations when numo-narray compiled with GCC 13+
|
data/LICENSE.txt
CHANGED
data/README.md
CHANGED
|
@@ -256,12 +256,6 @@ Or a Rover data frame
|
|
|
256
256
|
Rover.read_csv("ratings.csv")
|
|
257
257
|
```
|
|
258
258
|
|
|
259
|
-
Or a Daru data frame
|
|
260
|
-
|
|
261
|
-
```ruby
|
|
262
|
-
Daru::DataFrame.from_csv("ratings.csv")
|
|
263
|
-
```
|
|
264
|
-
|
|
265
259
|
## Performance
|
|
266
260
|
|
|
267
261
|
If you have a large number of users or items, you can use an approximate nearest neighbors library like [Faiss](https://github.com/ankane/faiss) to improve the performance of certain methods.
|
data/lib/disco/data.rb
CHANGED
|
@@ -29,9 +29,7 @@ module Disco
|
|
|
29
29
|
|
|
30
30
|
def download_file(fname, origin, file_hash:)
|
|
31
31
|
require "digest"
|
|
32
|
-
require "
|
|
33
|
-
require "net/http"
|
|
34
|
-
require "tmpdir"
|
|
32
|
+
require "open-uri"
|
|
35
33
|
|
|
36
34
|
cache_home = ENV["XDG_CACHE_HOME"] || "#{ENV.fetch("HOME")}/.cache"
|
|
37
35
|
dest = "#{cache_home}/disco/#{fname}"
|
|
@@ -39,38 +37,24 @@ module Disco
|
|
|
39
37
|
|
|
40
38
|
return dest if File.exist?(dest)
|
|
41
39
|
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
# and deflate responses unless a Range header was sent.
|
|
51
|
-
# https://ruby-doc.org/stdlib-2.6.4/libdoc/net/http/rdoc/Net/HTTP.html
|
|
52
|
-
Net::HTTP.start(uri.host, uri.port, use_ssl: uri.scheme == "https") do |http|
|
|
53
|
-
request = Net::HTTP::Get.new(uri)
|
|
54
|
-
|
|
55
|
-
puts "Downloading data from #{origin}"
|
|
56
|
-
File.open(temp_path, "wb") do |f|
|
|
57
|
-
http.request(request) do |response|
|
|
58
|
-
response.read_body do |chunk|
|
|
59
|
-
f.write(chunk)
|
|
60
|
-
digest.update(chunk)
|
|
61
|
-
end
|
|
40
|
+
puts "Downloading data from #{origin}"
|
|
41
|
+
URI.parse(origin).open(redirect: false) do |download|
|
|
42
|
+
digest =
|
|
43
|
+
if download.respond_to?(:path)
|
|
44
|
+
download.flush
|
|
45
|
+
Digest::SHA256.file(download.path).hexdigest
|
|
46
|
+
else
|
|
47
|
+
Digest::SHA256.hexdigest(download.string)
|
|
62
48
|
end
|
|
49
|
+
|
|
50
|
+
if digest != file_hash
|
|
51
|
+
raise Error, "Bad hash: #{digest}"
|
|
63
52
|
end
|
|
64
|
-
|
|
53
|
+
puts "Hash verified: #{file_hash}"
|
|
65
54
|
|
|
66
|
-
|
|
67
|
-
raise Error, "Bad hash: #{digest.hexdigest}"
|
|
55
|
+
IO.copy_stream(download, dest)
|
|
68
56
|
end
|
|
69
57
|
|
|
70
|
-
puts "Hash verified: #{file_hash}"
|
|
71
|
-
|
|
72
|
-
FileUtils.mv(temp_path, dest)
|
|
73
|
-
|
|
74
58
|
dest
|
|
75
59
|
end
|
|
76
60
|
end
|
data/lib/disco/model.rb
CHANGED
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
module Disco
|
|
2
2
|
module Model
|
|
3
3
|
def has_recommended(name, class_name: nil)
|
|
4
|
-
if ActiveRecord::VERSION::
|
|
5
|
-
raise Disco::Error, "Requires Active Record 7+"
|
|
4
|
+
if ActiveRecord::VERSION::STRING.to_f < 7.2
|
|
5
|
+
raise Disco::Error, "Requires Active Record 7.2+"
|
|
6
6
|
end
|
|
7
7
|
|
|
8
8
|
class_name ||= name.to_s.singularize.camelize
|
data/lib/disco/recommender.rb
CHANGED
|
@@ -38,18 +38,17 @@ module Disco
|
|
|
38
38
|
|
|
39
39
|
@user_map = {}
|
|
40
40
|
@item_map = {}
|
|
41
|
-
@rated =
|
|
41
|
+
@rated = []
|
|
42
42
|
input = []
|
|
43
43
|
train_set.each do |v|
|
|
44
44
|
# update maps and build matrix in single pass
|
|
45
45
|
u = (@user_map[v[:user_id]] ||= @user_map.size)
|
|
46
46
|
i = (@item_map[v[:item_id]] ||= @item_map.size)
|
|
47
|
-
@rated[u]
|
|
47
|
+
(@rated[u] ||= Set.new) << i
|
|
48
48
|
|
|
49
49
|
# explicit will always have a value due to check_ratings
|
|
50
50
|
input << [u, i, @implicit ? 1 : v[:rating]]
|
|
51
51
|
end
|
|
52
|
-
@rated.default = nil
|
|
53
52
|
|
|
54
53
|
# much more efficient than checking every value in another pass
|
|
55
54
|
raise ArgumentError, "Missing user_id" if @user_map.key?(nil)
|
|
@@ -74,15 +73,24 @@ module Disco
|
|
|
74
73
|
end
|
|
75
74
|
|
|
76
75
|
eval_set = nil
|
|
77
|
-
if validation_set
|
|
76
|
+
if validation_set&.any?
|
|
78
77
|
eval_set = []
|
|
79
78
|
validation_set.each do |v|
|
|
80
79
|
u = @user_map[v[:user_id]]
|
|
81
80
|
i = @item_map[v[:item_id]]
|
|
82
81
|
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
82
|
+
if @implicit
|
|
83
|
+
if u.nil?
|
|
84
|
+
raise ArgumentError, "Validation set cannot have new users for implicit feedback"
|
|
85
|
+
end
|
|
86
|
+
|
|
87
|
+
if i.nil?
|
|
88
|
+
raise ArgumentError, "Validation set cannot have new items for implicit feedback"
|
|
89
|
+
end
|
|
90
|
+
else
|
|
91
|
+
u ||= @user_map.size
|
|
92
|
+
i ||= @item_map.size
|
|
93
|
+
end
|
|
86
94
|
|
|
87
95
|
eval_set << [u, i, @implicit ? 1 : v[:rating]]
|
|
88
96
|
end
|
|
@@ -134,7 +142,7 @@ module Disco
|
|
|
134
142
|
rated = item_ids ? {} : @rated[u]
|
|
135
143
|
|
|
136
144
|
if item_ids
|
|
137
|
-
ids = Numo::NArray.cast(item_ids.
|
|
145
|
+
ids = Numo::NArray.cast(item_ids.filter_map { |i| @item_map[i] })
|
|
138
146
|
return [] if ids.size == 0
|
|
139
147
|
|
|
140
148
|
predictions = @item_factors[ids, true].inner(@user_factors[u, true])
|
|
@@ -153,7 +161,7 @@ module Disco
|
|
|
153
161
|
keys = @item_map.keys
|
|
154
162
|
result = []
|
|
155
163
|
ids.each_with_index do |item_id, i|
|
|
156
|
-
next if rated
|
|
164
|
+
next if rated.include?(item_id)
|
|
157
165
|
|
|
158
166
|
result << {item_id: keys[item_id], score: predictions[i]}
|
|
159
167
|
break if result.size == count
|
|
@@ -261,7 +269,7 @@ module Disco
|
|
|
261
269
|
implicit: @implicit,
|
|
262
270
|
user_ids: @user_map.keys,
|
|
263
271
|
item_ids: @item_map.keys,
|
|
264
|
-
rated: @
|
|
272
|
+
rated: @rated.map { |v| v.to_a.sort },
|
|
265
273
|
global_mean: @global_mean,
|
|
266
274
|
user_factors: [@user_factors.to_binary].pack("m0"),
|
|
267
275
|
item_factors: [@item_factors.to_binary].pack("m0"),
|
|
@@ -387,26 +395,11 @@ module Disco
|
|
|
387
395
|
end
|
|
388
396
|
|
|
389
397
|
def top_k(values, count)
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
indexes = indexes[0...[count, indexes.size].min] if count
|
|
393
|
-
else
|
|
394
|
-
indexes = values.to_a.each_with_index.sort_by { |v, _| -v }
|
|
395
|
-
indexes = indexes.first(count) if count
|
|
396
|
-
indexes = indexes.map(&:last)
|
|
397
|
-
end
|
|
398
|
+
indexes = values.sort_index.reverse
|
|
399
|
+
indexes = indexes[0...[count, indexes.size].min] if count
|
|
398
400
|
[values[indexes], indexes]
|
|
399
401
|
end
|
|
400
402
|
|
|
401
|
-
# https://github.com/ruby-numo/numo-narray/issues/243
|
|
402
|
-
def self.sort_index?
|
|
403
|
-
unless defined?(@sort_index)
|
|
404
|
-
arr = Numo::SFloat.new(100).rand
|
|
405
|
-
@sort_index = arr[arr.sort_index].to_a == arr.to_a.sort
|
|
406
|
-
end
|
|
407
|
-
@sort_index
|
|
408
|
-
end
|
|
409
|
-
|
|
410
403
|
def check_ratings(ratings)
|
|
411
404
|
unless ratings.all? { |r| !r[:rating].nil? }
|
|
412
405
|
raise ArgumentError, "Missing rating"
|
|
@@ -428,12 +421,6 @@ module Disco
|
|
|
428
421
|
if defined?(Rover::DataFrame) && dataset.is_a?(Rover::DataFrame)
|
|
429
422
|
# convert keys to symbols
|
|
430
423
|
dataset.each_row.map { |v| v.transform_keys(&:to_sym) }
|
|
431
|
-
elsif defined?(Daru::DataFrame) && dataset.is_a?(Daru::DataFrame)
|
|
432
|
-
# convert keys to symbols
|
|
433
|
-
dataset = dataset.dup
|
|
434
|
-
new_names = dataset.vectors.to_a.map { |k| [k, k.to_sym] }.to_h
|
|
435
|
-
dataset.rename_vectors!(new_names)
|
|
436
|
-
dataset.to_a[0]
|
|
437
424
|
else
|
|
438
425
|
dataset
|
|
439
426
|
end
|
|
@@ -443,7 +430,7 @@ module Disco
|
|
|
443
430
|
@implicit = obj["implicit"]
|
|
444
431
|
@user_map = obj["user_ids"].map.with_index.to_h
|
|
445
432
|
@item_map = obj["item_ids"].map.with_index.to_h
|
|
446
|
-
@rated = obj["rated"].map
|
|
433
|
+
@rated = obj["rated"].map { |r| Set.new(r) }
|
|
447
434
|
@global_mean = obj["global_mean"].to_f
|
|
448
435
|
@factors = obj["factors"].to_i
|
|
449
436
|
@user_factors = Numo::SFloat.from_binary(obj["user_factors"].unpack1("m0"), [@user_map.size, @factors])
|
data/lib/disco/version.rb
CHANGED
data/lib/disco.rb
CHANGED
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: disco
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.
|
|
4
|
+
version: 1.0.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Andrew Kane
|
|
@@ -15,28 +15,28 @@ dependencies:
|
|
|
15
15
|
requirements:
|
|
16
16
|
- - ">="
|
|
17
17
|
- !ruby/object:Gem::Version
|
|
18
|
-
version: '0.
|
|
18
|
+
version: '0.5'
|
|
19
19
|
type: :runtime
|
|
20
20
|
prerelease: false
|
|
21
21
|
version_requirements: !ruby/object:Gem::Requirement
|
|
22
22
|
requirements:
|
|
23
23
|
- - ">="
|
|
24
24
|
- !ruby/object:Gem::Version
|
|
25
|
-
version: '0.
|
|
25
|
+
version: '0.5'
|
|
26
26
|
- !ruby/object:Gem::Dependency
|
|
27
|
-
name: numo-narray
|
|
27
|
+
name: numo-narray-alt
|
|
28
28
|
requirement: !ruby/object:Gem::Requirement
|
|
29
29
|
requirements:
|
|
30
30
|
- - ">="
|
|
31
31
|
- !ruby/object:Gem::Version
|
|
32
|
-
version: 0.
|
|
32
|
+
version: '0.10'
|
|
33
33
|
type: :runtime
|
|
34
34
|
prerelease: false
|
|
35
35
|
version_requirements: !ruby/object:Gem::Requirement
|
|
36
36
|
requirements:
|
|
37
37
|
- - ">="
|
|
38
38
|
- !ruby/object:Gem::Version
|
|
39
|
-
version: 0.
|
|
39
|
+
version: '0.10'
|
|
40
40
|
email: andrew@ankane.org
|
|
41
41
|
executables: []
|
|
42
42
|
extensions: []
|
|
@@ -66,14 +66,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
|
66
66
|
requirements:
|
|
67
67
|
- - ">="
|
|
68
68
|
- !ruby/object:Gem::Version
|
|
69
|
-
version: '3.
|
|
69
|
+
version: '3.3'
|
|
70
70
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
71
71
|
requirements:
|
|
72
72
|
- - ">="
|
|
73
73
|
- !ruby/object:Gem::Version
|
|
74
74
|
version: '0'
|
|
75
75
|
requirements: []
|
|
76
|
-
rubygems_version:
|
|
76
|
+
rubygems_version: 4.0.6
|
|
77
77
|
specification_version: 4
|
|
78
78
|
summary: Recommendations for Ruby and Rails using collaborative filtering
|
|
79
79
|
test_files: []
|