disco 0.5.1 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +12 -0
- data/LICENSE.txt +1 -1
- data/README.md +0 -6
- data/lib/disco/data.rb +14 -30
- data/lib/disco/model.rb +2 -2
- data/lib/disco/recommender.rb +31 -35
- data/lib/disco/version.rb +1 -1
- data/lib/disco.rb +4 -1
- metadata +9 -9
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: bb67e295f1ad63fed7441947934c3bf4c3b527cfddb1f1d9620d551c9e409223
|
|
4
|
+
data.tar.gz: 6c86546e3890792c89008fe9850a242e894b097eed2621bb9dba3bef6a213296
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: f277e589e5a833a834874c5cf25dba32d7908b197047fe0b6e44ae6b8950a55204dc235de8d5c36c6b5ec6a78c53c4ba63c1607c64b8da4c1555eba4b0dcd7bd
|
|
7
|
+
data.tar.gz: 2abee5543ba9a600dc4692e15af30d63505dc87eb6ccc3dd42b530c4760aacef562c597882b26a9e1c586b813ae1f91c79307627d7511321298e879205d1a631
|
data/CHANGELOG.md
CHANGED
|
@@ -1,3 +1,15 @@
|
|
|
1
|
+
## 1.0.0 (2026-04-17)
|
|
2
|
+
|
|
3
|
+
- Switched to `numo-narray-alt`
|
|
4
|
+
- Fixed handling of new users and items in validation set
|
|
5
|
+
- Dropped support for Daru
|
|
6
|
+
- Dropped support for Ruby < 3.3 and Rails < 7.2
|
|
7
|
+
|
|
8
|
+
## 0.5.2 (2025-09-12)
|
|
9
|
+
|
|
10
|
+
- Fixed recommendations when numo-narray compiled with GCC 13+
|
|
11
|
+
- Fixed error with Rover 0.5+
|
|
12
|
+
|
|
1
13
|
## 0.5.1 (2024-12-29)
|
|
2
14
|
|
|
3
15
|
- Removed dependency on `base64` gem for serialization
|
data/LICENSE.txt
CHANGED
data/README.md
CHANGED
|
@@ -256,12 +256,6 @@ Or a Rover data frame
|
|
|
256
256
|
Rover.read_csv("ratings.csv")
|
|
257
257
|
```
|
|
258
258
|
|
|
259
|
-
Or a Daru data frame
|
|
260
|
-
|
|
261
|
-
```ruby
|
|
262
|
-
Daru::DataFrame.from_csv("ratings.csv")
|
|
263
|
-
```
|
|
264
|
-
|
|
265
259
|
## Performance
|
|
266
260
|
|
|
267
261
|
If you have a large number of users or items, you can use an approximate nearest neighbors library like [Faiss](https://github.com/ankane/faiss) to improve the performance of certain methods.
|
data/lib/disco/data.rb
CHANGED
|
@@ -29,9 +29,7 @@ module Disco
|
|
|
29
29
|
|
|
30
30
|
def download_file(fname, origin, file_hash:)
|
|
31
31
|
require "digest"
|
|
32
|
-
require "
|
|
33
|
-
require "net/http"
|
|
34
|
-
require "tmpdir"
|
|
32
|
+
require "open-uri"
|
|
35
33
|
|
|
36
34
|
cache_home = ENV["XDG_CACHE_HOME"] || "#{ENV.fetch("HOME")}/.cache"
|
|
37
35
|
dest = "#{cache_home}/disco/#{fname}"
|
|
@@ -39,38 +37,24 @@ module Disco
|
|
|
39
37
|
|
|
40
38
|
return dest if File.exist?(dest)
|
|
41
39
|
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
# and deflateresponses unless a Range header was sent.
|
|
51
|
-
# https://ruby-doc.org/stdlib-2.6.4/libdoc/net/http/rdoc/Net/HTTP.html
|
|
52
|
-
Net::HTTP.start(uri.host, uri.port, use_ssl: uri.scheme == "https") do |http|
|
|
53
|
-
request = Net::HTTP::Get.new(uri)
|
|
54
|
-
|
|
55
|
-
puts "Downloading data from #{origin}"
|
|
56
|
-
File.open(temp_path, "wb") do |f|
|
|
57
|
-
http.request(request) do |response|
|
|
58
|
-
response.read_body do |chunk|
|
|
59
|
-
f.write(chunk)
|
|
60
|
-
digest.update(chunk)
|
|
61
|
-
end
|
|
40
|
+
puts "Downloading data from #{origin}"
|
|
41
|
+
URI.parse(origin).open(redirect: false) do |download|
|
|
42
|
+
digest =
|
|
43
|
+
if download.respond_to?(:path)
|
|
44
|
+
download.flush
|
|
45
|
+
Digest::SHA256.file(download.path).hexdigest
|
|
46
|
+
else
|
|
47
|
+
Digest::SHA256.hexdigest(download.string)
|
|
62
48
|
end
|
|
49
|
+
|
|
50
|
+
if digest != file_hash
|
|
51
|
+
raise Error, "Bad hash: #{digest}"
|
|
63
52
|
end
|
|
64
|
-
|
|
53
|
+
puts "Hash verified: #{file_hash}"
|
|
65
54
|
|
|
66
|
-
|
|
67
|
-
raise Error, "Bad hash: #{digest.hexdigest}"
|
|
55
|
+
IO.copy_stream(download, dest)
|
|
68
56
|
end
|
|
69
57
|
|
|
70
|
-
puts "Hash verified: #{file_hash}"
|
|
71
|
-
|
|
72
|
-
FileUtils.mv(temp_path, dest)
|
|
73
|
-
|
|
74
58
|
dest
|
|
75
59
|
end
|
|
76
60
|
end
|
data/lib/disco/model.rb
CHANGED
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
module Disco
|
|
2
2
|
module Model
|
|
3
3
|
def has_recommended(name, class_name: nil)
|
|
4
|
-
if ActiveRecord::VERSION::
|
|
5
|
-
raise Disco::Error, "Requires Active Record 7+"
|
|
4
|
+
if ActiveRecord::VERSION::STRING.to_f < 7.2
|
|
5
|
+
raise Disco::Error, "Requires Active Record 7.2+"
|
|
6
6
|
end
|
|
7
7
|
|
|
8
8
|
class_name ||= name.to_s.singularize.camelize
|
data/lib/disco/recommender.rb
CHANGED
|
@@ -38,18 +38,17 @@ module Disco
|
|
|
38
38
|
|
|
39
39
|
@user_map = {}
|
|
40
40
|
@item_map = {}
|
|
41
|
-
@rated =
|
|
41
|
+
@rated = []
|
|
42
42
|
input = []
|
|
43
43
|
train_set.each do |v|
|
|
44
44
|
# update maps and build matrix in single pass
|
|
45
45
|
u = (@user_map[v[:user_id]] ||= @user_map.size)
|
|
46
46
|
i = (@item_map[v[:item_id]] ||= @item_map.size)
|
|
47
|
-
@rated[u]
|
|
47
|
+
(@rated[u] ||= Set.new) << i
|
|
48
48
|
|
|
49
49
|
# explicit will always have a value due to check_ratings
|
|
50
50
|
input << [u, i, @implicit ? 1 : v[:rating]]
|
|
51
51
|
end
|
|
52
|
-
@rated.default = nil
|
|
53
52
|
|
|
54
53
|
# much more efficient than checking every value in another pass
|
|
55
54
|
raise ArgumentError, "Missing user_id" if @user_map.key?(nil)
|
|
@@ -74,15 +73,24 @@ module Disco
|
|
|
74
73
|
end
|
|
75
74
|
|
|
76
75
|
eval_set = nil
|
|
77
|
-
if validation_set
|
|
76
|
+
if validation_set&.any?
|
|
78
77
|
eval_set = []
|
|
79
78
|
validation_set.each do |v|
|
|
80
79
|
u = @user_map[v[:user_id]]
|
|
81
80
|
i = @item_map[v[:item_id]]
|
|
82
81
|
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
82
|
+
if @implicit
|
|
83
|
+
if u.nil?
|
|
84
|
+
raise ArgumentError, "Validation set cannot have new users for implicit feedback"
|
|
85
|
+
end
|
|
86
|
+
|
|
87
|
+
if i.nil?
|
|
88
|
+
raise ArgumentError, "Validation set cannot have new items for implicit feedback"
|
|
89
|
+
end
|
|
90
|
+
else
|
|
91
|
+
u ||= @user_map.size
|
|
92
|
+
i ||= @item_map.size
|
|
93
|
+
end
|
|
86
94
|
|
|
87
95
|
eval_set << [u, i, @implicit ? 1 : v[:rating]]
|
|
88
96
|
end
|
|
@@ -134,21 +142,17 @@ module Disco
|
|
|
134
142
|
rated = item_ids ? {} : @rated[u]
|
|
135
143
|
|
|
136
144
|
if item_ids
|
|
137
|
-
ids = Numo::NArray.cast(item_ids.
|
|
145
|
+
ids = Numo::NArray.cast(item_ids.filter_map { |i| @item_map[i] })
|
|
138
146
|
return [] if ids.size == 0
|
|
139
147
|
|
|
140
148
|
predictions = @item_factors[ids, true].inner(@user_factors[u, true])
|
|
141
|
-
indexes = predictions.
|
|
142
|
-
indexes = indexes[0...[count + rated.size, indexes.size].min] if count
|
|
143
|
-
predictions = predictions[indexes]
|
|
149
|
+
predictions, indexes = top_k(predictions, count ? count + rated.size : nil)
|
|
144
150
|
ids = ids[indexes]
|
|
145
151
|
elsif @user_recs_index && count
|
|
146
152
|
predictions, ids = @user_recs_index.search(@user_factors[u, true].expand_dims(0), count + rated.size).map { |v| v[0, true] }
|
|
147
153
|
else
|
|
148
154
|
predictions = @item_factors.inner(@user_factors[u, true])
|
|
149
|
-
indexes = predictions
|
|
150
|
-
indexes = indexes[0...[count + rated.size, indexes.size].min] if count
|
|
151
|
-
predictions = predictions[indexes]
|
|
155
|
+
predictions, indexes = top_k(predictions, count ? count + rated.size : nil)
|
|
152
156
|
ids = indexes
|
|
153
157
|
end
|
|
154
158
|
|
|
@@ -157,7 +161,7 @@ module Disco
|
|
|
157
161
|
keys = @item_map.keys
|
|
158
162
|
result = []
|
|
159
163
|
ids.each_with_index do |item_id, i|
|
|
160
|
-
next if rated
|
|
164
|
+
next if rated.include?(item_id)
|
|
161
165
|
|
|
162
166
|
result << {item_id: keys[item_id], score: predictions[i]}
|
|
163
167
|
break if result.size == count
|
|
@@ -196,7 +200,7 @@ module Disco
|
|
|
196
200
|
# wilson score with continuity correction
|
|
197
201
|
# https://en.wikipedia.org/wiki/Binomial_proportion_confidence_interval#Wilson_score_interval_with_continuity_correction
|
|
198
202
|
z = 1.96 # 95% confidence
|
|
199
|
-
range = @max_rating -
|
|
203
|
+
range = @max_rating - min_rating
|
|
200
204
|
n = Numo::DFloat.cast(@item_count)
|
|
201
205
|
phat = (Numo::DFloat.cast(@item_sum) - (min_rating * n)) / range / n
|
|
202
206
|
phat = (phat - (1 / (2 * n))).clip(0, nil) # continuity correction
|
|
@@ -204,9 +208,7 @@ module Disco
|
|
|
204
208
|
scores = scores * range + min_rating
|
|
205
209
|
end
|
|
206
210
|
|
|
207
|
-
indexes = scores
|
|
208
|
-
indexes = indexes[0...[count, indexes.size].min] if count
|
|
209
|
-
scores = scores[indexes]
|
|
211
|
+
scores, indexes = top_k(scores, count)
|
|
210
212
|
|
|
211
213
|
keys = @item_map.keys
|
|
212
214
|
indexes.size.times.map do |i|
|
|
@@ -267,7 +269,7 @@ module Disco
|
|
|
267
269
|
implicit: @implicit,
|
|
268
270
|
user_ids: @user_map.keys,
|
|
269
271
|
item_ids: @item_map.keys,
|
|
270
|
-
rated: @
|
|
272
|
+
rated: @rated.map { |v| v.to_a.sort },
|
|
271
273
|
global_mean: @global_mean,
|
|
272
274
|
user_factors: [@user_factors.to_binary].pack("m0"),
|
|
273
275
|
item_factors: [@item_factors.to_binary].pack("m0"),
|
|
@@ -371,9 +373,7 @@ module Disco
|
|
|
371
373
|
end
|
|
372
374
|
else
|
|
373
375
|
predictions = factors.inner(factors[i, true]) / (norms * norms[i])
|
|
374
|
-
indexes = predictions
|
|
375
|
-
indexes = indexes[0...[count + 1, indexes.size].min] if count
|
|
376
|
-
predictions = predictions[indexes]
|
|
376
|
+
predictions, indexes = top_k(predictions, count ? count + 1 : nil)
|
|
377
377
|
ids = indexes
|
|
378
378
|
end
|
|
379
379
|
|
|
@@ -394,6 +394,12 @@ module Disco
|
|
|
394
394
|
end
|
|
395
395
|
end
|
|
396
396
|
|
|
397
|
+
def top_k(values, count)
|
|
398
|
+
indexes = values.sort_index.reverse
|
|
399
|
+
indexes = indexes[0...[count, indexes.size].min] if count
|
|
400
|
+
[values[indexes], indexes]
|
|
401
|
+
end
|
|
402
|
+
|
|
397
403
|
def check_ratings(ratings)
|
|
398
404
|
unless ratings.all? { |r| !r[:rating].nil? }
|
|
399
405
|
raise ArgumentError, "Missing rating"
|
|
@@ -414,17 +420,7 @@ module Disco
|
|
|
414
420
|
def to_dataset(dataset)
|
|
415
421
|
if defined?(Rover::DataFrame) && dataset.is_a?(Rover::DataFrame)
|
|
416
422
|
# convert keys to symbols
|
|
417
|
-
dataset
|
|
418
|
-
dataset.keys.each do |k, v|
|
|
419
|
-
dataset[k.to_sym] ||= dataset.delete(k)
|
|
420
|
-
end
|
|
421
|
-
dataset.to_a
|
|
422
|
-
elsif defined?(Daru::DataFrame) && dataset.is_a?(Daru::DataFrame)
|
|
423
|
-
# convert keys to symbols
|
|
424
|
-
dataset = dataset.dup
|
|
425
|
-
new_names = dataset.vectors.to_a.map { |k| [k, k.to_sym] }.to_h
|
|
426
|
-
dataset.rename_vectors!(new_names)
|
|
427
|
-
dataset.to_a[0]
|
|
423
|
+
dataset.each_row.map { |v| v.transform_keys(&:to_sym) }
|
|
428
424
|
else
|
|
429
425
|
dataset
|
|
430
426
|
end
|
|
@@ -434,7 +430,7 @@ module Disco
|
|
|
434
430
|
@implicit = obj["implicit"]
|
|
435
431
|
@user_map = obj["user_ids"].map.with_index.to_h
|
|
436
432
|
@item_map = obj["item_ids"].map.with_index.to_h
|
|
437
|
-
@rated = obj["rated"].map
|
|
433
|
+
@rated = obj["rated"].map { |r| Set.new(r) }
|
|
438
434
|
@global_mean = obj["global_mean"].to_f
|
|
439
435
|
@factors = obj["factors"].to_i
|
|
440
436
|
@user_factors = Numo::SFloat.from_binary(obj["user_factors"].unpack1("m0"), [@user_map.size, @factors])
|
data/lib/disco/version.rb
CHANGED
data/lib/disco.rb
CHANGED
metadata
CHANGED
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: disco
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.
|
|
4
|
+
version: 1.0.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Andrew Kane
|
|
8
8
|
bindir: bin
|
|
9
9
|
cert_chain: []
|
|
10
|
-
date:
|
|
10
|
+
date: 1980-01-02 00:00:00.000000000 Z
|
|
11
11
|
dependencies:
|
|
12
12
|
- !ruby/object:Gem::Dependency
|
|
13
13
|
name: libmf
|
|
@@ -15,28 +15,28 @@ dependencies:
|
|
|
15
15
|
requirements:
|
|
16
16
|
- - ">="
|
|
17
17
|
- !ruby/object:Gem::Version
|
|
18
|
-
version: '0.
|
|
18
|
+
version: '0.5'
|
|
19
19
|
type: :runtime
|
|
20
20
|
prerelease: false
|
|
21
21
|
version_requirements: !ruby/object:Gem::Requirement
|
|
22
22
|
requirements:
|
|
23
23
|
- - ">="
|
|
24
24
|
- !ruby/object:Gem::Version
|
|
25
|
-
version: '0.
|
|
25
|
+
version: '0.5'
|
|
26
26
|
- !ruby/object:Gem::Dependency
|
|
27
|
-
name: numo-narray
|
|
27
|
+
name: numo-narray-alt
|
|
28
28
|
requirement: !ruby/object:Gem::Requirement
|
|
29
29
|
requirements:
|
|
30
30
|
- - ">="
|
|
31
31
|
- !ruby/object:Gem::Version
|
|
32
|
-
version: 0.
|
|
32
|
+
version: '0.10'
|
|
33
33
|
type: :runtime
|
|
34
34
|
prerelease: false
|
|
35
35
|
version_requirements: !ruby/object:Gem::Requirement
|
|
36
36
|
requirements:
|
|
37
37
|
- - ">="
|
|
38
38
|
- !ruby/object:Gem::Version
|
|
39
|
-
version: 0.
|
|
39
|
+
version: '0.10'
|
|
40
40
|
email: andrew@ankane.org
|
|
41
41
|
executables: []
|
|
42
42
|
extensions: []
|
|
@@ -66,14 +66,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
|
66
66
|
requirements:
|
|
67
67
|
- - ">="
|
|
68
68
|
- !ruby/object:Gem::Version
|
|
69
|
-
version: '3.
|
|
69
|
+
version: '3.3'
|
|
70
70
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
71
71
|
requirements:
|
|
72
72
|
- - ">="
|
|
73
73
|
- !ruby/object:Gem::Version
|
|
74
74
|
version: '0'
|
|
75
75
|
requirements: []
|
|
76
|
-
rubygems_version:
|
|
76
|
+
rubygems_version: 4.0.6
|
|
77
77
|
specification_version: 4
|
|
78
78
|
summary: Recommendations for Ruby and Rails using collaborative filtering
|
|
79
79
|
test_files: []
|