cmfrec 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +11 -0
- data/LICENSE.txt +1 -1
- data/README.md +8 -8
- data/lib/cmfrec/data.rb +12 -15
- data/lib/cmfrec/recommender.rb +94 -75
- data/lib/cmfrec/version.rb +1 -1
- metadata +19 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d3cc7d70530eefc7b13324753b454d03573da7d832c46cb4dee2ab9213eafcdd
|
4
|
+
data.tar.gz: 662af2ec4ab1a1bd33a39c18773d0c48967422286a2fa137960772b7e67d437a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8fd6f1f8f0bd7d7c870c28fb57a0cec89aacf2d27aed53b5d68fb6935f5071dbe73931a5ff776f4a864f0cc91a17c793eabfe2a2b21f9b368a4c36ada5cb929d
|
7
|
+
data.tar.gz: 116d26ddafeeb439ef0895e30805afa0d2d2a453aeb369cf7122f13f5bf3ad457dac65c974d26debc84f353baf5f0c889c559e56620ef7458af4968ee9f5262a
|
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,14 @@
|
|
1
|
+
## 0.3.0 (2024-10-23)
|
2
|
+
|
3
|
+
- Changed dataset directory to match XDG Base Directory Specification
|
4
|
+
- Removed dependency on `csv` gem for `load_movielens`
|
5
|
+
- Dropped support for marshal serialization
|
6
|
+
- Dropped support for Ruby < 3.1
|
7
|
+
|
8
|
+
## 0.2.1 (2022-07-11)
|
9
|
+
|
10
|
+
- Added support for JSON serialization
|
11
|
+
|
1
12
|
## 0.2.0 (2022-06-14)
|
2
13
|
|
3
14
|
- Updated cmfrec to 3.4.2
|
data/LICENSE.txt
CHANGED
data/README.md
CHANGED
@@ -6,7 +6,7 @@
|
|
6
6
|
- Works with explicit and implicit feedback
|
7
7
|
- Uses high-performance matrix factorization
|
8
8
|
|
9
|
-
[](https://github.com/ankane/cmfrec-ruby/actions)
|
10
10
|
|
11
11
|
## Installation
|
12
12
|
|
@@ -82,11 +82,11 @@ Add side information about users, items, or both
|
|
82
82
|
```ruby
|
83
83
|
user_info = [
|
84
84
|
{user_id: 1, cats: 1, dogs: 0},
|
85
|
-
{user_id: 2, cats: 2, dogs: 1}
|
85
|
+
{user_id: 2, cats: 2, dogs: 1}
|
86
86
|
]
|
87
87
|
item_info = [
|
88
88
|
{item_id: 1, genre_comedy: 1, genre_drama: 0},
|
89
|
-
{item_id: 2, genre_comedy: 0, genre_drama: 1}
|
89
|
+
{item_id: 2, genre_comedy: 0, genre_drama: 1}
|
90
90
|
]
|
91
91
|
recommender.fit(ratings, user_info: user_info, item_info: item_info)
|
92
92
|
```
|
@@ -213,17 +213,17 @@ Rover.read_csv("ratings.csv")
|
|
213
213
|
Store the recommender
|
214
214
|
|
215
215
|
```ruby
|
216
|
-
|
217
|
-
File.
|
216
|
+
json = recommender.to_json
|
217
|
+
File.write("recommender.json", json)
|
218
218
|
```
|
219
219
|
|
220
|
-
|
220
|
+
The serialized recommender includes user activity from the training data (to avoid recommending previously rated items), so be sure to protect it. You can save it to a file, database, or any other storage system, or use a tool like [Trove](https://github.com/ankane/trove). Also, user and item IDs should be integers or strings for this.
|
221
221
|
|
222
222
|
Load a recommender
|
223
223
|
|
224
224
|
```ruby
|
225
|
-
|
226
|
-
recommender =
|
225
|
+
json = File.read("recommender.json")
|
226
|
+
recommender = Cmfrec::Recommender.load_json(json)
|
227
227
|
```
|
228
228
|
|
229
229
|
Alternatively, you can store only the factors and use a library like [Neighbor](https://github.com/ankane/neighbor). See the [examples](https://github.com/ankane/neighbor/tree/master/examples) for Disco, which has a similar API. For explicit feedback, you should [disable the bias](#explicit-feedback) with this approach.
|
data/lib/cmfrec/data.rb
CHANGED
@@ -1,8 +1,6 @@
|
|
1
1
|
module Cmfrec
|
2
2
|
module Data
|
3
3
|
def load_movielens
|
4
|
-
require "csv"
|
5
|
-
|
6
4
|
data_path = download_file("ml-100k/u.data", "https://files.grouplens.org/datasets/movielens/ml-100k/u.data",
|
7
5
|
file_hash: "06416e597f82b7342361e41163890c81036900f418ad91315590814211dca490")
|
8
6
|
user_path = download_file("ml-100k/u.user", "https://files.grouplens.org/datasets/movielens/ml-100k/u.user",
|
@@ -10,11 +8,9 @@ module Cmfrec
|
|
10
8
|
item_path = download_file("ml-100k/u.item", "https://files.grouplens.org/datasets/movielens/ml-100k/u.item",
|
11
9
|
file_hash: "553841ebc7de3a0fd0d6b62a204ea30c1e651aacfb2814c7a6584ac52f2c5701")
|
12
10
|
|
13
|
-
# convert u.item to utf-8
|
14
|
-
movies_str = File.read(item_path).encode("UTF-8", "binary", invalid: :replace, undef: :replace, replace: "")
|
15
|
-
|
16
11
|
user_info = []
|
17
|
-
|
12
|
+
File.foreach(user_path) do |line|
|
13
|
+
row = line.split("|")
|
18
14
|
user = {user_id: row[0].to_i}
|
19
15
|
10.times do |i|
|
20
16
|
user[:"region#{i}"] = row[4][0] == i.to_s ? 1 : 0
|
@@ -26,26 +22,28 @@ module Cmfrec
|
|
26
22
|
movies = {}
|
27
23
|
movie_names = {}
|
28
24
|
genres = %w(unknown action adventure animation childrens comedy crime documentary drama fantasy filmnoir horror musical mystery romance scifi thriller war western)
|
29
|
-
|
25
|
+
File.foreach(item_path) do |line|
|
26
|
+
row = line.encode("UTF-8", "ISO-8859-1").split("|")
|
30
27
|
movies[row[0]] = row[1]
|
31
28
|
|
32
29
|
# filter duplicates
|
33
30
|
next if movie_names[row[1]]
|
34
31
|
movie_names[row[1]] = true
|
35
32
|
|
36
|
-
item = {item_id: row[1], year: row[2] ? Date.parse(row[2]).year : 1970}
|
33
|
+
item = {item_id: row[1], year: !row[2].empty? ? Date.parse(row[2]).year : 1970}
|
37
34
|
genres.each_with_index do |genre, i|
|
38
|
-
item[:"genre_#{genre}"] = row[i + 5]
|
35
|
+
item[:"genre_#{genre}"] = row[i + 5].to_i
|
39
36
|
end
|
40
37
|
item_info << item
|
41
38
|
end
|
42
39
|
|
43
40
|
data = []
|
44
|
-
|
41
|
+
File.foreach(data_path) do |line|
|
42
|
+
row = line.split("\t")
|
45
43
|
data << {
|
46
|
-
user_id: row[0],
|
44
|
+
user_id: row[0].to_i,
|
47
45
|
item_id: movies[row[1]],
|
48
|
-
rating: row[2]
|
46
|
+
rating: row[2].to_i
|
49
47
|
}
|
50
48
|
end
|
51
49
|
|
@@ -60,9 +58,8 @@ module Cmfrec
|
|
60
58
|
require "net/http"
|
61
59
|
require "tmpdir"
|
62
60
|
|
63
|
-
|
64
|
-
|
65
|
-
dest = "#{ENV["HOME"]}/.cmfrec/#{fname}"
|
61
|
+
cache_home = ENV["XDG_CACHE_HOME"] || "#{ENV.fetch("HOME")}/.cache"
|
62
|
+
dest = "#{cache_home}/cmfrec/#{fname}"
|
66
63
|
FileUtils.mkdir_p(File.dirname(dest))
|
67
64
|
|
68
65
|
return dest if File.exist?(dest)
|
data/lib/cmfrec/recommender.rb
CHANGED
@@ -249,6 +249,68 @@ module Cmfrec
|
|
249
249
|
similar(user_id, @user_map, user_factors, count, user_index)
|
250
250
|
end
|
251
251
|
|
252
|
+
def to_json
|
253
|
+
require "base64"
|
254
|
+
require "json"
|
255
|
+
|
256
|
+
obj = {
|
257
|
+
implicit: @implicit
|
258
|
+
}
|
259
|
+
|
260
|
+
# options
|
261
|
+
obj[:factors] = @k
|
262
|
+
obj[:epochs] = @niter
|
263
|
+
obj[:verbose] = @verbose
|
264
|
+
|
265
|
+
# factors
|
266
|
+
obj[:user_ids] = @user_map.keys
|
267
|
+
obj[:item_ids] = @item_map.keys
|
268
|
+
obj[:rated] = @user_map.map { |_, u| (@rated[u] || {}).keys }
|
269
|
+
obj[:user_factors] = json_dump_ptr(@a)
|
270
|
+
obj[:item_factors] = json_dump_ptr(@b)
|
271
|
+
|
272
|
+
# bias
|
273
|
+
obj[:user_bias] = json_dump_ptr(@bias_a)
|
274
|
+
obj[:item_bias] = json_dump_ptr(@bias_b)
|
275
|
+
|
276
|
+
# mean
|
277
|
+
obj[:global_mean] = @global_mean
|
278
|
+
|
279
|
+
unless (@user_info_map.keys + @item_info_map.keys).all? { |v| v.is_a?(Symbol) }
|
280
|
+
raise "Side info keys must be symbols to save"
|
281
|
+
end
|
282
|
+
|
283
|
+
# side info
|
284
|
+
obj[:user_info_ids] = @user_info_map.keys
|
285
|
+
obj[:item_info_ids] = @item_info_map.keys
|
286
|
+
obj[:user_info_factors] = json_dump_ptr(@c)
|
287
|
+
obj[:item_info_factors] = json_dump_ptr(@d)
|
288
|
+
|
289
|
+
# implicit features
|
290
|
+
obj[:add_implicit_features] = @add_implicit_features
|
291
|
+
obj[:user_factors_implicit] = json_dump_ptr(@ai)
|
292
|
+
obj[:item_factors_implicit] = json_dump_ptr(@bi)
|
293
|
+
|
294
|
+
unless @implicit
|
295
|
+
obj[:min_rating] = @min_rating
|
296
|
+
obj[:max_rating] = @max_rating
|
297
|
+
end
|
298
|
+
|
299
|
+
obj[:user_means] = json_dump_ptr(@u_colmeans)
|
300
|
+
|
301
|
+
JSON.generate(obj)
|
302
|
+
end
|
303
|
+
|
304
|
+
def self.load_json(json)
|
305
|
+
require "json"
|
306
|
+
|
307
|
+
obj = JSON.parse(json)
|
308
|
+
|
309
|
+
recommender = new
|
310
|
+
recommender.send(:json_load, obj)
|
311
|
+
recommender
|
312
|
+
end
|
313
|
+
|
252
314
|
private
|
253
315
|
|
254
316
|
def user_index
|
@@ -452,7 +514,7 @@ module Cmfrec
|
|
452
514
|
nil, #precomputedBiTBi,
|
453
515
|
nil, #precomputedTransCtCinvCt,
|
454
516
|
nil, #precomputedCtCw
|
455
|
-
nil
|
517
|
+
nil #precomputedCtUbias
|
456
518
|
]
|
457
519
|
check_status FFI.fit_collective_explicit_als(*fiddle_args(args))
|
458
520
|
|
@@ -749,103 +811,60 @@ module Cmfrec
|
|
749
811
|
@finalize_chol = false
|
750
812
|
end
|
751
813
|
|
752
|
-
def
|
753
|
-
ptr.to_s(ptr.size) if ptr
|
814
|
+
def json_dump_ptr(ptr)
|
815
|
+
Base64.strict_encode64(ptr.to_s(ptr.size)) if ptr
|
754
816
|
end
|
755
817
|
|
756
|
-
def
|
757
|
-
Fiddle::Pointer[str] if str
|
818
|
+
def json_load_ptr(str)
|
819
|
+
Fiddle::Pointer[Base64.strict_decode64(str)] if str
|
758
820
|
end
|
759
821
|
|
760
|
-
def
|
761
|
-
|
762
|
-
implicit: @implicit
|
763
|
-
}
|
764
|
-
|
765
|
-
# options
|
766
|
-
obj[:factors] = @k
|
767
|
-
obj[:epochs] = @niter
|
768
|
-
obj[:verbose] = @verbose
|
769
|
-
|
770
|
-
# factors
|
771
|
-
obj[:user_map] = @user_map
|
772
|
-
obj[:item_map] = @item_map
|
773
|
-
obj[:rated] = @rated
|
774
|
-
obj[:user_factors] = dump_ptr(@a)
|
775
|
-
obj[:item_factors] = dump_ptr(@b)
|
776
|
-
|
777
|
-
# bias
|
778
|
-
obj[:user_bias] = dump_ptr(@bias_a)
|
779
|
-
obj[:item_bias] = dump_ptr(@bias_b)
|
780
|
-
|
781
|
-
# mean
|
782
|
-
obj[:global_mean] = @global_mean
|
783
|
-
|
784
|
-
# side info
|
785
|
-
obj[:user_info_map] = @user_info_map
|
786
|
-
obj[:item_info_map] = @item_info_map
|
787
|
-
obj[:user_info_factors] = dump_ptr(@c)
|
788
|
-
obj[:item_info_factors] = dump_ptr(@d)
|
789
|
-
|
790
|
-
# implicit features
|
791
|
-
obj[:add_implicit_features] = @add_implicit_features
|
792
|
-
obj[:user_factors_implicit] = dump_ptr(@ai)
|
793
|
-
obj[:item_factors_implicit] = dump_ptr(@bi)
|
794
|
-
|
795
|
-
unless @implicit
|
796
|
-
obj[:min_rating] = @min_rating
|
797
|
-
obj[:max_rating] = @max_rating
|
798
|
-
end
|
799
|
-
|
800
|
-
obj[:user_means] = dump_ptr(@u_colmeans)
|
801
|
-
|
802
|
-
obj
|
803
|
-
end
|
822
|
+
def json_load(obj)
|
823
|
+
require "base64"
|
804
824
|
|
805
|
-
|
806
|
-
@implicit = obj[:implicit]
|
825
|
+
@implicit = obj["implicit"]
|
807
826
|
|
808
827
|
# options
|
809
828
|
set_params(
|
810
|
-
k: obj[
|
811
|
-
niter: obj[
|
812
|
-
verbose: obj[
|
813
|
-
user_bias: !obj[
|
814
|
-
item_bias: !obj[
|
815
|
-
add_implicit_features: obj[
|
829
|
+
k: obj["factors"],
|
830
|
+
niter: obj["epochs"],
|
831
|
+
verbose: obj["verbose"],
|
832
|
+
user_bias: !obj["user_bias"].nil?,
|
833
|
+
item_bias: !obj["item_bias"].nil?,
|
834
|
+
add_implicit_features: obj["add_implicit_features"]
|
816
835
|
)
|
817
836
|
|
818
837
|
# factors
|
819
|
-
@user_map = obj[
|
820
|
-
@item_map = obj[
|
821
|
-
@rated = obj[
|
822
|
-
@a =
|
823
|
-
@b =
|
838
|
+
@user_map = obj["user_ids"].map.with_index.to_h
|
839
|
+
@item_map = obj["item_ids"].map.with_index.to_h
|
840
|
+
@rated = obj["rated"].map.with_index.to_h { |r, i| [i, r.to_h { |v| [v, true] }] }
|
841
|
+
@a = json_load_ptr(obj["user_factors"])
|
842
|
+
@b = json_load_ptr(obj["item_factors"])
|
824
843
|
|
825
844
|
# bias
|
826
|
-
@bias_a =
|
827
|
-
@bias_b =
|
845
|
+
@bias_a = json_load_ptr(obj["user_bias"])
|
846
|
+
@bias_b = json_load_ptr(obj["item_bias"])
|
828
847
|
|
829
848
|
# mean
|
830
|
-
@global_mean = obj[
|
849
|
+
@global_mean = obj["global_mean"]
|
831
850
|
|
832
851
|
# side info
|
833
|
-
@user_info_map = obj[
|
834
|
-
@item_info_map = obj[
|
835
|
-
@c =
|
836
|
-
@d =
|
852
|
+
@user_info_map = obj["user_info_ids"].map(&:to_sym).map.with_index.to_h
|
853
|
+
@item_info_map = obj["item_info_ids"].map(&:to_sym).map.with_index.to_h
|
854
|
+
@c = json_load_ptr(obj["user_info_factors"])
|
855
|
+
@d = json_load_ptr(obj["item_info_factors"])
|
837
856
|
|
838
857
|
# implicit features
|
839
|
-
@add_implicit_features = obj[
|
840
|
-
@ai =
|
841
|
-
@bi =
|
858
|
+
@add_implicit_features = obj["add_implicit_features"]
|
859
|
+
@ai = json_load_ptr(obj["user_factors_implicit"])
|
860
|
+
@bi = json_load_ptr(obj["item_factors_implicit"])
|
842
861
|
|
843
862
|
unless @implicit
|
844
|
-
@min_rating = obj[
|
845
|
-
@max_rating = obj[
|
863
|
+
@min_rating = obj["min_rating"]
|
864
|
+
@max_rating = obj["max_rating"]
|
846
865
|
end
|
847
866
|
|
848
|
-
@u_colmeans =
|
867
|
+
@u_colmeans = json_load_ptr(obj["user_means"])
|
849
868
|
|
850
869
|
@m = @user_map.size
|
851
870
|
@n = @item_map.size
|
data/lib/cmfrec/version.rb
CHANGED
metadata
CHANGED
@@ -1,15 +1,29 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cmfrec
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Kane
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
12
|
-
dependencies:
|
11
|
+
date: 2024-10-23 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: fiddle
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ">="
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ">="
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
13
27
|
description:
|
14
28
|
email: andrew@ankane.org
|
15
29
|
executables: []
|
@@ -45,14 +59,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
45
59
|
requirements:
|
46
60
|
- - ">="
|
47
61
|
- !ruby/object:Gem::Version
|
48
|
-
version: '
|
62
|
+
version: '3.1'
|
49
63
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
50
64
|
requirements:
|
51
65
|
- - ">="
|
52
66
|
- !ruby/object:Gem::Version
|
53
67
|
version: '0'
|
54
68
|
requirements: []
|
55
|
-
rubygems_version: 3.
|
69
|
+
rubygems_version: 3.5.16
|
56
70
|
signing_key:
|
57
71
|
specification_version: 4
|
58
72
|
summary: Recommendations for Ruby using collective matrix factorization
|