cmfrec 0.2.0 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +11 -0
- data/LICENSE.txt +1 -1
- data/README.md +8 -8
- data/lib/cmfrec/data.rb +12 -15
- data/lib/cmfrec/recommender.rb +94 -75
- data/lib/cmfrec/version.rb +1 -1
- metadata +19 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d3cc7d70530eefc7b13324753b454d03573da7d832c46cb4dee2ab9213eafcdd
|
4
|
+
data.tar.gz: 662af2ec4ab1a1bd33a39c18773d0c48967422286a2fa137960772b7e67d437a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8fd6f1f8f0bd7d7c870c28fb57a0cec89aacf2d27aed53b5d68fb6935f5071dbe73931a5ff776f4a864f0cc91a17c793eabfe2a2b21f9b368a4c36ada5cb929d
|
7
|
+
data.tar.gz: 116d26ddafeeb439ef0895e30805afa0d2d2a453aeb369cf7122f13f5bf3ad457dac65c974d26debc84f353baf5f0c889c559e56620ef7458af4968ee9f5262a
|
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,14 @@
|
|
1
|
+
## 0.3.0 (2024-10-23)
|
2
|
+
|
3
|
+
- Changed dataset directory to match XDG Base Directory Specification
|
4
|
+
- Removed dependency on `csv` gem for `load_movielens`
|
5
|
+
- Dropped support for marshal serialization
|
6
|
+
- Dropped support for Ruby < 3.1
|
7
|
+
|
8
|
+
## 0.2.1 (2022-07-11)
|
9
|
+
|
10
|
+
- Added support for JSON serialization
|
11
|
+
|
1
12
|
## 0.2.0 (2022-06-14)
|
2
13
|
|
3
14
|
- Updated cmfrec to 3.4.2
|
data/LICENSE.txt
CHANGED
data/README.md
CHANGED
@@ -6,7 +6,7 @@
|
|
6
6
|
- Works with explicit and implicit feedback
|
7
7
|
- Uses high-performance matrix factorization
|
8
8
|
|
9
|
-
[![Build Status](https://github.com/ankane/cmfrec-ruby/workflows/build/badge.svg
|
9
|
+
[![Build Status](https://github.com/ankane/cmfrec-ruby/actions/workflows/build.yml/badge.svg)](https://github.com/ankane/cmfrec-ruby/actions)
|
10
10
|
|
11
11
|
## Installation
|
12
12
|
|
@@ -82,11 +82,11 @@ Add side information about users, items, or both
|
|
82
82
|
```ruby
|
83
83
|
user_info = [
|
84
84
|
{user_id: 1, cats: 1, dogs: 0},
|
85
|
-
{user_id: 2, cats: 2, dogs: 1}
|
85
|
+
{user_id: 2, cats: 2, dogs: 1}
|
86
86
|
]
|
87
87
|
item_info = [
|
88
88
|
{item_id: 1, genre_comedy: 1, genre_drama: 0},
|
89
|
-
{item_id: 2, genre_comedy: 0, genre_drama: 1}
|
89
|
+
{item_id: 2, genre_comedy: 0, genre_drama: 1}
|
90
90
|
]
|
91
91
|
recommender.fit(ratings, user_info: user_info, item_info: item_info)
|
92
92
|
```
|
@@ -213,17 +213,17 @@ Rover.read_csv("ratings.csv")
|
|
213
213
|
Store the recommender
|
214
214
|
|
215
215
|
```ruby
|
216
|
-
|
217
|
-
File.
|
216
|
+
json = recommender.to_json
|
217
|
+
File.write("recommender.json", json)
|
218
218
|
```
|
219
219
|
|
220
|
-
|
220
|
+
The serialized recommender includes user activity from the training data (to avoid recommending previously rated items), so be sure to protect it. You can save it to a file, database, or any other storage system, or use a tool like [Trove](https://github.com/ankane/trove). Also, user and item IDs should be integers or strings for this.
|
221
221
|
|
222
222
|
Load a recommender
|
223
223
|
|
224
224
|
```ruby
|
225
|
-
|
226
|
-
recommender =
|
225
|
+
json = File.read("recommender.json")
|
226
|
+
recommender = Cmfrec::Recommender.load_json(json)
|
227
227
|
```
|
228
228
|
|
229
229
|
Alternatively, you can store only the factors and use a library like [Neighbor](https://github.com/ankane/neighbor). See the [examples](https://github.com/ankane/neighbor/tree/master/examples) for Disco, which has a similar API. For explicit feedback, you should [disable the bias](#explicit-feedback) with this approach.
|
data/lib/cmfrec/data.rb
CHANGED
@@ -1,8 +1,6 @@
|
|
1
1
|
module Cmfrec
|
2
2
|
module Data
|
3
3
|
def load_movielens
|
4
|
-
require "csv"
|
5
|
-
|
6
4
|
data_path = download_file("ml-100k/u.data", "https://files.grouplens.org/datasets/movielens/ml-100k/u.data",
|
7
5
|
file_hash: "06416e597f82b7342361e41163890c81036900f418ad91315590814211dca490")
|
8
6
|
user_path = download_file("ml-100k/u.user", "https://files.grouplens.org/datasets/movielens/ml-100k/u.user",
|
@@ -10,11 +8,9 @@ module Cmfrec
|
|
10
8
|
item_path = download_file("ml-100k/u.item", "https://files.grouplens.org/datasets/movielens/ml-100k/u.item",
|
11
9
|
file_hash: "553841ebc7de3a0fd0d6b62a204ea30c1e651aacfb2814c7a6584ac52f2c5701")
|
12
10
|
|
13
|
-
# convert u.item to utf-8
|
14
|
-
movies_str = File.read(item_path).encode("UTF-8", "binary", invalid: :replace, undef: :replace, replace: "")
|
15
|
-
|
16
11
|
user_info = []
|
17
|
-
|
12
|
+
File.foreach(user_path) do |line|
|
13
|
+
row = line.split("|")
|
18
14
|
user = {user_id: row[0].to_i}
|
19
15
|
10.times do |i|
|
20
16
|
user[:"region#{i}"] = row[4][0] == i.to_s ? 1 : 0
|
@@ -26,26 +22,28 @@ module Cmfrec
|
|
26
22
|
movies = {}
|
27
23
|
movie_names = {}
|
28
24
|
genres = %w(unknown action adventure animation childrens comedy crime documentary drama fantasy filmnoir horror musical mystery romance scifi thriller war western)
|
29
|
-
|
25
|
+
File.foreach(item_path) do |line|
|
26
|
+
row = line.encode("UTF-8", "ISO-8859-1").split("|")
|
30
27
|
movies[row[0]] = row[1]
|
31
28
|
|
32
29
|
# filter duplicates
|
33
30
|
next if movie_names[row[1]]
|
34
31
|
movie_names[row[1]] = true
|
35
32
|
|
36
|
-
item = {item_id: row[1], year: row[2] ? Date.parse(row[2]).year : 1970}
|
33
|
+
item = {item_id: row[1], year: !row[2].empty? ? Date.parse(row[2]).year : 1970}
|
37
34
|
genres.each_with_index do |genre, i|
|
38
|
-
item[:"genre_#{genre}"] = row[i + 5]
|
35
|
+
item[:"genre_#{genre}"] = row[i + 5].to_i
|
39
36
|
end
|
40
37
|
item_info << item
|
41
38
|
end
|
42
39
|
|
43
40
|
data = []
|
44
|
-
|
41
|
+
File.foreach(data_path) do |line|
|
42
|
+
row = line.split("\t")
|
45
43
|
data << {
|
46
|
-
user_id: row[0],
|
44
|
+
user_id: row[0].to_i,
|
47
45
|
item_id: movies[row[1]],
|
48
|
-
rating: row[2]
|
46
|
+
rating: row[2].to_i
|
49
47
|
}
|
50
48
|
end
|
51
49
|
|
@@ -60,9 +58,8 @@ module Cmfrec
|
|
60
58
|
require "net/http"
|
61
59
|
require "tmpdir"
|
62
60
|
|
63
|
-
|
64
|
-
|
65
|
-
dest = "#{ENV["HOME"]}/.cmfrec/#{fname}"
|
61
|
+
cache_home = ENV["XDG_CACHE_HOME"] || "#{ENV.fetch("HOME")}/.cache"
|
62
|
+
dest = "#{cache_home}/cmfrec/#{fname}"
|
66
63
|
FileUtils.mkdir_p(File.dirname(dest))
|
67
64
|
|
68
65
|
return dest if File.exist?(dest)
|
data/lib/cmfrec/recommender.rb
CHANGED
@@ -249,6 +249,68 @@ module Cmfrec
|
|
249
249
|
similar(user_id, @user_map, user_factors, count, user_index)
|
250
250
|
end
|
251
251
|
|
252
|
+
def to_json
|
253
|
+
require "base64"
|
254
|
+
require "json"
|
255
|
+
|
256
|
+
obj = {
|
257
|
+
implicit: @implicit
|
258
|
+
}
|
259
|
+
|
260
|
+
# options
|
261
|
+
obj[:factors] = @k
|
262
|
+
obj[:epochs] = @niter
|
263
|
+
obj[:verbose] = @verbose
|
264
|
+
|
265
|
+
# factors
|
266
|
+
obj[:user_ids] = @user_map.keys
|
267
|
+
obj[:item_ids] = @item_map.keys
|
268
|
+
obj[:rated] = @user_map.map { |_, u| (@rated[u] || {}).keys }
|
269
|
+
obj[:user_factors] = json_dump_ptr(@a)
|
270
|
+
obj[:item_factors] = json_dump_ptr(@b)
|
271
|
+
|
272
|
+
# bias
|
273
|
+
obj[:user_bias] = json_dump_ptr(@bias_a)
|
274
|
+
obj[:item_bias] = json_dump_ptr(@bias_b)
|
275
|
+
|
276
|
+
# mean
|
277
|
+
obj[:global_mean] = @global_mean
|
278
|
+
|
279
|
+
unless (@user_info_map.keys + @item_info_map.keys).all? { |v| v.is_a?(Symbol) }
|
280
|
+
raise "Side info keys must be symbols to save"
|
281
|
+
end
|
282
|
+
|
283
|
+
# side info
|
284
|
+
obj[:user_info_ids] = @user_info_map.keys
|
285
|
+
obj[:item_info_ids] = @item_info_map.keys
|
286
|
+
obj[:user_info_factors] = json_dump_ptr(@c)
|
287
|
+
obj[:item_info_factors] = json_dump_ptr(@d)
|
288
|
+
|
289
|
+
# implicit features
|
290
|
+
obj[:add_implicit_features] = @add_implicit_features
|
291
|
+
obj[:user_factors_implicit] = json_dump_ptr(@ai)
|
292
|
+
obj[:item_factors_implicit] = json_dump_ptr(@bi)
|
293
|
+
|
294
|
+
unless @implicit
|
295
|
+
obj[:min_rating] = @min_rating
|
296
|
+
obj[:max_rating] = @max_rating
|
297
|
+
end
|
298
|
+
|
299
|
+
obj[:user_means] = json_dump_ptr(@u_colmeans)
|
300
|
+
|
301
|
+
JSON.generate(obj)
|
302
|
+
end
|
303
|
+
|
304
|
+
def self.load_json(json)
|
305
|
+
require "json"
|
306
|
+
|
307
|
+
obj = JSON.parse(json)
|
308
|
+
|
309
|
+
recommender = new
|
310
|
+
recommender.send(:json_load, obj)
|
311
|
+
recommender
|
312
|
+
end
|
313
|
+
|
252
314
|
private
|
253
315
|
|
254
316
|
def user_index
|
@@ -452,7 +514,7 @@ module Cmfrec
|
|
452
514
|
nil, #precomputedBiTBi,
|
453
515
|
nil, #precomputedTransCtCinvCt,
|
454
516
|
nil, #precomputedCtCw
|
455
|
-
nil
|
517
|
+
nil #precomputedCtUbias
|
456
518
|
]
|
457
519
|
check_status FFI.fit_collective_explicit_als(*fiddle_args(args))
|
458
520
|
|
@@ -749,103 +811,60 @@ module Cmfrec
|
|
749
811
|
@finalize_chol = false
|
750
812
|
end
|
751
813
|
|
752
|
-
def dump_ptr(ptr)
|
753
|
-
ptr.to_s(ptr.size) if ptr
|
814
|
+
def json_dump_ptr(ptr)
|
815
|
+
Base64.strict_encode64(ptr.to_s(ptr.size)) if ptr
|
754
816
|
end
|
755
817
|
|
756
|
-
def load_ptr(str)
|
757
|
-
Fiddle::Pointer[str] if str
|
818
|
+
def json_load_ptr(str)
|
819
|
+
Fiddle::Pointer[Base64.strict_decode64(str)] if str
|
758
820
|
end
|
759
821
|
|
760
|
-
def marshal_dump
|
761
|
-
obj = {
762
|
-
implicit: @implicit
|
763
|
-
}
|
764
|
-
|
765
|
-
# options
|
766
|
-
obj[:factors] = @k
|
767
|
-
obj[:epochs] = @niter
|
768
|
-
obj[:verbose] = @verbose
|
769
|
-
|
770
|
-
# factors
|
771
|
-
obj[:user_map] = @user_map
|
772
|
-
obj[:item_map] = @item_map
|
773
|
-
obj[:rated] = @rated
|
774
|
-
obj[:user_factors] = dump_ptr(@a)
|
775
|
-
obj[:item_factors] = dump_ptr(@b)
|
776
|
-
|
777
|
-
# bias
|
778
|
-
obj[:user_bias] = dump_ptr(@bias_a)
|
779
|
-
obj[:item_bias] = dump_ptr(@bias_b)
|
780
|
-
|
781
|
-
# mean
|
782
|
-
obj[:global_mean] = @global_mean
|
783
|
-
|
784
|
-
# side info
|
785
|
-
obj[:user_info_map] = @user_info_map
|
786
|
-
obj[:item_info_map] = @item_info_map
|
787
|
-
obj[:user_info_factors] = dump_ptr(@c)
|
788
|
-
obj[:item_info_factors] = dump_ptr(@d)
|
789
|
-
|
790
|
-
# implicit features
|
791
|
-
obj[:add_implicit_features] = @add_implicit_features
|
792
|
-
obj[:user_factors_implicit] = dump_ptr(@ai)
|
793
|
-
obj[:item_factors_implicit] = dump_ptr(@bi)
|
794
|
-
|
795
|
-
unless @implicit
|
796
|
-
obj[:min_rating] = @min_rating
|
797
|
-
obj[:max_rating] = @max_rating
|
798
|
-
end
|
799
|
-
|
800
|
-
obj[:user_means] = dump_ptr(@u_colmeans)
|
801
|
-
|
802
|
-
obj
|
803
|
-
end
|
822
|
+
def json_load(obj)
|
823
|
+
require "base64"
|
804
824
|
|
805
|
-
|
806
|
-
@implicit = obj[:implicit]
|
825
|
+
@implicit = obj["implicit"]
|
807
826
|
|
808
827
|
# options
|
809
828
|
set_params(
|
810
|
-
k: obj[
|
811
|
-
niter: obj[
|
812
|
-
verbose: obj[
|
813
|
-
user_bias: !obj[
|
814
|
-
item_bias: !obj[
|
815
|
-
add_implicit_features: obj[
|
829
|
+
k: obj["factors"],
|
830
|
+
niter: obj["epochs"],
|
831
|
+
verbose: obj["verbose"],
|
832
|
+
user_bias: !obj["user_bias"].nil?,
|
833
|
+
item_bias: !obj["item_bias"].nil?,
|
834
|
+
add_implicit_features: obj["add_implicit_features"]
|
816
835
|
)
|
817
836
|
|
818
837
|
# factors
|
819
|
-
@user_map = obj[
|
820
|
-
@item_map = obj[
|
821
|
-
@rated = obj[
|
822
|
-
@a =
|
823
|
-
@b =
|
838
|
+
@user_map = obj["user_ids"].map.with_index.to_h
|
839
|
+
@item_map = obj["item_ids"].map.with_index.to_h
|
840
|
+
@rated = obj["rated"].map.with_index.to_h { |r, i| [i, r.to_h { |v| [v, true] }] }
|
841
|
+
@a = json_load_ptr(obj["user_factors"])
|
842
|
+
@b = json_load_ptr(obj["item_factors"])
|
824
843
|
|
825
844
|
# bias
|
826
|
-
@bias_a =
|
827
|
-
@bias_b =
|
845
|
+
@bias_a = json_load_ptr(obj["user_bias"])
|
846
|
+
@bias_b = json_load_ptr(obj["item_bias"])
|
828
847
|
|
829
848
|
# mean
|
830
|
-
@global_mean = obj[
|
849
|
+
@global_mean = obj["global_mean"]
|
831
850
|
|
832
851
|
# side info
|
833
|
-
@user_info_map = obj[
|
834
|
-
@item_info_map = obj[
|
835
|
-
@c =
|
836
|
-
@d =
|
852
|
+
@user_info_map = obj["user_info_ids"].map(&:to_sym).map.with_index.to_h
|
853
|
+
@item_info_map = obj["item_info_ids"].map(&:to_sym).map.with_index.to_h
|
854
|
+
@c = json_load_ptr(obj["user_info_factors"])
|
855
|
+
@d = json_load_ptr(obj["item_info_factors"])
|
837
856
|
|
838
857
|
# implicit features
|
839
|
-
@add_implicit_features = obj[
|
840
|
-
@ai =
|
841
|
-
@bi =
|
858
|
+
@add_implicit_features = obj["add_implicit_features"]
|
859
|
+
@ai = json_load_ptr(obj["user_factors_implicit"])
|
860
|
+
@bi = json_load_ptr(obj["item_factors_implicit"])
|
842
861
|
|
843
862
|
unless @implicit
|
844
|
-
@min_rating = obj[
|
845
|
-
@max_rating = obj[
|
863
|
+
@min_rating = obj["min_rating"]
|
864
|
+
@max_rating = obj["max_rating"]
|
846
865
|
end
|
847
866
|
|
848
|
-
@u_colmeans =
|
867
|
+
@u_colmeans = json_load_ptr(obj["user_means"])
|
849
868
|
|
850
869
|
@m = @user_map.size
|
851
870
|
@n = @item_map.size
|
data/lib/cmfrec/version.rb
CHANGED
metadata
CHANGED
@@ -1,15 +1,29 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cmfrec
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.0
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Kane
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
12
|
-
dependencies:
|
11
|
+
date: 2024-10-23 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: fiddle
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ">="
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ">="
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
13
27
|
description:
|
14
28
|
email: andrew@ankane.org
|
15
29
|
executables: []
|
@@ -45,14 +59,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
45
59
|
requirements:
|
46
60
|
- - ">="
|
47
61
|
- !ruby/object:Gem::Version
|
48
|
-
version: '
|
62
|
+
version: '3.1'
|
49
63
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
50
64
|
requirements:
|
51
65
|
- - ">="
|
52
66
|
- !ruby/object:Gem::Version
|
53
67
|
version: '0'
|
54
68
|
requirements: []
|
55
|
-
rubygems_version: 3.
|
69
|
+
rubygems_version: 3.5.16
|
56
70
|
signing_key:
|
57
71
|
specification_version: 4
|
58
72
|
summary: Recommendations for Ruby using collective matrix factorization
|