cmfrec 0.2.0 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: e6dbbc801e415a4f505ffc436be23ccf066d144da072669e782b88c02e14b0f8
4
- data.tar.gz: 3851230f0a4dc4be9fbc24fe81681de0758bdbb583803780f8e07b10741f4bd1
3
+ metadata.gz: d3cc7d70530eefc7b13324753b454d03573da7d832c46cb4dee2ab9213eafcdd
4
+ data.tar.gz: 662af2ec4ab1a1bd33a39c18773d0c48967422286a2fa137960772b7e67d437a
5
5
  SHA512:
6
- metadata.gz: 103c09a7c0e13fca3cb81dc68c667e776fc965a485cf358e0fc8f350a97474b54bdfc0910f4472051020dbe34b8c097908c1024d5fc036ad77d0444372885109
7
- data.tar.gz: b107b36333f714106d981168f24fda48a2a211f288a2dbe01f570adb607b7d6a5215c18df4c67d159bb367d652f3d10faeb4dbc66688eaf117c5b7a1432ee951
6
+ metadata.gz: 8fd6f1f8f0bd7d7c870c28fb57a0cec89aacf2d27aed53b5d68fb6935f5071dbe73931a5ff776f4a864f0cc91a17c793eabfe2a2b21f9b368a4c36ada5cb929d
7
+ data.tar.gz: 116d26ddafeeb439ef0895e30805afa0d2d2a453aeb369cf7122f13f5bf3ad457dac65c974d26debc84f353baf5f0c889c559e56620ef7458af4968ee9f5262a
data/CHANGELOG.md CHANGED
@@ -1,3 +1,14 @@
1
+ ## 0.3.0 (2024-10-23)
2
+
3
+ - Changed dataset directory to match XDG Base Directory Specification
4
+ - Removed dependency on `csv` gem for `load_movielens`
5
+ - Dropped support for marshal serialization
6
+ - Dropped support for Ruby < 3.1
7
+
8
+ ## 0.2.1 (2022-07-11)
9
+
10
+ - Added support for JSON serialization
11
+
1
12
  ## 0.2.0 (2022-06-14)
2
13
 
3
14
  - Updated cmfrec to 3.4.2
data/LICENSE.txt CHANGED
@@ -1,7 +1,7 @@
1
1
  MIT License
2
2
 
3
3
  Copyright (c) 2020 David Cortes
4
- Copyright (c) 2020-2021 Andrew Kane
4
+ Copyright (c) 2020-2024 Andrew Kane
5
5
 
6
6
  All rights reserved.
7
7
 
data/README.md CHANGED
@@ -6,7 +6,7 @@
6
6
  - Works with explicit and implicit feedback
7
7
  - Uses high-performance matrix factorization
8
8
 
9
- [![Build Status](https://github.com/ankane/cmfrec-ruby/workflows/build/badge.svg?branch=master)](https://github.com/ankane/cmfrec-ruby/actions)
9
+ [![Build Status](https://github.com/ankane/cmfrec-ruby/actions/workflows/build.yml/badge.svg)](https://github.com/ankane/cmfrec-ruby/actions)
10
10
 
11
11
  ## Installation
12
12
 
@@ -82,11 +82,11 @@ Add side information about users, items, or both
82
82
  ```ruby
83
83
  user_info = [
84
84
  {user_id: 1, cats: 1, dogs: 0},
85
- {user_id: 2, cats: 2, dogs: 1},
85
+ {user_id: 2, cats: 2, dogs: 1}
86
86
  ]
87
87
  item_info = [
88
88
  {item_id: 1, genre_comedy: 1, genre_drama: 0},
89
- {item_id: 2, genre_comedy: 0, genre_drama: 1},
89
+ {item_id: 2, genre_comedy: 0, genre_drama: 1}
90
90
  ]
91
91
  recommender.fit(ratings, user_info: user_info, item_info: item_info)
92
92
  ```
@@ -213,17 +213,17 @@ Rover.read_csv("ratings.csv")
213
213
  Store the recommender
214
214
 
215
215
  ```ruby
216
- bin = Marshal.dump(recommender)
217
- File.binwrite("recommender.bin", bin)
216
+ json = recommender.to_json
217
+ File.write("recommender.json", json)
218
218
  ```
219
219
 
220
- > You can save it to a file, database, or any other storage system
220
+ The serialized recommender includes user activity from the training data (to avoid recommending previously rated items), so be sure to protect it. You can save it to a file, database, or any other storage system, or use a tool like [Trove](https://github.com/ankane/trove). Also, user and item IDs should be integers or strings so that they can be serialized to JSON and restored correctly.
221
221
 
222
222
  Load a recommender
223
223
 
224
224
  ```ruby
225
- bin = File.binread("recommender.bin")
226
- recommender = Marshal.load(bin)
225
+ json = File.read("recommender.json")
226
+ recommender = Cmfrec::Recommender.load_json(json)
227
227
  ```
228
228
 
229
229
  Alternatively, you can store only the factors and use a library like [Neighbor](https://github.com/ankane/neighbor). See the [examples](https://github.com/ankane/neighbor/tree/master/examples) for Disco, which has a similar API. For explicit feedback, you should [disable the bias](#explicit-feedback) with this approach.
data/lib/cmfrec/data.rb CHANGED
@@ -1,8 +1,6 @@
1
1
  module Cmfrec
2
2
  module Data
3
3
  def load_movielens
4
- require "csv"
5
-
6
4
  data_path = download_file("ml-100k/u.data", "https://files.grouplens.org/datasets/movielens/ml-100k/u.data",
7
5
  file_hash: "06416e597f82b7342361e41163890c81036900f418ad91315590814211dca490")
8
6
  user_path = download_file("ml-100k/u.user", "https://files.grouplens.org/datasets/movielens/ml-100k/u.user",
@@ -10,11 +8,9 @@ module Cmfrec
10
8
  item_path = download_file("ml-100k/u.item", "https://files.grouplens.org/datasets/movielens/ml-100k/u.item",
11
9
  file_hash: "553841ebc7de3a0fd0d6b62a204ea30c1e651aacfb2814c7a6584ac52f2c5701")
12
10
 
13
- # convert u.item to utf-8
14
- movies_str = File.read(item_path).encode("UTF-8", "binary", invalid: :replace, undef: :replace, replace: "")
15
-
16
11
  user_info = []
17
- CSV.foreach(user_path, col_sep: "|") do |row|
12
+ File.foreach(user_path) do |line|
13
+ row = line.split("|")
18
14
  user = {user_id: row[0].to_i}
19
15
  10.times do |i|
20
16
  user[:"region#{i}"] = row[4][0] == i.to_s ? 1 : 0
@@ -26,26 +22,28 @@ module Cmfrec
26
22
  movies = {}
27
23
  movie_names = {}
28
24
  genres = %w(unknown action adventure animation childrens comedy crime documentary drama fantasy filmnoir horror musical mystery romance scifi thriller war western)
29
- CSV.parse(movies_str, col_sep: "|", converters: [:numeric]) do |row|
25
+ File.foreach(item_path) do |line|
26
+ row = line.encode("UTF-8", "ISO-8859-1").split("|")
30
27
  movies[row[0]] = row[1]
31
28
 
32
29
  # filter duplicates
33
30
  next if movie_names[row[1]]
34
31
  movie_names[row[1]] = true
35
32
 
36
- item = {item_id: row[1], year: row[2] ? Date.parse(row[2]).year : 1970}
33
+ item = {item_id: row[1], year: !row[2].empty? ? Date.parse(row[2]).year : 1970}
37
34
  genres.each_with_index do |genre, i|
38
- item[:"genre_#{genre}"] = row[i + 5]
35
+ item[:"genre_#{genre}"] = row[i + 5].to_i
39
36
  end
40
37
  item_info << item
41
38
  end
42
39
 
43
40
  data = []
44
- CSV.foreach(data_path, col_sep: "\t", converters: [:numeric]) do |row|
41
+ File.foreach(data_path) do |line|
42
+ row = line.split("\t")
45
43
  data << {
46
- user_id: row[0],
44
+ user_id: row[0].to_i,
47
45
  item_id: movies[row[1]],
48
- rating: row[2]
46
+ rating: row[2].to_i
49
47
  }
50
48
  end
51
49
 
@@ -60,9 +58,8 @@ module Cmfrec
60
58
  require "net/http"
61
59
  require "tmpdir"
62
60
 
63
- # TODO handle this better
64
- raise "No HOME" unless ENV["HOME"]
65
- dest = "#{ENV["HOME"]}/.cmfrec/#{fname}"
61
+ cache_home = ENV["XDG_CACHE_HOME"] || "#{ENV.fetch("HOME")}/.cache"
62
+ dest = "#{cache_home}/cmfrec/#{fname}"
66
63
  FileUtils.mkdir_p(File.dirname(dest))
67
64
 
68
65
  return dest if File.exist?(dest)
@@ -249,6 +249,68 @@ module Cmfrec
249
249
  similar(user_id, @user_map, user_factors, count, user_index)
250
250
  end
251
251
 
252
+ def to_json
253
+ require "base64"
254
+ require "json"
255
+
256
+ obj = {
257
+ implicit: @implicit
258
+ }
259
+
260
+ # options
261
+ obj[:factors] = @k
262
+ obj[:epochs] = @niter
263
+ obj[:verbose] = @verbose
264
+
265
+ # factors
266
+ obj[:user_ids] = @user_map.keys
267
+ obj[:item_ids] = @item_map.keys
268
+ obj[:rated] = @user_map.map { |_, u| (@rated[u] || {}).keys }
269
+ obj[:user_factors] = json_dump_ptr(@a)
270
+ obj[:item_factors] = json_dump_ptr(@b)
271
+
272
+ # bias
273
+ obj[:user_bias] = json_dump_ptr(@bias_a)
274
+ obj[:item_bias] = json_dump_ptr(@bias_b)
275
+
276
+ # mean
277
+ obj[:global_mean] = @global_mean
278
+
279
+ unless (@user_info_map.keys + @item_info_map.keys).all? { |v| v.is_a?(Symbol) }
280
+ raise "Side info keys must be symbols to save"
281
+ end
282
+
283
+ # side info
284
+ obj[:user_info_ids] = @user_info_map.keys
285
+ obj[:item_info_ids] = @item_info_map.keys
286
+ obj[:user_info_factors] = json_dump_ptr(@c)
287
+ obj[:item_info_factors] = json_dump_ptr(@d)
288
+
289
+ # implicit features
290
+ obj[:add_implicit_features] = @add_implicit_features
291
+ obj[:user_factors_implicit] = json_dump_ptr(@ai)
292
+ obj[:item_factors_implicit] = json_dump_ptr(@bi)
293
+
294
+ unless @implicit
295
+ obj[:min_rating] = @min_rating
296
+ obj[:max_rating] = @max_rating
297
+ end
298
+
299
+ obj[:user_means] = json_dump_ptr(@u_colmeans)
300
+
301
+ JSON.generate(obj)
302
+ end
303
+
304
+ def self.load_json(json)
305
+ require "json"
306
+
307
+ obj = JSON.parse(json)
308
+
309
+ recommender = new
310
+ recommender.send(:json_load, obj)
311
+ recommender
312
+ end
313
+
252
314
  private
253
315
 
254
316
  def user_index
@@ -452,7 +514,7 @@ module Cmfrec
452
514
  nil, #precomputedBiTBi,
453
515
  nil, #precomputedTransCtCinvCt,
454
516
  nil, #precomputedCtCw
455
- nil, #precomputedCtUbias
517
+ nil #precomputedCtUbias
456
518
  ]
457
519
  check_status FFI.fit_collective_explicit_als(*fiddle_args(args))
458
520
 
@@ -749,103 +811,60 @@ module Cmfrec
749
811
  @finalize_chol = false
750
812
  end
751
813
 
752
- def dump_ptr(ptr)
753
- ptr.to_s(ptr.size) if ptr
814
+ def json_dump_ptr(ptr)
815
+ Base64.strict_encode64(ptr.to_s(ptr.size)) if ptr
754
816
  end
755
817
 
756
- def load_ptr(str)
757
- Fiddle::Pointer[str] if str
818
+ def json_load_ptr(str)
819
+ Fiddle::Pointer[Base64.strict_decode64(str)] if str
758
820
  end
759
821
 
760
- def marshal_dump
761
- obj = {
762
- implicit: @implicit
763
- }
764
-
765
- # options
766
- obj[:factors] = @k
767
- obj[:epochs] = @niter
768
- obj[:verbose] = @verbose
769
-
770
- # factors
771
- obj[:user_map] = @user_map
772
- obj[:item_map] = @item_map
773
- obj[:rated] = @rated
774
- obj[:user_factors] = dump_ptr(@a)
775
- obj[:item_factors] = dump_ptr(@b)
776
-
777
- # bias
778
- obj[:user_bias] = dump_ptr(@bias_a)
779
- obj[:item_bias] = dump_ptr(@bias_b)
780
-
781
- # mean
782
- obj[:global_mean] = @global_mean
783
-
784
- # side info
785
- obj[:user_info_map] = @user_info_map
786
- obj[:item_info_map] = @item_info_map
787
- obj[:user_info_factors] = dump_ptr(@c)
788
- obj[:item_info_factors] = dump_ptr(@d)
789
-
790
- # implicit features
791
- obj[:add_implicit_features] = @add_implicit_features
792
- obj[:user_factors_implicit] = dump_ptr(@ai)
793
- obj[:item_factors_implicit] = dump_ptr(@bi)
794
-
795
- unless @implicit
796
- obj[:min_rating] = @min_rating
797
- obj[:max_rating] = @max_rating
798
- end
799
-
800
- obj[:user_means] = dump_ptr(@u_colmeans)
801
-
802
- obj
803
- end
822
+ def json_load(obj)
823
+ require "base64"
804
824
 
805
- def marshal_load(obj)
806
- @implicit = obj[:implicit]
825
+ @implicit = obj["implicit"]
807
826
 
808
827
  # options
809
828
  set_params(
810
- k: obj[:factors],
811
- niter: obj[:epochs],
812
- verbose: obj[:verbose],
813
- user_bias: !obj[:user_bias].nil?,
814
- item_bias: !obj[:item_bias].nil?,
815
- add_implicit_features: obj[:add_implicit_features]
829
+ k: obj["factors"],
830
+ niter: obj["epochs"],
831
+ verbose: obj["verbose"],
832
+ user_bias: !obj["user_bias"].nil?,
833
+ item_bias: !obj["item_bias"].nil?,
834
+ add_implicit_features: obj["add_implicit_features"]
816
835
  )
817
836
 
818
837
  # factors
819
- @user_map = obj[:user_map]
820
- @item_map = obj[:item_map]
821
- @rated = obj[:rated] || {}
822
- @a = load_ptr(obj[:user_factors])
823
- @b = load_ptr(obj[:item_factors])
838
+ @user_map = obj["user_ids"].map.with_index.to_h
839
+ @item_map = obj["item_ids"].map.with_index.to_h
840
+ @rated = obj["rated"].map.with_index.to_h { |r, i| [i, r.to_h { |v| [v, true] }] }
841
+ @a = json_load_ptr(obj["user_factors"])
842
+ @b = json_load_ptr(obj["item_factors"])
824
843
 
825
844
  # bias
826
- @bias_a = load_ptr(obj[:user_bias])
827
- @bias_b = load_ptr(obj[:item_bias])
845
+ @bias_a = json_load_ptr(obj["user_bias"])
846
+ @bias_b = json_load_ptr(obj["item_bias"])
828
847
 
829
848
  # mean
830
- @global_mean = obj[:global_mean]
849
+ @global_mean = obj["global_mean"]
831
850
 
832
851
  # side info
833
- @user_info_map = obj[:user_info_map]
834
- @item_info_map = obj[:item_info_map]
835
- @c = load_ptr(obj[:user_info_factors])
836
- @d = load_ptr(obj[:item_info_factors])
852
+ @user_info_map = obj["user_info_ids"].map(&:to_sym).map.with_index.to_h
853
+ @item_info_map = obj["item_info_ids"].map(&:to_sym).map.with_index.to_h
854
+ @c = json_load_ptr(obj["user_info_factors"])
855
+ @d = json_load_ptr(obj["item_info_factors"])
837
856
 
838
857
  # implicit features
839
- @add_implicit_features = obj[:add_implicit_features]
840
- @ai = load_ptr(obj[:user_factors_implicit])
841
- @bi = load_ptr(obj[:item_factors_implicit])
858
+ @add_implicit_features = obj["add_implicit_features"]
859
+ @ai = json_load_ptr(obj["user_factors_implicit"])
860
+ @bi = json_load_ptr(obj["item_factors_implicit"])
842
861
 
843
862
  unless @implicit
844
- @min_rating = obj[:min_rating]
845
- @max_rating = obj[:max_rating]
863
+ @min_rating = obj["min_rating"]
864
+ @max_rating = obj["max_rating"]
846
865
  end
847
866
 
848
- @u_colmeans = load_ptr(obj[:user_means])
867
+ @u_colmeans = json_load_ptr(obj["user_means"])
849
868
 
850
869
  @m = @user_map.size
851
870
  @n = @item_map.size
@@ -1,3 +1,3 @@
1
1
  module Cmfrec
2
- VERSION = "0.2.0"
2
+ VERSION = "0.3.0"
3
3
  end
metadata CHANGED
@@ -1,15 +1,29 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cmfrec
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2022-06-15 00:00:00.000000000 Z
12
- dependencies: []
11
+ date: 2024-10-23 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: fiddle
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
13
27
  description:
14
28
  email: andrew@ankane.org
15
29
  executables: []
@@ -45,14 +59,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
45
59
  requirements:
46
60
  - - ">="
47
61
  - !ruby/object:Gem::Version
48
- version: '2.7'
62
+ version: '3.1'
49
63
  required_rubygems_version: !ruby/object:Gem::Requirement
50
64
  requirements:
51
65
  - - ">="
52
66
  - !ruby/object:Gem::Version
53
67
  version: '0'
54
68
  requirements: []
55
- rubygems_version: 3.3.7
69
+ rubygems_version: 3.5.16
56
70
  signing_key:
57
71
  specification_version: 4
58
72
  summary: Recommendations for Ruby using collective matrix factorization