cmfrec 0.1.4 → 0.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +21 -0
- data/README.md +19 -14
- data/lib/cmfrec/data.rb +11 -7
- data/lib/cmfrec/recommender.rb +92 -45
- data/lib/cmfrec/version.rb +1 -1
- data/lib/cmfrec.rb +1 -1
- metadata +4 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 8d16ab98cb7de22042eaf353a9d41d0b7a4214a631a373c553f73825418c026a
|
4
|
+
data.tar.gz: 9ab678a9d389b835b4dfd91d14c372d5acfef950bf068ac46d2d879af04f0fcc
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 175d3c91056d2e8734af6961471c98be76e4d5f6b85faaecdfd3b39a220efafa70150983e9d74efb1a1211a29e6c867d6b1d7f482cc34c55500268d29b40158c
|
7
|
+
data.tar.gz: faaed621391ccc7d94f2e6309481b24f7db62fbe86eb6c1bbb35445dde0cabf32c41791af4ebcd25e530bdc1f6319f1b4477b6c4d6ca1bf77353d3a0c4ae8d5c
|
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,24 @@
|
|
1
|
+
## 0.1.7 (2022-03-22)
|
2
|
+
|
3
|
+
- Improved ARM detection
|
4
|
+
- Fixed error with `load_movielens`
|
5
|
+
- Fixed duplicates in `item_info` with `load_movielens`
|
6
|
+
|
7
|
+
## 0.1.6 (2021-08-12)
|
8
|
+
|
9
|
+
- Added `user_ids` and `item_ids` methods
|
10
|
+
- Added `user_id` argument to `user_factors`
|
11
|
+
- Added `item_id` argument to `item_factors`
|
12
|
+
- Added `user_id` argument to `user_bias`
|
13
|
+
- Added `item_id` argument to `item_bias`
|
14
|
+
- Added `item_ids` argument to `new_user_recs`
|
15
|
+
- Fixed order for `user_recs`
|
16
|
+
|
17
|
+
## 0.1.5 (2021-08-10)
|
18
|
+
|
19
|
+
- Fixed issue with `user_recs` and `new_user_recs` returning rated items
|
20
|
+
- Fixed error with `new_user_recs`
|
21
|
+
|
1
22
|
## 0.1.4 (2021-02-04)
|
2
23
|
|
3
24
|
- Added support for saving and loading recommenders
|
data/README.md
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
# cmfrec
|
1
|
+
# cmfrec Ruby
|
2
2
|
|
3
3
|
:fire: Recommendations for Ruby, powered by [cmfrec](https://github.com/david-cortes/cmfrec)
|
4
4
|
|
@@ -6,7 +6,7 @@
|
|
6
6
|
- Works with explicit and implicit feedback
|
7
7
|
- Uses high-performance matrix factorization
|
8
8
|
|
9
|
-
[Build Status](https://github.com/ankane/cmfrec/actions)
|
9
|
+
[Build Status](https://github.com/ankane/cmfrec-ruby/actions)
|
10
10
|
|
11
11
|
## Installation
|
12
12
|
|
@@ -58,8 +58,8 @@ Get recommendations for a new user
|
|
58
58
|
|
59
59
|
```ruby
|
60
60
|
recommender.new_user_recs([
|
61
|
-
{item_id: 1,
|
62
|
-
{item_id: 2,
|
61
|
+
{item_id: 1, rating: 5},
|
62
|
+
{item_id: 2, rating: 3}
|
63
63
|
])
|
64
64
|
```
|
65
65
|
|
@@ -150,11 +150,7 @@ recommender.predict(ratings.last(20000))
|
|
150
150
|
[Ahoy](https://github.com/ankane/ahoy) is a great source for implicit feedback
|
151
151
|
|
152
152
|
```ruby
|
153
|
-
views = Ahoy::Event.
|
154
|
-
where(name: "Viewed post").
|
155
|
-
group(:user_id).
|
156
|
-
group("properties->>'post_id'"). # postgres syntax
|
157
|
-
count
|
153
|
+
views = Ahoy::Event.where(name: "Viewed post").group(:user_id).group_prop(:post_id).count
|
158
154
|
|
159
155
|
data =
|
160
156
|
views.map do |(user_id, post_id), count|
|
@@ -230,8 +226,17 @@ bin = File.binread("recommender.bin")
|
|
230
226
|
recommender = Marshal.load(bin)
|
231
227
|
```
|
232
228
|
|
229
|
+
Alternatively, you can store only the factors and use a library like [Neighbor](https://github.com/ankane/neighbor). See the [examples](https://github.com/ankane/neighbor/tree/master/examples) for Disco, which has a similar API. For explicit feedback, you should [disable the bias](#explicit-feedback) with this approach.
|
230
|
+
|
233
231
|
## Reference
|
234
232
|
|
233
|
+
Get ids
|
234
|
+
|
235
|
+
```ruby
|
236
|
+
recommender.user_ids
|
237
|
+
recommender.item_ids
|
238
|
+
```
|
239
|
+
|
235
240
|
Get the global mean
|
236
241
|
|
237
242
|
```ruby
|
@@ -262,22 +267,22 @@ Cmfrec.ffi_lib = "path/to/cmfrec.dll"
|
|
262
267
|
|
263
268
|
## History
|
264
269
|
|
265
|
-
View the [changelog](https://github.com/ankane/cmfrec/blob/master/CHANGELOG.md)
|
270
|
+
View the [changelog](https://github.com/ankane/cmfrec-ruby/blob/master/CHANGELOG.md)
|
266
271
|
|
267
272
|
## Contributing
|
268
273
|
|
269
274
|
Everyone is encouraged to help improve this project. Here are a few ways you can help:
|
270
275
|
|
271
|
-
- [Report bugs](https://github.com/ankane/cmfrec/issues)
|
272
|
-
- Fix bugs and [submit pull requests](https://github.com/ankane/cmfrec/pulls)
|
276
|
+
- [Report bugs](https://github.com/ankane/cmfrec-ruby/issues)
|
277
|
+
- Fix bugs and [submit pull requests](https://github.com/ankane/cmfrec-ruby/pulls)
|
273
278
|
- Write, clarify, or fix documentation
|
274
279
|
- Suggest or add new features
|
275
280
|
|
276
281
|
To get started with development:
|
277
282
|
|
278
283
|
```sh
|
279
|
-
git clone https://github.com/ankane/cmfrec.git
|
280
|
-
cd cmfrec
|
284
|
+
git clone https://github.com/ankane/cmfrec-ruby.git
|
285
|
+
cd cmfrec-ruby
|
281
286
|
bundle install
|
282
287
|
bundle exec rake vendor:all
|
283
288
|
bundle exec rake test
|
data/lib/cmfrec/data.rb
CHANGED
@@ -3,11 +3,11 @@ module Cmfrec
|
|
3
3
|
def load_movielens
|
4
4
|
require "csv"
|
5
5
|
|
6
|
-
data_path = download_file("ml-100k/u.data", "
|
6
|
+
data_path = download_file("ml-100k/u.data", "https://files.grouplens.org/datasets/movielens/ml-100k/u.data",
|
7
7
|
file_hash: "06416e597f82b7342361e41163890c81036900f418ad91315590814211dca490")
|
8
|
-
user_path = download_file("ml-100k/u.user", "
|
8
|
+
user_path = download_file("ml-100k/u.user", "https://files.grouplens.org/datasets/movielens/ml-100k/u.user",
|
9
9
|
file_hash: "f120e114da2e8cf314fd28f99417c94ae9ddf1cb6db8ce0e4b5995d40e90e62c")
|
10
|
-
item_path = download_file("ml-100k/u.item", "
|
10
|
+
item_path = download_file("ml-100k/u.item", "https://files.grouplens.org/datasets/movielens/ml-100k/u.item",
|
11
11
|
file_hash: "553841ebc7de3a0fd0d6b62a204ea30c1e651aacfb2814c7a6584ac52f2c5701")
|
12
12
|
|
13
13
|
# convert u.item to utf-8
|
@@ -24,8 +24,13 @@ module Cmfrec
|
|
24
24
|
|
25
25
|
item_info = []
|
26
26
|
movies = {}
|
27
|
+
movie_names = {}
|
27
28
|
genres = %w(unknown action adventure animation childrens comedy crime documentary drama fantasy filmnoir horror musical mystery romance scifi thriller war western)
|
28
29
|
CSV.parse(movies_str, col_sep: "|", converters: [:numeric]) do |row|
|
30
|
+
# filter duplicates
|
31
|
+
next if movie_names[row[1]]
|
32
|
+
movie_names[row[1]] = true
|
33
|
+
|
29
34
|
movies[row[0]] = row[1]
|
30
35
|
item = {item_id: row[1], year: row[2] ? Date.parse(row[2]).year : 1970}
|
31
36
|
genres.each_with_index do |genre, i|
|
@@ -49,7 +54,10 @@ module Cmfrec
|
|
49
54
|
private
|
50
55
|
|
51
56
|
def download_file(fname, origin, file_hash:)
|
57
|
+
require "digest"
|
52
58
|
require "fileutils"
|
59
|
+
require "net/http"
|
60
|
+
require "tmpdir"
|
53
61
|
|
54
62
|
# TODO handle this better
|
55
63
|
raise "No HOME" unless ENV["HOME"]
|
@@ -58,10 +66,6 @@ module Cmfrec
|
|
58
66
|
|
59
67
|
return dest if File.exist?(dest)
|
60
68
|
|
61
|
-
require "digest"
|
62
|
-
require "net/http"
|
63
|
-
require "tmpdir"
|
64
|
-
|
65
69
|
temp_path = "#{Dir.tmpdir}/cmfrec-#{Time.now.to_f}" # TODO better name
|
66
70
|
|
67
71
|
digest = Digest::SHA2.new
|
data/lib/cmfrec/recommender.rb
CHANGED
@@ -67,21 +67,10 @@ module Cmfrec
|
|
67
67
|
user = @user_map[user_id]
|
68
68
|
|
69
69
|
if user
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
data = item_ids.map { |v| {user_id: user_id, item_id: v} }
|
75
|
-
scores = predict(data)
|
76
|
-
|
77
|
-
item_ids.zip(scores).map do |item_id, score|
|
78
|
-
{item_id: item_id, score: score}
|
79
|
-
end
|
80
|
-
else
|
81
|
-
a_vec = @a[user * @k * Fiddle::SIZEOF_DOUBLE, @k * Fiddle::SIZEOF_DOUBLE]
|
82
|
-
a_bias = @bias_a ? @bias_a[user * Fiddle::SIZEOF_DOUBLE, Fiddle::SIZEOF_DOUBLE].unpack1("d") : 0
|
83
|
-
top_n(a_vec: a_vec, a_bias: a_bias, count: count)
|
84
|
-
end
|
70
|
+
a_vec = @a[user * @k * Fiddle::SIZEOF_DOUBLE, @k * Fiddle::SIZEOF_DOUBLE]
|
71
|
+
a_bias = @bias_a ? @bias_a[user * Fiddle::SIZEOF_DOUBLE, Fiddle::SIZEOF_DOUBLE].unpack1("d") : 0
|
72
|
+
# @rated[user] will be nil for recommenders saved before 0.1.5
|
73
|
+
top_n(a_vec: a_vec, a_bias: a_bias, count: count, rated: (@rated[user] || {}).keys, item_ids: item_ids)
|
85
74
|
else
|
86
75
|
# no items if user is unknown
|
87
76
|
# TODO maybe most popular items
|
@@ -89,28 +78,35 @@ module Cmfrec
|
|
89
78
|
end
|
90
79
|
end
|
91
80
|
|
92
|
-
|
93
|
-
def new_user_recs(data, count: 5, user_info: nil)
|
81
|
+
def new_user_recs(data, count: 5, user_info: nil, item_ids: nil)
|
94
82
|
check_fit
|
95
83
|
|
96
|
-
a_vec, a_bias = factors_warm(data, user_info: user_info)
|
97
|
-
top_n(a_vec: a_vec, a_bias: a_bias, count: count)
|
84
|
+
a_vec, a_bias, rated = factors_warm(data, user_info: user_info)
|
85
|
+
top_n(a_vec: a_vec, a_bias: a_bias, count: count, rated: rated, item_ids: item_ids)
|
86
|
+
end
|
87
|
+
|
88
|
+
def user_ids
|
89
|
+
@user_map.keys
|
98
90
|
end
|
99
91
|
|
100
|
-
def
|
101
|
-
|
92
|
+
def item_ids
|
93
|
+
@item_map.keys
|
102
94
|
end
|
103
95
|
|
104
|
-
def
|
105
|
-
read_factors(@
|
96
|
+
def user_factors(user_id = nil)
|
97
|
+
read_factors(@a, [@m, @m_u].max, @k_user + @k + @k_main, user_id, @user_map)
|
106
98
|
end
|
107
99
|
|
108
|
-
def
|
109
|
-
|
100
|
+
def item_factors(item_id = nil)
|
101
|
+
read_factors(@b, [@n, @n_i].max, @k_item + @k + @k_main, item_id, @item_map)
|
110
102
|
end
|
111
103
|
|
112
|
-
def
|
113
|
-
read_bias(@
|
104
|
+
def user_bias(user_id = nil)
|
105
|
+
read_bias(@bias_a, user_id, @user_map) if @bias_a
|
106
|
+
end
|
107
|
+
|
108
|
+
def item_bias(item_id = nil)
|
109
|
+
read_bias(@bias_b, item_id, @item_map) if @bias_b
|
114
110
|
end
|
115
111
|
|
116
112
|
def similar_items(item_id, count: 5)
|
@@ -191,11 +187,17 @@ module Cmfrec
|
|
191
187
|
x_col = []
|
192
188
|
x_val = []
|
193
189
|
value_key = @implicit ? :value : :rating
|
190
|
+
@rated = Hash.new { |hash, key| hash[key] = {} }
|
194
191
|
train_set.each do |v|
|
195
|
-
|
196
|
-
|
192
|
+
u = @user_map[v[:user_id]]
|
193
|
+
i = @item_map[v[:item_id]]
|
194
|
+
@rated[u][i] = true
|
195
|
+
|
196
|
+
x_row << u
|
197
|
+
x_col << i
|
197
198
|
x_val << (v[value_key] || 1)
|
198
199
|
end
|
200
|
+
@rated.default = nil
|
199
201
|
|
200
202
|
@m = @user_map.size
|
201
203
|
@n = @item_map.size
|
@@ -435,26 +437,59 @@ module Cmfrec
|
|
435
437
|
end
|
436
438
|
end
|
437
439
|
|
438
|
-
def read_factors(ptr, d1, d2)
|
439
|
-
arr = []
|
440
|
-
offset = 0
|
440
|
+
def read_factors(ptr, d1, d2, id, map)
|
441
441
|
width = d2 * Fiddle::SIZEOF_DOUBLE
|
442
|
-
|
443
|
-
|
444
|
-
|
442
|
+
if id
|
443
|
+
i = map[id]
|
444
|
+
ptr[i * width, width].unpack("d*") if i
|
445
|
+
else
|
446
|
+
arr = []
|
447
|
+
offset = 0
|
448
|
+
d1.times do |i|
|
449
|
+
arr << ptr[offset, width].unpack("d*")
|
450
|
+
offset += width
|
451
|
+
end
|
452
|
+
arr
|
445
453
|
end
|
446
|
-
arr
|
447
454
|
end
|
448
455
|
|
449
|
-
def read_bias(ptr)
|
450
|
-
|
456
|
+
def read_bias(ptr, id, map)
|
457
|
+
if id
|
458
|
+
i = map[id]
|
459
|
+
ptr[i * Fiddle::SIZEOF_DOUBLE, Fiddle::SIZEOF_DOUBLE].unpack1("d") if i
|
460
|
+
else
|
461
|
+
real_array(ptr)
|
462
|
+
end
|
451
463
|
end
|
452
464
|
|
453
|
-
def top_n(a_vec:, a_bias:, count:)
|
454
|
-
|
455
|
-
|
456
|
-
|
457
|
-
|
465
|
+
def top_n(a_vec:, a_bias:, count:, rated: nil, item_ids: nil)
|
466
|
+
if item_ids
|
467
|
+
# remove missing ids
|
468
|
+
item_ids = item_ids.map { |v| @item_map[v] }.compact
|
469
|
+
return [] if item_ids.empty?
|
470
|
+
|
471
|
+
include_ix = int_ptr(item_ids)
|
472
|
+
n_include = item_ids.size
|
473
|
+
|
474
|
+
# TODO uncomment in 0.2.0
|
475
|
+
count = n_include # if n_include < count
|
476
|
+
else
|
477
|
+
include_ix = nil
|
478
|
+
n_include = 0
|
479
|
+
end
|
480
|
+
|
481
|
+
if rated && !item_ids
|
482
|
+
# assumes rated is unique and all items are known
|
483
|
+
# calling code is responsible for this
|
484
|
+
exclude_ix = int_ptr(rated)
|
485
|
+
n_exclude = rated.size
|
486
|
+
remaining = @item_map.size - n_exclude
|
487
|
+
return [] if remaining == 0
|
488
|
+
count = remaining if remaining < count
|
489
|
+
else
|
490
|
+
exclude_ix = nil
|
491
|
+
n_exclude = 0
|
492
|
+
end
|
458
493
|
|
459
494
|
outp_ix = Fiddle::Pointer.malloc(count * Fiddle::SIZEOF_INT)
|
460
495
|
outp_score = Fiddle::Pointer.malloc(count * Fiddle::SIZEOF_DOUBLE)
|
@@ -484,6 +519,16 @@ module Cmfrec
|
|
484
519
|
data = to_dataset(data)
|
485
520
|
user_info = to_dataset(user_info) if user_info
|
486
521
|
|
522
|
+
# remove unknown items
|
523
|
+
data, unknown_data = data.partition { |d| @item_map[d[:item_id]] }
|
524
|
+
|
525
|
+
if unknown_data.any?
|
526
|
+
# TODO warn for unknown items?
|
527
|
+
# warn "[cmfrec] Unknown items: #{unknown_data.map { |d| d[:item_id] }.join(", ")}"
|
528
|
+
end
|
529
|
+
|
530
|
+
item_ids = data.map { |d| @item_map[d[:item_id]] }
|
531
|
+
|
487
532
|
nnz = data.size
|
488
533
|
a_vec = Fiddle::Pointer.malloc((@k_user + @k + @k_main) * Fiddle::SIZEOF_DOUBLE)
|
489
534
|
bias_a = Fiddle::Pointer.malloc(Fiddle::SIZEOF_DOUBLE)
|
@@ -524,7 +569,7 @@ module Cmfrec
|
|
524
569
|
check_ratings(ratings)
|
525
570
|
end
|
526
571
|
xa = real_ptr(ratings)
|
527
|
-
x_col = int_ptr(
|
572
|
+
x_col = int_ptr(item_ids)
|
528
573
|
else
|
529
574
|
xa = nil
|
530
575
|
x_col = nil
|
@@ -587,7 +632,7 @@ module Cmfrec
|
|
587
632
|
check_status FFI.factors_collective_explicit_single(*fiddle_args(args))
|
588
633
|
end
|
589
634
|
|
590
|
-
[a_vec, real_array(bias_a).first]
|
635
|
+
[a_vec, real_array(bias_a).first, item_ids.uniq]
|
591
636
|
end
|
592
637
|
|
593
638
|
# convert boolean to int
|
@@ -679,6 +724,7 @@ module Cmfrec
|
|
679
724
|
# factors
|
680
725
|
obj[:user_map] = @user_map
|
681
726
|
obj[:item_map] = @item_map
|
727
|
+
obj[:rated] = @rated
|
682
728
|
obj[:user_factors] = dump_ptr(@a)
|
683
729
|
obj[:item_factors] = dump_ptr(@b)
|
684
730
|
|
@@ -726,6 +772,7 @@ module Cmfrec
|
|
726
772
|
# factors
|
727
773
|
@user_map = obj[:user_map]
|
728
774
|
@item_map = obj[:item_map]
|
775
|
+
@rated = obj[:rated] || {}
|
729
776
|
@a = load_ptr(obj[:user_factors])
|
730
777
|
@b = load_ptr(obj[:item_factors])
|
731
778
|
|
data/lib/cmfrec/version.rb
CHANGED
data/lib/cmfrec.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cmfrec
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.4
|
4
|
+
version: 0.1.7
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Kane
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2022-03-22 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description:
|
14
14
|
email: andrew@ankane.org
|
@@ -28,7 +28,7 @@ files:
|
|
28
28
|
- vendor/libcmfrec.arm64.dylib
|
29
29
|
- vendor/libcmfrec.dylib
|
30
30
|
- vendor/libcmfrec.so
|
31
|
-
homepage: https://github.com/ankane/cmfrec
|
31
|
+
homepage: https://github.com/ankane/cmfrec-ruby
|
32
32
|
licenses:
|
33
33
|
- MIT
|
34
34
|
metadata: {}
|
@@ -47,7 +47,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
47
47
|
- !ruby/object:Gem::Version
|
48
48
|
version: '0'
|
49
49
|
requirements: []
|
50
|
-
rubygems_version: 3.
|
50
|
+
rubygems_version: 3.3.7
|
51
51
|
signing_key:
|
52
52
|
specification_version: 4
|
53
53
|
summary: Recommendations for Ruby using collective matrix factorization
|