RubyGems - cmfrec - Versions diffs - 0.1.3 → 0.1.4 - Mend

cmfrec 0.1.3 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8)

checksums.yaml +4 -4
data/CHANGELOG.md +6 -0
data/LICENSE.txt +1 -1
data/README.md +67 -0
data/lib/cmfrec.rb +5 -3
data/lib/cmfrec/recommender.rb +299 -121
data/lib/cmfrec/version.rb +1 -1
metadata +3 -3

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 361b138735f0599f3aea9d87da1f39b0b0bcb3d7c531eb8eb08d11f84ad12b30
-  data.tar.gz: 8f634b076fb0171e7c22bd5db0d70512f5cf8dc1c4f6c2a9cb2702628936ed44
+  metadata.gz: bb7b07ae46500a545f1a130dfc5648aa3f925f9b5766a6c70a1652c7b5732182
+  data.tar.gz: e89a6d1900cda651dc6b0aac2899050e28680cddfb6b39b6b5eacfe467b59aad
 SHA512:
-  metadata.gz: be7d1466e512d99edaa8d10b858211130e4d70c33f1a01e58a5541f849d4744fb2ca1beed66c8e75b77a17ba4b9954ff4c17c3119b1f16859db3eb9cf4d3a902
-  data.tar.gz: 70db81ada4152b37d583bac35ca1cdbc5e69eecd8b69dcb8bb472dcc84a2c131f85cdb6e7dbc3a3167868f2432716a22e93546fcd8ae6c661b012c287484a08c
+  metadata.gz: 117aa6952fe0ab8ddebfaece6655cf479a7adbab7d6f634e7d3428c72824a410812c037ae006366180a9691a6d160d8065b777a9c10a33a5ccfefedb28c99ec6
+  data.tar.gz: 57985a055705b820226a2aa1451453383ee3509e43225f8fdb09e713c4530754b0b608f7d1b4814973b43e3d625f824f9f87939687d015b352cc8905f7b4f118

data/CHANGELOG.md CHANGED Viewed

@@ -1,3 +1,9 @@
+## 0.1.4 (2021-02-04)
+- Added support for saving and loading recommenders
+- Added `similar_users` and `similar_items`
+- Improved ARM detection
 ## 0.1.3 (2020-12-28)
 - Added ARM shared library for Mac

data/LICENSE.txt CHANGED Viewed

@@ -1,7 +1,7 @@
 MIT License
 Copyright (c) 2020 David Cortes
-Copyright (c) 2020 Andrew Kane
+Copyright (c) 2020-2021 Andrew Kane
 All rights reserved.

data/README.md CHANGED Viewed

@@ -107,6 +107,26 @@ Get recommendations with only side information
 recommender.new_user_recs([], user_info: {cats: 0, dogs: 2})
 ```
+## Similarity
+Add this line to your application’s Gemfile:
+```ruby
+gem 'ngt'
+```
+Get similar users
+```ruby
+recommender.similar_users(user_id)
+```
+Get similar items - “users who liked this item also liked”
+```ruby
+recommender.similar_items(item_id)
+```
 ## Examples
 ### MovieLens
@@ -125,6 +145,35 @@ recommender.fit(ratings.first(80000), user_info: user_info, item_info: item_info
 recommender.predict(ratings.last(20000))
 ```
+### Ahoy
+[Ahoy](https://github.com/ankane/ahoy) is a great source for implicit feedback
+```ruby
+views = Ahoy::Event.
+  where(name: "Viewed post").
+  group(:user_id).
+  group("properties->>'post_id'"). # postgres syntax
+  count
+data =
+  views.map do |(user_id, post_id), count|
+    {
+      user_id: user_id,
+      item_id: post_id,
+      value: count
+    }
+  end
+```
+Create a recommender and get recommended posts for a user
+```ruby
+recommender = Cmfrec::Recommender.new
+recommender.fit(data)
+recommender.user_recs(current_user.id)
+```
 ## Options
 Specify the number of factors and epochs
@@ -163,6 +212,24 @@ Or a Rover data frame
 Rover.read_csv("ratings.csv")
 ```
+## Storing Recommenders
+Store the recommender
+```ruby
+bin = Marshal.dump(recommender)
+File.binwrite("recommender.bin", bin)
+```
+> You can save it to a file, database, or any other storage system
+Load a recommender
+```ruby
+bin = File.binread("recommender.bin")
+recommender = Marshal.load(bin)
+```
 ## Reference
 Get the global mean

data/lib/cmfrec.rb CHANGED Viewed

@@ -18,10 +18,12 @@ module Cmfrec
   lib_name =
     if Gem.win_platform?
       "cmfrec.dll"
-    elsif RbConfig::CONFIG["arch"] =~ /arm64-darwin/i
-      "libcmfrec.arm64.dylib"
     elsif RbConfig::CONFIG["host_os"] =~ /darwin/i
-      "libcmfrec.dylib"
+      if RbConfig::CONFIG["host_cpu"] =~ /arm/i
+        "libcmfrec.arm64.dylib"
+      else
+        "libcmfrec.dylib"
+      end
     else
       "libcmfrec.so"
     end

data/lib/cmfrec/recommender.rb CHANGED Viewed

@@ -11,19 +11,181 @@ module Cmfrec
         item_bias: item_bias,
         add_implicit_features: add_implicit_features
       )
+      @fit = false
+      @user_map = {}
+      @item_map = {}
+      @user_info_map = {}
+      @item_info_map = {}
     end
     def fit(train_set, user_info: nil, item_info: nil)
+      reset
+      partial_fit(train_set, user_info: user_info, item_info: item_info)
+    end
+    def predict(data)
+      check_fit
+      data = to_dataset(data)
+      u = data.map { |v| @user_map[v[:user_id]] || @user_map.size }
+      i = data.map { |v| @item_map[v[:item_id]] || @item_map.size }
+      row = int_ptr(u)
+      col = int_ptr(i)
+      n_predict = data.size
+      predicted = Fiddle::Pointer.malloc(n_predict * Fiddle::SIZEOF_DOUBLE)
+      if @implicit
+        check_status FFI.predict_X_old_collective_implicit(
+          row, col, predicted, n_predict,
+          @a, @b,
+          @k, @k_user, @k_item, @k_main,
+          @m, @n,
+          @nthreads
+        )
+      else
+        check_status FFI.predict_X_old_collective_explicit(
+          row, col, predicted, n_predict,
+          @a, @bias_a,
+          @b, @bias_b,
+          @global_mean,
+          @k, @k_user, @k_item, @k_main,
+          @m, @n,
+          @nthreads
+        )
+      end
+      predictions = real_array(predicted)
+      predictions.map! { |v| v.nan? ? @global_mean : v } if @implicit
+      predictions
+    end
+    def user_recs(user_id, count: 5, item_ids: nil)
+      check_fit
+      user = @user_map[user_id]
+      if user
+        if item_ids
+          # remove missing ids
+          item_ids = item_ids.select { |v| @item_map[v] }
+          data = item_ids.map { |v| {user_id: user_id, item_id: v} }
+          scores = predict(data)
+          item_ids.zip(scores).map do |item_id, score|
+            {item_id: item_id, score: score}
+          end
+        else
+          a_vec = @a[user * @k * Fiddle::SIZEOF_DOUBLE, @k * Fiddle::SIZEOF_DOUBLE]
+          a_bias = @bias_a ? @bias_a[user * Fiddle::SIZEOF_DOUBLE, Fiddle::SIZEOF_DOUBLE].unpack1("d") : 0
+          top_n(a_vec: a_vec, a_bias: a_bias, count: count)
+        end
+      else
+        # no items if user is unknown
+        # TODO maybe most popular items
+        []
+      end
+    end
+    # TODO add item_ids
+    def new_user_recs(data, count: 5, user_info: nil)
+      check_fit
+      a_vec, a_bias = factors_warm(data, user_info: user_info)
+      top_n(a_vec: a_vec, a_bias: a_bias, count: count)
+    end
+    def user_factors
+      read_factors(@a, [@m, @m_u].max, @k_user + @k + @k_main)
+    end
+    def item_factors
+      read_factors(@b, [@n, @n_i].max, @k_item + @k + @k_main)
+    end
+    def user_bias
+      read_bias(@bias_a) if @bias_a
+    end
+    def item_bias
+      read_bias(@bias_b) if @bias_b
+    end
+    def similar_items(item_id, count: 5)
+      check_fit
+      similar(item_id, @item_map, item_factors, count, item_index)
+    end
+    alias_method :item_recs, :similar_items
+    def similar_users(user_id, count: 5)
+      check_fit
+      similar(user_id, @user_map, user_factors, count, user_index)
+    end
+    private
+    def user_index
+      @user_index ||= create_index(user_factors)
+    end
+    def item_index
+      @item_index ||= create_index(item_factors)
+    end
+    def create_index(factors)
+      require "ngt"
+      index = Ngt::Index.new(@k, distance_type: "Cosine")
+      index.batch_insert(factors)
+      index
+    end
+    # TODO include bias
+    def similar(id, map, factors, count, index)
+      i = map[id]
+      if i
+        keys = map.keys
+        result = index.search(factors[i], size: count + 1)[1..-1]
+        result.map do |v|
+          {
+            # ids from batch_insert start at 1 instead of 0
+            item_id: keys[v[:id] - 1],
+            # convert cosine distance to cosine similarity
+            score: 1 - v[:distance]
+          }
+        end
+      else
+        []
+      end
+    end
+    def reset
+      @fit = false
+      @user_map.clear
+      @item_map.clear
+      @user_info_map.clear
+      @item_info_map.clear
+      @user_index = nil
+      @item_index = nil
+    end
+    # TODO resize pointers as needed and reset values for new memory
+    def partial_fit(train_set, user_info: nil, item_info: nil)
       train_set = to_dataset(train_set)
-      @implicit = !train_set.any? { |v| v[:rating] }
+      unless @fit
+        @implicit = !train_set.any? { |v| v[:rating] }
+      end
       unless @implicit
         ratings = train_set.map { |o| o[:rating] }
         check_ratings(ratings)
       end
       check_training_set(train_set)
-      create_maps(train_set)
+      update_maps(train_set)
       x_row = []
       x_col = []
@@ -52,16 +214,14 @@ module Cmfrec
       uu = nil
       ii = nil
-      @user_info_map = {}
+      # side info
       u_row, u_col, u_sp, nnz_u, @m_u, p_ = process_info(user_info, @user_map, @user_info_map, :user_id)
-      @item_info_map = {}
       i_row, i_col, i_sp, nnz_i, @n_i, q = process_info(item_info, @item_map, @item_info_map, :item_id)
       @precompute_for_predictions = false
       # initialize w/ normal distribution
-      reset_values = true
+      reset_values = !@fit
       @a = Fiddle::Pointer.malloc([@m, @m_u].max * (@k_user + @k + @k_main) * Fiddle::SIZEOF_DOUBLE)
       @b = Fiddle::Pointer.malloc([@n, @n_i].max * (@k_item + @k + @k_main) * Fiddle::SIZEOF_DOUBLE)
@@ -75,16 +235,7 @@ module Cmfrec
       i_colmeans = Fiddle::Pointer.malloc(q * Fiddle::SIZEOF_DOUBLE)
       if @implicit
-        @w_main_multiplier = 1.0
-        @alpha = 1.0
-        @adjust_weight = false # downweight?
-        @apply_log_transf = false
-        # different defaults
-        @lambda_ = 1e0
-        @w_user = 10
-        @w_item = 10
-        @finalize_chol = false
+        set_implicit_vars
         args = [
           @a, @b,
@@ -175,104 +326,13 @@ module Cmfrec
         @global_mean = real_array(glob_mean).first
       end
-      @u_colmeans = real_array(u_colmeans)
-      @i_colmeans = real_array(i_colmeans)
-      @u_colmeans_ptr = u_colmeans
-      self
-    end
-    def predict(data)
-      check_fit
-      data = to_dataset(data)
-      u = data.map { |v| @user_map[v[:user_id]] || @user_map.size }
-      i = data.map { |v| @item_map[v[:item_id]] || @item_map.size }
-      row = int_ptr(u)
-      col = int_ptr(i)
-      n_predict = data.size
-      predicted = Fiddle::Pointer.malloc(n_predict * Fiddle::SIZEOF_DOUBLE)
-      if @implicit
-        check_status FFI.predict_X_old_collective_implicit(
-          row, col, predicted, n_predict,
-          @a, @b,
-          @k, @k_user, @k_item, @k_main,
-          @m, @n,
-          @nthreads
-        )
-      else
-        check_status FFI.predict_X_old_collective_explicit(
-          row, col, predicted, n_predict,
-          @a, @bias_a,
-          @b, @bias_b,
-          @global_mean,
-          @k, @k_user, @k_item, @k_main,
-          @m, @n,
-          @nthreads
-        )
-      end
-      predictions = real_array(predicted)
-      predictions.map! { |v| v.nan? ? @global_mean : v } if @implicit
-      predictions
-    end
-    def user_recs(user_id, count: 5, item_ids: nil)
-      check_fit
-      user = @user_map[user_id]
-      if user
-        if item_ids
-          # remove missing ids
-          item_ids = item_ids.select { |v| @item_map[v] }
-          data = item_ids.map { |v| {user_id: user_id, item_id: v} }
-          scores = predict(data)
-          item_ids.zip(scores).map do |item_id, score|
-            {item_id: item_id, score: score}
-          end
-        else
-          a_vec = @a[user * @k * Fiddle::SIZEOF_DOUBLE, @k * Fiddle::SIZEOF_DOUBLE]
-          a_bias = @bias_a ? @bias_a[user * Fiddle::SIZEOF_DOUBLE, Fiddle::SIZEOF_DOUBLE].unpack1("d") : 0
-          top_n(a_vec: a_vec, a_bias: a_bias, count: count)
-        end
-      else
-        # no items if user is unknown
-        # TODO maybe most popular items
-        []
-      end
-    end
-    # TODO add item_ids
-    def new_user_recs(data, count: 5, user_info: nil)
-      check_fit
-      a_vec, a_bias = factors_warm(data, user_info: user_info)
-      top_n(a_vec: a_vec, a_bias: a_bias, count: count)
-    end
-    def user_factors
-      read_factors(@a, [@m, @m_u].max, @k_user + @k + @k_main)
-    end
-    def item_factors
-      read_factors(@b, [@n, @n_i].max, @k_item + @k + @k_main)
-    end
+      @u_colmeans = u_colmeans
-    def user_bias
-      read_bias(@bias_a) if @bias_a
-    end
+      @fit = true
-    def item_bias
-      read_bias(@bias_b) if @bias_b
+      self
     end
-    private
     def set_params(
       k: 40, lambda_: 1e+1, method: "als", use_cg: true, user_bias: true,
       item_bias: true, add_implicit_features: false,
@@ -329,15 +389,14 @@ module Cmfrec
       @nthreads = nthreads
     end
-    def create_maps(train_set)
-      user_ids = train_set.map { |v| v[:user_id] }.uniq.sort
-      item_ids = train_set.map { |v| v[:item_id] }.uniq.sort
-      raise ArgumentError, "Missing user_id" if user_ids.any?(&:nil?)
-      raise ArgumentError, "Missing item_id" if item_ids.any?(&:nil?)
+    def update_maps(train_set)
+      raise ArgumentError, "Missing user_id" if train_set.any? { |v| v[:user_id].nil? }
+      raise ArgumentError, "Missing item_id" if train_set.any? { |v| v[:item_id].nil? }
-      @user_map = user_ids.zip(user_ids.size.times).to_h
-      @item_map = item_ids.zip(item_ids.size.times).to_h
+      train_set.each do |v|
+        @user_map[v[:user_id]] ||= @user_map.size
+        @item_map[v[:item_id]] ||= @item_map.size
+      end
     end
     def check_ratings(ratings)
@@ -354,7 +413,7 @@ module Cmfrec
     end
     def check_fit
-      raise "Not fit" unless defined?(@implicit)
+      raise "Not fit" unless @fit
     end
     def to_dataset(dataset)
@@ -479,7 +538,7 @@ module Cmfrec
           u_vec_sp, u_vec_x_col, nnz_u_vec,
           @na_as_zero_user,
           @nonneg,
-          @u_colmeans_ptr,
+          @u_colmeans,
           @b, @n, @c,
           xa, x_col, nnz,
           @k, @k_user, @k_item, @k_main,
@@ -505,7 +564,7 @@ module Cmfrec
           @na_as_zero_user, @na_as_zero,
           @nonneg,
           @c, cb,
-          @global_mean, @bias_b, @u_colmeans_ptr,
+          @global_mean, @bias_b, @u_colmeans,
           xa, x_col, nnz, xa_dense,
           @n, weight, @b, @bi,
           @add_implicit_features,
@@ -585,5 +644,124 @@ module Cmfrec
     def real_array(ptr)
       ptr.to_s(ptr.size).unpack("d*")
     end
+    def set_implicit_vars
+      @w_main_multiplier = 1.0
+      @alpha = 1.0
+      @adjust_weight = false # downweight?
+      @apply_log_transf = false
+      # different defaults
+      @lambda_ = 1e0
+      @w_user = 10
+      @w_item = 10
+      @finalize_chol = false
+    end
+    def dump_ptr(ptr)
+      ptr.to_s(ptr.size) if ptr
+    end
+    def load_ptr(str)
+      Fiddle::Pointer[str] if str
+    end
+    def marshal_dump
+      obj = {
+        implicit: @implicit
+      }
+      # options
+      obj[:factors] = @k
+      obj[:epochs] = @niter
+      obj[:verbose] = @verbose
+      # factors
+      obj[:user_map] = @user_map
+      obj[:item_map] = @item_map
+      obj[:user_factors] = dump_ptr(@a)
+      obj[:item_factors] = dump_ptr(@b)
+      # bias
+      obj[:user_bias] = dump_ptr(@bias_a)
+      obj[:item_bias] = dump_ptr(@bias_b)
+      # mean
+      obj[:global_mean] = @global_mean
+      # side info
+      obj[:user_info_map] = @user_info_map
+      obj[:item_info_map] = @item_info_map
+      obj[:user_info_factors] = dump_ptr(@c)
+      obj[:item_info_factors] = dump_ptr(@d)
+      # implicit features
+      obj[:add_implicit_features] = @add_implicit_features
+      obj[:user_factors_implicit] = dump_ptr(@ai)
+      obj[:item_factors_implicit] = dump_ptr(@bi)
+      unless @implicit
+        obj[:min_rating] = @min_rating
+        obj[:max_rating] = @max_rating
+      end
+      obj[:user_means] = dump_ptr(@u_colmeans)
+      obj
+    end
+    def marshal_load(obj)
+      @implicit = obj[:implicit]
+      # options
+      set_params(
+        k: obj[:factors],
+        niter: obj[:epochs],
+        verbose: obj[:verbose],
+        user_bias: !obj[:user_bias].nil?,
+        item_bias: !obj[:item_bias].nil?,
+        add_implicit_features: obj[:add_implicit_features]
+      )
+      # factors
+      @user_map = obj[:user_map]
+      @item_map = obj[:item_map]
+      @a = load_ptr(obj[:user_factors])
+      @b = load_ptr(obj[:item_factors])
+      # bias
+      @bias_a = load_ptr(obj[:user_bias])
+      @bias_b = load_ptr(obj[:item_bias])
+      # mean
+      @global_mean = obj[:global_mean]
+      # side info
+      @user_info_map = obj[:user_info_map]
+      @item_info_map = obj[:item_info_map]
+      @c = load_ptr(obj[:user_info_factors])
+      @d = load_ptr(obj[:item_info_factors])
+      # implicit features
+      @add_implicit_features = obj[:add_implicit_features]
+      @ai = load_ptr(obj[:user_factors_implicit])
+      @bi = load_ptr(obj[:item_factors_implicit])
+      unless @implicit
+        @min_rating = obj[:min_rating]
+        @max_rating = obj[:max_rating]
+      end
+      @u_colmeans = load_ptr(obj[:user_means])
+      @m = @user_map.size
+      @n = @item_map.size
+      @m_u = @user_info_map.size
+      @n_i = @item_info_map.size
+      set_implicit_vars if @implicit
+      @fit = @m > 0
+    end
   end
 end

data/lib/cmfrec/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module Cmfrec
-  VERSION = "0.1.3"
+  VERSION = "0.1.4"
 end

metadata CHANGED Viewed

@@ -1,17 +1,17 @@
 --- !ruby/object:Gem::Specification
 name: cmfrec
 version: !ruby/object:Gem::Version
-  version: 0.1.3
+  version: 0.1.4
 platform: ruby
 authors:
 - Andrew Kane
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2020-12-29 00:00:00.000000000 Z
+date: 2021-02-05 00:00:00.000000000 Z
 dependencies: []
 description:
-email: andrew@chartkick.com
+email: andrew@ankane.org
 executables: []
 extensions: []
 extra_rdoc_files: []