disco 0.2.4 → 0.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: e4a978d2eec39ca280142c49fb4ef4be2e1ad4f35dfa4d977941f46d5d34b466
- data.tar.gz: 8a29a54bba5ac8b715294e2fce4e34fa1b11442b1800c388807c60b9520ced23
+ metadata.gz: 8fbecb858b316ed39a9cb726263e182561cba6df498e6253d88c79ebec5cab05
+ data.tar.gz: 42eb38a6e4e0b3fc5a9452deae5a48676ae9a53e78eeb6197718a0c94bd02b6b
  SHA512:
- metadata.gz: 99376dd48cce340a4fdcb0d76c93b03af494d88167e2caaca0d186fcf5d2303f2524884e0c712c2f8e3d7be79a92b029a8d5fa726bb94826315f283afea0f74b
- data.tar.gz: eeb8c480098616f93d6c7e39a1bb57e2feefa6af3696c407791ff6f052450eb035f1d1659ded70d7b5fbbbe8cff9f7309118828a454b1d4f9d459321b90035cf
+ metadata.gz: d0250346d75fba75064a29578f6bfd39f09ecf712ba2e505b97a4952b5ff8b31af307eb1b912e9b25cc3dc28dee0d096bea44b47bb2ef268859bb4171f0ef8b2
+ data.tar.gz: 7b341328c12885efd0ffece4201036bb9457caee80a48a99ba110af9a81bcf832bbc1e8f8f5f14e7fddffef2dd3f4643837e0d569c997ab0c2d9ae85e12422f7
data/CHANGELOG.md CHANGED
@@ -1,3 +1,11 @@
+ ## 0.2.5 (2021-02-20)
+
+ - Added `top_items` method
+ - Added `optimize_similar_users` method
+ - Added support for Faiss for `optimize_item_recs` and `optimize_similar_users` methods
+ - Added `rmse` method
+ - Improved performance
+
  ## 0.2.4 (2021-02-15)
 
  - Added `user_ids` and `item_ids` methods
data/README.md CHANGED
@@ -201,6 +201,8 @@ bin = File.binread("recommender.bin")
  recommender = Marshal.load(bin)
  ```
 
+ Alternatively, you can store only the factors and use a library like [Neighbor](https://github.com/ankane/neighbor)
+
  ## Algorithms
 
  Disco uses high-performance matrix factorization.
@@ -237,6 +239,16 @@ There are a number of ways to deal with this, but here are some common ones:
  - For user-based recommendations, show new users the most popular items.
  - For item-based recommendations, make content-based recommendations with a gem like [tf-idf-similarity](https://github.com/jpmckinney/tf-idf-similarity).
 
+ Get top items with:
+
+ ```ruby
+ recommender = Disco::Recommender.new(top_items: true)
+ recommender.fit(data)
+ recommender.top_items
+ ```
+
+ This uses [Wilson score](https://www.evanmiller.org/how-not-to-sort-by-average-rating.html) for explicit feedback (add [wilson_score](https://github.com/instacart/wilson_score) to your application’s Gemfile) and item frequency for implicit feedback.
+
  ## Data
 
  Data can be an array of hashes
@@ -257,23 +269,29 @@ Or a Daru data frame
  Daru::DataFrame.from_csv("ratings.csv")
  ```
 
- ## Faster Similarity
+ ## Performance [master]
 
- If you have a large number of users/items, you can use an approximate nearest neighbors library like [NGT](https://github.com/ankane/ngt) to speed up item-based recommendations and similar users.
+ If you have a large number of users or items, you can use an approximate nearest neighbors library like [Faiss](https://github.com/ankane/faiss) to improve the performance of certain methods.
 
  Add this line to your application’s Gemfile:
 
  ```ruby
- gem 'ngt', '>= 0.3.0'
+ gem 'faiss'
+ ```
+
+ Speed up the `user_recs` method with:
+
+ ```ruby
+ model.optimize_user_recs
  ```
 
- Speed up item-based recommendations with:
+ Speed up the `item_recs` method with:
 
  ```ruby
  model.optimize_item_recs
  ```
 
- Speed up similar users with:
+ Speed up the `similar_users` method with:
 
  ```ruby
  model.optimize_similar_users
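For reference, the Wilson score lower bound mentioned in the README hunk above can be sketched in plain Ruby for the classic binomial case. This is only an illustration: the `wilson_score` gem's `rating_lower_bound` generalizes the idea to an arbitrary rating range, and `wilson_lower_bound` here is a hypothetical helper, not gem code.

```ruby
# Hedged sketch: Wilson score lower bound for a binary (up/down) rating.
# Ranks items by a confidence-adjusted positive rate, so an item with
# many ratings beats an item with few ratings at the same average.
def wilson_lower_bound(positive, total, z = 1.96)
  return 0.0 if total == 0
  p_hat = positive.to_f / total
  denom = 1 + z**2 / total
  centre = p_hat + z**2 / (2 * total)
  margin = z * Math.sqrt((p_hat * (1 - p_hat) + z**2 / (4 * total)) / total)
  (centre - margin) / denom
end

wilson_lower_bound(90, 100) # ≈ 0.83
wilson_lower_bound(9, 10)   # ≈ 0.60 -- same average, less evidence, lower bound
```

This is why a 90%-positive item with 100 ratings ranks above one with 10 ratings, even though the raw averages are identical.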
data/lib/disco.rb CHANGED
@@ -9,6 +9,7 @@ require "net/http"
 
  # modules
  require "disco/data"
+ require "disco/metrics"
  require "disco/recommender"
  require "disco/version"
 
data/lib/disco/metrics.rb ADDED
@@ -0,0 +1,10 @@
+ module Disco
+   module Metrics
+     class << self
+       def rmse(act, exp)
+         raise ArgumentError, "Size mismatch" if act.size != exp.size
+         Math.sqrt(act.zip(exp).sum { |a, e| (a - e)**2 } / act.size.to_f)
+       end
+     end
+   end
+ end
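The new metric can be tried standalone; a minimal sketch mirroring the RMSE computation in the hunk above (plain Ruby, no gem required):

```ruby
# Root mean squared error: sqrt(mean((actual - expected)^2)).
# Same computation as the new Disco::Metrics.rmse, as a bare method.
def rmse(act, exp)
  raise ArgumentError, "Size mismatch" if act.size != exp.size
  Math.sqrt(act.zip(exp).sum { |a, e| (a - e)**2 } / act.size.to_f)
end

rmse([3.0, 4.0, 5.0], [3.0, 4.0, 5.0]) # => 0.0 (perfect predictions)
rmse([1.0, 2.0], [3.0, 4.0])           # => 2.0 (each prediction off by 2)
```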
data/lib/disco/recommender.rb CHANGED
@@ -2,12 +2,13 @@ module Disco
    class Recommender
      attr_reader :global_mean
 
-     def initialize(factors: 8, epochs: 20, verbose: nil)
+     def initialize(factors: 8, epochs: 20, verbose: nil, top_items: false)
        @factors = factors
        @epochs = epochs
        @verbose = verbose
        @user_map = {}
        @item_map = {}
+       @top_items = top_items
      end
 
      def fit(train_set, validation_set: nil)
@@ -41,6 +42,16 @@ module Disco
      end
      @rated.default = nil
 
+     if @top_items
+       @item_count = [0] * @item_map.size
+       @item_sum = [0.0] * @item_map.size
+       train_set.each do |v|
+         i = @item_map[v[:item_id]]
+         @item_count[i] += 1
+         @item_sum[i] += (v[value_key] || 1)
+       end
+     end
+
      eval_set = nil
      if validation_set
        eval_set = []
@@ -67,8 +78,9 @@ module Disco
      @user_factors = model.p_factors(format: :numo)
      @item_factors = model.q_factors(format: :numo)
 
-     @user_index = nil
-     @item_index = nil
+     @user_recs_index = nil
+     @similar_users_index = nil
+     @similar_items_index = nil
    end
 
    # generates a prediction even if a user has already rated the item
@@ -95,61 +107,76 @@ module Disco
      u = @user_map[user_id]
 
      if u
-       predictions = @item_factors.inner(@user_factors[u, true])
-
-       predictions =
-         @item_map.keys.zip(predictions).map do |item_id, pred|
-           {item_id: item_id, score: pred}
-         end
+       rated = item_ids ? {} : @rated[u]
 
        if item_ids
-         idx = item_ids.map { |i| @item_map[i] }.compact
-         predictions = predictions.values_at(*idx)
+         ids = Numo::NArray.cast(item_ids.map { |i| @item_map[i] }.compact)
+         return [] if ids.size == 0
+
+         predictions = @item_factors[ids, true].inner(@user_factors[u, true])
+         indexes = predictions.sort_index.reverse
+         indexes = indexes[0...[count + rated.size, indexes.size].min] if count
+         predictions = predictions[indexes]
+         ids = ids[indexes]
+       elsif @user_recs_index && count
+         predictions, ids = @user_recs_index.search(@user_factors[u, true].expand_dims(0), count + rated.size).map { |v| v[0, true] }
        else
-         @rated[u].keys.sort_by { |v| -v }.each do |i|
-           predictions.delete_at(i)
-         end
+         predictions = @item_factors.inner(@user_factors[u, true])
+         # TODO make sure reverse isn't hurting performance
+         indexes = predictions.sort_index.reverse
+         indexes = indexes[0...[count + rated.size, indexes.size].min] if count
+         predictions = predictions[indexes]
+         ids = indexes
        end
 
-       predictions.sort_by! { |pred| -pred[:score] } # already sorted by id
-       predictions = predictions.first(count) if count && !item_ids
+       predictions.inplace.clip(@min_rating, @max_rating) if @min_rating
 
-       # clamp *after* sorting
-       # also, only needed for returned predictions
-       if @min_rating
-         predictions.each do |pred|
-           pred[:score] = pred[:score].clamp(@min_rating, @max_rating)
-         end
-       end
+       keys = @item_map.keys
+       result = []
+       ids.each_with_index do |item_id, i|
+         next if rated[item_id]
 
-       predictions
+         result << {item_id: keys[item_id], score: predictions[i]}
+         break if result.size == count
+       end
+       result
+     elsif @top_items
+       top_items(count: count)
      else
-       # no items if user is unknown
-       # TODO maybe most popular items
        []
      end
    end
 
-   def optimize_similar_items
+   def similar_items(item_id, count: 5)
      check_fit
-     @item_index = create_index(@item_factors)
+     similar(item_id, @item_map, item_norms, count, @similar_items_index)
    end
-   alias_method :optimize_item_recs, :optimize_similar_items
+   alias_method :item_recs, :similar_items
 
-   def optimize_similar_users
+   def similar_users(user_id, count: 5)
      check_fit
-     @user_index = create_index(@user_factors)
+     similar(user_id, @user_map, user_norms, count, @similar_users_index)
    end
 
-   def similar_items(item_id, count: 5)
+   def top_items(count: 5)
      check_fit
-     similar(item_id, @item_map, @item_factors, @item_index ? nil : item_norms, count, @item_index)
-   end
-   alias_method :item_recs, :similar_items
+     raise "top_items not computed" unless @top_items
 
-   def similar_users(user_id, count: 5)
-     check_fit
-     similar(user_id, @user_map, @user_factors, @user_index ? nil : user_norms, count, @user_index)
+     if @implicit
+       scores = @item_count
+     else
+       require "wilson_score"
+
+       range = @min_rating..@max_rating
+       scores = @item_sum.zip(@item_count).map { |s, c| WilsonScore.rating_lower_bound(s / c, c, range) }
+     end
+
+     scores = scores.map.with_index.sort_by { |s, _| -s }
+     scores = scores.first(count) if count
+     item_ids = item_ids()
+     scores.map do |s, i|
+       {item_id: item_ids[i], score: s}
+     end
    end
 
    def user_ids
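The rewritten `user_recs` above over-fetches `count + rated.size` candidates and then filters out items the user has already rated. The selection step can be sketched in plain Ruby (no Numo; `select_recs` is a hypothetical helper, not gem code):

```ruby
# Sketch of the new user_recs selection: over-fetch count + rated.size
# top-scored candidates, then drop already-rated items, so the final
# list still contains up to `count` unrated recommendations.
def select_recs(predictions, rated, count)
  # candidate item indexes, sorted by descending score
  indexes = predictions.each_index.sort_by { |i| -predictions[i] }
  indexes = indexes.first(count + rated.size)

  result = []
  indexes.each do |i|
    next if rated[i] # skip items the user has already rated
    result << {item_id: i, score: predictions[i]}
    break if result.size == count
  end
  result
end

preds = [0.9, 0.5, 0.8, 0.1]
rated = {0 => true} # user already rated item 0
select_recs(preds, rated, 2)
# item 0 is skipped even though it scores highest
```

Over-fetching is what makes the filter safe: without the extra `rated.size` candidates, dropping rated items could leave fewer than `count` results.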
@@ -178,17 +205,61 @@ module Disco
        end
      end
 
+     def optimize_user_recs
+       check_fit
+       @user_recs_index = create_index(item_factors, library: "faiss")
+     end
+
+     def optimize_similar_items(library: nil)
+       check_fit
+       @similar_items_index = create_index(item_norms, library: library)
+     end
+     alias_method :optimize_item_recs, :optimize_similar_items
+
+     def optimize_similar_users(library: nil)
+       check_fit
+       @similar_users_index = create_index(user_norms, library: library)
+     end
+
    private
 
-   def create_index(factors)
-     require "ngt"
+   # factors should already be normalized for similar users/items
+   def create_index(factors, library:)
+     # TODO make Faiss the default in 0.3.0
+     library ||= defined?(Faiss) && !defined?(Ngt) ? "faiss" : "ngt"
+
+     case library
+     when "faiss"
+       require "faiss"
+
+       # inner product is cosine similarity with normalized vectors
+       # https://github.com/facebookresearch/faiss/issues/95
+       #
+       # TODO use non-exact index
+       # https://github.com/facebookresearch/faiss/wiki/Faiss-indexes
+       index = Faiss::IndexFlatIP.new(factors.shape[1])
+
+       # ids are from 0...total
+       # https://github.com/facebookresearch/faiss/blob/96b740abedffc8f67389f29c2a180913941534c6/faiss/Index.h#L89
+       index.add(factors)
+
+       index
+     when "ngt"
+       require "ngt"
+
+       # could speed up search with normalized cosine
+       # https://github.com/yahoojapan/NGT/issues/36
+       index = Ngt::Index.new(factors.shape[1], distance_type: "Cosine")
 
-     # could speed up search with normalized cosine
-     # https://github.com/yahoojapan/NGT/issues/36
-     index = Ngt::Index.new(factors.shape[1], distance_type: "Cosine")
-     ids = index.batch_insert(factors)
-     raise "Unexpected ids. Please report a bug." if ids.first != 1 || ids.last != factors.shape[0]
-     index
+       # NGT normalizes so could call create_index with factors instead of norms
+       # but keep code simple for now
+       ids = index.batch_insert(factors)
+       raise "Unexpected ids. Please report a bug." if ids.first != 1 || ids.last != factors.shape[0]
+
+       index
+     else
+       raise ArgumentError, "Invalid library: #{library}"
+     end
    end
 
    def user_norms
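The `library ||=` fallback in `create_index` above reads as a small pure function: an explicit choice wins, and otherwise Faiss is preferred only when the faiss gem is loaded and ngt is not. A sketch (`pick_library` is a hypothetical name; the real code checks `defined?(Faiss)` / `defined?(Ngt)` rather than taking flags):

```ruby
# Sketch of the index-library fallback: explicit choice wins; otherwise
# prefer faiss only when it is loaded and ngt is not, which keeps
# existing NGT users on their current behavior.
def pick_library(explicit, faiss_loaded:, ngt_loaded:)
  explicit || (faiss_loaded && !ngt_loaded ? "faiss" : "ngt")
end

pick_library(nil, faiss_loaded: true, ngt_loaded: false)     # => "faiss"
pick_library(nil, faiss_loaded: true, ngt_loaded: true)      # => "ngt"
pick_library("faiss", faiss_loaded: false, ngt_loaded: true) # => "faiss" (explicit)
```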
@@ -202,40 +273,38 @@ module Disco
    def norms(factors)
      norms = Numo::SFloat::Math.sqrt((factors * factors).sum(axis: 1))
      norms[norms.eq(0)] = 1e-10 # no zeros
-     norms
+     factors / norms.expand_dims(1)
    end
 
-   def similar(id, map, factors, norms, count, index)
+   def similar(id, map, norm_factors, count, index)
      i = map[id]
-     if i
+
+     if i && norm_factors.shape[0] > 1
        if index && count
-         keys = map.keys
-         result = index.search(factors[i, true], size: count + 1)[1..-1]
-         result.map do |v|
-           {
-             # ids from batch_insert start at 1 instead of 0
-             item_id: keys[v[:id] - 1],
-             # convert cosine distance to cosine similarity
-             score: 1 - v[:distance]
-           }
+         if defined?(Faiss) && index.is_a?(Faiss::Index)
+           predictions, ids = index.search(norm_factors[i, true].expand_dims(0), count + 1).map { |v| v.to_a[0] }
+         else
+           result = index.search(norm_factors[i, true], size: count + 1)
+           # ids from batch_insert start at 1 instead of 0
+           ids = result.map { |v| v[:id] - 1 }
+           # convert cosine distance to cosine similarity
+           predictions = result.map { |v| 1 - v[:distance] }
          end
        else
-         # cosine similarity without norms[i]
-         # otherwise, denominator would be (norms[i] * norms)
-         predictions = factors.inner(factors[i, true]) / norms
-
-         predictions =
-           map.keys.zip(predictions).map do |item_id, pred|
-             {item_id: item_id, score: pred}
-           end
-
-         predictions.delete_at(i)
-         predictions.sort_by! { |pred| -pred[:score] } # already sorted by id
-         predictions = predictions.first(count) if count
-         # divide by norms[i] to get cosine similarity
-         # only need to do for returned records
-         predictions.each { |pred| pred[:score] /= norms[i] }
-         predictions
+         predictions = norm_factors.inner(norm_factors[i, true])
+         indexes = predictions.sort_index.reverse
+         indexes = indexes[0...[count + 1, indexes.size].min] if count
+         predictions = predictions[indexes]
+         ids = indexes
+       end
+
+       keys = map.keys
+
+       # TODO use user_id for similar_users in 0.3.0
+       key = :item_id
+
+       (1...ids.size).map do |i|
+         {key => keys[ids[i]], score: predictions[i]}
        end
      else
        []
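Both the rewritten `norms` (which now returns the normalized factors themselves) and the Faiss `IndexFlatIP` path rely on the same identity: the inner product of L2-normalized vectors equals cosine similarity. A plain-Ruby sketch of that identity (hypothetical helpers, not gem code):

```ruby
# With L2-normalized rows, a plain inner product equals cosine
# similarity -- the property that lets the refactored similar() use
# norm_factors.inner(...) and an inner-product index interchangeably.
def normalize(v)
  n = Math.sqrt(v.sum { |x| x * x })
  n = 1e-10 if n == 0 # avoid division by zero, as in the diff
  v.map { |x| x / n }
end

def cosine(a, b)
  dot = a.zip(b).sum { |x, y| x * y }
  dot / (Math.sqrt(a.sum { |x| x * x }) * Math.sqrt(b.sum { |x| x * x }))
end

a = [1.0, 2.0, 3.0]
b = [4.0, 5.0, 6.0]
inner = normalize(a).zip(normalize(b)).sum { |x, y| x * y }
# inner and cosine(a, b) agree to floating-point precision
```

Normalizing once at index-build time is what makes the fast path cheap: every later query is a single inner product instead of a full cosine computation.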
@@ -304,6 +373,11 @@ module Disco
        obj[:max_rating] = @max_rating
      end
 
+     if @top_items
+       obj[:item_count] = @item_count
+       obj[:item_sum] = @item_sum
+     end
+
      obj
    end
 
@@ -320,6 +394,12 @@ module Disco
        @min_rating = obj[:min_rating]
        @max_rating = obj[:max_rating]
      end
+
+     @top_items = obj.key?(:item_count)
+     if @top_items
+       @item_count = obj[:item_count]
+       @item_sum = obj[:item_sum]
+     end
    end
  end
end
data/lib/disco/version.rb CHANGED
@@ -1,3 +1,3 @@
  module Disco
-   VERSION = "0.2.4"
+   VERSION = "0.2.5"
  end
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: disco
  version: !ruby/object:Gem::Version
-   version: 0.2.4
+   version: 0.2.5
  platform: ruby
  authors:
  - Andrew Kane
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2021-02-16 00:00:00.000000000 Z
+ date: 2021-02-20 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
    name: libmf
@@ -51,6 +51,7 @@ files:
  - lib/disco.rb
  - lib/disco/data.rb
  - lib/disco/engine.rb
+ - lib/disco/metrics.rb
  - lib/disco/model.rb
  - lib/disco/recommender.rb
  - lib/disco/version.rb