RubyGems - bayesic_matching - Versions diffs - 0.2.0 → 0.3.0 - Mend

bayesic_matching 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

checksums.yaml +4 -4
data/Gemfile.lock +2 -2
data/README.md +13 -4
data/examples/benchmark.rb +3 -3
data/lib/bayesic_matching.rb +9 -13
data/lib/bayesic_matching/matcher.rb +22 -0
data/lib/bayesic_matching/version.rb +1 -1
metadata +3 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: df98c7d569ac9e2e548452b9a3b40656b3626a91
-  data.tar.gz: 8da8d971e361eb88303054ca11ab83deeab10a6c
+  metadata.gz: 48a60d5172b83ebb7733aa8a6e04700c19dd557d
+  data.tar.gz: 8a03feebb8928b702ecbde0d7fe0856980350d0a
 SHA512:
-  metadata.gz: 3b0bffa3d5381e085f7376e76053ed47b3593b9cc51a64590ea15f889a763b8fd71a40a8a5f0820444221d31990fed2936ee17d06e23fdab8cd752e2ed55e3f7
-  data.tar.gz: 64d046aff06b505337e8f3593f9cd4a89e365fdcad65cd482d2d4c3782ed6f5e33b696f0a372c93ec7257eff3f39d5681410d7ba0787cee5ebaf0f2ab72c4687
+  metadata.gz: cadcfb974116f4403815f70acdaae5042170af5967fb70f13de54fe02b6c170e431f9cf697fdaba0346dd7c8673f89b0f4b352577bd3841bacba30eda2fa711a
+  data.tar.gz: 2041479d0e12d17b76af098246d1597f07edb26f07319686e3a38564cd169f08e9b446b822559009ef30cb4a0b8779fe74508d928814108cc4dd115722c18e67

data/Gemfile.lock CHANGED Viewed

@@ -1,7 +1,7 @@
 PATH
   remote: .
   specs:
-    bayesic_matching (0.2.0)
+    bayesic_matching (0.3.0)
 GEM
   remote: https://rubygems.org/
@@ -34,4 +34,4 @@ DEPENDENCIES
   rspec (~> 3.0)
 BUNDLED WITH
-   1.16.0
+   1.16.1

data/README.md CHANGED Viewed

@@ -7,15 +7,24 @@ This is useful if you have two lists of names or titles and you want to match be
 ## Usage
 ```ruby
-matcher = BayesicMatching.new
-matcher.train(["it","was","the","best","of","times"], "novel")
-matcher.train(["tonight","on","the","seven","o'clock"], "news")
+trainer = BayesicMatching.new
+trainer.train(["it","was","the","best","of","times"], "novel")
+trainer.train(["tonight","on","the","seven","o'clock"], "news")
+matcher = trainer.finalize
 matcher.classify(["the","best","of"])
 # => {"novel"=>1.0, "news"=>0.667}
 matcher.classify(["the","time"])
 #  => {"novel"=>0.667, "news"=>0.667}
-```
+```
+## Pruning
+One of the fastest ways to improve the speed of matching is to prune common tokens.
+For example, if the token "the" is present in every single classification, then its presence doesn't tell you much about your confidence of a match, but `BayesicMatching` would now return a confidence for every possible classification.
+To avoid this there is a default pruning where any token that exists in more than 50% of your classifications will get pruned during the `finalize` call.
+You can tune this pruning by passing `.finalize(pruning_percent: 0.25)`.
+In my own usage I've found that pruning tokens that exist in more than `0.2` of all classifications has almost no impact on accuracy, but gives me a significant speed boost.
 ## How It Works

data/examples/benchmark.rb CHANGED Viewed

@@ -33,11 +33,11 @@ matching_rows = []
 end
 def train_matcher(training_rows)
-  matcher = BayesicMatching.new
+  trainer = BayesicMatching.new
   training_rows.each do |row|
-    matcher.train(row[:tokens], row[:id])
+    trainer.train(row[:tokens], row[:id])
   end
-  matcher
+  trainer.finalize(pruning_percent: 0.2)
 end
 def attempt_matches(matcher, matching_rows, print_mismatch_data = false)

data/lib/bayesic_matching.rb CHANGED Viewed

@@ -1,31 +1,27 @@
 require "bayesic_matching/version"
+require "bayesic_matching/matcher"
 require "set"
 class BayesicMatching
   def initialize
     @classifications = ::Set.new
     @classifications_by_token = {}
-    @tokens_by_classification = {}
   end
-  def classify(tokens)
-    tokens = tokens.reject{|t| @classifications_by_token[t].nil? }.uniq
-    tokens.each_with_object({}) do |token, hash|
-      @classifications_by_token[token].each do |c|
-        p_klass = hash[c] || (1.0 / @classifications.size)
-        p_not_klass = 1.0 - p_klass
-        p_token_given_klass = 1.0
-        p_token_given_not_klass = (@classifications_by_token[token].size - 1) / @classifications.size.to_f
-        hash[c] = (p_token_given_klass * p_klass) / ((p_token_given_klass * p_klass) + (p_token_given_not_klass * p_not_klass))
-      end
+  def finalize(opts = {})
+    pruning_percent = opts.fetch(:pruning_percent, 0.5)
+    threshold = @classifications.size * pruning_percent
+    by_token = @classifications_by_token.each_with_object({}) do |(token, classifications), hash|
+      class_count = classifications.size
+      next if class_count > threshold
+      hash[token] = {count: class_count, classifications: classifications}
     end
+    BayesicMatching::Matcher.new(class_count: @classifications.size, by_token: by_token)
   end
   def train(tokens, classification)
     @classifications << classification
-    @tokens_by_classification[classification] ||= ::Set.new
     tokens.each do |token|
-      @tokens_by_classification[classification] << token
       @classifications_by_token[token] ||= ::Set.new
       @classifications_by_token[token] << classification
     end

data/lib/bayesic_matching/matcher.rb ADDED Viewed

@@ -0,0 +1,22 @@
+class BayesicMatching
+  class Matcher
+    def initialize(class_count:, by_token:)
+      @class_count = class_count
+      @by_token = by_token
+      @prior = 1.0 / class_count
+    end
+    def classify(tokens)
+      tokens = tokens.reject{|t| !@by_token.has_key?(t) }.uniq
+      tokens.each_with_object({}) do |token, hash|
+        @by_token[token][:classifications].each do |c|
+          p_klass = hash[c] || @prior
+          p_not_klass = 1.0 - p_klass
+          p_token_given_klass = 1.0
+          p_token_given_not_klass = (@by_token[token][:count] - 1) / @class_count.to_f
+          hash[c] = (p_token_given_klass * p_klass) / ((p_token_given_klass * p_klass) + (p_token_given_not_klass * p_not_klass))
+        end
+      end
+    end
+  end
+end

data/lib/bayesic_matching/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 class BayesicMatching
-  VERSION = "0.2.0"
+  VERSION = "0.3.0"
 end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: bayesic_matching
 version: !ruby/object:Gem::Version
-  version: 0.2.0
+  version: 0.3.0
 platform: ruby
 authors:
 - Michael Ries
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2017-12-14 00:00:00.000000000 Z
+date: 2018-01-12 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: benchmark-ips
@@ -88,6 +88,7 @@ files:
 - examples/favorite_recent_movies.csv
 - examples/popular_recent_movies.csv
 - lib/bayesic_matching.rb
+- lib/bayesic_matching/matcher.rb
 - lib/bayesic_matching/version.rb
 homepage: https://github.com/mmmries/bayesic_matching
 licenses: