RubyGems - bayesic_matching - Versions diffs - 0.2.0 → 0.3.0 - Mend

bayesic_matching 0.2.0 → 0.3.0

Files changed (8) hide show

checksums.yaml +4 -4
data/Gemfile.lock +2 -2
data/README.md +13 -4
data/examples/benchmark.rb +3 -3
data/lib/bayesic_matching.rb +9 -13
data/lib/bayesic_matching/matcher.rb +22 -0
data/lib/bayesic_matching/version.rb +1 -1
metadata +3 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: df98c7d569ac9e2e548452b9a3b40656b3626a91
-  data.tar.gz: 8da8d971e361eb88303054ca11ab83deeab10a6c
+  metadata.gz: 48a60d5172b83ebb7733aa8a6e04700c19dd557d
+  data.tar.gz: 8a03feebb8928b702ecbde0d7fe0856980350d0a
 SHA512:
-  metadata.gz: 3b0bffa3d5381e085f7376e76053ed47b3593b9cc51a64590ea15f889a763b8fd71a40a8a5f0820444221d31990fed2936ee17d06e23fdab8cd752e2ed55e3f7
-  data.tar.gz: 64d046aff06b505337e8f3593f9cd4a89e365fdcad65cd482d2d4c3782ed6f5e33b696f0a372c93ec7257eff3f39d5681410d7ba0787cee5ebaf0f2ab72c4687
+  metadata.gz: cadcfb974116f4403815f70acdaae5042170af5967fb70f13de54fe02b6c170e431f9cf697fdaba0346dd7c8673f89b0f4b352577bd3841bacba30eda2fa711a
+  data.tar.gz: 2041479d0e12d17b76af098246d1597f07edb26f07319686e3a38564cd169f08e9b446b822559009ef30cb4a0b8779fe74508d928814108cc4dd115722c18e67

data/Gemfile.lock CHANGED Viewed

@@ -1,7 +1,7 @@
 PATH
   remote: .
   specs:
-    bayesic_matching (0.2.0)
+    bayesic_matching (0.3.0)
 GEM
   remote: https://rubygems.org/
@@ -34,4 +34,4 @@ DEPENDENCIES
   rspec (~> 3.0)
 BUNDLED WITH
-   1.16.0
+   1.16.1

data/README.md CHANGED Viewed

@@ -7,15 +7,24 @@ This is useful if you have two lists of names or titles and you want to match be
 ## Usage
 ```ruby
-matcher = BayesicMatching.new
-matcher.train(["it","was","the","best","of","times"], "novel")
-matcher.train(["tonight","on","the","seven","o'clock"], "news")
+trainer = BayesicMatching.new
+trainer.train(["it","was","the","best","of","times"], "novel")
+trainer.train(["tonight","on","the","seven","o'clock"], "news")
+matcher = trainer.finalize
 matcher.classify(["the","best","of"])
 # => {"novel"=>1.0, "news"=>0.667}
 matcher.classify(["the","time"])
 #  => {"novel"=>0.667, "news"=>0.667}
-```
+```
+## Pruning
+One of the fastest ways to improve the speed of matching is to prune common tokens.
+For example, if the token "the" is present in every single classification, then its presence doesn't tell you much about your confidence of a match, but `BayesicMatching` would now return a confidence for every possible classification.
+To avoid this there is a default pruning where any token that exists in more than 50% of your classifications will get pruned during the `finalize` call.
+You can tune this pruning by passing `.finalize(pruning_percent: 0.25)`.
+In my own usage I've found that pruning tokens that exist in more than `0.2` of all classifications has almost no impact on accuracy, but gives me a significant speed boost.
 ## How It Works

data/examples/benchmark.rb CHANGED Viewed

@@ -33,11 +33,11 @@ matching_rows = []
 end
 def train_matcher(training_rows)
-  matcher = BayesicMatching.new
+  trainer = BayesicMatching.new
   training_rows.each do |row|
-    matcher.train(row[:tokens], row[:id])
+    trainer.train(row[:tokens], row[:id])
   end
-  matcher
+  trainer.finalize(pruning_percent: 0.2)
 end
 def attempt_matches(matcher, matching_rows, print_mismatch_data = false)

data/lib/bayesic_matching.rb CHANGED Viewed

@@ -1,31 +1,27 @@
 require "bayesic_matching/version"
+require "bayesic_matching/matcher"
 require "set"
 class BayesicMatching
   def initialize
     @classifications = ::Set.new
     @classifications_by_token = {}
-    @tokens_by_classification = {}
   end
-  def classify(tokens)
-    tokens = tokens.reject{|t| @classifications_by_token[t].nil? }.uniq
-    tokens.each_with_object({}) do |token, hash|
-      @classifications_by_token[token].each do |c|
-        p_klass = hash[c] || (1.0 / @classifications.size)
-        p_not_klass = 1.0 - p_klass
-        p_token_given_klass = 1.0
-        p_token_given_not_klass = (@classifications_by_token[token].size - 1) / @classifications.size.to_f
-        hash[c] = (p_token_given_klass * p_klass) / ((p_token_given_klass * p_klass) + (p_token_given_not_klass * p_not_klass))
-      end
+  def finalize(opts = {})
+    pruning_percent = opts.fetch(:pruning_percent, 0.5)
+    threshold = @classifications.size * pruning_percent
+    by_token = @classifications_by_token.each_with_object({}) do |(token, classifications), hash|
+      class_count = classifications.size
+      next if class_count > threshold
+      hash[token] = {count: class_count, classifications: classifications}
     end
+    BayesicMatching::Matcher.new(class_count: @classifications.size, by_token: by_token)
   end
   def train(tokens, classification)
     @classifications << classification
-    @tokens_by_classification[classification] ||= ::Set.new
     tokens.each do |token|
-      @tokens_by_classification[classification] << token
       @classifications_by_token[token] ||= ::Set.new
       @classifications_by_token[token] << classification
     end

data/lib/bayesic_matching/matcher.rb ADDED Viewed

@@ -0,0 +1,22 @@
+class BayesicMatching
+  class Matcher
+    def initialize(class_count:, by_token:)
+      @class_count = class_count
+      @by_token = by_token
+      @prior = 1.0 / class_count
+    end
+    def classify(tokens)
+      tokens = tokens.reject{|t| !@by_token.has_key?(t) }.uniq
+      tokens.each_with_object({}) do |token, hash|
+        @by_token[token][:classifications].each do |c|
+          p_klass = hash[c] || @prior
+          p_not_klass = 1.0 - p_klass
+          p_token_given_klass = 1.0
+          p_token_given_not_klass = (@by_token[token][:count] - 1) / @class_count.to_f
+          hash[c] = (p_token_given_klass * p_klass) / ((p_token_given_klass * p_klass) + (p_token_given_not_klass * p_not_klass))
+        end
+      end
+    end
+  end
+end

data/lib/bayesic_matching/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 class BayesicMatching
-  VERSION = "0.2.0"
+  VERSION = "0.3.0"
 end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: bayesic_matching
 version: !ruby/object:Gem::Version
-  version: 0.2.0
+  version: 0.3.0
 platform: ruby
 authors:
 - Michael Ries
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2017-12-14 00:00:00.000000000 Z
+date: 2018-01-12 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: benchmark-ips
@@ -88,6 +88,7 @@ files:
 - examples/favorite_recent_movies.csv
 - examples/popular_recent_movies.csv
 - lib/bayesic_matching.rb
+- lib/bayesic_matching/matcher.rb
 - lib/bayesic_matching/version.rb
 homepage: https://github.com/mmmries/bayesic_matching
 licenses: