bayesic_matching 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: df98c7d569ac9e2e548452b9a3b40656b3626a91
4
- data.tar.gz: 8da8d971e361eb88303054ca11ab83deeab10a6c
3
+ metadata.gz: 48a60d5172b83ebb7733aa8a6e04700c19dd557d
4
+ data.tar.gz: 8a03feebb8928b702ecbde0d7fe0856980350d0a
5
5
  SHA512:
6
- metadata.gz: 3b0bffa3d5381e085f7376e76053ed47b3593b9cc51a64590ea15f889a763b8fd71a40a8a5f0820444221d31990fed2936ee17d06e23fdab8cd752e2ed55e3f7
7
- data.tar.gz: 64d046aff06b505337e8f3593f9cd4a89e365fdcad65cd482d2d4c3782ed6f5e33b696f0a372c93ec7257eff3f39d5681410d7ba0787cee5ebaf0f2ab72c4687
6
+ metadata.gz: cadcfb974116f4403815f70acdaae5042170af5967fb70f13de54fe02b6c170e431f9cf697fdaba0346dd7c8673f89b0f4b352577bd3841bacba30eda2fa711a
7
+ data.tar.gz: 2041479d0e12d17b76af098246d1597f07edb26f07319686e3a38564cd169f08e9b446b822559009ef30cb4a0b8779fe74508d928814108cc4dd115722c18e67
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- bayesic_matching (0.2.0)
4
+ bayesic_matching (0.3.0)
5
5
 
6
6
  GEM
7
7
  remote: https://rubygems.org/
@@ -34,4 +34,4 @@ DEPENDENCIES
34
34
  rspec (~> 3.0)
35
35
 
36
36
  BUNDLED WITH
37
- 1.16.0
37
+ 1.16.1
data/README.md CHANGED
@@ -7,15 +7,24 @@ This is useful if you have two lists of names or titles and you want to match be
7
7
  ## Usage
8
8
 
9
9
  ```ruby
10
- matcher = BayesicMatching.new
11
- matcher.train(["it","was","the","best","of","times"], "novel")
12
- matcher.train(["tonight","on","the","seven","o'clock"], "news")
10
+ trainer = BayesicMatching.new
11
+ trainer.train(["it","was","the","best","of","times"], "novel")
12
+ trainer.train(["tonight","on","the","seven","o'clock"], "news")
13
+ matcher = trainer.finalize
13
14
 
14
15
  matcher.classify(["the","best","of"])
15
16
  # => {"novel"=>1.0, "news"=>0.667}
16
17
  matcher.classify(["the","time"])
17
18
  # => {"novel"=>0.667, "news"=>0.667}
18
- ```
19
+ ```
20
+
21
+ ## Pruning
22
+
23
+ One of the fastest ways to improve the speed of matching is to prune common tokens.
24
+ For example, if the token "the" is present in every single classification, then its presence doesn't tell you much about your confidence in a match, but `BayesicMatching` would still return a confidence for every possible classification.
25
+ To avoid this there is a default pruning where any token that exists in more than 50% of your classifications will get pruned during the `finalize` call.
26
+ You can tune this pruning by passing `.finalize(pruning_percent: 0.25)`.
27
+ In my own usage I've found that pruning tokens that exist in more than 20% of all classifications (`pruning_percent: 0.2`) has almost no impact on accuracy, but gives me a significant speed boost.
19
28
 
20
29
  ## How It Works
21
30
 
@@ -33,11 +33,11 @@ matching_rows = []
33
33
  end
34
34
 
35
35
  def train_matcher(training_rows)
36
- matcher = BayesicMatching.new
36
+ trainer = BayesicMatching.new
37
37
  training_rows.each do |row|
38
- matcher.train(row[:tokens], row[:id])
38
+ trainer.train(row[:tokens], row[:id])
39
39
  end
40
- matcher
40
+ trainer.finalize(pruning_percent: 0.2)
41
41
  end
42
42
 
43
43
  def attempt_matches(matcher, matching_rows, print_mismatch_data = false)
@@ -1,31 +1,27 @@
1
1
  require "bayesic_matching/version"
2
+ require "bayesic_matching/matcher"
2
3
  require "set"
3
4
 
4
5
  class BayesicMatching
5
6
  def initialize
6
7
  @classifications = ::Set.new
7
8
  @classifications_by_token = {}
8
- @tokens_by_classification = {}
9
9
  end
10
10
 
11
- def classify(tokens)
12
- tokens = tokens.reject{|t| @classifications_by_token[t].nil? }.uniq
13
- tokens.each_with_object({}) do |token, hash|
14
- @classifications_by_token[token].each do |c|
15
- p_klass = hash[c] || (1.0 / @classifications.size)
16
- p_not_klass = 1.0 - p_klass
17
- p_token_given_klass = 1.0
18
- p_token_given_not_klass = (@classifications_by_token[token].size - 1) / @classifications.size.to_f
19
- hash[c] = (p_token_given_klass * p_klass) / ((p_token_given_klass * p_klass) + (p_token_given_not_klass * p_not_klass))
20
- end
11
+ def finalize(opts = {})
12
+ pruning_percent = opts.fetch(:pruning_percent, 0.5)
13
+ threshold = @classifications.size * pruning_percent
14
+ by_token = @classifications_by_token.each_with_object({}) do |(token, classifications), hash|
15
+ class_count = classifications.size
16
+ next if class_count > threshold
17
+ hash[token] = {count: class_count, classifications: classifications}
21
18
  end
19
+ BayesicMatching::Matcher.new(class_count: @classifications.size, by_token: by_token)
22
20
  end
23
21
 
24
22
  def train(tokens, classification)
25
23
  @classifications << classification
26
- @tokens_by_classification[classification] ||= ::Set.new
27
24
  tokens.each do |token|
28
- @tokens_by_classification[classification] << token
29
25
  @classifications_by_token[token] ||= ::Set.new
30
26
  @classifications_by_token[token] << classification
31
27
  end
@@ -0,0 +1,22 @@
1
+ class BayesicMatching
2
+ class Matcher
3
+ def initialize(class_count:, by_token:)
4
+ @class_count = class_count
5
+ @by_token = by_token
6
+ @prior = 1.0 / class_count
7
+ end
8
+
9
+ def classify(tokens)
10
+ tokens = tokens.reject{|t| !@by_token.has_key?(t) }.uniq
11
+ tokens.each_with_object({}) do |token, hash|
12
+ @by_token[token][:classifications].each do |c|
13
+ p_klass = hash[c] || @prior
14
+ p_not_klass = 1.0 - p_klass
15
+ p_token_given_klass = 1.0
16
+ p_token_given_not_klass = (@by_token[token][:count] - 1) / @class_count.to_f
17
+ hash[c] = (p_token_given_klass * p_klass) / ((p_token_given_klass * p_klass) + (p_token_given_not_klass * p_not_klass))
18
+ end
19
+ end
20
+ end
21
+ end
22
+ end
@@ -1,3 +1,3 @@
1
1
  class BayesicMatching
2
- VERSION = "0.2.0"
2
+ VERSION = "0.3.0"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: bayesic_matching
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Michael Ries
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2017-12-14 00:00:00.000000000 Z
11
+ date: 2018-01-12 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: benchmark-ips
@@ -88,6 +88,7 @@ files:
88
88
  - examples/favorite_recent_movies.csv
89
89
  - examples/popular_recent_movies.csv
90
90
  - lib/bayesic_matching.rb
91
+ - lib/bayesic_matching/matcher.rb
91
92
  - lib/bayesic_matching/version.rb
92
93
  homepage: https://github.com/mmmries/bayesic_matching
93
94
  licenses: