bayesic_matching 0.2.0 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: df98c7d569ac9e2e548452b9a3b40656b3626a91
4
- data.tar.gz: 8da8d971e361eb88303054ca11ab83deeab10a6c
3
+ metadata.gz: 48a60d5172b83ebb7733aa8a6e04700c19dd557d
4
+ data.tar.gz: 8a03feebb8928b702ecbde0d7fe0856980350d0a
5
5
  SHA512:
6
- metadata.gz: 3b0bffa3d5381e085f7376e76053ed47b3593b9cc51a64590ea15f889a763b8fd71a40a8a5f0820444221d31990fed2936ee17d06e23fdab8cd752e2ed55e3f7
7
- data.tar.gz: 64d046aff06b505337e8f3593f9cd4a89e365fdcad65cd482d2d4c3782ed6f5e33b696f0a372c93ec7257eff3f39d5681410d7ba0787cee5ebaf0f2ab72c4687
6
+ metadata.gz: cadcfb974116f4403815f70acdaae5042170af5967fb70f13de54fe02b6c170e431f9cf697fdaba0346dd7c8673f89b0f4b352577bd3841bacba30eda2fa711a
7
+ data.tar.gz: 2041479d0e12d17b76af098246d1597f07edb26f07319686e3a38564cd169f08e9b446b822559009ef30cb4a0b8779fe74508d928814108cc4dd115722c18e67
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- bayesic_matching (0.2.0)
4
+ bayesic_matching (0.3.0)
5
5
 
6
6
  GEM
7
7
  remote: https://rubygems.org/
@@ -34,4 +34,4 @@ DEPENDENCIES
34
34
  rspec (~> 3.0)
35
35
 
36
36
  BUNDLED WITH
37
- 1.16.0
37
+ 1.16.1
data/README.md CHANGED
@@ -7,15 +7,24 @@ This is useful if you have two lists of names or titles and you want to match be
7
7
  ## Usage
8
8
 
9
9
  ```ruby
10
- matcher = BayesicMatching.new
11
- matcher.train(["it","was","the","best","of","times"], "novel")
12
- matcher.train(["tonight","on","the","seven","o'clock"], "news")
10
+ trainer = BayesicMatching.new
11
+ trainer.train(["it","was","the","best","of","times"], "novel")
12
+ trainer.train(["tonight","on","the","seven","o'clock"], "news")
13
+ matcher = trainer.finalize
13
14
 
14
15
  matcher.classify(["the","best","of"])
15
16
  # => {"novel"=>1.0, "news"=>0.667}
16
17
  matcher.classify(["the","time"])
17
18
  # => {"novel"=>0.667, "news"=>0.667}
18
- ```
19
+ ```
20
+
21
+ ## Pruning
22
+
23
+ One of the fastest ways to improve the speed of matching is to prune common tokens.
24
+ For example, if the token "the" is present in every single classification, then its presence doesn't tell you much about your confidence of a match, but `BayesicMatching` would still return a confidence for every possible classification.
25
+ To avoid this there is a default pruning where any token that exists in more than 50% of your classifications will get pruned during the `finalize` call.
26
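+ You can tune this pruning by passing a fraction between 0 and 1 to `finalize`, e.g. `.finalize(pruning_percent: 0.25)` prunes any token that exists in more than 25% of your classifications.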
27
+ In my own usage I've found that pruning tokens that exist in more than 20% of all classifications (`pruning_percent: 0.2`) has almost no impact on accuracy, but gives me a significant speed boost.
19
28
 
20
29
  ## How It Works
21
30
 
@@ -33,11 +33,11 @@ matching_rows = []
33
33
  end
34
34
 
35
35
  def train_matcher(training_rows)
36
- matcher = BayesicMatching.new
36
+ trainer = BayesicMatching.new
37
37
  training_rows.each do |row|
38
- matcher.train(row[:tokens], row[:id])
38
+ trainer.train(row[:tokens], row[:id])
39
39
  end
40
- matcher
40
+ trainer.finalize(pruning_percent: 0.2)
41
41
  end
42
42
 
43
43
  def attempt_matches(matcher, matching_rows, print_mismatch_data = false)
@@ -1,31 +1,27 @@
1
1
  require "bayesic_matching/version"
2
+ require "bayesic_matching/matcher"
2
3
  require "set"
3
4
 
4
5
  class BayesicMatching
5
6
  def initialize
6
7
  @classifications = ::Set.new
7
8
  @classifications_by_token = {}
8
- @tokens_by_classification = {}
9
9
  end
10
10
 
11
- def classify(tokens)
12
- tokens = tokens.reject{|t| @classifications_by_token[t].nil? }.uniq
13
- tokens.each_with_object({}) do |token, hash|
14
- @classifications_by_token[token].each do |c|
15
- p_klass = hash[c] || (1.0 / @classifications.size)
16
- p_not_klass = 1.0 - p_klass
17
- p_token_given_klass = 1.0
18
- p_token_given_not_klass = (@classifications_by_token[token].size - 1) / @classifications.size.to_f
19
- hash[c] = (p_token_given_klass * p_klass) / ((p_token_given_klass * p_klass) + (p_token_given_not_klass * p_not_klass))
20
- end
11
+ def finalize(opts = {})
12
+ pruning_percent = opts.fetch(:pruning_percent, 0.5)
13
+ threshold = @classifications.size * pruning_percent
14
+ by_token = @classifications_by_token.each_with_object({}) do |(token, classifications), hash|
15
+ class_count = classifications.size
16
+ next if class_count > threshold
17
+ hash[token] = {count: class_count, classifications: classifications}
21
18
  end
19
+ BayesicMatching::Matcher.new(class_count: @classifications.size, by_token: by_token)
22
20
  end
23
21
 
24
22
  def train(tokens, classification)
25
23
  @classifications << classification
26
- @tokens_by_classification[classification] ||= ::Set.new
27
24
  tokens.each do |token|
28
- @tokens_by_classification[classification] << token
29
25
  @classifications_by_token[token] ||= ::Set.new
30
26
  @classifications_by_token[token] << classification
31
27
  end
@@ -0,0 +1,22 @@
1
+ class BayesicMatching
2
+ class Matcher
3
+ def initialize(class_count:, by_token:)
4
+ @class_count = class_count
5
+ @by_token = by_token
6
+ @prior = 1.0 / class_count
7
+ end
8
+
9
+ def classify(tokens)
10
+ tokens = tokens.reject{|t| !@by_token.has_key?(t) }.uniq
11
+ tokens.each_with_object({}) do |token, hash|
12
+ @by_token[token][:classifications].each do |c|
13
+ p_klass = hash[c] || @prior
14
+ p_not_klass = 1.0 - p_klass
15
+ p_token_given_klass = 1.0
16
+ p_token_given_not_klass = (@by_token[token][:count] - 1) / @class_count.to_f
17
+ hash[c] = (p_token_given_klass * p_klass) / ((p_token_given_klass * p_klass) + (p_token_given_not_klass * p_not_klass))
18
+ end
19
+ end
20
+ end
21
+ end
22
+ end
@@ -1,3 +1,3 @@
1
1
  class BayesicMatching
2
- VERSION = "0.2.0"
2
+ VERSION = "0.3.0"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: bayesic_matching
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Michael Ries
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2017-12-14 00:00:00.000000000 Z
11
+ date: 2018-01-12 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: benchmark-ips
@@ -88,6 +88,7 @@ files:
88
88
  - examples/favorite_recent_movies.csv
89
89
  - examples/popular_recent_movies.csv
90
90
  - lib/bayesic_matching.rb
91
+ - lib/bayesic_matching/matcher.rb
91
92
  - lib/bayesic_matching/version.rb
92
93
  homepage: https://github.com/mmmries/bayesic_matching
93
94
  licenses: