bayesic_matching 0.2.0 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/README.md +13 -4
- data/examples/benchmark.rb +3 -3
- data/lib/bayesic_matching.rb +9 -13
- data/lib/bayesic_matching/matcher.rb +22 -0
- data/lib/bayesic_matching/version.rb +1 -1
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 48a60d5172b83ebb7733aa8a6e04700c19dd557d
|
4
|
+
data.tar.gz: 8a03feebb8928b702ecbde0d7fe0856980350d0a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: cadcfb974116f4403815f70acdaae5042170af5967fb70f13de54fe02b6c170e431f9cf697fdaba0346dd7c8673f89b0f4b352577bd3841bacba30eda2fa711a
|
7
|
+
data.tar.gz: 2041479d0e12d17b76af098246d1597f07edb26f07319686e3a38564cd169f08e9b446b822559009ef30cb4a0b8779fe74508d928814108cc4dd115722c18e67
|
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -7,15 +7,24 @@ This is useful if you have two lists of names or titles and you want to match be
|
|
7
7
|
## Usage
|
8
8
|
|
9
9
|
```ruby
|
10
|
-
|
11
|
-
|
12
|
-
|
10
|
+
trainer = BayesicMatching.new
|
11
|
+
trainer.train(["it","was","the","best","of","times"], "novel")
|
12
|
+
trainer.train(["tonight","on","the","seven","o'clock"], "news")
|
13
|
+
matcher = trainer.finalize
|
13
14
|
|
14
15
|
matcher.classify(["the","best","of"])
|
15
16
|
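# => {"novel"=>1.0, "news"=>0.667}  (with pruning disabled via `finalize(pruning_percent: 1.0)`; under the default 50% pruning "the" is dropped and the result is {"novel"=>1.0})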
|
16
17
|
matcher.classify(["the","time"])
|
17
18
|
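# => {"novel"=>0.667, "news"=>0.667}  (with pruning disabled via `finalize(pruning_percent: 1.0)`; under the default 50% pruning "the" is dropped and the result is {})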
|
18
|
-
```
|
19
|
+
```
|
20
|
+
|
21
|
+
## Pruning
|
22
|
+
|
23
|
+
One of the fastest ways to improve the speed of matching is to prune common tokens.
|
24
|
+
For example, if the token "the" is present in every single classification, then its presence doesn't tell you much about the confidence of a match, but `BayesicMatching` would still have to return a confidence for every possible classification.
|
25
|
+
To avoid this, there is a default pruning step: any token that exists in more than 50% of your classifications gets pruned during the `finalize` call.
|
26
|
+
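You can tune this pruning by passing a different fraction between 0 and 1, e.g. `.finalize(pruning_percent: 0.25)` prunes any token that exists in more than 25% of your classifications.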
|
27
|
+
In my own usage I've found that pruning tokens that exist in more than 20% of all classifications (`pruning_percent: 0.2`) has almost no impact on accuracy, but gives me a significant speed boost.
|
19
28
|
|
20
29
|
## How It Works
|
21
30
|
|
data/examples/benchmark.rb
CHANGED
@@ -33,11 +33,11 @@ matching_rows = []
|
|
33
33
|
end
|
34
34
|
|
35
35
|
def train_matcher(training_rows)
|
36
|
-
|
36
|
+
trainer = BayesicMatching.new
|
37
37
|
training_rows.each do |row|
|
38
|
-
|
38
|
+
trainer.train(row[:tokens], row[:id])
|
39
39
|
end
|
40
|
-
|
40
|
+
trainer.finalize(pruning_percent: 0.2)
|
41
41
|
end
|
42
42
|
|
43
43
|
def attempt_matches(matcher, matching_rows, print_mismatch_data = false)
|
data/lib/bayesic_matching.rb
CHANGED
@@ -1,31 +1,27 @@
|
|
1
1
|
require "bayesic_matching/version"
|
2
|
+
require "bayesic_matching/matcher"
|
2
3
|
require "set"
|
3
4
|
|
4
5
|
class BayesicMatching
|
5
6
|
def initialize
|
6
7
|
@classifications = ::Set.new
|
7
8
|
@classifications_by_token = {}
|
8
|
-
@tokens_by_classification = {}
|
9
9
|
end
|
10
10
|
|
11
|
-
def
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
p_token_given_not_klass = (@classifications_by_token[token].size - 1) / @classifications.size.to_f
|
19
|
-
hash[c] = (p_token_given_klass * p_klass) / ((p_token_given_klass * p_klass) + (p_token_given_not_klass * p_not_klass))
|
20
|
-
end
|
11
|
+
def finalize(opts = {})
|
12
|
+
pruning_percent = opts.fetch(:pruning_percent, 0.5)
|
13
|
+
threshold = @classifications.size * pruning_percent
|
14
|
+
by_token = @classifications_by_token.each_with_object({}) do |(token, classifications), hash|
|
15
|
+
class_count = classifications.size
|
16
|
+
next if class_count > threshold
|
17
|
+
hash[token] = {count: class_count, classifications: classifications}
|
21
18
|
end
|
19
|
+
BayesicMatching::Matcher.new(class_count: @classifications.size, by_token: by_token)
|
22
20
|
end
|
23
21
|
|
24
22
|
def train(tokens, classification)
|
25
23
|
@classifications << classification
|
26
|
-
@tokens_by_classification[classification] ||= ::Set.new
|
27
24
|
tokens.each do |token|
|
28
|
-
@tokens_by_classification[classification] << token
|
29
25
|
@classifications_by_token[token] ||= ::Set.new
|
30
26
|
@classifications_by_token[token] << classification
|
31
27
|
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
class BayesicMatching
|
2
|
+
class Matcher
|
3
|
+
def initialize(class_count:, by_token:)
|
4
|
+
@class_count = class_count
|
5
|
+
@by_token = by_token
|
6
|
+
@prior = 1.0 / class_count
|
7
|
+
end
|
8
|
+
|
9
|
+
def classify(tokens)
|
10
|
+
tokens = tokens.reject{|t| !@by_token.has_key?(t) }.uniq
|
11
|
+
tokens.each_with_object({}) do |token, hash|
|
12
|
+
@by_token[token][:classifications].each do |c|
|
13
|
+
p_klass = hash[c] || @prior
|
14
|
+
p_not_klass = 1.0 - p_klass
|
15
|
+
p_token_given_klass = 1.0
|
16
|
+
p_token_given_not_klass = (@by_token[token][:count] - 1) / @class_count.to_f
|
17
|
+
hash[c] = (p_token_given_klass * p_klass) / ((p_token_given_klass * p_klass) + (p_token_given_not_klass * p_not_klass))
|
18
|
+
end
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
22
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: bayesic_matching
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Michael Ries
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2018-01-12 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: benchmark-ips
|
@@ -88,6 +88,7 @@ files:
|
|
88
88
|
- examples/favorite_recent_movies.csv
|
89
89
|
- examples/popular_recent_movies.csv
|
90
90
|
- lib/bayesic_matching.rb
|
91
|
+
- lib/bayesic_matching/matcher.rb
|
91
92
|
- lib/bayesic_matching/version.rb
|
92
93
|
homepage: https://github.com/mmmries/bayesic_matching
|
93
94
|
licenses:
|