bayesic_matching 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/README.md +13 -4
- data/examples/benchmark.rb +3 -3
- data/lib/bayesic_matching.rb +9 -13
- data/lib/bayesic_matching/matcher.rb +22 -0
- data/lib/bayesic_matching/version.rb +1 -1
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 48a60d5172b83ebb7733aa8a6e04700c19dd557d
|
4
|
+
data.tar.gz: 8a03feebb8928b702ecbde0d7fe0856980350d0a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: cadcfb974116f4403815f70acdaae5042170af5967fb70f13de54fe02b6c170e431f9cf697fdaba0346dd7c8673f89b0f4b352577bd3841bacba30eda2fa711a
|
7
|
+
data.tar.gz: 2041479d0e12d17b76af098246d1597f07edb26f07319686e3a38564cd169f08e9b446b822559009ef30cb4a0b8779fe74508d928814108cc4dd115722c18e67
|
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -7,15 +7,24 @@ This is useful if you have two lists of names or titles and you want to match be
|
|
7
7
|
## Usage
|
8
8
|
|
9
9
|
```ruby
|
10
|
-
|
11
|
-
|
12
|
-
|
10
|
+
trainer = BayesicMatching.new
|
11
|
+
trainer.train(["it","was","the","best","of","times"], "novel")
|
12
|
+
trainer.train(["tonight","on","the","seven","o'clock"], "news")
|
13
|
+
matcher = trainer.finalize
|
13
14
|
|
14
15
|
matcher.classify(["the","best","of"])
|
15
16
|
# => {"novel"=>1.0, "news"=>0.667}
|
16
17
|
matcher.classify(["the","time"])
|
17
18
|
# => {"novel"=>0.667, "news"=>0.667}
|
18
|
-
```
|
19
|
+
```
|
20
|
+
|
21
|
+
## Pruning
|
22
|
+
|
23
|
+
One of the fastest ways to improve the speed of matching is to prune common tokens.
|
24
|
+
For example, if the token "the" is present in every single classification, then its presence doesn't tell you much about your confidence of a match, but `BayesicMatching` would now return a confidence for every possible classification.
|
25
|
+
To avoid this there is a default pruning where any token that exists in more than 50% of your classifications will get pruned during the `finalize` call.
|
26
|
+
You can tune this pruning by passing a `pruning_percent` option to `finalize`, e.g. `trainer.finalize(pruning_percent: 0.25)`.
|
27
|
+
In my own usage I've found that pruning tokens that exist in more than 20% of all classifications (`pruning_percent: 0.2`) has almost no impact on accuracy, but gives me a significant speed boost.
|
19
28
|
|
20
29
|
## How It Works
|
21
30
|
|
data/examples/benchmark.rb
CHANGED
@@ -33,11 +33,11 @@ matching_rows = []
|
|
33
33
|
end
|
34
34
|
|
35
35
|
def train_matcher(training_rows)
|
36
|
-
|
36
|
+
trainer = BayesicMatching.new
|
37
37
|
training_rows.each do |row|
|
38
|
-
|
38
|
+
trainer.train(row[:tokens], row[:id])
|
39
39
|
end
|
40
|
-
|
40
|
+
trainer.finalize(pruning_percent: 0.2)
|
41
41
|
end
|
42
42
|
|
43
43
|
def attempt_matches(matcher, matching_rows, print_mismatch_data = false)
|
data/lib/bayesic_matching.rb
CHANGED
@@ -1,31 +1,27 @@
|
|
1
1
|
require "bayesic_matching/version"
|
2
|
+
require "bayesic_matching/matcher"
|
2
3
|
require "set"
|
3
4
|
|
4
5
|
class BayesicMatching
|
5
6
|
def initialize
|
6
7
|
@classifications = ::Set.new
|
7
8
|
@classifications_by_token = {}
|
8
|
-
@tokens_by_classification = {}
|
9
9
|
end
|
10
10
|
|
11
|
-
def
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
p_token_given_not_klass = (@classifications_by_token[token].size - 1) / @classifications.size.to_f
|
19
|
-
hash[c] = (p_token_given_klass * p_klass) / ((p_token_given_klass * p_klass) + (p_token_given_not_klass * p_not_klass))
|
20
|
-
end
|
11
|
+
def finalize(opts = {})
|
12
|
+
pruning_percent = opts.fetch(:pruning_percent, 0.5)
|
13
|
+
threshold = @classifications.size * pruning_percent
|
14
|
+
by_token = @classifications_by_token.each_with_object({}) do |(token, classifications), hash|
|
15
|
+
class_count = classifications.size
|
16
|
+
next if class_count > threshold
|
17
|
+
hash[token] = {count: class_count, classifications: classifications}
|
21
18
|
end
|
19
|
+
BayesicMatching::Matcher.new(class_count: @classifications.size, by_token: by_token)
|
22
20
|
end
|
23
21
|
|
24
22
|
def train(tokens, classification)
|
25
23
|
@classifications << classification
|
26
|
-
@tokens_by_classification[classification] ||= ::Set.new
|
27
24
|
tokens.each do |token|
|
28
|
-
@tokens_by_classification[classification] << token
|
29
25
|
@classifications_by_token[token] ||= ::Set.new
|
30
26
|
@classifications_by_token[token] << classification
|
31
27
|
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
class BayesicMatching
|
2
|
+
class Matcher
|
3
|
+
def initialize(class_count:, by_token:)
|
4
|
+
@class_count = class_count
|
5
|
+
@by_token = by_token
|
6
|
+
@prior = 1.0 / class_count
|
7
|
+
end
|
8
|
+
|
9
|
+
def classify(tokens)
|
10
|
+
tokens = tokens.reject{|t| !@by_token.has_key?(t) }.uniq
|
11
|
+
tokens.each_with_object({}) do |token, hash|
|
12
|
+
@by_token[token][:classifications].each do |c|
|
13
|
+
p_klass = hash[c] || @prior
|
14
|
+
p_not_klass = 1.0 - p_klass
|
15
|
+
p_token_given_klass = 1.0
|
16
|
+
p_token_given_not_klass = (@by_token[token][:count] - 1) / @class_count.to_f
|
17
|
+
hash[c] = (p_token_given_klass * p_klass) / ((p_token_given_klass * p_klass) + (p_token_given_not_klass * p_not_klass))
|
18
|
+
end
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
22
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: bayesic_matching
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Michael Ries
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2018-01-12 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: benchmark-ips
|
@@ -88,6 +88,7 @@ files:
|
|
88
88
|
- examples/favorite_recent_movies.csv
|
89
89
|
- examples/popular_recent_movies.csv
|
90
90
|
- lib/bayesic_matching.rb
|
91
|
+
- lib/bayesic_matching/matcher.rb
|
91
92
|
- lib/bayesic_matching/version.rb
|
92
93
|
homepage: https://github.com/mmmries/bayesic_matching
|
93
94
|
licenses:
|