bayesic_matching 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +11 -0
- data/.rspec +3 -0
- data/.travis.yml +5 -0
- data/Gemfile +6 -0
- data/Gemfile.lock +37 -0
- data/LICENSE.txt +21 -0
- data/README.md +39 -0
- data/Rakefile +6 -0
- data/bayesic_matching.gemspec +28 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/examples/benchmark.rb +82 -0
- data/examples/favorite_recent_movies.csv +11 -0
- data/examples/popular_recent_movies.csv +2601 -0
- data/lib/bayesic_matching/version.rb +3 -0
- data/lib/bayesic_matching.rb +34 -0
- metadata +116 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 5cea0e170af73887df8e76272b3cd884b83ddf06
|
4
|
+
data.tar.gz: a3448c71f0ea4614fea06983678f988bcde33c27
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 159b74bf8b9224c7cba78e483b88f2f69c6ac943d5bcabee6bebc2a65077755d2402512139e94adde5b3b0e3be4c074d37d8fcd32262c3d6ee8b822dc6925294
|
7
|
+
data.tar.gz: 3e451a929ab45a35cd7462f72fd23ad962b844d7a3b854cf77479cb4347e34d647fab4d4f801c94d29e2f67015461bb5e87d2e44b6500a1fd723db0815dcdd1c
|
data/.gitignore
ADDED
data/.rspec
ADDED
data/.travis.yml
ADDED
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
bayesic_matching (0.1.0)
|
5
|
+
|
6
|
+
GEM
|
7
|
+
remote: https://rubygems.org/
|
8
|
+
specs:
|
9
|
+
benchmark-ips (2.7.2)
|
10
|
+
diff-lcs (1.3)
|
11
|
+
rake (10.5.0)
|
12
|
+
rspec (3.7.0)
|
13
|
+
rspec-core (~> 3.7.0)
|
14
|
+
rspec-expectations (~> 3.7.0)
|
15
|
+
rspec-mocks (~> 3.7.0)
|
16
|
+
rspec-core (3.7.0)
|
17
|
+
rspec-support (~> 3.7.0)
|
18
|
+
rspec-expectations (3.7.0)
|
19
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
20
|
+
rspec-support (~> 3.7.0)
|
21
|
+
rspec-mocks (3.7.0)
|
22
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
23
|
+
rspec-support (~> 3.7.0)
|
24
|
+
rspec-support (3.7.0)
|
25
|
+
|
26
|
+
PLATFORMS
|
27
|
+
ruby
|
28
|
+
|
29
|
+
DEPENDENCIES
|
30
|
+
bayesic_matching!
|
31
|
+
benchmark-ips (~> 2.7)
|
32
|
+
bundler (~> 1.16)
|
33
|
+
rake (~> 10.0)
|
34
|
+
rspec (~> 3.0)
|
35
|
+
|
36
|
+
BUNDLED WITH
|
37
|
+
1.16.0
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
The MIT License (MIT)
|
2
|
+
|
3
|
+
Copyright (c) 2017 Michael Ries
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
13
|
+
all copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
21
|
+
THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,39 @@
|
|
1
|
+
# BayesicMatching
|
2
|
+
|
3
|
+
Like NaiveBayes, except useful for the case of many possible classes with small training sets per class.
|
4
|
+
|
5
|
+
This is useful if you have two lists of names or titles and you want to match between them with a given confidence level.
|
6
|
+
|
7
|
+
## Usage
|
8
|
+
|
9
|
+
```ruby
|
10
|
+
matcher = BayesicMatching.new
|
11
|
+
matcher.train(["it","was","the","best","of","times"], "novel")
|
12
|
+
matcher.train(["tonight","on","the","seven","o'clock"], "news")
|
13
|
+
|
14
|
+
matcher.classify(["the","best","of"])
|
15
|
+
# => {"novel"=>1.0, "news"=>0.667}
|
16
|
+
matcher.classify(["the","time"])
|
17
|
+
# => {"novel"=>0.667, "news"=>0.667}
|
18
|
+
```
|
19
|
+
|
20
|
+
## How It Works
|
21
|
+
|
22
|
+
This library uses the basic idea of [Bayes Theorem](https://en.wikipedia.org/wiki/Bayes%27_theorem).
|
23
|
+
|
24
|
+
It records which tokens it has seen for each possible classification. Later, when you pass a set of tokens and ask for the most likely classification, it looks for all potential matches and then ranks them by considering the probability of any given match according to the tokens that it sees.
|
25
|
+
|
26
|
+
Tokens which exist in many records (i.e. not very unique) have a smaller impact on the probability of a match, and more unique tokens have a larger impact.
|
27
|
+
|
28
|
+
## Will It Work For My Dataset?
|
29
|
+
|
30
|
+
I'm using this in a project that has to match several hundred records against a list of ~10k possible matches.
|
31
|
+
At these sizes this project will train a matcher in ~10ms and each record that I check for a match takes ~1.2ms.
|
32
|
+
|
33
|
+
You can try it out with your own dataset by producing two simple CSV files and running the `examples/benchmark.rb` script in this repo.
|
34
|
+
For example you can run `bundle exec ruby benchmark.rb popular_recent_movies.csv favorite_recent_movies.csv` (those two files are provided in the examples directory as well).
|
35
|
+
If you can create a similar pair of CSV files you can test on whatever dataset you want and see the accuracy and performance of the library.
|
36
|
+
|
37
|
+
## License
|
38
|
+
|
39
|
+
The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
|
data/Rakefile
ADDED
data/bayesic_matching.gemspec
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
|
2
|
+
# Gem specification for bayesic_matching.
# Puts the local lib directory on the load path so the version constant can be
# required before the gem is installed.
lib_dir = File.expand_path("../lib", __FILE__)
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
require "bayesic_matching/version"

Gem::Specification.new do |gem|
  gem.name          = "bayesic_matching"
  gem.version       = BayesicMatching::VERSION
  gem.authors       = ["Michael Ries"]
  gem.email         = ["michael@riesd.com"]

  gem.summary       = "bayesian approach to matching one list of strings with another"
  gem.description   = gem.summary
  gem.homepage      = "https://github.com/mmmries/bayesic_matching"
  gem.license       = "MIT"

  # Package every git-tracked file except the test/spec/features directories.
  tracked_files     = `git ls-files -z`.split("\x0")
  gem.files         = tracked_files.reject { |path| path.match(%r{^(test|spec|features)/}) }
  gem.bindir        = "exe"
  gem.executables   = gem.files.grep(%r{^exe/}) { |path| File.basename(path) }
  gem.require_paths = ["lib"]

  # Development-only dependencies, registered in the order listed.
  {
    "benchmark-ips" => "~> 2.7",
    "bundler"       => "~> 1.16",
    "rake"          => "~> 10.0",
    "rspec"         => "~> 3.0",
  }.each do |dependency, requirement|
    gem.add_development_dependency dependency, requirement
  end
end
|
data/bin/console
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require "bundler/setup"
|
4
|
+
require "bayesic_matching"
|
5
|
+
|
6
|
+
# You can add fixtures and/or initialization code here to make experimenting
|
7
|
+
# with your gem easier. You can also use a different console, if you like.
|
8
|
+
|
9
|
+
# (If you use this, don't forget to add pry to your Gemfile!)
|
10
|
+
# require "pry"
|
11
|
+
# Pry.start
|
12
|
+
|
13
|
+
require "irb"
|
14
|
+
IRB.start(__FILE__)
|
data/bin/setup
ADDED
data/examples/benchmark.rb
ADDED
@@ -0,0 +1,82 @@
|
|
1
|
+
require "bayesic_matching"
|
2
|
+
require "benchmark/ips"
|
3
|
+
require "csv"
|
4
|
+
|
5
|
+
# Both CSV paths are mandatory: print the usage text and exit non-zero when
# fewer than two arguments were given.
unless ARGV.size >= 2
  puts "please provide a pair of CSV files. (i.e. ruby benchmark.rb training.csv matching.csv)",
       "\ttraining.csv should have source_id and source_string columns",
       "\tmatching.csv should have match_string and source_id columns"
  exit(1)
end

# First argument is the training CSV, second is the CSV of strings to match.
training_csv_path, matching_csv_path = ARGV
|
14
|
+
|
15
|
+
# Splits a string into lowercase word tokens for training and matching.
#
# You can tokenize your strings using many different schemes. This one
# downcases, splits on word boundaries, strips every non-word character
# (punctuation and whitespace) and drops tokens shorter than two characters.
# Feel free to change this to a tokenization scheme of your preference.
#
# @param str [String, nil] raw text; nil (e.g. a blank CSV cell) yields no tokens
# @return [Array<String>] tokens in their original order
def tokenize_string(str)
  str.to_s.downcase.split(/\b/).map do |word|
    # Whitespace is stripped too, so a separator run such as " - " collapses
    # to "" instead of surviving the length filter as a whitespace-only token.
    word.gsub(/\W/, "")
  end.reject { |word| word.size < 2 }
end
|
24
|
+
|
25
|
+
# Read the training CSV into hashes holding the raw string, its id and its tokens.
training_rows = ::CSV.foreach(training_csv_path, :headers => true, :header_converters => :symbol).map do |record|
  text = record[:source_string]
  {:string => text, :id => record[:source_id], :tokens => tokenize_string(text)}
end

# Read the CSV of strings to match, keeping the expected source id next to the tokens.
matching_rows = ::CSV.foreach(matching_csv_path, :headers => true, :header_converters => :symbol).map do |record|
  text = record[:match_string]
  {:string => text, :source_id => record[:source_id], :tokens => tokenize_string(text)}
end
|
34
|
+
|
35
|
+
# Builds a fresh BayesicMatching instance and trains it on every row.
#
# @param training_rows [Array<Hash>] rows exposing :tokens and :id
# @return [BayesicMatching] the trained matcher
def train_matcher(training_rows)
  training_rows.each_with_object(BayesicMatching.new) do |entry, trained|
    trained.train(entry[:tokens], entry[:id])
  end
end
|
42
|
+
|
43
|
+
# Classifies every row and tallies how the best guess compares with the
# expected source id.
#
# @param matcher [#classify] object whose classify(tokens) returns a Hash of
#   candidate id => probability
# @param matching_rows [Array<Hash>] rows exposing :string, :source_id and :tokens
# @param print_mismatch_data [Boolean] when true, prints each wrong guess and its confidence
# @return [Hash{Symbol=>Integer}] counts under :correct, :incorrect, :unmatched and :total
def attempt_matches(matcher, matching_rows, print_mismatch_data = false)
  results = {:correct => 0, :incorrect => 0, :unmatched => 0, :total => 0}
  matching_rows.each do |row|
    # Classify before the skip check so rows without an expected id still
    # exercise the matcher, which keeps benchmark timings comparable.
    probabilities = matcher.classify(row[:tokens])
    expected_id = row[:source_id]
    next if expected_id.nil? || expected_id.empty? # no expected id, nothing to score

    results[:total] += 1
    if probabilities.empty?
      results[:unmatched] += 1
      next
    end

    best_match, confidence = probabilities.max_by { |_klass, probability| probability }
    if best_match == expected_id
      results[:correct] += 1
    else
      results[:incorrect] += 1
      if print_mismatch_data
        puts "MISMATCH of #{row[:string]} (#{row[:tokens]}) to #{best_match} (should have been #{expected_id})"
        puts "\tconfidence: #{confidence}"
      end
    end
  end
  results
end
|
66
|
+
|
67
|
+
# Train once up front so the "matching" benchmark measures classification only.
matcher = train_matcher(training_rows)

# Measure training and matching throughput.
Benchmark.ips do |x|
  x.config(:time => 5, :warmup => 2)
  x.report("training") { train_matcher(training_rows) }
  x.report("matching") { attempt_matches(matcher, matching_rows) }
end

puts "= Checking Accuracy"
results = attempt_matches(matcher, matching_rows, true)

# Share of the scored rows as a percentage with one decimal place.
# Reports 0.0 when no row carried an expected id, instead of printing NaN.
percentage = lambda do |count|
  results[:total].zero? ? 0.0 : (100.0 * count / results[:total]).round(1)
end

puts "= Accuracy Results"
puts "\t#{results[:total]} attempted matches"
puts "\t#{results[:correct]} correct (#{percentage.call(results[:correct])}%)"
puts "\t#{results[:incorrect]} incorrect (#{percentage.call(results[:incorrect])}%)"
puts "\t#{results[:unmatched]} unmatched (#{percentage.call(results[:unmatched])}%)"
|
data/examples/favorite_recent_movies.csv
ADDED
@@ -0,0 +1,11 @@
|
|
1
|
+
match_string,source_id
|
2
|
+
Forest Gump,2101
|
3
|
+
Benjamin Button,720
|
4
|
+
12 Years a Slave,262
|
5
|
+
Green Mile,1612
|
6
|
+
Pulp Fiction,2110
|
7
|
+
Titanic,1801
|
8
|
+
Inglourious Basterds,625
|
9
|
+
The Lord of the Rings: The Fellowship of the Ring,1402
|
10
|
+
The Lord of the Rings: The Two Towers,1302
|
11
|
+
Fight Club,1654
|