bayesic_matching 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 5cea0e170af73887df8e76272b3cd884b83ddf06
4
+ data.tar.gz: a3448c71f0ea4614fea06983678f988bcde33c27
5
+ SHA512:
6
+ metadata.gz: 159b74bf8b9224c7cba78e483b88f2f69c6ac943d5bcabee6bebc2a65077755d2402512139e94adde5b3b0e3be4c074d37d8fcd32262c3d6ee8b822dc6925294
7
+ data.tar.gz: 3e451a929ab45a35cd7462f72fd23ad962b844d7a3b854cf77479cb4347e34d647fab4d4f801c94d29e2f67015461bb5e87d2e44b6500a1fd723db0815dcdd1c
data/.gitignore ADDED
@@ -0,0 +1,11 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /_yardoc/
4
+ /coverage/
5
+ /doc/
6
+ /pkg/
7
+ /spec/reports/
8
+ /tmp/
9
+
10
+ # rspec failure tracking
11
+ .rspec_status
data/.rspec ADDED
@@ -0,0 +1,3 @@
1
+ --format documentation
2
+ --color
3
+ --require spec_helper
data/.travis.yml ADDED
@@ -0,0 +1,5 @@
1
+ sudo: false
2
+ language: ruby
3
+ rvm:
4
+ - 2.3.1
5
+ before_install: gem install bundler -v 1.16.0
data/Gemfile ADDED
@@ -0,0 +1,6 @@
1
+ source "https://rubygems.org"
2
+
3
+ git_source(:github) {|repo_name| "https://github.com/#{repo_name}" }
4
+
5
+ # Specify your gem's dependencies in bayesic_matching.gemspec
6
+ gemspec
data/Gemfile.lock ADDED
@@ -0,0 +1,37 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ bayesic_matching (0.1.0)
5
+
6
+ GEM
7
+ remote: https://rubygems.org/
8
+ specs:
9
+ benchmark-ips (2.7.2)
10
+ diff-lcs (1.3)
11
+ rake (10.5.0)
12
+ rspec (3.7.0)
13
+ rspec-core (~> 3.7.0)
14
+ rspec-expectations (~> 3.7.0)
15
+ rspec-mocks (~> 3.7.0)
16
+ rspec-core (3.7.0)
17
+ rspec-support (~> 3.7.0)
18
+ rspec-expectations (3.7.0)
19
+ diff-lcs (>= 1.2.0, < 2.0)
20
+ rspec-support (~> 3.7.0)
21
+ rspec-mocks (3.7.0)
22
+ diff-lcs (>= 1.2.0, < 2.0)
23
+ rspec-support (~> 3.7.0)
24
+ rspec-support (3.7.0)
25
+
26
+ PLATFORMS
27
+ ruby
28
+
29
+ DEPENDENCIES
30
+ bayesic_matching!
31
+ benchmark-ips (~> 2.7)
32
+ bundler (~> 1.16)
33
+ rake (~> 10.0)
34
+ rspec (~> 3.0)
35
+
36
+ BUNDLED WITH
37
+ 1.16.0
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2017 Michael Ries
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,39 @@
1
+ # BayesicMatching
2
+
3
+ Like NaiveBayes, except useful for the case of many possible classes with small training sets per class.
4
+
5
+ This is useful if you have two lists of names or titles and you want to match between them with a given confidence level.
6
+
7
+ ## Usage
8
+
9
+ ```ruby
10
+ matcher = BayesicMatching.new
11
+ matcher.train(["it","was","the","best","of","times"], "novel")
12
+ matcher.train(["tonight","on","the","seven","o'clock"], "news")
13
+
14
+ matcher.classify(["the","best","of"])
15
+ # => {"novel"=>1.0, "news"=>0.667}
16
+ matcher.classify(["the","time"])
17
+ # => {"novel"=>0.667, "news"=>0.667}
18
+ ```
19
+
20
+ ## How It Works
21
+
22
+ This library uses the basic idea of [Bayes Theorem](https://en.wikipedia.org/wiki/Bayes%27_theorem).
23
+
24
+ It records which tokens it has seen for each possible classification. Later, when you pass a set of tokens and ask for the most likely classification, it looks for all potential matches and then ranks them by considering the probability of any given match according to the tokens that it sees.
25
+
26
+ Tokens which exist in many records (i.e. not very unique) have a smaller impact on the probability of a match, and more unique tokens have a larger impact.
27
+
28
+ ## Will It Work For My Dataset?
29
+
30
+ I'm using this in a project that has to match several hundred records against a list of ~10k possible matches.
31
+ At these sizes this project will train a matcher in ~10ms and each record that I check for a match takes ~1.2ms.
32
+
33
+ You can try it out with your own dataset by producing two simple CSV files and running the `examples/benchmark.rb` script in this repo.
34
+ For example you can run `bundle exec ruby benchmark.rb popular_recent_movies.csv favorite_recent_movies.csv` (those two files are provided in the examples directory as well).
35
+ If you can create a similar pair of CSV files you can test on whatever dataset you want and see the accuracy and performance of the library.
36
+
37
+ ## License
38
+
39
+ The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
data/Rakefile ADDED
@@ -0,0 +1,6 @@
1
+ require "bundler/gem_tasks"
2
+ require "rspec/core/rake_task"
3
+
4
+ RSpec::Core::RakeTask.new(:spec)
5
+
6
+ task :default => :spec
@@ -0,0 +1,28 @@
1
+
2
+ lib = File.expand_path("../lib", __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require "bayesic_matching/version"
5
+
6
+ Gem::Specification.new do |spec|
7
+ spec.name = "bayesic_matching"
8
+ spec.version = BayesicMatching::VERSION
9
+ spec.authors = ["Michael Ries"]
10
+ spec.email = ["michael@riesd.com"]
11
+
12
+ spec.summary = "bayesian approach to matching one list of strings with another"
13
+ spec.description = spec.summary
14
+ spec.homepage = "https://github.com/mmmries/bayesic_matching"
15
+ spec.license = "MIT"
16
+
17
+ spec.files = `git ls-files -z`.split("\x0").reject do |f|
18
+ f.match(%r{^(test|spec|features)/})
19
+ end
20
+ spec.bindir = "exe"
21
+ spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
22
+ spec.require_paths = ["lib"]
23
+
24
+ spec.add_development_dependency "benchmark-ips", "~> 2.7"
25
+ spec.add_development_dependency "bundler", "~> 1.16"
26
+ spec.add_development_dependency "rake", "~> 10.0"
27
+ spec.add_development_dependency "rspec", "~> 3.0"
28
+ end
data/bin/console ADDED
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "bundler/setup"
4
+ require "bayesic_matching"
5
+
6
+ # You can add fixtures and/or initialization code here to make experimenting
7
+ # with your gem easier. You can also use a different console, if you like.
8
+
9
+ # (If you use this, don't forget to add pry to your Gemfile!)
10
+ # require "pry"
11
+ # Pry.start
12
+
13
+ require "irb"
14
+ IRB.start(__FILE__)
data/bin/setup ADDED
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+ set -vx
5
+
6
+ bundle install
7
+
8
+ # Do any other automated setup that you need to do here
@@ -0,0 +1,82 @@
1
+ require "bayesic_matching"
2
+ require "benchmark/ips"
3
+ require "csv"
4
+
5
+ if ARGV.size < 2
6
+ puts "please provide a pair of CSV files. (i.e. ruby benchmark.rb training.csv matching.csv)"
7
+ puts "\ttraining.csv should have source_id and source_string columns"
8
+ puts "\tmatching.csv should have match_string and source_id columns"
9
+ exit(1)
10
+ end
11
+
12
+ training_csv_path = ARGV[0]
13
+ matching_csv_path = ARGV[1]
14
+
15
+ # You can tokenize your strings using many different schemes.
16
+ # The method below just downcases and splits on word boundaries,
17
+ # then removes punctuation and filters single-letter words.
18
+ # Feel free to change this to a tokenization scheme of your preference
19
# Splits a string into lowercase word tokens for training and matching.
#
# The string is downcased and split at word boundaries; every non-word
# character is then stripped from each piece, and pieces shorter than two
# characters are discarded.
#
# @param str [String, nil] raw text; nil is treated as an empty string
# @return [Array<String>] the remaining tokens, in their original order
def tokenize_string(str)
  # `\b` is a zero-width assertion, so a `+` quantifier on it adds nothing;
  # split on the plain boundary instead.
  pieces = str.to_s.downcase.split(/\b/)
  # Strip whitespace together with punctuation so that a separator run such
  # as " - " collapses to "" rather than surviving as a blank token.
  pieces.map { |piece| piece.gsub(/\W/, "") }.reject { |piece| piece.size < 2 }
end
24
+
25
# Read the training CSV: each row keeps its raw string, its id and the
# tokens derived from the string.
training_rows = ::CSV.foreach(training_csv_path, :headers => true, :header_converters => :symbol).map do |record|
  source_string = record[:source_string]
  {:string => source_string, :id => record[:source_id], :tokens => tokenize_string(source_string)}
end

# Read the CSV of strings to match: each row keeps its raw string, the id it
# is expected to match (may be blank) and the tokens derived from the string.
matching_rows = ::CSV.foreach(matching_csv_path, :headers => true, :header_converters => :symbol).map do |record|
  match_string = record[:match_string]
  {:string => match_string, :source_id => record[:source_id], :tokens => tokenize_string(match_string)}
end
34
+
35
# Creates a fresh matcher via BayesicMatching.new and trains it on each row.
#
# @param training_rows [Array<Hash>] rows providing :tokens and :id
# @return [Object] the object returned by BayesicMatching.new, after training
def train_matcher(training_rows)
  training_rows.each_with_object(BayesicMatching.new) do |entry, trained|
    trained.train(entry[:tokens], entry[:id])
  end
end
42
+
43
# Classifies every row and tallies how the highest-probability guess compares
# with the row's expected id.
#
# @param matcher [#classify] object whose classify(tokens) result responds to
#   empty? and max_by, yielding (candidate, probability) pairs
# @param matching_rows [Array<Hash>] rows with :string, :source_id and :tokens
# @param print_mismatch_data [Boolean] when true, print details of each wrong guess
# @return [Hash{Symbol=>Integer}] counts under :correct, :incorrect, :unmatched and :total
def attempt_matches(matcher, matching_rows, print_mismatch_data = false)
  results = {:correct => 0, :incorrect => 0, :unmatched => 0, :total => 0}
  matching_rows.each do |row|
    # Classification happens before the guard below, so rows without an
    # expected id are still classified; they are only left out of the counts.
    probabilities = matcher.classify(row[:tokens])
    expected_id = row[:source_id]
    next if expected_id.nil? || expected_id.size.zero?

    results[:total] += 1
    if probabilities.empty?
      results[:unmatched] += 1
      next
    end

    best_match, confidence = probabilities.max_by { |_klass, probability| probability }
    if best_match == expected_id
      results[:correct] += 1
      next
    end

    results[:incorrect] += 1
    next unless print_mismatch_data

    puts "MISMATCH of #{row[:string]} (#{row[:tokens]}) to #{best_match} (should have been #{expected_id})"
    puts "\tconfidence: #{confidence}"
  end
  results
end
66
+
67
+ matcher = train_matcher(training_rows)
68
+
69
+ Benchmark.ips do |x|
70
+ x.config(:time => 5, :warmup => 2)
71
+ x.report("training") { train_matcher(training_rows) }
72
+ x.report("matching") { attempt_matches(matcher, matching_rows) }
73
+ end
74
+
75
puts "= Checking Accuracy"
results = attempt_matches(matcher, matching_rows, true)

# Express a count as a percentage of all attempted matches. The values are
# printed with a "%" suffix, so the ratio is scaled by 100; a zero total
# yields 0.0 instead of NaN.
total = results[:total]
percent_of_total = lambda do |count|
  total.zero? ? 0.0 : (count * 100.0 / total).round(2)
end

puts "= Accuracy Results"
puts "\t#{total} attempted matches"
[:correct, :incorrect, :unmatched].each do |outcome|
  puts "\t#{results[outcome]} #{outcome} (#{percent_of_total.call(results[outcome])}%)"
end
@@ -0,0 +1,11 @@
1
+ match_string,source_id
2
+ Forest Gump,2101
3
+ Benjamin Button,720
4
+ 12 Years a Slave,262
5
+ Green Mile,1612
6
+ Pulp Fiction,2110
7
+ Titanic,1801
8
+ Inglourious Basterds,625
9
+ The Lord of the Rings: The Fellowship of the Ring,1402
10
+ The Lord of the Rings: The Two Towers,1302
11
+ Fight Club,1654