fuzzy_match 1.5.0 → 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (55) hide show
  1. checksums.yaml +8 -8
  2. data/.rspec +2 -0
  3. data/CHANGELOG +14 -0
  4. data/Gemfile +8 -0
  5. data/README.markdown +58 -38
  6. data/Rakefile +0 -9
  7. data/bin/fuzzy_match +106 -0
  8. data/fuzzy_match.gemspec +4 -4
  9. data/groupings-screenshot.png +0 -0
  10. data/highlevel.graffle +0 -0
  11. data/highlevel.png +0 -0
  12. data/lib/fuzzy_match/record.rb +58 -0
  13. data/lib/fuzzy_match/result.rb +11 -8
  14. data/lib/fuzzy_match/rule/grouping.rb +70 -12
  15. data/lib/fuzzy_match/rule/identity.rb +3 -3
  16. data/lib/fuzzy_match/rule.rb +1 -1
  17. data/lib/fuzzy_match/score/amatch.rb +0 -4
  18. data/lib/fuzzy_match/score/pure_ruby.rb +2 -8
  19. data/lib/fuzzy_match/score.rb +4 -0
  20. data/lib/fuzzy_match/similarity.rb +10 -32
  21. data/lib/fuzzy_match/version.rb +1 -1
  22. data/lib/fuzzy_match.rb +78 -94
  23. data/{test/test_amatch.rb → spec/amatch_spec.rb} +1 -2
  24. data/{test/test_cache.rb → spec/cache_spec.rb} +7 -7
  25. data/spec/foo.rb +9 -0
  26. data/spec/fuzzy_match_spec.rb +354 -0
  27. data/spec/grouping_spec.rb +60 -0
  28. data/spec/identity_spec.rb +29 -0
  29. data/{test/test_wrapper.rb → spec/record_spec.rb} +3 -7
  30. data/spec/spec_helper.rb +21 -0
  31. metadata +56 -50
  32. data/bin/fuzzy_match_checker +0 -71
  33. data/examples/bts_aircraft/5-2-A.htm +0 -10305
  34. data/examples/bts_aircraft/5-2-B.htm +0 -9576
  35. data/examples/bts_aircraft/5-2-D.htm +0 -7094
  36. data/examples/bts_aircraft/5-2-E.htm +0 -2349
  37. data/examples/bts_aircraft/5-2-G.htm +0 -2922
  38. data/examples/bts_aircraft/groupings.csv +0 -1
  39. data/examples/bts_aircraft/identities.csv +0 -1
  40. data/examples/bts_aircraft/negatives.csv +0 -1
  41. data/examples/bts_aircraft/normalizers.csv +0 -1
  42. data/examples/bts_aircraft/number_260.csv +0 -334
  43. data/examples/bts_aircraft/positives.csv +0 -1
  44. data/examples/bts_aircraft/test_bts_aircraft.rb +0 -116
  45. data/examples/first_name_matching.rb +0 -15
  46. data/examples/icao-bts.xls +0 -0
  47. data/lib/fuzzy_match/rule/normalizer.rb +0 -20
  48. data/lib/fuzzy_match/rule/stop_word.rb +0 -11
  49. data/lib/fuzzy_match/wrapper.rb +0 -73
  50. data/test/helper.rb +0 -12
  51. data/test/test_fuzzy_match.rb +0 -304
  52. data/test/test_fuzzy_match_convoluted.rb.disabled +0 -268
  53. data/test/test_grouping.rb +0 -28
  54. data/test/test_identity.rb +0 -34
  55. data/test/test_normalizer.rb +0 -10
checksums.yaml CHANGED
@@ -1,15 +1,15 @@
1
1
  ---
2
2
  !binary "U0hBMQ==":
3
3
  metadata.gz: !binary |-
4
- YTlmMmE3MDI3MWQ0NzY0NGE5N2Q4ZDY4MmI5NjUzZTg5YWU1OWE5OQ==
4
+ NDhjZDk1NjAxOTMzMGZkMWU4ODAyNjRmMzA4YzlhNzQwZWIwZGU5MQ==
5
5
  data.tar.gz: !binary |-
6
- MzU3MDc3NjQ1NDczNWFhNWE0ZDdlZGRmYjlhYWQ3Y2YyZTNiMjRhMQ==
6
+ OTk5Yjc2NmY4ZTY3NDkyOTFjOGQwYjlhZDgwYjk2NmViMzI0NGYyMQ==
7
7
  !binary "U0hBNTEy":
8
8
  metadata.gz: !binary |-
9
- MDcyZDk5OGY5MTQwNTEyMDA1YmQ1MzdiYzYyZTczMWExNzI0NjhlYWNkYzcx
10
- Y2UwNzgwN2I5YmI3MjVhMzIwMDljZmVmMTFkODQ2MjY2NDdkNzViZjlhNTcy
11
- NjliMmU5NTAwOTBiYTYxNjk4ZDkwZWM0OWMzMTdjZTU4ZjQ4NDk=
9
+ YmE4MjUxNzg0MmFlN2UyMTYxNGJhNGRiZjZmYThjOGYzNDMwYjRjZmVlNTBk
10
+ MmQzZjE0NWJiN2IzZGY3YzBhZWQzOWVkMTNjZmVhODUwMTk5ODJmMTY0Njk0
11
+ ODEwMjc2MDc0M2M2YjNmZmY5MmViZGNiOTYzNGQ2MjQxODZkZDQ=
12
12
  data.tar.gz: !binary |-
13
- MTg0MGVlYTc3NTY0NjMxZWMwNWFmZTdhYmRhOWM4MGJmN2QwYjc3NmNiYzQz
14
- MzU3ODc2NDRjNDJjNzhiYmQwMmNkNmQ3MTdjYzMyNjM2YzBjMDEwOGYzNzgy
15
- NmFkZTM0M2E2YzkwZjY1YjM4ZTEzY2Y1YmU5MmY3MDZjNTJhMzg=
13
+ NGIzYTVmNzk0N2E4YjU4ZDc1MTNiNTU4YmRhMDgxNmM5Y2Q0MWM2NzYzZjhj
14
+ NTA3ZDg5NWE2ZWM2MWZkNDRjM2I1NWY2NzA4ODAwM2E4NTIwY2NmYWI4MzBm
15
+ MjliNDZkNzIzMGJhZTVhMzViNThhZjk1MmM1NTViNmQ4ODFjNzM=
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --color
2
+ --format progress
data/CHANGELOG CHANGED
@@ -1,3 +1,17 @@
1
+ 2.0.0 / 2013-05-22
2
+
3
+ * Breaking changes
4
+
5
+ * normalizers removed - use groupings instead
6
+ * first_grouping_decides removed
7
+ * FuzzyMatch#free gone
8
+
9
+ * Enhancements
10
+
11
+ * chained groupings!
12
+ * faster and simpler structure
13
+ * FuzzyMatch#find_with_score returns [record, dice_score, lev_score]
14
+
1
15
  1.5.0 / 2013-04-03
2
16
 
3
17
  * Breaking changes
data/Gemfile CHANGED
@@ -1,3 +1,11 @@
1
1
  source :rubygems
2
2
 
3
3
  gemspec
4
+
5
+ # bin/fuzzy_match development
6
+ gem 'activesupport'
7
+ gem 'remote_table'
8
+ gem 'thor'
9
+ gem 'to_regexp'
10
+ gem 'perftools.rb'
11
+ gem 'pry'
data/README.markdown CHANGED
@@ -4,20 +4,9 @@ Find a needle in a haystack based on string similarity and regular expression ru
4
4
 
5
5
  Replaces [`loose_tight_dictionary`](https://github.com/seamusabshere/loose_tight_dictionary) because that was a confusing name.
6
6
 
7
- ## Real-world usage
8
-
9
- <p><a href="http://brighterplanet.com"><img src="https://s3.amazonaws.com/static.brighterplanet.com/assets/logos/flush-left/inline/green/rasterized/brighter_planet-160-transparent.png" alt="Brighter Planet logo"/></a></p>
10
-
11
- We use `fuzzy_match` for [data science at Brighter Planet](http://brighterplanet.com/research) and in production at
12
-
13
- * [Brighter Planet's impact estimate web service](http://impact.brighterplanet.com)
14
- * [Brighter Planet's reference data web service](http://data.brighterplanet.com)
15
-
16
- We often combine it with [`remote_table`](https://github.com/seamusabshere/remote_table) and [`errata`](https://github.com/seamusabshere/errata):
7
+ Warning! `normalizers` are gone in version 2 and above! See the CHANGELOG and check out enhanced (and hopefully more intuitive) `groupings`.
17
8
 
18
- - download table with `remote_table`
19
- - correct serious or repeated errors with `errata`
20
- - `fuzzy_match` the rest
9
+ ![diagram of matching process](https://raw.github.com/seamusabshere/fuzzy_match/master/highlevel.png)
21
10
 
22
11
  ## Quickstart
23
12
 
@@ -30,41 +19,62 @@ See also the blog post [Fuzzy match in Ruby](http://numbers.brighterplanet.com/2
30
19
 
31
20
  ## Default matching (string similarity)
32
21
 
33
- At the core, and even if you configure nothing else, string similarity (calculated by "pair distance" aka Dice's) is used to compare records.
22
+ At the core, and even if you configure nothing else, string similarity (calculated by "pair distance" aka Dice's Coefficient) is used to compare records.
34
23
 
35
24
  You can tell `FuzzyMatch` what field or method to use via the `:read` option... for example, let's say you want to match a `Country` object like `#<Country name:"Uruguay" iso_3166_code:"UY">`
36
25
 
37
- >> matcher = FuzzyMatch.new(Country.all, :read => :name) # Country#name will be called when comparing
26
+ >> fz = FuzzyMatch.new(Country.all, :read => :name)
38
27
  => #<FuzzyMatch: [...]>
39
- >> matcher.find('youruguay')
40
- => #<Country name:"Uruguay" iso_3166_code:"UY"> # the matcher returns a Country object
28
+ >> fz.find('youruguay')
29
+ => #<Country name:"Uruguay" iso_3166_code:"UY">
41
30
 
42
31
  ## Optional rules (regular expressions)
43
32
 
44
- You can improve the default matchings with rules. There are 4 different kinds of rules. Each rule is a regular expression. Depending on the kind of rule, the results of running the regular expression are used for a particular purpose.
33
+ You can improve the default matchings with rules. There are 3 different kinds of rules. Each rule is a regular expression.
45
34
 
46
35
  We suggest that you **first try without any rules** and only define them to improve matching, prevent false positives, etc.
47
36
 
48
- >> matcher = FuzzyMatch.new(['Ford F-150', 'Ford F-250', 'GMC 1500', 'GMC 2500'], :groupings => [ /ford/i, /gmc/i ], :normalizers => [ /K(\d500)/i ], :identities => [ /(f)-?(\d50)/i ])
49
- => #<FuzzyMatch: [...]>
50
- >> matcher.find('fordf250')
51
- => "Ford F-250"
52
- >> matcher.find('gmc truck k1500')
53
- => "GMC 1500"
37
+ ### Groupings
54
38
 
55
- For identities and normalizers (see below), **only the captures are used.** For example, `/(f)-?(\d50)/i` captures the "F" and the "250" but ignores the dash. So place your parentheses carefully! Groupings work the same way, except that if you don't have any captures, a simple match will pass.
39
+ Group records together. The two laws of groupings:
56
40
 
57
- ### Groupings
41
+ 1. If a needle matches a grouping, only compare it with straws in the same grouping (the "buddies vs buddies" rule)
42
+ 2. If a needle doesn't match any grouping, only compare it with straws that also don't match ANY grouping (the "misfits vs misfits" rule)
43
+
44
+ The two laws of chained groupings: (new in v2.0 and rather important)
45
+
46
+ 1. Sub-groupings (e.g., `/plaza/i` below) only match if their primary (e.g., `/ramada/i`) does
47
+ 2. In final grouping decisions, sub-groupings win over primaries (so "Ramada Inn" is NOT grouped with "Ramada Plaza", but if you removed `/plaza/i` sub-grouping, then they would be grouped together)
48
+
49
+ Hopefully they are rather intuitive once you start using them.
50
+
51
+ [![screenshot of spreadsheet of groupings](https://raw.github.com/seamusabshere/fuzzy_match/master/groupings-screenshot.png)](https://docs.google.com/spreadsheet/pub?key=0AkCJNpm9Ks6JdG4xSWhfWFlOV1RsZ2NCeU9seGx6cnc&single=true&gid=0&output=html)
52
+
53
+ That will...
58
54
 
59
- Group records together.
55
+ * separate "Orient Express Hotel" and "Ramada Conference Center Mandarin" from real Mandarin Oriental hotels
56
+ * keep "Trump Hotel Collection" away from "Luxury Collection" (another real hotel brand) without messing with the word "Luxury"
57
+ * make sure that "Ramada Plaza" are always grouped with other RPs&mdash;and not with plain old Ramadas&mdash;and vice versa
58
+ * split out Hyatts into their different brands
59
+ * and more
60
60
 
61
- Setting a grouping of `/Airbus/` ensures that strings containing "Airbus" will only be scored against to other strings containing "Airbus". A better grouping in this case would probably be `/airbus/i`.
61
+ You specify chained groupings as arrays of regexps:
62
+
63
+ groupings = [
64
+ /mandarin/i,
65
+ /trump/i,
66
+ [ /ramada/i, /plaza/i ],
67
+ ...
68
+ ]
69
+ fz = FuzzyMatch.new(haystack, groupings: groupings)
70
+
71
+ This way of specifying groupings is meant to be easy to load from a CSV, like `bin/fuzzy_match` does.
62
72
 
63
73
  Formerly called "blockings," but that was jargon that confused people.
64
74
 
65
75
  ### Identities
66
76
 
67
- Prevent impossible matches.
77
+ Prevent impossible matches. Can be very confusing&mdash;see if you can make things work with groupings first.
68
78
 
69
79
  Adding an identity like `/(f)-?(\d50)/i` ensures that "Ford F-150" and "Ford F-250" never match.
70
80
 
@@ -72,29 +82,24 @@ Note that identities do not establish certainty. They just say whether two recor
72
82
 
73
83
  ### Stop words
74
84
 
75
- Ignore common and/or meaningless words. Applied before normalizers.
85
+ Ignore common and/or meaningless words when doing string similarity.
76
86
 
77
87
  Adding a stop word like `THE` ensures that it is not taken into account when comparing "THE CAT", "THE DAT", and "THE CATT"
78
88
 
79
- ### Normalizers (formerly called tighteners)
80
-
81
- Strip strings down to the essentials. Applied after stop words.
82
-
83
- Adding a normalizer like `/(boeing).*(7\d\d)/i` will cause "BOEING COMPANY 747" and "boeing747" to be normalized to "BOEING 747" and "boeing 747", respectively. Since things are generally downcased before they are compared, these would be an exact match.
89
+ Stop words are NOT removed when checking `:must_match_at_least_one_word` and when doing identities and groupings.
84
90
 
85
91
  ## Find options
86
92
 
87
93
  * `read`: how to interpret each record in the 'haystack', either a Proc or a symbol
88
94
  * `must_match_grouping`: don't return a match unless the needle fits into one of the groupings you specified
89
95
  * `must_match_at_least_one_word`: don't return a match unless the needle shares at least one word with the match. Note that "Foo's" is treated like one word (so that it won't match "'s") and "Bolivia," is treated as just "bolivia"
90
- * `first_grouping_decides`: force records into the first grouping they match, rather than choosing a grouping that will give them a higher score
91
96
  * `gather_last_result`: enable `last_result`
92
97
 
93
98
  ## Case sensitivity
94
99
 
95
100
  String similarity is case-insensitive. Everything is downcased before scoring. This is a change from previous versions.
96
101
 
97
- Be careful when trying to use case-sensitivity in your rules; in general, things are downcased before comparing.
102
+ Be careful with uppercase letters in your rules; in general, things are downcased before comparing.
98
103
 
99
104
  ## String similarity algorithm
100
105
 
@@ -152,6 +157,21 @@ You can optionally use [`amatch`](http://flori.github.com/amatch/) by [Florian F
152
157
 
153
158
  Otherwise, pure ruby versions of the string similarity algorithms derived from the [answer to a StackOverflow question](http://stackoverflow.com/questions/653157/a-better-similarity-ranking-algorithm-for-variable-length-strings) and [the text gem](https://github.com/threedaymonk/text/blob/master/lib/text/levenshtein.rb) are used. Thanks [marzagao](http://stackoverflow.com/users/10997/marzagao) and [threedaymonk](https://github.com/threedaymonk)!
154
159
 
160
+ ## Real-world usage
161
+
162
+ <p><a href="http://brighterplanet.com"><img src="https://s3.amazonaws.com/static.brighterplanet.com/assets/logos/flush-left/inline/green/rasterized/brighter_planet-160-transparent.png" alt="Brighter Planet logo"/></a></p>
163
+
164
+ We use `fuzzy_match` for [data science at Brighter Planet](http://brighterplanet.com/research) and in production at
165
+
166
+ * [Brighter Planet's impact estimate web service](http://impact.brighterplanet.com)
167
+ * [Brighter Planet's reference data web service](http://data.brighterplanet.com)
168
+
169
+ We often combine it with [`remote_table`](https://github.com/seamusabshere/remote_table) and [`errata`](https://github.com/seamusabshere/errata):
170
+
171
+ - download table with `remote_table`
172
+ - correct serious or repeated errors with `errata`
173
+ - `fuzzy_match` the rest
174
+
155
175
  ## Authors
156
176
 
157
177
  * Seamus Abshere <seamus@abshere.net>
@@ -160,4 +180,4 @@ Otherwise, pure ruby versions of the string similarity algorithms derived from t
160
180
 
161
181
  ## Copyright
162
182
 
163
- Copyright 2012 Brighter Planet, Inc.
183
+ Copyright 2013 Seamus Abshere
data/Rakefile CHANGED
@@ -1,14 +1,5 @@
1
1
  require 'bundler'
2
2
  Bundler::GemHelper.install_tasks
3
3
 
4
- require 'rake/testtask'
5
- Rake::TestTask.new(:test) do |test|
6
- test.libs << 'lib' << 'test'
7
- test.pattern = 'test/**/test_*.rb'
8
- test.verbose = true
9
- end
10
-
11
- task :default => :test
12
-
13
4
  require 'yard'
14
5
  YARD::Rake::YardocTask.new
data/bin/fuzzy_match ADDED
@@ -0,0 +1,106 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ if File.exist?('Gemfile')
4
+ require 'bundler/setup'
5
+ end
6
+ if ENV['PROFILE'] == 'true'
7
+ require 'perftools'
8
+ end
9
+ # PerfTools::CpuProfiler.start("profile_data") do
10
+ require 'fuzzy_match'
11
+ require 'fuzzy_match/version'
12
+
13
+ require 'active_support/core_ext'
14
+ require 'remote_table'
15
+ require 'thor'
16
+ require 'to_regexp'
17
+
18
+ class FuzzyMatch
19
+ class Cli < ::Thor
20
+ desc :match, "Print out matches between A and B, where A is haystack and B is a bunch of needles."
21
+ method_option :csv, default: false, type: :boolean, desc: "CSV output"
22
+ method_option :a_col, default: 0, type: :string, desc: "Column name in A. Defaults to first column."
23
+ method_option :b_col, default: 0, type: :string, desc: "Column name in B. Defaults to first column."
24
+ method_option :downcase, default: true, type: :boolean, desc: "Whether to downcase everything (except regexes, where you have to do /foo/i)"
25
+ method_option :groupings, default: nil, type: :string, desc: "Spreadsheet with groupings - no headers, multi-part groupings on the same row"
26
+ method_option :rules, default: nil, type: :string, desc: "Spreadsheet with headers: stop_words, identities, find_options. Listing a find_option like must_match_grouping makes it true."
27
+ method_option :explain, default: false, type: :boolean
28
+ method_option :grep, default: nil, type: :string
29
+ method_option :limit, default: 1.0/0, type: :numeric
30
+ def match(a_url, b_url)
31
+ puts "Checking matches using fuzzy_match version #{FuzzyMatch::VERSION}..."
32
+ fz = mkfz a_url
33
+ b = load_b b_url
34
+ if ENV['PROFILE'] == 'true'
35
+ require 'perftools'
36
+ PerfTools::CpuProfiler.start("profile.bin") { report(fz, b) }
37
+ system "pprof.rb --text profile.bin"
38
+ `pprof.rb --gif profile.bin > profile.gif`
39
+ else
40
+ report fz, b
41
+ end
42
+ end
43
+
44
+ private
45
+
46
+ def report(fz, b)
47
+ b.each do |b_val|
48
+ if options.explain
49
+ fz.explain
50
+ else
51
+ a_val = fz.find b_val
52
+ if options.csv
53
+ # puts [ b_val.ljust(50), a_val ].join('-> ')
54
+ puts [ b_val, a_val ].to_csv
55
+ else
56
+ puts %{\nB: #{b_val}\nA: #{a_val}}
57
+ end
58
+ end
59
+ end
60
+ end
61
+
62
+ def load_b(b_url)
63
+ b_options = options.b_col.is_a?(String) ? { headers: :first_row } : { headers: false }
64
+ if options[:grep]
65
+ regexp = options[:grep].to_regexp(detect: true)
66
+ b_options[:select] = lambda { |row| regexp =~ row[options.b_col] }
67
+ end
68
+ b = RemoteTable.new(b_url, b_options).to_a
69
+ limit = [options.limit, b.length].min
70
+ b.first(limit).map do |row|
71
+ b_val = row[options.b_col]
72
+ b_val.downcase! if options.downcase
73
+ b_val
74
+ end
75
+ end
76
+
77
+ def mkfz(a_url)
78
+ a_options = options.a_col.is_a?(String) ? { headers: :first_row } : { headers: false }
79
+ a = RemoteTable.new(a_url, a_options).map { |row| row[options.a_col] }
80
+ a.map!(&:downcase) if options.downcase
81
+ FuzzyMatch.new a, fz_options
82
+ end
83
+
84
+ def fz_options
85
+ memo = {}
86
+ if options.groupings
87
+ memo[:groupings] = RemoteTable.new(options.groupings, headers: false).map do |row|
88
+ row.to_a.select(&:present?).map { |v| v.to_regexp(detect: true) }
89
+ end
90
+ end
91
+ if options.rules
92
+ t = RemoteTable.new(options.rules, headers: :first_row)
93
+ find_options = t.rows.map { |row| row['find_options'] }
94
+ memo.merge!(
95
+ identities: t.rows.map { |row| row['identities'] }.select(&:present?).map { |v| v.to_regexp(detect: true) },
96
+ stop_words: t.rows.map { |row| row['stop_words'] }.select(&:present?).map { |v| v.to_regexp(detect: true) },
97
+ must_match_grouping: find_options.include?('must_match_grouping'),
98
+ must_match_at_least_one_word: find_options.include?('must_match_at_least_one_word'),
99
+ )
100
+ end
101
+ memo
102
+ end
103
+ end
104
+ end
105
+
106
+ FuzzyMatch::Cli.start
data/fuzzy_match.gemspec CHANGED
@@ -21,14 +21,14 @@ Gem::Specification.new do |s|
21
21
  s.add_development_dependency 'active_record_inline_schema', '>=0.4.0'
22
22
 
23
23
  # development dependencies
24
- s.add_development_dependency "minitest"
24
+ s.add_development_dependency 'pry'
25
+ s.add_development_dependency 'rspec-core'
26
+ s.add_development_dependency 'rspec-expectations'
27
+ s.add_development_dependency 'rspec-mocks'
25
28
  s.add_development_dependency 'activerecord', '>=3'
26
29
  s.add_development_dependency 'mysql2'
27
30
  s.add_development_dependency 'cohort_analysis'
28
31
  s.add_development_dependency 'weighted_average'
29
32
  s.add_development_dependency 'yard'
30
33
  s.add_development_dependency 'amatch'
31
- if RUBY_VERSION >= '1.9'
32
- s.add_development_dependency 'minitest-reporters'
33
- end
34
34
  end
Binary file
data/highlevel.graffle ADDED
Binary file
data/highlevel.png ADDED
Binary file
@@ -0,0 +1,58 @@
1
+ class FuzzyMatch
2
+ # Records are the tokens that are passed around when doing scoring and optimizing.
3
+ class Record #:nodoc: all
4
+ # "Foo's" is one word
5
+ # "North-west" is just one word
6
+ # "Bolivia," is just Bolivia
7
+ WORD_BOUNDARY = %r{\W*(?:\s+|$)}
8
+ EMPTY = [].freeze
9
+ BLANK = ''.freeze
10
+
11
+ attr_reader :original
12
+ attr_reader :read
13
+ attr_reader :stop_words
14
+
15
+ def initialize(original, options = {})
16
+ @original = original
17
+ @read = options[:read]
18
+ @stop_words = options.fetch(:stop_words, EMPTY)
19
+ end
20
+
21
+ def inspect
22
+ "w(#{clean.inspect})"
23
+ end
24
+
25
+ def clean
26
+ @clean ||= begin
27
+ memo = whole.dup
28
+ stop_words.each do |stop_word|
29
+ memo.gsub! stop_word, BLANK
30
+ end
31
+ memo.strip.freeze
32
+ end
33
+ end
34
+
35
+ def words
36
+ @words ||= clean.downcase.split(WORD_BOUNDARY).freeze
37
+ end
38
+
39
+ def similarity(other)
40
+ Similarity.new self, other
41
+ end
42
+
43
+ def whole
44
+ @whole ||= case read
45
+ when ::NilClass
46
+ original
47
+ when ::Numeric, ::String
48
+ original[read]
49
+ when ::Proc
50
+ read.call original
51
+ when ::Symbol
52
+ original.respond_to?(read) ? original.send(read) : original[read]
53
+ else
54
+ raise "Expected nil, a proc, or a symbol, got #{read.inspect}"
55
+ end.to_s.strip.freeze
56
+ end
57
+ end
58
+ end
@@ -1,19 +1,23 @@
1
+ # encoding: utf-8
1
2
  require 'erb'
3
+ require 'pp'
2
4
 
3
5
  class FuzzyMatch
4
6
  class Result #:nodoc: all
5
7
  EXPLANATION = <<-ERB
6
- You looked for <%= needle.render.inspect %>
8
+ #####################################################
9
+ # SUMMARY
10
+ #####################################################
7
11
 
8
- <% if winner %>It was matched with "<%= winner %>"<% else %>No match was found<% end %>
12
+ Needle: <%= needle.inspect %>
13
+ Match: <%= winner.inspect %>
9
14
 
10
- # THE HAYSTACK
15
+ #####################################################
16
+ # OPTIONS
17
+ #####################################################
11
18
 
12
- The haystack reader was <%= read.inspect %>.
19
+ <%= PP.pp(options, '') %>
13
20
 
14
- The haystack contained <%= haystack.length %> records like <%= haystack[0, 3].map(&:render).map(&:inspect).join(', ') %>
15
-
16
- # HOW IT WAS MATCHED
17
21
  <% timeline.each_with_index do |event, index| %>
18
22
  (<%= index+1 %>) <%= event %>
19
23
  <% end %>
@@ -23,7 +27,6 @@ ERB
23
27
  attr_accessor :read
24
28
  attr_accessor :haystack
25
29
  attr_accessor :options
26
- attr_accessor :normalizers
27
30
  attr_accessor :groupings
28
31
  attr_accessor :identities
29
32
  attr_accessor :stop_words
@@ -1,3 +1,4 @@
1
+ # require 'pry'
1
2
  class FuzzyMatch
2
3
  class Rule
3
4
  # "Record linkage typically involves two main steps: grouping and scoring..."
@@ -8,18 +9,75 @@ class FuzzyMatch
8
9
  # A grouping (formerly known as a blocking) comes into effect when a str matches.
9
10
  # Then the needle must also match the grouping's regexp.
10
11
  class Grouping < Rule
11
- def match?(str)
12
- !!(regexp.match(str))
13
- end
14
-
15
- # If a grouping "joins" two strings, that means they both fit into it.
16
- #
17
- # Returns false if they certainly don't fit this grouping.
18
- # Returns nil if the grouping doesn't apply, i.e. str2 doesn't fit the grouping.
19
- def join?(str1, str2)
20
- if str2_match_data = regexp.match(str2)
21
- if str1_match_data = regexp.match(str1)
22
- str2_match_data.captures.join.downcase == str1_match_data.captures.join.downcase
12
+ class << self
13
+ def make(regexps)
14
+ case regexps
15
+ when ::Regexp
16
+ new regexps
17
+ when ::Array
18
+ chain = regexps.flatten.map { |regexp| new regexp }
19
+ if chain.length == 1
20
+ chain[0] # not really a chain after all
21
+ else
22
+ chain.each { |grouping| grouping.chain = chain }
23
+ chain
24
+ end
25
+ else
26
+ raise ArgumentError, "[fuzzy_match] Groupings should be specified as single regexps or an array of regexps (got #{regexps.inspect})"
27
+ end
28
+ end
29
+ end
30
+
31
+ attr_accessor :chain
32
+
33
+ def inspect
34
+ memo = []
35
+ memo << "#{regexp.inspect}"
36
+ if chain
37
+ memo << "(#{chain.find_index(self)} of #{chain.length})"
38
+ end
39
+ memo.join ' '
40
+ end
41
+
42
+ def xmatch?(record)
43
+ if primary?
44
+ match?(record) and subs.none? { |sub| sub.match?(record) }
45
+ else
46
+ match?(record) and primary.match?(record)
47
+ end
48
+ end
49
+
50
+ def xjoin?(needle, straw)
51
+ if primary?
52
+ join?(needle, straw) and subs.none? { |sub| sub.match?(straw) } # maybe xmatch here?
53
+ else
54
+ join?(needle, straw) and primary.match?(straw)
55
+ end
56
+ end
57
+
58
+ protected
59
+
60
+ def primary?
61
+ chain ? (primary == self) : true
62
+ # not chain or primary == self
63
+ end
64
+
65
+ def primary
66
+ chain ? chain[0] : self
67
+ end
68
+
69
+ def subs
70
+ chain ? chain[1..-1] : []
71
+ end
72
+
73
+ def match?(record)
74
+ !!(regexp.match(record.whole))
75
+ end
76
+
77
+ def join?(needle, straw)
78
+ if straw_match_data = regexp.match(straw.whole)
79
+ if needle_match_data = regexp.match(needle.whole)
80
+ straw_match_data.captures.join.downcase == needle_match_data.captures.join.downcase
23
81
  else
24
82
  false
25
83
  end
@@ -7,9 +7,9 @@ class FuzzyMatch
7
7
  #
8
8
  # Only returns true/false if both strings match the regexp.
9
9
  # Otherwise returns nil.
10
- def identical?(str1, str2)
11
- if str1_match_data = regexp.match(str1) and match_data = regexp.match(str2)
12
- str1_match_data.captures.join.downcase == match_data.captures.join.downcase
10
+ def identical?(record1, record2)
11
+ if str1_match_data = regexp.match(record1.whole) and str2_match_data = regexp.match(record2.whole)
12
+ str1_match_data.captures.join.downcase == str2_match_data.captures.join.downcase
13
13
  else
14
14
  nil
15
15
  end
@@ -11,7 +11,7 @@ class FuzzyMatch
11
11
  end
12
12
 
13
13
  def ==(other)
14
- regexp == other.regexp
14
+ other.class == self.class and regexp == other.regexp
15
15
  end
16
16
  end
17
17
  end
@@ -3,10 +3,6 @@ class FuzzyMatch
3
3
  # be sure to `require 'amatch'` before you use this class
4
4
  class Amatch < Score
5
5
 
6
- def inspect
7
- %{#<FuzzyMatch::Score::Amatch: str1=#{str1.inspect} str2=#{str2.inspect} dices_coefficient_similar=#{dices_coefficient_similar} levenshtein_similar=#{levenshtein_similar}>}
8
- end
9
-
10
6
  def dices_coefficient_similar
11
7
  @dices_coefficient_similar ||= if str1 == str2
12
8
  1.0
@@ -4,10 +4,6 @@ class FuzzyMatch
4
4
 
5
5
  SPACE = ' '
6
6
 
7
- def inspect
8
- %{#<FuzzyMatch::Score::PureRuby: str1=#{str1.inspect} str2=#{str2.inspect} dices_coefficient_similar=#{dices_coefficient_similar} levenshtein_similar=#{levenshtein_similar}>}
9
- end
10
-
11
7
  # http://stackoverflow.com/questions/653157/a-better-similarity-ranking-algorithm-for-variable-length-strings
12
8
  def dices_coefficient_similar
13
9
  @dices_coefficient_similar ||= begin
@@ -90,10 +86,8 @@ class FuzzyMatch
90
86
  private
91
87
 
92
88
  def utf8?
93
- return @utf8_query[0] if @utf8_query.is_a?(::Array) # ActiveSupport::Memoizable is deprecated in 3.2, how annoying
94
- utf8_query = (defined?(::Encoding) ? str1.encoding.to_s : $KCODE).downcase.start_with?('u')
95
- @utf8_query = [utf8_query]
96
- utf8_query
89
+ return @utf8_query if defined?(@utf8_query)
90
+ @utf8_query = (defined?(::Encoding) ? str1.encoding.to_s : $KCODE).downcase.start_with?('u')
97
91
  end
98
92
  end
99
93
  end
@@ -13,6 +13,10 @@ class FuzzyMatch
13
13
  @str2 = str2.downcase
14
14
  end
15
15
 
16
+ def inspect
17
+ %{(dice=#{"%0.5f" % dices_coefficient_similar},lev=#{"%0.5f" % levenshtein_similar})}
18
+ end
19
+
16
20
  def <=>(other)
17
21
  a = dices_coefficient_similar
18
22
  b = other.dices_coefficient_similar