fuzzy_match 1.0.5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (45) hide show
  1. data/.document +5 -0
  2. data/.gitignore +22 -0
  3. data/Gemfile +4 -0
  4. data/LICENSE +20 -0
  5. data/README.rdoc +94 -0
  6. data/Rakefile +21 -0
  7. data/THANKS-WILLIAM-JAMES.rb +37 -0
  8. data/benchmark/before-with-free.txt +283 -0
  9. data/benchmark/before-without-last-result.txt +257 -0
  10. data/benchmark/before.txt +304 -0
  11. data/benchmark/memory.rb +54 -0
  12. data/examples/bts_aircraft/5-2-A.htm +10305 -0
  13. data/examples/bts_aircraft/5-2-B.htm +9576 -0
  14. data/examples/bts_aircraft/5-2-D.htm +7094 -0
  15. data/examples/bts_aircraft/5-2-E.htm +2349 -0
  16. data/examples/bts_aircraft/5-2-G.htm +2922 -0
  17. data/examples/bts_aircraft/blockings.csv +1 -0
  18. data/examples/bts_aircraft/identities.csv +1 -0
  19. data/examples/bts_aircraft/negatives.csv +1 -0
  20. data/examples/bts_aircraft/number_260.csv +334 -0
  21. data/examples/bts_aircraft/positives.csv +1 -0
  22. data/examples/bts_aircraft/test_bts_aircraft.rb +118 -0
  23. data/examples/bts_aircraft/tighteners.csv +1 -0
  24. data/examples/first_name_matching.rb +15 -0
  25. data/examples/icao-bts.xls +0 -0
  26. data/fuzzy_match.gemspec +32 -0
  27. data/lib/fuzzy_match/blocking.rb +36 -0
  28. data/lib/fuzzy_match/cached_result.rb +74 -0
  29. data/lib/fuzzy_match/identity.rb +23 -0
  30. data/lib/fuzzy_match/result.rb +17 -0
  31. data/lib/fuzzy_match/score.rb +125 -0
  32. data/lib/fuzzy_match/similarity.rb +53 -0
  33. data/lib/fuzzy_match/stop_word.rb +19 -0
  34. data/lib/fuzzy_match/tightener.rb +28 -0
  35. data/lib/fuzzy_match/version.rb +3 -0
  36. data/lib/fuzzy_match/wrapper.rb +67 -0
  37. data/lib/fuzzy_match.rb +252 -0
  38. data/test/helper.rb +12 -0
  39. data/test/test_blocking.rb +23 -0
  40. data/test/test_cache.rb +130 -0
  41. data/test/test_fuzzy_match.rb +190 -0
  42. data/test/test_fuzzy_match_convoluted.rb.disabled +268 -0
  43. data/test/test_identity.rb +33 -0
  44. data/test/test_tightening.rb +10 -0
  45. metadata +197 -0
@@ -0,0 +1,23 @@
1
class FuzzyMatch
  # An identity is a rule built from a regexp: when the needle and a haystack
  # entry both match the regexp, the parts captured from each must be equal.
  class Identity
    attr_reader :regexp

    # regexp_or_str - anything that responds to #to_regexp
    def initialize(regexp_or_str)
      @regexp = regexp_or_str.to_regexp
    end

    # Compare two strings under this identity.
    #
    # Returns true when both strings match the regexp and their captures are
    # equal, false when both match but the captures differ, and nil as soon as
    # either string fails to match (the identity has nothing to say then).
    def identical?(str1, str2)
      first_match = regexp.match(str1)
      return nil unless first_match

      second_match = regexp.match(str2)
      return nil unless second_match

      first_match.captures == second_match.captures
    end
  end
end
@@ -0,0 +1,17 @@
1
class FuzzyMatch
  # Plain value holder with one read/write attribute per piece of information
  # recorded about a lookup. It has no behavior of its own.
  class Result #:nodoc: all
    attr_accessor :needle,
                  :tighteners,
                  :blockings,
                  :identities,
                  :stop_words,
                  :candidates,
                  :joint,
                  :disjoint,
                  :possibly_identical,
                  :certainly_different,
                  :similarities,
                  :record,
                  :score
  end
end
@@ -0,0 +1,125 @@
1
+ begin
2
+ require 'amatch'
3
+ rescue ::LoadError
4
+ # using native ruby similarity scoring
5
+ end
6
+
7
class FuzzyMatch
  # Scores how alike two strings are, using Dice's coefficient over letter
  # pairs and a normalized Levenshtein similarity. Both strings are downcased
  # on construction. Each figure is computed at most once per Score instance.
  #
  # When the amatch library has been loaded the scoring is delegated to it;
  # otherwise the pure-ruby implementations below are used.
  class Score
    attr_reader :str1, :str2

    def initialize(str1, str2)
      @str1 = str1.downcase
      @str2 = str2.downcase
    end

    def inspect
      %{#<Score: dices_coefficient=#{dices_coefficient} levenshtein=#{levenshtein}>}
    end

    # Order by Dice's coefficient first; Levenshtein similarity breaks ties.
    def <=>(other)
      by_dices_coefficient = (dices_coefficient <=> other.dices_coefficient)
      if by_dices_coefficient == 0
        levenshtein <=> other.levenshtein
      else
        by_dices_coefficient
      end
    end

    # True when the encoding name of str1 (or $KCODE on rubies without
    # ::Encoding) starts with "u".
    def utf8?
      # the answer may legitimately be false, so ||= cannot be used here
      return @utf8 if defined?(@utf8)
      @utf8 = (defined?(::Encoding) ? str1.encoding.to_s : $KCODE).downcase.start_with?('u')
    end

    # Dice's coefficient of the two strings, memoized.
    def dices_coefficient
      @dices_coefficient ||= compute_dices_coefficient
    end

    # Levenshtein similarity of the two strings, memoized.
    def levenshtein
      @levenshtein ||= compute_levenshtein
    end

    private

    if defined?(::Amatch)

      # NOTE(review): assumes String has Amatch's string methods mixed in - confirm where that happens.
      def compute_dices_coefficient
        str1.pair_distance_similar str2
      end

      def compute_levenshtein
        str1.levenshtein_similar str2
      end

    else

      SPACE = ' '

      # http://stackoverflow.com/questions/653157/a-better-similarity-ranking-algorithm-for-variable-length-strings
      #
      # Returns 1.0 for equal strings and 0.0 when the strings differ and
      # neither contributes a pair (for example "a" versus "b c"). Without
      # that guard the division below would be 0.0 / 0 => NaN, and NaN makes
      # #<=> return nil, which breaks sorting.
      def compute_dices_coefficient
        return 1.0 if str1 == str2

        pairs1 = letter_pairs(str1)
        pairs2 = letter_pairs(str2)
        union = pairs1.size + pairs2.size
        return 0.0 if union == 0

        intersection = 0
        pairs1.each do |pair|
          position = pairs2.index(pair)
          next unless position
          intersection += 1
          # each pair of str2 may be matched only once
          pairs2.delete_at(position)
        end
        (2.0 * intersection) / union
      end

      # Adjacent two-character slices of +str+, skipping any that contain a space.
      def letter_pairs(str)
        (0..str.length - 2).map { |i| str[i, 2] }.reject { |pair| pair.include? SPACE }
      end

      # extracted/adapted from the text gem version 1.0.2
      # normalization added for utf-8 strings
      # lib/text/levenshtein.rb
      #
      # Returns 0.0 when either string is empty.
      def compute_levenshtein
        unpack_rule = utf8? ? 'U*' : 'C*'
        s = str1.unpack(unpack_rule)
        t = str2.unpack(unpack_rule)
        n = s.length
        m = t.length
        return 0.0 if n == 0 or m == 0

        d = (0..m).to_a
        x = nil
        (0...n).each do |i|
          e = i + 1
          (0...m).each do |j|
            cost = (s[i] == t[j]) ? 0 : 1
            x = [
              d[j + 1] + 1, # insertion
              e + 1,        # deletion
              d[j] + cost   # substitution
            ].min
            d[j] = e
            e = x
          end
          d[m] = x
        end
        # normalization logic from https://github.com/flori/amatch/blob/master/ext/amatch_ext.c#L301
        # if (b_len > a_len) {
        #   result = rb_float_new(1.0 - ((double) v[p][b_len]) / b_len);
        # } else {
        #   result = rb_float_new(1.0 - ((double) v[p][b_len]) / a_len);
        # }
        1.0 - x.to_f / [n, m].max
      end

    end
  end
end
@@ -0,0 +1,53 @@
1
class FuzzyMatch
  # Compares two wrappers by scoring every combination of their variants and
  # keeping the best-scoring combination.
  class Similarity
    attr_reader :wrapper1
    attr_reader :wrapper2

    def initialize(wrapper1, wrapper2)
      @wrapper1 = wrapper1
      @wrapper2 = wrapper2
    end

    # Order by best score; original_weight breaks ties.
    def <=>(other)
      by_score = best_score <=> other.best_score
      if by_score == 0
        original_weight <=> other.original_weight
      else
        by_score
      end
    end

    # Weight things towards short original strings
    def original_weight
      @original_weight ||= (1.0 / (wrapper1.render.length * wrapper2.render.length))
    end

    # The Score of the best pair of variants.
    def best_score
      best_scored_variants[0]
    end

    def best_wrapper1_variant
      best_variants[0]
    end

    def best_wrapper2_variant
      best_variants[1]
    end

    # The [wrapper1 variant, wrapper2 variant] pair that scores highest.
    def best_variants
      best_scored_variants[1]
    end

    def inspect
      %{#<Similarity "#{wrapper2.render}"=>"#{best_wrapper2_variant}" versus "#{wrapper1.render}"=>"#{best_wrapper1_variant}" original_weight=#{"%0.5f" % original_weight} best_score=#{best_score.inspect}>}
    end

    private

    # Returns [score, [variant1, variant2]] for the winning combination.
    #
    # Every combination is scored exactly once up front. Building Score
    # objects inside the sort comparator would re-score the same pair on
    # every comparison.
    def best_scored_variants
      @best_scored_variants ||= wrapper1.variants.product(wrapper2.variants).map do |pair|
        [ Score.new(pair[0], pair[1]), pair ]
      end.sort do |scored1, scored2|
        scored1[0] <=> scored2[0]
      end[-1]
    end
  end
end
@@ -0,0 +1,19 @@
1
class FuzzyMatch
  # A stop word is a pattern whose matches are deleted from a string.
  class StopWord
    attr_reader :regexp

    # regexp_or_str - anything that responds to #to_regexp
    def initialize(regexp_or_str)
      @regexp = regexp_or_str.to_regexp
    end

    # Delete every match of the pattern from +str+, modifying it in place.
    # Being a gsub! call, the return value is nil when nothing was removed.
    def apply!(str)
      str.gsub!(regexp, '')
    end

    def inspect
      '#<StopWord regexp=%s>' % regexp.inspect
    end
  end
end
@@ -0,0 +1,28 @@
1
class FuzzyMatch
  # A tightener reduces a string to the parts captured by its regexp.
  class Tightener
    attr_reader :regexp

    # regexp_or_str - anything that responds to #to_regexp
    def initialize(regexp_or_str)
      @regexp = regexp_or_str.to_regexp
    end

    # Does the regexp match +str+? Always answers true or false.
    def apply?(str)
      regexp.match(str) ? true : false
    end

    # All captures of the first match concatenated, or +str+ itself when the
    # regexp does not match.
    def apply(str)
      found = regexp.match(str)
      return str unless found
      found.captures.join
    end

    def inspect
      '#<Tightener regexp=%s>' % regexp.inspect
    end
  end
end
@@ -0,0 +1,3 @@
1
class FuzzyMatch
  # Release number of this copy of the library.
  VERSION = "1.0.5"
end
@@ -0,0 +1,67 @@
1
class FuzzyMatch
  # A wrapper pairs a record with the matcher that owns it and lazily derives
  # the strings used for scoring: the rendered text, its words, and the
  # tightened variants.
  class Wrapper #:nodoc: all
    WORD_BOUNDARY = %r{\s*\b\s*}

    attr_reader :fuzzy_match, :record, :read

    # fuzzy_match - the owner; its stop_words and tighteners are consulted
    # record      - the wrapped object
    # read        - optional Proc, Symbol or key saying how to get text out of the record
    def initialize(fuzzy_match, record, read = nil)
      @fuzzy_match = fuzzy_match
      @record = record
      @read = read
    end

    def inspect
      "#<Wrapper render=#{render} variants=#{variants.length}>"
    end

    # The record as a frozen string: read out of the record, stripped of stop
    # words and of surrounding whitespace. Computed once.
    def render
      return @render if rendered?

      text = raw_value.to_s.dup # dup so the record itself is never modified
      fuzzy_match.stop_words.each { |stop_word| stop_word.apply! text }
      text.strip!
      @rendered = true
      @render = text.freeze
    end

    alias :to_str :render

    def words
      @words ||= render.split(WORD_BOUNDARY)
    end

    def similarity(other)
      Similarity.new self, other
    end

    # The rendered string followed by the output of every tightener that
    # applies to it, without duplicates.
    def variants
      @variants ||= begin
        applicable = fuzzy_match.tighteners.select { |tightener| tightener.apply? render }
        ([ render ] + applicable.map { |tightener| tightener.apply(render) }).uniq
      end
    end

    def rendered?
      @rendered == true
    end

    private

    # Pull the raw value out of the record according to +read+.
    def raw_value
      if read.is_a?(::Proc)
        read.call record
      elsif read.is_a?(::Symbol) and record.respond_to?(read)
        record.send read
      elsif read.nil?
        record
      else
        record[read]
      end
    end
  end
end
@@ -0,0 +1,252 @@
1
+ require 'active_support'
2
+ require 'active_support/version'
3
+ if ::ActiveSupport::VERSION::MAJOR >= 3
4
+ require 'active_support/core_ext'
5
+ end
6
+ require 'to_regexp'
7
+
8
+ # See the README for more information.
9
class FuzzyMatch
  autoload :Tightener, 'fuzzy_match/tightener'
  autoload :StopWord, 'fuzzy_match/stop_word'
  autoload :Blocking, 'fuzzy_match/blocking'
  autoload :Identity, 'fuzzy_match/identity'
  autoload :Result, 'fuzzy_match/result'
  autoload :Wrapper, 'fuzzy_match/wrapper'
  autoload :Similarity, 'fuzzy_match/similarity'
  autoload :Score, 'fuzzy_match/score'
  autoload :CachedResult, 'fuzzy_match/cached_result'

  attr_reader :haystack
  attr_reader :blockings
  attr_reader :identities
  attr_reader :tighteners
  attr_reader :stop_words
  attr_reader :default_first_blocking_decides
  attr_reader :default_must_match_blocking
  attr_reader :default_must_match_at_least_one_word

  # records - a bunch of records (the haystack); each one is wrapped in a Wrapper
  # options (string or symbol keys)
  # * tighteners: regexps (see readme)
  # * identities: regexps
  # * blockings: regexps
  # * stop_words: regexps
  # * read (or haystack_reader): how to interpret each entry in the 'haystack', either a Proc or a symbol
  # * first_blocking_decides, must_match_blocking, must_match_at_least_one_word:
  #   defaults for the options of the same name accepted by #find
  def initialize(records, options = {})
    options = options.symbolize_keys
    @default_first_blocking_decides = options[:first_blocking_decides]
    @default_must_match_blocking = options[:must_match_blocking]
    @default_must_match_at_least_one_word = options[:must_match_at_least_one_word]
    @blockings = options.fetch(:blockings, []).map { |regexp_or_str| Blocking.new regexp_or_str }
    @identities = options.fetch(:identities, []).map { |regexp_or_str| Identity.new regexp_or_str }
    @tighteners = options.fetch(:tighteners, []).map { |regexp_or_str| Tightener.new regexp_or_str }
    @stop_words = options.fetch(:stop_words, []).map { |regexp_or_str| StopWord.new regexp_or_str }
    read = options[:read] || options[:haystack_reader]
    @haystack = records.map { |record| Wrapper.new self, record, read }
  end

  # The Result gathered by the most recent find run with :gather_last_result => true.
  # Raises RuntimeError when there is none.
  def last_result
    @last_result || raise(::RuntimeError, "[fuzzy_match] You can't access the last result until you've run a find with :gather_last_result => true")
  end

  # Like #find, but returns an array with the record of every haystack entry
  # that survives the blocking and identity filters (no scoring is done).
  def find_all(needle, options = {})
    options = options.symbolize_keys.merge(:find_all => true)
    find needle, options
  end

  # Look for the haystack record that best matches +needle+.
  #
  # options (string or symbol keys)
  # * gather_last_result: record the intermediate steps so #last_result can be read afterwards
  # * find_all: return all surviving records instead of the single best one
  # * first_blocking_decides, must_match_blocking, must_match_at_least_one_word:
  #   override the defaults given to the constructor
  #
  # Returns the best record, or nil when nothing qualifies or the best Dice's
  # coefficient is not above zero. With :find_all it returns an array (possibly empty).
  # Raises RuntimeError once #free has been called.
  def find(needle, options = {})
    raise ::RuntimeError, "[fuzzy_match] Dictionary has already been freed, can't perform more finds" if freed?

    options = options.symbolize_keys
    gather_last_result = options.fetch(:gather_last_result, false)
    is_find_all = options.fetch(:find_all, false)
    first_blocking_decides = options.fetch(:first_blocking_decides, default_first_blocking_decides)
    must_match_blocking = options.fetch(:must_match_blocking, default_must_match_blocking)
    must_match_at_least_one_word = options.fetch(:must_match_at_least_one_word, default_must_match_at_least_one_word)

    if gather_last_result
      free_last_result
      @last_result = Result.new
      last_result.tighteners = tighteners
      last_result.identities = identities
      last_result.blockings = blockings
      last_result.stop_words = stop_words
    end

    needle = Wrapper.new self, needle

    if gather_last_result
      last_result.needle = needle
    end

    if must_match_blocking and blockings.any? and blockings.none? { |blocking| blocking.match? needle }
      return(is_find_all ? [] : nil)
    end

    candidates = if must_match_at_least_one_word
      haystack.select do |straw|
        (needle.words & straw.words).any?
      end
    else
      haystack
    end

    if gather_last_result
      last_result.candidates = candidates
    end

    joint, disjoint = if blockings.any?
      if first_blocking_decides
        # which blocking matches the needle first does not depend on the straw,
        # so it is looked up once instead of once per candidate
        deciding_blocking = blockings.detect { |blocking| blocking.match? needle }
        candidates.partition do |straw|
          deciding_blocking.try :join?, needle, straw
        end
      else
        candidates.partition do |straw|
          blockings.any? { |blocking| blocking.join? needle, straw }
        end
      end
    else
      [ candidates.dup, [] ]
    end

    if joint.none?
      if must_match_blocking
        return(is_find_all ? [] : nil)
      else
        # special case: the needle didn't fit anywhere, but must_match_blocking is false, so we'll try it against everything
        joint = disjoint
        disjoint = []
      end
    end

    if gather_last_result
      last_result.joint = joint
      last_result.disjoint = disjoint
    end

    possibly_identical, certainly_different = if identities.any?
      joint.partition do |straw|
        identities.all? do |identity|
          # nil means the identity does not apply to this pair, which is not a rejection
          answer = identity.identical? needle, straw
          answer.nil? or answer == true
        end
      end
    else
      [ joint.dup, [] ]
    end

    if gather_last_result
      last_result.possibly_identical = possibly_identical
      last_result.certainly_different = certainly_different
    end

    if is_find_all
      return possibly_identical.map { |straw| straw.record }
    end

    similarities = possibly_identical.map { |straw| needle.similarity straw }.sort

    if gather_last_result
      last_result.similarities = similarities
    end

    best_similarity = similarities[-1]
    if best_similarity and best_similarity.best_score.dices_coefficient > 0
      record = best_similarity.wrapper2.record
      if gather_last_result
        last_result.record = record
        last_result.score = best_similarity.best_score.dices_coefficient
      end
      record
    end
  end

  # Explain is like mysql's EXPLAIN command. You give it a needle and it tells you about how it was located (successfully or not) in the haystack.
  #
  #   d = FuzzyMatch.new ['737', '747', '757' ]
  #   d.explain 'boeing 737-100'
  #
  # The report is written with #log. Steps that #find never reached (it can
  # return early when must_match_blocking is in effect) are shown as "(none)".
  def explain(needle, options = {})
    record = find needle, options.merge(:gather_last_result => true)
    log "#" * 150
    log "# Match #{needle.inspect} => #{record.inspect}"
    log "#" * 150
    log
    log "Needle"
    log "-" * 150
    log last_result.needle.render
    log
    explain_section("Stop words", last_result.stop_words) { |stop_word| stop_word.inspect }
    explain_section("Candidates", last_result.candidates) { |candidate| candidate.render }
    explain_section("Tighteners", last_result.tighteners) { |tightener| tightener.inspect }
    explain_section("Blockings", last_result.blockings) { |blocking| blocking.inspect }
    explain_section("Identities", last_result.identities) { |identity| identity.inspect }
    explain_section("Joint", last_result.joint) { |joint| joint.render }
    explain_section("Disjoint", last_result.disjoint) { |disjoint| disjoint.render }
    explain_section("Possibly identical", last_result.possibly_identical) { |possibly_identical| possibly_identical.render }
    explain_section("Certainly different", last_result.certainly_different) { |certainly_different| certainly_different.render }
    # only the ten best similarities, best first
    explain_section("Similarities", (last_result.similarities || []).reverse[0..9]) { |similarity| similarity.inspect }
    log "Match"
    log "-" * 150
    log record.inspect
  end

  def log(str = '') #:nodoc:
    $stderr.puts str
  end

  def freed?
    @freed == true
  end

  # Drop the haystack and the last result. Afterwards #find raises.
  def free
    free_last_result
    @haystack.try :clear
    @haystack = nil
  ensure
    @freed = true
  end

  private

  def free_last_result
    @last_result = nil
  end

  # Log one section of the #explain report: a title, a separator, then one
  # line per item as produced by the block, or "(none)" when +items+ is nil
  # or empty, followed by an empty line.
  def explain_section(title, items)
    log title
    log "-" * 150
    log(items.blank? ? '(none)' : items.map { |item| yield item }.join("\n"))
    log
  end
end
data/test/helper.rb ADDED
@@ -0,0 +1,12 @@
1
+ require 'rubygems'
2
+ require 'bundler'
3
+ Bundler.setup
4
+ require 'test/unit'
5
+ require 'stringio'
6
+ require 'remote_table'
7
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
8
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
9
+ require 'fuzzy_match'
10
+
11
+ class Test::Unit::TestCase
12
+ end
@@ -0,0 +1,23 @@
1
+ require 'helper'
2
+
3
# Exercises FuzzyMatch::Blocking#match? and #join? with a single /apple/ pattern.
class TestBlocking < Test::Unit::TestCase
  def test_001_match_one
    b = FuzzyMatch::Blocking.new %r{apple}
    assert_equal true, b.match?('2 apples')
  end

  def test_002_join_both
    b = FuzzyMatch::Blocking.new %r{apple}
    assert_equal true, b.join?('apple', '2 apples')
  end

  def test_003_doesnt_join_both
    b = FuzzyMatch::Blocking.new %r{apple}
    assert_equal false, b.join?('orange', '2 apples')
  end

  # neither string matches, so the blocking has no opinion
  def test_004_no_information
    b = FuzzyMatch::Blocking.new %r{apple}
    assert_nil b.join?('orange', 'orange')
  end
end