fuzzy_match 1.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. data/.document +5 -0
  2. data/.gitignore +22 -0
  3. data/Gemfile +4 -0
  4. data/LICENSE +20 -0
  5. data/README.rdoc +94 -0
  6. data/Rakefile +21 -0
  7. data/THANKS-WILLIAM-JAMES.rb +37 -0
  8. data/benchmark/before-with-free.txt +283 -0
  9. data/benchmark/before-without-last-result.txt +257 -0
  10. data/benchmark/before.txt +304 -0
  11. data/benchmark/memory.rb +54 -0
  12. data/examples/bts_aircraft/5-2-A.htm +10305 -0
  13. data/examples/bts_aircraft/5-2-B.htm +9576 -0
  14. data/examples/bts_aircraft/5-2-D.htm +7094 -0
  15. data/examples/bts_aircraft/5-2-E.htm +2349 -0
  16. data/examples/bts_aircraft/5-2-G.htm +2922 -0
  17. data/examples/bts_aircraft/blockings.csv +1 -0
  18. data/examples/bts_aircraft/identities.csv +1 -0
  19. data/examples/bts_aircraft/negatives.csv +1 -0
  20. data/examples/bts_aircraft/number_260.csv +334 -0
  21. data/examples/bts_aircraft/positives.csv +1 -0
  22. data/examples/bts_aircraft/test_bts_aircraft.rb +118 -0
  23. data/examples/bts_aircraft/tighteners.csv +1 -0
  24. data/examples/first_name_matching.rb +15 -0
  25. data/examples/icao-bts.xls +0 -0
  26. data/fuzzy_match.gemspec +32 -0
  27. data/lib/fuzzy_match/blocking.rb +36 -0
  28. data/lib/fuzzy_match/cached_result.rb +74 -0
  29. data/lib/fuzzy_match/identity.rb +23 -0
  30. data/lib/fuzzy_match/result.rb +17 -0
  31. data/lib/fuzzy_match/score.rb +125 -0
  32. data/lib/fuzzy_match/similarity.rb +53 -0
  33. data/lib/fuzzy_match/stop_word.rb +19 -0
  34. data/lib/fuzzy_match/tightener.rb +28 -0
  35. data/lib/fuzzy_match/version.rb +3 -0
  36. data/lib/fuzzy_match/wrapper.rb +67 -0
  37. data/lib/fuzzy_match.rb +252 -0
  38. data/test/helper.rb +12 -0
  39. data/test/test_blocking.rb +23 -0
  40. data/test/test_cache.rb +130 -0
  41. data/test/test_fuzzy_match.rb +190 -0
  42. data/test/test_fuzzy_match_convoluted.rb.disabled +268 -0
  43. data/test/test_identity.rb +33 -0
  44. data/test/test_tightening.rb +10 -0
  45. metadata +197 -0
@@ -0,0 +1,130 @@
1
+ require 'helper'
2
+
3
+ require 'active_support/all'
4
+ require 'active_record'
5
+ require 'cohort_scope'
6
+ require 'weighted_average'
7
+
8
# Connection settings for the MySQL database these cache tests run against.
connection_settings = {
  'adapter'  => 'mysql',
  'database' => 'fuzzy_match_test',
  'username' => 'root',
  'password' => 'password'
}
ActiveRecord::Base.establish_connection(connection_settings)

# ActiveRecord::Base.logger = Logger.new $stderr

# Register 'aircraft' as an uncountable word with the inflector.
ActiveSupport::Inflector.inflections { |inflect| inflect.uncountable 'aircraft' }

require 'fuzzy_match/cached_result'

# NOTE(review): setup(true) presumably (re)creates the cached-result storage -- confirm
# against FuzzyMatch::CachedResult. Any existing cached rows are then removed.
::FuzzyMatch::CachedResult.setup(true)
::FuzzyMatch::CachedResult.delete_all
25
+
26
# ActiveRecord model backed by the `aircraft` table, keyed by ICAO code.
class Aircraft < ActiveRecord::Base
  set_primary_key :icao_code

  # NOTE(review): presumably links aircraft to flight segments through cached fuzzy
  # matches on aircraft_description -- confirm against fuzzy_match/cached_result.
  cache_fuzzy_match_matches_with :flight_segments, :primary_key => :aircraft_description, :foreign_key => :aircraft_description

  # @return [String] manufacturer and model names joined by a space; nil parts are skipped
  def aircraft_description
    [manufacturer_name, model_name].compact.join(' ')
  end

  # Memoized matcher whose haystack is every Aircraft row, read via #aircraft_description.
  def self.fuzzy_match
    @fuzzy_match ||= FuzzyMatch.new all, :read => ::Proc.new { |straw| straw.aircraft_description }
  end

  # Drops the `aircraft` table if it exists, recreates it, and refreshes the
  # cached column information.
  def self.create_table
    # Only drop when the table is present, instead of swallowing every error
    # with an inline `rescue nil`.
    connection.drop_table(:aircraft) if connection.table_exists?(:aircraft)
    # A PRIMARY KEY column must be NOT NULL; this matches the `row_hash`
    # declaration used for the flight_segments table.
    connection.execute %{
      CREATE TABLE `aircraft` (
        `icao_code` varchar(255) NOT NULL DEFAULT '',
        `manufacturer_name` varchar(255) DEFAULT NULL,
        `model_name` varchar(255) DEFAULT NULL,
        PRIMARY KEY (`icao_code`)
      ) ENGINE=InnoDB DEFAULT CHARSET=utf8;
    }
    reset_column_information
  end
end
52
+
53
# ActiveRecord model backed by the `flight_segments` table, keyed by row_hash.
class FlightSegment < ActiveRecord::Base
  set_primary_key :row_hash

  # NOTE(review): presumably links flight segments to aircraft through cached fuzzy
  # matches on aircraft_description -- confirm against fuzzy_match/cached_result.
  cache_fuzzy_match_matches_with :aircraft, :primary_key => :aircraft_description, :foreign_key => :aircraft_description

  extend CohortScope
  self.minimum_cohort_size = 1

  # Drops the `flight_segments` table if it exists, recreates it, and refreshes
  # the cached column information.
  def self.create_table
    # Only drop when the table is present, instead of swallowing every error
    # with an inline `rescue nil`.
    connection.drop_table(:flight_segments) if connection.table_exists?(:flight_segments)
    connection.execute %{
      CREATE TABLE `flight_segments` (
        `row_hash` varchar(255) NOT NULL DEFAULT '',
        `aircraft_description` varchar(255) DEFAULT NULL,
        `passengers` int(11) DEFAULT NULL,
        `seats` int(11) DEFAULT NULL,
        PRIMARY KEY (`row_hash`)
      ) ENGINE=InnoDB DEFAULT CHARSET=utf8;
    }
    # Forget any column metadata cached before the table was rebuilt.
    reset_column_information
  end
end
74
+
75
# Rebuild both tables before loading the fixture rows.
FlightSegment.create_table
Aircraft.create_table

# One aircraft row.
Aircraft.new.tap do |aircraft|
  aircraft.icao_code = 'B742'
  aircraft.manufacturer_name = 'Boeing'
  aircraft.model_name = '747-200'
  aircraft.save!
end

# Two flight segments whose free-text descriptions are misspelled variants.
# Each row is [row_hash, aircraft_description, passengers, seats].
[
  ['madison to chicago', 'BORING 747200', 10, 10],
  ['madison to minneapolis', 'bing 747', 100, 5]
].each do |row_hash, description, passengers, seats|
  segment = FlightSegment.new
  segment.row_hash = row_hash
  segment.aircraft_description = description
  segment.passengers = passengers
  segment.seats = seats
  segment.save!
end

# Cache the aircraft match for every stored flight segment.
FlightSegment.find_each { |segment| segment.cache_aircraft! }
101
+
102
# Exercises the cached fuzzy-match associations between Aircraft and FlightSegment
# using the fixture rows created earlier in this file.
class TestCache < Test::Unit::TestCase
  def test_002_one_degree_of_separation
    aircraft = Aircraft.find('B742')
    assert_equal 2, aircraft.flight_segments.count
  end

  def test_003_standard_sql_calculations
    aircraft = Aircraft.find('B742')
    assert_equal 110, aircraft.flight_segments.sum(:passengers)
  end

  def test_004_weighted_average
    aircraft = Aircraft.find('B742')
    # (10 * 10 + 5 * 100) / (10 + 100) = 600 / 110, about 5.45455. Compare with a
    # tolerance rather than exact equality, since the value is a computed fraction.
    assert_in_delta 5.45455, aircraft.flight_segments.weighted_average(:seats, :weighted_by => :passengers), 0.00001
  end

  def test_005_right_way_to_do_cohorts
    aircraft = Aircraft.find('B742')
    assert_equal 2, FlightSegment.big_cohort(:aircraft_description => aircraft.flight_segments_foreign_keys).count
  end

  def test_006_you_can_get_aircraft_from_flight_segments
    fs = FlightSegment.first
    # you need to add an aircraft_description column
    assert_raises(ActiveRecord::StatementInvalid) do
      assert_equal 2, fs.aircraft.count
    end
  end
end
@@ -0,0 +1,190 @@
1
+ # -*- encoding: utf-8 -*-
2
+ require 'helper'
3
+
4
# Tests for FuzzyMatch#find, #find_all, #explain and the matcher options
# (tighteners, identities, blockings, stop words, :read).
class TestFuzzyMatch < Test::Unit::TestCase
  def test_001_find
    d = FuzzyMatch.new %w{ RATZ CATZ }
    assert_equal 'RATZ', d.find('RITZ')
    assert_equal 'RATZ', d.find('RíTZ')

    d = FuzzyMatch.new [ 'X' ]
    assert_equal 'X', d.find('X')
    assert_nil d.find('A')
  end

  def test_002_dont_gather_last_result_by_default
    d = FuzzyMatch.new %w{ NISSAN HONDA }
    d.find('MISSAM')
    # NOTE(review): the Regexp argument is presumably meant to check the error
    # message; confirm the test framework in use actually matches it.
    assert_raises(::RuntimeError, /gather_last_result/) do
      d.last_result
    end
  end

  def test_003_last_result
    d = FuzzyMatch.new %w{ NISSAN HONDA }
    d.find 'MISSAM', :gather_last_result => true
    assert_equal 0.6, d.last_result.score
    assert_equal 'NISSAN', d.last_result.record
  end

  def test_004_false_positive_without_tightener
    d = FuzzyMatch.new ['BOEING 737-100/200', 'BOEING 737-900']
    assert_equal 'BOEING 737-900', d.find('BOEING 737100 number 900')
  end

  def test_005_correct_with_tightener
    tighteners = [
      %r{(7\d)(7|0)-?(\d{1,3})} # tighten 737-100/200 => 737100, which will cause it to win over 737-900
    ]
    d = FuzzyMatch.new ['BOEING 737-100/200', 'BOEING 737-900'], :tighteners => tighteners
    assert_equal 'BOEING 737-100/200', d.find('BOEING 737100 number 900')
  end

  def test_008_false_positive_without_identity
    d = FuzzyMatch.new %w{ foo bar }
    assert_equal 'bar', d.find('baz')
  end

  def test_008_identify_false_positive
    d = FuzzyMatch.new %w{ foo bar }, :identities => [ /ba(.)/ ]
    assert_nil d.find('baz')
  end

  # TODO this is not very helpful
  def test_009_blocking
    d = FuzzyMatch.new [ 'X' ], :blockings => [ /X/, /Y/ ]
    assert_equal 'X', d.find('X')
    assert_nil d.find('A')
  end

  # TODO this is not very helpful
  def test_0095_must_match_blocking
    d = FuzzyMatch.new [ 'X' ], :blockings => [ /X/, /Y/ ], :must_match_blocking => true
    assert_equal 'X', d.find('X')
    assert_nil d.find('A')

    d = FuzzyMatch.new [ 'X' ], :blockings => [ /X/, /Y/ ]
    assert_equal 'X', d.find('X', :must_match_blocking => true)
    assert_nil d.find('A', :must_match_blocking => true)
  end

  def test_011_free
    d = FuzzyMatch.new %w{ NISSAN HONDA }
    d.free
    # NOTE(review): as in test_002, confirm the Regexp is really matched against
    # the error message by the test framework in use.
    assert_raises(::RuntimeError, /free/) do
      d.find('foobar')
    end
  end

  def test_012_find_all
    d = FuzzyMatch.new [ 'X', 'X22', 'Y', 'Y4' ], :blockings => [ /X/, /Y/ ], :must_match_blocking => true
    assert_equal ['X', 'X22' ], d.find_all('X')
    assert_equal [], d.find_all('A')
  end

  def test_013_first_blocking_decides
    d = FuzzyMatch.new [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], :blockings => [ /(boeing \d{3})/i, /boeing/i ]
    assert_equal [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], d.find_all('Boeing 747')

    d = FuzzyMatch.new [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], :blockings => [ /(boeing \d{3})/i, /boeing/i ], :first_blocking_decides => true
    assert_equal [ 'Boeing 747', 'Boeing 747SR' ], d.find_all('Boeing 747')

    # first_blocking_decides refers to the needle
    d = FuzzyMatch.new [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], :blockings => [ /(boeing \d{3})/i, /boeing/i ], :first_blocking_decides => true
    assert_equal [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], d.find_all('Boeing ER6')

    d = FuzzyMatch.new [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], :blockings => [ /(boeing \d{3})/i, /boeing (7|E)/i, /boeing/i ], :first_blocking_decides => true
    assert_equal [ 'Boeing ER6' ], d.find_all('Boeing ER6')

    # or equivalently with an identity
    d = FuzzyMatch.new [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], :blockings => [ /(boeing \d{3})/i, /boeing/i ], :first_blocking_decides => true, :identities => [ /boeing (7|E)/i ]
    assert_equal [ 'Boeing ER6' ], d.find_all('Boeing ER6')
  end

  MyStruct = Struct.new(:one, :two)
  def test_014_symbol_read_sends_method
    ab = MyStruct.new('a', 'b')
    ba = MyStruct.new('b', 'a')
    haystack = [ab, ba]
    by_first = FuzzyMatch.new haystack, :read => :one
    by_last = FuzzyMatch.new haystack, :read => :two
    assert_equal ab, by_first.find('a')
    assert_equal ab, by_last.find('b')
    assert_equal ba, by_first.find('b')
    assert_equal ba, by_last.find('a')
  end

  def test_015_symbol_read_reads_array
    ab = ['a', 'b']
    ba = ['b', 'a']
    haystack = [ab, ba]
    by_first = FuzzyMatch.new haystack, :read => 0
    by_last = FuzzyMatch.new haystack, :read => 1
    assert_equal ab, by_first.find('a')
    assert_equal ab, by_last.find('b')
    assert_equal ba, by_first.find('b')
    assert_equal ba, by_last.find('a')
  end

  def test_016_symbol_read_reads_hash
    ab = { :one => 'a', :two => 'b' }
    ba = { :one => 'b', :two => 'a' }
    haystack = [ab, ba]
    by_first = FuzzyMatch.new haystack, :read => :one
    by_last = FuzzyMatch.new haystack, :read => :two
    assert_equal ab, by_first.find('a')
    assert_equal ab, by_last.find('b')
    assert_equal ba, by_first.find('b')
    assert_equal ba, by_last.find('a')
  end

  def test_017_understands_haystack_reader_option
    ab = ['a', 'b']
    ba = ['b', 'a']
    haystack = [ab, ba]
    by_first = FuzzyMatch.new haystack, :haystack_reader => 0
    assert_equal ab, by_first.find('a')
    assert_equal ba, by_first.find('b')
  end

  def test_018_no_result_if_best_score_is_zero
    assert_nil FuzzyMatch.new(['a']).find('b')
  end

  def test_019_must_match_at_least_one_word
    d = FuzzyMatch.new %w{ RATZ CATZ }, :must_match_at_least_one_word => true
    assert_nil d.find('RITZ')
  end

  def test_020_stop_words
    d = FuzzyMatch.new [ 'A HOTEL', 'B HTL' ]
    assert_equal 'B HTL', d.find('A HTL', :must_match_at_least_one_word => true)

    d = FuzzyMatch.new [ 'A HOTEL', 'B HTL' ], :must_match_at_least_one_word => true
    assert_equal 'B HTL', d.find('A HTL')

    d = FuzzyMatch.new [ 'A HOTEL', 'B HTL' ], :must_match_at_least_one_word => true, :stop_words => [ %r{HO?TE?L} ]
    assert_equal 'A HOTEL', d.find('A HTL')
  end

  def test_021_explain
    require 'stringio'
    capture = StringIO.new
    begin
      # Temporarily redirect $stderr into the StringIO while #explain runs.
      old_stderr = $stderr
      $stderr = capture
      d = FuzzyMatch.new %w{ RATZ CATZ }
      d.explain('RITZ')
    ensure
      $stderr = old_stderr
    end
    capture.rewind
    assert capture.read.include?('CATZ')
    capture.close
  end

  def test_022_compare_words_with_words
    d = FuzzyMatch.new [ 'PENINSULA HOTELS' ], :must_match_at_least_one_word => true
    assert_nil d.find('DOLCE LA HULPE BXL FI')
  end
end
@@ -0,0 +1,268 @@
1
+ require 'helper'
2
+
3
+ require 'shoulda'
4
+
5
+ $log = false
6
+
7
# Shoulda-style tests that drive the matcher through `ltd.improver` using
# aircraft-description needles and haystacks built in #setup.
class TestFuzzyMatchConvoluted < Test::Unit::TestCase
  # Resets the memoized matcher and rebuilds the needle/haystack fixtures and the
  # (initially empty) option lists before every test.
  def setup
    clear_ltd

    # dh 8 400
    @a_needle = ['DE HAVILLAND CANADA DHC8400 Dash 8']
    @a_haystack = ['DEHAVILLAND DEHAVILLAND DHC8-400 DASH-8']
    # dh 88
    @b_needle = ['ABCDEFG DH88 HIJKLMNOP']
    # dh 89
    @c_haystack = ['ABCDEFG DH89 HIJKLMNOP']
    # dh 8 200
    @d_needle = ['DE HAVILLAND CANADA DHC8200 Dash 8']
    @d_haystack = ['BOMBARDIER DEHAVILLAND DHC8-200Q DASH-8']
    @d_lookalike = ['ABCD DHC8200 Dash 8']

    @t_1 = [ '/(dh)c?-?(\d{0,2})-?(\d{0,4})(?:.*?)(dash|\z)/i', 'good tightening for de havilland' ]

    @r_1 = [ '/(dh)c?-?(\d{0,2})-?(\d{0,4})(?:.*?)(dash|\z)/i', 'good identity for de havilland' ]

    @needle = [
      @a_needle,
      @b_needle,
      ['DE HAVILLAND DH89 Dragon Rapide'],
      ['DE HAVILLAND CANADA DHC8100 Dash 8 (E9, CT142, CC142)'],
      @d_needle,
      ['DE HAVILLAND CANADA DHC8300 Dash 8'],
      ['DE HAVILLAND DH90 Dragonfly']
    ]
    @haystack = [
      @a_haystack,
      @c_haystack,
      @d_haystack,
      ['DEHAVILLAND DEHAVILLAND DHC8-100 DASH-8'],
      ['DEHAVILLAND DEHAVILLAND TWIN OTTER DHC-6']
    ]
    @tightenings = []
    @identities = []
    @blockings = []
    @positives = []
    @negatives = []
  end

  # Drops the memoized matcher so the next #ltd call rebuilds it from the
  # current instance variables.
  def clear_ltd
    @_ltd = nil
  end

  # Memoized FuzzyMatch built from the current haystack and option lists.
  def ltd
    @_ltd ||= FuzzyMatch.new @haystack,
                             :tightenings => @tightenings,
                             :identities => @identities,
                             :blockings => @blockings,
                             :positives => @positives,
                             :negatives => @negatives,
                             :blocking_only => @blocking_only,
                             :log => $log
  end

  should "optionally only pay attention to things that match blockings" do
    assert_equal @a_haystack, ltd.improver.match(@a_needle)

    clear_ltd
    @blocking_only = true
    assert_nil ltd.improver.match(@a_needle)

    clear_ltd
    @blocking_only = true
    @blockings.push ['/dash/i']
    assert_equal @a_haystack, ltd.improver.match(@a_needle)
  end

  # the example from the readme, considerably uglier here
  should "check a simple table" do
    @haystack = [ 'seamus', 'andy', 'ben' ]
    @positives = [ [ 'seamus', 'Mr. Seamus Abshere' ] ]
    needle = [ 'Mr. Seamus Abshere', 'Sr. Andy Rossmeissl', 'Master BenT' ]

    assert_nothing_raised do
      ltd.improver.check needle
    end
  end

  should "treat a String as a full record if passed through" do
    dash = 'DHC8-400'
    b747 = 'B747200/300'
    dc9 = 'DC-9-10'
    haystack_records = [ dash, b747, dc9 ]
    simple_ltd = FuzzyMatch.new haystack_records, :log => $log
    assert_equal dash, simple_ltd.improver.match('DeHavilland Dash-8 DHC-400')
    assert_equal b747, simple_ltd.improver.match('Boeing 747-300')
    assert_equal dc9, simple_ltd.improver.match('McDonnell Douglas MD81/DC-9')
  end

  should "call it a mismatch if you hit a blank positive" do
    @positives.push [@a_needle[0], '']
    assert_raises(FuzzyMatch::Improver::Mismatch) do
      ltd.improver.match @a_needle
    end
  end

  should "call it a false positive if you hit a blank negative" do
    @negatives.push [@a_needle[0], '']
    assert_raises(FuzzyMatch::Improver::FalsePositive) do
      ltd.improver.match @a_needle
    end
  end

  should "have a false match without blocking" do
    # @d_needle will be our victim
    @haystack.push @d_lookalike
    @tightenings.push @t_1

    assert_equal @d_lookalike, ltd.improver.match(@d_needle)
  end

  should "do blocking if the needle matches a block" do
    # @d_needle will be our victim
    @haystack.push @d_lookalike
    @tightenings.push @t_1
    @blockings.push ['/(bombardier|de ?havilland)/i']

    assert_equal @d_haystack, ltd.improver.match(@d_needle)
  end

  should "treat blocks as exclusive" do
    @haystack = [ @d_needle ]
    @tightenings.push @t_1
    @blockings.push ['/(bombardier|de ?havilland)/i']

    assert_nil ltd.improver.match(@d_lookalike)
  end

  should "only use identities if they stem from the same regexp" do
    @identities.push @r_1
    @identities.push [ '/(cessna)(?:.*?)(citation)/i' ]
    @identities.push [ '/(cessna)(?:.*?)(\d\d\d)/i' ]
    x_needle = [ 'CESSNA D-333 CITATION V']
    x_haystack = [ 'CESSNA D-333' ]
    @haystack.push x_haystack

    assert_equal x_haystack, ltd.improver.match(x_needle)
  end

  should "use the best score from all of the tightenings" do
    x_needle = ["BOEING 737100"]
    x_haystack = ["BOEING BOEING 737-100/200"]
    x_haystack_wrong = ["BOEING BOEING 737-900"]
    @haystack.push x_haystack
    @haystack.push x_haystack_wrong
    @tightenings.push ['/(7\d)(7|0)-?\d{1,3}\/(\d\d\d)/i']
    @tightenings.push ['/(7\d)(7|0)-?(\d{1,3}|[A-Z]{0,3})/i']

    assert_equal x_haystack, ltd.improver.match(x_needle)
  end

  should "compare using prefixes if tightened key is shorter than correct match" do
    x_needle = ["BOEING 720"]
    x_haystack = ["BOEING BOEING 720-000"]
    x_haystack_wrong = ["BOEING BOEING 717-200"]
    @haystack.push x_haystack
    @haystack.push x_haystack_wrong
    @tightenings.push @t_1
    @tightenings.push ['/(7\d)(7|0)-?\d{1,3}\/(\d\d\d)/i']
    @tightenings.push ['/(7\d)(7|0)-?(\d{1,3}|[A-Z]{0,3})/i']

    assert_equal x_haystack, ltd.improver.match(x_needle)
  end

  should "use the shortest original input" do
    x_needle = ['De Havilland DHC8-777 Dash-8 Superstar']
    x_haystack = ['DEHAVILLAND DEHAVILLAND DHC8-777 DASH-8 Superstar']
    x_haystack_long = ['DEHAVILLAND DEHAVILLAND DHC8-777 DASH-8 Superstar/Supernova']

    @haystack.push x_haystack_long
    @haystack.push x_haystack
    @tightenings.push @t_1

    assert_equal x_haystack, ltd.improver.match(x_needle)
  end

  should "perform lookups needle to haystack" do
    assert_equal @a_haystack, ltd.improver.match(@a_needle)
  end

  should "succeed if there are no checks" do
    assert_nothing_raised do
      ltd.improver.check @needle
    end
  end

  should "succeed if the positive checks just work" do
    @positives.push [ @a_needle[0], @a_haystack[0] ]

    assert_nothing_raised do
      ltd.improver.check @needle
    end
  end

  should "fail if positive checks don't work" do
    @positives.push [ @d_needle[0], @d_haystack[0] ]

    assert_raises(FuzzyMatch::Improver::Mismatch) do
      ltd.improver.check @needle
    end
  end

  should "succeed if proper tightening is applied" do
    @positives.push [ @d_needle[0], @d_haystack[0] ]
    @tightenings.push @t_1

    assert_nothing_raised do
      ltd.improver.check @needle
    end
  end

  should "use a Google Docs spreadsheet as a source of tightenings" do
    @positives.push [ @d_needle[0], @d_haystack[0] ]
    @tightenings = RemoteTable.new :url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=0&output=csv', :headers => false

    # sabshere 9/30/10 this shouldn't raise anything
    # but the tightenings have been changed... we should be using test-only tightenings, not production ones
    # assert_nothing_raised do
    assert_raises(FuzzyMatch::Improver::Mismatch) do
      ltd.improver.check @needle
    end
  end

  should "fail if negative checks don't work" do
    @negatives.push [ @b_needle[0], @c_haystack[0] ]

    assert_raises(FuzzyMatch::Improver::FalsePositive) do
      ltd.improver.check @needle
    end
  end

  should "do inline checking" do
    @negatives.push [ @b_needle[0], @c_haystack[0] ]

    assert_raises(FuzzyMatch::Improver::FalsePositive) do
      ltd.improver.match @b_needle
    end
  end

  should "fail if negative checks don't work, even with tightening" do
    @negatives.push [ @b_needle[0], @c_haystack[0] ]
    @tightenings.push @t_1

    assert_raises(FuzzyMatch::Improver::FalsePositive) do
      ltd.improver.check @needle
    end
  end

  should "succeed if proper identity is applied" do
    @negatives.push [ @b_needle[0], @c_haystack[0] ]
    @positives.push [ @d_needle[0], @d_haystack[0] ]
    @identities.push @r_1

    assert_nothing_raised do
      ltd.improver.check @needle
    end
  end
end
@@ -0,0 +1,33 @@
1
+ require 'helper'
2
+
3
# Tests for FuzzyMatch::Identity: the three-way #identical? result and how the
# regexp is built from Regexp or String input.
class TestIdentity < Test::Unit::TestCase
  def test_001_identical
    i = FuzzyMatch::Identity.new %r{(A)[ ]*(\d)}
    assert_equal true, i.identical?('A1', 'A 1foobar')
  end

  def test_002_certainly_different
    i = FuzzyMatch::Identity.new %r{(A)[ ]*(\d)}
    assert_equal false, i.identical?('A1', 'A 2foobar')
  end

  # nil (as opposed to false) is the expected result when only one side matches
  # the regexp.
  def test_003_no_information_ie_possible_identical
    i = FuzzyMatch::Identity.new %r{(A)[ ]*(\d)}
    assert_nil i.identical?('B1', 'A 2foobar')
  end

  def test_004_regexp
    i = FuzzyMatch::Identity.new %r{\A\\?/(.*)etc/mysql\$$}
    assert_equal %r{\A\\?/(.*)etc/mysql\$$}, i.regexp
  end

  def test_005_regexp_from_string
    i = FuzzyMatch::Identity.new '%r{\A\\\?/(.*)etc/mysql\$$}'
    assert_equal %r{\A\\?/(.*)etc/mysql\$$}, i.regexp
  end

  def test_006_regexp_from_string_using_slash_delim
    i = FuzzyMatch::Identity.new '/\A\\\?\/(.*)etc\/mysql\$$/'
    assert_equal %r{\A\\?/(.*)etc/mysql\$$}, i.regexp
  end
end
@@ -0,0 +1,10 @@
1
+ require 'helper'
2
+
3
# Tests for FuzzyMatch::Tightener#apply.
class TestTightener < Test::Unit::TestCase
  # Each pair is [expected output, input]; the expected strings have no
  # separator between "F" and the three digits.
  def test_001_apply
    tightener = FuzzyMatch::Tightener.new %r{(Ford )[ ]*(F)[\- ]*(\d\d\d)}i
    cases = [
      ['Ford F350', 'Ford F-350'],
      ['Ford F150', 'Ford F150'],
      ['Ford F350', 'Ford F 350']
    ]
    cases.each do |expected, input|
      assert_equal expected, tightener.apply(input)
    end
  end
end