fuzzy_match 1.0.5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (45)
  1. data/.document +5 -0
  2. data/.gitignore +22 -0
  3. data/Gemfile +4 -0
  4. data/LICENSE +20 -0
  5. data/README.rdoc +94 -0
  6. data/Rakefile +21 -0
  7. data/THANKS-WILLIAM-JAMES.rb +37 -0
  8. data/benchmark/before-with-free.txt +283 -0
  9. data/benchmark/before-without-last-result.txt +257 -0
  10. data/benchmark/before.txt +304 -0
  11. data/benchmark/memory.rb +54 -0
  12. data/examples/bts_aircraft/5-2-A.htm +10305 -0
  13. data/examples/bts_aircraft/5-2-B.htm +9576 -0
  14. data/examples/bts_aircraft/5-2-D.htm +7094 -0
  15. data/examples/bts_aircraft/5-2-E.htm +2349 -0
  16. data/examples/bts_aircraft/5-2-G.htm +2922 -0
  17. data/examples/bts_aircraft/blockings.csv +1 -0
  18. data/examples/bts_aircraft/identities.csv +1 -0
  19. data/examples/bts_aircraft/negatives.csv +1 -0
  20. data/examples/bts_aircraft/number_260.csv +334 -0
  21. data/examples/bts_aircraft/positives.csv +1 -0
  22. data/examples/bts_aircraft/test_bts_aircraft.rb +118 -0
  23. data/examples/bts_aircraft/tighteners.csv +1 -0
  24. data/examples/first_name_matching.rb +15 -0
  25. data/examples/icao-bts.xls +0 -0
  26. data/fuzzy_match.gemspec +32 -0
  27. data/lib/fuzzy_match/blocking.rb +36 -0
  28. data/lib/fuzzy_match/cached_result.rb +74 -0
  29. data/lib/fuzzy_match/identity.rb +23 -0
  30. data/lib/fuzzy_match/result.rb +17 -0
  31. data/lib/fuzzy_match/score.rb +125 -0
  32. data/lib/fuzzy_match/similarity.rb +53 -0
  33. data/lib/fuzzy_match/stop_word.rb +19 -0
  34. data/lib/fuzzy_match/tightener.rb +28 -0
  35. data/lib/fuzzy_match/version.rb +3 -0
  36. data/lib/fuzzy_match/wrapper.rb +67 -0
  37. data/lib/fuzzy_match.rb +252 -0
  38. data/test/helper.rb +12 -0
  39. data/test/test_blocking.rb +23 -0
  40. data/test/test_cache.rb +130 -0
  41. data/test/test_fuzzy_match.rb +190 -0
  42. data/test/test_fuzzy_match_convoluted.rb.disabled +268 -0
  43. data/test/test_identity.rb +33 -0
  44. data/test/test_tightening.rb +10 -0
  45. metadata +197 -0
data/test/test_cache.rb
@@ -0,0 +1,130 @@
1
+ require 'helper'
2
+
3
+ require 'active_support/all'
4
+ require 'active_record'
5
+ require 'cohort_scope'
6
+ require 'weighted_average'
7
+
8
+ ActiveRecord::Base.establish_connection(
9
+ 'adapter' => 'mysql',
10
+ 'database' => 'fuzzy_match_test',
11
+ 'username' => 'root',
12
+ 'password' => 'password'
13
+ )
14
+
15
+ # ActiveRecord::Base.logger = Logger.new $stderr
16
+
17
+ ActiveSupport::Inflector.inflections do |inflect|
18
+ inflect.uncountable 'aircraft'
19
+ end
20
+
21
+ require 'fuzzy_match/cached_result'
22
+
23
+ ::FuzzyMatch::CachedResult.setup(true)
24
+ ::FuzzyMatch::CachedResult.delete_all
25
+
26
+ class Aircraft < ActiveRecord::Base
27
+ set_primary_key :icao_code
28
+
29
+ cache_fuzzy_match_matches_with :flight_segments, :primary_key => :aircraft_description, :foreign_key => :aircraft_description
30
+
31
+ def aircraft_description
32
+ [manufacturer_name, model_name].compact.join(' ')
33
+ end
34
+
35
+ def self.fuzzy_match
36
+ @fuzzy_match ||= FuzzyMatch.new all, :read => ::Proc.new { |straw| straw.aircraft_description }
37
+ end
38
+
39
+ def self.create_table
40
+ connection.drop_table(:aircraft) rescue nil
41
+ connection.execute %{
42
+ CREATE TABLE `aircraft` (
43
+ `icao_code` varchar(255) DEFAULT NULL,
44
+ `manufacturer_name` varchar(255) DEFAULT NULL,
45
+ `model_name` varchar(255) DEFAULT NULL,
46
+ PRIMARY KEY (`icao_code`)
47
+ ) ENGINE=InnoDB DEFAULT CHARSET=utf8;
48
+ }
49
+ reset_column_information
50
+ end
51
+ end
52
+
53
+ class FlightSegment < ActiveRecord::Base
54
+ set_primary_key :row_hash
55
+
56
+ cache_fuzzy_match_matches_with :aircraft, :primary_key => :aircraft_description, :foreign_key => :aircraft_description
57
+
58
+ extend CohortScope
59
+ self.minimum_cohort_size = 1
60
+
61
+ def self.create_table
62
+ connection.drop_table(:flight_segments) rescue nil
63
+ connection.execute %{
64
+ CREATE TABLE `flight_segments` (
65
+ `row_hash` varchar(255) NOT NULL DEFAULT '',
66
+ `aircraft_description` varchar(255) DEFAULT NULL,
67
+ `passengers` int(11) DEFAULT NULL,
68
+ `seats` int(11) DEFAULT NULL,
69
+ PRIMARY KEY (`row_hash`)
70
+ ) ENGINE=InnoDB DEFAULT CHARSET=utf8;
71
+ }
72
+ end
73
+ end
74
+
75
+ FlightSegment.create_table
76
+ Aircraft.create_table
77
+
78
+ a = Aircraft.new
79
+ a.icao_code = 'B742'
80
+ a.manufacturer_name = 'Boeing'
81
+ a.model_name = '747-200'
82
+ a.save!
83
+
84
+ fs = FlightSegment.new
85
+ fs.row_hash = 'madison to chicago'
86
+ fs.aircraft_description = 'BORING 747200'
87
+ fs.passengers = 10
88
+ fs.seats = 10
89
+ fs.save!
90
+
91
+ fs = FlightSegment.new
92
+ fs.row_hash = 'madison to minneapolis'
93
+ fs.aircraft_description = 'bing 747'
94
+ fs.passengers = 100
95
+ fs.seats = 5
96
+ fs.save!
97
+
98
+ FlightSegment.find_each do |fs|
99
+ fs.cache_aircraft!
100
+ end
101
+
102
+ class TestCache < Test::Unit::TestCase
103
+ def test_002_one_degree_of_separation
104
+ aircraft = Aircraft.find('B742')
105
+ assert_equal 2, aircraft.flight_segments.count
106
+ end
107
+
108
+ def test_003_standard_sql_calculations
109
+ aircraft = Aircraft.find('B742')
110
+ assert_equal 110, aircraft.flight_segments.sum(:passengers)
111
+ end
112
+
113
+ def test_004_weighted_average
114
+ aircraft = Aircraft.find('B742')
115
+ assert_equal 5.45455, aircraft.flight_segments.weighted_average(:seats, :weighted_by => :passengers)
116
+ end
117
+
118
+ def test_005_right_way_to_do_cohorts
119
+ aircraft = Aircraft.find('B742')
120
+ assert_equal 2, FlightSegment.big_cohort(:aircraft_description => aircraft.flight_segments_foreign_keys).count
121
+ end
122
+
123
+ def test_006_you_can_get_aircraft_from_flight_segments
124
+ fs = FlightSegment.first
125
+ # you need to add an aircraft_description column
126
+ assert_raises(ActiveRecord::StatementInvalid) do
127
+ assert_equal 2, fs.aircraft.count
128
+ end
129
+ end
130
+ end
data/test/test_fuzzy_match.rb
@@ -0,0 +1,190 @@
1
+ # -*- encoding: utf-8 -*-
2
+ require 'helper'
3
+
4
+ class TestFuzzyMatch < Test::Unit::TestCase
5
+ def test_001_find
6
+ d = FuzzyMatch.new %w{ RATZ CATZ }
7
+ assert_equal 'RATZ', d.find('RITZ')
8
+ assert_equal 'RATZ', d.find('RíTZ')
9
+
10
+ d = FuzzyMatch.new [ 'X' ]
11
+ assert_equal 'X', d.find('X')
12
+ assert_equal nil, d.find('A')
13
+ end
14
+
15
+ def test_002_dont_gather_last_result_by_default
16
+ d = FuzzyMatch.new %w{ NISSAN HONDA }
17
+ d.find('MISSAM')
18
+ assert_raises(::RuntimeError, /gather_last_result/) do
19
+ d.last_result
20
+ end
21
+ end
22
+
23
+ def test_003_last_result
24
+ d = FuzzyMatch.new %w{ NISSAN HONDA }
25
+ d.find 'MISSAM', :gather_last_result => true
26
+ assert_equal 0.6, d.last_result.score
27
+ assert_equal 'NISSAN', d.last_result.record
28
+ end
29
+
30
+ def test_004_false_positive_without_tightener
31
+ d = FuzzyMatch.new ['BOEING 737-100/200', 'BOEING 737-900']
32
+ assert_equal 'BOEING 737-900', d.find('BOEING 737100 number 900')
33
+ end
34
+
35
+ def test_005_correct_with_tightener
36
+ tighteners = [
37
+ %r{(7\d)(7|0)-?(\d{1,3})} # tighten 737-100/200 => 737100, which will cause it to win over 737-900
38
+ ]
39
+ d = FuzzyMatch.new ['BOEING 737-100/200', 'BOEING 737-900'], :tighteners => tighteners
40
+ assert_equal 'BOEING 737-100/200', d.find('BOEING 737100 number 900')
41
+ end
42
+
43
+ def test_008_false_positive_without_identity
44
+ d = FuzzyMatch.new %w{ foo bar }
45
+ assert_equal 'bar', d.find('baz')
46
+ end
47
+
48
+ def test_008_identify_false_positive
49
+ d = FuzzyMatch.new %w{ foo bar }, :identities => [ /ba(.)/ ]
50
+ assert_equal nil, d.find('baz')
51
+ end
52
+
53
+ # TODO this is not very helpful
54
+ def test_009_blocking
55
+ d = FuzzyMatch.new [ 'X' ], :blockings => [ /X/, /Y/ ]
56
+ assert_equal 'X', d.find('X')
57
+ assert_equal nil, d.find('A')
58
+ end
59
+
60
+ # TODO this is not very helpful
61
+ def test_0095_must_match_blocking
62
+ d = FuzzyMatch.new [ 'X' ], :blockings => [ /X/, /Y/ ], :must_match_blocking => true
63
+ assert_equal 'X', d.find('X')
64
+ assert_equal nil, d.find('A')
65
+
66
+ d = FuzzyMatch.new [ 'X' ], :blockings => [ /X/, /Y/ ]
67
+ assert_equal 'X', d.find('X', :must_match_blocking => true)
68
+ assert_equal nil, d.find('A', :must_match_blocking => true)
69
+ end
70
+
71
+ def test_011_free
72
+ d = FuzzyMatch.new %w{ NISSAN HONDA }
73
+ d.free
74
+ assert_raises(::RuntimeError, /free/) do
75
+ d.find('foobar')
76
+ end
77
+ end
78
+
79
+ def test_012_find_all
80
+ d = FuzzyMatch.new [ 'X', 'X22', 'Y', 'Y4' ], :blockings => [ /X/, /Y/ ], :must_match_blocking => true
81
+ assert_equal ['X', 'X22' ], d.find_all('X')
82
+ assert_equal [], d.find_all('A')
83
+ end
84
+
85
+ def test_013_first_blocking_decides
86
+ d = FuzzyMatch.new [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], :blockings => [ /(boeing \d{3})/i, /boeing/i ]
87
+ assert_equal [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], d.find_all('Boeing 747')
88
+
89
+ d = FuzzyMatch.new [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], :blockings => [ /(boeing \d{3})/i, /boeing/i ], :first_blocking_decides => true
90
+ assert_equal [ 'Boeing 747', 'Boeing 747SR' ], d.find_all('Boeing 747')
91
+
92
+ # first_blocking_decides refers to the needle
93
+ d = FuzzyMatch.new [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], :blockings => [ /(boeing \d{3})/i, /boeing/i ], :first_blocking_decides => true
94
+ assert_equal [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], d.find_all('Boeing ER6')
95
+
96
+ d = FuzzyMatch.new [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], :blockings => [ /(boeing \d{3})/i, /boeing (7|E)/i, /boeing/i ], :first_blocking_decides => true
97
+ assert_equal [ 'Boeing ER6' ], d.find_all('Boeing ER6')
98
+
99
+ # or equivalently with an identity
100
+ d = FuzzyMatch.new [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], :blockings => [ /(boeing \d{3})/i, /boeing/i ], :first_blocking_decides => true, :identities => [ /boeing (7|E)/i ]
101
+ assert_equal [ 'Boeing ER6' ], d.find_all('Boeing ER6')
102
+ end
103
+
104
+ MyStruct = Struct.new(:one, :two)
105
+ def test_014_symbol_read_sends_method
106
+ ab = MyStruct.new('a', 'b')
107
+ ba = MyStruct.new('b', 'a')
108
+ haystack = [ab, ba]
109
+ by_first = FuzzyMatch.new haystack, :read => :one
110
+ by_last = FuzzyMatch.new haystack, :read => :two
111
+ assert_equal ab, by_first.find('a')
112
+ assert_equal ab, by_last.find('b')
113
+ assert_equal ba, by_first.find('b')
114
+ assert_equal ba, by_last.find('a')
115
+ end
116
+
117
+ def test_015_symbol_read_reads_array
118
+ ab = ['a', 'b']
119
+ ba = ['b', 'a']
120
+ haystack = [ab, ba]
121
+ by_first = FuzzyMatch.new haystack, :read => 0
122
+ by_last = FuzzyMatch.new haystack, :read => 1
123
+ assert_equal ab, by_first.find('a')
124
+ assert_equal ab, by_last.find('b')
125
+ assert_equal ba, by_first.find('b')
126
+ assert_equal ba, by_last.find('a')
127
+ end
128
+
129
+ def test_016_symbol_read_reads_hash
130
+ ab = { :one => 'a', :two => 'b' }
131
+ ba = { :one => 'b', :two => 'a' }
132
+ haystack = [ab, ba]
133
+ by_first = FuzzyMatch.new haystack, :read => :one
134
+ by_last = FuzzyMatch.new haystack, :read => :two
135
+ assert_equal ab, by_first.find('a')
136
+ assert_equal ab, by_last.find('b')
137
+ assert_equal ba, by_first.find('b')
138
+ assert_equal ba, by_last.find('a')
139
+ end
140
+
141
+ def test_017_understands_haystack_reader_option
142
+ ab = ['a', 'b']
143
+ ba = ['b', 'a']
144
+ haystack = [ab, ba]
145
+ by_first = FuzzyMatch.new haystack, :haystack_reader => 0
146
+ assert_equal ab, by_first.find('a')
147
+ assert_equal ba, by_first.find('b')
148
+ end
149
+
150
+ def test_018_no_result_if_best_score_is_zero
151
+ assert_equal nil, FuzzyMatch.new(['a']).find('b')
152
+ end
153
+
154
+ def test_019_must_match_at_least_one_word
155
+ d = FuzzyMatch.new %w{ RATZ CATZ }, :must_match_at_least_one_word => true
156
+ assert_equal nil, d.find('RITZ')
157
+ end
158
+
159
+ def test_020_stop_words
160
+ d = FuzzyMatch.new [ 'A HOTEL', 'B HTL' ]
161
+ assert_equal 'B HTL', d.find('A HTL', :must_match_at_least_one_word => true)
162
+
163
+ d = FuzzyMatch.new [ 'A HOTEL', 'B HTL' ], :must_match_at_least_one_word => true
164
+ assert_equal 'B HTL', d.find('A HTL')
165
+
166
+ d = FuzzyMatch.new [ 'A HOTEL', 'B HTL' ], :must_match_at_least_one_word => true, :stop_words => [ %r{HO?TE?L} ]
167
+ assert_equal 'A HOTEL', d.find('A HTL')
168
+ end
169
+
170
+ def test_021_explain
171
+ require 'stringio'
172
+ capture = StringIO.new
173
+ begin
174
+ old_stderr = $stderr
175
+ $stderr = capture
176
+ d = FuzzyMatch.new %w{ RATZ CATZ }
177
+ d.explain('RITZ')
178
+ ensure
179
+ $stderr = old_stderr
180
+ end
181
+ capture.rewind
182
+ assert capture.read.include?('CATZ')
183
+ capture.close
184
+ end
185
+
186
+ def test_022_compare_words_with_words
187
+ d = FuzzyMatch.new [ 'PENINSULA HOTELS' ], :must_match_at_least_one_word => true
188
+ assert_equal nil, d.find('DOLCE LA HULPE BXL FI')
189
+ end
190
+ end
data/test/test_fuzzy_match_convoluted.rb.disabled
@@ -0,0 +1,268 @@
1
+ require 'helper'
2
+
3
+ require 'shoulda'
4
+
5
+ $log = false
6
+
7
+ class TestFuzzyMatchConvoluted < Test::Unit::TestCase
8
+ def setup
9
+ clear_ltd
10
+
11
+ # dh 8 400
12
+ @a_needle = ['DE HAVILLAND CANADA DHC8400 Dash 8']
13
+ @a_haystack = ['DEHAVILLAND DEHAVILLAND DHC8-400 DASH-8']
14
+ # dh 88
15
+ @b_needle = ['ABCDEFG DH88 HIJKLMNOP']
16
+ # dh 89
17
+ @c_haystack = ['ABCDEFG DH89 HIJKLMNOP']
18
+ # dh 8 200
19
+ @d_needle = ['DE HAVILLAND CANADA DHC8200 Dash 8']
20
+ @d_haystack = ['BOMBARDIER DEHAVILLAND DHC8-200Q DASH-8']
21
+ @d_lookalike = ['ABCD DHC8200 Dash 8']
22
+
23
+ @t_1 = [ '/(dh)c?-?(\d{0,2})-?(\d{0,4})(?:.*?)(dash|\z)/i', 'good tightening for de havilland' ]
24
+
25
+ @r_1 = [ '/(dh)c?-?(\d{0,2})-?(\d{0,4})(?:.*?)(dash|\z)/i', 'good identity for de havilland' ]
26
+
27
+ @needle = [
28
+ @a_needle,
29
+ @b_needle,
30
+ ['DE HAVILLAND DH89 Dragon Rapide'],
31
+ ['DE HAVILLAND CANADA DHC8100 Dash 8 (E9, CT142, CC142)'],
32
+ @d_needle,
33
+ ['DE HAVILLAND CANADA DHC8300 Dash 8'],
34
+ ['DE HAVILLAND DH90 Dragonfly']
35
+ ]
36
+ @haystack = [
37
+ @a_haystack,
38
+ @c_haystack,
39
+ @d_haystack,
40
+ ['DEHAVILLAND DEHAVILLAND DHC8-100 DASH-8'],
41
+ ['DEHAVILLAND DEHAVILLAND TWIN OTTER DHC-6']
42
+ ]
43
+ @tightenings = []
44
+ @identities = []
45
+ @blockings = []
46
+ @positives = []
47
+ @negatives = []
48
+ end
49
+
50
+ def clear_ltd
51
+ @_ltd = nil
52
+ end
53
+
54
+ def ltd
55
+ @_ltd ||= FuzzyMatch.new @haystack,
56
+ :tightenings => @tightenings,
57
+ :identities => @identities,
58
+ :blockings => @blockings,
59
+ :positives => @positives,
60
+ :negatives => @negatives,
61
+ :blocking_only => @blocking_only,
62
+ :log => $log
63
+ end
64
+
65
+ should "optionally only pay attention to things that match blockings" do
66
+ assert_equal @a_haystack, ltd.improver.match(@a_needle)
67
+
68
+ clear_ltd
69
+ @blocking_only = true
70
+ assert_equal nil, ltd.improver.match(@a_needle)
71
+
72
+ clear_ltd
73
+ @blocking_only = true
74
+ @blockings.push ['/dash/i']
75
+ assert_equal @a_haystack, ltd.improver.match(@a_needle)
76
+ end
77
+
78
+ # the example from the readme, considerably uglier here
79
+ should "check a simple table" do
80
+ @haystack = [ 'seamus', 'andy', 'ben' ]
81
+ @positives = [ [ 'seamus', 'Mr. Seamus Abshere' ] ]
82
+ needle = [ 'Mr. Seamus Abshere', 'Sr. Andy Rossmeissl', 'Master BenT' ]
83
+
84
+ assert_nothing_raised do
85
+ ltd.improver.check needle
86
+ end
87
+ end
88
+
89
+ should "treat a String as a full record if passed through" do
90
+ dash = 'DHC8-400'
91
+ b747 = 'B747200/300'
92
+ dc9 = 'DC-9-10'
93
+ haystack_records = [ dash, b747, dc9 ]
94
+ simple_ltd = FuzzyMatch.new haystack_records, :log => $log
95
+ assert_equal dash, simple_ltd.improver.match('DeHavilland Dash-8 DHC-400')
96
+ assert_equal b747, simple_ltd.improver.match('Boeing 747-300')
97
+ assert_equal dc9, simple_ltd.improver.match('McDonnell Douglas MD81/DC-9')
98
+ end
99
+
100
+ should "call it a mismatch if you hit a blank positive" do
101
+ @positives.push [@a_needle[0], '']
102
+ assert_raises(FuzzyMatch::Improver::Mismatch) do
103
+ ltd.improver.match @a_needle
104
+ end
105
+ end
106
+
107
+ should "call it a false positive if you hit a blank negative" do
108
+ @negatives.push [@a_needle[0], '']
109
+ assert_raises(FuzzyMatch::Improver::FalsePositive) do
110
+ ltd.improver.match @a_needle
111
+ end
112
+ end
113
+
114
+ should "have a false match without blocking" do
115
+ # @d_needle will be our victim
116
+ @haystack.push @d_lookalike
117
+ @tightenings.push @t_1
118
+
119
+ assert_equal @d_lookalike, ltd.improver.match(@d_needle)
120
+ end
121
+
122
+ should "do blocking if the needle matches a block" do
123
+ # @d_needle will be our victim
124
+ @haystack.push @d_lookalike
125
+ @tightenings.push @t_1
126
+ @blockings.push ['/(bombardier|de ?havilland)/i']
127
+
128
+ assert_equal @d_haystack, ltd.improver.match(@d_needle)
129
+ end
130
+
131
+ should "treat blocks as exclusive" do
132
+ @haystack = [ @d_needle ]
133
+ @tightenings.push @t_1
134
+ @blockings.push ['/(bombardier|de ?havilland)/i']
135
+
136
+ assert_equal nil, ltd.improver.match(@d_lookalike)
137
+ end
138
+
139
+ should "only use identities if they stem from the same regexp" do
140
+ @identities.push @r_1
141
+ @identities.push [ '/(cessna)(?:.*?)(citation)/i' ]
142
+ @identities.push [ '/(cessna)(?:.*?)(\d\d\d)/i' ]
143
+ x_needle = [ 'CESSNA D-333 CITATION V']
144
+ x_haystack = [ 'CESSNA D-333' ]
145
+ @haystack.push x_haystack
146
+
147
+ assert_equal x_haystack, ltd.improver.match(x_needle)
148
+ end
149
+
150
+ should "use the best score from all of the tightenings" do
151
+ x_needle = ["BOEING 737100"]
152
+ x_haystack = ["BOEING BOEING 737-100/200"]
153
+ x_haystack_wrong = ["BOEING BOEING 737-900"]
154
+ @haystack.push x_haystack
155
+ @haystack.push x_haystack_wrong
156
+ @tightenings.push ['/(7\d)(7|0)-?\d{1,3}\/(\d\d\d)/i']
157
+ @tightenings.push ['/(7\d)(7|0)-?(\d{1,3}|[A-Z]{0,3})/i']
158
+
159
+ assert_equal x_haystack, ltd.improver.match(x_needle)
160
+ end
161
+
162
+ should "compare using prefixes if tightened key is shorter than correct match" do
163
+ x_needle = ["BOEING 720"]
164
+ x_haystack = ["BOEING BOEING 720-000"]
165
+ x_haystack_wrong = ["BOEING BOEING 717-200"]
166
+ @haystack.push x_haystack
167
+ @haystack.push x_haystack_wrong
168
+ @tightenings.push @t_1
169
+ @tightenings.push ['/(7\d)(7|0)-?\d{1,3}\/(\d\d\d)/i']
170
+ @tightenings.push ['/(7\d)(7|0)-?(\d{1,3}|[A-Z]{0,3})/i']
171
+
172
+ assert_equal x_haystack, ltd.improver.match(x_needle)
173
+ end
174
+
175
+ should "use the shortest original input" do
176
+ x_needle = ['De Havilland DHC8-777 Dash-8 Superstar']
177
+ x_haystack = ['DEHAVILLAND DEHAVILLAND DHC8-777 DASH-8 Superstar']
178
+ x_haystack_long = ['DEHAVILLAND DEHAVILLAND DHC8-777 DASH-8 Superstar/Supernova']
179
+
180
+ @haystack.push x_haystack_long
181
+ @haystack.push x_haystack
182
+ @tightenings.push @t_1
183
+
184
+ assert_equal x_haystack, ltd.improver.match(x_needle)
185
+ end
186
+
187
+ should "perform lookups needle to haystack" do
188
+ assert_equal @a_haystack, ltd.improver.match(@a_needle)
189
+ end
190
+
191
+ should "succeed if there are no checks" do
192
+ assert_nothing_raised do
193
+ ltd.improver.check @needle
194
+ end
195
+ end
196
+
197
+ should "succeed if the positive checks just work" do
198
+ @positives.push [ @a_needle[0], @a_haystack[0] ]
199
+
200
+ assert_nothing_raised do
201
+ ltd.improver.check @needle
202
+ end
203
+ end
204
+
205
+ should "fail if positive checks don't work" do
206
+ @positives.push [ @d_needle[0], @d_haystack[0] ]
207
+
208
+ assert_raises(FuzzyMatch::Improver::Mismatch) do
209
+ ltd.improver.check @needle
210
+ end
211
+ end
212
+
213
+ should "succeed if proper tightening is applied" do
214
+ @positives.push [ @d_needle[0], @d_haystack[0] ]
215
+ @tightenings.push @t_1
216
+
217
+ assert_nothing_raised do
218
+ ltd.improver.check @needle
219
+ end
220
+ end
221
+
222
+ should "use a Google Docs spreadsheet as a source of tightenings" do
223
+ @positives.push [ @d_needle[0], @d_haystack[0] ]
224
+ @tightenings = RemoteTable.new :url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=0&output=csv', :headers => false
225
+
226
+ # sabshere 9/30/10 this shouldn't raise anything
227
+ # but the tightenings have been changed... we should be using test-only tightenings, not production ones
228
+ # assert_nothing_raised do
229
+ assert_raises(FuzzyMatch::Improver::Mismatch) do
230
+ ltd.improver.check @needle
231
+ end
232
+ end
233
+
234
+ should "fail if negative checks don't work" do
235
+ @negatives.push [ @b_needle[0], @c_haystack[0] ]
236
+
237
+ assert_raises(FuzzyMatch::Improver::FalsePositive) do
238
+ ltd.improver.check @needle
239
+ end
240
+ end
241
+
242
+ should "do inline checking" do
243
+ @negatives.push [ @b_needle[0], @c_haystack[0] ]
244
+
245
+ assert_raises(FuzzyMatch::Improver::FalsePositive) do
246
+ ltd.improver.match @b_needle
247
+ end
248
+ end
249
+
250
+ should "fail if negative checks don't work, even with tightening" do
251
+ @negatives.push [ @b_needle[0], @c_haystack[0] ]
252
+ @tightenings.push @t_1
253
+
254
+ assert_raises(FuzzyMatch::Improver::FalsePositive) do
255
+ ltd.improver.check @needle
256
+ end
257
+ end
258
+
259
+ should "succeed if proper identity is applied" do
260
+ @negatives.push [ @b_needle[0], @c_haystack[0] ]
261
+ @positives.push [ @d_needle[0], @d_haystack[0] ]
262
+ @identities.push @r_1
263
+
264
+ assert_nothing_raised do
265
+ ltd.improver.check @needle
266
+ end
267
+ end
268
+ end
data/test/test_identity.rb
@@ -0,0 +1,33 @@
1
+ require 'helper'
2
+
3
+ class TestIdentity < Test::Unit::TestCase
4
+ def test_001_identical
5
+ i = FuzzyMatch::Identity.new %r{(A)[ ]*(\d)}
6
+ assert_equal true, i.identical?('A1', 'A 1foobar')
7
+ end
8
+
9
+ def test_002_certainly_different
10
+ i = FuzzyMatch::Identity.new %r{(A)[ ]*(\d)}
11
+ assert_equal false, i.identical?('A1', 'A 2foobar')
12
+ end
13
+
14
+ def test_003_no_information_ie_possible_identical
15
+ i = FuzzyMatch::Identity.new %r{(A)[ ]*(\d)}
16
+ assert_equal nil, i.identical?('B1', 'A 2foobar')
17
+ end
18
+
19
+ def test_004_regexp
20
+ i = FuzzyMatch::Identity.new %r{\A\\?/(.*)etc/mysql\$$}
21
+ assert_equal %r{\A\\?/(.*)etc/mysql\$$}, i.regexp
22
+ end
23
+
24
+ def test_005_regexp_from_string
25
+ i = FuzzyMatch::Identity.new '%r{\A\\\?/(.*)etc/mysql\$$}'
26
+ assert_equal %r{\A\\?/(.*)etc/mysql\$$}, i.regexp
27
+ end
28
+
29
+ def test_006_regexp_from_string_using_slash_delim
30
+ i = FuzzyMatch::Identity.new '/\A\\\?\/(.*)etc\/mysql\$$/'
31
+ assert_equal %r{\A\\?/(.*)etc/mysql\$$}, i.regexp
32
+ end
33
+ end
data/test/test_tightening.rb
@@ -0,0 +1,10 @@
1
+ require 'helper'
2
+
3
+ class TestTightener < Test::Unit::TestCase
4
+ def test_001_apply
5
+ t = FuzzyMatch::Tightener.new %r{(Ford )[ ]*(F)[\- ]*(\d\d\d)}i
6
+ assert_equal 'Ford F350', t.apply('Ford F-350')
7
+ assert_equal 'Ford F150', t.apply('Ford F150')
8
+ assert_equal 'Ford F350', t.apply('Ford F 350')
9
+ end
10
+ end