fuzzy_match 1.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +5 -0
- data/.gitignore +22 -0
- data/Gemfile +4 -0
- data/LICENSE +20 -0
- data/README.rdoc +94 -0
- data/Rakefile +21 -0
- data/THANKS-WILLIAM-JAMES.rb +37 -0
- data/benchmark/before-with-free.txt +283 -0
- data/benchmark/before-without-last-result.txt +257 -0
- data/benchmark/before.txt +304 -0
- data/benchmark/memory.rb +54 -0
- data/examples/bts_aircraft/5-2-A.htm +10305 -0
- data/examples/bts_aircraft/5-2-B.htm +9576 -0
- data/examples/bts_aircraft/5-2-D.htm +7094 -0
- data/examples/bts_aircraft/5-2-E.htm +2349 -0
- data/examples/bts_aircraft/5-2-G.htm +2922 -0
- data/examples/bts_aircraft/blockings.csv +1 -0
- data/examples/bts_aircraft/identities.csv +1 -0
- data/examples/bts_aircraft/negatives.csv +1 -0
- data/examples/bts_aircraft/number_260.csv +334 -0
- data/examples/bts_aircraft/positives.csv +1 -0
- data/examples/bts_aircraft/test_bts_aircraft.rb +118 -0
- data/examples/bts_aircraft/tighteners.csv +1 -0
- data/examples/first_name_matching.rb +15 -0
- data/examples/icao-bts.xls +0 -0
- data/fuzzy_match.gemspec +32 -0
- data/lib/fuzzy_match/blocking.rb +36 -0
- data/lib/fuzzy_match/cached_result.rb +74 -0
- data/lib/fuzzy_match/identity.rb +23 -0
- data/lib/fuzzy_match/result.rb +17 -0
- data/lib/fuzzy_match/score.rb +125 -0
- data/lib/fuzzy_match/similarity.rb +53 -0
- data/lib/fuzzy_match/stop_word.rb +19 -0
- data/lib/fuzzy_match/tightener.rb +28 -0
- data/lib/fuzzy_match/version.rb +3 -0
- data/lib/fuzzy_match/wrapper.rb +67 -0
- data/lib/fuzzy_match.rb +252 -0
- data/test/helper.rb +12 -0
- data/test/test_blocking.rb +23 -0
- data/test/test_cache.rb +130 -0
- data/test/test_fuzzy_match.rb +190 -0
- data/test/test_fuzzy_match_convoluted.rb.disabled +268 -0
- data/test/test_identity.rb +33 -0
- data/test/test_tightening.rb +10 -0
- metadata +197 -0
data/test/test_cache.rb
ADDED
@@ -0,0 +1,130 @@
|
|
1
|
+
require 'helper'
|
2
|
+
|
3
|
+
require 'active_support/all'
|
4
|
+
require 'active_record'
|
5
|
+
require 'cohort_scope'
|
6
|
+
require 'weighted_average'
|
7
|
+
|
8
|
+
# Connect to the MySQL database used by the cache tests. The defaults are the
# values this file has always used; each can be overridden through an
# environment variable so the suite can run against a differently configured
# server without editing the file.
ActiveRecord::Base.establish_connection(
  'adapter' => 'mysql',
  'database' => ENV.fetch('FUZZY_MATCH_TEST_DATABASE', 'fuzzy_match_test'),
  'username' => ENV.fetch('FUZZY_MATCH_TEST_USERNAME', 'root'),
  'password' => ENV.fetch('FUZZY_MATCH_TEST_PASSWORD', 'password')
)

# ActiveRecord::Base.logger = Logger.new $stderr

# Register "aircraft" as uncountable so the inflector does not pluralize it.
ActiveSupport::Inflector.inflections do |inflect|
  inflect.uncountable 'aircraft'
end

require 'fuzzy_match/cached_result'

# NOTE(review): presumably prepares the cached-result storage, with `true`
# forcing it to be rebuilt -- confirm against FuzzyMatch::CachedResult.setup.
::FuzzyMatch::CachedResult.setup(true)
# Start every run with no cached results.
::FuzzyMatch::CachedResult.delete_all
|
25
|
+
|
26
|
+
# ActiveRecord model for an aircraft type, keyed by its ICAO code.
#
# The table has no aircraft_description column; the description is computed
# from manufacturer_name and model_name and is the value used as the
# primary/foreign key when cached fuzzy matches against flight segments are
# declared below.
class Aircraft < ActiveRecord::Base
  set_primary_key :icao_code

  # NOTE(review): presumably defines #flight_segments and #cache_flight_segments!
  # backed by FuzzyMatch::CachedResult -- confirm in fuzzy_match/cached_result.
  cache_fuzzy_match_matches_with :flight_segments, :primary_key => :aircraft_description, :foreign_key => :aircraft_description

  # Returns the manufacturer and model names joined by a single space,
  # skipping whichever of the two is nil.
  def aircraft_description
    [manufacturer_name, model_name].compact.join(' ')
  end

  # Returns a memoized FuzzyMatch whose haystack is every Aircraft row and
  # whose records are read through #aircraft_description.
  def self.fuzzy_match
    @fuzzy_match ||= FuzzyMatch.new all, :read => ::Proc.new { |straw| straw.aircraft_description }
  end

  # Drops and recreates the `aircraft` table, then clears the cached column
  # metadata so the model sees the fresh schema.
  def self.create_table
    # IF EXISTS makes a missing table a no-op without hiding unrelated
    # failures (e.g. a dead connection), which `drop_table ... rescue nil` did.
    connection.execute 'DROP TABLE IF EXISTS `aircraft`'
    connection.execute %{
      CREATE TABLE `aircraft` (
        `icao_code` varchar(255) DEFAULT NULL,
        `manufacturer_name` varchar(255) DEFAULT NULL,
        `model_name` varchar(255) DEFAULT NULL,
        PRIMARY KEY (`icao_code`)
      ) ENGINE=InnoDB DEFAULT CHARSET=utf8;
    }
    reset_column_information
  end
end
|
52
|
+
|
53
|
+
# ActiveRecord model for one flight segment, keyed by row_hash. Each row
# carries a free-text aircraft_description plus passenger and seat counts.
class FlightSegment < ActiveRecord::Base
  set_primary_key :row_hash

  # NOTE(review): presumably defines #aircraft and #cache_aircraft! backed by
  # FuzzyMatch::CachedResult -- confirm in fuzzy_match/cached_result.
  cache_fuzzy_match_matches_with :aircraft, :primary_key => :aircraft_description, :foreign_key => :aircraft_description

  extend CohortScope
  self.minimum_cohort_size = 1

  # Drops and recreates the `flight_segments` table, then clears the cached
  # column metadata, mirroring what Aircraft.create_table does.
  def self.create_table
    # IF EXISTS makes a missing table a no-op without hiding unrelated
    # failures (e.g. a dead connection), which `drop_table ... rescue nil` did.
    connection.execute 'DROP TABLE IF EXISTS `flight_segments`'
    connection.execute %{
      CREATE TABLE `flight_segments` (
        `row_hash` varchar(255) NOT NULL DEFAULT '',
        `aircraft_description` varchar(255) DEFAULT NULL,
        `passengers` int(11) DEFAULT NULL,
        `seats` int(11) DEFAULT NULL,
        PRIMARY KEY (`row_hash`)
      ) ENGINE=InnoDB DEFAULT CHARSET=utf8;
    }
    # Avoid working from column information loaded before the table was rebuilt.
    reset_column_information
  end
end
|
74
|
+
|
75
|
+
# Rebuild both tables (flight segments first, then aircraft).
[FlightSegment, Aircraft].each { |model| model.create_table }

# One real aircraft for the misspelled flight segments below to match against.
Aircraft.new.tap do |boeing|
  boeing.icao_code = 'B742'
  boeing.manufacturer_name = 'Boeing'
  boeing.model_name = '747-200'
  boeing.save!
end

# Two flight segments whose descriptions are deliberately mangled:
# row_hash, aircraft_description, passengers, seats.
[
  ['madison to chicago', 'BORING 747200', 10, 10],
  ['madison to minneapolis', 'bing 747', 100, 5]
].each do |row_hash, description, passengers, seats|
  segment = FlightSegment.new
  segment.row_hash = row_hash
  segment.aircraft_description = description
  segment.passengers = passengers
  segment.seats = seats
  segment.save!
end

# Cache the aircraft match for every flight segment before the tests run.
FlightSegment.find_each do |segment|
  segment.cache_aircraft!
end
|
101
|
+
|
102
|
+
# Exercises the cached fuzzy-match associations between Aircraft and
# FlightSegment, using the rows seeded at the top of this file: one aircraft
# ('B742') and two flight segments (10 passengers / 10 seats and
# 100 passengers / 5 seats).
class TestCache < Test::Unit::TestCase
  # Both seeded flight segments should be associated with the one aircraft.
  def test_002_one_degree_of_separation
    aircraft = Aircraft.find('B742')
    assert_equal 2, aircraft.flight_segments.count
  end

  # Plain SQL aggregates work on the association: 10 + 100 passengers.
  def test_003_standard_sql_calculations
    aircraft = Aircraft.find('B742')
    assert_equal 110, aircraft.flight_segments.sum(:passengers)
  end

  # Seats weighted by passengers: (10 * 10 + 5 * 100) / 110 = 5.4545...
  def test_004_weighted_average
    aircraft = Aircraft.find('B742')
    # Compare with a tolerance: the result is a non-terminating decimal, so
    # exact equality would depend on how the database happens to round it.
    assert_in_delta 5.45455, aircraft.flight_segments.weighted_average(:seats, :weighted_by => :passengers), 0.00001
  end

  # Cohorts are built from the foreign keys collected through the association.
  def test_005_right_way_to_do_cohorts
    aircraft = Aircraft.find('B742')
    assert_equal 2, FlightSegment.big_cohort(:aircraft_description => aircraft.flight_segments_foreign_keys).count
  end

  # The reverse direction is expected to fail with a SQL error because the
  # aircraft table has no aircraft_description column.
  def test_006_you_can_get_aircraft_from_flight_segments
    fs = FlightSegment.first
    # you need to add an aircraft_description column
    assert_raises(ActiveRecord::StatementInvalid) do
      assert_equal 2, fs.aircraft.count
    end
  end
end
|
@@ -0,0 +1,190 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
require 'helper'
|
3
|
+
|
4
|
+
# Tests for the public FuzzyMatch API: find / find_all, last_result,
# tighteners, identities, blockings, the :read option, stop words and explain.
class TestFuzzyMatch < Test::Unit::TestCase
  def test_001_find
    d = FuzzyMatch.new %w{ RATZ CATZ }
    assert_equal 'RATZ', d.find('RITZ')
    assert_equal 'RATZ', d.find('RíTZ')

    d = FuzzyMatch.new [ 'X' ]
    assert_equal 'X', d.find('X')
    assert_nil d.find('A')
  end

  def test_002_dont_gather_last_result_by_default
    d = FuzzyMatch.new %w{ NISSAN HONDA }
    d.find('MISSAM')
    # assert_raises does not match a Regexp argument against the exception
    # message, so check the message on the returned exception instead.
    # NOTE(review): assumes the message mentions gather_last_result -- confirm.
    error = assert_raises(::RuntimeError) do
      d.last_result
    end
    assert_match(/gather_last_result/, error.message)
  end

  def test_003_last_result
    d = FuzzyMatch.new %w{ NISSAN HONDA }
    d.find 'MISSAM', :gather_last_result => true
    # NOTE(review): exact float comparison; kept as-is because the type of
    # #score is not visible here -- consider assert_in_delta if it is a Float.
    assert_equal 0.6, d.last_result.score
    assert_equal 'NISSAN', d.last_result.record
  end

  def test_004_false_positive_without_tightener
    d = FuzzyMatch.new ['BOEING 737-100/200', 'BOEING 737-900']
    assert_equal 'BOEING 737-900', d.find('BOEING 737100 number 900')
  end

  def test_005_correct_with_tightener
    tighteners = [
      %r{(7\d)(7|0)-?(\d{1,3})} # tighten 737-100/200 => 737100, which will cause it to win over 737-900
    ]
    d = FuzzyMatch.new ['BOEING 737-100/200', 'BOEING 737-900'], :tighteners => tighteners
    assert_equal 'BOEING 737-100/200', d.find('BOEING 737100 number 900')
  end

  def test_008_false_positive_without_identity
    d = FuzzyMatch.new %w{ foo bar }
    assert_equal 'bar', d.find('baz')
  end

  def test_008_identify_false_positive
    d = FuzzyMatch.new %w{ foo bar }, :identities => [ /ba(.)/ ]
    assert_nil d.find('baz')
  end

  # TODO this is not very helpful
  def test_009_blocking
    d = FuzzyMatch.new [ 'X' ], :blockings => [ /X/, /Y/ ]
    assert_equal 'X', d.find('X')
    assert_nil d.find('A')
  end

  # TODO this is not very helpful
  def test_0095_must_match_blocking
    # Option given at construction time...
    d = FuzzyMatch.new [ 'X' ], :blockings => [ /X/, /Y/ ], :must_match_blocking => true
    assert_equal 'X', d.find('X')
    assert_nil d.find('A')

    # ...or per call to find.
    d = FuzzyMatch.new [ 'X' ], :blockings => [ /X/, /Y/ ]
    assert_equal 'X', d.find('X', :must_match_blocking => true)
    assert_nil d.find('A', :must_match_blocking => true)
  end

  def test_011_free
    d = FuzzyMatch.new %w{ NISSAN HONDA }
    d.free
    # As in test_002, the message has to be checked explicitly.
    # NOTE(review): assumes the message contains "free" -- confirm.
    error = assert_raises(::RuntimeError) do
      d.find('foobar')
    end
    assert_match(/free/, error.message)
  end

  def test_012_find_all
    d = FuzzyMatch.new [ 'X', 'X22', 'Y', 'Y4' ], :blockings => [ /X/, /Y/ ], :must_match_blocking => true
    assert_equal ['X', 'X22' ], d.find_all('X')
    assert_equal [], d.find_all('A')
  end

  def test_013_first_blocking_decides
    d = FuzzyMatch.new [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], :blockings => [ /(boeing \d{3})/i, /boeing/i ]
    assert_equal [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], d.find_all('Boeing 747')

    d = FuzzyMatch.new [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], :blockings => [ /(boeing \d{3})/i, /boeing/i ], :first_blocking_decides => true
    assert_equal [ 'Boeing 747', 'Boeing 747SR' ], d.find_all('Boeing 747')

    # first_blocking_decides refers to the needle
    d = FuzzyMatch.new [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], :blockings => [ /(boeing \d{3})/i, /boeing/i ], :first_blocking_decides => true
    assert_equal [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], d.find_all('Boeing ER6')

    d = FuzzyMatch.new [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], :blockings => [ /(boeing \d{3})/i, /boeing (7|E)/i, /boeing/i ], :first_blocking_decides => true
    assert_equal [ 'Boeing ER6' ], d.find_all('Boeing ER6')

    # or equivalently with an identity
    d = FuzzyMatch.new [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], :blockings => [ /(boeing \d{3})/i, /boeing/i ], :first_blocking_decides => true, :identities => [ /boeing (7|E)/i ]
    assert_equal [ 'Boeing ER6' ], d.find_all('Boeing ER6')
  end

  MyStruct = Struct.new(:one, :two)

  # A Symbol :read is sent to each record as a method name.
  def test_014_symbol_read_sends_method
    ab = MyStruct.new('a', 'b')
    ba = MyStruct.new('b', 'a')
    haystack = [ab, ba]
    by_first = FuzzyMatch.new haystack, :read => :one
    by_last = FuzzyMatch.new haystack, :read => :two
    assert_equal ab, by_first.find('a')
    assert_equal ab, by_last.find('b')
    assert_equal ba, by_first.find('b')
    assert_equal ba, by_last.find('a')
  end

  # An Integer :read indexes into Array records.
  def test_015_symbol_read_reads_array
    ab = ['a', 'b']
    ba = ['b', 'a']
    haystack = [ab, ba]
    by_first = FuzzyMatch.new haystack, :read => 0
    by_last = FuzzyMatch.new haystack, :read => 1
    assert_equal ab, by_first.find('a')
    assert_equal ab, by_last.find('b')
    assert_equal ba, by_first.find('b')
    assert_equal ba, by_last.find('a')
  end

  # A Symbol :read is used as the key for Hash records.
  def test_016_symbol_read_reads_hash
    ab = { :one => 'a', :two => 'b' }
    ba = { :one => 'b', :two => 'a' }
    haystack = [ab, ba]
    by_first = FuzzyMatch.new haystack, :read => :one
    by_last = FuzzyMatch.new haystack, :read => :two
    assert_equal ab, by_first.find('a')
    assert_equal ab, by_last.find('b')
    assert_equal ba, by_first.find('b')
    assert_equal ba, by_last.find('a')
  end

  # :haystack_reader is accepted in place of :read.
  def test_017_understands_haystack_reader_option
    ab = ['a', 'b']
    ba = ['b', 'a']
    haystack = [ab, ba]
    by_first = FuzzyMatch.new haystack, :haystack_reader => 0
    assert_equal ab, by_first.find('a')
    assert_equal ba, by_first.find('b')
  end

  def test_018_no_result_if_best_score_is_zero
    assert_nil FuzzyMatch.new(['a']).find('b')
  end

  def test_019_must_match_at_least_one_word
    d = FuzzyMatch.new %w{ RATZ CATZ }, :must_match_at_least_one_word => true
    assert_nil d.find('RITZ')
  end

  def test_020_stop_words
    d = FuzzyMatch.new [ 'A HOTEL', 'B HTL' ]
    assert_equal 'B HTL', d.find('A HTL', :must_match_at_least_one_word => true)

    d = FuzzyMatch.new [ 'A HOTEL', 'B HTL' ], :must_match_at_least_one_word => true
    assert_equal 'B HTL', d.find('A HTL')

    d = FuzzyMatch.new [ 'A HOTEL', 'B HTL' ], :must_match_at_least_one_word => true, :stop_words => [ %r{HO?TE?L} ]
    assert_equal 'A HOTEL', d.find('A HTL')
  end

  # explain writes to $stderr, so swap in a StringIO while it runs and always
  # restore the real stream afterwards.
  def test_021_explain
    require 'stringio'
    capture = StringIO.new
    begin
      old_stderr = $stderr
      $stderr = capture
      d = FuzzyMatch.new %w{ RATZ CATZ }
      d.explain('RITZ')
    ensure
      $stderr = old_stderr
    end
    capture.rewind
    assert capture.read.include?('CATZ')
    capture.close
  end

  def test_022_compare_words_with_words
    d = FuzzyMatch.new [ 'PENINSULA HOTELS' ], :must_match_at_least_one_word => true
    assert_nil d.find('DOLCE LA HULPE BXL FI')
  end
end
|
@@ -0,0 +1,268 @@
|
|
1
|
+
require 'helper'
|
2
|
+
|
3
|
+
require 'shoulda'
|
4
|
+
|
5
|
+
$log = false
|
6
|
+
|
7
|
+
# Shoulda-style tests driving FuzzyMatch through its `improver` interface with
# de Havilland / Boeing aircraft names. Each test tweaks the instance variables
# prepared in #setup and then builds the matcher lazily through #ltd.
class TestFuzzyMatchConvoluted < Test::Unit::TestCase
  def setup
    clear_ltd

    # dh 8 400
    @a_needle = ['DE HAVILLAND CANADA DHC8400 Dash 8']
    @a_haystack = ['DEHAVILLAND DEHAVILLAND DHC8-400 DASH-8']
    # dh 88
    @b_needle = ['ABCDEFG DH88 HIJKLMNOP']
    # dh 89
    @c_haystack = ['ABCDEFG DH89 HIJKLMNOP']
    # dh 8 200
    @d_needle = ['DE HAVILLAND CANADA DHC8200 Dash 8']
    @d_haystack = ['BOMBARDIER DEHAVILLAND DHC8-200Q DASH-8']
    @d_lookalike = ['ABCD DHC8200 Dash 8']

    # Same pattern, once as a tightening row and once as an identity row.
    @t_1 = ['/(dh)c?-?(\d{0,2})-?(\d{0,4})(?:.*?)(dash|\z)/i', 'good tightening for de havilland']
    @r_1 = ['/(dh)c?-?(\d{0,2})-?(\d{0,4})(?:.*?)(dash|\z)/i', 'good identity for de havilland']

    @needle = [
      @a_needle,
      @b_needle,
      ['DE HAVILLAND DH89 Dragon Rapide'],
      ['DE HAVILLAND CANADA DHC8100 Dash 8 (E9, CT142, CC142)'],
      @d_needle,
      ['DE HAVILLAND CANADA DHC8300 Dash 8'],
      ['DE HAVILLAND DH90 Dragonfly']
    ]
    @haystack = [
      @a_haystack,
      @c_haystack,
      @d_haystack,
      ['DEHAVILLAND DEHAVILLAND DHC8-100 DASH-8'],
      ['DEHAVILLAND DEHAVILLAND TWIN OTTER DHC-6']
    ]

    # Five independent, initially empty rule lists.
    @tightenings, @identities, @blockings, @positives, @negatives = Array.new(5) { [] }
  end

  # Forget the memoized matcher so the next #ltd call rebuilds it.
  def clear_ltd
    @_ltd = nil
  end

  # Lazily builds (and memoizes) a FuzzyMatch from the current instance state.
  def ltd
    return @_ltd if @_ltd

    options = {
      :tightenings => @tightenings,
      :identities => @identities,
      :blockings => @blockings,
      :positives => @positives,
      :negatives => @negatives,
      :blocking_only => @blocking_only,
      :log => $log
    }
    @_ltd = FuzzyMatch.new(@haystack, options)
  end

  should "optionally only pay attention to things that match blockings" do
    assert_equal @a_haystack, ltd.improver.match(@a_needle)

    clear_ltd
    @blocking_only = true
    assert_nil ltd.improver.match(@a_needle)

    clear_ltd
    @blocking_only = true
    @blockings << ['/dash/i']
    assert_equal @a_haystack, ltd.improver.match(@a_needle)
  end

  # the example from the readme, considerably uglier here
  should "check a simple table" do
    @haystack = ['seamus', 'andy', 'ben']
    @positives = [['seamus', 'Mr. Seamus Abshere']]
    names = ['Mr. Seamus Abshere', 'Sr. Andy Rossmeissl', 'Master BenT']

    assert_nothing_raised { ltd.improver.check names }
  end

  should "treat a String as a full record if passed through" do
    dash = 'DHC8-400'
    b747 = 'B747200/300'
    dc9 = 'DC-9-10'
    plain_matcher = FuzzyMatch.new [dash, b747, dc9], :log => $log
    assert_equal dash, plain_matcher.improver.match('DeHavilland Dash-8 DHC-400')
    assert_equal b747, plain_matcher.improver.match('Boeing 747-300')
    assert_equal dc9, plain_matcher.improver.match('McDonnell Douglas MD81/DC-9')
  end

  should "call it a mismatch if you hit a blank positive" do
    @positives << [@a_needle.first, '']
    assert_raises(FuzzyMatch::Improver::Mismatch) { ltd.improver.match @a_needle }
  end

  should "call it a false positive if you hit a blank negative" do
    @negatives << [@a_needle.first, '']
    assert_raises(FuzzyMatch::Improver::FalsePositive) { ltd.improver.match @a_needle }
  end

  should "have a false match without blocking" do
    # @d_needle will be our victim
    @haystack << @d_lookalike
    @tightenings << @t_1

    assert_equal @d_lookalike, ltd.improver.match(@d_needle)
  end

  should "do blocking if the needle matches a block" do
    # @d_needle will be our victim
    @haystack << @d_lookalike
    @tightenings << @t_1
    @blockings << ['/(bombardier|de ?havilland)/i']

    assert_equal @d_haystack, ltd.improver.match(@d_needle)
  end

  should "treat blocks as exclusive" do
    @haystack = [@d_needle]
    @tightenings << @t_1
    @blockings << ['/(bombardier|de ?havilland)/i']

    assert_nil ltd.improver.match(@d_lookalike)
  end

  should "only use identities if they stem from the same regexp" do
    @identities << @r_1
    @identities << ['/(cessna)(?:.*?)(citation)/i']
    @identities << ['/(cessna)(?:.*?)(\d\d\d)/i']
    cessna_needle = ['CESSNA D-333 CITATION V']
    cessna_record = ['CESSNA D-333']
    @haystack << cessna_record

    assert_equal cessna_record, ltd.improver.match(cessna_needle)
  end

  should "use the best score from all of the tightenings" do
    boeing_needle = ["BOEING 737100"]
    right_record = ["BOEING BOEING 737-100/200"]
    wrong_record = ["BOEING BOEING 737-900"]
    @haystack << right_record
    @haystack << wrong_record
    @tightenings << ['/(7\d)(7|0)-?\d{1,3}\/(\d\d\d)/i']
    @tightenings << ['/(7\d)(7|0)-?(\d{1,3}|[A-Z]{0,3})/i']

    assert_equal right_record, ltd.improver.match(boeing_needle)
  end

  should "compare using prefixes if tightened key is shorter than correct match" do
    boeing_needle = ["BOEING 720"]
    right_record = ["BOEING BOEING 720-000"]
    wrong_record = ["BOEING BOEING 717-200"]
    @haystack << right_record
    @haystack << wrong_record
    @tightenings << @t_1
    @tightenings << ['/(7\d)(7|0)-?\d{1,3}\/(\d\d\d)/i']
    @tightenings << ['/(7\d)(7|0)-?(\d{1,3}|[A-Z]{0,3})/i']

    assert_equal right_record, ltd.improver.match(boeing_needle)
  end

  should "use the shortest original input" do
    superstar_needle = ['De Havilland DHC8-777 Dash-8 Superstar']
    short_record = ['DEHAVILLAND DEHAVILLAND DHC8-777 DASH-8 Superstar']
    long_record = ['DEHAVILLAND DEHAVILLAND DHC8-777 DASH-8 Superstar/Supernova']

    # The longer record goes in first.
    @haystack << long_record
    @haystack << short_record
    @tightenings << @t_1

    assert_equal short_record, ltd.improver.match(superstar_needle)
  end

  should "perform lookups needle to haystack" do
    assert_equal @a_haystack, ltd.improver.match(@a_needle)
  end

  should "succeed if there are no checks" do
    assert_nothing_raised { ltd.improver.check @needle }
  end

  should "succeed if the positive checks just work" do
    @positives << [@a_needle.first, @a_haystack.first]

    assert_nothing_raised { ltd.improver.check @needle }
  end

  should "fail if positive checks don't work" do
    @positives << [@d_needle.first, @d_haystack.first]

    assert_raises(FuzzyMatch::Improver::Mismatch) { ltd.improver.check @needle }
  end

  should "succeed if proper tightening is applied" do
    @positives << [@d_needle.first, @d_haystack.first]
    @tightenings << @t_1

    assert_nothing_raised { ltd.improver.check @needle }
  end

  should "use a Google Docs spreadsheet as a source of tightenings" do
    @positives << [@d_needle.first, @d_haystack.first]
    @tightenings = RemoteTable.new :url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=0&output=csv', :headers => false

    # sabshere 9/30/10 this shouldn't raise anything
    # but the tightenings have been changed... we should be using test-only tightenings, not production ones
    # assert_nothing_raised do
    assert_raises(FuzzyMatch::Improver::Mismatch) { ltd.improver.check @needle }
  end

  should "fail if negative checks don't work" do
    @negatives << [@b_needle.first, @c_haystack.first]

    assert_raises(FuzzyMatch::Improver::FalsePositive) { ltd.improver.check @needle }
  end

  should "do inline checking" do
    @negatives << [@b_needle.first, @c_haystack.first]

    assert_raises(FuzzyMatch::Improver::FalsePositive) { ltd.improver.match @b_needle }
  end

  should "fail if negative checks don't work, even with tightening" do
    @negatives << [@b_needle.first, @c_haystack.first]
    @tightenings << @t_1

    assert_raises(FuzzyMatch::Improver::FalsePositive) { ltd.improver.check @needle }
  end

  should "succeed if proper identity is applied" do
    @negatives << [@b_needle.first, @c_haystack.first]
    @positives << [@d_needle.first, @d_haystack.first]
    @identities << @r_1

    assert_nothing_raised { ltd.improver.check @needle }
  end
end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
require 'helper'
|
2
|
+
|
3
|
+
# Tests for FuzzyMatch::Identity: the three-valued #identical? answer
# (true / false / nil) and construction from a Regexp or from a String that
# spells out a regexp literal.
class TestIdentity < Test::Unit::TestCase
  # Both strings match and their captures agree.
  def test_001_identical
    verdict = letter_digit_identity.identical?('A1', 'A 1foobar')
    assert_equal true, verdict
  end

  # Both strings match but the captured digit differs.
  def test_002_certainly_different
    verdict = letter_digit_identity.identical?('A1', 'A 2foobar')
    assert_equal false, verdict
  end

  # Only one of the strings matches, so the identity gives no answer.
  def test_003_no_information_ie_possible_identical
    verdict = letter_digit_identity.identical?('B1', 'A 2foobar')
    assert_nil verdict
  end

  def test_004_regexp
    identity = FuzzyMatch::Identity.new %r{\A\\?/(.*)etc/mysql\$$}
    assert_equal mysql_path_regexp, identity.regexp
  end

  def test_005_regexp_from_string
    identity = FuzzyMatch::Identity.new '%r{\A\\\?/(.*)etc/mysql\$$}'
    assert_equal mysql_path_regexp, identity.regexp
  end

  def test_006_regexp_from_string_using_slash_delim
    identity = FuzzyMatch::Identity.new '/\A\\\?\/(.*)etc\/mysql\$$/'
    assert_equal mysql_path_regexp, identity.regexp
  end

  private

  # A fresh identity capturing the letter "A" and the digit that follows it.
  def letter_digit_identity
    FuzzyMatch::Identity.new %r{(A)[ ]*(\d)}
  end

  # The Regexp every construction style above is expected to produce.
  def mysql_path_regexp
    %r{\A\\?/(.*)etc/mysql\$$}
  end
end
|
@@ -0,0 +1,10 @@
|
|
1
|
+
require 'helper'
|
2
|
+
|
3
|
+
# Tests for FuzzyMatch::Tightener#apply.
class TestTightener < Test::Unit::TestCase
  # Different spellings of a Ford F-series model should tighten to the same
  # compact form.
  def test_001_apply
    tightener = FuzzyMatch::Tightener.new %r{(Ford )[ ]*(F)[\- ]*(\d\d\d)}i

    # [expected tightened form, raw input], checked in this order.
    cases = [
      ['Ford F350', 'Ford F-350'],
      ['Ford F150', 'Ford F150'],
      ['Ford F350', 'Ford F 350']
    ]

    cases.each do |tightened, raw|
      assert_equal tightened, tightener.apply(raw)
    end
  end
end
|