loose_tight_dictionary 0.0.9 → 0.0.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +2 -6
- data/VERSION +1 -1
- data/lib/loose_tight_dictionary.rb +1 -0
- data/loose_tight_dictionary.gemspec +2 -2
- data/test/test_loose_tight_dictionary.rb +172 -177
- metadata +4 -4
data/README.rdoc
CHANGED
@@ -4,13 +4,9 @@ Match things based on string similarity (using the Pair Distance algorithm) and
|
|
4
4
|
|
5
5
|
= Quickstart
|
6
6
|
|
7
|
-
>>
|
7
|
+
>> d = LooseTightDictionary.new %w(seamus andy ben)
|
8
8
|
=> [...]
|
9
|
-
>>
|
10
|
-
=> [...]
|
11
|
-
>> d = LooseTightDictionary.new right_records
|
12
|
-
=> [...]
|
13
|
-
>> puts d.left_to_right left_record
|
9
|
+
>> puts d.find 'Shamus Heaney'
|
14
10
|
=> 'seamus'
|
15
11
|
|
16
12
|
Try running the included example file:
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.0.9
|
1
|
+
0.0.10
|
data/loose_tight_dictionary.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{loose_tight_dictionary}
|
8
|
-
s.version = "0.0.9"
|
8
|
+
s.version = "0.0.10"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Seamus Abshere"]
|
12
|
-
s.date = %q{
|
12
|
+
s.date = %q{2011-03-02}
|
13
13
|
s.description = %q{Create dictionaries that link rows between two tables (left and right) using loose matching (string similarity) by default and tight matching (regexp) by request.}
|
14
14
|
s.email = %q{seamus@abshere.net}
|
15
15
|
s.extra_rdoc_files = [
|
data/test/test_loose_tight_dictionary.rb
CHANGED
@@ -65,212 +65,207 @@ class TestLooseTightDictionary < Test::Unit::TestCase
|
|
65
65
|
:tee => $tee
|
66
66
|
end
|
67
67
|
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
if ENV['OLD'] == 'true' or ENV['ALL'] == 'true'
|
72
|
-
should "optionally only pay attention to things that match blockings" do
|
73
|
-
assert_equal @a_right, ltd.left_to_right(@a_left)
|
68
|
+
should "optionally only pay attention to things that match blockings" do
|
69
|
+
assert_equal @a_right, ltd.left_to_right(@a_left)
|
74
70
|
|
75
|
-
|
76
|
-
|
77
|
-
|
71
|
+
clear_ltd
|
72
|
+
@blocking_only = true
|
73
|
+
assert_equal nil, ltd.left_to_right(@a_left)
|
78
74
|
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
end
|
94
|
-
end
|
95
|
-
|
96
|
-
should "treat a String as a full record if passed through" do
|
97
|
-
dash = 'DHC8-400'
|
98
|
-
b747 = 'B747200/300'
|
99
|
-
dc9 = 'DC-9-10'
|
100
|
-
right_records = [ dash, b747, dc9 ]
|
101
|
-
simple_ltd = LooseTightDictionary.new right_records, :logger => $logger, :tee => $tee
|
102
|
-
assert_equal dash, simple_ltd.left_to_right('DeHavilland Dash-8 DHC-400')
|
103
|
-
assert_equal b747, simple_ltd.left_to_right('Boeing 747-300')
|
104
|
-
assert_equal dc9, simple_ltd.left_to_right('McDonnell Douglas MD81/DC-9')
|
75
|
+
clear_ltd
|
76
|
+
@blocking_only = true
|
77
|
+
@blockings.push ['/dash/i']
|
78
|
+
assert_equal @a_right, ltd.left_to_right(@a_left)
|
79
|
+
end
|
80
|
+
|
81
|
+
# the example from the readme, considerably uglier here
|
82
|
+
should "check a simple table" do
|
83
|
+
@right = [ 'seamus', 'andy', 'ben' ]
|
84
|
+
@positives = [ [ 'seamus', 'Mr. Seamus Abshere' ] ]
|
85
|
+
left = [ 'Mr. Seamus Abshere', 'Sr. Andy Rossmeissl', 'Master BenT' ]
|
86
|
+
|
87
|
+
assert_nothing_raised do
|
88
|
+
ltd.check left
|
105
89
|
end
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
90
|
+
end
|
91
|
+
|
92
|
+
should "treat a String as a full record if passed through" do
|
93
|
+
dash = 'DHC8-400'
|
94
|
+
b747 = 'B747200/300'
|
95
|
+
dc9 = 'DC-9-10'
|
96
|
+
right_records = [ dash, b747, dc9 ]
|
97
|
+
simple_ltd = LooseTightDictionary.new right_records, :logger => $logger, :tee => $tee
|
98
|
+
assert_equal dash, simple_ltd.left_to_right('DeHavilland Dash-8 DHC-400')
|
99
|
+
assert_equal b747, simple_ltd.left_to_right('Boeing 747-300')
|
100
|
+
assert_equal dc9, simple_ltd.find('McDonnell Douglas MD81/DC-9')
|
101
|
+
end
|
102
|
+
|
103
|
+
should "call it a mismatch if you hit a blank positive" do
|
104
|
+
@positives.push [@a_left[0], '']
|
105
|
+
assert_raises(LooseTightDictionary::Mismatch) do
|
106
|
+
ltd.left_to_right @a_left
|
112
107
|
end
|
108
|
+
end
|
113
109
|
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
end
|
119
|
-
end
|
120
|
-
|
121
|
-
should "have a false match without blocking" do
|
122
|
-
# @d_left will be our victim
|
123
|
-
@right.push @d_lookalike
|
124
|
-
@tightenings.push @t_1
|
125
|
-
|
126
|
-
assert_equal @d_lookalike, ltd.left_to_right(@d_left)
|
110
|
+
should "call it a false positive if you hit a blank negative" do
|
111
|
+
@negatives.push [@a_left[0], '']
|
112
|
+
assert_raises(LooseTightDictionary::FalsePositive) do
|
113
|
+
ltd.left_to_right @a_left
|
127
114
|
end
|
115
|
+
end
|
116
|
+
|
117
|
+
should "have a false match without blocking" do
|
118
|
+
# @d_left will be our victim
|
119
|
+
@right.push @d_lookalike
|
120
|
+
@tightenings.push @t_1
|
128
121
|
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
122
|
+
assert_equal @d_lookalike, ltd.left_to_right(@d_left)
|
123
|
+
end
|
124
|
+
|
125
|
+
should "do blocking if the left matches a block" do
|
126
|
+
# @d_left will be our victim
|
127
|
+
@right.push @d_lookalike
|
128
|
+
@tightenings.push @t_1
|
129
|
+
@blockings.push ['/(bombardier|de ?havilland)/i']
|
137
130
|
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
131
|
+
assert_equal @d_right, ltd.left_to_right(@d_left)
|
132
|
+
end
|
133
|
+
|
134
|
+
should "treat blocks as exclusive" do
|
135
|
+
@right = [ @d_left ]
|
136
|
+
@tightenings.push @t_1
|
137
|
+
@blockings.push ['/(bombardier|de ?havilland)/i']
|
142
138
|
|
143
|
-
|
144
|
-
|
139
|
+
assert_equal nil, ltd.left_to_right(@d_lookalike)
|
140
|
+
end
|
141
|
+
|
142
|
+
should "only use identities if they stem from the same regexp" do
|
143
|
+
@identities.push @r_1
|
144
|
+
@identities.push [ '/(cessna)(?:.*?)(citation)/i' ]
|
145
|
+
@identities.push [ '/(cessna)(?:.*?)(\d\d\d)/i' ]
|
146
|
+
x_left = [ 'CESSNA D-333 CITATION V']
|
147
|
+
x_right = [ 'CESSNA D-333' ]
|
148
|
+
@right.push x_right
|
145
149
|
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
150
|
+
assert_equal x_right, ltd.left_to_right(x_left)
|
151
|
+
end
|
152
|
+
|
153
|
+
should "use the best score from all of the tightenings" do
|
154
|
+
x_left = ["BOEING 737100"]
|
155
|
+
x_right = ["BOEING BOEING 737-100/200"]
|
156
|
+
x_right_wrong = ["BOEING BOEING 737-900"]
|
157
|
+
@right.push x_right
|
158
|
+
@right.push x_right_wrong
|
159
|
+
@tightenings.push ['/(7\d)(7|0)-?\d{1,3}\/(\d\d\d)/i']
|
160
|
+
@tightenings.push ['/(7\d)(7|0)-?(\d{1,3}|[A-Z]{0,3})/i']
|
156
161
|
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
162
|
+
assert_equal x_right, ltd.left_to_right(x_left)
|
163
|
+
end
|
164
|
+
|
165
|
+
should "compare using prefixes if tightened key is shorter than correct match" do
|
166
|
+
x_left = ["BOEING 720"]
|
167
|
+
x_right = ["BOEING BOEING 720-000"]
|
168
|
+
x_right_wrong = ["BOEING BOEING 717-200"]
|
169
|
+
@right.push x_right
|
170
|
+
@right.push x_right_wrong
|
171
|
+
@tightenings.push @t_1
|
172
|
+
@tightenings.push ['/(7\d)(7|0)-?\d{1,3}\/(\d\d\d)/i']
|
173
|
+
@tightenings.push ['/(7\d)(7|0)-?(\d{1,3}|[A-Z]{0,3})/i']
|
168
174
|
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
@tightenings.push ['/(7\d)(7|0)-?\d{1,3}\/(\d\d\d)/i']
|
177
|
-
@tightenings.push ['/(7\d)(7|0)-?(\d{1,3}|[A-Z]{0,3})/i']
|
178
|
-
|
179
|
-
assert_equal x_right, ltd.left_to_right(x_left)
|
180
|
-
end
|
175
|
+
assert_equal x_right, ltd.left_to_right(x_left)
|
176
|
+
end
|
177
|
+
|
178
|
+
should "use the shortest original input" do
|
179
|
+
x_left = ['De Havilland DHC8-777 Dash-8 Superstar']
|
180
|
+
x_right = ['DEHAVILLAND DEHAVILLAND DHC8-777 DASH-8 Superstar']
|
181
|
+
x_right_long = ['DEHAVILLAND DEHAVILLAND DHC8-777 DASH-8 Superstar/Supernova']
|
181
182
|
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
x_right_long = ['DEHAVILLAND DEHAVILLAND DHC8-777 DASH-8 Superstar/Supernova']
|
186
|
-
|
187
|
-
@right.push x_right_long
|
188
|
-
@right.push x_right
|
189
|
-
@tightenings.push @t_1
|
190
|
-
|
191
|
-
assert_equal x_right, ltd.left_to_right(x_left)
|
192
|
-
end
|
183
|
+
@right.push x_right_long
|
184
|
+
@right.push x_right
|
185
|
+
@tightenings.push @t_1
|
193
186
|
|
194
|
-
|
195
|
-
|
187
|
+
assert_equal x_right, ltd.left_to_right(x_left)
|
188
|
+
end
|
189
|
+
|
190
|
+
should "perform lookups left to right" do
|
191
|
+
assert_equal @a_right, ltd.left_to_right(@a_left)
|
192
|
+
end
|
193
|
+
|
194
|
+
should "succeed if there are no checks" do
|
195
|
+
assert_nothing_raised do
|
196
|
+
ltd.check @left
|
196
197
|
end
|
198
|
+
end
|
199
|
+
|
200
|
+
should "succeed if the positive checks just work" do
|
201
|
+
@positives.push [ @a_left[0], @a_right[0] ]
|
197
202
|
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
203
|
+
assert_nothing_raised do
|
204
|
+
ltd.check @left
|
205
|
+
end
|
206
|
+
end
|
207
|
+
|
208
|
+
should "fail if positive checks don't work" do
|
209
|
+
@positives.push [ @d_left[0], @d_right[0] ]
|
210
|
+
|
211
|
+
assert_raises(LooseTightDictionary::Mismatch) do
|
212
|
+
ltd.check @left
|
213
|
+
end
|
214
|
+
end
|
215
|
+
|
216
|
+
should "succeed if proper tightening is applied" do
|
217
|
+
@positives.push [ @d_left[0], @d_right[0] ]
|
218
|
+
@tightenings.push @t_1
|
219
|
+
|
220
|
+
assert_nothing_raised do
|
221
|
+
ltd.check @left
|
202
222
|
end
|
223
|
+
end
|
224
|
+
|
225
|
+
should "use a Google Docs spreadsheet as a source of tightenings" do
|
226
|
+
@positives.push [ @d_left[0], @d_right[0] ]
|
227
|
+
@tightenings = RemoteTable.new :url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=0&output=csv', :headers => false
|
203
228
|
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
end
|
229
|
+
# sabshere 9/30/10 this shouldn't raise anything
|
230
|
+
# but the tightenings have been changed... we should be using test-only tightenings, not production ones
|
231
|
+
# assert_nothing_raised do
|
232
|
+
assert_raises(LooseTightDictionary::Mismatch) do
|
233
|
+
ltd.check @left
|
210
234
|
end
|
235
|
+
end
|
211
236
|
|
212
|
-
|
213
|
-
|
237
|
+
should "fail if negative checks don't work" do
|
238
|
+
@negatives.push [ @b_left[0], @c_right[0] ]
|
214
239
|
|
215
|
-
|
216
|
-
|
217
|
-
end
|
240
|
+
assert_raises(LooseTightDictionary::FalsePositive) do
|
241
|
+
ltd.check @left
|
218
242
|
end
|
243
|
+
end
|
219
244
|
|
220
|
-
|
221
|
-
|
222
|
-
@tightenings.push @t_1
|
245
|
+
should "do inline checking" do
|
246
|
+
@negatives.push [ @b_left[0], @c_right[0] ]
|
223
247
|
|
224
|
-
|
225
|
-
|
226
|
-
end
|
248
|
+
assert_raises(LooseTightDictionary::FalsePositive) do
|
249
|
+
ltd.left_to_right @b_left
|
227
250
|
end
|
251
|
+
end
|
228
252
|
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
# sabshere 9/30/10 this shouldn't raise anything
|
234
|
-
# but the tightenings have been changed... we should be using test-only tightenings, not production ones
|
235
|
-
# assert_nothing_raised do
|
236
|
-
assert_raises(LooseTightDictionary::Mismatch) do
|
237
|
-
ltd.check @left
|
238
|
-
end
|
239
|
-
end
|
240
|
-
|
241
|
-
should "fail if negative checks don't work" do
|
242
|
-
@negatives.push [ @b_left[0], @c_right[0] ]
|
243
|
-
|
244
|
-
assert_raises(LooseTightDictionary::FalsePositive) do
|
245
|
-
ltd.check @left
|
246
|
-
end
|
247
|
-
end
|
248
|
-
|
249
|
-
should "do inline checking" do
|
250
|
-
@negatives.push [ @b_left[0], @c_right[0] ]
|
251
|
-
|
252
|
-
assert_raises(LooseTightDictionary::FalsePositive) do
|
253
|
-
ltd.left_to_right @b_left
|
254
|
-
end
|
255
|
-
end
|
253
|
+
should "fail if negative checks don't work, even with tightening" do
|
254
|
+
@negatives.push [ @b_left[0], @c_right[0] ]
|
255
|
+
@tightenings.push @t_1
|
256
256
|
|
257
|
-
|
258
|
-
|
259
|
-
@tightenings.push @t_1
|
260
|
-
|
261
|
-
assert_raises(LooseTightDictionary::FalsePositive) do
|
262
|
-
ltd.check @left
|
263
|
-
end
|
257
|
+
assert_raises(LooseTightDictionary::FalsePositive) do
|
258
|
+
ltd.check @left
|
264
259
|
end
|
260
|
+
end
|
261
|
+
|
262
|
+
should "succeed if proper identity is applied" do
|
263
|
+
@negatives.push [ @b_left[0], @c_right[0] ]
|
264
|
+
@positives.push [ @d_left[0], @d_right[0] ]
|
265
|
+
@identities.push @r_1
|
265
266
|
|
266
|
-
|
267
|
-
|
268
|
-
@positives.push [ @d_left[0], @d_right[0] ]
|
269
|
-
@identities.push @r_1
|
270
|
-
|
271
|
-
assert_nothing_raised do
|
272
|
-
ltd.check @left
|
273
|
-
end
|
267
|
+
assert_nothing_raised do
|
268
|
+
ltd.check @left
|
274
269
|
end
|
275
270
|
end
|
276
271
|
end
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: loose_tight_dictionary
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 11
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 0
|
9
|
-
- 9
|
10
|
-
version: 0.0.9
|
9
|
+
- 10
|
10
|
+
version: 0.0.10
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Seamus Abshere
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date:
|
18
|
+
date: 2011-03-02 00:00:00 -06:00
|
19
19
|
default_executable:
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|