loose_tight_dictionary 0.0.10 → 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (42) hide show
  1. data/.gitignore +1 -0
  2. data/Gemfile +4 -0
  3. data/README.rdoc +76 -23
  4. data/Rakefile +2 -38
  5. data/benchmark/before-with-free.txt +283 -0
  6. data/benchmark/before-without-last-result.txt +257 -0
  7. data/benchmark/before.txt +304 -0
  8. data/benchmark/memory.rb +54 -0
  9. data/examples/bts_aircraft/5-2-A.htm +10305 -0
  10. data/examples/bts_aircraft/5-2-B.htm +9576 -0
  11. data/examples/bts_aircraft/5-2-D.htm +7094 -0
  12. data/examples/bts_aircraft/5-2-E.htm +2349 -0
  13. data/examples/bts_aircraft/5-2-G.htm +2922 -0
  14. data/examples/bts_aircraft/blockings.csv +1 -0
  15. data/examples/bts_aircraft/identities.csv +1 -0
  16. data/examples/bts_aircraft/negatives.csv +1 -0
  17. data/examples/bts_aircraft/number_260.csv +334 -0
  18. data/examples/bts_aircraft/positives.csv +1 -0
  19. data/examples/bts_aircraft/test_bts_aircraft.rb +123 -0
  20. data/examples/bts_aircraft/tighteners.csv +1 -0
  21. data/examples/first_name_matching.rb +14 -22
  22. data/lib/loose_tight_dictionary/blocking.rb +36 -0
  23. data/lib/loose_tight_dictionary/extract_regexp.rb +30 -0
  24. data/lib/loose_tight_dictionary/identity.rb +25 -0
  25. data/lib/loose_tight_dictionary/result.rb +23 -0
  26. data/lib/loose_tight_dictionary/score.rb +28 -0
  27. data/lib/loose_tight_dictionary/similarity.rb +62 -0
  28. data/lib/loose_tight_dictionary/tightener.rb +30 -0
  29. data/lib/loose_tight_dictionary/version.rb +3 -0
  30. data/lib/loose_tight_dictionary/wrapper.rb +37 -0
  31. data/lib/loose_tight_dictionary.rb +178 -305
  32. data/loose_tight_dictionary.gemspec +19 -64
  33. data/test/helper.rb +6 -6
  34. data/test/test_blocking.rb +23 -0
  35. data/test/test_extract_regexp.rb +18 -0
  36. data/test/test_identity.rb +18 -0
  37. data/test/test_loose_tight_dictionary.rb +52 -245
  38. data/test/test_loose_tight_dictionary_convoluted.rb.disabled +268 -0
  39. data/test/test_tightening.rb +10 -0
  40. metadata +52 -65
  41. data/VERSION +0 -1
  42. data/examples/icao-bts.rb +0 -58
@@ -0,0 +1,268 @@
1
+ require 'helper'
2
+
3
+ require 'shoulda'
4
+
5
+ $log = false
6
+
7
+ class TestLooseTightDictionaryConvoluted < Test::Unit::TestCase
8
+ def setup
9
+ clear_ltd
10
+
11
+ # dh 8 400
12
+ @a_needle = ['DE HAVILLAND CANADA DHC8400 Dash 8']
13
+ @a_haystack = ['DEHAVILLAND DEHAVILLAND DHC8-400 DASH-8']
14
+ # dh 88
15
+ @b_needle = ['ABCDEFG DH88 HIJKLMNOP']
16
+ # dh 89
17
+ @c_haystack = ['ABCDEFG DH89 HIJKLMNOP']
18
+ # dh 8 200
19
+ @d_needle = ['DE HAVILLAND CANADA DHC8200 Dash 8']
20
+ @d_haystack = ['BOMBARDIER DEHAVILLAND DHC8-200Q DASH-8']
21
+ @d_lookalike = ['ABCD DHC8200 Dash 8']
22
+
23
+ @t_1 = [ '/(dh)c?-?(\d{0,2})-?(\d{0,4})(?:.*?)(dash|\z)/i', 'good tightening for de havilland' ]
24
+
25
+ @r_1 = [ '/(dh)c?-?(\d{0,2})-?(\d{0,4})(?:.*?)(dash|\z)/i', 'good identity for de havilland' ]
26
+
27
+ @needle = [
28
+ @a_needle,
29
+ @b_needle,
30
+ ['DE HAVILLAND DH89 Dragon Rapide'],
31
+ ['DE HAVILLAND CANADA DHC8100 Dash 8 (E9, CT142, CC142)'],
32
+ @d_needle,
33
+ ['DE HAVILLAND CANADA DHC8300 Dash 8'],
34
+ ['DE HAVILLAND DH90 Dragonfly']
35
+ ]
36
+ @haystack = [
37
+ @a_haystack,
38
+ @c_haystack,
39
+ @d_haystack,
40
+ ['DEHAVILLAND DEHAVILLAND DHC8-100 DASH-8'],
41
+ ['DEHAVILLAND DEHAVILLAND TWIN OTTER DHC-6']
42
+ ]
43
+ @tightenings = []
44
+ @identities = []
45
+ @blockings = []
46
+ @positives = []
47
+ @negatives = []
48
+ end
49
+
50
+ def clear_ltd
51
+ @_ltd = nil
52
+ end
53
+
54
+ def ltd
55
+ @_ltd ||= LooseTightDictionary.new @haystack,
56
+ :tightenings => @tightenings,
57
+ :identities => @identities,
58
+ :blockings => @blockings,
59
+ :positives => @positives,
60
+ :negatives => @negatives,
61
+ :blocking_only => @blocking_only,
62
+ :log => $log
63
+ end
64
+
65
+ should "optionally only pay attention to things that match blockings" do
66
+ assert_equal @a_haystack, ltd.improver.match(@a_needle)
67
+
68
+ clear_ltd
69
+ @blocking_only = true
70
+ assert_equal nil, ltd.improver.match(@a_needle)
71
+
72
+ clear_ltd
73
+ @blocking_only = true
74
+ @blockings.push ['/dash/i']
75
+ assert_equal @a_haystack, ltd.improver.match(@a_needle)
76
+ end
77
+
78
+ # the example from the readme, considerably uglier here
79
+ should "check a simple table" do
80
+ @haystack = [ 'seamus', 'andy', 'ben' ]
81
+ @positives = [ [ 'seamus', 'Mr. Seamus Abshere' ] ]
82
+ needle = [ 'Mr. Seamus Abshere', 'Sr. Andy Rossmeissl', 'Master BenT' ]
83
+
84
+ assert_nothing_raised do
85
+ ltd.improver.check needle
86
+ end
87
+ end
88
+
89
+ should "treat a String as a full record if passed through" do
90
+ dash = 'DHC8-400'
91
+ b747 = 'B747200/300'
92
+ dc9 = 'DC-9-10'
93
+ haystack_records = [ dash, b747, dc9 ]
94
+ simple_ltd = LooseTightDictionary.new haystack_records, :log => $log
95
+ assert_equal dash, simple_ltd.improver.match('DeHavilland Dash-8 DHC-400')
96
+ assert_equal b747, simple_ltd.improver.match('Boeing 747-300')
97
+ assert_equal dc9, simple_ltd.improver.match('McDonnell Douglas MD81/DC-9')
98
+ end
99
+
100
+ should "call it a mismatch if you hit a blank positive" do
101
+ @positives.push [@a_needle[0], '']
102
+ assert_raises(LooseTightDictionary::Improver::Mismatch) do
103
+ ltd.improver.match @a_needle
104
+ end
105
+ end
106
+
107
+ should "call it a false positive if you hit a blank negative" do
108
+ @negatives.push [@a_needle[0], '']
109
+ assert_raises(LooseTightDictionary::Improver::FalsePositive) do
110
+ ltd.improver.match @a_needle
111
+ end
112
+ end
113
+
114
+ should "have a false match without blocking" do
115
+ # @d_needle will be our victim
116
+ @haystack.push @d_lookalike
117
+ @tightenings.push @t_1
118
+
119
+ assert_equal @d_lookalike, ltd.improver.match(@d_needle)
120
+ end
121
+
122
+ should "do blocking if the needle matches a block" do
123
+ # @d_needle will be our victim
124
+ @haystack.push @d_lookalike
125
+ @tightenings.push @t_1
126
+ @blockings.push ['/(bombardier|de ?havilland)/i']
127
+
128
+ assert_equal @d_haystack, ltd.improver.match(@d_needle)
129
+ end
130
+
131
+ should "treat blocks as exclusive" do
132
+ @haystack = [ @d_needle ]
133
+ @tightenings.push @t_1
134
+ @blockings.push ['/(bombardier|de ?havilland)/i']
135
+
136
+ assert_equal nil, ltd.improver.match(@d_lookalike)
137
+ end
138
+
139
+ should "only use identities if they stem from the same regexp" do
140
+ @identities.push @r_1
141
+ @identities.push [ '/(cessna)(?:.*?)(citation)/i' ]
142
+ @identities.push [ '/(cessna)(?:.*?)(\d\d\d)/i' ]
143
+ x_needle = [ 'CESSNA D-333 CITATION V']
144
+ x_haystack = [ 'CESSNA D-333' ]
145
+ @haystack.push x_haystack
146
+
147
+ assert_equal x_haystack, ltd.improver.match(x_needle)
148
+ end
149
+
150
+ should "use the best score from all of the tightenings" do
151
+ x_needle = ["BOEING 737100"]
152
+ x_haystack = ["BOEING BOEING 737-100/200"]
153
+ x_haystack_wrong = ["BOEING BOEING 737-900"]
154
+ @haystack.push x_haystack
155
+ @haystack.push x_haystack_wrong
156
+ @tightenings.push ['/(7\d)(7|0)-?\d{1,3}\/(\d\d\d)/i']
157
+ @tightenings.push ['/(7\d)(7|0)-?(\d{1,3}|[A-Z]{0,3})/i']
158
+
159
+ assert_equal x_haystack, ltd.improver.match(x_needle)
160
+ end
161
+
162
+ should "compare using prefixes if tightened key is shorter than correct match" do
163
+ x_needle = ["BOEING 720"]
164
+ x_haystack = ["BOEING BOEING 720-000"]
165
+ x_haystack_wrong = ["BOEING BOEING 717-200"]
166
+ @haystack.push x_haystack
167
+ @haystack.push x_haystack_wrong
168
+ @tightenings.push @t_1
169
+ @tightenings.push ['/(7\d)(7|0)-?\d{1,3}\/(\d\d\d)/i']
170
+ @tightenings.push ['/(7\d)(7|0)-?(\d{1,3}|[A-Z]{0,3})/i']
171
+
172
+ assert_equal x_haystack, ltd.improver.match(x_needle)
173
+ end
174
+
175
+ should "use the shortest original input" do
176
+ x_needle = ['De Havilland DHC8-777 Dash-8 Superstar']
177
+ x_haystack = ['DEHAVILLAND DEHAVILLAND DHC8-777 DASH-8 Superstar']
178
+ x_haystack_long = ['DEHAVILLAND DEHAVILLAND DHC8-777 DASH-8 Superstar/Supernova']
179
+
180
+ @haystack.push x_haystack_long
181
+ @haystack.push x_haystack
182
+ @tightenings.push @t_1
183
+
184
+ assert_equal x_haystack, ltd.improver.match(x_needle)
185
+ end
186
+
187
+ should "perform lookups needle to haystack" do
188
+ assert_equal @a_haystack, ltd.improver.match(@a_needle)
189
+ end
190
+
191
+ should "succeed if there are no checks" do
192
+ assert_nothing_raised do
193
+ ltd.improver.check @needle
194
+ end
195
+ end
196
+
197
+ should "succeed if the positive checks just work" do
198
+ @positives.push [ @a_needle[0], @a_haystack[0] ]
199
+
200
+ assert_nothing_raised do
201
+ ltd.improver.check @needle
202
+ end
203
+ end
204
+
205
+ should "fail if positive checks don't work" do
206
+ @positives.push [ @d_needle[0], @d_haystack[0] ]
207
+
208
+ assert_raises(LooseTightDictionary::Improver::Mismatch) do
209
+ ltd.improver.check @needle
210
+ end
211
+ end
212
+
213
+ should "succeed if proper tightening is applied" do
214
+ @positives.push [ @d_needle[0], @d_haystack[0] ]
215
+ @tightenings.push @t_1
216
+
217
+ assert_nothing_raised do
218
+ ltd.improver.check @needle
219
+ end
220
+ end
221
+
222
+ should "use a Google Docs spreadsheet as a source of tightenings" do
223
+ @positives.push [ @d_needle[0], @d_haystack[0] ]
224
+ @tightenings = RemoteTable.new :url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=0&output=csv', :headers => false
225
+
226
+ # sabshere 9/30/10 this shouldn't raise anything
227
+ # but the tightenings have been changed... we should be using test-only tightenings, not production ones
228
+ # assert_nothing_raised do
229
+ assert_raises(LooseTightDictionary::Improver::Mismatch) do
230
+ ltd.improver.check @needle
231
+ end
232
+ end
233
+
234
+ should "fail if negative checks don't work" do
235
+ @negatives.push [ @b_needle[0], @c_haystack[0] ]
236
+
237
+ assert_raises(LooseTightDictionary::Improver::FalsePositive) do
238
+ ltd.improver.check @needle
239
+ end
240
+ end
241
+
242
+ should "do inline checking" do
243
+ @negatives.push [ @b_needle[0], @c_haystack[0] ]
244
+
245
+ assert_raises(LooseTightDictionary::Improver::FalsePositive) do
246
+ ltd.improver.match @b_needle
247
+ end
248
+ end
249
+
250
+ should "fail if negative checks don't work, even with tightening" do
251
+ @negatives.push [ @b_needle[0], @c_haystack[0] ]
252
+ @tightenings.push @t_1
253
+
254
+ assert_raises(LooseTightDictionary::Improver::FalsePositive) do
255
+ ltd.improver.check @needle
256
+ end
257
+ end
258
+
259
+ should "succeed if proper identity is applied" do
260
+ @negatives.push [ @b_needle[0], @c_haystack[0] ]
261
+ @positives.push [ @d_needle[0], @d_haystack[0] ]
262
+ @identities.push @r_1
263
+
264
+ assert_nothing_raised do
265
+ ltd.improver.check @needle
266
+ end
267
+ end
268
+ end
@@ -0,0 +1,10 @@
1
+ require 'helper'
2
+
3
+ class TestTightener < Test::Unit::TestCase
4
+ def test_001_apply
5
+ t = LooseTightDictionary::Tightener.new %r{(Ford )[ ]*(F)[\- ]*(\d\d\d)}i
6
+ assert_equal 'Ford F350', t.apply('Ford F-350')
7
+ assert_equal 'Ford F150', t.apply('Ford F150')
8
+ assert_equal 'Ford F350', t.apply('Ford F 350')
9
+ end
10
+ end
metadata CHANGED
@@ -1,13 +1,8 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: loose_tight_dictionary
3
3
  version: !ruby/object:Gem::Version
4
- hash: 11
5
- prerelease: false
6
- segments:
7
- - 0
8
- - 0
9
- - 10
10
- version: 0.0.10
4
+ prerelease:
5
+ version: 0.1.0
11
6
  platform: ruby
12
7
  authors:
13
8
  - Seamus Abshere
@@ -15,7 +10,7 @@ autorequire:
15
10
  bindir: bin
16
11
  cert_chain: []
17
12
 
18
- date: 2011-03-02 00:00:00 -06:00
13
+ date: 2011-04-18 00:00:00 -05:00
19
14
  default_executable:
20
15
  dependencies:
21
16
  - !ruby/object:Gem::Dependency
@@ -26,9 +21,6 @@ dependencies:
26
21
  requirements:
27
22
  - - ">="
28
23
  - !ruby/object:Gem::Version
29
- hash: 3
30
- segments:
31
- - 0
32
24
  version: "0"
33
25
  type: :development
34
26
  version_requirements: *id001
@@ -40,12 +32,7 @@ dependencies:
40
32
  requirements:
41
33
  - - ">="
42
34
  - !ruby/object:Gem::Version
43
- hash: 49
44
- segments:
45
- - 0
46
- - 2
47
- - 19
48
- version: 0.2.19
35
+ version: "0"
49
36
  type: :development
50
37
  version_requirements: *id002
51
38
  - !ruby/object:Gem::Dependency
@@ -56,76 +43,79 @@ dependencies:
56
43
  requirements:
57
44
  - - ">="
58
45
  - !ruby/object:Gem::Version
59
- hash: 11
60
- segments:
61
- - 2
62
- - 3
63
- - 4
64
46
  version: 2.3.4
65
47
  type: :runtime
66
48
  version_requirements: *id003
67
49
  - !ruby/object:Gem::Dependency
68
- name: andand
50
+ name: amatch
69
51
  prerelease: false
70
52
  requirement: &id004 !ruby/object:Gem::Requirement
71
53
  none: false
72
54
  requirements:
73
55
  - - ">="
74
56
  - !ruby/object:Gem::Version
75
- hash: 25
76
- segments:
77
- - 1
78
- - 3
79
- - 1
80
- version: 1.3.1
57
+ version: "0"
81
58
  type: :runtime
82
59
  version_requirements: *id004
83
- - !ruby/object:Gem::Dependency
84
- name: amatch
85
- prerelease: false
86
- requirement: &id005 !ruby/object:Gem::Requirement
87
- none: false
88
- requirements:
89
- - - ">="
90
- - !ruby/object:Gem::Version
91
- hash: 29
92
- segments:
93
- - 0
94
- - 2
95
- - 5
96
- version: 0.2.5
97
- type: :runtime
98
- version_requirements: *id005
99
- description: Create dictionaries that link rows between two tables (left and right) using loose matching (string similarity) by default and tight matching (regexp) by request.
100
- email: seamus@abshere.net
60
+ description: Create dictionaries that link rows between two tables using loose matching (string similarity) by default and tight matching (regexp) by request.
61
+ email:
62
+ - seamus@abshere.net
101
63
  executables: []
102
64
 
103
65
  extensions: []
104
66
 
105
- extra_rdoc_files:
106
- - LICENSE
107
- - README.rdoc
67
+ extra_rdoc_files: []
68
+
108
69
  files:
109
70
  - .document
110
71
  - .gitignore
72
+ - Gemfile
111
73
  - LICENSE
112
74
  - README.rdoc
113
75
  - Rakefile
114
- - VERSION
76
+ - benchmark/before-with-free.txt
77
+ - benchmark/before-without-last-result.txt
78
+ - benchmark/before.txt
79
+ - benchmark/memory.rb
80
+ - examples/bts_aircraft/5-2-A.htm
81
+ - examples/bts_aircraft/5-2-B.htm
82
+ - examples/bts_aircraft/5-2-D.htm
83
+ - examples/bts_aircraft/5-2-E.htm
84
+ - examples/bts_aircraft/5-2-G.htm
85
+ - examples/bts_aircraft/blockings.csv
86
+ - examples/bts_aircraft/identities.csv
87
+ - examples/bts_aircraft/negatives.csv
88
+ - examples/bts_aircraft/number_260.csv
89
+ - examples/bts_aircraft/positives.csv
90
+ - examples/bts_aircraft/test_bts_aircraft.rb
91
+ - examples/bts_aircraft/tighteners.csv
115
92
  - examples/first_name_matching.rb
116
- - examples/icao-bts.rb
117
93
  - examples/icao-bts.xls
118
94
  - lib/loose_tight_dictionary.rb
95
+ - lib/loose_tight_dictionary/blocking.rb
96
+ - lib/loose_tight_dictionary/extract_regexp.rb
97
+ - lib/loose_tight_dictionary/identity.rb
98
+ - lib/loose_tight_dictionary/result.rb
99
+ - lib/loose_tight_dictionary/score.rb
100
+ - lib/loose_tight_dictionary/similarity.rb
101
+ - lib/loose_tight_dictionary/tightener.rb
102
+ - lib/loose_tight_dictionary/version.rb
103
+ - lib/loose_tight_dictionary/wrapper.rb
119
104
  - loose_tight_dictionary.gemspec
120
105
  - test/helper.rb
106
+ - test/test_blocking.rb
107
+ - test/test_extract_regexp.rb
108
+ - test/test_identity.rb
121
109
  - test/test_loose_tight_dictionary.rb
110
+ - test/test_loose_tight_dictionary_convoluted.rb.disabled
111
+ - test/test_tightening.rb
122
112
  has_rdoc: true
123
- homepage: http://github.com/seamusabshere/loose_tight_dictionary
113
+ homepage: https://github.com/seamusabshere/loose_tight_dictionary
124
114
  licenses: []
125
115
 
126
116
  post_install_message:
127
- rdoc_options:
128
- - --charset=UTF-8
117
+ rdoc_options: []
118
+
129
119
  require_paths:
130
120
  - lib
131
121
  required_ruby_version: !ruby/object:Gem::Requirement
@@ -133,28 +123,25 @@ required_ruby_version: !ruby/object:Gem::Requirement
133
123
  requirements:
134
124
  - - ">="
135
125
  - !ruby/object:Gem::Version
136
- hash: 3
137
- segments:
138
- - 0
139
126
  version: "0"
140
127
  required_rubygems_version: !ruby/object:Gem::Requirement
141
128
  none: false
142
129
  requirements:
143
130
  - - ">="
144
131
  - !ruby/object:Gem::Version
145
- hash: 3
146
- segments:
147
- - 0
148
132
  version: "0"
149
133
  requirements: []
150
134
 
151
- rubyforge_project:
152
- rubygems_version: 1.3.7
135
+ rubyforge_project: loose_tight_dictionary
136
+ rubygems_version: 1.6.2
153
137
  signing_key:
154
138
  specification_version: 3
155
139
  summary: Allows iterative development of dictionaries for big data sets.
156
140
  test_files:
157
141
  - test/helper.rb
142
+ - test/test_blocking.rb
143
+ - test/test_extract_regexp.rb
144
+ - test/test_identity.rb
158
145
  - test/test_loose_tight_dictionary.rb
159
- - examples/first_name_matching.rb
160
- - examples/icao-bts.rb
146
+ - test/test_loose_tight_dictionary_convoluted.rb.disabled
147
+ - test/test_tightening.rb