loose_tight_dictionary-ruby19 0.0.8

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,5 @@
1
+ README.rdoc
2
+ lib/**/*.rb
3
+ bin/*
4
+ features/**/*.feature
5
+ LICENSE
@@ -0,0 +1,21 @@
1
+ ## MAC OS
2
+ .DS_Store
3
+
4
+ ## TEXTMATE
5
+ *.tmproj
6
+ tmtags
7
+
8
+ ## EMACS
9
+ *~
10
+ \#*
11
+ .\#*
12
+
13
+ ## VIM
14
+ *.swp
15
+
16
+ ## PROJECT::GENERAL
17
+ coverage
18
+ rdoc
19
+ pkg
20
+
21
+ ## PROJECT::SPECIFIC
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2009 Seamus Abshere
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,56 @@
1
+ = loose_tight_dictionary
2
+
3
+ Match things based on string similarity (using the Pair Distance algorithm) and regular expressions.
4
+
5
+ = Quickstart
6
+
7
+ >> right_records = [ 'seamus', 'andy', 'ben' ]
8
+ => [...]
9
+ >> left_record = 'Shamus Heaney'
10
+ => [...]
11
+ >> d = LooseTightDictionary.new right_records
12
+ => [...]
13
+ >> d.left_to_right left_record
14
+ => 'seamus'
15
+
16
+ Try running the included example file:
17
+
18
+ $ ruby examples/first_name_matching.rb
19
+ Left side (input)
20
+ ====================
21
+ Mr. Seamus
22
+ Sr. Andy
23
+ Master BenT
24
+
25
+ Right side (output)
26
+ ====================
27
+ seamus
28
+ andy
29
+ ben
30
+
31
+ Results
32
+ ====================
33
+ Left record (input) Right record (output) Prefix used (if any) Score
34
+ Mr. Seamus seamus NULL 0.666666666666667
35
+ Sr. Andy andy NULL 0.5
36
+ Master BenT ben NULL 0.2
37
+
38
+ = Improving dictionaries
39
+
40
+ Similarity matching will only get you so far.
41
+
42
+ TODO: regex usage
43
+
44
+ == Note on Patches/Pull Requests
45
+
46
+ * Fork the project.
47
+ * Make your feature addition or bug fix.
48
+ * Add tests for it. This is important so I don't break it in a
49
+ future version unintentionally.
50
+ * Commit, do not mess with rakefile, version, or history.
51
+ (If you want to have your own version, that is fine, but bump the version in a commit by itself so that I can ignore it when I pull.)
52
+ * Send me a pull request. Bonus points for topic branches.
53
+
54
+ == Copyright
55
+
56
+ Copyright (c) 2010 Seamus Abshere. See LICENSE for details.
@@ -0,0 +1,58 @@
1
+ require 'rubygems'
2
+ require 'rake'
3
+
4
+ begin
5
+ require 'jeweler'
6
+ Jeweler::Tasks.new do |gem|
7
+ gem.name = "loose_tight_dictionary"
8
+ gem.summary = %Q{Allows iterative development of dictionaries for big data sets.}
9
+ gem.description = %Q{Create dictionaries that link rows between two tables (left and right) using loose matching (string similarity) by default and tight matching (regexp) by request.}
10
+ gem.email = "seamus@abshere.net"
11
+ gem.homepage = "http://github.com/seamusabshere/loose_tight_dictionary"
12
+ gem.authors = ["Seamus Abshere"]
13
+ gem.add_development_dependency "shoulda"
14
+ gem.add_development_dependency "remote_table", ">=0.2.19"
15
+ gem.add_dependency 'activesupport', '>=2.3.4'
16
+ gem.add_dependency 'fastercsv', '>=1.5.3'
17
+ gem.add_dependency 'andand', '>=1.3.1'
18
+ gem.add_dependency 'amatch', '>=0.2.5'
19
+ # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
20
+ end
21
+ Jeweler::GemcutterTasks.new
22
+ rescue LoadError
23
+ puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
24
+ end
25
+
26
+ require 'rake/testtask'
27
+ Rake::TestTask.new(:test) do |test|
28
+ test.libs << 'lib' << 'test'
29
+ test.pattern = 'test/**/test_*.rb'
30
+ test.verbose = true
31
+ end
32
+
33
+ begin
34
+ require 'rcov/rcovtask'
35
+ Rcov::RcovTask.new do |test|
36
+ test.libs << 'test'
37
+ test.pattern = 'test/**/test_*.rb'
38
+ test.verbose = true
39
+ end
40
+ rescue LoadError
41
+ task :rcov do
42
+ abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
43
+ end
44
+ end
45
+
46
+ task :test => :check_dependencies
47
+
48
+ task :default => :test
49
+
50
# RDoc task. 'rake/rdoctask' is the obsolete location of this task; RDoc ships
# the replacement as 'rdoc/task'. Prefer the replacement and only fall back to
# the old file when 'rdoc/task' cannot be loaded.
begin
  require 'rdoc/task'
  rdoc_task_class = RDoc::Task
rescue LoadError
  require 'rake/rdoctask'
  rdoc_task_class = Rake::RDocTask
end

rdoc_task_class.new do |rdoc|
  # Strip so that a trailing newline in VERSION does not end up in the title.
  version = File.exist?('VERSION') ? File.read('VERSION').strip : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "loose_tight_dictionary #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.0.8
@@ -0,0 +1,23 @@
1
+ #!/usr/bin/env ruby
2
+ require 'rubygems'
3
+ # require 'loose_tight_dictionary'
4
+ require File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib', 'loose_tight_dictionary.rb'))
5
+ right_side = [ 'seamus', 'andy', 'ben' ]
6
+ left_side = [ 'Mr. Seamus', 'Sr. Andy', 'Master BenT' ]
7
+
8
+ puts "Left side (input)"
9
+ puts "=" * 20
10
+ puts left_side
11
+ puts
12
+
13
+ puts "Right side (output)"
14
+ puts "=" * 20
15
+ puts right_side
16
+ puts
17
+
18
+ puts "Results"
19
+ puts "=" * 20
20
+ d = LooseTightDictionary.new right_side, :tee => STDOUT, :tee_format => :fixed_width
21
+ d.check left_side
22
+
23
+ puts d.left_to_right 'Shamus Heaney'
@@ -0,0 +1,58 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'rubygems'
4
+ require 'remote_table'
5
+ require 'ruby-debug'
6
+ require 'logger'
7
+ require File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib', 'loose_tight_dictionary.rb'))
8
+
9
+ $logger = Logger.new STDERR
10
+ $logger.level = Logger::DEBUG
11
+ $logger.datetime_format = "%H:%M:%S"
12
+ # $tee = File.open('tee.csv', 'w')
13
+ $tee = STDOUT
14
+
15
+ # $ltd_left = /(super|bonanza)/i
16
+ # $ltd_right = /bonanza d-35/i
17
+ # $ltd_dd_left = /bonanza/i
18
+ # $ltd_dd_right = /musk/i
19
+ # $ltd_dd_left_not = /allison/i
20
+ # $ltd_dd_print = true
21
+ # $ltd_ddd_left = /bonanza/i
22
+ # $ltd_ddd_right = /musk/i
23
+ # $ltd_ddd_left_not = /allison/i
24
+ # $ltd_ddd_print = true
25
+
26
+ @right = RemoteTable.new :url => 'http://www.bts.gov/programs/airline_information/accounting_and_reporting_directives/csv/number_260.csv',
27
+ :select => lambda { |record| record['Aircraft Type'].to_i.between?(1, 998) and record['Manufacturer'].present? }
28
+
29
+ @tightenings = RemoteTable.new :url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=0&output=csv', :headers => false
30
+
31
+ @identities = RemoteTable.new :url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=3&output=csv', :headers => false
32
+
33
+ @blockings = RemoteTable.new :url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=4&output=csv', :headers => false
34
+
35
+ @positives = RemoteTable.new :url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=1&output=csv', :headers => false
36
+
37
+ @negatives = RemoteTable.new :url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=2&output=csv', :headers => false
38
+
39
+ %w{ tightenings identities blockings }.each do |name|
40
+ $logger.info name
41
+ $logger.info "\n" + instance_variable_get("@#{name}").to_a.map { |record| record[0] }.join("\n")
42
+ $logger.info "\n"
43
+ end
44
+
45
+ ('A'..'Z').each do |letter|
46
+ # %w{ E }.each do |letter|
47
+ @left = RemoteTable.new :url => "http://www.faa.gov/air_traffic/publications/atpubs/CNT/5-2-#{letter}.htm",
48
+ :encoding => 'US-ASCII',
49
+ :row_xpath => '//table/tr[2]/td/table/tr',
50
+ :column_xpath => 'td'
51
+
52
+ d = LooseTightDictionary.new @right, :tightenings => @tightenings, :identities => @identities, :blockings => @blockings, :logger => $logger, :tee => $tee
53
+ d.left_reader = lambda { |record| record['Manufacturer'] + ' ' + record['Model'] }
54
+ d.right_reader = lambda { |record| record['Manufacturer'] + ' ' + record['Long Name'] }
55
+ d.positives = @positives
56
+ d.negatives = @negatives
57
+ d.check @left
58
+ end
Binary file
@@ -0,0 +1,346 @@
1
+ require 'active_support'
2
+ require 'active_support/version'
3
+ %w{
4
+ active_support/core_ext/string
5
+ }.each do |active_support_3_requirement|
6
+ require active_support_3_requirement
7
+ end if ActiveSupport::VERSION::MAJOR == 3
8
+ require 'amatch'
9
+ require 'andand'
10
+ if RUBY_VERSION >= "1.9"
11
+ require 'csv'
12
+ else
13
+ require 'fastercsv'
14
+ end
15
+
16
+ class LooseTightDictionary
17
+ class MissedChecks < RuntimeError; end
18
+ class Mismatch < RuntimeError; end
19
+ class FalsePositive < RuntimeError; end
20
+
21
# Pairs an original string (+str+) with one candidate form of it
# (+tightened_str+). The two are equal when no tightening was applied.
class T
  attr_reader :str, :tightened_str

  def initialize(str, tightened_str)
    @str = str
    @tightened_str = tightened_str
  end

  # True when the tightened form differs from the original string.
  def tightened?
    str != tightened_str
  end

  # Scores this option against +other+ (another T).
  #
  # Returns [prefix, score]. When both sides are tightened, only the first
  # +prefix+ characters of each tightened string are compared, +prefix+ being
  # the length of the shorter one; otherwise +prefix+ is nil and the whole
  # tightened strings are compared.
  def prefix_and_score(other)
    unless tightened? && other.tightened?
      return [ nil, tightened_str.pair_distance_similar(other.tightened_str) ]
    end

    prefix = [ tightened_str.length, other.tightened_str.length ].min
    score = tightened_str.first(prefix).pair_distance_similar(other.tightened_str.first(prefix))
    [ prefix, score ]
  end
end
42
+
43
+ include Amatch
44
+
45
+ attr_reader :right_records
46
+ attr_reader :case_sensitive
47
+
48
+ attr_accessor :logger
49
+ attr_accessor :tee
50
+ attr_accessor :tee_format
51
+ attr_accessor :positives
52
+ attr_accessor :negatives
53
+ attr_accessor :left_reader
54
+ attr_accessor :right_reader
55
+ attr_accessor :blocking_only
56
+
57
# Builds a dictionary over +right_records+.
#
# Recognised +options+ keys: :tightenings, :identities, :blockings (raw rule
# rows, default empty), :left_reader, :right_reader, :positives, :negatives,
# :logger, :tee (all default nil), :tee_format (default :fixed_width),
# :case_sensitive and :blocking_only (both default false).
def initialize(right_records, options = {})
  @right_records = right_records

  # Raw rule rows are kept as given; the generated readers compile them lazily.
  [ :tightenings, :identities, :blockings ].each do |key|
    instance_variable_set "@_raw_#{key}", (options[key] || [])
  end

  [ :left_reader, :right_reader, :positives, :negatives, :logger, :tee ].each do |key|
    instance_variable_set "@#{key}", options[key]
  end

  @tee_format = options[:tee_format] || :fixed_width
  @case_sensitive = options[:case_sensitive] || false
  @blocking_only = options[:blocking_only] || false
end
72
+
73
+ # def tightenings
74
+ # def identities
75
+ # def blockings
76
# Generates the memoized readers #tightenings, #identities and #blockings.
# Each one compiles the first column of its raw rows into Regexp objects via
# literal_regexp. Rows whose first column is blank are dropped entirely, so
# callers that send #match to every entry never receive nil.
%w{ tightenings identities blockings }.each do |name|
  module_eval %{
    def #{name}
      @#{name} ||= @_raw_#{name}.map { |row| row[0] }.reject { |raw| raw.blank? }.map { |raw| literal_regexp raw }
    end
  }
end
86
+
87
# Strict boolean view of the blocking_only setting.
def blocking_only?
  blocking_only ? true : false
end
90
+
91
# Validates one left/right pairing against the positives and negatives tables.
# Rows in both tables are [left, right]; a row applies when its first column
# equals the string read from +left_record+.
#
# Raises Mismatch when a positive row applies and the right string differs
# from the row's second column (a blank second column together with a present
# right string is also reported as a mismatch).
# Raises FalsePositive when a negative row applies and either its second
# column is blank while a right string is present, or the right string equals
# that second column.
# Returns nil otherwise, including when neither table is present.
def inline_check(left_record, right_record)
  return if !positives.present? && !negatives.present?

  left = read_left left_record
  right = read_right right_record

  positive_record = positives && positives.detect { |record| record[0] == left }
  if positive_record
    correct_right = positive_record[1]
    if correct_right.blank? && right.present?
      logger.debug " Mismatch! (should match SOMETHING)" if logger
      raise Mismatch
    end
    if right != correct_right
      logger.debug " Mismatch! (should be #{correct_right})" if logger
      raise Mismatch
    end
  end

  negative_record = negatives && negatives.detect { |record| record[0] == left }
  if negative_record
    incorrect_right = negative_record[1]
    if incorrect_right.blank? && right.present?
      logger.debug " False positive! (should NOT match ANYTHING)" if logger
      raise FalsePositive
    end
    if right == incorrect_right
      logger.debug " False positive! (should NOT be #{incorrect_right})" if logger
      raise FalsePositive
    end
  end

  nil
end
119
+
120
# Runs left_to_right over every record in +left_records+, echoing a header and
# then one result row per lookup to +tee+ (when a tee is set), formatted
# according to +tee_format+ (:csv or :fixed_width; other values write nothing).
#
# The row comes from $ltd_1, which left_to_right fills in. It is written from
# an ensure clause so the row still appears when the lookup raises; the
# exception then propagates to the caller. A lookup that produced no row
# ($ltd_1 is nil) writes nothing, in either format.
#
# Returns whatever left_records.each returns.
def check(left_records)
  header = [ 'Left record (input)', 'Right record (output)', 'Prefix used (if any)', 'Score' ]

  write_row = lambda do |row|
    # Guarding here keeps the ensure clause below from raising and masking
    # the exception that is actually in flight.
    next if row.nil? || tee.nil?
    case tee_format
    when :csv
      tee.puts row.flatten.to_csv
    when :fixed_width
      tee.puts row.map { |i| i.to_s.ljust(30) }.join
    end
  end

  write_row.call header

  left_records.each do |left_record|
    begin
      left_to_right left_record
    ensure
      write_row.call $ltd_1
    end
  end
end
142
+
143
# Finds the right record that best matches +left_record+.
#
# Candidates are filtered by the blocking rules and then ranked: a candidate
# whose identities collide with the left record's ranks below one that does
# not; otherwise the higher similarity score wins, then the larger prefix,
# then the shorter right-hand string.
#
# Returns the winning right record. Returns nil when blocking_only? is set and
# no blocking rule matches the left string, or when the winner's identities
# collide with the left record's. Exceptions raised by inline_check for the
# chosen pair propagate.
#
# Side effects: sets $ltd_1 to the row that #check echoes for this lookup (nil
# when none was recorded) and $ltd_0 to the accepted record (nil on collision).
# The $ltd_* pattern globals are debugging hooks; the debugger is only entered
# when a `debugger` method is actually available.
def left_to_right(left_record)
  # Clear the previous lookup's row so an early return cannot leave a stale
  # row behind for #check to print.
  $ltd_1 = nil

  left = read_left left_record
  blocking_left = blocking left
  return if blocking_only? && blocking_left.nil?

  i_options_left = i_options left
  t_options_left = t_options left
  history = {}

  candidates = right_records.select do |candidate|
    right = read_right candidate
    blocking_right = blocking right
    (!blocking_left && !blocking_right) ||
      (blocking_right && blocking_right.match(left)) ||
      (blocking_left && blocking_left.match(right))
  end

  right_record = candidates.max do |a_record, b_record|
    a = read_right a_record
    b = read_right b_record
    collision_a = collision? i_options_left, i_options(a)
    collision_b = collision? i_options_left, i_options(b)
    if collision_a && collision_b
      # neither would ever work, so randomly rank one over the other
      rand(2) == 1 ? -1 : 1
    elsif collision_a
      -1
    elsif collision_b
      1
    else
      t_left_a, t_right_a = optimize t_options_left, t_options(a)
      t_left_b, t_right_b = optimize t_options_left, t_options(b)
      a_prefix, a_score = t_left_a.prefix_and_score t_right_a
      b_prefix, b_score = t_left_b.prefix_and_score t_right_b
      history[a_record] = [t_left_a.tightened_str, t_right_a.tightened_str, a_prefix ? a_prefix : 'NULL', a_score]
      history[b_record] = [t_left_b.tightened_str, t_right_b.tightened_str, b_prefix ? b_prefix : 'NULL', b_score]

      yep_dd = ($ltd_dd_right && $ltd_dd_left &&
                [t_left_a, t_left_b].any? { |f| f.str =~ $ltd_dd_left } &&
                [t_right_a, t_right_b].any? { |f| f.str =~ $ltd_dd_right } &&
                (!$ltd_dd_left_not || [t_left_a, t_left_b].none? { |f| f.str =~ $ltd_dd_left_not }))

      if $ltd_dd_print && yep_dd && logger
        logger.debug t_left_a.inspect
        logger.debug t_right_a.inspect
        logger.debug t_left_b.inspect
        logger.debug t_right_b.inspect
        logger.debug
      end

      debugger if yep_dd && respond_to?(:debugger, true)

      if a_score != b_score
        a_score <=> b_score
      elsif a_prefix && b_prefix && a_prefix != b_prefix
        a_prefix <=> b_prefix
      else
        b.length <=> a.length
      end
    end
  end

  $ltd_1 = history[right_record]
  right = read_right right_record
  i_options_right = i_options right

  watched = ($ltd_left && $ltd_left.match(left)) || ($ltd_right && $ltd_right.match(right))
  debugger if watched && respond_to?(:debugger, true)

  if collision? i_options_left, i_options_right
    $ltd_0 = nil
    return
  end

  $ltd_0 = right_record
  inline_check left_record, right_record
  right_record
end
216
+
217
# Picks the best [left option, right option] pairing out of every combination
# of +t_options_left+ and +t_options_right+ (as produced by cart_prod).
#
# Pairings are ranked by prefix_and_score: the higher score wins; on equal
# scores, when both pairings used a prefix, the larger prefix wins; any
# remaining tie is broken at random.
#
# Returns the winning two-element pairing. The $ltd_ddd_* globals are
# debugging hooks; the debugger is only entered when a `debugger` method is
# actually available.
def optimize(t_options_left, t_options_right)
  cart_prod(t_options_left, t_options_right).max do |a, b|
    t_left_a, t_right_a = a
    t_left_b, t_right_b = b

    a_prefix, a_score = t_left_a.prefix_and_score t_right_a
    b_prefix, b_score = t_left_b.prefix_and_score t_right_b

    yep_ddd = ($ltd_ddd_right && $ltd_ddd_left &&
               [t_left_a, t_left_b].any? { |f| f.str =~ $ltd_ddd_left } &&
               [t_right_a, t_right_b].any? { |f| f.str =~ $ltd_ddd_right } &&
               (!$ltd_ddd_left_not || [t_left_a, t_left_b].none? { |f| f.str =~ $ltd_ddd_left_not }))

    if $ltd_ddd_print && yep_ddd && logger
      logger.debug t_left_a.inspect
      logger.debug t_right_a.inspect
      logger.debug t_left_b.inspect
      logger.debug t_right_b.inspect
      logger.debug
    end

    debugger if yep_ddd && respond_to?(:debugger, true)

    if a_score != b_score
      a_score <=> b_score
    elsif a_prefix && b_prefix && a_prefix != b_prefix
      a_prefix <=> b_prefix
    else
      # randomly choose
      # maybe later i can figure out how big the inputs are and apply occam's razor
      rand(2) == 1 ? -1 : 1
    end
  end
end
250
+
251
# Memoized list of T options for +str+: the untightened string itself first,
# followed by one entry per tightening regexp that matches, whose tightened
# form is the regexp's non-nil captures joined together.
def t_options(str)
  @_t_options ||= {}
  return @_t_options[str] if @_t_options.has_key?(str)

  variants = tightenings.inject([ T.new(str, str) ]) do |memo, regexp|
    match_data = regexp.match(str)
    memo << T.new(str, match_data.captures.compact.join) if match_data
    memo
  end
  @_t_options[str] = variants
end
263
+
264
# One identity of a string: the non-nil captures of +regexp+ applied to +str+,
# joined together. Unless +case_sensitive+ is set, the identity is downcased
# so that identities differing only in letter case compare equal.
#
# +regexp+ is expected to match +str+; a non-matching regexp raises
# NoMethodError (callers in this file check the match first).
class I
  attr_reader :regexp, :str, :case_sensitive, :identity

  def initialize(regexp, str, case_sensitive)
    @regexp = regexp
    @str = str
    # Stored so the case_sensitive reader reports what was passed in.
    @case_sensitive = case_sensitive
    raw_identity = regexp.match(str).captures.compact.join
    @identity = case_sensitive ? raw_identity : raw_identity.downcase
  end
end
273
+
274
# True when some left identity and some right identity were produced by the
# same regexp yet have different identity strings.
def collision?(i_options_left, i_options_right)
  i_options_left.each do |lhs|
    i_options_right.each do |rhs|
      return true if lhs.regexp == rhs.regexp && lhs.identity != rhs.identity
    end
  end
  false
end
281
+
282
# Memoized list of I identities for +str+: one per identity regexp that
# matches it, in the order the regexps are defined.
def i_options(str)
  @_i_options ||= {}
  return @_i_options[str] if @_i_options.has_key?(str)

  matching = identities.select { |regexp| regexp.match str }
  @_i_options[str] = matching.map { |regexp| I.new(regexp, str, case_sensitive) }
end
293
+
294
# Memoized lookup of the first blocking regexp that matches +str+; nil when
# none does (the nil result is cached as well).
def blocking(str)
  @_blocking ||= {}
  return @_blocking[str] if @_blocking.has_key?(str)

  @_blocking[str] = blockings.detect { |regexp| regexp.match str }
end
304
+
305
# Compiles a regexp written as a literal string such as "/foo(bar)/i" into a
# Regexp, memoized per input string.
#
# Only the letters that follow the closing slash are treated as options
# (i, x, m); letters inside the pattern are never mistaken for options. All
# requested options are combined. IGNORECASE is always added unless the
# dictionary is case_sensitive.
def literal_regexp(str)
  @_literal_regexp ||= {}
  return @_literal_regexp[str] if @_literal_regexp.has_key?(str)

  # Same trailing "/opts" shape that the gsub below strips off.
  trailing_options = str[/\/([ixm]*)\z/, 1] || ''

  flags = 0
  flags |= Regexp::IGNORECASE if !case_sensitive || trailing_options.include?('i')
  flags |= Regexp::MULTILINE if trailing_options.include?('m')
  flags |= Regexp::EXTENDED if trailing_options.include?('x')

  source = str.gsub(/\A\/|\/([ixm]*)\z/, '')
  @_literal_regexp[str] = Regexp.new(source, flags)
end
314
+
315
# Extracts the comparable string from a left record: nil for a nil record,
# the result of left_reader when one is set, the record itself when it is a
# String, and otherwise its first element.
def read_left(left_record)
  return if left_record.nil?
  return left_reader.call(left_record) if left_reader

  left_record.is_a?(String) ? left_record : left_record[0]
end
325
+
326
# Extracts the comparable string from a right record: nil for a nil record,
# the result of right_reader when one is set, the record itself when it is a
# String, and otherwise its first element.
def read_right(right_record)
  return if right_record.nil?
  return right_reader.call(right_record) if right_reader

  right_record.is_a?(String) ? right_record : right_record[0]
end
336
+
337
+ # Thanks William James!
338
+ # http://www.ruby-forum.com/topic/95519#200484
339
# Cartesian product of the given lists, returned as an array of combinations.
# The first list varies fastest: cart_prod([1, 2], [3, 4]) yields
# [[1, 3], [2, 3], [1, 4], [2, 4]]. With no arguments the result is [[]].
def cart_prod(*args)
  args.inject([[]]) do |partials, list|
    extended = []
    list.each do |element|
      partials.each { |partial| extended << (partial + [element]) }
    end
    extended
  end
end
346
+ end
@@ -0,0 +1,12 @@
1
+ require 'rubygems'
2
+ require 'test/unit'
3
+ require 'shoulda'
4
+ require 'logger'
5
+ require 'ruby-debug'
6
+
7
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
8
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
9
+ require 'loose_tight_dictionary'
10
+
11
+ class Test::Unit::TestCase
12
+ end
@@ -0,0 +1,273 @@
1
+ require 'helper'
2
+
3
+ require 'remote_table'
4
+
5
+ # $logger = Logger.new STDERR
6
+ # $logger.level = Logger::INFO
7
+ # $tee = STDOUT
8
+
9
+ class TestLooseTightDictionary < Test::Unit::TestCase
10
+ def setup
11
+ clear_ltd
12
+
13
+ # dh 8 400
14
+ @a_left = ['DE HAVILLAND CANADA DHC8400 Dash 8']
15
+ @a_right = ['DEHAVILLAND DEHAVILLAND DHC8-400 DASH-8']
16
+ # dh 88
17
+ @b_left = ['ABCDEFG DH88 HIJKLMNOP']
18
+ # dh 89
19
+ @c_right = ['ABCDEFG DH89 HIJKLMNOP']
20
+ # dh 8 200
21
+ @d_left = ['DE HAVILLAND CANADA DHC8200 Dash 8']
22
+ @d_right = ['BOMBARDIER DEHAVILLAND DHC8-200Q DASH-8']
23
+ @d_lookalike = ['ABCD DHC8200 Dash 8']
24
+
25
+ @t_1 = [ '/(dh)c?-?(\d{0,2})-?(\d{0,4})(?:.*?)(dash|\z)/i', 'good tightening for de havilland' ]
26
+
27
+ @r_1 = [ '/(dh)c?-?(\d{0,2})-?(\d{0,4})(?:.*?)(dash|\z)/i', 'good identity for de havilland' ]
28
+
29
+ @left = [
30
+ @a_left,
31
+ @b_left,
32
+ ['DE HAVILLAND DH89 Dragon Rapide'],
33
+ ['DE HAVILLAND CANADA DHC8100 Dash 8 (E9, CT142, CC142)'],
34
+ @d_left,
35
+ ['DE HAVILLAND CANADA DHC8300 Dash 8'],
36
+ ['DE HAVILLAND DH90 Dragonfly']
37
+ ]
38
+ @right = [
39
+ @a_right,
40
+ @c_right,
41
+ @d_right,
42
+ ['DEHAVILLAND DEHAVILLAND DHC8-100 DASH-8'],
43
+ ['DEHAVILLAND DEHAVILLAND TWIN OTTER DHC-6']
44
+ ]
45
+ @tightenings = []
46
+ @identities = []
47
+ @blockings = []
48
+ @positives = []
49
+ @negatives = []
50
+ end
51
+
52
+ def clear_ltd
53
+ @_ltd = nil
54
+ end
55
+
56
+ def ltd
57
+ @_ltd ||= LooseTightDictionary.new @right,
58
+ :tightenings => @tightenings,
59
+ :identities => @identities,
60
+ :blockings => @blockings,
61
+ :positives => @positives,
62
+ :negatives => @negatives,
63
+ :blocking_only => @blocking_only,
64
+ :logger => $logger,
65
+ :tee => $tee
66
+ end
67
+
68
+ if ENV['NEW'] == 'true' or ENV['ALL'] == 'true'
69
+ end
70
+
71
+ if ENV['OLD'] == 'true' or ENV['ALL'] == 'true'
72
+ should "optionally only pay attention to things that match blockings" do
73
+ assert_equal @a_right, ltd.left_to_right(@a_left)
74
+
75
+ clear_ltd
76
+ @blocking_only = true
77
+ assert_equal nil, ltd.left_to_right(@a_left)
78
+
79
+ clear_ltd
80
+ @blocking_only = true
81
+ @blockings.push ['/dash/i']
82
+ assert_equal @a_right, ltd.left_to_right(@a_left)
83
+ end
84
+
85
+ # the example from the readme, considerably uglier here
86
should "check a simple table" do
  @right = [ 'seamus', 'andy', 'ben' ]
  # Check rows are [left, right]: inline_check looks a row up by the string
  # read from the left record and compares its second column with the right
  # record, so the left value has to come first for the check to run.
  @positives = [ [ 'Mr. Seamus Abshere', 'seamus' ] ]
  left = [ 'Mr. Seamus Abshere', 'Sr. Andy Rossmeissl', 'Master BenT' ]

  assert_nothing_raised do
    ltd.check left
  end
end
95
+
96
+ should "treat a String as a full record if passed through" do
97
+ dash = 'DHC8-400'
98
+ b747 = 'B747200/300'
99
+ dc9 = 'DC-9-10'
100
+ right_records = [ dash, b747, dc9 ]
101
+ simple_ltd = LooseTightDictionary.new right_records, :logger => $logger, :tee => $tee
102
+ assert_equal dash, simple_ltd.left_to_right('DeHavilland Dash-8 DHC-400')
103
+ assert_equal b747, simple_ltd.left_to_right('Boeing 747-300')
104
+ assert_equal dc9, simple_ltd.left_to_right('McDonnell Douglas MD81/DC-9')
105
+ end
106
+
107
+ should "call it a mismatch if you hit a blank positive" do
108
+ @positives.push [@a_left[0], '']
109
+ assert_raises(LooseTightDictionary::Mismatch) do
110
+ ltd.left_to_right @a_left
111
+ end
112
+ end
113
+
114
+ should "call it a false positive if you hit a blank negative" do
115
+ @negatives.push [@a_left[0], '']
116
+ assert_raises(LooseTightDictionary::FalsePositive) do
117
+ ltd.left_to_right @a_left
118
+ end
119
+ end
120
+
121
+ should "have a false match without blocking" do
122
+ # @d_left will be our victim
123
+ @right.push @d_lookalike
124
+ @tightenings.push @t_1
125
+
126
+ assert_equal @d_lookalike, ltd.left_to_right(@d_left)
127
+ end
128
+
129
+ should "do blocking if the left matches a block" do
130
+ # @d_left will be our victim
131
+ @right.push @d_lookalike
132
+ @tightenings.push @t_1
133
+ @blockings.push ['/(bombardier|de ?havilland)/i']
134
+
135
+ assert_equal @d_right, ltd.left_to_right(@d_left)
136
+ end
137
+
138
+ should "treat blocks as exclusive" do
139
+ @right = [ @d_left ]
140
+ @tightenings.push @t_1
141
+ @blockings.push ['/(bombardier|de ?havilland)/i']
142
+
143
+ assert_equal nil, ltd.left_to_right(@d_lookalike)
144
+ end
145
+
146
+ should "only use identities if they stem from the same regexp" do
147
+ @identities.push @r_1
148
+ @identities.push [ '/(cessna)(?:.*?)(citation)/i' ]
149
+ @identities.push [ '/(cessna)(?:.*?)(\d\d\d)/i' ]
150
+ x_left = [ 'CESSNA D-333 CITATION V']
151
+ x_right = [ 'CESSNA D-333' ]
152
+ @right.push x_right
153
+
154
+ assert_equal x_right, ltd.left_to_right(x_left)
155
+ end
156
+
157
+ should "use the best score from all of the tightenings" do
158
+ x_left = ["BOEING 737100"]
159
+ x_right = ["BOEING BOEING 737-100/200"]
160
+ x_right_wrong = ["BOEING BOEING 737-900"]
161
+ @right.push x_right
162
+ @right.push x_right_wrong
163
+ @tightenings.push ['/(7\d)(7|0)-?\d{1,3}\/(\d\d\d)/i']
164
+ @tightenings.push ['/(7\d)(7|0)-?(\d{1,3}|[A-Z]{0,3})/i']
165
+
166
+ assert_equal x_right, ltd.left_to_right(x_left)
167
+ end
168
+
169
+ should "compare using prefixes if tightened key is shorter than correct match" do
170
+ x_left = ["BOEING 720"]
171
+ x_right = ["BOEING BOEING 720-000"]
172
+ x_right_wrong = ["BOEING BOEING 717-200"]
173
+ @right.push x_right
174
+ @right.push x_right_wrong
175
+ @tightenings.push @t_1
176
+ @tightenings.push ['/(7\d)(7|0)-?\d{1,3}\/(\d\d\d)/i']
177
+ @tightenings.push ['/(7\d)(7|0)-?(\d{1,3}|[A-Z]{0,3})/i']
178
+
179
+ assert_equal x_right, ltd.left_to_right(x_left)
180
+ end
181
+
182
+ should "use the shortest original input" do
183
+ x_left = ['De Havilland DHC8-777 Dash-8 Superstar']
184
+ x_right = ['DEHAVILLAND DEHAVILLAND DHC8-777 DASH-8 Superstar']
185
+ x_right_long = ['DEHAVILLAND DEHAVILLAND DHC8-777 DASH-8 Superstar/Supernova']
186
+
187
+ @right.push x_right_long
188
+ @right.push x_right
189
+ @tightenings.push @t_1
190
+
191
+ assert_equal x_right, ltd.left_to_right(x_left)
192
+ end
193
+
194
+ should "perform lookups left to right" do
195
+ assert_equal @a_right, ltd.left_to_right(@a_left)
196
+ end
197
+
198
+ should "succeed if there are no checks" do
199
+ assert_nothing_raised do
200
+ ltd.check @left
201
+ end
202
+ end
203
+
204
+ should "succeed if the positive checks just work" do
205
+ @positives.push [ @a_left[0], @a_right[0] ]
206
+
207
+ assert_nothing_raised do
208
+ ltd.check @left
209
+ end
210
+ end
211
+
212
+ should "fail if positive checks don't work" do
213
+ @positives.push [ @d_left[0], @d_right[0] ]
214
+
215
+ assert_raises(LooseTightDictionary::Mismatch) do
216
+ ltd.check @left
217
+ end
218
+ end
219
+
220
+ should "succeed if proper tightening is applied" do
221
+ @positives.push [ @d_left[0], @d_right[0] ]
222
+ @tightenings.push @t_1
223
+
224
+ assert_nothing_raised do
225
+ ltd.check @left
226
+ end
227
+ end
228
+
229
+ should "use a Google Docs spreadsheet as a source of tightenings" do
230
+ @positives.push [ @d_left[0], @d_right[0] ]
231
+ @tightenings = RemoteTable.new :url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=0&output=csv', :headers => false
232
+
233
+ assert_nothing_raised do
234
+ ltd.check @left
235
+ end
236
+ end
237
+
238
+ should "fail if negative checks don't work" do
239
+ @negatives.push [ @b_left[0], @c_right[0] ]
240
+
241
+ assert_raises(LooseTightDictionary::FalsePositive) do
242
+ ltd.check @left
243
+ end
244
+ end
245
+
246
+ should "do inline checking" do
247
+ @negatives.push [ @b_left[0], @c_right[0] ]
248
+
249
+ assert_raises(LooseTightDictionary::FalsePositive) do
250
+ ltd.left_to_right @b_left
251
+ end
252
+ end
253
+
254
+ should "fail if negative checks don't work, even with tightening" do
255
+ @negatives.push [ @b_left[0], @c_right[0] ]
256
+ @tightenings.push @t_1
257
+
258
+ assert_raises(LooseTightDictionary::FalsePositive) do
259
+ ltd.check @left
260
+ end
261
+ end
262
+
263
+ should "succeed if proper identity is applied" do
264
+ @negatives.push [ @b_left[0], @c_right[0] ]
265
+ @positives.push [ @d_left[0], @d_right[0] ]
266
+ @identities.push @r_1
267
+
268
+ assert_nothing_raised do
269
+ ltd.check @left
270
+ end
271
+ end
272
+ end
273
+ end
metadata ADDED
@@ -0,0 +1,175 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: loose_tight_dictionary-ruby19
3
+ version: !ruby/object:Gem::Version
4
+ hash: 15
5
+ prerelease: false
6
+ segments:
7
+ - 0
8
+ - 0
9
+ - 8
10
+ version: 0.0.8
11
+ platform: ruby
12
+ authors:
13
+ - Seamus Abshere
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2010-09-27 00:00:00 -05:00
19
+ default_executable:
20
+ dependencies:
21
+ - !ruby/object:Gem::Dependency
22
+ name: shoulda
23
+ prerelease: false
24
+ requirement: &id001 !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ">="
28
+ - !ruby/object:Gem::Version
29
+ hash: 3
30
+ segments:
31
+ - 0
32
+ version: "0"
33
+ type: :development
34
+ version_requirements: *id001
35
+ - !ruby/object:Gem::Dependency
36
+ name: remote_table
37
+ prerelease: false
38
+ requirement: &id002 !ruby/object:Gem::Requirement
39
+ none: false
40
+ requirements:
41
+ - - ">="
42
+ - !ruby/object:Gem::Version
43
+ hash: 49
44
+ segments:
45
+ - 0
46
+ - 2
47
+ - 19
48
+ version: 0.2.19
49
+ type: :development
50
+ version_requirements: *id002
51
+ - !ruby/object:Gem::Dependency
52
+ name: activesupport
53
+ prerelease: false
54
+ requirement: &id003 !ruby/object:Gem::Requirement
55
+ none: false
56
+ requirements:
57
+ - - ">="
58
+ - !ruby/object:Gem::Version
59
+ hash: 11
60
+ segments:
61
+ - 2
62
+ - 3
63
+ - 4
64
+ version: 2.3.4
65
+ type: :runtime
66
+ version_requirements: *id003
67
+ - !ruby/object:Gem::Dependency
68
+ name: fastercsv
69
+ prerelease: false
70
+ requirement: &id004 !ruby/object:Gem::Requirement
71
+ none: false
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ hash: 5
76
+ segments:
77
+ - 1
78
+ - 5
79
+ - 3
80
+ version: 1.5.3
81
+ type: :runtime
82
+ version_requirements: *id004
83
+ - !ruby/object:Gem::Dependency
84
+ name: andand
85
+ prerelease: false
86
+ requirement: &id005 !ruby/object:Gem::Requirement
87
+ none: false
88
+ requirements:
89
+ - - ">="
90
+ - !ruby/object:Gem::Version
91
+ hash: 25
92
+ segments:
93
+ - 1
94
+ - 3
95
+ - 1
96
+ version: 1.3.1
97
+ type: :runtime
98
+ version_requirements: *id005
99
+ - !ruby/object:Gem::Dependency
100
+ name: amatch
101
+ prerelease: false
102
+ requirement: &id006 !ruby/object:Gem::Requirement
103
+ none: false
104
+ requirements:
105
+ - - ">="
106
+ - !ruby/object:Gem::Version
107
+ hash: 29
108
+ segments:
109
+ - 0
110
+ - 2
111
+ - 5
112
+ version: 0.2.5
113
+ type: :runtime
114
+ version_requirements: *id006
115
+ description: Create dictionaries that link rows between two tables (left and right) using loose matching (string similarity) by default and tight matching (regexp) by request.
116
+ email: seamus@abshere.net
117
+ executables: []
118
+
119
+ extensions: []
120
+
121
+ extra_rdoc_files:
122
+ - LICENSE
123
+ - README.rdoc
124
+ files:
125
+ - .document
126
+ - .gitignore
127
+ - LICENSE
128
+ - README.rdoc
129
+ - Rakefile
130
+ - VERSION
131
+ - examples/first_name_matching.rb
132
+ - examples/icao-bts.rb
133
+ - examples/icao-bts.xls
134
+ - lib/loose_tight_dictionary.rb
135
+ - test/helper.rb
136
+ - test/test_loose_tight_dictionary.rb
137
+ has_rdoc: true
138
+ homepage: http://github.com/seamusabshere/loose_tight_dictionary
139
+ licenses: []
140
+
141
+ post_install_message:
142
+ rdoc_options:
143
+ - --charset=UTF-8
144
+ require_paths:
145
+ - lib
146
+ required_ruby_version: !ruby/object:Gem::Requirement
147
+ none: false
148
+ requirements:
149
+ - - ">="
150
+ - !ruby/object:Gem::Version
151
+ hash: 3
152
+ segments:
153
+ - 0
154
+ version: "0"
155
+ required_rubygems_version: !ruby/object:Gem::Requirement
156
+ none: false
157
+ requirements:
158
+ - - ">="
159
+ - !ruby/object:Gem::Version
160
+ hash: 3
161
+ segments:
162
+ - 0
163
+ version: "0"
164
+ requirements: []
165
+
166
+ rubyforge_project:
167
+ rubygems_version: 1.3.7
168
+ signing_key:
169
+ specification_version: 3
170
+ summary: Allows iterative development of dictionaries for big data sets.
171
+ test_files:
172
+ - test/helper.rb
173
+ - test/test_loose_tight_dictionary.rb
174
+ - examples/first_name_matching.rb
175
+ - examples/icao-bts.rb