loose_tight_dictionary-ruby19 0.0.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,5 @@
1
+ README.rdoc
2
+ lib/**/*.rb
3
+ bin/*
4
+ features/**/*.feature
5
+ LICENSE
@@ -0,0 +1,21 @@
1
+ ## MAC OS
2
+ .DS_Store
3
+
4
+ ## TEXTMATE
5
+ *.tmproj
6
+ tmtags
7
+
8
+ ## EMACS
9
+ *~
10
+ \#*
11
+ .\#*
12
+
13
+ ## VIM
14
+ *.swp
15
+
16
+ ## PROJECT::GENERAL
17
+ coverage
18
+ rdoc
19
+ pkg
20
+
21
+ ## PROJECT::SPECIFIC
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2009 Seamus Abshere
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,56 @@
1
+ = loose_tight_dictionary
2
+
3
+ Match things based on string similarity (using the Pair Distance algorithm) and regular expressions.
4
+
5
+ = Quickstart
6
+
7
+ >> right_records = [ 'seamus', 'andy', 'ben' ]
8
+ => [...]
9
+ >> left_record = 'Shamus Heaney'
10
+ => [...]
11
+ >> d = LooseTightDictionary.new right_records
12
+ => [...]
13
+ >> puts d.left_to_right left_record
14
+ => 'seamus'
15
+
16
+ Try running the included example file:
17
+
18
+ $ ruby examples/first_name_matching.rb
19
+ Left side (input)
20
+ ====================
21
+ Mr. Seamus
22
+ Sr. Andy
23
+ Master BenT
24
+
25
+ Right side (output)
26
+ ====================
27
+ seamus
28
+ andy
29
+ ben
30
+
31
+ Results
32
+ ====================
33
+ Left record (input) Right record (output) Prefix used (if any) Score
34
+ Mr. Seamus seamus NULL 0.666666666666667
35
+ Sr. Andy andy NULL 0.5
36
+ Master BenT ben NULL 0.2
37
+
38
+ = Improving dictionaries
39
+
40
+ Similarity matching will only get you so far.
41
+
42
+ TODO: regex usage
43
+
44
+ == Note on Patches/Pull Requests
45
+
46
+ * Fork the project.
47
+ * Make your feature addition or bug fix.
48
+ * Add tests for it. This is important so I don't break it in a
49
+ future version unintentionally.
50
+ * Commit, do not mess with rakefile, version, or history.
51
+ (If you want to have your own version, that is fine, but bump the version in a separate commit that I can ignore when I pull.)
52
+ * Send me a pull request. Bonus points for topic branches.
53
+
54
+ == Copyright
55
+
56
+ Copyright (c) 2010 Seamus Abshere. See LICENSE for details.
@@ -0,0 +1,58 @@
1
+ require 'rubygems'
2
+ require 'rake'
3
+
4
+ begin
5
+ require 'jeweler'
6
+ Jeweler::Tasks.new do |gem|
7
+ gem.name = "loose_tight_dictionary"
8
+ gem.summary = %Q{Allows iterative development of dictionaries for big data sets.}
9
+ gem.description = %Q{Create dictionaries that link rows between two tables (left and right) using loose matching (string similarity) by default and tight matching (regexp) by request.}
10
+ gem.email = "seamus@abshere.net"
11
+ gem.homepage = "http://github.com/seamusabshere/loose_tight_dictionary"
12
+ gem.authors = ["Seamus Abshere"]
13
+ gem.add_development_dependency "shoulda"
14
+ gem.add_development_dependency "remote_table", ">=0.2.19"
15
+ gem.add_dependency 'activesupport', '>=2.3.4'
16
+ gem.add_dependency 'fastercsv', '>=1.5.3'
17
+ gem.add_dependency 'andand', '>=1.3.1'
18
+ gem.add_dependency 'amatch', '>=0.2.5'
19
+ # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
20
+ end
21
+ Jeweler::GemcutterTasks.new
22
+ rescue LoadError
23
+ puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
24
+ end
25
+
26
+ require 'rake/testtask'
27
+ Rake::TestTask.new(:test) do |test|
28
+ test.libs << 'lib' << 'test'
29
+ test.pattern = 'test/**/test_*.rb'
30
+ test.verbose = true
31
+ end
32
+
33
+ begin
34
+ require 'rcov/rcovtask'
35
+ Rcov::RcovTask.new do |test|
36
+ test.libs << 'test'
37
+ test.pattern = 'test/**/test_*.rb'
38
+ test.verbose = true
39
+ end
40
+ rescue LoadError
41
+ task :rcov do
42
+ abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
43
+ end
44
+ end
45
+
46
+ task :test => :check_dependencies
47
+
48
+ task :default => :test
49
+
50
+ require 'rake/rdoctask'
51
+ Rake::RDocTask.new do |rdoc|
52
+ version = File.exist?('VERSION') ? File.read('VERSION') : ""
53
+
54
+ rdoc.rdoc_dir = 'rdoc'
55
+ rdoc.title = "loose_tight_dictionary #{version}"
56
+ rdoc.rdoc_files.include('README*')
57
+ rdoc.rdoc_files.include('lib/**/*.rb')
58
+ end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.0.8
@@ -0,0 +1,23 @@
1
+ #!/usr/bin/env ruby
2
+ require 'rubygems'
3
+ # require 'loose_tight_dictionary'
4
+ require File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib', 'loose_tight_dictionary.rb'))
5
+ right_side = [ 'seamus', 'andy', 'ben' ]
6
+ left_side = [ 'Mr. Seamus', 'Sr. Andy', 'Master BenT' ]
7
+
8
+ puts "Left side (input)"
9
+ puts "=" * 20
10
+ puts left_side
11
+ puts
12
+
13
+ puts "Right side (output)"
14
+ puts "=" * 20
15
+ puts right_side
16
+ puts
17
+
18
+ puts "Results"
19
+ puts "=" * 20
20
+ d = LooseTightDictionary.new right_side, :tee => STDOUT, :tee_format => :fixed_width
21
+ d.check left_side
22
+
23
+ puts d.left_to_right 'Shamus Heaney'
@@ -0,0 +1,58 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'rubygems'
4
+ require 'remote_table'
5
+ require 'ruby-debug'
6
+ require 'logger'
7
+ require File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib', 'loose_tight_dictionary.rb'))
8
+
9
+ $logger = Logger.new STDERR
10
+ $logger.level = Logger::DEBUG
11
+ $logger.datetime_format = "%H:%M:%S"
12
+ # $tee = File.open('tee.csv', 'w')
13
+ $tee = STDOUT
14
+
15
+ # $ltd_left = /(super|bonanza)/i
16
+ # $ltd_right = /bonanza d-35/i
17
+ # $ltd_dd_left = /bonanza/i
18
+ # $ltd_dd_right = /musk/i
19
+ # $ltd_dd_left_not = /allison/i
20
+ # $ltd_dd_print = true
21
+ # $ltd_ddd_left = /bonanza/i
22
+ # $ltd_ddd_right = /musk/i
23
+ # $ltd_ddd_left_not = /allison/i
24
+ # $ltd_ddd_print = true
25
+
26
+ @right = RemoteTable.new :url => 'http://www.bts.gov/programs/airline_information/accounting_and_reporting_directives/csv/number_260.csv',
27
+ :select => lambda { |record| record['Aircraft Type'].to_i.between?(1, 998) and record['Manufacturer'].present? }
28
+
29
+ @tightenings = RemoteTable.new :url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=0&output=csv', :headers => false
30
+
31
+ @identities = RemoteTable.new :url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=3&output=csv', :headers => false
32
+
33
+ @blockings = RemoteTable.new :url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=4&output=csv', :headers => false
34
+
35
+ @positives = RemoteTable.new :url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=1&output=csv', :headers => false
36
+
37
+ @negatives = RemoteTable.new :url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=2&output=csv', :headers => false
38
+
39
+ %w{ tightenings identities blockings }.each do |name|
40
+ $logger.info name
41
+ $logger.info "\n" + instance_variable_get("@#{name}").to_a.map { |record| record[0] }.join("\n")
42
+ $logger.info "\n"
43
+ end
44
+
45
+ ('A'..'Z').each do |letter|
46
+ # %w{ E }.each do |letter|
47
+ @left = RemoteTable.new :url => "http://www.faa.gov/air_traffic/publications/atpubs/CNT/5-2-#{letter}.htm",
48
+ :encoding => 'US-ASCII',
49
+ :row_xpath => '//table/tr[2]/td/table/tr',
50
+ :column_xpath => 'td'
51
+
52
+ d = LooseTightDictionary.new @right, :tightenings => @tightenings, :identities => @identities, :blockings => @blockings, :logger => $logger, :tee => $tee
53
+ d.left_reader = lambda { |record| record['Manufacturer'] + ' ' + record['Model'] }
54
+ d.right_reader = lambda { |record| record['Manufacturer'] + ' ' + record['Long Name'] }
55
+ d.positives = @positives
56
+ d.negatives = @negatives
57
+ d.check @left
58
+ end
Binary file
@@ -0,0 +1,346 @@
1
+ require 'active_support'
2
+ require 'active_support/version'
3
+ %w{
4
+ active_support/core_ext/string
5
+ }.each do |active_support_3_requirement|
6
+ require active_support_3_requirement
7
+ end if ActiveSupport::VERSION::MAJOR == 3
8
+ require 'amatch'
9
+ require 'andand'
10
+ if RUBY_VERSION >= "1.9"
11
+ require 'csv'
12
+ else
13
+ require 'fastercsv'
14
+ end
15
+
16
+ class LooseTightDictionary
17
+ class MissedChecks < RuntimeError; end
18
+ class Mismatch < RuntimeError; end
19
+ class FalsePositive < RuntimeError; end
20
+
21
+ class T
22
+ attr_reader :str, :tightened_str
23
+ def initialize(str, tightened_str)
24
+ @str = str
25
+ @tightened_str = tightened_str
26
+ end
27
+
28
+ def tightened?
29
+ str != tightened_str
30
+ end
31
+
32
+ def prefix_and_score(other)
33
+ prefix = [ tightened_str.length, other.tightened_str.length ].min if tightened? and other.tightened?
34
+ score = if prefix
35
+ tightened_str.first(prefix).pair_distance_similar other.tightened_str.first(prefix)
36
+ else
37
+ tightened_str.pair_distance_similar other.tightened_str
38
+ end
39
+ [ prefix, score ]
40
+ end
41
+ end
42
+
43
+ include Amatch
44
+
45
+ attr_reader :right_records
46
+ attr_reader :case_sensitive
47
+
48
+ attr_accessor :logger
49
+ attr_accessor :tee
50
+ attr_accessor :tee_format
51
+ attr_accessor :positives
52
+ attr_accessor :negatives
53
+ attr_accessor :left_reader
54
+ attr_accessor :right_reader
55
+ attr_accessor :blocking_only
56
+
57
+ def initialize(right_records, options = {})
58
+ @right_records = right_records
59
+ @_raw_tightenings = options[:tightenings] || Array.new
60
+ @_raw_identities = options[:identities] || Array.new
61
+ @_raw_blockings = options[:blockings] || Array.new
62
+ @left_reader = options[:left_reader]
63
+ @right_reader = options[:right_reader]
64
+ @positives = options[:positives]
65
+ @negatives = options[:negatives]
66
+ @logger = options[:logger]
67
+ @tee = options[:tee]
68
+ @tee_format = options[:tee_format] || :fixed_width
69
+ @case_sensitive = options[:case_sensitive] || false
70
+ @blocking_only = options[:blocking_only] || false
71
+ end
72
+
73
# Defines three memoized readers:
#   def tightenings
#   def identities
#   def blockings
# Each one reads the first column of the matching raw table
# (@_raw_tightenings, @_raw_identities, @_raw_blockings) and compiles it
# into a Regexp with literal_regexp.
#
# Rows whose first column is blank are dropped. Previously they were kept
# as nil entries (the value of `next` inside `map`), which made every
# caller that iterates the list and calls regexp.match fail on nil.
%w{ tightenings identities blockings }.each do |name|
  module_eval %{
    def #{name}
      @#{name} ||= @_raw_#{name}.map { |row| row[0] }.reject { |pattern| pattern.blank? }.map { |pattern| literal_regexp(pattern) }
    end
  }
end
86
+
87
+ def blocking_only?
88
+ !!blocking_only
89
+ end
90
+
91
+ def inline_check(left_record, right_record)
92
+ return unless positives.present? or negatives.present?
93
+
94
+ left = read_left left_record
95
+ right = read_right right_record
96
+
97
+ if positive_record = positives.andand.detect { |record| record[0] == left }
98
+ correct_right = positive_record[1]
99
+ if correct_right.blank? and right.present?
100
+ logger.andand.debug " Mismatch! (should match SOMETHING)"
101
+ raise Mismatch
102
+ elsif right != correct_right
103
+ logger.andand.debug " Mismatch! (should be #{correct_right})"
104
+ raise Mismatch
105
+ end
106
+ end
107
+
108
+ if negative_record = negatives.andand.detect { |record| record[0] == left }
109
+ incorrect_right = negative_record[1]
110
+ if incorrect_right.blank? and right.present?
111
+ logger.andand.debug " False positive! (should NOT match ANYTHING)"
112
+ raise FalsePositive
113
+ elsif right == incorrect_right
114
+ logger.andand.debug " False positive! (should NOT be #{incorrect_right})"
115
+ raise FalsePositive
116
+ end
117
+ end
118
+ end
119
+
120
# Runs left_to_right over every record in +left_records+, writing a header
# and one result row per record to +tee+ (when a tee is configured).
#
# left_records - an Enumerable of left-side records.
#
# Rows are taken from the global $ltd_1, which left_to_right fills with
# [left tightened string, right tightened string, prefix, score].
# The row is written from an +ensure+ clause so it is still emitted when
# left_to_right raises Mismatch or FalsePositive; the exception then
# propagates to the caller.
#
# Returns whatever left_records.each returns.
def check(left_records)
  # Writes one row in the configured tee_format; silently skips when there
  # is no tee or no row (both formats are now guarded, not just :fixed_width).
  emit = lambda do |row|
    next if tee.nil? || row.nil?
    case tee_format
    when :csv
      tee.puts row.flatten.to_csv
    when :fixed_width
      tee.puts row.map { |i| i.to_s.ljust(30) }.join
    end
  end

  emit.call [ 'Left record (input)', 'Right record (output)', 'Prefix used (if any)', 'Score' ]

  left_records.each do |left_record|
    # Reset first: left_to_right can return before assigning $ltd_1, and the
    # previous record's row must not be printed a second time.
    $ltd_1 = nil
    begin
      left_to_right left_record
    ensure
      emit.call $ltd_1
    end
  end
end
142
+
143
+ def left_to_right(left_record)
144
+ left = read_left left_record
145
+ blocking_left = blocking left
146
+ return if blocking_only? and blocking_left.nil?
147
+ i_options_left = i_options left
148
+ t_options_left = t_options left
149
+ history = Hash.new
150
+ right_record = right_records.select do |right_record|
151
+ right = read_right right_record
152
+ blocking_right = blocking right
153
+ (not blocking_left and not blocking_right) or
154
+ (blocking_right and blocking_right.match(left)) or
155
+ (blocking_left and blocking_left.match(right))
156
+ end.max do |a_record, b_record|
157
+ a = read_right a_record
158
+ b = read_right b_record
159
+ i_options_a = i_options a
160
+ i_options_b = i_options b
161
+ collision_a = collision? i_options_left, i_options_a
162
+ collision_b = collision? i_options_left, i_options_b
163
+ if collision_a and collision_b
164
+ # neither would ever work, so randomly rank one over the other
165
+ rand(2) == 1 ? -1 : 1
166
+ elsif collision_a
167
+ -1
168
+ elsif collision_b
169
+ 1
170
+ else
171
+ t_left_a, t_right_a = optimize t_options_left, t_options(a)
172
+ t_left_b, t_right_b = optimize t_options_left, t_options(b)
173
+ a_prefix, a_score = t_left_a.prefix_and_score t_right_a
174
+ b_prefix, b_score = t_left_b.prefix_and_score t_right_b
175
+ history[a_record] = [t_left_a.tightened_str, t_right_a.tightened_str, a_prefix ? a_prefix : 'NULL', a_score]
176
+ history[b_record] = [t_left_b.tightened_str, t_right_b.tightened_str, b_prefix ? b_prefix : 'NULL', b_score]
177
+
178
+ yep_dd = ($ltd_dd_right and $ltd_dd_left and [t_left_a, t_left_b].any? { |f| f.str =~ $ltd_dd_left } and [t_right_a, t_right_b].any? { |f| f.str =~ $ltd_dd_right } and (!$ltd_dd_left_not or [t_left_a, t_left_b].none? { |f| f.str =~ $ltd_dd_left_not }))
179
+
180
+ if $ltd_dd_print and yep_dd
181
+ logger.andand.debug t_left_a.inspect
182
+ logger.andand.debug t_right_a.inspect
183
+ logger.andand.debug t_left_b.inspect
184
+ logger.andand.debug t_right_b.inspect
185
+ logger.andand.debug
186
+ end
187
+
188
+ z = 1
189
+ debugger if yep_dd
190
+ z = 1
191
+
192
+ if a_score != b_score
193
+ a_score <=> b_score
194
+ elsif a_prefix and b_prefix and a_prefix != b_prefix
195
+ a_prefix <=> b_prefix
196
+ else
197
+ b.length <=> a.length
198
+ end
199
+ end
200
+ end
201
+ $ltd_1 = history[right_record]
202
+ right = read_right right_record
203
+ i_options_right = i_options right
204
+ z = 1
205
+ debugger if $ltd_left.andand.match(left) or $ltd_right.andand.match(right)
206
+ z = 1
207
+ if collision? i_options_left, i_options_right
208
+ $ltd_0 = nil
209
+ return
210
+ else
211
+ $ltd_0 = right_record
212
+ end
213
+ inline_check left_record, right_record
214
+ right_record
215
+ end
216
+
217
+ def optimize(t_options_left, t_options_right)
218
+ cart_prod(t_options_left, t_options_right).max do |a, b|
219
+ t_left_a, t_right_a = a
220
+ t_left_b, t_right_b = b
221
+
222
+ a_prefix, a_score = t_left_a.prefix_and_score t_right_a
223
+ b_prefix, b_score = t_left_b.prefix_and_score t_right_b
224
+
225
+ yep_ddd = ($ltd_ddd_right and $ltd_ddd_left and [t_left_a, t_left_b].any? { |f| f.str =~ $ltd_ddd_left } and [t_right_a, t_right_b].any? { |f| f.str =~ $ltd_ddd_right } and (!$ltd_ddd_left_not or [t_left_a, t_left_b].none? { |f| f.str =~ $ltd_ddd_left_not }))
226
+
227
+ if $ltd_ddd_print and yep_ddd
228
+ logger.andand.debug t_left_a.inspect
229
+ logger.andand.debug t_right_a.inspect
230
+ logger.andand.debug t_left_b.inspect
231
+ logger.andand.debug t_right_b.inspect
232
+ logger.andand.debug
233
+ end
234
+
235
+ z = 1
236
+ debugger if yep_ddd
237
+ z = 1
238
+
239
+ if a_score != b_score
240
+ a_score <=> b_score
241
+ elsif a_prefix and b_prefix and a_prefix != b_prefix
242
+ a_prefix <=> b_prefix
243
+ else
244
+ # randomly choose
245
+ # maybe later i can figure out how big the inputs are and apply occam's razor
246
+ rand(2) == 1 ? -1 : 1
247
+ end
248
+ end
249
+ end
250
+
251
+ def t_options(str)
252
+ return @_t_options[str] if @_t_options.andand.has_key?(str)
253
+ @_t_options ||= Hash.new
254
+ ary = Array.new
255
+ ary.push T.new(str, str)
256
+ tightenings.each do |regexp|
257
+ if match_data = regexp.match(str)
258
+ ary.push T.new(str, match_data.captures.compact.join)
259
+ end
260
+ end
261
+ @_t_options[str] = ary
262
+ end
263
+
264
# The "identity" of a string under one identity regexp: the regexp's
# non-nil capture groups joined together. Two strings that match the same
# regexp but yield different identities are treated as a collision by
# collision?.
class I
  attr_reader :regexp, :str, :case_sensitive, :identity

  # regexp         - Regexp with capture groups. It must match +str+;
  #                  a non-matching regexp raises NoMethodError on nil.
  # str            - String the identity is extracted from.
  # case_sensitive - when falsy, the identity is downcased so identities
  #                  that differ only in case compare equal.
  def initialize(regexp, str, case_sensitive)
    @regexp = regexp
    @str = str
    @case_sensitive = case_sensitive
    @identity = regexp.match(str).captures.compact.join
    # Fold case only for case-INsensitive dictionaries (the condition used
    # to be inverted).
    @identity = @identity.downcase unless case_sensitive
  end
end
273
+
274
+ def collision?(i_options_left, i_options_right)
275
+ i_options_left.any? do |r_left|
276
+ i_options_right.any? do |r_right|
277
+ r_left.regexp == r_right.regexp and r_left.identity != r_right.identity
278
+ end
279
+ end
280
+ end
281
+
282
+ def i_options(str)
283
+ return @_i_options[str] if @_i_options.andand.has_key?(str)
284
+ @_i_options ||= Hash.new
285
+ ary = Array.new
286
+ identities.each do |regexp|
287
+ if regexp.match str
288
+ ary.push I.new(regexp, str, case_sensitive)
289
+ end
290
+ end
291
+ @_i_options[str] = ary
292
+ end
293
+
294
+ def blocking(str)
295
+ return @_blocking[str] if @_blocking.andand.has_key?(str)
296
+ @_blocking ||= Hash.new
297
+ blockings.each do |regexp|
298
+ if regexp.match str
299
+ return @_blocking[str] = regexp
300
+ end
301
+ end
302
+ @_blocking[str] = nil
303
+ end
304
+
305
# Compiles a regexp written as a literal string, e.g. '/(dh)c?-?(\d+)/i',
# into a Regexp. Results are memoized per input string.
#
# str - String holding the pattern. A leading "/" and a trailing
#       "/" followed by any of the flags i, x, m are stripped.
#
# Flags:
#   i - ignore case; also forced on whenever the dictionary is not
#       case_sensitive
#   m - Regexp::MULTILINE
#   x - Regexp::EXTENDED
#
# Returns the compiled Regexp.
def literal_regexp(str)
  @_literal_regexp ||= Hash.new
  return @_literal_regexp[str] if @_literal_regexp.has_key?(str)

  # Only text after the closing slash counts as flags; the pattern body
  # itself must never be mistaken for them (e.g. '/maximum/').
  flags = str[%r{/([ixm]*)\z}, 1].to_s
  source = str.gsub(%r{\A/|/[ixm]*\z}, '')

  # Regexp options are a bit mask, so they are OR-ed together rather than
  # picking the first one that is set.
  options = 0
  options |= Regexp::IGNORECASE if !case_sensitive || flags.include?('i')
  options |= Regexp::MULTILINE if flags.include?('m')
  options |= Regexp::EXTENDED if flags.include?('x')

  @_literal_regexp[str] = Regexp.new(source, options)
end
314
+
315
+ def read_left(left_record)
316
+ return if left_record.nil?
317
+ if left_reader
318
+ left_reader.call(left_record)
319
+ elsif left_record.is_a?(String)
320
+ left_record
321
+ else
322
+ left_record[0]
323
+ end
324
+ end
325
+
326
+ def read_right(right_record)
327
+ return if right_record.nil?
328
+ if right_reader
329
+ right_reader.call(right_record)
330
+ elsif right_record.is_a?(String)
331
+ right_record
332
+ else
333
+ right_record[0]
334
+ end
335
+ end
336
+
337
+ # Thanks William James!
338
+ # http://www.ruby-forum.com/topic/95519#200484
339
+ def cart_prod(*args)
340
+ args.inject([[]]){|old,lst|
341
+ new = []
342
+ lst.each{|e| new += old.map{|c| c.dup << e }}
343
+ new
344
+ }
345
+ end
346
+ end
@@ -0,0 +1,12 @@
1
+ require 'rubygems'
2
+ require 'test/unit'
3
+ require 'shoulda'
4
+ require 'logger'
5
+ require 'ruby-debug'
6
+
7
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
8
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
9
+ require 'loose_tight_dictionary'
10
+
11
+ class Test::Unit::TestCase
12
+ end
@@ -0,0 +1,273 @@
1
+ require 'helper'
2
+
3
+ require 'remote_table'
4
+
5
+ # $logger = Logger.new STDERR
6
+ # $logger.level = Logger::INFO
7
+ # $tee = STDOUT
8
+
9
+ class TestLooseTightDictionary < Test::Unit::TestCase
10
+ def setup
11
+ clear_ltd
12
+
13
+ # dh 8 400
14
+ @a_left = ['DE HAVILLAND CANADA DHC8400 Dash 8']
15
+ @a_right = ['DEHAVILLAND DEHAVILLAND DHC8-400 DASH-8']
16
+ # dh 88
17
+ @b_left = ['ABCDEFG DH88 HIJKLMNOP']
18
+ # dh 89
19
+ @c_right = ['ABCDEFG DH89 HIJKLMNOP']
20
+ # dh 8 200
21
+ @d_left = ['DE HAVILLAND CANADA DHC8200 Dash 8']
22
+ @d_right = ['BOMBARDIER DEHAVILLAND DHC8-200Q DASH-8']
23
+ @d_lookalike = ['ABCD DHC8200 Dash 8']
24
+
25
+ @t_1 = [ '/(dh)c?-?(\d{0,2})-?(\d{0,4})(?:.*?)(dash|\z)/i', 'good tightening for de havilland' ]
26
+
27
+ @r_1 = [ '/(dh)c?-?(\d{0,2})-?(\d{0,4})(?:.*?)(dash|\z)/i', 'good identity for de havilland' ]
28
+
29
+ @left = [
30
+ @a_left,
31
+ @b_left,
32
+ ['DE HAVILLAND DH89 Dragon Rapide'],
33
+ ['DE HAVILLAND CANADA DHC8100 Dash 8 (E9, CT142, CC142)'],
34
+ @d_left,
35
+ ['DE HAVILLAND CANADA DHC8300 Dash 8'],
36
+ ['DE HAVILLAND DH90 Dragonfly']
37
+ ]
38
+ @right = [
39
+ @a_right,
40
+ @c_right,
41
+ @d_right,
42
+ ['DEHAVILLAND DEHAVILLAND DHC8-100 DASH-8'],
43
+ ['DEHAVILLAND DEHAVILLAND TWIN OTTER DHC-6']
44
+ ]
45
+ @tightenings = []
46
+ @identities = []
47
+ @blockings = []
48
+ @positives = []
49
+ @negatives = []
50
+ end
51
+
52
+ def clear_ltd
53
+ @_ltd = nil
54
+ end
55
+
56
+ def ltd
57
+ @_ltd ||= LooseTightDictionary.new @right,
58
+ :tightenings => @tightenings,
59
+ :identities => @identities,
60
+ :blockings => @blockings,
61
+ :positives => @positives,
62
+ :negatives => @negatives,
63
+ :blocking_only => @blocking_only,
64
+ :logger => $logger,
65
+ :tee => $tee
66
+ end
67
+
68
+ if ENV['NEW'] == 'true' or ENV['ALL'] == 'true'
69
+ end
70
+
71
+ if ENV['OLD'] == 'true' or ENV['ALL'] == 'true'
72
+ should "optionally only pay attention to things that match blockings" do
73
+ assert_equal @a_right, ltd.left_to_right(@a_left)
74
+
75
+ clear_ltd
76
+ @blocking_only = true
77
+ assert_equal nil, ltd.left_to_right(@a_left)
78
+
79
+ clear_ltd
80
+ @blocking_only = true
81
+ @blockings.push ['/dash/i']
82
+ assert_equal @a_right, ltd.left_to_right(@a_left)
83
+ end
84
+
85
+ # the example from the readme, considerably uglier here
86
+ should "check a simple table" do
87
+ @right = [ 'seamus', 'andy', 'ben' ]
88
+ @positives = [ [ 'seamus', 'Mr. Seamus Abshere' ] ]
89
+ left = [ 'Mr. Seamus Abshere', 'Sr. Andy Rossmeissl', 'Master BenT' ]
90
+
91
+ assert_nothing_raised do
92
+ ltd.check left
93
+ end
94
+ end
95
+
96
+ should "treat a String as a full record if passed through" do
97
+ dash = 'DHC8-400'
98
+ b747 = 'B747200/300'
99
+ dc9 = 'DC-9-10'
100
+ right_records = [ dash, b747, dc9 ]
101
+ simple_ltd = LooseTightDictionary.new right_records, :logger => $logger, :tee => $tee
102
+ assert_equal dash, simple_ltd.left_to_right('DeHavilland Dash-8 DHC-400')
103
+ assert_equal b747, simple_ltd.left_to_right('Boeing 747-300')
104
+ assert_equal dc9, simple_ltd.left_to_right('McDonnell Douglas MD81/DC-9')
105
+ end
106
+
107
+ should "call it a mismatch if you hit a blank positive" do
108
+ @positives.push [@a_left[0], '']
109
+ assert_raises(LooseTightDictionary::Mismatch) do
110
+ ltd.left_to_right @a_left
111
+ end
112
+ end
113
+
114
+ should "call it a false positive if you hit a blank negative" do
115
+ @negatives.push [@a_left[0], '']
116
+ assert_raises(LooseTightDictionary::FalsePositive) do
117
+ ltd.left_to_right @a_left
118
+ end
119
+ end
120
+
121
+ should "have a false match without blocking" do
122
+ # @d_left will be our victim
123
+ @right.push @d_lookalike
124
+ @tightenings.push @t_1
125
+
126
+ assert_equal @d_lookalike, ltd.left_to_right(@d_left)
127
+ end
128
+
129
+ should "do blocking if the left matches a block" do
130
+ # @d_left will be our victim
131
+ @right.push @d_lookalike
132
+ @tightenings.push @t_1
133
+ @blockings.push ['/(bombardier|de ?havilland)/i']
134
+
135
+ assert_equal @d_right, ltd.left_to_right(@d_left)
136
+ end
137
+
138
+ should "treat blocks as exclusive" do
139
+ @right = [ @d_left ]
140
+ @tightenings.push @t_1
141
+ @blockings.push ['/(bombardier|de ?havilland)/i']
142
+
143
+ assert_equal nil, ltd.left_to_right(@d_lookalike)
144
+ end
145
+
146
+ should "only use identities if they stem from the same regexp" do
147
+ @identities.push @r_1
148
+ @identities.push [ '/(cessna)(?:.*?)(citation)/i' ]
149
+ @identities.push [ '/(cessna)(?:.*?)(\d\d\d)/i' ]
150
+ x_left = [ 'CESSNA D-333 CITATION V']
151
+ x_right = [ 'CESSNA D-333' ]
152
+ @right.push x_right
153
+
154
+ assert_equal x_right, ltd.left_to_right(x_left)
155
+ end
156
+
157
+ should "use the best score from all of the tightenings" do
158
+ x_left = ["BOEING 737100"]
159
+ x_right = ["BOEING BOEING 737-100/200"]
160
+ x_right_wrong = ["BOEING BOEING 737-900"]
161
+ @right.push x_right
162
+ @right.push x_right_wrong
163
+ @tightenings.push ['/(7\d)(7|0)-?\d{1,3}\/(\d\d\d)/i']
164
+ @tightenings.push ['/(7\d)(7|0)-?(\d{1,3}|[A-Z]{0,3})/i']
165
+
166
+ assert_equal x_right, ltd.left_to_right(x_left)
167
+ end
168
+
169
+ should "compare using prefixes if tightened key is shorter than correct match" do
170
+ x_left = ["BOEING 720"]
171
+ x_right = ["BOEING BOEING 720-000"]
172
+ x_right_wrong = ["BOEING BOEING 717-200"]
173
+ @right.push x_right
174
+ @right.push x_right_wrong
175
+ @tightenings.push @t_1
176
+ @tightenings.push ['/(7\d)(7|0)-?\d{1,3}\/(\d\d\d)/i']
177
+ @tightenings.push ['/(7\d)(7|0)-?(\d{1,3}|[A-Z]{0,3})/i']
178
+
179
+ assert_equal x_right, ltd.left_to_right(x_left)
180
+ end
181
+
182
+ should "use the shortest original input" do
183
+ x_left = ['De Havilland DHC8-777 Dash-8 Superstar']
184
+ x_right = ['DEHAVILLAND DEHAVILLAND DHC8-777 DASH-8 Superstar']
185
+ x_right_long = ['DEHAVILLAND DEHAVILLAND DHC8-777 DASH-8 Superstar/Supernova']
186
+
187
+ @right.push x_right_long
188
+ @right.push x_right
189
+ @tightenings.push @t_1
190
+
191
+ assert_equal x_right, ltd.left_to_right(x_left)
192
+ end
193
+
194
+ should "perform lookups left to right" do
195
+ assert_equal @a_right, ltd.left_to_right(@a_left)
196
+ end
197
+
198
+ should "succeed if there are no checks" do
199
+ assert_nothing_raised do
200
+ ltd.check @left
201
+ end
202
+ end
203
+
204
+ should "succeed if the positive checks just work" do
205
+ @positives.push [ @a_left[0], @a_right[0] ]
206
+
207
+ assert_nothing_raised do
208
+ ltd.check @left
209
+ end
210
+ end
211
+
212
+ should "fail if positive checks don't work" do
213
+ @positives.push [ @d_left[0], @d_right[0] ]
214
+
215
+ assert_raises(LooseTightDictionary::Mismatch) do
216
+ ltd.check @left
217
+ end
218
+ end
219
+
220
+ should "succeed if proper tightening is applied" do
221
+ @positives.push [ @d_left[0], @d_right[0] ]
222
+ @tightenings.push @t_1
223
+
224
+ assert_nothing_raised do
225
+ ltd.check @left
226
+ end
227
+ end
228
+
229
+ should "use a Google Docs spreadsheet as a source of tightenings" do
230
+ @positives.push [ @d_left[0], @d_right[0] ]
231
+ @tightenings = RemoteTable.new :url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=0&output=csv', :headers => false
232
+
233
+ assert_nothing_raised do
234
+ ltd.check @left
235
+ end
236
+ end
237
+
238
+ should "fail if negative checks don't work" do
239
+ @negatives.push [ @b_left[0], @c_right[0] ]
240
+
241
+ assert_raises(LooseTightDictionary::FalsePositive) do
242
+ ltd.check @left
243
+ end
244
+ end
245
+
246
+ should "do inline checking" do
247
+ @negatives.push [ @b_left[0], @c_right[0] ]
248
+
249
+ assert_raises(LooseTightDictionary::FalsePositive) do
250
+ ltd.left_to_right @b_left
251
+ end
252
+ end
253
+
254
+ should "fail if negative checks don't work, even with tightening" do
255
+ @negatives.push [ @b_left[0], @c_right[0] ]
256
+ @tightenings.push @t_1
257
+
258
+ assert_raises(LooseTightDictionary::FalsePositive) do
259
+ ltd.check @left
260
+ end
261
+ end
262
+
263
+ should "succeed if proper identity is applied" do
264
+ @negatives.push [ @b_left[0], @c_right[0] ]
265
+ @positives.push [ @d_left[0], @d_right[0] ]
266
+ @identities.push @r_1
267
+
268
+ assert_nothing_raised do
269
+ ltd.check @left
270
+ end
271
+ end
272
+ end
273
+ end
metadata ADDED
@@ -0,0 +1,175 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: loose_tight_dictionary-ruby19
3
+ version: !ruby/object:Gem::Version
4
+ hash: 15
5
+ prerelease: false
6
+ segments:
7
+ - 0
8
+ - 0
9
+ - 8
10
+ version: 0.0.8
11
+ platform: ruby
12
+ authors:
13
+ - Seamus Abshere
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2010-09-27 00:00:00 -05:00
19
+ default_executable:
20
+ dependencies:
21
+ - !ruby/object:Gem::Dependency
22
+ name: shoulda
23
+ prerelease: false
24
+ requirement: &id001 !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ">="
28
+ - !ruby/object:Gem::Version
29
+ hash: 3
30
+ segments:
31
+ - 0
32
+ version: "0"
33
+ type: :development
34
+ version_requirements: *id001
35
+ - !ruby/object:Gem::Dependency
36
+ name: remote_table
37
+ prerelease: false
38
+ requirement: &id002 !ruby/object:Gem::Requirement
39
+ none: false
40
+ requirements:
41
+ - - ">="
42
+ - !ruby/object:Gem::Version
43
+ hash: 49
44
+ segments:
45
+ - 0
46
+ - 2
47
+ - 19
48
+ version: 0.2.19
49
+ type: :development
50
+ version_requirements: *id002
51
+ - !ruby/object:Gem::Dependency
52
+ name: activesupport
53
+ prerelease: false
54
+ requirement: &id003 !ruby/object:Gem::Requirement
55
+ none: false
56
+ requirements:
57
+ - - ">="
58
+ - !ruby/object:Gem::Version
59
+ hash: 11
60
+ segments:
61
+ - 2
62
+ - 3
63
+ - 4
64
+ version: 2.3.4
65
+ type: :runtime
66
+ version_requirements: *id003
67
+ - !ruby/object:Gem::Dependency
68
+ name: fastercsv
69
+ prerelease: false
70
+ requirement: &id004 !ruby/object:Gem::Requirement
71
+ none: false
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ hash: 5
76
+ segments:
77
+ - 1
78
+ - 5
79
+ - 3
80
+ version: 1.5.3
81
+ type: :runtime
82
+ version_requirements: *id004
83
+ - !ruby/object:Gem::Dependency
84
+ name: andand
85
+ prerelease: false
86
+ requirement: &id005 !ruby/object:Gem::Requirement
87
+ none: false
88
+ requirements:
89
+ - - ">="
90
+ - !ruby/object:Gem::Version
91
+ hash: 25
92
+ segments:
93
+ - 1
94
+ - 3
95
+ - 1
96
+ version: 1.3.1
97
+ type: :runtime
98
+ version_requirements: *id005
99
+ - !ruby/object:Gem::Dependency
100
+ name: amatch
101
+ prerelease: false
102
+ requirement: &id006 !ruby/object:Gem::Requirement
103
+ none: false
104
+ requirements:
105
+ - - ">="
106
+ - !ruby/object:Gem::Version
107
+ hash: 29
108
+ segments:
109
+ - 0
110
+ - 2
111
+ - 5
112
+ version: 0.2.5
113
+ type: :runtime
114
+ version_requirements: *id006
115
+ description: Create dictionaries that link rows between two tables (left and right) using loose matching (string similarity) by default and tight matching (regexp) by request.
116
+ email: seamus@abshere.net
117
+ executables: []
118
+
119
+ extensions: []
120
+
121
+ extra_rdoc_files:
122
+ - LICENSE
123
+ - README.rdoc
124
+ files:
125
+ - .document
126
+ - .gitignore
127
+ - LICENSE
128
+ - README.rdoc
129
+ - Rakefile
130
+ - VERSION
131
+ - examples/first_name_matching.rb
132
+ - examples/icao-bts.rb
133
+ - examples/icao-bts.xls
134
+ - lib/loose_tight_dictionary.rb
135
+ - test/helper.rb
136
+ - test/test_loose_tight_dictionary.rb
137
+ has_rdoc: true
138
+ homepage: http://github.com/seamusabshere/loose_tight_dictionary
139
+ licenses: []
140
+
141
+ post_install_message:
142
+ rdoc_options:
143
+ - --charset=UTF-8
144
+ require_paths:
145
+ - lib
146
+ required_ruby_version: !ruby/object:Gem::Requirement
147
+ none: false
148
+ requirements:
149
+ - - ">="
150
+ - !ruby/object:Gem::Version
151
+ hash: 3
152
+ segments:
153
+ - 0
154
+ version: "0"
155
+ required_rubygems_version: !ruby/object:Gem::Requirement
156
+ none: false
157
+ requirements:
158
+ - - ">="
159
+ - !ruby/object:Gem::Version
160
+ hash: 3
161
+ segments:
162
+ - 0
163
+ version: "0"
164
+ requirements: []
165
+
166
+ rubyforge_project:
167
+ rubygems_version: 1.3.7
168
+ signing_key:
169
+ specification_version: 3
170
+ summary: Allows iterative development of dictionaries for big data sets.
171
+ test_files:
172
+ - test/helper.rb
173
+ - test/test_loose_tight_dictionary.rb
174
+ - examples/first_name_matching.rb
175
+ - examples/icao-bts.rb