loose_tight_dictionary 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.document ADDED
@@ -0,0 +1,5 @@
1
+ README.rdoc
2
+ lib/**/*.rb
3
+ bin/*
4
+ features/**/*.feature
5
+ LICENSE
data/.gitignore ADDED
@@ -0,0 +1,21 @@
1
+ ## MAC OS
2
+ .DS_Store
3
+
4
+ ## TEXTMATE
5
+ *.tmproj
6
+ tmtags
7
+
8
+ ## EMACS
9
+ *~
10
+ \#*
11
+ .\#*
12
+
13
+ ## VIM
14
+ *.swp
15
+
16
+ ## PROJECT::GENERAL
17
+ coverage
18
+ rdoc
19
+ pkg
20
+
21
+ ## PROJECT::SPECIFIC
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2009 Seamus Abshere
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.rdoc ADDED
@@ -0,0 +1,17 @@
1
+ = loose_tight_dictionary
2
+
3
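+ Create dictionaries that link rows between two tables (left and right) using loose matching (string similarity) by default and tight matching (regexp) by request.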
4
+
5
+ == Note on Patches/Pull Requests
6
+
7
+ * Fork the project.
8
+ * Make your feature addition or bug fix.
9
+ * Add tests for it. This is important so I don't break it in a
10
+ future version unintentionally.
11
+ * Commit, do not mess with rakefile, version, or history.
12
+ (If you want to have your own version, that is fine, but bump the version in a separate commit that I can ignore when I pull.)
13
+ * Send me a pull request. Bonus points for topic branches.
14
+
15
+ == Copyright
16
+
17
+ Copyright (c) 2010 Seamus Abshere. See LICENSE for details.
data/Rakefile ADDED
@@ -0,0 +1,58 @@
1
# Rake tasks for the loose_tight_dictionary gem: packaging (via Jeweler),
# tests, coverage (via RCov) and RDoc generation.
require 'rubygems'
require 'rake'

# Packaging/release tasks come from Jeweler; the rest of the Rakefile keeps
# working without it.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |gem|
    gem.name = "loose_tight_dictionary"
    gem.summary = %Q{Allows iterative development of dictionaries for big data sets.}
    gem.description = %Q{Create dictionaries that link rows between two tables (left and right) using loose matching (string similarity) by default and tight matching (regexp) by request.}
    gem.email = "seamus@abshere.net"
    gem.homepage = "http://github.com/seamusabshere/loose_tight_dictionary"
    gem.authors = ["Seamus Abshere"]
    gem.add_development_dependency "shoulda"
    gem.add_development_dependency "remote_table", ">=0.2.16"
    gem.add_dependency 'activesupport', '>=2.3.4'
    gem.add_dependency 'fastercsv', '>=1.5.3'
    gem.add_dependency 'andand', '>=1.3.1'
    gem.add_dependency 'amatch', '>=0.2.5'
    # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
  end
  Jeweler::GemcutterTasks.new
rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
end

require 'rake/testtask'
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib' << 'test'
  test.pattern = 'test/**/test_*.rb'
  test.verbose = true
end

# Coverage task; when RCov is missing, `rake rcov` explains how to get it.
begin
  require 'rcov/rcovtask'
  Rcov::RcovTask.new do |test|
    test.libs << 'test'
    test.pattern = 'test/**/test_*.rb'
    test.verbose = true
  end
rescue LoadError
  task :rcov do
    abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
  end
end

# Only depend on check_dependencies when some loaded library actually defined
# it; otherwise `rake test` would abort on an unknown prerequisite whenever
# Jeweler failed to load above.
task(:test => :check_dependencies) if Rake::Task.task_defined?(:check_dependencies)

task :default => :test

# Newer Rake/RDoc releases ship the task as RDoc::Task in rdoc/task; fall back
# to the legacy rake/rdoctask location on old installations.
begin
  require 'rdoc/task'
  rdoc_task_class = RDoc::Task
rescue LoadError
  require 'rake/rdoctask'
  rdoc_task_class = Rake::RDocTask
end

rdoc_task_class.new do |rdoc|
  version = File.exist?('VERSION') ? File.read('VERSION') : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "loose_tight_dictionary #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.0.1
@@ -0,0 +1,58 @@
1
#!/usr/bin/env ruby

# Example: matches aircraft listed on the FAA pages (left side, one page per
# letter A-Z) against the BTS aircraft type table (right side), using
# tightenings, restrictions, blockings, positives and negatives kept in a
# published Google spreadsheet. Every table is fetched over HTTP.

require 'rubygems'
require 'remote_table'
require 'ruby-debug'
require 'logger'
require File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib', 'loose_tight_dictionary.rb'))

$logger = Logger.new(STDERR)
$logger.level = Logger::DEBUG
$logger.datetime_format = "%H:%M:%S"
# $tee = File.open('tee.csv', 'w')
$tee = STDOUT

# Debugging hooks read by LooseTightDictionary (all disabled here):
# $ltd_left = /(super|bonanza)/i
# $ltd_right = /bonanza d-35/i
# $ltd_dd_left = /bonanza/i
# $ltd_dd_right = /musk/i
# $ltd_dd_left_not = /allison/i
# $ltd_dd_print = true
# $ltd_ddd_left = /bonanza/i
# $ltd_ddd_right = /musk/i
# $ltd_ddd_left_not = /allison/i
# $ltd_ddd_print = true

# Right-hand table: keep rows with an aircraft type in 1..998 and a manufacturer.
@right = RemoteTable.new(
  :url => 'http://www.bts.gov/programs/airline_information/accounting_and_reporting_directives/csv/number_260.csv',
  :select => lambda { |row| row['Aircraft Type'].to_i.between?(1, 998) and row['Manufacturer'].present? }
)

# Rule and check tables, one spreadsheet tab (gid) each.
@tightenings = RemoteTable.new(:url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=0&output=csv', :headers => false)

@restrictions = RemoteTable.new(:url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=3&output=csv', :headers => false)

@blockings = RemoteTable.new(:url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=4&output=csv', :headers => false)

@positives = RemoteTable.new(:url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=1&output=csv', :headers => false)

@negatives = RemoteTable.new(:url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=2&output=csv', :headers => false)

# Log the first column of each rule table so the rules in effect are visible.
%w{ tightenings restrictions blockings }.each do |table_name|
  first_column = instance_variable_get("@#{table_name}").to_a.map { |row| row[0] }
  $logger.info table_name
  $logger.info "\n" + first_column.join("\n")
  $logger.info "\n"
end

('A'..'Z').each do |letter|
# %w{ E }.each do |letter|
  @left = RemoteTable.new(
    :url => "http://www.faa.gov/air_traffic/publications/atpubs/CNT/5-2-#{letter}.htm",
    :encoding => 'US-ASCII',
    :row_xpath => '//table/tr[2]/td/table/tr',
    :column_xpath => 'td'
  )

  dictionary = LooseTightDictionary.new(@right,
    :tightenings => @tightenings,
    :restrictions => @restrictions,
    :blockings => @blockings,
    :logger => $logger,
    :tee => $tee)
  dictionary.left_reader = lambda { |row| row['Manufacturer'] + ' ' + row['Model'] }
  dictionary.right_reader = lambda { |row| row['Manufacturer'] + ' ' + row['Long Name'] }
  dictionary.positives = @positives
  dictionary.negatives = @negatives
  dictionary.check(@left)
end
Binary file
@@ -0,0 +1,270 @@
1
+ require 'active_support'
2
+ require 'active_support/version'
3
+ %w{
4
+ active_support/core_ext/string
5
+ }.each do |active_support_3_requirement|
6
+ require active_support_3_requirement
7
+ end if ActiveSupport::VERSION::MAJOR == 3
8
+ require 'amatch'
9
+ require 'andand'
10
+ require 'fastercsv'
11
+
12
+ class LooseTightDictionary
13
# Not raised anywhere in this file.
MissedChecks = Class.new(RuntimeError)
# Raised by inline_check when a positive check names a different right-hand value.
Mismatch = Class.new(RuntimeError)
# Raised by inline_check when a match equals a value listed in the negatives.
FalsePositive = Class.new(RuntimeError)
16
+
17
# Pairs an original string with one candidate "tightened" form of it.
class T
  attr_reader :str, :tightened_str

  def initialize(str, tightened_str)
    @str, @tightened_str = str, tightened_str
  end

  # True when the tightened form differs from the original string.
  def tightened?
    str != tightened_str
  end

  # Scores this string against +other+ with pair_distance_similar.
  #
  # When both sides are tightened, only the first N characters of each
  # tightened string are compared, N being the shorter tightened length.
  #
  # Returns [prefix, score]; prefix is nil when the full strings were compared.
  def prefix_and_score(other)
    prefix = nil
    if tightened? and other.tightened?
      prefix = [tightened_str.length, other.tightened_str.length].min
    end

    mine = tightened_str
    theirs = other.tightened_str
    if prefix
      mine = mine.first(prefix)
      theirs = theirs.first(prefix)
    end

    [prefix, mine.pair_distance_similar(theirs)]
  end
end
38
+
39
+ include Amatch
40
+
41
+ attr_reader :right_records
42
+ attr_reader :tightenings
43
+ attr_reader :restrictions
44
+ attr_reader :blockings
45
+ attr_reader :logger
46
+ attr_reader :tee
47
+ attr_reader :case_sensitive
48
+
49
+ attr_accessor :positives
50
+ attr_accessor :negatives
51
+ attr_accessor :left_reader
52
+ attr_accessor :right_reader
53
+
54
# Builds a dictionary over +right_records+.
#
# options - Hash with the optional keys :tightenings, :restrictions and
#           :blockings (each defaulting to an empty Array), :left_reader,
#           :right_reader, :positives, :negatives, :logger, :tee (all left
#           nil when absent) and :case_sensitive (defaulting to false).
def initialize(right_records, options = {})
  @right_records = right_records

  [:tightenings, :restrictions, :blockings].each do |key|
    instance_variable_set "@#{key}", (options[key] || Array.new)
  end

  [:left_reader, :right_reader, :positives, :negatives, :logger, :tee].each do |key|
    instance_variable_set "@#{key}", options[key]
  end

  @case_sensitive = options[:case_sensitive] || false
end
67
+
68
# Verifies one left/right pairing against the positives and negatives.
#
# Does nothing when neither positives nor negatives are present. Otherwise
# the first positive and the first negative whose first column equals the
# left string are looked up.
#
# Raises Mismatch when a positive exists and the right string differs from
# its second column.
# Raises FalsePositive when a negative exists and the right string equals its
# second column.
def inline_check(left_record, right_record)
  return if !positives.present? and !negatives.present?

  left_str = read_left(left_record)
  right_str = read_right(right_record)

  positive = positives && positives.detect { |row| row[0] == left_str }
  expected_right = positive ? positive[1] : :ignore

  negative = negatives && negatives.detect { |row| row[0] == left_str }
  forbidden_right = negative ? negative[1] : :ignore

  unless expected_right == :ignore or right_str == expected_right
    logger.debug " Mismatch! (should be #{expected_right})" if logger
    raise Mismatch
  end

  unless forbidden_right == :ignore or right_str != forbidden_right
    logger.debug " False positive! (should NOT be #{forbidden_right})" if logger
    raise FalsePositive
  end
end
96
+
97
# Matches every left record and verifies each match with inline_check.
#
# Returns early (after an info log line) when neither positives nor
# negatives are present. For each record, a CSV row made of the left string,
# the matched right string and the contents of $ltd_1 is written to tee,
# when a tee is set.
def check(left_records)
  if !positives.present? and !negatives.present?
    logger.info "You didn't define any positives or negatives, so running check doesn't do anything" if logger
    return
  end

  left_records.each do |left_record|
    matched = left_to_right(left_record)
    inline_check(left_record, matched)
    csv_row = [read_left(left_record), read_right(matched), $ltd_1].flatten.to_csv
    tee.puts csv_row if tee
  end
end
108
+
109
# Finds the right record that best matches +left_record+.
#
# Candidates are the right records whose string matches the left string's
# blocking regexp (all of them when no blocking applies). They are ranked
# with Enumerable#max using these rules, in order:
#   * a candidate whose restriction differs from the left's restriction
#     loses; if both differ, the order is random;
#   * otherwise the higher score of the best tightening pair wins;
#   * on equal scores, and when both were compared by prefix, the longer
#     prefix wins;
#   * otherwise the shorter right string wins.
#
# Side effects: stores the winner's [left tightened, right tightened, prefix
# or 'NULL', score] in $ltd_1 (nil when the winner was never scored), and
# runs inline_check on the result. The $ltd_* globals are debugging hooks.
#
# Returns the winning right record, or nil when its restriction differs from
# the left's restriction.
def left_to_right(left_record)
  left = read_left(left_record)
  restricted_left = restrict(left)
  blocking_left = blocking(left)
  t_options_left = t_options(left)
  history = Hash.new

  candidates = right_records.select do |record|
    blocking_left.nil? or blocking_left.match(read_right(record))
  end

  right_record = candidates.max do |a_record, b_record|
    a = read_right(a_record)
    b = read_right(b_record)
    restricted_a = restrict(a)
    restricted_b = restrict(b)
    a_conflicts = (restricted_left and restricted_a and restricted_left != restricted_a)
    b_conflicts = (restricted_left and restricted_b and restricted_left != restricted_b)

    if a_conflicts and b_conflicts
      # neither would ever work, so randomly rank one over the other
      next(rand(2) == 1 ? -1 : 1)
    end
    next(-1) if a_conflicts
    next(1) if b_conflicts

    t_left_a, t_right_a = optimize(t_options_left, t_options(a))
    t_left_b, t_right_b = optimize(t_options_left, t_options(b))
    a_prefix, a_score = t_left_a.prefix_and_score(t_right_a)
    b_prefix, b_score = t_left_b.prefix_and_score(t_right_b)
    history[a_record] = [t_left_a.tightened_str, t_right_a.tightened_str, a_prefix || 'NULL', a_score]
    history[b_record] = [t_left_b.tightened_str, t_right_b.tightened_str, b_prefix || 'NULL', b_score]

    # Debug hook: is this comparison one the $ltd_dd_* globals ask to watch?
    lefts = [t_left_a, t_left_b]
    rights = [t_right_a, t_right_b]
    yep_dd = ($ltd_dd_right &&
              $ltd_dd_left &&
              lefts.any? { |f| f.str =~ $ltd_dd_left } &&
              rights.any? { |f| f.str =~ $ltd_dd_right } &&
              (!$ltd_dd_left_not || lefts.none? { |f| f.str =~ $ltd_dd_left_not }))

    if $ltd_dd_print and yep_dd
      [t_left_a, t_right_a, t_left_b, t_right_b].each do |t|
        logger.debug t.inspect if logger
      end
      logger.debug if logger
    end

    debugger if yep_dd

    if a_score != b_score
      a_score <=> b_score
    elsif a_prefix and b_prefix and a_prefix != b_prefix
      a_prefix <=> b_prefix
    else
      b.length <=> a.length
    end
  end

  $ltd_1 = history[right_record]
  right = read_right(right_record)
  restricted_right = restrict(right)

  debugger if ($ltd_left && $ltd_left.match(left)) or ($ltd_right && $ltd_right.match(right))

  return if restricted_left and restricted_right and restricted_left != restricted_right
  inline_check(left_record, right_record)
  right_record
end
168
+
169
# Picks the best-scoring [left T, right T] pair out of every combination of
# the given left and right tightening options.
#
# Pairs are ranked by score; on equal scores, and when both were compared by
# prefix, the longer prefix wins; remaining ties are broken at random.
# The $ltd_ddd_* globals are debugging hooks.
def optimize(t_options_left, t_options_right)
  pairs = cart_prod(t_options_left, t_options_right)

  pairs.max do |(t_left_a, t_right_a), (t_left_b, t_right_b)|
    a_prefix, a_score = t_left_a.prefix_and_score(t_right_a)
    b_prefix, b_score = t_left_b.prefix_and_score(t_right_b)

    # Debug hook: is this comparison one the $ltd_ddd_* globals ask to watch?
    lefts = [t_left_a, t_left_b]
    rights = [t_right_a, t_right_b]
    yep_ddd = ($ltd_ddd_right &&
               $ltd_ddd_left &&
               lefts.any? { |f| f.str =~ $ltd_ddd_left } &&
               rights.any? { |f| f.str =~ $ltd_ddd_right } &&
               (!$ltd_ddd_left_not || lefts.none? { |f| f.str =~ $ltd_ddd_left_not }))

    if $ltd_ddd_print and yep_ddd
      [t_left_a, t_right_a, t_left_b, t_right_b].each do |t|
        logger.debug t.inspect if logger
      end
      logger.debug if logger
    end

    debugger if yep_ddd

    next(a_score <=> b_score) if a_score != b_score
    next(a_prefix <=> b_prefix) if a_prefix and b_prefix and a_prefix != b_prefix

    # randomly choose
    # maybe later i can figure out how big the inputs are and apply occam's razor
    rand(2) == 1 ? -1 : 1
  end
end
202
+
203
# Lists every tightening option for +str+ (memoized per string).
#
# The first entry always pairs the string with itself. Each tightening whose
# regexp matches adds an entry whose tightened form is the concatenation of
# the non-nil captures.
def t_options(str)
  @_t_options ||= Hash.new
  return @_t_options[str] if @_t_options.has_key?(str)

  variants = [T.new(str, str)]
  tightenings.each do |tightening|
    hit = literal_regexp(tightening[0]).match(str)
    variants << T.new(str, hit.captures.compact.join) if hit
  end
  @_t_options[str] = variants
end
215
+
216
# Returns the regexp of the first blocking that matches +str+, or nil when
# none does. The answer is memoized per string.
def blocking(str)
  @_blocking ||= Hash.new
  return @_blocking[str] if @_blocking.has_key?(str)

  found = nil
  blockings.each do |row|
    candidate = literal_regexp(row[0])
    next unless candidate.match(str)
    found = candidate
    break
  end
  @_blocking[str] = found
end
227
+
228
# Returns the restricted form of +str+: the concatenated non-nil captures of
# the first restriction whose regexp matches, downcased unless
# case_sensitive is set. Returns nil when no restriction matches.
# The answer is memoized per string.
def restrict(str)
  @_restrict ||= Hash.new
  return @_restrict[str] if @_restrict.has_key?(str)

  restricted = nil
  restrictions.each do |restriction|
    hit = literal_regexp(restriction[0]).match(str)
    next unless hit
    restricted = hit.captures.compact.join
    restricted = restricted.downcase unless case_sensitive
    break
  end
  @_restrict[str] = restricted
end
240
+
241
# Compiles a Regexp from its literal spelling, e.g. "/(dh)c?-?(\d+)/i".
#
# str - String holding the pattern. A leading "/" and a trailing "/"
#       followed by any of the flags i, x and m are stripped from the
#       pattern source.
#
# Matching ignores case unless case_sensitive is true and no "i" flag is
# given. The "m" and "x" flags add Regexp::MULTILINE and Regexp::EXTENDED.
#
# Returns the Regexp; it is memoized per string, so repeated calls return the
# same object.
def literal_regexp(str)
  @_literal_regexp ||= Hash.new
  return @_literal_regexp[str] if @_literal_regexp.has_key?(str)

  # Take flags only from a real trailing "/flags" suffix (the same suffix
  # that is stripped from the source below), so that a flagless pattern such
  # as "/mix/" does not have its body mistaken for flags.
  flags = str[/\/([ixm]*)\z/, 1].to_s

  # Flags are a bitmask and must be OR-ed together; picking the first
  # non-nil one would silently drop the others.
  options = 0
  options |= Regexp::IGNORECASE if !case_sensitive or flags.include?('i')
  options |= Regexp::MULTILINE if flags.include?('m')
  options |= Regexp::EXTENDED if flags.include?('x')

  source = str.gsub(/\A\/|\/([ixm]*)\z/, '')

  # The third ("language") argument only matters on Rubies without string
  # encodings; newer Rubies ignore it for 'U' and recent ones reject it.
  @_literal_regexp[str] =
    if String.method_defined?(:encoding)
      Regexp.new(source, options)
    else
      Regexp.new(source, options, 'U')
    end
end
250
+
251
# Extracts the comparison string from a left record: the result of
# left_reader when one is set, otherwise the record's first element.
# Returns nil for a nil record.
def read_left(left_record)
  return nil if left_record.nil?
  return left_reader.call(left_record) if left_reader
  left_record[0]
end
255
+
256
# Extracts the comparison string from a right record: the result of
# right_reader when one is set, otherwise the record's first element.
# Returns nil for a nil record.
def read_right(right_record)
  return nil if right_record.nil?
  return right_reader.call(right_record) if right_reader
  right_record[0]
end
260
+
261
+ # Thanks William James!
262
+ # http://www.ruby-forum.com/topic/95519#200484
263
# Cartesian product of any number of lists.
#
# Returns an Array of combinations, one element per input list; the first
# list varies fastest. With no lists the result is [[]].
def cart_prod(*args)
  combos = [[]]
  args.each do |list|
    extended = []
    list.each do |item|
      extended.concat(combos.map { |combo| combo + [item] })
    end
    combos = extended
  end
  combos
end
270
+ end
data/test/helper.rb ADDED
@@ -0,0 +1,12 @@
1
+ require 'rubygems'
2
+ require 'test/unit'
3
+ require 'shoulda'
4
+ require 'logger'
5
+ require 'ruby-debug'
6
+
7
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
8
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
9
+ require 'loose_tight_dictionary'
10
+
11
+ class Test::Unit::TestCase
12
+ end
@@ -0,0 +1,211 @@
1
+ require 'helper'
2
+
3
+ require 'remote_table'
4
+
5
+ # $logger = Logger.new STDERR
6
+ # $logger.level = Logger::INFO
7
+ # $tee = STDOUT
8
+
9
+ class TestLooseTightDictionary < Test::Unit::TestCase
10
# Resets the memoized dictionary and rebuilds the fixtures: left/right
# aircraft name rows, one sample tightening, one sample restriction, and
# empty rule/check lists that individual tests fill in.
def setup
  clear_ltd

  # dh 8 400
  @a_left = ['DE HAVILLAND CANADA DHC8400 Dash 8']
  @a_right = ['DEHAVILLAND DEHAVILLAND DHC8-400 DASH-8']
  # dh 88
  @b_left = ['ABCDEFG DH88 HIJKLMNOP']
  # dh 89
  @c_right = ['ABCDEFG DH89 HIJKLMNOP']
  # dh 8 200
  @d_left = ['DE HAVILLAND CANADA DHC8200 Dash 8']
  @d_right = ['BOMBARDIER DEHAVILLAND DHC8-200Q DASH-8']
  @d_lookalike = ['ABCD DHC8200 Dash 8']

  @t_1 = ['/(dh)c?-?(\d{0,2})-?(\d{0,4})(?:.*?)(dash|\z)/i', 'good tightening for de havilland']

  @d_1 = ['/(dh)c?-?(\d{0,2})-?(\d{0,4})(?:.*?)(dash|\z)/i', 'good restriction for de havilland']

  @left = []
  @left << @a_left
  @left << @b_left
  @left << ['DE HAVILLAND DH89 Dragon Rapide']
  @left << ['DE HAVILLAND CANADA DHC8100 Dash 8 (E9, CT142, CC142)']
  @left << @d_left
  @left << ['DE HAVILLAND CANADA DHC8300 Dash 8']
  @left << ['DE HAVILLAND DH90 Dragonfly']

  @right = []
  @right << @a_right
  @right << @c_right
  @right << @d_right
  @right << ['DEHAVILLAND DEHAVILLAND DHC8-100 DASH-8']
  @right << ['DEHAVILLAND DEHAVILLAND TWIN OTTER DHC-6']

  @tightenings, @restrictions, @blockings = [], [], []
  @positives, @negatives = [], []
end
51
+
52
# Drops the memoized dictionary so the next call to ltd builds a new one.
def clear_ltd
  instance_variable_set(:@_ltd, nil)
end
55
+
56
# Memoized dictionary built from the fixture lists as they stand at the
# time of the first call.
def ltd
  return @_ltd if @_ltd

  options = {
    :tightenings => @tightenings,
    :restrictions => @restrictions,
    :blockings => @blockings,
    :positives => @positives,
    :negatives => @negatives,
    :logger => $logger,
    :tee => $tee
  }
  @_ltd = LooseTightDictionary.new(@right, options)
end
66
+
67
# Tests are split into NEW and OLD groups that are switched on through the
# NEW, OLD or ALL environment variables; the NEW group is currently empty.
if ENV['NEW'] == 'true' or ENV['ALL'] == 'true'
end

if ENV['OLD'] == 'true' or ENV['ALL'] == 'true'
  should "have a false match without blocking" do
    # @d_left will be our victim
    @right << @d_lookalike
    @tightenings << @t_1

    assert_equal(@d_lookalike, ltd.left_to_right(@d_left))
  end

  should "do blocking if the left matches a block" do
    # @d_left will be our victim
    @right << @d_lookalike
    @tightenings << @t_1
    @blockings << ['/(bombardier|de ?havilland)/i']

    assert_equal(@d_right, ltd.left_to_right(@d_left))
  end

  should "not do blocking if the left doesn't match any blockings" do
    @tightenings << @t_1
    @blockings << ['/(bombardier|de ?havilland)/i']

    assert_equal(@d_right, ltd.left_to_right(@d_lookalike))
  end

  should "use the best score from all of the tightenings" do
    x_left = ["BOEING 737100"]
    x_right = ["BOEING BOEING 737-100/200"]
    x_right_wrong = ["BOEING BOEING 737-900"]
    @right << x_right
    @right << x_right_wrong
    @tightenings << ['/(7\d)(7|0)-?\d{1,3}\/(\d\d\d)/i']
    @tightenings << ['/(7\d)(7|0)-?(\d{1,3}|[A-Z]{0,3})/i']

    assert_equal(x_right, ltd.left_to_right(x_left))
  end

  should "compare using prefixes if tightened key is shorter than correct match" do
    x_left = ["BOEING 720"]
    x_right = ["BOEING BOEING 720-000"]
    x_right_wrong = ["BOEING BOEING 717-200"]
    @right << x_right
    @right << x_right_wrong
    @tightenings << @t_1
    @tightenings << ['/(7\d)(7|0)-?\d{1,3}\/(\d\d\d)/i']
    @tightenings << ['/(7\d)(7|0)-?(\d{1,3}|[A-Z]{0,3})/i']

    assert_equal(x_right, ltd.left_to_right(x_left))
  end

  should "use the shortest original input" do
    x_left = ['De Havilland DHC8-777 Dash-8 Superstar']
    x_right = ['DEHAVILLAND DEHAVILLAND DHC8-777 DASH-8 Superstar']
    x_right_long = ['DEHAVILLAND DEHAVILLAND DHC8-777 DASH-8 Superstar/Supernova']

    @right << x_right_long
    @right << x_right
    @tightenings << @t_1

    assert_equal(x_right, ltd.left_to_right(x_left))
  end

  should "perform lookups left to right" do
    assert_equal(@a_right, ltd.left_to_right(@a_left))
  end

  should "succeed if there are no checks" do
    assert_nothing_raised { ltd.check(@left) }
  end

  should "succeed if the positive checks just work" do
    @positives << [@a_left[0], @a_right[0]]

    assert_nothing_raised { ltd.check(@left) }
  end

  should "fail if positive checks don't work" do
    @positives << [@d_left[0], @d_right[0]]

    assert_raises(LooseTightDictionary::Mismatch) { ltd.check(@left) }
  end

  should "succeed if proper tightening is applied" do
    @positives << [@d_left[0], @d_right[0]]
    @tightenings << @t_1

    assert_nothing_raised { ltd.check(@left) }
  end

  # Fetches the tightenings over HTTP.
  should "use a Google Docs spreadsheet as a source of tightenings" do
    @positives << [@d_left[0], @d_right[0]]
    @tightenings = RemoteTable.new(:url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=0&output=csv', :headers => false)

    assert_nothing_raised { ltd.check(@left) }
  end

  should "fail if negative checks don't work" do
    @negatives << [@b_left[0], @c_right[0]]

    assert_raises(LooseTightDictionary::FalsePositive) { ltd.check(@left) }
  end

  should "do inline checking" do
    @negatives << [@b_left[0], @c_right[0]]

    assert_raises(LooseTightDictionary::FalsePositive) { ltd.left_to_right(@b_left) }
  end

  should "fail if negative checks don't work, even with tightening" do
    @negatives << [@b_left[0], @c_right[0]]
    @tightenings << @t_1

    assert_raises(LooseTightDictionary::FalsePositive) { ltd.check(@left) }
  end

  should "succeed if proper restriction is applied" do
    @negatives << [@b_left[0], @c_right[0]]
    @positives << [@d_left[0], @d_right[0]]
    @restrictions << @d_1

    assert_nothing_raised { ltd.check(@left) }
  end
end
211
+ end
metadata ADDED
@@ -0,0 +1,156 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: loose_tight_dictionary
3
+ version: !ruby/object:Gem::Version
4
+ prerelease: false
5
+ segments:
6
+ - 0
7
+ - 0
8
+ - 1
9
+ version: 0.0.1
10
+ platform: ruby
11
+ authors:
12
+ - Seamus Abshere
13
+ autorequire:
14
+ bindir: bin
15
+ cert_chain: []
16
+
17
+ date: 2010-04-28 00:00:00 -04:00
18
+ default_executable:
19
+ dependencies:
20
+ - !ruby/object:Gem::Dependency
21
+ name: shoulda
22
+ prerelease: false
23
+ requirement: &id001 !ruby/object:Gem::Requirement
24
+ requirements:
25
+ - - ">="
26
+ - !ruby/object:Gem::Version
27
+ segments:
28
+ - 0
29
+ version: "0"
30
+ type: :development
31
+ version_requirements: *id001
32
+ - !ruby/object:Gem::Dependency
33
+ name: remote_table
34
+ prerelease: false
35
+ requirement: &id002 !ruby/object:Gem::Requirement
36
+ requirements:
37
+ - - ">="
38
+ - !ruby/object:Gem::Version
39
+ segments:
40
+ - 0
41
+ - 2
42
+ - 16
43
+ version: 0.2.16
44
+ type: :development
45
+ version_requirements: *id002
46
+ - !ruby/object:Gem::Dependency
47
+ name: activesupport
48
+ prerelease: false
49
+ requirement: &id003 !ruby/object:Gem::Requirement
50
+ requirements:
51
+ - - ">="
52
+ - !ruby/object:Gem::Version
53
+ segments:
54
+ - 2
55
+ - 3
56
+ - 4
57
+ version: 2.3.4
58
+ type: :runtime
59
+ version_requirements: *id003
60
+ - !ruby/object:Gem::Dependency
61
+ name: fastercsv
62
+ prerelease: false
63
+ requirement: &id004 !ruby/object:Gem::Requirement
64
+ requirements:
65
+ - - ">="
66
+ - !ruby/object:Gem::Version
67
+ segments:
68
+ - 1
69
+ - 5
70
+ - 3
71
+ version: 1.5.3
72
+ type: :runtime
73
+ version_requirements: *id004
74
+ - !ruby/object:Gem::Dependency
75
+ name: andand
76
+ prerelease: false
77
+ requirement: &id005 !ruby/object:Gem::Requirement
78
+ requirements:
79
+ - - ">="
80
+ - !ruby/object:Gem::Version
81
+ segments:
82
+ - 1
83
+ - 3
84
+ - 1
85
+ version: 1.3.1
86
+ type: :runtime
87
+ version_requirements: *id005
88
+ - !ruby/object:Gem::Dependency
89
+ name: amatch
90
+ prerelease: false
91
+ requirement: &id006 !ruby/object:Gem::Requirement
92
+ requirements:
93
+ - - ">="
94
+ - !ruby/object:Gem::Version
95
+ segments:
96
+ - 0
97
+ - 2
98
+ - 5
99
+ version: 0.2.5
100
+ type: :runtime
101
+ version_requirements: *id006
102
+ description: Create dictionaries that link rows between two tables (left and right) using loose matching (string similarity) by default and tight matching (regexp) by request.
103
+ email: seamus@abshere.net
104
+ executables: []
105
+
106
+ extensions: []
107
+
108
+ extra_rdoc_files:
109
+ - LICENSE
110
+ - README.rdoc
111
+ files:
112
+ - .document
113
+ - .gitignore
114
+ - LICENSE
115
+ - README.rdoc
116
+ - Rakefile
117
+ - VERSION
118
+ - examples/icao-bts.rb
119
+ - examples/icao-bts.xls
120
+ - lib/loose_tight_dictionary.rb
121
+ - test/helper.rb
122
+ - test/test_loose_tight_dictionary.rb
123
+ has_rdoc: true
124
+ homepage: http://github.com/seamusabshere/loose_tight_dictionary
125
+ licenses: []
126
+
127
+ post_install_message:
128
+ rdoc_options:
129
+ - --charset=UTF-8
130
+ require_paths:
131
+ - lib
132
+ required_ruby_version: !ruby/object:Gem::Requirement
133
+ requirements:
134
+ - - ">="
135
+ - !ruby/object:Gem::Version
136
+ segments:
137
+ - 0
138
+ version: "0"
139
+ required_rubygems_version: !ruby/object:Gem::Requirement
140
+ requirements:
141
+ - - ">="
142
+ - !ruby/object:Gem::Version
143
+ segments:
144
+ - 0
145
+ version: "0"
146
+ requirements: []
147
+
148
+ rubyforge_project:
149
+ rubygems_version: 1.3.6
150
+ signing_key:
151
+ specification_version: 3
152
+ summary: Allows iterative development of dictionaries for big data sets.
153
+ test_files:
154
+ - test/helper.rb
155
+ - test/test_loose_tight_dictionary.rb
156
+ - examples/icao-bts.rb