loose_tight_dictionary 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/.document ADDED
@@ -0,0 +1,5 @@
1
+ README.rdoc
2
+ lib/**/*.rb
3
+ bin/*
4
+ features/**/*.feature
5
+ LICENSE
data/.gitignore ADDED
@@ -0,0 +1,21 @@
1
+ ## MAC OS
2
+ .DS_Store
3
+
4
+ ## TEXTMATE
5
+ *.tmproj
6
+ tmtags
7
+
8
+ ## EMACS
9
+ *~
10
+ \#*
11
+ .\#*
12
+
13
+ ## VIM
14
+ *.swp
15
+
16
+ ## PROJECT::GENERAL
17
+ coverage
18
+ rdoc
19
+ pkg
20
+
21
+ ## PROJECT::SPECIFIC
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2009 Seamus Abshere
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.rdoc ADDED
@@ -0,0 +1,17 @@
1
+ = loose_tight_dictionary
2
+
3
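+ Create dictionaries that link rows between two tables (left and right) using loose matching (string similarity) by default and tight matching (regexp) by request.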
4
+
5
+ == Note on Patches/Pull Requests
6
+
7
+ * Fork the project.
8
+ * Make your feature addition or bug fix.
9
+ * Add tests for it. This is important so I don't break it in a
10
+ future version unintentionally.
11
+ * Commit, but do not mess with the Rakefile, version, or history.
12
+ (If you want to have your own version, that is fine, but bump the version in a separate commit that I can ignore when I pull.)
13
+ * Send me a pull request. Bonus points for topic branches.
14
+
15
+ == Copyright
16
+
17
+ Copyright (c) 2010 Seamus Abshere. See LICENSE for details.
data/Rakefile ADDED
@@ -0,0 +1,58 @@
1
+ require 'rubygems'
2
+ require 'rake'
3
+
4
+ begin
5
+ require 'jeweler'
6
+ Jeweler::Tasks.new do |gem|
7
+ gem.name = "loose_tight_dictionary"
8
+ gem.summary = %Q{Allows iterative development of dictionaries for big data sets.}
9
+ gem.description = %Q{Create dictionaries that link rows between two tables (left and right) using loose matching (string similarity) by default and tight matching (regexp) by request.}
10
+ gem.email = "seamus@abshere.net"
11
+ gem.homepage = "http://github.com/seamusabshere/loose_tight_dictionary"
12
+ gem.authors = ["Seamus Abshere"]
13
+ gem.add_development_dependency "shoulda"
14
+ gem.add_development_dependency "remote_table", ">=0.2.16"
15
+ gem.add_dependency 'activesupport', '>=2.3.4'
16
+ gem.add_dependency 'fastercsv', '>=1.5.3'
17
+ gem.add_dependency 'andand', '>=1.3.1'
18
+ gem.add_dependency 'amatch', '>=0.2.5'
19
+ # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
20
+ end
21
+ Jeweler::GemcutterTasks.new
22
+ rescue LoadError
23
+ puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
24
+ end
25
+
26
+ require 'rake/testtask'
27
+ Rake::TestTask.new(:test) do |test|
28
+ test.libs << 'lib' << 'test'
29
+ test.pattern = 'test/**/test_*.rb'
30
+ test.verbose = true
31
+ end
32
+
33
+ begin
34
+ require 'rcov/rcovtask'
35
+ Rcov::RcovTask.new do |test|
36
+ test.libs << 'test'
37
+ test.pattern = 'test/**/test_*.rb'
38
+ test.verbose = true
39
+ end
40
+ rescue LoadError
41
+ task :rcov do
42
+ abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
43
+ end
44
+ end
45
+
46
+ task :test => :check_dependencies
47
+
48
+ task :default => :test
49
+
50
+ require 'rake/rdoctask'
51
+ Rake::RDocTask.new do |rdoc|
52
+ version = File.exist?('VERSION') ? File.read('VERSION') : ""
53
+
54
+ rdoc.rdoc_dir = 'rdoc'
55
+ rdoc.title = "loose_tight_dictionary #{version}"
56
+ rdoc.rdoc_files.include('README*')
57
+ rdoc.rdoc_files.include('lib/**/*.rb')
58
+ end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.0.1
@@ -0,0 +1,58 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'rubygems'
4
+ require 'remote_table'
5
+ require 'ruby-debug'
6
+ require 'logger'
7
+ require File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib', 'loose_tight_dictionary.rb'))
8
+
9
+ $logger = Logger.new STDERR
10
+ $logger.level = Logger::DEBUG
11
+ $logger.datetime_format = "%H:%M:%S"
12
+ # $tee = File.open('tee.csv', 'w')
13
+ $tee = STDOUT
14
+
15
+ # $ltd_left = /(super|bonanza)/i
16
+ # $ltd_right = /bonanza d-35/i
17
+ # $ltd_dd_left = /bonanza/i
18
+ # $ltd_dd_right = /musk/i
19
+ # $ltd_dd_left_not = /allison/i
20
+ # $ltd_dd_print = true
21
+ # $ltd_ddd_left = /bonanza/i
22
+ # $ltd_ddd_right = /musk/i
23
+ # $ltd_ddd_left_not = /allison/i
24
+ # $ltd_ddd_print = true
25
+
26
+ @right = RemoteTable.new :url => 'http://www.bts.gov/programs/airline_information/accounting_and_reporting_directives/csv/number_260.csv',
27
+ :select => lambda { |record| record['Aircraft Type'].to_i.between?(1, 998) and record['Manufacturer'].present? }
28
+
29
+ @tightenings = RemoteTable.new :url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=0&output=csv', :headers => false
30
+
31
+ @restrictions = RemoteTable.new :url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=3&output=csv', :headers => false
32
+
33
+ @blockings = RemoteTable.new :url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=4&output=csv', :headers => false
34
+
35
+ @positives = RemoteTable.new :url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=1&output=csv', :headers => false
36
+
37
+ @negatives = RemoteTable.new :url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=2&output=csv', :headers => false
38
+
39
+ %w{ tightenings restrictions blockings }.each do |name|
40
+ $logger.info name
41
+ $logger.info "\n" + instance_variable_get("@#{name}").to_a.map { |record| record[0] }.join("\n")
42
+ $logger.info "\n"
43
+ end
44
+
45
+ ('A'..'Z').each do |letter|
46
+ # %w{ E }.each do |letter|
47
+ @left = RemoteTable.new :url => "http://www.faa.gov/air_traffic/publications/atpubs/CNT/5-2-#{letter}.htm",
48
+ :encoding => 'US-ASCII',
49
+ :row_xpath => '//table/tr[2]/td/table/tr',
50
+ :column_xpath => 'td'
51
+
52
+ d = LooseTightDictionary.new @right, :tightenings => @tightenings, :restrictions => @restrictions, :blockings => @blockings, :logger => $logger, :tee => $tee
53
+ d.left_reader = lambda { |record| record['Manufacturer'] + ' ' + record['Model'] }
54
+ d.right_reader = lambda { |record| record['Manufacturer'] + ' ' + record['Long Name'] }
55
+ d.positives = @positives
56
+ d.negatives = @negatives
57
+ d.check @left
58
+ end
Binary file
@@ -0,0 +1,270 @@
1
+ require 'active_support'
2
+ require 'active_support/version'
3
+ %w{
4
+ active_support/core_ext/string
5
+ }.each do |active_support_3_requirement|
6
+ require active_support_3_requirement
7
+ end if ActiveSupport::VERSION::MAJOR == 3
8
+ require 'amatch'
9
+ require 'andand'
10
+ require 'fastercsv'
11
+
12
+ class LooseTightDictionary
13
+ class MissedChecks < RuntimeError; end
14
+ class Mismatch < RuntimeError; end
15
+ class FalsePositive < RuntimeError; end
16
+
17
# Pairs an original string with one "tightened" form of it (the joined
# captures of a tightening regexp, or the string itself when untouched).
class T
  attr_reader :str, :tightened_str

  # str           - the original string
  # tightened_str - the tightened form; equal to +str+ when no tightening applied
  def initialize(str, tightened_str)
    @str = str
    @tightened_str = tightened_str
  end

  # True when the tightened form differs from the original string.
  def tightened?
    str != tightened_str
  end

  # Scores this T against +other+ (another T).
  #
  # When BOTH sides are tightened, only the first +prefix+ characters of each
  # tightened string are compared, where +prefix+ is the length of the shorter
  # one; otherwise the full tightened strings are compared and +prefix+ is nil.
  #
  # NOTE(review): String#first and String#pair_distance_similar are not core
  # Ruby; presumably they come from the ActiveSupport and amatch requires at
  # the top of the file - confirm.
  #
  # Returns a two-element array: [ prefix (Integer or nil), score ].
  def prefix_and_score(other)
    prefix = nil
    if tightened? && other.tightened?
      prefix = [ tightened_str.length, other.tightened_str.length ].min
    end

    score = if prefix
      tightened_str.first(prefix).pair_distance_similar other.tightened_str.first(prefix)
    else
      tightened_str.pair_distance_similar other.tightened_str
    end

    [ prefix, score ]
  end
end
38
+
39
+ include Amatch
40
+
41
+ attr_reader :right_records
42
+ attr_reader :tightenings
43
+ attr_reader :restrictions
44
+ attr_reader :blockings
45
+ attr_reader :logger
46
+ attr_reader :tee
47
+ attr_reader :case_sensitive
48
+
49
+ attr_accessor :positives
50
+ attr_accessor :negatives
51
+ attr_accessor :left_reader
52
+ attr_accessor :right_reader
53
+
54
+ def initialize(right_records, options = {})
55
+ @right_records = right_records
56
+ @tightenings = options[:tightenings] || Array.new
57
+ @restrictions = options[:restrictions] || Array.new
58
+ @blockings = options[:blockings] || Array.new
59
+ @left_reader = options[:left_reader]
60
+ @right_reader = options[:right_reader]
61
+ @positives = options[:positives]
62
+ @negatives = options[:negatives]
63
+ @logger = options[:logger]
64
+ @tee = options[:tee]
65
+ @case_sensitive = options[:case_sensitive] || false
66
+ end
67
+
68
# Checks one proposed left/right pairing against the known-good (positives)
# and known-bad (negatives) tables. Each table row is [ left, right ].
#
# Does nothing when neither table has any rows.
#
# Raises Mismatch when a positive exists for the left string and the proposed
# right string is not the one it names.
# Raises FalsePositive when a negative exists for the left string and the
# proposed right string is exactly the one it names.
# Returns nil otherwise.
def inline_check(left_record, right_record)
  return unless positives.present? || negatives.present?

  left = read_left left_record
  right = read_right right_record

  # :ignore marks "no rule for this left string" (nil is a legitimate value
  # for the right-hand column, so it cannot be used as the sentinel).
  positive = positives && positives.detect { |record| record[0] == left }
  correct_right = positive ? positive[1] : :ignore

  negative = negatives && negatives.detect { |record| record[0] == left }
  incorrect_right = negative ? negative[1] : :ignore

  if correct_right != :ignore && right != correct_right
    logger.debug " Mismatch! (should be #{correct_right})" if logger
    raise Mismatch
  end

  if incorrect_right != :ignore && right == incorrect_right
    logger.debug " False positive! (should NOT be #{incorrect_right})" if logger
    raise FalsePositive
  end
end
96
+
97
# Matches every left record and verifies each result with #inline_check.
#
# When neither positives nor negatives have any rows, logs a notice (if a
# logger is set) and returns nil without matching anything.
#
# When a +tee+ IO is set, one CSV row per left record is written to it:
# the left string, the matched right string, and the contents of the global
# $ltd_1 (which #left_to_right sets to the winner's scoring details).
#
# Raises whatever #left_to_right / #inline_check raise.
def check(left_records)
  unless positives.present? || negatives.present?
    logger.info "You didn't define any positives or negatives, so running check doesn't do anything" if logger
    return
  end

  left_records.each do |left_record|
    right_record = left_to_right left_record
    inline_check left_record, right_record
    # Only build the CSV row when there is somewhere to write it.
    next unless tee
    tee.puts [ read_left(left_record), read_right(right_record), $ltd_1 ].flatten.to_csv
  end
end
108
+
109
# Finds the record in +right_records+ that best matches +left_record+.
#
# Candidates are first narrowed to those matching the blocking regexp for the
# left string (all records when no blocking applies), then ranked pairwise:
#
# 1. a candidate whose restriction value differs from the left's loses
#    (when both candidates differ, the order is random);
# 2. otherwise the higher best-tightening score wins;
# 3. on equal scores, the longer compared prefix wins;
# 4. finally, the shorter right-hand string wins.
#
# Side effects: stores the winner's scoring details (or nil, if the winner
# was never scored) in the global $ltd_1, and runs #inline_check on the
# result. The $ltd_* globals are debugging hooks; +debugger+ is only
# available when ruby-debug has been loaded by the caller.
#
# Returns the winning right record, or nil when the left string and the
# winner both have restriction values and those values differ.
# Raises whatever #inline_check raises.
def left_to_right(left_record)
  left = read_left left_record
  restricted_left = restrict left
  blocking_left = blocking left
  t_options_left = t_options left
  history = Hash.new

  candidates = right_records.select do |record|
    blocking_left.nil? || blocking_left.match(read_right(record))
  end

  right_record = candidates.max do |a_record, b_record|
    a = read_right a_record
    b = read_right b_record
    restricted_a = restrict a
    restricted_b = restrict b
    if restricted_left && restricted_a && restricted_b && restricted_left != restricted_a && restricted_left != restricted_b
      # neither would ever work, so randomly rank one over the other
      rand(2) == 1 ? -1 : 1
    elsif restricted_left && restricted_a && restricted_left != restricted_a
      -1
    elsif restricted_left && restricted_b && restricted_left != restricted_b
      1
    else
      t_left_a, t_right_a = optimize t_options_left, t_options(a)
      t_left_b, t_right_b = optimize t_options_left, t_options(b)
      a_prefix, a_score = t_left_a.prefix_and_score t_right_a
      b_prefix, b_score = t_left_b.prefix_and_score t_right_b
      # Remember how each candidate was scored so the winner's details can be
      # published through $ltd_1 below.
      history[a_record] = [t_left_a.tightened_str, t_right_a.tightened_str, a_prefix || 'NULL', a_score]
      history[b_record] = [t_left_b.tightened_str, t_right_b.tightened_str, b_prefix || 'NULL', b_score]

      yep_dd = ($ltd_dd_right && $ltd_dd_left && [t_left_a, t_left_b].any? { |f| f.str =~ $ltd_dd_left } && [t_right_a, t_right_b].any? { |f| f.str =~ $ltd_dd_right } && (!$ltd_dd_left_not || [t_left_a, t_left_b].none? { |f| f.str =~ $ltd_dd_left_not }))

      if $ltd_dd_print && yep_dd && logger
        logger.debug t_left_a.inspect
        logger.debug t_right_a.inspect
        logger.debug t_left_b.inspect
        logger.debug t_right_b.inspect
        logger.debug
      end

      debugger if yep_dd

      if a_score != b_score
        a_score <=> b_score
      elsif a_prefix && b_prefix && a_prefix != b_prefix
        a_prefix <=> b_prefix
      else
        # shorter right-hand string ranks higher
        b.length <=> a.length
      end
    end
  end

  $ltd_1 = history[right_record]
  right = read_right right_record
  restricted_right = restrict right

  debugger if ($ltd_left && $ltd_left.match(left)) || ($ltd_right && $ltd_right.match(right))

  return if restricted_left && restricted_right && restricted_left != restricted_right
  inline_check left_record, right_record
  right_record
end
168
+
169
# Picks the best-scoring combination of one left tightening option and one
# right tightening option.
#
# t_options_left  - array of T-like objects for the left string
# t_options_right - array of T-like objects for the right string
#
# Every left/right pair (from #cart_prod) is scored with
# +left.prefix_and_score(right)+. The higher score wins; on equal scores the
# longer compared prefix wins; remaining ties are broken randomly.
#
# The $ltd_ddd_* globals are debugging hooks; +debugger+ is only available
# when ruby-debug has been loaded by the caller.
#
# Returns the winning [ left_option, right_option ] pair.
def optimize(t_options_left, t_options_right)
  cart_prod(t_options_left, t_options_right).max do |a, b|
    t_left_a, t_right_a = a
    t_left_b, t_right_b = b

    a_prefix, a_score = t_left_a.prefix_and_score t_right_a
    b_prefix, b_score = t_left_b.prefix_and_score t_right_b

    yep_ddd = ($ltd_ddd_right && $ltd_ddd_left && [t_left_a, t_left_b].any? { |f| f.str =~ $ltd_ddd_left } && [t_right_a, t_right_b].any? { |f| f.str =~ $ltd_ddd_right } && (!$ltd_ddd_left_not || [t_left_a, t_left_b].none? { |f| f.str =~ $ltd_ddd_left_not }))

    if $ltd_ddd_print && yep_ddd && logger
      logger.debug t_left_a.inspect
      logger.debug t_right_a.inspect
      logger.debug t_left_b.inspect
      logger.debug t_right_b.inspect
      logger.debug
    end

    debugger if yep_ddd

    if a_score != b_score
      a_score <=> b_score
    elsif a_prefix && b_prefix && a_prefix != b_prefix
      a_prefix <=> b_prefix
    else
      # randomly choose
      # maybe later i can figure out how big the inputs are and apply occam's razor
      rand(2) == 1 ? -1 : 1
    end
  end
end
202
+
203
# Builds (and memoizes per string) every tightening option for +str+.
#
# The first option is always the untouched string. One more option is added
# for each row of +tightenings+ whose regexp (column 0, compiled by
# #literal_regexp) matches; its tightened form is the non-nil capture groups
# of that match joined together.
#
# Returns an Array of T.
def t_options(str)
  @_t_options ||= Hash.new
  return @_t_options[str] if @_t_options.has_key?(str)

  ary = [ T.new(str, str) ]
  tightenings.each do |tightening|
    match = literal_regexp(tightening[0]).match str
    ary << T.new(str, match.captures.compact.join) if match
  end
  @_t_options[str] = ary
end
215
+
216
# Finds (and memoizes per string) the first blocking regexp that matches
# +str+. Each row of +blockings+ holds a regexp literal string in column 0,
# compiled by #literal_regexp.
#
# Returns the matching Regexp, or nil when no blocking matches. A nil result
# is cached as well, so the rows are scanned only once per string.
def blocking(str)
  @_blocking ||= Hash.new
  return @_blocking[str] if @_blocking.has_key?(str)

  blockings.each do |row|
    regexp = literal_regexp row[0]
    return @_blocking[str] = regexp if regexp.match str
  end
  @_blocking[str] = nil
end
227
+
228
# Computes (and memoizes per string) the restriction value of +str+: the
# non-nil capture groups of the first row in +restrictions+ whose regexp
# (column 0, compiled by #literal_regexp) matches, joined together and
# downcased unless +case_sensitive+ is set.
#
# Returns the restriction String, or nil when no restriction matches. A nil
# result is cached as well.
def restrict(str)
  @_restrict ||= Hash.new
  return @_restrict[str] if @_restrict.has_key?(str)

  restrictions.each do |restriction|
    match = literal_regexp(restriction[0]).match str
    next unless match
    retval = match.captures.compact.join
    retval = retval.downcase unless case_sensitive
    return @_restrict[str] = retval
  end
  @_restrict[str] = nil
end
240
+
241
# Compiles (and memoizes per string) a Regexp from a string written like a
# Ruby regexp literal, e.g. '/(dh)c?-?(\d+)/i'.
#
# The leading slash and the trailing "/flags" are stripped to get the
# pattern source. Recognised flags are i, m and x; all flags present are
# combined. Unless +case_sensitive+ is set, the regexp is always
# case-insensitive, with or without an explicit "i" flag.
#
# Flags are read only from what follows the closing slash, so a pattern such
# as '/max/' is not mistaken for one carrying the m and x flags.
#
# Returns a Regexp.
def literal_regexp(str)
  @_literal_regexp ||= Hash.new
  return @_literal_regexp[str] if @_literal_regexp.has_key?(str)

  flags = str[/\/([ixm]*)\z/, 1] || ''
  source = str.gsub(/\A\/|\/([ixm]*)\z/, '')

  options = 0
  options |= Regexp::IGNORECASE if !case_sensitive || flags.include?('i')
  options |= Regexp::MULTILINE if flags.include?('m')
  options |= Regexp::EXTENDED if flags.include?('x')

  # The third (kcode) argument only means something on Ruby 1.8 and is
  # rejected by recent Rubies, so pass it only where it is understood.
  @_literal_regexp[str] = if RUBY_VERSION < '1.9'
    Regexp.new source, options, 'U'
  else
    Regexp.new source, options
  end
end
250
+
251
# Extracts the comparison string from a left record.
#
# Returns nil for a nil record; otherwise the result of +left_reader+ (a
# callable taking the record) when one is set, or the record's first column.
def read_left(left_record)
  return if left_record.nil?
  return left_record[0] unless left_reader
  left_reader.call left_record
end
255
+
256
# Extracts the comparison string from a right record.
#
# Returns nil for a nil record; otherwise the result of +right_reader+ (a
# callable taking the record) when one is set, or the record's first column.
def read_right(right_record)
  return if right_record.nil?
  return right_record[0] unless right_reader
  right_reader.call right_record
end
260
+
261
# Cartesian product of any number of lists.
#
# Thanks William James!
# http://www.ruby-forum.com/topic/95519#200484
#
# Combinations are ordered with the FIRST list varying fastest, e.g.
#   cart_prod([1, 2], [:a, :b]) # => [[1, :a], [2, :a], [1, :b], [2, :b]]
#
# Returns an Array of Arrays; [[]] when called with no lists, and [] when
# any list is empty.
def cart_prod(*args)
  args.inject([[]]) do |combos, list|
    expanded = []
    list.each do |element|
      # Append in place rather than rebuilding the accumulator with +=.
      combos.each { |combo| expanded << (combo + [element]) }
    end
    expanded
  end
end
270
+ end
data/test/helper.rb ADDED
@@ -0,0 +1,12 @@
1
+ require 'rubygems'
2
+ require 'test/unit'
3
+ require 'shoulda'
4
+ require 'logger'
5
+ require 'ruby-debug'
6
+
7
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
8
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
9
+ require 'loose_tight_dictionary'
10
+
11
+ class Test::Unit::TestCase
12
+ end
@@ -0,0 +1,211 @@
1
+ require 'helper'
2
+
3
+ require 'remote_table'
4
+
5
+ # $logger = Logger.new STDERR
6
+ # $logger.level = Logger::INFO
7
+ # $tee = STDOUT
8
+
9
+ class TestLooseTightDictionary < Test::Unit::TestCase
10
+ def setup
11
+ clear_ltd
12
+
13
+ # dh 8 400
14
+ @a_left = ['DE HAVILLAND CANADA DHC8400 Dash 8']
15
+ @a_right = ['DEHAVILLAND DEHAVILLAND DHC8-400 DASH-8']
16
+ # dh 88
17
+ @b_left = ['ABCDEFG DH88 HIJKLMNOP']
18
+ # dh 89
19
+ @c_right = ['ABCDEFG DH89 HIJKLMNOP']
20
+ # dh 8 200
21
+ @d_left = ['DE HAVILLAND CANADA DHC8200 Dash 8']
22
+ @d_right = ['BOMBARDIER DEHAVILLAND DHC8-200Q DASH-8']
23
+ @d_lookalike = ['ABCD DHC8200 Dash 8']
24
+
25
+ @t_1 = [ '/(dh)c?-?(\d{0,2})-?(\d{0,4})(?:.*?)(dash|\z)/i', 'good tightening for de havilland' ]
26
+
27
+ @d_1 = [ '/(dh)c?-?(\d{0,2})-?(\d{0,4})(?:.*?)(dash|\z)/i', 'good restriction for de havilland' ]
28
+
29
+ @left = [
30
+ @a_left,
31
+ @b_left,
32
+ ['DE HAVILLAND DH89 Dragon Rapide'],
33
+ ['DE HAVILLAND CANADA DHC8100 Dash 8 (E9, CT142, CC142)'],
34
+ @d_left,
35
+ ['DE HAVILLAND CANADA DHC8300 Dash 8'],
36
+ ['DE HAVILLAND DH90 Dragonfly']
37
+ ]
38
+ @right = [
39
+ @a_right,
40
+ @c_right,
41
+ @d_right,
42
+ ['DEHAVILLAND DEHAVILLAND DHC8-100 DASH-8'],
43
+ ['DEHAVILLAND DEHAVILLAND TWIN OTTER DHC-6']
44
+ ]
45
+ @tightenings = []
46
+ @restrictions = []
47
+ @blockings = []
48
+ @positives = []
49
+ @negatives = []
50
+ end
51
+
52
+ def clear_ltd
53
+ @_ltd = nil
54
+ end
55
+
56
+ def ltd
57
+ @_ltd ||= LooseTightDictionary.new @right,
58
+ :tightenings => @tightenings,
59
+ :restrictions => @restrictions,
60
+ :blockings => @blockings,
61
+ :positives => @positives,
62
+ :negatives => @negatives,
63
+ :logger => $logger,
64
+ :tee => $tee
65
+ end
66
+
67
+ if ENV['NEW'] == 'true' or ENV['ALL'] == 'true'
68
+ end
69
+
70
+ if ENV['OLD'] == 'true' or ENV['ALL'] == 'true'
71
+ should "have a false match without blocking" do
72
+ # @d_left will be our victim
73
+ @right.push @d_lookalike
74
+ @tightenings.push @t_1
75
+
76
+ assert_equal @d_lookalike, ltd.left_to_right(@d_left)
77
+ end
78
+
79
+ should "do blocking if the left matches a block" do
80
+ # @d_left will be our victim
81
+ @right.push @d_lookalike
82
+ @tightenings.push @t_1
83
+ @blockings.push ['/(bombardier|de ?havilland)/i']
84
+
85
+ assert_equal @d_right, ltd.left_to_right(@d_left)
86
+ end
87
+
88
+ should "not do blocking if the left doesn't match any blockings" do
89
+ @tightenings.push @t_1
90
+ @blockings.push ['/(bombardier|de ?havilland)/i']
91
+
92
+ assert_equal @d_right, ltd.left_to_right(@d_lookalike)
93
+ end
94
+
95
+ should "use the best score from all of the tightenings" do
96
+ x_left = ["BOEING 737100"]
97
+ x_right = ["BOEING BOEING 737-100/200"]
98
+ x_right_wrong = ["BOEING BOEING 737-900"]
99
+ @right.push x_right
100
+ @right.push x_right_wrong
101
+ @tightenings.push ['/(7\d)(7|0)-?\d{1,3}\/(\d\d\d)/i']
102
+ @tightenings.push ['/(7\d)(7|0)-?(\d{1,3}|[A-Z]{0,3})/i']
103
+
104
+ assert_equal x_right, ltd.left_to_right(x_left)
105
+ end
106
+
107
+ should "compare using prefixes if tightened key is shorter than correct match" do
108
+ x_left = ["BOEING 720"]
109
+ x_right = ["BOEING BOEING 720-000"]
110
+ x_right_wrong = ["BOEING BOEING 717-200"]
111
+ @right.push x_right
112
+ @right.push x_right_wrong
113
+ @tightenings.push @t_1
114
+ @tightenings.push ['/(7\d)(7|0)-?\d{1,3}\/(\d\d\d)/i']
115
+ @tightenings.push ['/(7\d)(7|0)-?(\d{1,3}|[A-Z]{0,3})/i']
116
+
117
+ assert_equal x_right, ltd.left_to_right(x_left)
118
+ end
119
+
120
+ should "use the shortest original input" do
121
+ x_left = ['De Havilland DHC8-777 Dash-8 Superstar']
122
+ x_right = ['DEHAVILLAND DEHAVILLAND DHC8-777 DASH-8 Superstar']
123
+ x_right_long = ['DEHAVILLAND DEHAVILLAND DHC8-777 DASH-8 Superstar/Supernova']
124
+
125
+ @right.push x_right_long
126
+ @right.push x_right
127
+ @tightenings.push @t_1
128
+
129
+ assert_equal x_right, ltd.left_to_right(x_left)
130
+ end
131
+
132
+ should "perform lookups left to right" do
133
+ assert_equal @a_right, ltd.left_to_right(@a_left)
134
+ end
135
+
136
+ should "succeed if there are no checks" do
137
+ assert_nothing_raised do
138
+ ltd.check @left
139
+ end
140
+ end
141
+
142
+ should "succeed if the positive checks just work" do
143
+ @positives.push [ @a_left[0], @a_right[0] ]
144
+
145
+ assert_nothing_raised do
146
+ ltd.check @left
147
+ end
148
+ end
149
+
150
+ should "fail if positive checks don't work" do
151
+ @positives.push [ @d_left[0], @d_right[0] ]
152
+
153
+ assert_raises(LooseTightDictionary::Mismatch) do
154
+ ltd.check @left
155
+ end
156
+ end
157
+
158
+ should "succeed if proper tightening is applied" do
159
+ @positives.push [ @d_left[0], @d_right[0] ]
160
+ @tightenings.push @t_1
161
+
162
+ assert_nothing_raised do
163
+ ltd.check @left
164
+ end
165
+ end
166
+
167
+ should "use a Google Docs spreadsheet as a source of tightenings" do
168
+ @positives.push [ @d_left[0], @d_right[0] ]
169
+ @tightenings = RemoteTable.new :url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=0&output=csv', :headers => false
170
+
171
+ assert_nothing_raised do
172
+ ltd.check @left
173
+ end
174
+ end
175
+
176
+ should "fail if negative checks don't work" do
177
+ @negatives.push [ @b_left[0], @c_right[0] ]
178
+
179
+ assert_raises(LooseTightDictionary::FalsePositive) do
180
+ ltd.check @left
181
+ end
182
+ end
183
+
184
+ should "do inline checking" do
185
+ @negatives.push [ @b_left[0], @c_right[0] ]
186
+
187
+ assert_raises(LooseTightDictionary::FalsePositive) do
188
+ ltd.left_to_right @b_left
189
+ end
190
+ end
191
+
192
+ should "fail if negative checks don't work, even with tightening" do
193
+ @negatives.push [ @b_left[0], @c_right[0] ]
194
+ @tightenings.push @t_1
195
+
196
+ assert_raises(LooseTightDictionary::FalsePositive) do
197
+ ltd.check @left
198
+ end
199
+ end
200
+
201
+ should "succeed if proper restriction is applied" do
202
+ @negatives.push [ @b_left[0], @c_right[0] ]
203
+ @positives.push [ @d_left[0], @d_right[0] ]
204
+ @restrictions.push @d_1
205
+
206
+ assert_nothing_raised do
207
+ ltd.check @left
208
+ end
209
+ end
210
+ end
211
+ end
metadata ADDED
@@ -0,0 +1,156 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: loose_tight_dictionary
3
+ version: !ruby/object:Gem::Version
4
+ prerelease: false
5
+ segments:
6
+ - 0
7
+ - 0
8
+ - 1
9
+ version: 0.0.1
10
+ platform: ruby
11
+ authors:
12
+ - Seamus Abshere
13
+ autorequire:
14
+ bindir: bin
15
+ cert_chain: []
16
+
17
+ date: 2010-04-28 00:00:00 -04:00
18
+ default_executable:
19
+ dependencies:
20
+ - !ruby/object:Gem::Dependency
21
+ name: shoulda
22
+ prerelease: false
23
+ requirement: &id001 !ruby/object:Gem::Requirement
24
+ requirements:
25
+ - - ">="
26
+ - !ruby/object:Gem::Version
27
+ segments:
28
+ - 0
29
+ version: "0"
30
+ type: :development
31
+ version_requirements: *id001
32
+ - !ruby/object:Gem::Dependency
33
+ name: remote_table
34
+ prerelease: false
35
+ requirement: &id002 !ruby/object:Gem::Requirement
36
+ requirements:
37
+ - - ">="
38
+ - !ruby/object:Gem::Version
39
+ segments:
40
+ - 0
41
+ - 2
42
+ - 16
43
+ version: 0.2.16
44
+ type: :development
45
+ version_requirements: *id002
46
+ - !ruby/object:Gem::Dependency
47
+ name: activesupport
48
+ prerelease: false
49
+ requirement: &id003 !ruby/object:Gem::Requirement
50
+ requirements:
51
+ - - ">="
52
+ - !ruby/object:Gem::Version
53
+ segments:
54
+ - 2
55
+ - 3
56
+ - 4
57
+ version: 2.3.4
58
+ type: :runtime
59
+ version_requirements: *id003
60
+ - !ruby/object:Gem::Dependency
61
+ name: fastercsv
62
+ prerelease: false
63
+ requirement: &id004 !ruby/object:Gem::Requirement
64
+ requirements:
65
+ - - ">="
66
+ - !ruby/object:Gem::Version
67
+ segments:
68
+ - 1
69
+ - 5
70
+ - 3
71
+ version: 1.5.3
72
+ type: :runtime
73
+ version_requirements: *id004
74
+ - !ruby/object:Gem::Dependency
75
+ name: andand
76
+ prerelease: false
77
+ requirement: &id005 !ruby/object:Gem::Requirement
78
+ requirements:
79
+ - - ">="
80
+ - !ruby/object:Gem::Version
81
+ segments:
82
+ - 1
83
+ - 3
84
+ - 1
85
+ version: 1.3.1
86
+ type: :runtime
87
+ version_requirements: *id005
88
+ - !ruby/object:Gem::Dependency
89
+ name: amatch
90
+ prerelease: false
91
+ requirement: &id006 !ruby/object:Gem::Requirement
92
+ requirements:
93
+ - - ">="
94
+ - !ruby/object:Gem::Version
95
+ segments:
96
+ - 0
97
+ - 2
98
+ - 5
99
+ version: 0.2.5
100
+ type: :runtime
101
+ version_requirements: *id006
102
+ description: Create dictionaries that link rows between two tables (left and right) using loose matching (string similarity) by default and tight matching (regexp) by request.
103
+ email: seamus@abshere.net
104
+ executables: []
105
+
106
+ extensions: []
107
+
108
+ extra_rdoc_files:
109
+ - LICENSE
110
+ - README.rdoc
111
+ files:
112
+ - .document
113
+ - .gitignore
114
+ - LICENSE
115
+ - README.rdoc
116
+ - Rakefile
117
+ - VERSION
118
+ - examples/icao-bts.rb
119
+ - examples/icao-bts.xls
120
+ - lib/loose_tight_dictionary.rb
121
+ - test/helper.rb
122
+ - test/test_loose_tight_dictionary.rb
123
+ has_rdoc: true
124
+ homepage: http://github.com/seamusabshere/loose_tight_dictionary
125
+ licenses: []
126
+
127
+ post_install_message:
128
+ rdoc_options:
129
+ - --charset=UTF-8
130
+ require_paths:
131
+ - lib
132
+ required_ruby_version: !ruby/object:Gem::Requirement
133
+ requirements:
134
+ - - ">="
135
+ - !ruby/object:Gem::Version
136
+ segments:
137
+ - 0
138
+ version: "0"
139
+ required_rubygems_version: !ruby/object:Gem::Requirement
140
+ requirements:
141
+ - - ">="
142
+ - !ruby/object:Gem::Version
143
+ segments:
144
+ - 0
145
+ version: "0"
146
+ requirements: []
147
+
148
+ rubyforge_project:
149
+ rubygems_version: 1.3.6
150
+ signing_key:
151
+ specification_version: 3
152
+ summary: Allows iterative development of dictionaries for big data sets.
153
+ test_files:
154
+ - test/helper.rb
155
+ - test/test_loose_tight_dictionary.rb
156
+ - examples/icao-bts.rb