loose_tight_dictionary 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +5 -0
- data/.gitignore +21 -0
- data/LICENSE +20 -0
- data/README.rdoc +17 -0
- data/Rakefile +58 -0
- data/VERSION +1 -0
- data/examples/icao-bts.rb +58 -0
- data/examples/icao-bts.xls +0 -0
- data/lib/loose_tight_dictionary.rb +270 -0
- data/test/helper.rb +12 -0
- data/test/test_loose_tight_dictionary.rb +211 -0
- metadata +156 -0
data/.document
ADDED
data/.gitignore
ADDED
data/LICENSE
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
Copyright (c) 2009 Seamus Abshere
|
|
2
|
+
|
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
|
4
|
+
a copy of this software and associated documentation files (the
|
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
|
9
|
+
the following conditions:
|
|
10
|
+
|
|
11
|
+
The above copyright notice and this permission notice shall be
|
|
12
|
+
included in all copies or substantial portions of the Software.
|
|
13
|
+
|
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.rdoc
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
= loose_tight_dictionary
|
|
2
|
+
|
|
3
|
+
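Create dictionaries that link rows between two tables (left and right) using loose matching (string similarity) by default and tight matching (regexp) by request.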
|
|
4
|
+
|
|
5
|
+
== Note on Patches/Pull Requests
|
|
6
|
+
|
|
7
|
+
* Fork the project.
|
|
8
|
+
* Make your feature addition or bug fix.
|
|
9
|
+
* Add tests for it. This is important so I don't break it in a
|
|
10
|
+
future version unintentionally.
|
|
11
|
+
* Commit, do not mess with rakefile, version, or history.
|
|
12
|
+
(if you want to have your own version, that is fine but bump version in a commit by itself I can ignore when I pull)
|
|
13
|
+
* Send me a pull request. Bonus points for topic branches.
|
|
14
|
+
|
|
15
|
+
== Copyright
|
|
16
|
+
|
|
17
|
+
Copyright (c) 2010 Seamus Abshere. See LICENSE for details.
|
data/Rakefile
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
require 'rubygems'
|
|
2
|
+
require 'rake'
|
|
3
|
+
|
|
4
|
+
begin
|
|
5
|
+
require 'jeweler'
|
|
6
|
+
Jeweler::Tasks.new do |gem|
|
|
7
|
+
gem.name = "loose_tight_dictionary"
|
|
8
|
+
gem.summary = %Q{Allows iterative development of dictionaries for big data sets.}
|
|
9
|
+
gem.description = %Q{Create dictionaries that link rows between two tables (left and right) using loose matching (string similarity) by default and tight matching (regexp) by request.}
|
|
10
|
+
gem.email = "seamus@abshere.net"
|
|
11
|
+
gem.homepage = "http://github.com/seamusabshere/loose_tight_dictionary"
|
|
12
|
+
gem.authors = ["Seamus Abshere"]
|
|
13
|
+
gem.add_development_dependency "shoulda"
|
|
14
|
+
gem.add_development_dependency "remote_table", ">=0.2.16"
|
|
15
|
+
gem.add_dependency 'activesupport', '>=2.3.4'
|
|
16
|
+
gem.add_dependency 'fastercsv', '>=1.5.3'
|
|
17
|
+
gem.add_dependency 'andand', '>=1.3.1'
|
|
18
|
+
gem.add_dependency 'amatch', '>=0.2.5'
|
|
19
|
+
# gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
|
|
20
|
+
end
|
|
21
|
+
Jeweler::GemcutterTasks.new
|
|
22
|
+
rescue LoadError
|
|
23
|
+
puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
|
|
24
|
+
end
|
|
25
|
+
|
|
26
|
+
require 'rake/testtask'
|
|
27
|
+
Rake::TestTask.new(:test) do |test|
|
|
28
|
+
test.libs << 'lib' << 'test'
|
|
29
|
+
test.pattern = 'test/**/test_*.rb'
|
|
30
|
+
test.verbose = true
|
|
31
|
+
end
|
|
32
|
+
|
|
33
|
+
begin
|
|
34
|
+
require 'rcov/rcovtask'
|
|
35
|
+
Rcov::RcovTask.new do |test|
|
|
36
|
+
test.libs << 'test'
|
|
37
|
+
test.pattern = 'test/**/test_*.rb'
|
|
38
|
+
test.verbose = true
|
|
39
|
+
end
|
|
40
|
+
rescue LoadError
|
|
41
|
+
task :rcov do
|
|
42
|
+
abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
|
|
43
|
+
end
|
|
44
|
+
end
|
|
45
|
+
|
|
46
|
+
task :test => :check_dependencies
|
|
47
|
+
|
|
48
|
+
task :default => :test
|
|
49
|
+
|
|
50
|
+
require 'rake/rdoctask'
|
|
51
|
+
Rake::RDocTask.new do |rdoc|
|
|
52
|
+
version = File.exist?('VERSION') ? File.read('VERSION') : ""
|
|
53
|
+
|
|
54
|
+
rdoc.rdoc_dir = 'rdoc'
|
|
55
|
+
rdoc.title = "loose_tight_dictionary #{version}"
|
|
56
|
+
rdoc.rdoc_files.include('README*')
|
|
57
|
+
rdoc.rdoc_files.include('lib/**/*.rb')
|
|
58
|
+
end
|
data/VERSION
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
0.0.1
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
|
|
3
|
+
require 'rubygems'
|
|
4
|
+
require 'remote_table'
|
|
5
|
+
require 'ruby-debug'
|
|
6
|
+
require 'logger'
|
|
7
|
+
require File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib', 'loose_tight_dictionary.rb'))
|
|
8
|
+
|
|
9
|
+
$logger = Logger.new STDERR
|
|
10
|
+
$logger.level = Logger::DEBUG
|
|
11
|
+
$logger.datetime_format = "%H:%M:%S"
|
|
12
|
+
# $tee = File.open('tee.csv', 'w')
|
|
13
|
+
$tee = STDOUT
|
|
14
|
+
|
|
15
|
+
# $ltd_left = /(super|bonanza)/i
|
|
16
|
+
# $ltd_right = /bonanza d-35/i
|
|
17
|
+
# $ltd_dd_left = /bonanza/i
|
|
18
|
+
# $ltd_dd_right = /musk/i
|
|
19
|
+
# $ltd_dd_left_not = /allison/i
|
|
20
|
+
# $ltd_dd_print = true
|
|
21
|
+
# $ltd_ddd_left = /bonanza/i
|
|
22
|
+
# $ltd_ddd_right = /musk/i
|
|
23
|
+
# $ltd_ddd_left_not = /allison/i
|
|
24
|
+
# $ltd_ddd_print = true
|
|
25
|
+
|
|
26
|
+
@right = RemoteTable.new :url => 'http://www.bts.gov/programs/airline_information/accounting_and_reporting_directives/csv/number_260.csv',
|
|
27
|
+
:select => lambda { |record| record['Aircraft Type'].to_i.between?(1, 998) and record['Manufacturer'].present? }
|
|
28
|
+
|
|
29
|
+
@tightenings = RemoteTable.new :url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=0&output=csv', :headers => false
|
|
30
|
+
|
|
31
|
+
@restrictions = RemoteTable.new :url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=3&output=csv', :headers => false
|
|
32
|
+
|
|
33
|
+
@blockings = RemoteTable.new :url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=4&output=csv', :headers => false
|
|
34
|
+
|
|
35
|
+
@positives = RemoteTable.new :url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=1&output=csv', :headers => false
|
|
36
|
+
|
|
37
|
+
@negatives = RemoteTable.new :url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=2&output=csv', :headers => false
|
|
38
|
+
|
|
39
|
+
%w{ tightenings restrictions blockings }.each do |name|
|
|
40
|
+
$logger.info name
|
|
41
|
+
$logger.info "\n" + instance_variable_get("@#{name}").to_a.map { |record| record[0] }.join("\n")
|
|
42
|
+
$logger.info "\n"
|
|
43
|
+
end
|
|
44
|
+
|
|
45
|
+
('A'..'Z').each do |letter|
|
|
46
|
+
# %w{ E }.each do |letter|
|
|
47
|
+
@left = RemoteTable.new :url => "http://www.faa.gov/air_traffic/publications/atpubs/CNT/5-2-#{letter}.htm",
|
|
48
|
+
:encoding => 'US-ASCII',
|
|
49
|
+
:row_xpath => '//table/tr[2]/td/table/tr',
|
|
50
|
+
:column_xpath => 'td'
|
|
51
|
+
|
|
52
|
+
d = LooseTightDictionary.new @right, :tightenings => @tightenings, :restrictions => @restrictions, :blockings => @blockings, :logger => $logger, :tee => $tee
|
|
53
|
+
d.left_reader = lambda { |record| record['Manufacturer'] + ' ' + record['Model'] }
|
|
54
|
+
d.right_reader = lambda { |record| record['Manufacturer'] + ' ' + record['Long Name'] }
|
|
55
|
+
d.positives = @positives
|
|
56
|
+
d.negatives = @negatives
|
|
57
|
+
d.check @left
|
|
58
|
+
end
|
|
Binary file
|
|
@@ -0,0 +1,270 @@
|
|
|
1
|
+
require 'active_support'
|
|
2
|
+
require 'active_support/version'
|
|
3
|
+
%w{
|
|
4
|
+
active_support/core_ext/string
|
|
5
|
+
}.each do |active_support_3_requirement|
|
|
6
|
+
require active_support_3_requirement
|
|
7
|
+
end if ActiveSupport::VERSION::MAJOR == 3
|
|
8
|
+
require 'amatch'
|
|
9
|
+
require 'andand'
|
|
10
|
+
require 'fastercsv'
|
|
11
|
+
|
|
12
|
+
class LooseTightDictionary
|
|
13
|
+
class MissedChecks < RuntimeError; end
|
|
14
|
+
class Mismatch < RuntimeError; end
|
|
15
|
+
class FalsePositive < RuntimeError; end
|
|
16
|
+
|
|
17
|
+
class T
|
|
18
|
+
attr_reader :str, :tightened_str
|
|
19
|
+
def initialize(str, tightened_str)
|
|
20
|
+
@str = str
|
|
21
|
+
@tightened_str = tightened_str
|
|
22
|
+
end
|
|
23
|
+
|
|
24
|
+
def tightened?
|
|
25
|
+
str != tightened_str
|
|
26
|
+
end
|
|
27
|
+
|
|
28
|
+
def prefix_and_score(other)
|
|
29
|
+
prefix = [ tightened_str.length, other.tightened_str.length ].min if tightened? and other.tightened?
|
|
30
|
+
score = if prefix
|
|
31
|
+
tightened_str.first(prefix).pair_distance_similar other.tightened_str.first(prefix)
|
|
32
|
+
else
|
|
33
|
+
tightened_str.pair_distance_similar other.tightened_str
|
|
34
|
+
end
|
|
35
|
+
[ prefix, score ]
|
|
36
|
+
end
|
|
37
|
+
end
|
|
38
|
+
|
|
39
|
+
include Amatch
|
|
40
|
+
|
|
41
|
+
attr_reader :right_records
|
|
42
|
+
attr_reader :tightenings
|
|
43
|
+
attr_reader :restrictions
|
|
44
|
+
attr_reader :blockings
|
|
45
|
+
attr_reader :logger
|
|
46
|
+
attr_reader :tee
|
|
47
|
+
attr_reader :case_sensitive
|
|
48
|
+
|
|
49
|
+
attr_accessor :positives
|
|
50
|
+
attr_accessor :negatives
|
|
51
|
+
attr_accessor :left_reader
|
|
52
|
+
attr_accessor :right_reader
|
|
53
|
+
|
|
54
|
+
def initialize(right_records, options = {})
|
|
55
|
+
@right_records = right_records
|
|
56
|
+
@tightenings = options[:tightenings] || Array.new
|
|
57
|
+
@restrictions = options[:restrictions] || Array.new
|
|
58
|
+
@blockings = options[:blockings] || Array.new
|
|
59
|
+
@left_reader = options[:left_reader]
|
|
60
|
+
@right_reader = options[:right_reader]
|
|
61
|
+
@positives = options[:positives]
|
|
62
|
+
@negatives = options[:negatives]
|
|
63
|
+
@logger = options[:logger]
|
|
64
|
+
@tee = options[:tee]
|
|
65
|
+
@case_sensitive = options[:case_sensitive] || false
|
|
66
|
+
end
|
|
67
|
+
|
|
68
|
+
def inline_check(left_record, right_record)
|
|
69
|
+
return unless positives.present? or negatives.present?
|
|
70
|
+
|
|
71
|
+
left = read_left left_record
|
|
72
|
+
right = read_right right_record
|
|
73
|
+
|
|
74
|
+
if p = positives.andand.detect { |record| record[0] == left }
|
|
75
|
+
correct_right = p[1]
|
|
76
|
+
else
|
|
77
|
+
correct_right = :ignore
|
|
78
|
+
end
|
|
79
|
+
|
|
80
|
+
if n = negatives.andand.detect { |record| record[0] == left }
|
|
81
|
+
incorrect_right = n[1]
|
|
82
|
+
else
|
|
83
|
+
incorrect_right = :ignore
|
|
84
|
+
end
|
|
85
|
+
|
|
86
|
+
if correct_right != :ignore and right != correct_right
|
|
87
|
+
logger.andand.debug " Mismatch! (should be #{correct_right})"
|
|
88
|
+
raise Mismatch
|
|
89
|
+
end
|
|
90
|
+
|
|
91
|
+
if incorrect_right != :ignore and right == incorrect_right
|
|
92
|
+
logger.andand.debug " False positive! (should NOT be #{incorrect_right})"
|
|
93
|
+
raise FalsePositive
|
|
94
|
+
end
|
|
95
|
+
end
|
|
96
|
+
|
|
97
|
+
def check(left_records)
|
|
98
|
+
unless positives.present? or negatives.present?
|
|
99
|
+
logger.andand.info "You didn't define any positives or negatives, so running check doesn't do anything"
|
|
100
|
+
return
|
|
101
|
+
end
|
|
102
|
+
left_records.each do |left_record|
|
|
103
|
+
right_record = left_to_right left_record
|
|
104
|
+
inline_check left_record, right_record
|
|
105
|
+
tee.andand.puts [ read_left(left_record), read_right(right_record), $ltd_1 ].flatten.to_csv
|
|
106
|
+
end
|
|
107
|
+
end
|
|
108
|
+
|
|
109
|
+
def left_to_right(left_record)
|
|
110
|
+
left = read_left left_record
|
|
111
|
+
restricted_left = restrict left
|
|
112
|
+
blocking_left = blocking left
|
|
113
|
+
t_options_left = t_options left
|
|
114
|
+
history = Hash.new
|
|
115
|
+
right_record = right_records.select { |record| blocking_left.nil? or blocking_left.match(read_right(record)) }.max do |a_record, b_record|
|
|
116
|
+
a = read_right a_record
|
|
117
|
+
b = read_right b_record
|
|
118
|
+
restricted_a = restrict a
|
|
119
|
+
restricted_b = restrict b
|
|
120
|
+
if restricted_left and restricted_a and restricted_b and restricted_left != restricted_a and restricted_left != restricted_b
|
|
121
|
+
# neither would ever work, so randomly rank one over the other
|
|
122
|
+
rand(2) == 1 ? -1 : 1
|
|
123
|
+
elsif restricted_left and restricted_a and restricted_left != restricted_a
|
|
124
|
+
-1
|
|
125
|
+
elsif restricted_left and restricted_b and restricted_left != restricted_b
|
|
126
|
+
1
|
|
127
|
+
else
|
|
128
|
+
t_left_a, t_right_a = optimize t_options_left, t_options(a)
|
|
129
|
+
t_left_b, t_right_b = optimize t_options_left, t_options(b)
|
|
130
|
+
a_prefix, a_score = t_left_a.prefix_and_score t_right_a
|
|
131
|
+
b_prefix, b_score = t_left_b.prefix_and_score t_right_b
|
|
132
|
+
history[a_record] = [t_left_a.tightened_str, t_right_a.tightened_str, a_prefix ? a_prefix : 'NULL', a_score]
|
|
133
|
+
history[b_record] = [t_left_b.tightened_str, t_right_b.tightened_str, b_prefix ? b_prefix : 'NULL', b_score]
|
|
134
|
+
|
|
135
|
+
yep_dd = ($ltd_dd_right and $ltd_dd_left and [t_left_a, t_left_b].any? { |f| f.str =~ $ltd_dd_left } and [t_right_a, t_right_b].any? { |f| f.str =~ $ltd_dd_right } and (!$ltd_dd_left_not or [t_left_a, t_left_b].none? { |f| f.str =~ $ltd_dd_left_not }))
|
|
136
|
+
|
|
137
|
+
if $ltd_dd_print and yep_dd
|
|
138
|
+
logger.andand.debug t_left_a.inspect
|
|
139
|
+
logger.andand.debug t_right_a.inspect
|
|
140
|
+
logger.andand.debug t_left_b.inspect
|
|
141
|
+
logger.andand.debug t_right_b.inspect
|
|
142
|
+
logger.andand.debug
|
|
143
|
+
end
|
|
144
|
+
|
|
145
|
+
z = 1
|
|
146
|
+
debugger if yep_dd
|
|
147
|
+
z = 1
|
|
148
|
+
|
|
149
|
+
if a_score != b_score
|
|
150
|
+
a_score <=> b_score
|
|
151
|
+
elsif a_prefix and b_prefix and a_prefix != b_prefix
|
|
152
|
+
a_prefix <=> b_prefix
|
|
153
|
+
else
|
|
154
|
+
b.length <=> a.length
|
|
155
|
+
end
|
|
156
|
+
end
|
|
157
|
+
end
|
|
158
|
+
$ltd_1 = history[right_record]
|
|
159
|
+
right = read_right right_record
|
|
160
|
+
restricted_right = restrict right
|
|
161
|
+
z = 1
|
|
162
|
+
debugger if $ltd_left.andand.match(left) or $ltd_right.andand.match(right)
|
|
163
|
+
z = 1
|
|
164
|
+
return if restricted_left and restricted_right and restricted_left != restricted_right
|
|
165
|
+
inline_check left_record, right_record
|
|
166
|
+
right_record
|
|
167
|
+
end
|
|
168
|
+
|
|
169
|
+
def optimize(t_options_left, t_options_right)
|
|
170
|
+
cart_prod(t_options_left, t_options_right).max do |a, b|
|
|
171
|
+
t_left_a, t_right_a = a
|
|
172
|
+
t_left_b, t_right_b = b
|
|
173
|
+
|
|
174
|
+
a_prefix, a_score = t_left_a.prefix_and_score t_right_a
|
|
175
|
+
b_prefix, b_score = t_left_b.prefix_and_score t_right_b
|
|
176
|
+
|
|
177
|
+
yep_ddd = ($ltd_ddd_right and $ltd_ddd_left and [t_left_a, t_left_b].any? { |f| f.str =~ $ltd_ddd_left } and [t_right_a, t_right_b].any? { |f| f.str =~ $ltd_ddd_right } and (!$ltd_ddd_left_not or [t_left_a, t_left_b].none? { |f| f.str =~ $ltd_ddd_left_not }))
|
|
178
|
+
|
|
179
|
+
if $ltd_ddd_print and yep_ddd
|
|
180
|
+
logger.andand.debug t_left_a.inspect
|
|
181
|
+
logger.andand.debug t_right_a.inspect
|
|
182
|
+
logger.andand.debug t_left_b.inspect
|
|
183
|
+
logger.andand.debug t_right_b.inspect
|
|
184
|
+
logger.andand.debug
|
|
185
|
+
end
|
|
186
|
+
|
|
187
|
+
z = 1
|
|
188
|
+
debugger if yep_ddd
|
|
189
|
+
z = 1
|
|
190
|
+
|
|
191
|
+
if a_score != b_score
|
|
192
|
+
a_score <=> b_score
|
|
193
|
+
elsif a_prefix and b_prefix and a_prefix != b_prefix
|
|
194
|
+
a_prefix <=> b_prefix
|
|
195
|
+
else
|
|
196
|
+
# randomly choose
|
|
197
|
+
# maybe later i can figure out how big the inputs are and apply occam's razor
|
|
198
|
+
rand(2) == 1 ? -1 : 1
|
|
199
|
+
end
|
|
200
|
+
end
|
|
201
|
+
end
|
|
202
|
+
|
|
203
|
+
def t_options(str)
|
|
204
|
+
return @_t_options[str] if @_t_options.andand.has_key?(str)
|
|
205
|
+
@_t_options ||= Hash.new
|
|
206
|
+
ary = Array.new
|
|
207
|
+
ary << T.new(str, str)
|
|
208
|
+
tightenings.each do |tightening|
|
|
209
|
+
if literal_regexp(tightening[0]).match str
|
|
210
|
+
ary << T.new(str, $~.captures.compact.join)
|
|
211
|
+
end
|
|
212
|
+
end
|
|
213
|
+
@_t_options[str] = ary
|
|
214
|
+
end
|
|
215
|
+
|
|
216
|
+
def blocking(str)
|
|
217
|
+
return @_blocking[str] if @_blocking.andand.has_key?(str)
|
|
218
|
+
@_blocking ||= Hash.new
|
|
219
|
+
blockings.each do |blocking|
|
|
220
|
+
regexp = literal_regexp blocking[0]
|
|
221
|
+
if regexp.match str
|
|
222
|
+
return @_blocking[str] = regexp
|
|
223
|
+
end
|
|
224
|
+
end
|
|
225
|
+
@_blocking[str] = nil
|
|
226
|
+
end
|
|
227
|
+
|
|
228
|
+
def restrict(str)
|
|
229
|
+
return @_restrict[str] if @_restrict.andand.has_key?(str)
|
|
230
|
+
@_restrict ||= Hash.new
|
|
231
|
+
restrictions.each do |restriction|
|
|
232
|
+
if literal_regexp(restriction[0]).match str
|
|
233
|
+
retval = $~.captures.compact.join
|
|
234
|
+
retval = retval.downcase unless case_sensitive
|
|
235
|
+
return @_restrict[str] = retval
|
|
236
|
+
end
|
|
237
|
+
end
|
|
238
|
+
@_restrict[str] = nil
|
|
239
|
+
end
|
|
240
|
+
|
|
241
|
+
def literal_regexp(str)
|
|
242
|
+
return @_literal_regexp[str] if @_literal_regexp.andand.has_key? str
|
|
243
|
+
@_literal_regexp ||= Hash.new
|
|
244
|
+
raw_regexp_options = str.split('/').last
|
|
245
|
+
i = (!case_sensitive or raw_regexp_options.include?('i')) ? Regexp::IGNORECASE : nil
|
|
246
|
+
m = raw_regexp_options.include?('m') ? Regexp::MULTILINE : nil
|
|
247
|
+
x = raw_regexp_options.include?('x') ? Regexp::EXTENDED : nil
|
|
248
|
+
@_literal_regexp[str] = Regexp.new str.gsub(/\A\/|\/([ixm]*)\z/, ''), (i||m||x), 'U'
|
|
249
|
+
end
|
|
250
|
+
|
|
251
|
+
def read_left(left_record)
|
|
252
|
+
return if left_record.nil?
|
|
253
|
+
left_reader ? left_reader.call(left_record) : left_record[0]
|
|
254
|
+
end
|
|
255
|
+
|
|
256
|
+
def read_right(right_record)
|
|
257
|
+
return if right_record.nil?
|
|
258
|
+
right_reader ? right_reader.call(right_record) : right_record[0]
|
|
259
|
+
end
|
|
260
|
+
|
|
261
|
+
# Thanks William James!
|
|
262
|
+
# http://www.ruby-forum.com/topic/95519#200484
|
|
263
|
+
def cart_prod(*args)
|
|
264
|
+
args.inject([[]]){|old,lst|
|
|
265
|
+
new = []
|
|
266
|
+
lst.each{|e| new += old.map{|c| c.dup << e }}
|
|
267
|
+
new
|
|
268
|
+
}
|
|
269
|
+
end
|
|
270
|
+
end
|
data/test/helper.rb
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
require 'rubygems'
|
|
2
|
+
require 'test/unit'
|
|
3
|
+
require 'shoulda'
|
|
4
|
+
require 'logger'
|
|
5
|
+
require 'ruby-debug'
|
|
6
|
+
|
|
7
|
+
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
|
|
8
|
+
$LOAD_PATH.unshift(File.dirname(__FILE__))
|
|
9
|
+
require 'loose_tight_dictionary'
|
|
10
|
+
|
|
11
|
+
class Test::Unit::TestCase
|
|
12
|
+
end
|
|
@@ -0,0 +1,211 @@
|
|
|
1
|
+
require 'helper'
|
|
2
|
+
|
|
3
|
+
require 'remote_table'
|
|
4
|
+
|
|
5
|
+
# $logger = Logger.new STDERR
|
|
6
|
+
# $logger.level = Logger::INFO
|
|
7
|
+
# $tee = STDOUT
|
|
8
|
+
|
|
9
|
+
class TestLooseTightDictionary < Test::Unit::TestCase
|
|
10
|
+
def setup
|
|
11
|
+
clear_ltd
|
|
12
|
+
|
|
13
|
+
# dh 8 400
|
|
14
|
+
@a_left = ['DE HAVILLAND CANADA DHC8400 Dash 8']
|
|
15
|
+
@a_right = ['DEHAVILLAND DEHAVILLAND DHC8-400 DASH-8']
|
|
16
|
+
# dh 88
|
|
17
|
+
@b_left = ['ABCDEFG DH88 HIJKLMNOP']
|
|
18
|
+
# dh 89
|
|
19
|
+
@c_right = ['ABCDEFG DH89 HIJKLMNOP']
|
|
20
|
+
# dh 8 200
|
|
21
|
+
@d_left = ['DE HAVILLAND CANADA DHC8200 Dash 8']
|
|
22
|
+
@d_right = ['BOMBARDIER DEHAVILLAND DHC8-200Q DASH-8']
|
|
23
|
+
@d_lookalike = ['ABCD DHC8200 Dash 8']
|
|
24
|
+
|
|
25
|
+
@t_1 = [ '/(dh)c?-?(\d{0,2})-?(\d{0,4})(?:.*?)(dash|\z)/i', 'good tightening for de havilland' ]
|
|
26
|
+
|
|
27
|
+
@d_1 = [ '/(dh)c?-?(\d{0,2})-?(\d{0,4})(?:.*?)(dash|\z)/i', 'good restriction for de havilland' ]
|
|
28
|
+
|
|
29
|
+
@left = [
|
|
30
|
+
@a_left,
|
|
31
|
+
@b_left,
|
|
32
|
+
['DE HAVILLAND DH89 Dragon Rapide'],
|
|
33
|
+
['DE HAVILLAND CANADA DHC8100 Dash 8 (E9, CT142, CC142)'],
|
|
34
|
+
@d_left,
|
|
35
|
+
['DE HAVILLAND CANADA DHC8300 Dash 8'],
|
|
36
|
+
['DE HAVILLAND DH90 Dragonfly']
|
|
37
|
+
]
|
|
38
|
+
@right = [
|
|
39
|
+
@a_right,
|
|
40
|
+
@c_right,
|
|
41
|
+
@d_right,
|
|
42
|
+
['DEHAVILLAND DEHAVILLAND DHC8-100 DASH-8'],
|
|
43
|
+
['DEHAVILLAND DEHAVILLAND TWIN OTTER DHC-6']
|
|
44
|
+
]
|
|
45
|
+
@tightenings = []
|
|
46
|
+
@restrictions = []
|
|
47
|
+
@blockings = []
|
|
48
|
+
@positives = []
|
|
49
|
+
@negatives = []
|
|
50
|
+
end
|
|
51
|
+
|
|
52
|
+
def clear_ltd
|
|
53
|
+
@_ltd = nil
|
|
54
|
+
end
|
|
55
|
+
|
|
56
|
+
def ltd
|
|
57
|
+
@_ltd ||= LooseTightDictionary.new @right,
|
|
58
|
+
:tightenings => @tightenings,
|
|
59
|
+
:restrictions => @restrictions,
|
|
60
|
+
:blockings => @blockings,
|
|
61
|
+
:positives => @positives,
|
|
62
|
+
:negatives => @negatives,
|
|
63
|
+
:logger => $logger,
|
|
64
|
+
:tee => $tee
|
|
65
|
+
end
|
|
66
|
+
|
|
67
|
+
if ENV['NEW'] == 'true' or ENV['ALL'] == 'true'
|
|
68
|
+
end
|
|
69
|
+
|
|
70
|
+
if ENV['OLD'] == 'true' or ENV['ALL'] == 'true'
|
|
71
|
+
should "have a false match without blocking" do
|
|
72
|
+
# @d_left will be our victim
|
|
73
|
+
@right.push @d_lookalike
|
|
74
|
+
@tightenings.push @t_1
|
|
75
|
+
|
|
76
|
+
assert_equal @d_lookalike, ltd.left_to_right(@d_left)
|
|
77
|
+
end
|
|
78
|
+
|
|
79
|
+
should "do blocking if the left matches a block" do
|
|
80
|
+
# @d_left will be our victim
|
|
81
|
+
@right.push @d_lookalike
|
|
82
|
+
@tightenings.push @t_1
|
|
83
|
+
@blockings.push ['/(bombardier|de ?havilland)/i']
|
|
84
|
+
|
|
85
|
+
assert_equal @d_right, ltd.left_to_right(@d_left)
|
|
86
|
+
end
|
|
87
|
+
|
|
88
|
+
should "not do blocking if the left doesn't match any blockings" do
|
|
89
|
+
@tightenings.push @t_1
|
|
90
|
+
@blockings.push ['/(bombardier|de ?havilland)/i']
|
|
91
|
+
|
|
92
|
+
assert_equal @d_right, ltd.left_to_right(@d_lookalike)
|
|
93
|
+
end
|
|
94
|
+
|
|
95
|
+
should "use the best score from all of the tightenings" do
|
|
96
|
+
x_left = ["BOEING 737100"]
|
|
97
|
+
x_right = ["BOEING BOEING 737-100/200"]
|
|
98
|
+
x_right_wrong = ["BOEING BOEING 737-900"]
|
|
99
|
+
@right.push x_right
|
|
100
|
+
@right.push x_right_wrong
|
|
101
|
+
@tightenings.push ['/(7\d)(7|0)-?\d{1,3}\/(\d\d\d)/i']
|
|
102
|
+
@tightenings.push ['/(7\d)(7|0)-?(\d{1,3}|[A-Z]{0,3})/i']
|
|
103
|
+
|
|
104
|
+
assert_equal x_right, ltd.left_to_right(x_left)
|
|
105
|
+
end
|
|
106
|
+
|
|
107
|
+
should "compare using prefixes if tightened key is shorter than correct match" do
|
|
108
|
+
x_left = ["BOEING 720"]
|
|
109
|
+
x_right = ["BOEING BOEING 720-000"]
|
|
110
|
+
x_right_wrong = ["BOEING BOEING 717-200"]
|
|
111
|
+
@right.push x_right
|
|
112
|
+
@right.push x_right_wrong
|
|
113
|
+
@tightenings.push @t_1
|
|
114
|
+
@tightenings.push ['/(7\d)(7|0)-?\d{1,3}\/(\d\d\d)/i']
|
|
115
|
+
@tightenings.push ['/(7\d)(7|0)-?(\d{1,3}|[A-Z]{0,3})/i']
|
|
116
|
+
|
|
117
|
+
assert_equal x_right, ltd.left_to_right(x_left)
|
|
118
|
+
end
|
|
119
|
+
|
|
120
|
+
should "use the shortest original input" do
|
|
121
|
+
x_left = ['De Havilland DHC8-777 Dash-8 Superstar']
|
|
122
|
+
x_right = ['DEHAVILLAND DEHAVILLAND DHC8-777 DASH-8 Superstar']
|
|
123
|
+
x_right_long = ['DEHAVILLAND DEHAVILLAND DHC8-777 DASH-8 Superstar/Supernova']
|
|
124
|
+
|
|
125
|
+
@right.push x_right_long
|
|
126
|
+
@right.push x_right
|
|
127
|
+
@tightenings.push @t_1
|
|
128
|
+
|
|
129
|
+
assert_equal x_right, ltd.left_to_right(x_left)
|
|
130
|
+
end
|
|
131
|
+
|
|
132
|
+
should "perform lookups left to right" do
|
|
133
|
+
assert_equal @a_right, ltd.left_to_right(@a_left)
|
|
134
|
+
end
|
|
135
|
+
|
|
136
|
+
should "succeed if there are no checks" do
|
|
137
|
+
assert_nothing_raised do
|
|
138
|
+
ltd.check @left
|
|
139
|
+
end
|
|
140
|
+
end
|
|
141
|
+
|
|
142
|
+
should "succeed if the positive checks just work" do
|
|
143
|
+
@positives.push [ @a_left[0], @a_right[0] ]
|
|
144
|
+
|
|
145
|
+
assert_nothing_raised do
|
|
146
|
+
ltd.check @left
|
|
147
|
+
end
|
|
148
|
+
end
|
|
149
|
+
|
|
150
|
+
should "fail if positive checks don't work" do
|
|
151
|
+
@positives.push [ @d_left[0], @d_right[0] ]
|
|
152
|
+
|
|
153
|
+
assert_raises(LooseTightDictionary::Mismatch) do
|
|
154
|
+
ltd.check @left
|
|
155
|
+
end
|
|
156
|
+
end
|
|
157
|
+
|
|
158
|
+
should "succeed if proper tightening is applied" do
|
|
159
|
+
@positives.push [ @d_left[0], @d_right[0] ]
|
|
160
|
+
@tightenings.push @t_1
|
|
161
|
+
|
|
162
|
+
assert_nothing_raised do
|
|
163
|
+
ltd.check @left
|
|
164
|
+
end
|
|
165
|
+
end
|
|
166
|
+
|
|
167
|
+
should "use a Google Docs spreadsheet as a source of tightenings" do
|
|
168
|
+
@positives.push [ @d_left[0], @d_right[0] ]
|
|
169
|
+
@tightenings = RemoteTable.new :url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=0&output=csv', :headers => false
|
|
170
|
+
|
|
171
|
+
assert_nothing_raised do
|
|
172
|
+
ltd.check @left
|
|
173
|
+
end
|
|
174
|
+
end
|
|
175
|
+
|
|
176
|
+
should "fail if negative checks don't work" do
|
|
177
|
+
@negatives.push [ @b_left[0], @c_right[0] ]
|
|
178
|
+
|
|
179
|
+
assert_raises(LooseTightDictionary::FalsePositive) do
|
|
180
|
+
ltd.check @left
|
|
181
|
+
end
|
|
182
|
+
end
|
|
183
|
+
|
|
184
|
+
should "do inline checking" do
|
|
185
|
+
@negatives.push [ @b_left[0], @c_right[0] ]
|
|
186
|
+
|
|
187
|
+
assert_raises(LooseTightDictionary::FalsePositive) do
|
|
188
|
+
ltd.left_to_right @b_left
|
|
189
|
+
end
|
|
190
|
+
end
|
|
191
|
+
|
|
192
|
+
should "fail if negative checks don't work, even with tightening" do
|
|
193
|
+
@negatives.push [ @b_left[0], @c_right[0] ]
|
|
194
|
+
@tightenings.push @t_1
|
|
195
|
+
|
|
196
|
+
assert_raises(LooseTightDictionary::FalsePositive) do
|
|
197
|
+
ltd.check @left
|
|
198
|
+
end
|
|
199
|
+
end
|
|
200
|
+
|
|
201
|
+
should "succeed if proper restriction is applied" do
|
|
202
|
+
@negatives.push [ @b_left[0], @c_right[0] ]
|
|
203
|
+
@positives.push [ @d_left[0], @d_right[0] ]
|
|
204
|
+
@restrictions.push @d_1
|
|
205
|
+
|
|
206
|
+
assert_nothing_raised do
|
|
207
|
+
ltd.check @left
|
|
208
|
+
end
|
|
209
|
+
end
|
|
210
|
+
end
|
|
211
|
+
end
|
metadata
ADDED
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
|
+
name: loose_tight_dictionary
|
|
3
|
+
version: !ruby/object:Gem::Version
|
|
4
|
+
prerelease: false
|
|
5
|
+
segments:
|
|
6
|
+
- 0
|
|
7
|
+
- 0
|
|
8
|
+
- 1
|
|
9
|
+
version: 0.0.1
|
|
10
|
+
platform: ruby
|
|
11
|
+
authors:
|
|
12
|
+
- Seamus Abshere
|
|
13
|
+
autorequire:
|
|
14
|
+
bindir: bin
|
|
15
|
+
cert_chain: []
|
|
16
|
+
|
|
17
|
+
date: 2010-04-28 00:00:00 -04:00
|
|
18
|
+
default_executable:
|
|
19
|
+
dependencies:
|
|
20
|
+
- !ruby/object:Gem::Dependency
|
|
21
|
+
name: shoulda
|
|
22
|
+
prerelease: false
|
|
23
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
|
24
|
+
requirements:
|
|
25
|
+
- - ">="
|
|
26
|
+
- !ruby/object:Gem::Version
|
|
27
|
+
segments:
|
|
28
|
+
- 0
|
|
29
|
+
version: "0"
|
|
30
|
+
type: :development
|
|
31
|
+
version_requirements: *id001
|
|
32
|
+
- !ruby/object:Gem::Dependency
|
|
33
|
+
name: remote_table
|
|
34
|
+
prerelease: false
|
|
35
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
|
36
|
+
requirements:
|
|
37
|
+
- - ">="
|
|
38
|
+
- !ruby/object:Gem::Version
|
|
39
|
+
segments:
|
|
40
|
+
- 0
|
|
41
|
+
- 2
|
|
42
|
+
- 16
|
|
43
|
+
version: 0.2.16
|
|
44
|
+
type: :development
|
|
45
|
+
version_requirements: *id002
|
|
46
|
+
- !ruby/object:Gem::Dependency
|
|
47
|
+
name: activesupport
|
|
48
|
+
prerelease: false
|
|
49
|
+
requirement: &id003 !ruby/object:Gem::Requirement
|
|
50
|
+
requirements:
|
|
51
|
+
- - ">="
|
|
52
|
+
- !ruby/object:Gem::Version
|
|
53
|
+
segments:
|
|
54
|
+
- 2
|
|
55
|
+
- 3
|
|
56
|
+
- 4
|
|
57
|
+
version: 2.3.4
|
|
58
|
+
type: :runtime
|
|
59
|
+
version_requirements: *id003
|
|
60
|
+
- !ruby/object:Gem::Dependency
|
|
61
|
+
name: fastercsv
|
|
62
|
+
prerelease: false
|
|
63
|
+
requirement: &id004 !ruby/object:Gem::Requirement
|
|
64
|
+
requirements:
|
|
65
|
+
- - ">="
|
|
66
|
+
- !ruby/object:Gem::Version
|
|
67
|
+
segments:
|
|
68
|
+
- 1
|
|
69
|
+
- 5
|
|
70
|
+
- 3
|
|
71
|
+
version: 1.5.3
|
|
72
|
+
type: :runtime
|
|
73
|
+
version_requirements: *id004
|
|
74
|
+
- !ruby/object:Gem::Dependency
|
|
75
|
+
name: andand
|
|
76
|
+
prerelease: false
|
|
77
|
+
requirement: &id005 !ruby/object:Gem::Requirement
|
|
78
|
+
requirements:
|
|
79
|
+
- - ">="
|
|
80
|
+
- !ruby/object:Gem::Version
|
|
81
|
+
segments:
|
|
82
|
+
- 1
|
|
83
|
+
- 3
|
|
84
|
+
- 1
|
|
85
|
+
version: 1.3.1
|
|
86
|
+
type: :runtime
|
|
87
|
+
version_requirements: *id005
|
|
88
|
+
- !ruby/object:Gem::Dependency
|
|
89
|
+
name: amatch
|
|
90
|
+
prerelease: false
|
|
91
|
+
requirement: &id006 !ruby/object:Gem::Requirement
|
|
92
|
+
requirements:
|
|
93
|
+
- - ">="
|
|
94
|
+
- !ruby/object:Gem::Version
|
|
95
|
+
segments:
|
|
96
|
+
- 0
|
|
97
|
+
- 2
|
|
98
|
+
- 5
|
|
99
|
+
version: 0.2.5
|
|
100
|
+
type: :runtime
|
|
101
|
+
version_requirements: *id006
|
|
102
|
+
description: Create dictionaries that link rows between two tables (left and right) using loose matching (string similarity) by default and tight matching (regexp) by request.
|
|
103
|
+
email: seamus@abshere.net
|
|
104
|
+
executables: []
|
|
105
|
+
|
|
106
|
+
extensions: []
|
|
107
|
+
|
|
108
|
+
extra_rdoc_files:
|
|
109
|
+
- LICENSE
|
|
110
|
+
- README.rdoc
|
|
111
|
+
files:
|
|
112
|
+
- .document
|
|
113
|
+
- .gitignore
|
|
114
|
+
- LICENSE
|
|
115
|
+
- README.rdoc
|
|
116
|
+
- Rakefile
|
|
117
|
+
- VERSION
|
|
118
|
+
- examples/icao-bts.rb
|
|
119
|
+
- examples/icao-bts.xls
|
|
120
|
+
- lib/loose_tight_dictionary.rb
|
|
121
|
+
- test/helper.rb
|
|
122
|
+
- test/test_loose_tight_dictionary.rb
|
|
123
|
+
has_rdoc: true
|
|
124
|
+
homepage: http://github.com/seamusabshere/loose_tight_dictionary
|
|
125
|
+
licenses: []
|
|
126
|
+
|
|
127
|
+
post_install_message:
|
|
128
|
+
rdoc_options:
|
|
129
|
+
- --charset=UTF-8
|
|
130
|
+
require_paths:
|
|
131
|
+
- lib
|
|
132
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
|
133
|
+
requirements:
|
|
134
|
+
- - ">="
|
|
135
|
+
- !ruby/object:Gem::Version
|
|
136
|
+
segments:
|
|
137
|
+
- 0
|
|
138
|
+
version: "0"
|
|
139
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
140
|
+
requirements:
|
|
141
|
+
- - ">="
|
|
142
|
+
- !ruby/object:Gem::Version
|
|
143
|
+
segments:
|
|
144
|
+
- 0
|
|
145
|
+
version: "0"
|
|
146
|
+
requirements: []
|
|
147
|
+
|
|
148
|
+
rubyforge_project:
|
|
149
|
+
rubygems_version: 1.3.6
|
|
150
|
+
signing_key:
|
|
151
|
+
specification_version: 3
|
|
152
|
+
summary: Allows iterative development of dictionaries for big data sets.
|
|
153
|
+
test_files:
|
|
154
|
+
- test/helper.rb
|
|
155
|
+
- test/test_loose_tight_dictionary.rb
|
|
156
|
+
- examples/icao-bts.rb
|