loose_tight_dictionary-ruby19 0.0.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +5 -0
- data/.gitignore +21 -0
- data/LICENSE +20 -0
- data/README.rdoc +56 -0
- data/Rakefile +58 -0
- data/VERSION +1 -0
- data/examples/first_name_matching.rb +23 -0
- data/examples/icao-bts.rb +58 -0
- data/examples/icao-bts.xls +0 -0
- data/lib/loose_tight_dictionary.rb +346 -0
- data/test/helper.rb +12 -0
- data/test/test_loose_tight_dictionary.rb +273 -0
- metadata +175 -0
data/.document
ADDED
data/.gitignore
ADDED
data/LICENSE
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
Copyright (c) 2009 Seamus Abshere
|
|
2
|
+
|
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
|
4
|
+
a copy of this software and associated documentation files (the
|
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
|
9
|
+
the following conditions:
|
|
10
|
+
|
|
11
|
+
The above copyright notice and this permission notice shall be
|
|
12
|
+
included in all copies or substantial portions of the Software.
|
|
13
|
+
|
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.rdoc
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
= loose_tight_dictionary
|
|
2
|
+
|
|
3
|
+
Match things based on string similarity (using the Pair Distance algorithm) and regular expressions.
|
|
4
|
+
|
|
5
|
+
= Quickstart
|
|
6
|
+
|
|
7
|
+
>> right_records = [ 'seamus', 'andy', 'ben' ]
|
|
8
|
+
=> [...]
|
|
9
|
+
>> left_record = 'Shamus Heaney'
|
|
10
|
+
=> [...]
|
|
11
|
+
>> d = LooseTightDictionary.new right_records
|
|
12
|
+
=> [...]
|
|
13
|
+
>> puts d.left_to_right left_record
|
|
14
|
+
=> 'seamus'
|
|
15
|
+
|
|
16
|
+
Try running the included example file:
|
|
17
|
+
|
|
18
|
+
$ ruby examples/first_name_matching.rb
|
|
19
|
+
Left side (input)
|
|
20
|
+
====================
|
|
21
|
+
Mr. Seamus
|
|
22
|
+
Sr. Andy
|
|
23
|
+
Master BenT
|
|
24
|
+
|
|
25
|
+
Right side (output)
|
|
26
|
+
====================
|
|
27
|
+
seamus
|
|
28
|
+
andy
|
|
29
|
+
ben
|
|
30
|
+
|
|
31
|
+
Results
|
|
32
|
+
====================
|
|
33
|
+
Left record (input) Right record (output) Prefix used (if any) Score
|
|
34
|
+
Mr. Seamus seamus NULL 0.666666666666667
|
|
35
|
+
Sr. Andy andy NULL 0.5
|
|
36
|
+
Master BenT ben NULL 0.2
|
|
37
|
+
|
|
38
|
+
= Improving dictionaries
|
|
39
|
+
|
|
40
|
+
Similarity matching will only get you so far.
|
|
41
|
+
|
|
42
|
+
TODO: regex usage
|
|
43
|
+
|
|
44
|
+
== Note on Patches/Pull Requests
|
|
45
|
+
|
|
46
|
+
* Fork the project.
|
|
47
|
+
* Make your feature addition or bug fix.
|
|
48
|
+
* Add tests for it. This is important so I don't break it in a
|
|
49
|
+
future version unintentionally.
|
|
50
|
+
* Commit, do not mess with rakefile, version, or history.
|
|
51
|
+
(If you want to have your own version, that is fine, but bump the version in a commit by itself so I can ignore it when I pull.)
|
|
52
|
+
* Send me a pull request. Bonus points for topic branches.
|
|
53
|
+
|
|
54
|
+
== Copyright
|
|
55
|
+
|
|
56
|
+
Copyright (c) 2010 Seamus Abshere. See LICENSE for details.
|
data/Rakefile
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
require 'rubygems'
|
|
2
|
+
require 'rake'
|
|
3
|
+
|
|
4
|
+
begin
|
|
5
|
+
require 'jeweler'
|
|
6
|
+
Jeweler::Tasks.new do |gem|
|
|
7
|
+
gem.name = "loose_tight_dictionary"
|
|
8
|
+
gem.summary = %Q{Allows iterative development of dictionaries for big data sets.}
|
|
9
|
+
gem.description = %Q{Create dictionaries that link rows between two tables (left and right) using loose matching (string similarity) by default and tight matching (regexp) by request.}
|
|
10
|
+
gem.email = "seamus@abshere.net"
|
|
11
|
+
gem.homepage = "http://github.com/seamusabshere/loose_tight_dictionary"
|
|
12
|
+
gem.authors = ["Seamus Abshere"]
|
|
13
|
+
gem.add_development_dependency "shoulda"
|
|
14
|
+
gem.add_development_dependency "remote_table", ">=0.2.19"
|
|
15
|
+
gem.add_dependency 'activesupport', '>=2.3.4'
|
|
16
|
+
gem.add_dependency 'fastercsv', '>=1.5.3'
|
|
17
|
+
gem.add_dependency 'andand', '>=1.3.1'
|
|
18
|
+
gem.add_dependency 'amatch', '>=0.2.5'
|
|
19
|
+
# gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
|
|
20
|
+
end
|
|
21
|
+
Jeweler::GemcutterTasks.new
|
|
22
|
+
rescue LoadError
|
|
23
|
+
puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
|
|
24
|
+
end
|
|
25
|
+
|
|
26
|
+
require 'rake/testtask'
|
|
27
|
+
Rake::TestTask.new(:test) do |test|
|
|
28
|
+
test.libs << 'lib' << 'test'
|
|
29
|
+
test.pattern = 'test/**/test_*.rb'
|
|
30
|
+
test.verbose = true
|
|
31
|
+
end
|
|
32
|
+
|
|
33
|
+
begin
|
|
34
|
+
require 'rcov/rcovtask'
|
|
35
|
+
Rcov::RcovTask.new do |test|
|
|
36
|
+
test.libs << 'test'
|
|
37
|
+
test.pattern = 'test/**/test_*.rb'
|
|
38
|
+
test.verbose = true
|
|
39
|
+
end
|
|
40
|
+
rescue LoadError
|
|
41
|
+
task :rcov do
|
|
42
|
+
abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
|
|
43
|
+
end
|
|
44
|
+
end
|
|
45
|
+
|
|
46
|
+
task :test => :check_dependencies
|
|
47
|
+
|
|
48
|
+
task :default => :test
|
|
49
|
+
|
|
50
|
+
require 'rake/rdoctask'
|
|
51
|
+
Rake::RDocTask.new do |rdoc|
|
|
52
|
+
version = File.exist?('VERSION') ? File.read('VERSION') : ""
|
|
53
|
+
|
|
54
|
+
rdoc.rdoc_dir = 'rdoc'
|
|
55
|
+
rdoc.title = "loose_tight_dictionary #{version}"
|
|
56
|
+
rdoc.rdoc_files.include('README*')
|
|
57
|
+
rdoc.rdoc_files.include('lib/**/*.rb')
|
|
58
|
+
end
|
data/VERSION
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
0.0.8
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
require 'rubygems'
|
|
3
|
+
# require 'loose_tight_dictionary'
|
|
4
|
+
require File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib', 'loose_tight_dictionary.rb'))
|
|
5
|
+
right_side = [ 'seamus', 'andy', 'ben' ]
|
|
6
|
+
left_side = [ 'Mr. Seamus', 'Sr. Andy', 'Master BenT' ]
|
|
7
|
+
|
|
8
|
+
puts "Left side (input)"
|
|
9
|
+
puts "=" * 20
|
|
10
|
+
puts left_side
|
|
11
|
+
puts
|
|
12
|
+
|
|
13
|
+
puts "Right side (output)"
|
|
14
|
+
puts "=" * 20
|
|
15
|
+
puts right_side
|
|
16
|
+
puts
|
|
17
|
+
|
|
18
|
+
puts "Results"
|
|
19
|
+
puts "=" * 20
|
|
20
|
+
d = LooseTightDictionary.new right_side, :tee => STDOUT, :tee_format => :fixed_width
|
|
21
|
+
d.check left_side
|
|
22
|
+
|
|
23
|
+
puts d.left_to_right 'Shamus Heaney'
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
|
|
3
|
+
require 'rubygems'
|
|
4
|
+
require 'remote_table'
|
|
5
|
+
require 'ruby-debug'
|
|
6
|
+
require 'logger'
|
|
7
|
+
require File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib', 'loose_tight_dictionary.rb'))
|
|
8
|
+
|
|
9
|
+
$logger = Logger.new STDERR
|
|
10
|
+
$logger.level = Logger::DEBUG
|
|
11
|
+
$logger.datetime_format = "%H:%M:%S"
|
|
12
|
+
# $tee = File.open('tee.csv', 'w')
|
|
13
|
+
$tee = STDOUT
|
|
14
|
+
|
|
15
|
+
# $ltd_left = /(super|bonanza)/i
|
|
16
|
+
# $ltd_right = /bonanza d-35/i
|
|
17
|
+
# $ltd_dd_left = /bonanza/i
|
|
18
|
+
# $ltd_dd_right = /musk/i
|
|
19
|
+
# $ltd_dd_left_not = /allison/i
|
|
20
|
+
# $ltd_dd_print = true
|
|
21
|
+
# $ltd_ddd_left = /bonanza/i
|
|
22
|
+
# $ltd_ddd_right = /musk/i
|
|
23
|
+
# $ltd_ddd_left_not = /allison/i
|
|
24
|
+
# $ltd_ddd_print = true
|
|
25
|
+
|
|
26
|
+
@right = RemoteTable.new :url => 'http://www.bts.gov/programs/airline_information/accounting_and_reporting_directives/csv/number_260.csv',
|
|
27
|
+
:select => lambda { |record| record['Aircraft Type'].to_i.between?(1, 998) and record['Manufacturer'].present? }
|
|
28
|
+
|
|
29
|
+
@tightenings = RemoteTable.new :url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=0&output=csv', :headers => false
|
|
30
|
+
|
|
31
|
+
@identities = RemoteTable.new :url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=3&output=csv', :headers => false
|
|
32
|
+
|
|
33
|
+
@blockings = RemoteTable.new :url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=4&output=csv', :headers => false
|
|
34
|
+
|
|
35
|
+
@positives = RemoteTable.new :url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=1&output=csv', :headers => false
|
|
36
|
+
|
|
37
|
+
@negatives = RemoteTable.new :url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=2&output=csv', :headers => false
|
|
38
|
+
|
|
39
|
+
%w{ tightenings identities blockings }.each do |name|
|
|
40
|
+
$logger.info name
|
|
41
|
+
$logger.info "\n" + instance_variable_get("@#{name}").to_a.map { |record| record[0] }.join("\n")
|
|
42
|
+
$logger.info "\n"
|
|
43
|
+
end
|
|
44
|
+
|
|
45
|
+
('A'..'Z').each do |letter|
|
|
46
|
+
# %w{ E }.each do |letter|
|
|
47
|
+
@left = RemoteTable.new :url => "http://www.faa.gov/air_traffic/publications/atpubs/CNT/5-2-#{letter}.htm",
|
|
48
|
+
:encoding => 'US-ASCII',
|
|
49
|
+
:row_xpath => '//table/tr[2]/td/table/tr',
|
|
50
|
+
:column_xpath => 'td'
|
|
51
|
+
|
|
52
|
+
d = LooseTightDictionary.new @right, :tightenings => @tightenings, :identities => @identities, :blockings => @blockings, :logger => $logger, :tee => $tee
|
|
53
|
+
d.left_reader = lambda { |record| record['Manufacturer'] + ' ' + record['Model'] }
|
|
54
|
+
d.right_reader = lambda { |record| record['Manufacturer'] + ' ' + record['Long Name'] }
|
|
55
|
+
d.positives = @positives
|
|
56
|
+
d.negatives = @negatives
|
|
57
|
+
d.check @left
|
|
58
|
+
end
|
|
Binary file
|
|
@@ -0,0 +1,346 @@
|
|
|
1
|
+
require 'active_support'
|
|
2
|
+
require 'active_support/version'
|
|
3
|
+
%w{
|
|
4
|
+
active_support/core_ext/string
|
|
5
|
+
}.each do |active_support_3_requirement|
|
|
6
|
+
require active_support_3_requirement
|
|
7
|
+
end if ActiveSupport::VERSION::MAJOR == 3
|
|
8
|
+
require 'amatch'
|
|
9
|
+
require 'andand'
|
|
10
|
+
if RUBY_VERSION >= "1.9"
|
|
11
|
+
require 'csv'
|
|
12
|
+
else
|
|
13
|
+
require 'fastercsv'
|
|
14
|
+
end
|
|
15
|
+
|
|
16
|
+
class LooseTightDictionary
|
|
17
|
+
class MissedChecks < RuntimeError; end
|
|
18
|
+
class Mismatch < RuntimeError; end
|
|
19
|
+
class FalsePositive < RuntimeError; end
|
|
20
|
+
|
|
21
|
+
class T
|
|
22
|
+
attr_reader :str, :tightened_str
|
|
23
|
+
def initialize(str, tightened_str)
|
|
24
|
+
@str = str
|
|
25
|
+
@tightened_str = tightened_str
|
|
26
|
+
end
|
|
27
|
+
|
|
28
|
+
def tightened?
|
|
29
|
+
str != tightened_str
|
|
30
|
+
end
|
|
31
|
+
|
|
32
|
+
def prefix_and_score(other)
|
|
33
|
+
prefix = [ tightened_str.length, other.tightened_str.length ].min if tightened? and other.tightened?
|
|
34
|
+
score = if prefix
|
|
35
|
+
tightened_str.first(prefix).pair_distance_similar other.tightened_str.first(prefix)
|
|
36
|
+
else
|
|
37
|
+
tightened_str.pair_distance_similar other.tightened_str
|
|
38
|
+
end
|
|
39
|
+
[ prefix, score ]
|
|
40
|
+
end
|
|
41
|
+
end
|
|
42
|
+
|
|
43
|
+
include Amatch
|
|
44
|
+
|
|
45
|
+
attr_reader :right_records
|
|
46
|
+
attr_reader :case_sensitive
|
|
47
|
+
|
|
48
|
+
attr_accessor :logger
|
|
49
|
+
attr_accessor :tee
|
|
50
|
+
attr_accessor :tee_format
|
|
51
|
+
attr_accessor :positives
|
|
52
|
+
attr_accessor :negatives
|
|
53
|
+
attr_accessor :left_reader
|
|
54
|
+
attr_accessor :right_reader
|
|
55
|
+
attr_accessor :blocking_only
|
|
56
|
+
|
|
57
|
+
def initialize(right_records, options = {})
|
|
58
|
+
@right_records = right_records
|
|
59
|
+
@_raw_tightenings = options[:tightenings] || Array.new
|
|
60
|
+
@_raw_identities = options[:identities] || Array.new
|
|
61
|
+
@_raw_blockings = options[:blockings] || Array.new
|
|
62
|
+
@left_reader = options[:left_reader]
|
|
63
|
+
@right_reader = options[:right_reader]
|
|
64
|
+
@positives = options[:positives]
|
|
65
|
+
@negatives = options[:negatives]
|
|
66
|
+
@logger = options[:logger]
|
|
67
|
+
@tee = options[:tee]
|
|
68
|
+
@tee_format = options[:tee_format] || :fixed_width
|
|
69
|
+
@case_sensitive = options[:case_sensitive] || false
|
|
70
|
+
@blocking_only = options[:blocking_only] || false
|
|
71
|
+
end
|
|
72
|
+
|
|
73
|
+
# def tightenings
|
|
74
|
+
# def identities
|
|
75
|
+
# def blockings
|
|
76
|
+
# Defines the memoized readers #tightenings, #identities and #blockings.
#
# Each reader compiles the first column of the matching raw rows
# (@_raw_tightenings, @_raw_identities, @_raw_blockings) into Regexp objects
# via #literal_regexp. Rows whose first column is blank are dropped, so the
# returned array only ever contains Regexps and callers can safely call
# #match on every element.
%w{ tightenings identities blockings }.each do |name|
  module_eval %{
    def #{name}
      @#{name} ||= begin
        regexps = @_raw_#{name}.map do |row|
          # a blank first column yields nil here; removed by compact below
          next if row[0].blank?
          literal_regexp row[0]
        end
        regexps.compact
      end
    end
  }
end
|
|
86
|
+
|
|
87
|
+
def blocking_only?
|
|
88
|
+
!!blocking_only
|
|
89
|
+
end
|
|
90
|
+
|
|
91
|
+
def inline_check(left_record, right_record)
|
|
92
|
+
return unless positives.present? or negatives.present?
|
|
93
|
+
|
|
94
|
+
left = read_left left_record
|
|
95
|
+
right = read_right right_record
|
|
96
|
+
|
|
97
|
+
if positive_record = positives.andand.detect { |record| record[0] == left }
|
|
98
|
+
correct_right = positive_record[1]
|
|
99
|
+
if correct_right.blank? and right.present?
|
|
100
|
+
logger.andand.debug " Mismatch! (should match SOMETHING)"
|
|
101
|
+
raise Mismatch
|
|
102
|
+
elsif right != correct_right
|
|
103
|
+
logger.andand.debug " Mismatch! (should be #{correct_right})"
|
|
104
|
+
raise Mismatch
|
|
105
|
+
end
|
|
106
|
+
end
|
|
107
|
+
|
|
108
|
+
if negative_record = negatives.andand.detect { |record| record[0] == left }
|
|
109
|
+
incorrect_right = negative_record[1]
|
|
110
|
+
if incorrect_right.blank? and right.present?
|
|
111
|
+
logger.andand.debug " False positive! (should NOT match ANYTHING)"
|
|
112
|
+
raise FalsePositive
|
|
113
|
+
elsif right == incorrect_right
|
|
114
|
+
logger.andand.debug " False positive! (should NOT be #{incorrect_right})"
|
|
115
|
+
raise FalsePositive
|
|
116
|
+
end
|
|
117
|
+
end
|
|
118
|
+
end
|
|
119
|
+
|
|
120
|
+
# Runs every left record through #left_to_right and, when a tee stream is
# configured, writes a header row followed by one diagnostic row per record.
#
# The diagnostic row is taken from the global $ltd_1, which #left_to_right
# assigns once it has ranked the candidates. The row is written from an
# ensure clause so it is still emitted when #left_to_right raises (for
# example Mismatch or FalsePositive from the inline checks).
#
# left_records - an enumerable of left-hand records.
#
# Returns whatever left_records.each returns. Exceptions raised by
# #left_to_right propagate to the caller.
def check(left_records)
  header = [ 'Left record (input)', 'Right record (output)', 'Prefix used (if any)', 'Score' ]

  # Formats and writes one row. Does nothing without a tee stream or without a
  # row, so a missing row can never raise from inside the ensure clause below
  # and hide the real exception.
  write_row = lambda do |row|
    next if tee.nil? or row.nil?
    case tee_format
    when :csv
      tee.puts row.flatten.to_csv
    when :fixed_width
      tee.puts row.map { |i| i.to_s.ljust(30) }.join
    end
  end

  write_row.call header

  left_records.each do |left_record|
    # Clear the previous record's row: left_to_right can return before it
    # assigns $ltd_1, and a stale row would otherwise be printed again.
    $ltd_1 = nil
    begin
      left_to_right left_record
    ensure
      write_row.call $ltd_1
    end
  end
end
|
|
142
|
+
|
|
143
|
+
def left_to_right(left_record)
|
|
144
|
+
left = read_left left_record
|
|
145
|
+
blocking_left = blocking left
|
|
146
|
+
return if blocking_only? and blocking_left.nil?
|
|
147
|
+
i_options_left = i_options left
|
|
148
|
+
t_options_left = t_options left
|
|
149
|
+
history = Hash.new
|
|
150
|
+
right_record = right_records.select do |right_record|
|
|
151
|
+
right = read_right right_record
|
|
152
|
+
blocking_right = blocking right
|
|
153
|
+
(not blocking_left and not blocking_right) or
|
|
154
|
+
(blocking_right and blocking_right.match(left)) or
|
|
155
|
+
(blocking_left and blocking_left.match(right))
|
|
156
|
+
end.max do |a_record, b_record|
|
|
157
|
+
a = read_right a_record
|
|
158
|
+
b = read_right b_record
|
|
159
|
+
i_options_a = i_options a
|
|
160
|
+
i_options_b = i_options b
|
|
161
|
+
collision_a = collision? i_options_left, i_options_a
|
|
162
|
+
collision_b = collision? i_options_left, i_options_b
|
|
163
|
+
if collision_a and collision_b
|
|
164
|
+
# neither would ever work, so randomly rank one over the other
|
|
165
|
+
rand(2) == 1 ? -1 : 1
|
|
166
|
+
elsif collision_a
|
|
167
|
+
-1
|
|
168
|
+
elsif collision_b
|
|
169
|
+
1
|
|
170
|
+
else
|
|
171
|
+
t_left_a, t_right_a = optimize t_options_left, t_options(a)
|
|
172
|
+
t_left_b, t_right_b = optimize t_options_left, t_options(b)
|
|
173
|
+
a_prefix, a_score = t_left_a.prefix_and_score t_right_a
|
|
174
|
+
b_prefix, b_score = t_left_b.prefix_and_score t_right_b
|
|
175
|
+
history[a_record] = [t_left_a.tightened_str, t_right_a.tightened_str, a_prefix ? a_prefix : 'NULL', a_score]
|
|
176
|
+
history[b_record] = [t_left_b.tightened_str, t_right_b.tightened_str, b_prefix ? b_prefix : 'NULL', b_score]
|
|
177
|
+
|
|
178
|
+
yep_dd = ($ltd_dd_right and $ltd_dd_left and [t_left_a, t_left_b].any? { |f| f.str =~ $ltd_dd_left } and [t_right_a, t_right_b].any? { |f| f.str =~ $ltd_dd_right } and (!$ltd_dd_left_not or [t_left_a, t_left_b].none? { |f| f.str =~ $ltd_dd_left_not }))
|
|
179
|
+
|
|
180
|
+
if $ltd_dd_print and yep_dd
|
|
181
|
+
logger.andand.debug t_left_a.inspect
|
|
182
|
+
logger.andand.debug t_right_a.inspect
|
|
183
|
+
logger.andand.debug t_left_b.inspect
|
|
184
|
+
logger.andand.debug t_right_b.inspect
|
|
185
|
+
logger.andand.debug
|
|
186
|
+
end
|
|
187
|
+
|
|
188
|
+
z = 1
|
|
189
|
+
debugger if yep_dd
|
|
190
|
+
z = 1
|
|
191
|
+
|
|
192
|
+
if a_score != b_score
|
|
193
|
+
a_score <=> b_score
|
|
194
|
+
elsif a_prefix and b_prefix and a_prefix != b_prefix
|
|
195
|
+
a_prefix <=> b_prefix
|
|
196
|
+
else
|
|
197
|
+
b.length <=> a.length
|
|
198
|
+
end
|
|
199
|
+
end
|
|
200
|
+
end
|
|
201
|
+
$ltd_1 = history[right_record]
|
|
202
|
+
right = read_right right_record
|
|
203
|
+
i_options_right = i_options right
|
|
204
|
+
z = 1
|
|
205
|
+
debugger if $ltd_left.andand.match(left) or $ltd_right.andand.match(right)
|
|
206
|
+
z = 1
|
|
207
|
+
if collision? i_options_left, i_options_right
|
|
208
|
+
$ltd_0 = nil
|
|
209
|
+
return
|
|
210
|
+
else
|
|
211
|
+
$ltd_0 = right_record
|
|
212
|
+
end
|
|
213
|
+
inline_check left_record, right_record
|
|
214
|
+
right_record
|
|
215
|
+
end
|
|
216
|
+
|
|
217
|
+
def optimize(t_options_left, t_options_right)
|
|
218
|
+
cart_prod(t_options_left, t_options_right).max do |a, b|
|
|
219
|
+
t_left_a, t_right_a = a
|
|
220
|
+
t_left_b, t_right_b = b
|
|
221
|
+
|
|
222
|
+
a_prefix, a_score = t_left_a.prefix_and_score t_right_a
|
|
223
|
+
b_prefix, b_score = t_left_b.prefix_and_score t_right_b
|
|
224
|
+
|
|
225
|
+
yep_ddd = ($ltd_ddd_right and $ltd_ddd_left and [t_left_a, t_left_b].any? { |f| f.str =~ $ltd_ddd_left } and [t_right_a, t_right_b].any? { |f| f.str =~ $ltd_ddd_right } and (!$ltd_ddd_left_not or [t_left_a, t_left_b].none? { |f| f.str =~ $ltd_ddd_left_not }))
|
|
226
|
+
|
|
227
|
+
if $ltd_ddd_print and yep_ddd
|
|
228
|
+
logger.andand.debug t_left_a.inspect
|
|
229
|
+
logger.andand.debug t_right_a.inspect
|
|
230
|
+
logger.andand.debug t_left_b.inspect
|
|
231
|
+
logger.andand.debug t_right_b.inspect
|
|
232
|
+
logger.andand.debug
|
|
233
|
+
end
|
|
234
|
+
|
|
235
|
+
z = 1
|
|
236
|
+
debugger if yep_ddd
|
|
237
|
+
z = 1
|
|
238
|
+
|
|
239
|
+
if a_score != b_score
|
|
240
|
+
a_score <=> b_score
|
|
241
|
+
elsif a_prefix and b_prefix and a_prefix != b_prefix
|
|
242
|
+
a_prefix <=> b_prefix
|
|
243
|
+
else
|
|
244
|
+
# randomly choose
|
|
245
|
+
# maybe later i can figure out how big the inputs are and apply occam's razor
|
|
246
|
+
rand(2) == 1 ? -1 : 1
|
|
247
|
+
end
|
|
248
|
+
end
|
|
249
|
+
end
|
|
250
|
+
|
|
251
|
+
def t_options(str)
|
|
252
|
+
return @_t_options[str] if @_t_options.andand.has_key?(str)
|
|
253
|
+
@_t_options ||= Hash.new
|
|
254
|
+
ary = Array.new
|
|
255
|
+
ary.push T.new(str, str)
|
|
256
|
+
tightenings.each do |regexp|
|
|
257
|
+
if match_data = regexp.match(str)
|
|
258
|
+
ary.push T.new(str, match_data.captures.compact.join)
|
|
259
|
+
end
|
|
260
|
+
end
|
|
261
|
+
@_t_options[str] = ary
|
|
262
|
+
end
|
|
263
|
+
|
|
264
|
+
# The identity that one identity regexp extracts from one string.
#
# The identity is the concatenation of the regexp's non-nil capture groups.
# Identities produced by the same regexp for two different strings are
# compared by the dictionary's collision check; if they differ, the two
# strings are treated as never matching.
class I
  attr_reader :regexp, :str, :case_sensitive, :identity

  # regexp         - a Regexp that is expected to match str (raises
  #                  NoMethodError if it does not, because the match is nil).
  # str            - the string to extract the identity from.
  # case_sensitive - when false/nil the identity is downcased so that
  #                  identities differing only by case compare equal.
  def initialize(regexp, str, case_sensitive)
    @regexp = regexp
    @str = str
    @case_sensitive = case_sensitive
    @identity = regexp.match(str).captures.compact.join
    # Normalise case only in case-INsensitive mode; in case-sensitive mode the
    # captured text has to be compared exactly as written.
    @identity = @identity.downcase unless case_sensitive
  end
end
|
|
273
|
+
|
|
274
|
+
def collision?(i_options_left, i_options_right)
|
|
275
|
+
i_options_left.any? do |r_left|
|
|
276
|
+
i_options_right.any? do |r_right|
|
|
277
|
+
r_left.regexp == r_right.regexp and r_left.identity != r_right.identity
|
|
278
|
+
end
|
|
279
|
+
end
|
|
280
|
+
end
|
|
281
|
+
|
|
282
|
+
def i_options(str)
|
|
283
|
+
return @_i_options[str] if @_i_options.andand.has_key?(str)
|
|
284
|
+
@_i_options ||= Hash.new
|
|
285
|
+
ary = Array.new
|
|
286
|
+
identities.each do |regexp|
|
|
287
|
+
if regexp.match str
|
|
288
|
+
ary.push I.new(regexp, str, case_sensitive)
|
|
289
|
+
end
|
|
290
|
+
end
|
|
291
|
+
@_i_options[str] = ary
|
|
292
|
+
end
|
|
293
|
+
|
|
294
|
+
def blocking(str)
|
|
295
|
+
return @_blocking[str] if @_blocking.andand.has_key?(str)
|
|
296
|
+
@_blocking ||= Hash.new
|
|
297
|
+
blockings.each do |regexp|
|
|
298
|
+
if regexp.match str
|
|
299
|
+
return @_blocking[str] = regexp
|
|
300
|
+
end
|
|
301
|
+
end
|
|
302
|
+
@_blocking[str] = nil
|
|
303
|
+
end
|
|
304
|
+
|
|
305
|
+
# Compiles a string written like a Ruby regexp literal ("/pattern/flags")
# into a Regexp, memoizing the result per input string.
#
# The leading "/" and the trailing "/flags" are stripped from the pattern.
# Supported flags are i, m and x; they are read only from the text after the
# final "/", never from the pattern body. When the dictionary is not case
# sensitive, IGNORECASE is always applied in addition to any explicit flags.
#
# str - the regexp source, e.g. '/(dh)c?-?(\d+)/i'.
#
# Returns a Regexp. Raises RegexpError if the pattern itself is invalid.
def literal_regexp(str)
  @_literal_regexp ||= Hash.new
  return @_literal_regexp[str] if @_literal_regexp.has_key?(str)

  # Only the option letters that follow the closing slash count as flags.
  raw_regexp_options = str[/\/([ixm]*)\z/, 1].to_s

  # Regexp options are bit flags and must be OR-ed together; picking the
  # first non-nil one would silently drop the others.
  flags = 0
  flags |= Regexp::IGNORECASE if !case_sensitive or raw_regexp_options.include?('i')
  flags |= Regexp::MULTILINE if raw_regexp_options.include?('m')
  flags |= Regexp::EXTENDED if raw_regexp_options.include?('x')

  @_literal_regexp[str] = Regexp.new(str.gsub(/\A\/|\/([ixm]*)\z/, ''), flags)
end
|
|
314
|
+
|
|
315
|
+
def read_left(left_record)
|
|
316
|
+
return if left_record.nil?
|
|
317
|
+
if left_reader
|
|
318
|
+
left_reader.call(left_record)
|
|
319
|
+
elsif left_record.is_a?(String)
|
|
320
|
+
left_record
|
|
321
|
+
else
|
|
322
|
+
left_record[0]
|
|
323
|
+
end
|
|
324
|
+
end
|
|
325
|
+
|
|
326
|
+
def read_right(right_record)
|
|
327
|
+
return if right_record.nil?
|
|
328
|
+
if right_reader
|
|
329
|
+
right_reader.call(right_record)
|
|
330
|
+
elsif right_record.is_a?(String)
|
|
331
|
+
right_record
|
|
332
|
+
else
|
|
333
|
+
right_record[0]
|
|
334
|
+
end
|
|
335
|
+
end
|
|
336
|
+
|
|
337
|
+
# Thanks William James!
|
|
338
|
+
# http://www.ruby-forum.com/topic/95519#200484
|
|
339
|
+
def cart_prod(*args)
|
|
340
|
+
args.inject([[]]){|old,lst|
|
|
341
|
+
new = []
|
|
342
|
+
lst.each{|e| new += old.map{|c| c.dup << e }}
|
|
343
|
+
new
|
|
344
|
+
}
|
|
345
|
+
end
|
|
346
|
+
end
|
data/test/helper.rb
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
require 'rubygems'
|
|
2
|
+
require 'test/unit'
|
|
3
|
+
require 'shoulda'
|
|
4
|
+
require 'logger'
|
|
5
|
+
require 'ruby-debug'
|
|
6
|
+
|
|
7
|
+
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
|
|
8
|
+
$LOAD_PATH.unshift(File.dirname(__FILE__))
|
|
9
|
+
require 'loose_tight_dictionary'
|
|
10
|
+
|
|
11
|
+
class Test::Unit::TestCase
|
|
12
|
+
end
|
|
@@ -0,0 +1,273 @@
|
|
|
1
|
+
require 'helper'
|
|
2
|
+
|
|
3
|
+
require 'remote_table'
|
|
4
|
+
|
|
5
|
+
# $logger = Logger.new STDERR
|
|
6
|
+
# $logger.level = Logger::INFO
|
|
7
|
+
# $tee = STDOUT
|
|
8
|
+
|
|
9
|
+
class TestLooseTightDictionary < Test::Unit::TestCase
|
|
10
|
+
def setup
|
|
11
|
+
clear_ltd
|
|
12
|
+
|
|
13
|
+
# dh 8 400
|
|
14
|
+
@a_left = ['DE HAVILLAND CANADA DHC8400 Dash 8']
|
|
15
|
+
@a_right = ['DEHAVILLAND DEHAVILLAND DHC8-400 DASH-8']
|
|
16
|
+
# dh 88
|
|
17
|
+
@b_left = ['ABCDEFG DH88 HIJKLMNOP']
|
|
18
|
+
# dh 89
|
|
19
|
+
@c_right = ['ABCDEFG DH89 HIJKLMNOP']
|
|
20
|
+
# dh 8 200
|
|
21
|
+
@d_left = ['DE HAVILLAND CANADA DHC8200 Dash 8']
|
|
22
|
+
@d_right = ['BOMBARDIER DEHAVILLAND DHC8-200Q DASH-8']
|
|
23
|
+
@d_lookalike = ['ABCD DHC8200 Dash 8']
|
|
24
|
+
|
|
25
|
+
@t_1 = [ '/(dh)c?-?(\d{0,2})-?(\d{0,4})(?:.*?)(dash|\z)/i', 'good tightening for de havilland' ]
|
|
26
|
+
|
|
27
|
+
@r_1 = [ '/(dh)c?-?(\d{0,2})-?(\d{0,4})(?:.*?)(dash|\z)/i', 'good identity for de havilland' ]
|
|
28
|
+
|
|
29
|
+
@left = [
|
|
30
|
+
@a_left,
|
|
31
|
+
@b_left,
|
|
32
|
+
['DE HAVILLAND DH89 Dragon Rapide'],
|
|
33
|
+
['DE HAVILLAND CANADA DHC8100 Dash 8 (E9, CT142, CC142)'],
|
|
34
|
+
@d_left,
|
|
35
|
+
['DE HAVILLAND CANADA DHC8300 Dash 8'],
|
|
36
|
+
['DE HAVILLAND DH90 Dragonfly']
|
|
37
|
+
]
|
|
38
|
+
@right = [
|
|
39
|
+
@a_right,
|
|
40
|
+
@c_right,
|
|
41
|
+
@d_right,
|
|
42
|
+
['DEHAVILLAND DEHAVILLAND DHC8-100 DASH-8'],
|
|
43
|
+
['DEHAVILLAND DEHAVILLAND TWIN OTTER DHC-6']
|
|
44
|
+
]
|
|
45
|
+
@tightenings = []
|
|
46
|
+
@identities = []
|
|
47
|
+
@blockings = []
|
|
48
|
+
@positives = []
|
|
49
|
+
@negatives = []
|
|
50
|
+
end
|
|
51
|
+
|
|
52
|
+
def clear_ltd
|
|
53
|
+
@_ltd = nil
|
|
54
|
+
end
|
|
55
|
+
|
|
56
|
+
def ltd
|
|
57
|
+
@_ltd ||= LooseTightDictionary.new @right,
|
|
58
|
+
:tightenings => @tightenings,
|
|
59
|
+
:identities => @identities,
|
|
60
|
+
:blockings => @blockings,
|
|
61
|
+
:positives => @positives,
|
|
62
|
+
:negatives => @negatives,
|
|
63
|
+
:blocking_only => @blocking_only,
|
|
64
|
+
:logger => $logger,
|
|
65
|
+
:tee => $tee
|
|
66
|
+
end
|
|
67
|
+
|
|
68
|
+
if ENV['NEW'] == 'true' or ENV['ALL'] == 'true'
|
|
69
|
+
end
|
|
70
|
+
|
|
71
|
+
if ENV['OLD'] == 'true' or ENV['ALL'] == 'true'
|
|
72
|
+
should "optionally only pay attention to things that match blockings" do
|
|
73
|
+
assert_equal @a_right, ltd.left_to_right(@a_left)
|
|
74
|
+
|
|
75
|
+
clear_ltd
|
|
76
|
+
@blocking_only = true
|
|
77
|
+
assert_equal nil, ltd.left_to_right(@a_left)
|
|
78
|
+
|
|
79
|
+
clear_ltd
|
|
80
|
+
@blocking_only = true
|
|
81
|
+
@blockings.push ['/dash/i']
|
|
82
|
+
assert_equal @a_right, ltd.left_to_right(@a_left)
|
|
83
|
+
end
|
|
84
|
+
|
|
85
|
+
# the example from the readme, considerably uglier here
|
|
86
|
+
should "check a simple table" do
|
|
87
|
+
@right = [ 'seamus', 'andy', 'ben' ]
|
|
88
|
+
@positives = [ [ 'seamus', 'Mr. Seamus Abshere' ] ]
|
|
89
|
+
left = [ 'Mr. Seamus Abshere', 'Sr. Andy Rossmeissl', 'Master BenT' ]
|
|
90
|
+
|
|
91
|
+
assert_nothing_raised do
|
|
92
|
+
ltd.check left
|
|
93
|
+
end
|
|
94
|
+
end
|
|
95
|
+
|
|
96
|
+
should "treat a String as a full record if passed through" do
|
|
97
|
+
dash = 'DHC8-400'
|
|
98
|
+
b747 = 'B747200/300'
|
|
99
|
+
dc9 = 'DC-9-10'
|
|
100
|
+
right_records = [ dash, b747, dc9 ]
|
|
101
|
+
simple_ltd = LooseTightDictionary.new right_records, :logger => $logger, :tee => $tee
|
|
102
|
+
assert_equal dash, simple_ltd.left_to_right('DeHavilland Dash-8 DHC-400')
|
|
103
|
+
assert_equal b747, simple_ltd.left_to_right('Boeing 747-300')
|
|
104
|
+
assert_equal dc9, simple_ltd.left_to_right('McDonnell Douglas MD81/DC-9')
|
|
105
|
+
end
|
|
106
|
+
|
|
107
|
+
should "call it a mismatch if you hit a blank positive" do
|
|
108
|
+
@positives.push [@a_left[0], '']
|
|
109
|
+
assert_raises(LooseTightDictionary::Mismatch) do
|
|
110
|
+
ltd.left_to_right @a_left
|
|
111
|
+
end
|
|
112
|
+
end
|
|
113
|
+
|
|
114
|
+
should "call it a false positive if you hit a blank negative" do
|
|
115
|
+
@negatives.push [@a_left[0], '']
|
|
116
|
+
assert_raises(LooseTightDictionary::FalsePositive) do
|
|
117
|
+
ltd.left_to_right @a_left
|
|
118
|
+
end
|
|
119
|
+
end
|
|
120
|
+
|
|
121
|
+
should "have a false match without blocking" do
|
|
122
|
+
# @d_left will be our victim
|
|
123
|
+
@right.push @d_lookalike
|
|
124
|
+
@tightenings.push @t_1
|
|
125
|
+
|
|
126
|
+
assert_equal @d_lookalike, ltd.left_to_right(@d_left)
|
|
127
|
+
end
|
|
128
|
+
|
|
129
|
+
should "do blocking if the left matches a block" do
|
|
130
|
+
# @d_left will be our victim
|
|
131
|
+
@right.push @d_lookalike
|
|
132
|
+
@tightenings.push @t_1
|
|
133
|
+
@blockings.push ['/(bombardier|de ?havilland)/i']
|
|
134
|
+
|
|
135
|
+
assert_equal @d_right, ltd.left_to_right(@d_left)
|
|
136
|
+
end
|
|
137
|
+
|
|
138
|
+
should "treat blocks as exclusive" do
|
|
139
|
+
@right = [ @d_left ]
|
|
140
|
+
@tightenings.push @t_1
|
|
141
|
+
@blockings.push ['/(bombardier|de ?havilland)/i']
|
|
142
|
+
|
|
143
|
+
assert_equal nil, ltd.left_to_right(@d_lookalike)
|
|
144
|
+
end
|
|
145
|
+
|
|
146
|
+
should "only use identities if they stem from the same regexp" do
|
|
147
|
+
@identities.push @r_1
|
|
148
|
+
@identities.push [ '/(cessna)(?:.*?)(citation)/i' ]
|
|
149
|
+
@identities.push [ '/(cessna)(?:.*?)(\d\d\d)/i' ]
|
|
150
|
+
x_left = [ 'CESSNA D-333 CITATION V']
|
|
151
|
+
x_right = [ 'CESSNA D-333' ]
|
|
152
|
+
@right.push x_right
|
|
153
|
+
|
|
154
|
+
assert_equal x_right, ltd.left_to_right(x_left)
|
|
155
|
+
end
|
|
156
|
+
|
|
157
|
+
should "use the best score from all of the tightenings" do
|
|
158
|
+
x_left = ["BOEING 737100"]
|
|
159
|
+
x_right = ["BOEING BOEING 737-100/200"]
|
|
160
|
+
x_right_wrong = ["BOEING BOEING 737-900"]
|
|
161
|
+
@right.push x_right
|
|
162
|
+
@right.push x_right_wrong
|
|
163
|
+
@tightenings.push ['/(7\d)(7|0)-?\d{1,3}\/(\d\d\d)/i']
|
|
164
|
+
@tightenings.push ['/(7\d)(7|0)-?(\d{1,3}|[A-Z]{0,3})/i']
|
|
165
|
+
|
|
166
|
+
assert_equal x_right, ltd.left_to_right(x_left)
|
|
167
|
+
end
|
|
168
|
+
|
|
169
|
+
should "compare using prefixes if tightened key is shorter than correct match" do
|
|
170
|
+
x_left = ["BOEING 720"]
|
|
171
|
+
x_right = ["BOEING BOEING 720-000"]
|
|
172
|
+
x_right_wrong = ["BOEING BOEING 717-200"]
|
|
173
|
+
@right.push x_right
|
|
174
|
+
@right.push x_right_wrong
|
|
175
|
+
@tightenings.push @t_1
|
|
176
|
+
@tightenings.push ['/(7\d)(7|0)-?\d{1,3}\/(\d\d\d)/i']
|
|
177
|
+
@tightenings.push ['/(7\d)(7|0)-?(\d{1,3}|[A-Z]{0,3})/i']
|
|
178
|
+
|
|
179
|
+
assert_equal x_right, ltd.left_to_right(x_left)
|
|
180
|
+
end
|
|
181
|
+
|
|
182
|
+
should "use the shortest original input" do
|
|
183
|
+
x_left = ['De Havilland DHC8-777 Dash-8 Superstar']
|
|
184
|
+
x_right = ['DEHAVILLAND DEHAVILLAND DHC8-777 DASH-8 Superstar']
|
|
185
|
+
x_right_long = ['DEHAVILLAND DEHAVILLAND DHC8-777 DASH-8 Superstar/Supernova']
|
|
186
|
+
|
|
187
|
+
@right.push x_right_long
|
|
188
|
+
@right.push x_right
|
|
189
|
+
@tightenings.push @t_1
|
|
190
|
+
|
|
191
|
+
assert_equal x_right, ltd.left_to_right(x_left)
|
|
192
|
+
end
|
|
193
|
+
|
|
194
|
+
should "perform lookups left to right" do
|
|
195
|
+
assert_equal @a_right, ltd.left_to_right(@a_left)
|
|
196
|
+
end
|
|
197
|
+
|
|
198
|
+
should "succeed if there are no checks" do
|
|
199
|
+
assert_nothing_raised do
|
|
200
|
+
ltd.check @left
|
|
201
|
+
end
|
|
202
|
+
end
|
|
203
|
+
|
|
204
|
+
should "succeed if the positive checks just work" do
|
|
205
|
+
@positives.push [ @a_left[0], @a_right[0] ]
|
|
206
|
+
|
|
207
|
+
assert_nothing_raised do
|
|
208
|
+
ltd.check @left
|
|
209
|
+
end
|
|
210
|
+
end
|
|
211
|
+
|
|
212
|
+
# Adds a positive check built from the first elements of @d_left and
# @d_right and expects ltd.check to raise LooseTightDictionary::Mismatch.
should "fail if positive checks don't work" do
  positive_pair = [@d_left[0], @d_right[0]]
  @positives.push(positive_pair)

  assert_raises(LooseTightDictionary::Mismatch) { ltd.check(@left) }
end
|
|
219
|
+
|
|
220
|
+
# Same positive check as the failing case (@d_left / @d_right), but with
# @t_1 registered as a tightening; ltd.check must then not raise.
should "succeed if proper tightening is applied" do
  positive_pair = [@d_left[0], @d_right[0]]
  @positives.push(positive_pair)
  @tightenings.push(@t_1) # @t_1 is defined outside this block

  assert_nothing_raised { ltd.check(@left) }
end
|
|
228
|
+
|
|
229
|
+
# Replaces @tightenings with a RemoteTable built from a published CSV URL and
# expects the @d_left / @d_right positive check to pass.
# NOTE(review): RemoteTable.new is given an http URL, so this test presumably
# needs network access -- confirm before running it offline.
should "use a Google Docs spreadsheet as a source of tightenings" do
  positive_pair = [@d_left[0], @d_right[0]]
  @positives.push(positive_pair)

  spreadsheet_url = 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=0&output=csv'
  @tightenings = RemoteTable.new(:url => spreadsheet_url, :headers => false)

  assert_nothing_raised { ltd.check(@left) }
end
|
|
237
|
+
|
|
238
|
+
# Adds a negative check pairing the first elements of @b_left and @c_right
# and expects ltd.check to raise LooseTightDictionary::FalsePositive.
should "fail if negative checks don't work" do
  negative_pair = [@b_left[0], @c_right[0]]
  @negatives.push(negative_pair)

  assert_raises(LooseTightDictionary::FalsePositive) { ltd.check(@left) }
end
|
|
245
|
+
|
|
246
|
+
# With the @b_left / @c_right negative check registered, a direct
# left_to_right lookup of @b_left (rather than ltd.check) is expected to
# raise LooseTightDictionary::FalsePositive.
should "do inline checking" do
  negative_pair = [@b_left[0], @c_right[0]]
  @negatives.push(negative_pair)

  assert_raises(LooseTightDictionary::FalsePositive) { ltd.left_to_right(@b_left) }
end
|
|
253
|
+
|
|
254
|
+
# Registers the @b_left / @c_right negative check together with the @t_1
# tightening and still expects ltd.check to raise FalsePositive.
should "fail if negative checks don't work, even with tightening" do
  negative_pair = [@b_left[0], @c_right[0]]
  @negatives.push(negative_pair)
  @tightenings.push(@t_1) # @t_1 is defined outside this block

  assert_raises(LooseTightDictionary::FalsePositive) { ltd.check(@left) }
end
|
|
262
|
+
|
|
263
|
+
# Registers one negative check (@b_left / @c_right), one positive check
# (@d_left / @d_right) and the @r_1 identity; ltd.check must then not raise.
should "succeed if proper identity is applied" do
  negative_pair = [@b_left[0], @c_right[0]]
  positive_pair = [@d_left[0], @d_right[0]]

  # Push order matches the original: negatives, positives, then identities.
  @negatives.push(negative_pair)
  @positives.push(positive_pair)
  @identities.push(@r_1) # @r_1 is defined outside this block

  assert_nothing_raised { ltd.check(@left) }
end
|
|
272
|
+
end
|
|
273
|
+
end
|
metadata
ADDED
|
@@ -0,0 +1,175 @@
|
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
|
+
name: loose_tight_dictionary-ruby19
|
|
3
|
+
version: !ruby/object:Gem::Version
|
|
4
|
+
hash: 15
|
|
5
|
+
prerelease: false
|
|
6
|
+
segments:
|
|
7
|
+
- 0
|
|
8
|
+
- 0
|
|
9
|
+
- 8
|
|
10
|
+
version: 0.0.8
|
|
11
|
+
platform: ruby
|
|
12
|
+
authors:
|
|
13
|
+
- Seamus Abshere
|
|
14
|
+
autorequire:
|
|
15
|
+
bindir: bin
|
|
16
|
+
cert_chain: []
|
|
17
|
+
|
|
18
|
+
date: 2010-09-27 00:00:00 -05:00
|
|
19
|
+
default_executable:
|
|
20
|
+
dependencies:
|
|
21
|
+
- !ruby/object:Gem::Dependency
|
|
22
|
+
name: shoulda
|
|
23
|
+
prerelease: false
|
|
24
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
|
25
|
+
none: false
|
|
26
|
+
requirements:
|
|
27
|
+
- - ">="
|
|
28
|
+
- !ruby/object:Gem::Version
|
|
29
|
+
hash: 3
|
|
30
|
+
segments:
|
|
31
|
+
- 0
|
|
32
|
+
version: "0"
|
|
33
|
+
type: :development
|
|
34
|
+
version_requirements: *id001
|
|
35
|
+
- !ruby/object:Gem::Dependency
|
|
36
|
+
name: remote_table
|
|
37
|
+
prerelease: false
|
|
38
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
|
39
|
+
none: false
|
|
40
|
+
requirements:
|
|
41
|
+
- - ">="
|
|
42
|
+
- !ruby/object:Gem::Version
|
|
43
|
+
hash: 49
|
|
44
|
+
segments:
|
|
45
|
+
- 0
|
|
46
|
+
- 2
|
|
47
|
+
- 19
|
|
48
|
+
version: 0.2.19
|
|
49
|
+
type: :development
|
|
50
|
+
version_requirements: *id002
|
|
51
|
+
- !ruby/object:Gem::Dependency
|
|
52
|
+
name: activesupport
|
|
53
|
+
prerelease: false
|
|
54
|
+
requirement: &id003 !ruby/object:Gem::Requirement
|
|
55
|
+
none: false
|
|
56
|
+
requirements:
|
|
57
|
+
- - ">="
|
|
58
|
+
- !ruby/object:Gem::Version
|
|
59
|
+
hash: 11
|
|
60
|
+
segments:
|
|
61
|
+
- 2
|
|
62
|
+
- 3
|
|
63
|
+
- 4
|
|
64
|
+
version: 2.3.4
|
|
65
|
+
type: :runtime
|
|
66
|
+
version_requirements: *id003
|
|
67
|
+
- !ruby/object:Gem::Dependency
|
|
68
|
+
name: fastercsv
|
|
69
|
+
prerelease: false
|
|
70
|
+
requirement: &id004 !ruby/object:Gem::Requirement
|
|
71
|
+
none: false
|
|
72
|
+
requirements:
|
|
73
|
+
- - ">="
|
|
74
|
+
- !ruby/object:Gem::Version
|
|
75
|
+
hash: 5
|
|
76
|
+
segments:
|
|
77
|
+
- 1
|
|
78
|
+
- 5
|
|
79
|
+
- 3
|
|
80
|
+
version: 1.5.3
|
|
81
|
+
type: :runtime
|
|
82
|
+
version_requirements: *id004
|
|
83
|
+
- !ruby/object:Gem::Dependency
|
|
84
|
+
name: andand
|
|
85
|
+
prerelease: false
|
|
86
|
+
requirement: &id005 !ruby/object:Gem::Requirement
|
|
87
|
+
none: false
|
|
88
|
+
requirements:
|
|
89
|
+
- - ">="
|
|
90
|
+
- !ruby/object:Gem::Version
|
|
91
|
+
hash: 25
|
|
92
|
+
segments:
|
|
93
|
+
- 1
|
|
94
|
+
- 3
|
|
95
|
+
- 1
|
|
96
|
+
version: 1.3.1
|
|
97
|
+
type: :runtime
|
|
98
|
+
version_requirements: *id005
|
|
99
|
+
- !ruby/object:Gem::Dependency
|
|
100
|
+
name: amatch
|
|
101
|
+
prerelease: false
|
|
102
|
+
requirement: &id006 !ruby/object:Gem::Requirement
|
|
103
|
+
none: false
|
|
104
|
+
requirements:
|
|
105
|
+
- - ">="
|
|
106
|
+
- !ruby/object:Gem::Version
|
|
107
|
+
hash: 29
|
|
108
|
+
segments:
|
|
109
|
+
- 0
|
|
110
|
+
- 2
|
|
111
|
+
- 5
|
|
112
|
+
version: 0.2.5
|
|
113
|
+
type: :runtime
|
|
114
|
+
version_requirements: *id006
|
|
115
|
+
description: Create dictionaries that link rows between two tables (left and right) using loose matching (string similarity) by default and tight matching (regexp) by request.
|
|
116
|
+
email: seamus@abshere.net
|
|
117
|
+
executables: []
|
|
118
|
+
|
|
119
|
+
extensions: []
|
|
120
|
+
|
|
121
|
+
extra_rdoc_files:
|
|
122
|
+
- LICENSE
|
|
123
|
+
- README.rdoc
|
|
124
|
+
files:
|
|
125
|
+
- .document
|
|
126
|
+
- .gitignore
|
|
127
|
+
- LICENSE
|
|
128
|
+
- README.rdoc
|
|
129
|
+
- Rakefile
|
|
130
|
+
- VERSION
|
|
131
|
+
- examples/first_name_matching.rb
|
|
132
|
+
- examples/icao-bts.rb
|
|
133
|
+
- examples/icao-bts.xls
|
|
134
|
+
- lib/loose_tight_dictionary.rb
|
|
135
|
+
- test/helper.rb
|
|
136
|
+
- test/test_loose_tight_dictionary.rb
|
|
137
|
+
has_rdoc: true
|
|
138
|
+
homepage: http://github.com/seamusabshere/loose_tight_dictionary
|
|
139
|
+
licenses: []
|
|
140
|
+
|
|
141
|
+
post_install_message:
|
|
142
|
+
rdoc_options:
|
|
143
|
+
- --charset=UTF-8
|
|
144
|
+
require_paths:
|
|
145
|
+
- lib
|
|
146
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
|
147
|
+
none: false
|
|
148
|
+
requirements:
|
|
149
|
+
- - ">="
|
|
150
|
+
- !ruby/object:Gem::Version
|
|
151
|
+
hash: 3
|
|
152
|
+
segments:
|
|
153
|
+
- 0
|
|
154
|
+
version: "0"
|
|
155
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
156
|
+
none: false
|
|
157
|
+
requirements:
|
|
158
|
+
- - ">="
|
|
159
|
+
- !ruby/object:Gem::Version
|
|
160
|
+
hash: 3
|
|
161
|
+
segments:
|
|
162
|
+
- 0
|
|
163
|
+
version: "0"
|
|
164
|
+
requirements: []
|
|
165
|
+
|
|
166
|
+
rubyforge_project:
|
|
167
|
+
rubygems_version: 1.3.7
|
|
168
|
+
signing_key:
|
|
169
|
+
specification_version: 3
|
|
170
|
+
summary: Allows iterative development of dictionaries for big data sets.
|
|
171
|
+
test_files:
|
|
172
|
+
- test/helper.rb
|
|
173
|
+
- test/test_loose_tight_dictionary.rb
|
|
174
|
+
- examples/first_name_matching.rb
|
|
175
|
+
- examples/icao-bts.rb
|