loose_tight_dictionary 0.0.10 → 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +1 -0
- data/Gemfile +4 -0
- data/README.rdoc +76 -23
- data/Rakefile +2 -38
- data/benchmark/before-with-free.txt +283 -0
- data/benchmark/before-without-last-result.txt +257 -0
- data/benchmark/before.txt +304 -0
- data/benchmark/memory.rb +54 -0
- data/examples/bts_aircraft/5-2-A.htm +10305 -0
- data/examples/bts_aircraft/5-2-B.htm +9576 -0
- data/examples/bts_aircraft/5-2-D.htm +7094 -0
- data/examples/bts_aircraft/5-2-E.htm +2349 -0
- data/examples/bts_aircraft/5-2-G.htm +2922 -0
- data/examples/bts_aircraft/blockings.csv +1 -0
- data/examples/bts_aircraft/identities.csv +1 -0
- data/examples/bts_aircraft/negatives.csv +1 -0
- data/examples/bts_aircraft/number_260.csv +334 -0
- data/examples/bts_aircraft/positives.csv +1 -0
- data/examples/bts_aircraft/test_bts_aircraft.rb +123 -0
- data/examples/bts_aircraft/tighteners.csv +1 -0
- data/examples/first_name_matching.rb +14 -22
- data/lib/loose_tight_dictionary/blocking.rb +36 -0
- data/lib/loose_tight_dictionary/extract_regexp.rb +30 -0
- data/lib/loose_tight_dictionary/identity.rb +25 -0
- data/lib/loose_tight_dictionary/result.rb +23 -0
- data/lib/loose_tight_dictionary/score.rb +28 -0
- data/lib/loose_tight_dictionary/similarity.rb +62 -0
- data/lib/loose_tight_dictionary/tightener.rb +30 -0
- data/lib/loose_tight_dictionary/version.rb +3 -0
- data/lib/loose_tight_dictionary/wrapper.rb +37 -0
- data/lib/loose_tight_dictionary.rb +178 -305
- data/loose_tight_dictionary.gemspec +19 -64
- data/test/helper.rb +6 -6
- data/test/test_blocking.rb +23 -0
- data/test/test_extract_regexp.rb +18 -0
- data/test/test_identity.rb +18 -0
- data/test/test_loose_tight_dictionary.rb +52 -245
- data/test/test_loose_tight_dictionary_convoluted.rb.disabled +268 -0
- data/test/test_tightening.rb +10 -0
- metadata +52 -65
- data/VERSION +0 -1
- data/examples/icao-bts.rb +0 -58
@@ -0,0 +1 @@
|
|
1
|
+
regexp,notes
|
@@ -1,23 +1,15 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
puts
|
15
|
-
|
16
|
-
puts
|
17
|
-
|
18
|
-
puts "Results"
|
19
|
-
puts "=" * 20
|
20
|
-
d = LooseTightDictionary.new right_side, :tee => STDOUT, :tee_format => :fixed_width
|
21
|
-
d.check left_side
|
22
|
-
|
23
|
-
puts d.left_to_right 'Shamus Heaney'
|
2
|
+
unless RUBY_VERSION >= '1.9'
|
3
|
+
require 'rubygems'
|
4
|
+
end
|
5
|
+
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
|
6
|
+
require 'loose_tight_dictionary'
|
7
|
+
|
8
|
+
haystack = [ 'seamus', 'andy', 'ben' ]
|
9
|
+
needles = [ 'Mr. Seamus', 'Sr. Andy', 'Master BenT', 'Shamus Heaney' ]
|
10
|
+
|
11
|
+
d = LooseTightDictionary.new haystack, :log => $stdout
|
12
|
+
needles.each do |needle|
|
13
|
+
d.explain needle
|
14
|
+
puts
|
15
|
+
end
|
@@ -0,0 +1,36 @@
|
|
1
|
+
class LooseTightDictionary
  # "Record linkage typically involves two main steps: blocking and scoring..."
  # http://en.wikipedia.org/wiki/Record_linkage
  #
  # Blockings effectively divide up the haystack into groups that match a pattern.
  #
  # A blocking (as in a grouping) comes into effect when a str matches.
  # Then the needle must also match the blocking's regexp.
  class Blocking
    include ExtractRegexp

    # The Regexp that defines this grouping.
    attr_reader :regexp

    # regexp_or_str - a Regexp, or a String that extract_regexp can turn into one.
    def initialize(regexp_or_str)
      @regexp = extract_regexp regexp_or_str
    end

    # If a blocking "encompasses" two strings, that means they both fit into it.
    #
    # With one argument: returns true/false depending on whether str1 matches the regexp.
    #
    # With two arguments:
    # * Returns nil if the blocking doesn't apply, i.e. str2 doesn't fit the blocking.
    # * Returns false if they certainly don't fit this blocking (str2 matches, str1 doesn't).
    # * Otherwise returns whether the captures of both matches are equal.
    def encompass?(str1, str2 = nil)
      return !!regexp.match(str1) if str2.nil?

      str2_match_data = regexp.match str2
      # str2 is outside this blocking, so the blocking has nothing to say.
      return nil unless str2_match_data

      str1_match_data = regexp.match str1
      return false unless str1_match_data

      str2_match_data.captures == str1_match_data.captures
    end

    # Short, address-free description for log output.
    def inspect
      "#<Blocking regexp=#{regexp.inspect}>"
    end
  end
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
class LooseTightDictionary
  # Helpers for accepting either a real Regexp or its literal source text
  # (for example "/foo/i" or "%r{foo}x") and always ending up with a Regexp.
  module ExtractRegexp #:nodoc: all
    # Returns regexp_or_str unchanged if it is already a Regexp, or parses it
    # with regexp_from_string if it is a String.
    #
    # Raises ArgumentError for any other kind of object.
    def extract_regexp(regexp_or_str)
      case regexp_or_str
      when ::Regexp
        regexp_or_str
      when ::String
        regexp_from_string regexp_or_str
      else
        raise ::ArgumentError, "Expected regexp or string"
      end
    end

    # Opening delimiter => closing delimiter for the literal forms we understand.
    REGEXP_DELIMITERS = {
      '%r{' => '}',
      '/' => '/'
    }

    # Trailing option letter => Regexp option bit.
    REGEXP_FLAGS = {
      'i' => ::Regexp::IGNORECASE,
      'm' => ::Regexp::MULTILINE,
      'x' => ::Regexp::EXTENDED
    }

    # Parses a string written like a regexp literal ("/.../imx" or "%r{...}imx")
    # into a Regexp. Surrounding whitespace is ignored and escaped slashes (\/)
    # in the body are unescaped.
    #
    # Raises ArgumentError if the string does not start with a known delimiter
    # or has no matching closing delimiter.
    def regexp_from_string(str)
      literal = str.strip
      # Detect the delimiter on the stripped text, the same text we match below.
      delim_start, delim_end = REGEXP_DELIMITERS.detect { |opening, _closing| literal.start_with? opening }
      unless delim_start
        raise ::ArgumentError, "Expected a regexp literal like /.../ or %r{...}, got #{str.inspect}"
      end

      escaped_start = ::Regexp.escape delim_start
      escaped_end = ::Regexp.escape delim_end
      match_data = %r{\A#{escaped_start}(.*)#{escaped_end}([^#{escaped_end}]*)\z}.match literal
      unless match_data
        raise ::ArgumentError, "Unterminated regexp literal #{str.inspect}"
      end

      content = match_data[1].gsub '\\/', '/'
      options = match_data[2]
      # Option bits must be OR-ed together bitwise; a logical || keeps only the first one.
      flags = REGEXP_FLAGS.inject(0) do |memo, (letter, flag)|
        options.include?(letter) ? (memo | flag) : memo
      end
      ::Regexp.new content, flags
    end
  end
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
class LooseTightDictionary
  # Identities take effect when needle and haystack both match a regexp.
  # Then the captured part of the regexp has to match exactly.
  class Identity
    include ExtractRegexp

    # The Regexp whose captures must agree.
    attr_reader :regexp

    # regexp_or_str - a Regexp, or a String that extract_regexp can turn into one.
    def initialize(regexp_or_str)
      @regexp = extract_regexp regexp_or_str
    end

    # Two strings are "identical" if they both match this identity and the captures are equal.
    #
    # Only returns true/false if both strings match the regexp.
    # Otherwise returns nil.
    def identical?(str1, str2)
      str1_match_data = regexp.match str1
      return nil unless str1_match_data

      str2_match_data = regexp.match str2
      return nil unless str2_match_data

      str1_match_data.captures == str2_match_data.captures
    end

    # Short, address-free description for log output.
    def inspect
      "#<Identity regexp=#{regexp.inspect}>"
    end
  end
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
class LooseTightDictionary
  # Plain value holder describing one lookup: the needle, the rules in force,
  # the intermediate groupings and the final record and score.
  class Result #:nodoc: all
    attr_accessor :needle
    attr_accessor :tighteners
    attr_accessor :blockings
    attr_accessor :identities
    attr_accessor :encompassed
    attr_accessor :unencompassed
    attr_accessor :possibly_identical
    attr_accessor :certainly_different
    attr_accessor :similarities
    attr_accessor :record
    attr_accessor :score

    # Everything that was considered: encompassed followed by unencompassed.
    #
    # Either list may still be nil if the lookup stopped before they were
    # assigned; a missing list counts as empty instead of raising on nil + nil.
    def haystack
      (encompassed || []) + (unencompassed || [])
    end

    # Intentionally does nothing.
    def free
      # nothing to see here
    end
  end
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
require 'amatch'
|
2
|
+
|
3
|
+
class LooseTightDictionary
  # The similarity of two strings, computed lazily and comparable to other scores.
  class Score
    # Supplies <, <=, >, >=, between? on top of <=>.
    include Comparable

    attr_reader :str1, :str2

    def initialize(str1, str2)
      @str1 = str1
      @str2 = str2
    end

    # The numeric score, memoized after the first call.
    # Relies on str1 responding to pair_distance_similar
    # (presumably provided by the amatch library this file requires).
    def to_f
      @to_f ||= str1.pair_distance_similar str2
    end

    def inspect
      %{#<Score: to_f=#{to_f}>}
    end

    # Orders by numeric score. Returns nil (not comparable) when other has no to_f.
    def <=>(other)
      return nil unless other.respond_to?(:to_f)
      to_f <=> other.to_f
    end

    # Equal when the numeric scores are equal. Objects without to_f are never equal.
    def ==(other)
      return false unless other.respond_to?(:to_f)
      to_f == other.to_f
    end
  end
end
|
@@ -0,0 +1,62 @@
|
|
1
|
+
class LooseTightDictionary
  # Compares two wrappers by trying every pairing of their variants and
  # keeping the best-scoring one.
  class Similarity
    attr_reader :wrapper1
    attr_reader :wrapper2

    # wrapper1, wrapper2 - objects responding to to_str and variants.
    def initialize(wrapper1, wrapper2)
      @wrapper1 = wrapper1
      @wrapper2 = wrapper2
    end

    # Orders by best score first; equal scores are ordered by weight.
    def <=>(other)
      if best_score != other.best_score
        best_score <=> other.best_score
      else
        weight <=> other.weight
      end
    end

    # Weight things towards short original strings
    def weight
      @weight ||= (1.0 / (wrapper1.to_str.length * wrapper2.to_str.length))
    end

    # Score of the best pairing of variants, memoized.
    def best_score
      @best_score ||= Score.new best_wrapper1_variant, best_wrapper2_variant
    end

    def best_wrapper1_variant
      best_variants[0]
    end

    def best_wrapper2_variant
      best_variants[1]
    end

    # The [wrapper1 variant, wrapper2 variant] pair with the highest score.
    #
    # Each pairing is scored exactly once in a single pass, rather than
    # sorting the whole product and re-scoring pairs on every comparison.
    # When several pairings tie for the highest score, one of them is returned.
    def best_variants
      @best_variants ||= cart_prod(wrapper1.variants, wrapper2.variants).max_by do |wrapper1_variant, wrapper2_variant|
        Score.new(wrapper1_variant, wrapper2_variant).to_f
      end
    end

    def inspect
      %{#<Similarity "#{wrapper2.to_str}"=>"#{best_wrapper2_variant}" versus "#{wrapper1.to_str}"=>"#{best_wrapper1_variant}" weight=#{"%0.5f" % weight} best_score=#{"%0.5f" % best_score.to_f}>}
    end

    # Cartesian product of any number of lists.
    #
    # Thanks William James!
    # http://www.ruby-forum.com/topic/95519#200484
    def cart_prod(*args)
      args.inject([[]]){|old,lst|
        new = []
        lst.each{|e| new += old.map{|c| c.dup << e }}
        new
      }
    end
  end
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
class LooseTightDictionary
  # A tightener boils a string down to its essential part.
  class Tightener
    include ExtractRegexp

    # The Regexp whose captures make up the tightened string.
    attr_reader :regexp

    # regexp_or_str - a Regexp, or a String that extract_regexp can turn into one.
    def initialize(regexp_or_str)
      @regexp = extract_regexp(regexp_or_str)
    end

    # True when the regexp matches str, i.e. when apply would use the captures.
    def apply?(str)
      regexp.match(str) ? true : false
    end

    # All captures of the match joined together, or str itself when there is no match.
    def apply(str)
      match_data = regexp.match(str)
      return str unless match_data
      match_data.captures.join
    end

    def inspect
      "#<Tightener regexp=#{regexp.inspect}>"
    end
  end
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
class LooseTightDictionary
  # Wrappers are the tokens that are passed around when doing scoring and optimizing.
  class Wrapper #:nodoc: all
    attr_reader :parent
    attr_reader :record
    attr_reader :reader

    # attrs - Hash whose keys become instance variables,
    #         e.g. :parent, :record and (optionally) :reader.
    def initialize(attrs = {})
      attrs.each do |k, v|
        instance_variable_set "@#{k}", v
      end
    end

    def inspect
      "#<Wrapper to_str=#{to_str} variants=#{variants.length}>"
    end

    # The string form of the record: reader.call(record) when a reader was
    # given, otherwise the record itself. Memoized.
    #
    # Always coerced with to_s, because to_str must return a real String for
    # implicit conversion (e.g. Regexp#match on a wrapper) to work; a reader
    # returning nil, a Symbol or a number would otherwise break that.
    def to_str
      @to_str ||= (reader ? reader.call(record) : record).to_s
    end

    alias :to_s :to_str

    # Builds a Similarity between this wrapper and other.
    def similarity(other)
      Similarity.new self, other
    end

    # The original string followed by the output of every parent tightener
    # that applies to it, without duplicates. Memoized.
    def variants
      @variants ||= begin
        applicable = parent.tighteners.select { |tightener| tightener.apply? to_str }
        ([ to_str ] + applicable.map { |tightener| tightener.apply to_str }).uniq
      end
    end
  end
end
|
@@ -2,351 +2,224 @@ require 'active_support'
|
|
2
2
|
require 'active_support/version'
|
3
3
|
%w{
|
4
4
|
active_support/core_ext/string
|
5
|
+
active_support/core_ext/hash
|
6
|
+
active_support/core_ext/object
|
5
7
|
}.each do |active_support_3_requirement|
|
6
8
|
require active_support_3_requirement
|
7
|
-
end if ActiveSupport::VERSION::MAJOR == 3
|
8
|
-
require 'amatch'
|
9
|
-
require 'andand'
|
10
|
-
if RUBY_VERSION >= '1.9'
|
11
|
-
require 'csv'
|
12
|
-
else
|
13
|
-
begin
|
14
|
-
require 'fastercsv'
|
15
|
-
rescue LoadError
|
16
|
-
$stderr.puts "[loose_tight_dictionary gem] You probably need to manually install the fastercsv gem."
|
17
|
-
raise $!
|
18
|
-
end
|
19
|
-
end
|
9
|
+
end if ::ActiveSupport::VERSION::MAJOR == 3
|
20
10
|
|
11
|
+
# See the README for more information.
|
21
12
|
class LooseTightDictionary
|
22
|
-
|
23
|
-
|
24
|
-
|
13
|
+
autoload :ExtractRegexp, 'loose_tight_dictionary/extract_regexp'
|
14
|
+
autoload :Tightener, 'loose_tight_dictionary/tightener'
|
15
|
+
autoload :Blocking, 'loose_tight_dictionary/blocking'
|
16
|
+
autoload :Identity, 'loose_tight_dictionary/identity'
|
17
|
+
autoload :Result, 'loose_tight_dictionary/result'
|
18
|
+
autoload :Wrapper, 'loose_tight_dictionary/wrapper'
|
19
|
+
autoload :Similarity, 'loose_tight_dictionary/similarity'
|
20
|
+
autoload :Score, 'loose_tight_dictionary/score'
|
25
21
|
|
26
|
-
class
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
end
|
32
|
-
|
33
|
-
def tightened?
|
34
|
-
str != tightened_str
|
35
|
-
end
|
36
|
-
|
37
|
-
def prefix_and_score(other)
|
38
|
-
prefix = [ tightened_str.length, other.tightened_str.length ].min if tightened? and other.tightened?
|
39
|
-
score = if prefix
|
40
|
-
tightened_str.first(prefix).pair_distance_similar other.tightened_str.first(prefix)
|
41
|
-
else
|
42
|
-
tightened_str.pair_distance_similar other.tightened_str
|
43
|
-
end
|
44
|
-
[ prefix, score ]
|
45
|
-
end
|
46
|
-
end
|
47
|
-
|
48
|
-
include Amatch
|
49
|
-
|
50
|
-
attr_reader :right_records
|
51
|
-
attr_reader :case_sensitive
|
52
|
-
|
53
|
-
attr_accessor :logger
|
54
|
-
attr_accessor :tee
|
55
|
-
attr_accessor :tee_format
|
56
|
-
attr_accessor :positives
|
57
|
-
attr_accessor :negatives
|
58
|
-
attr_accessor :left_reader
|
59
|
-
attr_accessor :right_reader
|
60
|
-
attr_accessor :blocking_only
|
22
|
+
class Freed < RuntimeError; end
|
23
|
+
|
24
|
+
attr_reader :options
|
25
|
+
attr_reader :haystack
|
26
|
+
attr_reader :records
|
61
27
|
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
@
|
68
|
-
@
|
69
|
-
@
|
70
|
-
@negatives = options[:negatives]
|
71
|
-
@logger = options[:logger]
|
72
|
-
@tee = options[:tee]
|
73
|
-
@tee_format = options[:tee_format] || :fixed_width
|
74
|
-
@case_sensitive = options[:case_sensitive] || false
|
75
|
-
@blocking_only = options[:blocking_only] || false
|
28
|
+
# haystack - a bunch of records
|
29
|
+
# options
|
30
|
+
# * tighteners: regexps that essentialize strings down
|
31
|
+
# * identities: regexps that rule out similarities, for example a 737 cannot be identical to a 747
|
32
|
+
def initialize(records, options = {})
|
33
|
+
@options = options.symbolize_keys
|
34
|
+
@records = records
|
35
|
+
@haystack = records.map { |record| Wrapper.new :parent => self, :record => record, :reader => haystack_reader }
|
76
36
|
end
|
77
37
|
|
78
|
-
|
79
|
-
|
80
|
-
# def blockings
|
81
|
-
%w{ tightenings identities blockings }.each do |name|
|
82
|
-
module_eval %{
|
83
|
-
def #{name}
|
84
|
-
@#{name} ||= @_raw_#{name}.map do |i|
|
85
|
-
next if i[0].blank?
|
86
|
-
literal_regexp i[0]
|
87
|
-
end
|
88
|
-
end
|
89
|
-
}
|
38
|
+
def last_result
|
39
|
+
@last_result ||= Result.new
|
90
40
|
end
|
91
41
|
|
92
|
-
def
|
93
|
-
|
42
|
+
def log(str = '') #:nodoc:
|
43
|
+
(options[:log] || $stderr).puts str unless options[:log] == false
|
94
44
|
end
|
95
|
-
|
96
|
-
def
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
45
|
+
|
46
|
+
def find_with_score(needle)
|
47
|
+
record = find needle
|
48
|
+
[ record, last_result.score ]
|
49
|
+
end
|
50
|
+
|
51
|
+
# todo fix record.record confusion (should be wrapper.record or smth)
|
52
|
+
def find(needle, gather_last_result = true)
|
53
|
+
raise Freed if freed?
|
54
|
+
free_last_result
|
101
55
|
|
102
|
-
if
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
raise Mismatch
|
107
|
-
elsif right != correct_right
|
108
|
-
logger.andand.debug " Mismatch! (#{right} should be #{correct_right})"
|
109
|
-
raise Mismatch
|
110
|
-
end
|
56
|
+
if gather_last_result
|
57
|
+
last_result.tighteners = tighteners
|
58
|
+
last_result.identities = identities
|
59
|
+
last_result.blockings = blockings
|
111
60
|
end
|
112
61
|
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
raise FalsePositive
|
118
|
-
elsif right == incorrect_right
|
119
|
-
logger.andand.debug " False positive! (#{right} should NOT be #{incorrect_right})"
|
120
|
-
raise FalsePositive
|
121
|
-
end
|
122
|
-
end
|
123
|
-
end
|
124
|
-
|
125
|
-
def check(left_records)
|
126
|
-
header = [ 'Left record (input)', 'Right record (output)', 'Prefix used (if any)', 'Score' ]
|
127
|
-
case tee_format
|
128
|
-
when :csv
|
129
|
-
tee.andand.puts header.flatten.to_csv
|
130
|
-
when :fixed_width
|
131
|
-
tee.andand.puts header.map { |i| i.to_s.ljust(30) }.join
|
62
|
+
needle = Wrapper.new :parent => self, :record => needle
|
63
|
+
|
64
|
+
if gather_last_result
|
65
|
+
last_result.needle = needle
|
132
66
|
end
|
67
|
+
|
68
|
+
return if strict_blocking and blockings.none? { |blocking| blocking.encompass? needle }
|
133
69
|
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
case tee_format
|
139
|
-
when :csv
|
140
|
-
tee.andand.puts $ltd_1.flatten.to_csv
|
141
|
-
when :fixed_width
|
142
|
-
tee.andand.puts $ltd_1.map { |i| i.to_s.ljust(30) }.join if $ltd_1
|
70
|
+
encompassed, unencompassed = if strict_blocking and blockings.any?
|
71
|
+
haystack.partition do |record|
|
72
|
+
blockings.any? do |blocking|
|
73
|
+
blocking.encompass?(needle, record) == true
|
143
74
|
end
|
144
75
|
end
|
76
|
+
else
|
77
|
+
[ haystack.dup, [] ]
|
145
78
|
end
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
blocking_right = blocking right
|
158
|
-
(not blocking_left and not blocking_right) or
|
159
|
-
(blocking_right and blocking_right.match(left)) or
|
160
|
-
(blocking_left and blocking_left.match(right))
|
161
|
-
end.max do |a_record, b_record|
|
162
|
-
a = read_right a_record
|
163
|
-
b = read_right b_record
|
164
|
-
i_options_a = i_options a
|
165
|
-
i_options_b = i_options b
|
166
|
-
collision_a = collision? i_options_left, i_options_a
|
167
|
-
collision_b = collision? i_options_left, i_options_b
|
168
|
-
if collision_a and collision_b
|
169
|
-
# neither would ever work, so randomly rank one over the other
|
170
|
-
rand(2) == 1 ? -1 : 1
|
171
|
-
elsif collision_a
|
172
|
-
-1
|
173
|
-
elsif collision_b
|
174
|
-
1
|
175
|
-
else
|
176
|
-
t_left_a, t_right_a = optimize t_options_left, t_options(a)
|
177
|
-
t_left_b, t_right_b = optimize t_options_left, t_options(b)
|
178
|
-
a_prefix, a_score = t_left_a.prefix_and_score t_right_a
|
179
|
-
b_prefix, b_score = t_left_b.prefix_and_score t_right_b
|
180
|
-
history[a_record] = [t_left_a.tightened_str, t_right_a.tightened_str, a_prefix ? a_prefix : 'NULL', a_score]
|
181
|
-
history[b_record] = [t_left_b.tightened_str, t_right_b.tightened_str, b_prefix ? b_prefix : 'NULL', b_score]
|
182
|
-
|
183
|
-
yep_dd = ($ltd_dd_right and $ltd_dd_left and [t_left_a, t_left_b].any? { |f| f.str =~ $ltd_dd_left } and [t_right_a, t_right_b].any? { |f| f.str =~ $ltd_dd_right } and (!$ltd_dd_left_not or [t_left_a, t_left_b].none? { |f| f.str =~ $ltd_dd_left_not }))
|
184
|
-
|
185
|
-
if $ltd_dd_print and yep_dd
|
186
|
-
logger.andand.debug t_left_a.inspect
|
187
|
-
logger.andand.debug t_right_a.inspect
|
188
|
-
logger.andand.debug t_left_b.inspect
|
189
|
-
logger.andand.debug t_right_b.inspect
|
190
|
-
logger.andand.debug
|
191
|
-
end
|
192
|
-
|
193
|
-
z = 1
|
194
|
-
debugger if yep_dd
|
195
|
-
z = 1
|
196
|
-
|
197
|
-
if a_score != b_score
|
198
|
-
a_score <=> b_score
|
199
|
-
elsif a_prefix and b_prefix and a_prefix != b_prefix
|
200
|
-
a_prefix <=> b_prefix
|
201
|
-
else
|
202
|
-
b.length <=> a.length
|
79
|
+
|
80
|
+
if gather_last_result
|
81
|
+
last_result.encompassed = encompassed
|
82
|
+
last_result.unencompassed = unencompassed
|
83
|
+
end
|
84
|
+
|
85
|
+
possibly_identical, certainly_different = if identities.any?
|
86
|
+
encompassed.partition do |record|
|
87
|
+
identities.all? do |identity|
|
88
|
+
answer = identity.identical? needle, record
|
89
|
+
answer.nil? or answer == true
|
203
90
|
end
|
204
91
|
end
|
205
|
-
end
|
206
|
-
$ltd_1 = history[right_record]
|
207
|
-
right = read_right right_record
|
208
|
-
i_options_right = i_options right
|
209
|
-
z = 1
|
210
|
-
debugger if $ltd_left.andand.match(left) or $ltd_right.andand.match(right)
|
211
|
-
z = 1
|
212
|
-
if collision? i_options_left, i_options_right
|
213
|
-
$ltd_0 = nil
|
214
|
-
return
|
215
92
|
else
|
216
|
-
|
93
|
+
[ encompassed.dup, [] ]
|
217
94
|
end
|
218
|
-
inline_check left_record, right_record
|
219
|
-
right_record
|
220
|
-
end
|
221
|
-
alias_method :find, :left_to_right
|
222
|
-
|
223
|
-
def optimize(t_options_left, t_options_right)
|
224
|
-
cart_prod(t_options_left, t_options_right).max do |a, b|
|
225
|
-
t_left_a, t_right_a = a
|
226
|
-
t_left_b, t_right_b = b
|
227
95
|
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
yep_ddd = ($ltd_ddd_right and $ltd_ddd_left and [t_left_a, t_left_b].any? { |f| f.str =~ $ltd_ddd_left } and [t_right_a, t_right_b].any? { |f| f.str =~ $ltd_ddd_right } and (!$ltd_ddd_left_not or [t_left_a, t_left_b].none? { |f| f.str =~ $ltd_ddd_left_not }))
|
232
|
-
|
233
|
-
if $ltd_ddd_print and yep_ddd
|
234
|
-
logger.andand.debug t_left_a.inspect
|
235
|
-
logger.andand.debug t_right_a.inspect
|
236
|
-
logger.andand.debug t_left_b.inspect
|
237
|
-
logger.andand.debug t_right_b.inspect
|
238
|
-
logger.andand.debug
|
239
|
-
end
|
240
|
-
|
241
|
-
z = 1
|
242
|
-
debugger if yep_ddd
|
243
|
-
z = 1
|
244
|
-
|
245
|
-
if a_score != b_score
|
246
|
-
a_score <=> b_score
|
247
|
-
elsif a_prefix and b_prefix and a_prefix != b_prefix
|
248
|
-
a_prefix <=> b_prefix
|
249
|
-
else
|
250
|
-
# randomly choose
|
251
|
-
# maybe later i can figure out how big the inputs are and apply occam's razor
|
252
|
-
rand(2) == 1 ? -1 : 1
|
253
|
-
end
|
96
|
+
if gather_last_result
|
97
|
+
last_result.possibly_identical = possibly_identical
|
98
|
+
last_result.certainly_different = certainly_different
|
254
99
|
end
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
|
261
|
-
|
262
|
-
|
263
|
-
|
264
|
-
|
265
|
-
|
100
|
+
|
101
|
+
similarities = possibly_identical.map do |record|
|
102
|
+
needle.similarity record
|
103
|
+
end.sort
|
104
|
+
|
105
|
+
best_similarity = similarities[-1]
|
106
|
+
record = best_similarity.wrapper2
|
107
|
+
score = best_similarity.best_score.to_f
|
108
|
+
|
109
|
+
if gather_last_result
|
110
|
+
last_result.similarities = similarities
|
111
|
+
last_result.record = record.record
|
112
|
+
last_result.score = score
|
266
113
|
end
|
267
|
-
|
114
|
+
|
115
|
+
record.record
|
268
116
|
end
|
269
117
|
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
118
|
+
# Explain is like mysql's EXPLAIN command. You give it a needle and it tells you about how it was located (successfully or not) in the haystack.
|
119
|
+
#
|
120
|
+
# d = LooseTightDictionary.new ['737', '747', '757' ]
|
121
|
+
# d.explain 'boeing 737-100'
|
122
|
+
def explain(needle)
|
123
|
+
record = find needle
|
124
|
+
log "#" * 150
|
125
|
+
log "# Match #{needle.inspect} => #{record.inspect}"
|
126
|
+
log "#" * 150
|
127
|
+
log
|
128
|
+
log "Needle"
|
129
|
+
log "-" * 150
|
130
|
+
log last_result.needle.to_str
|
131
|
+
log
|
132
|
+
log "Haystack"
|
133
|
+
log "-" * 150
|
134
|
+
log last_result.haystack.map { |record| record.to_str }.join("\n")
|
135
|
+
log
|
136
|
+
log "Tighteners"
|
137
|
+
log "-" * 150
|
138
|
+
log last_result.tighteners.blank? ? '(none)' : last_result.tighteners.map { |tightener| tightener.inspect }.join("\n")
|
139
|
+
log
|
140
|
+
log "Blockings"
|
141
|
+
log "-" * 150
|
142
|
+
log last_result.blockings.blank? ? '(none)' : last_result.blockings.map { |blocking| blocking.inspect }.join("\n")
|
143
|
+
log
|
144
|
+
log "Identities"
|
145
|
+
log "-" * 150
|
146
|
+
log last_result.identities.blank? ? '(none)' : last_result.identities.map { |blocking| blocking.inspect }.join("\n")
|
147
|
+
log
|
148
|
+
log "Included"
|
149
|
+
log "-" * 150
|
150
|
+
log last_result.encompassed.blank? ? '(none)' : last_result.encompassed.map { |encompassed| encompassed.to_str }.join("\n")
|
151
|
+
log
|
152
|
+
log "Ignored"
|
153
|
+
log "-" * 150
|
154
|
+
log last_result.unencompassed.blank? ? '(none)' : last_result.unencompassed.map { |unencompassed| unencompassed.to_str }.join("\n")
|
155
|
+
log
|
156
|
+
log "Possibly identical"
|
157
|
+
log "-" * 150
|
158
|
+
log last_result.possibly_identical.blank? ? '(none)' : last_result.possibly_identical.map { |possibly_identical| possibly_identical.to_str }.join("\n")
|
159
|
+
log
|
160
|
+
log "Certainly different"
|
161
|
+
log "-" * 150
|
162
|
+
log last_result.certainly_different.blank? ? '(none)' : last_result.certainly_different.map { |certainly_different| certainly_different.to_str }.join("\n")
|
163
|
+
log
|
164
|
+
log "Similarities"
|
165
|
+
log "-" * 150
|
166
|
+
log last_result.similarities.blank? ? '(none)' : last_result.similarities.reverse[0..9].map { |similarity| similarity.inspect }.join("\n")
|
167
|
+
log
|
168
|
+
log "Match"
|
169
|
+
log "-" * 150
|
170
|
+
log record.inspect
|
278
171
|
end
|
279
|
-
|
280
|
-
def
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
|
172
|
+
|
173
|
+
def haystack_reader
|
174
|
+
options[:haystack_reader]
|
175
|
+
end
|
176
|
+
|
177
|
+
def strict_blocking
|
178
|
+
options[:strict_blocking] || false
|
179
|
+
end
|
180
|
+
|
181
|
+
def tighteners
|
182
|
+
@tighteners ||= (options[:tighteners] || []).map do |regexp_or_str|
|
183
|
+
Tightener.new regexp_or_str
|
285
184
|
end
|
286
185
|
end
|
287
|
-
|
288
|
-
def
|
289
|
-
|
290
|
-
|
291
|
-
ary = Array.new
|
292
|
-
identities.each do |regexp|
|
293
|
-
if regexp.match str
|
294
|
-
ary.push I.new(regexp, str, case_sensitive)
|
295
|
-
end
|
186
|
+
|
187
|
+
def identities
|
188
|
+
@identities ||= (options[:identities] || []).map do |regexp_or_str|
|
189
|
+
Identity.new regexp_or_str
|
296
190
|
end
|
297
|
-
@_i_options[str] = ary
|
298
191
|
end
|
299
|
-
|
300
|
-
def
|
301
|
-
|
302
|
-
|
303
|
-
blockings.each do |regexp|
|
304
|
-
if regexp.match str
|
305
|
-
return @_blocking[str] = regexp
|
306
|
-
end
|
192
|
+
|
193
|
+
def blockings
|
194
|
+
@blockings ||= (options[:blockings] || []).map do |regexp_or_str|
|
195
|
+
Blocking.new regexp_or_str
|
307
196
|
end
|
308
|
-
@_blocking[str] = nil
|
309
197
|
end
|
310
198
|
|
311
|
-
def
|
312
|
-
|
313
|
-
@_literal_regexp ||= Hash.new
|
314
|
-
raw_regexp_options = str.split('/').last
|
315
|
-
ignore_case = (!case_sensitive or raw_regexp_options.include?('i')) ? Regexp::IGNORECASE : nil
|
316
|
-
multiline = raw_regexp_options.include?('m') ? Regexp::MULTILINE : nil
|
317
|
-
extended = raw_regexp_options.include?('x') ? Regexp::EXTENDED : nil
|
318
|
-
@_literal_regexp[str] = Regexp.new str.gsub(/\A\/|\/([ixm]*)\z/, ''), (ignore_case||multiline||extended)
|
199
|
+
def freed?
|
200
|
+
@freed == true
|
319
201
|
end
|
320
202
|
|
321
|
-
def
|
322
|
-
|
323
|
-
|
324
|
-
|
325
|
-
|
326
|
-
|
327
|
-
|
328
|
-
|
329
|
-
|
203
|
+
def free
|
204
|
+
free_last_result
|
205
|
+
@options.try :clear
|
206
|
+
@options = nil
|
207
|
+
@haystack.try :clear
|
208
|
+
@haystack = nil
|
209
|
+
@tighteners.try :clear
|
210
|
+
@tighteners = nil
|
211
|
+
@identities.try :clear
|
212
|
+
@identities = nil
|
213
|
+
@blockings.try :clear
|
214
|
+
@blockings = nil
|
215
|
+
ensure
|
216
|
+
@freed = true
|
330
217
|
end
|
331
218
|
|
332
|
-
|
333
|
-
return if right_record.nil?
|
334
|
-
if right_reader
|
335
|
-
right_reader.call(right_record)
|
336
|
-
elsif right_record.is_a?(String)
|
337
|
-
right_record
|
338
|
-
else
|
339
|
-
right_record[0]
|
340
|
-
end
|
341
|
-
end
|
219
|
+
private
|
342
220
|
|
343
|
-
|
344
|
-
|
345
|
-
|
346
|
-
args.inject([[]]){|old,lst|
|
347
|
-
new = []
|
348
|
-
lst.each{|e| new += old.map{|c| c.dup << e }}
|
349
|
-
new
|
350
|
-
}
|
221
|
+
def free_last_result
|
222
|
+
@last_result.try :free
|
223
|
+
@last_result = nil
|
351
224
|
end
|
352
225
|
end
|