loose_tight_dictionary 0.0.10 → 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (42) hide show
  1. data/.gitignore +1 -0
  2. data/Gemfile +4 -0
  3. data/README.rdoc +76 -23
  4. data/Rakefile +2 -38
  5. data/benchmark/before-with-free.txt +283 -0
  6. data/benchmark/before-without-last-result.txt +257 -0
  7. data/benchmark/before.txt +304 -0
  8. data/benchmark/memory.rb +54 -0
  9. data/examples/bts_aircraft/5-2-A.htm +10305 -0
  10. data/examples/bts_aircraft/5-2-B.htm +9576 -0
  11. data/examples/bts_aircraft/5-2-D.htm +7094 -0
  12. data/examples/bts_aircraft/5-2-E.htm +2349 -0
  13. data/examples/bts_aircraft/5-2-G.htm +2922 -0
  14. data/examples/bts_aircraft/blockings.csv +1 -0
  15. data/examples/bts_aircraft/identities.csv +1 -0
  16. data/examples/bts_aircraft/negatives.csv +1 -0
  17. data/examples/bts_aircraft/number_260.csv +334 -0
  18. data/examples/bts_aircraft/positives.csv +1 -0
  19. data/examples/bts_aircraft/test_bts_aircraft.rb +123 -0
  20. data/examples/bts_aircraft/tighteners.csv +1 -0
  21. data/examples/first_name_matching.rb +14 -22
  22. data/lib/loose_tight_dictionary/blocking.rb +36 -0
  23. data/lib/loose_tight_dictionary/extract_regexp.rb +30 -0
  24. data/lib/loose_tight_dictionary/identity.rb +25 -0
  25. data/lib/loose_tight_dictionary/result.rb +23 -0
  26. data/lib/loose_tight_dictionary/score.rb +28 -0
  27. data/lib/loose_tight_dictionary/similarity.rb +62 -0
  28. data/lib/loose_tight_dictionary/tightener.rb +30 -0
  29. data/lib/loose_tight_dictionary/version.rb +3 -0
  30. data/lib/loose_tight_dictionary/wrapper.rb +37 -0
  31. data/lib/loose_tight_dictionary.rb +178 -305
  32. data/loose_tight_dictionary.gemspec +19 -64
  33. data/test/helper.rb +6 -6
  34. data/test/test_blocking.rb +23 -0
  35. data/test/test_extract_regexp.rb +18 -0
  36. data/test/test_identity.rb +18 -0
  37. data/test/test_loose_tight_dictionary.rb +52 -245
  38. data/test/test_loose_tight_dictionary_convoluted.rb.disabled +268 -0
  39. data/test/test_tightening.rb +10 -0
  40. metadata +52 -65
  41. data/VERSION +0 -1
  42. data/examples/icao-bts.rb +0 -58
@@ -0,0 +1 @@
1
+ regexp,notes
@@ -1,23 +1,15 @@
1
1
  #!/usr/bin/env ruby
2
- require 'rubygems'
3
- # require 'loose_tight_dictionary'
4
- require File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib', 'loose_tight_dictionary.rb'))
5
- right_side = [ 'seamus', 'andy', 'ben' ]
6
- left_side = [ 'Mr. Seamus', 'Sr. Andy', 'Master BenT' ]
7
-
8
- puts "Left side (input)"
9
- puts "=" * 20
10
- puts left_side
11
- puts
12
-
13
- puts "Right side (output)"
14
- puts "=" * 20
15
- puts right_side
16
- puts
17
-
18
- puts "Results"
19
- puts "=" * 20
20
- d = LooseTightDictionary.new right_side, :tee => STDOUT, :tee_format => :fixed_width
21
- d.check left_side
22
-
23
- puts d.left_to_right 'Shamus Heaney'
2
+ unless RUBY_VERSION >= '1.9'
3
+ require 'rubygems'
4
+ end
5
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
6
+ require 'loose_tight_dictionary'
7
+
8
+ haystack = [ 'seamus', 'andy', 'ben' ]
9
+ needles = [ 'Mr. Seamus', 'Sr. Andy', 'Master BenT', 'Shamus Heaney' ]
10
+
11
+ d = LooseTightDictionary.new haystack, :log => $stdout
12
+ needles.each do |needle|
13
+ d.explain needle
14
+ puts
15
+ end
@@ -0,0 +1,36 @@
1
class LooseTightDictionary
  # "Record linkage typically involves two main steps: blocking and scoring..."
  # http://en.wikipedia.org/wiki/Record_linkage
  #
  # Blockings effectively divide up the haystack into groups that match a pattern.
  #
  # A blocking (as in a grouping) comes into effect when a string matches its
  # regexp; the string it is compared against must then match that regexp too.
  class Blocking
    include ExtractRegexp

    attr_reader :regexp

    # regexp_or_str - a Regexp, or a String that extract_regexp can turn into one.
    def initialize(regexp_or_str)
      @regexp = extract_regexp regexp_or_str
    end

    # If a blocking "encompasses" two strings, that means they both fit into it.
    #
    # Called with one string: returns true or false depending on whether that
    # string matches the regexp.
    #
    # Called with two strings:
    # * returns nil if the blocking doesn't apply, i.e. str2 doesn't match.
    # * returns false if str2 matches but str1 doesn't.
    # * otherwise returns whether both matches produced equal captures.
    def encompass?(str1, str2 = nil)
      return !!regexp.match(str1) if str2.nil?

      # str2 is matched first: it decides whether the blocking applies at all.
      second = regexp.match(str2)
      return nil unless second

      first = regexp.match(str1)
      return false unless first

      second.captures == first.captures
    end
  end
end
@@ -0,0 +1,30 @@
1
class LooseTightDictionary
  # Mixin that turns either a Regexp or a regexp-looking String (such as
  # "/foo/i" or "%r{foo}i") into a Regexp object.
  module ExtractRegexp #:nodoc: all
    # regexp_or_str - a Regexp (returned untouched) or a String written like a
    #                 Ruby regexp literal.
    #
    # Returns a Regexp.
    # Raises ArgumentError for any other kind of argument, or for a String
    # that is not wrapped in one of the REGEXP_DELIMITERS.
    def extract_regexp(regexp_or_str)
      case regexp_or_str
      when ::Regexp
        regexp_or_str
      when ::String
        regexp_from_string regexp_or_str
      else
        raise ::ArgumentError, "Expected regexp or string"
      end
    end

    # Opening delimiter => closing delimiter of the literal forms understood
    # by regexp_from_string.
    REGEXP_DELIMITERS = {
      '%r{' => '}',
      '/' => '/'
    }

    # Parses a String such as "/7(\d)7/i" or "%r{a/b}mx" into a Regexp.
    #
    # Surrounding whitespace is ignored. The trailing option letters i, m and
    # x are honoured and may be combined; any other trailing letters are ignored.
    #
    # Raises ArgumentError if the string is not a delimited regexp literal.
    def regexp_from_string(str)
      # Strip first so that padded input still finds its delimiter.
      literal = str.strip

      delimiters = REGEXP_DELIMITERS.detect { |opening, _closing| literal.start_with? opening }
      unless delimiters
        raise ::ArgumentError, "Expected a regexp literal like /foo/ or %r{foo}, got #{str.inspect}"
      end
      delim_start, delim_end = delimiters.map { |delim| ::Regexp.escape delim }

      match_data = %r{\A#{delim_start}(.*)#{delim_end}([^#{delim_end}]*)\z}.match(literal)
      unless match_data
        raise ::ArgumentError, "Expected a regexp literal like /foo/ or %r{foo}, got #{str.inspect}"
      end

      # Escaped slashes are only needed inside a /.../ literal; unescape them.
      content = match_data[1].gsub '\\/', '/'
      options = match_data[2]

      # Regexp options are a bitmask, so every requested flag has to be OR-ed in.
      flags = 0
      flags |= ::Regexp::IGNORECASE if options.include?('i')
      flags |= ::Regexp::MULTILINE if options.include?('m')
      flags |= ::Regexp::EXTENDED if options.include?('x')

      ::Regexp.new content, flags
    end
  end
end
@@ -0,0 +1,25 @@
1
class LooseTightDictionary
  # Identities take effect when needle and haystack both match a regexp.
  # Then the captured part of the regexp has to match exactly.
  class Identity
    include ExtractRegexp

    attr_reader :regexp

    # regexp_or_str - a Regexp, or a String that extract_regexp can turn into one.
    def initialize(regexp_or_str)
      @regexp = extract_regexp regexp_or_str
    end

    # Two strings are "identical" if they both match this identity and their
    # captures are equal.
    #
    # Returns true/false only when both strings match the regexp.
    # Returns nil as soon as either string fails to match.
    def identical?(str1, str2)
      first = regexp.match(str1)
      return nil unless first

      # str2 is only examined once str1 is known to match.
      second = regexp.match(str2)
      return nil unless second

      first.captures == second.captures
    end
  end
end
@@ -0,0 +1,23 @@
1
class LooseTightDictionary
  # Plain record of what happened during a lookup: the needle, the rules
  # that were in force, and the intermediate groupings of haystack entries.
  class Result #:nodoc: all
    attr_accessor :needle
    attr_accessor :tighteners
    attr_accessor :blockings
    attr_accessor :identities
    attr_accessor :encompassed
    attr_accessor :unencompassed
    attr_accessor :possibly_identical
    attr_accessor :certainly_different
    attr_accessor :similarities
    attr_accessor :record
    attr_accessor :score

    # Every haystack entry that was considered: encompassed entries followed
    # by unencompassed ones.
    #
    # Either list may still be unset (nil) if the lookup stopped before the
    # haystack was partitioned; an unset list counts as empty instead of
    # raising NoMethodError.
    def haystack
      (encompassed || []) + (unencompassed || [])
    end

    # Intentionally a no-op.
    def free
      # nothing to see here
    end
  end
end
@@ -0,0 +1,28 @@
1
+ require 'amatch'
2
+
3
class LooseTightDictionary
  # The similarity of two strings, usable as an ordered value.
  class Score
    # <=> is defined below, so Comparable supplies <, <=, >, >= and between?.
    include Comparable

    attr_reader :str1, :str2

    def initialize(str1, str2)
      @str1 = str1
      @str2 = str2
    end

    # The numeric score, computed on first use by calling
    # str1.pair_distance_similar(str2) and then cached.
    # NOTE(review): pair_distance_similar presumably comes from the amatch
    # gem's String extensions - confirm against the file's requires.
    def to_f
      @to_f ||= str1.pair_distance_similar str2
    end

    def inspect
      %{#<Score: to_f=#{to_f}>}
    end

    # Scores are ordered by their numeric value.
    def <=>(other)
      to_f <=> other.to_f
    end

    # Scores are equal when their numeric values are equal.
    def ==(other)
      to_f == other.to_f
    end
  end
end
@@ -0,0 +1,62 @@
1
class LooseTightDictionary
  # Compares two wrappers by finding the pair of variants (one from each
  # wrapper) that scores highest.
  class Similarity
    attr_reader :wrapper1
    attr_reader :wrapper2

    def initialize(wrapper1, wrapper2)
      @wrapper1 = wrapper1
      @wrapper2 = wrapper2
    end

    # Orders by best score; ties are broken by weight.
    def <=>(other)
      if best_score != other.best_score
        best_score <=> other.best_score
      else
        weight <=> other.weight
      end
    end

    # Weight things towards short original strings
    def weight
      @weight ||= (1.0 / (wrapper1.to_str.length * wrapper2.to_str.length))
    end

    # Score of the best pair of variants.
    def best_score
      @best_score ||= Score.new best_wrapper1_variant, best_wrapper2_variant
    end

    def best_wrapper1_variant
      best_variants[0]
    end

    def best_wrapper2_variant
      best_variants[1]
    end

    # The [wrapper1 variant, wrapper2 variant] pair with the highest score.
    #
    # Each pair gets exactly one Score object up front, so a pair is scored at
    # most once no matter how many comparisons the sort performs. The sort
    # itself and its comparison results are the same as comparing freshly
    # built scores.
    def best_variants
      @best_variants ||= begin
        scored_pairs = cart_prod(wrapper1.variants, wrapper2.variants).map do |variant1, variant2|
          [ Score.new(variant1, variant2), [ variant1, variant2 ] ]
        end
        best = scored_pairs.sort { |a, b| a[0] <=> b[0] }[-1]
        best && best[1]
      end
    end

    def inspect
      %{#<Similarity "#{wrapper2.to_str}"=>"#{best_wrapper2_variant}" versus "#{wrapper1.to_str}"=>"#{best_wrapper1_variant}" weight=#{"%0.5f" % weight} best_score=#{"%0.5f" % best_score.to_f}>}
    end

    # Cartesian product of the given lists, with the first list varying fastest.
    #
    # Thanks William James!
    # http://www.ruby-forum.com/topic/95519#200484
    def cart_prod(*args)
      args.inject([[]]) do |combos, list|
        expanded = []
        list.each { |element| expanded += combos.map { |combo| combo.dup << element } }
        expanded
      end
    end
  end
end
@@ -0,0 +1,30 @@
1
class LooseTightDictionary
  # A tightener just strips a string down to its core
  class Tightener
    include ExtractRegexp

    attr_reader :regexp

    # regexp_or_str - a Regexp, or a String that extract_regexp can turn into one.
    def initialize(regexp_or_str)
      @regexp = extract_regexp regexp_or_str
    end

    # A tightener applies to a string when its regexp matches that string.
    def apply?(str)
      regexp.match(str) ? true : false
    end

    # The result of applying a tightener is all the captures joined together.
    # A string that does not match is returned unchanged.
    def apply(str)
      match_data = regexp.match(str)
      return str unless match_data

      match_data.captures.join
    end

    def inspect
      "#<Tightener regexp=#{regexp.inspect}>"
    end
  end
end
@@ -0,0 +1,3 @@
1
+ class LooseTightDictionary
2
+ VERSION = '0.1.0'
3
+ end
@@ -0,0 +1,37 @@
1
class LooseTightDictionary
  # Wrappers are the tokens that are passed around when doing scoring and optimizing.
  class Wrapper #:nodoc: all
    attr_reader :parent
    attr_reader :record
    attr_reader :reader

    # attrs - pairs of name => value; each value is stored in the instance
    #         variable of that name (e.g. :parent, :record, :reader).
    def initialize(attrs = {})
      attrs.each { |name, value| instance_variable_set "@#{name}", value }
    end

    def inspect
      "#<Wrapper to_str=#{to_str} variants=#{variants.length}>"
    end

    # The string form of the record: whatever the reader returns for it when a
    # reader was given, otherwise record.to_s. Cached after the first call.
    def to_str
      @to_str ||= if reader
        reader.call(record)
      else
        record.to_s
      end
    end

    alias :to_s :to_str

    # Builds a Similarity between this wrapper and another one.
    def similarity(other)
      Similarity.new self, other
    end

    # The string itself followed by the output of every parent tightener that
    # applies to it, without duplicates. Cached after the first call.
    def variants
      @variants ||= begin
        collected = [ to_str ]
        parent.tighteners.each do |tightener|
          collected.push tightener.apply(to_str) if tightener.apply? to_str
        end
        collected.uniq
      end
    end
  end
end
@@ -2,351 +2,224 @@ require 'active_support'
2
2
  require 'active_support/version'
3
3
  %w{
4
4
  active_support/core_ext/string
5
+ active_support/core_ext/hash
6
+ active_support/core_ext/object
5
7
  }.each do |active_support_3_requirement|
6
8
  require active_support_3_requirement
7
- end if ActiveSupport::VERSION::MAJOR == 3
8
- require 'amatch'
9
- require 'andand'
10
- if RUBY_VERSION >= '1.9'
11
- require 'csv'
12
- else
13
- begin
14
- require 'fastercsv'
15
- rescue LoadError
16
- $stderr.puts "[loose_tight_dictionary gem] You probably need to manually install the fastercsv gem."
17
- raise $!
18
- end
19
- end
9
+ end if ::ActiveSupport::VERSION::MAJOR == 3
20
10
 
11
+ # See the README for more information.
21
12
  class LooseTightDictionary
22
- class MissedChecks < RuntimeError; end
23
- class Mismatch < RuntimeError; end
24
- class FalsePositive < RuntimeError; end
13
+ autoload :ExtractRegexp, 'loose_tight_dictionary/extract_regexp'
14
+ autoload :Tightener, 'loose_tight_dictionary/tightener'
15
+ autoload :Blocking, 'loose_tight_dictionary/blocking'
16
+ autoload :Identity, 'loose_tight_dictionary/identity'
17
+ autoload :Result, 'loose_tight_dictionary/result'
18
+ autoload :Wrapper, 'loose_tight_dictionary/wrapper'
19
+ autoload :Similarity, 'loose_tight_dictionary/similarity'
20
+ autoload :Score, 'loose_tight_dictionary/score'
25
21
 
26
- class T
27
- attr_reader :str, :tightened_str
28
- def initialize(str, tightened_str)
29
- @str = str
30
- @tightened_str = tightened_str
31
- end
32
-
33
- def tightened?
34
- str != tightened_str
35
- end
36
-
37
- def prefix_and_score(other)
38
- prefix = [ tightened_str.length, other.tightened_str.length ].min if tightened? and other.tightened?
39
- score = if prefix
40
- tightened_str.first(prefix).pair_distance_similar other.tightened_str.first(prefix)
41
- else
42
- tightened_str.pair_distance_similar other.tightened_str
43
- end
44
- [ prefix, score ]
45
- end
46
- end
47
-
48
- include Amatch
49
-
50
- attr_reader :right_records
51
- attr_reader :case_sensitive
52
-
53
- attr_accessor :logger
54
- attr_accessor :tee
55
- attr_accessor :tee_format
56
- attr_accessor :positives
57
- attr_accessor :negatives
58
- attr_accessor :left_reader
59
- attr_accessor :right_reader
60
- attr_accessor :blocking_only
22
+ class Freed < RuntimeError; end
23
+
24
+ attr_reader :options
25
+ attr_reader :haystack
26
+ attr_reader :records
61
27
 
62
- def initialize(right_records, options = {})
63
- @right_records = right_records
64
- @_raw_tightenings = options[:tightenings] || Array.new
65
- @_raw_identities = options[:identities] || Array.new
66
- @_raw_blockings = options[:blockings] || Array.new
67
- @left_reader = options[:left_reader]
68
- @right_reader = options[:right_reader]
69
- @positives = options[:positives]
70
- @negatives = options[:negatives]
71
- @logger = options[:logger]
72
- @tee = options[:tee]
73
- @tee_format = options[:tee_format] || :fixed_width
74
- @case_sensitive = options[:case_sensitive] || false
75
- @blocking_only = options[:blocking_only] || false
28
+ # haystack - a bunch of records
29
+ # options
30
+ # * tighteners: regexps that essentialize strings down
31
+ # * identities: regexps that rule out similarities, for example a 737 cannot be identical to a 747
32
+ def initialize(records, options = {})
33
+ @options = options.symbolize_keys
34
+ @records = records
35
+ @haystack = records.map { |record| Wrapper.new :parent => self, :record => record, :reader => haystack_reader }
76
36
  end
77
37
 
78
- # def tightenings
79
- # def identities
80
- # def blockings
81
- %w{ tightenings identities blockings }.each do |name|
82
- module_eval %{
83
- def #{name}
84
- @#{name} ||= @_raw_#{name}.map do |i|
85
- next if i[0].blank?
86
- literal_regexp i[0]
87
- end
88
- end
89
- }
38
+ def last_result
39
+ @last_result ||= Result.new
90
40
  end
91
41
 
92
- def blocking_only?
93
- !!blocking_only
42
+ def log(str = '') #:nodoc:
43
+ (options[:log] || $stderr).puts str unless options[:log] == false
94
44
  end
95
-
96
- def inline_check(left_record, right_record)
97
- return unless positives.present? or negatives.present?
98
-
99
- left = read_left left_record
100
- right = read_right right_record
45
+
46
+ def find_with_score(needle)
47
+ record = find needle
48
+ [ record, last_result.score ]
49
+ end
50
+
51
+ # todo fix record.record confusion (should be wrapper.record or smth)
52
+ def find(needle, gather_last_result = true)
53
+ raise Freed if freed?
54
+ free_last_result
101
55
 
102
- if positive_record = positives.andand.detect { |record| record[0] == left }
103
- correct_right = positive_record[1]
104
- if correct_right.present? and right.blank?
105
- logger.andand.debug " Mismatch! (should match SOMETHING)"
106
- raise Mismatch
107
- elsif right != correct_right
108
- logger.andand.debug " Mismatch! (#{right} should be #{correct_right})"
109
- raise Mismatch
110
- end
56
+ if gather_last_result
57
+ last_result.tighteners = tighteners
58
+ last_result.identities = identities
59
+ last_result.blockings = blockings
111
60
  end
112
61
 
113
- if negative_record = negatives.andand.detect { |record| record[0] == left }
114
- incorrect_right = negative_record[1]
115
- if incorrect_right.blank? and right.present?
116
- logger.andand.debug " False positive! (should NOT match ANYTHING)"
117
- raise FalsePositive
118
- elsif right == incorrect_right
119
- logger.andand.debug " False positive! (#{right} should NOT be #{incorrect_right})"
120
- raise FalsePositive
121
- end
122
- end
123
- end
124
-
125
- def check(left_records)
126
- header = [ 'Left record (input)', 'Right record (output)', 'Prefix used (if any)', 'Score' ]
127
- case tee_format
128
- when :csv
129
- tee.andand.puts header.flatten.to_csv
130
- when :fixed_width
131
- tee.andand.puts header.map { |i| i.to_s.ljust(30) }.join
62
+ needle = Wrapper.new :parent => self, :record => needle
63
+
64
+ if gather_last_result
65
+ last_result.needle = needle
132
66
  end
67
+
68
+ return if strict_blocking and blockings.none? { |blocking| blocking.encompass? needle }
133
69
 
134
- left_records.each do |left_record|
135
- begin
136
- right_record = left_to_right left_record
137
- ensure
138
- case tee_format
139
- when :csv
140
- tee.andand.puts $ltd_1.flatten.to_csv
141
- when :fixed_width
142
- tee.andand.puts $ltd_1.map { |i| i.to_s.ljust(30) }.join if $ltd_1
70
+ encompassed, unencompassed = if strict_blocking and blockings.any?
71
+ haystack.partition do |record|
72
+ blockings.any? do |blocking|
73
+ blocking.encompass?(needle, record) == true
143
74
  end
144
75
  end
76
+ else
77
+ [ haystack.dup, [] ]
145
78
  end
146
- end
147
-
148
- def left_to_right(left_record)
149
- left = read_left left_record
150
- blocking_left = blocking left
151
- return if blocking_only? and blocking_left.nil?
152
- i_options_left = i_options left
153
- t_options_left = t_options left
154
- history = Hash.new
155
- right_record = right_records.select do |right_record|
156
- right = read_right right_record
157
- blocking_right = blocking right
158
- (not blocking_left and not blocking_right) or
159
- (blocking_right and blocking_right.match(left)) or
160
- (blocking_left and blocking_left.match(right))
161
- end.max do |a_record, b_record|
162
- a = read_right a_record
163
- b = read_right b_record
164
- i_options_a = i_options a
165
- i_options_b = i_options b
166
- collision_a = collision? i_options_left, i_options_a
167
- collision_b = collision? i_options_left, i_options_b
168
- if collision_a and collision_b
169
- # neither would ever work, so randomly rank one over the other
170
- rand(2) == 1 ? -1 : 1
171
- elsif collision_a
172
- -1
173
- elsif collision_b
174
- 1
175
- else
176
- t_left_a, t_right_a = optimize t_options_left, t_options(a)
177
- t_left_b, t_right_b = optimize t_options_left, t_options(b)
178
- a_prefix, a_score = t_left_a.prefix_and_score t_right_a
179
- b_prefix, b_score = t_left_b.prefix_and_score t_right_b
180
- history[a_record] = [t_left_a.tightened_str, t_right_a.tightened_str, a_prefix ? a_prefix : 'NULL', a_score]
181
- history[b_record] = [t_left_b.tightened_str, t_right_b.tightened_str, b_prefix ? b_prefix : 'NULL', b_score]
182
-
183
- yep_dd = ($ltd_dd_right and $ltd_dd_left and [t_left_a, t_left_b].any? { |f| f.str =~ $ltd_dd_left } and [t_right_a, t_right_b].any? { |f| f.str =~ $ltd_dd_right } and (!$ltd_dd_left_not or [t_left_a, t_left_b].none? { |f| f.str =~ $ltd_dd_left_not }))
184
-
185
- if $ltd_dd_print and yep_dd
186
- logger.andand.debug t_left_a.inspect
187
- logger.andand.debug t_right_a.inspect
188
- logger.andand.debug t_left_b.inspect
189
- logger.andand.debug t_right_b.inspect
190
- logger.andand.debug
191
- end
192
-
193
- z = 1
194
- debugger if yep_dd
195
- z = 1
196
-
197
- if a_score != b_score
198
- a_score <=> b_score
199
- elsif a_prefix and b_prefix and a_prefix != b_prefix
200
- a_prefix <=> b_prefix
201
- else
202
- b.length <=> a.length
79
+
80
+ if gather_last_result
81
+ last_result.encompassed = encompassed
82
+ last_result.unencompassed = unencompassed
83
+ end
84
+
85
+ possibly_identical, certainly_different = if identities.any?
86
+ encompassed.partition do |record|
87
+ identities.all? do |identity|
88
+ answer = identity.identical? needle, record
89
+ answer.nil? or answer == true
203
90
  end
204
91
  end
205
- end
206
- $ltd_1 = history[right_record]
207
- right = read_right right_record
208
- i_options_right = i_options right
209
- z = 1
210
- debugger if $ltd_left.andand.match(left) or $ltd_right.andand.match(right)
211
- z = 1
212
- if collision? i_options_left, i_options_right
213
- $ltd_0 = nil
214
- return
215
92
  else
216
- $ltd_0 = right_record
93
+ [ encompassed.dup, [] ]
217
94
  end
218
- inline_check left_record, right_record
219
- right_record
220
- end
221
- alias_method :find, :left_to_right
222
-
223
- def optimize(t_options_left, t_options_right)
224
- cart_prod(t_options_left, t_options_right).max do |a, b|
225
- t_left_a, t_right_a = a
226
- t_left_b, t_right_b = b
227
95
 
228
- a_prefix, a_score = t_left_a.prefix_and_score t_right_a
229
- b_prefix, b_score = t_left_b.prefix_and_score t_right_b
230
-
231
- yep_ddd = ($ltd_ddd_right and $ltd_ddd_left and [t_left_a, t_left_b].any? { |f| f.str =~ $ltd_ddd_left } and [t_right_a, t_right_b].any? { |f| f.str =~ $ltd_ddd_right } and (!$ltd_ddd_left_not or [t_left_a, t_left_b].none? { |f| f.str =~ $ltd_ddd_left_not }))
232
-
233
- if $ltd_ddd_print and yep_ddd
234
- logger.andand.debug t_left_a.inspect
235
- logger.andand.debug t_right_a.inspect
236
- logger.andand.debug t_left_b.inspect
237
- logger.andand.debug t_right_b.inspect
238
- logger.andand.debug
239
- end
240
-
241
- z = 1
242
- debugger if yep_ddd
243
- z = 1
244
-
245
- if a_score != b_score
246
- a_score <=> b_score
247
- elsif a_prefix and b_prefix and a_prefix != b_prefix
248
- a_prefix <=> b_prefix
249
- else
250
- # randomly choose
251
- # maybe later i can figure out how big the inputs are and apply occam's razor
252
- rand(2) == 1 ? -1 : 1
253
- end
96
+ if gather_last_result
97
+ last_result.possibly_identical = possibly_identical
98
+ last_result.certainly_different = certainly_different
254
99
  end
255
- end
256
-
257
- def t_options(str)
258
- return @_t_options[str] if @_t_options.andand.has_key?(str)
259
- @_t_options ||= Hash.new
260
- ary = Array.new
261
- ary.push T.new(str, str)
262
- tightenings.each do |regexp|
263
- if match_data = regexp.match(str)
264
- ary.push T.new(str, match_data.captures.compact.join)
265
- end
100
+
101
+ similarities = possibly_identical.map do |record|
102
+ needle.similarity record
103
+ end.sort
104
+
105
+ best_similarity = similarities[-1]
106
+ record = best_similarity.wrapper2
107
+ score = best_similarity.best_score.to_f
108
+
109
+ if gather_last_result
110
+ last_result.similarities = similarities
111
+ last_result.record = record.record
112
+ last_result.score = score
266
113
  end
267
- @_t_options[str] = ary
114
+
115
+ record.record
268
116
  end
269
117
 
270
- class I
271
- attr_reader :regexp, :str, :case_sensitive, :identity
272
- def initialize(regexp, str, case_sensitive)
273
- @regexp = regexp
274
- @str = str
275
- @identity = regexp.match(str).captures.compact.join
276
- @identity = @identity.downcase if case_sensitive
277
- end
118
+ # Explain is like mysql's EXPLAIN command. You give it a needle and it tells you about how it was located (successfully or not) in the haystack.
119
+ #
120
+ # d = LooseTightDictionary.new ['737', '747', '757' ]
121
+ # d.explain 'boeing 737-100'
122
+ def explain(needle)
123
+ record = find needle
124
+ log "#" * 150
125
+ log "# Match #{needle.inspect} => #{record.inspect}"
126
+ log "#" * 150
127
+ log
128
+ log "Needle"
129
+ log "-" * 150
130
+ log last_result.needle.to_str
131
+ log
132
+ log "Haystack"
133
+ log "-" * 150
134
+ log last_result.haystack.map { |record| record.to_str }.join("\n")
135
+ log
136
+ log "Tighteners"
137
+ log "-" * 150
138
+ log last_result.tighteners.blank? ? '(none)' : last_result.tighteners.map { |tightener| tightener.inspect }.join("\n")
139
+ log
140
+ log "Blockings"
141
+ log "-" * 150
142
+ log last_result.blockings.blank? ? '(none)' : last_result.blockings.map { |blocking| blocking.inspect }.join("\n")
143
+ log
144
+ log "Identities"
145
+ log "-" * 150
146
+ log last_result.identities.blank? ? '(none)' : last_result.identities.map { |blocking| blocking.inspect }.join("\n")
147
+ log
148
+ log "Included"
149
+ log "-" * 150
150
+ log last_result.encompassed.blank? ? '(none)' : last_result.encompassed.map { |encompassed| encompassed.to_str }.join("\n")
151
+ log
152
+ log "Ignored"
153
+ log "-" * 150
154
+ log last_result.unencompassed.blank? ? '(none)' : last_result.unencompassed.map { |unencompassed| unencompassed.to_str }.join("\n")
155
+ log
156
+ log "Possibly identical"
157
+ log "-" * 150
158
+ log last_result.possibly_identical.blank? ? '(none)' : last_result.possibly_identical.map { |possibly_identical| possibly_identical.to_str }.join("\n")
159
+ log
160
+ log "Certainly different"
161
+ log "-" * 150
162
+ log last_result.certainly_different.blank? ? '(none)' : last_result.certainly_different.map { |certainly_different| certainly_different.to_str }.join("\n")
163
+ log
164
+ log "Similarities"
165
+ log "-" * 150
166
+ log last_result.similarities.blank? ? '(none)' : last_result.similarities.reverse[0..9].map { |similarity| similarity.inspect }.join("\n")
167
+ log
168
+ log "Match"
169
+ log "-" * 150
170
+ log record.inspect
278
171
  end
279
-
280
- def collision?(i_options_left, i_options_right)
281
- i_options_left.any? do |r_left|
282
- i_options_right.any? do |r_right|
283
- r_left.regexp == r_right.regexp and r_left.identity != r_right.identity
284
- end
172
+
173
+ def haystack_reader
174
+ options[:haystack_reader]
175
+ end
176
+
177
+ def strict_blocking
178
+ options[:strict_blocking] || false
179
+ end
180
+
181
+ def tighteners
182
+ @tighteners ||= (options[:tighteners] || []).map do |regexp_or_str|
183
+ Tightener.new regexp_or_str
285
184
  end
286
185
  end
287
-
288
- def i_options(str)
289
- return @_i_options[str] if @_i_options.andand.has_key?(str)
290
- @_i_options ||= Hash.new
291
- ary = Array.new
292
- identities.each do |regexp|
293
- if regexp.match str
294
- ary.push I.new(regexp, str, case_sensitive)
295
- end
186
+
187
+ def identities
188
+ @identities ||= (options[:identities] || []).map do |regexp_or_str|
189
+ Identity.new regexp_or_str
296
190
  end
297
- @_i_options[str] = ary
298
191
  end
299
-
300
- def blocking(str)
301
- return @_blocking[str] if @_blocking.andand.has_key?(str)
302
- @_blocking ||= Hash.new
303
- blockings.each do |regexp|
304
- if regexp.match str
305
- return @_blocking[str] = regexp
306
- end
192
+
193
+ def blockings
194
+ @blockings ||= (options[:blockings] || []).map do |regexp_or_str|
195
+ Blocking.new regexp_or_str
307
196
  end
308
- @_blocking[str] = nil
309
197
  end
310
198
 
311
- def literal_regexp(str)
312
- return @_literal_regexp[str] if @_literal_regexp.andand.has_key? str
313
- @_literal_regexp ||= Hash.new
314
- raw_regexp_options = str.split('/').last
315
- ignore_case = (!case_sensitive or raw_regexp_options.include?('i')) ? Regexp::IGNORECASE : nil
316
- multiline = raw_regexp_options.include?('m') ? Regexp::MULTILINE : nil
317
- extended = raw_regexp_options.include?('x') ? Regexp::EXTENDED : nil
318
- @_literal_regexp[str] = Regexp.new str.gsub(/\A\/|\/([ixm]*)\z/, ''), (ignore_case||multiline||extended)
199
+ def freed?
200
+ @freed == true
319
201
  end
320
202
 
321
- def read_left(left_record)
322
- return if left_record.nil?
323
- if left_reader
324
- left_reader.call(left_record)
325
- elsif left_record.is_a?(String)
326
- left_record
327
- else
328
- left_record[0]
329
- end
203
+ def free
204
+ free_last_result
205
+ @options.try :clear
206
+ @options = nil
207
+ @haystack.try :clear
208
+ @haystack = nil
209
+ @tighteners.try :clear
210
+ @tighteners = nil
211
+ @identities.try :clear
212
+ @identities = nil
213
+ @blockings.try :clear
214
+ @blockings = nil
215
+ ensure
216
+ @freed = true
330
217
  end
331
218
 
332
- def read_right(right_record)
333
- return if right_record.nil?
334
- if right_reader
335
- right_reader.call(right_record)
336
- elsif right_record.is_a?(String)
337
- right_record
338
- else
339
- right_record[0]
340
- end
341
- end
219
+ private
342
220
 
343
- # Thanks William James!
344
- # http://www.ruby-forum.com/topic/95519#200484
345
- def cart_prod(*args)
346
- args.inject([[]]){|old,lst|
347
- new = []
348
- lst.each{|e| new += old.map{|c| c.dup << e }}
349
- new
350
- }
221
+ def free_last_result
222
+ @last_result.try :free
223
+ @last_result = nil
351
224
  end
352
225
  end