fuzzy_match 1.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +5 -0
- data/.gitignore +22 -0
- data/Gemfile +4 -0
- data/LICENSE +20 -0
- data/README.rdoc +94 -0
- data/Rakefile +21 -0
- data/THANKS-WILLIAM-JAMES.rb +37 -0
- data/benchmark/before-with-free.txt +283 -0
- data/benchmark/before-without-last-result.txt +257 -0
- data/benchmark/before.txt +304 -0
- data/benchmark/memory.rb +54 -0
- data/examples/bts_aircraft/5-2-A.htm +10305 -0
- data/examples/bts_aircraft/5-2-B.htm +9576 -0
- data/examples/bts_aircraft/5-2-D.htm +7094 -0
- data/examples/bts_aircraft/5-2-E.htm +2349 -0
- data/examples/bts_aircraft/5-2-G.htm +2922 -0
- data/examples/bts_aircraft/blockings.csv +1 -0
- data/examples/bts_aircraft/identities.csv +1 -0
- data/examples/bts_aircraft/negatives.csv +1 -0
- data/examples/bts_aircraft/number_260.csv +334 -0
- data/examples/bts_aircraft/positives.csv +1 -0
- data/examples/bts_aircraft/test_bts_aircraft.rb +118 -0
- data/examples/bts_aircraft/tighteners.csv +1 -0
- data/examples/first_name_matching.rb +15 -0
- data/examples/icao-bts.xls +0 -0
- data/fuzzy_match.gemspec +32 -0
- data/lib/fuzzy_match/blocking.rb +36 -0
- data/lib/fuzzy_match/cached_result.rb +74 -0
- data/lib/fuzzy_match/identity.rb +23 -0
- data/lib/fuzzy_match/result.rb +17 -0
- data/lib/fuzzy_match/score.rb +125 -0
- data/lib/fuzzy_match/similarity.rb +53 -0
- data/lib/fuzzy_match/stop_word.rb +19 -0
- data/lib/fuzzy_match/tightener.rb +28 -0
- data/lib/fuzzy_match/version.rb +3 -0
- data/lib/fuzzy_match/wrapper.rb +67 -0
- data/lib/fuzzy_match.rb +252 -0
- data/test/helper.rb +12 -0
- data/test/test_blocking.rb +23 -0
- data/test/test_cache.rb +130 -0
- data/test/test_fuzzy_match.rb +190 -0
- data/test/test_fuzzy_match_convoluted.rb.disabled +268 -0
- data/test/test_identity.rb +33 -0
- data/test/test_tightening.rb +10 -0
- metadata +197 -0
@@ -0,0 +1,23 @@
|
|
1
|
+
class FuzzyMatch
  # An identity is a regexp whose captures act as a key: it only has an opinion
  # when both strings match it, and then the captured parts must be exactly equal.
  class Identity
    attr_reader :regexp

    # regexp_or_str - any object responding to #to_regexp; the converted value
    # is exposed through #regexp.
    def initialize(regexp_or_str)
      @regexp = regexp_or_str.to_regexp
    end

    # Decide whether two strings are "identical" according to this identity.
    #
    # Returns true/false (captures equal or not) when both strings match the
    # regexp, and nil when either of them does not match.
    def identical?(str1, str2)
      first_match = regexp.match(str1)
      return nil unless first_match

      # The second string is only examined once the first one has matched.
      second_match = regexp.match(str2)
      return nil unless second_match

      first_match.captures == second_match.captures
    end
  end
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
class FuzzyMatch
  # Plain value holder with one read/write attribute per piece of diagnostic
  # information gathered during a find.
  class Result #:nodoc: all
    attr_accessor :needle,
                  :tighteners,
                  :blockings,
                  :identities,
                  :stop_words,
                  :candidates,
                  :joint,
                  :disjoint,
                  :possibly_identical,
                  :certainly_different,
                  :similarities,
                  :record,
                  :score
  end
end
|
@@ -0,0 +1,125 @@
|
|
1
|
+
begin
|
2
|
+
require 'amatch'
|
3
|
+
rescue ::LoadError
|
4
|
+
# using native ruby similarity scoring
|
5
|
+
end
|
6
|
+
|
7
|
+
class FuzzyMatch
  # Scores how alike two strings are, case-insensitively, using Dice's
  # coefficient (primary) and a normalized Levenshtein similarity (tie-breaker).
  #
  # When the Amatch constant is defined the scoring is delegated to it;
  # otherwise the pure-Ruby implementations below are used.
  class Score
    attr_reader :str1, :str2

    # Both strings are downcased once, up front, so every metric sees the same input.
    def initialize(str1, str2)
      @str1 = str1.downcase
      @str2 = str2.downcase
    end

    def inspect
      %{#<Score: dices_coefficient=#{dices_coefficient} levenshtein=#{levenshtein}>}
    end

    # Order by Dice's coefficient, falling back to Levenshtein similarity on a tie.
    def <=>(other)
      by_dices_coefficient = (dices_coefficient <=> other.dices_coefficient)
      if by_dices_coefficient == 0
        levenshtein <=> other.levenshtein
      else
        by_dices_coefficient
      end
    end

    # Dice's coefficient of the two strings, as a Float. Computed once per instance.
    def dices_coefficient
      @dices_coefficient ||= compute_dices_coefficient
    end

    # Normalized Levenshtein similarity, as a Float. Computed once per instance.
    def levenshtein
      @levenshtein ||= compute_levenshtein
    end

    # True when the encoding name of str1 (or $KCODE where ::Encoding is not
    # defined) starts with "u". Computed once per instance.
    def utf8?
      if @utf8.nil?
        @utf8 = (defined?(::Encoding) ? str1.encoding.to_s : $KCODE).downcase.start_with?('u')
      end
      @utf8
    end

    if defined?(::Amatch)

      def compute_dices_coefficient
        str1.pair_distance_similar str2
      end

      def compute_levenshtein
        str1.levenshtein_similar str2
      end

    else

      SPACE = ' '

      # http://stackoverflow.com/questions/653157/a-better-similarity-ranking-algorithm-for-variable-length-strings
      def compute_dices_coefficient
        return 1.0 if str1 == str2
        return 0.0 if str1.length == 1 and str2.length == 1

        pairs1 = bigrams(str1)
        pairs2 = bigrams(str2)
        union = pairs1.size + pairs2.size
        # Neither string produced a usable pair (e.g. "a" vs "b c"). Without this
        # guard the division below yields NaN, which makes <=> return nil.
        return 0.0 if union == 0

        intersection = 0
        pairs1.each do |pair|
          index = pairs2.index(pair)
          next unless index
          intersection += 1
          # Each pair of the second string may only be consumed once.
          pairs2.delete_at(index)
        end
        (2.0 * intersection) / union
      end

      # Adjacent two-character slices of +str+, skipping any that contain a space.
      def bigrams(str)
        (0..str.length-2).map { |i| str[i,2] }.reject { |pair| pair.include? SPACE }
      end
      private :bigrams

      # extracted/adapted from the text gem version 1.0.2
      # normalization added for utf-8 strings
      # lib/text/levenshtein.rb
      def compute_levenshtein
        unpack_rule = utf8? ? 'U*' : 'C*'
        s = str1.unpack(unpack_rule)
        t = str2.unpack(unpack_rule)
        n = s.length
        m = t.length
        return 0.0 if n == 0 or m == 0

        d = (0..m).to_a
        x = nil
        (0...n).each do |i|
          e = i+1
          (0...m).each do |j|
            cost = (s[i] == t[j]) ? 0 : 1
            x = [
              d[j+1] + 1, # insertion
              e + 1,      # deletion
              d[j] + cost # substitution
            ].min
            d[j] = e
            e = x
          end
          d[m] = x
        end
        # normalization logic from https://github.com/flori/amatch/blob/master/ext/amatch_ext.c#L301
        # (the edit distance is divided by the length of the longer string)
        1.0 - x.to_f / [n, m].max
      end

    end

    private :compute_dices_coefficient, :compute_levenshtein
  end
end
|
@@ -0,0 +1,53 @@
|
|
1
|
+
class FuzzyMatch
  # Compares two wrappers by the best score obtainable from any pairing of
  # their variants. Ties are broken in favour of shorter original strings.
  class Similarity
    attr_reader :wrapper1
    attr_reader :wrapper2

    def initialize(wrapper1, wrapper2)
      @wrapper1 = wrapper1
      @wrapper2 = wrapper2
    end

    # Order by best score first, then by original_weight when the scores tie.
    def <=>(other)
      by_score = best_score <=> other.best_score
      if by_score == 0
        original_weight <=> other.original_weight
      else
        by_score
      end
    end

    # Weight things towards short original strings
    def original_weight
      @original_weight ||= (1.0 / (wrapper1.render.length * wrapper2.render.length))
    end

    # Score of the best pairing of variants (see #best_variants).
    def best_score
      @best_score ||= Score.new best_wrapper1_variant, best_wrapper2_variant
    end

    def best_wrapper1_variant
      best_variants[0]
    end

    def best_wrapper2_variant
      best_variants[1]
    end

    # The [wrapper1 variant, wrapper2 variant] pair that sorts highest by Score.
    #
    # Every pairing is scored exactly once before sorting; building the Score
    # objects inside the sort block would recreate them on every comparison.
    def best_variants
      @best_variants ||= begin
        scored = wrapper1.variants.product(wrapper2.variants).map do |tuple|
          [ Score.new(tuple[0], tuple[1]), tuple ]
        end
        scored.sort { |left, right| left[0] <=> right[0] }[-1][1]
      end
    end

    def inspect
      %{#<Similarity "#{wrapper2.render}"=>"#{best_wrapper2_variant}" versus "#{wrapper1.render}"=>"#{best_wrapper1_variant}" original_weight=#{"%0.5f" % original_weight} best_score=#{best_score.inspect}>}
    end
  end
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
class FuzzyMatch
  # A stop word is a pattern whose occurrences are deleted from a string
  # before it is compared.
  class StopWord
    attr_reader :regexp

    # regexp_or_str - any object responding to #to_regexp.
    def initialize(regexp_or_str)
      @regexp = regexp_or_str.to_regexp
    end

    # Delete every occurrence of the stop word from +str+, modifying it in place.
    # The return value is that of String#gsub! (nil when nothing was removed).
    def apply!(str)
      str.gsub!(regexp, '')
    end

    def inspect
      description = regexp.inspect
      "#<StopWord regexp=#{description}>"
    end
  end
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
class FuzzyMatch
  # A tightener reduces a string to the parts captured by its regexp.
  class Tightener
    attr_reader :regexp

    # regexp_or_str - any object responding to #to_regexp.
    def initialize(regexp_or_str)
      @regexp = regexp_or_str.to_regexp
    end

    # Whether the regexp matches +str+ at all. Always returns true or false.
    def apply?(str)
      regexp.match(str) ? true : false
    end

    # All captures of the match concatenated into one string, or +str+ itself
    # when the regexp does not match.
    def apply(str)
      match_data = regexp.match(str)
      return str unless match_data

      match_data.captures.join
    end

    def inspect
      description = regexp.inspect
      "#<Tightener regexp=#{description}>"
    end
  end
end
|
@@ -0,0 +1,67 @@
|
|
1
|
+
class FuzzyMatch
  # Wrappers are the tokens that are passed around when doing scoring and optimizing.
  # Each one pairs a record with the way to read a string out of it, and caches
  # the rendered string, its words and its tightened variants.
  class Wrapper #:nodoc: all
    WORD_BOUNDARY = %r{\s*\b\s*}

    attr_reader :fuzzy_match, :record, :read

    # fuzzy_match - owner object supplying #stop_words and #tighteners
    # record      - the wrapped object
    # read        - nil, a Proc, a Symbol, or any other key usable with record[]
    def initialize(fuzzy_match, record, read = nil)
      @fuzzy_match = fuzzy_match
      @record = record
      @read = read
    end

    def inspect
      "#<Wrapper render=#{render} variants=#{variants.length}>"
    end

    # The string form of the record: read, stripped of stop words and of
    # surrounding whitespace, then frozen. Computed once.
    def render
      return @render if rendered?

      # Work on a copy so that the record's own string is never mutated.
      text = raw_value.to_s.dup
      fuzzy_match.stop_words.each { |stop_word| stop_word.apply! text }
      text.strip!
      @rendered = true
      @render = text.freeze
    end

    alias :to_str :render

    # The rendered string split on word boundaries. Computed once.
    def words
      @words ||= render.split(WORD_BOUNDARY)
    end

    def similarity(other)
      Similarity.new self, other
    end

    # The rendered string followed by the output of every tightener that
    # applies to it, without duplicates. Computed once.
    def variants
      @variants ||= begin
        applicable = fuzzy_match.tighteners.select { |tightener| tightener.apply? render }
        tightened = applicable.map { |tightener| tightener.apply(render) }
        ([ render ] + tightened).uniq
      end
    end

    def rendered?
      @rendered == true
    end

    private

    # Extract the raw value from the record according to the kind of reader.
    def raw_value
      case read
      when ::NilClass
        record
      when ::Proc
        read.call record
      when ::Symbol
        # Prefer a method of that name, otherwise fall back to record[] lookup.
        record.respond_to?(read) ? record.send(read) : record[read]
      else
        record[read]
      end
    end
  end
end
|
data/lib/fuzzy_match.rb
ADDED
@@ -0,0 +1,252 @@
|
|
1
|
+
require 'active_support'
|
2
|
+
require 'active_support/version'
|
3
|
+
if ::ActiveSupport::VERSION::MAJOR >= 3
|
4
|
+
require 'active_support/core_ext'
|
5
|
+
end
|
6
|
+
require 'to_regexp'
|
7
|
+
|
8
|
+
# See the README for more information.
|
9
|
+
class FuzzyMatch
  autoload :Tightener, 'fuzzy_match/tightener'
  autoload :StopWord, 'fuzzy_match/stop_word'
  autoload :Blocking, 'fuzzy_match/blocking'
  autoload :Identity, 'fuzzy_match/identity'
  autoload :Result, 'fuzzy_match/result'
  autoload :Wrapper, 'fuzzy_match/wrapper'
  autoload :Similarity, 'fuzzy_match/similarity'
  autoload :Score, 'fuzzy_match/score'
  autoload :CachedResult, 'fuzzy_match/cached_result'

  attr_reader :haystack
  attr_reader :blockings
  attr_reader :identities
  attr_reader :tighteners
  attr_reader :stop_words
  attr_reader :default_first_blocking_decides
  attr_reader :default_must_match_blocking
  attr_reader :default_must_match_at_least_one_word

  # haystack - a bunch of records
  # options
  # * tighteners: regexps (see readme)
  # * identities: regexps
  # * blockings: regexps
  # * stop_words: regexps
  # * read: how to interpret each entry in the 'haystack', either a Proc or a symbol
  #   (:haystack_reader is accepted as an alternative key)
  # * first_blocking_decides, must_match_blocking, must_match_at_least_one_word:
  #   defaults for the options of the same name accepted by #find
  def initialize(records, options = {})
    options = options.symbolize_keys
    @default_first_blocking_decides = options[:first_blocking_decides]
    @default_must_match_blocking = options[:must_match_blocking]
    @default_must_match_at_least_one_word = options[:must_match_at_least_one_word]
    @blockings = options.fetch(:blockings, []).map { |regexp_or_str| Blocking.new regexp_or_str }
    @identities = options.fetch(:identities, []).map { |regexp_or_str| Identity.new regexp_or_str }
    @tighteners = options.fetch(:tighteners, []).map { |regexp_or_str| Tightener.new regexp_or_str }
    @stop_words = options.fetch(:stop_words, []).map { |regexp_or_str| StopWord.new regexp_or_str }
    read = options[:read] || options[:haystack_reader]
    @haystack = records.map { |record| Wrapper.new self, record, read }
  end

  # The Result gathered by the most recent find run with :gather_last_result => true.
  # Raises RuntimeError when no such result is available.
  def last_result
    @last_result || raise(::RuntimeError, "[fuzzy_match] You can't access the last result until you've run a find with :gather_last_result => true")
  end

  # Like #find, but returns an array with the records of every straw that
  # survives the blocking and identity filters (no scoring is done).
  def find_all(needle, options = {})
    options = options.symbolize_keys.merge(:find_all => true)
    find needle, options
  end

  # Find the record in the haystack that best matches +needle+.
  #
  # options
  # * gather_last_result: record the intermediate steps in #last_result
  # * find_all: return all possibly identical records instead of the best one
  # * first_blocking_decides, must_match_blocking, must_match_at_least_one_word:
  #   override the defaults given to #initialize
  #
  # Returns the matching record, or nil when nothing matches (an array, possibly
  # empty, when :find_all is set). Raises RuntimeError once #free has been called.
  def find(needle, options = {})
    raise ::RuntimeError, "[fuzzy_match] Dictionary has already been freed, can't perform more finds" if freed?

    options = options.symbolize_keys
    gather_last_result = options.fetch(:gather_last_result, false)
    is_find_all = options.fetch(:find_all, false)
    first_blocking_decides = options.fetch(:first_blocking_decides, default_first_blocking_decides)
    must_match_blocking = options.fetch(:must_match_blocking, default_must_match_blocking)
    must_match_at_least_one_word = options.fetch(:must_match_at_least_one_word, default_must_match_at_least_one_word)

    if gather_last_result
      free_last_result
      @last_result = Result.new
      last_result.tighteners = tighteners
      last_result.identities = identities
      last_result.blockings = blockings
      last_result.stop_words = stop_words
    end

    needle = Wrapper.new self, needle

    if gather_last_result
      last_result.needle = needle
    end

    # A needle that fits no blocking at all cannot match when blockings are mandatory.
    if must_match_blocking and blockings.any? and blockings.none? { |blocking| blocking.match? needle }
      return(is_find_all ? [] : nil)
    end

    candidates = if must_match_at_least_one_word
      haystack.select do |straw|
        (needle.words & straw.words).any?
      end
    else
      haystack
    end

    if gather_last_result
      last_result.candidates = candidates
    end

    joint, disjoint = if blockings.any?
      if first_blocking_decides
        # The deciding blocking depends only on the needle, so look it up once
        # rather than once per straw.
        deciding_blocking = blockings.detect { |blocking| blocking.match? needle }
        candidates.partition do |straw|
          deciding_blocking.try :join?, needle, straw
        end
      else
        candidates.partition do |straw|
          blockings.any? { |blocking| blocking.join? needle, straw }
        end
      end
    else
      [ candidates.dup, [] ]
    end

    if joint.none?
      if must_match_blocking
        return(is_find_all ? [] : nil)
      else
        # special case: the needle didn't fit anywhere, but must_match_blocking is false, so we'll try it against everything
        joint = disjoint
        disjoint = []
      end
    end

    if gather_last_result
      last_result.joint = joint
      last_result.disjoint = disjoint
    end

    possibly_identical, certainly_different = if identities.any?
      joint.partition do |straw|
        identities.all? do |identity|
          # nil means the identity has no opinion about this pair.
          answer = identity.identical? needle, straw
          answer.nil? or answer == true
        end
      end
    else
      [ joint.dup, [] ]
    end

    if gather_last_result
      last_result.possibly_identical = possibly_identical
      last_result.certainly_different = certainly_different
    end

    if is_find_all
      return possibly_identical.map { |straw| straw.record }
    end

    similarities = possibly_identical.map { |straw| needle.similarity straw }.sort

    if gather_last_result
      last_result.similarities = similarities
    end

    # The best similarity sorts last; it only counts when it scores above zero.
    best_similarity = similarities[-1]
    return nil unless best_similarity && best_similarity.best_score.dices_coefficient > 0

    record = best_similarity.wrapper2.record
    if gather_last_result
      last_result.record = record
      last_result.score = best_similarity.best_score.dices_coefficient
    end
    record
  end

  # Explain is like mysql's EXPLAIN command. You give it a needle and it tells you about how it was located (successfully or not) in the haystack.
  #
  #     d = FuzzyMatch.new ['737', '747', '757' ]
  #     d.explain 'boeing 737-100'
  #
  # The report is written through #log. Steps that were never reached (because
  # the find returned early) are reported as '(none)'.
  def explain(needle, options = {})
    record = find needle, options.merge(:gather_last_result => true)
    result = last_result
    log "#" * 150
    log "# Match #{needle.inspect} => #{record.inspect}"
    log "#" * 150
    log
    log "Needle"
    log "-" * 150
    log result.needle.render
    log
    log_section("Stop words", result.stop_words) { |stop_word| stop_word.inspect }
    log_section("Candidates", result.candidates) { |candidate| candidate.render }
    log_section("Tighteners", result.tighteners) { |tightener| tightener.inspect }
    log_section("Blockings", result.blockings) { |blocking| blocking.inspect }
    log_section("Identities", result.identities) { |identity| identity.inspect }
    log_section("Joint", result.joint) { |straw| straw.render }
    log_section("Disjoint", result.disjoint) { |straw| straw.render }
    log_section("Possibly identical", result.possibly_identical) { |straw| straw.render }
    log_section("Certainly different", result.certainly_different) { |straw| straw.render }
    # Only the ten best similarities are shown, best first.
    log_section("Similarities", (result.similarities || []).reverse[0..9]) { |similarity| similarity.inspect }
    log "Match"
    log "-" * 150
    log record.inspect
  end

  def log(str = '') #:nodoc:
    $stderr.puts str
  end

  # Whether #free has been called.
  def freed?
    @freed == true
  end

  # Release the haystack and the last result. No further finds are possible afterwards.
  def free
    free_last_result
    @haystack.try :clear
    @haystack = nil
  ensure
    @freed = true
  end

  private

  def free_last_result
    @last_result = nil
  end

  # Log one titled section of the explain report: the title, a separator, one
  # line per item (as produced by the block) or '(none)' when +items+ is blank.
  def log_section(title, items)
    log title
    log "-" * 150
    log items.blank? ? '(none)' : items.map { |item| yield item }.join("\n")
    log
  end
end
|
data/test/helper.rb
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'bundler'
|
3
|
+
Bundler.setup
|
4
|
+
require 'test/unit'
|
5
|
+
require 'stringio'
|
6
|
+
require 'remote_table'
|
7
|
+
$LOAD_PATH.unshift(File.dirname(__FILE__))
|
8
|
+
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
|
9
|
+
require 'fuzzy_match'
|
10
|
+
|
11
|
+
class Test::Unit::TestCase
|
12
|
+
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
require 'helper'
|
2
|
+
|
3
|
+
# Exercises FuzzyMatch::Blocking#match? and #join? with a single /apple/ blocking.
class TestBlocking < Test::Unit::TestCase
  def test_001_match_one
    assert_equal true, apple_blocking.match?('2 apples')
  end

  def test_002_join_both
    assert_equal true, apple_blocking.join?('apple', '2 apples')
  end

  def test_002_doesnt_join_both
    assert_equal false, apple_blocking.join?('orange', '2 apples')
  end

  # Neither string matches the blocking, so join? is expected to return nil.
  def test_003_no_information
    assert_equal nil, apple_blocking.join?('orange', 'orange')
  end

  private

  # A fresh blocking per call, as each test previously built its own.
  def apple_blocking
    FuzzyMatch::Blocking.new %r{apple}
  end
end
|