fuzzy_match 1.0.5
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/.gitignore +22 -0
- data/Gemfile +4 -0
- data/LICENSE +20 -0
- data/README.rdoc +94 -0
- data/Rakefile +21 -0
- data/THANKS-WILLIAM-JAMES.rb +37 -0
- data/benchmark/before-with-free.txt +283 -0
- data/benchmark/before-without-last-result.txt +257 -0
- data/benchmark/before.txt +304 -0
- data/benchmark/memory.rb +54 -0
- data/examples/bts_aircraft/5-2-A.htm +10305 -0
- data/examples/bts_aircraft/5-2-B.htm +9576 -0
- data/examples/bts_aircraft/5-2-D.htm +7094 -0
- data/examples/bts_aircraft/5-2-E.htm +2349 -0
- data/examples/bts_aircraft/5-2-G.htm +2922 -0
- data/examples/bts_aircraft/blockings.csv +1 -0
- data/examples/bts_aircraft/identities.csv +1 -0
- data/examples/bts_aircraft/negatives.csv +1 -0
- data/examples/bts_aircraft/number_260.csv +334 -0
- data/examples/bts_aircraft/positives.csv +1 -0
- data/examples/bts_aircraft/test_bts_aircraft.rb +118 -0
- data/examples/bts_aircraft/tighteners.csv +1 -0
- data/examples/first_name_matching.rb +15 -0
- data/examples/icao-bts.xls +0 -0
- data/fuzzy_match.gemspec +32 -0
- data/lib/fuzzy_match/blocking.rb +36 -0
- data/lib/fuzzy_match/cached_result.rb +74 -0
- data/lib/fuzzy_match/identity.rb +23 -0
- data/lib/fuzzy_match/result.rb +17 -0
- data/lib/fuzzy_match/score.rb +125 -0
- data/lib/fuzzy_match/similarity.rb +53 -0
- data/lib/fuzzy_match/stop_word.rb +19 -0
- data/lib/fuzzy_match/tightener.rb +28 -0
- data/lib/fuzzy_match/version.rb +3 -0
- data/lib/fuzzy_match/wrapper.rb +67 -0
- data/lib/fuzzy_match.rb +252 -0
- data/test/helper.rb +12 -0
- data/test/test_blocking.rb +23 -0
- data/test/test_cache.rb +130 -0
- data/test/test_fuzzy_match.rb +190 -0
- data/test/test_fuzzy_match_convoluted.rb.disabled +268 -0
- data/test/test_identity.rb +33 -0
- data/test/test_tightening.rb +10 -0
- metadata +197 -0
@@ -0,0 +1,23 @@
|
|
1
|
+
class FuzzyMatch
  # An identity is a regexp-based rule for deciding whether two strings refer
  # to the same thing: when both strings match the regexp, the captured parts
  # must be exactly equal.
  class Identity
    attr_reader :regexp

    # regexp_or_str - anything responding to #to_regexp.
    def initialize(regexp_or_str)
      @regexp = regexp_or_str.to_regexp
    end

    # Compares the captures this identity extracts from each string.
    #
    # Returns true or false only when both strings match the regexp
    # (true meaning their captures are equal). Returns nil as soon as
    # either string fails to match, i.e. the identity has no opinion.
    def identical?(str1, str2)
      first_match = regexp.match(str1)
      return nil unless first_match

      second_match = regexp.match(str2)
      return nil unless second_match

      first_match.captures == second_match.captures
    end
  end
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
class FuzzyMatch
  # Plain value holder describing one run of a find: the needle, the rules
  # that were in force, every intermediate set of straws, and the final
  # record and score. All fields are readable and writable.
  class Result #:nodoc: all
    attr_accessor :needle,
                  :tighteners,
                  :blockings,
                  :identities,
                  :stop_words,
                  :candidates,
                  :joint,
                  :disjoint,
                  :possibly_identical,
                  :certainly_different,
                  :similarities,
                  :record,
                  :score
  end
end
|
@@ -0,0 +1,125 @@
|
|
1
|
+
begin
|
2
|
+
require 'amatch'
|
3
|
+
rescue ::LoadError
|
4
|
+
# using native ruby similarity scoring
|
5
|
+
end
|
6
|
+
|
7
|
+
class FuzzyMatch
  # Similarity score between two strings, compared case-insensitively.
  #
  # Two measures are exposed, both as Floats where higher means more similar:
  # * dices_coefficient - Dice's coefficient over adjacent character pairs
  # * levenshtein       - edit distance normalized by the longer string
  #
  # When the amatch library has been loaded its implementations are used;
  # otherwise pure-ruby fallbacks are defined. Each measure is computed at
  # most once per instance.
  class Score
    attr_reader :str1, :str2

    # Both strings are downcased up front so every comparison ignores case.
    def initialize(str1, str2)
      @str1 = str1.downcase
      @str2 = str2.downcase
    end

    def inspect
      %{#<Score: dices_coefficient=#{dices_coefficient} levenshtein=#{levenshtein}>}
    end

    # Orders by Dice's coefficient first; levenshtein only breaks ties.
    def <=>(other)
      by_dices_coefficient = (dices_coefficient <=> other.dices_coefficient)
      if by_dices_coefficient == 0
        levenshtein <=> other.levenshtein
      else
        by_dices_coefficient
      end
    end

    # True when the encoding name of str1 (or $KCODE on rubies without
    # ::Encoding) starts with "u". Memoized with defined? because the
    # answer may legitimately be false.
    def utf8?
      return @utf8 if defined?(@utf8)
      @utf8 = (defined?(::Encoding) ? str1.encoding.to_s : $KCODE).downcase.start_with?('u')
    end

    # Dice's coefficient of the two strings, memoized.
    def dices_coefficient
      @dices_coefficient ||= compute_dices_coefficient
    end

    # Normalized levenshtein similarity of the two strings, memoized.
    def levenshtein
      @levenshtein ||= compute_levenshtein
    end

    private

    if defined?(::Amatch)

      def compute_dices_coefficient
        str1.pair_distance_similar str2
      end

      def compute_levenshtein
        str1.levenshtein_similar str2
      end

    else

      SPACE = ' '

      # http://stackoverflow.com/questions/653157/a-better-similarity-ranking-algorithm-for-variable-length-strings
      def compute_dices_coefficient
        return 1.0 if str1 == str2
        return 0.0 if str1.length == 1 and str2.length == 1

        pairs1 = letter_pairs(str1)
        pairs2 = letter_pairs(str2)
        union = pairs1.size + pairs2.size
        # Neither string produced a usable pair (too short, or every pair
        # touches a space). Dividing by zero would yield NaN, which cannot
        # be ordered by <=>, so treat it as "nothing in common".
        return 0.0 if union == 0

        intersection = 0
        pairs1.each do |pair|
          # Each pair of str2 may only be consumed once.
          if (position = pairs2.index(pair))
            intersection += 1
            pairs2.delete_at(position)
          end
        end
        (2.0 * intersection) / union
      end

      # Adjacent two-character slices of str, dropping any that contain a space.
      def letter_pairs(str)
        (0..str.length-2).map { |i| str[i,2] }.reject { |pair| pair.include? SPACE }
      end

      # extracted/adapted from the text gem version 1.0.2
      # normalization added for utf-8 strings
      # lib/text/levenshtein.rb
      def compute_levenshtein
        unpack_rule = utf8? ? 'U*' : 'C*'
        s = str1.unpack(unpack_rule)
        t = str2.unpack(unpack_rule)
        n = s.length
        m = t.length
        return 0.0 if n == 0 or m == 0

        # d holds the previous row of the edit-distance matrix.
        d = (0..m).to_a
        x = nil
        (0...n).each do |i|
          e = i+1
          (0...m).each do |j|
            cost = (s[i] == t[j]) ? 0 : 1
            x = [
              d[j+1] + 1, # insertion
              e + 1,      # deletion
              d[j] + cost # substitution
            ].min
            d[j] = e
            e = x
          end
          d[m] = x
        end
        # normalization logic from https://github.com/flori/amatch/blob/master/ext/amatch_ext.c#L301
        #      if (b_len > a_len) {
        #          result = rb_float_new(1.0 - ((double) v[p][b_len]) / b_len);
        #      } else {
        #          result = rb_float_new(1.0 - ((double) v[p][b_len]) / a_len);
        #      }
        1.0 - x.to_f / [n, m].max
      end

    end
  end
end
|
@@ -0,0 +1,53 @@
|
|
1
|
+
class FuzzyMatch
  # Pairs two wrappers and finds the combination of their variants that
  # scores highest. Similarities order by that best score, then by a weight
  # that favours short original strings.
  class Similarity
    attr_reader :wrapper1, :wrapper2

    def initialize(wrapper1, wrapper2)
      @wrapper1 = wrapper1
      @wrapper2 = wrapper2
    end

    # Best score decides; the original-length weight only breaks ties.
    def <=>(other)
      score_order = (best_score <=> other.best_score)
      return score_order unless score_order == 0

      original_weight <=> other.original_weight
    end

    # Weight things towards short original strings
    def original_weight
      @original_weight ||= begin
        combined_length = wrapper1.render.length * wrapper2.render.length
        1.0 / combined_length
      end
    end

    # Score of the winning variant pair.
    def best_score
      @best_score ||= Score.new(best_wrapper1_variant, best_wrapper2_variant)
    end

    def best_wrapper1_variant
      best_variants.first
    end

    def best_wrapper2_variant
      best_variants.last
    end

    # The [wrapper1 variant, wrapper2 variant] tuple that sorts last (highest)
    # when every combination of variants is scored against each other.
    def best_variants
      @best_variants ||= begin
        combinations = wrapper1.variants.product(wrapper2.variants)
        ranked = combinations.sort do |(left1, left2), (right1, right2)|
          left_score = Score.new(left1, left2)
          right_score = Score.new(right1, right2)
          left_score <=> right_score
        end
        ranked.last
      end
    end

    def inspect
      weight = "%0.5f" % original_weight
      %{#<Similarity "#{wrapper2.render}"=>"#{best_wrapper2_variant}" versus "#{wrapper1.render}"=>"#{best_wrapper1_variant}" original_weight=#{weight} best_score=#{best_score.inspect}>}
    end
  end
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
class FuzzyMatch
  # A stop word is a regexp whose matches are ignored, i.e. deleted from a
  # string before it is compared.
  class StopWord
    attr_reader :regexp

    # regexp_or_str - anything responding to #to_regexp.
    def initialize(regexp_or_str)
      @regexp = regexp_or_str.to_regexp
    end

    # Destructively deletes every match of the stop word from str.
    # Returns whatever String#gsub! returns: str when something was
    # removed, nil when nothing matched.
    def apply!(str)
      str.gsub!(regexp, '')
    end

    def inspect
      pattern = regexp.inspect
      "#<StopWord regexp=#{pattern}>"
    end
  end
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
class FuzzyMatch
  # A tightener just strips a string down to its core: the parts of it
  # captured by the tightener's regexp.
  class Tightener
    attr_reader :regexp

    # regexp_or_str - anything responding to #to_regexp.
    def initialize(regexp_or_str)
      @regexp = regexp_or_str.to_regexp
    end

    # True when the regexp matches str, false otherwise.
    def apply?(str)
      regexp.match(str) ? true : false
    end

    # All captures of the match joined into one string; str itself is
    # returned untouched when the regexp does not match.
    def apply(str)
      match_data = regexp.match(str)
      return str unless match_data

      match_data.captures.join
    end

    def inspect
      pattern = regexp.inspect
      "#<Tightener regexp=#{pattern}>"
    end
  end
end
|
@@ -0,0 +1,67 @@
|
|
1
|
+
class FuzzyMatch
  # Wrappers are the tokens that are passed around when doing scoring and optimizing.
  # Each one pairs a record with the way to read a string out of it.
  class Wrapper #:nodoc: all
    WORD_BOUNDARY = %r{\s*\b\s*}

    attr_reader :fuzzy_match, :record, :read

    # fuzzy_match - owner supplying #stop_words and #tighteners
    # record      - the wrapped object
    # read        - how to get a string out of record: a Proc, a Symbol,
    #               nil (use the record itself) or any other key for record[]
    def initialize(fuzzy_match, record, read = nil)
      @fuzzy_match = fuzzy_match
      @record = record
      @read = read
    end

    def inspect
      "#<Wrapper render=#{render} variants=#{variants.length}>"
    end

    # The record read as a string, with stop words removed and surrounding
    # whitespace stripped. Computed once; the result is frozen.
    def render
      return @render if rendered?

      # Work on a copy so the record's own string is never mutated.
      text = read_record.to_s.dup
      fuzzy_match.stop_words.each { |stop_word| stop_word.apply!(text) }
      text.strip!
      @render = text.freeze
      @rendered = true
      @render
    end

    alias :to_str :render

    # The rendered string split on word boundaries.
    def words
      @words ||= render.split(WORD_BOUNDARY)
    end

    def similarity(other)
      Similarity.new(self, other)
    end

    # The rendered string followed by the output of every tightener that
    # applies to it, without duplicates.
    def variants
      @variants ||= begin
        applicable = fuzzy_match.tighteners.select { |tightener| tightener.apply?(render) }
        tightened = applicable.map { |tightener| tightener.apply(render) }
        ([ render ] + tightened).uniq
      end
    end

    def rendered?
      @rendered == true
    end

    private

    # Pulls the raw value out of the record according to +read+.
    def read_record
      case read
      when ::Proc
        read.call(record)
      when ::Symbol
        # Prefer a method of that name, fall back to []-style lookup.
        record.respond_to?(read) ? record.send(read) : record[read]
      when ::NilClass
        record
      else
        record[read]
      end
    end
  end
end
|
data/lib/fuzzy_match.rb
ADDED
@@ -0,0 +1,252 @@
|
|
1
|
+
require 'active_support'
|
2
|
+
require 'active_support/version'
|
3
|
+
if ::ActiveSupport::VERSION::MAJOR >= 3
|
4
|
+
require 'active_support/core_ext'
|
5
|
+
end
|
6
|
+
require 'to_regexp'
|
7
|
+
|
8
|
+
# Finds the record in a haystack that best matches a needle, narrowing the
# haystack with blockings and identities before scoring what is left.
#
# See the README for more information.
class FuzzyMatch
  autoload :Tightener, 'fuzzy_match/tightener'
  autoload :StopWord, 'fuzzy_match/stop_word'
  autoload :Blocking, 'fuzzy_match/blocking'
  autoload :Identity, 'fuzzy_match/identity'
  autoload :Result, 'fuzzy_match/result'
  autoload :Wrapper, 'fuzzy_match/wrapper'
  autoload :Similarity, 'fuzzy_match/similarity'
  autoload :Score, 'fuzzy_match/score'
  autoload :CachedResult, 'fuzzy_match/cached_result'

  attr_reader :haystack
  attr_reader :blockings
  attr_reader :identities
  attr_reader :tighteners
  attr_reader :stop_words
  attr_reader :default_first_blocking_decides
  attr_reader :default_must_match_blocking
  attr_reader :default_must_match_at_least_one_word

  # haystack - a bunch of records
  # options
  # * tighteners: regexps (see readme)
  # * identities: regexps
  # * blockings: regexps
  # * stop_words: regexps
  # * read: how to interpret each entry in the 'haystack', either a Proc or a symbol
  #   (:haystack_reader is accepted as an alternative key)
  # * first_blocking_decides, must_match_blocking, must_match_at_least_one_word:
  #   defaults for the options of the same name accepted by #find
  def initialize(records, options = {})
    options = options.symbolize_keys
    @default_first_blocking_decides = options[:first_blocking_decides]
    @default_must_match_blocking = options[:must_match_blocking]
    @default_must_match_at_least_one_word = options[:must_match_at_least_one_word]
    @blockings = options.fetch(:blockings, []).map { |regexp_or_str| Blocking.new regexp_or_str }
    @identities = options.fetch(:identities, []).map { |regexp_or_str| Identity.new regexp_or_str }
    @tighteners = options.fetch(:tighteners, []).map { |regexp_or_str| Tightener.new regexp_or_str }
    @stop_words = options.fetch(:stop_words, []).map { |regexp_or_str| StopWord.new regexp_or_str }
    read = options[:read] || options[:haystack_reader]
    @haystack = records.map { |record| Wrapper.new self, record, read }
  end

  # The Result gathered by the most recent find run with
  # :gather_last_result => true. Raises RuntimeError if there is none.
  def last_result
    @last_result || raise(::RuntimeError, "[fuzzy_match] You can't access the last result until you've run a find with :gather_last_result => true")
  end

  # Like #find, but returns an Array of every record that survives the
  # blocking and identity filters instead of the single best one.
  def find_all(needle, options = {})
    options = options.symbolize_keys.merge(:find_all => true)
    find needle, options
  end

  # Looks the needle up in the haystack.
  #
  # options
  # * gather_last_result: record every intermediate step in #last_result
  # * find_all: return all surviving records (see #find_all)
  # * first_blocking_decides, must_match_blocking, must_match_at_least_one_word:
  #   override the defaults given to the constructor
  #
  # Returns the best matching record, or nil when nothing matches
  # (an Array, possibly empty, when :find_all is set).
  # Raises RuntimeError once the matcher has been freed.
  def find(needle, options = {})
    raise ::RuntimeError, "[fuzzy_match] Dictionary has already been freed, can't perform more finds" if freed?

    options = options.symbolize_keys
    gather_last_result = options.fetch(:gather_last_result, false)
    is_find_all = options.fetch(:find_all, false)
    first_blocking_decides = options.fetch(:first_blocking_decides, default_first_blocking_decides)
    must_match_blocking = options.fetch(:must_match_blocking, default_must_match_blocking)
    must_match_at_least_one_word = options.fetch(:must_match_at_least_one_word, default_must_match_at_least_one_word)

    if gather_last_result
      free_last_result
      @last_result = Result.new
      last_result.tighteners = tighteners
      last_result.identities = identities
      last_result.blockings = blockings
      last_result.stop_words = stop_words
    end

    needle = Wrapper.new self, needle

    if gather_last_result
      last_result.needle = needle
    end

    # The needle has to fit at least one blocking when that is required.
    if must_match_blocking and blockings.any? and blockings.none? { |blocking| blocking.match? needle }
      return(is_find_all ? [] : nil)
    end

    candidates = if must_match_at_least_one_word
      haystack.select do |straw|
        (needle.words & straw.words).any?
      end
    else
      haystack
    end

    if gather_last_result
      last_result.candidates = candidates
    end

    joint, disjoint = if blockings.any?
      if first_blocking_decides
        # The first blocking matching the needle does not depend on the
        # straw, so look it up once instead of once per candidate.
        deciding_blocking = blockings.detect { |blocking| blocking.match? needle }
        candidates.partition do |straw|
          deciding_blocking.try :join?, needle, straw
        end
      else
        candidates.partition do |straw|
          blockings.any? { |blocking| blocking.join? needle, straw }
        end
      end
    else
      [ candidates.dup, [] ]
    end

    if joint.none?
      if must_match_blocking
        return(is_find_all ? [] : nil)
      else
        # special case: the needle didn't fit anywhere, but must_match_blocking is false, so we'll try it against everything
        joint = disjoint
        disjoint = []
      end
    end

    if gather_last_result
      last_result.joint = joint
      last_result.disjoint = disjoint
    end

    possibly_identical, certainly_different = if identities.any?
      joint.partition do |straw|
        identities.all? do |identity|
          # nil means the identity has no opinion about this pair.
          answer = identity.identical? needle, straw
          answer.nil? or answer == true
        end
      end
    else
      [ joint.dup, [] ]
    end

    if gather_last_result
      last_result.possibly_identical = possibly_identical
      last_result.certainly_different = certainly_different
    end

    if is_find_all
      return possibly_identical.map { |straw| straw.record }
    end

    similarities = possibly_identical.map { |straw| needle.similarity straw }.sort

    if gather_last_result
      last_result.similarities = similarities
    end

    # Sorted ascending, so the best similarity is the last one; it only
    # counts as a match when the strings share something at all.
    best_similarity = similarities[-1]
    if best_similarity and best_similarity.best_score.dices_coefficient > 0
      record = best_similarity.wrapper2.record
      if gather_last_result
        last_result.record = record
        last_result.score = best_similarity.best_score.dices_coefficient
      end
      record
    end
  end

  # Explain is like mysql's EXPLAIN command. You give it a needle and it tells you about how it was located (successfully or not) in the haystack.
  #
  #     d = FuzzyMatch.new ['737', '747', '757' ]
  #     d.explain 'boeing 737-100'
  #
  # Everything is written through #log. Stages that were never reached
  # (because the find gave up early) are reported as "(none)".
  def explain(needle, options = {})
    record = find needle, options.merge(:gather_last_result => true)
    log "#" * 150
    log "# Match #{needle.inspect} => #{record.inspect}"
    log "#" * 150
    log
    log "Needle"
    log "-" * 150
    log last_result.needle.render
    log
    log "Stop words"
    log "-" * 150
    log last_result.stop_words.blank? ? '(none)' : last_result.stop_words.map { |stop_word| stop_word.inspect }.join("\n")
    log
    log "Candidates"
    log "-" * 150
    # candidates is nil when find returned before selecting any; the block
    # parameter must not be called "record" or ruby 1.8 clobbers the match.
    log last_result.candidates.blank? ? '(none)' : last_result.candidates.map { |candidate| candidate.render }.join("\n")
    log
    log "Tighteners"
    log "-" * 150
    log last_result.tighteners.blank? ? '(none)' : last_result.tighteners.map { |tightener| tightener.inspect }.join("\n")
    log
    log "Blockings"
    log "-" * 150
    log last_result.blockings.blank? ? '(none)' : last_result.blockings.map { |blocking| blocking.inspect }.join("\n")
    log
    log "Identities"
    log "-" * 150
    log last_result.identities.blank? ? '(none)' : last_result.identities.map { |identity| identity.inspect }.join("\n")
    log
    log "Joint"
    log "-" * 150
    log last_result.joint.blank? ? '(none)' : last_result.joint.map { |joint| joint.render }.join("\n")
    log
    log "Disjoint"
    log "-" * 150
    log last_result.disjoint.blank? ? '(none)' : last_result.disjoint.map { |disjoint| disjoint.render }.join("\n")
    log
    log "Possibly identical"
    log "-" * 150
    log last_result.possibly_identical.blank? ? '(none)' : last_result.possibly_identical.map { |possibly_identical| possibly_identical.render }.join("\n")
    log
    log "Certainly different"
    log "-" * 150
    log last_result.certainly_different.blank? ? '(none)' : last_result.certainly_different.map { |certainly_different| certainly_different.render }.join("\n")
    log
    log "Similarities"
    log "-" * 150
    log last_result.similarities.blank? ? '(none)' : last_result.similarities.reverse[0..9].map { |similarity| similarity.inspect }.join("\n")
    log
    log "Match"
    log "-" * 150
    log record.inspect
  end

  # Writes one line to standard error.
  def log(str = '') #:nodoc:
    $stderr.puts str
  end

  # True once #free has been called.
  def freed?
    @freed == true
  end

  # Drops the last result and the haystack. The matcher is marked as freed
  # even if clearing fails, after which #find raises.
  def free
    free_last_result
    @haystack.try :clear
    @haystack = nil
  ensure
    @freed = true
  end

  private

  def free_last_result
    @last_result = nil
  end
end
|
data/test/helper.rb
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'bundler'
|
3
|
+
Bundler.setup
|
4
|
+
require 'test/unit'
|
5
|
+
require 'stringio'
|
6
|
+
require 'remote_table'
|
7
|
+
# Make both this test directory and the gem's lib directory loadable.
# lib is unshifted last, so it ends up first on the load path.
test_dir = File.dirname(__FILE__)
[ test_dir, File.join(test_dir, '..', 'lib') ].each do |dir|
  $LOAD_PATH.unshift(dir)
end
require 'fuzzy_match'

# Reopened so shared test helpers have a place to live; currently empty.
class Test::Unit::TestCase
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
require 'helper'
|
2
|
+
|
3
|
+
# Exercises FuzzyMatch::Blocking#match? and #join? against a single
# /apple/ blocking.
class TestBlocking < Test::Unit::TestCase
  # A string containing the pattern matches the blocking.
  def test_001_match_one
    assert_equal(true, apple_blocking.match?('2 apples'))
  end

  # Both strings match, so they are joined.
  def test_002_join_both
    assert_equal(true, apple_blocking.join?('apple', '2 apples'))
  end

  # Only one of the strings matches, so they are not joined.
  def test_002_doesnt_join_both
    assert_equal(false, apple_blocking.join?('orange', '2 apples'))
  end

  # Neither string matches: the blocking answers nil.
  def test_003_no_information
    assert_equal(nil, apple_blocking.join?('orange', 'orange'))
  end

  private

  # Fresh blocking for every assertion, as each test used to build its own.
  def apple_blocking
    FuzzyMatch::Blocking.new(%r{apple})
  end
end
|