fuzzy_match 1.2.2 → 1.3.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README.markdown +15 -9
- data/examples/bts_aircraft/test_bts_aircraft.rb +10 -12
- data/fuzzy_match.gemspec +2 -3
- data/lib/fuzzy_match.rb +23 -0
- data/lib/fuzzy_match/cached_result.rb +6 -2
- data/lib/fuzzy_match/score.rb +4 -109
- data/lib/fuzzy_match/score/amatch.rb +25 -0
- data/lib/fuzzy_match/score/pure_ruby.rb +103 -0
- data/lib/fuzzy_match/similarity.rb +3 -3
- data/lib/fuzzy_match/version.rb +1 -1
- data/test/helper.rb +3 -8
- data/test/test_amatch.rb +16 -0
- data/test/test_blocking.rb +14 -14
- data/test/test_cache.rb +17 -17
- data/test/test_fuzzy_match.rb +118 -112
- data/test/test_fuzzy_match_convoluted.rb.disabled +1 -1
- data/test/test_identity.rb +15 -17
- data/test/test_normalizer.rb +5 -5
- data/test/test_wrapper.rb +6 -6
- metadata +29 -36
data/README.markdown
CHANGED
@@ -13,6 +13,8 @@ Replaces [`loose_tight_dictionary`](https://github.com/seamusabshere/loose_tight
|
|
13
13
|
>> matcher.find('Shamus')
|
14
14
|
=> "seamus"
|
15
15
|
|
16
|
+
See also the blog post [Fuzzy match in Ruby](http://numbers.brighterplanet.com/2012/01/18/fuzzy-match-in-ruby/).
|
17
|
+
|
16
18
|
## Default matching (string similarity)
|
17
19
|
|
18
20
|
At the core, and even if you configure nothing else, string similarity (calculated by "pair distance", aka Dice's coefficient) is used to compare records.
|
@@ -45,12 +47,6 @@ Group records together.
|
|
45
47
|
|
46
48
|
Setting a blocking of `/Airbus/` ensures that strings containing "Airbus" will only be scored against other strings containing "Airbus". A better blocking in this case would probably be `/airbus/i`.
|
47
49
|
|
48
|
-
### Normalizers (formerly called tighteners)
|
49
|
-
|
50
|
-
Strip strings down to the essentials.
|
51
|
-
|
52
|
-
Adding a normalizer like `/(boeing).*(7\d\d)/i` will cause "BOEING COMPANY 747" and "boeing747" to be normalized to "BOEING 747" and "boeing 747", respectively. Since things are generally downcased before they are compared, these would be an exact match.
|
53
|
-
|
54
50
|
### Identities
|
55
51
|
|
56
52
|
Prevent impossible matches.
|
@@ -59,10 +55,16 @@ Adding an identity like `/(f)-?(\d50)/i` ensures that "Ford F-150" and "Ford F-2
|
|
59
55
|
|
60
56
|
### Stop words
|
61
57
|
|
62
|
-
Ignore common and/or meaningless words.
|
58
|
+
Ignore common and/or meaningless words. Applied before normalizers.
|
63
59
|
|
64
60
|
Adding a stop word like `THE` ensures that it is not taken into account when comparing "THE CAT", "THE DAT", and "THE CATT"
|
65
61
|
|
62
|
+
### Normalizers (formerly called tighteners)
|
63
|
+
|
64
|
+
Strip strings down to the essentials. Applied after stop words.
|
65
|
+
|
66
|
+
Adding a normalizer like `/(boeing).*(7\d\d)/i` will cause "BOEING COMPANY 747" and "boeing747" to be normalized to "BOEING 747" and "boeing 747", respectively. Since things are generally downcased before they are compared, these would be an exact match.
|
67
|
+
|
66
68
|
## Find options
|
67
69
|
|
68
70
|
* `read`: how to interpret each record in the 'haystack', either a Proc or a symbol
|
@@ -127,9 +129,13 @@ The admittedly imperfect metaphor is "look for a needle in a haystack"
|
|
127
129
|
* needle: the search term
|
128
130
|
* haystack: the records you are searching (<b>your result will be an object from here</b>)
|
129
131
|
|
130
|
-
##
|
132
|
+
## Using amatch to make it faster
|
133
|
+
|
134
|
+
You can optionally use [`amatch`](http://flori.github.com/amatch/) by [Florian Frank](https://github.com/flori) (thanks Flori!) to perform string similarity calculations in a C extension.
|
131
135
|
|
132
|
-
|
136
|
+
require 'fuzzy_match'
|
137
|
+
require 'amatch' # note that you have to require this... fuzzy_match won't require it for you
|
138
|
+
FuzzyMatch.engine = :amatch
|
133
139
|
|
134
140
|
Otherwise, pure Ruby versions of the string similarity algorithms derived from the [answer to a StackOverflow question](http://stackoverflow.com/questions/653157/a-better-similarity-ranking-algorithm-for-variable-length-strings) and [the text gem](https://github.com/threedaymonk/text/blob/master/lib/text/levenshtein.rb) are used. Thanks [marzagao](http://stackoverflow.com/users/10997/marzagao) and [threedaymonk](https://github.com/threedaymonk)!
|
135
141
|
|
@@ -1,7 +1,5 @@
|
|
1
1
|
require File.expand_path('../../../test/helper.rb', __FILE__)
|
2
2
|
|
3
|
-
require 'shoulda'
|
4
|
-
|
5
3
|
# How to iteratively develop a dictionary.
|
6
4
|
|
7
5
|
# ruby ./examples/bts_aircraft/test_bts_aircraft.rb
|
@@ -70,36 +68,36 @@ FINAL_OPTIONS = {
|
|
70
68
|
:blockings => BLOCKINGS
|
71
69
|
}
|
72
70
|
|
73
|
-
class TestBtsAircraft <
|
74
|
-
|
71
|
+
class TestBtsAircraft < MiniTest::Spec
|
72
|
+
it "understand records by using the haystack reader" do
|
75
73
|
d = FuzzyMatch.new HAYSTACK, FINAL_OPTIONS
|
76
|
-
|
74
|
+
d.haystack.map { |record| record.to_str }.must_include 'boeing boeing 707-100'
|
77
75
|
end
|
78
76
|
|
79
|
-
|
77
|
+
it "find an easy match" do
|
80
78
|
d = FuzzyMatch.new HAYSTACK, FINAL_OPTIONS
|
81
79
|
record = d.find('boeing 707(100)')
|
82
|
-
|
83
|
-
|
80
|
+
record.class.must_equal HAYSTACK_RECORD_CLASS
|
81
|
+
HAYSTACK_READER.call(record).must_equal 'boeing boeing 707-100'
|
84
82
|
end
|
85
83
|
|
86
84
|
POSITIVES.each do |row|
|
87
85
|
needle = row['needle']
|
88
86
|
correct_record = row['haystack']
|
89
|
-
|
87
|
+
it %{find #{correct_record.blank? ? 'nothing' : correct_record} when looking for #{needle}} do
|
90
88
|
d = FuzzyMatch.new HAYSTACK, FINAL_OPTIONS
|
91
89
|
record = d.find(needle.downcase)
|
92
|
-
|
90
|
+
HAYSTACK_READER.call(record).must_equal correct_record.downcase
|
93
91
|
end
|
94
92
|
end
|
95
93
|
|
96
94
|
NEGATIVES.each do |row|
|
97
95
|
needle = row['needle']
|
98
96
|
incorrect_record = row['haystack']
|
99
|
-
|
97
|
+
it %{not find #{incorrect_record} when looking for #{needle}} do
|
100
98
|
d = FuzzyMatch.new HAYSTACK, FINAL_OPTIONS
|
101
99
|
record = d.find(needle.downcase)
|
102
|
-
|
100
|
+
HAYSTACK_READER.call(record)).wont_equal incorrect_record.downcase
|
103
101
|
end
|
104
102
|
end
|
105
103
|
end
|
data/fuzzy_match.gemspec
CHANGED
@@ -19,10 +19,9 @@ Gem::Specification.new do |s|
|
|
19
19
|
s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
|
20
20
|
s.require_paths = ["lib"]
|
21
21
|
|
22
|
-
s.add_development_dependency "
|
23
|
-
s.add_development_dependency "remote_table"
|
22
|
+
s.add_development_dependency "minitest"
|
24
23
|
s.add_development_dependency 'activerecord', '>=3'
|
25
|
-
s.add_development_dependency '
|
24
|
+
s.add_development_dependency 'mysql2'
|
26
25
|
s.add_development_dependency 'cohort_scope'
|
27
26
|
s.add_development_dependency 'weighted_average'
|
28
27
|
s.add_development_dependency 'rake'
|
data/lib/fuzzy_match.rb
CHANGED
@@ -20,6 +20,29 @@ end
|
|
20
20
|
|
21
21
|
# See the README for more information.
|
22
22
|
class FuzzyMatch
|
23
|
+
class << self
|
24
|
+
def engine
|
25
|
+
@@engine ||= :pure_ruby
|
26
|
+
end
|
27
|
+
|
28
|
+
def engine=(alt_engine)
|
29
|
+
@@engine = alt_engine
|
30
|
+
end
|
31
|
+
|
32
|
+
def score_class
|
33
|
+
case engine
|
34
|
+
when :pure_ruby
|
35
|
+
Score::PureRuby
|
36
|
+
when :amatch
|
37
|
+
Score::Amatch
|
38
|
+
else
|
39
|
+
raise ::ArgumentError, "[fuzzy_match] #{engine.inspect} is not a recognized engine."
|
40
|
+
end
|
41
|
+
end
|
42
|
+
end
|
43
|
+
|
44
|
+
DEFAULT_ENGINE = :pure_ruby
|
45
|
+
|
23
46
|
DEFAULT_OPTIONS = {
|
24
47
|
:first_blocking_decides => false,
|
25
48
|
:must_match_blocking => false,
|
@@ -1,6 +1,10 @@
|
|
1
1
|
class FuzzyMatch
|
2
2
|
class CachedResult < ::ActiveRecord::Base
|
3
|
-
|
3
|
+
if ::ActiveRecord::VERSION::STRING >= '3.2'
|
4
|
+
self.table_name = :fuzzy_match_cached_results
|
5
|
+
else
|
6
|
+
set_table_name :fuzzy_match_cached_results
|
7
|
+
end
|
4
8
|
|
5
9
|
def self.create_table
|
6
10
|
connection.create_table :fuzzy_match_cached_results do |t|
|
@@ -17,7 +21,7 @@ class FuzzyMatch
|
|
17
21
|
|
18
22
|
def self.setup(from_scratch = false)
|
19
23
|
connection.drop_table :fuzzy_match_cached_results if from_scratch and table_exists?
|
20
|
-
create_table
|
24
|
+
create_table
|
21
25
|
end
|
22
26
|
|
23
27
|
module ActiveRecordBaseExtension
|
data/lib/fuzzy_match/score.rb
CHANGED
@@ -1,7 +1,8 @@
|
|
1
|
+
require 'fuzzy_match/score/pure_ruby'
|
2
|
+
require 'fuzzy_match/score/amatch'
|
3
|
+
|
1
4
|
class FuzzyMatch
|
2
5
|
class Score
|
3
|
-
extend ::ActiveSupport::Memoizable
|
4
|
-
|
5
6
|
attr_reader :str1
|
6
7
|
attr_reader :str2
|
7
8
|
|
@@ -10,10 +11,6 @@ class FuzzyMatch
|
|
10
11
|
@str2 = str2.downcase
|
11
12
|
end
|
12
13
|
|
13
|
-
def inspect
|
14
|
-
%{#<FuzzyMatch::Score: str1=#{str1.inspect} str2=#{str2.inspect} dices_coefficient_similar=#{dices_coefficient_similar} levenshtein_similar=#{levenshtein_similar}>}
|
15
|
-
end
|
16
|
-
|
17
14
|
def <=>(other)
|
18
15
|
by_dices_coefficient = (dices_coefficient_similar <=> other.dices_coefficient_similar)
|
19
16
|
if by_dices_coefficient == 0
|
@@ -22,107 +19,5 @@ class FuzzyMatch
|
|
22
19
|
by_dices_coefficient
|
23
20
|
end
|
24
21
|
end
|
25
|
-
|
26
|
-
if defined?(::Amatch)
|
27
|
-
|
28
|
-
def dices_coefficient_similar
|
29
|
-
if str1 == str2
|
30
|
-
return 1.0
|
31
|
-
elsif str1.length == 1 and str2.length == 1
|
32
|
-
return 0.0
|
33
|
-
end
|
34
|
-
str1.pair_distance_similar str2
|
35
|
-
end
|
36
|
-
memoize :dices_coefficient_similar
|
37
|
-
|
38
|
-
def levenshtein_similar
|
39
|
-
str1.levenshtein_similar str2
|
40
|
-
end
|
41
|
-
memoize :levenshtein_similar
|
42
|
-
|
43
|
-
else
|
44
|
-
|
45
|
-
SPACE = ' '
|
46
|
-
# http://stackoverflow.com/questions/653157/a-better-similarity-ranking-algorithm-for-variable-length-strings
|
47
|
-
def dices_coefficient_similar
|
48
|
-
if str1 == str2
|
49
|
-
return 1.0
|
50
|
-
elsif str1.length == 1 and str2.length == 1
|
51
|
-
return 0.0
|
52
|
-
end
|
53
|
-
pairs1 = (0..str1.length-2).map do |i|
|
54
|
-
str1[i,2]
|
55
|
-
end.reject do |pair|
|
56
|
-
pair.include? SPACE
|
57
|
-
end
|
58
|
-
pairs2 = (0..str2.length-2).map do |i|
|
59
|
-
str2[i,2]
|
60
|
-
end.reject do |pair|
|
61
|
-
pair.include? SPACE
|
62
|
-
end
|
63
|
-
union = pairs1.size + pairs2.size
|
64
|
-
intersection = 0
|
65
|
-
pairs1.each do |p1|
|
66
|
-
0.upto(pairs2.size-1) do |i|
|
67
|
-
if p1 == pairs2[i]
|
68
|
-
intersection += 1
|
69
|
-
pairs2.slice!(i)
|
70
|
-
break
|
71
|
-
end
|
72
|
-
end
|
73
|
-
end
|
74
|
-
(2.0 * intersection) / union
|
75
|
-
end
|
76
|
-
memoize :dices_coefficient_similar
|
77
|
-
|
78
|
-
# this seems like it would slow things down
|
79
|
-
def utf8?
|
80
|
-
(defined?(::Encoding) ? str1.encoding.to_s : $KCODE).downcase.start_with?('u')
|
81
|
-
end
|
82
|
-
memoize :utf8?
|
83
|
-
|
84
|
-
# extracted/adapted from the text gem version 1.0.2
|
85
|
-
# normalization added for utf-8 strings
|
86
|
-
# lib/text/levenshtein.rb
|
87
|
-
def levenshtein_similar
|
88
|
-
if utf8?
|
89
|
-
unpack_rule = 'U*'
|
90
|
-
else
|
91
|
-
unpack_rule = 'C*'
|
92
|
-
end
|
93
|
-
s = str1.unpack(unpack_rule)
|
94
|
-
t = str2.unpack(unpack_rule)
|
95
|
-
n = s.length
|
96
|
-
m = t.length
|
97
|
-
if n == 0 or m == 0
|
98
|
-
return 0.0
|
99
|
-
end
|
100
|
-
d = (0..m).to_a
|
101
|
-
x = nil
|
102
|
-
(0...n).each do |i|
|
103
|
-
e = i+1
|
104
|
-
(0...m).each do |j|
|
105
|
-
cost = (s[i] == t[j]) ? 0 : 1
|
106
|
-
x = [
|
107
|
-
d[j+1] + 1, # insertion
|
108
|
-
e + 1, # deletion
|
109
|
-
d[j] + cost # substitution
|
110
|
-
].min
|
111
|
-
d[j] = e
|
112
|
-
e = x
|
113
|
-
end
|
114
|
-
d[m] = x
|
115
|
-
end
|
116
|
-
# normalization logic from https://github.com/flori/amatch/blob/master/ext/amatch_ext.c#L301
|
117
|
-
# if (b_len > a_len) {
|
118
|
-
# result = rb_float_new(1.0 - ((double) v[p][b_len]) / b_len);
|
119
|
-
# } else {
|
120
|
-
# result = rb_float_new(1.0 - ((double) v[p][b_len]) / a_len);
|
121
|
-
# }
|
122
|
-
1.0 - x.to_f / [n, m].max
|
123
|
-
end
|
124
|
-
memoize :levenshtein_similar
|
125
|
-
|
126
|
-
end
|
127
22
|
end
|
128
|
-
end
|
23
|
+
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
class FuzzyMatch
|
2
|
+
class Score
|
3
|
+
# be sure to `require 'amatch'` before you use this class
|
4
|
+
class Amatch < Score
|
5
|
+
|
6
|
+
def inspect
|
7
|
+
%{#<FuzzyMatch::Score::Amatch: str1=#{str1.inspect} str2=#{str2.inspect} dices_coefficient_similar=#{dices_coefficient_similar} levenshtein_similar=#{levenshtein_similar}>}
|
8
|
+
end
|
9
|
+
|
10
|
+
def dices_coefficient_similar
|
11
|
+
@dices_coefficient_similar ||= if str1 == str2
|
12
|
+
1.0
|
13
|
+
elsif str1.length == 1 and str2.length == 1
|
14
|
+
0.0
|
15
|
+
else
|
16
|
+
str1.pair_distance_similar str2
|
17
|
+
end
|
18
|
+
end
|
19
|
+
|
20
|
+
def levenshtein_similar
|
21
|
+
@levenshtein_similar ||= str1.levenshtein_similar str2
|
22
|
+
end
|
23
|
+
end
|
24
|
+
end
|
25
|
+
end
|
@@ -0,0 +1,103 @@
|
|
1
|
+
class FuzzyMatch
|
2
|
+
class Score
|
3
|
+
class PureRuby < Score
|
4
|
+
|
5
|
+
SPACE = ' '
|
6
|
+
|
7
|
+
def inspect
|
8
|
+
%{#<FuzzyMatch::Score::PureRuby: str1=#{str1.inspect} str2=#{str2.inspect} dices_coefficient_similar=#{dices_coefficient_similar} levenshtein_similar=#{levenshtein_similar}>}
|
9
|
+
end
|
10
|
+
|
11
|
+
# http://stackoverflow.com/questions/653157/a-better-similarity-ranking-algorithm-for-variable-length-strings
|
12
|
+
def dices_coefficient_similar
|
13
|
+
return @dices_coefficient_similar if @dices_coefficient_similar.is_a?(::Float)
|
14
|
+
|
15
|
+
if str1 == str2
|
16
|
+
@dices_coefficient_similar = 1.0
|
17
|
+
return @dices_coefficient_similar
|
18
|
+
elsif str1.length == 1 and str2.length == 1
|
19
|
+
@dices_coefficient_similar = 0.0
|
20
|
+
return @dices_coefficient_similar
|
21
|
+
end
|
22
|
+
|
23
|
+
pairs1 = (0..str1.length-2).map do |i|
|
24
|
+
str1[i,2]
|
25
|
+
end.reject do |pair|
|
26
|
+
pair.include? SPACE
|
27
|
+
end
|
28
|
+
pairs2 = (0..str2.length-2).map do |i|
|
29
|
+
str2[i,2]
|
30
|
+
end.reject do |pair|
|
31
|
+
pair.include? SPACE
|
32
|
+
end
|
33
|
+
union = pairs1.size + pairs2.size
|
34
|
+
intersection = 0
|
35
|
+
pairs1.each do |p1|
|
36
|
+
0.upto(pairs2.size-1) do |i|
|
37
|
+
if p1 == pairs2[i]
|
38
|
+
intersection += 1
|
39
|
+
pairs2.slice!(i)
|
40
|
+
break
|
41
|
+
end
|
42
|
+
end
|
43
|
+
end
|
44
|
+
@dices_coefficient_similar = (2.0 * intersection) / union
|
45
|
+
end
|
46
|
+
|
47
|
+
# extracted/adapted from the text gem version 1.0.2
|
48
|
+
# normalization added for utf-8 strings
|
49
|
+
# lib/text/levenshtein.rb
|
50
|
+
def levenshtein_similar
|
51
|
+
return @levenshtein_similar if @levenshtein_similar.is_a?(::Float)
|
52
|
+
|
53
|
+
if utf8?
|
54
|
+
unpack_rule = 'U*'
|
55
|
+
else
|
56
|
+
unpack_rule = 'C*'
|
57
|
+
end
|
58
|
+
s = str1.unpack(unpack_rule)
|
59
|
+
t = str2.unpack(unpack_rule)
|
60
|
+
n = s.length
|
61
|
+
m = t.length
|
62
|
+
|
63
|
+
if n == 0 or m == 0
|
64
|
+
@levenshtein_similar = 0.0
|
65
|
+
return @levenshtein_similar
|
66
|
+
end
|
67
|
+
|
68
|
+
d = (0..m).to_a
|
69
|
+
x = nil
|
70
|
+
(0...n).each do |i|
|
71
|
+
e = i+1
|
72
|
+
(0...m).each do |j|
|
73
|
+
cost = (s[i] == t[j]) ? 0 : 1
|
74
|
+
x = [
|
75
|
+
d[j+1] + 1, # insertion
|
76
|
+
e + 1, # deletion
|
77
|
+
d[j] + cost # substitution
|
78
|
+
].min
|
79
|
+
d[j] = e
|
80
|
+
e = x
|
81
|
+
end
|
82
|
+
d[m] = x
|
83
|
+
end
|
84
|
+
# normalization logic from https://github.com/flori/amatch/blob/master/ext/amatch_ext.c#L301
|
85
|
+
# if (b_len > a_len) {
|
86
|
+
# result = rb_float_new(1.0 - ((double) v[p][b_len]) / b_len);
|
87
|
+
# } else {
|
88
|
+
# result = rb_float_new(1.0 - ((double) v[p][b_len]) / a_len);
|
89
|
+
# }
|
90
|
+
@levenshtein_similar = 1.0 - x.to_f / [n, m].max
|
91
|
+
end
|
92
|
+
|
93
|
+
private
|
94
|
+
|
95
|
+
def utf8?
|
96
|
+
return @utf8_query[0] if @utf8_query.is_a?(::Array) # ActiveSupport::Memoizable is deprecated in 3.2, how annoying
|
97
|
+
utf8_query = (defined?(::Encoding) ? str1.encoding.to_s : $KCODE).downcase.start_with?('u')
|
98
|
+
@utf8_query = [utf8_query]
|
99
|
+
utf8_query
|
100
|
+
end
|
101
|
+
end
|
102
|
+
end
|
103
|
+
end
|
@@ -23,7 +23,7 @@ class FuzzyMatch
|
|
23
23
|
end
|
24
24
|
|
25
25
|
def best_score
|
26
|
-
@best_score ||=
|
26
|
+
@best_score ||= FuzzyMatch.score_class.new best_wrapper1_variant, best_wrapper2_variant
|
27
27
|
end
|
28
28
|
|
29
29
|
def best_wrapper1_variant
|
@@ -39,8 +39,8 @@ class FuzzyMatch
|
|
39
39
|
wrapper1_variant1, wrapper2_variant1 = tuple1
|
40
40
|
wrapper1_variant2, wrapper2_variant2 = tuple2
|
41
41
|
|
42
|
-
score1 =
|
43
|
-
score2 =
|
42
|
+
score1 = FuzzyMatch.score_class.new wrapper1_variant1, wrapper2_variant1
|
43
|
+
score2 = FuzzyMatch.score_class.new wrapper1_variant2, wrapper2_variant2
|
44
44
|
|
45
45
|
score1 <=> score2
|
46
46
|
end[-1]
|
data/lib/fuzzy_match/version.rb
CHANGED
data/test/helper.rb
CHANGED
@@ -1,15 +1,10 @@
|
|
1
1
|
require 'rubygems'
|
2
2
|
require 'bundler'
|
3
3
|
Bundler.setup
|
4
|
-
require '
|
4
|
+
require 'minitest/spec'
|
5
|
+
require 'minitest/autorun'
|
5
6
|
require 'stringio'
|
6
|
-
|
7
|
-
if ENV['AMATCH'] == 'true'
|
8
|
-
require 'amatch'
|
9
|
-
end
|
7
|
+
|
10
8
|
$LOAD_PATH.unshift(File.dirname(__FILE__))
|
11
9
|
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
|
12
10
|
require 'fuzzy_match'
|
13
|
-
|
14
|
-
class Test::Unit::TestCase
|
15
|
-
end
|
data/test/test_amatch.rb
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
unless RUBY_PLATFORM == 'java'
|
2
|
+
require 'helper'
|
3
|
+
require 'test_fuzzy_match'
|
4
|
+
require 'amatch'
|
5
|
+
|
6
|
+
class TestAmatch < TestFuzzyMatch
|
7
|
+
before do
|
8
|
+
$testing_amatch = true
|
9
|
+
FuzzyMatch.engine = :amatch
|
10
|
+
end
|
11
|
+
after do
|
12
|
+
$testing_amatch = false
|
13
|
+
FuzzyMatch.engine = nil
|
14
|
+
end
|
15
|
+
end
|
16
|
+
end
|
data/test/test_blocking.rb
CHANGED
@@ -1,28 +1,28 @@
|
|
1
1
|
require 'helper'
|
2
2
|
|
3
|
-
class TestBlocking <
|
4
|
-
|
3
|
+
class TestBlocking < MiniTest::Spec
|
4
|
+
it %{matches a single string argument} do
|
5
5
|
b = FuzzyMatch::Blocking.new %r{apple}
|
6
|
-
|
6
|
+
b.match?('2 apples').must_equal true
|
7
7
|
end
|
8
|
-
|
9
|
-
|
10
|
-
b = FuzzyMatch::Blocking.new %r{apple}
|
11
|
-
|
8
|
+
|
9
|
+
it %{embraces case insensitivity} do
|
10
|
+
b = FuzzyMatch::Blocking.new %r{apple}i
|
11
|
+
b.match?('2 Apples').must_equal true
|
12
12
|
end
|
13
13
|
|
14
|
-
|
14
|
+
it %{joins two string arguments} do
|
15
15
|
b = FuzzyMatch::Blocking.new %r{apple}
|
16
|
-
|
16
|
+
b.join?('apple', '2 apples').must_equal true
|
17
17
|
end
|
18
18
|
|
19
|
-
|
19
|
+
it %{fails to join two string arguments} do
|
20
20
|
b = FuzzyMatch::Blocking.new %r{apple}
|
21
|
-
|
21
|
+
b.join?('orange', '2 apples').must_equal false
|
22
22
|
end
|
23
23
|
|
24
|
-
|
25
|
-
b = FuzzyMatch::Blocking.new %r{apple}
|
26
|
-
|
24
|
+
it %{returns nil instead of false when it has no information} do
|
25
|
+
b = FuzzyMatch::Blocking.new %r{apple}
|
26
|
+
b.join?('orange', 'orange').must_be_nil
|
27
27
|
end
|
28
28
|
end
|
data/test/test_cache.rb
CHANGED
@@ -6,7 +6,7 @@ require 'cohort_scope'
|
|
6
6
|
require 'weighted_average'
|
7
7
|
|
8
8
|
ActiveRecord::Base.establish_connection(
|
9
|
-
'adapter' => '
|
9
|
+
'adapter' => 'mysql2',
|
10
10
|
'database' => 'fuzzy_match_test',
|
11
11
|
'username' => 'root',
|
12
12
|
'password' => 'password'
|
@@ -99,32 +99,32 @@ FlightSegment.find_each do |fs|
|
|
99
99
|
fs.cache_aircraft!
|
100
100
|
end
|
101
101
|
|
102
|
-
class TestCache <
|
103
|
-
|
102
|
+
class TestCache < MiniTest::Spec
|
103
|
+
it %{joins aircraft to flight segments} do
|
104
104
|
aircraft = Aircraft.find('B742')
|
105
|
-
|
105
|
+
aircraft.flight_segments.count.must_equal 2
|
106
106
|
end
|
107
107
|
|
108
|
-
|
108
|
+
it %{allow simple SQL operations} do
|
109
109
|
aircraft = Aircraft.find('B742')
|
110
|
-
|
110
|
+
aircraft.flight_segments.sum(:passengers).must_equal 110
|
111
111
|
end
|
112
112
|
|
113
|
-
|
113
|
+
it %{works with weighted_average} do
|
114
114
|
aircraft = Aircraft.find('B742')
|
115
|
-
|
115
|
+
aircraft.flight_segments.weighted_average(:seats, :weighted_by => :passengers).must_equal 5.45455
|
116
116
|
end
|
117
117
|
|
118
|
-
|
118
|
+
it %{works with cohort_scope (albeit rather clumsily)} do
|
119
119
|
aircraft = Aircraft.find('B742')
|
120
|
-
|
120
|
+
FlightSegment.big_cohort(:aircraft_description => aircraft.flight_segments_foreign_keys).count.must_equal 2
|
121
121
|
end
|
122
122
|
|
123
|
-
def test_006_you_can_get_aircraft_from_flight_segments
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
end
|
123
|
+
# def test_006_you_can_get_aircraft_from_flight_segments
|
124
|
+
# fs = FlightSegment.first
|
125
|
+
# # you need to add an aircraft_description column
|
126
|
+
# lambda do
|
127
|
+
# fs.aircraft.count.must_equal 2
|
128
|
+
# end.must_raise ActiveRecord::StatementInvalid
|
129
|
+
# end
|
130
130
|
end
|
data/test/test_fuzzy_match.rb
CHANGED
@@ -1,181 +1,180 @@
|
|
1
1
|
# -*- encoding: utf-8 -*-
|
2
2
|
require 'helper'
|
3
3
|
|
4
|
-
class TestFuzzyMatch <
|
5
|
-
|
4
|
+
class TestFuzzyMatch < MiniTest::Spec
|
5
|
+
it %{identify the best match based on string similarity} do
|
6
6
|
d = FuzzyMatch.new %w{ RATZ CATZ }
|
7
|
-
|
8
|
-
|
7
|
+
d.find('RITZ').must_equal 'RATZ'
|
8
|
+
d.find('RíTZ').must_equal 'RATZ'
|
9
9
|
|
10
10
|
d = FuzzyMatch.new [ 'X' ]
|
11
|
-
|
12
|
-
|
11
|
+
d.find('X').must_equal 'X'
|
12
|
+
d.find('A').must_be_nil
|
13
13
|
end
|
14
14
|
|
15
|
-
|
15
|
+
it %{not gather metadata about the last result by default} do
|
16
16
|
d = FuzzyMatch.new %w{ NISSAN HONDA }
|
17
17
|
d.find('MISSAM')
|
18
|
-
|
18
|
+
lambda do
|
19
19
|
d.last_result
|
20
|
-
end
|
20
|
+
end.must_raise ::RuntimeError, /gather_last_result/
|
21
21
|
end
|
22
22
|
|
23
|
-
|
23
|
+
it %{optionally gather metadata about the last result} do
|
24
24
|
d = FuzzyMatch.new %w{ NISSAN HONDA }
|
25
25
|
d.find 'MISSAM', :gather_last_result => true
|
26
|
-
|
27
|
-
|
26
|
+
d.last_result.score.must_equal 0.6
|
27
|
+
d.last_result.winner.must_equal 'NISSAN'
|
28
28
|
end
|
29
29
|
|
30
|
-
|
30
|
+
it %{use NORMALIZERS} do
|
31
31
|
d = FuzzyMatch.new ['BOEING 737-100/200', 'BOEING 737-900']
|
32
|
-
|
32
|
+
d.find('BOEING 737100 number 900').must_equal 'BOEING 737-900'
|
33
33
|
|
34
34
|
normalizers = [
|
35
35
|
%r{(7\d)(7|0)-?(\d{1,3})} # tighten 737-100/200 => 737100, which will cause it to win over 737-900
|
36
36
|
]
|
37
37
|
d = FuzzyMatch.new ['BOEING 737-100/200', 'BOEING 737-900'], :normalizers => normalizers
|
38
|
-
|
38
|
+
d.find('BOEING 737100 number 900').must_equal 'BOEING 737-100/200'
|
39
39
|
end
|
40
40
|
|
41
|
-
|
41
|
+
it %{use IDENTITIES} do
|
42
|
+
# false positive without identity
|
42
43
|
d = FuzzyMatch.new %w{ foo bar }
|
43
|
-
|
44
|
-
end
|
44
|
+
d.find('baz').must_equal 'bar'
|
45
45
|
|
46
|
-
def test_008_identify_false_positive
|
47
46
|
d = FuzzyMatch.new %w{ foo bar }, :identities => [ /ba(.)/ ]
|
48
|
-
|
47
|
+
d.find('baz').must_be_nil
|
49
48
|
end
|
50
49
|
|
51
50
|
# TODO this is not very helpful
|
52
|
-
|
51
|
+
it %{use BLOCKINGS} do
|
53
52
|
d = FuzzyMatch.new [ 'X' ], :blockings => [ /X/, /Y/ ]
|
54
|
-
|
55
|
-
|
53
|
+
d.find('X').must_equal 'X'
|
54
|
+
d.find('A').must_be_nil
|
56
55
|
end
|
57
56
|
|
58
57
|
# TODO this is not very helpful
|
59
|
-
|
58
|
+
it %{optionally only attempt matches with records that fit into a blocking} do
|
60
59
|
d = FuzzyMatch.new [ 'X' ], :blockings => [ /X/, /Y/ ], :must_match_blocking => true
|
61
|
-
|
62
|
-
|
60
|
+
d.find('X').must_equal 'X'
|
61
|
+
d.find('A').must_be_nil
|
63
62
|
|
64
63
|
d = FuzzyMatch.new [ 'X' ], :blockings => [ /X/, /Y/ ]
|
65
|
-
|
66
|
-
|
64
|
+
d.find('X', :must_match_blocking => true).must_equal 'X'
|
65
|
+
d.find('A', :must_match_blocking => true).must_be_nil
|
67
66
|
end
|
68
67
|
|
69
|
-
|
68
|
+
it %{receive the deprecated FuzzyMatch#free method without complaint} do
|
70
69
|
d = FuzzyMatch.new %w{ A B }
|
71
|
-
|
72
|
-
|
73
|
-
d.find 'A'
|
74
|
-
end
|
70
|
+
d.free
|
71
|
+
d.find('A').wont_be_nil
|
75
72
|
end
|
76
73
|
|
77
|
-
|
74
|
+
it %{return all records in sorted order} do
|
78
75
|
d = FuzzyMatch.new [ 'X', 'X22', 'Y', 'Y4' ], :blockings => [ /X/, /Y/ ], :must_match_blocking => true
|
79
|
-
|
80
|
-
|
76
|
+
d.find_all('X').must_equal ['X', 'X22' ]
|
77
|
+
d.find_all('A').must_equal []
|
81
78
|
end
|
82
79
|
|
83
|
-
|
80
|
+
it %{optionally force the first blocking to decide} do
|
84
81
|
d = FuzzyMatch.new [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], :blockings => [ /(boeing \d{3})/i, /boeing/i ]
|
85
|
-
|
82
|
+
d.find_all('Boeing 747').must_equal [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ]
|
86
83
|
|
87
84
|
d = FuzzyMatch.new [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], :blockings => [ /(boeing \d{3})/i, /boeing/i ], :first_blocking_decides => true
|
88
|
-
|
85
|
+
d.find_all('Boeing 747').must_equal [ 'Boeing 747', 'Boeing 747SR' ]
|
89
86
|
|
90
87
|
# first_blocking_decides refers to the needle
|
91
88
|
d = FuzzyMatch.new [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], :blockings => [ /(boeing \d{3})/i, /boeing/i ], :first_blocking_decides => true
|
92
|
-
|
89
|
+
d.find_all('Boeing ER6').must_equal ["Boeing ER6", "Boeing 747", "Boeing 747SR"]
|
93
90
|
|
94
91
|
d = FuzzyMatch.new [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], :blockings => [ /(boeing \d{3})/i, /boeing (7|E)/i, /boeing/i ], :first_blocking_decides => true
|
95
|
-
|
92
|
+
d.find_all('Boeing ER6').must_equal [ 'Boeing ER6' ]
|
96
93
|
|
97
94
|
# or equivalently with an identity
|
98
95
|
d = FuzzyMatch.new [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], :blockings => [ /(boeing \d{3})/i, /boeing/i ], :first_blocking_decides => true, :identities => [ /boeing (7|E)/i ]
|
99
|
-
|
100
|
-
end
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
96
|
+
d.find_all('Boeing ER6').must_equal [ 'Boeing ER6' ]
|
97
|
+
end
|
98
|
+
|
99
|
+
describe "the :read option" do
|
100
|
+
it %{interpret a Numeric as an array index} do
|
101
|
+
ab = ['a', 'b']
|
102
|
+
ba = ['b', 'a']
|
103
|
+
haystack = [ab, ba]
|
104
|
+
by_first = FuzzyMatch.new haystack, :read => 0
|
105
|
+
by_last = FuzzyMatch.new haystack, :read => 1
|
106
|
+
by_first.find('a').must_equal ab
|
107
|
+
by_last.find('b').must_equal ab
|
108
|
+
by_first.find('b').must_equal ba
|
109
|
+
by_last.find('a').must_equal ba
|
110
|
+
end
|
111
|
+
|
112
|
+
it %{interpret a Symbol, etc. as hash key} do
|
113
|
+
ab = { :one => 'a', :two => 'b' }
|
114
|
+
ba = { :one => 'b', :two => 'a' }
|
115
|
+
haystack = [ab, ba]
|
116
|
+
by_first = FuzzyMatch.new haystack, :read => :one
|
117
|
+
by_last = FuzzyMatch.new haystack, :read => :two
|
118
|
+
by_first.find('a').must_equal ab
|
119
|
+
by_last.find('b').must_equal ab
|
120
|
+
by_first.find('b').must_equal ba
|
121
|
+
by_last.find('a').must_equal ba
|
122
|
+
end
|
123
|
+
|
124
|
+
MyStruct = Struct.new(:one, :two)
|
125
|
+
it %{interpret a Symbol as a method id (if the object responds to it)} do
|
126
|
+
ab = MyStruct.new('a', 'b')
|
127
|
+
ba = MyStruct.new('b', 'a')
|
128
|
+
haystack = [ab, ba]
|
129
|
+
by_first = FuzzyMatch.new haystack, :read => :one
|
130
|
+
by_last = FuzzyMatch.new haystack, :read => :two
|
131
|
+
by_first.read.must_equal :one
|
132
|
+
by_last.read.must_equal :two
|
133
|
+
by_first.find('a').must_equal ab
|
134
|
+
by_last.find('b').must_equal ab
|
135
|
+
by_first.find('b').must_equal ba
|
136
|
+
by_last.find('a').must_equal ba
|
137
|
+
end
|
138
|
+
|
139
|
+
it %{treat the deprecrated :haystack_reader option as an alias} do
|
140
|
+
ab = ['a', 'b']
|
141
|
+
ba = ['b', 'a']
|
142
|
+
haystack = [ab, ba]
|
143
|
+
by_first = FuzzyMatch.new haystack, :haystack_reader => 0
|
144
|
+
by_first.find('a').must_equal ab
|
145
|
+
by_first.find('b').must_equal ba
|
146
|
+
end
|
147
|
+
end
|
148
|
+
|
149
|
+
it %{not return any result if the maximum score is zero} do
|
150
|
+
FuzzyMatch.new(['a']).find('b').must_be_nil
|
151
|
+
end
|
152
|
+
|
153
|
+
it %{optionally require that the matching record share at least one word with the needle} do
|
155
154
|
d = FuzzyMatch.new %w{ RATZ CATZ }, :must_match_at_least_one_word => true
|
156
|
-
|
155
|
+
d.find('RITZ').must_be_nil
|
157
156
|
|
158
157
|
d = FuzzyMatch.new ["Foo's Bar"], :must_match_at_least_one_word => true
|
159
|
-
|
160
|
-
|
161
|
-
|
158
|
+
d.find("Foo's").must_equal "Foo's Bar"
|
159
|
+
d.find("'s").must_be_nil
|
160
|
+
d.find("Foo").must_be_nil
|
162
161
|
|
163
162
|
d = FuzzyMatch.new ["Bolivia, Plurinational State of"], :must_match_at_least_one_word => true
|
164
|
-
|
163
|
+
d.find("Bolivia").must_equal "Bolivia, Plurinational State of"
|
165
164
|
end
|
166
165
|
|
167
|
-
|
166
|
+
it %{use STOP WORDS} do
|
168
167
|
d = FuzzyMatch.new [ 'A HOTEL', 'B HTL' ]
|
169
|
-
|
168
|
+
d.find('A HTL', :must_match_at_least_one_word => true).must_equal 'B HTL'
|
170
169
|
|
171
170
|
d = FuzzyMatch.new [ 'A HOTEL', 'B HTL' ], :must_match_at_least_one_word => true
|
172
|
-
|
171
|
+
d.find('A HTL').must_equal 'B HTL'
|
173
172
|
|
174
173
|
d = FuzzyMatch.new [ 'A HOTEL', 'B HTL' ], :must_match_at_least_one_word => true, :stop_words => [ %r{HO?TE?L} ]
|
175
|
-
|
174
|
+
d.find('A HTL').must_equal 'A HOTEL'
|
176
175
|
end
|
177
176
|
|
178
|
-
|
177
|
+
it %{print a basic explanation to stdout} do
|
179
178
|
require 'stringio'
|
180
179
|
capture = StringIO.new
|
181
180
|
begin
|
@@ -187,17 +186,24 @@ class TestFuzzyMatch < Test::Unit::TestCase
|
|
187
186
|
$stdout = old_stdout
|
188
187
|
end
|
189
188
|
capture.rewind
|
190
|
-
|
189
|
+
capture.read.must_include 'CATZ'
|
191
190
|
end
|
192
191
|
|
193
|
-
|
192
|
+
it %{not be fooled by substrings (but rather compare whole words to whole words)} do
|
194
193
|
d = FuzzyMatch.new [ 'PENINSULA HOTELS' ], :must_match_at_least_one_word => true
|
195
|
-
|
194
|
+
d.find('DOLCE LA HULPE BXL FI').must_be_nil
|
196
195
|
end
|
197
196
|
|
198
|
-
|
197
|
+
it %{not be case-sensitive when checking for sharing of words} do
|
199
198
|
d = FuzzyMatch.new [ 'A', 'B' ]
|
200
|
-
|
199
|
+
d.find('a', :must_match_at_least_one_word => true).must_equal 'A'
|
201
200
|
end
|
202
201
|
|
202
|
+
it %{defaults to a pure-ruby engine, but also has amatch} do
|
203
|
+
if defined?($testing_amatch) and $testing_amatch
|
204
|
+
FuzzyMatch.engine.must_equal :amatch
|
205
|
+
else
|
206
|
+
FuzzyMatch.engine.must_equal :pure_ruby
|
207
|
+
end
|
208
|
+
end
|
203
209
|
end
|
data/test/test_identity.rb
CHANGED
@@ -1,38 +1,36 @@
|
|
1
1
|
require 'helper'
|
2
2
|
|
3
|
-
class TestIdentity <
|
4
|
-
|
3
|
+
class TestIdentity < MiniTest::Spec
|
4
|
+
it %{determines whether two records COULD be identical} do
|
5
5
|
i = FuzzyMatch::Identity.new %r{(A)[ ]*(\d)}
|
6
|
-
|
6
|
+
i.identical?('A1', 'A 1foobar').must_equal true
|
7
7
|
end
|
8
8
|
|
9
|
-
|
9
|
+
it %{determines that two records MUST NOT be identical} do
|
10
10
|
i = FuzzyMatch::Identity.new %r{(A)[ ]*(\d)}
|
11
|
-
|
11
|
+
i.identical?('A1', 'A 2foobar').must_equal false
|
12
12
|
end
|
13
13
|
|
14
|
-
|
14
|
+
it %{returns nil indicating no information} do
|
15
15
|
i = FuzzyMatch::Identity.new %r{(A)[ ]*(\d)}
|
16
|
-
|
16
|
+
i.identical?('B1', 'A 2foobar').must_equal nil
|
17
17
|
end
|
18
18
|
|
19
|
-
|
19
|
+
it %{can be initialized with a regexp} do
|
20
20
|
i = FuzzyMatch::Identity.new %r{\A\\?/(.*)etc/mysql\$$}
|
21
|
-
|
21
|
+
i.regexp.must_equal %r{\A\\?/(.*)etc/mysql\$$}
|
22
22
|
end
|
23
23
|
|
24
|
-
|
24
|
+
it %{can be initialized from a string (via to_regexp gem)} do
|
25
25
|
i = FuzzyMatch::Identity.new '%r{\A\\\?/(.*)etc/mysql\$$}'
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
def test_006_regexp_from_string_using_slash_delim
|
26
|
+
i.regexp.must_equal %r{\A\\?/(.*)etc/mysql\$$}
|
27
|
+
|
30
28
|
i = FuzzyMatch::Identity.new '/\A\\\?\/(.*)etc\/mysql\$$/'
|
31
|
-
|
29
|
+
i.regexp.must_equal %r{\A\\?/(.*)etc/mysql\$$}
|
32
30
|
end
|
33
31
|
|
34
|
-
|
32
|
+
it %{embraces case insensitivity} do
|
35
33
|
i = FuzzyMatch::Identity.new %r{(A)[ ]*(\d)}i
|
36
|
-
|
34
|
+
i.identical?('A1', 'a 1foobar').must_equal true
|
37
35
|
end
|
38
36
|
end
|
data/test/test_normalizer.rb
CHANGED
@@ -1,10 +1,10 @@
|
|
1
1
|
require 'helper'
|
2
2
|
|
3
|
-
class TestNormalizer <
|
4
|
-
|
3
|
+
class TestNormalizer < MiniTest::Spec
|
4
|
+
it %{applies itself to a string argument} do
|
5
5
|
t = FuzzyMatch::Normalizer.new %r{(Ford )[ ]*(F)[\- ]*(\d\d\d)}i
|
6
|
-
|
7
|
-
|
8
|
-
|
6
|
+
t.apply('Ford F-350').must_equal 'Ford F350'
|
7
|
+
t.apply('Ford F150').must_equal 'Ford F150'
|
8
|
+
t.apply('Ford F 350').must_equal 'Ford F350'
|
9
9
|
end
|
10
10
|
end
|
data/test/test_wrapper.rb
CHANGED
@@ -1,26 +1,26 @@
|
|
1
1
|
require 'helper'
|
2
2
|
|
3
|
-
class TestWrapper <
|
4
|
-
|
3
|
+
class TestWrapper < MiniTest::Spec
|
4
|
+
it %{does not treat "'s" as a word} do
|
5
5
|
assert_split ["foo's", "bar"], "Foo's Bar"
|
6
6
|
end
|
7
7
|
|
8
|
-
|
8
|
+
it %{treats "bolivia," as just "bolivia"} do
|
9
9
|
assert_split ["bolivia", "plurinational", "state"], "Bolivia, Plurinational State"
|
10
10
|
end
|
11
11
|
|
12
|
-
|
12
|
+
it %{does not split up hyphenated words} do
|
13
13
|
assert_split ['north-west'], "north-west"
|
14
14
|
end
|
15
15
|
|
16
|
-
|
16
|
+
it %{splits up words as expected} do
|
17
17
|
assert_split ['the', 'quick', "fox's", 'mouth', 'is', 'always', 'full'], "the quick fox's mouth -- is always full."
|
18
18
|
end
|
19
19
|
|
20
20
|
private
|
21
21
|
|
22
22
|
def assert_split(ary, str)
|
23
|
-
|
23
|
+
FuzzyMatch::Wrapper.new(null_fuzzy_match, str, true).words.must_equal ary
|
24
24
|
end
|
25
25
|
|
26
26
|
def null_fuzzy_match
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: fuzzy_match
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.2.2
|
4
|
+
version: 1.3.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-
|
12
|
+
date: 2012-02-07 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
|
-
name:
|
16
|
-
requirement: &
|
15
|
+
name: minitest
|
16
|
+
requirement: &2152246040 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,21 +21,10 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :development
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
25
|
-
- !ruby/object:Gem::Dependency
|
26
|
-
name: remote_table
|
27
|
-
requirement: &2170569000 !ruby/object:Gem::Requirement
|
28
|
-
none: false
|
29
|
-
requirements:
|
30
|
-
- - ! '>='
|
31
|
-
- !ruby/object:Gem::Version
|
32
|
-
version: '0'
|
33
|
-
type: :development
|
34
|
-
prerelease: false
|
35
|
-
version_requirements: *2170569000
|
24
|
+
version_requirements: *2152246040
|
36
25
|
- !ruby/object:Gem::Dependency
|
37
26
|
name: activerecord
|
38
|
-
requirement: &
|
27
|
+
requirement: &2152244820 !ruby/object:Gem::Requirement
|
39
28
|
none: false
|
40
29
|
requirements:
|
41
30
|
- - ! '>='
|
@@ -43,10 +32,10 @@ dependencies:
|
|
43
32
|
version: '3'
|
44
33
|
type: :development
|
45
34
|
prerelease: false
|
46
|
-
version_requirements: *
|
35
|
+
version_requirements: *2152244820
|
47
36
|
- !ruby/object:Gem::Dependency
|
48
|
-
name:
|
49
|
-
requirement: &
|
37
|
+
name: mysql2
|
38
|
+
requirement: &2152260680 !ruby/object:Gem::Requirement
|
50
39
|
none: false
|
51
40
|
requirements:
|
52
41
|
- - ! '>='
|
@@ -54,10 +43,10 @@ dependencies:
|
|
54
43
|
version: '0'
|
55
44
|
type: :development
|
56
45
|
prerelease: false
|
57
|
-
version_requirements: *
|
46
|
+
version_requirements: *2152260680
|
58
47
|
- !ruby/object:Gem::Dependency
|
59
48
|
name: cohort_scope
|
60
|
-
requirement: &
|
49
|
+
requirement: &2152260120 !ruby/object:Gem::Requirement
|
61
50
|
none: false
|
62
51
|
requirements:
|
63
52
|
- - ! '>='
|
@@ -65,10 +54,10 @@ dependencies:
|
|
65
54
|
version: '0'
|
66
55
|
type: :development
|
67
56
|
prerelease: false
|
68
|
-
version_requirements: *
|
57
|
+
version_requirements: *2152260120
|
69
58
|
- !ruby/object:Gem::Dependency
|
70
59
|
name: weighted_average
|
71
|
-
requirement: &
|
60
|
+
requirement: &2152259560 !ruby/object:Gem::Requirement
|
72
61
|
none: false
|
73
62
|
requirements:
|
74
63
|
- - ! '>='
|
@@ -76,10 +65,10 @@ dependencies:
|
|
76
65
|
version: '0'
|
77
66
|
type: :development
|
78
67
|
prerelease: false
|
79
|
-
version_requirements: *
|
68
|
+
version_requirements: *2152259560
|
80
69
|
- !ruby/object:Gem::Dependency
|
81
70
|
name: rake
|
82
|
-
requirement: &
|
71
|
+
requirement: &2152258980 !ruby/object:Gem::Requirement
|
83
72
|
none: false
|
84
73
|
requirements:
|
85
74
|
- - ! '>='
|
@@ -87,10 +76,10 @@ dependencies:
|
|
87
76
|
version: '0'
|
88
77
|
type: :development
|
89
78
|
prerelease: false
|
90
|
-
version_requirements: *
|
79
|
+
version_requirements: *2152258980
|
91
80
|
- !ruby/object:Gem::Dependency
|
92
81
|
name: yard
|
93
|
-
requirement: &
|
82
|
+
requirement: &2152258500 !ruby/object:Gem::Requirement
|
94
83
|
none: false
|
95
84
|
requirements:
|
96
85
|
- - ! '>='
|
@@ -98,10 +87,10 @@ dependencies:
|
|
98
87
|
version: '0'
|
99
88
|
type: :development
|
100
89
|
prerelease: false
|
101
|
-
version_requirements: *
|
90
|
+
version_requirements: *2152258500
|
102
91
|
- !ruby/object:Gem::Dependency
|
103
92
|
name: amatch
|
104
|
-
requirement: &
|
93
|
+
requirement: &2152258020 !ruby/object:Gem::Requirement
|
105
94
|
none: false
|
106
95
|
requirements:
|
107
96
|
- - ! '>='
|
@@ -109,10 +98,10 @@ dependencies:
|
|
109
98
|
version: '0'
|
110
99
|
type: :development
|
111
100
|
prerelease: false
|
112
|
-
version_requirements: *
|
101
|
+
version_requirements: *2152258020
|
113
102
|
- !ruby/object:Gem::Dependency
|
114
103
|
name: activesupport
|
115
|
-
requirement: &
|
104
|
+
requirement: &2152257320 !ruby/object:Gem::Requirement
|
116
105
|
none: false
|
117
106
|
requirements:
|
118
107
|
- - ! '>='
|
@@ -120,10 +109,10 @@ dependencies:
|
|
120
109
|
version: '3'
|
121
110
|
type: :runtime
|
122
111
|
prerelease: false
|
123
|
-
version_requirements: *
|
112
|
+
version_requirements: *2152257320
|
124
113
|
- !ruby/object:Gem::Dependency
|
125
114
|
name: to_regexp
|
126
|
-
requirement: &
|
115
|
+
requirement: &2152256720 !ruby/object:Gem::Requirement
|
127
116
|
none: false
|
128
117
|
requirements:
|
129
118
|
- - ! '>='
|
@@ -131,7 +120,7 @@ dependencies:
|
|
131
120
|
version: 0.0.3
|
132
121
|
type: :runtime
|
133
122
|
prerelease: false
|
134
|
-
version_requirements: *
|
123
|
+
version_requirements: *2152256720
|
135
124
|
description: Find a needle in a haystack using string similarity and (optionally)
|
136
125
|
regexp rules. Replaces loose_tight_dictionary.
|
137
126
|
email:
|
@@ -173,11 +162,14 @@ files:
|
|
173
162
|
- lib/fuzzy_match/normalizer.rb
|
174
163
|
- lib/fuzzy_match/result.rb
|
175
164
|
- lib/fuzzy_match/score.rb
|
165
|
+
- lib/fuzzy_match/score/amatch.rb
|
166
|
+
- lib/fuzzy_match/score/pure_ruby.rb
|
176
167
|
- lib/fuzzy_match/similarity.rb
|
177
168
|
- lib/fuzzy_match/stop_word.rb
|
178
169
|
- lib/fuzzy_match/version.rb
|
179
170
|
- lib/fuzzy_match/wrapper.rb
|
180
171
|
- test/helper.rb
|
172
|
+
- test/test_amatch.rb
|
181
173
|
- test/test_blocking.rb
|
182
174
|
- test/test_cache.rb
|
183
175
|
- test/test_fuzzy_match.rb
|
@@ -205,13 +197,14 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
205
197
|
version: '0'
|
206
198
|
requirements: []
|
207
199
|
rubyforge_project: fuzzy_match
|
208
|
-
rubygems_version: 1.8.
|
200
|
+
rubygems_version: 1.8.15
|
209
201
|
signing_key:
|
210
202
|
specification_version: 3
|
211
203
|
summary: Find a needle in a haystack using string similarity and (optionally) regexp
|
212
204
|
rules. Replaces loose_tight_dictionary.
|
213
205
|
test_files:
|
214
206
|
- test/helper.rb
|
207
|
+
- test/test_amatch.rb
|
215
208
|
- test/test_blocking.rb
|
216
209
|
- test/test_cache.rb
|
217
210
|
- test/test_fuzzy_match.rb
|