fuzzy_match 1.3.1 → 1.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +12 -2
- data/History.txt +13 -0
- data/README.markdown +10 -6
- data/benchmark/before-with-free.txt +21 -21
- data/benchmark/before.txt +21 -21
- data/benchmark/memory.rb +6 -6
- data/examples/bts_aircraft/{blockings.csv → groupings.csv} +0 -0
- data/examples/bts_aircraft/test_bts_aircraft.rb +6 -6
- data/fuzzy_match.gemspec +1 -10
- data/lib/fuzzy_match.rb +41 -33
- data/lib/fuzzy_match/result.rb +1 -1
- data/lib/fuzzy_match/rule.rb +14 -0
- data/lib/fuzzy_match/rule/grouping.rb +32 -0
- data/lib/fuzzy_match/rule/identity.rb +19 -0
- data/lib/fuzzy_match/rule/normalizer.rb +20 -0
- data/lib/fuzzy_match/rule/stop_word.rb +11 -0
- data/lib/fuzzy_match/version.rb +1 -1
- data/test/helper.rb +3 -1
- data/test/test_fuzzy_match.rb +188 -124
- data/test/test_fuzzy_match_convoluted.rb.disabled +12 -12
- data/test/{test_blocking.rb → test_grouping.rb} +6 -6
- data/test/test_identity.rb +8 -8
- data/test/test_normalizer.rb +2 -2
- data/test/test_wrapper.rb +1 -1
- metadata +15 -101
- data/lib/fuzzy_match/blocking.rb +0 -36
- data/lib/fuzzy_match/identity.rb +0 -23
- data/lib/fuzzy_match/normalizer.rb +0 -28
- data/lib/fuzzy_match/stop_word.rb +0 -19
@@ -42,7 +42,7 @@ class TestFuzzyMatchConvoluted < MiniTest::Spec
|
|
42
42
|
]
|
43
43
|
@tightenings = []
|
44
44
|
@identities = []
|
45
|
-
@blockings = []
|
45
|
+
@groupings = []
|
46
46
|
@positives = []
|
47
47
|
@negatives = []
|
48
48
|
end
|
@@ -55,23 +55,23 @@ class TestFuzzyMatchConvoluted < MiniTest::Spec
|
|
55
55
|
@_ltd ||= FuzzyMatch.new @haystack,
|
56
56
|
:tightenings => @tightenings,
|
57
57
|
:identities => @identities,
|
58
|
-
:blockings => @blockings,
|
58
|
+
:groupings => @groupings,
|
59
59
|
:positives => @positives,
|
60
60
|
:negatives => @negatives,
|
61
|
-
:blocking_only => @blocking_only,
|
61
|
+
:grouping_only => @grouping_only,
|
62
62
|
:log => $log
|
63
63
|
end
|
64
64
|
|
65
|
-
should "optionally only pay attention to things that match blockings" do
|
65
|
+
should "optionally only pay attention to things that match groupings" do
|
66
66
|
assert_equal @a_haystack, ltd.improver.match(@a_needle)
|
67
67
|
|
68
68
|
clear_ltd
|
69
|
-
@blocking_only = true
|
69
|
+
@grouping_only = true
|
70
70
|
assert_equal nil, ltd.improver.match(@a_needle)
|
71
71
|
|
72
72
|
clear_ltd
|
73
|
-
@blocking_only = true
|
74
|
-
@blockings.push ['/dash/i']
|
73
|
+
@grouping_only = true
|
74
|
+
@groupings.push ['/dash/i']
|
75
75
|
assert_equal @a_haystack, ltd.improver.match(@a_needle)
|
76
76
|
end
|
77
77
|
|
@@ -111,7 +111,7 @@ class TestFuzzyMatchConvoluted < MiniTest::Spec
|
|
111
111
|
end
|
112
112
|
end
|
113
113
|
|
114
|
-
should "have a false match without blocking" do
|
114
|
+
should "have a false match without grouping" do
|
115
115
|
# @d_needle will be our victim
|
116
116
|
@haystack.push @d_lookalike
|
117
117
|
@tightenings.push @t_1
|
@@ -119,19 +119,19 @@ class TestFuzzyMatchConvoluted < MiniTest::Spec
|
|
119
119
|
assert_equal @d_lookalike, ltd.improver.match(@d_needle)
|
120
120
|
end
|
121
121
|
|
122
|
-
should "do
|
122
|
+
should "do grouping if the needle matches a group" do
|
123
123
|
# @d_needle will be our victim
|
124
124
|
@haystack.push @d_lookalike
|
125
125
|
@tightenings.push @t_1
|
126
|
-
@blockings.push ['/(bombardier|de ?havilland)/i']
|
126
|
+
@groupings.push ['/(bombardier|de ?havilland)/i']
|
127
127
|
|
128
128
|
assert_equal @d_haystack, ltd.improver.match(@d_needle)
|
129
129
|
end
|
130
130
|
|
131
|
-
should "treat
|
131
|
+
should "treat groups as exclusive" do
|
132
132
|
@haystack = [ @d_needle ]
|
133
133
|
@tightenings.push @t_1
|
134
|
-
@blockings.push ['/(bombardier|de ?havilland)/i']
|
134
|
+
@groupings.push ['/(bombardier|de ?havilland)/i']
|
135
135
|
|
136
136
|
assert_equal nil, ltd.improver.match(@d_lookalike)
|
137
137
|
end
|
@@ -1,28 +1,28 @@
|
|
1
1
|
require 'helper'
|
2
2
|
|
3
|
-
|
3
|
+
describe FuzzyMatch::Rule::Grouping do
|
4
4
|
it %{matches a single string argument} do
|
5
|
-
b = FuzzyMatch::Blocking.new %r{apple}
|
5
|
+
b = FuzzyMatch::Rule::Grouping.new %r{apple}
|
6
6
|
b.match?('2 apples').must_equal true
|
7
7
|
end
|
8
8
|
|
9
9
|
it %{embraces case insensitivity} do
|
10
|
-
b = FuzzyMatch::Blocking.new %r{apple}i
|
10
|
+
b = FuzzyMatch::Rule::Grouping.new %r{apple}i
|
11
11
|
b.match?('2 Apples').must_equal true
|
12
12
|
end
|
13
13
|
|
14
14
|
it %{joins two string arguments} do
|
15
|
-
b = FuzzyMatch::Blocking.new %r{apple}
|
15
|
+
b = FuzzyMatch::Rule::Grouping.new %r{apple}
|
16
16
|
b.join?('apple', '2 apples').must_equal true
|
17
17
|
end
|
18
18
|
|
19
19
|
it %{fails to join two string arguments} do
|
20
|
-
b = FuzzyMatch::Blocking.new %r{apple}
|
20
|
+
b = FuzzyMatch::Rule::Grouping.new %r{apple}
|
21
21
|
b.join?('orange', '2 apples').must_equal false
|
22
22
|
end
|
23
23
|
|
24
24
|
it %{returns nil instead of false when it has no information} do
|
25
|
-
b = FuzzyMatch::Blocking.new %r{apple}
|
25
|
+
b = FuzzyMatch::Rule::Grouping.new %r{apple}
|
26
26
|
b.join?('orange', 'orange').must_be_nil
|
27
27
|
end
|
28
28
|
end
|
data/test/test_identity.rb
CHANGED
@@ -1,36 +1,36 @@
|
|
1
1
|
require 'helper'
|
2
2
|
|
3
|
-
|
3
|
+
describe FuzzyMatch::Rule::Identity do
|
4
4
|
it %{determines whether two records COULD be identical} do
|
5
|
-
i = FuzzyMatch::Identity.new %r{(A)[ ]*(\d)}
|
5
|
+
i = FuzzyMatch::Rule::Identity.new %r{(A)[ ]*(\d)}
|
6
6
|
i.identical?('A1', 'A 1foobar').must_equal true
|
7
7
|
end
|
8
8
|
|
9
9
|
it %{determines that two records MUST NOT be identical} do
|
10
|
-
i = FuzzyMatch::Identity.new %r{(A)[ ]*(\d)}
|
10
|
+
i = FuzzyMatch::Rule::Identity.new %r{(A)[ ]*(\d)}
|
11
11
|
i.identical?('A1', 'A 2foobar').must_equal false
|
12
12
|
end
|
13
13
|
|
14
14
|
it %{returns nil indicating no information} do
|
15
|
-
i = FuzzyMatch::Identity.new %r{(A)[ ]*(\d)}
|
15
|
+
i = FuzzyMatch::Rule::Identity.new %r{(A)[ ]*(\d)}
|
16
16
|
i.identical?('B1', 'A 2foobar').must_equal nil
|
17
17
|
end
|
18
18
|
|
19
19
|
it %{can be initialized with a regexp} do
|
20
|
-
i = FuzzyMatch::Identity.new %r{\A\\?/(.*)etc/mysql\$$}
|
20
|
+
i = FuzzyMatch::Rule::Identity.new %r{\A\\?/(.*)etc/mysql\$$}
|
21
21
|
i.regexp.must_equal %r{\A\\?/(.*)etc/mysql\$$}
|
22
22
|
end
|
23
23
|
|
24
24
|
it %{can be initialized from a string (via to_regexp gem)} do
|
25
|
-
i = FuzzyMatch::Identity.new '%r{\A\\\?/(.*)etc/mysql\$$}'
|
25
|
+
i = FuzzyMatch::Rule::Identity.new '%r{\A\\\?/(.*)etc/mysql\$$}'
|
26
26
|
i.regexp.must_equal %r{\A\\?/(.*)etc/mysql\$$}
|
27
27
|
|
28
|
-
i = FuzzyMatch::Identity.new '/\A\\\?\/(.*)etc\/mysql\$$/'
|
28
|
+
i = FuzzyMatch::Rule::Identity.new '/\A\\\?\/(.*)etc\/mysql\$$/'
|
29
29
|
i.regexp.must_equal %r{\A\\?/(.*)etc/mysql\$$}
|
30
30
|
end
|
31
31
|
|
32
32
|
it %{embraces case insensitivity} do
|
33
|
-
i = FuzzyMatch::Identity.new %r{(A)[ ]*(\d)}i
|
33
|
+
i = FuzzyMatch::Rule::Identity.new %r{(A)[ ]*(\d)}i
|
34
34
|
i.identical?('A1', 'a 1foobar').must_equal true
|
35
35
|
end
|
36
36
|
end
|
data/test/test_normalizer.rb
CHANGED
@@ -1,8 +1,8 @@
|
|
1
1
|
require 'helper'
|
2
2
|
|
3
|
-
|
3
|
+
describe FuzzyMatch::Rule::Normalizer do
|
4
4
|
it %{applies itself to a string argument} do
|
5
|
-
t = FuzzyMatch::Normalizer.new %r{(Ford )[ ]*(F)[\- ]*(\d\d\d)}i
|
5
|
+
t = FuzzyMatch::Rule::Normalizer.new %r{(Ford )[ ]*(F)[\- ]*(\d\d\d)}i
|
6
6
|
t.apply('Ford F-350').must_equal 'Ford F350'
|
7
7
|
t.apply('Ford F150').must_equal 'Ford F150'
|
8
8
|
t.apply('Ford F 350').must_equal 'Ford F350'
|
data/test/test_wrapper.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: fuzzy_match
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.3.1
|
4
|
+
version: 1.3.2
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,99 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-02-
|
12
|
+
date: 2012-02-24 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
|
-
- !ruby/object:Gem::Dependency
|
15
|
-
name: minitest
|
16
|
-
requirement: &2165315840 !ruby/object:Gem::Requirement
|
17
|
-
none: false
|
18
|
-
requirements:
|
19
|
-
- - ! '>='
|
20
|
-
- !ruby/object:Gem::Version
|
21
|
-
version: '0'
|
22
|
-
type: :development
|
23
|
-
prerelease: false
|
24
|
-
version_requirements: *2165315840
|
25
|
-
- !ruby/object:Gem::Dependency
|
26
|
-
name: activerecord
|
27
|
-
requirement: &2165314740 !ruby/object:Gem::Requirement
|
28
|
-
none: false
|
29
|
-
requirements:
|
30
|
-
- - ! '>='
|
31
|
-
- !ruby/object:Gem::Version
|
32
|
-
version: '3'
|
33
|
-
type: :development
|
34
|
-
prerelease: false
|
35
|
-
version_requirements: *2165314740
|
36
|
-
- !ruby/object:Gem::Dependency
|
37
|
-
name: mysql2
|
38
|
-
requirement: &2165314140 !ruby/object:Gem::Requirement
|
39
|
-
none: false
|
40
|
-
requirements:
|
41
|
-
- - ! '>='
|
42
|
-
- !ruby/object:Gem::Version
|
43
|
-
version: '0'
|
44
|
-
type: :development
|
45
|
-
prerelease: false
|
46
|
-
version_requirements: *2165314140
|
47
|
-
- !ruby/object:Gem::Dependency
|
48
|
-
name: cohort_scope
|
49
|
-
requirement: &2165313620 !ruby/object:Gem::Requirement
|
50
|
-
none: false
|
51
|
-
requirements:
|
52
|
-
- - ! '>='
|
53
|
-
- !ruby/object:Gem::Version
|
54
|
-
version: '0'
|
55
|
-
type: :development
|
56
|
-
prerelease: false
|
57
|
-
version_requirements: *2165313620
|
58
|
-
- !ruby/object:Gem::Dependency
|
59
|
-
name: weighted_average
|
60
|
-
requirement: &2165312980 !ruby/object:Gem::Requirement
|
61
|
-
none: false
|
62
|
-
requirements:
|
63
|
-
- - ! '>='
|
64
|
-
- !ruby/object:Gem::Version
|
65
|
-
version: '0'
|
66
|
-
type: :development
|
67
|
-
prerelease: false
|
68
|
-
version_requirements: *2165312980
|
69
|
-
- !ruby/object:Gem::Dependency
|
70
|
-
name: rake
|
71
|
-
requirement: &2165312300 !ruby/object:Gem::Requirement
|
72
|
-
none: false
|
73
|
-
requirements:
|
74
|
-
- - ! '>='
|
75
|
-
- !ruby/object:Gem::Version
|
76
|
-
version: '0'
|
77
|
-
type: :development
|
78
|
-
prerelease: false
|
79
|
-
version_requirements: *2165312300
|
80
|
-
- !ruby/object:Gem::Dependency
|
81
|
-
name: yard
|
82
|
-
requirement: &2165311320 !ruby/object:Gem::Requirement
|
83
|
-
none: false
|
84
|
-
requirements:
|
85
|
-
- - ! '>='
|
86
|
-
- !ruby/object:Gem::Version
|
87
|
-
version: '0'
|
88
|
-
type: :development
|
89
|
-
prerelease: false
|
90
|
-
version_requirements: *2165311320
|
91
|
-
- !ruby/object:Gem::Dependency
|
92
|
-
name: amatch
|
93
|
-
requirement: &2165310720 !ruby/object:Gem::Requirement
|
94
|
-
none: false
|
95
|
-
requirements:
|
96
|
-
- - ! '>='
|
97
|
-
- !ruby/object:Gem::Version
|
98
|
-
version: '0'
|
99
|
-
type: :development
|
100
|
-
prerelease: false
|
101
|
-
version_requirements: *2165310720
|
102
14
|
- !ruby/object:Gem::Dependency
|
103
15
|
name: activesupport
|
104
|
-
requirement: &
|
16
|
+
requirement: &2151876580 !ruby/object:Gem::Requirement
|
105
17
|
none: false
|
106
18
|
requirements:
|
107
19
|
- - ! '>='
|
@@ -109,10 +21,10 @@ dependencies:
|
|
109
21
|
version: '3'
|
110
22
|
type: :runtime
|
111
23
|
prerelease: false
|
112
|
-
version_requirements: *
|
24
|
+
version_requirements: *2151876580
|
113
25
|
- !ruby/object:Gem::Dependency
|
114
26
|
name: to_regexp
|
115
|
-
requirement: &
|
27
|
+
requirement: &2151871760 !ruby/object:Gem::Requirement
|
116
28
|
none: false
|
117
29
|
requirements:
|
118
30
|
- - ! '>='
|
@@ -120,7 +32,7 @@ dependencies:
|
|
120
32
|
version: 0.0.3
|
121
33
|
type: :runtime
|
122
34
|
prerelease: false
|
123
|
-
version_requirements: *
|
35
|
+
version_requirements: *2151871760
|
124
36
|
description: Find a needle in a haystack using string similarity and (optionally)
|
125
37
|
regexp rules. Replaces loose_tight_dictionary.
|
126
38
|
email:
|
@@ -132,6 +44,7 @@ files:
|
|
132
44
|
- .document
|
133
45
|
- .gitignore
|
134
46
|
- Gemfile
|
47
|
+
- History.txt
|
135
48
|
- LICENSE
|
136
49
|
- README.markdown
|
137
50
|
- Rakefile
|
@@ -145,7 +58,7 @@ files:
|
|
145
58
|
- examples/bts_aircraft/5-2-D.htm
|
146
59
|
- examples/bts_aircraft/5-2-E.htm
|
147
60
|
- examples/bts_aircraft/5-2-G.htm
|
148
|
-
- examples/bts_aircraft/blockings.csv
|
61
|
+
- examples/bts_aircraft/groupings.csv
|
149
62
|
- examples/bts_aircraft/identities.csv
|
150
63
|
- examples/bts_aircraft/negatives.csv
|
151
64
|
- examples/bts_aircraft/normalizers.csv
|
@@ -156,24 +69,25 @@ files:
|
|
156
69
|
- examples/icao-bts.xls
|
157
70
|
- fuzzy_match.gemspec
|
158
71
|
- lib/fuzzy_match.rb
|
159
|
-
- lib/fuzzy_match/blocking.rb
|
160
72
|
- lib/fuzzy_match/cached_result.rb
|
161
|
-
- lib/fuzzy_match/identity.rb
|
162
|
-
- lib/fuzzy_match/normalizer.rb
|
163
73
|
- lib/fuzzy_match/result.rb
|
74
|
+
- lib/fuzzy_match/rule.rb
|
75
|
+
- lib/fuzzy_match/rule/grouping.rb
|
76
|
+
- lib/fuzzy_match/rule/identity.rb
|
77
|
+
- lib/fuzzy_match/rule/normalizer.rb
|
78
|
+
- lib/fuzzy_match/rule/stop_word.rb
|
164
79
|
- lib/fuzzy_match/score.rb
|
165
80
|
- lib/fuzzy_match/score/amatch.rb
|
166
81
|
- lib/fuzzy_match/score/pure_ruby.rb
|
167
82
|
- lib/fuzzy_match/similarity.rb
|
168
|
-
- lib/fuzzy_match/stop_word.rb
|
169
83
|
- lib/fuzzy_match/version.rb
|
170
84
|
- lib/fuzzy_match/wrapper.rb
|
171
85
|
- test/helper.rb
|
172
86
|
- test/test_amatch.rb
|
173
|
-
- test/test_blocking.rb
|
174
87
|
- test/test_cache.rb
|
175
88
|
- test/test_fuzzy_match.rb
|
176
89
|
- test/test_fuzzy_match_convoluted.rb.disabled
|
90
|
+
- test/test_grouping.rb
|
177
91
|
- test/test_identity.rb
|
178
92
|
- test/test_normalizer.rb
|
179
93
|
- test/test_wrapper.rb
|
@@ -205,10 +119,10 @@ summary: Find a needle in a haystack using string similarity and (optionally) re
|
|
205
119
|
test_files:
|
206
120
|
- test/helper.rb
|
207
121
|
- test/test_amatch.rb
|
208
|
-
- test/test_blocking.rb
|
209
122
|
- test/test_cache.rb
|
210
123
|
- test/test_fuzzy_match.rb
|
211
124
|
- test/test_fuzzy_match_convoluted.rb.disabled
|
125
|
+
- test/test_grouping.rb
|
212
126
|
- test/test_identity.rb
|
213
127
|
- test/test_normalizer.rb
|
214
128
|
- test/test_wrapper.rb
|
data/lib/fuzzy_match/blocking.rb
DELETED
@@ -1,36 +0,0 @@
|
|
1
|
-
class FuzzyMatch
|
2
|
-
# "Record linkage typically involves two main steps: blocking and scoring..."
|
3
|
-
# http://en.wikipedia.org/wiki/Record_linkage
|
4
|
-
#
|
5
|
-
# Blockings effectively divide up the haystack into groups that match a pattern
|
6
|
-
#
|
7
|
-
# A blocking (as in a grouping) comes into effect when a str matches.
|
8
|
-
# Then the needle must also match the blocking's regexp.
|
9
|
-
class Blocking
|
10
|
-
attr_reader :regexp
|
11
|
-
|
12
|
-
def initialize(regexp_or_str)
|
13
|
-
@regexp = regexp_or_str.to_regexp
|
14
|
-
end
|
15
|
-
|
16
|
-
def match?(str)
|
17
|
-
!!(regexp.match(str))
|
18
|
-
end
|
19
|
-
|
20
|
-
# If a blocking "joins" two strings, that means they both fit into it.
|
21
|
-
#
|
22
|
-
# Returns false if they certainly don't fit this blocking.
|
23
|
-
# Returns nil if the blocking doesn't apply, i.e. str2 doesn't fit the blocking.
|
24
|
-
def join?(str1, str2)
|
25
|
-
if str2_match_data = regexp.match(str2)
|
26
|
-
if str1_match_data = regexp.match(str1)
|
27
|
-
str2_match_data.captures.join.downcase == str1_match_data.captures.join.downcase
|
28
|
-
else
|
29
|
-
false
|
30
|
-
end
|
31
|
-
else
|
32
|
-
nil
|
33
|
-
end
|
34
|
-
end
|
35
|
-
end
|
36
|
-
end
|
data/lib/fuzzy_match/identity.rb
DELETED
@@ -1,23 +0,0 @@
|
|
1
|
-
class FuzzyMatch
|
2
|
-
# Identities take effect when needle and haystack both match a regexp
|
3
|
-
# Then the captured part of the regexp has to match exactly
|
4
|
-
class Identity
|
5
|
-
attr_reader :regexp
|
6
|
-
|
7
|
-
def initialize(regexp_or_str)
|
8
|
-
@regexp = regexp_or_str.to_regexp
|
9
|
-
end
|
10
|
-
|
11
|
-
# Two strings are "identical" if they both match this identity and the captures are equal.
|
12
|
-
#
|
13
|
-
# Only returns true/false if both strings match the regexp.
|
14
|
-
# Otherwise returns nil.
|
15
|
-
def identical?(str1, str2)
|
16
|
-
if str1_match_data = regexp.match(str1) and match_data = regexp.match(str2)
|
17
|
-
str1_match_data.captures.join.downcase == match_data.captures.join.downcase
|
18
|
-
else
|
19
|
-
nil
|
20
|
-
end
|
21
|
-
end
|
22
|
-
end
|
23
|
-
end
|
@@ -1,28 +0,0 @@
|
|
1
|
-
class FuzzyMatch
|
2
|
-
# A normalizer just strips a string down to its core
|
3
|
-
class Normalizer
|
4
|
-
attr_reader :regexp
|
5
|
-
|
6
|
-
def initialize(regexp_or_str)
|
7
|
-
@regexp = regexp_or_str.to_regexp
|
8
|
-
end
|
9
|
-
|
10
|
-
# A normalizer applies when its regexp matches and captures a new (shorter) string
|
11
|
-
def apply?(str)
|
12
|
-
!!(regexp.match(str))
|
13
|
-
end
|
14
|
-
|
15
|
-
# The result of applying a normalizer is just all the captures put together.
|
16
|
-
def apply(str)
|
17
|
-
if match_data = regexp.match(str)
|
18
|
-
match_data.captures.join
|
19
|
-
else
|
20
|
-
str
|
21
|
-
end
|
22
|
-
end
|
23
|
-
|
24
|
-
def inspect
|
25
|
-
"#<FuzzyMatch::Normalizer regexp=#{regexp.inspect}>"
|
26
|
-
end
|
27
|
-
end
|
28
|
-
end
|