matching 0.14.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +5 -0
- data/.rspec +2 -0
- data/Gemfile +22 -0
- data/Gemfile.lock +60 -0
- data/README.md +319 -0
- data/Rakefile +47 -0
- data/VERSION +1 -0
- data/lib/matching.rb +11 -0
- data/lib/matching/active_relation_store.rb +30 -0
- data/lib/matching/array_store.rb +23 -0
- data/lib/matching/attribute_pair.rb +17 -0
- data/lib/matching/deduplicator.rb +133 -0
- data/lib/matching/hash_index.rb +25 -0
- data/lib/matching/match.rb +14 -0
- data/lib/matching/matcher.rb +266 -0
- data/lib/matching/redis_index.rb +26 -0
- data/lib/matching/similarity.rb +78 -0
- data/matching.gemspec +71 -0
- data/spec/db/database.yml +5 -0
- data/spec/integration/bank_rec_spec.rb +50 -0
- data/spec/lib/ar_spec.rb +182 -0
- data/spec/lib/deduplicator_spec.rb +221 -0
- data/spec/lib/matcher_spec.rb +297 -0
- data/spec/lib/redis_spec.rb +105 -0
- data/spec/lib/similarity_spec.rb +88 -0
- data/spec/samples/agent_recs.csv +2024 -0
- data/spec/spec_helper.rb +70 -0
- metadata +109 -0
|
@@ -0,0 +1,297 @@
|
|
|
1
|
+
# Tests main functionality using array data stores and hash indexing.
|
|
2
|
+
# See ar_spec.rb for tests of ActiveRecord as the data store
|
|
3
|
+
# See redis_spec.rb for tests of Redis for indexing.
|
|
4
|
+
|
|
5
|
+
require 'date'
|
|
6
|
+
require File.expand_path("../../spec_helper", __FILE__)
|
|
7
|
+
include Matching
|
|
8
|
+
|
|
9
|
+
describe AttributePair do
|
|
10
|
+
it "describes the relationship of two attributes from two classes for the matcher" do
|
|
11
|
+
rab = AttributePair.new(:mid, :mid, 0.5)
|
|
12
|
+
rab.left_attr.should == :mid
|
|
13
|
+
rab.right_attr.should == :mid
|
|
14
|
+
rab.weight.should == 0.5
|
|
15
|
+
rab.is_fuzzy.should == false
|
|
16
|
+
|
|
17
|
+
expect { AttributePair.new(:mid, :mid, 0.0) }.to raise_error
|
|
18
|
+
end
|
|
19
|
+
end
|
|
20
|
+
|
|
21
|
+
describe HashIndex do
|
|
22
|
+
it "indexes object ids for a given attribute and value" do
|
|
23
|
+
subject.put(:mid, "7275551111", 1)
|
|
24
|
+
subject.get(:mid, "7275551111").should == [1]
|
|
25
|
+
subject.put(:mid, "7275551111", 2)
|
|
26
|
+
subject.get(:mid, "7275551111").should == [1,2]
|
|
27
|
+
subject.put(:mid, "8135554444", 3)
|
|
28
|
+
subject.get(:mid, "8135554444").should == [3]
|
|
29
|
+
subject.get(:mid, "2015558888").should be_nil
|
|
30
|
+
subject.get(:esn, "1111111111").should be_nil
|
|
31
|
+
end
|
|
32
|
+
|
|
33
|
+
it "should not index nil values" do
|
|
34
|
+
subject.put(:mid, nil, 1)
|
|
35
|
+
subject.get(:mid, nil).should be_nil
|
|
36
|
+
end
|
|
37
|
+
end
|
|
38
|
+
|
|
39
|
+
describe ArrayStore do
|
|
40
|
+
include MatcherSpecHelper
|
|
41
|
+
|
|
42
|
+
let(:left_as) { create_test_data; ArrayStore.new(@lefts) }
|
|
43
|
+
let(:right_as) { create_test_data; ArrayStore.new(@rights) }
|
|
44
|
+
|
|
45
|
+
it "should store data in left and right arrays" do
|
|
46
|
+
left_as.arr.should have(3).items
|
|
47
|
+
right_as.arr.should have(4).items
|
|
48
|
+
end
|
|
49
|
+
|
|
50
|
+
it "should enumerate left array objects with index" do
|
|
51
|
+
cnt = 0
|
|
52
|
+
expect { left_as.each { |o,idx| cnt += 1 } }.to change{cnt}.from(0).to(3)
|
|
53
|
+
|
|
54
|
+
obj, id = nil, nil
|
|
55
|
+
left_as.each do |o,idx|
|
|
56
|
+
obj, id = o, idx
|
|
57
|
+
break
|
|
58
|
+
end
|
|
59
|
+
|
|
60
|
+
id.should == 0
|
|
61
|
+
obj.should == @left_a
|
|
62
|
+
end
|
|
63
|
+
|
|
64
|
+
it "should retrieve objects by their array index through the find method" do
|
|
65
|
+
left_as.find(0).should == @left_a
|
|
66
|
+
left_as.find(1).should == @left_b
|
|
67
|
+
right_as.find(-1).should == @right_d
|
|
68
|
+
end
|
|
69
|
+
end
|
|
70
|
+
|
|
71
|
+
describe Matcher do
|
|
72
|
+
include MatcherSpecHelper
|
|
73
|
+
|
|
74
|
+
let(:compare_string_size) { lambda { |l,r| (l.size == r.size ? 1.0 : 0.0) } }
|
|
75
|
+
|
|
76
|
+
let(:esn_has_ones_filter) { lambda { |l,r| l.esn =~ /1/ && r.esn =~ /1/ } }
|
|
77
|
+
|
|
78
|
+
it "calculates non-fuzzy similarity of pairs of strings" do
|
|
79
|
+
subject.compare_values("hello","hello").should == 1.0
|
|
80
|
+
subject.compare_values("hello","world").should == 0.0
|
|
81
|
+
subject.compare_values(nil,nil).should == 0.0
|
|
82
|
+
subject.compare_values("hello",nil).should == 0.0
|
|
83
|
+
end
|
|
84
|
+
|
|
85
|
+
it "calculates fuzzy similarity of pairs of strings" do
|
|
86
|
+
subject.compare_values("hello","hullo",:fuzzy => true).should be_within(0.1).of(0.75)
|
|
87
|
+
subject.compare_values("hello","world",:fuzzy => true).should be_within(0.1).of(0.2)
|
|
88
|
+
subject.compare_values("hello","zippy",:fuzzy => true).should be_within(0.1).of(0.0)
|
|
89
|
+
subject.compare_values("John Q Public","Public,John", :fuzzy => true, :comparison => :name).should == 1.0
|
|
90
|
+
end
|
|
91
|
+
|
|
92
|
+
it "calculates non-fuzzy similarity of pairs of dates" do
|
|
93
|
+
subject.compare_values(Date.new(2011,1,1),Date.new(2011,1,1)).should == 1.0
|
|
94
|
+
subject.compare_values(Date.new(2011,1,1),Date.new(2011,1,2)).should == 0.0
|
|
95
|
+
end
|
|
96
|
+
|
|
97
|
+
it "calculates fuzzy similarity of pairs of dates" do
|
|
98
|
+
subject.compare_values(Date.new(2011,1,1),Date.new(2011,1,2), :fuzzy => true).should be_within(0.1).of(0.9)
|
|
99
|
+
end
|
|
100
|
+
|
|
101
|
+
it "should raise exception if fuzzy comparison is requested on unsupported class" do
|
|
102
|
+
expect { subject.compare_values(["cat"], ["bat"], :fuzzy => true) }.to raise_error(ArgumentError, "Cannot calculate fuzzy comparison for type Array")
|
|
103
|
+
end
|
|
104
|
+
|
|
105
|
+
it "should raise an exception if pairs for comparison are of different base types" do
|
|
106
|
+
expect { subject.compare_values("hello",2) }.to raise_error
|
|
107
|
+
expect { subject.compare_values(Date.new(2011,1,1),2) }.to raise_error
|
|
108
|
+
expect { subject.compare_values(Date.new(2011,1,1),"world") }.to raise_error
|
|
109
|
+
end
|
|
110
|
+
|
|
111
|
+
it "creates AttributePairs using the join method" do
|
|
112
|
+
expect { subject.join(:mid, :mid, 1.0) }.to change{ subject.join_pairs.size }.from(0).to(1)
|
|
113
|
+
end
|
|
114
|
+
|
|
115
|
+
it "creates AttributePairs using the compare method" do
|
|
116
|
+
expect { subject.compare(:mid, :mid, 1.0) }.to change{ subject.compare_pairs.size }.from(0).to(1)
|
|
117
|
+
end
|
|
118
|
+
|
|
119
|
+
it "allows custom lambdas to be used for comparison rules" do
|
|
120
|
+
expect { subject.custom(compare_string_size) }.to change{ subject.custom_functions.size }.from(0).to(1)
|
|
121
|
+
end
|
|
122
|
+
|
|
123
|
+
it "allows filters to be defined as custom lambdas that return boolean" do
|
|
124
|
+
expect { subject.filter(esn_has_ones_filter) }.to change{ subject.filter_functions.size }.from(0).to(1)
|
|
125
|
+
end
|
|
126
|
+
|
|
127
|
+
it "defines the rules for matching through a define block" do
|
|
128
|
+
subject.join_pairs.size.should == 0
|
|
129
|
+
subject.compare_pairs.size.should == 0
|
|
130
|
+
define_mid_esn_date_matcher(subject)
|
|
131
|
+
subject.join_pairs.size.should == 2
|
|
132
|
+
subject.compare_pairs.size.should == 1
|
|
133
|
+
end
|
|
134
|
+
|
|
135
|
+
it "scores two objects based on defined matching rules" do
|
|
136
|
+
create_array_matcher
|
|
137
|
+
define_mid_matcher(@matcher)
|
|
138
|
+
@matcher.score_pair(@left_a,@right_b).should == 1.0
|
|
139
|
+
@matcher.score_pair(@left_a,@right_a).should == 0.0
|
|
140
|
+
|
|
141
|
+
@matcher.define { join :esn, :esn, 1.0 }
|
|
142
|
+
@matcher.score_pair(@left_a,@right_b).should == 2.0
|
|
143
|
+
@matcher.score_pair(@left_a,@right_a).should == 1.0
|
|
144
|
+
|
|
145
|
+
@matcher.define { compare :date, :date, 0.5, :fuzzy => true }
|
|
146
|
+
@matcher.score_pair(@left_a,@right_b).should == 2.5
|
|
147
|
+
@matcher.score_pair(@left_a,@right_a).should == 1.5
|
|
148
|
+
@matcher.score_pair(@left_c,@right_d).should be_within(0.1).of(1.5)
|
|
149
|
+
|
|
150
|
+
lmbda = lambda { |l,r| (l.mid[0] == '7' ? 1.0 : 0.0) }
|
|
151
|
+
@matcher.custom(lmbda)
|
|
152
|
+
@matcher.score_pair(@left_a,@right_b).should == 3.5
|
|
153
|
+
end
|
|
154
|
+
|
|
155
|
+
it "requires left and right store objects be defined before matching" do
|
|
156
|
+
m = Matching::Matcher.new(:left_store => nil, :right_store => nil)
|
|
157
|
+
expect { m.match }.to raise_error(ArgumentError)
|
|
158
|
+
end
|
|
159
|
+
|
|
160
|
+
context "with hash index and array store" do
|
|
161
|
+
|
|
162
|
+
before(:each) do
|
|
163
|
+
create_array_matcher
|
|
164
|
+
end
|
|
165
|
+
|
|
166
|
+
it "requires at least one join pair to be defined" do
|
|
167
|
+
expect { @matcher.index_right_objects }.to raise_error
|
|
168
|
+
end
|
|
169
|
+
|
|
170
|
+
context "using mid and esn matcher" do
|
|
171
|
+
|
|
172
|
+
before(:each) do
|
|
173
|
+
define_mid_esn_matcher(@matcher)
|
|
174
|
+
@matcher.index_right_objects
|
|
175
|
+
end
|
|
176
|
+
|
|
177
|
+
it "indexes right records on join attributes" do
|
|
178
|
+
@matcher.right_index.get(:esn, "11111111111").should_not be_nil
|
|
179
|
+
@matcher.right_index.get(:esn, "11111111111").size.should == 2
|
|
180
|
+
@matcher.right_index.get(:mid, "8135554444").size.should == 1
|
|
181
|
+
end
|
|
182
|
+
|
|
183
|
+
it "finds potential matches for left_objects from right_objects based on join criteria" do
|
|
184
|
+
right_matches = @matcher.find_potential_matches(@left_a)
|
|
185
|
+
right_matches.should have(3).items
|
|
186
|
+
right_matches.should include(@right_a)
|
|
187
|
+
end
|
|
188
|
+
|
|
189
|
+
it "finds scored matches by applying rules after finding potential matches" do
|
|
190
|
+
right_matches = @matcher.find_matches(@left_a)
|
|
191
|
+
right_matches.should have(3).items
|
|
192
|
+
|
|
193
|
+
#raise matching threshold
|
|
194
|
+
@matcher.min_score = 2.0
|
|
195
|
+
right_matches = @matcher.find_matches(@left_a)
|
|
196
|
+
right_matches.should have(1).items
|
|
197
|
+
|
|
198
|
+
#note: return value is an array of arrays, not an array of just
|
|
199
|
+
#right_objects
|
|
200
|
+
right_matches[0].should == [@right_b, 2.0]
|
|
201
|
+
end
|
|
202
|
+
end
|
|
203
|
+
|
|
204
|
+
it "should reconcile test data based on single attribute pair" do
|
|
205
|
+
define_esn_matcher(@matcher)
|
|
206
|
+
@matcher.match
|
|
207
|
+
@matcher.right_matches.size.should == 2
|
|
208
|
+
@matcher.left_matches.size.should == 2
|
|
209
|
+
|
|
210
|
+
@matcher.left_matches.should include(@left_a)
|
|
211
|
+
@matcher.left_matches.should include(@left_b)
|
|
212
|
+
end
|
|
213
|
+
|
|
214
|
+
it "should reconcile test data based on two attribute pairs" do
|
|
215
|
+
define_mid_esn_matcher(@matcher)
|
|
216
|
+
@matcher.match
|
|
217
|
+
@matcher.right_matches.size.should == 3
|
|
218
|
+
@matcher.left_matches.size.should == 3
|
|
219
|
+
|
|
220
|
+
create_array_matcher
|
|
221
|
+
define_mid_esn_matcher(@matcher)
|
|
222
|
+
@matcher.min_score = 2.0
|
|
223
|
+
@matcher.match
|
|
224
|
+
@matcher.right_matches.size.should == 2
|
|
225
|
+
@matcher.left_matches.size.should == 2
|
|
226
|
+
end
|
|
227
|
+
|
|
228
|
+
it "should list non-matched objects as exceptions" do
|
|
229
|
+
define_mid_esn_matcher(@matcher)
|
|
230
|
+
@matcher.min_score = 2.0
|
|
231
|
+
@matcher.match
|
|
232
|
+
@matcher.left_exceptions.should have(1).item
|
|
233
|
+
@matcher.right_exceptions.should have(2).items
|
|
234
|
+
end
|
|
235
|
+
|
|
236
|
+
it "should allow veto of matches using filtering rules" do
|
|
237
|
+
create_array_matcher
|
|
238
|
+
define_esn_matcher(@matcher)
|
|
239
|
+
@matcher.filter(esn_has_ones_filter)
|
|
240
|
+
@matcher.match
|
|
241
|
+
@matcher.left_matches.size.should == 1
|
|
242
|
+
@matcher.left_matches[@left_a].left_obj.esn.should =~ /1/
|
|
243
|
+
end
|
|
244
|
+
|
|
245
|
+
end #hash index and array store tests
|
|
246
|
+
|
|
247
|
+
context "conflict resolution" do
|
|
248
|
+
|
|
249
|
+
let(:amatcher) do
|
|
250
|
+
|
|
251
|
+
#initially, A will match the first record it comes to (an outer record),
|
|
252
|
+
#then A will be made loser and should eventually match Y
|
|
253
|
+
@left_a = Transaction.new(:id => 1, :esn => "11111111111", :mid => "cdcd")
|
|
254
|
+
@left_b = Transaction.new(:id => 2, :esn => "11111111111", :mid => "abab")
|
|
255
|
+
@left_c = Transaction.new(:id => 3, :esn => "11111111111", :mid => "yzyz")
|
|
256
|
+
@lefts = [@left_a, @left_b, @left_c]
|
|
257
|
+
|
|
258
|
+
@right_x = Transaction.new(:id => 1, :esn => "11111111111", :mid => "abab")
|
|
259
|
+
@right_y = Transaction.new(:id => 2, :esn => "11111111111", :mid => "mnmn")
|
|
260
|
+
@right_z = Transaction.new(:id => 3, :esn => "11111111111", :mid => "yzyz")
|
|
261
|
+
@rights = [@right_x, @right_y, @right_z]
|
|
262
|
+
|
|
263
|
+
as_l = ArrayStore.new(@lefts)
|
|
264
|
+
as_r = ArrayStore.new(@rights)
|
|
265
|
+
matcher = Matching::Matcher.new(:left_store => as_l, :right_store => as_r)
|
|
266
|
+
define_mid_esn_matcher(matcher)
|
|
267
|
+
matcher
|
|
268
|
+
end
|
|
269
|
+
|
|
270
|
+
it "should find best fit for all objects" do
|
|
271
|
+
amatcher.match
|
|
272
|
+
amatcher.left_matches.should have(3).items
|
|
273
|
+
amatcher.left_matches[@left_a].right_obj.should == @right_y
|
|
274
|
+
amatcher.left_matches[@left_a].score.should == 1.0
|
|
275
|
+
amatcher.left_matches[@left_b].right_obj.should == @right_x
|
|
276
|
+
amatcher.left_matches[@left_b].score.should == 2.0
|
|
277
|
+
amatcher.left_matches[@left_c].right_obj.should == @right_z
|
|
278
|
+
amatcher.left_matches[@left_c].score.should == 2.0
|
|
279
|
+
end
|
|
280
|
+
|
|
281
|
+
it "should not find best fit unless evaluate_left_losers executes normally" do
|
|
282
|
+
class << amatcher
|
|
283
|
+
define_method(:evaluate_left_losers, proc { })
|
|
284
|
+
end
|
|
285
|
+
|
|
286
|
+
amatcher.match
|
|
287
|
+
amatcher.left_matches.should have(2).items
|
|
288
|
+
amatcher.left_matches[@left_b].right_obj.should == @right_x
|
|
289
|
+
amatcher.left_matches[@left_b].score.should == 2.0
|
|
290
|
+
amatcher.left_matches[@left_c].right_obj.should == @right_z
|
|
291
|
+
amatcher.left_matches[@left_c].score.should == 2.0
|
|
292
|
+
end
|
|
293
|
+
|
|
294
|
+
end #conflict resolution
|
|
295
|
+
|
|
296
|
+
#See ar_spec.rb and redis_spec.rb for tests involving ActiveRecord and Redis
|
|
297
|
+
end
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
# Tests Redis as the indexer
|
|
2
|
+
|
|
3
|
+
require File.expand_path("../../spec_helper", __FILE__)
|
|
4
|
+
require File.expand_path("../../../lib/matching/redis_index", __FILE__)
|
|
5
|
+
|
|
6
|
+
include Matching
|
|
7
|
+
|
|
8
|
+
# Reports whether a Redis server can be reached, so the Redis-backed specs
# guarded by `if test_redis_connection` are skipped when none is running.
#
# The probe is an explicit PING round-trip. The previous probe called
# `inspect` on the client, which is not a documented way to contact the
# server, and its `puts ... and return false` fallback could never return
# false because `puts` returns nil and short-circuits the `and`.
#
# @return [Boolean] true when the server answered the ping, false when the
#   attempt raised (a hint is printed to stdout in that case)
def test_redis_connection
  Redis.new.ping
  true
rescue StandardError
  # Same scope as the old inline `rescue` modifier: StandardError only.
  puts "Start Redis to run redis_spec"
  false
end
|
|
12
|
+
|
|
13
|
+
if test_redis_connection
|
|
14
|
+
describe RedisIndex do
|
|
15
|
+
it "index object ids for a given attribute and value" do
|
|
16
|
+
subject.put(:mid, "7275551111", 1)
|
|
17
|
+
subject.get(:mid, "7275551111").should == [1]
|
|
18
|
+
subject.put(:mid, "7275551111", 2)
|
|
19
|
+
subject.get(:mid, "7275551111").should == [1,2]
|
|
20
|
+
subject.put(:mid, "8135554444", 3)
|
|
21
|
+
subject.get(:mid, "8135554444").should == [3]
|
|
22
|
+
subject.get(:mid, "2015558888").should be_nil
|
|
23
|
+
subject.get(:esn, "1111111111").should be_nil
|
|
24
|
+
end
|
|
25
|
+
|
|
26
|
+
it "should not index nil values" do
|
|
27
|
+
subject.put(:mid, nil, 1)
|
|
28
|
+
subject.get(:mid, nil).should be_nil
|
|
29
|
+
end
|
|
30
|
+
end
|
|
31
|
+
|
|
32
|
+
describe Matcher do
|
|
33
|
+
include MatcherSpecHelper
|
|
34
|
+
|
|
35
|
+
context "with redis index and array store" do
|
|
36
|
+
|
|
37
|
+
before(:each) do
|
|
38
|
+
create_array_matcher(:use_redis => true)
|
|
39
|
+
end
|
|
40
|
+
|
|
41
|
+
it "requires at least one join pair to be defined" do
|
|
42
|
+
expect { @matcher.index_right_objects }.to raise_error
|
|
43
|
+
end
|
|
44
|
+
|
|
45
|
+
context "using mid and esn matcher" do
|
|
46
|
+
|
|
47
|
+
before(:each) do
|
|
48
|
+
define_mid_esn_matcher(@matcher)
|
|
49
|
+
@matcher.index_right_objects
|
|
50
|
+
end
|
|
51
|
+
|
|
52
|
+
it "indexes right records on join attributes" do
|
|
53
|
+
@matcher.right_index.get(:esn, "11111111111").should_not be_nil
|
|
54
|
+
@matcher.right_index.get(:esn, "11111111111").size.should == 2
|
|
55
|
+
@matcher.right_index.get(:mid, "8135554444").size.should == 1
|
|
56
|
+
end
|
|
57
|
+
|
|
58
|
+
it "finds potential matches for left_objects from right_objects based on join criteria" do
|
|
59
|
+
right_matches = @matcher.find_potential_matches(@left_a)
|
|
60
|
+
right_matches.should have(3).items
|
|
61
|
+
right_matches.should include(@right_a)
|
|
62
|
+
end
|
|
63
|
+
|
|
64
|
+
it "finds scored matches by applying rules after finding potential matches" do
|
|
65
|
+
right_matches = @matcher.find_matches(@left_a)
|
|
66
|
+
right_matches.should have(3).items
|
|
67
|
+
|
|
68
|
+
#raise matching threshold
|
|
69
|
+
@matcher.min_score = 2.0
|
|
70
|
+
right_matches = @matcher.find_matches(@left_a)
|
|
71
|
+
right_matches.should have(1).items
|
|
72
|
+
|
|
73
|
+
#note: return value is an array of arrays, not an array of just
|
|
74
|
+
#right_objects
|
|
75
|
+
right_matches[0].should == [@right_b, 2.0]
|
|
76
|
+
end
|
|
77
|
+
end
|
|
78
|
+
|
|
79
|
+
it "should reconcile test data based on single attribute pair" do
|
|
80
|
+
define_esn_matcher(@matcher)
|
|
81
|
+
@matcher.match
|
|
82
|
+
@matcher.right_matches.size.should == 2
|
|
83
|
+
@matcher.left_matches.size.should == 2
|
|
84
|
+
|
|
85
|
+
@matcher.left_matches.should include(@left_a)
|
|
86
|
+
@matcher.left_matches.should include(@left_b)
|
|
87
|
+
end
|
|
88
|
+
|
|
89
|
+
it "should reconcile test data based on two attribute pairs" do
|
|
90
|
+
define_mid_esn_matcher(@matcher)
|
|
91
|
+
@matcher.match
|
|
92
|
+
@matcher.right_matches.size.should == 3
|
|
93
|
+
@matcher.left_matches.size.should == 3
|
|
94
|
+
|
|
95
|
+
create_array_matcher(:use_redis => true)
|
|
96
|
+
@matcher.min_score = 2.0
|
|
97
|
+
define_mid_esn_matcher(@matcher)
|
|
98
|
+
@matcher.match
|
|
99
|
+
@matcher.right_matches.size.should == 2
|
|
100
|
+
@matcher.left_matches.size.should == 2
|
|
101
|
+
end
|
|
102
|
+
|
|
103
|
+
end #redis index and array store tests
|
|
104
|
+
end
|
|
105
|
+
end #test redis cnn
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
require 'rspec'
|
|
2
|
+
require 'date'
|
|
3
|
+
require File.expand_path(File.dirname(__FILE__) + '/../../lib/matching.rb')
|
|
4
|
+
include Matching
|
|
5
|
+
|
|
6
|
+
describe Date do
|
|
7
|
+
|
|
8
|
+
let(:a_date) { Date.new(2007,1,1) }
|
|
9
|
+
|
|
10
|
+
it "requires days_scale parameter be numeric" do
|
|
11
|
+
expect { a_date.similarity_to(Date.new(2007,1,15), :days_scale => 30) }.to_not raise_error
|
|
12
|
+
expect { a_date.similarity_to(Date.new(2007,1,15), :days_scale => "thirty") }.to raise_error
|
|
13
|
+
end
|
|
14
|
+
|
|
15
|
+
it "scores date differences less than the days_scale as > 0.0 and < 1.0" do
|
|
16
|
+
a_date.similarity_to(a_date).should == 1.0
|
|
17
|
+
a_date.similarity_to(Date.new(2007,1,15), :days_scale => 30).should be_within(0.05).of(0.5)
|
|
18
|
+
a_date.similarity_to(Date.new(2007,1,2), :days_scale => 30).should be_within(0.05).of(1.0)
|
|
19
|
+
a_date.similarity_to(Date.new(2007,1,30), :days_scale => 30).should be_within(0.05).of(0.0)
|
|
20
|
+
a_date.similarity_to(Date.new(2007,2,1), :days_scale => 30).should == 0.0
|
|
21
|
+
a_date.similarity_to(Date.new(2006,12,16), :days_scale => 30).should be_within(0.05).of(0.5)
|
|
22
|
+
a_date.similarity_to(Date.new(2006,11,30), :days_scale => 30).should == 0.0
|
|
23
|
+
a_date.similarity_to(Date.new(2006,11,30), :days_scale => 60).should be > 0.0
|
|
24
|
+
end
|
|
25
|
+
|
|
26
|
+
it "treats datetime as date" do
|
|
27
|
+
dt1 = DateTime.new(2007,1,1)
|
|
28
|
+
dt1.similarity_to(dt1).should == 1.0
|
|
29
|
+
dt1.similarity_to(DateTime.new(2007,1,15)).should be > 0.0
|
|
30
|
+
end
|
|
31
|
+
|
|
32
|
+
end
|
|
33
|
+
|
|
34
|
+
describe String do
|
|
35
|
+
|
|
36
|
+
let(:a_string) { "Horse" }
|
|
37
|
+
|
|
38
|
+
it "uses text gem to calculate Levenshtein distance between two strings" do
|
|
39
|
+
Text::Levenshtein::distance(a_string,a_string).should == 0
|
|
40
|
+
Text::Levenshtein::distance(a_string,"Hose").should == 1
|
|
41
|
+
Text::Levenshtein::distance(a_string,"Hosse").should == 1
|
|
42
|
+
Text::Levenshtein::distance(a_string,"Horsey").should == 1
|
|
43
|
+
Text::Levenshtein::distance(a_string,"Hotel").should == 3
|
|
44
|
+
Text::Levenshtein::distance(a_string,"horse").should == 1
|
|
45
|
+
Text::Levenshtein::distance(a_string,"Apple").should == 4
|
|
46
|
+
end
|
|
47
|
+
|
|
48
|
+
# Pins the range of String#raw_similarity_to: an identical string scores
# exactly 1.0, a one-edit neighbour scores close to 0.75, and an unrelated
# word scores close to 0.0. `a_string` is the "Horse" fixture supplied by the
# `let` in the enclosing `describe String` group.
# The example description previously misspelled "similarity"; that string is
# what RSpec prints in its output.
it "scores raw similarity between 0.0 and 1.0 using Levenshtein edit distance" do
  a_string.raw_similarity_to("Horse").should == 1.0
  a_string.raw_similarity_to("Hose").should be_within(0.1).of(0.75)
  a_string.raw_similarity_to("Trombone").should be_within(0.1).of(0.0)
end
|
|
53
|
+
|
|
54
|
+
it "performs case-insensitive raw similarity comparisons" do
|
|
55
|
+
a_string.raw_similarity_to("hose").should be_within(0.1).of(0.75)
|
|
56
|
+
end
|
|
57
|
+
|
|
58
|
+
it "uses raw similarity to calculate overall similarity with no comparison argument" do
|
|
59
|
+
a_string.similarity_to("Horsey").should be_within(0.1).of(0.75)
|
|
60
|
+
end
|
|
61
|
+
|
|
62
|
+
it "should tokenize strings according to rules" do
|
|
63
|
+
"horse".tokenize.should == %w(horse)
|
|
64
|
+
"horse ".tokenize.should == %w(horse)
|
|
65
|
+
" horse".tokenize.should == %w(horse)
|
|
66
|
+
" horse ".tokenize.should == %w(horse)
|
|
67
|
+
"horse hoof".tokenize.should == %w(horse hoof)
|
|
68
|
+
"horse m. hoof".tokenize.should == %w(horse hoof)
|
|
69
|
+
"horse mildred hoof".tokenize.should == %w(horse mildred hoof)
|
|
70
|
+
"horse .mildred - hoof".tokenize.should == %w(horse mildred hoof)
|
|
71
|
+
end
|
|
72
|
+
|
|
73
|
+
it "compares the similarity of names" do
|
|
74
|
+
name1 = "Ruth Ginsburg"
|
|
75
|
+
name1.similarity_to("Ruth Ginsburg", :comparison => :name).should == 1.0
|
|
76
|
+
name1.similarity_to("Ginsburg Ruth", :comparison => :name).should == 1.0
|
|
77
|
+
name1.similarity_to("Ginsburg, Ruth", :comparison => :name).should == 1.0
|
|
78
|
+
name1.similarity_to("Ginsburg,Ruth", :comparison => :name).should == 1.0
|
|
79
|
+
name1.similarity_to("Baby Ruth", :comparison => :name).should == 0.5
|
|
80
|
+
name1.similarity_to("Ruth Ginsberg", :comparison => :name).should be_within(0.05).of(0.9)
|
|
81
|
+
name1.similarity_to("Roth Ginsburg", :comparison => :name).should be_within(0.05).of(0.9)
|
|
82
|
+
name1.similarity_to("Roth Ginsberg", :comparison => :name).should be_within(0.1).of(0.8)
|
|
83
|
+
name1.similarity_to("Ruth Bader Ginsburg", :comparison => :name).should be > 0.5
|
|
84
|
+
name1.similarity_to("Ruth Joan Bader Ginsburg", :comparison => :name).should be > 0.5
|
|
85
|
+
name1.similarity_to("Antonin Scalia", :comparison => :name).should be_within(0.09).of(0.1)
|
|
86
|
+
end
|
|
87
|
+
|
|
88
|
+
end
|