matching 0.14.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,297 @@
1
+ # Tests main functionality using array data stores and hash indexing.
2
+ # See ar_spec.rb for tests of ActiveRecord as the data store
3
+ # See redis_spec.rb for tests of Redis for indexing.
4
+
5
+ require 'date'
6
+ require File.expand_path("../../spec_helper", __FILE__)
7
+ include Matching
8
+
9
+ describe AttributePair do # Spec: AttributePair exposes its left/right attribute names, weight and fuzzy flag, and rejects a zero weight.
10
+ it "describes the relationship of two attributes from two classes for the matcher" do
11
+ rab = AttributePair.new(:mid, :mid, 0.5) # arguments as used here: left attribute, right attribute, weight
12
+ rab.left_attr.should == :mid
13
+ rab.right_attr.should == :mid
14
+ rab.weight.should == 0.5
15
+ rab.is_fuzzy.should == false # no fuzzy option was passed above, so the pair must report non-fuzzy
16
+
17
+ expect { AttributePair.new(:mid, :mid, 0.0) }.to raise_error # a weight of 0.0 must be rejected; bare raise_error accepts any exception class
18
+ end
19
+ end
20
+
21
+ describe HashIndex do # Spec: HashIndex stores object ids under an (attribute, value) key and returns them as an array.
22
+ it "indexes object ids for a given attribute and value" do
23
+ subject.put(:mid, "7275551111", 1) # arguments as used here: attribute, value, object id
24
+ subject.get(:mid, "7275551111").should == [1]
25
+ subject.put(:mid, "7275551111", 2) # a second id under the same key is appended, not overwritten
26
+ subject.get(:mid, "7275551111").should == [1,2]
27
+ subject.put(:mid, "8135554444", 3)
28
+ subject.get(:mid, "8135554444").should == [3]
29
+ subject.get(:mid, "2015558888").should be_nil # unknown value under a known attribute yields nil rather than []
30
+ subject.get(:esn, "1111111111").should be_nil # attribute that was never indexed also yields nil
31
+ end
32
+
33
+ it "should not index nil values" do
34
+ subject.put(:mid, nil, 1) # putting a nil value must be ignored
35
+ subject.get(:mid, nil).should be_nil
36
+ end
37
+ end
38
+
39
+ describe ArrayStore do # Spec: ArrayStore wraps an array and offers arr, each (object with index) and find (by index).
40
+ include MatcherSpecHelper # presumably supplies create_test_data and the @left_*/@right_* fixtures; defined outside this file
41
+
42
+ let(:left_as) { create_test_data; ArrayStore.new(@lefts) } # lazy: the fixtures (@lefts, @left_a, ...) only exist once this is first referenced
43
+ let(:right_as) { create_test_data; ArrayStore.new(@rights) } # same for the right-hand fixtures
44
+
45
+ it "should store data in left and right arrays" do
46
+ left_as.arr.should have(3).items
47
+ right_as.arr.should have(4).items
48
+ end
49
+
50
+ it "should enumerate left array objects with index" do
51
+ cnt = 0
52
+ expect { left_as.each { |o,idx| cnt += 1 } }.to change{cnt}.from(0).to(3) # each yields once per stored object
53
+
54
+ obj, id = nil, nil
55
+ left_as.each do |o,idx|
56
+ obj, id = o, idx # capture only the first yielded pair
57
+ break
58
+ end
59
+
60
+ id.should == 0 # the yielded index starts at zero
61
+ obj.should == @left_a # @left_a was set as a side effect of evaluating left_as above
62
+ end
63
+
64
+ it "should retrieve objects by their array index through the find method" do
65
+ left_as.find(0).should == @left_a
66
+ left_as.find(1).should == @left_b
67
+ right_as.find(-1).should == @right_d # a negative index is expected to resolve to the last element
68
+ end
69
+ end
70
+
71
+ describe Matcher do # Spec: Matcher value comparison, rule definition, pair scoring, matching and conflict resolution with array stores and a hash index.
72
+ include MatcherSpecHelper # presumably supplies create_array_matcher, the define_*_matcher helpers and the fixtures; defined outside this file
73
+
74
+ let(:compare_string_size) { lambda { |l,r| (l.size == r.size ? 1.0 : 0.0) } } # custom rule: 1.0 when both sizes are equal, else 0.0
75
+
76
+ let(:esn_has_ones_filter) { lambda { |l,r| l.esn =~ /1/ && r.esn =~ /1/ } } # filter: truthy only when both esn values contain a "1"
77
+
78
+ it "calculates non-fuzzy similarity of pairs of strings" do
79
+ subject.compare_values("hello","hello").should == 1.0
80
+ subject.compare_values("hello","world").should == 0.0
81
+ subject.compare_values(nil,nil).should == 0.0 # two nils do not count as a match
82
+ subject.compare_values("hello",nil).should == 0.0
83
+ end
84
+
85
+ it "calculates fuzzy similarity of pairs of strings" do
86
+ subject.compare_values("hello","hullo",:fuzzy => true).should be_within(0.1).of(0.75)
87
+ subject.compare_values("hello","world",:fuzzy => true).should be_within(0.1).of(0.2)
88
+ subject.compare_values("hello","zippy",:fuzzy => true).should be_within(0.1).of(0.0)
89
+ subject.compare_values("John Q Public","Public,John", :fuzzy => true, :comparison => :name).should == 1.0 # name comparison: full score despite token order, comma and middle initial
90
+ end
91
+
92
+ it "calculates non-fuzzy similarity of pairs of dates" do
93
+ subject.compare_values(Date.new(2011,1,1),Date.new(2011,1,1)).should == 1.0
94
+ subject.compare_values(Date.new(2011,1,1),Date.new(2011,1,2)).should == 0.0 # without :fuzzy a one-day difference is a complete mismatch
95
+ end
96
+
97
+ it "calculates fuzzy similarity of pairs of dates" do
98
+ subject.compare_values(Date.new(2011,1,1),Date.new(2011,1,2), :fuzzy => true).should be_within(0.1).of(0.9)
99
+ end
100
+
101
+ it "should raise exception if fuzzy comparison is requested on unsupported class" do
102
+ expect { subject.compare_values(["cat"], ["bat"], :fuzzy => true) }.to raise_error(ArgumentError, "Cannot calculate fuzzy comparison for type Array")
103
+ end
104
+
105
+ it "should raise an exception if pairs for comparison are of different base types" do
106
+ expect { subject.compare_values("hello",2) }.to raise_error # bare raise_error: any exception class satisfies these three expectations
107
+ expect { subject.compare_values(Date.new(2011,1,1),2) }.to raise_error
108
+ expect { subject.compare_values(Date.new(2011,1,1),"world") }.to raise_error
109
+ end
110
+
111
+ it "creates AttributePairs using the join method" do
112
+ expect { subject.join(:mid, :mid, 1.0) }.to change{ subject.join_pairs.size }.from(0).to(1)
113
+ end
114
+
115
+ it "creates AttributePairs using the compare method" do
116
+ expect { subject.compare(:mid, :mid, 1.0) }.to change{ subject.compare_pairs.size }.from(0).to(1)
117
+ end
118
+
119
+ it "allows custom lambdas to be used for comparison rules" do
120
+ expect { subject.custom(compare_string_size) }.to change{ subject.custom_functions.size }.from(0).to(1)
121
+ end
122
+
123
+ it "allows filters to be defined as custom lambdas that return boolean" do
124
+ expect { subject.filter(esn_has_ones_filter) }.to change{ subject.filter_functions.size }.from(0).to(1)
125
+ end
126
+
127
+ it "defines the rules for matching through a define block" do
128
+ subject.join_pairs.size.should == 0
129
+ subject.compare_pairs.size.should == 0
130
+ define_mid_esn_date_matcher(subject) # helper defined outside this file; the expectations below imply two joins and one compare
131
+ subject.join_pairs.size.should == 2
132
+ subject.compare_pairs.size.should == 1
133
+ end
134
+
135
+ it "scores two objects based on defined matching rules" do
136
+ create_array_matcher # sets @matcher (helper defined outside this file)
137
+ define_mid_matcher(@matcher)
138
+ @matcher.score_pair(@left_a,@right_b).should == 1.0
139
+ @matcher.score_pair(@left_a,@right_a).should == 0.0
140
+
141
+ @matcher.define { join :esn, :esn, 1.0 } # rules accumulate: this adds an esn join on top of the rule defined above
142
+ @matcher.score_pair(@left_a,@right_b).should == 2.0
143
+ @matcher.score_pair(@left_a,@right_a).should == 1.0
144
+
145
+ @matcher.define { compare :date, :date, 0.5, :fuzzy => true } # a fuzzy date comparison weighted 0.5 is added next
146
+ @matcher.score_pair(@left_a,@right_b).should == 2.5
147
+ @matcher.score_pair(@left_a,@right_a).should == 1.5
148
+ @matcher.score_pair(@left_c,@right_d).should be_within(0.1).of(1.5)
149
+
150
+ lmbda = lambda { |l,r| (l.mid[0] == '7' ? 1.0 : 0.0) } # custom rule: 1.0 when the left mid starts with '7'
151
+ @matcher.custom(lmbda)
152
+ @matcher.score_pair(@left_a,@right_b).should == 3.5 # previous 2.5 plus 1.0 from the custom rule
153
+ end
154
+
155
+ it "requires left and right store objects be defined before matching" do
156
+ m = Matching::Matcher.new(:left_store => nil, :right_store => nil)
157
+ expect { m.match }.to raise_error(ArgumentError)
158
+ end
159
+
160
+ context "with hash index and array store" do
161
+
162
+ before(:each) do
163
+ create_array_matcher # every example in this context starts from a freshly built @matcher
164
+ end
165
+
166
+ it "requires at least one join pair to be defined" do
167
+ expect { @matcher.index_right_objects }.to raise_error # no rules have been defined at this point
168
+ end
169
+
170
+ context "using mid and esn matcher" do
171
+
172
+ before(:each) do
173
+ define_mid_esn_matcher(@matcher)
174
+ @matcher.index_right_objects # build the right-hand index once the join rules exist
175
+ end
176
+
177
+ it "indexes right records on join attributes" do
178
+ @matcher.right_index.get(:esn, "11111111111").should_not be_nil
179
+ @matcher.right_index.get(:esn, "11111111111").size.should == 2
180
+ @matcher.right_index.get(:mid, "8135554444").size.should == 1
181
+ end
182
+
183
+ it "finds potential matches for left_objects from right_objects based on join criteria" do
184
+ right_matches = @matcher.find_potential_matches(@left_a)
185
+ right_matches.should have(3).items
186
+ right_matches.should include(@right_a)
187
+ end
188
+
189
+ it "finds scored matches by applying rules after finding potential matches" do
190
+ right_matches = @matcher.find_matches(@left_a)
191
+ right_matches.should have(3).items
192
+
193
+ #raise matching threshold
194
+ @matcher.min_score = 2.0
195
+ right_matches = @matcher.find_matches(@left_a)
196
+ right_matches.should have(1).items
197
+
198
+ #note: return value is an array of arrays, not an array of just
199
+ #right_objects
200
+ right_matches[0].should == [@right_b, 2.0] # each entry is a [right_object, score] pair
201
+ end
202
+ end
203
+
204
+ it "should reconcile test data based on single attribute pair" do
205
+ define_esn_matcher(@matcher)
206
+ @matcher.match
207
+ @matcher.right_matches.size.should == 2
208
+ @matcher.left_matches.size.should == 2
209
+
210
+ @matcher.left_matches.should include(@left_a)
211
+ @matcher.left_matches.should include(@left_b)
212
+ end
213
+
214
+ it "should reconcile test data based on two attribute pairs" do
215
+ define_mid_esn_matcher(@matcher)
216
+ @matcher.match
217
+ @matcher.right_matches.size.should == 3
218
+ @matcher.left_matches.size.should == 3
219
+
220
+ create_array_matcher # start over with a new matcher before raising the threshold
221
+ define_mid_esn_matcher(@matcher)
222
+ @matcher.min_score = 2.0
223
+ @matcher.match
224
+ @matcher.right_matches.size.should == 2
225
+ @matcher.left_matches.size.should == 2
226
+ end
227
+
228
+ it "should list non-matched objects as exceptions" do
229
+ define_mid_esn_matcher(@matcher)
230
+ @matcher.min_score = 2.0
231
+ @matcher.match
232
+ @matcher.left_exceptions.should have(1).item
233
+ @matcher.right_exceptions.should have(2).items
234
+ end
235
+
236
+ it "should allow veto of matches using filtering rules" do
237
+ create_array_matcher
238
+ define_esn_matcher(@matcher)
239
+ @matcher.filter(esn_has_ones_filter) # only pairs whose esn values both contain "1" may match
240
+ @matcher.match
241
+ @matcher.left_matches.size.should == 1
242
+ @matcher.left_matches[@left_a].left_obj.esn.should =~ /1/ # left_matches is keyed by the left object
243
+ end
244
+
245
+ end #hash index and array store tests
246
+
247
+ context "conflict resolution" do
248
+
249
+ let(:amatcher) do
250
+
251
+ #initially, A will match the first record it comes to (an outer record),
252
+ #then A will be made loser and should eventually match Y
253
+ @left_a = Transaction.new(:id => 1, :esn => "11111111111", :mid => "cdcd") # shares only the esn with the right-hand records
254
+ @left_b = Transaction.new(:id => 2, :esn => "11111111111", :mid => "abab") # same esn and mid as right X
255
+ @left_c = Transaction.new(:id => 3, :esn => "11111111111", :mid => "yzyz") # same esn and mid as right Z
256
+ @lefts = [@left_a, @left_b, @left_c]
257
+
258
+ @right_x = Transaction.new(:id => 1, :esn => "11111111111", :mid => "abab")
259
+ @right_y = Transaction.new(:id => 2, :esn => "11111111111", :mid => "mnmn") # its mid matches no left record
260
+ @right_z = Transaction.new(:id => 3, :esn => "11111111111", :mid => "yzyz")
261
+ @rights = [@right_x, @right_y, @right_z]
262
+
263
+ as_l = ArrayStore.new(@lefts)
264
+ as_r = ArrayStore.new(@rights)
265
+ matcher = Matching::Matcher.new(:left_store => as_l, :right_store => as_r)
266
+ define_mid_esn_matcher(matcher)
267
+ matcher
268
+ end
269
+
270
+ it "should find best fit for all objects" do
271
+ amatcher.match
272
+ amatcher.left_matches.should have(3).items
273
+ amatcher.left_matches[@left_a].right_obj.should == @right_y # A ends up with the only right record nobody else scores higher on
274
+ amatcher.left_matches[@left_a].score.should == 1.0
275
+ amatcher.left_matches[@left_b].right_obj.should == @right_x
276
+ amatcher.left_matches[@left_b].score.should == 2.0
277
+ amatcher.left_matches[@left_c].right_obj.should == @right_z
278
+ amatcher.left_matches[@left_c].score.should == 2.0
279
+ end
280
+
281
+ it "should not find best fit unless evaluate_left_losers executes normally" do
282
+ class << amatcher
283
+ define_method(:evaluate_left_losers, proc { }) # stub the method with a no-op on this one instance only
284
+ end
285
+
286
+ amatcher.match
287
+ amatcher.left_matches.should have(2).items # with the stub in place, A is expected to stay unmatched
288
+ amatcher.left_matches[@left_b].right_obj.should == @right_x
289
+ amatcher.left_matches[@left_b].score.should == 2.0
290
+ amatcher.left_matches[@left_c].right_obj.should == @right_z
291
+ amatcher.left_matches[@left_c].score.should == 2.0
292
+ end
293
+
294
+ end #conflict resolution
295
+
296
+ #See ar_spec.rb and redis_spec.rb for tests involving ActiveRecord and Redis
297
+ end
@@ -0,0 +1,105 @@
1
+ # Tests Redis as the indexer
2
+
3
+ require File.expand_path("../../spec_helper", __FILE__)
4
+ require File.expand_path("../../../lib/matching/redis_index", __FILE__)
5
+
6
+ include Matching
7
+
8
+ def test_redis_connection
9
+ r = Redis.new
10
+ r.inspect rescue puts "Start Redis to run redis_spec" and return false
11
+ end
12
+
13
+ if test_redis_connection # define the Redis-backed specs only when the connection check passes
14
+ describe RedisIndex do # Spec: RedisIndex must satisfy the same put/get contract exercised for the hash index
15
+ it "index object ids for a given attribute and value" do
16
+ subject.put(:mid, "7275551111", 1) # arguments as used here: attribute, value, object id
17
+ subject.get(:mid, "7275551111").should == [1] # NOTE(review): assumes the key is empty beforehand; no cleanup is visible in this file
18
+ subject.put(:mid, "7275551111", 2)
19
+ subject.get(:mid, "7275551111").should == [1,2]
20
+ subject.put(:mid, "8135554444", 3)
21
+ subject.get(:mid, "8135554444").should == [3]
22
+ subject.get(:mid, "2015558888").should be_nil # unknown value yields nil rather than []
23
+ subject.get(:esn, "1111111111").should be_nil
24
+ end
25
+
26
+ it "should not index nil values" do
27
+ subject.put(:mid, nil, 1)
28
+ subject.get(:mid, nil).should be_nil
29
+ end
30
+ end
31
+
32
+ describe Matcher do # Spec: the Matcher scenarios again, this time built with :use_redis => true
33
+ include MatcherSpecHelper # presumably supplies create_array_matcher, the define_*_matcher helpers and the fixtures; defined outside this file
34
+
35
+ context "with redis index and array store" do
36
+
37
+ before(:each) do
38
+ create_array_matcher(:use_redis => true) # every example starts from a freshly built @matcher
39
+ end
40
+
41
+ it "requires at least one join pair to be defined" do
42
+ expect { @matcher.index_right_objects }.to raise_error # no rules defined yet; any exception class satisfies this
43
+ end
44
+
45
+ context "using mid and esn matcher" do
46
+
47
+ before(:each) do
48
+ define_mid_esn_matcher(@matcher)
49
+ @matcher.index_right_objects # build the right-hand index once the join rules exist
50
+ end
51
+
52
+ it "indexes right records on join attributes" do
53
+ @matcher.right_index.get(:esn, "11111111111").should_not be_nil
54
+ @matcher.right_index.get(:esn, "11111111111").size.should == 2
55
+ @matcher.right_index.get(:mid, "8135554444").size.should == 1
56
+ end
57
+
58
+ it "finds potential matches for left_objects from right_objects based on join criteria" do
59
+ right_matches = @matcher.find_potential_matches(@left_a)
60
+ right_matches.should have(3).items
61
+ right_matches.should include(@right_a)
62
+ end
63
+
64
+ it "finds scored matches by applying rules after finding potential matches" do
65
+ right_matches = @matcher.find_matches(@left_a)
66
+ right_matches.should have(3).items
67
+
68
+ #raise matching threshold
69
+ @matcher.min_score = 2.0
70
+ right_matches = @matcher.find_matches(@left_a)
71
+ right_matches.should have(1).items
72
+
73
+ #note: return value is an array of arrays, not an array of just
74
+ #right_objects
75
+ right_matches[0].should == [@right_b, 2.0] # each entry is a [right_object, score] pair
76
+ end
77
+ end
78
+
79
+ it "should reconcile test data based on single attribute pair" do
80
+ define_esn_matcher(@matcher)
81
+ @matcher.match
82
+ @matcher.right_matches.size.should == 2
83
+ @matcher.left_matches.size.should == 2
84
+
85
+ @matcher.left_matches.should include(@left_a)
86
+ @matcher.left_matches.should include(@left_b)
87
+ end
88
+
89
+ it "should reconcile test data based on two attribute pairs" do
90
+ define_mid_esn_matcher(@matcher)
91
+ @matcher.match
92
+ @matcher.right_matches.size.should == 3
93
+ @matcher.left_matches.size.should == 3
94
+
95
+ create_array_matcher(:use_redis => true) # start over with a new matcher before raising the threshold
96
+ @matcher.min_score = 2.0
97
+ define_mid_esn_matcher(@matcher)
98
+ @matcher.match
99
+ @matcher.right_matches.size.should == 2
100
+ @matcher.left_matches.size.should == 2
101
+ end
102
+
103
+ end #redis index and array store tests
104
+ end
105
+ end # if test_redis_connection
@@ -0,0 +1,88 @@
1
+ require 'rspec'
2
+ require 'date'
3
+ require File.expand_path(File.dirname(__FILE__) + '/../../lib/matching.rb')
4
+ include Matching
5
+
6
+ describe Date do # Spec: Date#similarity_to scores two dates between 0.0 and 1.0 relative to a :days_scale window.
7
+
8
+ let(:a_date) { Date.new(2007,1,1) } # reference date for every comparison below
9
+
10
+ it "requires days_scale parameter be numeric" do
11
+ expect { a_date.similarity_to(Date.new(2007,1,15), :days_scale => 30) }.to_not raise_error
12
+ expect { a_date.similarity_to(Date.new(2007,1,15), :days_scale => "thirty") }.to raise_error # any exception class satisfies this
13
+ end
14
+
15
+ it "scores date differences less than the days_scale as > 0.0 and < 1.0" do
16
+ a_date.similarity_to(a_date).should == 1.0
17
+ a_date.similarity_to(Date.new(2007,1,15), :days_scale => 30).should be_within(0.05).of(0.5) # 14 days apart on a 30-day scale
18
+ a_date.similarity_to(Date.new(2007,1,2), :days_scale => 30).should be_within(0.05).of(1.0) # 1 day apart
19
+ a_date.similarity_to(Date.new(2007,1,30), :days_scale => 30).should be_within(0.05).of(0.0) # 29 days apart
20
+ a_date.similarity_to(Date.new(2007,2,1), :days_scale => 30).should == 0.0 # 31 days apart: beyond the scale
21
+ a_date.similarity_to(Date.new(2006,12,16), :days_scale => 30).should be_within(0.05).of(0.5) # 16 days earlier: earlier dates are scored as well
22
+ a_date.similarity_to(Date.new(2006,11,30), :days_scale => 30).should == 0.0 # 32 days earlier: beyond the scale
23
+ a_date.similarity_to(Date.new(2006,11,30), :days_scale => 60).should be > 0.0 # the same gap falls inside a 60-day scale
24
+ end
25
+
26
+ it "treats datetime as date" do
27
+ dt1 = DateTime.new(2007,1,1)
28
+ dt1.similarity_to(dt1).should == 1.0
29
+ dt1.similarity_to(DateTime.new(2007,1,15)).should be > 0.0 # no :days_scale given, so whatever default the library uses applies
30
+ end
31
+
32
+ end
33
+
34
+ describe String do # Spec: String similarity helpers (raw_similarity_to, similarity_to, tokenize, :name comparison) plus the text gem they rely on.
35
+
36
+ let(:a_string) { "Horse" } # reference string for the distance and similarity examples
37
+
38
+ it "uses text gem to calculate Levenshtein distance between two strings" do
39
+ Text::Levenshtein::distance(a_string,a_string).should == 0
40
+ Text::Levenshtein::distance(a_string,"Hose").should == 1 # one deletion
41
+ Text::Levenshtein::distance(a_string,"Hosse").should == 1 # one substitution
42
+ Text::Levenshtein::distance(a_string,"Horsey").should == 1 # one insertion
43
+ Text::Levenshtein::distance(a_string,"Hotel").should == 3
44
+ Text::Levenshtein::distance(a_string,"horse").should == 1 # the raw distance is case-sensitive
45
+ Text::Levenshtein::distance(a_string,"Apple").should == 4
46
+ end
47
+
48
+ it "scores raw similiarity between 0.0 and 1.0 using Levenshtein edit distance" do
49
+ a_string.raw_similarity_to("Horse").should == 1.0
50
+ a_string.raw_similarity_to("Hose").should be_within(0.1).of(0.75)
51
+ a_string.raw_similarity_to("Trombone").should be_within(0.1).of(0.0)
52
+ end
53
+
54
+ it "performs case-insensitive raw similarity comparisons" do
55
+ a_string.raw_similarity_to("hose").should be_within(0.1).of(0.75) # same expectation as for "Hose" above
56
+ end
57
+
58
+ it "uses raw similarity to calculate overall similarity with no comparison argument" do
59
+ a_string.similarity_to("Horsey").should be_within(0.1).of(0.75)
60
+ end
61
+
62
+ it "should tokenize strings according to rules" do
63
+ "horse".tokenize.should == %w(horse)
64
+ "horse ".tokenize.should == %w(horse) # surrounding whitespace is ignored
65
+ " horse".tokenize.should == %w(horse)
66
+ " horse ".tokenize.should == %w(horse)
67
+ "horse hoof".tokenize.should == %w(horse hoof)
68
+ "horse m. hoof".tokenize.should == %w(horse hoof) # the initial "m." is dropped
69
+ "horse mildred hoof".tokenize.should == %w(horse mildred hoof)
70
+ "horse .mildred - hoof".tokenize.should == %w(horse mildred hoof) # stray punctuation is stripped
71
+ end
72
+
73
+ it "compares the similarity of names" do
74
+ name1 = "Ruth Ginsburg"
75
+ name1.similarity_to("Ruth Ginsburg", :comparison => :name).should == 1.0
76
+ name1.similarity_to("Ginsburg Ruth", :comparison => :name).should == 1.0 # token order does not matter
77
+ name1.similarity_to("Ginsburg, Ruth", :comparison => :name).should == 1.0 # nor does a comma separator
78
+ name1.similarity_to("Ginsburg,Ruth", :comparison => :name).should == 1.0
79
+ name1.similarity_to("Baby Ruth", :comparison => :name).should == 0.5 # only one of the two tokens matches
80
+ name1.similarity_to("Ruth Ginsberg", :comparison => :name).should be_within(0.05).of(0.9) # near-miss spellings score just below 1.0
81
+ name1.similarity_to("Roth Ginsburg", :comparison => :name).should be_within(0.05).of(0.9)
82
+ name1.similarity_to("Roth Ginsberg", :comparison => :name).should be_within(0.1).of(0.8)
83
+ name1.similarity_to("Ruth Bader Ginsburg", :comparison => :name).should be > 0.5 # extra middle names still score above one half
84
+ name1.similarity_to("Ruth Joan Bader Ginsburg", :comparison => :name).should be > 0.5
85
+ name1.similarity_to("Antonin Scalia", :comparison => :name).should be_within(0.09).of(0.1) # unrelated name scores near zero
86
+ end
87
+
88
+ end