linkage 0.0.6 → 0.0.8
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +10 -0
- data/Gemfile +15 -13
- data/Gemfile.lock +67 -37
- data/Guardfile +0 -2
- data/Rakefile +122 -25
- data/lib/linkage/comparator.rb +172 -0
- data/lib/linkage/comparators/binary.rb +12 -0
- data/lib/linkage/comparators/compare.rb +46 -0
- data/lib/linkage/comparators/within.rb +32 -0
- data/lib/linkage/configuration.rb +285 -153
- data/lib/linkage/data.rb +32 -7
- data/lib/linkage/dataset.rb +107 -32
- data/lib/linkage/decollation.rb +93 -0
- data/lib/linkage/expectation.rb +21 -0
- data/lib/linkage/expectations/exhaustive.rb +63 -0
- data/lib/linkage/expectations/simple.rb +168 -0
- data/lib/linkage/field.rb +30 -4
- data/lib/linkage/field_set.rb +6 -3
- data/lib/linkage/function.rb +50 -3
- data/lib/linkage/functions/binary.rb +30 -0
- data/lib/linkage/functions/cast.rb +54 -0
- data/lib/linkage/functions/length.rb +29 -0
- data/lib/linkage/functions/strftime.rb +12 -11
- data/lib/linkage/functions/trim.rb +8 -0
- data/lib/linkage/group.rb +20 -0
- data/lib/linkage/import_buffer.rb +5 -16
- data/lib/linkage/meta_object.rb +139 -0
- data/lib/linkage/result_set.rb +74 -17
- data/lib/linkage/runner/single_threaded.rb +125 -10
- data/lib/linkage/version.rb +3 -0
- data/lib/linkage.rb +11 -0
- data/linkage.gemspec +16 -121
- data/test/config.yml +5 -0
- data/test/helper.rb +73 -8
- data/test/integration/test_collation.rb +45 -0
- data/test/integration/test_configuration.rb +268 -0
- data/test/integration/test_cross_linkage.rb +4 -17
- data/test/integration/test_dataset.rb +45 -2
- data/test/integration/test_dual_linkage.rb +40 -24
- data/test/integration/test_functions.rb +22 -0
- data/test/integration/test_result_set.rb +85 -0
- data/test/integration/test_scoring.rb +84 -0
- data/test/integration/test_self_linkage.rb +5 -0
- data/test/integration/test_within_comparator.rb +100 -0
- data/test/unit/comparators/test_compare.rb +105 -0
- data/test/unit/comparators/test_within.rb +57 -0
- data/test/unit/expectations/test_exhaustive.rb +111 -0
- data/test/unit/expectations/test_simple.rb +303 -0
- data/test/unit/functions/test_binary.rb +54 -0
- data/test/unit/functions/test_cast.rb +98 -0
- data/test/unit/functions/test_length.rb +52 -0
- data/test/unit/functions/test_strftime.rb +17 -13
- data/test/unit/functions/test_trim.rb +11 -4
- data/test/unit/test_comparator.rb +124 -0
- data/test/unit/test_configuration.rb +137 -175
- data/test/unit/test_data.rb +44 -0
- data/test/unit/test_dataset.rb +73 -21
- data/test/unit/test_decollation.rb +201 -0
- data/test/unit/test_field.rb +38 -14
- data/test/unit/test_field_set.rb +12 -8
- data/test/unit/test_function.rb +83 -16
- data/test/unit/test_group.rb +28 -0
- data/test/unit/test_import_buffer.rb +13 -27
- data/test/unit/test_meta_object.rb +208 -0
- data/test/unit/test_result_set.rb +221 -3
- metadata +82 -190
@@ -0,0 +1,268 @@
|
|
1
|
+
require 'helper'
|
2
|
+
|
3
|
+
module IntegrationTests
|
4
|
+
class TestConfiguration < Test::Unit::TestCase
|
5
|
+
test "linkage_type is self when the two datasets are the same" do
|
6
|
+
database_for('sqlite') do |db|
|
7
|
+
db.create_table(:foo) { primary_key(:id); String(:foo); String(:bar) }
|
8
|
+
end
|
9
|
+
|
10
|
+
dataset = Linkage::Dataset.new(database_options_for('sqlite'), "foo")
|
11
|
+
conf = Linkage::Configuration.new(dataset, dataset)
|
12
|
+
assert_equal :self, conf.linkage_type
|
13
|
+
end
|
14
|
+
|
15
|
+
test "linkage_type is dual when the two datasets are different" do
|
16
|
+
database_for('sqlite') do |db|
|
17
|
+
db.create_table(:foo) { primary_key(:id); String(:foo); String(:bar) }
|
18
|
+
db.create_table(:bar) { primary_key(:id); String(:foo); String(:bar) }
|
19
|
+
end
|
20
|
+
|
21
|
+
dataset_1 = Linkage::Dataset.new(database_options_for('sqlite'), "foo")
|
22
|
+
dataset_2 = Linkage::Dataset.new(database_options_for('sqlite'), "bar")
|
23
|
+
conf = Linkage::Configuration.new(dataset_1, dataset_2)
|
24
|
+
assert_equal :dual, conf.linkage_type
|
25
|
+
end
|
26
|
+
|
27
|
+
test "linkage_type is cross when there's different filters on both sides" do
|
28
|
+
database_for('sqlite') do |db|
|
29
|
+
db.create_table(:foo) { primary_key(:id); String(:foo); String(:bar) }
|
30
|
+
end
|
31
|
+
|
32
|
+
dataset = Linkage::Dataset.new(database_options_for('sqlite'), "foo")
|
33
|
+
conf = Linkage::Configuration.new(dataset, dataset)
|
34
|
+
conf.configure do
|
35
|
+
lhs[:foo].must == "foo"
|
36
|
+
rhs[:foo].must == "bar"
|
37
|
+
end
|
38
|
+
assert_equal :cross, conf.linkage_type
|
39
|
+
end
|
40
|
+
|
41
|
+
test "linkage_type is self when there's identical static filters on each side" do
|
42
|
+
database_for('sqlite') do |db|
|
43
|
+
db.create_table(:foo) { primary_key(:id); String(:foo); String(:bar) }
|
44
|
+
end
|
45
|
+
|
46
|
+
dataset = Linkage::Dataset.new(database_options_for('sqlite'), "foo")
|
47
|
+
conf = Linkage::Configuration.new(dataset, dataset)
|
48
|
+
conf.configure do
|
49
|
+
lhs[:foo].must == "foo"
|
50
|
+
rhs[:foo].must == "foo"
|
51
|
+
end
|
52
|
+
assert_equal :self, conf.linkage_type
|
53
|
+
end
|
54
|
+
|
55
|
+
test "linkage_type is cross when exhaustive expectations use different fields" do
|
56
|
+
database_for('sqlite') do |db|
|
57
|
+
db.create_table(:foo) { primary_key(:id); Integer(:foo); Integer(:bar) }
|
58
|
+
end
|
59
|
+
|
60
|
+
dataset = Linkage::Dataset.new(database_options_for('sqlite'), "foo")
|
61
|
+
conf = Linkage::Configuration.new(dataset, dataset)
|
62
|
+
conf.configure do
|
63
|
+
lhs[:foo].must(be_within(5).of(rhs[:bar]))
|
64
|
+
end
|
65
|
+
assert_equal :cross, conf.linkage_type
|
66
|
+
end
|
67
|
+
|
68
|
+
test "static expectation" do
|
69
|
+
database_for('sqlite') do |db|
|
70
|
+
db.create_table(:foo) { primary_key(:id); String(:foo); String(:bar) }
|
71
|
+
end
|
72
|
+
|
73
|
+
dataset_1 = Linkage::Dataset.new(database_options_for('sqlite'), "foo")
|
74
|
+
conf = Linkage::Configuration.new(dataset_1, dataset_1)
|
75
|
+
conf.configure do
|
76
|
+
lhs[:foo].must == "foo"
|
77
|
+
end
|
78
|
+
|
79
|
+
dataset_2, _ = conf.datasets_with_applied_simple_expectations
|
80
|
+
assert_equal dataset_2.obj, dataset_1.filter(:foo => "foo").obj
|
81
|
+
end
|
82
|
+
|
83
|
+
test "complain if an invalid field is accessed" do
|
84
|
+
database_for('sqlite') do |db|
|
85
|
+
db.create_table(:foo) { primary_key(:id); String(:foo); String(:bar) }
|
86
|
+
end
|
87
|
+
|
88
|
+
dataset = Linkage::Dataset.new(database_options_for('sqlite'), "foo")
|
89
|
+
conf = Linkage::Configuration.new(dataset, dataset)
|
90
|
+
assert_raises(ArgumentError) do
|
91
|
+
conf.configure do
|
92
|
+
lhs[:foo].must == rhs[:non_existant_field]
|
93
|
+
end
|
94
|
+
end
|
95
|
+
end
|
96
|
+
|
97
|
+
operators = [:>, :<, :>=, :<=]
|
98
|
+
operators.each do |operator|
|
99
|
+
test "DSL #{operator} filter operator" do
|
100
|
+
database_for('sqlite') do |db|
|
101
|
+
db.create_table(:foo) { primary_key(:id); String(:foo); String(:bar) }
|
102
|
+
end
|
103
|
+
|
104
|
+
dataset_1 = Linkage::Dataset.new(database_options_for('sqlite'), "foo")
|
105
|
+
conf = Linkage::Configuration.new(dataset_1, dataset_1)
|
106
|
+
conf.configure do
|
107
|
+
lhs[:foo].must.send(operator, 123)
|
108
|
+
end
|
109
|
+
|
110
|
+
expr = Sequel::SQL::BooleanExpression.new(operator, Sequel::SQL::Identifier.new(:foo), 123)
|
111
|
+
dataset_2, _ = conf.datasets_with_applied_simple_expectations
|
112
|
+
assert_equal dataset_2.obj, dataset_1.filter(expr).obj
|
113
|
+
end
|
114
|
+
|
115
|
+
test "comparing two data sources with #{operator}" do
|
116
|
+
database_for('sqlite') do |db|
|
117
|
+
db.create_table(:foo) { primary_key(:id); Integer(:foo); Integer(:bar) }
|
118
|
+
end
|
119
|
+
|
120
|
+
dataset = Linkage::Dataset.new(database_options_for('sqlite'), "foo")
|
121
|
+
conf = Linkage::Configuration.new(dataset, dataset)
|
122
|
+
conf.configure do
|
123
|
+
lhs[:foo].must.send(operator, rhs[:bar])
|
124
|
+
end
|
125
|
+
assert_equal 1, conf.exhaustive_expectations.length
|
126
|
+
|
127
|
+
comp = conf.exhaustive_expectations[0].comparator
|
128
|
+
assert_instance_of Linkage::Comparators::Compare, comp
|
129
|
+
end
|
130
|
+
end
|
131
|
+
|
132
|
+
test "must_not expectation" do
|
133
|
+
database_for('sqlite') do |db|
|
134
|
+
db.create_table(:foo) { primary_key(:id); String(:foo); String(:bar) }
|
135
|
+
end
|
136
|
+
|
137
|
+
dataset_1 = Linkage::Dataset.new(database_options_for('sqlite'), "foo")
|
138
|
+
conf = Linkage::Configuration.new(dataset_1, dataset_1)
|
139
|
+
conf.configure do
|
140
|
+
lhs[:foo].must_not == "foo"
|
141
|
+
end
|
142
|
+
|
143
|
+
dataset_2, _ = conf.datasets_with_applied_simple_expectations
|
144
|
+
assert_equal dataset_2.obj, dataset_1.filter(~{:foo => "foo"}).obj
|
145
|
+
end
|
146
|
+
|
147
|
+
test "static database function" do
|
148
|
+
database_for('sqlite') do |db|
|
149
|
+
db.create_table(:foo) { primary_key(:id); String(:foo); String(:bar) }
|
150
|
+
end
|
151
|
+
|
152
|
+
dataset_1 = Linkage::Dataset.new(database_options_for('sqlite'), "foo")
|
153
|
+
conf = Linkage::Configuration.new(dataset_1, dataset_1)
|
154
|
+
conf.configure do
|
155
|
+
lhs[:foo].must == trim("foo")
|
156
|
+
end
|
157
|
+
|
158
|
+
dataset_2, _ = conf.datasets_with_applied_simple_expectations
|
159
|
+
assert_equal dataset_1.filter({:foo => :trim.sql_function("foo")}).obj, dataset_2.obj
|
160
|
+
end
|
161
|
+
|
162
|
+
test "save_results_in" do
|
163
|
+
database_for('sqlite') do |db|
|
164
|
+
db.create_table(:foo) { primary_key(:id); String(:foo); String(:bar) }
|
165
|
+
end
|
166
|
+
|
167
|
+
dataset_1 = Linkage::Dataset.new(database_options_for('sqlite'), "foo")
|
168
|
+
conf = Linkage::Configuration.new(dataset_1, dataset_1)
|
169
|
+
conf.configure do
|
170
|
+
save_results_in("mysql://localhost/results", {:foo => 'bar'})
|
171
|
+
end
|
172
|
+
assert_equal "mysql://localhost/results", conf.results_uri
|
173
|
+
assert_equal({:foo => 'bar'}, conf.results_uri_options)
|
174
|
+
end
|
175
|
+
|
176
|
+
test "case insensitive field names" do
|
177
|
+
database_for('sqlite') do |db|
|
178
|
+
db.create_table(:foo) { primary_key(:id); String(:foo); String(:bar) }
|
179
|
+
end
|
180
|
+
|
181
|
+
assert_nothing_raised do
|
182
|
+
dataset = Linkage::Dataset.new(database_options_for('sqlite'), "foo")
|
183
|
+
results_uri = database_options_for('sqlite')
|
184
|
+
conf = dataset.link_with(dataset) do
|
185
|
+
lhs[:Foo].must == rhs[:baR]
|
186
|
+
save_results_in(results_uri)
|
187
|
+
end
|
188
|
+
end
|
189
|
+
end
|
190
|
+
|
191
|
+
test "decollation_needed? is false when the datasets and results dataset all have the same database and collations" do
|
192
|
+
database_for('mysql') do |db|
|
193
|
+
db.create_table!(:foo) { primary_key(:id); String(:foo, :collate => :latin1_swedish_ci) }
|
194
|
+
db.create_table!(:bar) { primary_key(:id); String(:foo, :collate => :latin1_swedish_ci) }
|
195
|
+
end
|
196
|
+
|
197
|
+
dataset_1 = Linkage::Dataset.new(database_options_for('mysql'), 'foo')
|
198
|
+
dataset_2 = Linkage::Dataset.new(database_options_for('mysql'), 'bar')
|
199
|
+
conf = dataset_1.link_with(dataset_2) do
|
200
|
+
lhs[:foo].must == rhs[:foo]
|
201
|
+
end
|
202
|
+
conf.results_uri = database_options_for('mysql')
|
203
|
+
assert !conf.decollation_needed?
|
204
|
+
end
|
205
|
+
|
206
|
+
test "decollation_needed? is true when the datasets have different database types" do
|
207
|
+
database_for('mysql') do |db|
|
208
|
+
db.create_table!(:foo) { primary_key(:id); String(:foo) }
|
209
|
+
end
|
210
|
+
|
211
|
+
database_for('sqlite') do |db|
|
212
|
+
db.create_table!(:foo) { primary_key(:id); String(:foo) }
|
213
|
+
end
|
214
|
+
|
215
|
+
dataset_1 = Linkage::Dataset.new(database_options_for('mysql'), 'foo')
|
216
|
+
dataset_2 = Linkage::Dataset.new(database_options_for('sqlite'), 'foo')
|
217
|
+
conf = dataset_1.link_with(dataset_2) do
|
218
|
+
lhs[:foo].must == rhs[:foo]
|
219
|
+
end
|
220
|
+
conf.results_uri = database_options_for('mysql')
|
221
|
+
assert conf.decollation_needed?
|
222
|
+
end
|
223
|
+
|
224
|
+
test "decollation_needed? is true when the result dataset has different database type than the datasets" do
|
225
|
+
database_for('mysql') do |db|
|
226
|
+
db.create_table!(:foo) { primary_key(:id); String(:foo) }
|
227
|
+
db.create_table!(:bar) { primary_key(:id); String(:foo) }
|
228
|
+
end
|
229
|
+
|
230
|
+
dataset_1 = Linkage::Dataset.new(database_options_for('mysql'), 'foo')
|
231
|
+
dataset_2 = Linkage::Dataset.new(database_options_for('mysql'), 'bar')
|
232
|
+
conf = dataset_1.link_with(dataset_2) do
|
233
|
+
lhs[:foo].must == rhs[:foo]
|
234
|
+
end
|
235
|
+
conf.results_uri = database_options_for('sqlite')
|
236
|
+
assert conf.decollation_needed?
|
237
|
+
end
|
238
|
+
|
239
|
+
test "decollation_needed? is false when not comparing string columns" do
|
240
|
+
database_for('mysql') do |db|
|
241
|
+
db.create_table!(:foo) { primary_key(:id); Fixnum(:foo) }
|
242
|
+
end
|
243
|
+
|
244
|
+
database_for('sqlite') do |db|
|
245
|
+
db.create_table!(:foo) { primary_key(:id); Fixnum(:foo) }
|
246
|
+
end
|
247
|
+
|
248
|
+
dataset_1 = Linkage::Dataset.new(database_options_for('mysql'), 'foo')
|
249
|
+
dataset_2 = Linkage::Dataset.new(database_options_for('sqlite'), 'foo')
|
250
|
+
conf = dataset_1.link_with(dataset_2) do
|
251
|
+
lhs[:foo].must == rhs[:foo]
|
252
|
+
end
|
253
|
+
conf.results_uri = database_options_for('mysql')
|
254
|
+
assert !conf.decollation_needed?
|
255
|
+
end
|
256
|
+
|
257
|
+
test "creating comparator expectation for within" do
|
258
|
+
database_for('mysql') do |db|
|
259
|
+
db.create_table!(:foo) { primary_key(:id); Integer(:foo) }
|
260
|
+
end
|
261
|
+
dataset = Linkage::Dataset.new(database_options_for('mysql'), 'foo')
|
262
|
+
|
263
|
+
conf = dataset.link_with(dataset) do
|
264
|
+
lhs[:foo].must be_within(5).of(rhs[:foo])
|
265
|
+
end
|
266
|
+
end
|
267
|
+
end
|
268
|
+
end
|
@@ -16,10 +16,6 @@ module IntegrationTests
|
|
16
16
|
end
|
17
17
|
|
18
18
|
test "one mandatory field equality on single threaded runner" do
|
19
|
-
#setup_logger = Logger.new(STDERR)
|
20
|
-
#setup_logger.formatter = lambda { |severity, time, progname, msg|
|
21
|
-
#" SETUP : %s [%s]: %s\n" % [severity, time, msg]
|
22
|
-
#}
|
23
19
|
# insert the test data
|
24
20
|
database do |db|
|
25
21
|
db.create_table(:foo) { primary_key(:id); Integer(:foo); Integer(:bar) }
|
@@ -27,16 +23,8 @@ module IntegrationTests
|
|
27
23
|
Array.new(100) { |i| [i, i % 10, i % 5] })
|
28
24
|
end
|
29
25
|
|
30
|
-
#ds_logger = Logger.new(STDERR)
|
31
|
-
#ds_logger.formatter = lambda { |severity, time, progname, msg|
|
32
|
-
#"DATASET: %s [%s]: %s\n" % [severity, time, msg]
|
33
|
-
#}
|
34
26
|
ds = Linkage::Dataset.new(@tmpuri, "foo", :single_threaded => true)
|
35
27
|
|
36
|
-
#rs_logger = Logger.new(STDERR)
|
37
|
-
#rs_logger.formatter = lambda { |severity, time, progname, msg|
|
38
|
-
#"RESULTS: %s [%s]: %s\n" % [severity, time, msg]
|
39
|
-
#}
|
40
28
|
tmpuri = @tmpuri
|
41
29
|
conf = ds.link_with(ds) do
|
42
30
|
lhs[:foo].must == rhs[:bar]
|
@@ -52,11 +40,10 @@ module IntegrationTests
|
|
52
40
|
assert_equal i, row[:foo_bar]
|
53
41
|
end
|
54
42
|
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
#end
|
43
|
+
assert_equal 1000, db[:matches].count
|
44
|
+
db[:matches].order(:record_1_id, :record_2_id).each do |row|
|
45
|
+
assert_equal row[:record_1_id] % 10, row[:record_2_id] % 5
|
46
|
+
end
|
60
47
|
end
|
61
48
|
end
|
62
49
|
|
@@ -38,7 +38,7 @@ class IntegrationTests::TestDataset < Test::Unit::TestCase
|
|
38
38
|
end
|
39
39
|
|
40
40
|
ds = Linkage::Dataset.new(@tmpuri, "foo")
|
41
|
-
ds = ds.
|
41
|
+
ds = ds.group_match(Linkage::MetaObject.new(ds.field_set[:bar]))
|
42
42
|
ds.each_group do |group|
|
43
43
|
assert_equal({:bar => "foo"}, group.values)
|
44
44
|
assert_equal(2, group.count)
|
@@ -51,6 +51,26 @@ class IntegrationTests::TestDataset < Test::Unit::TestCase
|
|
51
51
|
assert_equal 3, groups.length
|
52
52
|
end
|
53
53
|
|
54
|
+
test "each_group with alias" do
|
55
|
+
database do |db|
|
56
|
+
db.create_table(:foo) do
|
57
|
+
primary_key :id
|
58
|
+
String :bar
|
59
|
+
end
|
60
|
+
db[:foo].import([:id, :bar], [[1, 'foo'], [2, 'foo'], [3, 'bar'], [4, 'baz']])
|
61
|
+
end
|
62
|
+
|
63
|
+
ds = Linkage::Dataset.new(@tmpuri, "foo")
|
64
|
+
ds = ds.group_match({
|
65
|
+
:meta_object => Linkage::MetaObject.new(ds.field_set[:bar]),
|
66
|
+
:alias => :bar_baz
|
67
|
+
})
|
68
|
+
ds.each_group do |group|
|
69
|
+
assert_equal({:bar_baz => "foo"}, group.values)
|
70
|
+
assert_equal(2, group.count)
|
71
|
+
end
|
72
|
+
end
|
73
|
+
|
54
74
|
test "each_group with filters" do
|
55
75
|
database do |db|
|
56
76
|
db.create_table(:foo) do
|
@@ -62,7 +82,7 @@ class IntegrationTests::TestDataset < Test::Unit::TestCase
|
|
62
82
|
end
|
63
83
|
|
64
84
|
ds = Linkage::Dataset.new(@tmpuri, "foo")
|
65
|
-
ds = ds.
|
85
|
+
ds = ds.group_match(Linkage::MetaObject.new(ds.field_set[:bar]))
|
66
86
|
ds = ds.filter { baz >= 3 }
|
67
87
|
groups = []
|
68
88
|
ds.each_group(1) do |group|
|
@@ -70,4 +90,27 @@ class IntegrationTests::TestDataset < Test::Unit::TestCase
|
|
70
90
|
end
|
71
91
|
assert_equal 2, groups.length
|
72
92
|
end
|
93
|
+
|
94
|
+
test "each_group with collation" do
|
95
|
+
database_for('mysql') do |db|
|
96
|
+
db.create_table!(:foo) do
|
97
|
+
primary_key :id
|
98
|
+
String :bar, :collate => :latin1_swedish_ci
|
99
|
+
end
|
100
|
+
db[:foo].import([:id, :bar], [[1, 'fOo'], [2, 'foO'], [3, 'bar'], [4, 'baz']])
|
101
|
+
end
|
102
|
+
|
103
|
+
ds = Linkage::Dataset.new(database_options_for('mysql'), "foo")
|
104
|
+
ds = ds.group_match(Linkage::MetaObject.new(ds.field_set[:bar]))
|
105
|
+
groups = []
|
106
|
+
ds.each_group(1) do |group|
|
107
|
+
groups << group
|
108
|
+
end
|
109
|
+
expected = [
|
110
|
+
{:bar => 'BAR'},
|
111
|
+
{:bar => 'BAZ'},
|
112
|
+
{:bar => 'FOO'}
|
113
|
+
]
|
114
|
+
assert_equal expected, groups.collect(&:decollated_values)
|
115
|
+
end
|
73
116
|
end
|
@@ -34,6 +34,8 @@ module IntegrationTests
|
|
34
34
|
lhs[:ssn].must == rhs[:ssn]
|
35
35
|
save_results_in(tmpuri)
|
36
36
|
end
|
37
|
+
assert_equal :dual, conf.linkage_type
|
38
|
+
|
37
39
|
runner = Linkage::SingleThreadedRunner.new(conf)
|
38
40
|
runner.execute
|
39
41
|
|
@@ -43,16 +45,10 @@ module IntegrationTests
|
|
43
45
|
assert_equal "12345678#{i%10}", row[:ssn]
|
44
46
|
end
|
45
47
|
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
#else
|
51
|
-
#assert_equal 1, row[:dataset], row.inspect
|
52
|
-
#end
|
53
|
-
#expected_group_id = i / 20 + 1
|
54
|
-
#assert_equal expected_group_id, row[:group_id], "Record #{row.inspect} should have been in group #{expected_group_id}"
|
55
|
-
#end
|
48
|
+
assert_equal 1000, db[:matches].count
|
49
|
+
db[:matches].order(:record_1_id, :record_2_id).each do |row|
|
50
|
+
assert_equal row[:record_1_id] % 10, row[:record_2_id] % 10
|
51
|
+
end
|
56
52
|
end
|
57
53
|
end
|
58
54
|
|
@@ -87,32 +83,52 @@ module IntegrationTests
|
|
87
83
|
end
|
88
84
|
|
89
85
|
test "reacts properly when using two databases with different string equality methods" do
|
90
|
-
|
91
|
-
|
86
|
+
foo_logger = nil #prefixed_logger("FOO")
|
87
|
+
bar_logger = nil #prefixed_logger("BAR")
|
88
|
+
|
89
|
+
database_for('mysql', :logger => foo_logger) do |db|
|
90
|
+
db.create_table!(:foo) do
|
91
|
+
primary_key(:id)
|
92
|
+
String :baz, :collate => "latin1_swedish_ci"
|
93
|
+
end
|
94
|
+
db[:foo].import([:id, :baz], [
|
95
|
+
[1, "tEst"],
|
96
|
+
[2, "teSt"],
|
97
|
+
[3, "tesT "],
|
98
|
+
[4, "TEST"],
|
99
|
+
[5, "junk"]
|
100
|
+
])
|
92
101
|
end
|
93
|
-
uri = "mysql2://%s:%s/%s?user=%s" % test_config['mysql'].values_at('host', 'port', 'database', 'user')
|
94
|
-
Sequel.connect(uri) do |db|
|
95
|
-
db.create_table!(:foo) { primary_key(:id); String(:one); String(:two) }
|
96
|
-
db[:foo].import([:id, :one, :two], [[1, "", "test"], [2, "", "test"], [3, " ", "test "], [4, "", "test"], [5, "", "junk"]])
|
97
102
|
|
98
|
-
|
99
|
-
db
|
103
|
+
database_for('mysql', :logger => bar_logger) do |db|
|
104
|
+
db.create_table!(:bar) do
|
105
|
+
primary_key(:id)
|
106
|
+
String :baz, :collate => "latin1_swedish_ci"
|
107
|
+
end
|
108
|
+
db[:bar].import([:id, :baz], [
|
109
|
+
[1, "Test "],
|
110
|
+
[2, "tEst "],
|
111
|
+
[3, "teSt"],
|
112
|
+
[4, "TEST"],
|
113
|
+
[5, "junk"]
|
114
|
+
])
|
100
115
|
end
|
101
116
|
|
102
|
-
|
103
|
-
|
117
|
+
options = database_options_for('mysql')
|
118
|
+
ds_1 = Linkage::Dataset.new(options, "foo", :logger => foo_logger)
|
119
|
+
ds_2 = Linkage::Dataset.new(options, "bar", :logger => bar_logger)
|
104
120
|
tmpuri = @tmpuri
|
121
|
+
results_logger = nil #prefixed_logger("RESULTS")
|
105
122
|
conf = ds_1.link_with(ds_2) do
|
106
|
-
lhs[:
|
107
|
-
|
108
|
-
save_results_in(tmpuri)
|
123
|
+
lhs[:baz].must == rhs[:baz]
|
124
|
+
save_results_in(tmpuri, :logger => results_logger)
|
109
125
|
end
|
110
126
|
|
111
127
|
runner = Linkage::SingleThreadedRunner.new(conf)
|
112
128
|
runner.execute
|
113
129
|
|
114
130
|
database do |db|
|
115
|
-
assert_equal
|
131
|
+
assert_equal 2, db[:groups].count
|
116
132
|
end
|
117
133
|
end
|
118
134
|
end
|
@@ -62,5 +62,27 @@ module IntegrationTests
|
|
62
62
|
assert_equal 1, db[:groups].count
|
63
63
|
end
|
64
64
|
end
|
65
|
+
|
66
|
+
test "binary function with static argument" do
|
67
|
+
database do |db|
|
68
|
+
db.create_table(:foo) { primary_key(:id); String(:bar) }
|
69
|
+
db[:foo].import([:id, :bar], [[1, 'foo'], [2, 'foo']])
|
70
|
+
end
|
71
|
+
|
72
|
+
ds = Linkage::Dataset.new(@tmpuri, "foo")
|
73
|
+
tmpuri = @tmpuri
|
74
|
+
conf = ds.link_with(ds) do
|
75
|
+
lhs[:bar].must == rhs[:bar]
|
76
|
+
binary(lhs[:bar]).must == binary('foo')
|
77
|
+
binary(rhs[:bar]).must == binary('foo')
|
78
|
+
save_results_in(tmpuri)
|
79
|
+
end
|
80
|
+
runner = Linkage::SingleThreadedRunner.new(conf)
|
81
|
+
runner.execute
|
82
|
+
|
83
|
+
database do |db|
|
84
|
+
assert_equal 1, db[:groups].count
|
85
|
+
end
|
86
|
+
end
|
65
87
|
end
|
66
88
|
end
|
@@ -0,0 +1,85 @@
|
|
1
|
+
require 'helper'
|
2
|
+
|
3
|
+
module IntegrationTests
|
4
|
+
class TestResultSet < Test::Unit::TestCase
|
5
|
+
test "#create_tables! creates original_groups table when decollation is needed" do
|
6
|
+
database_for('sqlite') do |db|
|
7
|
+
db.create_table!(:foo) { primary_key(:id); String(:foo) }
|
8
|
+
end
|
9
|
+
|
10
|
+
database_for('mysql') do |db|
|
11
|
+
db.create_table!(:foo) { primary_key(:id); String(:foo) }
|
12
|
+
end
|
13
|
+
|
14
|
+
dataset_1 = Linkage::Dataset.new(database_options_for('sqlite'), 'foo')
|
15
|
+
dataset_2 = Linkage::Dataset.new(database_options_for('mysql'), 'foo')
|
16
|
+
results_uri = database_options_for('sqlite')
|
17
|
+
conf = dataset_1.link_with(dataset_2) do
|
18
|
+
lhs[:foo].must == rhs[:foo]
|
19
|
+
save_results_in(results_uri)
|
20
|
+
end
|
21
|
+
conf.result_set.create_tables!
|
22
|
+
assert_include conf.result_set.database.tables, :original_groups
|
23
|
+
end
|
24
|
+
|
25
|
+
test "#create_tables! doesn't create original_groups table when decollation is needed" do
|
26
|
+
database_for('sqlite') do |db|
|
27
|
+
db.create_table!(:foo) { primary_key(:id); String(:foo) }
|
28
|
+
end
|
29
|
+
|
30
|
+
dataset = Linkage::Dataset.new(database_options_for('sqlite'), 'foo')
|
31
|
+
results_uri = database_options_for('sqlite')
|
32
|
+
conf = dataset.link_with(dataset) do
|
33
|
+
lhs[:foo].must == rhs[:foo]
|
34
|
+
save_results_in(results_uri)
|
35
|
+
end
|
36
|
+
conf.result_set.create_tables!
|
37
|
+
assert_not_include conf.result_set.database.tables, :original_groups
|
38
|
+
end
|
39
|
+
|
40
|
+
test "#create_tables! doesn't create groups table when not needed" do
|
41
|
+
database_for('sqlite') do |db|
|
42
|
+
db.create_table!(:foo) { primary_key(:id); Integer(:foo) }
|
43
|
+
end
|
44
|
+
|
45
|
+
dataset = Linkage::Dataset.new(database_options_for('sqlite'), 'foo')
|
46
|
+
results_uri = database_options_for('sqlite')
|
47
|
+
conf = dataset.link_with(dataset) do
|
48
|
+
lhs[:foo].must be_within(5).of(rhs[:foo])
|
49
|
+
save_results_in(results_uri)
|
50
|
+
end
|
51
|
+
conf.result_set.create_tables!
|
52
|
+
assert_not_include conf.result_set.database.tables, :groups
|
53
|
+
end
|
54
|
+
|
55
|
+
test "#create_tables! creates scores table when there are exhaustive expectations" do
|
56
|
+
database_for('sqlite') do |db|
|
57
|
+
db.create_table!(:foo) { primary_key(:id); Integer(:foo) }
|
58
|
+
end
|
59
|
+
|
60
|
+
dataset = Linkage::Dataset.new(database_options_for('sqlite'), 'foo')
|
61
|
+
results_uri = database_options_for('sqlite')
|
62
|
+
conf = dataset.link_with(dataset) do
|
63
|
+
lhs[:foo].must be_within(5).of(rhs[:foo])
|
64
|
+
save_results_in(results_uri)
|
65
|
+
end
|
66
|
+
conf.result_set.create_tables!
|
67
|
+
assert_include conf.result_set.database.tables, :scores
|
68
|
+
end
|
69
|
+
|
70
|
+
test "#create_tables! doesn't create scores table when not needed" do
|
71
|
+
database_for('sqlite') do |db|
|
72
|
+
db.create_table!(:foo) { primary_key(:id); Integer(:foo) }
|
73
|
+
end
|
74
|
+
|
75
|
+
dataset = Linkage::Dataset.new(database_options_for('sqlite'), 'foo')
|
76
|
+
results_uri = database_options_for('sqlite')
|
77
|
+
conf = dataset.link_with(dataset) do
|
78
|
+
lhs[:foo].must == rhs[:foo]
|
79
|
+
save_results_in(results_uri)
|
80
|
+
end
|
81
|
+
conf.result_set.create_tables!
|
82
|
+
assert_not_include conf.result_set.database.tables, :scores
|
83
|
+
end
|
84
|
+
end
|
85
|
+
end
|
@@ -0,0 +1,84 @@
|
|
1
|
+
require 'helper'
|
2
|
+
|
3
|
+
module IntegrationTests
|
4
|
+
class TestScoring < Test::Unit::TestCase
|
5
|
+
test "stop scoring if must expectation fails" do
|
6
|
+
database_for('sqlite') do |db|
|
7
|
+
db.create_table(:foo) { primary_key(:id); Integer(:num) }
|
8
|
+
db.create_table(:bar) { primary_key(:id); Integer(:num) }
|
9
|
+
db[:foo].import([:id, :num], [[1, 1]])
|
10
|
+
db[:bar].import([:id, :num], [[1, 5]])
|
11
|
+
end
|
12
|
+
|
13
|
+
db_opts = database_options_for('sqlite')
|
14
|
+
dataset_1 = Linkage::Dataset.new(db_opts, "foo")
|
15
|
+
dataset_2 = Linkage::Dataset.new(db_opts, "bar")
|
16
|
+
conf = dataset_1.link_with(dataset_2) do
|
17
|
+
lhs[:num].must_not be_within(5).of(rhs[:num])
|
18
|
+
lhs[:num].must be_within(5).of(rhs[:num])
|
19
|
+
save_results_in(db_opts)
|
20
|
+
end
|
21
|
+
|
22
|
+
runner = Linkage::SingleThreadedRunner.new(conf)
|
23
|
+
runner.execute
|
24
|
+
|
25
|
+
database_for('sqlite') do |db|
|
26
|
+
assert_equal db[:scores].count, 1
|
27
|
+
record = db[:scores].first
|
28
|
+
assert_equal 1, record[:score]
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
32
|
+
test "scoring phase adds matches as needed" do
|
33
|
+
database_for('sqlite') do |db|
|
34
|
+
db.create_table(:foo) { primary_key(:id); Integer(:num) }
|
35
|
+
db.create_table(:bar) { primary_key(:id); Integer(:num) }
|
36
|
+
db[:foo].import([:id, :num], (0..15).collect { |i| [i, i] })
|
37
|
+
db[:bar].import([:id, :num], (0..15).collect { |i| [i, i] })
|
38
|
+
end
|
39
|
+
|
40
|
+
db_opts = database_options_for('sqlite')
|
41
|
+
dataset_1 = Linkage::Dataset.new(db_opts, "foo")
|
42
|
+
dataset_2 = Linkage::Dataset.new(db_opts, "bar")
|
43
|
+
conf = dataset_1.link_with(dataset_2) do
|
44
|
+
lhs[:num].must be_within(10).of(rhs[:num])
|
45
|
+
lhs[:num].must_not be_within(5).of(rhs[:num])
|
46
|
+
save_results_in(db_opts)
|
47
|
+
end
|
48
|
+
|
49
|
+
runner = Linkage::SingleThreadedRunner.new(conf)
|
50
|
+
runner.execute
|
51
|
+
|
52
|
+
database_for('sqlite') do |db|
|
53
|
+
assert_equal 80, db[:matches].count
|
54
|
+
db[:matches].order(:record_1_id, :record_2_id).each do |row|
|
55
|
+
assert_equal 1, row[:total_score]
|
56
|
+
assert_include 6..10, (row[:record_1_id] - row[:record_2_id]).abs
|
57
|
+
end
|
58
|
+
end
|
59
|
+
end
|
60
|
+
|
61
|
+
test "optimize scoring for self linkage" do
|
62
|
+
database_for('sqlite') do |db|
|
63
|
+
db.create_table(:foo) { primary_key(:id); Integer(:num) }
|
64
|
+
db[:foo].import([:id, :num], [[1, 1], [2, 5], [3, 10]])
|
65
|
+
end
|
66
|
+
|
67
|
+
db_opts = database_options_for('sqlite')
|
68
|
+
dataset = Linkage::Dataset.new(db_opts, "foo")
|
69
|
+
conf = dataset.link_with(dataset) do
|
70
|
+
lhs[:num].must be_within(5).of(rhs[:num])
|
71
|
+
save_results_in(db_opts)
|
72
|
+
end
|
73
|
+
|
74
|
+
runner = Linkage::SingleThreadedRunner.new(conf)
|
75
|
+
runner.execute
|
76
|
+
|
77
|
+
database_for('sqlite') do |db|
|
78
|
+
assert_equal db[:scores].count, 3
|
79
|
+
scores = db[:scores].order(:record_1_id, :record_2_id).select_map(:score)
|
80
|
+
assert_equal [1, 0, 1], scores
|
81
|
+
end
|
82
|
+
end
|
83
|
+
end
|
84
|
+
end
|
@@ -42,6 +42,11 @@ module IntegrationTests
|
|
42
42
|
dataset, _ = result_set.groups_records_datasets(group)
|
43
43
|
assert_equal 10, dataset.count
|
44
44
|
end
|
45
|
+
|
46
|
+
assert_equal 450, db[:matches].count
|
47
|
+
db[:matches].order(:record_1_id, :record_2_id).each do |row|
|
48
|
+
assert_equal row[:record_1_id] % 10, row[:record_2_id] % 10
|
49
|
+
end
|
45
50
|
end
|
46
51
|
end
|
47
52
|
|