linkage 0.0.6 → 0.0.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +10 -0
- data/Gemfile +15 -13
- data/Gemfile.lock +67 -37
- data/Guardfile +0 -2
- data/Rakefile +122 -25
- data/lib/linkage/comparator.rb +172 -0
- data/lib/linkage/comparators/binary.rb +12 -0
- data/lib/linkage/comparators/compare.rb +46 -0
- data/lib/linkage/comparators/within.rb +32 -0
- data/lib/linkage/configuration.rb +285 -153
- data/lib/linkage/data.rb +32 -7
- data/lib/linkage/dataset.rb +107 -32
- data/lib/linkage/decollation.rb +93 -0
- data/lib/linkage/expectation.rb +21 -0
- data/lib/linkage/expectations/exhaustive.rb +63 -0
- data/lib/linkage/expectations/simple.rb +168 -0
- data/lib/linkage/field.rb +30 -4
- data/lib/linkage/field_set.rb +6 -3
- data/lib/linkage/function.rb +50 -3
- data/lib/linkage/functions/binary.rb +30 -0
- data/lib/linkage/functions/cast.rb +54 -0
- data/lib/linkage/functions/length.rb +29 -0
- data/lib/linkage/functions/strftime.rb +12 -11
- data/lib/linkage/functions/trim.rb +8 -0
- data/lib/linkage/group.rb +20 -0
- data/lib/linkage/import_buffer.rb +5 -16
- data/lib/linkage/meta_object.rb +139 -0
- data/lib/linkage/result_set.rb +74 -17
- data/lib/linkage/runner/single_threaded.rb +125 -10
- data/lib/linkage/version.rb +3 -0
- data/lib/linkage.rb +11 -0
- data/linkage.gemspec +16 -121
- data/test/config.yml +5 -0
- data/test/helper.rb +73 -8
- data/test/integration/test_collation.rb +45 -0
- data/test/integration/test_configuration.rb +268 -0
- data/test/integration/test_cross_linkage.rb +4 -17
- data/test/integration/test_dataset.rb +45 -2
- data/test/integration/test_dual_linkage.rb +40 -24
- data/test/integration/test_functions.rb +22 -0
- data/test/integration/test_result_set.rb +85 -0
- data/test/integration/test_scoring.rb +84 -0
- data/test/integration/test_self_linkage.rb +5 -0
- data/test/integration/test_within_comparator.rb +100 -0
- data/test/unit/comparators/test_compare.rb +105 -0
- data/test/unit/comparators/test_within.rb +57 -0
- data/test/unit/expectations/test_exhaustive.rb +111 -0
- data/test/unit/expectations/test_simple.rb +303 -0
- data/test/unit/functions/test_binary.rb +54 -0
- data/test/unit/functions/test_cast.rb +98 -0
- data/test/unit/functions/test_length.rb +52 -0
- data/test/unit/functions/test_strftime.rb +17 -13
- data/test/unit/functions/test_trim.rb +11 -4
- data/test/unit/test_comparator.rb +124 -0
- data/test/unit/test_configuration.rb +137 -175
- data/test/unit/test_data.rb +44 -0
- data/test/unit/test_dataset.rb +73 -21
- data/test/unit/test_decollation.rb +201 -0
- data/test/unit/test_field.rb +38 -14
- data/test/unit/test_field_set.rb +12 -8
- data/test/unit/test_function.rb +83 -16
- data/test/unit/test_group.rb +28 -0
- data/test/unit/test_import_buffer.rb +13 -27
- data/test/unit/test_meta_object.rb +208 -0
- data/test/unit/test_result_set.rb +221 -3
- metadata +82 -190
@@ -0,0 +1,268 @@
|
|
1
|
+
require 'helper'
|
2
|
+
|
3
|
+
module IntegrationTests
|
4
|
+
class TestConfiguration < Test::Unit::TestCase
|
5
|
+
test "linkage_type is self when the two datasets are the same" do
|
6
|
+
database_for('sqlite') do |db|
|
7
|
+
db.create_table(:foo) { primary_key(:id); String(:foo); String(:bar) }
|
8
|
+
end
|
9
|
+
|
10
|
+
dataset = Linkage::Dataset.new(database_options_for('sqlite'), "foo")
|
11
|
+
conf = Linkage::Configuration.new(dataset, dataset)
|
12
|
+
assert_equal :self, conf.linkage_type
|
13
|
+
end
|
14
|
+
|
15
|
+
test "linkage_type is dual when the two datasets are different" do
|
16
|
+
database_for('sqlite') do |db|
|
17
|
+
db.create_table(:foo) { primary_key(:id); String(:foo); String(:bar) }
|
18
|
+
db.create_table(:bar) { primary_key(:id); String(:foo); String(:bar) }
|
19
|
+
end
|
20
|
+
|
21
|
+
dataset_1 = Linkage::Dataset.new(database_options_for('sqlite'), "foo")
|
22
|
+
dataset_2 = Linkage::Dataset.new(database_options_for('sqlite'), "bar")
|
23
|
+
conf = Linkage::Configuration.new(dataset_1, dataset_2)
|
24
|
+
assert_equal :dual, conf.linkage_type
|
25
|
+
end
|
26
|
+
|
27
|
+
test "linkage_type is cross when there's different filters on both sides" do
|
28
|
+
database_for('sqlite') do |db|
|
29
|
+
db.create_table(:foo) { primary_key(:id); String(:foo); String(:bar) }
|
30
|
+
end
|
31
|
+
|
32
|
+
dataset = Linkage::Dataset.new(database_options_for('sqlite'), "foo")
|
33
|
+
conf = Linkage::Configuration.new(dataset, dataset)
|
34
|
+
conf.configure do
|
35
|
+
lhs[:foo].must == "foo"
|
36
|
+
rhs[:foo].must == "bar"
|
37
|
+
end
|
38
|
+
assert_equal :cross, conf.linkage_type
|
39
|
+
end
|
40
|
+
|
41
|
+
test "linkage_type is self when there's identical static filters on each side" do
|
42
|
+
database_for('sqlite') do |db|
|
43
|
+
db.create_table(:foo) { primary_key(:id); String(:foo); String(:bar) }
|
44
|
+
end
|
45
|
+
|
46
|
+
dataset = Linkage::Dataset.new(database_options_for('sqlite'), "foo")
|
47
|
+
conf = Linkage::Configuration.new(dataset, dataset)
|
48
|
+
conf.configure do
|
49
|
+
lhs[:foo].must == "foo"
|
50
|
+
rhs[:foo].must == "foo"
|
51
|
+
end
|
52
|
+
assert_equal :self, conf.linkage_type
|
53
|
+
end
|
54
|
+
|
55
|
+
test "linkage_type is cross when exhaustive expectations use different fields" do
|
56
|
+
database_for('sqlite') do |db|
|
57
|
+
db.create_table(:foo) { primary_key(:id); Integer(:foo); Integer(:bar) }
|
58
|
+
end
|
59
|
+
|
60
|
+
dataset = Linkage::Dataset.new(database_options_for('sqlite'), "foo")
|
61
|
+
conf = Linkage::Configuration.new(dataset, dataset)
|
62
|
+
conf.configure do
|
63
|
+
lhs[:foo].must(be_within(5).of(rhs[:bar]))
|
64
|
+
end
|
65
|
+
assert_equal :cross, conf.linkage_type
|
66
|
+
end
|
67
|
+
|
68
|
+
test "static expectation" do
|
69
|
+
database_for('sqlite') do |db|
|
70
|
+
db.create_table(:foo) { primary_key(:id); String(:foo); String(:bar) }
|
71
|
+
end
|
72
|
+
|
73
|
+
dataset_1 = Linkage::Dataset.new(database_options_for('sqlite'), "foo")
|
74
|
+
conf = Linkage::Configuration.new(dataset_1, dataset_1)
|
75
|
+
conf.configure do
|
76
|
+
lhs[:foo].must == "foo"
|
77
|
+
end
|
78
|
+
|
79
|
+
dataset_2, _ = conf.datasets_with_applied_simple_expectations
|
80
|
+
assert_equal dataset_2.obj, dataset_1.filter(:foo => "foo").obj
|
81
|
+
end
|
82
|
+
|
83
|
+
test "complain if an invalid field is accessed" do
|
84
|
+
database_for('sqlite') do |db|
|
85
|
+
db.create_table(:foo) { primary_key(:id); String(:foo); String(:bar) }
|
86
|
+
end
|
87
|
+
|
88
|
+
dataset = Linkage::Dataset.new(database_options_for('sqlite'), "foo")
|
89
|
+
conf = Linkage::Configuration.new(dataset, dataset)
|
90
|
+
assert_raises(ArgumentError) do
|
91
|
+
conf.configure do
|
92
|
+
lhs[:foo].must == rhs[:non_existant_field]
|
93
|
+
end
|
94
|
+
end
|
95
|
+
end
|
96
|
+
|
97
|
+
operators = [:>, :<, :>=, :<=]
|
98
|
+
operators.each do |operator|
|
99
|
+
test "DSL #{operator} filter operator" do
|
100
|
+
database_for('sqlite') do |db|
|
101
|
+
db.create_table(:foo) { primary_key(:id); String(:foo); String(:bar) }
|
102
|
+
end
|
103
|
+
|
104
|
+
dataset_1 = Linkage::Dataset.new(database_options_for('sqlite'), "foo")
|
105
|
+
conf = Linkage::Configuration.new(dataset_1, dataset_1)
|
106
|
+
conf.configure do
|
107
|
+
lhs[:foo].must.send(operator, 123)
|
108
|
+
end
|
109
|
+
|
110
|
+
expr = Sequel::SQL::BooleanExpression.new(operator, Sequel::SQL::Identifier.new(:foo), 123)
|
111
|
+
dataset_2, _ = conf.datasets_with_applied_simple_expectations
|
112
|
+
assert_equal dataset_2.obj, dataset_1.filter(expr).obj
|
113
|
+
end
|
114
|
+
|
115
|
+
test "comparing two data sources with #{operator}" do
|
116
|
+
database_for('sqlite') do |db|
|
117
|
+
db.create_table(:foo) { primary_key(:id); Integer(:foo); Integer(:bar) }
|
118
|
+
end
|
119
|
+
|
120
|
+
dataset = Linkage::Dataset.new(database_options_for('sqlite'), "foo")
|
121
|
+
conf = Linkage::Configuration.new(dataset, dataset)
|
122
|
+
conf.configure do
|
123
|
+
lhs[:foo].must.send(operator, rhs[:bar])
|
124
|
+
end
|
125
|
+
assert_equal 1, conf.exhaustive_expectations.length
|
126
|
+
|
127
|
+
comp = conf.exhaustive_expectations[0].comparator
|
128
|
+
assert_instance_of Linkage::Comparators::Compare, comp
|
129
|
+
end
|
130
|
+
end
|
131
|
+
|
132
|
+
test "must_not expectation" do
|
133
|
+
database_for('sqlite') do |db|
|
134
|
+
db.create_table(:foo) { primary_key(:id); String(:foo); String(:bar) }
|
135
|
+
end
|
136
|
+
|
137
|
+
dataset_1 = Linkage::Dataset.new(database_options_for('sqlite'), "foo")
|
138
|
+
conf = Linkage::Configuration.new(dataset_1, dataset_1)
|
139
|
+
conf.configure do
|
140
|
+
lhs[:foo].must_not == "foo"
|
141
|
+
end
|
142
|
+
|
143
|
+
dataset_2, _ = conf.datasets_with_applied_simple_expectations
|
144
|
+
assert_equal dataset_2.obj, dataset_1.filter(~{:foo => "foo"}).obj
|
145
|
+
end
|
146
|
+
|
147
|
+
test "static database function" do
|
148
|
+
database_for('sqlite') do |db|
|
149
|
+
db.create_table(:foo) { primary_key(:id); String(:foo); String(:bar) }
|
150
|
+
end
|
151
|
+
|
152
|
+
dataset_1 = Linkage::Dataset.new(database_options_for('sqlite'), "foo")
|
153
|
+
conf = Linkage::Configuration.new(dataset_1, dataset_1)
|
154
|
+
conf.configure do
|
155
|
+
lhs[:foo].must == trim("foo")
|
156
|
+
end
|
157
|
+
|
158
|
+
dataset_2, _ = conf.datasets_with_applied_simple_expectations
|
159
|
+
assert_equal dataset_1.filter({:foo => :trim.sql_function("foo")}).obj, dataset_2.obj
|
160
|
+
end
|
161
|
+
|
162
|
+
test "save_results_in" do
|
163
|
+
database_for('sqlite') do |db|
|
164
|
+
db.create_table(:foo) { primary_key(:id); String(:foo); String(:bar) }
|
165
|
+
end
|
166
|
+
|
167
|
+
dataset_1 = Linkage::Dataset.new(database_options_for('sqlite'), "foo")
|
168
|
+
conf = Linkage::Configuration.new(dataset_1, dataset_1)
|
169
|
+
conf.configure do
|
170
|
+
save_results_in("mysql://localhost/results", {:foo => 'bar'})
|
171
|
+
end
|
172
|
+
assert_equal "mysql://localhost/results", conf.results_uri
|
173
|
+
assert_equal({:foo => 'bar'}, conf.results_uri_options)
|
174
|
+
end
|
175
|
+
|
176
|
+
test "case insensitive field names" do
|
177
|
+
database_for('sqlite') do |db|
|
178
|
+
db.create_table(:foo) { primary_key(:id); String(:foo); String(:bar) }
|
179
|
+
end
|
180
|
+
|
181
|
+
assert_nothing_raised do
|
182
|
+
dataset = Linkage::Dataset.new(database_options_for('sqlite'), "foo")
|
183
|
+
results_uri = database_options_for('sqlite')
|
184
|
+
conf = dataset.link_with(dataset) do
|
185
|
+
lhs[:Foo].must == rhs[:baR]
|
186
|
+
save_results_in(results_uri)
|
187
|
+
end
|
188
|
+
end
|
189
|
+
end
|
190
|
+
|
191
|
+
test "decollation_needed? is false when the datasets and results dataset all have the same database and collations" do
|
192
|
+
database_for('mysql') do |db|
|
193
|
+
db.create_table!(:foo) { primary_key(:id); String(:foo, :collate => :latin1_swedish_ci) }
|
194
|
+
db.create_table!(:bar) { primary_key(:id); String(:foo, :collate => :latin1_swedish_ci) }
|
195
|
+
end
|
196
|
+
|
197
|
+
dataset_1 = Linkage::Dataset.new(database_options_for('mysql'), 'foo')
|
198
|
+
dataset_2 = Linkage::Dataset.new(database_options_for('mysql'), 'bar')
|
199
|
+
conf = dataset_1.link_with(dataset_2) do
|
200
|
+
lhs[:foo].must == rhs[:foo]
|
201
|
+
end
|
202
|
+
conf.results_uri = database_options_for('mysql')
|
203
|
+
assert !conf.decollation_needed?
|
204
|
+
end
|
205
|
+
|
206
|
+
test "decollation_needed? is true when the datasets have different database types" do
|
207
|
+
database_for('mysql') do |db|
|
208
|
+
db.create_table!(:foo) { primary_key(:id); String(:foo) }
|
209
|
+
end
|
210
|
+
|
211
|
+
database_for('sqlite') do |db|
|
212
|
+
db.create_table!(:foo) { primary_key(:id); String(:foo) }
|
213
|
+
end
|
214
|
+
|
215
|
+
dataset_1 = Linkage::Dataset.new(database_options_for('mysql'), 'foo')
|
216
|
+
dataset_2 = Linkage::Dataset.new(database_options_for('sqlite'), 'foo')
|
217
|
+
conf = dataset_1.link_with(dataset_2) do
|
218
|
+
lhs[:foo].must == rhs[:foo]
|
219
|
+
end
|
220
|
+
conf.results_uri = database_options_for('mysql')
|
221
|
+
assert conf.decollation_needed?
|
222
|
+
end
|
223
|
+
|
224
|
+
test "decollation_needed? is true when the result dataset has different database type than the datasets" do
|
225
|
+
database_for('mysql') do |db|
|
226
|
+
db.create_table!(:foo) { primary_key(:id); String(:foo) }
|
227
|
+
db.create_table!(:bar) { primary_key(:id); String(:foo) }
|
228
|
+
end
|
229
|
+
|
230
|
+
dataset_1 = Linkage::Dataset.new(database_options_for('mysql'), 'foo')
|
231
|
+
dataset_2 = Linkage::Dataset.new(database_options_for('mysql'), 'bar')
|
232
|
+
conf = dataset_1.link_with(dataset_2) do
|
233
|
+
lhs[:foo].must == rhs[:foo]
|
234
|
+
end
|
235
|
+
conf.results_uri = database_options_for('sqlite')
|
236
|
+
assert conf.decollation_needed?
|
237
|
+
end
|
238
|
+
|
239
|
+
test "decollation_needed? is false when not comparing string columns" do
|
240
|
+
database_for('mysql') do |db|
|
241
|
+
db.create_table!(:foo) { primary_key(:id); Fixnum(:foo) }
|
242
|
+
end
|
243
|
+
|
244
|
+
database_for('sqlite') do |db|
|
245
|
+
db.create_table!(:foo) { primary_key(:id); Fixnum(:foo) }
|
246
|
+
end
|
247
|
+
|
248
|
+
dataset_1 = Linkage::Dataset.new(database_options_for('mysql'), 'foo')
|
249
|
+
dataset_2 = Linkage::Dataset.new(database_options_for('sqlite'), 'foo')
|
250
|
+
conf = dataset_1.link_with(dataset_2) do
|
251
|
+
lhs[:foo].must == rhs[:foo]
|
252
|
+
end
|
253
|
+
conf.results_uri = database_options_for('mysql')
|
254
|
+
assert !conf.decollation_needed?
|
255
|
+
end
|
256
|
+
|
257
|
+
test "creating comparator expectation for within" do
|
258
|
+
database_for('mysql') do |db|
|
259
|
+
db.create_table!(:foo) { primary_key(:id); Integer(:foo) }
|
260
|
+
end
|
261
|
+
dataset = Linkage::Dataset.new(database_options_for('mysql'), 'foo')
|
262
|
+
|
263
|
+
conf = dataset.link_with(dataset) do
|
264
|
+
lhs[:foo].must be_within(5).of(rhs[:foo])
|
265
|
+
end
|
266
|
+
end
|
267
|
+
end
|
268
|
+
end
|
@@ -16,10 +16,6 @@ module IntegrationTests
|
|
16
16
|
end
|
17
17
|
|
18
18
|
test "one mandatory field equality on single threaded runner" do
|
19
|
-
#setup_logger = Logger.new(STDERR)
|
20
|
-
#setup_logger.formatter = lambda { |severity, time, progname, msg|
|
21
|
-
#" SETUP : %s [%s]: %s\n" % [severity, time, msg]
|
22
|
-
#}
|
23
19
|
# insert the test data
|
24
20
|
database do |db|
|
25
21
|
db.create_table(:foo) { primary_key(:id); Integer(:foo); Integer(:bar) }
|
@@ -27,16 +23,8 @@ module IntegrationTests
|
|
27
23
|
Array.new(100) { |i| [i, i % 10, i % 5] })
|
28
24
|
end
|
29
25
|
|
30
|
-
#ds_logger = Logger.new(STDERR)
|
31
|
-
#ds_logger.formatter = lambda { |severity, time, progname, msg|
|
32
|
-
#"DATASET: %s [%s]: %s\n" % [severity, time, msg]
|
33
|
-
#}
|
34
26
|
ds = Linkage::Dataset.new(@tmpuri, "foo", :single_threaded => true)
|
35
27
|
|
36
|
-
#rs_logger = Logger.new(STDERR)
|
37
|
-
#rs_logger.formatter = lambda { |severity, time, progname, msg|
|
38
|
-
#"RESULTS: %s [%s]: %s\n" % [severity, time, msg]
|
39
|
-
#}
|
40
28
|
tmpuri = @tmpuri
|
41
29
|
conf = ds.link_with(ds) do
|
42
30
|
lhs[:foo].must == rhs[:bar]
|
@@ -52,11 +40,10 @@ module IntegrationTests
|
|
52
40
|
assert_equal i, row[:foo_bar]
|
53
41
|
end
|
54
42
|
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
#end
|
43
|
+
assert_equal 1000, db[:matches].count
|
44
|
+
db[:matches].order(:record_1_id, :record_2_id).each do |row|
|
45
|
+
assert_equal row[:record_1_id] % 10, row[:record_2_id] % 5
|
46
|
+
end
|
60
47
|
end
|
61
48
|
end
|
62
49
|
|
@@ -38,7 +38,7 @@ class IntegrationTests::TestDataset < Test::Unit::TestCase
|
|
38
38
|
end
|
39
39
|
|
40
40
|
ds = Linkage::Dataset.new(@tmpuri, "foo")
|
41
|
-
ds = ds.
|
41
|
+
ds = ds.group_match(Linkage::MetaObject.new(ds.field_set[:bar]))
|
42
42
|
ds.each_group do |group|
|
43
43
|
assert_equal({:bar => "foo"}, group.values)
|
44
44
|
assert_equal(2, group.count)
|
@@ -51,6 +51,26 @@ class IntegrationTests::TestDataset < Test::Unit::TestCase
|
|
51
51
|
assert_equal 3, groups.length
|
52
52
|
end
|
53
53
|
|
54
|
+
test "each_group with alias" do
|
55
|
+
database do |db|
|
56
|
+
db.create_table(:foo) do
|
57
|
+
primary_key :id
|
58
|
+
String :bar
|
59
|
+
end
|
60
|
+
db[:foo].import([:id, :bar], [[1, 'foo'], [2, 'foo'], [3, 'bar'], [4, 'baz']])
|
61
|
+
end
|
62
|
+
|
63
|
+
ds = Linkage::Dataset.new(@tmpuri, "foo")
|
64
|
+
ds = ds.group_match({
|
65
|
+
:meta_object => Linkage::MetaObject.new(ds.field_set[:bar]),
|
66
|
+
:alias => :bar_baz
|
67
|
+
})
|
68
|
+
ds.each_group do |group|
|
69
|
+
assert_equal({:bar_baz => "foo"}, group.values)
|
70
|
+
assert_equal(2, group.count)
|
71
|
+
end
|
72
|
+
end
|
73
|
+
|
54
74
|
test "each_group with filters" do
|
55
75
|
database do |db|
|
56
76
|
db.create_table(:foo) do
|
@@ -62,7 +82,7 @@ class IntegrationTests::TestDataset < Test::Unit::TestCase
|
|
62
82
|
end
|
63
83
|
|
64
84
|
ds = Linkage::Dataset.new(@tmpuri, "foo")
|
65
|
-
ds = ds.
|
85
|
+
ds = ds.group_match(Linkage::MetaObject.new(ds.field_set[:bar]))
|
66
86
|
ds = ds.filter { baz >= 3 }
|
67
87
|
groups = []
|
68
88
|
ds.each_group(1) do |group|
|
@@ -70,4 +90,27 @@ class IntegrationTests::TestDataset < Test::Unit::TestCase
|
|
70
90
|
end
|
71
91
|
assert_equal 2, groups.length
|
72
92
|
end
|
93
|
+
|
94
|
+
test "each_group with collation" do
|
95
|
+
database_for('mysql') do |db|
|
96
|
+
db.create_table!(:foo) do
|
97
|
+
primary_key :id
|
98
|
+
String :bar, :collate => :latin1_swedish_ci
|
99
|
+
end
|
100
|
+
db[:foo].import([:id, :bar], [[1, 'fOo'], [2, 'foO'], [3, 'bar'], [4, 'baz']])
|
101
|
+
end
|
102
|
+
|
103
|
+
ds = Linkage::Dataset.new(database_options_for('mysql'), "foo")
|
104
|
+
ds = ds.group_match(Linkage::MetaObject.new(ds.field_set[:bar]))
|
105
|
+
groups = []
|
106
|
+
ds.each_group(1) do |group|
|
107
|
+
groups << group
|
108
|
+
end
|
109
|
+
expected = [
|
110
|
+
{:bar => 'BAR'},
|
111
|
+
{:bar => 'BAZ'},
|
112
|
+
{:bar => 'FOO'}
|
113
|
+
]
|
114
|
+
assert_equal expected, groups.collect(&:decollated_values)
|
115
|
+
end
|
73
116
|
end
|
@@ -34,6 +34,8 @@ module IntegrationTests
|
|
34
34
|
lhs[:ssn].must == rhs[:ssn]
|
35
35
|
save_results_in(tmpuri)
|
36
36
|
end
|
37
|
+
assert_equal :dual, conf.linkage_type
|
38
|
+
|
37
39
|
runner = Linkage::SingleThreadedRunner.new(conf)
|
38
40
|
runner.execute
|
39
41
|
|
@@ -43,16 +45,10 @@ module IntegrationTests
|
|
43
45
|
assert_equal "12345678#{i%10}", row[:ssn]
|
44
46
|
end
|
45
47
|
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
#else
|
51
|
-
#assert_equal 1, row[:dataset], row.inspect
|
52
|
-
#end
|
53
|
-
#expected_group_id = i / 20 + 1
|
54
|
-
#assert_equal expected_group_id, row[:group_id], "Record #{row.inspect} should have been in group #{expected_group_id}"
|
55
|
-
#end
|
48
|
+
assert_equal 1000, db[:matches].count
|
49
|
+
db[:matches].order(:record_1_id, :record_2_id).each do |row|
|
50
|
+
assert_equal row[:record_1_id] % 10, row[:record_2_id] % 10
|
51
|
+
end
|
56
52
|
end
|
57
53
|
end
|
58
54
|
|
@@ -87,32 +83,52 @@ module IntegrationTests
|
|
87
83
|
end
|
88
84
|
|
89
85
|
test "reacts properly when using two databases with different string equality methods" do
|
90
|
-
|
91
|
-
|
86
|
+
foo_logger = nil #prefixed_logger("FOO")
|
87
|
+
bar_logger = nil #prefixed_logger("BAR")
|
88
|
+
|
89
|
+
database_for('mysql', :logger => foo_logger) do |db|
|
90
|
+
db.create_table!(:foo) do
|
91
|
+
primary_key(:id)
|
92
|
+
String :baz, :collate => "latin1_swedish_ci"
|
93
|
+
end
|
94
|
+
db[:foo].import([:id, :baz], [
|
95
|
+
[1, "tEst"],
|
96
|
+
[2, "teSt"],
|
97
|
+
[3, "tesT "],
|
98
|
+
[4, "TEST"],
|
99
|
+
[5, "junk"]
|
100
|
+
])
|
92
101
|
end
|
93
|
-
uri = "mysql2://%s:%s/%s?user=%s" % test_config['mysql'].values_at('host', 'port', 'database', 'user')
|
94
|
-
Sequel.connect(uri) do |db|
|
95
|
-
db.create_table!(:foo) { primary_key(:id); String(:one); String(:two) }
|
96
|
-
db[:foo].import([:id, :one, :two], [[1, "", "test"], [2, "", "test"], [3, " ", "test "], [4, "", "test"], [5, "", "junk"]])
|
97
102
|
|
98
|
-
|
99
|
-
db
|
103
|
+
database_for('mysql', :logger => bar_logger) do |db|
|
104
|
+
db.create_table!(:bar) do
|
105
|
+
primary_key(:id)
|
106
|
+
String :baz, :collate => "latin1_swedish_ci"
|
107
|
+
end
|
108
|
+
db[:bar].import([:id, :baz], [
|
109
|
+
[1, "Test "],
|
110
|
+
[2, "tEst "],
|
111
|
+
[3, "teSt"],
|
112
|
+
[4, "TEST"],
|
113
|
+
[5, "junk"]
|
114
|
+
])
|
100
115
|
end
|
101
116
|
|
102
|
-
|
103
|
-
|
117
|
+
options = database_options_for('mysql')
|
118
|
+
ds_1 = Linkage::Dataset.new(options, "foo", :logger => foo_logger)
|
119
|
+
ds_2 = Linkage::Dataset.new(options, "bar", :logger => bar_logger)
|
104
120
|
tmpuri = @tmpuri
|
121
|
+
results_logger = nil #prefixed_logger("RESULTS")
|
105
122
|
conf = ds_1.link_with(ds_2) do
|
106
|
-
lhs[:
|
107
|
-
|
108
|
-
save_results_in(tmpuri)
|
123
|
+
lhs[:baz].must == rhs[:baz]
|
124
|
+
save_results_in(tmpuri, :logger => results_logger)
|
109
125
|
end
|
110
126
|
|
111
127
|
runner = Linkage::SingleThreadedRunner.new(conf)
|
112
128
|
runner.execute
|
113
129
|
|
114
130
|
database do |db|
|
115
|
-
assert_equal
|
131
|
+
assert_equal 2, db[:groups].count
|
116
132
|
end
|
117
133
|
end
|
118
134
|
end
|
@@ -62,5 +62,27 @@ module IntegrationTests
|
|
62
62
|
assert_equal 1, db[:groups].count
|
63
63
|
end
|
64
64
|
end
|
65
|
+
|
66
|
+
test "binary function with static argument" do
|
67
|
+
database do |db|
|
68
|
+
db.create_table(:foo) { primary_key(:id); String(:bar) }
|
69
|
+
db[:foo].import([:id, :bar], [[1, 'foo'], [2, 'foo']])
|
70
|
+
end
|
71
|
+
|
72
|
+
ds = Linkage::Dataset.new(@tmpuri, "foo")
|
73
|
+
tmpuri = @tmpuri
|
74
|
+
conf = ds.link_with(ds) do
|
75
|
+
lhs[:bar].must == rhs[:bar]
|
76
|
+
binary(lhs[:bar]).must == binary('foo')
|
77
|
+
binary(rhs[:bar]).must == binary('foo')
|
78
|
+
save_results_in(tmpuri)
|
79
|
+
end
|
80
|
+
runner = Linkage::SingleThreadedRunner.new(conf)
|
81
|
+
runner.execute
|
82
|
+
|
83
|
+
database do |db|
|
84
|
+
assert_equal 1, db[:groups].count
|
85
|
+
end
|
86
|
+
end
|
65
87
|
end
|
66
88
|
end
|
@@ -0,0 +1,85 @@
|
|
1
|
+
require 'helper'
|
2
|
+
|
3
|
+
module IntegrationTests
|
4
|
+
class TestResultSet < Test::Unit::TestCase
|
5
|
+
test "#create_tables! creates original_groups table when decollation is needed" do
|
6
|
+
database_for('sqlite') do |db|
|
7
|
+
db.create_table!(:foo) { primary_key(:id); String(:foo) }
|
8
|
+
end
|
9
|
+
|
10
|
+
database_for('mysql') do |db|
|
11
|
+
db.create_table!(:foo) { primary_key(:id); String(:foo) }
|
12
|
+
end
|
13
|
+
|
14
|
+
dataset_1 = Linkage::Dataset.new(database_options_for('sqlite'), 'foo')
|
15
|
+
dataset_2 = Linkage::Dataset.new(database_options_for('mysql'), 'foo')
|
16
|
+
results_uri = database_options_for('sqlite')
|
17
|
+
conf = dataset_1.link_with(dataset_2) do
|
18
|
+
lhs[:foo].must == rhs[:foo]
|
19
|
+
save_results_in(results_uri)
|
20
|
+
end
|
21
|
+
conf.result_set.create_tables!
|
22
|
+
assert_include conf.result_set.database.tables, :original_groups
|
23
|
+
end
|
24
|
+
|
25
|
+
test "#create_tables! doesn't create original_groups table when decollation is needed" do
|
26
|
+
database_for('sqlite') do |db|
|
27
|
+
db.create_table!(:foo) { primary_key(:id); String(:foo) }
|
28
|
+
end
|
29
|
+
|
30
|
+
dataset = Linkage::Dataset.new(database_options_for('sqlite'), 'foo')
|
31
|
+
results_uri = database_options_for('sqlite')
|
32
|
+
conf = dataset.link_with(dataset) do
|
33
|
+
lhs[:foo].must == rhs[:foo]
|
34
|
+
save_results_in(results_uri)
|
35
|
+
end
|
36
|
+
conf.result_set.create_tables!
|
37
|
+
assert_not_include conf.result_set.database.tables, :original_groups
|
38
|
+
end
|
39
|
+
|
40
|
+
test "#create_tables! doesn't create groups table when not needed" do
|
41
|
+
database_for('sqlite') do |db|
|
42
|
+
db.create_table!(:foo) { primary_key(:id); Integer(:foo) }
|
43
|
+
end
|
44
|
+
|
45
|
+
dataset = Linkage::Dataset.new(database_options_for('sqlite'), 'foo')
|
46
|
+
results_uri = database_options_for('sqlite')
|
47
|
+
conf = dataset.link_with(dataset) do
|
48
|
+
lhs[:foo].must be_within(5).of(rhs[:foo])
|
49
|
+
save_results_in(results_uri)
|
50
|
+
end
|
51
|
+
conf.result_set.create_tables!
|
52
|
+
assert_not_include conf.result_set.database.tables, :groups
|
53
|
+
end
|
54
|
+
|
55
|
+
test "#create_tables! creates scores table when there are exhaustive expectations" do
|
56
|
+
database_for('sqlite') do |db|
|
57
|
+
db.create_table!(:foo) { primary_key(:id); Integer(:foo) }
|
58
|
+
end
|
59
|
+
|
60
|
+
dataset = Linkage::Dataset.new(database_options_for('sqlite'), 'foo')
|
61
|
+
results_uri = database_options_for('sqlite')
|
62
|
+
conf = dataset.link_with(dataset) do
|
63
|
+
lhs[:foo].must be_within(5).of(rhs[:foo])
|
64
|
+
save_results_in(results_uri)
|
65
|
+
end
|
66
|
+
conf.result_set.create_tables!
|
67
|
+
assert_include conf.result_set.database.tables, :scores
|
68
|
+
end
|
69
|
+
|
70
|
+
test "#create_tables! doesn't create scores table when not needed" do
|
71
|
+
database_for('sqlite') do |db|
|
72
|
+
db.create_table!(:foo) { primary_key(:id); Integer(:foo) }
|
73
|
+
end
|
74
|
+
|
75
|
+
dataset = Linkage::Dataset.new(database_options_for('sqlite'), 'foo')
|
76
|
+
results_uri = database_options_for('sqlite')
|
77
|
+
conf = dataset.link_with(dataset) do
|
78
|
+
lhs[:foo].must == rhs[:foo]
|
79
|
+
save_results_in(results_uri)
|
80
|
+
end
|
81
|
+
conf.result_set.create_tables!
|
82
|
+
assert_not_include conf.result_set.database.tables, :scores
|
83
|
+
end
|
84
|
+
end
|
85
|
+
end
|
@@ -0,0 +1,84 @@
|
|
1
|
+
require 'helper'
|
2
|
+
|
3
|
+
module IntegrationTests
|
4
|
+
class TestScoring < Test::Unit::TestCase
|
5
|
+
test "stop scoring if must expectation fails" do
|
6
|
+
database_for('sqlite') do |db|
|
7
|
+
db.create_table(:foo) { primary_key(:id); Integer(:num) }
|
8
|
+
db.create_table(:bar) { primary_key(:id); Integer(:num) }
|
9
|
+
db[:foo].import([:id, :num], [[1, 1]])
|
10
|
+
db[:bar].import([:id, :num], [[1, 5]])
|
11
|
+
end
|
12
|
+
|
13
|
+
db_opts = database_options_for('sqlite')
|
14
|
+
dataset_1 = Linkage::Dataset.new(db_opts, "foo")
|
15
|
+
dataset_2 = Linkage::Dataset.new(db_opts, "bar")
|
16
|
+
conf = dataset_1.link_with(dataset_2) do
|
17
|
+
lhs[:num].must_not be_within(5).of(rhs[:num])
|
18
|
+
lhs[:num].must be_within(5).of(rhs[:num])
|
19
|
+
save_results_in(db_opts)
|
20
|
+
end
|
21
|
+
|
22
|
+
runner = Linkage::SingleThreadedRunner.new(conf)
|
23
|
+
runner.execute
|
24
|
+
|
25
|
+
database_for('sqlite') do |db|
|
26
|
+
assert_equal db[:scores].count, 1
|
27
|
+
record = db[:scores].first
|
28
|
+
assert_equal 1, record[:score]
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
32
|
+
test "scoring phase adds matches as needed" do
|
33
|
+
database_for('sqlite') do |db|
|
34
|
+
db.create_table(:foo) { primary_key(:id); Integer(:num) }
|
35
|
+
db.create_table(:bar) { primary_key(:id); Integer(:num) }
|
36
|
+
db[:foo].import([:id, :num], (0..15).collect { |i| [i, i] })
|
37
|
+
db[:bar].import([:id, :num], (0..15).collect { |i| [i, i] })
|
38
|
+
end
|
39
|
+
|
40
|
+
db_opts = database_options_for('sqlite')
|
41
|
+
dataset_1 = Linkage::Dataset.new(db_opts, "foo")
|
42
|
+
dataset_2 = Linkage::Dataset.new(db_opts, "bar")
|
43
|
+
conf = dataset_1.link_with(dataset_2) do
|
44
|
+
lhs[:num].must be_within(10).of(rhs[:num])
|
45
|
+
lhs[:num].must_not be_within(5).of(rhs[:num])
|
46
|
+
save_results_in(db_opts)
|
47
|
+
end
|
48
|
+
|
49
|
+
runner = Linkage::SingleThreadedRunner.new(conf)
|
50
|
+
runner.execute
|
51
|
+
|
52
|
+
database_for('sqlite') do |db|
|
53
|
+
assert_equal 80, db[:matches].count
|
54
|
+
db[:matches].order(:record_1_id, :record_2_id).each do |row|
|
55
|
+
assert_equal 1, row[:total_score]
|
56
|
+
assert_include 6..10, (row[:record_1_id] - row[:record_2_id]).abs
|
57
|
+
end
|
58
|
+
end
|
59
|
+
end
|
60
|
+
|
61
|
+
test "optimize scoring for self linkage" do
|
62
|
+
database_for('sqlite') do |db|
|
63
|
+
db.create_table(:foo) { primary_key(:id); Integer(:num) }
|
64
|
+
db[:foo].import([:id, :num], [[1, 1], [2, 5], [3, 10]])
|
65
|
+
end
|
66
|
+
|
67
|
+
db_opts = database_options_for('sqlite')
|
68
|
+
dataset = Linkage::Dataset.new(db_opts, "foo")
|
69
|
+
conf = dataset.link_with(dataset) do
|
70
|
+
lhs[:num].must be_within(5).of(rhs[:num])
|
71
|
+
save_results_in(db_opts)
|
72
|
+
end
|
73
|
+
|
74
|
+
runner = Linkage::SingleThreadedRunner.new(conf)
|
75
|
+
runner.execute
|
76
|
+
|
77
|
+
database_for('sqlite') do |db|
|
78
|
+
assert_equal db[:scores].count, 3
|
79
|
+
scores = db[:scores].order(:record_1_id, :record_2_id).select_map(:score)
|
80
|
+
assert_equal [1, 0, 1], scores
|
81
|
+
end
|
82
|
+
end
|
83
|
+
end
|
84
|
+
end
|
@@ -42,6 +42,11 @@ module IntegrationTests
|
|
42
42
|
dataset, _ = result_set.groups_records_datasets(group)
|
43
43
|
assert_equal 10, dataset.count
|
44
44
|
end
|
45
|
+
|
46
|
+
assert_equal 450, db[:matches].count
|
47
|
+
db[:matches].order(:record_1_id, :record_2_id).each do |row|
|
48
|
+
assert_equal row[:record_1_id] % 10, row[:record_2_id] % 10
|
49
|
+
end
|
45
50
|
end
|
46
51
|
end
|
47
52
|
|