linkage 0.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,43 @@
1
+ require 'rubygems'
2
+ require 'bundler'
3
# Ask Bundler to set up the :default and :development groups before the rest
# of the test dependencies are required. On a Bundler error, print the error
# message and an install hint to stderr, then exit with the error's own
# status code.
begin
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => bundler_error
  $stderr.puts(bundler_error.message, "Run `bundle install` to install missing gems")
  exit(bundler_error.status_code)
end
10
+ require 'test/unit'
11
+ require 'mocha'
12
+ require 'tmpdir'
13
+ require 'logger'
14
+ require 'pp'
15
+ require 'versionomy'
16
+
17
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
18
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
19
+ require 'linkage'
20
+
21
# Helpers added to Test::Unit::TestCase so that every test case can use them.
class Test::Unit::TestCase
  # Builds a stand-in Linkage::Field via `allocate`, i.e. without running
  # its initializer.
  #
  # name    - accepted at the call site as a label; not used in this method.
  # options - Hash handed unchanged to `stubs` on the new object.
  # block   - optional block evaluated with the new object as `self`.
  #
  # Returns the allocated object. Its `is_a?` is stubbed to return false in
  # general and true when called with Linkage::Field.
  def stub_field(name, options = {}, &block)
    field = Linkage::Field.allocate
    field.stubs(options)
    field.instance_eval(&block) if block

    # Keep the generic stub before the argument-specific one.
    field.stubs(:is_a?).returns(false)
    field.stubs(:is_a?).with(Linkage::Field).returns(true)
    field
  end

  class << self
    # Memoized result of parsing RUBY_VERSION with Versionomy.
    def current_ruby_version
      @current_ruby_version ||= Versionomy.parse(RUBY_VERSION)
    end

    # Memoized result of parsing "1.9" with Versionomy.
    def ruby19
      @ruby19 ||= Versionomy.parse("1.9")
    end
  end
end
41
+
42
# Empty namespaces; the unit and integration test cases are defined inside them.
module UnitTests
end

module IntegrationTests
end
@@ -0,0 +1,68 @@
1
+ require 'helper'
2
+
3
module IntegrationTests
  # Integration tests that link the table "foo" with itself using criteria
  # that differ between the two sides, executed with
  # Linkage::SingleThreadedRunner against a temporary SQLite database.
  class TestCrossLinkage < Test::Unit::TestCase
    # Creates a fresh temporary directory and derives the SQLite URI from it.
    def setup
      @tmpdir = Dir.mktmpdir('linkage')
      @tmpuri = "sqlite://#{File.join(@tmpdir, "foo")}"
    end

    # Deletes the temporary directory created by setup.
    def teardown
      FileUtils.remove_entry_secure(@tmpdir)
    end

    # Passes the given block to Sequel.connect for the per-test database URI.
    def database(&block)
      Sequel.connect(@tmpuri, &block)
    end

    test "one mandatory field equality on single threaded runner" do
      # Seed data: 100 rows, foo = id % 10 and bar = id % 5.
      seed_rows = (0...100).map { |n| [n, n % 10, n % 5] }
      database do |db|
        db.create_table(:foo) do
          primary_key :id
          Integer :foo
          Integer :bar
        end
        db[:foo].import([:id, :foo, :bar], seed_rows)
      end

      dataset = Linkage::Dataset.new(@tmpuri, "foo", :single_threaded => true)
      config = dataset.link_with(dataset) do
        lhs[:foo].must == rhs[:bar]
      end
      Linkage::SingleThreadedRunner.new(config, @tmpuri).execute

      database do |db|
        groups = db[:groups]
        assert_equal 5, groups.count
        groups.order(:foo_bar).each_with_index do |group, position|
          assert_equal position, group[:foo_bar]
        end

        memberships = db[:groups_records]
        assert_equal 150, memberships.count
        memberships.order(:group_id, :dataset, :record_id).each do |membership|
          record_id = membership[:record_id]
          expected_group_id = (record_id % 5) + 1
          assert_equal expected_group_id, membership[:group_id], "Record #{record_id} should have been in group #{expected_group_id}"
        end
      end
    end

    test "match same field with different filters" do
      # Seed data: 100 rows, foo = id % 10 and bar = id % 20.
      seed_rows = (0...100).map { |n| [n, n % 10, n % 20] }
      database do |db|
        db.create_table(:foo) do
          primary_key :id
          Integer :foo
          Integer :bar
        end
        db[:foo].import([:id, :foo, :bar], seed_rows)
      end

      dataset = Linkage::Dataset.new(@tmpuri, "foo", :single_threaded => true)
      config = dataset.link_with(dataset) do
        lhs[:foo].must == rhs[:foo]
        lhs[:bar].must == 0
        rhs[:bar].must == 10
      end
      Linkage::SingleThreadedRunner.new(config, @tmpuri).execute

      database do |db|
        assert_equal 1, db[:groups].count
      end
    end
  end
end
@@ -0,0 +1,85 @@
1
+ require 'helper'
2
+
3
module IntegrationTests
  # Integration tests that link two different tables ("foo" and "bar") on
  # their ssn column, executed with Linkage::SingleThreadedRunner against a
  # temporary SQLite database.
  class TestDualLinkage < Test::Unit::TestCase
    # Creates a fresh temporary directory and derives the SQLite URI from it.
    def setup
      @tmpdir = Dir.mktmpdir('linkage')
      @tmpuri = "sqlite://#{File.join(@tmpdir, "foo")}"
    end

    # Deletes the temporary directory created by setup.
    def teardown
      FileUtils.remove_entry_secure(@tmpdir)
    end

    # Passes the given block to Sequel.connect for the per-test database URI.
    def database(&block)
      Sequel.connect(@tmpuri, &block)
    end

    test "one mandatory field equality on single threaded runner" do
      # Both tables receive the same 100 rows; the ssn ends in id % 10, so
      # there are 10 distinct ssn values.
      seed_rows = (0...100).map { |n| [n, "12345678#{n%10}"] }
      database do |db|
        [:foo, :bar].each do |table|
          db.create_table(table) do
            primary_key :id
            String :ssn
          end
          db[table].import([:id, :ssn], seed_rows)
        end
      end

      left = Linkage::Dataset.new(@tmpuri, "foo", :single_threaded => true)
      right = Linkage::Dataset.new(@tmpuri, "bar", :single_threaded => true)
      config = left.link_with(right) do
        lhs[:ssn].must == rhs[:ssn]
      end
      Linkage::SingleThreadedRunner.new(config, @tmpuri).execute

      database do |db|
        groups = db[:groups]
        assert_equal 10, groups.count
        groups.order(:ssn).each_with_index do |group, i|
          assert_equal "12345678#{i%10}", group[:ssn]
        end

        memberships = db[:groups_records]
        assert_equal 200, memberships.count
        memberships.order(:group_id, :dataset, :record_id).each_with_index do |row, i|
          # Each run of 20 rows is one group: the first 10 rows are expected
          # from dataset 1, the next 10 from dataset 2.
          expected_dataset = (i % 20 < 10) ? 1 : 2
          assert_equal expected_dataset, row[:dataset], row.inspect

          expected_group_id = i / 20 + 1
          assert_equal expected_group_id, row[:group_id], "Record #{row.inspect} should have been in group #{expected_group_id}"
        end
      end
    end

    test "don't ignore 1-record groups before the combining phase" do
      # Every ssn is unique within a table (zero-padded id), so each value
      # appears once in "foo" and once in "bar".
      seed_rows = (0...100).map { |n| [n, "1234567%03d" % n] }
      database do |db|
        [:foo, :bar].each do |table|
          db.create_table(table) do
            primary_key :id
            String :ssn
          end
          db[table].import([:id, :ssn], seed_rows)
        end
      end

      left = Linkage::Dataset.new(@tmpuri, "foo", :single_threaded => true)
      right = Linkage::Dataset.new(@tmpuri, "bar", :single_threaded => true)
      config = left.link_with(right) do
        lhs[:ssn].must == rhs[:ssn]
      end
      Linkage::SingleThreadedRunner.new(config, @tmpuri).execute

      database do |db|
        groups = db[:groups]
        assert_equal 100, groups.count
        groups.order(:ssn).each_with_index do |group, i|
          assert_equal "1234567%03d" % i, group[:ssn]
        end
      end
    end
  end
end
@@ -0,0 +1,209 @@
1
+ require 'helper'
2
+
3
module IntegrationTests
  # Integration tests that link the table "foo" with itself, executed with
  # Linkage::SingleThreadedRunner against a temporary SQLite database.
  #
  # Every test seeds 100 rows, runs the linkage, and then checks the
  # resulting `groups` and `groups_records` tables.
  class TestSelfLinkage < Test::Unit::TestCase
    # Creates a fresh temporary directory and derives the SQLite URI from it.
    def setup
      @tmpdir = Dir.mktmpdir('linkage')
      @tmpuri = "sqlite://" + File.join(@tmpdir, "foo")
    end

    # Passes the given block to Sequel.connect for the per-test database URI.
    def database(&block)
      Sequel.connect(@tmpuri, &block)
    end

    # Deletes the temporary directory created by setup.
    def teardown
      FileUtils.remove_entry_secure(@tmpdir)
    end

    test "one mandatory field equality on single threaded runner" do
      # insert the test data: ssn ends in id % 10
      database do |db|
        db.create_table(:foo) { primary_key(:id); String(:ssn) }
        db[:foo].import([:id, :ssn],
          Array.new(100) { |i| [i, "12345678#{i%10}"] })
      end

      ds = Linkage::Dataset.new(@tmpuri, "foo", :single_threaded => true)
      conf = ds.link_with(ds) do
        lhs[:ssn].must == rhs[:ssn]
      end
      runner = Linkage::SingleThreadedRunner.new(conf, @tmpuri)
      runner.execute

      database do |db|
        assert_equal 10, db[:groups].count
        db[:groups].order(:ssn).each_with_index do |row, i|
          assert_equal "12345678#{i%10}", row[:ssn]
        end

        assert_equal 100, db[:groups_records].count
        db[:groups_records].order(:record_id).each do |row|
          expected_group_id = (row[:record_id] % 10) + 1
          assert_equal expected_group_id, row[:group_id], "Record #{row[:record_id]} should have been in group #{expected_group_id}"
        end
      end
    end

    test "two mandatory field equalities on single threaded runner" do
      # insert the test data: ssn ends in id % 10, dob day is (id % 20) + 1,
      # so every ssn occurs with exactly two dates of birth
      database do |db|
        db.create_table(:foo) { primary_key(:id); String(:ssn); Date(:dob) }
        db[:foo].import([:id, :ssn, :dob],
          Array.new(100) { |i| [i, "12345678#{i%10}", Date.civil(1985, 1, (i % 20) + 1)] })
      end

      ds = Linkage::Dataset.new(@tmpuri, "foo", :single_threaded => true)
      conf = ds.link_with(ds) do
        lhs[:ssn].must == rhs[:ssn]
        lhs[:dob].must == rhs[:dob]
      end
      runner = Linkage::SingleThreadedRunner.new(conf, @tmpuri)
      runner.execute

      database do |db|
        assert_equal 20, db[:groups].count
        # Two groups share each ssn, and the assertions below expect the
        # earlier dob first. Order by dob as well so the position of the
        # tied rows does not depend on how the database breaks ties.
        db[:groups].order(:ssn, :dob).each_with_index do |row, i|
          assert_equal "12345678#{i/2}", row[:ssn]
          assert_equal Date.civil(1985, 1, i / 2 + 1 + (i % 2 == 0 ? 0 : 10)), row[:dob]
        end

        assert_equal 100, db[:groups_records].count
        db[:groups_records].order(:record_id).each do |row|
          v = row[:record_id] % 20
          expected_group_id = v < 10 ? 1 + 2 * v : 2 * (v % 10 + 1)
          assert_equal expected_group_id, row[:group_id], "Record #{row[:record_id]} should have been in group #{expected_group_id}"
        end
      end
    end

    test "one mandatory field equality on single threaded runner, with filter" do
      # insert the test data: ssn ends in id % 10, mod_5 is id % 5
      database do |db|
        db.create_table(:foo) { primary_key(:id); String(:ssn); Integer(:mod_5) }
        db[:foo].import([:id, :ssn, :mod_5],
          Array.new(100) { |i| [i, "12345678#{i%10}", i % 5] })
      end

      ds = Linkage::Dataset.new(@tmpuri, "foo", :single_threaded => true)
      conf = ds.link_with(ds) do
        lhs[:ssn].must == rhs[:ssn]
        lhs[:mod_5].must == 3
      end
      runner = Linkage::SingleThreadedRunner.new(conf, @tmpuri)
      runner.execute

      database do |db|
        # Only rows with mod_5 == 3 remain; their ssn ends in 3 or 8.
        assert_equal 2, db[:groups].count
        db[:groups].order(:ssn).each_with_index do |row, i|
          assert_equal "12345678#{(i * 5) + 3}", row[:ssn]
        end

        assert_equal 20, db[:groups_records].count
        db[:groups_records].order(:record_id).each do |row|
          expected_group_id = (row[:record_id] / 5) % 2 + 1
          assert_equal expected_group_id, row[:group_id], "Record #{row[:record_id]} should have been in group #{expected_group_id}"
        end
      end
    end

    test "one mandatory field equality on single threaded runner, with inequality filters" do
      # insert the test data: ssn ends in id % 10, mod_5 is id % 5
      database do |db|
        db.create_table(:foo) { primary_key(:id); String(:ssn); Integer(:mod_5) }
        db[:foo].import([:id, :ssn, :mod_5],
          Array.new(100) { |i| [i, "12345678#{i%10}", i % 5] })
      end

      ds = Linkage::Dataset.new(@tmpuri, "foo", :single_threaded => true)
      conf = ds.link_with(ds) do
        lhs[:ssn].must == rhs[:ssn]
        lhs[:mod_5].must > 2
        lhs[:mod_5].must <= 3
      end
      runner = Linkage::SingleThreadedRunner.new(conf, @tmpuri)
      runner.execute

      database do |db|
        # "> 2" combined with "<= 3" leaves mod_5 == 3, so the expectations
        # match the equality-filter test.
        assert_equal 2, db[:groups].count
        db[:groups].order(:ssn).each_with_index do |row, i|
          assert_equal "12345678#{(i * 5) + 3}", row[:ssn]
        end

        assert_equal 20, db[:groups_records].count
        db[:groups_records].order(:record_id).each do |row|
          expected_group_id = (row[:record_id] / 5) % 2 + 1
          assert_equal expected_group_id, row[:group_id], "Record #{row[:record_id]} should have been in group #{expected_group_id}"
        end
      end
    end

    test "one mandatory field equality on single threaded runner, with field filter" do
      # insert the test data: ssn ends in id % 10, mod_5 is id % 5,
      # mod_20 is id % 20
      database do |db|
        db.create_table(:foo) { primary_key(:id); String(:ssn); Integer(:mod_5); Integer(:mod_20) }
        db[:foo].import([:id, :ssn, :mod_5, :mod_20],
          Array.new(100) { |i| [i, "123456789#{i%10}", i % 5, i % 20] })
      end

      ds = Linkage::Dataset.new(@tmpuri, "foo", :single_threaded => true)
      conf = ds.link_with(ds) do
        lhs[:ssn].must == rhs[:ssn]
        lhs[:mod_5].must == lhs[:mod_20]
      end
      assert_equal :self, conf.linkage_type
      runner = Linkage::SingleThreadedRunner.new(conf, @tmpuri)
      runner.execute

      database do |db|
        # mod_5 == mod_20 holds only when id % 20 is in 0..4.
        assert_equal 5, db[:groups].count
        db[:groups].order(:ssn).each_with_index do |row, i|
          assert_equal "123456789#{i}", row[:ssn]
        end

        assert_equal 25, db[:groups_records].count
        db[:groups_records].order(:record_id).each do |row|
          expected_group_id = row[:record_id] % 5 + 1
          assert_equal expected_group_id, row[:group_id], "Record #{row[:record_id]} should have been in group #{expected_group_id}"
        end
      end
    end

    test "one mandatory field equality on single threaded runner, with identical filters" do
      # insert the test data: ssn ends in id % 10, mod_5 is id % 5,
      # mod_20 is id % 20
      database do |db|
        db.create_table(:foo) { primary_key(:id); String(:ssn); Integer(:mod_5); Integer(:mod_20) }
        db[:foo].import([:id, :ssn, :mod_5, :mod_20],
          Array.new(100) { |i| [i, "123456789#{i%10}", i % 5, i % 20] })
      end

      ds = Linkage::Dataset.new(@tmpuri, "foo", :single_threaded => true)
      conf = ds.link_with(ds) do
        lhs[:ssn].must == rhs[:ssn]
        lhs[:mod_5].must == lhs[:mod_20]
        rhs[:mod_5].must == rhs[:mod_20]
      end
      assert_equal :self, conf.linkage_type
      runner = Linkage::SingleThreadedRunner.new(conf, @tmpuri)
      runner.execute

      database do |db|
        # mod_5 == mod_20 holds only when id % 20 is in 0..4.
        assert_equal 5, db[:groups].count
        db[:groups].order(:ssn).each_with_index do |row, i|
          assert_equal "123456789#{i}", row[:ssn]
        end

        assert_equal 25, db[:groups_records].count
        db[:groups_records].order(:record_id).each do |row|
          expected_group_id = row[:record_id] % 5 + 1
          assert_equal expected_group_id, row[:group_id], "Record #{row[:record_id]} should have been in group #{expected_group_id}"
        end
      end
    end
  end
end
@@ -0,0 +1,145 @@
1
+ require 'helper'
2
+
3
# Unit tests for Linkage::Configuration: how `linkage_type` is reported and
# how the lhs/rhs `must` / `must_not` DSL turns into calls to
# Linkage::MustExpectation.new and Linkage::MustNotExpectation.new.
# Datasets, fields and expectations are all test doubles.
class UnitTests::TestConfiguration < Test::Unit::TestCase
  test "linkage_type is self when the two datasets are the same" do
    shared = stub('dataset')
    config = Linkage::Configuration.new(shared, shared)
    assert_equal :self, config.linkage_type
  end

  test "linkage_type is dual when the two datasets are different" do
    left = stub('dataset')
    right = stub('dataset')
    config = Linkage::Configuration.new(left, right)
    assert_equal :dual, config.linkage_type
  end

  test "linkage_type is cross when there's a 'cross-join'" do
    shared = mock('dataset', :set_new_id => nil)
    config = Linkage::Configuration.new(shared, shared)
    cross_expectation = stub('expectation', :kind => :cross)
    config.add_expectation(cross_expectation)
    assert_equal :cross, config.linkage_type
  end

  test "linkage_type is cross when there's different filters on both sides" do
    foo_field = stub('field')
    shared = stub('dataset', :set_new_id => nil)
    shared.stubs(:fields).returns(:foo => foo_field)
    config = Linkage::Configuration.new(shared, shared)

    left_filter = stub('expectation', :kind => :filter)
    Linkage::MustExpectation.expects(:new).with(:==, foo_field, 123, nil).returns(left_filter)
    right_filter = stub('expectation', :kind => :filter)
    Linkage::MustExpectation.expects(:new).with(:==, foo_field, 456, nil).returns(right_filter)

    config.instance_eval do
      lhs[:foo].must == 123
      rhs[:foo].must == 456
    end
    assert_equal :cross, config.linkage_type
  end

  test "linkage_type is self when there's identical static filters on each side" do
    foo_field = stub('field')
    shared = stub('dataset', :set_new_id => nil)
    shared.stubs(:fields).returns(:foo => foo_field)
    config = Linkage::Configuration.new(shared, shared)

    # The same expectation double is returned for both sides.
    filter = stub('expectation', :kind => :filter)
    Linkage::MustExpectation.expects(:new).twice.with(:==, foo_field, 123, nil).returns(filter)

    config.instance_eval do
      lhs[:foo].must == 123
      rhs[:foo].must == 123
    end
    assert_equal :self, config.linkage_type
  end

  test "linkage_type is self when there's a two-field filter on one side" do
    foo_field = stub('field 1')
    bar_field = stub('field 2')
    shared = stub('dataset', :set_new_id => nil)
    shared.stubs(:fields).returns(:foo => foo_field, :bar => bar_field)
    config = Linkage::Configuration.new(shared, shared)

    same_side_filter = stub('expectation', :kind => :filter)
    Linkage::MustExpectation.expects(:new).with(:==, foo_field, bar_field, :filter).returns(same_side_filter)
    self_match = stub('expectation', :kind => :self)
    Linkage::MustExpectation.expects(:new).with(:==, foo_field, foo_field, nil).returns(self_match)

    config.instance_eval do
      lhs[:foo].must == lhs[:bar]
      lhs[:foo].must == rhs[:foo]
    end
    assert_equal :self, config.linkage_type
  end

  test "static expectation" do
    left = stub('dataset')
    foo_field = stub('field')
    left.stubs(:fields).returns(:foo => foo_field)
    right = stub('dataset')
    config = Linkage::Configuration.new(left, right)

    Linkage::MustExpectation.expects(:new).with(:==, foo_field, 123, nil)
    config.instance_eval do
      lhs[:foo].must == 123
    end
  end

  ## Maybe in the future
  #test "static expectation, flopped" do
    #dataset_1 = stub('dataset')
    #field = stub('field')
    #dataset_1.stubs(:fields).returns({:foo => field})
    #dataset_2 = stub('dataset')
    #c = Linkage::Configuration.new(dataset_1, dataset_2)
    #Linkage::MustExpectation.expects(:new).with(:==, 123, field)
    #c.send(:instance_eval) do
      #123.must == lhs[:foo]
    #end
  #end

  test "complain if an invalid field is accessed" do
    left = stub('dataset')
    left_field = stub_field('field 1')
    left.stubs(:fields).returns(:foo => left_field)

    right = stub('dataset')
    right_field = stub_field('field 2')
    right.stubs(:fields).returns(:bar => right_field)

    config = Linkage::Configuration.new(left, right)
    assert_raises(ArgumentError) do
      config.instance_eval do
        lhs[:foo].must == rhs[:non_existant_field]
      end
    end
  end

  # One generated test per comparison operator. `!=` is only added when the
  # running Ruby is at least 1.9.
  filter_operators = [:>, :<, :>=, :<=]
  filter_operators.push(:'!=') if current_ruby_version >= ruby19
  filter_operators.each do |operator|
    test "DSL #{operator} filter operator" do
      left = stub('dataset 1')
      left_field = stub_field('field 1')
      left.stubs(:fields).returns(:foo => left_field)

      right = stub('dataset 2')
      right_field = stub_field('field 2')
      right.stubs(:fields).returns(:bar => right_field)

      config = Linkage::Configuration.new(left, right)
      Linkage::MustExpectation.expects(:new).with(operator, left_field, right_field, nil)

      # The operator is spliced into source text, so the DSL sees it in
      # literal infix position.
      dsl_block = eval("Proc.new { lhs[:foo].must #{operator} rhs[:bar] }")
      config.instance_eval(&dsl_block)
    end
  end

  test "must_not expectation" do
    left = stub('dataset 1')
    left_field = stub_field('field 1')
    left.stubs(:fields).returns(:foo => left_field)
    right = stub('dataset 2')

    config = Linkage::Configuration.new(left, right)
    Linkage::MustNotExpectation.expects(:new).with(:==, left_field, 123, nil)
    config.instance_eval do
      lhs[:foo].must_not == 123
    end
  end
end