linkage 0.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,43 @@
1
+ require 'rubygems'
2
+ require 'bundler'
3
# Activate the gems of the :default and :development Bundler groups before the
# remaining requires run. If Bundler raises one of its own errors, print the
# error message plus an installation hint to stderr and exit with the status
# code carried by the error.
begin
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => bundler_error
  $stderr.puts(bundler_error.message, "Run `bundle install` to install missing gems")
  exit(bundler_error.status_code)
end
10
+ require 'test/unit'
11
+ require 'mocha'
12
+ require 'tmpdir'
13
+ require 'logger'
14
+ require 'pp'
15
+ require 'versionomy'
16
+
17
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
18
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
19
+ require 'linkage'
20
+
21
# Helpers made available to every test case by reopening Test::Unit::TestCase.
class Test::Unit::TestCase
  class << self
    # Version of the running interpreter, parsed from RUBY_VERSION by
    # Versionomy. The result is memoized in an instance variable of whichever
    # class receives the call.
    def current_ruby_version
      @current_ruby_version ||= Versionomy.parse(RUBY_VERSION)
    end

    # Versionomy value for "1.9", memoized the same way as
    # current_ruby_version.
    def ruby19
      @ruby19 ||= Versionomy.parse("1.9")
    end
  end

  # Builds a stand-in for a Linkage::Field.
  #
  # The object comes from Linkage::Field.allocate, so Field#initialize is never
  # run. `options` is passed to Mocha's `stubs`; an optional block is
  # instance_eval'd on the object for further setup.
  #
  # name    - accepted for the caller's benefit; not used by this helper.
  # options - Hash handed to `stubs` (defaults to an empty Hash).
  # block   - optional setup block evaluated with the field as self.
  #
  # Returns the stubbed field.
  def stub_field(name, options = {}, &block)
    Linkage::Field.allocate.tap do |field|
      field.stubs(options)
      field.send(:instance_eval, &block) if block

      # Declaration order is kept as in the original: the catch-all stub
      # first, the Linkage::Field-specific one last.
      # NOTE(review): this relies on Mocha consulting the most recently
      # declared matching stub first -- confirm against the Mocha version used.
      field.stubs(:is_a?).returns(false)
      field.stubs(:is_a?).with(Linkage::Field).returns(true)
    end
  end
end
41
+
42
# Empty namespaces under which the unit and integration test classes are
# defined.
module UnitTests
end

module IntegrationTests
end
@@ -0,0 +1,68 @@
1
+ require 'helper'
2
+
3
module IntegrationTests
  # End-to-end checks for linking one table against itself on different
  # columns, or with different filters per side. Each test works on a
  # throwaway SQLite database and runs Linkage::SingleThreadedRunner, then
  # inspects the `groups` and `groups_records` tables it reads back.
  class TestCrossLinkage < Test::Unit::TestCase
    # Creates a fresh temporary directory and the SQLite URI (file "foo"
    # inside it) used by the test.
    def setup
      @tmpdir = Dir.mktmpdir('linkage')
      @tmpuri = "sqlite://" + File.join(@tmpdir, "foo")
    end

    # Removes the temporary directory created by setup.
    def teardown
      FileUtils.remove_entry_secure(@tmpdir)
    end

    # Opens a Sequel connection to the per-test database and hands it to the
    # given block.
    def database(&block)
      Sequel.connect(@tmpuri, &block)
    end

    test "one mandatory field equality on single threaded runner" do
      # 100 rows: foo cycles through 0..9, bar cycles through 0..4.
      rows = (0...100).map { |n| [n, n % 10, n % 5] }
      database do |db|
        db.create_table(:foo) do
          primary_key(:id)
          Integer(:foo)
          Integer(:bar)
        end
        db[:foo].import([:id, :foo, :bar], rows)
      end

      dataset = Linkage::Dataset.new(@tmpuri, "foo", :single_threaded => true)
      config = dataset.link_with(dataset) do
        lhs[:foo].must == rhs[:bar]
      end
      Linkage::SingleThreadedRunner.new(config, @tmpuri).execute

      database do |db|
        # Only the values 0..4 occur in both columns, so five groups are
        # expected, one per shared value.
        assert_equal 5, db[:groups].count
        db[:groups].order(:foo_bar).each_with_index do |group, position|
          assert_equal position, group[:foo_bar]
        end

        # 50 rows have foo in 0..4 and all 100 rows have bar in 0..4.
        assert_equal 150, db[:groups_records].count
        db[:groups_records].order(:group_id, :dataset, :record_id).each do |row|
          expected_group_id = (row[:record_id] % 5) + 1
          assert_equal expected_group_id, row[:group_id], "Record #{row[:record_id]} should have been in group #{expected_group_id}"
        end
      end
    end

    test "match same field with different filters" do
      # 100 rows: foo cycles through 0..9, bar cycles through 0..19.
      rows = (0...100).map { |n| [n, n % 10, n % 20] }
      database do |db|
        db.create_table(:foo) do
          primary_key(:id)
          Integer(:foo)
          Integer(:bar)
        end
        db[:foo].import([:id, :foo, :bar], rows)
      end

      dataset = Linkage::Dataset.new(@tmpuri, "foo", :single_threaded => true)
      config = dataset.link_with(dataset) do
        lhs[:foo].must == rhs[:foo]
        lhs[:bar].must == 0
        rhs[:bar].must == 10
      end
      Linkage::SingleThreadedRunner.new(config, @tmpuri).execute

      # Rows with bar == 0 and rows with bar == 10 all have foo == 0, so a
      # single group is expected.
      database do |db|
        assert_equal 1, db[:groups].count
      end
    end
  end
end
@@ -0,0 +1,85 @@
1
+ require 'helper'
2
+
3
module IntegrationTests
  # End-to-end checks for linking two different tables (`foo` and `bar`) that
  # live in the same throwaway SQLite database, using
  # Linkage::SingleThreadedRunner.
  class TestDualLinkage < Test::Unit::TestCase
    # Creates a fresh temporary directory and the SQLite URI (file "foo"
    # inside it) used by the test.
    def setup
      @tmpdir = Dir.mktmpdir('linkage')
      @tmpuri = "sqlite://" + File.join(@tmpdir, "foo")
    end

    # Removes the temporary directory created by setup.
    def teardown
      FileUtils.remove_entry_secure(@tmpdir)
    end

    # Opens a Sequel connection to the per-test database and hands it to the
    # given block.
    def database(&block)
      Sequel.connect(@tmpuri, &block)
    end

    test "one mandatory field equality on single threaded runner" do
      # Both tables get 100 rows whose ssn cycles through ten distinct values.
      database do |db|
        [:foo, :bar].each do |table|
          db.create_table(table) do
            primary_key(:id)
            String(:ssn)
          end
          db[table].import([:id, :ssn],
            (0...100).map { |i| [i, "12345678#{i%10}"] })
        end
      end

      ds_1 = Linkage::Dataset.new(@tmpuri, "foo", :single_threaded => true)
      ds_2 = Linkage::Dataset.new(@tmpuri, "bar", :single_threaded => true)
      config = ds_1.link_with(ds_2) do
        lhs[:ssn].must == rhs[:ssn]
      end
      Linkage::SingleThreadedRunner.new(config, @tmpuri).execute

      database do |db|
        # One group per distinct ssn.
        assert_equal 10, db[:groups].count
        db[:groups].order(:ssn).each_with_index do |row, i|
          assert_equal "12345678#{i%10}", row[:ssn]
        end

        # Every group should hold ten records from dataset 1 followed by ten
        # from dataset 2 once sorted by (group_id, dataset, record_id).
        assert_equal 200, db[:groups_records].count
        db[:groups_records].order(:group_id, :dataset, :record_id).each_with_index do |row, i|
          expected_dataset = (i % 20 >= 10) ? 2 : 1
          assert_equal expected_dataset, row[:dataset], row.inspect

          expected_group_id = i / 20 + 1
          assert_equal expected_group_id, row[:group_id], "Record #{row.inspect} should have been in group #{expected_group_id}"
        end
      end
    end

    test "don't ignore 1-record groups before the combining phase" do
      # Every ssn is unique within a table, so each value appears exactly once
      # per dataset.
      database do |db|
        [:foo, :bar].each do |table|
          db.create_table(table) do
            primary_key(:id)
            String(:ssn)
          end
          db[table].import([:id, :ssn],
            (0...100).map { |i| [i, "1234567%03d" % i] })
        end
      end

      ds_1 = Linkage::Dataset.new(@tmpuri, "foo", :single_threaded => true)
      ds_2 = Linkage::Dataset.new(@tmpuri, "bar", :single_threaded => true)
      config = ds_1.link_with(ds_2) do
        lhs[:ssn].must == rhs[:ssn]
      end
      Linkage::SingleThreadedRunner.new(config, @tmpuri).execute

      database do |db|
        assert_equal 100, db[:groups].count
        db[:groups].order(:ssn).each_with_index do |row, i|
          assert_equal "1234567%03d" % i, row[:ssn]
        end
      end
    end
  end
end
@@ -0,0 +1,209 @@
1
+ require 'helper'
2
+
3
module IntegrationTests
  # End-to-end checks for linking a table against itself with
  # Linkage::SingleThreadedRunner. Each test seeds a throwaway SQLite
  # database, runs the linkage, and inspects the `groups` and
  # `groups_records` tables it reads back.
  class TestSelfLinkage < Test::Unit::TestCase
    # Creates a fresh temporary directory and the SQLite URI (file "foo"
    # inside it) used by the test.
    def setup
      @tmpdir = Dir.mktmpdir('linkage')
      @tmpuri = "sqlite://" + File.join(@tmpdir, "foo")
    end

    # Opens a Sequel connection to the per-test database and hands it to the
    # given block.
    def database(&block)
      Sequel.connect(@tmpuri, &block)
    end

    # Removes the temporary directory created by setup.
    def teardown
      FileUtils.remove_entry_secure(@tmpdir)
    end

    test "one mandatory field equality on single threaded runner" do
      # 100 rows whose ssn cycles through ten distinct values.
      database do |db|
        db.create_table(:foo) { primary_key(:id); String(:ssn) }
        db[:foo].import([:id, :ssn],
          Array.new(100) { |i| [i, "12345678#{i%10}"] })
      end

      ds = Linkage::Dataset.new(@tmpuri, "foo", :single_threaded => true)
      conf = ds.link_with(ds) do
        lhs[:ssn].must == rhs[:ssn]
      end
      runner = Linkage::SingleThreadedRunner.new(conf, @tmpuri)
      runner.execute

      database do |db|
        assert_equal 10, db[:groups].count
        db[:groups].order(:ssn).each_with_index do |row, i|
          assert_equal "12345678#{i%10}", row[:ssn]
        end

        assert_equal 100, db[:groups_records].count
        db[:groups_records].order(:record_id).each do |row|
          expected_group_id = (row[:record_id] % 10) + 1
          assert_equal expected_group_id, row[:group_id], "Record #{row[:record_id]} should have been in group #{expected_group_id}"
        end
      end
    end

    test "two mandatory field equalities on single threaded runner" do
      # ssn cycles through ten values and dob through twenty days, giving
      # twenty distinct (ssn, dob) pairs: each ssn k pairs with day k+1 and
      # day k+11.
      database do |db|
        db.create_table(:foo) { primary_key(:id); String(:ssn); Date(:dob) }
        db[:foo].import([:id, :ssn, :dob],
          Array.new(100) { |i| [i, "12345678#{i%10}", Date.civil(1985, 1, (i % 20) + 1)] })
      end

      ds = Linkage::Dataset.new(@tmpuri, "foo", :single_threaded => true)
      conf = ds.link_with(ds) do
        lhs[:ssn].must == rhs[:ssn]
        lhs[:dob].must == rhs[:dob]
      end
      runner = Linkage::SingleThreadedRunner.new(conf, @tmpuri)
      runner.execute

      database do |db|
        assert_equal 20, db[:groups].count
        # Two groups share every ssn, and the assertions below expect the
        # earlier dob first. Sorting on ssn alone leaves the order of those
        # two rows up to the database, so dob is included as a tie-breaker.
        db[:groups].order(:ssn, :dob).each_with_index do |row, i|
          assert_equal "12345678#{i/2}", row[:ssn]
          assert_equal Date.civil(1985, 1, i / 2 + 1 + (i % 2 == 0 ? 0 : 10)), row[:dob]
        end

        assert_equal 100, db[:groups_records].count
        db[:groups_records].order(:record_id).each do |row|
          v = row[:record_id] % 20
          expected_group_id = v < 10 ? 1 + 2 * v : 2 * (v % 10 + 1)
          assert_equal expected_group_id, row[:group_id], "Record #{row[:record_id]} should have been in group #{expected_group_id}"
        end
      end
    end

    test "one mandatory field equality on single threaded runner, with filter" do
      # mod_5 == 3 holds for ids 3, 8, 13, ...: twenty rows whose ssn ends in
      # either 3 or 8.
      database do |db|
        db.create_table(:foo) { primary_key(:id); String(:ssn); Integer(:mod_5) }
        db[:foo].import([:id, :ssn, :mod_5],
          Array.new(100) { |i| [i, "12345678#{i%10}", i % 5] })
      end

      ds = Linkage::Dataset.new(@tmpuri, "foo", :single_threaded => true)
      conf = ds.link_with(ds) do
        lhs[:ssn].must == rhs[:ssn]
        lhs[:mod_5].must == 3
      end
      runner = Linkage::SingleThreadedRunner.new(conf, @tmpuri)
      runner.execute

      database do |db|
        assert_equal 2, db[:groups].count
        db[:groups].order(:ssn).each_with_index do |row, i|
          assert_equal "12345678#{(i * 5) + 3}", row[:ssn]
        end

        assert_equal 20, db[:groups_records].count
        db[:groups_records].order(:record_id).each do |row|
          expected_group_id = (row[:record_id] / 5) % 2 + 1
          assert_equal expected_group_id, row[:group_id], "Record #{row[:record_id]} should have been in group #{expected_group_id}"
        end
      end
    end

    test "one mandatory field equality on single threaded runner, with inequality filters" do
      # mod_5 > 2 combined with mod_5 <= 3 selects the same rows as
      # mod_5 == 3, so the expectations mirror the plain filter test.
      database do |db|
        db.create_table(:foo) { primary_key(:id); String(:ssn); Integer(:mod_5) }
        db[:foo].import([:id, :ssn, :mod_5],
          Array.new(100) { |i| [i, "12345678#{i%10}", i % 5] })
      end

      ds = Linkage::Dataset.new(@tmpuri, "foo", :single_threaded => true)
      conf = ds.link_with(ds) do
        lhs[:ssn].must == rhs[:ssn]
        lhs[:mod_5].must > 2
        lhs[:mod_5].must <= 3
      end
      runner = Linkage::SingleThreadedRunner.new(conf, @tmpuri)
      runner.execute

      database do |db|
        assert_equal 2, db[:groups].count
        db[:groups].order(:ssn).each_with_index do |row, i|
          assert_equal "12345678#{(i * 5) + 3}", row[:ssn]
        end

        assert_equal 20, db[:groups_records].count
        db[:groups_records].order(:record_id).each do |row|
          expected_group_id = (row[:record_id] / 5) % 2 + 1
          assert_equal expected_group_id, row[:group_id], "Record #{row[:record_id]} should have been in group #{expected_group_id}"
        end
      end
    end

    test "one mandatory field equality on single threaded runner, with field filter" do
      # mod_5 == mod_20 holds when id % 20 is in 0..4: 25 rows whose ssn ends
      # in 0..4.
      database do |db|
        db.create_table(:foo) { primary_key(:id); String(:ssn); Integer(:mod_5); Integer(:mod_20) }
        db[:foo].import([:id, :ssn, :mod_5, :mod_20],
          Array.new(100) { |i| [i, "123456789#{i%10}", i % 5, i % 20] })
      end

      ds = Linkage::Dataset.new(@tmpuri, "foo", :single_threaded => true)
      conf = ds.link_with(ds) do
        lhs[:ssn].must == rhs[:ssn]
        lhs[:mod_5].must == lhs[:mod_20]
      end
      assert_equal :self, conf.linkage_type
      runner = Linkage::SingleThreadedRunner.new(conf, @tmpuri)
      runner.execute

      database do |db|
        assert_equal 5, db[:groups].count
        db[:groups].order(:ssn).each_with_index do |row, i|
          assert_equal "123456789#{i}", row[:ssn]
        end

        assert_equal 25, db[:groups_records].count
        db[:groups_records].order(:record_id).each do |row|
          expected_group_id = row[:record_id] % 5 + 1
          assert_equal expected_group_id, row[:group_id], "Record #{row[:record_id]} should have been in group #{expected_group_id}"
        end
      end
    end

    test "one mandatory field equality on single threaded runner, with identical filters" do
      # Same data as the field-filter test, with the two-field filter declared
      # on both sides. The linkage is still expected to be reported as :self.
      database do |db|
        db.create_table(:foo) { primary_key(:id); String(:ssn); Integer(:mod_5); Integer(:mod_20) }
        db[:foo].import([:id, :ssn, :mod_5, :mod_20],
          Array.new(100) { |i| [i, "123456789#{i%10}", i % 5, i % 20] })
      end

      ds = Linkage::Dataset.new(@tmpuri, "foo", :single_threaded => true)
      conf = ds.link_with(ds) do
        lhs[:ssn].must == rhs[:ssn]
        lhs[:mod_5].must == lhs[:mod_20]
        rhs[:mod_5].must == rhs[:mod_20]
      end
      assert_equal :self, conf.linkage_type
      runner = Linkage::SingleThreadedRunner.new(conf, @tmpuri)
      runner.execute

      database do |db|
        assert_equal 5, db[:groups].count
        db[:groups].order(:ssn).each_with_index do |row, i|
          assert_equal "123456789#{i}", row[:ssn]
        end

        assert_equal 25, db[:groups_records].count
        db[:groups_records].order(:record_id).each do |row|
          expected_group_id = row[:record_id] % 5 + 1
          assert_equal expected_group_id, row[:group_id], "Record #{row[:record_id]} should have been in group #{expected_group_id}"
        end
      end
    end
  end
end
@@ -0,0 +1,145 @@
1
+ require 'helper'
2
+
3
# Unit tests for Linkage::Configuration: the value reported by linkage_type
# and the way the lhs/rhs `must` / `must_not` DSL constructs MustExpectation
# and MustNotExpectation objects. All collaborators are Mocha stubs or mocks.
class UnitTests::TestConfiguration < Test::Unit::TestCase
  test "linkage_type is self when the two datasets are the same" do
    dataset = stub('dataset')
    config = Linkage::Configuration.new(dataset, dataset)
    assert_equal :self, config.linkage_type
  end

  test "linkage_type is dual when the two datasets are different" do
    first_dataset = stub('dataset')
    second_dataset = stub('dataset')
    config = Linkage::Configuration.new(first_dataset, second_dataset)
    assert_equal :dual, config.linkage_type
  end

  test "linkage_type is cross when there's a 'cross-join'" do
    # `mock` (rather than `stub`) is kept from the original for the dataset.
    dataset = mock('dataset', :set_new_id => nil)
    config = Linkage::Configuration.new(dataset, dataset)
    expectation = stub('expectation', :kind => :cross)
    config.add_expectation(expectation)
    assert_equal :cross, config.linkage_type
  end

  test "linkage_type is cross when there's different filters on both sides" do
    field = stub('field')
    dataset = stub('dataset', :set_new_id => nil)
    dataset.stubs(:fields).returns(:foo => field)
    config = Linkage::Configuration.new(dataset, dataset)

    lhs_filter = stub('expectation', :kind => :filter)
    Linkage::MustExpectation.expects(:new).with(:==, field, 123, nil).returns(lhs_filter)
    rhs_filter = stub('expectation', :kind => :filter)
    Linkage::MustExpectation.expects(:new).with(:==, field, 456, nil).returns(rhs_filter)

    config.send(:instance_eval) do
      lhs[:foo].must == 123
      rhs[:foo].must == 456
    end
    assert_equal :cross, config.linkage_type
  end

  test "linkage_type is self when there's identical static filters on each side" do
    field = stub('field')
    dataset = stub('dataset', :set_new_id => nil)
    dataset.stubs(:fields).returns(:foo => field)
    config = Linkage::Configuration.new(dataset, dataset)

    # The same stubbed expectation is returned for both DSL calls.
    shared_filter = stub('expectation', :kind => :filter)
    Linkage::MustExpectation.expects(:new).twice.with(:==, field, 123, nil).returns(shared_filter)

    config.send(:instance_eval) do
      lhs[:foo].must == 123
      rhs[:foo].must == 123
    end
    assert_equal :self, config.linkage_type
  end

  test "linkage_type is self when there's a two-field filter on one side" do
    field_1 = stub('field 1')
    field_2 = stub('field 2')
    dataset = stub('dataset', :set_new_id => nil)
    dataset.stubs(:fields).returns(:foo => field_1, :bar => field_2)
    config = Linkage::Configuration.new(dataset, dataset)

    filter_expectation = stub('expectation', :kind => :filter)
    Linkage::MustExpectation.expects(:new).with(:==, field_1, field_2, :filter).returns(filter_expectation)
    self_expectation = stub('expectation', :kind => :self)
    Linkage::MustExpectation.expects(:new).with(:==, field_1, field_1, nil).returns(self_expectation)

    config.send(:instance_eval) do
      lhs[:foo].must == lhs[:bar]
      lhs[:foo].must == rhs[:foo]
    end
    assert_equal :self, config.linkage_type
  end

  test "static expectation" do
    field = stub('field')
    dataset_1 = stub('dataset')
    dataset_1.stubs(:fields).returns(:foo => field)
    dataset_2 = stub('dataset')
    config = Linkage::Configuration.new(dataset_1, dataset_2)

    Linkage::MustExpectation.expects(:new).with(:==, field, 123, nil)
    config.send(:instance_eval) do
      lhs[:foo].must == 123
    end
  end

  ## Maybe in the future
  #test "static expectation, flopped" do
    #dataset_1 = stub('dataset')
    #field = stub('field')
    #dataset_1.stubs(:fields).returns({:foo => field})
    #dataset_2 = stub('dataset')
    #c = Linkage::Configuration.new(dataset_1, dataset_2)
    #Linkage::MustExpectation.expects(:new).with(:==, 123, field)
    #c.send(:instance_eval) do
      #123.must == lhs[:foo]
    #end
  #end

  test "complain if an invalid field is accessed" do
    field_1 = stub_field('field 1')
    dataset_1 = stub('dataset')
    dataset_1.stubs(:fields).returns(:foo => field_1)

    field_2 = stub_field('field 2')
    dataset_2 = stub('dataset')
    dataset_2.stubs(:fields).returns(:bar => field_2)

    config = Linkage::Configuration.new(dataset_1, dataset_2)
    assert_raises(ArgumentError) do
      config.send(:instance_eval) do
        lhs[:foo].must == rhs[:non_existant_field]
      end
    end
  end

  # One test is generated per comparison operator; :'!=' is only included
  # when the running Ruby is at least 1.9.
  operators = [:>, :<, :>=, :<=]
  operators.push(:'!=') if current_ruby_version >= ruby19
  operators.each do |operator|
    test "DSL #{operator} filter operator" do
      field_1 = stub_field('field 1')
      dataset_1 = stub('dataset 1')
      dataset_1.stubs(:fields).returns(:foo => field_1)

      field_2 = stub_field('field 2')
      dataset_2 = stub('dataset 2')
      dataset_2.stubs(:fields).returns(:bar => field_2)

      config = Linkage::Configuration.new(dataset_1, dataset_2)
      Linkage::MustExpectation.expects(:new).with(operator, field_1, field_2, nil)

      # The operator has to appear literally in the source, so the DSL block
      # is built from a string.
      dsl = eval("Proc.new { lhs[:foo].must #{operator} rhs[:bar] }")
      config.send(:instance_eval, &dsl)
    end
  end

  test "must_not expectation" do
    field_1 = stub_field('field 1')
    dataset_1 = stub('dataset 1')
    dataset_1.stubs(:fields).returns(:foo => field_1)
    dataset_2 = stub('dataset 2')

    config = Linkage::Configuration.new(dataset_1, dataset_2)
    Linkage::MustNotExpectation.expects(:new).with(:==, field_1, 123, nil)
    config.send(:instance_eval) do
      lhs[:foo].must_not == 123
    end
  end
end