linkage 0.0.6 → 0.0.8

Sign up to get free protection for your applications and to get access to all the features.
Files changed (66) hide show
  1. data/.gitignore +10 -0
  2. data/Gemfile +15 -13
  3. data/Gemfile.lock +67 -37
  4. data/Guardfile +0 -2
  5. data/Rakefile +122 -25
  6. data/lib/linkage/comparator.rb +172 -0
  7. data/lib/linkage/comparators/binary.rb +12 -0
  8. data/lib/linkage/comparators/compare.rb +46 -0
  9. data/lib/linkage/comparators/within.rb +32 -0
  10. data/lib/linkage/configuration.rb +285 -153
  11. data/lib/linkage/data.rb +32 -7
  12. data/lib/linkage/dataset.rb +107 -32
  13. data/lib/linkage/decollation.rb +93 -0
  14. data/lib/linkage/expectation.rb +21 -0
  15. data/lib/linkage/expectations/exhaustive.rb +63 -0
  16. data/lib/linkage/expectations/simple.rb +168 -0
  17. data/lib/linkage/field.rb +30 -4
  18. data/lib/linkage/field_set.rb +6 -3
  19. data/lib/linkage/function.rb +50 -3
  20. data/lib/linkage/functions/binary.rb +30 -0
  21. data/lib/linkage/functions/cast.rb +54 -0
  22. data/lib/linkage/functions/length.rb +29 -0
  23. data/lib/linkage/functions/strftime.rb +12 -11
  24. data/lib/linkage/functions/trim.rb +8 -0
  25. data/lib/linkage/group.rb +20 -0
  26. data/lib/linkage/import_buffer.rb +5 -16
  27. data/lib/linkage/meta_object.rb +139 -0
  28. data/lib/linkage/result_set.rb +74 -17
  29. data/lib/linkage/runner/single_threaded.rb +125 -10
  30. data/lib/linkage/version.rb +3 -0
  31. data/lib/linkage.rb +11 -0
  32. data/linkage.gemspec +16 -121
  33. data/test/config.yml +5 -0
  34. data/test/helper.rb +73 -8
  35. data/test/integration/test_collation.rb +45 -0
  36. data/test/integration/test_configuration.rb +268 -0
  37. data/test/integration/test_cross_linkage.rb +4 -17
  38. data/test/integration/test_dataset.rb +45 -2
  39. data/test/integration/test_dual_linkage.rb +40 -24
  40. data/test/integration/test_functions.rb +22 -0
  41. data/test/integration/test_result_set.rb +85 -0
  42. data/test/integration/test_scoring.rb +84 -0
  43. data/test/integration/test_self_linkage.rb +5 -0
  44. data/test/integration/test_within_comparator.rb +100 -0
  45. data/test/unit/comparators/test_compare.rb +105 -0
  46. data/test/unit/comparators/test_within.rb +57 -0
  47. data/test/unit/expectations/test_exhaustive.rb +111 -0
  48. data/test/unit/expectations/test_simple.rb +303 -0
  49. data/test/unit/functions/test_binary.rb +54 -0
  50. data/test/unit/functions/test_cast.rb +98 -0
  51. data/test/unit/functions/test_length.rb +52 -0
  52. data/test/unit/functions/test_strftime.rb +17 -13
  53. data/test/unit/functions/test_trim.rb +11 -4
  54. data/test/unit/test_comparator.rb +124 -0
  55. data/test/unit/test_configuration.rb +137 -175
  56. data/test/unit/test_data.rb +44 -0
  57. data/test/unit/test_dataset.rb +73 -21
  58. data/test/unit/test_decollation.rb +201 -0
  59. data/test/unit/test_field.rb +38 -14
  60. data/test/unit/test_field_set.rb +12 -8
  61. data/test/unit/test_function.rb +83 -16
  62. data/test/unit/test_group.rb +28 -0
  63. data/test/unit/test_import_buffer.rb +13 -27
  64. data/test/unit/test_meta_object.rb +208 -0
  65. data/test/unit/test_result_set.rb +221 -3
  66. metadata +82 -190
@@ -0,0 +1,268 @@
1
+ require 'helper'
2
+
3
+ module IntegrationTests
4
+ class TestConfiguration < Test::Unit::TestCase
5
+ test "linkage_type is self when the two datasets are the same" do
6
+ database_for('sqlite') do |db|
7
+ db.create_table(:foo) { primary_key(:id); String(:foo); String(:bar) }
8
+ end
9
+
10
+ dataset = Linkage::Dataset.new(database_options_for('sqlite'), "foo")
11
+ conf = Linkage::Configuration.new(dataset, dataset)
12
+ assert_equal :self, conf.linkage_type
13
+ end
14
+
15
+ test "linkage_type is dual when the two datasets are different" do
16
+ database_for('sqlite') do |db|
17
+ db.create_table(:foo) { primary_key(:id); String(:foo); String(:bar) }
18
+ db.create_table(:bar) { primary_key(:id); String(:foo); String(:bar) }
19
+ end
20
+
21
+ dataset_1 = Linkage::Dataset.new(database_options_for('sqlite'), "foo")
22
+ dataset_2 = Linkage::Dataset.new(database_options_for('sqlite'), "bar")
23
+ conf = Linkage::Configuration.new(dataset_1, dataset_2)
24
+ assert_equal :dual, conf.linkage_type
25
+ end
26
+
27
+ test "linkage_type is cross when there's different filters on both sides" do
28
+ database_for('sqlite') do |db|
29
+ db.create_table(:foo) { primary_key(:id); String(:foo); String(:bar) }
30
+ end
31
+
32
+ dataset = Linkage::Dataset.new(database_options_for('sqlite'), "foo")
33
+ conf = Linkage::Configuration.new(dataset, dataset)
34
+ conf.configure do
35
+ lhs[:foo].must == "foo"
36
+ rhs[:foo].must == "bar"
37
+ end
38
+ assert_equal :cross, conf.linkage_type
39
+ end
40
+
41
+ test "linkage_type is self when there's identical static filters on each side" do
42
+ database_for('sqlite') do |db|
43
+ db.create_table(:foo) { primary_key(:id); String(:foo); String(:bar) }
44
+ end
45
+
46
+ dataset = Linkage::Dataset.new(database_options_for('sqlite'), "foo")
47
+ conf = Linkage::Configuration.new(dataset, dataset)
48
+ conf.configure do
49
+ lhs[:foo].must == "foo"
50
+ rhs[:foo].must == "foo"
51
+ end
52
+ assert_equal :self, conf.linkage_type
53
+ end
54
+
55
+ test "linkage_type is cross when exhaustive expectations use different fields" do
56
+ database_for('sqlite') do |db|
57
+ db.create_table(:foo) { primary_key(:id); Integer(:foo); Integer(:bar) }
58
+ end
59
+
60
+ dataset = Linkage::Dataset.new(database_options_for('sqlite'), "foo")
61
+ conf = Linkage::Configuration.new(dataset, dataset)
62
+ conf.configure do
63
+ lhs[:foo].must(be_within(5).of(rhs[:bar]))
64
+ end
65
+ assert_equal :cross, conf.linkage_type
66
+ end
67
+
68
+ test "static expectation" do
69
+ database_for('sqlite') do |db|
70
+ db.create_table(:foo) { primary_key(:id); String(:foo); String(:bar) }
71
+ end
72
+
73
+ dataset_1 = Linkage::Dataset.new(database_options_for('sqlite'), "foo")
74
+ conf = Linkage::Configuration.new(dataset_1, dataset_1)
75
+ conf.configure do
76
+ lhs[:foo].must == "foo"
77
+ end
78
+
79
+ dataset_2, _ = conf.datasets_with_applied_simple_expectations
80
+ assert_equal dataset_2.obj, dataset_1.filter(:foo => "foo").obj
81
+ end
82
+
83
+ test "complain if an invalid field is accessed" do
84
+ database_for('sqlite') do |db|
85
+ db.create_table(:foo) { primary_key(:id); String(:foo); String(:bar) }
86
+ end
87
+
88
+ dataset = Linkage::Dataset.new(database_options_for('sqlite'), "foo")
89
+ conf = Linkage::Configuration.new(dataset, dataset)
90
+ assert_raises(ArgumentError) do
91
+ conf.configure do
92
+ lhs[:foo].must == rhs[:non_existant_field]
93
+ end
94
+ end
95
+ end
96
+
97
+ operators = [:>, :<, :>=, :<=]
98
+ operators.each do |operator|
99
+ test "DSL #{operator} filter operator" do
100
+ database_for('sqlite') do |db|
101
+ db.create_table(:foo) { primary_key(:id); String(:foo); String(:bar) }
102
+ end
103
+
104
+ dataset_1 = Linkage::Dataset.new(database_options_for('sqlite'), "foo")
105
+ conf = Linkage::Configuration.new(dataset_1, dataset_1)
106
+ conf.configure do
107
+ lhs[:foo].must.send(operator, 123)
108
+ end
109
+
110
+ expr = Sequel::SQL::BooleanExpression.new(operator, Sequel::SQL::Identifier.new(:foo), 123)
111
+ dataset_2, _ = conf.datasets_with_applied_simple_expectations
112
+ assert_equal dataset_2.obj, dataset_1.filter(expr).obj
113
+ end
114
+
115
+ test "comparing two data sources with #{operator}" do
116
+ database_for('sqlite') do |db|
117
+ db.create_table(:foo) { primary_key(:id); Integer(:foo); Integer(:bar) }
118
+ end
119
+
120
+ dataset = Linkage::Dataset.new(database_options_for('sqlite'), "foo")
121
+ conf = Linkage::Configuration.new(dataset, dataset)
122
+ conf.configure do
123
+ lhs[:foo].must.send(operator, rhs[:bar])
124
+ end
125
+ assert_equal 1, conf.exhaustive_expectations.length
126
+
127
+ comp = conf.exhaustive_expectations[0].comparator
128
+ assert_instance_of Linkage::Comparators::Compare, comp
129
+ end
130
+ end
131
+
132
+ test "must_not expectation" do
133
+ database_for('sqlite') do |db|
134
+ db.create_table(:foo) { primary_key(:id); String(:foo); String(:bar) }
135
+ end
136
+
137
+ dataset_1 = Linkage::Dataset.new(database_options_for('sqlite'), "foo")
138
+ conf = Linkage::Configuration.new(dataset_1, dataset_1)
139
+ conf.configure do
140
+ lhs[:foo].must_not == "foo"
141
+ end
142
+
143
+ dataset_2, _ = conf.datasets_with_applied_simple_expectations
144
+ assert_equal dataset_2.obj, dataset_1.filter(~{:foo => "foo"}).obj
145
+ end
146
+
147
+ test "static database function" do
148
+ database_for('sqlite') do |db|
149
+ db.create_table(:foo) { primary_key(:id); String(:foo); String(:bar) }
150
+ end
151
+
152
+ dataset_1 = Linkage::Dataset.new(database_options_for('sqlite'), "foo")
153
+ conf = Linkage::Configuration.new(dataset_1, dataset_1)
154
+ conf.configure do
155
+ lhs[:foo].must == trim("foo")
156
+ end
157
+
158
+ dataset_2, _ = conf.datasets_with_applied_simple_expectations
159
+ assert_equal dataset_1.filter({:foo => :trim.sql_function("foo")}).obj, dataset_2.obj
160
+ end
161
+
162
+ test "save_results_in" do
163
+ database_for('sqlite') do |db|
164
+ db.create_table(:foo) { primary_key(:id); String(:foo); String(:bar) }
165
+ end
166
+
167
+ dataset_1 = Linkage::Dataset.new(database_options_for('sqlite'), "foo")
168
+ conf = Linkage::Configuration.new(dataset_1, dataset_1)
169
+ conf.configure do
170
+ save_results_in("mysql://localhost/results", {:foo => 'bar'})
171
+ end
172
+ assert_equal "mysql://localhost/results", conf.results_uri
173
+ assert_equal({:foo => 'bar'}, conf.results_uri_options)
174
+ end
175
+
176
+ test "case insensitive field names" do
177
+ database_for('sqlite') do |db|
178
+ db.create_table(:foo) { primary_key(:id); String(:foo); String(:bar) }
179
+ end
180
+
181
+ assert_nothing_raised do
182
+ dataset = Linkage::Dataset.new(database_options_for('sqlite'), "foo")
183
+ results_uri = database_options_for('sqlite')
184
+ conf = dataset.link_with(dataset) do
185
+ lhs[:Foo].must == rhs[:baR]
186
+ save_results_in(results_uri)
187
+ end
188
+ end
189
+ end
190
+
191
+ test "decollation_needed? is false when the datasets and results dataset all have the same database and collations" do
192
+ database_for('mysql') do |db|
193
+ db.create_table!(:foo) { primary_key(:id); String(:foo, :collate => :latin1_swedish_ci) }
194
+ db.create_table!(:bar) { primary_key(:id); String(:foo, :collate => :latin1_swedish_ci) }
195
+ end
196
+
197
+ dataset_1 = Linkage::Dataset.new(database_options_for('mysql'), 'foo')
198
+ dataset_2 = Linkage::Dataset.new(database_options_for('mysql'), 'bar')
199
+ conf = dataset_1.link_with(dataset_2) do
200
+ lhs[:foo].must == rhs[:foo]
201
+ end
202
+ conf.results_uri = database_options_for('mysql')
203
+ assert !conf.decollation_needed?
204
+ end
205
+
206
+ test "decollation_needed? is true when the datasets have different database types" do
207
+ database_for('mysql') do |db|
208
+ db.create_table!(:foo) { primary_key(:id); String(:foo) }
209
+ end
210
+
211
+ database_for('sqlite') do |db|
212
+ db.create_table!(:foo) { primary_key(:id); String(:foo) }
213
+ end
214
+
215
+ dataset_1 = Linkage::Dataset.new(database_options_for('mysql'), 'foo')
216
+ dataset_2 = Linkage::Dataset.new(database_options_for('sqlite'), 'foo')
217
+ conf = dataset_1.link_with(dataset_2) do
218
+ lhs[:foo].must == rhs[:foo]
219
+ end
220
+ conf.results_uri = database_options_for('mysql')
221
+ assert conf.decollation_needed?
222
+ end
223
+
224
+ test "decollation_needed? is true when the result dataset has different database type than the datasets" do
225
+ database_for('mysql') do |db|
226
+ db.create_table!(:foo) { primary_key(:id); String(:foo) }
227
+ db.create_table!(:bar) { primary_key(:id); String(:foo) }
228
+ end
229
+
230
+ dataset_1 = Linkage::Dataset.new(database_options_for('mysql'), 'foo')
231
+ dataset_2 = Linkage::Dataset.new(database_options_for('mysql'), 'bar')
232
+ conf = dataset_1.link_with(dataset_2) do
233
+ lhs[:foo].must == rhs[:foo]
234
+ end
235
+ conf.results_uri = database_options_for('sqlite')
236
+ assert conf.decollation_needed?
237
+ end
238
+
239
+ test "decollation_needed? is false when not comparing string columns" do
240
+ database_for('mysql') do |db|
241
+ db.create_table!(:foo) { primary_key(:id); Fixnum(:foo) }
242
+ end
243
+
244
+ database_for('sqlite') do |db|
245
+ db.create_table!(:foo) { primary_key(:id); Fixnum(:foo) }
246
+ end
247
+
248
+ dataset_1 = Linkage::Dataset.new(database_options_for('mysql'), 'foo')
249
+ dataset_2 = Linkage::Dataset.new(database_options_for('sqlite'), 'foo')
250
+ conf = dataset_1.link_with(dataset_2) do
251
+ lhs[:foo].must == rhs[:foo]
252
+ end
253
+ conf.results_uri = database_options_for('mysql')
254
+ assert !conf.decollation_needed?
255
+ end
256
+
257
+ test "creating comparator expectation for within" do
258
+ database_for('mysql') do |db|
259
+ db.create_table!(:foo) { primary_key(:id); Integer(:foo) }
260
+ end
261
+ dataset = Linkage::Dataset.new(database_options_for('mysql'), 'foo')
262
+
263
+ conf = dataset.link_with(dataset) do
264
+ lhs[:foo].must be_within(5).of(rhs[:foo])
265
+ end
266
+ end
267
+ end
268
+ end
@@ -16,10 +16,6 @@ module IntegrationTests
16
16
  end
17
17
 
18
18
  test "one mandatory field equality on single threaded runner" do
19
- #setup_logger = Logger.new(STDERR)
20
- #setup_logger.formatter = lambda { |severity, time, progname, msg|
21
- #" SETUP : %s [%s]: %s\n" % [severity, time, msg]
22
- #}
23
19
  # insert the test data
24
20
  database do |db|
25
21
  db.create_table(:foo) { primary_key(:id); Integer(:foo); Integer(:bar) }
@@ -27,16 +23,8 @@ module IntegrationTests
27
23
  Array.new(100) { |i| [i, i % 10, i % 5] })
28
24
  end
29
25
 
30
- #ds_logger = Logger.new(STDERR)
31
- #ds_logger.formatter = lambda { |severity, time, progname, msg|
32
- #"DATASET: %s [%s]: %s\n" % [severity, time, msg]
33
- #}
34
26
  ds = Linkage::Dataset.new(@tmpuri, "foo", :single_threaded => true)
35
27
 
36
- #rs_logger = Logger.new(STDERR)
37
- #rs_logger.formatter = lambda { |severity, time, progname, msg|
38
- #"RESULTS: %s [%s]: %s\n" % [severity, time, msg]
39
- #}
40
28
  tmpuri = @tmpuri
41
29
  conf = ds.link_with(ds) do
42
30
  lhs[:foo].must == rhs[:bar]
@@ -52,11 +40,10 @@ module IntegrationTests
52
40
  assert_equal i, row[:foo_bar]
53
41
  end
54
42
 
55
- #assert_equal 150, db[:groups_records].count
56
- #db[:groups_records].order(:group_id, :dataset, :record_id).each_with_index do |row, i|
57
- #expected_group_id = (row[:record_id] % 5) + 1
58
- #assert_equal expected_group_id, row[:group_id], "Record #{row[:record_id]} should have been in group #{expected_group_id}"
59
- #end
43
+ assert_equal 1000, db[:matches].count
44
+ db[:matches].order(:record_1_id, :record_2_id).each do |row|
45
+ assert_equal row[:record_1_id] % 10, row[:record_2_id] % 5
46
+ end
60
47
  end
61
48
  end
62
49
 
@@ -38,7 +38,7 @@ class IntegrationTests::TestDataset < Test::Unit::TestCase
38
38
  end
39
39
 
40
40
  ds = Linkage::Dataset.new(@tmpuri, "foo")
41
- ds = ds.match(:bar)
41
+ ds = ds.group_match(Linkage::MetaObject.new(ds.field_set[:bar]))
42
42
  ds.each_group do |group|
43
43
  assert_equal({:bar => "foo"}, group.values)
44
44
  assert_equal(2, group.count)
@@ -51,6 +51,26 @@ class IntegrationTests::TestDataset < Test::Unit::TestCase
51
51
  assert_equal 3, groups.length
52
52
  end
53
53
 
54
+ test "each_group with alias" do
55
+ database do |db|
56
+ db.create_table(:foo) do
57
+ primary_key :id
58
+ String :bar
59
+ end
60
+ db[:foo].import([:id, :bar], [[1, 'foo'], [2, 'foo'], [3, 'bar'], [4, 'baz']])
61
+ end
62
+
63
+ ds = Linkage::Dataset.new(@tmpuri, "foo")
64
+ ds = ds.group_match({
65
+ :meta_object => Linkage::MetaObject.new(ds.field_set[:bar]),
66
+ :alias => :bar_baz
67
+ })
68
+ ds.each_group do |group|
69
+ assert_equal({:bar_baz => "foo"}, group.values)
70
+ assert_equal(2, group.count)
71
+ end
72
+ end
73
+
54
74
  test "each_group with filters" do
55
75
  database do |db|
56
76
  db.create_table(:foo) do
@@ -62,7 +82,7 @@ class IntegrationTests::TestDataset < Test::Unit::TestCase
62
82
  end
63
83
 
64
84
  ds = Linkage::Dataset.new(@tmpuri, "foo")
65
- ds = ds.match(:bar)
85
+ ds = ds.group_match(Linkage::MetaObject.new(ds.field_set[:bar]))
66
86
  ds = ds.filter { baz >= 3 }
67
87
  groups = []
68
88
  ds.each_group(1) do |group|
@@ -70,4 +90,27 @@ class IntegrationTests::TestDataset < Test::Unit::TestCase
70
90
  end
71
91
  assert_equal 2, groups.length
72
92
  end
93
+
94
+ test "each_group with collation" do
95
+ database_for('mysql') do |db|
96
+ db.create_table!(:foo) do
97
+ primary_key :id
98
+ String :bar, :collate => :latin1_swedish_ci
99
+ end
100
+ db[:foo].import([:id, :bar], [[1, 'fOo'], [2, 'foO'], [3, 'bar'], [4, 'baz']])
101
+ end
102
+
103
+ ds = Linkage::Dataset.new(database_options_for('mysql'), "foo")
104
+ ds = ds.group_match(Linkage::MetaObject.new(ds.field_set[:bar]))
105
+ groups = []
106
+ ds.each_group(1) do |group|
107
+ groups << group
108
+ end
109
+ expected = [
110
+ {:bar => 'BAR'},
111
+ {:bar => 'BAZ'},
112
+ {:bar => 'FOO'}
113
+ ]
114
+ assert_equal expected, groups.collect(&:decollated_values)
115
+ end
73
116
  end
@@ -34,6 +34,8 @@ module IntegrationTests
34
34
  lhs[:ssn].must == rhs[:ssn]
35
35
  save_results_in(tmpuri)
36
36
  end
37
+ assert_equal :dual, conf.linkage_type
38
+
37
39
  runner = Linkage::SingleThreadedRunner.new(conf)
38
40
  runner.execute
39
41
 
@@ -43,16 +45,10 @@ module IntegrationTests
43
45
  assert_equal "12345678#{i%10}", row[:ssn]
44
46
  end
45
47
 
46
- #assert_equal 200, db[:groups_records].count
47
- #db[:groups_records].order(:group_id, :dataset, :record_id).each_with_index do |row, i|
48
- #if i % 20 >= 10
49
- #assert_equal 2, row[:dataset], row.inspect
50
- #else
51
- #assert_equal 1, row[:dataset], row.inspect
52
- #end
53
- #expected_group_id = i / 20 + 1
54
- #assert_equal expected_group_id, row[:group_id], "Record #{row.inspect} should have been in group #{expected_group_id}"
55
- #end
48
+ assert_equal 1000, db[:matches].count
49
+ db[:matches].order(:record_1_id, :record_2_id).each do |row|
50
+ assert_equal row[:record_1_id] % 10, row[:record_2_id] % 10
51
+ end
56
52
  end
57
53
  end
58
54
 
@@ -87,32 +83,52 @@ module IntegrationTests
87
83
  end
88
84
 
89
85
  test "reacts properly when using two databases with different string equality methods" do
90
- if !test_config['mysql']
91
- omission("No MySQL test configuration found")
86
+ foo_logger = nil #prefixed_logger("FOO")
87
+ bar_logger = nil #prefixed_logger("BAR")
88
+
89
+ database_for('mysql', :logger => foo_logger) do |db|
90
+ db.create_table!(:foo) do
91
+ primary_key(:id)
92
+ String :baz, :collate => "latin1_swedish_ci"
93
+ end
94
+ db[:foo].import([:id, :baz], [
95
+ [1, "tEst"],
96
+ [2, "teSt"],
97
+ [3, "tesT "],
98
+ [4, "TEST"],
99
+ [5, "junk"]
100
+ ])
92
101
  end
93
- uri = "mysql2://%s:%s/%s?user=%s" % test_config['mysql'].values_at('host', 'port', 'database', 'user')
94
- Sequel.connect(uri) do |db|
95
- db.create_table!(:foo) { primary_key(:id); String(:one); String(:two) }
96
- db[:foo].import([:id, :one, :two], [[1, "", "test"], [2, "", "test"], [3, " ", "test "], [4, "", "test"], [5, "", "junk"]])
97
102
 
98
- db.create_table!(:bar) { primary_key(:id); String(:one); String(:two) }
99
- db[:bar].import([:id, :one, :two], [[1, "", "junk"]])
103
+ database_for('mysql', :logger => bar_logger) do |db|
104
+ db.create_table!(:bar) do
105
+ primary_key(:id)
106
+ String :baz, :collate => "latin1_swedish_ci"
107
+ end
108
+ db[:bar].import([:id, :baz], [
109
+ [1, "Test "],
110
+ [2, "tEst "],
111
+ [3, "teSt"],
112
+ [4, "TEST"],
113
+ [5, "junk"]
114
+ ])
100
115
  end
101
116
 
102
- ds_1 = Linkage::Dataset.new(uri, "foo", :single_threaded => true)
103
- ds_2 = Linkage::Dataset.new(uri, "bar", :single_threaded => true)
117
+ options = database_options_for('mysql')
118
+ ds_1 = Linkage::Dataset.new(options, "foo", :logger => foo_logger)
119
+ ds_2 = Linkage::Dataset.new(options, "bar", :logger => bar_logger)
104
120
  tmpuri = @tmpuri
121
+ results_logger = nil #prefixed_logger("RESULTS")
105
122
  conf = ds_1.link_with(ds_2) do
106
- lhs[:one].must == rhs[:one]
107
- lhs[:two].must == rhs[:two]
108
- save_results_in(tmpuri)
123
+ lhs[:baz].must == rhs[:baz]
124
+ save_results_in(tmpuri, :logger => results_logger)
109
125
  end
110
126
 
111
127
  runner = Linkage::SingleThreadedRunner.new(conf)
112
128
  runner.execute
113
129
 
114
130
  database do |db|
115
- assert_equal 1, db[:groups].count
131
+ assert_equal 2, db[:groups].count
116
132
  end
117
133
  end
118
134
  end
@@ -62,5 +62,27 @@ module IntegrationTests
62
62
  assert_equal 1, db[:groups].count
63
63
  end
64
64
  end
65
+
66
+ test "binary function with static argument" do
67
+ database do |db|
68
+ db.create_table(:foo) { primary_key(:id); String(:bar) }
69
+ db[:foo].import([:id, :bar], [[1, 'foo'], [2, 'foo']])
70
+ end
71
+
72
+ ds = Linkage::Dataset.new(@tmpuri, "foo")
73
+ tmpuri = @tmpuri
74
+ conf = ds.link_with(ds) do
75
+ lhs[:bar].must == rhs[:bar]
76
+ binary(lhs[:bar]).must == binary('foo')
77
+ binary(rhs[:bar]).must == binary('foo')
78
+ save_results_in(tmpuri)
79
+ end
80
+ runner = Linkage::SingleThreadedRunner.new(conf)
81
+ runner.execute
82
+
83
+ database do |db|
84
+ assert_equal 1, db[:groups].count
85
+ end
86
+ end
65
87
  end
66
88
  end
@@ -0,0 +1,85 @@
1
+ require 'helper'
2
+
3
+ module IntegrationTests
4
+ class TestResultSet < Test::Unit::TestCase
5
+ test "#create_tables! creates original_groups table when decollation is needed" do
6
+ database_for('sqlite') do |db|
7
+ db.create_table!(:foo) { primary_key(:id); String(:foo) }
8
+ end
9
+
10
+ database_for('mysql') do |db|
11
+ db.create_table!(:foo) { primary_key(:id); String(:foo) }
12
+ end
13
+
14
+ dataset_1 = Linkage::Dataset.new(database_options_for('sqlite'), 'foo')
15
+ dataset_2 = Linkage::Dataset.new(database_options_for('mysql'), 'foo')
16
+ results_uri = database_options_for('sqlite')
17
+ conf = dataset_1.link_with(dataset_2) do
18
+ lhs[:foo].must == rhs[:foo]
19
+ save_results_in(results_uri)
20
+ end
21
+ conf.result_set.create_tables!
22
+ assert_include conf.result_set.database.tables, :original_groups
23
+ end
24
+
25
+ test "#create_tables! doesn't create original_groups table when decollation is not needed" do
26
+ database_for('sqlite') do |db|
27
+ db.create_table!(:foo) { primary_key(:id); String(:foo) }
28
+ end
29
+
30
+ dataset = Linkage::Dataset.new(database_options_for('sqlite'), 'foo')
31
+ results_uri = database_options_for('sqlite')
32
+ conf = dataset.link_with(dataset) do
33
+ lhs[:foo].must == rhs[:foo]
34
+ save_results_in(results_uri)
35
+ end
36
+ conf.result_set.create_tables!
37
+ assert_not_include conf.result_set.database.tables, :original_groups
38
+ end
39
+
40
+ test "#create_tables! doesn't create groups table when not needed" do
41
+ database_for('sqlite') do |db|
42
+ db.create_table!(:foo) { primary_key(:id); Integer(:foo) }
43
+ end
44
+
45
+ dataset = Linkage::Dataset.new(database_options_for('sqlite'), 'foo')
46
+ results_uri = database_options_for('sqlite')
47
+ conf = dataset.link_with(dataset) do
48
+ lhs[:foo].must be_within(5).of(rhs[:foo])
49
+ save_results_in(results_uri)
50
+ end
51
+ conf.result_set.create_tables!
52
+ assert_not_include conf.result_set.database.tables, :groups
53
+ end
54
+
55
+ test "#create_tables! creates scores table when there are exhaustive expectations" do
56
+ database_for('sqlite') do |db|
57
+ db.create_table!(:foo) { primary_key(:id); Integer(:foo) }
58
+ end
59
+
60
+ dataset = Linkage::Dataset.new(database_options_for('sqlite'), 'foo')
61
+ results_uri = database_options_for('sqlite')
62
+ conf = dataset.link_with(dataset) do
63
+ lhs[:foo].must be_within(5).of(rhs[:foo])
64
+ save_results_in(results_uri)
65
+ end
66
+ conf.result_set.create_tables!
67
+ assert_include conf.result_set.database.tables, :scores
68
+ end
69
+
70
+ test "#create_tables! doesn't create scores table when not needed" do
71
+ database_for('sqlite') do |db|
72
+ db.create_table!(:foo) { primary_key(:id); Integer(:foo) }
73
+ end
74
+
75
+ dataset = Linkage::Dataset.new(database_options_for('sqlite'), 'foo')
76
+ results_uri = database_options_for('sqlite')
77
+ conf = dataset.link_with(dataset) do
78
+ lhs[:foo].must == rhs[:foo]
79
+ save_results_in(results_uri)
80
+ end
81
+ conf.result_set.create_tables!
82
+ assert_not_include conf.result_set.database.tables, :scores
83
+ end
84
+ end
85
+ end
@@ -0,0 +1,84 @@
1
+ require 'helper'
2
+
3
+ module IntegrationTests
4
+ class TestScoring < Test::Unit::TestCase
5
+ test "stop scoring if must expectation fails" do
6
+ database_for('sqlite') do |db|
7
+ db.create_table(:foo) { primary_key(:id); Integer(:num) }
8
+ db.create_table(:bar) { primary_key(:id); Integer(:num) }
9
+ db[:foo].import([:id, :num], [[1, 1]])
10
+ db[:bar].import([:id, :num], [[1, 5]])
11
+ end
12
+
13
+ db_opts = database_options_for('sqlite')
14
+ dataset_1 = Linkage::Dataset.new(db_opts, "foo")
15
+ dataset_2 = Linkage::Dataset.new(db_opts, "bar")
16
+ conf = dataset_1.link_with(dataset_2) do
17
+ lhs[:num].must_not be_within(5).of(rhs[:num])
18
+ lhs[:num].must be_within(5).of(rhs[:num])
19
+ save_results_in(db_opts)
20
+ end
21
+
22
+ runner = Linkage::SingleThreadedRunner.new(conf)
23
+ runner.execute
24
+
25
+ database_for('sqlite') do |db|
26
+ assert_equal db[:scores].count, 1
27
+ record = db[:scores].first
28
+ assert_equal 1, record[:score]
29
+ end
30
+ end
31
+
32
+ test "scoring phase adds matches as needed" do
33
+ database_for('sqlite') do |db|
34
+ db.create_table(:foo) { primary_key(:id); Integer(:num) }
35
+ db.create_table(:bar) { primary_key(:id); Integer(:num) }
36
+ db[:foo].import([:id, :num], (0..15).collect { |i| [i, i] })
37
+ db[:bar].import([:id, :num], (0..15).collect { |i| [i, i] })
38
+ end
39
+
40
+ db_opts = database_options_for('sqlite')
41
+ dataset_1 = Linkage::Dataset.new(db_opts, "foo")
42
+ dataset_2 = Linkage::Dataset.new(db_opts, "bar")
43
+ conf = dataset_1.link_with(dataset_2) do
44
+ lhs[:num].must be_within(10).of(rhs[:num])
45
+ lhs[:num].must_not be_within(5).of(rhs[:num])
46
+ save_results_in(db_opts)
47
+ end
48
+
49
+ runner = Linkage::SingleThreadedRunner.new(conf)
50
+ runner.execute
51
+
52
+ database_for('sqlite') do |db|
53
+ assert_equal 80, db[:matches].count
54
+ db[:matches].order(:record_1_id, :record_2_id).each do |row|
55
+ assert_equal 1, row[:total_score]
56
+ assert_include 6..10, (row[:record_1_id] - row[:record_2_id]).abs
57
+ end
58
+ end
59
+ end
60
+
61
+ test "optimize scoring for self linkage" do
62
+ database_for('sqlite') do |db|
63
+ db.create_table(:foo) { primary_key(:id); Integer(:num) }
64
+ db[:foo].import([:id, :num], [[1, 1], [2, 5], [3, 10]])
65
+ end
66
+
67
+ db_opts = database_options_for('sqlite')
68
+ dataset = Linkage::Dataset.new(db_opts, "foo")
69
+ conf = dataset.link_with(dataset) do
70
+ lhs[:num].must be_within(5).of(rhs[:num])
71
+ save_results_in(db_opts)
72
+ end
73
+
74
+ runner = Linkage::SingleThreadedRunner.new(conf)
75
+ runner.execute
76
+
77
+ database_for('sqlite') do |db|
78
+ assert_equal db[:scores].count, 3
79
+ scores = db[:scores].order(:record_1_id, :record_2_id).select_map(:score)
80
+ assert_equal [1, 0, 1], scores
81
+ end
82
+ end
83
+ end
84
+ end
@@ -42,6 +42,11 @@ module IntegrationTests
42
42
  dataset, _ = result_set.groups_records_datasets(group)
43
43
  assert_equal 10, dataset.count
44
44
  end
45
+
46
+ assert_equal 450, db[:matches].count
47
+ db[:matches].order(:record_1_id, :record_2_id).each do |row|
48
+ assert_equal row[:record_1_id] % 10, row[:record_2_id] % 10
49
+ end
45
50
  end
46
51
  end
47
52