linkage 0.0.6 → 0.0.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66)
  1. data/.gitignore +10 -0
  2. data/Gemfile +15 -13
  3. data/Gemfile.lock +67 -37
  4. data/Guardfile +0 -2
  5. data/Rakefile +122 -25
  6. data/lib/linkage/comparator.rb +172 -0
  7. data/lib/linkage/comparators/binary.rb +12 -0
  8. data/lib/linkage/comparators/compare.rb +46 -0
  9. data/lib/linkage/comparators/within.rb +32 -0
  10. data/lib/linkage/configuration.rb +285 -153
  11. data/lib/linkage/data.rb +32 -7
  12. data/lib/linkage/dataset.rb +107 -32
  13. data/lib/linkage/decollation.rb +93 -0
  14. data/lib/linkage/expectation.rb +21 -0
  15. data/lib/linkage/expectations/exhaustive.rb +63 -0
  16. data/lib/linkage/expectations/simple.rb +168 -0
  17. data/lib/linkage/field.rb +30 -4
  18. data/lib/linkage/field_set.rb +6 -3
  19. data/lib/linkage/function.rb +50 -3
  20. data/lib/linkage/functions/binary.rb +30 -0
  21. data/lib/linkage/functions/cast.rb +54 -0
  22. data/lib/linkage/functions/length.rb +29 -0
  23. data/lib/linkage/functions/strftime.rb +12 -11
  24. data/lib/linkage/functions/trim.rb +8 -0
  25. data/lib/linkage/group.rb +20 -0
  26. data/lib/linkage/import_buffer.rb +5 -16
  27. data/lib/linkage/meta_object.rb +139 -0
  28. data/lib/linkage/result_set.rb +74 -17
  29. data/lib/linkage/runner/single_threaded.rb +125 -10
  30. data/lib/linkage/version.rb +3 -0
  31. data/lib/linkage.rb +11 -0
  32. data/linkage.gemspec +16 -121
  33. data/test/config.yml +5 -0
  34. data/test/helper.rb +73 -8
  35. data/test/integration/test_collation.rb +45 -0
  36. data/test/integration/test_configuration.rb +268 -0
  37. data/test/integration/test_cross_linkage.rb +4 -17
  38. data/test/integration/test_dataset.rb +45 -2
  39. data/test/integration/test_dual_linkage.rb +40 -24
  40. data/test/integration/test_functions.rb +22 -0
  41. data/test/integration/test_result_set.rb +85 -0
  42. data/test/integration/test_scoring.rb +84 -0
  43. data/test/integration/test_self_linkage.rb +5 -0
  44. data/test/integration/test_within_comparator.rb +100 -0
  45. data/test/unit/comparators/test_compare.rb +105 -0
  46. data/test/unit/comparators/test_within.rb +57 -0
  47. data/test/unit/expectations/test_exhaustive.rb +111 -0
  48. data/test/unit/expectations/test_simple.rb +303 -0
  49. data/test/unit/functions/test_binary.rb +54 -0
  50. data/test/unit/functions/test_cast.rb +98 -0
  51. data/test/unit/functions/test_length.rb +52 -0
  52. data/test/unit/functions/test_strftime.rb +17 -13
  53. data/test/unit/functions/test_trim.rb +11 -4
  54. data/test/unit/test_comparator.rb +124 -0
  55. data/test/unit/test_configuration.rb +137 -175
  56. data/test/unit/test_data.rb +44 -0
  57. data/test/unit/test_dataset.rb +73 -21
  58. data/test/unit/test_decollation.rb +201 -0
  59. data/test/unit/test_field.rb +38 -14
  60. data/test/unit/test_field_set.rb +12 -8
  61. data/test/unit/test_function.rb +83 -16
  62. data/test/unit/test_group.rb +28 -0
  63. data/test/unit/test_import_buffer.rb +13 -27
  64. data/test/unit/test_meta_object.rb +208 -0
  65. data/test/unit/test_result_set.rb +221 -3
  66. metadata +82 -190
@@ -0,0 +1,268 @@
1
+ require 'helper'
2
+
3
+ module IntegrationTests
4
+ class TestConfiguration < Test::Unit::TestCase
5
+ test "linkage_type is self when the two datasets are the same" do
6
+ database_for('sqlite') do |db|
7
+ db.create_table(:foo) { primary_key(:id); String(:foo); String(:bar) }
8
+ end
9
+
10
+ dataset = Linkage::Dataset.new(database_options_for('sqlite'), "foo")
11
+ conf = Linkage::Configuration.new(dataset, dataset)
12
+ assert_equal :self, conf.linkage_type
13
+ end
14
+
15
+ test "linkage_type is dual when the two datasets are different" do
16
+ database_for('sqlite') do |db|
17
+ db.create_table(:foo) { primary_key(:id); String(:foo); String(:bar) }
18
+ db.create_table(:bar) { primary_key(:id); String(:foo); String(:bar) }
19
+ end
20
+
21
+ dataset_1 = Linkage::Dataset.new(database_options_for('sqlite'), "foo")
22
+ dataset_2 = Linkage::Dataset.new(database_options_for('sqlite'), "bar")
23
+ conf = Linkage::Configuration.new(dataset_1, dataset_2)
24
+ assert_equal :dual, conf.linkage_type
25
+ end
26
+
27
+ test "linkage_type is cross when there's different filters on both sides" do
28
+ database_for('sqlite') do |db|
29
+ db.create_table(:foo) { primary_key(:id); String(:foo); String(:bar) }
30
+ end
31
+
32
+ dataset = Linkage::Dataset.new(database_options_for('sqlite'), "foo")
33
+ conf = Linkage::Configuration.new(dataset, dataset)
34
+ conf.configure do
35
+ lhs[:foo].must == "foo"
36
+ rhs[:foo].must == "bar"
37
+ end
38
+ assert_equal :cross, conf.linkage_type
39
+ end
40
+
41
+ test "linkage_type is self when there's identical static filters on each side" do
42
+ database_for('sqlite') do |db|
43
+ db.create_table(:foo) { primary_key(:id); String(:foo); String(:bar) }
44
+ end
45
+
46
+ dataset = Linkage::Dataset.new(database_options_for('sqlite'), "foo")
47
+ conf = Linkage::Configuration.new(dataset, dataset)
48
+ conf.configure do
49
+ lhs[:foo].must == "foo"
50
+ rhs[:foo].must == "foo"
51
+ end
52
+ assert_equal :self, conf.linkage_type
53
+ end
54
+
55
+ test "linkage_type is cross when exhaustive expectations use different fields" do
56
+ database_for('sqlite') do |db|
57
+ db.create_table(:foo) { primary_key(:id); Integer(:foo); Integer(:bar) }
58
+ end
59
+
60
+ dataset = Linkage::Dataset.new(database_options_for('sqlite'), "foo")
61
+ conf = Linkage::Configuration.new(dataset, dataset)
62
+ conf.configure do
63
+ lhs[:foo].must(be_within(5).of(rhs[:bar]))
64
+ end
65
+ assert_equal :cross, conf.linkage_type
66
+ end
67
+
68
+ test "static expectation" do
69
+ database_for('sqlite') do |db|
70
+ db.create_table(:foo) { primary_key(:id); String(:foo); String(:bar) }
71
+ end
72
+
73
+ dataset_1 = Linkage::Dataset.new(database_options_for('sqlite'), "foo")
74
+ conf = Linkage::Configuration.new(dataset_1, dataset_1)
75
+ conf.configure do
76
+ lhs[:foo].must == "foo"
77
+ end
78
+
79
+ dataset_2, _ = conf.datasets_with_applied_simple_expectations
80
+ assert_equal dataset_2.obj, dataset_1.filter(:foo => "foo").obj
81
+ end
82
+
83
+ test "complain if an invalid field is accessed" do
84
+ database_for('sqlite') do |db|
85
+ db.create_table(:foo) { primary_key(:id); String(:foo); String(:bar) }
86
+ end
87
+
88
+ dataset = Linkage::Dataset.new(database_options_for('sqlite'), "foo")
89
+ conf = Linkage::Configuration.new(dataset, dataset)
90
+ assert_raises(ArgumentError) do
91
+ conf.configure do
92
+ lhs[:foo].must == rhs[:non_existant_field]
93
+ end
94
+ end
95
+ end
96
+
97
+ operators = [:>, :<, :>=, :<=]
98
+ operators.each do |operator|
99
+ test "DSL #{operator} filter operator" do
100
+ database_for('sqlite') do |db|
101
+ db.create_table(:foo) { primary_key(:id); String(:foo); String(:bar) }
102
+ end
103
+
104
+ dataset_1 = Linkage::Dataset.new(database_options_for('sqlite'), "foo")
105
+ conf = Linkage::Configuration.new(dataset_1, dataset_1)
106
+ conf.configure do
107
+ lhs[:foo].must.send(operator, 123)
108
+ end
109
+
110
+ expr = Sequel::SQL::BooleanExpression.new(operator, Sequel::SQL::Identifier.new(:foo), 123)
111
+ dataset_2, _ = conf.datasets_with_applied_simple_expectations
112
+ assert_equal dataset_2.obj, dataset_1.filter(expr).obj
113
+ end
114
+
115
+ test "comparing two data sources with #{operator}" do
116
+ database_for('sqlite') do |db|
117
+ db.create_table(:foo) { primary_key(:id); Integer(:foo); Integer(:bar) }
118
+ end
119
+
120
+ dataset = Linkage::Dataset.new(database_options_for('sqlite'), "foo")
121
+ conf = Linkage::Configuration.new(dataset, dataset)
122
+ conf.configure do
123
+ lhs[:foo].must.send(operator, rhs[:bar])
124
+ end
125
+ assert_equal 1, conf.exhaustive_expectations.length
126
+
127
+ comp = conf.exhaustive_expectations[0].comparator
128
+ assert_instance_of Linkage::Comparators::Compare, comp
129
+ end
130
+ end
131
+
132
+ test "must_not expectation" do
133
+ database_for('sqlite') do |db|
134
+ db.create_table(:foo) { primary_key(:id); String(:foo); String(:bar) }
135
+ end
136
+
137
+ dataset_1 = Linkage::Dataset.new(database_options_for('sqlite'), "foo")
138
+ conf = Linkage::Configuration.new(dataset_1, dataset_1)
139
+ conf.configure do
140
+ lhs[:foo].must_not == "foo"
141
+ end
142
+
143
+ dataset_2, _ = conf.datasets_with_applied_simple_expectations
144
+ assert_equal dataset_2.obj, dataset_1.filter(~{:foo => "foo"}).obj
145
+ end
146
+
147
+ test "static database function" do
148
+ database_for('sqlite') do |db|
149
+ db.create_table(:foo) { primary_key(:id); String(:foo); String(:bar) }
150
+ end
151
+
152
+ dataset_1 = Linkage::Dataset.new(database_options_for('sqlite'), "foo")
153
+ conf = Linkage::Configuration.new(dataset_1, dataset_1)
154
+ conf.configure do
155
+ lhs[:foo].must == trim("foo")
156
+ end
157
+
158
+ dataset_2, _ = conf.datasets_with_applied_simple_expectations
159
+ assert_equal dataset_1.filter({:foo => :trim.sql_function("foo")}).obj, dataset_2.obj
160
+ end
161
+
162
+ test "save_results_in" do
163
+ database_for('sqlite') do |db|
164
+ db.create_table(:foo) { primary_key(:id); String(:foo); String(:bar) }
165
+ end
166
+
167
+ dataset_1 = Linkage::Dataset.new(database_options_for('sqlite'), "foo")
168
+ conf = Linkage::Configuration.new(dataset_1, dataset_1)
169
+ conf.configure do
170
+ save_results_in("mysql://localhost/results", {:foo => 'bar'})
171
+ end
172
+ assert_equal "mysql://localhost/results", conf.results_uri
173
+ assert_equal({:foo => 'bar'}, conf.results_uri_options)
174
+ end
175
+
176
+ test "case insensitive field names" do
177
+ database_for('sqlite') do |db|
178
+ db.create_table(:foo) { primary_key(:id); String(:foo); String(:bar) }
179
+ end
180
+
181
+ assert_nothing_raised do
182
+ dataset = Linkage::Dataset.new(database_options_for('sqlite'), "foo")
183
+ results_uri = database_options_for('sqlite')
184
+ conf = dataset.link_with(dataset) do
185
+ lhs[:Foo].must == rhs[:baR]
186
+ save_results_in(results_uri)
187
+ end
188
+ end
189
+ end
190
+
191
+ test "decollation_needed? is false when the datasets and results dataset all have the same database and collations" do
192
+ database_for('mysql') do |db|
193
+ db.create_table!(:foo) { primary_key(:id); String(:foo, :collate => :latin1_swedish_ci) }
194
+ db.create_table!(:bar) { primary_key(:id); String(:foo, :collate => :latin1_swedish_ci) }
195
+ end
196
+
197
+ dataset_1 = Linkage::Dataset.new(database_options_for('mysql'), 'foo')
198
+ dataset_2 = Linkage::Dataset.new(database_options_for('mysql'), 'bar')
199
+ conf = dataset_1.link_with(dataset_2) do
200
+ lhs[:foo].must == rhs[:foo]
201
+ end
202
+ conf.results_uri = database_options_for('mysql')
203
+ assert !conf.decollation_needed?
204
+ end
205
+
206
+ test "decollation_needed? is true when the datasets have different database types" do
207
+ database_for('mysql') do |db|
208
+ db.create_table!(:foo) { primary_key(:id); String(:foo) }
209
+ end
210
+
211
+ database_for('sqlite') do |db|
212
+ db.create_table!(:foo) { primary_key(:id); String(:foo) }
213
+ end
214
+
215
+ dataset_1 = Linkage::Dataset.new(database_options_for('mysql'), 'foo')
216
+ dataset_2 = Linkage::Dataset.new(database_options_for('sqlite'), 'foo')
217
+ conf = dataset_1.link_with(dataset_2) do
218
+ lhs[:foo].must == rhs[:foo]
219
+ end
220
+ conf.results_uri = database_options_for('mysql')
221
+ assert conf.decollation_needed?
222
+ end
223
+
224
+ test "decollation_needed? is true when the result dataset has different database type than the datasets" do
225
+ database_for('mysql') do |db|
226
+ db.create_table!(:foo) { primary_key(:id); String(:foo) }
227
+ db.create_table!(:bar) { primary_key(:id); String(:foo) }
228
+ end
229
+
230
+ dataset_1 = Linkage::Dataset.new(database_options_for('mysql'), 'foo')
231
+ dataset_2 = Linkage::Dataset.new(database_options_for('mysql'), 'bar')
232
+ conf = dataset_1.link_with(dataset_2) do
233
+ lhs[:foo].must == rhs[:foo]
234
+ end
235
+ conf.results_uri = database_options_for('sqlite')
236
+ assert conf.decollation_needed?
237
+ end
238
+
239
+ test "decollation_needed? is false when not comparing string columns" do
240
+ database_for('mysql') do |db|
241
+ db.create_table!(:foo) { primary_key(:id); Fixnum(:foo) }
242
+ end
243
+
244
+ database_for('sqlite') do |db|
245
+ db.create_table!(:foo) { primary_key(:id); Fixnum(:foo) }
246
+ end
247
+
248
+ dataset_1 = Linkage::Dataset.new(database_options_for('mysql'), 'foo')
249
+ dataset_2 = Linkage::Dataset.new(database_options_for('sqlite'), 'foo')
250
+ conf = dataset_1.link_with(dataset_2) do
251
+ lhs[:foo].must == rhs[:foo]
252
+ end
253
+ conf.results_uri = database_options_for('mysql')
254
+ assert !conf.decollation_needed?
255
+ end
256
+
257
+ test "creating comparator expectation for within" do
258
+ database_for('mysql') do |db|
259
+ db.create_table!(:foo) { primary_key(:id); Integer(:foo) }
260
+ end
261
+ dataset = Linkage::Dataset.new(database_options_for('mysql'), 'foo')
262
+
263
+ conf = dataset.link_with(dataset) do
264
+ lhs[:foo].must be_within(5).of(rhs[:foo])
265
+ end
266
+ end
267
+ end
268
+ end
@@ -16,10 +16,6 @@ module IntegrationTests
16
16
  end
17
17
 
18
18
  test "one mandatory field equality on single threaded runner" do
19
- #setup_logger = Logger.new(STDERR)
20
- #setup_logger.formatter = lambda { |severity, time, progname, msg|
21
- #" SETUP : %s [%s]: %s\n" % [severity, time, msg]
22
- #}
23
19
  # insert the test data
24
20
  database do |db|
25
21
  db.create_table(:foo) { primary_key(:id); Integer(:foo); Integer(:bar) }
@@ -27,16 +23,8 @@ module IntegrationTests
27
23
  Array.new(100) { |i| [i, i % 10, i % 5] })
28
24
  end
29
25
 
30
- #ds_logger = Logger.new(STDERR)
31
- #ds_logger.formatter = lambda { |severity, time, progname, msg|
32
- #"DATASET: %s [%s]: %s\n" % [severity, time, msg]
33
- #}
34
26
  ds = Linkage::Dataset.new(@tmpuri, "foo", :single_threaded => true)
35
27
 
36
- #rs_logger = Logger.new(STDERR)
37
- #rs_logger.formatter = lambda { |severity, time, progname, msg|
38
- #"RESULTS: %s [%s]: %s\n" % [severity, time, msg]
39
- #}
40
28
  tmpuri = @tmpuri
41
29
  conf = ds.link_with(ds) do
42
30
  lhs[:foo].must == rhs[:bar]
@@ -52,11 +40,10 @@ module IntegrationTests
52
40
  assert_equal i, row[:foo_bar]
53
41
  end
54
42
 
55
- #assert_equal 150, db[:groups_records].count
56
- #db[:groups_records].order(:group_id, :dataset, :record_id).each_with_index do |row, i|
57
- #expected_group_id = (row[:record_id] % 5) + 1
58
- #assert_equal expected_group_id, row[:group_id], "Record #{row[:record_id]} should have been in group #{expected_group_id}"
59
- #end
43
+ assert_equal 1000, db[:matches].count
44
+ db[:matches].order(:record_1_id, :record_2_id).each do |row|
45
+ assert_equal row[:record_1_id] % 10, row[:record_2_id] % 5
46
+ end
60
47
  end
61
48
  end
62
49
 
@@ -38,7 +38,7 @@ class IntegrationTests::TestDataset < Test::Unit::TestCase
38
38
  end
39
39
 
40
40
  ds = Linkage::Dataset.new(@tmpuri, "foo")
41
- ds = ds.match(:bar)
41
+ ds = ds.group_match(Linkage::MetaObject.new(ds.field_set[:bar]))
42
42
  ds.each_group do |group|
43
43
  assert_equal({:bar => "foo"}, group.values)
44
44
  assert_equal(2, group.count)
@@ -51,6 +51,26 @@ class IntegrationTests::TestDataset < Test::Unit::TestCase
51
51
  assert_equal 3, groups.length
52
52
  end
53
53
 
54
+ test "each_group with alias" do
55
+ database do |db|
56
+ db.create_table(:foo) do
57
+ primary_key :id
58
+ String :bar
59
+ end
60
+ db[:foo].import([:id, :bar], [[1, 'foo'], [2, 'foo'], [3, 'bar'], [4, 'baz']])
61
+ end
62
+
63
+ ds = Linkage::Dataset.new(@tmpuri, "foo")
64
+ ds = ds.group_match({
65
+ :meta_object => Linkage::MetaObject.new(ds.field_set[:bar]),
66
+ :alias => :bar_baz
67
+ })
68
+ ds.each_group do |group|
69
+ assert_equal({:bar_baz => "foo"}, group.values)
70
+ assert_equal(2, group.count)
71
+ end
72
+ end
73
+
54
74
  test "each_group with filters" do
55
75
  database do |db|
56
76
  db.create_table(:foo) do
@@ -62,7 +82,7 @@ class IntegrationTests::TestDataset < Test::Unit::TestCase
62
82
  end
63
83
 
64
84
  ds = Linkage::Dataset.new(@tmpuri, "foo")
65
- ds = ds.match(:bar)
85
+ ds = ds.group_match(Linkage::MetaObject.new(ds.field_set[:bar]))
66
86
  ds = ds.filter { baz >= 3 }
67
87
  groups = []
68
88
  ds.each_group(1) do |group|
@@ -70,4 +90,27 @@ class IntegrationTests::TestDataset < Test::Unit::TestCase
70
90
  end
71
91
  assert_equal 2, groups.length
72
92
  end
93
+
94
+ test "each_group with collation" do
95
+ database_for('mysql') do |db|
96
+ db.create_table!(:foo) do
97
+ primary_key :id
98
+ String :bar, :collate => :latin1_swedish_ci
99
+ end
100
+ db[:foo].import([:id, :bar], [[1, 'fOo'], [2, 'foO'], [3, 'bar'], [4, 'baz']])
101
+ end
102
+
103
+ ds = Linkage::Dataset.new(database_options_for('mysql'), "foo")
104
+ ds = ds.group_match(Linkage::MetaObject.new(ds.field_set[:bar]))
105
+ groups = []
106
+ ds.each_group(1) do |group|
107
+ groups << group
108
+ end
109
+ expected = [
110
+ {:bar => 'BAR'},
111
+ {:bar => 'BAZ'},
112
+ {:bar => 'FOO'}
113
+ ]
114
+ assert_equal expected, groups.collect(&:decollated_values)
115
+ end
73
116
  end
@@ -34,6 +34,8 @@ module IntegrationTests
34
34
  lhs[:ssn].must == rhs[:ssn]
35
35
  save_results_in(tmpuri)
36
36
  end
37
+ assert_equal :dual, conf.linkage_type
38
+
37
39
  runner = Linkage::SingleThreadedRunner.new(conf)
38
40
  runner.execute
39
41
 
@@ -43,16 +45,10 @@ module IntegrationTests
43
45
  assert_equal "12345678#{i%10}", row[:ssn]
44
46
  end
45
47
 
46
- #assert_equal 200, db[:groups_records].count
47
- #db[:groups_records].order(:group_id, :dataset, :record_id).each_with_index do |row, i|
48
- #if i % 20 >= 10
49
- #assert_equal 2, row[:dataset], row.inspect
50
- #else
51
- #assert_equal 1, row[:dataset], row.inspect
52
- #end
53
- #expected_group_id = i / 20 + 1
54
- #assert_equal expected_group_id, row[:group_id], "Record #{row.inspect} should have been in group #{expected_group_id}"
55
- #end
48
+ assert_equal 1000, db[:matches].count
49
+ db[:matches].order(:record_1_id, :record_2_id).each do |row|
50
+ assert_equal row[:record_1_id] % 10, row[:record_2_id] % 10
51
+ end
56
52
  end
57
53
  end
58
54
 
@@ -87,32 +83,52 @@ module IntegrationTests
87
83
  end
88
84
 
89
85
  test "reacts properly when using two databases with different string equality methods" do
90
- if !test_config['mysql']
91
- omission("No MySQL test configuration found")
86
+ foo_logger = nil #prefixed_logger("FOO")
87
+ bar_logger = nil #prefixed_logger("BAR")
88
+
89
+ database_for('mysql', :logger => foo_logger) do |db|
90
+ db.create_table!(:foo) do
91
+ primary_key(:id)
92
+ String :baz, :collate => "latin1_swedish_ci"
93
+ end
94
+ db[:foo].import([:id, :baz], [
95
+ [1, "tEst"],
96
+ [2, "teSt"],
97
+ [3, "tesT "],
98
+ [4, "TEST"],
99
+ [5, "junk"]
100
+ ])
92
101
  end
93
- uri = "mysql2://%s:%s/%s?user=%s" % test_config['mysql'].values_at('host', 'port', 'database', 'user')
94
- Sequel.connect(uri) do |db|
95
- db.create_table!(:foo) { primary_key(:id); String(:one); String(:two) }
96
- db[:foo].import([:id, :one, :two], [[1, "", "test"], [2, "", "test"], [3, " ", "test "], [4, "", "test"], [5, "", "junk"]])
97
102
 
98
- db.create_table!(:bar) { primary_key(:id); String(:one); String(:two) }
99
- db[:bar].import([:id, :one, :two], [[1, "", "junk"]])
103
+ database_for('mysql', :logger => bar_logger) do |db|
104
+ db.create_table!(:bar) do
105
+ primary_key(:id)
106
+ String :baz, :collate => "latin1_swedish_ci"
107
+ end
108
+ db[:bar].import([:id, :baz], [
109
+ [1, "Test "],
110
+ [2, "tEst "],
111
+ [3, "teSt"],
112
+ [4, "TEST"],
113
+ [5, "junk"]
114
+ ])
100
115
  end
101
116
 
102
- ds_1 = Linkage::Dataset.new(uri, "foo", :single_threaded => true)
103
- ds_2 = Linkage::Dataset.new(uri, "bar", :single_threaded => true)
117
+ options = database_options_for('mysql')
118
+ ds_1 = Linkage::Dataset.new(options, "foo", :logger => foo_logger)
119
+ ds_2 = Linkage::Dataset.new(options, "bar", :logger => bar_logger)
104
120
  tmpuri = @tmpuri
121
+ results_logger = nil #prefixed_logger("RESULTS")
105
122
  conf = ds_1.link_with(ds_2) do
106
- lhs[:one].must == rhs[:one]
107
- lhs[:two].must == rhs[:two]
108
- save_results_in(tmpuri)
123
+ lhs[:baz].must == rhs[:baz]
124
+ save_results_in(tmpuri, :logger => results_logger)
109
125
  end
110
126
 
111
127
  runner = Linkage::SingleThreadedRunner.new(conf)
112
128
  runner.execute
113
129
 
114
130
  database do |db|
115
- assert_equal 1, db[:groups].count
131
+ assert_equal 2, db[:groups].count
116
132
  end
117
133
  end
118
134
  end
@@ -62,5 +62,27 @@ module IntegrationTests
62
62
  assert_equal 1, db[:groups].count
63
63
  end
64
64
  end
65
+
66
+ test "binary function with static argument" do
67
+ database do |db|
68
+ db.create_table(:foo) { primary_key(:id); String(:bar) }
69
+ db[:foo].import([:id, :bar], [[1, 'foo'], [2, 'foo']])
70
+ end
71
+
72
+ ds = Linkage::Dataset.new(@tmpuri, "foo")
73
+ tmpuri = @tmpuri
74
+ conf = ds.link_with(ds) do
75
+ lhs[:bar].must == rhs[:bar]
76
+ binary(lhs[:bar]).must == binary('foo')
77
+ binary(rhs[:bar]).must == binary('foo')
78
+ save_results_in(tmpuri)
79
+ end
80
+ runner = Linkage::SingleThreadedRunner.new(conf)
81
+ runner.execute
82
+
83
+ database do |db|
84
+ assert_equal 1, db[:groups].count
85
+ end
86
+ end
65
87
  end
66
88
  end
@@ -0,0 +1,85 @@
1
+ require 'helper'
2
+
3
+ module IntegrationTests
4
+ class TestResultSet < Test::Unit::TestCase
5
+ test "#create_tables! creates original_groups table when decollation is needed" do
6
+ database_for('sqlite') do |db|
7
+ db.create_table!(:foo) { primary_key(:id); String(:foo) }
8
+ end
9
+
10
+ database_for('mysql') do |db|
11
+ db.create_table!(:foo) { primary_key(:id); String(:foo) }
12
+ end
13
+
14
+ dataset_1 = Linkage::Dataset.new(database_options_for('sqlite'), 'foo')
15
+ dataset_2 = Linkage::Dataset.new(database_options_for('mysql'), 'foo')
16
+ results_uri = database_options_for('sqlite')
17
+ conf = dataset_1.link_with(dataset_2) do
18
+ lhs[:foo].must == rhs[:foo]
19
+ save_results_in(results_uri)
20
+ end
21
+ conf.result_set.create_tables!
22
+ assert_include conf.result_set.database.tables, :original_groups
23
+ end
24
+
25
+ test "#create_tables! doesn't create original_groups table when decollation is needed" do
26
+ database_for('sqlite') do |db|
27
+ db.create_table!(:foo) { primary_key(:id); String(:foo) }
28
+ end
29
+
30
+ dataset = Linkage::Dataset.new(database_options_for('sqlite'), 'foo')
31
+ results_uri = database_options_for('sqlite')
32
+ conf = dataset.link_with(dataset) do
33
+ lhs[:foo].must == rhs[:foo]
34
+ save_results_in(results_uri)
35
+ end
36
+ conf.result_set.create_tables!
37
+ assert_not_include conf.result_set.database.tables, :original_groups
38
+ end
39
+
40
+ test "#create_tables! doesn't create groups table when not needed" do
41
+ database_for('sqlite') do |db|
42
+ db.create_table!(:foo) { primary_key(:id); Integer(:foo) }
43
+ end
44
+
45
+ dataset = Linkage::Dataset.new(database_options_for('sqlite'), 'foo')
46
+ results_uri = database_options_for('sqlite')
47
+ conf = dataset.link_with(dataset) do
48
+ lhs[:foo].must be_within(5).of(rhs[:foo])
49
+ save_results_in(results_uri)
50
+ end
51
+ conf.result_set.create_tables!
52
+ assert_not_include conf.result_set.database.tables, :groups
53
+ end
54
+
55
+ test "#create_tables! creates scores table when there are exhaustive expectations" do
56
+ database_for('sqlite') do |db|
57
+ db.create_table!(:foo) { primary_key(:id); Integer(:foo) }
58
+ end
59
+
60
+ dataset = Linkage::Dataset.new(database_options_for('sqlite'), 'foo')
61
+ results_uri = database_options_for('sqlite')
62
+ conf = dataset.link_with(dataset) do
63
+ lhs[:foo].must be_within(5).of(rhs[:foo])
64
+ save_results_in(results_uri)
65
+ end
66
+ conf.result_set.create_tables!
67
+ assert_include conf.result_set.database.tables, :scores
68
+ end
69
+
70
+ test "#create_tables! doesn't create scores table when not needed" do
71
+ database_for('sqlite') do |db|
72
+ db.create_table!(:foo) { primary_key(:id); Integer(:foo) }
73
+ end
74
+
75
+ dataset = Linkage::Dataset.new(database_options_for('sqlite'), 'foo')
76
+ results_uri = database_options_for('sqlite')
77
+ conf = dataset.link_with(dataset) do
78
+ lhs[:foo].must == rhs[:foo]
79
+ save_results_in(results_uri)
80
+ end
81
+ conf.result_set.create_tables!
82
+ assert_not_include conf.result_set.database.tables, :scores
83
+ end
84
+ end
85
+ end
@@ -0,0 +1,84 @@
1
+ require 'helper'
2
+
3
+ module IntegrationTests
4
+ class TestScoring < Test::Unit::TestCase
5
+ test "stop scoring if must expectation fails" do
6
+ database_for('sqlite') do |db|
7
+ db.create_table(:foo) { primary_key(:id); Integer(:num) }
8
+ db.create_table(:bar) { primary_key(:id); Integer(:num) }
9
+ db[:foo].import([:id, :num], [[1, 1]])
10
+ db[:bar].import([:id, :num], [[1, 5]])
11
+ end
12
+
13
+ db_opts = database_options_for('sqlite')
14
+ dataset_1 = Linkage::Dataset.new(db_opts, "foo")
15
+ dataset_2 = Linkage::Dataset.new(db_opts, "bar")
16
+ conf = dataset_1.link_with(dataset_2) do
17
+ lhs[:num].must_not be_within(5).of(rhs[:num])
18
+ lhs[:num].must be_within(5).of(rhs[:num])
19
+ save_results_in(db_opts)
20
+ end
21
+
22
+ runner = Linkage::SingleThreadedRunner.new(conf)
23
+ runner.execute
24
+
25
+ database_for('sqlite') do |db|
26
+ assert_equal db[:scores].count, 1
27
+ record = db[:scores].first
28
+ assert_equal 1, record[:score]
29
+ end
30
+ end
31
+
32
+ test "scoring phase adds matches as needed" do
33
+ database_for('sqlite') do |db|
34
+ db.create_table(:foo) { primary_key(:id); Integer(:num) }
35
+ db.create_table(:bar) { primary_key(:id); Integer(:num) }
36
+ db[:foo].import([:id, :num], (0..15).collect { |i| [i, i] })
37
+ db[:bar].import([:id, :num], (0..15).collect { |i| [i, i] })
38
+ end
39
+
40
+ db_opts = database_options_for('sqlite')
41
+ dataset_1 = Linkage::Dataset.new(db_opts, "foo")
42
+ dataset_2 = Linkage::Dataset.new(db_opts, "bar")
43
+ conf = dataset_1.link_with(dataset_2) do
44
+ lhs[:num].must be_within(10).of(rhs[:num])
45
+ lhs[:num].must_not be_within(5).of(rhs[:num])
46
+ save_results_in(db_opts)
47
+ end
48
+
49
+ runner = Linkage::SingleThreadedRunner.new(conf)
50
+ runner.execute
51
+
52
+ database_for('sqlite') do |db|
53
+ assert_equal 80, db[:matches].count
54
+ db[:matches].order(:record_1_id, :record_2_id).each do |row|
55
+ assert_equal 1, row[:total_score]
56
+ assert_include 6..10, (row[:record_1_id] - row[:record_2_id]).abs
57
+ end
58
+ end
59
+ end
60
+
61
+ test "optimize scoring for self linkage" do
62
+ database_for('sqlite') do |db|
63
+ db.create_table(:foo) { primary_key(:id); Integer(:num) }
64
+ db[:foo].import([:id, :num], [[1, 1], [2, 5], [3, 10]])
65
+ end
66
+
67
+ db_opts = database_options_for('sqlite')
68
+ dataset = Linkage::Dataset.new(db_opts, "foo")
69
+ conf = dataset.link_with(dataset) do
70
+ lhs[:num].must be_within(5).of(rhs[:num])
71
+ save_results_in(db_opts)
72
+ end
73
+
74
+ runner = Linkage::SingleThreadedRunner.new(conf)
75
+ runner.execute
76
+
77
+ database_for('sqlite') do |db|
78
+ assert_equal db[:scores].count, 3
79
+ scores = db[:scores].order(:record_1_id, :record_2_id).select_map(:score)
80
+ assert_equal [1, 0, 1], scores
81
+ end
82
+ end
83
+ end
84
+ end
@@ -42,6 +42,11 @@ module IntegrationTests
42
42
  dataset, _ = result_set.groups_records_datasets(group)
43
43
  assert_equal 10, dataset.count
44
44
  end
45
+
46
+ assert_equal 450, db[:matches].count
47
+ db[:matches].order(:record_1_id, :record_2_id).each do |row|
48
+ assert_equal row[:record_1_id] % 10, row[:record_2_id] % 10
49
+ end
45
50
  end
46
51
  end
47
52