linkage 0.0.8 → 0.1.0.pre
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +1 -0
- data/.yardopts +1 -0
- data/Gemfile +1 -19
- data/Gemfile-java +3 -0
- data/README.markdown +88 -34
- data/Rakefile +16 -15
- data/TODO +4 -0
- data/lib/linkage/comparator.rb +139 -144
- data/lib/linkage/comparators/compare.rb +236 -29
- data/lib/linkage/comparators/strcompare.rb +85 -0
- data/lib/linkage/comparators/within.rb +24 -20
- data/lib/linkage/configuration.rb +44 -466
- data/lib/linkage/dataset.rb +28 -127
- data/lib/linkage/exceptions.rb +5 -0
- data/lib/linkage/field.rb +6 -37
- data/lib/linkage/field_set.rb +3 -3
- data/lib/linkage/match_recorder.rb +22 -0
- data/lib/linkage/match_set.rb +34 -0
- data/lib/linkage/match_sets/csv.rb +39 -0
- data/lib/linkage/match_sets/database.rb +45 -0
- data/lib/linkage/matcher.rb +30 -0
- data/lib/linkage/result_set.rb +25 -110
- data/lib/linkage/result_sets/csv.rb +54 -0
- data/lib/linkage/result_sets/database.rb +42 -0
- data/lib/linkage/runner.rb +57 -16
- data/lib/linkage/score_recorder.rb +30 -0
- data/lib/linkage/score_set.rb +49 -0
- data/lib/linkage/score_sets/csv.rb +64 -0
- data/lib/linkage/score_sets/database.rb +77 -0
- data/lib/linkage/version.rb +1 -1
- data/lib/linkage.rb +14 -17
- data/linkage.gemspec +13 -1
- data/linkage.gemspec-java +32 -0
- data/test/helper.rb +30 -23
- data/test/integration/test_cross_linkage.rb +46 -25
- data/test/integration/test_database_result_set.rb +55 -0
- data/test/integration/test_dual_linkage.rb +19 -94
- data/test/integration/test_self_linkage.rb +100 -203
- data/test/integration/test_within_comparator.rb +24 -77
- data/test/unit/comparators/test_compare.rb +254 -50
- data/test/unit/comparators/test_strcompare.rb +45 -0
- data/test/unit/comparators/test_within.rb +14 -26
- data/test/unit/match_sets/test_csv.rb +78 -0
- data/test/unit/match_sets/test_database.rb +63 -0
- data/test/unit/result_sets/test_csv.rb +111 -0
- data/test/unit/result_sets/test_database.rb +68 -0
- data/test/unit/score_sets/test_csv.rb +151 -0
- data/test/unit/score_sets/test_database.rb +149 -0
- data/test/unit/test_comparator.rb +46 -83
- data/test/unit/test_comparators.rb +4 -0
- data/test/unit/test_configuration.rb +99 -145
- data/test/unit/test_dataset.rb +52 -73
- data/test/unit/test_field.rb +4 -55
- data/test/unit/test_field_set.rb +6 -6
- data/test/unit/test_match_recorder.rb +23 -0
- data/test/unit/test_match_set.rb +23 -0
- data/test/unit/test_match_sets.rb +4 -0
- data/test/unit/test_matcher.rb +44 -0
- data/test/unit/test_result_set.rb +24 -223
- data/test/unit/test_result_sets.rb +4 -0
- data/test/unit/test_runner.rb +122 -17
- data/test/unit/test_runners.rb +4 -0
- data/test/unit/test_score_recorder.rb +25 -0
- data/test/unit/test_score_set.rb +37 -0
- data/test/unit/test_score_sets.rb +4 -0
- metadata +183 -90
- data/Gemfile.lock +0 -92
- data/lib/linkage/comparators/binary.rb +0 -12
- data/lib/linkage/data.rb +0 -175
- data/lib/linkage/decollation.rb +0 -93
- data/lib/linkage/expectation.rb +0 -21
- data/lib/linkage/expectations/exhaustive.rb +0 -63
- data/lib/linkage/expectations/simple.rb +0 -168
- data/lib/linkage/function.rb +0 -148
- data/lib/linkage/functions/binary.rb +0 -30
- data/lib/linkage/functions/cast.rb +0 -54
- data/lib/linkage/functions/length.rb +0 -29
- data/lib/linkage/functions/strftime.rb +0 -33
- data/lib/linkage/functions/trim.rb +0 -30
- data/lib/linkage/group.rb +0 -55
- data/lib/linkage/meta_object.rb +0 -139
- data/lib/linkage/runner/single_threaded.rb +0 -187
- data/lib/linkage/utils.rb +0 -164
- data/lib/linkage/warnings.rb +0 -5
- data/test/integration/test_collation.rb +0 -45
- data/test/integration/test_configuration.rb +0 -268
- data/test/integration/test_dataset.rb +0 -116
- data/test/integration/test_functions.rb +0 -88
- data/test/integration/test_result_set.rb +0 -85
- data/test/integration/test_scoring.rb +0 -84
- data/test/unit/expectations/test_exhaustive.rb +0 -111
- data/test/unit/expectations/test_simple.rb +0 -303
- data/test/unit/functions/test_binary.rb +0 -54
- data/test/unit/functions/test_cast.rb +0 -98
- data/test/unit/functions/test_length.rb +0 -52
- data/test/unit/functions/test_strftime.rb +0 -60
- data/test/unit/functions/test_trim.rb +0 -43
- data/test/unit/runner/test_single_threaded.rb +0 -12
- data/test/unit/test_data.rb +0 -445
- data/test/unit/test_decollation.rb +0 -201
- data/test/unit/test_function.rb +0 -233
- data/test/unit/test_group.rb +0 -38
- data/test/unit/test_meta_object.rb +0 -208
- data/test/unit/test_utils.rb +0 -341
@@ -0,0 +1,55 @@
|
|
1
|
+
require 'helper'
|
2
|
+
|
3
|
+
class IntegrationTests::TestDatabaseResultSet < Test::Unit::TestCase
|
4
|
+
def setup
|
5
|
+
@dir = Dir.mktmpdir('linkage')
|
6
|
+
@data_uri = "sqlite://" + File.join(@dir, "foo")
|
7
|
+
@results_uri = "sqlite://" + File.join(@dir, "bar")
|
8
|
+
end
|
9
|
+
|
10
|
+
def teardown
|
11
|
+
FileUtils.remove_entry_secure(@dir)
|
12
|
+
end
|
13
|
+
|
14
|
+
def data_database(options = {}, &block)
|
15
|
+
Sequel.connect(@data_uri, options, &block)
|
16
|
+
end
|
17
|
+
|
18
|
+
def results_database(options = {}, &block)
|
19
|
+
Sequel.connect(@results_uri, options, &block)
|
20
|
+
end
|
21
|
+
|
22
|
+
test "using a database for storing results" do
|
23
|
+
data_database do |db|
|
24
|
+
db.create_table(:foo) { primary_key(:id); Integer(:foo); Integer(:bar) }
|
25
|
+
db[:foo].import([:id, :foo, :bar], Array.new(10) { |i| [i, i, i] })
|
26
|
+
end
|
27
|
+
|
28
|
+
ds = Linkage::Dataset.new(@data_uri, 'foo')
|
29
|
+
result_set = Linkage::ResultSet['database'].new(@results_uri)
|
30
|
+
conf = ds.link_with(ds, result_set) do |conf|
|
31
|
+
conf.compare([:foo], [:bar], :equal)
|
32
|
+
conf.algorithm = :mean
|
33
|
+
conf.threshold = 1
|
34
|
+
end
|
35
|
+
runner = Linkage::Runner.new(conf)
|
36
|
+
runner.execute
|
37
|
+
|
38
|
+
results_database do |db|
|
39
|
+
assert db.table_exists?(:scores)
|
40
|
+
assert_equal 10, db[:scores].count
|
41
|
+
db[:scores].order(:id_1, :id_2).each do |row|
|
42
|
+
assert_equal row[:id_1], row[:id_2]
|
43
|
+
assert_equal 1, row[:comparator_id]
|
44
|
+
assert_same 1.0, row[:score]
|
45
|
+
end
|
46
|
+
|
47
|
+
assert db.table_exists?(:matches)
|
48
|
+
assert_equal 10, db[:matches].count
|
49
|
+
db[:matches].order(:id_1, :id_2).each do |row|
|
50
|
+
assert_equal row[:id_1], row[:id_2]
|
51
|
+
assert_same 1.0, row[:score]
|
52
|
+
end
|
53
|
+
end
|
54
|
+
end
|
55
|
+
end
|
@@ -15,7 +15,7 @@ module IntegrationTests
|
|
15
15
|
FileUtils.remove_entry_secure(@tmpdir)
|
16
16
|
end
|
17
17
|
|
18
|
-
test "one
|
18
|
+
test "one-field equality on single threaded runner" do
|
19
19
|
# create the test data
|
20
20
|
database do |db|
|
21
21
|
db.create_table(:foo) { primary_key(:id); String(:ssn) }
|
@@ -27,108 +27,33 @@ module IntegrationTests
|
|
27
27
|
Array.new(100) { |i| [i, "12345678#{i%10}"] })
|
28
28
|
end
|
29
29
|
|
30
|
+
result_set = Linkage::ResultSet['csv'].new(@tmpdir)
|
30
31
|
ds_1 = Linkage::Dataset.new(@tmpuri, "foo", :single_threaded => true)
|
31
32
|
ds_2 = Linkage::Dataset.new(@tmpuri, "bar", :single_threaded => true)
|
32
33
|
tmpuri = @tmpuri
|
33
|
-
conf = ds_1.link_with(ds_2) do
|
34
|
-
|
35
|
-
|
34
|
+
conf = ds_1.link_with(ds_2, result_set) do |conf|
|
35
|
+
conf.compare([:ssn], [:ssn], :equal)
|
36
|
+
conf.algorithm = :mean
|
37
|
+
conf.threshold = 1.0
|
36
38
|
end
|
37
|
-
assert_equal :dual, conf.linkage_type
|
38
39
|
|
39
|
-
runner = Linkage::SingleThreadedRunner.new(conf)
|
40
|
+
runner = Linkage::Runner.new(conf)
|
40
41
|
runner.execute
|
41
42
|
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
assert_equal 1000, db[:matches].count
|
49
|
-
db[:matches].order(:record_1_id, :record_2_id).each do |row|
|
50
|
-
assert_equal row[:record_1_id] % 10, row[:record_2_id] % 10
|
51
|
-
end
|
43
|
+
score_csv = CSV.read(File.join(@tmpdir, 'scores.csv'), :headers => true)
|
44
|
+
assert_equal 1000, score_csv.length
|
45
|
+
score_csv.each do |row|
|
46
|
+
id_1 = row['id_1'].to_i
|
47
|
+
id_2 = row['id_2'].to_i
|
48
|
+
assert (id_1 % 10) == (id_2 % 10)
|
52
49
|
end
|
53
|
-
end
|
54
50
|
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
db.create_table(:bar) { primary_key(:id); String(:ssn) }
|
63
|
-
db[:bar].import([:id, :ssn],
|
64
|
-
Array.new(100) { |i| [i, "1234567%03d" % i] })
|
65
|
-
end
|
66
|
-
|
67
|
-
ds_1 = Linkage::Dataset.new(@tmpuri, "foo", :single_threaded => true)
|
68
|
-
ds_2 = Linkage::Dataset.new(@tmpuri, "bar", :single_threaded => true)
|
69
|
-
tmpuri = @tmpuri
|
70
|
-
conf = ds_1.link_with(ds_2) do
|
71
|
-
lhs[:ssn].must == rhs[:ssn]
|
72
|
-
save_results_in(tmpuri)
|
73
|
-
end
|
74
|
-
runner = Linkage::SingleThreadedRunner.new(conf)
|
75
|
-
runner.execute
|
76
|
-
|
77
|
-
database do |db|
|
78
|
-
assert_equal 100, db[:groups].count
|
79
|
-
db[:groups].order(:ssn).each_with_index do |row, i|
|
80
|
-
assert_equal "1234567%03d" % i, row[:ssn]
|
81
|
-
end
|
82
|
-
end
|
83
|
-
end
|
84
|
-
|
85
|
-
test "reacts properly when using two databases with different string equality methods" do
|
86
|
-
foo_logger = nil #prefixed_logger("FOO")
|
87
|
-
bar_logger = nil #prefixed_logger("BAR")
|
88
|
-
|
89
|
-
database_for('mysql', :logger => foo_logger) do |db|
|
90
|
-
db.create_table!(:foo) do
|
91
|
-
primary_key(:id)
|
92
|
-
String :baz, :collate => "latin1_swedish_ci"
|
93
|
-
end
|
94
|
-
db[:foo].import([:id, :baz], [
|
95
|
-
[1, "tEst"],
|
96
|
-
[2, "teSt"],
|
97
|
-
[3, "tesT "],
|
98
|
-
[4, "TEST"],
|
99
|
-
[5, "junk"]
|
100
|
-
])
|
101
|
-
end
|
102
|
-
|
103
|
-
database_for('mysql', :logger => bar_logger) do |db|
|
104
|
-
db.create_table!(:bar) do
|
105
|
-
primary_key(:id)
|
106
|
-
String :baz, :collate => "latin1_swedish_ci"
|
107
|
-
end
|
108
|
-
db[:bar].import([:id, :baz], [
|
109
|
-
[1, "Test "],
|
110
|
-
[2, "tEst "],
|
111
|
-
[3, "teSt"],
|
112
|
-
[4, "TEST"],
|
113
|
-
[5, "junk"]
|
114
|
-
])
|
115
|
-
end
|
116
|
-
|
117
|
-
options = database_options_for('mysql')
|
118
|
-
ds_1 = Linkage::Dataset.new(options, "foo", :logger => foo_logger)
|
119
|
-
ds_2 = Linkage::Dataset.new(options, "bar", :logger => bar_logger)
|
120
|
-
tmpuri = @tmpuri
|
121
|
-
results_logger = nil #prefixed_logger("RESULTS")
|
122
|
-
conf = ds_1.link_with(ds_2) do
|
123
|
-
lhs[:baz].must == rhs[:baz]
|
124
|
-
save_results_in(tmpuri, :logger => results_logger)
|
125
|
-
end
|
126
|
-
|
127
|
-
runner = Linkage::SingleThreadedRunner.new(conf)
|
128
|
-
runner.execute
|
129
|
-
|
130
|
-
database do |db|
|
131
|
-
assert_equal 2, db[:groups].count
|
51
|
+
match_csv = CSV.read(File.join(@tmpdir, 'matches.csv'), :headers => true)
|
52
|
+
assert_equal 1000, match_csv.length
|
53
|
+
match_csv.each do |row|
|
54
|
+
id_1 = row['id_1'].to_i
|
55
|
+
id_2 = row['id_2'].to_i
|
56
|
+
assert (id_1 % 10) == (id_2 % 10)
|
132
57
|
end
|
133
58
|
end
|
134
59
|
end
|
@@ -1,224 +1,121 @@
|
|
1
1
|
require 'helper'
|
2
2
|
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
3
|
+
class IntegrationTests::TestSelfLinkage < Test::Unit::TestCase
|
4
|
+
def setup
|
5
|
+
@tmpdir = Dir.mktmpdir('linkage')
|
6
|
+
@tmpuri = "sqlite://" + File.join(@tmpdir, "foo")
|
7
|
+
end
|
8
|
+
|
9
|
+
def database(&block)
|
10
|
+
Sequel.connect(@tmpuri, &block)
|
11
|
+
end
|
12
|
+
|
13
|
+
def teardown
|
14
|
+
FileUtils.remove_entry_secure(@tmpdir)
|
15
|
+
end
|
16
|
+
|
17
|
+
test "one-field equality" do
|
18
|
+
# insert the test data
|
19
|
+
database do |db|
|
20
|
+
db.create_table(:foo) { primary_key(:id); String(:ssn) }
|
21
|
+
db[:foo].import([:id, :ssn],
|
22
|
+
Array.new(100) { |i| [i, "12345678#{i%10}"] })
|
8
23
|
end
|
9
24
|
|
10
|
-
|
11
|
-
|
25
|
+
result_set = Linkage::ResultSet['csv'].new(@tmpdir)
|
26
|
+
dataset = Linkage::Dataset.new(@tmpuri, "foo", :single_threaded => true)
|
27
|
+
conf = dataset.link_with(dataset, result_set) do |conf|
|
28
|
+
conf.compare([:ssn], [:ssn], :equal)
|
29
|
+
conf.algorithm = :mean
|
30
|
+
conf.threshold = 1.0
|
12
31
|
end
|
13
32
|
|
14
|
-
|
15
|
-
|
33
|
+
runner = Linkage::Runner.new(conf)
|
34
|
+
runner.execute
|
35
|
+
|
36
|
+
score_csv = CSV.read(File.join(@tmpdir, 'scores.csv'), :headers => true)
|
37
|
+
assert_equal 450, score_csv.length
|
38
|
+
score_csv.each do |row|
|
39
|
+
assert_equal row['id_1'].to_i % 10, row['id_2'].to_i % 10
|
16
40
|
end
|
17
41
|
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
db[:foo].import([:id, :ssn],
|
23
|
-
Array.new(100) { |i| [i, "12345678#{i%10}"] })
|
24
|
-
end
|
25
|
-
|
26
|
-
ds = Linkage::Dataset.new(@tmpuri, "foo", :single_threaded => true)
|
27
|
-
tmpuri = @tmpuri
|
28
|
-
conf = ds.link_with(ds) do
|
29
|
-
lhs[:ssn].must == rhs[:ssn]
|
30
|
-
save_results_in(tmpuri)
|
31
|
-
end
|
32
|
-
runner = Linkage::SingleThreadedRunner.new(conf)
|
33
|
-
result_set = runner.execute
|
34
|
-
assert_kind_of Linkage::ResultSet, result_set
|
35
|
-
|
36
|
-
database do |db|
|
37
|
-
assert_equal 10, db[:groups].count
|
38
|
-
db[:groups].order(:ssn).each_with_index do |row, i|
|
39
|
-
assert_equal "12345678#{i%10}", row[:ssn]
|
40
|
-
|
41
|
-
group = Linkage::Group.from_row(row)
|
42
|
-
dataset, _ = result_set.groups_records_datasets(group)
|
43
|
-
assert_equal 10, dataset.count
|
44
|
-
end
|
45
|
-
|
46
|
-
assert_equal 450, db[:matches].count
|
47
|
-
db[:matches].order(:record_1_id, :record_2_id).each do |row|
|
48
|
-
assert_equal row[:record_1_id] % 10, row[:record_2_id] % 10
|
49
|
-
end
|
50
|
-
end
|
42
|
+
match_csv = CSV.read(File.join(@tmpdir, 'matches.csv'), :headers => true)
|
43
|
+
assert_equal 450, match_csv.length
|
44
|
+
match_csv.each do |row|
|
45
|
+
assert_equal row['id_1'].to_i % 10, row['id_2'].to_i % 10
|
51
46
|
end
|
47
|
+
end
|
52
48
|
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
end
|
60
|
-
|
61
|
-
ds = Linkage::Dataset.new(@tmpuri, "foo", :single_threaded => true)
|
62
|
-
tmpuri = @tmpuri
|
63
|
-
conf = ds.link_with(ds) do
|
64
|
-
lhs[:ssn].must == rhs[:ssn]
|
65
|
-
lhs[:dob].must == rhs[:dob]
|
66
|
-
save_results_in(tmpuri)
|
67
|
-
end
|
68
|
-
runner = Linkage::SingleThreadedRunner.new(conf)
|
69
|
-
runner.execute
|
70
|
-
|
71
|
-
database do |db|
|
72
|
-
assert_equal 20, db[:groups].count
|
73
|
-
db[:groups].order(:ssn).each_with_index do |row, i|
|
74
|
-
assert_equal "12345678#{i/2}", row[:ssn]
|
75
|
-
assert_equal Date.civil(1985, 1, i / 2 + 1 + (i % 2 == 0 ? 0 : 10)), row[:dob]
|
76
|
-
end
|
77
|
-
|
78
|
-
#assert_equal 100, db[:groups_records].count
|
79
|
-
#expected_group_id = nil
|
80
|
-
#db[:groups_records].order(:record_id).each do |row|
|
81
|
-
#v = row[:record_id] % 20
|
82
|
-
#expected_group_id = v < 10 ? 1 + 2 * v : 2 * (v % 10 + 1)
|
83
|
-
#assert_equal expected_group_id, row[:group_id], "Record #{row[:record_id]} should have been in group #{expected_group_id}"
|
84
|
-
#end
|
85
|
-
end
|
49
|
+
test "two-field equality" do
|
50
|
+
# insert the test data
|
51
|
+
database do |db|
|
52
|
+
db.create_table(:foo) { primary_key(:id); String(:ssn); Date(:dob) }
|
53
|
+
db[:foo].import([:id, :ssn, :dob],
|
54
|
+
Array.new(100) { |i| [i, "12345678#{i%10}", Date.civil(1985, 1, (i % 20) + 1)] })
|
86
55
|
end
|
87
56
|
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
db[:foo].import([:id, :ssn, :mod_5],
|
93
|
-
Array.new(100) { |i| [i, "12345678#{i%10}", i % 5] })
|
94
|
-
end
|
95
|
-
|
96
|
-
ds = Linkage::Dataset.new(@tmpuri, "foo", :single_threaded => true)
|
97
|
-
tmpuri = @tmpuri
|
98
|
-
conf = ds.link_with(ds) do
|
99
|
-
lhs[:ssn].must == rhs[:ssn]
|
100
|
-
lhs[:mod_5].must == 3
|
101
|
-
save_results_in(tmpuri)
|
102
|
-
end
|
103
|
-
runner = Linkage::SingleThreadedRunner.new(conf)
|
104
|
-
runner.execute
|
105
|
-
|
106
|
-
database do |db|
|
107
|
-
assert_equal 2, db[:groups].count
|
108
|
-
db[:groups].order(:ssn).each_with_index do |row, i|
|
109
|
-
assert_equal "12345678#{(i * 5) + 3}", row[:ssn]
|
110
|
-
end
|
111
|
-
|
112
|
-
#assert_equal 20, db[:groups_records].count
|
113
|
-
#expected_group_id = nil
|
114
|
-
#db[:groups_records].order(:record_id).each do |row|
|
115
|
-
#expected_group_id = (row[:record_id] / 5) % 2 + 1
|
116
|
-
#assert_equal expected_group_id, row[:group_id], "Record #{row[:record_id]} should have been in group #{expected_group_id}"
|
117
|
-
#end
|
118
|
-
end
|
57
|
+
result_set = Linkage::ResultSet['csv'].new(@tmpdir)
|
58
|
+
dataset = Linkage::Dataset.new(@tmpuri, "foo", :single_threaded => true)
|
59
|
+
conf = dataset.link_with(dataset, result_set) do |conf|
|
60
|
+
conf.compare([:ssn, :dob], [:ssn, :dob], :equal)
|
119
61
|
end
|
120
62
|
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
conf = ds.link_with(ds) do
|
132
|
-
lhs[:ssn].must == rhs[:ssn]
|
133
|
-
lhs[:mod_5].must > 2
|
134
|
-
lhs[:mod_5].must <= 3
|
135
|
-
save_results_in(tmpuri)
|
136
|
-
end
|
137
|
-
runner = Linkage::SingleThreadedRunner.new(conf)
|
138
|
-
runner.execute
|
139
|
-
|
140
|
-
database do |db|
|
141
|
-
assert_equal 2, db[:groups].count
|
142
|
-
db[:groups].order(:ssn).each_with_index do |row, i|
|
143
|
-
assert_equal "12345678#{(i * 5) + 3}", row[:ssn]
|
144
|
-
end
|
145
|
-
|
146
|
-
#assert_equal 20, db[:groups_records].count
|
147
|
-
#expected_group_id = nil
|
148
|
-
#db[:groups_records].order(:record_id).each do |row|
|
149
|
-
#expected_group_id = (row[:record_id] / 5) % 2 + 1
|
150
|
-
#assert_equal expected_group_id, row[:group_id], "Record #{row[:record_id]} should have been in group #{expected_group_id}"
|
151
|
-
#end
|
152
|
-
end
|
63
|
+
runner = Linkage::Runner.new(conf)
|
64
|
+
runner.execute
|
65
|
+
|
66
|
+
score_csv = CSV.read(File.join(@tmpdir, 'scores.csv'), :headers => true)
|
67
|
+
assert_equal 200, score_csv.length
|
68
|
+
score_csv.each do |row|
|
69
|
+
id_1 = row['id_1'].to_i
|
70
|
+
id_2 = row['id_2'].to_i
|
71
|
+
assert id_1 % 10 == id_2 % 10
|
72
|
+
assert id_1 % 20 == id_2 % 20
|
153
73
|
end
|
154
74
|
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
75
|
+
match_csv = CSV.read(File.join(@tmpdir, 'matches.csv'), :headers => true)
|
76
|
+
assert_equal 200, match_csv.length
|
77
|
+
match_csv.each do |row|
|
78
|
+
id_1 = row['id_1'].to_i
|
79
|
+
id_2 = row['id_2'].to_i
|
80
|
+
assert id_1 % 10 == id_2 % 10
|
81
|
+
assert id_1 % 20 == id_2 % 20
|
82
|
+
end
|
83
|
+
end
|
84
|
+
|
85
|
+
test "one-field equality with blocking" do
|
86
|
+
# insert the test data
|
87
|
+
database do |db|
|
88
|
+
db.create_table(:foo) { primary_key(:id); String(:ssn); Integer(:mod_5) }
|
89
|
+
db[:foo].import([:id, :ssn, :mod_5],
|
90
|
+
Array.new(100) { |i| [i, "12345678#{i%10}", i % 5] })
|
91
|
+
end
|
92
|
+
|
93
|
+
result_set = Linkage::ResultSet['csv'].new(@tmpdir)
|
94
|
+
dataset = Linkage::Dataset.new(@tmpuri, "foo", :single_threaded => true)
|
95
|
+
dataset = dataset.filter(:mod_5 => 3)
|
96
|
+
conf = dataset.link_with(dataset, result_set) do |conf|
|
97
|
+
conf.compare([:ssn], [:ssn], :equal)
|
98
|
+
end
|
99
|
+
|
100
|
+
runner = Linkage::Runner.new(conf)
|
101
|
+
runner.execute
|
102
|
+
|
103
|
+
score_csv = CSV.read(File.join(@tmpdir, 'scores.csv'), :headers => true)
|
104
|
+
assert_equal 90, score_csv.length
|
105
|
+
score_csv.each do |row|
|
106
|
+
id_1 = row['id_1'].to_i
|
107
|
+
id_2 = row['id_2'].to_i
|
108
|
+
assert id_1 % 10 == id_2 % 10
|
109
|
+
assert id_1 % 5 == id_2 % 5
|
187
110
|
end
|
188
111
|
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
ds = Linkage::Dataset.new(@tmpuri, "foo", :single_threaded => true)
|
198
|
-
tmpuri = @tmpuri
|
199
|
-
conf = ds.link_with(ds) do
|
200
|
-
lhs[:ssn].must == rhs[:ssn]
|
201
|
-
lhs[:mod_5].must == lhs[:mod_20]
|
202
|
-
rhs[:mod_5].must == rhs[:mod_20]
|
203
|
-
save_results_in(tmpuri)
|
204
|
-
end
|
205
|
-
assert_equal :self, conf.linkage_type
|
206
|
-
runner = Linkage::SingleThreadedRunner.new(conf)
|
207
|
-
runner.execute
|
208
|
-
|
209
|
-
database do |db|
|
210
|
-
assert_equal 5, db[:groups].count
|
211
|
-
db[:groups].order(:ssn).each_with_index do |row, i|
|
212
|
-
assert_equal "123456789#{i}", row[:ssn]
|
213
|
-
end
|
214
|
-
|
215
|
-
#assert_equal 25, db[:groups_records].count
|
216
|
-
#expected_group_id = nil
|
217
|
-
#db[:groups_records].order(:record_id).each do |row|
|
218
|
-
#expected_group_id = row[:record_id] % 5 + 1
|
219
|
-
#assert_equal expected_group_id, row[:group_id], "Record #{row[:record_id]} should have been in group #{expected_group_id}"
|
220
|
-
#end
|
221
|
-
end
|
112
|
+
match_csv = CSV.read(File.join(@tmpdir, 'matches.csv'), :headers => true)
|
113
|
+
assert_equal 90, match_csv.length
|
114
|
+
match_csv.each do |row|
|
115
|
+
id_1 = row['id_1'].to_i
|
116
|
+
id_2 = row['id_2'].to_i
|
117
|
+
assert id_1 % 10 == id_2 % 10
|
118
|
+
assert id_1 % 5 == id_2 % 5
|
222
119
|
end
|
223
120
|
end
|
224
121
|
end
|
@@ -2,97 +2,44 @@ require 'helper'
|
|
2
2
|
|
3
3
|
module IntegrationTests
|
4
4
|
class TestWithinComparator < Test::Unit::TestCase
|
5
|
-
|
6
|
-
|
7
|
-
db.create_table(:foo) { primary_key(:id); Integer(:num) }
|
8
|
-
db.create_table(:bar) { primary_key(:id); Integer(:num) }
|
9
|
-
db[:foo].import([:id, :num], (1..10).collect { |i| [i, i] })
|
10
|
-
db[:bar].import([:id, :num], (1..10).collect { |i| [i, i] })
|
11
|
-
end
|
12
|
-
|
13
|
-
db_opts = database_options_for('sqlite')
|
14
|
-
dataset_1 = Linkage::Dataset.new(db_opts, "foo")
|
15
|
-
dataset_2 = Linkage::Dataset.new(db_opts, "bar")
|
16
|
-
conf = dataset_1.link_with(dataset_2) do
|
17
|
-
lhs[:num].must be_within(5).of(rhs[:num])
|
18
|
-
save_results_in(db_opts)
|
19
|
-
end
|
20
|
-
|
21
|
-
runner = Linkage::SingleThreadedRunner.new(conf)
|
22
|
-
runner.execute
|
23
|
-
|
24
|
-
database_for('sqlite') do |db|
|
25
|
-
assert_equal db[:scores].count, 100
|
26
|
-
db[:scores].order(:record_1_id, :record_2_id).each do |score|
|
27
|
-
if (score[:record_2_id] - score[:record_1_id]).abs <= 5
|
28
|
-
assert_equal 1, score[:score], score.inspect
|
29
|
-
else
|
30
|
-
assert_equal 0, score[:score], score.inspect
|
31
|
-
end
|
32
|
-
end
|
33
|
-
end
|
5
|
+
def setup
|
6
|
+
@tmpdir = Dir.mktmpdir('linkage')
|
34
7
|
end
|
35
8
|
|
36
|
-
|
37
|
-
|
38
|
-
db.create_table(:foo) { primary_key(:id); Integer(:num); String(:parity) }
|
39
|
-
db.create_table(:bar) { primary_key(:id); Integer(:num); String(:parity) }
|
40
|
-
db[:foo].import([:id, :num, :parity], (1..10).collect { |i| [i, i, i % 2 == 0 ? "even" : "odd"] })
|
41
|
-
db[:bar].import([:id, :num, :parity], (1..10).collect { |i| [i, i, i % 2 == 0 ? "even" : "odd"] })
|
42
|
-
end
|
43
|
-
|
44
|
-
db_opts = database_options_for('sqlite')
|
45
|
-
dataset_1 = Linkage::Dataset.new(db_opts, "foo")
|
46
|
-
dataset_2 = Linkage::Dataset.new(db_opts, "bar")
|
47
|
-
conf = dataset_1.link_with(dataset_2) do
|
48
|
-
lhs[:parity].must == rhs[:parity]
|
49
|
-
lhs[:num].must be_within(5).of(rhs[:num])
|
50
|
-
save_results_in(db_opts)
|
51
|
-
end
|
52
|
-
|
53
|
-
runner = Linkage::SingleThreadedRunner.new(conf)
|
54
|
-
runner.execute
|
55
|
-
|
56
|
-
database_for('sqlite') do |db|
|
57
|
-
assert_equal db[:scores].count, 50
|
58
|
-
db[:scores].order(:record_1_id, :record_2_id).each do |score|
|
59
|
-
if (score[:record_2_id] - score[:record_1_id]).abs <= 5
|
60
|
-
assert_equal 1, score[:score]
|
61
|
-
else
|
62
|
-
assert_equal 0, score[:score]
|
63
|
-
end
|
64
|
-
end
|
65
|
-
end
|
9
|
+
def teardown
|
10
|
+
FileUtils.remove_entry_secure(@tmpdir)
|
66
11
|
end
|
67
12
|
|
68
|
-
test "within comparator
|
13
|
+
test "within comparator" do
|
69
14
|
database_for('sqlite') do |db|
|
70
|
-
db.create_table(:foo) { primary_key(:id); Integer(:num)
|
71
|
-
db.create_table(:bar) { primary_key(:id); Integer(:num)
|
72
|
-
db[:foo].import([:id, :num
|
73
|
-
db[:bar].import([:id, :num
|
15
|
+
db.create_table(:foo) { primary_key(:id); Integer(:num) }
|
16
|
+
db.create_table(:bar) { primary_key(:id); Integer(:num) }
|
17
|
+
db[:foo].import([:id, :num], (1..10).collect { |i| [i, i] })
|
18
|
+
db[:bar].import([:id, :num], (1..10).collect { |i| [i, i] })
|
74
19
|
end
|
75
20
|
|
21
|
+
result_set = Linkage::ResultSet['csv'].new(@tmpdir)
|
76
22
|
db_opts = database_options_for('sqlite')
|
77
23
|
dataset_1 = Linkage::Dataset.new(db_opts, "foo")
|
78
24
|
dataset_2 = Linkage::Dataset.new(db_opts, "bar")
|
79
|
-
conf = dataset_1.link_with(dataset_2) do
|
80
|
-
|
81
|
-
cast(lhs[:num], 'integer').must be_within(5).of(cast(rhs[:num], 'integer'))
|
82
|
-
save_results_in(db_opts)
|
25
|
+
conf = dataset_1.link_with(dataset_2, result_set) do |conf|
|
26
|
+
conf.within(:num, :num, 5)
|
83
27
|
end
|
84
28
|
|
85
|
-
runner = Linkage::SingleThreadedRunner.new(conf)
|
29
|
+
runner = Linkage::Runner.new(conf)
|
86
30
|
runner.execute
|
87
31
|
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
32
|
+
score_csv = CSV.read(File.join(@tmpdir, 'scores.csv'), :headers => true)
|
33
|
+
assert_equal 100, score_csv.length
|
34
|
+
score_csv.each do |row|
|
35
|
+
assert_equal "1", row['comparator_id']
|
36
|
+
# ids same as values
|
37
|
+
id_1 = row['id_1'].to_i
|
38
|
+
id_2 = row['id_2'].to_i
|
39
|
+
if (id_2 - id_1).abs <= 5
|
40
|
+
assert_equal 1, row['score'].to_i, row
|
41
|
+
else
|
42
|
+
assert_equal 0, row['score'].to_i
|
96
43
|
end
|
97
44
|
end
|
98
45
|
end
|