linkage 0.0.8 → 0.1.0.pre

Sign up to get free protection for your applications and to get access to all the features.
Files changed (105)
  1. checksums.yaml +7 -0
  2. data/.gitignore +1 -0
  3. data/.yardopts +1 -0
  4. data/Gemfile +1 -19
  5. data/Gemfile-java +3 -0
  6. data/README.markdown +88 -34
  7. data/Rakefile +16 -15
  8. data/TODO +4 -0
  9. data/lib/linkage/comparator.rb +139 -144
  10. data/lib/linkage/comparators/compare.rb +236 -29
  11. data/lib/linkage/comparators/strcompare.rb +85 -0
  12. data/lib/linkage/comparators/within.rb +24 -20
  13. data/lib/linkage/configuration.rb +44 -466
  14. data/lib/linkage/dataset.rb +28 -127
  15. data/lib/linkage/exceptions.rb +5 -0
  16. data/lib/linkage/field.rb +6 -37
  17. data/lib/linkage/field_set.rb +3 -3
  18. data/lib/linkage/match_recorder.rb +22 -0
  19. data/lib/linkage/match_set.rb +34 -0
  20. data/lib/linkage/match_sets/csv.rb +39 -0
  21. data/lib/linkage/match_sets/database.rb +45 -0
  22. data/lib/linkage/matcher.rb +30 -0
  23. data/lib/linkage/result_set.rb +25 -110
  24. data/lib/linkage/result_sets/csv.rb +54 -0
  25. data/lib/linkage/result_sets/database.rb +42 -0
  26. data/lib/linkage/runner.rb +57 -16
  27. data/lib/linkage/score_recorder.rb +30 -0
  28. data/lib/linkage/score_set.rb +49 -0
  29. data/lib/linkage/score_sets/csv.rb +64 -0
  30. data/lib/linkage/score_sets/database.rb +77 -0
  31. data/lib/linkage/version.rb +1 -1
  32. data/lib/linkage.rb +14 -17
  33. data/linkage.gemspec +13 -1
  34. data/linkage.gemspec-java +32 -0
  35. data/test/helper.rb +30 -23
  36. data/test/integration/test_cross_linkage.rb +46 -25
  37. data/test/integration/test_database_result_set.rb +55 -0
  38. data/test/integration/test_dual_linkage.rb +19 -94
  39. data/test/integration/test_self_linkage.rb +100 -203
  40. data/test/integration/test_within_comparator.rb +24 -77
  41. data/test/unit/comparators/test_compare.rb +254 -50
  42. data/test/unit/comparators/test_strcompare.rb +45 -0
  43. data/test/unit/comparators/test_within.rb +14 -26
  44. data/test/unit/match_sets/test_csv.rb +78 -0
  45. data/test/unit/match_sets/test_database.rb +63 -0
  46. data/test/unit/result_sets/test_csv.rb +111 -0
  47. data/test/unit/result_sets/test_database.rb +68 -0
  48. data/test/unit/score_sets/test_csv.rb +151 -0
  49. data/test/unit/score_sets/test_database.rb +149 -0
  50. data/test/unit/test_comparator.rb +46 -83
  51. data/test/unit/test_comparators.rb +4 -0
  52. data/test/unit/test_configuration.rb +99 -145
  53. data/test/unit/test_dataset.rb +52 -73
  54. data/test/unit/test_field.rb +4 -55
  55. data/test/unit/test_field_set.rb +6 -6
  56. data/test/unit/test_match_recorder.rb +23 -0
  57. data/test/unit/test_match_set.rb +23 -0
  58. data/test/unit/test_match_sets.rb +4 -0
  59. data/test/unit/test_matcher.rb +44 -0
  60. data/test/unit/test_result_set.rb +24 -223
  61. data/test/unit/test_result_sets.rb +4 -0
  62. data/test/unit/test_runner.rb +122 -17
  63. data/test/unit/test_runners.rb +4 -0
  64. data/test/unit/test_score_recorder.rb +25 -0
  65. data/test/unit/test_score_set.rb +37 -0
  66. data/test/unit/test_score_sets.rb +4 -0
  67. metadata +183 -90
  68. data/Gemfile.lock +0 -92
  69. data/lib/linkage/comparators/binary.rb +0 -12
  70. data/lib/linkage/data.rb +0 -175
  71. data/lib/linkage/decollation.rb +0 -93
  72. data/lib/linkage/expectation.rb +0 -21
  73. data/lib/linkage/expectations/exhaustive.rb +0 -63
  74. data/lib/linkage/expectations/simple.rb +0 -168
  75. data/lib/linkage/function.rb +0 -148
  76. data/lib/linkage/functions/binary.rb +0 -30
  77. data/lib/linkage/functions/cast.rb +0 -54
  78. data/lib/linkage/functions/length.rb +0 -29
  79. data/lib/linkage/functions/strftime.rb +0 -33
  80. data/lib/linkage/functions/trim.rb +0 -30
  81. data/lib/linkage/group.rb +0 -55
  82. data/lib/linkage/meta_object.rb +0 -139
  83. data/lib/linkage/runner/single_threaded.rb +0 -187
  84. data/lib/linkage/utils.rb +0 -164
  85. data/lib/linkage/warnings.rb +0 -5
  86. data/test/integration/test_collation.rb +0 -45
  87. data/test/integration/test_configuration.rb +0 -268
  88. data/test/integration/test_dataset.rb +0 -116
  89. data/test/integration/test_functions.rb +0 -88
  90. data/test/integration/test_result_set.rb +0 -85
  91. data/test/integration/test_scoring.rb +0 -84
  92. data/test/unit/expectations/test_exhaustive.rb +0 -111
  93. data/test/unit/expectations/test_simple.rb +0 -303
  94. data/test/unit/functions/test_binary.rb +0 -54
  95. data/test/unit/functions/test_cast.rb +0 -98
  96. data/test/unit/functions/test_length.rb +0 -52
  97. data/test/unit/functions/test_strftime.rb +0 -60
  98. data/test/unit/functions/test_trim.rb +0 -43
  99. data/test/unit/runner/test_single_threaded.rb +0 -12
  100. data/test/unit/test_data.rb +0 -445
  101. data/test/unit/test_decollation.rb +0 -201
  102. data/test/unit/test_function.rb +0 -233
  103. data/test/unit/test_group.rb +0 -38
  104. data/test/unit/test_meta_object.rb +0 -208
  105. data/test/unit/test_utils.rb +0 -341
@@ -0,0 +1,55 @@
1
+ require 'helper'
2
+
3
+ class IntegrationTests::TestDatabaseResultSet < Test::Unit::TestCase
4
+ def setup
5
+ @dir = Dir.mktmpdir('linkage')
6
+ @data_uri = "sqlite://" + File.join(@dir, "foo")
7
+ @results_uri = "sqlite://" + File.join(@dir, "bar")
8
+ end
9
+
10
+ def teardown
11
+ FileUtils.remove_entry_secure(@dir)
12
+ end
13
+
14
+ def data_database(options = {}, &block)
15
+ Sequel.connect(@data_uri, options, &block)
16
+ end
17
+
18
+ def results_database(options = {}, &block)
19
+ Sequel.connect(@results_uri, options, &block)
20
+ end
21
+
22
+ test "using a database for storing results" do
23
+ data_database do |db|
24
+ db.create_table(:foo) { primary_key(:id); Integer(:foo); Integer(:bar) }
25
+ db[:foo].import([:id, :foo, :bar], Array.new(10) { |i| [i, i, i] })
26
+ end
27
+
28
+ ds = Linkage::Dataset.new(@data_uri, 'foo')
29
+ result_set = Linkage::ResultSet['database'].new(@results_uri)
30
+ conf = ds.link_with(ds, result_set) do |conf|
31
+ conf.compare([:foo], [:bar], :equal)
32
+ conf.algorithm = :mean
33
+ conf.threshold = 1
34
+ end
35
+ runner = Linkage::Runner.new(conf)
36
+ runner.execute
37
+
38
+ results_database do |db|
39
+ assert db.table_exists?(:scores)
40
+ assert_equal 10, db[:scores].count
41
+ db[:scores].order(:id_1, :id_2).each do |row|
42
+ assert_equal row[:id_1], row[:id_2]
43
+ assert_equal 1, row[:comparator_id]
44
+ assert_same 1.0, row[:score]
45
+ end
46
+
47
+ assert db.table_exists?(:matches)
48
+ assert_equal 10, db[:matches].count
49
+ db[:matches].order(:id_1, :id_2).each do |row|
50
+ assert_equal row[:id_1], row[:id_2]
51
+ assert_same 1.0, row[:score]
52
+ end
53
+ end
54
+ end
55
+ end
@@ -15,7 +15,7 @@ module IntegrationTests
15
15
  FileUtils.remove_entry_secure(@tmpdir)
16
16
  end
17
17
 
18
- test "one mandatory field equality on single threaded runner" do
18
+ test "one-field equality on single threaded runner" do
19
19
  # create the test data
20
20
  database do |db|
21
21
  db.create_table(:foo) { primary_key(:id); String(:ssn) }
@@ -27,108 +27,33 @@ module IntegrationTests
27
27
  Array.new(100) { |i| [i, "12345678#{i%10}"] })
28
28
  end
29
29
 
30
+ result_set = Linkage::ResultSet['csv'].new(@tmpdir)
30
31
  ds_1 = Linkage::Dataset.new(@tmpuri, "foo", :single_threaded => true)
31
32
  ds_2 = Linkage::Dataset.new(@tmpuri, "bar", :single_threaded => true)
32
33
  tmpuri = @tmpuri
33
- conf = ds_1.link_with(ds_2) do
34
- lhs[:ssn].must == rhs[:ssn]
35
- save_results_in(tmpuri)
34
+ conf = ds_1.link_with(ds_2, result_set) do |conf|
35
+ conf.compare([:ssn], [:ssn], :equal)
36
+ conf.algorithm = :mean
37
+ conf.threshold = 1.0
36
38
  end
37
- assert_equal :dual, conf.linkage_type
38
39
 
39
- runner = Linkage::SingleThreadedRunner.new(conf)
40
+ runner = Linkage::Runner.new(conf)
40
41
  runner.execute
41
42
 
42
- database do |db|
43
- assert_equal 10, db[:groups].count
44
- db[:groups].order(:ssn).each_with_index do |row, i|
45
- assert_equal "12345678#{i%10}", row[:ssn]
46
- end
47
-
48
- assert_equal 1000, db[:matches].count
49
- db[:matches].order(:record_1_id, :record_2_id).each do |row|
50
- assert_equal row[:record_1_id] % 10, row[:record_2_id] % 10
51
- end
43
+ score_csv = CSV.read(File.join(@tmpdir, 'scores.csv'), :headers => true)
44
+ assert_equal 1000, score_csv.length
45
+ score_csv.each do |row|
46
+ id_1 = row['id_1'].to_i
47
+ id_2 = row['id_2'].to_i
48
+ assert (id_1 % 10) == (id_2 % 10)
52
49
  end
53
- end
54
50
 
55
- test "don't ignore 1-record groups before the combining phase" do
56
- # create the test data
57
- database do |db|
58
- db.create_table(:foo) { primary_key(:id); String(:ssn) }
59
- db[:foo].import([:id, :ssn],
60
- Array.new(100) { |i| [i, "1234567%03d" % i] })
61
-
62
- db.create_table(:bar) { primary_key(:id); String(:ssn) }
63
- db[:bar].import([:id, :ssn],
64
- Array.new(100) { |i| [i, "1234567%03d" % i] })
65
- end
66
-
67
- ds_1 = Linkage::Dataset.new(@tmpuri, "foo", :single_threaded => true)
68
- ds_2 = Linkage::Dataset.new(@tmpuri, "bar", :single_threaded => true)
69
- tmpuri = @tmpuri
70
- conf = ds_1.link_with(ds_2) do
71
- lhs[:ssn].must == rhs[:ssn]
72
- save_results_in(tmpuri)
73
- end
74
- runner = Linkage::SingleThreadedRunner.new(conf)
75
- runner.execute
76
-
77
- database do |db|
78
- assert_equal 100, db[:groups].count
79
- db[:groups].order(:ssn).each_with_index do |row, i|
80
- assert_equal "1234567%03d" % i, row[:ssn]
81
- end
82
- end
83
- end
84
-
85
- test "reacts properly when using two databases with different string equality methods" do
86
- foo_logger = nil #prefixed_logger("FOO")
87
- bar_logger = nil #prefixed_logger("BAR")
88
-
89
- database_for('mysql', :logger => foo_logger) do |db|
90
- db.create_table!(:foo) do
91
- primary_key(:id)
92
- String :baz, :collate => "latin1_swedish_ci"
93
- end
94
- db[:foo].import([:id, :baz], [
95
- [1, "tEst"],
96
- [2, "teSt"],
97
- [3, "tesT "],
98
- [4, "TEST"],
99
- [5, "junk"]
100
- ])
101
- end
102
-
103
- database_for('mysql', :logger => bar_logger) do |db|
104
- db.create_table!(:bar) do
105
- primary_key(:id)
106
- String :baz, :collate => "latin1_swedish_ci"
107
- end
108
- db[:bar].import([:id, :baz], [
109
- [1, "Test "],
110
- [2, "tEst "],
111
- [3, "teSt"],
112
- [4, "TEST"],
113
- [5, "junk"]
114
- ])
115
- end
116
-
117
- options = database_options_for('mysql')
118
- ds_1 = Linkage::Dataset.new(options, "foo", :logger => foo_logger)
119
- ds_2 = Linkage::Dataset.new(options, "bar", :logger => bar_logger)
120
- tmpuri = @tmpuri
121
- results_logger = nil #prefixed_logger("RESULTS")
122
- conf = ds_1.link_with(ds_2) do
123
- lhs[:baz].must == rhs[:baz]
124
- save_results_in(tmpuri, :logger => results_logger)
125
- end
126
-
127
- runner = Linkage::SingleThreadedRunner.new(conf)
128
- runner.execute
129
-
130
- database do |db|
131
- assert_equal 2, db[:groups].count
51
+ match_csv = CSV.read(File.join(@tmpdir, 'matches.csv'), :headers => true)
52
+ assert_equal 1000, match_csv.length
53
+ match_csv.each do |row|
54
+ id_1 = row['id_1'].to_i
55
+ id_2 = row['id_2'].to_i
56
+ assert (id_1 % 10) == (id_2 % 10)
132
57
  end
133
58
  end
134
59
  end
@@ -1,224 +1,121 @@
1
1
  require 'helper'
2
2
 
3
- module IntegrationTests
4
- class TestSelfLinkage < Test::Unit::TestCase
5
- def setup
6
- @tmpdir = Dir.mktmpdir('linkage')
7
- @tmpuri = "sqlite://" + File.join(@tmpdir, "foo")
3
+ class IntegrationTests::TestSelfLinkage < Test::Unit::TestCase
4
+ def setup
5
+ @tmpdir = Dir.mktmpdir('linkage')
6
+ @tmpuri = "sqlite://" + File.join(@tmpdir, "foo")
7
+ end
8
+
9
+ def database(&block)
10
+ Sequel.connect(@tmpuri, &block)
11
+ end
12
+
13
+ def teardown
14
+ FileUtils.remove_entry_secure(@tmpdir)
15
+ end
16
+
17
+ test "one-field equality" do
18
+ # insert the test data
19
+ database do |db|
20
+ db.create_table(:foo) { primary_key(:id); String(:ssn) }
21
+ db[:foo].import([:id, :ssn],
22
+ Array.new(100) { |i| [i, "12345678#{i%10}"] })
8
23
  end
9
24
 
10
- def database(&block)
11
- Sequel.connect(@tmpuri, &block)
25
+ result_set = Linkage::ResultSet['csv'].new(@tmpdir)
26
+ dataset = Linkage::Dataset.new(@tmpuri, "foo", :single_threaded => true)
27
+ conf = dataset.link_with(dataset, result_set) do |conf|
28
+ conf.compare([:ssn], [:ssn], :equal)
29
+ conf.algorithm = :mean
30
+ conf.threshold = 1.0
12
31
  end
13
32
 
14
- def teardown
15
- FileUtils.remove_entry_secure(@tmpdir)
33
+ runner = Linkage::Runner.new(conf)
34
+ runner.execute
35
+
36
+ score_csv = CSV.read(File.join(@tmpdir, 'scores.csv'), :headers => true)
37
+ assert_equal 450, score_csv.length
38
+ score_csv.each do |row|
39
+ assert_equal row['id_1'].to_i % 10, row['id_2'].to_i % 10
16
40
  end
17
41
 
18
- test "one mandatory field equality on single threaded runner" do
19
- # insert the test data
20
- database do |db|
21
- db.create_table(:foo) { primary_key(:id); String(:ssn) }
22
- db[:foo].import([:id, :ssn],
23
- Array.new(100) { |i| [i, "12345678#{i%10}"] })
24
- end
25
-
26
- ds = Linkage::Dataset.new(@tmpuri, "foo", :single_threaded => true)
27
- tmpuri = @tmpuri
28
- conf = ds.link_with(ds) do
29
- lhs[:ssn].must == rhs[:ssn]
30
- save_results_in(tmpuri)
31
- end
32
- runner = Linkage::SingleThreadedRunner.new(conf)
33
- result_set = runner.execute
34
- assert_kind_of Linkage::ResultSet, result_set
35
-
36
- database do |db|
37
- assert_equal 10, db[:groups].count
38
- db[:groups].order(:ssn).each_with_index do |row, i|
39
- assert_equal "12345678#{i%10}", row[:ssn]
40
-
41
- group = Linkage::Group.from_row(row)
42
- dataset, _ = result_set.groups_records_datasets(group)
43
- assert_equal 10, dataset.count
44
- end
45
-
46
- assert_equal 450, db[:matches].count
47
- db[:matches].order(:record_1_id, :record_2_id).each do |row|
48
- assert_equal row[:record_1_id] % 10, row[:record_2_id] % 10
49
- end
50
- end
42
+ match_csv = CSV.read(File.join(@tmpdir, 'matches.csv'), :headers => true)
43
+ assert_equal 450, match_csv.length
44
+ match_csv.each do |row|
45
+ assert_equal row['id_1'].to_i % 10, row['id_2'].to_i % 10
51
46
  end
47
+ end
52
48
 
53
- test "two mandatory field equalities on single threaded runner" do
54
- # insert the test data
55
- database do |db|
56
- db.create_table(:foo) { primary_key(:id); String(:ssn); Date(:dob) }
57
- db[:foo].import([:id, :ssn, :dob],
58
- Array.new(100) { |i| [i, "12345678#{i%10}", Date.civil(1985, 1, (i % 20) + 1)] })
59
- end
60
-
61
- ds = Linkage::Dataset.new(@tmpuri, "foo", :single_threaded => true)
62
- tmpuri = @tmpuri
63
- conf = ds.link_with(ds) do
64
- lhs[:ssn].must == rhs[:ssn]
65
- lhs[:dob].must == rhs[:dob]
66
- save_results_in(tmpuri)
67
- end
68
- runner = Linkage::SingleThreadedRunner.new(conf)
69
- runner.execute
70
-
71
- database do |db|
72
- assert_equal 20, db[:groups].count
73
- db[:groups].order(:ssn).each_with_index do |row, i|
74
- assert_equal "12345678#{i/2}", row[:ssn]
75
- assert_equal Date.civil(1985, 1, i / 2 + 1 + (i % 2 == 0 ? 0 : 10)), row[:dob]
76
- end
77
-
78
- #assert_equal 100, db[:groups_records].count
79
- #expected_group_id = nil
80
- #db[:groups_records].order(:record_id).each do |row|
81
- #v = row[:record_id] % 20
82
- #expected_group_id = v < 10 ? 1 + 2 * v : 2 * (v % 10 + 1)
83
- #assert_equal expected_group_id, row[:group_id], "Record #{row[:record_id]} should have been in group #{expected_group_id}"
84
- #end
85
- end
49
+ test "two-field equality" do
50
+ # insert the test data
51
+ database do |db|
52
+ db.create_table(:foo) { primary_key(:id); String(:ssn); Date(:dob) }
53
+ db[:foo].import([:id, :ssn, :dob],
54
+ Array.new(100) { |i| [i, "12345678#{i%10}", Date.civil(1985, 1, (i % 20) + 1)] })
86
55
  end
87
56
 
88
- test "one mandatory field equality on single threaded runner, with filter" do
89
- # insert the test data
90
- database do |db|
91
- db.create_table(:foo) { primary_key(:id); String(:ssn); Integer(:mod_5) }
92
- db[:foo].import([:id, :ssn, :mod_5],
93
- Array.new(100) { |i| [i, "12345678#{i%10}", i % 5] })
94
- end
95
-
96
- ds = Linkage::Dataset.new(@tmpuri, "foo", :single_threaded => true)
97
- tmpuri = @tmpuri
98
- conf = ds.link_with(ds) do
99
- lhs[:ssn].must == rhs[:ssn]
100
- lhs[:mod_5].must == 3
101
- save_results_in(tmpuri)
102
- end
103
- runner = Linkage::SingleThreadedRunner.new(conf)
104
- runner.execute
105
-
106
- database do |db|
107
- assert_equal 2, db[:groups].count
108
- db[:groups].order(:ssn).each_with_index do |row, i|
109
- assert_equal "12345678#{(i * 5) + 3}", row[:ssn]
110
- end
111
-
112
- #assert_equal 20, db[:groups_records].count
113
- #expected_group_id = nil
114
- #db[:groups_records].order(:record_id).each do |row|
115
- #expected_group_id = (row[:record_id] / 5) % 2 + 1
116
- #assert_equal expected_group_id, row[:group_id], "Record #{row[:record_id]} should have been in group #{expected_group_id}"
117
- #end
118
- end
57
+ result_set = Linkage::ResultSet['csv'].new(@tmpdir)
58
+ dataset = Linkage::Dataset.new(@tmpuri, "foo", :single_threaded => true)
59
+ conf = dataset.link_with(dataset, result_set) do |conf|
60
+ conf.compare([:ssn, :dob], [:ssn, :dob], :equal)
119
61
  end
120
62
 
121
- test "one mandatory field equality on single threaded runner, with inequality filters" do
122
- # insert the test data
123
- database do |db|
124
- db.create_table(:foo) { primary_key(:id); String(:ssn); Integer(:mod_5) }
125
- db[:foo].import([:id, :ssn, :mod_5],
126
- Array.new(100) { |i| [i, "12345678#{i%10}", i % 5] })
127
- end
128
-
129
- ds = Linkage::Dataset.new(@tmpuri, "foo", :single_threaded => true)
130
- tmpuri = @tmpuri
131
- conf = ds.link_with(ds) do
132
- lhs[:ssn].must == rhs[:ssn]
133
- lhs[:mod_5].must > 2
134
- lhs[:mod_5].must <= 3
135
- save_results_in(tmpuri)
136
- end
137
- runner = Linkage::SingleThreadedRunner.new(conf)
138
- runner.execute
139
-
140
- database do |db|
141
- assert_equal 2, db[:groups].count
142
- db[:groups].order(:ssn).each_with_index do |row, i|
143
- assert_equal "12345678#{(i * 5) + 3}", row[:ssn]
144
- end
145
-
146
- #assert_equal 20, db[:groups_records].count
147
- #expected_group_id = nil
148
- #db[:groups_records].order(:record_id).each do |row|
149
- #expected_group_id = (row[:record_id] / 5) % 2 + 1
150
- #assert_equal expected_group_id, row[:group_id], "Record #{row[:record_id]} should have been in group #{expected_group_id}"
151
- #end
152
- end
63
+ runner = Linkage::Runner.new(conf)
64
+ runner.execute
65
+
66
+ score_csv = CSV.read(File.join(@tmpdir, 'scores.csv'), :headers => true)
67
+ assert_equal 200, score_csv.length
68
+ score_csv.each do |row|
69
+ id_1 = row['id_1'].to_i
70
+ id_2 = row['id_2'].to_i
71
+ assert id_1 % 10 == id_2 % 10
72
+ assert id_1 % 20 == id_2 % 20
153
73
  end
154
74
 
155
- test "one mandatory field equality on single threaded runner, with field filter" do
156
- # insert the test data
157
- database do |db|
158
- db.create_table(:foo) { primary_key(:id); String(:ssn); Integer(:mod_5); Integer(:mod_20) }
159
- db[:foo].import([:id, :ssn, :mod_5, :mod_20],
160
- Array.new(100) { |i| [i, "123456789#{i%10}", i % 5, i % 20] })
161
- end
162
-
163
- ds = Linkage::Dataset.new(@tmpuri, "foo", :single_threaded => true)
164
- tmpuri = @tmpuri
165
- conf = ds.link_with(ds) do
166
- lhs[:ssn].must == rhs[:ssn]
167
- lhs[:mod_5].must == lhs[:mod_20]
168
- save_results_in(tmpuri)
169
- end
170
- assert_equal :self, conf.linkage_type
171
- runner = Linkage::SingleThreadedRunner.new(conf)
172
- runner.execute
173
-
174
- database do |db|
175
- assert_equal 5, db[:groups].count
176
- db[:groups].order(:ssn).each_with_index do |row, i|
177
- assert_equal "123456789#{i}", row[:ssn]
178
- end
179
-
180
- #assert_equal 25, db[:groups_records].count
181
- #expected_group_id = nil
182
- #db[:groups_records].order(:record_id).each do |row|
183
- #expected_group_id = row[:record_id] % 5 + 1
184
- #assert_equal expected_group_id, row[:group_id], "Record #{row[:record_id]} should have been in group #{expected_group_id}"
185
- #end
186
- end
75
+ match_csv = CSV.read(File.join(@tmpdir, 'matches.csv'), :headers => true)
76
+ assert_equal 200, match_csv.length
77
+ match_csv.each do |row|
78
+ id_1 = row['id_1'].to_i
79
+ id_2 = row['id_2'].to_i
80
+ assert id_1 % 10 == id_2 % 10
81
+ assert id_1 % 20 == id_2 % 20
82
+ end
83
+ end
84
+
85
+ test "one-field equality with blocking" do
86
+ # insert the test data
87
+ database do |db|
88
+ db.create_table(:foo) { primary_key(:id); String(:ssn); Integer(:mod_5) }
89
+ db[:foo].import([:id, :ssn, :mod_5],
90
+ Array.new(100) { |i| [i, "12345678#{i%10}", i % 5] })
91
+ end
92
+
93
+ result_set = Linkage::ResultSet['csv'].new(@tmpdir)
94
+ dataset = Linkage::Dataset.new(@tmpuri, "foo", :single_threaded => true)
95
+ dataset = dataset.filter(:mod_5 => 3)
96
+ conf = dataset.link_with(dataset, result_set) do |conf|
97
+ conf.compare([:ssn], [:ssn], :equal)
98
+ end
99
+
100
+ runner = Linkage::Runner.new(conf)
101
+ runner.execute
102
+
103
+ score_csv = CSV.read(File.join(@tmpdir, 'scores.csv'), :headers => true)
104
+ assert_equal 90, score_csv.length
105
+ score_csv.each do |row|
106
+ id_1 = row['id_1'].to_i
107
+ id_2 = row['id_2'].to_i
108
+ assert id_1 % 10 == id_2 % 10
109
+ assert id_1 % 5 == id_2 % 5
187
110
  end
188
111
 
189
- test "one mandatory field equality on single threaded runner, with identical filters" do
190
- # insert the test data
191
- database do |db|
192
- db.create_table(:foo) { primary_key(:id); String(:ssn); Integer(:mod_5); Integer(:mod_20) }
193
- db[:foo].import([:id, :ssn, :mod_5, :mod_20],
194
- Array.new(100) { |i| [i, "123456789#{i%10}", i % 5, i % 20] })
195
- end
196
-
197
- ds = Linkage::Dataset.new(@tmpuri, "foo", :single_threaded => true)
198
- tmpuri = @tmpuri
199
- conf = ds.link_with(ds) do
200
- lhs[:ssn].must == rhs[:ssn]
201
- lhs[:mod_5].must == lhs[:mod_20]
202
- rhs[:mod_5].must == rhs[:mod_20]
203
- save_results_in(tmpuri)
204
- end
205
- assert_equal :self, conf.linkage_type
206
- runner = Linkage::SingleThreadedRunner.new(conf)
207
- runner.execute
208
-
209
- database do |db|
210
- assert_equal 5, db[:groups].count
211
- db[:groups].order(:ssn).each_with_index do |row, i|
212
- assert_equal "123456789#{i}", row[:ssn]
213
- end
214
-
215
- #assert_equal 25, db[:groups_records].count
216
- #expected_group_id = nil
217
- #db[:groups_records].order(:record_id).each do |row|
218
- #expected_group_id = row[:record_id] % 5 + 1
219
- #assert_equal expected_group_id, row[:group_id], "Record #{row[:record_id]} should have been in group #{expected_group_id}"
220
- #end
221
- end
112
+ match_csv = CSV.read(File.join(@tmpdir, 'matches.csv'), :headers => true)
113
+ assert_equal 90, match_csv.length
114
+ match_csv.each do |row|
115
+ id_1 = row['id_1'].to_i
116
+ id_2 = row['id_2'].to_i
117
+ assert id_1 % 10 == id_2 % 10
118
+ assert id_1 % 5 == id_2 % 5
222
119
  end
223
120
  end
224
121
  end
@@ -2,97 +2,44 @@ require 'helper'
2
2
 
3
3
  module IntegrationTests
4
4
  class TestWithinComparator < Test::Unit::TestCase
5
- test "within comparator with no simple expectations" do
6
- database_for('sqlite') do |db|
7
- db.create_table(:foo) { primary_key(:id); Integer(:num) }
8
- db.create_table(:bar) { primary_key(:id); Integer(:num) }
9
- db[:foo].import([:id, :num], (1..10).collect { |i| [i, i] })
10
- db[:bar].import([:id, :num], (1..10).collect { |i| [i, i] })
11
- end
12
-
13
- db_opts = database_options_for('sqlite')
14
- dataset_1 = Linkage::Dataset.new(db_opts, "foo")
15
- dataset_2 = Linkage::Dataset.new(db_opts, "bar")
16
- conf = dataset_1.link_with(dataset_2) do
17
- lhs[:num].must be_within(5).of(rhs[:num])
18
- save_results_in(db_opts)
19
- end
20
-
21
- runner = Linkage::SingleThreadedRunner.new(conf)
22
- runner.execute
23
-
24
- database_for('sqlite') do |db|
25
- assert_equal db[:scores].count, 100
26
- db[:scores].order(:record_1_id, :record_2_id).each do |score|
27
- if (score[:record_2_id] - score[:record_1_id]).abs <= 5
28
- assert_equal 1, score[:score], score.inspect
29
- else
30
- assert_equal 0, score[:score], score.inspect
31
- end
32
- end
33
- end
5
+ def setup
6
+ @tmpdir = Dir.mktmpdir('linkage')
34
7
  end
35
8
 
36
- test "within comparator with simple expectations" do
37
- database_for('sqlite') do |db|
38
- db.create_table(:foo) { primary_key(:id); Integer(:num); String(:parity) }
39
- db.create_table(:bar) { primary_key(:id); Integer(:num); String(:parity) }
40
- db[:foo].import([:id, :num, :parity], (1..10).collect { |i| [i, i, i % 2 == 0 ? "even" : "odd"] })
41
- db[:bar].import([:id, :num, :parity], (1..10).collect { |i| [i, i, i % 2 == 0 ? "even" : "odd"] })
42
- end
43
-
44
- db_opts = database_options_for('sqlite')
45
- dataset_1 = Linkage::Dataset.new(db_opts, "foo")
46
- dataset_2 = Linkage::Dataset.new(db_opts, "bar")
47
- conf = dataset_1.link_with(dataset_2) do
48
- lhs[:parity].must == rhs[:parity]
49
- lhs[:num].must be_within(5).of(rhs[:num])
50
- save_results_in(db_opts)
51
- end
52
-
53
- runner = Linkage::SingleThreadedRunner.new(conf)
54
- runner.execute
55
-
56
- database_for('sqlite') do |db|
57
- assert_equal db[:scores].count, 50
58
- db[:scores].order(:record_1_id, :record_2_id).each do |score|
59
- if (score[:record_2_id] - score[:record_1_id]).abs <= 5
60
- assert_equal 1, score[:score]
61
- else
62
- assert_equal 0, score[:score]
63
- end
64
- end
65
- end
9
+ def teardown
10
+ FileUtils.remove_entry_secure(@tmpdir)
66
11
  end
67
12
 
68
- test "within comparator with simple expectations and functions" do
13
+ test "within comparator" do
69
14
  database_for('sqlite') do |db|
70
- db.create_table(:foo) { primary_key(:id); Integer(:num); String(:parity) }
71
- db.create_table(:bar) { primary_key(:id); Integer(:num); String(:parity) }
72
- db[:foo].import([:id, :num, :parity], (1..10).collect { |i| [i, i, i % 2 == 0 ? "even" : "odd"] })
73
- db[:bar].import([:id, :num, :parity], (1..10).collect { |i| [i, i, i % 2 == 0 ? "even" : "odd"] })
15
+ db.create_table(:foo) { primary_key(:id); Integer(:num) }
16
+ db.create_table(:bar) { primary_key(:id); Integer(:num) }
17
+ db[:foo].import([:id, :num], (1..10).collect { |i| [i, i] })
18
+ db[:bar].import([:id, :num], (1..10).collect { |i| [i, i] })
74
19
  end
75
20
 
21
+ result_set = Linkage::ResultSet['csv'].new(@tmpdir)
76
22
  db_opts = database_options_for('sqlite')
77
23
  dataset_1 = Linkage::Dataset.new(db_opts, "foo")
78
24
  dataset_2 = Linkage::Dataset.new(db_opts, "bar")
79
- conf = dataset_1.link_with(dataset_2) do
80
- lhs[:parity].must == rhs[:parity]
81
- cast(lhs[:num], 'integer').must be_within(5).of(cast(rhs[:num], 'integer'))
82
- save_results_in(db_opts)
25
+ conf = dataset_1.link_with(dataset_2, result_set) do |conf|
26
+ conf.within(:num, :num, 5)
83
27
  end
84
28
 
85
- runner = Linkage::SingleThreadedRunner.new(conf)
29
+ runner = Linkage::Runner.new(conf)
86
30
  runner.execute
87
31
 
88
- database_for('sqlite') do |db|
89
- assert_equal db[:scores].count, 50
90
- db[:scores].order(:record_1_id, :record_2_id).each do |score|
91
- if (score[:record_2_id] - score[:record_1_id]).abs <= 5
92
- assert_equal 1, score[:score]
93
- else
94
- assert_equal 0, score[:score]
95
- end
32
+ score_csv = CSV.read(File.join(@tmpdir, 'scores.csv'), :headers => true)
33
+ assert_equal 100, score_csv.length
34
+ score_csv.each do |row|
35
+ assert_equal "1", row['comparator_id']
36
+ # ids same as values
37
+ id_1 = row['id_1'].to_i
38
+ id_2 = row['id_2'].to_i
39
+ if (id_2 - id_1).abs <= 5
40
+ assert_equal 1, row['score'].to_i, row
41
+ else
42
+ assert_equal 0, row['score'].to_i
96
43
  end
97
44
  end
98
45
  end