linkage 0.0.8 → 0.1.0.pre

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between those package versions as they appear in their respective public registries.
Files changed (105):
  1. checksums.yaml +7 -0
  2. data/.gitignore +1 -0
  3. data/.yardopts +1 -0
  4. data/Gemfile +1 -19
  5. data/Gemfile-java +3 -0
  6. data/README.markdown +88 -34
  7. data/Rakefile +16 -15
  8. data/TODO +4 -0
  9. data/lib/linkage/comparator.rb +139 -144
  10. data/lib/linkage/comparators/compare.rb +236 -29
  11. data/lib/linkage/comparators/strcompare.rb +85 -0
  12. data/lib/linkage/comparators/within.rb +24 -20
  13. data/lib/linkage/configuration.rb +44 -466
  14. data/lib/linkage/dataset.rb +28 -127
  15. data/lib/linkage/exceptions.rb +5 -0
  16. data/lib/linkage/field.rb +6 -37
  17. data/lib/linkage/field_set.rb +3 -3
  18. data/lib/linkage/match_recorder.rb +22 -0
  19. data/lib/linkage/match_set.rb +34 -0
  20. data/lib/linkage/match_sets/csv.rb +39 -0
  21. data/lib/linkage/match_sets/database.rb +45 -0
  22. data/lib/linkage/matcher.rb +30 -0
  23. data/lib/linkage/result_set.rb +25 -110
  24. data/lib/linkage/result_sets/csv.rb +54 -0
  25. data/lib/linkage/result_sets/database.rb +42 -0
  26. data/lib/linkage/runner.rb +57 -16
  27. data/lib/linkage/score_recorder.rb +30 -0
  28. data/lib/linkage/score_set.rb +49 -0
  29. data/lib/linkage/score_sets/csv.rb +64 -0
  30. data/lib/linkage/score_sets/database.rb +77 -0
  31. data/lib/linkage/version.rb +1 -1
  32. data/lib/linkage.rb +14 -17
  33. data/linkage.gemspec +13 -1
  34. data/linkage.gemspec-java +32 -0
  35. data/test/helper.rb +30 -23
  36. data/test/integration/test_cross_linkage.rb +46 -25
  37. data/test/integration/test_database_result_set.rb +55 -0
  38. data/test/integration/test_dual_linkage.rb +19 -94
  39. data/test/integration/test_self_linkage.rb +100 -203
  40. data/test/integration/test_within_comparator.rb +24 -77
  41. data/test/unit/comparators/test_compare.rb +254 -50
  42. data/test/unit/comparators/test_strcompare.rb +45 -0
  43. data/test/unit/comparators/test_within.rb +14 -26
  44. data/test/unit/match_sets/test_csv.rb +78 -0
  45. data/test/unit/match_sets/test_database.rb +63 -0
  46. data/test/unit/result_sets/test_csv.rb +111 -0
  47. data/test/unit/result_sets/test_database.rb +68 -0
  48. data/test/unit/score_sets/test_csv.rb +151 -0
  49. data/test/unit/score_sets/test_database.rb +149 -0
  50. data/test/unit/test_comparator.rb +46 -83
  51. data/test/unit/test_comparators.rb +4 -0
  52. data/test/unit/test_configuration.rb +99 -145
  53. data/test/unit/test_dataset.rb +52 -73
  54. data/test/unit/test_field.rb +4 -55
  55. data/test/unit/test_field_set.rb +6 -6
  56. data/test/unit/test_match_recorder.rb +23 -0
  57. data/test/unit/test_match_set.rb +23 -0
  58. data/test/unit/test_match_sets.rb +4 -0
  59. data/test/unit/test_matcher.rb +44 -0
  60. data/test/unit/test_result_set.rb +24 -223
  61. data/test/unit/test_result_sets.rb +4 -0
  62. data/test/unit/test_runner.rb +122 -17
  63. data/test/unit/test_runners.rb +4 -0
  64. data/test/unit/test_score_recorder.rb +25 -0
  65. data/test/unit/test_score_set.rb +37 -0
  66. data/test/unit/test_score_sets.rb +4 -0
  67. metadata +183 -90
  68. data/Gemfile.lock +0 -92
  69. data/lib/linkage/comparators/binary.rb +0 -12
  70. data/lib/linkage/data.rb +0 -175
  71. data/lib/linkage/decollation.rb +0 -93
  72. data/lib/linkage/expectation.rb +0 -21
  73. data/lib/linkage/expectations/exhaustive.rb +0 -63
  74. data/lib/linkage/expectations/simple.rb +0 -168
  75. data/lib/linkage/function.rb +0 -148
  76. data/lib/linkage/functions/binary.rb +0 -30
  77. data/lib/linkage/functions/cast.rb +0 -54
  78. data/lib/linkage/functions/length.rb +0 -29
  79. data/lib/linkage/functions/strftime.rb +0 -33
  80. data/lib/linkage/functions/trim.rb +0 -30
  81. data/lib/linkage/group.rb +0 -55
  82. data/lib/linkage/meta_object.rb +0 -139
  83. data/lib/linkage/runner/single_threaded.rb +0 -187
  84. data/lib/linkage/utils.rb +0 -164
  85. data/lib/linkage/warnings.rb +0 -5
  86. data/test/integration/test_collation.rb +0 -45
  87. data/test/integration/test_configuration.rb +0 -268
  88. data/test/integration/test_dataset.rb +0 -116
  89. data/test/integration/test_functions.rb +0 -88
  90. data/test/integration/test_result_set.rb +0 -85
  91. data/test/integration/test_scoring.rb +0 -84
  92. data/test/unit/expectations/test_exhaustive.rb +0 -111
  93. data/test/unit/expectations/test_simple.rb +0 -303
  94. data/test/unit/functions/test_binary.rb +0 -54
  95. data/test/unit/functions/test_cast.rb +0 -98
  96. data/test/unit/functions/test_length.rb +0 -52
  97. data/test/unit/functions/test_strftime.rb +0 -60
  98. data/test/unit/functions/test_trim.rb +0 -43
  99. data/test/unit/runner/test_single_threaded.rb +0 -12
  100. data/test/unit/test_data.rb +0 -445
  101. data/test/unit/test_decollation.rb +0 -201
  102. data/test/unit/test_function.rb +0 -233
  103. data/test/unit/test_group.rb +0 -38
  104. data/test/unit/test_meta_object.rb +0 -208
  105. data/test/unit/test_utils.rb +0 -341
@@ -0,0 +1,55 @@
1
+ require 'helper'
2
+
3
+ class IntegrationTests::TestDatabaseResultSet < Test::Unit::TestCase
4
+ def setup
5
+ @dir = Dir.mktmpdir('linkage')
6
+ @data_uri = "sqlite://" + File.join(@dir, "foo")
7
+ @results_uri = "sqlite://" + File.join(@dir, "bar")
8
+ end
9
+
10
+ def teardown
11
+ FileUtils.remove_entry_secure(@dir)
12
+ end
13
+
14
+ def data_database(options = {}, &block)
15
+ Sequel.connect(@data_uri, options, &block)
16
+ end
17
+
18
+ def results_database(options = {}, &block)
19
+ Sequel.connect(@results_uri, options, &block)
20
+ end
21
+
22
+ test "using a database for storing results" do
23
+ data_database do |db|
24
+ db.create_table(:foo) { primary_key(:id); Integer(:foo); Integer(:bar) }
25
+ db[:foo].import([:id, :foo, :bar], Array.new(10) { |i| [i, i, i] })
26
+ end
27
+
28
+ ds = Linkage::Dataset.new(@data_uri, 'foo')
29
+ result_set = Linkage::ResultSet['database'].new(@results_uri)
30
+ conf = ds.link_with(ds, result_set) do |conf|
31
+ conf.compare([:foo], [:bar], :equal)
32
+ conf.algorithm = :mean
33
+ conf.threshold = 1
34
+ end
35
+ runner = Linkage::Runner.new(conf)
36
+ runner.execute
37
+
38
+ results_database do |db|
39
+ assert db.table_exists?(:scores)
40
+ assert_equal 10, db[:scores].count
41
+ db[:scores].order(:id_1, :id_2).each do |row|
42
+ assert_equal row[:id_1], row[:id_2]
43
+ assert_equal 1, row[:comparator_id]
44
+ assert_same 1.0, row[:score]
45
+ end
46
+
47
+ assert db.table_exists?(:matches)
48
+ assert_equal 10, db[:matches].count
49
+ db[:matches].order(:id_1, :id_2).each do |row|
50
+ assert_equal row[:id_1], row[:id_2]
51
+ assert_same 1.0, row[:score]
52
+ end
53
+ end
54
+ end
55
+ end
@@ -15,7 +15,7 @@ module IntegrationTests
15
15
  FileUtils.remove_entry_secure(@tmpdir)
16
16
  end
17
17
 
18
- test "one mandatory field equality on single threaded runner" do
18
+ test "one-field equality on single threaded runner" do
19
19
  # create the test data
20
20
  database do |db|
21
21
  db.create_table(:foo) { primary_key(:id); String(:ssn) }
@@ -27,108 +27,33 @@ module IntegrationTests
27
27
  Array.new(100) { |i| [i, "12345678#{i%10}"] })
28
28
  end
29
29
 
30
+ result_set = Linkage::ResultSet['csv'].new(@tmpdir)
30
31
  ds_1 = Linkage::Dataset.new(@tmpuri, "foo", :single_threaded => true)
31
32
  ds_2 = Linkage::Dataset.new(@tmpuri, "bar", :single_threaded => true)
32
33
  tmpuri = @tmpuri
33
- conf = ds_1.link_with(ds_2) do
34
- lhs[:ssn].must == rhs[:ssn]
35
- save_results_in(tmpuri)
34
+ conf = ds_1.link_with(ds_2, result_set) do |conf|
35
+ conf.compare([:ssn], [:ssn], :equal)
36
+ conf.algorithm = :mean
37
+ conf.threshold = 1.0
36
38
  end
37
- assert_equal :dual, conf.linkage_type
38
39
 
39
- runner = Linkage::SingleThreadedRunner.new(conf)
40
+ runner = Linkage::Runner.new(conf)
40
41
  runner.execute
41
42
 
42
- database do |db|
43
- assert_equal 10, db[:groups].count
44
- db[:groups].order(:ssn).each_with_index do |row, i|
45
- assert_equal "12345678#{i%10}", row[:ssn]
46
- end
47
-
48
- assert_equal 1000, db[:matches].count
49
- db[:matches].order(:record_1_id, :record_2_id).each do |row|
50
- assert_equal row[:record_1_id] % 10, row[:record_2_id] % 10
51
- end
43
+ score_csv = CSV.read(File.join(@tmpdir, 'scores.csv'), :headers => true)
44
+ assert_equal 1000, score_csv.length
45
+ score_csv.each do |row|
46
+ id_1 = row['id_1'].to_i
47
+ id_2 = row['id_2'].to_i
48
+ assert (id_1 % 10) == (id_2 % 10)
52
49
  end
53
- end
54
50
 
55
- test "don't ignore 1-record groups before the combining phase" do
56
- # create the test data
57
- database do |db|
58
- db.create_table(:foo) { primary_key(:id); String(:ssn) }
59
- db[:foo].import([:id, :ssn],
60
- Array.new(100) { |i| [i, "1234567%03d" % i] })
61
-
62
- db.create_table(:bar) { primary_key(:id); String(:ssn) }
63
- db[:bar].import([:id, :ssn],
64
- Array.new(100) { |i| [i, "1234567%03d" % i] })
65
- end
66
-
67
- ds_1 = Linkage::Dataset.new(@tmpuri, "foo", :single_threaded => true)
68
- ds_2 = Linkage::Dataset.new(@tmpuri, "bar", :single_threaded => true)
69
- tmpuri = @tmpuri
70
- conf = ds_1.link_with(ds_2) do
71
- lhs[:ssn].must == rhs[:ssn]
72
- save_results_in(tmpuri)
73
- end
74
- runner = Linkage::SingleThreadedRunner.new(conf)
75
- runner.execute
76
-
77
- database do |db|
78
- assert_equal 100, db[:groups].count
79
- db[:groups].order(:ssn).each_with_index do |row, i|
80
- assert_equal "1234567%03d" % i, row[:ssn]
81
- end
82
- end
83
- end
84
-
85
- test "reacts properly when using two databases with different string equality methods" do
86
- foo_logger = nil #prefixed_logger("FOO")
87
- bar_logger = nil #prefixed_logger("BAR")
88
-
89
- database_for('mysql', :logger => foo_logger) do |db|
90
- db.create_table!(:foo) do
91
- primary_key(:id)
92
- String :baz, :collate => "latin1_swedish_ci"
93
- end
94
- db[:foo].import([:id, :baz], [
95
- [1, "tEst"],
96
- [2, "teSt"],
97
- [3, "tesT "],
98
- [4, "TEST"],
99
- [5, "junk"]
100
- ])
101
- end
102
-
103
- database_for('mysql', :logger => bar_logger) do |db|
104
- db.create_table!(:bar) do
105
- primary_key(:id)
106
- String :baz, :collate => "latin1_swedish_ci"
107
- end
108
- db[:bar].import([:id, :baz], [
109
- [1, "Test "],
110
- [2, "tEst "],
111
- [3, "teSt"],
112
- [4, "TEST"],
113
- [5, "junk"]
114
- ])
115
- end
116
-
117
- options = database_options_for('mysql')
118
- ds_1 = Linkage::Dataset.new(options, "foo", :logger => foo_logger)
119
- ds_2 = Linkage::Dataset.new(options, "bar", :logger => bar_logger)
120
- tmpuri = @tmpuri
121
- results_logger = nil #prefixed_logger("RESULTS")
122
- conf = ds_1.link_with(ds_2) do
123
- lhs[:baz].must == rhs[:baz]
124
- save_results_in(tmpuri, :logger => results_logger)
125
- end
126
-
127
- runner = Linkage::SingleThreadedRunner.new(conf)
128
- runner.execute
129
-
130
- database do |db|
131
- assert_equal 2, db[:groups].count
51
+ match_csv = CSV.read(File.join(@tmpdir, 'matches.csv'), :headers => true)
52
+ assert_equal 1000, match_csv.length
53
+ match_csv.each do |row|
54
+ id_1 = row['id_1'].to_i
55
+ id_2 = row['id_2'].to_i
56
+ assert (id_1 % 10) == (id_2 % 10)
132
57
  end
133
58
  end
134
59
  end
@@ -1,224 +1,121 @@
1
1
  require 'helper'
2
2
 
3
- module IntegrationTests
4
- class TestSelfLinkage < Test::Unit::TestCase
5
- def setup
6
- @tmpdir = Dir.mktmpdir('linkage')
7
- @tmpuri = "sqlite://" + File.join(@tmpdir, "foo")
3
+ class IntegrationTests::TestSelfLinkage < Test::Unit::TestCase
4
+ def setup
5
+ @tmpdir = Dir.mktmpdir('linkage')
6
+ @tmpuri = "sqlite://" + File.join(@tmpdir, "foo")
7
+ end
8
+
9
+ def database(&block)
10
+ Sequel.connect(@tmpuri, &block)
11
+ end
12
+
13
+ def teardown
14
+ FileUtils.remove_entry_secure(@tmpdir)
15
+ end
16
+
17
+ test "one-field equality" do
18
+ # insert the test data
19
+ database do |db|
20
+ db.create_table(:foo) { primary_key(:id); String(:ssn) }
21
+ db[:foo].import([:id, :ssn],
22
+ Array.new(100) { |i| [i, "12345678#{i%10}"] })
8
23
  end
9
24
 
10
- def database(&block)
11
- Sequel.connect(@tmpuri, &block)
25
+ result_set = Linkage::ResultSet['csv'].new(@tmpdir)
26
+ dataset = Linkage::Dataset.new(@tmpuri, "foo", :single_threaded => true)
27
+ conf = dataset.link_with(dataset, result_set) do |conf|
28
+ conf.compare([:ssn], [:ssn], :equal)
29
+ conf.algorithm = :mean
30
+ conf.threshold = 1.0
12
31
  end
13
32
 
14
- def teardown
15
- FileUtils.remove_entry_secure(@tmpdir)
33
+ runner = Linkage::Runner.new(conf)
34
+ runner.execute
35
+
36
+ score_csv = CSV.read(File.join(@tmpdir, 'scores.csv'), :headers => true)
37
+ assert_equal 450, score_csv.length
38
+ score_csv.each do |row|
39
+ assert_equal row['id_1'].to_i % 10, row['id_2'].to_i % 10
16
40
  end
17
41
 
18
- test "one mandatory field equality on single threaded runner" do
19
- # insert the test data
20
- database do |db|
21
- db.create_table(:foo) { primary_key(:id); String(:ssn) }
22
- db[:foo].import([:id, :ssn],
23
- Array.new(100) { |i| [i, "12345678#{i%10}"] })
24
- end
25
-
26
- ds = Linkage::Dataset.new(@tmpuri, "foo", :single_threaded => true)
27
- tmpuri = @tmpuri
28
- conf = ds.link_with(ds) do
29
- lhs[:ssn].must == rhs[:ssn]
30
- save_results_in(tmpuri)
31
- end
32
- runner = Linkage::SingleThreadedRunner.new(conf)
33
- result_set = runner.execute
34
- assert_kind_of Linkage::ResultSet, result_set
35
-
36
- database do |db|
37
- assert_equal 10, db[:groups].count
38
- db[:groups].order(:ssn).each_with_index do |row, i|
39
- assert_equal "12345678#{i%10}", row[:ssn]
40
-
41
- group = Linkage::Group.from_row(row)
42
- dataset, _ = result_set.groups_records_datasets(group)
43
- assert_equal 10, dataset.count
44
- end
45
-
46
- assert_equal 450, db[:matches].count
47
- db[:matches].order(:record_1_id, :record_2_id).each do |row|
48
- assert_equal row[:record_1_id] % 10, row[:record_2_id] % 10
49
- end
50
- end
42
+ match_csv = CSV.read(File.join(@tmpdir, 'matches.csv'), :headers => true)
43
+ assert_equal 450, match_csv.length
44
+ match_csv.each do |row|
45
+ assert_equal row['id_1'].to_i % 10, row['id_2'].to_i % 10
51
46
  end
47
+ end
52
48
 
53
- test "two mandatory field equalities on single threaded runner" do
54
- # insert the test data
55
- database do |db|
56
- db.create_table(:foo) { primary_key(:id); String(:ssn); Date(:dob) }
57
- db[:foo].import([:id, :ssn, :dob],
58
- Array.new(100) { |i| [i, "12345678#{i%10}", Date.civil(1985, 1, (i % 20) + 1)] })
59
- end
60
-
61
- ds = Linkage::Dataset.new(@tmpuri, "foo", :single_threaded => true)
62
- tmpuri = @tmpuri
63
- conf = ds.link_with(ds) do
64
- lhs[:ssn].must == rhs[:ssn]
65
- lhs[:dob].must == rhs[:dob]
66
- save_results_in(tmpuri)
67
- end
68
- runner = Linkage::SingleThreadedRunner.new(conf)
69
- runner.execute
70
-
71
- database do |db|
72
- assert_equal 20, db[:groups].count
73
- db[:groups].order(:ssn).each_with_index do |row, i|
74
- assert_equal "12345678#{i/2}", row[:ssn]
75
- assert_equal Date.civil(1985, 1, i / 2 + 1 + (i % 2 == 0 ? 0 : 10)), row[:dob]
76
- end
77
-
78
- #assert_equal 100, db[:groups_records].count
79
- #expected_group_id = nil
80
- #db[:groups_records].order(:record_id).each do |row|
81
- #v = row[:record_id] % 20
82
- #expected_group_id = v < 10 ? 1 + 2 * v : 2 * (v % 10 + 1)
83
- #assert_equal expected_group_id, row[:group_id], "Record #{row[:record_id]} should have been in group #{expected_group_id}"
84
- #end
85
- end
49
+ test "two-field equality" do
50
+ # insert the test data
51
+ database do |db|
52
+ db.create_table(:foo) { primary_key(:id); String(:ssn); Date(:dob) }
53
+ db[:foo].import([:id, :ssn, :dob],
54
+ Array.new(100) { |i| [i, "12345678#{i%10}", Date.civil(1985, 1, (i % 20) + 1)] })
86
55
  end
87
56
 
88
- test "one mandatory field equality on single threaded runner, with filter" do
89
- # insert the test data
90
- database do |db|
91
- db.create_table(:foo) { primary_key(:id); String(:ssn); Integer(:mod_5) }
92
- db[:foo].import([:id, :ssn, :mod_5],
93
- Array.new(100) { |i| [i, "12345678#{i%10}", i % 5] })
94
- end
95
-
96
- ds = Linkage::Dataset.new(@tmpuri, "foo", :single_threaded => true)
97
- tmpuri = @tmpuri
98
- conf = ds.link_with(ds) do
99
- lhs[:ssn].must == rhs[:ssn]
100
- lhs[:mod_5].must == 3
101
- save_results_in(tmpuri)
102
- end
103
- runner = Linkage::SingleThreadedRunner.new(conf)
104
- runner.execute
105
-
106
- database do |db|
107
- assert_equal 2, db[:groups].count
108
- db[:groups].order(:ssn).each_with_index do |row, i|
109
- assert_equal "12345678#{(i * 5) + 3}", row[:ssn]
110
- end
111
-
112
- #assert_equal 20, db[:groups_records].count
113
- #expected_group_id = nil
114
- #db[:groups_records].order(:record_id).each do |row|
115
- #expected_group_id = (row[:record_id] / 5) % 2 + 1
116
- #assert_equal expected_group_id, row[:group_id], "Record #{row[:record_id]} should have been in group #{expected_group_id}"
117
- #end
118
- end
57
+ result_set = Linkage::ResultSet['csv'].new(@tmpdir)
58
+ dataset = Linkage::Dataset.new(@tmpuri, "foo", :single_threaded => true)
59
+ conf = dataset.link_with(dataset, result_set) do |conf|
60
+ conf.compare([:ssn, :dob], [:ssn, :dob], :equal)
119
61
  end
120
62
 
121
- test "one mandatory field equality on single threaded runner, with inequality filters" do
122
- # insert the test data
123
- database do |db|
124
- db.create_table(:foo) { primary_key(:id); String(:ssn); Integer(:mod_5) }
125
- db[:foo].import([:id, :ssn, :mod_5],
126
- Array.new(100) { |i| [i, "12345678#{i%10}", i % 5] })
127
- end
128
-
129
- ds = Linkage::Dataset.new(@tmpuri, "foo", :single_threaded => true)
130
- tmpuri = @tmpuri
131
- conf = ds.link_with(ds) do
132
- lhs[:ssn].must == rhs[:ssn]
133
- lhs[:mod_5].must > 2
134
- lhs[:mod_5].must <= 3
135
- save_results_in(tmpuri)
136
- end
137
- runner = Linkage::SingleThreadedRunner.new(conf)
138
- runner.execute
139
-
140
- database do |db|
141
- assert_equal 2, db[:groups].count
142
- db[:groups].order(:ssn).each_with_index do |row, i|
143
- assert_equal "12345678#{(i * 5) + 3}", row[:ssn]
144
- end
145
-
146
- #assert_equal 20, db[:groups_records].count
147
- #expected_group_id = nil
148
- #db[:groups_records].order(:record_id).each do |row|
149
- #expected_group_id = (row[:record_id] / 5) % 2 + 1
150
- #assert_equal expected_group_id, row[:group_id], "Record #{row[:record_id]} should have been in group #{expected_group_id}"
151
- #end
152
- end
63
+ runner = Linkage::Runner.new(conf)
64
+ runner.execute
65
+
66
+ score_csv = CSV.read(File.join(@tmpdir, 'scores.csv'), :headers => true)
67
+ assert_equal 200, score_csv.length
68
+ score_csv.each do |row|
69
+ id_1 = row['id_1'].to_i
70
+ id_2 = row['id_2'].to_i
71
+ assert id_1 % 10 == id_2 % 10
72
+ assert id_1 % 20 == id_2 % 20
153
73
  end
154
74
 
155
- test "one mandatory field equality on single threaded runner, with field filter" do
156
- # insert the test data
157
- database do |db|
158
- db.create_table(:foo) { primary_key(:id); String(:ssn); Integer(:mod_5); Integer(:mod_20) }
159
- db[:foo].import([:id, :ssn, :mod_5, :mod_20],
160
- Array.new(100) { |i| [i, "123456789#{i%10}", i % 5, i % 20] })
161
- end
162
-
163
- ds = Linkage::Dataset.new(@tmpuri, "foo", :single_threaded => true)
164
- tmpuri = @tmpuri
165
- conf = ds.link_with(ds) do
166
- lhs[:ssn].must == rhs[:ssn]
167
- lhs[:mod_5].must == lhs[:mod_20]
168
- save_results_in(tmpuri)
169
- end
170
- assert_equal :self, conf.linkage_type
171
- runner = Linkage::SingleThreadedRunner.new(conf)
172
- runner.execute
173
-
174
- database do |db|
175
- assert_equal 5, db[:groups].count
176
- db[:groups].order(:ssn).each_with_index do |row, i|
177
- assert_equal "123456789#{i}", row[:ssn]
178
- end
179
-
180
- #assert_equal 25, db[:groups_records].count
181
- #expected_group_id = nil
182
- #db[:groups_records].order(:record_id).each do |row|
183
- #expected_group_id = row[:record_id] % 5 + 1
184
- #assert_equal expected_group_id, row[:group_id], "Record #{row[:record_id]} should have been in group #{expected_group_id}"
185
- #end
186
- end
75
+ match_csv = CSV.read(File.join(@tmpdir, 'matches.csv'), :headers => true)
76
+ assert_equal 200, match_csv.length
77
+ match_csv.each do |row|
78
+ id_1 = row['id_1'].to_i
79
+ id_2 = row['id_2'].to_i
80
+ assert id_1 % 10 == id_2 % 10
81
+ assert id_1 % 20 == id_2 % 20
82
+ end
83
+ end
84
+
85
+ test "one-field equality with blocking" do
86
+ # insert the test data
87
+ database do |db|
88
+ db.create_table(:foo) { primary_key(:id); String(:ssn); Integer(:mod_5) }
89
+ db[:foo].import([:id, :ssn, :mod_5],
90
+ Array.new(100) { |i| [i, "12345678#{i%10}", i % 5] })
91
+ end
92
+
93
+ result_set = Linkage::ResultSet['csv'].new(@tmpdir)
94
+ dataset = Linkage::Dataset.new(@tmpuri, "foo", :single_threaded => true)
95
+ dataset = dataset.filter(:mod_5 => 3)
96
+ conf = dataset.link_with(dataset, result_set) do |conf|
97
+ conf.compare([:ssn], [:ssn], :equal)
98
+ end
99
+
100
+ runner = Linkage::Runner.new(conf)
101
+ runner.execute
102
+
103
+ score_csv = CSV.read(File.join(@tmpdir, 'scores.csv'), :headers => true)
104
+ assert_equal 90, score_csv.length
105
+ score_csv.each do |row|
106
+ id_1 = row['id_1'].to_i
107
+ id_2 = row['id_2'].to_i
108
+ assert id_1 % 10 == id_2 % 10
109
+ assert id_1 % 5 == id_2 % 5
187
110
  end
188
111
 
189
- test "one mandatory field equality on single threaded runner, with identical filters" do
190
- # insert the test data
191
- database do |db|
192
- db.create_table(:foo) { primary_key(:id); String(:ssn); Integer(:mod_5); Integer(:mod_20) }
193
- db[:foo].import([:id, :ssn, :mod_5, :mod_20],
194
- Array.new(100) { |i| [i, "123456789#{i%10}", i % 5, i % 20] })
195
- end
196
-
197
- ds = Linkage::Dataset.new(@tmpuri, "foo", :single_threaded => true)
198
- tmpuri = @tmpuri
199
- conf = ds.link_with(ds) do
200
- lhs[:ssn].must == rhs[:ssn]
201
- lhs[:mod_5].must == lhs[:mod_20]
202
- rhs[:mod_5].must == rhs[:mod_20]
203
- save_results_in(tmpuri)
204
- end
205
- assert_equal :self, conf.linkage_type
206
- runner = Linkage::SingleThreadedRunner.new(conf)
207
- runner.execute
208
-
209
- database do |db|
210
- assert_equal 5, db[:groups].count
211
- db[:groups].order(:ssn).each_with_index do |row, i|
212
- assert_equal "123456789#{i}", row[:ssn]
213
- end
214
-
215
- #assert_equal 25, db[:groups_records].count
216
- #expected_group_id = nil
217
- #db[:groups_records].order(:record_id).each do |row|
218
- #expected_group_id = row[:record_id] % 5 + 1
219
- #assert_equal expected_group_id, row[:group_id], "Record #{row[:record_id]} should have been in group #{expected_group_id}"
220
- #end
221
- end
112
+ match_csv = CSV.read(File.join(@tmpdir, 'matches.csv'), :headers => true)
113
+ assert_equal 90, match_csv.length
114
+ match_csv.each do |row|
115
+ id_1 = row['id_1'].to_i
116
+ id_2 = row['id_2'].to_i
117
+ assert id_1 % 10 == id_2 % 10
118
+ assert id_1 % 5 == id_2 % 5
222
119
  end
223
120
  end
224
121
  end
@@ -2,97 +2,44 @@ require 'helper'
2
2
 
3
3
  module IntegrationTests
4
4
  class TestWithinComparator < Test::Unit::TestCase
5
- test "within comparator with no simple expectations" do
6
- database_for('sqlite') do |db|
7
- db.create_table(:foo) { primary_key(:id); Integer(:num) }
8
- db.create_table(:bar) { primary_key(:id); Integer(:num) }
9
- db[:foo].import([:id, :num], (1..10).collect { |i| [i, i] })
10
- db[:bar].import([:id, :num], (1..10).collect { |i| [i, i] })
11
- end
12
-
13
- db_opts = database_options_for('sqlite')
14
- dataset_1 = Linkage::Dataset.new(db_opts, "foo")
15
- dataset_2 = Linkage::Dataset.new(db_opts, "bar")
16
- conf = dataset_1.link_with(dataset_2) do
17
- lhs[:num].must be_within(5).of(rhs[:num])
18
- save_results_in(db_opts)
19
- end
20
-
21
- runner = Linkage::SingleThreadedRunner.new(conf)
22
- runner.execute
23
-
24
- database_for('sqlite') do |db|
25
- assert_equal db[:scores].count, 100
26
- db[:scores].order(:record_1_id, :record_2_id).each do |score|
27
- if (score[:record_2_id] - score[:record_1_id]).abs <= 5
28
- assert_equal 1, score[:score], score.inspect
29
- else
30
- assert_equal 0, score[:score], score.inspect
31
- end
32
- end
33
- end
5
+ def setup
6
+ @tmpdir = Dir.mktmpdir('linkage')
34
7
  end
35
8
 
36
- test "within comparator with simple expectations" do
37
- database_for('sqlite') do |db|
38
- db.create_table(:foo) { primary_key(:id); Integer(:num); String(:parity) }
39
- db.create_table(:bar) { primary_key(:id); Integer(:num); String(:parity) }
40
- db[:foo].import([:id, :num, :parity], (1..10).collect { |i| [i, i, i % 2 == 0 ? "even" : "odd"] })
41
- db[:bar].import([:id, :num, :parity], (1..10).collect { |i| [i, i, i % 2 == 0 ? "even" : "odd"] })
42
- end
43
-
44
- db_opts = database_options_for('sqlite')
45
- dataset_1 = Linkage::Dataset.new(db_opts, "foo")
46
- dataset_2 = Linkage::Dataset.new(db_opts, "bar")
47
- conf = dataset_1.link_with(dataset_2) do
48
- lhs[:parity].must == rhs[:parity]
49
- lhs[:num].must be_within(5).of(rhs[:num])
50
- save_results_in(db_opts)
51
- end
52
-
53
- runner = Linkage::SingleThreadedRunner.new(conf)
54
- runner.execute
55
-
56
- database_for('sqlite') do |db|
57
- assert_equal db[:scores].count, 50
58
- db[:scores].order(:record_1_id, :record_2_id).each do |score|
59
- if (score[:record_2_id] - score[:record_1_id]).abs <= 5
60
- assert_equal 1, score[:score]
61
- else
62
- assert_equal 0, score[:score]
63
- end
64
- end
65
- end
9
+ def teardown
10
+ FileUtils.remove_entry_secure(@tmpdir)
66
11
  end
67
12
 
68
- test "within comparator with simple expectations and functions" do
13
+ test "within comparator" do
69
14
  database_for('sqlite') do |db|
70
- db.create_table(:foo) { primary_key(:id); Integer(:num); String(:parity) }
71
- db.create_table(:bar) { primary_key(:id); Integer(:num); String(:parity) }
72
- db[:foo].import([:id, :num, :parity], (1..10).collect { |i| [i, i, i % 2 == 0 ? "even" : "odd"] })
73
- db[:bar].import([:id, :num, :parity], (1..10).collect { |i| [i, i, i % 2 == 0 ? "even" : "odd"] })
15
+ db.create_table(:foo) { primary_key(:id); Integer(:num) }
16
+ db.create_table(:bar) { primary_key(:id); Integer(:num) }
17
+ db[:foo].import([:id, :num], (1..10).collect { |i| [i, i] })
18
+ db[:bar].import([:id, :num], (1..10).collect { |i| [i, i] })
74
19
  end
75
20
 
21
+ result_set = Linkage::ResultSet['csv'].new(@tmpdir)
76
22
  db_opts = database_options_for('sqlite')
77
23
  dataset_1 = Linkage::Dataset.new(db_opts, "foo")
78
24
  dataset_2 = Linkage::Dataset.new(db_opts, "bar")
79
- conf = dataset_1.link_with(dataset_2) do
80
- lhs[:parity].must == rhs[:parity]
81
- cast(lhs[:num], 'integer').must be_within(5).of(cast(rhs[:num], 'integer'))
82
- save_results_in(db_opts)
25
+ conf = dataset_1.link_with(dataset_2, result_set) do |conf|
26
+ conf.within(:num, :num, 5)
83
27
  end
84
28
 
85
- runner = Linkage::SingleThreadedRunner.new(conf)
29
+ runner = Linkage::Runner.new(conf)
86
30
  runner.execute
87
31
 
88
- database_for('sqlite') do |db|
89
- assert_equal db[:scores].count, 50
90
- db[:scores].order(:record_1_id, :record_2_id).each do |score|
91
- if (score[:record_2_id] - score[:record_1_id]).abs <= 5
92
- assert_equal 1, score[:score]
93
- else
94
- assert_equal 0, score[:score]
95
- end
32
+ score_csv = CSV.read(File.join(@tmpdir, 'scores.csv'), :headers => true)
33
+ assert_equal 100, score_csv.length
34
+ score_csv.each do |row|
35
+ assert_equal "1", row['comparator_id']
36
+ # ids same as values
37
+ id_1 = row['id_1'].to_i
38
+ id_2 = row['id_2'].to_i
39
+ if (id_2 - id_1).abs <= 5
40
+ assert_equal 1, row['score'].to_i, row
41
+ else
42
+ assert_equal 0, row['score'].to_i
96
43
  end
97
44
  end
98
45
  end