linkage 0.0.5 → 0.0.6
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile.lock +5 -5
- data/Rakefile +2 -0
- data/VERSION +1 -1
- data/lib/linkage/configuration.rb +14 -4
- data/lib/linkage/dataset.rb +68 -13
- data/lib/linkage/group.rb +23 -31
- data/lib/linkage/result_set.rb +11 -11
- data/lib/linkage/runner/single_threaded.rb +21 -57
- data/linkage.gemspec +3 -3
- data/test/helper.rb +10 -0
- data/test/integration/test_cross_linkage.rb +21 -7
- data/test/integration/test_dataset.rb +43 -0
- data/test/integration/test_dual_linkage.rb +16 -20
- data/test/integration/test_functions.rb +6 -5
- data/test/integration/test_self_linkage.rb +34 -37
- data/test/unit/test_configuration.rb +3 -7
- data/test/unit/test_dataset.rb +51 -0
- data/test/unit/test_group.rb +4 -15
- metadata +4 -4
data/Gemfile.lock
CHANGED
@@ -2,7 +2,7 @@ GEM
|
|
2
2
|
remote: http://rubygems.org/
|
3
3
|
specs:
|
4
4
|
blockenspiel (0.4.3)
|
5
|
-
coderay (1.0.
|
5
|
+
coderay (1.0.6)
|
6
6
|
ffi (1.0.11)
|
7
7
|
git (1.2.5)
|
8
8
|
guard (1.0.1)
|
@@ -25,7 +25,7 @@ GEM
|
|
25
25
|
mocha (0.10.5)
|
26
26
|
metaclass (~> 0.0.1)
|
27
27
|
mysql2 (0.3.11)
|
28
|
-
pry (0.9.
|
28
|
+
pry (0.9.9)
|
29
29
|
coderay (~> 1.0.5)
|
30
30
|
method_source (~> 0.7.1)
|
31
31
|
slop (>= 2.4.4, < 3)
|
@@ -33,12 +33,12 @@ GEM
|
|
33
33
|
rdiscount (1.6.8)
|
34
34
|
rdoc (3.12)
|
35
35
|
json (~> 1.4)
|
36
|
-
sequel (3.
|
36
|
+
sequel (3.34.1)
|
37
37
|
slop (2.4.4)
|
38
|
-
sqlite3 (1.3.
|
38
|
+
sqlite3 (1.3.6)
|
39
39
|
test-unit (2.4.8)
|
40
40
|
thor (0.14.6)
|
41
|
-
versionomy (0.4.
|
41
|
+
versionomy (0.4.3)
|
42
42
|
blockenspiel (>= 0.4.3)
|
43
43
|
yard (0.7.5)
|
44
44
|
|
data/Rakefile
CHANGED
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.0.5
|
1
|
+
0.0.6
|
data/lib/linkage/configuration.rb
CHANGED
@@ -117,12 +117,12 @@ module Linkage
|
|
117
117
|
end
|
118
118
|
|
119
119
|
expr = target.to_expr(side)
|
120
|
-
|
120
|
+
aliaz = nil
|
121
121
|
if expr != merged_field.name
|
122
|
-
|
122
|
+
aliaz = merged_field.name
|
123
123
|
end
|
124
124
|
|
125
|
-
dataset.
|
125
|
+
dataset.match(expr, aliaz)
|
126
126
|
end
|
127
127
|
|
128
128
|
def same_filter?(other)
|
@@ -318,9 +318,9 @@ module Linkage
|
|
318
318
|
def initialize(dataset_1, dataset_2)
|
319
319
|
@dataset_1 = dataset_1
|
320
320
|
@dataset_2 = dataset_2
|
321
|
+
@linkage_type = dataset_1 == dataset_2 ? :self : :dual
|
321
322
|
@expectations = []
|
322
323
|
@visual_comparisons = []
|
323
|
-
@linkage_type = dataset_1 == dataset_2 ? :self : :dual
|
324
324
|
end
|
325
325
|
|
326
326
|
def configure(&block)
|
@@ -348,5 +348,15 @@ module Linkage
|
|
348
348
|
def result_set
|
349
349
|
@result_set ||= ResultSet.new(self)
|
350
350
|
end
|
351
|
+
|
352
|
+
def datasets_with_applied_expectations
|
353
|
+
dataset_1 = @dataset_1
|
354
|
+
dataset_2 = @dataset_2
|
355
|
+
@expectations.each do |exp|
|
356
|
+
dataset_1 = exp.apply_to(dataset_1, :lhs)
|
357
|
+
dataset_2 = exp.apply_to(dataset_2, :rhs) if @linkage_type != :self
|
358
|
+
end
|
359
|
+
@linkage_type == :self ? [dataset_1, dataset_1] : [dataset_1, dataset_2]
|
360
|
+
end
|
351
361
|
end
|
352
362
|
end
|
data/lib/linkage/dataset.rb
CHANGED
@@ -1,17 +1,23 @@
|
|
1
1
|
module Linkage
|
2
|
-
|
2
|
+
# Delegator around Sequel::Dataset with some extra functionality.
|
3
|
+
class Dataset
|
3
4
|
attr_reader :field_set, :table_name
|
4
5
|
|
5
6
|
def initialize(uri, table, options = {})
|
6
7
|
@table_name = table.to_sym
|
7
8
|
db = Sequel.connect(uri, options)
|
8
|
-
|
9
|
-
super(ds)
|
9
|
+
@dataset = db[@table_name]
|
10
10
|
@field_set = FieldSet.new(db.schema(@table_name))
|
11
|
+
@_match = []
|
11
12
|
end
|
12
13
|
|
13
|
-
def
|
14
|
-
|
14
|
+
def obj
|
15
|
+
@dataset
|
16
|
+
end
|
17
|
+
|
18
|
+
def obj=(value)
|
19
|
+
@dataset = value
|
20
|
+
end
|
15
21
|
|
16
22
|
# Setup a linkage with another dataset
|
17
23
|
#
|
@@ -26,21 +32,70 @@ module Linkage
|
|
26
32
|
@dataset.db.adapter_scheme
|
27
33
|
end
|
28
34
|
|
29
|
-
def
|
30
|
-
|
35
|
+
def match(expr, aliaz = nil)
|
36
|
+
clone(:match => {:expr => expr, :alias => aliaz})
|
37
|
+
end
|
38
|
+
|
39
|
+
def clone(new_opts={})
|
40
|
+
new_opts = new_opts.dup
|
41
|
+
new_obj = new_opts.delete(:new_obj)
|
42
|
+
|
43
|
+
match = new_opts.delete(:match)
|
44
|
+
result = super()
|
45
|
+
result.send(:_match, match)
|
46
|
+
|
31
47
|
if new_obj
|
32
|
-
|
48
|
+
result.obj = new_obj
|
33
49
|
else
|
34
|
-
|
50
|
+
result.obj = obj.clone(new_opts)
|
51
|
+
end
|
52
|
+
result
|
53
|
+
end
|
54
|
+
|
55
|
+
def each_group(min = 2)
|
56
|
+
@dataset.group_and_count(*aliased_match_expressions).having{count >= min}.each do |row|
|
57
|
+
count = row.delete(:count)
|
58
|
+
yield Group.new(row, {:count => count})
|
59
|
+
end
|
60
|
+
end
|
61
|
+
|
62
|
+
def group_by_matches(aliased = false)
|
63
|
+
expr = aliased ? aliased_match_expressions : match_expressions
|
64
|
+
group(*expr)
|
65
|
+
end
|
66
|
+
|
67
|
+
def dataset_for_group(group)
|
68
|
+
filters = []
|
69
|
+
group.values.each_pair do |key, value|
|
70
|
+
# find a matched expression with this alias
|
71
|
+
m = @_match.detect { |h| h[:alias] ? h[:alias] == key : h[:expr] == key }
|
72
|
+
raise "this dataset isn't compatible with the given group" if !m
|
73
|
+
filters << {m[:expr] => value}
|
74
|
+
end
|
75
|
+
filter(*filters)
|
76
|
+
end
|
77
|
+
|
78
|
+
private
|
79
|
+
|
80
|
+
def _match(opts)
|
81
|
+
if opts
|
82
|
+
@_match += [opts]
|
35
83
|
end
|
36
84
|
end
|
37
85
|
|
86
|
+
def match_expressions
|
87
|
+
@_match.collect { |m| m[:expr] }
|
88
|
+
end
|
89
|
+
|
90
|
+
def aliased_match_expressions
|
91
|
+
@_match.collect { |m| m[:alias] ? m[:expr].as(m[:alias]) : m[:expr] }
|
92
|
+
end
|
93
|
+
|
38
94
|
def method_missing(name, *args, &block)
|
39
|
-
result =
|
95
|
+
result = @dataset.send(name, *args, &block)
|
40
96
|
if result.kind_of?(Sequel::Dataset)
|
41
|
-
|
42
|
-
result = clone
|
43
|
-
@new_obj = nil
|
97
|
+
new_obj = result
|
98
|
+
result = clone(:new_obj => result)
|
44
99
|
end
|
45
100
|
result
|
46
101
|
end
|
data/lib/linkage/group.rb
CHANGED
@@ -1,43 +1,35 @@
|
|
1
1
|
module Linkage
|
2
|
-
# This class represents a group of records that match based on criteria
|
3
|
-
# described via the {Dataset#link_with} method. Group's are created by
|
4
|
-
# subclasses of the {Runner} class during execution.
|
5
|
-
#
|
6
|
-
# @see Dataset#link_with
|
7
|
-
# @see SingleThreadedRunner
|
8
2
|
class Group
|
9
|
-
# @return [Array<Object>] An array of this group's record ids
|
10
|
-
attr_reader :records
|
11
|
-
|
12
3
|
# @return [Hash] Hash of matching values
|
13
4
|
attr_reader :values
|
14
5
|
|
15
|
-
# @
|
16
|
-
|
17
|
-
# Linkage::Group.new({:foo => 123, :bar => 'baz'})
|
18
|
-
def initialize(matching_values)
|
19
|
-
@values = matching_values
|
20
|
-
@records = []
|
21
|
-
end
|
6
|
+
# @return [Integer] Number of records in this group
|
7
|
+
attr_reader :count
|
22
8
|
|
23
|
-
#
|
24
|
-
|
25
|
-
# @param [Hash] values Hash of values
|
26
|
-
# @return [Boolean] true if match, false if not
|
27
|
-
def matches?(values)
|
28
|
-
@values == values
|
29
|
-
end
|
9
|
+
# @return [Integer] This group's ID (if it exists)
|
10
|
+
attr_reader :id
|
30
11
|
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
12
|
+
def self.from_row(row)
|
13
|
+
values = {}
|
14
|
+
options = {}
|
15
|
+
row.each_pair do |key, value|
|
16
|
+
if key == :id || key == :count
|
17
|
+
options[key] = value
|
18
|
+
else
|
19
|
+
values[key] = value
|
20
|
+
end
|
21
|
+
end
|
22
|
+
new(values, options)
|
36
23
|
end
|
37
24
|
|
38
|
-
# @
|
39
|
-
|
40
|
-
|
25
|
+
# @param [Hash] values Values that define this group
|
26
|
+
# @param [Hash] options
|
27
|
+
# @example
|
28
|
+
# Linkage::Group.new({:foo => 123, :bar => 'baz'}, {:count => 5, :id => 456})
|
29
|
+
def initialize(values, options)
|
30
|
+
@count = options[:count]
|
31
|
+
@id = options[:id]
|
32
|
+
@values = values
|
41
33
|
end
|
42
34
|
end
|
43
35
|
end
|
data/lib/linkage/result_set.rb
CHANGED
@@ -7,11 +7,7 @@ module Linkage
|
|
7
7
|
end
|
8
8
|
|
9
9
|
def groups_dataset
|
10
|
-
Dataset.new(@config.results_uri, :groups, @config.results_uri_options)
|
11
|
-
end
|
12
|
-
|
13
|
-
def groups_records_dataset
|
14
|
-
Dataset.new(@config.results_uri, :groups_records, @config.results_uri_options)
|
10
|
+
@groups_dataset ||= Dataset.new(@config.results_uri, :groups, @config.results_uri_options)
|
15
11
|
end
|
16
12
|
|
17
13
|
def database(&block)
|
@@ -40,18 +36,22 @@ module Linkage
|
|
40
36
|
groups_headers = [:id] + group.values.keys
|
41
37
|
@groups_buffer = ImportBuffer.new(@config.results_uri, :groups, groups_headers, @config.results_uri_options)
|
42
38
|
end
|
43
|
-
@groups_records_buffer ||= ImportBuffer.new(@config.results_uri, :groups_records, [:group_id, :dataset, :record_id], @config.results_uri_options)
|
44
|
-
|
45
39
|
group_id = next_group_id
|
46
40
|
@groups_buffer.add([group_id] + group.values.values)
|
47
|
-
group.records.each do |record_id|
|
48
|
-
@groups_records_buffer.add([group_id, dataset_id, record_id])
|
49
|
-
end
|
50
41
|
end
|
51
42
|
|
52
43
|
def flush!
|
53
44
|
@groups_buffer.flush if @groups_buffer
|
54
|
-
|
45
|
+
end
|
46
|
+
|
47
|
+
def get_group(index)
|
48
|
+
values = groups_dataset.order(:id).limit(1, index).first
|
49
|
+
Group.from_row(values)
|
50
|
+
end
|
51
|
+
|
52
|
+
def groups_records_datasets(group)
|
53
|
+
datasets = @config.datasets_with_applied_expectations
|
54
|
+
datasets.collect! { |ds| ds.dataset_for_group(group) }
|
55
55
|
end
|
56
56
|
|
57
57
|
private
|
data/lib/linkage/runner/single_threaded.rb
CHANGED
@@ -6,7 +6,6 @@ module Linkage
|
|
6
6
|
# @return [Linkage::ResultSet]
|
7
7
|
def execute
|
8
8
|
setup_datasets
|
9
|
-
apply_expectations
|
10
9
|
group_records
|
11
10
|
|
12
11
|
return result_set
|
@@ -15,18 +14,13 @@ module Linkage
|
|
15
14
|
private
|
16
15
|
|
17
16
|
def setup_datasets
|
18
|
-
|
19
|
-
@dataset_1 = config.dataset_1.select(pk.to_expr)
|
20
|
-
if @config.linkage_type != :self
|
21
|
-
pk = config.dataset_2.field_set.primary_key
|
22
|
-
@dataset_2 = config.dataset_2.select(pk.to_expr)
|
23
|
-
end
|
24
|
-
end
|
17
|
+
@dataset_1, @dataset_2 = config.datasets_with_applied_expectations
|
25
18
|
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
19
|
+
pk = @dataset_1.field_set.primary_key
|
20
|
+
@dataset_1 = @dataset_1.select(pk.to_expr)
|
21
|
+
if @config.linkage_type != :self
|
22
|
+
pk = @dataset_2.field_set.primary_key
|
23
|
+
@dataset_2 = @dataset_2.select(pk.to_expr)
|
30
24
|
end
|
31
25
|
end
|
32
26
|
|
@@ -47,23 +41,10 @@ module Linkage
|
|
47
41
|
# @param [Boolean] ignore_empty_groups
|
48
42
|
# @yield [Linkage::Group] If a block is given, yield completed groups to
|
49
43
|
# the block. Otherwise, call ResultSet#add_group on the group.
|
50
|
-
def group_records_for(dataset, dataset_id
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
dataset.each do |row|
|
55
|
-
pk = row.delete(primary_key)
|
56
|
-
if current_group.nil? || !current_group.matches?(row)
|
57
|
-
if current_group && (!ignore_empty_groups || current_group.count > 1)
|
58
|
-
block.call(current_group)
|
59
|
-
end
|
60
|
-
new_group = Group.new(row)
|
61
|
-
current_group = new_group
|
62
|
-
end
|
63
|
-
current_group.add_record(pk)
|
64
|
-
end
|
65
|
-
if current_group && (!ignore_empty_groups || current_group.count > 1)
|
66
|
-
block.call(current_group)
|
44
|
+
def group_records_for(dataset, dataset_id, ignore_empty_groups = true)
|
45
|
+
group_minimum = ignore_empty_groups ? 2 : 1
|
46
|
+
dataset.each_group(group_minimum) do |group|
|
47
|
+
result_set.add_group(group, dataset_id)
|
67
48
|
end
|
68
49
|
result_set.flush!
|
69
50
|
end
|
@@ -72,37 +53,20 @@ module Linkage
|
|
72
53
|
# Create a new dataset for the groups table
|
73
54
|
groups_dataset = result_set.groups_dataset
|
74
55
|
|
75
|
-
|
56
|
+
groups_dataset.field_set.values.each do |field|
|
76
57
|
# Sort on all fields
|
77
|
-
field.primary_key?
|
78
|
-
|
79
|
-
groups_dataset = groups_dataset.select(*exprs, groups_dataset.field_set.primary_key.to_expr).order(*exprs) # ensure matching groups are sorted by id
|
80
|
-
|
81
|
-
result_set.database do |db|
|
82
|
-
groups_to_delete = []
|
83
|
-
db.transaction do # for speed reasons
|
84
|
-
group_records_for(groups_dataset, nil, false) do |group|
|
85
|
-
if group.count == 1
|
86
|
-
# Delete the empty group
|
87
|
-
groups_to_delete << group.records[0]
|
88
|
-
else
|
89
|
-
# Change group_id in the groups_records table to the first group
|
90
|
-
# id, delete other groups.
|
91
|
-
new_group_id = group.records[0]
|
92
|
-
group.records[1..-1].each do |old_group_id|
|
93
|
-
# NOTE: There can only be a group with max size of 2, but
|
94
|
-
# this adds in future support for matching more than
|
95
|
-
# 2 datasets at once.
|
96
|
-
db[:groups_records].filter(:group_id => old_group_id).
|
97
|
-
update(:group_id => new_group_id)
|
98
|
-
groups_to_delete << old_group_id
|
99
|
-
end
|
100
|
-
end
|
101
|
-
end
|
58
|
+
if !field.primary_key?
|
59
|
+
groups_dataset = groups_dataset.match(field.to_expr)
|
102
60
|
end
|
103
|
-
db[:groups_records].filter(:group_id => groups_to_delete).delete
|
104
|
-
db[:groups].filter(:id => groups_to_delete).delete
|
105
61
|
end
|
62
|
+
|
63
|
+
# Delete non-matching groups
|
64
|
+
sub_dataset = groups_dataset.select(:id).group_by_matches.having(:count.sql_function(:id) => 1)
|
65
|
+
groups_dataset.filter(:id => sub_dataset.obj).delete
|
66
|
+
|
67
|
+
# Delete duplicate groups
|
68
|
+
sub_dataset = groups_dataset.select(:max.sql_function(:id).as(:id)).group_by_matches
|
69
|
+
groups_dataset.filter(:id => sub_dataset.obj).delete
|
106
70
|
end
|
107
71
|
end
|
108
72
|
end
|
data/linkage.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = "linkage"
|
8
|
-
s.version = "0.0.5"
|
8
|
+
s.version = "0.0.6"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Jeremy Stephens"]
|
12
|
-
s.date = "2012-
|
12
|
+
s.date = "2012-05-08"
|
13
13
|
s.description = "Performs record linkage between one or two datasets, using Sequel on the backend"
|
14
14
|
s.email = "jeremy.f.stephens@vanderbilt.edu"
|
15
15
|
s.extra_rdoc_files = [
|
@@ -69,7 +69,7 @@ Gem::Specification.new do |s|
|
|
69
69
|
s.homepage = "http://github.com/coupler/linkage"
|
70
70
|
s.licenses = ["MIT"]
|
71
71
|
s.require_paths = ["lib"]
|
72
|
-
s.rubygems_version = "1.8.
|
72
|
+
s.rubygems_version = "1.8.23"
|
73
73
|
s.summary = "Record linkage library"
|
74
74
|
|
75
75
|
if s.respond_to? :specification_version then
|
data/test/helper.rb
CHANGED
@@ -62,6 +62,16 @@ class Test::Unit::TestCase
|
|
62
62
|
def test_config
|
63
63
|
@test_config ||= YAML.load_file(File.join(File.dirname(__FILE__), "config.yml"))
|
64
64
|
end
|
65
|
+
|
66
|
+
def prefixed_logger(prefix)
|
67
|
+
logger = Logger.new(STDERR)
|
68
|
+
original_formatter = Logger::Formatter.new
|
69
|
+
logger.formatter = proc { |severity, datetime, progname, msg|
|
70
|
+
result = original_formatter.call(severity, datetime, progname, msg)
|
71
|
+
"[#{prefix}] #{result}"
|
72
|
+
}
|
73
|
+
logger
|
74
|
+
end
|
65
75
|
end
|
66
76
|
|
67
77
|
module UnitTests; end
|
data/test/integration/test_cross_linkage.rb
CHANGED
@@ -7,8 +7,8 @@ module IntegrationTests
|
|
7
7
|
@tmpuri = "sqlite://" + File.join(@tmpdir, "foo")
|
8
8
|
end
|
9
9
|
|
10
|
-
def database(&block)
|
11
|
-
Sequel.connect(@tmpuri, &block)
|
10
|
+
def database(options = {}, &block)
|
11
|
+
Sequel.connect(@tmpuri, options, &block)
|
12
12
|
end
|
13
13
|
|
14
14
|
def teardown
|
@@ -16,6 +16,10 @@ module IntegrationTests
|
|
16
16
|
end
|
17
17
|
|
18
18
|
test "one mandatory field equality on single threaded runner" do
|
19
|
+
#setup_logger = Logger.new(STDERR)
|
20
|
+
#setup_logger.formatter = lambda { |severity, time, progname, msg|
|
21
|
+
#" SETUP : %s [%s]: %s\n" % [severity, time, msg]
|
22
|
+
#}
|
19
23
|
# insert the test data
|
20
24
|
database do |db|
|
21
25
|
db.create_table(:foo) { primary_key(:id); Integer(:foo); Integer(:bar) }
|
@@ -23,12 +27,22 @@ module IntegrationTests
|
|
23
27
|
Array.new(100) { |i| [i, i % 10, i % 5] })
|
24
28
|
end
|
25
29
|
|
30
|
+
#ds_logger = Logger.new(STDERR)
|
31
|
+
#ds_logger.formatter = lambda { |severity, time, progname, msg|
|
32
|
+
#"DATASET: %s [%s]: %s\n" % [severity, time, msg]
|
33
|
+
#}
|
26
34
|
ds = Linkage::Dataset.new(@tmpuri, "foo", :single_threaded => true)
|
35
|
+
|
36
|
+
#rs_logger = Logger.new(STDERR)
|
37
|
+
#rs_logger.formatter = lambda { |severity, time, progname, msg|
|
38
|
+
#"RESULTS: %s [%s]: %s\n" % [severity, time, msg]
|
39
|
+
#}
|
27
40
|
tmpuri = @tmpuri
|
28
41
|
conf = ds.link_with(ds) do
|
29
42
|
lhs[:foo].must == rhs[:bar]
|
30
43
|
save_results_in(tmpuri, :single_threaded => true)
|
31
44
|
end
|
45
|
+
assert_equal :cross, conf.linkage_type
|
32
46
|
runner = Linkage::SingleThreadedRunner.new(conf)
|
33
47
|
runner.execute
|
34
48
|
|
@@ -38,11 +52,11 @@ module IntegrationTests
|
|
38
52
|
assert_equal i, row[:foo_bar]
|
39
53
|
end
|
40
54
|
|
41
|
-
assert_equal 150, db[:groups_records].count
|
42
|
-
db[:groups_records].order(:group_id, :dataset, :record_id).each_with_index do |row, i|
|
43
|
-
expected_group_id = (row[:record_id] % 5) + 1
|
44
|
-
assert_equal expected_group_id, row[:group_id], "Record #{row[:record_id]} should have been in group #{expected_group_id}"
|
45
|
-
end
|
55
|
+
#assert_equal 150, db[:groups_records].count
|
56
|
+
#db[:groups_records].order(:group_id, :dataset, :record_id).each_with_index do |row, i|
|
57
|
+
#expected_group_id = (row[:record_id] % 5) + 1
|
58
|
+
#assert_equal expected_group_id, row[:group_id], "Record #{row[:record_id]} should have been in group #{expected_group_id}"
|
59
|
+
#end
|
46
60
|
end
|
47
61
|
end
|
48
62
|
|
data/test/integration/test_dataset.rb
CHANGED
@@ -27,4 +27,47 @@ class IntegrationTests::TestDataset < Test::Unit::TestCase
|
|
27
27
|
assert_equal ds_2.field_set, ds_1.field_set
|
28
28
|
assert_match /`foo` = 'bar'/, ds_2.sql
|
29
29
|
end
|
30
|
+
|
31
|
+
test "each_group" do
|
32
|
+
database do |db|
|
33
|
+
db.create_table(:foo) do
|
34
|
+
primary_key :id
|
35
|
+
String :bar
|
36
|
+
end
|
37
|
+
db[:foo].import([:id, :bar], [[1, 'foo'], [2, 'foo'], [3, 'bar'], [4, 'baz']])
|
38
|
+
end
|
39
|
+
|
40
|
+
ds = Linkage::Dataset.new(@tmpuri, "foo")
|
41
|
+
ds = ds.match(:bar)
|
42
|
+
ds.each_group do |group|
|
43
|
+
assert_equal({:bar => "foo"}, group.values)
|
44
|
+
assert_equal(2, group.count)
|
45
|
+
end
|
46
|
+
|
47
|
+
groups = []
|
48
|
+
ds.each_group(1) do |group|
|
49
|
+
groups << group
|
50
|
+
end
|
51
|
+
assert_equal 3, groups.length
|
52
|
+
end
|
53
|
+
|
54
|
+
test "each_group with filters" do
|
55
|
+
database do |db|
|
56
|
+
db.create_table(:foo) do
|
57
|
+
primary_key :id
|
58
|
+
String :bar
|
59
|
+
Integer :baz
|
60
|
+
end
|
61
|
+
db[:foo].import([:id, :bar, :baz], [[1, 'foo', 1], [2, 'foo', 2], [3, 'bar', 3], [4, 'baz', 4]])
|
62
|
+
end
|
63
|
+
|
64
|
+
ds = Linkage::Dataset.new(@tmpuri, "foo")
|
65
|
+
ds = ds.match(:bar)
|
66
|
+
ds = ds.filter { baz >= 3 }
|
67
|
+
groups = []
|
68
|
+
ds.each_group(1) do |group|
|
69
|
+
groups << group
|
70
|
+
end
|
71
|
+
assert_equal 2, groups.length
|
72
|
+
end
|
30
73
|
end
|
data/test/integration/test_dual_linkage.rb
CHANGED
@@ -7,8 +7,8 @@ module IntegrationTests
|
|
7
7
|
@tmpuri = "sqlite://" + File.join(@tmpdir, "foo")
|
8
8
|
end
|
9
9
|
|
10
|
-
def database(&block)
|
11
|
-
Sequel.connect(@tmpuri, &block)
|
10
|
+
def database(options = {}, &block)
|
11
|
+
Sequel.connect(@tmpuri, options, &block)
|
12
12
|
end
|
13
13
|
|
14
14
|
def teardown
|
@@ -43,16 +43,16 @@ module IntegrationTests
|
|
43
43
|
assert_equal "12345678#{i%10}", row[:ssn]
|
44
44
|
end
|
45
45
|
|
46
|
-
assert_equal 200, db[:groups_records].count
|
47
|
-
db[:groups_records].order(:group_id, :dataset, :record_id).each_with_index do |row, i|
|
48
|
-
if i % 20 >= 10
|
49
|
-
assert_equal 2, row[:dataset], row.inspect
|
50
|
-
else
|
51
|
-
assert_equal 1, row[:dataset], row.inspect
|
52
|
-
end
|
53
|
-
expected_group_id = i / 20 + 1
|
54
|
-
assert_equal expected_group_id, row[:group_id], "Record #{row.inspect} should have been in group #{expected_group_id}"
|
55
|
-
end
|
46
|
+
#assert_equal 200, db[:groups_records].count
|
47
|
+
#db[:groups_records].order(:group_id, :dataset, :record_id).each_with_index do |row, i|
|
48
|
+
#if i % 20 >= 10
|
49
|
+
#assert_equal 2, row[:dataset], row.inspect
|
50
|
+
#else
|
51
|
+
#assert_equal 1, row[:dataset], row.inspect
|
52
|
+
#end
|
53
|
+
#expected_group_id = i / 20 + 1
|
54
|
+
#assert_equal expected_group_id, row[:group_id], "Record #{row.inspect} should have been in group #{expected_group_id}"
|
55
|
+
#end
|
56
56
|
end
|
57
57
|
end
|
58
58
|
|
@@ -86,8 +86,7 @@ module IntegrationTests
|
|
86
86
|
end
|
87
87
|
end
|
88
88
|
|
89
|
-
test "
|
90
|
-
pend
|
89
|
+
test "reacts properly when using two databases with different string equality methods" do
|
91
90
|
if !test_config['mysql']
|
92
91
|
omission("No MySQL test configuration found")
|
93
92
|
end
|
@@ -98,24 +97,21 @@ module IntegrationTests
|
|
98
97
|
|
99
98
|
db.create_table!(:bar) { primary_key(:id); String(:one); String(:two) }
|
100
99
|
db[:bar].import([:id, :one, :two], [[1, "", "junk"]])
|
101
|
-
|
102
|
-
db.run("DROP TABLE IF EXISTS groups")
|
103
|
-
db.run("DROP TABLE IF EXISTS groups_records")
|
104
100
|
end
|
105
101
|
|
106
102
|
ds_1 = Linkage::Dataset.new(uri, "foo", :single_threaded => true)
|
107
103
|
ds_2 = Linkage::Dataset.new(uri, "bar", :single_threaded => true)
|
108
|
-
|
104
|
+
tmpuri = @tmpuri
|
109
105
|
conf = ds_1.link_with(ds_2) do
|
110
106
|
lhs[:one].must == rhs[:one]
|
111
107
|
lhs[:two].must == rhs[:two]
|
112
|
-
save_results_in(
|
108
|
+
save_results_in(tmpuri)
|
113
109
|
end
|
114
110
|
|
115
111
|
runner = Linkage::SingleThreadedRunner.new(conf)
|
116
112
|
runner.execute
|
117
113
|
|
118
|
-
|
114
|
+
database do |db|
|
119
115
|
assert_equal 1, db[:groups].count
|
120
116
|
end
|
121
117
|
end
|
data/test/integration/test_functions.rb
CHANGED
@@ -39,20 +39,21 @@ module IntegrationTests
|
|
39
39
|
end
|
40
40
|
|
41
41
|
test "strftime in sqlite" do
|
42
|
-
logger = Logger.new(STDERR)
|
43
|
-
database(:logger => logger) do |db|
|
42
|
+
#logger = Logger.new(STDERR)
|
43
|
+
#database(:logger => logger) do |db|
|
44
|
+
database do |db|
|
44
45
|
db.create_table(:foo) { primary_key(:id); Date(:foo_date) }
|
45
46
|
db.create_table(:bar) { primary_key(:id); String(:bar_string) }
|
46
47
|
db[:foo].insert({:id => 1, :foo_date => Date.today})
|
47
48
|
db[:bar].insert({:id => 1, :bar_string => Date.today.strftime("%Y-%m-%d")})
|
48
49
|
end
|
49
50
|
|
50
|
-
ds_1 = Linkage::Dataset.new(@tmpuri, "foo", :single_threaded => true
|
51
|
-
ds_2 = Linkage::Dataset.new(@tmpuri, "bar", :single_threaded => true
|
51
|
+
ds_1 = Linkage::Dataset.new(@tmpuri, "foo", :single_threaded => true)
|
52
|
+
ds_2 = Linkage::Dataset.new(@tmpuri, "bar", :single_threaded => true)
|
52
53
|
tmpuri = @tmpuri
|
53
54
|
conf = ds_1.link_with(ds_2) do
|
54
55
|
strftime(lhs[:foo_date], "%Y-%m-%d").must == rhs[:bar_string]
|
55
|
-
save_results_in(tmpuri
|
56
|
+
save_results_in(tmpuri)
|
56
57
|
end
|
57
58
|
runner = Linkage::SingleThreadedRunner.new(conf)
|
58
59
|
runner.execute
|
data/test/integration/test_self_linkage.rb
CHANGED
@@ -37,13 +37,10 @@ module IntegrationTests
|
|
37
37
|
assert_equal 10, db[:groups].count
|
38
38
|
db[:groups].order(:ssn).each_with_index do |row, i|
|
39
39
|
assert_equal "12345678#{i%10}", row[:ssn]
|
40
|
-
end
|
41
40
|
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
expected_group_id = (row[:record_id] % 10) + 1
|
46
|
-
assert_equal expected_group_id, row[:group_id], "Record #{row[:record_id]} should have been in group #{expected_group_id}"
|
41
|
+
group = Linkage::Group.from_row(row)
|
42
|
+
dataset, _ = result_set.groups_records_datasets(group)
|
43
|
+
assert_equal 10, dataset.count
|
47
44
|
end
|
48
45
|
end
|
49
46
|
end
|
@@ -73,13 +70,13 @@ module IntegrationTests
|
|
73
70
|
assert_equal Date.civil(1985, 1, i / 2 + 1 + (i % 2 == 0 ? 0 : 10)), row[:dob]
|
74
71
|
end
|
75
72
|
|
76
|
-
assert_equal 100, db[:groups_records].count
|
77
|
-
expected_group_id = nil
|
78
|
-
db[:groups_records].order(:record_id).each do |row|
|
79
|
-
v = row[:record_id] % 20
|
80
|
-
expected_group_id = v < 10 ? 1 + 2 * v : 2 * (v % 10 + 1)
|
81
|
-
assert_equal expected_group_id, row[:group_id], "Record #{row[:record_id]} should have been in group #{expected_group_id}"
|
82
|
-
end
|
73
|
+
#assert_equal 100, db[:groups_records].count
|
74
|
+
#expected_group_id = nil
|
75
|
+
#db[:groups_records].order(:record_id).each do |row|
|
76
|
+
#v = row[:record_id] % 20
|
77
|
+
#expected_group_id = v < 10 ? 1 + 2 * v : 2 * (v % 10 + 1)
|
78
|
+
#assert_equal expected_group_id, row[:group_id], "Record #{row[:record_id]} should have been in group #{expected_group_id}"
|
79
|
+
#end
|
83
80
|
end
|
84
81
|
end
|
85
82
|
|
@@ -107,12 +104,12 @@ module IntegrationTests
|
|
107
104
|
assert_equal "12345678#{(i * 5) + 3}", row[:ssn]
|
108
105
|
end
|
109
106
|
|
110
|
-
assert_equal 20, db[:groups_records].count
|
111
|
-
expected_group_id = nil
|
112
|
-
db[:groups_records].order(:record_id).each do |row|
|
113
|
-
expected_group_id = (row[:record_id] / 5) % 2 + 1
|
114
|
-
assert_equal expected_group_id, row[:group_id], "Record #{row[:record_id]} should have been in group #{expected_group_id}"
|
115
|
-
end
|
107
|
+
#assert_equal 20, db[:groups_records].count
|
108
|
+
#expected_group_id = nil
|
109
|
+
#db[:groups_records].order(:record_id).each do |row|
|
110
|
+
#expected_group_id = (row[:record_id] / 5) % 2 + 1
|
111
|
+
#assert_equal expected_group_id, row[:group_id], "Record #{row[:record_id]} should have been in group #{expected_group_id}"
|
112
|
+
#end
|
116
113
|
end
|
117
114
|
end
|
118
115
|
|
@@ -141,12 +138,12 @@ module IntegrationTests
|
|
141
138
|
assert_equal "12345678#{(i * 5) + 3}", row[:ssn]
|
142
139
|
end
|
143
140
|
|
144
|
-
assert_equal 20, db[:groups_records].count
|
145
|
-
expected_group_id = nil
|
146
|
-
db[:groups_records].order(:record_id).each do |row|
|
147
|
-
expected_group_id = (row[:record_id] / 5) % 2 + 1
|
148
|
-
assert_equal expected_group_id, row[:group_id], "Record #{row[:record_id]} should have been in group #{expected_group_id}"
|
149
|
-
end
|
141
|
+
#assert_equal 20, db[:groups_records].count
|
142
|
+
#expected_group_id = nil
|
143
|
+
#db[:groups_records].order(:record_id).each do |row|
|
144
|
+
#expected_group_id = (row[:record_id] / 5) % 2 + 1
|
145
|
+
#assert_equal expected_group_id, row[:group_id], "Record #{row[:record_id]} should have been in group #{expected_group_id}"
|
146
|
+
#end
|
150
147
|
end
|
151
148
|
end
|
152
149
|
|
@@ -175,12 +172,12 @@ module IntegrationTests
|
|
175
172
|
assert_equal "123456789#{i}", row[:ssn]
|
176
173
|
end
|
177
174
|
|
178
|
-
assert_equal 25, db[:groups_records].count
|
179
|
-
expected_group_id = nil
|
180
|
-
db[:groups_records].order(:record_id).each do |row|
|
181
|
-
expected_group_id = row[:record_id] % 5 + 1
|
182
|
-
assert_equal expected_group_id, row[:group_id], "Record #{row[:record_id]} should have been in group #{expected_group_id}"
|
183
|
-
end
|
175
|
+
#assert_equal 25, db[:groups_records].count
|
176
|
+
#expected_group_id = nil
|
177
|
+
#db[:groups_records].order(:record_id).each do |row|
|
178
|
+
#expected_group_id = row[:record_id] % 5 + 1
|
179
|
+
#assert_equal expected_group_id, row[:group_id], "Record #{row[:record_id]} should have been in group #{expected_group_id}"
|
180
|
+
#end
|
184
181
|
end
|
185
182
|
end
|
186
183
|
|
@@ -210,12 +207,12 @@ module IntegrationTests
|
|
210
207
|
assert_equal "123456789#{i}", row[:ssn]
|
211
208
|
end
|
212
209
|
|
213
|
-
assert_equal 25, db[:groups_records].count
|
214
|
-
expected_group_id = nil
|
215
|
-
db[:groups_records].order(:record_id).each do |row|
|
216
|
-
expected_group_id = row[:record_id] % 5 + 1
|
217
|
-
assert_equal expected_group_id, row[:group_id], "Record #{row[:record_id]} should have been in group #{expected_group_id}"
|
218
|
-
end
|
210
|
+
#assert_equal 25, db[:groups_records].count
|
211
|
+
#expected_group_id = nil
|
212
|
+
#db[:groups_records].order(:record_id).each do |row|
|
213
|
+
#expected_group_id = row[:record_id] % 5 + 1
|
214
|
+
#assert_equal expected_group_id, row[:group_id], "Record #{row[:record_id]} should have been in group #{expected_group_id}"
|
215
|
+
#end
|
219
216
|
end
|
220
217
|
end
|
221
218
|
end
|
data/test/unit/test_configuration.rb
CHANGED
@@ -135,9 +135,7 @@ class UnitTests::TestConfiguration < Test::Unit::TestCase
|
|
135
135
|
field_2 = stub('field 2', :to_expr => :foo)
|
136
136
|
dataset_2.stubs(:field_set).returns({:foo => field_2})
|
137
137
|
|
138
|
-
func_expr = stub('function expression')
|
139
|
-
expects(:as).with(:trim_foo_foo).returns(self)
|
140
|
-
end
|
138
|
+
func_expr = stub('function expression')
|
141
139
|
func = stub('function', :static? => false, :to_expr => func_expr)
|
142
140
|
Linkage::Functions::Trim.expects(:new).with(field_1).returns(func)
|
143
141
|
merged_field = stub('merged field', :name => :trim_foo_foo)
|
@@ -147,12 +145,10 @@ class UnitTests::TestConfiguration < Test::Unit::TestCase
|
|
147
145
|
c.configure do
|
148
146
|
trim(lhs[:foo]).must == rhs[:foo]
|
149
147
|
end
|
150
|
-
dataset_1.expects(:
|
151
|
-
dataset_1.expects(:order_more).with(func_expr).returns(dataset_1)
|
148
|
+
dataset_1.expects(:match).with(func_expr, :trim_foo_foo).returns(dataset_1)
|
152
149
|
c.expectations[0].apply_to(dataset_1, :lhs)
|
153
150
|
|
154
|
-
dataset_2.expects(:
|
155
|
-
dataset_2.expects(:order_more).with(:foo).returns(dataset_2)
|
151
|
+
dataset_2.expects(:match).with(:foo, :trim_foo_foo).returns(dataset_2)
|
156
152
|
c.expectations[0].apply_to(dataset_2, :rhs)
|
157
153
|
end
|
158
154
|
|
data/test/unit/test_dataset.rb
CHANGED
@@ -34,4 +34,55 @@ class UnitTests::TestDataset < Test::Unit::TestCase
|
|
34
34
|
@database.expects(:adapter_scheme).returns(:foo)
|
35
35
|
assert_equal :foo, ds.adapter_scheme
|
36
36
|
end
|
37
|
+
|
38
|
+
test "add match expression" do
|
39
|
+
ds_1 = Linkage::Dataset.new('foo:/bar', "foo", {:foo => 'bar'})
|
40
|
+
@dataset.expects(:clone).returns(@dataset)
|
41
|
+
ds_2 = ds_1.match(:foo)
|
42
|
+
assert_not_same ds_1, ds_2
|
43
|
+
assert_not_equal ds_1.instance_variable_get(:@_match),
|
44
|
+
ds_2.instance_variable_get(:@_match)
|
45
|
+
end
|
46
|
+
|
47
|
+
test "add match expression with alias, then each_group" do
|
48
|
+
ds_1 = Linkage::Dataset.new('foo:/bar', "foo", {:foo => 'bar'})
|
49
|
+
@dataset.expects(:clone).returns(@dataset)
|
50
|
+
ds_2 = ds_1.match(:foo, :aliased_foo)
|
51
|
+
@dataset.expects(:group_and_count).with(:foo.as(:aliased_foo)).returns(@dataset)
|
52
|
+
@dataset.expects(:having).returns(@dataset)
|
53
|
+
@dataset.expects(:each).yields({:aliased_foo => 123, :count => 1})
|
54
|
+
ds_2.each_group { |g| }
|
55
|
+
end
|
56
|
+
|
57
|
+
test "group_by_matches" do
|
58
|
+
ds = Linkage::Dataset.new('foo:/bar', "foo", {:foo => 'bar'})
|
59
|
+
|
60
|
+
@dataset.expects(:clone).returns(@dataset)
|
61
|
+
ds = ds.match(:foo)
|
62
|
+
@dataset.expects(:group).with(:foo).returns(@dataset)
|
63
|
+
|
64
|
+
ds.group_by_matches
|
65
|
+
end
|
66
|
+
|
67
|
+
test "dataset_for_group" do
|
68
|
+
ds = Linkage::Dataset.new('foo:/bar', "foo", {:foo => 'bar'})
|
69
|
+
@dataset.expects(:clone).returns(@dataset)
|
70
|
+
ds = ds.match(:foo, :foo_bar)
|
71
|
+
|
72
|
+
group = stub("group", :values => {:foo_bar => 'baz'})
|
73
|
+
filtered_dataset = stub('filtered dataset')
|
74
|
+
@dataset.expects(:filter).with(:foo => 'baz').returns(filtered_dataset)
|
75
|
+
assert_equal filtered_dataset, ds.dataset_for_group(group)
|
76
|
+
end
|
77
|
+
|
78
|
+
test "dataset_for_group without aliases" do
|
79
|
+
ds = Linkage::Dataset.new('foo:/bar', "foo", {:foo => 'bar'})
|
80
|
+
@dataset.expects(:clone).returns(@dataset)
|
81
|
+
ds = ds.match(:foo)
|
82
|
+
|
83
|
+
group = stub("group", :values => {:foo => 'baz'})
|
84
|
+
filtered_dataset = stub('filtered dataset')
|
85
|
+
@dataset.expects(:filter).with(:foo => 'baz').returns(filtered_dataset)
|
86
|
+
assert_equal filtered_dataset, ds.dataset_for_group(group)
|
87
|
+
end
|
37
88
|
end
|
data/test/unit/test_group.rb
CHANGED
@@ -1,21 +1,10 @@
|
|
1
1
|
require 'helper'
|
2
2
|
|
3
3
|
class UnitTests::TestGroup < Test::Unit::TestCase
|
4
|
-
test "
|
5
|
-
g = Linkage::Group.new(:test => 'test')
|
6
|
-
|
7
|
-
assert !g.matches?({:foo => 'bar'})
|
8
|
-
end
|
9
|
-
|
10
|
-
test "add_record adds a record" do
|
11
|
-
g = Linkage::Group.new(:test => 'test')
|
12
|
-
g.add_record(123)
|
13
|
-
assert_equal [123], g.records
|
14
|
-
end
|
15
|
-
|
16
|
-
test "count returns number of records" do
|
17
|
-
g = Linkage::Group.new(:test => 'test')
|
18
|
-
g.add_record(123)
|
4
|
+
test "initialize" do
|
5
|
+
g = Linkage::Group.new({:test => 'test'}, {:count => 1, :id => 456})
|
6
|
+
assert_equal({:test => 'test'}, g.values)
|
19
7
|
assert_equal 1, g.count
|
8
|
+
assert_equal 456, g.id
|
20
9
|
end
|
21
10
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: linkage
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.5
|
4
|
+
version: 0.0.6
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-
|
12
|
+
date: 2012-05-08 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: sequel
|
@@ -307,7 +307,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
307
307
|
version: '0'
|
308
308
|
segments:
|
309
309
|
- 0
|
310
|
-
hash: -
|
310
|
+
hash: -1901911346636016746
|
311
311
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
312
312
|
none: false
|
313
313
|
requirements:
|
@@ -316,7 +316,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
316
316
|
version: '0'
|
317
317
|
requirements: []
|
318
318
|
rubyforge_project:
|
319
|
-
rubygems_version: 1.8.
|
319
|
+
rubygems_version: 1.8.23
|
320
320
|
signing_key:
|
321
321
|
specification_version: 3
|
322
322
|
summary: Record linkage library
|