linkage 0.0.5 → 0.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile.lock +5 -5
- data/Rakefile +2 -0
- data/VERSION +1 -1
- data/lib/linkage/configuration.rb +14 -4
- data/lib/linkage/dataset.rb +68 -13
- data/lib/linkage/group.rb +23 -31
- data/lib/linkage/result_set.rb +11 -11
- data/lib/linkage/runner/single_threaded.rb +21 -57
- data/linkage.gemspec +3 -3
- data/test/helper.rb +10 -0
- data/test/integration/test_cross_linkage.rb +21 -7
- data/test/integration/test_dataset.rb +43 -0
- data/test/integration/test_dual_linkage.rb +16 -20
- data/test/integration/test_functions.rb +6 -5
- data/test/integration/test_self_linkage.rb +34 -37
- data/test/unit/test_configuration.rb +3 -7
- data/test/unit/test_dataset.rb +51 -0
- data/test/unit/test_group.rb +4 -15
- metadata +4 -4
data/Gemfile.lock
CHANGED
@@ -2,7 +2,7 @@ GEM
|
|
2
2
|
remote: http://rubygems.org/
|
3
3
|
specs:
|
4
4
|
blockenspiel (0.4.3)
|
5
|
-
coderay (1.0.
|
5
|
+
coderay (1.0.6)
|
6
6
|
ffi (1.0.11)
|
7
7
|
git (1.2.5)
|
8
8
|
guard (1.0.1)
|
@@ -25,7 +25,7 @@ GEM
|
|
25
25
|
mocha (0.10.5)
|
26
26
|
metaclass (~> 0.0.1)
|
27
27
|
mysql2 (0.3.11)
|
28
|
-
pry (0.9.
|
28
|
+
pry (0.9.9)
|
29
29
|
coderay (~> 1.0.5)
|
30
30
|
method_source (~> 0.7.1)
|
31
31
|
slop (>= 2.4.4, < 3)
|
@@ -33,12 +33,12 @@ GEM
|
|
33
33
|
rdiscount (1.6.8)
|
34
34
|
rdoc (3.12)
|
35
35
|
json (~> 1.4)
|
36
|
-
sequel (3.
|
36
|
+
sequel (3.34.1)
|
37
37
|
slop (2.4.4)
|
38
|
-
sqlite3 (1.3.
|
38
|
+
sqlite3 (1.3.6)
|
39
39
|
test-unit (2.4.8)
|
40
40
|
thor (0.14.6)
|
41
|
-
versionomy (0.4.
|
41
|
+
versionomy (0.4.3)
|
42
42
|
blockenspiel (>= 0.4.3)
|
43
43
|
yard (0.7.5)
|
44
44
|
|
data/Rakefile
CHANGED
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.0.5
|
1
|
+
0.0.6
|
@@ -117,12 +117,12 @@ module Linkage
|
|
117
117
|
end
|
118
118
|
|
119
119
|
expr = target.to_expr(side)
|
120
|
-
|
120
|
+
aliaz = nil
|
121
121
|
if expr != merged_field.name
|
122
|
-
|
122
|
+
aliaz = merged_field.name
|
123
123
|
end
|
124
124
|
|
125
|
-
dataset.
|
125
|
+
dataset.match(expr, aliaz)
|
126
126
|
end
|
127
127
|
|
128
128
|
def same_filter?(other)
|
@@ -318,9 +318,9 @@ module Linkage
|
|
318
318
|
def initialize(dataset_1, dataset_2)
|
319
319
|
@dataset_1 = dataset_1
|
320
320
|
@dataset_2 = dataset_2
|
321
|
+
@linkage_type = dataset_1 == dataset_2 ? :self : :dual
|
321
322
|
@expectations = []
|
322
323
|
@visual_comparisons = []
|
323
|
-
@linkage_type = dataset_1 == dataset_2 ? :self : :dual
|
324
324
|
end
|
325
325
|
|
326
326
|
def configure(&block)
|
@@ -348,5 +348,15 @@ module Linkage
|
|
348
348
|
def result_set
|
349
349
|
@result_set ||= ResultSet.new(self)
|
350
350
|
end
|
351
|
+
|
352
|
+
def datasets_with_applied_expectations
|
353
|
+
dataset_1 = @dataset_1
|
354
|
+
dataset_2 = @dataset_2
|
355
|
+
@expectations.each do |exp|
|
356
|
+
dataset_1 = exp.apply_to(dataset_1, :lhs)
|
357
|
+
dataset_2 = exp.apply_to(dataset_2, :rhs) if @linkage_type != :self
|
358
|
+
end
|
359
|
+
@linkage_type == :self ? [dataset_1, dataset_1] : [dataset_1, dataset_2]
|
360
|
+
end
|
351
361
|
end
|
352
362
|
end
|
data/lib/linkage/dataset.rb
CHANGED
@@ -1,17 +1,23 @@
|
|
1
1
|
module Linkage
|
2
|
-
|
2
|
+
# Delegator around Sequel::Dataset with some extra functionality.
|
3
|
+
class Dataset
|
3
4
|
attr_reader :field_set, :table_name
|
4
5
|
|
5
6
|
def initialize(uri, table, options = {})
|
6
7
|
@table_name = table.to_sym
|
7
8
|
db = Sequel.connect(uri, options)
|
8
|
-
|
9
|
-
super(ds)
|
9
|
+
@dataset = db[@table_name]
|
10
10
|
@field_set = FieldSet.new(db.schema(@table_name))
|
11
|
+
@_match = []
|
11
12
|
end
|
12
13
|
|
13
|
-
def
|
14
|
-
|
14
|
+
def obj
|
15
|
+
@dataset
|
16
|
+
end
|
17
|
+
|
18
|
+
def obj=(value)
|
19
|
+
@dataset = value
|
20
|
+
end
|
15
21
|
|
16
22
|
# Setup a linkage with another dataset
|
17
23
|
#
|
@@ -26,21 +32,70 @@ module Linkage
|
|
26
32
|
@dataset.db.adapter_scheme
|
27
33
|
end
|
28
34
|
|
29
|
-
def
|
30
|
-
|
35
|
+
def match(expr, aliaz = nil)
|
36
|
+
clone(:match => {:expr => expr, :alias => aliaz})
|
37
|
+
end
|
38
|
+
|
39
|
+
def clone(new_opts={})
|
40
|
+
new_opts = new_opts.dup
|
41
|
+
new_obj = new_opts.delete(:new_obj)
|
42
|
+
|
43
|
+
match = new_opts.delete(:match)
|
44
|
+
result = super()
|
45
|
+
result.send(:_match, match)
|
46
|
+
|
31
47
|
if new_obj
|
32
|
-
|
48
|
+
result.obj = new_obj
|
33
49
|
else
|
34
|
-
|
50
|
+
result.obj = obj.clone(new_opts)
|
51
|
+
end
|
52
|
+
result
|
53
|
+
end
|
54
|
+
|
55
|
+
def each_group(min = 2)
|
56
|
+
@dataset.group_and_count(*aliased_match_expressions).having{count >= min}.each do |row|
|
57
|
+
count = row.delete(:count)
|
58
|
+
yield Group.new(row, {:count => count})
|
59
|
+
end
|
60
|
+
end
|
61
|
+
|
62
|
+
def group_by_matches(aliased = false)
|
63
|
+
expr = aliased ? aliased_match_expressions : match_expressions
|
64
|
+
group(*expr)
|
65
|
+
end
|
66
|
+
|
67
|
+
def dataset_for_group(group)
|
68
|
+
filters = []
|
69
|
+
group.values.each_pair do |key, value|
|
70
|
+
# find a matched expression with this alias
|
71
|
+
m = @_match.detect { |h| h[:alias] ? h[:alias] == key : h[:expr] == key }
|
72
|
+
raise "this dataset isn't compatible with the given group" if !m
|
73
|
+
filters << {m[:expr] => value}
|
74
|
+
end
|
75
|
+
filter(*filters)
|
76
|
+
end
|
77
|
+
|
78
|
+
private
|
79
|
+
|
80
|
+
def _match(opts)
|
81
|
+
if opts
|
82
|
+
@_match += [opts]
|
35
83
|
end
|
36
84
|
end
|
37
85
|
|
86
|
+
def match_expressions
|
87
|
+
@_match.collect { |m| m[:expr] }
|
88
|
+
end
|
89
|
+
|
90
|
+
def aliased_match_expressions
|
91
|
+
@_match.collect { |m| m[:alias] ? m[:expr].as(m[:alias]) : m[:expr] }
|
92
|
+
end
|
93
|
+
|
38
94
|
def method_missing(name, *args, &block)
|
39
|
-
result =
|
95
|
+
result = @dataset.send(name, *args, &block)
|
40
96
|
if result.kind_of?(Sequel::Dataset)
|
41
|
-
|
42
|
-
result = clone
|
43
|
-
@new_obj = nil
|
97
|
+
new_obj = result
|
98
|
+
result = clone(:new_obj => result)
|
44
99
|
end
|
45
100
|
result
|
46
101
|
end
|
data/lib/linkage/group.rb
CHANGED
@@ -1,43 +1,35 @@
|
|
1
1
|
module Linkage
|
2
|
-
# This class represents a group of records that match based on criteria
|
3
|
-
# described via the {Dataset#link_with} method. Group's are created by
|
4
|
-
# subclasses of the {Runner} class during execution.
|
5
|
-
#
|
6
|
-
# @see Dataset#link_with
|
7
|
-
# @see SingleThreadedRunner
|
8
2
|
class Group
|
9
|
-
# @return [Array<Object>] An array of this group's record ids
|
10
|
-
attr_reader :records
|
11
|
-
|
12
3
|
# @return [Hash] Hash of matching values
|
13
4
|
attr_reader :values
|
14
5
|
|
15
|
-
# @
|
16
|
-
|
17
|
-
# Linkage::Group.new({:foo => 123, :bar => 'baz'})
|
18
|
-
def initialize(matching_values)
|
19
|
-
@values = matching_values
|
20
|
-
@records = []
|
21
|
-
end
|
6
|
+
# @return [Integer] Number of records in this group
|
7
|
+
attr_reader :count
|
22
8
|
|
23
|
-
#
|
24
|
-
|
25
|
-
# @param [Hash] values Hash of values
|
26
|
-
# @return [Boolean] true if match, false if not
|
27
|
-
def matches?(values)
|
28
|
-
@values == values
|
29
|
-
end
|
9
|
+
# @return [Integer] This group's ID (if it exists)
|
10
|
+
attr_reader :id
|
30
11
|
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
12
|
+
def self.from_row(row)
|
13
|
+
values = {}
|
14
|
+
options = {}
|
15
|
+
row.each_pair do |key, value|
|
16
|
+
if key == :id || key == :count
|
17
|
+
options[key] = value
|
18
|
+
else
|
19
|
+
values[key] = value
|
20
|
+
end
|
21
|
+
end
|
22
|
+
new(values, options)
|
36
23
|
end
|
37
24
|
|
38
|
-
# @
|
39
|
-
|
40
|
-
|
25
|
+
# @param [Hash] values Values that define this group
|
26
|
+
# @param [Hash] options
|
27
|
+
# @example
|
28
|
+
# Linkage::Group.new({:foo => 123, :bar => 'baz'}, {:count => 5, :id => 456})
|
29
|
+
def initialize(values, options)
|
30
|
+
@count = options[:count]
|
31
|
+
@id = options[:id]
|
32
|
+
@values = values
|
41
33
|
end
|
42
34
|
end
|
43
35
|
end
|
data/lib/linkage/result_set.rb
CHANGED
@@ -7,11 +7,7 @@ module Linkage
|
|
7
7
|
end
|
8
8
|
|
9
9
|
def groups_dataset
|
10
|
-
Dataset.new(@config.results_uri, :groups, @config.results_uri_options)
|
11
|
-
end
|
12
|
-
|
13
|
-
def groups_records_dataset
|
14
|
-
Dataset.new(@config.results_uri, :groups_records, @config.results_uri_options)
|
10
|
+
@groups_dataset ||= Dataset.new(@config.results_uri, :groups, @config.results_uri_options)
|
15
11
|
end
|
16
12
|
|
17
13
|
def database(&block)
|
@@ -40,18 +36,22 @@ module Linkage
|
|
40
36
|
groups_headers = [:id] + group.values.keys
|
41
37
|
@groups_buffer = ImportBuffer.new(@config.results_uri, :groups, groups_headers, @config.results_uri_options)
|
42
38
|
end
|
43
|
-
@groups_records_buffer ||= ImportBuffer.new(@config.results_uri, :groups_records, [:group_id, :dataset, :record_id], @config.results_uri_options)
|
44
|
-
|
45
39
|
group_id = next_group_id
|
46
40
|
@groups_buffer.add([group_id] + group.values.values)
|
47
|
-
group.records.each do |record_id|
|
48
|
-
@groups_records_buffer.add([group_id, dataset_id, record_id])
|
49
|
-
end
|
50
41
|
end
|
51
42
|
|
52
43
|
def flush!
|
53
44
|
@groups_buffer.flush if @groups_buffer
|
54
|
-
|
45
|
+
end
|
46
|
+
|
47
|
+
def get_group(index)
|
48
|
+
values = groups_dataset.order(:id).limit(1, index).first
|
49
|
+
Group.from_row(values)
|
50
|
+
end
|
51
|
+
|
52
|
+
def groups_records_datasets(group)
|
53
|
+
datasets = @config.datasets_with_applied_expectations
|
54
|
+
datasets.collect! { |ds| ds.dataset_for_group(group) }
|
55
55
|
end
|
56
56
|
|
57
57
|
private
|
@@ -6,7 +6,6 @@ module Linkage
|
|
6
6
|
# @return [Linkage::ResultSet]
|
7
7
|
def execute
|
8
8
|
setup_datasets
|
9
|
-
apply_expectations
|
10
9
|
group_records
|
11
10
|
|
12
11
|
return result_set
|
@@ -15,18 +14,13 @@ module Linkage
|
|
15
14
|
private
|
16
15
|
|
17
16
|
def setup_datasets
|
18
|
-
|
19
|
-
@dataset_1 = config.dataset_1.select(pk.to_expr)
|
20
|
-
if @config.linkage_type != :self
|
21
|
-
pk = config.dataset_2.field_set.primary_key
|
22
|
-
@dataset_2 = config.dataset_2.select(pk.to_expr)
|
23
|
-
end
|
24
|
-
end
|
17
|
+
@dataset_1, @dataset_2 = config.datasets_with_applied_expectations
|
25
18
|
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
19
|
+
pk = @dataset_1.field_set.primary_key
|
20
|
+
@dataset_1 = @dataset_1.select(pk.to_expr)
|
21
|
+
if @config.linkage_type != :self
|
22
|
+
pk = @dataset_2.field_set.primary_key
|
23
|
+
@dataset_2 = @dataset_2.select(pk.to_expr)
|
30
24
|
end
|
31
25
|
end
|
32
26
|
|
@@ -47,23 +41,10 @@ module Linkage
|
|
47
41
|
# @param [Boolean] ignore_empty_groups
|
48
42
|
# @yield [Linkage::Group] If a block is given, yield completed groups to
|
49
43
|
# the block. Otherwise, call ResultSet#add_group on the group.
|
50
|
-
def group_records_for(dataset, dataset_id
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
dataset.each do |row|
|
55
|
-
pk = row.delete(primary_key)
|
56
|
-
if current_group.nil? || !current_group.matches?(row)
|
57
|
-
if current_group && (!ignore_empty_groups || current_group.count > 1)
|
58
|
-
block.call(current_group)
|
59
|
-
end
|
60
|
-
new_group = Group.new(row)
|
61
|
-
current_group = new_group
|
62
|
-
end
|
63
|
-
current_group.add_record(pk)
|
64
|
-
end
|
65
|
-
if current_group && (!ignore_empty_groups || current_group.count > 1)
|
66
|
-
block.call(current_group)
|
44
|
+
def group_records_for(dataset, dataset_id, ignore_empty_groups = true)
|
45
|
+
group_minimum = ignore_empty_groups ? 2 : 1
|
46
|
+
dataset.each_group(group_minimum) do |group|
|
47
|
+
result_set.add_group(group, dataset_id)
|
67
48
|
end
|
68
49
|
result_set.flush!
|
69
50
|
end
|
@@ -72,37 +53,20 @@ module Linkage
|
|
72
53
|
# Create a new dataset for the groups table
|
73
54
|
groups_dataset = result_set.groups_dataset
|
74
55
|
|
75
|
-
|
56
|
+
groups_dataset.field_set.values.each do |field|
|
76
57
|
# Sort on all fields
|
77
|
-
field.primary_key?
|
78
|
-
|
79
|
-
groups_dataset = groups_dataset.select(*exprs, groups_dataset.field_set.primary_key.to_expr).order(*exprs) # ensure matching groups are sorted by id
|
80
|
-
|
81
|
-
result_set.database do |db|
|
82
|
-
groups_to_delete = []
|
83
|
-
db.transaction do # for speed reasons
|
84
|
-
group_records_for(groups_dataset, nil, false) do |group|
|
85
|
-
if group.count == 1
|
86
|
-
# Delete the empty group
|
87
|
-
groups_to_delete << group.records[0]
|
88
|
-
else
|
89
|
-
# Change group_id in the groups_records table to the first group
|
90
|
-
# id, delete other groups.
|
91
|
-
new_group_id = group.records[0]
|
92
|
-
group.records[1..-1].each do |old_group_id|
|
93
|
-
# NOTE: There can only be a group with max size of 2, but
|
94
|
-
# this adds in future support for matching more than
|
95
|
-
# 2 datasets at once.
|
96
|
-
db[:groups_records].filter(:group_id => old_group_id).
|
97
|
-
update(:group_id => new_group_id)
|
98
|
-
groups_to_delete << old_group_id
|
99
|
-
end
|
100
|
-
end
|
101
|
-
end
|
58
|
+
if !field.primary_key?
|
59
|
+
groups_dataset = groups_dataset.match(field.to_expr)
|
102
60
|
end
|
103
|
-
db[:groups_records].filter(:group_id => groups_to_delete).delete
|
104
|
-
db[:groups].filter(:id => groups_to_delete).delete
|
105
61
|
end
|
62
|
+
|
63
|
+
# Delete non-matching groups
|
64
|
+
sub_dataset = groups_dataset.select(:id).group_by_matches.having(:count.sql_function(:id) => 1)
|
65
|
+
groups_dataset.filter(:id => sub_dataset.obj).delete
|
66
|
+
|
67
|
+
# Delete duplicate groups
|
68
|
+
sub_dataset = groups_dataset.select(:max.sql_function(:id).as(:id)).group_by_matches
|
69
|
+
groups_dataset.filter(:id => sub_dataset.obj).delete
|
106
70
|
end
|
107
71
|
end
|
108
72
|
end
|
data/linkage.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = "linkage"
|
8
|
-
s.version = "0.0.5"
|
8
|
+
s.version = "0.0.6"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Jeremy Stephens"]
|
12
|
-
s.date = "2012-
|
12
|
+
s.date = "2012-05-08"
|
13
13
|
s.description = "Performs record linkage between one or two datasets, using Sequel on the backend"
|
14
14
|
s.email = "jeremy.f.stephens@vanderbilt.edu"
|
15
15
|
s.extra_rdoc_files = [
|
@@ -69,7 +69,7 @@ Gem::Specification.new do |s|
|
|
69
69
|
s.homepage = "http://github.com/coupler/linkage"
|
70
70
|
s.licenses = ["MIT"]
|
71
71
|
s.require_paths = ["lib"]
|
72
|
-
s.rubygems_version = "1.8.
|
72
|
+
s.rubygems_version = "1.8.23"
|
73
73
|
s.summary = "Record linkage library"
|
74
74
|
|
75
75
|
if s.respond_to? :specification_version then
|
data/test/helper.rb
CHANGED
@@ -62,6 +62,16 @@ class Test::Unit::TestCase
|
|
62
62
|
def test_config
|
63
63
|
@test_config ||= YAML.load_file(File.join(File.dirname(__FILE__), "config.yml"))
|
64
64
|
end
|
65
|
+
|
66
|
+
def prefixed_logger(prefix)
|
67
|
+
logger = Logger.new(STDERR)
|
68
|
+
original_formatter = Logger::Formatter.new
|
69
|
+
logger.formatter = proc { |severity, datetime, progname, msg|
|
70
|
+
result = original_formatter.call(severity, datetime, progname, msg)
|
71
|
+
"[#{prefix}] #{result}"
|
72
|
+
}
|
73
|
+
logger
|
74
|
+
end
|
65
75
|
end
|
66
76
|
|
67
77
|
module UnitTests; end
|
@@ -7,8 +7,8 @@ module IntegrationTests
|
|
7
7
|
@tmpuri = "sqlite://" + File.join(@tmpdir, "foo")
|
8
8
|
end
|
9
9
|
|
10
|
-
def database(&block)
|
11
|
-
Sequel.connect(@tmpuri, &block)
|
10
|
+
def database(options = {}, &block)
|
11
|
+
Sequel.connect(@tmpuri, options, &block)
|
12
12
|
end
|
13
13
|
|
14
14
|
def teardown
|
@@ -16,6 +16,10 @@ module IntegrationTests
|
|
16
16
|
end
|
17
17
|
|
18
18
|
test "one mandatory field equality on single threaded runner" do
|
19
|
+
#setup_logger = Logger.new(STDERR)
|
20
|
+
#setup_logger.formatter = lambda { |severity, time, progname, msg|
|
21
|
+
#" SETUP : %s [%s]: %s\n" % [severity, time, msg]
|
22
|
+
#}
|
19
23
|
# insert the test data
|
20
24
|
database do |db|
|
21
25
|
db.create_table(:foo) { primary_key(:id); Integer(:foo); Integer(:bar) }
|
@@ -23,12 +27,22 @@ module IntegrationTests
|
|
23
27
|
Array.new(100) { |i| [i, i % 10, i % 5] })
|
24
28
|
end
|
25
29
|
|
30
|
+
#ds_logger = Logger.new(STDERR)
|
31
|
+
#ds_logger.formatter = lambda { |severity, time, progname, msg|
|
32
|
+
#"DATASET: %s [%s]: %s\n" % [severity, time, msg]
|
33
|
+
#}
|
26
34
|
ds = Linkage::Dataset.new(@tmpuri, "foo", :single_threaded => true)
|
35
|
+
|
36
|
+
#rs_logger = Logger.new(STDERR)
|
37
|
+
#rs_logger.formatter = lambda { |severity, time, progname, msg|
|
38
|
+
#"RESULTS: %s [%s]: %s\n" % [severity, time, msg]
|
39
|
+
#}
|
27
40
|
tmpuri = @tmpuri
|
28
41
|
conf = ds.link_with(ds) do
|
29
42
|
lhs[:foo].must == rhs[:bar]
|
30
43
|
save_results_in(tmpuri, :single_threaded => true)
|
31
44
|
end
|
45
|
+
assert_equal :cross, conf.linkage_type
|
32
46
|
runner = Linkage::SingleThreadedRunner.new(conf)
|
33
47
|
runner.execute
|
34
48
|
|
@@ -38,11 +52,11 @@ module IntegrationTests
|
|
38
52
|
assert_equal i, row[:foo_bar]
|
39
53
|
end
|
40
54
|
|
41
|
-
assert_equal 150, db[:groups_records].count
|
42
|
-
db[:groups_records].order(:group_id, :dataset, :record_id).each_with_index do |row, i|
|
43
|
-
expected_group_id = (row[:record_id] % 5) + 1
|
44
|
-
assert_equal expected_group_id, row[:group_id], "Record #{row[:record_id]} should have been in group #{expected_group_id}"
|
45
|
-
end
|
55
|
+
#assert_equal 150, db[:groups_records].count
|
56
|
+
#db[:groups_records].order(:group_id, :dataset, :record_id).each_with_index do |row, i|
|
57
|
+
#expected_group_id = (row[:record_id] % 5) + 1
|
58
|
+
#assert_equal expected_group_id, row[:group_id], "Record #{row[:record_id]} should have been in group #{expected_group_id}"
|
59
|
+
#end
|
46
60
|
end
|
47
61
|
end
|
48
62
|
|
@@ -27,4 +27,47 @@ class IntegrationTests::TestDataset < Test::Unit::TestCase
|
|
27
27
|
assert_equal ds_2.field_set, ds_1.field_set
|
28
28
|
assert_match /`foo` = 'bar'/, ds_2.sql
|
29
29
|
end
|
30
|
+
|
31
|
+
test "each_group" do
|
32
|
+
database do |db|
|
33
|
+
db.create_table(:foo) do
|
34
|
+
primary_key :id
|
35
|
+
String :bar
|
36
|
+
end
|
37
|
+
db[:foo].import([:id, :bar], [[1, 'foo'], [2, 'foo'], [3, 'bar'], [4, 'baz']])
|
38
|
+
end
|
39
|
+
|
40
|
+
ds = Linkage::Dataset.new(@tmpuri, "foo")
|
41
|
+
ds = ds.match(:bar)
|
42
|
+
ds.each_group do |group|
|
43
|
+
assert_equal({:bar => "foo"}, group.values)
|
44
|
+
assert_equal(2, group.count)
|
45
|
+
end
|
46
|
+
|
47
|
+
groups = []
|
48
|
+
ds.each_group(1) do |group|
|
49
|
+
groups << group
|
50
|
+
end
|
51
|
+
assert_equal 3, groups.length
|
52
|
+
end
|
53
|
+
|
54
|
+
test "each_group with filters" do
|
55
|
+
database do |db|
|
56
|
+
db.create_table(:foo) do
|
57
|
+
primary_key :id
|
58
|
+
String :bar
|
59
|
+
Integer :baz
|
60
|
+
end
|
61
|
+
db[:foo].import([:id, :bar, :baz], [[1, 'foo', 1], [2, 'foo', 2], [3, 'bar', 3], [4, 'baz', 4]])
|
62
|
+
end
|
63
|
+
|
64
|
+
ds = Linkage::Dataset.new(@tmpuri, "foo")
|
65
|
+
ds = ds.match(:bar)
|
66
|
+
ds = ds.filter { baz >= 3 }
|
67
|
+
groups = []
|
68
|
+
ds.each_group(1) do |group|
|
69
|
+
groups << group
|
70
|
+
end
|
71
|
+
assert_equal 2, groups.length
|
72
|
+
end
|
30
73
|
end
|
@@ -7,8 +7,8 @@ module IntegrationTests
|
|
7
7
|
@tmpuri = "sqlite://" + File.join(@tmpdir, "foo")
|
8
8
|
end
|
9
9
|
|
10
|
-
def database(&block)
|
11
|
-
Sequel.connect(@tmpuri, &block)
|
10
|
+
def database(options = {}, &block)
|
11
|
+
Sequel.connect(@tmpuri, options, &block)
|
12
12
|
end
|
13
13
|
|
14
14
|
def teardown
|
@@ -43,16 +43,16 @@ module IntegrationTests
|
|
43
43
|
assert_equal "12345678#{i%10}", row[:ssn]
|
44
44
|
end
|
45
45
|
|
46
|
-
assert_equal 200, db[:groups_records].count
|
47
|
-
db[:groups_records].order(:group_id, :dataset, :record_id).each_with_index do |row, i|
|
48
|
-
if i % 20 >= 10
|
49
|
-
assert_equal 2, row[:dataset], row.inspect
|
50
|
-
else
|
51
|
-
assert_equal 1, row[:dataset], row.inspect
|
52
|
-
end
|
53
|
-
expected_group_id = i / 20 + 1
|
54
|
-
assert_equal expected_group_id, row[:group_id], "Record #{row.inspect} should have been in group #{expected_group_id}"
|
55
|
-
end
|
46
|
+
#assert_equal 200, db[:groups_records].count
|
47
|
+
#db[:groups_records].order(:group_id, :dataset, :record_id).each_with_index do |row, i|
|
48
|
+
#if i % 20 >= 10
|
49
|
+
#assert_equal 2, row[:dataset], row.inspect
|
50
|
+
#else
|
51
|
+
#assert_equal 1, row[:dataset], row.inspect
|
52
|
+
#end
|
53
|
+
#expected_group_id = i / 20 + 1
|
54
|
+
#assert_equal expected_group_id, row[:group_id], "Record #{row.inspect} should have been in group #{expected_group_id}"
|
55
|
+
#end
|
56
56
|
end
|
57
57
|
end
|
58
58
|
|
@@ -86,8 +86,7 @@ module IntegrationTests
|
|
86
86
|
end
|
87
87
|
end
|
88
88
|
|
89
|
-
test "
|
90
|
-
pend
|
89
|
+
test "reacts properly when using two databases with different string equality methods" do
|
91
90
|
if !test_config['mysql']
|
92
91
|
omission("No MySQL test configuration found")
|
93
92
|
end
|
@@ -98,24 +97,21 @@ module IntegrationTests
|
|
98
97
|
|
99
98
|
db.create_table!(:bar) { primary_key(:id); String(:one); String(:two) }
|
100
99
|
db[:bar].import([:id, :one, :two], [[1, "", "junk"]])
|
101
|
-
|
102
|
-
db.run("DROP TABLE IF EXISTS groups")
|
103
|
-
db.run("DROP TABLE IF EXISTS groups_records")
|
104
100
|
end
|
105
101
|
|
106
102
|
ds_1 = Linkage::Dataset.new(uri, "foo", :single_threaded => true)
|
107
103
|
ds_2 = Linkage::Dataset.new(uri, "bar", :single_threaded => true)
|
108
|
-
|
104
|
+
tmpuri = @tmpuri
|
109
105
|
conf = ds_1.link_with(ds_2) do
|
110
106
|
lhs[:one].must == rhs[:one]
|
111
107
|
lhs[:two].must == rhs[:two]
|
112
|
-
save_results_in(
|
108
|
+
save_results_in(tmpuri)
|
113
109
|
end
|
114
110
|
|
115
111
|
runner = Linkage::SingleThreadedRunner.new(conf)
|
116
112
|
runner.execute
|
117
113
|
|
118
|
-
|
114
|
+
database do |db|
|
119
115
|
assert_equal 1, db[:groups].count
|
120
116
|
end
|
121
117
|
end
|
@@ -39,20 +39,21 @@ module IntegrationTests
|
|
39
39
|
end
|
40
40
|
|
41
41
|
test "strftime in sqlite" do
|
42
|
-
logger = Logger.new(STDERR)
|
43
|
-
database(:logger => logger) do |db|
|
42
|
+
#logger = Logger.new(STDERR)
|
43
|
+
#database(:logger => logger) do |db|
|
44
|
+
database do |db|
|
44
45
|
db.create_table(:foo) { primary_key(:id); Date(:foo_date) }
|
45
46
|
db.create_table(:bar) { primary_key(:id); String(:bar_string) }
|
46
47
|
db[:foo].insert({:id => 1, :foo_date => Date.today})
|
47
48
|
db[:bar].insert({:id => 1, :bar_string => Date.today.strftime("%Y-%m-%d")})
|
48
49
|
end
|
49
50
|
|
50
|
-
ds_1 = Linkage::Dataset.new(@tmpuri, "foo", :single_threaded => true
|
51
|
-
ds_2 = Linkage::Dataset.new(@tmpuri, "bar", :single_threaded => true
|
51
|
+
ds_1 = Linkage::Dataset.new(@tmpuri, "foo", :single_threaded => true)
|
52
|
+
ds_2 = Linkage::Dataset.new(@tmpuri, "bar", :single_threaded => true)
|
52
53
|
tmpuri = @tmpuri
|
53
54
|
conf = ds_1.link_with(ds_2) do
|
54
55
|
strftime(lhs[:foo_date], "%Y-%m-%d").must == rhs[:bar_string]
|
55
|
-
save_results_in(tmpuri
|
56
|
+
save_results_in(tmpuri)
|
56
57
|
end
|
57
58
|
runner = Linkage::SingleThreadedRunner.new(conf)
|
58
59
|
runner.execute
|
@@ -37,13 +37,10 @@ module IntegrationTests
|
|
37
37
|
assert_equal 10, db[:groups].count
|
38
38
|
db[:groups].order(:ssn).each_with_index do |row, i|
|
39
39
|
assert_equal "12345678#{i%10}", row[:ssn]
|
40
|
-
end
|
41
40
|
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
expected_group_id = (row[:record_id] % 10) + 1
|
46
|
-
assert_equal expected_group_id, row[:group_id], "Record #{row[:record_id]} should have been in group #{expected_group_id}"
|
41
|
+
group = Linkage::Group.from_row(row)
|
42
|
+
dataset, _ = result_set.groups_records_datasets(group)
|
43
|
+
assert_equal 10, dataset.count
|
47
44
|
end
|
48
45
|
end
|
49
46
|
end
|
@@ -73,13 +70,13 @@ module IntegrationTests
|
|
73
70
|
assert_equal Date.civil(1985, 1, i / 2 + 1 + (i % 2 == 0 ? 0 : 10)), row[:dob]
|
74
71
|
end
|
75
72
|
|
76
|
-
assert_equal 100, db[:groups_records].count
|
77
|
-
expected_group_id = nil
|
78
|
-
db[:groups_records].order(:record_id).each do |row|
|
79
|
-
v = row[:record_id] % 20
|
80
|
-
expected_group_id = v < 10 ? 1 + 2 * v : 2 * (v % 10 + 1)
|
81
|
-
assert_equal expected_group_id, row[:group_id], "Record #{row[:record_id]} should have been in group #{expected_group_id}"
|
82
|
-
end
|
73
|
+
#assert_equal 100, db[:groups_records].count
|
74
|
+
#expected_group_id = nil
|
75
|
+
#db[:groups_records].order(:record_id).each do |row|
|
76
|
+
#v = row[:record_id] % 20
|
77
|
+
#expected_group_id = v < 10 ? 1 + 2 * v : 2 * (v % 10 + 1)
|
78
|
+
#assert_equal expected_group_id, row[:group_id], "Record #{row[:record_id]} should have been in group #{expected_group_id}"
|
79
|
+
#end
|
83
80
|
end
|
84
81
|
end
|
85
82
|
|
@@ -107,12 +104,12 @@ module IntegrationTests
|
|
107
104
|
assert_equal "12345678#{(i * 5) + 3}", row[:ssn]
|
108
105
|
end
|
109
106
|
|
110
|
-
assert_equal 20, db[:groups_records].count
|
111
|
-
expected_group_id = nil
|
112
|
-
db[:groups_records].order(:record_id).each do |row|
|
113
|
-
expected_group_id = (row[:record_id] / 5) % 2 + 1
|
114
|
-
assert_equal expected_group_id, row[:group_id], "Record #{row[:record_id]} should have been in group #{expected_group_id}"
|
115
|
-
end
|
107
|
+
#assert_equal 20, db[:groups_records].count
|
108
|
+
#expected_group_id = nil
|
109
|
+
#db[:groups_records].order(:record_id).each do |row|
|
110
|
+
#expected_group_id = (row[:record_id] / 5) % 2 + 1
|
111
|
+
#assert_equal expected_group_id, row[:group_id], "Record #{row[:record_id]} should have been in group #{expected_group_id}"
|
112
|
+
#end
|
116
113
|
end
|
117
114
|
end
|
118
115
|
|
@@ -141,12 +138,12 @@ module IntegrationTests
|
|
141
138
|
assert_equal "12345678#{(i * 5) + 3}", row[:ssn]
|
142
139
|
end
|
143
140
|
|
144
|
-
assert_equal 20, db[:groups_records].count
|
145
|
-
expected_group_id = nil
|
146
|
-
db[:groups_records].order(:record_id).each do |row|
|
147
|
-
expected_group_id = (row[:record_id] / 5) % 2 + 1
|
148
|
-
assert_equal expected_group_id, row[:group_id], "Record #{row[:record_id]} should have been in group #{expected_group_id}"
|
149
|
-
end
|
141
|
+
#assert_equal 20, db[:groups_records].count
|
142
|
+
#expected_group_id = nil
|
143
|
+
#db[:groups_records].order(:record_id).each do |row|
|
144
|
+
#expected_group_id = (row[:record_id] / 5) % 2 + 1
|
145
|
+
#assert_equal expected_group_id, row[:group_id], "Record #{row[:record_id]} should have been in group #{expected_group_id}"
|
146
|
+
#end
|
150
147
|
end
|
151
148
|
end
|
152
149
|
|
@@ -175,12 +172,12 @@ module IntegrationTests
|
|
175
172
|
assert_equal "123456789#{i}", row[:ssn]
|
176
173
|
end
|
177
174
|
|
178
|
-
assert_equal 25, db[:groups_records].count
|
179
|
-
expected_group_id = nil
|
180
|
-
db[:groups_records].order(:record_id).each do |row|
|
181
|
-
expected_group_id = row[:record_id] % 5 + 1
|
182
|
-
assert_equal expected_group_id, row[:group_id], "Record #{row[:record_id]} should have been in group #{expected_group_id}"
|
183
|
-
end
|
175
|
+
#assert_equal 25, db[:groups_records].count
|
176
|
+
#expected_group_id = nil
|
177
|
+
#db[:groups_records].order(:record_id).each do |row|
|
178
|
+
#expected_group_id = row[:record_id] % 5 + 1
|
179
|
+
#assert_equal expected_group_id, row[:group_id], "Record #{row[:record_id]} should have been in group #{expected_group_id}"
|
180
|
+
#end
|
184
181
|
end
|
185
182
|
end
|
186
183
|
|
@@ -210,12 +207,12 @@ module IntegrationTests
|
|
210
207
|
assert_equal "123456789#{i}", row[:ssn]
|
211
208
|
end
|
212
209
|
|
213
|
-
assert_equal 25, db[:groups_records].count
|
214
|
-
expected_group_id = nil
|
215
|
-
db[:groups_records].order(:record_id).each do |row|
|
216
|
-
expected_group_id = row[:record_id] % 5 + 1
|
217
|
-
assert_equal expected_group_id, row[:group_id], "Record #{row[:record_id]} should have been in group #{expected_group_id}"
|
218
|
-
end
|
210
|
+
#assert_equal 25, db[:groups_records].count
|
211
|
+
#expected_group_id = nil
|
212
|
+
#db[:groups_records].order(:record_id).each do |row|
|
213
|
+
#expected_group_id = row[:record_id] % 5 + 1
|
214
|
+
#assert_equal expected_group_id, row[:group_id], "Record #{row[:record_id]} should have been in group #{expected_group_id}"
|
215
|
+
#end
|
219
216
|
end
|
220
217
|
end
|
221
218
|
end
|
@@ -135,9 +135,7 @@ class UnitTests::TestConfiguration < Test::Unit::TestCase
|
|
135
135
|
field_2 = stub('field 2', :to_expr => :foo)
|
136
136
|
dataset_2.stubs(:field_set).returns({:foo => field_2})
|
137
137
|
|
138
|
-
func_expr = stub('function expression')
|
139
|
-
expects(:as).with(:trim_foo_foo).returns(self)
|
140
|
-
end
|
138
|
+
func_expr = stub('function expression')
|
141
139
|
func = stub('function', :static? => false, :to_expr => func_expr)
|
142
140
|
Linkage::Functions::Trim.expects(:new).with(field_1).returns(func)
|
143
141
|
merged_field = stub('merged field', :name => :trim_foo_foo)
|
@@ -147,12 +145,10 @@ class UnitTests::TestConfiguration < Test::Unit::TestCase
|
|
147
145
|
c.configure do
|
148
146
|
trim(lhs[:foo]).must == rhs[:foo]
|
149
147
|
end
|
150
|
-
dataset_1.expects(:
|
151
|
-
dataset_1.expects(:order_more).with(func_expr).returns(dataset_1)
|
148
|
+
dataset_1.expects(:match).with(func_expr, :trim_foo_foo).returns(dataset_1)
|
152
149
|
c.expectations[0].apply_to(dataset_1, :lhs)
|
153
150
|
|
154
|
-
dataset_2.expects(:
|
155
|
-
dataset_2.expects(:order_more).with(:foo).returns(dataset_2)
|
151
|
+
dataset_2.expects(:match).with(:foo, :trim_foo_foo).returns(dataset_2)
|
156
152
|
c.expectations[0].apply_to(dataset_2, :rhs)
|
157
153
|
end
|
158
154
|
|
data/test/unit/test_dataset.rb
CHANGED
@@ -34,4 +34,55 @@ class UnitTests::TestDataset < Test::Unit::TestCase
|
|
34
34
|
@database.expects(:adapter_scheme).returns(:foo)
|
35
35
|
assert_equal :foo, ds.adapter_scheme
|
36
36
|
end
|
37
|
+
|
38
|
+
test "add match expression" do
|
39
|
+
ds_1 = Linkage::Dataset.new('foo:/bar', "foo", {:foo => 'bar'})
|
40
|
+
@dataset.expects(:clone).returns(@dataset)
|
41
|
+
ds_2 = ds_1.match(:foo)
|
42
|
+
assert_not_same ds_1, ds_2
|
43
|
+
assert_not_equal ds_1.instance_variable_get(:@_match),
|
44
|
+
ds_2.instance_variable_get(:@_match)
|
45
|
+
end
|
46
|
+
|
47
|
+
test "add match expression with alias, then each_group" do
|
48
|
+
ds_1 = Linkage::Dataset.new('foo:/bar', "foo", {:foo => 'bar'})
|
49
|
+
@dataset.expects(:clone).returns(@dataset)
|
50
|
+
ds_2 = ds_1.match(:foo, :aliased_foo)
|
51
|
+
@dataset.expects(:group_and_count).with(:foo.as(:aliased_foo)).returns(@dataset)
|
52
|
+
@dataset.expects(:having).returns(@dataset)
|
53
|
+
@dataset.expects(:each).yields({:aliased_foo => 123, :count => 1})
|
54
|
+
ds_2.each_group { |g| }
|
55
|
+
end
|
56
|
+
|
57
|
+
test "group_by_matches" do
|
58
|
+
ds = Linkage::Dataset.new('foo:/bar', "foo", {:foo => 'bar'})
|
59
|
+
|
60
|
+
@dataset.expects(:clone).returns(@dataset)
|
61
|
+
ds = ds.match(:foo)
|
62
|
+
@dataset.expects(:group).with(:foo).returns(@dataset)
|
63
|
+
|
64
|
+
ds.group_by_matches
|
65
|
+
end
|
66
|
+
|
67
|
+
test "dataset_for_group" do
|
68
|
+
ds = Linkage::Dataset.new('foo:/bar', "foo", {:foo => 'bar'})
|
69
|
+
@dataset.expects(:clone).returns(@dataset)
|
70
|
+
ds = ds.match(:foo, :foo_bar)
|
71
|
+
|
72
|
+
group = stub("group", :values => {:foo_bar => 'baz'})
|
73
|
+
filtered_dataset = stub('filtered dataset')
|
74
|
+
@dataset.expects(:filter).with(:foo => 'baz').returns(filtered_dataset)
|
75
|
+
assert_equal filtered_dataset, ds.dataset_for_group(group)
|
76
|
+
end
|
77
|
+
|
78
|
+
test "dataset_for_group without aliases" do
|
79
|
+
ds = Linkage::Dataset.new('foo:/bar', "foo", {:foo => 'bar'})
|
80
|
+
@dataset.expects(:clone).returns(@dataset)
|
81
|
+
ds = ds.match(:foo)
|
82
|
+
|
83
|
+
group = stub("group", :values => {:foo => 'baz'})
|
84
|
+
filtered_dataset = stub('filtered dataset')
|
85
|
+
@dataset.expects(:filter).with(:foo => 'baz').returns(filtered_dataset)
|
86
|
+
assert_equal filtered_dataset, ds.dataset_for_group(group)
|
87
|
+
end
|
37
88
|
end
|
data/test/unit/test_group.rb
CHANGED
@@ -1,21 +1,10 @@
|
|
1
1
|
require 'helper'
|
2
2
|
|
3
3
|
class UnitTests::TestGroup < Test::Unit::TestCase
|
4
|
-
test "
|
5
|
-
g = Linkage::Group.new(:test => 'test')
|
6
|
-
|
7
|
-
assert !g.matches?({:foo => 'bar'})
|
8
|
-
end
|
9
|
-
|
10
|
-
test "add_record adds a record" do
|
11
|
-
g = Linkage::Group.new(:test => 'test')
|
12
|
-
g.add_record(123)
|
13
|
-
assert_equal [123], g.records
|
14
|
-
end
|
15
|
-
|
16
|
-
test "count returns number of records" do
|
17
|
-
g = Linkage::Group.new(:test => 'test')
|
18
|
-
g.add_record(123)
|
4
|
+
test "initialize" do
|
5
|
+
g = Linkage::Group.new({:test => 'test'}, {:count => 1, :id => 456})
|
6
|
+
assert_equal({:test => 'test'}, g.values)
|
19
7
|
assert_equal 1, g.count
|
8
|
+
assert_equal 456, g.id
|
20
9
|
end
|
21
10
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: linkage
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.5
|
4
|
+
version: 0.0.6
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-
|
12
|
+
date: 2012-05-08 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: sequel
|
@@ -307,7 +307,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
307
307
|
version: '0'
|
308
308
|
segments:
|
309
309
|
- 0
|
310
|
-
hash: -
|
310
|
+
hash: -1901911346636016746
|
311
311
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
312
312
|
none: false
|
313
313
|
requirements:
|
@@ -316,7 +316,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
316
316
|
version: '0'
|
317
317
|
requirements: []
|
318
318
|
rubyforge_project:
|
319
|
-
rubygems_version: 1.8.
|
319
|
+
rubygems_version: 1.8.23
|
320
320
|
signing_key:
|
321
321
|
specification_version: 3
|
322
322
|
summary: Record linkage library
|