linkage 0.0.0 → 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/VERSION +1 -1
- data/lib/linkage/runner/single_threaded.rb +33 -29
- data/linkage.gemspec +2 -2
- data/test/integration/test_cross_linkage.rb +2 -2
- metadata +125 -181
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.0.0
|
1
|
+
0.0.1
|
data/lib/linkage/runner/single_threaded.rb
CHANGED
@@ -27,21 +27,26 @@ module Linkage
|
|
27
27
|
|
28
28
|
def group_records
|
29
29
|
if config.linkage_type == :self
|
30
|
-
|
30
|
+
group_records_for(@dataset_1, 1)
|
31
31
|
else
|
32
|
-
|
33
|
-
|
32
|
+
group_records_for(@dataset_1, 1, false)
|
33
|
+
group_records_for(@dataset_2, 2, false)
|
34
34
|
combine_groups
|
35
35
|
end
|
36
36
|
end
|
37
37
|
|
38
|
-
|
39
|
-
|
38
|
+
# @param [Linkage::Dataset] dataset
|
39
|
+
# @param [Fixnum, nil] dataset_id
|
40
|
+
# @param [Boolean] ignore_empty_groups
|
41
|
+
# @yield [Linkage::Group] If a block is given, yield completed groups to
|
42
|
+
# the block. Otherwise, call save_group on the group.
|
43
|
+
def group_records_for(dataset, dataset_id = nil, ignore_empty_groups = true, &block)
|
40
44
|
current_group = nil
|
45
|
+
block ||= lambda { |group| save_group(current_group, dataset_id) }
|
41
46
|
dataset.each do |row|
|
42
47
|
if current_group.nil? || !current_group.matches?(row[:values])
|
43
48
|
if current_group && (!ignore_empty_groups || current_group.count > 1)
|
44
|
-
|
49
|
+
block.call(current_group)
|
45
50
|
end
|
46
51
|
new_group = Group.new(row[:values])
|
47
52
|
current_group = new_group
|
@@ -49,33 +54,33 @@ module Linkage
|
|
49
54
|
current_group.add_record(row[:pk])
|
50
55
|
end
|
51
56
|
if current_group && (!ignore_empty_groups || current_group.count > 1)
|
52
|
-
|
57
|
+
block.call(current_group)
|
53
58
|
end
|
54
|
-
|
59
|
+
flush_buffers
|
55
60
|
end
|
56
61
|
|
57
|
-
def
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
groups_records_buffer = ImportBuffer.new(@uri, :groups_records, [:group_id, :dataset, :record_id], @options)
|
62
|
+
def save_group(group, dataset_id = nil)
|
63
|
+
if !@groups_buffer
|
64
|
+
groups_headers = [:id] + group.values.keys
|
65
|
+
@groups_buffer = ImportBuffer.new(@uri, :groups, groups_headers, @options)
|
66
|
+
end
|
67
|
+
@groups_records_buffer ||= ImportBuffer.new(@uri, :groups_records, [:group_id, :dataset, :record_id], @options)
|
64
68
|
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
groups_records_buffer.add([group_id, dataset_id, record_id])
|
70
|
-
end
|
69
|
+
group_id = next_group_id
|
70
|
+
@groups_buffer.add([group_id] + group.values.values)
|
71
|
+
group.records.each do |record_id|
|
72
|
+
@groups_records_buffer.add([group_id, dataset_id, record_id])
|
71
73
|
end
|
72
|
-
|
73
|
-
|
74
|
+
end
|
75
|
+
|
76
|
+
def flush_buffers
|
77
|
+
@groups_buffer.flush if @groups_buffer
|
78
|
+
@groups_records_buffer.flush if @groups_records_buffer
|
74
79
|
end
|
75
80
|
|
76
81
|
def combine_groups
|
77
82
|
# Create a new dataset for the groups table
|
78
|
-
ds = Dataset.new(@uri, :groups,
|
83
|
+
ds = Dataset.new(@uri, :groups, @options)
|
79
84
|
ds.fields.each_value do |field|
|
80
85
|
# Sort on all fields
|
81
86
|
next if field.primary_key?
|
@@ -83,11 +88,10 @@ module Linkage
|
|
83
88
|
ds.add_select(field)
|
84
89
|
end
|
85
90
|
ds.add_order(ds.primary_key) # ensure matching groups are sorted by id
|
86
|
-
combined_groups = group_records_for(ds, false)
|
87
91
|
database do |db|
|
88
92
|
groups_to_delete = []
|
89
93
|
db.transaction do # for speed reasons
|
90
|
-
|
94
|
+
group_records_for(ds, nil, false) do |group|
|
91
95
|
if group.count == 1
|
92
96
|
# Delete the empty group
|
93
97
|
groups_to_delete << group.records[0]
|
@@ -96,9 +100,9 @@ module Linkage
|
|
96
100
|
# id, delete other groups.
|
97
101
|
new_group_id = group.records[0]
|
98
102
|
group.records[1..-1].each do |old_group_id|
|
99
|
-
# There can only be a group with max size of 2, but
|
100
|
-
# adds in future support for matching more than
|
101
|
-
# at once.
|
103
|
+
# NOTE: There can only be a group with max size of 2, but
|
104
|
+
# this adds in future support for matching more than
|
105
|
+
# 2 datasets at once.
|
102
106
|
db[:groups_records].filter(:group_id => old_group_id).
|
103
107
|
update(:group_id => new_group_id)
|
104
108
|
groups_to_delete << old_group_id
|
data/linkage.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = "linkage"
|
8
|
-
s.version = "0.0.0"
|
8
|
+
s.version = "0.0.1"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Jeremy Stephens"]
|
12
|
-
s.date = "2011-
|
12
|
+
s.date = "2011-10-25"
|
13
13
|
s.description = "Wraps Sequel to perform record linkage between one or two datasets"
|
14
14
|
s.email = "jeremy.f.stephens@vanderbilt.edu"
|
15
15
|
s.extra_rdoc_files = [
|
data/test/integration/test_cross_linkage.rb
CHANGED
@@ -27,11 +27,11 @@ module IntegrationTests
|
|
27
27
|
conf = ds.link_with(ds) do
|
28
28
|
lhs[:foo].must == rhs[:bar]
|
29
29
|
end
|
30
|
-
runner = Linkage::SingleThreadedRunner.new(conf, @tmpuri)
|
30
|
+
runner = Linkage::SingleThreadedRunner.new(conf, @tmpuri, :single_threaded => true)
|
31
31
|
runner.execute
|
32
32
|
|
33
33
|
database do |db|
|
34
|
-
assert_equal 5, db[:groups].count
|
34
|
+
assert_equal 5, db[:groups].count, PP.pp(db[:groups].all, "")
|
35
35
|
db[:groups].order(:foo_bar).each_with_index do |row, i|
|
36
36
|
assert_equal i, row[:foo_bar]
|
37
37
|
end
|
metadata
CHANGED
@@ -1,206 +1,156 @@
|
|
1
|
-
--- !ruby/object:Gem::Specification
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
2
|
name: linkage
|
3
|
-
version: !ruby/object:Gem::Version
|
4
|
-
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
5
|
prerelease:
|
6
|
-
segments:
|
7
|
-
- 0
|
8
|
-
- 0
|
9
|
-
- 0
|
10
|
-
version: 0.0.0
|
11
6
|
platform: ruby
|
12
|
-
authors:
|
7
|
+
authors:
|
13
8
|
- Jeremy Stephens
|
14
9
|
autorequire:
|
15
10
|
bindir: bin
|
16
11
|
cert_chain: []
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
- !ruby/object:Gem::Dependency
|
21
|
-
requirement: &id001 !ruby/object:Gem::Requirement
|
22
|
-
none: false
|
23
|
-
requirements:
|
24
|
-
- - ">="
|
25
|
-
- !ruby/object:Gem::Version
|
26
|
-
hash: 3
|
27
|
-
segments:
|
28
|
-
- 0
|
29
|
-
version: "0"
|
30
|
-
version_requirements: *id001
|
12
|
+
date: 2011-10-25 00:00:00.000000000Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
31
15
|
name: sequel
|
32
|
-
|
16
|
+
requirement: &14148960 !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
33
22
|
type: :runtime
|
34
|
-
|
35
|
-
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: *14148960
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: bundler
|
27
|
+
requirement: &14147000 !ruby/object:Gem::Requirement
|
36
28
|
none: false
|
37
|
-
requirements:
|
29
|
+
requirements:
|
38
30
|
- - ~>
|
39
|
-
- !ruby/object:Gem::Version
|
40
|
-
hash: 23
|
41
|
-
segments:
|
42
|
-
- 1
|
43
|
-
- 0
|
44
|
-
- 0
|
31
|
+
- !ruby/object:Gem::Version
|
45
32
|
version: 1.0.0
|
46
|
-
version_requirements: *id002
|
47
|
-
name: bundler
|
48
|
-
prerelease: false
|
49
33
|
type: :development
|
50
|
-
|
51
|
-
|
34
|
+
prerelease: false
|
35
|
+
version_requirements: *14147000
|
36
|
+
- !ruby/object:Gem::Dependency
|
37
|
+
name: jeweler
|
38
|
+
requirement: &14143380 !ruby/object:Gem::Requirement
|
52
39
|
none: false
|
53
|
-
requirements:
|
40
|
+
requirements:
|
54
41
|
- - ~>
|
55
|
-
- !ruby/object:Gem::Version
|
56
|
-
hash: 7
|
57
|
-
segments:
|
58
|
-
- 1
|
59
|
-
- 6
|
60
|
-
- 4
|
42
|
+
- !ruby/object:Gem::Version
|
61
43
|
version: 1.6.4
|
62
|
-
version_requirements: *id003
|
63
|
-
name: jeweler
|
64
|
-
prerelease: false
|
65
44
|
type: :development
|
66
|
-
- !ruby/object:Gem::Dependency
|
67
|
-
requirement: &id004 !ruby/object:Gem::Requirement
|
68
|
-
none: false
|
69
|
-
requirements:
|
70
|
-
- - ">="
|
71
|
-
- !ruby/object:Gem::Version
|
72
|
-
hash: 3
|
73
|
-
segments:
|
74
|
-
- 0
|
75
|
-
version: "0"
|
76
|
-
version_requirements: *id004
|
77
|
-
name: rcov
|
78
45
|
prerelease: false
|
79
|
-
|
80
|
-
- !ruby/object:Gem::Dependency
|
81
|
-
|
46
|
+
version_requirements: *14143380
|
47
|
+
- !ruby/object:Gem::Dependency
|
48
|
+
name: rcov
|
49
|
+
requirement: &14142120 !ruby/object:Gem::Requirement
|
82
50
|
none: false
|
83
|
-
requirements:
|
84
|
-
- -
|
85
|
-
- !ruby/object:Gem::Version
|
86
|
-
|
87
|
-
|
88
|
-
- 0
|
89
|
-
version: "0"
|
90
|
-
version_requirements: *id005
|
91
|
-
name: guard-test
|
51
|
+
requirements:
|
52
|
+
- - ! '>='
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
type: :development
|
92
56
|
prerelease: false
|
57
|
+
version_requirements: *14142120
|
58
|
+
- !ruby/object:Gem::Dependency
|
59
|
+
name: guard-test
|
60
|
+
requirement: &14139060 !ruby/object:Gem::Requirement
|
61
|
+
none: false
|
62
|
+
requirements:
|
63
|
+
- - ! '>='
|
64
|
+
- !ruby/object:Gem::Version
|
65
|
+
version: '0'
|
93
66
|
type: :development
|
94
|
-
|
95
|
-
|
67
|
+
prerelease: false
|
68
|
+
version_requirements: *14139060
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: test-unit
|
71
|
+
requirement: &14137580 !ruby/object:Gem::Requirement
|
96
72
|
none: false
|
97
|
-
requirements:
|
98
|
-
- -
|
99
|
-
- !ruby/object:Gem::Version
|
100
|
-
hash: 7
|
101
|
-
segments:
|
102
|
-
- 2
|
103
|
-
- 3
|
104
|
-
- 2
|
73
|
+
requirements:
|
74
|
+
- - =
|
75
|
+
- !ruby/object:Gem::Version
|
105
76
|
version: 2.3.2
|
106
|
-
version_requirements: *id006
|
107
|
-
name: test-unit
|
108
|
-
prerelease: false
|
109
77
|
type: :development
|
110
|
-
- !ruby/object:Gem::Dependency
|
111
|
-
requirement: &id007 !ruby/object:Gem::Requirement
|
112
|
-
none: false
|
113
|
-
requirements:
|
114
|
-
- - ">="
|
115
|
-
- !ruby/object:Gem::Version
|
116
|
-
hash: 3
|
117
|
-
segments:
|
118
|
-
- 0
|
119
|
-
version: "0"
|
120
|
-
version_requirements: *id007
|
121
|
-
name: mocha
|
122
78
|
prerelease: false
|
123
|
-
|
124
|
-
- !ruby/object:Gem::Dependency
|
125
|
-
|
79
|
+
version_requirements: *14137580
|
80
|
+
- !ruby/object:Gem::Dependency
|
81
|
+
name: mocha
|
82
|
+
requirement: &14135540 !ruby/object:Gem::Requirement
|
126
83
|
none: false
|
127
|
-
requirements:
|
128
|
-
- -
|
129
|
-
- !ruby/object:Gem::Version
|
130
|
-
|
131
|
-
segments:
|
132
|
-
- 0
|
133
|
-
version: "0"
|
134
|
-
version_requirements: *id008
|
135
|
-
name: sqlite3
|
136
|
-
prerelease: false
|
84
|
+
requirements:
|
85
|
+
- - ! '>='
|
86
|
+
- !ruby/object:Gem::Version
|
87
|
+
version: '0'
|
137
88
|
type: :development
|
138
|
-
- !ruby/object:Gem::Dependency
|
139
|
-
requirement: &id009 !ruby/object:Gem::Requirement
|
140
|
-
none: false
|
141
|
-
requirements:
|
142
|
-
- - ">="
|
143
|
-
- !ruby/object:Gem::Version
|
144
|
-
hash: 3
|
145
|
-
segments:
|
146
|
-
- 0
|
147
|
-
version: "0"
|
148
|
-
version_requirements: *id009
|
149
|
-
name: yard
|
150
89
|
prerelease: false
|
151
|
-
|
152
|
-
- !ruby/object:Gem::Dependency
|
153
|
-
|
90
|
+
version_requirements: *14135540
|
91
|
+
- !ruby/object:Gem::Dependency
|
92
|
+
name: sqlite3
|
93
|
+
requirement: &14133820 !ruby/object:Gem::Requirement
|
154
94
|
none: false
|
155
|
-
requirements:
|
156
|
-
- -
|
157
|
-
- !ruby/object:Gem::Version
|
158
|
-
|
159
|
-
segments:
|
160
|
-
- 0
|
161
|
-
version: "0"
|
162
|
-
version_requirements: *id010
|
163
|
-
name: rake
|
164
|
-
prerelease: false
|
95
|
+
requirements:
|
96
|
+
- - ! '>='
|
97
|
+
- !ruby/object:Gem::Version
|
98
|
+
version: '0'
|
165
99
|
type: :development
|
166
|
-
|
167
|
-
|
100
|
+
prerelease: false
|
101
|
+
version_requirements: *14133820
|
102
|
+
- !ruby/object:Gem::Dependency
|
103
|
+
name: yard
|
104
|
+
requirement: &14132420 !ruby/object:Gem::Requirement
|
168
105
|
none: false
|
169
|
-
requirements:
|
170
|
-
- -
|
171
|
-
- !ruby/object:Gem::Version
|
172
|
-
|
173
|
-
|
174
|
-
- 0
|
175
|
-
version: "0"
|
176
|
-
version_requirements: *id011
|
177
|
-
name: versionomy
|
106
|
+
requirements:
|
107
|
+
- - ! '>='
|
108
|
+
- !ruby/object:Gem::Version
|
109
|
+
version: '0'
|
110
|
+
type: :development
|
178
111
|
prerelease: false
|
112
|
+
version_requirements: *14132420
|
113
|
+
- !ruby/object:Gem::Dependency
|
114
|
+
name: rake
|
115
|
+
requirement: &14130000 !ruby/object:Gem::Requirement
|
116
|
+
none: false
|
117
|
+
requirements:
|
118
|
+
- - ! '>='
|
119
|
+
- !ruby/object:Gem::Version
|
120
|
+
version: '0'
|
179
121
|
type: :development
|
180
|
-
|
181
|
-
|
122
|
+
prerelease: false
|
123
|
+
version_requirements: *14130000
|
124
|
+
- !ruby/object:Gem::Dependency
|
125
|
+
name: versionomy
|
126
|
+
requirement: &14129120 !ruby/object:Gem::Requirement
|
182
127
|
none: false
|
183
|
-
requirements:
|
184
|
-
- -
|
185
|
-
- !ruby/object:Gem::Version
|
186
|
-
|
187
|
-
|
188
|
-
- 0
|
189
|
-
version: "0"
|
190
|
-
version_requirements: *id012
|
191
|
-
name: guard-yard
|
128
|
+
requirements:
|
129
|
+
- - ! '>='
|
130
|
+
- !ruby/object:Gem::Version
|
131
|
+
version: '0'
|
132
|
+
type: :development
|
192
133
|
prerelease: false
|
134
|
+
version_requirements: *14129120
|
135
|
+
- !ruby/object:Gem::Dependency
|
136
|
+
name: guard-yard
|
137
|
+
requirement: &14127660 !ruby/object:Gem::Requirement
|
138
|
+
none: false
|
139
|
+
requirements:
|
140
|
+
- - ! '>='
|
141
|
+
- !ruby/object:Gem::Version
|
142
|
+
version: '0'
|
193
143
|
type: :development
|
144
|
+
prerelease: false
|
145
|
+
version_requirements: *14127660
|
194
146
|
description: Wraps Sequel to perform record linkage between one or two datasets
|
195
147
|
email: jeremy.f.stephens@vanderbilt.edu
|
196
148
|
executables: []
|
197
|
-
|
198
149
|
extensions: []
|
199
|
-
|
200
|
-
extra_rdoc_files:
|
150
|
+
extra_rdoc_files:
|
201
151
|
- LICENSE.txt
|
202
152
|
- README.markdown
|
203
|
-
files:
|
153
|
+
files:
|
204
154
|
- .document
|
205
155
|
- .vimrc
|
206
156
|
- Gemfile
|
@@ -236,37 +186,31 @@ files:
|
|
236
186
|
- test/unit/test_single_threaded_runner.rb
|
237
187
|
- test/unit/test_utils.rb
|
238
188
|
homepage: http://github.com/coupler/linkage
|
239
|
-
licenses:
|
189
|
+
licenses:
|
240
190
|
- MIT
|
241
191
|
post_install_message:
|
242
192
|
rdoc_options: []
|
243
|
-
|
244
|
-
require_paths:
|
193
|
+
require_paths:
|
245
194
|
- lib
|
246
|
-
required_ruby_version: !ruby/object:Gem::Requirement
|
195
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
247
196
|
none: false
|
248
|
-
requirements:
|
249
|
-
- -
|
250
|
-
- !ruby/object:Gem::Version
|
251
|
-
|
252
|
-
segments:
|
197
|
+
requirements:
|
198
|
+
- - ! '>='
|
199
|
+
- !ruby/object:Gem::Version
|
200
|
+
version: '0'
|
201
|
+
segments:
|
253
202
|
- 0
|
254
|
-
|
255
|
-
required_rubygems_version: !ruby/object:Gem::Requirement
|
203
|
+
hash: 2889761151257020688
|
204
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
256
205
|
none: false
|
257
|
-
requirements:
|
258
|
-
- -
|
259
|
-
- !ruby/object:Gem::Version
|
260
|
-
|
261
|
-
segments:
|
262
|
-
- 0
|
263
|
-
version: "0"
|
206
|
+
requirements:
|
207
|
+
- - ! '>='
|
208
|
+
- !ruby/object:Gem::Version
|
209
|
+
version: '0'
|
264
210
|
requirements: []
|
265
|
-
|
266
211
|
rubyforge_project:
|
267
212
|
rubygems_version: 1.8.10
|
268
213
|
signing_key:
|
269
214
|
specification_version: 3
|
270
215
|
summary: Sequel-based record linkage
|
271
216
|
test_files: []
|
272
|
-
|