linkage 0.0.0 → 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/VERSION +1 -1
- data/lib/linkage/runner/single_threaded.rb +33 -29
- data/linkage.gemspec +2 -2
- data/test/integration/test_cross_linkage.rb +2 -2
- metadata +125 -181
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.0.0
|
1
|
+
0.0.1
|
@@ -27,21 +27,26 @@ module Linkage
|
|
27
27
|
|
28
28
|
def group_records
|
29
29
|
if config.linkage_type == :self
|
30
|
-
|
30
|
+
group_records_for(@dataset_1, 1)
|
31
31
|
else
|
32
|
-
|
33
|
-
|
32
|
+
group_records_for(@dataset_1, 1, false)
|
33
|
+
group_records_for(@dataset_2, 2, false)
|
34
34
|
combine_groups
|
35
35
|
end
|
36
36
|
end
|
37
37
|
|
38
|
-
|
39
|
-
|
38
|
+
# @param [Linkage::Dataset] dataset
|
39
|
+
# @param [Fixnum, nil] dataset_id
|
40
|
+
# @param [Boolean] ignore_empty_groups
|
41
|
+
# @yield [Linkage::Group] If a block is given, yield completed groups to
|
42
|
+
# the block. Otherwise, call save_group on the group.
|
43
|
+
def group_records_for(dataset, dataset_id = nil, ignore_empty_groups = true, &block)
|
40
44
|
current_group = nil
|
45
|
+
block ||= lambda { |group| save_group(current_group, dataset_id) }
|
41
46
|
dataset.each do |row|
|
42
47
|
if current_group.nil? || !current_group.matches?(row[:values])
|
43
48
|
if current_group && (!ignore_empty_groups || current_group.count > 1)
|
44
|
-
|
49
|
+
block.call(current_group)
|
45
50
|
end
|
46
51
|
new_group = Group.new(row[:values])
|
47
52
|
current_group = new_group
|
@@ -49,33 +54,33 @@ module Linkage
|
|
49
54
|
current_group.add_record(row[:pk])
|
50
55
|
end
|
51
56
|
if current_group && (!ignore_empty_groups || current_group.count > 1)
|
52
|
-
|
57
|
+
block.call(current_group)
|
53
58
|
end
|
54
|
-
|
59
|
+
flush_buffers
|
55
60
|
end
|
56
61
|
|
57
|
-
def
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
groups_records_buffer = ImportBuffer.new(@uri, :groups_records, [:group_id, :dataset, :record_id], @options)
|
62
|
+
def save_group(group, dataset_id = nil)
|
63
|
+
if !@groups_buffer
|
64
|
+
groups_headers = [:id] + group.values.keys
|
65
|
+
@groups_buffer = ImportBuffer.new(@uri, :groups, groups_headers, @options)
|
66
|
+
end
|
67
|
+
@groups_records_buffer ||= ImportBuffer.new(@uri, :groups_records, [:group_id, :dataset, :record_id], @options)
|
64
68
|
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
groups_records_buffer.add([group_id, dataset_id, record_id])
|
70
|
-
end
|
69
|
+
group_id = next_group_id
|
70
|
+
@groups_buffer.add([group_id] + group.values.values)
|
71
|
+
group.records.each do |record_id|
|
72
|
+
@groups_records_buffer.add([group_id, dataset_id, record_id])
|
71
73
|
end
|
72
|
-
|
73
|
-
|
74
|
+
end
|
75
|
+
|
76
|
+
def flush_buffers
|
77
|
+
@groups_buffer.flush if @groups_buffer
|
78
|
+
@groups_records_buffer.flush if @groups_records_buffer
|
74
79
|
end
|
75
80
|
|
76
81
|
def combine_groups
|
77
82
|
# Create a new dataset for the groups table
|
78
|
-
ds = Dataset.new(@uri, :groups,
|
83
|
+
ds = Dataset.new(@uri, :groups, @options)
|
79
84
|
ds.fields.each_value do |field|
|
80
85
|
# Sort on all fields
|
81
86
|
next if field.primary_key?
|
@@ -83,11 +88,10 @@ module Linkage
|
|
83
88
|
ds.add_select(field)
|
84
89
|
end
|
85
90
|
ds.add_order(ds.primary_key) # ensure matching groups are sorted by id
|
86
|
-
combined_groups = group_records_for(ds, false)
|
87
91
|
database do |db|
|
88
92
|
groups_to_delete = []
|
89
93
|
db.transaction do # for speed reasons
|
90
|
-
|
94
|
+
group_records_for(ds, nil, false) do |group|
|
91
95
|
if group.count == 1
|
92
96
|
# Delete the empty group
|
93
97
|
groups_to_delete << group.records[0]
|
@@ -96,9 +100,9 @@ module Linkage
|
|
96
100
|
# id, delete other groups.
|
97
101
|
new_group_id = group.records[0]
|
98
102
|
group.records[1..-1].each do |old_group_id|
|
99
|
-
# There can only be a group with max size of 2, but
|
100
|
-
# adds in future support for matching more than
|
101
|
-
# at once.
|
103
|
+
# NOTE: There can only be a group with max size of 2, but
|
104
|
+
# this adds in future support for matching more than
|
105
|
+
# 2 datasets at once.
|
102
106
|
db[:groups_records].filter(:group_id => old_group_id).
|
103
107
|
update(:group_id => new_group_id)
|
104
108
|
groups_to_delete << old_group_id
|
data/linkage.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = "linkage"
|
8
|
-
s.version = "0.0.0"
|
8
|
+
s.version = "0.0.1"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Jeremy Stephens"]
|
12
|
-
s.date = "2011-
|
12
|
+
s.date = "2011-10-25"
|
13
13
|
s.description = "Wraps Sequel to perform record linkage between one or two datasets"
|
14
14
|
s.email = "jeremy.f.stephens@vanderbilt.edu"
|
15
15
|
s.extra_rdoc_files = [
|
@@ -27,11 +27,11 @@ module IntegrationTests
|
|
27
27
|
conf = ds.link_with(ds) do
|
28
28
|
lhs[:foo].must == rhs[:bar]
|
29
29
|
end
|
30
|
-
runner = Linkage::SingleThreadedRunner.new(conf, @tmpuri)
|
30
|
+
runner = Linkage::SingleThreadedRunner.new(conf, @tmpuri, :single_threaded => true)
|
31
31
|
runner.execute
|
32
32
|
|
33
33
|
database do |db|
|
34
|
-
assert_equal 5, db[:groups].count
|
34
|
+
assert_equal 5, db[:groups].count, PP.pp(db[:groups].all, "")
|
35
35
|
db[:groups].order(:foo_bar).each_with_index do |row, i|
|
36
36
|
assert_equal i, row[:foo_bar]
|
37
37
|
end
|
metadata
CHANGED
@@ -1,206 +1,156 @@
|
|
1
|
-
--- !ruby/object:Gem::Specification
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
2
|
name: linkage
|
3
|
-
version: !ruby/object:Gem::Version
|
4
|
-
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
5
|
prerelease:
|
6
|
-
segments:
|
7
|
-
- 0
|
8
|
-
- 0
|
9
|
-
- 0
|
10
|
-
version: 0.0.0
|
11
6
|
platform: ruby
|
12
|
-
authors:
|
7
|
+
authors:
|
13
8
|
- Jeremy Stephens
|
14
9
|
autorequire:
|
15
10
|
bindir: bin
|
16
11
|
cert_chain: []
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
- !ruby/object:Gem::Dependency
|
21
|
-
requirement: &id001 !ruby/object:Gem::Requirement
|
22
|
-
none: false
|
23
|
-
requirements:
|
24
|
-
- - ">="
|
25
|
-
- !ruby/object:Gem::Version
|
26
|
-
hash: 3
|
27
|
-
segments:
|
28
|
-
- 0
|
29
|
-
version: "0"
|
30
|
-
version_requirements: *id001
|
12
|
+
date: 2011-10-25 00:00:00.000000000Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
31
15
|
name: sequel
|
32
|
-
|
16
|
+
requirement: &14148960 !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
33
22
|
type: :runtime
|
34
|
-
|
35
|
-
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: *14148960
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: bundler
|
27
|
+
requirement: &14147000 !ruby/object:Gem::Requirement
|
36
28
|
none: false
|
37
|
-
requirements:
|
29
|
+
requirements:
|
38
30
|
- - ~>
|
39
|
-
- !ruby/object:Gem::Version
|
40
|
-
hash: 23
|
41
|
-
segments:
|
42
|
-
- 1
|
43
|
-
- 0
|
44
|
-
- 0
|
31
|
+
- !ruby/object:Gem::Version
|
45
32
|
version: 1.0.0
|
46
|
-
version_requirements: *id002
|
47
|
-
name: bundler
|
48
|
-
prerelease: false
|
49
33
|
type: :development
|
50
|
-
|
51
|
-
|
34
|
+
prerelease: false
|
35
|
+
version_requirements: *14147000
|
36
|
+
- !ruby/object:Gem::Dependency
|
37
|
+
name: jeweler
|
38
|
+
requirement: &14143380 !ruby/object:Gem::Requirement
|
52
39
|
none: false
|
53
|
-
requirements:
|
40
|
+
requirements:
|
54
41
|
- - ~>
|
55
|
-
- !ruby/object:Gem::Version
|
56
|
-
hash: 7
|
57
|
-
segments:
|
58
|
-
- 1
|
59
|
-
- 6
|
60
|
-
- 4
|
42
|
+
- !ruby/object:Gem::Version
|
61
43
|
version: 1.6.4
|
62
|
-
version_requirements: *id003
|
63
|
-
name: jeweler
|
64
|
-
prerelease: false
|
65
44
|
type: :development
|
66
|
-
- !ruby/object:Gem::Dependency
|
67
|
-
requirement: &id004 !ruby/object:Gem::Requirement
|
68
|
-
none: false
|
69
|
-
requirements:
|
70
|
-
- - ">="
|
71
|
-
- !ruby/object:Gem::Version
|
72
|
-
hash: 3
|
73
|
-
segments:
|
74
|
-
- 0
|
75
|
-
version: "0"
|
76
|
-
version_requirements: *id004
|
77
|
-
name: rcov
|
78
45
|
prerelease: false
|
79
|
-
|
80
|
-
- !ruby/object:Gem::Dependency
|
81
|
-
|
46
|
+
version_requirements: *14143380
|
47
|
+
- !ruby/object:Gem::Dependency
|
48
|
+
name: rcov
|
49
|
+
requirement: &14142120 !ruby/object:Gem::Requirement
|
82
50
|
none: false
|
83
|
-
requirements:
|
84
|
-
- -
|
85
|
-
- !ruby/object:Gem::Version
|
86
|
-
|
87
|
-
|
88
|
-
- 0
|
89
|
-
version: "0"
|
90
|
-
version_requirements: *id005
|
91
|
-
name: guard-test
|
51
|
+
requirements:
|
52
|
+
- - ! '>='
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
type: :development
|
92
56
|
prerelease: false
|
57
|
+
version_requirements: *14142120
|
58
|
+
- !ruby/object:Gem::Dependency
|
59
|
+
name: guard-test
|
60
|
+
requirement: &14139060 !ruby/object:Gem::Requirement
|
61
|
+
none: false
|
62
|
+
requirements:
|
63
|
+
- - ! '>='
|
64
|
+
- !ruby/object:Gem::Version
|
65
|
+
version: '0'
|
93
66
|
type: :development
|
94
|
-
|
95
|
-
|
67
|
+
prerelease: false
|
68
|
+
version_requirements: *14139060
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: test-unit
|
71
|
+
requirement: &14137580 !ruby/object:Gem::Requirement
|
96
72
|
none: false
|
97
|
-
requirements:
|
98
|
-
- -
|
99
|
-
- !ruby/object:Gem::Version
|
100
|
-
hash: 7
|
101
|
-
segments:
|
102
|
-
- 2
|
103
|
-
- 3
|
104
|
-
- 2
|
73
|
+
requirements:
|
74
|
+
- - =
|
75
|
+
- !ruby/object:Gem::Version
|
105
76
|
version: 2.3.2
|
106
|
-
version_requirements: *id006
|
107
|
-
name: test-unit
|
108
|
-
prerelease: false
|
109
77
|
type: :development
|
110
|
-
- !ruby/object:Gem::Dependency
|
111
|
-
requirement: &id007 !ruby/object:Gem::Requirement
|
112
|
-
none: false
|
113
|
-
requirements:
|
114
|
-
- - ">="
|
115
|
-
- !ruby/object:Gem::Version
|
116
|
-
hash: 3
|
117
|
-
segments:
|
118
|
-
- 0
|
119
|
-
version: "0"
|
120
|
-
version_requirements: *id007
|
121
|
-
name: mocha
|
122
78
|
prerelease: false
|
123
|
-
|
124
|
-
- !ruby/object:Gem::Dependency
|
125
|
-
|
79
|
+
version_requirements: *14137580
|
80
|
+
- !ruby/object:Gem::Dependency
|
81
|
+
name: mocha
|
82
|
+
requirement: &14135540 !ruby/object:Gem::Requirement
|
126
83
|
none: false
|
127
|
-
requirements:
|
128
|
-
- -
|
129
|
-
- !ruby/object:Gem::Version
|
130
|
-
|
131
|
-
segments:
|
132
|
-
- 0
|
133
|
-
version: "0"
|
134
|
-
version_requirements: *id008
|
135
|
-
name: sqlite3
|
136
|
-
prerelease: false
|
84
|
+
requirements:
|
85
|
+
- - ! '>='
|
86
|
+
- !ruby/object:Gem::Version
|
87
|
+
version: '0'
|
137
88
|
type: :development
|
138
|
-
- !ruby/object:Gem::Dependency
|
139
|
-
requirement: &id009 !ruby/object:Gem::Requirement
|
140
|
-
none: false
|
141
|
-
requirements:
|
142
|
-
- - ">="
|
143
|
-
- !ruby/object:Gem::Version
|
144
|
-
hash: 3
|
145
|
-
segments:
|
146
|
-
- 0
|
147
|
-
version: "0"
|
148
|
-
version_requirements: *id009
|
149
|
-
name: yard
|
150
89
|
prerelease: false
|
151
|
-
|
152
|
-
- !ruby/object:Gem::Dependency
|
153
|
-
|
90
|
+
version_requirements: *14135540
|
91
|
+
- !ruby/object:Gem::Dependency
|
92
|
+
name: sqlite3
|
93
|
+
requirement: &14133820 !ruby/object:Gem::Requirement
|
154
94
|
none: false
|
155
|
-
requirements:
|
156
|
-
- -
|
157
|
-
- !ruby/object:Gem::Version
|
158
|
-
|
159
|
-
segments:
|
160
|
-
- 0
|
161
|
-
version: "0"
|
162
|
-
version_requirements: *id010
|
163
|
-
name: rake
|
164
|
-
prerelease: false
|
95
|
+
requirements:
|
96
|
+
- - ! '>='
|
97
|
+
- !ruby/object:Gem::Version
|
98
|
+
version: '0'
|
165
99
|
type: :development
|
166
|
-
|
167
|
-
|
100
|
+
prerelease: false
|
101
|
+
version_requirements: *14133820
|
102
|
+
- !ruby/object:Gem::Dependency
|
103
|
+
name: yard
|
104
|
+
requirement: &14132420 !ruby/object:Gem::Requirement
|
168
105
|
none: false
|
169
|
-
requirements:
|
170
|
-
- -
|
171
|
-
- !ruby/object:Gem::Version
|
172
|
-
|
173
|
-
|
174
|
-
- 0
|
175
|
-
version: "0"
|
176
|
-
version_requirements: *id011
|
177
|
-
name: versionomy
|
106
|
+
requirements:
|
107
|
+
- - ! '>='
|
108
|
+
- !ruby/object:Gem::Version
|
109
|
+
version: '0'
|
110
|
+
type: :development
|
178
111
|
prerelease: false
|
112
|
+
version_requirements: *14132420
|
113
|
+
- !ruby/object:Gem::Dependency
|
114
|
+
name: rake
|
115
|
+
requirement: &14130000 !ruby/object:Gem::Requirement
|
116
|
+
none: false
|
117
|
+
requirements:
|
118
|
+
- - ! '>='
|
119
|
+
- !ruby/object:Gem::Version
|
120
|
+
version: '0'
|
179
121
|
type: :development
|
180
|
-
|
181
|
-
|
122
|
+
prerelease: false
|
123
|
+
version_requirements: *14130000
|
124
|
+
- !ruby/object:Gem::Dependency
|
125
|
+
name: versionomy
|
126
|
+
requirement: &14129120 !ruby/object:Gem::Requirement
|
182
127
|
none: false
|
183
|
-
requirements:
|
184
|
-
- -
|
185
|
-
- !ruby/object:Gem::Version
|
186
|
-
|
187
|
-
|
188
|
-
- 0
|
189
|
-
version: "0"
|
190
|
-
version_requirements: *id012
|
191
|
-
name: guard-yard
|
128
|
+
requirements:
|
129
|
+
- - ! '>='
|
130
|
+
- !ruby/object:Gem::Version
|
131
|
+
version: '0'
|
132
|
+
type: :development
|
192
133
|
prerelease: false
|
134
|
+
version_requirements: *14129120
|
135
|
+
- !ruby/object:Gem::Dependency
|
136
|
+
name: guard-yard
|
137
|
+
requirement: &14127660 !ruby/object:Gem::Requirement
|
138
|
+
none: false
|
139
|
+
requirements:
|
140
|
+
- - ! '>='
|
141
|
+
- !ruby/object:Gem::Version
|
142
|
+
version: '0'
|
193
143
|
type: :development
|
144
|
+
prerelease: false
|
145
|
+
version_requirements: *14127660
|
194
146
|
description: Wraps Sequel to perform record linkage between one or two datasets
|
195
147
|
email: jeremy.f.stephens@vanderbilt.edu
|
196
148
|
executables: []
|
197
|
-
|
198
149
|
extensions: []
|
199
|
-
|
200
|
-
extra_rdoc_files:
|
150
|
+
extra_rdoc_files:
|
201
151
|
- LICENSE.txt
|
202
152
|
- README.markdown
|
203
|
-
files:
|
153
|
+
files:
|
204
154
|
- .document
|
205
155
|
- .vimrc
|
206
156
|
- Gemfile
|
@@ -236,37 +186,31 @@ files:
|
|
236
186
|
- test/unit/test_single_threaded_runner.rb
|
237
187
|
- test/unit/test_utils.rb
|
238
188
|
homepage: http://github.com/coupler/linkage
|
239
|
-
licenses:
|
189
|
+
licenses:
|
240
190
|
- MIT
|
241
191
|
post_install_message:
|
242
192
|
rdoc_options: []
|
243
|
-
|
244
|
-
require_paths:
|
193
|
+
require_paths:
|
245
194
|
- lib
|
246
|
-
required_ruby_version: !ruby/object:Gem::Requirement
|
195
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
247
196
|
none: false
|
248
|
-
requirements:
|
249
|
-
- -
|
250
|
-
- !ruby/object:Gem::Version
|
251
|
-
|
252
|
-
segments:
|
197
|
+
requirements:
|
198
|
+
- - ! '>='
|
199
|
+
- !ruby/object:Gem::Version
|
200
|
+
version: '0'
|
201
|
+
segments:
|
253
202
|
- 0
|
254
|
-
|
255
|
-
required_rubygems_version: !ruby/object:Gem::Requirement
|
203
|
+
hash: 2889761151257020688
|
204
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
256
205
|
none: false
|
257
|
-
requirements:
|
258
|
-
- -
|
259
|
-
- !ruby/object:Gem::Version
|
260
|
-
|
261
|
-
segments:
|
262
|
-
- 0
|
263
|
-
version: "0"
|
206
|
+
requirements:
|
207
|
+
- - ! '>='
|
208
|
+
- !ruby/object:Gem::Version
|
209
|
+
version: '0'
|
264
210
|
requirements: []
|
265
|
-
|
266
211
|
rubyforge_project:
|
267
212
|
rubygems_version: 1.8.10
|
268
213
|
signing_key:
|
269
214
|
specification_version: 3
|
270
215
|
summary: Sequel-based record linkage
|
271
216
|
test_files: []
|
272
|
-
|