linkage 0.0.0 → 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.0.0
1
+ 0.0.1
@@ -27,21 +27,26 @@ module Linkage
27
27
 
28
28
  def group_records
29
29
  if config.linkage_type == :self
30
- add_groups(group_records_for(@dataset_1), 1)
30
+ group_records_for(@dataset_1, 1)
31
31
  else
32
- add_groups(group_records_for(@dataset_1, false), 1)
33
- add_groups(group_records_for(@dataset_2, false), 2)
32
+ group_records_for(@dataset_1, 1, false)
33
+ group_records_for(@dataset_2, 2, false)
34
34
  combine_groups
35
35
  end
36
36
  end
37
37
 
38
- def group_records_for(dataset, ignore_empty_groups = true)
39
- groups = []
38
+ # @param [Linkage::Dataset] dataset
39
+ # @param [Fixnum, nil] dataset_id
40
+ # @param [Boolean] ignore_empty_groups
41
+ # @yield [Linkage::Group] If a block is given, yield completed groups to
42
+ # the block. Otherwise, call save_group on the group.
43
+ def group_records_for(dataset, dataset_id = nil, ignore_empty_groups = true, &block)
40
44
  current_group = nil
45
+ block ||= lambda { |group| save_group(current_group, dataset_id) }
41
46
  dataset.each do |row|
42
47
  if current_group.nil? || !current_group.matches?(row[:values])
43
48
  if current_group && (!ignore_empty_groups || current_group.count > 1)
44
- groups << current_group
49
+ block.call(current_group)
45
50
  end
46
51
  new_group = Group.new(row[:values])
47
52
  current_group = new_group
@@ -49,33 +54,33 @@ module Linkage
49
54
  current_group.add_record(row[:pk])
50
55
  end
51
56
  if current_group && (!ignore_empty_groups || current_group.count > 1)
52
- groups << current_group
57
+ block.call(current_group)
53
58
  end
54
- groups
59
+ flush_buffers
55
60
  end
56
61
 
57
- def add_groups(groups, dataset_id = nil)
58
- return if groups.empty?
59
-
60
- groups_headers = [:id] + groups[0].values.keys
61
- groups_buffer = ImportBuffer.new(@uri, :groups, groups_headers, @options)
62
-
63
- groups_records_buffer = ImportBuffer.new(@uri, :groups_records, [:group_id, :dataset, :record_id], @options)
62
+ def save_group(group, dataset_id = nil)
63
+ if !@groups_buffer
64
+ groups_headers = [:id] + group.values.keys
65
+ @groups_buffer = ImportBuffer.new(@uri, :groups, groups_headers, @options)
66
+ end
67
+ @groups_records_buffer ||= ImportBuffer.new(@uri, :groups_records, [:group_id, :dataset, :record_id], @options)
64
68
 
65
- groups.each_with_index do |group, i|
66
- group_id = next_group_id
67
- groups_buffer.add([group_id] + group.values.values)
68
- group.records.each do |record_id|
69
- groups_records_buffer.add([group_id, dataset_id, record_id])
70
- end
69
+ group_id = next_group_id
70
+ @groups_buffer.add([group_id] + group.values.values)
71
+ group.records.each do |record_id|
72
+ @groups_records_buffer.add([group_id, dataset_id, record_id])
71
73
  end
72
- groups_buffer.flush
73
- groups_records_buffer.flush
74
+ end
75
+
76
+ def flush_buffers
77
+ @groups_buffer.flush if @groups_buffer
78
+ @groups_records_buffer.flush if @groups_records_buffer
74
79
  end
75
80
 
76
81
  def combine_groups
77
82
  # Create a new dataset for the groups table
78
- ds = Dataset.new(@uri, :groups, :single_threaded => true)
83
+ ds = Dataset.new(@uri, :groups, @options)
79
84
  ds.fields.each_value do |field|
80
85
  # Sort on all fields
81
86
  next if field.primary_key?
@@ -83,11 +88,10 @@ module Linkage
83
88
  ds.add_select(field)
84
89
  end
85
90
  ds.add_order(ds.primary_key) # ensure matching groups are sorted by id
86
- combined_groups = group_records_for(ds, false)
87
91
  database do |db|
88
92
  groups_to_delete = []
89
93
  db.transaction do # for speed reasons
90
- combined_groups.each do |group|
94
+ group_records_for(ds, nil, false) do |group|
91
95
  if group.count == 1
92
96
  # Delete the empty group
93
97
  groups_to_delete << group.records[0]
@@ -96,9 +100,9 @@ module Linkage
96
100
  # id, delete other groups.
97
101
  new_group_id = group.records[0]
98
102
  group.records[1..-1].each do |old_group_id|
99
- # There can only be a group with max size of 2, but this
100
- # adds in future support for matching more than 2 datasets
101
- # at once. Code smell?
103
+ # NOTE: There can only be a group with max size of 2, but
104
+ # this adds in future support for matching more than
105
+ # 2 datasets at once.
102
106
  db[:groups_records].filter(:group_id => old_group_id).
103
107
  update(:group_id => new_group_id)
104
108
  groups_to_delete << old_group_id
data/linkage.gemspec CHANGED
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = "linkage"
8
- s.version = "0.0.0"
8
+ s.version = "0.0.1"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Jeremy Stephens"]
12
- s.date = "2011-09-30"
12
+ s.date = "2011-10-25"
13
13
  s.description = "Wraps Sequel to perform record linkage between one or two datasets"
14
14
  s.email = "jeremy.f.stephens@vanderbilt.edu"
15
15
  s.extra_rdoc_files = [
@@ -27,11 +27,11 @@ module IntegrationTests
27
27
  conf = ds.link_with(ds) do
28
28
  lhs[:foo].must == rhs[:bar]
29
29
  end
30
- runner = Linkage::SingleThreadedRunner.new(conf, @tmpuri)
30
+ runner = Linkage::SingleThreadedRunner.new(conf, @tmpuri, :single_threaded => true)
31
31
  runner.execute
32
32
 
33
33
  database do |db|
34
- assert_equal 5, db[:groups].count
34
+ assert_equal 5, db[:groups].count, PP.pp(db[:groups].all, "")
35
35
  db[:groups].order(:foo_bar).each_with_index do |row, i|
36
36
  assert_equal i, row[:foo_bar]
37
37
  end
metadata CHANGED
@@ -1,206 +1,156 @@
1
- --- !ruby/object:Gem::Specification
1
+ --- !ruby/object:Gem::Specification
2
2
  name: linkage
3
- version: !ruby/object:Gem::Version
4
- hash: 31
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
5
  prerelease:
6
- segments:
7
- - 0
8
- - 0
9
- - 0
10
- version: 0.0.0
11
6
  platform: ruby
12
- authors:
7
+ authors:
13
8
  - Jeremy Stephens
14
9
  autorequire:
15
10
  bindir: bin
16
11
  cert_chain: []
17
-
18
- date: 2011-09-30 00:00:00 Z
19
- dependencies:
20
- - !ruby/object:Gem::Dependency
21
- requirement: &id001 !ruby/object:Gem::Requirement
22
- none: false
23
- requirements:
24
- - - ">="
25
- - !ruby/object:Gem::Version
26
- hash: 3
27
- segments:
28
- - 0
29
- version: "0"
30
- version_requirements: *id001
12
+ date: 2011-10-25 00:00:00.000000000Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
31
15
  name: sequel
32
- prerelease: false
16
+ requirement: &14148960 !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
33
22
  type: :runtime
34
- - !ruby/object:Gem::Dependency
35
- requirement: &id002 !ruby/object:Gem::Requirement
23
+ prerelease: false
24
+ version_requirements: *14148960
25
+ - !ruby/object:Gem::Dependency
26
+ name: bundler
27
+ requirement: &14147000 !ruby/object:Gem::Requirement
36
28
  none: false
37
- requirements:
29
+ requirements:
38
30
  - - ~>
39
- - !ruby/object:Gem::Version
40
- hash: 23
41
- segments:
42
- - 1
43
- - 0
44
- - 0
31
+ - !ruby/object:Gem::Version
45
32
  version: 1.0.0
46
- version_requirements: *id002
47
- name: bundler
48
- prerelease: false
49
33
  type: :development
50
- - !ruby/object:Gem::Dependency
51
- requirement: &id003 !ruby/object:Gem::Requirement
34
+ prerelease: false
35
+ version_requirements: *14147000
36
+ - !ruby/object:Gem::Dependency
37
+ name: jeweler
38
+ requirement: &14143380 !ruby/object:Gem::Requirement
52
39
  none: false
53
- requirements:
40
+ requirements:
54
41
  - - ~>
55
- - !ruby/object:Gem::Version
56
- hash: 7
57
- segments:
58
- - 1
59
- - 6
60
- - 4
42
+ - !ruby/object:Gem::Version
61
43
  version: 1.6.4
62
- version_requirements: *id003
63
- name: jeweler
64
- prerelease: false
65
44
  type: :development
66
- - !ruby/object:Gem::Dependency
67
- requirement: &id004 !ruby/object:Gem::Requirement
68
- none: false
69
- requirements:
70
- - - ">="
71
- - !ruby/object:Gem::Version
72
- hash: 3
73
- segments:
74
- - 0
75
- version: "0"
76
- version_requirements: *id004
77
- name: rcov
78
45
  prerelease: false
79
- type: :development
80
- - !ruby/object:Gem::Dependency
81
- requirement: &id005 !ruby/object:Gem::Requirement
46
+ version_requirements: *14143380
47
+ - !ruby/object:Gem::Dependency
48
+ name: rcov
49
+ requirement: &14142120 !ruby/object:Gem::Requirement
82
50
  none: false
83
- requirements:
84
- - - ">="
85
- - !ruby/object:Gem::Version
86
- hash: 3
87
- segments:
88
- - 0
89
- version: "0"
90
- version_requirements: *id005
91
- name: guard-test
51
+ requirements:
52
+ - - ! '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ type: :development
92
56
  prerelease: false
57
+ version_requirements: *14142120
58
+ - !ruby/object:Gem::Dependency
59
+ name: guard-test
60
+ requirement: &14139060 !ruby/object:Gem::Requirement
61
+ none: false
62
+ requirements:
63
+ - - ! '>='
64
+ - !ruby/object:Gem::Version
65
+ version: '0'
93
66
  type: :development
94
- - !ruby/object:Gem::Dependency
95
- requirement: &id006 !ruby/object:Gem::Requirement
67
+ prerelease: false
68
+ version_requirements: *14139060
69
+ - !ruby/object:Gem::Dependency
70
+ name: test-unit
71
+ requirement: &14137580 !ruby/object:Gem::Requirement
96
72
  none: false
97
- requirements:
98
- - - "="
99
- - !ruby/object:Gem::Version
100
- hash: 7
101
- segments:
102
- - 2
103
- - 3
104
- - 2
73
+ requirements:
74
+ - - =
75
+ - !ruby/object:Gem::Version
105
76
  version: 2.3.2
106
- version_requirements: *id006
107
- name: test-unit
108
- prerelease: false
109
77
  type: :development
110
- - !ruby/object:Gem::Dependency
111
- requirement: &id007 !ruby/object:Gem::Requirement
112
- none: false
113
- requirements:
114
- - - ">="
115
- - !ruby/object:Gem::Version
116
- hash: 3
117
- segments:
118
- - 0
119
- version: "0"
120
- version_requirements: *id007
121
- name: mocha
122
78
  prerelease: false
123
- type: :development
124
- - !ruby/object:Gem::Dependency
125
- requirement: &id008 !ruby/object:Gem::Requirement
79
+ version_requirements: *14137580
80
+ - !ruby/object:Gem::Dependency
81
+ name: mocha
82
+ requirement: &14135540 !ruby/object:Gem::Requirement
126
83
  none: false
127
- requirements:
128
- - - ">="
129
- - !ruby/object:Gem::Version
130
- hash: 3
131
- segments:
132
- - 0
133
- version: "0"
134
- version_requirements: *id008
135
- name: sqlite3
136
- prerelease: false
84
+ requirements:
85
+ - - ! '>='
86
+ - !ruby/object:Gem::Version
87
+ version: '0'
137
88
  type: :development
138
- - !ruby/object:Gem::Dependency
139
- requirement: &id009 !ruby/object:Gem::Requirement
140
- none: false
141
- requirements:
142
- - - ">="
143
- - !ruby/object:Gem::Version
144
- hash: 3
145
- segments:
146
- - 0
147
- version: "0"
148
- version_requirements: *id009
149
- name: yard
150
89
  prerelease: false
151
- type: :development
152
- - !ruby/object:Gem::Dependency
153
- requirement: &id010 !ruby/object:Gem::Requirement
90
+ version_requirements: *14135540
91
+ - !ruby/object:Gem::Dependency
92
+ name: sqlite3
93
+ requirement: &14133820 !ruby/object:Gem::Requirement
154
94
  none: false
155
- requirements:
156
- - - ">="
157
- - !ruby/object:Gem::Version
158
- hash: 3
159
- segments:
160
- - 0
161
- version: "0"
162
- version_requirements: *id010
163
- name: rake
164
- prerelease: false
95
+ requirements:
96
+ - - ! '>='
97
+ - !ruby/object:Gem::Version
98
+ version: '0'
165
99
  type: :development
166
- - !ruby/object:Gem::Dependency
167
- requirement: &id011 !ruby/object:Gem::Requirement
100
+ prerelease: false
101
+ version_requirements: *14133820
102
+ - !ruby/object:Gem::Dependency
103
+ name: yard
104
+ requirement: &14132420 !ruby/object:Gem::Requirement
168
105
  none: false
169
- requirements:
170
- - - ">="
171
- - !ruby/object:Gem::Version
172
- hash: 3
173
- segments:
174
- - 0
175
- version: "0"
176
- version_requirements: *id011
177
- name: versionomy
106
+ requirements:
107
+ - - ! '>='
108
+ - !ruby/object:Gem::Version
109
+ version: '0'
110
+ type: :development
178
111
  prerelease: false
112
+ version_requirements: *14132420
113
+ - !ruby/object:Gem::Dependency
114
+ name: rake
115
+ requirement: &14130000 !ruby/object:Gem::Requirement
116
+ none: false
117
+ requirements:
118
+ - - ! '>='
119
+ - !ruby/object:Gem::Version
120
+ version: '0'
179
121
  type: :development
180
- - !ruby/object:Gem::Dependency
181
- requirement: &id012 !ruby/object:Gem::Requirement
122
+ prerelease: false
123
+ version_requirements: *14130000
124
+ - !ruby/object:Gem::Dependency
125
+ name: versionomy
126
+ requirement: &14129120 !ruby/object:Gem::Requirement
182
127
  none: false
183
- requirements:
184
- - - ">="
185
- - !ruby/object:Gem::Version
186
- hash: 3
187
- segments:
188
- - 0
189
- version: "0"
190
- version_requirements: *id012
191
- name: guard-yard
128
+ requirements:
129
+ - - ! '>='
130
+ - !ruby/object:Gem::Version
131
+ version: '0'
132
+ type: :development
192
133
  prerelease: false
134
+ version_requirements: *14129120
135
+ - !ruby/object:Gem::Dependency
136
+ name: guard-yard
137
+ requirement: &14127660 !ruby/object:Gem::Requirement
138
+ none: false
139
+ requirements:
140
+ - - ! '>='
141
+ - !ruby/object:Gem::Version
142
+ version: '0'
193
143
  type: :development
144
+ prerelease: false
145
+ version_requirements: *14127660
194
146
  description: Wraps Sequel to perform record linkage between one or two datasets
195
147
  email: jeremy.f.stephens@vanderbilt.edu
196
148
  executables: []
197
-
198
149
  extensions: []
199
-
200
- extra_rdoc_files:
150
+ extra_rdoc_files:
201
151
  - LICENSE.txt
202
152
  - README.markdown
203
- files:
153
+ files:
204
154
  - .document
205
155
  - .vimrc
206
156
  - Gemfile
@@ -236,37 +186,31 @@ files:
236
186
  - test/unit/test_single_threaded_runner.rb
237
187
  - test/unit/test_utils.rb
238
188
  homepage: http://github.com/coupler/linkage
239
- licenses:
189
+ licenses:
240
190
  - MIT
241
191
  post_install_message:
242
192
  rdoc_options: []
243
-
244
- require_paths:
193
+ require_paths:
245
194
  - lib
246
- required_ruby_version: !ruby/object:Gem::Requirement
195
+ required_ruby_version: !ruby/object:Gem::Requirement
247
196
  none: false
248
- requirements:
249
- - - ">="
250
- - !ruby/object:Gem::Version
251
- hash: 3
252
- segments:
197
+ requirements:
198
+ - - ! '>='
199
+ - !ruby/object:Gem::Version
200
+ version: '0'
201
+ segments:
253
202
  - 0
254
- version: "0"
255
- required_rubygems_version: !ruby/object:Gem::Requirement
203
+ hash: 2889761151257020688
204
+ required_rubygems_version: !ruby/object:Gem::Requirement
256
205
  none: false
257
- requirements:
258
- - - ">="
259
- - !ruby/object:Gem::Version
260
- hash: 3
261
- segments:
262
- - 0
263
- version: "0"
206
+ requirements:
207
+ - - ! '>='
208
+ - !ruby/object:Gem::Version
209
+ version: '0'
264
210
  requirements: []
265
-
266
211
  rubyforge_project:
267
212
  rubygems_version: 1.8.10
268
213
  signing_key:
269
214
  specification_version: 3
270
215
  summary: Sequel-based record linkage
271
216
  test_files: []
272
-