linkage 0.0.0 → 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.0.0
1
+ 0.0.1
@@ -27,21 +27,26 @@ module Linkage
27
27
 
28
28
  def group_records
29
29
  if config.linkage_type == :self
30
- add_groups(group_records_for(@dataset_1), 1)
30
+ group_records_for(@dataset_1, 1)
31
31
  else
32
- add_groups(group_records_for(@dataset_1, false), 1)
33
- add_groups(group_records_for(@dataset_2, false), 2)
32
+ group_records_for(@dataset_1, 1, false)
33
+ group_records_for(@dataset_2, 2, false)
34
34
  combine_groups
35
35
  end
36
36
  end
37
37
 
38
- def group_records_for(dataset, ignore_empty_groups = true)
39
- groups = []
38
+ # @param [Linkage::Dataset] dataset
39
+ # @param [Fixnum, nil] dataset_id
40
+ # @param [Boolean] ignore_empty_groups
41
+ # @yield [Linkage::Group] If a block is given, yield completed groups to
42
+ # the block. Otherwise, call save_group on the group.
43
+ def group_records_for(dataset, dataset_id = nil, ignore_empty_groups = true, &block)
40
44
  current_group = nil
45
+ block ||= lambda { |group| save_group(current_group, dataset_id) }
41
46
  dataset.each do |row|
42
47
  if current_group.nil? || !current_group.matches?(row[:values])
43
48
  if current_group && (!ignore_empty_groups || current_group.count > 1)
44
- groups << current_group
49
+ block.call(current_group)
45
50
  end
46
51
  new_group = Group.new(row[:values])
47
52
  current_group = new_group
@@ -49,33 +54,33 @@ module Linkage
49
54
  current_group.add_record(row[:pk])
50
55
  end
51
56
  if current_group && (!ignore_empty_groups || current_group.count > 1)
52
- groups << current_group
57
+ block.call(current_group)
53
58
  end
54
- groups
59
+ flush_buffers
55
60
  end
56
61
 
57
- def add_groups(groups, dataset_id = nil)
58
- return if groups.empty?
59
-
60
- groups_headers = [:id] + groups[0].values.keys
61
- groups_buffer = ImportBuffer.new(@uri, :groups, groups_headers, @options)
62
-
63
- groups_records_buffer = ImportBuffer.new(@uri, :groups_records, [:group_id, :dataset, :record_id], @options)
62
+ def save_group(group, dataset_id = nil)
63
+ if !@groups_buffer
64
+ groups_headers = [:id] + group.values.keys
65
+ @groups_buffer = ImportBuffer.new(@uri, :groups, groups_headers, @options)
66
+ end
67
+ @groups_records_buffer ||= ImportBuffer.new(@uri, :groups_records, [:group_id, :dataset, :record_id], @options)
64
68
 
65
- groups.each_with_index do |group, i|
66
- group_id = next_group_id
67
- groups_buffer.add([group_id] + group.values.values)
68
- group.records.each do |record_id|
69
- groups_records_buffer.add([group_id, dataset_id, record_id])
70
- end
69
+ group_id = next_group_id
70
+ @groups_buffer.add([group_id] + group.values.values)
71
+ group.records.each do |record_id|
72
+ @groups_records_buffer.add([group_id, dataset_id, record_id])
71
73
  end
72
- groups_buffer.flush
73
- groups_records_buffer.flush
74
+ end
75
+
76
+ def flush_buffers
77
+ @groups_buffer.flush if @groups_buffer
78
+ @groups_records_buffer.flush if @groups_records_buffer
74
79
  end
75
80
 
76
81
  def combine_groups
77
82
  # Create a new dataset for the groups table
78
- ds = Dataset.new(@uri, :groups, :single_threaded => true)
83
+ ds = Dataset.new(@uri, :groups, @options)
79
84
  ds.fields.each_value do |field|
80
85
  # Sort on all fields
81
86
  next if field.primary_key?
@@ -83,11 +88,10 @@ module Linkage
83
88
  ds.add_select(field)
84
89
  end
85
90
  ds.add_order(ds.primary_key) # ensure matching groups are sorted by id
86
- combined_groups = group_records_for(ds, false)
87
91
  database do |db|
88
92
  groups_to_delete = []
89
93
  db.transaction do # for speed reasons
90
- combined_groups.each do |group|
94
+ group_records_for(ds, nil, false) do |group|
91
95
  if group.count == 1
92
96
  # Delete the empty group
93
97
  groups_to_delete << group.records[0]
@@ -96,9 +100,9 @@ module Linkage
96
100
  # id, delete other groups.
97
101
  new_group_id = group.records[0]
98
102
  group.records[1..-1].each do |old_group_id|
99
- # There can only be a group with max size of 2, but this
100
- # adds in future support for matching more than 2 datasets
101
- # at once. Code smell?
103
+ # NOTE: There can only be a group with max size of 2, but
104
+ # this adds in future support for matching more than
105
+ # 2 datasets at once.
102
106
  db[:groups_records].filter(:group_id => old_group_id).
103
107
  update(:group_id => new_group_id)
104
108
  groups_to_delete << old_group_id
data/linkage.gemspec CHANGED
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = "linkage"
8
- s.version = "0.0.0"
8
+ s.version = "0.0.1"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Jeremy Stephens"]
12
- s.date = "2011-09-30"
12
+ s.date = "2011-10-25"
13
13
  s.description = "Wraps Sequel to perform record linkage between one or two datasets"
14
14
  s.email = "jeremy.f.stephens@vanderbilt.edu"
15
15
  s.extra_rdoc_files = [
@@ -27,11 +27,11 @@ module IntegrationTests
27
27
  conf = ds.link_with(ds) do
28
28
  lhs[:foo].must == rhs[:bar]
29
29
  end
30
- runner = Linkage::SingleThreadedRunner.new(conf, @tmpuri)
30
+ runner = Linkage::SingleThreadedRunner.new(conf, @tmpuri, :single_threaded => true)
31
31
  runner.execute
32
32
 
33
33
  database do |db|
34
- assert_equal 5, db[:groups].count
34
+ assert_equal 5, db[:groups].count, PP.pp(db[:groups].all, "")
35
35
  db[:groups].order(:foo_bar).each_with_index do |row, i|
36
36
  assert_equal i, row[:foo_bar]
37
37
  end
metadata CHANGED
@@ -1,206 +1,156 @@
1
- --- !ruby/object:Gem::Specification
1
+ --- !ruby/object:Gem::Specification
2
2
  name: linkage
3
- version: !ruby/object:Gem::Version
4
- hash: 31
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
5
  prerelease:
6
- segments:
7
- - 0
8
- - 0
9
- - 0
10
- version: 0.0.0
11
6
  platform: ruby
12
- authors:
7
+ authors:
13
8
  - Jeremy Stephens
14
9
  autorequire:
15
10
  bindir: bin
16
11
  cert_chain: []
17
-
18
- date: 2011-09-30 00:00:00 Z
19
- dependencies:
20
- - !ruby/object:Gem::Dependency
21
- requirement: &id001 !ruby/object:Gem::Requirement
22
- none: false
23
- requirements:
24
- - - ">="
25
- - !ruby/object:Gem::Version
26
- hash: 3
27
- segments:
28
- - 0
29
- version: "0"
30
- version_requirements: *id001
12
+ date: 2011-10-25 00:00:00.000000000Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
31
15
  name: sequel
32
- prerelease: false
16
+ requirement: &14148960 !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
33
22
  type: :runtime
34
- - !ruby/object:Gem::Dependency
35
- requirement: &id002 !ruby/object:Gem::Requirement
23
+ prerelease: false
24
+ version_requirements: *14148960
25
+ - !ruby/object:Gem::Dependency
26
+ name: bundler
27
+ requirement: &14147000 !ruby/object:Gem::Requirement
36
28
  none: false
37
- requirements:
29
+ requirements:
38
30
  - - ~>
39
- - !ruby/object:Gem::Version
40
- hash: 23
41
- segments:
42
- - 1
43
- - 0
44
- - 0
31
+ - !ruby/object:Gem::Version
45
32
  version: 1.0.0
46
- version_requirements: *id002
47
- name: bundler
48
- prerelease: false
49
33
  type: :development
50
- - !ruby/object:Gem::Dependency
51
- requirement: &id003 !ruby/object:Gem::Requirement
34
+ prerelease: false
35
+ version_requirements: *14147000
36
+ - !ruby/object:Gem::Dependency
37
+ name: jeweler
38
+ requirement: &14143380 !ruby/object:Gem::Requirement
52
39
  none: false
53
- requirements:
40
+ requirements:
54
41
  - - ~>
55
- - !ruby/object:Gem::Version
56
- hash: 7
57
- segments:
58
- - 1
59
- - 6
60
- - 4
42
+ - !ruby/object:Gem::Version
61
43
  version: 1.6.4
62
- version_requirements: *id003
63
- name: jeweler
64
- prerelease: false
65
44
  type: :development
66
- - !ruby/object:Gem::Dependency
67
- requirement: &id004 !ruby/object:Gem::Requirement
68
- none: false
69
- requirements:
70
- - - ">="
71
- - !ruby/object:Gem::Version
72
- hash: 3
73
- segments:
74
- - 0
75
- version: "0"
76
- version_requirements: *id004
77
- name: rcov
78
45
  prerelease: false
79
- type: :development
80
- - !ruby/object:Gem::Dependency
81
- requirement: &id005 !ruby/object:Gem::Requirement
46
+ version_requirements: *14143380
47
+ - !ruby/object:Gem::Dependency
48
+ name: rcov
49
+ requirement: &14142120 !ruby/object:Gem::Requirement
82
50
  none: false
83
- requirements:
84
- - - ">="
85
- - !ruby/object:Gem::Version
86
- hash: 3
87
- segments:
88
- - 0
89
- version: "0"
90
- version_requirements: *id005
91
- name: guard-test
51
+ requirements:
52
+ - - ! '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ type: :development
92
56
  prerelease: false
57
+ version_requirements: *14142120
58
+ - !ruby/object:Gem::Dependency
59
+ name: guard-test
60
+ requirement: &14139060 !ruby/object:Gem::Requirement
61
+ none: false
62
+ requirements:
63
+ - - ! '>='
64
+ - !ruby/object:Gem::Version
65
+ version: '0'
93
66
  type: :development
94
- - !ruby/object:Gem::Dependency
95
- requirement: &id006 !ruby/object:Gem::Requirement
67
+ prerelease: false
68
+ version_requirements: *14139060
69
+ - !ruby/object:Gem::Dependency
70
+ name: test-unit
71
+ requirement: &14137580 !ruby/object:Gem::Requirement
96
72
  none: false
97
- requirements:
98
- - - "="
99
- - !ruby/object:Gem::Version
100
- hash: 7
101
- segments:
102
- - 2
103
- - 3
104
- - 2
73
+ requirements:
74
+ - - =
75
+ - !ruby/object:Gem::Version
105
76
  version: 2.3.2
106
- version_requirements: *id006
107
- name: test-unit
108
- prerelease: false
109
77
  type: :development
110
- - !ruby/object:Gem::Dependency
111
- requirement: &id007 !ruby/object:Gem::Requirement
112
- none: false
113
- requirements:
114
- - - ">="
115
- - !ruby/object:Gem::Version
116
- hash: 3
117
- segments:
118
- - 0
119
- version: "0"
120
- version_requirements: *id007
121
- name: mocha
122
78
  prerelease: false
123
- type: :development
124
- - !ruby/object:Gem::Dependency
125
- requirement: &id008 !ruby/object:Gem::Requirement
79
+ version_requirements: *14137580
80
+ - !ruby/object:Gem::Dependency
81
+ name: mocha
82
+ requirement: &14135540 !ruby/object:Gem::Requirement
126
83
  none: false
127
- requirements:
128
- - - ">="
129
- - !ruby/object:Gem::Version
130
- hash: 3
131
- segments:
132
- - 0
133
- version: "0"
134
- version_requirements: *id008
135
- name: sqlite3
136
- prerelease: false
84
+ requirements:
85
+ - - ! '>='
86
+ - !ruby/object:Gem::Version
87
+ version: '0'
137
88
  type: :development
138
- - !ruby/object:Gem::Dependency
139
- requirement: &id009 !ruby/object:Gem::Requirement
140
- none: false
141
- requirements:
142
- - - ">="
143
- - !ruby/object:Gem::Version
144
- hash: 3
145
- segments:
146
- - 0
147
- version: "0"
148
- version_requirements: *id009
149
- name: yard
150
89
  prerelease: false
151
- type: :development
152
- - !ruby/object:Gem::Dependency
153
- requirement: &id010 !ruby/object:Gem::Requirement
90
+ version_requirements: *14135540
91
+ - !ruby/object:Gem::Dependency
92
+ name: sqlite3
93
+ requirement: &14133820 !ruby/object:Gem::Requirement
154
94
  none: false
155
- requirements:
156
- - - ">="
157
- - !ruby/object:Gem::Version
158
- hash: 3
159
- segments:
160
- - 0
161
- version: "0"
162
- version_requirements: *id010
163
- name: rake
164
- prerelease: false
95
+ requirements:
96
+ - - ! '>='
97
+ - !ruby/object:Gem::Version
98
+ version: '0'
165
99
  type: :development
166
- - !ruby/object:Gem::Dependency
167
- requirement: &id011 !ruby/object:Gem::Requirement
100
+ prerelease: false
101
+ version_requirements: *14133820
102
+ - !ruby/object:Gem::Dependency
103
+ name: yard
104
+ requirement: &14132420 !ruby/object:Gem::Requirement
168
105
  none: false
169
- requirements:
170
- - - ">="
171
- - !ruby/object:Gem::Version
172
- hash: 3
173
- segments:
174
- - 0
175
- version: "0"
176
- version_requirements: *id011
177
- name: versionomy
106
+ requirements:
107
+ - - ! '>='
108
+ - !ruby/object:Gem::Version
109
+ version: '0'
110
+ type: :development
178
111
  prerelease: false
112
+ version_requirements: *14132420
113
+ - !ruby/object:Gem::Dependency
114
+ name: rake
115
+ requirement: &14130000 !ruby/object:Gem::Requirement
116
+ none: false
117
+ requirements:
118
+ - - ! '>='
119
+ - !ruby/object:Gem::Version
120
+ version: '0'
179
121
  type: :development
180
- - !ruby/object:Gem::Dependency
181
- requirement: &id012 !ruby/object:Gem::Requirement
122
+ prerelease: false
123
+ version_requirements: *14130000
124
+ - !ruby/object:Gem::Dependency
125
+ name: versionomy
126
+ requirement: &14129120 !ruby/object:Gem::Requirement
182
127
  none: false
183
- requirements:
184
- - - ">="
185
- - !ruby/object:Gem::Version
186
- hash: 3
187
- segments:
188
- - 0
189
- version: "0"
190
- version_requirements: *id012
191
- name: guard-yard
128
+ requirements:
129
+ - - ! '>='
130
+ - !ruby/object:Gem::Version
131
+ version: '0'
132
+ type: :development
192
133
  prerelease: false
134
+ version_requirements: *14129120
135
+ - !ruby/object:Gem::Dependency
136
+ name: guard-yard
137
+ requirement: &14127660 !ruby/object:Gem::Requirement
138
+ none: false
139
+ requirements:
140
+ - - ! '>='
141
+ - !ruby/object:Gem::Version
142
+ version: '0'
193
143
  type: :development
144
+ prerelease: false
145
+ version_requirements: *14127660
194
146
  description: Wraps Sequel to perform record linkage between one or two datasets
195
147
  email: jeremy.f.stephens@vanderbilt.edu
196
148
  executables: []
197
-
198
149
  extensions: []
199
-
200
- extra_rdoc_files:
150
+ extra_rdoc_files:
201
151
  - LICENSE.txt
202
152
  - README.markdown
203
- files:
153
+ files:
204
154
  - .document
205
155
  - .vimrc
206
156
  - Gemfile
@@ -236,37 +186,31 @@ files:
236
186
  - test/unit/test_single_threaded_runner.rb
237
187
  - test/unit/test_utils.rb
238
188
  homepage: http://github.com/coupler/linkage
239
- licenses:
189
+ licenses:
240
190
  - MIT
241
191
  post_install_message:
242
192
  rdoc_options: []
243
-
244
- require_paths:
193
+ require_paths:
245
194
  - lib
246
- required_ruby_version: !ruby/object:Gem::Requirement
195
+ required_ruby_version: !ruby/object:Gem::Requirement
247
196
  none: false
248
- requirements:
249
- - - ">="
250
- - !ruby/object:Gem::Version
251
- hash: 3
252
- segments:
197
+ requirements:
198
+ - - ! '>='
199
+ - !ruby/object:Gem::Version
200
+ version: '0'
201
+ segments:
253
202
  - 0
254
- version: "0"
255
- required_rubygems_version: !ruby/object:Gem::Requirement
203
+ hash: 2889761151257020688
204
+ required_rubygems_version: !ruby/object:Gem::Requirement
256
205
  none: false
257
- requirements:
258
- - - ">="
259
- - !ruby/object:Gem::Version
260
- hash: 3
261
- segments:
262
- - 0
263
- version: "0"
206
+ requirements:
207
+ - - ! '>='
208
+ - !ruby/object:Gem::Version
209
+ version: '0'
264
210
  requirements: []
265
-
266
211
  rubyforge_project:
267
212
  rubygems_version: 1.8.10
268
213
  signing_key:
269
214
  specification_version: 3
270
215
  summary: Sequel-based record linkage
271
216
  test_files: []
272
-