linkage 0.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/.vimrc +34 -0
- data/Gemfile +17 -0
- data/Gemfile.lock +44 -0
- data/Guardfile +12 -0
- data/LICENSE.txt +20 -0
- data/README.markdown +64 -0
- data/Rakefile +58 -0
- data/VERSION +1 -0
- data/lib/linkage.rb +15 -0
- data/lib/linkage/configuration.rb +178 -0
- data/lib/linkage/dataset.rb +205 -0
- data/lib/linkage/expectation.rb +138 -0
- data/lib/linkage/field.rb +227 -0
- data/lib/linkage/group.rb +43 -0
- data/lib/linkage/import_buffer.rb +39 -0
- data/lib/linkage/runner.rb +59 -0
- data/lib/linkage/runner/single_threaded.rb +114 -0
- data/lib/linkage/utils.rb +164 -0
- data/linkage.gemspec +106 -0
- data/test/helper.rb +43 -0
- data/test/integration/test_cross_linkage.rb +68 -0
- data/test/integration/test_dual_linkage.rb +85 -0
- data/test/integration/test_self_linkage.rb +209 -0
- data/test/unit/test_configuration.rb +145 -0
- data/test/unit/test_dataset.rb +274 -0
- data/test/unit/test_expectation.rb +294 -0
- data/test/unit/test_field.rb +447 -0
- data/test/unit/test_group.rb +21 -0
- data/test/unit/test_import_buffer.rb +51 -0
- data/test/unit/test_linkage.rb +6 -0
- data/test/unit/test_runner.rb +14 -0
- data/test/unit/test_single_threaded_runner.rb +12 -0
- data/test/unit/test_utils.rb +341 -0
- metadata +272 -0
data/.document
ADDED
data/.vimrc
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
" Returns the path of the file paired with the current buffer: the unit test
" for a file under lib/, or the library file for a test under test/.
" Returns '' when the buffer is in neither tree.
function! s:AlternateFile()
  " Buffer path relative to the working directory. The ':.' modifier avoids
  " treating getcwd() as a regular expression.
  let fn = fnamemodify(expand('%'), ':.')
  let head = fnamemodify(fn, ':h')
  let tail = fnamemodify(fn, ':t')

  if match(head, '^lib') >= 0
    " '/linkage' is optional so that lib/linkage.rb, which sits directly under
    " lib/, maps to test/unit/test_linkage.rb.
    return substitute(head, '^lib\%(/linkage\)\=', 'test/unit', '').'/test_'.tail
  elseif match(head, '^test') >= 0
    let name = substitute(tail, '^test_', '', '')
    let target = substitute(head, '^test/unit', 'lib/linkage', '').'/'.name
    " Reverse of the special case above: prefer lib/<name> when the usual
    " lib/linkage/<name> target does not exist but lib/<name> does.
    if head ==# 'test/unit' && !filereadable(target) && filereadable('lib/'.name)
      return 'lib/'.name
    endif
    return target
  endif
  return ''
endfunction

" Opens the alternate file. a:cmd selects how: 'T' opens a new tab, 'S' a
" horizontal split, 'V' a vertical split; anything else edits in the current
" window. The alternate is not required to exist on disk (the readability
" check was already disabled in the original), so a missing file can still be
" opened as a new buffer.
function! s:Alternate(cmd)
  let file = s:AlternateFile()
  if file == ''
    " Without this guard a bare ':e ' would just re-edit the current buffer.
    echomsg 'No alternate file is defined'
    return
  endif

  if a:cmd ==# 'T'
    let cmd = 'tabe'
  elseif a:cmd ==# 'S'
    let cmd = 'sp'
  elseif a:cmd ==# 'V'
    let cmd = 'vsp'
  else
    let cmd = 'e'
  endif
  " fnameescape() keeps spaces and special characters in the path intact.
  exe ':'.cmd.' '.fnameescape(file)
endfunction

command! A :call s:Alternate('')
command! AE :call s:Alternate('E')
command! AS :call s:Alternate('S')
command! AV :call s:Alternate('V')
command! AT :call s:Alternate('T')
|
data/Gemfile
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
source "http://rubygems.org"
|
2
|
+
|
3
|
+
gem "sequel"
|
4
|
+
|
5
|
+
group :development do
|
6
|
+
gem "bundler", "~> 1.0.0"
|
7
|
+
gem "jeweler", "~> 1.6.4"
|
8
|
+
gem "rcov", ">= 0"
|
9
|
+
gem "guard-test"
|
10
|
+
gem "test-unit", "2.3.2"
|
11
|
+
gem "mocha"
|
12
|
+
gem "sqlite3"
|
13
|
+
gem "yard"
|
14
|
+
gem "rake"
|
15
|
+
gem "versionomy"
|
16
|
+
gem "guard-yard", :platforms => :ruby_19
|
17
|
+
end
|
data/Gemfile.lock
ADDED
@@ -0,0 +1,44 @@
|
|
1
|
+
GEM
|
2
|
+
remote: http://rubygems.org/
|
3
|
+
specs:
|
4
|
+
blockenspiel (0.4.3)
|
5
|
+
git (1.2.5)
|
6
|
+
guard (0.6.2)
|
7
|
+
thor (~> 0.14.6)
|
8
|
+
guard-test (0.3.0)
|
9
|
+
guard (>= 0.2.2)
|
10
|
+
test-unit (~> 2.2)
|
11
|
+
guard-yard (1.0.1)
|
12
|
+
guard (>= 0.2.2)
|
13
|
+
yard (>= 0.7.0)
|
14
|
+
jeweler (1.6.4)
|
15
|
+
bundler (~> 1.0)
|
16
|
+
git (>= 1.2.5)
|
17
|
+
rake
|
18
|
+
mocha (0.9.12)
|
19
|
+
rake (0.9.2)
|
20
|
+
rcov (0.9.10)
|
21
|
+
sequel (3.26.0)
|
22
|
+
sqlite3 (1.3.3)
|
23
|
+
test-unit (2.3.2)
|
24
|
+
thor (0.14.6)
|
25
|
+
versionomy (0.4.1)
|
26
|
+
blockenspiel (>= 0.4.1)
|
27
|
+
yard (0.7.2)
|
28
|
+
|
29
|
+
PLATFORMS
|
30
|
+
ruby
|
31
|
+
|
32
|
+
DEPENDENCIES
|
33
|
+
bundler (~> 1.0.0)
|
34
|
+
guard-test
|
35
|
+
guard-yard
|
36
|
+
jeweler (~> 1.6.4)
|
37
|
+
mocha
|
38
|
+
rake
|
39
|
+
rcov
|
40
|
+
sequel
|
41
|
+
sqlite3
|
42
|
+
test-unit (= 2.3.2)
|
43
|
+
versionomy
|
44
|
+
yard
|
data/Guardfile
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
guard 'test' do
|
2
|
+
watch(%r{^lib/linkage/runner/([^/]+)\.rb$}) { |m| "test/unit/test_#{m[1]}_runner.rb" }
|
3
|
+
watch(%r{^lib/linkage/([^/]+)\.rb$}) { |m| "test/unit/test_#{m[1]}.rb" }
|
4
|
+
watch(%r{^test/unit/test_.+\.rb$})
|
5
|
+
watch(%r{^test/integration/test_.+\.rb$})
|
6
|
+
watch('lib/linkage/configuration.rb') { "test/unit/test_dataset.rb" }
|
7
|
+
watch('test/helper.rb') { "test" }
|
8
|
+
end
|
9
|
+
|
10
|
+
guard 'yard' do
|
11
|
+
watch(%r{lib/[^.].*\.rb$})
|
12
|
+
end
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2011 Vanderbilt University
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.markdown
ADDED
@@ -0,0 +1,64 @@
|
|
1
|
+
# linkage
|
2
|
+
|
3
|
+
Linkage is a library for record linkage between one or two database tables.
|
4
|
+
|
5
|
+
## Usage
|
6
|
+
|
7
|
+
Linkage uses Sequel to talk to databases, so any database that Sequel can
|
8
|
+
talk to, Linkage can talk to. You just give Linkage the Sequel-style URI
|
9
|
+
and the database table name:
|
10
|
+
|
11
|
+
ds = Linkage::Dataset.new('mysql://example.com/database_name', 'table_name')
|
12
|
+
|
13
|
+
To describe a linkage, you use the `Dataset#link_with` method.
|
14
|
+
|
15
|
+
parents = Linkage::Dataset.new('postgres://example.com/foo', 'parents')
|
16
|
+
children = Linkage::Dataset.new('mysql://some-other-host.net/bar', 'children')
|
17
|
+
config = parents.link_with(children) do
|
18
|
+
lhs[:first_name].must == rhs[:parent_first_name]
|
19
|
+
lhs[:last_name].must == rhs[:parent_last_name]
|
20
|
+
lhs[:last_name].must_not == "Smith" # exclude parents with the last
|
21
|
+
# name "Smith"
|
22
|
+
end
|
23
|
+
|
24
|
+
Note that the datasets don't have to be in the same database, or even on
|
25
|
+
the same machine.
|
26
|
+
|
27
|
+
To run a linkage, use a Runner with the resulting configuration from
|
28
|
+
`Dataset#link_with`:
|
29
|
+
|
30
|
+
runner = Linkage::SingleThreadedRunner.new(config, 'sqlite://results.db')
|
31
|
+
runner.execute
|
32
|
+
|
33
|
+
The runner needs a database URI, since it stores its results in two
|
34
|
+
database tables: `groups` and `groups_records`. The `groups` table contains
|
35
|
+
all of the unique combinations of values in your datasets, and
|
36
|
+
`groups_records` maps records to groups.
|
37
|
+
|
38
|
+
You can also link a dataset to itself:
|
39
|
+
|
40
|
+
births = Linkage::Dataset.new('postgres://example.com/hospital_data', 'births')
|
41
|
+
config = births.link_with(births) do
|
42
|
+
lhs[:mother_first_name].must == rhs[:mother_first_name]
|
43
|
+
lhs[:mother_last_name].must == rhs[:mother_last_name]
|
44
|
+
end
|
45
|
+
runner = Linkage::SingleThreadedRunner.new(config, 'sqlite://results.db')
|
46
|
+
runner.execute
|
47
|
+
|
48
|
+
The above example would find birth records that have mothers with the same
|
49
|
+
name.
|
50
|
+
|
51
|
+
## Contributing to linkage
|
52
|
+
|
53
|
+
* Check out the latest master to make sure the feature hasn't been implemented or the bug hasn't been fixed yet
|
54
|
+
* Check out the issue tracker to make sure someone hasn't already requested it and/or contributed it
|
55
|
+
* Fork the project
|
56
|
+
* Start a feature/bugfix branch
|
57
|
+
* Commit and push until you are happy with your contribution
|
58
|
+
* Make sure to add tests for it. This is important so I don't break it in a future version unintentionally.
|
59
|
+
|
60
|
+
## Copyright
|
61
|
+
|
62
|
+
Copyright (c) 2011 Vanderbilt University. See LICENSE.txt for
|
63
|
+
further details.
|
64
|
+
|
data/Rakefile
ADDED
@@ -0,0 +1,58 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require 'rubygems'
require 'bundler'

# Activate the gems from the Gemfile's default and development groups. When
# Bundler reports a problem, print it and exit with Bundler's own status code.
begin
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => e
  $stderr.puts e.message
  $stderr.puts "Run `bundle install` to install missing gems"
  exit e.status_code
end
require 'rake'

# Gem packaging and release tasks.
require 'jeweler'
Jeweler::Tasks.new do |gem|
  # gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
  gem.name = "linkage"
  gem.homepage = "http://github.com/coupler/linkage"
  gem.license = "MIT"
  gem.summary = %Q{Sequel-based record linkage}
  gem.description = %Q{Wraps Sequel to perform record linkage between one or two datasets}
  gem.email = "jeremy.f.stephens@vanderbilt.edu"
  gem.authors = ["Jeremy Stephens"]
  # dependencies defined in Gemfile
end
Jeweler::RubygemsDotOrgTasks.new

# `rake test` runs every test_*.rb file under test/.
require 'rake/testtask'
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib' << 'test'
  test.pattern = 'test/**/test_*.rb'
  test.verbose = true
end

# Coverage run over the same test files, excluding installed gems.
require 'rcov/rcovtask'
Rcov::RcovTask.new do |test|
  test.libs << 'test'
  test.pattern = 'test/**/test_*.rb'
  test.verbose = true
  test.rcov_opts << '--exclude "gems/*"'
end

task :default => :test

# Prefer RDoc's own task class; rake/rdoctask is the deprecated predecessor
# and is only used when rdoc/task cannot be loaded.
begin
  require 'rdoc/task'
  rdoc_task_class = RDoc::Task
rescue LoadError
  require 'rake/rdoctask'
  rdoc_task_class = Rake::RDocTask
end
rdoc_task_class.new do |rdoc|
  # Strip the trailing newline so it does not end up in the title.
  version = File.exist?('VERSION') ? File.read('VERSION').strip : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "linkage #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end

# YARD documentation for everything under lib/.
require 'yard'
YARD::Rake::YardocTask.new do |t|
  t.files = ['lib/**/*.rb']
end
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.0.0
|
data/lib/linkage.rb
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
require 'pathname'
require 'sequel'

# Top-level namespace; the components required below reopen it.
module Linkage
end

# Load each component from the sibling "linkage" directory. The order is
# significant: configuration.rb reads Linkage::Expectation::VALID_OPERATORS
# while its class body is evaluated, so expectation must be loaded before it.
linkage_dir = Pathname.new(File.expand_path(File.dirname(__FILE__))) + 'linkage'
%w[utils dataset runner expectation field group import_buffer configuration].each do |component|
  require linkage_dir + component
end
|
@@ -0,0 +1,178 @@
|
|
1
|
+
module Linkage
  # {Configuration} is used to configure linkages. When you call
  # {Dataset#link_with}, the block you supply gets called in the context of
  # an instance of {Configuration}.
  #
  # @example
  #   dataset_1 = Linkage::Dataset.new("mysql://example.com/database_name", "table_1")
  #   dataset_2 = Linkage::Dataset.new("mysql://example.com/database_name", "table_2")
  #   dataset_1.link_with(dataset_2) do
  #     # this gets run inside of a Configuration instance
  #   end
  #
  # @see Dataset#link_with
  class Configuration
    # Returned by {FieldWrapper#must} and {FieldWrapper#must_not}. Calling one
    # of the operator methods on it builds an expectation and registers it
    # with the configuration.
    #
    # @private
    class ExpectationWrapper
      # @param type [Symbol] expectation type handed to Expectation.get
      #   (FieldWrapper passes :must or :must_not)
      # @param field [FieldWrapper] the wrapped field on the left of the operator
      # @param config [Configuration] the configuration that collects expectations
      def initialize(type, field, config)
        @type = type
        @field = field
        @config = config
        @side = nil
        @forced_kind = nil
      end

      # Defines one instance method per operator in
      # Linkage::Expectation::VALID_OPERATORS. Each accepts either another
      # FieldWrapper or a plain value.
      Linkage::Expectation::VALID_OPERATORS.each do |op|
        define_method(op) do |other|
          case other
          when FieldWrapper
            @other = other.field
            if other.side == @field.side
              # Both fields come from the same side, so the expectation is
              # forced to be a filter on that side.
              @forced_kind = :filter
              @side = @field.side
            end
          else
            # Comparison against a plain value: record which side it applies to.
            @other = other
            @side = @field.side
          end
          add_expectation(op)
        end
      end

      private

      # Builds the expectation for +operator+ and hands it, together with the
      # recorded side, to the configuration.
      def add_expectation(operator)
        klass = Expectation.get(@type)
        exp = klass.new(operator, @field.field, @other, @forced_kind)
        @config.add_expectation(exp, @side)
      end
    end

    # Pairs a dataset field with the side (:lhs or :rhs) it was looked up on.
    #
    # @private
    class FieldWrapper
      attr_reader :field, :side

      # @param field the field object taken from the dataset's fields
      # @param side [Symbol] :lhs or :rhs
      # @param config [Configuration]
      def initialize(field, side, config)
        @field = field
        @side = side
        @config = config
      end

      # @return [ExpectationWrapper] wrapper of type :must for this field
      def must
        ExpectationWrapper.new(:must, self, @config)
      end

      # @return [ExpectationWrapper] wrapper of type :must_not for this field
      def must_not
        ExpectationWrapper.new(:must_not, self, @config)
      end
    end

    # What +lhs+ and +rhs+ return inside the configuration block; gives access
    # to a dataset's fields by name.
    #
    # @private
    class DatasetWrapper
      # @param dataset [Linkage::Dataset]
      # @param side [Symbol] :lhs or :rhs
      # @param config [Configuration]
      def initialize(dataset, side, config)
        @dataset = dataset
        @side = side
        @config = config
      end

      # Looks up a field of the wrapped dataset.
      #
      # @param field_name key into the dataset's fields
      # @return [FieldWrapper]
      # @raise [ArgumentError] if the dataset has no field with that name
      def [](field_name)
        field = @dataset.fields[field_name]
        if field.nil?
          raise ArgumentError, "The '#{field_name}' field doesn't exist for that dataset!"
        end
        FieldWrapper.new(field, @side, @config)
      end
    end

    include Utils

    # @return [Symbol] :self, :dual, or :cross
    attr_reader :linkage_type

    # @return [Array<Linkage::Expectation>]
    attr_reader :expectations

    # @return [Linkage::Dataset]
    attr_reader :dataset_1

    # @return [Linkage::Dataset]
    attr_reader :dataset_2

    # Stores clones of both datasets. The linkage starts as :self when the two
    # datasets compare equal and as :dual otherwise.
    #
    # @param dataset_1 [Linkage::Dataset]
    # @param dataset_2 [Linkage::Dataset]
    def initialize(dataset_1, dataset_2)
      @dataset_1 = dataset_1.clone
      @dataset_2 = dataset_2.clone
      @expectations = []
      @linkage_type = dataset_1 == dataset_2 ? :self : :dual
      @lhs_filters = []
      @rhs_filters = []
    end

    # @return [DatasetWrapper] memoized wrapper around the first dataset
    def lhs
      @lhs ||= DatasetWrapper.new(@dataset_1, :lhs, self)
    end

    # @return [DatasetWrapper] memoized wrapper around the second dataset
    def rhs
      @rhs ||= DatasetWrapper.new(@dataset_2, :rhs, self)
    end

    # Records an expectation and, for a self-linkage, decides whether it turns
    # the linkage into a cross linkage.
    #
    # @param expectation [Linkage::Expectation]
    # @param side [Symbol, nil] :lhs or :rhs; required when the linkage is a
    #   self-linkage and the expectation's kind is :filter
    # @raise [ArgumentError] if a filter expectation is added to a self-linkage
    #   without a valid side
    # @private
    def add_expectation(expectation, side = nil)
      # Validate before mutating any state. Without a side there is no filter
      # list to compare against, and the lookup below would fail on nil.
      filter_on_self = @linkage_type == :self && expectation.kind == :filter
      if filter_on_self && side != :lhs && side != :rhs
        raise ArgumentError, "side must be :lhs or :rhs for a filter expectation (got #{side.inspect})"
      end

      # If the expectation created turns the linkage type from a self to a
      # cross, then the dataset gets a new id. This is so that
      # Expectation#apply does the right thing.

      @expectations << expectation
      if @linkage_type == :self
        cross = false

        case expectation.kind
        when :cross
          cross = true
        when :filter
          # If there are different filters on both 'sides' of a self-linkage,
          # it turns into a cross linkage.
          these_filters, other_filters =
            case side
            when :lhs
              [@lhs_filters, @rhs_filters]
            when :rhs
              [@rhs_filters, @lhs_filters]
            end

          if !other_filters.empty? && !other_filters.include?(expectation)
            cross = true
          else
            these_filters << expectation
          end
        end

        if cross
          @linkage_type = :cross
          @dataset_2.send(:set_new_id)
        end
      end
    end

    # Describes the columns of the groups table: an integer primary key :id
    # followed by one column per non-filter expectation, typed from the
    # expectation's merged field.
    #
    # @return [Array<Array>] one [name, type, options] triple per column
    # @private
    def groups_table_schema
      schema = []

      # add id
      schema << [:id, Integer, {:primary_key => true}]

      # add values
      @expectations.each do |exp|
        # Filter expectations contribute no column.
        next if exp.kind == :filter

        merged_type = exp.merged_field.ruby_type
        schema << [exp.name, merged_type[:type], merged_type[:opts] || {}]
      end

      schema
    end

    # Returns the same string as #to_s.
    #
    # @private
    def inspect
      to_s
    end
  end
end
|