linkage 0.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,5 @@
1
+ lib/**/*.rb
2
+ bin/*
3
+ -
4
+ features/**/*.feature
5
+ LICENSE.txt
data/.vimrc ADDED
@@ -0,0 +1,34 @@
1
" Returns the path of the file paired with the current buffer:
"   lib/linkage/foo.rb    -> test/unit/test_foo.rb
"   test/unit/test_foo.rb -> lib/linkage/foo.rb
" Returns '' when the buffer lives under neither lib nor test.
function! s:AlternateFile()
  " ':.' makes the name relative to the current directory. Building a regex
  " from getcwd() would misbehave when the directory name contains pattern
  " metacharacters such as '.', '~' or '['.
  let fn = fnamemodify(expand('%'), ':.')
  let head = fnamemodify(fn, ':h')
  let tail = fnamemodify(fn, ':t')

  if match(head, '^lib') >= 0
    return substitute(head, '^lib/linkage', 'test/unit', '').'/test_'.tail
  elseif match(head, '^test') >= 0
    return substitute(head, '^test/unit', 'lib/linkage', '').'/'.substitute(tail, '^test_', '', '')
  endif
  return ''
endfunction
13
+
14
" Opens the alternate file of the current buffer.
" a:cmd selects where it opens: 'T' in a new tab, 'S' in a horizontal split,
" 'V' in a vertical split; any other value (including '' and 'E') edits it in
" the current window.
function! s:Alternate(cmd)
  let file = s:AlternateFile()
  if empty(file)
    " Without a target, ':e ' would only re-edit the current buffer.
    echomsg 'No alternate file is defined'
    return
  endif

  if a:cmd == 'T'
    let cmd = 'tabe'
  elseif a:cmd == 'S'
    let cmd = 'sp'
  elseif a:cmd == 'V'
    let cmd = 'vsp'
  else
    let cmd = 'e'
  endif

  " The file is not required to exist; a missing alternate opens as a new,
  " empty buffer. fnameescape() protects spaces and special characters.
  exe ':'.cmd.' '.fnameescape(file)
endfunction
29
+
30
" User commands for jumping to the alternate file. Each one forwards a
" one-letter mode ('' for the plain :A form) to s:Alternate().
command! A  call s:Alternate('')
command! AE call s:Alternate('E')
command! AS call s:Alternate('S')
command! AV call s:Alternate('V')
command! AT call s:Alternate('T')
data/Gemfile ADDED
@@ -0,0 +1,17 @@
1
# Fetch gems over HTTPS so that downloads cannot be read or altered in transit.
source "https://rubygems.org"

# Runtime dependency: Linkage talks to databases through Sequel.
gem "sequel"

# Tooling used only while developing, testing and documenting the gem.
group :development do
  gem "bundler", "~> 1.0.0"
  gem "jeweler", "~> 1.6.4"
  gem "rcov", ">= 0"
  gem "guard-test"
  gem "test-unit", "2.3.2"
  gem "mocha"
  gem "sqlite3"
  gem "yard"
  gem "rake"
  gem "versionomy"
  gem "guard-yard", :platforms => :ruby_19
end
@@ -0,0 +1,44 @@
1
+ GEM
2
+ remote: http://rubygems.org/
3
+ specs:
4
+ blockenspiel (0.4.3)
5
+ git (1.2.5)
6
+ guard (0.6.2)
7
+ thor (~> 0.14.6)
8
+ guard-test (0.3.0)
9
+ guard (>= 0.2.2)
10
+ test-unit (~> 2.2)
11
+ guard-yard (1.0.1)
12
+ guard (>= 0.2.2)
13
+ yard (>= 0.7.0)
14
+ jeweler (1.6.4)
15
+ bundler (~> 1.0)
16
+ git (>= 1.2.5)
17
+ rake
18
+ mocha (0.9.12)
19
+ rake (0.9.2)
20
+ rcov (0.9.10)
21
+ sequel (3.26.0)
22
+ sqlite3 (1.3.3)
23
+ test-unit (2.3.2)
24
+ thor (0.14.6)
25
+ versionomy (0.4.1)
26
+ blockenspiel (>= 0.4.1)
27
+ yard (0.7.2)
28
+
29
+ PLATFORMS
30
+ ruby
31
+
32
+ DEPENDENCIES
33
+ bundler (~> 1.0.0)
34
+ guard-test
35
+ guard-yard
36
+ jeweler (~> 1.6.4)
37
+ mocha
38
+ rake
39
+ rcov
40
+ sequel
41
+ sqlite3
42
+ test-unit (= 2.3.2)
43
+ versionomy
44
+ yard
@@ -0,0 +1,12 @@
1
# Re-run the relevant tests whenever a library or test file changes.
guard 'test' do
  # A runner implementation maps to its "_runner" unit test.
  watch(%r{^lib/linkage/runner/([^/]+)\.rb$}) do |match|
    "test/unit/test_#{match[1]}_runner.rb"
  end

  # Any other library file maps to the unit test of the same name.
  watch(%r{^lib/linkage/([^/]+)\.rb$}) do |match|
    "test/unit/test_#{match[1]}.rb"
  end

  # Changed test files are watched directly.
  watch(%r{^test/unit/test_.+\.rb$})
  watch(%r{^test/integration/test_.+\.rb$})

  # Configuration changes additionally trigger the dataset tests.
  watch('lib/linkage/configuration.rb') do
    "test/unit/test_dataset.rb"
  end

  # A change to the shared helper maps to the whole test directory.
  watch('test/helper.rb') do
    "test"
  end
end

# Watch Ruby files under lib for the yard guard.
guard 'yard' do
  watch(%r{lib/[^.].*\.rb$})
end
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2011 Vanderbilt University
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,64 @@
1
+ # linkage
2
+
3
+ Linkage is a library for record linkage within a single database table or between two database tables.
4
+
5
+ ## Usage
6
+
7
+ Linkage uses Sequel to talk to databases, so any database that Sequel can
8
+ talk to, Linkage can talk to. You just give Linkage the Sequel-style URI
9
+ and the database table name:
10
+
11
+ ds = Linkage::Dataset.new('mysql://example.com/database_name', 'table_name')
12
+
13
+ To describe a linkage, you use the `Dataset#link_with` method.
14
+
15
+ parents = Linkage::Dataset.new('postgres://example.com/foo', 'parents')
16
+ children = Linkage::Dataset.new('mysql://some-other-host.net/bar', 'children')
17
+ config = parents.link_with(children) do
18
+ lhs[:first_name].must == rhs[:parent_first_name]
19
+ lhs[:last_name].must == rhs[:parent_last_name]
20
+ lhs[:last_name].must_not == "Smith" # exclude parents with the last
21
+ # name "Smith"
22
+ end
23
+
24
+ Note that the datasets don't have to be in the same database, or even on
25
+ the same machine.
26
+
27
+ To run a linkage, use a Runner with the resulting configuration from
28
+ `Dataset#link_with`:
29
+
30
+ runner = Linkage::SingleThreadedRunner.new(config, 'sqlite://results.db')
31
+ runner.execute
32
+
33
+ The runner needs a database URI, since it stores its results in two
34
+ database tables: `groups` and `groups_records`. The `groups` table contains
35
+ all of the unique combinations of values in your datasets, and
36
+ `groups_records` maps records to groups.
37
+
38
+ You can also link a dataset to itself:
39
+
40
+ births = Linkage::Dataset.new('postgres://example.com/hospital_data', 'births')
41
+ config = births.link_with(births) do
42
+ lhs[:mother_first_name].must == rhs[:mother_first_name]
43
+ lhs[:mother_last_name].must == rhs[:mother_last_name]
44
+ end
45
+ runner = Linkage::SingleThreadedRunner.new(config, 'sqlite://results.db')
46
+ runner.execute
47
+
48
+ The above example would find birth records that have mothers with the same
49
+ name.
50
+
51
+ ## Contributing to linkage
52
+
53
+ * Check out the latest master to make sure the feature hasn't been implemented or the bug hasn't been fixed yet
54
+ * Check out the issue tracker to make sure someone hasn't already requested it and/or contributed it
55
+ * Fork the project
56
+ * Start a feature/bugfix branch
57
+ * Commit and push until you are happy with your contribution
58
+ * Make sure to add tests for it. This is important so I don't break it in a future version unintentionally.
59
+
60
+ ## Copyright
61
+
62
+ Copyright (c) 2011 Vanderbilt University. See LICENSE.txt for
63
+ further details.
64
+
@@ -0,0 +1,58 @@
1
# encoding: utf-8

require 'rubygems'
require 'bundler'

# Activate the bundled gems; on failure print the reason and exit with
# Bundler's own status code instead of a raw backtrace.
begin
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => e
  $stderr.puts e.message
  $stderr.puts "Run `bundle install` to install missing gems"
  exit e.status_code
end
require 'rake'

# Gem packaging and release tasks.
require 'jeweler'
Jeweler::Tasks.new do |gem|
  # gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
  gem.name = "linkage"
  gem.homepage = "http://github.com/coupler/linkage"
  gem.license = "MIT"
  gem.summary = %Q{Sequel-based record linkage}
  gem.description = %Q{Wraps Sequel to perform record linkage between one or two datasets}
  gem.email = "jeremy.f.stephens@vanderbilt.edu"
  gem.authors = ["Jeremy Stephens"]
  # dependencies defined in Gemfile
end
Jeweler::RubygemsDotOrgTasks.new

# `rake test` runs every test_*.rb file under test/.
require 'rake/testtask'
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib' << 'test'
  test.pattern = 'test/**/test_*.rb'
  test.verbose = true
end

# Coverage run over the same test files, excluding installed gems.
require 'rcov/rcovtask'
Rcov::RcovTask.new do |test|
  test.libs << 'test'
  test.pattern = 'test/**/test_*.rb'
  test.verbose = true
  test.rcov_opts << '--exclude "gems/*"'
end

task :default => :test

# rake/rdoctask is deprecated since Rake 0.9 and absent from later releases.
# Prefer the task class shipped with RDoc and fall back to the Rake one only
# when the installed RDoc is too old to provide it.
begin
  require 'rdoc/task'
  rdoc_task = RDoc::Task
rescue LoadError
  require 'rake/rdoctask'
  rdoc_task = Rake::RDocTask
end

rdoc_task.new do |rdoc|
  # Strip surrounding whitespace so a trailing newline in VERSION does not
  # end up in the documentation title.
  version = File.exist?('VERSION') ? File.read('VERSION').strip : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "linkage #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end

# YARD documentation for the library sources.
require 'yard'
YARD::Rake::YardocTask.new do |t|
  t.files = ['lib/**/*.rb']
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.0.0
@@ -0,0 +1,15 @@
1
require 'pathname'
require 'sequel'

# Top-level namespace of the library; its contents come from the files
# loaded below.
module Linkage
end

# Load every component from the "linkage" directory next to this file.
# The order is significant: configuration refers to Linkage::Expectation and
# Utils while its class body is being evaluated, so it is loaded last.
linkage_dir = Pathname.new(File.expand_path(File.dirname(__FILE__))) + 'linkage'
%w[utils dataset runner expectation field group import_buffer configuration].each do |component|
  require linkage_dir + component
end
@@ -0,0 +1,178 @@
1
module Linkage
  # {Configuration} is used to configure linkages. When you call
  # {Dataset#link_with}, the block you supply gets called in the context of
  # an instance of {Configuration}.
  #
  # @example
  #   dataset_1 = Linkage::Dataset.new("mysql://example.com/database_name", "table_1")
  #   dataset_2 = Linkage::Dataset.new("mysql://example.com/database_name", "table_2")
  #   dataset_1.link_with(dataset_2) do
  #     # this gets run inside of a Configuration instance
  #   end
  #
  # @see Dataset#link_with
  class Configuration
    # Returned by {FieldWrapper#must} and {FieldWrapper#must_not}. Applying a
    # comparison operator to it builds an expectation and registers it with
    # the configuration.
    #
    # @private
    class ExpectationWrapper
      # @param type [Symbol] expectation type passed to Expectation.get
      #   (:must or :must_not)
      # @param field [FieldWrapper] the wrapped field the operator is applied to
      # @param config [Configuration] configuration that receives the expectation
      def initialize(type, field, config)
        @type = type
        @field = field
        @config = config
      end

      # Defines one instance method per valid operator. +other+ is either a
      # FieldWrapper or a plain value to compare the field against.
      Linkage::Expectation::VALID_OPERATORS.each do |op|
        define_method(op) do |other|
          # The side and forced kind are computed afresh on every call, so a
          # wrapper that is used more than once never carries the outcome of
          # an earlier comparison into a later one.
          side = nil
          forced_kind = nil

          case other
          when FieldWrapper
            target = other.field
            if other.side == @field.side
              # Both fields come from the same side: this restricts that
              # side rather than linking the two sides.
              forced_kind = :filter
              side = @field.side
            end
          else
            target = other
            side = @field.side
          end

          add_expectation(op, target, side, forced_kind)
        end
      end

      private

      # Instantiates the expectation class for this wrapper's type and hands
      # the result to the configuration.
      def add_expectation(operator, other, side, forced_kind)
        klass = Expectation.get(@type)
        exp = klass.new(operator, @field.field, other, forced_kind)
        @config.add_expectation(exp, side)
      end
    end

    # Pairs a field with the side (:lhs or :rhs) it was looked up on.
    #
    # @private
    class FieldWrapper
      attr_reader :field, :side

      def initialize(field, side, config)
        @field = field
        @side = side
        @config = config
      end

      # @return [ExpectationWrapper] wrapper of type :must
      def must
        ExpectationWrapper.new(:must, self, @config)
      end

      # @return [ExpectationWrapper] wrapper of type :must_not
      def must_not
        ExpectationWrapper.new(:must_not, self, @config)
      end
    end

    # Gives access to a dataset's fields by name from inside the
    # configuration block.
    #
    # @private
    class DatasetWrapper
      def initialize(dataset, side, config)
        @dataset = dataset
        @side = side
        @config = config
      end

      # @param field_name key into the dataset's +fields+ collection
      # @return [FieldWrapper]
      # @raise [ArgumentError] if the dataset has no field with that name
      def [](field_name)
        field = @dataset.fields[field_name]
        if field.nil?
          raise ArgumentError, "The '#{field_name}' field doesn't exist for that dataset!"
        end
        FieldWrapper.new(field, @side, @config)
      end
    end

    include Utils

    # @return [Symbol] :self, :dual, or :cross
    attr_reader :linkage_type

    # @return [Array<Linkage::Expectation>]
    attr_reader :expectations

    # @return [Linkage::Dataset]
    attr_reader :dataset_1

    # @return [Linkage::Dataset]
    attr_reader :dataset_2

    # Stores clones of both datasets. The linkage starts out as :self when
    # the two datasets compare equal and as :dual otherwise.
    #
    # @param dataset_1 [Linkage::Dataset]
    # @param dataset_2 [Linkage::Dataset]
    def initialize(dataset_1, dataset_2)
      @dataset_1 = dataset_1.clone
      @dataset_2 = dataset_2.clone
      @expectations = []
      @linkage_type = dataset_1 == dataset_2 ? :self : :dual
      @lhs_filters = []
      @rhs_filters = []
    end

    # @return [DatasetWrapper] memoized wrapper around the first dataset
    def lhs
      @lhs ||= DatasetWrapper.new(@dataset_1, :lhs, self)
    end

    # @return [DatasetWrapper] memoized wrapper around the second dataset
    def rhs
      @rhs ||= DatasetWrapper.new(@dataset_2, :rhs, self)
    end

    # Records an expectation and, for a self-linkage, decides whether it
    # turns the linkage into a cross linkage.
    #
    # @param expectation [Linkage::Expectation]
    # @param side [Symbol, nil] :lhs or :rhs; required for a :filter
    #   expectation on a self-linkage
    # @raise [ArgumentError] if a :filter expectation is added to a
    #   self-linkage without a valid side
    # @private
    def add_expectation(expectation, side = nil)
      # If the expectation created turns the linkage type from a self to a
      # cross, then the dataset gets a new id. This is so that
      # Expectation#apply does the right thing.

      if @linkage_type != :self
        @expectations << expectation
        return
      end

      kind = expectation.kind
      if kind == :filter && ![:lhs, :rhs].include?(side)
        # Fail before anything is recorded, rather than with a NoMethodError
        # on nil after the expectation has already been stored.
        raise ArgumentError,
          "A filter expectation on a self-linkage needs a side (:lhs or :rhs), got #{side.inspect}"
      end

      @expectations << expectation

      cross =
        case kind
        when :cross
          true
        when :filter
          # If there are different filters on both 'sides' of a
          # self-linkage, it turns into a cross linkage.
          these_filters, other_filters =
            if side == :lhs
              [@lhs_filters, @rhs_filters]
            else
              [@rhs_filters, @lhs_filters]
            end

          if !other_filters.empty? && !other_filters.include?(expectation)
            true
          else
            these_filters << expectation
            false
          end
        else
          false
        end

      if cross
        @linkage_type = :cross
        @dataset_2.send(:set_new_id)
      end
    end

    # Builds the column list for the groups table: an integer primary key
    # named :id, followed by one column per expectation that is not a
    # filter, named after the expectation and typed after its merged field.
    #
    # @return [Array<Array>] entries of the form [name, type, options]
    # @private
    def groups_table_schema
      schema = []

      # add id
      schema << [:id, Integer, {:primary_key => true}]

      # add values
      @expectations.each do |exp|
        next if exp.kind == :filter

        merged_type = exp.merged_field.ruby_type
        schema << [exp.name, merged_type[:type], merged_type[:opts] || {}]
      end

      schema
    end

    # Uses the plain #to_s representation instead of the default inspect
    # output (presumably to keep it short; the reason is not stated here).
    #
    # @private
    def inspect
      to_s
    end
  end
end