linkage 0.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,43 @@
1
module Linkage
  # A set of record ids that share the same matching values, according to the
  # criteria described via the {Dataset#link_with} method. Groups are created
  # by subclasses of the {Runner} class during execution.
  #
  # @see Dataset#link_with
  # @see SingleThreadedRunner
  class Group
    # @return [Array<Object>] Ids of the records added to this group so far
    attr_reader :records

    # @return [Hash] The matching values that define this group
    attr_reader :values

    # Start a new, empty group.
    #
    # @param [Hash] matching_values Values that define this group
    # @example
    #   Linkage::Group.new({:foo => 123, :bar => 'baz'})
    def initialize(matching_values)
      @records = []
      @values = matching_values
    end

    # Compare a set of values against this group's values using +==+.
    #
    # @param [Hash] values Hash of values
    # @return [Boolean] true if match, false if not
    def matches?(values)
      @values == values
    end

    # Append a record id to this group's list of records.
    #
    # @param [Object] record_id
    # @return [Array<Object>] The group's record list, including the new id
    def add_record(record_id)
      @records.push(record_id)
    end

    # @return [Integer] Number of records in this group
    def count
      @records.size
    end
  end
end
@@ -0,0 +1,39 @@
1
module Linkage
  # Accumulates rows in memory and writes them to a database table in batches
  # via Sequel's dataset +import+ call.
  class ImportBuffer
    # @param [String] uri Sequel-style URI
    # @param [Symbol, String] table_name
    # @param [Array<Symbol>] headers List of fields you want to insert
    # @param [Hash] options Sequel.connect options
    # @param [Fixnum] limit Number of records to insert at a time
    def initialize(uri, table_name, headers, options = {}, limit = 1000)
      @uri = uri
      @table_name = table_name.to_sym
      @headers = headers
      @options = options
      @limit = limit
      @values = []
    end

    # Queue one row and flush once the number of pending rows reaches the
    # limit.
    #
    # @param [Array] values Row values, in the same order as the headers
    def add(values)
      @values << values
      # Use >= rather than ==: if an earlier flush raised (leaving the rows
      # queued) or the limit is zero, the buffer may already be past the
      # limit, and an equality check would never trigger another flush.
      flush if @values.length >= @limit
    end

    # Write all pending rows to the table and empty the buffer. Does nothing
    # when no rows are pending. The buffer is only cleared after the import
    # call returns, so rows stay queued if the import raises.
    def flush
      return if @values.empty?

      database do |db|
        db[@table_name].import(@headers, @values)
        @values.clear
      end
    end

    private

    # Open a connection with the stored URI and options, passing the block
    # through to Sequel.connect.
    def database(&block)
      Sequel.connect(@uri, @options, &block)
    end
  end
end
@@ -0,0 +1,59 @@
1
module Linkage
  # Base class for running a configuration created by {Dataset#link_with}.
  # Subclasses supply the actual work by implementing {#execute}.
  class Runner
    attr_reader :config

    # @param [Linkage::Configuration] config
    # @param [String] uri Sequel-style database URI
    # @param [Hash] options Sequel.connect options
    # @see Dataset#link_with
    # @see http://sequel.rubyforge.org/rdoc/files/doc/opening_databases_rdoc.html Sequel: Connecting to a database
    def initialize(config, uri, options = {})
      @config = config
      @uri = uri
      @options = options
      # Group ids are handed out sequentially starting at 1; the mutex guards
      # the counter.
      @next_group_mutex = Mutex.new
      @next_group_id = 1
    end

    # @abstract
    def execute
      raise NotImplementedError
    end

    protected

    # Open a connection with the stored URI and options, passing the block
    # through to Sequel.connect.
    def database(&block)
      Sequel.connect(@uri, @options, &block)
    end

    # Create the :groups and :groups_records tables. The :groups columns come
    # from the configuration's groups_table_schema; the record_id column type
    # comes from merging the primary key fields of both datasets.
    def create_tables
      database do |db|
        group_columns = config.groups_table_schema
        db.create_table(:groups) do
          group_columns.each { |column_args| column(*column_args) }
        end

        merged_key = config.dataset_1.primary_key.merge(config.dataset_2.primary_key)
        pk_type = merged_key.ruby_type
        db.create_table(:groups_records) do
          column(:record_id, pk_type[:type], pk_type[:opts] || {})
          Integer :group_id
          Integer :dataset
          index :group_id
        end
      end
    end

    # Hand out the next sequential group id while holding the mutex.
    #
    # @return [Integer] The id that was reserved by this call
    def next_group_id
      @next_group_mutex.synchronize do
        reserved = @next_group_id
        @next_group_id = reserved + 1
        reserved
      end
    end
  end
end
57
+
58
# Load runner/single_threaded from the directory containing this file.
# Presumably placed after the Runner definition because the loaded file
# subclasses Runner. The path is built with File.expand_path, so this
# line does not depend on the pathname library having been required.
require File.expand_path(File.join('runner', 'single_threaded'), File.dirname(__FILE__))
@@ -0,0 +1,114 @@
1
module Linkage
  # A runner class that only uses a single thread to execute a linkage.
  #
  # @see Runner
  class SingleThreadedRunner < Runner
    # Create the result tables, prepare the datasets, apply the configured
    # expectations and write the resulting groups.
    #
    # @return [nil]
    def execute
      create_tables
      setup_datasets
      apply_expectations
      group_records
      nil
    end

    private

    # Work on clones of the configured datasets. The second dataset is only
    # cloned when the linkage is not a self-linkage.
    def setup_datasets
      @dataset_1 = config.dataset_1.clone
      @dataset_2 = config.dataset_2.clone if config.linkage_type != :self
    end

    # Apply every expectation to the first dataset and, unless this is a
    # self-linkage, to the second dataset as well.
    def apply_expectations
      config.expectations.each do |exp|
        exp.apply_to(@dataset_1)
        exp.apply_to(@dataset_2) if config.linkage_type != :self
      end
    end

    # For a self-linkage, store the groups found in the single dataset. For
    # anything else, store every group of each dataset (including groups with
    # a single record) and then merge matching groups across the datasets.
    def group_records
      if config.linkage_type == :self
        add_groups(group_records_for(@dataset_1), 1)
      else
        add_groups(group_records_for(@dataset_1, false), 1)
        add_groups(group_records_for(@dataset_2, false), 2)
        combine_groups
      end
    end

    # Walk the dataset's rows and collect consecutive rows with equal
    # row[:values] into groups of row[:pk] ids. A row is only compared with
    # the group currently being built, so rows with equal values that are not
    # adjacent end up in separate groups.
    #
    # @param [#each] dataset Yields rows that respond to [:values] and [:pk]
    # @param [Boolean] ignore_empty_groups When true, groups holding fewer
    #   than two records are dropped
    # @return [Array<Group>]
    def group_records_for(dataset, ignore_empty_groups = true)
      groups = []
      current_group = nil
      dataset.each do |row|
        if current_group.nil? || !current_group.matches?(row[:values])
          if current_group && (!ignore_empty_groups || current_group.count > 1)
            groups << current_group
          end
          current_group = Group.new(row[:values])
        end
        current_group.add_record(row[:pk])
      end
      # The last group is still open when the rows run out.
      if current_group && (!ignore_empty_groups || current_group.count > 1)
        groups << current_group
      end
      groups
    end

    # Write the groups to the :groups table and their members to the
    # :groups_records table, assigning each group a fresh id.
    #
    # @param [Array<Group>] groups
    # @param [Integer, nil] dataset_id Stored in the :dataset column of every
    #   groups_records row
    def add_groups(groups, dataset_id = nil)
      return if groups.empty?

      # The column list comes from the first group. Every row is built by
      # looking the values up by these keys, so a group whose hash happens to
      # be ordered differently still lines up with the headers.
      value_keys = groups[0].values.keys
      groups_headers = [:id] + value_keys
      groups_buffer = ImportBuffer.new(@uri, :groups, groups_headers, @options)

      groups_records_buffer = ImportBuffer.new(@uri, :groups_records, [:group_id, :dataset, :record_id], @options)

      groups.each do |group|
        group_id = next_group_id
        groups_buffer.add([group_id] + group.values.values_at(*value_keys))
        group.records.each do |record_id|
          groups_records_buffer.add([group_id, dataset_id, record_id])
        end
      end
      groups_buffer.flush
      groups_records_buffer.flush
    end

    # Merge groups in the :groups table that have identical values, and drop
    # groups that have no counterpart.
    def combine_groups
      # Create a new dataset for the groups table
      ds = Dataset.new(@uri, :groups, :single_threaded => true)
      ds.fields.each_value do |field|
        # Sort on all fields
        next if field.primary_key?
        ds.add_order(field)
        ds.add_select(field)
      end
      ds.add_order(ds.primary_key) # ensure matching groups are sorted by id

      # Here each "record" in a combined group is the id of a row in the
      # :groups table.
      combined_groups = group_records_for(ds, false)
      database do |db|
        groups_to_delete = []
        db.transaction do # for speed reasons
          combined_groups.each do |group|
            if group.count == 1
              # Delete the empty group
              groups_to_delete << group.records[0]
            else
              # Change group_id in the groups_records table to the first group
              # id, delete other groups.
              new_group_id = group.records[0]
              group.records[1..-1].each do |old_group_id|
                # There can only be a group with max size of 2, but this
                # adds in future support for matching more than 2 datasets
                # at once. Code smell?
                db[:groups_records].filter(:group_id => old_group_id).
                  update(:group_id => new_group_id)
                groups_to_delete << old_group_id
              end
            end
          end
        end
        db[:groups_records].filter(:group_id => groups_to_delete).delete
        db[:groups].filter(:id => groups_to_delete).delete
      end
    end
  end
end
@@ -0,0 +1,164 @@
1
module Linkage
  module Utils
    # Type used for big integer columns. On Rubies where Bignum is a class
    # distinct from Integer this is that class. Where the two have been
    # unified (or Bignum no longer exists) the :Bignum symbol is used instead,
    # so the type stays distinguishable from Integer in the tree below.
    BIGNUM_TYPE = (defined?(::Bignum) && !::Bignum.equal?(::Integer)) ? ::Bignum : :Bignum

    # A "tree" used to find compatible types. Each key maps to the types it
    # can be converted to directly; nil marks a type with no conversions.
    TYPE_CONVERSION_TREE = {
      TrueClass => [Integer],
      Integer => [BIGNUM_TYPE, Float],
      BIGNUM_TYPE => [BigDecimal],
      Float => [BigDecimal],
      BigDecimal => [String],
      String => nil,
      DateTime => nil,
      Date => nil,
      Time => nil,
      File => nil
    }

    # Create field information for a field that can hold data from two other
    # fields. If the fields have different types, the resulting type is
    # determined via a type-conversion tree.
    #
    # @param [Hash] field_1 Schema information for the first field; must
    #   provide :db_type (and :type for tinyint columns)
    # @param [Hash] field_2 Schema information for the second field
    # @return [Hash] :type holds the merged type (nil when the two types share
    #   no common type); :opts holds the remaining column options
    def merge_fields(field_1, field_2)
      schema_1 = column_schema_to_ruby_type(field_1)
      schema_1.delete_if { |k, v| v.nil? }
      schema_2 = column_schema_to_ruby_type(field_2)
      schema_2.delete_if { |k, v| v.nil? }
      if schema_1 == schema_2
        result = schema_1
      else
        result = schema_1.dup

        # type
        if schema_1[:type] != schema_2[:type]
          result[:type] = first_common_type(schema_1[:type], schema_2[:type])
        end

        # text
        if schema_1[:text] != schema_2[:text]
          # This can only be of type String.
          result[:text] = true
          result.delete(:size)
        end

        # size
        if !result[:text] && schema_1[:size] != schema_2[:size]
          types = [schema_1[:type], schema_2[:type]].uniq
          if types.length == 1 && types[0] == BigDecimal
            # Two decimals
            if schema_1.has_key?(:size) && schema_2.has_key?(:size)
              s_1 = schema_1[:size]
              s_2 = schema_2[:size]
              # Largest precision; largest scale, or whichever scale is given.
              scale = (s_1[1] && s_2[1]) ? [s_1[1], s_2[1]].max : (s_1[1] || s_2[1])
              result[:size] = [[s_1[0], s_2[0]].max, scale]
            else
              result[:size] = schema_1.has_key?(:size) ? schema_1[:size] : schema_2[:size]
            end
          elsif types.include?(String) && types.include?(BigDecimal)
            # Add one to the precision of the BigDecimal (for the dot)
            widths = [schema_1, schema_2].select { |schema| schema.has_key?(:size) }.map do |schema|
              schema[:size].is_a?(Array) ? schema[:size][0] + 1 : schema[:size]
            end
            result[:size] = widths.max unless widths.empty?
          else
            # Treat as two strings
            if schema_1.has_key?(:size) && schema_2.has_key?(:size)
              result[:size] = schema_1[:size] > schema_2[:size] ? schema_1[:size] : schema_2[:size]
            elsif schema_1.has_key?(:size)
              result[:size] = schema_1[:size]
            else
              result[:size] = schema_2[:size]
            end
          end
        end

        # fixed
        if schema_1[:fixed] != schema_2[:fixed]
          # This can only be of type String.
          result[:fixed] = true
        end
      end

      # :type is removed from result first, so :opts holds only the options.
      {:type => result.delete(:type), :opts => result}
    end

    private

    # Convert the column schema information to a hash of column options, one of which must
    # be :type. The other options added should modify that type (e.g. :size). If a
    # database type is not recognized (or is missing), return it as a String type.
    #
    # @note Adapted from Sequel (lib/sequel/extensions/schema_dumper.rb).
    # @param [Hash] schema Column schema with a :db_type entry
    # @return [Hash] Column options including :type
    def column_schema_to_ruby_type(schema)
      case schema[:db_type].to_s.downcase
      when /\A(?:medium|small)?int(?:eger)?(?:\((?:\d+)\))?(?: unsigned)?\z/
        {:type=>Integer}
      when /\Atinyint(?:\((\d+)\))?(?: unsigned)?\z/
        # Unsigned tinyint columns are integers too, not unrecognized types.
        {:type =>schema[:type] == :boolean ? TrueClass : Integer}
      when /\Abigint(?:\((?:\d+)\))?(?: unsigned)?\z/
        {:type=>BIGNUM_TYPE}
      when /\A(?:real|float|double(?: precision)?)\z/
        {:type=>Float}
      when 'boolean'
        {:type=>TrueClass}
      when /\A(?:(?:tiny|medium|long|n)?text|clob)\z/
        {:type=>String, :text=>true}
      when 'date'
        {:type=>Date}
      when /\A(?:small)?datetime\z/
        {:type=>DateTime}
      when /\Atimestamp(?:\((\d+)\))?(?: with(?:out)? time zone)?\z/
        {:type=>DateTime, :size=>($1.to_i if $1)}
      when /\Atime(?: with(?:out)? time zone)?\z/
        {:type=>Time, :only_time=>true}
      when /\An?char(?:acter)?(?:\((\d+)\))?\z/
        {:type=>String, :size=>($1.to_i if $1), :fixed=>true}
      when /\A(?:n?varchar|character varying|bpchar|string)(?:\((\d+)\))?\z/
        {:type=>String, :size=>($1.to_i if $1)}
      when /\A(?:small)?money\z/
        {:type=>BigDecimal, :size=>[19,2]}
      when /\A(?:decimal|numeric|number)(?:\((\d+)(?:,\s*(\d+))?\))?\z/
        s = [($1.to_i if $1), ($2.to_i if $2)].compact
        {:type=>BigDecimal, :size=>(s.empty? ? nil : s)}
      when /\A(?:bytea|(?:tiny|medium|long)?blob|(?:var)?binary)(?:\((\d+)\))?\z/
        {:type=>File, :size=>($1.to_i if $1)}
      when 'year'
        {:type=>Integer}
      else
        {:type=>String}
      end
    end

    # Find the first type that both given types are, or can be converted to.
    #
    # @return [Object, nil] The common type, or nil when there is none
    def first_common_type(type_1, type_2)
      types_1 = [type_1] + get_types(type_1)
      types_2 = [type_2] + get_types(type_2)
      (types_1 & types_2).first
    end

    # Get all types that the specified type can be converted to. Order
    # matters.
    #
    # @return [Array] Direct conversions first, followed by the conversions
    #   reachable through them
    def get_types(type)
      result = []
      types = TYPE_CONVERSION_TREE[type]
      if types
        result += types
        types.each do |t|
          result |= get_types(t)
        end
      end
      result
    end
  end
end
@@ -0,0 +1,106 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE DIRECTLY
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
4
+ # -*- encoding: utf-8 -*-
5
+
6
# Gem specification for linkage 0.0.0. This file is generated; lasting
# changes belong in Jeweler::Tasks in the Rakefile.
Gem::Specification.new do |s|
  s.name = "linkage"
  s.version = "0.0.0"

  if s.respond_to? :required_rubygems_version=
    s.required_rubygems_version = Gem::Requirement.new(">= 0")
  end
  s.authors = ["Jeremy Stephens"]
  s.date = "2011-09-30"
  s.description = "Wraps Sequel to perform record linkage between one or two datasets"
  s.email = "jeremy.f.stephens@vanderbilt.edu"
  s.extra_rdoc_files = %w[LICENSE.txt README.markdown]
  s.files = %w[
    .document
    .vimrc
    Gemfile
    Gemfile.lock
    Guardfile
    LICENSE.txt
    README.markdown
    Rakefile
    VERSION
    lib/linkage.rb
    lib/linkage/configuration.rb
    lib/linkage/dataset.rb
    lib/linkage/expectation.rb
    lib/linkage/field.rb
    lib/linkage/group.rb
    lib/linkage/import_buffer.rb
    lib/linkage/runner.rb
    lib/linkage/runner/single_threaded.rb
    lib/linkage/utils.rb
    linkage.gemspec
    test/helper.rb
    test/integration/test_cross_linkage.rb
    test/integration/test_dual_linkage.rb
    test/integration/test_self_linkage.rb
    test/unit/test_configuration.rb
    test/unit/test_dataset.rb
    test/unit/test_expectation.rb
    test/unit/test_field.rb
    test/unit/test_group.rb
    test/unit/test_import_buffer.rb
    test/unit/test_linkage.rb
    test/unit/test_runner.rb
    test/unit/test_single_threaded_runner.rb
    test/unit/test_utils.rb
  ]
  s.homepage = "http://github.com/coupler/linkage"
  s.licenses = ["MIT"]
  s.require_paths = ["lib"]
  s.rubygems_version = "1.8.10"
  s.summary = "Sequel-based record linkage"

  # Dependencies as [name, requirements] pairs, in declaration order.
  runtime_deps = [
    [%q<sequel>, [">= 0"]]
  ]
  development_deps = [
    [%q<bundler>, ["~> 1.0.0"]],
    [%q<jeweler>, ["~> 1.6.4"]],
    [%q<rcov>, [">= 0"]],
    [%q<guard-test>, [">= 0"]],
    [%q<test-unit>, ["= 2.3.2"]],
    [%q<mocha>, [">= 0"]],
    [%q<sqlite3>, [">= 0"]],
    [%q<yard>, [">= 0"]],
    [%q<rake>, [">= 0"]],
    [%q<versionomy>, [">= 0"]],
    [%q<guard-yard>, [">= 0"]]
  ]

  knows_spec_version = s.respond_to?(:specification_version)
  s.specification_version = 3 if knows_spec_version

  # Runtime and development dependencies are only declared separately when
  # the spec knows about specification_version and RubyGems is at least
  # 1.2.0; otherwise everything is added as a plain dependency.
  if knows_spec_version && Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0')
    runtime_deps.each { |name, reqs| s.add_runtime_dependency(name, reqs) }
    development_deps.each { |name, reqs| s.add_development_dependency(name, reqs) }
  else
    (runtime_deps + development_deps).each { |name, reqs| s.add_dependency(name, reqs) }
  end
end
106
+