linkage 0.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,43 @@
1
module Linkage
  # A set of records that share the same matching values, according to the
  # criteria described via the {Dataset#link_with} method. Groups are created
  # by subclasses of the {Runner} class during execution.
  #
  # @see Dataset#link_with
  # @see SingleThreadedRunner
  class Group
    # @return [Hash] Hash of matching values
    attr_reader :values

    # @return [Array<Object>] An array of this group's record ids
    attr_reader :records

    # Build an empty group identified by the given values.
    #
    # @param [Hash] matching_values Values that define this group
    # @example
    #   Linkage::Group.new({:foo => 123, :bar => 'baz'})
    def initialize(matching_values)
      @records = []
      @values = matching_values
    end

    # Tell whether a set of values is equal (via +==+) to the values that
    # define this group.
    #
    # @param [Hash] values Hash of values
    # @return [Boolean] true if match, false if not
    def matches?(values)
      @values == values
    end

    # Append a record id to this group's list of records.
    #
    # @param [Object] record_id
    # @return [Array<Object>] the group's record ids, including the new one
    def add_record(record_id)
      @records.push(record_id)
    end

    # @return [Integer] Number of records in this group
    def count
      @records.length
    end
  end
end
@@ -0,0 +1,39 @@
1
module Linkage
  # Collects rows destined for a single table and inserts them in batches
  # through Sequel's +Dataset#import+. A new connection is opened (via
  # +Sequel.connect+ with a block) for every flush.
  class ImportBuffer
    # @param [String] uri Sequel-style URI
    # @param [Symbol, String] table_name
    # @param [Array<Symbol>] headers List of fields you want to insert
    # @param [Hash] options Sequel.connect options
    # @param [Integer] limit Number of records to insert at a time
    def initialize(uri, table_name, headers, options = {}, limit = 1000)
      @uri = uri
      @table_name = table_name.to_sym
      @headers = headers
      @options = options
      @limit = limit
      @values = []
    end

    # Queue one row and flush automatically once the buffer holds at least
    # +limit+ rows.
    #
    # The comparison is +>=+ rather than +==+ so that the buffer recovers if
    # an earlier flush raised and left rows behind: with an equality check
    # the length would step past the limit and never trigger a flush again.
    #
    # @param [Array] values Row values, in the same order as the headers
    def add(values)
      @values << values
      flush if @values.length >= @limit
    end

    # Insert all buffered rows and empty the buffer. Does nothing when the
    # buffer is empty. The buffer is only cleared after the import call
    # returns, so rows are kept if the import raises.
    def flush
      return if @values.empty?

      database do |db|
        db[@table_name].import(@headers, @values)
        @values.clear
      end
    end

    private

    # Open a connection for the duration of the block.
    def database(&block)
      Sequel.connect(@uri, @options, &block)
    end
  end
end
@@ -0,0 +1,59 @@
1
module Linkage
  # Abstract base for classes that run a configuration created by
  # {Dataset#link_with}. Subclasses implement {#execute}; this class supplies
  # the database connection helper, result-table creation and group id
  # allocation.
  class Runner
    attr_reader :config

    # @param [Linkage::Configuration] config
    # @param [String] uri Sequel-style database URI
    # @param [Hash] options Sequel.connect options
    # @see Dataset#link_with
    # @see http://sequel.rubyforge.org/rdoc/files/doc/opening_databases_rdoc.html Sequel: Connecting to a database
    def initialize(config, uri, options = {})
      @config = config
      @uri = uri
      @options = options
      # Group ids are handed out sequentially starting at 1; the mutex
      # guards the counter.
      @next_group_mutex = Mutex.new
      @next_group_id = 1
    end

    # @abstract
    def execute
      raise NotImplementedError
    end

    protected

    # Connect to the results database for the duration of the block.
    def database(&block)
      Sequel.connect(@uri, @options, &block)
    end

    # Create the +groups+ table (columns taken from the configuration's
    # groups table schema) and the +groups_records+ table that ties record
    # ids to group ids.
    def create_tables
      database do |db|
        group_columns = config.groups_table_schema
        db.create_table(:groups) do
          group_columns.each { |definition| column(*definition) }
        end

        # The record id column must hold primary keys from both datasets,
        # so its type comes from merging the two primary key fields.
        merged_key = config.dataset_1.primary_key.merge(config.dataset_2.primary_key)
        key_type = merged_key.ruby_type
        db.create_table(:groups_records) do
          column(:record_id, key_type[:type], key_type[:opts] || {})
          Integer :group_id
          Integer :dataset
          index :group_id
        end
      end
    end

    # Hand out the next unused group id.
    #
    # @return [Integer]
    def next_group_id
      @next_group_mutex.synchronize do
        id = @next_group_id
        @next_group_id = id + 1
        id
      end
    end
  end
end
57
+
58
# Load the bundled runner implementation that lives in the runner/
# directory next to this file. Built with File.expand_path so that this
# file does not depend on 'pathname' having been required elsewhere.
require File.expand_path(File.join('runner', 'single_threaded'), File.dirname(__FILE__))
@@ -0,0 +1,114 @@
1
module Linkage
  # A runner class that only uses a single thread to execute a linkage.
  #
  # @see Runner
  class SingleThreadedRunner < Runner
    # Run the linkage: create the result tables, clone the datasets, apply
    # the configured expectations and write the resulting groups.
    #
    # @return [nil]
    def execute
      create_tables
      setup_datasets
      apply_expectations
      group_records
      nil
    end

    private

    # @return [Boolean] true when the configuration links a dataset to itself
    def self_linkage?
      config.linkage_type == :self
    end

    # Work on clones so the datasets held by the configuration are not
    # modified. The second dataset is only set up when it is needed.
    def setup_datasets
      @dataset_1 = config.dataset_1.clone
      @dataset_2 = config.dataset_2.clone unless self_linkage?
    end

    # Apply every expectation to each dataset taking part in the linkage.
    def apply_expectations
      config.expectations.each do |exp|
        exp.apply_to(@dataset_1)
        exp.apply_to(@dataset_2) unless self_linkage?
      end
    end

    # For a self linkage, groups with a single record are dropped right
    # away. Otherwise every group from both datasets is stored first and
    # {#combine_groups} then merges matching groups and removes the rest.
    def group_records
      if self_linkage?
        add_groups(group_records_for(@dataset_1), 1)
      else
        add_groups(group_records_for(@dataset_1, false), 1)
        add_groups(group_records_for(@dataset_2, false), 2)
        combine_groups
      end
    end

    # Walk the dataset and collect consecutive rows with equal values into
    # groups. Only adjacent rows are compared, so rows with equal values
    # must arrive next to each other. Each row is read through +row[:values]+
    # and +row[:pk]+.
    #
    # @param [#each] dataset
    # @param [Boolean] ignore_empty_groups when true, groups holding a
    #   single record are discarded
    # @return [Array<Group>]
    def group_records_for(dataset, ignore_empty_groups = true)
      groups = []
      current_group = nil
      dataset.each do |row|
        if current_group.nil? || !current_group.matches?(row[:values])
          groups << current_group if keep_group?(current_group, ignore_empty_groups)
          current_group = Group.new(row[:values])
        end
        current_group.add_record(row[:pk])
      end
      groups << current_group if keep_group?(current_group, ignore_empty_groups)
      groups
    end

    # Decide whether a finished group belongs in the result list.
    def keep_group?(group, ignore_empty_groups)
      group && (!ignore_empty_groups || group.count > 1)
    end

    # Write the groups to the +groups+ table and their members to the
    # +groups_records+ table.
    #
    # @param [Array<Group>] groups
    # @param [Integer, nil] dataset_id stored with every record row
    def add_groups(groups, dataset_id = nil)
      return if groups.empty?

      value_keys = groups[0].values.keys
      groups_buffer = ImportBuffer.new(@uri, :groups, [:id] + value_keys, @options)
      groups_records_buffer = ImportBuffer.new(@uri, :groups_records, [:group_id, :dataset, :record_id], @options)

      groups.each do |group|
        group_id = next_group_id
        # Look values up by header key so every row lines up with the
        # header columns even if a group's hash has a different key order.
        groups_buffer.add([group_id] + group.values.values_at(*value_keys))
        group.records.each do |record_id|
          groups_records_buffer.add([group_id, dataset_id, record_id])
        end
      end
      groups_buffer.flush
      groups_records_buffer.flush
    end

    # Merge groups from the two datasets that share the same values and
    # delete groups that ended up without a counterpart.
    def combine_groups
      # Create a new dataset for the groups table
      ds = Dataset.new(@uri, :groups, :single_threaded => true)
      ds.fields.each_value do |field|
        # Sort on all fields
        next if field.primary_key?
        ds.add_order(field)
        ds.add_select(field)
      end
      ds.add_order(ds.primary_key) # ensure matching groups are sorted by id

      # Here each "record" of a combined group is the id of a row in the
      # groups table.
      combined_groups = group_records_for(ds, false)
      database do |db|
        groups_to_delete = []
        db.transaction do # for speed reasons
          combined_groups.each do |group|
            if group.count == 1
              # Delete the empty group
              groups_to_delete << group.records[0]
            else
              # Change group_id in the groups_records table to the first group
              # id, delete other groups.
              new_group_id = group.records[0]
              group.records[1..-1].each do |old_group_id|
                # There can only be a group with max size of 2, but this
                # adds in future support for matching more than 2 datasets
                # at once. Code smell?
                db[:groups_records].filter(:group_id => old_group_id).
                  update(:group_id => new_group_id)
                groups_to_delete << old_group_id
              end
            end
          end
        end

        # Nothing to clean up means there is no reason to issue the DELETEs.
        unless groups_to_delete.empty?
          db[:groups_records].filter(:group_id => groups_to_delete).delete
          db[:groups].filter(:id => groups_to_delete).delete
        end
      end
    end
  end
end
@@ -0,0 +1,164 @@
1
module Linkage
  module Utils
    # Type used for 64-bit integer columns. Rubies older than 2.4 have a
    # distinct Bignum class, which is used as-is. Ruby 2.4+ made Bignum an
    # alias of Integer and Ruby 3.2 removed the constant, so the :Bignum
    # symbol (Sequel's generic type name for such columns) is used instead.
    BIGNUM_TYPE = (defined?(::Bignum) && ::Bignum != ::Integer) ? ::Bignum : :Bignum

    # A "tree" used to find compatible types. Each key maps to the types it
    # can be widened to; nil marks a type that cannot be converted further.
    TYPE_CONVERSION_TREE = {
      TrueClass => [Integer],
      Integer => [BIGNUM_TYPE, Float],
      BIGNUM_TYPE => [BigDecimal],
      Float => [BigDecimal],
      BigDecimal => [String],
      String => nil,
      DateTime => nil,
      Date => nil,
      Time => nil,
      File => nil
    }.freeze

    # Create field information for a field that can hold data from two other
    # fields. If the fields have different types, the resulting type is
    # determined via a type-conversion tree.
    #
    # @param [Hash] field_1 Schema information for the first field; +:db_type+
    #   is read, and +:type+ is consulted for tinyint columns
    # @param [Hash] field_2 Schema information for the second field
    # @return [Hash] Schema information for the new field, as
    #   +{:type => type, :opts => options}+. +:type+ is nil when the two
    #   types share no common type in the conversion tree.
    def merge_fields(field_1, field_2)
      schema_1 = column_schema_to_ruby_type(field_1).delete_if { |_, v| v.nil? }
      schema_2 = column_schema_to_ruby_type(field_2).delete_if { |_, v| v.nil? }
      result = schema_1.dup

      if schema_1 != schema_2
        # type
        if schema_1[:type] != schema_2[:type]
          result[:type] = first_common_type(schema_1[:type], schema_2[:type])
        end

        # text
        if schema_1[:text] != schema_2[:text]
          # This can only be of type String.
          result[:text] = true
          result.delete(:size)
        end

        # size
        if !result[:text] && schema_1[:size] != schema_2[:size]
          result[:size] = merge_sizes(schema_1, schema_2)
        end

        # fixed
        if schema_1[:fixed] != schema_2[:fixed]
          # This can only be of type String.
          result[:fixed] = true
        end
      end

      {:type => result.delete(:type), :opts => result}
    end

    private

    # Pick a size able to hold values from both columns. Only called when
    # the two sizes differ, so at least one of them is present.
    def merge_sizes(schema_1, schema_2)
      size_1 = schema_1[:size]
      size_2 = schema_2[:size]
      types = [schema_1[:type], schema_2[:type]].uniq

      if types == [BigDecimal]
        # Two decimals
        return size_1 || size_2 unless size_1 && size_2
        merge_decimal_sizes(size_1, size_2)
      elsif types.include?(String) && types.include?(BigDecimal)
        # Add one to the precision of the BigDecimal (for the dot)
        widths = [size_1, size_2].compact.map { |s| s.is_a?(Array) ? s[0] + 1 : s }
        widths.max
      else
        # Treat as two strings
        return size_1 || size_2 unless size_1 && size_2
        size_1 > size_2 ? size_1 : size_2
      end
    end

    # Merge two +[precision, scale]+ pairs (scale is optional). The result
    # keeps the largest integer part and the largest scale, so it is never
    # narrower than either input; taking the largest precision alone could
    # lose integer digits once the larger scale is applied. The scale is
    # only included when at least one input specifies it.
    def merge_decimal_sizes(size_1, size_2)
      scale_1 = size_1[1] || 0
      scale_2 = size_2[1] || 0
      scale = [scale_1, scale_2].max
      integer_digits = [size_1[0] - scale_1, size_2[0] - scale_2].max

      merged = [integer_digits + scale]
      merged << scale if size_1[1] || size_2[1]
      merged
    end

    # Convert the column schema information to a hash of column options, one of which must
    # be :type. The other options added should modify that type (e.g. :size). If a
    # database type is not recognized (or is missing), return it as a String type.
    #
    # @note This method comes straight from Sequel (lib/sequel/extensions/schema_dumper.rb).
    def column_schema_to_ruby_type(schema)
      case schema[:db_type].to_s.downcase
      when /\A(?:medium|small)?int(?:eger)?(?:\((?:\d+)\))?(?: unsigned)?\z/
        {:type=>Integer}
      when /\Atinyint(?:\((\d+)\))?\z/
        {:type =>schema[:type] == :boolean ? TrueClass : Integer}
      when /\Abigint(?:\((?:\d+)\))?(?: unsigned)?\z/
        {:type=>BIGNUM_TYPE}
      when /\A(?:real|float|double(?: precision)?)\z/
        {:type=>Float}
      when 'boolean'
        {:type=>TrueClass}
      when /\A(?:(?:tiny|medium|long|n)?text|clob)\z/
        {:type=>String, :text=>true}
      when 'date'
        {:type=>Date}
      when /\A(?:small)?datetime\z/
        {:type=>DateTime}
      when /\Atimestamp(?:\((\d+)\))?(?: with(?:out)? time zone)?\z/
        {:type=>DateTime, :size=>($1.to_i if $1)}
      when /\Atime(?: with(?:out)? time zone)?\z/
        {:type=>Time, :only_time=>true}
      when /\An?char(?:acter)?(?:\((\d+)\))?\z/
        {:type=>String, :size=>($1.to_i if $1), :fixed=>true}
      when /\A(?:n?varchar|character varying|bpchar|string)(?:\((\d+)\))?\z/
        {:type=>String, :size=>($1.to_i if $1)}
      when /\A(?:small)?money\z/
        {:type=>BigDecimal, :size=>[19,2]}
      when /\A(?:decimal|numeric|number)(?:\((\d+)(?:,\s*(\d+))?\))?\z/
        s = [($1.to_i if $1), ($2.to_i if $2)].compact
        {:type=>BigDecimal, :size=>(s.empty? ? nil : s)}
      when /\A(?:bytea|(?:tiny|medium|long)?blob|(?:var)?binary)(?:\((\d+)\))?\z/
        {:type=>File, :size=>($1.to_i if $1)}
      when 'year'
        {:type=>Integer}
      else
        {:type=>String}
      end
    end

    # Find the first type that both types are, or can be converted to.
    # Returns nil when there is none.
    def first_common_type(type_1, type_2)
      types_1 = [type_1] + get_types(type_1)
      types_2 = [type_2] + get_types(type_2)
      (types_1 & types_2).first
    end

    # Get all types that the specified type can be converted to. Order
    # matters.
    def get_types(type)
      result = []
      types = TYPE_CONVERSION_TREE[type]
      if types
        result += types
        types.each do |t|
          result |= get_types(t)
        end
      end
      result
    end
  end
end
@@ -0,0 +1,106 @@
1
# Generated by jeweler
# DO NOT EDIT THIS FILE DIRECTLY
# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
# -*- encoding: utf-8 -*-

# Gem specification for linkage 0.0.0. The only runtime dependency declared
# below is sequel; every other gem listed is a development dependency.
Gem::Specification.new do |s|
  s.name = "linkage"
  s.version = "0.0.0"

  # Only set when the running RubyGems supports the attribute.
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
  s.authors = ["Jeremy Stephens"]
  s.date = "2011-09-30"
  s.description = "Wraps Sequel to perform record linkage between one or two datasets"
  s.email = "jeremy.f.stephens@vanderbilt.edu"
  s.extra_rdoc_files = [
    "LICENSE.txt",
    "README.markdown"
  ]
  # Files packaged into the gem: project metadata, library code and tests.
  s.files = [
    ".document",
    ".vimrc",
    "Gemfile",
    "Gemfile.lock",
    "Guardfile",
    "LICENSE.txt",
    "README.markdown",
    "Rakefile",
    "VERSION",
    "lib/linkage.rb",
    "lib/linkage/configuration.rb",
    "lib/linkage/dataset.rb",
    "lib/linkage/expectation.rb",
    "lib/linkage/field.rb",
    "lib/linkage/group.rb",
    "lib/linkage/import_buffer.rb",
    "lib/linkage/runner.rb",
    "lib/linkage/runner/single_threaded.rb",
    "lib/linkage/utils.rb",
    "linkage.gemspec",
    "test/helper.rb",
    "test/integration/test_cross_linkage.rb",
    "test/integration/test_dual_linkage.rb",
    "test/integration/test_self_linkage.rb",
    "test/unit/test_configuration.rb",
    "test/unit/test_dataset.rb",
    "test/unit/test_expectation.rb",
    "test/unit/test_field.rb",
    "test/unit/test_group.rb",
    "test/unit/test_import_buffer.rb",
    "test/unit/test_linkage.rb",
    "test/unit/test_runner.rb",
    "test/unit/test_single_threaded_runner.rb",
    "test/unit/test_utils.rb"
  ]
  s.homepage = "http://github.com/coupler/linkage"
  s.licenses = ["MIT"]
  s.require_paths = ["lib"]
  s.rubygems_version = "1.8.10"
  s.summary = "Sequel-based record linkage"

  # The same dependency list appears three times. Which copy runs depends on
  # the RubyGems in use: the first branch separates runtime from development
  # dependencies; the two fallback branches register every gem through
  # add_dependency.
  if s.respond_to? :specification_version then
    s.specification_version = 3

    if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
      # RubyGems >= 1.2.0: sequel is the runtime dependency, the rest are
      # development-only.
      s.add_runtime_dependency(%q<sequel>, [">= 0"])
      s.add_development_dependency(%q<bundler>, ["~> 1.0.0"])
      s.add_development_dependency(%q<jeweler>, ["~> 1.6.4"])
      s.add_development_dependency(%q<rcov>, [">= 0"])
      s.add_development_dependency(%q<guard-test>, [">= 0"])
      s.add_development_dependency(%q<test-unit>, ["= 2.3.2"])
      s.add_development_dependency(%q<mocha>, [">= 0"])
      s.add_development_dependency(%q<sqlite3>, [">= 0"])
      s.add_development_dependency(%q<yard>, [">= 0"])
      s.add_development_dependency(%q<rake>, [">= 0"])
      s.add_development_dependency(%q<versionomy>, [">= 0"])
      s.add_development_dependency(%q<guard-yard>, [">= 0"])
    else
      # RubyGems older than 1.2.0.
      s.add_dependency(%q<sequel>, [">= 0"])
      s.add_dependency(%q<bundler>, ["~> 1.0.0"])
      s.add_dependency(%q<jeweler>, ["~> 1.6.4"])
      s.add_dependency(%q<rcov>, [">= 0"])
      s.add_dependency(%q<guard-test>, [">= 0"])
      s.add_dependency(%q<test-unit>, ["= 2.3.2"])
      s.add_dependency(%q<mocha>, [">= 0"])
      s.add_dependency(%q<sqlite3>, [">= 0"])
      s.add_dependency(%q<yard>, [">= 0"])
      s.add_dependency(%q<rake>, [">= 0"])
      s.add_dependency(%q<versionomy>, [">= 0"])
      s.add_dependency(%q<guard-yard>, [">= 0"])
    end
  else
    # The specification object does not respond to specification_version.
    s.add_dependency(%q<sequel>, [">= 0"])
    s.add_dependency(%q<bundler>, ["~> 1.0.0"])
    s.add_dependency(%q<jeweler>, ["~> 1.6.4"])
    s.add_dependency(%q<rcov>, [">= 0"])
    s.add_dependency(%q<guard-test>, [">= 0"])
    s.add_dependency(%q<test-unit>, ["= 2.3.2"])
    s.add_dependency(%q<mocha>, [">= 0"])
    s.add_dependency(%q<sqlite3>, [">= 0"])
    s.add_dependency(%q<yard>, [">= 0"])
    s.add_dependency(%q<rake>, [">= 0"])
    s.add_dependency(%q<versionomy>, [">= 0"])
    s.add_dependency(%q<guard-yard>, [">= 0"])
  end
end
106
+