linkage 0.0.8 → 0.1.0.pre
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +1 -0
- data/.yardopts +1 -0
- data/Gemfile +1 -19
- data/Gemfile-java +3 -0
- data/README.markdown +88 -34
- data/Rakefile +16 -15
- data/TODO +4 -0
- data/lib/linkage/comparator.rb +139 -144
- data/lib/linkage/comparators/compare.rb +236 -29
- data/lib/linkage/comparators/strcompare.rb +85 -0
- data/lib/linkage/comparators/within.rb +24 -20
- data/lib/linkage/configuration.rb +44 -466
- data/lib/linkage/dataset.rb +28 -127
- data/lib/linkage/exceptions.rb +5 -0
- data/lib/linkage/field.rb +6 -37
- data/lib/linkage/field_set.rb +3 -3
- data/lib/linkage/match_recorder.rb +22 -0
- data/lib/linkage/match_set.rb +34 -0
- data/lib/linkage/match_sets/csv.rb +39 -0
- data/lib/linkage/match_sets/database.rb +45 -0
- data/lib/linkage/matcher.rb +30 -0
- data/lib/linkage/result_set.rb +25 -110
- data/lib/linkage/result_sets/csv.rb +54 -0
- data/lib/linkage/result_sets/database.rb +42 -0
- data/lib/linkage/runner.rb +57 -16
- data/lib/linkage/score_recorder.rb +30 -0
- data/lib/linkage/score_set.rb +49 -0
- data/lib/linkage/score_sets/csv.rb +64 -0
- data/lib/linkage/score_sets/database.rb +77 -0
- data/lib/linkage/version.rb +1 -1
- data/lib/linkage.rb +14 -17
- data/linkage.gemspec +13 -1
- data/linkage.gemspec-java +32 -0
- data/test/helper.rb +30 -23
- data/test/integration/test_cross_linkage.rb +46 -25
- data/test/integration/test_database_result_set.rb +55 -0
- data/test/integration/test_dual_linkage.rb +19 -94
- data/test/integration/test_self_linkage.rb +100 -203
- data/test/integration/test_within_comparator.rb +24 -77
- data/test/unit/comparators/test_compare.rb +254 -50
- data/test/unit/comparators/test_strcompare.rb +45 -0
- data/test/unit/comparators/test_within.rb +14 -26
- data/test/unit/match_sets/test_csv.rb +78 -0
- data/test/unit/match_sets/test_database.rb +63 -0
- data/test/unit/result_sets/test_csv.rb +111 -0
- data/test/unit/result_sets/test_database.rb +68 -0
- data/test/unit/score_sets/test_csv.rb +151 -0
- data/test/unit/score_sets/test_database.rb +149 -0
- data/test/unit/test_comparator.rb +46 -83
- data/test/unit/test_comparators.rb +4 -0
- data/test/unit/test_configuration.rb +99 -145
- data/test/unit/test_dataset.rb +52 -73
- data/test/unit/test_field.rb +4 -55
- data/test/unit/test_field_set.rb +6 -6
- data/test/unit/test_match_recorder.rb +23 -0
- data/test/unit/test_match_set.rb +23 -0
- data/test/unit/test_match_sets.rb +4 -0
- data/test/unit/test_matcher.rb +44 -0
- data/test/unit/test_result_set.rb +24 -223
- data/test/unit/test_result_sets.rb +4 -0
- data/test/unit/test_runner.rb +122 -17
- data/test/unit/test_runners.rb +4 -0
- data/test/unit/test_score_recorder.rb +25 -0
- data/test/unit/test_score_set.rb +37 -0
- data/test/unit/test_score_sets.rb +4 -0
- metadata +183 -90
- data/Gemfile.lock +0 -92
- data/lib/linkage/comparators/binary.rb +0 -12
- data/lib/linkage/data.rb +0 -175
- data/lib/linkage/decollation.rb +0 -93
- data/lib/linkage/expectation.rb +0 -21
- data/lib/linkage/expectations/exhaustive.rb +0 -63
- data/lib/linkage/expectations/simple.rb +0 -168
- data/lib/linkage/function.rb +0 -148
- data/lib/linkage/functions/binary.rb +0 -30
- data/lib/linkage/functions/cast.rb +0 -54
- data/lib/linkage/functions/length.rb +0 -29
- data/lib/linkage/functions/strftime.rb +0 -33
- data/lib/linkage/functions/trim.rb +0 -30
- data/lib/linkage/group.rb +0 -55
- data/lib/linkage/meta_object.rb +0 -139
- data/lib/linkage/runner/single_threaded.rb +0 -187
- data/lib/linkage/utils.rb +0 -164
- data/lib/linkage/warnings.rb +0 -5
- data/test/integration/test_collation.rb +0 -45
- data/test/integration/test_configuration.rb +0 -268
- data/test/integration/test_dataset.rb +0 -116
- data/test/integration/test_functions.rb +0 -88
- data/test/integration/test_result_set.rb +0 -85
- data/test/integration/test_scoring.rb +0 -84
- data/test/unit/expectations/test_exhaustive.rb +0 -111
- data/test/unit/expectations/test_simple.rb +0 -303
- data/test/unit/functions/test_binary.rb +0 -54
- data/test/unit/functions/test_cast.rb +0 -98
- data/test/unit/functions/test_length.rb +0 -52
- data/test/unit/functions/test_strftime.rb +0 -60
- data/test/unit/functions/test_trim.rb +0 -43
- data/test/unit/runner/test_single_threaded.rb +0 -12
- data/test/unit/test_data.rb +0 -445
- data/test/unit/test_decollation.rb +0 -201
- data/test/unit/test_function.rb +0 -233
- data/test/unit/test_group.rb +0 -38
- data/test/unit/test_meta_object.rb +0 -208
- data/test/unit/test_utils.rb +0 -341
data/lib/linkage/group.rb
DELETED
@@ -1,55 +0,0 @@
|
|
1
|
-
module Linkage
|
2
|
-
class Group
|
3
|
-
include Linkage::Decollation
|
4
|
-
|
5
|
-
# @return [Hash] Hash of matching values
|
6
|
-
attr_reader :values
|
7
|
-
|
8
|
-
# @return [Integer] Number of records in this group
|
9
|
-
attr_reader :count
|
10
|
-
|
11
|
-
# @return [Integer] This group's ID (if it exists)
|
12
|
-
attr_reader :id
|
13
|
-
|
14
|
-
def self.from_row(row)
|
15
|
-
values = {}
|
16
|
-
options = {}
|
17
|
-
row.each_pair do |key, value|
|
18
|
-
if key == :id || key == :count
|
19
|
-
options[key] = value
|
20
|
-
else
|
21
|
-
values[key] = value
|
22
|
-
end
|
23
|
-
end
|
24
|
-
new(values, options)
|
25
|
-
end
|
26
|
-
|
27
|
-
# @param [Hash] values Values that define this group
|
28
|
-
# @param [Hash] options
|
29
|
-
# @option options [Fixnum] :id The group ID
|
30
|
-
# @option options [Fixnum] :count How many records are in the group
|
31
|
-
# @option options [Hash] :ruby_types Hash of ruby types for each value
|
32
|
-
# @option options [Symbol] :database_type
|
33
|
-
# @example
|
34
|
-
# Linkage::Group.new({:foo => 123, :bar => 'baz'}, {:count => 5, :id => 456})
|
35
|
-
def initialize(values, options)
|
36
|
-
@count = options[:count]
|
37
|
-
@id = options[:id]
|
38
|
-
@ruby_types = options[:ruby_types]
|
39
|
-
@database_type = options[:database_type]
|
40
|
-
@values = values
|
41
|
-
end
|
42
|
-
|
43
|
-
def decollated_values
|
44
|
-
@values.inject({}) do |hsh, (key, value)|
|
45
|
-
ruby_type = @ruby_types[key]
|
46
|
-
if ruby_type && ruby_type.has_key?(:opts) && ruby_type[:opts].has_key?(:collate)
|
47
|
-
hsh[key] = decollate(value, @database_type, ruby_type[:opts][:collate])
|
48
|
-
else
|
49
|
-
hsh[key] = value
|
50
|
-
end
|
51
|
-
hsh
|
52
|
-
end
|
53
|
-
end
|
54
|
-
end
|
55
|
-
end
|
data/lib/linkage/meta_object.rb
DELETED
@@ -1,139 +0,0 @@
|
|
1
|
-
module Linkage
|
2
|
-
class MetaObject
|
3
|
-
attr_reader :object
|
4
|
-
attr_writer :side
|
5
|
-
|
6
|
-
# Creates a new MetaObject.
|
7
|
-
#
|
8
|
-
# @param [Object] object This can be a {Field}, {Function} or a regular
|
9
|
-
# Ruby object (Fixnum, String, etc). If `object` is not static (a {Field}
|
10
|
-
# or a {Function} that contains one or more {Field} objects), you should
|
11
|
-
# specify which "side" of the linkage the object belongs to (left-hand
|
12
|
-
# side or right-hand side) in the `side` argument.
|
13
|
-
# @param [Symbol] side `:lhs` for left-hand side or `:rhs` for right-hand
|
14
|
-
# side
|
15
|
-
def initialize(object, side = nil)
|
16
|
-
@object = object
|
17
|
-
@static = object.kind_of?(Linkage::Data) ? object.static? : true
|
18
|
-
if !side.nil? && side != :lhs && side != :rhs
|
19
|
-
raise ArgumentError, "invalid `side` argument, must be :lhs or :rhs"
|
20
|
-
end
|
21
|
-
@side = side
|
22
|
-
end
|
23
|
-
|
24
|
-
def side
|
25
|
-
if !@static && @side.nil?
|
26
|
-
raise RuntimeError, "Object is dynamic and side is not set"
|
27
|
-
end
|
28
|
-
@side
|
29
|
-
end
|
30
|
-
|
31
|
-
def dataset
|
32
|
-
@object.kind_of?(Linkage::Data) ? @object.dataset : nil
|
33
|
-
end
|
34
|
-
|
35
|
-
def dataset=(dataset)
|
36
|
-
if @object.kind_of?(Linkage::Data)
|
37
|
-
@object.dataset = dataset
|
38
|
-
else
|
39
|
-
raise RuntimeError, "You can't set the dataset of a non-data object."
|
40
|
-
end
|
41
|
-
end
|
42
|
-
|
43
|
-
def database_type
|
44
|
-
ds = dataset
|
45
|
-
ds ? ds.database_type : nil
|
46
|
-
end
|
47
|
-
|
48
|
-
def static?
|
49
|
-
@static
|
50
|
-
end
|
51
|
-
|
52
|
-
# Returns true if the argument has the same object as the instance.
|
53
|
-
#
|
54
|
-
# @param [Linkage::MetaObject] other
|
55
|
-
# @return [Boolean]
|
56
|
-
def objects_equal?(other)
|
57
|
-
other.is_a?(Linkage::MetaObject) && other.object == self.object
|
58
|
-
end
|
59
|
-
|
60
|
-
# Returns true if the argument has the same dataset as the instance.
|
61
|
-
#
|
62
|
-
# @param [Linkage::MetaObject] other
|
63
|
-
# @return [Boolean]
|
64
|
-
def datasets_equal?(other)
|
65
|
-
other.is_a?(Linkage::MetaObject) && other.dataset == self.dataset
|
66
|
-
end
|
67
|
-
|
68
|
-
# Returns an expression suitable for use in Sequel queries.
|
69
|
-
# @return [Object]
|
70
|
-
def to_expr
|
71
|
-
if @object.kind_of?(Linkage::Data)
|
72
|
-
@object.to_expr
|
73
|
-
else
|
74
|
-
@object
|
75
|
-
end
|
76
|
-
end
|
77
|
-
|
78
|
-
# Returns a Sequel identifier for {Data} objects, or the object itself.
|
79
|
-
# @return [Sequel::SQL::Identifier, Object]
|
80
|
-
def to_identifier
|
81
|
-
if @object.kind_of?(Linkage::Data)
|
82
|
-
Sequel::SQL::Identifier.new(@object.to_expr)
|
83
|
-
else
|
84
|
-
@object
|
85
|
-
end
|
86
|
-
end
|
87
|
-
|
88
|
-
# Return the name of the object for {Data} objects, nil for others.
|
89
|
-
# @return [Symbol, nil]
|
90
|
-
def name
|
91
|
-
if @object.kind_of?(Linkage::Data)
|
92
|
-
@object.name
|
93
|
-
else
|
94
|
-
nil
|
95
|
-
end
|
96
|
-
end
|
97
|
-
|
98
|
-
# Returns a {MergeField} if both objects are {Data} objects, otherwise,
|
99
|
-
# raises an exception.
|
100
|
-
#
|
101
|
-
# @return [Linkage::MergeField]
|
102
|
-
def merge(other)
|
103
|
-
if @object.kind_of?(Linkage::Data) && other.object.kind_of?(Linkage::Data)
|
104
|
-
@object.merge(other.object)
|
105
|
-
else
|
106
|
-
raise ArgumentError, "Cannot merge a non-data object"
|
107
|
-
end
|
108
|
-
end
|
109
|
-
|
110
|
-
# Returns the Ruby type of the underlying object.
|
111
|
-
#
|
112
|
-
# @return [Hash]
|
113
|
-
# @see Linkage::Field#ruby_type
|
114
|
-
# @see Linkage::Function#ruby_type
|
115
|
-
def ruby_type
|
116
|
-
if @object.kind_of?(Linkage::Data)
|
117
|
-
@object.ruby_type
|
118
|
-
else
|
119
|
-
{:type => @object.class}
|
120
|
-
end
|
121
|
-
end
|
122
|
-
|
123
|
-
# Returns the collation of the underlying object.
|
124
|
-
#
|
125
|
-
# @return [Symbol]
|
126
|
-
def collation
|
127
|
-
if @object.kind_of?(Linkage::Data)
|
128
|
-
@object.collation
|
129
|
-
else
|
130
|
-
nil
|
131
|
-
end
|
132
|
-
end
|
133
|
-
|
134
|
-
# Returns true if underlying object is not a subclass of {Linkage::Data}.
|
135
|
-
def raw?
|
136
|
-
!@object.kind_of?(Linkage::Data)
|
137
|
-
end
|
138
|
-
end
|
139
|
-
end
|
data/lib/linkage/runner/single_threaded.rb
DELETED
@@ -1,187 +0,0 @@
|
|
1
|
-
module Linkage
|
2
|
-
# A runner class that only uses a single thread to execute a linkage.
|
3
|
-
#
|
4
|
-
# @see Runner
|
5
|
-
class SingleThreadedRunner < Runner
|
6
|
-
# @return [Linkage::ResultSet]
|
7
|
-
def execute
|
8
|
-
result_set.create_tables!
|
9
|
-
|
10
|
-
@pk_1 = config.dataset_1.field_set.primary_key.to_expr
|
11
|
-
@pk_2 = config.dataset_2.field_set.primary_key.to_expr
|
12
|
-
if config.has_simple_expectations?
|
13
|
-
setup_datasets
|
14
|
-
group_records
|
15
|
-
|
16
|
-
if config.has_exhaustive_expectations?
|
17
|
-
score_records_with_groups
|
18
|
-
else
|
19
|
-
create_matches
|
20
|
-
end
|
21
|
-
else
|
22
|
-
dataset_1, dataset_2 = config.datasets_with_applied_exhaustive_expectations
|
23
|
-
score_records_without_groups(dataset_1, dataset_2)
|
24
|
-
end
|
25
|
-
|
26
|
-
result_set.flush!
|
27
|
-
return result_set
|
28
|
-
end
|
29
|
-
|
30
|
-
private
|
31
|
-
|
32
|
-
def setup_datasets
|
33
|
-
@dataset_1, @dataset_2 = config.datasets_with_applied_simple_expectations
|
34
|
-
|
35
|
-
@dataset_1 = @dataset_1.select(@pk_1)
|
36
|
-
if @config.linkage_type != :self
|
37
|
-
@dataset_2 = @dataset_2.select(@pk_2)
|
38
|
-
end
|
39
|
-
end
|
40
|
-
|
41
|
-
def group_records
|
42
|
-
if config.linkage_type == :self
|
43
|
-
group_records_for(@dataset_1, 1)
|
44
|
-
else
|
45
|
-
group_records_for(@dataset_1, 1, false)
|
46
|
-
group_records_for(@dataset_2, 2, false)
|
47
|
-
combine_groups
|
48
|
-
end
|
49
|
-
end
|
50
|
-
|
51
|
-
# @param [Linkage::Dataset] dataset
|
52
|
-
# @param [Fixnum, nil] dataset_id
|
53
|
-
# @param [Boolean] ignore_empty_groups
|
54
|
-
# @yield [Linkage::Group] If a block is given, yield completed groups to
|
55
|
-
# the block. Otherwise, call ResultSet#add_group on the group.
|
56
|
-
def group_records_for(dataset, dataset_id, ignore_empty_groups = true)
|
57
|
-
group_minimum = ignore_empty_groups ? 2 : 1
|
58
|
-
dataset.each_group(group_minimum) do |group|
|
59
|
-
result_set.add_group(group, dataset_id)
|
60
|
-
end
|
61
|
-
result_set.flush!
|
62
|
-
end
|
63
|
-
|
64
|
-
def combine_groups
|
65
|
-
# Create a new dataset for the groups table
|
66
|
-
groups_dataset = result_set.groups_dataset
|
67
|
-
|
68
|
-
groups_dataset.field_set.values.each do |field|
|
69
|
-
# Sort on all fields
|
70
|
-
if !field.primary_key?
|
71
|
-
meta_object = MetaObject.new(field)
|
72
|
-
groups_dataset = groups_dataset.group_match_more(meta_object)
|
73
|
-
end
|
74
|
-
end
|
75
|
-
|
76
|
-
# Delete non-matching groups
|
77
|
-
sub_dataset = groups_dataset.select(:id).group_by_matches.having(:count.sql_function(:id) => 1)
|
78
|
-
groups_dataset.filter(:id => sub_dataset.obj).delete
|
79
|
-
|
80
|
-
# Delete duplicate groups
|
81
|
-
sub_dataset = groups_dataset.select(:max.sql_function(:id).as(:id)).group_by_matches
|
82
|
-
groups_dataset.filter(:id => sub_dataset.obj).delete
|
83
|
-
end
|
84
|
-
|
85
|
-
def score_records_with_groups
|
86
|
-
result_set.groups_dataset.each do |group_record|
|
87
|
-
group = Group.from_row(group_record)
|
88
|
-
dataset_1, dataset_2 = config.apply_exhaustive_expectations(
|
89
|
-
*result_set.groups_records_datasets(group))
|
90
|
-
score_records_without_groups(dataset_1, dataset_2)
|
91
|
-
end
|
92
|
-
end
|
93
|
-
|
94
|
-
def score_records_without_groups(dataset_1, dataset_2)
|
95
|
-
if config.linkage_type == :self
|
96
|
-
keys = dataset_1.select_map(@pk_1)
|
97
|
-
unfiltered_dataset = dataset_1.unfiltered
|
98
|
-
cache = Hashery::LRUHash.new(config.record_cache_size) do |h, k|
|
99
|
-
h[k] = unfiltered_dataset.filter(@pk_1 => k).first
|
100
|
-
end
|
101
|
-
upper_bound = keys.length - 1
|
102
|
-
|
103
|
-
forward = true
|
104
|
-
keys.each_with_index do |key_1, key_1_index|
|
105
|
-
record_1 = cache[key_1]
|
106
|
-
|
107
|
-
lower_bound = key_1_index + 1
|
108
|
-
enum =
|
109
|
-
if forward
|
110
|
-
lower_bound.upto(upper_bound)
|
111
|
-
else
|
112
|
-
upper_bound.downto(lower_bound)
|
113
|
-
end
|
114
|
-
enum.each do |key_2_index|
|
115
|
-
record_2 = cache[keys[key_2_index]]
|
116
|
-
score(record_1, record_2)
|
117
|
-
end
|
118
|
-
forward = !forward
|
119
|
-
end
|
120
|
-
else
|
121
|
-
keys_2 = dataset_2.select_map(@pk_2)
|
122
|
-
unfiltered_dataset_2 = dataset_2.unfiltered
|
123
|
-
cache_2 = Hashery::LRUHash.new(config.record_cache_size) do |h, k|
|
124
|
-
h[k] = unfiltered_dataset_2.filter(@pk_2 => k).first
|
125
|
-
end
|
126
|
-
keys_2_last = keys_2.length - 1
|
127
|
-
|
128
|
-
forward = true
|
129
|
-
dataset_1.each do |record_1|
|
130
|
-
enum = forward ? 0.upto(keys_2_last) : keys_2_last.downto(0)
|
131
|
-
enum.each do |key_2_index|
|
132
|
-
record_2 = cache_2[keys_2[key_2_index]]
|
133
|
-
score(record_1, record_2)
|
134
|
-
end
|
135
|
-
forward = !forward
|
136
|
-
end
|
137
|
-
end
|
138
|
-
end
|
139
|
-
|
140
|
-
def score(record_1, record_2)
|
141
|
-
pk_1 = record_1[@pk_1]
|
142
|
-
pk_2 = record_2[@pk_2]
|
143
|
-
|
144
|
-
catch(:stop) do
|
145
|
-
total_score = 0
|
146
|
-
config.exhaustive_expectations.each_with_index do |expectation, comparator_id|
|
147
|
-
comparator = expectation.comparator
|
148
|
-
|
149
|
-
score = comparator.score(record_1, record_2)
|
150
|
-
result_set.add_score(comparator_id, pk_1, pk_2, score)
|
151
|
-
|
152
|
-
throw(:stop) unless expectation.satisfied?(score)
|
153
|
-
total_score += score
|
154
|
-
end
|
155
|
-
result_set.add_match(pk_1, pk_2, total_score)
|
156
|
-
end
|
157
|
-
end
|
158
|
-
|
159
|
-
# Only needed for linkages without exhaustive expectations
|
160
|
-
def create_matches
|
161
|
-
result_set.groups_dataset.each do |group_record|
|
162
|
-
group = Group.from_row(group_record)
|
163
|
-
dataset_1, dataset_2 = result_set.groups_records_datasets(group)
|
164
|
-
|
165
|
-
if config.linkage_type == :self
|
166
|
-
keys = dataset_1.select_map(@pk_1)
|
167
|
-
keys_last = keys.length - 1
|
168
|
-
keys.each_with_index do |key_1, key_1_index|
|
169
|
-
(key_1_index + 1).upto(keys_last) do |key_2_index|
|
170
|
-
key_2 = keys[key_2_index]
|
171
|
-
result_set.add_match(key_1, key_2, nil)
|
172
|
-
end
|
173
|
-
end
|
174
|
-
else
|
175
|
-
keys_1 = dataset_1.select_map(@pk_1)
|
176
|
-
keys_2 = dataset_2.select_map(@pk_2)
|
177
|
-
|
178
|
-
keys_1.each do |key_1|
|
179
|
-
keys_2.each do |key_2|
|
180
|
-
result_set.add_match(key_1, key_2, nil)
|
181
|
-
end
|
182
|
-
end
|
183
|
-
end
|
184
|
-
end
|
185
|
-
end
|
186
|
-
end
|
187
|
-
end
|
data/lib/linkage/utils.rb
DELETED
@@ -1,164 +0,0 @@
|
|
1
|
-
module Linkage
|
2
|
-
module Utils
|
3
|
-
# A "tree" used to find compatible types.
|
4
|
-
TYPE_CONVERSION_TREE = {
|
5
|
-
TrueClass => [Integer],
|
6
|
-
Integer => [Bignum, Float],
|
7
|
-
Bignum => [BigDecimal],
|
8
|
-
Float => [BigDecimal],
|
9
|
-
BigDecimal => [String],
|
10
|
-
String => nil,
|
11
|
-
DateTime => nil,
|
12
|
-
Date => nil,
|
13
|
-
Time => nil,
|
14
|
-
File => nil
|
15
|
-
}
|
16
|
-
|
17
|
-
# Create field information for a field that can hold data from two other
|
18
|
-
# fields. If the fields have different types, the resulting type is
|
19
|
-
# determined via a type-conversion tree.
|
20
|
-
#
|
21
|
-
# @param [Array] field_1 Schema information for the first field
|
22
|
-
# @param [Array] field_2 Schema information for the second field
|
23
|
-
# @return [Array] Schema information for the new field
|
24
|
-
def merge_fields(field_1, field_2)
|
25
|
-
schema_1 = column_schema_to_ruby_type(field_1)
|
26
|
-
schema_1.delete_if { |k, v| v.nil? }
|
27
|
-
schema_2 = column_schema_to_ruby_type(field_2)
|
28
|
-
schema_2.delete_if { |k, v| v.nil? }
|
29
|
-
if schema_1 == schema_2
|
30
|
-
result = schema_1
|
31
|
-
else
|
32
|
-
result = schema_1.dup
|
33
|
-
|
34
|
-
# type
|
35
|
-
if schema_1[:type] != schema_2[:type]
|
36
|
-
result[:type] = first_common_type(schema_1[:type], schema_2[:type])
|
37
|
-
end
|
38
|
-
|
39
|
-
# text
|
40
|
-
if schema_1[:text] != schema_2[:text]
|
41
|
-
# This can only be of type String.
|
42
|
-
result[:text] = true
|
43
|
-
result.delete(:size)
|
44
|
-
end
|
45
|
-
|
46
|
-
# size
|
47
|
-
if !result[:text] && schema_1[:size] != schema_2[:size]
|
48
|
-
types = [schema_1[:type], schema_2[:type]].uniq
|
49
|
-
if types.length == 1 && types[0] == BigDecimal
|
50
|
-
# Two decimals
|
51
|
-
if schema_1.has_key?(:size) && schema_2.has_key?(:size)
|
52
|
-
s_1 = schema_1[:size]
|
53
|
-
s_2 = schema_2[:size]
|
54
|
-
result[:size] = [ s_1[0] > s_2[0] ? s_1[0] : s_2[0] ]
|
55
|
-
|
56
|
-
if s_1[1] && s_2[1]
|
57
|
-
result[:size][1] = s_1[1] > s_2[1] ? s_1[1] : s_2[1]
|
58
|
-
else
|
59
|
-
result[:size][1] = s_1[1] ? s_1[1] : s_2[1]
|
60
|
-
end
|
61
|
-
else
|
62
|
-
result[:size] = schema_1.has_key?(:size) ? schema_1[:size] : schema_2[:size]
|
63
|
-
end
|
64
|
-
elsif types.include?(String) && types.include?(BigDecimal)
|
65
|
-
# Add one to the precision of the BigDecimal (for the dot)
|
66
|
-
if schema_1.has_key?(:size) && schema_2.has_key?(:size)
|
67
|
-
s_1 = schema_1[:size].is_a?(Array) ? schema_1[:size][0] + 1 : schema_1[:size]
|
68
|
-
s_2 = schema_2[:size].is_a?(Array) ? schema_2[:size][0] + 1 : schema_2[:size]
|
69
|
-
result[:size] = s_1 > s_2 ? s_1 : s_2
|
70
|
-
elsif schema_1.has_key?(:size)
|
71
|
-
result[:size] = schema_1[:size].is_a?(Array) ? schema_1[:size][0] + 1 : schema_1[:size]
|
72
|
-
elsif schema_2.has_key?(:size)
|
73
|
-
result[:size] = schema_2[:size].is_a?(Array) ? schema_2[:size][0] + 1 : schema_2[:size]
|
74
|
-
end
|
75
|
-
else
|
76
|
-
# Treat as two strings
|
77
|
-
if schema_1.has_key?(:size) && schema_2.has_key?(:size)
|
78
|
-
result[:size] = schema_1[:size] > schema_2[:size] ? schema_1[:size] : schema_2[:size]
|
79
|
-
elsif schema_1.has_key?(:size)
|
80
|
-
result[:size] = schema_1[:size]
|
81
|
-
else
|
82
|
-
result[:size] = schema_2[:size]
|
83
|
-
end
|
84
|
-
end
|
85
|
-
end
|
86
|
-
|
87
|
-
# fixed
|
88
|
-
if schema_1[:fixed] != schema_2[:fixed]
|
89
|
-
# This can only be of type String.
|
90
|
-
result[:fixed] = true
|
91
|
-
end
|
92
|
-
end
|
93
|
-
|
94
|
-
{:type => result.delete(:type), :opts => result}
|
95
|
-
end
|
96
|
-
|
97
|
-
private
|
98
|
-
|
99
|
-
# Convert the column schema information to a hash of column options, one of which must
|
100
|
-
# be :type. The other options added should modify that type (e.g. :size). If a
|
101
|
-
# database type is not recognized, return it as a String type.
|
102
|
-
#
|
103
|
-
# @note This method comes straight from Sequel (lib/sequel/extensions/schema_dumper.rb).
|
104
|
-
def column_schema_to_ruby_type(schema)
|
105
|
-
case t = schema[:db_type].downcase
|
106
|
-
when /\A(?:medium|small)?int(?:eger)?(?:\((?:\d+)\))?(?: unsigned)?\z/o
|
107
|
-
{:type=>Integer}
|
108
|
-
when /\Atinyint(?:\((\d+)\))?\z/o
|
109
|
-
{:type =>schema[:type] == :boolean ? TrueClass : Integer}
|
110
|
-
when /\Abigint(?:\((?:\d+)\))?(?: unsigned)?\z/o
|
111
|
-
{:type=>Bignum}
|
112
|
-
when /\A(?:real|float|double(?: precision)?)\z/o
|
113
|
-
{:type=>Float}
|
114
|
-
when 'boolean'
|
115
|
-
{:type=>TrueClass}
|
116
|
-
when /\A(?:(?:tiny|medium|long|n)?text|clob)\z/o
|
117
|
-
{:type=>String, :text=>true}
|
118
|
-
when 'date'
|
119
|
-
{:type=>Date}
|
120
|
-
when /\A(?:small)?datetime\z/o
|
121
|
-
{:type=>DateTime}
|
122
|
-
when /\Atimestamp(?:\((\d+)\))?(?: with(?:out)? time zone)?\z/o
|
123
|
-
{:type=>DateTime, :size=>($1.to_i if $1)}
|
124
|
-
when /\Atime(?: with(?:out)? time zone)?\z/o
|
125
|
-
{:type=>Time, :only_time=>true}
|
126
|
-
when /\An?char(?:acter)?(?:\((\d+)\))?\z/o
|
127
|
-
{:type=>String, :size=>($1.to_i if $1), :fixed=>true}
|
128
|
-
when /\A(?:n?varchar|character varying|bpchar|string)(?:\((\d+)\))?\z/o
|
129
|
-
{:type=>String, :size=>($1.to_i if $1)}
|
130
|
-
when /\A(?:small)?money\z/o
|
131
|
-
{:type=>BigDecimal, :size=>[19,2]}
|
132
|
-
when /\A(?:decimal|numeric|number)(?:\((\d+)(?:,\s*(\d+))?\))?\z/o
|
133
|
-
s = [($1.to_i if $1), ($2.to_i if $2)].compact
|
134
|
-
{:type=>BigDecimal, :size=>(s.empty? ? nil : s)}
|
135
|
-
when /\A(?:bytea|(?:tiny|medium|long)?blob|(?:var)?binary)(?:\((\d+)\))?\z/o
|
136
|
-
{:type=>File, :size=>($1.to_i if $1)}
|
137
|
-
when 'year'
|
138
|
-
{:type=>Integer}
|
139
|
-
else
|
140
|
-
{:type=>String}
|
141
|
-
end
|
142
|
-
end
|
143
|
-
|
144
|
-
def first_common_type(type_1, type_2)
|
145
|
-
types_1 = [type_1] + get_types(type_1)
|
146
|
-
types_2 = [type_2] + get_types(type_2)
|
147
|
-
(types_1 & types_2).first
|
148
|
-
end
|
149
|
-
|
150
|
-
# Get all types that the specified type can be converted to. Order
|
151
|
-
# matters.
|
152
|
-
def get_types(type)
|
153
|
-
result = []
|
154
|
-
types = TYPE_CONVERSION_TREE[type]
|
155
|
-
if types
|
156
|
-
result += types
|
157
|
-
types.each do |t|
|
158
|
-
result |= get_types(t)
|
159
|
-
end
|
160
|
-
end
|
161
|
-
result
|
162
|
-
end
|
163
|
-
end
|
164
|
-
end
|
data/lib/linkage/warnings.rb
DELETED
data/test/integration/test_collation.rb
DELETED
@@ -1,45 +0,0 @@
|
|
1
|
-
require 'helper'
|
2
|
-
|
3
|
-
module IntegrationTests
|
4
|
-
class TestCollation < Test::Unit::TestCase
|
5
|
-
def setup
|
6
|
-
@tmpdir = Dir.mktmpdir('linkage')
|
7
|
-
@tmpuri = "sqlite://" + File.join(@tmpdir, "foo")
|
8
|
-
end
|
9
|
-
|
10
|
-
def database(options = {}, &block)
|
11
|
-
Sequel.connect(@tmpuri, options, &block)
|
12
|
-
end
|
13
|
-
|
14
|
-
def teardown
|
15
|
-
FileUtils.remove_entry_secure(@tmpdir)
|
16
|
-
end
|
17
|
-
|
18
|
-
test "comparing strings exactly in MySQL" do
|
19
|
-
options = database_options_for('mysql')
|
20
|
-
database_for('mysql') do |db|
|
21
|
-
db.create_table!(:foo) do
|
22
|
-
primary_key :id
|
23
|
-
String :foo
|
24
|
-
String :bar
|
25
|
-
end
|
26
|
-
db[:foo].import([:foo, :bar], [
|
27
|
-
["Foo", "foo"],
|
28
|
-
["bar", "bar "],
|
29
|
-
])
|
30
|
-
end
|
31
|
-
dataset = Linkage::Dataset.new(options, :foo)
|
32
|
-
tmpuri = @tmpuri
|
33
|
-
conf = dataset.link_with(dataset) do
|
34
|
-
(lhs[:foo].must == rhs[:bar]).exactly
|
35
|
-
save_results_in(tmpuri)
|
36
|
-
end
|
37
|
-
runner = Linkage::SingleThreadedRunner.new(conf)
|
38
|
-
runner.execute
|
39
|
-
|
40
|
-
database do |db|
|
41
|
-
assert_equal 0, db[:groups].count
|
42
|
-
end
|
43
|
-
end
|
44
|
-
end
|
45
|
-
end
|