linkage 0.0.8 → 0.1.0.pre
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +1 -0
- data/.yardopts +1 -0
- data/Gemfile +1 -19
- data/Gemfile-java +3 -0
- data/README.markdown +88 -34
- data/Rakefile +16 -15
- data/TODO +4 -0
- data/lib/linkage/comparator.rb +139 -144
- data/lib/linkage/comparators/compare.rb +236 -29
- data/lib/linkage/comparators/strcompare.rb +85 -0
- data/lib/linkage/comparators/within.rb +24 -20
- data/lib/linkage/configuration.rb +44 -466
- data/lib/linkage/dataset.rb +28 -127
- data/lib/linkage/exceptions.rb +5 -0
- data/lib/linkage/field.rb +6 -37
- data/lib/linkage/field_set.rb +3 -3
- data/lib/linkage/match_recorder.rb +22 -0
- data/lib/linkage/match_set.rb +34 -0
- data/lib/linkage/match_sets/csv.rb +39 -0
- data/lib/linkage/match_sets/database.rb +45 -0
- data/lib/linkage/matcher.rb +30 -0
- data/lib/linkage/result_set.rb +25 -110
- data/lib/linkage/result_sets/csv.rb +54 -0
- data/lib/linkage/result_sets/database.rb +42 -0
- data/lib/linkage/runner.rb +57 -16
- data/lib/linkage/score_recorder.rb +30 -0
- data/lib/linkage/score_set.rb +49 -0
- data/lib/linkage/score_sets/csv.rb +64 -0
- data/lib/linkage/score_sets/database.rb +77 -0
- data/lib/linkage/version.rb +1 -1
- data/lib/linkage.rb +14 -17
- data/linkage.gemspec +13 -1
- data/linkage.gemspec-java +32 -0
- data/test/helper.rb +30 -23
- data/test/integration/test_cross_linkage.rb +46 -25
- data/test/integration/test_database_result_set.rb +55 -0
- data/test/integration/test_dual_linkage.rb +19 -94
- data/test/integration/test_self_linkage.rb +100 -203
- data/test/integration/test_within_comparator.rb +24 -77
- data/test/unit/comparators/test_compare.rb +254 -50
- data/test/unit/comparators/test_strcompare.rb +45 -0
- data/test/unit/comparators/test_within.rb +14 -26
- data/test/unit/match_sets/test_csv.rb +78 -0
- data/test/unit/match_sets/test_database.rb +63 -0
- data/test/unit/result_sets/test_csv.rb +111 -0
- data/test/unit/result_sets/test_database.rb +68 -0
- data/test/unit/score_sets/test_csv.rb +151 -0
- data/test/unit/score_sets/test_database.rb +149 -0
- data/test/unit/test_comparator.rb +46 -83
- data/test/unit/test_comparators.rb +4 -0
- data/test/unit/test_configuration.rb +99 -145
- data/test/unit/test_dataset.rb +52 -73
- data/test/unit/test_field.rb +4 -55
- data/test/unit/test_field_set.rb +6 -6
- data/test/unit/test_match_recorder.rb +23 -0
- data/test/unit/test_match_set.rb +23 -0
- data/test/unit/test_match_sets.rb +4 -0
- data/test/unit/test_matcher.rb +44 -0
- data/test/unit/test_result_set.rb +24 -223
- data/test/unit/test_result_sets.rb +4 -0
- data/test/unit/test_runner.rb +122 -17
- data/test/unit/test_runners.rb +4 -0
- data/test/unit/test_score_recorder.rb +25 -0
- data/test/unit/test_score_set.rb +37 -0
- data/test/unit/test_score_sets.rb +4 -0
- metadata +183 -90
- data/Gemfile.lock +0 -92
- data/lib/linkage/comparators/binary.rb +0 -12
- data/lib/linkage/data.rb +0 -175
- data/lib/linkage/decollation.rb +0 -93
- data/lib/linkage/expectation.rb +0 -21
- data/lib/linkage/expectations/exhaustive.rb +0 -63
- data/lib/linkage/expectations/simple.rb +0 -168
- data/lib/linkage/function.rb +0 -148
- data/lib/linkage/functions/binary.rb +0 -30
- data/lib/linkage/functions/cast.rb +0 -54
- data/lib/linkage/functions/length.rb +0 -29
- data/lib/linkage/functions/strftime.rb +0 -33
- data/lib/linkage/functions/trim.rb +0 -30
- data/lib/linkage/group.rb +0 -55
- data/lib/linkage/meta_object.rb +0 -139
- data/lib/linkage/runner/single_threaded.rb +0 -187
- data/lib/linkage/utils.rb +0 -164
- data/lib/linkage/warnings.rb +0 -5
- data/test/integration/test_collation.rb +0 -45
- data/test/integration/test_configuration.rb +0 -268
- data/test/integration/test_dataset.rb +0 -116
- data/test/integration/test_functions.rb +0 -88
- data/test/integration/test_result_set.rb +0 -85
- data/test/integration/test_scoring.rb +0 -84
- data/test/unit/expectations/test_exhaustive.rb +0 -111
- data/test/unit/expectations/test_simple.rb +0 -303
- data/test/unit/functions/test_binary.rb +0 -54
- data/test/unit/functions/test_cast.rb +0 -98
- data/test/unit/functions/test_length.rb +0 -52
- data/test/unit/functions/test_strftime.rb +0 -60
- data/test/unit/functions/test_trim.rb +0 -43
- data/test/unit/runner/test_single_threaded.rb +0 -12
- data/test/unit/test_data.rb +0 -445
- data/test/unit/test_decollation.rb +0 -201
- data/test/unit/test_function.rb +0 -233
- data/test/unit/test_group.rb +0 -38
- data/test/unit/test_meta_object.rb +0 -208
- data/test/unit/test_utils.rb +0 -341
data/lib/linkage/group.rb
DELETED
@@ -1,55 +0,0 @@
|
|
1
|
-
module Linkage
|
2
|
-
class Group
|
3
|
-
include Linkage::Decollation
|
4
|
-
|
5
|
-
# @return [Hash] Hash of matching values
|
6
|
-
attr_reader :values
|
7
|
-
|
8
|
-
# @return [Integer] Number of records in this group
|
9
|
-
attr_reader :count
|
10
|
-
|
11
|
-
# @return [Integer] This group's ID (if it exists)
|
12
|
-
attr_reader :id
|
13
|
-
|
14
|
-
def self.from_row(row)
|
15
|
-
values = {}
|
16
|
-
options = {}
|
17
|
-
row.each_pair do |key, value|
|
18
|
-
if key == :id || key == :count
|
19
|
-
options[key] = value
|
20
|
-
else
|
21
|
-
values[key] = value
|
22
|
-
end
|
23
|
-
end
|
24
|
-
new(values, options)
|
25
|
-
end
|
26
|
-
|
27
|
-
# @param [Hash] values Values that define this group
|
28
|
-
# @param [Hash] options
|
29
|
-
# @option options [Fixnum] :id The group ID
|
30
|
-
# @option options [Fixnum] :count How many records are in the group
|
31
|
-
# @option options [Hash] :ruby_types Hash of ruby types for each value
|
32
|
-
# @option options [Symbol] :database_type
|
33
|
-
# @example
|
34
|
-
# Linkage::Group.new({:foo => 123, :bar => 'baz'}, {:count => 5, :id => 456})
|
35
|
-
def initialize(values, options)
|
36
|
-
@count = options[:count]
|
37
|
-
@id = options[:id]
|
38
|
-
@ruby_types = options[:ruby_types]
|
39
|
-
@database_type = options[:database_type]
|
40
|
-
@values = values
|
41
|
-
end
|
42
|
-
|
43
|
-
def decollated_values
|
44
|
-
@values.inject({}) do |hsh, (key, value)|
|
45
|
-
ruby_type = @ruby_types[key]
|
46
|
-
if ruby_type && ruby_type.has_key?(:opts) && ruby_type[:opts].has_key?(:collate)
|
47
|
-
hsh[key] = decollate(value, @database_type, ruby_type[:opts][:collate])
|
48
|
-
else
|
49
|
-
hsh[key] = value
|
50
|
-
end
|
51
|
-
hsh
|
52
|
-
end
|
53
|
-
end
|
54
|
-
end
|
55
|
-
end
|
data/lib/linkage/meta_object.rb
DELETED
@@ -1,139 +0,0 @@
|
|
1
|
-
module Linkage
|
2
|
-
class MetaObject
|
3
|
-
attr_reader :object
|
4
|
-
attr_writer :side
|
5
|
-
|
6
|
-
# Creates a new MetaObject.
|
7
|
-
#
|
8
|
-
# @param [Object] object This can be a {Field}, {Function} or a regular
|
9
|
-
# Ruby object (Fixnum, String, etc). If `object` is not static (a {Field}
|
10
|
-
# or a {Function} that contains one or more {Field} objects), you should
|
11
|
-
# specify which "side" of the linkage the object belongs to (left-hand
|
12
|
-
# side or right-hand side) in the `side` argument.
|
13
|
-
# @param [Symbol] side `:lhs` for left-hand side or `:rhs` for right-hand
|
14
|
-
# side
|
15
|
-
def initialize(object, side = nil)
|
16
|
-
@object = object
|
17
|
-
@static = object.kind_of?(Linkage::Data) ? object.static? : true
|
18
|
-
if !side.nil? && side != :lhs && side != :rhs
|
19
|
-
raise ArgumentError, "invalid `side` argument, must be :lhs or :rhs"
|
20
|
-
end
|
21
|
-
@side = side
|
22
|
-
end
|
23
|
-
|
24
|
-
def side
|
25
|
-
if !@static && @side.nil?
|
26
|
-
raise RuntimeError, "Object is dynamic and side is not set"
|
27
|
-
end
|
28
|
-
@side
|
29
|
-
end
|
30
|
-
|
31
|
-
def dataset
|
32
|
-
@object.kind_of?(Linkage::Data) ? @object.dataset : nil
|
33
|
-
end
|
34
|
-
|
35
|
-
def dataset=(dataset)
|
36
|
-
if @object.kind_of?(Linkage::Data)
|
37
|
-
@object.dataset = dataset
|
38
|
-
else
|
39
|
-
raise RuntimeError, "You can't set the dataset of a non-data object."
|
40
|
-
end
|
41
|
-
end
|
42
|
-
|
43
|
-
def database_type
|
44
|
-
ds = dataset
|
45
|
-
ds ? ds.database_type : nil
|
46
|
-
end
|
47
|
-
|
48
|
-
def static?
|
49
|
-
@static
|
50
|
-
end
|
51
|
-
|
52
|
-
# Returns true if the argument has the same object as the instance.
|
53
|
-
#
|
54
|
-
# @param [Linkage::MetaObject] other
|
55
|
-
# @return [Boolean]
|
56
|
-
def objects_equal?(other)
|
57
|
-
other.is_a?(Linkage::MetaObject) && other.object == self.object
|
58
|
-
end
|
59
|
-
|
60
|
-
# Returns true if the argument has the same dataset as the instance.
|
61
|
-
#
|
62
|
-
# @param [Linkage::MetaObject] other
|
63
|
-
# @return [Boolean]
|
64
|
-
def datasets_equal?(other)
|
65
|
-
other.is_a?(Linkage::MetaObject) && other.dataset == self.dataset
|
66
|
-
end
|
67
|
-
|
68
|
-
# Returns an expression suitable for use in Sequel queries.
|
69
|
-
# @return [Object]
|
70
|
-
def to_expr
|
71
|
-
if @object.kind_of?(Linkage::Data)
|
72
|
-
@object.to_expr
|
73
|
-
else
|
74
|
-
@object
|
75
|
-
end
|
76
|
-
end
|
77
|
-
|
78
|
-
# Returns a Sequel identifier for {Data} objects, or the object itself.
|
79
|
-
# @return [Sequel::SQL::Identifier, Object]
|
80
|
-
def to_identifier
|
81
|
-
if @object.kind_of?(Linkage::Data)
|
82
|
-
Sequel::SQL::Identifier.new(@object.to_expr)
|
83
|
-
else
|
84
|
-
@object
|
85
|
-
end
|
86
|
-
end
|
87
|
-
|
88
|
-
# Return the name of the object for {Data} objects, nil for others.
|
89
|
-
# @return [Symbol, nil]
|
90
|
-
def name
|
91
|
-
if @object.kind_of?(Linkage::Data)
|
92
|
-
@object.name
|
93
|
-
else
|
94
|
-
nil
|
95
|
-
end
|
96
|
-
end
|
97
|
-
|
98
|
-
# Returns a {MergeField} if both objects are {Data} objects, otherwise,
|
99
|
-
# raises an exception.
|
100
|
-
#
|
101
|
-
# @return [Linkage::MergeField]
|
102
|
-
def merge(other)
|
103
|
-
if @object.kind_of?(Linkage::Data) && other.object.kind_of?(Linkage::Data)
|
104
|
-
@object.merge(other.object)
|
105
|
-
else
|
106
|
-
raise ArgumentError, "Cannot merge a non-data object"
|
107
|
-
end
|
108
|
-
end
|
109
|
-
|
110
|
-
# Returns the Ruby type of the underlying object.
|
111
|
-
#
|
112
|
-
# @return [Hash]
|
113
|
-
# @see Linkage::Field#ruby_type
|
114
|
-
# @see Linkage::Function#ruby_type
|
115
|
-
def ruby_type
|
116
|
-
if @object.kind_of?(Linkage::Data)
|
117
|
-
@object.ruby_type
|
118
|
-
else
|
119
|
-
{:type => @object.class}
|
120
|
-
end
|
121
|
-
end
|
122
|
-
|
123
|
-
# Returns the collation of the underlying object.
|
124
|
-
#
|
125
|
-
# @return [Symbol]
|
126
|
-
def collation
|
127
|
-
if @object.kind_of?(Linkage::Data)
|
128
|
-
@object.collation
|
129
|
-
else
|
130
|
-
nil
|
131
|
-
end
|
132
|
-
end
|
133
|
-
|
134
|
-
# Returns true if underlying object is not a subclass of {Linkage::Data}.
|
135
|
-
def raw?
|
136
|
-
!@object.kind_of?(Linkage::Data)
|
137
|
-
end
|
138
|
-
end
|
139
|
-
end
|
data/lib/linkage/runner/single_threaded.rb
DELETED
@@ -1,187 +0,0 @@
|
|
1
|
-
module Linkage
|
2
|
-
# A runner class that only uses a single thread to execute a linkage.
|
3
|
-
#
|
4
|
-
# @see Runner
|
5
|
-
class SingleThreadedRunner < Runner
|
6
|
-
# @return [Linkage::ResultSet]
|
7
|
-
def execute
|
8
|
-
result_set.create_tables!
|
9
|
-
|
10
|
-
@pk_1 = config.dataset_1.field_set.primary_key.to_expr
|
11
|
-
@pk_2 = config.dataset_2.field_set.primary_key.to_expr
|
12
|
-
if config.has_simple_expectations?
|
13
|
-
setup_datasets
|
14
|
-
group_records
|
15
|
-
|
16
|
-
if config.has_exhaustive_expectations?
|
17
|
-
score_records_with_groups
|
18
|
-
else
|
19
|
-
create_matches
|
20
|
-
end
|
21
|
-
else
|
22
|
-
dataset_1, dataset_2 = config.datasets_with_applied_exhaustive_expectations
|
23
|
-
score_records_without_groups(dataset_1, dataset_2)
|
24
|
-
end
|
25
|
-
|
26
|
-
result_set.flush!
|
27
|
-
return result_set
|
28
|
-
end
|
29
|
-
|
30
|
-
private
|
31
|
-
|
32
|
-
def setup_datasets
|
33
|
-
@dataset_1, @dataset_2 = config.datasets_with_applied_simple_expectations
|
34
|
-
|
35
|
-
@dataset_1 = @dataset_1.select(@pk_1)
|
36
|
-
if @config.linkage_type != :self
|
37
|
-
@dataset_2 = @dataset_2.select(@pk_2)
|
38
|
-
end
|
39
|
-
end
|
40
|
-
|
41
|
-
def group_records
|
42
|
-
if config.linkage_type == :self
|
43
|
-
group_records_for(@dataset_1, 1)
|
44
|
-
else
|
45
|
-
group_records_for(@dataset_1, 1, false)
|
46
|
-
group_records_for(@dataset_2, 2, false)
|
47
|
-
combine_groups
|
48
|
-
end
|
49
|
-
end
|
50
|
-
|
51
|
-
# @param [Linkage::Dataset] dataset
|
52
|
-
# @param [Fixnum, nil] dataset_id
|
53
|
-
# @param [Boolean] ignore_empty_groups
|
54
|
-
# @yield [Linkage::Group] If a block is given, yield completed groups to
|
55
|
-
# the block. Otherwise, call ResultSet#add_group on the group.
|
56
|
-
def group_records_for(dataset, dataset_id, ignore_empty_groups = true)
|
57
|
-
group_minimum = ignore_empty_groups ? 2 : 1
|
58
|
-
dataset.each_group(group_minimum) do |group|
|
59
|
-
result_set.add_group(group, dataset_id)
|
60
|
-
end
|
61
|
-
result_set.flush!
|
62
|
-
end
|
63
|
-
|
64
|
-
def combine_groups
|
65
|
-
# Create a new dataset for the groups table
|
66
|
-
groups_dataset = result_set.groups_dataset
|
67
|
-
|
68
|
-
groups_dataset.field_set.values.each do |field|
|
69
|
-
# Sort on all fields
|
70
|
-
if !field.primary_key?
|
71
|
-
meta_object = MetaObject.new(field)
|
72
|
-
groups_dataset = groups_dataset.group_match_more(meta_object)
|
73
|
-
end
|
74
|
-
end
|
75
|
-
|
76
|
-
# Delete non-matching groups
|
77
|
-
sub_dataset = groups_dataset.select(:id).group_by_matches.having(:count.sql_function(:id) => 1)
|
78
|
-
groups_dataset.filter(:id => sub_dataset.obj).delete
|
79
|
-
|
80
|
-
# Delete duplicate groups
|
81
|
-
sub_dataset = groups_dataset.select(:max.sql_function(:id).as(:id)).group_by_matches
|
82
|
-
groups_dataset.filter(:id => sub_dataset.obj).delete
|
83
|
-
end
|
84
|
-
|
85
|
-
def score_records_with_groups
|
86
|
-
result_set.groups_dataset.each do |group_record|
|
87
|
-
group = Group.from_row(group_record)
|
88
|
-
dataset_1, dataset_2 = config.apply_exhaustive_expectations(
|
89
|
-
*result_set.groups_records_datasets(group))
|
90
|
-
score_records_without_groups(dataset_1, dataset_2)
|
91
|
-
end
|
92
|
-
end
|
93
|
-
|
94
|
-
def score_records_without_groups(dataset_1, dataset_2)
|
95
|
-
if config.linkage_type == :self
|
96
|
-
keys = dataset_1.select_map(@pk_1)
|
97
|
-
unfiltered_dataset = dataset_1.unfiltered
|
98
|
-
cache = Hashery::LRUHash.new(config.record_cache_size) do |h, k|
|
99
|
-
h[k] = unfiltered_dataset.filter(@pk_1 => k).first
|
100
|
-
end
|
101
|
-
upper_bound = keys.length - 1
|
102
|
-
|
103
|
-
forward = true
|
104
|
-
keys.each_with_index do |key_1, key_1_index|
|
105
|
-
record_1 = cache[key_1]
|
106
|
-
|
107
|
-
lower_bound = key_1_index + 1
|
108
|
-
enum =
|
109
|
-
if forward
|
110
|
-
lower_bound.upto(upper_bound)
|
111
|
-
else
|
112
|
-
upper_bound.downto(lower_bound)
|
113
|
-
end
|
114
|
-
enum.each do |key_2_index|
|
115
|
-
record_2 = cache[keys[key_2_index]]
|
116
|
-
score(record_1, record_2)
|
117
|
-
end
|
118
|
-
forward = !forward
|
119
|
-
end
|
120
|
-
else
|
121
|
-
keys_2 = dataset_2.select_map(@pk_2)
|
122
|
-
unfiltered_dataset_2 = dataset_2.unfiltered
|
123
|
-
cache_2 = Hashery::LRUHash.new(config.record_cache_size) do |h, k|
|
124
|
-
h[k] = unfiltered_dataset_2.filter(@pk_2 => k).first
|
125
|
-
end
|
126
|
-
keys_2_last = keys_2.length - 1
|
127
|
-
|
128
|
-
forward = true
|
129
|
-
dataset_1.each do |record_1|
|
130
|
-
enum = forward ? 0.upto(keys_2_last) : keys_2_last.downto(0)
|
131
|
-
enum.each do |key_2_index|
|
132
|
-
record_2 = cache_2[keys_2[key_2_index]]
|
133
|
-
score(record_1, record_2)
|
134
|
-
end
|
135
|
-
forward = !forward
|
136
|
-
end
|
137
|
-
end
|
138
|
-
end
|
139
|
-
|
140
|
-
def score(record_1, record_2)
|
141
|
-
pk_1 = record_1[@pk_1]
|
142
|
-
pk_2 = record_2[@pk_2]
|
143
|
-
|
144
|
-
catch(:stop) do
|
145
|
-
total_score = 0
|
146
|
-
config.exhaustive_expectations.each_with_index do |expectation, comparator_id|
|
147
|
-
comparator = expectation.comparator
|
148
|
-
|
149
|
-
score = comparator.score(record_1, record_2)
|
150
|
-
result_set.add_score(comparator_id, pk_1, pk_2, score)
|
151
|
-
|
152
|
-
throw(:stop) unless expectation.satisfied?(score)
|
153
|
-
total_score += score
|
154
|
-
end
|
155
|
-
result_set.add_match(pk_1, pk_2, total_score)
|
156
|
-
end
|
157
|
-
end
|
158
|
-
|
159
|
-
# Only needed for linkages without exhaustive expectations
|
160
|
-
def create_matches
|
161
|
-
result_set.groups_dataset.each do |group_record|
|
162
|
-
group = Group.from_row(group_record)
|
163
|
-
dataset_1, dataset_2 = result_set.groups_records_datasets(group)
|
164
|
-
|
165
|
-
if config.linkage_type == :self
|
166
|
-
keys = dataset_1.select_map(@pk_1)
|
167
|
-
keys_last = keys.length - 1
|
168
|
-
keys.each_with_index do |key_1, key_1_index|
|
169
|
-
(key_1_index + 1).upto(keys_last) do |key_2_index|
|
170
|
-
key_2 = keys[key_2_index]
|
171
|
-
result_set.add_match(key_1, key_2, nil)
|
172
|
-
end
|
173
|
-
end
|
174
|
-
else
|
175
|
-
keys_1 = dataset_1.select_map(@pk_1)
|
176
|
-
keys_2 = dataset_2.select_map(@pk_2)
|
177
|
-
|
178
|
-
keys_1.each do |key_1|
|
179
|
-
keys_2.each do |key_2|
|
180
|
-
result_set.add_match(key_1, key_2, nil)
|
181
|
-
end
|
182
|
-
end
|
183
|
-
end
|
184
|
-
end
|
185
|
-
end
|
186
|
-
end
|
187
|
-
end
|
data/lib/linkage/utils.rb
DELETED
@@ -1,164 +0,0 @@
|
|
1
|
-
module Linkage
|
2
|
-
module Utils
|
3
|
-
# A "tree" used to find compatible types.
|
4
|
-
TYPE_CONVERSION_TREE = {
|
5
|
-
TrueClass => [Integer],
|
6
|
-
Integer => [Bignum, Float],
|
7
|
-
Bignum => [BigDecimal],
|
8
|
-
Float => [BigDecimal],
|
9
|
-
BigDecimal => [String],
|
10
|
-
String => nil,
|
11
|
-
DateTime => nil,
|
12
|
-
Date => nil,
|
13
|
-
Time => nil,
|
14
|
-
File => nil
|
15
|
-
}
|
16
|
-
|
17
|
-
# Create field information for a field that can hold data from two other
|
18
|
-
# fields. If the fields have different types, the resulting type is
|
19
|
-
# determined via a type-conversion tree.
|
20
|
-
#
|
21
|
-
# @param [Array] field_1 Schema information for the first field
|
22
|
-
# @param [Array] field_2 Schema information for the second field
|
23
|
-
# @return [Array] Schema information for the new field
|
24
|
-
def merge_fields(field_1, field_2)
|
25
|
-
schema_1 = column_schema_to_ruby_type(field_1)
|
26
|
-
schema_1.delete_if { |k, v| v.nil? }
|
27
|
-
schema_2 = column_schema_to_ruby_type(field_2)
|
28
|
-
schema_2.delete_if { |k, v| v.nil? }
|
29
|
-
if schema_1 == schema_2
|
30
|
-
result = schema_1
|
31
|
-
else
|
32
|
-
result = schema_1.dup
|
33
|
-
|
34
|
-
# type
|
35
|
-
if schema_1[:type] != schema_2[:type]
|
36
|
-
result[:type] = first_common_type(schema_1[:type], schema_2[:type])
|
37
|
-
end
|
38
|
-
|
39
|
-
# text
|
40
|
-
if schema_1[:text] != schema_2[:text]
|
41
|
-
# This can only be of type String.
|
42
|
-
result[:text] = true
|
43
|
-
result.delete(:size)
|
44
|
-
end
|
45
|
-
|
46
|
-
# size
|
47
|
-
if !result[:text] && schema_1[:size] != schema_2[:size]
|
48
|
-
types = [schema_1[:type], schema_2[:type]].uniq
|
49
|
-
if types.length == 1 && types[0] == BigDecimal
|
50
|
-
# Two decimals
|
51
|
-
if schema_1.has_key?(:size) && schema_2.has_key?(:size)
|
52
|
-
s_1 = schema_1[:size]
|
53
|
-
s_2 = schema_2[:size]
|
54
|
-
result[:size] = [ s_1[0] > s_2[0] ? s_1[0] : s_2[0] ]
|
55
|
-
|
56
|
-
if s_1[1] && s_2[1]
|
57
|
-
result[:size][1] = s_1[1] > s_2[1] ? s_1[1] : s_2[1]
|
58
|
-
else
|
59
|
-
result[:size][1] = s_1[1] ? s_1[1] : s_2[1]
|
60
|
-
end
|
61
|
-
else
|
62
|
-
result[:size] = schema_1.has_key?(:size) ? schema_1[:size] : schema_2[:size]
|
63
|
-
end
|
64
|
-
elsif types.include?(String) && types.include?(BigDecimal)
|
65
|
-
# Add one to the precision of the BigDecimal (for the dot)
|
66
|
-
if schema_1.has_key?(:size) && schema_2.has_key?(:size)
|
67
|
-
s_1 = schema_1[:size].is_a?(Array) ? schema_1[:size][0] + 1 : schema_1[:size]
|
68
|
-
s_2 = schema_2[:size].is_a?(Array) ? schema_2[:size][0] + 1 : schema_2[:size]
|
69
|
-
result[:size] = s_1 > s_2 ? s_1 : s_2
|
70
|
-
elsif schema_1.has_key?(:size)
|
71
|
-
result[:size] = schema_1[:size].is_a?(Array) ? schema_1[:size][0] + 1 : schema_1[:size]
|
72
|
-
elsif schema_2.has_key?(:size)
|
73
|
-
result[:size] = schema_2[:size].is_a?(Array) ? schema_2[:size][0] + 1 : schema_2[:size]
|
74
|
-
end
|
75
|
-
else
|
76
|
-
# Treat as two strings
|
77
|
-
if schema_1.has_key?(:size) && schema_2.has_key?(:size)
|
78
|
-
result[:size] = schema_1[:size] > schema_2[:size] ? schema_1[:size] : schema_2[:size]
|
79
|
-
elsif schema_1.has_key?(:size)
|
80
|
-
result[:size] = schema_1[:size]
|
81
|
-
else
|
82
|
-
result[:size] = schema_2[:size]
|
83
|
-
end
|
84
|
-
end
|
85
|
-
end
|
86
|
-
|
87
|
-
# fixed
|
88
|
-
if schema_1[:fixed] != schema_2[:fixed]
|
89
|
-
# This can only be of type String.
|
90
|
-
result[:fixed] = true
|
91
|
-
end
|
92
|
-
end
|
93
|
-
|
94
|
-
{:type => result.delete(:type), :opts => result}
|
95
|
-
end
|
96
|
-
|
97
|
-
private
|
98
|
-
|
99
|
-
# Convert the column schema information to a hash of column options, one of which must
|
100
|
-
# be :type. The other options added should modify that type (e.g. :size). If a
|
101
|
-
# database type is not recognized, return it as a String type.
|
102
|
-
#
|
103
|
-
# @note This method comes straight from Sequel (lib/sequel/extensions/schema_dumper.rb).
|
104
|
-
def column_schema_to_ruby_type(schema)
|
105
|
-
case t = schema[:db_type].downcase
|
106
|
-
when /\A(?:medium|small)?int(?:eger)?(?:\((?:\d+)\))?(?: unsigned)?\z/o
|
107
|
-
{:type=>Integer}
|
108
|
-
when /\Atinyint(?:\((\d+)\))?\z/o
|
109
|
-
{:type =>schema[:type] == :boolean ? TrueClass : Integer}
|
110
|
-
when /\Abigint(?:\((?:\d+)\))?(?: unsigned)?\z/o
|
111
|
-
{:type=>Bignum}
|
112
|
-
when /\A(?:real|float|double(?: precision)?)\z/o
|
113
|
-
{:type=>Float}
|
114
|
-
when 'boolean'
|
115
|
-
{:type=>TrueClass}
|
116
|
-
when /\A(?:(?:tiny|medium|long|n)?text|clob)\z/o
|
117
|
-
{:type=>String, :text=>true}
|
118
|
-
when 'date'
|
119
|
-
{:type=>Date}
|
120
|
-
when /\A(?:small)?datetime\z/o
|
121
|
-
{:type=>DateTime}
|
122
|
-
when /\Atimestamp(?:\((\d+)\))?(?: with(?:out)? time zone)?\z/o
|
123
|
-
{:type=>DateTime, :size=>($1.to_i if $1)}
|
124
|
-
when /\Atime(?: with(?:out)? time zone)?\z/o
|
125
|
-
{:type=>Time, :only_time=>true}
|
126
|
-
when /\An?char(?:acter)?(?:\((\d+)\))?\z/o
|
127
|
-
{:type=>String, :size=>($1.to_i if $1), :fixed=>true}
|
128
|
-
when /\A(?:n?varchar|character varying|bpchar|string)(?:\((\d+)\))?\z/o
|
129
|
-
{:type=>String, :size=>($1.to_i if $1)}
|
130
|
-
when /\A(?:small)?money\z/o
|
131
|
-
{:type=>BigDecimal, :size=>[19,2]}
|
132
|
-
when /\A(?:decimal|numeric|number)(?:\((\d+)(?:,\s*(\d+))?\))?\z/o
|
133
|
-
s = [($1.to_i if $1), ($2.to_i if $2)].compact
|
134
|
-
{:type=>BigDecimal, :size=>(s.empty? ? nil : s)}
|
135
|
-
when /\A(?:bytea|(?:tiny|medium|long)?blob|(?:var)?binary)(?:\((\d+)\))?\z/o
|
136
|
-
{:type=>File, :size=>($1.to_i if $1)}
|
137
|
-
when 'year'
|
138
|
-
{:type=>Integer}
|
139
|
-
else
|
140
|
-
{:type=>String}
|
141
|
-
end
|
142
|
-
end
|
143
|
-
|
144
|
-
def first_common_type(type_1, type_2)
|
145
|
-
types_1 = [type_1] + get_types(type_1)
|
146
|
-
types_2 = [type_2] + get_types(type_2)
|
147
|
-
(types_1 & types_2).first
|
148
|
-
end
|
149
|
-
|
150
|
-
# Get all types that the specified type can be converted to. Order
|
151
|
-
# matters.
|
152
|
-
def get_types(type)
|
153
|
-
result = []
|
154
|
-
types = TYPE_CONVERSION_TREE[type]
|
155
|
-
if types
|
156
|
-
result += types
|
157
|
-
types.each do |t|
|
158
|
-
result |= get_types(t)
|
159
|
-
end
|
160
|
-
end
|
161
|
-
result
|
162
|
-
end
|
163
|
-
end
|
164
|
-
end
|
data/lib/linkage/warnings.rb
DELETED
data/test/integration/test_collation.rb
DELETED
@@ -1,45 +0,0 @@
|
|
1
|
-
require 'helper'
|
2
|
-
|
3
|
-
module IntegrationTests
|
4
|
-
class TestCollation < Test::Unit::TestCase
|
5
|
-
def setup
|
6
|
-
@tmpdir = Dir.mktmpdir('linkage')
|
7
|
-
@tmpuri = "sqlite://" + File.join(@tmpdir, "foo")
|
8
|
-
end
|
9
|
-
|
10
|
-
def database(options = {}, &block)
|
11
|
-
Sequel.connect(@tmpuri, options, &block)
|
12
|
-
end
|
13
|
-
|
14
|
-
def teardown
|
15
|
-
FileUtils.remove_entry_secure(@tmpdir)
|
16
|
-
end
|
17
|
-
|
18
|
-
test "comparing strings exactly in MySQL" do
|
19
|
-
options = database_options_for('mysql')
|
20
|
-
database_for('mysql') do |db|
|
21
|
-
db.create_table!(:foo) do
|
22
|
-
primary_key :id
|
23
|
-
String :foo
|
24
|
-
String :bar
|
25
|
-
end
|
26
|
-
db[:foo].import([:foo, :bar], [
|
27
|
-
["Foo", "foo"],
|
28
|
-
["bar", "bar "],
|
29
|
-
])
|
30
|
-
end
|
31
|
-
dataset = Linkage::Dataset.new(options, :foo)
|
32
|
-
tmpuri = @tmpuri
|
33
|
-
conf = dataset.link_with(dataset) do
|
34
|
-
(lhs[:foo].must == rhs[:bar]).exactly
|
35
|
-
save_results_in(tmpuri)
|
36
|
-
end
|
37
|
-
runner = Linkage::SingleThreadedRunner.new(conf)
|
38
|
-
runner.execute
|
39
|
-
|
40
|
-
database do |db|
|
41
|
-
assert_equal 0, db[:groups].count
|
42
|
-
end
|
43
|
-
end
|
44
|
-
end
|
45
|
-
end
|