linkage 0.0.8 → 0.1.0.pre
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +1 -0
- data/.yardopts +1 -0
- data/Gemfile +1 -19
- data/Gemfile-java +3 -0
- data/README.markdown +88 -34
- data/Rakefile +16 -15
- data/TODO +4 -0
- data/lib/linkage/comparator.rb +139 -144
- data/lib/linkage/comparators/compare.rb +236 -29
- data/lib/linkage/comparators/strcompare.rb +85 -0
- data/lib/linkage/comparators/within.rb +24 -20
- data/lib/linkage/configuration.rb +44 -466
- data/lib/linkage/dataset.rb +28 -127
- data/lib/linkage/exceptions.rb +5 -0
- data/lib/linkage/field.rb +6 -37
- data/lib/linkage/field_set.rb +3 -3
- data/lib/linkage/match_recorder.rb +22 -0
- data/lib/linkage/match_set.rb +34 -0
- data/lib/linkage/match_sets/csv.rb +39 -0
- data/lib/linkage/match_sets/database.rb +45 -0
- data/lib/linkage/matcher.rb +30 -0
- data/lib/linkage/result_set.rb +25 -110
- data/lib/linkage/result_sets/csv.rb +54 -0
- data/lib/linkage/result_sets/database.rb +42 -0
- data/lib/linkage/runner.rb +57 -16
- data/lib/linkage/score_recorder.rb +30 -0
- data/lib/linkage/score_set.rb +49 -0
- data/lib/linkage/score_sets/csv.rb +64 -0
- data/lib/linkage/score_sets/database.rb +77 -0
- data/lib/linkage/version.rb +1 -1
- data/lib/linkage.rb +14 -17
- data/linkage.gemspec +13 -1
- data/linkage.gemspec-java +32 -0
- data/test/helper.rb +30 -23
- data/test/integration/test_cross_linkage.rb +46 -25
- data/test/integration/test_database_result_set.rb +55 -0
- data/test/integration/test_dual_linkage.rb +19 -94
- data/test/integration/test_self_linkage.rb +100 -203
- data/test/integration/test_within_comparator.rb +24 -77
- data/test/unit/comparators/test_compare.rb +254 -50
- data/test/unit/comparators/test_strcompare.rb +45 -0
- data/test/unit/comparators/test_within.rb +14 -26
- data/test/unit/match_sets/test_csv.rb +78 -0
- data/test/unit/match_sets/test_database.rb +63 -0
- data/test/unit/result_sets/test_csv.rb +111 -0
- data/test/unit/result_sets/test_database.rb +68 -0
- data/test/unit/score_sets/test_csv.rb +151 -0
- data/test/unit/score_sets/test_database.rb +149 -0
- data/test/unit/test_comparator.rb +46 -83
- data/test/unit/test_comparators.rb +4 -0
- data/test/unit/test_configuration.rb +99 -145
- data/test/unit/test_dataset.rb +52 -73
- data/test/unit/test_field.rb +4 -55
- data/test/unit/test_field_set.rb +6 -6
- data/test/unit/test_match_recorder.rb +23 -0
- data/test/unit/test_match_set.rb +23 -0
- data/test/unit/test_match_sets.rb +4 -0
- data/test/unit/test_matcher.rb +44 -0
- data/test/unit/test_result_set.rb +24 -223
- data/test/unit/test_result_sets.rb +4 -0
- data/test/unit/test_runner.rb +122 -17
- data/test/unit/test_runners.rb +4 -0
- data/test/unit/test_score_recorder.rb +25 -0
- data/test/unit/test_score_set.rb +37 -0
- data/test/unit/test_score_sets.rb +4 -0
- metadata +183 -90
- data/Gemfile.lock +0 -92
- data/lib/linkage/comparators/binary.rb +0 -12
- data/lib/linkage/data.rb +0 -175
- data/lib/linkage/decollation.rb +0 -93
- data/lib/linkage/expectation.rb +0 -21
- data/lib/linkage/expectations/exhaustive.rb +0 -63
- data/lib/linkage/expectations/simple.rb +0 -168
- data/lib/linkage/function.rb +0 -148
- data/lib/linkage/functions/binary.rb +0 -30
- data/lib/linkage/functions/cast.rb +0 -54
- data/lib/linkage/functions/length.rb +0 -29
- data/lib/linkage/functions/strftime.rb +0 -33
- data/lib/linkage/functions/trim.rb +0 -30
- data/lib/linkage/group.rb +0 -55
- data/lib/linkage/meta_object.rb +0 -139
- data/lib/linkage/runner/single_threaded.rb +0 -187
- data/lib/linkage/utils.rb +0 -164
- data/lib/linkage/warnings.rb +0 -5
- data/test/integration/test_collation.rb +0 -45
- data/test/integration/test_configuration.rb +0 -268
- data/test/integration/test_dataset.rb +0 -116
- data/test/integration/test_functions.rb +0 -88
- data/test/integration/test_result_set.rb +0 -85
- data/test/integration/test_scoring.rb +0 -84
- data/test/unit/expectations/test_exhaustive.rb +0 -111
- data/test/unit/expectations/test_simple.rb +0 -303
- data/test/unit/functions/test_binary.rb +0 -54
- data/test/unit/functions/test_cast.rb +0 -98
- data/test/unit/functions/test_length.rb +0 -52
- data/test/unit/functions/test_strftime.rb +0 -60
- data/test/unit/functions/test_trim.rb +0 -43
- data/test/unit/runner/test_single_threaded.rb +0 -12
- data/test/unit/test_data.rb +0 -445
- data/test/unit/test_decollation.rb +0 -201
- data/test/unit/test_function.rb +0 -233
- data/test/unit/test_group.rb +0 -38
- data/test/unit/test_meta_object.rb +0 -208
- data/test/unit/test_utils.rb +0 -341
@@ -1,46 +1,253 @@
|
|
1
1
|
module Linkage
|
2
2
|
module Comparators
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
3
|
+
# Compare is the most basic comparator in Linkage, conceptually. It scores
|
4
|
+
# two records based on whether or not field values satisfy the specified
|
5
|
+
# operator. Score is either 0 or 1.
|
6
|
+
#
|
7
|
+
# To use Compare, you must specify two sets of fields to use in the
|
8
|
+
# comparison, along with an operator. Valid operators are:
|
9
|
+
#
|
10
|
+
# * `:equal`
|
11
|
+
# * `:not_equal`
|
12
|
+
# * `:greater_than`
|
13
|
+
# * `:greater_than_or_equal`
|
14
|
+
# * `:less_than`
|
15
|
+
# * `:less_than_or_equal`
|
16
|
+
#
|
17
|
+
# Sets of fields must be of equal length. If you specify more than one
|
18
|
+
# field, each field will be compared to its counterpart in the other set.
|
19
|
+
# All of the field values must meet the conditions in order for the score to
|
20
|
+
# be 1. Otherwise, the score is 0.
|
21
|
+
#
|
22
|
+
# Consider the following example, using a {Configuration} as part of
|
23
|
+
# {Dataset#link_with}:
|
24
|
+
#
|
25
|
+
# ```ruby
|
26
|
+
# config.compare([:foo, :bar], [:baz, :qux], :equal)
|
27
|
+
# ```
|
28
|
+
#
|
29
|
+
# For each pair of records, the values of `foo` and `baz` are compared, and the
|
30
|
+
# values of `bar` and `qux` are compared. If both of these two comparisons
|
31
|
+
# are `true`, then a score of 1 is given. If either comparison is false
|
32
|
+
# (for example, `foo` equals `baz` but `bar` does not equal `qux`), then a
|
33
|
+
# score of 0 is given.
|
34
|
+
#
|
35
|
+
# Algorithms
|
36
|
+
# ----------
|
37
|
+
#
|
38
|
+
# The way records are chosen for comparison depends on which operator you
|
39
|
+
# use. The `:equal` operator is treated differently than the other
|
40
|
+
# operators. When using operators other than `:equal`, each record is
|
41
|
+
# compared to every other record (and {#type} returns `:simple`). When using
|
42
|
+
# `:equal`, {#type} is `:advanced` and a different algorithm is used.
|
43
|
+
#
|
44
|
+
# "Equal" mode uses an algorithm similar to the sorted neighborhood method.
|
45
|
+
# Values are sorted (via database query) and then compared. This way, only
|
46
|
+
# adjacent records are compared. Using the transitive property of equality,
|
47
|
+
# records are grouped together. All pairs of records in the group are scored
|
48
|
+
# as 1. Scores of 0 are not given at all (absence of score means 0).
|
49
|
+
class Compare < Comparator
|
50
|
+
VALID_OPERATIONS = [
|
51
|
+
:not_equal, :greater_than, :greater_than_or_equal,
|
52
|
+
:less_than_or_equal, :less_than, :equal
|
8
53
|
]
|
9
|
-
def self.parameters
|
10
|
-
@@parameters
|
11
|
-
end
|
12
54
|
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
55
|
+
def initialize(set_1, set_2, operation)
|
56
|
+
if set_1.length != set_2.length
|
57
|
+
raise "sets must be of equal length"
|
58
|
+
end
|
59
|
+
|
60
|
+
# Check value data types
|
61
|
+
set_1.each_with_index do |value_1, index|
|
62
|
+
value_2 = set_2[index]
|
63
|
+
if value_1.ruby_type != value_2.ruby_type
|
64
|
+
raise "values at index #{index} had different types"
|
65
|
+
end
|
66
|
+
end
|
17
67
|
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
@
|
68
|
+
# Check compare operator
|
69
|
+
if !VALID_OPERATIONS.include?(operation)
|
70
|
+
raise "operation is not valid"
|
71
|
+
end
|
72
|
+
@type = operation == :equal ? :advanced : :simple
|
73
|
+
@names_1 = set_1.collect(&:name)
|
74
|
+
@names_2 = set_2.collect(&:name)
|
75
|
+
@operation = operation
|
23
76
|
end
|
24
77
|
|
25
78
|
def score(record_1, record_2)
|
79
|
+
values_1 = record_1.values_at(*@names_1)
|
80
|
+
values_2 = record_2.values_at(*@names_2)
|
26
81
|
result =
|
27
|
-
case @
|
28
|
-
when
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
when
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
when
|
37
|
-
|
82
|
+
case @operation
|
83
|
+
when :not_equal
|
84
|
+
values_1.each_with_index.all? do |value_1, i|
|
85
|
+
value_1 != values_2[i]
|
86
|
+
end
|
87
|
+
when :greater_than
|
88
|
+
values_1.each_with_index.all? do |value_1, i|
|
89
|
+
value_1 > values_2[i]
|
90
|
+
end
|
91
|
+
when :greater_than_or_equal
|
92
|
+
values_1.each_with_index.all? do |value_1, i|
|
93
|
+
value_1 >= values_2[i]
|
94
|
+
end
|
95
|
+
when :less_than_or_equal
|
96
|
+
values_1.each_with_index.all? do |value_1, i|
|
97
|
+
value_1 <= values_2[i]
|
98
|
+
end
|
99
|
+
when :less_than
|
100
|
+
values_1.each_with_index.all? do |value_1, i|
|
101
|
+
value_1 < values_2[i]
|
102
|
+
end
|
38
103
|
end
|
39
104
|
|
40
105
|
result ? 1 : 0
|
41
106
|
end
|
107
|
+
|
108
|
+
def score_datasets(dataset_1, dataset_2)
|
109
|
+
# FIXME: nil value equality
|
110
|
+
|
111
|
+
_score_datasets(dataset_1, dataset_2)
|
112
|
+
end
|
113
|
+
|
114
|
+
def score_dataset(dataset)
|
115
|
+
# FIXME: nil value equality
|
116
|
+
|
117
|
+
if @names_1 != @names_2
|
118
|
+
return _score_datasets(dataset, dataset)
|
119
|
+
end
|
120
|
+
|
121
|
+
enum = dataset.order(*@names_1).to_enum
|
122
|
+
begin
|
123
|
+
record = enum.next
|
124
|
+
rescue StopIteration
|
125
|
+
return
|
126
|
+
end
|
127
|
+
group = [record]
|
128
|
+
last_value = record.values_at(*@names_1)
|
129
|
+
loop do
|
130
|
+
begin
|
131
|
+
record = enum.next
|
132
|
+
rescue StopIteration
|
133
|
+
break
|
134
|
+
end
|
135
|
+
value = record.values_at(*@names_1)
|
136
|
+
if value == last_value
|
137
|
+
group << record
|
138
|
+
else
|
139
|
+
score_group(group)
|
140
|
+
group.clear
|
141
|
+
group << record
|
142
|
+
last_value = value
|
143
|
+
end
|
144
|
+
end
|
145
|
+
score_group(group)
|
146
|
+
end
|
147
|
+
|
148
|
+
private
|
149
|
+
|
150
|
+
def _score_datasets(dataset_1, dataset_2)
|
151
|
+
enum_1 = dataset_1.order(*@names_1).to_enum
|
152
|
+
enum_2 = dataset_2.order(*@names_2).to_enum
|
153
|
+
|
154
|
+
begin
|
155
|
+
record_1 = enum_1.next
|
156
|
+
record_2 = enum_2.next
|
157
|
+
rescue StopIteration
|
158
|
+
# no pairs to score
|
159
|
+
return
|
160
|
+
end
|
161
|
+
group_1 = []
|
162
|
+
group_2 = []
|
163
|
+
loop do
|
164
|
+
value_1 = record_1.values_at(*@names_1)
|
165
|
+
value_2 = record_2.values_at(*@names_2)
|
166
|
+
result = value_1 <=> value_2
|
167
|
+
if result == 0
|
168
|
+
last_value = value_1
|
169
|
+
group_1 << record_1
|
170
|
+
group_2 << record_2
|
171
|
+
|
172
|
+
state = :right
|
173
|
+
loop do
|
174
|
+
begin
|
175
|
+
case state
|
176
|
+
when :left
|
177
|
+
record_1 = enum_1.next
|
178
|
+
value_1 = record_1.values_at(*@names_1)
|
179
|
+
result = last_value == value_1
|
180
|
+
when :right
|
181
|
+
record_2 = enum_2.next
|
182
|
+
value_2 = record_2.values_at(*@names_2)
|
183
|
+
result = last_value == value_2
|
184
|
+
end
|
185
|
+
rescue StopIteration
|
186
|
+
result = false
|
187
|
+
case state
|
188
|
+
when :left
|
189
|
+
record_1 = :eof
|
190
|
+
when :right
|
191
|
+
record_2 = :eof
|
192
|
+
end
|
193
|
+
end
|
194
|
+
|
195
|
+
if result
|
196
|
+
case state
|
197
|
+
when :left
|
198
|
+
group_1 << record_1
|
199
|
+
when :right
|
200
|
+
group_2 << record_2
|
201
|
+
end
|
202
|
+
else
|
203
|
+
case state
|
204
|
+
when :left
|
205
|
+
# done with this group
|
206
|
+
score_groups(group_1, group_2)
|
207
|
+
group_1.clear
|
208
|
+
group_2.clear
|
209
|
+
break
|
210
|
+
when :right
|
211
|
+
state = :left
|
212
|
+
end
|
213
|
+
end
|
214
|
+
end
|
215
|
+
if record_1 == :eof || record_2 == :eof
|
216
|
+
break
|
217
|
+
end
|
218
|
+
else
|
219
|
+
begin
|
220
|
+
if result < 0
|
221
|
+
record_1 = enum_1.next
|
222
|
+
else
|
223
|
+
record_2 = enum_2.next
|
224
|
+
end
|
225
|
+
rescue StopIteration
|
226
|
+
break
|
227
|
+
end
|
228
|
+
end
|
229
|
+
end
|
230
|
+
end
|
231
|
+
|
232
|
+
def score_groups(group_1, group_2)
|
233
|
+
group_1.each do |record_1|
|
234
|
+
group_2.each do |record_2|
|
235
|
+
changed
|
236
|
+
notify_observers(self, record_1, record_2, 1)
|
237
|
+
end
|
238
|
+
end
|
239
|
+
end
|
240
|
+
|
241
|
+
def score_group(group)
|
242
|
+
(group.length - 1).times do |i|
|
243
|
+
((i+1)...group.length).each do |j|
|
244
|
+
changed
|
245
|
+
notify_observers(self, group[i], group[j], 1)
|
246
|
+
end
|
247
|
+
end
|
248
|
+
end
|
42
249
|
end
|
43
250
|
|
44
|
-
Comparator.register(Compare)
|
251
|
+
Comparator.register('compare', Compare)
|
45
252
|
end
|
46
253
|
end
|
@@ -0,0 +1,85 @@
|
|
1
|
+
module Linkage
|
2
|
+
module Comparators
|
3
|
+
# Strcompare is a string comparison comparator. It uses the specified
|
4
|
+
# operation to compare string-type fields. Score ranges from 0 to 1.
|
5
|
+
#
|
6
|
+
# To use Strcompare, you must specify one field for each record to use in
|
7
|
+
# the comparison, along with an operator. Valid operators are:
|
8
|
+
#
|
9
|
+
# * `:jarowinkler` ([Jaro-Winkler distance](http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance))
|
10
|
+
#
|
11
|
+
# Consider the following example, using a {Configuration} as part of
|
12
|
+
# {Dataset#link_with}:
|
13
|
+
#
|
14
|
+
# ```ruby
|
15
|
+
# config.strcompare(:foo, :bar, :jarowinkler)
|
16
|
+
# ```
|
17
|
+
#
|
18
|
+
# For each pair of records, the values of the `foo` and `bar` fields are compared
|
19
|
+
# using the Jaro-Winkler distance algorithm.
|
20
|
+
class Strcompare < Comparator
|
21
|
+
VALID_OPERATIONS = [:jarowinkler]
|
22
|
+
|
23
|
+
def initialize(field_1, field_2, operation)
|
24
|
+
if field_1.ruby_type[:type] != String || field_2.ruby_type[:type] != String
|
25
|
+
raise "fields must be string types"
|
26
|
+
end
|
27
|
+
if !VALID_OPERATIONS.include?(operation)
|
28
|
+
raise "#{operation.inspect} is not a valid operation"
|
29
|
+
end
|
30
|
+
|
31
|
+
@name_1 = field_1.name
|
32
|
+
@name_2 = field_2.name
|
33
|
+
@operation = operation
|
34
|
+
end
|
35
|
+
|
36
|
+
def score(record_1, record_2)
|
37
|
+
result =
|
38
|
+
case @operation
|
39
|
+
when :jarowinkler
|
40
|
+
jarowinkler(record_1[@name_1], record_2[@name_2])
|
41
|
+
end
|
42
|
+
|
43
|
+
result
|
44
|
+
end
|
45
|
+
|
46
|
+
def jarowinkler(w1, w2)
|
47
|
+
a = w1.downcase
|
48
|
+
b = w2.downcase
|
49
|
+
aa = a.split('')
|
50
|
+
ba = b.split('')
|
51
|
+
al = a.length
|
52
|
+
bl = b.length
|
53
|
+
l = 0
|
54
|
+
for i in Range.new(0, [[al, bl].min, 4].min-1)
|
55
|
+
break if aa[i] != ba[i]
|
56
|
+
l += 1
|
57
|
+
end
|
58
|
+
aj = aa - (aa - ba)
|
59
|
+
bj = ba - (ba - aa)
|
60
|
+
nm = 0
|
61
|
+
nt = 0
|
62
|
+
md = [[al, bl].max/2 - 1, 0].max
|
63
|
+
for i in Range.new(0, al-1)
|
64
|
+
bi = ba.index(aa[i])
|
65
|
+
aji = aj.index(aa[i])
|
66
|
+
bji = bj.index(aa[i])
|
67
|
+
if !bi.nil? && (bi + nm - i).abs <= md
|
68
|
+
nm += 1
|
69
|
+
nt += 1 if !bji.nil? && aji != bji
|
70
|
+
end
|
71
|
+
ba.delete_at(bi) if !bi.nil?
|
72
|
+
aj.delete_at(aji) if !aji.nil?
|
73
|
+
bj.delete_at(bji) if !bji.nil?
|
74
|
+
end
|
75
|
+
return 0 if nm == 0
|
76
|
+
d = (nm/al.to_f + nm/bl.to_f + (nm-nt)/nm.to_f)/3.0
|
77
|
+
w = (d + l * 0.1 * (1 - d)).round(3)
|
78
|
+
w
|
79
|
+
end
|
80
|
+
end
|
81
|
+
|
82
|
+
Comparator.register('strcompare', Strcompare)
|
83
|
+
end
|
84
|
+
end
|
85
|
+
|
@@ -1,25 +1,29 @@
|
|
1
1
|
module Linkage
|
2
2
|
module Comparators
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
3
|
+
# Within is an integer comparator. It checks if two values are within a
|
4
|
+
# specified range. Score is either 0 or 1.
|
5
|
+
#
|
6
|
+
# To use Within, you must specify one field for each record to use in
|
7
|
+
# the comparison, along with a range value.
|
8
|
+
#
|
9
|
+
# Consider the following example, using a {Configuration} as part of
|
10
|
+
# {Dataset#link_with}:
|
11
|
+
#
|
12
|
+
# ```ruby
|
13
|
+
# config.within(:foo, :bar, 5)
|
14
|
+
# ```
|
15
|
+
#
|
16
|
+
# For each pair of records, if the value of `foo` is within 5 (inclusive) of
|
17
|
+
# the value of `bar`, the score is 1. Otherwise, the score is 0.
|
18
|
+
class Within < Comparator
|
19
|
+
def initialize(field_1, field_2, value)
|
20
|
+
if field_1.ruby_type != field_2.ruby_type
|
21
|
+
raise "fields must have the same type"
|
22
|
+
end
|
17
23
|
|
18
|
-
|
19
|
-
|
20
|
-
@
|
21
|
-
@value = @args[1].object
|
22
|
-
@name_2 = @args[2].name
|
24
|
+
@name_1 = field_1.name
|
25
|
+
@name_2 = field_2.name
|
26
|
+
@value = value
|
23
27
|
end
|
24
28
|
|
25
29
|
def score(record_1, record_2)
|
@@ -27,6 +31,6 @@ module Linkage
|
|
27
31
|
end
|
28
32
|
end
|
29
33
|
|
30
|
-
Comparator.register(Within)
|
34
|
+
Comparator.register('within', Within)
|
31
35
|
end
|
32
36
|
end
|