linkage 0.0.8 → 0.1.0.pre
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +1 -0
- data/.yardopts +1 -0
- data/Gemfile +1 -19
- data/Gemfile-java +3 -0
- data/README.markdown +88 -34
- data/Rakefile +16 -15
- data/TODO +4 -0
- data/lib/linkage/comparator.rb +139 -144
- data/lib/linkage/comparators/compare.rb +236 -29
- data/lib/linkage/comparators/strcompare.rb +85 -0
- data/lib/linkage/comparators/within.rb +24 -20
- data/lib/linkage/configuration.rb +44 -466
- data/lib/linkage/dataset.rb +28 -127
- data/lib/linkage/exceptions.rb +5 -0
- data/lib/linkage/field.rb +6 -37
- data/lib/linkage/field_set.rb +3 -3
- data/lib/linkage/match_recorder.rb +22 -0
- data/lib/linkage/match_set.rb +34 -0
- data/lib/linkage/match_sets/csv.rb +39 -0
- data/lib/linkage/match_sets/database.rb +45 -0
- data/lib/linkage/matcher.rb +30 -0
- data/lib/linkage/result_set.rb +25 -110
- data/lib/linkage/result_sets/csv.rb +54 -0
- data/lib/linkage/result_sets/database.rb +42 -0
- data/lib/linkage/runner.rb +57 -16
- data/lib/linkage/score_recorder.rb +30 -0
- data/lib/linkage/score_set.rb +49 -0
- data/lib/linkage/score_sets/csv.rb +64 -0
- data/lib/linkage/score_sets/database.rb +77 -0
- data/lib/linkage/version.rb +1 -1
- data/lib/linkage.rb +14 -17
- data/linkage.gemspec +13 -1
- data/linkage.gemspec-java +32 -0
- data/test/helper.rb +30 -23
- data/test/integration/test_cross_linkage.rb +46 -25
- data/test/integration/test_database_result_set.rb +55 -0
- data/test/integration/test_dual_linkage.rb +19 -94
- data/test/integration/test_self_linkage.rb +100 -203
- data/test/integration/test_within_comparator.rb +24 -77
- data/test/unit/comparators/test_compare.rb +254 -50
- data/test/unit/comparators/test_strcompare.rb +45 -0
- data/test/unit/comparators/test_within.rb +14 -26
- data/test/unit/match_sets/test_csv.rb +78 -0
- data/test/unit/match_sets/test_database.rb +63 -0
- data/test/unit/result_sets/test_csv.rb +111 -0
- data/test/unit/result_sets/test_database.rb +68 -0
- data/test/unit/score_sets/test_csv.rb +151 -0
- data/test/unit/score_sets/test_database.rb +149 -0
- data/test/unit/test_comparator.rb +46 -83
- data/test/unit/test_comparators.rb +4 -0
- data/test/unit/test_configuration.rb +99 -145
- data/test/unit/test_dataset.rb +52 -73
- data/test/unit/test_field.rb +4 -55
- data/test/unit/test_field_set.rb +6 -6
- data/test/unit/test_match_recorder.rb +23 -0
- data/test/unit/test_match_set.rb +23 -0
- data/test/unit/test_match_sets.rb +4 -0
- data/test/unit/test_matcher.rb +44 -0
- data/test/unit/test_result_set.rb +24 -223
- data/test/unit/test_result_sets.rb +4 -0
- data/test/unit/test_runner.rb +122 -17
- data/test/unit/test_runners.rb +4 -0
- data/test/unit/test_score_recorder.rb +25 -0
- data/test/unit/test_score_set.rb +37 -0
- data/test/unit/test_score_sets.rb +4 -0
- metadata +183 -90
- data/Gemfile.lock +0 -92
- data/lib/linkage/comparators/binary.rb +0 -12
- data/lib/linkage/data.rb +0 -175
- data/lib/linkage/decollation.rb +0 -93
- data/lib/linkage/expectation.rb +0 -21
- data/lib/linkage/expectations/exhaustive.rb +0 -63
- data/lib/linkage/expectations/simple.rb +0 -168
- data/lib/linkage/function.rb +0 -148
- data/lib/linkage/functions/binary.rb +0 -30
- data/lib/linkage/functions/cast.rb +0 -54
- data/lib/linkage/functions/length.rb +0 -29
- data/lib/linkage/functions/strftime.rb +0 -33
- data/lib/linkage/functions/trim.rb +0 -30
- data/lib/linkage/group.rb +0 -55
- data/lib/linkage/meta_object.rb +0 -139
- data/lib/linkage/runner/single_threaded.rb +0 -187
- data/lib/linkage/utils.rb +0 -164
- data/lib/linkage/warnings.rb +0 -5
- data/test/integration/test_collation.rb +0 -45
- data/test/integration/test_configuration.rb +0 -268
- data/test/integration/test_dataset.rb +0 -116
- data/test/integration/test_functions.rb +0 -88
- data/test/integration/test_result_set.rb +0 -85
- data/test/integration/test_scoring.rb +0 -84
- data/test/unit/expectations/test_exhaustive.rb +0 -111
- data/test/unit/expectations/test_simple.rb +0 -303
- data/test/unit/functions/test_binary.rb +0 -54
- data/test/unit/functions/test_cast.rb +0 -98
- data/test/unit/functions/test_length.rb +0 -52
- data/test/unit/functions/test_strftime.rb +0 -60
- data/test/unit/functions/test_trim.rb +0 -43
- data/test/unit/runner/test_single_threaded.rb +0 -12
- data/test/unit/test_data.rb +0 -445
- data/test/unit/test_decollation.rb +0 -201
- data/test/unit/test_function.rb +0 -233
- data/test/unit/test_group.rb +0 -38
- data/test/unit/test_meta_object.rb +0 -208
- data/test/unit/test_utils.rb +0 -341
@@ -1,46 +1,253 @@
|
|
1
1
|
module Linkage
|
2
2
|
module Comparators
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
3
|
+
# Compare is the most basic comparator in Linkage, conceptually. It scores
|
4
|
+
# two records based on whether or not field values satisfy the specified
|
5
|
+
# operator. Score is either 0 or 1.
|
6
|
+
#
|
7
|
+
# To use Compare, you must specify two sets of fields to use in the
|
8
|
+
# comparison, along with an operator. Valid operators are:
|
9
|
+
#
|
10
|
+
# * `:equal`
|
11
|
+
# * `:not_equal`
|
12
|
+
# * `:greater_than`
|
13
|
+
# * `:greater_than_or_equal`
|
14
|
+
# * `:less_than`
|
15
|
+
# * `:less_than_or_equal`
|
16
|
+
#
|
17
|
+
# Sets of fields must be of equal length. If you specify more than one
|
18
|
+
# field, each field will be compared to its counterpart in the other set.
|
19
|
+
# All of the field values must meet the conditions in order for the score to
|
20
|
+
# be 1. Otherwise, the score is 0.
|
21
|
+
#
|
22
|
+
# Consider the following example, using a {Configuration} as part of
|
23
|
+
# {Dataset#link_with}:
|
24
|
+
#
|
25
|
+
# ```ruby
|
26
|
+
# config.compare([:foo, :bar], [:baz, :qux], :equal)
|
27
|
+
# ```
|
28
|
+
#
|
29
|
+
# For each pair of records, the values of `foo` and `baz` are compared, and the
|
30
|
+
# values of `bar` and `qux` are compared. If both of these two comparisons
|
31
|
+
# are `true`, then the score of 1 is given. If `foo` and `baz` are equal but
|
32
|
+
# `bar` and `qux` are not equal, or if both comparisons are false, then a
|
33
|
+
# score of 0 is given.
|
34
|
+
#
|
35
|
+
# Algorithms
|
36
|
+
# ----------
|
37
|
+
#
|
38
|
+
# The way records are chosen for comparison depends on which operator you
|
39
|
+
# use. The `:equal` operator is treated differently than the other
|
40
|
+
# operators. When using operators other than `:equal`, each record is
|
41
|
+
# compared to every other record (and {#type} returns `:simple`). When using
|
42
|
+
# `:equal`, {#type} is `:advanced` and a different algorithm is used.
|
43
|
+
#
|
44
|
+
# "Equal" mode uses an algorithm similar to the sorted neighborhood method.
|
45
|
+
# Values are sorted (via database query) and then compared. This way, only
|
46
|
+
# adjacent records are compared. Using the transitive property of equality,
|
47
|
+
# records are grouped together. All pairs of records in the group are scored
|
48
|
+
# as 1. Scores of 0 are not given at all (absence of score means 0).
|
49
|
+
class Compare < Comparator
|
50
|
+
# Operators accepted by Compare#initialize. Frozen so the shared list cannot
# be modified at runtime.
VALID_OPERATIONS = [
  :not_equal, :greater_than, :greater_than_or_equal,
  :less_than_or_equal, :less_than, :equal
].freeze
|
9
|
-
def self.parameters
|
10
|
-
@@parameters
|
11
|
-
end
|
12
54
|
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
55
|
+
# Builds a Compare comparator for two parallel sets of fields.
#
# @param set_1 fields read from the first record; each element must respond
#   to `ruby_type` and `name`
# @param set_2 fields read from the second record, compared position by
#   position with `set_1`
# @param operation [Symbol] one of VALID_OPERATIONS
# @raise [RuntimeError] if the sets differ in length, if two fields at the
#   same position report different `ruby_type` values, or if the operation
#   is not listed in VALID_OPERATIONS
def initialize(set_1, set_2, operation)
  raise "sets must be of equal length" unless set_1.length == set_2.length

  # Fields at the same position must report the same type.
  set_1.each_with_index do |field_1, position|
    field_2 = set_2[position]
    next if field_1.ruby_type == field_2.ruby_type

    raise "values at index #{position} had different types"
  end

  raise "operation is not valid" unless VALID_OPERATIONS.include?(operation)

  # :equal is the only operation scored in :advanced mode.
  @type = (operation == :equal) ? :advanced : :simple
  @names_1 = set_1.collect { |field| field.name }
  @names_2 = set_2.collect { |field| field.name }
  @operation = operation
end
|
24
77
|
|
25
78
|
# Scores a single pair of records with the configured operator.
#
# The values named by `@names_1` are read from `record_1` and the values
# named by `@names_2` from `record_2`; values are compared position by
# position. `:equal` is handled here as well, so calling `score` directly on
# an equality comparator no longer reports 0 for matching records.
#
# @param record_1 record responding to `values_at`
# @param record_2 record responding to `values_at`
# @return [Integer] 1 when every pair of values satisfies the operator,
#   otherwise 0 (also 0 when the operator is not recognised)
def score(record_1, record_2)
  values_1 = record_1.values_at(*@names_1)
  values_2 = record_2.values_at(*@names_2)

  # Translate the configured operation into the Ruby comparison method.
  operator =
    case @operation
    when :equal                 then :==
    when :not_equal             then :!=
    when :greater_than          then :>
    when :greater_than_or_equal then :>=
    when :less_than_or_equal    then :<=
    when :less_than             then :<
    end
  return 0 if operator.nil?

  satisfied = values_1.each_with_index.all? do |value_1, i|
    value_1.public_send(operator, values_2[i])
  end

  satisfied ? 1 : 0
end
|
107
|
+
|
108
|
+
# Scores the records of two datasets against each other by delegating to
# the private `_score_datasets` routine.
#
# @param dataset_1 dataset supplying the `@names_1` values
# @param dataset_2 dataset supplying the `@names_2` values
def score_datasets(dataset_1, dataset_2)
  # FIXME: nil value equality
  _score_datasets(dataset_1, dataset_2)
end
|
113
|
+
|
114
|
+
# Scores the records of one dataset against each other.
#
# When the two field-name lists differ, the dataset is scored against itself
# through `_score_datasets`. Otherwise the records are read in the order
# produced by `dataset.order(*@names_1)`, consecutive records with identical
# values are collected into a group, and each finished group is handed to
# `score_group`.
#
# @param dataset dataset responding to `order`
def score_dataset(dataset)
  # FIXME: nil value equality

  return _score_datasets(dataset, dataset) unless @names_1 == @names_2

  records = dataset.order(*@names_1).to_enum
  first_record =
    begin
      records.next
    rescue StopIteration
      # empty dataset: nothing to score
      return
    end

  current_group = [first_record]
  current_key = first_record.values_at(*@names_1)

  # Kernel#loop ends quietly once `records.next` raises StopIteration.
  loop do
    candidate = records.next
    key = candidate.values_at(*@names_1)

    unless key == current_key
      # The run of equal values is over; score it and start a new one.
      score_group(current_group)
      current_group.clear
      current_key = key
    end
    current_group << candidate
  end

  # Score whatever group was still open when the records ran out.
  score_group(current_group)
end
|
147
|
+
|
148
|
+
private
|
149
|
+
|
150
|
+
# Walks two ordered datasets in parallel and reports every cross-dataset
# pair of records whose compared values are equal.
#
# Both datasets are ordered by their respective field names and read one
# record at a time. While the current values differ, the side with the
# smaller value (per Ruby's `<=>`) is advanced. When they are equal, all
# following records carrying that same value are gathered from the second
# dataset and then from the first, and the two groups are passed to
# `score_groups`.
#
# NOTE(review): this relies on the order returned by `dataset.order` agreeing
# with Ruby's `<=>` on the value arrays, and `<=>` returns nil for values it
# cannot compare (e.g. nil vs. a string) -- see the FIXME in the callers.
#
# @param dataset_1 dataset supplying the `@names_1` values
# @param dataset_2 dataset supplying the `@names_2` values
# @return [nil]
def _score_datasets(dataset_1, dataset_2)
  left = dataset_1.order(*@names_1).to_enum
  right = dataset_2.order(*@names_2).to_enum

  begin
    record_1 = left.next
    record_2 = right.next
  rescue StopIteration
    # no pairs to score
    return
  end

  group_1 = []
  group_2 = []
  loop do
    key_1 = record_1.values_at(*@names_1)
    key_2 = record_2.values_at(*@names_2)
    ordering = key_1 <=> key_2

    unless ordering == 0
      # Values differ: step the side that is behind, stop when it runs out.
      begin
        if ordering < 0
          record_1 = left.next
        else
          record_2 = right.next
        end
      rescue StopIteration
        break
      end
      next
    end

    shared_key = key_1
    group_1 << record_1
    group_2 << record_2

    # Collect the rest of the matching run: second dataset first, then the
    # first dataset. An exhausted side is marked with :eof.
    side = :right
    loop do
      matched =
        begin
          if side == :left
            record_1 = left.next
            shared_key == record_1.values_at(*@names_1)
          else
            record_2 = right.next
            shared_key == record_2.values_at(*@names_2)
          end
        rescue StopIteration
          if side == :left
            record_1 = :eof
          else
            record_2 = :eof
          end
          false
        end

      if matched
        if side == :left
          group_1 << record_1
        else
          group_2 << record_2
        end
      elsif side == :right
        # second dataset's run is complete; continue with the first
        side = :left
      else
        # done with this group
        score_groups(group_1, group_2)
        group_1.clear
        group_2.clear
        break
      end
    end

    break if record_1 == :eof || record_2 == :eof
  end
end
|
231
|
+
|
232
|
+
# Notifies observers with a score of 1 for every combination of one record
# from `group_1` and one record from `group_2`.
#
# @param group_1 [Array] records from the first dataset
# @param group_2 [Array] records from the second dataset
# @return [Array] `group_1`
def score_groups(group_1, group_2)
  # Array#product with a block yields each pair and returns the receiver.
  group_1.product(group_2) do |record_1, record_2|
    changed
    notify_observers(self, record_1, record_2, 1)
  end
end
|
240
|
+
|
241
|
+
# Notifies observers with a score of 1 for every unordered pair of distinct
# records in `group`; each pair is reported once, earlier record first.
#
# @param group [Array] records that share the same compared values
# @return [Integer] `group.length - 1`
def score_group(group)
  size = group.length
  (size - 1).times do |first|
    (first + 1).upto(size - 1) do |second|
      changed
      notify_observers(self, group[first], group[second], 1)
    end
  end
end
|
42
249
|
end
|
43
250
|
|
44
|
-
Comparator.register(Compare)
|
251
|
+
Comparator.register('compare', Compare)
|
45
252
|
end
|
46
253
|
end
|
@@ -0,0 +1,85 @@
|
|
1
|
+
module Linkage
|
2
|
+
module Comparators
|
3
|
+
# Strcompare is a string comparison comparator. It uses the specified
|
4
|
+
# operation to compare string-type fields. Score ranges from 0 to 1.
|
5
|
+
#
|
6
|
+
# To use Strcompare, you must specify one field for each record to use in
|
7
|
+
# the comparison, along with an operator. Valid operators are:
|
8
|
+
#
|
9
|
+
# * `:jarowinkler` ([Jaro-Winkler distance](http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance))
|
10
|
+
#
|
11
|
+
# Consider the following example, using a {Configuration} as part of
|
12
|
+
# {Dataset#link_with}:
|
13
|
+
#
|
14
|
+
# ```ruby
|
15
|
+
# config.strcompare(:foo, :bar, :jarowinkler)
|
16
|
+
# ```
|
17
|
+
#
|
18
|
+
# For each pair of records, the values of the `foo` and `bar` fields are compared
|
19
|
+
# using the Jaro-Winkler distance algorithm.
|
20
|
+
class Strcompare < Comparator
|
21
|
+
# Operators accepted by Strcompare#initialize. Frozen so the shared list
# cannot be modified at runtime.
VALID_OPERATIONS = [:jarowinkler].freeze
|
22
|
+
|
23
|
+
# Builds a Strcompare comparator for one field of each record.
#
# @param field_1 field read from the first record; must respond to
#   `ruby_type` and `name`
# @param field_2 field read from the second record
# @param operation [Symbol] one of VALID_OPERATIONS
# @raise [RuntimeError] if either field's `ruby_type[:type]` is not String,
#   or if the operation is not listed in VALID_OPERATIONS
def initialize(field_1, field_2, operation)
  both_strings = [field_1, field_2].all? do |field|
    field.ruby_type[:type] == String
  end
  raise "fields must be string types" unless both_strings

  unless VALID_OPERATIONS.include?(operation)
    raise "#{operation.inspect} is not a valid operation"
  end

  @operation = operation
  @name_1 = field_1.name
  @name_2 = field_2.name
end
|
35
|
+
|
36
|
+
# Scores a pair of records by comparing the configured field of each with
# the configured string operation.
#
# @param record_1 record indexed by `@name_1`
# @param record_2 record indexed by `@name_2`
# @return the value returned by `jarowinkler` for the `:jarowinkler`
#   operation; nil for any other operation
def score(record_1, record_2)
  return nil unless @operation == :jarowinkler

  jarowinkler(record_1[@name_1], record_2[@name_2])
end
|
45
|
+
|
46
|
+
# Computes a case-insensitive Jaro-Winkler style similarity of two strings.
#
# A missing value (nil) on either side is scored as 0 instead of raising, so
# that records with empty fields can still be compared.
#
# @param w1 [String, nil] first value
# @param w2 [String, nil] second value
# @return [Integer, Float] 0 when either value is nil or no characters match;
#   otherwise the similarity rounded to three decimal places
def jarowinkler(w1, w2)
  return 0 if w1.nil? || w2.nil?

  chars_1 = w1.downcase.chars
  chars_2 = w2.downcase.chars
  len_1 = chars_1.length
  len_2 = chars_2.length

  # Length of the common prefix, capped at four characters.
  prefix = 0
  [len_1, len_2, 4].min.times do |i|
    break if chars_1[i] != chars_2[i]
    prefix += 1
  end

  # Characters of each word that also occur in the other word, in their
  # original order; used below to detect transpositions.
  common_1 = chars_1 - (chars_1 - chars_2)
  common_2 = chars_2 - (chars_2 - chars_1)

  matches = 0
  transpositions = 0
  max_distance = [[len_1, len_2].max / 2 - 1, 0].max

  len_1.times do |i|
    char = chars_1[i]
    pos_2 = chars_2.index(char)
    common_pos_1 = common_1.index(char)
    common_pos_2 = common_2.index(char)

    # chars_2 shrinks as characters are consumed, so `matches` is added back
    # to approximate the character's position in the original word.
    if !pos_2.nil? && (pos_2 + matches - i).abs <= max_distance
      matches += 1
      transpositions += 1 if !common_pos_2.nil? && common_pos_1 != common_pos_2
    end

    # Consume the character so that it cannot be matched a second time.
    chars_2.delete_at(pos_2) unless pos_2.nil?
    common_1.delete_at(common_pos_1) unless common_pos_1.nil?
    common_2.delete_at(common_pos_2) unless common_pos_2.nil?
  end

  return 0 if matches == 0

  jaro = (matches / len_1.to_f + matches / len_2.to_f +
          (matches - transpositions) / matches.to_f) / 3.0

  # Winkler adjustment: boost the score by 0.1 per shared prefix character.
  (jaro + prefix * 0.1 * (1 - jaro)).round(3)
end
|
80
|
+
end
|
81
|
+
|
82
|
+
Comparator.register('strcompare', Strcompare)
|
83
|
+
end
|
84
|
+
end
|
85
|
+
|
@@ -1,25 +1,29 @@
|
|
1
1
|
module Linkage
|
2
2
|
module Comparators
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
3
|
+
# Within is an integer comparator. It checks if two values are within a
|
4
|
+
# specified range. Score is either 0 or 1.
|
5
|
+
#
|
6
|
+
# To use Within, you must specify one field for each record to use in
|
7
|
+
# the comparison, along with a range value.
|
8
|
+
#
|
9
|
+
# Consider the following example, using a {Configuration} as part of
|
10
|
+
# {Dataset#link_with}:
|
11
|
+
#
|
12
|
+
# ```ruby
|
13
|
+
# config.within(:foo, :bar, 5)
|
14
|
+
# ```
|
15
|
+
#
|
16
|
+
# For each pair of records, if the value of `foo` is within 5 (inclusive) of
|
17
|
+
# the value of `bar`, the score is 1. Otherwise, the score is 0.
|
18
|
+
class Within < Comparator
|
19
|
+
def initialize(field_1, field_2, value)
|
20
|
+
if field_1.ruby_type != field_2.ruby_type
|
21
|
+
raise "fields must have the same type"
|
22
|
+
end
|
17
23
|
|
18
|
-
|
19
|
-
|
20
|
-
@
|
21
|
-
@value = @args[1].object
|
22
|
-
@name_2 = @args[2].name
|
24
|
+
@name_1 = field_1.name
|
25
|
+
@name_2 = field_2.name
|
26
|
+
@value = value
|
23
27
|
end
|
24
28
|
|
25
29
|
def score(record_1, record_2)
|
@@ -27,6 +31,6 @@ module Linkage
|
|
27
31
|
end
|
28
32
|
end
|
29
33
|
|
30
|
-
Comparator.register(Within)
|
34
|
+
Comparator.register('within', Within)
|
31
35
|
end
|
32
36
|
end
|