linkage 0.0.8 → 0.1.0.pre

Sign up to get free protection for your applications and to get access to all the features.
Files changed (105) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +1 -0
  3. data/.yardopts +1 -0
  4. data/Gemfile +1 -19
  5. data/Gemfile-java +3 -0
  6. data/README.markdown +88 -34
  7. data/Rakefile +16 -15
  8. data/TODO +4 -0
  9. data/lib/linkage/comparator.rb +139 -144
  10. data/lib/linkage/comparators/compare.rb +236 -29
  11. data/lib/linkage/comparators/strcompare.rb +85 -0
  12. data/lib/linkage/comparators/within.rb +24 -20
  13. data/lib/linkage/configuration.rb +44 -466
  14. data/lib/linkage/dataset.rb +28 -127
  15. data/lib/linkage/exceptions.rb +5 -0
  16. data/lib/linkage/field.rb +6 -37
  17. data/lib/linkage/field_set.rb +3 -3
  18. data/lib/linkage/match_recorder.rb +22 -0
  19. data/lib/linkage/match_set.rb +34 -0
  20. data/lib/linkage/match_sets/csv.rb +39 -0
  21. data/lib/linkage/match_sets/database.rb +45 -0
  22. data/lib/linkage/matcher.rb +30 -0
  23. data/lib/linkage/result_set.rb +25 -110
  24. data/lib/linkage/result_sets/csv.rb +54 -0
  25. data/lib/linkage/result_sets/database.rb +42 -0
  26. data/lib/linkage/runner.rb +57 -16
  27. data/lib/linkage/score_recorder.rb +30 -0
  28. data/lib/linkage/score_set.rb +49 -0
  29. data/lib/linkage/score_sets/csv.rb +64 -0
  30. data/lib/linkage/score_sets/database.rb +77 -0
  31. data/lib/linkage/version.rb +1 -1
  32. data/lib/linkage.rb +14 -17
  33. data/linkage.gemspec +13 -1
  34. data/linkage.gemspec-java +32 -0
  35. data/test/helper.rb +30 -23
  36. data/test/integration/test_cross_linkage.rb +46 -25
  37. data/test/integration/test_database_result_set.rb +55 -0
  38. data/test/integration/test_dual_linkage.rb +19 -94
  39. data/test/integration/test_self_linkage.rb +100 -203
  40. data/test/integration/test_within_comparator.rb +24 -77
  41. data/test/unit/comparators/test_compare.rb +254 -50
  42. data/test/unit/comparators/test_strcompare.rb +45 -0
  43. data/test/unit/comparators/test_within.rb +14 -26
  44. data/test/unit/match_sets/test_csv.rb +78 -0
  45. data/test/unit/match_sets/test_database.rb +63 -0
  46. data/test/unit/result_sets/test_csv.rb +111 -0
  47. data/test/unit/result_sets/test_database.rb +68 -0
  48. data/test/unit/score_sets/test_csv.rb +151 -0
  49. data/test/unit/score_sets/test_database.rb +149 -0
  50. data/test/unit/test_comparator.rb +46 -83
  51. data/test/unit/test_comparators.rb +4 -0
  52. data/test/unit/test_configuration.rb +99 -145
  53. data/test/unit/test_dataset.rb +52 -73
  54. data/test/unit/test_field.rb +4 -55
  55. data/test/unit/test_field_set.rb +6 -6
  56. data/test/unit/test_match_recorder.rb +23 -0
  57. data/test/unit/test_match_set.rb +23 -0
  58. data/test/unit/test_match_sets.rb +4 -0
  59. data/test/unit/test_matcher.rb +44 -0
  60. data/test/unit/test_result_set.rb +24 -223
  61. data/test/unit/test_result_sets.rb +4 -0
  62. data/test/unit/test_runner.rb +122 -17
  63. data/test/unit/test_runners.rb +4 -0
  64. data/test/unit/test_score_recorder.rb +25 -0
  65. data/test/unit/test_score_set.rb +37 -0
  66. data/test/unit/test_score_sets.rb +4 -0
  67. metadata +183 -90
  68. data/Gemfile.lock +0 -92
  69. data/lib/linkage/comparators/binary.rb +0 -12
  70. data/lib/linkage/data.rb +0 -175
  71. data/lib/linkage/decollation.rb +0 -93
  72. data/lib/linkage/expectation.rb +0 -21
  73. data/lib/linkage/expectations/exhaustive.rb +0 -63
  74. data/lib/linkage/expectations/simple.rb +0 -168
  75. data/lib/linkage/function.rb +0 -148
  76. data/lib/linkage/functions/binary.rb +0 -30
  77. data/lib/linkage/functions/cast.rb +0 -54
  78. data/lib/linkage/functions/length.rb +0 -29
  79. data/lib/linkage/functions/strftime.rb +0 -33
  80. data/lib/linkage/functions/trim.rb +0 -30
  81. data/lib/linkage/group.rb +0 -55
  82. data/lib/linkage/meta_object.rb +0 -139
  83. data/lib/linkage/runner/single_threaded.rb +0 -187
  84. data/lib/linkage/utils.rb +0 -164
  85. data/lib/linkage/warnings.rb +0 -5
  86. data/test/integration/test_collation.rb +0 -45
  87. data/test/integration/test_configuration.rb +0 -268
  88. data/test/integration/test_dataset.rb +0 -116
  89. data/test/integration/test_functions.rb +0 -88
  90. data/test/integration/test_result_set.rb +0 -85
  91. data/test/integration/test_scoring.rb +0 -84
  92. data/test/unit/expectations/test_exhaustive.rb +0 -111
  93. data/test/unit/expectations/test_simple.rb +0 -303
  94. data/test/unit/functions/test_binary.rb +0 -54
  95. data/test/unit/functions/test_cast.rb +0 -98
  96. data/test/unit/functions/test_length.rb +0 -52
  97. data/test/unit/functions/test_strftime.rb +0 -60
  98. data/test/unit/functions/test_trim.rb +0 -43
  99. data/test/unit/runner/test_single_threaded.rb +0 -12
  100. data/test/unit/test_data.rb +0 -445
  101. data/test/unit/test_decollation.rb +0 -201
  102. data/test/unit/test_function.rb +0 -233
  103. data/test/unit/test_group.rb +0 -38
  104. data/test/unit/test_meta_object.rb +0 -208
  105. data/test/unit/test_utils.rb +0 -341
@@ -1,46 +1,253 @@
1
1
  module Linkage
2
2
  module Comparators
3
- class Compare < Binary
4
- @@parameters = [
5
- [:any, :static => false, :side => :first],
6
- [String, :values => %w{> >= <= < !=}],
7
- [:any, :same_type_as => 0, :static => false, :side => :second]
3
+ # Compare is the most basic comparator in Linkage, conceptually. It scores
4
+ # two records based on whether or not field values satisfy the specified
5
+ # operator. Score is either 0 or 1.
6
+ #
7
+ # To use Compare, you must specify two sets of fields to use in the
8
+ # comparison, along with an operator. Valid operators are:
9
+ #
10
+ # * `:equal`
11
+ # * `:not_equal`
12
+ # * `:greater_than`
13
+ # * `:greater_than_or_equal`
14
+ # * `:less_than`
15
+ # * `:less_than_or_equal`
16
+ #
17
+ # Sets of fields must be of equal length. If you specify more than one
18
+ # field, each field will be compared to its counterpart in the other set.
19
+ # All of the field values must meet the conditions in order for the score to
20
+ # be 1. Otherwise, the score is 0.
21
+ #
22
+ # Consider the following example, using a {Configuration} as part of
23
+ # {Dataset#link_with}:
24
+ #
25
+ # ```ruby
26
+ # config.compare([:foo, :bar], [:baz, :qux], :equal)
27
+ # ```
28
+ #
29
+ # For each pair of records, the values of `foo` and `baz` are compared, and the
30
+ # values of `bar` and `qux` are compared. If both of these two comparisons
31
+ # are `true`, then a score of 1 is given. If `foo` and `baz` are equal but
32
+ # `bar` and `qux` are not equal, or if both comparisons are false, then a
33
+ # score of 0 is given.
34
+ #
35
+ # Algorithms
36
+ # ----------
37
+ #
38
+ # The way records are chosen for comparison depends on which operator you
39
+ # use. The `:equal` operator is treated differently than the other
40
+ # operators. When using operators other than `:equal`, each record is
41
+ # compared to every other record (and {#type} returns `:simple`). When using
42
+ # `:equal`, {#type} is `:advanced` and a different algorithm is used.
43
+ #
44
+ # "Equal" mode uses an algorithm similar to the sorted neighborhood method.
45
+ # Values are sorted (via database query) and then compared. This way, only
46
+ # adjacent records are compared. Using the transitive property of equality,
47
+ # records are grouped together. All pairs of records in the group are scored
48
+ # as 1. Scores of 0 are not given at all (absence of score means 0).
49
+ class Compare < Comparator
50
+ VALID_OPERATIONS = [
51
+ :not_equal, :greater_than, :greater_than_or_equal,
52
+ :less_than_or_equal, :less_than, :equal
8
53
  ]
9
- def self.parameters
10
- @@parameters
11
- end
12
54
 
13
- @@comparator_name = 'compare'
14
- def self.comparator_name
15
- @@comparator_name
16
- end
55
+ def initialize(set_1, set_2, operation)
56
+ if set_1.length != set_2.length
57
+ raise "sets must be of equal length"
58
+ end
59
+
60
+ # Check value data types
61
+ set_1.each_with_index do |value_1, index|
62
+ value_2 = set_2[index]
63
+ if value_1.ruby_type != value_2.ruby_type
64
+ raise "values at index #{index} had different types"
65
+ end
66
+ end
17
67
 
18
- def initialize(*args)
19
- super
20
- @name_1 = @args[0].name
21
- @operator = @args[1].object
22
- @name_2 = @args[2].name
68
+ # Check compare operator
69
+ if !VALID_OPERATIONS.include?(operation)
70
+ raise "operation is not valid"
71
+ end
72
+ @type = operation == :equal ? :advanced : :simple
73
+ @names_1 = set_1.collect(&:name)
74
+ @names_2 = set_2.collect(&:name)
75
+ @operation = operation
23
76
  end
24
77
 
25
78
  def score(record_1, record_2)
79
+ values_1 = record_1.values_at(*@names_1)
80
+ values_2 = record_2.values_at(*@names_2)
26
81
  result =
27
- case @operator
28
- when '!='
29
- record_1[@name_1] != record_2[@name_2]
30
- when '>'
31
- record_1[@name_1] > record_2[@name_2]
32
- when '>='
33
- record_1[@name_1] >= record_2[@name_2]
34
- when '<='
35
- record_1[@name_1] <= record_2[@name_2]
36
- when '<'
37
- record_1[@name_1] < record_2[@name_2]
82
+ case @operation
83
+ when :not_equal
84
+ values_1.each_with_index.all? do |value_1, i|
85
+ value_1 != values_2[i]
86
+ end
87
+ when :greater_than
88
+ values_1.each_with_index.all? do |value_1, i|
89
+ value_1 > values_2[i]
90
+ end
91
+ when :greater_than_or_equal
92
+ values_1.each_with_index.all? do |value_1, i|
93
+ value_1 >= values_2[i]
94
+ end
95
+ when :less_than_or_equal
96
+ values_1.each_with_index.all? do |value_1, i|
97
+ value_1 <= values_2[i]
98
+ end
99
+ when :less_than
100
+ values_1.each_with_index.all? do |value_1, i|
101
+ value_1 < values_2[i]
102
+ end
38
103
  end
39
104
 
40
105
  result ? 1 : 0
41
106
  end
107
+
108
+ def score_datasets(dataset_1, dataset_2)
109
+ # FIXME: nil value equality
110
+
111
+ _score_datasets(dataset_1, dataset_2)
112
+ end
113
+
114
+ def score_dataset(dataset)
115
+ # FIXME: nil value equality
116
+
117
+ if @names_1 != @names_2
118
+ return _score_datasets(dataset, dataset)
119
+ end
120
+
121
+ enum = dataset.order(*@names_1).to_enum
122
+ begin
123
+ record = enum.next
124
+ rescue StopIteration
125
+ return
126
+ end
127
+ group = [record]
128
+ last_value = record.values_at(*@names_1)
129
+ loop do
130
+ begin
131
+ record = enum.next
132
+ rescue StopIteration
133
+ break
134
+ end
135
+ value = record.values_at(*@names_1)
136
+ if value == last_value
137
+ group << record
138
+ else
139
+ score_group(group)
140
+ group.clear
141
+ group << record
142
+ last_value = value
143
+ end
144
+ end
145
+ score_group(group)
146
+ end
147
+
148
+ private
149
+
150
+ def _score_datasets(dataset_1, dataset_2)
151
+ enum_1 = dataset_1.order(*@names_1).to_enum
152
+ enum_2 = dataset_2.order(*@names_2).to_enum
153
+
154
+ begin
155
+ record_1 = enum_1.next
156
+ record_2 = enum_2.next
157
+ rescue StopIteration
158
+ # no pairs to score
159
+ return
160
+ end
161
+ group_1 = []
162
+ group_2 = []
163
+ loop do
164
+ value_1 = record_1.values_at(*@names_1)
165
+ value_2 = record_2.values_at(*@names_2)
166
+ result = value_1 <=> value_2
167
+ if result == 0
168
+ last_value = value_1
169
+ group_1 << record_1
170
+ group_2 << record_2
171
+
172
+ state = :right
173
+ loop do
174
+ begin
175
+ case state
176
+ when :left
177
+ record_1 = enum_1.next
178
+ value_1 = record_1.values_at(*@names_1)
179
+ result = last_value == value_1
180
+ when :right
181
+ record_2 = enum_2.next
182
+ value_2 = record_2.values_at(*@names_2)
183
+ result = last_value == value_2
184
+ end
185
+ rescue StopIteration
186
+ result = false
187
+ case state
188
+ when :left
189
+ record_1 = :eof
190
+ when :right
191
+ record_2 = :eof
192
+ end
193
+ end
194
+
195
+ if result
196
+ case state
197
+ when :left
198
+ group_1 << record_1
199
+ when :right
200
+ group_2 << record_2
201
+ end
202
+ else
203
+ case state
204
+ when :left
205
+ # done with this group
206
+ score_groups(group_1, group_2)
207
+ group_1.clear
208
+ group_2.clear
209
+ break
210
+ when :right
211
+ state = :left
212
+ end
213
+ end
214
+ end
215
+ if record_1 == :eof || record_2 == :eof
216
+ break
217
+ end
218
+ else
219
+ begin
220
+ if result < 0
221
+ record_1 = enum_1.next
222
+ else
223
+ record_2 = enum_2.next
224
+ end
225
+ rescue StopIteration
226
+ break
227
+ end
228
+ end
229
+ end
230
+ end
231
+
232
+ def score_groups(group_1, group_2)
233
+ group_1.each do |record_1|
234
+ group_2.each do |record_2|
235
+ changed
236
+ notify_observers(self, record_1, record_2, 1)
237
+ end
238
+ end
239
+ end
240
+
241
+ def score_group(group)
242
+ (group.length - 1).times do |i|
243
+ ((i+1)...group.length).each do |j|
244
+ changed
245
+ notify_observers(self, group[i], group[j], 1)
246
+ end
247
+ end
248
+ end
42
249
  end
43
250
 
44
- Comparator.register(Compare)
251
+ Comparator.register('compare', Compare)
45
252
  end
46
253
  end
@@ -0,0 +1,85 @@
1
+ module Linkage
2
+ module Comparators
3
+ # Strcompare is a string comparison comparator. It uses the specified
4
+ # operation to compare string-type fields. Score ranges from 0 to 1.
5
+ #
6
+ # To use Strcompare, you must specify one field for each record to use in
7
+ # the comparison, along with an operator. Valid operators are:
8
+ #
9
+ # * `:jarowinkler` ([Jaro-Winkler distance](http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance))
10
+ #
11
+ # Consider the following example, using a {Configuration} as part of
12
+ # {Dataset#link_with}:
13
+ #
14
+ # ```ruby
15
+ # config.strcompare(:foo, :bar, :jarowinkler)
16
+ # ```
17
+ #
18
+ # For each pair of records, the values of the `foo` and `bar` fields are compared
19
+ # using the Jaro-Winkler distance algorithm.
20
+ class Strcompare < Comparator
21
+ VALID_OPERATIONS = [:jarowinkler]
22
+
23
+ def initialize(field_1, field_2, operation)
24
+ if field_1.ruby_type[:type] != String || field_2.ruby_type[:type] != String
25
+ raise "fields must be string types"
26
+ end
27
+ if !VALID_OPERATIONS.include?(operation)
28
+ raise "#{operation.inspect} is not a valid operation"
29
+ end
30
+
31
+ @name_1 = field_1.name
32
+ @name_2 = field_2.name
33
+ @operation = operation
34
+ end
35
+
36
+ def score(record_1, record_2)
37
+ result =
38
+ case @operation
39
+ when :jarowinkler
40
+ jarowinkler(record_1[@name_1], record_2[@name_2])
41
+ end
42
+
43
+ result
44
+ end
45
+
46
+ def jarowinkler(w1, w2)
47
+ a = w1.downcase
48
+ b = w2.downcase
49
+ aa = a.split('')
50
+ ba = b.split('')
51
+ al = a.length
52
+ bl = b.length
53
+ l = 0
54
+ for i in Range.new(0, [[al, bl].min, 4].min-1)
55
+ break if aa[i] != ba[i]
56
+ l += 1
57
+ end
58
+ aj = aa - (aa - ba)
59
+ bj = ba - (ba - aa)
60
+ nm = 0
61
+ nt = 0
62
+ md = [[al, bl].max/2 - 1, 0].max
63
+ for i in Range.new(0, al-1)
64
+ bi = ba.index(aa[i])
65
+ aji = aj.index(aa[i])
66
+ bji = bj.index(aa[i])
67
+ if !bi.nil? && (bi + nm - i).abs <= md
68
+ nm += 1
69
+ nt += 1 if !bji.nil? && aji != bji
70
+ end
71
+ ba.delete_at(bi) if !bi.nil?
72
+ aj.delete_at(aji) if !aji.nil?
73
+ bj.delete_at(bji) if !bji.nil?
74
+ end
75
+ return 0 if nm == 0
76
+ d = (nm/al.to_f + nm/bl.to_f + (nm-nt)/nm.to_f)/3.0
77
+ w = (d + l * 0.1 * (1 - d)).round(3)
78
+ w
79
+ end
80
+ end
81
+
82
+ Comparator.register('strcompare', Strcompare)
83
+ end
84
+ end
85
+
@@ -1,25 +1,29 @@
1
1
  module Linkage
2
2
  module Comparators
3
- class Within < Binary
4
- @@parameters = [
5
- [:any, :static => false, :side => :first],
6
- [Fixnum],
7
- [:any, :same_type_as => 0, :static => false, :side => :second]
8
- ]
9
- def self.parameters
10
- @@parameters
11
- end
12
-
13
- @@comparator_name = 'within'
14
- def self.comparator_name
15
- @@comparator_name
16
- end
3
+ # Within is an integer comparator. It checks if two values are within a
4
+ # specified range. Score is either 0 or 1.
5
+ #
6
+ # To use Within, you must specify one field for each record to use in
7
+ # the comparison, along with a range value.
8
+ #
9
+ # Consider the following example, using a {Configuration} as part of
10
+ # {Dataset#link_with}:
11
+ #
12
+ # ```ruby
13
+ # config.within(:foo, :bar, 5)
14
+ # ```
15
+ #
16
+ # For each pair of records, if the value of `foo` is within 5 (inclusive) of
17
+ # the value of `bar`, the score is 1. Otherwise, the score is 0.
18
+ class Within < Comparator
19
+ def initialize(field_1, field_2, value)
20
+ if field_1.ruby_type != field_2.ruby_type
21
+ raise "fields must have the same type"
22
+ end
17
23
 
18
- def initialize(*args)
19
- super
20
- @name_1 = @args[0].name
21
- @value = @args[1].object
22
- @name_2 = @args[2].name
24
+ @name_1 = field_1.name
25
+ @name_2 = field_2.name
26
+ @value = value
23
27
  end
24
28
 
25
29
  def score(record_1, record_2)
@@ -27,6 +31,6 @@ module Linkage
27
31
  end
28
32
  end
29
33
 
30
- Comparator.register(Within)
34
+ Comparator.register('within', Within)
31
35
  end
32
36
  end