linkage 0.0.8 → 0.1.0.pre

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +1 -0
  3. data/.yardopts +1 -0
  4. data/Gemfile +1 -19
  5. data/Gemfile-java +3 -0
  6. data/README.markdown +88 -34
  7. data/Rakefile +16 -15
  8. data/TODO +4 -0
  9. data/lib/linkage/comparator.rb +139 -144
  10. data/lib/linkage/comparators/compare.rb +236 -29
  11. data/lib/linkage/comparators/strcompare.rb +85 -0
  12. data/lib/linkage/comparators/within.rb +24 -20
  13. data/lib/linkage/configuration.rb +44 -466
  14. data/lib/linkage/dataset.rb +28 -127
  15. data/lib/linkage/exceptions.rb +5 -0
  16. data/lib/linkage/field.rb +6 -37
  17. data/lib/linkage/field_set.rb +3 -3
  18. data/lib/linkage/match_recorder.rb +22 -0
  19. data/lib/linkage/match_set.rb +34 -0
  20. data/lib/linkage/match_sets/csv.rb +39 -0
  21. data/lib/linkage/match_sets/database.rb +45 -0
  22. data/lib/linkage/matcher.rb +30 -0
  23. data/lib/linkage/result_set.rb +25 -110
  24. data/lib/linkage/result_sets/csv.rb +54 -0
  25. data/lib/linkage/result_sets/database.rb +42 -0
  26. data/lib/linkage/runner.rb +57 -16
  27. data/lib/linkage/score_recorder.rb +30 -0
  28. data/lib/linkage/score_set.rb +49 -0
  29. data/lib/linkage/score_sets/csv.rb +64 -0
  30. data/lib/linkage/score_sets/database.rb +77 -0
  31. data/lib/linkage/version.rb +1 -1
  32. data/lib/linkage.rb +14 -17
  33. data/linkage.gemspec +13 -1
  34. data/linkage.gemspec-java +32 -0
  35. data/test/helper.rb +30 -23
  36. data/test/integration/test_cross_linkage.rb +46 -25
  37. data/test/integration/test_database_result_set.rb +55 -0
  38. data/test/integration/test_dual_linkage.rb +19 -94
  39. data/test/integration/test_self_linkage.rb +100 -203
  40. data/test/integration/test_within_comparator.rb +24 -77
  41. data/test/unit/comparators/test_compare.rb +254 -50
  42. data/test/unit/comparators/test_strcompare.rb +45 -0
  43. data/test/unit/comparators/test_within.rb +14 -26
  44. data/test/unit/match_sets/test_csv.rb +78 -0
  45. data/test/unit/match_sets/test_database.rb +63 -0
  46. data/test/unit/result_sets/test_csv.rb +111 -0
  47. data/test/unit/result_sets/test_database.rb +68 -0
  48. data/test/unit/score_sets/test_csv.rb +151 -0
  49. data/test/unit/score_sets/test_database.rb +149 -0
  50. data/test/unit/test_comparator.rb +46 -83
  51. data/test/unit/test_comparators.rb +4 -0
  52. data/test/unit/test_configuration.rb +99 -145
  53. data/test/unit/test_dataset.rb +52 -73
  54. data/test/unit/test_field.rb +4 -55
  55. data/test/unit/test_field_set.rb +6 -6
  56. data/test/unit/test_match_recorder.rb +23 -0
  57. data/test/unit/test_match_set.rb +23 -0
  58. data/test/unit/test_match_sets.rb +4 -0
  59. data/test/unit/test_matcher.rb +44 -0
  60. data/test/unit/test_result_set.rb +24 -223
  61. data/test/unit/test_result_sets.rb +4 -0
  62. data/test/unit/test_runner.rb +122 -17
  63. data/test/unit/test_runners.rb +4 -0
  64. data/test/unit/test_score_recorder.rb +25 -0
  65. data/test/unit/test_score_set.rb +37 -0
  66. data/test/unit/test_score_sets.rb +4 -0
  67. metadata +183 -90
  68. data/Gemfile.lock +0 -92
  69. data/lib/linkage/comparators/binary.rb +0 -12
  70. data/lib/linkage/data.rb +0 -175
  71. data/lib/linkage/decollation.rb +0 -93
  72. data/lib/linkage/expectation.rb +0 -21
  73. data/lib/linkage/expectations/exhaustive.rb +0 -63
  74. data/lib/linkage/expectations/simple.rb +0 -168
  75. data/lib/linkage/function.rb +0 -148
  76. data/lib/linkage/functions/binary.rb +0 -30
  77. data/lib/linkage/functions/cast.rb +0 -54
  78. data/lib/linkage/functions/length.rb +0 -29
  79. data/lib/linkage/functions/strftime.rb +0 -33
  80. data/lib/linkage/functions/trim.rb +0 -30
  81. data/lib/linkage/group.rb +0 -55
  82. data/lib/linkage/meta_object.rb +0 -139
  83. data/lib/linkage/runner/single_threaded.rb +0 -187
  84. data/lib/linkage/utils.rb +0 -164
  85. data/lib/linkage/warnings.rb +0 -5
  86. data/test/integration/test_collation.rb +0 -45
  87. data/test/integration/test_configuration.rb +0 -268
  88. data/test/integration/test_dataset.rb +0 -116
  89. data/test/integration/test_functions.rb +0 -88
  90. data/test/integration/test_result_set.rb +0 -85
  91. data/test/integration/test_scoring.rb +0 -84
  92. data/test/unit/expectations/test_exhaustive.rb +0 -111
  93. data/test/unit/expectations/test_simple.rb +0 -303
  94. data/test/unit/functions/test_binary.rb +0 -54
  95. data/test/unit/functions/test_cast.rb +0 -98
  96. data/test/unit/functions/test_length.rb +0 -52
  97. data/test/unit/functions/test_strftime.rb +0 -60
  98. data/test/unit/functions/test_trim.rb +0 -43
  99. data/test/unit/runner/test_single_threaded.rb +0 -12
  100. data/test/unit/test_data.rb +0 -445
  101. data/test/unit/test_decollation.rb +0 -201
  102. data/test/unit/test_function.rb +0 -233
  103. data/test/unit/test_group.rb +0 -38
  104. data/test/unit/test_meta_object.rb +0 -208
  105. data/test/unit/test_utils.rb +0 -341
@@ -1,46 +1,253 @@
1
1
  module Linkage
2
2
  module Comparators
3
- class Compare < Binary
4
- @@parameters = [
5
- [:any, :static => false, :side => :first],
6
- [String, :values => %w{> >= <= < !=}],
7
- [:any, :same_type_as => 0, :static => false, :side => :second]
3
+ # Compare is the most basic comparator in Linkage, conceptually. It scores
4
+ # two records based on whether or not field values satisfy the specified
5
+ # operator. Score is either 0 or 1.
6
+ #
7
+ # To use Compare, you must specify two sets of fields to use in the
8
+ # comparison, along with an operator. Valid operators are:
9
+ #
10
+ # * `:equal`
11
+ # * `:not_equal`
12
+ # * `:greater_than`
13
+ # * `:greater_than_or_equal`
14
+ # * `:less_than`
15
+ # * `:less_than_or_equal`
16
+ #
17
+ # Sets of fields must be of equal length. If you specify more than one
18
+ # field, each field will be compared to its counterpart in the other set.
19
+ # All of the field values must meet the conditions in order for the score to
20
+ # be 1. Otherwise, the score is 0.
21
+ #
22
+ # Consider the following example, using a {Configuration} as part of
23
+ # {Dataset#link_with}:
24
+ #
25
+ # ```ruby
26
+ # config.compare([:foo, :bar], [:baz, :qux], :equal)
27
+ # ```
28
+ #
29
+ # For each pair of records, the values of `foo` and `baz` are compared, and the
30
+ # values of `bar` and `qux` are compared. If both of these two comparisons
31
+ # are `true`, then the score of 1 is given. If `foo` and `baz` are equal but
32
+ # `bar` and `qux` are not equal, or if both comparisons are false, then a
33
+ # score of 0 is given.
34
+ #
35
+ # Algorithms
36
+ # ----------
37
+ #
38
+ # The way records are chosen for comparison depends on which operator you
39
+ # use. The `:equal` operator is treated differently than the other
40
+ # operators. When using operators other than `:equal`, each record is
41
+ # compared to every other record (and {#type} returns `:simple`). When using
42
+ # `:equal`, {#type} is `:advanced` and a different algorithm is used.
43
+ #
44
+ # "Equal" mode uses an algorithm similar to the sorted neighborhood method.
45
+ # Values are sorted (via database query) and then compared. This way, only
46
+ # adjacent records are compared. Using the transitive property of equality,
47
+ # records are grouped together. All pairs of records in the group are scored
48
+ # as 1. Scores of 0 are not given at all (absence of score means 0).
49
+ class Compare < Comparator
50
+ VALID_OPERATIONS = [
51
+ :not_equal, :greater_than, :greater_than_or_equal,
52
+ :less_than_or_equal, :less_than, :equal
8
53
  ]
9
- def self.parameters
10
- @@parameters
11
- end
12
54
 
13
- @@comparator_name = 'compare'
14
- def self.comparator_name
15
- @@comparator_name
16
- end
55
+ def initialize(set_1, set_2, operation)
56
+ if set_1.length != set_2.length
57
+ raise "sets must be of equal length"
58
+ end
59
+
60
+ # Check value data types
61
+ set_1.each_with_index do |value_1, index|
62
+ value_2 = set_2[index]
63
+ if value_1.ruby_type != value_2.ruby_type
64
+ raise "values at index #{index} had different types"
65
+ end
66
+ end
17
67
 
18
- def initialize(*args)
19
- super
20
- @name_1 = @args[0].name
21
- @operator = @args[1].object
22
- @name_2 = @args[2].name
68
+ # Check compare operator
69
+ if !VALID_OPERATIONS.include?(operation)
70
+ raise "operation is not valid"
71
+ end
72
+ @type = operation == :equal ? :advanced : :simple
73
+ @names_1 = set_1.collect(&:name)
74
+ @names_2 = set_2.collect(&:name)
75
+ @operation = operation
23
76
  end
24
77
 
25
78
  def score(record_1, record_2)
79
+ values_1 = record_1.values_at(*@names_1)
80
+ values_2 = record_2.values_at(*@names_2)
26
81
  result =
27
- case @operator
28
- when '!='
29
- record_1[@name_1] != record_2[@name_2]
30
- when '>'
31
- record_1[@name_1] > record_2[@name_2]
32
- when '>='
33
- record_1[@name_1] >= record_2[@name_2]
34
- when '<='
35
- record_1[@name_1] <= record_2[@name_2]
36
- when '<'
37
- record_1[@name_1] < record_2[@name_2]
82
+ case @operation
83
+ when :not_equal
84
+ values_1.each_with_index.all? do |value_1, i|
85
+ value_1 != values_2[i]
86
+ end
87
+ when :greater_than
88
+ values_1.each_with_index.all? do |value_1, i|
89
+ value_1 > values_2[i]
90
+ end
91
+ when :greater_than_or_equal
92
+ values_1.each_with_index.all? do |value_1, i|
93
+ value_1 >= values_2[i]
94
+ end
95
+ when :less_than_or_equal
96
+ values_1.each_with_index.all? do |value_1, i|
97
+ value_1 <= values_2[i]
98
+ end
99
+ when :less_than
100
+ values_1.each_with_index.all? do |value_1, i|
101
+ value_1 < values_2[i]
102
+ end
38
103
  end
39
104
 
40
105
  result ? 1 : 0
41
106
  end
107
+
108
+ def score_datasets(dataset_1, dataset_2)
109
+ # FIXME: nil value equality
110
+
111
+ _score_datasets(dataset_1, dataset_2)
112
+ end
113
+
114
+ def score_dataset(dataset)
115
+ # FIXME: nil value equality
116
+
117
+ if @names_1 != @names_2
118
+ return _score_datasets(dataset, dataset)
119
+ end
120
+
121
+ enum = dataset.order(*@names_1).to_enum
122
+ begin
123
+ record = enum.next
124
+ rescue StopIteration
125
+ return
126
+ end
127
+ group = [record]
128
+ last_value = record.values_at(*@names_1)
129
+ loop do
130
+ begin
131
+ record = enum.next
132
+ rescue StopIteration
133
+ break
134
+ end
135
+ value = record.values_at(*@names_1)
136
+ if value == last_value
137
+ group << record
138
+ else
139
+ score_group(group)
140
+ group.clear
141
+ group << record
142
+ last_value = value
143
+ end
144
+ end
145
+ score_group(group)
146
+ end
147
+
148
+ private
149
+
150
+ def _score_datasets(dataset_1, dataset_2)
151
+ enum_1 = dataset_1.order(*@names_1).to_enum
152
+ enum_2 = dataset_2.order(*@names_2).to_enum
153
+
154
+ begin
155
+ record_1 = enum_1.next
156
+ record_2 = enum_2.next
157
+ rescue StopIteration
158
+ # no pairs to score
159
+ return
160
+ end
161
+ group_1 = []
162
+ group_2 = []
163
+ loop do
164
+ value_1 = record_1.values_at(*@names_1)
165
+ value_2 = record_2.values_at(*@names_2)
166
+ result = value_1 <=> value_2
167
+ if result == 0
168
+ last_value = value_1
169
+ group_1 << record_1
170
+ group_2 << record_2
171
+
172
+ state = :right
173
+ loop do
174
+ begin
175
+ case state
176
+ when :left
177
+ record_1 = enum_1.next
178
+ value_1 = record_1.values_at(*@names_1)
179
+ result = last_value == value_1
180
+ when :right
181
+ record_2 = enum_2.next
182
+ value_2 = record_2.values_at(*@names_2)
183
+ result = last_value == value_2
184
+ end
185
+ rescue StopIteration
186
+ result = false
187
+ case state
188
+ when :left
189
+ record_1 = :eof
190
+ when :right
191
+ record_2 = :eof
192
+ end
193
+ end
194
+
195
+ if result
196
+ case state
197
+ when :left
198
+ group_1 << record_1
199
+ when :right
200
+ group_2 << record_2
201
+ end
202
+ else
203
+ case state
204
+ when :left
205
+ # done with this group
206
+ score_groups(group_1, group_2)
207
+ group_1.clear
208
+ group_2.clear
209
+ break
210
+ when :right
211
+ state = :left
212
+ end
213
+ end
214
+ end
215
+ if record_1 == :eof || record_2 == :eof
216
+ break
217
+ end
218
+ else
219
+ begin
220
+ if result < 0
221
+ record_1 = enum_1.next
222
+ else
223
+ record_2 = enum_2.next
224
+ end
225
+ rescue StopIteration
226
+ break
227
+ end
228
+ end
229
+ end
230
+ end
231
+
232
+ def score_groups(group_1, group_2)
233
+ group_1.each do |record_1|
234
+ group_2.each do |record_2|
235
+ changed
236
+ notify_observers(self, record_1, record_2, 1)
237
+ end
238
+ end
239
+ end
240
+
241
+ def score_group(group)
242
+ (group.length - 1).times do |i|
243
+ ((i+1)...group.length).each do |j|
244
+ changed
245
+ notify_observers(self, group[i], group[j], 1)
246
+ end
247
+ end
248
+ end
42
249
  end
43
250
 
44
- Comparator.register(Compare)
251
+ Comparator.register('compare', Compare)
45
252
  end
46
253
  end
@@ -0,0 +1,85 @@
1
+ module Linkage
2
+ module Comparators
3
+ # Strcompare is a string comparison comparator. It uses the specified
4
+ # operation to compare string-type fields. Score ranges from 0 to 1.
5
+ #
6
+ # To use Strcompare, you must specify one field for each record to use in
7
+ # the comparison, along with an operator. Valid operators are:
8
+ #
9
+ # * `:jarowinkler` ([Jaro-Winkler distance](http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance))
10
+ #
11
+ # Consider the following example, using a {Configuration} as part of
12
+ # {Dataset#link_with}:
13
+ #
14
+ # ```ruby
15
+ # config.strcompare(:foo, :bar, :jarowinkler)
16
+ # ```
17
+ #
18
+ # For each pair of records, the values of the `foo` and `bar` fields are compared
19
+ # using the Jaro-Winkler distance algorithm.
20
+ class Strcompare < Comparator
21
+ VALID_OPERATIONS = [:jarowinkler]
22
+
23
+ def initialize(field_1, field_2, operation)
24
+ if field_1.ruby_type[:type] != String || field_2.ruby_type[:type] != String
25
+ raise "fields must be string types"
26
+ end
27
+ if !VALID_OPERATIONS.include?(operation)
28
+ raise "#{operation.inspect} is not a valid operation"
29
+ end
30
+
31
+ @name_1 = field_1.name
32
+ @name_2 = field_2.name
33
+ @operation = operation
34
+ end
35
+
36
+ def score(record_1, record_2)
37
+ result =
38
+ case @operation
39
+ when :jarowinkler
40
+ jarowinkler(record_1[@name_1], record_2[@name_2])
41
+ end
42
+
43
+ result
44
+ end
45
+
46
+ def jarowinkler(w1, w2)
47
+ a = w1.downcase
48
+ b = w2.downcase
49
+ aa = a.split('')
50
+ ba = b.split('')
51
+ al = a.length
52
+ bl = b.length
53
+ l = 0
54
+ for i in Range.new(0, [[al, bl].min, 4].min-1)
55
+ break if aa[i] != ba[i]
56
+ l += 1
57
+ end
58
+ aj = aa - (aa - ba)
59
+ bj = ba - (ba - aa)
60
+ nm = 0
61
+ nt = 0
62
+ md = [[al, bl].max/2 - 1, 0].max
63
+ for i in Range.new(0, al-1)
64
+ bi = ba.index(aa[i])
65
+ aji = aj.index(aa[i])
66
+ bji = bj.index(aa[i])
67
+ if !bi.nil? && (bi + nm - i).abs <= md
68
+ nm += 1
69
+ nt += 1 if !bji.nil? && aji != bji
70
+ end
71
+ ba.delete_at(bi) if !bi.nil?
72
+ aj.delete_at(aji) if !aji.nil?
73
+ bj.delete_at(bji) if !bji.nil?
74
+ end
75
+ return 0 if nm == 0
76
+ d = (nm/al.to_f + nm/bl.to_f + (nm-nt)/nm.to_f)/3.0
77
+ w = (d + l * 0.1 * (1 - d)).round(3)
78
+ w
79
+ end
80
+ end
81
+
82
+ Comparator.register('strcompare', Strcompare)
83
+ end
84
+ end
85
+
@@ -1,25 +1,29 @@
1
1
  module Linkage
2
2
  module Comparators
3
- class Within < Binary
4
- @@parameters = [
5
- [:any, :static => false, :side => :first],
6
- [Fixnum],
7
- [:any, :same_type_as => 0, :static => false, :side => :second]
8
- ]
9
- def self.parameters
10
- @@parameters
11
- end
12
-
13
- @@comparator_name = 'within'
14
- def self.comparator_name
15
- @@comparator_name
16
- end
3
+ # Within is an integer comparator. It checks if two values are within a
4
+ # specified range. Score is either 0 or 1.
5
+ #
6
+ # To use Within, you must specify one field for each record to use in
7
+ # the comparison, along with a range value.
8
+ #
9
+ # Consider the following example, using a {Configuration} as part of
10
+ # {Dataset#link_with}:
11
+ #
12
+ # ```ruby
13
+ # config.within(:foo, :bar, 5)
14
+ # ```
15
+ #
16
+ # For each pair of records, if the value of `foo` is within 5 (inclusive) of
17
+ # the value of `bar`, the score is 1. Otherwise, the score is 0.
18
+ class Within < Comparator
19
+ def initialize(field_1, field_2, value)
20
+ if field_1.ruby_type != field_2.ruby_type
21
+ raise "fields must have the same type"
22
+ end
17
23
 
18
- def initialize(*args)
19
- super
20
- @name_1 = @args[0].name
21
- @value = @args[1].object
22
- @name_2 = @args[2].name
24
+ @name_1 = field_1.name
25
+ @name_2 = field_2.name
26
+ @value = value
23
27
  end
24
28
 
25
29
  def score(record_1, record_2)
@@ -27,6 +31,6 @@ module Linkage
27
31
  end
28
32
  end
29
33
 
30
- Comparator.register(Within)
34
+ Comparator.register('within', Within)
31
35
  end
32
36
  end