linkage 0.0.8 → 0.1.0.pre

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105)
  1. checksums.yaml +7 -0
  2. data/.gitignore +1 -0
  3. data/.yardopts +1 -0
  4. data/Gemfile +1 -19
  5. data/Gemfile-java +3 -0
  6. data/README.markdown +88 -34
  7. data/Rakefile +16 -15
  8. data/TODO +4 -0
  9. data/lib/linkage/comparator.rb +139 -144
  10. data/lib/linkage/comparators/compare.rb +236 -29
  11. data/lib/linkage/comparators/strcompare.rb +85 -0
  12. data/lib/linkage/comparators/within.rb +24 -20
  13. data/lib/linkage/configuration.rb +44 -466
  14. data/lib/linkage/dataset.rb +28 -127
  15. data/lib/linkage/exceptions.rb +5 -0
  16. data/lib/linkage/field.rb +6 -37
  17. data/lib/linkage/field_set.rb +3 -3
  18. data/lib/linkage/match_recorder.rb +22 -0
  19. data/lib/linkage/match_set.rb +34 -0
  20. data/lib/linkage/match_sets/csv.rb +39 -0
  21. data/lib/linkage/match_sets/database.rb +45 -0
  22. data/lib/linkage/matcher.rb +30 -0
  23. data/lib/linkage/result_set.rb +25 -110
  24. data/lib/linkage/result_sets/csv.rb +54 -0
  25. data/lib/linkage/result_sets/database.rb +42 -0
  26. data/lib/linkage/runner.rb +57 -16
  27. data/lib/linkage/score_recorder.rb +30 -0
  28. data/lib/linkage/score_set.rb +49 -0
  29. data/lib/linkage/score_sets/csv.rb +64 -0
  30. data/lib/linkage/score_sets/database.rb +77 -0
  31. data/lib/linkage/version.rb +1 -1
  32. data/lib/linkage.rb +14 -17
  33. data/linkage.gemspec +13 -1
  34. data/linkage.gemspec-java +32 -0
  35. data/test/helper.rb +30 -23
  36. data/test/integration/test_cross_linkage.rb +46 -25
  37. data/test/integration/test_database_result_set.rb +55 -0
  38. data/test/integration/test_dual_linkage.rb +19 -94
  39. data/test/integration/test_self_linkage.rb +100 -203
  40. data/test/integration/test_within_comparator.rb +24 -77
  41. data/test/unit/comparators/test_compare.rb +254 -50
  42. data/test/unit/comparators/test_strcompare.rb +45 -0
  43. data/test/unit/comparators/test_within.rb +14 -26
  44. data/test/unit/match_sets/test_csv.rb +78 -0
  45. data/test/unit/match_sets/test_database.rb +63 -0
  46. data/test/unit/result_sets/test_csv.rb +111 -0
  47. data/test/unit/result_sets/test_database.rb +68 -0
  48. data/test/unit/score_sets/test_csv.rb +151 -0
  49. data/test/unit/score_sets/test_database.rb +149 -0
  50. data/test/unit/test_comparator.rb +46 -83
  51. data/test/unit/test_comparators.rb +4 -0
  52. data/test/unit/test_configuration.rb +99 -145
  53. data/test/unit/test_dataset.rb +52 -73
  54. data/test/unit/test_field.rb +4 -55
  55. data/test/unit/test_field_set.rb +6 -6
  56. data/test/unit/test_match_recorder.rb +23 -0
  57. data/test/unit/test_match_set.rb +23 -0
  58. data/test/unit/test_match_sets.rb +4 -0
  59. data/test/unit/test_matcher.rb +44 -0
  60. data/test/unit/test_result_set.rb +24 -223
  61. data/test/unit/test_result_sets.rb +4 -0
  62. data/test/unit/test_runner.rb +122 -17
  63. data/test/unit/test_runners.rb +4 -0
  64. data/test/unit/test_score_recorder.rb +25 -0
  65. data/test/unit/test_score_set.rb +37 -0
  66. data/test/unit/test_score_sets.rb +4 -0
  67. metadata +183 -90
  68. data/Gemfile.lock +0 -92
  69. data/lib/linkage/comparators/binary.rb +0 -12
  70. data/lib/linkage/data.rb +0 -175
  71. data/lib/linkage/decollation.rb +0 -93
  72. data/lib/linkage/expectation.rb +0 -21
  73. data/lib/linkage/expectations/exhaustive.rb +0 -63
  74. data/lib/linkage/expectations/simple.rb +0 -168
  75. data/lib/linkage/function.rb +0 -148
  76. data/lib/linkage/functions/binary.rb +0 -30
  77. data/lib/linkage/functions/cast.rb +0 -54
  78. data/lib/linkage/functions/length.rb +0 -29
  79. data/lib/linkage/functions/strftime.rb +0 -33
  80. data/lib/linkage/functions/trim.rb +0 -30
  81. data/lib/linkage/group.rb +0 -55
  82. data/lib/linkage/meta_object.rb +0 -139
  83. data/lib/linkage/runner/single_threaded.rb +0 -187
  84. data/lib/linkage/utils.rb +0 -164
  85. data/lib/linkage/warnings.rb +0 -5
  86. data/test/integration/test_collation.rb +0 -45
  87. data/test/integration/test_configuration.rb +0 -268
  88. data/test/integration/test_dataset.rb +0 -116
  89. data/test/integration/test_functions.rb +0 -88
  90. data/test/integration/test_result_set.rb +0 -85
  91. data/test/integration/test_scoring.rb +0 -84
  92. data/test/unit/expectations/test_exhaustive.rb +0 -111
  93. data/test/unit/expectations/test_simple.rb +0 -303
  94. data/test/unit/functions/test_binary.rb +0 -54
  95. data/test/unit/functions/test_cast.rb +0 -98
  96. data/test/unit/functions/test_length.rb +0 -52
  97. data/test/unit/functions/test_strftime.rb +0 -60
  98. data/test/unit/functions/test_trim.rb +0 -43
  99. data/test/unit/runner/test_single_threaded.rb +0 -12
  100. data/test/unit/test_data.rb +0 -445
  101. data/test/unit/test_decollation.rb +0 -201
  102. data/test/unit/test_function.rb +0 -233
  103. data/test/unit/test_group.rb +0 -38
  104. data/test/unit/test_meta_object.rb +0 -208
  105. data/test/unit/test_utils.rb +0 -341
@@ -1,494 +1,72 @@
1
1
  module Linkage
2
2
  class Configuration
3
- class DSL
4
- # Class for visually comparing matched records
5
- class VisualComparisonWrapper
6
- attr_reader :dsl, :lhs, :rhs
3
+ attr_reader :dataset_1, :dataset_2, :result_set, :comparators
4
+ attr_accessor :record_cache_size, :algorithm, :threshold
7
5
 
8
- def initialize(dsl, lhs, rhs)
9
- @dsl = dsl
10
- @lhs = lhs
11
- @rhs = rhs
12
-
13
- if @lhs.is_a?(DataWrapper) && @rhs.is_a?(DataWrapper)
14
- if @lhs.side == @rhs.side
15
- raise ArgumentError, "Can't visually compare two data sources on the same side"
16
- end
17
- else
18
- raise ArgumentError, "Must supply two data sources for visual comparison"
19
- end
20
-
21
- @dsl.add_visual_comparison(self)
22
- end
23
- end
24
-
25
- class ExpectationWrapper
26
- VALID_OPERATORS = [:==, :>, :<, :>=, :<=]
27
- OPERATOR_OPPOSITES = {
28
- :== => :'!=',
29
- :> => :<=,
30
- :<= => :>,
31
- :< => :>=,
32
- :>= => :<
33
- }
34
-
35
- def initialize(dsl, type, lhs, *args)
36
- @dsl = dsl
37
- @type = type
38
- @lhs = lhs
39
- end
40
-
41
- def compare_with(operator, rhs)
42
- # NOTE: lhs is always a DataWrapper
43
-
44
- if !rhs.is_a?(DataWrapper) || @lhs.static? || rhs.static? || @lhs.side == rhs.side
45
- @side = !@lhs.static? ? @lhs.side : rhs.side
46
-
47
- # If one of the objects in this comparison is a static function, we need to set the side
48
- # and the dataset based on the other object
49
- if rhs.is_a?(DataWrapper) && !rhs.static? && @lhs.is_a?(FunctionWrapper) && @lhs.static?
50
- @lhs.dataset = rhs.dataset
51
- @lhs.side = @side
52
- elsif @lhs.is_a?(DataWrapper) && !@lhs.static? && rhs.is_a?(FunctionWrapper) && rhs.static?
53
- rhs.dataset = @lhs.dataset
54
- rhs.side = @side
55
- end
56
- elsif rhs.is_a?(DataWrapper) && operator != :==
57
- # create an exhaustive expectation with the Compare comparator instead
58
- comparator = Comparators::Compare.new(@lhs.meta_object,
59
- MetaObject.new(operator.to_s), rhs.meta_object)
60
-
61
- score_range = Comparators::Compare.score_range
62
- threshold = @type == :must ? score_range.last : score_range.first
63
-
64
- expectation = Expectations::Exhaustive.new(comparator, threshold, :equal)
65
- @dsl.add_exhaustive_expectation(expectation)
66
- return self
67
- end
68
-
69
- exp_operator = @type == :must_not ? OPERATOR_OPPOSITES[operator] : operator
70
-
71
- rhs_meta_object = rhs.is_a?(DataWrapper) ? rhs.meta_object : MetaObject.new(rhs)
72
- @expectation = Expectations::Simple.create(@lhs.meta_object,
73
- rhs_meta_object, exp_operator)
74
- @dsl.add_simple_expectation(@expectation)
75
- self
76
- end
77
-
78
- VALID_OPERATORS.each do |operator|
79
- define_method(operator) do |rhs|
80
- compare_with(operator, rhs)
81
- end
82
- end
83
-
84
- def exactly
85
- if !@exact_match
86
- @expectation.exactly!
87
- end
88
- end
89
- end
90
-
91
- class DataWrapper
92
- attr_reader :meta_object
93
-
94
- def initialize
95
- raise NotImplementedError
96
- end
97
-
98
- [:must, :must_not].each do |type|
99
- define_method(type) do |*args|
100
- if args.length > 0
101
- wrapper = args[0]
102
- comparator = wrapper.to_comparator(self)
103
-
104
- score_range = wrapper.klass.score_range
105
- threshold = type == :must ? score_range.last : score_range.first
106
-
107
- expectation = Expectations::Exhaustive.new(comparator, threshold, :equal)
108
- @dsl.add_exhaustive_expectation(expectation)
109
- else
110
- ExpectationWrapper.new(@dsl, type, self)
111
- end
112
- end
113
- end
114
-
115
- def compare_with(other)
116
- VisualComparisonWrapper.new(@dsl, self, other)
117
- end
118
-
119
- def method_missing(m, *args, &block)
120
- if meta_object.respond_to?(m)
121
- meta_object.send(m, *args, &block)
122
- else
123
- super(m, *args, &block)
124
- end
125
- end
126
- end
127
-
128
- class FieldWrapper < DataWrapper
129
- attr_reader :name
130
-
131
- def initialize(dsl, side, dataset, name)
132
- @dsl = dsl
133
- @meta_object = MetaObject.new(dataset.field_set[name], side)
134
- end
135
- end
136
-
137
- class FunctionWrapper < DataWrapper
138
- def initialize(dsl, klass, args)
139
- @dsl = dsl
140
-
141
- side = dataset = nil
142
- static = true
143
- function_args = []
144
- args.each do |arg|
145
- if arg.kind_of?(DataWrapper)
146
- raise "conflicting sides" if side && side != arg.side
147
- side = arg.side
148
- static &&= arg.static?
149
- dataset = arg.dataset
150
- function_args << arg.object
151
- else
152
- function_args << arg
153
- end
154
- end
155
- @meta_object = MetaObject.new(klass.new(*function_args), side)
156
- end
6
+ def initialize(*args)
7
+ if args.length < 2 || args.length > 3
8
+ raise ArgumentError, "wrong number of arguments (#{args.length} for 3..4)"
157
9
  end
158
10
 
159
- class ComparatorWrapper
160
- attr_reader :klass, :args
161
-
162
- def initialize(dsl, klass, args)
163
- @dsl = dsl
164
- @klass = klass
165
- @args = args
166
- end
167
-
168
- def of(*args)
169
- @args.push(*args)
170
- self
171
- end
172
-
173
- def to_comparator(receiver)
174
- comparator_args = ([receiver] + @args).collect do |arg|
175
- arg.is_a?(DataWrapper) ? arg.meta_object : MetaObject.new(arg)
176
- end
177
- comparator = klass.new(*comparator_args)
178
- end
179
- end
180
-
181
- class DatasetWrapper
182
- attr_reader :dataset
183
-
184
- def initialize(dsl, side, dataset)
185
- @dsl = dsl
186
- @dataset = dataset
187
- @side = side
188
- end
189
-
190
- def [](field_name)
191
- if @dataset.field_set.has_key?(field_name)
192
- FieldWrapper.new(@dsl, @side, @dataset, field_name)
193
- else
194
- raise ArgumentError, "The '#{field_name}' field doesn't exist for the #{@side} dataset!"
195
- end
196
- end
197
- end
198
-
199
- def initialize(config, &block)
200
- @config = config
201
- @lhs_filters = []
202
- @rhs_filters = []
203
- instance_eval(&block)
204
- end
205
-
206
- def lhs
207
- DatasetWrapper.new(self, :lhs, @config.dataset_1)
208
- end
209
-
210
- def rhs
211
- DatasetWrapper.new(self, :rhs, @config.dataset_2)
212
- end
213
-
214
- def save_results_in(uri, options = {})
215
- @config.results_uri = uri
216
- @config.results_uri_options = options
217
- end
218
-
219
- def set_record_cache_size(num)
220
- @config.record_cache_size = num
221
- end
222
-
223
- def add_simple_expectation(expectation)
224
- @config.add_simple_expectation(expectation)
225
-
226
- if @config.linkage_type == :self
227
- case expectation.kind
228
- when :cross
229
- @config.linkage_type = :cross
230
- when :filter
231
- # If there different filters on both 'sides' of a self-linkage,
232
- # it turns into a cross linkage.
233
- these_filters, other_filters =
234
- case expectation.side
235
- when :lhs
236
- [@lhs_filters, @rhs_filters]
237
- when :rhs
238
- [@rhs_filters, @lhs_filters]
239
- end
240
-
241
- these_filters << expectation
242
- other_filters.each do |other|
243
- if !expectation.same_except_side?(other)
244
- @config.linkage_type = :cross
245
- break
246
- end
247
- end
248
- end
249
- end
11
+ @dataset_1 = args[0]
12
+ if args.length > 2 && args[1]
13
+ @dataset_2 = args[1]
250
14
  end
15
+ @result_set = args[-1]
251
16
 
252
- def add_exhaustive_expectation(expectation)
253
- @config.add_exhaustive_expectation(expectation)
254
- if @config.linkage_type == :self
255
- @config.linkage_type = expectation.kind
256
- end
257
- end
258
-
259
- def add_visual_comparison(visual_comparison)
260
- @config.visual_comparisons << visual_comparison
261
- end
262
-
263
- def groups_table_name(new_name)
264
- @config.groups_table_name = new_name
265
- end
266
-
267
- def original_groups_table_name(new_name)
268
- @config.original_groups_table_name = new_name
269
- end
270
-
271
- def scores_table_name(new_name)
272
- @config.scores_table_name = new_name
273
- end
274
-
275
- def matches_table_name(new_name)
276
- @config.matches_table_name = new_name
277
- end
278
-
279
- def method_missing(name, *args, &block)
280
- # check for comparators
281
- md = name.to_s.match(/^be_(.+)$/)
282
- if md
283
- klass = Comparator[md[1]]
284
- if klass
285
- ComparatorWrapper.new(self, klass, args)
286
- else
287
- super
288
- end
289
- else
290
- # check for functions
291
- klass = Function[name.to_s]
292
- if klass
293
- FunctionWrapper.new(self, klass, args)
294
- else
295
- super
296
- end
297
- end
298
- end
299
- end
300
-
301
- attr_reader :dataset_1, :dataset_2, :simple_expectations,
302
- :exhaustive_expectations, :visual_comparisons
303
- attr_accessor :linkage_type, :results_uri, :results_uri_options,
304
- :record_cache_size, :groups_table_name, :original_groups_table_name,
305
- :scores_table_name, :matches_table_name
306
-
307
- def initialize(dataset_1, dataset_2)
308
- @dataset_1 = dataset_1
309
- @dataset_2 = dataset_2
310
- @linkage_type = dataset_1 == dataset_2 ? :self : :dual
311
- @simple_expectations = []
312
- @exhaustive_expectations = []
313
- @visual_comparisons = []
314
- @results_uri_options = {}
315
- @decollation_needed = false
17
+ @comparators = []
316
18
  @record_cache_size = 10_000
317
- @groups_table_name = :groups
318
- @original_groups_table_name = :original_groups
319
- @scores_table_name = :scores
320
- @matches_table_name = :matches
321
- end
322
-
323
- def configure(&block)
324
- DSL.new(self, &block)
325
19
  end
326
20
 
327
- def results_uri=(uri)
328
- @results_uri = uri
329
- if !@decollation_needed
330
- @simple_expectations.each do |expectation|
331
- if decollation_needed_for_simple_expectation?(expectation)
332
- @decollation_needed = true
333
- break
334
- end
335
- end
21
+ def score_recorder
22
+ pk_1 = @dataset_1.field_set.primary_key.name
23
+ if @dataset_2
24
+ pk_2 = @dataset_2.field_set.primary_key.name
25
+ else
26
+ pk_2 = pk_1
336
27
  end
337
- uri
338
- end
339
-
340
- def decollation_needed?
341
- @decollation_needed
28
+ ScoreRecorder.new(@comparators, @result_set.score_set, [pk_1, pk_2])
342
29
  end
343
30
 
344
- def groups_table_schema
345
- schema = []
346
-
347
- # add id
348
- schema << [:id, Integer, {:primary_key => true}]
349
-
350
- # add values
351
- @simple_expectations.each do |exp|
352
- next if exp.kind == :filter
353
-
354
- merged_field = exp.merged_field
355
- merged_type = merged_field.ruby_type
356
-
357
- # if the merged field's database type is different than the result
358
- # database, strip collation information
359
- result_db_type = nil
360
- result_set.database do |db|
361
- result_db_type = db.database_type
362
- end
363
- if merged_field.database_type != result_db_type && merged_type.has_key?(:opts)
364
- new_opts = merged_type[:opts].reject { |k, v| k == :collate }
365
- merged_type = merged_type.merge(:opts => new_opts)
366
- end
367
-
368
- col = [merged_field.name, merged_type[:type], merged_type[:opts] || {}]
369
- schema << col
370
- end
371
-
372
- schema
31
+ def matcher
32
+ Matcher.new(@comparators, @result_set.score_set, @algorithm || :mean, @threshold || 0.5)
373
33
  end
374
34
 
375
- def scores_table_schema
376
- schema = []
377
-
378
- # add id
379
- schema << [:id, Integer, {:primary_key => true}]
380
-
381
- # add comparator id
382
- schema << [:comparator_id, Integer, {}]
383
-
384
- # add record ids
385
- pk = dataset_1.field_set.primary_key
386
- ruby_type = pk.ruby_type
387
- schema << [:record_1_id, ruby_type[:type], ruby_type[:opts] || {}]
388
-
389
- pk = dataset_2.field_set.primary_key
390
- ruby_type = pk.ruby_type
391
- schema << [:record_2_id, ruby_type[:type], ruby_type[:opts] || {}]
392
-
393
- # add score
394
- schema << [:score, Integer, {}]
395
-
396
- schema
35
+ def match_recorder(matcher)
36
+ MatchRecorder.new(matcher, @result_set.match_set)
397
37
  end
398
38
 
399
- def matches_table_schema
400
- schema = []
401
-
402
- # add id
403
- schema << [:id, Integer, {:primary_key => true}]
404
-
405
- # add record ids
406
- pk = dataset_1.field_set.primary_key
407
- ruby_type = pk.ruby_type
408
- schema << [:record_1_id, ruby_type[:type], ruby_type[:opts] || {}]
409
-
410
- pk = dataset_2.field_set.primary_key
411
- ruby_type = pk.ruby_type
412
- schema << [:record_2_id, ruby_type[:type], ruby_type[:opts] || {}]
413
-
414
- # add score
415
- schema << [:total_score, Integer, {}]
416
-
417
- schema
418
- end
419
-
420
- def add_simple_expectation(expectation)
421
- @simple_expectations << expectation
422
- @decollation_needed ||= decollation_needed_for_simple_expectation?(expectation)
423
- expectation
424
- end
425
-
426
- def add_exhaustive_expectation(expectation)
427
- @exhaustive_expectations << expectation
428
- expectation
429
- end
430
-
431
- def result_set
432
- @result_set ||= ResultSet.new(self)
433
- end
434
-
435
- def datasets_with_applied_simple_expectations
436
- dataset_1 = @dataset_1
437
- dataset_2 = @dataset_2
438
- @simple_expectations.each do |exp|
439
- dataset_1 = exp.apply_to(dataset_1, :lhs)
440
- dataset_2 = exp.apply_to(dataset_2, :rhs) if @linkage_type != :self
39
+ def method_missing(name, *args, &block)
40
+ klass = Comparator[name.to_s]
41
+ if klass.nil?
42
+ raise "unknown comparator: #{name}"
441
43
  end
442
- @linkage_type == :self ? [dataset_1, dataset_1] : [dataset_1, dataset_2]
443
- end
444
44
 
445
- def datasets_with_applied_exhaustive_expectations
446
- apply_exhaustive_expectations(@dataset_1, @dataset_2)
447
- end
448
-
449
- def apply_exhaustive_expectations(dataset_1, dataset_2)
450
- dataset_1 = dataset_1.select(dataset_1.field_set.primary_key.to_expr)
451
- dataset_2 = dataset_2.select(dataset_2.field_set.primary_key.to_expr)
452
- @exhaustive_expectations.each do |exp|
453
- dataset_1 = exp.apply_to(dataset_1, :lhs)
454
- dataset_2 = exp.apply_to(dataset_2, :rhs)
45
+ set_1 = args[0]
46
+ if set_1.is_a?(Array)
47
+ set_1 = fields_for(dataset_1, *set_1)
48
+ else
49
+ set_1 = fields_for(dataset_1, set_1).first
455
50
  end
456
- [dataset_1, dataset_2]
457
- end
458
-
459
- def groups_table_needed?
460
- has_simple_expectations?
461
- end
462
-
463
- def scores_table_needed?
464
- has_exhaustive_expectations?
465
- end
51
+ args[0] = set_1
466
52
 
467
- def has_simple_expectations?
468
- !@simple_expectations.empty?
469
- end
53
+ set_2 = args[1]
54
+ if set_2.is_a?(Array)
55
+ set_2 = fields_for(dataset_2 || dataset_1, *set_2)
56
+ else
57
+ set_2 = fields_for(dataset_2 || dataset_1, set_2).first
58
+ end
59
+ args[1] = set_2
470
60
 
471
- def has_exhaustive_expectations?
472
- !@exhaustive_expectations.empty?
61
+ comparator = klass.new(*args, &block)
62
+ @comparators << comparator
473
63
  end
474
64
 
475
- private
65
+ protected
476
66
 
477
- def decollation_needed_for_simple_expectation?(expectation)
478
- if expectation.decollation_needed?
479
- true
480
- elsif results_uri && expectation.kind != :filter
481
- result_set_database_type = ResultSet.new(self).database.database_type
482
- database_types_differ =
483
- result_set_database_type != dataset_1.database_type ||
484
- result_set_database_type != dataset_2.database_type
485
-
486
- merged_field = expectation.merged_field
487
- merged_field.ruby_type[:type] == String &&
488
- !merged_field.collation.nil? && database_types_differ
489
- else
490
- false
491
- end
67
+ def fields_for(dataset, *args)
68
+ field_set = dataset.field_set
69
+ args.collect { |name| field_set[name] }
492
70
  end
493
71
  end
494
72
  end