linkage 0.0.8 → 0.1.0.pre

Sign up to get free protection for your applications and to get access to all the features.
Files changed (105)
  1. checksums.yaml +7 -0
  2. data/.gitignore +1 -0
  3. data/.yardopts +1 -0
  4. data/Gemfile +1 -19
  5. data/Gemfile-java +3 -0
  6. data/README.markdown +88 -34
  7. data/Rakefile +16 -15
  8. data/TODO +4 -0
  9. data/lib/linkage/comparator.rb +139 -144
  10. data/lib/linkage/comparators/compare.rb +236 -29
  11. data/lib/linkage/comparators/strcompare.rb +85 -0
  12. data/lib/linkage/comparators/within.rb +24 -20
  13. data/lib/linkage/configuration.rb +44 -466
  14. data/lib/linkage/dataset.rb +28 -127
  15. data/lib/linkage/exceptions.rb +5 -0
  16. data/lib/linkage/field.rb +6 -37
  17. data/lib/linkage/field_set.rb +3 -3
  18. data/lib/linkage/match_recorder.rb +22 -0
  19. data/lib/linkage/match_set.rb +34 -0
  20. data/lib/linkage/match_sets/csv.rb +39 -0
  21. data/lib/linkage/match_sets/database.rb +45 -0
  22. data/lib/linkage/matcher.rb +30 -0
  23. data/lib/linkage/result_set.rb +25 -110
  24. data/lib/linkage/result_sets/csv.rb +54 -0
  25. data/lib/linkage/result_sets/database.rb +42 -0
  26. data/lib/linkage/runner.rb +57 -16
  27. data/lib/linkage/score_recorder.rb +30 -0
  28. data/lib/linkage/score_set.rb +49 -0
  29. data/lib/linkage/score_sets/csv.rb +64 -0
  30. data/lib/linkage/score_sets/database.rb +77 -0
  31. data/lib/linkage/version.rb +1 -1
  32. data/lib/linkage.rb +14 -17
  33. data/linkage.gemspec +13 -1
  34. data/linkage.gemspec-java +32 -0
  35. data/test/helper.rb +30 -23
  36. data/test/integration/test_cross_linkage.rb +46 -25
  37. data/test/integration/test_database_result_set.rb +55 -0
  38. data/test/integration/test_dual_linkage.rb +19 -94
  39. data/test/integration/test_self_linkage.rb +100 -203
  40. data/test/integration/test_within_comparator.rb +24 -77
  41. data/test/unit/comparators/test_compare.rb +254 -50
  42. data/test/unit/comparators/test_strcompare.rb +45 -0
  43. data/test/unit/comparators/test_within.rb +14 -26
  44. data/test/unit/match_sets/test_csv.rb +78 -0
  45. data/test/unit/match_sets/test_database.rb +63 -0
  46. data/test/unit/result_sets/test_csv.rb +111 -0
  47. data/test/unit/result_sets/test_database.rb +68 -0
  48. data/test/unit/score_sets/test_csv.rb +151 -0
  49. data/test/unit/score_sets/test_database.rb +149 -0
  50. data/test/unit/test_comparator.rb +46 -83
  51. data/test/unit/test_comparators.rb +4 -0
  52. data/test/unit/test_configuration.rb +99 -145
  53. data/test/unit/test_dataset.rb +52 -73
  54. data/test/unit/test_field.rb +4 -55
  55. data/test/unit/test_field_set.rb +6 -6
  56. data/test/unit/test_match_recorder.rb +23 -0
  57. data/test/unit/test_match_set.rb +23 -0
  58. data/test/unit/test_match_sets.rb +4 -0
  59. data/test/unit/test_matcher.rb +44 -0
  60. data/test/unit/test_result_set.rb +24 -223
  61. data/test/unit/test_result_sets.rb +4 -0
  62. data/test/unit/test_runner.rb +122 -17
  63. data/test/unit/test_runners.rb +4 -0
  64. data/test/unit/test_score_recorder.rb +25 -0
  65. data/test/unit/test_score_set.rb +37 -0
  66. data/test/unit/test_score_sets.rb +4 -0
  67. metadata +183 -90
  68. data/Gemfile.lock +0 -92
  69. data/lib/linkage/comparators/binary.rb +0 -12
  70. data/lib/linkage/data.rb +0 -175
  71. data/lib/linkage/decollation.rb +0 -93
  72. data/lib/linkage/expectation.rb +0 -21
  73. data/lib/linkage/expectations/exhaustive.rb +0 -63
  74. data/lib/linkage/expectations/simple.rb +0 -168
  75. data/lib/linkage/function.rb +0 -148
  76. data/lib/linkage/functions/binary.rb +0 -30
  77. data/lib/linkage/functions/cast.rb +0 -54
  78. data/lib/linkage/functions/length.rb +0 -29
  79. data/lib/linkage/functions/strftime.rb +0 -33
  80. data/lib/linkage/functions/trim.rb +0 -30
  81. data/lib/linkage/group.rb +0 -55
  82. data/lib/linkage/meta_object.rb +0 -139
  83. data/lib/linkage/runner/single_threaded.rb +0 -187
  84. data/lib/linkage/utils.rb +0 -164
  85. data/lib/linkage/warnings.rb +0 -5
  86. data/test/integration/test_collation.rb +0 -45
  87. data/test/integration/test_configuration.rb +0 -268
  88. data/test/integration/test_dataset.rb +0 -116
  89. data/test/integration/test_functions.rb +0 -88
  90. data/test/integration/test_result_set.rb +0 -85
  91. data/test/integration/test_scoring.rb +0 -84
  92. data/test/unit/expectations/test_exhaustive.rb +0 -111
  93. data/test/unit/expectations/test_simple.rb +0 -303
  94. data/test/unit/functions/test_binary.rb +0 -54
  95. data/test/unit/functions/test_cast.rb +0 -98
  96. data/test/unit/functions/test_length.rb +0 -52
  97. data/test/unit/functions/test_strftime.rb +0 -60
  98. data/test/unit/functions/test_trim.rb +0 -43
  99. data/test/unit/runner/test_single_threaded.rb +0 -12
  100. data/test/unit/test_data.rb +0 -445
  101. data/test/unit/test_decollation.rb +0 -201
  102. data/test/unit/test_function.rb +0 -233
  103. data/test/unit/test_group.rb +0 -38
  104. data/test/unit/test_meta_object.rb +0 -208
  105. data/test/unit/test_utils.rb +0 -341
@@ -1,494 +1,72 @@
1
1
  module Linkage
2
2
  class Configuration
3
- class DSL
4
- # Class for visually comparing matched records
5
- class VisualComparisonWrapper
6
- attr_reader :dsl, :lhs, :rhs
3
+ attr_reader :dataset_1, :dataset_2, :result_set, :comparators
4
+ attr_accessor :record_cache_size, :algorithm, :threshold
7
5
 
8
- def initialize(dsl, lhs, rhs)
9
- @dsl = dsl
10
- @lhs = lhs
11
- @rhs = rhs
12
-
13
- if @lhs.is_a?(DataWrapper) && @rhs.is_a?(DataWrapper)
14
- if @lhs.side == @rhs.side
15
- raise ArgumentError, "Can't visually compare two data sources on the same side"
16
- end
17
- else
18
- raise ArgumentError, "Must supply two data sources for visual comparison"
19
- end
20
-
21
- @dsl.add_visual_comparison(self)
22
- end
23
- end
24
-
25
- class ExpectationWrapper
26
- VALID_OPERATORS = [:==, :>, :<, :>=, :<=]
27
- OPERATOR_OPPOSITES = {
28
- :== => :'!=',
29
- :> => :<=,
30
- :<= => :>,
31
- :< => :>=,
32
- :>= => :<
33
- }
34
-
35
- def initialize(dsl, type, lhs, *args)
36
- @dsl = dsl
37
- @type = type
38
- @lhs = lhs
39
- end
40
-
41
- def compare_with(operator, rhs)
42
- # NOTE: lhs is always a DataWrapper
43
-
44
- if !rhs.is_a?(DataWrapper) || @lhs.static? || rhs.static? || @lhs.side == rhs.side
45
- @side = !@lhs.static? ? @lhs.side : rhs.side
46
-
47
- # If one of the objects in this comparison is a static function, we need to set the side
48
- # and the dataset based on the other object
49
- if rhs.is_a?(DataWrapper) && !rhs.static? && @lhs.is_a?(FunctionWrapper) && @lhs.static?
50
- @lhs.dataset = rhs.dataset
51
- @lhs.side = @side
52
- elsif @lhs.is_a?(DataWrapper) && !@lhs.static? && rhs.is_a?(FunctionWrapper) && rhs.static?
53
- rhs.dataset = @lhs.dataset
54
- rhs.side = @side
55
- end
56
- elsif rhs.is_a?(DataWrapper) && operator != :==
57
- # create an exhaustive expectation with the Compare comparator instead
58
- comparator = Comparators::Compare.new(@lhs.meta_object,
59
- MetaObject.new(operator.to_s), rhs.meta_object)
60
-
61
- score_range = Comparators::Compare.score_range
62
- threshold = @type == :must ? score_range.last : score_range.first
63
-
64
- expectation = Expectations::Exhaustive.new(comparator, threshold, :equal)
65
- @dsl.add_exhaustive_expectation(expectation)
66
- return self
67
- end
68
-
69
- exp_operator = @type == :must_not ? OPERATOR_OPPOSITES[operator] : operator
70
-
71
- rhs_meta_object = rhs.is_a?(DataWrapper) ? rhs.meta_object : MetaObject.new(rhs)
72
- @expectation = Expectations::Simple.create(@lhs.meta_object,
73
- rhs_meta_object, exp_operator)
74
- @dsl.add_simple_expectation(@expectation)
75
- self
76
- end
77
-
78
- VALID_OPERATORS.each do |operator|
79
- define_method(operator) do |rhs|
80
- compare_with(operator, rhs)
81
- end
82
- end
83
-
84
- def exactly
85
- if !@exact_match
86
- @expectation.exactly!
87
- end
88
- end
89
- end
90
-
91
- class DataWrapper
92
- attr_reader :meta_object
93
-
94
- def initialize
95
- raise NotImplementedError
96
- end
97
-
98
- [:must, :must_not].each do |type|
99
- define_method(type) do |*args|
100
- if args.length > 0
101
- wrapper = args[0]
102
- comparator = wrapper.to_comparator(self)
103
-
104
- score_range = wrapper.klass.score_range
105
- threshold = type == :must ? score_range.last : score_range.first
106
-
107
- expectation = Expectations::Exhaustive.new(comparator, threshold, :equal)
108
- @dsl.add_exhaustive_expectation(expectation)
109
- else
110
- ExpectationWrapper.new(@dsl, type, self)
111
- end
112
- end
113
- end
114
-
115
- def compare_with(other)
116
- VisualComparisonWrapper.new(@dsl, self, other)
117
- end
118
-
119
- def method_missing(m, *args, &block)
120
- if meta_object.respond_to?(m)
121
- meta_object.send(m, *args, &block)
122
- else
123
- super(m, *args, &block)
124
- end
125
- end
126
- end
127
-
128
- class FieldWrapper < DataWrapper
129
- attr_reader :name
130
-
131
- def initialize(dsl, side, dataset, name)
132
- @dsl = dsl
133
- @meta_object = MetaObject.new(dataset.field_set[name], side)
134
- end
135
- end
136
-
137
- class FunctionWrapper < DataWrapper
138
- def initialize(dsl, klass, args)
139
- @dsl = dsl
140
-
141
- side = dataset = nil
142
- static = true
143
- function_args = []
144
- args.each do |arg|
145
- if arg.kind_of?(DataWrapper)
146
- raise "conflicting sides" if side && side != arg.side
147
- side = arg.side
148
- static &&= arg.static?
149
- dataset = arg.dataset
150
- function_args << arg.object
151
- else
152
- function_args << arg
153
- end
154
- end
155
- @meta_object = MetaObject.new(klass.new(*function_args), side)
156
- end
6
+ def initialize(*args)
7
+ if args.length < 2 || args.length > 3
8
+ raise ArgumentError, "wrong number of arguments (#{args.length} for 3..4)"
157
9
  end
158
10
 
159
- class ComparatorWrapper
160
- attr_reader :klass, :args
161
-
162
- def initialize(dsl, klass, args)
163
- @dsl = dsl
164
- @klass = klass
165
- @args = args
166
- end
167
-
168
- def of(*args)
169
- @args.push(*args)
170
- self
171
- end
172
-
173
- def to_comparator(receiver)
174
- comparator_args = ([receiver] + @args).collect do |arg|
175
- arg.is_a?(DataWrapper) ? arg.meta_object : MetaObject.new(arg)
176
- end
177
- comparator = klass.new(*comparator_args)
178
- end
179
- end
180
-
181
- class DatasetWrapper
182
- attr_reader :dataset
183
-
184
- def initialize(dsl, side, dataset)
185
- @dsl = dsl
186
- @dataset = dataset
187
- @side = side
188
- end
189
-
190
- def [](field_name)
191
- if @dataset.field_set.has_key?(field_name)
192
- FieldWrapper.new(@dsl, @side, @dataset, field_name)
193
- else
194
- raise ArgumentError, "The '#{field_name}' field doesn't exist for the #{@side} dataset!"
195
- end
196
- end
197
- end
198
-
199
- def initialize(config, &block)
200
- @config = config
201
- @lhs_filters = []
202
- @rhs_filters = []
203
- instance_eval(&block)
204
- end
205
-
206
- def lhs
207
- DatasetWrapper.new(self, :lhs, @config.dataset_1)
208
- end
209
-
210
- def rhs
211
- DatasetWrapper.new(self, :rhs, @config.dataset_2)
212
- end
213
-
214
- def save_results_in(uri, options = {})
215
- @config.results_uri = uri
216
- @config.results_uri_options = options
217
- end
218
-
219
- def set_record_cache_size(num)
220
- @config.record_cache_size = num
221
- end
222
-
223
- def add_simple_expectation(expectation)
224
- @config.add_simple_expectation(expectation)
225
-
226
- if @config.linkage_type == :self
227
- case expectation.kind
228
- when :cross
229
- @config.linkage_type = :cross
230
- when :filter
231
- # If there are different filters on both 'sides' of a self-linkage,
232
- # it turns into a cross linkage.
233
- these_filters, other_filters =
234
- case expectation.side
235
- when :lhs
236
- [@lhs_filters, @rhs_filters]
237
- when :rhs
238
- [@rhs_filters, @lhs_filters]
239
- end
240
-
241
- these_filters << expectation
242
- other_filters.each do |other|
243
- if !expectation.same_except_side?(other)
244
- @config.linkage_type = :cross
245
- break
246
- end
247
- end
248
- end
249
- end
11
+ @dataset_1 = args[0]
12
+ if args.length > 2 && args[1]
13
+ @dataset_2 = args[1]
250
14
  end
15
+ @result_set = args[-1]
251
16
 
252
- def add_exhaustive_expectation(expectation)
253
- @config.add_exhaustive_expectation(expectation)
254
- if @config.linkage_type == :self
255
- @config.linkage_type = expectation.kind
256
- end
257
- end
258
-
259
- def add_visual_comparison(visual_comparison)
260
- @config.visual_comparisons << visual_comparison
261
- end
262
-
263
- def groups_table_name(new_name)
264
- @config.groups_table_name = new_name
265
- end
266
-
267
- def original_groups_table_name(new_name)
268
- @config.original_groups_table_name = new_name
269
- end
270
-
271
- def scores_table_name(new_name)
272
- @config.scores_table_name = new_name
273
- end
274
-
275
- def matches_table_name(new_name)
276
- @config.matches_table_name = new_name
277
- end
278
-
279
- def method_missing(name, *args, &block)
280
- # check for comparators
281
- md = name.to_s.match(/^be_(.+)$/)
282
- if md
283
- klass = Comparator[md[1]]
284
- if klass
285
- ComparatorWrapper.new(self, klass, args)
286
- else
287
- super
288
- end
289
- else
290
- # check for functions
291
- klass = Function[name.to_s]
292
- if klass
293
- FunctionWrapper.new(self, klass, args)
294
- else
295
- super
296
- end
297
- end
298
- end
299
- end
300
-
301
- attr_reader :dataset_1, :dataset_2, :simple_expectations,
302
- :exhaustive_expectations, :visual_comparisons
303
- attr_accessor :linkage_type, :results_uri, :results_uri_options,
304
- :record_cache_size, :groups_table_name, :original_groups_table_name,
305
- :scores_table_name, :matches_table_name
306
-
307
- def initialize(dataset_1, dataset_2)
308
- @dataset_1 = dataset_1
309
- @dataset_2 = dataset_2
310
- @linkage_type = dataset_1 == dataset_2 ? :self : :dual
311
- @simple_expectations = []
312
- @exhaustive_expectations = []
313
- @visual_comparisons = []
314
- @results_uri_options = {}
315
- @decollation_needed = false
17
+ @comparators = []
316
18
  @record_cache_size = 10_000
317
- @groups_table_name = :groups
318
- @original_groups_table_name = :original_groups
319
- @scores_table_name = :scores
320
- @matches_table_name = :matches
321
- end
322
-
323
- def configure(&block)
324
- DSL.new(self, &block)
325
19
  end
326
20
 
327
- def results_uri=(uri)
328
- @results_uri = uri
329
- if !@decollation_needed
330
- @simple_expectations.each do |expectation|
331
- if decollation_needed_for_simple_expectation?(expectation)
332
- @decollation_needed = true
333
- break
334
- end
335
- end
21
+ def score_recorder
22
+ pk_1 = @dataset_1.field_set.primary_key.name
23
+ if @dataset_2
24
+ pk_2 = @dataset_2.field_set.primary_key.name
25
+ else
26
+ pk_2 = pk_1
336
27
  end
337
- uri
338
- end
339
-
340
- def decollation_needed?
341
- @decollation_needed
28
+ ScoreRecorder.new(@comparators, @result_set.score_set, [pk_1, pk_2])
342
29
  end
343
30
 
344
- def groups_table_schema
345
- schema = []
346
-
347
- # add id
348
- schema << [:id, Integer, {:primary_key => true}]
349
-
350
- # add values
351
- @simple_expectations.each do |exp|
352
- next if exp.kind == :filter
353
-
354
- merged_field = exp.merged_field
355
- merged_type = merged_field.ruby_type
356
-
357
- # if the merged field's database type is different than the result
358
- # database, strip collation information
359
- result_db_type = nil
360
- result_set.database do |db|
361
- result_db_type = db.database_type
362
- end
363
- if merged_field.database_type != result_db_type && merged_type.has_key?(:opts)
364
- new_opts = merged_type[:opts].reject { |k, v| k == :collate }
365
- merged_type = merged_type.merge(:opts => new_opts)
366
- end
367
-
368
- col = [merged_field.name, merged_type[:type], merged_type[:opts] || {}]
369
- schema << col
370
- end
371
-
372
- schema
31
+ def matcher
32
+ Matcher.new(@comparators, @result_set.score_set, @algorithm || :mean, @threshold || 0.5)
373
33
  end
374
34
 
375
- def scores_table_schema
376
- schema = []
377
-
378
- # add id
379
- schema << [:id, Integer, {:primary_key => true}]
380
-
381
- # add comparator id
382
- schema << [:comparator_id, Integer, {}]
383
-
384
- # add record ids
385
- pk = dataset_1.field_set.primary_key
386
- ruby_type = pk.ruby_type
387
- schema << [:record_1_id, ruby_type[:type], ruby_type[:opts] || {}]
388
-
389
- pk = dataset_2.field_set.primary_key
390
- ruby_type = pk.ruby_type
391
- schema << [:record_2_id, ruby_type[:type], ruby_type[:opts] || {}]
392
-
393
- # add score
394
- schema << [:score, Integer, {}]
395
-
396
- schema
35
+ def match_recorder(matcher)
36
+ MatchRecorder.new(matcher, @result_set.match_set)
397
37
  end
398
38
 
399
- def matches_table_schema
400
- schema = []
401
-
402
- # add id
403
- schema << [:id, Integer, {:primary_key => true}]
404
-
405
- # add record ids
406
- pk = dataset_1.field_set.primary_key
407
- ruby_type = pk.ruby_type
408
- schema << [:record_1_id, ruby_type[:type], ruby_type[:opts] || {}]
409
-
410
- pk = dataset_2.field_set.primary_key
411
- ruby_type = pk.ruby_type
412
- schema << [:record_2_id, ruby_type[:type], ruby_type[:opts] || {}]
413
-
414
- # add score
415
- schema << [:total_score, Integer, {}]
416
-
417
- schema
418
- end
419
-
420
- def add_simple_expectation(expectation)
421
- @simple_expectations << expectation
422
- @decollation_needed ||= decollation_needed_for_simple_expectation?(expectation)
423
- expectation
424
- end
425
-
426
- def add_exhaustive_expectation(expectation)
427
- @exhaustive_expectations << expectation
428
- expectation
429
- end
430
-
431
- def result_set
432
- @result_set ||= ResultSet.new(self)
433
- end
434
-
435
- def datasets_with_applied_simple_expectations
436
- dataset_1 = @dataset_1
437
- dataset_2 = @dataset_2
438
- @simple_expectations.each do |exp|
439
- dataset_1 = exp.apply_to(dataset_1, :lhs)
440
- dataset_2 = exp.apply_to(dataset_2, :rhs) if @linkage_type != :self
39
+ def method_missing(name, *args, &block)
40
+ klass = Comparator[name.to_s]
41
+ if klass.nil?
42
+ raise "unknown comparator: #{name}"
441
43
  end
442
- @linkage_type == :self ? [dataset_1, dataset_1] : [dataset_1, dataset_2]
443
- end
444
44
 
445
- def datasets_with_applied_exhaustive_expectations
446
- apply_exhaustive_expectations(@dataset_1, @dataset_2)
447
- end
448
-
449
- def apply_exhaustive_expectations(dataset_1, dataset_2)
450
- dataset_1 = dataset_1.select(dataset_1.field_set.primary_key.to_expr)
451
- dataset_2 = dataset_2.select(dataset_2.field_set.primary_key.to_expr)
452
- @exhaustive_expectations.each do |exp|
453
- dataset_1 = exp.apply_to(dataset_1, :lhs)
454
- dataset_2 = exp.apply_to(dataset_2, :rhs)
45
+ set_1 = args[0]
46
+ if set_1.is_a?(Array)
47
+ set_1 = fields_for(dataset_1, *set_1)
48
+ else
49
+ set_1 = fields_for(dataset_1, set_1).first
455
50
  end
456
- [dataset_1, dataset_2]
457
- end
458
-
459
- def groups_table_needed?
460
- has_simple_expectations?
461
- end
462
-
463
- def scores_table_needed?
464
- has_exhaustive_expectations?
465
- end
51
+ args[0] = set_1
466
52
 
467
- def has_simple_expectations?
468
- !@simple_expectations.empty?
469
- end
53
+ set_2 = args[1]
54
+ if set_2.is_a?(Array)
55
+ set_2 = fields_for(dataset_2 || dataset_1, *set_2)
56
+ else
57
+ set_2 = fields_for(dataset_2 || dataset_1, set_2).first
58
+ end
59
+ args[1] = set_2
470
60
 
471
- def has_exhaustive_expectations?
472
- !@exhaustive_expectations.empty?
61
+ comparator = klass.new(*args, &block)
62
+ @comparators << comparator
473
63
  end
474
64
 
475
- private
65
+ protected
476
66
 
477
- def decollation_needed_for_simple_expectation?(expectation)
478
- if expectation.decollation_needed?
479
- true
480
- elsif results_uri && expectation.kind != :filter
481
- result_set_database_type = ResultSet.new(self).database.database_type
482
- database_types_differ =
483
- result_set_database_type != dataset_1.database_type ||
484
- result_set_database_type != dataset_2.database_type
485
-
486
- merged_field = expectation.merged_field
487
- merged_field.ruby_type[:type] == String &&
488
- !merged_field.collation.nil? && database_types_differ
489
- else
490
- false
491
- end
67
+ def fields_for(dataset, *args)
68
+ field_set = dataset.field_set
69
+ args.collect { |name| field_set[name] }
492
70
  end
493
71
  end
494
72
  end