linkage 0.0.8 → 0.1.0.pre

Sign up to get free protection for your applications and to get access to all the features.
Files changed (105) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +1 -0
  3. data/.yardopts +1 -0
  4. data/Gemfile +1 -19
  5. data/Gemfile-java +3 -0
  6. data/README.markdown +88 -34
  7. data/Rakefile +16 -15
  8. data/TODO +4 -0
  9. data/lib/linkage/comparator.rb +139 -144
  10. data/lib/linkage/comparators/compare.rb +236 -29
  11. data/lib/linkage/comparators/strcompare.rb +85 -0
  12. data/lib/linkage/comparators/within.rb +24 -20
  13. data/lib/linkage/configuration.rb +44 -466
  14. data/lib/linkage/dataset.rb +28 -127
  15. data/lib/linkage/exceptions.rb +5 -0
  16. data/lib/linkage/field.rb +6 -37
  17. data/lib/linkage/field_set.rb +3 -3
  18. data/lib/linkage/match_recorder.rb +22 -0
  19. data/lib/linkage/match_set.rb +34 -0
  20. data/lib/linkage/match_sets/csv.rb +39 -0
  21. data/lib/linkage/match_sets/database.rb +45 -0
  22. data/lib/linkage/matcher.rb +30 -0
  23. data/lib/linkage/result_set.rb +25 -110
  24. data/lib/linkage/result_sets/csv.rb +54 -0
  25. data/lib/linkage/result_sets/database.rb +42 -0
  26. data/lib/linkage/runner.rb +57 -16
  27. data/lib/linkage/score_recorder.rb +30 -0
  28. data/lib/linkage/score_set.rb +49 -0
  29. data/lib/linkage/score_sets/csv.rb +64 -0
  30. data/lib/linkage/score_sets/database.rb +77 -0
  31. data/lib/linkage/version.rb +1 -1
  32. data/lib/linkage.rb +14 -17
  33. data/linkage.gemspec +13 -1
  34. data/linkage.gemspec-java +32 -0
  35. data/test/helper.rb +30 -23
  36. data/test/integration/test_cross_linkage.rb +46 -25
  37. data/test/integration/test_database_result_set.rb +55 -0
  38. data/test/integration/test_dual_linkage.rb +19 -94
  39. data/test/integration/test_self_linkage.rb +100 -203
  40. data/test/integration/test_within_comparator.rb +24 -77
  41. data/test/unit/comparators/test_compare.rb +254 -50
  42. data/test/unit/comparators/test_strcompare.rb +45 -0
  43. data/test/unit/comparators/test_within.rb +14 -26
  44. data/test/unit/match_sets/test_csv.rb +78 -0
  45. data/test/unit/match_sets/test_database.rb +63 -0
  46. data/test/unit/result_sets/test_csv.rb +111 -0
  47. data/test/unit/result_sets/test_database.rb +68 -0
  48. data/test/unit/score_sets/test_csv.rb +151 -0
  49. data/test/unit/score_sets/test_database.rb +149 -0
  50. data/test/unit/test_comparator.rb +46 -83
  51. data/test/unit/test_comparators.rb +4 -0
  52. data/test/unit/test_configuration.rb +99 -145
  53. data/test/unit/test_dataset.rb +52 -73
  54. data/test/unit/test_field.rb +4 -55
  55. data/test/unit/test_field_set.rb +6 -6
  56. data/test/unit/test_match_recorder.rb +23 -0
  57. data/test/unit/test_match_set.rb +23 -0
  58. data/test/unit/test_match_sets.rb +4 -0
  59. data/test/unit/test_matcher.rb +44 -0
  60. data/test/unit/test_result_set.rb +24 -223
  61. data/test/unit/test_result_sets.rb +4 -0
  62. data/test/unit/test_runner.rb +122 -17
  63. data/test/unit/test_runners.rb +4 -0
  64. data/test/unit/test_score_recorder.rb +25 -0
  65. data/test/unit/test_score_set.rb +37 -0
  66. data/test/unit/test_score_sets.rb +4 -0
  67. metadata +183 -90
  68. data/Gemfile.lock +0 -92
  69. data/lib/linkage/comparators/binary.rb +0 -12
  70. data/lib/linkage/data.rb +0 -175
  71. data/lib/linkage/decollation.rb +0 -93
  72. data/lib/linkage/expectation.rb +0 -21
  73. data/lib/linkage/expectations/exhaustive.rb +0 -63
  74. data/lib/linkage/expectations/simple.rb +0 -168
  75. data/lib/linkage/function.rb +0 -148
  76. data/lib/linkage/functions/binary.rb +0 -30
  77. data/lib/linkage/functions/cast.rb +0 -54
  78. data/lib/linkage/functions/length.rb +0 -29
  79. data/lib/linkage/functions/strftime.rb +0 -33
  80. data/lib/linkage/functions/trim.rb +0 -30
  81. data/lib/linkage/group.rb +0 -55
  82. data/lib/linkage/meta_object.rb +0 -139
  83. data/lib/linkage/runner/single_threaded.rb +0 -187
  84. data/lib/linkage/utils.rb +0 -164
  85. data/lib/linkage/warnings.rb +0 -5
  86. data/test/integration/test_collation.rb +0 -45
  87. data/test/integration/test_configuration.rb +0 -268
  88. data/test/integration/test_dataset.rb +0 -116
  89. data/test/integration/test_functions.rb +0 -88
  90. data/test/integration/test_result_set.rb +0 -85
  91. data/test/integration/test_scoring.rb +0 -84
  92. data/test/unit/expectations/test_exhaustive.rb +0 -111
  93. data/test/unit/expectations/test_simple.rb +0 -303
  94. data/test/unit/functions/test_binary.rb +0 -54
  95. data/test/unit/functions/test_cast.rb +0 -98
  96. data/test/unit/functions/test_length.rb +0 -52
  97. data/test/unit/functions/test_strftime.rb +0 -60
  98. data/test/unit/functions/test_trim.rb +0 -43
  99. data/test/unit/runner/test_single_threaded.rb +0 -12
  100. data/test/unit/test_data.rb +0 -445
  101. data/test/unit/test_decollation.rb +0 -201
  102. data/test/unit/test_function.rb +0 -233
  103. data/test/unit/test_group.rb +0 -38
  104. data/test/unit/test_meta_object.rb +0 -208
  105. data/test/unit/test_utils.rb +0 -341
@@ -2,28 +2,33 @@ module Linkage
2
2
  # Delegator around Sequel::Dataset with some extra functionality.
3
3
  class Dataset
4
4
  attr_reader :field_set, :table_name
5
- attr_accessor :linkage_options
6
5
 
7
6
  def initialize(*args)
7
+ if args.length == 0 || args.length > 3
8
+ raise ArgumentError, "wrong number of arguments (#{args.length} for 1..3)"
9
+ end
10
+
8
11
  if args.length == 1
12
+ unless args[0].kind_of?(Sequel::Dataset)
13
+ raise ArgumentError, "expected Sequel::Dataset, got #{args[0].class}"
14
+ end
15
+
9
16
  @dataset = args[0]
10
17
  @db = @dataset.db
11
18
  @table_name = @dataset.first_source_table
12
-
13
- if !@db.kind_of?(Sequel::Collation)
14
- @db.extend(Sequel::Collation)
15
- end
19
+ elsif args.length == 2 && args[0].kind_of?(Sequel::Database)
20
+ @db = args[0]
21
+ @table_name = args[1].to_sym
22
+ @dataset = @db[@table_name]
16
23
  else
17
- uri, table, options = args
24
+ uri, table_name, options = args
18
25
  options ||= {}
19
26
 
20
- @table_name = table.to_sym
21
27
  @db = Sequel.connect(uri, options)
22
- @db.extend(Sequel::Collation)
28
+ @table_name = table_name.to_sym
23
29
  @dataset = @db[@table_name]
24
30
  end
25
31
  @field_set = FieldSet.new(self)
26
- @linkage_options = {}
27
32
  end
28
33
 
29
34
  def obj
@@ -37,9 +42,12 @@ module Linkage
37
42
  # Setup a linkage with another dataset
38
43
  #
39
44
  # @return [Linkage::Configuration]
40
- def link_with(dataset, &block)
41
- conf = Configuration.new(self, dataset)
42
- conf.configure(&block)
45
+ def link_with(dataset, result_set)
46
+ other = dataset.eql?(self) ? nil : dataset
47
+ conf = Configuration.new(self, other, result_set)
48
+ if block_given?
49
+ yield conf
50
+ end
43
51
  conf
44
52
  end
45
53
 
@@ -47,132 +55,25 @@ module Linkage
47
55
  @db.database_type
48
56
  end
49
57
 
50
- # Set objects to use for group matching. Accepts either {Linkage::MetaObject} or a
51
- # hash with options (valid options are :meta_object, :alias, and :cast).
52
- #
53
- # @example
54
- # dataset.group_match(meta_object_1,
55
- # {:meta_object => meta_object_2, :alias => :foo})
56
- def group_match(*args)
57
- args.collect! do |arg|
58
- case arg
59
- when Linkage::MetaObject
60
- { :meta_object => arg }
61
- when Hash
62
- if !arg.has_key?(:meta_object)
63
- raise ArgumentError, "Invalid option hash, missing :meta_object key"
64
- end
65
- (arg.keys - [:meta_object, :alias, :cast]).each do |invalid_key|
66
- warn "Invalid key in option hash: #{invalid_key}"
67
- end
68
- arg
69
- else
70
- raise ArgumentError, "expected Hash or MetaObject, got #{arg.class}"
71
- end
72
- end
73
- clone(:group_match => args)
74
- end
75
-
76
- # Add additional objects to use for group matching.
77
- def group_match_more(*args)
78
- args = @linkage_options[:group_match] + args if @linkage_options[:group_match]
79
- group_match(*args)
80
- end
81
-
82
- def clone(new_options = {})
83
- new_linkage_options = {}
84
- new_obj_options = {}
85
- new_options.each_pair do |k, v|
86
- case k
87
- when :group_match
88
- new_linkage_options[k] = v
89
- else
90
- new_obj_options[k] = v
91
- end
92
- end
93
- new_obj = new_options[:new_obj]
94
-
95
- result = super()
96
- result.linkage_options = @linkage_options.merge(new_linkage_options)
97
-
98
- if new_obj
99
- result.obj = new_obj
100
- else
101
- result.obj = obj.clone(new_options)
102
- end
103
-
104
- result
105
- end
106
-
107
- def each_group(min = 2)
108
- group_match = @linkage_options[:group_match] || []
109
- ruby_types = group_match.inject({}) do |hsh, m|
110
- key = m[:alias] || m[:meta_object].to_expr
111
- hsh[key] = m[:meta_object].ruby_type
112
- hsh
113
- end
114
- options = {:database_type => database_type, :ruby_types => ruby_types }
115
- @dataset.group_and_count(*match_expressions).having{count >= min}.each do |row|
116
- count = row.delete(:count)
117
- group = Group.new(row, options.merge(:count => count))
118
- yield group
119
- end
120
- end
121
-
122
- def group_by_matches(raw = true)
123
- expr = raw ? raw_match_expressions : match_expressions
124
- group(*expr)
125
- end
126
-
127
- def dataset_for_group(group)
128
- filters = []
129
- group_match = @linkage_options[:group_match] || []
130
- group.values.each_pair do |key, value|
131
- # find a matched expression with this alias
132
- found = false
133
- group_match.each do |m|
134
- expr = m[:meta_object].to_expr
135
- if (m[:alias] && m[:alias] == key) || expr == key
136
- found = true
137
- filters << {expr => value}
138
- break
139
- end
140
- end
141
- if !found
142
- raise "this dataset isn't compatible with the given group"
143
- end
144
- end
145
- filter(*filters)
146
- end
147
-
148
58
  def schema
149
59
  @db.schema(@table_name)
150
60
  end
151
61
 
152
- private
153
-
154
- def raw_match_expressions
155
- group_match = @linkage_options[:group_match] || []
156
- group_match.collect { |m| m[:meta_object].to_expr }
62
+ def primary_key
63
+ @field_set.primary_key
157
64
  end
158
65
 
159
- def match_expressions
160
- group_match = @linkage_options[:group_match] || []
161
- group_match.collect do |m|
162
- expr = m[:meta_object].to_expr
163
- expr = expr.as(m[:alias]) if m[:alias]
164
- expr = expr.cast(m[:cast]) if m[:cast]
165
- expr
166
- end
167
- end
66
+ protected
168
67
 
169
68
  def method_missing(name, *args, &block)
170
69
  result = @dataset.send(name, *args, &block)
171
70
  if result.kind_of?(Sequel::Dataset)
172
- new_obj = result
173
- result = clone(:new_obj => result)
71
+ new_object = clone
72
+ new_object.obj = result
73
+ new_object
74
+ else
75
+ result
174
76
  end
175
- result
176
77
  end
177
78
  end
178
79
  end
@@ -0,0 +1,5 @@
1
+ module Linkage
2
+ class Error < Exception; end
3
+ class ExistsError < Error; end
4
+ class MissingError < Error; end
5
+ end
data/lib/linkage/field.rb CHANGED
@@ -1,17 +1,19 @@
1
1
  module Linkage
2
2
  # This class is for holding information about a particular field in a
3
3
  # dataset.
4
- class Field < Data
4
+ class Field
5
+ # @!attribute [r] name
6
+ # @return [Symbol] This object's name
7
+ attr_reader :name
8
+
5
9
  # @return [Symbol] This field's schema information
6
10
  attr_reader :schema
7
11
 
8
12
  # Create a new instance of Field.
9
13
  #
10
- # @param [Linkage::Dataset] dataset
11
14
  # @param [Symbol] name The field's name
12
15
  # @param [Hash] schema The field's schema information
13
- def initialize(dataset, name, schema)
14
- @dataset = dataset
16
+ def initialize(name, schema)
15
17
  @name = name
16
18
  @schema = schema
17
19
  end
@@ -63,7 +65,6 @@ module Linkage
63
65
  else
64
66
  {:type=>String}
65
67
  end
66
- hsh[:collate] = collation
67
68
 
68
69
  hsh.delete_if { |k, v| v.nil? }
69
70
  @ruby_type = {:type => hsh.delete(:type)}
@@ -72,40 +73,8 @@ module Linkage
72
73
  @ruby_type
73
74
  end
74
75
 
75
- def to_expr(options = {})
76
- @name
77
- end
78
-
79
- def static?
80
- false
81
- end
82
-
83
76
  def primary_key?
84
77
  schema && schema[:primary_key]
85
78
  end
86
-
87
- def collation
88
- schema[:collation]
89
- end
90
- end
91
-
92
- # A special field used for merging two {Data} objects together. It
93
- # has no dataset or schema.
94
- class MergeField < Field
95
- attr_reader :database_type
96
-
97
- # Create a new instance of MergeField.
98
- #
99
- # @param [Symbol] name The field's name
100
- # @param [Hash] ruby_type The field's schema information
101
- def initialize(name, ruby_type, database_type = nil)
102
- @name = name
103
- @ruby_type = ruby_type
104
- @database_type = database_type
105
- end
106
-
107
- def collation
108
- @ruby_type.has_key?(:opts) ? @ruby_type[:opts][:collate] : nil
109
- end
110
79
  end
111
80
  end
@@ -7,11 +7,11 @@ module Linkage
7
7
  # @param [Linkage::Dataset] dataset
8
8
  def initialize(dataset)
9
9
  dataset.schema.each do |(name, column_schema)|
10
- f = Field.new(dataset, name, column_schema)
11
- self[name] = f
10
+ field = Field.new(name, column_schema)
11
+ self[name] = field
12
12
 
13
13
  if @primary_key.nil? && column_schema[:primary_key]
14
- @primary_key = f
14
+ @primary_key = field
15
15
  end
16
16
  end
17
17
  end
@@ -0,0 +1,22 @@
1
+ module Linkage
2
+ class MatchRecorder
3
+ def initialize(matcher, match_set)
4
+ @matcher = matcher
5
+ @match_set = match_set
6
+ end
7
+
8
+ def start
9
+ @matcher.add_observer(self)
10
+ @match_set.open_for_writing
11
+ end
12
+
13
+ def update(id_1, id_2, score)
14
+ @match_set.add_match(id_1, id_2, score)
15
+ end
16
+
17
+ def stop
18
+ @match_set.close
19
+ @matcher.delete_observer(self)
20
+ end
21
+ end
22
+ end
@@ -0,0 +1,34 @@
1
+ module Linkage
2
+ class MatchSet
3
+ # Register a match set.
4
+ #
5
+ # @param [Class] klass
6
+ def self.register(name, klass)
7
+ methods = klass.instance_methods(false)
8
+ unless methods.include?(:add_match)
9
+ raise ArgumentError, "class must define #add_match"
10
+ end
11
+
12
+ @match_sets ||= {}
13
+ @match_sets[name] = klass
14
+ end
15
+
16
+ def self.[](name)
17
+ @match_sets ? @match_sets[name] : nil
18
+ end
19
+
20
+ def open_for_writing
21
+ end
22
+
23
+ # @abstract
24
+ def add_match(id_1, id_2, score)
25
+ raise NotImplementedError
26
+ end
27
+
28
+ def close
29
+ end
30
+ end
31
+ end
32
+
33
+ require 'linkage/match_sets/csv'
34
+ require 'linkage/match_sets/database'
@@ -0,0 +1,39 @@
1
+ require 'csv'
2
+
3
+ module Linkage
4
+ module MatchSets
5
+ class CSV < MatchSet
6
+ def initialize(filename, options = {})
7
+ @filename = filename
8
+ @overwrite = options[:overwrite]
9
+ end
10
+
11
+ def open_for_writing
12
+ return if @mode == :write
13
+
14
+ if !@overwrite && File.exist?(@filename)
15
+ raise ExistsError, "#{@filename} exists and not in overwrite mode"
16
+ end
17
+
18
+ @csv = ::CSV.open(@filename, 'wb')
19
+ @csv << %w{id_1 id_2 score}
20
+ @mode = :write
21
+ end
22
+
23
+ def add_match(id_1, id_2, score)
24
+ raise "not in write mode" if @mode != :write
25
+ if score.equal?(1.0) || score.equal?(0.0)
26
+ score = score.floor
27
+ end
28
+ @csv << [id_1, id_2, score]
29
+ end
30
+
31
+ def close
32
+ @mode = nil
33
+ @csv.close if @csv
34
+ end
35
+ end
36
+
37
+ MatchSet.register('csv', CSV)
38
+ end
39
+ end
@@ -0,0 +1,45 @@
1
+ module Linkage
2
+ module MatchSets
3
+ class Database < MatchSet
4
+ def initialize(database, options = {})
5
+ @database = database
6
+ @table_name = options[:table_name] || :matches
7
+ @overwrite = options[:overwrite]
8
+ end
9
+
10
+ def open_for_writing
11
+ return if @mode == :write
12
+
13
+ if @overwrite
14
+ @database.drop_table?(@table_name)
15
+ elsif @database.table_exists?(@table_name)
16
+ raise ExistsError, "#{@table_name} table exists and not in overwrite mode"
17
+ end
18
+
19
+ @database.create_table(@table_name) do
20
+ String :id_1
21
+ String :id_2
22
+ Float :score
23
+ end
24
+ @dataset = @database[@table_name]
25
+ @mode = :write
26
+ end
27
+
28
+ def add_match(id_1, id_2, score)
29
+ raise "not in write mode" if @mode != :write
30
+
31
+ @dataset.insert({
32
+ :id_1 => id_1,
33
+ :id_2 => id_2,
34
+ :score => score
35
+ })
36
+ end
37
+
38
+ def close
39
+ @mode = nil
40
+ end
41
+ end
42
+
43
+ MatchSet.register('database', Database)
44
+ end
45
+ end
@@ -0,0 +1,30 @@
1
+ module Linkage
2
+ class Matcher
3
+ include Observable
4
+
5
+ attr_reader :comparators, :score_set, :algorithm, :threshold
6
+
7
+ def initialize(comparators, score_set, algorithm, threshold)
8
+ @comparators = comparators
9
+ @score_set = score_set
10
+ @algorithm = algorithm
11
+ @threshold = threshold
12
+ end
13
+
14
+ def run
15
+ send(@algorithm)
16
+ end
17
+
18
+ private
19
+
20
+ def mean
21
+ @score_set.each_pair do |id_1, id_2, scores|
22
+ mean = scores.values.inject(:+) / @comparators.length.to_f
23
+ if mean >= @threshold
24
+ changed
25
+ notify_observers(id_1, id_2, mean)
26
+ end
27
+ end
28
+ end
29
+ end
30
+ end
@@ -1,125 +1,40 @@
1
1
  module Linkage
2
2
  class ResultSet
3
- def initialize(config)
4
- @config = config
5
- @next_group_id = 1
6
- @next_group_mutex = Mutex.new
7
- end
8
-
9
- def groups_dataset
10
- @groups_dataset ||= Dataset.new(database[:groups])
11
- end
12
-
13
- def database
14
- # FIXME: If the results database is the same as one of the datasets
15
- # being linked, there will be two connections to said database. This
16
- # could result in unexpected locking for non-concurrent databases (like
17
- # SQLite).
18
- @database ||= Sequel.connect(@config.results_uri, @config.results_uri_options)
19
- end
20
-
21
- def create_tables!
22
- if @config.groups_table_needed?
23
- schema = @config.groups_table_schema
24
- if @config.decollation_needed?
25
- database.create_table(@config.original_groups_table_name) do
26
- schema.each { |col| column(*col) }
27
- end
28
- end
29
-
30
- database.create_table(@config.groups_table_name) do
31
- schema.each { |col| column(*col) }
32
- end
33
- end
34
-
35
- if @config.scores_table_needed?
36
- schema = @config.scores_table_schema
37
- database.create_table(@config.scores_table_name) do
38
- schema.each { |col| column(*col) }
39
- end
40
- end
41
-
42
- schema = @config.matches_table_schema
43
- database.create_table(@config.matches_table_name) do
44
- schema.each { |col| column(*col) }
45
- end
46
- end
47
-
48
- def add_group(group, dataset_id = nil)
49
- if @config.decollation_needed?
50
- original_values = group.values
51
- values = group.decollated_values
52
- if !@groups_buffer
53
- groups_headers = [:id] + values.keys
54
- @groups_buffer = ImportBuffer.new(database[@config.groups_table_name],
55
- groups_headers)
56
-
57
- original_groups_headers = [:id] + original_values.keys
58
- @original_groups_buffer = ImportBuffer.new(
59
- database[@config.original_groups_table_name],
60
- original_groups_headers)
61
- end
62
-
63
- group_id = next_group_id
64
- @groups_buffer.add([group_id] + values.values)
65
- @original_groups_buffer.add([group_id] + original_values.values)
66
- else
67
- # Non-DRY for minute speed improvements
68
- values = group.values
69
- if !@groups_buffer
70
- groups_headers = [:id] + values.keys
71
- @groups_buffer = ImportBuffer.new(database[@config.groups_table_name],
72
- groups_headers)
73
- end
74
- group_id = next_group_id
75
- @groups_buffer.add([group_id] + values.values)
3
+ # Register a result set.
4
+ #
5
+ # @param [Class] klass
6
+ def self.register(name, klass)
7
+ methods = klass.instance_methods(false)
8
+ missing = []
9
+ unless methods.include?(:score_set)
10
+ missing.push("#score_set")
76
11
  end
77
- end
78
-
79
- def add_score(comparator_id, record_1_id, record_2_id, score)
80
- if !@scores_buffer
81
- scores_headers = [:comparator_id, :record_1_id, :record_2_id, :score]
82
- @scores_buffer = ImportBuffer.new(database[@config.scores_table_name],
83
- scores_headers)
12
+ unless methods.include?(:match_set)
13
+ missing.push("#match_set")
84
14
  end
85
- @scores_buffer.add([comparator_id, record_1_id, record_2_id, score])
86
- end
87
-
88
- def add_match(record_1_id, record_2_id, total_score)
89
- if !@matches_buffer
90
- matches_headers = [:record_1_id, :record_2_id, :total_score]
91
- @matches_buffer = ImportBuffer.new(database[@config.matches_table_name],
92
- matches_headers)
15
+ unless missing.empty?
16
+ raise ArgumentError, "class must define #{missing.join(" and ")}"
93
17
  end
94
- @matches_buffer.add([record_1_id, record_2_id, total_score])
95
- end
96
18
 
97
- def flush!
98
- @groups_buffer.flush if @groups_buffer
99
- @original_groups_buffer.flush if @original_groups_buffer
100
- @scores_buffer.flush if @scores_buffer
101
- @matches_buffer.flush if @matches_buffer
19
+ @result_set ||= {}
20
+ @result_set[name] = klass
102
21
  end
103
22
 
104
- def get_group(index)
105
- values = groups_dataset.order(:id).limit(1, index).first
106
- Group.from_row(values)
23
+ def self.[](name)
24
+ @result_set ? @result_set[name] : nil
107
25
  end
108
26
 
109
- def groups_records_datasets(group)
110
- datasets = @config.datasets_with_applied_simple_expectations
111
- datasets.collect! { |ds| ds.dataset_for_group(group) }
27
+ # @abstract
28
+ def score_set
29
+ raise NotImplementedError
112
30
  end
113
31
 
114
- private
115
-
116
- def next_group_id
117
- result = nil
118
- @next_group_mutex.synchronize do
119
- result = @next_group_id
120
- @next_group_id += 1
121
- end
122
- result
32
+ # @abstract
33
+ def match_set
34
+ raise NotImplementedError
123
35
  end
124
36
  end
125
37
  end
38
+
39
+ require 'linkage/result_sets/csv'
40
+ require 'linkage/result_sets/database'
@@ -0,0 +1,54 @@
1
+ module Linkage
2
+ module ResultSets
3
+ class CSV < ResultSet
4
+ def initialize(dir_or_options = nil)
5
+ opts =
6
+ case dir_or_options
7
+ when nil
8
+ {}
9
+ when String
10
+ {:dir => dir_or_options}
11
+ when Hash
12
+ dir_or_options
13
+ else
14
+ raise ArgumentError, "expected nil, a String, or a Hash, got #{dir_or_options.class}"
15
+ end
16
+
17
+ if opts[:dir]
18
+ opts[:dir] = File.expand_path(opts[:dir])
19
+ FileUtils.mkdir_p(opts[:dir])
20
+ end
21
+
22
+ @score_set_args = extract_args_for(:scores, opts)
23
+ @match_set_args = extract_args_for(:matches, opts)
24
+ end
25
+
26
+ def score_set
27
+ @score_set ||= ScoreSet['csv'].new(*@score_set_args)
28
+ end
29
+
30
+ def match_set
31
+ @match_set ||= MatchSet['csv'].new(*@match_set_args)
32
+ end
33
+
34
+ private
35
+
36
+ def extract_args_for(name, opts)
37
+ dir = opts[:dir] || '.'
38
+ opts = opts[name]
39
+
40
+ filename =
41
+ case opts
42
+ when Hash, nil
43
+ opts = opts ? opts.dup : {}
44
+ opts.delete(:filename) || "#{name}.csv"
45
+ when String
46
+ opts
47
+ end
48
+ [File.join(dir, filename), opts]
49
+ end
50
+ end
51
+
52
+ ResultSet.register('csv', CSV)
53
+ end
54
+ end