linkage 0.0.8 → 0.1.0.pre

This diff shows the changes between two publicly released versions of this package, as they appear in the public registry to which they were published. It is provided for informational purposes only.
Files changed (105)
  1. checksums.yaml +7 -0
  2. data/.gitignore +1 -0
  3. data/.yardopts +1 -0
  4. data/Gemfile +1 -19
  5. data/Gemfile-java +3 -0
  6. data/README.markdown +88 -34
  7. data/Rakefile +16 -15
  8. data/TODO +4 -0
  9. data/lib/linkage/comparator.rb +139 -144
  10. data/lib/linkage/comparators/compare.rb +236 -29
  11. data/lib/linkage/comparators/strcompare.rb +85 -0
  12. data/lib/linkage/comparators/within.rb +24 -20
  13. data/lib/linkage/configuration.rb +44 -466
  14. data/lib/linkage/dataset.rb +28 -127
  15. data/lib/linkage/exceptions.rb +5 -0
  16. data/lib/linkage/field.rb +6 -37
  17. data/lib/linkage/field_set.rb +3 -3
  18. data/lib/linkage/match_recorder.rb +22 -0
  19. data/lib/linkage/match_set.rb +34 -0
  20. data/lib/linkage/match_sets/csv.rb +39 -0
  21. data/lib/linkage/match_sets/database.rb +45 -0
  22. data/lib/linkage/matcher.rb +30 -0
  23. data/lib/linkage/result_set.rb +25 -110
  24. data/lib/linkage/result_sets/csv.rb +54 -0
  25. data/lib/linkage/result_sets/database.rb +42 -0
  26. data/lib/linkage/runner.rb +57 -16
  27. data/lib/linkage/score_recorder.rb +30 -0
  28. data/lib/linkage/score_set.rb +49 -0
  29. data/lib/linkage/score_sets/csv.rb +64 -0
  30. data/lib/linkage/score_sets/database.rb +77 -0
  31. data/lib/linkage/version.rb +1 -1
  32. data/lib/linkage.rb +14 -17
  33. data/linkage.gemspec +13 -1
  34. data/linkage.gemspec-java +32 -0
  35. data/test/helper.rb +30 -23
  36. data/test/integration/test_cross_linkage.rb +46 -25
  37. data/test/integration/test_database_result_set.rb +55 -0
  38. data/test/integration/test_dual_linkage.rb +19 -94
  39. data/test/integration/test_self_linkage.rb +100 -203
  40. data/test/integration/test_within_comparator.rb +24 -77
  41. data/test/unit/comparators/test_compare.rb +254 -50
  42. data/test/unit/comparators/test_strcompare.rb +45 -0
  43. data/test/unit/comparators/test_within.rb +14 -26
  44. data/test/unit/match_sets/test_csv.rb +78 -0
  45. data/test/unit/match_sets/test_database.rb +63 -0
  46. data/test/unit/result_sets/test_csv.rb +111 -0
  47. data/test/unit/result_sets/test_database.rb +68 -0
  48. data/test/unit/score_sets/test_csv.rb +151 -0
  49. data/test/unit/score_sets/test_database.rb +149 -0
  50. data/test/unit/test_comparator.rb +46 -83
  51. data/test/unit/test_comparators.rb +4 -0
  52. data/test/unit/test_configuration.rb +99 -145
  53. data/test/unit/test_dataset.rb +52 -73
  54. data/test/unit/test_field.rb +4 -55
  55. data/test/unit/test_field_set.rb +6 -6
  56. data/test/unit/test_match_recorder.rb +23 -0
  57. data/test/unit/test_match_set.rb +23 -0
  58. data/test/unit/test_match_sets.rb +4 -0
  59. data/test/unit/test_matcher.rb +44 -0
  60. data/test/unit/test_result_set.rb +24 -223
  61. data/test/unit/test_result_sets.rb +4 -0
  62. data/test/unit/test_runner.rb +122 -17
  63. data/test/unit/test_runners.rb +4 -0
  64. data/test/unit/test_score_recorder.rb +25 -0
  65. data/test/unit/test_score_set.rb +37 -0
  66. data/test/unit/test_score_sets.rb +4 -0
  67. metadata +183 -90
  68. data/Gemfile.lock +0 -92
  69. data/lib/linkage/comparators/binary.rb +0 -12
  70. data/lib/linkage/data.rb +0 -175
  71. data/lib/linkage/decollation.rb +0 -93
  72. data/lib/linkage/expectation.rb +0 -21
  73. data/lib/linkage/expectations/exhaustive.rb +0 -63
  74. data/lib/linkage/expectations/simple.rb +0 -168
  75. data/lib/linkage/function.rb +0 -148
  76. data/lib/linkage/functions/binary.rb +0 -30
  77. data/lib/linkage/functions/cast.rb +0 -54
  78. data/lib/linkage/functions/length.rb +0 -29
  79. data/lib/linkage/functions/strftime.rb +0 -33
  80. data/lib/linkage/functions/trim.rb +0 -30
  81. data/lib/linkage/group.rb +0 -55
  82. data/lib/linkage/meta_object.rb +0 -139
  83. data/lib/linkage/runner/single_threaded.rb +0 -187
  84. data/lib/linkage/utils.rb +0 -164
  85. data/lib/linkage/warnings.rb +0 -5
  86. data/test/integration/test_collation.rb +0 -45
  87. data/test/integration/test_configuration.rb +0 -268
  88. data/test/integration/test_dataset.rb +0 -116
  89. data/test/integration/test_functions.rb +0 -88
  90. data/test/integration/test_result_set.rb +0 -85
  91. data/test/integration/test_scoring.rb +0 -84
  92. data/test/unit/expectations/test_exhaustive.rb +0 -111
  93. data/test/unit/expectations/test_simple.rb +0 -303
  94. data/test/unit/functions/test_binary.rb +0 -54
  95. data/test/unit/functions/test_cast.rb +0 -98
  96. data/test/unit/functions/test_length.rb +0 -52
  97. data/test/unit/functions/test_strftime.rb +0 -60
  98. data/test/unit/functions/test_trim.rb +0 -43
  99. data/test/unit/runner/test_single_threaded.rb +0 -12
  100. data/test/unit/test_data.rb +0 -445
  101. data/test/unit/test_decollation.rb +0 -201
  102. data/test/unit/test_function.rb +0 -233
  103. data/test/unit/test_group.rb +0 -38
  104. data/test/unit/test_meta_object.rb +0 -208
  105. data/test/unit/test_utils.rb +0 -341
@@ -2,28 +2,33 @@ module Linkage
2
2
  # Delegator around Sequel::Dataset with some extra functionality.
3
3
  class Dataset
4
4
  attr_reader :field_set, :table_name
5
- attr_accessor :linkage_options
6
5
 
7
6
  def initialize(*args)
7
+ if args.length == 0 || args.length > 3
8
+ raise ArgumentError, "wrong number of arguments (#{args.length} for 1..3)"
9
+ end
10
+
8
11
  if args.length == 1
12
+ unless args[0].kind_of?(Sequel::Dataset)
13
+ raise ArgumentError, "expected Sequel::Dataset, got #{args[0].class}"
14
+ end
15
+
9
16
  @dataset = args[0]
10
17
  @db = @dataset.db
11
18
  @table_name = @dataset.first_source_table
12
-
13
- if !@db.kind_of?(Sequel::Collation)
14
- @db.extend(Sequel::Collation)
15
- end
19
+ elsif args.length == 2 && args[0].kind_of?(Sequel::Database)
20
+ @db = args[0]
21
+ @table_name = args[1].to_sym
22
+ @dataset = @db[@table_name]
16
23
  else
17
- uri, table, options = args
24
+ uri, table_name, options = args
18
25
  options ||= {}
19
26
 
20
- @table_name = table.to_sym
21
27
  @db = Sequel.connect(uri, options)
22
- @db.extend(Sequel::Collation)
28
+ @table_name = table_name.to_sym
23
29
  @dataset = @db[@table_name]
24
30
  end
25
31
  @field_set = FieldSet.new(self)
26
- @linkage_options = {}
27
32
  end
28
33
 
29
34
  def obj
@@ -37,9 +42,12 @@ module Linkage
37
42
  # Setup a linkage with another dataset
38
43
  #
39
44
  # @return [Linkage::Configuration]
40
- def link_with(dataset, &block)
41
- conf = Configuration.new(self, dataset)
42
- conf.configure(&block)
45
+ def link_with(dataset, result_set)
46
+ other = dataset.eql?(self) ? nil : dataset
47
+ conf = Configuration.new(self, other, result_set)
48
+ if block_given?
49
+ yield conf
50
+ end
43
51
  conf
44
52
  end
45
53
 
@@ -47,132 +55,25 @@ module Linkage
47
55
  @db.database_type
48
56
  end
49
57
 
50
- # Set objects to use for group matching. Accepts either {Linkage::MetaObject} or a
51
- # hash with options (valid options are :meta_object, :alias, and :cast).
52
- #
53
- # @example
54
- # dataset.group_match(meta_object_1,
55
- # {:meta_object => meta_object_2, :alias => :foo})
56
- def group_match(*args)
57
- args.collect! do |arg|
58
- case arg
59
- when Linkage::MetaObject
60
- { :meta_object => arg }
61
- when Hash
62
- if !arg.has_key?(:meta_object)
63
- raise ArgumentError, "Invalid option hash, missing :meta_object key"
64
- end
65
- (arg.keys - [:meta_object, :alias, :cast]).each do |invalid_key|
66
- warn "Invalid key in option hash: #{invalid_key}"
67
- end
68
- arg
69
- else
70
- raise ArgumentError, "expected Hash or MetaObject, got #{arg.class}"
71
- end
72
- end
73
- clone(:group_match => args)
74
- end
75
-
76
- # Add additional objects to use for group matching.
77
- def group_match_more(*args)
78
- args = @linkage_options[:group_match] + args if @linkage_options[:group_match]
79
- group_match(*args)
80
- end
81
-
82
- def clone(new_options = {})
83
- new_linkage_options = {}
84
- new_obj_options = {}
85
- new_options.each_pair do |k, v|
86
- case k
87
- when :group_match
88
- new_linkage_options[k] = v
89
- else
90
- new_obj_options[k] = v
91
- end
92
- end
93
- new_obj = new_options[:new_obj]
94
-
95
- result = super()
96
- result.linkage_options = @linkage_options.merge(new_linkage_options)
97
-
98
- if new_obj
99
- result.obj = new_obj
100
- else
101
- result.obj = obj.clone(new_options)
102
- end
103
-
104
- result
105
- end
106
-
107
- def each_group(min = 2)
108
- group_match = @linkage_options[:group_match] || []
109
- ruby_types = group_match.inject({}) do |hsh, m|
110
- key = m[:alias] || m[:meta_object].to_expr
111
- hsh[key] = m[:meta_object].ruby_type
112
- hsh
113
- end
114
- options = {:database_type => database_type, :ruby_types => ruby_types }
115
- @dataset.group_and_count(*match_expressions).having{count >= min}.each do |row|
116
- count = row.delete(:count)
117
- group = Group.new(row, options.merge(:count => count))
118
- yield group
119
- end
120
- end
121
-
122
- def group_by_matches(raw = true)
123
- expr = raw ? raw_match_expressions : match_expressions
124
- group(*expr)
125
- end
126
-
127
- def dataset_for_group(group)
128
- filters = []
129
- group_match = @linkage_options[:group_match] || []
130
- group.values.each_pair do |key, value|
131
- # find a matched expression with this alias
132
- found = false
133
- group_match.each do |m|
134
- expr = m[:meta_object].to_expr
135
- if (m[:alias] && m[:alias] == key) || expr == key
136
- found = true
137
- filters << {expr => value}
138
- break
139
- end
140
- end
141
- if !found
142
- raise "this dataset isn't compatible with the given group"
143
- end
144
- end
145
- filter(*filters)
146
- end
147
-
148
58
  def schema
149
59
  @db.schema(@table_name)
150
60
  end
151
61
 
152
- private
153
-
154
- def raw_match_expressions
155
- group_match = @linkage_options[:group_match] || []
156
- group_match.collect { |m| m[:meta_object].to_expr }
62
+ def primary_key
63
+ @field_set.primary_key
157
64
  end
158
65
 
159
- def match_expressions
160
- group_match = @linkage_options[:group_match] || []
161
- group_match.collect do |m|
162
- expr = m[:meta_object].to_expr
163
- expr = expr.as(m[:alias]) if m[:alias]
164
- expr = expr.cast(m[:cast]) if m[:cast]
165
- expr
166
- end
167
- end
66
+ protected
168
67
 
169
68
  def method_missing(name, *args, &block)
170
69
  result = @dataset.send(name, *args, &block)
171
70
  if result.kind_of?(Sequel::Dataset)
172
- new_obj = result
173
- result = clone(:new_obj => result)
71
+ new_object = clone
72
+ new_object.obj = result
73
+ new_object
74
+ else
75
+ result
174
76
  end
175
- result
176
77
  end
177
78
  end
178
79
  end
@@ -0,0 +1,5 @@
1
+ module Linkage
2
+ class Error < Exception; end
3
+ class ExistsError < Error; end
4
+ class MissingError < Error; end
5
+ end
data/lib/linkage/field.rb CHANGED
@@ -1,17 +1,19 @@
1
1
  module Linkage
2
2
  # This class is for holding information about a particular field in a
3
3
  # dataset.
4
- class Field < Data
4
+ class Field
5
+ # @!attribute [r] name
6
+ # @return [Symbol] This object's name
7
+ attr_reader :name
8
+
5
9
  # @return [Symbol] This field's schema information
6
10
  attr_reader :schema
7
11
 
8
12
  # Create a new instance of Field.
9
13
  #
10
- # @param [Linkage::Dataset] dataset
11
14
  # @param [Symbol] name The field's name
12
15
  # @param [Hash] schema The field's schema information
13
- def initialize(dataset, name, schema)
14
- @dataset = dataset
16
+ def initialize(name, schema)
15
17
  @name = name
16
18
  @schema = schema
17
19
  end
@@ -63,7 +65,6 @@ module Linkage
63
65
  else
64
66
  {:type=>String}
65
67
  end
66
- hsh[:collate] = collation
67
68
 
68
69
  hsh.delete_if { |k, v| v.nil? }
69
70
  @ruby_type = {:type => hsh.delete(:type)}
@@ -72,40 +73,8 @@ module Linkage
72
73
  @ruby_type
73
74
  end
74
75
 
75
- def to_expr(options = {})
76
- @name
77
- end
78
-
79
- def static?
80
- false
81
- end
82
-
83
76
  def primary_key?
84
77
  schema && schema[:primary_key]
85
78
  end
86
-
87
- def collation
88
- schema[:collation]
89
- end
90
- end
91
-
92
- # A special field used for merging two {Data} objects together. It
93
- # has no dataset or schema.
94
- class MergeField < Field
95
- attr_reader :database_type
96
-
97
- # Create a new instance of MergeField.
98
- #
99
- # @param [Symbol] name The field's name
100
- # @param [Hash] ruby_type The field's schema information
101
- def initialize(name, ruby_type, database_type = nil)
102
- @name = name
103
- @ruby_type = ruby_type
104
- @database_type = database_type
105
- end
106
-
107
- def collation
108
- @ruby_type.has_key?(:opts) ? @ruby_type[:opts][:collate] : nil
109
- end
110
79
  end
111
80
  end
@@ -7,11 +7,11 @@ module Linkage
7
7
  # @param [Linkage::Dataset] dataset
8
8
  def initialize(dataset)
9
9
  dataset.schema.each do |(name, column_schema)|
10
- f = Field.new(dataset, name, column_schema)
11
- self[name] = f
10
+ field = Field.new(name, column_schema)
11
+ self[name] = field
12
12
 
13
13
  if @primary_key.nil? && column_schema[:primary_key]
14
- @primary_key = f
14
+ @primary_key = field
15
15
  end
16
16
  end
17
17
  end
@@ -0,0 +1,22 @@
1
+ module Linkage
2
+ class MatchRecorder
3
+ def initialize(matcher, match_set)
4
+ @matcher = matcher
5
+ @match_set = match_set
6
+ end
7
+
8
+ def start
9
+ @matcher.add_observer(self)
10
+ @match_set.open_for_writing
11
+ end
12
+
13
+ def update(id_1, id_2, score)
14
+ @match_set.add_match(id_1, id_2, score)
15
+ end
16
+
17
+ def stop
18
+ @match_set.close
19
+ @matcher.delete_observer(self)
20
+ end
21
+ end
22
+ end
@@ -0,0 +1,34 @@
1
+ module Linkage
2
+ class MatchSet
3
+ # Register a match set.
4
+ #
5
+ # @param [Class] klass
6
+ def self.register(name, klass)
7
+ methods = klass.instance_methods(false)
8
+ unless methods.include?(:add_match)
9
+ raise ArgumentError, "class must define #add_match"
10
+ end
11
+
12
+ @match_sets ||= {}
13
+ @match_sets[name] = klass
14
+ end
15
+
16
+ def self.[](name)
17
+ @match_sets ? @match_sets[name] : nil
18
+ end
19
+
20
+ def open_for_writing
21
+ end
22
+
23
+ # @abstract
24
+ def add_match(id_1, id_2, score)
25
+ raise NotImplementedError
26
+ end
27
+
28
+ def close
29
+ end
30
+ end
31
+ end
32
+
33
+ require 'linkage/match_sets/csv'
34
+ require 'linkage/match_sets/database'
@@ -0,0 +1,39 @@
1
+ require 'csv'
2
+
3
+ module Linkage
4
+ module MatchSets
5
+ class CSV < MatchSet
6
+ def initialize(filename, options = {})
7
+ @filename = filename
8
+ @overwrite = options[:overwrite]
9
+ end
10
+
11
+ def open_for_writing
12
+ return if @mode == :write
13
+
14
+ if !@overwrite && File.exist?(@filename)
15
+ raise ExistsError, "#{@filename} exists and not in overwrite mode"
16
+ end
17
+
18
+ @csv = ::CSV.open(@filename, 'wb')
19
+ @csv << %w{id_1 id_2 score}
20
+ @mode = :write
21
+ end
22
+
23
+ def add_match(id_1, id_2, score)
24
+ raise "not in write mode" if @mode != :write
25
+ if score.equal?(1.0) || score.equal?(0.0)
26
+ score = score.floor
27
+ end
28
+ @csv << [id_1, id_2, score]
29
+ end
30
+
31
+ def close
32
+ @mode = nil
33
+ @csv.close if @csv
34
+ end
35
+ end
36
+
37
+ MatchSet.register('csv', CSV)
38
+ end
39
+ end
@@ -0,0 +1,45 @@
1
+ module Linkage
2
+ module MatchSets
3
+ class Database < MatchSet
4
+ def initialize(database, options = {})
5
+ @database = database
6
+ @table_name = options[:table_name] || :matches
7
+ @overwrite = options[:overwrite]
8
+ end
9
+
10
+ def open_for_writing
11
+ return if @mode == :write
12
+
13
+ if @overwrite
14
+ @database.drop_table?(@table_name)
15
+ elsif @database.table_exists?(@table_name)
16
+ raise ExistsError, "#{@table_name} table exists and not in overwrite mode"
17
+ end
18
+
19
+ @database.create_table(@table_name) do
20
+ String :id_1
21
+ String :id_2
22
+ Float :score
23
+ end
24
+ @dataset = @database[@table_name]
25
+ @mode = :write
26
+ end
27
+
28
+ def add_match(id_1, id_2, score)
29
+ raise "not in write mode" if @mode != :write
30
+
31
+ @dataset.insert({
32
+ :id_1 => id_1,
33
+ :id_2 => id_2,
34
+ :score => score
35
+ })
36
+ end
37
+
38
+ def close
39
+ @mode = nil
40
+ end
41
+ end
42
+
43
+ MatchSet.register('database', Database)
44
+ end
45
+ end
@@ -0,0 +1,30 @@
1
+ module Linkage
2
+ class Matcher
3
+ include Observable
4
+
5
+ attr_reader :comparators, :score_set, :algorithm, :threshold
6
+
7
+ def initialize(comparators, score_set, algorithm, threshold)
8
+ @comparators = comparators
9
+ @score_set = score_set
10
+ @algorithm = algorithm
11
+ @threshold = threshold
12
+ end
13
+
14
+ def run
15
+ send(@algorithm)
16
+ end
17
+
18
+ private
19
+
20
+ def mean
21
+ @score_set.each_pair do |id_1, id_2, scores|
22
+ mean = scores.values.inject(:+) / @comparators.length.to_f
23
+ if mean >= @threshold
24
+ changed
25
+ notify_observers(id_1, id_2, mean)
26
+ end
27
+ end
28
+ end
29
+ end
30
+ end
@@ -1,125 +1,40 @@
1
1
  module Linkage
2
2
  class ResultSet
3
- def initialize(config)
4
- @config = config
5
- @next_group_id = 1
6
- @next_group_mutex = Mutex.new
7
- end
8
-
9
- def groups_dataset
10
- @groups_dataset ||= Dataset.new(database[:groups])
11
- end
12
-
13
- def database
14
- # FIXME: If the results database is the same as one of the datasets
15
- # being linked, there will be two connections to said database. This
16
- # could result in unexpected locking for non-concurrent databases (like
17
- # SQLite).
18
- @database ||= Sequel.connect(@config.results_uri, @config.results_uri_options)
19
- end
20
-
21
- def create_tables!
22
- if @config.groups_table_needed?
23
- schema = @config.groups_table_schema
24
- if @config.decollation_needed?
25
- database.create_table(@config.original_groups_table_name) do
26
- schema.each { |col| column(*col) }
27
- end
28
- end
29
-
30
- database.create_table(@config.groups_table_name) do
31
- schema.each { |col| column(*col) }
32
- end
33
- end
34
-
35
- if @config.scores_table_needed?
36
- schema = @config.scores_table_schema
37
- database.create_table(@config.scores_table_name) do
38
- schema.each { |col| column(*col) }
39
- end
40
- end
41
-
42
- schema = @config.matches_table_schema
43
- database.create_table(@config.matches_table_name) do
44
- schema.each { |col| column(*col) }
45
- end
46
- end
47
-
48
- def add_group(group, dataset_id = nil)
49
- if @config.decollation_needed?
50
- original_values = group.values
51
- values = group.decollated_values
52
- if !@groups_buffer
53
- groups_headers = [:id] + values.keys
54
- @groups_buffer = ImportBuffer.new(database[@config.groups_table_name],
55
- groups_headers)
56
-
57
- original_groups_headers = [:id] + original_values.keys
58
- @original_groups_buffer = ImportBuffer.new(
59
- database[@config.original_groups_table_name],
60
- original_groups_headers)
61
- end
62
-
63
- group_id = next_group_id
64
- @groups_buffer.add([group_id] + values.values)
65
- @original_groups_buffer.add([group_id] + original_values.values)
66
- else
67
- # Non-DRY for minute speed improvements
68
- values = group.values
69
- if !@groups_buffer
70
- groups_headers = [:id] + values.keys
71
- @groups_buffer = ImportBuffer.new(database[@config.groups_table_name],
72
- groups_headers)
73
- end
74
- group_id = next_group_id
75
- @groups_buffer.add([group_id] + values.values)
3
+ # Register a result set.
4
+ #
5
+ # @param [Class] klass
6
+ def self.register(name, klass)
7
+ methods = klass.instance_methods(false)
8
+ missing = []
9
+ unless methods.include?(:score_set)
10
+ missing.push("#score_set")
76
11
  end
77
- end
78
-
79
- def add_score(comparator_id, record_1_id, record_2_id, score)
80
- if !@scores_buffer
81
- scores_headers = [:comparator_id, :record_1_id, :record_2_id, :score]
82
- @scores_buffer = ImportBuffer.new(database[@config.scores_table_name],
83
- scores_headers)
12
+ unless methods.include?(:match_set)
13
+ missing.push("#match_set")
84
14
  end
85
- @scores_buffer.add([comparator_id, record_1_id, record_2_id, score])
86
- end
87
-
88
- def add_match(record_1_id, record_2_id, total_score)
89
- if !@matches_buffer
90
- matches_headers = [:record_1_id, :record_2_id, :total_score]
91
- @matches_buffer = ImportBuffer.new(database[@config.matches_table_name],
92
- matches_headers)
15
+ unless missing.empty?
16
+ raise ArgumentError, "class must define #{missing.join(" and ")}"
93
17
  end
94
- @matches_buffer.add([record_1_id, record_2_id, total_score])
95
- end
96
18
 
97
- def flush!
98
- @groups_buffer.flush if @groups_buffer
99
- @original_groups_buffer.flush if @original_groups_buffer
100
- @scores_buffer.flush if @scores_buffer
101
- @matches_buffer.flush if @matches_buffer
19
+ @result_set ||= {}
20
+ @result_set[name] = klass
102
21
  end
103
22
 
104
- def get_group(index)
105
- values = groups_dataset.order(:id).limit(1, index).first
106
- Group.from_row(values)
23
+ def self.[](name)
24
+ @result_set ? @result_set[name] : nil
107
25
  end
108
26
 
109
- def groups_records_datasets(group)
110
- datasets = @config.datasets_with_applied_simple_expectations
111
- datasets.collect! { |ds| ds.dataset_for_group(group) }
27
+ # @abstract
28
+ def score_set
29
+ raise NotImplementedError
112
30
  end
113
31
 
114
- private
115
-
116
- def next_group_id
117
- result = nil
118
- @next_group_mutex.synchronize do
119
- result = @next_group_id
120
- @next_group_id += 1
121
- end
122
- result
32
+ # @abstract
33
+ def match_set
34
+ raise NotImplementedError
123
35
  end
124
36
  end
125
37
  end
38
+
39
+ require 'linkage/result_sets/csv'
40
+ require 'linkage/result_sets/database'
@@ -0,0 +1,54 @@
1
+ module Linkage
2
+ module ResultSets
3
+ class CSV < ResultSet
4
+ def initialize(dir_or_options = nil)
5
+ opts =
6
+ case dir_or_options
7
+ when nil
8
+ {}
9
+ when String
10
+ {:dir => dir_or_options}
11
+ when Hash
12
+ dir_or_options
13
+ else
14
+ raise ArgumentError, "expected nil, a String, or a Hash, got #{dir_or_options.class}"
15
+ end
16
+
17
+ if opts[:dir]
18
+ opts[:dir] = File.expand_path(opts[:dir])
19
+ FileUtils.mkdir_p(opts[:dir])
20
+ end
21
+
22
+ @score_set_args = extract_args_for(:scores, opts)
23
+ @match_set_args = extract_args_for(:matches, opts)
24
+ end
25
+
26
+ def score_set
27
+ @score_set ||= ScoreSet['csv'].new(*@score_set_args)
28
+ end
29
+
30
+ def match_set
31
+ @match_set ||= MatchSet['csv'].new(*@match_set_args)
32
+ end
33
+
34
+ private
35
+
36
+ def extract_args_for(name, opts)
37
+ dir = opts[:dir] || '.'
38
+ opts = opts[name]
39
+
40
+ filename =
41
+ case opts
42
+ when Hash, nil
43
+ opts = opts ? opts.dup : {}
44
+ opts.delete(:filename) || "#{name}.csv"
45
+ when String
46
+ opts
47
+ end
48
+ [File.join(dir, filename), opts]
49
+ end
50
+ end
51
+
52
+ ResultSet.register('csv', CSV)
53
+ end
54
+ end