linkage 0.0.8 → 0.1.0.pre
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +1 -0
- data/.yardopts +1 -0
- data/Gemfile +1 -19
- data/Gemfile-java +3 -0
- data/README.markdown +88 -34
- data/Rakefile +16 -15
- data/TODO +4 -0
- data/lib/linkage/comparator.rb +139 -144
- data/lib/linkage/comparators/compare.rb +236 -29
- data/lib/linkage/comparators/strcompare.rb +85 -0
- data/lib/linkage/comparators/within.rb +24 -20
- data/lib/linkage/configuration.rb +44 -466
- data/lib/linkage/dataset.rb +28 -127
- data/lib/linkage/exceptions.rb +5 -0
- data/lib/linkage/field.rb +6 -37
- data/lib/linkage/field_set.rb +3 -3
- data/lib/linkage/match_recorder.rb +22 -0
- data/lib/linkage/match_set.rb +34 -0
- data/lib/linkage/match_sets/csv.rb +39 -0
- data/lib/linkage/match_sets/database.rb +45 -0
- data/lib/linkage/matcher.rb +30 -0
- data/lib/linkage/result_set.rb +25 -110
- data/lib/linkage/result_sets/csv.rb +54 -0
- data/lib/linkage/result_sets/database.rb +42 -0
- data/lib/linkage/runner.rb +57 -16
- data/lib/linkage/score_recorder.rb +30 -0
- data/lib/linkage/score_set.rb +49 -0
- data/lib/linkage/score_sets/csv.rb +64 -0
- data/lib/linkage/score_sets/database.rb +77 -0
- data/lib/linkage/version.rb +1 -1
- data/lib/linkage.rb +14 -17
- data/linkage.gemspec +13 -1
- data/linkage.gemspec-java +32 -0
- data/test/helper.rb +30 -23
- data/test/integration/test_cross_linkage.rb +46 -25
- data/test/integration/test_database_result_set.rb +55 -0
- data/test/integration/test_dual_linkage.rb +19 -94
- data/test/integration/test_self_linkage.rb +100 -203
- data/test/integration/test_within_comparator.rb +24 -77
- data/test/unit/comparators/test_compare.rb +254 -50
- data/test/unit/comparators/test_strcompare.rb +45 -0
- data/test/unit/comparators/test_within.rb +14 -26
- data/test/unit/match_sets/test_csv.rb +78 -0
- data/test/unit/match_sets/test_database.rb +63 -0
- data/test/unit/result_sets/test_csv.rb +111 -0
- data/test/unit/result_sets/test_database.rb +68 -0
- data/test/unit/score_sets/test_csv.rb +151 -0
- data/test/unit/score_sets/test_database.rb +149 -0
- data/test/unit/test_comparator.rb +46 -83
- data/test/unit/test_comparators.rb +4 -0
- data/test/unit/test_configuration.rb +99 -145
- data/test/unit/test_dataset.rb +52 -73
- data/test/unit/test_field.rb +4 -55
- data/test/unit/test_field_set.rb +6 -6
- data/test/unit/test_match_recorder.rb +23 -0
- data/test/unit/test_match_set.rb +23 -0
- data/test/unit/test_match_sets.rb +4 -0
- data/test/unit/test_matcher.rb +44 -0
- data/test/unit/test_result_set.rb +24 -223
- data/test/unit/test_result_sets.rb +4 -0
- data/test/unit/test_runner.rb +122 -17
- data/test/unit/test_runners.rb +4 -0
- data/test/unit/test_score_recorder.rb +25 -0
- data/test/unit/test_score_set.rb +37 -0
- data/test/unit/test_score_sets.rb +4 -0
- metadata +183 -90
- data/Gemfile.lock +0 -92
- data/lib/linkage/comparators/binary.rb +0 -12
- data/lib/linkage/data.rb +0 -175
- data/lib/linkage/decollation.rb +0 -93
- data/lib/linkage/expectation.rb +0 -21
- data/lib/linkage/expectations/exhaustive.rb +0 -63
- data/lib/linkage/expectations/simple.rb +0 -168
- data/lib/linkage/function.rb +0 -148
- data/lib/linkage/functions/binary.rb +0 -30
- data/lib/linkage/functions/cast.rb +0 -54
- data/lib/linkage/functions/length.rb +0 -29
- data/lib/linkage/functions/strftime.rb +0 -33
- data/lib/linkage/functions/trim.rb +0 -30
- data/lib/linkage/group.rb +0 -55
- data/lib/linkage/meta_object.rb +0 -139
- data/lib/linkage/runner/single_threaded.rb +0 -187
- data/lib/linkage/utils.rb +0 -164
- data/lib/linkage/warnings.rb +0 -5
- data/test/integration/test_collation.rb +0 -45
- data/test/integration/test_configuration.rb +0 -268
- data/test/integration/test_dataset.rb +0 -116
- data/test/integration/test_functions.rb +0 -88
- data/test/integration/test_result_set.rb +0 -85
- data/test/integration/test_scoring.rb +0 -84
- data/test/unit/expectations/test_exhaustive.rb +0 -111
- data/test/unit/expectations/test_simple.rb +0 -303
- data/test/unit/functions/test_binary.rb +0 -54
- data/test/unit/functions/test_cast.rb +0 -98
- data/test/unit/functions/test_length.rb +0 -52
- data/test/unit/functions/test_strftime.rb +0 -60
- data/test/unit/functions/test_trim.rb +0 -43
- data/test/unit/runner/test_single_threaded.rb +0 -12
- data/test/unit/test_data.rb +0 -445
- data/test/unit/test_decollation.rb +0 -201
- data/test/unit/test_function.rb +0 -233
- data/test/unit/test_group.rb +0 -38
- data/test/unit/test_meta_object.rb +0 -208
- data/test/unit/test_utils.rb +0 -341
data/lib/linkage/dataset.rb
CHANGED
@@ -2,28 +2,33 @@ module Linkage
|
|
2
2
|
# Delegator around Sequel::Dataset with some extra functionality.
|
3
3
|
class Dataset
|
4
4
|
attr_reader :field_set, :table_name
|
5
|
-
attr_accessor :linkage_options
|
6
5
|
|
7
6
|
def initialize(*args)
|
7
|
+
if args.length == 0 || args.length > 3
|
8
|
+
raise ArgumentError, "wrong number of arguments (#{args.length} for 1..3)"
|
9
|
+
end
|
10
|
+
|
8
11
|
if args.length == 1
|
12
|
+
unless args[0].kind_of?(Sequel::Dataset)
|
13
|
+
raise ArgumentError, "expected Sequel::Dataset, got #{args[0].class}"
|
14
|
+
end
|
15
|
+
|
9
16
|
@dataset = args[0]
|
10
17
|
@db = @dataset.db
|
11
18
|
@table_name = @dataset.first_source_table
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
19
|
+
elsif args.length == 2 && args[0].kind_of?(Sequel::Database)
|
20
|
+
@db = args[0]
|
21
|
+
@table_name = args[1].to_sym
|
22
|
+
@dataset = @db[@table_name]
|
16
23
|
else
|
17
|
-
uri,
|
24
|
+
uri, table_name, options = args
|
18
25
|
options ||= {}
|
19
26
|
|
20
|
-
@table_name = table.to_sym
|
21
27
|
@db = Sequel.connect(uri, options)
|
22
|
-
@
|
28
|
+
@table_name = table_name.to_sym
|
23
29
|
@dataset = @db[@table_name]
|
24
30
|
end
|
25
31
|
@field_set = FieldSet.new(self)
|
26
|
-
@linkage_options = {}
|
27
32
|
end
|
28
33
|
|
29
34
|
def obj
|
@@ -37,9 +42,12 @@ module Linkage
|
|
37
42
|
# Setup a linkage with another dataset
|
38
43
|
#
|
39
44
|
# @return [Linkage::Configuration]
|
40
|
-
def link_with(dataset,
|
41
|
-
|
42
|
-
conf.
|
45
|
+
def link_with(dataset, result_set)
|
46
|
+
other = dataset.eql?(self) ? nil : dataset
|
47
|
+
conf = Configuration.new(self, other, result_set)
|
48
|
+
if block_given?
|
49
|
+
yield conf
|
50
|
+
end
|
43
51
|
conf
|
44
52
|
end
|
45
53
|
|
@@ -47,132 +55,25 @@ module Linkage
|
|
47
55
|
@db.database_type
|
48
56
|
end
|
49
57
|
|
50
|
-
# Set objects to use for group matching. Accepts either {Linkage::MetaObject} or a
|
51
|
-
# hash with options (valid options are :meta_object, :alias, and :cast).
|
52
|
-
#
|
53
|
-
# @example
|
54
|
-
# dataset.group_match(meta_object_1,
|
55
|
-
# {:meta_object => meta_object_2, :alias => :foo})
|
56
|
-
def group_match(*args)
|
57
|
-
args.collect! do |arg|
|
58
|
-
case arg
|
59
|
-
when Linkage::MetaObject
|
60
|
-
{ :meta_object => arg }
|
61
|
-
when Hash
|
62
|
-
if !arg.has_key?(:meta_object)
|
63
|
-
raise ArgumentError, "Invalid option hash, missing :meta_object key"
|
64
|
-
end
|
65
|
-
(arg.keys - [:meta_object, :alias, :cast]).each do |invalid_key|
|
66
|
-
warn "Invalid key in option hash: #{invalid_key}"
|
67
|
-
end
|
68
|
-
arg
|
69
|
-
else
|
70
|
-
raise ArgumentError, "expected Hash or MetaObject, got #{arg.class}"
|
71
|
-
end
|
72
|
-
end
|
73
|
-
clone(:group_match => args)
|
74
|
-
end
|
75
|
-
|
76
|
-
# Add additional objects to use for group matching.
|
77
|
-
def group_match_more(*args)
|
78
|
-
args = @linkage_options[:group_match] + args if @linkage_options[:group_match]
|
79
|
-
group_match(*args)
|
80
|
-
end
|
81
|
-
|
82
|
-
def clone(new_options = {})
|
83
|
-
new_linkage_options = {}
|
84
|
-
new_obj_options = {}
|
85
|
-
new_options.each_pair do |k, v|
|
86
|
-
case k
|
87
|
-
when :group_match
|
88
|
-
new_linkage_options[k] = v
|
89
|
-
else
|
90
|
-
new_obj_options[k] = v
|
91
|
-
end
|
92
|
-
end
|
93
|
-
new_obj = new_options[:new_obj]
|
94
|
-
|
95
|
-
result = super()
|
96
|
-
result.linkage_options = @linkage_options.merge(new_linkage_options)
|
97
|
-
|
98
|
-
if new_obj
|
99
|
-
result.obj = new_obj
|
100
|
-
else
|
101
|
-
result.obj = obj.clone(new_options)
|
102
|
-
end
|
103
|
-
|
104
|
-
result
|
105
|
-
end
|
106
|
-
|
107
|
-
def each_group(min = 2)
|
108
|
-
group_match = @linkage_options[:group_match] || []
|
109
|
-
ruby_types = group_match.inject({}) do |hsh, m|
|
110
|
-
key = m[:alias] || m[:meta_object].to_expr
|
111
|
-
hsh[key] = m[:meta_object].ruby_type
|
112
|
-
hsh
|
113
|
-
end
|
114
|
-
options = {:database_type => database_type, :ruby_types => ruby_types }
|
115
|
-
@dataset.group_and_count(*match_expressions).having{count >= min}.each do |row|
|
116
|
-
count = row.delete(:count)
|
117
|
-
group = Group.new(row, options.merge(:count => count))
|
118
|
-
yield group
|
119
|
-
end
|
120
|
-
end
|
121
|
-
|
122
|
-
def group_by_matches(raw = true)
|
123
|
-
expr = raw ? raw_match_expressions : match_expressions
|
124
|
-
group(*expr)
|
125
|
-
end
|
126
|
-
|
127
|
-
def dataset_for_group(group)
|
128
|
-
filters = []
|
129
|
-
group_match = @linkage_options[:group_match] || []
|
130
|
-
group.values.each_pair do |key, value|
|
131
|
-
# find a matched expression with this alias
|
132
|
-
found = false
|
133
|
-
group_match.each do |m|
|
134
|
-
expr = m[:meta_object].to_expr
|
135
|
-
if (m[:alias] && m[:alias] == key) || expr == key
|
136
|
-
found = true
|
137
|
-
filters << {expr => value}
|
138
|
-
break
|
139
|
-
end
|
140
|
-
end
|
141
|
-
if !found
|
142
|
-
raise "this dataset isn't compatible with the given group"
|
143
|
-
end
|
144
|
-
end
|
145
|
-
filter(*filters)
|
146
|
-
end
|
147
|
-
|
148
58
|
def schema
|
149
59
|
@db.schema(@table_name)
|
150
60
|
end
|
151
61
|
|
152
|
-
|
153
|
-
|
154
|
-
def raw_match_expressions
|
155
|
-
group_match = @linkage_options[:group_match] || []
|
156
|
-
group_match.collect { |m| m[:meta_object].to_expr }
|
62
|
+
def primary_key
|
63
|
+
@field_set.primary_key
|
157
64
|
end
|
158
65
|
|
159
|
-
|
160
|
-
group_match = @linkage_options[:group_match] || []
|
161
|
-
group_match.collect do |m|
|
162
|
-
expr = m[:meta_object].to_expr
|
163
|
-
expr = expr.as(m[:alias]) if m[:alias]
|
164
|
-
expr = expr.cast(m[:cast]) if m[:cast]
|
165
|
-
expr
|
166
|
-
end
|
167
|
-
end
|
66
|
+
protected
|
168
67
|
|
169
68
|
def method_missing(name, *args, &block)
|
170
69
|
result = @dataset.send(name, *args, &block)
|
171
70
|
if result.kind_of?(Sequel::Dataset)
|
172
|
-
|
173
|
-
|
71
|
+
new_object = clone
|
72
|
+
new_object.obj = result
|
73
|
+
new_object
|
74
|
+
else
|
75
|
+
result
|
174
76
|
end
|
175
|
-
result
|
176
77
|
end
|
177
78
|
end
|
178
79
|
end
|
data/lib/linkage/field.rb
CHANGED
@@ -1,17 +1,19 @@
|
|
1
1
|
module Linkage
|
2
2
|
# This class is for holding information about a particular field in a
|
3
3
|
# dataset.
|
4
|
-
class Field
|
4
|
+
class Field
|
5
|
+
# @!attribute [r] name
|
6
|
+
# @return [Symbol] This object's name
|
7
|
+
attr_reader :name
|
8
|
+
|
5
9
|
# @return [Hash] This field's schema information
|
6
10
|
attr_reader :schema
|
7
11
|
|
8
12
|
# Create a new instance of Field.
|
9
13
|
#
|
10
|
-
# @param [Linkage::Dataset] dataset
|
11
14
|
# @param [Symbol] name The field's name
|
12
15
|
# @param [Hash] schema The field's schema information
|
13
|
-
def initialize(
|
14
|
-
@dataset = dataset
|
16
|
+
def initialize(name, schema)
|
15
17
|
@name = name
|
16
18
|
@schema = schema
|
17
19
|
end
|
@@ -63,7 +65,6 @@ module Linkage
|
|
63
65
|
else
|
64
66
|
{:type=>String}
|
65
67
|
end
|
66
|
-
hsh[:collate] = collation
|
67
68
|
|
68
69
|
hsh.delete_if { |k, v| v.nil? }
|
69
70
|
@ruby_type = {:type => hsh.delete(:type)}
|
@@ -72,40 +73,8 @@ module Linkage
|
|
72
73
|
@ruby_type
|
73
74
|
end
|
74
75
|
|
75
|
-
def to_expr(options = {})
|
76
|
-
@name
|
77
|
-
end
|
78
|
-
|
79
|
-
def static?
|
80
|
-
false
|
81
|
-
end
|
82
|
-
|
83
76
|
def primary_key?
|
84
77
|
schema && schema[:primary_key]
|
85
78
|
end
|
86
|
-
|
87
|
-
def collation
|
88
|
-
schema[:collation]
|
89
|
-
end
|
90
|
-
end
|
91
|
-
|
92
|
-
# A special field used for merging two {Data} objects together. It
|
93
|
-
# has no dataset or schema.
|
94
|
-
class MergeField < Field
|
95
|
-
attr_reader :database_type
|
96
|
-
|
97
|
-
# Create a new instance of MergeField.
|
98
|
-
#
|
99
|
-
# @param [Symbol] name The field's name
|
100
|
-
# @param [Hash] ruby_type The field's schema information
|
101
|
-
def initialize(name, ruby_type, database_type = nil)
|
102
|
-
@name = name
|
103
|
-
@ruby_type = ruby_type
|
104
|
-
@database_type = database_type
|
105
|
-
end
|
106
|
-
|
107
|
-
def collation
|
108
|
-
@ruby_type.has_key?(:opts) ? @ruby_type[:opts][:collate] : nil
|
109
|
-
end
|
110
79
|
end
|
111
80
|
end
|
data/lib/linkage/field_set.rb
CHANGED
@@ -7,11 +7,11 @@ module Linkage
|
|
7
7
|
# @param [Linkage::Dataset] dataset
|
8
8
|
def initialize(dataset)
|
9
9
|
dataset.schema.each do |(name, column_schema)|
|
10
|
-
|
11
|
-
self[name] =
|
10
|
+
field = Field.new(name, column_schema)
|
11
|
+
self[name] = field
|
12
12
|
|
13
13
|
if @primary_key.nil? && column_schema[:primary_key]
|
14
|
-
@primary_key =
|
14
|
+
@primary_key = field
|
15
15
|
end
|
16
16
|
end
|
17
17
|
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
module Linkage
  # Relays matches announced by a matcher into a match set.
  #
  # The recorder subscribes itself to the matcher with +add_observer+; every
  # notification it receives through {#update} is written to the match set
  # with +add_match+.
  class MatchRecorder
    # @param matcher [#add_observer, #delete_observer] object that announces
    #   matches to its observers
    # @param match_set [#open_for_writing, #add_match, #close] destination
    #   the matches are written to
    def initialize(matcher, match_set)
      @matcher = matcher
      @match_set = match_set
    end

    # Open the match set and begin listening to the matcher.
    #
    # The match set is opened first so that a failure to open it (whatever
    # +open_for_writing+ raises) does not leave this recorder subscribed to
    # the matcher with nowhere to write.
    def start
      @match_set.open_for_writing
      @matcher.add_observer(self)
    end

    # Observer callback: store a single match.
    #
    # @param id_1 identifier of the first record
    # @param id_2 identifier of the second record
    # @param score score of the match
    def update(id_1, id_2, score)
      @match_set.add_match(id_1, id_2, score)
    end

    # Stop listening to the matcher and close the match set.
    #
    # The recorder unsubscribes before closing so no notification can reach a
    # closed match set; the +ensure+ makes sure the match set is closed even
    # if unsubscribing raises.
    def stop
      @matcher.delete_observer(self)
    ensure
      @match_set.close
    end
  end
end
|
@@ -0,0 +1,34 @@
|
|
1
|
+
module Linkage
  # Base class for match sets, plus a registry that maps names to match set
  # classes.
  #
  # Subclasses are expected to override {#add_match}; {#open_for_writing} and
  # {#close} are no-ops here and may be overridden as needed.
  class MatchSet
    # Register a match set.
    #
    # The class is accepted when it has a concrete +#add_match+, i.e. one that
    # is not the abstract implementation defined on {MatchSet}. The method may
    # be defined on the class itself or inherited from an intermediate
    # ancestor, so a subclass of an already valid match set can be registered
    # without redefining +#add_match+.
    #
    # @param [Object] name key used to look the class up with {MatchSet.[]}
    # @param [Class] klass
    # @raise [ArgumentError] if klass has no concrete +#add_match+
    def self.register(name, klass)
      concrete = klass.method_defined?(:add_match) &&
                 klass.instance_method(:add_match).owner != MatchSet
      unless concrete
        raise ArgumentError, "class must define #add_match"
      end

      @match_sets ||= {}
      @match_sets[name] = klass
    end

    # Look up a registered match set class.
    #
    # @param [Object] name key the class was registered under
    # @return [Class, nil] the registered class, or nil when nothing has been
    #   registered under that name
    def self.[](name)
      @match_sets ? @match_sets[name] : nil
    end

    # Prepare the match set for writing. Does nothing by default.
    def open_for_writing
    end

    # @abstract
    def add_match(id_1, id_2, score)
      raise NotImplementedError
    end

    # Finish writing. Does nothing by default.
    def close
    end
  end
end
|
32
|
+
|
33
|
+
require 'linkage/match_sets/csv'
|
34
|
+
require 'linkage/match_sets/database'
|
@@ -0,0 +1,39 @@
|
|
1
|
+
require 'csv'
|
2
|
+
|
3
|
+
module Linkage
  module MatchSets
    # Match set that writes matches to a CSV file with the columns
    # +id_1+, +id_2+ and +score+.
    class CSV < MatchSet
      # @param [String] filename path of the CSV file to write
      # @param [Hash] options
      # @option options [Boolean] :overwrite when truthy, an existing file is
      #   replaced instead of raising in {#open_for_writing}
      def initialize(filename, options = {})
        @filename = filename
        @overwrite = options[:overwrite]
      end

      # Open the file and write the header row. Does nothing when the set is
      # already open for writing.
      #
      # @raise [ExistsError] if the file exists and :overwrite was not given
      def open_for_writing
        return if @mode == :write

        if !@overwrite && File.exist?(@filename)
          raise ExistsError, "#{@filename} exists and not in overwrite mode"
        end

        @csv = ::CSV.open(@filename, 'wb')
        @csv << %w{id_1 id_2 score}
        @mode = :write
      end

      # Append one match to the file.
      #
      # Scores equal to 1.0 or 0.0 are written as the integers 1 and 0.
      #
      # @raise [RuntimeError] if the set is not open for writing
      def add_match(id_1, id_2, score)
        raise "not in write mode" if @mode != :write

        # Compare by value: #equal? tests object identity, and whether two
        # equal Floats are the same object depends on the Ruby implementation.
        if score == 1.0 || score == 0.0
          score = score.floor
        end
        @csv << [id_1, id_2, score]
      end

      # Leave write mode and close the underlying file if one was opened.
      def close
        @mode = nil
        @csv.close if @csv
      end
    end

    MatchSet.register('csv', CSV)
  end
end
|
@@ -0,0 +1,45 @@
|
|
1
|
+
module Linkage
  module MatchSets
    # Match set that stores matches as rows (+id_1+, +id_2+, +score+) of a
    # database table.
    class Database < MatchSet
      # @param database connection object; it must respond to +drop_table?+,
      #   +table_exists?+, +create_table+ and +[]+ (presumably a
      #   Sequel::Database -- confirm against callers)
      # @param [Hash] options
      # @option options [Symbol] :table_name table to write to (default
      #   +:matches+)
      # @option options [Boolean] :overwrite when truthy, an existing table is
      #   dropped instead of raising in {#open_for_writing}
      def initialize(database, options = {})
        @database = database
        @table_name = options[:table_name] || :matches
        @overwrite = options[:overwrite]
      end

      # Create the table and enter write mode. Does nothing when the set is
      # already open for writing.
      #
      # @raise [ExistsError] if the table exists and :overwrite was not given
      def open_for_writing
        return if @mode == :write

        @database.drop_table?(@table_name) if @overwrite
        if !@overwrite && @database.table_exists?(@table_name)
          raise ExistsError, "#{@table_name} table exists and not in overwrite mode"
        end

        @database.create_table(@table_name) do
          String :id_1
          String :id_2
          Float :score
        end
        @dataset = @database[@table_name]
        @mode = :write
      end

      # Insert one match into the table.
      #
      # @raise [RuntimeError] if the set is not open for writing
      def add_match(id_1, id_2, score)
        unless @mode == :write
          raise "not in write mode"
        end

        row = { :id_1 => id_1, :id_2 => id_2, :score => score }
        @dataset.insert(row)
      end

      # Leave write mode.
      def close
        @mode = nil
      end
    end

    MatchSet.register('database', Database)
  end
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
require 'observer'

module Linkage
  # Reads scores from a score set, combines them with the configured
  # algorithm and notifies observers about every pair whose combined score
  # reaches the threshold.
  #
  # Observers receive +update(id_1, id_2, score)+.
  class Matcher
    include Observable

    attr_reader :comparators, :score_set, :algorithm, :threshold

    # @param comparators [#length] comparators that produced the scores; only
    #   their number is used here
    # @param score_set [#each_pair] yields +id_1+, +id_2+ and a hash of scores
    #   for each pair
    # @param algorithm [Symbol, String] name of the matching algorithm; must
    #   name a private method of this class (or a subclass), e.g. +:mean+
    # @param threshold [Numeric] minimum combined score for a match
    def initialize(comparators, score_set, algorithm, threshold)
      @comparators = comparators
      @score_set = score_set
      @algorithm = algorithm
      @threshold = threshold
    end

    # Run the configured algorithm.
    #
    # @raise [NoMethodError] if the algorithm is not a private method defined
    #   by Matcher or one of its subclasses. NoMethodError is kept because it
    #   is what an unknown algorithm name raised before this check existed.
    def run
      name = @algorithm.to_s
      unless algorithm?(name)
        raise NoMethodError, "unknown matching algorithm: #{@algorithm.inspect}"
      end
      send(name)
    end

    private

    # Whether +name+ is an algorithm implemented by this class hierarchy.
    # Restricting dispatch to methods owned by Matcher (or a subclass) keeps
    # {#run} from invoking unrelated private methods such as Kernel#exit.
    def algorithm?(name)
      klass = self.class
      klass.private_method_defined?(name) &&
        klass.instance_method(name).owner <= Matcher
    end

    # Notify observers of every pair whose mean score is at least the
    # threshold. The sum of the available scores is divided by the number of
    # comparators, not by the number of scores present for the pair.
    def mean
      divisor = @comparators.length.to_f
      @score_set.each_pair do |id_1, id_2, scores|
        # Start from 0 so a pair without any scores sums to 0 instead of nil.
        average = scores.values.inject(0, :+) / divisor
        next unless average >= @threshold

        changed
        notify_observers(id_1, id_2, average)
      end
    end
  end
end
|
data/lib/linkage/result_set.rb
CHANGED
@@ -1,125 +1,40 @@
|
|
1
1
|
module Linkage
|
2
2
|
class ResultSet
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
end
|
12
|
-
|
13
|
-
def database
|
14
|
-
# FIXME: If the results database is the same as one of the datasets
|
15
|
-
# being linked, there will be two connections to said database. This
|
16
|
-
# could result in unexpected locking for non-concurrent databases (like
|
17
|
-
# SQLite).
|
18
|
-
@database ||= Sequel.connect(@config.results_uri, @config.results_uri_options)
|
19
|
-
end
|
20
|
-
|
21
|
-
def create_tables!
|
22
|
-
if @config.groups_table_needed?
|
23
|
-
schema = @config.groups_table_schema
|
24
|
-
if @config.decollation_needed?
|
25
|
-
database.create_table(@config.original_groups_table_name) do
|
26
|
-
schema.each { |col| column(*col) }
|
27
|
-
end
|
28
|
-
end
|
29
|
-
|
30
|
-
database.create_table(@config.groups_table_name) do
|
31
|
-
schema.each { |col| column(*col) }
|
32
|
-
end
|
33
|
-
end
|
34
|
-
|
35
|
-
if @config.scores_table_needed?
|
36
|
-
schema = @config.scores_table_schema
|
37
|
-
database.create_table(@config.scores_table_name) do
|
38
|
-
schema.each { |col| column(*col) }
|
39
|
-
end
|
40
|
-
end
|
41
|
-
|
42
|
-
schema = @config.matches_table_schema
|
43
|
-
database.create_table(@config.matches_table_name) do
|
44
|
-
schema.each { |col| column(*col) }
|
45
|
-
end
|
46
|
-
end
|
47
|
-
|
48
|
-
def add_group(group, dataset_id = nil)
|
49
|
-
if @config.decollation_needed?
|
50
|
-
original_values = group.values
|
51
|
-
values = group.decollated_values
|
52
|
-
if !@groups_buffer
|
53
|
-
groups_headers = [:id] + values.keys
|
54
|
-
@groups_buffer = ImportBuffer.new(database[@config.groups_table_name],
|
55
|
-
groups_headers)
|
56
|
-
|
57
|
-
original_groups_headers = [:id] + original_values.keys
|
58
|
-
@original_groups_buffer = ImportBuffer.new(
|
59
|
-
database[@config.original_groups_table_name],
|
60
|
-
original_groups_headers)
|
61
|
-
end
|
62
|
-
|
63
|
-
group_id = next_group_id
|
64
|
-
@groups_buffer.add([group_id] + values.values)
|
65
|
-
@original_groups_buffer.add([group_id] + original_values.values)
|
66
|
-
else
|
67
|
-
# Non-DRY for minute speed improvements
|
68
|
-
values = group.values
|
69
|
-
if !@groups_buffer
|
70
|
-
groups_headers = [:id] + values.keys
|
71
|
-
@groups_buffer = ImportBuffer.new(database[@config.groups_table_name],
|
72
|
-
groups_headers)
|
73
|
-
end
|
74
|
-
group_id = next_group_id
|
75
|
-
@groups_buffer.add([group_id] + values.values)
|
3
|
+
# Register a result set.
|
4
|
+
#
|
5
|
+
# @param [Class] klass
|
6
|
+
def self.register(name, klass)
|
7
|
+
methods = klass.instance_methods(false)
|
8
|
+
missing = []
|
9
|
+
unless methods.include?(:score_set)
|
10
|
+
missing.push("#score_set")
|
76
11
|
end
|
77
|
-
|
78
|
-
|
79
|
-
def add_score(comparator_id, record_1_id, record_2_id, score)
|
80
|
-
if !@scores_buffer
|
81
|
-
scores_headers = [:comparator_id, :record_1_id, :record_2_id, :score]
|
82
|
-
@scores_buffer = ImportBuffer.new(database[@config.scores_table_name],
|
83
|
-
scores_headers)
|
12
|
+
unless methods.include?(:match_set)
|
13
|
+
missing.push("#match_set")
|
84
14
|
end
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
def add_match(record_1_id, record_2_id, total_score)
|
89
|
-
if !@matches_buffer
|
90
|
-
matches_headers = [:record_1_id, :record_2_id, :total_score]
|
91
|
-
@matches_buffer = ImportBuffer.new(database[@config.matches_table_name],
|
92
|
-
matches_headers)
|
15
|
+
unless missing.empty?
|
16
|
+
raise ArgumentError, "class must define #{missing.join(" and ")}"
|
93
17
|
end
|
94
|
-
@matches_buffer.add([record_1_id, record_2_id, total_score])
|
95
|
-
end
|
96
18
|
|
97
|
-
|
98
|
-
@
|
99
|
-
@original_groups_buffer.flush if @original_groups_buffer
|
100
|
-
@scores_buffer.flush if @scores_buffer
|
101
|
-
@matches_buffer.flush if @matches_buffer
|
19
|
+
@result_set ||= {}
|
20
|
+
@result_set[name] = klass
|
102
21
|
end
|
103
22
|
|
104
|
-
def
|
105
|
-
|
106
|
-
Group.from_row(values)
|
23
|
+
def self.[](name)
|
24
|
+
@result_set ? @result_set[name] : nil
|
107
25
|
end
|
108
26
|
|
109
|
-
|
110
|
-
|
111
|
-
|
27
|
+
# @abstract
|
28
|
+
def score_set
|
29
|
+
raise NotImplementedError
|
112
30
|
end
|
113
31
|
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
result = nil
|
118
|
-
@next_group_mutex.synchronize do
|
119
|
-
result = @next_group_id
|
120
|
-
@next_group_id += 1
|
121
|
-
end
|
122
|
-
result
|
32
|
+
# @abstract
|
33
|
+
def match_set
|
34
|
+
raise NotImplementedError
|
123
35
|
end
|
124
36
|
end
|
125
37
|
end
|
38
|
+
|
39
|
+
require 'linkage/result_sets/csv'
|
40
|
+
require 'linkage/result_sets/database'
|
@@ -0,0 +1,54 @@
|
|
1
|
+
require 'fileutils'

module Linkage
  module ResultSets
    # Result set whose score set and match set are the classes registered
    # under 'csv' (ScoreSet['csv'] and MatchSet['csv']).
    class CSV < ResultSet
      # @param [nil, String, Hash] dir_or_options either nothing, the
      #   directory to write into, or an options hash
      # @option dir_or_options [String] :dir directory for the files; it is
      #   expanded and created if missing (default '.')
      # @option dir_or_options [String, Hash] :scores filename, or options for
      #   the score set (its :filename entry defaults to 'scores.csv')
      # @option dir_or_options [String, Hash] :matches filename, or options
      #   for the match set (its :filename entry defaults to 'matches.csv')
      # @raise [ArgumentError] if the argument, or a :scores/:matches value,
      #   has an unsupported type
      def initialize(dir_or_options = nil)
        opts =
          case dir_or_options
          when nil
            {}
          when String
            {:dir => dir_or_options}
          when Hash
            # Copy so that expanding :dir below does not modify the caller's
            # hash.
            dir_or_options.dup
          else
            raise ArgumentError, "expected nil, a String, or a Hash, got #{dir_or_options.class}"
          end

        if opts[:dir]
          opts[:dir] = File.expand_path(opts[:dir])
          FileUtils.mkdir_p(opts[:dir])
        end

        @score_set_args = extract_args_for(:scores, opts)
        @match_set_args = extract_args_for(:matches, opts)
      end

      # @return the score set, created on first use
      def score_set
        @score_set ||= ScoreSet['csv'].new(*@score_set_args)
      end

      # @return the match set, created on first use
      def match_set
        @match_set ||= MatchSet['csv'].new(*@match_set_args)
      end

      private

      # Build the constructor arguments (+[path, options]+) for one of the
      # underlying sets.
      #
      # @param [Symbol] name :scores or :matches
      # @param [Hash] opts result set options
      # @return [Array] file path and options hash
      def extract_args_for(name, opts)
        dir = opts[:dir] || '.'
        spec = opts[name]

        case spec
        when Hash, nil
          set_opts = spec ? spec.dup : {}
          filename = set_opts.delete(:filename) || "#{name}.csv"
        when String
          # A bare String is only a filename; the set still needs a Hash as
          # its options argument.
          filename = spec
          set_opts = {}
        else
          raise ArgumentError, "expected nil, a String, or a Hash for :#{name}, got #{spec.class}"
        end

        [File.join(dir, filename), set_opts]
      end
    end

    ResultSet.register('csv', CSV)
  end
end
|