linkage 0.0.8 → 0.1.0.pre
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +1 -0
- data/.yardopts +1 -0
- data/Gemfile +1 -19
- data/Gemfile-java +3 -0
- data/README.markdown +88 -34
- data/Rakefile +16 -15
- data/TODO +4 -0
- data/lib/linkage/comparator.rb +139 -144
- data/lib/linkage/comparators/compare.rb +236 -29
- data/lib/linkage/comparators/strcompare.rb +85 -0
- data/lib/linkage/comparators/within.rb +24 -20
- data/lib/linkage/configuration.rb +44 -466
- data/lib/linkage/dataset.rb +28 -127
- data/lib/linkage/exceptions.rb +5 -0
- data/lib/linkage/field.rb +6 -37
- data/lib/linkage/field_set.rb +3 -3
- data/lib/linkage/match_recorder.rb +22 -0
- data/lib/linkage/match_set.rb +34 -0
- data/lib/linkage/match_sets/csv.rb +39 -0
- data/lib/linkage/match_sets/database.rb +45 -0
- data/lib/linkage/matcher.rb +30 -0
- data/lib/linkage/result_set.rb +25 -110
- data/lib/linkage/result_sets/csv.rb +54 -0
- data/lib/linkage/result_sets/database.rb +42 -0
- data/lib/linkage/runner.rb +57 -16
- data/lib/linkage/score_recorder.rb +30 -0
- data/lib/linkage/score_set.rb +49 -0
- data/lib/linkage/score_sets/csv.rb +64 -0
- data/lib/linkage/score_sets/database.rb +77 -0
- data/lib/linkage/version.rb +1 -1
- data/lib/linkage.rb +14 -17
- data/linkage.gemspec +13 -1
- data/linkage.gemspec-java +32 -0
- data/test/helper.rb +30 -23
- data/test/integration/test_cross_linkage.rb +46 -25
- data/test/integration/test_database_result_set.rb +55 -0
- data/test/integration/test_dual_linkage.rb +19 -94
- data/test/integration/test_self_linkage.rb +100 -203
- data/test/integration/test_within_comparator.rb +24 -77
- data/test/unit/comparators/test_compare.rb +254 -50
- data/test/unit/comparators/test_strcompare.rb +45 -0
- data/test/unit/comparators/test_within.rb +14 -26
- data/test/unit/match_sets/test_csv.rb +78 -0
- data/test/unit/match_sets/test_database.rb +63 -0
- data/test/unit/result_sets/test_csv.rb +111 -0
- data/test/unit/result_sets/test_database.rb +68 -0
- data/test/unit/score_sets/test_csv.rb +151 -0
- data/test/unit/score_sets/test_database.rb +149 -0
- data/test/unit/test_comparator.rb +46 -83
- data/test/unit/test_comparators.rb +4 -0
- data/test/unit/test_configuration.rb +99 -145
- data/test/unit/test_dataset.rb +52 -73
- data/test/unit/test_field.rb +4 -55
- data/test/unit/test_field_set.rb +6 -6
- data/test/unit/test_match_recorder.rb +23 -0
- data/test/unit/test_match_set.rb +23 -0
- data/test/unit/test_match_sets.rb +4 -0
- data/test/unit/test_matcher.rb +44 -0
- data/test/unit/test_result_set.rb +24 -223
- data/test/unit/test_result_sets.rb +4 -0
- data/test/unit/test_runner.rb +122 -17
- data/test/unit/test_runners.rb +4 -0
- data/test/unit/test_score_recorder.rb +25 -0
- data/test/unit/test_score_set.rb +37 -0
- data/test/unit/test_score_sets.rb +4 -0
- metadata +183 -90
- data/Gemfile.lock +0 -92
- data/lib/linkage/comparators/binary.rb +0 -12
- data/lib/linkage/data.rb +0 -175
- data/lib/linkage/decollation.rb +0 -93
- data/lib/linkage/expectation.rb +0 -21
- data/lib/linkage/expectations/exhaustive.rb +0 -63
- data/lib/linkage/expectations/simple.rb +0 -168
- data/lib/linkage/function.rb +0 -148
- data/lib/linkage/functions/binary.rb +0 -30
- data/lib/linkage/functions/cast.rb +0 -54
- data/lib/linkage/functions/length.rb +0 -29
- data/lib/linkage/functions/strftime.rb +0 -33
- data/lib/linkage/functions/trim.rb +0 -30
- data/lib/linkage/group.rb +0 -55
- data/lib/linkage/meta_object.rb +0 -139
- data/lib/linkage/runner/single_threaded.rb +0 -187
- data/lib/linkage/utils.rb +0 -164
- data/lib/linkage/warnings.rb +0 -5
- data/test/integration/test_collation.rb +0 -45
- data/test/integration/test_configuration.rb +0 -268
- data/test/integration/test_dataset.rb +0 -116
- data/test/integration/test_functions.rb +0 -88
- data/test/integration/test_result_set.rb +0 -85
- data/test/integration/test_scoring.rb +0 -84
- data/test/unit/expectations/test_exhaustive.rb +0 -111
- data/test/unit/expectations/test_simple.rb +0 -303
- data/test/unit/functions/test_binary.rb +0 -54
- data/test/unit/functions/test_cast.rb +0 -98
- data/test/unit/functions/test_length.rb +0 -52
- data/test/unit/functions/test_strftime.rb +0 -60
- data/test/unit/functions/test_trim.rb +0 -43
- data/test/unit/runner/test_single_threaded.rb +0 -12
- data/test/unit/test_data.rb +0 -445
- data/test/unit/test_decollation.rb +0 -201
- data/test/unit/test_function.rb +0 -233
- data/test/unit/test_group.rb +0 -38
- data/test/unit/test_meta_object.rb +0 -208
- data/test/unit/test_utils.rb +0 -341
data/lib/linkage/dataset.rb
CHANGED
@@ -2,28 +2,33 @@ module Linkage
|
|
2
2
|
# Delegator around Sequel::Dataset with some extra functionality.
|
3
3
|
class Dataset
|
4
4
|
attr_reader :field_set, :table_name
|
5
|
-
attr_accessor :linkage_options
|
6
5
|
|
7
6
|
def initialize(*args)
|
7
|
+
if args.length == 0 || args.length > 3
|
8
|
+
raise ArgumentError, "wrong number of arguments (#{args.length} for 1..3)"
|
9
|
+
end
|
10
|
+
|
8
11
|
if args.length == 1
|
12
|
+
unless args[0].kind_of?(Sequel::Dataset)
|
13
|
+
raise ArgumentError, "expected Sequel::Dataset, got #{args[0].class}"
|
14
|
+
end
|
15
|
+
|
9
16
|
@dataset = args[0]
|
10
17
|
@db = @dataset.db
|
11
18
|
@table_name = @dataset.first_source_table
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
19
|
+
elsif args.length == 2 && args[0].kind_of?(Sequel::Database)
|
20
|
+
@db = args[0]
|
21
|
+
@table_name = args[1].to_sym
|
22
|
+
@dataset = @db[@table_name]
|
16
23
|
else
|
17
|
-
uri,
|
24
|
+
uri, table_name, options = args
|
18
25
|
options ||= {}
|
19
26
|
|
20
|
-
@table_name = table.to_sym
|
21
27
|
@db = Sequel.connect(uri, options)
|
22
|
-
@
|
28
|
+
@table_name = table_name.to_sym
|
23
29
|
@dataset = @db[@table_name]
|
24
30
|
end
|
25
31
|
@field_set = FieldSet.new(self)
|
26
|
-
@linkage_options = {}
|
27
32
|
end
|
28
33
|
|
29
34
|
def obj
|
@@ -37,9 +42,12 @@ module Linkage
|
|
37
42
|
# Setup a linkage with another dataset
|
38
43
|
#
|
39
44
|
# @return [Linkage::Configuration]
|
40
|
-
def link_with(dataset,
|
41
|
-
|
42
|
-
conf.
|
45
|
+
def link_with(dataset, result_set)
|
46
|
+
other = dataset.eql?(self) ? nil : dataset
|
47
|
+
conf = Configuration.new(self, other, result_set)
|
48
|
+
if block_given?
|
49
|
+
yield conf
|
50
|
+
end
|
43
51
|
conf
|
44
52
|
end
|
45
53
|
|
@@ -47,132 +55,25 @@ module Linkage
|
|
47
55
|
@db.database_type
|
48
56
|
end
|
49
57
|
|
50
|
-
# Set objects to use for group matching. Accepts either {Linkage::MetaObject} or a
|
51
|
-
# hash with options (valid options are :meta_object, :alias, and :cast).
|
52
|
-
#
|
53
|
-
# @example
|
54
|
-
# dataset.group_match(meta_object_1,
|
55
|
-
# {:meta_object => meta_object_2, :alias => :foo})
|
56
|
-
def group_match(*args)
|
57
|
-
args.collect! do |arg|
|
58
|
-
case arg
|
59
|
-
when Linkage::MetaObject
|
60
|
-
{ :meta_object => arg }
|
61
|
-
when Hash
|
62
|
-
if !arg.has_key?(:meta_object)
|
63
|
-
raise ArgumentError, "Invalid option hash, missing :meta_object key"
|
64
|
-
end
|
65
|
-
(arg.keys - [:meta_object, :alias, :cast]).each do |invalid_key|
|
66
|
-
warn "Invalid key in option hash: #{invalid_key}"
|
67
|
-
end
|
68
|
-
arg
|
69
|
-
else
|
70
|
-
raise ArgumentError, "expected Hash or MetaObject, got #{arg.class}"
|
71
|
-
end
|
72
|
-
end
|
73
|
-
clone(:group_match => args)
|
74
|
-
end
|
75
|
-
|
76
|
-
# Add additional objects to use for group matching.
|
77
|
-
def group_match_more(*args)
|
78
|
-
args = @linkage_options[:group_match] + args if @linkage_options[:group_match]
|
79
|
-
group_match(*args)
|
80
|
-
end
|
81
|
-
|
82
|
-
def clone(new_options = {})
|
83
|
-
new_linkage_options = {}
|
84
|
-
new_obj_options = {}
|
85
|
-
new_options.each_pair do |k, v|
|
86
|
-
case k
|
87
|
-
when :group_match
|
88
|
-
new_linkage_options[k] = v
|
89
|
-
else
|
90
|
-
new_obj_options[k] = v
|
91
|
-
end
|
92
|
-
end
|
93
|
-
new_obj = new_options[:new_obj]
|
94
|
-
|
95
|
-
result = super()
|
96
|
-
result.linkage_options = @linkage_options.merge(new_linkage_options)
|
97
|
-
|
98
|
-
if new_obj
|
99
|
-
result.obj = new_obj
|
100
|
-
else
|
101
|
-
result.obj = obj.clone(new_options)
|
102
|
-
end
|
103
|
-
|
104
|
-
result
|
105
|
-
end
|
106
|
-
|
107
|
-
def each_group(min = 2)
|
108
|
-
group_match = @linkage_options[:group_match] || []
|
109
|
-
ruby_types = group_match.inject({}) do |hsh, m|
|
110
|
-
key = m[:alias] || m[:meta_object].to_expr
|
111
|
-
hsh[key] = m[:meta_object].ruby_type
|
112
|
-
hsh
|
113
|
-
end
|
114
|
-
options = {:database_type => database_type, :ruby_types => ruby_types }
|
115
|
-
@dataset.group_and_count(*match_expressions).having{count >= min}.each do |row|
|
116
|
-
count = row.delete(:count)
|
117
|
-
group = Group.new(row, options.merge(:count => count))
|
118
|
-
yield group
|
119
|
-
end
|
120
|
-
end
|
121
|
-
|
122
|
-
def group_by_matches(raw = true)
|
123
|
-
expr = raw ? raw_match_expressions : match_expressions
|
124
|
-
group(*expr)
|
125
|
-
end
|
126
|
-
|
127
|
-
def dataset_for_group(group)
|
128
|
-
filters = []
|
129
|
-
group_match = @linkage_options[:group_match] || []
|
130
|
-
group.values.each_pair do |key, value|
|
131
|
-
# find a matched expression with this alias
|
132
|
-
found = false
|
133
|
-
group_match.each do |m|
|
134
|
-
expr = m[:meta_object].to_expr
|
135
|
-
if (m[:alias] && m[:alias] == key) || expr == key
|
136
|
-
found = true
|
137
|
-
filters << {expr => value}
|
138
|
-
break
|
139
|
-
end
|
140
|
-
end
|
141
|
-
if !found
|
142
|
-
raise "this dataset isn't compatible with the given group"
|
143
|
-
end
|
144
|
-
end
|
145
|
-
filter(*filters)
|
146
|
-
end
|
147
|
-
|
148
58
|
def schema
|
149
59
|
@db.schema(@table_name)
|
150
60
|
end
|
151
61
|
|
152
|
-
|
153
|
-
|
154
|
-
def raw_match_expressions
|
155
|
-
group_match = @linkage_options[:group_match] || []
|
156
|
-
group_match.collect { |m| m[:meta_object].to_expr }
|
62
|
+
def primary_key
|
63
|
+
@field_set.primary_key
|
157
64
|
end
|
158
65
|
|
159
|
-
|
160
|
-
group_match = @linkage_options[:group_match] || []
|
161
|
-
group_match.collect do |m|
|
162
|
-
expr = m[:meta_object].to_expr
|
163
|
-
expr = expr.as(m[:alias]) if m[:alias]
|
164
|
-
expr = expr.cast(m[:cast]) if m[:cast]
|
165
|
-
expr
|
166
|
-
end
|
167
|
-
end
|
66
|
+
protected
|
168
67
|
|
169
68
|
def method_missing(name, *args, &block)
|
170
69
|
result = @dataset.send(name, *args, &block)
|
171
70
|
if result.kind_of?(Sequel::Dataset)
|
172
|
-
|
173
|
-
|
71
|
+
new_object = clone
|
72
|
+
new_object.obj = result
|
73
|
+
new_object
|
74
|
+
else
|
75
|
+
result
|
174
76
|
end
|
175
|
-
result
|
176
77
|
end
|
177
78
|
end
|
178
79
|
end
|
data/lib/linkage/field.rb
CHANGED
@@ -1,17 +1,19 @@
|
|
1
1
|
module Linkage
|
2
2
|
# This class is for holding information about a particular field in a
|
3
3
|
# dataset.
|
4
|
-
class Field
|
4
|
+
class Field
|
5
|
+
# @!attribute [r] name
|
6
|
+
# @return [Symbol] This object's name
|
7
|
+
attr_reader :name
|
8
|
+
|
5
9
|
# @return [Symbol] This field's schema information
|
6
10
|
attr_reader :schema
|
7
11
|
|
8
12
|
# Create a new instance of Field.
|
9
13
|
#
|
10
|
-
# @param [Linkage::Dataset] dataset
|
11
14
|
# @param [Symbol] name The field's name
|
12
15
|
# @param [Hash] schema The field's schema information
|
13
|
-
def initialize(
|
14
|
-
@dataset = dataset
|
16
|
+
def initialize(name, schema)
|
15
17
|
@name = name
|
16
18
|
@schema = schema
|
17
19
|
end
|
@@ -63,7 +65,6 @@ module Linkage
|
|
63
65
|
else
|
64
66
|
{:type=>String}
|
65
67
|
end
|
66
|
-
hsh[:collate] = collation
|
67
68
|
|
68
69
|
hsh.delete_if { |k, v| v.nil? }
|
69
70
|
@ruby_type = {:type => hsh.delete(:type)}
|
@@ -72,40 +73,8 @@ module Linkage
|
|
72
73
|
@ruby_type
|
73
74
|
end
|
74
75
|
|
75
|
-
def to_expr(options = {})
|
76
|
-
@name
|
77
|
-
end
|
78
|
-
|
79
|
-
def static?
|
80
|
-
false
|
81
|
-
end
|
82
|
-
|
83
76
|
def primary_key?
|
84
77
|
schema && schema[:primary_key]
|
85
78
|
end
|
86
|
-
|
87
|
-
def collation
|
88
|
-
schema[:collation]
|
89
|
-
end
|
90
|
-
end
|
91
|
-
|
92
|
-
# A special field used for merging two {Data} objects together. It
|
93
|
-
# has no dataset or schema.
|
94
|
-
class MergeField < Field
|
95
|
-
attr_reader :database_type
|
96
|
-
|
97
|
-
# Create a new instance of MergeField.
|
98
|
-
#
|
99
|
-
# @param [Symbol] name The field's name
|
100
|
-
# @param [Hash] ruby_type The field's schema information
|
101
|
-
def initialize(name, ruby_type, database_type = nil)
|
102
|
-
@name = name
|
103
|
-
@ruby_type = ruby_type
|
104
|
-
@database_type = database_type
|
105
|
-
end
|
106
|
-
|
107
|
-
def collation
|
108
|
-
@ruby_type.has_key?(:opts) ? @ruby_type[:opts][:collate] : nil
|
109
|
-
end
|
110
79
|
end
|
111
80
|
end
|
data/lib/linkage/field_set.rb
CHANGED
@@ -7,11 +7,11 @@ module Linkage
|
|
7
7
|
# @param [Linkage::Dataset] dataset
|
8
8
|
def initialize(dataset)
|
9
9
|
dataset.schema.each do |(name, column_schema)|
|
10
|
-
|
11
|
-
self[name] =
|
10
|
+
field = Field.new(name, column_schema)
|
11
|
+
self[name] = field
|
12
12
|
|
13
13
|
if @primary_key.nil? && column_schema[:primary_key]
|
14
|
-
@primary_key =
|
14
|
+
@primary_key = field
|
15
15
|
end
|
16
16
|
end
|
17
17
|
end
|
data/lib/linkage/match_recorder.rb
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
module Linkage
|
2
|
+
class MatchRecorder
|
3
|
+
def initialize(matcher, match_set)
|
4
|
+
@matcher = matcher
|
5
|
+
@match_set = match_set
|
6
|
+
end
|
7
|
+
|
8
|
+
def start
|
9
|
+
@matcher.add_observer(self)
|
10
|
+
@match_set.open_for_writing
|
11
|
+
end
|
12
|
+
|
13
|
+
def update(id_1, id_2, score)
|
14
|
+
@match_set.add_match(id_1, id_2, score)
|
15
|
+
end
|
16
|
+
|
17
|
+
def stop
|
18
|
+
@match_set.close
|
19
|
+
@matcher.delete_observer(self)
|
20
|
+
end
|
21
|
+
end
|
22
|
+
end
|
data/lib/linkage/match_set.rb
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
module Linkage
|
2
|
+
class MatchSet
|
3
|
+
# Register a match set.
|
4
|
+
#
|
5
|
+
# @param [Class] klass
|
6
|
+
def self.register(name, klass)
|
7
|
+
methods = klass.instance_methods(false)
|
8
|
+
unless methods.include?(:add_match)
|
9
|
+
raise ArgumentError, "class must define #add_match"
|
10
|
+
end
|
11
|
+
|
12
|
+
@match_sets ||= {}
|
13
|
+
@match_sets[name] = klass
|
14
|
+
end
|
15
|
+
|
16
|
+
def self.[](name)
|
17
|
+
@match_sets ? @match_sets[name] : nil
|
18
|
+
end
|
19
|
+
|
20
|
+
def open_for_writing
|
21
|
+
end
|
22
|
+
|
23
|
+
# @abstract
|
24
|
+
def add_match(id_1, id_2, score)
|
25
|
+
raise NotImplementedError
|
26
|
+
end
|
27
|
+
|
28
|
+
def close
|
29
|
+
end
|
30
|
+
end
|
31
|
+
end
|
32
|
+
|
33
|
+
require 'linkage/match_sets/csv'
|
34
|
+
require 'linkage/match_sets/database'
|
data/lib/linkage/match_sets/csv.rb
ADDED
@@ -0,0 +1,39 @@
|
|
1
|
+
require 'csv'
|
2
|
+
|
3
|
+
module Linkage
|
4
|
+
module MatchSets
|
5
|
+
class CSV < MatchSet
|
6
|
+
def initialize(filename, options = {})
|
7
|
+
@filename = filename
|
8
|
+
@overwrite = options[:overwrite]
|
9
|
+
end
|
10
|
+
|
11
|
+
def open_for_writing
|
12
|
+
return if @mode == :write
|
13
|
+
|
14
|
+
if !@overwrite && File.exist?(@filename)
|
15
|
+
raise ExistsError, "#{@filename} exists and not in overwrite mode"
|
16
|
+
end
|
17
|
+
|
18
|
+
@csv = ::CSV.open(@filename, 'wb')
|
19
|
+
@csv << %w{id_1 id_2 score}
|
20
|
+
@mode = :write
|
21
|
+
end
|
22
|
+
|
23
|
+
def add_match(id_1, id_2, score)
|
24
|
+
raise "not in write mode" if @mode != :write
|
25
|
+
if score.equal?(1.0) || score.equal?(0.0)
|
26
|
+
score = score.floor
|
27
|
+
end
|
28
|
+
@csv << [id_1, id_2, score]
|
29
|
+
end
|
30
|
+
|
31
|
+
def close
|
32
|
+
@mode = nil
|
33
|
+
@csv.close if @csv
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
37
|
+
MatchSet.register('csv', CSV)
|
38
|
+
end
|
39
|
+
end
|
data/lib/linkage/match_sets/database.rb
ADDED
@@ -0,0 +1,45 @@
|
|
1
|
+
module Linkage
|
2
|
+
module MatchSets
|
3
|
+
class Database < MatchSet
|
4
|
+
def initialize(database, options = {})
|
5
|
+
@database = database
|
6
|
+
@table_name = options[:table_name] || :matches
|
7
|
+
@overwrite = options[:overwrite]
|
8
|
+
end
|
9
|
+
|
10
|
+
def open_for_writing
|
11
|
+
return if @mode == :write
|
12
|
+
|
13
|
+
if @overwrite
|
14
|
+
@database.drop_table?(@table_name)
|
15
|
+
elsif @database.table_exists?(@table_name)
|
16
|
+
raise ExistsError, "#{@table_name} table exists and not in overwrite mode"
|
17
|
+
end
|
18
|
+
|
19
|
+
@database.create_table(@table_name) do
|
20
|
+
String :id_1
|
21
|
+
String :id_2
|
22
|
+
Float :score
|
23
|
+
end
|
24
|
+
@dataset = @database[@table_name]
|
25
|
+
@mode = :write
|
26
|
+
end
|
27
|
+
|
28
|
+
def add_match(id_1, id_2, score)
|
29
|
+
raise "not in write mode" if @mode != :write
|
30
|
+
|
31
|
+
@dataset.insert({
|
32
|
+
:id_1 => id_1,
|
33
|
+
:id_2 => id_2,
|
34
|
+
:score => score
|
35
|
+
})
|
36
|
+
end
|
37
|
+
|
38
|
+
def close
|
39
|
+
@mode = nil
|
40
|
+
end
|
41
|
+
end
|
42
|
+
|
43
|
+
MatchSet.register('database', Database)
|
44
|
+
end
|
45
|
+
end
|
data/lib/linkage/matcher.rb
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
module Linkage
|
2
|
+
class Matcher
|
3
|
+
include Observable
|
4
|
+
|
5
|
+
attr_reader :comparators, :score_set, :algorithm, :threshold
|
6
|
+
|
7
|
+
def initialize(comparators, score_set, algorithm, threshold)
|
8
|
+
@comparators = comparators
|
9
|
+
@score_set = score_set
|
10
|
+
@algorithm = algorithm
|
11
|
+
@threshold = threshold
|
12
|
+
end
|
13
|
+
|
14
|
+
def run
|
15
|
+
send(@algorithm)
|
16
|
+
end
|
17
|
+
|
18
|
+
private
|
19
|
+
|
20
|
+
def mean
|
21
|
+
@score_set.each_pair do |id_1, id_2, scores|
|
22
|
+
mean = scores.values.inject(:+) / @comparators.length.to_f
|
23
|
+
if mean >= @threshold
|
24
|
+
changed
|
25
|
+
notify_observers(id_1, id_2, mean)
|
26
|
+
end
|
27
|
+
end
|
28
|
+
end
|
29
|
+
end
|
30
|
+
end
|
data/lib/linkage/result_set.rb
CHANGED
@@ -1,125 +1,40 @@
|
|
1
1
|
module Linkage
|
2
2
|
class ResultSet
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
end
|
12
|
-
|
13
|
-
def database
|
14
|
-
# FIXME: If the results database is the same as one of the datasets
|
15
|
-
# being linked, there will be two connections to said database. This
|
16
|
-
# could result in unexpected locking for non-concurrent databases (like
|
17
|
-
# SQLite).
|
18
|
-
@database ||= Sequel.connect(@config.results_uri, @config.results_uri_options)
|
19
|
-
end
|
20
|
-
|
21
|
-
def create_tables!
|
22
|
-
if @config.groups_table_needed?
|
23
|
-
schema = @config.groups_table_schema
|
24
|
-
if @config.decollation_needed?
|
25
|
-
database.create_table(@config.original_groups_table_name) do
|
26
|
-
schema.each { |col| column(*col) }
|
27
|
-
end
|
28
|
-
end
|
29
|
-
|
30
|
-
database.create_table(@config.groups_table_name) do
|
31
|
-
schema.each { |col| column(*col) }
|
32
|
-
end
|
33
|
-
end
|
34
|
-
|
35
|
-
if @config.scores_table_needed?
|
36
|
-
schema = @config.scores_table_schema
|
37
|
-
database.create_table(@config.scores_table_name) do
|
38
|
-
schema.each { |col| column(*col) }
|
39
|
-
end
|
40
|
-
end
|
41
|
-
|
42
|
-
schema = @config.matches_table_schema
|
43
|
-
database.create_table(@config.matches_table_name) do
|
44
|
-
schema.each { |col| column(*col) }
|
45
|
-
end
|
46
|
-
end
|
47
|
-
|
48
|
-
def add_group(group, dataset_id = nil)
|
49
|
-
if @config.decollation_needed?
|
50
|
-
original_values = group.values
|
51
|
-
values = group.decollated_values
|
52
|
-
if !@groups_buffer
|
53
|
-
groups_headers = [:id] + values.keys
|
54
|
-
@groups_buffer = ImportBuffer.new(database[@config.groups_table_name],
|
55
|
-
groups_headers)
|
56
|
-
|
57
|
-
original_groups_headers = [:id] + original_values.keys
|
58
|
-
@original_groups_buffer = ImportBuffer.new(
|
59
|
-
database[@config.original_groups_table_name],
|
60
|
-
original_groups_headers)
|
61
|
-
end
|
62
|
-
|
63
|
-
group_id = next_group_id
|
64
|
-
@groups_buffer.add([group_id] + values.values)
|
65
|
-
@original_groups_buffer.add([group_id] + original_values.values)
|
66
|
-
else
|
67
|
-
# Non-DRY for minute speed improvements
|
68
|
-
values = group.values
|
69
|
-
if !@groups_buffer
|
70
|
-
groups_headers = [:id] + values.keys
|
71
|
-
@groups_buffer = ImportBuffer.new(database[@config.groups_table_name],
|
72
|
-
groups_headers)
|
73
|
-
end
|
74
|
-
group_id = next_group_id
|
75
|
-
@groups_buffer.add([group_id] + values.values)
|
3
|
+
# Register a result set.
|
4
|
+
#
|
5
|
+
# @param [Class] klass
|
6
|
+
def self.register(name, klass)
|
7
|
+
methods = klass.instance_methods(false)
|
8
|
+
missing = []
|
9
|
+
unless methods.include?(:score_set)
|
10
|
+
missing.push("#score_set")
|
76
11
|
end
|
77
|
-
|
78
|
-
|
79
|
-
def add_score(comparator_id, record_1_id, record_2_id, score)
|
80
|
-
if !@scores_buffer
|
81
|
-
scores_headers = [:comparator_id, :record_1_id, :record_2_id, :score]
|
82
|
-
@scores_buffer = ImportBuffer.new(database[@config.scores_table_name],
|
83
|
-
scores_headers)
|
12
|
+
unless methods.include?(:match_set)
|
13
|
+
missing.push("#match_set")
|
84
14
|
end
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
def add_match(record_1_id, record_2_id, total_score)
|
89
|
-
if !@matches_buffer
|
90
|
-
matches_headers = [:record_1_id, :record_2_id, :total_score]
|
91
|
-
@matches_buffer = ImportBuffer.new(database[@config.matches_table_name],
|
92
|
-
matches_headers)
|
15
|
+
unless missing.empty?
|
16
|
+
raise ArgumentError, "class must define #{missing.join(" and ")}"
|
93
17
|
end
|
94
|
-
@matches_buffer.add([record_1_id, record_2_id, total_score])
|
95
|
-
end
|
96
18
|
|
97
|
-
|
98
|
-
@
|
99
|
-
@original_groups_buffer.flush if @original_groups_buffer
|
100
|
-
@scores_buffer.flush if @scores_buffer
|
101
|
-
@matches_buffer.flush if @matches_buffer
|
19
|
+
@result_set ||= {}
|
20
|
+
@result_set[name] = klass
|
102
21
|
end
|
103
22
|
|
104
|
-
def
|
105
|
-
|
106
|
-
Group.from_row(values)
|
23
|
+
def self.[](name)
|
24
|
+
@result_set ? @result_set[name] : nil
|
107
25
|
end
|
108
26
|
|
109
|
-
|
110
|
-
|
111
|
-
|
27
|
+
# @abstract
|
28
|
+
def score_set
|
29
|
+
raise NotImplementedError
|
112
30
|
end
|
113
31
|
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
result = nil
|
118
|
-
@next_group_mutex.synchronize do
|
119
|
-
result = @next_group_id
|
120
|
-
@next_group_id += 1
|
121
|
-
end
|
122
|
-
result
|
32
|
+
# @abstract
|
33
|
+
def match_set
|
34
|
+
raise NotImplementedError
|
123
35
|
end
|
124
36
|
end
|
125
37
|
end
|
38
|
+
|
39
|
+
require 'linkage/result_sets/csv'
|
40
|
+
require 'linkage/result_sets/database'
|
data/lib/linkage/result_sets/csv.rb
ADDED
@@ -0,0 +1,54 @@
|
|
1
|
+
module Linkage
|
2
|
+
module ResultSets
|
3
|
+
class CSV < ResultSet
|
4
|
+
def initialize(dir_or_options = nil)
|
5
|
+
opts =
|
6
|
+
case dir_or_options
|
7
|
+
when nil
|
8
|
+
{}
|
9
|
+
when String
|
10
|
+
{:dir => dir_or_options}
|
11
|
+
when Hash
|
12
|
+
dir_or_options
|
13
|
+
else
|
14
|
+
raise ArgumentError, "expected nil, a String, or a Hash, got #{dir_or_options.class}"
|
15
|
+
end
|
16
|
+
|
17
|
+
if opts[:dir]
|
18
|
+
opts[:dir] = File.expand_path(opts[:dir])
|
19
|
+
FileUtils.mkdir_p(opts[:dir])
|
20
|
+
end
|
21
|
+
|
22
|
+
@score_set_args = extract_args_for(:scores, opts)
|
23
|
+
@match_set_args = extract_args_for(:matches, opts)
|
24
|
+
end
|
25
|
+
|
26
|
+
def score_set
|
27
|
+
@score_set ||= ScoreSet['csv'].new(*@score_set_args)
|
28
|
+
end
|
29
|
+
|
30
|
+
def match_set
|
31
|
+
@match_set ||= MatchSet['csv'].new(*@match_set_args)
|
32
|
+
end
|
33
|
+
|
34
|
+
private
|
35
|
+
|
36
|
+
def extract_args_for(name, opts)
|
37
|
+
dir = opts[:dir] || '.'
|
38
|
+
opts = opts[name]
|
39
|
+
|
40
|
+
filename =
|
41
|
+
case opts
|
42
|
+
when Hash, nil
|
43
|
+
opts = opts ? opts.dup : {}
|
44
|
+
opts.delete(:filename) || "#{name}.csv"
|
45
|
+
when String
|
46
|
+
opts
|
47
|
+
end
|
48
|
+
[File.join(dir, filename), opts]
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
52
|
+
ResultSet.register('csv', CSV)
|
53
|
+
end
|
54
|
+
end
|