linkage 0.0.8 → 0.1.0.pre
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +1 -0
- data/.yardopts +1 -0
- data/Gemfile +1 -19
- data/Gemfile-java +3 -0
- data/README.markdown +88 -34
- data/Rakefile +16 -15
- data/TODO +4 -0
- data/lib/linkage/comparator.rb +139 -144
- data/lib/linkage/comparators/compare.rb +236 -29
- data/lib/linkage/comparators/strcompare.rb +85 -0
- data/lib/linkage/comparators/within.rb +24 -20
- data/lib/linkage/configuration.rb +44 -466
- data/lib/linkage/dataset.rb +28 -127
- data/lib/linkage/exceptions.rb +5 -0
- data/lib/linkage/field.rb +6 -37
- data/lib/linkage/field_set.rb +3 -3
- data/lib/linkage/match_recorder.rb +22 -0
- data/lib/linkage/match_set.rb +34 -0
- data/lib/linkage/match_sets/csv.rb +39 -0
- data/lib/linkage/match_sets/database.rb +45 -0
- data/lib/linkage/matcher.rb +30 -0
- data/lib/linkage/result_set.rb +25 -110
- data/lib/linkage/result_sets/csv.rb +54 -0
- data/lib/linkage/result_sets/database.rb +42 -0
- data/lib/linkage/runner.rb +57 -16
- data/lib/linkage/score_recorder.rb +30 -0
- data/lib/linkage/score_set.rb +49 -0
- data/lib/linkage/score_sets/csv.rb +64 -0
- data/lib/linkage/score_sets/database.rb +77 -0
- data/lib/linkage/version.rb +1 -1
- data/lib/linkage.rb +14 -17
- data/linkage.gemspec +13 -1
- data/linkage.gemspec-java +32 -0
- data/test/helper.rb +30 -23
- data/test/integration/test_cross_linkage.rb +46 -25
- data/test/integration/test_database_result_set.rb +55 -0
- data/test/integration/test_dual_linkage.rb +19 -94
- data/test/integration/test_self_linkage.rb +100 -203
- data/test/integration/test_within_comparator.rb +24 -77
- data/test/unit/comparators/test_compare.rb +254 -50
- data/test/unit/comparators/test_strcompare.rb +45 -0
- data/test/unit/comparators/test_within.rb +14 -26
- data/test/unit/match_sets/test_csv.rb +78 -0
- data/test/unit/match_sets/test_database.rb +63 -0
- data/test/unit/result_sets/test_csv.rb +111 -0
- data/test/unit/result_sets/test_database.rb +68 -0
- data/test/unit/score_sets/test_csv.rb +151 -0
- data/test/unit/score_sets/test_database.rb +149 -0
- data/test/unit/test_comparator.rb +46 -83
- data/test/unit/test_comparators.rb +4 -0
- data/test/unit/test_configuration.rb +99 -145
- data/test/unit/test_dataset.rb +52 -73
- data/test/unit/test_field.rb +4 -55
- data/test/unit/test_field_set.rb +6 -6
- data/test/unit/test_match_recorder.rb +23 -0
- data/test/unit/test_match_set.rb +23 -0
- data/test/unit/test_match_sets.rb +4 -0
- data/test/unit/test_matcher.rb +44 -0
- data/test/unit/test_result_set.rb +24 -223
- data/test/unit/test_result_sets.rb +4 -0
- data/test/unit/test_runner.rb +122 -17
- data/test/unit/test_runners.rb +4 -0
- data/test/unit/test_score_recorder.rb +25 -0
- data/test/unit/test_score_set.rb +37 -0
- data/test/unit/test_score_sets.rb +4 -0
- metadata +183 -90
- data/Gemfile.lock +0 -92
- data/lib/linkage/comparators/binary.rb +0 -12
- data/lib/linkage/data.rb +0 -175
- data/lib/linkage/decollation.rb +0 -93
- data/lib/linkage/expectation.rb +0 -21
- data/lib/linkage/expectations/exhaustive.rb +0 -63
- data/lib/linkage/expectations/simple.rb +0 -168
- data/lib/linkage/function.rb +0 -148
- data/lib/linkage/functions/binary.rb +0 -30
- data/lib/linkage/functions/cast.rb +0 -54
- data/lib/linkage/functions/length.rb +0 -29
- data/lib/linkage/functions/strftime.rb +0 -33
- data/lib/linkage/functions/trim.rb +0 -30
- data/lib/linkage/group.rb +0 -55
- data/lib/linkage/meta_object.rb +0 -139
- data/lib/linkage/runner/single_threaded.rb +0 -187
- data/lib/linkage/utils.rb +0 -164
- data/lib/linkage/warnings.rb +0 -5
- data/test/integration/test_collation.rb +0 -45
- data/test/integration/test_configuration.rb +0 -268
- data/test/integration/test_dataset.rb +0 -116
- data/test/integration/test_functions.rb +0 -88
- data/test/integration/test_result_set.rb +0 -85
- data/test/integration/test_scoring.rb +0 -84
- data/test/unit/expectations/test_exhaustive.rb +0 -111
- data/test/unit/expectations/test_simple.rb +0 -303
- data/test/unit/functions/test_binary.rb +0 -54
- data/test/unit/functions/test_cast.rb +0 -98
- data/test/unit/functions/test_length.rb +0 -52
- data/test/unit/functions/test_strftime.rb +0 -60
- data/test/unit/functions/test_trim.rb +0 -43
- data/test/unit/runner/test_single_threaded.rb +0 -12
- data/test/unit/test_data.rb +0 -445
- data/test/unit/test_decollation.rb +0 -201
- data/test/unit/test_function.rb +0 -233
- data/test/unit/test_group.rb +0 -38
- data/test/unit/test_meta_object.rb +0 -208
- data/test/unit/test_utils.rb +0 -341
@@ -0,0 +1,42 @@
|
|
1
|
+
module Linkage
  module ResultSets
    # Result set whose score set and match set both live in one Sequel
    # database. Registered with ResultSet under the name 'database'.
    class Database < ResultSet
      # @param [Sequel::Database, String, Hash] database_or_options either an
      #   existing database object, a connection string handed to
      #   Sequel.connect, or a Hash of Sequel.connect options. In the Hash
      #   form the :scores and :matches entries are held back and later
      #   passed to the score set and match set respectively.
      # @raise [ArgumentError] if the argument is none of the above (which
      #   includes the default nil)
      def initialize(database_or_options = nil)
        @options = {}
        @database =
          case database_or_options
          when Sequel::Database
            database_or_options
          when String
            Sequel.connect(database_or_options)
          when Hash
            connect_opts = {}
            database_or_options.each_pair do |key, value|
              # :scores and :matches configure the sets, not the connection
              target = (key == :scores || key == :matches) ? @options : connect_opts
              target[key] = value
            end
            Sequel.connect(connect_opts)
          else
            raise ArgumentError, "expected Sequel::Database, a String, or a Hash, got #{database_or_options.class}"
          end
      end

      # Lazily builds (and then reuses) the 'database' score set, configured
      # with the :scores options given to the constructor.
      def score_set
        return @score_set if @score_set

        @score_set = ScoreSet['database'].new(@database, @options[:scores] || {})
      end

      # Lazily builds (and then reuses) the 'database' match set, configured
      # with the :matches options given to the constructor.
      def match_set
        return @match_set if @match_set

        @match_set = MatchSet['database'].new(@database, @options[:matches] || {})
      end
    end

    ResultSet.register('database', Database)
  end
end
|
data/lib/linkage/runner.rb
CHANGED
@@ -1,31 +1,72 @@
|
|
1
1
|
module Linkage
|
2
2
|
# Use this class to run a configuration created by {Dataset#link_with}.
|
3
3
|
class Runner
|
4
|
-
attr_reader :config
|
4
|
+
attr_reader :config
|
5
5
|
|
6
6
|
# @param [Linkage::Configuration] config
|
7
|
-
# @param [String] uri Sequel-style database URI
|
8
|
-
# @param [Hash] options Sequel.connect options
|
9
7
|
# @see Dataset#link_with
|
10
|
-
|
11
|
-
def initialize(config, uri = nil, options = {})
|
8
|
+
def initialize(config)
|
12
9
|
@config = config
|
13
|
-
if uri
|
14
|
-
warn("[DEPRECATION] Please use Configuration#save_results_in with the database URI and options instead")
|
15
|
-
@config.save_results_in(uri, options)
|
16
|
-
end
|
17
10
|
end
|
18
11
|
|
19
|
-
# @abstract
|
20
12
|
def execute
|
21
|
-
|
13
|
+
score_records
|
14
|
+
match_records
|
22
15
|
end
|
23
16
|
|
24
|
-
def
|
25
|
-
|
17
|
+
# Scores every record pair with the configured comparators.
#
# The score recorder is started first and is always stopped afterwards,
# even when a comparator or dataset raises, so the recorder is never left
# started after a failure.
#
# Comparators whose type is not :simple score whole datasets themselves via
# #score_datasets (two datasets) or #score_dataset (one dataset). Comparators
# of type :simple are fed one record pair at a time via #score_and_notify.
def score_records
  score_recorder = config.score_recorder
  score_recorder.start
  begin
    dataset_1 = config.dataset_1
    dataset_2 = config.dataset_2

    simple_comparators, dataset_comparators =
      config.comparators.partition { |comparator| comparator.type == :simple }

    dataset_comparators.each do |comparator|
      if dataset_2
        comparator.score_datasets(dataset_1, dataset_2)
      else
        comparator.score_dataset(dataset_1)
      end
    end

    # Handle simple comparators
    unless simple_comparators.empty?
      if dataset_2
        # Two datasets: every record of the first against every record of
        # the second.
        dataset_1.each do |record_1|
          dataset_2.each do |record_2|
            simple_comparators.each do |comparator|
              comparator.score_and_notify(record_1, record_2)
            end
          end
        end
      else
        # One dataset: every unordered pair of distinct records, in index
        # order ((0,1), (0,2), ..., (1,2), ...).
        # NOTE: very naive implementation; loads all records into memory.
        dataset_1.all.combination(2) do |record_1, record_2|
          simple_comparators.each do |comparator|
            comparator.score_and_notify(record_1, record_2)
          end
        end
      end
    end
  ensure
    score_recorder.stop
  end
end
|
63
|
+
|
64
|
+
# Runs the configured matcher while a match recorder is listening.
#
# The match recorder obtained from the configuration is started before the
# matcher runs and is always stopped afterwards, even if the matcher raises.
def match_records
  matcher = config.matcher
  match_recorder = config.match_recorder(matcher)
  match_recorder.start
  begin
    matcher.run
  ensure
    match_recorder.stop
  end
end
|
27
71
|
end
|
28
72
|
end
|
29
|
-
|
30
|
-
path = Pathname.new(File.expand_path(File.dirname(__FILE__))) + 'runner'
|
31
|
-
require path + 'single_threaded'
|
@@ -0,0 +1,30 @@
|
|
1
|
+
module Linkage
  # Observer that forwards scores emitted by comparators into a score set.
  #
  # Between #start and #stop the recorder is registered as an observer on
  # every comparator; each notification arrives through #update and is
  # written to the score set.
  class ScoreRecorder
    # @param [Array] comparators comparators to observe; a comparator's
    #   1-based position in this list is the comparator id written with each
    #   score
    # @param score_set object responding to #open_for_writing, #add_score
    #   and #close
    # @param [Array] primary_keys two keys: the first is looked up in
    #   record_1 and the second in record_2 to identify each scored pair
    def initialize(comparators, score_set, primary_keys)
      @comparators = comparators
      @score_set = score_set
      @primary_keys = primary_keys
    end

    # Opens the score set for writing and starts observing the comparators.
    #
    # The score set is opened first so that a failed open does not leave
    # this recorder attached to any comparator.
    def start
      @score_set.open_for_writing
      @comparators.each do |comparator|
        comparator.add_observer(self)
      end
    end

    # Observer callback: records one score.
    #
    # @param comparator the comparator that produced the score; must be one
    #   of the comparators given to the constructor
    # @param record_1 first record; indexed with the first primary key
    # @param record_2 second record; indexed with the second primary key
    # @param score the score value
    def update(comparator, record_1, record_2, score)
      index = @comparators.index(comparator)
      primary_key_1 = record_1[@primary_keys[0]]
      primary_key_2 = record_2[@primary_keys[1]]
      # comparator ids are 1-based
      @score_set.add_score(index + 1, primary_key_1, primary_key_2, score)
    end

    # Closes the score set and stops observing the comparators. Observers
    # are removed even if closing the score set raises.
    def stop
      @score_set.close
    ensure
      @comparators.each do |comparator|
        comparator.delete_observer(self)
      end
    end
  end
end
|
@@ -0,0 +1,49 @@
|
|
1
|
+
module Linkage
  # Abstract base class for score storage, plus a name => class registry of
  # concrete implementations.
  #
  # Subclasses must implement #add_score and #each_pair. #open_for_reading,
  # #open_for_writing and #close are no-ops here and may be overridden.
  class ScoreSet
    # Register a score set.
    #
    # The class must provide its own #add_score and #each_pair, i.e.
    # implementations other than the abstract ones defined on ScoreSet.
    # Implementations inherited from an intermediate subclass count.
    #
    # @param [String] name key under which the class can be looked up
    # @param [Class] klass
    # @raise [ArgumentError] if either required method is missing
    def self.register(name, klass)
      missing = [:add_score, :each_pair].reject do |method_name|
        klass.method_defined?(method_name) &&
          klass.instance_method(method_name).owner != ScoreSet
      end
      unless missing.empty?
        names = missing.map { |method_name| "##{method_name}" }
        raise ArgumentError, "class must define #{names.join(" and ")}"
      end

      @score_sets ||= {}
      @score_sets[name] = klass
    end

    # Look up a registered score set class.
    #
    # @param [String] name
    # @return [Class, nil] the registered class, or nil if nothing is
    #   registered under that name
    def self.[](name)
      @score_sets ? @score_sets[name] : nil
    end

    # Hook called before reading; does nothing by default.
    def open_for_reading
    end

    # Hook called before writing; does nothing by default.
    def open_for_writing
    end

    # @abstract
    def add_score(comparator_id, id_1, id_2, value)
      raise NotImplementedError
    end

    # @abstract
    def each_pair(&block)
      raise NotImplementedError
    end

    # Hook called when reading or writing is finished; does nothing by
    # default.
    def close
    end
  end
end
|
47
|
+
|
48
|
+
require 'linkage/score_sets/csv'
|
49
|
+
require 'linkage/score_sets/database'
|
@@ -0,0 +1,64 @@
|
|
1
|
+
require 'csv'
|
2
|
+
|
3
|
+
module Linkage
  module ScoreSets
    # Score set stored in a CSV file with the header row
    # comparator_id,id_1,id_2,score. Registered with ScoreSet as 'csv'.
    #
    # The set is in at most one mode at a time (:read or :write); it must be
    # closed before switching modes.
    class CSV < ScoreSet
      # @param [String] filename path of the CSV file
      # @param [Hash] options
      # @option options [Boolean] :overwrite allow #open_for_writing to
      #   replace an existing file
      def initialize(filename, options = {})
        @filename = filename
        @overwrite = options[:overwrite]
      end

      # Opens the file for reading. Does nothing if already in read mode.
      #
      # @raise [RuntimeError] if currently open for writing
      # @raise [MissingError] if the file does not exist
      def open_for_reading
        raise "already open for writing, try closing first" if @mode == :write
        return if @mode == :read

        unless File.exist?(@filename)
          raise MissingError, "#{@filename} does not exist"
        end
        @csv = ::CSV.open(@filename, 'rb', :headers => true)
        @mode = :read
      end

      # Opens the file for writing and writes the header row. Does nothing
      # if already in write mode.
      #
      # @raise [RuntimeError] if currently open for reading
      # @raise [ExistsError] if the file exists and :overwrite was not set
      def open_for_writing
        raise "already open for reading, try closing first" if @mode == :read
        return if @mode == :write

        if !@overwrite && File.exist?(@filename)
          raise ExistsError, "#{@filename} exists and not in overwrite mode"
        end

        @csv = ::CSV.open(@filename, 'wb')
        @csv << %w{comparator_id id_1 id_2 score}
        @mode = :write
      end

      # Appends one score row.
      #
      # @raise [RuntimeError] unless the set is open for writing
      def add_score(comparator_id, id_1, id_2, score)
        raise "not in write mode" if @mode != :write
        @csv << [comparator_id, id_1, id_2, score]
      end

      # Reads the whole file, groups the scores by (id_1, id_2) and yields
      # each pair once.
      #
      # The set is opened for reading if necessary and is always closed
      # again afterwards, including when the block or the CSV parser raises,
      # so the file handle is not leaked and the set is not left stuck in
      # read mode.
      #
      # @yield [id_1, id_2, scores] ids are the raw CSV strings; scores is a
      #   Hash of comparator id (Integer) => score (Float)
      # @return [nil]
      def each_pair
        open_for_reading

        begin
          pairs = Hash.new { |h, k| h[k] = {} }
          @csv.each do |row|
            key = [row['id_1'], row['id_2']]
            pairs[key][row['comparator_id'].to_i] = row['score'].to_f
          end
          pairs.each_pair do |(id_1, id_2), scores|
            yield id_1, id_2, scores
          end
        ensure
          close
        end
        nil
      end

      # Leaves the current mode and closes the underlying file, if any.
      def close
        @mode = nil
        @csv.close if @csv
      end
    end

    ScoreSet.register('csv', CSV)
  end
end
|
@@ -0,0 +1,77 @@
|
|
1
|
+
module Linkage
  module ScoreSets
    # Score set stored in a database table with the columns comparator_id,
    # id_1, id_2 and score. Registered with ScoreSet as 'database'.
    #
    # The set is in at most one mode at a time (:read or :write); it must be
    # closed before switching modes.
    class Database < ScoreSet
      # @param database database object; the code calls #table_exists?,
      #   #drop_table?, #create_table and #[] on it
      # @param [Hash] options
      # @option options [Symbol] :table_name (:scores) table to use
      # @option options [Boolean] :overwrite drop an existing table when
      #   opening for writing
      def initialize(database, options = {})
        @database = database
        @table_name = options[:table_name] || :scores
        @overwrite = options[:overwrite]
      end

      # Prepares the table for reading. Does nothing if already in read
      # mode.
      #
      # @raise [RuntimeError] if currently open for writing
      # @raise [MissingError] if the table does not exist
      def open_for_reading
        raise "already open for writing, try closing first" if @mode == :write
        return if @mode == :read

        unless @database.table_exists?(@table_name)
          raise MissingError, "#{@table_name} table does not exist"
        end

        @dataset = @database[@table_name]
        @mode = :read
      end

      # Creates the table and prepares it for writing. Does nothing if
      # already in write mode.
      #
      # @raise [RuntimeError] if currently open for reading
      # @raise [ExistsError] if the table exists and :overwrite was not set
      def open_for_writing
        raise "already open for reading, try closing first" if @mode == :read
        return if @mode == :write

        if @overwrite
          @database.drop_table?(@table_name)
        elsif @database.table_exists?(@table_name)
          raise ExistsError, "#{@table_name} table exists and not in overwrite mode"
        end

        @database.create_table(@table_name) do
          Integer :comparator_id
          String :id_1
          String :id_2
          Float :score
        end
        @dataset = @database[@table_name]
        @mode = :write
      end

      # Inserts one score row.
      #
      # @raise [RuntimeError] unless the set is open for writing
      def add_score(comparator_id, id_1, id_2, score)
        raise "not in write mode" if @mode != :write

        @dataset.insert({
          :comparator_id => comparator_id,
          :id_1 => id_1,
          :id_2 => id_2,
          :score => score
        })
      end

      # Walks the table ordered by id_1, id_2, comparator_id and yields each
      # (id_1, id_2) pair once with all of its scores.
      #
      # The set is opened for reading if necessary and is always closed
      # again afterwards, including when the block raises, so a failure does
      # not leave the set stuck in read mode.
      #
      # @yield [id_1, id_2, scores] scores is a Hash of comparator id =>
      #   score
      # @return [nil]
      def each_pair
        open_for_reading

        begin
          current_pair = nil
          @dataset.order(:id_1, :id_2, :comparator_id).each do |row|
            # rows are sorted, so a change in either id starts a new pair
            if current_pair.nil? || current_pair[0] != row[:id_1] || current_pair[1] != row[:id_2]
              yield(*current_pair) unless current_pair.nil?
              current_pair = [row[:id_1], row[:id_2], {}]
            end
            current_pair[2][row[:comparator_id]] = row[:score]
          end
          # flush the final pair
          yield(*current_pair) unless current_pair.nil?
        ensure
          close
        end
        nil
      end

      # Leaves the current mode.
      def close
        @mode = nil
      end
    end

    ScoreSet.register('database', Database)
  end
end
|
data/lib/linkage/version.rb
CHANGED
data/lib/linkage.rb
CHANGED
@@ -1,31 +1,28 @@
|
|
1
1
|
require 'pathname'
|
2
|
+
require 'fileutils'
|
2
3
|
require 'delegate'
|
3
4
|
require 'sequel'
|
4
5
|
require 'hashery'
|
6
|
+
require 'observer'
|
5
7
|
|
6
8
|
module Linkage
|
7
9
|
end
|
8
10
|
|
9
11
|
path = Pathname.new(File.expand_path(File.dirname(__FILE__))) + 'linkage'
|
10
|
-
require path + '
|
11
|
-
require path + '
|
12
|
-
require path + 'warnings'
|
13
|
-
require path + 'decollation'
|
12
|
+
require path + 'comparator'
|
13
|
+
require path + 'configuration'
|
14
14
|
require path + 'dataset'
|
15
|
-
require path + '
|
16
|
-
require path + 'data'
|
15
|
+
require path + 'exceptions'
|
17
16
|
require path + 'field'
|
18
|
-
require path + '
|
19
|
-
require path + 'group'
|
17
|
+
require path + 'field_set'
|
20
18
|
require path + 'import_buffer'
|
21
|
-
require path + '
|
22
|
-
require path + '
|
23
|
-
require path + '
|
19
|
+
require path + 'match_recorder'
|
20
|
+
require path + 'match_set'
|
21
|
+
require path + 'matcher'
|
24
22
|
require path + 'result_set'
|
25
|
-
require path + '
|
26
|
-
require path + '
|
23
|
+
require path + 'runner'
|
24
|
+
require path + 'score_recorder'
|
25
|
+
require path + 'score_set'
|
26
|
+
require path + 'version'
|
27
27
|
|
28
|
-
Sequel.extension :
|
29
|
-
if Sequel::Collation.respond_to?(:suppress_warnings=)
|
30
|
-
Sequel::Collation.suppress_warnings = true
|
31
|
-
end
|
28
|
+
Sequel.extension :core_extensions
|
data/linkage.gemspec
CHANGED
@@ -2,6 +2,7 @@
|
|
2
2
|
require File.expand_path('../lib/linkage/version', __FILE__)
|
3
3
|
|
4
4
|
Gem::Specification.new do |gem|
|
5
|
+
gem.name = "linkage"
|
5
6
|
gem.authors = ["Jeremy Stephens"]
|
6
7
|
gem.email = ["jeremy.f.stephens@vanderbilt.edu"]
|
7
8
|
gem.description = %q{Performs record linkage between one or two datasets, using Sequel on the backend}
|
@@ -16,6 +17,17 @@ Gem::Specification.new do |gem|
|
|
16
17
|
gem.version = Linkage::VERSION
|
17
18
|
|
18
19
|
gem.add_dependency "sequel"
|
19
|
-
gem.add_dependency "sequel-collation"
|
20
20
|
gem.add_dependency "hashery"
|
21
|
+
|
22
|
+
gem.add_development_dependency "bundler", "~> 1.3"
|
23
|
+
gem.add_development_dependency "rake"
|
24
|
+
gem.add_development_dependency "test-unit"
|
25
|
+
gem.add_development_dependency "mocha"
|
26
|
+
gem.add_development_dependency "versionomy"
|
27
|
+
gem.add_development_dependency "sqlite3"
|
28
|
+
gem.add_development_dependency "mysql2"
|
29
|
+
gem.add_development_dependency "guard-test"
|
30
|
+
gem.add_development_dependency "guard-yard"
|
31
|
+
|
32
|
+
gem.required_ruby_version = '>= 1.9'
|
21
33
|
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
require File.expand_path('../lib/linkage/version', __FILE__)
|
3
|
+
|
4
|
+
# Gem specification for the JRuby ("java" platform) build of linkage.
Gem::Specification.new do |gem|
  gem.name          = "linkage"
  gem.authors       = ["Jeremy Stephens"]
  gem.email         = ["jeremy.f.stephens@vanderbilt.edu"]
  gem.description   = %q{Performs record linkage between one or two datasets, using Sequel on the backend}
  gem.summary       = %q{Record linkage library}
  gem.homepage      = "http://github.com/coupler/linkage"

  # Split on the input record separator ($/, a newline); $\ is nil by
  # default, which makes String#split break on any whitespace and so
  # mangles file names that contain spaces.
  gem.files         = `git ls-files`.split($/)
  gem.executables   = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]
  gem.version       = Linkage::VERSION
  gem.platform      = "java"

  gem.add_dependency "sequel"
  # NOTE(review): linkage.gemspec dropped sequel-collation in this release;
  # confirm whether the java build still needs it.
  gem.add_dependency "sequel-collation"
  gem.add_dependency "hashery"

  gem.add_development_dependency "bundler", "~> 1.3"
  gem.add_development_dependency "rake"
  gem.add_development_dependency "test-unit"
  gem.add_development_dependency "mocha"
  gem.add_development_dependency "versionomy"
  gem.add_development_dependency "jdbc-sqlite3"
  gem.add_development_dependency "jdbc-mysql"
  gem.add_development_dependency "guard-test"
end
|
data/test/helper.rb
CHANGED
@@ -14,6 +14,7 @@ require 'logger'
|
|
14
14
|
require 'pp'
|
15
15
|
require 'versionomy'
|
16
16
|
require 'erb'
|
17
|
+
require 'tempfile'
|
17
18
|
|
18
19
|
$LOAD_PATH.unshift(File.dirname(__FILE__))
|
19
20
|
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
|
@@ -46,8 +47,8 @@ class Test::Unit::TestCase
|
|
46
47
|
f
|
47
48
|
end
|
48
49
|
|
49
|
-
def
|
50
|
-
f =
|
50
|
+
def stub_instance(klass, options = {}, &block)
|
51
|
+
f = klass.allocate
|
51
52
|
f.stubs(options)
|
52
53
|
if block
|
53
54
|
f.send(:instance_eval, &block)
|
@@ -55,37 +56,43 @@ class Test::Unit::TestCase
|
|
55
56
|
f
|
56
57
|
end
|
57
58
|
|
58
|
-
def
|
59
|
-
|
60
|
-
|
61
|
-
if
|
62
|
-
|
59
|
+
def new_comparator(&block)
|
60
|
+
klass = Class.new(Linkage::Comparator)
|
61
|
+
klass.send(:define_method, :score) { |record_1, record_2| 1 }
|
62
|
+
if block_given?
|
63
|
+
klass.class_eval(&block)
|
63
64
|
end
|
64
|
-
|
65
|
+
klass
|
65
66
|
end
|
66
67
|
|
67
|
-
def
|
68
|
-
klass = Class.new(Linkage::
|
69
|
-
klass.send(:
|
70
|
-
if ruby_type
|
71
|
-
klass.send(:define_method, :ruby_type) { ruby_type }
|
68
|
+
def new_score_set(&block)
|
69
|
+
klass = Class.new(Linkage::ScoreSet)
|
70
|
+
klass.send(:define_method, :add_score) do |comparator_index, id_1, id_2, value|
|
72
71
|
end
|
73
|
-
|
74
|
-
|
72
|
+
klass.send(:define_method, :each_pair) do
|
73
|
+
end
|
74
|
+
if block_given?
|
75
|
+
klass.class_eval(&block)
|
75
76
|
end
|
76
77
|
klass
|
77
78
|
end
|
78
79
|
|
79
|
-
def
|
80
|
-
klass = Class.new(Linkage::
|
81
|
-
klass.send(:
|
82
|
-
|
83
|
-
|
80
|
+
# Builds an anonymous Linkage::MatchSet subclass for tests. Its #add_match
# takes three arguments and does nothing; an optional block is evaluated in
# the class body to add or override methods.
#
# @return [Class] the new subclass
def new_match_set(&block)
  match_set_class = Class.new(Linkage::MatchSet) do
    define_method(:add_match) { |id_1, id_2, value| }
  end
  match_set_class.class_eval(&block) if block_given?
  match_set_class
end
|
89
|
+
|
90
|
+
def new_result_set(&block)
|
91
|
+
klass = Class.new(Linkage::ResultSet)
|
92
|
+
klass.send(:define_method, :score_set) do
|
84
93
|
end
|
85
|
-
|
86
|
-
klass.send(:define_singleton_method, :score_range) { score_range }
|
94
|
+
klass.send(:define_method, :match_set) do
|
87
95
|
end
|
88
|
-
klass.send(:define_method, :score) { |record_1, record_2| 100 }
|
89
96
|
if block_given?
|
90
97
|
klass.class_eval(&block)
|
91
98
|
end
|
@@ -15,7 +15,7 @@ module IntegrationTests
|
|
15
15
|
FileUtils.remove_entry_secure(@tmpdir)
|
16
16
|
end
|
17
17
|
|
18
|
-
test "one
|
18
|
+
test "one field equality on single threaded runner" do
|
19
19
|
# insert the test data
|
20
20
|
database do |db|
|
21
21
|
db.create_table(:foo) { primary_key(:id); Integer(:foo); Integer(:bar) }
|
@@ -25,25 +25,32 @@ module IntegrationTests
|
|
25
25
|
|
26
26
|
ds = Linkage::Dataset.new(@tmpuri, "foo", :single_threaded => true)
|
27
27
|
|
28
|
-
|
29
|
-
conf = ds.link_with(ds) do
|
30
|
-
|
31
|
-
|
28
|
+
result_set = Linkage::ResultSet['csv'].new(@tmpdir)
|
29
|
+
conf = ds.link_with(ds, result_set) do |conf|
|
30
|
+
conf.compare([:foo], [:bar], :equal)
|
31
|
+
conf.algorithm = :mean
|
32
|
+
conf.threshold = 1
|
32
33
|
end
|
33
|
-
|
34
|
-
runner = Linkage::
|
34
|
+
|
35
|
+
runner = Linkage::Runner.new(conf)
|
35
36
|
runner.execute
|
36
37
|
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
38
|
+
score_csv = CSV.read(File.join(@tmpdir, 'scores.csv'), :headers => true)
|
39
|
+
assert_equal 1000, score_csv.length
|
40
|
+
score_csv.each do |row|
|
41
|
+
id_1 = row['id_1'].to_i
|
42
|
+
id_2 = row['id_2'].to_i
|
43
|
+
assert (id_1 % 10) == (id_2 % 5)
|
44
|
+
assert_equal "1", row['score']
|
45
|
+
end
|
42
46
|
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
+
match_csv = CSV.read(File.join(@tmpdir, 'matches.csv'), :headers => true)
|
48
|
+
assert_equal 1000, match_csv.length
|
49
|
+
match_csv.each do |row|
|
50
|
+
id_1 = row['id_1'].to_i
|
51
|
+
id_2 = row['id_2'].to_i
|
52
|
+
assert (id_1 % 10) == (id_2 % 5)
|
53
|
+
assert_equal "1", row['score']
|
47
54
|
end
|
48
55
|
end
|
49
56
|
|
@@ -54,19 +61,33 @@ module IntegrationTests
|
|
54
61
|
Array.new(100) { |i| [i, i % 10, i % 20] })
|
55
62
|
end
|
56
63
|
|
64
|
+
result_set = Linkage::ResultSet['csv'].new(@tmpdir)
|
57
65
|
ds = Linkage::Dataset.new(@tmpuri, "foo", :single_threaded => true)
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
66
|
+
ds_1 = ds.filter(:bar => 0)
|
67
|
+
ds_2 = ds.filter(:bar => 10)
|
68
|
+
conf = ds_1.link_with(ds_2, result_set) do |conf|
|
69
|
+
conf.compare([:foo], [:foo], :equal)
|
70
|
+
conf.algorithm = :mean
|
71
|
+
conf.threshold = 1
|
64
72
|
end
|
65
|
-
|
73
|
+
|
74
|
+
runner = Linkage::Runner.new(conf)
|
66
75
|
runner.execute
|
67
76
|
|
68
|
-
|
69
|
-
|
77
|
+
score_csv = CSV.read(File.join(@tmpdir, 'scores.csv'), :headers => true)
|
78
|
+
assert_equal 25, score_csv.length
|
79
|
+
score_csv.each do |row|
  id_1 = row['id_1'].to_i
  id_2 = row['id_2'].to_i
  # Both sides are compared on foo; the fixture rows are
  # [i, i % 10, i % 20] (presumably id, foo, bar), so the two ids of a
  # scored pair must agree modulo 10. The previous assertion compared id_1
  # with itself and could never fail.
  assert_equal id_1 % 10, id_2 % 10
end
|
84
|
+
|
85
|
+
match_csv = CSV.read(File.join(@tmpdir, 'matches.csv'), :headers => true)
|
86
|
+
assert_equal 25, match_csv.length
|
87
|
+
match_csv.each do |row|
  id_1 = row['id_1'].to_i
  id_2 = row['id_2'].to_i
  # Both sides are compared on foo; the fixture rows are
  # [i, i % 10, i % 20] (presumably id, foo, bar), so the two ids of a
  # matched pair must agree modulo 10. The previous assertion compared id_1
  # with itself and could never fail.
  assert_equal id_1 % 10, id_2 % 10
end
|
71
92
|
end
|
72
93
|
end
|