linkage 0.0.8 → 0.1.0.pre
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +1 -0
- data/.yardopts +1 -0
- data/Gemfile +1 -19
- data/Gemfile-java +3 -0
- data/README.markdown +88 -34
- data/Rakefile +16 -15
- data/TODO +4 -0
- data/lib/linkage/comparator.rb +139 -144
- data/lib/linkage/comparators/compare.rb +236 -29
- data/lib/linkage/comparators/strcompare.rb +85 -0
- data/lib/linkage/comparators/within.rb +24 -20
- data/lib/linkage/configuration.rb +44 -466
- data/lib/linkage/dataset.rb +28 -127
- data/lib/linkage/exceptions.rb +5 -0
- data/lib/linkage/field.rb +6 -37
- data/lib/linkage/field_set.rb +3 -3
- data/lib/linkage/match_recorder.rb +22 -0
- data/lib/linkage/match_set.rb +34 -0
- data/lib/linkage/match_sets/csv.rb +39 -0
- data/lib/linkage/match_sets/database.rb +45 -0
- data/lib/linkage/matcher.rb +30 -0
- data/lib/linkage/result_set.rb +25 -110
- data/lib/linkage/result_sets/csv.rb +54 -0
- data/lib/linkage/result_sets/database.rb +42 -0
- data/lib/linkage/runner.rb +57 -16
- data/lib/linkage/score_recorder.rb +30 -0
- data/lib/linkage/score_set.rb +49 -0
- data/lib/linkage/score_sets/csv.rb +64 -0
- data/lib/linkage/score_sets/database.rb +77 -0
- data/lib/linkage/version.rb +1 -1
- data/lib/linkage.rb +14 -17
- data/linkage.gemspec +13 -1
- data/linkage.gemspec-java +32 -0
- data/test/helper.rb +30 -23
- data/test/integration/test_cross_linkage.rb +46 -25
- data/test/integration/test_database_result_set.rb +55 -0
- data/test/integration/test_dual_linkage.rb +19 -94
- data/test/integration/test_self_linkage.rb +100 -203
- data/test/integration/test_within_comparator.rb +24 -77
- data/test/unit/comparators/test_compare.rb +254 -50
- data/test/unit/comparators/test_strcompare.rb +45 -0
- data/test/unit/comparators/test_within.rb +14 -26
- data/test/unit/match_sets/test_csv.rb +78 -0
- data/test/unit/match_sets/test_database.rb +63 -0
- data/test/unit/result_sets/test_csv.rb +111 -0
- data/test/unit/result_sets/test_database.rb +68 -0
- data/test/unit/score_sets/test_csv.rb +151 -0
- data/test/unit/score_sets/test_database.rb +149 -0
- data/test/unit/test_comparator.rb +46 -83
- data/test/unit/test_comparators.rb +4 -0
- data/test/unit/test_configuration.rb +99 -145
- data/test/unit/test_dataset.rb +52 -73
- data/test/unit/test_field.rb +4 -55
- data/test/unit/test_field_set.rb +6 -6
- data/test/unit/test_match_recorder.rb +23 -0
- data/test/unit/test_match_set.rb +23 -0
- data/test/unit/test_match_sets.rb +4 -0
- data/test/unit/test_matcher.rb +44 -0
- data/test/unit/test_result_set.rb +24 -223
- data/test/unit/test_result_sets.rb +4 -0
- data/test/unit/test_runner.rb +122 -17
- data/test/unit/test_runners.rb +4 -0
- data/test/unit/test_score_recorder.rb +25 -0
- data/test/unit/test_score_set.rb +37 -0
- data/test/unit/test_score_sets.rb +4 -0
- metadata +183 -90
- data/Gemfile.lock +0 -92
- data/lib/linkage/comparators/binary.rb +0 -12
- data/lib/linkage/data.rb +0 -175
- data/lib/linkage/decollation.rb +0 -93
- data/lib/linkage/expectation.rb +0 -21
- data/lib/linkage/expectations/exhaustive.rb +0 -63
- data/lib/linkage/expectations/simple.rb +0 -168
- data/lib/linkage/function.rb +0 -148
- data/lib/linkage/functions/binary.rb +0 -30
- data/lib/linkage/functions/cast.rb +0 -54
- data/lib/linkage/functions/length.rb +0 -29
- data/lib/linkage/functions/strftime.rb +0 -33
- data/lib/linkage/functions/trim.rb +0 -30
- data/lib/linkage/group.rb +0 -55
- data/lib/linkage/meta_object.rb +0 -139
- data/lib/linkage/runner/single_threaded.rb +0 -187
- data/lib/linkage/utils.rb +0 -164
- data/lib/linkage/warnings.rb +0 -5
- data/test/integration/test_collation.rb +0 -45
- data/test/integration/test_configuration.rb +0 -268
- data/test/integration/test_dataset.rb +0 -116
- data/test/integration/test_functions.rb +0 -88
- data/test/integration/test_result_set.rb +0 -85
- data/test/integration/test_scoring.rb +0 -84
- data/test/unit/expectations/test_exhaustive.rb +0 -111
- data/test/unit/expectations/test_simple.rb +0 -303
- data/test/unit/functions/test_binary.rb +0 -54
- data/test/unit/functions/test_cast.rb +0 -98
- data/test/unit/functions/test_length.rb +0 -52
- data/test/unit/functions/test_strftime.rb +0 -60
- data/test/unit/functions/test_trim.rb +0 -43
- data/test/unit/runner/test_single_threaded.rb +0 -12
- data/test/unit/test_data.rb +0 -445
- data/test/unit/test_decollation.rb +0 -201
- data/test/unit/test_function.rb +0 -233
- data/test/unit/test_group.rb +0 -38
- data/test/unit/test_meta_object.rb +0 -208
- data/test/unit/test_utils.rb +0 -341
@@ -0,0 +1,42 @@
|
|
1
|
+
module Linkage
  module ResultSets
    # Result set whose score set and match set are both the 'database'
    # implementations, sharing a single database connection.
    class Database < ResultSet
      # @param database_or_options [Sequel::Database, String, Hash] an existing
      #   connection, a connection string passed to +Sequel.connect+, or a hash
      #   of +Sequel.connect+ options. In the hash form the +:scores+ and
      #   +:matches+ entries are set aside and later handed to the score set
      #   and match set instead of being passed to Sequel.
      # @raise [ArgumentError] for any other argument, including the default
      #   +nil+
      def initialize(database_or_options = nil)
        @database = nil
        @options = {}

        # An already-open connection is used as-is.
        if database_or_options.kind_of?(Sequel::Database)
          @database = database_or_options
          return
        end

        connect_opts =
          case database_or_options
          when String
            database_or_options
          when Hash
            sequel_opts = {}
            database_or_options.each_pair do |key, value|
              # :scores and :matches configure the sets, not the connection.
              bucket = [:scores, :matches].include?(key) ? @options : sequel_opts
              bucket[key] = value
            end
            sequel_opts
          else
            raise ArgumentError, "expected Sequel::Database, a String, or a Hash, got #{database_or_options.class}"
          end

        @database = Sequel.connect(connect_opts)
      end

      # @return the memoized 'database' score set, built with the +:scores+
      #   options (or an empty hash)
      def score_set
        return @score_set if @score_set

        @score_set = ScoreSet['database'].new(@database, @options[:scores] || {})
      end

      # @return the memoized 'database' match set, built with the +:matches+
      #   options (or an empty hash)
      def match_set
        return @match_set if @match_set

        @match_set = MatchSet['database'].new(@database, @options[:matches] || {})
      end
    end

    ResultSet.register('database', Database)
  end
end
|
data/lib/linkage/runner.rb
CHANGED
@@ -1,31 +1,72 @@
|
|
1
1
|
module Linkage
  # Use this class to run a configuration created by {Dataset#link_with}.
  #
  # A run has two phases: records are scored by the configured comparators
  # while the score recorder is started, then the matcher is run while the
  # match recorder is started.
  class Runner
    attr_reader :config

    # @param [Linkage::Configuration] config
    # @see Dataset#link_with
    def initialize(config)
      @config = config
    end

    # Score the records, then match them.
    def execute
      score_records
      match_records
    end

    # Run every comparator between the score recorder's +start+ and +stop+.
    #
    # Comparators whose +type+ is not +:simple+ are handed the dataset(s) and
    # do their own iteration. Simple comparators are called once per record
    # pair. The recorder is stopped even when scoring raises, so it is never
    # left started after a failed run; the error itself still propagates.
    def score_records
      score_recorder = config.score_recorder
      score_recorder.start
      begin
        dataset_1 = config.dataset_1
        dataset_2 = config.dataset_2

        simple_comparators = []
        config.comparators.each do |comparator|
          if comparator.type == :simple
            simple_comparators << comparator
          elsif dataset_2
            comparator.score_datasets(dataset_1, dataset_2)
          else
            comparator.score_dataset(dataset_1)
          end
        end

        # Handle simple comparators
        unless simple_comparators.empty?
          each_record_pair(dataset_1, dataset_2) do |record_1, record_2|
            simple_comparators.each do |comparator|
              comparator.score_and_notify(record_1, record_2)
            end
          end
        end
      ensure
        score_recorder.stop
      end
    end

    # Run the configuration's matcher between the match recorder's +start+
    # and +stop+. The recorder is stopped even when the matcher raises.
    def match_records
      matcher = config.matcher
      match_recorder = config.match_recorder(matcher)
      match_recorder.start
      begin
        matcher.run
      ensure
        match_recorder.stop
      end
    end

    private

    # Yield each pair of records that the simple comparators have to score.
    #
    # With two datasets every record of the first is paired with every record
    # of the second. With one dataset each unordered pair of distinct records
    # is yielded once.
    def each_record_pair(dataset_1, dataset_2)
      if dataset_2
        # Two datasets
        dataset_1.each do |record_1|
          dataset_2.each do |record_2|
            yield record_1, record_2
          end
        end
      else
        # One dataset
        # NOTE: very naive implementation; all records are loaded at once
        records = dataset_1.all
        0.upto(records.length - 2) do |i|
          (i + 1).upto(records.length - 1) do |j|
            yield records[i], records[j]
          end
        end
      end
    end
  end
end
|
29
|
-
|
30
|
-
path = Pathname.new(File.expand_path(File.dirname(__FILE__))) + 'runner'
|
31
|
-
require path + 'single_threaded'
|
@@ -0,0 +1,30 @@
|
|
1
|
+
module Linkage
  # Observes a list of comparators and writes every score they report to a
  # score set.
  class ScoreRecorder
    # @param comparators [Array] comparators to observe. A comparator's
    #   position in this list, counted from 1, is the comparator id its scores
    #   are stored under.
    # @param score_set [ScoreSet] destination for scores; must respond to
    #   +open_for_writing+, +add_score+ and +close+
    # @param primary_keys [Array] two entries: the key used to read the
    #   identifier from the first record and from the second record
    def initialize(comparators, score_set, primary_keys)
      @comparators = comparators
      @score_set = score_set
      @primary_keys = primary_keys
    end

    # Open the score set for writing and start observing the comparators.
    #
    # The score set is opened first so that a failure to open it does not
    # leave this recorder attached to the comparators.
    def start
      @score_set.open_for_writing
      @comparators.each do |comparator|
        comparator.add_observer(self)
      end
    end

    # Observer callback: store one score.
    #
    # @param comparator the comparator reporting the score; must be one of the
    #   comparators given to the constructor
    # @param record_1 first record, indexed with the first primary key
    # @param record_2 second record, indexed with the second primary key
    # @param score the value to store
    def update(comparator, record_1, record_2, score)
      index = @comparators.index(comparator)
      primary_key_1 = record_1[@primary_keys[0]]
      primary_key_2 = record_2[@primary_keys[1]]
      # Comparator ids are 1-based.
      @score_set.add_score(index + 1, primary_key_1, primary_key_2, score)
    end

    # Close the score set and stop observing the comparators.
    #
    # The comparators are detached even when closing the score set raises.
    def stop
      @score_set.close
    ensure
      @comparators.each do |comparator|
        comparator.delete_observer(self)
      end
    end
  end
end
|
@@ -0,0 +1,49 @@
|
|
1
|
+
module Linkage
  # Abstract base class for score sets, and the registry through which score
  # set implementations are looked up by name.
  #
  # Subclasses must implement {#add_score} and {#each_pair}. The
  # +open_for_reading+, +open_for_writing+ and +close+ hooks do nothing here
  # and may be overridden.
  class ScoreSet
    # Register a score set.
    #
    # A class qualifies when it has public implementations of +#add_score+
    # and +#each_pair+ that do not come from ScoreSet itself. They may be
    # defined on the class or inherited from another concrete score set.
    #
    # @param [String] name key under which the class can be fetched with
    #   {ScoreSet.[]}
    # @param [Class] klass
    # @raise [ArgumentError] if either method is missing
    def self.register(name, klass)
      missing = [:add_score, :each_pair].reject do |method_name|
        # ScoreSet's own versions are abstract stubs and do not count.
        klass.method_defined?(method_name) &&
          klass.instance_method(method_name).owner != ScoreSet
      end
      unless missing.empty?
        labels = missing.map { |method_name| "##{method_name}" }
        raise ArgumentError, "class must define #{labels.join(" and ")}"
      end

      @score_sets ||= {}
      @score_sets[name] = klass
    end

    # @param name key given to {ScoreSet.register}
    # @return [Class, nil] the registered class, or nil if nothing is
    #   registered under that name
    def self.[](name)
      @score_sets ? @score_sets[name] : nil
    end

    # Hook called before scores are read; does nothing by default.
    def open_for_reading
    end

    # Hook called before scores are written; does nothing by default.
    def open_for_writing
    end

    # @abstract
    def add_score(comparator_id, id_1, id_2, value)
      raise NotImplementedError
    end

    # @abstract
    def each_pair(&block)
      raise NotImplementedError
    end

    # Hook called when reading or writing is finished; does nothing by
    # default.
    def close
    end
  end
end
|
47
|
+
|
48
|
+
require 'linkage/score_sets/csv'
|
49
|
+
require 'linkage/score_sets/database'
|
@@ -0,0 +1,64 @@
|
|
1
|
+
require 'csv'
|
2
|
+
|
3
|
+
module Linkage
  module ScoreSets
    # Score set stored in a CSV file with the columns +comparator_id+,
    # +id_1+, +id_2+ and +score+.
    class CSV < ScoreSet
      # @param filename [String] path of the CSV file
      # @param options [Hash]
      # @option options :overwrite when true, an existing file is replaced on
      #   {#open_for_writing} instead of causing an error
      def initialize(filename, options = {})
        @filename = filename
        @overwrite = options[:overwrite]
      end

      # Open the file for reading. Does nothing if already open for reading.
      #
      # @raise [RuntimeError] if currently open for writing
      # @raise [MissingError] if the file does not exist
      def open_for_reading
        raise "already open for writing, try closing first" if @mode == :write
        return if @mode == :read

        if !File.exist?(@filename)
          raise MissingError, "#{@filename} does not exist"
        end
        @csv = ::CSV.open(@filename, 'rb', :headers => true)
        @mode = :read
      end

      # Create the file and write the header row. Does nothing if already
      # open for writing.
      #
      # @raise [RuntimeError] if currently open for reading
      # @raise [ExistsError] if the file exists and +:overwrite+ was not set
      def open_for_writing
        raise "already open for reading, try closing first" if @mode == :read
        return if @mode == :write

        if !@overwrite && File.exist?(@filename)
          raise ExistsError, "#{@filename} exists and not in overwrite mode"
        end

        @csv = ::CSV.open(@filename, 'wb')
        @csv << %w{comparator_id id_1 id_2 score}
        @mode = :write
      end

      # Append one score row.
      #
      # @raise [RuntimeError] unless {#open_for_writing} was called first
      def add_score(comparator_id, id_1, id_2, score)
        raise "not in write mode" if @mode != :write
        @csv << [comparator_id, id_1, id_2, score]
      end

      # Read the whole file, group the scores by record pair, and yield each
      # pair once.
      #
      # The file is closed when iteration ends, including when the block
      # raises, so the set can be reopened afterwards.
      #
      # @yieldparam id_1 [String] first identifier, as read from the file
      # @yieldparam id_2 [String] second identifier, as read from the file
      # @yieldparam scores [Hash{Integer => Float}] score per comparator id
      def each_pair
        open_for_reading

        begin
          pairs = Hash.new { |h, k| h[k] = {} }
          @csv.each do |row|
            key = [row['id_1'], row['id_2']]
            score = row['score']
            pairs[key][row['comparator_id'].to_i] = score.to_f
          end
          pairs.each_pair do |pair, scores|
            yield pair[0], pair[1], scores
          end
        ensure
          close
        end
      end

      # Leave read/write mode and close the file if one is open. Safe to call
      # repeatedly.
      def close
        @mode = nil
        # Drop the reference before closing so a second call is a no-op.
        csv, @csv = @csv, nil
        csv.close if csv
      end
    end

    ScoreSet.register('csv', CSV)
  end
end
|
@@ -0,0 +1,77 @@
|
|
1
|
+
module Linkage
  module ScoreSets
    # Score set stored in a database table with the columns +comparator_id+,
    # +id_1+, +id_2+ and +score+.
    class Database < ScoreSet
      # @param database database connection; the code calls +table_exists?+,
      #   +drop_table?+, +create_table+ and +[]+ on it (a Sequel::Database
      #   when created through ResultSets::Database)
      # @param options [Hash]
      # @option options :table_name (:scores) name of the table
      # @option options :overwrite when true, an existing table is dropped on
      #   {#open_for_writing} instead of causing an error
      def initialize(database, options = {})
        @database = database
        @table_name = options[:table_name] || :scores
        @overwrite = options[:overwrite]
      end

      # Prepare for reading. Does nothing if already open for reading.
      #
      # @raise [RuntimeError] if currently open for writing
      # @raise [MissingError] if the table does not exist
      def open_for_reading
        raise "already open for writing, try closing first" if @mode == :write
        return if @mode == :read

        if !@database.table_exists?(@table_name)
          raise MissingError, "#{@table_name} table does not exist"
        end

        @dataset = @database[@table_name]
        @mode = :read
      end

      # Create the table and prepare for writing. Does nothing if already
      # open for writing.
      #
      # @raise [RuntimeError] if currently open for reading
      # @raise [ExistsError] if the table exists and +:overwrite+ was not set
      def open_for_writing
        raise "already open for reading, try closing first" if @mode == :read
        return if @mode == :write

        if @overwrite
          @database.drop_table?(@table_name)
        elsif @database.table_exists?(@table_name)
          raise ExistsError, "#{@table_name} table exists and not in overwrite mode"
        end

        @database.create_table(@table_name) do
          Integer :comparator_id
          String :id_1
          String :id_2
          Float :score
        end
        @dataset = @database[@table_name]
        @mode = :write
      end

      # Insert one score row.
      #
      # @raise [RuntimeError] unless {#open_for_writing} was called first
      def add_score(comparator_id, id_1, id_2, score)
        raise "not in write mode" if @mode != :write

        @dataset.insert({
          :comparator_id => comparator_id,
          :id_1 => id_1,
          :id_2 => id_2,
          :score => score
        })
      end

      # Walk the table ordered by pair and yield each record pair once with
      # all of its scores.
      #
      # Read mode is left when iteration ends, including when the block
      # raises, so the set can be reopened afterwards.
      #
      # @yieldparam id_1 first identifier of the pair
      # @yieldparam id_2 second identifier of the pair
      # @yieldparam scores [Hash] score per comparator id
      def each_pair
        open_for_reading

        begin
          current_pair = nil
          @dataset.order(:id_1, :id_2, :comparator_id).each do |row|
            # Rows arrive grouped by pair; a change of ids starts a new group.
            if current_pair.nil? || current_pair[0] != row[:id_1] || current_pair[1] != row[:id_2]
              yield(*current_pair) unless current_pair.nil?
              current_pair = [row[:id_1], row[:id_2], {}]
            end
            scores = current_pair[2]
            scores[row[:comparator_id]] = row[:score]
          end
          # Flush the final group.
          yield(*current_pair) unless current_pair.nil?
        ensure
          close
        end
      end

      # Leave read/write mode.
      def close
        @mode = nil
      end
    end

    ScoreSet.register('database', Database)
  end
end
|
data/lib/linkage/version.rb
CHANGED
data/lib/linkage.rb
CHANGED
@@ -1,31 +1,28 @@
|
|
1
1
|
require 'pathname'
|
2
|
+
require 'fileutils'
|
2
3
|
require 'delegate'
|
3
4
|
require 'sequel'
|
4
5
|
require 'hashery'
|
6
|
+
require 'observer'
|
5
7
|
|
6
8
|
module Linkage
|
7
9
|
end
|
8
10
|
|
9
11
|
path = Pathname.new(File.expand_path(File.dirname(__FILE__))) + 'linkage'
|
10
|
-
require path + '
|
11
|
-
require path + '
|
12
|
-
require path + 'warnings'
|
13
|
-
require path + 'decollation'
|
12
|
+
require path + 'comparator'
|
13
|
+
require path + 'configuration'
|
14
14
|
require path + 'dataset'
|
15
|
-
require path + '
|
16
|
-
require path + 'data'
|
15
|
+
require path + 'exceptions'
|
17
16
|
require path + 'field'
|
18
|
-
require path + '
|
19
|
-
require path + 'group'
|
17
|
+
require path + 'field_set'
|
20
18
|
require path + 'import_buffer'
|
21
|
-
require path + '
|
22
|
-
require path + '
|
23
|
-
require path + '
|
19
|
+
require path + 'match_recorder'
|
20
|
+
require path + 'match_set'
|
21
|
+
require path + 'matcher'
|
24
22
|
require path + 'result_set'
|
25
|
-
require path + '
|
26
|
-
require path + '
|
23
|
+
require path + 'runner'
|
24
|
+
require path + 'score_recorder'
|
25
|
+
require path + 'score_set'
|
26
|
+
require path + 'version'
|
27
27
|
|
28
|
-
Sequel.extension :
|
29
|
-
if Sequel::Collation.respond_to?(:suppress_warnings=)
|
30
|
-
Sequel::Collation.suppress_warnings = true
|
31
|
-
end
|
28
|
+
Sequel.extension :core_extensions
|
data/linkage.gemspec
CHANGED
@@ -2,6 +2,7 @@
|
|
2
2
|
require File.expand_path('../lib/linkage/version', __FILE__)
|
3
3
|
|
4
4
|
Gem::Specification.new do |gem|
|
5
|
+
gem.name = "linkage"
|
5
6
|
gem.authors = ["Jeremy Stephens"]
|
6
7
|
gem.email = ["jeremy.f.stephens@vanderbilt.edu"]
|
7
8
|
gem.description = %q{Performs record linkage between one or two datasets, using Sequel on the backend}
|
@@ -16,6 +17,17 @@ Gem::Specification.new do |gem|
|
|
16
17
|
gem.version = Linkage::VERSION
|
17
18
|
|
18
19
|
gem.add_dependency "sequel"
|
19
|
-
gem.add_dependency "sequel-collation"
|
20
20
|
gem.add_dependency "hashery"
|
21
|
+
|
22
|
+
gem.add_development_dependency "bundler", "~> 1.3"
|
23
|
+
gem.add_development_dependency "rake"
|
24
|
+
gem.add_development_dependency "test-unit"
|
25
|
+
gem.add_development_dependency "mocha"
|
26
|
+
gem.add_development_dependency "versionomy"
|
27
|
+
gem.add_development_dependency "sqlite3"
|
28
|
+
gem.add_development_dependency "mysql2"
|
29
|
+
gem.add_development_dependency "guard-test"
|
30
|
+
gem.add_development_dependency "guard-yard"
|
31
|
+
|
32
|
+
gem.required_ruby_version = '>= 1.9'
|
21
33
|
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
# Gem specification for the "java" platform build of linkage.
require File.expand_path('../lib/linkage/version', __FILE__)

Gem::Specification.new do |gem|
  gem.name          = "linkage"
  gem.authors       = ["Jeremy Stephens"]
  gem.email         = ["jeremy.f.stephens@vanderbilt.edu"]
  gem.description   = %q{Performs record linkage between one or two datasets, using Sequel on the backend}
  gem.summary       = %q{Record linkage library}
  gem.homepage      = "http://github.com/coupler/linkage"

  gem.files         = `git ls-files`.split($\)
  gem.executables   = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]
  gem.version       = Linkage::VERSION
  gem.platform      = "java"

  gem.add_dependency "sequel"
  # NOTE(review): the non-java gemspec in this release no longer depends on
  # sequel-collation; confirm whether it is still needed here.
  gem.add_dependency "sequel-collation"
  gem.add_dependency "hashery"

  gem.add_development_dependency "bundler", "~> 1.3"
  gem.add_development_dependency "rake"
  gem.add_development_dependency "test-unit"
  gem.add_development_dependency "mocha"
  gem.add_development_dependency "versionomy"
  gem.add_development_dependency "jdbc-sqlite3"
  gem.add_development_dependency "jdbc-mysql"
  gem.add_development_dependency "guard-test"
end
|
data/test/helper.rb
CHANGED
@@ -14,6 +14,7 @@ require 'logger'
|
|
14
14
|
require 'pp'
|
15
15
|
require 'versionomy'
|
16
16
|
require 'erb'
|
17
|
+
require 'tempfile'
|
17
18
|
|
18
19
|
$LOAD_PATH.unshift(File.dirname(__FILE__))
|
19
20
|
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
|
@@ -46,8 +47,8 @@ class Test::Unit::TestCase
|
|
46
47
|
f
|
47
48
|
end
|
48
49
|
|
49
|
-
def
|
50
|
-
f =
|
50
|
+
def stub_instance(klass, options = {}, &block)
|
51
|
+
f = klass.allocate
|
51
52
|
f.stubs(options)
|
52
53
|
if block
|
53
54
|
f.send(:instance_eval, &block)
|
@@ -55,37 +56,43 @@ class Test::Unit::TestCase
|
|
55
56
|
f
|
56
57
|
end
|
57
58
|
|
58
|
-
def
|
59
|
-
|
60
|
-
|
61
|
-
if
|
62
|
-
|
59
|
+
def new_comparator(&block)
|
60
|
+
klass = Class.new(Linkage::Comparator)
|
61
|
+
klass.send(:define_method, :score) { |record_1, record_2| 1 }
|
62
|
+
if block_given?
|
63
|
+
klass.class_eval(&block)
|
63
64
|
end
|
64
|
-
|
65
|
+
klass
|
65
66
|
end
|
66
67
|
|
67
|
-
def
|
68
|
-
klass = Class.new(Linkage::
|
69
|
-
klass.send(:
|
70
|
-
if ruby_type
|
71
|
-
klass.send(:define_method, :ruby_type) { ruby_type }
|
68
|
+
def new_score_set(&block)
|
69
|
+
klass = Class.new(Linkage::ScoreSet)
|
70
|
+
klass.send(:define_method, :add_score) do |comparator_index, id_1, id_2, value|
|
72
71
|
end
|
73
|
-
|
74
|
-
|
72
|
+
klass.send(:define_method, :each_pair) do
|
73
|
+
end
|
74
|
+
if block_given?
|
75
|
+
klass.class_eval(&block)
|
75
76
|
end
|
76
77
|
klass
|
77
78
|
end
|
78
79
|
|
79
|
-
def
|
80
|
-
klass = Class.new(Linkage::
|
81
|
-
klass.send(:
|
82
|
-
|
83
|
-
|
80
|
+
def new_match_set(&block)
|
81
|
+
klass = Class.new(Linkage::MatchSet)
|
82
|
+
klass.send(:define_method, :add_match) do |id_1, id_2, value|
|
83
|
+
end
|
84
|
+
if block_given?
|
85
|
+
klass.class_eval(&block)
|
86
|
+
end
|
87
|
+
klass
|
88
|
+
end
|
89
|
+
|
90
|
+
def new_result_set(&block)
|
91
|
+
klass = Class.new(Linkage::ResultSet)
|
92
|
+
klass.send(:define_method, :score_set) do
|
84
93
|
end
|
85
|
-
|
86
|
-
klass.send(:define_singleton_method, :score_range) { score_range }
|
94
|
+
klass.send(:define_method, :match_set) do
|
87
95
|
end
|
88
|
-
klass.send(:define_method, :score) { |record_1, record_2| 100 }
|
89
96
|
if block_given?
|
90
97
|
klass.class_eval(&block)
|
91
98
|
end
|
@@ -15,7 +15,7 @@ module IntegrationTests
|
|
15
15
|
FileUtils.remove_entry_secure(@tmpdir)
|
16
16
|
end
|
17
17
|
|
18
|
-
test "one
|
18
|
+
test "one field equality on single threaded runner" do
|
19
19
|
# insert the test data
|
20
20
|
database do |db|
|
21
21
|
db.create_table(:foo) { primary_key(:id); Integer(:foo); Integer(:bar) }
|
@@ -25,25 +25,32 @@ module IntegrationTests
|
|
25
25
|
|
26
26
|
ds = Linkage::Dataset.new(@tmpuri, "foo", :single_threaded => true)
|
27
27
|
|
28
|
-
|
29
|
-
conf = ds.link_with(ds) do
|
30
|
-
|
31
|
-
|
28
|
+
result_set = Linkage::ResultSet['csv'].new(@tmpdir)
|
29
|
+
conf = ds.link_with(ds, result_set) do |conf|
|
30
|
+
conf.compare([:foo], [:bar], :equal)
|
31
|
+
conf.algorithm = :mean
|
32
|
+
conf.threshold = 1
|
32
33
|
end
|
33
|
-
|
34
|
-
runner = Linkage::
|
34
|
+
|
35
|
+
runner = Linkage::Runner.new(conf)
|
35
36
|
runner.execute
|
36
37
|
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
38
|
+
score_csv = CSV.read(File.join(@tmpdir, 'scores.csv'), :headers => true)
|
39
|
+
assert_equal 1000, score_csv.length
|
40
|
+
score_csv.each do |row|
|
41
|
+
id_1 = row['id_1'].to_i
|
42
|
+
id_2 = row['id_2'].to_i
|
43
|
+
assert (id_1 % 10) == (id_2 % 5)
|
44
|
+
assert_equal "1", row['score']
|
45
|
+
end
|
42
46
|
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
+
match_csv = CSV.read(File.join(@tmpdir, 'matches.csv'), :headers => true)
|
48
|
+
assert_equal 1000, match_csv.length
|
49
|
+
match_csv.each do |row|
|
50
|
+
id_1 = row['id_1'].to_i
|
51
|
+
id_2 = row['id_2'].to_i
|
52
|
+
assert (id_1 % 10) == (id_2 % 5)
|
53
|
+
assert_equal "1", row['score']
|
47
54
|
end
|
48
55
|
end
|
49
56
|
|
@@ -54,19 +61,33 @@ module IntegrationTests
|
|
54
61
|
Array.new(100) { |i| [i, i % 10, i % 20] })
|
55
62
|
end
|
56
63
|
|
64
|
+
result_set = Linkage::ResultSet['csv'].new(@tmpdir)
|
57
65
|
ds = Linkage::Dataset.new(@tmpuri, "foo", :single_threaded => true)
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
66
|
+
ds_1 = ds.filter(:bar => 0)
|
67
|
+
ds_2 = ds.filter(:bar => 10)
|
68
|
+
conf = ds_1.link_with(ds_2, result_set) do |conf|
|
69
|
+
conf.compare([:foo], [:foo], :equal)
|
70
|
+
conf.algorithm = :mean
|
71
|
+
conf.threshold = 1
|
64
72
|
end
|
65
|
-
|
73
|
+
|
74
|
+
runner = Linkage::Runner.new(conf)
|
66
75
|
runner.execute
|
67
76
|
|
68
|
-
|
69
|
-
|
77
|
+
score_csv = CSV.read(File.join(@tmpdir, 'scores.csv'), :headers => true)
|
78
|
+
assert_equal 25, score_csv.length
|
79
|
+
score_csv.each do |row|
|
80
|
+
id_1 = row['id_1'].to_i
|
81
|
+
id_2 = row['id_2'].to_i
|
82
|
+
assert (id_1 % 10) == (id_1 % 10)
|
83
|
+
end
|
84
|
+
|
85
|
+
match_csv = CSV.read(File.join(@tmpdir, 'matches.csv'), :headers => true)
|
86
|
+
assert_equal 25, match_csv.length
|
87
|
+
match_csv.each do |row|
|
88
|
+
id_1 = row['id_1'].to_i
|
89
|
+
id_2 = row['id_2'].to_i
|
90
|
+
assert (id_1 % 10) == (id_1 % 10)
|
70
91
|
end
|
71
92
|
end
|
72
93
|
end
|