linkage 0.0.8 → 0.1.0.pre

Sign up to get free protection for your applications and to get access to all the features.
Files changed (105) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +1 -0
  3. data/.yardopts +1 -0
  4. data/Gemfile +1 -19
  5. data/Gemfile-java +3 -0
  6. data/README.markdown +88 -34
  7. data/Rakefile +16 -15
  8. data/TODO +4 -0
  9. data/lib/linkage/comparator.rb +139 -144
  10. data/lib/linkage/comparators/compare.rb +236 -29
  11. data/lib/linkage/comparators/strcompare.rb +85 -0
  12. data/lib/linkage/comparators/within.rb +24 -20
  13. data/lib/linkage/configuration.rb +44 -466
  14. data/lib/linkage/dataset.rb +28 -127
  15. data/lib/linkage/exceptions.rb +5 -0
  16. data/lib/linkage/field.rb +6 -37
  17. data/lib/linkage/field_set.rb +3 -3
  18. data/lib/linkage/match_recorder.rb +22 -0
  19. data/lib/linkage/match_set.rb +34 -0
  20. data/lib/linkage/match_sets/csv.rb +39 -0
  21. data/lib/linkage/match_sets/database.rb +45 -0
  22. data/lib/linkage/matcher.rb +30 -0
  23. data/lib/linkage/result_set.rb +25 -110
  24. data/lib/linkage/result_sets/csv.rb +54 -0
  25. data/lib/linkage/result_sets/database.rb +42 -0
  26. data/lib/linkage/runner.rb +57 -16
  27. data/lib/linkage/score_recorder.rb +30 -0
  28. data/lib/linkage/score_set.rb +49 -0
  29. data/lib/linkage/score_sets/csv.rb +64 -0
  30. data/lib/linkage/score_sets/database.rb +77 -0
  31. data/lib/linkage/version.rb +1 -1
  32. data/lib/linkage.rb +14 -17
  33. data/linkage.gemspec +13 -1
  34. data/linkage.gemspec-java +32 -0
  35. data/test/helper.rb +30 -23
  36. data/test/integration/test_cross_linkage.rb +46 -25
  37. data/test/integration/test_database_result_set.rb +55 -0
  38. data/test/integration/test_dual_linkage.rb +19 -94
  39. data/test/integration/test_self_linkage.rb +100 -203
  40. data/test/integration/test_within_comparator.rb +24 -77
  41. data/test/unit/comparators/test_compare.rb +254 -50
  42. data/test/unit/comparators/test_strcompare.rb +45 -0
  43. data/test/unit/comparators/test_within.rb +14 -26
  44. data/test/unit/match_sets/test_csv.rb +78 -0
  45. data/test/unit/match_sets/test_database.rb +63 -0
  46. data/test/unit/result_sets/test_csv.rb +111 -0
  47. data/test/unit/result_sets/test_database.rb +68 -0
  48. data/test/unit/score_sets/test_csv.rb +151 -0
  49. data/test/unit/score_sets/test_database.rb +149 -0
  50. data/test/unit/test_comparator.rb +46 -83
  51. data/test/unit/test_comparators.rb +4 -0
  52. data/test/unit/test_configuration.rb +99 -145
  53. data/test/unit/test_dataset.rb +52 -73
  54. data/test/unit/test_field.rb +4 -55
  55. data/test/unit/test_field_set.rb +6 -6
  56. data/test/unit/test_match_recorder.rb +23 -0
  57. data/test/unit/test_match_set.rb +23 -0
  58. data/test/unit/test_match_sets.rb +4 -0
  59. data/test/unit/test_matcher.rb +44 -0
  60. data/test/unit/test_result_set.rb +24 -223
  61. data/test/unit/test_result_sets.rb +4 -0
  62. data/test/unit/test_runner.rb +122 -17
  63. data/test/unit/test_runners.rb +4 -0
  64. data/test/unit/test_score_recorder.rb +25 -0
  65. data/test/unit/test_score_set.rb +37 -0
  66. data/test/unit/test_score_sets.rb +4 -0
  67. metadata +183 -90
  68. data/Gemfile.lock +0 -92
  69. data/lib/linkage/comparators/binary.rb +0 -12
  70. data/lib/linkage/data.rb +0 -175
  71. data/lib/linkage/decollation.rb +0 -93
  72. data/lib/linkage/expectation.rb +0 -21
  73. data/lib/linkage/expectations/exhaustive.rb +0 -63
  74. data/lib/linkage/expectations/simple.rb +0 -168
  75. data/lib/linkage/function.rb +0 -148
  76. data/lib/linkage/functions/binary.rb +0 -30
  77. data/lib/linkage/functions/cast.rb +0 -54
  78. data/lib/linkage/functions/length.rb +0 -29
  79. data/lib/linkage/functions/strftime.rb +0 -33
  80. data/lib/linkage/functions/trim.rb +0 -30
  81. data/lib/linkage/group.rb +0 -55
  82. data/lib/linkage/meta_object.rb +0 -139
  83. data/lib/linkage/runner/single_threaded.rb +0 -187
  84. data/lib/linkage/utils.rb +0 -164
  85. data/lib/linkage/warnings.rb +0 -5
  86. data/test/integration/test_collation.rb +0 -45
  87. data/test/integration/test_configuration.rb +0 -268
  88. data/test/integration/test_dataset.rb +0 -116
  89. data/test/integration/test_functions.rb +0 -88
  90. data/test/integration/test_result_set.rb +0 -85
  91. data/test/integration/test_scoring.rb +0 -84
  92. data/test/unit/expectations/test_exhaustive.rb +0 -111
  93. data/test/unit/expectations/test_simple.rb +0 -303
  94. data/test/unit/functions/test_binary.rb +0 -54
  95. data/test/unit/functions/test_cast.rb +0 -98
  96. data/test/unit/functions/test_length.rb +0 -52
  97. data/test/unit/functions/test_strftime.rb +0 -60
  98. data/test/unit/functions/test_trim.rb +0 -43
  99. data/test/unit/runner/test_single_threaded.rb +0 -12
  100. data/test/unit/test_data.rb +0 -445
  101. data/test/unit/test_decollation.rb +0 -201
  102. data/test/unit/test_function.rb +0 -233
  103. data/test/unit/test_group.rb +0 -38
  104. data/test/unit/test_meta_object.rb +0 -208
  105. data/test/unit/test_utils.rb +0 -341
@@ -0,0 +1,42 @@
module Linkage
  module ResultSets
    # Result set that builds its score set and its match set on top of a
    # single Sequel database connection.
    class Database < ResultSet
      # @param [Sequel::Database, String, Hash] database_or_options either an
      #   existing connection, or a String/Hash that is handed to
      #   +Sequel.connect+. In the Hash form the +:scores+ and +:matches+
      #   entries are set aside (they are later given to the score set and
      #   match set) and every other entry is passed to +Sequel.connect+.
      # @raise [ArgumentError] for any other argument, including the default
      #   +nil+
      def initialize(database_or_options = nil)
        @options = {}
        @database =
          case database_or_options
          when Sequel::Database
            database_or_options
          when String
            Sequel.connect(database_or_options)
          when Hash
            connection_opts = {}
            database_or_options.each_pair do |key, value|
              bucket = [:scores, :matches].include?(key) ? @options : connection_opts
              bucket[key] = value
            end
            Sequel.connect(connection_opts)
          else
            raise ArgumentError, "expected Sequel::Database, a String, or a Hash, got #{database_or_options.class}"
          end
      end

      # Lazily builds (and then reuses) the score set registered under
      # 'database', giving it the connection and the +:scores+ options.
      def score_set
        return @score_set if @score_set

        score_options = @options[:scores] || {}
        @score_set = ScoreSet['database'].new(@database, score_options)
      end

      # Lazily builds (and then reuses) the match set registered under
      # 'database', giving it the connection and the +:matches+ options.
      def match_set
        return @match_set if @match_set

        match_options = @options[:matches] || {}
        @match_set = MatchSet['database'].new(@database, match_options)
      end
    end

    ResultSet.register('database', Database)
  end
end
@@ -1,31 +1,72 @@
module Linkage
  # Use this class to run a configuration created by {Dataset#link_with}.
  class Runner
    attr_reader :config

    # @param [Linkage::Configuration] config
    # @see Dataset#link_with
    def initialize(config)
      @config = config
    end

    # Runs the linkage: first {#score_records}, then {#match_records}.
    def execute
      score_records
      match_records
    end

    # Scores record pairs with every configured comparator while the
    # configuration's score recorder is started.
    #
    # Comparators whose +type+ is not +:simple+ are handed the dataset(s)
    # directly (+score_datasets+ when a second dataset is configured,
    # +score_dataset+ otherwise). Comparators whose +type+ is +:simple+ are
    # then called once per record pair.
    #
    # The recorder is stopped even when scoring raises, so a failed run does
    # not leave it started; the original error still propagates.
    def score_records
      score_recorder = config.score_recorder
      score_recorder.start
      begin
        dataset_1 = config.dataset_1
        dataset_2 = config.dataset_2
        simple_comparators, dataset_comparators =
          config.comparators.partition { |comparator| comparator.type == :simple }

        dataset_comparators.each do |comparator|
          if dataset_2
            comparator.score_datasets(dataset_1, dataset_2)
          else
            comparator.score_dataset(dataset_1)
          end
        end

        score_pairs(simple_comparators, dataset_1, dataset_2)
      ensure
        score_recorder.stop
      end
    end

    # Runs the configuration's matcher while its match recorder is started.
    # The recorder is stopped even when the matcher raises.
    def match_records
      matcher = config.matcher
      match_recorder = config.match_recorder(matcher)
      match_recorder.start
      begin
        matcher.run
      ensure
        match_recorder.stop
      end
    end

    private

    # Calls +score_and_notify+ on each simple comparator for every record
    # pair: each dataset_1 record with each dataset_2 record when two
    # datasets are given, otherwise each unordered pair of distinct
    # dataset_1 records.
    def score_pairs(comparators, dataset_1, dataset_2)
      return if comparators.empty?

      if dataset_2
        dataset_1.each do |record_1|
          dataset_2.each do |record_2|
            comparators.each do |comparator|
              comparator.score_and_notify(record_1, record_2)
            end
          end
        end
      else
        # NOTE: very naive implementation; every record is loaded up front.
        # combination(2) yields pairs in the same order as nested index loops
        # (i < j), and nothing for fewer than two records.
        dataset_1.all.combination(2) do |record_1, record_2|
          comparators.each do |comparator|
            comparator.score_and_notify(record_1, record_2)
          end
        end
      end
    end
  end
end
29
-
30
- path = Pathname.new(File.expand_path(File.dirname(__FILE__))) + 'runner'
31
- require path + 'single_threaded'
@@ -0,0 +1,30 @@
module Linkage
  # Observes a list of comparators and writes every score they report into a
  # score set.
  class ScoreRecorder
    # @param [Array] comparators comparators to observe; a comparator's
    #   1-based position in this list is the id its scores are stored under
    # @param score_set object receiving +open_for_writing+, +add_score+ and
    #   +close+
    # @param [Array] primary_keys two keys: the first is looked up in the
    #   first record of a pair, the second in the second record
    def initialize(comparators, score_set, primary_keys)
      @comparators = comparators
      @score_set = score_set
      @primary_keys = primary_keys
    end

    # Opens the score set for writing and starts observing the comparators.
    #
    # The score set is opened first so that a failure to open it does not
    # leave this recorder registered as an observer.
    def start
      @score_set.open_for_writing
      @comparators.each do |comparator|
        comparator.add_observer(self)
      end
    end

    # Observer callback: stores one score.
    #
    # @raise [ArgumentError] if +comparator+ is not one of the comparators
    #   this recorder was created with
    def update(comparator, record_1, record_2, score)
      index = @comparators.index(comparator)
      if index.nil?
        raise ArgumentError, "comparator is not handled by this score recorder"
      end

      primary_key_1 = record_1[@primary_keys[0]]
      primary_key_2 = record_2[@primary_keys[1]]
      @score_set.add_score(index + 1, primary_key_1, primary_key_2, score)
    end

    # Closes the score set and stops observing the comparators. Observers are
    # removed even when closing the score set raises.
    def stop
      @score_set.close
    ensure
      @comparators.each do |comparator|
        comparator.delete_observer(self)
      end
    end
  end
end
@@ -0,0 +1,49 @@
module Linkage
  # Abstract base class for score storage, plus a name => class registry of
  # concrete score sets.
  class ScoreSet
    # Register a score set.
    #
    # The class must provide its own +#add_score+ and +#each_pair+; the
    # abstract versions defined on ScoreSet itself do not count, but
    # implementations inherited from an intermediate class do.
    #
    # @param name key under which the class can later be fetched with {.[]}
    # @param [Class] klass
    # @raise [ArgumentError] if either method is missing
    def self.register(name, klass)
      missing = [:add_score, :each_pair].reject do |method_name|
        implements?(klass, method_name)
      end
      unless missing.empty?
        labels = missing.map { |method_name| "##{method_name}" }
        raise ArgumentError, "class must define #{labels.join(" and ")}"
      end

      @score_sets ||= {}
      @score_sets[name] = klass
    end

    # @return the class registered under +name+, or nil when there is none
    def self.[](name)
      @score_sets ? @score_sets[name] : nil
    end

    # True when klass has an implementation of method_name that is not the
    # abstract one defined on ScoreSet.
    def self.implements?(klass, method_name)
      klass.method_defined?(method_name) &&
        klass.instance_method(method_name).owner != ScoreSet
    end
    private_class_method :implements?

    # Hook called before reading; does nothing by default.
    def open_for_reading
    end

    # Hook called before writing; does nothing by default.
    def open_for_writing
    end

    # @abstract
    def add_score(comparator_id, id_1, id_2, value)
      raise NotImplementedError
    end

    # @abstract
    def each_pair(&block)
      raise NotImplementedError
    end

    # Hook called when reading or writing is finished; does nothing by
    # default.
    def close
    end
  end
end
47
+
48
+ require 'linkage/score_sets/csv'
49
+ require 'linkage/score_sets/database'
@@ -0,0 +1,64 @@
1
+ require 'csv'
2
+
module Linkage
  module ScoreSets
    # Score set stored in a CSV file with the columns
    # comparator_id, id_1, id_2 and score.
    class CSV < ScoreSet
      # @param [String] filename path of the CSV file
      # @param [Hash] options
      # @option options :overwrite when truthy, an existing file may be
      #   replaced by {#open_for_writing}
      def initialize(filename, options = {})
        @filename = filename
        @overwrite = options[:overwrite]
      end

      # Opens the file for reading; does nothing if already in read mode.
      #
      # @raise [RuntimeError] if currently open for writing
      # @raise [MissingError] if the file does not exist
      def open_for_reading
        raise "already open for writing, try closing first" if @mode == :write
        return if @mode == :read

        if !File.exist?(@filename)
          raise MissingError, "#{@filename} does not exist"
        end
        @csv = ::CSV.open(@filename, 'rb', :headers => true)
        @mode = :read
      end

      # Opens the file for writing and writes the header row; does nothing
      # if already in write mode.
      #
      # @raise [RuntimeError] if currently open for reading
      # @raise [ExistsError] if the file exists and overwrite was not
      #   requested
      def open_for_writing
        raise "already open for reading, try closing first" if @mode == :read
        return if @mode == :write

        if !@overwrite && File.exist?(@filename)
          raise ExistsError, "#{@filename} exists and not in overwrite mode"
        end

        @csv = ::CSV.open(@filename, 'wb')
        @csv << %w{comparator_id id_1 id_2 score}
        @mode = :write
      end

      # Appends one score row.
      #
      # @raise [RuntimeError] unless open for writing
      def add_score(comparator_id, id_1, id_2, score)
        raise "not in write mode" if @mode != :write
        @csv << [comparator_id, id_1, id_2, score]
      end

      # Reads the whole file, groups the scores by (id_1, id_2) and yields
      # id_1, id_2 and a Hash of comparator id (Integer) => score (Float)
      # for each pair. Ids are yielded as read from the file.
      #
      # The file is closed afterwards, also when the block raises, so the
      # set is not left stuck in read mode with an open handle.
      def each_pair
        open_for_reading

        begin
          pairs = Hash.new { |h, k| h[k] = {} }
          @csv.each do |row|
            key = [row['id_1'], row['id_2']]
            score = row['score']
            pairs[key][row['comparator_id'].to_i] = score.to_f
          end
          pairs.each_pair do |pair, scores|
            yield pair[0], pair[1], scores
          end
        ensure
          close
        end
      end

      # Leaves read/write mode and closes the file if one is open. The
      # handle is dropped so that closing twice does not touch a closed
      # stream.
      def close
        @mode = nil
        if @csv
          @csv.close
          @csv = nil
        end
      end
    end

    ScoreSet.register('csv', CSV)
  end
end
@@ -0,0 +1,77 @@
module Linkage
  module ScoreSets
    # Score set stored in a database table with the columns
    # comparator_id (Integer), id_1 (String), id_2 (String) and
    # score (Float).
    class Database < ScoreSet
      # @param database connection used to create, drop and query the table
      # @param [Hash] options
      # @option options :table_name table to use (default: +:scores+)
      # @option options :overwrite when truthy, an existing table is dropped
      #   by {#open_for_writing}
      def initialize(database, options = {})
        @database = database
        @table_name = options[:table_name] || :scores
        @overwrite = options[:overwrite]
      end

      # Prepares the table for reading; does nothing if already in read
      # mode.
      #
      # @raise [RuntimeError] if currently open for writing
      # @raise [MissingError] if the table does not exist
      def open_for_reading
        raise "already open for writing, try closing first" if @mode == :write
        return if @mode == :read

        if !@database.table_exists?(@table_name)
          raise MissingError, "#{@table_name} table does not exist"
        end

        @dataset = @database[@table_name]
        @mode = :read
      end

      # Creates the table (dropping an existing one first in overwrite
      # mode); does nothing if already in write mode.
      #
      # @raise [RuntimeError] if currently open for reading
      # @raise [ExistsError] if the table exists and overwrite was not
      #   requested
      def open_for_writing
        raise "already open for reading, try closing first" if @mode == :read
        return if @mode == :write

        if @overwrite
          @database.drop_table?(@table_name)
        elsif @database.table_exists?(@table_name)
          raise ExistsError, "#{@table_name} table exists and not in overwrite mode"
        end

        @database.create_table(@table_name) do
          Integer :comparator_id
          String :id_1
          String :id_2
          Float :score
        end
        @dataset = @database[@table_name]
        @mode = :write
      end

      # Inserts one score row.
      #
      # @raise [RuntimeError] unless open for writing
      def add_score(comparator_id, id_1, id_2, score)
        raise "not in write mode" if @mode != :write

        @dataset.insert({
          :comparator_id => comparator_id,
          :id_1 => id_1,
          :id_2 => id_2,
          :score => score
        })
      end

      # Walks the rows ordered by id_1, id_2 and comparator_id, and yields
      # id_1, id_2 and a Hash of comparator_id => score once per distinct
      # (id_1, id_2) pair.
      #
      # Read mode is left afterwards, also when the block raises, so a
      # failed iteration does not block a later {#open_for_writing}.
      def each_pair
        open_for_reading

        begin
          current_pair = nil
          @dataset.order(:id_1, :id_2, :comparator_id).each do |row|
            if current_pair.nil? || current_pair[0] != row[:id_1] || current_pair[1] != row[:id_2]
              # a new pair starts: hand over the one collected so far
              yield(*current_pair) unless current_pair.nil?
              current_pair = [row[:id_1], row[:id_2], {}]
            end
            scores = current_pair[2]
            scores[row[:comparator_id]] = row[:score]
          end
          # the last pair has no following row to trigger its yield
          yield(*current_pair) unless current_pair.nil?
        ensure
          close
        end
      end

      # Leaves read/write mode.
      def close
        @mode = nil
      end
    end

    ScoreSet.register('database', Database)
  end
end
@@ -1,3 +1,3 @@
module Linkage
  # Version of the linkage gem. Frozen so the shared constant cannot be
  # modified in place.
  VERSION = "0.1.0.pre".freeze
end
data/lib/linkage.rb CHANGED
@@ -1,31 +1,28 @@
1
1
  require 'pathname'
2
+ require 'fileutils'
2
3
  require 'delegate'
3
4
  require 'sequel'
4
5
  require 'hashery'
6
+ require 'observer'
5
7
 
6
8
  module Linkage
7
9
  end
8
10
 
9
11
  path = Pathname.new(File.expand_path(File.dirname(__FILE__))) + 'linkage'
10
- require path + 'version'
11
- require path + 'utils'
12
- require path + 'warnings'
13
- require path + 'decollation'
12
+ require path + 'comparator'
13
+ require path + 'configuration'
14
14
  require path + 'dataset'
15
- require path + 'runner'
16
- require path + 'data'
15
+ require path + 'exceptions'
17
16
  require path + 'field'
18
- require path + 'function'
19
- require path + 'group'
17
+ require path + 'field_set'
20
18
  require path + 'import_buffer'
21
- require path + 'meta_object'
22
- require path + 'expectation'
23
- require path + 'configuration'
19
+ require path + 'match_recorder'
20
+ require path + 'match_set'
21
+ require path + 'matcher'
24
22
  require path + 'result_set'
25
- require path + 'field_set'
26
- require path + 'comparator'
23
+ require path + 'runner'
24
+ require path + 'score_recorder'
25
+ require path + 'score_set'
26
+ require path + 'version'
27
27
 
28
- Sequel.extension :collation
29
- if Sequel::Collation.respond_to?(:suppress_warnings=)
30
- Sequel::Collation.suppress_warnings = true
31
- end
28
+ Sequel.extension :core_extensions
data/linkage.gemspec CHANGED
@@ -2,6 +2,7 @@
2
2
  require File.expand_path('../lib/linkage/version', __FILE__)
3
3
 
4
4
  Gem::Specification.new do |gem|
5
+ gem.name = "linkage"
5
6
  gem.authors = ["Jeremy Stephens"]
6
7
  gem.email = ["jeremy.f.stephens@vanderbilt.edu"]
7
8
  gem.description = %q{Performs record linkage between one or two datasets, using Sequel on the backend}
@@ -16,6 +17,17 @@ Gem::Specification.new do |gem|
16
17
  gem.version = Linkage::VERSION
17
18
 
18
19
  gem.add_dependency "sequel"
19
- gem.add_dependency "sequel-collation"
20
20
  gem.add_dependency "hashery"
21
+
22
+ gem.add_development_dependency "bundler", "~> 1.3"
23
+ gem.add_development_dependency "rake"
24
+ gem.add_development_dependency "test-unit"
25
+ gem.add_development_dependency "mocha"
26
+ gem.add_development_dependency "versionomy"
27
+ gem.add_development_dependency "sqlite3"
28
+ gem.add_development_dependency "mysql2"
29
+ gem.add_development_dependency "guard-test"
30
+ gem.add_development_dependency "guard-yard"
31
+
32
+ gem.required_ruby_version = '>= 1.9'
21
33
  end
@@ -0,0 +1,32 @@
# -*- encoding: utf-8 -*-
require File.expand_path('../lib/linkage/version', __FILE__)

# Gem specification for the "java" platform build of linkage.
Gem::Specification.new do |gem|
  gem.name          = "linkage"
  gem.authors       = ["Jeremy Stephens"]
  gem.email         = ["jeremy.f.stephens@vanderbilt.edu"]
  gem.description   = %q{Performs record linkage between one or two datasets, using Sequel on the backend}
  gem.summary       = %q{Record linkage library}
  gem.homepage      = "http://github.com/coupler/linkage"

  # Package every file tracked by git.
  gem.files         = `git ls-files`.split($\)
  gem.executables   = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]
  gem.version       = Linkage::VERSION
  gem.platform      = "java"

  gem.add_dependency "sequel"
  # NOTE(review): the regular gemspec drops sequel-collation in this release
  # and lib/linkage.rb no longer loads the collation extension; confirm
  # whether this dependency is still needed for the java build.
  gem.add_dependency "sequel-collation"
  gem.add_dependency "hashery"

  gem.add_development_dependency "bundler", "~> 1.3"
  gem.add_development_dependency "rake"
  gem.add_development_dependency "test-unit"
  gem.add_development_dependency "mocha"
  gem.add_development_dependency "versionomy"
  gem.add_development_dependency "jdbc-sqlite3"
  gem.add_development_dependency "jdbc-mysql"
  gem.add_development_dependency "guard-test"
end
data/test/helper.rb CHANGED
@@ -14,6 +14,7 @@ require 'logger'
14
14
  require 'pp'
15
15
  require 'versionomy'
16
16
  require 'erb'
17
+ require 'tempfile'
17
18
 
18
19
  $LOAD_PATH.unshift(File.dirname(__FILE__))
19
20
  $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
@@ -46,8 +47,8 @@ class Test::Unit::TestCase
46
47
  f
47
48
  end
48
49
 
49
- def stub_function(name, options = {}, &block)
50
- f = Linkage::Function.allocate
50
+ def stub_instance(klass, options = {}, &block)
51
+ f = klass.allocate
51
52
  f.stubs(options)
52
53
  if block
53
54
  f.send(:instance_eval, &block)
@@ -55,37 +56,43 @@ class Test::Unit::TestCase
55
56
  f
56
57
  end
57
58
 
58
- def stub_instance(klass, options = {}, &block)
59
- f = klass.allocate
60
- f.stubs(options)
61
- if block
62
- f.send(:instance_eval, &block)
59
+ def new_comparator(&block)
60
+ klass = Class.new(Linkage::Comparator)
61
+ klass.send(:define_method, :score) { |record_1, record_2| 1 }
62
+ if block_given?
63
+ klass.class_eval(&block)
63
64
  end
64
- f
65
+ klass
65
66
  end
66
67
 
67
- def new_function(name, ruby_type = nil, params = nil, &block)
68
- klass = Class.new(Linkage::Function)
69
- klass.send(:define_singleton_method, :function_name) { name }
70
- if ruby_type
71
- klass.send(:define_method, :ruby_type) { ruby_type }
68
+ def new_score_set(&block)
69
+ klass = Class.new(Linkage::ScoreSet)
70
+ klass.send(:define_method, :add_score) do |comparator_index, id_1, id_2, value|
72
71
  end
73
- if params
74
- klass.send(:define_singleton_method, :parameters) { params }
72
+ klass.send(:define_method, :each_pair) do
73
+ end
74
+ if block_given?
75
+ klass.class_eval(&block)
75
76
  end
76
77
  klass
77
78
  end
78
79
 
79
- def new_comparator(name, params = nil, score_range = nil, &block)
80
- klass = Class.new(Linkage::Comparator)
81
- klass.send(:define_singleton_method, :comparator_name) { name }
82
- if params
83
- klass.send(:define_singleton_method, :parameters) { params }
80
+ def new_match_set(&block)
81
+ klass = Class.new(Linkage::MatchSet)
82
+ klass.send(:define_method, :add_match) do |id_1, id_2, value|
83
+ end
84
+ if block_given?
85
+ klass.class_eval(&block)
86
+ end
87
+ klass
88
+ end
89
+
90
+ def new_result_set(&block)
91
+ klass = Class.new(Linkage::ResultSet)
92
+ klass.send(:define_method, :score_set) do
84
93
  end
85
- if score_range
86
- klass.send(:define_singleton_method, :score_range) { score_range }
94
+ klass.send(:define_method, :match_set) do
87
95
  end
88
- klass.send(:define_method, :score) { |record_1, record_2| 100 }
89
96
  if block_given?
90
97
  klass.class_eval(&block)
91
98
  end
@@ -15,7 +15,7 @@ module IntegrationTests
15
15
  FileUtils.remove_entry_secure(@tmpdir)
16
16
  end
17
17
 
18
- test "one mandatory field equality on single threaded runner" do
18
+ test "one field equality on single threaded runner" do
19
19
  # insert the test data
20
20
  database do |db|
21
21
  db.create_table(:foo) { primary_key(:id); Integer(:foo); Integer(:bar) }
@@ -25,25 +25,32 @@ module IntegrationTests
25
25
 
26
26
  ds = Linkage::Dataset.new(@tmpuri, "foo", :single_threaded => true)
27
27
 
28
- tmpuri = @tmpuri
29
- conf = ds.link_with(ds) do
30
- lhs[:foo].must == rhs[:bar]
31
- save_results_in(tmpuri, :single_threaded => true)
28
+ result_set = Linkage::ResultSet['csv'].new(@tmpdir)
29
+ conf = ds.link_with(ds, result_set) do |conf|
30
+ conf.compare([:foo], [:bar], :equal)
31
+ conf.algorithm = :mean
32
+ conf.threshold = 1
32
33
  end
33
- assert_equal :cross, conf.linkage_type
34
- runner = Linkage::SingleThreadedRunner.new(conf)
34
+
35
+ runner = Linkage::Runner.new(conf)
35
36
  runner.execute
36
37
 
37
- database do |db|
38
- assert_equal 5, db[:groups].count, PP.pp(db[:groups].all, "")
39
- db[:groups].order(:foo_bar).each_with_index do |row, i|
40
- assert_equal i, row[:foo_bar]
41
- end
38
+ score_csv = CSV.read(File.join(@tmpdir, 'scores.csv'), :headers => true)
39
+ assert_equal 1000, score_csv.length
40
+ score_csv.each do |row|
41
+ id_1 = row['id_1'].to_i
42
+ id_2 = row['id_2'].to_i
43
+ assert (id_1 % 10) == (id_2 % 5)
44
+ assert_equal "1", row['score']
45
+ end
42
46
 
43
- assert_equal 1000, db[:matches].count
44
- db[:matches].order(:record_1_id, :record_2_id).each do |row|
45
- assert_equal row[:record_1_id] % 10, row[:record_2_id] % 5
46
- end
47
+ match_csv = CSV.read(File.join(@tmpdir, 'matches.csv'), :headers => true)
48
+ assert_equal 1000, match_csv.length
49
+ match_csv.each do |row|
50
+ id_1 = row['id_1'].to_i
51
+ id_2 = row['id_2'].to_i
52
+ assert (id_1 % 10) == (id_2 % 5)
53
+ assert_equal "1", row['score']
47
54
  end
48
55
  end
49
56
 
@@ -54,19 +61,33 @@ module IntegrationTests
54
61
  Array.new(100) { |i| [i, i % 10, i % 20] })
55
62
  end
56
63
 
64
+ result_set = Linkage::ResultSet['csv'].new(@tmpdir)
57
65
  ds = Linkage::Dataset.new(@tmpuri, "foo", :single_threaded => true)
58
- tmpuri = @tmpuri
59
- conf = ds.link_with(ds) do
60
- lhs[:foo].must == rhs[:foo]
61
- lhs[:bar].must == 0
62
- rhs[:bar].must == 10
63
- save_results_in(tmpuri)
66
+ ds_1 = ds.filter(:bar => 0)
67
+ ds_2 = ds.filter(:bar => 10)
68
+ conf = ds_1.link_with(ds_2, result_set) do |conf|
69
+ conf.compare([:foo], [:foo], :equal)
70
+ conf.algorithm = :mean
71
+ conf.threshold = 1
64
72
  end
65
- runner = Linkage::SingleThreadedRunner.new(conf)
73
+
74
+ runner = Linkage::Runner.new(conf)
66
75
  runner.execute
67
76
 
68
- database do |db|
69
- assert_equal 1, db[:groups].count
77
+ score_csv = CSV.read(File.join(@tmpdir, 'scores.csv'), :headers => true)
78
+ assert_equal 25, score_csv.length
79
+ score_csv.each do |row|
80
+ id_1 = row['id_1'].to_i
81
+ id_2 = row['id_2'].to_i
82
+ assert (id_1 % 10) == (id_1 % 10)
83
+ end
84
+
85
+ match_csv = CSV.read(File.join(@tmpdir, 'matches.csv'), :headers => true)
86
+ assert_equal 25, match_csv.length
87
+ match_csv.each do |row|
88
+ id_1 = row['id_1'].to_i
89
+ id_2 = row['id_2'].to_i
90
+ assert (id_1 % 10) == (id_1 % 10)
70
91
  end
71
92
  end
72
93
  end