linkage 0.0.8 → 0.1.0.pre

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105)
  1. checksums.yaml +7 -0
  2. data/.gitignore +1 -0
  3. data/.yardopts +1 -0
  4. data/Gemfile +1 -19
  5. data/Gemfile-java +3 -0
  6. data/README.markdown +88 -34
  7. data/Rakefile +16 -15
  8. data/TODO +4 -0
  9. data/lib/linkage/comparator.rb +139 -144
  10. data/lib/linkage/comparators/compare.rb +236 -29
  11. data/lib/linkage/comparators/strcompare.rb +85 -0
  12. data/lib/linkage/comparators/within.rb +24 -20
  13. data/lib/linkage/configuration.rb +44 -466
  14. data/lib/linkage/dataset.rb +28 -127
  15. data/lib/linkage/exceptions.rb +5 -0
  16. data/lib/linkage/field.rb +6 -37
  17. data/lib/linkage/field_set.rb +3 -3
  18. data/lib/linkage/match_recorder.rb +22 -0
  19. data/lib/linkage/match_set.rb +34 -0
  20. data/lib/linkage/match_sets/csv.rb +39 -0
  21. data/lib/linkage/match_sets/database.rb +45 -0
  22. data/lib/linkage/matcher.rb +30 -0
  23. data/lib/linkage/result_set.rb +25 -110
  24. data/lib/linkage/result_sets/csv.rb +54 -0
  25. data/lib/linkage/result_sets/database.rb +42 -0
  26. data/lib/linkage/runner.rb +57 -16
  27. data/lib/linkage/score_recorder.rb +30 -0
  28. data/lib/linkage/score_set.rb +49 -0
  29. data/lib/linkage/score_sets/csv.rb +64 -0
  30. data/lib/linkage/score_sets/database.rb +77 -0
  31. data/lib/linkage/version.rb +1 -1
  32. data/lib/linkage.rb +14 -17
  33. data/linkage.gemspec +13 -1
  34. data/linkage.gemspec-java +32 -0
  35. data/test/helper.rb +30 -23
  36. data/test/integration/test_cross_linkage.rb +46 -25
  37. data/test/integration/test_database_result_set.rb +55 -0
  38. data/test/integration/test_dual_linkage.rb +19 -94
  39. data/test/integration/test_self_linkage.rb +100 -203
  40. data/test/integration/test_within_comparator.rb +24 -77
  41. data/test/unit/comparators/test_compare.rb +254 -50
  42. data/test/unit/comparators/test_strcompare.rb +45 -0
  43. data/test/unit/comparators/test_within.rb +14 -26
  44. data/test/unit/match_sets/test_csv.rb +78 -0
  45. data/test/unit/match_sets/test_database.rb +63 -0
  46. data/test/unit/result_sets/test_csv.rb +111 -0
  47. data/test/unit/result_sets/test_database.rb +68 -0
  48. data/test/unit/score_sets/test_csv.rb +151 -0
  49. data/test/unit/score_sets/test_database.rb +149 -0
  50. data/test/unit/test_comparator.rb +46 -83
  51. data/test/unit/test_comparators.rb +4 -0
  52. data/test/unit/test_configuration.rb +99 -145
  53. data/test/unit/test_dataset.rb +52 -73
  54. data/test/unit/test_field.rb +4 -55
  55. data/test/unit/test_field_set.rb +6 -6
  56. data/test/unit/test_match_recorder.rb +23 -0
  57. data/test/unit/test_match_set.rb +23 -0
  58. data/test/unit/test_match_sets.rb +4 -0
  59. data/test/unit/test_matcher.rb +44 -0
  60. data/test/unit/test_result_set.rb +24 -223
  61. data/test/unit/test_result_sets.rb +4 -0
  62. data/test/unit/test_runner.rb +122 -17
  63. data/test/unit/test_runners.rb +4 -0
  64. data/test/unit/test_score_recorder.rb +25 -0
  65. data/test/unit/test_score_set.rb +37 -0
  66. data/test/unit/test_score_sets.rb +4 -0
  67. metadata +183 -90
  68. data/Gemfile.lock +0 -92
  69. data/lib/linkage/comparators/binary.rb +0 -12
  70. data/lib/linkage/data.rb +0 -175
  71. data/lib/linkage/decollation.rb +0 -93
  72. data/lib/linkage/expectation.rb +0 -21
  73. data/lib/linkage/expectations/exhaustive.rb +0 -63
  74. data/lib/linkage/expectations/simple.rb +0 -168
  75. data/lib/linkage/function.rb +0 -148
  76. data/lib/linkage/functions/binary.rb +0 -30
  77. data/lib/linkage/functions/cast.rb +0 -54
  78. data/lib/linkage/functions/length.rb +0 -29
  79. data/lib/linkage/functions/strftime.rb +0 -33
  80. data/lib/linkage/functions/trim.rb +0 -30
  81. data/lib/linkage/group.rb +0 -55
  82. data/lib/linkage/meta_object.rb +0 -139
  83. data/lib/linkage/runner/single_threaded.rb +0 -187
  84. data/lib/linkage/utils.rb +0 -164
  85. data/lib/linkage/warnings.rb +0 -5
  86. data/test/integration/test_collation.rb +0 -45
  87. data/test/integration/test_configuration.rb +0 -268
  88. data/test/integration/test_dataset.rb +0 -116
  89. data/test/integration/test_functions.rb +0 -88
  90. data/test/integration/test_result_set.rb +0 -85
  91. data/test/integration/test_scoring.rb +0 -84
  92. data/test/unit/expectations/test_exhaustive.rb +0 -111
  93. data/test/unit/expectations/test_simple.rb +0 -303
  94. data/test/unit/functions/test_binary.rb +0 -54
  95. data/test/unit/functions/test_cast.rb +0 -98
  96. data/test/unit/functions/test_length.rb +0 -52
  97. data/test/unit/functions/test_strftime.rb +0 -60
  98. data/test/unit/functions/test_trim.rb +0 -43
  99. data/test/unit/runner/test_single_threaded.rb +0 -12
  100. data/test/unit/test_data.rb +0 -445
  101. data/test/unit/test_decollation.rb +0 -201
  102. data/test/unit/test_function.rb +0 -233
  103. data/test/unit/test_group.rb +0 -38
  104. data/test/unit/test_meta_object.rb +0 -208
  105. data/test/unit/test_utils.rb +0 -341
@@ -0,0 +1,42 @@
1
+ module Linkage
2
+ module ResultSets
3
+ class Database < ResultSet
4
+ def initialize(database_or_options = nil)
5
+ @database = nil
6
+ @options = {}
7
+
8
+ if database_or_options.kind_of?(Sequel::Database)
9
+ @database = database_or_options
10
+ else
11
+ database_opts = nil
12
+ case database_or_options
13
+ when String
14
+ database_opts = database_or_options
15
+ when Hash
16
+ database_opts = {}
17
+ database_or_options.each_pair do |key, value|
18
+ if key == :scores || key == :matches
19
+ @options[key] = value
20
+ else
21
+ database_opts[key] = value
22
+ end
23
+ end
24
+ else
25
+ raise ArgumentError, "expected Sequel::Database, a String, or a Hash, got #{database_or_options.class}"
26
+ end
27
+ @database = Sequel.connect(database_opts)
28
+ end
29
+ end
30
+
31
+ def score_set
32
+ @score_set ||= ScoreSet['database'].new(@database, @options[:scores] || {})
33
+ end
34
+
35
+ def match_set
36
+ @match_set ||= MatchSet['database'].new(@database, @options[:matches] || {})
37
+ end
38
+ end
39
+
40
+ ResultSet.register('database', Database)
41
+ end
42
+ end
@@ -1,31 +1,72 @@
1
1
  module Linkage
2
2
  # Use this class to run a configuration created by {Dataset#link_with}.
3
3
  class Runner
4
- attr_reader :config, :result_set
4
+ attr_reader :config
5
5
 
6
6
  # @param [Linkage::Configuration] config
7
- # @param [String] uri Sequel-style database URI
8
- # @param [Hash] options Sequel.connect options
9
7
  # @see Dataset#link_with
10
- # @see http://sequel.rubyforge.org/rdoc/files/doc/opening_databases_rdoc.html Sequel: Connecting to a database
11
- def initialize(config, uri = nil, options = {})
8
+ def initialize(config)
12
9
  @config = config
13
- if uri
14
- warn("[DEPRECATION] Please use Configuration#save_results_in with the database URI and options instead")
15
- @config.save_results_in(uri, options)
16
- end
17
10
  end
18
11
 
19
- # @abstract
20
12
  def execute
21
- raise NotImplementedError
13
+ score_records
14
+ match_records
22
15
  end
23
16
 
24
- def result_set
25
- @config.result_set
17
+ def score_records
18
+ score_recorder = config.score_recorder
19
+ score_recorder.start
20
+ dataset_1 = config.dataset_1
21
+ dataset_2 = config.dataset_2
22
+ simple_comparators = []
23
+ config.comparators.each do |comparator|
24
+ if comparator.type == :simple
25
+ simple_comparators << comparator
26
+ else
27
+ if dataset_2
28
+ comparator.score_datasets(dataset_1, dataset_2)
29
+ else
30
+ comparator.score_dataset(dataset_1)
31
+ end
32
+ end
33
+ end
34
+
35
+ # Handle simple comparators
36
+ unless simple_comparators.empty?
37
+ if dataset_2
38
+ # Two datasets
39
+ dataset_1.each do |record_1|
40
+ dataset_2.each do |record_2|
41
+ simple_comparators.each do |comparator|
42
+ comparator.score_and_notify(record_1, record_2)
43
+ end
44
+ end
45
+ end
46
+ else
47
+ # One dataset
48
+ # NOTE: very naive implementation
49
+ records = dataset_1.all
50
+ 0.upto(records.length - 2) do |i|
51
+ record_1 = records[i]
52
+ (i + 1).upto(records.length - 1) do |j|
53
+ record_2 = records[j]
54
+ simple_comparators.each do |comparator|
55
+ comparator.score_and_notify(record_1, record_2)
56
+ end
57
+ end
58
+ end
59
+ end
60
+ end
61
+ score_recorder.stop
62
+ end
63
+
64
+ def match_records
65
+ matcher = config.matcher
66
+ match_recorder = config.match_recorder(matcher)
67
+ match_recorder.start
68
+ matcher.run
69
+ match_recorder.stop
26
70
  end
27
71
  end
28
72
  end
29
-
30
- path = Pathname.new(File.expand_path(File.dirname(__FILE__))) + 'runner'
31
- require path + 'single_threaded'
@@ -0,0 +1,30 @@
1
+ module Linkage
2
+ class ScoreRecorder
3
+ def initialize(comparators, score_set, primary_keys)
4
+ @comparators = comparators
5
+ @score_set = score_set
6
+ @primary_keys = primary_keys
7
+ end
8
+
9
+ def start
10
+ @comparators.each do |comparator|
11
+ comparator.add_observer(self)
12
+ end
13
+ @score_set.open_for_writing
14
+ end
15
+
16
+ def update(comparator, record_1, record_2, score)
17
+ index = @comparators.index(comparator)
18
+ primary_key_1 = record_1[@primary_keys[0]]
19
+ primary_key_2 = record_2[@primary_keys[1]]
20
+ @score_set.add_score(index + 1, primary_key_1, primary_key_2, score)
21
+ end
22
+
23
+ def stop
24
+ @score_set.close
25
+ @comparators.each do |comparator|
26
+ comparator.delete_observer(self)
27
+ end
28
+ end
29
+ end
30
+ end
@@ -0,0 +1,49 @@
1
+ module Linkage
2
+ class ScoreSet
3
+ # Register a score set.
4
+ #
5
+ # @param [Class] klass
6
+ def self.register(name, klass)
7
+ methods = klass.instance_methods(false)
8
+ missing = []
9
+ unless methods.include?(:add_score)
10
+ missing.push("#add_score")
11
+ end
12
+ unless methods.include?(:each_pair)
13
+ missing.push("#each_pair")
14
+ end
15
+ unless missing.empty?
16
+ raise ArgumentError, "class must define #{missing.join(" and ")}"
17
+ end
18
+
19
+ @score_sets ||= {}
20
+ @score_sets[name] = klass
21
+ end
22
+
23
+ def self.[](name)
24
+ @score_sets ? @score_sets[name] : nil
25
+ end
26
+
27
+ def open_for_reading
28
+ end
29
+
30
+ def open_for_writing
31
+ end
32
+
33
+ # @abstract
34
+ def add_score(comparator_id, id_1, id_2, value)
35
+ raise NotImplementedError
36
+ end
37
+
38
+ # @abstract
39
+ def each_pair(&block)
40
+ raise NotImplementedError
41
+ end
42
+
43
+ def close
44
+ end
45
+ end
46
+ end
47
+
48
+ require 'linkage/score_sets/csv'
49
+ require 'linkage/score_sets/database'
@@ -0,0 +1,64 @@
1
+ require 'csv'
2
+
3
+ module Linkage
4
+ module ScoreSets
5
+ class CSV < ScoreSet
6
+ def initialize(filename, options = {})
7
+ @filename = filename
8
+ @overwrite = options[:overwrite]
9
+ end
10
+
11
+ def open_for_reading
12
+ raise "already open for writing, try closing first" if @mode == :write
13
+ return if @mode == :read
14
+
15
+ if !File.exist?(@filename)
16
+ raise MissingError, "#{@filename} does not exist"
17
+ end
18
+ @csv = ::CSV.open(@filename, 'rb', :headers => true)
19
+ @mode = :read
20
+ end
21
+
22
+ def open_for_writing
23
+ raise "already open for reading, try closing first" if @mode == :read
24
+ return if @mode == :write
25
+
26
+ if !@overwrite && File.exist?(@filename)
27
+ raise ExistsError, "#{@filename} exists and not in overwrite mode"
28
+ end
29
+
30
+ @csv = ::CSV.open(@filename, 'wb')
31
+ @csv << %w{comparator_id id_1 id_2 score}
32
+ @mode = :write
33
+ end
34
+
35
+ def add_score(comparator_id, id_1, id_2, score)
36
+ raise "not in write mode" if @mode != :write
37
+ @csv << [comparator_id, id_1, id_2, score]
38
+ end
39
+
40
+ def each_pair
41
+ open_for_reading
42
+
43
+ pairs = Hash.new { |h, k| h[k] = {} }
44
+ @csv.each do |row|
45
+ key = [row['id_1'], row['id_2']]
46
+ score = row['score']
47
+ pairs[key][row['comparator_id'].to_i] = score.to_f
48
+ end
49
+ pairs.each_pair do |pair, scores|
50
+ yield pair[0], pair[1], scores
51
+ end
52
+
53
+ close
54
+ end
55
+
56
+ def close
57
+ @mode = nil
58
+ @csv.close if @csv
59
+ end
60
+ end
61
+
62
+ ScoreSet.register('csv', CSV)
63
+ end
64
+ end
@@ -0,0 +1,77 @@
1
+ module Linkage
2
+ module ScoreSets
3
+ class Database < ScoreSet
4
+ def initialize(database, options = {})
5
+ @database = database
6
+ @table_name = options[:table_name] || :scores
7
+ @overwrite = options[:overwrite]
8
+ end
9
+
10
+ def open_for_reading
11
+ raise "already open for writing, try closing first" if @mode == :write
12
+ return if @mode == :read
13
+
14
+ if !@database.table_exists?(@table_name)
15
+ raise MissingError, "#{@table_name} table does not exist"
16
+ end
17
+
18
+ @dataset = @database[@table_name]
19
+ @mode = :read
20
+ end
21
+
22
+ def open_for_writing
23
+ raise "already open for reading, try closing first" if @mode == :read
24
+ return if @mode == :write
25
+
26
+ if @overwrite
27
+ @database.drop_table?(@table_name)
28
+ elsif @database.table_exists?(@table_name)
29
+ raise ExistsError, "#{@table_name} table exists and not in overwrite mode"
30
+ end
31
+
32
+ @database.create_table(@table_name) do
33
+ Integer :comparator_id
34
+ String :id_1
35
+ String :id_2
36
+ Float :score
37
+ end
38
+ @dataset = @database[@table_name]
39
+ @mode = :write
40
+ end
41
+
42
+ def add_score(comparator_id, id_1, id_2, score)
43
+ raise "not in write mode" if @mode != :write
44
+
45
+ @dataset.insert({
46
+ :comparator_id => comparator_id,
47
+ :id_1 => id_1,
48
+ :id_2 => id_2,
49
+ :score => score
50
+ })
51
+ end
52
+
53
+ def each_pair
54
+ open_for_reading
55
+
56
+ current_pair = nil
57
+ @dataset.order(:id_1, :id_2, :comparator_id).each do |row|
58
+ if current_pair.nil? || current_pair[0] != row[:id_1] || current_pair[1] != row[:id_2]
59
+ yield(*current_pair) unless current_pair.nil?
60
+ current_pair = [row[:id_1], row[:id_2], {}]
61
+ end
62
+ scores = current_pair[2]
63
+ scores[row[:comparator_id]] = row[:score]
64
+ end
65
+ yield(*current_pair) unless current_pair.nil?
66
+
67
+ close
68
+ end
69
+
70
+ def close
71
+ @mode = nil
72
+ end
73
+ end
74
+
75
+ ScoreSet.register('database', Database)
76
+ end
77
+ end
@@ -1,3 +1,3 @@
1
1
  module Linkage
2
- VERSION = "0.0.8"
2
+ VERSION = "0.1.0.pre"
3
3
  end
data/lib/linkage.rb CHANGED
@@ -1,31 +1,28 @@
1
1
  require 'pathname'
2
+ require 'fileutils'
2
3
  require 'delegate'
3
4
  require 'sequel'
4
5
  require 'hashery'
6
+ require 'observer'
5
7
 
6
8
  module Linkage
7
9
  end
8
10
 
9
11
  path = Pathname.new(File.expand_path(File.dirname(__FILE__))) + 'linkage'
10
- require path + 'version'
11
- require path + 'utils'
12
- require path + 'warnings'
13
- require path + 'decollation'
12
+ require path + 'comparator'
13
+ require path + 'configuration'
14
14
  require path + 'dataset'
15
- require path + 'runner'
16
- require path + 'data'
15
+ require path + 'exceptions'
17
16
  require path + 'field'
18
- require path + 'function'
19
- require path + 'group'
17
+ require path + 'field_set'
20
18
  require path + 'import_buffer'
21
- require path + 'meta_object'
22
- require path + 'expectation'
23
- require path + 'configuration'
19
+ require path + 'match_recorder'
20
+ require path + 'match_set'
21
+ require path + 'matcher'
24
22
  require path + 'result_set'
25
- require path + 'field_set'
26
- require path + 'comparator'
23
+ require path + 'runner'
24
+ require path + 'score_recorder'
25
+ require path + 'score_set'
26
+ require path + 'version'
27
27
 
28
- Sequel.extension :collation
29
- if Sequel::Collation.respond_to?(:suppress_warnings=)
30
- Sequel::Collation.suppress_warnings = true
31
- end
28
+ Sequel.extension :core_extensions
data/linkage.gemspec CHANGED
@@ -2,6 +2,7 @@
2
2
  require File.expand_path('../lib/linkage/version', __FILE__)
3
3
 
4
4
  Gem::Specification.new do |gem|
5
+ gem.name = "linkage"
5
6
  gem.authors = ["Jeremy Stephens"]
6
7
  gem.email = ["jeremy.f.stephens@vanderbilt.edu"]
7
8
  gem.description = %q{Performs record linkage between one or two datasets, using Sequel on the backend}
@@ -16,6 +17,17 @@ Gem::Specification.new do |gem|
16
17
  gem.version = Linkage::VERSION
17
18
 
18
19
  gem.add_dependency "sequel"
19
- gem.add_dependency "sequel-collation"
20
20
  gem.add_dependency "hashery"
21
+
22
+ gem.add_development_dependency "bundler", "~> 1.3"
23
+ gem.add_development_dependency "rake"
24
+ gem.add_development_dependency "test-unit"
25
+ gem.add_development_dependency "mocha"
26
+ gem.add_development_dependency "versionomy"
27
+ gem.add_development_dependency "sqlite3"
28
+ gem.add_development_dependency "mysql2"
29
+ gem.add_development_dependency "guard-test"
30
+ gem.add_development_dependency "guard-yard"
31
+
32
+ gem.required_ruby_version = '>= 1.9'
21
33
  end
@@ -0,0 +1,32 @@
1
+ # -*- encoding: utf-8 -*-
2
+ require File.expand_path('../lib/linkage/version', __FILE__)
3
+
4
+ Gem::Specification.new do |gem|
5
+ gem.name = "linkage"
6
+ gem.authors = ["Jeremy Stephens"]
7
+ gem.email = ["jeremy.f.stephens@vanderbilt.edu"]
8
+ gem.description = %q{Performs record linkage between one or two datasets, using Sequel on the backend}
9
+ gem.summary = %q{Record linkage library}
10
+ gem.homepage = "http://github.com/coupler/linkage"
11
+
12
+ gem.files = `git ls-files`.split($\)
13
+ gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
14
+ gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
15
+ gem.name = "linkage"
16
+ gem.require_paths = ["lib"]
17
+ gem.version = Linkage::VERSION
18
+ gem.platform = "java"
19
+
20
+ gem.add_dependency "sequel"
21
+ gem.add_dependency "sequel-collation"
22
+ gem.add_dependency "hashery"
23
+
24
+ gem.add_development_dependency "bundler", "~> 1.3"
25
+ gem.add_development_dependency "rake"
26
+ gem.add_development_dependency "test-unit"
27
+ gem.add_development_dependency "mocha"
28
+ gem.add_development_dependency "versionomy"
29
+ gem.add_development_dependency "jdbc-sqlite3"
30
+ gem.add_development_dependency "jdbc-mysql"
31
+ gem.add_development_dependency "guard-test"
32
+ end
data/test/helper.rb CHANGED
@@ -14,6 +14,7 @@ require 'logger'
14
14
  require 'pp'
15
15
  require 'versionomy'
16
16
  require 'erb'
17
+ require 'tempfile'
17
18
 
18
19
  $LOAD_PATH.unshift(File.dirname(__FILE__))
19
20
  $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
@@ -46,8 +47,8 @@ class Test::Unit::TestCase
46
47
  f
47
48
  end
48
49
 
49
- def stub_function(name, options = {}, &block)
50
- f = Linkage::Function.allocate
50
+ def stub_instance(klass, options = {}, &block)
51
+ f = klass.allocate
51
52
  f.stubs(options)
52
53
  if block
53
54
  f.send(:instance_eval, &block)
@@ -55,37 +56,43 @@ class Test::Unit::TestCase
55
56
  f
56
57
  end
57
58
 
58
- def stub_instance(klass, options = {}, &block)
59
- f = klass.allocate
60
- f.stubs(options)
61
- if block
62
- f.send(:instance_eval, &block)
59
+ def new_comparator(&block)
60
+ klass = Class.new(Linkage::Comparator)
61
+ klass.send(:define_method, :score) { |record_1, record_2| 1 }
62
+ if block_given?
63
+ klass.class_eval(&block)
63
64
  end
64
- f
65
+ klass
65
66
  end
66
67
 
67
- def new_function(name, ruby_type = nil, params = nil, &block)
68
- klass = Class.new(Linkage::Function)
69
- klass.send(:define_singleton_method, :function_name) { name }
70
- if ruby_type
71
- klass.send(:define_method, :ruby_type) { ruby_type }
68
+ def new_score_set(&block)
69
+ klass = Class.new(Linkage::ScoreSet)
70
+ klass.send(:define_method, :add_score) do |comparator_index, id_1, id_2, value|
72
71
  end
73
- if params
74
- klass.send(:define_singleton_method, :parameters) { params }
72
+ klass.send(:define_method, :each_pair) do
73
+ end
74
+ if block_given?
75
+ klass.class_eval(&block)
75
76
  end
76
77
  klass
77
78
  end
78
79
 
79
- def new_comparator(name, params = nil, score_range = nil, &block)
80
- klass = Class.new(Linkage::Comparator)
81
- klass.send(:define_singleton_method, :comparator_name) { name }
82
- if params
83
- klass.send(:define_singleton_method, :parameters) { params }
80
+ def new_match_set(&block)
81
+ klass = Class.new(Linkage::MatchSet)
82
+ klass.send(:define_method, :add_match) do |id_1, id_2, value|
83
+ end
84
+ if block_given?
85
+ klass.class_eval(&block)
86
+ end
87
+ klass
88
+ end
89
+
90
+ def new_result_set(&block)
91
+ klass = Class.new(Linkage::ResultSet)
92
+ klass.send(:define_method, :score_set) do
84
93
  end
85
- if score_range
86
- klass.send(:define_singleton_method, :score_range) { score_range }
94
+ klass.send(:define_method, :match_set) do
87
95
  end
88
- klass.send(:define_method, :score) { |record_1, record_2| 100 }
89
96
  if block_given?
90
97
  klass.class_eval(&block)
91
98
  end
@@ -15,7 +15,7 @@ module IntegrationTests
15
15
  FileUtils.remove_entry_secure(@tmpdir)
16
16
  end
17
17
 
18
- test "one mandatory field equality on single threaded runner" do
18
+ test "one field equality on single threaded runner" do
19
19
  # insert the test data
20
20
  database do |db|
21
21
  db.create_table(:foo) { primary_key(:id); Integer(:foo); Integer(:bar) }
@@ -25,25 +25,32 @@ module IntegrationTests
25
25
 
26
26
  ds = Linkage::Dataset.new(@tmpuri, "foo", :single_threaded => true)
27
27
 
28
- tmpuri = @tmpuri
29
- conf = ds.link_with(ds) do
30
- lhs[:foo].must == rhs[:bar]
31
- save_results_in(tmpuri, :single_threaded => true)
28
+ result_set = Linkage::ResultSet['csv'].new(@tmpdir)
29
+ conf = ds.link_with(ds, result_set) do |conf|
30
+ conf.compare([:foo], [:bar], :equal)
31
+ conf.algorithm = :mean
32
+ conf.threshold = 1
32
33
  end
33
- assert_equal :cross, conf.linkage_type
34
- runner = Linkage::SingleThreadedRunner.new(conf)
34
+
35
+ runner = Linkage::Runner.new(conf)
35
36
  runner.execute
36
37
 
37
- database do |db|
38
- assert_equal 5, db[:groups].count, PP.pp(db[:groups].all, "")
39
- db[:groups].order(:foo_bar).each_with_index do |row, i|
40
- assert_equal i, row[:foo_bar]
41
- end
38
+ score_csv = CSV.read(File.join(@tmpdir, 'scores.csv'), :headers => true)
39
+ assert_equal 1000, score_csv.length
40
+ score_csv.each do |row|
41
+ id_1 = row['id_1'].to_i
42
+ id_2 = row['id_2'].to_i
43
+ assert (id_1 % 10) == (id_2 % 5)
44
+ assert_equal "1", row['score']
45
+ end
42
46
 
43
- assert_equal 1000, db[:matches].count
44
- db[:matches].order(:record_1_id, :record_2_id).each do |row|
45
- assert_equal row[:record_1_id] % 10, row[:record_2_id] % 5
46
- end
47
+ match_csv = CSV.read(File.join(@tmpdir, 'matches.csv'), :headers => true)
48
+ assert_equal 1000, match_csv.length
49
+ match_csv.each do |row|
50
+ id_1 = row['id_1'].to_i
51
+ id_2 = row['id_2'].to_i
52
+ assert (id_1 % 10) == (id_2 % 5)
53
+ assert_equal "1", row['score']
47
54
  end
48
55
  end
49
56
 
@@ -54,19 +61,33 @@ module IntegrationTests
54
61
  Array.new(100) { |i| [i, i % 10, i % 20] })
55
62
  end
56
63
 
64
+ result_set = Linkage::ResultSet['csv'].new(@tmpdir)
57
65
  ds = Linkage::Dataset.new(@tmpuri, "foo", :single_threaded => true)
58
- tmpuri = @tmpuri
59
- conf = ds.link_with(ds) do
60
- lhs[:foo].must == rhs[:foo]
61
- lhs[:bar].must == 0
62
- rhs[:bar].must == 10
63
- save_results_in(tmpuri)
66
+ ds_1 = ds.filter(:bar => 0)
67
+ ds_2 = ds.filter(:bar => 10)
68
+ conf = ds_1.link_with(ds_2, result_set) do |conf|
69
+ conf.compare([:foo], [:foo], :equal)
70
+ conf.algorithm = :mean
71
+ conf.threshold = 1
64
72
  end
65
- runner = Linkage::SingleThreadedRunner.new(conf)
73
+
74
+ runner = Linkage::Runner.new(conf)
66
75
  runner.execute
67
76
 
68
- database do |db|
69
- assert_equal 1, db[:groups].count
77
+ score_csv = CSV.read(File.join(@tmpdir, 'scores.csv'), :headers => true)
78
+ assert_equal 25, score_csv.length
79
+ score_csv.each do |row|
80
+ id_1 = row['id_1'].to_i
81
+ id_2 = row['id_2'].to_i
82
+ assert (id_1 % 10) == (id_1 % 10)
83
+ end
84
+
85
+ match_csv = CSV.read(File.join(@tmpdir, 'matches.csv'), :headers => true)
86
+ assert_equal 25, match_csv.length
87
+ match_csv.each do |row|
88
+ id_1 = row['id_1'].to_i
89
+ id_2 = row['id_2'].to_i
90
+ assert (id_1 % 10) == (id_1 % 10)
70
91
  end
71
92
  end
72
93
  end