linkage 0.0.8 → 0.1.0.pre

Sign up to get free protection for your applications and to get access to all the features.
Files changed (105) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +1 -0
  3. data/.yardopts +1 -0
  4. data/Gemfile +1 -19
  5. data/Gemfile-java +3 -0
  6. data/README.markdown +88 -34
  7. data/Rakefile +16 -15
  8. data/TODO +4 -0
  9. data/lib/linkage/comparator.rb +139 -144
  10. data/lib/linkage/comparators/compare.rb +236 -29
  11. data/lib/linkage/comparators/strcompare.rb +85 -0
  12. data/lib/linkage/comparators/within.rb +24 -20
  13. data/lib/linkage/configuration.rb +44 -466
  14. data/lib/linkage/dataset.rb +28 -127
  15. data/lib/linkage/exceptions.rb +5 -0
  16. data/lib/linkage/field.rb +6 -37
  17. data/lib/linkage/field_set.rb +3 -3
  18. data/lib/linkage/match_recorder.rb +22 -0
  19. data/lib/linkage/match_set.rb +34 -0
  20. data/lib/linkage/match_sets/csv.rb +39 -0
  21. data/lib/linkage/match_sets/database.rb +45 -0
  22. data/lib/linkage/matcher.rb +30 -0
  23. data/lib/linkage/result_set.rb +25 -110
  24. data/lib/linkage/result_sets/csv.rb +54 -0
  25. data/lib/linkage/result_sets/database.rb +42 -0
  26. data/lib/linkage/runner.rb +57 -16
  27. data/lib/linkage/score_recorder.rb +30 -0
  28. data/lib/linkage/score_set.rb +49 -0
  29. data/lib/linkage/score_sets/csv.rb +64 -0
  30. data/lib/linkage/score_sets/database.rb +77 -0
  31. data/lib/linkage/version.rb +1 -1
  32. data/lib/linkage.rb +14 -17
  33. data/linkage.gemspec +13 -1
  34. data/linkage.gemspec-java +32 -0
  35. data/test/helper.rb +30 -23
  36. data/test/integration/test_cross_linkage.rb +46 -25
  37. data/test/integration/test_database_result_set.rb +55 -0
  38. data/test/integration/test_dual_linkage.rb +19 -94
  39. data/test/integration/test_self_linkage.rb +100 -203
  40. data/test/integration/test_within_comparator.rb +24 -77
  41. data/test/unit/comparators/test_compare.rb +254 -50
  42. data/test/unit/comparators/test_strcompare.rb +45 -0
  43. data/test/unit/comparators/test_within.rb +14 -26
  44. data/test/unit/match_sets/test_csv.rb +78 -0
  45. data/test/unit/match_sets/test_database.rb +63 -0
  46. data/test/unit/result_sets/test_csv.rb +111 -0
  47. data/test/unit/result_sets/test_database.rb +68 -0
  48. data/test/unit/score_sets/test_csv.rb +151 -0
  49. data/test/unit/score_sets/test_database.rb +149 -0
  50. data/test/unit/test_comparator.rb +46 -83
  51. data/test/unit/test_comparators.rb +4 -0
  52. data/test/unit/test_configuration.rb +99 -145
  53. data/test/unit/test_dataset.rb +52 -73
  54. data/test/unit/test_field.rb +4 -55
  55. data/test/unit/test_field_set.rb +6 -6
  56. data/test/unit/test_match_recorder.rb +23 -0
  57. data/test/unit/test_match_set.rb +23 -0
  58. data/test/unit/test_match_sets.rb +4 -0
  59. data/test/unit/test_matcher.rb +44 -0
  60. data/test/unit/test_result_set.rb +24 -223
  61. data/test/unit/test_result_sets.rb +4 -0
  62. data/test/unit/test_runner.rb +122 -17
  63. data/test/unit/test_runners.rb +4 -0
  64. data/test/unit/test_score_recorder.rb +25 -0
  65. data/test/unit/test_score_set.rb +37 -0
  66. data/test/unit/test_score_sets.rb +4 -0
  67. metadata +183 -90
  68. data/Gemfile.lock +0 -92
  69. data/lib/linkage/comparators/binary.rb +0 -12
  70. data/lib/linkage/data.rb +0 -175
  71. data/lib/linkage/decollation.rb +0 -93
  72. data/lib/linkage/expectation.rb +0 -21
  73. data/lib/linkage/expectations/exhaustive.rb +0 -63
  74. data/lib/linkage/expectations/simple.rb +0 -168
  75. data/lib/linkage/function.rb +0 -148
  76. data/lib/linkage/functions/binary.rb +0 -30
  77. data/lib/linkage/functions/cast.rb +0 -54
  78. data/lib/linkage/functions/length.rb +0 -29
  79. data/lib/linkage/functions/strftime.rb +0 -33
  80. data/lib/linkage/functions/trim.rb +0 -30
  81. data/lib/linkage/group.rb +0 -55
  82. data/lib/linkage/meta_object.rb +0 -139
  83. data/lib/linkage/runner/single_threaded.rb +0 -187
  84. data/lib/linkage/utils.rb +0 -164
  85. data/lib/linkage/warnings.rb +0 -5
  86. data/test/integration/test_collation.rb +0 -45
  87. data/test/integration/test_configuration.rb +0 -268
  88. data/test/integration/test_dataset.rb +0 -116
  89. data/test/integration/test_functions.rb +0 -88
  90. data/test/integration/test_result_set.rb +0 -85
  91. data/test/integration/test_scoring.rb +0 -84
  92. data/test/unit/expectations/test_exhaustive.rb +0 -111
  93. data/test/unit/expectations/test_simple.rb +0 -303
  94. data/test/unit/functions/test_binary.rb +0 -54
  95. data/test/unit/functions/test_cast.rb +0 -98
  96. data/test/unit/functions/test_length.rb +0 -52
  97. data/test/unit/functions/test_strftime.rb +0 -60
  98. data/test/unit/functions/test_trim.rb +0 -43
  99. data/test/unit/runner/test_single_threaded.rb +0 -12
  100. data/test/unit/test_data.rb +0 -445
  101. data/test/unit/test_decollation.rb +0 -201
  102. data/test/unit/test_function.rb +0 -233
  103. data/test/unit/test_group.rb +0 -38
  104. data/test/unit/test_meta_object.rb +0 -208
  105. data/test/unit/test_utils.rb +0 -341
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: ad6a9ee6a6add94a342e3d02d49d8a4bfeb9122b
4
+ data.tar.gz: a989e8e810602dfcd4da596fcc1154a922dedccd
5
+ SHA512:
6
+ metadata.gz: 0d78da4904100679826cf76ae07f7daf984c12e2a762aceb0ad2d1387c5a5778403a782ff104ce48c1fd3ec5588c728fe2b8ea7a79a896da153a8d91c7e4cb14
7
+ data.tar.gz: 6797e5598f47413022d18c6732cbf0613efaccba886cda56eb390e3344d131ccfd977b5992c5a2c4557cfbc4f93bf747ddffe849770be5860e793f27fe3e2286
data/.gitignore CHANGED
@@ -8,3 +8,4 @@ test.rb
8
8
  results.db
9
9
  .rbenv-version
10
10
  bin
11
+ Gemfile.lock
data/.yardopts ADDED
@@ -0,0 +1 @@
1
+ -m markdown
data/Gemfile CHANGED
@@ -1,21 +1,3 @@
1
- source 'http://rubygems.org'
1
+ source 'https://rubygems.org'
2
2
 
3
3
  gemspec
4
-
5
- group :development do
6
- gem 'bundler'
7
- gem 'test-unit'
8
- gem 'mocha'
9
- gem 'yard'
10
- gem 'rake'
11
- gem 'versionomy'
12
- gem 'sqlite3', :platforms => :ruby
13
- gem 'mysql2', :platforms => :ruby
14
- gem 'jdbc-sqlite3', :platforms => :jruby
15
- gem 'jdbc-mysql', :platforms => :jruby
16
- gem 'rdiscount'
17
- gem 'guard-test'
18
- gem 'guard-yard', :platforms => :ruby_19
19
- gem 'rb-inotify', '~> 0.8.8'
20
- gem 'debugger'
21
- end
data/Gemfile-java ADDED
@@ -0,0 +1,3 @@
1
+ source 'https://rubygems.org'
2
+
3
+ gemspec
data/README.markdown CHANGED
@@ -1,52 +1,106 @@
1
- # linkage
1
+ # Linkage
2
2
 
3
- Linkage is a library for record linkage between one or two database tables.
3
+ Linkage is a Ruby library for record linkage between one or two database tables.
4
+
5
+ ## What is record linkage?
6
+
7
+ In an ideal world, records that reference the same entity can be easily
8
+ identified. Unfortunately, this isn't always the case. Sometimes there are no
9
+ good identifiers in the datasets that you're interested in (ID, social security
10
+ number, etc). In such cases, it is necessary to use other means to determine
11
+ which records refer to which entity, and this process is known as **record
12
+ linkage**.
13
+
14
+ ## Prerequisites
15
+
16
+ In order to use Linkage, the records you want to link must be in a database.
17
+ Linkage has the ability to perform record linkage across different kinds of
18
+ databases, so it's okay if your records are not all in the same place.
19
+
20
+ Since Linkage uses [Sequel](http://sequel.jeremyevans.net/) to communicate with
21
+ databases, any database that Sequel supports will work. See [Connecting to a
22
+ database](http://sequel.jeremyevans.net/documentation.html) on the Sequel
23
+ website for more information about what databases are supported.
4
24
 
5
25
  ## Usage
6
26
 
7
- Linkage uses Sequel to talk to databases, so any database that Sequel can
8
- talk to, Linkage can talk to. You just give Linkage the Sequel-style URI
9
- and the database table name:
27
+ To perform a record linkage, Linkage needs information about the following:
28
+ datasets, result set, and comparators. A dataset refers to a table in a
29
+ database. A result set is a place to put score and match information that
30
+ Linkage generates. Comparators describe how records are compared.
31
+
32
+ A dataset is created via the `Linkage::Dataset` class, along with a connection URI
33
+ and a table name:
10
34
 
11
- ds = Linkage::Dataset.new('mysql://example.com/database_name', 'table_name')
35
+ ```ruby
36
+ ds = Linkage::Dataset.new('mysql://example.com/database_name', 'table_name')
37
+ ```
12
38
 
13
- To describe a linkage, you use the `Dataset#link_with` method.
39
+ Result sets have different options depending on what storage medium you're
40
+ using (CSV or database). For CSVs, you could use:
14
41
 
15
- parents = Linkage::Dataset.new('postgres://example.com/foo', 'parents')
16
- children = Linkage::Dataset.new('mysql://some-other-host.net/bar', 'children')
17
- config = parents.link_with(children) do
18
- lhs[:first_name].must == rhs[:parent_first_name]
19
- lhs[:last_name].must == rhs[:parent_last_name]
20
- lhs[:last_name].must_not == "Smith" # exclude parents with the last
21
- # name "Smith"
42
+ ```ruby
43
+ result_set = Linkage::ResultSet['csv'].new('~/my_results')
44
+ ```
22
45
 
23
- save_results_in('sqlite://results.db') # see below
24
- end
46
+ In this case, scores and matches will be saved in CSV files in the `my_results`
47
+ directory in your home folder.
25
48
 
26
- Note that the datasets don't have to be in the same database, or even on
27
- the same machine.
49
+ To describe a linkage, you can use the `Dataset#link_with` method. This creates
50
+ a linkage configuration that you can use to describe how you want the records in
51
+ each dataset to be compared. For example:
52
+
53
+ ```ruby
54
+ demo = Linkage::Dataset.new('postgres://example.com/foo', 'demographics')
55
+ visits = Linkage::Dataset.new('mysql://some-other-host.net/bar', 'visits')
56
+ result_set = Linkage::ResultSet['csv'].new('~/my_results')
57
+ config = demo.link_with(visits, result_set) do |config|
58
+ config.compare([:first_name, :last_name], [:first_name, :last_name], :equal)
59
+ end
60
+ ```
61
+
62
+ This linkage would match records from a demographics table to records in a table
63
+ with information about doctor visits by using first name and last name.
64
+
65
+ The `compare` method creates a `Compare` comparator. This is the simplest
66
+ comparator in Linkage, and it just compares fields with the operator you specify
67
+ (`:equal`, `:less_than`, `:greater_than`, etc). When a comparator compares
68
+ two records, it gives the pair of records a score between 0 and 1. In the case
69
+ of the example above, records that have the same first name and last name get a
70
+ score of 1, and records that don't get a score of 0 (or sometimes, they aren't
71
+ scored and are assumed to have a score of 0).
72
+
73
+ Other comparators are `Strcompare` for approximate string matching and
74
+ `Within` for matching numbers within a range.
28
75
 
29
76
  To run a linkage, use a Runner with the resulting configuration from
30
77
  `Dataset#link_with`:
31
78
 
32
- runner = Linkage::SingleThreadedRunner.new(config)
33
- runner.execute
79
+ ```ruby
80
+ runner = Linkage::Runner.new(config)
81
+ runner.execute
82
+ ```
83
+
84
+ After running a linkage, there will be a list of matches in a CSV file or
85
+ database, depending on how you configured your result set.
86
+
87
+ The default way linkage determines if two records match is by comparing the
88
+ average score to a threshold value (which is 0.5 by default). You can configure
89
+ the threshold value like so: `config.threshold = 0.9`.
34
90
 
35
- The runner saves results in a database that you specify in the configuration
36
- (via the `save_results_in` method). It stores its results in two database
37
- tables: `groups` and `groups_records`. The `groups` table contains all of the
38
- unique combinations of values in your datasets, and `groups_records` maps
39
- records to groups.
91
+ ## Other examples
40
92
 
41
- You can also link a dataset to itself:
93
+ Linking a dataset to itself:
42
94
 
43
- births = Linkage::Dataset.new('postgres://example.com/hospital_data', 'births')
44
- config = births.link_with(births) do
45
- lhs[:mother_first_name].must == rhs[:mother_first_name]
46
- lhs[:mother_last_name].must == rhs[:mother_last_name]
47
- end
48
- runner = Linkage::SingleThreadedRunner.new(config, 'sqlite://results.db')
49
- runner.execute
95
+ ```ruby
96
+ births = Linkage::Dataset.new('postgres://example.com/hospital_data', 'births')
97
+ result_set = Linkage::ResultSet['csv'].new('~/my_birth_results')
98
+ config = births.link_with(births, result_set) do |config|
99
+ config.compare([:mother_first_name, :mother_last_name], [:mother_first_name, :mother_last_name], :equal)
100
+ end
101
+ runner = Linkage::Runner.new(config)
102
+ runner.execute
103
+ ```
50
104
 
51
105
  The above example would find birth records that have mothers with the same
52
106
  name.
@@ -62,6 +116,6 @@ name.
62
116
 
63
117
  ## Copyright
64
118
 
65
- Copyright (c) 2011 Vanderbilt University. See LICENSE.txt for
119
+ Copyright (c) 2011-2014 Vanderbilt University. See LICENSE.txt for
66
120
  further details.
67
121
 
data/Rakefile CHANGED
@@ -1,16 +1,4 @@
1
- # encoding: utf-8
2
-
3
- require 'rubygems'
4
- require 'bundler'
5
- begin
6
- Bundler.setup(:default, :development)
7
- rescue Bundler::BundlerError => e
8
- $stderr.puts e.message
9
- $stderr.puts "Run `bundle install` to install missing gems"
10
- exit e.status_code
11
- end
12
- require 'rake'
13
- require "bundler/gem_tasks"
1
+ require 'bundler/gem_tasks'
14
2
 
15
3
  require 'rake/testtask'
16
4
  Rake::TestTask.new(:test) do |test|
@@ -18,9 +6,22 @@ Rake::TestTask.new(:test) do |test|
18
6
  test.pattern = 'test/**/test_*.rb'
19
7
  test.verbose = true
20
8
  end
21
-
22
9
  task :default => :test
23
10
 
11
+ namespace :test do
12
+ Rake::TestTask.new(:unit) do |test|
13
+ test.libs << 'lib' << 'test'
14
+ test.pattern = 'test/unit/**/test_*.rb'
15
+ test.verbose = true
16
+ end
17
+
18
+ Rake::TestTask.new(:integration) do |test|
19
+ test.libs << 'lib' << 'test'
20
+ test.pattern = 'test/integration/**/test_*.rb'
21
+ test.verbose = true
22
+ end
23
+ end
24
+
24
25
  require 'yard'
25
26
  YARD::Rake::YardocTask.new do |t|
26
27
  t.files = ['lib/**/*.rb']
@@ -28,7 +29,7 @@ end
28
29
 
29
30
  # Yoinked from https://github.com/rails/rails/blob/master/railties/lib/rails/tasks/annotations.rake
30
31
  namespace :notes do
31
- ["OPTIMIZE", "FIXME", "TODO"].each do |annotation|
32
+ ['OPTIMIZE', 'FIXME', 'TODO'].each do |annotation|
32
33
  desc "Enumerate all #{annotation} annotations"
33
34
  task annotation.downcase.intern do
34
35
  SourceAnnotationExtractor.enumerate annotation
data/TODO ADDED
@@ -0,0 +1,4 @@
1
+ Features
2
+ - add matcher algorithms
3
+ - change comparator_id to more than just an index; need to get comparator ids
4
+ from result set instead of configuration
@@ -1,172 +1,167 @@
1
1
  module Linkage
2
- # @abstract Abstract class to represent record comparators.
2
+ # {Comparator} is the superclass for comparators in Linkage. Comparators are
3
+ # used to compare two records and compute scores based on how closely the two
4
+ # records relate.
5
+ #
6
+ # Each comparator should inherit from {Comparator} and declare itself as
7
+ # simple or advanced by overriding {#type} (the default is simple). Simple
8
+ # comparators must define the {#score} method that uses data from two records
9
+ # and returns a number (`Integer` or `Float`) between 0 and 1 (inclusive).
10
+ # Advanced comparators must define both {#score_dataset} and {#score_datasets}
11
+ # that use one or two {Dataset}s respectively to create scores.
12
+ #
13
+ # Each comparator can be registered via the {.register} function. This gives
14
+ # {Configuration} a way to find a comparator by name via
15
+ # {Configuration#method_missing}. For example, `config.compare(...)` creates a
16
+ # new {Comparators::Compare} object, since that comparator is registered under
17
+ # the name `"compare"`.
18
+ #
19
+ # See documentation for the methods below for more information.
20
+ #
21
+ # @abstract
3
22
  class Comparator
4
- # Register a new comparator.
5
- #
6
- # @param [Class] klass Comparator subclass
7
- def self.register(klass)
8
- name = nil
9
- begin
10
- name = klass.comparator_name
11
- rescue NotImplementedError
12
- raise ArgumentError, "comparator_name class method must be defined"
13
- end
14
-
15
- if !klass.instance_methods(false).include?(:score)
16
- raise ArgumentError, "class must define the score method"
17
- end
18
-
19
- begin
20
- if klass.parameters.length > 0
21
- @comparators ||= {}
22
- @comparators[name] = klass
23
- else
24
- raise ArgumentError, "class must have at least one parameter"
23
+ include Observable
24
+
25
+ class << self
26
+ # Register a new comparator. Subclasses must define at least {#score} for
27
+ # simple comparators, or {#score_dataset} and {#score_datasets} for
28
+ # advanced comparators. Otherwise, an `ArgumentError` will be raised when
29
+ # you try to call {.register}. The `name` parameter is used in
30
+ # {Configuration#method_missing} as an easy way for users to select
31
+ # comparators for their linkage.
32
+ #
33
+ # @param [String] name Comparator name used in {.klass_for}
34
+ # @param [Class] klass Comparator subclass
35
+ def register(name, klass)
36
+ methods = klass.instance_methods(false)
37
+ if !methods.include?(:score) && (!methods.include?(:score_datasets) || !methods.include?(:score_dataset))
38
+ raise ArgumentError, "class must define either #score or both #score_datasets and #score_dataset methods"
25
39
  end
26
- rescue NotImplementedError
27
- raise ArgumentError, "parameters class method must be defined"
40
+
41
+ @comparators ||= {}
42
+ @comparators[name] = klass
28
43
  end
29
44
 
30
- begin
31
- range = klass.score_range
32
- if !range.is_a?(Range) || !range.first.is_a?(Numeric) ||
33
- !range.last.is_a?(Numeric)
34
- raise ArgumentError, "score_range must be a Range of two numbers"
35
- end
36
- rescue NotImplementedError
37
- raise ArgumentError, "score_range class method must be defined"
45
+ # Return a registered Comparator subclass or `nil` if it doesn't exist.
46
+ #
47
+ # @param [String] name of registered comparator
48
+ # @return [Class, nil]
49
+ def klass_for(name)
50
+ @comparators ? @comparators[name] : nil
38
51
  end
52
+ alias :[] :klass_for
39
53
  end
40
54
 
41
- def self.[](name)
42
- @comparators ? @comparators[name] : nil
55
+ # Return the type of this comparator. When {#type} returns `:simple`,
56
+ # {#score_and_notify} is called by {Runner#score_records} with each pair of
57
+ # records in order to create scores. When {#type} returns `:advanced`,
58
+ # either {#score_dataset} or {#score_datasets} is called by
59
+ # {Runner#score_records}. In advanced mode, it is left up to the
60
+ # {Comparator} subclass to determine which records to compare and how to
61
+ # compare them.
62
+ #
63
+ # @return [Symbol] either `:simple` or `:advanced`
64
+ def type
65
+ @type || :simple
43
66
  end
44
67
 
45
- # @abstract Override this to return the name of the comparator.
46
- # @return [String]
47
- def self.comparator_name
68
+ # Override this to return the score of the linkage strength of two records.
69
+ # This method is used to score records by {Runner#score_records} when
70
+ # {#type} returns `:simple`.
71
+ #
72
+ # @abstract
73
+ # @param [Hash] record_1 data from first record
74
+ # @param [Hash] record_2 data from second record
75
+ # @return [Numeric] value between 0 and 1 (inclusive)
76
+ def score(record_1, record_2)
48
77
  raise NotImplementedError
49
78
  end
50
79
 
51
- # @abstract Override this to require a specific number of arguments of a
52
- # certain class. To require two parameters of either String or Integer,
53
- # do something like this:
80
+ # Override this to score the linkage strength of records in two datasets.
81
+ # This method is used to score records by {Runner#score_records} when
82
+ # {#type} returns `:advanced` and {Configuration} is set up to link two
83
+ # datasets together.
54
84
  #
55
- # @@parameters = [[String, Integer], [String, Integer]]
56
- # def self.parameters
57
- # @@parameters
58
- # end
85
+ # Since each {Dataset} delegates to a
86
+ # {http://sequel.jeremyevans.net/rdoc/classes/Sequel/Dataset.html `Sequel::Dataset`},
87
+ # you can use any
88
+ # {http://sequel.jeremyevans.net/rdoc/classes/Sequel/Dataset.html `Sequel::Dataset`}
89
+ # methods that you wish in order to select records to compare.
59
90
  #
60
- # At least one argument must be defined.
61
- # @return [Array]
62
- def self.parameters
63
- raise NotImplementedError
64
- end
65
-
66
- # @abstract Override this to return a Range of the possible scores for the
67
- # comparator.
68
- # @return [Range]
69
- def self.score_range
91
+ # To record scores, subclasses must call
92
+ # {http://ruby-doc.org/stdlib/libdoc/observer/rdoc/Observable.html `Observable#notify_observers`}
93
+ # like so:
94
+ #
95
+ # ```ruby
96
+ # changed
97
+ # notify_observers(self, record_1, record_2, score)
98
+ # ```
99
+ #
100
+ # This works by notifying any observers, typically {ScoreRecorder}, that a
101
+ # new score has been generated. {ScoreRecorder#update} then calls
102
+ # {ScoreSet#add_score} with comparator ID, the primary key of each record
103
+ # and the score.
104
+ #
105
+ # @abstract
106
+ # @param [Linkage::Dataset] dataset_1
107
+ # @param [Linkage::Dataset] dataset_2
108
+ # @see http://ruby-doc.org/stdlib/libdoc/observer/rdoc/Observable.html Observable
109
+ # @see http://sequel.jeremyevans.net/rdoc/classes/Sequel/Dataset.html Sequel::Dataset
110
+ def score_datasets(dataset_1, dataset_2)
70
111
  raise NotImplementedError
71
112
  end
72
113
 
73
- attr_reader :args, :lhs_args, :rhs_args
74
-
75
- # Create a new Comparator object.
76
- # @param [Linkage::MetaObject, Hash] args Comparator arguments
77
- def initialize(*args)
78
- @args = args
79
- @lhs_args = []
80
- @rhs_args = []
81
- @options = args.last.is_a?(Hash) ? args.pop : {}
82
- process_args
83
- end
84
-
85
- # @abstract Override this to return the score of the linkage strength of
86
- # two records.
87
- # @return [Numeric]
88
- def score(record_1, record_2)
114
+ # Override this to score the linkage strength of records in one dataset.
115
+ # This method is used to score records by {Runner#score_records} when
116
+ # {#type} returns `:advanced` and {Configuration} is set up to link a
117
+ # dataset to itself.
118
+ #
119
+ # Since a {Dataset} delegates to a
120
+ # {http://sequel.jeremyevans.net/rdoc/classes/Sequel/Dataset.html `Sequel::Dataset`},
121
+ # you can use any
122
+ # {http://sequel.jeremyevans.net/rdoc/classes/Sequel/Dataset.html `Sequel::Dataset`}
123
+ # methods that you wish in order to select records to compare.
124
+ #
125
+ # To record scores, subclasses must call
126
+ # {http://ruby-doc.org/stdlib/libdoc/observer/rdoc/Observable.html `Observable#notify_observers`}
127
+ # like so:
128
+ #
129
+ # ```ruby
130
+ # changed
131
+ # notify_observers(self, record_1, record_2, score)
132
+ # ```
133
+ #
134
+ # This works by notifying any observers, typically {ScoreRecorder}, that a
135
+ # new score has been generated. {ScoreRecorder#update} then calls
136
+ # {ScoreSet#add_score} with comparator ID, the primary key of each record
137
+ # and the score.
138
+ #
139
+ # @abstract
140
+ # @param [Linkage::Dataset] dataset
141
+ # @see http://ruby-doc.org/stdlib/libdoc/observer/rdoc/Observable.html Observable
142
+ # @see http://sequel.jeremyevans.net/rdoc/classes/Sequel/Dataset.html Sequel::Dataset
143
+ def score_dataset(dataset)
89
144
  raise NotImplementedError
90
145
  end
91
146
 
92
- private
93
-
94
- def process_args
95
- parameters = self.class.parameters
96
- if parameters.length != @args.length
97
- raise ArgumentError, "wrong number of arguments (#{@args.length} for #{parameters.length})"
98
- end
99
-
100
- first_side = nil
101
- second_side = nil
102
- @args.each_with_index do |arg, i|
103
- type = arg.ruby_type[:type]
104
-
105
- parameter_types = parameters[i]
106
- if parameter_types.last.is_a?(Hash)
107
- parameter_options = parameter_types[-1]
108
- parameter_types = parameter_types[0..-2]
109
- else
110
- parameter_options = {}
111
- end
112
-
113
- if parameter_types[0] != :any && !parameter_types.include?(type)
114
- raise TypeError, "expected type #{parameters[i].join(" or ")}, got #{type}"
115
- end
116
-
117
- if parameter_options.has_key?(:values) && arg.raw? && !parameter_options[:values].include?(arg.object)
118
- raise ArgumentError, "argument #{i + 1} (#{arg.object.inspect}) was not one of the expected values: #{parameter_options[:values].inspect}"
119
- end
120
-
121
- if parameter_options.has_key?(:same_type_as)
122
- arg_index = parameter_options[:same_type_as]
123
- other_type = @args[arg_index].ruby_type[:type]
124
- if type != other_type
125
- raise TypeError, "argument #{i + 1} (#{type}) was expected to have the same type as argument #{arg_index + 1} (#{other_type})"
126
- end
127
- end
128
-
129
- if parameter_options.has_key?(:static) &&
130
- parameter_options[:static] != arg.static?
131
- raise TypeError, "argument #{i + 1} was expected to #{arg.static? ? "not be" : "be"} static"
132
- end
133
-
134
- if !arg.static?
135
- if first_side.nil?
136
- first_side = arg.side
137
- elsif arg.side != first_side && second_side.nil?
138
- second_side = arg.side
139
- end
140
-
141
- valid_side = true
142
- case parameter_options[:side]
143
- when :first
144
- if arg.side != first_side
145
- valid_side = false
146
- end
147
- when :second
148
- if second_side.nil? || arg.side != second_side
149
- valid_side = false
150
- end
151
- end
152
-
153
- if !valid_side
154
- raise TypeError, "argument #{i + 1} was expected to have a different side value"
155
- end
156
-
157
- case arg.side
158
- when :lhs
159
- @lhs_args << arg
160
- when :rhs
161
- @rhs_args << arg
162
- end
163
- end
164
- end
147
+ # Calls {#score} with two hashes of record data. The result is then used to
148
+ # notify any observers (typically {ScoreRecorder}).
149
+ #
150
+ # This method is used by {Runner#score_records} when {#type} returns
151
+ # `:simple`. Subclasses should override {#score} to implement the scoring
152
+ # algorithm.
153
+ #
154
+ # @param [Hash] record_1 data from first record
155
+ # @param [Hash] record_2 data from second record
156
+ def score_and_notify(record_1, record_2)
157
+ value = score(record_1, record_2)
158
+ changed
159
+ notify_observers(self, record_1, record_2, value)
165
160
  end
166
161
  end
167
162
  end
168
163
 
169
164
  path = File.expand_path(File.join(File.dirname(__FILE__), "comparators"))
170
- require File.join(path, "binary")
171
165
  require File.join(path, "compare")
172
166
  require File.join(path, "within")
167
+ require File.join(path, "strcompare")