linkage 0.0.8 → 0.1.0.pre

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105)
  1. checksums.yaml +7 -0
  2. data/.gitignore +1 -0
  3. data/.yardopts +1 -0
  4. data/Gemfile +1 -19
  5. data/Gemfile-java +3 -0
  6. data/README.markdown +88 -34
  7. data/Rakefile +16 -15
  8. data/TODO +4 -0
  9. data/lib/linkage/comparator.rb +139 -144
  10. data/lib/linkage/comparators/compare.rb +236 -29
  11. data/lib/linkage/comparators/strcompare.rb +85 -0
  12. data/lib/linkage/comparators/within.rb +24 -20
  13. data/lib/linkage/configuration.rb +44 -466
  14. data/lib/linkage/dataset.rb +28 -127
  15. data/lib/linkage/exceptions.rb +5 -0
  16. data/lib/linkage/field.rb +6 -37
  17. data/lib/linkage/field_set.rb +3 -3
  18. data/lib/linkage/match_recorder.rb +22 -0
  19. data/lib/linkage/match_set.rb +34 -0
  20. data/lib/linkage/match_sets/csv.rb +39 -0
  21. data/lib/linkage/match_sets/database.rb +45 -0
  22. data/lib/linkage/matcher.rb +30 -0
  23. data/lib/linkage/result_set.rb +25 -110
  24. data/lib/linkage/result_sets/csv.rb +54 -0
  25. data/lib/linkage/result_sets/database.rb +42 -0
  26. data/lib/linkage/runner.rb +57 -16
  27. data/lib/linkage/score_recorder.rb +30 -0
  28. data/lib/linkage/score_set.rb +49 -0
  29. data/lib/linkage/score_sets/csv.rb +64 -0
  30. data/lib/linkage/score_sets/database.rb +77 -0
  31. data/lib/linkage/version.rb +1 -1
  32. data/lib/linkage.rb +14 -17
  33. data/linkage.gemspec +13 -1
  34. data/linkage.gemspec-java +32 -0
  35. data/test/helper.rb +30 -23
  36. data/test/integration/test_cross_linkage.rb +46 -25
  37. data/test/integration/test_database_result_set.rb +55 -0
  38. data/test/integration/test_dual_linkage.rb +19 -94
  39. data/test/integration/test_self_linkage.rb +100 -203
  40. data/test/integration/test_within_comparator.rb +24 -77
  41. data/test/unit/comparators/test_compare.rb +254 -50
  42. data/test/unit/comparators/test_strcompare.rb +45 -0
  43. data/test/unit/comparators/test_within.rb +14 -26
  44. data/test/unit/match_sets/test_csv.rb +78 -0
  45. data/test/unit/match_sets/test_database.rb +63 -0
  46. data/test/unit/result_sets/test_csv.rb +111 -0
  47. data/test/unit/result_sets/test_database.rb +68 -0
  48. data/test/unit/score_sets/test_csv.rb +151 -0
  49. data/test/unit/score_sets/test_database.rb +149 -0
  50. data/test/unit/test_comparator.rb +46 -83
  51. data/test/unit/test_comparators.rb +4 -0
  52. data/test/unit/test_configuration.rb +99 -145
  53. data/test/unit/test_dataset.rb +52 -73
  54. data/test/unit/test_field.rb +4 -55
  55. data/test/unit/test_field_set.rb +6 -6
  56. data/test/unit/test_match_recorder.rb +23 -0
  57. data/test/unit/test_match_set.rb +23 -0
  58. data/test/unit/test_match_sets.rb +4 -0
  59. data/test/unit/test_matcher.rb +44 -0
  60. data/test/unit/test_result_set.rb +24 -223
  61. data/test/unit/test_result_sets.rb +4 -0
  62. data/test/unit/test_runner.rb +122 -17
  63. data/test/unit/test_runners.rb +4 -0
  64. data/test/unit/test_score_recorder.rb +25 -0
  65. data/test/unit/test_score_set.rb +37 -0
  66. data/test/unit/test_score_sets.rb +4 -0
  67. metadata +183 -90
  68. data/Gemfile.lock +0 -92
  69. data/lib/linkage/comparators/binary.rb +0 -12
  70. data/lib/linkage/data.rb +0 -175
  71. data/lib/linkage/decollation.rb +0 -93
  72. data/lib/linkage/expectation.rb +0 -21
  73. data/lib/linkage/expectations/exhaustive.rb +0 -63
  74. data/lib/linkage/expectations/simple.rb +0 -168
  75. data/lib/linkage/function.rb +0 -148
  76. data/lib/linkage/functions/binary.rb +0 -30
  77. data/lib/linkage/functions/cast.rb +0 -54
  78. data/lib/linkage/functions/length.rb +0 -29
  79. data/lib/linkage/functions/strftime.rb +0 -33
  80. data/lib/linkage/functions/trim.rb +0 -30
  81. data/lib/linkage/group.rb +0 -55
  82. data/lib/linkage/meta_object.rb +0 -139
  83. data/lib/linkage/runner/single_threaded.rb +0 -187
  84. data/lib/linkage/utils.rb +0 -164
  85. data/lib/linkage/warnings.rb +0 -5
  86. data/test/integration/test_collation.rb +0 -45
  87. data/test/integration/test_configuration.rb +0 -268
  88. data/test/integration/test_dataset.rb +0 -116
  89. data/test/integration/test_functions.rb +0 -88
  90. data/test/integration/test_result_set.rb +0 -85
  91. data/test/integration/test_scoring.rb +0 -84
  92. data/test/unit/expectations/test_exhaustive.rb +0 -111
  93. data/test/unit/expectations/test_simple.rb +0 -303
  94. data/test/unit/functions/test_binary.rb +0 -54
  95. data/test/unit/functions/test_cast.rb +0 -98
  96. data/test/unit/functions/test_length.rb +0 -52
  97. data/test/unit/functions/test_strftime.rb +0 -60
  98. data/test/unit/functions/test_trim.rb +0 -43
  99. data/test/unit/runner/test_single_threaded.rb +0 -12
  100. data/test/unit/test_data.rb +0 -445
  101. data/test/unit/test_decollation.rb +0 -201
  102. data/test/unit/test_function.rb +0 -233
  103. data/test/unit/test_group.rb +0 -38
  104. data/test/unit/test_meta_object.rb +0 -208
  105. data/test/unit/test_utils.rb +0 -341
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: ad6a9ee6a6add94a342e3d02d49d8a4bfeb9122b
4
+ data.tar.gz: a989e8e810602dfcd4da596fcc1154a922dedccd
5
+ SHA512:
6
+ metadata.gz: 0d78da4904100679826cf76ae07f7daf984c12e2a762aceb0ad2d1387c5a5778403a782ff104ce48c1fd3ec5588c728fe2b8ea7a79a896da153a8d91c7e4cb14
7
+ data.tar.gz: 6797e5598f47413022d18c6732cbf0613efaccba886cda56eb390e3344d131ccfd977b5992c5a2c4557cfbc4f93bf747ddffe849770be5860e793f27fe3e2286
data/.gitignore CHANGED
@@ -8,3 +8,4 @@ test.rb
8
8
  results.db
9
9
  .rbenv-version
10
10
  bin
11
+ Gemfile.lock
data/.yardopts ADDED
@@ -0,0 +1 @@
1
+ -m markdown
data/Gemfile CHANGED
@@ -1,21 +1,3 @@
1
- source 'http://rubygems.org'
1
+ source 'https://rubygems.org'
2
2
 
3
3
  gemspec
4
-
5
- group :development do
6
- gem 'bundler'
7
- gem 'test-unit'
8
- gem 'mocha'
9
- gem 'yard'
10
- gem 'rake'
11
- gem 'versionomy'
12
- gem 'sqlite3', :platforms => :ruby
13
- gem 'mysql2', :platforms => :ruby
14
- gem 'jdbc-sqlite3', :platforms => :jruby
15
- gem 'jdbc-mysql', :platforms => :jruby
16
- gem 'rdiscount'
17
- gem 'guard-test'
18
- gem 'guard-yard', :platforms => :ruby_19
19
- gem 'rb-inotify', '~> 0.8.8'
20
- gem 'debugger'
21
- end
data/Gemfile-java ADDED
@@ -0,0 +1,3 @@
1
+ source 'https://rubygems.org'
2
+
3
+ gemspec
data/README.markdown CHANGED
@@ -1,52 +1,106 @@
1
- # linkage
1
+ # Linkage
2
2
 
3
- Linkage is a library for record linkage between one or two database tables.
3
+ Linkage is a Ruby library for record linkage between one or two database tables.
4
+
5
+ ## What is record linkage?
6
+
7
+ In an ideal world, records that reference the same entity can be easily
8
+ identified. Unfortunately, this isn't always the case. Sometimes there are no
9
+ good identifiers in the datasets that you're interested in (ID, social security
10
+ number, etc). In such cases, it is necessary to use other means to determine
11
+ which records refer to which entity, and this process is known as **record
12
+ linkage**.
13
+
14
+ ## Prerequisites
15
+
16
+ In order to use Linkage, the records you want to link must be in a database.
17
+ Linkage has the ability to perform record linkage across different kinds of
18
+ databases, so it's okay if your records are not all in the same place.
19
+
20
+ Since Linkage uses [Sequel](http://sequel.jeremyevans.net/) to communicate with
21
+ databases, any database that Sequel supports will work. See [Connecting to a
22
+ database](http://sequel.jeremyevans.net/documentation.html) on the Sequel
23
+ website for more information about what databases are supported.
4
24
 
5
25
  ## Usage
6
26
 
7
- Linkage uses Sequel to talk to databases, so any database that Sequel can
8
- talk to, Linkage can talk to. You just give Linkage the Sequel-style URI
9
- and the database table name:
27
+ To perform a record linkage, Linkage needs information about the following:
28
+ datasets, result set, and comparators. A dataset refers to a table in a
29
+ database. A result set is a place to put score and match information that
30
+ Linkage generates. Comparators describe how records are compared.
31
+
32
+ A dataset is created via the `Linkage::Dataset` class, along with a connection URI
33
+ and a table name:
10
34
 
11
- ds = Linkage::Dataset.new('mysql://example.com/database_name', 'table_name')
35
+ ```ruby
36
+ ds = Linkage::Dataset.new('mysql://example.com/database_name', 'table_name')
37
+ ```
12
38
 
13
- To describe a linkage, you use the `Dataset#link_with` method.
39
+ Result sets have different options depending on what storage medium you're
40
+ using (CSV or database). For CSVs, you could use:
14
41
 
15
- parents = Linkage::Dataset.new('postgres://example.com/foo', 'parents')
16
- children = Linkage::Dataset.new('mysql://some-other-host.net/bar', 'children')
17
- config = parents.link_with(children) do
18
- lhs[:first_name].must == rhs[:parent_first_name]
19
- lhs[:last_name].must == rhs[:parent_last_name]
20
- lhs[:last_name].must_not == "Smith" # exclude parents with the last
21
- # name "Smith"
42
+ ```ruby
43
+ result_set = Linkage::ResultSet['csv'].new('~/my_results')
44
+ ```
22
45
 
23
- save_results_in('sqlite://results.db') # see below
24
- end
46
+ In this case, scores and matches will be saved in CSV files in the `my_results`
47
+ directory in your home folder.
25
48
 
26
- Note that the datasets don't have to be in the same database, or even on
27
- the same machine.
49
+ To describe a linkage, you can use the `Dataset#link_with` method. This creates
50
+ a linkage configuration that you can use to describe how you want the records in
51
+ each dataset to be compared. For example:
52
+
53
+ ```ruby
54
+ demo = Linkage::Dataset.new('postgres://example.com/foo', 'demographics')
55
+ visits = Linkage::Dataset.new('mysql://some-other-host.net/bar', 'visits')
56
+ result_set = Linkage::ResultSet['csv'].new('~/my_results')
57
+ config = demo.link_with(visits, result_set) do |config|
58
+ config.compare([:first_name, :last_name], [:first_name, :last_name], :equal)
59
+ end
60
+ ```
61
+
62
+ This linkage would match records from a demographics table to records in a table
63
+ with information about doctor visits by using first name and last name.
64
+
65
+ The `compare` method creates a `Compare` comparator. This is the simplest
66
+ comparator in Linkage, and it just compares fields with the operator you specify
67
+ (`:equal`, `:less_than`, `:greater_than`, etc). When a comparator compares
68
+ two records, it gives the pair of records a score between 0 and 1. In the case
69
+ of the example above, records that have the same first name and last name get a
70
+ score of 1, and records that don't get a score of 0 (or sometimes, they aren't
71
+ scored and assumed to have a score of 0).
72
+
73
+ Other comparators are `Strcompare` for approximate string matching and
74
+ `Within` for matching numbers within a range.
28
75
 
29
76
  To run a linkage, use a Runner with the resulting configuration from
30
77
  `Dataset#link_with`:
31
78
 
32
- runner = Linkage::SingleThreadedRunner.new(config)
33
- runner.execute
79
+ ```ruby
80
+ runner = Linkage::Runner.new(config)
81
+ runner.execute
82
+ ```
83
+
84
+ After running a linkage, there will be a list of matches in a CSV file or
85
+ database, depending on how you configured your result set.
86
+
87
+ The default way linkage determines if two records match is by comparing the
88
+ average score to a threshold value (which is 0.5 by default). You can configure
89
+ the threshold value like so: `config.threshold = 0.9`.
34
90
 
35
- The runner saves results in a database that you specify in the configuration
36
- (via the `save_results_in` method). It stores its results in two database
37
- tables: `groups` and `groups_records`. The `groups` table contains all of the
38
- unique combinations of values in your datasets, and `groups_records` maps
39
- records to groups.
91
+ ## Other examples
40
92
 
41
- You can also link a dataset to itself:
93
+ Linking a dataset to itself:
42
94
 
43
- births = Linkage::Dataset.new('postgres://example.com/hospital_data', 'births')
44
- config = births.link_with(births) do
45
- lhs[:mother_first_name].must == rhs[:mother_first_name]
46
- lhs[:mother_last_name].must == rhs[:mother_last_name]
47
- end
48
- runner = Linkage::SingleThreadedRunner.new(config, 'sqlite://results.db')
49
- runner.execute
95
+ ```ruby
96
+ births = Linkage::Dataset.new('postgres://example.com/hospital_data', 'births')
97
+ result_set = Linkage::ResultSet['csv'].new('~/my_birth_results')
98
+ config = births.link_with(births, result_set) do |config|
99
+ config.compare([:mother_first_name, :mother_last_name], [:mother_first_name, :mother_last_name], :equal)
100
+ end
101
+ runner = Linkage::Runner.new(config)
102
+ runner.execute
103
+ ```
50
104
 
51
105
  The above example would find birth records that have mothers with the same
52
106
  name.
@@ -62,6 +116,6 @@ name.
62
116
 
63
117
  ## Copyright
64
118
 
65
- Copyright (c) 2011 Vanderbilt University. See LICENSE.txt for
119
+ Copyright (c) 2011-2014 Vanderbilt University. See LICENSE.txt for
66
120
  further details.
67
121
 
data/Rakefile CHANGED
@@ -1,16 +1,4 @@
1
- # encoding: utf-8
2
-
3
- require 'rubygems'
4
- require 'bundler'
5
- begin
6
- Bundler.setup(:default, :development)
7
- rescue Bundler::BundlerError => e
8
- $stderr.puts e.message
9
- $stderr.puts "Run `bundle install` to install missing gems"
10
- exit e.status_code
11
- end
12
- require 'rake'
13
- require "bundler/gem_tasks"
1
+ require 'bundler/gem_tasks'
14
2
 
15
3
  require 'rake/testtask'
16
4
  Rake::TestTask.new(:test) do |test|
@@ -18,9 +6,22 @@ Rake::TestTask.new(:test) do |test|
18
6
  test.pattern = 'test/**/test_*.rb'
19
7
  test.verbose = true
20
8
  end
21
-
22
9
  task :default => :test
23
10
 
11
+ namespace :test do
12
+ Rake::TestTask.new(:unit) do |test|
13
+ test.libs << 'lib' << 'test'
14
+ test.pattern = 'test/unit/**/test_*.rb'
15
+ test.verbose = true
16
+ end
17
+
18
+ Rake::TestTask.new(:integration) do |test|
19
+ test.libs << 'lib' << 'test'
20
+ test.pattern = 'test/integration/**/test_*.rb'
21
+ test.verbose = true
22
+ end
23
+ end
24
+
24
25
  require 'yard'
25
26
  YARD::Rake::YardocTask.new do |t|
26
27
  t.files = ['lib/**/*.rb']
@@ -28,7 +29,7 @@ end
28
29
 
29
30
  # Yoinked from https://github.com/rails/rails/blob/master/railties/lib/rails/tasks/annotations.rake
30
31
  namespace :notes do
31
- ["OPTIMIZE", "FIXME", "TODO"].each do |annotation|
32
+ ['OPTIMIZE', 'FIXME', 'TODO'].each do |annotation|
32
33
  desc "Enumerate all #{annotation} annotations"
33
34
  task annotation.downcase.intern do
34
35
  SourceAnnotationExtractor.enumerate annotation
data/TODO ADDED
@@ -0,0 +1,4 @@
1
+ Features
2
+ - add matcher algorithms
3
+ - change comparator_id to more than just an index; need to get comparator ids
4
+ from result set instead of configuration
data/lib/linkage/comparator.rb CHANGED
@@ -1,172 +1,167 @@
1
1
  module Linkage
2
- # @abstract Abstract class to represent record comparators.
2
+ # {Comparator} is the superclass for comparators in Linkage. Comparators are
3
+ # used to compare two records and compute scores based on how closely the two
4
+ # records relate.
5
+ #
6
+ # Each comparator should inherit from {Comparator} and declare itself as
7
+ # simple or advanced by overriding {#type} (the default is simple). Simple
8
+ # comparators must define the {#score} method that uses data from two records
9
+ # and returns a number (`Integer` or `Float`) between 0 and 1 (inclusive).
10
+ # Advanced comparators must define both {#score_dataset} and {#score_datasets}
11
+ # that use one or two {Dataset}s respectively to create scores.
12
+ #
13
+ # Each comparator can be registered via the {.register} function. This allows
14
+ # {Configuration} a way to find a comparator by name via
15
+ # {Configuration#method_missing}. For example, `config.compare(...)` creates a
16
+ # new {Comparators::Compare} object, since that comparator is registered under
17
+ # the name `"compare"`.
18
+ #
19
+ # See documentation for the methods below for more information.
20
+ #
21
+ # @abstract
3
22
  class Comparator
4
- # Register a new comparator.
5
- #
6
- # @param [Class] klass Comparator subclass
7
- def self.register(klass)
8
- name = nil
9
- begin
10
- name = klass.comparator_name
11
- rescue NotImplementedError
12
- raise ArgumentError, "comparator_name class method must be defined"
13
- end
14
-
15
- if !klass.instance_methods(false).include?(:score)
16
- raise ArgumentError, "class must define the score method"
17
- end
18
-
19
- begin
20
- if klass.parameters.length > 0
21
- @comparators ||= {}
22
- @comparators[name] = klass
23
- else
24
- raise ArgumentError, "class must have at least one parameter"
23
+ include Observable
24
+
25
+ class << self
26
+ # Register a new comparator. Subclasses must define at least {#score} for
27
+ # simple comparators, or {#score_dataset} and {#score_datasets} for
28
+ # advanced comparators. Otherwise, an `ArgumentError` will be raised when
29
+ # you try to call {.register}. The `name` parameter is used in
30
+ # {Configuration#method_missing} as an easy way for users to select
31
+ # comparators for their linkage.
32
+ #
33
+ # @param [String] name Comparator name used in {.klass_for}
34
+ # @param [Class] klass Comparator subclass
35
+ def register(name, klass)
36
+ methods = klass.instance_methods(false)
37
+ if !methods.include?(:score) && (!methods.include?(:score_datasets) || !methods.include?(:score_dataset))
38
+ raise ArgumentError, "class must define either #score or both #score_datasets and #score_dataset methods"
25
39
  end
26
- rescue NotImplementedError
27
- raise ArgumentError, "parameters class method must be defined"
40
+
41
+ @comparators ||= {}
42
+ @comparators[name] = klass
28
43
  end
29
44
 
30
- begin
31
- range = klass.score_range
32
- if !range.is_a?(Range) || !range.first.is_a?(Numeric) ||
33
- !range.last.is_a?(Numeric)
34
- raise ArgumentError, "score_range must be a Range of two numbers"
35
- end
36
- rescue NotImplementedError
37
- raise ArgumentError, "score_range class method must be defined"
45
+ # Return a registered Comparator subclass or `nil` if it doesn't exist.
46
+ #
47
+ # @param [String] name of registered comparator
48
+ # @return [Class, nil]
49
+ def klass_for(name)
50
+ @comparators ? @comparators[name] : nil
38
51
  end
52
+ alias :[] :klass_for
39
53
  end
40
54
 
41
- def self.[](name)
42
- @comparators ? @comparators[name] : nil
55
+ # Return the type of this comparator. When {#type} returns `:simple`,
56
+ # {#score_and_notify} is called by {Runner#score_records} with each pair of
57
+ # records in order to create scores. When {#type} returns `:advanced`,
58
+ # either {#score_dataset} or {#score_datasets} is called by
59
+ # {Runner#score_records}. In advanced mode, it is left up to the
60
+ # {Comparator} subclass to determine which records to compare and how to
61
+ # compare them.
62
+ #
63
+ # @return [Symbol] either `:simple` or `:advanced`
64
+ def type
65
+ @type || :simple
43
66
  end
44
67
 
45
- # @abstract Override this to return the name of the comparator.
46
- # @return [String]
47
- def self.comparator_name
68
+ # Override this to return the score of the linkage strength of two records.
69
+ # This method is used to score records by {Runner#score_records} when
70
+ # {#type} returns `:simple`.
71
+ #
72
+ # @abstract
73
+ # @param [Hash] record_1 data from first record
74
+ # @param [Hash] record_2 data from second record
75
+ # @return [Numeric] value between 0 and 1 (inclusive)
76
+ def score(record_1, record_2)
48
77
  raise NotImplementedError
49
78
  end
50
79
 
51
- # @abstract Override this to require a specific number of arguments of a
52
- # certain class. To require two parameters of either String or Integer,
53
- # do something like this:
80
+ # Override this to score the linkage strength of records in two datasets.
81
+ # This method is used to score records by {Runner#score_records} when
82
+ # {#type} returns `:advanced` and {Configuration} is setup to link two
83
+ # datasets together.
54
84
  #
55
- # @@parameters = [[String, Integer], [String, Integer]]
56
- # def self.parameters
57
- # @@parameters
58
- # end
85
+ # Since each {Dataset} delegates to a
86
+ # {http://sequel.jeremyevans.net/rdoc/classes/Sequel/Dataset.html `Sequel::Dataset`},
87
+ # you can use any
88
+ # {http://sequel.jeremyevans.net/rdoc/classes/Sequel/Dataset.html `Sequel::Dataset`}
89
+ # methods that you wish in order to select records to compare.
59
90
  #
60
- # At least one argument must be defined.
61
- # @return [Array]
62
- def self.parameters
63
- raise NotImplementedError
64
- end
65
-
66
- # @abstract Override this to return a Range of the possible scores for the
67
- # comparator.
68
- # @return [Range]
69
- def self.score_range
91
+ # To record scores, subclasses must call
92
+ # {http://ruby-doc.org/stdlib/libdoc/observer/rdoc/Observable.html `Observable#notify_observers`}
93
+ # like so:
94
+ #
95
+ # ```ruby
96
+ # changed
97
+ # notify_observers(self, record_1, record_2, score)
98
+ # ```
99
+ #
100
+ # This works by notifying any observers, typically {ScoreRecorder}, that a
101
+ # new score has been generated. {ScoreRecorder#update} then calls
102
+ # {ScoreSet#add_score} with comparator ID, the primary key of each record
103
+ # and the score.
104
+ #
105
+ # @abstract
106
+ # @param [Linkage::Dataset] dataset_1
107
+ # @param [Linkage::Dataset] dataset_2
108
+ # @see http://ruby-doc.org/stdlib/libdoc/observer/rdoc/Observable.html Observable
109
+ # @see http://sequel.jeremyevans.net/rdoc/classes/Sequel/Dataset.html Sequel::Dataset
110
+ def score_datasets(dataset_1, dataset_2)
70
111
  raise NotImplementedError
71
112
  end
72
113
 
73
- attr_reader :args, :lhs_args, :rhs_args
74
-
75
- # Create a new Comparator object.
76
- # @param [Linkage::MetaObject, Hash] args Comparator arguments
77
- def initialize(*args)
78
- @args = args
79
- @lhs_args = []
80
- @rhs_args = []
81
- @options = args.last.is_a?(Hash) ? args.pop : {}
82
- process_args
83
- end
84
-
85
- # @abstract Override this to return the score of the linkage strength of
86
- # two records.
87
- # @return [Numeric]
88
- def score(record_1, record_2)
114
+ # Override this to score the linkage strength of records in one dataset.
115
+ # This method is used to score records by {Runner#score_records} when
116
+ # {#type} returns `:advanced` and {Configuration} is setup to link a
117
+ # dataset to itself.
118
+ #
119
+ # Since a {Dataset} delegates to a
120
+ # {http://sequel.jeremyevans.net/rdoc/classes/Sequel/Dataset.html `Sequel::Dataset`},
121
+ # you can use any
122
+ # {http://sequel.jeremyevans.net/rdoc/classes/Sequel/Dataset.html `Sequel::Dataset`}
123
+ # methods that you wish in order to select records to compare.
124
+ #
125
+ # To record scores, subclasses must call
126
+ # {http://ruby-doc.org/stdlib/libdoc/observer/rdoc/Observable.html `Observable#notify_observers`}
127
+ # like so:
128
+ #
129
+ # ```ruby
130
+ # changed
131
+ # notify_observers(self, record_1, record_2, score)
132
+ # ```
133
+ #
134
+ # This works by notifying any observers, typically {ScoreRecorder}, that a
135
+ # new score has been generated. {ScoreRecorder#update} then calls
136
+ # {ScoreSet#add_score} with comparator ID, the primary key of each record
137
+ # and the score.
138
+ #
139
+ # @abstract
140
+ # @param [Linkage::Dataset] dataset
141
+ # @see http://ruby-doc.org/stdlib/libdoc/observer/rdoc/Observable.html Observable
142
+ # @see http://sequel.jeremyevans.net/rdoc/classes/Sequel/Dataset.html Sequel::Dataset
143
+ def score_dataset(dataset)
89
144
  raise NotImplementedError
90
145
  end
91
146
 
92
- private
93
-
94
- def process_args
95
- parameters = self.class.parameters
96
- if parameters.length != @args.length
97
- raise ArgumentError, "wrong number of arguments (#{@args.length} for #{parameters.length})"
98
- end
99
-
100
- first_side = nil
101
- second_side = nil
102
- @args.each_with_index do |arg, i|
103
- type = arg.ruby_type[:type]
104
-
105
- parameter_types = parameters[i]
106
- if parameter_types.last.is_a?(Hash)
107
- parameter_options = parameter_types[-1]
108
- parameter_types = parameter_types[0..-2]
109
- else
110
- parameter_options = {}
111
- end
112
-
113
- if parameter_types[0] != :any && !parameter_types.include?(type)
114
- raise TypeError, "expected type #{parameters[i].join(" or ")}, got #{type}"
115
- end
116
-
117
- if parameter_options.has_key?(:values) && arg.raw? && !parameter_options[:values].include?(arg.object)
118
- raise ArgumentError, "argument #{i + 1} (#{arg.object.inspect}) was not one of the expected values: #{parameter_options[:values].inspect}"
119
- end
120
-
121
- if parameter_options.has_key?(:same_type_as)
122
- arg_index = parameter_options[:same_type_as]
123
- other_type = @args[arg_index].ruby_type[:type]
124
- if type != other_type
125
- raise TypeError, "argument #{i + 1} (#{type}) was expected to have the same type as argument #{arg_index + 1} (#{other_type})"
126
- end
127
- end
128
-
129
- if parameter_options.has_key?(:static) &&
130
- parameter_options[:static] != arg.static?
131
- raise TypeError, "argument #{i + 1} was expected to #{arg.static? ? "not be" : "be"} static"
132
- end
133
-
134
- if !arg.static?
135
- if first_side.nil?
136
- first_side = arg.side
137
- elsif arg.side != first_side && second_side.nil?
138
- second_side = arg.side
139
- end
140
-
141
- valid_side = true
142
- case parameter_options[:side]
143
- when :first
144
- if arg.side != first_side
145
- valid_side = false
146
- end
147
- when :second
148
- if second_side.nil? || arg.side != second_side
149
- valid_side = false
150
- end
151
- end
152
-
153
- if !valid_side
154
- raise TypeError, "argument #{i + 1} was expected to have a different side value"
155
- end
156
-
157
- case arg.side
158
- when :lhs
159
- @lhs_args << arg
160
- when :rhs
161
- @rhs_args << arg
162
- end
163
- end
164
- end
147
+ # Calls {#score} with two hashes of record data. The result is then used to
148
+ # notify any observers (typically {ScoreRecorder}).
149
+ #
150
+ # This method is used by {Runner#score_records} when {#type} returns
151
+ # `:simple`. Subclasses should override {#score} to implement the scoring
152
+ # algorithm.
153
+ #
154
+ # @param [Hash] record_1 data from first record
155
+ # @param [Hash] record_2 data from second record
156
+ def score_and_notify(record_1, record_2)
157
+ value = score(record_1, record_2)
158
+ changed
159
+ notify_observers(self, record_1, record_2, value)
165
160
  end
166
161
  end
167
162
  end
168
163
 
169
164
  path = File.expand_path(File.join(File.dirname(__FILE__), "comparators"))
170
- require File.join(path, "binary")
171
165
  require File.join(path, "compare")
172
166
  require File.join(path, "within")
167
+ require File.join(path, "strcompare")