linkage 0.0.8 → 0.1.0.pre
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +1 -0
- data/.yardopts +1 -0
- data/Gemfile +1 -19
- data/Gemfile-java +3 -0
- data/README.markdown +88 -34
- data/Rakefile +16 -15
- data/TODO +4 -0
- data/lib/linkage/comparator.rb +139 -144
- data/lib/linkage/comparators/compare.rb +236 -29
- data/lib/linkage/comparators/strcompare.rb +85 -0
- data/lib/linkage/comparators/within.rb +24 -20
- data/lib/linkage/configuration.rb +44 -466
- data/lib/linkage/dataset.rb +28 -127
- data/lib/linkage/exceptions.rb +5 -0
- data/lib/linkage/field.rb +6 -37
- data/lib/linkage/field_set.rb +3 -3
- data/lib/linkage/match_recorder.rb +22 -0
- data/lib/linkage/match_set.rb +34 -0
- data/lib/linkage/match_sets/csv.rb +39 -0
- data/lib/linkage/match_sets/database.rb +45 -0
- data/lib/linkage/matcher.rb +30 -0
- data/lib/linkage/result_set.rb +25 -110
- data/lib/linkage/result_sets/csv.rb +54 -0
- data/lib/linkage/result_sets/database.rb +42 -0
- data/lib/linkage/runner.rb +57 -16
- data/lib/linkage/score_recorder.rb +30 -0
- data/lib/linkage/score_set.rb +49 -0
- data/lib/linkage/score_sets/csv.rb +64 -0
- data/lib/linkage/score_sets/database.rb +77 -0
- data/lib/linkage/version.rb +1 -1
- data/lib/linkage.rb +14 -17
- data/linkage.gemspec +13 -1
- data/linkage.gemspec-java +32 -0
- data/test/helper.rb +30 -23
- data/test/integration/test_cross_linkage.rb +46 -25
- data/test/integration/test_database_result_set.rb +55 -0
- data/test/integration/test_dual_linkage.rb +19 -94
- data/test/integration/test_self_linkage.rb +100 -203
- data/test/integration/test_within_comparator.rb +24 -77
- data/test/unit/comparators/test_compare.rb +254 -50
- data/test/unit/comparators/test_strcompare.rb +45 -0
- data/test/unit/comparators/test_within.rb +14 -26
- data/test/unit/match_sets/test_csv.rb +78 -0
- data/test/unit/match_sets/test_database.rb +63 -0
- data/test/unit/result_sets/test_csv.rb +111 -0
- data/test/unit/result_sets/test_database.rb +68 -0
- data/test/unit/score_sets/test_csv.rb +151 -0
- data/test/unit/score_sets/test_database.rb +149 -0
- data/test/unit/test_comparator.rb +46 -83
- data/test/unit/test_comparators.rb +4 -0
- data/test/unit/test_configuration.rb +99 -145
- data/test/unit/test_dataset.rb +52 -73
- data/test/unit/test_field.rb +4 -55
- data/test/unit/test_field_set.rb +6 -6
- data/test/unit/test_match_recorder.rb +23 -0
- data/test/unit/test_match_set.rb +23 -0
- data/test/unit/test_match_sets.rb +4 -0
- data/test/unit/test_matcher.rb +44 -0
- data/test/unit/test_result_set.rb +24 -223
- data/test/unit/test_result_sets.rb +4 -0
- data/test/unit/test_runner.rb +122 -17
- data/test/unit/test_runners.rb +4 -0
- data/test/unit/test_score_recorder.rb +25 -0
- data/test/unit/test_score_set.rb +37 -0
- data/test/unit/test_score_sets.rb +4 -0
- metadata +183 -90
- data/Gemfile.lock +0 -92
- data/lib/linkage/comparators/binary.rb +0 -12
- data/lib/linkage/data.rb +0 -175
- data/lib/linkage/decollation.rb +0 -93
- data/lib/linkage/expectation.rb +0 -21
- data/lib/linkage/expectations/exhaustive.rb +0 -63
- data/lib/linkage/expectations/simple.rb +0 -168
- data/lib/linkage/function.rb +0 -148
- data/lib/linkage/functions/binary.rb +0 -30
- data/lib/linkage/functions/cast.rb +0 -54
- data/lib/linkage/functions/length.rb +0 -29
- data/lib/linkage/functions/strftime.rb +0 -33
- data/lib/linkage/functions/trim.rb +0 -30
- data/lib/linkage/group.rb +0 -55
- data/lib/linkage/meta_object.rb +0 -139
- data/lib/linkage/runner/single_threaded.rb +0 -187
- data/lib/linkage/utils.rb +0 -164
- data/lib/linkage/warnings.rb +0 -5
- data/test/integration/test_collation.rb +0 -45
- data/test/integration/test_configuration.rb +0 -268
- data/test/integration/test_dataset.rb +0 -116
- data/test/integration/test_functions.rb +0 -88
- data/test/integration/test_result_set.rb +0 -85
- data/test/integration/test_scoring.rb +0 -84
- data/test/unit/expectations/test_exhaustive.rb +0 -111
- data/test/unit/expectations/test_simple.rb +0 -303
- data/test/unit/functions/test_binary.rb +0 -54
- data/test/unit/functions/test_cast.rb +0 -98
- data/test/unit/functions/test_length.rb +0 -52
- data/test/unit/functions/test_strftime.rb +0 -60
- data/test/unit/functions/test_trim.rb +0 -43
- data/test/unit/runner/test_single_threaded.rb +0 -12
- data/test/unit/test_data.rb +0 -445
- data/test/unit/test_decollation.rb +0 -201
- data/test/unit/test_function.rb +0 -233
- data/test/unit/test_group.rb +0 -38
- data/test/unit/test_meta_object.rb +0 -208
- data/test/unit/test_utils.rb +0 -341
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: ad6a9ee6a6add94a342e3d02d49d8a4bfeb9122b
|
4
|
+
data.tar.gz: a989e8e810602dfcd4da596fcc1154a922dedccd
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 0d78da4904100679826cf76ae07f7daf984c12e2a762aceb0ad2d1387c5a5778403a782ff104ce48c1fd3ec5588c728fe2b8ea7a79a896da153a8d91c7e4cb14
|
7
|
+
data.tar.gz: 6797e5598f47413022d18c6732cbf0613efaccba886cda56eb390e3344d131ccfd977b5992c5a2c4557cfbc4f93bf747ddffe849770be5860e793f27fe3e2286
|
data/.gitignore
CHANGED
data/.yardopts
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
-m markdown
|
data/Gemfile
CHANGED
@@ -1,21 +1,3 @@
|
|
1
|
-
source '
|
1
|
+
source 'https://rubygems.org'
|
2
2
|
|
3
3
|
gemspec
|
4
|
-
|
5
|
-
group :development do
|
6
|
-
gem 'bundler'
|
7
|
-
gem 'test-unit'
|
8
|
-
gem 'mocha'
|
9
|
-
gem 'yard'
|
10
|
-
gem 'rake'
|
11
|
-
gem 'versionomy'
|
12
|
-
gem 'sqlite3', :platforms => :ruby
|
13
|
-
gem 'mysql2', :platforms => :ruby
|
14
|
-
gem 'jdbc-sqlite3', :platforms => :jruby
|
15
|
-
gem 'jdbc-mysql', :platforms => :jruby
|
16
|
-
gem 'rdiscount'
|
17
|
-
gem 'guard-test'
|
18
|
-
gem 'guard-yard', :platforms => :ruby_19
|
19
|
-
gem 'rb-inotify', '~> 0.8.8'
|
20
|
-
gem 'debugger'
|
21
|
-
end
|
data/Gemfile-java
ADDED
data/README.markdown
CHANGED
@@ -1,52 +1,106 @@
|
|
1
|
-
#
|
1
|
+
# Linkage
|
2
2
|
|
3
|
-
Linkage is a library for record linkage between one or two database tables.
|
3
|
+
Linkage is a Ruby library for record linkage between one or two database tables.
|
4
|
+
|
5
|
+
## What is record linkage?
|
6
|
+
|
7
|
+
In an ideal world, records that reference the same entity can be easily
|
8
|
+
identified. Unfortunately, this isn't always the case. Sometimes there are no
|
9
|
+
good identifiers in the datasets that you're interested in (ID, social security
|
10
|
+
number, etc). In such cases, it is necessary to use other means to determine
|
11
|
+
which records refer to which entity, and this process is known as **record
|
12
|
+
linkage**.
|
13
|
+
|
14
|
+
## Prerequisites
|
15
|
+
|
16
|
+
In order to use Linkage, the records you want to link must be in a database.
|
17
|
+
Linkage has the ability to perform record linkage across different kinds of
|
18
|
+
databases, so it's okay if your records are not all in the same place.
|
19
|
+
|
20
|
+
Since Linkage uses [Sequel](http://sequel.jeremyevans.net/) to communicate with
|
21
|
+
databases, any database that Sequel supports will work. See [Connecting to a
|
22
|
+
database](http://sequel.jeremyevans.net/documentation.html) on the Sequel
|
23
|
+
website for more information about what databases are supported.
|
4
24
|
|
5
25
|
## Usage
|
6
26
|
|
7
|
-
|
8
|
-
|
9
|
-
and
|
27
|
+
To perform a record linkage, Linkage needs information about the following:
|
28
|
+
datasets, result set, and comparators. A dataset refers to a table in a
|
29
|
+
database. A result set is a place to put score and match information that
|
30
|
+
Linkage generates. Comparators describe how records are compared.
|
31
|
+
|
32
|
+
A dataset is created via the `Linkage::Dataset` class, along with a connection URI
|
33
|
+
and a table name:
|
10
34
|
|
11
|
-
|
35
|
+
```ruby
|
36
|
+
ds = Linkage::Dataset.new('mysql://example.com/database_name', 'table_name')
|
37
|
+
```
|
12
38
|
|
13
|
-
|
39
|
+
Result sets have different options depending on what storage medium you're
|
40
|
+
using (CSV or database). For CSVs, you could use:
|
14
41
|
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
lhs[:first_name].must == rhs[:parent_first_name]
|
19
|
-
lhs[:last_name].must == rhs[:parent_last_name]
|
20
|
-
lhs[:last_name].must_not == "Smith" # exclude parents with the last
|
21
|
-
# name "Smith"
|
42
|
+
```ruby
|
43
|
+
result_set = Linkage::ResultSet['csv'].new('~/my_results')
|
44
|
+
```
|
22
45
|
|
23
|
-
|
24
|
-
|
46
|
+
In this case, scores and matches will be saved in CSV files in the `my_results`
|
47
|
+
directory in your home folder.
|
25
48
|
|
26
|
-
|
27
|
-
the
|
49
|
+
To describe a linkage, you can use the `Dataset#link_with` method. This creates
|
50
|
+
a linkage configuration that you can use to describe how you want the records in
|
51
|
+
each dataset to be compared. For example:
|
52
|
+
|
53
|
+
```ruby
|
54
|
+
demo = Linkage::Dataset.new('postgres://example.com/foo', 'demographics')
|
55
|
+
visits = Linkage::Dataset.new('mysql://some-other-host.net/bar', 'visits')
|
56
|
+
result_set = Linkage::ResultSet['csv'].new('~/my_results')
|
57
|
+
config = demo.link_with(visits, result_set) do |config|
|
58
|
+
config.compare([:first_name, :last_name], [:first_name, :last_name], :equal)
|
59
|
+
end
|
60
|
+
```
|
61
|
+
|
62
|
+
This linkage would match records from a demographics table to records in a table
|
63
|
+
with information about doctor visits by using first name and last name.
|
64
|
+
|
65
|
+
The `compare` method creates a `Compare` comparator. This is the simplest
|
66
|
+
comparator in Linkage, and it just compares fields with the operator you specify
|
67
|
+
(`:equal`, `:less_than`, `:greater_than`, etc). When a comparator compares
|
68
|
+
two records, it gives the pair of records a score between 0 and 1. In the case
|
69
|
+
of the example above, records that have the same first name and last name get a
|
70
|
+
score of 1, and records that don't get a score of 0 (or sometimes, they aren't
|
71
|
+
scored and are assumed to have a score of 0).
|
72
|
+
|
73
|
+
Other comparators are `Strcompare` for approximate string matching and
|
74
|
+
`Within` for matching numbers within a range.
|
28
75
|
|
29
76
|
To run a linkage, use a Runner with the resulting configuration from
|
30
77
|
`Dataset#link_with`:
|
31
78
|
|
32
|
-
|
33
|
-
|
79
|
+
```ruby
|
80
|
+
runner = Linkage::Runner.new(config)
|
81
|
+
runner.execute
|
82
|
+
```
|
83
|
+
|
84
|
+
After running a linkage, there will be a list of matches in a CSV file or
|
85
|
+
database, depending on how you configured your result set.
|
86
|
+
|
87
|
+
The default way Linkage determines if two records match is by comparing the
|
88
|
+
average score to a threshold value (which is 0.5 by default). You can configure
|
89
|
+
the threshold value like so: `config.threshold = 0.9`.
|
34
90
|
|
35
|
-
|
36
|
-
(via the `save_results_in` method). It stores its results in two database
|
37
|
-
tables: `groups` and `groups_records`. The `groups` table contains all of the
|
38
|
-
unique combinations of values in your datasets, and `groups_records` maps
|
39
|
-
records to groups.
|
91
|
+
## Other examples
|
40
92
|
|
41
|
-
|
93
|
+
Linking a dataset to itself:
|
42
94
|
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
95
|
+
```ruby
|
96
|
+
births = Linkage::Dataset.new('postgres://example.com/hospital_data', 'births')
|
97
|
+
result_set = Linkage::ResultSet['csv'].new('~/my_birth_results')
|
98
|
+
config = births.link_with(births, result_set) do |config|
|
99
|
+
config.compare([:mother_first_name, :mother_last_name], [:mother_first_name, :mother_last_name], :equal)
|
100
|
+
end
|
101
|
+
runner = Linkage::Runner.new(config)
|
102
|
+
runner.execute
|
103
|
+
```
|
50
104
|
|
51
105
|
The above example would find birth records that have mothers with the same
|
52
106
|
name.
|
@@ -62,6 +116,6 @@ name.
|
|
62
116
|
|
63
117
|
## Copyright
|
64
118
|
|
65
|
-
Copyright (c) 2011 Vanderbilt University. See LICENSE.txt for
|
119
|
+
Copyright (c) 2011-2014 Vanderbilt University. See LICENSE.txt for
|
66
120
|
further details.
|
67
121
|
|
data/Rakefile
CHANGED
@@ -1,16 +1,4 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
require 'rubygems'
|
4
|
-
require 'bundler'
|
5
|
-
begin
|
6
|
-
Bundler.setup(:default, :development)
|
7
|
-
rescue Bundler::BundlerError => e
|
8
|
-
$stderr.puts e.message
|
9
|
-
$stderr.puts "Run `bundle install` to install missing gems"
|
10
|
-
exit e.status_code
|
11
|
-
end
|
12
|
-
require 'rake'
|
13
|
-
require "bundler/gem_tasks"
|
1
|
+
require 'bundler/gem_tasks'
|
14
2
|
|
15
3
|
require 'rake/testtask'
|
16
4
|
Rake::TestTask.new(:test) do |test|
|
@@ -18,9 +6,22 @@ Rake::TestTask.new(:test) do |test|
|
|
18
6
|
test.pattern = 'test/**/test_*.rb'
|
19
7
|
test.verbose = true
|
20
8
|
end
|
21
|
-
|
22
9
|
task :default => :test
|
23
10
|
|
11
|
+
namespace :test do
|
12
|
+
Rake::TestTask.new(:unit) do |test|
|
13
|
+
test.libs << 'lib' << 'test'
|
14
|
+
test.pattern = 'test/unit/**/test_*.rb'
|
15
|
+
test.verbose = true
|
16
|
+
end
|
17
|
+
|
18
|
+
Rake::TestTask.new(:integration) do |test|
|
19
|
+
test.libs << 'lib' << 'test'
|
20
|
+
test.pattern = 'test/integration/**/test_*.rb'
|
21
|
+
test.verbose = true
|
22
|
+
end
|
23
|
+
end
|
24
|
+
|
24
25
|
require 'yard'
|
25
26
|
YARD::Rake::YardocTask.new do |t|
|
26
27
|
t.files = ['lib/**/*.rb']
|
@@ -28,7 +29,7 @@ end
|
|
28
29
|
|
29
30
|
# Yoinked from https://github.com/rails/rails/blob/master/railties/lib/rails/tasks/annotations.rake
|
30
31
|
namespace :notes do
|
31
|
-
[
|
32
|
+
['OPTIMIZE', 'FIXME', 'TODO'].each do |annotation|
|
32
33
|
desc "Enumerate all #{annotation} annotations"
|
33
34
|
task annotation.downcase.intern do
|
34
35
|
SourceAnnotationExtractor.enumerate annotation
|
data/TODO
ADDED
data/lib/linkage/comparator.rb
CHANGED
@@ -1,172 +1,167 @@
|
|
1
1
|
module Linkage
|
2
|
-
#
|
2
|
+
# {Comparator} is the superclass for comparators in Linkage. Comparators are
|
3
|
+
# used to compare two records and compute scores based on how closely the two
|
4
|
+
# records relate.
|
5
|
+
#
|
6
|
+
# Each comparator should inherit from {Comparator} and declare itself as
|
7
|
+
# simple or advanced by overriding {#type} (the default is simple). Simple
|
8
|
+
# comparators must define the {#score} method that uses data from two records
|
9
|
+
# and returns a number (`Integer` or `Float`) between 0 and 1 (inclusive).
|
10
|
+
# Advanced comparators must define both {#score_dataset} and {#score_datasets}
|
11
|
+
# that use one or two {Dataset}s respectively to create scores.
|
12
|
+
#
|
13
|
+
# Each comparator can be registered via the {.register} function. This gives
|
14
|
+
# {Configuration} a way to find a comparator by name via
|
15
|
+
# {Configuration#method_missing}. For example, `config.compare(...)` creates a
|
16
|
+
# new {Comparators::Compare} object, since that comparator is registered under
|
17
|
+
# the name `"compare"`.
|
18
|
+
#
|
19
|
+
# See documentation for the methods below for more information.
|
20
|
+
#
|
21
|
+
# @abstract
|
3
22
|
class Comparator
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
if klass.parameters.length > 0
|
21
|
-
@comparators ||= {}
|
22
|
-
@comparators[name] = klass
|
23
|
-
else
|
24
|
-
raise ArgumentError, "class must have at least one parameter"
|
23
|
+
include Observable
|
24
|
+
|
25
|
+
class << self
|
26
|
+
# Register a new comparator. Subclasses must define at least {#score} for
|
27
|
+
# simple comparators, or {#score_dataset} and {#score_datasets} for
|
28
|
+
# advanced comparators. Otherwise, an `ArgumentError` will be raised when
|
29
|
+
# you try to call {.register}. The `name` parameter is used in
|
30
|
+
# {Configuration#method_missing} as an easy way for users to select
|
31
|
+
# comparators for their linkage.
|
32
|
+
#
|
33
|
+
# @param [String] name Comparator name used in {.klass_for}
|
34
|
+
# @param [Class] klass Comparator subclass
|
35
|
+
def register(name, klass)
|
36
|
+
methods = klass.instance_methods(false)
|
37
|
+
if !methods.include?(:score) && (!methods.include?(:score_datasets) || !methods.include?(:score_dataset))
|
38
|
+
raise ArgumentError, "class must define either #score or both #score_datasets and #score_dataset methods"
|
25
39
|
end
|
26
|
-
|
27
|
-
|
40
|
+
|
41
|
+
@comparators ||= {}
|
42
|
+
@comparators[name] = klass
|
28
43
|
end
|
29
44
|
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
rescue NotImplementedError
|
37
|
-
raise ArgumentError, "score_range class method must be defined"
|
45
|
+
# Return a registered Comparator subclass or `nil` if it doesn't exist.
|
46
|
+
#
|
47
|
+
# @param [String] name of registered comparator
|
48
|
+
# @return [Class, nil]
|
49
|
+
def klass_for(name)
|
50
|
+
@comparators ? @comparators[name] : nil
|
38
51
|
end
|
52
|
+
alias :[] :klass_for
|
39
53
|
end
|
40
54
|
|
41
|
-
|
42
|
-
|
55
|
+
# Return the type of this comparator. When {#type} returns `:simple`,
|
56
|
+
# {#score_and_notify} is called by {Runner#score_records} with each pair of
|
57
|
+
# records in order to create scores. When {#type} returns `:advanced`,
|
58
|
+
# either {#score_dataset} or {#score_datasets} is called by
|
59
|
+
# {Runner#score_records}. In advanced mode, it is left up to the
|
60
|
+
# {Comparator} subclass to determine which records to compare and how to
|
61
|
+
# compare them.
|
62
|
+
#
|
63
|
+
# @return [Symbol] either `:simple` or `:advanced`
|
64
|
+
def type
|
65
|
+
@type || :simple
|
43
66
|
end
|
44
67
|
|
45
|
-
#
|
46
|
-
#
|
47
|
-
|
68
|
+
# Override this to return the score of the linkage strength of two records.
|
69
|
+
# This method is used to score records by {Runner#score_records} when
|
70
|
+
# {#type} returns `:simple`.
|
71
|
+
#
|
72
|
+
# @abstract
|
73
|
+
# @param [Hash] record_1 data from first record
|
74
|
+
# @param [Hash] record_2 data from second record
|
75
|
+
# @return [Numeric] value between 0 and 1 (inclusive)
|
76
|
+
def score(record_1, record_2)
|
48
77
|
raise NotImplementedError
|
49
78
|
end
|
50
79
|
|
51
|
-
#
|
52
|
-
#
|
53
|
-
#
|
80
|
+
# Override this to score the linkage strength of records in two datasets.
|
81
|
+
# This method is used to score records by {Runner#score_records} when
|
82
|
+
# {#type} returns `:advanced` and {Configuration} is set up to link two
|
83
|
+
# datasets together.
|
54
84
|
#
|
55
|
-
#
|
56
|
-
#
|
57
|
-
#
|
58
|
-
#
|
85
|
+
# Since each {Dataset} delegates to a
|
86
|
+
# {http://sequel.jeremyevans.net/rdoc/classes/Sequel/Dataset.html `Sequel::Dataset`},
|
87
|
+
# you can use any
|
88
|
+
# {http://sequel.jeremyevans.net/rdoc/classes/Sequel/Dataset.html `Sequel::Dataset`}
|
89
|
+
# methods that you wish in order to select records to compare.
|
59
90
|
#
|
60
|
-
#
|
61
|
-
#
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
#
|
67
|
-
#
|
68
|
-
#
|
69
|
-
|
91
|
+
# To record scores, subclasses must call
|
92
|
+
# {http://ruby-doc.org/stdlib/libdoc/observer/rdoc/Observable.html `Observable#notify_observers`}
|
93
|
+
# like so:
|
94
|
+
#
|
95
|
+
# ```ruby
|
96
|
+
# changed
|
97
|
+
# notify_observers(self, record_1, record_2, score)
|
98
|
+
# ```
|
99
|
+
#
|
100
|
+
# This works by notifying any observers, typically {ScoreRecorder}, that a
|
101
|
+
# new score has been generated. {ScoreRecorder#update} then calls
|
102
|
+
# {ScoreSet#add_score} with comparator ID, the primary key of each record
|
103
|
+
# and the score.
|
104
|
+
#
|
105
|
+
# @abstract
|
106
|
+
# @param [Linkage::Dataset] dataset_1
|
107
|
+
# @param [Linkage::Dataset] dataset_2
|
108
|
+
# @see http://ruby-doc.org/stdlib/libdoc/observer/rdoc/Observable.html Observable
|
109
|
+
# @see http://sequel.jeremyevans.net/rdoc/classes/Sequel/Dataset.html Sequel::Dataset
|
110
|
+
def score_datasets(dataset_1, dataset_2)
|
70
111
|
raise NotImplementedError
|
71
112
|
end
|
72
113
|
|
73
|
-
|
74
|
-
|
75
|
-
#
|
76
|
-
#
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
#
|
86
|
-
#
|
87
|
-
#
|
88
|
-
|
114
|
+
# Override this to score the linkage strength of records in one dataset.
|
115
|
+
# This method is used to score records by {Runner#score_records} when
|
116
|
+
# {#type} returns `:advanced` and {Configuration} is set up to link a
|
117
|
+
# dataset to itself.
|
118
|
+
#
|
119
|
+
# Since a {Dataset} delegates to a
|
120
|
+
# {http://sequel.jeremyevans.net/rdoc/classes/Sequel/Dataset.html `Sequel::Dataset`},
|
121
|
+
# you can use any
|
122
|
+
# {http://sequel.jeremyevans.net/rdoc/classes/Sequel/Dataset.html `Sequel::Dataset`}
|
123
|
+
# methods that you wish in order to select records to compare.
|
124
|
+
#
|
125
|
+
# To record scores, subclasses must call
|
126
|
+
# {http://ruby-doc.org/stdlib/libdoc/observer/rdoc/Observable.html `Observable#notify_observers`}
|
127
|
+
# like so:
|
128
|
+
#
|
129
|
+
# ```ruby
|
130
|
+
# changed
|
131
|
+
# notify_observers(self, record_1, record_2, score)
|
132
|
+
# ```
|
133
|
+
#
|
134
|
+
# This works by notifying any observers, typically {ScoreRecorder}, that a
|
135
|
+
# new score has been generated. {ScoreRecorder#update} then calls
|
136
|
+
# {ScoreSet#add_score} with comparator ID, the primary key of each record
|
137
|
+
# and the score.
|
138
|
+
#
|
139
|
+
# @abstract
|
140
|
+
# @param [Linkage::Dataset] dataset
|
141
|
+
# @see http://ruby-doc.org/stdlib/libdoc/observer/rdoc/Observable.html Observable
|
142
|
+
# @see http://sequel.jeremyevans.net/rdoc/classes/Sequel/Dataset.html Sequel::Dataset
|
143
|
+
def score_dataset(dataset)
|
89
144
|
raise NotImplementedError
|
90
145
|
end
|
91
146
|
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
parameter_types = parameters[i]
|
106
|
-
if parameter_types.last.is_a?(Hash)
|
107
|
-
parameter_options = parameter_types[-1]
|
108
|
-
parameter_types = parameter_types[0..-2]
|
109
|
-
else
|
110
|
-
parameter_options = {}
|
111
|
-
end
|
112
|
-
|
113
|
-
if parameter_types[0] != :any && !parameter_types.include?(type)
|
114
|
-
raise TypeError, "expected type #{parameters[i].join(" or ")}, got #{type}"
|
115
|
-
end
|
116
|
-
|
117
|
-
if parameter_options.has_key?(:values) && arg.raw? && !parameter_options[:values].include?(arg.object)
|
118
|
-
raise ArgumentError, "argument #{i + 1} (#{arg.object.inspect}) was not one of the expected values: #{parameter_options[:values].inspect}"
|
119
|
-
end
|
120
|
-
|
121
|
-
if parameter_options.has_key?(:same_type_as)
|
122
|
-
arg_index = parameter_options[:same_type_as]
|
123
|
-
other_type = @args[arg_index].ruby_type[:type]
|
124
|
-
if type != other_type
|
125
|
-
raise TypeError, "argument #{i + 1} (#{type}) was expected to have the same type as argument #{arg_index + 1} (#{other_type})"
|
126
|
-
end
|
127
|
-
end
|
128
|
-
|
129
|
-
if parameter_options.has_key?(:static) &&
|
130
|
-
parameter_options[:static] != arg.static?
|
131
|
-
raise TypeError, "argument #{i + 1} was expected to #{arg.static? ? "not be" : "be"} static"
|
132
|
-
end
|
133
|
-
|
134
|
-
if !arg.static?
|
135
|
-
if first_side.nil?
|
136
|
-
first_side = arg.side
|
137
|
-
elsif arg.side != first_side && second_side.nil?
|
138
|
-
second_side = arg.side
|
139
|
-
end
|
140
|
-
|
141
|
-
valid_side = true
|
142
|
-
case parameter_options[:side]
|
143
|
-
when :first
|
144
|
-
if arg.side != first_side
|
145
|
-
valid_side = false
|
146
|
-
end
|
147
|
-
when :second
|
148
|
-
if second_side.nil? || arg.side != second_side
|
149
|
-
valid_side = false
|
150
|
-
end
|
151
|
-
end
|
152
|
-
|
153
|
-
if !valid_side
|
154
|
-
raise TypeError, "argument #{i + 1} was expected to have a different side value"
|
155
|
-
end
|
156
|
-
|
157
|
-
case arg.side
|
158
|
-
when :lhs
|
159
|
-
@lhs_args << arg
|
160
|
-
when :rhs
|
161
|
-
@rhs_args << arg
|
162
|
-
end
|
163
|
-
end
|
164
|
-
end
|
147
|
+
# Calls {#score} with two hashes of record data. The result is then used to
|
148
|
+
# notify any observers (typically {ScoreRecorder}).
|
149
|
+
#
|
150
|
+
# This method is used by {Runner#score_records} when {#type} returns
|
151
|
+
# `:simple`. Subclasses should override {#score} to implement the scoring
|
152
|
+
# algorithm.
|
153
|
+
#
|
154
|
+
# @param [Hash] record_1 data from first record
|
155
|
+
# @param [Hash] record_2 data from second record
|
156
|
+
def score_and_notify(record_1, record_2)
|
157
|
+
value = score(record_1, record_2)
|
158
|
+
changed
|
159
|
+
notify_observers(self, record_1, record_2, value)
|
165
160
|
end
|
166
161
|
end
|
167
162
|
end
|
168
163
|
|
169
164
|
path = File.expand_path(File.join(File.dirname(__FILE__), "comparators"))
|
170
|
-
require File.join(path, "binary")
|
171
165
|
require File.join(path, "compare")
|
172
166
|
require File.join(path, "within")
|
167
|
+
require File.join(path, "strcompare")
|