linkage 0.0.8 → 0.1.0.pre
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +1 -0
- data/.yardopts +1 -0
- data/Gemfile +1 -19
- data/Gemfile-java +3 -0
- data/README.markdown +88 -34
- data/Rakefile +16 -15
- data/TODO +4 -0
- data/lib/linkage/comparator.rb +139 -144
- data/lib/linkage/comparators/compare.rb +236 -29
- data/lib/linkage/comparators/strcompare.rb +85 -0
- data/lib/linkage/comparators/within.rb +24 -20
- data/lib/linkage/configuration.rb +44 -466
- data/lib/linkage/dataset.rb +28 -127
- data/lib/linkage/exceptions.rb +5 -0
- data/lib/linkage/field.rb +6 -37
- data/lib/linkage/field_set.rb +3 -3
- data/lib/linkage/match_recorder.rb +22 -0
- data/lib/linkage/match_set.rb +34 -0
- data/lib/linkage/match_sets/csv.rb +39 -0
- data/lib/linkage/match_sets/database.rb +45 -0
- data/lib/linkage/matcher.rb +30 -0
- data/lib/linkage/result_set.rb +25 -110
- data/lib/linkage/result_sets/csv.rb +54 -0
- data/lib/linkage/result_sets/database.rb +42 -0
- data/lib/linkage/runner.rb +57 -16
- data/lib/linkage/score_recorder.rb +30 -0
- data/lib/linkage/score_set.rb +49 -0
- data/lib/linkage/score_sets/csv.rb +64 -0
- data/lib/linkage/score_sets/database.rb +77 -0
- data/lib/linkage/version.rb +1 -1
- data/lib/linkage.rb +14 -17
- data/linkage.gemspec +13 -1
- data/linkage.gemspec-java +32 -0
- data/test/helper.rb +30 -23
- data/test/integration/test_cross_linkage.rb +46 -25
- data/test/integration/test_database_result_set.rb +55 -0
- data/test/integration/test_dual_linkage.rb +19 -94
- data/test/integration/test_self_linkage.rb +100 -203
- data/test/integration/test_within_comparator.rb +24 -77
- data/test/unit/comparators/test_compare.rb +254 -50
- data/test/unit/comparators/test_strcompare.rb +45 -0
- data/test/unit/comparators/test_within.rb +14 -26
- data/test/unit/match_sets/test_csv.rb +78 -0
- data/test/unit/match_sets/test_database.rb +63 -0
- data/test/unit/result_sets/test_csv.rb +111 -0
- data/test/unit/result_sets/test_database.rb +68 -0
- data/test/unit/score_sets/test_csv.rb +151 -0
- data/test/unit/score_sets/test_database.rb +149 -0
- data/test/unit/test_comparator.rb +46 -83
- data/test/unit/test_comparators.rb +4 -0
- data/test/unit/test_configuration.rb +99 -145
- data/test/unit/test_dataset.rb +52 -73
- data/test/unit/test_field.rb +4 -55
- data/test/unit/test_field_set.rb +6 -6
- data/test/unit/test_match_recorder.rb +23 -0
- data/test/unit/test_match_set.rb +23 -0
- data/test/unit/test_match_sets.rb +4 -0
- data/test/unit/test_matcher.rb +44 -0
- data/test/unit/test_result_set.rb +24 -223
- data/test/unit/test_result_sets.rb +4 -0
- data/test/unit/test_runner.rb +122 -17
- data/test/unit/test_runners.rb +4 -0
- data/test/unit/test_score_recorder.rb +25 -0
- data/test/unit/test_score_set.rb +37 -0
- data/test/unit/test_score_sets.rb +4 -0
- metadata +183 -90
- data/Gemfile.lock +0 -92
- data/lib/linkage/comparators/binary.rb +0 -12
- data/lib/linkage/data.rb +0 -175
- data/lib/linkage/decollation.rb +0 -93
- data/lib/linkage/expectation.rb +0 -21
- data/lib/linkage/expectations/exhaustive.rb +0 -63
- data/lib/linkage/expectations/simple.rb +0 -168
- data/lib/linkage/function.rb +0 -148
- data/lib/linkage/functions/binary.rb +0 -30
- data/lib/linkage/functions/cast.rb +0 -54
- data/lib/linkage/functions/length.rb +0 -29
- data/lib/linkage/functions/strftime.rb +0 -33
- data/lib/linkage/functions/trim.rb +0 -30
- data/lib/linkage/group.rb +0 -55
- data/lib/linkage/meta_object.rb +0 -139
- data/lib/linkage/runner/single_threaded.rb +0 -187
- data/lib/linkage/utils.rb +0 -164
- data/lib/linkage/warnings.rb +0 -5
- data/test/integration/test_collation.rb +0 -45
- data/test/integration/test_configuration.rb +0 -268
- data/test/integration/test_dataset.rb +0 -116
- data/test/integration/test_functions.rb +0 -88
- data/test/integration/test_result_set.rb +0 -85
- data/test/integration/test_scoring.rb +0 -84
- data/test/unit/expectations/test_exhaustive.rb +0 -111
- data/test/unit/expectations/test_simple.rb +0 -303
- data/test/unit/functions/test_binary.rb +0 -54
- data/test/unit/functions/test_cast.rb +0 -98
- data/test/unit/functions/test_length.rb +0 -52
- data/test/unit/functions/test_strftime.rb +0 -60
- data/test/unit/functions/test_trim.rb +0 -43
- data/test/unit/runner/test_single_threaded.rb +0 -12
- data/test/unit/test_data.rb +0 -445
- data/test/unit/test_decollation.rb +0 -201
- data/test/unit/test_function.rb +0 -233
- data/test/unit/test_group.rb +0 -38
- data/test/unit/test_meta_object.rb +0 -208
- data/test/unit/test_utils.rb +0 -341
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: ad6a9ee6a6add94a342e3d02d49d8a4bfeb9122b
|
4
|
+
data.tar.gz: a989e8e810602dfcd4da596fcc1154a922dedccd
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 0d78da4904100679826cf76ae07f7daf984c12e2a762aceb0ad2d1387c5a5778403a782ff104ce48c1fd3ec5588c728fe2b8ea7a79a896da153a8d91c7e4cb14
|
7
|
+
data.tar.gz: 6797e5598f47413022d18c6732cbf0613efaccba886cda56eb390e3344d131ccfd977b5992c5a2c4557cfbc4f93bf747ddffe849770be5860e793f27fe3e2286
|
data/.gitignore
CHANGED
data/.yardopts
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
-m markdown
|
data/Gemfile
CHANGED
@@ -1,21 +1,3 @@
|
|
1
|
-
source '
|
1
|
+
source 'https://rubygems.org'
|
2
2
|
|
3
3
|
gemspec
|
4
|
-
|
5
|
-
group :development do
|
6
|
-
gem 'bundler'
|
7
|
-
gem 'test-unit'
|
8
|
-
gem 'mocha'
|
9
|
-
gem 'yard'
|
10
|
-
gem 'rake'
|
11
|
-
gem 'versionomy'
|
12
|
-
gem 'sqlite3', :platforms => :ruby
|
13
|
-
gem 'mysql2', :platforms => :ruby
|
14
|
-
gem 'jdbc-sqlite3', :platforms => :jruby
|
15
|
-
gem 'jdbc-mysql', :platforms => :jruby
|
16
|
-
gem 'rdiscount'
|
17
|
-
gem 'guard-test'
|
18
|
-
gem 'guard-yard', :platforms => :ruby_19
|
19
|
-
gem 'rb-inotify', '~> 0.8.8'
|
20
|
-
gem 'debugger'
|
21
|
-
end
|
data/Gemfile-java
ADDED
data/README.markdown
CHANGED
@@ -1,52 +1,106 @@
|
|
1
|
-
#
|
1
|
+
# Linkage
|
2
2
|
|
3
|
-
Linkage is a library for record linkage between one or two database tables.
|
3
|
+
Linkage is a Ruby library for record linkage between one or two database tables.
|
4
|
+
|
5
|
+
## What is record linkage?
|
6
|
+
|
7
|
+
In an ideal world, records that reference the same entity can be easily
|
8
|
+
identified. Unfortunately, this isn't always the case. Sometimes there are no
|
9
|
+
good identifiers in the datasets that you're interested in (ID, social security
|
10
|
+
number, etc). In such cases, it is necessary to use other means to determine
|
11
|
+
which records refer to which entity, and this process is known as **record
|
12
|
+
linkage**.
|
13
|
+
|
14
|
+
## Prerequisites
|
15
|
+
|
16
|
+
In order to use Linkage, the records you want to link must be in a database.
|
17
|
+
Linkage has the ability to perform record linkage across different kinds of
|
18
|
+
databases, so it's okay if your records are not all in the same place.
|
19
|
+
|
20
|
+
Since Linkage uses [Sequel](http://sequel.jeremyevans.net/) to communicate with
|
21
|
+
databases, any database that Sequel supports will work. See [Connecting to a
|
22
|
+
database](http://sequel.jeremyevans.net/documentation.html) on the Sequel
|
23
|
+
website for more information about what databases are supported.
|
4
24
|
|
5
25
|
## Usage
|
6
26
|
|
7
|
-
|
8
|
-
|
9
|
-
and
|
27
|
+
To perform a record linkage, Linkage needs information about the following:
|
28
|
+
datasets, result set, and comparators. A dataset refers to a table in a
|
29
|
+
database. A result set is a place to put score and match information that
|
30
|
+
Linkage generates. Comparators describe how records are compared.
|
31
|
+
|
32
|
+
A dataset is created via the `Linkage::Dataset` class, along with a connection URI
|
33
|
+
and a table name:
|
10
34
|
|
11
|
-
|
35
|
+
```ruby
|
36
|
+
ds = Linkage::Dataset.new('mysql://example.com/database_name', 'table_name')
|
37
|
+
```
|
12
38
|
|
13
|
-
|
39
|
+
Result sets have different options depending on what storage medium you're
|
40
|
+
using (CSV or database). For CSVs, you could use:
|
14
41
|
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
lhs[:first_name].must == rhs[:parent_first_name]
|
19
|
-
lhs[:last_name].must == rhs[:parent_last_name]
|
20
|
-
lhs[:last_name].must_not == "Smith" # exclude parents with the last
|
21
|
-
# name "Smith"
|
42
|
+
```ruby
|
43
|
+
result_set = Linkage::ResultSet['csv'].new('~/my_results')
|
44
|
+
```
|
22
45
|
|
23
|
-
|
24
|
-
|
46
|
+
In this case, scores and matches will be saved in CSV files in the `my_results`
|
47
|
+
directory in your home folder.
|
25
48
|
|
26
|
-
|
27
|
-
the
|
49
|
+
To describe a linkage, you can use the `Dataset#link_with` method. This creates
|
50
|
+
a linkage configuration that you can use to describe how you want the records in
|
51
|
+
each dataset to be compared. For example:
|
52
|
+
|
53
|
+
```ruby
|
54
|
+
demo = Linkage::Dataset.new('postgres://example.com/foo', 'demographics')
|
55
|
+
visits = Linkage::Dataset.new('mysql://some-other-host.net/bar', 'visits')
|
56
|
+
result_set = Linkage::ResultSet['csv'].new('~/my_results')
|
57
|
+
config = demo.link_with(visits, result_set) do |config|
|
58
|
+
config.compare([:first_name, :last_name], [:first_name, :last_name], :equal)
|
59
|
+
end
|
60
|
+
```
|
61
|
+
|
62
|
+
This linkage would match records from a demographics table to records in a table
|
63
|
+
with information about doctor visits by using first name and last name.
|
64
|
+
|
65
|
+
The `compare` method creates a `Compare` comparator. This is the simplest
|
66
|
+
comparator in Linkage, and it just compares fields with the operator you specify
|
67
|
+
(`:equal`, `:less_than`, `:greater_than`, etc). When a comparator compares
|
68
|
+
two records, it gives the pair of records a score between 0 and 1. In the case
|
69
|
+
of the example above, records that have the same first name and last name get a
|
70
|
+
score of 1, and records that don't get a score of 0 (or sometimes, they aren't
|
71
|
+
scored and are assumed to have a score of 0).
|
72
|
+
|
73
|
+
Other comparators are `Strcompare` for approximate string matching and
|
74
|
+
`Within` for matching numbers within a range.
|
28
75
|
|
29
76
|
To run a linkage, use a Runner with the resulting configuration from
|
30
77
|
`Dataset#link_with`:
|
31
78
|
|
32
|
-
|
33
|
-
|
79
|
+
```ruby
|
80
|
+
runner = Linkage::Runner.new(config)
|
81
|
+
runner.execute
|
82
|
+
```
|
83
|
+
|
84
|
+
After running a linkage, there will be a list of matches in a CSV file or
|
85
|
+
database, depending on how you configured your result set.
|
86
|
+
|
87
|
+
By default, Linkage determines whether two records match by comparing the
|
88
|
+
average score to a threshold value (which is 0.5 by default). You can configure
|
89
|
+
the threshold value like so: `config.threshold = 0.9`.
|
34
90
|
|
35
|
-
|
36
|
-
(via the `save_results_in` method). It stores its results in two database
|
37
|
-
tables: `groups` and `groups_records`. The `groups` table contains all of the
|
38
|
-
unique combinations of values in your datasets, and `groups_records` maps
|
39
|
-
records to groups.
|
91
|
+
## Other examples
|
40
92
|
|
41
|
-
|
93
|
+
Linking a dataset to itself:
|
42
94
|
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
95
|
+
```ruby
|
96
|
+
births = Linkage::Dataset.new('postgres://example.com/hospital_data', 'births')
|
97
|
+
result_set = Linkage::ResultSet['csv'].new('~/my_birth_results')
|
98
|
+
config = births.link_with(births, result_set) do |config|
|
99
|
+
config.compare([:mother_first_name, :mother_last_name], [:mother_first_name, :mother_last_name], :equal)
|
100
|
+
end
|
101
|
+
runner = Linkage::Runner.new(config)
|
102
|
+
runner.execute
|
103
|
+
```
|
50
104
|
|
51
105
|
The above example would find birth records that have mothers with the same
|
52
106
|
name.
|
@@ -62,6 +116,6 @@ name.
|
|
62
116
|
|
63
117
|
## Copyright
|
64
118
|
|
65
|
-
Copyright (c) 2011 Vanderbilt University. See LICENSE.txt for
|
119
|
+
Copyright (c) 2011-2014 Vanderbilt University. See LICENSE.txt for
|
66
120
|
further details.
|
67
121
|
|
data/Rakefile
CHANGED
@@ -1,16 +1,4 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
require 'rubygems'
|
4
|
-
require 'bundler'
|
5
|
-
begin
|
6
|
-
Bundler.setup(:default, :development)
|
7
|
-
rescue Bundler::BundlerError => e
|
8
|
-
$stderr.puts e.message
|
9
|
-
$stderr.puts "Run `bundle install` to install missing gems"
|
10
|
-
exit e.status_code
|
11
|
-
end
|
12
|
-
require 'rake'
|
13
|
-
require "bundler/gem_tasks"
|
1
|
+
require 'bundler/gem_tasks'
|
14
2
|
|
15
3
|
require 'rake/testtask'
|
16
4
|
Rake::TestTask.new(:test) do |test|
|
@@ -18,9 +6,22 @@ Rake::TestTask.new(:test) do |test|
|
|
18
6
|
test.pattern = 'test/**/test_*.rb'
|
19
7
|
test.verbose = true
|
20
8
|
end
|
21
|
-
|
22
9
|
task :default => :test
|
23
10
|
|
11
|
+
namespace :test do
|
12
|
+
Rake::TestTask.new(:unit) do |test|
|
13
|
+
test.libs << 'lib' << 'test'
|
14
|
+
test.pattern = 'test/unit/**/test_*.rb'
|
15
|
+
test.verbose = true
|
16
|
+
end
|
17
|
+
|
18
|
+
Rake::TestTask.new(:integration) do |test|
|
19
|
+
test.libs << 'lib' << 'test'
|
20
|
+
test.pattern = 'test/integration/**/test_*.rb'
|
21
|
+
test.verbose = true
|
22
|
+
end
|
23
|
+
end
|
24
|
+
|
24
25
|
require 'yard'
|
25
26
|
YARD::Rake::YardocTask.new do |t|
|
26
27
|
t.files = ['lib/**/*.rb']
|
@@ -28,7 +29,7 @@ end
|
|
28
29
|
|
29
30
|
# Yoinked from https://github.com/rails/rails/blob/master/railties/lib/rails/tasks/annotations.rake
|
30
31
|
namespace :notes do
|
31
|
-
[
|
32
|
+
['OPTIMIZE', 'FIXME', 'TODO'].each do |annotation|
|
32
33
|
desc "Enumerate all #{annotation} annotations"
|
33
34
|
task annotation.downcase.intern do
|
34
35
|
SourceAnnotationExtractor.enumerate annotation
|
data/TODO
ADDED
data/lib/linkage/comparator.rb
CHANGED
@@ -1,172 +1,167 @@
|
|
1
1
|
module Linkage
|
2
|
-
#
|
2
|
+
# {Comparator} is the superclass for comparators in Linkage. Comparators are
|
3
|
+
# used to compare two records and compute scores based on how closely the two
|
4
|
+
# records relate.
|
5
|
+
#
|
6
|
+
# Each comparator should inherit from {Comparator} and declare itself as
|
7
|
+
# simple or advanced by overriding {#type} (the default is simple). Simple
|
8
|
+
# comparators must define the {#score} method that uses data from two records
|
9
|
+
# and returns a number (`Integer` or `Float`) between 0 and 1 (inclusive).
|
10
|
+
# Advanced comparators must define both {#score_dataset} and {#score_datasets}
|
11
|
+
# that use one or two {Dataset}s respectively to create scores.
|
12
|
+
#
|
13
|
+
# Each comparator can be registered via the {.register} function. This gives
|
14
|
+
# {Configuration} a way to find a comparator by name via
|
15
|
+
# {Configuration#method_missing}. For example, `config.compare(...)` creates a
|
16
|
+
# new {Comparators::Compare} object, since that comparator is registered under
|
17
|
+
# the name `"compare"`.
|
18
|
+
#
|
19
|
+
# See documentation for the methods below for more information.
|
20
|
+
#
|
21
|
+
# @abstract
|
3
22
|
class Comparator
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
if klass.parameters.length > 0
|
21
|
-
@comparators ||= {}
|
22
|
-
@comparators[name] = klass
|
23
|
-
else
|
24
|
-
raise ArgumentError, "class must have at least one parameter"
|
23
|
+
include Observable
|
24
|
+
|
25
|
+
class << self
|
26
|
+
# Register a new comparator. Subclasses must define at least {#score} for
|
27
|
+
# simple comparators, or {#score_dataset} and {#score_datasets} for
|
28
|
+
# advanced comparators. Otherwise, an `ArgumentError` will be raised when
|
29
|
+
# you try to call {.register}. The `name` parameter is used in
|
30
|
+
# {Configuration#method_missing} as an easy way for users to select
|
31
|
+
# comparators for their linkage.
|
32
|
+
#
|
33
|
+
# @param [String] name Comparator name used in {.klass_for}
|
34
|
+
# @param [Class] klass Comparator subclass
|
35
|
+
def register(name, klass)
|
36
|
+
methods = klass.instance_methods(false)
|
37
|
+
if !methods.include?(:score) && (!methods.include?(:score_datasets) || !methods.include?(:score_dataset))
|
38
|
+
raise ArgumentError, "class must define either #score or both #score_datasets and #score_dataset methods"
|
25
39
|
end
|
26
|
-
|
27
|
-
|
40
|
+
|
41
|
+
@comparators ||= {}
|
42
|
+
@comparators[name] = klass
|
28
43
|
end
|
29
44
|
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
rescue NotImplementedError
|
37
|
-
raise ArgumentError, "score_range class method must be defined"
|
45
|
+
# Return a registered Comparator subclass or `nil` if it doesn't exist.
|
46
|
+
#
|
47
|
+
# @param [String] name of registered comparator
|
48
|
+
# @return [Class, nil]
|
49
|
+
def klass_for(name)
|
50
|
+
@comparators ? @comparators[name] : nil
|
38
51
|
end
|
52
|
+
alias :[] :klass_for
|
39
53
|
end
|
40
54
|
|
41
|
-
|
42
|
-
|
55
|
+
# Return the type of this comparator. When {#type} returns `:simple`,
|
56
|
+
# {#score_and_notify} is called by {Runner#score_records} with each pair of
|
57
|
+
# records in order to create scores. When {#type} returns `:advanced`,
|
58
|
+
# either {#score_dataset} or {#score_datasets} is called by
|
59
|
+
# {Runner#score_records}. In advanced mode, it is left up to the
|
60
|
+
# {Comparator} subclass to determine which records to compare and how to
|
61
|
+
# compare them.
|
62
|
+
#
|
63
|
+
# @return [Symbol] either `:simple` or `:advanced`
|
64
|
+
def type
|
65
|
+
@type || :simple
|
43
66
|
end
|
44
67
|
|
45
|
-
#
|
46
|
-
#
|
47
|
-
|
68
|
+
# Override this to return the score of the linkage strength of two records.
|
69
|
+
# This method is used to score records by {Runner#score_records} when
|
70
|
+
# {#type} returns `:simple`.
|
71
|
+
#
|
72
|
+
# @abstract
|
73
|
+
# @param [Hash] record_1 data from first record
|
74
|
+
# @param [Hash] record_2 data from second record
|
75
|
+
# @return [Numeric] value between 0 and 1 (inclusive)
|
76
|
+
def score(record_1, record_2)
|
48
77
|
raise NotImplementedError
|
49
78
|
end
|
50
79
|
|
51
|
-
#
|
52
|
-
#
|
53
|
-
#
|
80
|
+
# Override this to score the linkage strength of records in two datasets.
|
81
|
+
# This method is used to score records by {Runner#score_records} when
|
82
|
+
# {#type} returns `:advanced` and {Configuration} is set up to link two
|
83
|
+
# datasets together.
|
54
84
|
#
|
55
|
-
#
|
56
|
-
#
|
57
|
-
#
|
58
|
-
#
|
85
|
+
# Since each {Dataset} delegates to a
|
86
|
+
# {http://sequel.jeremyevans.net/rdoc/classes/Sequel/Dataset.html `Sequel::Dataset`},
|
87
|
+
# you can use any
|
88
|
+
# {http://sequel.jeremyevans.net/rdoc/classes/Sequel/Dataset.html `Sequel::Dataset`}
|
89
|
+
# methods that you wish in order to select records to compare.
|
59
90
|
#
|
60
|
-
#
|
61
|
-
#
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
#
|
67
|
-
#
|
68
|
-
#
|
69
|
-
|
91
|
+
# To record scores, subclasses must call
|
92
|
+
# {http://ruby-doc.org/stdlib/libdoc/observer/rdoc/Observable.html `Observable#notify_observers`}
|
93
|
+
# like so:
|
94
|
+
#
|
95
|
+
# ```ruby
|
96
|
+
# changed
|
97
|
+
# notify_observers(self, record_1, record_2, score)
|
98
|
+
# ```
|
99
|
+
#
|
100
|
+
# This works by notifying any observers, typically {ScoreRecorder}, that a
|
101
|
+
# new score has been generated. {ScoreRecorder#update} then calls
|
102
|
+
# {ScoreSet#add_score} with comparator ID, the primary key of each record
|
103
|
+
# and the score.
|
104
|
+
#
|
105
|
+
# @abstract
|
106
|
+
# @param [Linkage::Dataset] dataset_1
|
107
|
+
# @param [Linkage::Dataset] dataset_2
|
108
|
+
# @see http://ruby-doc.org/stdlib/libdoc/observer/rdoc/Observable.html Observable
|
109
|
+
# @see http://sequel.jeremyevans.net/rdoc/classes/Sequel/Dataset.html Sequel::Dataset
|
110
|
+
def score_datasets(dataset_1, dataset_2)
|
70
111
|
raise NotImplementedError
|
71
112
|
end
|
72
113
|
|
73
|
-
|
74
|
-
|
75
|
-
#
|
76
|
-
#
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
#
|
86
|
-
#
|
87
|
-
#
|
88
|
-
|
114
|
+
# Override this to score the linkage strength of records in one dataset.
|
115
|
+
# This method is used to score records by {Runner#score_records} when
|
116
|
+
# {#type} returns `:advanced` and {Configuration} is set up to link a
|
117
|
+
# dataset to itself.
|
118
|
+
#
|
119
|
+
# Since a {Dataset} delegates to a
|
120
|
+
# {http://sequel.jeremyevans.net/rdoc/classes/Sequel/Dataset.html `Sequel::Dataset`},
|
121
|
+
# you can use any
|
122
|
+
# {http://sequel.jeremyevans.net/rdoc/classes/Sequel/Dataset.html `Sequel::Dataset`}
|
123
|
+
# methods that you wish in order to select records to compare.
|
124
|
+
#
|
125
|
+
# To record scores, subclasses must call
|
126
|
+
# {http://ruby-doc.org/stdlib/libdoc/observer/rdoc/Observable.html `Observable#notify_observers`}
|
127
|
+
# like so:
|
128
|
+
#
|
129
|
+
# ```ruby
|
130
|
+
# changed
|
131
|
+
# notify_observers(self, record_1, record_2, score)
|
132
|
+
# ```
|
133
|
+
#
|
134
|
+
# This works by notifying any observers, typically {ScoreRecorder}, that a
|
135
|
+
# new score has been generated. {ScoreRecorder#update} then calls
|
136
|
+
# {ScoreSet#add_score} with comparator ID, the primary key of each record
|
137
|
+
# and the score.
|
138
|
+
#
|
139
|
+
# @abstract
|
140
|
+
# @param [Linkage::Dataset] dataset
|
141
|
+
# @see http://ruby-doc.org/stdlib/libdoc/observer/rdoc/Observable.html Observable
|
142
|
+
# @see http://sequel.jeremyevans.net/rdoc/classes/Sequel/Dataset.html Sequel::Dataset
|
143
|
+
def score_dataset(dataset)
|
89
144
|
raise NotImplementedError
|
90
145
|
end
|
91
146
|
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
parameter_types = parameters[i]
|
106
|
-
if parameter_types.last.is_a?(Hash)
|
107
|
-
parameter_options = parameter_types[-1]
|
108
|
-
parameter_types = parameter_types[0..-2]
|
109
|
-
else
|
110
|
-
parameter_options = {}
|
111
|
-
end
|
112
|
-
|
113
|
-
if parameter_types[0] != :any && !parameter_types.include?(type)
|
114
|
-
raise TypeError, "expected type #{parameters[i].join(" or ")}, got #{type}"
|
115
|
-
end
|
116
|
-
|
117
|
-
if parameter_options.has_key?(:values) && arg.raw? && !parameter_options[:values].include?(arg.object)
|
118
|
-
raise ArgumentError, "argument #{i + 1} (#{arg.object.inspect}) was not one of the expected values: #{parameter_options[:values].inspect}"
|
119
|
-
end
|
120
|
-
|
121
|
-
if parameter_options.has_key?(:same_type_as)
|
122
|
-
arg_index = parameter_options[:same_type_as]
|
123
|
-
other_type = @args[arg_index].ruby_type[:type]
|
124
|
-
if type != other_type
|
125
|
-
raise TypeError, "argument #{i + 1} (#{type}) was expected to have the same type as argument #{arg_index + 1} (#{other_type})"
|
126
|
-
end
|
127
|
-
end
|
128
|
-
|
129
|
-
if parameter_options.has_key?(:static) &&
|
130
|
-
parameter_options[:static] != arg.static?
|
131
|
-
raise TypeError, "argument #{i + 1} was expected to #{arg.static? ? "not be" : "be"} static"
|
132
|
-
end
|
133
|
-
|
134
|
-
if !arg.static?
|
135
|
-
if first_side.nil?
|
136
|
-
first_side = arg.side
|
137
|
-
elsif arg.side != first_side && second_side.nil?
|
138
|
-
second_side = arg.side
|
139
|
-
end
|
140
|
-
|
141
|
-
valid_side = true
|
142
|
-
case parameter_options[:side]
|
143
|
-
when :first
|
144
|
-
if arg.side != first_side
|
145
|
-
valid_side = false
|
146
|
-
end
|
147
|
-
when :second
|
148
|
-
if second_side.nil? || arg.side != second_side
|
149
|
-
valid_side = false
|
150
|
-
end
|
151
|
-
end
|
152
|
-
|
153
|
-
if !valid_side
|
154
|
-
raise TypeError, "argument #{i + 1} was expected to have a different side value"
|
155
|
-
end
|
156
|
-
|
157
|
-
case arg.side
|
158
|
-
when :lhs
|
159
|
-
@lhs_args << arg
|
160
|
-
when :rhs
|
161
|
-
@rhs_args << arg
|
162
|
-
end
|
163
|
-
end
|
164
|
-
end
|
147
|
+
# Calls {#score} with two hashes of record data. The result is then used to
|
148
|
+
# notify any observers (typically {ScoreRecorder}).
|
149
|
+
#
|
150
|
+
# This method is used by {Runner#score_records} when {#type} returns
|
151
|
+
# `:simple`. Subclasses should override {#score} to implement the scoring
|
152
|
+
# algorithm.
|
153
|
+
#
|
154
|
+
# @param [Hash] record_1 data from first record
|
155
|
+
# @param [Hash] record_2 data from second record
|
156
|
+
def score_and_notify(record_1, record_2)
|
157
|
+
value = score(record_1, record_2)
|
158
|
+
changed
|
159
|
+
notify_observers(self, record_1, record_2, value)
|
165
160
|
end
|
166
161
|
end
|
167
162
|
end
|
168
163
|
|
169
164
|
path = File.expand_path(File.join(File.dirname(__FILE__), "comparators"))
|
170
|
-
require File.join(path, "binary")
|
171
165
|
require File.join(path, "compare")
|
172
166
|
require File.join(path, "within")
|
167
|
+
require File.join(path, "strcompare")
|