linkage 0.1.0.pre → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.yardopts +2 -0
- data/Guardfile +0 -1
- data/TODO +2 -0
- data/lib/linkage.rb +1 -0
- data/lib/linkage/comparator.rb +12 -2
- data/lib/linkage/comparators/strcompare.rb +68 -16
- data/lib/linkage/configuration.rb +112 -8
- data/lib/linkage/dataset.rb +124 -9
- data/lib/linkage/exceptions.rb +5 -0
- data/lib/linkage/field.rb +55 -18
- data/lib/linkage/field_set.rb +20 -0
- data/lib/linkage/helpers.rb +7 -0
- data/lib/linkage/helpers/csv.rb +28 -0
- data/lib/linkage/helpers/database.rb +47 -0
- data/lib/linkage/import_buffer.rb +3 -3
- data/lib/linkage/match_recorder.rb +4 -0
- data/lib/linkage/match_set.rb +51 -13
- data/lib/linkage/match_sets/csv.rb +36 -9
- data/lib/linkage/match_sets/database.rb +43 -2
- data/lib/linkage/matcher.rb +49 -3
- data/lib/linkage/result_set.rb +60 -22
- data/lib/linkage/result_sets/csv.rb +46 -28
- data/lib/linkage/result_sets/database.rb +44 -26
- data/lib/linkage/runner.rb +10 -0
- data/lib/linkage/score_recorder.rb +5 -0
- data/lib/linkage/score_set.rb +78 -20
- data/lib/linkage/score_sets/csv.rb +41 -15
- data/lib/linkage/score_sets/database.rb +43 -5
- data/lib/linkage/version.rb +1 -1
- data/linkage.gemspec +2 -0
- data/misc/uml/linkage.dia +0 -0
- data/misc/uml/linkage.png +0 -0
- data/misc/uml/linkage.svg +197 -0
- data/test/helper.rb +2 -11
- data/test/integration/test_database_result_set.rb +4 -2
- data/test/unit/comparators/test_strcompare.rb +29 -0
- data/test/unit/match_sets/test_csv.rb +44 -13
- data/test/unit/match_sets/test_database.rb +42 -1
- data/test/unit/result_sets/test_csv.rb +9 -69
- data/test/unit/result_sets/test_database.rb +20 -11
- data/test/unit/score_sets/test_csv.rb +68 -25
- data/test/unit/score_sets/test_database.rb +57 -1
- data/test/unit/test_comparator.rb +8 -0
- data/test/unit/test_configuration.rb +33 -6
- data/test/unit/test_dataset.rb +0 -7
- data/test/unit/test_matcher.rb +52 -3
- data/test/unit/test_result_set.rb +8 -14
- metadata +66 -32
data/lib/linkage/exceptions.rb
CHANGED
data/lib/linkage/field.rb
CHANGED
@@ -1,15 +1,14 @@
|
|
1
1
|
module Linkage
|
2
|
-
#
|
3
|
-
#
|
2
|
+
# {Field} describes a field in a dataset, otherwise known as a database table
|
3
|
+
# column.
|
4
4
|
class Field
|
5
|
-
#
|
6
|
-
# @return [Symbol] This object's name
|
5
|
+
# @return [Symbol] This field's name
|
7
6
|
attr_reader :name
|
8
7
|
|
9
|
-
# @return [
|
8
|
+
# @return [Hash] This field's schema information
|
10
9
|
attr_reader :schema
|
11
10
|
|
12
|
-
#
|
11
|
+
# Returns a new instance of Field.
|
13
12
|
#
|
14
13
|
# @param [Symbol] name The field's name
|
15
14
|
# @param [Hash] schema The field's schema information
|
@@ -19,23 +18,58 @@ module Linkage
|
|
19
18
|
end
|
20
19
|
|
21
20
|
# Convert the column schema information to a hash of column options, one of
|
22
|
-
# which
|
23
|
-
# (e.g. :size). If a database type is not recognized, return it as a String
|
24
|
-
# type.
|
21
|
+
# which is `:type`. The other options modify that type (e.g. `:size`).
|
25
22
|
#
|
26
|
-
#
|
27
|
-
#
|
23
|
+
# Here are some examples:
|
24
|
+
#
|
25
|
+
# | Database type | Ruby type | Other modifiers |
|
26
|
+
# |------------------|--------------------|-----------------------|
|
27
|
+
# | mediumint | Fixnum | |
|
28
|
+
# | smallint | Fixnum | |
|
29
|
+
# | int | Fixnum | |
|
30
|
+
# | int(10) unsigned | Bignum | |
|
31
|
+
# | tinyint | TrueClass, Integer | |
|
32
|
+
# | bigint | Bignum | |
|
33
|
+
# | real | Float | |
|
34
|
+
# | float | Float | |
|
35
|
+
# | double | Float | |
|
36
|
+
# | boolean | TrueClass | |
|
37
|
+
# | text | String | text: true |
|
38
|
+
# | date | Date | |
|
39
|
+
# | datetime | DateTime | |
|
40
|
+
# | timestamp | DateTime | |
|
41
|
+
# | time | Time | only_time: true |
|
42
|
+
# | varchar(255) | String | size: 255 |
|
43
|
+
# | char(10) | String | size: 10, fixed: true |
|
44
|
+
# | money | BigDecimal | size: [19, 2] |
|
45
|
+
# | decimal | BigDecimal | |
|
46
|
+
# | numeric | BigDecimal | |
|
47
|
+
# | number | BigDecimal | |
|
48
|
+
# | blob | File | |
|
49
|
+
# | year | Integer | |
|
50
|
+
# | identity | Integer | |
|
51
|
+
# | **other types** | String | |
|
52
|
+
#
|
53
|
+
# @note This method is copied from
|
54
|
+
# {http://sequel.jeremyevans.net/rdoc-plugins/classes/Sequel/SchemaDumper.html `Sequel::SchemaDumper`}.
|
55
|
+
# @return [Hash]
|
28
56
|
def ruby_type
|
29
57
|
unless @ruby_type
|
30
58
|
hsh =
|
31
|
-
case
|
32
|
-
when /\A(
|
33
|
-
|
34
|
-
|
35
|
-
|
59
|
+
case @schema[:db_type].downcase
|
60
|
+
when /\A(medium|small)?int(?:eger)?(?:\((\d+)\))?( unsigned)?\z/o
|
61
|
+
if !$1 && $2 && $2.to_i >= 10 && $3
|
62
|
+
# Unsigned integer type with 10 digits can potentially contain values which
|
63
|
+
# don't fit signed integer type, so use bigint type in target database.
|
64
|
+
{:type=>Bignum}
|
65
|
+
else
|
66
|
+
{:type=>Integer}
|
67
|
+
end
|
68
|
+
when /\Atinyint(?:\((\d+)\))?(?: unsigned)?\z/o
|
69
|
+
{:type =>schema[:type] == :boolean ? TrueClass : Integer}
|
36
70
|
when /\Abigint(?:\((?:\d+)\))?(?: unsigned)?\z/o
|
37
71
|
{:type=>Bignum}
|
38
|
-
when /\A(?:real|float|double(?: precision)?)\z/o
|
72
|
+
when /\A(?:real|float|double(?: precision)?|double\(\d+,\d+\)(?: unsigned)?)\z/o
|
39
73
|
{:type=>Float}
|
40
74
|
when 'boolean'
|
41
75
|
{:type=>TrueClass}
|
@@ -60,7 +94,7 @@ module Linkage
|
|
60
94
|
{:type=>BigDecimal, :size=>(s.empty? ? nil : s)}
|
61
95
|
when /\A(?:bytea|(?:tiny|medium|long)?blob|(?:var)?binary)(?:\((\d+)\))?\z/o
|
62
96
|
{:type=>File, :size=>($1.to_i if $1)}
|
63
|
-
when
|
97
|
+
when /\A(?:year|(?:int )?identity)\z/o
|
64
98
|
{:type=>Integer}
|
65
99
|
else
|
66
100
|
{:type=>String}
|
@@ -73,6 +107,9 @@ module Linkage
|
|
73
107
|
@ruby_type
|
74
108
|
end
|
75
109
|
|
110
|
+
# Returns whether or not this field is a primary key.
|
111
|
+
#
|
112
|
+
# @return [Boolean]
|
76
113
|
def primary_key?
|
77
114
|
schema && schema[:primary_key]
|
78
115
|
end
|
data/lib/linkage/field_set.rb
CHANGED
@@ -1,5 +1,11 @@
|
|
1
1
|
module Linkage
|
2
|
+
# {FieldSet} is a `Hash` of {Field} values. It is usually associated with a
|
3
|
+
# {Dataset}. It looks up keys in a case-insensitive manner and doesn't care if
|
4
|
+
# you use strings or symbols.
|
5
|
+
#
|
6
|
+
# @see Dataset#field_set
|
2
7
|
class FieldSet < Hash
|
8
|
+
# @return [Field] primary key of this field set.
|
3
9
|
attr_reader :primary_key
|
4
10
|
|
5
11
|
# Create a new FieldSet.
|
@@ -16,15 +22,29 @@ module Linkage
|
|
16
22
|
end
|
17
23
|
end
|
18
24
|
|
25
|
+
# Returns whether or not `key` is contained in the field set
|
26
|
+
# (case-insensitive).
|
27
|
+
#
|
28
|
+
# @param key [String, Symbol]
|
29
|
+
# @return [Boolean]
|
19
30
|
def has_key?(key)
|
20
31
|
!fetch_key(key).nil?
|
21
32
|
end
|
22
33
|
|
34
|
+
# Returns a key that matches the parameter in a case-insensitive manner.
|
35
|
+
#
|
36
|
+
# @param key [String, Symbol]
|
37
|
+
# @return [Symbol]
|
23
38
|
def fetch_key(key)
|
24
39
|
string_key = key.to_s
|
25
40
|
keys.detect { |k| k.to_s.casecmp(string_key) == 0 }
|
26
41
|
end
|
27
42
|
|
43
|
+
# Returns the value for `key`, where `key` is matched in a case-insensitive
|
44
|
+
# manner.
|
45
|
+
#
|
46
|
+
# @param key [String, Symbol]
|
47
|
+
# @return [Field]
|
28
48
|
def [](key)
|
29
49
|
k = fetch_key(key)
|
30
50
|
k ? super(k) : nil
|
@@ -0,0 +1,28 @@
|
|
1
|
+
module Linkage
|
2
|
+
module Helpers
|
3
|
+
module CSV
|
4
|
+
def csv_filename(options)
|
5
|
+
File.expand_path(options[:filename], options[:dir] || '.')
|
6
|
+
end
|
7
|
+
|
8
|
+
def open_csv_for_reading(options)
|
9
|
+
filename = csv_filename(options)
|
10
|
+
if !File.exist?(filename)
|
11
|
+
raise MissingError, "#{filename} does not exist"
|
12
|
+
end
|
13
|
+
::CSV.open(filename, 'rb', :headers => true)
|
14
|
+
end
|
15
|
+
|
16
|
+
def open_csv_for_writing(options)
|
17
|
+
filename = csv_filename(options)
|
18
|
+
if !options[:overwrite] && File.exist?(filename)
|
19
|
+
raise ExistsError, "#{filename} exists and not in overwrite mode"
|
20
|
+
end
|
21
|
+
if options[:dir]
|
22
|
+
FileUtils.mkdir_p(File.dirname(filename))
|
23
|
+
end
|
24
|
+
::CSV.open(filename, 'wb')
|
25
|
+
end
|
26
|
+
end
|
27
|
+
end
|
28
|
+
end
|
@@ -0,0 +1,47 @@
|
|
1
|
+
module Linkage
|
2
|
+
module Helpers
|
3
|
+
module Database
|
4
|
+
# Returns a `Sequel::Database`.
|
5
|
+
#
|
6
|
+
# @overload database_connection(connection_options = {}, default_options = {})
|
7
|
+
# @param connection_options [Hash] Options to establish a connection. Any
|
8
|
+
# options not explicitly listed below are passed directly to `Sequel.connect`.
|
9
|
+
# @option connection_options [String] :dir Parent directory to use for SQLite database
|
10
|
+
# @option connection_options [String] :filename SQLite database filename
|
11
|
+
# @overload database_connection(url)
|
12
|
+
# @param url [String] Sequel-style connection url
|
13
|
+
# @overload database_connection(database)
|
14
|
+
# @param database [Sequel::Database]
|
15
|
+
def database_connection(connection_options = {}, default_options = {})
|
16
|
+
sequel_options = nil
|
17
|
+
connection_options ||= default_options
|
18
|
+
|
19
|
+
case connection_options
|
20
|
+
when Hash
|
21
|
+
connection_options = default_options.merge(connection_options)
|
22
|
+
sequel_options = connection_options.reject do |key, value|
|
23
|
+
key == :dir || key == :filename
|
24
|
+
end
|
25
|
+
|
26
|
+
if sequel_options.empty?
|
27
|
+
filename = connection_options[:filename] || 'linkage.db'
|
28
|
+
if connection_options[:dir]
|
29
|
+
dir = File.expand_path(connection_options[:dir])
|
30
|
+
FileUtils.mkdir_p(dir)
|
31
|
+
filename = File.join(dir, filename)
|
32
|
+
end
|
33
|
+
sequel_options[:adapter] = :sqlite
|
34
|
+
sequel_options[:database] = filename
|
35
|
+
end
|
36
|
+
when String
|
37
|
+
sequel_options = connection_options
|
38
|
+
when Sequel::Database
|
39
|
+
return connection_options
|
40
|
+
else
|
41
|
+
raise ArgumentError, "Expected Hash or String, got #{connection_options.class}"
|
42
|
+
end
|
43
|
+
Sequel.connect(sequel_options)
|
44
|
+
end
|
45
|
+
end
|
46
|
+
end
|
47
|
+
end
|
@@ -1,8 +1,8 @@
|
|
1
1
|
module Linkage
|
2
2
|
class ImportBuffer
|
3
|
-
# @param [Sequel::Dataset]
|
4
|
-
# @param [Array<Symbol>]
|
5
|
-
# @param [Fixnum]
|
3
|
+
# @param dataset [Sequel::Dataset]
|
4
|
+
# @param headers [Array<Symbol>] List of fields you want to insert
|
5
|
+
# @param limit [Fixnum] Number of records to insert at a time
|
6
6
|
def initialize(dataset, headers, limit = 1000)
|
7
7
|
@dataset = dataset
|
8
8
|
@headers = headers
|
@@ -1,5 +1,9 @@
|
|
1
1
|
module Linkage
|
2
|
+
# {MatchRecorder} is responsible for observing {Matcher} for changes and
|
3
|
+
# saving matches to a {MatchSet} via {MatchSet#add_match}.
|
2
4
|
class MatchRecorder
|
5
|
+
# @param matcher [Matcher]
|
6
|
+
# @param match_set [MatchSet]
|
3
7
|
def initialize(matcher, match_set)
|
4
8
|
@matcher = matcher
|
5
9
|
@match_set = match_set
|
data/lib/linkage/match_set.rb
CHANGED
@@ -1,30 +1,68 @@
|
|
1
1
|
module Linkage
|
2
|
+
# A {MatchSet} is responsible for keeping track of matches. After the scoring
|
3
|
+
# process, a {Matcher} uses scores from a {ScoreSet} to calculate which record
|
4
|
+
# pairs match. Those pairs are then recorded by a {MatchRecorder} to a
|
5
|
+
# {MatchSet}.
|
6
|
+
#
|
7
|
+
# {MatchSet} is the superclass of implementations for different formats.
|
8
|
+
# Currently there are two formats for storing matches:
|
9
|
+
#
|
10
|
+
# * CSV ({MatchSets::CSV})
|
11
|
+
# * Database ({MatchSets::Database})
|
12
|
+
#
|
13
|
+
# See the documentation for the match set you're interested in for more
|
14
|
+
# information.
|
15
|
+
#
|
16
|
+
# If you want to implement a custom {MatchSet}, create a class that inherits
|
17
|
+
# {MatchSet} and defines at least {#add_match}. You can then register that
|
18
|
+
# class via {.register}.
|
19
|
+
#
|
20
|
+
# @abstract
|
2
21
|
class MatchSet
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
22
|
+
class << self
|
23
|
+
# Register a new match set. Subclasses must define at least {#add_match},
|
24
|
+
# otherwise an `ArgumentError` will be raised.
|
25
|
+
#
|
26
|
+
# @param [String] name Match set name used in {.klass_for}
|
27
|
+
# @param [Class] klass MatchSet subclass
|
28
|
+
def register(name, klass)
|
29
|
+
methods = klass.instance_methods(false)
|
30
|
+
unless methods.include?(:add_match)
|
31
|
+
raise ArgumentError, "class must define #add_match"
|
32
|
+
end
|
11
33
|
|
12
|
-
|
13
|
-
|
14
|
-
|
34
|
+
@match_sets ||= {}
|
35
|
+
@match_sets[name] = klass
|
36
|
+
end
|
15
37
|
|
16
|
-
|
17
|
-
|
38
|
+
# Return a registered MatchSet subclass or `nil` if it doesn't exist.
|
39
|
+
#
|
40
|
+
# @param [String] name Name of the registered match set
|
41
|
+
# @return [Class, nil]
|
42
|
+
def klass_for(name)
|
43
|
+
@match_sets ? @match_sets[name] : nil
|
44
|
+
end
|
45
|
+
alias :[] :klass_for
|
18
46
|
end
|
19
47
|
|
48
|
+
# This is called by {MatchRecorder#start}, before any matches are added via
|
49
|
+
# {#add_match}. Subclasses can redefine this to perform any setup needed for
|
50
|
+
# saving matches.
|
20
51
|
def open_for_writing
|
21
52
|
end
|
22
53
|
|
54
|
+
# Add a match to the MatchSet. Subclasses must redefine this.
|
55
|
+
#
|
56
|
+
# @param id_1 [Object] record id from first dataset
|
57
|
+
# @param id_2 [Object] record id from second dataset
|
58
|
+
# @param score [Fixnum, Float] match score
|
23
59
|
# @abstract
|
24
60
|
def add_match(id_1, id_2, score)
|
25
61
|
raise NotImplementedError
|
26
62
|
end
|
27
63
|
|
64
|
+
# This is called by {MatchRecorder#stop}, after all matches have been added.
|
65
|
+
# Subclasses can redefine this to perform any teardown needed.
|
28
66
|
def close
|
29
67
|
end
|
30
68
|
end
|
@@ -2,27 +2,54 @@ require 'csv'
|
|
2
2
|
|
3
3
|
module Linkage
|
4
4
|
module MatchSets
|
5
|
+
# {CSV MatchSets::CSV} is an implementation of {MatchSet} for saving
|
6
|
+
# matches in a CSV file.
|
7
|
+
#
|
8
|
+
# There are three options available:
|
9
|
+
#
|
10
|
+
# * `:filename` - which file to store matches in; can be an absolute path
|
11
|
+
# or relative path
|
12
|
+
# * `:dir` - which directory to put the file in; used if `:filename` is a
|
13
|
+
# relative path
|
14
|
+
# * `:overwrite` - indicate whether or not to overwrite an existing file
|
15
|
+
#
|
16
|
+
# By default, `:filename` is `'matches.csv'`, and the other options are
|
17
|
+
# blank. This means that it will write matches to the `'matches.csv'` file
|
18
|
+
# in the current working directory and will raise an error if the file
|
19
|
+
# already exists.
|
20
|
+
#
|
21
|
+
# If you specify `:dir`, that path will be created if it doesn't exist yet.
|
22
|
+
#
|
23
|
+
# The resulting file looks like this:
|
24
|
+
#
|
25
|
+
# id_1,id_2,score
|
26
|
+
# 123,456,0.75
|
27
|
+
# 124,457,1
|
28
|
+
#
|
29
|
+
# @see Helpers::CSV
|
5
30
|
class CSV < MatchSet
|
6
|
-
|
7
|
-
|
8
|
-
|
31
|
+
include Helpers::CSV
|
32
|
+
|
33
|
+
DEFAULT_OPTIONS = {
|
34
|
+
:filename => 'matches.csv'
|
35
|
+
}
|
36
|
+
|
37
|
+
def initialize(options = {})
|
38
|
+
@options = DEFAULT_OPTIONS.merge(options.reject { |k, v| v.nil? })
|
9
39
|
end
|
10
40
|
|
11
41
|
def open_for_writing
|
12
42
|
return if @mode == :write
|
13
43
|
|
14
|
-
|
15
|
-
raise ExistsError, "#{@filename} exists and not in overwrite mode"
|
16
|
-
end
|
17
|
-
|
18
|
-
@csv = ::CSV.open(@filename, 'wb')
|
44
|
+
@csv = open_csv_for_writing(@options)
|
19
45
|
@csv << %w{id_1 id_2 score}
|
20
46
|
@mode = :write
|
21
47
|
end
|
22
48
|
|
23
49
|
def add_match(id_1, id_2, score)
|
24
50
|
raise "not in write mode" if @mode != :write
|
25
|
-
|
51
|
+
|
52
|
+
if score.floor.equal?(score.ceil)
|
26
53
|
score = score.floor
|
27
54
|
end
|
28
55
|
@csv << [id_1, id_2, score]
|
@@ -1,8 +1,49 @@
|
|
1
1
|
module Linkage
|
2
2
|
module MatchSets
|
3
|
+
# {Database MatchSets::Database} is an implementation of {MatchSet} for saving
|
4
|
+
# matches in a relational database.
|
5
|
+
#
|
6
|
+
# Matches are saved in a database table with the following columns:
|
7
|
+
# - id_1 (string)
|
8
|
+
# - id_2 (string)
|
9
|
+
# - score (float)
|
10
|
+
#
|
11
|
+
# You can set up a database connection in a few different ways. By default, a
|
12
|
+
# SQLite database with the filename of `matches.db` will be created in the
|
13
|
+
# current working directory. If you want something different, you can either
|
14
|
+
# specify a Sequel-style URI, provide connection options for
|
15
|
+
# `Sequel.connect`, or you can just specify a `Sequel::Database` object to
|
16
|
+
# use.
|
17
|
+
#
|
18
|
+
# There are a couple of non-Sequel connection options:
|
19
|
+
# * `:filename` - specify filename to use for a SQLite database
|
20
|
+
# * `:dir` - specify the parent directory for a SQLite database
|
21
|
+
#
|
22
|
+
# In addition to connection options, there are behavioral options you can
|
23
|
+
# set. By default, the table name used is called `matches`, but you can change
|
24
|
+
# that by setting the `:table_name` option in the second options hash. If
|
25
|
+
# the table already exists, an {ExistsError} will be raised unless you set
|
26
|
+
# the `:overwrite` option to a truthy value in the second options hash.
|
27
|
+
#
|
28
|
+
# @see Helpers::Database
|
3
29
|
class Database < MatchSet
|
4
|
-
|
5
|
-
|
30
|
+
include Helpers::Database
|
31
|
+
|
32
|
+
DEFAULT_OPTIONS = {
|
33
|
+
:filename => 'matches.db'
|
34
|
+
}
|
35
|
+
|
36
|
+
# @overload initialize(connection_options = {}, options = {})
|
37
|
+
# @param connection_options [Hash]
|
38
|
+
# @param options [Hash]
|
39
|
+
# @overload initialize(uri, options = {})
|
40
|
+
# @param uri [String]
|
41
|
+
# @param options [Hash]
|
42
|
+
# @overload initialize(database, options = {})
|
43
|
+
# @param database [Sequel::Database]
|
44
|
+
# @param options [Hash]
|
45
|
+
def initialize(connection_options = {}, options = {})
|
46
|
+
@database = database_connection(connection_options, DEFAULT_OPTIONS)
|
6
47
|
@table_name = options[:table_name] || :matches
|
7
48
|
@overwrite = options[:overwrite]
|
8
49
|
end
|