linkage 0.1.0.pre → 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.yardopts +2 -0
- data/Guardfile +0 -1
- data/TODO +2 -0
- data/lib/linkage.rb +1 -0
- data/lib/linkage/comparator.rb +12 -2
- data/lib/linkage/comparators/strcompare.rb +68 -16
- data/lib/linkage/configuration.rb +112 -8
- data/lib/linkage/dataset.rb +124 -9
- data/lib/linkage/exceptions.rb +5 -0
- data/lib/linkage/field.rb +55 -18
- data/lib/linkage/field_set.rb +20 -0
- data/lib/linkage/helpers.rb +7 -0
- data/lib/linkage/helpers/csv.rb +28 -0
- data/lib/linkage/helpers/database.rb +47 -0
- data/lib/linkage/import_buffer.rb +3 -3
- data/lib/linkage/match_recorder.rb +4 -0
- data/lib/linkage/match_set.rb +51 -13
- data/lib/linkage/match_sets/csv.rb +36 -9
- data/lib/linkage/match_sets/database.rb +43 -2
- data/lib/linkage/matcher.rb +49 -3
- data/lib/linkage/result_set.rb +60 -22
- data/lib/linkage/result_sets/csv.rb +46 -28
- data/lib/linkage/result_sets/database.rb +44 -26
- data/lib/linkage/runner.rb +10 -0
- data/lib/linkage/score_recorder.rb +5 -0
- data/lib/linkage/score_set.rb +78 -20
- data/lib/linkage/score_sets/csv.rb +41 -15
- data/lib/linkage/score_sets/database.rb +43 -5
- data/lib/linkage/version.rb +1 -1
- data/linkage.gemspec +2 -0
- data/misc/uml/linkage.dia +0 -0
- data/misc/uml/linkage.png +0 -0
- data/misc/uml/linkage.svg +197 -0
- data/test/helper.rb +2 -11
- data/test/integration/test_database_result_set.rb +4 -2
- data/test/unit/comparators/test_strcompare.rb +29 -0
- data/test/unit/match_sets/test_csv.rb +44 -13
- data/test/unit/match_sets/test_database.rb +42 -1
- data/test/unit/result_sets/test_csv.rb +9 -69
- data/test/unit/result_sets/test_database.rb +20 -11
- data/test/unit/score_sets/test_csv.rb +68 -25
- data/test/unit/score_sets/test_database.rb +57 -1
- data/test/unit/test_comparator.rb +8 -0
- data/test/unit/test_configuration.rb +33 -6
- data/test/unit/test_dataset.rb +0 -7
- data/test/unit/test_matcher.rb +52 -3
- data/test/unit/test_result_set.rb +8 -14
- metadata +66 -32
data/lib/linkage/exceptions.rb
CHANGED
data/lib/linkage/field.rb
CHANGED
@@ -1,15 +1,14 @@
|
|
1
1
|
module Linkage
|
2
|
-
#
|
3
|
-
#
|
2
|
+
# {Field} describes a field in a dataset, otherwise known as a database table
|
3
|
+
# column.
|
4
4
|
class Field
|
5
|
-
#
|
6
|
-
# @return [Symbol] This object's name
|
5
|
+
# @return [Symbol] This field's name
|
7
6
|
attr_reader :name
|
8
7
|
|
9
|
-
# @return [
|
8
|
+
# @return [Hash] This field's schema information
|
10
9
|
attr_reader :schema
|
11
10
|
|
12
|
-
#
|
11
|
+
# Returns a new instance of Field.
|
13
12
|
#
|
14
13
|
# @param [Symbol] name The field's name
|
15
14
|
# @param [Hash] schema The field's schema information
|
@@ -19,23 +18,58 @@ module Linkage
|
|
19
18
|
end
|
20
19
|
|
21
20
|
# Convert the column schema information to a hash of column options, one of
|
22
|
-
# which
|
23
|
-
# (e.g. :size). If a database type is not recognized, return it as a String
|
24
|
-
# type.
|
21
|
+
# which is `:type`. The other options modify that type (e.g. `:size`).
|
25
22
|
#
|
26
|
-
#
|
27
|
-
#
|
23
|
+
# Here are some examples:
|
24
|
+
#
|
25
|
+
# | Database type | Ruby type | Other modifiers |
|
26
|
+
# |------------------|--------------------|-----------------------|
|
27
|
+
# | mediumint | Fixnum | |
|
28
|
+
# | smallint | Fixnum | |
|
29
|
+
# | int | Fixnum | |
|
30
|
+
# | int(10) unsigned | Bignum | |
|
31
|
+
# | tinyint | TrueClass, Integer | |
|
32
|
+
# | bigint | Bignum | |
|
33
|
+
# | real | Float | |
|
34
|
+
# | float | Float | |
|
35
|
+
# | double | Float | |
|
36
|
+
# | boolean | TrueClass | |
|
37
|
+
# | text | String | text: true |
|
38
|
+
# | date | Date | |
|
39
|
+
# | datetime | DateTime | |
|
40
|
+
# | timestamp | DateTime | |
|
41
|
+
# | time | Time | only_time: true |
|
42
|
+
# | varchar(255) | String | size: 255 |
|
43
|
+
# | char(10) | String | size: 10, fixed: true |
|
44
|
+
# | money | BigDecimal | size: [19, 2] |
|
45
|
+
# | decimal | BigDecimal | |
|
46
|
+
# | numeric | BigDecimal | |
|
47
|
+
# | number | BigDecimal | |
|
48
|
+
# | blob | File | |
|
49
|
+
# | year | Integer | |
|
50
|
+
# | identity | Integer | |
|
51
|
+
# | **other types** | String | |
|
52
|
+
#
|
53
|
+
# @note This method is copied from
|
54
|
+
# {http://sequel.jeremyevans.net/rdoc-plugins/classes/Sequel/SchemaDumper.html `Sequel::SchemaDumper`}.
|
55
|
+
# @return [Hash]
|
28
56
|
def ruby_type
|
29
57
|
unless @ruby_type
|
30
58
|
hsh =
|
31
|
-
case
|
32
|
-
when /\A(
|
33
|
-
|
34
|
-
|
35
|
-
|
59
|
+
case @schema[:db_type].downcase
|
60
|
+
when /\A(medium|small)?int(?:eger)?(?:\((\d+)\))?( unsigned)?\z/o
|
61
|
+
if !$1 && $2 && $2.to_i >= 10 && $3
|
62
|
+
# Unsigned integer type with 10 digits can potentially contain values which
|
63
|
+
# don't fit signed integer type, so use bigint type in target database.
|
64
|
+
{:type=>Bignum}
|
65
|
+
else
|
66
|
+
{:type=>Integer}
|
67
|
+
end
|
68
|
+
when /\Atinyint(?:\((\d+)\))?(?: unsigned)?\z/o
|
69
|
+
{:type =>schema[:type] == :boolean ? TrueClass : Integer}
|
36
70
|
when /\Abigint(?:\((?:\d+)\))?(?: unsigned)?\z/o
|
37
71
|
{:type=>Bignum}
|
38
|
-
when /\A(?:real|float|double(?: precision)?)\z/o
|
72
|
+
when /\A(?:real|float|double(?: precision)?|double\(\d+,\d+\)(?: unsigned)?)\z/o
|
39
73
|
{:type=>Float}
|
40
74
|
when 'boolean'
|
41
75
|
{:type=>TrueClass}
|
@@ -60,7 +94,7 @@ module Linkage
|
|
60
94
|
{:type=>BigDecimal, :size=>(s.empty? ? nil : s)}
|
61
95
|
when /\A(?:bytea|(?:tiny|medium|long)?blob|(?:var)?binary)(?:\((\d+)\))?\z/o
|
62
96
|
{:type=>File, :size=>($1.to_i if $1)}
|
63
|
-
when
|
97
|
+
when /\A(?:year|(?:int )?identity)\z/o
|
64
98
|
{:type=>Integer}
|
65
99
|
else
|
66
100
|
{:type=>String}
|
@@ -73,6 +107,9 @@ module Linkage
|
|
73
107
|
@ruby_type
|
74
108
|
end
|
75
109
|
|
110
|
+
# Returns whether or not this field is a primary key.
|
111
|
+
#
|
112
|
+
# @return [Boolean]
|
76
113
|
def primary_key?
|
77
114
|
schema && schema[:primary_key]
|
78
115
|
end
|
data/lib/linkage/field_set.rb
CHANGED
@@ -1,5 +1,11 @@
|
|
1
1
|
module Linkage
|
2
|
+
# {FieldSet} is a `Hash` of {Field} values. It is usually associated with a
|
3
|
+
# {Dataset}. It looks up keys in a case-insensitive manner and doesn't care if
|
4
|
+
# you use strings or symbols.
|
5
|
+
#
|
6
|
+
# @see Dataset#field_set
|
2
7
|
class FieldSet < Hash
|
8
|
+
# @return [Field] primary key of this field set.
|
3
9
|
attr_reader :primary_key
|
4
10
|
|
5
11
|
# Create a new FieldSet.
|
@@ -16,15 +22,29 @@ module Linkage
|
|
16
22
|
end
|
17
23
|
end
|
18
24
|
|
25
|
+
# Returns whether or not `key` is contained in the field set
|
26
|
+
# (case-insensitive).
|
27
|
+
#
|
28
|
+
# @param key [String, Symbol]
|
29
|
+
# @return [Boolean]
|
19
30
|
def has_key?(key)
|
20
31
|
!fetch_key(key).nil?
|
21
32
|
end
|
22
33
|
|
34
|
+
# Returns a key that matches the parameter in a case-insensitive manner.
|
35
|
+
#
|
36
|
+
# @param key [String, Symbol]
|
37
|
+
# @return [Symbol]
|
23
38
|
def fetch_key(key)
|
24
39
|
string_key = key.to_s
|
25
40
|
keys.detect { |k| k.to_s.casecmp(string_key) == 0 }
|
26
41
|
end
|
27
42
|
|
43
|
+
# Returns the value for `key`, where `key` is matched in a case-insensitive
|
44
|
+
# manner.
|
45
|
+
#
|
46
|
+
# @param key [String, Symbol]
|
47
|
+
# @return [Field]
|
28
48
|
def [](key)
|
29
49
|
k = fetch_key(key)
|
30
50
|
k ? super(k) : nil
|
@@ -0,0 +1,28 @@
|
|
1
|
+
module Linkage
|
2
|
+
module Helpers
|
3
|
+
module CSV
|
4
|
+
def csv_filename(options)
|
5
|
+
File.expand_path(options[:filename], options[:dir] || '.')
|
6
|
+
end
|
7
|
+
|
8
|
+
def open_csv_for_reading(options)
|
9
|
+
filename = csv_filename(options)
|
10
|
+
if !File.exist?(filename)
|
11
|
+
raise MissingError, "#{filename} does not exist"
|
12
|
+
end
|
13
|
+
::CSV.open(filename, 'rb', :headers => true)
|
14
|
+
end
|
15
|
+
|
16
|
+
def open_csv_for_writing(options)
|
17
|
+
filename = csv_filename(options)
|
18
|
+
if !options[:overwrite] && File.exist?(filename)
|
19
|
+
raise ExistsError, "#{filename} exists and not in overwrite mode"
|
20
|
+
end
|
21
|
+
if options[:dir]
|
22
|
+
FileUtils.mkdir_p(File.dirname(filename))
|
23
|
+
end
|
24
|
+
::CSV.open(filename, 'wb')
|
25
|
+
end
|
26
|
+
end
|
27
|
+
end
|
28
|
+
end
|
@@ -0,0 +1,47 @@
|
|
1
|
+
module Linkage
|
2
|
+
module Helpers
|
3
|
+
module Database
|
4
|
+
# Returns a `Sequel::Database`.
|
5
|
+
#
|
6
|
+
# @overload database_connection(connection_options = {}, default_options = {})
|
7
|
+
# @param connection_options [Hash] Options to establish a connection. Any
|
8
|
+
# options not explicitly listed below are passed directly to `Sequel.connect`.
|
9
|
+
# @option connection_options [String] :dir Parent directory to use for SQLite database
|
10
|
+
# @option connection_options [String] :filename SQLite database filename
|
11
|
+
# @overload database_connection(url)
|
12
|
+
# @param url [String] Sequel-style connection url
|
13
|
+
# @overload database_connection(database)
|
14
|
+
# @param database [Sequel::Database]
|
15
|
+
def database_connection(connection_options = {}, default_options = {})
|
16
|
+
sequel_options = nil
|
17
|
+
connection_options ||= default_options
|
18
|
+
|
19
|
+
case connection_options
|
20
|
+
when Hash
|
21
|
+
connection_options = default_options.merge(connection_options)
|
22
|
+
sequel_options = connection_options.reject do |key, value|
|
23
|
+
key == :dir || key == :filename
|
24
|
+
end
|
25
|
+
|
26
|
+
if sequel_options.empty?
|
27
|
+
filename = connection_options[:filename] || 'linkage.db'
|
28
|
+
if connection_options[:dir]
|
29
|
+
dir = File.expand_path(connection_options[:dir])
|
30
|
+
FileUtils.mkdir_p(dir)
|
31
|
+
filename = File.join(dir, filename)
|
32
|
+
end
|
33
|
+
sequel_options[:adapter] = :sqlite
|
34
|
+
sequel_options[:database] = filename
|
35
|
+
end
|
36
|
+
when String
|
37
|
+
sequel_options = connection_options
|
38
|
+
when Sequel::Database
|
39
|
+
return connection_options
|
40
|
+
else
|
41
|
+
raise ArgumentError, "Expected Hash or String, got #{connection_options.class}"
|
42
|
+
end
|
43
|
+
Sequel.connect(sequel_options)
|
44
|
+
end
|
45
|
+
end
|
46
|
+
end
|
47
|
+
end
|
@@ -1,8 +1,8 @@
|
|
1
1
|
module Linkage
|
2
2
|
class ImportBuffer
|
3
|
-
# @param [Sequel::Dataset]
|
4
|
-
# @param [Array<Symbol>]
|
5
|
-
# @param [Fixnum]
|
3
|
+
# @param dataset [Sequel::Dataset]
|
4
|
+
# @param headers [Array<Symbol>] List of fields you want to insert
|
5
|
+
# @param limit [Fixnum] Number of records to insert at a time
|
6
6
|
def initialize(dataset, headers, limit = 1000)
|
7
7
|
@dataset = dataset
|
8
8
|
@headers = headers
|
@@ -1,5 +1,9 @@
|
|
1
1
|
module Linkage
|
2
|
+
# {MatchRecorder} is responsible for observing {Matcher} for changes and
|
3
|
+
# saving matches to a {MatchSet} via {MatchSet#add_match}.
|
2
4
|
class MatchRecorder
|
5
|
+
# @param matcher [Matcher]
|
6
|
+
# @param match_set [MatchSet]
|
3
7
|
def initialize(matcher, match_set)
|
4
8
|
@matcher = matcher
|
5
9
|
@match_set = match_set
|
data/lib/linkage/match_set.rb
CHANGED
@@ -1,30 +1,68 @@
|
|
1
1
|
module Linkage
|
2
|
+
# A {MatchSet} is responsible for keeping track of matches. After the scoring
|
3
|
+
# process, a {Matcher} uses scores from a {ScoreSet} to calculate which record
|
4
|
+
# pairs match. Those pairs are then recorded by a {MatchRecorder} to a
|
5
|
+
# {MatchSet}.
|
6
|
+
#
|
7
|
+
# {MatchSet} is the superclass of implementations for different formats.
|
8
|
+
# Currently there are two formats for storing matches:
|
9
|
+
#
|
10
|
+
# * CSV ({MatchSets::CSV})
|
11
|
+
# * Database ({MatchSets::Database})
|
12
|
+
#
|
13
|
+
# See the documentation for the match set you're interested in for more
|
14
|
+
# information.
|
15
|
+
#
|
16
|
+
# If you want to implement a custom {MatchSet}, create a class that inherits
|
17
|
+
# {MatchSet} and defines at least {#add_match}. You can then register that
|
18
|
+
# class via {.register}.
|
19
|
+
#
|
20
|
+
# @abstract
|
2
21
|
class MatchSet
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
22
|
+
class << self
|
23
|
+
# Register a new match set. Subclasses must define at least {#add_match},
|
24
|
+
# otherwise an `ArgumentError` will be raised.
|
25
|
+
#
|
26
|
+
# @param [String] name Match set name used in {.klass_for}
|
27
|
+
# @param [Class] klass MatchSet subclass
|
28
|
+
def register(name, klass)
|
29
|
+
methods = klass.instance_methods(false)
|
30
|
+
unless methods.include?(:add_match)
|
31
|
+
raise ArgumentError, "class must define #add_match"
|
32
|
+
end
|
11
33
|
|
12
|
-
|
13
|
-
|
14
|
-
|
34
|
+
@match_sets ||= {}
|
35
|
+
@match_sets[name] = klass
|
36
|
+
end
|
15
37
|
|
16
|
-
|
17
|
-
|
38
|
+
# Return a registered MatchSet subclass or `nil` if it doesn't exist.
|
39
|
+
#
|
40
|
+
# @param [String] name Name of the registered match set
|
41
|
+
# @return [Class, nil]
|
42
|
+
def klass_for(name)
|
43
|
+
@match_sets ? @match_sets[name] : nil
|
44
|
+
end
|
45
|
+
alias :[] :klass_for
|
18
46
|
end
|
19
47
|
|
48
|
+
# This is called by {MatchRecorder#start}, before any matches are added via
|
49
|
+
# {#add_match}. Subclasses can redefine this to perform any setup needed for
|
50
|
+
# saving matches.
|
20
51
|
def open_for_writing
|
21
52
|
end
|
22
53
|
|
54
|
+
# Add a match to the MatchSet. Subclasses must redefine this.
|
55
|
+
#
|
56
|
+
# @param id_1 [Object] record id from first dataset
|
57
|
+
# @param id_2 [Object] record id from second dataset
|
58
|
+
# @param score [Fixnum, Float] match score
|
23
59
|
# @abstract
|
24
60
|
def add_match(id_1, id_2, score)
|
25
61
|
raise NotImplementedError
|
26
62
|
end
|
27
63
|
|
64
|
+
# This is called by {MatchRecorder#stop}, after all matches have been added.
|
65
|
+
# Subclasses can redefine this to perform any teardown needed.
|
28
66
|
def close
|
29
67
|
end
|
30
68
|
end
|
@@ -2,27 +2,54 @@ require 'csv'
|
|
2
2
|
|
3
3
|
module Linkage
|
4
4
|
module MatchSets
|
5
|
+
# {CSV MatchSets::CSV} is an implementation of {MatchSet} for saving
|
6
|
+
# matches in a CSV file.
|
7
|
+
#
|
8
|
+
# There are three options available:
|
9
|
+
#
|
10
|
+
# * `:filename` - which file to store matches in; can be an absolute path
|
11
|
+
# or relative path
|
12
|
+
# * `:dir` - which directory to put the file in; used if `:filename` is a
|
13
|
+
# relative path
|
14
|
+
# * `:overwrite` - indicate whether or not to overwrite an existing file
|
15
|
+
#
|
16
|
+
# By default, `:filename` is `'matches.csv'`, and the other options are
|
17
|
+
# blank. This means that it will write matches to the `'matches.csv'` file
|
18
|
+
# in the current working directory and will raise an error if the file
|
19
|
+
# already exists.
|
20
|
+
#
|
21
|
+
# If you specify `:dir`, that path will be created if it doesn't exist yet.
|
22
|
+
#
|
23
|
+
# The resulting file looks like this:
|
24
|
+
#
|
25
|
+
# id_1,id_2,score
|
26
|
+
# 123,456,0.75
|
27
|
+
# 124,457,1
|
28
|
+
#
|
29
|
+
# @see Helpers::CSV
|
5
30
|
class CSV < MatchSet
|
6
|
-
|
7
|
-
|
8
|
-
|
31
|
+
include Helpers::CSV
|
32
|
+
|
33
|
+
DEFAULT_OPTIONS = {
|
34
|
+
:filename => 'matches.csv'
|
35
|
+
}
|
36
|
+
|
37
|
+
def initialize(options = {})
|
38
|
+
@options = DEFAULT_OPTIONS.merge(options.reject { |k, v| v.nil? })
|
9
39
|
end
|
10
40
|
|
11
41
|
def open_for_writing
|
12
42
|
return if @mode == :write
|
13
43
|
|
14
|
-
|
15
|
-
raise ExistsError, "#{@filename} exists and not in overwrite mode"
|
16
|
-
end
|
17
|
-
|
18
|
-
@csv = ::CSV.open(@filename, 'wb')
|
44
|
+
@csv = open_csv_for_writing(@options)
|
19
45
|
@csv << %w{id_1 id_2 score}
|
20
46
|
@mode = :write
|
21
47
|
end
|
22
48
|
|
23
49
|
def add_match(id_1, id_2, score)
|
24
50
|
raise "not in write mode" if @mode != :write
|
25
|
-
|
51
|
+
|
52
|
+
if score.floor.equal?(score.ceil)
|
26
53
|
score = score.floor
|
27
54
|
end
|
28
55
|
@csv << [id_1, id_2, score]
|
@@ -1,8 +1,49 @@
|
|
1
1
|
module Linkage
|
2
2
|
module MatchSets
|
3
|
+
# {Database MatchSets::Database} is an implementation of {MatchSet} for saving
|
4
|
+
# matches in a relational database.
|
5
|
+
#
|
6
|
+
# Matches are saved in a database table with the following columns:
|
7
|
+
# - id_1 (string)
|
8
|
+
# - id_2 (string)
|
9
|
+
# - score (float)
|
10
|
+
#
|
11
|
+
# You can set up a database connection in a few different ways. By default, a
|
12
|
+
# SQLite database with the filename of `matches.db` will be created in the
|
13
|
+
# current working directory. If you want something different, you can either
|
14
|
+
# specify a Sequel-style URI, provide connection options for
|
15
|
+
# `Sequel.connect`, or you can just specify a `Sequel::Database` object to
|
16
|
+
# use.
|
17
|
+
#
|
18
|
+
# There are a couple of non-Sequel connection options:
|
19
|
+
# * `:filename` - specify filename to use for a SQLite database
|
20
|
+
# * `:dir` - specify the parent directory for a SQLite database
|
21
|
+
#
|
22
|
+
# In addition to connection options, there are behavioral options you can
|
23
|
+
# set. By default, the table name used is called `matches`, but you can change
|
24
|
+
# that by setting the `:table_name` option in the second options hash. If
|
25
|
+
# the table already exists, an {ExistsError} will be raised unless you set
|
26
|
+
# the `:overwrite` option to a truthy value in the second options hash.
|
27
|
+
#
|
28
|
+
# @see Helpers::Database
|
3
29
|
class Database < MatchSet
|
4
|
-
|
5
|
-
|
30
|
+
include Helpers::Database
|
31
|
+
|
32
|
+
DEFAULT_OPTIONS = {
|
33
|
+
:filename => 'matches.db'
|
34
|
+
}
|
35
|
+
|
36
|
+
# @overload initialize(connection_options = {}, options = {})
|
37
|
+
# @param connection_options [Hash]
|
38
|
+
# @param options [Hash]
|
39
|
+
# @overload initialize(uri, options = {})
|
40
|
+
# @param uri [String]
|
41
|
+
# @param options [Hash]
|
42
|
+
# @overload initialize(database, options = {})
|
43
|
+
# @param database [Sequel::Database]
|
44
|
+
# @param options [Hash]
|
45
|
+
def initialize(connection_options = {}, options = {})
|
46
|
+
@database = database_connection(connection_options, DEFAULT_OPTIONS)
|
6
47
|
@table_name = options[:table_name] || :matches
|
7
48
|
@overwrite = options[:overwrite]
|
8
49
|
end
|