linkage 0.1.0.pre → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49) hide show
  1. checksums.yaml +4 -4
  2. data/.yardopts +2 -0
  3. data/Guardfile +0 -1
  4. data/TODO +2 -0
  5. data/lib/linkage.rb +1 -0
  6. data/lib/linkage/comparator.rb +12 -2
  7. data/lib/linkage/comparators/strcompare.rb +68 -16
  8. data/lib/linkage/configuration.rb +112 -8
  9. data/lib/linkage/dataset.rb +124 -9
  10. data/lib/linkage/exceptions.rb +5 -0
  11. data/lib/linkage/field.rb +55 -18
  12. data/lib/linkage/field_set.rb +20 -0
  13. data/lib/linkage/helpers.rb +7 -0
  14. data/lib/linkage/helpers/csv.rb +28 -0
  15. data/lib/linkage/helpers/database.rb +47 -0
  16. data/lib/linkage/import_buffer.rb +3 -3
  17. data/lib/linkage/match_recorder.rb +4 -0
  18. data/lib/linkage/match_set.rb +51 -13
  19. data/lib/linkage/match_sets/csv.rb +36 -9
  20. data/lib/linkage/match_sets/database.rb +43 -2
  21. data/lib/linkage/matcher.rb +49 -3
  22. data/lib/linkage/result_set.rb +60 -22
  23. data/lib/linkage/result_sets/csv.rb +46 -28
  24. data/lib/linkage/result_sets/database.rb +44 -26
  25. data/lib/linkage/runner.rb +10 -0
  26. data/lib/linkage/score_recorder.rb +5 -0
  27. data/lib/linkage/score_set.rb +78 -20
  28. data/lib/linkage/score_sets/csv.rb +41 -15
  29. data/lib/linkage/score_sets/database.rb +43 -5
  30. data/lib/linkage/version.rb +1 -1
  31. data/linkage.gemspec +2 -0
  32. data/misc/uml/linkage.dia +0 -0
  33. data/misc/uml/linkage.png +0 -0
  34. data/misc/uml/linkage.svg +197 -0
  35. data/test/helper.rb +2 -11
  36. data/test/integration/test_database_result_set.rb +4 -2
  37. data/test/unit/comparators/test_strcompare.rb +29 -0
  38. data/test/unit/match_sets/test_csv.rb +44 -13
  39. data/test/unit/match_sets/test_database.rb +42 -1
  40. data/test/unit/result_sets/test_csv.rb +9 -69
  41. data/test/unit/result_sets/test_database.rb +20 -11
  42. data/test/unit/score_sets/test_csv.rb +68 -25
  43. data/test/unit/score_sets/test_database.rb +57 -1
  44. data/test/unit/test_comparator.rb +8 -0
  45. data/test/unit/test_configuration.rb +33 -6
  46. data/test/unit/test_dataset.rb +0 -7
  47. data/test/unit/test_matcher.rb +52 -3
  48. data/test/unit/test_result_set.rb +8 -14
  49. metadata +66 -32
@@ -1,5 +1,10 @@
1
1
  module Linkage
2
+ # Generic error.
2
3
  class Error < Exception; end
4
+
5
+ # Error raised when a file would be overwritten.
3
6
  class ExistsError < Error; end
7
+
8
+ # Error raised when trying to read a file that doesn't exist.
4
9
  class MissingError < Error; end
5
10
  end
@@ -1,15 +1,14 @@
1
1
  module Linkage
2
- # This class is for holding information about a particular field in a
3
- # dataset.
2
+ # {Field} describes a field in a dataset, otherwise known as a database table
3
+ # column.
4
4
  class Field
5
- # @!attribute [r] name
6
- # @return [Symbol] This object's name
5
+ # @return [Symbol] This field's name
7
6
  attr_reader :name
8
7
 
9
- # @return [Symbol] This field's schema information
8
+ # @return [Array] This field's schema information
10
9
  attr_reader :schema
11
10
 
12
- # Create a new instance of Field.
11
+ # Returns a new instance of Field.
13
12
  #
14
13
  # @param [Symbol] name The field's name
15
14
  # @param [Hash] schema The field's schema information
@@ -19,23 +18,58 @@ module Linkage
19
18
  end
20
19
 
21
20
  # Convert the column schema information to a hash of column options, one of
22
- # which must be :type. The other options added should modify that type
23
- # (e.g. :size). If a database type is not recognized, return it as a String
24
- # type.
21
+ # which is `:type`. The other options modify that type (e.g. `:size`).
25
22
  #
26
- # @note This method comes more or less straight from Sequel
27
- # (lib/sequel/extensions/schema_dumper.rb).
23
+ # Here are some examples:
24
+ #
25
+ # | Database type | Ruby type | Other modifiers |
26
+ # |------------------|--------------------|-----------------------|
27
+ # | mediumint | Fixnum | |
28
+ # | smallint | Fixnum | |
29
+ # | int | Fixnum | |
30
+ # | int(10) unsigned | Bignum | |
31
+ # | tinyint | TrueClass, Integer | |
32
+ # | bigint | Bignum | |
33
+ # | real | Float | |
34
+ # | float | Float | |
35
+ # | double | Float | |
36
+ # | boolean | TrueClass | |
37
+ # | text | String | text: true |
38
+ # | date | Date | |
39
+ # | datetime | DateTime | |
40
+ # | timestamp | DateTime | |
41
+ # | time | Time | only_time: true |
42
+ # | varchar(255) | String | size: 255 |
43
+ # | char(10) | String | size: 10, fixed: true |
44
+ # | money | BigDecimal | size: [19, 2] |
45
+ # | decimal | BigDecimal | |
46
+ # | numeric | BigDecimal | |
47
+ # | number | BigDecimal | |
48
+ # | blob | File | |
49
+ # | year | Integer | |
50
+ # | identity | Integer | |
51
+ # | **other types** | String | |
52
+ #
53
+ # @note This method is copied from
54
+ # {http://sequel.jeremyevans.net/rdoc-plugins/classes/Sequel/SchemaDumper.html `Sequel::SchemaDumper`}.
55
+ # @return [Hash]
28
56
  def ruby_type
29
57
  unless @ruby_type
30
58
  hsh =
31
- case t = @schema[:db_type].downcase
32
- when /\A(?:medium|small)?int(?:eger)?(?:\((?:\d+)\))?(?: unsigned)?\z/o
33
- {:type=>Integer}
34
- when /\Atinyint(?:\((\d+)\))?\z/o
35
- {:type =>@schema[:type] == :boolean ? TrueClass : Integer}
59
+ case @schema[:db_type].downcase
60
+ when /\A(medium|small)?int(?:eger)?(?:\((\d+)\))?( unsigned)?\z/o
61
+ if !$1 && $2 && $2.to_i >= 10 && $3
62
+ # Unsigned integer type with 10 digits can potentially contain values which
63
+ # don't fit signed integer type, so use bigint type in target database.
64
+ {:type=>Bignum}
65
+ else
66
+ {:type=>Integer}
67
+ end
68
+ when /\Atinyint(?:\((\d+)\))?(?: unsigned)?\z/o
69
+ {:type =>schema[:type] == :boolean ? TrueClass : Integer}
36
70
  when /\Abigint(?:\((?:\d+)\))?(?: unsigned)?\z/o
37
71
  {:type=>Bignum}
38
- when /\A(?:real|float|double(?: precision)?)\z/o
72
+ when /\A(?:real|float|double(?: precision)?|double\(\d+,\d+\)(?: unsigned)?)\z/o
39
73
  {:type=>Float}
40
74
  when 'boolean'
41
75
  {:type=>TrueClass}
@@ -60,7 +94,7 @@ module Linkage
60
94
  {:type=>BigDecimal, :size=>(s.empty? ? nil : s)}
61
95
  when /\A(?:bytea|(?:tiny|medium|long)?blob|(?:var)?binary)(?:\((\d+)\))?\z/o
62
96
  {:type=>File, :size=>($1.to_i if $1)}
63
- when 'year'
97
+ when /\A(?:year|(?:int )?identity)\z/o
64
98
  {:type=>Integer}
65
99
  else
66
100
  {:type=>String}
@@ -73,6 +107,9 @@ module Linkage
73
107
  @ruby_type
74
108
  end
75
109
 
110
+ # Returns whether or not this field is a primary key.
111
+ #
112
+ # @return [Boolean]
76
113
  def primary_key?
77
114
  schema && schema[:primary_key]
78
115
  end
@@ -1,5 +1,11 @@
1
1
  module Linkage
2
+ # {FieldSet} is a `Hash` of {Field} values. It is usually associated with a
3
+ # {Dataset}. It looks up keys in a case-insensitive manner and doesn't care if
4
+ # you use strings or symbols.
5
+ #
6
+ # @see Dataset#field_set
2
7
  class FieldSet < Hash
8
+ # @return [Field] primary key of this field set.
3
9
  attr_reader :primary_key
4
10
 
5
11
  # Create a new FieldSet.
@@ -16,15 +22,29 @@ module Linkage
16
22
  end
17
23
  end
18
24
 
25
+ # Returns whether or not `key` is contained in the field set
26
+ # (case-insensitive).
27
+ #
28
+ # @param key [String, Symbol]
29
+ # @return [Boolean]
19
30
  def has_key?(key)
20
31
  !fetch_key(key).nil?
21
32
  end
22
33
 
34
+ # Returns a key that matches the parameter in a case-insensitive manner.
35
+ #
36
+ # @param key [String, Symbol]
37
+ # @return [Symbol]
23
38
  def fetch_key(key)
24
39
  string_key = key.to_s
25
40
  keys.detect { |k| k.to_s.casecmp(string_key) == 0 }
26
41
  end
27
42
 
43
+ # Returns the value for `key`, where `key` is matched in a case-insensitive
44
+ # manner.
45
+ #
46
+ # @param key [String, Symbol]
47
+ # @return [Field]
28
48
  def [](key)
29
49
  k = fetch_key(key)
30
50
  k ? super(k) : nil
@@ -0,0 +1,7 @@
1
+ module Linkage
2
+ module Helpers
3
+ end
4
+ end
5
+
6
+ require 'linkage/helpers/csv'
7
+ require 'linkage/helpers/database'
@@ -0,0 +1,28 @@
1
+ module Linkage
2
+ module Helpers
3
+ module CSV
4
+ def csv_filename(options)
5
+ File.expand_path(options[:filename], options[:dir] || '.')
6
+ end
7
+
8
+ def open_csv_for_reading(options)
9
+ filename = csv_filename(options)
10
+ if !File.exist?(filename)
11
+ raise MissingError, "#{filename} does not exist"
12
+ end
13
+ ::CSV.open(filename, 'rb', :headers => true)
14
+ end
15
+
16
+ def open_csv_for_writing(options)
17
+ filename = csv_filename(options)
18
+ if !options[:overwrite] && File.exist?(filename)
19
+ raise ExistsError, "#{filename} exists and not in overwrite mode"
20
+ end
21
+ if options[:dir]
22
+ FileUtils.mkdir_p(File.dirname(filename))
23
+ end
24
+ ::CSV.open(filename, 'wb')
25
+ end
26
+ end
27
+ end
28
+ end
@@ -0,0 +1,47 @@
1
+ module Linkage
2
+ module Helpers
3
+ module Database
4
+ # Returns a `Sequel::Database`.
5
+ #
6
+ # @overload database_connection(connection_options = {}, default_options = {})
7
+ # @param connection_options [Hash] Options to establish a connection. Any
8
+ # options not explicitly listed below are passed directly to `Sequel.connect`.
9
+ # @option connection_options [String] :dir Parent directory to use for SQLite database
10
+ # @option connection_options [String] :filename SQLite database filename
11
+ # @overload database_connection(url)
12
+ # @param url [String] Sequel-style connection url
13
+ # @overload database_connection(database)
14
+ # @param database [Sequel::Database]
15
+ def database_connection(connection_options = {}, default_options = {})
16
+ sequel_options = nil
17
+ connection_options ||= default_options
18
+
19
+ case connection_options
20
+ when Hash
21
+ connection_options = default_options.merge(connection_options)
22
+ sequel_options = connection_options.reject do |key, value|
23
+ key == :dir || key == :filename
24
+ end
25
+
26
+ if sequel_options.empty?
27
+ filename = connection_options[:filename] || 'linkage.db'
28
+ if connection_options[:dir]
29
+ dir = File.expand_path(connection_options[:dir])
30
+ FileUtils.mkdir_p(dir)
31
+ filename = File.join(dir, filename)
32
+ end
33
+ sequel_options[:adapter] = :sqlite
34
+ sequel_options[:database] = filename
35
+ end
36
+ when String
37
+ sequel_options = connection_options
38
+ when Sequel::Database
39
+ return connection_options
40
+ else
41
+ raise ArgumentError, "Expected Hash or String, got #{connection_options.class}"
42
+ end
43
+ Sequel.connect(sequel_options)
44
+ end
45
+ end
46
+ end
47
+ end
@@ -1,8 +1,8 @@
1
1
  module Linkage
2
2
  class ImportBuffer
3
- # @param [Sequel::Dataset] dataset
4
- # @param [Array<Symbol>] headers List of fields you want to insert
5
- # @param [Fixnum] limit Number of records to insert at a time
3
+ # @param dataset [Sequel::Dataset]
4
+ # @param headers [Array<Symbol>] List of fields you want to insert
5
+ # @param limit [Fixnum] Number of records to insert at a time
6
6
  def initialize(dataset, headers, limit = 1000)
7
7
  @dataset = dataset
8
8
  @headers = headers
@@ -1,5 +1,9 @@
1
1
  module Linkage
2
+ # {MatchRecorder} is responsible for observing {Matcher} for changes and
3
+ # saving matches to a {MatchSet} via {MatchSet#add_match}.
2
4
  class MatchRecorder
5
+ # @param matcher [Matcher]
6
+ # @param match_set [MatchSet]
3
7
  def initialize(matcher, match_set)
4
8
  @matcher = matcher
5
9
  @match_set = match_set
@@ -1,30 +1,68 @@
1
1
  module Linkage
2
+ # A {MatchSet} is responsible for keeping track of matches. After the scoring
3
+ # process, a {Matcher} uses scores from a {ScoreSet} to calculate which record
4
+ # pairs match. Those pairs are then recorded by a {MatchRecorder} to a
5
+ # {MatchSet}.
6
+ #
7
+ # {MatchSet} is the superclass of implementations for different formats.
8
+ # Currently there are two formats for storing matches:
9
+ #
10
+ # * CSV ({MatchSets::CSV})
11
+ # * Database ({MatchSets::Database})
12
+ #
13
+ # See the documentation for the match set you're interested in for more
14
+ # information.
15
+ #
16
+ # If you want to implement a custom {MatchSet}, create a class that inherits
17
+ # {MatchSet} and defines at least {#add_match}. You can then register that
18
+ # class via {.register}.
19
+ #
20
+ # @abstract
2
21
  class MatchSet
3
- # Register a match set.
4
- #
5
- # @param [Class] klass
6
- def self.register(name, klass)
7
- methods = klass.instance_methods(false)
8
- unless methods.include?(:add_match)
9
- raise ArgumentError, "class must define #add_match"
10
- end
22
+ class << self
23
+ # Register a new match set. Subclasses must define at least {#add_match},
24
+ # otherwise an `ArgumentError` will be raised.
25
+ #
26
+ # @param [String] name Match set name used in {.klass_for}
27
+ # @param [Class] klass MatchSet subclass
28
+ def register(name, klass)
29
+ methods = klass.instance_methods(false)
30
+ unless methods.include?(:add_match)
31
+ raise ArgumentError, "class must define #add_match"
32
+ end
11
33
 
12
- @match_sets ||= {}
13
- @match_sets[name] = klass
14
- end
34
+ @match_sets ||= {}
35
+ @match_sets[name] = klass
36
+ end
15
37
 
16
- def self.[](name)
17
- @match_sets ? @match_sets[name] : nil
38
+ # Return a registered MatchSet subclass or `nil` if it doesn't exist.
39
+ #
40
+ # @param [String] name of registered match set
41
+ # @return [Class, nil]
42
+ def klass_for(name)
43
+ @match_sets ? @match_sets[name] : nil
44
+ end
45
+ alias :[] :klass_for
18
46
  end
19
47
 
48
+ # This is called by {MatchRecorder#start}, before any matches are added via
49
+ # {#add_match}. Subclasses can redefine this to perform any setup needed for
50
+ # saving matches.
20
51
  def open_for_writing
21
52
  end
22
53
 
54
+ # Add a match to the MatchSet. Subclasses must redefine this.
55
+ #
56
+ # @param id_1 [Object] record id from first dataset
57
+ # @param id_2 [Object] record id from second dataset
58
+ # @param score [Fixnum, Float] match score
23
59
  # @abstract
24
60
  def add_match(id_1, id_2, score)
25
61
  raise NotImplementedError
26
62
  end
27
63
 
64
+ # This is called by {MatchRecorder#stop}, after all matches have been added.
65
+ # Subclasses can redefine this to perform any teardown needed.
28
66
  def close
29
67
  end
30
68
  end
@@ -2,27 +2,54 @@ require 'csv'
2
2
 
3
3
  module Linkage
4
4
  module MatchSets
5
+ # {CSV MatchSets::CSV} is an implementation of {MatchSet} for saving
6
+ # matches in a CSV file.
7
+ #
8
+ # There are three options available:
9
+ #
10
+ # * `:filename` - which file to store matches in; can be an absolute path
11
+ # or relative path
12
+ # * `:dir` - which directory to put the file in; used if `:filename` is a
13
+ # relative path
14
+ # * `:overwrite` - indicate whether or not to overwrite an existing file
15
+ #
16
+ # By default, `:filename` is `'matches.csv'`, and the other options are
17
+ # blank. This means that it will write matches to the `'matches.csv'` file
18
+ # in the current working directory and will raise an error if the file
19
+ # already exists.
20
+ #
21
+ # If you specify `:dir`, that path will be created if it doesn't exist yet.
22
+ #
23
+ # The resulting file looks like this:
24
+ #
25
+ # id_1,id_2,score
26
+ # 123,456,0.75
27
+ # 124,457,1
28
+ #
29
+ # @see Helpers::CSV
5
30
  class CSV < MatchSet
6
- def initialize(filename, options = {})
7
- @filename = filename
8
- @overwrite = options[:overwrite]
31
+ include Helpers::CSV
32
+
33
+ DEFAULT_OPTIONS = {
34
+ :filename => 'matches.csv'
35
+ }
36
+
37
+ def initialize(options = {})
38
+ @options = DEFAULT_OPTIONS.merge(options.reject { |k, v| v.nil? })
9
39
  end
10
40
 
11
41
  def open_for_writing
12
42
  return if @mode == :write
13
43
 
14
- if !@overwrite && File.exist?(@filename)
15
- raise ExistsError, "#{@filename} exists and not in overwrite mode"
16
- end
17
-
18
- @csv = ::CSV.open(@filename, 'wb')
44
+ @csv = open_csv_for_writing(@options)
19
45
  @csv << %w{id_1 id_2 score}
20
46
  @mode = :write
21
47
  end
22
48
 
23
49
  def add_match(id_1, id_2, score)
24
50
  raise "not in write mode" if @mode != :write
25
- if score.equal?(1.0) || score.equal?(0.0)
51
+
52
+ if score.floor.equal?(score.ceil)
26
53
  score = score.floor
27
54
  end
28
55
  @csv << [id_1, id_2, score]
@@ -1,8 +1,49 @@
1
1
  module Linkage
2
2
  module MatchSets
3
+ # {Database MatchSets::Database} is an implementation of {MatchSet} for saving
4
+ # matches in a relational database.
5
+ #
6
+ # Matches are saved in a database table with the following columns:
7
+ # - id_1 (string)
8
+ # - id_2 (string)
9
+ # - score (float)
10
+ #
11
+ # You can set up a database connection in a few different ways. By default, a
12
+ # SQLite database with the filename of `matches.db` will be created in the
13
+ # current working directory. If you want something different, you can either
14
+ # specify a Sequel-style URI, provide connection options for
15
+ # `Sequel.connect`, or you can just specify a `Sequel::Database` object to
16
+ # use.
17
+ #
18
+ # There are a couple of non-Sequel connection options:
19
+ # * `:filename` - specify filename to use for a SQLite database
20
+ # * `:dir` - specify the parent directory for a SQLite database
21
+ #
22
+ # In addition to connection options, there are behavioral options you can
23
+ # set. By default, the table name used is called `matches`, but you can change
24
+ # that by setting the `:table_name` option in the second options hash. If
25
+ # the table already exists, an {ExistsError} will be raised unless you set
26
+ # the `:overwrite` option to a truthy value in the second options hash.
27
+ #
28
+ # @see Helpers::Database
3
29
  class Database < MatchSet
4
- def initialize(database, options = {})
5
- @database = database
30
+ include Helpers::Database
31
+
32
+ DEFAULT_OPTIONS = {
33
+ :filename => 'matches.db'
34
+ }
35
+
36
+ # @overload initialize(connection_options = {}, options = {})
37
+ # @param connection_options [Hash]
38
+ # @param options [Hash]
39
+ # @overload initialize(uri, options = {})
40
+ # @param uri [String]
41
+ # @param options [Hash]
42
+ # @overload initialize(database, options = {})
43
+ # @param database [Sequel::Database]
44
+ # @param options [Hash]
45
+ def initialize(connection_options = {}, options = {})
46
+ @database = database_connection(connection_options, DEFAULT_OPTIONS)
6
47
  @table_name = options[:table_name] || :matches
7
48
  @overwrite = options[:overwrite]
8
49
  end