linkage 0.1.0.pre → 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (49) hide show
  1. checksums.yaml +4 -4
  2. data/.yardopts +2 -0
  3. data/Guardfile +0 -1
  4. data/TODO +2 -0
  5. data/lib/linkage.rb +1 -0
  6. data/lib/linkage/comparator.rb +12 -2
  7. data/lib/linkage/comparators/strcompare.rb +68 -16
  8. data/lib/linkage/configuration.rb +112 -8
  9. data/lib/linkage/dataset.rb +124 -9
  10. data/lib/linkage/exceptions.rb +5 -0
  11. data/lib/linkage/field.rb +55 -18
  12. data/lib/linkage/field_set.rb +20 -0
  13. data/lib/linkage/helpers.rb +7 -0
  14. data/lib/linkage/helpers/csv.rb +28 -0
  15. data/lib/linkage/helpers/database.rb +47 -0
  16. data/lib/linkage/import_buffer.rb +3 -3
  17. data/lib/linkage/match_recorder.rb +4 -0
  18. data/lib/linkage/match_set.rb +51 -13
  19. data/lib/linkage/match_sets/csv.rb +36 -9
  20. data/lib/linkage/match_sets/database.rb +43 -2
  21. data/lib/linkage/matcher.rb +49 -3
  22. data/lib/linkage/result_set.rb +60 -22
  23. data/lib/linkage/result_sets/csv.rb +46 -28
  24. data/lib/linkage/result_sets/database.rb +44 -26
  25. data/lib/linkage/runner.rb +10 -0
  26. data/lib/linkage/score_recorder.rb +5 -0
  27. data/lib/linkage/score_set.rb +78 -20
  28. data/lib/linkage/score_sets/csv.rb +41 -15
  29. data/lib/linkage/score_sets/database.rb +43 -5
  30. data/lib/linkage/version.rb +1 -1
  31. data/linkage.gemspec +2 -0
  32. data/misc/uml/linkage.dia +0 -0
  33. data/misc/uml/linkage.png +0 -0
  34. data/misc/uml/linkage.svg +197 -0
  35. data/test/helper.rb +2 -11
  36. data/test/integration/test_database_result_set.rb +4 -2
  37. data/test/unit/comparators/test_strcompare.rb +29 -0
  38. data/test/unit/match_sets/test_csv.rb +44 -13
  39. data/test/unit/match_sets/test_database.rb +42 -1
  40. data/test/unit/result_sets/test_csv.rb +9 -69
  41. data/test/unit/result_sets/test_database.rb +20 -11
  42. data/test/unit/score_sets/test_csv.rb +68 -25
  43. data/test/unit/score_sets/test_database.rb +57 -1
  44. data/test/unit/test_comparator.rb +8 -0
  45. data/test/unit/test_configuration.rb +33 -6
  46. data/test/unit/test_dataset.rb +0 -7
  47. data/test/unit/test_matcher.rb +52 -3
  48. data/test/unit/test_result_set.rb +8 -14
  49. metadata +66 -32
@@ -1,5 +1,10 @@
1
1
  module Linkage
2
+ # Generic error.
2
3
  class Error < Exception; end
4
+
5
+ # Error raised when a file would be overwritten.
3
6
  class ExistsError < Error; end
7
+
8
+ # Error raised when trying to read a file that doesn't exist.
4
9
  class MissingError < Error; end
5
10
  end
@@ -1,15 +1,14 @@
1
1
  module Linkage
2
- # This class is for holding information about a particular field in a
3
- # dataset.
2
+ # {Field} describes a field in a dataset, otherwise known as a database table
3
+ # column.
4
4
  class Field
5
- # @!attribute [r] name
6
- # @return [Symbol] This object's name
5
+ # @return [Symbol] This field's name
7
6
  attr_reader :name
8
7
 
9
- # @return [Symbol] This field's schema information
8
+ # @return [Hash] This field's schema information
10
9
  attr_reader :schema
11
10
 
12
- # Create a new instance of Field.
11
+ # Returns a new instance of Field.
13
12
  #
14
13
  # @param [Symbol] name The field's name
15
14
  # @param [Hash] schema The field's schema information
@@ -19,23 +18,58 @@ module Linkage
19
18
  end
20
19
 
21
20
  # Convert the column schema information to a hash of column options, one of
22
- # which must be :type. The other options added should modify that type
23
- # (e.g. :size). If a database type is not recognized, return it as a String
24
- # type.
21
+ # which is `:type`. The other options modify that type (e.g. `:size`).
25
22
  #
26
- # @note This method comes more or less straight from Sequel
27
- # (lib/sequel/extensions/schema_dumper.rb).
23
+ # Here are some examples:
24
+ #
25
+ # | Database type | Ruby type | Other modifiers |
26
+ # |------------------|--------------------|-----------------------|
27
+ # | mediumint | Fixnum | |
28
+ # | smallint | Fixnum | |
29
+ # | int | Fixnum | |
30
+ # | int(10) unsigned | Bignum | |
31
+ # | tinyint | TrueClass, Integer | |
32
+ # | bigint | Bignum | |
33
+ # | real | Float | |
34
+ # | float | Float | |
35
+ # | double | Float | |
36
+ # | boolean | TrueClass | |
37
+ # | text | String | text: true |
38
+ # | date | Date | |
39
+ # | datetime | DateTime | |
40
+ # | timestamp | DateTime | |
41
+ # | time | Time | only_time: true |
42
+ # | varchar(255) | String | size: 255 |
43
+ # | char(10) | String | size: 10, fixed: true |
44
+ # | money | BigDecimal | size: [19, 2] |
45
+ # | decimal | BigDecimal | |
46
+ # | numeric | BigDecimal | |
47
+ # | number | BigDecimal | |
48
+ # | blob | File | |
49
+ # | year | Integer | |
50
+ # | identity | Integer | |
51
+ # | **other types** | String | |
52
+ #
53
+ # @note This method is copied from
54
+ # {http://sequel.jeremyevans.net/rdoc-plugins/classes/Sequel/SchemaDumper.html `Sequel::SchemaDumper`}.
55
+ # @return [Hash]
28
56
  def ruby_type
29
57
  unless @ruby_type
30
58
  hsh =
31
- case t = @schema[:db_type].downcase
32
- when /\A(?:medium|small)?int(?:eger)?(?:\((?:\d+)\))?(?: unsigned)?\z/o
33
- {:type=>Integer}
34
- when /\Atinyint(?:\((\d+)\))?\z/o
35
- {:type =>@schema[:type] == :boolean ? TrueClass : Integer}
59
+ case @schema[:db_type].downcase
60
+ when /\A(medium|small)?int(?:eger)?(?:\((\d+)\))?( unsigned)?\z/o
61
+ if !$1 && $2 && $2.to_i >= 10 && $3
62
+ # Unsigned integer type with 10 digits can potentially contain values which
63
+ # don't fit signed integer type, so use bigint type in target database.
64
+ {:type=>Bignum}
65
+ else
66
+ {:type=>Integer}
67
+ end
68
+ when /\Atinyint(?:\((\d+)\))?(?: unsigned)?\z/o
69
+ {:type =>schema[:type] == :boolean ? TrueClass : Integer}
36
70
  when /\Abigint(?:\((?:\d+)\))?(?: unsigned)?\z/o
37
71
  {:type=>Bignum}
38
- when /\A(?:real|float|double(?: precision)?)\z/o
72
+ when /\A(?:real|float|double(?: precision)?|double\(\d+,\d+\)(?: unsigned)?)\z/o
39
73
  {:type=>Float}
40
74
  when 'boolean'
41
75
  {:type=>TrueClass}
@@ -60,7 +94,7 @@ module Linkage
60
94
  {:type=>BigDecimal, :size=>(s.empty? ? nil : s)}
61
95
  when /\A(?:bytea|(?:tiny|medium|long)?blob|(?:var)?binary)(?:\((\d+)\))?\z/o
62
96
  {:type=>File, :size=>($1.to_i if $1)}
63
- when 'year'
97
+ when /\A(?:year|(?:int )?identity)\z/o
64
98
  {:type=>Integer}
65
99
  else
66
100
  {:type=>String}
@@ -73,6 +107,9 @@ module Linkage
73
107
  @ruby_type
74
108
  end
75
109
 
110
+ # Returns whether or not this field is a primary key.
111
+ #
112
+ # @return [Boolean]
76
113
  def primary_key?
77
114
  schema && schema[:primary_key]
78
115
  end
@@ -1,5 +1,11 @@
1
1
  module Linkage
2
+ # {FieldSet} is a `Hash` of {Field} values. It is usually associated with a
3
+ # {Dataset}. It looks up keys in a case-insensitive manner and doesn't care if
4
+ # you use strings or symbols.
5
+ #
6
+ # @see Dataset#field_set
2
7
  class FieldSet < Hash
8
+ # @return [Field] primary key of this field set.
3
9
  attr_reader :primary_key
4
10
 
5
11
  # Create a new FieldSet.
@@ -16,15 +22,29 @@ module Linkage
16
22
  end
17
23
  end
18
24
 
25
+ # Returns whether or not `key` is contained in the field set
26
+ # (case-insensitive).
27
+ #
28
+ # @param key [String, Symbol]
29
+ # @return [Boolean]
19
30
  def has_key?(key)
20
31
  !fetch_key(key).nil?
21
32
  end
22
33
 
34
+ # Returns a key that matches the parameter in a case-insensitive manner.
35
+ #
36
+ # @param key [String, Symbol]
37
+ # @return [Symbol]
23
38
  def fetch_key(key)
24
39
  string_key = key.to_s
25
40
  keys.detect { |k| k.to_s.casecmp(string_key) == 0 }
26
41
  end
27
42
 
43
+ # Returns the value for `key`, where `key` is matched in a case-insensitive
44
+ # manner.
45
+ #
46
+ # @param key [String, Symbol]
47
+ # @return [Field]
28
48
  def [](key)
29
49
  k = fetch_key(key)
30
50
  k ? super(k) : nil
@@ -0,0 +1,7 @@
1
+ module Linkage
2
+ module Helpers
3
+ end
4
+ end
5
+
6
+ require 'linkage/helpers/csv'
7
+ require 'linkage/helpers/database'
@@ -0,0 +1,28 @@
1
+ module Linkage
2
+ module Helpers
3
+ module CSV
4
+ def csv_filename(options)
5
+ File.expand_path(options[:filename], options[:dir] || '.')
6
+ end
7
+
8
+ def open_csv_for_reading(options)
9
+ filename = csv_filename(options)
10
+ if !File.exist?(filename)
11
+ raise MissingError, "#{filename} does not exist"
12
+ end
13
+ ::CSV.open(filename, 'rb', :headers => true)
14
+ end
15
+
16
+ def open_csv_for_writing(options)
17
+ filename = csv_filename(options)
18
+ if !options[:overwrite] && File.exist?(filename)
19
+ raise ExistsError, "#{filename} exists and not in overwrite mode"
20
+ end
21
+ if options[:dir]
22
+ FileUtils.mkdir_p(File.dirname(filename))
23
+ end
24
+ ::CSV.open(filename, 'wb')
25
+ end
26
+ end
27
+ end
28
+ end
@@ -0,0 +1,47 @@
1
+ module Linkage
2
+ module Helpers
3
+ module Database
4
+ # Returns a `Sequel::Database`.
5
+ #
6
+ # @overload database_connection(connection_options = {}, default_options = {})
7
+ # @param connection_options [Hash] Options to establish a connection. Any
8
+ # options not explicitly listed below are passed directly to `Sequel.connect`.
9
+ # @option connection_options [String] :dir Parent directory to use for SQLite database
10
+ # @option connection_options [String] :filename SQLite database filename
11
+ # @overload database_connection(url)
12
+ # @param url [String] Sequel-style connection url
13
+ # @overload database_connection(database)
14
+ # @param database [Sequel::Database]
15
+ def database_connection(connection_options = {}, default_options = {})
16
+ sequel_options = nil
17
+ connection_options ||= default_options
18
+
19
+ case connection_options
20
+ when Hash
21
+ connection_options = default_options.merge(connection_options)
22
+ sequel_options = connection_options.reject do |key, value|
23
+ key == :dir || key == :filename
24
+ end
25
+
26
+ if sequel_options.empty?
27
+ filename = connection_options[:filename] || 'linkage.db'
28
+ if connection_options[:dir]
29
+ dir = File.expand_path(connection_options[:dir])
30
+ FileUtils.mkdir_p(dir)
31
+ filename = File.join(dir, filename)
32
+ end
33
+ sequel_options[:adapter] = :sqlite
34
+ sequel_options[:database] = filename
35
+ end
36
+ when String
37
+ sequel_options = connection_options
38
+ when Sequel::Database
39
+ return connection_options
40
+ else
41
+ raise ArgumentError, "Expected Hash or String, got #{connection_options.class}"
42
+ end
43
+ Sequel.connect(sequel_options)
44
+ end
45
+ end
46
+ end
47
+ end
@@ -1,8 +1,8 @@
1
1
  module Linkage
2
2
  class ImportBuffer
3
- # @param [Sequel::Dataset] dataset
4
- # @param [Array<Symbol>] headers List of fields you want to insert
5
- # @param [Fixnum] limit Number of records to insert at a time
3
+ # @param dataset [Sequel::Dataset]
4
+ # @param headers [Array<Symbol>] List of fields you want to insert
5
+ # @param limit [Fixnum] Number of records to insert at a time
6
6
  def initialize(dataset, headers, limit = 1000)
7
7
  @dataset = dataset
8
8
  @headers = headers
@@ -1,5 +1,9 @@
1
1
  module Linkage
2
+ # {MatchRecorder} is responsible for observing {Matcher} for changes and
3
+ # saving matches to a {MatchSet} via {MatchSet#add_match}.
2
4
  class MatchRecorder
5
+ # @param matcher [Matcher]
6
+ # @param match_set [MatchSet]
3
7
  def initialize(matcher, match_set)
4
8
  @matcher = matcher
5
9
  @match_set = match_set
@@ -1,30 +1,68 @@
1
1
  module Linkage
2
+ # A {MatchSet} is responsible for keeping track of matches. After the scoring
3
+ # process, a {Matcher} uses scores from a {ScoreSet} to calculate which record
4
+ # pairs match. Those pairs are then recorded by a {MatchRecorder} to a
5
+ # {MatchSet}.
6
+ #
7
+ # {MatchSet} is the superclass of implementations for different formats.
8
+ # Currently there are two formats for storing matches:
9
+ #
10
+ # * CSV ({MatchSets::CSV})
11
+ # * Database ({MatchSets::Database})
12
+ #
13
+ # See the documentation for the match set you're interested in for more
14
+ # information.
15
+ #
16
+ # If you want to implement a custom {MatchSet}, create a class that inherits
17
+ # {MatchSet} and defines at least {#add_match}. You can then register that
18
+ # class via {.register}.
19
+ #
20
+ # @abstract
2
21
  class MatchSet
3
- # Register a match set.
4
- #
5
- # @param [Class] klass
6
- def self.register(name, klass)
7
- methods = klass.instance_methods(false)
8
- unless methods.include?(:add_match)
9
- raise ArgumentError, "class must define #add_match"
10
- end
22
+ class << self
23
+ # Register a new match set. Subclasses must define at least {#add_match},
24
+ # otherwise an `ArgumentError` will be raised.
25
+ #
26
+ # @param [String] name Match set name used in {.klass_for}
27
+ # @param [Class] klass MatchSet subclass
28
+ def register(name, klass)
29
+ methods = klass.instance_methods(false)
30
+ unless methods.include?(:add_match)
31
+ raise ArgumentError, "class must define #add_match"
32
+ end
11
33
 
12
- @match_sets ||= {}
13
- @match_sets[name] = klass
14
- end
34
+ @match_sets ||= {}
35
+ @match_sets[name] = klass
36
+ end
15
37
 
16
- def self.[](name)
17
- @match_sets ? @match_sets[name] : nil
38
+ # Return a registered MatchSet subclass or `nil` if it doesn't exist.
39
+ #
40
+ # @param [String] name of registered match set
41
+ # @return [Class, nil]
42
+ def klass_for(name)
43
+ @match_sets ? @match_sets[name] : nil
44
+ end
45
+ alias :[] :klass_for
18
46
  end
19
47
 
48
+ # This is called by {MatchRecorder#start}, before any matches are added via
49
+ # {#add_match}. Subclasses can redefine this to perform any setup needed for
50
+ # saving matches.
20
51
  def open_for_writing
21
52
  end
22
53
 
54
+ # Add a match to the MatchSet. Subclasses must redefine this.
55
+ #
56
+ # @param id_1 [Object] record id from first dataset
57
+ # @param id_2 [Object] record id from second dataset
58
+ # @param score [Fixnum, Float] match score
23
59
  # @abstract
24
60
  def add_match(id_1, id_2, score)
25
61
  raise NotImplementedError
26
62
  end
27
63
 
64
+ # This is called by {MatchRecorder#stop}, after all matches have been added.
65
+ # Subclasses can redefine this to perform any teardown needed.
28
66
  def close
29
67
  end
30
68
  end
@@ -2,27 +2,54 @@ require 'csv'
2
2
 
3
3
  module Linkage
4
4
  module MatchSets
5
+ # {CSV MatchSets::CSV} is an implementation of {MatchSet} for saving
6
+ # matches in a CSV file.
7
+ #
8
+ # There are three options available:
9
+ #
10
+ # * `:filename` - which file to store matches in; can be an absolute path
11
+ # or relative path
12
+ # * `:dir` - which directory to put the file in; used if `:filename` is a
13
+ # relative path
14
+ # * `:overwrite` - indicate whether or not to overwrite an existing file
15
+ #
16
+ # By default, `:filename` is `'matches.csv'`, and the other options are
17
+ # blank. This means that it will write matches to the `'matches.csv'` file
18
+ # in the current working directory and will raise an error if the file
19
+ # already exists.
20
+ #
21
+ # If you specify `:dir`, that path will be created if it doesn't exist yet.
22
+ #
23
+ # The resulting file looks like this:
24
+ #
25
+ # id_1,id_2,score
26
+ # 123,456,0.75
27
+ # 124,457,1
28
+ #
29
+ # @see Helpers::CSV
5
30
  class CSV < MatchSet
6
- def initialize(filename, options = {})
7
- @filename = filename
8
- @overwrite = options[:overwrite]
31
+ include Helpers::CSV
32
+
33
+ DEFAULT_OPTIONS = {
34
+ :filename => 'matches.csv'
35
+ }
36
+
37
+ def initialize(options = {})
38
+ @options = DEFAULT_OPTIONS.merge(options.reject { |k, v| v.nil? })
9
39
  end
10
40
 
11
41
  def open_for_writing
12
42
  return if @mode == :write
13
43
 
14
- if !@overwrite && File.exist?(@filename)
15
- raise ExistsError, "#{@filename} exists and not in overwrite mode"
16
- end
17
-
18
- @csv = ::CSV.open(@filename, 'wb')
44
+ @csv = open_csv_for_writing(@options)
19
45
  @csv << %w{id_1 id_2 score}
20
46
  @mode = :write
21
47
  end
22
48
 
23
49
  def add_match(id_1, id_2, score)
24
50
  raise "not in write mode" if @mode != :write
25
- if score.equal?(1.0) || score.equal?(0.0)
51
+
52
+ if score.floor.equal?(score.ceil)
26
53
  score = score.floor
27
54
  end
28
55
  @csv << [id_1, id_2, score]
@@ -1,8 +1,49 @@
1
1
  module Linkage
2
2
  module MatchSets
3
+ # {Database MatchSets::Database} is an implementation of {MatchSet} for saving
4
+ # matches in a relational database.
5
+ #
6
+ # Matches are saved in a database table with the following columns:
7
+ # - id_1 (string)
8
+ # - id_2 (string)
9
+ # - score (float)
10
+ #
11
+ # You can set up a database connection in a few different ways. By default, a
12
+ # SQLite database with the filename of `matches.db` will be created in the
13
+ # current working directory. If you want something different, you can either
14
+ # specify a Sequel-style URI, provide connection options for
15
+ # `Sequel.connect`, or you can just specify a `Sequel::Database` object to
16
+ # use.
17
+ #
18
+ # There are a couple of non-Sequel connection options:
19
+ # * `:filename` - specify filename to use for a SQLite database
20
+ # * `:dir` - specify the parent directory for a SQLite database
21
+ #
22
+ # In addition to connection options, there are behavioral options you can
23
+ # set. By default, the table name used is called `matches`, but you can change
24
+ # that by setting the `:table_name` option in the second options hash. If
25
+ # the table already exists, an {ExistsError} will be raised unless you set
26
+ # the `:overwrite` option to a truthy value in the second options hash.
27
+ #
28
+ # @see Helpers::Database
3
29
  class Database < MatchSet
4
- def initialize(database, options = {})
5
- @database = database
30
+ include Helpers::Database
31
+
32
+ DEFAULT_OPTIONS = {
33
+ :filename => 'matches.db'
34
+ }
35
+
36
+ # @overload initialize(connection_options = {}, options = {})
37
+ # @param connection_options [Hash]
38
+ # @param options [Hash]
39
+ # @overload initialize(uri, options = {})
40
+ # @param uri [String]
41
+ # @param options [Hash]
42
+ # @overload initialize(database, options = {})
43
+ # @param database [Sequel::Database]
44
+ # @param options [Hash]
45
+ def initialize(connection_options = {}, options = {})
46
+ @database = database_connection(connection_options, DEFAULT_OPTIONS)
6
47
  @table_name = options[:table_name] || :matches
7
48
  @overwrite = options[:overwrite]
8
49
  end