linkage 0.1.0.pre → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49) hide show
  1. checksums.yaml +4 -4
  2. data/.yardopts +2 -0
  3. data/Guardfile +0 -1
  4. data/TODO +2 -0
  5. data/lib/linkage.rb +1 -0
  6. data/lib/linkage/comparator.rb +12 -2
  7. data/lib/linkage/comparators/strcompare.rb +68 -16
  8. data/lib/linkage/configuration.rb +112 -8
  9. data/lib/linkage/dataset.rb +124 -9
  10. data/lib/linkage/exceptions.rb +5 -0
  11. data/lib/linkage/field.rb +55 -18
  12. data/lib/linkage/field_set.rb +20 -0
  13. data/lib/linkage/helpers.rb +7 -0
  14. data/lib/linkage/helpers/csv.rb +28 -0
  15. data/lib/linkage/helpers/database.rb +47 -0
  16. data/lib/linkage/import_buffer.rb +3 -3
  17. data/lib/linkage/match_recorder.rb +4 -0
  18. data/lib/linkage/match_set.rb +51 -13
  19. data/lib/linkage/match_sets/csv.rb +36 -9
  20. data/lib/linkage/match_sets/database.rb +43 -2
  21. data/lib/linkage/matcher.rb +49 -3
  22. data/lib/linkage/result_set.rb +60 -22
  23. data/lib/linkage/result_sets/csv.rb +46 -28
  24. data/lib/linkage/result_sets/database.rb +44 -26
  25. data/lib/linkage/runner.rb +10 -0
  26. data/lib/linkage/score_recorder.rb +5 -0
  27. data/lib/linkage/score_set.rb +78 -20
  28. data/lib/linkage/score_sets/csv.rb +41 -15
  29. data/lib/linkage/score_sets/database.rb +43 -5
  30. data/lib/linkage/version.rb +1 -1
  31. data/linkage.gemspec +2 -0
  32. data/misc/uml/linkage.dia +0 -0
  33. data/misc/uml/linkage.png +0 -0
  34. data/misc/uml/linkage.svg +197 -0
  35. data/test/helper.rb +2 -11
  36. data/test/integration/test_database_result_set.rb +4 -2
  37. data/test/unit/comparators/test_strcompare.rb +29 -0
  38. data/test/unit/match_sets/test_csv.rb +44 -13
  39. data/test/unit/match_sets/test_database.rb +42 -1
  40. data/test/unit/result_sets/test_csv.rb +9 -69
  41. data/test/unit/result_sets/test_database.rb +20 -11
  42. data/test/unit/score_sets/test_csv.rb +68 -25
  43. data/test/unit/score_sets/test_database.rb +57 -1
  44. data/test/unit/test_comparator.rb +8 -0
  45. data/test/unit/test_configuration.rb +33 -6
  46. data/test/unit/test_dataset.rb +0 -7
  47. data/test/unit/test_matcher.rb +52 -3
  48. data/test/unit/test_result_set.rb +8 -14
  49. metadata +66 -32
@@ -1,9 +1,29 @@
1
1
  module Linkage
2
+ # {Matcher} is responsible for combining scores from a {ScoreSet} and deciding
3
+ # which pairs of records match. There are two parameters you can use to
4
+ # determine how {Matcher} does this: `algorithm` and `threshold`.
5
+ #
6
+ # There are currently two algorithm options: `:mean` and `:sum`. The mean
7
+ # algorithm will create a mean score for each pair of records. The sum
8
+ # algorithm will create a total score for each pair of records.
9
+ #
10
+ # The `threshold` parameter determines what is considered a match. If the
11
+ # result score for a pair of records (depending on the algorithm used) is
12
+ # greater than or equal to the threshold, then the pair is considered to be a
13
+ # match.
14
+ #
15
+ # Whenever {Matcher} finds a match, it uses the observer pattern to notify
16
+ # other objects that a match has been found. Usually the only observer is a
17
+ # {MatchRecorder}.
2
18
  class Matcher
3
19
  include Observable
4
20
 
5
21
  attr_reader :comparators, :score_set, :algorithm, :threshold
6
22
 
23
+ # @param comparators [Array<Comparator>]
24
+ # @param score_set [ScoreSet]
25
+ # @param algorithm [Symbol] `:mean` or `:sum`
26
+ # @param threshold [Numeric]
7
27
  def initialize(comparators, score_set, algorithm, threshold)
8
28
  @comparators = comparators
9
29
  @score_set = score_set
@@ -11,20 +31,46 @@ module Linkage
11
31
  @threshold = threshold
12
32
  end
13
33
 
34
+ # Find matches.
14
35
  def run
15
36
  send(@algorithm)
16
37
  end
17
38
 
18
- private
19
-
39
+ # Combine scores for each pair of records via mean, then compare the
40
+ # combined score to the threshold. Notify observers if there's a match.
20
41
  def mean
42
+ w = @comparators.collect { |comparator| comparator.weight || 1 }
43
+ @score_set.open_for_reading
21
44
  @score_set.each_pair do |id_1, id_2, scores|
22
- mean = scores.values.inject(:+) / @comparators.length.to_f
45
+ sum = 0
46
+ scores.each do |key, value|
47
+ sum += value * w[key-1]
48
+ end
49
+ mean = sum / @comparators.length.to_f
23
50
  if mean >= @threshold
24
51
  changed
25
52
  notify_observers(id_1, id_2, mean)
26
53
  end
27
54
  end
55
+ @score_set.close
56
+ end
57
+
58
+ # Combine scores for each pair of records via sum, then compare the
59
+ # combined score to the threshold. Notify observers if there's a match.
60
+ def sum
61
+ w = @comparators.collect { |comparator| comparator.weight || 1 }
62
+ @score_set.open_for_reading
63
+ @score_set.each_pair do |id_1, id_2, scores|
64
+ sum = 0
65
+ scores.each do |key, value|
66
+ sum += value * w[key-1]
67
+ end
68
+ if sum >= @threshold
69
+ changed
70
+ notify_observers(id_1, id_2, sum)
71
+ end
72
+ end
73
+ @score_set.close
28
74
  end
29
75
  end
30
76
  end
@@ -1,37 +1,75 @@
1
1
  module Linkage
2
+ # A {ResultSet} is a convenience class for wrapping a {ScoreSet} and a
3
+ # {MatchSet}. Most of the time, you'll want to use the same storage format for
4
+ # both scores and matches. {ResultSet} provides a way to group both sets
5
+ # together.
6
+ #
7
+ # The default implementation of {ResultSet} merely returns whatever {ScoreSet}
8
+ # and {MatchSet} you pass to it during creation (see {#initialize}). However,
9
+ # {ResultSet} can be subclassed to provide easy initialization of sets of the
10
+ # same format. Currently there are two subclasses:
11
+ #
12
+ # * CSV ({ResultSets::CSV})
13
+ # * Database ({ResultSets::Database})
14
+ #
15
+ # If you want to implement a custom {ResultSet}, create a class that inherits
16
+ # {ResultSet} and defines both {#score_set} and {#match_set} to return a
17
+ # {ScoreSet} and {MatchSet} respectively. You can then register that class via
18
+ # {.register} to make it easier to use.
2
19
  class ResultSet
3
- # Register a result set.
4
- #
5
- # @param [Class] klass
6
- def self.register(name, klass)
7
- methods = klass.instance_methods(false)
8
- missing = []
9
- unless methods.include?(:score_set)
10
- missing.push("#score_set")
11
- end
12
- unless methods.include?(:match_set)
13
- missing.push("#match_set")
14
- end
15
- unless missing.empty?
16
- raise ArgumentError, "class must define #{missing.join(" and ")}"
20
+ class << self
21
+ # Register a new result set. Subclasses must define {#score_set} and
22
+ # {#match_set}. Otherwise, an `ArgumentError` will be raised when you try
23
+ # to call {.register}.
24
+ #
25
+ # @param [String] name Result set name used in {.klass_for}
26
+ # @param [Class] klass ResultSet subclass
27
+ def register(name, klass)
28
+ methods = klass.instance_methods
29
+ missing = []
30
+ unless methods.include?(:score_set)
31
+ missing.push("#score_set")
32
+ end
33
+ unless methods.include?(:match_set)
34
+ missing.push("#match_set")
35
+ end
36
+ unless missing.empty?
37
+ raise ArgumentError, "class must define #{missing.join(" and ")}"
38
+ end
39
+
40
+ @result_set ||= {}
41
+ @result_set[name] = klass
17
42
  end
18
43
 
19
- @result_set ||= {}
20
- @result_set[name] = klass
44
+ # Return a registered ResultSet subclass or `nil` if it doesn't exist.
45
+ #
46
+ # @param [String] name of registered result set
47
+ # @return [Class, nil]
48
+ def klass_for(name)
49
+ @result_set ? @result_set[name] : nil
50
+ end
51
+ alias :[] :klass_for
21
52
  end
22
53
 
23
- def self.[](name)
24
- @result_set ? @result_set[name] : nil
54
+ # @param [ScoreSet] score_set
55
+ # @param [MatchSet] match_set
56
+ def initialize(score_set, match_set)
57
+ @score_set = score_set
58
+ @match_set = match_set
25
59
  end
26
60
 
27
- # @abstract
61
+ # Returns a {ScoreSet}.
62
+ #
63
+ # @return [ScoreSet]
28
64
  def score_set
29
- raise NotImplementedError
65
+ @score_set
30
66
  end
31
67
 
32
- # @abstract
68
+ # Returns a {MatchSet}.
69
+ #
70
+ # @return [MatchSet]
33
71
  def match_set
34
- raise NotImplementedError
72
+ @match_set
35
73
  end
36
74
  end
37
75
  end
@@ -1,8 +1,51 @@
1
1
  module Linkage
2
2
  module ResultSets
3
+ # {CSV ResultSets::CSV} is a subclass of {ResultSet ResultSet} that makes it
4
+ # convenient to set up a {ScoreSets::CSV} and {MatchSets::CSV} at the same
5
+ # time. For example:
6
+ #
7
+ # ```ruby
8
+ # result_set = Linkage::ResultSets::CSV.new('/some/path')
9
+ # ```
10
+ #
11
+ # Or by using {ResultSet.[] ResultSet.[]}:
12
+ #
13
+ # ```ruby
14
+ # result_set = Linkage::ResultSet['csv'].new('/some/path')
15
+ # ```
16
+ #
17
+ # {#initialize ResultSets::CSV.new} takes either a directory name as its
18
+ # argument or a Hash of options. Passing in a directory name is equivalent
19
+ # to passing in a Hash with the `:dir` key. For example:
20
+ #
21
+ # ```ruby
22
+ # result_set = Linkage::ResultSet['csv'].new('/some/path')
23
+ # ```
24
+ #
25
+ # is the same as:
26
+ #
27
+ # ```ruby
28
+ # result_set = Linkage::ResultSet['csv'].new({:dir => '/some/path'})
29
+ # ```
30
+ #
31
+ # The `:dir` option lets you specify the parent directory for the score set
32
+ # and match set files (which are `scores.csv` and `matches.csv` by default).
33
+ #
34
+ # The only other relevant option is `:overwrite`, which controls whether or
35
+ # not overwriting existing files is permitted.
36
+ #
37
+ # @see ScoreSets::CSV
38
+ # @see MatchSets::CSV
3
39
  class CSV < ResultSet
40
+ # @overload initialize(dir)
41
+ # @param [String] dir parent directory of CSV files
42
+ # @overload initialize(options)
43
+ # @param [Hash] options
44
+ # @option options [String] :dir parent directory of CSV files
45
+ # @option options [Boolean] :overwrite (false) whether or not to allow
46
+ # overwriting existing files
4
47
  def initialize(dir_or_options = nil)
5
- opts =
48
+ @options =
6
49
  case dir_or_options
7
50
  when nil
8
51
  {}
@@ -13,39 +56,14 @@ module Linkage
13
56
  else
14
57
  raise ArgumentError, "expected nil, a String, or a Hash, got #{dir_or_options.class}"
15
58
  end
16
-
17
- if opts[:dir]
18
- opts[:dir] = File.expand_path(opts[:dir])
19
- FileUtils.mkdir_p(opts[:dir])
20
- end
21
-
22
- @score_set_args = extract_args_for(:scores, opts)
23
- @match_set_args = extract_args_for(:matches, opts)
24
59
  end
25
60
 
26
61
  def score_set
27
- @score_set ||= ScoreSet['csv'].new(*@score_set_args)
62
+ @score_set ||= ScoreSet['csv'].new(@options)
28
63
  end
29
64
 
30
65
  def match_set
31
- @match_set ||= MatchSet['csv'].new(*@match_set_args)
32
- end
33
-
34
- private
35
-
36
- def extract_args_for(name, opts)
37
- dir = opts[:dir] || '.'
38
- opts = opts[name]
39
-
40
- filename =
41
- case opts
42
- when Hash, nil
43
- opts = opts ? opts.dup : {}
44
- opts.delete(:filename) || "#{name}.csv"
45
- when String
46
- opts
47
- end
48
- [File.join(dir, filename), opts]
66
+ @match_set ||= MatchSet['csv'].new(@options)
49
67
  end
50
68
  end
51
69
 
@@ -1,39 +1,57 @@
1
1
  module Linkage
2
2
  module ResultSets
3
+ # {Database ResultSets::Database} is the {ResultSet ResultSet} for writing
4
+ # to database tables. You can use it by either referencing it directly like
5
+ # so:
6
+ #
7
+ # ```ruby
8
+ # result_set = Linkage::ResultSets::Database.new(connection_options, options)
9
+ # ```
10
+ #
11
+ # Or by using {ResultSet.[] ResultSet.[]}:
12
+ #
13
+ # ```ruby
14
+ # result_set = Linkage::ResultSet['database'].new(connection_options, options)
15
+ # ```
16
+ #
17
+ # You can set up a database connection in a few different ways. By default, a
18
+ # SQLite database with the filename of `results.db` will be created in the
19
+ # current working directory. If you want something different, you can either
20
+ # specify a Sequel-style URI, provide connection options for
21
+ # `Sequel.connect`, or you can just specify a
22
+ # {http://sequel.jeremyevans.net/rdoc/classes/Sequel/Database.html Sequel::Database}
23
+ # object to use.
24
+ #
25
+ # There are a couple of non-Sequel connection options:
26
+ # * `:filename` - specify filename to use for a SQLite database
27
+ # * `:dir` - specify the parent directory for a SQLite database
28
+ #
29
+ # This result set creates a {ScoreSets::Database database-backed score set}
30
+ # and a {MatchSets::Database database-backed match set} with their default
31
+ # table names (`scores` and `matches` respectively). If either table already
32
+ # exists, an {ExistsError} will be raised unless you set the `:overwrite`
33
+ # option to a truthy value in the second options hash.
34
+ #
35
+ # @see ScoreSets::Database
36
+ # @see MatchSets::Database
3
37
  class Database < ResultSet
4
- def initialize(database_or_options = nil)
5
- @database = nil
6
- @options = {}
38
+ include Helpers::Database
7
39
 
8
- if database_or_options.kind_of?(Sequel::Database)
9
- @database = database_or_options
10
- else
11
- database_opts = nil
12
- case database_or_options
13
- when String
14
- database_opts = database_or_options
15
- when Hash
16
- database_opts = {}
17
- database_or_options.each_pair do |key, value|
18
- if key == :scores || key == :matches
19
- @options[key] = value
20
- else
21
- database_opts[key] = value
22
- end
23
- end
24
- else
25
- raise ArgumentError, "expected Sequel::Database, a String, or a Hash, got #{database_or_options.class}"
26
- end
27
- @database = Sequel.connect(database_opts)
28
- end
40
+ DEFAULT_OPTIONS = {
41
+ :filename => 'results.db'
42
+ }
43
+
44
+ def initialize(connection_options = {}, options = {})
45
+ @database = database_connection(connection_options, DEFAULT_OPTIONS)
46
+ @options = options
29
47
  end
30
48
 
31
49
  def score_set
32
- @score_set ||= ScoreSet['database'].new(@database, @options[:scores] || {})
50
+ @score_set ||= ScoreSet['database'].new(@database, @options)
33
51
  end
34
52
 
35
53
  def match_set
36
- @match_set ||= MatchSet['database'].new(@database, @options[:matches] || {})
54
+ @match_set ||= MatchSet['database'].new(@database, @options)
37
55
  end
38
56
  end
39
57
 
@@ -1,5 +1,15 @@
1
1
  module Linkage
2
2
  # Use this class to run a configuration created by {Dataset#link_with}.
3
+ #
4
+ # During a record linkage, one or more {Comparator}s generate scores. Each
5
+ # score is recorded by a {ScoreRecorder}, which uses a {ScoreSet} to actually
6
+ # save the score. After the scoring is complete, a {Matcher} combines the
7
+ # scores to create matches. Each match is recorded by a {MatchRecorder}, which
8
+ # uses a {MatchSet} to actually save the match information.
9
+ #
10
+ # So to save scores and matches, we need both a {ScoreSet} and a {MatchSet}.
11
+ # To make this easier, a {ResultSet} can be used to configure both {ScoreSet}s
12
+ # and {MatchSet}s.
3
13
  class Runner
4
14
  attr_reader :config
5
15
 
@@ -1,5 +1,10 @@
1
1
  module Linkage
2
+ # {ScoreRecorder} is responsible for observing a set of {Comparator}s for
3
+ # changes and saving scores to a {ScoreSet} via {ScoreSet#add_score}.
2
4
  class ScoreRecorder
5
+ # @param comparators [Array<Comparator>]
6
+ # @param score_set [ScoreSet]
7
+ # @param primary_keys [Array<Symbol>]
3
8
  def initialize(comparators, score_set, primary_keys)
4
9
  @comparators = comparators
5
10
  @score_set = score_set
@@ -1,45 +1,103 @@
1
1
  module Linkage
2
+ # A {ScoreSet} is responsible for keeping track of scores. During the record
3
+ # linkage process, one or more {Comparator}s generate scores. These scores are
4
+ # handled by a {ScoreRecorder}, which uses a {ScoreSet} to actually save the
5
+ # scores. {ScoreSet} is also used to fetch the linkage scores so that a
6
+ # {Matcher} can create matches.
7
+ #
8
+ # {ScoreSet} is the superclass of implementations for different formats.
9
+ # Currently there are two formats for storing scores:
10
+ #
11
+ # * CSV ({ScoreSets::CSV})
12
+ # * Database ({ScoreSets::Database})
13
+ #
14
+ # See the documentation for the score set you're interested in for more
15
+ # information.
16
+ #
17
+ # If you want to implement a custom {ScoreSet}, create a class that inherits
18
+ # {ScoreSet} and defines at least {#add_score} and {#each_pair}. You can then
19
+ # register that class via {.register}.
20
+ #
21
+ # @abstract
2
22
  class ScoreSet
3
- # Register a score set.
4
- #
5
- # @param [Class] klass
6
- def self.register(name, klass)
7
- methods = klass.instance_methods(false)
8
- missing = []
9
- unless methods.include?(:add_score)
10
- missing.push("#add_score")
11
- end
12
- unless methods.include?(:each_pair)
13
- missing.push("#each_pair")
14
- end
15
- unless missing.empty?
16
- raise ArgumentError, "class must define #{missing.join(" and ")}"
17
- end
23
+ class << self
24
+ # Register a new score set. Subclasses must define at least {#add_score}
25
+ # and {#each_pair}. Otherwise, an `ArgumentError` will be raised when you
26
+ # try to call {.register}.
27
+ #
28
+ # @param [String] name Score set name used in {.klass_for}
29
+ # @param [Class] klass ScoreSet subclass
30
+ def register(name, klass)
31
+ methods = klass.instance_methods(false)
32
+ missing = []
33
+ unless methods.include?(:add_score)
34
+ missing.push("#add_score")
35
+ end
36
+ unless methods.include?(:each_pair)
37
+ missing.push("#each_pair")
38
+ end
39
+ unless missing.empty?
40
+ raise ArgumentError, "class must define #{missing.join(" and ")}"
41
+ end
18
42
 
19
- @score_sets ||= {}
20
- @score_sets[name] = klass
21
- end
43
+ @score_sets ||= {}
44
+ @score_sets[name] = klass
45
+ end
22
46
 
23
- def self.[](name)
24
- @score_sets ? @score_sets[name] : nil
47
+ # Return a registered ScoreSet subclass or `nil` if it doesn't exist.
48
+ #
49
+ # @param [String] name of registered score set
50
+ # @return [Class, nil]
51
+ def klass_for(name)
52
+ @score_sets ? @score_sets[name] : nil
53
+ end
54
+ alias :[] :klass_for
25
55
  end
26
56
 
57
+ # This is called by {Matcher#run}, before any scores are read via
58
+ # {#each_pair}. Subclasses can redefine this to perform any setup needed
59
+ # for reading scores.
27
60
  def open_for_reading
28
61
  end
29
62
 
63
+ # This is called by {ScoreRecorder#start}, before any scores are added via
64
+ # {#add_score}. Subclasses can redefine this to perform any setup needed
65
+ # for saving scores.
30
66
  def open_for_writing
31
67
  end
32
68
 
69
+ # Add a score to the ScoreSet. Subclasses must redefine this.
70
+ #
71
+ # @param comparator_id [Fixnum] 1-indexed comparator index
72
+ # @param id_1 [Object] record id from first dataset
73
+ # @param id_2 [Object] record id from second dataset
74
+ # @param value [Fixnum, Float] score value
33
75
  # @abstract
34
76
  def add_score(comparator_id, id_1, id_2, value)
35
77
  raise NotImplementedError
36
78
  end
37
79
 
80
+ # Yield scores for each pair of records. Subclasses must redefine this.
81
+ # This method is called by {Matcher#run} with a block with three
82
+ # parameters:
83
+ #
84
+ # ```ruby
85
+ # score_set.each_pair do |id_1, id_2, scores|
86
+ # end
87
+ # ```
88
+ #
89
+ # `scores` should be a Hash where comparator ids are keys and scores are
90
+ # values. For example: `{ 1 => 0.5, 2 => 0.75, 3 => 1 }`. Note that not all
91
+ # comparators (including {Comparators::Compare}) create scores for each
92
+ # pair. A missing score means that pair was given a score of 0.
93
+ #
38
94
  # @abstract
39
95
  def each_pair(&block)
40
96
  raise NotImplementedError
41
97
  end
42
98
 
99
+ # This is called by {ScoreRecorder#stop}, after all scores have been added, and by {Matcher} after reading scores.
100
+ # Subclasses can redefine this to perform any teardown needed.
43
101
  def close
44
102
  end
45
103
  end