linkage 0.1.0.pre → 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (49) hide show
  1. checksums.yaml +4 -4
  2. data/.yardopts +2 -0
  3. data/Guardfile +0 -1
  4. data/TODO +2 -0
  5. data/lib/linkage.rb +1 -0
  6. data/lib/linkage/comparator.rb +12 -2
  7. data/lib/linkage/comparators/strcompare.rb +68 -16
  8. data/lib/linkage/configuration.rb +112 -8
  9. data/lib/linkage/dataset.rb +124 -9
  10. data/lib/linkage/exceptions.rb +5 -0
  11. data/lib/linkage/field.rb +55 -18
  12. data/lib/linkage/field_set.rb +20 -0
  13. data/lib/linkage/helpers.rb +7 -0
  14. data/lib/linkage/helpers/csv.rb +28 -0
  15. data/lib/linkage/helpers/database.rb +47 -0
  16. data/lib/linkage/import_buffer.rb +3 -3
  17. data/lib/linkage/match_recorder.rb +4 -0
  18. data/lib/linkage/match_set.rb +51 -13
  19. data/lib/linkage/match_sets/csv.rb +36 -9
  20. data/lib/linkage/match_sets/database.rb +43 -2
  21. data/lib/linkage/matcher.rb +49 -3
  22. data/lib/linkage/result_set.rb +60 -22
  23. data/lib/linkage/result_sets/csv.rb +46 -28
  24. data/lib/linkage/result_sets/database.rb +44 -26
  25. data/lib/linkage/runner.rb +10 -0
  26. data/lib/linkage/score_recorder.rb +5 -0
  27. data/lib/linkage/score_set.rb +78 -20
  28. data/lib/linkage/score_sets/csv.rb +41 -15
  29. data/lib/linkage/score_sets/database.rb +43 -5
  30. data/lib/linkage/version.rb +1 -1
  31. data/linkage.gemspec +2 -0
  32. data/misc/uml/linkage.dia +0 -0
  33. data/misc/uml/linkage.png +0 -0
  34. data/misc/uml/linkage.svg +197 -0
  35. data/test/helper.rb +2 -11
  36. data/test/integration/test_database_result_set.rb +4 -2
  37. data/test/unit/comparators/test_strcompare.rb +29 -0
  38. data/test/unit/match_sets/test_csv.rb +44 -13
  39. data/test/unit/match_sets/test_database.rb +42 -1
  40. data/test/unit/result_sets/test_csv.rb +9 -69
  41. data/test/unit/result_sets/test_database.rb +20 -11
  42. data/test/unit/score_sets/test_csv.rb +68 -25
  43. data/test/unit/score_sets/test_database.rb +57 -1
  44. data/test/unit/test_comparator.rb +8 -0
  45. data/test/unit/test_configuration.rb +33 -6
  46. data/test/unit/test_dataset.rb +0 -7
  47. data/test/unit/test_matcher.rb +52 -3
  48. data/test/unit/test_result_set.rb +8 -14
  49. metadata +66 -32
@@ -1,9 +1,29 @@
1
1
  module Linkage
2
+ # {Matcher} is responsible for combining scores from a {ScoreSet} and deciding
3
+ # which pairs of records match. There are two parameters you can use to
4
+ # determine how {Matcher} does this: `algorithm` and `threshold`.
5
+ #
6
+ # There are currently two algorithm options: `:mean` and `:sum`. The mean
7
+ # algorithm will create a mean score for each pair of records. The sum
8
+ # algorithm will create a total score for each pair of records.
9
+ #
10
+ # The `threshold` parameter determines what is considered a match. If the
11
+ # result score for a pair of records (depending on the algorithm used) is
12
+ # greater than or equal to the threshold, then the pair is considered to be a
13
+ # match.
14
+ #
15
+ # Whenever {Matcher} finds a match, it uses the observer pattern to notify
16
+ # other objects that a match has been found. Usually the only observer is a
17
+ # {MatchRecorder}.
2
18
  class Matcher
3
19
  include Observable
4
20
 
5
21
  attr_reader :comparators, :score_set, :algorithm, :threshold
6
22
 
23
+ # @param comparators [Array<Comparator>]
24
+ # @param score_set [ScoreSet]
25
+ # @param algorithm [Symbol] `:mean` or `:sum`
26
+ # @param threshold [Numeric]
7
27
  def initialize(comparators, score_set, algorithm, threshold)
8
28
  @comparators = comparators
9
29
  @score_set = score_set
@@ -11,20 +31,46 @@ module Linkage
11
31
  @threshold = threshold
12
32
  end
13
33
 
34
+ # Find matches.
14
35
  def run
15
36
  send(@algorithm)
16
37
  end
17
38
 
18
- private
19
-
39
+ # Combine scores for each pair of records via mean, then compare the
40
+ # combined score to the threshold. Notify observers if there's a match.
20
41
  def mean
42
+ w = @comparators.collect { |comparator| comparator.weight || 1 }
43
+ @score_set.open_for_reading
21
44
  @score_set.each_pair do |id_1, id_2, scores|
22
- mean = scores.values.inject(:+) / @comparators.length.to_f
45
+ sum = 0
46
+ scores.each do |key, value|
47
+ sum += value * w[key-1]
48
+ end
49
+ mean = sum / @comparators.length.to_f
23
50
  if mean >= @threshold
24
51
  changed
25
52
  notify_observers(id_1, id_2, mean)
26
53
  end
27
54
  end
55
+ @score_set.close
56
+ end
57
+
58
+ # Combine scores for each pair of records via sum, then compare the
59
+ # combined score to the threshold. Notify observers if there's a match.
60
+ def sum
61
+ w = @comparators.collect { |comparator| comparator.weight || 1 }
62
+ @score_set.open_for_reading
63
+ @score_set.each_pair do |id_1, id_2, scores|
64
+ sum = 0
65
+ scores.each do |key, value|
66
+ sum += value * w[key-1]
67
+ end
68
+ if sum >= @threshold
69
+ changed
70
+ notify_observers(id_1, id_2, sum)
71
+ end
72
+ end
73
+ @score_set.close
28
74
  end
29
75
  end
30
76
  end
@@ -1,37 +1,75 @@
1
1
  module Linkage
2
+ # A {ResultSet} is a convenience class for wrapping a {ScoreSet} and a
3
+ # {MatchSet}. Most of the time, you'll want to use the same storage format for
4
+ # both scores and matches. {ResultSet} provides a way to group both sets
5
+ # together.
6
+ #
7
+ # The default implementation of {ResultSet} merely returns whatever {ScoreSet}
8
+ # and {MatchSet} you pass to it during creation (see {#initialize}). However,
9
+ # {ResultSet} can be subclassed to provide easy initialization of sets of the
10
+ # same format. Currently there are two subclasses:
11
+ #
12
+ # * CSV ({ResultSets::CSV})
13
+ # * Database ({ResultSets::Database})
14
+ #
15
+ # If you want to implement a custom {ResultSet}, create a class that inherits
16
+ # {ResultSet} and defines both {#score_set} and {#match_set} to return a
17
+ # {ScoreSet} and {MatchSet} respectively. You can then register that class via
18
+ # {.register} to make it easier to use.
2
19
  class ResultSet
3
- # Register a result set.
4
- #
5
- # @param [Class] klass
6
- def self.register(name, klass)
7
- methods = klass.instance_methods(false)
8
- missing = []
9
- unless methods.include?(:score_set)
10
- missing.push("#score_set")
11
- end
12
- unless methods.include?(:match_set)
13
- missing.push("#match_set")
14
- end
15
- unless missing.empty?
16
- raise ArgumentError, "class must define #{missing.join(" and ")}"
20
+ class << self
21
+ # Register a new result set. Subclasses must define {#score_set} and
22
+ # {#match_set}. Otherwise, an `ArgumentError` will be raised when you try
23
+ # to call {.register}.
24
+ #
25
+ # @param [String] name Result set name used in {.klass_for}
26
+ # @param [Class] klass ResultSet subclass
27
+ def register(name, klass)
28
+ methods = klass.instance_methods
29
+ missing = []
30
+ unless methods.include?(:score_set)
31
+ missing.push("#score_set")
32
+ end
33
+ unless methods.include?(:match_set)
34
+ missing.push("#match_set")
35
+ end
36
+ unless missing.empty?
37
+ raise ArgumentError, "class must define #{missing.join(" and ")}"
38
+ end
39
+
40
+ @result_set ||= {}
41
+ @result_set[name] = klass
17
42
  end
18
43
 
19
- @result_set ||= {}
20
- @result_set[name] = klass
44
+ # Return a registered ResultSet subclass or `nil` if it doesn't exist.
45
+ #
46
+ # @param [String] name of registered result set
47
+ # @return [Class, nil]
48
+ def klass_for(name)
49
+ @result_set ? @result_set[name] : nil
50
+ end
51
+ alias :[] :klass_for
21
52
  end
22
53
 
23
- def self.[](name)
24
- @result_set ? @result_set[name] : nil
54
+ # @param [ScoreSet] score_set
55
+ # @param [MatchSet] match_set
56
+ def initialize(score_set, match_set)
57
+ @score_set = score_set
58
+ @match_set = match_set
25
59
  end
26
60
 
27
- # @abstract
61
+ # Returns a {ScoreSet}.
62
+ #
63
+ # @return [ScoreSet]
28
64
  def score_set
29
- raise NotImplementedError
65
+ @score_set
30
66
  end
31
67
 
32
- # @abstract
68
+ # Returns a {MatchSet}.
69
+ #
70
+ # @return [MatchSet]
33
71
  def match_set
34
- raise NotImplementedError
72
+ @match_set
35
73
  end
36
74
  end
37
75
  end
@@ -1,8 +1,51 @@
1
1
  module Linkage
2
2
  module ResultSets
3
+ # {CSV ResultSets::CSV} is a subclass of {ResultSet ResultSet} that makes it
4
+ # convenient to set up a {ScoreSets::CSV} and {MatchSets::CSV} at the same
5
+ # time. For example:
6
+ #
7
+ # ```ruby
8
+ # result_set = Linkage::ResultSets::CSV.new('/some/path')
9
+ # ```
10
+ #
11
+ # Or by using {ResultSet.[] ResultSet.[]}:
12
+ #
13
+ # ```ruby
14
+ # result_set = Linkage::ResultSet['csv'].new('/some/path')
15
+ # ```
16
+ #
17
+ # {#initialize ResultSets::CSV.new} takes either a directory name as its
18
+ # argument or a Hash of options. Passing in a directory name is equivalent
19
+ # to passing in a Hash with the `:dir` key. For example:
20
+ #
21
+ # ```ruby
22
+ # result_set = Linkage::ResultSet['csv'].new('/some/path')
23
+ # ```
24
+ #
25
+ # is the same as:
26
+ #
27
+ # ```ruby
28
+ # result_set = Linkage::ResultSet['csv'].new({:dir => '/some/path'})
29
+ # ```
30
+ #
31
+ # The `:dir` option lets you specify the parent directory for the score set
32
+ # and match set files (which are `scores.csv` and `results.csv` by default).
33
+ #
34
+ # The only other relevant option is `:overwrite`, which controls whether or
35
+ # not overwriting existing files is permitted.
36
+ #
37
+ # @see ScoreSets::CSV
38
+ # @see MatchSets::CSV
3
39
  class CSV < ResultSet
40
+ # @overload initialize(dir)
41
+ # @param [String] dir parent directory of CSV files
42
+ # @overload initialize(options)
43
+ # @param [Hash] options
44
+ # @option options [String] :dir parent directory of CSV files
45
+ # @option options [Boolean] :overwrite (false) whether or not to allow
46
+ # overwriting existing files
4
47
  def initialize(dir_or_options = nil)
5
- opts =
48
+ @options =
6
49
  case dir_or_options
7
50
  when nil
8
51
  {}
@@ -13,39 +56,14 @@ module Linkage
13
56
  else
14
57
  raise ArgumentError, "expected nil, a String, or a Hash, got #{dir_or_options.class}"
15
58
  end
16
-
17
- if opts[:dir]
18
- opts[:dir] = File.expand_path(opts[:dir])
19
- FileUtils.mkdir_p(opts[:dir])
20
- end
21
-
22
- @score_set_args = extract_args_for(:scores, opts)
23
- @match_set_args = extract_args_for(:matches, opts)
24
59
  end
25
60
 
26
61
  def score_set
27
- @score_set ||= ScoreSet['csv'].new(*@score_set_args)
62
+ @score_set ||= ScoreSet['csv'].new(@options)
28
63
  end
29
64
 
30
65
  def match_set
31
- @match_set ||= MatchSet['csv'].new(*@match_set_args)
32
- end
33
-
34
- private
35
-
36
- def extract_args_for(name, opts)
37
- dir = opts[:dir] || '.'
38
- opts = opts[name]
39
-
40
- filename =
41
- case opts
42
- when Hash, nil
43
- opts = opts ? opts.dup : {}
44
- opts.delete(:filename) || "#{name}.csv"
45
- when String
46
- opts
47
- end
48
- [File.join(dir, filename), opts]
66
+ @match_set ||= MatchSet['csv'].new(@options)
49
67
  end
50
68
  end
51
69
 
@@ -1,39 +1,57 @@
1
1
  module Linkage
2
2
  module ResultSets
3
+ # {Database ResultSets::Database} is the {ResultSet ResultSet} for writing
4
+ # to database tables. You can use it by either referencing it directly like
5
+ # so:
6
+ #
7
+ # ```ruby
8
+ # result_set = Linkage::ResultSets::Database.new(connection_options, options)
9
+ # ```
10
+ #
11
+ # Or by using {ResultSet.[] ResultSet.[]}:
12
+ #
13
+ # ```ruby
14
+ # result_set = Linkage::ResultSet['database'].new(connection_options, options)
15
+ # ```
16
+ #
17
+ # You can set up a database connection in a few different ways. By default, a
18
+ # SQLite database with the filename of `results.db` will be created in the
19
+ # current working directory. If you want something different, you can either
20
+ # specify a Sequel-style URI, provide connection options for
21
+ # `Sequel.connect`, or you can just specify a
22
+ # {http://sequel.jeremyevans.net/rdoc/classes/Sequel/Database.html Sequel::Database}
23
+ # object to use.
24
+ #
25
+ # There are a couple of non-Sequel connection options:
26
+ # * `:filename` - specify filename to use for a SQLite database
27
+ # * `:dir` - specify the parent directory for a SQLite database
28
+ #
29
+ # This result set creates a {ScoreSets::Database database-backed score set}
30
+ # and a {MatchSets::Database database-backed match set} with their default
31
+ # table names (`scores` and `matches` respectively). If either table already
32
+ # exists, an {ExistsError} will be raised unless you set the `:overwrite`
33
+ # option to a truthy value in the second options hash.
34
+ #
35
+ # @see ScoreSets::Database
36
+ # @see MatchSets::Database
3
37
  class Database < ResultSet
4
- def initialize(database_or_options = nil)
5
- @database = nil
6
- @options = {}
38
+ include Helpers::Database
7
39
 
8
- if database_or_options.kind_of?(Sequel::Database)
9
- @database = database_or_options
10
- else
11
- database_opts = nil
12
- case database_or_options
13
- when String
14
- database_opts = database_or_options
15
- when Hash
16
- database_opts = {}
17
- database_or_options.each_pair do |key, value|
18
- if key == :scores || key == :matches
19
- @options[key] = value
20
- else
21
- database_opts[key] = value
22
- end
23
- end
24
- else
25
- raise ArgumentError, "expected Sequel::Database, a String, or a Hash, got #{database_or_options.class}"
26
- end
27
- @database = Sequel.connect(database_opts)
28
- end
40
+ DEFAULT_OPTIONS = {
41
+ :filename => 'results.db'
42
+ }
43
+
44
+ def initialize(connection_options = {}, options = {})
45
+ @database = database_connection(connection_options, DEFAULT_OPTIONS)
46
+ @options = options
29
47
  end
30
48
 
31
49
  def score_set
32
- @score_set ||= ScoreSet['database'].new(@database, @options[:scores] || {})
50
+ @score_set ||= ScoreSet['database'].new(@database, @options)
33
51
  end
34
52
 
35
53
  def match_set
36
- @match_set ||= MatchSet['database'].new(@database, @options[:matches] || {})
54
+ @match_set ||= MatchSet['database'].new(@database, @options)
37
55
  end
38
56
  end
39
57
 
@@ -1,5 +1,15 @@
1
1
  module Linkage
2
2
  # Use this class to run a configuration created by {Dataset#link_with}.
3
+ #
4
+ # During a record linkage, one or more {Comparator}s generate scores. Each
5
+ # score is recorded by a {ScoreRecorder}, which uses a {ScoreSet} to actually
6
+ # save the score. After the scoring is complete, a {Matcher} combines the
7
+ # scores to create matches. Each match is recorded by a {MatchRecorder}, which
8
+ # uses a {MatchSet} to actually save the match information.
9
+ #
10
+ # So to save scores and matches, we need both a {ScoreSet} and a {MatchSet}.
11
+ # To make this easier, a {ResultSet} can be used to configure both {ScoreSet}s
12
+ # and {MatchSet}s.
3
13
  class Runner
4
14
  attr_reader :config
5
15
 
@@ -1,5 +1,10 @@
1
1
  module Linkage
2
+ # {ScoreRecorder} is responsible for observing a set of {Comparator} for
3
+ # changes and saving scores to a {ScoreSet} via {ScoreSet#add_score}.
2
4
  class ScoreRecorder
5
+ # @param comparators [Array<Comparator>]
6
+ # @param score_set [ScoreSet]
7
+ # @param primary_keys [Array<Symbol>]
3
8
  def initialize(comparators, score_set, primary_keys)
4
9
  @comparators = comparators
5
10
  @score_set = score_set
@@ -1,45 +1,103 @@
1
1
  module Linkage
2
+ # A {ScoreSet} is responsible for keeping track of scores. During the record
3
+ # linkage process, one or more {Comparator}s generate scores. These scores are
4
+ # handled by a {ScoreRecorder}, which uses a {ScoreSet} to actually save the
5
+ # scores. {ScoreSet} is also used to fetch the linkage scores so that a
6
+ # {Matcher} can create matches.
7
+ #
8
+ # {ScoreSet} is the superclass of implementations for different formats.
9
+ # Currently there are two formats for storing scores:
10
+ #
11
+ # * CSV ({ScoreSets::CSV})
12
+ # * Database ({ScoreSets::Database})
13
+ #
14
+ # See the documentation for the score set you're interested in for more
15
+ # information.
16
+ #
17
+ # If you want to implement a custom {ScoreSet}, create a class that inherits
18
+ # {ScoreSet} and defines at least {#add_score} and {#each_pair}. You can then
19
+ # register that class via {.register}.
20
+ #
21
+ # @abstract
2
22
  class ScoreSet
3
- # Register a score set.
4
- #
5
- # @param [Class] klass
6
- def self.register(name, klass)
7
- methods = klass.instance_methods(false)
8
- missing = []
9
- unless methods.include?(:add_score)
10
- missing.push("#add_score")
11
- end
12
- unless methods.include?(:each_pair)
13
- missing.push("#each_pair")
14
- end
15
- unless missing.empty?
16
- raise ArgumentError, "class must define #{missing.join(" and ")}"
17
- end
23
+ class << self
24
+ # Register a new score set. Subclasses must define at least {#add_score}
25
+ # and {#each_pair}. Otherwise, an `ArgumentError` will be raised when you
26
+ # try to call {.register}.
27
+ #
28
+ # @param [String] name Score set name used in {.klass_for}
29
+ # @param [Class] klass ScoreSet subclass
30
+ def register(name, klass)
31
+ methods = klass.instance_methods(false)
32
+ missing = []
33
+ unless methods.include?(:add_score)
34
+ missing.push("#add_score")
35
+ end
36
+ unless methods.include?(:each_pair)
37
+ missing.push("#each_pair")
38
+ end
39
+ unless missing.empty?
40
+ raise ArgumentError, "class must define #{missing.join(" and ")}"
41
+ end
18
42
 
19
- @score_sets ||= {}
20
- @score_sets[name] = klass
21
- end
43
+ @score_sets ||= {}
44
+ @score_sets[name] = klass
45
+ end
22
46
 
23
- def self.[](name)
24
- @score_sets ? @score_sets[name] : nil
47
+ # Return a registered ScoreSet subclass or `nil` if it doesn't exist.
48
+ #
49
+ # @param [String] name of registered score set
50
+ # @return [Class, nil]
51
+ def klass_for(name)
52
+ @score_sets ? @score_sets[name] : nil
53
+ end
54
+ alias :[] :klass_for
25
55
  end
26
56
 
57
+ # This is called by {Matcher#run}, before any scores are read via
58
+ # {#each_pair}. Subclasses can redefine this to perform any setup needed
59
+ # for reading scores.
27
60
  def open_for_reading
28
61
  end
29
62
 
63
+ # This is called by {ScoreRecorder#start}, before any scores are added via
64
+ # {#add_score}. Subclasses can redefine this to perform any setup needed
65
+ # for saving scores.
30
66
  def open_for_writing
31
67
  end
32
68
 
69
+ # Add a score to the ScoreSet. Subclasses must redefine this.
70
+ #
71
+ # @param comparator_id [Fixnum] 1-indexed comparator index
72
+ # @param id_1 [Object] record id from first dataset
73
+ # @param id_2 [Object] record id from second dataset
74
+ # @param value [Fixnum, Float] score value
33
75
  # @abstract
34
76
  def add_score(comparator_id, id_1, id_2, value)
35
77
  raise NotImplementedError
36
78
  end
37
79
 
80
+ # Yield scores for each pair of records. Subclasses must redefine this.
81
+ # This method is called by {Matcher#run} with a block with three
82
+ # parameters:
83
+ #
84
+ # ```ruby
85
+ # score_set.each_pair do |id_1, id_2, scores|
86
+ # end
87
+ # ```
88
+ #
89
+ # `scores` should be a Hash where comparator ids are keys and scores are
90
+ # values. For example: `{ 1 => 0.5, 2 => 0.75, 3 => 1 }`. Note that not all
91
+ # comparators (including {Comparators::Compare}) create scores for each
92
+ # pair. A missing score means that pair was given a score of 0.
93
+ #
38
94
  # @abstract
39
95
  def each_pair(&block)
40
96
  raise NotImplementedError
41
97
  end
42
98
 
99
+ # This is called by {ScoreRecorder#stop}, after all scores have been added.
100
+ # Subclasses can redefine this to perform any teardown needed.
43
101
  def close
44
102
  end
45
103
  end