linkage 0.0.6 → 0.0.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66) hide show
  1. data/.gitignore +10 -0
  2. data/Gemfile +15 -13
  3. data/Gemfile.lock +67 -37
  4. data/Guardfile +0 -2
  5. data/Rakefile +122 -25
  6. data/lib/linkage/comparator.rb +172 -0
  7. data/lib/linkage/comparators/binary.rb +12 -0
  8. data/lib/linkage/comparators/compare.rb +46 -0
  9. data/lib/linkage/comparators/within.rb +32 -0
  10. data/lib/linkage/configuration.rb +285 -153
  11. data/lib/linkage/data.rb +32 -7
  12. data/lib/linkage/dataset.rb +107 -32
  13. data/lib/linkage/decollation.rb +93 -0
  14. data/lib/linkage/expectation.rb +21 -0
  15. data/lib/linkage/expectations/exhaustive.rb +63 -0
  16. data/lib/linkage/expectations/simple.rb +168 -0
  17. data/lib/linkage/field.rb +30 -4
  18. data/lib/linkage/field_set.rb +6 -3
  19. data/lib/linkage/function.rb +50 -3
  20. data/lib/linkage/functions/binary.rb +30 -0
  21. data/lib/linkage/functions/cast.rb +54 -0
  22. data/lib/linkage/functions/length.rb +29 -0
  23. data/lib/linkage/functions/strftime.rb +12 -11
  24. data/lib/linkage/functions/trim.rb +8 -0
  25. data/lib/linkage/group.rb +20 -0
  26. data/lib/linkage/import_buffer.rb +5 -16
  27. data/lib/linkage/meta_object.rb +139 -0
  28. data/lib/linkage/result_set.rb +74 -17
  29. data/lib/linkage/runner/single_threaded.rb +125 -10
  30. data/lib/linkage/version.rb +3 -0
  31. data/lib/linkage.rb +11 -0
  32. data/linkage.gemspec +16 -121
  33. data/test/config.yml +5 -0
  34. data/test/helper.rb +73 -8
  35. data/test/integration/test_collation.rb +45 -0
  36. data/test/integration/test_configuration.rb +268 -0
  37. data/test/integration/test_cross_linkage.rb +4 -17
  38. data/test/integration/test_dataset.rb +45 -2
  39. data/test/integration/test_dual_linkage.rb +40 -24
  40. data/test/integration/test_functions.rb +22 -0
  41. data/test/integration/test_result_set.rb +85 -0
  42. data/test/integration/test_scoring.rb +84 -0
  43. data/test/integration/test_self_linkage.rb +5 -0
  44. data/test/integration/test_within_comparator.rb +100 -0
  45. data/test/unit/comparators/test_compare.rb +105 -0
  46. data/test/unit/comparators/test_within.rb +57 -0
  47. data/test/unit/expectations/test_exhaustive.rb +111 -0
  48. data/test/unit/expectations/test_simple.rb +303 -0
  49. data/test/unit/functions/test_binary.rb +54 -0
  50. data/test/unit/functions/test_cast.rb +98 -0
  51. data/test/unit/functions/test_length.rb +52 -0
  52. data/test/unit/functions/test_strftime.rb +17 -13
  53. data/test/unit/functions/test_trim.rb +11 -4
  54. data/test/unit/test_comparator.rb +124 -0
  55. data/test/unit/test_configuration.rb +137 -175
  56. data/test/unit/test_data.rb +44 -0
  57. data/test/unit/test_dataset.rb +73 -21
  58. data/test/unit/test_decollation.rb +201 -0
  59. data/test/unit/test_field.rb +38 -14
  60. data/test/unit/test_field_set.rb +12 -8
  61. data/test/unit/test_function.rb +83 -16
  62. data/test/unit/test_group.rb +28 -0
  63. data/test/unit/test_import_buffer.rb +13 -27
  64. data/test/unit/test_meta_object.rb +208 -0
  65. data/test/unit/test_result_set.rb +221 -3
  66. metadata +82 -190
@@ -38,10 +38,23 @@ module Linkage
38
38
  nil
39
39
  end
40
40
 
41
- # @param [Linkage::Field, Object] args Function arguments
41
+ attr_reader :args
42
+
43
+ # Creates a new Function object. If the arguments contain only
44
+ # static objects, you should specify the dataset that this function
45
+ # belongs to as the last argument like so:
46
+ #
47
+ # Function.new(foo, bar, :dataset => dataset)
48
+ #
49
+ # Alternatively, you can use the `dataset=` setter to set it later. Many
50
+ # functions require a dataset to work properly. If you try to use
51
+ # such a function without setting a dataset, it will raise a RuntimeError.
52
+ #
53
+ # @param [Linkage::Data, Object] args Function arguments
42
54
  def initialize(*args)
43
55
  @names = [self.class.function_name]
44
56
  @args = args
57
+ @options = args.last.is_a?(Hash) ? args.pop : {}
45
58
  process_args
46
59
  end
47
60
 
@@ -49,10 +62,25 @@ module Linkage
49
62
  @name ||= @names.join("_").to_sym
50
63
  end
51
64
 
65
+ def dataset
66
+ if @dataset.nil?
67
+ raise RuntimeError, "You must specify a dataset for static functions"
68
+ end
69
+ @dataset
70
+ end
71
+
72
+ def dataset=(dataset)
73
+ @dataset = dataset
74
+ end
75
+
52
76
  def static?
53
77
  @static
54
78
  end
55
79
 
80
+ def ==(other)
81
+ equal?(other) || (other.is_a?(Function) && name == other.name && args == other.args && dataset == other.dataset)
82
+ end
83
+
56
84
  # Subclasses must define this. The return value should be a Hash with
57
85
  # the following elements:
58
86
  # :type - column type (Ruby class) of the result
@@ -61,8 +89,14 @@ module Linkage
61
89
  raise NotImplementedError
62
90
  end
63
91
 
92
+ # Returns `nil` by default. Subclasses should redefine this if
93
+ # there is a collation.
94
+ def collation
95
+ nil
96
+ end
97
+
64
98
  # @return [Sequel::SQL::Function]
65
- def to_expr(adapter = nil)
99
+ def to_expr(options = {})
66
100
  self.class.function_name.to_sym.sql_function(*@values)
67
101
  end
68
102
 
@@ -80,6 +114,14 @@ module Linkage
80
114
  if arg.kind_of?(Data)
81
115
  @names << arg.name
82
116
  @static &&= arg.static?
117
+
118
+ # possibly set dataset
119
+ if @dataset.nil?
120
+ @dataset = arg.dataset
121
+ elsif @dataset != arg.dataset
122
+ raise ArgumentError, "Using dynamic data sources with different datasets is not permitted"
123
+ end
124
+
83
125
  type = arg.ruby_type[:type]
84
126
  value = arg.to_expr
85
127
  else
@@ -87,11 +129,16 @@ module Linkage
87
129
  type = arg.class
88
130
  value = arg
89
131
  end
90
- if parameters && !parameters[i].include?(type)
132
+ if parameters && parameters[i] != [:any] && !parameters[i].include?(type)
91
133
  raise TypeError, "expected type #{parameters[i].join(" or ")}, got #{type}"
92
134
  end
93
135
  @values << value
94
136
  end
137
+
138
+ if @dataset.nil? && @options[:dataset]
139
+ # Set dataset for static functions manually
140
+ @dataset = @options[:dataset]
141
+ end
95
142
  end
96
143
  end
97
144
  end
@@ -0,0 +1,30 @@
1
+ module Linkage
2
+ module Functions
3
+ class Binary < Function
4
+ def self.function_name
5
+ "binary"
6
+ end
7
+
8
+ def self.parameters
9
+ [[String]]
10
+ end
11
+
12
+ def ruby_type
13
+ {:type => File}
14
+ end
15
+
16
+ def to_expr(options = {})
17
+ expr =
18
+ case dataset.database_type
19
+ when :sqlite
20
+ @values[0].cast(:blob)
21
+ when :postgres
22
+ @values[0].cast(:bytea)
23
+ else
24
+ @values[0].cast(:binary)
25
+ end
26
+ end
27
+ end
28
+ Function.register(Binary)
29
+ end
30
+ end
@@ -0,0 +1,54 @@
1
+ module Linkage
2
+ module Functions
3
+ class Cast < Function
4
+ def self.function_name
5
+ "cast"
6
+ end
7
+
8
+ def self.parameters
9
+ [[:any], [String]]
10
+ end
11
+
12
+ def ruby_type
13
+ type =
14
+ case @values[1]
15
+ when 'integer'
16
+ Fixnum
17
+ when 'binary'
18
+ File
19
+ else
20
+ raise "unknown type: #{@values[1]}"
21
+ end
22
+
23
+ {:type => type}
24
+ end
25
+
26
+ def to_expr(options = {})
27
+ cast =
28
+ case @values[1]
29
+ when 'integer'
30
+ case dataset.database_type
31
+ when :sqlite, :postgres, :h2
32
+ :integer
33
+ when :mysql
34
+ :signed
35
+ end
36
+ when 'binary'
37
+ case dataset.database_type
38
+ when :sqlite
39
+ :blob
40
+ when :postgres
41
+ :bytea
42
+ when :mysql, :h2
43
+ :binary
44
+ end
45
+ end
46
+
47
+ if cast
48
+ @values[0].cast(cast)
49
+ end
50
+ end
51
+ end
52
+ Function.register(Cast)
53
+ end
54
+ end
@@ -0,0 +1,29 @@
1
+ module Linkage
2
+ module Functions
3
+ # Returns the number of characters in a string.
4
+ class Length < Function
5
+ def self.function_name
6
+ "length"
7
+ end
8
+
9
+ def self.parameters
10
+ [[String]]
11
+ end
12
+
13
+ def ruby_type
14
+ {:type => Fixnum}
15
+ end
16
+
17
+ def to_expr(options = {})
18
+ expr =
19
+ case dataset.database_type
20
+ when :mysql, :postgres
21
+ :char_length.sql_function(@values[0])
22
+ else
23
+ :length.sql_function(@values[0])
24
+ end
25
+ end
26
+ end
27
+ Function.register(Length)
28
+ end
29
+ end
@@ -14,17 +14,18 @@ module Linkage
14
14
  {:type => String}
15
15
  end
16
16
 
17
- def to_expr(adapter = nil)
18
- case adapter
19
- when :mysql, :mysql2
20
- :date_format.sql_function(*@values)
21
- when :sqlite
22
- :strftime.sql_function(@values[1], @values[0])
23
- when :postgres
24
- :to_char.sql_function(*@values)
25
- else
26
- :strftime.sql_function(@values[0], @values[1])
27
- end
17
+ def to_expr(options = {})
18
+ expr =
19
+ case dataset.database_type
20
+ when :mysql
21
+ :date_format.sql_function(*@values)
22
+ when :sqlite
23
+ :strftime.sql_function(@values[1], @values[0])
24
+ when :postgres
25
+ :to_char.sql_function(*@values)
26
+ else
27
+ :strftime.sql_function(@values[0], @values[1])
28
+ end
28
29
  end
29
30
  end
30
31
  Function.register(Strftime)
@@ -16,6 +16,14 @@ module Linkage
16
16
  {:type => String}
17
17
  end
18
18
  end
19
+
20
+ def collation
21
+ if @args[0].kind_of?(Data)
22
+ @args[0].collation
23
+ else
24
+ super
25
+ end
26
+ end
19
27
  end
20
28
  Function.register(Trim)
21
29
  end
data/lib/linkage/group.rb CHANGED
@@ -1,5 +1,7 @@
1
1
  module Linkage
2
2
  class Group
3
+ include Linkage::Decollation
4
+
3
5
  # @return [Hash] Hash of matching values
4
6
  attr_reader :values
5
7
 
@@ -24,12 +26,30 @@ module Linkage
24
26
 
25
27
  # @param [Hash] values Values that define this group
26
28
  # @param [Hash] options
29
+ # @option options [Fixnum] :id The group ID
30
+ # @option options [Fixnum] :count How many records are in the group
31
+ # @option options [Hash] :ruby_types Hash of ruby types for each value
32
+ # @option options [Symbol] :database_type
27
33
  # @example
28
34
  # Linkage::Group.new({:foo => 123, :bar => 'baz'}, {:count => 5, :id => 456})
29
35
  def initialize(values, options)
30
36
  @count = options[:count]
31
37
  @id = options[:id]
38
+ @ruby_types = options[:ruby_types]
39
+ @database_type = options[:database_type]
32
40
  @values = values
33
41
  end
42
+
43
+ def decollated_values
44
+ @values.inject({}) do |hsh, (key, value)|
45
+ ruby_type = @ruby_types[key]
46
+ if ruby_type && ruby_type.has_key?(:opts) && ruby_type[:opts].has_key?(:collate)
47
+ hsh[key] = decollate(value, @database_type, ruby_type[:opts][:collate])
48
+ else
49
+ hsh[key] = value
50
+ end
51
+ hsh
52
+ end
53
+ end
34
54
  end
35
55
  end
@@ -1,15 +1,11 @@
1
1
  module Linkage
2
2
  class ImportBuffer
3
- # @param [String] uri Sequel-style URI
4
- # @param [Symbol, String] table_name
3
+ # @param [Sequel::Dataset] dataset
5
4
  # @param [Array<Symbol>] headers List of fields you want to insert
6
- # @param [Hash] options Sequel.connect options
7
5
  # @param [Fixnum] limit Number of records to insert at a time
8
- def initialize(uri, table_name, headers, options = {}, limit = 1000)
9
- @uri = uri
10
- @table_name = table_name.to_sym
6
+ def initialize(dataset, headers, limit = 1000)
7
+ @dataset = dataset
11
8
  @headers = headers
12
- @options = options
13
9
  @limit = limit
14
10
  @values = []
15
11
  end
@@ -23,17 +19,10 @@ module Linkage
23
19
 
24
20
  def flush
25
21
  return if @values.empty?
26
- database do |db|
27
- ds = db[@table_name]
28
- ds.import(@headers, @values)
22
+ @dataset.db.synchronize do
23
+ @dataset.import(@headers, @values)
29
24
  @values.clear
30
25
  end
31
26
  end
32
-
33
- private
34
-
35
- def database(&block)
36
- Sequel.connect(@uri, @options, &block)
37
- end
38
27
  end
39
28
  end
@@ -0,0 +1,139 @@
1
+ module Linkage
2
+ class MetaObject
3
+ attr_reader :object
4
+ attr_writer :side
5
+
6
+ # Creates a new MetaObject.
7
+ #
8
+ # @param [Object] object This can be a {Field}, {Function} or a regular
9
+ # Ruby object (Fixnum, String, etc). If `object` is not static (a {Field}
10
+ # or a {Function} that contains one or more {Field} objects), you should
11
+ # specify which "side" of the linkage the object belongs to (left-hand
12
+ # side or right-hand side) in the `side` argument.
13
+ # @param [Symbol] side `:lhs` for left-hand side or `:rhs` for right-hand
14
+ # side
15
+ def initialize(object, side = nil)
16
+ @object = object
17
+ @static = object.kind_of?(Linkage::Data) ? object.static? : true
18
+ if !side.nil? && side != :lhs && side != :rhs
19
+ raise ArgumentError, "invalid `side` argument, must be :lhs or :rhs"
20
+ end
21
+ @side = side
22
+ end
23
+
24
+ def side
25
+ if !@static && @side.nil?
26
+ raise RuntimeError, "Object is dynamic and side is not set"
27
+ end
28
+ @side
29
+ end
30
+
31
+ def dataset
32
+ @object.kind_of?(Linkage::Data) ? @object.dataset : nil
33
+ end
34
+
35
+ def dataset=(dataset)
36
+ if @object.kind_of?(Linkage::Data)
37
+ @object.dataset = dataset
38
+ else
39
+ raise RuntimeError, "You can't set the dataset of a non-data object."
40
+ end
41
+ end
42
+
43
+ def database_type
44
+ ds = dataset
45
+ ds ? ds.database_type : nil
46
+ end
47
+
48
+ def static?
49
+ @static
50
+ end
51
+
52
+ # Returns true if the argument has the same object as the instance.
53
+ #
54
+ # @param [Linkage::MetaObject] other
55
+ # @return [Boolean]
56
+ def objects_equal?(other)
57
+ other.is_a?(Linkage::MetaObject) && other.object == self.object
58
+ end
59
+
60
+ # Returns true if the argument has the same dataset as the instance.
61
+ #
62
+ # @param [Linkage::MetaObject] other
63
+ # @return [Boolean]
64
+ def datasets_equal?(other)
65
+ other.is_a?(Linkage::MetaObject) && other.dataset == self.dataset
66
+ end
67
+
68
+ # Returns an expression suitable for use in Sequel queries.
69
+ # @return [Object]
70
+ def to_expr
71
+ if @object.kind_of?(Linkage::Data)
72
+ @object.to_expr
73
+ else
74
+ @object
75
+ end
76
+ end
77
+
78
+ # Returns a Sequel identifier for {Data} objects, or the object itself.
79
+ # @return [Sequel::SQL::Identifier, Object]
80
+ def to_identifier
81
+ if @object.kind_of?(Linkage::Data)
82
+ Sequel::SQL::Identifier.new(@object.to_expr)
83
+ else
84
+ @object
85
+ end
86
+ end
87
+
88
+ # Returns the name of the object for {Data} objects, or nil for others.
89
+ # @return [Symbol, nil]
90
+ def name
91
+ if @object.kind_of?(Linkage::Data)
92
+ @object.name
93
+ else
94
+ nil
95
+ end
96
+ end
97
+
98
+ # Returns a {MergeField} if both objects are {Data} objects; otherwise
99
+ # raises an ArgumentError.
100
+ #
101
+ # @return [Linkage::MergeField]
102
+ def merge(other)
103
+ if @object.kind_of?(Linkage::Data) && other.object.kind_of?(Linkage::Data)
104
+ @object.merge(other.object)
105
+ else
106
+ raise ArgumentError, "Cannot merge a non-data object"
107
+ end
108
+ end
109
+
110
+ # Returns the Ruby type of the underlying object.
111
+ #
112
+ # @return [Hash]
113
+ # @see Linkage::Field#ruby_type
114
+ # @see Linkage::Function#ruby_type
115
+ def ruby_type
116
+ if @object.kind_of?(Linkage::Data)
117
+ @object.ruby_type
118
+ else
119
+ {:type => @object.class}
120
+ end
121
+ end
122
+
123
+ # Returns the collation of the underlying object.
124
+ #
125
+ # @return [Symbol]
126
+ def collation
127
+ if @object.kind_of?(Linkage::Data)
128
+ @object.collation
129
+ else
130
+ nil
131
+ end
132
+ end
133
+
134
+ # Returns true if the underlying object is not an instance of {Linkage::Data} (or of one of its subclasses).
135
+ def raw?
136
+ !@object.kind_of?(Linkage::Data)
137
+ end
138
+ end
139
+ end
@@ -7,41 +7,98 @@ module Linkage
7
7
  end
8
8
 
9
9
  def groups_dataset
10
- @groups_dataset ||= Dataset.new(@config.results_uri, :groups, @config.results_uri_options)
10
+ @groups_dataset ||= Dataset.new(database[:groups])
11
11
  end
12
12
 
13
- def database(&block)
14
- Sequel.connect(@config.results_uri, @config.results_uri_options, &block)
13
+ def database
14
+ # FIXME: If the results database is the same as one of the datasets
15
+ # being linked, there will be two connections to said database. This
16
+ # could result in unexpected locking for non-concurrent databases (like
17
+ # SQLite).
18
+ @database ||= Sequel.connect(@config.results_uri, @config.results_uri_options)
15
19
  end
16
20
 
17
21
  def create_tables!
18
- database do |db|
22
+ if @config.groups_table_needed?
19
23
  schema = @config.groups_table_schema
20
- db.create_table(:groups) do
24
+ if @config.decollation_needed?
25
+ database.create_table(@config.original_groups_table_name) do
26
+ schema.each { |col| column(*col) }
27
+ end
28
+ end
29
+
30
+ database.create_table(@config.groups_table_name) do
21
31
  schema.each { |col| column(*col) }
22
32
  end
33
+ end
23
34
 
24
- pk_type = @config.dataset_1.field_set.primary_key.merge(@config.dataset_2.field_set.primary_key).ruby_type
25
- db.create_table(:groups_records) do
26
- column(:record_id, pk_type[:type], pk_type[:opts] || {})
27
- Integer :group_id
28
- Integer :dataset
29
- index :group_id
35
+ if @config.scores_table_needed?
36
+ schema = @config.scores_table_schema
37
+ database.create_table(@config.scores_table_name) do
38
+ schema.each { |col| column(*col) }
30
39
  end
31
40
  end
41
+
42
+ schema = @config.matches_table_schema
43
+ database.create_table(@config.matches_table_name) do
44
+ schema.each { |col| column(*col) }
45
+ end
32
46
  end
33
47
 
34
48
  def add_group(group, dataset_id = nil)
35
- if !@groups_buffer
36
- groups_headers = [:id] + group.values.keys
37
- @groups_buffer = ImportBuffer.new(@config.results_uri, :groups, groups_headers, @config.results_uri_options)
49
+ if @config.decollation_needed?
50
+ original_values = group.values
51
+ values = group.decollated_values
52
+ if !@groups_buffer
53
+ groups_headers = [:id] + values.keys
54
+ @groups_buffer = ImportBuffer.new(database[@config.groups_table_name],
55
+ groups_headers)
56
+
57
+ original_groups_headers = [:id] + original_values.keys
58
+ @original_groups_buffer = ImportBuffer.new(
59
+ database[@config.original_groups_table_name],
60
+ original_groups_headers)
61
+ end
62
+
63
+ group_id = next_group_id
64
+ @groups_buffer.add([group_id] + values.values)
65
+ @original_groups_buffer.add([group_id] + original_values.values)
66
+ else
67
+ # Non-DRY for minute speed improvements
68
+ values = group.values
69
+ if !@groups_buffer
70
+ groups_headers = [:id] + values.keys
71
+ @groups_buffer = ImportBuffer.new(database[@config.groups_table_name],
72
+ groups_headers)
73
+ end
74
+ group_id = next_group_id
75
+ @groups_buffer.add([group_id] + values.values)
76
+ end
77
+ end
78
+
79
+ def add_score(comparator_id, record_1_id, record_2_id, score)
80
+ if !@scores_buffer
81
+ scores_headers = [:comparator_id, :record_1_id, :record_2_id, :score]
82
+ @scores_buffer = ImportBuffer.new(database[@config.scores_table_name],
83
+ scores_headers)
84
+ end
85
+ @scores_buffer.add([comparator_id, record_1_id, record_2_id, score])
86
+ end
87
+
88
+ def add_match(record_1_id, record_2_id, total_score)
89
+ if !@matches_buffer
90
+ matches_headers = [:record_1_id, :record_2_id, :total_score]
91
+ @matches_buffer = ImportBuffer.new(database[@config.matches_table_name],
92
+ matches_headers)
38
93
  end
39
- group_id = next_group_id
40
- @groups_buffer.add([group_id] + group.values.values)
94
+ @matches_buffer.add([record_1_id, record_2_id, total_score])
41
95
  end
42
96
 
43
97
  def flush!
44
98
  @groups_buffer.flush if @groups_buffer
99
+ @original_groups_buffer.flush if @original_groups_buffer
100
+ @scores_buffer.flush if @scores_buffer
101
+ @matches_buffer.flush if @matches_buffer
45
102
  end
46
103
 
47
104
  def get_group(index)
@@ -50,7 +107,7 @@ module Linkage
50
107
  end
51
108
 
52
109
  def groups_records_datasets(group)
53
- datasets = @config.datasets_with_applied_expectations
110
+ datasets = @config.datasets_with_applied_simple_expectations
54
111
  datasets.collect! { |ds| ds.dataset_for_group(group) }
55
112
  end
56
113