linkage 0.0.6 → 0.0.8

Sign up to get free protection for your applications and to get access to all the features.
Files changed (66) hide show
  1. data/.gitignore +10 -0
  2. data/Gemfile +15 -13
  3. data/Gemfile.lock +67 -37
  4. data/Guardfile +0 -2
  5. data/Rakefile +122 -25
  6. data/lib/linkage/comparator.rb +172 -0
  7. data/lib/linkage/comparators/binary.rb +12 -0
  8. data/lib/linkage/comparators/compare.rb +46 -0
  9. data/lib/linkage/comparators/within.rb +32 -0
  10. data/lib/linkage/configuration.rb +285 -153
  11. data/lib/linkage/data.rb +32 -7
  12. data/lib/linkage/dataset.rb +107 -32
  13. data/lib/linkage/decollation.rb +93 -0
  14. data/lib/linkage/expectation.rb +21 -0
  15. data/lib/linkage/expectations/exhaustive.rb +63 -0
  16. data/lib/linkage/expectations/simple.rb +168 -0
  17. data/lib/linkage/field.rb +30 -4
  18. data/lib/linkage/field_set.rb +6 -3
  19. data/lib/linkage/function.rb +50 -3
  20. data/lib/linkage/functions/binary.rb +30 -0
  21. data/lib/linkage/functions/cast.rb +54 -0
  22. data/lib/linkage/functions/length.rb +29 -0
  23. data/lib/linkage/functions/strftime.rb +12 -11
  24. data/lib/linkage/functions/trim.rb +8 -0
  25. data/lib/linkage/group.rb +20 -0
  26. data/lib/linkage/import_buffer.rb +5 -16
  27. data/lib/linkage/meta_object.rb +139 -0
  28. data/lib/linkage/result_set.rb +74 -17
  29. data/lib/linkage/runner/single_threaded.rb +125 -10
  30. data/lib/linkage/version.rb +3 -0
  31. data/lib/linkage.rb +11 -0
  32. data/linkage.gemspec +16 -121
  33. data/test/config.yml +5 -0
  34. data/test/helper.rb +73 -8
  35. data/test/integration/test_collation.rb +45 -0
  36. data/test/integration/test_configuration.rb +268 -0
  37. data/test/integration/test_cross_linkage.rb +4 -17
  38. data/test/integration/test_dataset.rb +45 -2
  39. data/test/integration/test_dual_linkage.rb +40 -24
  40. data/test/integration/test_functions.rb +22 -0
  41. data/test/integration/test_result_set.rb +85 -0
  42. data/test/integration/test_scoring.rb +84 -0
  43. data/test/integration/test_self_linkage.rb +5 -0
  44. data/test/integration/test_within_comparator.rb +100 -0
  45. data/test/unit/comparators/test_compare.rb +105 -0
  46. data/test/unit/comparators/test_within.rb +57 -0
  47. data/test/unit/expectations/test_exhaustive.rb +111 -0
  48. data/test/unit/expectations/test_simple.rb +303 -0
  49. data/test/unit/functions/test_binary.rb +54 -0
  50. data/test/unit/functions/test_cast.rb +98 -0
  51. data/test/unit/functions/test_length.rb +52 -0
  52. data/test/unit/functions/test_strftime.rb +17 -13
  53. data/test/unit/functions/test_trim.rb +11 -4
  54. data/test/unit/test_comparator.rb +124 -0
  55. data/test/unit/test_configuration.rb +137 -175
  56. data/test/unit/test_data.rb +44 -0
  57. data/test/unit/test_dataset.rb +73 -21
  58. data/test/unit/test_decollation.rb +201 -0
  59. data/test/unit/test_field.rb +38 -14
  60. data/test/unit/test_field_set.rb +12 -8
  61. data/test/unit/test_function.rb +83 -16
  62. data/test/unit/test_group.rb +28 -0
  63. data/test/unit/test_import_buffer.rb +13 -27
  64. data/test/unit/test_meta_object.rb +208 -0
  65. data/test/unit/test_result_set.rb +221 -3
  66. metadata +82 -190
@@ -38,10 +38,23 @@ module Linkage
38
38
  nil
39
39
  end
40
40
 
41
- # @param [Linkage::Field, Object] args Function arguments
41
+ attr_reader :args
42
+
43
+ # Creates a new Function object. If the arguments contain only
44
+ # static objects, you should specify the dataset that this function
45
+ # belongs to as the last argument like so:
46
+ #
47
+ # Function.new(foo, bar, :dataset => dataset)
48
+ #
49
+ # Optionally, you can use the `dataset=` setter to do it later. Many
50
+ # functions require a dataset to work properly. If you try to use
51
+ # such a function without setting a dataset, it will raise a RuntimeError.
52
+ #
53
+ # @param [Linkage::Data, Object] args Function arguments
42
54
  def initialize(*args)
43
55
  @names = [self.class.function_name]
44
56
  @args = args
57
+ @options = args.last.is_a?(Hash) ? args.pop : {}
45
58
  process_args
46
59
  end
47
60
 
@@ -49,10 +62,25 @@ module Linkage
49
62
  @name ||= @names.join("_").to_sym
50
63
  end
51
64
 
65
+ def dataset
66
+ if @dataset.nil?
67
+ raise RuntimeError, "You must specify a dataset for static functions"
68
+ end
69
+ @dataset
70
+ end
71
+
72
+ def dataset=(dataset)
73
+ @dataset = dataset
74
+ end
75
+
52
76
  def static?
53
77
  @static
54
78
  end
55
79
 
80
+ def ==(other)
81
+ equal?(other) || (other.is_a?(Function) && name == other.name && args == other.args && dataset == other.dataset)
82
+ end
83
+
56
84
  # Subclasses must define this. The return value should be a Hash with
57
85
  # the following elements:
58
86
  # :type - column type (Ruby class) of the result
@@ -61,8 +89,14 @@ module Linkage
61
89
  raise NotImplementedError
62
90
  end
63
91
 
92
+ # Returns `nil` by default. Subclasses should redefine this if
93
+ # there is a collation.
94
+ def collation
95
+ nil
96
+ end
97
+
64
98
  # @return [Sequel::SQL::Function]
65
- def to_expr(adapter = nil)
99
+ def to_expr(options = {})
66
100
  self.class.function_name.to_sym.sql_function(*@values)
67
101
  end
68
102
 
@@ -80,6 +114,14 @@ module Linkage
80
114
  if arg.kind_of?(Data)
81
115
  @names << arg.name
82
116
  @static &&= arg.static?
117
+
118
+ # possibly set dataset
119
+ if @dataset.nil?
120
+ @dataset = arg.dataset
121
+ elsif @dataset != arg.dataset
122
+ raise ArgumentError, "Using dynamic data sources with different datasets is not permitted"
123
+ end
124
+
83
125
  type = arg.ruby_type[:type]
84
126
  value = arg.to_expr
85
127
  else
@@ -87,11 +129,16 @@ module Linkage
87
129
  type = arg.class
88
130
  value = arg
89
131
  end
90
- if parameters && !parameters[i].include?(type)
132
+ if parameters && parameters[i] != [:any] && !parameters[i].include?(type)
91
133
  raise TypeError, "expected type #{parameters[i].join(" or ")}, got #{type}"
92
134
  end
93
135
  @values << value
94
136
  end
137
+
138
+ if @dataset.nil? && @options[:dataset]
139
+ # Set dataset for static functions manually
140
+ @dataset = @options[:dataset]
141
+ end
95
142
  end
96
143
  end
97
144
  end
@@ -0,0 +1,30 @@
1
+ module Linkage
2
+ module Functions
3
+ class Binary < Function
4
+ def self.function_name
5
+ "binary"
6
+ end
7
+
8
+ def self.parameters
9
+ [[String]]
10
+ end
11
+
12
+ def ruby_type
13
+ {:type => File}
14
+ end
15
+
16
+ def to_expr(options = {})
17
+ expr =
18
+ case dataset.database_type
19
+ when :sqlite
20
+ @values[0].cast(:blob)
21
+ when :postgres
22
+ @values[0].cast(:bytea)
23
+ else
24
+ @values[0].cast(:binary)
25
+ end
26
+ end
27
+ end
28
+ Function.register(Binary)
29
+ end
30
+ end
@@ -0,0 +1,54 @@
1
+ module Linkage
2
+ module Functions
3
+ class Cast < Function
4
+ def self.function_name
5
+ "cast"
6
+ end
7
+
8
+ def self.parameters
9
+ [[:any], [String]]
10
+ end
11
+
12
+ def ruby_type
13
+ type =
14
+ case @values[1]
15
+ when 'integer'
16
+ Fixnum
17
+ when 'binary'
18
+ File
19
+ else
20
+ raise "unknown type: #{@values[1]}"
21
+ end
22
+
23
+ {:type => type}
24
+ end
25
+
26
+ def to_expr(options = {})
27
+ cast =
28
+ case @values[1]
29
+ when 'integer'
30
+ case dataset.database_type
31
+ when :sqlite, :postgres, :h2
32
+ :integer
33
+ when :mysql
34
+ :signed
35
+ end
36
+ when 'binary'
37
+ case dataset.database_type
38
+ when :sqlite
39
+ :blob
40
+ when :postgres
41
+ :bytea
42
+ when :mysql, :h2
43
+ :binary
44
+ end
45
+ end
46
+
47
+ if cast
48
+ @values[0].cast(cast)
49
+ end
50
+ end
51
+ end
52
+ Function.register(Cast)
53
+ end
54
+ end
@@ -0,0 +1,29 @@
1
+ module Linkage
2
+ module Functions
3
+ # Returns the number of characters in a string.
4
+ class Length < Function
5
+ def self.function_name
6
+ "length"
7
+ end
8
+
9
+ def self.parameters
10
+ [[String]]
11
+ end
12
+
13
+ def ruby_type
14
+ {:type => Fixnum}
15
+ end
16
+
17
+ def to_expr(options = {})
18
+ expr =
19
+ case dataset.database_type
20
+ when :mysql, :postgres
21
+ :char_length.sql_function(@values[0])
22
+ else
23
+ :length.sql_function(@values[0])
24
+ end
25
+ end
26
+ end
27
+ Function.register(Length)
28
+ end
29
+ end
@@ -14,17 +14,18 @@ module Linkage
14
14
  {:type => String}
15
15
  end
16
16
 
17
- def to_expr(adapter = nil)
18
- case adapter
19
- when :mysql, :mysql2
20
- :date_format.sql_function(*@values)
21
- when :sqlite
22
- :strftime.sql_function(@values[1], @values[0])
23
- when :postgres
24
- :to_char.sql_function(*@values)
25
- else
26
- :strftime.sql_function(@values[0], @values[1])
27
- end
17
+ def to_expr(options = {})
18
+ expr =
19
+ case dataset.database_type
20
+ when :mysql
21
+ :date_format.sql_function(*@values)
22
+ when :sqlite
23
+ :strftime.sql_function(@values[1], @values[0])
24
+ when :postgres
25
+ :to_char.sql_function(*@values)
26
+ else
27
+ :strftime.sql_function(@values[0], @values[1])
28
+ end
28
29
  end
29
30
  end
30
31
  Function.register(Strftime)
@@ -16,6 +16,14 @@ module Linkage
16
16
  {:type => String}
17
17
  end
18
18
  end
19
+
20
+ def collation
21
+ if @args[0].kind_of?(Data)
22
+ @args[0].collation
23
+ else
24
+ super
25
+ end
26
+ end
19
27
  end
20
28
  Function.register(Trim)
21
29
  end
data/lib/linkage/group.rb CHANGED
@@ -1,5 +1,7 @@
1
1
  module Linkage
2
2
  class Group
3
+ include Linkage::Decollation
4
+
3
5
  # @return [Hash] Hash of matching values
4
6
  attr_reader :values
5
7
 
@@ -24,12 +26,30 @@ module Linkage
24
26
 
25
27
  # @param [Hash] values Values that define this group
26
28
  # @param [Hash] options
29
+ # @option options [Fixnum] :id The group ID
30
+ # @option options [Fixnum] :count How many records are in the group
31
+ # @option options [Hash] :ruby_types Hash of ruby types for each value
32
+ # @option options [Symbol] :database_type
27
33
  # @example
28
34
  # Linkage::Group.new({:foo => 123, :bar => 'baz'}, {:count => 5, :id => 456})
29
35
  def initialize(values, options)
30
36
  @count = options[:count]
31
37
  @id = options[:id]
38
+ @ruby_types = options[:ruby_types]
39
+ @database_type = options[:database_type]
32
40
  @values = values
33
41
  end
42
+
43
+ def decollated_values
44
+ @values.inject({}) do |hsh, (key, value)|
45
+ ruby_type = @ruby_types[key]
46
+ if ruby_type && ruby_type.has_key?(:opts) && ruby_type[:opts].has_key?(:collate)
47
+ hsh[key] = decollate(value, @database_type, ruby_type[:opts][:collate])
48
+ else
49
+ hsh[key] = value
50
+ end
51
+ hsh
52
+ end
53
+ end
34
54
  end
35
55
  end
@@ -1,15 +1,11 @@
1
1
  module Linkage
2
2
  class ImportBuffer
3
- # @param [String] uri Sequel-style URI
4
- # @param [Symbol, String] table_name
3
+ # @param [Sequel::Dataset] dataset
5
4
  # @param [Array<Symbol>] headers List of fields you want to insert
6
- # @param [Hash] options Sequel.connect options
7
5
  # @param [Fixnum] limit Number of records to insert at a time
8
- def initialize(uri, table_name, headers, options = {}, limit = 1000)
9
- @uri = uri
10
- @table_name = table_name.to_sym
6
+ def initialize(dataset, headers, limit = 1000)
7
+ @dataset = dataset
11
8
  @headers = headers
12
- @options = options
13
9
  @limit = limit
14
10
  @values = []
15
11
  end
@@ -23,17 +19,10 @@ module Linkage
23
19
 
24
20
  def flush
25
21
  return if @values.empty?
26
- database do |db|
27
- ds = db[@table_name]
28
- ds.import(@headers, @values)
22
+ @dataset.db.synchronize do
23
+ @dataset.import(@headers, @values)
29
24
  @values.clear
30
25
  end
31
26
  end
32
-
33
- private
34
-
35
- def database(&block)
36
- Sequel.connect(@uri, @options, &block)
37
- end
38
27
  end
39
28
  end
@@ -0,0 +1,139 @@
1
+ module Linkage
2
+ class MetaObject
3
+ attr_reader :object
4
+ attr_writer :side
5
+
6
+ # Creates a new MetaObject.
7
+ #
8
+ # @param [Object] object This can be a {Field}, {Function} or a regular
9
+ # Ruby object (Fixnum, String, etc). If `object` is not static (a {Field}
10
+ # or a {Function} that contains one or more {Field} objects), you should
11
+ # specify which "side" of the linkage the object belongs to (left-hand
12
+ # side or right-hand side) in the `side` argument.
13
+ # @param [Symbol] side `:lhs` for left-hand side or `:rhs` for right-hand
14
+ # side
15
+ def initialize(object, side = nil)
16
+ @object = object
17
+ @static = object.kind_of?(Linkage::Data) ? object.static? : true
18
+ if !side.nil? && side != :lhs && side != :rhs
19
+ raise ArgumentError, "invalid `side` argument, must be :lhs or :rhs"
20
+ end
21
+ @side = side
22
+ end
23
+
24
+ def side
25
+ if !@static && @side.nil?
26
+ raise RuntimeError, "Object is dynamic and side is not set"
27
+ end
28
+ @side
29
+ end
30
+
31
+ def dataset
32
+ @object.kind_of?(Linkage::Data) ? @object.dataset : nil
33
+ end
34
+
35
+ def dataset=(dataset)
36
+ if @object.kind_of?(Linkage::Data)
37
+ @object.dataset = dataset
38
+ else
39
+ raise RuntimeError, "You can't set the dataset of a non-data object."
40
+ end
41
+ end
42
+
43
+ def database_type
44
+ ds = dataset
45
+ ds ? ds.database_type : nil
46
+ end
47
+
48
+ def static?
49
+ @static
50
+ end
51
+
52
+ # Returns true if the argument has the same object as the instance.
53
+ #
54
+ # @param [Linkage::MetaObject] other
55
+ # @return [Boolean]
56
+ def objects_equal?(other)
57
+ other.is_a?(Linkage::MetaObject) && other.object == self.object
58
+ end
59
+
60
+ # Returns true if the argument has the same dataset as the instance.
61
+ #
62
+ # @param [Linkage::MetaObject] other
63
+ # @return [Boolean]
64
+ def datasets_equal?(other)
65
+ other.is_a?(Linkage::MetaObject) && other.dataset == self.dataset
66
+ end
67
+
68
+ # Returns an expression suitable for use in Sequel queries.
69
+ # @return [Object]
70
+ def to_expr
71
+ if @object.kind_of?(Linkage::Data)
72
+ @object.to_expr
73
+ else
74
+ @object
75
+ end
76
+ end
77
+
78
+ # Returns a Sequel identifier for {Data} objects, or the object itself.
79
+ # @return [Sequel::SQL::Identifier, Object]
80
+ def to_identifier
81
+ if @object.kind_of?(Linkage::Data)
82
+ Sequel::SQL::Identifier.new(@object.to_expr)
83
+ else
84
+ @object
85
+ end
86
+ end
87
+
88
+ # Return the name of the object for {Data} objects, nil for others.
89
+ # @return [Symbol, nil]
90
+ def name
91
+ if @object.kind_of?(Linkage::Data)
92
+ @object.name
93
+ else
94
+ nil
95
+ end
96
+ end
97
+
98
+ # Returns a {MergeField} if both objects are {Data} objects, otherwise,
99
+ # raises an exception.
100
+ #
101
+ # @return [Linkage::MergeField]
102
+ def merge(other)
103
+ if @object.kind_of?(Linkage::Data) && other.object.kind_of?(Linkage::Data)
104
+ @object.merge(other.object)
105
+ else
106
+ raise ArgumentError, "Cannot merge a non-data object"
107
+ end
108
+ end
109
+
110
+ # Returns the Ruby type of the underlying object.
111
+ #
112
+ # @return [Hash]
113
+ # @see Linkage::Field#ruby_type
114
+ # @see Linkage::Function#ruby_type
115
+ def ruby_type
116
+ if @object.kind_of?(Linkage::Data)
117
+ @object.ruby_type
118
+ else
119
+ {:type => @object.class}
120
+ end
121
+ end
122
+
123
+ # Returns the collation of the underlying object.
124
+ #
125
+ # @return [Symbol]
126
+ def collation
127
+ if @object.kind_of?(Linkage::Data)
128
+ @object.collation
129
+ else
130
+ nil
131
+ end
132
+ end
133
+
134
+ # Returns true if underlying object is not a subclass of {Linkage::Data}.
135
+ def raw?
136
+ !@object.kind_of?(Linkage::Data)
137
+ end
138
+ end
139
+ end
@@ -7,41 +7,98 @@ module Linkage
7
7
  end
8
8
 
9
9
  def groups_dataset
10
- @groups_dataset ||= Dataset.new(@config.results_uri, :groups, @config.results_uri_options)
10
+ @groups_dataset ||= Dataset.new(database[:groups])
11
11
  end
12
12
 
13
- def database(&block)
14
- Sequel.connect(@config.results_uri, @config.results_uri_options, &block)
13
+ def database
14
+ # FIXME: If the results database is the same as one of the datasets
15
+ # being linked, there will be two connections to said database. This
16
+ # could result in unexpected locking for non-concurrent databases (like
17
+ # SQLite).
18
+ @database ||= Sequel.connect(@config.results_uri, @config.results_uri_options)
15
19
  end
16
20
 
17
21
  def create_tables!
18
- database do |db|
22
+ if @config.groups_table_needed?
19
23
  schema = @config.groups_table_schema
20
- db.create_table(:groups) do
24
+ if @config.decollation_needed?
25
+ database.create_table(@config.original_groups_table_name) do
26
+ schema.each { |col| column(*col) }
27
+ end
28
+ end
29
+
30
+ database.create_table(@config.groups_table_name) do
21
31
  schema.each { |col| column(*col) }
22
32
  end
33
+ end
23
34
 
24
- pk_type = @config.dataset_1.field_set.primary_key.merge(@config.dataset_2.field_set.primary_key).ruby_type
25
- db.create_table(:groups_records) do
26
- column(:record_id, pk_type[:type], pk_type[:opts] || {})
27
- Integer :group_id
28
- Integer :dataset
29
- index :group_id
35
+ if @config.scores_table_needed?
36
+ schema = @config.scores_table_schema
37
+ database.create_table(@config.scores_table_name) do
38
+ schema.each { |col| column(*col) }
30
39
  end
31
40
  end
41
+
42
+ schema = @config.matches_table_schema
43
+ database.create_table(@config.matches_table_name) do
44
+ schema.each { |col| column(*col) }
45
+ end
32
46
  end
33
47
 
34
48
  def add_group(group, dataset_id = nil)
35
- if !@groups_buffer
36
- groups_headers = [:id] + group.values.keys
37
- @groups_buffer = ImportBuffer.new(@config.results_uri, :groups, groups_headers, @config.results_uri_options)
49
+ if @config.decollation_needed?
50
+ original_values = group.values
51
+ values = group.decollated_values
52
+ if !@groups_buffer
53
+ groups_headers = [:id] + values.keys
54
+ @groups_buffer = ImportBuffer.new(database[@config.groups_table_name],
55
+ groups_headers)
56
+
57
+ original_groups_headers = [:id] + original_values.keys
58
+ @original_groups_buffer = ImportBuffer.new(
59
+ database[@config.original_groups_table_name],
60
+ original_groups_headers)
61
+ end
62
+
63
+ group_id = next_group_id
64
+ @groups_buffer.add([group_id] + values.values)
65
+ @original_groups_buffer.add([group_id] + original_values.values)
66
+ else
67
+ # Non-DRY for minute speed improvements
68
+ values = group.values
69
+ if !@groups_buffer
70
+ groups_headers = [:id] + values.keys
71
+ @groups_buffer = ImportBuffer.new(database[@config.groups_table_name],
72
+ groups_headers)
73
+ end
74
+ group_id = next_group_id
75
+ @groups_buffer.add([group_id] + values.values)
76
+ end
77
+ end
78
+
79
+ def add_score(comparator_id, record_1_id, record_2_id, score)
80
+ if !@scores_buffer
81
+ scores_headers = [:comparator_id, :record_1_id, :record_2_id, :score]
82
+ @scores_buffer = ImportBuffer.new(database[@config.scores_table_name],
83
+ scores_headers)
84
+ end
85
+ @scores_buffer.add([comparator_id, record_1_id, record_2_id, score])
86
+ end
87
+
88
+ def add_match(record_1_id, record_2_id, total_score)
89
+ if !@matches_buffer
90
+ matches_headers = [:record_1_id, :record_2_id, :total_score]
91
+ @matches_buffer = ImportBuffer.new(database[@config.matches_table_name],
92
+ matches_headers)
38
93
  end
39
- group_id = next_group_id
40
- @groups_buffer.add([group_id] + group.values.values)
94
+ @matches_buffer.add([record_1_id, record_2_id, total_score])
41
95
  end
42
96
 
43
97
  def flush!
44
98
  @groups_buffer.flush if @groups_buffer
99
+ @original_groups_buffer.flush if @original_groups_buffer
100
+ @scores_buffer.flush if @scores_buffer
101
+ @matches_buffer.flush if @matches_buffer
45
102
  end
46
103
 
47
104
  def get_group(index)
@@ -50,7 +107,7 @@ module Linkage
50
107
  end
51
108
 
52
109
  def groups_records_datasets(group)
53
- datasets = @config.datasets_with_applied_expectations
110
+ datasets = @config.datasets_with_applied_simple_expectations
54
111
  datasets.collect! { |ds| ds.dataset_for_group(group) }
55
112
  end
56
113