activewarehouse-etl 0.6.1 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. data/CHANGELOG +29 -1
  2. data/LICENSE +7 -0
  3. data/README +58 -12
  4. data/Rakefile +2 -1
  5. data/lib/etl.rb +3 -0
  6. data/lib/etl/commands/etl.rb +35 -1
  7. data/lib/etl/control/control.rb +20 -9
  8. data/lib/etl/control/destination.rb +173 -12
  9. data/lib/etl/control/destination/database_destination.rb +2 -2
  10. data/lib/etl/control/destination/file_destination.rb +25 -2
  11. data/lib/etl/control/source.rb +29 -8
  12. data/lib/etl/control/source/database_source.rb +109 -24
  13. data/lib/etl/control/source/file_source.rb +29 -16
  14. data/lib/etl/engine.rb +164 -63
  15. data/lib/etl/execution.rb +19 -0
  16. data/lib/etl/execution/base.rb +9 -0
  17. data/lib/etl/execution/job.rb +7 -0
  18. data/lib/etl/execution/migration.rb +54 -0
  19. data/lib/etl/execution/record.rb +8 -0
  20. data/lib/etl/generator/surrogate_key_generator.rb +2 -0
  21. data/lib/etl/parser.rb +9 -0
  22. data/lib/etl/parser/parser.rb +5 -2
  23. data/lib/etl/parser/sax_parser.rb +22 -6
  24. data/lib/etl/processor.rb +8 -0
  25. data/lib/etl/processor/bulk_import_processor.rb +32 -4
  26. data/lib/etl/processor/check_exist_processor.rb +69 -0
  27. data/lib/etl/processor/check_unique_processor.rb +35 -0
  28. data/lib/etl/processor/copy_field_processor.rb +20 -4
  29. data/lib/etl/processor/processor.rb +3 -0
  30. data/lib/etl/processor/rename_processor.rb +24 -0
  31. data/lib/etl/processor/row_processor.rb +1 -1
  32. data/lib/etl/processor/sequence_processor.rb +23 -0
  33. data/lib/etl/processor/surrogate_key_processor.rb +31 -0
  34. data/lib/etl/processor/truncate_processor.rb +0 -2
  35. data/lib/etl/row.rb +17 -0
  36. data/lib/etl/screen/row_count_screen.rb +15 -0
  37. data/lib/etl/transform/block_transform.rb +13 -0
  38. data/lib/etl/transform/date_to_string_transform.rb +1 -1
  39. data/lib/etl/transform/decode_transform.rb +1 -1
  40. data/lib/etl/transform/default_transform.rb +6 -1
  41. data/lib/etl/transform/foreign_key_lookup_transform.rb +1 -1
  42. data/lib/etl/transform/hierarchy_lookup_transform.rb +1 -1
  43. data/lib/etl/transform/ordinalize_transform.rb +12 -0
  44. data/lib/etl/transform/sha1_transform.rb +0 -3
  45. data/lib/etl/transform/string_to_date_transform.rb +0 -3
  46. data/lib/etl/transform/string_to_datetime_transform.rb +0 -3
  47. data/lib/etl/transform/string_to_time_transform.rb +0 -3
  48. data/lib/etl/transform/transform.rb +20 -11
  49. data/lib/etl/transform/trim_transform.rb +26 -0
  50. data/lib/etl/transform/type_transform.rb +9 -1
  51. data/lib/etl/version.rb +2 -2
  52. metadata +21 -3
@@ -0,0 +1,24 @@
1
+ module ETL #:nodoc:
2
+ module Processor #:nodoc:
3
+ # Row level processor to rename a field in the row.
4
+ #
5
+ # Configuration options:
6
+ # * <tt>:source</tt>: the source field name
7
+ # * <tt>:dest</tt>: The destination field name
8
+ class RenameProcessor < ETL::Processor::RowProcessor
9
+ def process(row)
10
+ source_value = row[configuration[:source]]
11
+ case source_value
12
+ when Numeric
13
+ row[configuration[:dest]] = source_value
14
+ when nil
15
+ row[configuration[:dest]] = nil
16
+ else
17
+ row[configuration[:dest]] = source_value.dup
18
+ end
19
+ row.delete(configuration[:source])
20
+ row
21
+ end
22
+ end
23
+ end
24
+ end
@@ -8,7 +8,7 @@ module ETL #:nodoc:
8
8
  def initialize(control, configuration)
9
9
  super
10
10
  end
11
- # Process the specified row
11
+ # Process the specified row. This method must return the row.
12
12
  def process(row)
13
13
  raise "process_row is an abstract method"
14
14
  end
@@ -0,0 +1,23 @@
1
+ module ETL #:nodoc:
2
+ module Processor #:nodoc:
3
+ # Row level processor to generate a sequence.
4
+ #
5
+ # Configuration options:
6
+ # * <tt>:context</tt>: A context name, if none is specified then the context will be
7
+ # the current ETL run
8
+ # * <tt>:dest</tt>: The destination field name
9
+ class SequenceProcessor < ETL::Processor::RowProcessor
10
+ def process(row)
11
+ sequences[configuration[:context]] ||= 0
12
+ row[configuration[:dest]] = sequences[configuration[:context]] += 1
13
+ row
14
+ end
15
+
16
+ protected
17
+ # Get a Hash of sequences
18
+ def sequences
19
+ @sequences ||= {}
20
+ end
21
+ end
22
+ end
23
+ end
@@ -0,0 +1,31 @@
1
+ module ETL #:nodoc:
2
+ module Processor #:nodoc:
3
+ # A row level processor that provides surrogate keys
4
+ class SurrogateKeyProcessor < ETL::Processor::RowProcessor
5
+ attr_accessor :query
6
+ attr_accessor :destination
7
+
8
+ # Initialize the surrogate key generator
9
+ def initialize(control, configuration)
10
+ super
11
+ @surrogate_key = ActiveRecord::Base.connection.select_value(configuration[:query])
12
+ #puts "initial surrogate key: #{@surrogate_key}"
13
+ @surrogate_key = 0 if @surrogate_key.blank?
14
+ @surrogate_key = @surrogate_key.to_i
15
+ #puts "surrogate key: #{@surrogate_key}"
16
+ @destination = configuration[:destination] || :id
17
+ end
18
+
19
+ # Add a surrogate key to the row
20
+ def process(row)
21
+ if row
22
+ #puts "processing row #{row.inspect}"
23
+ @surrogate_key += 1
24
+ #puts "adding surrogate key to row: #{@surrogate_key}"
25
+ row[destination] = @surrogate_key
26
+ row
27
+ end
28
+ end
29
+ end
30
+ end
31
+ end
@@ -3,8 +3,6 @@ module ETL #:nodoc:
3
3
  # A processor which will truncate a table. Use as a pre-processor for cleaning out a table
4
4
  # prior to loading
5
5
  class TruncateProcessor < ETL::Processor::Processor
6
- #attr_reader :file
7
-
8
6
  # Defines the table to truncate
9
7
  attr_reader :table
10
8
 
data/lib/etl/row.rb ADDED
@@ -0,0 +1,17 @@
1
+ # This source file contains the ETL::Row class.
2
+
3
+ module ETL #:nodoc:
4
+ # This class represents a single row currently passing through the ETL pipeline
5
+ class Row < Hash
6
+ # All change types
7
+ CHANGE_TYPES = [:insert, :update, :delete]
8
+
9
+ # Accessor for the row's change type
10
+ attr_accessor :change_type
11
+
12
+ # Get the change type, defaults to :insert
13
+ def change_type
14
+ @change_type ||= :insert
15
+ end
16
+ end
17
+ end
@@ -0,0 +1,15 @@
1
+ module ETL
2
+ module Screen
3
+ # This screen validates the number of rows which will be bulk loaded
4
+ # against the results from some sort of a row count query. If there
5
+ # is a difference then the screen will not pass
6
+ class RowCountScreen
7
+ def initialize(control, configuration={})
8
+
9
+ end
10
+ def execute
11
+
12
+ end
13
+ end
14
+ end
15
+ end
@@ -0,0 +1,13 @@
1
+ module ETL
2
+ module Transform
3
+ class BlockTransform < ETL::Transform::Transform
4
+ def initialize(control, name, configuration)
5
+ super
6
+ @block = configuration[:block]
7
+ end
8
+ def transform(name, value, row)
9
+ @block.call(name, value, row)
10
+ end
11
+ end
12
+ end
13
+ end
@@ -6,7 +6,7 @@ module ETL #:nodoc:
6
6
  #
7
7
  # Configuration options:
8
8
  # * <tt>:format</tt>: A format passed to strftime. Defaults to %Y-%m-%d
9
- def initialize(control, configuration={})
9
+ def initialize(control, name, configuration={})
10
10
  super
11
11
  @format = configuration[:format] || "%Y-%m-%d"
12
12
  end
@@ -14,7 +14,7 @@ module ETL #:nodoc:
14
14
  # * <tt>:decode_table_path</tt>: The path to the decode table (defaults to 'decode.txt')
15
15
  # * <tt>:decode_table_delimiter</tt>: The decode table delimiter (defaults to ':')
16
16
  # * <tt>:default_value</tt>: The default value to use (defaults to 'No Value')
17
- def initialize(control, configuration={})
17
+ def initialize(control, name, configuration={})
18
18
  super
19
19
 
20
20
  if configuration[:decode_table_path]
@@ -3,10 +3,15 @@ module ETL #:nodoc:
3
3
  # Transform which will replace nil or empty values with a specified value.
4
4
  class DefaultTransform < Transform
5
5
  attr_accessor :default_value
6
- def initialize(control, configuration)
6
+ # Initialize the transform
7
+ #
8
+ # Configuration options:
9
+ # * <tt>:default_value</tt>: The default value to use if the incoming value is blank
10
+ def initialize(control, name, configuration)
7
11
  super
8
12
  @default_value = configuration[:default_value]
9
13
  end
14
+ # Transform the value
10
15
  def transform(name, value, row)
11
16
  value.blank? ? default_value : value
12
17
  end
@@ -12,7 +12,7 @@ module ETL #:nodoc:
12
12
  # an empty Hash will be used. This Hash will be used to cache values that have been resolved already
13
13
  # for future use.
14
14
  # *<tt>:resolver</tt>: Object or Class which implements the method resolve(value)
15
- def initialize(control, configuration={})
15
+ def initialize(control, name, configuration={})
16
16
  super
17
17
 
18
18
  @collection = (configuration[:collection] || {})
@@ -20,7 +20,7 @@ module ETL #:nodoc:
20
20
  # * <tt>:table</tt>: The table to search (required)
21
21
  # * <tt>:connection</tt>: The ActiveRecord adapter (required)
22
22
  # * <tt>:parent_id_field</tt>: The name of the field to use for the parent ID (defaults to :parent_id)
23
- def initialize(control, configuration={})
23
+ def initialize(control, name, configuration={})
24
24
  super
25
25
  @parent_id_field = configuration[:parent_id_field] || :parent_id
26
26
  end
@@ -0,0 +1,12 @@
1
+ module ETL #:nodoc:
2
+ module Transform #:nodoc:
3
+ # Transform a number to an ordinalized version using the ActiveSupport ordinalize
4
+ # core extension
5
+ class OrdinalizeTransform < ETL::Transform::Transform
6
+ # Transform the value from a number to an ordinalized number
7
+ def transform(name, value, row)
8
+ value.ordinalize
9
+ end
10
+ end
11
+ end
12
+ end
@@ -4,9 +4,6 @@ module ETL #:nodoc:
4
4
  module Transform #:nodoc:
5
5
  # Transform which hashes the original value with a SHA-1 hash algorithm
6
6
  class Sha1Transform < ETL::Transform::Transform
7
- def initialize(control, configuration={})
8
- super
9
- end
10
7
  # Transform the value with a SHA1 digest algorithm.
11
8
  def transform(name, value, row)
12
9
  Digest::SHA1.hexdigest(value)
@@ -2,9 +2,6 @@ module ETL #:nodoc:
2
2
  module Transform #:nodoc:
3
3
  # Transform a String representation of a date to a Date instance
4
4
  class StringToDateTransform < ETL::Transform::Transform
5
- def initialize(control, configuration={})
6
- super
7
- end
8
5
  # Transform the value using Date.parse
9
6
  def transform(name, value, row)
10
7
  Date.parse(value)
@@ -2,9 +2,6 @@ module ETL #:nodoc:
2
2
  module Transform #:nodoc:
3
3
  # Transform a String representation of a date to a DateTime instance
4
4
  class StringToDateTimeTransform < ETL::Transform::Transform
5
- def initialize(control, configuration={})
6
- super
7
- end
8
5
  # Transform the value using DateTime.parse.
9
6
  #
10
7
  # WARNING: This transform is slow (due to the Ruby implementation), but if you need to
@@ -2,9 +2,6 @@ module ETL #:nodoc:
2
2
  module Transform #:nodoc:
3
3
  # Transform a String representation of a date to a Time instance
4
4
  class StringToTimeTransform < ETL::Transform::Transform
5
- def initialize(control, configuration={})
6
- super
7
- end
8
5
  # Transform the value using Time.parse
9
6
  def transform(name, value, row)
10
7
  Time.parse(value)
@@ -22,25 +22,34 @@ module ETL#:nodoc:
22
22
  # Any other object will result in a ControlError being raised.
23
23
  def transform(name, value, row, transforms)
24
24
  transforms.each do |transform|
25
- Engine.logger.debug "Transforming field #{name} with #{transform.inspect}"
26
- case transform
27
- when Proc
28
- value = transform.call([name, value, row])
29
- when Transform
30
- value = transform.transform(name, value, row)
31
- else
32
- raise ControlError, "Unsupported transform configuration type: #{transform}"
25
+ benchmarks[transform.class] ||= 0
26
+ benchmarks[transform.class] += Benchmark.realtime do
27
+ Engine.logger.debug "Transforming field #{name} with #{transform.inspect}"
28
+ case transform
29
+ when Proc
30
+ value = transform.call([name, value, row])
31
+ when Transform
32
+ value = transform.transform(name, value, row)
33
+ else
34
+ raise ControlError, "Unsupported transform configuration type: #{transform}"
35
+ end
33
36
  end
34
37
  end
35
38
  value
36
39
  end
40
+
41
+ def benchmarks
42
+ @benchmarks ||= {}
43
+ end
37
44
  end
38
45
 
39
- attr_reader :control, :configuration
46
+ attr_reader :control, :name, :configuration
40
47
 
41
- # Initialize the transform object
42
- def initialize(control, configuration={})
48
+ # Initialize the transform object with the given control object, field name and
49
+ # configuration hash
50
+ def initialize(control, name, configuration={})
43
51
  @control = control
52
+ @name = name
44
53
  @configuration = configuration
45
54
  end
46
55
 
@@ -0,0 +1,26 @@
1
+ module ETL #:nodoc:
2
+ module Transform #:nodoc:
3
+ # Transform to trim string
4
+ class TrimTransform < ETL::Transform::Transform
5
+ # Configuration options:
6
+ # * <tt>:type</tt>: :left, :right or :both. Default is :both
7
+ def initialize(control, name, configuration={})
8
+ super
9
+ @type = (configuration[:type] || :both).to_sym
10
+ end
11
+ # Transform the value
12
+ def transform(name, value, row)
13
+ case @type
14
+ when :left
15
+ value.lstrip
16
+ when :right
17
+ value.rstrip
18
+ when :both
19
+ value.strip
20
+ else
21
+ raise "Trim type, if specified, must be :left, :right or :both"
22
+ end
23
+ end
24
+ end
25
+ end
26
+ end
@@ -2,7 +2,15 @@ module ETL #:nodoc:
2
2
  module Transform #:nodoc:
3
3
  # Transform from one type to another
4
4
  class TypeTransform < ETL::Transform::Transform
5
- def initialize(control, configuration={})
5
+ # Initialize the transformer.
6
+ #
7
+ # Configuration options:
8
+ # * <tt>:type</tt>: The type to convert to. Supported types:
9
+ # ** :string
10
+ # ** :number,:integer
11
+ # ** :float
12
+ # ** :decimal
13
+ def initialize(control, name, configuration={})
6
14
  super
7
15
  @type = configuration[:type]
8
16
  @significant = configuration[:significant] ||= 0
data/lib/etl/version.rb CHANGED
@@ -1,8 +1,8 @@
1
1
  module ETL#:nodoc:
2
2
  module VERSION #:nodoc:
3
3
  MAJOR = 0
4
- MINOR = 6
5
- TINY = 1
4
+ MINOR = 7
5
+ TINY = 0
6
6
 
7
7
  STRING = [MAJOR, MINOR, TINY].join('.')
8
8
  end
metadata CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.9.0.10
3
3
  specification_version: 1
4
4
  name: activewarehouse-etl
5
5
  version: !ruby/object:Gem::Version
6
- version: 0.6.1
7
- date: 2007-03-22 00:00:00 -04:00
6
+ version: 0.7.0
7
+ date: 2007-04-08 00:00:00 -04:00
8
8
  summary: Pure Ruby ETL package.
9
9
  require_paths:
10
10
  - lib
@@ -30,6 +30,7 @@ authors:
30
30
  - Anthony Eden
31
31
  files:
32
32
  - CHANGELOG
33
+ - LICENSE
33
34
  - README
34
35
  - TODO
35
36
  - Rakefile
@@ -40,6 +41,8 @@ files:
40
41
  - lib/etl/control
41
42
  - lib/etl/control.rb
42
43
  - lib/etl/engine.rb
44
+ - lib/etl/execution
45
+ - lib/etl/execution.rb
43
46
  - lib/etl/generator
44
47
  - lib/etl/generator.rb
45
48
  - lib/etl/http_tools.rb
@@ -47,6 +50,8 @@ files:
47
50
  - lib/etl/parser.rb
48
51
  - lib/etl/processor
49
52
  - lib/etl/processor.rb
53
+ - lib/etl/row.rb
54
+ - lib/etl/screen
50
55
  - lib/etl/transform
51
56
  - lib/etl/transform.rb
52
57
  - lib/etl/version.rb
@@ -60,6 +65,10 @@ files:
60
65
  - lib/etl/control/destination/file_destination.rb
61
66
  - lib/etl/control/source/database_source.rb
62
67
  - lib/etl/control/source/file_source.rb
68
+ - lib/etl/execution/base.rb
69
+ - lib/etl/execution/job.rb
70
+ - lib/etl/execution/migration.rb
71
+ - lib/etl/execution/record.rb
63
72
  - lib/etl/generator/generator.rb
64
73
  - lib/etl/generator/surrogate_key_generator.rb
65
74
  - lib/etl/parser/apache_combined_log_parser.rb
@@ -69,21 +78,30 @@ files:
69
78
  - lib/etl/parser/sax_parser.rb
70
79
  - lib/etl/parser/xml_parser.rb
71
80
  - lib/etl/processor/bulk_import_processor.rb
81
+ - lib/etl/processor/check_exist_processor.rb
82
+ - lib/etl/processor/check_unique_processor.rb
72
83
  - lib/etl/processor/copy_field_processor.rb
73
84
  - lib/etl/processor/hierarchy_exploder_processor.rb
74
85
  - lib/etl/processor/processor.rb
86
+ - lib/etl/processor/rename_processor.rb
75
87
  - lib/etl/processor/row_processor.rb
88
+ - lib/etl/processor/sequence_processor.rb
89
+ - lib/etl/processor/surrogate_key_processor.rb
76
90
  - lib/etl/processor/truncate_processor.rb
91
+ - lib/etl/screen/row_count_screen.rb
92
+ - lib/etl/transform/block_transform.rb
77
93
  - lib/etl/transform/date_to_string_transform.rb
78
94
  - lib/etl/transform/decode_transform.rb
79
95
  - lib/etl/transform/default_transform.rb
80
96
  - lib/etl/transform/foreign_key_lookup_transform.rb
81
97
  - lib/etl/transform/hierarchy_lookup_transform.rb
98
+ - lib/etl/transform/ordinalize_transform.rb
82
99
  - lib/etl/transform/sha1_transform.rb
83
100
  - lib/etl/transform/string_to_date_transform.rb
84
101
  - lib/etl/transform/string_to_datetime_transform.rb
85
102
  - lib/etl/transform/string_to_time_transform.rb
86
103
  - lib/etl/transform/transform.rb
104
+ - lib/etl/transform/trim_transform.rb
87
105
  - lib/etl/transform/type_transform.rb
88
106
  test_files: []
89
107
 
@@ -133,7 +151,7 @@ dependencies:
133
151
  requirements:
134
152
  - - ">="
135
153
  - !ruby/object:Gem::Version
136
- version: 1.0.0
154
+ version: 1.2.0
137
155
  version:
138
156
  - !ruby/object:Gem::Dependency
139
157
  name: adapter_extensions