activewarehouse-etl 0.6.1 → 0.7.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (52) hide show
  1. data/CHANGELOG +29 -1
  2. data/LICENSE +7 -0
  3. data/README +58 -12
  4. data/Rakefile +2 -1
  5. data/lib/etl.rb +3 -0
  6. data/lib/etl/commands/etl.rb +35 -1
  7. data/lib/etl/control/control.rb +20 -9
  8. data/lib/etl/control/destination.rb +173 -12
  9. data/lib/etl/control/destination/database_destination.rb +2 -2
  10. data/lib/etl/control/destination/file_destination.rb +25 -2
  11. data/lib/etl/control/source.rb +29 -8
  12. data/lib/etl/control/source/database_source.rb +109 -24
  13. data/lib/etl/control/source/file_source.rb +29 -16
  14. data/lib/etl/engine.rb +164 -63
  15. data/lib/etl/execution.rb +19 -0
  16. data/lib/etl/execution/base.rb +9 -0
  17. data/lib/etl/execution/job.rb +7 -0
  18. data/lib/etl/execution/migration.rb +54 -0
  19. data/lib/etl/execution/record.rb +8 -0
  20. data/lib/etl/generator/surrogate_key_generator.rb +2 -0
  21. data/lib/etl/parser.rb +9 -0
  22. data/lib/etl/parser/parser.rb +5 -2
  23. data/lib/etl/parser/sax_parser.rb +22 -6
  24. data/lib/etl/processor.rb +8 -0
  25. data/lib/etl/processor/bulk_import_processor.rb +32 -4
  26. data/lib/etl/processor/check_exist_processor.rb +69 -0
  27. data/lib/etl/processor/check_unique_processor.rb +35 -0
  28. data/lib/etl/processor/copy_field_processor.rb +20 -4
  29. data/lib/etl/processor/processor.rb +3 -0
  30. data/lib/etl/processor/rename_processor.rb +24 -0
  31. data/lib/etl/processor/row_processor.rb +1 -1
  32. data/lib/etl/processor/sequence_processor.rb +23 -0
  33. data/lib/etl/processor/surrogate_key_processor.rb +31 -0
  34. data/lib/etl/processor/truncate_processor.rb +0 -2
  35. data/lib/etl/row.rb +17 -0
  36. data/lib/etl/screen/row_count_screen.rb +15 -0
  37. data/lib/etl/transform/block_transform.rb +13 -0
  38. data/lib/etl/transform/date_to_string_transform.rb +1 -1
  39. data/lib/etl/transform/decode_transform.rb +1 -1
  40. data/lib/etl/transform/default_transform.rb +6 -1
  41. data/lib/etl/transform/foreign_key_lookup_transform.rb +1 -1
  42. data/lib/etl/transform/hierarchy_lookup_transform.rb +1 -1
  43. data/lib/etl/transform/ordinalize_transform.rb +12 -0
  44. data/lib/etl/transform/sha1_transform.rb +0 -3
  45. data/lib/etl/transform/string_to_date_transform.rb +0 -3
  46. data/lib/etl/transform/string_to_datetime_transform.rb +0 -3
  47. data/lib/etl/transform/string_to_time_transform.rb +0 -3
  48. data/lib/etl/transform/transform.rb +20 -11
  49. data/lib/etl/transform/trim_transform.rb +26 -0
  50. data/lib/etl/transform/type_transform.rb +9 -1
  51. data/lib/etl/version.rb +2 -2
  52. metadata +21 -3
@@ -0,0 +1,24 @@
module ETL #:nodoc:
  module Processor #:nodoc:
    # Row level processor to rename a field in the row.
    #
    # A copy of the value stored under <tt>:source</tt> is written to
    # <tt>:dest</tt> and the <tt>:source</tt> key is removed from the row.
    #
    # Configuration options:
    # * <tt>:source</tt>: The source field name
    # * <tt>:dest</tt>: The destination field name
    class RenameProcessor < ETL::Processor::RowProcessor
      # Rename the configured field in the given row and return the row.
      def process(row)
        source = configuration[:source]
        value = row[source]
        # Remove the source key before writing the destination so that a
        # rename whose :source and :dest are the same keeps the field
        # instead of deleting it.
        row.delete(source)
        row[configuration[:dest]] = copy_of(value)
        row
      end

      private

      # Return a duplicate of the value, or the value itself when it cannot
      # be duplicated. Depending on the Ruby version, dup raises TypeError for
      # values such as nil, numbers, symbols and booleans.
      def copy_of(value)
        value.dup
      rescue TypeError
        value
      end
    end
  end
end
@@ -8,7 +8,7 @@ module ETL #:nodoc:
8
8
  def initialize(control, configuration)
9
9
  super
10
10
  end
11
- # Process the specified row
11
+ # Process the specified row. This method must return the row.
12
12
  def process(row)
13
13
  raise "process_row is an abstract method"
14
14
  end
@@ -0,0 +1,23 @@
module ETL #:nodoc:
  module Processor #:nodoc:
    # Row level processor which writes an incrementing counter into each row.
    #
    # Counters are kept in a Hash owned by this processor instance, keyed by
    # the configured context. The first value handed out for a context is 1.
    #
    # Configuration options:
    # * <tt>:context</tt>: A context name. If none is specified the counter is
    #   stored under the nil key of this processor instance
    # * <tt>:dest</tt>: The destination field name
    class SequenceProcessor < ETL::Processor::RowProcessor
      # Store the next sequence value for the configured context in the
      # destination field and return the row.
      def process(row)
        context = configuration[:context]
        next_value = (sequences[context] || 0) + 1
        sequences[context] = next_value
        row[configuration[:dest]] = next_value
        row
      end

      protected

      # Lazily created Hash mapping a context name to the last value issued
      def sequences
        @sequences ||= {}
      end
    end
  end
end
@@ -0,0 +1,31 @@
module ETL #:nodoc:
  module Processor #:nodoc:
    # A row level processor that provides surrogate keys.
    #
    # The starting key is read once, at construction time, by running the
    # configured query; every processed row then receives the next integer.
    #
    # Configuration options:
    # * <tt>:query</tt>: The query handed to select_value to obtain the
    #   current highest key. A nil or empty result starts the sequence at 0
    # * <tt>:destination</tt>: The field which receives the key (defaults to :id)
    class SurrogateKeyProcessor < ETL::Processor::RowProcessor
      # The query used to look up the initial surrogate key
      attr_accessor :query
      # The name of the field the surrogate key is written to
      attr_accessor :destination

      # Initialize the surrogate key generator
      def initialize(control, configuration)
        super
        @query = configuration[:query]
        # to_i maps both nil and an empty string to 0, so no separate
        # blank check is required.
        @surrogate_key = ActiveRecord::Base.connection.select_value(@query).to_i
        @destination = configuration[:destination] || :id
      end

      # Add a surrogate key to the row. Returns the row, or nil when no row
      # is given.
      def process(row)
        return unless row
        @surrogate_key += 1
        row[destination] = @surrogate_key
        row
      end
    end
  end
end
@@ -3,8 +3,6 @@ module ETL #:nodoc:
3
3
  # A processor which will truncate a table. Use as a pre-processor for cleaning out a table
4
4
  # prior to loading
5
5
  class TruncateProcessor < ETL::Processor::Processor
6
- #attr_reader :file
7
-
8
6
  # Defines the table to truncate
9
7
  attr_reader :table
10
8
 
data/lib/etl/row.rb ADDED
@@ -0,0 +1,17 @@
# This source file contains the ETL::Row class.

module ETL #:nodoc:
  # This class represents a single row currently passing through the ETL pipeline.
  # It is a Hash of field values which additionally carries a change type.
  class Row < Hash
    # All change types
    CHANGE_TYPES = [:insert, :update, :delete].freeze

    # Writer for the row's change type. Only the writer is generated here
    # because the reader is defined explicitly below; generating both would
    # define the reader twice.
    attr_writer :change_type

    # Get the change type, defaults to :insert
    def change_type
      @change_type ||= :insert
    end
  end
end
@@ -0,0 +1,15 @@
module ETL #:nodoc:
  module Screen #:nodoc:
    # This screen is meant to validate the number of rows which will be bulk
    # loaded against the results from some sort of a row count query, failing
    # when the two differ.
    #
    # NOTE: the check itself is not implemented yet; execute is a placeholder
    # that does nothing.
    class RowCountScreen
      # The control object and configuration Hash given to the constructor
      attr_reader :control, :configuration

      # Initialize the screen with the control object and an optional
      # configuration Hash. Both are retained so that execute can use them.
      def initialize(control, configuration={})
        @control = control
        @configuration = configuration
      end

      # Run the screen. Currently a no-op which returns nil.
      def execute
      end
    end
  end
end
@@ -0,0 +1,13 @@
module ETL #:nodoc:
  module Transform #:nodoc:
    # Transform which delegates the work to a caller-supplied block.
    #
    # Configuration options:
    # * <tt>:block</tt>: An object responding to call (for example a Proc). It
    #   is invoked with the field name, the current value and the row, and
    #   its result becomes the transformed value (required)
    class BlockTransform < ETL::Transform::Transform
      # Initialize the transform. Raises ControlError when no callable
      # <tt>:block</tt> is configured, so the mistake is reported when the
      # transform is built rather than on the first transformed row.
      def initialize(control, name, configuration)
        super
        @block = configuration[:block]
        unless @block.respond_to?(:call)
          raise ETL::ControlError, "Block transform requires a :block which responds to call"
        end
      end

      # Transform the value by calling the configured block
      def transform(name, value, row)
        @block.call(name, value, row)
      end
    end
  end
end
@@ -6,7 +6,7 @@ module ETL #:nodoc:
6
6
  #
7
7
  # Configuration options:
8
8
  # * <tt>:format</tt>: A format passed to strftime. Defaults to %Y-%m-%d
9
- def initialize(control, configuration={})
9
+ def initialize(control, name, configuration={})
10
10
  super
11
11
  @format = configuration[:format] || "%Y-%m-%d"
12
12
  end
@@ -14,7 +14,7 @@ module ETL #:nodoc:
14
14
  # * <tt>:decode_table_path</tt>: The path to the decode table (defaults to 'decode.txt')
15
15
  # * <tt>:decode_table_delimiter</tt>: The decode table delimiter (defaults to ':')
16
16
  # * <tt>:default_value</tt>: The default value to use (defaults to 'No Value')
17
- def initialize(control, configuration={})
17
+ def initialize(control, name, configuration={})
18
18
  super
19
19
 
20
20
  if configuration[:decode_table_path]
@@ -3,10 +3,15 @@ module ETL #:nodoc:
3
3
  # Transform which will replace nil or empty values with a specified value.
4
4
  class DefaultTransform < Transform
5
5
  attr_accessor :default_value
6
- def initialize(control, configuration)
6
+ # Initialize the transform
7
+ #
8
+ # Configuration options:
9
+ # * <tt>:default_value</tt>: The default value to use if the incoming value is blank
10
+ def initialize(control, name, configuration)
7
11
  super
8
12
  @default_value = configuration[:default_value]
9
13
  end
14
+ # Transform the value
10
15
  def transform(name, value, row)
11
16
  value.blank? ? default_value : value
12
17
  end
@@ -12,7 +12,7 @@ module ETL #:nodoc:
12
12
  # an empty Hash will be used. This Hash will be used to cache values that have been resolved already
13
13
  # for future use.
14
14
  # *<tt>:resolver</tt>: Object or Class which implements the method resolve(value)
15
- def initialize(control, configuration={})
15
+ def initialize(control, name, configuration={})
16
16
  super
17
17
 
18
18
  @collection = (configuration[:collection] || {})
@@ -20,7 +20,7 @@ module ETL #:nodoc:
20
20
  # * <tt>:table</tt>: The table to search (required)
21
21
  # * <tt>:connection</tt>: The ActiveRecord adapter (required)
22
22
  # * <tt>:parent_id_field</tt>: The name of the field to use for the parent ID (defaults to :parent_id)
23
- def initialize(control, configuration={})
23
+ def initialize(control, name, configuration={})
24
24
  super
25
25
  @parent_id_field = configuration[:parent_id_field] || :parent_id
26
26
  end
@@ -0,0 +1,12 @@
module ETL #:nodoc:
  module Transform #:nodoc:
    # Turns a number into its ordinal form by calling +ordinalize+ on the
    # value (the method comes from the ActiveSupport core extensions).
    class OrdinalizeTransform < ETL::Transform::Transform
      # Return the ordinalized form of the value. The field name and the row
      # are not used.
      def transform(name, value, row)
        ordinal = value.ordinalize
        ordinal
      end
    end
  end
end
@@ -4,9 +4,6 @@ module ETL #:nodoc:
4
4
  module Transform #:nodoc:
5
5
  # Transform which hashes the original value with a SHA-1 hash algorithm
6
6
  class Sha1Transform < ETL::Transform::Transform
7
- def initialize(control, configuration={})
8
- super
9
- end
10
7
  # Transform the value with a SHA1 digest algorithm.
11
8
  def transform(name, value, row)
12
9
  Digest::SHA1.hexdigest(value)
@@ -2,9 +2,6 @@ module ETL #:nodoc:
2
2
  module Transform #:nodoc:
3
3
  # Transform a String representation of a date to a Date instance
4
4
  class StringToDateTransform < ETL::Transform::Transform
5
- def initialize(control, configuration={})
6
- super
7
- end
8
5
  # Transform the value using Date.parse
9
6
  def transform(name, value, row)
10
7
  Date.parse(value)
@@ -2,9 +2,6 @@ module ETL #:nodoc:
2
2
  module Transform #:nodoc:
3
3
  # Transform a String representation of a date to a DateTime instance
4
4
  class StringToDateTimeTransform < ETL::Transform::Transform
5
- def initialize(control, configuration={})
6
- super
7
- end
8
5
  # Transform the value using DateTime.parse.
9
6
  #
10
7
  # WARNING: This transform is slow (due to the Ruby implementation), but if you need to
@@ -2,9 +2,6 @@ module ETL #:nodoc:
2
2
  module Transform #:nodoc:
3
3
  # Transform a String representation of a date to a Time instance
4
4
  class StringToTimeTransform < ETL::Transform::Transform
5
- def initialize(control, configuration={})
6
- super
7
- end
8
5
  # Transform the value using Time.parse
9
6
  def transform(name, value, row)
10
7
  Time.parse(value)
@@ -22,25 +22,34 @@ module ETL#:nodoc:
22
22
  # Any other object will result in a ControlError being raised.
23
23
  def transform(name, value, row, transforms)
24
24
  transforms.each do |transform|
25
- Engine.logger.debug "Transforming field #{name} with #{transform.inspect}"
26
- case transform
27
- when Proc
28
- value = transform.call([name, value, row])
29
- when Transform
30
- value = transform.transform(name, value, row)
31
- else
32
- raise ControlError, "Unsupported transform configuration type: #{transform}"
25
+ benchmarks[transform.class] ||= 0
26
+ benchmarks[transform.class] += Benchmark.realtime do
27
+ Engine.logger.debug "Transforming field #{name} with #{transform.inspect}"
28
+ case transform
29
+ when Proc
30
+ value = transform.call([name, value, row])
31
+ when Transform
32
+ value = transform.transform(name, value, row)
33
+ else
34
+ raise ControlError, "Unsupported transform configuration type: #{transform}"
35
+ end
33
36
  end
34
37
  end
35
38
  value
36
39
  end
40
+
41
+ def benchmarks
42
+ @benchmarks ||= {}
43
+ end
37
44
  end
38
45
 
39
- attr_reader :control, :configuration
46
+ attr_reader :control, :name, :configuration
40
47
 
41
- # Initialize the transform object
42
- def initialize(control, configuration={})
48
+ # Initialize the transform object with the given control object, field name and
49
+ # configuration hash
50
+ def initialize(control, name, configuration={})
43
51
  @control = control
52
+ @name = name
44
53
  @configuration = configuration
45
54
  end
46
55
 
@@ -0,0 +1,26 @@
module ETL #:nodoc:
  module Transform #:nodoc:
    # Transform to trim whitespace from a string
    class TrimTransform < ETL::Transform::Transform
      # Maps each supported trim type to the String method which performs it
      STRIP_METHODS = { :left => :lstrip, :right => :rstrip, :both => :strip }.freeze

      # Initialize the transform.
      #
      # Configuration options:
      # * <tt>:type</tt>: :left, :right or :both. Default is :both
      #
      # Raises a RuntimeError when the type is not one of the supported
      # values, so a bad configuration is reported when the transform is
      # built instead of once per transformed value.
      def initialize(control, name, configuration={})
        super
        @type = (configuration[:type] || :both).to_sym
        @strip_method = STRIP_METHODS[@type]
        raise "Trim type, if specified, must be :left, :right or :both" unless @strip_method
      end

      # Transform the value by stripping whitespace from the configured side(s)
      def transform(name, value, row)
        value.send(@strip_method)
      end
    end
  end
end
@@ -2,7 +2,15 @@ module ETL #:nodoc:
2
2
  module Transform #:nodoc:
3
3
  # Transform from one type to another
4
4
  class TypeTransform < ETL::Transform::Transform
5
- def initialize(control, configuration={})
5
+ # Initialize the transformer.
6
+ #
7
+ # Configuration options:
8
+ # * <tt>:type</tt>: The type to convert to. Supported types:
9
+ # ** :string
10
+ # ** :number,:integer
11
+ # ** :float
12
+ # ** :decimal
13
+ def initialize(control, name, configuration={})
6
14
  super
7
15
  @type = configuration[:type]
8
16
  @significant = configuration[:significant] ||= 0
data/lib/etl/version.rb CHANGED
@@ -1,8 +1,8 @@
1
1
  module ETL#:nodoc:
2
2
  module VERSION #:nodoc:
3
3
  MAJOR = 0
4
- MINOR = 6
5
- TINY = 1
4
+ MINOR = 7
5
+ TINY = 0
6
6
 
7
7
  STRING = [MAJOR, MINOR, TINY].join('.')
8
8
  end
metadata CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.9.0.10
3
3
  specification_version: 1
4
4
  name: activewarehouse-etl
5
5
  version: !ruby/object:Gem::Version
6
- version: 0.6.1
7
- date: 2007-03-22 00:00:00 -04:00
6
+ version: 0.7.0
7
+ date: 2007-04-08 00:00:00 -04:00
8
8
  summary: Pure Ruby ETL package.
9
9
  require_paths:
10
10
  - lib
@@ -30,6 +30,7 @@ authors:
30
30
  - Anthony Eden
31
31
  files:
32
32
  - CHANGELOG
33
+ - LICENSE
33
34
  - README
34
35
  - TODO
35
36
  - Rakefile
@@ -40,6 +41,8 @@ files:
40
41
  - lib/etl/control
41
42
  - lib/etl/control.rb
42
43
  - lib/etl/engine.rb
44
+ - lib/etl/execution
45
+ - lib/etl/execution.rb
43
46
  - lib/etl/generator
44
47
  - lib/etl/generator.rb
45
48
  - lib/etl/http_tools.rb
@@ -47,6 +50,8 @@ files:
47
50
  - lib/etl/parser.rb
48
51
  - lib/etl/processor
49
52
  - lib/etl/processor.rb
53
+ - lib/etl/row.rb
54
+ - lib/etl/screen
50
55
  - lib/etl/transform
51
56
  - lib/etl/transform.rb
52
57
  - lib/etl/version.rb
@@ -60,6 +65,10 @@ files:
60
65
  - lib/etl/control/destination/file_destination.rb
61
66
  - lib/etl/control/source/database_source.rb
62
67
  - lib/etl/control/source/file_source.rb
68
+ - lib/etl/execution/base.rb
69
+ - lib/etl/execution/job.rb
70
+ - lib/etl/execution/migration.rb
71
+ - lib/etl/execution/record.rb
63
72
  - lib/etl/generator/generator.rb
64
73
  - lib/etl/generator/surrogate_key_generator.rb
65
74
  - lib/etl/parser/apache_combined_log_parser.rb
@@ -69,21 +78,30 @@ files:
69
78
  - lib/etl/parser/sax_parser.rb
70
79
  - lib/etl/parser/xml_parser.rb
71
80
  - lib/etl/processor/bulk_import_processor.rb
81
+ - lib/etl/processor/check_exist_processor.rb
82
+ - lib/etl/processor/check_unique_processor.rb
72
83
  - lib/etl/processor/copy_field_processor.rb
73
84
  - lib/etl/processor/hierarchy_exploder_processor.rb
74
85
  - lib/etl/processor/processor.rb
86
+ - lib/etl/processor/rename_processor.rb
75
87
  - lib/etl/processor/row_processor.rb
88
+ - lib/etl/processor/sequence_processor.rb
89
+ - lib/etl/processor/surrogate_key_processor.rb
76
90
  - lib/etl/processor/truncate_processor.rb
91
+ - lib/etl/screen/row_count_screen.rb
92
+ - lib/etl/transform/block_transform.rb
77
93
  - lib/etl/transform/date_to_string_transform.rb
78
94
  - lib/etl/transform/decode_transform.rb
79
95
  - lib/etl/transform/default_transform.rb
80
96
  - lib/etl/transform/foreign_key_lookup_transform.rb
81
97
  - lib/etl/transform/hierarchy_lookup_transform.rb
98
+ - lib/etl/transform/ordinalize_transform.rb
82
99
  - lib/etl/transform/sha1_transform.rb
83
100
  - lib/etl/transform/string_to_date_transform.rb
84
101
  - lib/etl/transform/string_to_datetime_transform.rb
85
102
  - lib/etl/transform/string_to_time_transform.rb
86
103
  - lib/etl/transform/transform.rb
104
+ - lib/etl/transform/trim_transform.rb
87
105
  - lib/etl/transform/type_transform.rb
88
106
  test_files: []
89
107
 
@@ -133,7 +151,7 @@ dependencies:
133
151
  requirements:
134
152
  - - ">="
135
153
  - !ruby/object:Gem::Version
136
- version: 1.0.0
154
+ version: 1.2.0
137
155
  version:
138
156
  - !ruby/object:Gem::Dependency
139
157
  name: adapter_extensions