factorylabs-activewarehouse-etl 0.9.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84) hide show
  1. data/CHANGELOG +198 -0
  2. data/LICENSE +7 -0
  3. data/README +85 -0
  4. data/Rakefile +153 -0
  5. data/TODO +28 -0
  6. data/bin/etl +28 -0
  7. data/bin/etl.cmd +8 -0
  8. data/examples/database.example.yml +16 -0
  9. data/lib/etl.rb +78 -0
  10. data/lib/etl/batch.rb +2 -0
  11. data/lib/etl/batch/batch.rb +111 -0
  12. data/lib/etl/batch/directives.rb +55 -0
  13. data/lib/etl/builder.rb +2 -0
  14. data/lib/etl/builder/date_dimension_builder.rb +96 -0
  15. data/lib/etl/builder/time_dimension_builder.rb +31 -0
  16. data/lib/etl/commands/etl.rb +89 -0
  17. data/lib/etl/control.rb +3 -0
  18. data/lib/etl/control/control.rb +405 -0
  19. data/lib/etl/control/destination.rb +420 -0
  20. data/lib/etl/control/destination/database_destination.rb +95 -0
  21. data/lib/etl/control/destination/file_destination.rb +124 -0
  22. data/lib/etl/control/source.rb +109 -0
  23. data/lib/etl/control/source/database_source.rb +220 -0
  24. data/lib/etl/control/source/enumerable_source.rb +11 -0
  25. data/lib/etl/control/source/file_source.rb +90 -0
  26. data/lib/etl/control/source/model_source.rb +39 -0
  27. data/lib/etl/core_ext.rb +1 -0
  28. data/lib/etl/core_ext/time.rb +5 -0
  29. data/lib/etl/core_ext/time/calculations.rb +42 -0
  30. data/lib/etl/engine.rb +556 -0
  31. data/lib/etl/execution.rb +20 -0
  32. data/lib/etl/execution/base.rb +9 -0
  33. data/lib/etl/execution/batch.rb +8 -0
  34. data/lib/etl/execution/job.rb +8 -0
  35. data/lib/etl/execution/migration.rb +85 -0
  36. data/lib/etl/execution/record.rb +18 -0
  37. data/lib/etl/generator.rb +2 -0
  38. data/lib/etl/generator/generator.rb +20 -0
  39. data/lib/etl/generator/surrogate_key_generator.rb +39 -0
  40. data/lib/etl/http_tools.rb +139 -0
  41. data/lib/etl/parser.rb +11 -0
  42. data/lib/etl/parser/apache_combined_log_parser.rb +49 -0
  43. data/lib/etl/parser/delimited_parser.rb +74 -0
  44. data/lib/etl/parser/fixed_width_parser.rb +65 -0
  45. data/lib/etl/parser/parser.rb +41 -0
  46. data/lib/etl/parser/sax_parser.rb +218 -0
  47. data/lib/etl/parser/xml_parser.rb +65 -0
  48. data/lib/etl/processor.rb +11 -0
  49. data/lib/etl/processor/block_processor.rb +14 -0
  50. data/lib/etl/processor/bulk_import_processor.rb +81 -0
  51. data/lib/etl/processor/check_exist_processor.rb +80 -0
  52. data/lib/etl/processor/check_unique_processor.rb +35 -0
  53. data/lib/etl/processor/copy_field_processor.rb +26 -0
  54. data/lib/etl/processor/encode_processor.rb +55 -0
  55. data/lib/etl/processor/hierarchy_exploder_processor.rb +55 -0
  56. data/lib/etl/processor/print_row_processor.rb +12 -0
  57. data/lib/etl/processor/processor.rb +25 -0
  58. data/lib/etl/processor/rename_processor.rb +24 -0
  59. data/lib/etl/processor/require_non_blank_processor.rb +26 -0
  60. data/lib/etl/processor/row_processor.rb +17 -0
  61. data/lib/etl/processor/sequence_processor.rb +23 -0
  62. data/lib/etl/processor/surrogate_key_processor.rb +53 -0
  63. data/lib/etl/processor/truncate_processor.rb +35 -0
  64. data/lib/etl/row.rb +20 -0
  65. data/lib/etl/screen.rb +14 -0
  66. data/lib/etl/screen/row_count_screen.rb +20 -0
  67. data/lib/etl/transform.rb +2 -0
  68. data/lib/etl/transform/block_transform.rb +13 -0
  69. data/lib/etl/transform/date_to_string_transform.rb +20 -0
  70. data/lib/etl/transform/decode_transform.rb +51 -0
  71. data/lib/etl/transform/default_transform.rb +20 -0
  72. data/lib/etl/transform/foreign_key_lookup_transform.rb +151 -0
  73. data/lib/etl/transform/hierarchy_lookup_transform.rb +49 -0
  74. data/lib/etl/transform/ordinalize_transform.rb +12 -0
  75. data/lib/etl/transform/sha1_transform.rb +13 -0
  76. data/lib/etl/transform/string_to_date_transform.rb +16 -0
  77. data/lib/etl/transform/string_to_datetime_transform.rb +14 -0
  78. data/lib/etl/transform/string_to_time_transform.rb +11 -0
  79. data/lib/etl/transform/transform.rb +61 -0
  80. data/lib/etl/transform/trim_transform.rb +26 -0
  81. data/lib/etl/transform/type_transform.rb +35 -0
  82. data/lib/etl/util.rb +59 -0
  83. data/lib/etl/version.rb +9 -0
  84. metadata +195 -0
@@ -0,0 +1,80 @@
1
module ETL #:nodoc:
  module Processor #:nodoc:
    # A row-level processor that checks if the row already exists in the
    # target table. +process+ returns the row when no matching record is
    # found and nil when a matching record exists.
    class CheckExistProcessor < ETL::Processor::RowProcessor
      # A symbol or array of symbols representing keys that should be skipped
      attr_accessor :skip

      # The target database
      attr_accessor :target

      # The name of the table to check against
      attr_accessor :table

      # An array of columns representing the natural key
      attr_accessor :columns

      # Is set to true if the processor should execute the check. It is
      # computed once, at initialization, and is false when the target table
      # contained no rows at that time.
      attr_accessor :should_check

      # Initialize the processor
      # Configuration options:
      # * <tt>:skip</tt>: A symbol or array of column names that should not
      #   be checked
      # * <tt>:target</tt>: The target connection (required)
      # * <tt>:table</tt>: The table name (required)
      # * <tt>:columns</tt>: An array of columns which represent the natural
      #   key. When omitted every key in the row is used.
      #
      # Raises ETL::ControlError if :target or :table is missing.
      def initialize(control, configuration)
        super
        @skip = configuration[:skip] || []
        @target = configuration[:target] || raise(ETL::ControlError, "target must be specified")
        @table = configuration[:table] || raise(ETL::ControlError, "table must be specified")
        @columns = configuration[:columns]

        # An empty target table cannot contain the row, so the per-row lookup
        # is disabled up front in that case.
        q = "SELECT COUNT(*) FROM #{table_name}"
        @should_check = ETL::Engine.connection(target).select_value(q).to_i > 0
      end

      # Return true if the given key should be skipped
      def skip?(key)
        case skip
        when Array
          skip.include?(key)
        else
          skip.to_sym == key.to_sym
        end
      end

      # Return true if the row should be checked
      def should_check?
        @should_check ? true : false
      end

      # Process the row. Returns the row if it should continue through the
      # pipeline (no check needed, or no matching record found) and nil if a
      # matching record already exists in the target table.
      #
      # Raises ETL::ControlError when none of the row's keys can be used to
      # build a condition, because the lookup would otherwise be sent to the
      # database as malformed SQL.
      def process(row)
        return row unless should_check?
        conn = ETL::Engine.connection(target)

        conditions = []
        row.each do |k, v|
          next unless columns.nil? || columns.include?(k.to_sym)
          next if skip?(k.to_sym)
          # "col = NULL" never matches in SQL, so nil values must be compared
          # with IS NULL for existing records to be found.
          conditions << (v.nil? ? "#{k} IS NULL" : "#{k} = #{conn.quote(v)}")
        end

        if conditions.empty?
          raise ETL::ControlError, "no columns available to check against table #{table}; verify the :columns and :skip options"
        end

        q = "SELECT * FROM #{table_name} WHERE #{conditions.join(' AND ')} LIMIT 1"
        conn.select_one(q).nil? ? row : nil
      end

      private
      # Resolve the table name through the engine for the target connection
      def table_name
        ETL::Engine.table(table, ETL::Engine.connection(target))
      end
    end
  end
end
@@ -0,0 +1,35 @@
1
module ETL #:nodoc:
  module Processor #:nodoc:
    # Row processor that checks whether or not the row has already passed
    # through the ETL processor, using the key fields provided as the keys
    # to check.
    class CheckUniqueProcessor < ETL::Processor::RowProcessor

      # The keys to check
      attr_accessor :keys

      # Initialize the processor
      # Configuration options:
      # * <tt>:keys</tt>: An array of keys to check against
      def initialize(control, configuration)
        super
        @keys = configuration[:keys]
      end

      # A Hash of key combinations that have already been processed. Each
      # hash key is an Array holding the string form of every key field, in
      # the order given by +keys+.
      def compound_key_constraints
        @compound_key_constraints ||= {}
      end

      # Process the row. This implementation will only return a row if its
      # key combination has not already been seen; otherwise it returns nil.
      def process(row)
        # Use the array of values as the lookup key rather than a delimited
        # string: joining with '|' makes ("a|b", "c") and ("a", "b|c")
        # indistinguishable. Values are still compared by their string form.
        key = keys.collect { |k| row[k].to_s }
        return nil if compound_key_constraints[key]

        compound_key_constraints[key] = 1
        row
      end
    end
  end
end
@@ -0,0 +1,26 @@
1
module ETL #:nodoc:
  module Processor #:nodoc:
    # Row processor that will copy one field to another
    #
    # Configuration options:
    # * <tt>:destination</tt>: The destination field
    # * <tt>:dest</tt>: Alias for :destination
    # * <tt>:source</tt>: The source field
    class CopyFieldProcessor < ETL::Processor::RowProcessor
      # Process the given row: store a copy of the source field's value
      # under the destination field and return the row.
      def process(row)
        destination = (configuration[:destination] || configuration[:dest])
        row[destination] = copy_of(row[configuration[:source]])
        row
      end

      private
      # Return a value that can be stored independently of +value+.
      # nil, numbers, symbols and booleans are returned as-is: they are
      # immutable, and calling +dup+ on them raises TypeError on older Ruby
      # versions. Everything else is duplicated so later in-place changes to
      # one field do not affect the other.
      def copy_of(value)
        case value
        when nil, Numeric, Symbol, true, false
          value
        else
          value.dup
        end
      end
    end
  end
end
@@ -0,0 +1,55 @@
1
+ require 'iconv'
2
+
3
module ETL #:nodoc:
  module Processor #:nodoc:
    # The encode processor uses Iconv to convert a file from one encoding (eg: utf-8) to another (eg: latin1), line by line.
    class EncodeProcessor < ETL::Processor::Processor

      # The file to load from
      attr_reader :source_file
      # The file to write to
      attr_reader :target_file
      # The source file encoding
      attr_reader :source_encoding
      # The target file encoding
      attr_reader :target_encoding

      # Initialize the processor.
      #
      # Configuration options:
      # * <tt>:source_file</tt>: The file to load data from
      # * <tt>:source_encoding</tt>: The source file encoding (eg: 'latin1','utf-8'), as supported by Iconv
      # * <tt>:target_file</tt>: The file to write data to
      # * <tt>:target_encoding</tt>: The target file encoding
      #
      # Both file names are resolved relative to the directory of the control
      # file. Raises ControlError when a file or encoding is missing, when
      # both files resolve to the same path, or when Iconv rejects one of the
      # encodings.
      def initialize(control, configuration)
        super
        raise ControlError, "Source file must be specified" if configuration[:source_file].nil?
        raise ControlError, "Target file must be specified" if configuration[:target_file].nil?
        # Report a missing encoding as a configuration problem instead of
        # letting Iconv.new fail on a nil argument.
        raise ControlError, "Source encoding must be specified" if configuration[:source_encoding].nil?
        raise ControlError, "Target encoding must be specified" if configuration[:target_encoding].nil?
        @source_file = File.join(File.dirname(control.file), configuration[:source_file])
        @source_encoding = configuration[:source_encoding]
        @target_file = File.join(File.dirname(control.file), configuration[:target_file])
        @target_encoding = configuration[:target_encoding]
        raise ControlError, "Source and target file cannot currently point to the same file" if source_file == target_file
        begin
          @iconv = Iconv.new(target_encoding,source_encoding)
        rescue Iconv::InvalidEncoding
          raise ControlError, "Either the source encoding '#{source_encoding}' or the target encoding '#{target_encoding}' is not supported"
        end
      end

      # Execute the processor: read the source file line by line, convert
      # each line and write it to the target file.
      def process
        # operate line by line to handle large files without loading them in-memory
        # could be replaced by a system iconv call when available, for greater performance
        File.open(source_file) do |source|
          #puts "Opening #{target_file}"
          File.open(target_file,'w') do |target|
            source.each_line do |line|
              target << @iconv.iconv(line)
            end
            # Passing nil resets the converter and returns whatever bytes are
            # needed to return the output to its initial shift state; without
            # it, stateful target encodings are left unterminated and the
            # converter carries state into the next run.
            target << @iconv.iconv(nil)
          end
        end
      end
    end
  end
end
@@ -0,0 +1,55 @@
1
module ETL #:nodoc:
  module Processor #:nodoc:
    # Row-level processor that will convert a single row into multiple rows designed to be inserted
    # into a hierarchy bridge table.
    class HierarchyExploderProcessor < ETL::Processor::RowProcessor
      attr_accessor :id_field
      attr_accessor :parent_id_field

      # Initialize the processor
      #
      # Configuration options:
      # * <tt>:target</tt>: The target connection used to look up children
      # * <tt>:table</tt>: The table holding the hierarchy
      # * <tt>:id_field</tt>: The name of the id column (defaults to 'id')
      # * <tt>:parent_id_field</tt>: The name of the parent id column (defaults to 'parent_id')
      #
      # TODO: Allow resolver to be implemented in a customizable fashion, i.e. don't rely
      # on AR as the only resolution method.
      def initialize(control, configuration={})
        @id_field = configuration[:id_field] || 'id'
        @parent_id_field = configuration[:parent_id_field] || 'parent_id'
        super
      end

      # Process the row expanding it into hierarchy values. Returns an array
      # of hashes, one for the row itself (level 0) and one for every
      # descendant found in the configured table.
      #
      # The row is read through its :id and :parent_id keys. Raises
      # ETL::ControlError when the row has no :id value, since no lookup can
      # be built without one.
      def process(row)
        if row[:id].nil?
          raise ETL::ControlError, "row has no :id value, cannot explode hierarchy"
        end

        rows = []
        target = configuration[:target]
        table = configuration[:table]
        conn = ETL::Engine.connection(target)
        build_rows([row[:id]], row[:id], row[:id], row[:parent_id].nil?, 0, rows, table, conn)
        rows
      end

      protected
      # Recursive function that will add a row for the current level and then call build_rows
      # for all of the children of the current level.
      #
      # +row_id+ is the id of the row being exploded and is written as
      # :parent_id on every generated row; +parent_id+ is accepted but not
      # used. NOTE: there is no cycle detection, so a hierarchy in which a
      # node is its own ancestor recurses without bound.
      def build_rows(ids, parent_id, row_id, root, level, rows, table, conn)
        ids.each do |id|
          # Quote the id through the adapter so non-numeric ids produce valid
          # SQL and row data is never interpolated raw into the statement.
          child_ids = conn.select_values("SELECT #{id_field} FROM #{table} WHERE #{parent_id_field} = #{conn.quote(id)}")

          row = {
            :parent_id => row_id,
            :child_id => id,
            :num_levels_from_parent => level,
            :is_bottom => (child_ids.empty? ? 1 : 0),
            :is_top => (root ? 1 : 0)
          }
          rows << row

          build_rows(child_ids, id, row_id, false, level + 1, rows, table, conn)
        end
      end
    end
  end
end
@@ -0,0 +1,12 @@
1
module ETL #:nodoc:
  module Processor #:nodoc:
    # Debugging aid: writes the inspected form of each row to standard
    # output and hands the row back untouched.
    class PrintRowProcessor < ETL::Processor::RowProcessor
      # Print the row and return it
      def process(row)
        description = row.inspect
        $stdout.puts(description)
        row
      end
    end
  end
end
@@ -0,0 +1,25 @@
1
module ETL #:nodoc:
  module Processor #:nodoc:
    # Base class for pre and post processors. Subclasses must implement the +process+ method.
    class Processor
      # Store the control object and the configuration Hash, then invoke the
      # optional +after_initialize+ hook if the instance defines one.
      def initialize(control, configuration)
        @control = control
        @configuration = configuration
        # Pass true so private and protected hooks are detected as well;
        # a plain respond_to? only sees public methods, which silently skips
        # a hook defined below a +protected+ or +private+ marker.
        after_initialize if respond_to?(:after_initialize, true)
      end
      protected
      # Get the control object
      def control
        @control
      end
      # Get the configuration Hash
      def configuration
        @configuration
      end
      # Get the engine logger
      def log
        Engine.logger
      end
    end
  end
end
@@ -0,0 +1,24 @@
1
module ETL #:nodoc:
  module Processor #:nodoc:
    # Row level processor to rename a field in the row.
    #
    # Configuration options:
    # * <tt>:source</tt>: the source field name
    # * <tt>:dest</tt>: The destination field name
    # * <tt>:destination</tt>: Alias for :dest, used when :dest is not given
    class RenameProcessor < ETL::Processor::RowProcessor
      # Move the value stored under the source field to the destination
      # field and return the row. A missing source field results in a nil
      # destination value.
      def process(row)
        source = configuration[:source]
        dest = configuration[:dest] || configuration[:destination]

        # Renaming a field to itself must be a no-op; assigning and then
        # deleting the same key would drop the field from the row.
        return row if source == dest

        source_value = row[source]
        case source_value
        when nil, Numeric, Symbol, true, false
          # Immutable values are stored as-is; calling dup on them raises
          # TypeError on older Ruby versions.
          row[dest] = source_value
        else
          row[dest] = source_value.dup
        end
        row.delete(source)
        row
      end
    end
  end
end
@@ -0,0 +1,26 @@
1
module ETL #:nodoc:
  module Processor #:nodoc:
    # Row processor that keeps a row only when every configured field holds
    # a non-blank value.
    class RequireNonBlankProcessor < ETL::Processor::RowProcessor
      # The list of fields that must be non-blank
      attr_reader :fields

      # Set up the processor
      #
      # Options:
      # * <tt>:fields</tt>: An array of fields to check, for example:
      #   [:first_name,:last_name]. Defaults to an empty array.
      def initialize(control, configuration)
        super
        @fields = configuration[:fields] || []
      end

      # Return the row when none of the checked fields is blank, and nil as
      # soon as a blank field is found.
      def process(row)
        has_blank = fields.any? { |name| row[name].blank? }
        has_blank ? nil : row
      end
    end
  end
end
@@ -0,0 +1,17 @@
1
module ETL #:nodoc:
  module Processor #:nodoc:
    # Processor which processes a specific row. Unlike a transformer, which deals with a specific
    # value in the row, row processors can process an entire row at once, which can be used to
    # explode a single row into multiple rows (for example)
    #
    # The superclass is fully qualified because a bare +Processor+ inside
    # this namespace falls back to the ETL::Processor module itself when the
    # base class has not been loaded yet.
    class RowProcessor < ETL::Processor::Processor
      # Initialize the processor
      def initialize(control, configuration)
        super
      end
      # Process the specified row. This method must return the row.
      # Subclasses must override it; the base implementation always raises.
      def process(row)
        raise "process is an abstract method"
      end
    end
  end
end
@@ -0,0 +1,23 @@
1
module ETL #:nodoc:
  module Processor #:nodoc:
    # Row level processor that writes an incrementing counter into a field.
    #
    # Configuration options:
    # * <tt>:context</tt>: A context name used to select the counter. When
    #   omitted, the counter stored under the nil context is used.
    # * <tt>:dest</tt>: The destination field name
    class SequenceProcessor < ETL::Processor::RowProcessor
      # Advance the counter for the configured context, store the new value
      # in the destination field and return the row. Counting starts at 1.
      def process(row)
        context = configuration[:context]
        next_value = (sequences[context] || 0) + 1
        sequences[context] = next_value
        row[configuration[:dest]] = next_value
        row
      end

      protected
      # The Hash of counters held by this processor instance, keyed by context
      def sequences
        @sequences ||= {}
      end
    end
  end
end
@@ -0,0 +1,53 @@
1
module ETL #:nodoc:
  module Processor #:nodoc:
    # A row level processor that provides surrogate keys
    class SurrogateKeyProcessor < ETL::Processor::RowProcessor
      attr_accessor :destination
      attr_accessor :table
      attr_accessor :column
      attr_accessor :target

      # Initialize the surrogate key generator
      #
      # Configuration options
      # * <tt>:table</tt>: The table to read the current maximum key from.
      #   When omitted the sequence starts from 0.
      # * <tt>:column</tt>: The key column in that table (defaults to 'id')
      # * <tt>:target</tt>: The target connection used to query the table
      # * <tt>:destination</tt>: The destination column name (defaults to :id)
      #
      # The <tt>:query</tt> option is no longer supported; passing it raises
      # ControlError.
      def initialize(control, configuration)
        super
        @table = configuration[:table]
        @column = configuration[:column] || 'id'
        @target = configuration[:target]
        if configuration[:query]
          raise ControlError, "Query option is no longer valid, use :column and :table instead"
        end
        if table
          @surrogate_key = ETL::Engine.connection(target).select_value("SELECT max(#{column}) FROM #{table_name}")
        end
        #puts "initial surrogate key: #{@surrogate_key}"
        # No table configured, or max() returned nothing: start from 0
        @surrogate_key = 0 if @surrogate_key.blank?
        @surrogate_key = @surrogate_key.to_i
        #puts "surrogate key: #{@surrogate_key}"
        @destination = configuration[:destination] || :id
      end

      # Add a surrogate key to the row. The counter is incremented and stored
      # under the destination field; a nil row is passed over and nil is
      # returned.
      def process(row)
        if row
          #puts "processing row #{row.inspect}"
          @surrogate_key += 1
          #puts "adding surrogate key to row: #{@surrogate_key}"
          row[destination] = @surrogate_key
          row
        end
      end

      private
      # Resolve the table name through the engine for the target connection
      def table_name
        ETL::Engine.table(table, ETL::Engine.connection(target))
      end
    end
  end
end