colincasey-activewarehouse-etl 0.9.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (86) hide show
  1. data/CHANGELOG +198 -0
  2. data/LICENSE +7 -0
  3. data/README +85 -0
  4. data/Rakefile +75 -0
  5. data/TODO +28 -0
  6. data/VERSION.yml +4 -0
  7. data/bin/etl +28 -0
  8. data/bin/etl.cmd +8 -0
  9. data/lib/etl.rb +81 -0
  10. data/lib/etl/batch.rb +2 -0
  11. data/lib/etl/batch/batch.rb +111 -0
  12. data/lib/etl/batch/directives.rb +55 -0
  13. data/lib/etl/builder.rb +2 -0
  14. data/lib/etl/builder/date_dimension_builder.rb +96 -0
  15. data/lib/etl/builder/time_dimension_builder.rb +31 -0
  16. data/lib/etl/commands/etl.rb +89 -0
  17. data/lib/etl/control.rb +3 -0
  18. data/lib/etl/control/control.rb +414 -0
  19. data/lib/etl/control/destination.rb +420 -0
  20. data/lib/etl/control/destination/csv_destination.rb +84 -0
  21. data/lib/etl/control/destination/database_destination.rb +95 -0
  22. data/lib/etl/control/destination/file_destination.rb +124 -0
  23. data/lib/etl/control/destination/yaml_destination.rb +74 -0
  24. data/lib/etl/control/source.rb +109 -0
  25. data/lib/etl/control/source/database_source.rb +220 -0
  26. data/lib/etl/control/source/enumerable_source.rb +11 -0
  27. data/lib/etl/control/source/file_source.rb +90 -0
  28. data/lib/etl/control/source/model_source.rb +39 -0
  29. data/lib/etl/core_ext.rb +1 -0
  30. data/lib/etl/core_ext/time.rb +5 -0
  31. data/lib/etl/core_ext/time/calculations.rb +42 -0
  32. data/lib/etl/engine.rb +574 -0
  33. data/lib/etl/execution.rb +20 -0
  34. data/lib/etl/execution/base.rb +9 -0
  35. data/lib/etl/execution/batch.rb +8 -0
  36. data/lib/etl/execution/job.rb +8 -0
  37. data/lib/etl/execution/migration.rb +85 -0
  38. data/lib/etl/generator.rb +2 -0
  39. data/lib/etl/generator/generator.rb +20 -0
  40. data/lib/etl/generator/surrogate_key_generator.rb +39 -0
  41. data/lib/etl/http_tools.rb +139 -0
  42. data/lib/etl/parser.rb +11 -0
  43. data/lib/etl/parser/apache_combined_log_parser.rb +49 -0
  44. data/lib/etl/parser/delimited_parser.rb +74 -0
  45. data/lib/etl/parser/fixed_width_parser.rb +65 -0
  46. data/lib/etl/parser/parser.rb +41 -0
  47. data/lib/etl/parser/sax_parser.rb +218 -0
  48. data/lib/etl/parser/spreadsheet_parser.rb +114 -0
  49. data/lib/etl/parser/xml_parser.rb +65 -0
  50. data/lib/etl/processor.rb +11 -0
  51. data/lib/etl/processor/block_processor.rb +14 -0
  52. data/lib/etl/processor/bulk_import_processor.rb +81 -0
  53. data/lib/etl/processor/check_exist_processor.rb +80 -0
  54. data/lib/etl/processor/check_unique_processor.rb +35 -0
  55. data/lib/etl/processor/copy_field_processor.rb +26 -0
  56. data/lib/etl/processor/encode_processor.rb +55 -0
  57. data/lib/etl/processor/hierarchy_exploder_processor.rb +55 -0
  58. data/lib/etl/processor/print_row_processor.rb +12 -0
  59. data/lib/etl/processor/processor.rb +25 -0
  60. data/lib/etl/processor/rename_processor.rb +24 -0
  61. data/lib/etl/processor/require_non_blank_processor.rb +26 -0
  62. data/lib/etl/processor/row_processor.rb +17 -0
  63. data/lib/etl/processor/sequence_processor.rb +23 -0
  64. data/lib/etl/processor/surrogate_key_processor.rb +53 -0
  65. data/lib/etl/processor/truncate_processor.rb +35 -0
  66. data/lib/etl/row.rb +20 -0
  67. data/lib/etl/screen.rb +14 -0
  68. data/lib/etl/screen/row_count_screen.rb +20 -0
  69. data/lib/etl/transform.rb +2 -0
  70. data/lib/etl/transform/block_transform.rb +13 -0
  71. data/lib/etl/transform/date_to_string_transform.rb +20 -0
  72. data/lib/etl/transform/decode_transform.rb +51 -0
  73. data/lib/etl/transform/default_transform.rb +20 -0
  74. data/lib/etl/transform/foreign_key_lookup_transform.rb +151 -0
  75. data/lib/etl/transform/hierarchy_lookup_transform.rb +49 -0
  76. data/lib/etl/transform/ordinalize_transform.rb +12 -0
  77. data/lib/etl/transform/sha1_transform.rb +13 -0
  78. data/lib/etl/transform/string_to_date_transform.rb +16 -0
  79. data/lib/etl/transform/string_to_datetime_transform.rb +14 -0
  80. data/lib/etl/transform/string_to_time_transform.rb +11 -0
  81. data/lib/etl/transform/transform.rb +61 -0
  82. data/lib/etl/transform/trim_transform.rb +26 -0
  83. data/lib/etl/transform/type_transform.rb +35 -0
  84. data/lib/etl/util.rb +59 -0
  85. data/lib/etl/version.rb +10 -0
  86. metadata +224 -0
@@ -0,0 +1,11 @@
1
+ # This source file contains the ETL::Processor module and requires all of the processors
2
+
3
module ETL #:nodoc:
  # Namespace for the ETL processors: the ones invoked for each row as well
  # as the ones invoked once over bulk data.
  module Processor
  end
end
8
+
9
+ require 'etl/processor/processor'
10
+ require 'etl/processor/row_processor'
11
+ Dir[File.dirname(__FILE__) + "/processor/*.rb"].each { |file| require(file) }
@@ -0,0 +1,14 @@
1
module ETL
  module Processor
    # Processor that delegates its work to the callable supplied in the
    # <tt>:block</tt> configuration option. It can be used as a RowProcessor
    # (called for each row after it is read) or as a plain Processor (called
    # once during pre- or post-processing, in which case no row is passed).
    class BlockProcessor < ETL::Processor::RowProcessor
      # Store the callable found under <tt>:block</tt> in the configuration.
      def initialize(control, configuration)
        super
        @block = configuration[:block]
      end

      # Invoke the stored callable with the row (nil when no row is given)
      # and return whatever the callable returns.
      def process(row = nil)
        @block.call(row)
      end
    end
  end
end
@@ -0,0 +1,81 @@
1
module ETL #:nodoc:
  module Processor #:nodoc:
    # Processor which is used to bulk import a data file into a table of a
    # target database. The connection returned by ETL::Engine.connection for
    # the target must respond to +bulk_load+ (and to +truncate+ when
    # truncation is requested).
    class BulkImportProcessor < ETL::Processor::Processor

      # The file to load from
      attr_reader :file
      # The target database
      attr_reader :target
      # The table name
      attr_reader :table
      # Set to true to truncate
      attr_reader :truncate
      # Array of symbols representing the column load order
      attr_reader :columns
      # The field separator (defaults to a comma)
      attr_accessor :field_separator
      # The field enclosure (defaults to nil)
      attr_accessor :field_enclosure
      # The line separator (defaults to a newline)
      attr_accessor :line_separator
      # The string that indicates a NULL (defaults to an empty string)
      attr_accessor :null_string

      # Initialize the processor.
      #
      # Configuration options:
      # * <tt>:file</tt>: The file to load data from, relative to the
      #   directory of the control file
      # * <tt>:target</tt>: The target database
      # * <tt>:table</tt>: The table name
      # * <tt>:truncate</tt>: Set to true to truncate before loading
      # * <tt>:columns</tt>: The columns to load in the order they appear in
      #   the bulk data file
      # * <tt>:field_separator</tt>: The field separator. Defaults to a comma
      # * <tt>:line_separator</tt>: The line separator. Defaults to a newline
      # * <tt>:field_enclosure</tt>: The field enclosure characters
      # * <tt>:null_string</tt>: The string that indicates a NULL. Defaults to
      #   an empty string
      #
      # Raises ControlError if the file, target or table is not specified.
      def initialize(control, configuration)
        super
        # Fail with a configuration error instead of the TypeError that
        # File.join would raise for a nil path segment.
        raise ControlError, "File must be specified" unless configuration[:file]

        @file = File.join(File.dirname(control.file), configuration[:file])
        @target = configuration[:target]
        @table = configuration[:table]
        # Read the flag without writing a default back into the caller's Hash.
        @truncate = configuration[:truncate] || false
        @columns = configuration[:columns]
        @field_separator = (configuration[:field_separator] || ',')
        @line_separator = (configuration[:line_separator] || "\n")
        @null_string = (configuration[:null_string] || "")
        @field_enclosure = configuration[:field_enclosure]

        raise ControlError, "Target must be specified" unless @target
        raise ControlError, "Table must be specified" unless @table
      end

      # Execute the processor. Nothing is done when the engine is configured
      # to skip bulk imports or when the data file is empty; otherwise the
      # optional truncate and the bulk load run inside one transaction.
      def process
        return if ETL::Engine.skip_bulk_import
        return if File.size(file) == 0

        conn = ETL::Engine.connection(target)
        conn.transaction do
          conn.truncate(table_name) if truncate
          conn.bulk_load(file, table_name, bulk_load_options)
        end
      end

      # The table name as resolved by ETL::Engine.table for the target
      # connection.
      def table_name
        ETL::Engine.table(table, ETL::Engine.connection(target))
      end

      private

      # Build the options Hash handed to +bulk_load+. The <tt>:fields</tt>
      # entry is only present when at least one field setting is set.
      def bulk_load_options
        fields = {}
        fields[:null_string] = null_string if null_string
        fields[:delimited_by] = field_separator if field_separator
        fields[:enclosed_by] = field_enclosure if field_enclosure
        fields[:terminated_by] = line_separator if line_separator

        options = { :columns => columns }
        options[:fields] = fields unless fields.empty?
        options
      end
    end
  end
end
@@ -0,0 +1,80 @@
1
module ETL #:nodoc:
  module Processor #:nodoc:
    # A row-level processor that checks if the row already exists in the
    # target table. Rows that already exist are dropped (nil is returned).
    class CheckExistProcessor < ETL::Processor::RowProcessor
      # A symbol or array of symbols representing keys that should be skipped
      attr_accessor :skip

      # The target database
      attr_accessor :target

      # The name of the table to check against
      attr_accessor :table

      # An array of columns representing the natural key
      attr_accessor :columns

      # Is set to true if the processor should execute the check. It is false
      # when the target table had no rows at initialization time.
      attr_accessor :should_check

      # Initialize the processor
      #
      # Configuration options:
      # * <tt>:skip</tt>: A symbol or array of column names that should not
      #   be checked
      # * <tt>:target</tt>: The target database (required)
      # * <tt>:table</tt>: The table name (required)
      # * <tt>:columns</tt>: An array of columns which represent the natural
      #   key
      #
      # Raises ETL::ControlError if the target or table is not specified.
      def initialize(control, configuration)
        super
        @skip = configuration[:skip] || []
        @target = configuration[:target] || raise(ETL::ControlError, "target must be specified")
        @table = configuration[:table] || raise(ETL::ControlError, "table must be specified")
        @columns = configuration[:columns]

        # An empty target table cannot contain the row, so the per-row query
        # is only needed when the table already holds data.
        q = "SELECT COUNT(*) FROM #{table_name}"
        @should_check = ETL::Engine.connection(target).select_value(q).to_i > 0
      end

      # Return true if the given key should be skipped. Both the configured
      # entries and the key are compared as symbols, so a single name, an
      # array of names, strings and symbols are all accepted.
      def skip?(key)
        Array(skip).any? { |skipped| skipped.to_sym == key.to_sym }
      end

      # Return true if the row should be checked
      def should_check?
        @should_check ? true : false
      end

      # Process the row. Returns the row when no matching record is found in
      # the target table, and nil when a matching record already exists.
      #
      # Raises ETL::ControlError when the row offers no column to compare,
      # because the resulting query would have an empty WHERE clause.
      def process(row)
        return row unless should_check?

        conn = ETL::Engine.connection(target)
        conditions = []
        row.each do |k, v|
          next unless columns.nil? || columns.include?(k.to_sym)
          next if skip?(k)
          # "column = NULL" never matches in SQL; NULL needs IS NULL.
          conditions << (v.nil? ? "#{k} IS NULL" : "#{k} = #{conn.quote(v)}")
        end

        if conditions.empty?
          raise ETL::ControlError, "no columns available to check for an existing row in #{table}"
        end

        q = "SELECT * FROM #{table_name} WHERE #{conditions.join(' AND ')} LIMIT 1"
        conn.select_one(q).nil? ? row : nil
      end

      private

      # The table name as resolved by ETL::Engine.table for the target
      # connection.
      def table_name
        ETL::Engine.table(table, ETL::Engine.connection(target))
      end
    end
  end
end
@@ -0,0 +1,35 @@
1
module ETL #:nodoc:
  module Processor #:nodoc:
    # Row processor that checks whether or not the row has already passed
    # through the ETL processor, using the key fields provided as the keys
    # to check.
    class CheckUniqueProcessor < ETL::Processor::RowProcessor

      # The keys to check
      attr_accessor :keys

      # Initialize the processor
      #
      # Configuration options:
      # * <tt>:keys</tt>: An array of keys to check against
      def initialize(control, configuration)
        super
        @keys = configuration[:keys]
      end

      # A Hash of compound keys that have already been processed.
      def compound_key_constraints
        @compound_key_constraints ||= {}
      end

      # Process the row. This implementation will only return a row if its
      # key combination has not already been seen; otherwise nil is returned.
      def process(row)
        key = compound_key_for(row)
        return if compound_key_constraints[key]

        compound_key_constraints[key] = 1
        row
      end

      private

      # Build the compound key for the row: the key field values joined with
      # '|'. Any '|' or '\' inside a value is backslash-escaped so that
      # different value combinations (for example ["a|b", "c"] and
      # ["a", "b|c"]) can never produce the same compound key. Values without
      # those characters yield the plain joined string.
      def compound_key_for(row)
        keys.map { |k| row[k].to_s.gsub(/[\\|]/) { |ch| "\\#{ch}" } }.join('|')
      end
    end
  end
end
@@ -0,0 +1,26 @@
1
module ETL #:nodoc:
  module Processor #:nodoc:
    # Row processor that copies the value of one field into another field of
    # the same row.
    #
    # Configuration options:
    # * <tt>:destination</tt>: The destination field
    # * <tt>:dest</tt>: Alias for :destination
    # * <tt>:source</tt>: The source field
    class CopyFieldProcessor < ETL::Processor::RowProcessor
      # Copy the source field into the destination field and return the row.
      # Numeric and nil values are assigned directly; every other value is
      # duplicated so the two fields do not share one object.
      def process(row)
        target_field = configuration[:destination] || configuration[:dest]
        original = row[configuration[:source]]

        row[target_field] =
          if original.nil? || original.is_a?(Numeric)
            original
          else
            original.dup
          end

        row
      end
    end
  end
end
@@ -0,0 +1,55 @@
1
+ require 'iconv'
2
+
3
module ETL #:nodoc:
  module Processor #:nodoc:
    # Processor that re-encodes a file with Iconv, reading the source file
    # line by line and writing the converted lines to the target file
    # (eg: from utf-8 to latin1).
    class EncodeProcessor < ETL::Processor::Processor

      # The file to load from
      attr_reader :source_file
      # The file to write to
      attr_reader :target_file
      # The source file encoding
      attr_reader :source_encoding
      # The target file encoding
      attr_reader :target_encoding

      # Initialize the processor.
      #
      # Configuration options:
      # * <tt>:source_file</tt>: The file to load data from
      # * <tt>:source_encoding</tt>: The source file encoding (eg: 'latin1','utf-8'), as supported by Iconv
      # * <tt>:target_file</tt>: The file to write data to
      # * <tt>:target_encoding</tt>: The target file encoding
      #
      # Both files are resolved relative to the directory of the control
      # file. Raises ControlError when a file is missing from the
      # configuration, when both files resolve to the same path, or when
      # Iconv rejects one of the encodings.
      def initialize(control, configuration)
        super
        raise ControlError, "Source file must be specified" if configuration[:source_file].nil?
        raise ControlError, "Target file must be specified" if configuration[:target_file].nil?

        base_dir = File.dirname(control.file)
        @source_file = File.join(base_dir, configuration[:source_file])
        @target_file = File.join(base_dir, configuration[:target_file])
        @source_encoding = configuration[:source_encoding]
        @target_encoding = configuration[:target_encoding]

        if source_file == target_file
          raise ControlError, "Source and target file cannot currently point to the same file"
        end

        begin
          @iconv = Iconv.new(target_encoding,source_encoding)
        rescue Iconv::InvalidEncoding
          raise ControlError, "Either the source encoding '#{source_encoding}' or the target encoding '#{target_encoding}' is not supported"
        end
      end

      # Execute the processor.
      #
      # The conversion works one line at a time so that large files never
      # have to be held in memory. A system iconv call could replace this
      # for greater performance where one is available.
      def process
        File.open(source_file) do |input|
          File.open(target_file,'w') do |output|
            input.each_line { |line| output << @iconv.iconv(line) }
          end
        end
      end
    end
  end
end
@@ -0,0 +1,55 @@
1
module ETL #:nodoc:
  module Processor #:nodoc:
    # Row-level processor that will convert a single row into multiple rows designed to be inserted
    # into a hierarchy bridge table.
    class HierarchyExploderProcessor < ETL::Processor::RowProcessor
      attr_accessor :id_field
      attr_accessor :parent_id_field

      # Initialize the processor
      #
      # Configuration options:
      # * <tt>:target</tt>: The target database whose connection is queried
      # * <tt>:table</tt>: The name of the table holding the hierarchy
      # * <tt>:id_field</tt>: The name of the id field (defaults to 'id')
      # * <tt>:parent_id_field</tt>: The name of the parent id field (defaults to 'parent_id')
      #
      # TODO: Allow resolver to be implemented in a customizable fashion, i.e. don't rely
      # on AR as the only resolution method.
      def initialize(control, configuration={})
        @id_field = configuration[:id_field] || 'id'
        @parent_id_field = configuration[:parent_id_field] || 'parent_id'
        super
      end

      # Process the row expanding it into hierarchy values. Returns an Array
      # of Hashes: one for the row itself (level 0) plus one for every
      # descendant found in the table.
      #
      # NOTE(review): the row is always read through the :id and :parent_id
      # keys, regardless of the configured id_field / parent_id_field, which
      # are only used as column names in the query - confirm this is intended.
      def process(row)
        rows = []
        table = configuration[:table]
        conn = ETL::Engine.connection(configuration[:target])
        build_rows([row[:id]], row[:id], row[:id], row[:parent_id].nil?, 0, rows, table, conn)
        rows
      end

      protected
      # Recursive function that will add a row for the current level and then call build_rows
      # for all of the children of the current level.
      #
      # +row_id+ is the id of the row being exploded and becomes :parent_id of
      # every generated row; +root+ is only true for the initial call when the
      # source row has no parent. The +parent_id+ argument is not used.
      def build_rows(ids, parent_id, row_id, root, level, rows, table, conn)
        ids.each do |id|
          # Quote the id so that non-numeric ids produce valid SQL and values
          # coming from row data cannot alter the statement.
          child_ids = conn.select_values(
            "SELECT #{id_field} FROM #{table} WHERE #{parent_id_field} = #{conn.quote(id)}"
          )

          rows << {
            :parent_id => row_id,
            :child_id => id,
            :num_levels_from_parent => level,
            :is_bottom => (child_ids.empty? ? 1 : 0),
            :is_top => (root ? 1 : 0),
          }

          build_rows(child_ids, id, row_id, false, level + 1, rows, table, conn)
        end
      end
    end
  end
end
@@ -0,0 +1,12 @@
1
module ETL #:nodoc:
  module Processor #:nodoc:
    # Debugging aid: writes the inspected form of each row to standard
    # output and passes the row through unchanged.
    class PrintRowProcessor < ETL::Processor::RowProcessor
      # Print the row and return it.
      def process(row)
        inspected = row.inspect
        puts inspected
        row
      end
    end
  end
end
@@ -0,0 +1,25 @@
1
module ETL #:nodoc:
  module Processor #:nodoc:
    # Base class for pre and post processors. Subclasses must implement the +process+ method.
    class Processor
      # Store the control object and the configuration Hash, then invoke the
      # optional +after_initialize+ hook when the subclass defines one.
      def initialize(control, configuration)
        @control = control
        @configuration = configuration
        # Pass true so that protected and private hooks are found as well;
        # a plain respond_to? only reports public methods on Ruby 2.0+ and
        # would silently skip a non-public after_initialize.
        after_initialize if respond_to?(:after_initialize, true)
      end

      protected

      # Get the control object
      attr_reader :control

      # Get the configuration Hash
      attr_reader :configuration

      # Get the engine logger
      def log
        ETL::Engine.logger
      end
    end
  end
end
+ end
@@ -0,0 +1,24 @@
1
module ETL #:nodoc:
  module Processor #:nodoc:
    # Row level processor to rename a field in the row.
    #
    # Configuration options:
    # * <tt>:source</tt>: The source field name
    # * <tt>:dest</tt>: The destination field name
    class RenameProcessor < ETL::Processor::RowProcessor
      # Move the value of the source field to the destination field, remove
      # the source field and return the row. Numeric and nil values are
      # assigned directly; every other value is duplicated.
      def process(row)
        source = configuration[:source]
        dest = configuration[:dest]
        # Renaming a field to itself is a no-op. Without this guard the
        # delete below would discard the value that was just written.
        return row if source == dest

        source_value = row[source]
        row[dest] =
          case source_value
          when Numeric, nil then source_value
          else source_value.dup
          end
        row.delete(source)
        row
      end
    end
  end
end
@@ -0,0 +1,26 @@
1
module ETL #:nodoc:
  module Processor #:nodoc:
    # Row processor that retains a row only when every one of the
    # configured fields holds a non-blank value.
    class RequireNonBlankProcessor < ETL::Processor::RowProcessor
      # An array of fields to check
      attr_reader :fields

      # Initialize the processor
      #
      # Options:
      # * <tt>:fields</tt>: An array of fields to check, for example:
      #   [:first_name,:last_name]. Defaults to an empty array.
      def initialize(control, configuration)
        super
        @fields = configuration[:fields] || []
      end

      # Process the row. Returns nil as soon as one of the fields is blank,
      # otherwise returns the row.
      def process(row)
        has_blank_field = fields.any? { |name| row[name].blank? }
        has_blank_field ? nil : row
      end
    end
  end
end