darrell-activewarehouse-etl 0.9.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (83)
  1. data/CHANGELOG +198 -0
  2. data/LICENSE +7 -0
  3. data/README +99 -0
  4. data/Rakefile +175 -0
  5. data/TODO +28 -0
  6. data/bin/etl +28 -0
  7. data/bin/etl.cmd +8 -0
  8. data/examples/database.example.yml +16 -0
  9. data/lib/etl/batch/batch.rb +111 -0
  10. data/lib/etl/batch/directives.rb +55 -0
  11. data/lib/etl/batch.rb +2 -0
  12. data/lib/etl/builder/date_dimension_builder.rb +96 -0
  13. data/lib/etl/builder/time_dimension_builder.rb +31 -0
  14. data/lib/etl/builder.rb +2 -0
  15. data/lib/etl/commands/etl.rb +89 -0
  16. data/lib/etl/control/control.rb +405 -0
  17. data/lib/etl/control/destination/database_destination.rb +97 -0
  18. data/lib/etl/control/destination/file_destination.rb +126 -0
  19. data/lib/etl/control/destination.rb +448 -0
  20. data/lib/etl/control/source/database_source.rb +220 -0
  21. data/lib/etl/control/source/enumerable_source.rb +11 -0
  22. data/lib/etl/control/source/file_source.rb +90 -0
  23. data/lib/etl/control/source/model_source.rb +39 -0
  24. data/lib/etl/control/source.rb +109 -0
  25. data/lib/etl/control.rb +3 -0
  26. data/lib/etl/core_ext/time/calculations.rb +42 -0
  27. data/lib/etl/core_ext/time.rb +5 -0
  28. data/lib/etl/core_ext.rb +1 -0
  29. data/lib/etl/engine.rb +556 -0
  30. data/lib/etl/execution/base.rb +9 -0
  31. data/lib/etl/execution/batch.rb +8 -0
  32. data/lib/etl/execution/job.rb +8 -0
  33. data/lib/etl/execution/migration.rb +85 -0
  34. data/lib/etl/execution.rb +19 -0
  35. data/lib/etl/generator/generator.rb +20 -0
  36. data/lib/etl/generator/surrogate_key_generator.rb +39 -0
  37. data/lib/etl/generator.rb +2 -0
  38. data/lib/etl/http_tools.rb +139 -0
  39. data/lib/etl/parser/apache_combined_log_parser.rb +49 -0
  40. data/lib/etl/parser/delimited_parser.rb +74 -0
  41. data/lib/etl/parser/fixed_width_parser.rb +65 -0
  42. data/lib/etl/parser/parser.rb +41 -0
  43. data/lib/etl/parser/sax_parser.rb +218 -0
  44. data/lib/etl/parser/xml_parser.rb +65 -0
  45. data/lib/etl/parser.rb +11 -0
  46. data/lib/etl/processor/block_processor.rb +14 -0
  47. data/lib/etl/processor/bulk_import_processor.rb +83 -0
  48. data/lib/etl/processor/check_exist_processor.rb +80 -0
  49. data/lib/etl/processor/check_unique_processor.rb +35 -0
  50. data/lib/etl/processor/copy_field_processor.rb +26 -0
  51. data/lib/etl/processor/encode_processor.rb +55 -0
  52. data/lib/etl/processor/hierarchy_exploder_processor.rb +55 -0
  53. data/lib/etl/processor/print_row_processor.rb +12 -0
  54. data/lib/etl/processor/processor.rb +25 -0
  55. data/lib/etl/processor/rename_processor.rb +24 -0
  56. data/lib/etl/processor/require_non_blank_processor.rb +26 -0
  57. data/lib/etl/processor/row_processor.rb +17 -0
  58. data/lib/etl/processor/sequence_processor.rb +23 -0
  59. data/lib/etl/processor/surrogate_key_processor.rb +53 -0
  60. data/lib/etl/processor/truncate_processor.rb +35 -0
  61. data/lib/etl/processor.rb +11 -0
  62. data/lib/etl/row.rb +20 -0
  63. data/lib/etl/screen/row_count_screen.rb +20 -0
  64. data/lib/etl/screen.rb +14 -0
  65. data/lib/etl/transform/block_transform.rb +13 -0
  66. data/lib/etl/transform/date_to_string_transform.rb +20 -0
  67. data/lib/etl/transform/decode_transform.rb +51 -0
  68. data/lib/etl/transform/default_transform.rb +20 -0
  69. data/lib/etl/transform/foreign_key_lookup_transform.rb +169 -0
  70. data/lib/etl/transform/hierarchy_lookup_transform.rb +49 -0
  71. data/lib/etl/transform/ordinalize_transform.rb +12 -0
  72. data/lib/etl/transform/sha1_transform.rb +13 -0
  73. data/lib/etl/transform/string_to_date_transform.rb +16 -0
  74. data/lib/etl/transform/string_to_datetime_transform.rb +14 -0
  75. data/lib/etl/transform/string_to_time_transform.rb +11 -0
  76. data/lib/etl/transform/transform.rb +61 -0
  77. data/lib/etl/transform/trim_transform.rb +26 -0
  78. data/lib/etl/transform/type_transform.rb +35 -0
  79. data/lib/etl/transform.rb +2 -0
  80. data/lib/etl/util.rb +59 -0
  81. data/lib/etl/version.rb +9 -0
  82. data/lib/etl.rb +83 -0
  83. metadata +245 -0
data/lib/etl/parser/xml_parser.rb ADDED
@@ -0,0 +1,65 @@
+ require 'rexml/document'
+
+ module ETL
+   module Parser
+     class XmlParser < ETL::Parser::Parser
+       # Initialize the parser
+       # * <tt>source</tt>: The Source object
+       # * <tt>options</tt>: Parser options Hash
+       def initialize(source, options={})
+         super
+         configure
+       end
+
+       # Returns each row
+       def each
+         Dir.glob(file).each do |file|
+           doc = nil
+           t = Benchmark.realtime do
+             doc = REXML::Document.new(File.new(file))
+           end
+           Engine.logger.info "XML #{file} parsed in #{t}s"
+           doc.elements.each(@collection_xpath) do |element|
+             row = {}
+             fields.each do |f|
+               value = element.text(f.xpath)
+               row[f.name] = value
+             end
+             yield row
+           end
+         end
+       end
+
+       # Get an array of defined fields
+       def fields
+         @fields ||= []
+       end
+
+       private
+       def configure
+         @collection_xpath = source.definition[:collection]
+         raise "Collection XPath is required" if @collection_xpath.nil?
+
+         source.definition[:fields].each do |options|
+           case options
+           when Symbol
+             fields << Field.new(options, options.to_s)
+           when Hash
+             options[:xpath] ||= options[:name]
+             fields << Field.new(options[:name], options[:xpath].to_s)
+           else
+             raise DefinitionError, "Each field definition must be either a symbol or a hash of options for the field"
+           end
+         end
+       end
+
+       class Field
+         attr_reader :name, :xpath
+         def initialize(name, xpath)
+           @name = name
+           @xpath = xpath
+         end
+       end
+     end
+   end
+ end
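For reference, the :collection and :fields keys read in configure above come from the source definition in a control file. A minimal sketch of such a definition (the file glob, XPath expressions, and field names are hypothetical; the source directive itself is defined elsewhere in the gem):

  source :in, { :file => 'orders/*.xml', :parser => :xml }, {
    :collection => 'orders/order',                            # XPath yielding one element per row
    :fields => [
      :id,                                                    # symbol form: XPath defaults to the field name
      { :name => :customer_name, :xpath => 'customer/name' }  # hash form: explicit XPath
    ]
  }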
data/lib/etl/parser.rb ADDED
@@ -0,0 +1,11 @@
+ # This source file contains the ETL::Parser module and requires all of the files
+ # in the parser directory ending with .rb
+
+ module ETL #:nodoc:
+   # The ETL::Parser module provides various text parsers.
+   module Parser
+   end
+ end
+
+ require 'etl/parser/parser'
+ Dir[File.dirname(__FILE__) + "/parser/*.rb"].each { |file| require(file) }
data/lib/etl/processor/block_processor.rb ADDED
@@ -0,0 +1,14 @@
+ module ETL
+   module Processor
+     # This processor can be used either as a RowProcessor (called on each row via after_read) or as a Processor (called once via pre_process or post_process)
+     class BlockProcessor < ETL::Processor::RowProcessor
+       def initialize(control, configuration)
+         super
+         @block = configuration[:block]
+       end
+       def process(row=nil)
+         @block.call(row)
+       end
+     end
+   end
+   end
+ end
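A rough illustration of the two modes described in that comment (the after_read and post_process directive names come from the comment above; the blocks themselves are hypothetical):

  after_read :block, :block => Proc.new { |row| row[:loaded_at] = Time.now; row }      # per row
  post_process :block, :block => Proc.new { ETL::Engine.logger.info 'load complete' }  # once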
data/lib/etl/processor/bulk_import_processor.rb ADDED
@@ -0,0 +1,83 @@
+ module ETL #:nodoc:
+   module Processor #:nodoc:
+     # Processor which is used to bulk import data into a target database. The
+     # underlying database driver from ActiveRecord must support the
+     # +bulk_load+ method.
+     class BulkImportProcessor < ETL::Processor::Processor
+
+       # The file to load from
+       attr_reader :file
+       # The target database
+       attr_reader :target
+       # The table name
+       attr_reader :table
+       # Set to true to truncate
+       attr_reader :truncate
+       # Array of symbols representing the column load order
+       attr_reader :columns
+       # The field separator (defaults to a comma)
+       attr_accessor :field_separator
+       # The field enclosure (defaults to nil)
+       attr_accessor :field_enclosure
+       # The line separator (defaults to a newline)
+       attr_accessor :line_separator
+       # The string that indicates a NULL (defaults to an empty string)
+       attr_accessor :null_string
+
+       # Initialize the processor.
+       #
+       # Configuration options:
+       # * <tt>:file</tt>: The file to load data from
+       # * <tt>:target</tt>: The target database
+       # * <tt>:table</tt>: The table name
+       # * <tt>:truncate</tt>: Set to true to truncate before loading
+       # * <tt>:columns</tt>: The columns to load in the order they appear in
+       #   the bulk data file
+       # * <tt>:field_separator</tt>: The field separator. Defaults to a comma
+       # * <tt>:line_separator</tt>: The line separator. Defaults to a newline
+       # * <tt>:field_enclosure</tt>: The field enclosure characters
+       def initialize(control, configuration)
+         super
+         @target = configuration[:target]
+         path = Pathname.new(configuration[:file])
+         @file = path.absolute? ? path : Pathname.new(File.dirname(File.expand_path(control.file))) + path
+
+         @table = configuration[:table]
+         @truncate = configuration[:truncate] ||= false
+         @columns = configuration[:columns]
+         @field_separator = (configuration[:field_separator] || ',')
+         @line_separator = (configuration[:line_separator] || "\n")
+         @null_string = (configuration[:null_string] || "")
+         @field_enclosure = configuration[:field_enclosure]
+
+         raise ControlError, "Target must be specified" unless @target
+         raise ControlError, "Table must be specified" unless @table
+       end
+
+       # Execute the processor
+       def process
+         return if ETL::Engine.skip_bulk_import
+         return if File.size(file) == 0
+
+         conn = ETL::Engine.connection(target)
+         conn.transaction do
+           conn.truncate(table_name) if truncate
+           options = {}
+           options[:columns] = columns
+           if field_separator || field_enclosure || line_separator || null_string
+             options[:fields] = {}
+             options[:fields][:null_string] = null_string if null_string
+             options[:fields][:delimited_by] = field_separator if field_separator
+             options[:fields][:enclosed_by] = field_enclosure if field_enclosure
+             options[:fields][:terminated_by] = line_separator if line_separator
+           end
+           conn.bulk_load(file, table_name, options)
+         end
+       end
+
+       def table_name
+         ETL::Engine.table(table, ETL::Engine.connection(target))
+       end
+     end
+   end
+ end
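A hedged sketch of invoking this as a post-processor from a control file (the connection name, file path, table, and column list are hypothetical, and assume the underlying adapter supports bulk_load as noted in the class comment):

  post_process :bulk_import,
    :file => 'output/people.txt',            # bulk data file written earlier in the run
    :target => :data_warehouse,              # connection defined in database.yml
    :table => 'people_dimension',
    :truncate => true,
    :columns => [:first_name, :last_name, :email],
    :field_separator => "\t"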
data/lib/etl/processor/check_exist_processor.rb ADDED
@@ -0,0 +1,80 @@
+ module ETL #:nodoc:
+   module Processor #:nodoc:
+     # A row-level processor that checks if the row already exists in the
+     # target table
+     class CheckExistProcessor < ETL::Processor::RowProcessor
+       # A symbol or array of symbols representing keys that should be skipped
+       attr_accessor :skip
+
+       # The target database
+       attr_accessor :target
+
+       # The name of the table to check against
+       attr_accessor :table
+
+       # An array of columns representing the natural key
+       attr_accessor :columns
+
+       # Set to true if the processor should execute the check. If there are
+       # no rows in the target table then this will be false.
+       attr_accessor :should_check
+
+       # Initialize the processor
+       # Configuration options:
+       # * <tt>:skip</tt>: A symbol or array of column names that should not
+       #   be checked
+       # * <tt>:table</tt>: The table name
+       # * <tt>:columns</tt>: An array of columns which represent the natural
+       #   key
+       def initialize(control, configuration)
+         super
+         @skip = configuration[:skip] || []
+         @target = configuration[:target] || raise(ETL::ControlError, "target must be specified")
+         @table = configuration[:table] || raise(ETL::ControlError, "table must be specified")
+         @columns = configuration[:columns]
+
+         q = "SELECT COUNT(*) FROM #{table_name}"
+         @should_check = ETL::Engine.connection(target).select_value(q).to_i > 0
+       end
+
+       # Return true if the given key should be skipped
+       def skip?(key)
+         case skip
+         when Array
+           skip.include?(key)
+         else
+           skip.to_sym == key.to_sym
+         end
+       end
+
+       # Return true if the row should be checked
+       def should_check?
+         @should_check ? true : false
+       end
+
+       # Process the row
+       def process(row)
+         return row unless should_check?
+         conn = ETL::Engine.connection(target)
+         q = "SELECT * FROM #{table_name} WHERE "
+         conditions = []
+         row.each do |k,v|
+           if columns.nil? || columns.include?(k.to_sym)
+             conditions << "#{k} = #{conn.quote(v)}" unless skip?(k.to_sym)
+           end
+         end
+         q << conditions.join(" AND ")
+         q << " LIMIT 1"
+
+         #puts "query: #{q}"
+         result = conn.select_one(q)
+         return row if result.nil?
+       end
+
+       private
+       def table_name
+         ETL::Engine.table(table, ETL::Engine.connection(target))
+       end
+     end
+   end
+ end
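An illustrative row-level hookup, assuming the after_read directive resolves :check_exist to this class (connection, table, and column names are hypothetical):

  after_read :check_exist,
    :target => :data_warehouse,
    :table => 'person_dimension',
    :columns => [:first_name, :last_name]   # natural key used to build the WHERE clause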
data/lib/etl/processor/check_unique_processor.rb ADDED
@@ -0,0 +1,35 @@
+ module ETL #:nodoc:
+   module Processor #:nodoc:
+     # Row processor that checks whether or not the row has already passed
+     # through the ETL processor, using the key fields provided as the keys
+     # to check.
+     class CheckUniqueProcessor < ETL::Processor::RowProcessor
+
+       # The keys to check
+       attr_accessor :keys
+
+       # Initialize the processor
+       # Configuration options:
+       # * <tt>:keys</tt>: An array of keys to check against
+       def initialize(control, configuration)
+         super
+         @keys = configuration[:keys]
+       end
+
+       # A Hash of keys that have already been processed.
+       def compound_key_constraints
+         @compound_key_constraints ||= {}
+       end
+
+       # Process the row. This implementation will only return a row if
+       # its key combination has not already been seen.
+       def process(row)
+         key = (keys.collect { |k| row[k] }).join('|')
+         unless compound_key_constraints[key]
+           compound_key_constraints[key] = 1
+           return row
+         end
+       end
+     end
+   end
+ end
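For example (hypothetical key names, directive usage assumed), any later row with the same first_name/last_name/birth_date combination is silently dropped:

  after_read :check_unique, :keys => [:first_name, :last_name, :birth_date]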
data/lib/etl/processor/copy_field_processor.rb ADDED
@@ -0,0 +1,26 @@
+ module ETL #:nodoc:
+   module Processor #:nodoc:
+     # Row processor that will copy one field to another
+     #
+     # Configuration options:
+     # * <tt>:destination</tt>: The destination field
+     # * <tt>:dest</tt>: Alias for :destination
+     # * <tt>:source</tt>: The source field
+     class CopyFieldProcessor < ETL::Processor::RowProcessor
+       # Process the given row
+       def process(row)
+         destination = (configuration[:destination] || configuration[:dest])
+         source_value = row[configuration[:source]]
+         case source_value
+         when Numeric
+           row[destination] = source_value
+         when nil
+           row[destination] = nil
+         else
+           row[destination] = source_value.dup
+         end
+         row
+       end
+     end
+   end
+ end
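A one-line sketch of the typical wiring (field names hypothetical):

  after_read :copy_field, :source => :email, :dest => :email_backup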
data/lib/etl/processor/encode_processor.rb ADDED
@@ -0,0 +1,55 @@
+ require 'iconv'
+
+ module ETL #:nodoc:
+   module Processor #:nodoc:
+     # The encode processor uses Iconv to convert a file from one encoding (e.g. utf-8) to another (e.g. latin1), line by line.
+     class EncodeProcessor < ETL::Processor::Processor
+
+       # The file to load from
+       attr_reader :source_file
+       # The file to write to
+       attr_reader :target_file
+       # The source file encoding
+       attr_reader :source_encoding
+       # The target file encoding
+       attr_reader :target_encoding
+
+       # Initialize the processor.
+       #
+       # Configuration options:
+       # * <tt>:source_file</tt>: The file to load data from
+       # * <tt>:source_encoding</tt>: The source file encoding (e.g. 'latin1', 'utf-8'), as supported by Iconv
+       # * <tt>:target_file</tt>: The file to write data to
+       # * <tt>:target_encoding</tt>: The target file encoding
+       def initialize(control, configuration)
+         super
+         raise ControlError, "Source file must be specified" if configuration[:source_file].nil?
+         raise ControlError, "Target file must be specified" if configuration[:target_file].nil?
+         @source_file = File.join(File.dirname(control.file), configuration[:source_file])
+         @source_encoding = configuration[:source_encoding]
+         @target_file = File.join(File.dirname(control.file), configuration[:target_file])
+         @target_encoding = configuration[:target_encoding]
+         raise ControlError, "Source and target file cannot currently point to the same file" if source_file == target_file
+         begin
+           @iconv = Iconv.new(target_encoding,source_encoding)
+         rescue Iconv::InvalidEncoding
+           raise ControlError, "Either the source encoding '#{source_encoding}' or the target encoding '#{target_encoding}' is not supported"
+         end
+       end
+
+       # Execute the processor
+       def process
+         # operate line by line to handle large files without loading them in memory
+         # could be replaced by a system iconv call when available, for greater performance
+         File.open(source_file) do |source|
+           #puts "Opening #{target_file}"
+           File.open(target_file,'w') do |target|
+             source.each_line do |line|
+               target << @iconv.iconv(line)
+             end
+           end
+         end
+       end
+     end
+   end
+ end
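A minimal pre-processing sketch (file names are hypothetical; paths are resolved relative to the control file, per initialize above):

  pre_process :encode,
    :source_file => 'input/extract_latin1.csv',
    :source_encoding => 'latin1',
    :target_file => 'input/extract_utf8.csv',
    :target_encoding => 'utf-8'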
data/lib/etl/processor/hierarchy_exploder_processor.rb ADDED
@@ -0,0 +1,55 @@
+ module ETL #:nodoc:
+   module Processor #:nodoc:
+     # Row-level processor that will convert a single row into multiple rows designed to be inserted
+     # into a hierarchy bridge table.
+     class HierarchyExploderProcessor < ETL::Processor::RowProcessor
+       attr_accessor :id_field
+       attr_accessor :parent_id_field
+
+       # Initialize the processor
+       #
+       # Configuration options:
+       # * <tt>:target</tt>: The target connection used to resolve child rows
+       # * <tt>:id_field</tt>: The name of the id field (defaults to 'id')
+       # * <tt>:parent_id_field</tt>: The name of the parent id field (defaults to 'parent_id')
+       #
+       # TODO: Allow resolver to be implemented in a customizable fashion, i.e. don't rely
+       # on AR as the only resolution method.
+       def initialize(control, configuration={})
+         @id_field = configuration[:id_field] || 'id'
+         @parent_id_field = configuration[:parent_id_field] || 'parent_id'
+         super
+       end
+
+       # Process the row expanding it into hierarchy values
+       def process(row)
+         rows = []
+         target = configuration[:target]
+         table = configuration[:table]
+         conn = ETL::Engine.connection(target)
+         build_rows([row[:id]], row[:id], row[:id], row[:parent_id].nil?, 0, rows, table, conn)
+         rows
+       end
+
+       protected
+       # Recursive function that will add a row for the current level and then call build_rows
+       # for all of the children of the current level
+       def build_rows(ids, parent_id, row_id, root, level, rows, table, conn)
+         ids.each do |id|
+           child_ids = conn.select_values("SELECT #{id_field} FROM #{table} WHERE #{parent_id_field} = #{id}")
+
+           row = {
+             :parent_id => row_id,
+             :child_id => id,
+             :num_levels_from_parent => level,
+             :is_bottom => (child_ids.empty? ? 1 : 0),
+             :is_top => (root ? 1 : 0),
+           }
+           rows << row
+
+           build_rows(child_ids, id, row_id, false, level + 1, rows, table, conn)
+         end
+       end
+     end
+   end
+ end
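An illustrative hookup (connection, table, and field names are hypothetical): each source row is exploded into one bridge row per descendant found in the adjacency-list table named by :table.

  after_read :hierarchy_exploder,
    :target => :data_warehouse,
    :table => 'employees',            # table holding the id/parent_id adjacency list
    :id_field => 'id',
    :parent_id_field => 'manager_id'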
data/lib/etl/processor/print_row_processor.rb ADDED
@@ -0,0 +1,12 @@
+ module ETL #:nodoc:
+   module Processor #:nodoc:
+     # Debugging processor for printing the current row
+     class PrintRowProcessor < ETL::Processor::RowProcessor
+       # Process the row
+       def process(row)
+         puts row.inspect
+         row
+       end
+     end
+   end
+ end
data/lib/etl/processor/processor.rb ADDED
@@ -0,0 +1,25 @@
+ module ETL #:nodoc:
+   module Processor #:nodoc:
+     # Base class for pre and post processors. Subclasses must implement the +process+ method.
+     class Processor
+       def initialize(control, configuration)
+         @control = control
+         @configuration = configuration
+         after_initialize if respond_to?(:after_initialize)
+       end
+       protected
+       # Get the control object
+       def control
+         @control
+       end
+       # Get the configuration Hash
+       def configuration
+         @configuration
+       end
+       # Get the engine logger
+       def log
+         Engine.logger
+       end
+     end
+   end
+ end
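Since a subclass only needs a process method, a custom pre/post processor stays small. A hypothetical sketch (the class name and :file option are invented for illustration and are not part of the gem):

  require 'fileutils'

  class ArchivePreviousLoadProcessor < ETL::Processor::Processor
    # Runs once per invocation; copies the previous load file aside before the new run.
    def process
      file = configuration[:file]
      FileUtils.cp(file, "#{file}.#{Time.now.to_i}.bak") if File.exist?(file)
    end
  end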
data/lib/etl/processor/rename_processor.rb ADDED
@@ -0,0 +1,24 @@
+ module ETL #:nodoc:
+   module Processor #:nodoc:
+     # Row level processor to rename a field in the row.
+     #
+     # Configuration options:
+     # * <tt>:source</tt>: The source field name
+     # * <tt>:dest</tt>: The destination field name
+     class RenameProcessor < ETL::Processor::RowProcessor
+       def process(row)
+         source_value = row[configuration[:source]]
+         case source_value
+         when Numeric
+           row[configuration[:dest]] = source_value
+         when nil
+           row[configuration[:dest]] = nil
+         else
+           row[configuration[:dest]] = source_value.dup
+         end
+         row.delete(configuration[:source])
+         row
+       end
+     end
+   end
+ end
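A one-line sketch (field names hypothetical); note the source field is removed from the row after the copy:

  after_read :rename, :source => :surname, :dest => :last_name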
data/lib/etl/processor/require_non_blank_processor.rb ADDED
@@ -0,0 +1,26 @@
+ module ETL #:nodoc:
+   module Processor #:nodoc:
+     # A processor which requires that particular fields be non-blank in
+     # order for the row to be retained.
+     class RequireNonBlankProcessor < ETL::Processor::RowProcessor
+       # An array of fields to check
+       attr_reader :fields
+
+       # Initialize the processor
+       #
+       # Options:
+       # * <tt>:fields</tt>: An array of fields to check, for example:
+       #   [:first_name,:last_name]
+       def initialize(control, configuration)
+         super
+         @fields = configuration[:fields] || []
+       end
+
+       # Process the row.
+       def process(row)
+         fields.each { |field| return if row[field].blank? }
+         row
+       end
+     end
+   end
+ end
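An illustrative hookup (field names hypothetical); any row with a blank value in either field is dropped:

  after_read :require_non_blank, :fields => [:first_name, :last_name]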
data/lib/etl/processor/row_processor.rb ADDED
@@ -0,0 +1,17 @@
+ module ETL #:nodoc:
+   module Processor #:nodoc:
+     # Processor which processes a specific row. Unlike a transform, which deals with a specific
+     # value in the row, a row processor can operate on the entire row at once, which can be used,
+     # for example, to explode a single row into multiple rows.
+     class RowProcessor < Processor
+       # Initialize the processor
+       def initialize(control, configuration)
+         super
+       end
+       # Process the specified row. This method must return the row.
+       def process(row)
+         raise "process is an abstract method"
+       end
+     end
+   end
+ end
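A hypothetical row-level subclass as a sketch (class name and email pattern invented here): returning the row keeps it, while returning nil drops it, as bundled processors such as CheckUniqueProcessor do.

  class DropTestAccountsProcessor < ETL::Processor::RowProcessor
    def process(row)
      return nil if row[:email].to_s =~ /@example\.com\z/   # drop rows for test accounts
      row
    end
  end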
data/lib/etl/processor/sequence_processor.rb ADDED
@@ -0,0 +1,23 @@
+ module ETL #:nodoc:
+   module Processor #:nodoc:
+     # Row level processor to generate a sequence.
+     #
+     # Configuration options:
+     # * <tt>:context</tt>: A context name. If none is specified then the context will be
+     #   the current ETL run
+     # * <tt>:dest</tt>: The destination field name
+     class SequenceProcessor < ETL::Processor::RowProcessor
+       def process(row)
+         sequences[configuration[:context]] ||= 0
+         row[configuration[:dest]] = sequences[configuration[:context]] += 1
+         row
+       end
+
+       protected
+       # Get a Hash of sequences
+       def sequences
+         @sequences ||= {}
+       end
+     end
+   end
+ end
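A short sketch (field and context names hypothetical):

  after_read :sequence, :dest => :line_number                            # numbered per ETL run
  after_read :sequence, :context => :invoice_lines, :dest => :position   # numbered within a named context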
data/lib/etl/processor/surrogate_key_processor.rb ADDED
@@ -0,0 +1,53 @@
+ module ETL #:nodoc:
+   module Processor #:nodoc:
+     # A row level processor that provides surrogate keys
+     class SurrogateKeyProcessor < ETL::Processor::RowProcessor
+       attr_accessor :destination
+       attr_accessor :table
+       attr_accessor :column
+       attr_accessor :target
+
+       # Initialize the surrogate key generator
+       #
+       # Configuration options
+       # * <tt>:query</tt>: No longer supported. Passing this option raises a
+       #   ControlError; use <tt>:table</tt> and <tt>:column</tt> to locate the
+       #   last surrogate key instead.
+       # * <tt>:target</tt>: The target connection
+       # * <tt>:destination</tt>: The destination column name (defaults to :id)
+       def initialize(control, configuration)
+         super
+         @table = configuration[:table]
+         @column = configuration[:column] || 'id'
+         @target = configuration[:target]
+         if configuration[:query]
+           raise ControlError, "Query option is no longer valid, use :column and :table instead"
+         end
+         if table
+           @surrogate_key = ETL::Engine.connection(target).select_value("SELECT max(#{column}) FROM #{table_name}")
+         end
+         #puts "initial surrogate key: #{@surrogate_key}"
+         @surrogate_key = 0 if @surrogate_key.blank?
+         @surrogate_key = @surrogate_key.to_i
+         #puts "surrogate key: #{@surrogate_key}"
+         @destination = configuration[:destination] || :id
+       end
+
+       # Add a surrogate key to the row
+       def process(row)
+         if row
+           #puts "processing row #{row.inspect}"
+           @surrogate_key += 1
+           #puts "adding surrogate key to row: #{@surrogate_key}"
+           row[destination] = @surrogate_key
+           row
+         end
+       end
+
+       private
+       def table_name
+         ETL::Engine.table(table, ETL::Engine.connection(target))
+       end
+     end
+   end
+ end
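An illustrative hookup (connection, table, and column names are hypothetical); the counter is seeded from MAX(id) in the named table and incremented for each row:

  after_read :surrogate_key,
    :target => :data_warehouse,
    :table => 'customer_dimension',
    :column => 'id',
    :destination => :id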
data/lib/etl/processor/truncate_processor.rb ADDED
@@ -0,0 +1,35 @@
+ module ETL #:nodoc:
+   module Processor #:nodoc:
+     # A processor which will truncate a table. Use as a pre-processor for cleaning out a table
+     # prior to loading
+     class TruncateProcessor < ETL::Processor::Processor
+       # Defines the table to truncate
+       attr_reader :table
+
+       # Defines the database connection to use
+       attr_reader :target
+
+       # Initialize the processor
+       #
+       # Options:
+       # * <tt>:target</tt>: The target connection
+       # * <tt>:table</tt>: The table name
+       def initialize(control, configuration)
+         super
+         #@file = File.join(File.dirname(control.file), configuration[:file])
+         @target = configuration[:target] || {}
+         @table = configuration[:table]
+       end
+
+       def process
+         conn = ETL::Engine.connection(target)
+         conn.truncate(table_name)
+       end
+
+       private
+       def table_name
+         ETL::Engine.table(table, ETL::Engine.connection(target))
+       end
+     end
+   end
+ end
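A minimal sketch of the pre-processing use named in the class comment (connection and table names hypothetical):

  pre_process :truncate, :target => :data_warehouse, :table => 'daily_sales_fact'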
data/lib/etl/processor.rb ADDED
@@ -0,0 +1,11 @@
+ # This source file contains the ETL::Processor module and requires all of the processors
+
+ module ETL #:nodoc:
+   # The ETL::Processor module contains row-level and bulk processors
+   module Processor
+   end
+ end
+
+ require 'etl/processor/processor'
+ require 'etl/processor/row_processor'
+ Dir[File.dirname(__FILE__) + "/processor/*.rb"].each { |file| require(file) }