darrell-activewarehouse-etl 0.9.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +198 -0
- data/LICENSE +7 -0
- data/README +99 -0
- data/Rakefile +175 -0
- data/TODO +28 -0
- data/bin/etl +28 -0
- data/bin/etl.cmd +8 -0
- data/examples/database.example.yml +16 -0
- data/lib/etl/batch/batch.rb +111 -0
- data/lib/etl/batch/directives.rb +55 -0
- data/lib/etl/batch.rb +2 -0
- data/lib/etl/builder/date_dimension_builder.rb +96 -0
- data/lib/etl/builder/time_dimension_builder.rb +31 -0
- data/lib/etl/builder.rb +2 -0
- data/lib/etl/commands/etl.rb +89 -0
- data/lib/etl/control/control.rb +405 -0
- data/lib/etl/control/destination/database_destination.rb +97 -0
- data/lib/etl/control/destination/file_destination.rb +126 -0
- data/lib/etl/control/destination.rb +448 -0
- data/lib/etl/control/source/database_source.rb +220 -0
- data/lib/etl/control/source/enumerable_source.rb +11 -0
- data/lib/etl/control/source/file_source.rb +90 -0
- data/lib/etl/control/source/model_source.rb +39 -0
- data/lib/etl/control/source.rb +109 -0
- data/lib/etl/control.rb +3 -0
- data/lib/etl/core_ext/time/calculations.rb +42 -0
- data/lib/etl/core_ext/time.rb +5 -0
- data/lib/etl/core_ext.rb +1 -0
- data/lib/etl/engine.rb +556 -0
- data/lib/etl/execution/base.rb +9 -0
- data/lib/etl/execution/batch.rb +8 -0
- data/lib/etl/execution/job.rb +8 -0
- data/lib/etl/execution/migration.rb +85 -0
- data/lib/etl/execution.rb +19 -0
- data/lib/etl/generator/generator.rb +20 -0
- data/lib/etl/generator/surrogate_key_generator.rb +39 -0
- data/lib/etl/generator.rb +2 -0
- data/lib/etl/http_tools.rb +139 -0
- data/lib/etl/parser/apache_combined_log_parser.rb +49 -0
- data/lib/etl/parser/delimited_parser.rb +74 -0
- data/lib/etl/parser/fixed_width_parser.rb +65 -0
- data/lib/etl/parser/parser.rb +41 -0
- data/lib/etl/parser/sax_parser.rb +218 -0
- data/lib/etl/parser/xml_parser.rb +65 -0
- data/lib/etl/parser.rb +11 -0
- data/lib/etl/processor/block_processor.rb +14 -0
- data/lib/etl/processor/bulk_import_processor.rb +83 -0
- data/lib/etl/processor/check_exist_processor.rb +80 -0
- data/lib/etl/processor/check_unique_processor.rb +35 -0
- data/lib/etl/processor/copy_field_processor.rb +26 -0
- data/lib/etl/processor/encode_processor.rb +55 -0
- data/lib/etl/processor/hierarchy_exploder_processor.rb +55 -0
- data/lib/etl/processor/print_row_processor.rb +12 -0
- data/lib/etl/processor/processor.rb +25 -0
- data/lib/etl/processor/rename_processor.rb +24 -0
- data/lib/etl/processor/require_non_blank_processor.rb +26 -0
- data/lib/etl/processor/row_processor.rb +17 -0
- data/lib/etl/processor/sequence_processor.rb +23 -0
- data/lib/etl/processor/surrogate_key_processor.rb +53 -0
- data/lib/etl/processor/truncate_processor.rb +35 -0
- data/lib/etl/processor.rb +11 -0
- data/lib/etl/row.rb +20 -0
- data/lib/etl/screen/row_count_screen.rb +20 -0
- data/lib/etl/screen.rb +14 -0
- data/lib/etl/transform/block_transform.rb +13 -0
- data/lib/etl/transform/date_to_string_transform.rb +20 -0
- data/lib/etl/transform/decode_transform.rb +51 -0
- data/lib/etl/transform/default_transform.rb +20 -0
- data/lib/etl/transform/foreign_key_lookup_transform.rb +169 -0
- data/lib/etl/transform/hierarchy_lookup_transform.rb +49 -0
- data/lib/etl/transform/ordinalize_transform.rb +12 -0
- data/lib/etl/transform/sha1_transform.rb +13 -0
- data/lib/etl/transform/string_to_date_transform.rb +16 -0
- data/lib/etl/transform/string_to_datetime_transform.rb +14 -0
- data/lib/etl/transform/string_to_time_transform.rb +11 -0
- data/lib/etl/transform/transform.rb +61 -0
- data/lib/etl/transform/trim_transform.rb +26 -0
- data/lib/etl/transform/type_transform.rb +35 -0
- data/lib/etl/transform.rb +2 -0
- data/lib/etl/util.rb +59 -0
- data/lib/etl/version.rb +9 -0
- data/lib/etl.rb +83 -0
- metadata +245 -0
data/lib/etl/parser/xml_parser.rb
ADDED
@@ -0,0 +1,65 @@
require 'rexml/document'

module ETL
  module Parser
    class XmlParser < ETL::Parser::Parser
      # Initialize the parser
      # * <tt>source</tt>: The Source object
      # * <tt>options</tt>: Parser options Hash
      def initialize(source, options={})
        super
        configure
      end

      # Returns each row
      def each
        Dir.glob(file).each do |file|
          doc = nil
          t = Benchmark.realtime do
            doc = REXML::Document.new(File.new(file))
          end
          Engine.logger.info "XML #{file} parsed in #{t}s"
          doc.elements.each(@collection_xpath) do |element|
            row = {}
            fields.each do |f|
              value = element.text(f.xpath)
              row[f.name] = value
            end
            yield row
          end
        end
      end

      # Get an array of defined fields
      def fields
        @fields ||= []
      end

      private
      def configure
        @collection_xpath = source.definition[:collection]
        raise "Collection XPath is required" if @collection_xpath.nil?

        source.definition[:fields].each do |options|
          case options
          when Symbol
            fields << Field.new(options, options.to_s)
          when Hash
            options[:xpath] ||= options[:name]
            fields << Field.new(options[:name], options[:xpath].to_s)
          else
            raise DefinitionError, "Each field definition must either be a symbol or a hash of options for the field"
          end
        end
      end

      class Field
        attr_reader :name, :xpath
        def initialize(name, xpath)
          @name = name
          @xpath = xpath
        end
      end
    end
  end
end
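For reference, the configure method above expects the source definition in a control (.ctl) file to supply the :collection XPath plus a :fields list mixing symbols and hashes. A hedged sketch only; the source DSL call, file name, and XPaths are illustrative assumptions, not taken from the package:

source :in, { :file => 'people.xml', :parser => :xml }, {
  :collection => 'people/person',
  :fields => [
    :id,                                          # a symbol: the name is also used as the XPath
    { :name => :last_name, :xpath => 'surname' }  # a hash: explicit XPath for the field
  ]
}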
data/lib/etl/parser.rb
ADDED
@@ -0,0 +1,11 @@
# This source file contains the ETL::Parser module and requires all of the files
# in the parser directory ending with .rb

module ETL #:nodoc:
  # The ETL::Parser module provides various text parsers.
  module Parser
  end
end

require 'etl/parser/parser'
Dir[File.dirname(__FILE__) + "/parser/*.rb"].each { |file| require(file) }
data/lib/etl/processor/block_processor.rb
ADDED
@@ -0,0 +1,14 @@
module ETL
  module Processor
    # This processor can act either as a RowProcessor (called on each row with after_read) or as a Processor (called once with pre_process or post_process)
    class BlockProcessor < ETL::Processor::RowProcessor
      def initialize(control, configuration)
        super
        @block = configuration[:block]
      end
      def process(row=nil)
        @block.call(row)
      end
    end
  end
end
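Given the constructor and process method above, the processor simply delegates to the configured Proc, so its behaviour can be sketched directly (passing nil for the control object here is purely illustrative):

require 'etl'

upcase = ETL::Processor::BlockProcessor.new(nil,
  :block => lambda { |row| row[:name] = row[:name].upcase; row })
upcase.process(:name => 'smith')   # => {:name => 'SMITH'}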
data/lib/etl/processor/bulk_import_processor.rb
ADDED
@@ -0,0 +1,83 @@
module ETL #:nodoc:
  module Processor #:nodoc:
    # Processor which is used to bulk import data into a target database. The
    # underlying database driver from ActiveRecord must support the
    # +bulk_load+ method.
    class BulkImportProcessor < ETL::Processor::Processor

      # The file to load from
      attr_reader :file
      # The target database
      attr_reader :target
      # The table name
      attr_reader :table
      # Set to true to truncate
      attr_reader :truncate
      # Array of symbols representing the column load order
      attr_reader :columns
      # The field separator (defaults to a comma)
      attr_accessor :field_separator
      # The field enclosure (defaults to nil)
      attr_accessor :field_enclosure
      # The line separator (defaults to a newline)
      attr_accessor :line_separator
      # The string that indicates a NULL (defaults to an empty string)
      attr_accessor :null_string

      # Initialize the processor.
      #
      # Configuration options:
      # * <tt>:file</tt>: The file to load data from
      # * <tt>:target</tt>: The target database
      # * <tt>:table</tt>: The table name
      # * <tt>:truncate</tt>: Set to true to truncate before loading
      # * <tt>:columns</tt>: The columns to load in the order they appear in
      #   the bulk data file
      # * <tt>:field_separator</tt>: The field separator. Defaults to a comma
      # * <tt>:line_separator</tt>: The line separator. Defaults to a newline
      # * <tt>:field_enclosure</tt>: The field enclosure characters
      def initialize(control, configuration)
        super
        @target = configuration[:target]
        path = Pathname.new(configuration[:file])
        @file = path.absolute? ? path : Pathname.new(File.dirname(File.expand_path(control.file))) + path

        @table = configuration[:table]
        @truncate = configuration[:truncate] ||= false
        @columns = configuration[:columns]
        @field_separator = (configuration[:field_separator] || ',')
        @line_separator = (configuration[:line_separator] || "\n")
        @null_string = (configuration[:null_string] || "")
        @field_enclosure = configuration[:field_enclosure]

        raise ControlError, "Target must be specified" unless @target
        raise ControlError, "Table must be specified" unless @table
      end

      # Execute the processor
      def process
        return if ETL::Engine.skip_bulk_import
        return if File.size(file) == 0

        conn = ETL::Engine.connection(target)
        conn.transaction do
          conn.truncate(table_name) if truncate
          options = {}
          options[:columns] = columns
          if field_separator || field_enclosure || line_separator || null_string
            options[:fields] = {}
            options[:fields][:null_string] = null_string if null_string
            options[:fields][:delimited_by] = field_separator if field_separator
            options[:fields][:enclosed_by] = field_enclosure if field_enclosure
            options[:fields][:terminated_by] = line_separator if line_separator
          end
          conn.bulk_load(file, table_name, options)
        end
      end

      def table_name
        ETL::Engine.table(table, ETL::Engine.connection(target))
      end
    end
  end
end
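In a control file this processor is typically attached as a post-processor so the flat file written by the destination gets bulk loaded afterwards. A hedged sketch of the configuration implied by the RDoc above (the post_process hook name, connection name, file, table, and columns are illustrative assumptions):

post_process :bulk_import,
  :file => 'people_dimension.txt',
  :target => :data_warehouse,
  :table => 'people_dimension',
  :truncate => true,
  :columns => [:id, :first_name, :last_name],
  :field_separator => "\t"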
data/lib/etl/processor/check_exist_processor.rb
ADDED
@@ -0,0 +1,80 @@
module ETL #:nodoc:
  module Processor #:nodoc:
    # A row-level processor that checks if the row already exists in the
    # target table
    class CheckExistProcessor < ETL::Processor::RowProcessor
      # A symbol or array of symbols representing keys that should be skipped
      attr_accessor :skip

      # The target database
      attr_accessor :target

      # The name of the table to check against
      attr_accessor :table

      # An array of columns representing the natural key
      attr_accessor :columns

      # Is set to true if the processor should execute the check. If there are
      # no rows in the target table then this should return false.
      attr_accessor :should_check

      # Initialize the processor
      # Configuration options:
      # * <tt>:skip</tt>: A symbol or array of column names that should not
      #   be checked
      # * <tt>:table</tt>: The table name
      # * <tt>:columns</tt>: An array of columns which represent the natural
      #   key
      def initialize(control, configuration)
        super
        @skip = configuration[:skip] || []
        @target = configuration[:target] || raise(ETL::ControlError, "target must be specified")
        @table = configuration[:table] || raise(ETL::ControlError, "table must be specified")
        @columns = configuration[:columns]

        q = "SELECT COUNT(*) FROM #{table_name}"
        @should_check = ETL::Engine.connection(target).select_value(q).to_i > 0
      end

      # Return true if the given key should be skipped
      def skip?(key)
        case skip
        when Array
          skip.include?(key)
        else
          skip.to_sym == key.to_sym
        end
      end

      # Return true if the row should be checked
      def should_check?
        @should_check ? true : false
      end

      # Process the row
      def process(row)
        return row unless should_check?
        conn = ETL::Engine.connection(target)
        q = "SELECT * FROM #{table_name} WHERE "
        conditions = []
        row.each do |k,v|
          if columns.nil? || columns.include?(k.to_sym)
            conditions << "#{k} = #{conn.quote(v)}" unless skip?(k.to_sym)
          end
        end
        q << conditions.join(" AND ")
        q << " LIMIT 1"

        #puts "query: #{q}"
        result = conn.select_one(q)
        return row if result.nil?
      end

      private
      def table_name
        ETL::Engine.table(table, ETL::Engine.connection(target))
      end
    end
  end
end
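A sketch of how this check might be wired into a control file using the options documented above (the before_write hook name and all identifiers are illustrative assumptions):

before_write :check_exist,
  :target => :data_warehouse,
  :table => 'people_dimension',
  :columns => [:first_name, :last_name],   # natural key columns to compare
  :skip => :loaded_at                      # never include this column in the comparison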
data/lib/etl/processor/check_unique_processor.rb
ADDED
@@ -0,0 +1,35 @@
module ETL #:nodoc:
  module Processor #:nodoc:
    # Row processor that checks whether or not the row has already passed
    # through the ETL processor, using the key fields provided as the keys
    # to check.
    class CheckUniqueProcessor < ETL::Processor::RowProcessor

      # The keys to check
      attr_accessor :keys

      # Initialize the processor
      # Configuration options:
      # * <tt>:keys</tt>: An array of keys to check against
      def initialize(control, configuration)
        super
        @keys = configuration[:keys]
      end

      # A Hash of keys that have already been processed.
      def compound_key_constraints
        @compound_key_constraints ||= {}
      end

      # Process the row. This implementation will only return a row if
      # its key combination has not already been seen.
      def process(row)
        key = (keys.collect { |k| row[k] }).join('|')
        unless compound_key_constraints[key]
          compound_key_constraints[key] = 1
          return row
        end
      end
    end
  end
end
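Because the processor only tracks the compound key in an in-memory Hash, its effect can be sketched directly (nil is passed for the control object purely for illustration):

require 'etl'

dedup = ETL::Processor::CheckUniqueProcessor.new(nil, :keys => [:first_name, :last_name])
dedup.process(:first_name => 'Ada', :last_name => 'Lovelace')   # => returns the row (key not seen before)
dedup.process(:first_name => 'Ada', :last_name => 'Lovelace')   # => nil, the duplicate row is dropped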
data/lib/etl/processor/copy_field_processor.rb
ADDED
@@ -0,0 +1,26 @@
module ETL #:nodoc:
  module Processor #:nodoc:
    # Row processor that will copy one field to another
    #
    # Configuration options:
    # * <tt>:destination</tt>: The destination field
    # * <tt>:dest</tt>: Alias for :destination
    # * <tt>:source</tt>: The source field
    class CopyFieldProcessor < ETL::Processor::RowProcessor
      # Process the given row
      def process(row)
        destination = (configuration[:destination] || configuration[:dest])
        source_value = row[configuration[:source]]
        case source_value
        when Numeric
          row[destination] = source_value
        when nil
          row[destination] = nil
        else
          row[destination] = source_value.dup
        end
        row
      end
    end
  end
end
data/lib/etl/processor/encode_processor.rb
ADDED
@@ -0,0 +1,55 @@
require 'iconv'

module ETL #:nodoc:
  module Processor #:nodoc:
    # The encode processor uses Iconv to convert a file from one encoding (eg: utf-8) to another (eg: latin1), line by line.
    class EncodeProcessor < ETL::Processor::Processor

      # The file to load from
      attr_reader :source_file
      # The file to write to
      attr_reader :target_file
      # The source file encoding
      attr_reader :source_encoding
      # The target file encoding
      attr_reader :target_encoding

      # Initialize the processor.
      #
      # Configuration options:
      # * <tt>:source_file</tt>: The file to load data from
      # * <tt>:source_encoding</tt>: The source file encoding (eg: 'latin1','utf-8'), as supported by Iconv
      # * <tt>:target_file</tt>: The file to write data to
      # * <tt>:target_encoding</tt>: The target file encoding
      def initialize(control, configuration)
        super
        raise ControlError, "Source file must be specified" if configuration[:source_file].nil?
        raise ControlError, "Target file must be specified" if configuration[:target_file].nil?
        @source_file = File.join(File.dirname(control.file), configuration[:source_file])
        @source_encoding = configuration[:source_encoding]
        @target_file = File.join(File.dirname(control.file), configuration[:target_file])
        @target_encoding = configuration[:target_encoding]
        raise ControlError, "Source and target file cannot currently point to the same file" if source_file == target_file
        begin
          @iconv = Iconv.new(target_encoding,source_encoding)
        rescue Iconv::InvalidEncoding
          raise ControlError, "Either the source encoding '#{source_encoding}' or the target encoding '#{target_encoding}' is not supported"
        end
      end

      # Execute the processor
      def process
        # operate line by line to handle large files without loading them in-memory
        # could be replaced by a system iconv call when available, for greater performance
        File.open(source_file) do |source|
          #puts "Opening #{target_file}"
          File.open(target_file,'w') do |target|
            source.each_line do |line|
              target << @iconv.iconv(line)
            end
          end
        end
      end
    end
  end
end
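A hedged sketch of the corresponding pre-processor configuration in a control file (the pre_process hook name, file names, and encodings are illustrative assumptions):

pre_process :encode,
  :source_file => 'export_latin1.txt',
  :source_encoding => 'latin1',
  :target_file => 'export_utf8.txt',
  :target_encoding => 'utf-8'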
data/lib/etl/processor/hierarchy_exploder_processor.rb
ADDED
@@ -0,0 +1,55 @@
module ETL #:nodoc:
  module Processor #:nodoc:
    # Row-level processor that will convert a single row into multiple rows designed to be inserted
    # into a hierarchy bridge table.
    class HierarchyExploderProcessor < ETL::Processor::RowProcessor
      attr_accessor :id_field
      attr_accessor :parent_id_field

      # Initialize the processor
      #
      # Configuration options:
      # * <tt>:connection</tt>: The ActiveRecord adapter connection
      # * <tt>:id_field</tt>: The name of the id field (defaults to 'id')
      # * <tt>:parent_id_field</tt>: The name of the parent id field (defaults to 'parent_id')
      #
      # TODO: Allow resolver to be implemented in a customizable fashion, i.e. don't rely
      # on AR as the only resolution method.
      def initialize(control, configuration={})
        @id_field = configuration[:id_field] || 'id'
        @parent_id_field = configuration[:parent_id_field] || 'parent_id'
        super
      end

      # Process the row expanding it into hierarchy values
      def process(row)
        rows = []
        target = configuration[:target]
        table = configuration[:table]
        conn = ETL::Engine.connection(target)
        build_rows([row[:id]], row[:id], row[:id], row[:parent_id].nil?, 0, rows, table, conn)
        rows
      end

      protected
      # Recursive function that will add a row for the current level and then call build_rows
      # for all of the children of the current level
      def build_rows(ids, parent_id, row_id, root, level, rows, table, conn)
        ids.each do |id|
          child_ids = conn.select_values("SELECT #{id_field} FROM #{table} WHERE #{parent_id_field} = #{id}")

          row = {
            :parent_id => row_id,
            :child_id => id,
            :num_levels_from_parent => level,
            :is_bottom => (child_ids.empty? ? 1 : 0),
            :is_top => (root ? 1 : 0),
          }
          rows << row

          build_rows(child_ids, id, row_id, false, level + 1, rows, table, conn)
        end
      end
    end
  end
end
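To make the recursion concrete, assume a hypothetical source table containing the chain id=1 (parent_id NULL), id=2 (parent_id 1), id=3 (parent_id 2). Tracing build_rows above, processing the root row emits one bridge row per reachable node:

# process(:id => 1, :parent_id => nil) would return:
#   { :parent_id => 1, :child_id => 1, :num_levels_from_parent => 0, :is_bottom => 0, :is_top => 1 }
#   { :parent_id => 1, :child_id => 2, :num_levels_from_parent => 1, :is_bottom => 0, :is_top => 0 }
#   { :parent_id => 1, :child_id => 3, :num_levels_from_parent => 2, :is_bottom => 1, :is_top => 0 }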
data/lib/etl/processor/processor.rb
ADDED
@@ -0,0 +1,25 @@
module ETL #:nodoc:
  module Processor #:nodoc:
    # Base class for pre and post processors. Subclasses must implement the +process+ method.
    class Processor
      def initialize(control, configuration)
        @control = control
        @configuration = configuration
        after_initialize if respond_to?(:after_initialize)
      end
      protected
      # Get the control object
      def control
        @control
      end
      # Get the configuration Hash
      def configuration
        @configuration
      end
      # Get the engine logger
      def log
        Engine.logger
      end
    end
  end
end
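As the comment above says, pre- and post-processors subclass this class and implement process. A minimal hypothetical subclass (the class name, configuration keys, and FileUtils call are illustrative, not part of the gem):

require 'fileutils'

class ArchiveFileProcessor < ETL::Processor::Processor
  # Move the configured file into an archive directory once processing is done.
  def process
    log.info "archiving #{configuration[:file]}"
    FileUtils.mv(configuration[:file], configuration[:archive_dir])
  end
end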
data/lib/etl/processor/rename_processor.rb
ADDED
@@ -0,0 +1,24 @@
module ETL #:nodoc:
  module Processor #:nodoc:
    # Row level processor to rename a field in the row.
    #
    # Configuration options:
    # * <tt>:source</tt>: The source field name
    # * <tt>:dest</tt>: The destination field name
    class RenameProcessor < ETL::Processor::RowProcessor
      def process(row)
        source_value = row[configuration[:source]]
        case source_value
        when Numeric
          row[configuration[:dest]] = source_value
        when nil
          row[configuration[:dest]] = nil
        else
          row[configuration[:dest]] = source_value.dup
        end
        row.delete(configuration[:source])
        row
      end
    end
  end
end
data/lib/etl/processor/require_non_blank_processor.rb
ADDED
@@ -0,0 +1,26 @@
module ETL #:nodoc:
  module Processor #:nodoc:
    # A processor which requires that the specified fields are non-blank in
    # order for the row to be retained.
    class RequireNonBlankProcessor < ETL::Processor::RowProcessor
      # An array of fields to check
      attr_reader :fields

      # Initialize the processor
      #
      # Options:
      # * <tt>:fields</tt>: An array of fields to check, for example:
      #   [:first_name,:last_name]
      def initialize(control, configuration)
        super
        @fields = configuration[:fields] || []
      end

      # Process the row.
      def process(row)
        fields.each { |field| return if row[field].blank? }
        row
      end
    end
  end
end
data/lib/etl/processor/row_processor.rb
ADDED
@@ -0,0 +1,17 @@
module ETL #:nodoc:
  module Processor #:nodoc:
    # Processor which processes a specific row. Unlike a transformer, which deals with a specific
    # value in the row, row processors can process an entire row at once, which can be used to
    # explode a single row into multiple rows (for example)
    class RowProcessor < Processor
      # Initialize the processor
      def initialize(control, configuration)
        super
      end
      # Process the specified row. This method must return the row.
      def process(row)
        raise "process_row is an abstract method"
      end
    end
  end
end
data/lib/etl/processor/sequence_processor.rb
ADDED
@@ -0,0 +1,23 @@
module ETL #:nodoc:
  module Processor #:nodoc:
    # Row level processor to generate a sequence.
    #
    # Configuration options:
    # * <tt>:context</tt>: A context name, if none is specified then the context will be
    #   the current ETL run
    # * <tt>:dest</tt>: The destination field name
    class SequenceProcessor < ETL::Processor::RowProcessor
      def process(row)
        sequences[configuration[:context]] ||= 0
        row[configuration[:dest]] = sequences[configuration[:context]] += 1
        row
      end

      protected
      # Get a Hash of sequences
      def sequences
        @sequences ||= {}
      end
    end
  end
end
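Since the sequence is just an in-memory counter keyed by :context, the behaviour can be sketched directly (the nil control object and field names are purely illustrative):

require 'etl'

seq = ETL::Processor::SequenceProcessor.new(nil, :dest => :line_number)
seq.process(:name => 'alpha')   # => {:name => 'alpha', :line_number => 1}
seq.process(:name => 'beta')    # => {:name => 'beta', :line_number => 2}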
data/lib/etl/processor/surrogate_key_processor.rb
ADDED
@@ -0,0 +1,53 @@
module ETL #:nodoc:
  module Processor #:nodoc:
    # A row level processor that provides surrogate keys
    class SurrogateKeyProcessor < ETL::Processor::RowProcessor
      attr_accessor :destination
      attr_accessor :table
      attr_accessor :column
      attr_accessor :target

      # Initialize the surrogate key generator
      #
      # Configuration options
      # * <tt>:query</tt>: If specified it contains a query to be used to
      #   locate the last surrogate key. If this is specified then :target
      #   must also be specified.
      # * <tt>:target</tt>: The target connection
      # * <tt>:destination</tt>: The destination column name (defaults to :id)
      def initialize(control, configuration)
        super
        @table = configuration[:table]
        @column = configuration[:column] || 'id'
        @target = configuration[:target]
        if configuration[:query]
          raise ControlError, "Query option is no longer valid, use :column and :table instead"
        end
        if table
          @surrogate_key = ETL::Engine.connection(target).select_value("SELECT max(#{column}) FROM #{table_name}")
        end
        #puts "initial surrogate key: #{@surrogate_key}"
        @surrogate_key = 0 if @surrogate_key.blank?
        @surrogate_key = @surrogate_key.to_i
        #puts "surrogate key: #{@surrogate_key}"
        @destination = configuration[:destination] || :id
      end

      # Add a surrogate key to the row
      def process(row)
        if row
          #puts "processing row #{row.inspect}"
          @surrogate_key += 1
          #puts "adding surrogate key to row: #{@surrogate_key}"
          row[destination] = @surrogate_key
          row
        end
      end

      private
      def table_name
        ETL::Engine.table(table, ETL::Engine.connection(target))
      end
    end
  end
end
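A hedged sketch of a control-file configuration matching the options above, where the key continues from SELECT max(id) on the existing dimension table (the before_write hook name and all identifiers are illustrative assumptions):

before_write :surrogate_key,
  :target => :data_warehouse,
  :table => 'customer_dimension',
  :column => 'id',
  :destination => :id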
data/lib/etl/processor/truncate_processor.rb
ADDED
@@ -0,0 +1,35 @@
module ETL #:nodoc:
  module Processor #:nodoc:
    # A processor which will truncate a table. Use as a pre-processor for cleaning out a table
    # prior to loading
    class TruncateProcessor < ETL::Processor::Processor
      # Defines the table to truncate
      attr_reader :table

      # Defines the database connection to use
      attr_reader :target

      # Initialize the processor
      #
      # Options:
      # * <tt>:target</tt>: The target connection
      # * <tt>:table</tt>: The table name
      def initialize(control, configuration)
        super
        #@file = File.join(File.dirname(control.file), configuration[:file])
        @target = configuration[:target] || {}
        @table = configuration[:table]
      end

      def process
        conn = ETL::Engine.connection(target)
        conn.truncate(table_name)
      end

      private
      def table_name
        ETL::Engine.table(table, ETL::Engine.connection(target))
      end
    end
  end
end
data/lib/etl/processor.rb
ADDED
@@ -0,0 +1,11 @@
# This source file contains the ETL::Processor module and requires all of the processors

module ETL #:nodoc:
  # The ETL::Processor module contains row-level and bulk processors
  module Processor
  end
end

require 'etl/processor/processor'
require 'etl/processor/row_processor'
Dir[File.dirname(__FILE__) + "/processor/*.rb"].each { |file| require(file) }