factorylabs-activewarehouse-etl 0.9.1.2
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +198 -0
- data/LICENSE +7 -0
- data/README +85 -0
- data/Rakefile +153 -0
- data/TODO +28 -0
- data/bin/etl +28 -0
- data/bin/etl.cmd +8 -0
- data/examples/database.example.yml +16 -0
- data/lib/etl.rb +78 -0
- data/lib/etl/batch.rb +2 -0
- data/lib/etl/batch/batch.rb +111 -0
- data/lib/etl/batch/directives.rb +55 -0
- data/lib/etl/builder.rb +2 -0
- data/lib/etl/builder/date_dimension_builder.rb +96 -0
- data/lib/etl/builder/time_dimension_builder.rb +31 -0
- data/lib/etl/commands/etl.rb +89 -0
- data/lib/etl/control.rb +3 -0
- data/lib/etl/control/control.rb +405 -0
- data/lib/etl/control/destination.rb +420 -0
- data/lib/etl/control/destination/database_destination.rb +95 -0
- data/lib/etl/control/destination/file_destination.rb +124 -0
- data/lib/etl/control/source.rb +109 -0
- data/lib/etl/control/source/database_source.rb +220 -0
- data/lib/etl/control/source/enumerable_source.rb +11 -0
- data/lib/etl/control/source/file_source.rb +90 -0
- data/lib/etl/control/source/model_source.rb +39 -0
- data/lib/etl/core_ext.rb +1 -0
- data/lib/etl/core_ext/time.rb +5 -0
- data/lib/etl/core_ext/time/calculations.rb +42 -0
- data/lib/etl/engine.rb +556 -0
- data/lib/etl/execution.rb +20 -0
- data/lib/etl/execution/base.rb +9 -0
- data/lib/etl/execution/batch.rb +8 -0
- data/lib/etl/execution/job.rb +8 -0
- data/lib/etl/execution/migration.rb +85 -0
- data/lib/etl/execution/record.rb +18 -0
- data/lib/etl/generator.rb +2 -0
- data/lib/etl/generator/generator.rb +20 -0
- data/lib/etl/generator/surrogate_key_generator.rb +39 -0
- data/lib/etl/http_tools.rb +139 -0
- data/lib/etl/parser.rb +11 -0
- data/lib/etl/parser/apache_combined_log_parser.rb +49 -0
- data/lib/etl/parser/delimited_parser.rb +74 -0
- data/lib/etl/parser/fixed_width_parser.rb +65 -0
- data/lib/etl/parser/parser.rb +41 -0
- data/lib/etl/parser/sax_parser.rb +218 -0
- data/lib/etl/parser/xml_parser.rb +65 -0
- data/lib/etl/processor.rb +11 -0
- data/lib/etl/processor/block_processor.rb +14 -0
- data/lib/etl/processor/bulk_import_processor.rb +81 -0
- data/lib/etl/processor/check_exist_processor.rb +80 -0
- data/lib/etl/processor/check_unique_processor.rb +35 -0
- data/lib/etl/processor/copy_field_processor.rb +26 -0
- data/lib/etl/processor/encode_processor.rb +55 -0
- data/lib/etl/processor/hierarchy_exploder_processor.rb +55 -0
- data/lib/etl/processor/print_row_processor.rb +12 -0
- data/lib/etl/processor/processor.rb +25 -0
- data/lib/etl/processor/rename_processor.rb +24 -0
- data/lib/etl/processor/require_non_blank_processor.rb +26 -0
- data/lib/etl/processor/row_processor.rb +17 -0
- data/lib/etl/processor/sequence_processor.rb +23 -0
- data/lib/etl/processor/surrogate_key_processor.rb +53 -0
- data/lib/etl/processor/truncate_processor.rb +35 -0
- data/lib/etl/row.rb +20 -0
- data/lib/etl/screen.rb +14 -0
- data/lib/etl/screen/row_count_screen.rb +20 -0
- data/lib/etl/transform.rb +2 -0
- data/lib/etl/transform/block_transform.rb +13 -0
- data/lib/etl/transform/date_to_string_transform.rb +20 -0
- data/lib/etl/transform/decode_transform.rb +51 -0
- data/lib/etl/transform/default_transform.rb +20 -0
- data/lib/etl/transform/foreign_key_lookup_transform.rb +151 -0
- data/lib/etl/transform/hierarchy_lookup_transform.rb +49 -0
- data/lib/etl/transform/ordinalize_transform.rb +12 -0
- data/lib/etl/transform/sha1_transform.rb +13 -0
- data/lib/etl/transform/string_to_date_transform.rb +16 -0
- data/lib/etl/transform/string_to_datetime_transform.rb +14 -0
- data/lib/etl/transform/string_to_time_transform.rb +11 -0
- data/lib/etl/transform/transform.rb +61 -0
- data/lib/etl/transform/trim_transform.rb +26 -0
- data/lib/etl/transform/type_transform.rb +35 -0
- data/lib/etl/util.rb +59 -0
- data/lib/etl/version.rb +9 -0
- metadata +195 -0
@@ -0,0 +1,80 @@
|
|
1
|
+
module ETL #:nodoc:
|
2
|
+
module Processor #:nodoc:
|
3
|
+
# A row-level processor that checks if the row already exists in the
|
4
|
+
# target table
|
5
|
+
class CheckExistProcessor < ETL::Processor::RowProcessor
  # A symbol or array of symbols representing keys that should be skipped
  attr_accessor :skip

  # The target database
  attr_accessor :target

  # The name of the table to check against
  attr_accessor :table

  # An array of columns representing the natural key
  attr_accessor :columns

  # Is set to true if the processor should execute the check. It is
  # computed once in +initialize+ and is false when the target table
  # contained no rows at that time.
  attr_accessor :should_check

  # Initialize the processor
  #
  # Configuration options:
  # * <tt>:target</tt>: The target connection (required)
  # * <tt>:table</tt>: The table name (required)
  # * <tt>:skip</tt>: A symbol or array of column names that should not
  #   be checked
  # * <tt>:columns</tt>: An array of columns which represent the natural
  #   key. When omitted, every key of the row is compared.
  #
  # Raises ETL::ControlError if :target or :table is not specified.
  def initialize(control, configuration)
    super
    @skip = configuration[:skip] || []
    @target = configuration[:target] || raise(ETL::ControlError, "target must be specified")
    @table = configuration[:table] || raise(ETL::ControlError, "table must be specified")
    @columns = configuration[:columns]

    # Count the target rows once up front; the result is cached in
    # @should_check so that +process+ can skip the lookup entirely.
    q = "SELECT COUNT(*) FROM #{table_name}"
    @should_check = ETL::Engine.connection(target).select_value(q).to_i > 0
  end

  # Return true if the given key should be skipped. Names are compared as
  # strings so that :skip may hold symbols or strings, as a single value
  # or as an array. A nil skip setting skips nothing.
  def skip?(key)
    Array(skip).any? { |skipped| skipped.to_s == key.to_s }
  end

  # Return true if the row should be checked
  def should_check?
    @should_check ? true : false
  end

  # Process the row. Returns the row when no matching record is found in
  # the target table (or when the check is disabled), and nil when a
  # matching record already exists.
  #
  # Raises ETL::ControlError if the row yields no column to compare.
  def process(row)
    return row unless should_check?

    conn = ETL::Engine.connection(target)
    conditions = []
    row.each do |k, v|
      next unless columns.nil? || columns.include?(k.to_sym)
      next if skip?(k)
      # "column = NULL" is never true in SQL, so nil needs IS NULL
      conditions << (v.nil? ? "#{k} IS NULL" : "#{k} = #{conn.quote(v)}")
    end

    # Without conditions the statement would end in a dangling WHERE
    if conditions.empty?
      raise ETL::ControlError, "no columns available to check for an existing row in #{table}"
    end

    q = "SELECT * FROM #{table_name} WHERE #{conditions.join(' AND ')} LIMIT 1"
    conn.select_one(q).nil? ? row : nil
  end

  private

  # The table name as resolved by the engine for the target connection
  def table_name
    ETL::Engine.table(table, ETL::Engine.connection(target))
  end
end
|
79
|
+
end
|
80
|
+
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
module ETL #:nodoc:
|
2
|
+
module Processor #:nodoc:
|
3
|
+
# Row processor that checks whether or not the row has already passed
|
4
|
+
# through the ETL processor, using the key fields provided as the keys
|
5
|
+
# to check.
|
6
|
+
class CheckUniqueProcessor < ETL::Processor::RowProcessor
  # The keys to check
  attr_accessor :keys

  # Initialize the processor
  # Configuration options:
  # * <tt>:keys</tt>: An array of keys to check against
  def initialize(control, configuration)
    super
    @keys = configuration[:keys]
  end

  # A Hash of compound keys that have already been processed. Each entry
  # is keyed by the array of stringified key values of a row.
  def compound_key_constraints
    @compound_key_constraints ||= {}
  end

  # Process the row. This implementation will only return a row if its
  # key combination has not already been seen; otherwise nil is returned.
  def process(row)
    # An array of values is used instead of a delimiter-joined string so
    # that values containing the delimiter cannot collide, e.g.
    # ["a|b", "c"] and ["a", "b|c"] remain distinct keys.
    key = keys.map { |k| row[k].to_s }
    return nil if compound_key_constraints[key]

    compound_key_constraints[key] = 1
    row
  end
end
|
34
|
+
end
|
35
|
+
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
module ETL #:nodoc:
|
2
|
+
module Processor #:nodoc:
|
3
|
+
# Row processor that will copy one field to another
|
4
|
+
#
|
5
|
+
# Configuration options:
|
6
|
+
# * <tt>:destination</tt>: The destination field
|
7
|
+
# * <tt>:dest</tt>: Alias for :destination
|
8
|
+
# * <tt>:source</tt>: The source field
|
9
|
+
class CopyFieldProcessor < ETL::Processor::RowProcessor
  # Process the given row: the value of the <tt>:source</tt> field is
  # copied into the <tt>:destination</tt> (or <tt>:dest</tt>) field.
  # Returns the row.
  def process(row)
    destination = configuration[:destination] || configuration[:dest]
    source_value = row[configuration[:source]]
    row[destination] =
      case source_value
      when nil, true, false, Numeric, Symbol
        # Immediate values are immutable, and calling dup on them raises
        # TypeError on Rubies before 2.4, so they are assigned as-is.
        source_value
      else
        # Copy so that later in-place changes to one field do not leak
        # into the other.
        source_value.dup
      end
    row
  end
end
|
25
|
+
end
|
26
|
+
end
|
@@ -0,0 +1,55 @@
|
|
1
|
+
require 'iconv'
|
2
|
+
|
3
|
+
module ETL #:nodoc:
|
4
|
+
module Processor #:nodoc:
|
5
|
+
# The encode processor uses Iconv to convert a file from one encoding (eg: utf-8) to another (eg: latin1), line by line.
|
6
|
+
class EncodeProcessor < ETL::Processor::Processor

  # The file to load from
  attr_reader :source_file
  # The file to write to
  attr_reader :target_file
  # The source file encoding
  attr_reader :source_encoding
  # The target file encoding
  attr_reader :target_encoding

  # Initialize the processor.
  #
  # Configuration options:
  # * <tt>:source_file</tt>: The file to load data from
  # * <tt>:source_encoding</tt>: The source file encoding (eg: 'latin1','utf-8'), as supported by Iconv
  # * <tt>:target_file</tt>: The file to write data to
  # * <tt>:target_encoding</tt>: The target file encoding
  #
  # Both file names are resolved relative to the directory of the control
  # file.
  #
  # Raises ControlError if a file is missing from the configuration, if
  # both files resolve to the same path, or if Iconv rejects one of the
  # encodings.
  def initialize(control, configuration)
    super
    raise ControlError, "Source file must be specified" if configuration[:source_file].nil?
    raise ControlError, "Target file must be specified" if configuration[:target_file].nil?
    @source_file = File.join(File.dirname(control.file), configuration[:source_file])
    @source_encoding = configuration[:source_encoding]
    @target_file = File.join(File.dirname(control.file), configuration[:target_file])
    @target_encoding = configuration[:target_encoding]
    raise ControlError, "Source and target file cannot currently point to the same file" if source_file == target_file
    begin
      @iconv = Iconv.new(target_encoding,source_encoding)
    rescue Iconv::InvalidEncoding
      raise ControlError, "Either the source encoding '#{source_encoding}' or the target encoding '#{target_encoding}' is not supported"
    end
  end

  # Execute the processor: convert the source file into the target file.
  def process
    # operate line by line to handle large files without loading them in-memory
    # could be replaced by a system iconv call when available, for greater performance
    File.open(source_file) do |source|
      File.open(target_file,'w') do |target|
        source.each_line do |line|
          target << @iconv.iconv(line)
        end
        # Flush the converter: passing nil resets the conversion state and
        # returns any pending shift sequence, which stateful encodings
        # need at the end of the output.
        target << @iconv.iconv(nil)
      end
    end
  end
end
|
54
|
+
end
|
55
|
+
end
|
@@ -0,0 +1,55 @@
|
|
1
|
+
module ETL #:nodoc:
|
2
|
+
module Processor #:nodoc:
|
3
|
+
# Row-level processor that will convert a single row into multiple rows designed to be inserted
|
4
|
+
# into a hierarchy bridge table.
|
5
|
+
class HierarchyExploderProcessor < ETL::Processor::RowProcessor
  # The name of the id field, used both as the row key and as the column name
  attr_accessor :id_field
  # The name of the parent id field, used both as the row key and as the column name
  attr_accessor :parent_id_field

  # Initialize the processor
  #
  # Configuration options:
  # * <tt>:target</tt>: The target connection, passed to ETL::Engine.connection
  # * <tt>:table</tt>: The table holding the hierarchy
  # * <tt>:id_field</tt>: The name of the id field (defaults to 'id')
  # * <tt>:parent_id_field</tt>: The name of the parent id field (defaults to 'parent_id')
  #
  # TODO: Allow resolver to be implemented in a customizable fashion, i.e. don't rely
  # on AR as the only resolution method.
  def initialize(control, configuration={})
    @id_field = configuration[:id_field] || 'id'
    @parent_id_field = configuration[:parent_id_field] || 'parent_id'
    super
  end

  # Process the row expanding it into hierarchy values. Returns an array
  # of Hashes, one for the row itself and one for each of its descendants.
  def process(row)
    rows = []
    table = configuration[:table]
    conn = ETL::Engine.connection(configuration[:target])
    # Read the row through the configured field names instead of the
    # hard-coded :id / :parent_id so that custom names are honoured.
    id = row[id_field.to_sym]
    root = row[parent_id_field.to_sym].nil?
    build_rows([id], id, id, root, 0, rows, table, conn)
    rows
  end

  protected
  # Recursive function that will add a row for the current level and then call build_rows
  # for all of the children of the current level.
  #
  # +row_id+ is the id of the row being exploded and is written as
  # :parent_id on every generated row; +level+ is the depth below it.
  # +parent_id+ is accepted but not used by the current implementation.
  def build_rows(ids, parent_id, row_id, root, level, rows, table, conn)
    ids.each do |id|
      # Quote the id so that non-numeric ids produce valid SQL
      child_ids = conn.select_values("SELECT #{id_field} FROM #{table} WHERE #{parent_id_field} = #{conn.quote(id)}")

      row = {
        :parent_id => row_id,
        :child_id => id,
        :num_levels_from_parent => level,
        :is_bottom => (child_ids.empty? ? 1 : 0),
        :is_top => (root ? 1 : 0),
      }
      rows << row

      build_rows(child_ids, id, row_id, false, level + 1, rows, table, conn)
    end
  end
end
|
54
|
+
end
|
55
|
+
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
module ETL #:nodoc:
|
2
|
+
module Processor #:nodoc:
|
3
|
+
# Base class for pre and post processors. Subclasses must implement the +process+ method.
|
4
|
+
class Processor
  # Remember the control object and the configuration, then run the
  # optional +after_initialize+ hook when the instance responds to it.
  def initialize(control, configuration)
    @control, @configuration = control, configuration
    return unless respond_to?(:after_initialize)
    after_initialize
  end

  protected

  # Readers for the control object and the configuration Hash
  attr_reader :control, :configuration

  # Get the engine logger
  def log
    Engine.logger
  end
end
|
24
|
+
end
|
25
|
+
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
module ETL #:nodoc:
|
2
|
+
module Processor #:nodoc:
|
3
|
+
# Row level processor to rename a field in the row.
|
4
|
+
#
|
5
|
+
# Configuration options:
|
6
|
+
# * <tt>:source</tt>: the source field name
|
7
|
+
# * <tt>:dest</tt>: The destination field name
|
8
|
+
class RenameProcessor < ETL::Processor::RowProcessor
  # Process the row: the value stored under <tt>:source</tt> is moved to
  # <tt>:dest</tt> and the source field is removed. Returns the row.
  def process(row)
    source = configuration[:source]
    dest = configuration[:dest]
    # Renaming a field to itself must not drop it from the row
    return row if source == dest

    source_value = row.delete(source)
    row[dest] =
      case source_value
      when nil, true, false, Numeric, Symbol
        # Immediate values are immutable, and calling dup on them raises
        # TypeError on Rubies before 2.4, so they are assigned as-is.
        source_value
      else
        source_value.dup
      end
    row
  end
end
|
23
|
+
end
|
24
|
+
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
module ETL #:nodoc:
|
2
|
+
module Processor #:nodoc:
|
3
|
+
# A processor which requires that the particular fields are non-blank in
|
4
|
+
# order for the row to be retained.
|
5
|
+
class RequireNonBlankProcessor < ETL::Processor::RowProcessor
  # An array of fields to check
  attr_reader :fields

  # Initialize the processor
  #
  # Options:
  # * <tt>:fields</tt>: An array of fields to check, for example:
  #   [:first_name,:last_name]
  def initialize(control, configuration)
    super
    @fields = configuration[:fields] || []
  end

  # Process the row. Returns nil as soon as one of the fields is blank,
  # otherwise returns the row.
  def process(row)
    has_blank = fields.any? { |name| row[name].blank? }
    has_blank ? nil : row
  end
end
|
25
|
+
end
|
26
|
+
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
module ETL #:nodoc:
|
2
|
+
module Processor #:nodoc:
|
3
|
+
# Processor which processes a specific row. Unlike a transformer, which deals with a specific
|
4
|
+
# value in the row, row processors can process an entire row at once, which can be used to
|
5
|
+
# explode a single row into multiple rows (for example)
|
6
|
+
class RowProcessor < Processor
  # Initialize the processor
  def initialize(control, configuration)
    super
  end

  # Process the specified row. Subclasses must override this method;
  # this base implementation always raises.
  def process(row)
    raise "process is an abstract method"
  end
end
|
16
|
+
end
|
17
|
+
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
module ETL #:nodoc:
|
2
|
+
module Processor #:nodoc:
|
3
|
+
# Row level processor to generate a sequence.
|
4
|
+
#
|
5
|
+
# Configuration options:
|
6
|
+
# * <tt>:context</tt>: A context name, if none is specified then the context will be
|
7
|
+
# the current ETL run
|
8
|
+
# * <tt>:dest</tt>: The destination field name
|
9
|
+
class SequenceProcessor < ETL::Processor::RowProcessor
  # Write the next value of the sequence for the configured
  # <tt>:context</tt> into the <tt>:dest</tt> field and return the row.
  # Each context starts counting at 1.
  def process(row)
    context = configuration[:context]
    next_value = (sequences[context] || 0) + 1
    sequences[context] = next_value
    row[configuration[:dest]] = next_value
    row
  end

  protected
  # Get a Hash of sequences, mapping each context to its last value
  def sequences
    @sequences ||= {}
  end
end
|
22
|
+
end
|
23
|
+
end
|
@@ -0,0 +1,53 @@
|
|
1
|
+
module ETL #:nodoc:
|
2
|
+
module Processor #:nodoc:
|
3
|
+
# A row level processor that provides surrogate keys
|
4
|
+
class SurrogateKeyProcessor < ETL::Processor::RowProcessor
  # The row field that receives the surrogate key
  attr_accessor :destination
  # The table queried for the current maximum key
  attr_accessor :table
  # The column queried for the current maximum key
  attr_accessor :column
  # The target connection
  attr_accessor :target

  # Initialize the surrogate key generator
  #
  # Configuration options
  # * <tt>:table</tt>: If specified, the last surrogate key is read as
  #   the maximum of :column in this table; otherwise keys start after 0.
  # * <tt>:column</tt>: The column holding the surrogate key (defaults
  #   to 'id')
  # * <tt>:target</tt>: The target connection
  # * <tt>:destination</tt>: The destination column name (defaults to :id)
  #
  # Raises ControlError if the retired <tt>:query</tt> option is given.
  def initialize(control, configuration)
    super
    @table = configuration[:table]
    @column = configuration[:column] || 'id'
    @target = configuration[:target]
    if configuration[:query]
      raise ControlError, "Query option is no longer valid, use :column and :table instead"
    end
    if table
      @surrogate_key = ETL::Engine.connection(target).select_value("SELECT max(#{column}) FROM #{table_name}")
    end
    # No table configured, or an empty result: start from zero
    @surrogate_key = 0 if @surrogate_key.blank?
    @surrogate_key = @surrogate_key.to_i
    @destination = configuration[:destination] || :id
  end

  # Add a surrogate key to the row. Returns the row, or nil when no row
  # was given.
  def process(row)
    return unless row

    @surrogate_key += 1
    row[destination] = @surrogate_key
    row
  end

  private

  # The table name as resolved by the engine for the target connection
  def table_name
    ETL::Engine.table(table, ETL::Engine.connection(target))
  end
end
|
52
|
+
end
|
53
|
+
end
|