factorylabs-activewarehouse-etl 0.9.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +198 -0
- data/LICENSE +7 -0
- data/README +85 -0
- data/Rakefile +153 -0
- data/TODO +28 -0
- data/bin/etl +28 -0
- data/bin/etl.cmd +8 -0
- data/examples/database.example.yml +16 -0
- data/lib/etl.rb +78 -0
- data/lib/etl/batch.rb +2 -0
- data/lib/etl/batch/batch.rb +111 -0
- data/lib/etl/batch/directives.rb +55 -0
- data/lib/etl/builder.rb +2 -0
- data/lib/etl/builder/date_dimension_builder.rb +96 -0
- data/lib/etl/builder/time_dimension_builder.rb +31 -0
- data/lib/etl/commands/etl.rb +89 -0
- data/lib/etl/control.rb +3 -0
- data/lib/etl/control/control.rb +405 -0
- data/lib/etl/control/destination.rb +420 -0
- data/lib/etl/control/destination/database_destination.rb +95 -0
- data/lib/etl/control/destination/file_destination.rb +124 -0
- data/lib/etl/control/source.rb +109 -0
- data/lib/etl/control/source/database_source.rb +220 -0
- data/lib/etl/control/source/enumerable_source.rb +11 -0
- data/lib/etl/control/source/file_source.rb +90 -0
- data/lib/etl/control/source/model_source.rb +39 -0
- data/lib/etl/core_ext.rb +1 -0
- data/lib/etl/core_ext/time.rb +5 -0
- data/lib/etl/core_ext/time/calculations.rb +42 -0
- data/lib/etl/engine.rb +556 -0
- data/lib/etl/execution.rb +20 -0
- data/lib/etl/execution/base.rb +9 -0
- data/lib/etl/execution/batch.rb +8 -0
- data/lib/etl/execution/job.rb +8 -0
- data/lib/etl/execution/migration.rb +85 -0
- data/lib/etl/execution/record.rb +18 -0
- data/lib/etl/generator.rb +2 -0
- data/lib/etl/generator/generator.rb +20 -0
- data/lib/etl/generator/surrogate_key_generator.rb +39 -0
- data/lib/etl/http_tools.rb +139 -0
- data/lib/etl/parser.rb +11 -0
- data/lib/etl/parser/apache_combined_log_parser.rb +49 -0
- data/lib/etl/parser/delimited_parser.rb +74 -0
- data/lib/etl/parser/fixed_width_parser.rb +65 -0
- data/lib/etl/parser/parser.rb +41 -0
- data/lib/etl/parser/sax_parser.rb +218 -0
- data/lib/etl/parser/xml_parser.rb +65 -0
- data/lib/etl/processor.rb +11 -0
- data/lib/etl/processor/block_processor.rb +14 -0
- data/lib/etl/processor/bulk_import_processor.rb +81 -0
- data/lib/etl/processor/check_exist_processor.rb +80 -0
- data/lib/etl/processor/check_unique_processor.rb +35 -0
- data/lib/etl/processor/copy_field_processor.rb +26 -0
- data/lib/etl/processor/encode_processor.rb +55 -0
- data/lib/etl/processor/hierarchy_exploder_processor.rb +55 -0
- data/lib/etl/processor/print_row_processor.rb +12 -0
- data/lib/etl/processor/processor.rb +25 -0
- data/lib/etl/processor/rename_processor.rb +24 -0
- data/lib/etl/processor/require_non_blank_processor.rb +26 -0
- data/lib/etl/processor/row_processor.rb +17 -0
- data/lib/etl/processor/sequence_processor.rb +23 -0
- data/lib/etl/processor/surrogate_key_processor.rb +53 -0
- data/lib/etl/processor/truncate_processor.rb +35 -0
- data/lib/etl/row.rb +20 -0
- data/lib/etl/screen.rb +14 -0
- data/lib/etl/screen/row_count_screen.rb +20 -0
- data/lib/etl/transform.rb +2 -0
- data/lib/etl/transform/block_transform.rb +13 -0
- data/lib/etl/transform/date_to_string_transform.rb +20 -0
- data/lib/etl/transform/decode_transform.rb +51 -0
- data/lib/etl/transform/default_transform.rb +20 -0
- data/lib/etl/transform/foreign_key_lookup_transform.rb +151 -0
- data/lib/etl/transform/hierarchy_lookup_transform.rb +49 -0
- data/lib/etl/transform/ordinalize_transform.rb +12 -0
- data/lib/etl/transform/sha1_transform.rb +13 -0
- data/lib/etl/transform/string_to_date_transform.rb +16 -0
- data/lib/etl/transform/string_to_datetime_transform.rb +14 -0
- data/lib/etl/transform/string_to_time_transform.rb +11 -0
- data/lib/etl/transform/transform.rb +61 -0
- data/lib/etl/transform/trim_transform.rb +26 -0
- data/lib/etl/transform/type_transform.rb +35 -0
- data/lib/etl/util.rb +59 -0
- data/lib/etl/version.rb +9 -0
- metadata +195 -0
@@ -0,0 +1,80 @@
|
|
1
|
+
module ETL #:nodoc:
  module Processor #:nodoc:
    # A row-level processor that checks if the row already exists in the
    # target table. Rows that are found are dropped (+process+ returns nil);
    # rows that are not found are passed through unchanged.
    class CheckExistProcessor < ETL::Processor::RowProcessor
      # A symbol or array of symbols representing keys that should be skipped
      attr_accessor :skip

      # The target database
      attr_accessor :target

      # The name of the table to check against
      attr_accessor :table

      # An array of columns representing the natural key
      attr_accessor :columns

      # Is set to true if the processor should execute the check. The
      # constructor sets it to false when the target table has no rows.
      attr_accessor :should_check

      # Initialize the processor
      #
      # Configuration options:
      # * <tt>:target</tt>: The target connection (required)
      # * <tt>:table</tt>: The table name (required)
      # * <tt>:skip</tt>: A symbol or array of column names that should not
      #   be checked (defaults to an empty array)
      # * <tt>:columns</tt>: An array of columns which represent the natural
      #   key. When nil, every key of the row is used.
      #
      # Raises ETL::ControlError if :target or :table is missing.
      def initialize(control, configuration)
        super
        @skip = configuration[:skip] || []
        @target = configuration[:target] || raise(ETL::ControlError, "target must be specified")
        @table = configuration[:table] || raise(ETL::ControlError, "table must be specified")
        @columns = configuration[:columns]

        # Count once up front: an empty table cannot contain the row, so the
        # per-row lookup can be skipped entirely.
        q = "SELECT COUNT(*) FROM #{table_name}"
        @should_check = ETL::Engine.connection(target).select_value(q).to_i > 0
      end

      # Return true if the given key should be skipped
      def skip?(key)
        case skip
        when Array
          skip.include?(key)
        else
          skip.to_sym == key.to_sym
        end
      end

      # Return true if the row should be checked
      def should_check?
        @should_check ? true : false
      end

      # Process the row.
      #
      # Returns the row when the check is disabled or when no record matching
      # the row's key values exists in the target table; returns nil when a
      # matching record is found.
      #
      # Raises ETL::ControlError when none of the row's keys can be used as a
      # condition (every key is skipped or outside :columns). Previously this
      # case sent a malformed "WHERE  LIMIT 1" statement to the database.
      def process(row)
        return row unless should_check?

        conn = ETL::Engine.connection(target)
        conditions = []
        row.each do |k, v|
          next unless columns.nil? || columns.include?(k.to_sym)
          next if skip?(k.to_sym)
          conditions << condition_for(conn, k, v)
        end

        if conditions.empty?
          raise ETL::ControlError, "no columns available to check for an existing row in #{table}"
        end

        q = "SELECT * FROM #{table_name} WHERE #{conditions.join(' AND ')} LIMIT 1"
        conn.select_one(q).nil? ? row : nil
      end

      private

      # Build the SQL condition for a single column. A nil value has to be
      # matched with IS NULL, because "column = NULL" is never true in SQL.
      def condition_for(conn, column, value)
        if value.nil?
          "#{column} IS NULL"
        else
          "#{column} = #{conn.quote(value)}"
        end
      end

      # The table name as returned by ETL::Engine.table for the target
      # connection (presumably the physical table to query -- confirm against
      # ETL::Engine).
      def table_name
        ETL::Engine.table(table, ETL::Engine.connection(target))
      end
    end
  end
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
module ETL #:nodoc:
  module Processor #:nodoc:
    # Row processor that checks whether or not the row has already passed
    # through the ETL processor, using the key fields provided as the keys
    # to check.
    class CheckUniqueProcessor < ETL::Processor::RowProcessor

      # The keys to check
      attr_accessor :keys

      # Initialize the processor
      #
      # Configuration options:
      # * <tt>:keys</tt>: An array of keys to check against
      def initialize(control, configuration)
        super
        @keys = configuration[:keys]
      end

      # A Hash of key combinations that have already been processed. Each
      # entry is keyed by an Array holding the string form of every key
      # field's value, in the order given by +keys+.
      def compound_key_constraints
        @compound_key_constraints ||= {}
      end

      # Process the row. This implementation will only return a row if its
      # key combination has not already been seen; otherwise it returns nil.
      #
      # The values are compared as an Array rather than as one delimited
      # String, so values that themselves contain the delimiter can no longer
      # collide (e.g. ["a|b", "c"] versus ["a", "b|c"]). Values are still
      # compared by their string form, as before, so nil and "" are treated
      # as the same value.
      def process(row)
        # dup so that later in-place changes to a row value cannot alter a
        # key that is already stored in the Hash
        key = keys.map { |k| row[k].to_s.dup }
        return nil if compound_key_constraints[key]

        compound_key_constraints[key] = 1
        row
      end
    end
  end
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
module ETL #:nodoc:
  module Processor #:nodoc:
    # Row processor that will copy one field to another
    #
    # Configuration options:
    # * <tt>:destination</tt>: The destination field
    # * <tt>:dest</tt>: Alias for :destination
    # * <tt>:source</tt>: The source field
    class CopyFieldProcessor < ETL::Processor::RowProcessor
      # Process the given row: store a copy of the source field's value under
      # the destination field and return the row.
      #
      # Numbers, symbols, booleans and nil are stored as-is. They are
      # immutable, and calling +dup+ on a Symbol, true or false raises
      # TypeError on Ruby versions older than 2.4. Every other value is
      # duplicated so that the two fields do not share one mutable object.
      def process(row)
        destination = (configuration[:destination] || configuration[:dest])
        source_value = row[configuration[:source]]
        row[destination] =
          case source_value
          when Numeric, Symbol, nil, true, false
            source_value
          else
            source_value.dup
          end
        row
      end
    end
  end
end
|
@@ -0,0 +1,55 @@
|
|
1
|
+
require 'iconv'
|
2
|
+
|
3
|
+
module ETL #:nodoc:
  module Processor #:nodoc:
    # Processor that re-encodes a file with Iconv, reading the source file one
    # line at a time and writing each converted line to the target file
    # (eg: utf-8 to latin1).
    class EncodeProcessor < ETL::Processor::Processor

      # Path of the file that is read
      attr_reader :source_file
      # Path of the file that is written
      attr_reader :target_file
      # Encoding of the file that is read
      attr_reader :source_encoding
      # Encoding of the file that is written
      attr_reader :target_encoding

      # Set up the processor.
      #
      # Configuration options:
      # * <tt>:source_file</tt>: The file to load data from
      # * <tt>:source_encoding</tt>: The source file encoding (eg: 'latin1','utf-8'), as supported by Iconv
      # * <tt>:target_file</tt>: The file to write data to
      # * <tt>:target_encoding</tt>: The target file encoding
      #
      # Both file names are resolved against the directory of the control
      # file. Raises ControlError when a file name is missing, when both
      # names resolve to the same path, or when Iconv rejects an encoding.
      def initialize(control, configuration)
        super
        raise ControlError, "Source file must be specified" if configuration[:source_file].nil?
        raise ControlError, "Target file must be specified" if configuration[:target_file].nil?

        base_dir = File.dirname(control.file)
        @source_file = File.join(base_dir, configuration[:source_file])
        @target_file = File.join(base_dir, configuration[:target_file])
        @source_encoding = configuration[:source_encoding]
        @target_encoding = configuration[:target_encoding]

        if source_file == target_file
          raise ControlError, "Source and target file cannot currently point to the same file"
        end

        begin
          @iconv = Iconv.new(target_encoding,source_encoding)
        rescue Iconv::InvalidEncoding
          raise ControlError, "Either the source encoding '#{source_encoding}' or the target encoding '#{target_encoding}' is not supported"
        end
      end

      # Run the conversion.
      #
      # Lines are converted one by one so that large files never have to be
      # held in memory. A system iconv call could replace this for better
      # performance where one is available.
      def process
        File.open(source_file) do |input|
          File.open(target_file,'w') do |output|
            input.each_line { |line| output << @iconv.iconv(line) }
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,55 @@
|
|
1
|
+
module ETL #:nodoc:
  module Processor #:nodoc:
    # Row-level processor that will convert a single row into multiple rows designed to be inserted
    # into a hierarchy bridge table.
    class HierarchyExploderProcessor < ETL::Processor::RowProcessor
      # Name of the id column used in the child lookup query
      attr_accessor :id_field
      # Name of the parent id column used in the child lookup query
      attr_accessor :parent_id_field

      # Initialize the processor
      #
      # Configuration options:
      # * <tt>:target</tt>: The target connection used for the child lookups
      # * <tt>:table</tt>: The table that holds the hierarchy
      # * <tt>:id_field</tt>: The name of the id field (defaults to 'id')
      # * <tt>:parent_id_field</tt>: The name of the parent id field (defaults to 'parent_id')
      #
      # TODO: Allow resolver to be implemented in a customizable fashion, i.e. don't rely
      # on AR as the only resolution method.
      def initialize(control, configuration={})
        @id_field = configuration[:id_field] || 'id'
        @parent_id_field = configuration[:parent_id_field] || 'parent_id'
        super
      end

      # Process the row expanding it into hierarchy values.
      #
      # Returns an Array of Hashes: one for the row itself (level 0) and one
      # for each of its descendants found in the configured table.
      #
      # NOTE(review): the row is always read through the :id and :parent_id
      # keys, independent of the configured id_field / parent_id_field, which
      # only affect the SQL -- confirm this is intended.
      def process(row)
        rows = []
        target = configuration[:target]
        table = configuration[:table]
        conn = ETL::Engine.connection(target)
        build_rows([row[:id]], row[:id], row[:id], row[:parent_id].nil?, 0, rows, table, conn)
        rows
      end

      protected
      # Recursive function that will add a row for the current level and then call build_rows
      # for all of the children of the current level.
      #
      # * +ids+: the ids to emit at this level
      # * +parent_id+: id of the direct parent (not used when building rows)
      # * +row_id+: id of the row being exploded; stored as :parent_id on every emitted row
      # * +root+: true when the exploded row has no parent; stored as :is_top
      # * +level+: distance from the exploded row; stored as :num_levels_from_parent
      # * +rows+: the Array the generated rows are appended to
      # * +table+, +conn+: the table and connection used to look up children
      #
      # There is no guard against cycles in the hierarchy data.
      def build_rows(ids, parent_id, row_id, root, level, rows, table, conn)
        ids.each do |id|
          # Quote the id instead of interpolating it raw, so that string ids
          # and nil ids produce valid SQL and cannot inject SQL.
          child_ids = conn.select_values(
            "SELECT #{id_field} FROM #{table} WHERE #{parent_id_field} = #{conn.quote(id)}"
          )

          rows << {
            :parent_id => row_id,
            :child_id => id,
            :num_levels_from_parent => level,
            :is_bottom => (child_ids.empty? ? 1 : 0),
            :is_top => (root ? 1 : 0)
          }

          build_rows(child_ids, id, row_id, false, level + 1, rows, table, conn)
        end
      end
    end
  end
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
module ETL #:nodoc:
  module Processor #:nodoc:
    # Base class for pre and post processors. Subclasses must implement the +process+ method.
    class Processor
      # Store the control object and the configuration Hash, then invoke the
      # optional +after_initialize+ hook when the subclass defines one.
      #
      # The hook is looked up with include_all = true so that it is also
      # found when it is declared protected or private; a plain respond_to?
      # only sees public methods and would silently skip such a hook.
      def initialize(control, configuration)
        @control = control
        @configuration = configuration
        after_initialize if respond_to?(:after_initialize, true)
      end

      protected
      # Get the control object
      def control
        @control
      end

      # Get the configuration Hash
      def configuration
        @configuration
      end

      # Get the engine logger
      def log
        Engine.logger
      end
    end
  end
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
module ETL #:nodoc:
  module Processor #:nodoc:
    # Row level processor to rename a field in the row.
    #
    # Configuration options:
    # * <tt>:source</tt>: the source field name
    # * <tt>:dest</tt>: The destination field name
    class RenameProcessor < ETL::Processor::RowProcessor
      # Move the value stored under :source to :dest, remove the :source key
      # and return the row.
      #
      # When :source and :dest name the same field the row is returned
      # untouched; previously the final delete removed the field altogether.
      #
      # Numbers, symbols, booleans and nil are moved as-is. They are
      # immutable, and calling +dup+ on a Symbol, true or false raises
      # TypeError on Ruby versions older than 2.4. Every other value is
      # duplicated.
      def process(row)
        source = configuration[:source]
        dest = configuration[:dest]
        return row if source == dest

        source_value = row[source]
        row[dest] =
          case source_value
          when Numeric, Symbol, nil, true, false
            source_value
          else
            source_value.dup
          end
        row.delete(source)
        row
      end
    end
  end
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
module ETL #:nodoc:
  module Processor #:nodoc:
    # Processor that only retains a row when every one of the configured
    # fields holds a non-blank value.
    class RequireNonBlankProcessor < ETL::Processor::RowProcessor
      # The fields that have to be non-blank
      attr_reader :fields

      # Set up the processor
      #
      # Options:
      # * <tt>:fields</tt>: An array of fields to check, for example:
      #   [:first_name,:last_name]. Defaults to an empty array.
      def initialize(control, configuration)
        super
        @fields = configuration[:fields] || []
      end

      # Return the row, or nil as soon as one of the fields is blank.
      def process(row)
        return nil if fields.any? { |name| row[name].blank? }

        row
      end
    end
  end
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
module ETL #:nodoc:
  module Processor #:nodoc:
    # Processor which processes a specific row. Unlike a transformer, which deals with a specific
    # value in the row, row processors can process an entire row at once, which can be used to
    # explode a single row into multiple rows (for example)
    class RowProcessor < Processor
      # Initialize the processor
      def initialize(control, configuration)
        super
      end

      # Process the specified row. Abstract: subclasses must override this
      # method; the base implementation always raises a RuntimeError.
      #
      # Implementations in this library return the row, nil, or an Array of
      # rows (presumably nil drops the row and an Array replaces it with
      # several rows -- confirm against the engine).
      def process(row)
        # The message names the method that actually has to be overridden
        # (it used to refer to a nonexistent "process_row").
        raise "process is an abstract method"
      end
    end
  end
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
module ETL #:nodoc:
  module Processor #:nodoc:
    # Row level processor that writes the next number of a sequence into
    # the row.
    #
    # Configuration options:
    # * <tt>:context</tt>: A context name, if none is specified then the context will be
    #   the current ETL run
    # * <tt>:dest</tt>: The destination field name
    class SequenceProcessor < ETL::Processor::RowProcessor
      # Advance the sequence of the configured context by one (a new context
      # starts counting at 1), store the new number in the :dest field and
      # return the row.
      def process(row)
        context = configuration[:context]
        next_value = (sequences[context] || 0) + 1
        sequences[context] = next_value
        row[configuration[:dest]] = next_value
        row
      end

      protected
      # The Hash that maps each context to the last number handed out for it
      def sequences
        @sequences ||= {}
      end
    end
  end
end
|
@@ -0,0 +1,53 @@
|
|
1
|
+
module ETL #:nodoc:
  module Processor #:nodoc:
    # A row level processor that provides surrogate keys
    class SurrogateKeyProcessor < ETL::Processor::RowProcessor
      # The row field that receives the generated key
      attr_accessor :destination
      # The table whose current maximum key seeds the sequence
      attr_accessor :table
      # The column holding the key in that table
      attr_accessor :column
      # The target connection
      attr_accessor :target

      # Initialize the surrogate key generator
      #
      # Configuration options
      # * <tt>:table</tt>: If specified, the key sequence continues from the
      #   current maximum of :column in this table. Otherwise it starts at 0.
      # * <tt>:column</tt>: The key column in :table (defaults to 'id')
      # * <tt>:target</tt>: The target connection used to query :table
      # * <tt>:destination</tt>: The destination column name (defaults to :id)
      # * <tt>:query</tt>: No longer supported; passing it raises
      #   ControlError. Use :column and :table instead.
      def initialize(control, configuration)
        super
        @table = configuration[:table]
        @column = configuration[:column] || 'id'
        @target = configuration[:target]
        if configuration[:query]
          raise ControlError, "Query option is no longer valid, use :column and :table instead"
        end
        if table
          @surrogate_key = ETL::Engine.connection(target).select_value("SELECT max(#{column}) FROM #{table_name}")
        end
        # max() yields no value for an empty table, and no query runs at all
        # without :table; both cases start the sequence at 0.
        @surrogate_key = 0 if @surrogate_key.blank?
        @surrogate_key = @surrogate_key.to_i
        @destination = configuration[:destination] || :id
      end

      # Add a surrogate key to the row: increment the counter, store it in
      # the destination field and return the row. Returns nil, without
      # consuming a key, when the row is nil.
      def process(row)
        return unless row

        @surrogate_key += 1
        row[destination] = @surrogate_key
        row
      end

      private

      # The table name as returned by ETL::Engine.table for the target
      # connection (presumably the physical table to query -- confirm against
      # ETL::Engine).
      def table_name
        ETL::Engine.table(table, ETL::Engine.connection(target))
      end
    end
  end
end
|