activewarehouse-etl-sgonyea 0.9.6
- data/.gitignore +9 -0
- data/0.9-UPGRADE +6 -0
- data/CHANGELOG +236 -0
- data/Gemfile +4 -0
- data/HOW_TO_RELEASE +13 -0
- data/LICENSE +7 -0
- data/README.textile +111 -0
- data/Rakefile +103 -0
- data/TODO +28 -0
- data/active_support_logger.patch +78 -0
- data/activewarehouse-etl.gemspec +36 -0
- data/bin/etl +28 -0
- data/bin/etl.cmd +8 -0
- data/examples/database.example.yml +16 -0
- data/lib/etl.rb +97 -0
- data/lib/etl/batch.rb +2 -0
- data/lib/etl/batch/batch.rb +111 -0
- data/lib/etl/batch/directives.rb +65 -0
- data/lib/etl/builder.rb +2 -0
- data/lib/etl/builder/date_dimension_builder.rb +96 -0
- data/lib/etl/builder/time_dimension_builder.rb +31 -0
- data/lib/etl/commands/etl.rb +89 -0
- data/lib/etl/control.rb +3 -0
- data/lib/etl/control/control.rb +405 -0
- data/lib/etl/control/destination.rb +438 -0
- data/lib/etl/control/destination/csv_destination.rb +113 -0
- data/lib/etl/control/destination/database_destination.rb +97 -0
- data/lib/etl/control/destination/excel_destination.rb +91 -0
- data/lib/etl/control/destination/file_destination.rb +126 -0
- data/lib/etl/control/destination/insert_update_database_destination.rb +136 -0
- data/lib/etl/control/destination/update_database_destination.rb +109 -0
- data/lib/etl/control/destination/yaml_destination.rb +74 -0
- data/lib/etl/control/source.rb +132 -0
- data/lib/etl/control/source/database_source.rb +224 -0
- data/lib/etl/control/source/enumerable_source.rb +11 -0
- data/lib/etl/control/source/file_source.rb +90 -0
- data/lib/etl/control/source/model_source.rb +39 -0
- data/lib/etl/core_ext.rb +1 -0
- data/lib/etl/core_ext/time.rb +5 -0
- data/lib/etl/core_ext/time/calculations.rb +42 -0
- data/lib/etl/engine.rb +582 -0
- data/lib/etl/execution.rb +19 -0
- data/lib/etl/execution/base.rb +8 -0
- data/lib/etl/execution/batch.rb +10 -0
- data/lib/etl/execution/job.rb +8 -0
- data/lib/etl/execution/migration.rb +90 -0
- data/lib/etl/generator.rb +2 -0
- data/lib/etl/generator/generator.rb +20 -0
- data/lib/etl/generator/surrogate_key_generator.rb +39 -0
- data/lib/etl/http_tools.rb +139 -0
- data/lib/etl/parser.rb +11 -0
- data/lib/etl/parser/apache_combined_log_parser.rb +49 -0
- data/lib/etl/parser/csv_parser.rb +93 -0
- data/lib/etl/parser/excel_parser.rb +112 -0
- data/lib/etl/parser/fixed_width_parser.rb +65 -0
- data/lib/etl/parser/nokogiri_xml_parser.rb +83 -0
- data/lib/etl/parser/parser.rb +41 -0
- data/lib/etl/parser/sax_parser.rb +218 -0
- data/lib/etl/parser/xml_parser.rb +65 -0
- data/lib/etl/processor.rb +11 -0
- data/lib/etl/processor/block_processor.rb +14 -0
- data/lib/etl/processor/bulk_import_processor.rb +94 -0
- data/lib/etl/processor/check_exist_processor.rb +80 -0
- data/lib/etl/processor/check_unique_processor.rb +39 -0
- data/lib/etl/processor/copy_field_processor.rb +26 -0
- data/lib/etl/processor/database_join_processor.rb +82 -0
- data/lib/etl/processor/encode_processor.rb +55 -0
- data/lib/etl/processor/ensure_fields_presence_processor.rb +24 -0
- data/lib/etl/processor/escape_csv_processor.rb +77 -0
- data/lib/etl/processor/filter_row_processor.rb +51 -0
- data/lib/etl/processor/ftp_downloader_processor.rb +68 -0
- data/lib/etl/processor/ftp_uploader_processor.rb +65 -0
- data/lib/etl/processor/hierarchy_exploder_processor.rb +55 -0
- data/lib/etl/processor/imapattachment_downloader_processor.rb +91 -0
- data/lib/etl/processor/pop3attachment_downloader_processor.rb +90 -0
- data/lib/etl/processor/print_row_processor.rb +12 -0
- data/lib/etl/processor/processor.rb +25 -0
- data/lib/etl/processor/rename_processor.rb +24 -0
- data/lib/etl/processor/require_non_blank_processor.rb +26 -0
- data/lib/etl/processor/row_processor.rb +27 -0
- data/lib/etl/processor/sequence_processor.rb +23 -0
- data/lib/etl/processor/sftp_downloader_processor.rb +63 -0
- data/lib/etl/processor/sftp_uploader_processor.rb +63 -0
- data/lib/etl/processor/surrogate_key_processor.rb +53 -0
- data/lib/etl/processor/truncate_processor.rb +40 -0
- data/lib/etl/processor/zip_file_processor.rb +27 -0
- data/lib/etl/row.rb +20 -0
- data/lib/etl/screen.rb +14 -0
- data/lib/etl/screen/row_count_screen.rb +20 -0
- data/lib/etl/transform.rb +2 -0
- data/lib/etl/transform/block_transform.rb +13 -0
- data/lib/etl/transform/calculation_transform.rb +71 -0
- data/lib/etl/transform/date_to_string_transform.rb +20 -0
- data/lib/etl/transform/decode_transform.rb +51 -0
- data/lib/etl/transform/default_transform.rb +20 -0
- data/lib/etl/transform/foreign_key_lookup_transform.rb +211 -0
- data/lib/etl/transform/hierarchy_lookup_transform.rb +49 -0
- data/lib/etl/transform/md5_transform.rb +13 -0
- data/lib/etl/transform/ordinalize_transform.rb +14 -0
- data/lib/etl/transform/sha1_transform.rb +13 -0
- data/lib/etl/transform/split_fields_transform.rb +27 -0
- data/lib/etl/transform/string_to_date_time_transform.rb +14 -0
- data/lib/etl/transform/string_to_date_transform.rb +16 -0
- data/lib/etl/transform/string_to_time_transform.rb +11 -0
- data/lib/etl/transform/transform.rb +61 -0
- data/lib/etl/transform/trim_transform.rb +26 -0
- data/lib/etl/transform/type_transform.rb +35 -0
- data/lib/etl/util.rb +59 -0
- data/lib/etl/version.rb +3 -0
- data/test-matrix.yml +10 -0
- data/test/.gitignore +1 -0
- data/test/.ignore +2 -0
- data/test/all.ebf +6 -0
- data/test/apache_combined_log.ctl +11 -0
- data/test/batch_test.rb +41 -0
- data/test/batch_with_error.ebf +6 -0
- data/test/batched1.ctl +0 -0
- data/test/batched2.ctl +0 -0
- data/test/block_processor.ctl +6 -0
- data/test/block_processor_error.ctl +1 -0
- data/test/block_processor_pre_post_process.ctl +4 -0
- data/test/block_processor_remove_rows.ctl +5 -0
- data/test/block_processor_test.rb +38 -0
- data/test/check_exist_processor_test.rb +92 -0
- data/test/check_unique_processor_test.rb +40 -0
- data/test/config/Gemfile.rails-2.3.x +3 -0
- data/test/config/Gemfile.rails-2.3.x.lock +53 -0
- data/test/config/Gemfile.rails-3.0.x +3 -0
- data/test/config/Gemfile.rails-3.0.x.lock +61 -0
- data/test/config/common.rb +29 -0
- data/test/connection/mysql/connection.rb +9 -0
- data/test/connection/mysql/schema.sql +37 -0
- data/test/connection/postgresql/connection.rb +13 -0
- data/test/connection/postgresql/schema.sql +40 -0
- data/test/control_test.rb +43 -0
- data/test/data/apache_combined_log.txt +3 -0
- data/test/data/bulk_import.txt +3 -0
- data/test/data/bulk_import_with_empties.txt +3 -0
- data/test/data/decode.txt +3 -0
- data/test/data/delimited.txt +3 -0
- data/test/data/encode_source_latin1.txt +2 -0
- data/test/data/excel.xls +0 -0
- data/test/data/excel2.xls +0 -0
- data/test/data/fixed_width.txt +3 -0
- data/test/data/multiple_delimited_1.txt +3 -0
- data/test/data/multiple_delimited_2.txt +3 -0
- data/test/data/nokogiri.xml +38 -0
- data/test/data/people.txt +3 -0
- data/test/data/sax.xml +14 -0
- data/test/data/xml.xml +16 -0
- data/test/database_join_processor_test.rb +43 -0
- data/test/date_dimension_builder_test.rb +96 -0
- data/test/delimited.ctl +30 -0
- data/test/delimited_absolute.ctl +31 -0
- data/test/delimited_destination_db.ctl +23 -0
- data/test/delimited_excel.ctl +31 -0
- data/test/delimited_insert_update.ctl +34 -0
- data/test/delimited_update.ctl +34 -0
- data/test/delimited_with_bulk_load.ctl +34 -0
- data/test/destination_test.rb +275 -0
- data/test/directive_test.rb +23 -0
- data/test/encode_processor_test.rb +32 -0
- data/test/engine_test.rb +78 -0
- data/test/ensure_fields_presence_processor_test.rb +28 -0
- data/test/errors.ctl +24 -0
- data/test/etl_test.rb +42 -0
- data/test/excel.ctl +24 -0
- data/test/excel2.ctl +25 -0
- data/test/fixed_width.ctl +35 -0
- data/test/foreign_key_lookup_transform_test.rb +50 -0
- data/test/generator_test.rb +14 -0
- data/test/inline_parser.ctl +17 -0
- data/test/mocks/mock_destination.rb +26 -0
- data/test/mocks/mock_source.rb +25 -0
- data/test/model_source.ctl +14 -0
- data/test/multiple_delimited.ctl +22 -0
- data/test/multiple_source_delimited.ctl +39 -0
- data/test/nokogiri_all.ctl +35 -0
- data/test/nokogiri_select.ctl +35 -0
- data/test/nokogiri_test.rb +35 -0
- data/test/parser_test.rb +224 -0
- data/test/performance/delimited.ctl +30 -0
- data/test/processor_test.rb +44 -0
- data/test/row_processor_test.rb +17 -0
- data/test/sax.ctl +26 -0
- data/test/scd/1.txt +1 -0
- data/test/scd/2.txt +1 -0
- data/test/scd/3.txt +1 -0
- data/test/scd_test.rb +257 -0
- data/test/scd_test_type_1.ctl +43 -0
- data/test/scd_test_type_2.ctl +34 -0
- data/test/screen_test.rb +9 -0
- data/test/screen_test_error.ctl +3 -0
- data/test/screen_test_fatal.ctl +3 -0
- data/test/source_test.rb +154 -0
- data/test/test_helper.rb +37 -0
- data/test/transform_test.rb +101 -0
- data/test/truncate_processor_test.rb +37 -0
- data/test/xml.ctl +31 -0
- metadata +370 -0
data/lib/etl/control/source/file_source.rb
ADDED
@@ -0,0 +1,90 @@
module ETL #:nodoc:
  module Control #:nodoc:
    # A File source.
    class FileSource < Source
      # The number of lines to skip, default is 0
      attr_accessor :skip_lines

      # Accessor for the underlying parser
      attr_accessor :parser

      # The source file
      attr_accessor :file

      # Initialize the source
      #
      # Configuration options:
      # * <tt>:file</tt>: The source file
      # * <tt>:parser</tt>: One of the following: a parser name as a String or
      #   symbol, a class which extends from Parser, a Hash with :name and
      #   optionally an :options key. Whether or not the parser uses the
      #   options is dependent on which parser is used. See the documentation
      #   for each parser for information on what options it accepts.
      # * <tt>:skip_lines</tt>: The number of lines to skip (defaults to 0)
      # * <tt>:store_locally</tt>: Set to false to not store a copy of the
      #   source data locally for archival
      def initialize(control, configuration, definition)
        super
        configure
      end

      # Get a String identifier for the source
      def to_s
        file
      end

      # Get the local storage directory
      def local_directory
        File.join(local_base, File.basename(file, File.extname(file)))
      end

      # Returns each row from the source
      def each
        count = 0
        copy_sources if @store_locally
        @parser.each do |row|
          if ETL::Engine.offset && count < ETL::Engine.offset
            count += 1
          else
            row = ETL::Row[row]
            row.source = self
            yield row
          end
        end
      end

      private
      # Copy source data to a local directory structure
      def copy_sources
        sequence = 0
        path = Pathname.new(file)
        path = path.absolute? ? path : Pathname.new(File.dirname(control.file)) + path
        Pathname.glob(path).each do |f|
          next if f.directory?
          lf = local_file(sequence)
          FileUtils.cp(f, lf)
          File.open(local_file_trigger(lf), 'w') {|f| }
          sequence += 1
        end
      end

      # Configure the source
      def configure
        @file = configuration[:file]
        case configuration[:parser]
        when Class
          @parser = configuration[:parser].new(self)
        when String, Symbol
          @parser = ETL::Parser::Parser.class_for_name(configuration[:parser]).new(self)
        when Hash
          name = configuration[:parser][:name]
          options = configuration[:parser][:options]
          @parser = ETL::Parser::Parser.class_for_name(name).new(self, options)
        else
          raise ControlError, "Configuration option :parser must be a Class, String or Symbol"
        end
        @skip_lines = configuration[:skip_lines] ||= 0
      end
    end
  end
end
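In a control file, this source is normally wired up through the source directive, whose configuration hash is what FileSource#configure reads above. The sketch below is illustrative only: the file name, parser choice and field layout are assumptions, not taken from this gem's own control files.

# Hypothetical control (.ctl) snippet declaring a delimited file source.
# :file, :parser and :skip_lines map to the options documented in FileSource.
source :in, {
  :file       => 'people.txt',  # resolved relative to the control file
  :parser     => :csv,          # a String/Symbol name, a Parser class, or a Hash
  :skip_lines => 1              # skip the header row
},
[
  :first_name,
  :last_name,
  :email
]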
data/lib/etl/control/source/model_source.rb
ADDED
@@ -0,0 +1,39 @@
#RAILS_ENV = 'development'
#require '../config/environment'

module ETL #:nodoc:
  module Control #:nodoc:
    class ModelSource < Source

      def columns
        case definition
        when Array
          definition.collect(&:to_sym)
        when Hash
          definition.keys.collect(&:to_sym)
        else
          raise "Definition must be either an Array or a Hash"
        end
      end

      def railsmodel
        configuration[:model]
      end

      def order
        configuration[:order] || "id"
      end

      def each(&block)
        railsmodel.to_s.camelize.constantize.find(:all, :order => order).each do |row|
          result_row = ETL::Row.new
          result_row.source = self
          columns.each do |column|
            result_row[column.to_sym] = row.send(column)
          end
          yield result_row
        end
      end
    end
  end
end
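A model source reads rows straight from an ActiveRecord model rather than a file. A minimal sketch of the configuration it expects follows; the model and column names are made up for illustration:

# Hypothetical control snippet pulling rows from a Rails model.
# :model is camelized and constantized by ModelSource#railsmodel;
# :order defaults to "id" when omitted.
source :people, {
  :model => :person,
  :order => 'last_name'
},
[
  :first_name,
  :last_name
]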
data/lib/etl/core_ext.rb
ADDED
@@ -0,0 +1 @@
require 'etl/core_ext/time'
data/lib/etl/core_ext/time/calculations.rb
ADDED
@@ -0,0 +1,42 @@
#Updated by Jack Hong on 04/05/08

module ETL #:nodoc:
  module CoreExtensions #:nodoc:
    module Time #:nodoc:
      # Enables the use of time calculations within Time itself
      module Calculations
        def week
          cyw = ((yday - 1) / 7) + 1
          cyw = 52 if cyw == 53
          cyw
        end
        def quarter
          ((month - 1) / 3) + 1
        end
        def fiscal_year_week(offset_month=10)
          fyw = ((fiscal_year_yday(offset_month) - 1) / 7) + 1
          fyw = 52 if fyw == 53
          fyw
        end
        def fiscal_year_month(offset_month=10)
          shifted_month = month - (offset_month - 1)
          shifted_month += 12 if shifted_month <= 0
          shifted_month
        end
        def fiscal_year_quarter(offset_month=10)
          ((fiscal_year_month(offset_month) - 1) / 3) + 1
        end
        def fiscal_year(offset_month=10)
          month >= offset_month ? year + 1 : year
        end
        def fiscal_year_yday(offset_month=10)
          offset_days = 0
          1.upto(offset_month - 1) { |m| offset_days += ::Time.days_in_month(m, year) }
          shifted_year_day = yday - offset_days
          shifted_year_day += 365 if shifted_year_day <= 0
          shifted_year_day
        end
      end
    end
  end
end
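To make the fiscal-date arithmetic concrete, here is a small worked sketch using the default October fiscal-year start. It assumes the Calculations module is mixed into Time, which etl/core_ext/time presumably arranges:

# Assumes: Time includes ETL::CoreExtensions::Time::Calculations.
t = Time.local(2011, 11, 15)   # 15 Nov 2011

t.quarter              # => 4    ((11 - 1) / 3) + 1
t.fiscal_year          # => 2012 month 11 >= offset_month 10, so year + 1
t.fiscal_year_month    # => 2    November is the 2nd month of an Oct-Sep fiscal year
t.fiscal_year_quarter  # => 1    ((2 - 1) / 3) + 1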
data/lib/etl/engine.rb
ADDED
@@ -0,0 +1,582 @@
module ETL #:nodoc:

  class Base < ActiveRecord::Base
  end

  # The main ETL engine clas
  class Engine
    include ETL::Util

    class << self
      # Initialization that is run when a job is executed.
      #
      # Options:
      # * <tt>:limit</tt>: Limit the number of records returned from sources
      # * <tt>:offset</tt>: Specify the records for data from sources
      # * <tt>:log_write_mode</tt>: If true then the log will write, otherwise it will append
      # * <tt>:skip_bulk_import</tt>: Set to true to skip bulk import
      # * <tt>:read_locally</tt>: Set to true to read from the local cache
      # * <tt>:rails_root</tt>: Set to the rails root to boot rails
      def init(options={})
        unless @initialized
          puts "initializing ETL engine\n\n"
          @limit = options[:limit]
          @offset = options[:offset]
          @log_write_mode = 'w' if options[:newlog]
          @skip_bulk_import = options[:skip_bulk_import]
          @read_locally = options[:read_locally]
          @rails_root = options[:rails_root]
          @log_dir = options[:log_dir]

          require File.join(@rails_root, 'config/environment') if @rails_root
          options[:config] ||= 'database.yml'
          options[:config] = 'config/database.yml' unless File.exist?(options[:config])
          database_configuration = YAML::load(ERB.new(IO.read(options[:config])).result + "\n")
          ActiveRecord::Base.configurations.merge!(database_configuration)
          ETL::Base.configurations = HashWithIndifferentAccess.new(database_configuration)
          #puts "configurations in init: #{ActiveRecord::Base.configurations.inspect}"

          require 'etl/execution'
          ETL::Execution::Base.establish_connection(options[:execution_conf] || :etl_execution)
          ETL::Execution::Execution.migrate

          @initialized = true
        end
      end

      # Process the specified file. Acceptable values for file are:
      # * Path to a file
      # * File object
      # * ETL::Control::Control instance
      # * ETL::Batch::Batch instance
      #
      # The process command will accept either a .ctl Control file or a .ebf
      # ETL Batch File.
      def process(file)
        new().process(file)
      end

      attr_accessor :timestamped_log

      # Accessor for the log write mode. Default is 'a' for append.
      attr_accessor :log_write_mode
      def log_write_mode
        @log_write_mode ||= 'a'
      end

      # A logger for the engine
      attr_accessor :logger

      def logger #:nodoc:
        unless @logger
          if timestamped_log
            logfile = File.join(*[@log_dir, "etl_#{timestamp}.log"].compact)

            @logger = Logger.new(logfile)
          else
            logfile = File.join(*[@log_dir, '/etl.log'].compact)

            @logger = Logger.new(File.open(logfile, log_write_mode))
          end
          @logger.level = Logger::WARN
          @logger.formatter = Logger::Formatter.new
        end
        @logger
      end

      # Get a timestamp value as a string
      def timestamp
        Time.now.strftime("%Y%m%d%H%M%S")
      end

      # The current source
      attr_accessor :current_source

      # The current source row
      attr_accessor :current_source_row

      # The current destination
      attr_accessor :current_destination

      # Set to true to activate realtime activity. This will cause certain
      # information messages to be printed to STDOUT
      attr_accessor :realtime_activity

      # Accessor for the total number of rows read from sources
      attr_accessor :rows_read
      def rows_read
        @rows_read ||= 0
      end

      # Accessor for the total number of rows processed
      attr_accessor :rows_written
      def rows_written
        @rows_written ||= 0
      end

      # Access the current ETL::Execution::Job instance
      attr_accessor :job

      # Access the current ETL::Execution::Batch instance
      attr_accessor :batch

      # The limit on rows to load from the source, useful for testing the ETL
      # process prior to executing the entire batch. Default value is nil and
      # indicates that there is no limit
      attr_accessor :limit

      # The offset for the source to begin at, useful for testing the ETL
      # process prior to executing the entire batch. Default value is nil and
      # indicates that there is no offset
      attr_accessor :offset

      # Set to true to skip all bulk importing
      attr_accessor :skip_bulk_import

      # Set to true to read locally from the last source cache files
      attr_accessor :read_locally

      # Accessor for the average rows per second processed
      attr_accessor :average_rows_per_second

      # Get a named connection
      def connection(name)
        logger.debug "Retrieving connection #{name}"
        conn = connections[name] ||= establish_connection(name)
        #conn.verify!(ActiveRecord::Base.verification_timeout)
        conn.reconnect! unless conn.active?
        conn
      end

      # Set to true to use temp tables
      attr_accessor :use_temp_tables

      # Get a registry of temp tables
      def temp_tables
        @temp_tables ||= {}
      end

      # Called when a batch job finishes, allowing for cleanup to occur
      def finish
        temp_tables.each do |temp_table, mapping|
          actual_table = mapping[:table]
          #puts "move #{temp_table} to #{actual_table}"
          conn = mapping[:connection]
          conn.transaction do
            conn.rename_table(actual_table, "#{actual_table}_old")
            conn.rename_table(temp_table, actual_table)
            conn.drop_table("#{actual_table}_old")
          end
        end
      end

      # Return true if using temp tables
      def use_temp_tables?
        use_temp_tables ? true : false
      end

      # Modify the table name if necessary
      def table(table_name, connection)
        if use_temp_tables?
          temp_table_name = "tmp_#{table_name}"

          if temp_tables[temp_table_name].nil?
            # Create the temp table and add it to the mapping
            begin connection.drop_table(temp_table_name); rescue; end
            connection.copy_table(table_name, temp_table_name)
            temp_tables[temp_table_name] = {
              :table => table_name,
              :connection => connection
            }
          end

          temp_table_name
        else
          table_name
        end
      end

      protected
      # Hash of database connections that can be used throughout the ETL
      # process
      def connections
        @connections ||= {}
      end

      # Establish the named connection and return the database specific connection
      def establish_connection(name)
        raise ETL::ETLError, "Connection with no name requested. Is there a missing :target parameter somewhere?" if name.blank?

        logger.debug "Establishing connection to #{name}"
        conn_config = ETL::Base.configurations[name.to_s]
        raise ETL::ETLError, "Cannot find connection named #{name.inspect}" unless conn_config
        connection_method = "#{conn_config['adapter']}_connection"
        ETL::Base.send(connection_method, conn_config)
      end
    end # class << self

    # Say the specified message, with a newline
    def say(message)
      say_without_newline(message + "\n")
    end

    # Say the specified message without a newline
    def say_without_newline(message)
      if ETL::Engine.realtime_activity
        $stdout.print message
        $stdout.flush
      end
    end

    # Say the message on its own line
    def say_on_own_line(message)
      say("\n" + message)
    end

    # Array of errors encountered during execution of the ETL process
    def errors
      @errors ||= []
    end

    # Get a Hash of benchmark values where each value represents the total
    # amount of time in seconds spent processing in that portion of the ETL
    # pipeline. Keys include:
    # * <tt>:transforms</tt>
    # * <tt>:after_reads</tt>
    # * <tt>:before_writes</tt>
    # * <tt>:writes</tt>
    def benchmarks
      @benchmarks ||= {
        :transforms => 0,
        :after_reads => 0,
        :before_writes => 0,
        :writes => 0,
      }
    end

    # Process a file, control object or batch object. Acceptable values for
    # file are:
    # * Path to a file
    # * File object
    # * ETL::Control::Control instance
    # * ETL::Batch::Batch instance
    def process(file)
      case file
      when String
        process(File.new(file))
      when File
        case file.path
        when /.ctl/ then process_control(file)
        when /.etl/ then process_control(file)
        when /.ebf/ then process_batch(file)
        else
          raise RuntimeError, "Unsupported file type - #{file.path}"
        end
      when ETL::Control::Control
        process_control(file)
      when ETL::Batch::Batch
        process_batch(file)
      else
        raise RuntimeError, "Process object must be a String, File, Control
        instance or Batch instance"
      end
    end

    protected
    # Process the specified batch file
    def process_batch(batch)
      batch = ETL::Batch::Batch.resolve(batch, self)
      say "Processing batch #{batch.file}"

      ETL::Engine.batch = ETL::Execution::Batch.create!(
        :batch_file => batch.file,
        :status => 'executing'
      )

      batch.execute

      ETL::Engine.batch.completed_at = Time.now
      ETL::Engine.batch.status = (errors.length > 0 ? 'completed with errors' : 'completed')
      ETL::Engine.batch.save!
    end

    # Process the specified control file
    def process_control(control)
      control = ETL::Control::Control.resolve(control)
      say_on_own_line "Processing control #{control.file}"

      ETL::Engine.job = ETL::Execution::Job.create!(
        :control_file => control.file,
        :status => 'executing',
        :batch_id => ETL::Engine.batch ? ETL::Engine.batch.id : nil
      )

      execute_dependencies(control)

      start_time = Time.now
      pre_process(control)
      sources = control.sources
      destinations = control.destinations

      say "Skipping bulk import" if Engine.skip_bulk_import

      sources.each do |source|
        Engine.current_source = source
        Engine.logger.debug "Processing source #{source.inspect}"
        say "Source: #{source}"
        say "Limiting enabled: #{Engine.limit}" if Engine.limit != nil
        say "Offset enabled: #{Engine.offset}" if Engine.offset != nil
        source.each_with_index do |row, index|
          # Break out of the row loop if the +Engine.limit+ is specified and
          # the number of rows read exceeds that value.
          if Engine.limit != nil && Engine.rows_read >= Engine.limit
            puts "Reached limit of #{Engine.limit}"
            break
          end

          Engine.logger.debug "Row #{index}: #{row.inspect}"
          Engine.rows_read += 1
          Engine.current_source_row = index + 1
          say_without_newline "." if Engine.realtime_activity && index > 0 && index % 1000 == 0

          # At this point a single row may be turned into multiple rows via row
          # processors all code after this line should work with the array of
          # rows rather than the single row
          rows = [row]

          t = Benchmark.realtime do
            begin
              Engine.logger.debug "Processing after read"
              control.after_read_processors.each do |processor|
                processed_rows = []
                rows.each do |row|
                  processed_rows << processor.process(row) unless empty_row?(row)
                end
                rows = processed_rows.flatten.compact
              end
            rescue => e
              msg = "Error processing rows after read from #{Engine.current_source} on line #{Engine.current_source_row}: #{e}"
              errors << msg
              Engine.logger.error(msg)
              e.backtrace.each { |line| Engine.logger.error(line) }
              exceeded_error_threshold?(control) ? break : next
            end
          end
          benchmarks[:after_reads] += t unless t.nil?

          t = Benchmark.realtime do
            begin
              Engine.logger.debug "Executing transforms"
              rows.each do |row|
                # only do the transform if there is a row
                unless empty_row?(row)
                  control.transforms.each do |transform|
                    name = transform.name.to_sym
                    row[name] = transform.transform(name, row[name], row)
                  end
                end
              end
            rescue ResolverError => e
              Engine.logger.error(e.message)
              errors << e.message
            rescue => e
              msg = "Error transforming from #{Engine.current_source} on line #{Engine.current_source_row}: #{e}"
              errors << msg
              Engine.logger.error(msg)
              e.backtrace.each { |line| Engine.logger.error(line) }
            ensure
              begin
                exceeded_error_threshold?(control) ? break : next
              rescue => inner_error
                puts inner_error
              end
            end
          end
          benchmarks[:transforms] += t unless t.nil?

          t = Benchmark.realtime do
            begin
              # execute row-level "before write" processing
              Engine.logger.debug "Processing before write"
              control.before_write_processors.each do |processor|
                processed_rows = []
                rows.each do |row|
                  processed_rows << processor.process(row) unless empty_row?(row)
                end
                rows = processed_rows.flatten.compact
              end
            rescue => e
              msg = "Error processing rows before write from #{Engine.current_source} on line #{Engine.current_source_row}: #{e}"
              errors << msg
              Engine.logger.error(msg)
              e.backtrace.each { |line| Engine.logger.error(line) }
              exceeded_error_threshold?(control) ? break : next
            end
          end
          benchmarks[:before_writes] += t unless t.nil?

          t = Benchmark.realtime do
            begin
              # write the row to the destination
              destinations.each_with_index do |destination, index|
                Engine.current_destination = destination
                rows.each do |row|
                  destination.write(row)
                  Engine.rows_written += 1 if index == 0
                end
              end
            rescue => e
              msg = "Error writing to #{Engine.current_destination}: #{e}"
              errors << msg
              Engine.logger.error msg
              e.backtrace.each { |line| Engine.logger.error(line) }
              exceeded_error_threshold?(control) ? break : next
            end
          end
          benchmarks[:writes] += t unless t.nil?
        end

        if exceeded_error_threshold?(control)
          say_on_own_line "Exiting due to exceeding error threshold: #{control.error_threshold}"
          return
        end

      end

      destinations.each do |destination|
        destination.close
      end

      say_on_own_line "Executing before post-process screens"
      begin
        execute_screens(control)
      rescue FatalScreenError => e
        say "Fatal screen error during job execution: #{e.message}"
        exit
      rescue ScreenError => e
        say "Screen error during job execution: #{e.message}"
        return
      else
        say "Screens passed"
      end

      post_process(control)

      if sources.length > 0
        say_on_own_line "Read #{Engine.rows_read} lines from sources"
      end
      if destinations.length > 0
        say "Wrote #{Engine.rows_written} lines to destinations"
      end

      say_on_own_line "Executing after post-process screens"
      begin
        execute_screens(control, :after_post_process)
      rescue FatalScreenError => e
        say "Fatal screen error during job execution: #{e.message}"
        exit
      rescue ScreenError => e
        say "Screen error during job execution: #{e.message}"
        return
      else
        say "Screens passed"
      end

      say_on_own_line "Completed #{control.file} in #{distance_of_time_in_words(start_time)} with #{errors.length} errors."
      say "Processing average: #{Engine.average_rows_per_second} rows/sec)"

      say "Avg after_reads: #{Engine.rows_read/benchmarks[:after_reads]} rows/sec" if benchmarks[:after_reads] > 0
      say "Avg before_writes: #{Engine.rows_read/benchmarks[:before_writes]} rows/sec" if benchmarks[:before_writes] > 0
      say "Avg transforms: #{Engine.rows_read/benchmarks[:transforms]} rows/sec" if benchmarks[:transforms] > 0
      say "Avg writes: #{Engine.rows_read/benchmarks[:writes]} rows/sec" if benchmarks[:writes] > 0

      # say "Avg time writing execution records: #{ETL::Execution::Record.average_time_spent}"
      #
      # ETL::Transform::Transform.benchmarks.each do |klass, t|
      #   say "Avg #{klass}: #{Engine.rows_read/t} rows/sec"
      # end

      ActiveRecord::Base.verify_active_connections!
      ETL::Engine.job.completed_at = Time.now
      ETL::Engine.job.status = (errors.length > 0 ? 'completed with errors' : 'completed')
      ETL::Engine.job.save!
    end

    def empty_row?(row)
      # unsure about why it should respond to :[] - keeping it just in case for the moment
      row.nil? || !row.respond_to?(:[])
    end

    private
    # Return true if the error threshold is exceeded
    def exceeded_error_threshold?(control)
      errors.length > control.error_threshold
    end

    # Execute all preprocessors
    def pre_process(control)
      Engine.logger.debug "Pre-processing #{control.file}"
      control.pre_processors.each do |processor|
        processor.process
      end
      Engine.logger.debug "Pre-processing complete"
    end

    # Execute all postprocessors
    def post_process(control)
      say_on_own_line "Executing post processes"
      Engine.logger.debug "Post-processing #{control.file}"
      control.post_processors.each do |processor|
        processor.process
      end
      Engine.logger.debug "Post-processing complete"
      say "Post-processing complete"
    end

    # Execute all dependencies
    def execute_dependencies(control)
      Engine.logger.debug "Executing dependencies"
      control.dependencies.flatten.each do |dependency|
        case dependency
        when Symbol
          f = dependency.to_s + '.ctl'
          Engine.logger.debug "Executing dependency: #{f}"
          say "Executing dependency: #{f}"
          process(f)
        when String
          Engine.logger.debug "Executing dependency: #{f}"
          say "Executing dependency: #{f}"
          process(dependency)
        else
          raise "Invalid dependency type: #{dependency.class}"
        end
      end
    end

    # Execute all screens
    def execute_screens(control, timing = :before_post_process)
      screens = case timing
      when :after_post_process
        control.after_post_process_screens
      else # default to before post-process screens
        control.screens
      end
      [:fatal,:error,:warn].each do |type|
        screens[type].each do |block|
          begin
            block.call
          rescue => e
            case type
            when :fatal
              raise FatalScreenError, e
            when :error
              raise ScreenError, e
            when :warn
              say "Screen warning: #{e}"
            end
          end
        end
      end
    end
  end
end
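Putting the engine's public surface together, a standalone run (outside the bundled bin/etl command) would look roughly like the sketch below. The configuration file and control file name are placeholders; init expects a database.yml (or config/database.yml) that also defines the etl_execution connection used for job bookkeeping.

require 'etl'

# Boot the engine: loads the database configuration, connects the
# etl_execution bookkeeping database and runs its migrations.
ETL::Engine.init(:config => 'database.yml', :limit => 100)

# Echo progress to STDOUT while the job runs.
ETL::Engine.realtime_activity = true

# Process a single .ctl control file; a .ebf batch file works the same way.
ETL::Engine.process('my_dimension.ctl')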