factorylabs-activewarehouse-etl 0.9.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +198 -0
- data/LICENSE +7 -0
- data/README +85 -0
- data/Rakefile +153 -0
- data/TODO +28 -0
- data/bin/etl +28 -0
- data/bin/etl.cmd +8 -0
- data/examples/database.example.yml +16 -0
- data/lib/etl.rb +78 -0
- data/lib/etl/batch.rb +2 -0
- data/lib/etl/batch/batch.rb +111 -0
- data/lib/etl/batch/directives.rb +55 -0
- data/lib/etl/builder.rb +2 -0
- data/lib/etl/builder/date_dimension_builder.rb +96 -0
- data/lib/etl/builder/time_dimension_builder.rb +31 -0
- data/lib/etl/commands/etl.rb +89 -0
- data/lib/etl/control.rb +3 -0
- data/lib/etl/control/control.rb +405 -0
- data/lib/etl/control/destination.rb +420 -0
- data/lib/etl/control/destination/database_destination.rb +95 -0
- data/lib/etl/control/destination/file_destination.rb +124 -0
- data/lib/etl/control/source.rb +109 -0
- data/lib/etl/control/source/database_source.rb +220 -0
- data/lib/etl/control/source/enumerable_source.rb +11 -0
- data/lib/etl/control/source/file_source.rb +90 -0
- data/lib/etl/control/source/model_source.rb +39 -0
- data/lib/etl/core_ext.rb +1 -0
- data/lib/etl/core_ext/time.rb +5 -0
- data/lib/etl/core_ext/time/calculations.rb +42 -0
- data/lib/etl/engine.rb +556 -0
- data/lib/etl/execution.rb +20 -0
- data/lib/etl/execution/base.rb +9 -0
- data/lib/etl/execution/batch.rb +8 -0
- data/lib/etl/execution/job.rb +8 -0
- data/lib/etl/execution/migration.rb +85 -0
- data/lib/etl/execution/record.rb +18 -0
- data/lib/etl/generator.rb +2 -0
- data/lib/etl/generator/generator.rb +20 -0
- data/lib/etl/generator/surrogate_key_generator.rb +39 -0
- data/lib/etl/http_tools.rb +139 -0
- data/lib/etl/parser.rb +11 -0
- data/lib/etl/parser/apache_combined_log_parser.rb +49 -0
- data/lib/etl/parser/delimited_parser.rb +74 -0
- data/lib/etl/parser/fixed_width_parser.rb +65 -0
- data/lib/etl/parser/parser.rb +41 -0
- data/lib/etl/parser/sax_parser.rb +218 -0
- data/lib/etl/parser/xml_parser.rb +65 -0
- data/lib/etl/processor.rb +11 -0
- data/lib/etl/processor/block_processor.rb +14 -0
- data/lib/etl/processor/bulk_import_processor.rb +81 -0
- data/lib/etl/processor/check_exist_processor.rb +80 -0
- data/lib/etl/processor/check_unique_processor.rb +35 -0
- data/lib/etl/processor/copy_field_processor.rb +26 -0
- data/lib/etl/processor/encode_processor.rb +55 -0
- data/lib/etl/processor/hierarchy_exploder_processor.rb +55 -0
- data/lib/etl/processor/print_row_processor.rb +12 -0
- data/lib/etl/processor/processor.rb +25 -0
- data/lib/etl/processor/rename_processor.rb +24 -0
- data/lib/etl/processor/require_non_blank_processor.rb +26 -0
- data/lib/etl/processor/row_processor.rb +17 -0
- data/lib/etl/processor/sequence_processor.rb +23 -0
- data/lib/etl/processor/surrogate_key_processor.rb +53 -0
- data/lib/etl/processor/truncate_processor.rb +35 -0
- data/lib/etl/row.rb +20 -0
- data/lib/etl/screen.rb +14 -0
- data/lib/etl/screen/row_count_screen.rb +20 -0
- data/lib/etl/transform.rb +2 -0
- data/lib/etl/transform/block_transform.rb +13 -0
- data/lib/etl/transform/date_to_string_transform.rb +20 -0
- data/lib/etl/transform/decode_transform.rb +51 -0
- data/lib/etl/transform/default_transform.rb +20 -0
- data/lib/etl/transform/foreign_key_lookup_transform.rb +151 -0
- data/lib/etl/transform/hierarchy_lookup_transform.rb +49 -0
- data/lib/etl/transform/ordinalize_transform.rb +12 -0
- data/lib/etl/transform/sha1_transform.rb +13 -0
- data/lib/etl/transform/string_to_date_transform.rb +16 -0
- data/lib/etl/transform/string_to_datetime_transform.rb +14 -0
- data/lib/etl/transform/string_to_time_transform.rb +11 -0
- data/lib/etl/transform/transform.rb +61 -0
- data/lib/etl/transform/trim_transform.rb +26 -0
- data/lib/etl/transform/type_transform.rb +35 -0
- data/lib/etl/util.rb +59 -0
- data/lib/etl/version.rb +9 -0
- metadata +195 -0
@@ -0,0 +1,90 @@
|
|
1
|
+
module ETL #:nodoc:
  module Control #:nodoc:
    # A File source.
    class FileSource < Source
      # The number of lines to skip, default is 0
      attr_accessor :skip_lines

      # Accessor for the underlying parser
      attr_accessor :parser

      # The source file
      attr_accessor :file

      # Initialize the source
      #
      # Configuration options:
      # * <tt>:file</tt>: The source file
      # * <tt>:parser</tt>: One of the following: a parser name as a String or
      #   symbol, a class which extends from Parser, a Hash with :name and
      #   optionally an :options key. Whether or not the parser uses the
      #   options is dependent on which parser is used. See the documentation
      #   for each parser for information on what options it accepts.
      # * <tt>:skip_lines</tt>: The number of lines to skip (defaults to 0)
      # * <tt>:store_locally</tt>: Set to false to not store a copy of the
      #   source data locally for archival
      def initialize(control, configuration, definition)
        super
        configure
      end

      # Get a String identifier for the source
      def to_s
        file
      end

      # Get the local storage directory
      def local_directory
        File.join(local_base, File.basename(file, File.extname(file)))
      end

      # Returns each row from the source. Rows before ETL::Engine.offset
      # (when set) are counted but not yielded.
      def each
        count = 0
        copy_sources if store_locally
        @parser.each do |row|
          if ETL::Engine.offset && count < ETL::Engine.offset
            count += 1
          else
            row = ETL::Row[row]
            row.source = self
            yield row
          end
        end
      end

      private
      # Copy source data to a local directory structure. A relative :file
      # glob is resolved against the directory of the control file. An empty
      # trigger file is written next to each copy to signal completion.
      def copy_sources
        sequence = 0
        path = Pathname.new(file)
        path = path.absolute? ? path : Pathname.new(File.dirname(control.file)) + path
        Pathname.glob(path).each do |f|
          next if f.directory?
          lf = local_file(sequence)
          FileUtils.cp(f, lf)
          # NOTE: block parameter removed — it shadowed the outer glob
          # variable +f+; the block body is intentionally empty (touch).
          File.open(local_file_trigger(lf), 'w') { }
          sequence += 1
        end
      end

      # Configure the source from the configuration Hash. Raises a
      # ControlError when :parser is not one of the supported forms.
      def configure
        @file = configuration[:file]
        case configuration[:parser]
        when Class
          @parser = configuration[:parser].new(self)
        when String, Symbol
          @parser = ETL::Parser::Parser.class_for_name(configuration[:parser]).new(self)
        when Hash
          name = configuration[:parser][:name]
          options = configuration[:parser][:options]
          @parser = ETL::Parser::Parser.class_for_name(name).new(self, options)
        else
          # Fixed message: a Hash is also accepted (handled above) but was
          # missing from the original error text.
          raise ControlError, "Configuration option :parser must be a Class, String, Symbol or Hash"
        end
        @skip_lines = configuration[:skip_lines] ||= 0
      end
    end
  end
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
#RAILS_ENV = 'development'
#require '../config/environment'

module ETL #:nodoc:
  module Control #:nodoc:
    # A source which iterates over every record of an ActiveRecord model.
    class ModelSource < Source

      # Returns the column names, as Symbols, defined for this source.
      #
      # The source definition must be either an Array of column names or a
      # Hash whose keys are the column names; anything else raises a
      # RuntimeError.
      def columns
        case definition
        when Array
          definition.collect(&:to_sym)
        when Hash
          definition.keys.collect(&:to_sym)
        else
          raise "Definition must be either an Array or a Hash"
        end
      end

      # The configured model name (from configuration[:model]).
      def railsmodel
        configuration[:model]
      end

      # The ORDER BY clause for the query; defaults to "id".
      def order
        configuration[:order] || "id"
      end

      # Yields an ETL::Row for every record of the model, in +order+.
      #
      # Fixed: the original declared an unused +&block+ parameter while the
      # body used +yield+; the parameter has been dropped.
      def each
        railsmodel.to_s.camelize.constantize.find(:all, :order => order).each do |record|
          result_row = ETL::Row.new
          result_row.source = self
          # +columns+ already returns Symbols, so the original per-column
          # +to_sym+ conversion was redundant and has been removed.
          columns.each do |column|
            result_row[column] = record.send(column)
          end
          yield result_row
        end
      end
    end
  end
end
|
data/lib/etl/core_ext.rb
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require 'etl/core_ext/time'
|
@@ -0,0 +1,42 @@
|
|
1
|
+
#Updated by Jack Hong on 04/05/08

module ETL #:nodoc:
  module CoreExtensions #:nodoc:
    module Time #:nodoc:
      # Enables the use of time calculations within Time itself
      module Calculations
        # Calendar week of year (1..52). A 53rd partial week is folded
        # into week 52.
        def week
          [((yday - 1) / 7) + 1, 52].min
        end

        # Calendar quarter (1..4) for this date's month.
        def quarter
          ((month - 1) / 3) + 1
        end

        # Fiscal week of year (1..52), relative to the fiscal year
        # beginning in +offset_month+. A 53rd partial week folds into 52.
        def fiscal_year_week(offset_month=10)
          [((fiscal_year_yday(offset_month) - 1) / 7) + 1, 52].min
        end

        # Fiscal month (1..12): month 1 is +offset_month+ of the calendar
        # year.
        def fiscal_year_month(offset_month=10)
          shifted = month - offset_month + 1
          shifted += 12 if shifted <= 0
          shifted
        end

        # Fiscal quarter (1..4) derived from the fiscal month.
        def fiscal_year_quarter(offset_month=10)
          ((fiscal_year_month(offset_month) - 1) / 3) + 1
        end

        # The fiscal year this date belongs to: dates on or after
        # +offset_month+ count toward the next calendar year.
        def fiscal_year(offset_month=10)
          month >= offset_month ? year + 1 : year
        end

        # Day of the fiscal year (1-based), relative to the fiscal year
        # beginning in +offset_month+.
        def fiscal_year_yday(offset_month=10)
          prior_days = (1...offset_month).inject(0) do |total, m|
            total + ::Time.days_in_month(m, year)
          end
          shifted = yday - prior_days
          # NOTE(review): wrap uses a fixed 365-day year and ignores leap
          # years — confirm whether that is acceptable for this dimension.
          shifted += 365 if shifted <= 0
          shifted
        end
      end
    end
  end
end
|
data/lib/etl/engine.rb
ADDED
@@ -0,0 +1,556 @@
|
|
1
|
+
module ETL #:nodoc:

  class Base < ActiveRecord::Base
  end

  # The main ETL engine class
  class Engine
    include ETL::Util

    class << self
      # Initialization that is run when a job is executed.
      #
      # Options:
      # * <tt>:limit</tt>: Limit the number of records returned from sources
      # * <tt>:offset</tt>: Specify the records for data from sources
      # * <tt>:log_write_mode</tt>: If true then the log will write, otherwise it will append
      # * <tt>:skip_bulk_import</tt>: Set to true to skip bulk import
      # * <tt>:read_locally</tt>: Set to true to read from the local cache
      # * <tt>:rails_root</tt>: Set to the rails root to boot rails
      def init(options={})
        unless @initialized
          puts "initializing ETL engine\n\n"
          @limit = options[:limit]
          @offset = options[:offset]
          @log_write_mode = 'w' if options[:newlog]
          @skip_bulk_import = options[:skip_bulk_import]
          @read_locally = options[:read_locally]
          @rails_root = options[:rails_root]

          require File.join(@rails_root, 'config/environment') if @rails_root
          options[:config] ||= 'database.yml'
          options[:config] = 'config/database.yml' unless File.exist?(options[:config])
          database_configuration = YAML::load(ERB.new(IO.read(options[:config])).result + "\n")
          ActiveRecord::Base.configurations.merge!(database_configuration)
          ETL::Base.configurations = database_configuration
          #puts "configurations in init: #{ActiveRecord::Base.configurations.inspect}"

          require 'etl/execution'
          ETL::Execution::Base.establish_connection :etl_execution
          ETL::Execution::Execution.migrate

          @initialized = true
        end
      end

      # Process the specified file. Acceptable values for file are:
      # * Path to a file
      # * File object
      # * ETL::Control::Control instance
      # * ETL::Batch::Batch instance
      #
      # The process command will accept either a .ctl Control file or a .ebf
      # ETL Batch File.
      def process(file)
        new().process(file)
      end

      attr_accessor :timestamped_log

      # Accessor for the log write mode. Default is 'a' for append.
      attr_accessor :log_write_mode
      def log_write_mode
        @log_write_mode ||= 'a'
      end

      # A logger for the engine
      attr_accessor :logger

      def logger #:nodoc:
        unless @logger
          if timestamped_log
            @logger = Logger.new("etl_#{timestamp}.log")
          else
            @logger = Logger.new(File.open('etl.log', log_write_mode))
          end
          @logger.level = Logger::WARN
          @logger.formatter = Logger::Formatter.new
        end
        @logger
      end

      # Get a timestamp value as a string
      def timestamp
        Time.now.strftime("%Y%m%d%H%M%S")
      end

      # The current source
      attr_accessor :current_source

      # The current source row
      attr_accessor :current_source_row

      # The current destination
      attr_accessor :current_destination

      # Set to true to activate realtime activity. This will cause certain
      # information messages to be printed to STDOUT
      attr_accessor :realtime_activity

      # Accessor for the total number of rows read from sources
      attr_accessor :rows_read
      def rows_read
        @rows_read ||= 0
      end

      # Accessor for the total number of rows processed
      attr_accessor :rows_written
      def rows_written
        @rows_written ||= 0
      end

      # Access the current ETL::Execution::Job instance
      attr_accessor :job

      # Access the current ETL::Execution::Batch instance
      attr_accessor :batch

      # The limit on rows to load from the source, useful for testing the ETL
      # process prior to executing the entire batch. Default value is nil and
      # indicates that there is no limit
      attr_accessor :limit

      # The offset for the source to begin at, useful for testing the ETL
      # process prior to executing the entire batch. Default value is nil and
      # indicates that there is no offset
      attr_accessor :offset

      # Set to true to skip all bulk importing
      attr_accessor :skip_bulk_import

      # Set to true to read locally from the last source cache files
      attr_accessor :read_locally

      # Accessor for the average rows per second processed
      attr_accessor :average_rows_per_second

      # Get a named connection, establishing and caching it on first use and
      # reconnecting if it has gone inactive.
      def connection(name)
        logger.debug "Retrieving connection #{name}"
        conn = connections[name] ||= establish_connection(name)
        #conn.verify!(ActiveRecord::Base.verification_timeout)
        conn.reconnect! unless conn.active?
        conn
      end

      # Set to true to use temp tables
      attr_accessor :use_temp_tables

      # Get a registry of temp tables
      def temp_tables
        @temp_tables ||= {}
      end

      # Called when a batch job finishes, allowing for cleanup to occur.
      # Each temp table atomically replaces its real table via rename.
      def finish
        temp_tables.each do |temp_table, mapping|
          actual_table = mapping[:table]
          #puts "move #{temp_table} to #{actual_table}"
          conn = mapping[:connection]
          conn.transaction do
            conn.rename_table(actual_table, "#{actual_table}_old")
            conn.rename_table(temp_table, actual_table)
            conn.drop_table("#{actual_table}_old")
          end
        end
      end

      # Return true if using temp tables
      def use_temp_tables?
        use_temp_tables ? true : false
      end

      # Modify the table name if necessary. When temp tables are enabled the
      # first call for a table creates a "tmp_" copy and registers it for
      # promotion in +finish+.
      def table(table_name, connection)
        if use_temp_tables?
          returning "tmp_#{table_name}" do |temp_table_name|
            if temp_tables[temp_table_name].nil?
              # Create the temp table and add it to the mapping
              begin connection.drop_table(temp_table_name); rescue; end
              connection.copy_table(table_name, temp_table_name)
              temp_tables[temp_table_name] = {
                :table => table_name,
                :connection => connection
              }
            end
          end
        else
          table_name
        end
      end

      protected
      # Hash of database connections that can be used throughout the ETL
      # process
      def connections
        @connections ||= {}
      end

      # Establish the named connection and return the database specific connection
      def establish_connection(name)
        logger.debug "Establishing connection to #{name}"
        conn_config = ETL::Base.configurations[name.to_s]
        raise ETL::ETLError, "No connection found for #{name}" unless conn_config
        connection_method = "#{conn_config['adapter']}_connection"
        ETL::Base.send(connection_method, conn_config)
      end
    end # class << self

    # Say the specified message, with a newline
    def say(message)
      say_without_newline(message + "\n")
    end

    # Say the specified message without a newline
    def say_without_newline(message)
      if ETL::Engine.realtime_activity
        $stdout.print message
        $stdout.flush
      end
    end

    # Say the message on its own line
    def say_on_own_line(message)
      say("\n" + message)
    end

    # Array of errors encountered during execution of the ETL process
    def errors
      @errors ||= []
    end

    # Get a Hash of benchmark values where each value represents the total
    # amount of time in seconds spent processing in that portion of the ETL
    # pipeline. Keys include:
    # * <tt>:transforms</tt>
    # * <tt>:after_reads</tt>
    # * <tt>:before_writes</tt>
    # * <tt>:writes</tt>
    def benchmarks
      @benchmarks ||= {
        :transforms => 0,
        :after_reads => 0,
        :before_writes => 0,
        :writes => 0,
      }
    end

    # Process a file, control object or batch object. Acceptable values for
    # file are:
    # * Path to a file
    # * File object
    # * ETL::Control::Control instance
    # * ETL::Batch::Batch instance
    def process(file)
      case file
      when String
        process(File.new(file))
      when File
        # Fixed: the dot in the original /.ctl$/ and /.ebf$/ patterns was
        # unescaped, so any filename merely ending in "ctl"/"ebf" matched.
        process_control(file) if file.path =~ /\.ctl$/
        process_batch(file) if file.path =~ /\.ebf$/
      when ETL::Control::Control
        process_control(file)
      when ETL::Batch::Batch
        process_batch(file)
      else
        # Fixed: message previously contained an embedded newline and raw
        # source indentation.
        raise RuntimeError, "Process object must be a String, File, Control instance or Batch instance"
      end
    end

    protected
    # Process the specified batch file
    def process_batch(batch)
      batch = ETL::Batch::Batch.resolve(batch, self)
      say "Processing batch #{batch.file}"

      ETL::Engine.batch = ETL::Execution::Batch.create!(
        :batch_file => batch.file,
        :status => 'executing'
      )

      batch.execute

      ETL::Engine.batch.completed_at = Time.now
      ETL::Engine.batch.status = (errors.length > 0 ? 'completed with errors' : 'completed')
      ETL::Engine.batch.save!
    end

    # Process the specified control file
    def process_control(control)
      control = ETL::Control::Control.resolve(control)
      say_on_own_line "Processing control #{control.file}"

      ETL::Engine.job = ETL::Execution::Job.create!(
        :control_file => control.file,
        :status => 'executing',
        :batch_id => ETL::Engine.batch ? ETL::Engine.batch.id : nil
      )

      execute_dependencies(control)

      start_time = Time.now
      pre_process(control)
      sources = control.sources
      destinations = control.destinations

      say "Skipping bulk import" if Engine.skip_bulk_import

      sources.each do |source|
        Engine.current_source = source
        Engine.logger.debug "Processing source #{source}"
        say "Source: #{source}"
        say "Limiting enabled: #{Engine.limit}" if Engine.limit != nil
        say "Offset enabled: #{Engine.offset}" if Engine.offset != nil
        source.each_with_index do |row, index|
          # Break out of the row loop if the +Engine.limit+ is specified and
          # the number of rows read exceeds that value.
          if Engine.limit != nil && Engine.rows_read >= Engine.limit
            puts "Reached limit of #{Engine.limit}"
            break
          end

          Engine.logger.debug "Row #{index}: #{row.inspect}"
          Engine.rows_read += 1
          Engine.current_source_row = index + 1
          say_without_newline "." if Engine.realtime_activity && index > 0 && index % 1000 == 0

          # At this point a single row may be turned into multiple rows via row
          # processors all code after this line should work with the array of
          # rows rather than the single row
          rows = [row]

          # NOTE(review): the +break+/+next+ inside the Benchmark.realtime
          # blocks below exit the benchmark block (skipping the benchmark
          # accumulation) rather than the row loop directly — preserved
          # as-is from the original control flow.
          t = Benchmark.realtime do
            begin
              Engine.logger.debug "Processing after read"
              control.after_read_processors.each do |processor|
                processed_rows = []
                rows.each do |row|
                  processed_rows << processor.process(row)
                end
                rows = processed_rows.flatten
              end
            rescue => e
              msg = "Error processing rows after read from #{Engine.current_source} on line #{Engine.current_source_row}: #{e}"
              errors << msg
              Engine.logger.error(msg)
              exceeded_error_threshold?(control) ? break : next
            end
          end
          benchmarks[:after_reads] += t unless t.nil?

          t = Benchmark.realtime do
            begin
              Engine.logger.debug "Executing transforms"
              rows.each do |row|
                control.transforms.each do |transform|
                  name = transform.name.to_sym
                  row[name] = transform.transform(name, row[name], row)
                end
              end
            rescue ResolverError => e
              Engine.logger.error(e.message)
              errors << e.message
            rescue => e
              msg = "Error transforming from #{Engine.current_source} on line #{Engine.current_source_row}: #{e}"
              errors << msg
              Engine.logger.error(msg)
              e.backtrace.each { |line| Engine.logger.error(line) }
            ensure
              begin
                exceeded_error_threshold?(control) ? break : next
              rescue => inner_error
                puts inner_error
              end
            end
          end
          benchmarks[:transforms] += t unless t.nil?

          t = Benchmark.realtime do
            begin
              # execute row-level "before write" processing
              Engine.logger.debug "Processing before write"
              control.before_write_processors.each do |processor|
                processed_rows = []
                rows.each { |row| processed_rows << processor.process(row) }
                rows = processed_rows.flatten.compact
              end
            rescue => e
              msg = "Error processing rows before write from #{Engine.current_source} on line #{Engine.current_source_row}: #{e}"
              errors << msg
              Engine.logger.error(msg)
              e.backtrace.each { |line| Engine.logger.error(line) }
              exceeded_error_threshold?(control) ? break : next
            end
          end
          benchmarks[:before_writes] += t unless t.nil?

          t = Benchmark.realtime do
            begin
              # write the row to the destination
              destinations.each_with_index do |destination, index|
                Engine.current_destination = destination
                rows.each do |row|
                  destination.write(row)
                  Engine.rows_written += 1 if index == 0
                end
              end
            rescue => e
              msg = "Error writing to #{Engine.current_destination}: #{e}"
              errors << msg
              Engine.logger.error msg
              e.backtrace.each { |line| Engine.logger.error(line) }
              exceeded_error_threshold?(control) ? break : next
            end
          end
          benchmarks[:writes] += t unless t.nil?
        end

        if exceeded_error_threshold?(control)
          say_on_own_line "Exiting due to exceeding error threshold: #{control.error_threshold}"
          return
        end

      end

      destinations.each do |destination|
        destination.close
      end

      say_on_own_line "Executing before post-process screens"
      begin
        execute_screens(control)
      rescue FatalScreenError => e
        say "Fatal screen error during job execution: #{e.message}"
        exit
      rescue ScreenError => e
        say "Screen error during job execution: #{e.message}"
        return
      else
        say "Screens passed"
      end

      post_process(control)

      if sources.length > 0
        say_on_own_line "Read #{Engine.rows_read} lines from sources"
      end
      if destinations.length > 0
        say "Wrote #{Engine.rows_written} lines to destinations"
      end

      say_on_own_line "Executing after post-process screens"
      begin
        execute_screens(control, :after_post_process)
      rescue FatalScreenError => e
        say "Fatal screen error during job execution: #{e.message}"
        exit
      rescue ScreenError => e
        say "Screen error during job execution: #{e.message}"
        return
      else
        say "Screens passed"
      end

      say_on_own_line "Completed #{control.file} in #{distance_of_time_in_words(start_time)} with #{errors.length} errors."
      say "Processing average: #{Engine.average_rows_per_second} rows/sec)"

      say "Avg after_reads: #{Engine.rows_read/benchmarks[:after_reads]} rows/sec" if benchmarks[:after_reads] > 0
      say "Avg before_writes: #{Engine.rows_read/benchmarks[:before_writes]} rows/sec" if benchmarks[:before_writes] > 0
      say "Avg transforms: #{Engine.rows_read/benchmarks[:transforms]} rows/sec" if benchmarks[:transforms] > 0
      say "Avg writes: #{Engine.rows_read/benchmarks[:writes]} rows/sec" if benchmarks[:writes] > 0

      say "Avg time writing execution records: #{ETL::Execution::Record.average_time_spent}"

      # ETL::Transform::Transform.benchmarks.each do |klass, t|
      #   say "Avg #{klass}: #{Engine.rows_read/t} rows/sec"
      # end

      ETL::Engine.job.completed_at = Time.now
      ETL::Engine.job.status = (errors.length > 0 ? 'completed with errors' : 'completed')
      ETL::Engine.job.save!
    end

    private
    # Return true if the error threshold is exceeded
    def exceeded_error_threshold?(control)
      errors.length > control.error_threshold
    end

    # Execute all preprocessors
    def pre_process(control)
      Engine.logger.debug "Pre-processing #{control.file}"
      control.pre_processors.each do |processor|
        processor.process
      end
      Engine.logger.debug "Pre-processing complete"
    end

    # Execute all postprocessors
    def post_process(control)
      say_on_own_line "Executing post processes"
      Engine.logger.debug "Post-processing #{control.file}"
      control.post_processors.each do |processor|
        processor.process
      end
      Engine.logger.debug "Post-processing complete"
      say "Post-processing complete"
    end

    # Execute all dependencies
    def execute_dependencies(control)
      Engine.logger.debug "Executing dependencies"
      control.dependencies.flatten.each do |dependency|
        case dependency
        when Symbol
          f = dependency.to_s + '.ctl'
          Engine.logger.debug "Executing dependency: #{f}"
          say "Executing dependency: #{f}"
          process(f)
        when String
          # Fixed: the original interpolated +f+ here, which is only
          # assigned in the Symbol branch, so the dependency name was
          # missing from these messages.
          Engine.logger.debug "Executing dependency: #{dependency}"
          say "Executing dependency: #{dependency}"
          process(dependency)
        else
          raise "Invalid dependency type: #{dependency.class}"
        end
      end
    end

    # Execute all screens
    def execute_screens(control, timing = :before_post_process)
      screens = case timing
      when :after_post_process
        control.after_post_process_screens
      else # default to before post-process screens
        control.screens
      end
      [:fatal,:error,:warn].each do |type|
        screens[type].each do |block|
          begin
            block.call
          rescue => e
            case type
            when :fatal
              raise FatalScreenError, e
            when :error
              raise ScreenError, e
            when :warn
              say "Screen warning: #{e}"
            end
          end
        end
      end
    end
  end
end
|