darrell-activewarehouse-etl 0.9.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +198 -0
- data/LICENSE +7 -0
- data/README +99 -0
- data/Rakefile +175 -0
- data/TODO +28 -0
- data/bin/etl +28 -0
- data/bin/etl.cmd +8 -0
- data/examples/database.example.yml +16 -0
- data/lib/etl/batch/batch.rb +111 -0
- data/lib/etl/batch/directives.rb +55 -0
- data/lib/etl/batch.rb +2 -0
- data/lib/etl/builder/date_dimension_builder.rb +96 -0
- data/lib/etl/builder/time_dimension_builder.rb +31 -0
- data/lib/etl/builder.rb +2 -0
- data/lib/etl/commands/etl.rb +89 -0
- data/lib/etl/control/control.rb +405 -0
- data/lib/etl/control/destination/database_destination.rb +97 -0
- data/lib/etl/control/destination/file_destination.rb +126 -0
- data/lib/etl/control/destination.rb +448 -0
- data/lib/etl/control/source/database_source.rb +220 -0
- data/lib/etl/control/source/enumerable_source.rb +11 -0
- data/lib/etl/control/source/file_source.rb +90 -0
- data/lib/etl/control/source/model_source.rb +39 -0
- data/lib/etl/control/source.rb +109 -0
- data/lib/etl/control.rb +3 -0
- data/lib/etl/core_ext/time/calculations.rb +42 -0
- data/lib/etl/core_ext/time.rb +5 -0
- data/lib/etl/core_ext.rb +1 -0
- data/lib/etl/engine.rb +556 -0
- data/lib/etl/execution/base.rb +9 -0
- data/lib/etl/execution/batch.rb +8 -0
- data/lib/etl/execution/job.rb +8 -0
- data/lib/etl/execution/migration.rb +85 -0
- data/lib/etl/execution.rb +19 -0
- data/lib/etl/generator/generator.rb +20 -0
- data/lib/etl/generator/surrogate_key_generator.rb +39 -0
- data/lib/etl/generator.rb +2 -0
- data/lib/etl/http_tools.rb +139 -0
- data/lib/etl/parser/apache_combined_log_parser.rb +49 -0
- data/lib/etl/parser/delimited_parser.rb +74 -0
- data/lib/etl/parser/fixed_width_parser.rb +65 -0
- data/lib/etl/parser/parser.rb +41 -0
- data/lib/etl/parser/sax_parser.rb +218 -0
- data/lib/etl/parser/xml_parser.rb +65 -0
- data/lib/etl/parser.rb +11 -0
- data/lib/etl/processor/block_processor.rb +14 -0
- data/lib/etl/processor/bulk_import_processor.rb +83 -0
- data/lib/etl/processor/check_exist_processor.rb +80 -0
- data/lib/etl/processor/check_unique_processor.rb +35 -0
- data/lib/etl/processor/copy_field_processor.rb +26 -0
- data/lib/etl/processor/encode_processor.rb +55 -0
- data/lib/etl/processor/hierarchy_exploder_processor.rb +55 -0
- data/lib/etl/processor/print_row_processor.rb +12 -0
- data/lib/etl/processor/processor.rb +25 -0
- data/lib/etl/processor/rename_processor.rb +24 -0
- data/lib/etl/processor/require_non_blank_processor.rb +26 -0
- data/lib/etl/processor/row_processor.rb +17 -0
- data/lib/etl/processor/sequence_processor.rb +23 -0
- data/lib/etl/processor/surrogate_key_processor.rb +53 -0
- data/lib/etl/processor/truncate_processor.rb +35 -0
- data/lib/etl/processor.rb +11 -0
- data/lib/etl/row.rb +20 -0
- data/lib/etl/screen/row_count_screen.rb +20 -0
- data/lib/etl/screen.rb +14 -0
- data/lib/etl/transform/block_transform.rb +13 -0
- data/lib/etl/transform/date_to_string_transform.rb +20 -0
- data/lib/etl/transform/decode_transform.rb +51 -0
- data/lib/etl/transform/default_transform.rb +20 -0
- data/lib/etl/transform/foreign_key_lookup_transform.rb +169 -0
- data/lib/etl/transform/hierarchy_lookup_transform.rb +49 -0
- data/lib/etl/transform/ordinalize_transform.rb +12 -0
- data/lib/etl/transform/sha1_transform.rb +13 -0
- data/lib/etl/transform/string_to_date_transform.rb +16 -0
- data/lib/etl/transform/string_to_datetime_transform.rb +14 -0
- data/lib/etl/transform/string_to_time_transform.rb +11 -0
- data/lib/etl/transform/transform.rb +61 -0
- data/lib/etl/transform/trim_transform.rb +26 -0
- data/lib/etl/transform/type_transform.rb +35 -0
- data/lib/etl/transform.rb +2 -0
- data/lib/etl/util.rb +59 -0
- data/lib/etl/version.rb +9 -0
- data/lib/etl.rb +83 -0
- metadata +245 -0
data/lib/etl/engine.rb
ADDED
@@ -0,0 +1,556 @@
|
|
1
|
+
module ETL #:nodoc:

  class Base < ActiveRecord::Base
  end

  # The main ETL engine class
  class Engine
    include ETL::Util

    class << self
      # Initialization that is run when a job is executed.
      #
      # Options:
      # * <tt>:limit</tt>: Limit the number of records returned from sources
      # * <tt>:offset</tt>: Specify the records for data from sources
      # * <tt>:log_write_mode</tt>: If true then the log will write, otherwise it will append
      # * <tt>:skip_bulk_import</tt>: Set to true to skip bulk import
      # * <tt>:read_locally</tt>: Set to true to read from the local cache
      # * <tt>:rails_root</tt>: Set to the rails root to boot rails
      def init(options={})
        unless @initialized
          puts "initializing ETL engine\n\n"
          @limit = options[:limit]
          @offset = options[:offset]
          @log_write_mode = 'w' if options[:newlog]
          @skip_bulk_import = options[:skip_bulk_import]
          @read_locally = options[:read_locally]
          @rails_root = options[:rails_root]

          # Boot the Rails environment when a rails root is supplied so that
          # its models and configuration are available during the ETL run.
          require File.join(@rails_root, 'config/environment') if @rails_root
          options[:config] ||= 'database.yml'
          options[:config] = 'config/database.yml' unless File.exist?(options[:config])
          # Run the config through ERB so it can contain dynamic values.
          database_configuration = YAML::load(ERB.new(IO.read(options[:config])).result + "\n")
          ActiveRecord::Base.configurations.merge!(database_configuration)
          ETL::Base.configurations = database_configuration

          require 'etl/execution'
          ETL::Execution::Base.establish_connection :etl_execution
          ETL::Execution::Execution.migrate

          @initialized = true
        end
      end

      # Process the specified file. Acceptable values for file are:
      # * Path to a file
      # * File object
      # * ETL::Control::Control instance
      # * ETL::Batch::Batch instance
      #
      # The process command will accept either a .ctl Control file or a .ebf
      # ETL Batch File.
      def process(file)
        new().process(file)
      end

      # Set to true to log to a timestamped file (etl_<timestamp>.log)
      attr_accessor :timestamped_log

      # Writer for the log write mode; reader below supplies the default.
      # (attr_writer instead of attr_accessor so the custom reader does not
      # redefine a generated method.)
      attr_writer :log_write_mode

      # Accessor for the log write mode. Default is 'a' for append.
      def log_write_mode
        @log_write_mode ||= 'a'
      end

      # Writer for the engine logger; reader below builds one lazily.
      attr_writer :logger

      # A logger for the engine. Lazily created at WARN level, writing either
      # to a timestamped file or to etl.log in the configured write mode.
      def logger #:nodoc:
        unless @logger
          if timestamped_log
            @logger = Logger.new("etl_#{timestamp}.log")
          else
            @logger = Logger.new(File.open('etl.log', log_write_mode))
          end
          @logger.level = Logger::WARN
          @logger.formatter = Logger::Formatter.new
        end
        @logger
      end

      # Get a timestamp value as a string
      def timestamp
        Time.now.strftime("%Y%m%d%H%M%S")
      end

      # The current source
      attr_accessor :current_source

      # The current source row
      attr_accessor :current_source_row

      # The current destination
      attr_accessor :current_destination

      # Set to true to activate realtime activity. This will cause certain
      # information messages to be printed to STDOUT
      attr_accessor :realtime_activity

      # Writer for the total rows read; reader below defaults to 0.
      attr_writer :rows_read

      # Accessor for the total number of rows read from sources
      def rows_read
        @rows_read ||= 0
      end

      # Writer for the total rows written; reader below defaults to 0.
      attr_writer :rows_written

      # Accessor for the total number of rows processed
      def rows_written
        @rows_written ||= 0
      end

      # Access the current ETL::Execution::Job instance
      attr_accessor :job

      # Access the current ETL::Execution::Batch instance
      attr_accessor :batch

      # The limit on rows to load from the source, useful for testing the ETL
      # process prior to executing the entire batch. Default value is nil and
      # indicates that there is no limit
      attr_accessor :limit

      # The offset for the source to begin at, useful for testing the ETL
      # process prior to executing the entire batch. Default value is nil and
      # indicates that there is no offset
      attr_accessor :offset

      # Set to true to skip all bulk importing
      attr_accessor :skip_bulk_import

      # Set to true to read locally from the last source cache files
      attr_accessor :read_locally

      # Accessor for the average rows per second processed
      # NOTE(review): no visible code in this file assigns this value —
      # presumably a processor or control sets it; confirm before relying on it.
      attr_accessor :average_rows_per_second

      # Get a named connection, establishing and caching it on first use and
      # reconnecting if the cached connection has gone stale.
      def connection(name)
        logger.debug "Retrieving connection #{name}"
        conn = connections[name] ||= establish_connection(name)
        conn.reconnect! unless conn.active?
        conn
      end

      # Set to true to use temp tables
      attr_accessor :use_temp_tables

      # Get a registry of temp tables
      def temp_tables
        @temp_tables ||= {}
      end

      # Called when a batch job finishes, allowing for cleanup to occur.
      # Swaps each registered temp table into place for its actual table and
      # drops the old table, all within a transaction per table.
      def finish
        temp_tables.each do |temp_table, mapping|
          actual_table = mapping[:table]
          conn = mapping[:connection]
          conn.transaction do
            conn.rename_table(actual_table, "#{actual_table}_old")
            conn.rename_table(temp_table, actual_table)
            conn.drop_table("#{actual_table}_old")
          end
        end
      end

      # Return true if using temp tables
      def use_temp_tables?
        use_temp_tables ? true : false
      end

      # Modify the table name if necessary. When temp tables are in use,
      # returns "tmp_<table_name>" (creating the temp table as a copy of the
      # real table on first use and registering it for +finish+); otherwise
      # returns the table name unchanged.
      def table(table_name, connection)
        if use_temp_tables?
          returning "tmp_#{table_name}" do |temp_table_name|
            if temp_tables[temp_table_name].nil?
              # Create the temp table and add it to the mapping; a failed
              # drop (table does not exist yet) is deliberately ignored.
              begin connection.drop_table(temp_table_name); rescue; end
              connection.copy_table(table_name, temp_table_name)
              temp_tables[temp_table_name] = {
                :table => table_name,
                :connection => connection
              }
            end
          end
        else
          table_name
        end
      end

      protected
      # Hash of database connections that can be used throughout the ETL
      # process
      def connections
        @connections ||= {}
      end

      # Establish the named connection and return the database specific
      # connection. Raises ETL::ETLError when no configuration exists for
      # the given name.
      def establish_connection(name)
        logger.debug "Establishing connection to #{name}"
        conn_config = ETL::Base.configurations[name.to_s]
        raise ETL::ETLError, "No connection found for #{name}" unless conn_config
        connection_method = "#{conn_config['adapter']}_connection"
        ETL::Base.send(connection_method, conn_config)
      end
    end # class << self

    # Say the specified message, with a newline
    def say(message)
      say_without_newline(message + "\n")
    end

    # Say the specified message without a newline. Only prints when
    # realtime activity is enabled.
    def say_without_newline(message)
      if ETL::Engine.realtime_activity
        $stdout.print message
        $stdout.flush
      end
    end

    # Say the message on its own line
    def say_on_own_line(message)
      say("\n" + message)
    end

    # Array of errors encountered during execution of the ETL process
    def errors
      @errors ||= []
    end

    # Get a Hash of benchmark values where each value represents the total
    # amount of time in seconds spent processing in that portion of the ETL
    # pipeline. Keys include:
    # * <tt>:transforms</tt>
    # * <tt>:after_reads</tt>
    # * <tt>:before_writes</tt>
    # * <tt>:writes</tt>
    def benchmarks
      @benchmarks ||= {
        :transforms => 0,
        :after_reads => 0,
        :before_writes => 0,
        :writes => 0,
      }
    end

    # Process a file, control object or batch object. Acceptable values for
    # file are:
    # * Path to a file
    # * File object
    # * ETL::Control::Control instance
    # * ETL::Batch::Batch instance
    def process(file)
      case file
      when String
        process(File.new(file))
      when File
        # Escape the dot so only real .ctl/.ebf extensions match (the
        # original /.ctl$/ would also match e.g. "myXctl").
        process_control(file) if file.path =~ /\.ctl$/
        process_batch(file) if file.path =~ /\.ebf$/
      when ETL::Control::Control
        process_control(file)
      when ETL::Batch::Batch
        process_batch(file)
      else
        raise RuntimeError, "Process object must be a String, File, Control instance or Batch instance"
      end
    end

    protected
    # Process the specified batch file, recording its execution status in
    # the etl_execution data store.
    def process_batch(batch)
      batch = ETL::Batch::Batch.resolve(batch, self)
      say "Processing batch #{batch.file}"

      ETL::Engine.batch = ETL::Execution::Batch.create!(
        :batch_file => batch.file,
        :status => 'executing'
      )

      batch.execute

      ETL::Engine.batch.completed_at = Time.now
      ETL::Engine.batch.status = (errors.length > 0 ? 'completed with errors' : 'completed')
      ETL::Engine.batch.save!
    end

    # Process the specified control file: run dependencies and
    # pre-processors, pump every source row through after-read processors,
    # transforms and before-write processors into each destination, then run
    # screens and post-processors, recording job status throughout.
    def process_control(control)
      control = ETL::Control::Control.resolve(control)
      say_on_own_line "Processing control #{control.file}"

      ETL::Engine.job = ETL::Execution::Job.create!(
        :control_file => control.file,
        :status => 'executing',
        :batch_id => ETL::Engine.batch ? ETL::Engine.batch.id : nil
      )

      execute_dependencies(control)

      start_time = Time.now
      pre_process(control)
      sources = control.sources
      destinations = control.destinations

      say "Skipping bulk import" if Engine.skip_bulk_import

      sources.each do |source|
        Engine.current_source = source
        Engine.logger.debug "Processing source #{source.inspect}"
        say "Source: #{source}"
        say "Limiting enabled: #{Engine.limit}" if Engine.limit != nil
        say "Offset enabled: #{Engine.offset}" if Engine.offset != nil
        source.each_with_index do |row, index|
          # Break out of the row loop if the +Engine.limit+ is specified and
          # the number of rows read exceeds that value.
          if Engine.limit != nil && Engine.rows_read >= Engine.limit
            puts "Reached limit of #{Engine.limit}"
            break
          end

          Engine.logger.debug "Row #{index}: #{row.inspect}"
          Engine.rows_read += 1
          Engine.current_source_row = index + 1
          say_without_newline "." if Engine.realtime_activity && index > 0 && index % 1000 == 0

          # At this point a single row may be turned into multiple rows via row
          # processors all code after this line should work with the array of
          # rows rather than the single row
          rows = [row]

          t = Benchmark.realtime do
            begin
              Engine.logger.debug "Processing after read"
              control.after_read_processors.each do |processor|
                processed_rows = []
                rows.each do |r|
                  processed_rows << processor.process(r)
                end
                rows = processed_rows.flatten
              end
            rescue => e
              msg = "Error processing rows after read from #{Engine.current_source} on line #{Engine.current_source_row}: #{e}"
              errors << msg
              Engine.logger.error(msg)
              # Abort the source entirely once the error threshold is
              # exceeded, otherwise skip to the next row.
              exceeded_error_threshold?(control) ? break : next
            end
          end
          benchmarks[:after_reads] += t unless t.nil?

          t = Benchmark.realtime do
            begin
              Engine.logger.debug "Executing transforms"
              rows.each do |r|
                control.transforms.each do |transform|
                  name = transform.name.to_sym
                  r[name] = transform.transform(name, r[name], r)
                end
              end
            rescue ResolverError => e
              Engine.logger.error(e.message)
              errors << e.message
            rescue => e
              msg = "Error transforming from #{Engine.current_source} on line #{Engine.current_source_row}: #{e}"
              errors << msg
              Engine.logger.error(msg)
              e.backtrace.each { |line| Engine.logger.error(line) }
            ensure
              begin
                exceeded_error_threshold?(control) ? break : next
              rescue => inner_error
                puts inner_error
              end
            end
          end
          benchmarks[:transforms] += t unless t.nil?

          t = Benchmark.realtime do
            begin
              # execute row-level "before write" processing
              Engine.logger.debug "Processing before write"
              control.before_write_processors.each do |processor|
                processed_rows = []
                rows.each { |r| processed_rows << processor.process(r) }
                rows = processed_rows.flatten.compact
              end
            rescue => e
              msg = "Error processing rows before write from #{Engine.current_source} on line #{Engine.current_source_row}: #{e}"
              errors << msg
              Engine.logger.error(msg)
              e.backtrace.each { |line| Engine.logger.error(line) }
              exceeded_error_threshold?(control) ? break : next
            end
          end
          benchmarks[:before_writes] += t unless t.nil?

          t = Benchmark.realtime do
            begin
              # write the row to the destination; only the first destination
              # counts toward rows_written so the total is not multiplied by
              # the number of destinations.
              destinations.each_with_index do |destination, dest_index|
                Engine.current_destination = destination
                rows.each do |r|
                  destination.write(r)
                  Engine.rows_written += 1 if dest_index == 0
                end
              end
            rescue => e
              msg = "Error writing to #{Engine.current_destination}: #{e}"
              errors << msg
              Engine.logger.error msg
              e.backtrace.each { |line| Engine.logger.error(line) }
              exceeded_error_threshold?(control) ? break : next
            end
          end
          benchmarks[:writes] += t unless t.nil?
        end

        if exceeded_error_threshold?(control)
          say_on_own_line "Exiting due to exceeding error threshold: #{control.error_threshold}"
          return
        end

      end

      destinations.each do |destination|
        destination.close
      end

      say_on_own_line "Executing before post-process screens"
      begin
        execute_screens(control)
      rescue FatalScreenError => e
        say "Fatal screen error during job execution: #{e.message}"
        exit
      rescue ScreenError => e
        say "Screen error during job execution: #{e.message}"
        return
      else
        say "Screens passed"
      end

      post_process(control)

      if sources.length > 0
        say_on_own_line "Read #{Engine.rows_read} lines from sources"
      end
      if destinations.length > 0
        say "Wrote #{Engine.rows_written} lines to destinations"
      end

      say_on_own_line "Executing after post-process screens"
      begin
        execute_screens(control, :after_post_process)
      rescue FatalScreenError => e
        say "Fatal screen error during job execution: #{e.message}"
        exit
      rescue ScreenError => e
        say "Screen error during job execution: #{e.message}"
        return
      else
        say "Screens passed"
      end

      say_on_own_line "Completed #{control.file} in #{distance_of_time_in_words(start_time)} with #{errors.length} errors."
      say "Processing average: #{Engine.average_rows_per_second} rows/sec"

      say "Avg after_reads: #{Engine.rows_read/benchmarks[:after_reads]} rows/sec" if benchmarks[:after_reads] > 0
      say "Avg before_writes: #{Engine.rows_read/benchmarks[:before_writes]} rows/sec" if benchmarks[:before_writes] > 0
      say "Avg transforms: #{Engine.rows_read/benchmarks[:transforms]} rows/sec" if benchmarks[:transforms] > 0
      say "Avg writes: #{Engine.rows_read/benchmarks[:writes]} rows/sec" if benchmarks[:writes] > 0

      # say "Avg time writing execution records: #{ETL::Execution::Record.average_time_spent}"
      #
      # ETL::Transform::Transform.benchmarks.each do |klass, t|
      #   say "Avg #{klass}: #{Engine.rows_read/t} rows/sec"
      # end

      ETL::Engine.job.completed_at = Time.now
      ETL::Engine.job.status = (errors.length > 0 ? 'completed with errors' : 'completed')
      ETL::Engine.job.save!
    end

    private
    # Return true if the error threshold is exceeded
    def exceeded_error_threshold?(control)
      errors.length > control.error_threshold
    end

    # Execute all preprocessors
    def pre_process(control)
      Engine.logger.debug "Pre-processing #{control.file}"
      control.pre_processors.each do |processor|
        processor.process
      end
      Engine.logger.debug "Pre-processing complete"
    end

    # Execute all postprocessors
    def post_process(control)
      say_on_own_line "Executing post processes"
      Engine.logger.debug "Post-processing #{control.file}"
      control.post_processors.each do |processor|
        processor.process
      end
      Engine.logger.debug "Post-processing complete"
      say "Post-processing complete"
    end

    # Execute all dependencies. Symbols are resolved to "<name>.ctl" files;
    # Strings are processed as-is.
    def execute_dependencies(control)
      Engine.logger.debug "Executing dependencies"
      control.dependencies.flatten.each do |dependency|
        case dependency
        when Symbol
          f = dependency.to_s + '.ctl'
          Engine.logger.debug "Executing dependency: #{f}"
          say "Executing dependency: #{f}"
          process(f)
        when String
          # BUGFIX: the original interpolated +f+, which is only assigned in
          # the Symbol branch above; use the dependency itself here.
          Engine.logger.debug "Executing dependency: #{dependency}"
          say "Executing dependency: #{dependency}"
          process(dependency)
        else
          raise "Invalid dependency type: #{dependency.class}"
        end
      end
    end

    # Execute all screens for the given timing (:before_post_process by
    # default, or :after_post_process). A failing :fatal screen raises
    # FatalScreenError, :error raises ScreenError, :warn only reports.
    def execute_screens(control, timing = :before_post_process)
      screens = case timing
        when :after_post_process
          control.after_post_process_screens
        else # default to before post-process screens
          control.screens
        end
      [:fatal,:error,:warn].each do |type|
        screens[type].each do |block|
          begin
            block.call
          rescue => e
            case type
            when :fatal
              raise FatalScreenError, e
            when :error
              raise ScreenError, e
            when :warn
              say "Screen warning: #{e}"
            end
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,85 @@
|
|
1
|
+
module ETL #:nodoc:
  module Execution #:nodoc:
    # Handles migration of tables required for persistent storage of meta data
    # for the ETL engine
    class Migration
      class << self
        protected

        # Name of the table Rails uses to record applied migrations.
        def schema_info_table_name
          ActiveRecord::Migrator.schema_migrations_table_name
        end
        alias :schema_migrations_table_name :schema_info_table_name

        public

        # Execute the migrations: ensure the schema-migrations table exists,
        # then apply every migration_N between the last applied version and
        # the target, marking each version as applied as it runs.
        def migrate
          connection.initialize_schema_migrations_table
          ((last_migration + 1)..target).each do |version|
            send("migration_#{version}")
            connection.assume_migrated_upto_version(version)
          end
        end

        protected

        # Highest migration version already applied, or 0 when none.
        def last_migration
          versions = connection.select_values(
            "SELECT version FROM #{schema_migrations_table_name}"
          )
          versions.map { |v| v.to_i }.max || 0
        end

        # Get the connection to use during migration
        def connection
          @connection ||= ETL::Execution::Base.connection
        end

        # Get the final target version number
        def target
          4
        end

        private

        # Create the initial jobs and records tables.
        def migration_1 #:nodoc:
          connection.create_table :jobs do |t|
            t.column :control_file, :string, :null => false
            t.column :created_at, :datetime, :null => false
            t.column :completed_at, :datetime
            t.column :status, :string
          end
          connection.create_table :records do |t|
            t.column :control_file, :string, :null => false
            t.column :natural_key, :string, :null => false
            t.column :crc, :string, :null => false
            t.column :job_id, :integer, :null => false
          end
        end

        # Index the records table for lookups by file, key and job.
        def migration_2 #:nodoc:
          connection.add_index :records, :control_file
          connection.add_index :records, :natural_key
          connection.add_index :records, :job_id
        end

        # Introduce batches and associate jobs with a batch.
        def migration_3 #:nodoc:
          connection.create_table :batches do |t|
            t.column :batch_file, :string, :null => false
            t.column :created_at, :datetime, :null => false
            t.column :completed_at, :datetime
            t.column :status, :string
          end
          connection.add_column :jobs, :batch_id, :integer
          connection.add_index :jobs, :batch_id
        end

        # Records are no longer tracked in the execution store.
        def migration_4
          connection.drop_table :records
        end

        # Update the schema info table, setting the version value
        def update_schema_info(version)
          connection.update("UPDATE #{schema_info_table_name} SET version = #{version}")
        end
      end
    end
  end
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
module ETL #:nodoc:
  # Classes which store information about ETL execution
  module Execution
    # Execution management
    class Execution
      # Migrate the data store by delegating to the Migration class.
      def self.migrate
        ETL::Execution::Migration.migrate
      end
    end
  end
end

require 'etl/execution/base'
require 'etl/execution/batch'
require 'etl/execution/job'
require 'etl/execution/migration'
|