darrell-activewarehouse-etl 0.9.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (83)
  1. data/CHANGELOG +198 -0
  2. data/LICENSE +7 -0
  3. data/README +99 -0
  4. data/Rakefile +175 -0
  5. data/TODO +28 -0
  6. data/bin/etl +28 -0
  7. data/bin/etl.cmd +8 -0
  8. data/examples/database.example.yml +16 -0
  9. data/lib/etl/batch/batch.rb +111 -0
  10. data/lib/etl/batch/directives.rb +55 -0
  11. data/lib/etl/batch.rb +2 -0
  12. data/lib/etl/builder/date_dimension_builder.rb +96 -0
  13. data/lib/etl/builder/time_dimension_builder.rb +31 -0
  14. data/lib/etl/builder.rb +2 -0
  15. data/lib/etl/commands/etl.rb +89 -0
  16. data/lib/etl/control/control.rb +405 -0
  17. data/lib/etl/control/destination/database_destination.rb +97 -0
  18. data/lib/etl/control/destination/file_destination.rb +126 -0
  19. data/lib/etl/control/destination.rb +448 -0
  20. data/lib/etl/control/source/database_source.rb +220 -0
  21. data/lib/etl/control/source/enumerable_source.rb +11 -0
  22. data/lib/etl/control/source/file_source.rb +90 -0
  23. data/lib/etl/control/source/model_source.rb +39 -0
  24. data/lib/etl/control/source.rb +109 -0
  25. data/lib/etl/control.rb +3 -0
  26. data/lib/etl/core_ext/time/calculations.rb +42 -0
  27. data/lib/etl/core_ext/time.rb +5 -0
  28. data/lib/etl/core_ext.rb +1 -0
  29. data/lib/etl/engine.rb +556 -0
  30. data/lib/etl/execution/base.rb +9 -0
  31. data/lib/etl/execution/batch.rb +8 -0
  32. data/lib/etl/execution/job.rb +8 -0
  33. data/lib/etl/execution/migration.rb +85 -0
  34. data/lib/etl/execution.rb +19 -0
  35. data/lib/etl/generator/generator.rb +20 -0
  36. data/lib/etl/generator/surrogate_key_generator.rb +39 -0
  37. data/lib/etl/generator.rb +2 -0
  38. data/lib/etl/http_tools.rb +139 -0
  39. data/lib/etl/parser/apache_combined_log_parser.rb +49 -0
  40. data/lib/etl/parser/delimited_parser.rb +74 -0
  41. data/lib/etl/parser/fixed_width_parser.rb +65 -0
  42. data/lib/etl/parser/parser.rb +41 -0
  43. data/lib/etl/parser/sax_parser.rb +218 -0
  44. data/lib/etl/parser/xml_parser.rb +65 -0
  45. data/lib/etl/parser.rb +11 -0
  46. data/lib/etl/processor/block_processor.rb +14 -0
  47. data/lib/etl/processor/bulk_import_processor.rb +83 -0
  48. data/lib/etl/processor/check_exist_processor.rb +80 -0
  49. data/lib/etl/processor/check_unique_processor.rb +35 -0
  50. data/lib/etl/processor/copy_field_processor.rb +26 -0
  51. data/lib/etl/processor/encode_processor.rb +55 -0
  52. data/lib/etl/processor/hierarchy_exploder_processor.rb +55 -0
  53. data/lib/etl/processor/print_row_processor.rb +12 -0
  54. data/lib/etl/processor/processor.rb +25 -0
  55. data/lib/etl/processor/rename_processor.rb +24 -0
  56. data/lib/etl/processor/require_non_blank_processor.rb +26 -0
  57. data/lib/etl/processor/row_processor.rb +17 -0
  58. data/lib/etl/processor/sequence_processor.rb +23 -0
  59. data/lib/etl/processor/surrogate_key_processor.rb +53 -0
  60. data/lib/etl/processor/truncate_processor.rb +35 -0
  61. data/lib/etl/processor.rb +11 -0
  62. data/lib/etl/row.rb +20 -0
  63. data/lib/etl/screen/row_count_screen.rb +20 -0
  64. data/lib/etl/screen.rb +14 -0
  65. data/lib/etl/transform/block_transform.rb +13 -0
  66. data/lib/etl/transform/date_to_string_transform.rb +20 -0
  67. data/lib/etl/transform/decode_transform.rb +51 -0
  68. data/lib/etl/transform/default_transform.rb +20 -0
  69. data/lib/etl/transform/foreign_key_lookup_transform.rb +169 -0
  70. data/lib/etl/transform/hierarchy_lookup_transform.rb +49 -0
  71. data/lib/etl/transform/ordinalize_transform.rb +12 -0
  72. data/lib/etl/transform/sha1_transform.rb +13 -0
  73. data/lib/etl/transform/string_to_date_transform.rb +16 -0
  74. data/lib/etl/transform/string_to_datetime_transform.rb +14 -0
  75. data/lib/etl/transform/string_to_time_transform.rb +11 -0
  76. data/lib/etl/transform/transform.rb +61 -0
  77. data/lib/etl/transform/trim_transform.rb +26 -0
  78. data/lib/etl/transform/type_transform.rb +35 -0
  79. data/lib/etl/transform.rb +2 -0
  80. data/lib/etl/util.rb +59 -0
  81. data/lib/etl/version.rb +9 -0
  82. data/lib/etl.rb +83 -0
  83. metadata +245 -0
data/lib/etl/engine.rb ADDED
@@ -0,0 +1,556 @@
1
+ module ETL #:nodoc:
2
+
3
+ class Base < ActiveRecord::Base
4
+ end
5
+
6
+ # The main ETL engine class
7
+ class Engine
8
+ include ETL::Util
9
+
10
+ class << self
11
+ # Initialization that is run when a job is executed.
12
+ #
13
+ # Options:
14
+ # * <tt>:limit</tt>: Limit the number of records returned from sources
15
+ # * <tt>:offset</tt>: Specify the offset at which to begin reading records from sources
16
+ # * <tt>:newlog</tt>: If true then a new log file is written, otherwise messages are appended to the existing log
17
+ # * <tt>:skip_bulk_import</tt>: Set to true to skip bulk import
18
+ # * <tt>:read_locally</tt>: Set to true to read from the local cache
19
+ # * <tt>:rails_root</tt>: Set to the rails root to boot rails
20
+ def init(options={})
21
+ unless @initialized
22
+ puts "initializing ETL engine\n\n"
23
+ @limit = options[:limit]
24
+ @offset = options[:offset]
25
+ @log_write_mode = 'w' if options[:newlog]
26
+ @skip_bulk_import = options[:skip_bulk_import]
27
+ @read_locally = options[:read_locally]
28
+ @rails_root = options[:rails_root]
29
+
30
+ require File.join(@rails_root, 'config/environment') if @rails_root
31
+ options[:config] ||= 'database.yml'
32
+ options[:config] = 'config/database.yml' unless File.exist?(options[:config])
33
+ database_configuration = YAML::load(ERB.new(IO.read(options[:config])).result + "\n")
34
+ ActiveRecord::Base.configurations.merge!(database_configuration)
35
+ ETL::Base.configurations = database_configuration
36
+ #puts "configurations in init: #{ActiveRecord::Base.configurations.inspect}"
37
+
38
+ require 'etl/execution'
39
+ ETL::Execution::Base.establish_connection :etl_execution
40
+ ETL::Execution::Execution.migrate
41
+
42
+ @initialized = true
43
+ end
44
+ end
45
+
46
+ # Process the specified file. Acceptable values for file are:
47
+ # * Path to a file
48
+ # * File object
49
+ # * ETL::Control::Control instance
50
+ # * ETL::Batch::Batch instance
51
+ #
52
+ # The process command will accept either a .ctl Control file or a .ebf
53
+ # ETL Batch File.
54
+ def process(file)
55
+ new().process(file)
56
+ end
57
+
58
+ attr_accessor :timestamped_log
59
+
60
+ # Accessor for the log write mode. Default is 'a' for append.
61
+ attr_accessor :log_write_mode
62
+ def log_write_mode
63
+ @log_write_mode ||= 'a'
64
+ end
65
+
66
+ # A logger for the engine
67
+ attr_accessor :logger
68
+
69
+ def logger #:nodoc:
70
+ unless @logger
71
+ if timestamped_log
72
+ @logger = Logger.new("etl_#{timestamp}.log")
73
+ else
74
+ @logger = Logger.new(File.open('etl.log', log_write_mode))
75
+ end
76
+ @logger.level = Logger::WARN
77
+ @logger.formatter = Logger::Formatter.new
78
+ end
79
+ @logger
80
+ end
81
+
82
+ # Get a timestamp value as a string
83
+ def timestamp
84
+ Time.now.strftime("%Y%m%d%H%M%S")
85
+ end
86
+
87
+ # The current source
88
+ attr_accessor :current_source
89
+
90
+ # The current source row
91
+ attr_accessor :current_source_row
92
+
93
+ # The current destination
94
+ attr_accessor :current_destination
95
+
96
+ # Set to true to activate realtime activity. This will cause certain
97
+ # information messages to be printed to STDOUT
98
+ attr_accessor :realtime_activity
99
+
100
+ # Accessor for the total number of rows read from sources
101
+ attr_accessor :rows_read
102
+ def rows_read
103
+ @rows_read ||= 0
104
+ end
105
+
106
+ # Accessor for the total number of rows written to destinations
107
+ attr_accessor :rows_written
108
+ def rows_written
109
+ @rows_written ||= 0
110
+ end
111
+
112
+ # Access the current ETL::Execution::Job instance
113
+ attr_accessor :job
114
+
115
+ # Access the current ETL::Execution::Batch instance
116
+ attr_accessor :batch
117
+
118
+ # The limit on rows to load from the source, useful for testing the ETL
119
+ # process prior to executing the entire batch. Default value is nil and
120
+ # indicates that there is no limit
121
+ attr_accessor :limit
122
+
123
+ # The offset for the source to begin at, useful for testing the ETL
124
+ # process prior to executing the entire batch. Default value is nil and
125
+ # indicates that there is no offset
126
+ attr_accessor :offset
127
+
128
+ # Set to true to skip all bulk importing
129
+ attr_accessor :skip_bulk_import
130
+
131
+ # Set to true to read locally from the last source cache files
132
+ attr_accessor :read_locally
133
+
134
+ # Accessor for the average rows per second processed
135
+ attr_accessor :average_rows_per_second
136
+
137
+ # Get a named connection
138
+ def connection(name)
139
+ logger.debug "Retrieving connection #{name}"
140
+ conn = connections[name] ||= establish_connection(name)
141
+ #conn.verify!(ActiveRecord::Base.verification_timeout)
142
+ conn.reconnect! unless conn.active?
143
+ conn
144
+ end
145
+
146
+ # Set to true to use temp tables
147
+ attr_accessor :use_temp_tables
148
+
149
+ # Get a registry of temp tables
150
+ def temp_tables
151
+ @temp_tables ||= {}
152
+ end
153
+
154
+ # Called when a batch job finishes, allowing for cleanup to occur
155
+ def finish
156
+ temp_tables.each do |temp_table, mapping|
157
+ actual_table = mapping[:table]
158
+ #puts "move #{temp_table} to #{actual_table}"
159
+ conn = mapping[:connection]
160
+ conn.transaction do
161
+ conn.rename_table(actual_table, "#{actual_table}_old")
162
+ conn.rename_table(temp_table, actual_table)
163
+ conn.drop_table("#{actual_table}_old")
164
+ end
165
+ end
166
+ end
167
+
168
+ # Return true if using temp tables
169
+ def use_temp_tables?
170
+ use_temp_tables ? true : false
171
+ end
172
+
173
+ # Modify the table name if necessary
174
+ def table(table_name, connection)
175
+ if use_temp_tables?
176
+ returning "tmp_#{table_name}" do |temp_table_name|
177
+ if temp_tables[temp_table_name].nil?
178
+ # Create the temp table and add it to the mapping
179
+ begin connection.drop_table(temp_table_name); rescue; end
180
+ connection.copy_table(table_name, temp_table_name)
181
+ temp_tables[temp_table_name] = {
182
+ :table => table_name,
183
+ :connection => connection
184
+ }
185
+ end
186
+ end
187
+ else
188
+ table_name
189
+ end
190
+ end
191
+
192
+ protected
193
+ # Hash of database connections that can be used throughout the ETL
194
+ # process
195
+ def connections
196
+ @connections ||= {}
197
+ end
198
+
199
+ # Establish the named connection and return the database specific connection
200
+ def establish_connection(name)
201
+ logger.debug "Establishing connection to #{name}"
202
+ conn_config = ETL::Base.configurations[name.to_s]
203
+ raise ETL::ETLError, "No connection found for #{name}" unless conn_config
204
+ connection_method = "#{conn_config['adapter']}_connection"
205
+ ETL::Base.send(connection_method, conn_config)
206
+ end
207
+ end # class << self
208
+
209
+ # Say the specified message, with a newline
210
+ def say(message)
211
+ say_without_newline(message + "\n")
212
+ end
213
+
214
+ # Say the specified message without a newline
215
+ def say_without_newline(message)
216
+ if ETL::Engine.realtime_activity
217
+ $stdout.print message
218
+ $stdout.flush
219
+ end
220
+ end
221
+
222
+ # Say the message on its own line
223
+ def say_on_own_line(message)
224
+ say("\n" + message)
225
+ end
226
+
227
+ # Array of errors encountered during execution of the ETL process
228
+ def errors
229
+ @errors ||= []
230
+ end
231
+
232
+ # Get a Hash of benchmark values where each value represents the total
233
+ # amount of time in seconds spent processing in that portion of the ETL
234
+ # pipeline. Keys include:
235
+ # * <tt>:transforms</tt>
236
+ # * <tt>:after_reads</tt>
237
+ # * <tt>:before_writes</tt>
238
+ # * <tt>:writes</tt>
239
+ def benchmarks
240
+ @benchmarks ||= {
241
+ :transforms => 0,
242
+ :after_reads => 0,
243
+ :before_writes => 0,
244
+ :writes => 0,
245
+ }
246
+ end
247
+
248
+ # Process a file, control object or batch object. Acceptable values for
249
+ # file are:
250
+ # * Path to a file
251
+ # * File object
252
+ # * ETL::Control::Control instance
253
+ # * ETL::Batch::Batch instance
254
+ def process(file)
255
+ case file
256
+ when String
257
+ process(File.new(file))
258
+ when File
259
+ process_control(file) if file.path =~ /.ctl$/
260
+ process_batch(file) if file.path =~ /.ebf$/
261
+ when ETL::Control::Control
262
+ process_control(file)
263
+ when ETL::Batch::Batch
264
+ process_batch(file)
265
+ else
266
+ raise RuntimeError, "Process object must be a String, File, Control
267
+ instance or Batch instance"
268
+ end
269
+ end
270
+
271
+ protected
272
+ # Process the specified batch file
273
+ def process_batch(batch)
274
+ batch = ETL::Batch::Batch.resolve(batch, self)
275
+ say "Processing batch #{batch.file}"
276
+
277
+ ETL::Engine.batch = ETL::Execution::Batch.create!(
278
+ :batch_file => batch.file,
279
+ :status => 'executing'
280
+ )
281
+
282
+ batch.execute
283
+
284
+ ETL::Engine.batch.completed_at = Time.now
285
+ ETL::Engine.batch.status = (errors.length > 0 ? 'completed with errors' : 'completed')
286
+ ETL::Engine.batch.save!
287
+ end
288
+
289
+ # Process the specified control file
290
+ def process_control(control)
291
+ control = ETL::Control::Control.resolve(control)
292
+ say_on_own_line "Processing control #{control.file}"
293
+
294
+ ETL::Engine.job = ETL::Execution::Job.create!(
295
+ :control_file => control.file,
296
+ :status => 'executing',
297
+ :batch_id => ETL::Engine.batch ? ETL::Engine.batch.id : nil
298
+ )
299
+
300
+ execute_dependencies(control)
301
+
302
+ start_time = Time.now
303
+ pre_process(control)
304
+ sources = control.sources
305
+ destinations = control.destinations
306
+
307
+ say "Skipping bulk import" if Engine.skip_bulk_import
308
+
309
+ sources.each do |source|
310
+ Engine.current_source = source
311
+ Engine.logger.debug "Processing source #{source.inspect}"
312
+ say "Source: #{source}"
313
+ say "Limiting enabled: #{Engine.limit}" if Engine.limit != nil
314
+ say "Offset enabled: #{Engine.offset}" if Engine.offset != nil
315
+ source.each_with_index do |row, index|
316
+ # Break out of the row loop if the +Engine.limit+ is specified and
317
+ # the number of rows read exceeds that value.
318
+ if Engine.limit != nil && Engine.rows_read >= Engine.limit
319
+ puts "Reached limit of #{Engine.limit}"
320
+ break
321
+ end
322
+
323
+ Engine.logger.debug "Row #{index}: #{row.inspect}"
324
+ Engine.rows_read += 1
325
+ Engine.current_source_row = index + 1
326
+ say_without_newline "." if Engine.realtime_activity && index > 0 && index % 1000 == 0
327
+
328
+ # At this point a single row may be turned into multiple rows via row
329
+ # processors. All code after this line should work with the array of
330
+ # rows rather than the single row
331
+ rows = [row]
332
+
333
+ t = Benchmark.realtime do
334
+ begin
335
+ Engine.logger.debug "Processing after read"
336
+ control.after_read_processors.each do |processor|
337
+ processed_rows = []
338
+ rows.each do |row|
339
+ processed_rows << processor.process(row)
340
+ end
341
+ rows = processed_rows.flatten
342
+ end
343
+ rescue => e
344
+ msg = "Error processing rows after read from #{Engine.current_source} on line #{Engine.current_source_row}: #{e}"
345
+ errors << msg
346
+ Engine.logger.error(msg)
347
+ exceeded_error_threshold?(control) ? break : next
348
+ end
349
+ end
350
+ benchmarks[:after_reads] += t unless t.nil?
351
+
352
+ t = Benchmark.realtime do
353
+ begin
354
+ Engine.logger.debug "Executing transforms"
355
+ rows.each do |row|
356
+ control.transforms.each do |transform|
357
+ name = transform.name.to_sym
358
+ row[name] = transform.transform(name, row[name], row)
359
+ end
360
+ end
361
+ rescue ResolverError => e
362
+ Engine.logger.error(e.message)
363
+ errors << e.message
364
+ rescue => e
365
+ msg = "Error transforming from #{Engine.current_source} on line #{Engine.current_source_row}: #{e}"
366
+ errors << msg
367
+ Engine.logger.error(msg)
368
+ e.backtrace.each { |line| Engine.logger.error(line) }
369
+ ensure
370
+ begin
371
+ exceeded_error_threshold?(control) ? break : next
372
+ rescue => inner_error
373
+ puts inner_error
374
+ end
375
+ end
376
+ end
377
+ benchmarks[:transforms] += t unless t.nil?
378
+
379
+ t = Benchmark.realtime do
380
+ begin
381
+ # execute row-level "before write" processing
382
+ Engine.logger.debug "Processing before write"
383
+ control.before_write_processors.each do |processor|
384
+ processed_rows = []
385
+ rows.each { |row| processed_rows << processor.process(row) }
386
+ rows = processed_rows.flatten.compact
387
+ end
388
+ rescue => e
389
+ msg = "Error processing rows before write from #{Engine.current_source} on line #{Engine.current_source_row}: #{e}"
390
+ errors << msg
391
+ Engine.logger.error(msg)
392
+ e.backtrace.each { |line| Engine.logger.error(line) }
393
+ exceeded_error_threshold?(control) ? break : next
394
+ end
395
+ end
396
+ benchmarks[:before_writes] += t unless t.nil?
397
+
398
+ t = Benchmark.realtime do
399
+ begin
400
+ # write the row to the destination
401
+ destinations.each_with_index do |destination, index|
402
+ Engine.current_destination = destination
403
+ rows.each do |row|
404
+ destination.write(row)
405
+ Engine.rows_written += 1 if index == 0
406
+ end
407
+ end
408
+ rescue => e
409
+ msg = "Error writing to #{Engine.current_destination}: #{e}"
410
+ errors << msg
411
+ Engine.logger.error msg
412
+ e.backtrace.each { |line| Engine.logger.error(line) }
413
+ exceeded_error_threshold?(control) ? break : next
414
+ end
415
+ end
416
+ benchmarks[:writes] += t unless t.nil?
417
+ end
418
+
419
+ if exceeded_error_threshold?(control)
420
+ say_on_own_line "Exiting due to exceeding error threshold: #{control.error_threshold}"
421
+ return
422
+ end
423
+
424
+ end
425
+
426
+ destinations.each do |destination|
427
+ destination.close
428
+ end
429
+
430
+ say_on_own_line "Executing before post-process screens"
431
+ begin
432
+ execute_screens(control)
433
+ rescue FatalScreenError => e
434
+ say "Fatal screen error during job execution: #{e.message}"
435
+ exit
436
+ rescue ScreenError => e
437
+ say "Screen error during job execution: #{e.message}"
438
+ return
439
+ else
440
+ say "Screens passed"
441
+ end
442
+
443
+ post_process(control)
444
+
445
+ if sources.length > 0
446
+ say_on_own_line "Read #{Engine.rows_read} lines from sources"
447
+ end
448
+ if destinations.length > 0
449
+ say "Wrote #{Engine.rows_written} lines to destinations"
450
+ end
451
+
452
+ say_on_own_line "Executing after post-process screens"
453
+ begin
454
+ execute_screens(control, :after_post_process)
455
+ rescue FatalScreenError => e
456
+ say "Fatal screen error during job execution: #{e.message}"
457
+ exit
458
+ rescue ScreenError => e
459
+ say "Screen error during job execution: #{e.message}"
460
+ return
461
+ else
462
+ say "Screens passed"
463
+ end
464
+
465
+ say_on_own_line "Completed #{control.file} in #{distance_of_time_in_words(start_time)} with #{errors.length} errors."
466
+ say "Processing average: #{Engine.average_rows_per_second} rows/sec)"
467
+
468
+ say "Avg after_reads: #{Engine.rows_read/benchmarks[:after_reads]} rows/sec" if benchmarks[:after_reads] > 0
469
+ say "Avg before_writes: #{Engine.rows_read/benchmarks[:before_writes]} rows/sec" if benchmarks[:before_writes] > 0
470
+ say "Avg transforms: #{Engine.rows_read/benchmarks[:transforms]} rows/sec" if benchmarks[:transforms] > 0
471
+ say "Avg writes: #{Engine.rows_read/benchmarks[:writes]} rows/sec" if benchmarks[:writes] > 0
472
+
473
+ # say "Avg time writing execution records: #{ETL::Execution::Record.average_time_spent}"
474
+ #
475
+ # ETL::Transform::Transform.benchmarks.each do |klass, t|
476
+ # say "Avg #{klass}: #{Engine.rows_read/t} rows/sec"
477
+ # end
478
+
479
+ ETL::Engine.job.completed_at = Time.now
480
+ ETL::Engine.job.status = (errors.length > 0 ? 'completed with errors' : 'completed')
481
+ ETL::Engine.job.save!
482
+ end
483
+
484
+ private
485
+ # Return true if the error threshold is exceeded
486
+ def exceeded_error_threshold?(control)
487
+ errors.length > control.error_threshold
488
+ end
489
+
490
+ # Execute all preprocessors
491
+ def pre_process(control)
492
+ Engine.logger.debug "Pre-processing #{control.file}"
493
+ control.pre_processors.each do |processor|
494
+ processor.process
495
+ end
496
+ Engine.logger.debug "Pre-processing complete"
497
+ end
498
+
499
+ # Execute all postprocessors
500
+ def post_process(control)
501
+ say_on_own_line "Executing post processes"
502
+ Engine.logger.debug "Post-processing #{control.file}"
503
+ control.post_processors.each do |processor|
504
+ processor.process
505
+ end
506
+ Engine.logger.debug "Post-processing complete"
507
+ say "Post-processing complete"
508
+ end
509
+
510
+ # Execute all dependencies
511
+ def execute_dependencies(control)
512
+ Engine.logger.debug "Executing dependencies"
513
+ control.dependencies.flatten.each do |dependency|
514
+ case dependency
515
+ when Symbol
516
+ f = dependency.to_s + '.ctl'
517
+ Engine.logger.debug "Executing dependency: #{f}"
518
+ say "Executing dependency: #{f}"
519
+ process(f)
520
+ when String
521
+ Engine.logger.debug "Executing dependency: #{f}"
522
+ say "Executing dependency: #{f}"
523
+ process(dependency)
524
+ else
525
+ raise "Invalid dependency type: #{dependency.class}"
526
+ end
527
+ end
528
+ end
529
+
530
+ # Execute all screens
531
+ def execute_screens(control, timing = :before_post_process)
532
+ screens = case timing
533
+ when :after_post_process
534
+ control.after_post_process_screens
535
+ else # default to before post-process screens
536
+ control.screens
537
+ end
538
+ [:fatal,:error,:warn].each do |type|
539
+ screens[type].each do |block|
540
+ begin
541
+ block.call
542
+ rescue => e
543
+ case type
544
+ when :fatal
545
+ raise FatalScreenError, e
546
+ when :error
547
+ raise ScreenError, e
548
+ when :warn
549
+ say "Screen warning: #{e}"
550
+ end
551
+ end
552
+ end
553
+ end
554
+ end
555
+ end
556
+ end
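The class methods above are the usual entry point for running a control file: initialize the engine once, then hand a .ctl or .ebf file to process. A minimal sketch, assuming a hypothetical configuration path and control file name:

    require 'etl'

    # Options mirror those documented on Engine.init above.
    ETL::Engine.init(
      :config           => 'config/database.yml', # hypothetical path to connection settings
      :limit            => 1_000,                 # stop after 1,000 source rows (handy for testing)
      :skip_bulk_import => true                   # skip bulk import on this run
    )
    ETL::Engine.realtime_activity = true          # print progress messages to STDOUT

    # .ctl files are routed to process_control, .ebf batch files to process_batch.
    ETL::Engine.process('load_people.ctl')        # hypothetical control file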
data/lib/etl/execution/base.rb ADDED
@@ -0,0 +1,9 @@
1
+ module ETL #:nodoc:
2
+ module Execution #:nodoc:
3
+ # Base class for ETL execution information
4
+ class Base < ActiveRecord::Base
5
+ self.abstract_class = true
6
+ establish_connection :etl_execution
7
+ end
8
+ end
9
+ end
data/lib/etl/execution/batch.rb ADDED
@@ -0,0 +1,8 @@
1
+ module ETL #:nodoc:
2
+ module Execution #:nodoc:
3
+ # Persistent class representing an ETL batch
4
+ class Batch < Base
5
+ has_many :jobs
6
+ end
7
+ end
8
+ end
data/lib/etl/execution/job.rb ADDED
@@ -0,0 +1,8 @@
1
+ module ETL #:nodoc:
2
+ module Execution #:nodoc:
3
+ # Persistent class representing an ETL job
4
+ class Job < Base
5
+ belongs_to :batch
6
+ end
7
+ end
8
+ end
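Because Batch has_many :jobs and Job belongs_to :batch, the run history recorded by the engine can be inspected with ordinary ActiveRecord calls once the etl_execution connection is in place; an illustrative sketch:

    # Illustrative only: report each job in the most recent batch.
    batch = ETL::Execution::Batch.find(:first, :order => 'created_at DESC')
    if batch
      puts "#{batch.batch_file} (#{batch.status})"
      batch.jobs.each do |job|
        puts "  #{job.control_file}: #{job.status}, completed #{job.completed_at}"
      end
    end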
data/lib/etl/execution/migration.rb ADDED
@@ -0,0 +1,85 @@
1
+ module ETL #:nodoc:
2
+ module Execution #:nodoc:
3
+ # Handles migration of tables required for persistent storage of meta data
4
+ # for the ETL engine
5
+ class Migration
6
+ class << self
7
+ protected
8
+ # Get the schema info table name
9
+ def schema_info_table_name
10
+ ActiveRecord::Migrator.schema_migrations_table_name
11
+ end
12
+ alias :schema_migrations_table_name :schema_info_table_name
13
+
14
+ public
15
+ # Execute the migrations
16
+ def migrate
17
+ connection.initialize_schema_migrations_table
18
+ last_migration.upto(target - 1) do |i|
19
+ __send__("migration_#{i+1}".to_sym)
20
+ connection.assume_migrated_upto_version(i+1)
21
+ end
22
+ end
23
+
24
+ protected
25
+ def last_migration
26
+ connection.select_values(
27
+ "SELECT version FROM #{schema_migrations_table_name}"
28
+ ).map(&:to_i).sort.last || 0
29
+ end
30
+
31
+ # Get the connection to use during migration
32
+ def connection
33
+ @connection ||= ETL::Execution::Base.connection
34
+ end
35
+
36
+ # Get the final target version number
37
+ def target
38
+ 4
39
+ end
40
+
41
+ private
42
+ def migration_1 #:nodoc:
43
+ connection.create_table :jobs do |t|
44
+ t.column :control_file, :string, :null => false
45
+ t.column :created_at, :datetime, :null => false
46
+ t.column :completed_at, :datetime
47
+ t.column :status, :string
48
+ end
49
+ connection.create_table :records do |t|
50
+ t.column :control_file, :string, :null => false
51
+ t.column :natural_key, :string, :null => false
52
+ t.column :crc, :string, :null => false
53
+ t.column :job_id, :integer, :null => false
54
+ end
55
+ end
56
+
57
+ def migration_2 #:nodoc:
58
+ connection.add_index :records, :control_file
59
+ connection.add_index :records, :natural_key
60
+ connection.add_index :records, :job_id
61
+ end
62
+
63
+ def migration_3 #:nodoc:
64
+ connection.create_table :batches do |t|
65
+ t.column :batch_file, :string, :null => false
66
+ t.column :created_at, :datetime, :null => false
67
+ t.column :completed_at, :datetime
68
+ t.column :status, :string
69
+ end
70
+ connection.add_column :jobs, :batch_id, :integer
71
+ connection.add_index :jobs, :batch_id
72
+ end
73
+
74
+ def migration_4
75
+ connection.drop_table :records
76
+ end
77
+
78
+ # Update the schema info table, setting the version value
79
+ def update_schema_info(version)
80
+ connection.update("UPDATE #{schema_info_table_name} SET version = #{version}")
81
+ end
82
+ end
83
+ end
84
+ end
85
+ end
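The migrate method resumes from whatever version is recorded in the schema migrations table and applies each numbered step up to target. As a purely hypothetical sketch of how a further step would slot into this scheme (no such migration exists in this release), one would add another numbered method and raise target:

    # Hypothetical illustration only -- not part of this gem.
    def migration_5 #:nodoc:
      # target would be bumped to 5 so migrate picks this step up on the next run.
      connection.add_column :jobs, :rows_written, :integer
    end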
data/lib/etl/execution.rb ADDED
@@ -0,0 +1,19 @@
1
+ module ETL #:nodoc:
2
+ # Classes which store information about ETL execution
3
+ module Execution
4
+ # Execution management
5
+ class Execution
6
+ class << self
7
+ # Migrate the data store
8
+ def migrate
9
+ ETL::Execution::Migration.migrate
10
+ end
11
+ end
12
+ end
13
+ end
14
+ end
15
+
16
+ require 'etl/execution/base'
17
+ require 'etl/execution/batch'
18
+ require 'etl/execution/job'
19
+ require 'etl/execution/migration'
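ETL::Engine.init (in engine.rb above) performs this bootstrap automatically; a hedged sketch of the same steps done by hand, assuming the loaded database.yml defines an etl_execution connection:

    # Mirrors the steps in ETL::Engine.init: load connection settings, then
    # migrate the execution metadata store (the jobs and batches tables).
    require 'etl'

    config = YAML.load(ERB.new(IO.read('database.yml')).result) # hypothetical path
    ActiveRecord::Base.configurations.merge!(config)
    ETL::Base.configurations = config

    require 'etl/execution'
    ETL::Execution::Base.establish_connection :etl_execution
    ETL::Execution::Execution.migrate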