darrell-activewarehouse-etl 0.9.1.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (83) hide show
  1. data/CHANGELOG +198 -0
  2. data/LICENSE +7 -0
  3. data/README +99 -0
  4. data/Rakefile +175 -0
  5. data/TODO +28 -0
  6. data/bin/etl +28 -0
  7. data/bin/etl.cmd +8 -0
  8. data/examples/database.example.yml +16 -0
  9. data/lib/etl/batch/batch.rb +111 -0
  10. data/lib/etl/batch/directives.rb +55 -0
  11. data/lib/etl/batch.rb +2 -0
  12. data/lib/etl/builder/date_dimension_builder.rb +96 -0
  13. data/lib/etl/builder/time_dimension_builder.rb +31 -0
  14. data/lib/etl/builder.rb +2 -0
  15. data/lib/etl/commands/etl.rb +89 -0
  16. data/lib/etl/control/control.rb +405 -0
  17. data/lib/etl/control/destination/database_destination.rb +97 -0
  18. data/lib/etl/control/destination/file_destination.rb +126 -0
  19. data/lib/etl/control/destination.rb +448 -0
  20. data/lib/etl/control/source/database_source.rb +220 -0
  21. data/lib/etl/control/source/enumerable_source.rb +11 -0
  22. data/lib/etl/control/source/file_source.rb +90 -0
  23. data/lib/etl/control/source/model_source.rb +39 -0
  24. data/lib/etl/control/source.rb +109 -0
  25. data/lib/etl/control.rb +3 -0
  26. data/lib/etl/core_ext/time/calculations.rb +42 -0
  27. data/lib/etl/core_ext/time.rb +5 -0
  28. data/lib/etl/core_ext.rb +1 -0
  29. data/lib/etl/engine.rb +556 -0
  30. data/lib/etl/execution/base.rb +9 -0
  31. data/lib/etl/execution/batch.rb +8 -0
  32. data/lib/etl/execution/job.rb +8 -0
  33. data/lib/etl/execution/migration.rb +85 -0
  34. data/lib/etl/execution.rb +19 -0
  35. data/lib/etl/generator/generator.rb +20 -0
  36. data/lib/etl/generator/surrogate_key_generator.rb +39 -0
  37. data/lib/etl/generator.rb +2 -0
  38. data/lib/etl/http_tools.rb +139 -0
  39. data/lib/etl/parser/apache_combined_log_parser.rb +49 -0
  40. data/lib/etl/parser/delimited_parser.rb +74 -0
  41. data/lib/etl/parser/fixed_width_parser.rb +65 -0
  42. data/lib/etl/parser/parser.rb +41 -0
  43. data/lib/etl/parser/sax_parser.rb +218 -0
  44. data/lib/etl/parser/xml_parser.rb +65 -0
  45. data/lib/etl/parser.rb +11 -0
  46. data/lib/etl/processor/block_processor.rb +14 -0
  47. data/lib/etl/processor/bulk_import_processor.rb +83 -0
  48. data/lib/etl/processor/check_exist_processor.rb +80 -0
  49. data/lib/etl/processor/check_unique_processor.rb +35 -0
  50. data/lib/etl/processor/copy_field_processor.rb +26 -0
  51. data/lib/etl/processor/encode_processor.rb +55 -0
  52. data/lib/etl/processor/hierarchy_exploder_processor.rb +55 -0
  53. data/lib/etl/processor/print_row_processor.rb +12 -0
  54. data/lib/etl/processor/processor.rb +25 -0
  55. data/lib/etl/processor/rename_processor.rb +24 -0
  56. data/lib/etl/processor/require_non_blank_processor.rb +26 -0
  57. data/lib/etl/processor/row_processor.rb +17 -0
  58. data/lib/etl/processor/sequence_processor.rb +23 -0
  59. data/lib/etl/processor/surrogate_key_processor.rb +53 -0
  60. data/lib/etl/processor/truncate_processor.rb +35 -0
  61. data/lib/etl/processor.rb +11 -0
  62. data/lib/etl/row.rb +20 -0
  63. data/lib/etl/screen/row_count_screen.rb +20 -0
  64. data/lib/etl/screen.rb +14 -0
  65. data/lib/etl/transform/block_transform.rb +13 -0
  66. data/lib/etl/transform/date_to_string_transform.rb +20 -0
  67. data/lib/etl/transform/decode_transform.rb +51 -0
  68. data/lib/etl/transform/default_transform.rb +20 -0
  69. data/lib/etl/transform/foreign_key_lookup_transform.rb +169 -0
  70. data/lib/etl/transform/hierarchy_lookup_transform.rb +49 -0
  71. data/lib/etl/transform/ordinalize_transform.rb +12 -0
  72. data/lib/etl/transform/sha1_transform.rb +13 -0
  73. data/lib/etl/transform/string_to_date_transform.rb +16 -0
  74. data/lib/etl/transform/string_to_datetime_transform.rb +14 -0
  75. data/lib/etl/transform/string_to_time_transform.rb +11 -0
  76. data/lib/etl/transform/transform.rb +61 -0
  77. data/lib/etl/transform/trim_transform.rb +26 -0
  78. data/lib/etl/transform/type_transform.rb +35 -0
  79. data/lib/etl/transform.rb +2 -0
  80. data/lib/etl/util.rb +59 -0
  81. data/lib/etl/version.rb +9 -0
  82. data/lib/etl.rb +83 -0
  83. metadata +245 -0
data/lib/etl/engine.rb ADDED
@@ -0,0 +1,556 @@
1
+ module ETL #:nodoc:
2
+
3
# ActiveRecord base class for the ETL engine. The engine stores the loaded
# database configurations on this class and uses it to open named
# connections.
class Base < ActiveRecord::Base; end
5
+
6
+ # The main ETL engine class
7
+ class Engine
8
+ include ETL::Util
9
+
10
+ class << self
11
# Initialization that is run when a job is executed. Only the first call
# does any work; once the engine is initialized later calls return
# immediately.
#
# Options:
# * <tt>:limit</tt>: Limit the number of records returned from sources
# * <tt>:offset</tt>: The offset for the sources to begin at
# * <tt>:newlog</tt>: If true the log file is opened in write mode ('w'),
#   otherwise the default log write mode (append) is used
# * <tt>:skip_bulk_import</tt>: Set to true to skip bulk import
# * <tt>:read_locally</tt>: Set to true to read from the local cache
# * <tt>:rails_root</tt>: Set to the rails root to boot rails
# * <tt>:config</tt>: Path to the database configuration YAML (run through
#   ERB). Defaults to 'database.yml'; when that path does not exist
#   'config/database.yml' is used instead
#
# The passed options hash is not modified.
def init(options={})
  return if @initialized

  puts "initializing ETL engine\n\n"
  @limit = options[:limit]
  @offset = options[:offset]
  @log_write_mode = 'w' if options[:newlog]
  @skip_bulk_import = options[:skip_bulk_import]
  @read_locally = options[:read_locally]
  @rails_root = options[:rails_root]

  require File.join(@rails_root, 'config/environment') if @rails_root

  # Resolve the configuration path in a local so the caller's hash is
  # left untouched.
  config_file = options[:config] || 'database.yml'
  config_file = 'config/database.yml' unless File.exist?(config_file)
  # NOTE(review): YAML::load is used on the configuration file; this assumes
  # the file is trusted, locally controlled input.
  database_configuration = YAML::load(ERB.new(IO.read(config_file)).result + "\n")
  ActiveRecord::Base.configurations.merge!(database_configuration)
  ETL::Base.configurations = database_configuration

  # Loaded lazily: the execution models connect to :etl_execution, which is
  # only known once the configuration above has been read.
  require 'etl/execution'
  ETL::Execution::Base.establish_connection :etl_execution
  ETL::Execution::Execution.migrate

  @initialized = true
end
45
+
46
# Convenience entry point: builds a fresh engine instance and lets it
# process +file+. Acceptable values for +file+ are:
# * Path to a file
# * File object
# * ETL::Control::Control instance
# * ETL::Batch::Batch instance
#
# Either a .ctl Control file or a .ebf ETL Batch File may be given.
def process(file)
  engine = new
  engine.process(file)
end
57
+
58
# When set to a true value the logger writes to a timestamped file
# (etl_<timestamp>.log) instead of etl.log.
attr_accessor :timestamped_log

# Writer for the log write mode. Only the writer is generated here because
# the reader is hand-written below; generating both would redefine the
# reader and trigger a "method redefined" warning.
attr_writer :log_write_mode

# Reader for the log write mode. Default is 'a' for append.
def log_write_mode
  @log_write_mode ||= 'a'
end
65
+
66
# Writer for the engine logger. The reader is defined by hand below, so
# only the writer is generated (avoids redefining the reader).
attr_writer :logger

# The engine logger. Created on first use: it writes to a timestamped file
# when +timestamped_log+ is set, otherwise to etl.log opened with
# +log_write_mode+. A newly created logger is set to WARN level and uses
# the standard Logger formatter.
def logger #:nodoc:
  return @logger if @logger

  target = if timestamped_log
    "etl_#{timestamp}.log"
  else
    File.open('etl.log', log_write_mode)
  end
  @logger = Logger.new(target)
  @logger.level = Logger::WARN
  @logger.formatter = Logger::Formatter.new
  @logger
end
81
+
82
# Current time rendered as a 14-digit string: year, month, day, hour,
# minute and second, with no separators.
def timestamp
  pattern = "%Y%m%d%H%M%S"
  Time.now.strftime(pattern)
end
86
+
87
# The source currently being read
attr_accessor :current_source

# The 1-based number of the row currently being read from the source
attr_accessor :current_source_row

# The destination currently being written to
attr_accessor :current_destination

# Set to true to activate realtime activity. When enabled, informational
# messages are printed to STDOUT while the engine runs
attr_accessor :realtime_activity
99
+
100
# Writer for the total number of rows read from sources. The reader is
# hand-written below, so only the writer is generated (avoids redefining
# the reader).
attr_writer :rows_read

# Total number of rows read from sources; starts at 0.
def rows_read
  @rows_read ||= 0
end

# Writer for the total number of rows written. The reader is hand-written
# below, so only the writer is generated.
attr_writer :rows_written

# Total number of rows written; starts at 0.
def rows_written
  @rows_written ||= 0
end
111
+
112
# The ETL::Execution::Job instance for the control file being processed
attr_accessor :job

# The ETL::Execution::Batch instance for the batch being processed
attr_accessor :batch

# Maximum number of rows to load from the sources. Handy for trying out an
# ETL process before running the entire batch. nil (the default) means no
# limit
attr_accessor :limit

# Offset at which the sources begin. Handy for trying out an ETL process
# before running the entire batch. nil (the default) means no offset
attr_accessor :offset

# Set to true to skip all bulk importing
attr_accessor :skip_bulk_import

# Set to true to read locally from the last source cache files
attr_accessor :read_locally

# The average number of rows processed per second
attr_accessor :average_rows_per_second
136
+
137
# Look up the connection registered under +name+, establishing and caching
# it on first use. A connection that reports itself inactive is reconnected
# before it is returned.
def connection(name)
  logger.debug "Retrieving connection #{name}"
  connections[name] ||= establish_connection(name)
  handle = connections[name]
  #handle.verify!(ActiveRecord::Base.verification_timeout)
  handle.reconnect! unless handle.active?
  handle
end
145
+
146
# Set to true to have table names redirected to temporary copies
attr_accessor :use_temp_tables

# Registry of temp tables, keyed by temp table name. Created empty on
# first access.
def temp_tables
  @temp_tables || (@temp_tables = {})
end
153
+
154
# Called when a batch job finishes, allowing for cleanup to occur. For
# every registered temp table, inside a transaction on the table's
# connection: the actual table is renamed to "<name>_old", the temp table
# takes over the actual name, and the "_old" table is dropped.
def finish
  temp_tables.each do |temp_table, mapping|
    conn = mapping[:connection]
    actual_table = mapping[:table]
    retired_table = "#{actual_table}_old"
    conn.transaction do
      conn.rename_table(actual_table, retired_table)
      conn.rename_table(temp_table, actual_table)
      conn.drop_table(retired_table)
    end
  end
end
167
+
168
# Strict boolean form of +use_temp_tables+: true when it is set to a
# truthy value, false otherwise.
def use_temp_tables?
  !!use_temp_tables
end
172
+
173
# Modify the table name if necessary.
#
# Without temp tables +table_name+ is returned unchanged. With temp tables
# enabled the name "tmp_<table_name>" is returned; the first time a given
# temp table is requested it is (re)created as a copy of +table_name+ on
# +connection+ and recorded in +temp_tables+ with its actual table name
# and connection.
def table(table_name, connection)
  return table_name unless use_temp_tables?

  temp_table_name = "tmp_#{table_name}"
  if temp_tables[temp_table_name].nil?
    # Create the temp table and add it to the mapping. Dropping a leftover
    # temp table is best effort: a failure here (for example because the
    # table does not exist yet) is deliberately ignored.
    begin
      connection.drop_table(temp_table_name)
    rescue StandardError
      nil
    end
    connection.copy_table(table_name, temp_table_name)
    temp_tables[temp_table_name] = {
      :table => table_name,
      :connection => connection
    }
  end
  temp_table_name
end
191
+
192
+ protected
193
# Cache of the database connections used throughout the ETL process, keyed
# by connection name. Created empty on first access.
def connections
  @connections || (@connections = {})
end
198
+
199
# Open the connection configured under +name+ and return the database
# specific connection. The configuration is looked up in
# ETL::Base.configurations by the string form of +name+, and the
# connection is created by calling "<adapter>_connection" on ETL::Base.
#
# Raises ETL::ETLError when no configuration exists for +name+.
def establish_connection(name)
  logger.debug "Establishing connection to #{name}"
  settings = ETL::Base.configurations[name.to_s]
  unless settings
    raise ETL::ETLError, "No connection found for #{name}"
  end
  factory = "#{settings['adapter']}_connection"
  ETL::Base.send(factory, settings)
end
207
+ end # class << self
208
+
209
# Print +message+ followed by a newline (only when realtime activity is
# enabled).
def say(message)
  line = message + "\n"
  say_without_newline(line)
end

# Print +message+ exactly as given to STDOUT and flush. Nothing is printed
# unless ETL::Engine.realtime_activity is set.
def say_without_newline(message)
  return unless ETL::Engine.realtime_activity
  $stdout.print message
  $stdout.flush
end

# Print +message+ preceded by a newline and followed by one, so that it
# starts on its own line.
def say_on_own_line(message)
  separated = "\n" + message
  say(separated)
end
226
+
227
# Errors collected while this engine instance executes the ETL process.
# Created as an empty Array on first access.
def errors
  @errors || (@errors = [])
end
231
+
232
# Hash of benchmark totals. Each value is the total time in seconds spent
# in that portion of the ETL pipeline; every value starts at 0. Keys:
# * <tt>:transforms</tt>
# * <tt>:after_reads</tt>
# * <tt>:before_writes</tt>
# * <tt>:writes</tt>
def benchmarks
  @benchmarks ||= [:transforms, :after_reads, :before_writes, :writes].inject({}) do |totals, stage|
    totals[stage] = 0
    totals
  end
end
247
+
248
# Process a file, control object or batch object. Acceptable values for
# file are:
# * Path to a file (opened, processed as a File, then closed)
# * File object whose path ends in ".ctl" (control file) or ".ebf"
#   (batch file)
# * ETL::Control::Control instance
# * ETL::Batch::Batch instance
#
# Raises RuntimeError for any other object, and for a File whose path has
# neither of the two supported extensions.
def process(file)
  case file
  when String
    # The block form closes the handle once processing has finished.
    File.open(file) { |handle| process(handle) }
  when File
    # Literal suffix comparison: an unescaped "." in a regexp would also
    # accept names such as "jobctl".
    if file.path.end_with?('.ctl')
      process_control(file)
    elsif file.path.end_with?('.ebf')
      process_batch(file)
    else
      raise RuntimeError, "Process file must be a .ctl control file or a .ebf batch file: #{file.path}"
    end
  when ETL::Control::Control
    process_control(file)
  when ETL::Batch::Batch
    process_batch(file)
  else
    raise RuntimeError, "Process object must be a String, File, Control instance or Batch instance"
  end
end
270
+
271
+ protected
272
# Process the specified batch file. Resolves +batch+ into a batch object,
# records an ETL::Execution::Batch with status 'executing', executes the
# batch, then stamps the record with its completion time and a final
# status ('completed', or 'completed with errors' when this engine
# collected any errors) and saves it.
def process_batch(batch)
  batch = ETL::Batch::Batch.resolve(batch, self)
  say "Processing batch #{batch.file}"

  ETL::Engine.batch = ETL::Execution::Batch.create!(
    :status => 'executing',
    :batch_file => batch.file
  )

  batch.execute

  outcome = errors.empty? ? 'completed' : 'completed with errors'
  ETL::Engine.batch.completed_at = Time.now
  ETL::Engine.batch.status = outcome
  ETL::Engine.batch.save!
end
288
+
289
+ # Process the specified control file
290
def process_control(control)
  # Runs one control file end to end: records an ETL::Execution::Job,
  # executes dependencies and pre-processors, pushes every source row
  # through after-read processors, transforms, before-write processors and
  # the destinations, closes the destinations, runs screens and
  # post-processors, reports statistics and finally marks the job as
  # completed.
  #
  # NOTE(review): the +break+/+next+ statements inside the
  # Benchmark.realtime blocks below are lexically inside those blocks, so
  # they leave only the benchmark block (+break+ makes Benchmark.realtime
  # return nil, hence the +unless t.nil?+ guards) and not the row loop. The
  # error threshold is enforced after each source has been iterated. This
  # flow is preserved as is; confirm whether it is intended before
  # changing it.
  #
  # NOTE(review): the early +return+ paths (threshold exceeded, screen
  # error) skip the final job status update.
  control = ETL::Control::Control.resolve(control)
  say_on_own_line "Processing control #{control.file}"

  ETL::Engine.job = ETL::Execution::Job.create!(
    :control_file => control.file,
    :status => 'executing',
    :batch_id => ETL::Engine.batch ? ETL::Engine.batch.id : nil
  )

  execute_dependencies(control)

  start_time = Time.now
  pre_process(control)
  sources = control.sources
  destinations = control.destinations

  say "Skipping bulk import" if Engine.skip_bulk_import

  sources.each do |source|
    Engine.current_source = source
    Engine.logger.debug "Processing source #{source.inspect}"
    say "Source: #{source}"
    say "Limiting enabled: #{Engine.limit}" unless Engine.limit.nil?
    say "Offset enabled: #{Engine.offset}" unless Engine.offset.nil?
    source.each_with_index do |row, index|
      # Break out of the row loop if the +Engine.limit+ is specified and
      # the number of rows read has reached that value.
      if !Engine.limit.nil? && Engine.rows_read >= Engine.limit
        puts "Reached limit of #{Engine.limit}"
        break
      end

      Engine.logger.debug "Row #{index}: #{row.inspect}"
      Engine.rows_read += 1
      Engine.current_source_row = index + 1
      say_without_newline "." if Engine.realtime_activity && index > 0 && index % 1000 == 0

      # At this point a single row may be turned into multiple rows via row
      # processors; all code after this line should work with the array of
      # rows rather than the single row
      rows = [row]

      t = Benchmark.realtime do
        begin
          Engine.logger.debug "Processing after read"
          control.after_read_processors.each do |processor|
            processed_rows = []
            rows.each do |current_row|
              processed_rows << processor.process(current_row)
            end
            rows = processed_rows.flatten
          end
        rescue => e
          msg = "Error processing rows after read from #{Engine.current_source} on line #{Engine.current_source_row}: #{e}"
          errors << msg
          Engine.logger.error(msg)
          exceeded_error_threshold?(control) ? break : next
        end
      end
      benchmarks[:after_reads] += t unless t.nil?

      t = Benchmark.realtime do
        begin
          Engine.logger.debug "Executing transforms"
          rows.each do |current_row|
            control.transforms.each do |transform|
              name = transform.name.to_sym
              current_row[name] = transform.transform(name, current_row[name], current_row)
            end
          end
        rescue ResolverError => e
          Engine.logger.error(e.message)
          errors << e.message
        rescue => e
          msg = "Error transforming from #{Engine.current_source} on line #{Engine.current_source_row}: #{e}"
          errors << msg
          Engine.logger.error(msg)
          e.backtrace.each { |line| Engine.logger.error(line) }
        ensure
          begin
            exceeded_error_threshold?(control) ? break : next
          rescue => inner_error
            puts inner_error
          end
        end
      end
      benchmarks[:transforms] += t unless t.nil?

      t = Benchmark.realtime do
        begin
          # execute row-level "before write" processing
          Engine.logger.debug "Processing before write"
          control.before_write_processors.each do |processor|
            processed_rows = []
            rows.each { |current_row| processed_rows << processor.process(current_row) }
            rows = processed_rows.flatten.compact
          end
        rescue => e
          msg = "Error processing rows before write from #{Engine.current_source} on line #{Engine.current_source_row}: #{e}"
          errors << msg
          Engine.logger.error(msg)
          e.backtrace.each { |line| Engine.logger.error(line) }
          exceeded_error_threshold?(control) ? break : next
        end
      end
      benchmarks[:before_writes] += t unless t.nil?

      t = Benchmark.realtime do
        begin
          # write the row to the destination
          destinations.each_with_index do |destination, destination_index|
            Engine.current_destination = destination
            rows.each do |current_row|
              destination.write(current_row)
              # Count each row once, no matter how many destinations exist
              Engine.rows_written += 1 if destination_index == 0
            end
          end
        rescue => e
          msg = "Error writing to #{Engine.current_destination}: #{e}"
          errors << msg
          Engine.logger.error msg
          e.backtrace.each { |line| Engine.logger.error(line) }
          exceeded_error_threshold?(control) ? break : next
        end
      end
      benchmarks[:writes] += t unless t.nil?
    end

    if exceeded_error_threshold?(control)
      say_on_own_line "Exiting due to exceeding error threshold: #{control.error_threshold}"
      return
    end

  end

  destinations.each do |destination|
    destination.close
  end

  say_on_own_line "Executing before post-process screens"
  begin
    execute_screens(control)
  rescue FatalScreenError => e
    say "Fatal screen error during job execution: #{e.message}"
    exit
  rescue ScreenError => e
    say "Screen error during job execution: #{e.message}"
    return
  else
    say "Screens passed"
  end

  post_process(control)

  if sources.length > 0
    say_on_own_line "Read #{Engine.rows_read} lines from sources"
  end
  if destinations.length > 0
    say "Wrote #{Engine.rows_written} lines to destinations"
  end

  say_on_own_line "Executing after post-process screens"
  begin
    execute_screens(control, :after_post_process)
  rescue FatalScreenError => e
    say "Fatal screen error during job execution: #{e.message}"
    exit
  rescue ScreenError => e
    say "Screen error during job execution: #{e.message}"
    return
  else
    say "Screens passed"
  end

  say_on_own_line "Completed #{control.file} in #{distance_of_time_in_words(start_time)} with #{errors.length} errors."
  say "Processing average: #{Engine.average_rows_per_second} rows/sec"

  say "Avg after_reads: #{Engine.rows_read/benchmarks[:after_reads]} rows/sec" if benchmarks[:after_reads] > 0
  say "Avg before_writes: #{Engine.rows_read/benchmarks[:before_writes]} rows/sec" if benchmarks[:before_writes] > 0
  say "Avg transforms: #{Engine.rows_read/benchmarks[:transforms]} rows/sec" if benchmarks[:transforms] > 0
  say "Avg writes: #{Engine.rows_read/benchmarks[:writes]} rows/sec" if benchmarks[:writes] > 0

  # say "Avg time writing execution records: #{ETL::Execution::Record.average_time_spent}"
  #
  # ETL::Transform::Transform.benchmarks.each do |klass, t|
  #   say "Avg #{klass}: #{Engine.rows_read/t} rows/sec"
  # end

  ETL::Engine.job.completed_at = Time.now
  ETL::Engine.job.status = (errors.length > 0 ? 'completed with errors' : 'completed')
  ETL::Engine.job.save!
end
483
+
484
+ private
485
# True when more errors have been collected than the control's
# +error_threshold+ allows; collecting exactly the threshold is still
# acceptable.
def exceeded_error_threshold?(control)
  error_count = errors.length
  error_count > control.error_threshold
end
489
+
490
# Run every pre-processor declared by +control+, in order, logging at
# debug level before and after.
def pre_process(control)
  log = Engine.logger
  log.debug "Pre-processing #{control.file}"
  control.pre_processors.each { |step| step.process }
  Engine.logger.debug "Pre-processing complete"
end
498
+
499
# Run every post-processor declared by +control+, in order. Progress is
# announced through +say+ and logged at debug level, both before and after
# the processors run.
def post_process(control)
  say_on_own_line "Executing post processes"
  Engine.logger.debug "Post-processing #{control.file}"
  control.post_processors.each { |step| step.process }
  completion_note = "Post-processing complete"
  Engine.logger.debug completion_note
  say "Post-processing complete"
end
509
+
510
# Execute all dependencies of +control+ before the control itself runs.
# The dependency list is flattened first. A Symbol dependency names a
# control file ("<symbol>.ctl"); a String dependency is used as given.
# Each one is handed to +process+.
#
# Raises RuntimeError for a dependency that is neither a Symbol nor a
# String.
def execute_dependencies(control)
  Engine.logger.debug "Executing dependencies"
  control.dependencies.flatten.each do |dependency|
    target = case dependency
    when Symbol
      dependency.to_s + '.ctl'
    when String
      dependency
    else
      raise "Invalid dependency type: #{dependency.class}"
    end
    # Report the file actually being processed for both dependency kinds.
    Engine.logger.debug "Executing dependency: #{target}"
    say "Executing dependency: #{target}"
    process(target)
  end
end
529
+
530
# Execute the screens of +control+. With +timing+ set to
# :after_post_process the control's after-post-process screens are used;
# any other value selects the regular (before post-process) screens.
#
# Screens run grouped by severity in the order :fatal, :error, :warn. An
# exception raised by a :fatal screen is re-raised as FatalScreenError and
# one raised by an :error screen as ScreenError; an exception from a :warn
# screen is only reported through +say+.
def execute_screens(control, timing = :before_post_process)
  screens = if timing == :after_post_process
    control.after_post_process_screens
  else # default to before post-process screens
    control.screens
  end

  [:fatal, :error, :warn].each do |severity|
    screens[severity].each do |screen|
      begin
        screen.call
      rescue => e
        raise FatalScreenError, e if severity == :fatal
        raise ScreenError, e if severity == :error
        say "Screen warning: #{e}"
      end
    end
  end
end
555
+ end
556
+ end
@@ -0,0 +1,9 @@
1
module ETL #:nodoc:
  module Execution #:nodoc:
    # Abstract parent of the models that store ETL execution information.
    # It binds those models to the :etl_execution connection.
    class Base < ActiveRecord::Base
      # Abstract: this class has no table of its own
      self.abstract_class = true
      establish_connection :etl_execution
    end
  end
end
@@ -0,0 +1,8 @@
1
module ETL #:nodoc:
  module Execution #:nodoc:
    # Persistent record of one ETL batch run; a batch owns the jobs that
    # ran as part of it
    class Batch < Base
      has_many :jobs
    end
  end
end
@@ -0,0 +1,8 @@
1
module ETL #:nodoc:
  module Execution #:nodoc:
    # Persistent record of one ETL job run; a job may belong to a batch
    class Job < Base
      belongs_to :batch
    end
  end
end
@@ -0,0 +1,85 @@
1
module ETL #:nodoc:
  module Execution #:nodoc:
    # Maintains the tables in which the ETL engine persists its execution
    # meta data. Each schema step is a private +migration_N+ method;
    # +migrate+ applies the steps that have not been run yet.
    class Migration
      class << self
        protected
        # Name of the table that records the applied schema versions
        def schema_info_table_name
          ActiveRecord::Migrator.schema_migrations_table_name
        end
        alias :schema_migrations_table_name :schema_info_table_name

        public
        # Bring the schema up to +target+: runs every migration above the
        # last recorded version, in ascending order, recording each version
        # once its migration has run.
        def migrate
          connection.initialize_schema_migrations_table
          (last_migration + 1).upto(target) do |version|
            __send__("migration_#{version}".to_sym)
            connection.assume_migrated_upto_version(version)
          end
        end

        protected
        # Highest version recorded in the schema table, or 0 when the table
        # holds no versions
        def last_migration
          recorded = connection.select_values(
            "SELECT version FROM #{schema_migrations_table_name}"
          )
          recorded.map(&:to_i).max || 0
        end

        # The connection used during migration (that of
        # ETL::Execution::Base), looked up once and then reused
        def connection
          @connection ||= ETL::Execution::Base.connection
        end

        # The final target version number
        def target
          4
        end

        private
        # Version 1: the jobs and records tables
        def migration_1 #:nodoc:
          connection.create_table :jobs do |t|
            t.column :control_file, :string, :null => false
            t.column :created_at, :datetime, :null => false
            t.column :completed_at, :datetime
            t.column :status, :string
          end
          connection.create_table :records do |t|
            t.column :control_file, :string, :null => false
            t.column :natural_key, :string, :null => false
            t.column :crc, :string, :null => false
            t.column :job_id, :integer, :null => false
          end
        end

        # Version 2: indexes on the records table
        def migration_2 #:nodoc:
          [:control_file, :natural_key, :job_id].each do |column|
            connection.add_index :records, column
          end
        end

        # Version 3: the batches table and the jobs -> batches reference
        def migration_3 #:nodoc:
          connection.create_table :batches do |t|
            t.column :batch_file, :string, :null => false
            t.column :created_at, :datetime, :null => false
            t.column :completed_at, :datetime
            t.column :status, :string
          end
          connection.add_column :jobs, :batch_id, :integer
          connection.add_index :jobs, :batch_id
        end

        # Version 4: the records table is dropped
        def migration_4 #:nodoc:
          connection.drop_table :records
        end

        # Update the schema info table, setting the version value.
        # NOTE(review): nothing in this class calls this method.
        def update_schema_info(version)
          statement = "UPDATE #{schema_info_table_name} SET version = #{version}"
          connection.update(statement)
        end
      end
    end
  end
end
@@ -0,0 +1,19 @@
1
module ETL #:nodoc:
  # Classes which store information about ETL execution
  module Execution
    # Execution management
    class Execution
      # Migrate the execution data store by delegating to
      # ETL::Execution::Migration
      def self.migrate
        ETL::Execution::Migration.migrate
      end
    end
  end
end
15
+
16
+ require 'etl/execution/base'
17
+ require 'etl/execution/batch'
18
+ require 'etl/execution/job'
19
+ require 'etl/execution/migration'