factorylabs-activewarehouse-etl 0.9.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84) hide show
  1. data/CHANGELOG +198 -0
  2. data/LICENSE +7 -0
  3. data/README +85 -0
  4. data/Rakefile +153 -0
  5. data/TODO +28 -0
  6. data/bin/etl +28 -0
  7. data/bin/etl.cmd +8 -0
  8. data/examples/database.example.yml +16 -0
  9. data/lib/etl.rb +78 -0
  10. data/lib/etl/batch.rb +2 -0
  11. data/lib/etl/batch/batch.rb +111 -0
  12. data/lib/etl/batch/directives.rb +55 -0
  13. data/lib/etl/builder.rb +2 -0
  14. data/lib/etl/builder/date_dimension_builder.rb +96 -0
  15. data/lib/etl/builder/time_dimension_builder.rb +31 -0
  16. data/lib/etl/commands/etl.rb +89 -0
  17. data/lib/etl/control.rb +3 -0
  18. data/lib/etl/control/control.rb +405 -0
  19. data/lib/etl/control/destination.rb +420 -0
  20. data/lib/etl/control/destination/database_destination.rb +95 -0
  21. data/lib/etl/control/destination/file_destination.rb +124 -0
  22. data/lib/etl/control/source.rb +109 -0
  23. data/lib/etl/control/source/database_source.rb +220 -0
  24. data/lib/etl/control/source/enumerable_source.rb +11 -0
  25. data/lib/etl/control/source/file_source.rb +90 -0
  26. data/lib/etl/control/source/model_source.rb +39 -0
  27. data/lib/etl/core_ext.rb +1 -0
  28. data/lib/etl/core_ext/time.rb +5 -0
  29. data/lib/etl/core_ext/time/calculations.rb +42 -0
  30. data/lib/etl/engine.rb +556 -0
  31. data/lib/etl/execution.rb +20 -0
  32. data/lib/etl/execution/base.rb +9 -0
  33. data/lib/etl/execution/batch.rb +8 -0
  34. data/lib/etl/execution/job.rb +8 -0
  35. data/lib/etl/execution/migration.rb +85 -0
  36. data/lib/etl/execution/record.rb +18 -0
  37. data/lib/etl/generator.rb +2 -0
  38. data/lib/etl/generator/generator.rb +20 -0
  39. data/lib/etl/generator/surrogate_key_generator.rb +39 -0
  40. data/lib/etl/http_tools.rb +139 -0
  41. data/lib/etl/parser.rb +11 -0
  42. data/lib/etl/parser/apache_combined_log_parser.rb +49 -0
  43. data/lib/etl/parser/delimited_parser.rb +74 -0
  44. data/lib/etl/parser/fixed_width_parser.rb +65 -0
  45. data/lib/etl/parser/parser.rb +41 -0
  46. data/lib/etl/parser/sax_parser.rb +218 -0
  47. data/lib/etl/parser/xml_parser.rb +65 -0
  48. data/lib/etl/processor.rb +11 -0
  49. data/lib/etl/processor/block_processor.rb +14 -0
  50. data/lib/etl/processor/bulk_import_processor.rb +81 -0
  51. data/lib/etl/processor/check_exist_processor.rb +80 -0
  52. data/lib/etl/processor/check_unique_processor.rb +35 -0
  53. data/lib/etl/processor/copy_field_processor.rb +26 -0
  54. data/lib/etl/processor/encode_processor.rb +55 -0
  55. data/lib/etl/processor/hierarchy_exploder_processor.rb +55 -0
  56. data/lib/etl/processor/print_row_processor.rb +12 -0
  57. data/lib/etl/processor/processor.rb +25 -0
  58. data/lib/etl/processor/rename_processor.rb +24 -0
  59. data/lib/etl/processor/require_non_blank_processor.rb +26 -0
  60. data/lib/etl/processor/row_processor.rb +17 -0
  61. data/lib/etl/processor/sequence_processor.rb +23 -0
  62. data/lib/etl/processor/surrogate_key_processor.rb +53 -0
  63. data/lib/etl/processor/truncate_processor.rb +35 -0
  64. data/lib/etl/row.rb +20 -0
  65. data/lib/etl/screen.rb +14 -0
  66. data/lib/etl/screen/row_count_screen.rb +20 -0
  67. data/lib/etl/transform.rb +2 -0
  68. data/lib/etl/transform/block_transform.rb +13 -0
  69. data/lib/etl/transform/date_to_string_transform.rb +20 -0
  70. data/lib/etl/transform/decode_transform.rb +51 -0
  71. data/lib/etl/transform/default_transform.rb +20 -0
  72. data/lib/etl/transform/foreign_key_lookup_transform.rb +151 -0
  73. data/lib/etl/transform/hierarchy_lookup_transform.rb +49 -0
  74. data/lib/etl/transform/ordinalize_transform.rb +12 -0
  75. data/lib/etl/transform/sha1_transform.rb +13 -0
  76. data/lib/etl/transform/string_to_date_transform.rb +16 -0
  77. data/lib/etl/transform/string_to_datetime_transform.rb +14 -0
  78. data/lib/etl/transform/string_to_time_transform.rb +11 -0
  79. data/lib/etl/transform/transform.rb +61 -0
  80. data/lib/etl/transform/trim_transform.rb +26 -0
  81. data/lib/etl/transform/type_transform.rb +35 -0
  82. data/lib/etl/util.rb +59 -0
  83. data/lib/etl/version.rb +9 -0
  84. metadata +195 -0
@@ -0,0 +1,11 @@
1
module ETL #:nodoc:
  module Control #:nodoc:
    # Source whose rows come straight from an enumerable object supplied
    # through the <tt>:enumerable</tt> configuration option.
    class EnumerableSource < ETL::Control::Source
      # Hand every element of the configured enumerable to the given block.
      def each(&block)
        enumerable = configuration[:enumerable]
        enumerable.each(&block)
      end
    end
  end
end
@@ -0,0 +1,90 @@
1
module ETL #:nodoc:
  module Control #:nodoc:
    # A File source: reads rows from one or more files through a parser.
    class FileSource < Source
      # The number of lines to skip, default is 0
      attr_accessor :skip_lines

      # Accessor for the underlying parser
      attr_accessor :parser

      # The source file
      attr_accessor :file

      # Initialize the source
      #
      # Configuration options:
      # * <tt>:file</tt>: The source file
      # * <tt>:parser</tt>: One of the following: a parser name as a String or
      #   symbol, a class which extends from Parser, a Hash with :name and
      #   optionally an :options key. Whether or not the parser uses the
      #   options is dependent on which parser is used. See the documentation
      #   for each parser for information on what options it accepts.
      # * <tt>:skip_lines</tt>: The number of lines to skip (defaults to 0)
      # * <tt>:store_locally</tt>: Set to false to not store a copy of the
      #   source data locally for archival
      def initialize(control, configuration, definition)
        super
        configure
      end

      # Get a String identifier for the source (the configured file)
      def to_s
        file
      end

      # Get the local storage directory: the local base directory joined with
      # the source file's base name, extension removed.
      def local_directory
        File.join(local_base, File.basename(file, File.extname(file)))
      end

      # Yields each row from the source as an ETL::Row whose +source+ is this
      # object. The first ETL::Engine.offset parsed rows are skipped when an
      # offset is set. Sources are copied locally first if +store_locally+ is
      # truthy.
      def each
        count = 0
        copy_sources if store_locally
        @parser.each do |parsed|
          if ETL::Engine.offset && count < ETL::Engine.offset
            count += 1
          else
            row = ETL::Row[parsed]
            row.source = self
            yield row
          end
        end
      end

      private
      # Copy source data to a local directory structure. +file+ is treated as
      # a glob; relative patterns are resolved against the control file's
      # directory. Each copied file gets an empty trigger file beside it.
      def copy_sources
        sequence = 0
        path = Pathname.new(file)
        path = path.absolute? ? path : Pathname.new(File.dirname(control.file)) + path
        Pathname.glob(path).each do |source_path|
          next if source_path.directory?
          lf = local_file(sequence)
          FileUtils.cp(source_path, lf)
          # Create (or truncate) the trigger file; it carries no content.
          File.open(local_file_trigger(lf), 'w') { }
          sequence += 1
        end
      end

      # Configure the source from the configuration Hash.
      #
      # Raises ControlError when :parser is not a Class, String, Symbol or Hash.
      def configure
        @file = configuration[:file]
        parser_config = configuration[:parser]
        case parser_config
        when Class
          @parser = parser_config.new(self)
        when String, Symbol
          @parser = ETL::Parser::Parser.class_for_name(parser_config).new(self)
        when Hash
          name = parser_config[:name]
          options = parser_config[:options]
          @parser = ETL::Parser::Parser.class_for_name(name).new(self, options)
        else
          raise ControlError, "Configuration option :parser must be a Class, String, Symbol or Hash"
        end
        # The default is written back into the configuration, as before.
        @skip_lines = configuration[:skip_lines] ||= 0
      end
    end
  end
end
@@ -0,0 +1,39 @@
1
+ #RAILS_ENV = 'development'
2
+ #require '../config/environment'
3
+
4
module ETL #:nodoc:
  module Control #:nodoc:
    # Source that reads its rows from the model named by the
    # <tt>:model</tt> configuration option.
    class ModelSource < Source

      # Column names, as symbols, taken from the definition (an Array of
      # names or a Hash keyed by name).
      def columns
        return definition.collect(&:to_sym) if definition.is_a?(Array)
        return definition.keys.collect(&:to_sym) if definition.is_a?(Hash)
        raise "Definition must be either an Array or a Hash"
      end

      # The configured model identifier.
      def railsmodel
        configuration[:model]
      end

      # The ordering used when loading records; falls back to "id".
      def order
        configuration[:order] || "id"
      end

      # Load every record of the model and yield one ETL::Row per record,
      # populated with the configured columns.
      def each(&block)
        model_class = railsmodel.to_s.camelize.constantize
        model_class.find(:all, :order => order).each do |record|
          result_row = ETL::Row.new
          result_row.source = self
          columns.each { |column| result_row[column.to_sym] = record.send(column) }
          yield result_row
        end
      end
    end
  end
end
@@ -0,0 +1 @@
1
+ require 'etl/core_ext/time'
@@ -0,0 +1,5 @@
1
+ require File.dirname(__FILE__) + '/time/calculations'
2
+
3
+ class Time#:nodoc:
4
+ include ETL::CoreExtensions::Time::Calculations
5
+ end
@@ -0,0 +1,42 @@
1
+ #Updated by Jack Hong on 04/05/08
2
+
3
module ETL #:nodoc:
  module CoreExtensions #:nodoc:
    module Time #:nodoc:
      # Enables the use of time calculations within Time itself.
      #
      # Every fiscal_* method takes +offset_month+, the calendar month
      # (1-12) in which the fiscal year begins; it defaults to 10 (October).
      module Calculations
        # Week of the calendar year (1-52). The trailing partial week 53 is
        # folded into week 52.
        def week
          cyw = ((yday - 1) / 7) + 1
          cyw = 52 if cyw == 53
          cyw
        end

        # Calendar quarter (1-4).
        def quarter
          ((month - 1) / 3) + 1
        end

        # Week of the fiscal year (1-52); week 53 is folded into week 52.
        def fiscal_year_week(offset_month=10)
          fyw = ((fiscal_year_yday(offset_month) - 1) / 7) + 1
          fyw = 52 if fyw == 53
          fyw
        end

        # Month of the fiscal year (1-12), where 1 is +offset_month+.
        def fiscal_year_month(offset_month=10)
          shifted_month = month - (offset_month - 1)
          shifted_month += 12 if shifted_month <= 0
          shifted_month
        end

        # Quarter of the fiscal year (1-4).
        def fiscal_year_quarter(offset_month=10)
          ((fiscal_year_month(offset_month) - 1) / 3) + 1
        end

        # Fiscal year number: the following calendar year once
        # +offset_month+ has been reached, otherwise the current year.
        def fiscal_year(offset_month=10)
          month >= offset_month ? year + 1 : year
        end

        # Day of the fiscal year, where day 1 is the first day of
        # +offset_month+.
        #
        # The count is taken from the actual start of the fiscal year, so
        # leap days are handled correctly whether they fall before or after
        # the fiscal year boundary (a fixed 365-day wrap is off by one then).
        def fiscal_year_yday(offset_month=10)
          # Calendar year in which the current fiscal year began.
          start_year = month >= offset_month ? year : year - 1
          fiscal_start_yday = ::Time.utc(start_year, offset_month, 1).yday
          return yday - fiscal_start_yday + 1 if start_year == year

          # The fiscal year began last calendar year: count the days left in
          # that year from the fiscal start, then add the days elapsed in
          # this one.
          days_in_start_year = ::Time.utc(start_year, 12, 31).yday
          (days_in_start_year - fiscal_start_yday + 1) + yday
        end
      end
    end
  end
end
data/lib/etl/engine.rb ADDED
@@ -0,0 +1,556 @@
1
+ module ETL #:nodoc:
2
+
3
+ class Base < ActiveRecord::Base
4
+ end
5
+
6
+ # The main ETL engine class
7
+ class Engine
8
+ include ETL::Util
9
+
10
+ class << self
11
+ # Initialization that is run when a job is executed.
12
+ #
13
+ # Options:
14
+ # * <tt>:limit</tt>: Limit the number of records returned from sources
15
+ # * <tt>:offset</tt>: Offset at which to begin reading records from sources
16
+ # * <tt>:newlog</tt>: If true then the log is overwritten, otherwise it is appended to
17
+ # * <tt>:skip_bulk_import</tt>: Set to true to skip bulk import
18
+ # * <tt>:read_locally</tt>: Set to true to read from the local cache
19
+ # * <tt>:rails_root</tt>: Set to the rails root to boot rails
20
def init(options={})
  # One-time setup; later calls are no-ops.
  return if @initialized

  puts "initializing ETL engine\n\n"
  @limit = options[:limit]
  @offset = options[:offset]
  # The log is appended to unless :newlog asks for a fresh file.
  @log_write_mode = 'w' if options[:newlog]
  @skip_bulk_import = options[:skip_bulk_import]
  @read_locally = options[:read_locally]
  @rails_root = options[:rails_root]

  # Boot the Rails environment when a rails root was supplied.
  require File.join(@rails_root, 'config/environment') if @rails_root

  # Locate the database configuration: the given (or default) file if it
  # exists, otherwise config/database.yml.
  options[:config] ||= 'database.yml'
  options[:config] = 'config/database.yml' unless File.exist?(options[:config])
  rendered_yaml = ERB.new(IO.read(options[:config])).result + "\n"
  database_configuration = YAML::load(rendered_yaml)
  ActiveRecord::Base.configurations.merge!(database_configuration)
  ETL::Base.configurations = database_configuration

  # Connect the execution-tracking models and bring their schema up to date.
  require 'etl/execution'
  ETL::Execution::Base.establish_connection :etl_execution
  ETL::Execution::Execution.migrate

  @initialized = true
end
45
+
46
+ # Process the specified file. Acceptable values for file are:
47
+ # * Path to a file
48
+ # * File object
49
+ # * ETL::Control::Control instance
50
+ # * ETL::Batch::Batch instance
51
+ #
52
+ # The process command will accept either a .ctl Control file or a .ebf
53
+ # ETL Batch File.
54
# Build a fresh engine instance and hand +file+ to its #process method.
def process(file)
  engine = new
  engine.process(file)
end
57
+
58
+ attr_accessor :timestamped_log
59
+
60
+ # Accessor for the log write mode. Default is 'a' for append.
61
+ attr_accessor :log_write_mode
62
# Current log write mode; falls back to (and remembers) 'a' when unset.
def log_write_mode
  @log_write_mode = 'a' unless @log_write_mode
  @log_write_mode
end
65
+
66
+ # A logger for the engine
67
+ attr_accessor :logger
68
+
69
# Lazily build the engine logger: a timestamped log file when
# +timestamped_log+ is set, otherwise 'etl.log' opened with the current
# log write mode. The logger is created at WARN level.
def logger #:nodoc:
  return @logger if @logger

  target = timestamped_log ? "etl_#{timestamp}.log" : File.open('etl.log', log_write_mode)
  @logger = Logger.new(target)
  @logger.level = Logger::WARN
  @logger.formatter = Logger::Formatter.new
  @logger
end
81
+
82
+ # Get a timestamp value as a string
83
# Current time rendered as a 14-digit YYYYMMDDHHMMSS string.
def timestamp
  now = Time.now
  now.strftime("%Y%m%d%H%M%S")
end
86
+
87
+ # The current source
88
+ attr_accessor :current_source
89
+
90
+ # The current source row
91
+ attr_accessor :current_source_row
92
+
93
+ # The current destination
94
+ attr_accessor :current_destination
95
+
96
+ # Set to true to activate realtime activity. This will cause certain
97
+ # information messages to be printed to STDOUT
98
+ attr_accessor :realtime_activity
99
+
100
+ # Accessor for the total number of rows read from sources
101
+ attr_accessor :rows_read
102
# Running count of rows read; starts at 0 on first access.
def rows_read
  @rows_read = 0 unless @rows_read
  @rows_read
end
105
+
106
+ # Accessor for the total number of rows processed
107
+ attr_accessor :rows_written
108
# Running count of rows written; starts at 0 on first access.
def rows_written
  @rows_written = 0 unless @rows_written
  @rows_written
end
111
+
112
+ # Access the current ETL::Execution::Job instance
113
+ attr_accessor :job
114
+
115
+ # Access the current ETL::Execution::Batch instance
116
+ attr_accessor :batch
117
+
118
+ # The limit on rows to load from the source, useful for testing the ETL
119
+ # process prior to executing the entire batch. Default value is nil and
120
+ # indicates that there is no limit
121
+ attr_accessor :limit
122
+
123
+ # The offset for the source to begin at, useful for testing the ETL
124
+ # process prior to executing the entire batch. Default value is nil and
125
+ # indicates that there is no offset
126
+ attr_accessor :offset
127
+
128
+ # Set to true to skip all bulk importing
129
+ attr_accessor :skip_bulk_import
130
+
131
+ # Set to true to read locally from the last source cache files
132
+ attr_accessor :read_locally
133
+
134
+ # Accessor for the average rows per second processed
135
+ attr_accessor :average_rows_per_second
136
+
137
+ # Get a named connection
138
# Fetch the named connection, establishing and caching it on first use,
# and reconnect it if it is no longer active.
def connection(name)
  logger.debug "Retrieving connection #{name}"
  registry = connections
  registry[name] = establish_connection(name) unless registry[name]
  conn = registry[name]
  #conn.verify!(ActiveRecord::Base.verification_timeout)
  conn.reconnect! unless conn.active?
  conn
end
145
+
146
+ # Set to true to use temp tables
147
+ attr_accessor :use_temp_tables
148
+
149
+ # Get a registry of temp tables
150
# Registry of temp tables, created empty on first access.
def temp_tables
  @temp_tables = {} unless @temp_tables
  @temp_tables
end
153
+
154
+ # Called when a batch job finishes, allowing for cleanup to occur
155
# Swap every registered temp table into place: inside a transaction the
# live table is renamed aside, the temp table takes its name, and the
# renamed original is dropped.
def finish
  temp_tables.each_pair do |staging_table, details|
    live_table = details[:table]
    retired_table = "#{live_table}_old"
    db = details[:connection]
    db.transaction do
      db.rename_table(live_table, retired_table)
      db.rename_table(staging_table, live_table)
      db.drop_table(retired_table)
    end
  end
end
167
+
168
+ # Return true if using temp tables
169
# Strict boolean view of the +use_temp_tables+ setting.
def use_temp_tables?
  !!use_temp_tables
end
172
+
173
+ # Modify the table name if necessary
174
# Modify the table name if necessary.
#
# Without temp tables the name is returned unchanged. With temp tables the
# name "tmp_<table_name>" is returned; the first time a given temp table is
# requested it is (re)created as a copy of the real table and registered in
# +temp_tables+ along with the connection used.
#
# Written without ActiveSupport's Object#returning, which was deprecated
# and later removed.
def table(table_name, connection)
  return table_name unless use_temp_tables?

  temp_table_name = "tmp_#{table_name}"
  if temp_tables[temp_table_name].nil?
    # Create the temp table and add it to the mapping. A leftover temp
    # table may or may not exist, so a failing drop is deliberately ignored.
    begin
      connection.drop_table(temp_table_name)
    rescue StandardError
      # nothing to drop
    end
    connection.copy_table(table_name, temp_table_name)
    temp_tables[temp_table_name] = {
      :table => table_name,
      :connection => connection
    }
  end
  temp_table_name
end
191
+
192
+ protected
193
+ # Hash of database connections that can be used throughout the ETL
194
+ # process
195
# Cache of established database connections, created empty on first access.
def connections
  @connections = {} unless @connections
  @connections
end
198
+
199
+ # Establish the named connection and return the database specific connection
200
# Look up the named entry in ETL::Base.configurations and open a
# connection through the adapter-specific "<adapter>_connection" method.
# Raises ETL::ETLError when no configuration exists for +name+.
def establish_connection(name)
  logger.debug "Establishing connection to #{name}"
  settings = ETL::Base.configurations[name.to_s]
  raise ETL::ETLError, "No connection found for #{name}" if !settings
  ETL::Base.send("#{settings['adapter']}_connection", settings)
end
207
+ end # class << self
208
+
209
+ # Say the specified message, with a newline
210
# Emit +message+ followed by a newline.
def say(message)
  line = message + "\n"
  say_without_newline(line)
end
213
+
214
+ # Say the specified message without a newline
215
# Print +message+ to STDOUT, flushing immediately, but only while
# ETL::Engine.realtime_activity is enabled.
def say_without_newline(message)
  return unless ETL::Engine.realtime_activity

  $stdout.print message
  $stdout.flush
end
221
+
222
+ # Say the message on its own line
223
# Emit +message+ preceded by a newline so it starts on a fresh line.
def say_on_own_line(message)
  prefixed = "\n" + message
  say(prefixed)
end
226
+
227
+ # Array of errors encountered during execution of the ETL process
228
# Errors collected during execution, created empty on first access.
def errors
  @errors = [] unless @errors
  @errors
end
231
+
232
+ # Get a Hash of benchmark values where each value represents the total
233
+ # amount of time in seconds spent processing in that portion of the ETL
234
+ # pipeline. Keys include:
235
+ # * <tt>:transforms</tt>
236
+ # * <tt>:after_reads</tt>
237
+ # * <tt>:before_writes</tt>
238
+ # * <tt>:writes</tt>
239
# Per-stage timing totals, all starting at zero on first access.
def benchmarks
  return @benchmarks if @benchmarks

  @benchmarks = { :transforms => 0, :after_reads => 0, :before_writes => 0, :writes => 0 }
end
247
+
248
+ # Process a file, control object or batch object. Acceptable values for
249
+ # file are:
250
+ # * Path to a file
251
+ # * File object
252
+ # * ETL::Control::Control instance
253
+ # * ETL::Batch::Batch instance
254
# Process a file, control object or batch object.
#
# +file+ may be a path String, a File, an ETL::Control::Control or an
# ETL::Batch::Batch. A File is dispatched by extension: ".ctl" is processed
# as a control file and ".ebf" as a batch file; any other extension is
# ignored. Raises RuntimeError for any other kind of object.
def process(file)
  case file
  when String
    process(File.new(file))
  when File
    # Match the literal extension at the very end of the path; an unescaped
    # dot would also accept names such as "jobs_ctl".
    process_control(file) if file.path =~ /\.ctl\z/
    process_batch(file) if file.path =~ /\.ebf\z/
  when ETL::Control::Control
    process_control(file)
  when ETL::Batch::Batch
    process_batch(file)
  else
    raise RuntimeError, "Process object must be a String, File, Control instance or Batch instance"
  end
end
270
+
271
+ protected
272
+ # Process the specified batch file
273
# Resolve and execute a batch, recording an ETL::Execution::Batch row that
# is marked 'executing' up front and 'completed' (or 'completed with
# errors') once execution returns.
def process_batch(batch)
  batch = ETL::Batch::Batch.resolve(batch, self)
  say "Processing batch #{batch.file}"

  ETL::Engine.batch = ETL::Execution::Batch.create!(:batch_file => batch.file, :status => 'executing')

  batch.execute

  record = ETL::Engine.batch
  record.completed_at = Time.now
  record.status = errors.empty? ? 'completed' : 'completed with errors'
  record.save!
end
288
+
289
+ # Process the specified control file
290
# Process the specified control file.
#
# Records an ETL::Execution::Job, runs dependencies and pre-processors, then
# for every row of every source runs the after-read processors, transforms,
# before-write processors and destination writes, timing each stage into
# +benchmarks+. Afterwards destinations are closed, screens and
# post-processors run, and the job is marked completed.
#
# NOTE(review): each stage runs inside a Benchmark.realtime block. A +next+
# there only leaves that block, and a +break+ terminates Benchmark.realtime
# (which is why +t+ can be nil); neither skips the row nor leaves the row
# loop, so later stages still run for the row. Confirm whether that is the
# intended error handling before changing it.
#
# NOTE(review): the early +return+ paths (error threshold exceeded, screen
# errors) leave the job status as 'executing' and skip closing destinations.
def process_control(control)
  control = ETL::Control::Control.resolve(control)
  say_on_own_line "Processing control #{control.file}"

  ETL::Engine.job = ETL::Execution::Job.create!(
    :control_file => control.file,
    :status => 'executing',
    :batch_id => ETL::Engine.batch ? ETL::Engine.batch.id : nil
  )

  execute_dependencies(control)

  start_time = Time.now
  pre_process(control)
  sources = control.sources
  destinations = control.destinations

  say "Skipping bulk import" if Engine.skip_bulk_import

  sources.each do |source|
    Engine.current_source = source
    Engine.logger.debug "Processing source #{source}"
    say "Source: #{source}"
    say "Limiting enabled: #{Engine.limit}" if Engine.limit != nil
    say "Offset enabled: #{Engine.offset}" if Engine.offset != nil
    source.each_with_index do |row, index|
      # Break out of the row loop if the +Engine.limit+ is specified and
      # the number of rows read exceeds that value.
      if Engine.limit != nil && Engine.rows_read >= Engine.limit
        puts "Reached limit of #{Engine.limit}"
        break
      end

      Engine.logger.debug "Row #{index}: #{row.inspect}"
      Engine.rows_read += 1
      Engine.current_source_row = index + 1
      say_without_newline "." if Engine.realtime_activity && index > 0 && index % 1000 == 0

      # At this point a single row may be turned into multiple rows via row
      # processors all code after this line should work with the array of
      # rows rather than the single row
      rows = [row]

      t = Benchmark.realtime do
        begin
          Engine.logger.debug "Processing after read"
          control.after_read_processors.each do |processor|
            processed_rows = []
            rows.each do |current_row|
              processed_rows << processor.process(current_row)
            end
            rows = processed_rows.flatten
          end
        rescue => e
          msg = "Error processing rows after read from #{Engine.current_source} on line #{Engine.current_source_row}: #{e}"
          errors << msg
          Engine.logger.error(msg)
          exceeded_error_threshold?(control) ? break : next
        end
      end
      benchmarks[:after_reads] += t unless t.nil?

      t = Benchmark.realtime do
        begin
          Engine.logger.debug "Executing transforms"
          rows.each do |current_row|
            control.transforms.each do |transform|
              name = transform.name.to_sym
              current_row[name] = transform.transform(name, current_row[name], current_row)
            end
          end
        rescue ResolverError => e
          Engine.logger.error(e.message)
          errors << e.message
        rescue => e
          msg = "Error transforming from #{Engine.current_source} on line #{Engine.current_source_row}: #{e}"
          errors << msg
          Engine.logger.error(msg)
          e.backtrace.each { |line| Engine.logger.error(line) }
        ensure
          begin
            exceeded_error_threshold?(control) ? break : next
          rescue => inner_error
            puts inner_error
          end
        end
      end
      benchmarks[:transforms] += t unless t.nil?

      t = Benchmark.realtime do
        begin
          # execute row-level "before write" processing
          Engine.logger.debug "Processing before write"
          control.before_write_processors.each do |processor|
            processed_rows = []
            rows.each { |current_row| processed_rows << processor.process(current_row) }
            rows = processed_rows.flatten.compact
          end
        rescue => e
          msg = "Error processing rows before write from #{Engine.current_source} on line #{Engine.current_source_row}: #{e}"
          errors << msg
          Engine.logger.error(msg)
          e.backtrace.each { |line| Engine.logger.error(line) }
          exceeded_error_threshold?(control) ? break : next
        end
      end
      benchmarks[:before_writes] += t unless t.nil?

      t = Benchmark.realtime do
        begin
          # write the row to the destination; rows_written is counted once
          # per row, against the first destination only
          destinations.each_with_index do |destination, destination_index|
            Engine.current_destination = destination
            rows.each do |current_row|
              destination.write(current_row)
              Engine.rows_written += 1 if destination_index == 0
            end
          end
        rescue => e
          msg = "Error writing to #{Engine.current_destination}: #{e}"
          errors << msg
          Engine.logger.error msg
          e.backtrace.each { |line| Engine.logger.error(line) }
          exceeded_error_threshold?(control) ? break : next
        end
      end
      benchmarks[:writes] += t unless t.nil?
    end

    if exceeded_error_threshold?(control)
      say_on_own_line "Exiting due to exceeding error threshold: #{control.error_threshold}"
      return
    end

  end

  destinations.each do |destination|
    destination.close
  end

  say_on_own_line "Executing before post-process screens"
  begin
    execute_screens(control)
  rescue FatalScreenError => e
    say "Fatal screen error during job execution: #{e.message}"
    exit
  rescue ScreenError => e
    say "Screen error during job execution: #{e.message}"
    return
  else
    say "Screens passed"
  end

  post_process(control)

  if sources.length > 0
    say_on_own_line "Read #{Engine.rows_read} lines from sources"
  end
  if destinations.length > 0
    say "Wrote #{Engine.rows_written} lines to destinations"
  end

  say_on_own_line "Executing after post-process screens"
  begin
    execute_screens(control, :after_post_process)
  rescue FatalScreenError => e
    say "Fatal screen error during job execution: #{e.message}"
    exit
  rescue ScreenError => e
    say "Screen error during job execution: #{e.message}"
    return
  else
    say "Screens passed"
  end

  say_on_own_line "Completed #{control.file} in #{distance_of_time_in_words(start_time)} with #{errors.length} errors."
  say "Processing average: #{Engine.average_rows_per_second} rows/sec"

  say "Avg after_reads: #{Engine.rows_read/benchmarks[:after_reads]} rows/sec" if benchmarks[:after_reads] > 0
  say "Avg before_writes: #{Engine.rows_read/benchmarks[:before_writes]} rows/sec" if benchmarks[:before_writes] > 0
  say "Avg transforms: #{Engine.rows_read/benchmarks[:transforms]} rows/sec" if benchmarks[:transforms] > 0
  say "Avg writes: #{Engine.rows_read/benchmarks[:writes]} rows/sec" if benchmarks[:writes] > 0

  say "Avg time writing execution records: #{ETL::Execution::Record.average_time_spent}"

  # ETL::Transform::Transform.benchmarks.each do |klass, t|
  #   say "Avg #{klass}: #{Engine.rows_read/t} rows/sec"
  # end

  ETL::Engine.job.completed_at = Time.now
  ETL::Engine.job.status = (errors.length > 0 ? 'completed with errors' : 'completed')
  ETL::Engine.job.save!
end
483
+
484
+ private
485
+ # Return true if the error threshold is exceeded
486
# True once more errors have been collected than the control's
# +error_threshold+ allows.
def exceeded_error_threshold?(control)
  error_count = errors.length
  error_count > control.error_threshold
end
489
+
490
+ # Execute all preprocessors
491
# Run every pre-processor of the control, in order, logging before and after.
def pre_process(control)
  Engine.logger.debug "Pre-processing #{control.file}"
  control.pre_processors.each(&:process)
  Engine.logger.debug "Pre-processing complete"
end
498
+
499
+ # Execute all postprocessors
500
# Run every post-processor of the control, in order, announcing and
# logging the start and the end.
def post_process(control)
  say_on_own_line "Executing post processes"
  Engine.logger.debug "Post-processing #{control.file}"
  control.post_processors.each(&:process)
  Engine.logger.debug "Post-processing complete"
  say "Post-processing complete"
end
509
+
510
+ # Execute all dependencies
511
# Execute all dependencies of the control before the control itself.
#
# A Symbol dependency names a control file without its extension (".ctl" is
# appended); a String dependency is used as given. Each resolved file is
# logged, announced and handed to #process. Raises RuntimeError for any
# other dependency type.
def execute_dependencies(control)
  Engine.logger.debug "Executing dependencies"
  control.dependencies.flatten.each do |dependency|
    f = case dependency
        when Symbol then dependency.to_s + '.ctl'
        when String then dependency
        else raise "Invalid dependency type: #{dependency.class}"
        end
    # Report the file that is actually processed, for both dependency kinds.
    Engine.logger.debug "Executing dependency: #{f}"
    say "Executing dependency: #{f}"
    process(f)
  end
end
529
+
530
+ # Execute all screens
531
# Run the control's screens for the given timing: the after-post-process
# screens for :after_post_process, the regular screens otherwise.
#
# Screens run in the order fatal, error, warn. A failing fatal screen is
# re-raised as FatalScreenError, a failing error screen as ScreenError, and
# a failing warn screen is only announced.
def execute_screens(control, timing = :before_post_process)
  screens = timing == :after_post_process ? control.after_post_process_screens : control.screens
  [:fatal, :error, :warn].each do |severity|
    screens[severity].each do |screen|
      begin
        screen.call
      rescue => failure
        raise FatalScreenError, failure if severity == :fatal
        raise ScreenError, failure if severity == :error
        say "Screen warning: #{failure}"
      end
    end
  end
end
555
+ end
556
+ end