colincasey-activewarehouse-etl 0.9.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (86) hide show
  1. data/CHANGELOG +198 -0
  2. data/LICENSE +7 -0
  3. data/README +85 -0
  4. data/Rakefile +75 -0
  5. data/TODO +28 -0
  6. data/VERSION.yml +4 -0
  7. data/bin/etl +28 -0
  8. data/bin/etl.cmd +8 -0
  9. data/lib/etl.rb +81 -0
  10. data/lib/etl/batch.rb +2 -0
  11. data/lib/etl/batch/batch.rb +111 -0
  12. data/lib/etl/batch/directives.rb +55 -0
  13. data/lib/etl/builder.rb +2 -0
  14. data/lib/etl/builder/date_dimension_builder.rb +96 -0
  15. data/lib/etl/builder/time_dimension_builder.rb +31 -0
  16. data/lib/etl/commands/etl.rb +89 -0
  17. data/lib/etl/control.rb +3 -0
  18. data/lib/etl/control/control.rb +414 -0
  19. data/lib/etl/control/destination.rb +420 -0
  20. data/lib/etl/control/destination/csv_destination.rb +84 -0
  21. data/lib/etl/control/destination/database_destination.rb +95 -0
  22. data/lib/etl/control/destination/file_destination.rb +124 -0
  23. data/lib/etl/control/destination/yaml_destination.rb +74 -0
  24. data/lib/etl/control/source.rb +109 -0
  25. data/lib/etl/control/source/database_source.rb +220 -0
  26. data/lib/etl/control/source/enumerable_source.rb +11 -0
  27. data/lib/etl/control/source/file_source.rb +90 -0
  28. data/lib/etl/control/source/model_source.rb +39 -0
  29. data/lib/etl/core_ext.rb +1 -0
  30. data/lib/etl/core_ext/time.rb +5 -0
  31. data/lib/etl/core_ext/time/calculations.rb +42 -0
  32. data/lib/etl/engine.rb +574 -0
  33. data/lib/etl/execution.rb +20 -0
  34. data/lib/etl/execution/base.rb +9 -0
  35. data/lib/etl/execution/batch.rb +8 -0
  36. data/lib/etl/execution/job.rb +8 -0
  37. data/lib/etl/execution/migration.rb +85 -0
  38. data/lib/etl/generator.rb +2 -0
  39. data/lib/etl/generator/generator.rb +20 -0
  40. data/lib/etl/generator/surrogate_key_generator.rb +39 -0
  41. data/lib/etl/http_tools.rb +139 -0
  42. data/lib/etl/parser.rb +11 -0
  43. data/lib/etl/parser/apache_combined_log_parser.rb +49 -0
  44. data/lib/etl/parser/delimited_parser.rb +74 -0
  45. data/lib/etl/parser/fixed_width_parser.rb +65 -0
  46. data/lib/etl/parser/parser.rb +41 -0
  47. data/lib/etl/parser/sax_parser.rb +218 -0
  48. data/lib/etl/parser/spreadsheet_parser.rb +114 -0
  49. data/lib/etl/parser/xml_parser.rb +65 -0
  50. data/lib/etl/processor.rb +11 -0
  51. data/lib/etl/processor/block_processor.rb +14 -0
  52. data/lib/etl/processor/bulk_import_processor.rb +81 -0
  53. data/lib/etl/processor/check_exist_processor.rb +80 -0
  54. data/lib/etl/processor/check_unique_processor.rb +35 -0
  55. data/lib/etl/processor/copy_field_processor.rb +26 -0
  56. data/lib/etl/processor/encode_processor.rb +55 -0
  57. data/lib/etl/processor/hierarchy_exploder_processor.rb +55 -0
  58. data/lib/etl/processor/print_row_processor.rb +12 -0
  59. data/lib/etl/processor/processor.rb +25 -0
  60. data/lib/etl/processor/rename_processor.rb +24 -0
  61. data/lib/etl/processor/require_non_blank_processor.rb +26 -0
  62. data/lib/etl/processor/row_processor.rb +17 -0
  63. data/lib/etl/processor/sequence_processor.rb +23 -0
  64. data/lib/etl/processor/surrogate_key_processor.rb +53 -0
  65. data/lib/etl/processor/truncate_processor.rb +35 -0
  66. data/lib/etl/row.rb +20 -0
  67. data/lib/etl/screen.rb +14 -0
  68. data/lib/etl/screen/row_count_screen.rb +20 -0
  69. data/lib/etl/transform.rb +2 -0
  70. data/lib/etl/transform/block_transform.rb +13 -0
  71. data/lib/etl/transform/date_to_string_transform.rb +20 -0
  72. data/lib/etl/transform/decode_transform.rb +51 -0
  73. data/lib/etl/transform/default_transform.rb +20 -0
  74. data/lib/etl/transform/foreign_key_lookup_transform.rb +151 -0
  75. data/lib/etl/transform/hierarchy_lookup_transform.rb +49 -0
  76. data/lib/etl/transform/ordinalize_transform.rb +12 -0
  77. data/lib/etl/transform/sha1_transform.rb +13 -0
  78. data/lib/etl/transform/string_to_date_transform.rb +16 -0
  79. data/lib/etl/transform/string_to_datetime_transform.rb +14 -0
  80. data/lib/etl/transform/string_to_time_transform.rb +11 -0
  81. data/lib/etl/transform/transform.rb +61 -0
  82. data/lib/etl/transform/trim_transform.rb +26 -0
  83. data/lib/etl/transform/type_transform.rb +35 -0
  84. data/lib/etl/util.rb +59 -0
  85. data/lib/etl/version.rb +10 -0
  86. metadata +224 -0
@@ -0,0 +1,11 @@
1
module ETL #:nodoc:
  module Control #:nodoc:
    # Source backed by the object stored under the <tt>:enumerable</tt>
    # configuration option; that object only needs to respond to +each+.
    class EnumerableSource < ETL::Control::Source
      # Hand the given block straight to the configured enumerable so that
      # every element it yields becomes a row of this source.
      def each(&block)
        rows = configuration[:enumerable]
        rows.each(&block)
      end
    end
  end
end
@@ -0,0 +1,90 @@
1
module ETL #:nodoc:
  module Control #:nodoc:
    # A source that reads its rows from a file through a configurable parser.
    class FileSource < Source
      # The number of lines to skip, default is 0. It is only stored here;
      # presumably the parser consults it (verify against the parsers).
      attr_accessor :skip_lines

      # Accessor for the underlying parser
      attr_accessor :parser

      # The source file
      attr_accessor :file

      # Initialize the source
      #
      # Configuration options:
      # * <tt>:file</tt>: The source file
      # * <tt>:parser</tt>: One of the following: a parser name as a String or
      #   symbol, a class which extends from Parser, a Hash with :name and
      #   optionally an :options key. Whether or not the parser uses the
      #   options is dependent on which parser is used. See the documentation
      #   for each parser for information on what options it accepts.
      # * <tt>:skip_lines</tt>: The number of lines to skip (defaults to 0)
      # * <tt>:store_locally</tt>: Set to false to not store a copy of the
      #   source data locally for archival
      #
      # Raises ControlError when <tt>:parser</tt> is none of the accepted kinds.
      def initialize(control, configuration, definition)
        super
        configure
      end

      # Get a String identifier for the source (the configured file)
      def to_s
        file
      end

      # Get the local storage directory: local_base joined with the file's
      # base name stripped of its extension.
      def local_directory
        File.join(local_base, File.basename(file, File.extname(file)))
      end

      # Yields each parsed row wrapped in an ETL::Row whose source is this
      # object. When ETL::Engine.offset is set, that many leading rows are
      # consumed without being yielded. The source data is copied locally
      # first when store_locally is truthy.
      def each
        count = 0
        copy_sources if store_locally
        @parser.each do |raw_row|
          if ETL::Engine.offset && count < ETL::Engine.offset
            count += 1
          else
            row = ETL::Row[raw_row]
            row.source = self
            yield row
          end
        end
      end

      private
      # Copy source data to a local directory structure. A relative file
      # pattern is resolved against the directory of the control file. Every
      # matching non-directory is copied to local_file(sequence) and an empty
      # trigger file is created next to it.
      def copy_sources
        sequence = 0
        path = Pathname.new(file)
        path = path.absolute? ? path : Pathname.new(File.dirname(control.file)) + path
        Pathname.glob(path).each do |source_path|
          next if source_path.directory?
          lf = local_file(sequence)
          FileUtils.cp(source_path, lf)
          # Opening in 'w' mode creates (or truncates) the trigger file; the
          # block takes no parameter so nothing in the enclosing scope is
          # shadowed or overwritten.
          File.open(local_file_trigger(lf), 'w') { }
          sequence += 1
        end
      end

      # Configure the source from the configuration hash: remember the file,
      # build the parser and default :skip_lines to 0.
      def configure
        @file = configuration[:file]
        case configuration[:parser]
        when Class
          @parser = configuration[:parser].new(self)
        when String, Symbol
          @parser = ETL::Parser::Parser.class_for_name(configuration[:parser]).new(self)
        when Hash
          name = configuration[:parser][:name]
          options = configuration[:parser][:options]
          @parser = ETL::Parser::Parser.class_for_name(name).new(self, options)
        else
          raise ControlError, "Configuration option :parser must be a Class, String, Symbol or Hash"
        end
        # The default is also written back into the configuration hash, as
        # before, in case other code reads it from there.
        @skip_lines = configuration[:skip_lines] ||= 0
      end
    end
  end
end
@@ -0,0 +1,39 @@
1
+ #RAILS_ENV = 'development'
2
+ #require '../config/environment'
3
+
4
module ETL #:nodoc:
  module Control #:nodoc:
    # Source that loads its rows from the model class named by the
    # <tt>:model</tt> configuration option.
    class ModelSource < Source

      # Column names, as symbols, taken from the definition: the elements of
      # an Array or the keys of a Hash. Any other definition raises.
      def columns
        return definition.collect(&:to_sym) if definition.is_a?(Array)
        return definition.keys.collect(&:to_sym) if definition.is_a?(Hash)
        raise "Definition must be either an Array or a Hash"
      end

      # The configured model identifier
      def railsmodel
        configuration[:model]
      end

      # The ordering passed to the finder; falls back to "id"
      def order
        configured_order = configuration[:order]
        configured_order || "id"
      end

      # Finds all records of the model in the configured order and yields one
      # ETL::Row per record, filled by calling each column name on the record.
      def each(&block)
        model_class = railsmodel.to_s.camelize.constantize
        model_class.find(:all, :order => order).each do |record|
          result_row = ETL::Row.new
          result_row.source = self
          columns.each { |column| result_row[column.to_sym] = record.send(column) }
          yield result_row
        end
      end
    end
  end
end
@@ -0,0 +1 @@
1
+ require 'etl/core_ext/time'
@@ -0,0 +1,5 @@
1
+ require File.dirname(__FILE__) + '/time/calculations'
2
+
3
# Mix the ETL calculation helpers into the core Time class. +send+ is used
# because Module#include is private on older Ruby versions.
Time.send(:include, ETL::CoreExtensions::Time::Calculations)
@@ -0,0 +1,42 @@
1
+ #Updated by Jack Hong on 04/05/08
2
+
3
module ETL #:nodoc:
  module CoreExtensions #:nodoc:
    module Time #:nodoc:
      # Enables the use of time calculations within Time itself. The host
      # object must respond to +yday+, +month+ and +year+.
      module Calculations
        # Week of the calendar year (1..52), counted in 7-day blocks from
        # January 1st. The trailing days that would form week 53 are folded
        # into week 52.
        def week
          cyw = ((yday - 1) / 7) + 1
          cyw = 52 if cyw == 53
          cyw
        end

        # Calendar quarter (1..4)
        def quarter
          ((month - 1) / 3) + 1
        end

        # Week of the fiscal year (1..52); +offset_month+ is the calendar
        # month in which the fiscal year starts. Week 53 is folded into 52.
        def fiscal_year_week(offset_month=10)
          fyw = ((fiscal_year_yday(offset_month) - 1) / 7) + 1
          fyw = 52 if fyw == 53
          fyw
        end

        # Month of the fiscal year (1..12), where +offset_month+ is month 1
        def fiscal_year_month(offset_month=10)
          shifted_month = month - (offset_month - 1)
          shifted_month += 12 if shifted_month <= 0
          shifted_month
        end

        # Quarter of the fiscal year (1..4)
        def fiscal_year_quarter(offset_month=10)
          ((fiscal_year_month(offset_month) - 1) / 3) + 1
        end

        # The fiscal year: the next calendar year once +offset_month+ has
        # been reached, the current calendar year before that.
        def fiscal_year(offset_month=10)
          month >= offset_month ? year + 1 : year
        end

        # Day of the fiscal year, where the first day of +offset_month+ is
        # day 1. Month lengths come from ::Time.days_in_month, so leap years
        # are honoured on both sides of the calendar year boundary.
        def fiscal_year_yday(offset_month=10)
          # Days of this calendar year that precede the fiscal year start.
          offset_days = (1...offset_month).inject(0) do |days, m|
            days + ::Time.days_in_month(m, year)
          end
          return yday - offset_days if yday > offset_days

          # The fiscal year began in the previous calendar year: add the
          # days it already covered there instead of assuming a fixed 365,
          # which double-counted a day in leap years.
          previous_year = year - 1
          carried_days = (offset_month..12).inject(0) do |days, m|
            days + ::Time.days_in_month(m, previous_year)
          end
          yday + carried_days
        end
      end
    end
  end
end
data/lib/etl/engine.rb ADDED
@@ -0,0 +1,574 @@
1
+ module ETL #:nodoc:
2
+
3
# ActiveRecord base class of the ETL namespace. Engine.init assigns its
# configurations and Engine.establish_connection invokes the adapter's
# "<adapter>_connection" method on it.
class Base < ActiveRecord::Base; end
5
+
6
+ # The main ETL engine class
7
+ class Engine
8
+ include ETL::Util
9
+
10
+ class << self
11
# Initialization that is run when a job is executed. Only the first call
# does any work; later calls return nil immediately.
#
# Options:
# * <tt>:limit</tt>: Limit the number of records returned from sources
# * <tt>:offset</tt>: Number of leading source records to skip
# * <tt>:newlog</tt>: If true the log is opened in write mode ('w');
#   otherwise the default append mode is used
# * <tt>:skip_bulk_import</tt>: Set to true to skip bulk import
# * <tt>:read_locally</tt>: Set to true to read from the local cache
# * <tt>:rails_root</tt>: Set to the rails root to boot rails
# * <tt>:config</tt>: Path to the YAML database configuration. Defaults to
#   'database.yml'; when that path does not exist 'config/database.yml' is
#   used instead. The file is run through ERB before being parsed.
#
# Connection entries whose 'use_sequel' value is true are kept for Sequel,
# all others are registered with ActiveRecord. The :etl_execution
# connection is established and the execution schema migrated.
def init(options={})
  return if @initialized

  puts "initializing ETL engine\n\n"
  @limit = options[:limit]
  @offset = options[:offset]
  @log_write_mode = 'w' if options[:newlog]
  @skip_bulk_import = options[:skip_bulk_import]
  @read_locally = options[:read_locally]
  @rails_root = options[:rails_root]

  require File.join(@rails_root, 'config/environment') if @rails_root

  # Resolve the configuration path in a local so that the caller's options
  # hash is left untouched.
  config_file = options[:config] || 'database.yml'
  config_file = 'config/database.yml' unless File.exist?(config_file)

  database_configuration = YAML::load(ERB.new(IO.read(config_file)).result + "\n")
  @sequel_configurations = sequel_configuration(database_configuration)
  active_record_configuration = active_record_configuration(database_configuration)
  ActiveRecord::Base.configurations.merge!(active_record_configuration)
  ETL::Base.configurations = active_record_configuration

  require 'etl/execution'
  ETL::Execution::Base.establish_connection :etl_execution
  ETL::Execution::Execution.migrate

  @initialized = true
end
47
+
48
# Build a new Hash holding every named connection entry whose 'use_sequel'
# value is not exactly +true+; these are the ones handed to ActiveRecord.
def active_record_configuration(database_configuration)
  database_configuration.inject({}) do |selected, (name, conn_config)|
    selected[name] = conn_config unless conn_config['use_sequel'] == true
    selected
  end
end
55
+
56
# Build a new Hash holding only the named connection entries whose
# 'use_sequel' value is exactly +true+.
def sequel_configuration(database_configuration)
  database_configuration.inject({}) do |selected, (name, conn_config)|
    selected[name] = conn_config if conn_config['use_sequel'] == true
    selected
  end
end
63
+
64
# Convenience entry point: builds a fresh engine instance and lets it
# process +file+. Acceptable values for file are:
# * Path to a file
# * File object
# * ETL::Control::Control instance
# * ETL::Batch::Batch instance
#
# File paths may name either a .ctl Control file or a .ebf ETL Batch File.
def process(file)
  engine = new
  engine.process(file)
end
75
+
76
# When truthy the logger writes to a file named "etl_<timestamp>.log"
# instead of 'etl.log'
attr_accessor :timestamped_log

# Writer for the log write mode; the reader below supplies the default
attr_writer :log_write_mode

# The mode used to open 'etl.log'. Default is 'a' for append.
def log_write_mode
  @log_write_mode = 'a' unless @log_write_mode
  @log_write_mode
end
83
+
84
# Writer for the engine's logger; the reader below builds one on demand
attr_writer :logger

# The engine's logger. Created on first use: it writes to a timestamped
# file when timestamped_log is set, otherwise to 'etl.log' opened with
# log_write_mode. A newly created logger is set to WARN level and uses the
# standard Logger formatter.
def logger #:nodoc:
  return @logger if @logger

  target = timestamped_log ? "etl_#{timestamp}.log" : File.open('etl.log', log_write_mode)
  @logger = Logger.new(target)
  @logger.level = Logger::WARN
  @logger.formatter = Logger::Formatter.new
  @logger
end
99
+
100
# The current time as a 14 digit string: year, month, day, hour, minute,
# second
def timestamp
  now = Time.now
  now.strftime("%Y%m%d%H%M%S")
end
104
+
105
# The source currently being processed
attr_accessor :current_source

# Position of the current source row; process_control sets it to the row
# index plus one
attr_accessor :current_source_row

# The destination currently being written to
attr_accessor :current_destination

# Set to true to activate realtime activity. This will cause certain
# information messages to be printed to STDOUT
attr_accessor :realtime_activity

# Writer for the total number of rows read from sources
attr_writer :rows_read

# Total number of rows read from sources; starts at 0
def rows_read
  @rows_read = 0 unless @rows_read
  @rows_read
end

# Writer for the total number of rows processed
attr_writer :rows_written

# Total number of rows written; starts at 0
def rows_written
  @rows_written = 0 unless @rows_written
  @rows_written
end

# Access the current ETL::Execution::Job instance
attr_accessor :job

# Access the current ETL::Execution::Batch instance
attr_accessor :batch

# The limit on rows to load from the source, useful for testing the ETL
# process prior to executing the entire batch. Default value is nil and
# indicates that there is no limit
attr_accessor :limit

# The offset for the source to begin at, useful for testing the ETL
# process prior to executing the entire batch. Default value is nil and
# indicates that there is no offset
attr_accessor :offset

# Set to true to skip all bulk importing
attr_accessor :skip_bulk_import

# Set to true to read locally from the last source cache files
attr_accessor :read_locally

# Accessor for the average rows per second processed
# NOTE(review): nothing in this file assigns it - confirm where it is set
attr_accessor :average_rows_per_second
154
+
155
# Get a named connection. The connection is established on first request
# and cached under +name+. Connections that are not Sequel databases are
# reconnected when they report themselves inactive.
def connection(name)
  logger.debug "Retrieving connection #{name}"
  conn = (connections[name] ||= establish_connection(name))
  conn.reconnect! unless conn.is_a?(Sequel::Database) || conn.active?
  conn
end
165
+
166
# Set to true to use temp tables
attr_accessor :use_temp_tables

# Registry of temp tables, created empty on first access. +table+ fills it
# with temp table name => {:table, :connection} entries.
def temp_tables
  @temp_tables = {} unless @temp_tables
  @temp_tables
end
173
+
174
# Called when a batch job finishes, allowing for cleanup to occur. For each
# registered temp table, inside a transaction on its connection: the
# actual table is renamed to "<table>_old", the temp table takes its name
# and the old table is dropped.
def finish
  temp_tables.each do |temp_table, mapping|
    actual_table = mapping[:table]
    conn = mapping[:connection]
    retired_table = "#{actual_table}_old"
    conn.transaction do
      conn.rename_table(actual_table, retired_table)
      conn.rename_table(temp_table, actual_table)
      conn.drop_table(retired_table)
    end
  end
end
187
+
188
# Strict boolean view of the use_temp_tables setting
def use_temp_tables?
  !!use_temp_tables
end
192
+
193
# Modify the table name if necessary.
#
# Without temp tables the given +table_name+ is returned unchanged. With
# temp tables enabled "tmp_<table_name>" is returned; the first request for
# a table also (re)creates that temp table as a copy of the original on
# +connection+ and records it in the temp table registry.
def table(table_name, connection)
  return table_name unless use_temp_tables?

  temp_table_name = "tmp_#{table_name}"
  if temp_tables[temp_table_name].nil?
    # Create the temp table and add it to the mapping
    begin
      connection.drop_table(temp_table_name)
    rescue
      # Best effort: a failure to drop (e.g. no left-over temp table) is
      # deliberately ignored.
    end
    connection.copy_table(table_name, temp_table_name)
    temp_tables[temp_table_name] = {
      :table => table_name,
      :connection => connection
    }
  end
  temp_table_name
end
211
+
212
+ protected
213
# Hash of database connections that can be used throughout the ETL
# process; created empty on first access
def connections
  @connections = {} unless @connections
  @connections
end
218
+
219
# Establish the named connection and return the database specific connection.
#
# The configuration is looked up by +name+ (converted to a String), first
# among the ETL::Base configurations and then among the Sequel ones. Entries
# flagged 'use_sequel' are opened with Sequel.connect; all others by calling
# "<adapter>_connection" on ETL::Base.
#
# Raises ETL::ETLError when no configuration exists for +name+.
def establish_connection(name)
  logger.debug "Establishing connection to #{name}"
  key = name.to_s
  # The Sequel configurations are only assigned by init; treat them as
  # empty before that so an unknown name still raises ETLError.
  sequel_configurations = @sequel_configurations || {}
  conn_config = ETL::Base.configurations[key] || sequel_configurations[key]
  raise ETL::ETLError, "No connection found for #{name}" unless conn_config
  if conn_config['use_sequel']
    Sequel.connect(conn_config)
  else
    connection_method = "#{conn_config['adapter']}_connection"
    ETL::Base.send(connection_method, conn_config)
  end
end
231
+ end # class << self
232
+
233
# Say the specified message, terminated by a newline
def say(message)
  line = message + "\n"
  say_without_newline(line)
end
237
+
238
# Print the message to $stdout, without adding a newline, and flush.
# Nothing is printed unless ETL::Engine.realtime_activity is set.
def say_without_newline(message)
  return unless ETL::Engine.realtime_activity

  $stdout.print message
  $stdout.flush
end
245
+
246
# Say the message on its own line by putting a newline in front of it
def say_on_own_line(message)
  separated = "\n" + message
  say(separated)
end
250
+
251
# Array of errors encountered during execution of the ETL process;
# created empty on first access
def errors
  @errors = [] unless @errors
  @errors
end
255
+
256
# Get a Hash of benchmark values where each value represents the total
# amount of time in seconds spent processing in that portion of the ETL
# pipeline. Every counter starts at 0. Keys include:
# * <tt>:transforms</tt>
# * <tt>:after_reads</tt>
# * <tt>:before_writes</tt>
# * <tt>:writes</tt>
def benchmarks
  unless @benchmarks
    @benchmarks = {}
    [:transforms, :after_reads, :before_writes, :writes].each do |stage|
      @benchmarks[stage] = 0
    end
  end
  @benchmarks
end
271
+
272
# Process a file, control object or batch object. Acceptable values for
# file are:
# * Path to a file
# * File object
# * ETL::Control::Control instance
# * ETL::Batch::Batch instance
#
# A path is opened and processed as a File. A File is dispatched on its
# extension: ".ctl" is processed as a control file, ".ebf" as a batch
# file; any other extension is ignored. Raises RuntimeError for any other
# kind of object.
def process(file)
  case file
  when String
    process(File.new(file))
  when File
    # Literal dot, anchored at the very end of the path, so that only real
    # ".ctl" / ".ebf" extensions match.
    process_control(file) if file.path =~ /\.ctl\z/
    process_batch(file) if file.path =~ /\.ebf\z/
  when ETL::Control::Control
    process_control(file)
  when ETL::Batch::Batch
    process_batch(file)
  else
    raise RuntimeError, "Process object must be a String, File, Control instance or Batch instance"
  end
end
294
+
295
+ protected
296
# Process the specified batch file: resolve it, record an 'executing'
# ETL::Execution::Batch in ETL::Engine.batch, execute the batch and then
# save the record with its completion time and final status.
def process_batch(batch)
  batch = ETL::Batch::Batch.resolve(batch, self)
  say "Processing batch #{batch.file}"

  ETL::Engine.batch = ETL::Execution::Batch.create!(:batch_file => batch.file, :status => 'executing')

  batch.execute

  execution = ETL::Engine.batch
  execution.completed_at = Time.now
  execution.status = errors.empty? ? 'completed' : 'completed with errors'
  execution.save!
end
312
+
313
# Process the specified control file.
#
# Steps, in order: resolve the control, record an 'executing'
# ETL::Execution::Job, run the control's dependencies and pre-processors,
# then push every row of every source through the after-read processors,
# the transforms, the before-write processors and finally all
# destinations. Afterwards the destinations are closed, the screens run,
# the post-processors run, the after-post-process screens run, a summary
# is reported and the job is saved as completed.
#
# The method returns early, without updating the job record, when the
# error threshold is exceeded after a source or when a screen raises
# ScreenError. A FatalScreenError terminates the process via +exit+.
def process_control(control)
  control = ETL::Control::Control.resolve(control)
  say_on_own_line "Processing control #{control.file}"

  ETL::Engine.job = ETL::Execution::Job.create!(
    :control_file => control.file,
    :status => 'executing',
    :batch_id => ETL::Engine.batch ? ETL::Engine.batch.id : nil
  )

  execute_dependencies(control)

  start_time = Time.now
  pre_process(control)
  sources = control.sources
  destinations = control.destinations

  say "Skipping bulk import" if Engine.skip_bulk_import

  sources.each do |source|
    Engine.current_source = source
    Engine.logger.debug "Processing source #{source}"
    say "Source: #{source}"
    say "Limiting enabled: #{Engine.limit}" unless Engine.limit.nil?
    say "Offset enabled: #{Engine.offset}" unless Engine.offset.nil?
    source.each_with_index do |row, index|
      # Break out of the row loop if the +Engine.limit+ is specified and
      # the number of rows read has reached that value.
      if !Engine.limit.nil? && Engine.rows_read >= Engine.limit
        puts "Reached limit of #{Engine.limit}"
        break
      end

      Engine.logger.debug "Row #{index}: #{row.inspect}"
      Engine.rows_read += 1
      Engine.current_source_row = index + 1
      say_without_newline "." if Engine.realtime_activity && index > 0 && index % 1000 == 0

      # At this point a single row may be turned into multiple rows via row
      # processors; all code after this line should work with the array of
      # rows rather than the single row
      rows = [row]

      # In the four stages below +next+ and +break+ sit inside the block
      # given to Benchmark.realtime: +next+ only ends that block, +break+
      # aborts the Benchmark.realtime call itself (which is why +t+ can be
      # nil). Neither one skips the remaining stages for this row.
      # NOTE(review): confirm whether skipping the row was intended.
      t = Benchmark.realtime do
        begin
          Engine.logger.debug "Processing after read"
          control.after_read_processors.each do |processor|
            processed_rows = []
            rows.each do |current_row|
              processed_rows << processor.process(current_row)
            end
            rows = processed_rows.flatten
          end
        rescue => e
          msg = "Error processing rows after read from #{Engine.current_source} on line #{Engine.current_source_row}: #{e}"
          errors << msg
          Engine.logger.error(msg)
          exceeded_error_threshold?(control) ? break : next
        end
      end
      benchmarks[:after_reads] += t unless t.nil?

      t = Benchmark.realtime do
        begin
          Engine.logger.debug "Executing transforms"
          rows.each do |current_row|
            control.transforms.each do |transform|
              name = transform.name.to_sym
              current_row[name] = transform.transform(name, current_row[name], current_row)
            end
          end
        rescue ResolverError => e
          Engine.logger.error(e.message)
          errors << e.message
        rescue => e
          msg = "Error transforming from #{Engine.current_source} on line #{Engine.current_source_row}: #{e}"
          errors << msg
          Engine.logger.error(msg)
          e.backtrace.each { |line| Engine.logger.error(line) }
        ensure
          # Runs on success as well as on failure.
          begin
            exceeded_error_threshold?(control) ? break : next
          rescue => inner_error
            puts inner_error
          end
        end
      end
      benchmarks[:transforms] += t unless t.nil?

      t = Benchmark.realtime do
        begin
          # execute row-level "before write" processing
          Engine.logger.debug "Processing before write"
          control.before_write_processors.each do |processor|
            processed_rows = []
            rows.each { |current_row| processed_rows << processor.process(current_row) }
            rows = processed_rows.flatten.compact
          end
        rescue => e
          msg = "Error processing rows before write from #{Engine.current_source} on line #{Engine.current_source_row}: #{e}"
          errors << msg
          Engine.logger.error(msg)
          e.backtrace.each { |line| Engine.logger.error(line) }
          exceeded_error_threshold?(control) ? break : next
        end
      end
      benchmarks[:before_writes] += t unless t.nil?

      t = Benchmark.realtime do
        begin
          # write the rows to every destination; rows are only counted for
          # the first destination
          destinations.each_with_index do |destination, destination_index|
            Engine.current_destination = destination
            rows.each do |current_row|
              destination.write(current_row)
              Engine.rows_written += 1 if destination_index == 0
            end
          end
        rescue => e
          msg = "Error writing to #{Engine.current_destination}: #{e}"
          errors << msg
          Engine.logger.error msg
          e.backtrace.each { |line| Engine.logger.error(line) }
          exceeded_error_threshold?(control) ? break : next
        end
      end
      benchmarks[:writes] += t unless t.nil?
    end

    if exceeded_error_threshold?(control)
      say_on_own_line "Exiting due to exceeding error threshold: #{control.error_threshold}"
      return
    end

  end

  destinations.each do |destination|
    destination.close
  end

  say_on_own_line "Executing before post-process screens"
  begin
    execute_screens(control)
  rescue FatalScreenError => e
    say "Fatal screen error during job execution: #{e.message}"
    exit
  rescue ScreenError => e
    say "Screen error during job execution: #{e.message}"
    return
  else
    say "Screens passed"
  end

  post_process(control)

  if sources.length > 0
    say_on_own_line "Read #{Engine.rows_read} lines from sources"
  end
  if destinations.length > 0
    say "Wrote #{Engine.rows_written} lines to destinations"
  end

  say_on_own_line "Executing after post-process screens"
  begin
    execute_screens(control, :after_post_process)
  rescue FatalScreenError => e
    say "Fatal screen error during job execution: #{e.message}"
    exit
  rescue ScreenError => e
    say "Screen error during job execution: #{e.message}"
    return
  else
    say "Screens passed"
  end

  say_on_own_line "Completed #{control.file} in #{distance_of_time_in_words(start_time)} with #{errors.length} errors."
  say "Processing average: #{Engine.average_rows_per_second} rows/sec"

  say "Avg after_reads: #{Engine.rows_read/benchmarks[:after_reads]} rows/sec" if benchmarks[:after_reads] > 0
  say "Avg before_writes: #{Engine.rows_read/benchmarks[:before_writes]} rows/sec" if benchmarks[:before_writes] > 0
  say "Avg transforms: #{Engine.rows_read/benchmarks[:transforms]} rows/sec" if benchmarks[:transforms] > 0
  say "Avg writes: #{Engine.rows_read/benchmarks[:writes]} rows/sec" if benchmarks[:writes] > 0

  ETL::Engine.job.completed_at = Time.now
  ETL::Engine.job.status = (errors.length > 0 ? 'completed with errors' : 'completed')
  ETL::Engine.job.save!
end
501
+
502
+ private
503
# True once more errors have been collected than the control's
# error_threshold allows
def exceeded_error_threshold?(control)
  error_count = errors.size
  error_count > control.error_threshold
end
507
+
508
# Run every pre-processor of the control, in order, logging at debug level
# before and after
def pre_process(control)
  Engine.logger.debug "Pre-processing #{control.file}"
  control.pre_processors.each { |pre_processor| pre_processor.process }
  Engine.logger.debug "Pre-processing complete"
end
516
+
517
# Run every post-processor of the control, in order, announcing the start
# and the end both to the user and to the debug log
def post_process(control)
  say_on_own_line "Executing post processes"
  Engine.logger.debug "Post-processing #{control.file}"
  control.post_processors.each { |post_processor| post_processor.process }
  Engine.logger.debug "Post-processing complete"
  say "Post-processing complete"
end
527
+
528
# Execute all dependencies of the control, in order. A Symbol names a
# control file ("<symbol>.ctl"); a String is processed as given. Nested
# dependency lists are flattened first. Any other kind of dependency
# raises a RuntimeError.
def execute_dependencies(control)
  Engine.logger.debug "Executing dependencies"
  control.dependencies.flatten.each do |dependency|
    case dependency
    when Symbol
      f = dependency.to_s + '.ctl'
      Engine.logger.debug "Executing dependency: #{f}"
      say "Executing dependency: #{f}"
      process(f)
    when String
      # Report the dependency itself; the file name local is only assigned
      # in the Symbol branch.
      Engine.logger.debug "Executing dependency: #{dependency}"
      say "Executing dependency: #{dependency}"
      process(dependency)
    else
      raise "Invalid dependency type: #{dependency.class}"
    end
  end
end
547
+
548
# Execute all screens of the control for the given timing:
# +:after_post_process+ selects the after-post-process screens, anything
# else the regular ones. Screens run by type in the order fatal, error,
# warn. A failing fatal screen raises FatalScreenError, a failing error
# screen raises ScreenError and a failing warn screen is only reported.
def execute_screens(control, timing = :before_post_process)
  screens = timing == :after_post_process ? control.after_post_process_screens : control.screens
  [:fatal, :error, :warn].each do |type|
    screens[type].each do |block|
      begin
        block.call
      rescue => e
        raise FatalScreenError, e if type == :fatal
        raise ScreenError, e if type == :error
        say "Screen warning: #{e}"
      end
    end
  end
end
573
+ end
574
+ end