balinterdi-activewarehouse-etl 0.9.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (159) hide show
  1. data/.gitignore +4 -0
  2. data/0.9-UPGRADE +6 -0
  3. data/CHANGELOG +198 -0
  4. data/HOW_TO_RELEASE +4 -0
  5. data/LICENSE +7 -0
  6. data/README +85 -0
  7. data/Rakefile +169 -0
  8. data/TODO +28 -0
  9. data/VERSION +1 -0
  10. data/active_support_logger.patch +78 -0
  11. data/balinterdi-activewarehouse-etl.gemspec +221 -0
  12. data/bin/etl +28 -0
  13. data/bin/etl.cmd +8 -0
  14. data/examples/database.example.yml +16 -0
  15. data/lib/etl.rb +78 -0
  16. data/lib/etl/batch.rb +2 -0
  17. data/lib/etl/batch/batch.rb +111 -0
  18. data/lib/etl/batch/directives.rb +55 -0
  19. data/lib/etl/builder.rb +2 -0
  20. data/lib/etl/builder/date_dimension_builder.rb +96 -0
  21. data/lib/etl/builder/time_dimension_builder.rb +31 -0
  22. data/lib/etl/commands/etl.rb +89 -0
  23. data/lib/etl/control.rb +3 -0
  24. data/lib/etl/control/control.rb +405 -0
  25. data/lib/etl/control/destination.rb +420 -0
  26. data/lib/etl/control/destination/database_destination.rb +95 -0
  27. data/lib/etl/control/destination/file_destination.rb +124 -0
  28. data/lib/etl/control/source.rb +109 -0
  29. data/lib/etl/control/source/database_source.rb +220 -0
  30. data/lib/etl/control/source/enumerable_source.rb +11 -0
  31. data/lib/etl/control/source/file_source.rb +90 -0
  32. data/lib/etl/control/source/model_source.rb +39 -0
  33. data/lib/etl/core_ext.rb +1 -0
  34. data/lib/etl/core_ext/time.rb +5 -0
  35. data/lib/etl/core_ext/time/calculations.rb +42 -0
  36. data/lib/etl/engine.rb +556 -0
  37. data/lib/etl/execution.rb +20 -0
  38. data/lib/etl/execution/base.rb +9 -0
  39. data/lib/etl/execution/batch.rb +8 -0
  40. data/lib/etl/execution/job.rb +8 -0
  41. data/lib/etl/execution/migration.rb +85 -0
  42. data/lib/etl/generator.rb +2 -0
  43. data/lib/etl/generator/generator.rb +20 -0
  44. data/lib/etl/generator/surrogate_key_generator.rb +39 -0
  45. data/lib/etl/http_tools.rb +139 -0
  46. data/lib/etl/parser.rb +11 -0
  47. data/lib/etl/parser/apache_combined_log_parser.rb +49 -0
  48. data/lib/etl/parser/delimited_parser.rb +75 -0
  49. data/lib/etl/parser/fixed_width_parser.rb +65 -0
  50. data/lib/etl/parser/parser.rb +41 -0
  51. data/lib/etl/parser/sax_parser.rb +218 -0
  52. data/lib/etl/parser/xml_parser.rb +65 -0
  53. data/lib/etl/processor.rb +11 -0
  54. data/lib/etl/processor/block_processor.rb +14 -0
  55. data/lib/etl/processor/bulk_import_processor.rb +81 -0
  56. data/lib/etl/processor/check_exist_processor.rb +80 -0
  57. data/lib/etl/processor/check_unique_processor.rb +35 -0
  58. data/lib/etl/processor/copy_field_processor.rb +26 -0
  59. data/lib/etl/processor/encode_processor.rb +55 -0
  60. data/lib/etl/processor/hierarchy_exploder_processor.rb +55 -0
  61. data/lib/etl/processor/print_row_processor.rb +12 -0
  62. data/lib/etl/processor/processor.rb +25 -0
  63. data/lib/etl/processor/rename_processor.rb +24 -0
  64. data/lib/etl/processor/require_non_blank_processor.rb +26 -0
  65. data/lib/etl/processor/row_processor.rb +17 -0
  66. data/lib/etl/processor/sequence_processor.rb +23 -0
  67. data/lib/etl/processor/surrogate_key_processor.rb +53 -0
  68. data/lib/etl/processor/truncate_processor.rb +35 -0
  69. data/lib/etl/row.rb +20 -0
  70. data/lib/etl/screen.rb +14 -0
  71. data/lib/etl/screen/row_count_screen.rb +20 -0
  72. data/lib/etl/transform.rb +2 -0
  73. data/lib/etl/transform/block_transform.rb +13 -0
  74. data/lib/etl/transform/date_to_string_transform.rb +20 -0
  75. data/lib/etl/transform/decode_transform.rb +51 -0
  76. data/lib/etl/transform/default_transform.rb +20 -0
  77. data/lib/etl/transform/foreign_key_lookup_transform.rb +151 -0
  78. data/lib/etl/transform/hierarchy_lookup_transform.rb +49 -0
  79. data/lib/etl/transform/ordinalize_transform.rb +12 -0
  80. data/lib/etl/transform/sha1_transform.rb +13 -0
  81. data/lib/etl/transform/string_to_date_transform.rb +16 -0
  82. data/lib/etl/transform/string_to_datetime_transform.rb +14 -0
  83. data/lib/etl/transform/string_to_time_transform.rb +11 -0
  84. data/lib/etl/transform/transform.rb +61 -0
  85. data/lib/etl/transform/trim_transform.rb +26 -0
  86. data/lib/etl/transform/type_transform.rb +35 -0
  87. data/lib/etl/util.rb +59 -0
  88. data/lib/etl/version.rb +9 -0
  89. data/test/.ignore +2 -0
  90. data/test/all.ebf +6 -0
  91. data/test/apache_combined_log.ctl +11 -0
  92. data/test/batch_test.rb +41 -0
  93. data/test/batch_with_error.ebf +6 -0
  94. data/test/batched1.ctl +0 -0
  95. data/test/batched2.ctl +0 -0
  96. data/test/block_processor.ctl +6 -0
  97. data/test/block_processor_error.ctl +1 -0
  98. data/test/block_processor_pre_post_process.ctl +4 -0
  99. data/test/block_processor_remove_rows.ctl +5 -0
  100. data/test/block_processor_test.rb +38 -0
  101. data/test/connection/native_mysql/connection.rb +9 -0
  102. data/test/connection/native_mysql/schema.sql +36 -0
  103. data/test/connection/postgresql/connection.rb +13 -0
  104. data/test/connection/postgresql/schema.sql +43 -0
  105. data/test/control_test.rb +43 -0
  106. data/test/data/apache_combined_log.txt +3 -0
  107. data/test/data/bulk_import.txt +3 -0
  108. data/test/data/bulk_import_with_empties.txt +3 -0
  109. data/test/data/decode.txt +3 -0
  110. data/test/data/delimited.txt +3 -0
  111. data/test/data/encode_source_latin1.txt +2 -0
  112. data/test/data/fixed_width.txt +3 -0
  113. data/test/data/multiple_delimited_1.txt +3 -0
  114. data/test/data/multiple_delimited_2.txt +3 -0
  115. data/test/data/people.txt +3 -0
  116. data/test/data/sax.xml +14 -0
  117. data/test/data/xml.xml +16 -0
  118. data/test/database.example.yml +18 -0
  119. data/test/database.mysql.yml +18 -0
  120. data/test/database.postgres.yml +18 -0
  121. data/test/database.yml +18 -0
  122. data/test/date_dimension_builder_test.rb +96 -0
  123. data/test/delimited.ctl +30 -0
  124. data/test/delimited_absolute.ctl +33 -0
  125. data/test/delimited_destination_db.ctl +25 -0
  126. data/test/delimited_with_bulk_load.ctl +34 -0
  127. data/test/destination_test.rb +171 -0
  128. data/test/directive_test.rb +23 -0
  129. data/test/encode_processor_test.rb +31 -0
  130. data/test/engine_test.rb +32 -0
  131. data/test/errors.ctl +24 -0
  132. data/test/etl_test.rb +42 -0
  133. data/test/fixed_width.ctl +35 -0
  134. data/test/generator_test.rb +14 -0
  135. data/test/inline_parser.ctl +17 -0
  136. data/test/mocks/mock_destination.rb +26 -0
  137. data/test/mocks/mock_source.rb +25 -0
  138. data/test/model_source.ctl +14 -0
  139. data/test/multiple_delimited.ctl +22 -0
  140. data/test/multiple_source_delimited.ctl +39 -0
  141. data/test/parser_test.rb +200 -0
  142. data/test/performance/delimited.ctl +30 -0
  143. data/test/processor_test.rb +38 -0
  144. data/test/row_processor_test.rb +17 -0
  145. data/test/sax.ctl +26 -0
  146. data/test/scd/1.txt +1 -0
  147. data/test/scd/2.txt +1 -0
  148. data/test/scd/3.txt +1 -0
  149. data/test/scd_test.rb +263 -0
  150. data/test/scd_test_type_1.ctl +43 -0
  151. data/test/scd_test_type_2.ctl +42 -0
  152. data/test/screen_test.rb +9 -0
  153. data/test/screen_test_error.ctl +3 -0
  154. data/test/screen_test_fatal.ctl +3 -0
  155. data/test/source_test.rb +139 -0
  156. data/test/test_helper.rb +33 -0
  157. data/test/transform_test.rb +101 -0
  158. data/test/xml.ctl +31 -0
  159. metadata +237 -0
@@ -0,0 +1,11 @@
1
module ETL #:nodoc:
  module Control #:nodoc:
    # Source backed by any object that responds to +each+, supplied through
    # the <tt>:enumerable</tt> configuration option.
    class EnumerableSource < ETL::Control::Source
      # Hand the given block straight to the configured enumerable's +each+.
      def each(&block)
        enumerable = configuration[:enumerable]
        enumerable.each(&block)
      end
    end
  end
end
@@ -0,0 +1,90 @@
1
module ETL #:nodoc:
  module Control #:nodoc:
    # A File source.
    class FileSource < Source
      # The number of lines to skip, default is 0
      attr_accessor :skip_lines

      # Accessor for the underlying parser
      attr_accessor :parser

      # The source file
      attr_accessor :file

      # Initialize the source
      #
      # Configuration options:
      # * <tt>:file</tt>: The source file
      # * <tt>:parser</tt>: One of the following: a parser name as a String or
      #   symbol, a class which extends from Parser, a Hash with :name and
      #   optionally an :options key. Whether or not the parser uses the
      #   options is dependent on which parser is used. See the documentation
      #   for each parser for information on what options it accepts.
      # * <tt>:skip_lines</tt>: The number of lines to skip (defaults to 0)
      # * <tt>:store_locally</tt>: Set to false to not store a copy of the
      #   source data locally for archival
      def initialize(control, configuration, definition)
        super
        configure
      end

      # Get a String identifier for the source
      def to_s
        file
      end

      # Get the local storage directory: the local base joined with the
      # source file's base name (extension removed).
      def local_directory
        File.join(local_base, File.basename(file, File.extname(file)))
      end

      # Yields each row produced by the parser, wrapped as an ETL::Row whose
      # +source+ is this object. When ETL::Engine.offset is set, that many
      # leading rows are consumed without being yielded.
      def each
        count = 0
        copy_sources if store_locally
        @parser.each do |row|
          if ETL::Engine.offset && count < ETL::Engine.offset
            count += 1
          else
            row = ETL::Row[row]
            row.source = self
            yield row
          end
        end
      end

      private
      # Copy source data to a local directory structure. A relative +file+ is
      # resolved against the directory of the control file; every non-directory
      # match of the glob is copied and an empty trigger file is written next
      # to each copy.
      def copy_sources
        sequence = 0
        path = Pathname.new(file)
        path = path.absolute? ? path : Pathname.new(File.dirname(control.file)) + path
        Pathname.glob(path).each do |f|
          next if f.directory?
          lf = local_file(sequence)
          FileUtils.cp(f, lf)
          # Create/truncate the trigger file. The block takes no parameter so
          # it cannot shadow (or, on Ruby 1.8, overwrite) the loop variable.
          File.open(local_file_trigger(lf), 'w') { }
          sequence += 1
        end
      end

      # Configure the source from the configuration Hash.
      #
      # Raises ControlError when <tt>:parser</tt> is not a Class, String,
      # Symbol or Hash.
      def configure
        @file = configuration[:file]
        case configuration[:parser]
        when Class
          @parser = configuration[:parser].new(self)
        when String, Symbol
          @parser = ETL::Parser::Parser.class_for_name(configuration[:parser]).new(self)
        when Hash
          name = configuration[:parser][:name]
          options = configuration[:parser][:options]
          @parser = ETL::Parser::Parser.class_for_name(name).new(self, options)
        else
          raise ControlError, "Configuration option :parser must be a Class, String, Symbol or Hash"
        end
        # The default is also written back into the configuration Hash.
        @skip_lines = configuration[:skip_lines] ||= 0
      end
    end
  end
end
@@ -0,0 +1,39 @@
1
+ #RAILS_ENV = 'development'
2
+ #require '../config/environment'
3
+
4
module ETL #:nodoc:
  module Control #:nodoc:
    # Source that reads its rows from the model named by the <tt>:model</tt>
    # configuration option.
    class ModelSource < Source

      # Column names, as Symbols, taken from the definition (the elements of
      # an Array or the keys of a Hash). Any other definition type raises.
      def columns
        return definition.collect(&:to_sym) if definition.is_a?(Array)
        return definition.keys.collect(&:to_sym) if definition.is_a?(Hash)

        raise "Definition must be either an Array or a Hash"
      end

      # The configured <tt>:model</tt> value.
      def railsmodel
        configuration[:model]
      end

      # The configured <tt>:order</tt>, falling back to "id".
      def order
        configuration[:order] || "id"
      end

      # Finds all records of the model in the configured order and yields one
      # ETL::Row per record, populated with the value of each column.
      def each(&block)
        model_class = railsmodel.to_s.camelize.constantize
        records = model_class.find(:all, :order => order)
        records.each do |record|
          result_row = ETL::Row.new
          result_row.source = self
          columns.each { |column| result_row[column.to_sym] = record.send(column) }
          yield result_row
        end
      end
    end
  end
end
@@ -0,0 +1 @@
1
+ require 'etl/core_ext/time'
@@ -0,0 +1,5 @@
1
+ require File.dirname(__FILE__) + '/time/calculations'
2
+
3
# Mix the ETL calendar calculations into the core Time class.
Time.class_eval do #:nodoc:
  include ETL::CoreExtensions::Time::Calculations
end
@@ -0,0 +1,42 @@
1
+ #Updated by Jack Hong on 04/05/08
2
+
3
module ETL #:nodoc:
  module CoreExtensions #:nodoc:
    module Time #:nodoc:
      # Enables the use of time calculations within Time itself.
      #
      # Every +offset_month+ parameter is the calendar month (1-12) in which
      # the fiscal year starts; the default of 10 means October.
      module Calculations
        # Week of the calendar year, counted in 7-day blocks from January 1.
        # The short 53rd block is folded into week 52.
        def week
          cyw = ((yday - 1) / 7) + 1
          cyw = 52 if cyw == 53
          cyw
        end

        # Calendar quarter (1-4).
        def quarter
          ((month - 1) / 3) + 1
        end

        # Week of the fiscal year, counted in 7-day blocks from the first day
        # of the fiscal year. The short 53rd block is folded into week 52.
        def fiscal_year_week(offset_month=10)
          fyw = ((fiscal_year_yday(offset_month) - 1) / 7) + 1
          fyw = 52 if fyw == 53
          fyw
        end

        # Month of the fiscal year (1-12), where +offset_month+ is month 1.
        def fiscal_year_month(offset_month=10)
          shifted_month = month - (offset_month - 1)
          shifted_month += 12 if shifted_month <= 0
          shifted_month
        end

        # Quarter of the fiscal year (1-4).
        def fiscal_year_quarter(offset_month=10)
          ((fiscal_year_month(offset_month) - 1) / 3) + 1
        end

        # Fiscal year number: the following calendar year once +offset_month+
        # has been reached, otherwise the current calendar year.
        def fiscal_year(offset_month=10)
          month >= offset_month ? year + 1 : year
        end

        # Day of the fiscal year, where the first day of +offset_month+ is
        # day 1.
        #
        # For a date that falls before the fiscal start of its own calendar
        # year, the fiscal year began in the PREVIOUS calendar year, so the
        # days elapsed are measured with that year's length and fiscal start.
        # (Subtracting the current year's month lengths and adding a fixed
        # 365 is off by one whenever the current year is a leap year.)
        def fiscal_year_yday(offset_month=10)
          start_yday = ::Time.utc(year, offset_month, 1).yday
          return yday - start_yday + 1 if yday >= start_yday

          previous_year = year - 1
          previous_start_yday = ::Time.utc(previous_year, offset_month, 1).yday
          days_in_previous_year = ::Time.utc(previous_year, 12, 31).yday
          # Days of the fiscal year spent in the previous calendar year, plus
          # the days elapsed so far in this one.
          (days_in_previous_year - previous_start_yday + 1) + yday
        end
      end
    end
  end
end
@@ -0,0 +1,556 @@
1
+ module ETL #:nodoc:
2
+
3
+ class Base < ActiveRecord::Base
4
+ end
5
+
6
+ # The main ETL engine class
7
+ class Engine
8
+ include ETL::Util
9
+
10
+ class << self
11
# Initialization that is run when a job is executed. Only the first call does
# any work; once the engine is flagged as initialized, later calls return
# immediately.
#
# Options:
# * <tt>:limit</tt>: Limit the number of records returned from sources
# * <tt>:offset</tt>: Number of leading source records to skip
# * <tt>:newlog</tt>: If true the log is opened in write mode ('w');
#   otherwise the log write mode is left unset here
# * <tt>:skip_bulk_import</tt>: Set to true to skip bulk import
# * <tt>:read_locally</tt>: Set to true to read from the local cache
# * <tt>:rails_root</tt>: Set to the rails root to boot rails
# * <tt>:config</tt>: Path to the database configuration YAML (ERB is
#   evaluated). Defaults to 'database.yml'; when that path does not exist
#   'config/database.yml' is used instead.
def init(options={})
  return if @initialized

  puts "initializing ETL engine\n\n"
  @limit = options[:limit]
  @offset = options[:offset]
  @log_write_mode = 'w' if options[:newlog]
  @skip_bulk_import = options[:skip_bulk_import]
  @read_locally = options[:read_locally]
  @rails_root = options[:rails_root]

  require File.join(@rails_root, 'config/environment') if @rails_root

  options[:config] ||= 'database.yml'
  options[:config] = 'config/database.yml' unless File.exist?(options[:config])
  rendered_yaml = ERB.new(IO.read(options[:config])).result + "\n"
  database_configuration = YAML::load(rendered_yaml)
  ActiveRecord::Base.configurations.merge!(database_configuration)
  ETL::Base.configurations = database_configuration

  # The execution tables live on the :etl_execution connection.
  require 'etl/execution'
  ETL::Execution::Base.establish_connection :etl_execution
  ETL::Execution::Execution.migrate

  @initialized = true
end
45
+
46
+ # Process the specified file. Acceptable values for file are:
47
+ # * Path to a file
48
+ # * File object
49
+ # * ETL::Control::Control instance
50
+ # * ETL::Batch::Batch instance
51
+ #
52
+ # The process command will accept either a .ctl Control file or a .ebf
53
+ # ETL Batch File.
54
+ def process(file)
55
+ new().process(file)
56
+ end
57
+
58
+ attr_accessor :timestamped_log
59
+
60
+ # Accessor for the log write mode. Default is 'a' for append.
61
+ attr_accessor :log_write_mode
62
+ def log_write_mode
63
+ @log_write_mode ||= 'a'
64
+ end
65
+
66
+ # A logger for the engine
67
+ attr_accessor :logger
68
+
69
+ def logger #:nodoc:
70
+ unless @logger
71
+ if timestamped_log
72
+ @logger = Logger.new("etl_#{timestamp}.log")
73
+ else
74
+ @logger = Logger.new(File.open('etl.log', log_write_mode))
75
+ end
76
+ @logger.level = Logger::WARN
77
+ @logger.formatter = Logger::Formatter.new
78
+ end
79
+ @logger
80
+ end
81
+
82
+ # Get a timestamp value as a string
83
+ def timestamp
84
+ Time.now.strftime("%Y%m%d%H%M%S")
85
+ end
86
+
87
+ # The current source
88
+ attr_accessor :current_source
89
+
90
+ # The current source row
91
+ attr_accessor :current_source_row
92
+
93
+ # The current destination
94
+ attr_accessor :current_destination
95
+
96
+ # Set to true to activate realtime activity. This will cause certain
97
+ # information messages to be printed to STDOUT
98
+ attr_accessor :realtime_activity
99
+
100
+ # Accessor for the total number of rows read from sources
101
+ attr_accessor :rows_read
102
+ def rows_read
103
+ @rows_read ||= 0
104
+ end
105
+
106
+ # Accessor for the total number of rows processed
107
+ attr_accessor :rows_written
108
+ def rows_written
109
+ @rows_written ||= 0
110
+ end
111
+
112
+ # Access the current ETL::Execution::Job instance
113
+ attr_accessor :job
114
+
115
+ # Access the current ETL::Execution::Batch instance
116
+ attr_accessor :batch
117
+
118
+ # The limit on rows to load from the source, useful for testing the ETL
119
+ # process prior to executing the entire batch. Default value is nil and
120
+ # indicates that there is no limit
121
+ attr_accessor :limit
122
+
123
+ # The offset for the source to begin at, useful for testing the ETL
124
+ # process prior to executing the entire batch. Default value is nil and
125
+ # indicates that there is no offset
126
+ attr_accessor :offset
127
+
128
+ # Set to true to skip all bulk importing
129
+ attr_accessor :skip_bulk_import
130
+
131
+ # Set to true to read locally from the last source cache files
132
+ attr_accessor :read_locally
133
+
134
+ # Accessor for the average rows per second processed
135
+ attr_accessor :average_rows_per_second
136
+
137
+ # Get a named connection
138
+ def connection(name)
139
+ logger.debug "Retrieving connection #{name}"
140
+ conn = connections[name] ||= establish_connection(name)
141
+ #conn.verify!(ActiveRecord::Base.verification_timeout)
142
+ conn.reconnect! unless conn.active?
143
+ conn
144
+ end
145
+
146
+ # Set to true to use temp tables
147
+ attr_accessor :use_temp_tables
148
+
149
+ # Get a registry of temp tables
150
+ def temp_tables
151
+ @temp_tables ||= {}
152
+ end
153
+
154
+ # Called when a batch job finishes, allowing for cleanup to occur
155
+ def finish
156
+ temp_tables.each do |temp_table, mapping|
157
+ actual_table = mapping[:table]
158
+ #puts "move #{temp_table} to #{actual_table}"
159
+ conn = mapping[:connection]
160
+ conn.transaction do
161
+ conn.rename_table(actual_table, "#{actual_table}_old")
162
+ conn.rename_table(temp_table, actual_table)
163
+ conn.drop_table("#{actual_table}_old")
164
+ end
165
+ end
166
+ end
167
+
168
+ # Return true if using temp tables
169
+ def use_temp_tables?
170
+ use_temp_tables ? true : false
171
+ end
172
+
173
# Modify the table name if necessary.
#
# Without temp tables the given name is returned untouched. With temp tables
# enabled the name "tmp_<table_name>" is returned; the first time a given temp
# name is requested, any existing table of that name is dropped (best effort),
# the real table is copied to it via +connection+, and the mapping is recorded
# in +temp_tables+.
#
# Uses a plain local instead of ActiveSupport's +returning+, which was
# deprecated and later removed from ActiveSupport.
def table(table_name, connection)
  return table_name unless use_temp_tables?

  temp_table_name = "tmp_#{table_name}"
  if temp_tables[temp_table_name].nil?
    # Create the temp table and add it to the mapping. Dropping is deliberately
    # best-effort: the temp table usually does not exist yet.
    begin
      connection.drop_table(temp_table_name)
    rescue
    end
    connection.copy_table(table_name, temp_table_name)
    temp_tables[temp_table_name] = {
      :table => table_name,
      :connection => connection
    }
  end
  temp_table_name
end
191
+
192
+ protected
193
+ # Hash of database connections that can be used throughout the ETL
194
+ # process
195
+ def connections
196
+ @connections ||= {}
197
+ end
198
+
199
+ # Establish the named connection and return the database specific connection
200
+ def establish_connection(name)
201
+ logger.debug "Establishing connection to #{name}"
202
+ conn_config = ETL::Base.configurations[name.to_s]
203
+ raise ETL::ETLError, "No connection found for #{name}" unless conn_config
204
+ connection_method = "#{conn_config['adapter']}_connection"
205
+ ETL::Base.send(connection_method, conn_config)
206
+ end
207
+ end # class << self
208
+
209
+ # Say the specified message, with a newline
210
+ def say(message)
211
+ say_without_newline(message + "\n")
212
+ end
213
+
214
+ # Say the specified message without a newline
215
+ def say_without_newline(message)
216
+ if ETL::Engine.realtime_activity
217
+ $stdout.print message
218
+ $stdout.flush
219
+ end
220
+ end
221
+
222
+ # Say the message on its own line
223
+ def say_on_own_line(message)
224
+ say("\n" + message)
225
+ end
226
+
227
+ # Array of errors encountered during execution of the ETL process
228
+ def errors
229
+ @errors ||= []
230
+ end
231
+
232
+ # Get a Hash of benchmark values where each value represents the total
233
+ # amount of time in seconds spent processing in that portion of the ETL
234
+ # pipeline. Keys include:
235
+ # * <tt>:transforms</tt>
236
+ # * <tt>:after_reads</tt>
237
+ # * <tt>:before_writes</tt>
238
+ # * <tt>:writes</tt>
239
+ def benchmarks
240
+ @benchmarks ||= {
241
+ :transforms => 0,
242
+ :after_reads => 0,
243
+ :before_writes => 0,
244
+ :writes => 0,
245
+ }
246
+ end
247
+
248
# Process a file, control object or batch object. Acceptable values for
# file are:
# * Path to a file
# * File object
# * ETL::Control::Control instance
# * ETL::Batch::Batch instance
#
# A File is dispatched on its extension: ".ctl" goes to process_control and
# ".ebf" to process_batch; a File with any other extension is ignored. A
# String path is opened, processed as a File, and closed again afterwards.
#
# Raises RuntimeError for any other kind of object.
def process(file)
  case file
  when String
    # Block form so the handle opened here is always closed.
    File.open(file) { |opened| process(opened) }
  when File
    # Escaped dot and \z anchor: only a real ".ctl" / ".ebf" suffix matches.
    process_control(file) if file.path =~ /\.ctl\z/
    process_batch(file) if file.path =~ /\.ebf\z/
  when ETL::Control::Control
    process_control(file)
  when ETL::Batch::Batch
    process_batch(file)
  else
    raise RuntimeError, "Process object must be a String, File, Control instance or Batch instance"
  end
end
270
+
271
+ protected
272
+ # Process the specified batch file
273
+ def process_batch(batch)
274
+ batch = ETL::Batch::Batch.resolve(batch, self)
275
+ say "Processing batch #{batch.file}"
276
+
277
+ ETL::Engine.batch = ETL::Execution::Batch.create!(
278
+ :batch_file => batch.file,
279
+ :status => 'executing'
280
+ )
281
+
282
+ batch.execute
283
+
284
+ ETL::Engine.batch.completed_at = Time.now
285
+ ETL::Engine.batch.status = (errors.length > 0 ? 'completed with errors' : 'completed')
286
+ ETL::Engine.batch.save!
287
+ end
288
+
289
+ # Process the specified control file
290
+ def process_control(control)
291
+ control = ETL::Control::Control.resolve(control)
292
+ say_on_own_line "Processing control #{control.file}"
293
+
294
+ ETL::Engine.job = ETL::Execution::Job.create!(
295
+ :control_file => control.file,
296
+ :status => 'executing',
297
+ :batch_id => ETL::Engine.batch ? ETL::Engine.batch.id : nil
298
+ )
299
+
300
+ execute_dependencies(control)
301
+
302
+ start_time = Time.now
303
+ pre_process(control)
304
+ sources = control.sources
305
+ destinations = control.destinations
306
+
307
+ say "Skipping bulk import" if Engine.skip_bulk_import
308
+
309
+ sources.each do |source|
310
+ Engine.current_source = source
311
+ Engine.logger.debug "Processing source #{source}"
312
+ say "Source: #{source}"
313
+ say "Limiting enabled: #{Engine.limit}" if Engine.limit != nil
314
+ say "Offset enabled: #{Engine.offset}" if Engine.offset != nil
315
+ source.each_with_index do |row, index|
316
+ # Break out of the row loop if the +Engine.limit+ is specified and
317
+ # the number of rows read exceeds that value.
318
+ if Engine.limit != nil && Engine.rows_read >= Engine.limit
319
+ puts "Reached limit of #{Engine.limit}"
320
+ break
321
+ end
322
+
323
+ Engine.logger.debug "Row #{index}: #{row.inspect}"
324
+ Engine.rows_read += 1
325
+ Engine.current_source_row = index + 1
326
+ say_without_newline "." if Engine.realtime_activity && index > 0 && index % 1000 == 0
327
+
328
+ # At this point a single row may be turned into multiple rows via row
329
+ # processors all code after this line should work with the array of
330
+ # rows rather than the single row
331
+ rows = [row]
332
+
333
+ t = Benchmark.realtime do
334
+ begin
335
+ Engine.logger.debug "Processing after read"
336
+ control.after_read_processors.each do |processor|
337
+ processed_rows = []
338
+ rows.each do |row|
339
+ processed_rows << processor.process(row)
340
+ end
341
+ rows = processed_rows.flatten
342
+ end
343
+ rescue => e
344
+ msg = "Error processing rows after read from #{Engine.current_source} on line #{Engine.current_source_row}: #{e}"
345
+ errors << msg
346
+ Engine.logger.error(msg)
347
+ exceeded_error_threshold?(control) ? break : next
348
+ end
349
+ end
350
+ benchmarks[:after_reads] += t unless t.nil?
351
+
352
+ t = Benchmark.realtime do
353
+ begin
354
+ Engine.logger.debug "Executing transforms"
355
+ rows.each do |row|
356
+ control.transforms.each do |transform|
357
+ name = transform.name.to_sym
358
+ row[name] = transform.transform(name, row[name], row)
359
+ end
360
+ end
361
+ rescue ResolverError => e
362
+ Engine.logger.error(e.message)
363
+ errors << e.message
364
+ rescue => e
365
+ msg = "Error transforming from #{Engine.current_source} on line #{Engine.current_source_row}: #{e}"
366
+ errors << msg
367
+ Engine.logger.error(msg)
368
+ e.backtrace.each { |line| Engine.logger.error(line) }
369
+ ensure
370
+ begin
371
+ exceeded_error_threshold?(control) ? break : next
372
+ rescue => inner_error
373
+ puts inner_error
374
+ end
375
+ end
376
+ end
377
+ benchmarks[:transforms] += t unless t.nil?
378
+
379
+ t = Benchmark.realtime do
380
+ begin
381
+ # execute row-level "before write" processing
382
+ Engine.logger.debug "Processing before write"
383
+ control.before_write_processors.each do |processor|
384
+ processed_rows = []
385
+ rows.each { |row| processed_rows << processor.process(row) }
386
+ rows = processed_rows.flatten.compact
387
+ end
388
+ rescue => e
389
+ msg = "Error processing rows before write from #{Engine.current_source} on line #{Engine.current_source_row}: #{e}"
390
+ errors << msg
391
+ Engine.logger.error(msg)
392
+ e.backtrace.each { |line| Engine.logger.error(line) }
393
+ exceeded_error_threshold?(control) ? break : next
394
+ end
395
+ end
396
+ benchmarks[:before_writes] += t unless t.nil?
397
+
398
+ t = Benchmark.realtime do
399
+ begin
400
+ # write the row to the destination
401
+ destinations.each_with_index do |destination, index|
402
+ Engine.current_destination = destination
403
+ rows.each do |row|
404
+ destination.write(row)
405
+ Engine.rows_written += 1 if index == 0
406
+ end
407
+ end
408
+ rescue => e
409
+ msg = "Error writing to #{Engine.current_destination}: #{e}"
410
+ errors << msg
411
+ Engine.logger.error msg
412
+ e.backtrace.each { |line| Engine.logger.error(line) }
413
+ exceeded_error_threshold?(control) ? break : next
414
+ end
415
+ end
416
+ benchmarks[:writes] += t unless t.nil?
417
+ end
418
+
419
+ if exceeded_error_threshold?(control)
420
+ say_on_own_line "Exiting due to exceeding error threshold: #{control.error_threshold}"
421
+ return
422
+ end
423
+
424
+ end
425
+
426
+ destinations.each do |destination|
427
+ destination.close
428
+ end
429
+
430
+ say_on_own_line "Executing before post-process screens"
431
+ begin
432
+ execute_screens(control)
433
+ rescue FatalScreenError => e
434
+ say "Fatal screen error during job execution: #{e.message}"
435
+ exit
436
+ rescue ScreenError => e
437
+ say "Screen error during job execution: #{e.message}"
438
+ return
439
+ else
440
+ say "Screens passed"
441
+ end
442
+
443
+ post_process(control)
444
+
445
+ if sources.length > 0
446
+ say_on_own_line "Read #{Engine.rows_read} lines from sources"
447
+ end
448
+ if destinations.length > 0
449
+ say "Wrote #{Engine.rows_written} lines to destinations"
450
+ end
451
+
452
+ say_on_own_line "Executing after post-process screens"
453
+ begin
454
+ execute_screens(control, :after_post_process)
455
+ rescue FatalScreenError => e
456
+ say "Fatal screen error during job execution: #{e.message}"
457
+ exit
458
+ rescue ScreenError => e
459
+ say "Screen error during job execution: #{e.message}"
460
+ return
461
+ else
462
+ say "Screens passed"
463
+ end
464
+
465
+ say_on_own_line "Completed #{control.file} in #{distance_of_time_in_words(start_time)} with #{errors.length} errors."
466
+ say "Processing average: #{Engine.average_rows_per_second} rows/sec)"
467
+
468
+ say "Avg after_reads: #{Engine.rows_read/benchmarks[:after_reads]} rows/sec" if benchmarks[:after_reads] > 0
469
+ say "Avg before_writes: #{Engine.rows_read/benchmarks[:before_writes]} rows/sec" if benchmarks[:before_writes] > 0
470
+ say "Avg transforms: #{Engine.rows_read/benchmarks[:transforms]} rows/sec" if benchmarks[:transforms] > 0
471
+ say "Avg writes: #{Engine.rows_read/benchmarks[:writes]} rows/sec" if benchmarks[:writes] > 0
472
+
473
+ # say "Avg time writing execution records: #{ETL::Execution::Record.average_time_spent}"
474
+ #
475
+ # ETL::Transform::Transform.benchmarks.each do |klass, t|
476
+ # say "Avg #{klass}: #{Engine.rows_read/t} rows/sec"
477
+ # end
478
+
479
+ ETL::Engine.job.completed_at = Time.now
480
+ ETL::Engine.job.status = (errors.length > 0 ? 'completed with errors' : 'completed')
481
+ ETL::Engine.job.save!
482
+ end
483
+
484
+ private
485
+ # Return true if the error threshold is exceeded
486
+ def exceeded_error_threshold?(control)
487
+ errors.length > control.error_threshold
488
+ end
489
+
490
+ # Execute all preprocessors
491
+ def pre_process(control)
492
+ Engine.logger.debug "Pre-processing #{control.file}"
493
+ control.pre_processors.each do |processor|
494
+ processor.process
495
+ end
496
+ Engine.logger.debug "Pre-processing complete"
497
+ end
498
+
499
+ # Execute all postprocessors
500
+ def post_process(control)
501
+ say_on_own_line "Executing post processes"
502
+ Engine.logger.debug "Post-processing #{control.file}"
503
+ control.post_processors.each do |processor|
504
+ processor.process
505
+ end
506
+ Engine.logger.debug "Post-processing complete"
507
+ say "Post-processing complete"
508
+ end
509
+
510
# Execute all dependencies of the control, in order, before the control
# itself is processed.
#
# Each entry of the (flattened) dependency list is either a Symbol, which
# names a control file "<symbol>.ctl", or a String, which is used as the file
# path as given. Any other type raises a RuntimeError.
def execute_dependencies(control)
  Engine.logger.debug "Executing dependencies"
  control.dependencies.flatten.each do |dependency|
    f = case dependency
        when Symbol then dependency.to_s + '.ctl'
        when String then dependency
        else raise "Invalid dependency type: #{dependency.class}"
        end
    # Report the resolved file name for both Symbol and String dependencies.
    Engine.logger.debug "Executing dependency: #{f}"
    say "Executing dependency: #{f}"
    process(f)
  end
end
529
+
530
+ # Execute all screens
531
+ def execute_screens(control, timing = :before_post_process)
532
+ screens = case timing
533
+ when :after_post_process
534
+ control.after_post_process_screens
535
+ else # default to before post-process screens
536
+ control.screens
537
+ end
538
+ [:fatal,:error,:warn].each do |type|
539
+ screens[type].each do |block|
540
+ begin
541
+ block.call
542
+ rescue => e
543
+ case type
544
+ when :fatal
545
+ raise FatalScreenError, e
546
+ when :error
547
+ raise ScreenError, e
548
+ when :warn
549
+ say "Screen warning: #{e}"
550
+ end
551
+ end
552
+ end
553
+ end
554
+ end
555
+ end
556
+ end