activewarehouse-etl-sgonyea 0.9.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (200) hide show
  1. data/.gitignore +9 -0
  2. data/0.9-UPGRADE +6 -0
  3. data/CHANGELOG +236 -0
  4. data/Gemfile +4 -0
  5. data/HOW_TO_RELEASE +13 -0
  6. data/LICENSE +7 -0
  7. data/README.textile +111 -0
  8. data/Rakefile +103 -0
  9. data/TODO +28 -0
  10. data/active_support_logger.patch +78 -0
  11. data/activewarehouse-etl.gemspec +36 -0
  12. data/bin/etl +28 -0
  13. data/bin/etl.cmd +8 -0
  14. data/examples/database.example.yml +16 -0
  15. data/lib/etl.rb +97 -0
  16. data/lib/etl/batch.rb +2 -0
  17. data/lib/etl/batch/batch.rb +111 -0
  18. data/lib/etl/batch/directives.rb +65 -0
  19. data/lib/etl/builder.rb +2 -0
  20. data/lib/etl/builder/date_dimension_builder.rb +96 -0
  21. data/lib/etl/builder/time_dimension_builder.rb +31 -0
  22. data/lib/etl/commands/etl.rb +89 -0
  23. data/lib/etl/control.rb +3 -0
  24. data/lib/etl/control/control.rb +405 -0
  25. data/lib/etl/control/destination.rb +438 -0
  26. data/lib/etl/control/destination/csv_destination.rb +113 -0
  27. data/lib/etl/control/destination/database_destination.rb +97 -0
  28. data/lib/etl/control/destination/excel_destination.rb +91 -0
  29. data/lib/etl/control/destination/file_destination.rb +126 -0
  30. data/lib/etl/control/destination/insert_update_database_destination.rb +136 -0
  31. data/lib/etl/control/destination/update_database_destination.rb +109 -0
  32. data/lib/etl/control/destination/yaml_destination.rb +74 -0
  33. data/lib/etl/control/source.rb +132 -0
  34. data/lib/etl/control/source/database_source.rb +224 -0
  35. data/lib/etl/control/source/enumerable_source.rb +11 -0
  36. data/lib/etl/control/source/file_source.rb +90 -0
  37. data/lib/etl/control/source/model_source.rb +39 -0
  38. data/lib/etl/core_ext.rb +1 -0
  39. data/lib/etl/core_ext/time.rb +5 -0
  40. data/lib/etl/core_ext/time/calculations.rb +42 -0
  41. data/lib/etl/engine.rb +582 -0
  42. data/lib/etl/execution.rb +19 -0
  43. data/lib/etl/execution/base.rb +8 -0
  44. data/lib/etl/execution/batch.rb +10 -0
  45. data/lib/etl/execution/job.rb +8 -0
  46. data/lib/etl/execution/migration.rb +90 -0
  47. data/lib/etl/generator.rb +2 -0
  48. data/lib/etl/generator/generator.rb +20 -0
  49. data/lib/etl/generator/surrogate_key_generator.rb +39 -0
  50. data/lib/etl/http_tools.rb +139 -0
  51. data/lib/etl/parser.rb +11 -0
  52. data/lib/etl/parser/apache_combined_log_parser.rb +49 -0
  53. data/lib/etl/parser/csv_parser.rb +93 -0
  54. data/lib/etl/parser/excel_parser.rb +112 -0
  55. data/lib/etl/parser/fixed_width_parser.rb +65 -0
  56. data/lib/etl/parser/nokogiri_xml_parser.rb +83 -0
  57. data/lib/etl/parser/parser.rb +41 -0
  58. data/lib/etl/parser/sax_parser.rb +218 -0
  59. data/lib/etl/parser/xml_parser.rb +65 -0
  60. data/lib/etl/processor.rb +11 -0
  61. data/lib/etl/processor/block_processor.rb +14 -0
  62. data/lib/etl/processor/bulk_import_processor.rb +94 -0
  63. data/lib/etl/processor/check_exist_processor.rb +80 -0
  64. data/lib/etl/processor/check_unique_processor.rb +39 -0
  65. data/lib/etl/processor/copy_field_processor.rb +26 -0
  66. data/lib/etl/processor/database_join_processor.rb +82 -0
  67. data/lib/etl/processor/encode_processor.rb +55 -0
  68. data/lib/etl/processor/ensure_fields_presence_processor.rb +24 -0
  69. data/lib/etl/processor/escape_csv_processor.rb +77 -0
  70. data/lib/etl/processor/filter_row_processor.rb +51 -0
  71. data/lib/etl/processor/ftp_downloader_processor.rb +68 -0
  72. data/lib/etl/processor/ftp_uploader_processor.rb +65 -0
  73. data/lib/etl/processor/hierarchy_exploder_processor.rb +55 -0
  74. data/lib/etl/processor/imapattachment_downloader_processor.rb +91 -0
  75. data/lib/etl/processor/pop3attachment_downloader_processor.rb +90 -0
  76. data/lib/etl/processor/print_row_processor.rb +12 -0
  77. data/lib/etl/processor/processor.rb +25 -0
  78. data/lib/etl/processor/rename_processor.rb +24 -0
  79. data/lib/etl/processor/require_non_blank_processor.rb +26 -0
  80. data/lib/etl/processor/row_processor.rb +27 -0
  81. data/lib/etl/processor/sequence_processor.rb +23 -0
  82. data/lib/etl/processor/sftp_downloader_processor.rb +63 -0
  83. data/lib/etl/processor/sftp_uploader_processor.rb +63 -0
  84. data/lib/etl/processor/surrogate_key_processor.rb +53 -0
  85. data/lib/etl/processor/truncate_processor.rb +40 -0
  86. data/lib/etl/processor/zip_file_processor.rb +27 -0
  87. data/lib/etl/row.rb +20 -0
  88. data/lib/etl/screen.rb +14 -0
  89. data/lib/etl/screen/row_count_screen.rb +20 -0
  90. data/lib/etl/transform.rb +2 -0
  91. data/lib/etl/transform/block_transform.rb +13 -0
  92. data/lib/etl/transform/calculation_transform.rb +71 -0
  93. data/lib/etl/transform/date_to_string_transform.rb +20 -0
  94. data/lib/etl/transform/decode_transform.rb +51 -0
  95. data/lib/etl/transform/default_transform.rb +20 -0
  96. data/lib/etl/transform/foreign_key_lookup_transform.rb +211 -0
  97. data/lib/etl/transform/hierarchy_lookup_transform.rb +49 -0
  98. data/lib/etl/transform/md5_transform.rb +13 -0
  99. data/lib/etl/transform/ordinalize_transform.rb +14 -0
  100. data/lib/etl/transform/sha1_transform.rb +13 -0
  101. data/lib/etl/transform/split_fields_transform.rb +27 -0
  102. data/lib/etl/transform/string_to_date_time_transform.rb +14 -0
  103. data/lib/etl/transform/string_to_date_transform.rb +16 -0
  104. data/lib/etl/transform/string_to_time_transform.rb +11 -0
  105. data/lib/etl/transform/transform.rb +61 -0
  106. data/lib/etl/transform/trim_transform.rb +26 -0
  107. data/lib/etl/transform/type_transform.rb +35 -0
  108. data/lib/etl/util.rb +59 -0
  109. data/lib/etl/version.rb +3 -0
  110. data/test-matrix.yml +10 -0
  111. data/test/.gitignore +1 -0
  112. data/test/.ignore +2 -0
  113. data/test/all.ebf +6 -0
  114. data/test/apache_combined_log.ctl +11 -0
  115. data/test/batch_test.rb +41 -0
  116. data/test/batch_with_error.ebf +6 -0
  117. data/test/batched1.ctl +0 -0
  118. data/test/batched2.ctl +0 -0
  119. data/test/block_processor.ctl +6 -0
  120. data/test/block_processor_error.ctl +1 -0
  121. data/test/block_processor_pre_post_process.ctl +4 -0
  122. data/test/block_processor_remove_rows.ctl +5 -0
  123. data/test/block_processor_test.rb +38 -0
  124. data/test/check_exist_processor_test.rb +92 -0
  125. data/test/check_unique_processor_test.rb +40 -0
  126. data/test/config/Gemfile.rails-2.3.x +3 -0
  127. data/test/config/Gemfile.rails-2.3.x.lock +53 -0
  128. data/test/config/Gemfile.rails-3.0.x +3 -0
  129. data/test/config/Gemfile.rails-3.0.x.lock +61 -0
  130. data/test/config/common.rb +29 -0
  131. data/test/connection/mysql/connection.rb +9 -0
  132. data/test/connection/mysql/schema.sql +37 -0
  133. data/test/connection/postgresql/connection.rb +13 -0
  134. data/test/connection/postgresql/schema.sql +40 -0
  135. data/test/control_test.rb +43 -0
  136. data/test/data/apache_combined_log.txt +3 -0
  137. data/test/data/bulk_import.txt +3 -0
  138. data/test/data/bulk_import_with_empties.txt +3 -0
  139. data/test/data/decode.txt +3 -0
  140. data/test/data/delimited.txt +3 -0
  141. data/test/data/encode_source_latin1.txt +2 -0
  142. data/test/data/excel.xls +0 -0
  143. data/test/data/excel2.xls +0 -0
  144. data/test/data/fixed_width.txt +3 -0
  145. data/test/data/multiple_delimited_1.txt +3 -0
  146. data/test/data/multiple_delimited_2.txt +3 -0
  147. data/test/data/nokogiri.xml +38 -0
  148. data/test/data/people.txt +3 -0
  149. data/test/data/sax.xml +14 -0
  150. data/test/data/xml.xml +16 -0
  151. data/test/database_join_processor_test.rb +43 -0
  152. data/test/date_dimension_builder_test.rb +96 -0
  153. data/test/delimited.ctl +30 -0
  154. data/test/delimited_absolute.ctl +31 -0
  155. data/test/delimited_destination_db.ctl +23 -0
  156. data/test/delimited_excel.ctl +31 -0
  157. data/test/delimited_insert_update.ctl +34 -0
  158. data/test/delimited_update.ctl +34 -0
  159. data/test/delimited_with_bulk_load.ctl +34 -0
  160. data/test/destination_test.rb +275 -0
  161. data/test/directive_test.rb +23 -0
  162. data/test/encode_processor_test.rb +32 -0
  163. data/test/engine_test.rb +78 -0
  164. data/test/ensure_fields_presence_processor_test.rb +28 -0
  165. data/test/errors.ctl +24 -0
  166. data/test/etl_test.rb +42 -0
  167. data/test/excel.ctl +24 -0
  168. data/test/excel2.ctl +25 -0
  169. data/test/fixed_width.ctl +35 -0
  170. data/test/foreign_key_lookup_transform_test.rb +50 -0
  171. data/test/generator_test.rb +14 -0
  172. data/test/inline_parser.ctl +17 -0
  173. data/test/mocks/mock_destination.rb +26 -0
  174. data/test/mocks/mock_source.rb +25 -0
  175. data/test/model_source.ctl +14 -0
  176. data/test/multiple_delimited.ctl +22 -0
  177. data/test/multiple_source_delimited.ctl +39 -0
  178. data/test/nokogiri_all.ctl +35 -0
  179. data/test/nokogiri_select.ctl +35 -0
  180. data/test/nokogiri_test.rb +35 -0
  181. data/test/parser_test.rb +224 -0
  182. data/test/performance/delimited.ctl +30 -0
  183. data/test/processor_test.rb +44 -0
  184. data/test/row_processor_test.rb +17 -0
  185. data/test/sax.ctl +26 -0
  186. data/test/scd/1.txt +1 -0
  187. data/test/scd/2.txt +1 -0
  188. data/test/scd/3.txt +1 -0
  189. data/test/scd_test.rb +257 -0
  190. data/test/scd_test_type_1.ctl +43 -0
  191. data/test/scd_test_type_2.ctl +34 -0
  192. data/test/screen_test.rb +9 -0
  193. data/test/screen_test_error.ctl +3 -0
  194. data/test/screen_test_fatal.ctl +3 -0
  195. data/test/source_test.rb +154 -0
  196. data/test/test_helper.rb +37 -0
  197. data/test/transform_test.rb +101 -0
  198. data/test/truncate_processor_test.rb +37 -0
  199. data/test/xml.ctl +31 -0
  200. metadata +370 -0
@@ -0,0 +1,11 @@
1
module ETL #:nodoc:
  module Control #:nodoc:
    # Source whose rows come from the object stored under the
    # <tt>:enumerable</tt> configuration key.
    class EnumerableSource < ETL::Control::Source
      # Hand every element of the configured enumerable to the given block.
      def each(&block)
        rows = configuration[:enumerable]
        rows.each(&block)
      end
    end
  end
end
@@ -0,0 +1,90 @@
1
module ETL #:nodoc:
  module Control #:nodoc:
    # A File source: rows are produced by a parser that is constructed with
    # this source as its argument.
    class FileSource < Source
      # The number of lines to skip, default is 0
      attr_accessor :skip_lines

      # Accessor for the underlying parser
      attr_accessor :parser

      # The source file
      attr_accessor :file

      # Initialize the source
      #
      # Configuration options:
      # * <tt>:file</tt>: The source file
      # * <tt>:parser</tt>: One of the following: a parser name as a String or
      #   symbol, a class which extends from Parser, a Hash with :name and
      #   optionally an :options key. Whether or not the parser uses the
      #   options is dependent on which parser is used. See the documentation
      #   for each parser for information on what options it accepts.
      # * <tt>:skip_lines</tt>: The number of lines to skip (defaults to 0)
      # * <tt>:store_locally</tt>: Set to false to not store a copy of the
      #   source data locally for archival
      #
      # Raises ControlError when <tt>:parser</tt> is none of the accepted types.
      def initialize(control, configuration, definition)
        super
        configure
      end

      # Get a String identifier for the source (the configured file)
      def to_s
        file
      end

      # Get the local storage directory: the file's base name without its
      # extension, underneath +local_base+.
      def local_directory
        File.join(local_base, File.basename(file, File.extname(file)))
      end

      # Yields each parsed row, wrapped in an ETL::Row whose +source+ is this
      # object. When ETL::Engine.offset is set, that many leading rows are
      # consumed from the parser without being yielded.
      def each
        skipped = 0
        copy_sources if @store_locally
        @parser.each do |row|
          if ETL::Engine.offset && skipped < ETL::Engine.offset
            skipped += 1
          else
            row = ETL::Row[row]
            row.source = self
            yield row
          end
        end
      end

      private
      # Copy source data to a local directory structure. A relative +file+
      # is resolved against the directory of the control file. Every
      # non-directory glob match is copied to its own sequenced local file
      # and an empty trigger file is created next to it.
      def copy_sources
        sequence = 0
        path = Pathname.new(file)
        path = path.absolute? ? path : Pathname.new(File.dirname(control.file)) + path
        Pathname.glob(path).each do |source_path|
          next if source_path.directory?
          lf = local_file(sequence)
          FileUtils.cp(source_path, lf)
          # Opening in 'w' mode creates (or truncates) the trigger file.
          File.open(local_file_trigger(lf), 'w') { }
          sequence += 1
        end
      end

      # Configure the source from the configuration Hash: sets the file,
      # builds the parser and defaults <tt>:skip_lines</tt> to 0 (the default
      # is also written back into the configuration Hash).
      def configure
        @file = configuration[:file]
        case configuration[:parser]
        when Class
          @parser = configuration[:parser].new(self)
        when String, Symbol
          @parser = ETL::Parser::Parser.class_for_name(configuration[:parser]).new(self)
        when Hash
          name = configuration[:parser][:name]
          options = configuration[:parser][:options]
          @parser = ETL::Parser::Parser.class_for_name(name).new(self, options)
        else
          raise ControlError, "Configuration option :parser must be a Class, String, Symbol or Hash"
        end
        @skip_lines = configuration[:skip_lines] ||= 0
      end
    end
  end
end
@@ -0,0 +1,39 @@
1
+ #RAILS_ENV = 'development'
2
+ #require '../config/environment'
3
+
4
module ETL #:nodoc:
  module Control #:nodoc:
    # Source that reads its rows from the model named by the
    # <tt>:model</tt> configuration option.
    class ModelSource < Source

      # Column names to read, as Symbols: the elements of an Array
      # definition or the keys of a Hash definition. Any other definition
      # type raises a RuntimeError.
      def columns
        spec = definition
        return spec.collect(&:to_sym) if Array === spec
        return spec.keys.collect(&:to_sym) if Hash === spec
        raise "Definition must be either an Array or a Hash"
      end

      # The configured <tt>:model</tt> value.
      def railsmodel
        configuration[:model]
      end

      # The configured <tt>:order</tt> value, falling back to "id".
      def order
        configured = configuration[:order]
        configured ? configured : "id"
      end

      # Resolve the model class from its name, load all records in the
      # configured order and yield one ETL::Row per record holding the
      # values of the requested columns.
      def each(&block)
        model_class = railsmodel.to_s.camelize.constantize
        records = model_class.find(:all, :order => order)
        records.each do |record|
          result_row = ETL::Row.new
          result_row.source = self
          columns.each do |column|
            result_row[column.to_sym] = record.send(column)
          end
          yield result_row
        end
      end
    end
  end
end
@@ -0,0 +1 @@
1
+ require 'etl/core_ext/time'
@@ -0,0 +1,5 @@
1
+ require File.dirname(__FILE__) + '/time/calculations'
2
+
3
# Mix the ETL calendar and fiscal-year helpers into the core Time class.
::Time.send(:include, ETL::CoreExtensions::Time::Calculations) #:nodoc:
@@ -0,0 +1,42 @@
1
+ #Updated by Jack Hong on 04/05/08
2
+
3
module ETL #:nodoc:
  module CoreExtensions #:nodoc:
    module Time #:nodoc:
      # Enables the use of time calculations within Time itself.
      #
      # The including object must respond to +year+, +month+ and +yday+.
      # Every fiscal method takes +offset_month+, the calendar month (1-12)
      # in which the fiscal year starts; the default is 10 (October).
      module Calculations
        # Week of the calendar year (1-52). Days that would fall into a
        # 53rd week are folded into week 52.
        def week
          [((yday - 1) / 7) + 1, 52].min
        end

        # Quarter of the calendar year (1-4).
        def quarter
          ((month - 1) / 3) + 1
        end

        # Week of the fiscal year (1-52). Days that would fall into a 53rd
        # week are folded into week 52.
        def fiscal_year_week(offset_month=10)
          [((fiscal_year_yday(offset_month) - 1) / 7) + 1, 52].min
        end

        # Month of the fiscal year (1-12), where 1 is +offset_month+.
        def fiscal_year_month(offset_month=10)
          shifted_month = month - (offset_month - 1)
          shifted_month += 12 if shifted_month <= 0
          shifted_month
        end

        # Quarter of the fiscal year (1-4).
        def fiscal_year_quarter(offset_month=10)
          ((fiscal_year_month(offset_month) - 1) / 3) + 1
        end

        # The fiscal year: the following calendar year once +offset_month+
        # has been reached, otherwise the current calendar year.
        def fiscal_year(offset_month=10)
          month >= offset_month ? year + 1 : year
        end

        # Day of the fiscal year, where 1 is the first day of +offset_month+.
        #
        # Dates before this calendar year's fiscal start belong to the fiscal
        # year that began in the previous calendar year, so the count is the
        # remainder of that previous year plus the days elapsed in this one.
        # The previous year's real length is used (rather than a fixed 365)
        # so that leap years do not shift the result by one day.
        def fiscal_year_yday(offset_month=10)
          start_yday = ::Time.utc(year, offset_month, 1).yday
          return yday - start_yday + 1 if yday >= start_yday

          previous_year = year - 1
          previous_start_yday = ::Time.utc(previous_year, offset_month, 1).yday
          previous_year_length = ::Time.utc(previous_year, 12, 31).yday
          (previous_year_length - previous_start_yday + 1) + yday
        end
      end
    end
  end
end
@@ -0,0 +1,582 @@
1
module ETL #:nodoc:

  # ActiveRecord base class used by the engine to hold the ETL database
  # configurations and to open named connections.
  class Base < ActiveRecord::Base
  end

  # The main ETL engine class
  class Engine
    include ETL::Util

    class << self
      # Initialization that is run when a job is executed. Only the first
      # call has any effect; later calls return without doing anything.
      #
      # Options:
      # * <tt>:limit</tt>: Limit the number of records returned from sources
      # * <tt>:offset</tt>: Number of leading source records to skip
      # * <tt>:newlog</tt>: Set to true to overwrite the log file instead of
      #   appending to it
      # * <tt>:log_dir</tt>: Directory in which the log file is created
      #   (defaults to the current working directory)
      # * <tt>:skip_bulk_import</tt>: Set to true to skip bulk import
      # * <tt>:read_locally</tt>: Set to true to read from the local cache
      # * <tt>:rails_root</tt>: Set to the rails root to boot rails
      # * <tt>:config</tt>: Path of the database configuration YAML file.
      #   Defaults to 'database.yml'; if that path does not exist,
      #   'config/database.yml' is used instead.
      # * <tt>:execution_conf</tt>: Name of the connection used for the
      #   execution tables (defaults to <tt>:etl_execution</tt>)
      def init(options={})
        unless @initialized
          puts "initializing ETL engine\n\n"
          @limit = options[:limit]
          @offset = options[:offset]
          @log_write_mode = 'w' if options[:newlog]
          @skip_bulk_import = options[:skip_bulk_import]
          @read_locally = options[:read_locally]
          @rails_root = options[:rails_root]
          @log_dir = options[:log_dir]

          require File.join(@rails_root, 'config/environment') if @rails_root
          options[:config] ||= 'database.yml'
          options[:config] = 'config/database.yml' unless File.exist?(options[:config])
          # The configuration file is run through ERB before being parsed as YAML.
          database_configuration = YAML::load(ERB.new(IO.read(options[:config])).result + "\n")
          ActiveRecord::Base.configurations.merge!(database_configuration)
          ETL::Base.configurations = HashWithIndifferentAccess.new(database_configuration)
          #puts "configurations in init: #{ActiveRecord::Base.configurations.inspect}"

          require 'etl/execution'
          ETL::Execution::Base.establish_connection(options[:execution_conf] || :etl_execution)
          ETL::Execution::Execution.migrate

          @initialized = true
        end
      end

      # Process the specified file. Acceptable values for file are:
      # * Path to a file
      # * File object
      # * ETL::Control::Control instance
      # * ETL::Batch::Batch instance
      #
      # The process command will accept either a .ctl Control file or a .ebf
      # ETL Batch File.
      def process(file)
        new().process(file)
      end

      # Set to true to log to a new, timestamped file (etl_<timestamp>.log)
      attr_accessor :timestamped_log

      # Accessor for the log write mode. Default is 'a' for append.
      attr_writer :log_write_mode
      def log_write_mode
        @log_write_mode ||= 'a'
      end

      # A logger for the engine
      attr_writer :logger

      # Lazily builds the logger on first use. The log file lives in the
      # configured log directory, or in the current working directory when
      # no log directory was given.
      def logger #:nodoc:
        unless @logger
          if timestamped_log
            logfile = File.join(*[@log_dir, "etl_#{timestamp}.log"].compact)

            @logger = Logger.new(logfile)
          else
            # A relative file name is used on purpose: a leading slash would
            # resolve to the filesystem root when @log_dir is nil.
            logfile = File.join(*[@log_dir, 'etl.log'].compact)

            @logger = Logger.new(File.open(logfile, log_write_mode))
          end
          @logger.level = Logger::WARN
          @logger.formatter = Logger::Formatter.new
        end
        @logger
      end

      # Get a timestamp value as a string
      def timestamp
        Time.now.strftime("%Y%m%d%H%M%S")
      end

      # The current source
      attr_accessor :current_source

      # The current source row
      attr_accessor :current_source_row

      # The current destination
      attr_accessor :current_destination

      # Set to true to activate realtime activity. This will cause certain
      # information messages to be printed to STDOUT
      attr_accessor :realtime_activity

      # Accessor for the total number of rows read from sources
      attr_writer :rows_read
      def rows_read
        @rows_read ||= 0
      end

      # Accessor for the total number of rows processed
      attr_writer :rows_written
      def rows_written
        @rows_written ||= 0
      end

      # Access the current ETL::Execution::Job instance
      attr_accessor :job

      # Access the current ETL::Execution::Batch instance
      attr_accessor :batch

      # The limit on rows to load from the source, useful for testing the ETL
      # process prior to executing the entire batch. Default value is nil and
      # indicates that there is no limit
      attr_accessor :limit

      # The offset for the source to begin at, useful for testing the ETL
      # process prior to executing the entire batch. Default value is nil and
      # indicates that there is no offset
      attr_accessor :offset

      # Set to true to skip all bulk importing
      attr_accessor :skip_bulk_import

      # Set to true to read locally from the last source cache files
      attr_accessor :read_locally

      # Accessor for the average rows per second processed
      attr_accessor :average_rows_per_second

      # Get a named connection. Connections are cached by name and are
      # reconnected when they are no longer active.
      def connection(name)
        logger.debug "Retrieving connection #{name}"
        conn = connections[name] ||= establish_connection(name)
        #conn.verify!(ActiveRecord::Base.verification_timeout)
        conn.reconnect! unless conn.active?
        conn
      end

      # Set to true to use temp tables
      attr_accessor :use_temp_tables

      # Get a registry of temp tables. Keys are temp table names; values are
      # Hashes with the <tt>:table</tt> and <tt>:connection</tt> they shadow.
      def temp_tables
        @temp_tables ||= {}
      end

      # Called when a batch job finishes, allowing for cleanup to occur.
      # Every registered temp table is swapped in for its actual table
      # inside a transaction and the previous table is dropped.
      def finish
        temp_tables.each do |temp_table, mapping|
          actual_table = mapping[:table]
          #puts "move #{temp_table} to #{actual_table}"
          conn = mapping[:connection]
          conn.transaction do
            conn.rename_table(actual_table, "#{actual_table}_old")
            conn.rename_table(temp_table, actual_table)
            conn.drop_table("#{actual_table}_old")
          end
        end
      end

      # Return true if using temp tables
      def use_temp_tables?
        use_temp_tables ? true : false
      end

      # Modify the table name if necessary. When temp tables are enabled the
      # name "tmp_<table_name>" is returned; the first request for a table
      # also (re)creates the temp table as a copy and registers it.
      def table(table_name, connection)
        if use_temp_tables?
          temp_table_name = "tmp_#{table_name}"

          if temp_tables[temp_table_name].nil?
            # Create the temp table and add it to the mapping. A failure to
            # drop (e.g. the table does not exist yet) is deliberately ignored.
            begin connection.drop_table(temp_table_name); rescue; end
            connection.copy_table(table_name, temp_table_name)
            temp_tables[temp_table_name] = {
              :table => table_name,
              :connection => connection
            }
          end

          temp_table_name
        else
          table_name
        end
      end

      protected
      # Hash of database connections that can be used throughout the ETL
      # process
      def connections
        @connections ||= {}
      end

      # Establish the named connection and return the database specific connection.
      # Raises ETL::ETLError when the name is blank or not configured.
      def establish_connection(name)
        raise ETL::ETLError, "Connection with no name requested. Is there a missing :target parameter somewhere?" if name.blank?

        logger.debug "Establishing connection to #{name}"
        conn_config = ETL::Base.configurations[name.to_s]
        raise ETL::ETLError, "Cannot find connection named #{name.inspect}" unless conn_config
        connection_method = "#{conn_config['adapter']}_connection"
        ETL::Base.send(connection_method, conn_config)
      end
    end # class << self

    # Say the specified message, with a newline
    def say(message)
      say_without_newline(message + "\n")
    end

    # Say the specified message without a newline. Nothing is printed
    # unless ETL::Engine.realtime_activity is set.
    def say_without_newline(message)
      if ETL::Engine.realtime_activity
        $stdout.print message
        $stdout.flush
      end
    end

    # Say the message on its own line
    def say_on_own_line(message)
      say("\n" + message)
    end

    # Array of errors encountered during execution of the ETL process
    def errors
      @errors ||= []
    end

    # Get a Hash of benchmark values where each value represents the total
    # amount of time in seconds spent processing in that portion of the ETL
    # pipeline. Keys include:
    # * <tt>:transforms</tt>
    # * <tt>:after_reads</tt>
    # * <tt>:before_writes</tt>
    # * <tt>:writes</tt>
    def benchmarks
      @benchmarks ||= {
        :transforms => 0,
        :after_reads => 0,
        :before_writes => 0,
        :writes => 0,
      }
    end

    # Process a file, control object or batch object. Acceptable values for
    # file are:
    # * Path to a file
    # * File object
    # * ETL::Control::Control instance
    # * ETL::Batch::Batch instance
    #
    # Files are dispatched on their extension: .ctl and .etl are control
    # files, .ebf is a batch file. Any other extension or argument type
    # raises a RuntimeError.
    def process(file)
      case file
      when String
        process(File.new(file))
      when File
        # Dispatch on the real extension. Matching the whole path with a
        # loose pattern would misroute files that merely live in a directory
        # whose name contains "etl" or "ctl".
        case File.extname(file.path)
        when '.ctl', '.etl' then process_control(file)
        when '.ebf' then process_batch(file)
        else
          raise RuntimeError, "Unsupported file type - #{file.path}"
        end
      when ETL::Control::Control
        process_control(file)
      when ETL::Batch::Batch
        process_batch(file)
      else
        raise RuntimeError, "Process object must be a String, File, Control instance or Batch instance"
      end
    end

    protected
    # Process the specified batch file: records an ETL::Execution::Batch,
    # executes the batch and stores the completion time and final status.
    def process_batch(batch)
      batch = ETL::Batch::Batch.resolve(batch, self)
      say "Processing batch #{batch.file}"

      ETL::Engine.batch = ETL::Execution::Batch.create!(
        :batch_file => batch.file,
        :status => 'executing'
      )

      batch.execute

      ETL::Engine.batch.completed_at = Time.now
      ETL::Engine.batch.status = (errors.length > 0 ? 'completed with errors' : 'completed')
      ETL::Engine.batch.save!
    end

    # Process the specified control file.
    #
    # Records an ETL::Execution::Job, runs dependencies and pre-processors,
    # then pushes every source row through the after-read processors, the
    # transforms, the before-write processors and the destinations. After
    # that the destinations are closed, screens and post-processors run,
    # and the job's completion time and status are saved.
    #
    # NOTE(review): inside the Benchmark.realtime blocks below, +break+ and
    # +next+ leave only that block (with +break+ the timing result is nil,
    # hence the <tt>unless t.nil?</tt> guards); they do not leave the row
    # loop. The error threshold is therefore only acted upon after a source
    # has been fully iterated - confirm that this is intended.
    def process_control(control)
      control = ETL::Control::Control.resolve(control)
      say_on_own_line "Processing control #{control.file}"

      ETL::Engine.job = ETL::Execution::Job.create!(
        :control_file => control.file,
        :status => 'executing',
        :batch_id => ETL::Engine.batch ? ETL::Engine.batch.id : nil
      )

      execute_dependencies(control)

      start_time = Time.now
      pre_process(control)
      sources = control.sources
      destinations = control.destinations

      say "Skipping bulk import" if Engine.skip_bulk_import

      sources.each do |source|
        Engine.current_source = source
        Engine.logger.debug "Processing source #{source.inspect}"
        say "Source: #{source}"
        say "Limiting enabled: #{Engine.limit}" if Engine.limit != nil
        say "Offset enabled: #{Engine.offset}" if Engine.offset != nil
        source.each_with_index do |row, index|
          # Break out of the row loop if the +Engine.limit+ is specified and
          # the number of rows read exceeds that value.
          if Engine.limit != nil && Engine.rows_read >= Engine.limit
            puts "Reached limit of #{Engine.limit}"
            break
          end

          Engine.logger.debug "Row #{index}: #{row.inspect}"
          Engine.rows_read += 1
          Engine.current_source_row = index + 1
          say_without_newline "." if Engine.realtime_activity && index > 0 && index % 1000 == 0

          # At this point a single row may be turned into multiple rows via row
          # processors all code after this line should work with the array of
          # rows rather than the single row
          rows = [row]

          t = Benchmark.realtime do
            begin
              Engine.logger.debug "Processing after read"
              control.after_read_processors.each do |processor|
                processed_rows = []
                rows.each do |row|
                  processed_rows << processor.process(row) unless empty_row?(row)
                end
                rows = processed_rows.flatten.compact
              end
            rescue => e
              msg = "Error processing rows after read from #{Engine.current_source} on line #{Engine.current_source_row}: #{e}"
              errors << msg
              Engine.logger.error(msg)
              e.backtrace.each { |line| Engine.logger.error(line) }
              exceeded_error_threshold?(control) ? break : next
            end
          end
          benchmarks[:after_reads] += t unless t.nil?

          t = Benchmark.realtime do
            begin
              Engine.logger.debug "Executing transforms"
              rows.each do |row|
                # only do the transform if there is a row
                unless empty_row?(row)
                  control.transforms.each do |transform|
                    name = transform.name.to_sym
                    row[name] = transform.transform(name, row[name], row)
                  end
                end
              end
            rescue ResolverError => e
              Engine.logger.error(e.message)
              errors << e.message
            rescue => e
              msg = "Error transforming from #{Engine.current_source} on line #{Engine.current_source_row}: #{e}"
              errors << msg
              Engine.logger.error(msg)
              e.backtrace.each { |line| Engine.logger.error(line) }
            ensure
              begin
                exceeded_error_threshold?(control) ? break : next
              rescue => inner_error
                puts inner_error
              end
            end
          end
          benchmarks[:transforms] += t unless t.nil?

          t = Benchmark.realtime do
            begin
              # execute row-level "before write" processing
              Engine.logger.debug "Processing before write"
              control.before_write_processors.each do |processor|
                processed_rows = []
                rows.each do |row|
                  processed_rows << processor.process(row) unless empty_row?(row)
                end
                rows = processed_rows.flatten.compact
              end
            rescue => e
              msg = "Error processing rows before write from #{Engine.current_source} on line #{Engine.current_source_row}: #{e}"
              errors << msg
              Engine.logger.error(msg)
              e.backtrace.each { |line| Engine.logger.error(line) }
              exceeded_error_threshold?(control) ? break : next
            end
          end
          benchmarks[:before_writes] += t unless t.nil?

          t = Benchmark.realtime do
            begin
              # write the row to the destination
              destinations.each_with_index do |destination, index|
                Engine.current_destination = destination
                rows.each do |row|
                  destination.write(row)
                  # Count each written row once, not once per destination.
                  Engine.rows_written += 1 if index == 0
                end
              end
            rescue => e
              msg = "Error writing to #{Engine.current_destination}: #{e}"
              errors << msg
              Engine.logger.error msg
              e.backtrace.each { |line| Engine.logger.error(line) }
              exceeded_error_threshold?(control) ? break : next
            end
          end
          benchmarks[:writes] += t unless t.nil?
        end

        if exceeded_error_threshold?(control)
          say_on_own_line "Exiting due to exceeding error threshold: #{control.error_threshold}"
          return
        end

      end

      destinations.each do |destination|
        destination.close
      end

      say_on_own_line "Executing before post-process screens"
      begin
        execute_screens(control)
      rescue FatalScreenError => e
        say "Fatal screen error during job execution: #{e.message}"
        exit
      rescue ScreenError => e
        say "Screen error during job execution: #{e.message}"
        return
      else
        say "Screens passed"
      end

      post_process(control)

      if sources.length > 0
        say_on_own_line "Read #{Engine.rows_read} lines from sources"
      end
      if destinations.length > 0
        say "Wrote #{Engine.rows_written} lines to destinations"
      end

      say_on_own_line "Executing after post-process screens"
      begin
        execute_screens(control, :after_post_process)
      rescue FatalScreenError => e
        say "Fatal screen error during job execution: #{e.message}"
        exit
      rescue ScreenError => e
        say "Screen error during job execution: #{e.message}"
        return
      else
        say "Screens passed"
      end

      say_on_own_line "Completed #{control.file} in #{distance_of_time_in_words(start_time)} with #{errors.length} errors."
      say "Processing average: #{Engine.average_rows_per_second} rows/sec"

      say "Avg after_reads: #{Engine.rows_read/benchmarks[:after_reads]} rows/sec" if benchmarks[:after_reads] > 0
      say "Avg before_writes: #{Engine.rows_read/benchmarks[:before_writes]} rows/sec" if benchmarks[:before_writes] > 0
      say "Avg transforms: #{Engine.rows_read/benchmarks[:transforms]} rows/sec" if benchmarks[:transforms] > 0
      say "Avg writes: #{Engine.rows_read/benchmarks[:writes]} rows/sec" if benchmarks[:writes] > 0

      # say "Avg time writing execution records: #{ETL::Execution::Record.average_time_spent}"
      #
      # ETL::Transform::Transform.benchmarks.each do |klass, t|
      #   say "Avg #{klass}: #{Engine.rows_read/t} rows/sec"
      # end

      ActiveRecord::Base.verify_active_connections!
      ETL::Engine.job.completed_at = Time.now
      ETL::Engine.job.status = (errors.length > 0 ? 'completed with errors' : 'completed')
      ETL::Engine.job.save!
    end

    # Return true if the row is nil or cannot be indexed with [].
    def empty_row?(row)
      # unsure about why it should respond to :[] - keeping it just in case for the moment
      row.nil? || !row.respond_to?(:[])
    end

    private
    # Return true if the error threshold is exceeded
    def exceeded_error_threshold?(control)
      errors.length > control.error_threshold
    end

    # Execute all preprocessors
    def pre_process(control)
      Engine.logger.debug "Pre-processing #{control.file}"
      control.pre_processors.each do |processor|
        processor.process
      end
      Engine.logger.debug "Pre-processing complete"
    end

    # Execute all postprocessors
    def post_process(control)
      say_on_own_line "Executing post processes"
      Engine.logger.debug "Post-processing #{control.file}"
      control.post_processors.each do |processor|
        processor.process
      end
      Engine.logger.debug "Post-processing complete"
      say "Post-processing complete"
    end

    # Execute all dependencies. A Symbol dependency names a control file
    # without its '.ctl' extension; a String dependency is passed to
    # +process+ unchanged. Any other type raises a RuntimeError.
    def execute_dependencies(control)
      Engine.logger.debug "Executing dependencies"
      control.dependencies.flatten.each do |dependency|
        case dependency
        when Symbol
          f = dependency.to_s + '.ctl'
          Engine.logger.debug "Executing dependency: #{f}"
          say "Executing dependency: #{f}"
          process(f)
        when String
          Engine.logger.debug "Executing dependency: #{dependency}"
          say "Executing dependency: #{dependency}"
          process(dependency)
        else
          raise "Invalid dependency type: #{dependency.class}"
        end
      end
    end

    # Execute all screens for the given timing
    # (<tt>:before_post_process</tt>, the default, or
    # <tt>:after_post_process</tt>). Screens run in the order fatal, error,
    # warn. A failing fatal screen raises FatalScreenError, a failing error
    # screen raises ScreenError and a failing warn screen is only reported.
    def execute_screens(control, timing = :before_post_process)
      screens = case timing
        when :after_post_process
          control.after_post_process_screens
        else # default to before post-process screens
          control.screens
        end
      [:fatal,:error,:warn].each do |type|
        screens[type].each do |block|
          begin
            block.call
          rescue => e
            case type
            when :fatal
              raise FatalScreenError, e
            when :error
              raise ScreenError, e
            when :warn
              say "Screen warning: #{e}"
            end
          end
        end
      end
    end
  end
end