etl 0.9.5.rc1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (215) hide show
  1. data/.gitignore +12 -0
  2. data/.yardopts +5 -0
  3. data/0.9-UPGRADE +6 -0
  4. data/CHANGELOG +236 -0
  5. data/Gemfile +4 -0
  6. data/HOW_TO_RELEASE +13 -0
  7. data/LICENSE +7 -0
  8. data/README.textile +111 -0
  9. data/Rakefile +105 -0
  10. data/TODO +28 -0
  11. data/activewarehouse-etl.gemspec +38 -0
  12. data/bin/etl +28 -0
  13. data/bin/etl.cmd +8 -0
  14. data/examples/database.example.yml +16 -0
  15. data/lib/etl.rb +97 -0
  16. data/lib/etl/batch.rb +2 -0
  17. data/lib/etl/batch/batch.rb +111 -0
  18. data/lib/etl/batch/directives.rb +65 -0
  19. data/lib/etl/builder.rb +2 -0
  20. data/lib/etl/builder/date_dimension_builder.rb +96 -0
  21. data/lib/etl/builder/time_dimension_builder.rb +31 -0
  22. data/lib/etl/commands/etl.rb +89 -0
  23. data/lib/etl/control.rb +3 -0
  24. data/lib/etl/control/control.rb +405 -0
  25. data/lib/etl/control/destination.rb +438 -0
  26. data/lib/etl/control/destination/csv_destination.rb +113 -0
  27. data/lib/etl/control/destination/database_destination.rb +97 -0
  28. data/lib/etl/control/destination/excel_destination.rb +91 -0
  29. data/lib/etl/control/destination/file_destination.rb +126 -0
  30. data/lib/etl/control/destination/insert_update_database_destination.rb +136 -0
  31. data/lib/etl/control/destination/update_database_destination.rb +109 -0
  32. data/lib/etl/control/destination/yaml_destination.rb +74 -0
  33. data/lib/etl/control/source.rb +132 -0
  34. data/lib/etl/control/source/database_source.rb +224 -0
  35. data/lib/etl/control/source/enumerable_source.rb +11 -0
  36. data/lib/etl/control/source/file_source.rb +90 -0
  37. data/lib/etl/control/source/model_source.rb +39 -0
  38. data/lib/etl/core_ext.rb +1 -0
  39. data/lib/etl/core_ext/time.rb +5 -0
  40. data/lib/etl/core_ext/time/calculations.rb +42 -0
  41. data/lib/etl/engine.rb +582 -0
  42. data/lib/etl/execution.rb +19 -0
  43. data/lib/etl/execution/base.rb +8 -0
  44. data/lib/etl/execution/batch.rb +10 -0
  45. data/lib/etl/execution/job.rb +8 -0
  46. data/lib/etl/execution/migration.rb +90 -0
  47. data/lib/etl/generator.rb +2 -0
  48. data/lib/etl/generator/generator.rb +20 -0
  49. data/lib/etl/generator/surrogate_key_generator.rb +39 -0
  50. data/lib/etl/http_tools.rb +139 -0
  51. data/lib/etl/parser.rb +11 -0
  52. data/lib/etl/parser/apache_combined_log_parser.rb +49 -0
  53. data/lib/etl/parser/csv_parser.rb +93 -0
  54. data/lib/etl/parser/excel_parser.rb +112 -0
  55. data/lib/etl/parser/fixed_width_parser.rb +65 -0
  56. data/lib/etl/parser/nokogiri_xml_parser.rb +83 -0
  57. data/lib/etl/parser/parser.rb +41 -0
  58. data/lib/etl/parser/sax_parser.rb +218 -0
  59. data/lib/etl/parser/xml_parser.rb +65 -0
  60. data/lib/etl/processor.rb +11 -0
  61. data/lib/etl/processor/block_processor.rb +14 -0
  62. data/lib/etl/processor/bulk_import_processor.rb +94 -0
  63. data/lib/etl/processor/check_exist_processor.rb +80 -0
  64. data/lib/etl/processor/check_unique_processor.rb +39 -0
  65. data/lib/etl/processor/copy_field_processor.rb +26 -0
  66. data/lib/etl/processor/database_join_processor.rb +82 -0
  67. data/lib/etl/processor/encode_processor.rb +55 -0
  68. data/lib/etl/processor/ensure_fields_presence_processor.rb +24 -0
  69. data/lib/etl/processor/escape_csv_processor.rb +77 -0
  70. data/lib/etl/processor/filter_row_processor.rb +51 -0
  71. data/lib/etl/processor/ftp_downloader_processor.rb +68 -0
  72. data/lib/etl/processor/ftp_uploader_processor.rb +65 -0
  73. data/lib/etl/processor/hierarchy_exploder_processor.rb +55 -0
  74. data/lib/etl/processor/imapattachment_downloader_processor.rb +91 -0
  75. data/lib/etl/processor/pop3attachment_downloader_processor.rb +90 -0
  76. data/lib/etl/processor/print_row_processor.rb +12 -0
  77. data/lib/etl/processor/processor.rb +25 -0
  78. data/lib/etl/processor/rename_processor.rb +24 -0
  79. data/lib/etl/processor/require_non_blank_processor.rb +26 -0
  80. data/lib/etl/processor/row_processor.rb +27 -0
  81. data/lib/etl/processor/sequence_processor.rb +23 -0
  82. data/lib/etl/processor/sftp_downloader_processor.rb +63 -0
  83. data/lib/etl/processor/sftp_uploader_processor.rb +63 -0
  84. data/lib/etl/processor/surrogate_key_processor.rb +53 -0
  85. data/lib/etl/processor/truncate_processor.rb +40 -0
  86. data/lib/etl/processor/zip_file_processor.rb +27 -0
  87. data/lib/etl/row.rb +20 -0
  88. data/lib/etl/screen.rb +14 -0
  89. data/lib/etl/screen/row_count_screen.rb +20 -0
  90. data/lib/etl/transform.rb +2 -0
  91. data/lib/etl/transform/block_transform.rb +13 -0
  92. data/lib/etl/transform/calculation_transform.rb +71 -0
  93. data/lib/etl/transform/date_to_string_transform.rb +20 -0
  94. data/lib/etl/transform/decode_transform.rb +51 -0
  95. data/lib/etl/transform/default_transform.rb +20 -0
  96. data/lib/etl/transform/foreign_key_lookup_transform.rb +211 -0
  97. data/lib/etl/transform/hierarchy_lookup_transform.rb +49 -0
  98. data/lib/etl/transform/md5_transform.rb +13 -0
  99. data/lib/etl/transform/ordinalize_transform.rb +14 -0
  100. data/lib/etl/transform/sha1_transform.rb +13 -0
  101. data/lib/etl/transform/split_fields_transform.rb +27 -0
  102. data/lib/etl/transform/string_to_date_time_transform.rb +14 -0
  103. data/lib/etl/transform/string_to_date_transform.rb +16 -0
  104. data/lib/etl/transform/string_to_time_transform.rb +11 -0
  105. data/lib/etl/transform/transform.rb +61 -0
  106. data/lib/etl/transform/trim_transform.rb +26 -0
  107. data/lib/etl/transform/type_transform.rb +35 -0
  108. data/lib/etl/util.rb +59 -0
  109. data/lib/etl/version.rb +3 -0
  110. data/spec/fixtures/all.ebf +6 -0
  111. data/spec/fixtures/apache_combined_log.ctl +11 -0
  112. data/spec/fixtures/batch_with_error.ebf +6 -0
  113. data/spec/fixtures/batched1.ctl +0 -0
  114. data/spec/fixtures/batched2.ctl +0 -0
  115. data/spec/fixtures/block_processor.ctl +6 -0
  116. data/spec/fixtures/block_processor_error.ctl +1 -0
  117. data/spec/fixtures/block_processor_pre_post_process.ctl +4 -0
  118. data/spec/fixtures/block_processor_remove_rows.ctl +5 -0
  119. data/spec/fixtures/data/apache_combined_log.txt +3 -0
  120. data/spec/fixtures/data/bulk_import.txt +3 -0
  121. data/spec/fixtures/data/bulk_import_with_empties.txt +3 -0
  122. data/spec/fixtures/data/decode.txt +3 -0
  123. data/spec/fixtures/data/delimited.txt +3 -0
  124. data/spec/fixtures/data/encode_source_latin1.txt +2 -0
  125. data/spec/fixtures/data/excel.xls +0 -0
  126. data/spec/fixtures/data/excel2.xls +0 -0
  127. data/spec/fixtures/data/fixed_width.txt +3 -0
  128. data/spec/fixtures/data/multiple_delimited_1.txt +3 -0
  129. data/spec/fixtures/data/multiple_delimited_2.txt +3 -0
  130. data/spec/fixtures/data/nokogiri.xml +38 -0
  131. data/spec/fixtures/data/people.txt +3 -0
  132. data/spec/fixtures/data/sax.xml +14 -0
  133. data/spec/fixtures/data/xml.xml +16 -0
  134. data/spec/fixtures/delimited.ctl +30 -0
  135. data/spec/fixtures/delimited_absolute.ctl +31 -0
  136. data/spec/fixtures/delimited_destination_db.ctl +23 -0
  137. data/spec/fixtures/delimited_excel.ctl +31 -0
  138. data/spec/fixtures/delimited_insert_update.ctl +34 -0
  139. data/spec/fixtures/delimited_update.ctl +34 -0
  140. data/spec/fixtures/delimited_with_bulk_load.ctl +34 -0
  141. data/spec/fixtures/errors.ctl +24 -0
  142. data/spec/fixtures/excel.ctl +24 -0
  143. data/spec/fixtures/excel2.ctl +25 -0
  144. data/spec/fixtures/fixed_width.ctl +35 -0
  145. data/spec/fixtures/inline_parser.ctl +17 -0
  146. data/spec/fixtures/model_source.ctl +14 -0
  147. data/spec/fixtures/multiple_delimited.ctl +22 -0
  148. data/spec/fixtures/multiple_source_delimited.ctl +39 -0
  149. data/spec/fixtures/nokogiri_all.ctl +35 -0
  150. data/spec/fixtures/nokogiri_select.ctl +35 -0
  151. data/spec/fixtures/output/.ignore +1 -0
  152. data/spec/fixtures/output/delimited.txt +3 -0
  153. data/spec/fixtures/output/encode_destination_utf-8.txt +2 -0
  154. data/spec/fixtures/output/fixed_width.txt +3 -0
  155. data/spec/fixtures/output/inline_parser.txt +3 -0
  156. data/spec/fixtures/output/multiple_source_delimited.txt +6 -0
  157. data/spec/fixtures/output/test_excel_destination.xls +0 -0
  158. data/spec/fixtures/output/test_file_destination.2.txt +2 -0
  159. data/spec/fixtures/output/test_file_destination.txt +2 -0
  160. data/spec/fixtures/output/test_multiple_unique.txt +1 -0
  161. data/spec/fixtures/output/test_unique.txt +2 -0
  162. data/spec/fixtures/sax.ctl +26 -0
  163. data/spec/fixtures/scd/1.txt +1 -0
  164. data/spec/fixtures/scd/2.txt +1 -0
  165. data/spec/fixtures/scd/3.txt +1 -0
  166. data/spec/fixtures/scd_test_type_1.ctl +43 -0
  167. data/spec/fixtures/scd_test_type_2.ctl +34 -0
  168. data/spec/fixtures/screen_test_error.ctl +3 -0
  169. data/spec/fixtures/screen_test_fatal.ctl +3 -0
  170. data/spec/fixtures/xml.ctl +31 -0
  171. data/spec/quality_spec.rb +11 -0
  172. data/spec/spec_helper.rb +10 -0
  173. data/spec/support/custom_fixtures.rb +54 -0
  174. data/spec/support/custom_matchers.rb +54 -0
  175. data/test-matrix.yml +10 -0
  176. data/test/.gitignore +1 -0
  177. data/test/.ignore +2 -0
  178. data/test/batch_test.rb +41 -0
  179. data/test/block_processor_test.rb +38 -0
  180. data/test/check_exist_processor_test.rb +92 -0
  181. data/test/check_unique_processor_test.rb +40 -0
  182. data/test/config/Gemfile.rails-2.3.x +3 -0
  183. data/test/config/Gemfile.rails-2.3.x.lock +53 -0
  184. data/test/config/Gemfile.rails-3.0.x +3 -0
  185. data/test/config/Gemfile.rails-3.0.x.lock +61 -0
  186. data/test/config/common.rb +29 -0
  187. data/test/connection/mysql/connection.rb +9 -0
  188. data/test/connection/mysql/schema.sql +37 -0
  189. data/test/connection/postgresql/connection.rb +13 -0
  190. data/test/connection/postgresql/schema.sql +40 -0
  191. data/test/control_test.rb +43 -0
  192. data/test/database_join_processor_test.rb +43 -0
  193. data/test/date_dimension_builder_test.rb +96 -0
  194. data/test/destination_test.rb +275 -0
  195. data/test/directive_test.rb +23 -0
  196. data/test/encode_processor_test.rb +32 -0
  197. data/test/engine_test.rb +78 -0
  198. data/test/ensure_fields_presence_processor_test.rb +28 -0
  199. data/test/etl_test.rb +42 -0
  200. data/test/foreign_key_lookup_transform_test.rb +50 -0
  201. data/test/generator_test.rb +14 -0
  202. data/test/mocks/mock_destination.rb +26 -0
  203. data/test/mocks/mock_source.rb +25 -0
  204. data/test/nokogiri_test.rb +35 -0
  205. data/test/parser_test.rb +224 -0
  206. data/test/performance/delimited.ctl +30 -0
  207. data/test/processor_test.rb +44 -0
  208. data/test/row_processor_test.rb +17 -0
  209. data/test/scd_test.rb +257 -0
  210. data/test/screen_test.rb +9 -0
  211. data/test/source_test.rb +154 -0
  212. data/test/test_helper.rb +37 -0
  213. data/test/transform_test.rb +101 -0
  214. data/test/truncate_processor_test.rb +37 -0
  215. metadata +510 -0
@@ -0,0 +1,11 @@
1
module ETL #:nodoc:
  module Control #:nodoc:
    # Source backed by whatever object is supplied under the
    # <tt>:enumerable</tt> configuration key.
    class EnumerableSource < ETL::Control::Source
      # Hand the caller's block straight to the configured enumerable.
      def each(&block)
        enumerable = configuration[:enumerable]
        enumerable.each(&block)
      end
    end
  end
end
@@ -0,0 +1,90 @@
1
module ETL #:nodoc:
  module Control #:nodoc:
    # A File source: rows are produced by a parser that is built from the
    # <tt>:parser</tt> configuration option and handed this source.
    class FileSource < Source
      # The number of lines to skip, default is 0
      attr_accessor :skip_lines

      # Accessor for the underlying parser
      attr_accessor :parser

      # The source file
      attr_accessor :file

      # Initialize the source
      #
      # Configuration options:
      # * <tt>:file</tt>: The source file
      # * <tt>:parser</tt>: One of the following: a parser name as a String or
      #   symbol, a class which extends from Parser, a Hash with :name and
      #   optionally an :options key. Whether or not the parser uses the
      #   options is dependent on which parser is used. See the documentation
      #   for each parser for information on what options it accepts.
      # * <tt>:skip_lines</tt>: The number of lines to skip (defaults to 0)
      # * <tt>:store_locally</tt>: Set to false to not store a copy of the
      #   source data locally for archival
      def initialize(control, configuration, definition)
        super
        configure
      end

      # Get a String identifier for the source (the configured file)
      def to_s
        file
      end

      # Get the local storage directory: the file's base name (extension
      # removed) underneath +local_base+.
      def local_directory
        File.join(local_base, File.basename(file, File.extname(file)))
      end

      # Yields each parsed row, wrapped in an ETL::Row whose source is this
      # object. When ETL::Engine.offset is set, that many leading rows are
      # consumed without being yielded.
      def each
        count = 0
        # NOTE(review): @store_locally is not assigned in this class;
        # presumably the Source superclass sets it - confirm.
        copy_sources if @store_locally
        @parser.each do |row|
          if ETL::Engine.offset && count < ETL::Engine.offset
            count += 1
          else
            row = ETL::Row[row]
            row.source = self
            yield row
          end
        end
      end

      private
      # Copy source data to a local directory structure. +file+ is treated as
      # a glob; relative patterns are resolved against the directory of the
      # control file. Each copied file also gets an empty trigger file.
      def copy_sources
        sequence = 0
        path = Pathname.new(file)
        path = path.absolute? ? path : Pathname.new(File.dirname(control.file)) + path
        Pathname.glob(path).each do |source_path|
          next if source_path.directory?
          lf = local_file(sequence)
          FileUtils.cp(source_path, lf)
          # Create (or truncate) the trigger file. The block takes no
          # parameter so it cannot shadow or clobber the loop variable.
          File.open(local_file_trigger(lf), 'w') { }
          sequence += 1
        end
      end

      # Configure the source from the configuration Hash.
      #
      # Raises ControlError when :parser is not a Class, String, Symbol or Hash.
      def configure
        @file = configuration[:file]
        case configuration[:parser]
        when Class
          @parser = configuration[:parser].new(self)
        when String, Symbol
          @parser = ETL::Parser::Parser.class_for_name(configuration[:parser]).new(self)
        when Hash
          name = configuration[:parser][:name]
          options = configuration[:parser][:options]
          @parser = ETL::Parser::Parser.class_for_name(name).new(self, options)
        else
          raise ControlError, "Configuration option :parser must be a Class, String, Symbol or Hash"
        end
        # The default is written back into the configuration as well.
        @skip_lines = configuration[:skip_lines] ||= 0
      end
    end
  end
end
@@ -0,0 +1,39 @@
1
+ #RAILS_ENV = 'development'
2
+ #require '../config/environment'
3
+
4
module ETL #:nodoc:
  module Control #:nodoc:
    # Source that reads its rows from a model class named by the
    # <tt>:model</tt> configuration option.
    class ModelSource < Source

      # Column names as symbols: the definition's elements when it is an
      # Array, its keys when it is a Hash.
      def columns
        names =
          case definition
          when Array
            definition
          when Hash
            definition.keys
          else
            raise "Definition must be either an Array or a Hash"
          end
        names.collect(&:to_sym)
      end

      # The configured model identifier
      def railsmodel
        configuration[:model]
      end

      # The configured ordering, falling back to "id"
      def order
        ordering = configuration[:order]
        ordering ? ordering : "id"
      end

      # Yield one ETL::Row per model record, populated with each column from
      # #columns by calling the same-named method on the record.
      def each(&block)
        model_class = railsmodel.to_s.camelize.constantize
        records = model_class.find(:all, :order => order)
        records.each do |record|
          result_row = ETL::Row.new
          result_row.source = self
          columns.each { |column| result_row[column.to_sym] = record.send(column) }
          yield result_row
        end
      end
    end
  end
end
@@ -0,0 +1 @@
1
+ require 'etl/core_ext/time'
@@ -0,0 +1,5 @@
1
+ require File.dirname(__FILE__) + '/time/calculations'
2
+
3
# Reopen the core Time class and mix in the ETL calculation helpers.
class Time#:nodoc:
  include ETL::CoreExtensions::Time::Calculations
end
@@ -0,0 +1,42 @@
1
+ #Updated by Jack Hong on 04/05/08
2
+
3
module ETL #:nodoc:
  module CoreExtensions #:nodoc:
    module Time #:nodoc:
      # Enables the use of time calculations within Time itself.
      #
      # Every +offset_month+ parameter is the calendar month (1-12) in which
      # the fiscal year starts; the default is 10 (October).
      module Calculations
        # Week of the calendar year, counted in 7-day blocks from January 1st.
        # A 53rd partial block is folded into week 52.
        def week
          cyw = ((yday - 1) / 7) + 1
          cyw = 52 if cyw == 53
          cyw
        end

        # Calendar quarter (1-4)
        def quarter
          ((month - 1) / 3) + 1
        end

        # Week of the fiscal year, counted like #week but from the first day
        # of the fiscal year.
        def fiscal_year_week(offset_month=10)
          fyw = ((fiscal_year_yday(offset_month) - 1) / 7) + 1
          fyw = 52 if fyw == 53
          fyw
        end

        # Month of the fiscal year (1-12), where 1 is +offset_month+
        def fiscal_year_month(offset_month=10)
          shifted_month = month - (offset_month - 1)
          shifted_month += 12 if shifted_month <= 0
          shifted_month
        end

        # Quarter of the fiscal year (1-4)
        def fiscal_year_quarter(offset_month=10)
          ((fiscal_year_month(offset_month) - 1) / 3) + 1
        end

        # The fiscal year: the next calendar year once +offset_month+ has
        # been reached, otherwise the current calendar year.
        def fiscal_year(offset_month=10)
          month >= offset_month ? year + 1 : year
        end

        # Day of the fiscal year, where 1 is the first day of +offset_month+.
        #
        # The count is taken from the real start date of the fiscal year, so
        # it stays correct when a February 29th falls inside it. (Adding a
        # fixed 365 and using the current year's month lengths was one day
        # short in those cases.) Only core Time methods are used.
        def fiscal_year_yday(offset_month=10)
          # Days that precede the first of offset_month in the given year.
          days_before_start = lambda do |in_year|
            ::Time.local(in_year, offset_month, 1).yday - 1
          end

          if month >= offset_month
            # The fiscal year started earlier in this calendar year.
            yday - days_before_start.call(year)
          else
            # The fiscal year started in the previous calendar year: add the
            # days that remained in that year after the fiscal start.
            previous_year = year - 1
            days_in_previous_year = ::Time.local(previous_year, 12, 31).yday
            yday + days_in_previous_year - days_before_start.call(previous_year)
          end
        end
      end
    end
  end
end
data/lib/etl/engine.rb ADDED
@@ -0,0 +1,582 @@
1
+ module ETL #:nodoc:
2
+
3
# ActiveRecord base class for ETL. Engine.init stores the loaded database
# configurations on it and Engine.establish_connection opens named
# connections through it.
class Base < ActiveRecord::Base
end
5
+
6
+ # The main ETL engine class
7
+ class Engine
8
+ include ETL::Util
9
+
10
+ class << self
11
+ # Initialization that is run when a job is executed.
12
+ #
13
+ # Options:
14
+ # * <tt>:limit</tt>: Limit the number of records returned from sources
15
+ # * <tt>:offset</tt>: Number of leading records to skip from sources
16
+ # * <tt>:newlog</tt>: If true the log is opened in write mode ('w'), otherwise it is appended to
17
+ # * <tt>:skip_bulk_import</tt>: Set to true to skip bulk import
18
+ # * <tt>:read_locally</tt>: Set to true to read from the local cache
19
+ # * <tt>:rails_root</tt>: Set to the rails root to boot rails
20
# Initialize the engine once; later calls are no-ops.
#
# Besides the options documented above, the code also reads:
# * <tt>:newlog</tt>: when truthy the log write mode becomes 'w'
# * <tt>:log_dir</tt>: directory for the log file
# * <tt>:config</tt>: database configuration file, default 'database.yml';
#   'config/database.yml' is used when that file does not exist
# * <tt>:execution_conf</tt>: connection for the execution tables, default
#   :etl_execution
#
# The configuration file is run through ERB and then parsed as YAML. The
# caller's +options+ Hash is not modified.
def init(options={})
  return if @initialized

  puts "initializing ETL engine\n\n"
  @limit = options[:limit]
  @offset = options[:offset]
  @log_write_mode = 'w' if options[:newlog]
  @skip_bulk_import = options[:skip_bulk_import]
  @read_locally = options[:read_locally]
  @rails_root = options[:rails_root]
  @log_dir = options[:log_dir]

  require File.join(@rails_root, 'config/environment') if @rails_root

  # Resolve the config path in a local so the caller's Hash stays untouched.
  config_file = options[:config] || 'database.yml'
  config_file = 'config/database.yml' unless File.exist?(config_file)
  database_configuration = YAML::load(ERB.new(IO.read(config_file)).result + "\n")
  ActiveRecord::Base.configurations.merge!(database_configuration)
  ETL::Base.configurations = HashWithIndifferentAccess.new(database_configuration)
  #puts "configurations in init: #{ActiveRecord::Base.configurations.inspect}"

  # Loaded here rather than at file load time, as in the original.
  require 'etl/execution'
  ETL::Execution::Base.establish_connection(options[:execution_conf] || :etl_execution)
  ETL::Execution::Execution.migrate

  @initialized = true
end
46
+
47
+ # Process the specified file. Acceptable values for file are:
48
+ # * Path to a file
49
+ # * File object
50
+ # * ETL::Control::Control instance
51
+ # * ETL::Batch::Batch instance
52
+ #
53
+ # The process command will accept either a .ctl Control file or a .ebf
54
+ # ETL Batch File.
55
# Build a fresh engine instance and let it process +file+.
def process(file)
  engine = new
  engine.process(file)
end
58
+
59
+ attr_accessor :timestamped_log
60
+
61
+ # Accessor for the log write mode. Default is 'a' for append.
62
+ attr_accessor :log_write_mode
63
# Current log write mode; set to 'a' (append) on first read when unset.
def log_write_mode
  @log_write_mode = 'a' unless @log_write_mode
  @log_write_mode
end
66
+
67
+ # A logger for the engine
68
+ attr_accessor :logger
69
+
70
# Lazily build and return the engine logger.
#
# When +timestamped_log+ is set the file is "etl_<timestamp>.log"; otherwise
# it is "etl.log", opened with +log_write_mode+. The file lives in @log_dir
# when one was configured, else in the current working directory. The
# logger is created at WARN level with the standard formatter.
def logger #:nodoc:
  unless @logger
    if timestamped_log
      logfile = File.join(*[@log_dir, "etl_#{timestamp}.log"].compact)

      @logger = Logger.new(logfile)
    else
      # Relative name on purpose: a leading slash would resolve to the
      # filesystem root whenever no log directory is configured.
      logfile = File.join(*[@log_dir, 'etl.log'].compact)

      @logger = Logger.new(File.open(logfile, log_write_mode))
    end
    @logger.level = Logger::WARN
    @logger.formatter = Logger::Formatter.new
  end
  @logger
end
86
+
87
+ # Get a timestamp value as a string
88
# Current time formatted as a 14-digit YYYYMMDDHHMMSS string
def timestamp
  now = Time.now
  now.strftime("%Y%m%d%H%M%S")
end
91
+
92
+ # The current source
93
+ attr_accessor :current_source
94
+
95
+ # The current source row
96
+ attr_accessor :current_source_row
97
+
98
+ # The current destination
99
+ attr_accessor :current_destination
100
+
101
+ # Set to true to activate realtime activity. This will cause certain
102
+ # information messages to be printed to STDOUT
103
+ attr_accessor :realtime_activity
104
+
105
+ # Accessor for the total number of rows read from sources
106
+ attr_accessor :rows_read
107
# Number of rows read so far; starts at 0
def rows_read
  @rows_read = 0 unless @rows_read
  @rows_read
end
110
+
111
+ # Accessor for the total number of rows processed
112
+ attr_accessor :rows_written
113
# Number of rows written so far; starts at 0
def rows_written
  @rows_written = 0 unless @rows_written
  @rows_written
end
116
+
117
+ # Access the current ETL::Execution::Job instance
118
+ attr_accessor :job
119
+
120
+ # Access the current ETL::Execution::Batch instance
121
+ attr_accessor :batch
122
+
123
+ # The limit on rows to load from the source, useful for testing the ETL
124
+ # process prior to executing the entire batch. Default value is nil and
125
+ # indicates that there is no limit
126
+ attr_accessor :limit
127
+
128
+ # The offset for the source to begin at, useful for testing the ETL
129
+ # process prior to executing the entire batch. Default value is nil and
130
+ # indicates that there is no offset
131
+ attr_accessor :offset
132
+
133
+ # Set to true to skip all bulk importing
134
+ attr_accessor :skip_bulk_import
135
+
136
+ # Set to true to read locally from the last source cache files
137
+ attr_accessor :read_locally
138
+
139
+ # Accessor for the average rows per second processed
140
+ attr_accessor :average_rows_per_second
141
+
142
+ # Get a named connection
143
# Return the connection registered under +name+, establishing and caching
# it on first use and reconnecting when it is no longer active.
def connection(name)
  logger.debug "Retrieving connection #{name}"
  registry = connections
  registry[name] = establish_connection(name) unless registry[name]
  conn = registry[name]
  #conn.verify!(ActiveRecord::Base.verification_timeout)
  conn.reconnect! if !conn.active?
  conn
end
150
+
151
+ # Set to true to use temp tables
152
+ attr_accessor :use_temp_tables
153
+
154
+ # Get a registry of temp tables
155
# Registry of temp tables, created empty on first access
def temp_tables
  @temp_tables = {} unless @temp_tables
  @temp_tables
end
158
+
159
+ # Called when a batch job finishes, allowing for cleanup to occur
160
# Called when a batch job finishes. For every registered temp table the
# actual table is renamed aside, the temp table takes its name and the old
# table is dropped, all inside one transaction on the mapped connection.
def finish
  temp_tables.each do |temp_table, mapping|
    actual_table = mapping[:table]
    retired_table = "#{actual_table}_old"
    conn = mapping[:connection]
    #puts "move #{temp_table} to #{actual_table}"
    conn.transaction do
      conn.rename_table(actual_table, retired_table)
      conn.rename_table(temp_table, actual_table)
      conn.drop_table(retired_table)
    end
  end
end
172
+
173
+ # Return true if using temp tables
174
# Strict boolean form of +use_temp_tables+
def use_temp_tables?
  !!use_temp_tables
end
177
+
178
+ # Modify the table name if necessary
179
# Return the table name to use. Without temp tables this is +table_name+
# itself. Otherwise it is "tmp_<table_name>"; the first request drops any
# stale temp table (errors ignored), copies the real table into it and
# registers the mapping used later by +finish+.
def table(table_name, connection)
  return table_name unless use_temp_tables?

  temp_table_name = "tmp_#{table_name}"
  if temp_tables[temp_table_name].nil?
    # Create the temp table and add it to the mapping
    begin
      connection.drop_table(temp_table_name)
    rescue
      # best effort: the drop is allowed to fail
    end
    connection.copy_table(table_name, temp_table_name)
    temp_tables[temp_table_name] = { :table => table_name, :connection => connection }
  end
  temp_table_name
end
198
+
199
+ protected
200
+ # Hash of database connections that can be used throughout the ETL
201
+ # process
202
# Hash of named database connections, created empty on first access
def connections
  @connections = {} unless @connections
  @connections
end
205
+
206
+ # Establish the named connection and return the database specific connection
207
# Open the connection configured under +name+ by calling the
# "<adapter>_connection" method on ETL::Base with that configuration.
#
# Raises ETL::ETLError when +name+ is blank or has no configuration.
def establish_connection(name)
  if name.blank?
    raise ETL::ETLError, "Connection with no name requested. Is there a missing :target parameter somewhere?"
  end

  logger.debug "Establishing connection to #{name}"
  conn_config = ETL::Base.configurations[name.to_s]
  if !conn_config
    raise ETL::ETLError, "Cannot find connection named #{name.inspect}"
  end
  adapter = conn_config['adapter']
  ETL::Base.send("#{adapter}_connection", conn_config)
end
216
+ end # class << self
217
+
218
+ # Say the specified message, with a newline
219
# Say +message+ followed by a newline
def say(message)
  line = message + "\n"
  say_without_newline(line)
end
222
+
223
+ # Say the specified message without a newline
224
# Print +message+ to $stdout and flush, but only while
# ETL::Engine.realtime_activity is on.
def say_without_newline(message)
  return unless ETL::Engine.realtime_activity

  $stdout.print message
  $stdout.flush
end
230
+
231
+ # Say the message on its own line
232
# Say +message+ preceded by a newline so it starts on its own line
def say_on_own_line(message)
  text = "\n" + message
  say(text)
end
235
+
236
+ # Array of errors encountered during execution of the ETL process
237
# Errors collected by this engine instance, created empty on first access
def errors
  @errors = [] unless @errors
  @errors
end
240
+
241
+ # Get a Hash of benchmark values where each value represents the total
242
+ # amount of time in seconds spent processing in that portion of the ETL
243
+ # pipeline. Keys include:
244
+ # * <tt>:transforms</tt>
245
+ # * <tt>:after_reads</tt>
246
+ # * <tt>:before_writes</tt>
247
+ # * <tt>:writes</tt>
248
# Accumulated seconds per pipeline stage, all starting at zero
def benchmarks
  unless @benchmarks
    @benchmarks = {}
    [:transforms, :after_reads, :before_writes, :writes].each do |stage|
      @benchmarks[stage] = 0
    end
  end
  @benchmarks
end
256
+
257
+ # Process a file, control object or batch object. Acceptable values for
258
+ # file are:
259
+ # * Path to a file
260
+ # * File object
261
+ # * ETL::Control::Control instance
262
+ # * ETL::Batch::Batch instance
263
# Process a file, control object or batch object.
#
# * String: treated as a path, opened and processed as a File
# * File: dispatched on its extension; ".ctl" and ".etl" are control
#   files, ".ebf" is a batch file
# * ETL::Control::Control / ETL::Batch::Batch: processed directly
#
# Raises RuntimeError for an unsupported file extension or argument type.
def process(file)
  case file
  when String
    process(File.new(file))
  when File
    # Match the real extension. The earlier unanchored patterns with an
    # unescaped dot also matched names such as "xctl_jobs/nightly.ebf".
    case File.extname(file.path)
    when '.ctl', '.etl' then process_control(file)
    when '.ebf' then process_batch(file)
    else
      raise RuntimeError, "Unsupported file type - #{file.path}"
    end
  when ETL::Control::Control
    process_control(file)
  when ETL::Batch::Batch
    process_batch(file)
  else
    # Single-line message: no embedded newline or source indentation.
    raise RuntimeError, "Process object must be a String, File, Control instance or Batch instance"
  end
end
284
+
285
+ protected
286
+ # Process the specified batch file
287
# Resolve and execute a batch, recording it as an ETL::Execution::Batch
# that is stored in ETL::Engine.batch. Once execution returns, the record
# is stamped with the completion time and a status reflecting whether any
# errors were collected, then saved.
def process_batch(batch)
  batch = ETL::Batch::Batch.resolve(batch, self)
  say "Processing batch #{batch.file}"

  execution_attributes = { :batch_file => batch.file, :status => 'executing' }
  ETL::Engine.batch = ETL::Execution::Batch.create!(execution_attributes)

  batch.execute

  ETL::Engine.batch.completed_at = Time.now
  final_status = errors.length > 0 ? 'completed with errors' : 'completed'
  ETL::Engine.batch.status = final_status
  ETL::Engine.batch.save!
end
302
+
303
+ # Process the specified control file
304
# Process the specified control file.
#
# Resolves +control+, records an ETL::Execution::Job for it, runs its
# dependencies and pre-processors, then streams every row of every source
# through four timed stages: after-read processors, transforms, before-write
# processors and destination writes. Afterwards destinations are closed,
# screens and post-processors run, a summary is said and the job record is
# completed and saved.
#
# Returns early (without completing the job record) when the error
# threshold is exceeded or a ScreenError is raised; a FatalScreenError
# calls +exit+.
def process_control(control)
  control = ETL::Control::Control.resolve(control)
  say_on_own_line "Processing control #{control.file}"

  # Record the job; it is linked to the current batch when there is one.
  ETL::Engine.job = ETL::Execution::Job.create!(
    :control_file => control.file,
    :status => 'executing',
    :batch_id => ETL::Engine.batch ? ETL::Engine.batch.id : nil
  )

  execute_dependencies(control)

  start_time = Time.now
  pre_process(control)
  sources = control.sources
  destinations = control.destinations

  say "Skipping bulk import" if Engine.skip_bulk_import

  sources.each do |source|
    Engine.current_source = source
    Engine.logger.debug "Processing source #{source.inspect}"
    say "Source: #{source}"
    say "Limiting enabled: #{Engine.limit}" if Engine.limit != nil
    say "Offset enabled: #{Engine.offset}" if Engine.offset != nil
    source.each_with_index do |row, index|
      # Break out of the row loop if the +Engine.limit+ is specified and
      # the number of rows read exceeds that value.
      if Engine.limit != nil && Engine.rows_read >= Engine.limit
        puts "Reached limit of #{Engine.limit}"
        break
      end

      Engine.logger.debug "Row #{index}: #{row.inspect}"
      Engine.rows_read += 1
      Engine.current_source_row = index + 1
      # Progress dot every 1000 rows
      say_without_newline "." if Engine.realtime_activity && index > 0 && index % 1000 == 0

      # At this point a single row may be turned into multiple rows via row
      # processors all code after this line should work with the array of
      # rows rather than the single row
      rows = [row]

      # Stage 1: after-read processors. Each processor maps every non-empty
      # row; the results are flattened and nils dropped before the next one.
      t = Benchmark.realtime do
        begin
          Engine.logger.debug "Processing after read"
          control.after_read_processors.each do |processor|
            processed_rows = []
            rows.each do |row|
              processed_rows << processor.process(row) unless empty_row?(row)
            end
            rows = processed_rows.flatten.compact
          end
        rescue => e
          msg = "Error processing rows after read from #{Engine.current_source} on line #{Engine.current_source_row}: #{e}"
          errors << msg
          Engine.logger.error(msg)
          e.backtrace.each { |line| Engine.logger.error(line) }
          # NOTE(review): this break/next is lexically inside the
          # Benchmark.realtime block, so it appears to leave only that block
          # (break would make realtime return nil, hence the nil check
          # below) rather than the row loop - confirm this is intended.
          exceeded_error_threshold?(control) ? break : next
        end
      end
      benchmarks[:after_reads] += t unless t.nil?

      # Stage 2: transforms. Each transform rewrites the field named after it.
      t = Benchmark.realtime do
        begin
          Engine.logger.debug "Executing transforms"
          rows.each do |row|
            # only do the transform if there is a row
            unless empty_row?(row)
              control.transforms.each do |transform|
                name = transform.name.to_sym
                row[name] = transform.transform(name, row[name], row)
              end
            end
          end
        rescue ResolverError => e
          Engine.logger.error(e.message)
          errors << e.message
        rescue => e
          msg = "Error transforming from #{Engine.current_source} on line #{Engine.current_source_row}: #{e}"
          errors << msg
          Engine.logger.error(msg)
          e.backtrace.each { |line| Engine.logger.error(line) }
        ensure
          # Runs on success as well as failure, so this stage's block always
          # ends through break or next.
          begin
            exceeded_error_threshold?(control) ? break : next
          rescue => inner_error
            puts inner_error
          end
        end
      end
      benchmarks[:transforms] += t unless t.nil?

      # Stage 3: before-write processors, same shape as stage 1.
      t = Benchmark.realtime do
        begin
          # execute row-level "before write" processing
          Engine.logger.debug "Processing before write"
          control.before_write_processors.each do |processor|
            processed_rows = []
            rows.each do |row|
              processed_rows << processor.process(row) unless empty_row?(row)
            end
            rows = processed_rows.flatten.compact
          end
        rescue => e
          msg = "Error processing rows before write from #{Engine.current_source} on line #{Engine.current_source_row}: #{e}"
          errors << msg
          Engine.logger.error(msg)
          e.backtrace.each { |line| Engine.logger.error(line) }
          exceeded_error_threshold?(control) ? break : next
        end
      end
      benchmarks[:before_writes] += t unless t.nil?

      # Stage 4: write every row to every destination. rows_written is only
      # incremented for the first destination so rows are counted once.
      t = Benchmark.realtime do
        begin
          # write the row to the destination
          destinations.each_with_index do |destination, index|
            Engine.current_destination = destination
            rows.each do |row|
              destination.write(row)
              Engine.rows_written += 1 if index == 0
            end
          end
        rescue => e
          msg = "Error writing to #{Engine.current_destination}: #{e}"
          errors << msg
          Engine.logger.error msg
          e.backtrace.each { |line| Engine.logger.error(line) }
          exceeded_error_threshold?(control) ? break : next
        end
      end
      benchmarks[:writes] += t unless t.nil?
    end

    # Checked once per source, after all of its rows have been attempted.
    if exceeded_error_threshold?(control)
      say_on_own_line "Exiting due to exceeding error threshold: #{control.error_threshold}"
      return
    end

  end

  destinations.each do |destination|
    destination.close
  end

  say_on_own_line "Executing before post-process screens"
  begin
    execute_screens(control)
  rescue FatalScreenError => e
    say "Fatal screen error during job execution: #{e.message}"
    exit
  rescue ScreenError => e
    say "Screen error during job execution: #{e.message}"
    return
  else
    say "Screens passed"
  end

  post_process(control)

  if sources.length > 0
    say_on_own_line "Read #{Engine.rows_read} lines from sources"
  end
  if destinations.length > 0
    say "Wrote #{Engine.rows_written} lines to destinations"
  end

  say_on_own_line "Executing after post-process screens"
  begin
    execute_screens(control, :after_post_process)
  rescue FatalScreenError => e
    say "Fatal screen error during job execution: #{e.message}"
    exit
  rescue ScreenError => e
    say "Screen error during job execution: #{e.message}"
    return
  else
    say "Screens passed"
  end

  say_on_own_line "Completed #{control.file} in #{distance_of_time_in_words(start_time)} with #{errors.length} errors."
  say "Processing average: #{Engine.average_rows_per_second} rows/sec)"

  # Per-stage throughput, reported only for stages with recorded time.
  say "Avg after_reads: #{Engine.rows_read/benchmarks[:after_reads]} rows/sec" if benchmarks[:after_reads] > 0
  say "Avg before_writes: #{Engine.rows_read/benchmarks[:before_writes]} rows/sec" if benchmarks[:before_writes] > 0
  say "Avg transforms: #{Engine.rows_read/benchmarks[:transforms]} rows/sec" if benchmarks[:transforms] > 0
  say "Avg writes: #{Engine.rows_read/benchmarks[:writes]} rows/sec" if benchmarks[:writes] > 0

  # say "Avg time writing execution records: #{ETL::Execution::Record.average_time_spent}"
  #
  # ETL::Transform::Transform.benchmarks.each do |klass, t|
  # say "Avg #{klass}: #{Engine.rows_read/t} rows/sec"
  # end

  # Check connections before the job record is updated and saved.
  ActiveRecord::Base.verify_active_connections!
  ETL::Engine.job.completed_at = Time.now
  ETL::Engine.job.status = (errors.length > 0 ? 'completed with errors' : 'completed')
  ETL::Engine.job.save!
end
504
+
505
# True when +row+ is nil or cannot be indexed with [].
def empty_row?(row)
  # unsure about why it should respond to :[] - keeping it just in case for the moment
  return true if row.nil?

  !row.respond_to?(:[])
end
509
+
510
+ private
511
+ # Return true if the error threshold is exceeded
512
# True once more errors have been collected than the control's
# error_threshold allows.
def exceeded_error_threshold?(control)
  error_count = errors.length
  error_count > control.error_threshold
end
515
+
516
+ # Execute all preprocessors
517
# Run every pre-processor of +control+ in order, logging start and end
def pre_process(control)
  Engine.logger.debug "Pre-processing #{control.file}"
  control.pre_processors.each { |processor| processor.process }
  Engine.logger.debug "Pre-processing complete"
end
524
+
525
+ # Execute all postprocessors
526
# Run every post-processor of +control+ in order, announcing and logging
# both the start and the end.
def post_process(control)
  say_on_own_line "Executing post processes"
  Engine.logger.debug "Post-processing #{control.file}"
  control.post_processors.each { |processor| processor.process }
  Engine.logger.debug "Post-processing complete"
  say "Post-processing complete"
end
535
+
536
+ # Execute all dependencies
537
# Execute all dependencies of +control+ (nested lists are flattened).
#
# A Symbol dependency names a control file without its ".ctl" extension; a
# String dependency is used as given. Each one is logged, announced and
# then processed.
#
# Raises RuntimeError for any other dependency type.
def execute_dependencies(control)
  Engine.logger.debug "Executing dependencies"
  control.dependencies.flatten.each do |dependency|
    # Resolve the file name first so both branches report the real name.
    # The String branch used to interpolate a variable that was only
    # assigned in the Symbol branch.
    f = case dependency
        when Symbol
          dependency.to_s + '.ctl'
        when String
          dependency
        else
          raise "Invalid dependency type: #{dependency.class}"
        end
    Engine.logger.debug "Executing dependency: #{f}"
    say "Executing dependency: #{f}"
    process(f)
  end
end
555
+
556
+ # Execute all screens
557
# Run the control's screens: the after-post-process set when +timing+ is
# :after_post_process, the regular set otherwise. Screens run in the order
# fatal, error, warn. A failing fatal screen raises FatalScreenError, a
# failing error screen raises ScreenError and a failing warn screen is only
# announced.
def execute_screens(control, timing = :before_post_process)
  screens =
    if :after_post_process === timing
      control.after_post_process_screens
    else # default to before post-process screens
      control.screens
    end

  [:fatal, :error, :warn].each do |type|
    screens[type].each do |block|
      begin
        block.call
      rescue => e
        raise FatalScreenError, e if type == :fatal
        raise ScreenError, e if type == :error
        # only :warn reaches this point
        say "Screen warning: #{e}"
      end
    end
  end
end
581
+ end
582
+ end