darrell-activewarehouse-etl 0.9.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (83) hide show
  1. data/CHANGELOG +198 -0
  2. data/LICENSE +7 -0
  3. data/README +99 -0
  4. data/Rakefile +175 -0
  5. data/TODO +28 -0
  6. data/bin/etl +28 -0
  7. data/bin/etl.cmd +8 -0
  8. data/examples/database.example.yml +16 -0
  9. data/lib/etl/batch/batch.rb +111 -0
  10. data/lib/etl/batch/directives.rb +55 -0
  11. data/lib/etl/batch.rb +2 -0
  12. data/lib/etl/builder/date_dimension_builder.rb +96 -0
  13. data/lib/etl/builder/time_dimension_builder.rb +31 -0
  14. data/lib/etl/builder.rb +2 -0
  15. data/lib/etl/commands/etl.rb +89 -0
  16. data/lib/etl/control/control.rb +405 -0
  17. data/lib/etl/control/destination/database_destination.rb +97 -0
  18. data/lib/etl/control/destination/file_destination.rb +126 -0
  19. data/lib/etl/control/destination.rb +448 -0
  20. data/lib/etl/control/source/database_source.rb +220 -0
  21. data/lib/etl/control/source/enumerable_source.rb +11 -0
  22. data/lib/etl/control/source/file_source.rb +90 -0
  23. data/lib/etl/control/source/model_source.rb +39 -0
  24. data/lib/etl/control/source.rb +109 -0
  25. data/lib/etl/control.rb +3 -0
  26. data/lib/etl/core_ext/time/calculations.rb +42 -0
  27. data/lib/etl/core_ext/time.rb +5 -0
  28. data/lib/etl/core_ext.rb +1 -0
  29. data/lib/etl/engine.rb +556 -0
  30. data/lib/etl/execution/base.rb +9 -0
  31. data/lib/etl/execution/batch.rb +8 -0
  32. data/lib/etl/execution/job.rb +8 -0
  33. data/lib/etl/execution/migration.rb +85 -0
  34. data/lib/etl/execution.rb +19 -0
  35. data/lib/etl/generator/generator.rb +20 -0
  36. data/lib/etl/generator/surrogate_key_generator.rb +39 -0
  37. data/lib/etl/generator.rb +2 -0
  38. data/lib/etl/http_tools.rb +139 -0
  39. data/lib/etl/parser/apache_combined_log_parser.rb +49 -0
  40. data/lib/etl/parser/delimited_parser.rb +74 -0
  41. data/lib/etl/parser/fixed_width_parser.rb +65 -0
  42. data/lib/etl/parser/parser.rb +41 -0
  43. data/lib/etl/parser/sax_parser.rb +218 -0
  44. data/lib/etl/parser/xml_parser.rb +65 -0
  45. data/lib/etl/parser.rb +11 -0
  46. data/lib/etl/processor/block_processor.rb +14 -0
  47. data/lib/etl/processor/bulk_import_processor.rb +83 -0
  48. data/lib/etl/processor/check_exist_processor.rb +80 -0
  49. data/lib/etl/processor/check_unique_processor.rb +35 -0
  50. data/lib/etl/processor/copy_field_processor.rb +26 -0
  51. data/lib/etl/processor/encode_processor.rb +55 -0
  52. data/lib/etl/processor/hierarchy_exploder_processor.rb +55 -0
  53. data/lib/etl/processor/print_row_processor.rb +12 -0
  54. data/lib/etl/processor/processor.rb +25 -0
  55. data/lib/etl/processor/rename_processor.rb +24 -0
  56. data/lib/etl/processor/require_non_blank_processor.rb +26 -0
  57. data/lib/etl/processor/row_processor.rb +17 -0
  58. data/lib/etl/processor/sequence_processor.rb +23 -0
  59. data/lib/etl/processor/surrogate_key_processor.rb +53 -0
  60. data/lib/etl/processor/truncate_processor.rb +35 -0
  61. data/lib/etl/processor.rb +11 -0
  62. data/lib/etl/row.rb +20 -0
  63. data/lib/etl/screen/row_count_screen.rb +20 -0
  64. data/lib/etl/screen.rb +14 -0
  65. data/lib/etl/transform/block_transform.rb +13 -0
  66. data/lib/etl/transform/date_to_string_transform.rb +20 -0
  67. data/lib/etl/transform/decode_transform.rb +51 -0
  68. data/lib/etl/transform/default_transform.rb +20 -0
  69. data/lib/etl/transform/foreign_key_lookup_transform.rb +169 -0
  70. data/lib/etl/transform/hierarchy_lookup_transform.rb +49 -0
  71. data/lib/etl/transform/ordinalize_transform.rb +12 -0
  72. data/lib/etl/transform/sha1_transform.rb +13 -0
  73. data/lib/etl/transform/string_to_date_transform.rb +16 -0
  74. data/lib/etl/transform/string_to_datetime_transform.rb +14 -0
  75. data/lib/etl/transform/string_to_time_transform.rb +11 -0
  76. data/lib/etl/transform/transform.rb +61 -0
  77. data/lib/etl/transform/trim_transform.rb +26 -0
  78. data/lib/etl/transform/type_transform.rb +35 -0
  79. data/lib/etl/transform.rb +2 -0
  80. data/lib/etl/util.rb +59 -0
  81. data/lib/etl/version.rb +9 -0
  82. data/lib/etl.rb +83 -0
  83. metadata +245 -0
@@ -0,0 +1,55 @@
1
module ETL #:nodoc:
  module Batch #:nodoc:
    # Common superclass of every batch directive. A directive is created
    # for a batch object and run through +execute+; the actual work is
    # supplied by subclasses in the protected +do_execute+ hook.
    class Directive
      # The batch object this directive was created for
      attr_reader :batch

      # Create the directive for the given batch object
      def initialize(batch)
        @batch = batch
      end

      # Run the directive. Returns whatever the subclass hook returns.
      def execute
        do_execute
      end

      # Hook overridden by concrete directives. The base implementation
      # always raises a RuntimeError.
      def do_execute
        raise "Directive must implement do_execute method"
      end
      protected :do_execute
    end

    # Directive that runs the named ETL control file through the
    # engine of the batch.
    class Run < Directive
      # The control file to execute
      attr_reader :file

      # Create the directive for the given batch object and control file
      def initialize(batch, file)
        super(batch)
        @file = file
      end

      # Hand the file to the batch's engine for processing
      def do_execute
        engine = batch.engine
        engine.process(file)
      end
      protected :do_execute
    end

    # Directive that switches the engine over to using temp tables.
    # Construction is inherited unchanged from Directive (one batch argument).
    class UseTempTables < Directive
      # Turn on the engine-wide temp table flag
      def do_execute
        ETL::Engine.use_temp_tables = true
      end
      protected :do_execute
    end
  end
end
data/lib/etl/batch.rb ADDED
@@ -0,0 +1,2 @@
1
+ require 'etl/batch/batch'
2
+ require 'etl/batch/directives'
@@ -0,0 +1,96 @@
1
+ module ETL #:nodoc:
2
+ module Builder #:nodoc:
3
# Builder producing one hash per calendar day between +start_date+ and
# +end_date+ (inclusive), keyed by commonly used date dimension column names.
class DateDimensionBuilder
  # Date of the first record
  attr_accessor :start_date

  # Date of the last record
  attr_accessor :end_date

  # Dates that are flagged as 'Holiday' in the generated records
  attr_accessor :holiday_indicators

  # Month offset handed to the fiscal year calculations
  attr_accessor :fiscal_year_offset_month

  # Weekday indicator per day of week, indexed by Time#wday
  # (so the list starts on Sunday and ends on Saturday).
  cattr_accessor :weekday_indicators
  @@weekday_indicators = %w(Weekend Weekday Weekday Weekday Weekday Weekday Weekend)

  # Initialize the builder.
  #
  # * <tt>start_date</tt>: The start date. Defaults to 5 years ago from today.
  # * <tt>end_date</tt>: The end date. Defaults to now.
  # * <tt>fiscal_year_offset_month</tt>: Fiscal year offset month. Defaults to 10.
  def initialize(start_date=Time.now.years_ago(5), end_date=Time.now, fiscal_year_offset_month=10)
    @holiday_indicators = []
    @start_date = start_date.to_date
    @end_date = end_date.to_date
    @fiscal_year_offset_month = fiscal_year_offset_month.to_i
  end

  # Returns an array with one record hash for every date in the range.
  # The +options+ argument is accepted but not used.
  def build(options={})
    rows = []
    (start_date..end_date).each { |day| rows << record_from_date(day) }
    rows
  end

  private

  # Build the record hash for a single date. Values are looked up by
  # column name; keys marked DEPRECATED duplicate a newer column.
  def record_from_date(date)
    time = date.to_time # need methods only available in Time

    day_name       = time.strftime("%A")
    week           = time.week
    month          = time.month
    quarter        = time.quarter
    offset         = fiscal_year_offset_month
    fiscal_week    = time.fiscal_year_week(offset)
    fiscal_month   = time.fiscal_year_month(offset)
    fiscal_quarter = time.fiscal_year_quarter(offset)
    fiscal_year    = time.fiscal_year(offset)

    {
      :date => time.strftime("%m/%d/%Y"),
      :full_date_description => time.strftime("%B %d,%Y"),
      :day_of_week => day_name,
      :day_in_week => day_name, # alias
      # not populated: :day_number_in_epoch, :week_number_in_epoch,
      # :month_number_in_epoch
      :day_number_in_calendar_month => time.day,
      :day_number_in_calendar_year => time.yday,
      :day_number_in_fiscal_month => time.day, # should this be different from CY?
      :day_number_in_fiscal_year => time.fiscal_year_yday(offset),
      # not populated: :last_day_in_week_indicator,
      # :last_day_in_month_indicator, :calendar_week_ending_date
      :calendar_week => "Week #{week}",
      :calendar_week_number => week,
      :calendar_week_number_in_year => week, # DEPRECATED
      :calendar_month_name => time.strftime("%B"),
      :calendar_month_number_in_year => month, # DEPRECATED
      :calendar_month_number => month,
      :calendar_year_month => time.strftime("%Y-%m"),
      :calendar_quarter => "Q#{quarter}",
      :calendar_quarter_number => quarter,
      :calendar_quarter_number_in_year => quarter, # DEPRECATED
      :calendar_year_quarter => "#{time.strftime('%Y')}-Q#{quarter}",
      # not populated: :calendar_half_year
      :calendar_year => time.year.to_s,
      :fiscal_week => "FY Week #{fiscal_week}",
      :fiscal_week_number_in_year => fiscal_week, # DEPRECATED
      :fiscal_week_number => fiscal_week,
      :fiscal_month => fiscal_month,
      :fiscal_month_number => fiscal_month,
      :fiscal_month_number_in_year => fiscal_month, # DEPRECATED
      :fiscal_year_month => "FY#{fiscal_year}-" + fiscal_month.to_s.rjust(2, '0'),
      :fiscal_quarter => "FY Q#{fiscal_quarter}",
      :fiscal_year_quarter => "FY#{fiscal_year}-Q#{fiscal_quarter}",
      :fiscal_quarter_number => fiscal_quarter, # DEPRECATED
      :fiscal_year_quarter_number => fiscal_quarter,
      # not populated: :fiscal_half_year
      :fiscal_year => "FY#{fiscal_year}",
      :fiscal_year_number => fiscal_year,
      :holiday_indicator => (holiday_indicators.include?(date) ? 'Holiday' : 'Nonholiday'),
      :weekday_indicator => weekday_indicators[time.wday],
      :selling_season => 'None',
      :major_event => 'None',
      :sql_date_stamp => date
    }
  end
end
95
+ end
96
+ end
@@ -0,0 +1,31 @@
1
+ module ETL #:nodoc:
2
+ module Builder #:nodoc:
3
# Builder that creates a simple time dimension: one record for every
# second of a day.
#
# The previous version never closed +initialize+, so +build+ was defined
# as a nested method: it only came into existence once an instance had
# been created and was redefined on every instantiation. +build+ is now a
# regular instance method; the no-argument constructor is unchanged.
class TimeDimensionBuilder
  # Returns an array of hashes representing records in the dimension,
  # ordered from 00:00:00 to 23:59:59 (86,400 records). The values for each
  # record are accessed by name:
  #
  # * <tt>:hour</tt>, <tt>:minute</tt>, <tt>:second</tt>: the integer parts
  # * <tt>:minute_description</tt>: zero padded "HH:MM"
  # * <tt>:full_description</tt>: zero padded "HH:MM:SS"
  #
  # The +options+ argument is accepted but not used.
  def build(options={})
    records = []
    0.upto(23) do |t_hour|
      # the padded strings only depend on the enclosing loop value, so
      # compute them once per hour / minute rather than once per second
      t_hour_string = t_hour.to_s.rjust(2, '0')
      0.upto(59) do |t_minute|
        t_minute_string = t_minute.to_s.rjust(2, '0')
        0.upto(59) do |t_second|
          t_second_string = t_second.to_s.rjust(2, '0')
          records << {
            :hour => t_hour,
            :minute => t_minute,
            :second => t_second,
            :minute_description => "#{t_hour_string}:#{t_minute_string}",
            :full_description => "#{t_hour_string}:#{t_minute_string}:#{t_second_string}"
          }
        end
      end
    end
    records
  end
end
30
+ end
31
+ end
@@ -0,0 +1,2 @@
1
+ require 'etl/builder/date_dimension_builder'
2
+ require 'etl/builder/time_dimension_builder'
@@ -0,0 +1,89 @@
1
+ #--
2
+ # Copyright (c) 2006 Anthony Eden
3
+ #
4
+ # Permission is hereby granted, free of charge, to any person obtaining
5
+ # a copy of this software and associated documentation files (the
6
+ # "Software"), to deal in the Software without restriction, including
7
+ # without limitation the rights to use, copy, modify, merge, publish,
8
+ # distribute, sublicense, and/or sell copies of the Software, and to
9
+ # permit persons to whom the Software is furnished to do so, subject to
10
+ # the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be
13
+ # included in all copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
19
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
20
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
21
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22
+ #++
23
+
24
+ require 'benchmark'
25
+ require 'getoptlong'
26
+
27
# Print a usage statement to standard output. The first line is the
# invocation synopsis; the remaining lines list every option accepted by
# the GetoptLong definition in this file, with VALUE marking the options
# that require an argument.
def usage #:nodoc:
  puts "Usage: etl file [file file ...]"
  puts [
    "Options:",
    "  -v, --version              print the version",
    "  -h, --help                 print this usage statement",
    "  -c, --config VALUE",
    "  -l, --limit VALUE",
    "  -o, --offset VALUE",
    "  -n, --newlog",
    "  -s, --skip-bulk-import",
    "      --read-locally",
    "      --rails-root VALUE"
  ]
end
31
+
32
# Command line entry point. Parses the options, then initializes the
# engine with them and processes every remaining argument in ARGV as a
# file. --version and --help print their message and return immediately;
# when no file arguments remain the usage statement is printed instead.
def execute
  parser = GetoptLong.new(
    ['--version',          '-v', GetoptLong::NO_ARGUMENT],
    ['--help',             '-h', GetoptLong::NO_ARGUMENT],
    ['--config',           '-c', GetoptLong::REQUIRED_ARGUMENT],
    ['--limit',            '-l', GetoptLong::REQUIRED_ARGUMENT],
    ['--offset',           '-o', GetoptLong::REQUIRED_ARGUMENT],
    ['--newlog',           '-n', GetoptLong::NO_ARGUMENT],
    ['--skip-bulk-import', '-s', GetoptLong::NO_ARGUMENT],
    ['--read-locally',           GetoptLong::NO_ARGUMENT],
    ['--rails-root',             GetoptLong::REQUIRED_ARGUMENT]
  )

  # Collect the engine options; GetoptLong removes parsed options from ARGV
  settings = {}
  parser.each do |flag, value|
    case flag
    when '--version'
      puts "ActiveWarehouse ETL version #{ETL::VERSION::STRING}"
      return
    when '--help'
      usage
      return
    when '--config' then settings[:config] = value
    when '--limit'  then settings[:limit] = value.to_i
    when '--offset' then settings[:offset] = value.to_i
    when '--newlog' then settings[:newlog] = true
    when '--skip-bulk-import'
      puts "skip bulk import enabled"
      settings[:skip_bulk_import] = true
    when '--read-locally'
      puts "read locally enabled"
      settings[:read_locally] = true
    when '--rails-root'
      settings[:rails_root] = value
      puts "rails root set to #{value}"
    end
  end

  # Nothing left to process: show the usage statement and stop
  return usage if ARGV.empty?

  puts "Starting ETL process"

  ETL::Engine.init(settings)
  ARGV.each do |control_file|
    ETL::Engine.realtime_activity = true
    ETL::Engine.process(control_file)
  end

  puts "ETL process complete\n\n"
end
88
+
89
+ execute
@@ -0,0 +1,405 @@
1
+ module ETL #:nodoc:
2
+ module Control #:nodoc:
3
# The Context is passed to eval: Control.parse evaluates the text of a
# control file against the binding of a Context instance, so every public
# method defined here is available as a statement inside a control file.
# Each statement records its result on the wrapped control object.
class Context
  require 'test/unit/assertions'
  include Test::Unit::Assertions

  # The control object that the statements populate
  attr_reader :control

  class << self
    # Create a Context instance for the given control object and return
    # its binding, ready to be handed to eval.
    def create(control)
      Context.new(control).get_binding
    end
  end

  # Initialize the context with the control object to populate
  def initialize(control)
    @control = control
  end

  # Get the control file
  def file
    control.file
  end

  # Set the allowed error threshold
  def set_error_threshold(error_threshold)
    control.error_threshold = error_threshold
  end

  # Define a list of control files that this file depends on. Those control
  # files will be executed prior to this control file. The list may
  # contain symbols that will be converted to file names by calling
  # to_s + '.ctl', or they may be strings in which case they will be used
  # as is
  def depends_on(*args)
    (dependencies << args).flatten!
  end

  # Get the defined dependencies
  def dependencies
    control.dependencies
  end

  # Define a source.
  #
  # When <tt>configuration[:type]</tt> is given it may be a Class (which is
  # instantiated), a String or Symbol (resolved through
  # ETL::Control::Source.class_for_name) or an ETL::Control::Source
  # instance (used as is). Without a :type the first supported source type
  # that appears as a key in +configuration+ decides the class.
  #
  # Raises ControlError when the type is of an unsupported kind, or when no
  # supported source type matches. The latter check is made per call; it
  # used to test whether *any* source existed, so an unmatched source
  # declared after a valid one was silently ignored.
  def source(name, configuration={}, definition={})
    type = configuration[:type]
    if type
      case type
      when Class
        sources << type.new(self, configuration, definition)
      when String, Symbol
        source_class = ETL::Control::Source.class_for_name(type)
        sources << source_class.new(self, configuration, definition)
      else
        unless type.is_a?(ETL::Control::Source)
          raise ControlError, "Type must be a Class, String, Symbol or object extending ETL::Control::Source"
        end
        sources << type
      end
    else
      source_type = source_types.detect { |candidate| configuration[candidate] }
      if source_type.nil?
        raise ControlError, "A source was specified but no matching type was found"
      end
      source_class = ETL::Control::Source.class_for_name(source_type)
      sources << source_class.new(self, configuration, definition)
    end
  end

  # Get the defined source
  def sources
    control.sources
  end

  # Define a destination.
  #
  # The type is resolved exactly as for +source+, using
  # ETL::Control::Destination and the supported destination types.
  #
  # Raises ControlError when the type is of an unsupported kind, or when no
  # supported destination type matches (checked per call, see +source+).
  def destination(name, configuration={}, mapping={})
    type = configuration[:type]
    if type
      case type
      when Class
        destinations << type.new(self, configuration, mapping)
      when String, Symbol
        dest_class = ETL::Control::Destination.class_for_name(type)
        destinations << dest_class.new(self, configuration, mapping)
      else
        unless type.is_a?(ETL::Control::Destination)
          raise ControlError, "Type must be a Class, String, Symbol or object extending ETL::Control::Destination"
        end
        destinations << type
      end
    else
      dest_type = destination_types.detect { |candidate| configuration[candidate] }
      if dest_type.nil?
        raise ControlError, "A destination was specified but no matching destination type was found"
      end
      dest_class = ETL::Control::Destination.class_for_name(dest_type)
      destinations << dest_class.new(self, configuration, mapping)
    end
  end

  # Get the defined destinations
  def destinations
    control.destinations
  end

  # Define a transform for the field +name+.
  #
  # +transformer+ may be a String or Symbol (resolved to the class
  # ETL::Transform::<Camelized>Transform), a Class (instantiated), or an
  # ETL::Transform::Transform instance (duplicated and renamed to +name+).
  # When no transformer is given a block is required and is wrapped in a
  # BlockTransform.
  #
  # Raises ControlError when the transformer cannot be resolved, is of an
  # unsupported kind, or when neither a transformer nor a block is given.
  def transform(name, transformer=nil, configuration={}, &block)
    if transformer
      case transformer
      when String, Symbol
        class_name = "#{transformer.to_s.camelize}Transform"
        begin
          transform_class = ETL::Transform.const_get(class_name)
          transforms << transform_class.new(self, name, configuration)
        rescue NameError => e
          raise ControlError, "Unable to find transformer #{class_name}: #{e}"
        end
      when Class
        # Pass the field name, as every other branch does. This used to
        # pass transformer.name, i.e. the name of the class itself.
        transforms << transformer.new(self, name, configuration)
      else
        unless transformer.is_a?(ETL::Transform::Transform)
          raise ControlError, "Transformer must be a String, Symbol, Class or Transform instance"
        end
        Engine.logger.debug "Adding transformer #{transformer.inspect} for field #{name}"
        # work on a copy so the instance handed in keeps its own name
        t = transformer.dup
        t.name = name
        transforms << t
      end
    elsif block_given?
      transforms << ETL::Transform::BlockTransform.new(self, name, :block => block)
    else
      raise ControlError, "Either a transformer or a block must be specified"
    end
  end

  # Get the defined transforms
  def transforms
    control.transforms
  end

  # Define a before post-process screen block. The type argument must be
  # one of :fatal, :error or :warn
  def screen(type, &block)
    screens[type] << block
  end

  # Get the before post-process screen blocks
  def screens
    control.screens
  end

  # Define an after post-process screen block. The type argument must be
  # one of :fatal, :error or :warn
  def after_post_process_screen(type, &block)
    after_post_process_screens[type] << block
  end

  # Get the after post-process screen blocks
  def after_post_process_screens
    control.after_post_process_screens
  end

  # Rename the source field to the destination field
  def rename(source, destination)
    after_read :rename, :source => source, :dest => destination
  end

  # Copy the source field to the destination field
  def copy(source, destination)
    after_read :copy_field, :source => source, :dest => destination
  end

  protected
  # This method is used to define a processor and insert into the specified processor
  # collection.
  #
  # +name+ may be a String or Symbol (resolved to the class
  # ETL::Processor::<Camelized>Processor), nil (treated as 'block') or a
  # Class. For the block processor the given proc is stored in
  # <tt>configuration[:block]</tt>; the name is compared as a string so
  # that :block and 'block' behave the same.
  #
  # Raises ControlError when the processor cannot be resolved, when the
  # block processor is requested without a block, or when +name+ is of an
  # unsupported kind.
  def define_processor(name, processor_collection, configuration, proc)
    case name
    when String, Symbol, nil
      name ||= 'block'
      class_name = "#{name.to_s.camelize}Processor"
      begin
        processor_class = ETL::Processor.const_get(class_name)
        if name.to_s == 'block'
          raise ControlError, "A block must be passed for block processor" if proc.nil?
          configuration[:block] = proc
        end
        processor_collection << processor_class.new(self, configuration)
      rescue NameError => e
        raise ControlError, "Unable to find processor #{class_name}: #{e}"
      end
    when Class
      processor_collection << name.new(self, configuration)
    else
      raise ControlError, "The process declaration requires a String, Symbol, Class, or a Block to be passed"
    end
  end

  public
  # Define an "after read" processor. This must be a row-level processor.
  def after_read(name='block', configuration={}, &block)
    define_processor(name, after_read_processors, configuration, block)
  end

  # Get the defined "after read" processors
  def after_read_processors
    control.after_read_processors
  end

  # Define a "before write" processor. This must be a row-level processor.
  def before_write(name='block', configuration={}, &block)
    define_processor(name, before_write_processors, configuration, block)
  end

  # Get the defined "before write" processors
  def before_write_processors
    control.before_write_processors
  end

  # Define a pre-processor
  def pre_process(name='block', configuration={}, &block)
    define_processor(name, pre_processors, configuration, block)
  end

  # Get the defined pre-processors
  def pre_processors
    control.pre_processors
  end

  # Define a post-processor
  def post_process(name='block', configuration={}, &block)
    define_processor(name, post_processors, configuration, block)
  end

  # Get the defined post-processors
  def post_processors
    control.post_processors
  end

  # Get the binding object
  def get_binding
    binding
  end

  protected
  # Get an array of supported source types
  def source_types
    control.source_types
  end

  # Get an array of supported destination types
  def destination_types
    control.destination_types
  end

end
265
+
266
# Object representation of a control file. It holds the collections that
# the statements of a control file populate while the file is evaluated
# (dependencies, sources, destinations, transforms, processors, screens).
class Control
  # The control file this instance was created for. Control.parse stores
  # the path (File objects are converted to their path); parse_text
  # stores nil.
  attr_reader :file

  # The error threshold (the reader below supplies the default)
  attr_writer :error_threshold

  class << self
    # Parse a control file and return a Control instance.
    #
    # +control_file+ is a path or a File object. The file content is
    # evaluated as Ruby code against a Context binding, so control files
    # must come from a trusted location.
    #
    # The content is read verbatim and the file name is passed to eval, so
    # the line numbers in errors raised while evaluating match the control
    # file. (Joining the lines of IO.readlines with "\n", as was done
    # before, doubled every line break.)
    def parse(control_file)
      control_file = control_file.path if control_file.instance_of?(File)
      control = ETL::Control::Control.new(control_file)
      eval(File.read(control_file), Context.create(control), control_file)
      control.validate
      control
    end

    # Parse control file statements given as a string and return a Control
    # instance whose file is nil. The text is evaluated as Ruby code, so it
    # must come from a trusted location.
    def parse_text(text)
      control = ETL::Control::Control.new(nil)
      eval(text, Context.create(control), 'inline')
      control.validate
      control
    end

    # Resolve the given object to an ETL::Control::Control instance. Acceptable arguments
    # are:
    # * The path to a control file as a String
    # * A File object referencing the control file
    # * The ETL::Control::Control object (which will just be returned)
    #
    # Raises a ControlError if any other type is given
    def resolve(control)
      case control
      when String
        # parse works on the path; opening a File here only leaked a handle
        ETL::Control::Control.parse(control)
      when File
        ETL::Control::Control.parse(control)
      when ETL::Control::Control
        control
      else
        raise ControlError, "Control must be a String, File or Control object"
      end
    end
  end

  # Initialize the instance with the given control file
  def initialize(file)
    @file = file
  end

  # Get a list of dependencies
  def dependencies
    @dependencies ||= []
  end

  # Get the defined source
  def sources
    @sources ||= []
  end

  # Get the defined destinations
  def destinations
    @destinations ||= []
  end

  # Get the transforms with the specified name
  # def transform(name)
  #   transforms[name] ||= []
  # end

  # Get all of the "after read" processors
  def after_read_processors
    @after_read_processors ||= []
  end

  # Get all of the "before write" processors
  def before_write_processors
    @before_write_processors ||= []
  end

  # Get an Array of preprocessors
  def pre_processors
    @pre_processors ||= []
  end

  # Get an Array of post processors
  def post_processors
    @post_processors ||= []
  end

  # Get an Array of all transforms for this control
  def transforms
    @transforms ||= []
  end

  # A hash of the screens executed before post-process, keyed by
  # :fatal, :error and :warn
  def screens
    @screens ||= {
      :fatal => [],
      :error => [],
      :warn => []
    }
  end

  # A hash of the screens executed after post-process, keyed by
  # :fatal, :error and :warn
  def after_post_process_screens
    @after_post_process_screens ||= {
      :fatal => [],
      :error => [],
      :warn => []
    }
  end

  # Get the error threshold. Defaults to 100.
  def error_threshold
    @error_threshold ||= 100
  end

  # Validate the control file. Currently performs no checks; the source
  # and destination checks below are disabled.
  def validate
    #unless sources.length > 0
    #  raise ControlError, "Configuration must include one of the following for the source: #{source_types.join(',')}"
    #end
    #unless destinations.length > 0
    #  raise ControlError, "Configuration must include one of the following for the destination: #{destination_types.join(',')}"
    #end
  end

  # Get an array of supported source types
  def source_types
    [:file, :database]
  end

  # Get an array of supported destination types
  def destination_types
    [:file, :database]
  end

end
404
+ end
405
+ end