darrell-activewarehouse-etl 0.9.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +198 -0
- data/LICENSE +7 -0
- data/README +99 -0
- data/Rakefile +175 -0
- data/TODO +28 -0
- data/bin/etl +28 -0
- data/bin/etl.cmd +8 -0
- data/examples/database.example.yml +16 -0
- data/lib/etl/batch/batch.rb +111 -0
- data/lib/etl/batch/directives.rb +55 -0
- data/lib/etl/batch.rb +2 -0
- data/lib/etl/builder/date_dimension_builder.rb +96 -0
- data/lib/etl/builder/time_dimension_builder.rb +31 -0
- data/lib/etl/builder.rb +2 -0
- data/lib/etl/commands/etl.rb +89 -0
- data/lib/etl/control/control.rb +405 -0
- data/lib/etl/control/destination/database_destination.rb +97 -0
- data/lib/etl/control/destination/file_destination.rb +126 -0
- data/lib/etl/control/destination.rb +448 -0
- data/lib/etl/control/source/database_source.rb +220 -0
- data/lib/etl/control/source/enumerable_source.rb +11 -0
- data/lib/etl/control/source/file_source.rb +90 -0
- data/lib/etl/control/source/model_source.rb +39 -0
- data/lib/etl/control/source.rb +109 -0
- data/lib/etl/control.rb +3 -0
- data/lib/etl/core_ext/time/calculations.rb +42 -0
- data/lib/etl/core_ext/time.rb +5 -0
- data/lib/etl/core_ext.rb +1 -0
- data/lib/etl/engine.rb +556 -0
- data/lib/etl/execution/base.rb +9 -0
- data/lib/etl/execution/batch.rb +8 -0
- data/lib/etl/execution/job.rb +8 -0
- data/lib/etl/execution/migration.rb +85 -0
- data/lib/etl/execution.rb +19 -0
- data/lib/etl/generator/generator.rb +20 -0
- data/lib/etl/generator/surrogate_key_generator.rb +39 -0
- data/lib/etl/generator.rb +2 -0
- data/lib/etl/http_tools.rb +139 -0
- data/lib/etl/parser/apache_combined_log_parser.rb +49 -0
- data/lib/etl/parser/delimited_parser.rb +74 -0
- data/lib/etl/parser/fixed_width_parser.rb +65 -0
- data/lib/etl/parser/parser.rb +41 -0
- data/lib/etl/parser/sax_parser.rb +218 -0
- data/lib/etl/parser/xml_parser.rb +65 -0
- data/lib/etl/parser.rb +11 -0
- data/lib/etl/processor/block_processor.rb +14 -0
- data/lib/etl/processor/bulk_import_processor.rb +83 -0
- data/lib/etl/processor/check_exist_processor.rb +80 -0
- data/lib/etl/processor/check_unique_processor.rb +35 -0
- data/lib/etl/processor/copy_field_processor.rb +26 -0
- data/lib/etl/processor/encode_processor.rb +55 -0
- data/lib/etl/processor/hierarchy_exploder_processor.rb +55 -0
- data/lib/etl/processor/print_row_processor.rb +12 -0
- data/lib/etl/processor/processor.rb +25 -0
- data/lib/etl/processor/rename_processor.rb +24 -0
- data/lib/etl/processor/require_non_blank_processor.rb +26 -0
- data/lib/etl/processor/row_processor.rb +17 -0
- data/lib/etl/processor/sequence_processor.rb +23 -0
- data/lib/etl/processor/surrogate_key_processor.rb +53 -0
- data/lib/etl/processor/truncate_processor.rb +35 -0
- data/lib/etl/processor.rb +11 -0
- data/lib/etl/row.rb +20 -0
- data/lib/etl/screen/row_count_screen.rb +20 -0
- data/lib/etl/screen.rb +14 -0
- data/lib/etl/transform/block_transform.rb +13 -0
- data/lib/etl/transform/date_to_string_transform.rb +20 -0
- data/lib/etl/transform/decode_transform.rb +51 -0
- data/lib/etl/transform/default_transform.rb +20 -0
- data/lib/etl/transform/foreign_key_lookup_transform.rb +169 -0
- data/lib/etl/transform/hierarchy_lookup_transform.rb +49 -0
- data/lib/etl/transform/ordinalize_transform.rb +12 -0
- data/lib/etl/transform/sha1_transform.rb +13 -0
- data/lib/etl/transform/string_to_date_transform.rb +16 -0
- data/lib/etl/transform/string_to_datetime_transform.rb +14 -0
- data/lib/etl/transform/string_to_time_transform.rb +11 -0
- data/lib/etl/transform/transform.rb +61 -0
- data/lib/etl/transform/trim_transform.rb +26 -0
- data/lib/etl/transform/type_transform.rb +35 -0
- data/lib/etl/transform.rb +2 -0
- data/lib/etl/util.rb +59 -0
- data/lib/etl/version.rb +9 -0
- data/lib/etl.rb +83 -0
- metadata +245 -0
@@ -0,0 +1,55 @@
|
|
1
|
+
module ETL #:nodoc:
  module Batch #:nodoc:
    # Abstract base class for batch directives. Concrete directives supply
    # their behaviour by overriding the protected +do_execute+ hook.
    class Directive
      # The batch object this directive was created for
      attr_reader :batch

      # Create the directive for the given batch object
      def initialize(batch)
        @batch = batch
      end

      # Run the directive by delegating to +do_execute+
      def execute
        do_execute
      end

      protected

      # Hook implemented by subclasses. The base implementation always raises
      # a RuntimeError.
      def do_execute
        raise "Directive must implement do_execute method"
      end
    end

    # Directive indicating that the specified ETL control file should be run
    class Run < Directive
      # The control file to execute
      attr_reader :file

      # Create the directive for the given batch object and control file
      def initialize(batch, file)
        super(batch)
        @file = file
      end

      protected

      # Hand the control file to the batch's engine for processing
      def do_execute
        engine = batch.engine
        engine.process(file)
      end
    end

    # Directive indicating temp tables should be used. The constructor is
    # inherited unchanged from Directive (it takes the batch object).
    class UseTempTables < Directive
      protected

      # Turn on the engine-wide temp table flag
      def do_execute
        ETL::Engine.use_temp_tables = true
      end
    end
  end
end
|
data/lib/etl/batch.rb
ADDED
@@ -0,0 +1,96 @@
|
|
1
|
+
module ETL #:nodoc:
|
2
|
+
module Builder #:nodoc:
|
3
|
+
# A builder which will build a data structure which can be used to populate a date dimension using
|
4
|
+
# commonly used date dimension columns.
|
5
|
+
class DateDimensionBuilder
  # Date of the first record to generate
  attr_accessor :start_date

  # Date of the last record to generate
  attr_accessor :end_date

  # Collection of holiday dates; a day found in it is flagged 'Holiday'
  attr_accessor :holiday_indicators

  # Offset month passed to the fiscal year calculations
  attr_accessor :fiscal_year_offset_month

  # Weekday/weekend labels looked up by Time#wday, so the default array
  # begins on Sunday and goes to Saturday.
  cattr_accessor :weekday_indicators
  @@weekday_indicators = %w[Weekend Weekday Weekday Weekday Weekday Weekday Weekend]

  # Initialize the builder.
  #
  # * <tt>start_date</tt>: The start date. Defaults to 5 years ago from today.
  # * <tt>end_date</tt>: The end date. Defaults to now.
  # * <tt>fiscal_year_offset_month</tt>: Offset month for the fiscal year. Defaults to 10.
  def initialize(start_date=Time.now.years_ago(5), end_date=Time.now, fiscal_year_offset_month=10)
    @start_date = start_date.to_date
    @end_date = end_date.to_date
    @fiscal_year_offset_month = fiscal_year_offset_month.to_i
    @holiday_indicators = []
  end

  # Returns an array of hashes, one per day from start_date through end_date.
  # The options argument is accepted but not used.
  def build(options={})
    Range.new(start_date, end_date).map { |day| record_from_date(day) }
  end

  private

  # Returns the dimension record for a single date as a hash keyed by
  # column name.
  def record_from_date(date)
    moment = date.to_time # need methods only available in Time
    offset = fiscal_year_offset_month
    row = {}

    # Calendar day attributes
    row[:date] = moment.strftime("%m/%d/%Y")
    row[:full_date_description] = moment.strftime("%B %d,%Y")
    row[:day_of_week] = moment.strftime("%A")
    row[:day_in_week] = row[:day_of_week] # alias
    #row[:day_number_in_epoch] = moment.to_i / 24
    #row[:week_number_in_epoch] = moment.to_i / (24 * 7)
    #row[:month_number_in_epoch] = moment.to_i / (24 * 7 * 30)
    row[:day_number_in_calendar_month] = moment.day
    row[:day_number_in_calendar_year] = moment.yday
    row[:day_number_in_fiscal_month] = moment.day # should this be different from CY?
    row[:day_number_in_fiscal_year] = moment.fiscal_year_yday(offset)
    #row[:last_day_in_week_indicator] =
    #row[:last_day_in_month_indicator] =
    #row[:calendar_week_ending_date] =

    # Calendar week / month / quarter / year attributes
    row[:calendar_week] = "Week #{moment.week}"
    row[:calendar_week_number] = moment.week
    row[:calendar_week_number_in_year] = moment.week # DEPRECATED
    row[:calendar_month_name] = moment.strftime("%B")
    row[:calendar_month_number_in_year] = moment.month # DEPRECATED
    row[:calendar_month_number] = moment.month
    row[:calendar_year_month] = moment.strftime("%Y-%m")
    row[:calendar_quarter] = "Q#{moment.quarter}"
    row[:calendar_quarter_number] = moment.quarter
    row[:calendar_quarter_number_in_year] = moment.quarter # DEPRECATED
    row[:calendar_year_quarter] = "#{moment.strftime('%Y')}-#{row[:calendar_quarter]}"
    #row[:calendar_half_year] =
    row[:calendar_year] = moment.year.to_s

    # Fiscal week / month / quarter / year attributes
    row[:fiscal_week] = "FY Week #{moment.fiscal_year_week(offset)}"
    row[:fiscal_week_number_in_year] = moment.fiscal_year_week(offset) # DEPRECATED
    row[:fiscal_week_number] = moment.fiscal_year_week(offset)
    row[:fiscal_month] = moment.fiscal_year_month(offset)
    row[:fiscal_month_number] = moment.fiscal_year_month(offset)
    row[:fiscal_month_number_in_year] = moment.fiscal_year_month(offset) # DEPRECATED
    row[:fiscal_year_month] = "FY#{moment.fiscal_year(offset)}-#{moment.fiscal_year_month(offset).to_s.rjust(2, '0')}"
    row[:fiscal_quarter] = "FY Q#{moment.fiscal_year_quarter(offset)}"
    row[:fiscal_year_quarter] = "FY#{moment.fiscal_year(offset)}-Q#{moment.fiscal_year_quarter(offset)}"
    row[:fiscal_quarter_number] = moment.fiscal_year_quarter(offset) # DEPRECATED
    row[:fiscal_year_quarter_number] = moment.fiscal_year_quarter(offset)
    #row[:fiscal_half_year] =
    row[:fiscal_year] = "FY#{moment.fiscal_year(offset)}"
    row[:fiscal_year_number] = moment.fiscal_year(offset)

    # Indicators and fixed placeholder values
    row[:holiday_indicator] = holiday_indicators.include?(date) ? 'Holiday' : 'Nonholiday'
    row[:weekday_indicator] = weekday_indicators[moment.wday]
    row[:selling_season] = 'None'
    row[:major_event] = 'None'
    row[:sql_date_stamp] = date

    row
  end
end
|
95
|
+
end
|
96
|
+
end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
module ETL #:nodoc:
|
2
|
+
module Builder #:nodoc:
|
3
|
+
# Builder that creates a simple time dimension.
|
4
|
+
class TimeDimensionBuilder
  # Create the builder. It takes no arguments and keeps no state.
  #
  # This method used to be left unterminated, which nested +build+ inside
  # it: +build+ was then only defined once an instance had been created and
  # was redefined on every call to +new+. Both methods are now ordinary
  # instance methods.
  def initialize
  end

  # Returns an array of hashes representing records in the dimension, one
  # per second of the day (00:00:00 through 23:59:59). The values for each
  # record are accessed by name: :hour, :minute, :second,
  # :minute_description ("HH:MM") and :full_description ("HH:MM:SS").
  #
  # The options argument is accepted but not used.
  def build(options={})
    records = []
    0.upto(23) do |t_hour|
      t_hour_string = t_hour.to_s.rjust(2, '0')
      0.upto(59) do |t_minute|
        t_minute_string = t_minute.to_s.rjust(2, '0')
        0.upto(59) do |t_second|
          t_second_string = t_second.to_s.rjust(2, '0')
          record = {}
          record[:hour] = t_hour
          record[:minute] = t_minute
          record[:second] = t_second
          record[:minute_description] = "#{t_hour_string}:#{t_minute_string}"
          record[:full_description] = "#{t_hour_string}:#{t_minute_string}:#{t_second_string}"
          records << record
        end
      end
    end
    records
  end
end
|
30
|
+
end
|
31
|
+
end
|
data/lib/etl/builder.rb
ADDED
@@ -0,0 +1,89 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2006 Anthony Eden
|
3
|
+
#
|
4
|
+
# Permission is hereby granted, free of charge, to any person obtaining
|
5
|
+
# a copy of this software and associated documentation files (the
|
6
|
+
# "Software"), to deal in the Software without restriction, including
|
7
|
+
# without limitation the rights to use, copy, modify, merge, publish,
|
8
|
+
# distribute, sublicense, and/or sell copies of the Software, and to
|
9
|
+
# permit persons to whom the Software is furnished to do so, subject to
|
10
|
+
# the following conditions:
|
11
|
+
#
|
12
|
+
# The above copyright notice and this permission notice shall be
|
13
|
+
# included in all copies or substantial portions of the Software.
|
14
|
+
#
|
15
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
16
|
+
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
17
|
+
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
18
|
+
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
19
|
+
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
20
|
+
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
21
|
+
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
22
|
+
#++
|
23
|
+
|
24
|
+
require 'benchmark'
|
25
|
+
require 'getoptlong'
|
26
|
+
|
27
|
+
# Print a usage statement
|
28
|
+
# Write the one-line usage statement for the etl command to standard output.
def usage #:nodoc:
  # TODO: add the command line options
  $stdout.puts 'Usage: etl file [file file ...]'
end
|
31
|
+
|
32
|
+
# Command-line entry point: parse the options from ARGV, then run every
# remaining argument as a control file through ETL::Engine.
#
# --version and --help print their message and return immediately. When no
# file arguments remain after option parsing, the usage statement is printed
# instead of running anything.
def execute
  parser = GetoptLong.new(
    ['--version', '-v', GetoptLong::NO_ARGUMENT],
    ['--help', '-h', GetoptLong::NO_ARGUMENT],
    ['--config', '-c', GetoptLong::REQUIRED_ARGUMENT],
    ['--limit', '-l', GetoptLong::REQUIRED_ARGUMENT],
    ['--offset', '-o', GetoptLong::REQUIRED_ARGUMENT],
    ['--newlog', '-n', GetoptLong::NO_ARGUMENT],
    ['--skip-bulk-import', '-s', GetoptLong::NO_ARGUMENT],
    ['--read-locally', GetoptLong::NO_ARGUMENT],
    ['--rails-root', GetoptLong::REQUIRED_ARGUMENT]
  )

  # Collect the engine options; GetoptLong removes parsed options from ARGV.
  settings = {}
  parser.each do |flag, value|
    case flag
    when '--version'
      puts "ActiveWarehouse ETL version #{ETL::VERSION::STRING}"
      return
    when '--help'
      usage
      return
    when '--config' then settings[:config] = value
    when '--limit' then settings[:limit] = value.to_i
    when '--offset' then settings[:offset] = value.to_i
    when '--newlog' then settings[:newlog] = true
    when '--skip-bulk-import'
      puts "skip bulk import enabled"
      settings[:skip_bulk_import] = true
    when '--read-locally'
      puts "read locally enabled"
      settings[:read_locally] = true
    when '--rails-root'
      settings[:rails_root] = value
      puts "rails root set to #{settings[:rails_root]}"
    end
  end

  # Whatever is left in ARGV is the list of control files to run.
  return usage if ARGV.empty?

  puts "Starting ETL process"

  ETL::Engine.init(settings)
  ARGV.each do |control_file|
    # The flag is set again before every file, as in the original loop.
    ETL::Engine.realtime_activity = true
    ETL::Engine.process(control_file)
  end

  puts "ETL process complete\n\n"
end
|
88
|
+
|
89
|
+
# Run the command as soon as this file is loaded.
execute
|
@@ -0,0 +1,405 @@
|
|
1
|
+
module ETL #:nodoc:
|
2
|
+
module Control #:nodoc:
|
3
|
+
# The Context is passed to eval.
|
4
|
+
class Context
|
5
|
+
require 'test/unit/assertions'
|
6
|
+
include Test::Unit::Assertions
|
7
|
+
attr_reader :control
|
8
|
+
|
9
|
+
class << self
|
10
|
+
# Create a Context instance
|
11
|
+
def create(control)
|
12
|
+
Context.new(control).get_binding
|
13
|
+
end
|
14
|
+
end
|
15
|
+
|
16
|
+
# Initialize the context
|
17
|
+
def initialize(control)
|
18
|
+
@control = control
|
19
|
+
end
|
20
|
+
|
21
|
+
# Get the control file
|
22
|
+
def file
|
23
|
+
control.file
|
24
|
+
end
|
25
|
+
|
26
|
+
# Set the allowed error threshold
|
27
|
+
def set_error_threshold(error_threshold)
|
28
|
+
control.error_threshold = error_threshold
|
29
|
+
end
|
30
|
+
|
31
|
+
# Define a list of control files that this file depends on. Those control
|
32
|
+
# files will be executed prior to this control file. The list may
|
33
|
+
# contain symbols that will be converted to file names by calling
|
34
|
+
# to_s + '.ctl', or they may be strings in which case they will be used
|
35
|
+
# as is
|
36
|
+
def depends_on(*args)
|
37
|
+
(dependencies << args).flatten!
|
38
|
+
end
|
39
|
+
|
40
|
+
# Get the defined dependencies
|
41
|
+
def dependencies
|
42
|
+
control.dependencies
|
43
|
+
end
|
44
|
+
|
45
|
+
# Define a source.
|
46
|
+
def source(name, configuration={}, definition={})
|
47
|
+
if configuration[:type]
|
48
|
+
case configuration[:type]
|
49
|
+
when Class
|
50
|
+
source_class = configuration[:type]
|
51
|
+
sources << source_class.new(self, configuration, definition)
|
52
|
+
when String, Symbol
|
53
|
+
source_class = ETL::Control::Source.class_for_name(configuration[:type])
|
54
|
+
sources << source_class.new(self, configuration, definition)
|
55
|
+
else
|
56
|
+
if configuration[:type].is_a?(ETL::Control::Source)
|
57
|
+
sources << configuration[:type]
|
58
|
+
else
|
59
|
+
raise ControlError, "Type must be a Class, String, Symbol or object extending ETL::Control::Source"
|
60
|
+
end
|
61
|
+
end
|
62
|
+
else
|
63
|
+
source_types.each do |source_type|
|
64
|
+
if configuration[source_type]
|
65
|
+
source_class = ETL::Control::Source.class_for_name(source_type)
|
66
|
+
sources << source_class.new(self, configuration, definition)
|
67
|
+
break
|
68
|
+
end
|
69
|
+
end
|
70
|
+
raise ControlError, "A source was specified but no matching type was found" if sources.empty?
|
71
|
+
end
|
72
|
+
end
|
73
|
+
|
74
|
+
# Get the defined source
|
75
|
+
def sources
|
76
|
+
control.sources
|
77
|
+
end
|
78
|
+
|
79
|
+
# Define a destination
|
80
|
+
def destination(name, configuration={}, mapping={})
|
81
|
+
if configuration[:type]
|
82
|
+
case configuration[:type]
|
83
|
+
when Class
|
84
|
+
dest_class = configuration[:type]
|
85
|
+
destinations << dest_class.new(self, configuration, mapping)
|
86
|
+
when String, Symbol
|
87
|
+
dest_class = ETL::Control::Destination.class_for_name(configuration[:type])
|
88
|
+
destinations << dest_class.new(self, configuration, mapping)
|
89
|
+
else
|
90
|
+
if configuration[:type].is_a?(ETL::Control::Destination)
|
91
|
+
destinations << configuration[:type]
|
92
|
+
else
|
93
|
+
raise ControlError, "Type must be a Class, String, Symbol or object extending ETL::Control::Destination"
|
94
|
+
end
|
95
|
+
end
|
96
|
+
else
|
97
|
+
destination_types.each do |dest_type|
|
98
|
+
if configuration[dest_type]
|
99
|
+
dest_class = ETL::Control::Destination.class_for_name(dest_type)
|
100
|
+
destinations << dest_class.new(self, configuration, mapping)
|
101
|
+
break
|
102
|
+
end
|
103
|
+
end
|
104
|
+
raise ControlError, "A destination was specified but no matching destination type was found" if destinations.empty?
|
105
|
+
end
|
106
|
+
end
|
107
|
+
|
108
|
+
# Get the defined destinations
|
109
|
+
def destinations
|
110
|
+
control.destinations
|
111
|
+
end
|
112
|
+
|
113
|
+
# Define a transform
|
114
|
+
def transform(name, transformer=nil, configuration={}, &block)
|
115
|
+
if transformer
|
116
|
+
case transformer
|
117
|
+
when String, Symbol
|
118
|
+
class_name = "#{transformer.to_s.camelize}Transform"
|
119
|
+
begin
|
120
|
+
transform_class = ETL::Transform.const_get(class_name)
|
121
|
+
transforms << transform_class.new(self, name, configuration)
|
122
|
+
rescue NameError => e
|
123
|
+
raise ControlError, "Unable to find transformer #{class_name}: #{e}"
|
124
|
+
end
|
125
|
+
when Class
|
126
|
+
transforms << transformer.new(self, transformer.name, configuration)
|
127
|
+
else
|
128
|
+
#transformer.class.inspect
|
129
|
+
if transformer.is_a?(ETL::Transform::Transform)
|
130
|
+
Engine.logger.debug "Adding transformer #{transformer.inspect} for field #{name}"
|
131
|
+
t = transformer.dup
|
132
|
+
t.name = name
|
133
|
+
transforms << t
|
134
|
+
else
|
135
|
+
raise ControlError, "Transformer must be a String, Symbol, Class or Transform instance"
|
136
|
+
end
|
137
|
+
end
|
138
|
+
elsif block_given?
|
139
|
+
transforms << ETL::Transform::BlockTransform.new(self, name, :block => block)
|
140
|
+
else
|
141
|
+
raise ControlError, "Either a transformer or a block must be specified"
|
142
|
+
end
|
143
|
+
end
|
144
|
+
|
145
|
+
# Get the defined transforms
|
146
|
+
def transforms
|
147
|
+
control.transforms
|
148
|
+
end
|
149
|
+
|
150
|
+
# Define a before post-process screen block. The type argument must be
|
151
|
+
# one of :fatal, :error or :warn
|
152
|
+
def screen(type, &block)
|
153
|
+
screens[type] << block
|
154
|
+
end
|
155
|
+
|
156
|
+
# Get the before post-process screen blocks
|
157
|
+
def screens
|
158
|
+
control.screens
|
159
|
+
end
|
160
|
+
|
161
|
+
# Define an after post-proces screen block. The type argument must be
|
162
|
+
# one of :fatal, :error or :warn
|
163
|
+
def after_post_process_screen(type, &block)
|
164
|
+
after_post_process_screens[type] << block
|
165
|
+
end
|
166
|
+
|
167
|
+
# Get the after post-process screen blocks
|
168
|
+
def after_post_process_screens
|
169
|
+
control.after_post_process_screens
|
170
|
+
end
|
171
|
+
|
172
|
+
# Rename the source field to the destination field
|
173
|
+
def rename(source, destination)
|
174
|
+
after_read :rename, :source => source, :dest => destination
|
175
|
+
end
|
176
|
+
|
177
|
+
# Copy the source field to the destination field
|
178
|
+
def copy(source, destination)
|
179
|
+
after_read :copy_field, :source => source, :dest => destination
|
180
|
+
end
|
181
|
+
|
182
|
+
protected
|
183
|
+
# This method is used to define a processor and insert into the specified processor
|
184
|
+
# collection.
|
185
|
+
def define_processor(name, processor_collection, configuration, proc)
|
186
|
+
case name
|
187
|
+
when String, Symbol, nil
|
188
|
+
name ||= 'block'
|
189
|
+
class_name = "#{name.to_s.camelize}Processor"
|
190
|
+
begin
|
191
|
+
processor_class = ETL::Processor.const_get(class_name)
|
192
|
+
if name == 'block'
|
193
|
+
raise ControlError, "A block must be passed for block processor" if proc.nil?
|
194
|
+
configuration[:block] = proc
|
195
|
+
end
|
196
|
+
processor_collection << processor_class.new(self, configuration)
|
197
|
+
rescue NameError => e
|
198
|
+
raise ControlError, "Unable to find processor #{class_name}: #{e}"
|
199
|
+
end
|
200
|
+
when Class
|
201
|
+
processor_collection << name.new(self, configuration)
|
202
|
+
else
|
203
|
+
raise ControlError, "The process declaration requires a String, Symbol, Class, or a Block to be passed"
|
204
|
+
end
|
205
|
+
end
|
206
|
+
|
207
|
+
public
|
208
|
+
# Define an "after read" processor. This must be a row-level processor.
|
209
|
+
def after_read(name='block', configuration={}, &block)
|
210
|
+
define_processor(name, after_read_processors, configuration, block)
|
211
|
+
end
|
212
|
+
|
213
|
+
# Get the defined "after read" processors
|
214
|
+
def after_read_processors
|
215
|
+
control.after_read_processors
|
216
|
+
end
|
217
|
+
|
218
|
+
# Define a "before write" processor. This must be a row-level processor.
|
219
|
+
def before_write(name='block', configuration={}, &block)
|
220
|
+
define_processor(name, before_write_processors, configuration, block)
|
221
|
+
end
|
222
|
+
|
223
|
+
# Get the defined "before write" processors
|
224
|
+
def before_write_processors
|
225
|
+
control.before_write_processors
|
226
|
+
end
|
227
|
+
|
228
|
+
# Define a pre-processor
|
229
|
+
def pre_process(name='block', configuration={}, &block)
|
230
|
+
define_processor(name, pre_processors, configuration, block)
|
231
|
+
end
|
232
|
+
|
233
|
+
# Get the defined pre-processors
|
234
|
+
def pre_processors
|
235
|
+
control.pre_processors
|
236
|
+
end
|
237
|
+
|
238
|
+
# Define a post-processor
|
239
|
+
def post_process(name='block', configuration={}, &block)
|
240
|
+
define_processor(name, post_processors, configuration, block)
|
241
|
+
end
|
242
|
+
|
243
|
+
# Get the defined post-processors
|
244
|
+
def post_processors
|
245
|
+
control.post_processors
|
246
|
+
end
|
247
|
+
|
248
|
+
# Get the binding object
|
249
|
+
def get_binding
|
250
|
+
binding
|
251
|
+
end
|
252
|
+
|
253
|
+
protected
|
254
|
+
# Get an array of supported source types
|
255
|
+
def source_types
|
256
|
+
control.source_types
|
257
|
+
end
|
258
|
+
|
259
|
+
# Get an array of supported destination types
|
260
|
+
def destination_types
|
261
|
+
control.destination_types
|
262
|
+
end
|
263
|
+
|
264
|
+
end
|
265
|
+
|
266
|
+
# Object representation of a control file
|
267
|
+
# Object representation of a control file. It stores the declarations
# (dependencies, sources, destinations, transforms, processors and screens)
# that are recorded while the control file text is evaluated.
class Control
  # The control file this object was created with: a path String when
  # created through Control.parse, nil when created through parse_text
  attr_reader :file

  # Set the error threshold (the reader below supplies the default)
  attr_writer :error_threshold

  class << self
    # Parse a control file and return a Control instance.
    #
    # control_file is a path String or a File (whose path is used). The file
    # is read verbatim and evaluated as Ruby in a Context binding, so line
    # numbers in errors match the file. Earlier the lines were re-joined with
    # an extra "\n", which doubled every newline in the evaluated text.
    #
    # NOTE: the control file is executed as Ruby code; only parse trusted
    # files.
    def parse(control_file)
      control_file = control_file.path if control_file.is_a?(File)
      control = ETL::Control::Control.new(control_file)
      # TODO: better handling of parser errors. Return the line in the control file where the error occurs.
      eval(File.read(control_file), Context.create(control), control_file)
      control.validate
      control
    end

    # Parse control file text given as a String and return a Control
    # instance whose file is nil. The text is executed as Ruby code.
    def parse_text(text)
      control = ETL::Control::Control.new(nil)
      eval(text, Context.create(control), 'inline')
      control.validate
      control
    end

    # Resolve the given object to an ETL::Control::Control instance. Acceptable arguments
    # are:
    # * The path to a control file as a String
    # * A File object referencing the control file
    # * The ETL::Control::Control object (which will just be returned)
    #
    # Raises a ControlError if any other type is given
    def resolve(control)
      case control
      when String, File
        # A path is passed straight to parse; wrapping it in File.new only
        # to read its path back left an open file handle behind.
        ETL::Control::Control.parse(control)
      when ETL::Control::Control
        control
      else
        raise ControlError, "Control must be a String, File or Control object"
      end
    end
  end

  # Initialize the instance with the given control file (path or nil)
  def initialize(file)
    @file = file
  end

  # Get a list of dependencies
  def dependencies
    @dependencies ||= []
  end

  # Get the defined sources
  def sources
    @sources ||= []
  end

  # Get the defined destinations
  def destinations
    @destinations ||= []
  end

  # Get the transforms with the specified name
  # def transform(name)
  #   transforms[name] ||= []
  # end

  # Get all of the "after read" processors
  def after_read_processors
    @after_read_processors ||= []
  end

  # Get all of the "before write" processors
  def before_write_processors
    @before_write_processors ||= []
  end

  # Get an Array of preprocessors
  def pre_processors
    @pre_processors ||= []
  end

  # Get an Array of post processors
  def post_processors
    @post_processors ||= []
  end

  # Get an Array of all transforms for this control
  def transforms
    @transforms ||= []
  end

  # A hash of the screens executed before post-process
  def screens
    @screens ||= {
      :fatal => [],
      :error => [],
      :warn => []
    }
  end

  # A hash of the screens executed after post-process
  def after_post_process_screens
    @after_post_process_screens ||= {
      :fatal => [],
      :error => [],
      :warn => []
    }
  end

  # Get the error threshold. Defaults to 100.
  def error_threshold
    @error_threshold ||= 100
  end

  # Validate the control file. Currently a no-op: the source and
  # destination presence checks below are disabled.
  def validate
    #unless sources.length > 0
    #  raise ControlError, "Configuration must include one of the following for the source: #{source_types.join(',')}"
    #end
    #unless destinations.length > 0
    #  raise ControlError, "Configuration must include one of the following for the destination: #{destination_types.join(',')}"
    #end
  end

  # The configuration keys recognised as source types
  def source_types
    [:file, :database]
  end

  # The configuration keys recognised as destination types
  def destination_types
    [:file, :database]
  end
end
|
404
|
+
end
|
405
|
+
end
|