activewarehouse-etl 0.4.0 → 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +26 -14
- data/TODO +4 -2
- data/lib/etl.rb +10 -3
- data/lib/etl/active_record_ext/connection_adapters/mysql_adapter.rb +2 -2
- data/lib/etl/commands/etl.rb +6 -7
- data/lib/etl/control/control.rb +120 -52
- data/lib/etl/control/destination.rb +46 -5
- data/lib/etl/control/destination/database_destination.rb +45 -7
- data/lib/etl/control/destination/file_destination.rb +28 -4
- data/lib/etl/control/source.rb +16 -1
- data/lib/etl/control/source/database_source.rb +35 -5
- data/lib/etl/control/source/file_source.rb +33 -3
- data/lib/etl/engine.rb +129 -9
- data/lib/etl/generator/generator.rb +11 -2
- data/lib/etl/generator/surrogate_key_generator.rb +3 -2
- data/lib/etl/parser/delimited_parser.rb +3 -4
- data/lib/etl/parser/fixed_width_parser.rb +3 -4
- data/lib/etl/parser/parser.rb +7 -1
- data/lib/etl/parser/sax_parser.rb +190 -0
- data/lib/etl/parser/xml_parser.rb +2 -2
- data/lib/etl/processor/bulk_import_processor.rb +4 -4
- data/lib/etl/processor/processor.rb +1 -1
- data/lib/etl/processor/truncate_processor.rb +4 -4
- data/lib/etl/transform/date_to_string_transform.rb +19 -0
- data/lib/etl/transform/decode_transform.rb +15 -1
- data/lib/etl/transform/foreign_key_lookup_transform.rb +53 -0
- data/lib/etl/transform/string_to_date_transform.rb +14 -0
- data/lib/etl/transform/transform.rb +28 -9
- data/lib/etl/transform/type_transform.rb +22 -0
- data/lib/etl/version.rb +2 -2
- metadata +8 -3
data/CHANGELOG
CHANGED
@@ -1,18 +1,30 @@
|
|
1
1
|
0.1.0 - Dec 6, 2006
|
2
|
-
|
3
|
-
|
2
|
+
* Initial release
|
3
|
+
|
4
4
|
0.2.0 - Dec 7, 2006
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
5
|
+
* Added an XML parser for source parsing
|
6
|
+
* Added support for compound key constraints in destinations via the :unique => [] option
|
7
|
+
* Added ability to declare explicit columns in bulk import
|
8
|
+
* Added support for generators in destinations
|
9
|
+
* Added a SurrogateKeyGenerator for cases where the database doesn't support auto generated surrogate keys
|
10
|
+
|
11
11
|
0.3.0 - Dec 19, 2006
|
12
|
-
|
13
|
-
|
12
|
+
* Added support for calculated values in virtual fields with Proc
|
13
|
+
|
14
14
|
0.4.0 - Jan 11, 2007
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
15
|
+
* Added :skip_lines option to file source configurations, which can be used to skip the first n lines in the source data file
|
16
|
+
* Added better error handling in delimited parser - an error is now raised if the expected and actual field lengths do not match
|
17
|
+
* Added :truncate option for database destination. Set to true to truncate before importing data.
|
18
|
+
* Added support for :unique => [] option and virtual fields for the database destination
|
19
|
+
|
20
|
+
0.5.0 -
|
21
|
+
* Changed require_gem to gem and added alias to allow for older versions of rubygems
|
22
|
+
* Added support for Hash in the source configuration where :name => :parser_name defines the parser to use and
|
23
|
+
:options => {} defines options to pass to the parser
|
24
|
+
* Added support for passing a custom Parser class in the source configuration
|
25
|
+
* Removed the need to include Enumerable in each parser implementation
|
26
|
+
* Added new date_to_string and string_to_date transformers
|
27
|
+
* Implemented foreign_key_lookup transform including an ActiveRecordResolver
|
28
|
+
* Added real time activity logging which is called when the etl bin script is invoked
|
29
|
+
* Improved error handling
|
30
|
+
* Default logger level is now WARN
|
data/TODO
CHANGED
@@ -2,8 +2,10 @@ TODO
|
|
2
2
|
|
3
3
|
* Add built-in support for audit_dimension
|
4
4
|
* Do not rerun the processing if it isn't needed, i.e. the source and control files have not been modified (allow forced override)
|
5
|
-
* Add built-in FK lookup
|
6
5
|
* Provide greater control in error handling
|
7
6
|
** Allow an error threshold
|
8
7
|
** Don't die completely if a parse error occurs; just stop processing that specific file if the error threshold is reached
|
9
|
-
** Allow mismatch row length error in delimited parser to be ignored
|
8
|
+
** Allow mismatch row length error in delimited parser to be ignored
|
9
|
+
* Improve error messages throughout, but especially in problems with the control files
|
10
|
+
* Add support for determining if a row should be added, updated or removed vs. just blindly inserting
|
11
|
+
* Add support for partitioned views during the insert process. Use specifiable columns as the trigger columns for determining the data output destination.
|
data/lib/etl.rb
CHANGED
@@ -30,17 +30,22 @@ unless defined?(Logger)
|
|
30
30
|
end
|
31
31
|
|
32
32
|
require 'rubygems'
|
33
|
+
|
34
|
+
unless Kernel.respond_to?(:gem)
|
35
|
+
Kernel.send :alias_method, :gem, :require_gem
|
36
|
+
end
|
37
|
+
|
33
38
|
unless defined?(ActiveSupport)
|
34
|
-
|
39
|
+
gem 'activesupport'
|
35
40
|
require 'active_support'
|
36
41
|
end
|
37
42
|
|
38
43
|
unless defined?(ActiveRecord)
|
39
|
-
|
44
|
+
gem 'activerecord'
|
40
45
|
require 'active_record'
|
41
46
|
end
|
42
47
|
|
43
|
-
|
48
|
+
gem 'fastercsv'
|
44
49
|
require 'faster_csv'
|
45
50
|
|
46
51
|
$:.unshift(File.dirname(__FILE__))
|
@@ -64,4 +69,6 @@ module ETL #:nodoc:
|
|
64
69
|
end
|
65
70
|
class MismatchError < ETLError #:nodoc:
|
66
71
|
end
|
72
|
+
class ResolverError < ETLError #:nodoc:
|
73
|
+
end
|
67
74
|
end
|
@@ -1,7 +1,7 @@
|
|
1
1
|
require 'active_record/connection_adapters/abstract_adapter'
|
2
2
|
|
3
|
-
module ActiveRecord
|
4
|
-
module ConnectionAdapters
|
3
|
+
module ActiveRecord #:nodoc:
|
4
|
+
module ConnectionAdapters #:nodoc:
|
5
5
|
class MysqlAdapter < AbstractAdapter
|
6
6
|
# Execute a truncate statement on the table. Note that in MySQL a truncate will *NOT* reset
|
7
7
|
# the auto_increment
|
data/lib/etl/commands/etl.rb
CHANGED
@@ -34,12 +34,11 @@ if ARGV.length < 1
|
|
34
34
|
else
|
35
35
|
puts "Starting ETL process"
|
36
36
|
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
end
|
37
|
+
ARGV.each do |f|
|
38
|
+
puts "Processing #{f}"
|
39
|
+
ETL::Engine.realtime_activity = true
|
40
|
+
ETL::Engine.process(f)
|
42
41
|
end
|
43
|
-
|
44
|
-
puts "ETL process complete
|
42
|
+
|
43
|
+
puts "ETL process complete"
|
45
44
|
end
|
data/lib/etl/control/control.rb
CHANGED
@@ -1,40 +1,23 @@
|
|
1
1
|
module ETL #:nodoc:
|
2
2
|
module Control #:nodoc:
|
3
|
-
#
|
4
|
-
class
|
5
|
-
attr_reader :file
|
6
|
-
|
3
|
+
# The Context is passed to eval.
|
4
|
+
class Context
|
7
5
|
class << self
|
8
|
-
#
|
9
|
-
def
|
10
|
-
|
11
|
-
# logger.debug "Parsing control file #{control_file.path}"
|
12
|
-
control = ETL::Control::Control.new(control_file)
|
13
|
-
# TODO: better handling of parser errors. Return the line in the control file where the error occurs.
|
14
|
-
eval(IO.readlines(control_file).join("\n"), control.get_binding)
|
15
|
-
control.validate
|
16
|
-
control
|
17
|
-
end
|
18
|
-
|
19
|
-
def resolve(control)
|
20
|
-
case control
|
21
|
-
when String
|
22
|
-
ETL::Control::Control.parse(File.new(control))
|
23
|
-
when File
|
24
|
-
ETL::Control::Control.parse(control)
|
25
|
-
when ETL::Control::Control
|
26
|
-
control
|
27
|
-
else
|
28
|
-
raise ControlError, "Control must be a String, File or Control object"
|
29
|
-
end
|
6
|
+
# Create a Context instance
|
7
|
+
def create(control)
|
8
|
+
Context.new(control).get_binding
|
30
9
|
end
|
31
10
|
end
|
32
11
|
|
33
|
-
def initialize(
|
34
|
-
@
|
12
|
+
def initialize(control)
|
13
|
+
@control = control
|
14
|
+
end
|
15
|
+
|
16
|
+
def file
|
17
|
+
@control.file
|
35
18
|
end
|
36
19
|
|
37
|
-
# Define a source
|
20
|
+
# Define a source.
|
38
21
|
def source(name, configuration={}, definition={})
|
39
22
|
source_types = [:file, :db]
|
40
23
|
source_types.each do |source_type|
|
@@ -47,7 +30,7 @@ module ETL #:nodoc:
|
|
47
30
|
|
48
31
|
# Get the defined sources
|
49
32
|
def sources
|
50
|
-
@sources
|
33
|
+
@control.sources
|
51
34
|
end
|
52
35
|
|
53
36
|
# Define a destination
|
@@ -62,14 +45,19 @@ module ETL #:nodoc:
|
|
62
45
|
|
63
46
|
# Get the defined destinations
|
64
47
|
def destinations
|
65
|
-
@destinations
|
48
|
+
@control.destinations
|
66
49
|
end
|
67
50
|
|
68
51
|
def transform(name, transformer=nil, configuration={}, &block)
|
69
52
|
transforms[name] ||= []
|
70
53
|
if transformer
|
71
|
-
|
72
|
-
|
54
|
+
class_name = "#{transformer.to_s.classify}Transform"
|
55
|
+
begin
|
56
|
+
transform_class = ETL::Transform.const_get(class_name)
|
57
|
+
transforms[name] << transform_class.new(self, configuration)
|
58
|
+
rescue NameError => e
|
59
|
+
raise ControlError, "Unable to find transformer #{class_name}: #{e}"
|
60
|
+
end
|
73
61
|
elsif block_given?
|
74
62
|
transforms[name] << block
|
75
63
|
else
|
@@ -77,32 +65,123 @@ module ETL #:nodoc:
|
|
77
65
|
end
|
78
66
|
end
|
79
67
|
|
80
|
-
def
|
81
|
-
transforms
|
68
|
+
def transforms
|
69
|
+
@control.transforms
|
82
70
|
end
|
83
71
|
|
84
72
|
def pre_process(name, configuration={})
|
85
|
-
|
86
|
-
|
73
|
+
class_name = "#{name.to_s.classify}Processor"
|
74
|
+
begin
|
75
|
+
processor_class = ETL::Processor.const_get(class_name)
|
76
|
+
pre_processors << processor_class.new(self, configuration)
|
77
|
+
rescue NameError
|
78
|
+
raise ControlError, "Unable to find preprocessor #{class_name}"
|
79
|
+
end
|
87
80
|
end
|
88
81
|
|
89
82
|
def pre_processors
|
90
|
-
@pre_processors
|
83
|
+
@control.pre_processors
|
91
84
|
end
|
92
85
|
|
93
86
|
def post_process(name, configuration={})
|
94
|
-
|
95
|
-
|
87
|
+
class_name = "#{name.to_s.classify}Processor"
|
88
|
+
begin
|
89
|
+
processor_class = ETL::Processor.const_get(class_name)
|
90
|
+
post_processors << processor_class.new(self, configuration)
|
91
|
+
rescue NameError
|
92
|
+
raise ControlError, "Unable to find postprocessor #{class_name}"
|
93
|
+
end
|
96
94
|
end
|
97
95
|
|
98
96
|
def post_processors
|
99
|
-
@post_processors
|
97
|
+
@control.post_processors
|
100
98
|
end
|
101
99
|
|
102
100
|
def get_binding
|
103
101
|
binding
|
104
102
|
end
|
105
103
|
|
104
|
+
protected
|
105
|
+
# Get an array of supported source types
|
106
|
+
def source_types
|
107
|
+
[:file, :database]
|
108
|
+
end
|
109
|
+
|
110
|
+
# Get an array of supported destination types
|
111
|
+
def destination_types
|
112
|
+
[:file, :database]
|
113
|
+
end
|
114
|
+
|
115
|
+
end
|
116
|
+
|
117
|
+
# Object representation of a control file
|
118
|
+
class Control
|
119
|
+
# The File object
|
120
|
+
attr_reader :file
|
121
|
+
|
122
|
+
class << self
|
123
|
+
# Parse a control file and return a Control instance
|
124
|
+
def parse(control_file)
|
125
|
+
control_file = control_file.path if control_file.instance_of?(File)
|
126
|
+
# logger.debug "Parsing control file #{control_file.path}"
|
127
|
+
control = ETL::Control::Control.new(control_file)
|
128
|
+
# TODO: better handling of parser errors. Return the line in the control file where the error occurs.
|
129
|
+
eval(IO.readlines(control_file).join("\n"), Context.create(control), control_file)
|
130
|
+
control.validate
|
131
|
+
control
|
132
|
+
end
|
133
|
+
|
134
|
+
# Resolve the given object to an ETL::Control::Control instance. Acceptable arguments
|
135
|
+
# are:
|
136
|
+
# * The path to a control file as a String
|
137
|
+
# * A File object referencing the control file
|
138
|
+
# * The ETL::Control::Control object (which will just be returned)
|
139
|
+
#
|
140
|
+
# Raises a ControlError if any other type is given
|
141
|
+
def resolve(control)
|
142
|
+
case control
|
143
|
+
when String
|
144
|
+
ETL::Control::Control.parse(File.new(control))
|
145
|
+
when File
|
146
|
+
ETL::Control::Control.parse(control)
|
147
|
+
when ETL::Control::Control
|
148
|
+
control
|
149
|
+
else
|
150
|
+
raise ControlError, "Control must be a String, File or Control object"
|
151
|
+
end
|
152
|
+
end
|
153
|
+
end
|
154
|
+
|
155
|
+
# Initialize the instance with the given File object
|
156
|
+
def initialize(file)
|
157
|
+
@file = file
|
158
|
+
end
|
159
|
+
|
160
|
+
# Get the defined sources
|
161
|
+
def sources
|
162
|
+
@sources ||= []
|
163
|
+
end
|
164
|
+
|
165
|
+
# Get the defined destinations
|
166
|
+
def destinations
|
167
|
+
@destinations ||= []
|
168
|
+
end
|
169
|
+
|
170
|
+
# Get the transforms with the specified name
|
171
|
+
def transform(name)
|
172
|
+
transforms[name] ||= []
|
173
|
+
end
|
174
|
+
|
175
|
+
# Get an Array of preprocessors
|
176
|
+
def pre_processors
|
177
|
+
@pre_processors ||= []
|
178
|
+
end
|
179
|
+
|
180
|
+
# Get an Array of post processors
|
181
|
+
def post_processors
|
182
|
+
@post_processors ||= []
|
183
|
+
end
|
184
|
+
|
106
185
|
# Get a map of all transforms for this control
|
107
186
|
def transforms
|
108
187
|
@transforms ||= {}
|
@@ -118,17 +197,6 @@ module ETL #:nodoc:
|
|
118
197
|
end
|
119
198
|
end
|
120
199
|
|
121
|
-
protected
|
122
|
-
# Get an array of supported source types
|
123
|
-
def source_types
|
124
|
-
[:file, :database]
|
125
|
-
end
|
126
|
-
|
127
|
-
# Get an array of supported destination types
|
128
|
-
def destination_types
|
129
|
-
[:file, :database]
|
130
|
-
end
|
131
|
-
|
132
200
|
end
|
133
201
|
end
|
134
202
|
end
|
@@ -1,15 +1,41 @@
|
|
1
1
|
module ETL #:nodoc:
|
2
2
|
module Control #:nodoc:
|
3
|
+
# Base class for destinations.
|
3
4
|
class Destination
|
4
|
-
|
5
|
-
|
5
|
+
# Read-only accessor for the ETL::Control::Control instance
|
6
|
+
attr_reader :control
|
7
|
+
|
8
|
+
# Read-only accessor for the configuration Hash
|
9
|
+
attr_reader :configuration
|
10
|
+
|
11
|
+
# Read-only accessor for the destination mapping Hash
|
12
|
+
attr_reader :mapping
|
13
|
+
|
14
|
+
# Accessor to the buffer size
|
15
|
+
attr_accessor :buffer_size
|
16
|
+
|
17
|
+
# Unique flag.
|
18
|
+
attr_accessor :unique
|
6
19
|
|
7
20
|
class << self
|
21
|
+
# Get the destination class for the specified name.
|
22
|
+
#
|
23
|
+
# For example if name is :database or 'database' then the DatabaseDestination class
|
24
|
+
# is returned
|
8
25
|
def class_for_name(name)
|
9
26
|
ETL::Control.const_get("#{name.to_s.classify}Destination")
|
10
27
|
end
|
11
28
|
end
|
12
29
|
|
30
|
+
# Initialize the destination
|
31
|
+
#
|
32
|
+
# Arguments:
|
33
|
+
# * <tt>control</tt>: The ETL::Control::Control instance
|
34
|
+
# * <tt>configuration</tt>: The configuration Hash
|
35
|
+
# * <tt>mapping</tt>: The mapping Hash
|
36
|
+
#
|
37
|
+
# Options:
|
38
|
+
# * <tt>:buffer_size</tt>: The output buffer size (default 1000 records)
|
13
39
|
def initialize(control, configuration, mapping)
|
14
40
|
@control = control
|
15
41
|
@configuration = configuration
|
@@ -17,10 +43,12 @@ module ETL #:nodoc:
|
|
17
43
|
@buffer_size = configuration[:buffer_size] ||= 1000
|
18
44
|
end
|
19
45
|
|
46
|
+
# Get the current row number
|
20
47
|
def current_row
|
21
48
|
@current_row ||= 1
|
22
49
|
end
|
23
50
|
|
51
|
+
# Write the given row
|
24
52
|
def write(row)
|
25
53
|
buffer << row
|
26
54
|
flush if buffer.length >= buffer_size
|
@@ -36,11 +64,17 @@ module ETL #:nodoc:
|
|
36
64
|
raise NotImplementedError, "close method must be implemented by subclasses"
|
37
65
|
end
|
38
66
|
|
67
|
+
def errors
|
68
|
+
@errors ||= []
|
69
|
+
end
|
70
|
+
|
39
71
|
protected
|
72
|
+
# Access the buffer
|
40
73
|
def buffer
|
41
74
|
@buffer ||= []
|
42
75
|
end
|
43
76
|
|
77
|
+
# Access the generators map
|
44
78
|
def generators
|
45
79
|
@generators ||= {}
|
46
80
|
end
|
@@ -76,15 +110,22 @@ module ETL #:nodoc:
|
|
76
110
|
@compound_key_constraints ||= {}
|
77
111
|
end
|
78
112
|
|
79
|
-
# Add any virtual fields to the row
|
113
|
+
# Add any virtual fields to the row. Virtual fields will get their value from one of the following:
|
114
|
+
# * If the mapping is a Class, then an object which implements the next method
|
115
|
+
# * If the mapping is a Symbol, then the XGenerator where X is the classified symbol
|
116
|
+
# * If the mapping is a Proc, then it will be called with the row
|
117
|
+
# * Otherwise the value itself will be assigned to the field
|
80
118
|
def add_virtuals!(row)
|
81
119
|
if mapping[:virtual]
|
82
120
|
mapping[:virtual].each do |key,value|
|
83
121
|
# Engine.logger.debug "Mapping virtual #{key}/#{value} for row #{row}"
|
84
122
|
case value
|
123
|
+
when Class
|
124
|
+
generator = generators[key] ||= value.new
|
125
|
+
row[key] = generator.next
|
85
126
|
when Symbol
|
86
|
-
generators[key] ||= ETL::Generator::Generator.class_for_name(value).new
|
87
|
-
row[key] =
|
127
|
+
generator = generators[key] ||= ETL::Generator::Generator.class_for_name(value).new
|
128
|
+
row[key] = generator.next
|
88
129
|
when Proc
|
89
130
|
row[key] = value.call(row)
|
90
131
|
else
|