activewarehouse-etl 0.4.0 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +26 -14
- data/TODO +4 -2
- data/lib/etl.rb +10 -3
- data/lib/etl/active_record_ext/connection_adapters/mysql_adapter.rb +2 -2
- data/lib/etl/commands/etl.rb +6 -7
- data/lib/etl/control/control.rb +120 -52
- data/lib/etl/control/destination.rb +46 -5
- data/lib/etl/control/destination/database_destination.rb +45 -7
- data/lib/etl/control/destination/file_destination.rb +28 -4
- data/lib/etl/control/source.rb +16 -1
- data/lib/etl/control/source/database_source.rb +35 -5
- data/lib/etl/control/source/file_source.rb +33 -3
- data/lib/etl/engine.rb +129 -9
- data/lib/etl/generator/generator.rb +11 -2
- data/lib/etl/generator/surrogate_key_generator.rb +3 -2
- data/lib/etl/parser/delimited_parser.rb +3 -4
- data/lib/etl/parser/fixed_width_parser.rb +3 -4
- data/lib/etl/parser/parser.rb +7 -1
- data/lib/etl/parser/sax_parser.rb +190 -0
- data/lib/etl/parser/xml_parser.rb +2 -2
- data/lib/etl/processor/bulk_import_processor.rb +4 -4
- data/lib/etl/processor/processor.rb +1 -1
- data/lib/etl/processor/truncate_processor.rb +4 -4
- data/lib/etl/transform/date_to_string_transform.rb +19 -0
- data/lib/etl/transform/decode_transform.rb +15 -1
- data/lib/etl/transform/foreign_key_lookup_transform.rb +53 -0
- data/lib/etl/transform/string_to_date_transform.rb +14 -0
- data/lib/etl/transform/transform.rb +28 -9
- data/lib/etl/transform/type_transform.rb +22 -0
- data/lib/etl/version.rb +2 -2
- metadata +8 -3
data/CHANGELOG
CHANGED
@@ -1,18 +1,30 @@
|
|
1
1
|
0.1.0 - Dec 6, 2006
|
2
|
-
|
3
|
-
|
2
|
+
* Initial release
|
3
|
+
|
4
4
|
0.2.0 - Dec 7, 2006
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
5
|
+
* Added an XML parser for source parsing
|
6
|
+
* Added support for compound key constraints in destinations via the :unique => [] option
|
7
|
+
* Added ability to declare explicit columns in bulk import
|
8
|
+
* Added support for generators in destinations
|
9
|
+
* Added a SurrogateKeyGenerator for cases where the database doesn't support auto generated surrogate keys
|
10
|
+
|
11
11
|
0.3.0 - Dec 19, 2006
|
12
|
-
|
13
|
-
|
12
|
+
* Added support for calculated values in virtual fields with Proc
|
13
|
+
|
14
14
|
0.4.0 - Jan 11, 2007
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
15
|
+
* Added :skip_lines option to file source configurations, which can be used to skip the first n lines in the source data file
|
16
|
+
* Added better error handling in delimited parser - an error is now raised if the expected and actual field lengths do not match
|
17
|
+
* Added :truncate option for database destination. Set to true to truncate before importing data.
|
18
|
+
* Added support for :unique => [] option and virtual fields for the database destination
|
19
|
+
|
20
|
+
0.5.0 -
|
21
|
+
* Changed require_gem to gem and added alias to allow for older versions of rubygems
|
22
|
+
* Added support for Hash in the source configuration where :name => :parser_name defines the parser to use and
|
23
|
+
:options => {} defines options to pass to the parser
|
24
|
+
* Added support for passing a custom Parser class in the source configuration
|
25
|
+
* Removed the need to include Enumerable in each parser implementation
|
26
|
+
* Added new date_to_string and string_to_date transformers
|
27
|
+
* Implemented foreign_key_lookup transform including an ActiveRecordResolver
|
28
|
+
* Added real time activity logging which is called when the etl bin script is invoked
|
29
|
+
* Improved error handling
|
30
|
+
* Default logger level is now WARN
|
data/TODO
CHANGED
@@ -2,8 +2,10 @@ TODO
|
|
2
2
|
|
3
3
|
* Add built-in support for audit_dimension
|
4
4
|
* Do not rerun the processing if it isn't needed, i.e. the source and control files have not been modified (allow forced override)
|
5
|
-
* Add built-in FK lookup
|
6
5
|
* Provide greater control in error handling
|
7
6
|
** Allow an error threshold
|
8
7
|
** Don't die completely on a parse error; just stop processing that specific file if the error threshold is reached
|
9
|
-
** Allow mismatch row length error in delimited parser to be ignored
|
8
|
+
** Allow mismatch row length error in delimited parser to be ignored
|
9
|
+
* Improve error messages throughout, but especially in problems with the control files
|
10
|
+
* Add support for determining if a row should be added, updated or removed vs. just blindly inserting
|
11
|
+
* Add support for partitioned views during the insert process. Use specifiable columns as the trigger columns for determining the data output destination.
|
data/lib/etl.rb
CHANGED
@@ -30,17 +30,22 @@ unless defined?(Logger)
|
|
30
30
|
end
|
31
31
|
|
32
32
|
require 'rubygems'
|
33
|
+
|
34
|
+
unless Kernel.respond_to?(:gem)
|
35
|
+
Kernel.send :alias_method, :gem, :require_gem
|
36
|
+
end
|
37
|
+
|
33
38
|
unless defined?(ActiveSupport)
|
34
|
-
|
39
|
+
gem 'activesupport'
|
35
40
|
require 'active_support'
|
36
41
|
end
|
37
42
|
|
38
43
|
unless defined?(ActiveRecord)
|
39
|
-
|
44
|
+
gem 'activerecord'
|
40
45
|
require 'active_record'
|
41
46
|
end
|
42
47
|
|
43
|
-
|
48
|
+
gem 'fastercsv'
|
44
49
|
require 'faster_csv'
|
45
50
|
|
46
51
|
$:.unshift(File.dirname(__FILE__))
|
@@ -64,4 +69,6 @@ module ETL #:nodoc:
|
|
64
69
|
end
|
65
70
|
class MismatchError < ETLError #:nodoc:
|
66
71
|
end
|
72
|
+
class ResolverError < ETLError #:nodoc:
|
73
|
+
end
|
67
74
|
end
|
@@ -1,7 +1,7 @@
|
|
1
1
|
require 'active_record/connection_adapters/abstract_adapter'
|
2
2
|
|
3
|
-
module ActiveRecord
|
4
|
-
module ConnectionAdapters
|
3
|
+
module ActiveRecord #:nodoc:
|
4
|
+
module ConnectionAdapters #:nodoc:
|
5
5
|
class MysqlAdapter < AbstractAdapter
|
6
6
|
# Execute a truncate statement on the table. Note that in MySQL a truncate will *NOT* reset
|
7
7
|
# the auto_increment
|
data/lib/etl/commands/etl.rb
CHANGED
@@ -34,12 +34,11 @@ if ARGV.length < 1
|
|
34
34
|
else
|
35
35
|
puts "Starting ETL process"
|
36
36
|
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
end
|
37
|
+
ARGV.each do |f|
|
38
|
+
puts "Processing #{f}"
|
39
|
+
ETL::Engine.realtime_activity = true
|
40
|
+
ETL::Engine.process(f)
|
42
41
|
end
|
43
|
-
|
44
|
-
puts "ETL process complete
|
42
|
+
|
43
|
+
puts "ETL process complete"
|
45
44
|
end
|
data/lib/etl/control/control.rb
CHANGED
@@ -1,40 +1,23 @@
|
|
1
1
|
module ETL #:nodoc:
|
2
2
|
module Control #:nodoc:
|
3
|
-
#
|
4
|
-
class
|
5
|
-
attr_reader :file
|
6
|
-
|
3
|
+
# The Context is passed to eval.
|
4
|
+
class Context
|
7
5
|
class << self
|
8
|
-
#
|
9
|
-
def
|
10
|
-
|
11
|
-
# logger.debug "Parsing control file #{control_file.path}"
|
12
|
-
control = ETL::Control::Control.new(control_file)
|
13
|
-
# TODO: better handling of parser errors. Return the line in the control file where the error occurs.
|
14
|
-
eval(IO.readlines(control_file).join("\n"), control.get_binding)
|
15
|
-
control.validate
|
16
|
-
control
|
17
|
-
end
|
18
|
-
|
19
|
-
def resolve(control)
|
20
|
-
case control
|
21
|
-
when String
|
22
|
-
ETL::Control::Control.parse(File.new(control))
|
23
|
-
when File
|
24
|
-
ETL::Control::Control.parse(control)
|
25
|
-
when ETL::Control::Control
|
26
|
-
control
|
27
|
-
else
|
28
|
-
raise ControlError, "Control must be a String, File or Control object"
|
29
|
-
end
|
6
|
+
# Create a Context instance
|
7
|
+
def create(control)
|
8
|
+
Context.new(control).get_binding
|
30
9
|
end
|
31
10
|
end
|
32
11
|
|
33
|
-
def initialize(
|
34
|
-
@
|
12
|
+
def initialize(control)
|
13
|
+
@control = control
|
14
|
+
end
|
15
|
+
|
16
|
+
def file
|
17
|
+
@control.file
|
35
18
|
end
|
36
19
|
|
37
|
-
# Define a source
|
20
|
+
# Define a source.
|
38
21
|
def source(name, configuration={}, definition={})
|
39
22
|
source_types = [:file, :db]
|
40
23
|
source_types.each do |source_type|
|
@@ -47,7 +30,7 @@ module ETL #:nodoc:
|
|
47
30
|
|
48
31
|
# Get the defined sources
|
49
32
|
def sources
|
50
|
-
@sources
|
33
|
+
@control.sources
|
51
34
|
end
|
52
35
|
|
53
36
|
# Define a destination
|
@@ -62,14 +45,19 @@ module ETL #:nodoc:
|
|
62
45
|
|
63
46
|
# Get the defined destinations
|
64
47
|
def destinations
|
65
|
-
@destinations
|
48
|
+
@control.destinations
|
66
49
|
end
|
67
50
|
|
68
51
|
def transform(name, transformer=nil, configuration={}, &block)
|
69
52
|
transforms[name] ||= []
|
70
53
|
if transformer
|
71
|
-
|
72
|
-
|
54
|
+
class_name = "#{transformer.to_s.classify}Transform"
|
55
|
+
begin
|
56
|
+
transform_class = ETL::Transform.const_get(class_name)
|
57
|
+
transforms[name] << transform_class.new(self, configuration)
|
58
|
+
rescue NameError => e
|
59
|
+
raise ControlError, "Unable to find transformer #{class_name}: #{e}"
|
60
|
+
end
|
73
61
|
elsif block_given?
|
74
62
|
transforms[name] << block
|
75
63
|
else
|
@@ -77,32 +65,123 @@ module ETL #:nodoc:
|
|
77
65
|
end
|
78
66
|
end
|
79
67
|
|
80
|
-
def
|
81
|
-
transforms
|
68
|
+
def transforms
|
69
|
+
@control.transforms
|
82
70
|
end
|
83
71
|
|
84
72
|
def pre_process(name, configuration={})
|
85
|
-
|
86
|
-
|
73
|
+
class_name = "#{name.to_s.classify}Processor"
|
74
|
+
begin
|
75
|
+
processor_class = ETL::Processor.const_get(class_name)
|
76
|
+
pre_processors << processor_class.new(self, configuration)
|
77
|
+
rescue NameError
|
78
|
+
raise ControlError, "Unable to find preprocessor #{class_name}"
|
79
|
+
end
|
87
80
|
end
|
88
81
|
|
89
82
|
def pre_processors
|
90
|
-
@pre_processors
|
83
|
+
@control.pre_processors
|
91
84
|
end
|
92
85
|
|
93
86
|
def post_process(name, configuration={})
|
94
|
-
|
95
|
-
|
87
|
+
class_name = "#{name.to_s.classify}Processor"
|
88
|
+
begin
|
89
|
+
processor_class = ETL::Processor.const_get(class_name)
|
90
|
+
post_processors << processor_class.new(self, configuration)
|
91
|
+
rescue NameError
|
92
|
+
raise ControlError, "Unable to find postprocessor #{class_name}"
|
93
|
+
end
|
96
94
|
end
|
97
95
|
|
98
96
|
def post_processors
|
99
|
-
@post_processors
|
97
|
+
@control.post_processors
|
100
98
|
end
|
101
99
|
|
102
100
|
def get_binding
|
103
101
|
binding
|
104
102
|
end
|
105
103
|
|
104
|
+
protected
|
105
|
+
# Get an array of supported source types
|
106
|
+
def source_types
|
107
|
+
[:file, :database]
|
108
|
+
end
|
109
|
+
|
110
|
+
# Get an array of supported destination types
|
111
|
+
def destination_types
|
112
|
+
[:file, :database]
|
113
|
+
end
|
114
|
+
|
115
|
+
end
|
116
|
+
|
117
|
+
# Object representation of a control file
|
118
|
+
class Control
|
119
|
+
# The File object
|
120
|
+
attr_reader :file
|
121
|
+
|
122
|
+
class << self
|
123
|
+
# Parse a control file and return a Control instance
|
124
|
+
def parse(control_file)
|
125
|
+
control_file = control_file.path if control_file.instance_of?(File)
|
126
|
+
# logger.debug "Parsing control file #{control_file.path}"
|
127
|
+
control = ETL::Control::Control.new(control_file)
|
128
|
+
# TODO: better handling of parser errors. Return the line in the control file where the error occurs.
|
129
|
+
eval(IO.readlines(control_file).join("\n"), Context.create(control), control_file)
|
130
|
+
control.validate
|
131
|
+
control
|
132
|
+
end
|
133
|
+
|
134
|
+
# Resolve the given object to an ETL::Control::Control instance. Acceptable arguments
|
135
|
+
# are:
|
136
|
+
# * The path to a control file as a String
|
137
|
+
# * A File object referencing the control file
|
138
|
+
# * The ETL::Control::Control object (which will just be returned)
|
139
|
+
#
|
140
|
+
# Raises a ControlError if any other type is given
|
141
|
+
def resolve(control)
|
142
|
+
case control
|
143
|
+
when String
|
144
|
+
ETL::Control::Control.parse(File.new(control))
|
145
|
+
when File
|
146
|
+
ETL::Control::Control.parse(control)
|
147
|
+
when ETL::Control::Control
|
148
|
+
control
|
149
|
+
else
|
150
|
+
raise ControlError, "Control must be a String, File or Control object"
|
151
|
+
end
|
152
|
+
end
|
153
|
+
end
|
154
|
+
|
155
|
+
# Initialize the instance with the given control file (Control.parse passes its path as a String)
|
156
|
+
def initialize(file)
|
157
|
+
@file = file
|
158
|
+
end
|
159
|
+
|
160
|
+
# Get the defined sources
|
161
|
+
def sources
|
162
|
+
@sources ||= []
|
163
|
+
end
|
164
|
+
|
165
|
+
# Get the defined destinations
|
166
|
+
def destinations
|
167
|
+
@destinations ||= []
|
168
|
+
end
|
169
|
+
|
170
|
+
# Get the transforms with the specified name
|
171
|
+
def transform(name)
|
172
|
+
transforms[name] ||= []
|
173
|
+
end
|
174
|
+
|
175
|
+
# Get an Array of preprocessors
|
176
|
+
def pre_processors
|
177
|
+
@pre_processors ||= []
|
178
|
+
end
|
179
|
+
|
180
|
+
# Get an Array of post processors
|
181
|
+
def post_processors
|
182
|
+
@post_processors ||= []
|
183
|
+
end
|
184
|
+
|
106
185
|
# Get a map of all transforms for this control
|
107
186
|
def transforms
|
108
187
|
@transforms ||= {}
|
@@ -118,17 +197,6 @@ module ETL #:nodoc:
|
|
118
197
|
end
|
119
198
|
end
|
120
199
|
|
121
|
-
protected
|
122
|
-
# Get an array of supported source types
|
123
|
-
def source_types
|
124
|
-
[:file, :database]
|
125
|
-
end
|
126
|
-
|
127
|
-
# Get an array of supported destination types
|
128
|
-
def destination_types
|
129
|
-
[:file, :database]
|
130
|
-
end
|
131
|
-
|
132
200
|
end
|
133
201
|
end
|
134
202
|
end
|
@@ -1,15 +1,41 @@
|
|
1
1
|
module ETL #:nodoc:
|
2
2
|
module Control #:nodoc:
|
3
|
+
# Base class for destinations.
|
3
4
|
class Destination
|
4
|
-
|
5
|
-
|
5
|
+
# Read-only accessor for the ETL::Control::Control instance
|
6
|
+
attr_reader :control
|
7
|
+
|
8
|
+
# Read-only accessor for the configuration Hash
|
9
|
+
attr_reader :configuration
|
10
|
+
|
11
|
+
# Read-only accessor for the destination mapping Hash
|
12
|
+
attr_reader :mapping
|
13
|
+
|
14
|
+
# Accessor to the buffer size
|
15
|
+
attr_accessor :buffer_size
|
16
|
+
|
17
|
+
# Unique flag.
|
18
|
+
attr_accessor :unique
|
6
19
|
|
7
20
|
class << self
|
21
|
+
# Get the destination class for the specified name.
|
22
|
+
#
|
23
|
+
# For example if name is :database or 'database' then the DatabaseDestination class
|
24
|
+
# is returned
|
8
25
|
def class_for_name(name)
|
9
26
|
ETL::Control.const_get("#{name.to_s.classify}Destination")
|
10
27
|
end
|
11
28
|
end
|
12
29
|
|
30
|
+
# Initialize the destination
|
31
|
+
#
|
32
|
+
# Arguments:
|
33
|
+
# * <tt>control</tt>: The ETL::Control::Control instance
|
34
|
+
# * <tt>configuration</tt>: The configuration Hash
|
35
|
+
# * <tt>mapping</tt>: The mapping Hash
|
36
|
+
#
|
37
|
+
# Options:
|
38
|
+
# * <tt>:buffer_size</tt>: The output buffer size (default 1000 records)
|
13
39
|
def initialize(control, configuration, mapping)
|
14
40
|
@control = control
|
15
41
|
@configuration = configuration
|
@@ -17,10 +43,12 @@ module ETL #:nodoc:
|
|
17
43
|
@buffer_size = configuration[:buffer_size] ||= 1000
|
18
44
|
end
|
19
45
|
|
46
|
+
# Get the current row number
|
20
47
|
def current_row
|
21
48
|
@current_row ||= 1
|
22
49
|
end
|
23
50
|
|
51
|
+
# Write the given row
|
24
52
|
def write(row)
|
25
53
|
buffer << row
|
26
54
|
flush if buffer.length >= buffer_size
|
@@ -36,11 +64,17 @@ module ETL #:nodoc:
|
|
36
64
|
raise NotImplementedError, "close method must be implemented by subclasses"
|
37
65
|
end
|
38
66
|
|
67
|
+
def errors
|
68
|
+
@errors ||= []
|
69
|
+
end
|
70
|
+
|
39
71
|
protected
|
72
|
+
# Access the buffer
|
40
73
|
def buffer
|
41
74
|
@buffer ||= []
|
42
75
|
end
|
43
76
|
|
77
|
+
# Access the generators map
|
44
78
|
def generators
|
45
79
|
@generators ||= {}
|
46
80
|
end
|
@@ -76,15 +110,22 @@ module ETL #:nodoc:
|
|
76
110
|
@compound_key_constraints ||= {}
|
77
111
|
end
|
78
112
|
|
79
|
-
# Add any virtual fields to the row
|
113
|
+
# Add any virtual fields to the row. Virtual fields will get their value from one of the following:
|
114
|
+
# * If the mapping is a Class, then the next method of an instance of that class (created once per field and reused)
|
115
|
+
# * If the mapping is a Symbol, then the XGenerator where X is the classified symbol
|
116
|
+
# * If the mapping is a Proc, then it will be called with the row
|
117
|
+
# * Otherwise the value itself will be assigned to the field
|
80
118
|
def add_virtuals!(row)
|
81
119
|
if mapping[:virtual]
|
82
120
|
mapping[:virtual].each do |key,value|
|
83
121
|
# Engine.logger.debug "Mapping virtual #{key}/#{value} for row #{row}"
|
84
122
|
case value
|
123
|
+
when Class
|
124
|
+
generator = generators[key] ||= value.new
|
125
|
+
row[key] = generator.next
|
85
126
|
when Symbol
|
86
|
-
generators[key] ||= ETL::Generator::Generator.class_for_name(value).new
|
87
|
-
row[key] =
|
127
|
+
generator = generators[key] ||= ETL::Generator::Generator.class_for_name(value).new
|
128
|
+
row[key] = generator.next
|
88
129
|
when Proc
|
89
130
|
row[key] = value.call(row)
|
90
131
|
else
|