activewarehouse-etl 0.5.2 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +41 -13
- data/README +1 -1
- data/Rakefile +14 -4
- data/TODO +17 -1
- data/bin/etl +3 -1
- data/lib/etl.rb +11 -7
- data/lib/etl/commands/etl.rb +0 -1
- data/lib/etl/control/control.rb +113 -36
- data/lib/etl/control/destination.rb +13 -1
- data/lib/etl/control/destination/database_destination.rb +3 -1
- data/lib/etl/control/destination/file_destination.rb +5 -2
- data/lib/etl/control/source.rb +36 -0
- data/lib/etl/control/source/database_source.rb +63 -8
- data/lib/etl/control/source/file_source.rb +25 -4
- data/lib/etl/engine.rb +128 -14
- data/lib/etl/generator/surrogate_key_generator.rb +1 -0
- data/lib/etl/http_tools.rb +119 -0
- data/lib/etl/parser/apache_combined_log_parser.rb +47 -0
- data/lib/etl/parser/sax_parser.rb +18 -6
- data/lib/etl/processor.rb +1 -0
- data/lib/etl/processor/bulk_import_processor.rb +12 -0
- data/lib/etl/processor/hierarchy_exploder_processor.rb +54 -0
- data/lib/etl/processor/processor.rb +1 -5
- data/lib/etl/processor/row_processor.rb +17 -0
- data/lib/etl/transform/date_to_string_transform.rb +1 -1
- data/lib/etl/transform/decode_transform.rb +1 -1
- data/lib/etl/transform/default_transform.rb +15 -0
- data/lib/etl/transform/foreign_key_lookup_transform.rb +1 -1
- data/lib/etl/transform/hierarchy_lookup_transform.rb +56 -0
- data/lib/etl/transform/sha1_transform.rb +1 -1
- data/lib/etl/transform/string_to_date_transform.rb +3 -3
- data/lib/etl/transform/string_to_datetime_transform.rb +17 -0
- data/lib/etl/transform/string_to_time_transform.rb +14 -0
- data/lib/etl/transform/transform.rb +8 -4
- data/lib/etl/transform/type_transform.rb +2 -2
- data/lib/etl/version.rb +2 -2
- metadata +21 -8
- data/lib/etl/active_record_ext.rb +0 -1
- data/lib/etl/active_record_ext/connection_adapters/mysql_adapter.rb +0 -34
data/CHANGELOG
CHANGED
@@ -18,21 +18,49 @@
|
|
18
18
|
* Added support for :unique => [] option and virtual fields for the database destination
|
19
19
|
|
20
20
|
0.5.0 - Feb 17, 2007
|
21
|
-
* Changed require_gem to gem and added alias to allow for older versions of rubygems
|
21
|
+
* Changed require_gem to gem and added alias to allow for older versions of rubygems.
|
22
22
|
* Added support for Hash in the source configuration where :name => :parser_name defines the parser to use and
|
23
|
-
:options => {} defines options to pass to the parser
|
24
|
-
* Added support for passing a custom Parser class in the source configuration
|
25
|
-
* Removed the need to include Enumerable in each parser implementation
|
26
|
-
* Added new date_to_string and string_to_date transformers
|
27
|
-
* Implemented foreign_key_lookup transform including an ActiveRecordResolver
|
28
|
-
* Added real time activity logging which is called when the etl bin script is invoked
|
29
|
-
* Improved error handling
|
30
|
-
* Default logger level is now WARN
|
23
|
+
:options => {} defines options to pass to the parser.
|
24
|
+
* Added support for passing a custom Parser class in the source configuration.
|
25
|
+
* Removed the need to include Enumerable in each parser implementation.
|
26
|
+
* Added new date_to_string and string_to_date transformers.
|
27
|
+
* Implemented foreign_key_lookup transform including an ActiveRecordResolver.
|
28
|
+
* Added real time activity logging which is called when the etl bin script is invoked.
|
29
|
+
* Improved error handling.
|
30
|
+
* Default logger level is now WARN.
|
31
31
|
|
32
32
|
0.5.1 - Feb 18, 2007
|
33
|
-
* Fixed up truncate processor
|
34
|
-
* Updated HOW_TO_RELEASE doc
|
33
|
+
* Fixed up truncate processor.
|
34
|
+
* Updated HOW_TO_RELEASE doc.
|
35
35
|
|
36
36
|
0.5.2 - Feb 19, 2007
|
37
|
-
* Added error threshold
|
38
|
-
* Fixed problem with transform error handling
|
37
|
+
* Added error threshold.
|
38
|
+
* Fixed problem with transform error handling.
|
39
|
+
|
40
|
+
0.6.0 - Mar 8, 2007
|
41
|
+
* Fixed missing method problem in validate in Control class.
|
42
|
+
* Removed control validation for now (source could be code in the control file).
|
43
|
+
* Transform interface now defined as taking 3 arguments, the field name, field value and the row. This
|
44
|
+
is not backwards compatible.
|
45
|
+
* Added HierarchyLookupTransform.
|
46
|
+
* Added DefaultTransform which will return a specified value if the initial value is blank.
|
47
|
+
* Added row-level processing.
|
48
|
+
* Added HierarchyExploderProcessor which takes a single hierarchy row and explodes it to multiple rows
|
49
|
+
as used in a hierarchy bridge.
|
50
|
+
* Added ApacheCombinedLogParser which parses Apache Combined Log format, including parsing of the
|
51
|
+
user agent string and the URI, returning a Hash.
|
52
|
+
* Fixed bug in SAX parser so that attributes are now set when the start_element event is received.
|
53
|
+
* Added an HttpTools module which provides some parsing methods (for user agent and URI).
|
54
|
+
* Database source now uses its own class for establishing an ActiveRecord connection.
|
55
|
+
* Log files are now timestamped.
|
56
|
+
* Source files are now archived automatically during the extraction process.
|
57
|
+
* Added a :condition option to the destination configuration Hash that accepts a Proc with a single
|
58
|
+
argument passed to it (the row).
|
59
|
+
* Added an :append_rows option to the destination configuration Hash that accepts either a Hash (to
|
60
|
+
append a single row) or an Array of Hashes (to append multiple rows).
|
61
|
+
* Only print the read and written row counts if there is at least one source and one destination
|
62
|
+
respectively.
|
63
|
+
* Added a depends_on directive that accepts a list of arguments of either strings or symbols. Each
|
64
|
+
symbol is converted to a string and .ctl is appended; strings are passed through directly. The
|
65
|
+
dependencies are executed in the order they are specified.
|
66
|
+
* The default field separator in the bulk loader is now a comma (was a tab).
|
data/README
CHANGED
data/Rakefile
CHANGED
@@ -56,10 +56,11 @@ spec = Gem::Specification.new do |s|
|
|
56
56
|
ActiveWarehouse ETL is a pure Ruby Extract-Transform-Load application for loading data into a database.
|
57
57
|
EOF
|
58
58
|
|
59
|
-
s.add_dependency('rake',
|
60
|
-
s.add_dependency('activesupport',
|
61
|
-
s.add_dependency('activerecord',
|
62
|
-
s.add_dependency('fastercsv',
|
59
|
+
s.add_dependency('rake', '>= 0.7.1')
|
60
|
+
s.add_dependency('activesupport', '>= 1.3.1')
|
61
|
+
s.add_dependency('activerecord', '>= 1.14.4')
|
62
|
+
s.add_dependency('fastercsv', '>= 1.0.0')
|
63
|
+
s.add_dependency('adapter_extensions', '>= 0.1.0')
|
63
64
|
|
64
65
|
s.rdoc_options << '--exclude' << '.'
|
65
66
|
s.has_rdoc = false
|
@@ -122,4 +123,13 @@ end
|
|
122
123
|
desc "Publish the API documentation"
|
123
124
|
task :pdoc => [:rdoc] do
|
124
125
|
Rake::SshDirPublisher.new("aeden@rubyforge.org", "/var/www/gforge-projects/activewarehouse/etl/rdoc", "rdoc").upload
|
126
|
+
end
|
127
|
+
|
128
|
+
desc "Reinstall the gem from a local package copy"
|
129
|
+
task :reinstall => [:package] do
|
130
|
+
windows = RUBY_PLATFORM =~ /mswin/
|
131
|
+
sudo = windows ? '' : 'sudo'
|
132
|
+
gem = windows ? 'gem.bat' : 'gem'
|
133
|
+
`#{sudo} #{gem} uninstall -x -i #{PKG_NAME}`
|
134
|
+
`#{sudo} #{gem} install pkg/#{PKG_NAME}-#{PKG_VERSION}`
|
125
135
|
end
|
data/TODO
CHANGED
@@ -8,4 +8,20 @@ TODO
|
|
8
8
|
** Allow mismatch row length error in delimited parser to be ignored
|
9
9
|
* Improve error messages throughout, but especially in problems with the control files
|
10
10
|
* Add support for determining if a row should be added, updated or removed vs. just blindly inserting
|
11
|
-
* Add support for partitioned views during the insert process. Use specifiable columns as the trigger columns for determining the data output destination.
|
11
|
+
* Add support for partitioned views during the insert process. Use specifiable columns as the trigger columns for determining the data output destination.
|
12
|
+
|
13
|
+
Audit Record
|
14
|
+
|
15
|
+
Process-Level
|
16
|
+
* Start Time
|
17
|
+
* End Time
|
18
|
+
* (Duration)
|
19
|
+
* Rows Read
|
20
|
+
* Rows Written
|
21
|
+
* Rows Rejected
|
22
|
+
* Errors
|
23
|
+
* Destination
|
24
|
+
Record-Level
|
25
|
+
* Source
|
26
|
+
* Timestamp
|
27
|
+
* Transformation Log
|
data/bin/etl
CHANGED
data/lib/etl.rb
CHANGED
@@ -25,9 +25,7 @@
|
|
25
25
|
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
26
26
|
#++
|
27
27
|
|
28
|
-
|
29
|
-
require 'logger'
|
30
|
-
end
|
28
|
+
require 'logger'
|
31
29
|
|
32
30
|
require 'rubygems'
|
33
31
|
|
@@ -45,11 +43,19 @@ unless defined?(ActiveRecord)
|
|
45
43
|
require 'active_record'
|
46
44
|
end
|
47
45
|
|
48
|
-
|
49
|
-
|
46
|
+
unless defined?(AdapterExtensions)
|
47
|
+
gem 'adapter_extensions'
|
48
|
+
require 'adapter_extensions'
|
49
|
+
end
|
50
|
+
|
51
|
+
unless defined?(FasterCSV)
|
52
|
+
gem 'fastercsv'
|
53
|
+
require 'faster_csv'
|
54
|
+
end
|
50
55
|
|
51
56
|
$:.unshift(File.dirname(__FILE__))
|
52
57
|
|
58
|
+
require 'etl/http_tools'
|
53
59
|
require 'etl/version'
|
54
60
|
require 'etl/engine'
|
55
61
|
require 'etl/control'
|
@@ -58,8 +64,6 @@ require 'etl/transform'
|
|
58
64
|
require 'etl/processor'
|
59
65
|
require 'etl/generator'
|
60
66
|
|
61
|
-
require 'etl/active_record_ext'
|
62
|
-
|
63
67
|
module ETL #:nodoc:
|
64
68
|
class ETLError < StandardError #:nodoc:
|
65
69
|
end
|
data/lib/etl/commands/etl.rb
CHANGED
data/lib/etl/control/control.rb
CHANGED
@@ -2,6 +2,8 @@ module ETL #:nodoc:
|
|
2
2
|
module Control #:nodoc:
|
3
3
|
# The Context is passed to eval.
|
4
4
|
class Context
|
5
|
+
attr_reader :control
|
6
|
+
|
5
7
|
class << self
|
6
8
|
# Create a Context instance
|
7
9
|
def create(control)
|
@@ -16,16 +18,29 @@ module ETL #:nodoc:
|
|
16
18
|
|
17
19
|
# Get the control file
|
18
20
|
def file
|
19
|
-
|
21
|
+
control.file
|
20
22
|
end
|
21
23
|
|
24
|
+
# Set the allowed error threshold
|
22
25
|
def set_error_threshold(error_threshold)
|
23
|
-
|
26
|
+
control.error_threshold = error_threshold
|
27
|
+
end
|
28
|
+
|
29
|
+
# Define a list of control files that this file depends on. Those control files
|
30
|
+
# will be executed prior to this control file. The list may contain symbols that will
|
31
|
+
# be converted to file names by calling to_s + '.ctl', or they may be strings in which
|
32
|
+
# case they will be used as is
|
33
|
+
def depends_on(*args)
|
34
|
+
dependencies << args
|
35
|
+
end
|
36
|
+
|
37
|
+
# Get the defined dependencies
|
38
|
+
def dependencies
|
39
|
+
control.dependencies
|
24
40
|
end
|
25
41
|
|
26
42
|
# Define a source.
|
27
43
|
def source(name, configuration={}, definition={})
|
28
|
-
source_types = [:file, :db]
|
29
44
|
source_types.each do |source_type|
|
30
45
|
if configuration[source_type]
|
31
46
|
source_class = ETL::Control::Source.class_for_name(source_type)
|
@@ -36,7 +51,7 @@ module ETL #:nodoc:
|
|
36
51
|
|
37
52
|
# Get the defined source
|
38
53
|
def sources
|
39
|
-
|
54
|
+
control.sources
|
40
55
|
end
|
41
56
|
|
42
57
|
# Define a destination
|
@@ -51,19 +66,30 @@ module ETL #:nodoc:
|
|
51
66
|
|
52
67
|
# Get the defined destinations
|
53
68
|
def destinations
|
54
|
-
|
69
|
+
control.destinations
|
55
70
|
end
|
56
71
|
|
57
72
|
# Define a transform
|
58
73
|
def transform(name, transformer=nil, configuration={}, &block)
|
59
74
|
transforms[name] ||= []
|
60
75
|
if transformer
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
76
|
+
case transformer
|
77
|
+
when String, Symbol
|
78
|
+
class_name = "#{transformer.to_s.classify}Transform"
|
79
|
+
begin
|
80
|
+
transform_class = ETL::Transform.const_get(class_name)
|
81
|
+
transforms[name] << transform_class.new(self, configuration)
|
82
|
+
rescue NameError => e
|
83
|
+
raise ControlError, "Unable to find transformer #{class_name}: #{e}"
|
84
|
+
end
|
85
|
+
else
|
86
|
+
#transformer.class.inspect
|
87
|
+
if transformer.is_a?(ETL::Transform::Transform)
|
88
|
+
Engine.logger.debug "Adding transformer #{transformer.inspect} for field #{name}"
|
89
|
+
transforms[name] << transformer
|
90
|
+
else
|
91
|
+
raise ControlError, "Transformer must be a String, Symbol or Transform instance"
|
92
|
+
end
|
67
93
|
end
|
68
94
|
elsif block_given?
|
69
95
|
transforms[name] << block
|
@@ -74,39 +100,68 @@ module ETL #:nodoc:
|
|
74
100
|
|
75
101
|
# Get the defined transforms
|
76
102
|
def transforms
|
77
|
-
|
103
|
+
control.transforms
|
104
|
+
end
|
105
|
+
|
106
|
+
protected
|
107
|
+
# This method is used to define a processor and insert into the specified processor
|
108
|
+
# collection.
|
109
|
+
def define_processor(name, processor_collection, configuration)
|
110
|
+
case name
|
111
|
+
when String, Symbol
|
112
|
+
class_name = "#{name.to_s.classify}Processor"
|
113
|
+
begin
|
114
|
+
processor_class = ETL::Processor.const_get(class_name)
|
115
|
+
processor_collection << processor_class.new(self, configuration)
|
116
|
+
rescue NameError => e
|
117
|
+
raise ControlError, "Unable to find processor #{class_name}: #{e}"
|
118
|
+
end
|
119
|
+
when Class
|
120
|
+
processor_collection << name.new(self, configuration)
|
121
|
+
else
|
122
|
+
raise ControlError, "The process declaration requires a String, Symbol or Class"
|
123
|
+
end
|
124
|
+
end
|
125
|
+
|
126
|
+
public
|
127
|
+
# Define an "after read" processor. This must be a row-level processor.
|
128
|
+
def after_read(name, configuration={})
|
129
|
+
define_processor(name, after_read_processors, configuration)
|
130
|
+
end
|
131
|
+
|
132
|
+
# Get the defined "after read" processors
|
133
|
+
def after_read_processors
|
134
|
+
control.after_read_processors
|
135
|
+
end
|
136
|
+
|
137
|
+
# Define a "before write" processor. This must be a row-level processor.
|
138
|
+
def before_write(name, configuration={})
|
139
|
+
define_processor(name, before_write_processors, configuration)
|
140
|
+
end
|
141
|
+
|
142
|
+
# Get the defined "before write" processors
|
143
|
+
def before_write_processors
|
144
|
+
control.before_write_processors
|
78
145
|
end
|
79
146
|
|
80
147
|
# Define a pre-processor
|
81
148
|
def pre_process(name, configuration={})
|
82
|
-
|
83
|
-
begin
|
84
|
-
processor_class = ETL::Processor.const_get(class_name)
|
85
|
-
pre_processors << processor_class.new(self, configuration)
|
86
|
-
rescue NameError => e
|
87
|
-
raise ControlError, "Unable to find preprocessor #{class_name}: #{e}"
|
88
|
-
end
|
149
|
+
define_processor(name, pre_processors, configuration)
|
89
150
|
end
|
90
151
|
|
91
152
|
# Get the defined pre-processors
|
92
153
|
def pre_processors
|
93
|
-
|
154
|
+
control.pre_processors
|
94
155
|
end
|
95
156
|
|
96
157
|
# Define a post-processor
|
97
158
|
def post_process(name, configuration={})
|
98
|
-
|
99
|
-
begin
|
100
|
-
processor_class = ETL::Processor.const_get(class_name)
|
101
|
-
post_processors << processor_class.new(self, configuration)
|
102
|
-
rescue NameError
|
103
|
-
raise ControlError, "Unable to find postprocessor #{class_name}"
|
104
|
-
end
|
159
|
+
define_processor(name, post_processors, configuration)
|
105
160
|
end
|
106
161
|
|
107
162
|
# Get the defined post-processors
|
108
163
|
def post_processors
|
109
|
-
|
164
|
+
control.post_processors
|
110
165
|
end
|
111
166
|
|
112
167
|
# Get the binding object
|
@@ -117,12 +172,12 @@ module ETL #:nodoc:
|
|
117
172
|
protected
|
118
173
|
# Get an array of supported source types
|
119
174
|
def source_types
|
120
|
-
|
175
|
+
control.source_types
|
121
176
|
end
|
122
177
|
|
123
178
|
# Get an array of supported destination types
|
124
179
|
def destination_types
|
125
|
-
|
180
|
+
control.destination_types
|
126
181
|
end
|
127
182
|
|
128
183
|
end
|
@@ -173,6 +228,11 @@ module ETL #:nodoc:
|
|
173
228
|
@file = file
|
174
229
|
end
|
175
230
|
|
231
|
+
# Get a list of dependencies
|
232
|
+
def dependencies
|
233
|
+
@dependencies ||= []
|
234
|
+
end
|
235
|
+
|
176
236
|
# Get the defined source
|
177
237
|
def sources
|
178
238
|
@sources ||= []
|
@@ -188,6 +248,15 @@ module ETL #:nodoc:
|
|
188
248
|
transforms[name] ||= []
|
189
249
|
end
|
190
250
|
|
251
|
+
def after_read_processors
|
252
|
+
@after_read_processors ||= []
|
253
|
+
end
|
254
|
+
|
255
|
+
# Get all of the "before write" processors
|
256
|
+
def before_write_processors
|
257
|
+
@before_write_processors ||= []
|
258
|
+
end
|
259
|
+
|
191
260
|
# Get an Array of preprocessors
|
192
261
|
def pre_processors
|
193
262
|
@pre_processors ||= []
|
@@ -210,12 +279,20 @@ module ETL #:nodoc:
|
|
210
279
|
|
211
280
|
# Validate the control file
|
212
281
|
def validate
|
213
|
-
unless sources.length > 0
|
214
|
-
|
215
|
-
end
|
216
|
-
unless destinations.length > 0
|
217
|
-
|
218
|
-
end
|
282
|
+
#unless sources.length > 0
|
283
|
+
# raise ControlError, "Configuration must include one of the following for the source: #{source_types.join(',')}"
|
284
|
+
#end
|
285
|
+
#unless destinations.length > 0
|
286
|
+
# raise ControlError, "Configuration must include one of the following for the destination: #{destination_types.join(',')}"
|
287
|
+
#end
|
288
|
+
end
|
289
|
+
|
290
|
+
def source_types
|
291
|
+
[:file, :database]
|
292
|
+
end
|
293
|
+
|
294
|
+
def destination_types
|
295
|
+
[:file, :database]
|
219
296
|
end
|
220
297
|
|
221
298
|
end
|
@@ -17,6 +17,12 @@ module ETL #:nodoc:
|
|
17
17
|
# Unique flag.
|
18
18
|
attr_accessor :unique
|
19
19
|
|
20
|
+
# A condition for writing
|
21
|
+
attr_accessor :condition
|
22
|
+
|
23
|
+
# An array of rows to append to the destination
|
24
|
+
attr_accessor :append_rows
|
25
|
+
|
20
26
|
class << self
|
21
27
|
# Get the destination class for the specified name.
|
22
28
|
#
|
@@ -36,11 +42,15 @@ module ETL #:nodoc:
|
|
36
42
|
#
|
37
43
|
# Options:
|
38
44
|
# * <tt>:buffer_size</tt>: The output buffer size (default 1000 records)
|
45
|
+
# * <tt>:condition</tt>: A conditional proc that must return true for the row to be written
|
46
|
+
# * <tt>:append_rows</tt>: An array of rows to append
|
39
47
|
def initialize(control, configuration, mapping)
|
40
48
|
@control = control
|
41
49
|
@configuration = configuration
|
42
50
|
@mapping = mapping
|
43
51
|
@buffer_size = configuration[:buffer_size] ||= 1000
|
52
|
+
@condition = configuration[:condition]
|
53
|
+
@append_rows = configuration[:append_rows]
|
44
54
|
end
|
45
55
|
|
46
56
|
# Get the current row number
|
@@ -50,7 +60,9 @@ module ETL #:nodoc:
|
|
50
60
|
|
51
61
|
# Write the given row
|
52
62
|
def write(row)
|
53
|
-
|
63
|
+
if @condition.nil? || @condition.call(row)
|
64
|
+
buffer << row
|
65
|
+
end
|
54
66
|
flush if buffer.length >= buffer_size
|
55
67
|
end
|
56
68
|
|