activewarehouse-etl 0.5.2 → 0.6.0
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +41 -13
- data/README +1 -1
- data/Rakefile +14 -4
- data/TODO +17 -1
- data/bin/etl +3 -1
- data/lib/etl.rb +11 -7
- data/lib/etl/commands/etl.rb +0 -1
- data/lib/etl/control/control.rb +113 -36
- data/lib/etl/control/destination.rb +13 -1
- data/lib/etl/control/destination/database_destination.rb +3 -1
- data/lib/etl/control/destination/file_destination.rb +5 -2
- data/lib/etl/control/source.rb +36 -0
- data/lib/etl/control/source/database_source.rb +63 -8
- data/lib/etl/control/source/file_source.rb +25 -4
- data/lib/etl/engine.rb +128 -14
- data/lib/etl/generator/surrogate_key_generator.rb +1 -0
- data/lib/etl/http_tools.rb +119 -0
- data/lib/etl/parser/apache_combined_log_parser.rb +47 -0
- data/lib/etl/parser/sax_parser.rb +18 -6
- data/lib/etl/processor.rb +1 -0
- data/lib/etl/processor/bulk_import_processor.rb +12 -0
- data/lib/etl/processor/hierarchy_exploder_processor.rb +54 -0
- data/lib/etl/processor/processor.rb +1 -5
- data/lib/etl/processor/row_processor.rb +17 -0
- data/lib/etl/transform/date_to_string_transform.rb +1 -1
- data/lib/etl/transform/decode_transform.rb +1 -1
- data/lib/etl/transform/default_transform.rb +15 -0
- data/lib/etl/transform/foreign_key_lookup_transform.rb +1 -1
- data/lib/etl/transform/hierarchy_lookup_transform.rb +56 -0
- data/lib/etl/transform/sha1_transform.rb +1 -1
- data/lib/etl/transform/string_to_date_transform.rb +3 -3
- data/lib/etl/transform/string_to_datetime_transform.rb +17 -0
- data/lib/etl/transform/string_to_time_transform.rb +14 -0
- data/lib/etl/transform/transform.rb +8 -4
- data/lib/etl/transform/type_transform.rb +2 -2
- data/lib/etl/version.rb +2 -2
- metadata +21 -8
- data/lib/etl/active_record_ext.rb +0 -1
- data/lib/etl/active_record_ext/connection_adapters/mysql_adapter.rb +0 -34
data/CHANGELOG
CHANGED
@@ -18,21 +18,49 @@
|
|
18
18
|
* Added support for :unique => [] option and virtual fields for the database destination
|
19
19
|
|
20
20
|
0.5.0 - Feb 17, 2007
|
21
|
-
* Changed require_gem to gem and added alias to allow for older versions of rubygems
|
21
|
+
* Changed require_gem to gem and added alias to allow for older versions of rubygems.
|
22
22
|
* Added support for Hash in the source configuration where :name => :parser_name defines the parser to use and
|
23
|
-
:options => {} defines options to pass to the parser
|
24
|
-
* Added support for passing a custom Parser class in the source configuration
|
25
|
-
* Removed the need to include Enumerable in each parser implementation
|
26
|
-
* Added new date_to_string and string_to_date transformers
|
27
|
-
* Implemented foreign_key_lookup transform including an ActiveRecordResolver
|
28
|
-
* Added real time activity logging which is called when the etl bin script is invoked
|
29
|
-
* Improved error handling
|
30
|
-
* Default logger level is now WARN
|
23
|
+
:options => {} defines options to pass to the parser.
|
24
|
+
* Added support for passing a custom Parser class in the source configuration.
|
25
|
+
* Removed the need to include Enumerable in each parser implementation.
|
26
|
+
* Added new date_to_string and string_to_date transformers.
|
27
|
+
* Implemented foreign_key_lookup transform including an ActiveRecordResolver.
|
28
|
+
* Added real time activity logging which is called when the etl bin script is invoked.
|
29
|
+
* Improved error handling.
|
30
|
+
* Default logger level is now WARN.
|
31
31
|
|
32
32
|
0.5.1 - Feb 18, 2007
|
33
|
-
* Fixed up truncate processor
|
34
|
-
* Updated HOW_TO_RELEASE doc
|
33
|
+
* Fixed up truncate processor.
|
34
|
+
* Updated HOW_TO_RELEASE doc.
|
35
35
|
|
36
36
|
0.5.2 - Feb 19, 2007
|
37
|
-
* Added error threshold
|
38
|
-
* Fixed problem with transform error handling
|
37
|
+
* Added error threshold.
|
38
|
+
* Fixed problem with transform error handling.
|
39
|
+
|
40
|
+
0.6.0 - Mar 8, 2007
|
41
|
+
* Fixed missing method problem in validate in Control class.
|
42
|
+
* Removed control validation for now (source could be code in the control file).
|
43
|
+
* Transform interface now defined as taking 3 arguments, the field name, field value and the row. This
|
44
|
+
is not backwards compatible.
|
45
|
+
* Added HierarchyLookupTransform.
|
46
|
+
* Added DefaultTransform which will return a specified value if the initial value is blank.
|
47
|
+
* Added row-level processing.
|
48
|
+
* Added HierarchyExploderProcessor which takes a single hierarchy row and explodes it to multiple rows
|
49
|
+
as used in a hierarchy bridge.
|
50
|
+
* Added ApacheCombinedLogParser which parses Apache Combined Log format, including parsing of the
|
51
|
+
user agent string and the URI, returning a Hash.
|
52
|
+
* Fixed bug in SAX parser so that attributes are now set when the start_element event is received.
|
53
|
+
* Added an HttpTools module which provides some parsing methods (for user agent and URI).
|
54
|
+
* Database source now uses its own class for establishing an ActiveRecord connection.
|
55
|
+
* Log files are now timestamped.
|
56
|
+
* Source files are now archived automatically during the extraction process.
|
57
|
+
* Added a :condition option to the destination configuration Hash that accepts a Proc with a single
|
58
|
+
argument passed to it (the row).
|
59
|
+
* Added an :append_rows option to the destination configuration Hash that accepts either a Hash (to
|
60
|
+
append a single row) or an Array of Hashes (to append multiple rows).
|
61
|
+
* Only print the read and written row counts if there is at least one source and one destination
|
62
|
+
respectively.
|
63
|
+
* Added a depends_on directive that accepts a list of arguments of either strings or symbols. Each
|
64
|
+
symbol is converted to a string and .ctl is appended; strings are passed through directly. The
|
65
|
+
dependencies are executed in the order they are specified.
|
66
|
+
* The default field separator in the bulk loader is now a comma (was a tab).
|
data/README
CHANGED
data/Rakefile
CHANGED
@@ -56,10 +56,11 @@ spec = Gem::Specification.new do |s|
|
|
56
56
|
ActiveWarehouse ETL is a pure Ruby Extract-Transform-Load application for loading data into a database.
|
57
57
|
EOF
|
58
58
|
|
59
|
-
s.add_dependency('rake',
|
60
|
-
s.add_dependency('activesupport',
|
61
|
-
s.add_dependency('activerecord',
|
62
|
-
s.add_dependency('fastercsv',
|
59
|
+
s.add_dependency('rake', '>= 0.7.1')
|
60
|
+
s.add_dependency('activesupport', '>= 1.3.1')
|
61
|
+
s.add_dependency('activerecord', '>= 1.14.4')
|
62
|
+
s.add_dependency('fastercsv', '>= 1.0.0')
|
63
|
+
s.add_dependency('adapter_extensions', '>= 0.1.0')
|
63
64
|
|
64
65
|
s.rdoc_options << '--exclude' << '.'
|
65
66
|
s.has_rdoc = false
|
@@ -122,4 +123,13 @@ end
|
|
122
123
|
desc "Publish the API documentation"
|
123
124
|
task :pdoc => [:rdoc] do
|
124
125
|
Rake::SshDirPublisher.new("aeden@rubyforge.org", "/var/www/gforge-projects/activewarehouse/etl/rdoc", "rdoc").upload
|
126
|
+
end
|
127
|
+
|
128
|
+
desc "Reinstall the gem from a local package copy"
|
129
|
+
task :reinstall => [:package] do
|
130
|
+
windows = RUBY_PLATFORM =~ /mswin/
|
131
|
+
sudo = windows ? '' : 'sudo'
|
132
|
+
gem = windows ? 'gem.bat' : 'gem'
|
133
|
+
`#{sudo} #{gem} uninstall -x -i #{PKG_NAME}`
|
134
|
+
`#{sudo} #{gem} install pkg/#{PKG_NAME}-#{PKG_VERSION}`
|
125
135
|
end
|
data/TODO
CHANGED
@@ -8,4 +8,20 @@ TODO
|
|
8
8
|
** Allow mismatch row length error in delimited parser to be ignored
|
9
9
|
* Improve error messages throughout, but especially in problems with the control files
|
10
10
|
* Add support for determining if a row should be added, updated or removed vs. just blindly inserting
|
11
|
-
* Add support for paritioned views during the insert process. Use specifiable columns as the trigger columns for determining the data output destination.
|
11
|
+
* Add support for partitioned views during the insert process. Use specifiable columns as the trigger columns for determining the data output destination.
|
12
|
+
|
13
|
+
Audit Record
|
14
|
+
|
15
|
+
Process-Level
|
16
|
+
* Start Time
|
17
|
+
* End Time
|
18
|
+
* (Duration)
|
19
|
+
* Rows Read
|
20
|
+
* Rows Written
|
21
|
+
* Rows Rejected
|
22
|
+
* Errors
|
23
|
+
* Destination
|
24
|
+
Record-Level
|
25
|
+
* Source
|
26
|
+
* Timestamp
|
27
|
+
* Transformation Log
|
data/bin/etl
CHANGED
data/lib/etl.rb
CHANGED
@@ -25,9 +25,7 @@
|
|
25
25
|
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
26
26
|
#++
|
27
27
|
|
28
|
-
|
29
|
-
require 'logger'
|
30
|
-
end
|
28
|
+
require 'logger'
|
31
29
|
|
32
30
|
require 'rubygems'
|
33
31
|
|
@@ -45,11 +43,19 @@ unless defined?(ActiveRecord)
|
|
45
43
|
require 'active_record'
|
46
44
|
end
|
47
45
|
|
48
|
-
|
49
|
-
|
46
|
+
unless defined?(AdapterExtensions)
|
47
|
+
gem 'adapter_extensions'
|
48
|
+
require 'adapter_extensions'
|
49
|
+
end
|
50
|
+
|
51
|
+
unless defined?(FasterCSV)
|
52
|
+
gem 'fastercsv'
|
53
|
+
require 'faster_csv'
|
54
|
+
end
|
50
55
|
|
51
56
|
$:.unshift(File.dirname(__FILE__))
|
52
57
|
|
58
|
+
require 'etl/http_tools'
|
53
59
|
require 'etl/version'
|
54
60
|
require 'etl/engine'
|
55
61
|
require 'etl/control'
|
@@ -58,8 +64,6 @@ require 'etl/transform'
|
|
58
64
|
require 'etl/processor'
|
59
65
|
require 'etl/generator'
|
60
66
|
|
61
|
-
require 'etl/active_record_ext'
|
62
|
-
|
63
67
|
module ETL #:nodoc:
|
64
68
|
class ETLError < StandardError #:nodoc:
|
65
69
|
end
|
data/lib/etl/commands/etl.rb
CHANGED
data/lib/etl/control/control.rb
CHANGED
@@ -2,6 +2,8 @@ module ETL #:nodoc:
|
|
2
2
|
module Control #:nodoc:
|
3
3
|
# The Context is passed to eval.
|
4
4
|
class Context
|
5
|
+
attr_reader :control
|
6
|
+
|
5
7
|
class << self
|
6
8
|
# Create a Context instance
|
7
9
|
def create(control)
|
@@ -16,16 +18,29 @@ module ETL #:nodoc:
|
|
16
18
|
|
17
19
|
# Get the control file
|
18
20
|
def file
|
19
|
-
|
21
|
+
control.file
|
20
22
|
end
|
21
23
|
|
24
|
+
# Set the allowed error threshold
|
22
25
|
def set_error_threshold(error_threshold)
|
23
|
-
|
26
|
+
control.error_threshold = error_threshold
|
27
|
+
end
|
28
|
+
|
29
|
+
# Define a list of control files that this file depends on. Those control files
|
30
|
+
# will be executed prior to this control file. The list may contain symbols that will
|
31
|
+
# be converted to file names by calling to_s + '.ctl', or they may be strings in which
|
32
|
+
# case they will be used as is.
|
33
|
+
def depends_on(*args)
|
34
|
+
dependencies << args
|
35
|
+
end
|
36
|
+
|
37
|
+
# Get the defined dependencies
|
38
|
+
def dependencies
|
39
|
+
control.dependencies
|
24
40
|
end
|
25
41
|
|
26
42
|
# Define a source.
|
27
43
|
def source(name, configuration={}, definition={})
|
28
|
-
source_types = [:file, :db]
|
29
44
|
source_types.each do |source_type|
|
30
45
|
if configuration[source_type]
|
31
46
|
source_class = ETL::Control::Source.class_for_name(source_type)
|
@@ -36,7 +51,7 @@ module ETL #:nodoc:
|
|
36
51
|
|
37
52
|
# Get the defined sources
|
38
53
|
def sources
|
39
|
-
|
54
|
+
control.sources
|
40
55
|
end
|
41
56
|
|
42
57
|
# Define a destination
|
@@ -51,19 +66,30 @@ module ETL #:nodoc:
|
|
51
66
|
|
52
67
|
# Get the defined destinations
|
53
68
|
def destinations
|
54
|
-
|
69
|
+
control.destinations
|
55
70
|
end
|
56
71
|
|
57
72
|
# Define a transform
|
58
73
|
def transform(name, transformer=nil, configuration={}, &block)
|
59
74
|
transforms[name] ||= []
|
60
75
|
if transformer
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
76
|
+
case transformer
|
77
|
+
when String, Symbol
|
78
|
+
class_name = "#{transformer.to_s.classify}Transform"
|
79
|
+
begin
|
80
|
+
transform_class = ETL::Transform.const_get(class_name)
|
81
|
+
transforms[name] << transform_class.new(self, configuration)
|
82
|
+
rescue NameError => e
|
83
|
+
raise ControlError, "Unable to find transformer #{class_name}: #{e}"
|
84
|
+
end
|
85
|
+
else
|
86
|
+
#transformer.class.inspect
|
87
|
+
if transformer.is_a?(ETL::Transform::Transform)
|
88
|
+
Engine.logger.debug "Adding transformer #{transformer.inspect} for field #{name}"
|
89
|
+
transforms[name] << transformer
|
90
|
+
else
|
91
|
+
raise ControlError, "Transformer must be a String, Symbol or Transform instance"
|
92
|
+
end
|
67
93
|
end
|
68
94
|
elsif block_given?
|
69
95
|
transforms[name] << block
|
@@ -74,39 +100,68 @@ module ETL #:nodoc:
|
|
74
100
|
|
75
101
|
# Get the defined transforms
|
76
102
|
def transforms
|
77
|
-
|
103
|
+
control.transforms
|
104
|
+
end
|
105
|
+
|
106
|
+
protected
|
107
|
+
# This method is used to define a processor and insert into the specified processor
|
108
|
+
# collection.
|
109
|
+
def define_processor(name, processor_collection, configuration)
|
110
|
+
case name
|
111
|
+
when String, Symbol
|
112
|
+
class_name = "#{name.to_s.classify}Processor"
|
113
|
+
begin
|
114
|
+
processor_class = ETL::Processor.const_get(class_name)
|
115
|
+
processor_collection << processor_class.new(self, configuration)
|
116
|
+
rescue NameError => e
|
117
|
+
raise ControlError, "Unable to find processor #{class_name}: #{e}"
|
118
|
+
end
|
119
|
+
when Class
|
120
|
+
processor_collection << name.new(self, configuration)
|
121
|
+
else
|
122
|
+
raise ControlError, "The process declaration requires a String, Symbol or Class"
|
123
|
+
end
|
124
|
+
end
|
125
|
+
|
126
|
+
public
|
127
|
+
# Define an "after read" processor. This must be a row-level processor.
|
128
|
+
def after_read(name, configuration={})
|
129
|
+
define_processor(name, after_read_processors, configuration)
|
130
|
+
end
|
131
|
+
|
132
|
+
# Get the defined "after read" processors
|
133
|
+
def after_read_processors
|
134
|
+
control.after_read_processors
|
135
|
+
end
|
136
|
+
|
137
|
+
# Define a "before write" processor. This must be a row-level processor.
|
138
|
+
def before_write(name, configuration={})
|
139
|
+
define_processor(name, before_write_processors, configuration)
|
140
|
+
end
|
141
|
+
|
142
|
+
# Get the defined "before write" processors
|
143
|
+
def before_write_processors
|
144
|
+
control.before_write_processors
|
78
145
|
end
|
79
146
|
|
80
147
|
# Define a pre-processor
|
81
148
|
def pre_process(name, configuration={})
|
82
|
-
|
83
|
-
begin
|
84
|
-
processor_class = ETL::Processor.const_get(class_name)
|
85
|
-
pre_processors << processor_class.new(self, configuration)
|
86
|
-
rescue NameError => e
|
87
|
-
raise ControlError, "Unable to find preprocessor #{class_name}: #{e}"
|
88
|
-
end
|
149
|
+
define_processor(name, pre_processors, configuration)
|
89
150
|
end
|
90
151
|
|
91
152
|
# Get the defined pre-processors
|
92
153
|
def pre_processors
|
93
|
-
|
154
|
+
control.pre_processors
|
94
155
|
end
|
95
156
|
|
96
157
|
# Define a post-processor
|
97
158
|
def post_process(name, configuration={})
|
98
|
-
|
99
|
-
begin
|
100
|
-
processor_class = ETL::Processor.const_get(class_name)
|
101
|
-
post_processors << processor_class.new(self, configuration)
|
102
|
-
rescue NameError
|
103
|
-
raise ControlError, "Unable to find postprocessor #{class_name}"
|
104
|
-
end
|
159
|
+
define_processor(name, post_processors, configuration)
|
105
160
|
end
|
106
161
|
|
107
162
|
# Get the defined post-processors
|
108
163
|
def post_processors
|
109
|
-
|
164
|
+
control.post_processors
|
110
165
|
end
|
111
166
|
|
112
167
|
# Get the binding object
|
@@ -117,12 +172,12 @@ module ETL #:nodoc:
|
|
117
172
|
protected
|
118
173
|
# Get an array of supported source types
|
119
174
|
def source_types
|
120
|
-
|
175
|
+
control.source_types
|
121
176
|
end
|
122
177
|
|
123
178
|
# Get an array of supported destination types
|
124
179
|
def destination_types
|
125
|
-
|
180
|
+
control.destination_types
|
126
181
|
end
|
127
182
|
|
128
183
|
end
|
@@ -173,6 +228,11 @@ module ETL #:nodoc:
|
|
173
228
|
@file = file
|
174
229
|
end
|
175
230
|
|
231
|
+
# Get a list of dependencies
|
232
|
+
def dependencies
|
233
|
+
@dependencies ||= []
|
234
|
+
end
|
235
|
+
|
176
236
|
# Get the defined source
|
177
237
|
def sources
|
178
238
|
@sources ||= []
|
@@ -188,6 +248,15 @@ module ETL #:nodoc:
|
|
188
248
|
transforms[name] ||= []
|
189
249
|
end
|
190
250
|
|
251
|
+
def after_read_processors
|
252
|
+
@after_read_processors ||= []
|
253
|
+
end
|
254
|
+
|
255
|
+
# Get all of the "before write" processors
|
256
|
+
def before_write_processors
|
257
|
+
@before_write_processors ||= []
|
258
|
+
end
|
259
|
+
|
191
260
|
# Get an Array of preprocessors
|
192
261
|
def pre_processors
|
193
262
|
@pre_processors ||= []
|
@@ -210,12 +279,20 @@ module ETL #:nodoc:
|
|
210
279
|
|
211
280
|
# Validate the control file
|
212
281
|
def validate
|
213
|
-
unless sources.length > 0
|
214
|
-
|
215
|
-
end
|
216
|
-
unless destinations.length > 0
|
217
|
-
|
218
|
-
end
|
282
|
+
#unless sources.length > 0
|
283
|
+
# raise ControlError, "Configuration must include one of the following for the source: #{source_types.join(',')}"
|
284
|
+
#end
|
285
|
+
#unless destinations.length > 0
|
286
|
+
# raise ControlError, "Configuration must include one of the following for the destination: #{destination_types.join(',')}"
|
287
|
+
#end
|
288
|
+
end
|
289
|
+
|
290
|
+
def source_types
|
291
|
+
[:file, :database]
|
292
|
+
end
|
293
|
+
|
294
|
+
def destination_types
|
295
|
+
[:file, :database]
|
219
296
|
end
|
220
297
|
|
221
298
|
end
|
@@ -17,6 +17,12 @@ module ETL #:nodoc:
|
|
17
17
|
# Unique flag.
|
18
18
|
attr_accessor :unique
|
19
19
|
|
20
|
+
# A condition for writing
|
21
|
+
attr_accessor :condition
|
22
|
+
|
23
|
+
# An array of rows to append to the destination
|
24
|
+
attr_accessor :append_rows
|
25
|
+
|
20
26
|
class << self
|
21
27
|
# Get the destination class for the specified name.
|
22
28
|
#
|
@@ -36,11 +42,15 @@ module ETL #:nodoc:
|
|
36
42
|
#
|
37
43
|
# Options:
|
38
44
|
# * <tt>:buffer_size</tt>: The output buffer size (default 1000 records)
|
45
|
+
# * <tt>:condition</tt>: A conditional proc that must return true for the row to be written
|
46
|
+
# * <tt>:append_rows</tt>: An array of rows to append
|
39
47
|
def initialize(control, configuration, mapping)
|
40
48
|
@control = control
|
41
49
|
@configuration = configuration
|
42
50
|
@mapping = mapping
|
43
51
|
@buffer_size = configuration[:buffer_size] ||= 1000
|
52
|
+
@condition = configuration[:condition]
|
53
|
+
@append_rows = configuration[:append_rows]
|
44
54
|
end
|
45
55
|
|
46
56
|
# Get the current row number
|
@@ -50,7 +60,9 @@ module ETL #:nodoc:
|
|
50
60
|
|
51
61
|
# Write the given row
|
52
62
|
def write(row)
|
53
|
-
|
63
|
+
if @condition.nil? || @condition.call(row)
|
64
|
+
buffer << row
|
65
|
+
end
|
54
66
|
flush if buffer.length >= buffer_size
|
55
67
|
end
|
56
68
|
|