activewarehouse-etl 0.4.0 → 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +26 -14
- data/TODO +4 -2
- data/lib/etl.rb +10 -3
- data/lib/etl/active_record_ext/connection_adapters/mysql_adapter.rb +2 -2
- data/lib/etl/commands/etl.rb +6 -7
- data/lib/etl/control/control.rb +120 -52
- data/lib/etl/control/destination.rb +46 -5
- data/lib/etl/control/destination/database_destination.rb +45 -7
- data/lib/etl/control/destination/file_destination.rb +28 -4
- data/lib/etl/control/source.rb +16 -1
- data/lib/etl/control/source/database_source.rb +35 -5
- data/lib/etl/control/source/file_source.rb +33 -3
- data/lib/etl/engine.rb +129 -9
- data/lib/etl/generator/generator.rb +11 -2
- data/lib/etl/generator/surrogate_key_generator.rb +3 -2
- data/lib/etl/parser/delimited_parser.rb +3 -4
- data/lib/etl/parser/fixed_width_parser.rb +3 -4
- data/lib/etl/parser/parser.rb +7 -1
- data/lib/etl/parser/sax_parser.rb +190 -0
- data/lib/etl/parser/xml_parser.rb +2 -2
- data/lib/etl/processor/bulk_import_processor.rb +4 -4
- data/lib/etl/processor/processor.rb +1 -1
- data/lib/etl/processor/truncate_processor.rb +4 -4
- data/lib/etl/transform/date_to_string_transform.rb +19 -0
- data/lib/etl/transform/decode_transform.rb +15 -1
- data/lib/etl/transform/foreign_key_lookup_transform.rb +53 -0
- data/lib/etl/transform/string_to_date_transform.rb +14 -0
- data/lib/etl/transform/transform.rb +28 -9
- data/lib/etl/transform/type_transform.rb +22 -0
- data/lib/etl/version.rb +2 -2
- metadata +8 -3
@@ -1,7 +1,34 @@
|
|
1
|
-
module ETL
|
2
|
-
module Control
|
1
|
+
module ETL #:nodoc:
|
2
|
+
module Control #:nodoc:
|
3
|
+
# Destination which writes directly to a database. This is useful when you are dealing with
|
4
|
+
# a small amount of data. For larger amounts of data you should probably use the bulk
|
5
|
+
# loader if it is supported with your target database as it will use a much faster load
|
6
|
+
# method.
|
3
7
|
class DatabaseDestination < Destination
|
4
|
-
|
8
|
+
# Specify the order from the source
|
9
|
+
attr_reader :order
|
10
|
+
|
11
|
+
# Set to true to truncate the destination table first
|
12
|
+
attr_reader :truncate
|
13
|
+
|
14
|
+
# Initialize the database destination
|
15
|
+
#
|
16
|
+
# * <tt>control</tt>: The ETL::Control::Control instance
|
17
|
+
# * <tt>configuration</tt>: The configuration Hash
|
18
|
+
# * <tt>mapping</tt>: The mapping
|
19
|
+
#
|
20
|
+
# Configuration options:
|
21
|
+
# * <tt>:database</tt>: The database name (REQUIRED)
|
22
|
+
# * <tt>:table</tt>: The table to write to (REQUIRED)
|
23
|
+
# * <tt>:truncate</tt>: Set to true to truncate before writing (defaults to false)
|
24
|
+
# * <tt>:unique</tt>: Set to true to only insert unique records (defaults to false)
|
25
|
+
# * <tt>:adapter</tt>: The adapter to use (defaults to :mysql)
|
26
|
+
# * <tt>:username</tt>: The database username (defaults to 'root')
|
27
|
+
# * <tt>:password</tt>: The password to the database (defaults to nothing)
|
28
|
+
# * <tt>:host</tt>: The host for the database (defaults to 'localhost')
|
29
|
+
#
|
30
|
+
# Mapping options:
|
31
|
+
# * <tt>:order</tt>: The order of fields to write (REQUIRED)
|
5
32
|
def initialize(control, configuration, mapping)
|
6
33
|
super
|
7
34
|
@truncate = configuration[:truncate] ||= false
|
@@ -11,8 +38,9 @@ module ETL
|
|
11
38
|
connect
|
12
39
|
end
|
13
40
|
|
41
|
+
# Flush the currently buffered data
|
14
42
|
def flush
|
15
|
-
conn = ActiveRecord::Base.connection
|
43
|
+
conn = ETL::ActiveRecord::Base.connection
|
16
44
|
conn.transaction do
|
17
45
|
conn.truncate(configuration[:table]) if truncate
|
18
46
|
|
@@ -32,7 +60,7 @@ module ETL
|
|
32
60
|
end
|
33
61
|
q = "INSERT INTO #{configuration[:table]} (#{names.join(',')}) VALUES (#{values.join(',')})"
|
34
62
|
# ETL::Engine.logger.debug("Query: #{q}")
|
35
|
-
conn.
|
63
|
+
conn.insert(q, "Insert row #{current_row}")
|
36
64
|
@current_row += 1
|
37
65
|
end
|
38
66
|
buffer.clear
|
@@ -42,12 +70,22 @@ module ETL
|
|
42
70
|
# Close the connection
|
43
71
|
def close
|
44
72
|
flush
|
45
|
-
ActiveRecord::Base.connection.disconnect!
|
73
|
+
ETL::ActiveRecord::Base.connection.disconnect!
|
46
74
|
end
|
47
75
|
|
48
76
|
private
|
77
|
+
# Connect to the database.
|
78
|
+
#
|
79
|
+
# Required options:
|
80
|
+
# * <tt>:database</tt>: The database name
|
81
|
+
#
|
82
|
+
# Options:
|
83
|
+
# * <tt>:adapter</tt>: The adapter to use (defaults to :mysql)
|
84
|
+
# * <tt>:username</tt>: The database username (defaults to 'root')
|
85
|
+
# * <tt>:password</tt>: The password to the database (defaults to nothing)
|
86
|
+
# * <tt>:host</tt>: The host for the database (defaults to 'localhost')
|
49
87
|
def connect
|
50
|
-
ActiveRecord::Base.establish_connection(
|
88
|
+
ETL::ActiveRecord::Base.establish_connection(
|
51
89
|
:adapter => (configuration[:adapter] || :mysql),
|
52
90
|
:username => (configuration[:username] || 'root'),
|
53
91
|
:host => (configuration[:host] || 'localhost'),
|
@@ -2,13 +2,39 @@ module ETL #:nodoc:
|
|
2
2
|
module Control #:nodoc:
|
3
3
|
# File as the final destination.
|
4
4
|
class FileDestination < Destination
|
5
|
-
|
6
|
-
|
5
|
+
# The File to write to
|
6
|
+
attr_reader :file
|
7
|
+
|
8
|
+
# The output order
|
9
|
+
attr_reader :order
|
10
|
+
|
11
|
+
# Flag which indicates to append (default is to overwrite)
|
12
|
+
attr_accessor :append
|
13
|
+
|
14
|
+
# The separator
|
15
|
+
attr_accessor :separator
|
16
|
+
|
17
|
+
# The end of line marker
|
18
|
+
attr_accessor :eol
|
19
|
+
|
20
|
+
# The enclosure character
|
21
|
+
attr_accessor :enclose
|
7
22
|
|
8
23
|
# Initialize the object.
|
9
24
|
# * <tt>control</tt>: The Control object
|
10
25
|
# * <tt>configuration</tt>: The configuration map
|
11
26
|
# * <tt>mapping</tt>: The output mapping
|
27
|
+
#
|
28
|
+
# Configuration options:
|
29
|
+
# * <tt>:file</tt>: The file to write to (REQUIRED)
|
30
|
+
# * <tt>:append</tt>: Set to true to append to the file (default is to overwrite)
|
31
|
+
# * <tt>:separator</tt>: Record separator (default is a comma)
|
32
|
+
# * <tt>:eol</tt>: End of line marker (default is \n)
|
33
|
+
# * <tt>:enclose</tt>: Enclosure character (default is none)
|
34
|
+
# * <tt>:unique</tt>: Set to true to only write unique records
|
35
|
+
#
|
36
|
+
# Mapping options:
|
37
|
+
# * <tt>:order</tt>: The order array
|
12
38
|
def initialize(control, configuration, mapping)
|
13
39
|
super
|
14
40
|
@file = File.join(File.dirname(control.file), configuration[:file])
|
@@ -57,7 +83,6 @@ module ETL #:nodoc:
|
|
57
83
|
end
|
58
84
|
|
59
85
|
private
|
60
|
-
|
61
86
|
# Get the open file stream
|
62
87
|
def f
|
63
88
|
@f ||= open(file, mode)
|
@@ -67,7 +92,6 @@ module ETL #:nodoc:
|
|
67
92
|
def mode
|
68
93
|
append ? 'a' : 'w'
|
69
94
|
end
|
70
|
-
|
71
95
|
end
|
72
96
|
end
|
73
97
|
end
|
data/lib/etl/control/source.rb
CHANGED
@@ -3,9 +3,20 @@ module ETL #:nodoc:
|
|
3
3
|
# ETL source. Subclasses must implement the <tt>each</tt> method.
|
4
4
|
class Source
|
5
5
|
include Enumerable
|
6
|
-
|
6
|
+
|
7
|
+
# The control object
|
8
|
+
attr_accessor :control
|
9
|
+
|
10
|
+
# The configuration Hash
|
11
|
+
attr_accessor :configuration
|
12
|
+
|
13
|
+
# The definition Hash
|
14
|
+
attr_accessor :definition
|
7
15
|
|
8
16
|
class << self
|
17
|
+
# Convert the name to a Source class.
|
18
|
+
#
|
19
|
+
# For example if name is :database then this will return a DatabaseSource class
|
9
20
|
def class_for_name(name)
|
10
21
|
ETL::Control.const_get("#{name.to_s.classify}Source")
|
11
22
|
end
|
@@ -20,6 +31,10 @@ module ETL #:nodoc:
|
|
20
31
|
@configuration = configuration
|
21
32
|
@definition = definition
|
22
33
|
end
|
34
|
+
|
35
|
+
def errors
|
36
|
+
@errors ||= []
|
37
|
+
end
|
23
38
|
end
|
24
39
|
end
|
25
40
|
end
|
@@ -1,23 +1,53 @@
|
|
1
|
-
module ETL
|
2
|
-
module Control
|
1
|
+
module ETL #:nodoc:
|
2
|
+
module Control #:nodoc:
|
3
|
+
# Source object which extracts data from a database using ActiveRecord.
|
3
4
|
class DatabaseSource < Source
|
5
|
+
# Initialize the source.
|
6
|
+
#
|
7
|
+
# Arguments:
|
8
|
+
# * <tt>control</tt>: The ETL::Control::Control instance
|
9
|
+
# * <tt>configuration</tt>: The configuration Hash
|
10
|
+
# * <tt>definition</tt>: The source definition
|
11
|
+
#
|
12
|
+
# Required configuration options:
|
13
|
+
# * <tt>:table</tt>: The source table name
|
14
|
+
# * <tt>:database</tt>: The database name
|
15
|
+
#
|
16
|
+
# Other options:
|
17
|
+
# * <tt>:adapter</tt>: The adapter to use (defaults to :mysql)
|
18
|
+
# * <tt>:username</tt>: The database username (defaults to 'root')
|
19
|
+
# * <tt>:password</tt>: The password to the database (defaults to nothing)
|
20
|
+
# * <tt>:host</tt>: The host for the database (defaults to 'localhost')
|
4
21
|
def initialize(control, configuration, definition)
|
5
22
|
super
|
6
23
|
connect
|
7
24
|
end
|
8
25
|
|
26
|
+
def to_s
|
27
|
+
"#{configuration[:host]}/#{configuration[:database]}"
|
28
|
+
end
|
29
|
+
|
9
30
|
# Returns each row from the source
|
10
31
|
def each
|
11
|
-
conn = ActiveRecord::Base.connection
|
32
|
+
conn = ETL::ActiveRecord::Base.connection
|
12
33
|
conn.select_all("SELECT * FROM #{configuration[:table]}").each do |row|
|
13
34
|
yield row
|
14
35
|
end
|
15
36
|
end
|
16
37
|
|
17
38
|
private
|
39
|
+
# Connect to the database.
|
40
|
+
#
|
41
|
+
# Required options:
|
42
|
+
# * <tt>:database</tt>: The database name
|
43
|
+
#
|
44
|
+
# Options:
|
45
|
+
# * <tt>:adapter</tt>: The adapter to use (defaults to :mysql)
|
46
|
+
# * <tt>:username</tt>: The database username (defaults to 'root')
|
47
|
+
# * <tt>:password</tt>: The password to the database (defaults to nothing)
|
48
|
+
# * <tt>:host</tt>: The host for the database (defaults to 'localhost')
|
18
49
|
def connect
|
19
|
-
|
20
|
-
ActiveRecord::Base.establish_connection(
|
50
|
+
ETL::ActiveRecord::Base.establish_connection(
|
21
51
|
:adapter => (configuration[:adapter] || :mysql),
|
22
52
|
:username => (configuration[:username] || 'root'),
|
23
53
|
:host => (configuration[:host] || 'localhost'),
|
@@ -1,19 +1,49 @@
|
|
1
|
-
module ETL
|
2
|
-
module Control
|
1
|
+
module ETL #:nodoc:
|
2
|
+
module Control #:nodoc:
|
3
|
+
# A File source.
|
3
4
|
class FileSource < Source
|
5
|
+
# The number of lines to skip, default is 0
|
4
6
|
attr_accessor :skip_lines
|
7
|
+
|
8
|
+
# Accessor for the underlying parser
|
9
|
+
attr_accessor :parser
|
10
|
+
|
11
|
+
# Initialize the source
|
12
|
+
#
|
13
|
+
# Configuration options:
|
14
|
+
# * <tt>:parser</tt>: One of the following: a parser name as a String or symbol, a class which extends from Parser,
|
15
|
+
# a Hash with :name and optionally an :options key. Whether or not the parser uses the options is dependent on
|
16
|
+
# which parser is used. See the documentation for each parser for information on what options it accepts.
|
17
|
+
# * <tt>:skip_lines</tt>: The number of lines to skip (defaults to 0)
|
5
18
|
def initialize(control, configuration, definition)
|
6
19
|
super
|
7
20
|
configure
|
8
21
|
end
|
22
|
+
|
23
|
+
def to_s
|
24
|
+
configuration[:file]
|
25
|
+
end
|
26
|
+
|
9
27
|
# Returns each row from the source
|
10
28
|
def each
|
11
29
|
@parser.each { |row| yield row }
|
12
30
|
end
|
13
31
|
|
14
32
|
private
|
33
|
+
# Configure the source from the <tt>@configuration</tt> Hash.
#
# Builds <tt>@parser</tt> from the <tt>:parser</tt> option, which may be:
# * a Class: instantiated with this source
# * a String or Symbol: resolved through ETL::Parser::Parser.class_for_name
# * a Hash with a <tt>:name</tt> key and an optional <tt>:options</tt> key
#
# Also sets <tt>@skip_lines</tt> from <tt>:skip_lines</tt>, storing 0 back into
# the configuration when the option is absent.
#
# Raises ControlError when <tt>:parser</tt> is of any other type, or when it is
# a Hash without a <tt>:name</tt>.
def configure
  parser_config = @configuration[:parser]
  @parser =
    case parser_config
    when Class
      parser_config.new(self)
    when String, Symbol
      ETL::Parser::Parser.class_for_name(parser_config).new(self)
    when Hash
      name = parser_config[:name]
      # Reject a nameless Hash here rather than handing nil to the class lookup.
      raise ControlError, "Configuration option :parser must include a :name when given as a Hash" if name.nil?
      ETL::Parser::Parser.class_for_name(name).new(self, parser_config[:options])
    else
      raise ControlError, "Configuration option :parser must be a Class, String, Symbol or Hash"
    end
  @skip_lines = @configuration[:skip_lines] ||= 0
end
|
19
49
|
end
|
data/lib/etl/engine.rb
CHANGED
@@ -1,61 +1,181 @@
|
|
1
|
-
module ETL
|
1
|
+
module ETL #:nodoc:
|
2
|
+
module ActiveRecord #:nodoc:
|
3
|
+
# Base class which is used for ActiveRecord connections. This is necessary
|
4
|
+
# since AR connections are tied to the class, and using ActiveRecord::Base
|
5
|
+
# directly can cause problems if the connection is closed.
|
6
|
+
class Base < ::ActiveRecord::Base
|
7
|
+
end
|
8
|
+
end
|
9
|
+
|
10
|
+
# The main ETL engine class
|
2
11
|
class Engine
|
3
12
|
|
4
13
|
class << self
|
14
|
+
# Process the specified control file. Acceptable values for control_file are
|
15
|
+
# * Path to a file
|
16
|
+
# * File object
|
17
|
+
# * ETL::Control::Control instance
|
5
18
|
def process(control_file)
|
6
19
|
new().process(control_file)
|
7
20
|
end
|
8
21
|
|
22
|
+
# A logger for the engine
|
9
23
|
attr_accessor :logger
|
10
24
|
|
11
|
-
def logger
|
25
|
+
def logger #:nodoc:
|
12
26
|
unless @logger
|
13
27
|
@logger = Logger.new('etl.log')
|
14
|
-
@logger.level = Logger::
|
28
|
+
@logger.level = Logger::WARN
|
15
29
|
end
|
16
30
|
@logger
|
17
31
|
end
|
32
|
+
|
33
|
+
attr_accessor :current_source
|
34
|
+
attr_accessor :current_source_row
|
35
|
+
attr_accessor :current_destination
|
36
|
+
|
37
|
+
attr_accessor :realtime_activity
|
38
|
+
end
|
39
|
+
|
40
|
+
def say(message)
|
41
|
+
say_without_newline(message + "\n")
|
42
|
+
end
|
43
|
+
|
44
|
+
def say_without_newline(message)
|
45
|
+
if Engine.realtime_activity
|
46
|
+
$stdout.print message
|
47
|
+
$stdout.flush
|
48
|
+
end
|
49
|
+
end
|
50
|
+
|
51
|
+
def say_on_own_line(message)
|
52
|
+
say("\n" + message)
|
18
53
|
end
|
19
54
|
|
20
|
-
# Process a control file or object.
|
55
|
+
# Process a control file or object. Acceptable values for control are:
# * Path to a file
# * File object
# * ETL::Control::Control instance
#
# Runs the pre-processors, then reads every row of every source, applies the
# control's transforms to each field and writes the row to the destinations.
# Destinations are closed after each source. Post-processors run last.
#
# Transform and write failures do not abort the run: the message is appended
# to the <tt>errors</tt> list of the source or destination involved and logged
# through Engine.logger.
def process(control)
  start_time = Time.now
  control = ETL::Control::Control.resolve(control)

  Engine.logger.debug "Pre-processing #{control.file}"
  pre_process(control)
  Engine.logger.debug "Pre-processing complete"

  sources = control.sources
  destinations = control.destinations

  sources.each do |source|
    Engine.current_source = source
    Engine.logger.debug "Processing source #{source}"
    say "Source: #{source}"
    source.each_with_index do |row, index|
      Engine.current_source_row = index + 1
      # Progress indicator: one dot per 1000 rows when realtime output is on.
      if Engine.realtime_activity && index % 1000 == 0
        say_without_newline "."
      end

      begin
        # execute transforms
        row.each do |name, value|
          row[name] = ETL::Transform::Transform.transform(name, value, control.transform(name))
        end
      rescue => e
        msg = "Error transforming from #{source} on line #{index}: #{e}"
        source.errors << msg
        Engine.logger.error msg
      end

      begin
        # write the row to the destination
        destinations.each do |destination|
          Engine.current_destination = destination
          destination.write(row)
        end
      rescue => e
        # The block parameter of the loop above is out of scope here, so read
        # the failing destination back from Engine.current_destination, which
        # is set immediately before each write.
        failed = Engine.current_destination
        msg = "Error writing to #{failed} on line #{index}: #{e}"
        failed.errors << msg
        Engine.logger.error msg
      end
    end
    say_on_own_line "Processed #{Engine.current_source_row} rows in #{distance_of_time_in_words(start_time)}"
    destinations.each do |destination|
      destination.close
    end
  end

  Engine.logger.debug "Post-processing #{control.file}"
  post_process(control)
  Engine.logger.debug "Post-processing complete"
end
|
47
113
|
|
48
114
|
private
|
115
|
+
# Execute all preprocessors
|
49
116
|
def pre_process(control)
|
50
117
|
control.pre_processors.each do |processor|
|
51
118
|
processor.process
|
52
119
|
end
|
53
120
|
end
|
54
121
|
|
122
|
+
# Execute all postprocessors
|
55
123
|
def post_process(control)
|
56
124
|
control.post_processors.each do |processor|
|
57
125
|
processor.process
|
58
126
|
end
|
59
127
|
end
|
128
|
+
|
129
|
+
# Return the exact distance of time in words between from_time and to_time,
# for example "1 days, 2 hours, 5 minutes, 30 seconds". If to_time is not
# specified then Time.now is used.
#
# Day, hour and minute parts are left out when they are zero. The seconds
# part is always present. The order of the two arguments does not matter:
# the absolute difference is reported.
def distance_of_time_in_words(from_time, to_time=Time.now)
  from_time = from_time.to_time if from_time.respond_to?(:to_time)
  to_time = to_time.to_time if to_time.respond_to?(:to_time)

  seconds = (to_time - from_time).abs.round
  days, seconds = seconds.divmod(60 * 60 * 24)
  hours, seconds = seconds.divmod(60 * 60)
  minutes, seconds = seconds.divmod(60)

  parts = []
  parts << "#{days} days" if days > 0
  parts << "#{hours} hours" if hours > 0
  parts << "#{minutes} minutes" if minutes > 0
  parts << "#{seconds} seconds"
  # Joining gives every part the same ", " separator.
  parts.join(', ')
end
|
151
|
+
|
152
|
+
# Return an approximate, human-friendly description of the distance between
# from_time and to_time, for example "about 2 hours" or "3 days". If to_time
# is not specified then Time.now is used. The absolute difference is used, so
# the order of the two arguments does not matter.
#
# For distances that round to at most one minute, include_seconds selects
# the level of detail: when true (the default) the result is bucketed by
# seconds ("less than 5 seconds", "half a minute", ...); when false it is
# either "less than a minute" or "1 minute".
def approximate_distance_of_time_in_words(from_time, to_time=Time.now, include_seconds=true)
  from_time = from_time.to_time if from_time.respond_to?(:to_time)
  to_time = to_time.to_time if to_time.respond_to?(:to_time)
  elapsed = (to_time - from_time).abs
  distance_in_minutes = (elapsed / 60).round
  distance_in_seconds = elapsed.round

  case distance_in_minutes
  when 0..1
    unless include_seconds
      return distance_in_minutes == 0 ? 'less than a minute' : '1 minute'
    end
    case distance_in_seconds
    when 0..4 then 'less than 5 seconds'
    when 5..9 then 'less than 10 seconds'
    when 10..19 then 'less than 20 seconds'
    when 20..39 then 'half a minute'
    when 40..59 then 'less than a minute'
    else '1 minute'
    end
  when 2..44 then "#{distance_in_minutes} minutes"
  when 45..89 then 'about 1 hour'
  when 90..1439 then "about #{(distance_in_minutes / 60.0).round} hours"
  when 1440..2879 then '1 day'
  # Divide as floats so the count is rounded, not truncated, to whole units.
  when 2880..43199 then "#{(distance_in_minutes / 1440.0).round} days"
  when 43200..86399 then 'about 1 month'
  when 86400..525959 then "#{(distance_in_minutes / 43200.0).round} months"
  when 525960..1051919 then 'about 1 year'
  # "over N years" deliberately keeps the whole number of years (floor).
  else "over #{distance_in_minutes / 525960} years"
  end
end
|
60
180
|
end
|
61
181
|
end
|