activewarehouse-etl 0.4.0 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +26 -14
- data/TODO +4 -2
- data/lib/etl.rb +10 -3
- data/lib/etl/active_record_ext/connection_adapters/mysql_adapter.rb +2 -2
- data/lib/etl/commands/etl.rb +6 -7
- data/lib/etl/control/control.rb +120 -52
- data/lib/etl/control/destination.rb +46 -5
- data/lib/etl/control/destination/database_destination.rb +45 -7
- data/lib/etl/control/destination/file_destination.rb +28 -4
- data/lib/etl/control/source.rb +16 -1
- data/lib/etl/control/source/database_source.rb +35 -5
- data/lib/etl/control/source/file_source.rb +33 -3
- data/lib/etl/engine.rb +129 -9
- data/lib/etl/generator/generator.rb +11 -2
- data/lib/etl/generator/surrogate_key_generator.rb +3 -2
- data/lib/etl/parser/delimited_parser.rb +3 -4
- data/lib/etl/parser/fixed_width_parser.rb +3 -4
- data/lib/etl/parser/parser.rb +7 -1
- data/lib/etl/parser/sax_parser.rb +190 -0
- data/lib/etl/parser/xml_parser.rb +2 -2
- data/lib/etl/processor/bulk_import_processor.rb +4 -4
- data/lib/etl/processor/processor.rb +1 -1
- data/lib/etl/processor/truncate_processor.rb +4 -4
- data/lib/etl/transform/date_to_string_transform.rb +19 -0
- data/lib/etl/transform/decode_transform.rb +15 -1
- data/lib/etl/transform/foreign_key_lookup_transform.rb +53 -0
- data/lib/etl/transform/string_to_date_transform.rb +14 -0
- data/lib/etl/transform/transform.rb +28 -9
- data/lib/etl/transform/type_transform.rb +22 -0
- data/lib/etl/version.rb +2 -2
- metadata +8 -3
@@ -1,7 +1,34 @@
|
|
1
|
-
module ETL
|
2
|
-
module Control
|
1
|
+
module ETL #:nodoc:
|
2
|
+
module Control #:nodoc:
|
3
|
+
# Destination which writes directly to a database. This is useful when you are dealing with
|
4
|
+
# a small amount of data. For larger amounts of data you should probably use the bulk
|
5
|
+
# loader if it is supported with your target database as it will use a much faster load
|
6
|
+
# method.
|
3
7
|
class DatabaseDestination < Destination
|
4
|
-
|
8
|
+
# Specify the order from the source
|
9
|
+
attr_reader :order
|
10
|
+
|
11
|
+
# Set to true to truncate the destination table first
|
12
|
+
attr_reader :truncate
|
13
|
+
|
14
|
+
# Initialize the database destination
|
15
|
+
#
|
16
|
+
# * <tt>control</tt>: The ETL::Control::Control instance
|
17
|
+
# * <tt>configuration</tt>: The configuration Hash
|
18
|
+
# * <tt>mapping</tt>: The mapping
|
19
|
+
#
|
20
|
+
# Configuration options:
|
21
|
+
# * <tt>:database</tt>: The database name (REQUIRED)
|
22
|
+
# * <tt>:table</tt>: The table to write to (REQUIRED)
|
23
|
+
# * <tt>:truncate</tt>: Set to true to truncate before writing (defaults to false)
|
24
|
+
# * <tt>:unique</tt>: Set to true to only insert unique records (defaults to false)
|
25
|
+
# * <tt>:adapter</tt>: The adapter to use (defaults to :mysql)
|
26
|
+
# * <tt>:username</tt>: The database username (defaults to 'root')
|
27
|
+
# * <tt>:password</tt>: The password to the database (defaults to nothing)
|
28
|
+
# * <tt>:host</tt>: The host for the database (defaults to 'localhost')
|
29
|
+
#
|
30
|
+
# Mapping options:
|
31
|
+
# * <tt>:order</tt>: The order of fields to write (REQUIRED)
|
5
32
|
def initialize(control, configuration, mapping)
|
6
33
|
super
|
7
34
|
@truncate = configuration[:truncate] ||= false
|
@@ -11,8 +38,9 @@ module ETL
|
|
11
38
|
connect
|
12
39
|
end
|
13
40
|
|
41
|
+
# Flush the currently buffered data
|
14
42
|
def flush
|
15
|
-
conn = ActiveRecord::Base.connection
|
43
|
+
conn = ETL::ActiveRecord::Base.connection
|
16
44
|
conn.transaction do
|
17
45
|
conn.truncate(configuration[:table]) if truncate
|
18
46
|
|
@@ -32,7 +60,7 @@ module ETL
|
|
32
60
|
end
|
33
61
|
q = "INSERT INTO #{configuration[:table]} (#{names.join(',')}) VALUES (#{values.join(',')})"
|
34
62
|
# ETL::Engine.logger.debug("Query: #{q}")
|
35
|
-
conn.
|
63
|
+
conn.insert(q, "Insert row #{current_row}")
|
36
64
|
@current_row += 1
|
37
65
|
end
|
38
66
|
buffer.clear
|
@@ -42,12 +70,22 @@ module ETL
|
|
42
70
|
# Close the connection
|
43
71
|
def close
|
44
72
|
flush
|
45
|
-
ActiveRecord::Base.connection.disconnect!
|
73
|
+
ETL::ActiveRecord::Base.connection.disconnect!
|
46
74
|
end
|
47
75
|
|
48
76
|
private
|
77
|
+
# Connect to the database.
|
78
|
+
#
|
79
|
+
# Required options:
|
80
|
+
# * <tt>:database</tt>: The database name
|
81
|
+
#
|
82
|
+
# Options:
|
83
|
+
# * <tt>:adapter</tt>: The adapter to use (defaults to :mysql)
|
84
|
+
# * <tt>:username</tt>: The database username (defaults to 'root')
|
85
|
+
# * <tt>:password</tt>: The password to the database (defaults to nothing)
|
86
|
+
# * <tt>:host</tt>: The host for the database (defaults to 'localhost')
|
49
87
|
def connect
|
50
|
-
ActiveRecord::Base.establish_connection(
|
88
|
+
ETL::ActiveRecord::Base.establish_connection(
|
51
89
|
:adapter => (configuration[:adapter] || :mysql),
|
52
90
|
:username => (configuration[:username] || 'root'),
|
53
91
|
:host => (configuration[:host] || 'localhost'),
|
@@ -2,13 +2,39 @@ module ETL #:nodoc:
|
|
2
2
|
module Control #:nodoc:
|
3
3
|
# File as the final destination.
|
4
4
|
class FileDestination < Destination
|
5
|
-
|
6
|
-
|
5
|
+
# The File to write to
|
6
|
+
attr_reader :file
|
7
|
+
|
8
|
+
# The output order
|
9
|
+
attr_reader :order
|
10
|
+
|
11
|
+
# Flag which indicates to append (default is to overwrite)
|
12
|
+
attr_accessor :append
|
13
|
+
|
14
|
+
# The separator
|
15
|
+
attr_accessor :separator
|
16
|
+
|
17
|
+
# The end of line marker
|
18
|
+
attr_accessor :eol
|
19
|
+
|
20
|
+
# The enclosure character
|
21
|
+
attr_accessor :enclose
|
7
22
|
|
8
23
|
# Initialize the object.
|
9
24
|
# * <tt>control</tt>: The Control object
|
10
25
|
# * <tt>configuration</tt>: The configuration map
|
11
26
|
# * <tt>mapping</tt>: The output mapping
|
27
|
+
#
|
28
|
+
# Configuration options:
|
29
|
+
# * <tt>:file</tt>: The file to write to (REQUIRED)
|
30
|
+
# * <tt>:append</tt>: Set to true to append to the file (default is to overwrite)
|
31
|
+
# * <tt>:separator</tt>: Record separator (default is a comma)
|
32
|
+
# * <tt>:eol</tt>: End of line marker (default is \n)
|
33
|
+
# * <tt>:enclose</tt>: Enclosure character (default is none)
|
34
|
+
# * <tt>:unique</tt>: Set to true to only write unique records
|
35
|
+
#
|
36
|
+
# Mapping options:
|
37
|
+
# * <tt>:order</tt>: The order array
|
12
38
|
def initialize(control, configuration, mapping)
|
13
39
|
super
|
14
40
|
@file = File.join(File.dirname(control.file), configuration[:file])
|
@@ -57,7 +83,6 @@ module ETL #:nodoc:
|
|
57
83
|
end
|
58
84
|
|
59
85
|
private
|
60
|
-
|
61
86
|
# Get the open file stream
|
62
87
|
def f
|
63
88
|
@f ||= open(file, mode)
|
@@ -67,7 +92,6 @@ module ETL #:nodoc:
|
|
67
92
|
def mode
|
68
93
|
append ? 'a' : 'w'
|
69
94
|
end
|
70
|
-
|
71
95
|
end
|
72
96
|
end
|
73
97
|
end
|
data/lib/etl/control/source.rb
CHANGED
@@ -3,9 +3,20 @@ module ETL #:nodoc:
|
|
3
3
|
# ETL source. Subclasses must implement the <tt>each</tt> method.
|
4
4
|
class Source
|
5
5
|
include Enumerable
|
6
|
-
|
6
|
+
|
7
|
+
# The control object
|
8
|
+
attr_accessor :control
|
9
|
+
|
10
|
+
# The configuration Hash
|
11
|
+
attr_accessor :configuration
|
12
|
+
|
13
|
+
# The definition Hash
|
14
|
+
attr_accessor :definition
|
7
15
|
|
8
16
|
class << self
|
17
|
+
# Convert the name to a Source class.
|
18
|
+
#
|
19
|
+
# For example if name is :database then this will return a DatabaseSource class
|
9
20
|
def class_for_name(name)
|
10
21
|
ETL::Control.const_get("#{name.to_s.classify}Source")
|
11
22
|
end
|
@@ -20,6 +31,10 @@ module ETL #:nodoc:
|
|
20
31
|
@configuration = configuration
|
21
32
|
@definition = definition
|
22
33
|
end
|
34
|
+
|
35
|
+
def errors
|
36
|
+
@errors ||= []
|
37
|
+
end
|
23
38
|
end
|
24
39
|
end
|
25
40
|
end
|
@@ -1,23 +1,53 @@
|
|
1
|
-
module ETL
|
2
|
-
module Control
|
1
|
+
module ETL #:nodoc:
|
2
|
+
module Control #:nodoc:
|
3
|
+
# Source object which extracts data from a database using ActiveRecord.
|
3
4
|
class DatabaseSource < Source
|
5
|
+
# Initialize the source.
|
6
|
+
#
|
7
|
+
# Arguments:
|
8
|
+
# * <tt>control</tt>: The ETL::Control::Control instance
|
9
|
+
# * <tt>configuration</tt>: The configuration Hash
|
10
|
+
# * <tt>definition</tt>: The source definition
|
11
|
+
#
|
12
|
+
# Required configuration options:
|
13
|
+
# * <tt>:table</tt>: The source table name
|
14
|
+
# * <tt>:database</tt>: The database name
|
15
|
+
#
|
16
|
+
# Other options:
|
17
|
+
# * <tt>:adapter</tt>: The adapter to use (defaults to :mysql)
|
18
|
+
# * <tt>:username</tt>: The database username (defaults to 'root')
|
19
|
+
# * <tt>:password</tt>: The password to the database (defaults to nothing)
|
20
|
+
# * <tt>:host</tt>: The host for the database (defaults to 'localhost')
|
4
21
|
def initialize(control, configuration, definition)
|
5
22
|
super
|
6
23
|
connect
|
7
24
|
end
|
8
25
|
|
26
|
+
def to_s
|
27
|
+
"#{configuration[:host]}/#{configuration[:database]}"
|
28
|
+
end
|
29
|
+
|
9
30
|
# Returns each row from the source
|
10
31
|
def each
|
11
|
-
conn = ActiveRecord::Base.connection
|
32
|
+
conn = ETL::ActiveRecord::Base.connection
|
12
33
|
conn.select_all("SELECT * FROM #{configuration[:table]}").each do |row|
|
13
34
|
yield row
|
14
35
|
end
|
15
36
|
end
|
16
37
|
|
17
38
|
private
|
39
|
+
# Connect to the database.
|
40
|
+
#
|
41
|
+
# Required options:
|
42
|
+
# * <tt>:database</tt>: The database name
|
43
|
+
#
|
44
|
+
# Options:
|
45
|
+
# * <tt>:adapter</tt>: The adapter to use (defaults to :mysql)
|
46
|
+
# * <tt>:username</tt>: The database username (defaults to 'root')
|
47
|
+
# * <tt>:password</tt>: The password to the database (defaults to nothing)
|
48
|
+
# * <tt>:host</tt>: The host for the database (defaults to 'localhost')
|
18
49
|
def connect
|
19
|
-
|
20
|
-
ActiveRecord::Base.establish_connection(
|
50
|
+
ETL::ActiveRecord::Base.establish_connection(
|
21
51
|
:adapter => (configuration[:adapter] || :mysql),
|
22
52
|
:username => (configuration[:username] || 'root'),
|
23
53
|
:host => (configuration[:host] || 'localhost'),
|
@@ -1,19 +1,49 @@
|
|
1
|
-
module ETL
|
2
|
-
module Control
|
1
|
+
module ETL #:nodoc:
|
2
|
+
module Control #:nodoc:
|
3
|
+
# A File source.
|
3
4
|
class FileSource < Source
|
5
|
+
# The number of lines to skip, default is 0
|
4
6
|
attr_accessor :skip_lines
|
7
|
+
|
8
|
+
# Accessor for the underlying parser
|
9
|
+
attr_accessor :parser
|
10
|
+
|
11
|
+
# Initialize the source
|
12
|
+
#
|
13
|
+
# Configuration options:
|
14
|
+
# * <tt>:parser</tt>: One of the following: a parser name as a String or symbol, a class which extends from Parser,
|
15
|
+
# a Hash with :name and optionally an :options key. Whether or not the parser uses the options is dependent on
|
16
|
+
# which parser is used. See the documentation for each parser for information on what options it accepts.
|
17
|
+
# * <tt>:skip_lines</tt>: The number of lines to skip (defaults to 0)
|
5
18
|
def initialize(control, configuration, definition)
|
6
19
|
super
|
7
20
|
configure
|
8
21
|
end
|
22
|
+
|
23
|
+
def to_s
|
24
|
+
configuration[:file]
|
25
|
+
end
|
26
|
+
|
9
27
|
# Returns each row from the source
|
10
28
|
def each
|
11
29
|
@parser.each { |row| yield row }
|
12
30
|
end
|
13
31
|
|
14
32
|
private
|
33
|
+
# Configure the source
|
15
34
|
def configure
|
16
|
-
|
35
|
+
case @configuration[:parser]
|
36
|
+
when Class
|
37
|
+
@parser = @configuration[:parser].new(self)
|
38
|
+
when String, Symbol
|
39
|
+
@parser = ETL::Parser::Parser.class_for_name(@configuration[:parser]).new(self)
|
40
|
+
when Hash
|
41
|
+
name = @configuration[:parser][:name]
|
42
|
+
options = @configuration[:parser][:options]
|
43
|
+
@parser = ETL::Parser::Parser.class_for_name(name).new(self, options)
|
44
|
+
else
|
45
|
+
raise ControlError, "Configuration option :parser must be a Class, String or Symbol"
|
46
|
+
end
|
17
47
|
@skip_lines = @configuration[:skip_lines] ||= 0
|
18
48
|
end
|
19
49
|
end
|
data/lib/etl/engine.rb
CHANGED
@@ -1,61 +1,181 @@
|
|
1
|
-
module ETL
|
1
|
+
module ETL #:nodoc:
|
2
|
+
module ActiveRecord #:nodoc:
|
3
|
+
# Base class which is used for ActiveRecord connections. This is necessary
|
4
|
+
# since AR connections are tied to the class, and using ActiveRecord::Base
|
5
|
+
# directly can cause problems if the connection is closed.
|
6
|
+
class Base < ::ActiveRecord::Base
|
7
|
+
end
|
8
|
+
end
|
9
|
+
|
10
|
+
# The main ETL engine class
|
2
11
|
class Engine
|
3
12
|
|
4
13
|
class << self
|
14
|
+
# Process the specified control file. Acceptable values for control_file are
|
15
|
+
# * Path to a file
|
16
|
+
# * File object
|
17
|
+
# * ETL::Control::Control instance
|
5
18
|
def process(control_file)
|
6
19
|
new().process(control_file)
|
7
20
|
end
|
8
21
|
|
22
|
+
# A logger for the engine
|
9
23
|
attr_accessor :logger
|
10
24
|
|
11
|
-
def logger
|
25
|
+
def logger #:nodoc:
|
12
26
|
unless @logger
|
13
27
|
@logger = Logger.new('etl.log')
|
14
|
-
@logger.level = Logger::
|
28
|
+
@logger.level = Logger::WARN
|
15
29
|
end
|
16
30
|
@logger
|
17
31
|
end
|
32
|
+
|
33
|
+
attr_accessor :current_source
|
34
|
+
attr_accessor :current_source_row
|
35
|
+
attr_accessor :current_destination
|
36
|
+
|
37
|
+
attr_accessor :realtime_activity
|
38
|
+
end
|
39
|
+
|
40
|
+
def say(message)
|
41
|
+
say_without_newline(message + "\n")
|
42
|
+
end
|
43
|
+
|
44
|
+
def say_without_newline(message)
|
45
|
+
if Engine.realtime_activity
|
46
|
+
$stdout.print message
|
47
|
+
$stdout.flush
|
48
|
+
end
|
49
|
+
end
|
50
|
+
|
51
|
+
def say_on_own_line(message)
|
52
|
+
say("\n" + message)
|
18
53
|
end
|
19
54
|
|
20
|
-
# Process a control file or object.
|
55
|
+
# Process a control file or object. Acceptable values for control are:
|
56
|
+
# * Path to a file
|
57
|
+
# * File object
|
58
|
+
# * ETL::Control::Control instance
|
21
59
|
def process(control)
|
60
|
+
start_time = Time.now
|
22
61
|
control = ETL::Control::Control.resolve(control)
|
23
62
|
|
63
|
+
Engine.logger.debug "Pre-processing #{control.file}"
|
24
64
|
pre_process(control)
|
65
|
+
Engine.logger.debug "Pre-processing complete"
|
25
66
|
|
26
67
|
sources = control.sources
|
27
68
|
destinations = control.destinations
|
28
69
|
|
29
70
|
sources.each do |source|
|
71
|
+
Engine.current_source = source
|
72
|
+
Engine.logger.debug "Processing source #{source}"
|
73
|
+
say "Source: #{source}"
|
30
74
|
source.each_with_index do |row, index|
|
31
|
-
|
75
|
+
Engine.current_source_row = index + 1
|
76
|
+
if Engine.realtime_activity && index % 1000 == 0
|
77
|
+
say_without_newline "."
|
78
|
+
end
|
79
|
+
|
80
|
+
begin
|
32
81
|
# execute transforms
|
33
|
-
row
|
82
|
+
row.each do |name, value|
|
83
|
+
row[name] = ETL::Transform::Transform.transform(name, value, control.transform(name))
|
84
|
+
end
|
85
|
+
rescue => e
|
86
|
+
msg = "Error transforming from #{source} on line #{index}: #{e}"
|
87
|
+
source.errors << msg
|
88
|
+
Engine.logger.error msg
|
34
89
|
end
|
35
|
-
|
36
|
-
|
37
|
-
|
90
|
+
|
91
|
+
begin
|
92
|
+
# write the row to the destination
|
93
|
+
destinations.each do |destination|
|
94
|
+
Engine.current_destination = destination
|
95
|
+
destination.write(row)
|
96
|
+
end
|
97
|
+
rescue
|
98
|
+
msg = "Error writing to #{destination} on line #{index}"
|
99
|
+
destination.errors << msg
|
100
|
+
Engine.logger.error msg
|
38
101
|
end
|
39
102
|
end
|
103
|
+
say_on_own_line "Processed #{Engine.current_source_row} rows in #{distance_of_time_in_words(start_time)}"
|
40
104
|
destinations.each do |destination|
|
41
105
|
destination.close
|
42
106
|
end
|
43
107
|
end
|
44
108
|
|
109
|
+
Engine.logger.debug "Post-processing #{control.file}"
|
45
110
|
post_process(control)
|
111
|
+
Engine.logger.debug "Post-processing complete"
|
46
112
|
end
|
47
113
|
|
48
114
|
private
|
115
|
+
# Execute all preprocessors
|
49
116
|
def pre_process(control)
|
50
117
|
control.pre_processors.each do |processor|
|
51
118
|
processor.process
|
52
119
|
end
|
53
120
|
end
|
54
121
|
|
122
|
+
# Execute all postprocessors
|
55
123
|
def post_process(control)
|
56
124
|
control.post_processors.each do |processor|
|
57
125
|
processor.process
|
58
126
|
end
|
59
127
|
end
|
128
|
+
|
129
|
+
# Return the distance of time in words from the given from_time to the specified to_time. If to_time
|
130
|
+
# is not specified then Time.now is used. Days, hours and minutes are included only when they are
|
131
|
+
# non-zero; the seconds are always included.
|
132
|
+
def distance_of_time_in_words(from_time, to_time=Time.now)
|
133
|
+
from_time = from_time.to_time if from_time.respond_to?(:to_time)
|
134
|
+
to_time = to_time.to_time if to_time.respond_to?(:to_time)
|
135
|
+
seconds = (to_time - from_time).round
|
136
|
+
distance_in_days = (seconds/(60*60*24)).round
|
137
|
+
seconds = seconds % (60*60*24)
|
138
|
+
distance_in_hours = (seconds/(60*60)).round
|
139
|
+
seconds = seconds % (60*60)
|
140
|
+
distance_in_minutes = (seconds/60).round
|
141
|
+
seconds = seconds % 60
|
142
|
+
distance_in_seconds = seconds
|
143
|
+
|
144
|
+
s = ''
|
145
|
+
s << "#{distance_in_days} days," if distance_in_days > 0
|
146
|
+
s << "#{distance_in_hours} hours, " if distance_in_hours > 0
|
147
|
+
s << "#{distance_in_minutes} minutes, " if distance_in_minutes > 0
|
148
|
+
s << "#{distance_in_seconds} seconds"
|
149
|
+
s
|
150
|
+
end
|
151
|
+
|
152
|
+
def approximate_distance_of_time_in_words(from_time, to_time=Time.now, include_seconds=true)
|
153
|
+
from_time = from_time.to_time if from_time.respond_to?(:to_time)
|
154
|
+
to_time = to_time.to_time if to_time.respond_to?(:to_time)
|
155
|
+
distance_in_minutes = (((to_time - from_time).abs)/60).round
|
156
|
+
distance_in_seconds = ((to_time - from_time).abs).round
|
157
|
+
|
158
|
+
case distance_in_minutes
|
159
|
+
when 0..1
|
160
|
+
return (distance_in_minutes == 0) ? 'less than a minute' : '1 minute' unless include_seconds
|
161
|
+
case distance_in_seconds
|
162
|
+
when 0..4 then 'less than 5 seconds'
|
163
|
+
when 5..9 then 'less than 10 seconds'
|
164
|
+
when 10..19 then 'less than 20 seconds'
|
165
|
+
when 20..39 then 'half a minute'
|
166
|
+
when 40..59 then 'less than a minute'
|
167
|
+
else '1 minute'
|
168
|
+
end
|
169
|
+
when 2..44 then "#{distance_in_minutes} minutes"
|
170
|
+
when 45..89 then 'about 1 hour'
|
171
|
+
when 90..1439 then "about #{(distance_in_minutes.to_f / 60.0).round} hours"
|
172
|
+
when 1440..2879 then '1 day'
|
173
|
+
when 2880..43199 then "#{(distance_in_minutes / 1440).round} days"
|
174
|
+
when 43200..86399 then 'about 1 month'
|
175
|
+
when 86400..525959 then "#{(distance_in_minutes / 43200).round} months"
|
176
|
+
when 525960..1051919 then 'about 1 year'
|
177
|
+
else "over #{(distance_in_minutes / 525960).round} years"
|
178
|
+
end
|
179
|
+
end
|
60
180
|
end
|
61
181
|
end
|