activewarehouse-etl 0.6.1 → 0.7.0
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +29 -1
- data/LICENSE +7 -0
- data/README +58 -12
- data/Rakefile +2 -1
- data/lib/etl.rb +3 -0
- data/lib/etl/commands/etl.rb +35 -1
- data/lib/etl/control/control.rb +20 -9
- data/lib/etl/control/destination.rb +173 -12
- data/lib/etl/control/destination/database_destination.rb +2 -2
- data/lib/etl/control/destination/file_destination.rb +25 -2
- data/lib/etl/control/source.rb +29 -8
- data/lib/etl/control/source/database_source.rb +109 -24
- data/lib/etl/control/source/file_source.rb +29 -16
- data/lib/etl/engine.rb +164 -63
- data/lib/etl/execution.rb +19 -0
- data/lib/etl/execution/base.rb +9 -0
- data/lib/etl/execution/job.rb +7 -0
- data/lib/etl/execution/migration.rb +54 -0
- data/lib/etl/execution/record.rb +8 -0
- data/lib/etl/generator/surrogate_key_generator.rb +2 -0
- data/lib/etl/parser.rb +9 -0
- data/lib/etl/parser/parser.rb +5 -2
- data/lib/etl/parser/sax_parser.rb +22 -6
- data/lib/etl/processor.rb +8 -0
- data/lib/etl/processor/bulk_import_processor.rb +32 -4
- data/lib/etl/processor/check_exist_processor.rb +69 -0
- data/lib/etl/processor/check_unique_processor.rb +35 -0
- data/lib/etl/processor/copy_field_processor.rb +20 -4
- data/lib/etl/processor/processor.rb +3 -0
- data/lib/etl/processor/rename_processor.rb +24 -0
- data/lib/etl/processor/row_processor.rb +1 -1
- data/lib/etl/processor/sequence_processor.rb +23 -0
- data/lib/etl/processor/surrogate_key_processor.rb +31 -0
- data/lib/etl/processor/truncate_processor.rb +0 -2
- data/lib/etl/row.rb +17 -0
- data/lib/etl/screen/row_count_screen.rb +15 -0
- data/lib/etl/transform/block_transform.rb +13 -0
- data/lib/etl/transform/date_to_string_transform.rb +1 -1
- data/lib/etl/transform/decode_transform.rb +1 -1
- data/lib/etl/transform/default_transform.rb +6 -1
- data/lib/etl/transform/foreign_key_lookup_transform.rb +1 -1
- data/lib/etl/transform/hierarchy_lookup_transform.rb +1 -1
- data/lib/etl/transform/ordinalize_transform.rb +12 -0
- data/lib/etl/transform/sha1_transform.rb +0 -3
- data/lib/etl/transform/string_to_date_transform.rb +0 -3
- data/lib/etl/transform/string_to_datetime_transform.rb +0 -3
- data/lib/etl/transform/string_to_time_transform.rb +0 -3
- data/lib/etl/transform/transform.rb +20 -11
- data/lib/etl/transform/trim_transform.rb +26 -0
- data/lib/etl/transform/type_transform.rb +9 -1
- data/lib/etl/version.rb +2 -2
- metadata +21 -3
@@ -19,13 +19,13 @@ module ETL #:nodoc:
|
|
19
19
|
#
|
20
20
|
# Configuration options:
|
21
21
|
# * <tt>:database</tt>: The database name (REQUIRED)
|
22
|
-
# * <tt>:table
|
22
|
+
# * <tt>:table</tt>: The table to write to (REQUIRED)
|
23
23
|
# * <tt>:truncate</tt>: Set to true to truncate before writing (defaults to false)
|
24
24
|
# * <tt>:unique</tt>: Set to true to only insert unique records (defaults to false)
|
25
25
|
# * <tt>:adapter</tt>: The adapter to use (defaults to :mysql)
|
26
26
|
# * <tt>:username</tt>: The database username (defaults to 'root')
|
27
27
|
# * <tt>:password</tt>: The password to the database (defaults to nothing)
|
28
|
-
# * <tt>:host
|
28
|
+
# * <tt>:host</tt>: The host for the database (defaults to 'localhost')
|
29
29
|
# * <tt>:append_rows</tt>: Array of rows to append
|
30
30
|
#
|
31
31
|
# Mapping options:
|
@@ -1,3 +1,5 @@
|
|
1
|
+
# This source file contains the ETL::Control::FileDestination
|
2
|
+
|
1
3
|
module ETL #:nodoc:
|
2
4
|
module Control #:nodoc:
|
3
5
|
# File as the final destination.
|
@@ -60,6 +62,7 @@ module ETL #:nodoc:
|
|
60
62
|
def flush
|
61
63
|
#puts "Flushing buffer (#{file}) with #{buffer.length} rows"
|
62
64
|
buffer.flatten.each do |row|
|
65
|
+
#puts "row change type: #{row.change_type}"
|
63
66
|
# check to see if this row's compound key constraint already exists
|
64
67
|
# note that the compound key constraint may not utilize virtual fields
|
65
68
|
next unless row_allowed?(row)
|
@@ -68,11 +71,22 @@ module ETL #:nodoc:
|
|
68
71
|
add_virtuals!(row)
|
69
72
|
|
70
73
|
# collect all of the values using the order designated in the configuration
|
71
|
-
values = order.collect
|
74
|
+
values = order.collect do |name|
|
75
|
+
value = row[name]
|
76
|
+
case value
|
77
|
+
when Date, Time, DateTime
|
78
|
+
value.to_s(:db)
|
79
|
+
else
|
80
|
+
value.to_s
|
81
|
+
end
|
82
|
+
end
|
83
|
+
|
84
|
+
values.collect! { |v| v.gsub(separator, "\\#{separator}")}
|
85
|
+
values.collect! { |v| v.gsub(/\n|\r/, '')}
|
72
86
|
|
73
87
|
# enclose the value if required
|
74
88
|
if !enclose.nil?
|
75
|
-
values.collect! { |v| enclose + v.
|
89
|
+
values.collect! { |v| enclose + v.gsub(/(#{enclose})/, '\\\\\1') + enclose }
|
76
90
|
end
|
77
91
|
|
78
92
|
# write the values joined by the separator defined in the configuration
|
@@ -81,6 +95,7 @@ module ETL #:nodoc:
|
|
81
95
|
# write the end-of-line
|
82
96
|
f.write(eol)
|
83
97
|
end
|
98
|
+
f.flush
|
84
99
|
buffer.clear
|
85
100
|
#puts "After flush there are #{buffer.length} rows"
|
86
101
|
end
|
@@ -91,6 +106,14 @@ module ETL #:nodoc:
|
|
91
106
|
@f ||= open(file, mode)
|
92
107
|
end
|
93
108
|
|
109
|
+
def options
|
110
|
+
@options ||= {
|
111
|
+
:col_sep => separator,
|
112
|
+
:row_sep => eol,
|
113
|
+
:force_quotes => !enclose.nil?
|
114
|
+
}
|
115
|
+
end
|
116
|
+
|
94
117
|
# Get the appropriate mode to open the file stream
|
95
118
|
def mode
|
96
119
|
append ? 'a' : 'w'
|
data/lib/etl/control/source.rb
CHANGED
@@ -14,12 +14,14 @@ module ETL #:nodoc:
|
|
14
14
|
attr_accessor :definition
|
15
15
|
|
16
16
|
# Returns true if the source data should be stored locally for archival
|
17
|
+
# Default behavior will return true.
|
17
18
|
attr_accessor :store_locally
|
18
19
|
|
19
20
|
class << self
|
20
21
|
# Convert the name to a Source class.
|
21
22
|
#
|
22
|
-
# For example if name is :database then this will return a
|
23
|
+
# For example if name is :database then this will return a
|
24
|
+
# DatabaseSource class
|
23
25
|
def class_for_name(name)
|
24
26
|
ETL::Control.const_get("#{name.to_s.classify}Source")
|
25
27
|
end
|
@@ -31,7 +33,8 @@ module ETL #:nodoc:
|
|
31
33
|
# * <tt>definition</tt>: The source layout definition
|
32
34
|
#
|
33
35
|
# Configuration options:
|
34
|
-
# * <tt>:store_locally</tt>: Set to false to not store source data
|
36
|
+
# * <tt>:store_locally</tt>: Set to false to not store source data
|
37
|
+
# locally (defaults to true)
|
35
38
|
def initialize(control, configuration, definition)
|
36
39
|
@control = control
|
37
40
|
@configuration = configuration
|
@@ -40,6 +43,7 @@ module ETL #:nodoc:
|
|
40
43
|
@store_locally = configuration[:store_locally] || true
|
41
44
|
end
|
42
45
|
|
46
|
+
# Get an array of errors that occur during reading from the source
|
43
47
|
def errors
|
44
48
|
@errors ||= []
|
45
49
|
end
|
@@ -57,18 +61,35 @@ module ETL #:nodoc:
|
|
57
61
|
@local_base ||= 'source_data'
|
58
62
|
end
|
59
63
|
|
60
|
-
# The local directory for storing. This method must be overriden by
|
64
|
+
# The local directory for storing. This method must be overridden by
|
65
|
+
# subclasses
|
61
66
|
def local_directory
|
62
67
|
raise "local_directory method is abstract"
|
63
68
|
end
|
64
69
|
|
65
|
-
# Return the local file for storing the raw source data. Each call to
|
66
|
-
# result in a timestamped file, so you cannot expect
|
67
|
-
# the same file
|
68
|
-
|
70
|
+
# Return the local file for storing the raw source data. Each call to
|
71
|
+
# this method will result in a timestamped file, so you cannot expect
|
72
|
+
# to call it multiple times and reference the same file
|
73
|
+
#
|
74
|
+
# Optional sequence can be specified if there are multiple source files
|
75
|
+
def local_file(sequence=nil)
|
76
|
+
filename = timestamp.to_s
|
77
|
+
filename += sequence.to_s if sequence
|
78
|
+
|
69
79
|
local_dir = local_directory
|
70
80
|
FileUtils.mkdir_p(local_dir)
|
71
|
-
File.join(local_dir, "#{
|
81
|
+
File.join(local_dir, "#{filename}.csv")
|
82
|
+
end
|
83
|
+
|
84
|
+
# Get the local trigger file that is used to indicate that the file has
|
85
|
+
# been completely written
|
86
|
+
def local_file_trigger(file)
|
87
|
+
Pathname.new(file.to_s + '.trig')
|
88
|
+
end
|
89
|
+
|
90
|
+
# Return true if the source should read locally.
|
91
|
+
def read_locally
|
92
|
+
Engine.read_locally
|
72
93
|
end
|
73
94
|
|
74
95
|
end
|
@@ -22,10 +22,18 @@ module ETL #:nodoc:
|
|
22
22
|
# Other options:
|
23
23
|
# * <tt>:adapter</tt>: The adapter to use (defaults to :mysql)
|
24
24
|
# * <tt>:username</tt>: The database username (defaults to 'root')
|
25
|
-
# * <tt>:password</tt>: The password to the database (defaults to
|
26
|
-
#
|
27
|
-
# * <tt>:
|
28
|
-
#
|
25
|
+
# * <tt>:password</tt>: The password to the database (defaults to
|
26
|
+
# nothing)
|
27
|
+
# * <tt>:host</tt>: The host for the database (defaults to
|
28
|
+
# 'localhost')
|
29
|
+
# * <tt>:join</tt>: Optional join part for the query (ignored unless
|
30
|
+
# specified)
|
31
|
+
# * <tt>:select</tt>: Optional select part for the query (defaults to
|
32
|
+
# '*')
|
33
|
+
# * <tt>:order</tt>: Optional order part for the query (ignored unless
|
34
|
+
# specified)
|
35
|
+
# * <tt>:store_locally</tt>: Set to false to not store a copy of the
|
36
|
+
# source data locally in a flat file (defaults to true)
|
29
37
|
def initialize(control, configuration, definition)
|
30
38
|
super
|
31
39
|
connect
|
@@ -36,41 +44,117 @@ module ETL #:nodoc:
|
|
36
44
|
"#{host}/#{configuration[:database]}/#{configuration[:table]}"
|
37
45
|
end
|
38
46
|
|
39
|
-
# Get the local directory to use, which is a combination of the
|
40
|
-
# the db database name and the db table.
|
47
|
+
# Get the local directory to use, which is a combination of the
|
48
|
+
# local_base, the db hostname, the db database name and the db table.
|
41
49
|
def local_directory
|
42
50
|
File.join(local_base, host, configuration[:database], configuration[:table])
|
43
51
|
end
|
44
52
|
|
45
|
-
#
|
46
|
-
def
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
53
|
+
# Get the join part of the query, defaults to nil
|
54
|
+
def join
|
55
|
+
configuration[:join]
|
56
|
+
end
|
57
|
+
|
58
|
+
# Get the select part of the query, defaults to '*'
|
59
|
+
def select
|
60
|
+
configuration[:select] || '*'
|
61
|
+
end
|
62
|
+
|
63
|
+
# Get the order for the query, defaults to nil
|
64
|
+
def order
|
65
|
+
configuration[:order]
|
66
|
+
end
|
67
|
+
|
68
|
+
# Get the list of columns to read. This is defined in the source
|
69
|
+
# definition as either an Array or Hash
|
70
|
+
def columns
|
71
|
+
case definition
|
72
|
+
when Array
|
73
|
+
definition.collect(&:to_sym)
|
74
|
+
when Hash
|
75
|
+
definition.keys.collect(&:to_sym)
|
76
|
+
else
|
77
|
+
raise "Definition must be either an Array or a Hash"
|
78
|
+
end
|
79
|
+
end
|
80
|
+
|
81
|
+
# Returns each row from the source. If read_locally is specified then
|
82
|
+
# this method will attempt to read from the last stored local file.
|
83
|
+
# If no locally stored file exists or if the trigger file for the last
|
84
|
+
# locally stored file does not exist then this method will raise an
|
85
|
+
# error.
|
86
|
+
def each(&block)
|
87
|
+
if read_locally # Read from the last stored source
|
88
|
+
read_rows(&block)
|
89
|
+
else # Read from the original source
|
90
|
+
if store_locally
|
91
|
+
write_local
|
92
|
+
read_rows(&block)
|
93
|
+
else
|
94
|
+
connection.select_all(query).each do |row|
|
95
|
+
row = Row.new(row.symbolize_keys)
|
96
|
+
yield row
|
56
97
|
end
|
57
98
|
end
|
99
|
+
end
|
100
|
+
end
|
101
|
+
|
102
|
+
private
|
103
|
+
# Read rows from the local cache
|
104
|
+
def read_rows
|
105
|
+
file = local_file
|
106
|
+
|
107
|
+
File.exists?(file) or raise "Local cache file not found"
|
108
|
+
File.exists?(local_file_trigger(file)) or raise "Local cache trigger file not found"
|
109
|
+
|
110
|
+
t = Benchmark.realtime do
|
58
111
|
FasterCSV.open(file, :headers => true).each do |row|
|
59
|
-
result_row =
|
112
|
+
result_row = ETL::Row.new
|
60
113
|
row.each do |header, field|
|
61
114
|
result_row[header.to_sym] = field
|
62
115
|
end
|
63
|
-
#puts "yielding #{result_row.inspect}"
|
64
116
|
yield result_row
|
65
117
|
end
|
66
|
-
|
67
|
-
|
68
|
-
|
118
|
+
end
|
119
|
+
ETL::Engine.average_rows_per_second = ETL::Engine.rows_read / t
|
120
|
+
end
|
121
|
+
|
122
|
+
# Write rows to the local cache
|
123
|
+
def write_local
|
124
|
+
file = local_file
|
125
|
+
|
126
|
+
lines = 0
|
127
|
+
t = Benchmark.realtime do
|
128
|
+
FasterCSV.open(file, 'w') do |f|
|
129
|
+
f << columns
|
130
|
+
connection.select_all(query).each do |row|
|
131
|
+
f << columns.collect { |column| row[column.to_s] }
|
132
|
+
lines += 1
|
133
|
+
end
|
69
134
|
end
|
135
|
+
File.open(local_file_trigger(file), 'w') {|f| }
|
70
136
|
end
|
137
|
+
ETL::Engine.logger.info "Stored locally in #{t}s (avg: #{lines/t} lines/sec)"
|
138
|
+
end
|
139
|
+
|
140
|
+
# Get the query to use
|
141
|
+
def query
|
142
|
+
return @query if @query
|
143
|
+
q = "SELECT #{select} FROM #{configuration[:table]}"
|
144
|
+
q << " #{join}" if join
|
145
|
+
q << " ORDER BY #{order}" if order
|
146
|
+
if ETL::Engine.limit || ETL::Engine.offset
|
147
|
+
options = {}
|
148
|
+
options[:limit] = ETL::Engine.limit if ETL::Engine.limit
|
149
|
+
options[:offset] = ETL::Engine.offset if ETL::Engine.offset
|
150
|
+
connection.add_limit_offset!(q, options)
|
151
|
+
end
|
152
|
+
#q << " LIMIT #{ETL::Engine.limit}" unless ETL::Engine.limit.nil?
|
153
|
+
q = q.gsub(/\n/,' ')
|
154
|
+
ETL::Engine.logger.info "Query: #{q}"
|
155
|
+
@query = q
|
71
156
|
end
|
72
157
|
|
73
|
-
private
|
74
158
|
# Get the database connection to use
|
75
159
|
def connection
|
76
160
|
ETL::Source.connection
|
@@ -99,7 +183,8 @@ module ETL #:nodoc:
|
|
99
183
|
# Options:
|
100
184
|
# * <tt>:adapter</tt>: The adapter to use (defaults to :mysql)
|
101
185
|
# * <tt>:username</tt>: The database username (defaults to 'root')
|
102
|
-
# * <tt>:password</tt>: The password to the database (defaults
|
186
|
+
# * <tt>:password</tt>: The password to the database (defaults
|
187
|
+
# to nothing)
|
103
188
|
# * <tt>:host</tt>: The host for the database (defaults to 'localhost')
|
104
189
|
def connect
|
105
190
|
ETL::Source.establish_connection(
|
@@ -8,16 +8,21 @@ module ETL #:nodoc:
|
|
8
8
|
# Accessor for the underlying parser
|
9
9
|
attr_accessor :parser
|
10
10
|
|
11
|
+
# The source file
|
12
|
+
attr_accessor :file
|
13
|
+
|
11
14
|
# Initialize the source
|
12
15
|
#
|
13
16
|
# Configuration options:
|
14
17
|
# * <tt>:file</tt>: The source file
|
15
|
-
# * <tt>:parser</tt>: One of the following: a parser name as a String or
|
16
|
-
# from Parser, a Hash with :name and
|
17
|
-
#
|
18
|
-
# on
|
19
|
-
#
|
20
|
-
# * <tt>:
|
18
|
+
# * <tt>:parser</tt>: One of the following: a parser name as a String or
|
19
|
+
# symbol, a class which extends from Parser, a Hash with :name and
|
20
|
+
# optionally an :options key. Whether or not the parser uses the
|
21
|
+
# options is dependent on which parser is used. See the documentation
|
22
|
+
# for each parser for information on what options it accepts.
|
23
|
+
# * <tt>:skip_lines</tt>: The number of lines to skip (defaults to 0)
|
24
|
+
# * <tt>:store_locally</tt>: Set to false to not store a copy of the
|
25
|
+
# source data locally for archival
|
21
26
|
def initialize(control, configuration, definition)
|
22
27
|
super
|
23
28
|
configure
|
@@ -25,18 +30,21 @@ module ETL #:nodoc:
|
|
25
30
|
|
26
31
|
# Get a String identifier for the source
|
27
32
|
def to_s
|
28
|
-
|
33
|
+
file
|
29
34
|
end
|
30
35
|
|
31
36
|
# Get the local storage directory
|
32
37
|
def local_directory
|
33
|
-
File.join(local_base, File.basename(
|
38
|
+
File.join(local_base, File.basename(file, File.extname(file)))
|
34
39
|
end
|
35
40
|
|
36
41
|
# Returns each row from the source
|
37
42
|
def each
|
38
43
|
copy_sources if store_locally
|
39
44
|
@parser.each do |row|
|
45
|
+
# TODO skip rows if offset is defined
|
46
|
+
# TODO stop processing if limit is reached
|
47
|
+
row = ETL::Row[row]
|
40
48
|
yield row
|
41
49
|
end
|
42
50
|
end
|
@@ -44,29 +52,34 @@ module ETL #:nodoc:
|
|
44
52
|
private
|
45
53
|
# Copy source data to a local directory structure
|
46
54
|
def copy_sources
|
47
|
-
|
55
|
+
sequence = 0
|
56
|
+
path = Pathname.new(file)
|
48
57
|
path = path.absolute? ? path : Pathname.new(File.dirname(control.file)) + path
|
49
58
|
Pathname.glob(path).each do |f|
|
50
59
|
next if f.directory?
|
51
|
-
|
60
|
+
lf = local_file(sequence)
|
61
|
+
FileUtils.cp(f, lf)
|
62
|
+
File.open(local_file_trigger(lf), 'w') {|f| }
|
63
|
+
sequence += 1
|
52
64
|
end
|
53
65
|
end
|
54
66
|
|
55
67
|
# Configure the source
|
56
68
|
def configure
|
57
|
-
|
69
|
+
@file = configuration[:file]
|
70
|
+
case configuration[:parser]
|
58
71
|
when Class
|
59
|
-
@parser =
|
72
|
+
@parser = configuration[:parser].new(self)
|
60
73
|
when String, Symbol
|
61
|
-
@parser = ETL::Parser::Parser.class_for_name(
|
74
|
+
@parser = ETL::Parser::Parser.class_for_name(configuration[:parser]).new(self)
|
62
75
|
when Hash
|
63
|
-
name =
|
64
|
-
options =
|
76
|
+
name = configuration[:parser][:name]
|
77
|
+
options = configuration[:parser][:options]
|
65
78
|
@parser = ETL::Parser::Parser.class_for_name(name).new(self, options)
|
66
79
|
else
|
67
80
|
raise ControlError, "Configuration option :parser must be a Class, String or Symbol"
|
68
81
|
end
|
69
|
-
@skip_lines =
|
82
|
+
@skip_lines = configuration[:skip_lines] ||= 0
|
70
83
|
end
|
71
84
|
end
|
72
85
|
end
|
data/lib/etl/engine.rb
CHANGED
@@ -10,6 +10,24 @@ module ETL #:nodoc:
|
|
10
10
|
# The main ETL engine class
|
11
11
|
class Engine
|
12
12
|
class << self
|
13
|
+
# Initialization that is run when a job is executed.
|
14
|
+
def init(options={})
|
15
|
+
unless @initialized
|
16
|
+
@limit = options[:limit]
|
17
|
+
@offset = options[:offset]
|
18
|
+
@log_write_mode = 'w' if options[:newlog]
|
19
|
+
@skip_bulk_import = options[:skip_bulk_import]
|
20
|
+
@read_locally = options[:read_locally]
|
21
|
+
options[:config] ||= 'database.yml'
|
22
|
+
database_configuration = YAML::load(ERB.new(IO.read(options[:config])).result + "\n")
|
23
|
+
ActiveRecord::Base.configurations = database_configuration
|
24
|
+
require 'etl/execution'
|
25
|
+
ETL::Execution::Base.establish_connection :etl_execution
|
26
|
+
ETL::Execution::Execution.migrate
|
27
|
+
@initialized = true
|
28
|
+
end
|
29
|
+
end
|
30
|
+
|
13
31
|
# Process the specified control file. Acceptable values for control_file are
|
14
32
|
# * Path to a file
|
15
33
|
# * File object
|
@@ -20,6 +38,12 @@ module ETL #:nodoc:
|
|
20
38
|
|
21
39
|
attr_accessor :timestamped_log
|
22
40
|
|
41
|
+
# Accessor for the log write mode. Default is 'a' for append.
|
42
|
+
attr_accessor :log_write_mode
|
43
|
+
def log_write_mode
|
44
|
+
@log_write_mode ||= 'a'
|
45
|
+
end
|
46
|
+
|
23
47
|
# A logger for the engine
|
24
48
|
attr_accessor :logger
|
25
49
|
|
@@ -28,7 +52,7 @@ module ETL #:nodoc:
|
|
28
52
|
if timestamped_log
|
29
53
|
@logger = Logger.new("etl_#{timestamp}.log")
|
30
54
|
else
|
31
|
-
@logger = Logger.new(File.open('etl.log',
|
55
|
+
@logger = Logger.new(File.open('etl.log', log_write_mode))
|
32
56
|
end
|
33
57
|
@logger.level = Logger::ERROR
|
34
58
|
@logger.formatter = Logger::Formatter.new
|
@@ -50,21 +74,43 @@ module ETL #:nodoc:
|
|
50
74
|
# The current destination
|
51
75
|
attr_accessor :current_destination
|
52
76
|
|
53
|
-
# Set to true to activate realtime activity. This will cause certain
|
54
|
-
# to be printed to STDOUT
|
77
|
+
# Set to true to activate realtime activity. This will cause certain
|
78
|
+
# information messages to be printed to STDOUT
|
55
79
|
attr_accessor :realtime_activity
|
56
80
|
|
81
|
+
# Accessor for the total number of rows read from sources
|
57
82
|
attr_accessor :rows_read
|
58
|
-
|
59
83
|
def rows_read
|
60
84
|
@rows_read ||= 0
|
61
85
|
end
|
62
86
|
|
87
|
+
# Accessor for the total number of rows processed
|
63
88
|
attr_accessor :rows_written
|
64
|
-
|
65
89
|
def rows_written
|
66
90
|
@rows_written ||= 0
|
67
91
|
end
|
92
|
+
|
93
|
+
# Access the current ETL::Execution::Job instance
|
94
|
+
attr_accessor :job
|
95
|
+
|
96
|
+
# The limit on rows to load from the source, useful for testing the ETL
|
97
|
+
# process prior to executing the entire batch. Default value is nil and
|
98
|
+
# indicates that there is no limit
|
99
|
+
attr_accessor :limit
|
100
|
+
|
101
|
+
# The offset for the source to begin at, useful for testing the ETL
|
102
|
+
# process prior to executing the entire batch. Default value is nil and
|
103
|
+
# indicates that there is no offset
|
104
|
+
attr_accessor :offset
|
105
|
+
|
106
|
+
# Set to true to skip all bulk importing
|
107
|
+
attr_accessor :skip_bulk_import
|
108
|
+
|
109
|
+
# Set to true to read locally from the last source cache files
|
110
|
+
attr_accessor :read_locally
|
111
|
+
|
112
|
+
# Accessor for the average rows per second processed
|
113
|
+
attr_accessor :average_rows_per_second
|
68
114
|
end
|
69
115
|
|
70
116
|
# Say the specified message, with a newline
|
@@ -89,6 +135,22 @@ module ETL #:nodoc:
|
|
89
135
|
def errors
|
90
136
|
@errors ||= []
|
91
137
|
end
|
138
|
+
|
139
|
+
# Get a Hash of benchmark values where each value represents the total
|
140
|
+
# amount of time in seconds spent processing in that portion of the ETL
|
141
|
+
# pipeline. Keys include:
|
142
|
+
# * <tt>:transforms</tt>
|
143
|
+
# * <tt>:after_reads</tt>
|
144
|
+
# * <tt>:before_writes</tt>
|
145
|
+
# * <tt>:writes</tt>
|
146
|
+
def benchmarks
|
147
|
+
@benchmarks ||= {
|
148
|
+
:transforms => 0,
|
149
|
+
:after_reads => 0,
|
150
|
+
:before_writes => 0,
|
151
|
+
:writes => 0,
|
152
|
+
}
|
153
|
+
end
|
92
154
|
|
93
155
|
# Process a control file or object. Acceptable values for control are:
|
94
156
|
# * Path to a file
|
@@ -97,6 +159,11 @@ module ETL #:nodoc:
|
|
97
159
|
def process(control)
|
98
160
|
control = ETL::Control::Control.resolve(control)
|
99
161
|
|
162
|
+
ETL::Engine.job = ETL::Execution::Job.create!(
|
163
|
+
:control_file => control.file,
|
164
|
+
:status => 'executing'
|
165
|
+
)
|
166
|
+
|
100
167
|
execute_dependencies(control)
|
101
168
|
|
102
169
|
start_time = Time.now
|
@@ -108,11 +175,22 @@ module ETL #:nodoc:
|
|
108
175
|
sources = control.sources
|
109
176
|
destinations = control.destinations
|
110
177
|
|
178
|
+
say "Skipping bulk import" if Engine.skip_bulk_import
|
179
|
+
|
111
180
|
sources.each do |source|
|
112
181
|
Engine.current_source = source
|
113
182
|
Engine.logger.debug "Processing source #{source}"
|
114
183
|
say "Source: #{source}"
|
184
|
+
say "Limiting enabled: #{Engine.limit}" if Engine.limit != nil
|
185
|
+
say "Offset enabled: #{Engine.offset}" if Engine.offset != nil
|
115
186
|
source.each_with_index do |row, index|
|
187
|
+
# Break out of the row loop if the +Engine.limit+ is specified and
|
188
|
+
# the number of rows read has reached that value.
|
189
|
+
if Engine.limit != nil && Engine.rows_read >= Engine.limit
|
190
|
+
puts "Reached limit of #{Engine.limit}"
|
191
|
+
break
|
192
|
+
end
|
193
|
+
|
116
194
|
Engine.logger.debug "Row #{index}: #{row.inspect}"
|
117
195
|
Engine.rows_read += 1
|
118
196
|
Engine.current_source_row = index + 1
|
@@ -120,80 +198,85 @@ module ETL #:nodoc:
|
|
120
198
|
say_without_newline "."
|
121
199
|
end
|
122
200
|
|
123
|
-
# At this point a single row may be turned into multiple rows via row
|
124
|
-
# all code after this line should work with the array of
|
125
|
-
# single row
|
201
|
+
# At this point a single row may be turned into multiple rows via row
|
202
|
+
# processors; all code after this line should work with the array of
|
203
|
+
# rows rather than the single row
|
126
204
|
rows = [row]
|
127
205
|
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
206
|
+
benchmarks[:after_reads] += Benchmark.realtime do
|
207
|
+
begin
|
208
|
+
Engine.logger.debug "Processing after read"
|
209
|
+
control.after_read_processors.each do |processor|
|
210
|
+
processed_rows = []
|
211
|
+
rows.each do |row|
|
212
|
+
processed_rows << processor.process(row)
|
213
|
+
end
|
214
|
+
rows = processed_rows.flatten
|
134
215
|
end
|
135
|
-
|
216
|
+
rescue => e
|
217
|
+
msg = "Error processing rows after read from #{Engine.current_source} on line #{Engine.current_source_row}: #{e}"
|
218
|
+
errors << msg
|
219
|
+
Engine.logger.error(msg)
|
220
|
+
exceeded_error_threshold?(control) ? break : next
|
136
221
|
end
|
137
|
-
rescue => e
|
138
|
-
msg = "Error processing rows after read from #{Engine.current_source} on line #{Engine.current_source_row}: #{e}"
|
139
|
-
errors << msg
|
140
|
-
Engine.logger.error(msg)
|
141
|
-
exceeded_error_threshold?(control) ? break : next
|
142
222
|
end
|
143
223
|
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
224
|
+
benchmarks[:transforms] += Benchmark.realtime do
|
225
|
+
begin
|
226
|
+
# execute transforms
|
227
|
+
Engine.logger.debug "Executing transforms"
|
228
|
+
rows.each do |row|
|
229
|
+
control.transforms.each do |transform|
|
230
|
+
name = transform.name.to_sym
|
231
|
+
row[name] = transform.transform(name, row[name], row)
|
232
|
+
end
|
153
233
|
end
|
234
|
+
rescue => e
|
235
|
+
msg = "Error transforming from #{Engine.current_source} on line #{Engine.current_source_row}: #{e}"
|
236
|
+
errors << msg
|
237
|
+
Engine.logger.error(msg)
|
238
|
+
e.backtrace.each { |line| Engine.logger.error(line) }
|
239
|
+
exceeded_error_threshold?(control) ? break : next
|
154
240
|
end
|
155
|
-
rescue => e
|
156
|
-
msg = "Error transforming from #{Engine.current_source} on line #{Engine.current_source_row}: #{e}"
|
157
|
-
errors << msg
|
158
|
-
Engine.logger.error(msg)
|
159
|
-
e.backtrace.each { |line| Engine.logger.error(line) }
|
160
|
-
exceeded_error_threshold?(control) ? break : next
|
161
241
|
end
|
162
242
|
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
243
|
+
benchmarks[:before_writes] += Benchmark.realtime do
|
244
|
+
begin
|
245
|
+
# execute row-level "before write" processing
|
246
|
+
Engine.logger.debug "Processing before write"
|
247
|
+
control.before_write_processors.each do |processor|
|
248
|
+
processed_rows = []
|
249
|
+
rows.each do |row|
|
250
|
+
processed_rows << processor.process(row)
|
251
|
+
end
|
252
|
+
rows = processed_rows.flatten.compact
|
171
253
|
end
|
172
|
-
|
254
|
+
rescue => e
|
255
|
+
msg = "Error processing rows before write from #{Engine.current_source} on line #{Engine.current_source_row}: #{e}"
|
256
|
+
errors << msg
|
257
|
+
Engine.logger.error(msg)
|
258
|
+
e.backtrace.each { |line| Engine.logger.error(line) }
|
259
|
+
exceeded_error_threshold?(control) ? break : next
|
173
260
|
end
|
174
|
-
rescue => e
|
175
|
-
msg = "Error processing rows before write from #{Engine.current_source} on line #{Engine.current_source_row}: #{e}"
|
176
|
-
errors << msg
|
177
|
-
Engine.logger.error(msg)
|
178
|
-
e.backtrace.each { |line| Engine.logger.error(line) }
|
179
|
-
exceeded_error_threshold?(control) ? break : next
|
180
261
|
end
|
181
262
|
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
263
|
+
benchmarks[:writes] += Benchmark.realtime do
|
264
|
+
begin
|
265
|
+
# write the row to the destination
|
266
|
+
destinations.each_with_index do |destination, index|
|
267
|
+
Engine.current_destination = destination
|
268
|
+
rows.each do |row|
|
269
|
+
destination.write(row)
|
270
|
+
Engine.rows_written += 1 if index == 0
|
271
|
+
end
|
189
272
|
end
|
273
|
+
rescue => e
|
274
|
+
msg = "Error writing to #{Engine.current_destination}: #{e}"
|
275
|
+
errors << msg
|
276
|
+
Engine.logger.error msg
|
277
|
+
e.backtrace.each { |line| Engine.logger.error(line) }
|
278
|
+
exceeded_error_threshold?(control) ? break : next
|
190
279
|
end
|
191
|
-
rescue => e
|
192
|
-
msg = "Error writing to #{Engine.current_destination}: #{e}"
|
193
|
-
errors << msg
|
194
|
-
Engine.logger.error msg
|
195
|
-
e.backtrace.each { |line| Engine.logger.error(line) }
|
196
|
-
break if exceeded_error_threshold?(control)
|
197
280
|
end
|
198
281
|
end
|
199
282
|
|
@@ -220,6 +303,20 @@ module ETL #:nodoc:
|
|
220
303
|
say "Wrote #{Engine.rows_written} lines to destinations"
|
221
304
|
end
|
222
305
|
say "Completed #{control.file} in #{distance_of_time_in_words(start_time)} with #{errors.length} errors."
|
306
|
+
say "Processing average: #{Engine.average_rows_per_second} rows/sec)"
|
307
|
+
|
308
|
+
say "Avg after_reads: #{Engine.rows_read/benchmarks[:after_reads]} rows/sec" if benchmarks[:after_reads] > 0
|
309
|
+
say "Avg before_writes: #{Engine.rows_read/benchmarks[:before_writes]} rows/sec" if benchmarks[:before_writes] > 0
|
310
|
+
say "Avg transforms: #{Engine.rows_read/benchmarks[:transforms]} rows/sec" if benchmarks[:transforms] > 0
|
311
|
+
say "Avg writes: #{Engine.rows_read/benchmarks[:writes]} rows/sec" if benchmarks[:writes] > 0
|
312
|
+
|
313
|
+
# ETL::Transform::Transform.benchmarks.each do |klass, t|
|
314
|
+
# say "Avg #{klass}: #{Engine.rows_read/t} rows/sec"
|
315
|
+
# end
|
316
|
+
|
317
|
+
ETL::Engine.job.completed_at = Time.now
|
318
|
+
ETL::Engine.job.status = (errors.length > 0 ? 'completed with errors' : 'completed')
|
319
|
+
ETL::Engine.job.save!
|
223
320
|
end
|
224
321
|
|
225
322
|
private
|
@@ -285,6 +382,10 @@ module ETL #:nodoc:
|
|
285
382
|
s
|
286
383
|
end
|
287
384
|
|
385
|
+
# Get the approximate distance of time in words from the given from_time
|
386
|
+
# to the given to_time. If to_time is not specified then it is set
|
387
|
+
# to Time.now. By default seconds are included...set the include_seconds
|
388
|
+
# argument to false to disable the seconds.
|
288
389
|
def approximate_distance_of_time_in_words(from_time, to_time=Time.now, include_seconds=true)
|
289
390
|
from_time = from_time.to_time if from_time.respond_to?(:to_time)
|
290
391
|
to_time = to_time.to_time if to_time.respond_to?(:to_time)
|