activewarehouse-etl 0.6.1 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +29 -1
- data/LICENSE +7 -0
- data/README +58 -12
- data/Rakefile +2 -1
- data/lib/etl.rb +3 -0
- data/lib/etl/commands/etl.rb +35 -1
- data/lib/etl/control/control.rb +20 -9
- data/lib/etl/control/destination.rb +173 -12
- data/lib/etl/control/destination/database_destination.rb +2 -2
- data/lib/etl/control/destination/file_destination.rb +25 -2
- data/lib/etl/control/source.rb +29 -8
- data/lib/etl/control/source/database_source.rb +109 -24
- data/lib/etl/control/source/file_source.rb +29 -16
- data/lib/etl/engine.rb +164 -63
- data/lib/etl/execution.rb +19 -0
- data/lib/etl/execution/base.rb +9 -0
- data/lib/etl/execution/job.rb +7 -0
- data/lib/etl/execution/migration.rb +54 -0
- data/lib/etl/execution/record.rb +8 -0
- data/lib/etl/generator/surrogate_key_generator.rb +2 -0
- data/lib/etl/parser.rb +9 -0
- data/lib/etl/parser/parser.rb +5 -2
- data/lib/etl/parser/sax_parser.rb +22 -6
- data/lib/etl/processor.rb +8 -0
- data/lib/etl/processor/bulk_import_processor.rb +32 -4
- data/lib/etl/processor/check_exist_processor.rb +69 -0
- data/lib/etl/processor/check_unique_processor.rb +35 -0
- data/lib/etl/processor/copy_field_processor.rb +20 -4
- data/lib/etl/processor/processor.rb +3 -0
- data/lib/etl/processor/rename_processor.rb +24 -0
- data/lib/etl/processor/row_processor.rb +1 -1
- data/lib/etl/processor/sequence_processor.rb +23 -0
- data/lib/etl/processor/surrogate_key_processor.rb +31 -0
- data/lib/etl/processor/truncate_processor.rb +0 -2
- data/lib/etl/row.rb +17 -0
- data/lib/etl/screen/row_count_screen.rb +15 -0
- data/lib/etl/transform/block_transform.rb +13 -0
- data/lib/etl/transform/date_to_string_transform.rb +1 -1
- data/lib/etl/transform/decode_transform.rb +1 -1
- data/lib/etl/transform/default_transform.rb +6 -1
- data/lib/etl/transform/foreign_key_lookup_transform.rb +1 -1
- data/lib/etl/transform/hierarchy_lookup_transform.rb +1 -1
- data/lib/etl/transform/ordinalize_transform.rb +12 -0
- data/lib/etl/transform/sha1_transform.rb +0 -3
- data/lib/etl/transform/string_to_date_transform.rb +0 -3
- data/lib/etl/transform/string_to_datetime_transform.rb +0 -3
- data/lib/etl/transform/string_to_time_transform.rb +0 -3
- data/lib/etl/transform/transform.rb +20 -11
- data/lib/etl/transform/trim_transform.rb +26 -0
- data/lib/etl/transform/type_transform.rb +9 -1
- data/lib/etl/version.rb +2 -2
- metadata +21 -3
@@ -19,13 +19,13 @@ module ETL #:nodoc:
|
|
19
19
|
#
|
20
20
|
# Configuration options:
|
21
21
|
# * <tt>:database</tt>: The database name (REQUIRED)
|
22
|
-
# * <tt>:table
|
22
|
+
# * <tt>:table</tt>: The table to write to (REQUIRED)
|
23
23
|
# * <tt>:truncate</tt>: Set to true to truncate before writing (defaults to false)
|
24
24
|
# * <tt>:unique</tt>: Set to true to only insert unique records (defaults to false)
|
25
25
|
# * <tt>:adapter</tt>: The adapter to use (defaults to :mysql)
|
26
26
|
# * <tt>:username</tt>: The database username (defaults to 'root')
|
27
27
|
# * <tt>:password</tt>: The password to the database (defaults to nothing)
|
28
|
-
# * <tt>:host
|
28
|
+
# * <tt>:host</tt>: The host for the database (defaults to 'localhost')
|
29
29
|
# * <tt>:append_rows</tt>: Array of rows to append
|
30
30
|
#
|
31
31
|
# Mapping options:
|
@@ -1,3 +1,5 @@
|
|
1
|
+
# This source file contains the ETL::Control::FileDestination
|
2
|
+
|
1
3
|
module ETL #:nodoc:
|
2
4
|
module Control #:nodoc:
|
3
5
|
# File as the final destination.
|
@@ -60,6 +62,7 @@ module ETL #:nodoc:
|
|
60
62
|
def flush
|
61
63
|
#puts "Flushing buffer (#{file}) with #{buffer.length} rows"
|
62
64
|
buffer.flatten.each do |row|
|
65
|
+
#puts "row change type: #{row.change_type}"
|
63
66
|
# check to see if this row's compound key constraint already exists
|
64
67
|
# note that the compound key constraint may not utilize virtual fields
|
65
68
|
next unless row_allowed?(row)
|
@@ -68,11 +71,22 @@ module ETL #:nodoc:
|
|
68
71
|
add_virtuals!(row)
|
69
72
|
|
70
73
|
# collect all of the values using the order designated in the configuration
|
71
|
-
values = order.collect
|
74
|
+
values = order.collect do |name|
|
75
|
+
value = row[name]
|
76
|
+
case value
|
77
|
+
when Date, Time, DateTime
|
78
|
+
value.to_s(:db)
|
79
|
+
else
|
80
|
+
value.to_s
|
81
|
+
end
|
82
|
+
end
|
83
|
+
|
84
|
+
values.collect! { |v| v.gsub(separator, "\\#{separator}")}
|
85
|
+
values.collect! { |v| v.gsub(/\n|\r/, '')}
|
72
86
|
|
73
87
|
# enclose the value if required
|
74
88
|
if !enclose.nil?
|
75
|
-
values.collect! { |v| enclose + v.
|
89
|
+
values.collect! { |v| enclose + v.gsub(/(#{enclose})/, '\\\\\1') + enclose }
|
76
90
|
end
|
77
91
|
|
78
92
|
# write the values joined by the separator defined in the configuration
|
@@ -81,6 +95,7 @@ module ETL #:nodoc:
|
|
81
95
|
# write the end-of-line
|
82
96
|
f.write(eol)
|
83
97
|
end
|
98
|
+
f.flush
|
84
99
|
buffer.clear
|
85
100
|
#puts "After flush there are #{buffer.length} rows"
|
86
101
|
end
|
@@ -91,6 +106,14 @@ module ETL #:nodoc:
|
|
91
106
|
@f ||= open(file, mode)
|
92
107
|
end
|
93
108
|
|
109
|
+
def options
|
110
|
+
@options ||= {
|
111
|
+
:col_sep => separator,
|
112
|
+
:row_sep => eol,
|
113
|
+
:force_quotes => !enclose.nil?
|
114
|
+
}
|
115
|
+
end
|
116
|
+
|
94
117
|
# Get the appropriate mode to open the file stream
|
95
118
|
def mode
|
96
119
|
append ? 'a' : 'w'
|
data/lib/etl/control/source.rb
CHANGED
@@ -14,12 +14,14 @@ module ETL #:nodoc:
|
|
14
14
|
attr_accessor :definition
|
15
15
|
|
16
16
|
# Returns true if the source data should be stored locally for archival
|
17
|
+
# Default behavior will return true.
|
17
18
|
attr_accessor :store_locally
|
18
19
|
|
19
20
|
class << self
|
20
21
|
# Convert the name to a Source class.
|
21
22
|
#
|
22
|
-
# For example if name is :database then this will return a
|
23
|
+
# For example if name is :database then this will return a
|
24
|
+
# DatabaseSource class
|
23
25
|
def class_for_name(name)
|
24
26
|
ETL::Control.const_get("#{name.to_s.classify}Source")
|
25
27
|
end
|
@@ -31,7 +33,8 @@ module ETL #:nodoc:
|
|
31
33
|
# * <tt>definition</tt>: The source layout definition
|
32
34
|
#
|
33
35
|
# Configuration options:
|
34
|
-
# * <tt>:store_locally</tt>: Set to false to not store source data
|
36
|
+
# * <tt>:store_locally</tt>: Set to false to not store source data
|
37
|
+
# locally (defaults to true)
|
35
38
|
def initialize(control, configuration, definition)
|
36
39
|
@control = control
|
37
40
|
@configuration = configuration
|
@@ -40,6 +43,7 @@ module ETL #:nodoc:
|
|
40
43
|
@store_locally = configuration[:store_locally] || true
|
41
44
|
end
|
42
45
|
|
46
|
+
# Get an array of errors that occur during reading from the source
|
43
47
|
def errors
|
44
48
|
@errors ||= []
|
45
49
|
end
|
@@ -57,18 +61,35 @@ module ETL #:nodoc:
|
|
57
61
|
@local_base ||= 'source_data'
|
58
62
|
end
|
59
63
|
|
60
|
-
# The local directory for storing. This method must be overriden by
|
64
|
+
# The local directory for storing. This method must be overridden by
|
65
|
+
# subclasses
|
61
66
|
def local_directory
|
62
67
|
raise "local_directory method is abstract"
|
63
68
|
end
|
64
69
|
|
65
|
-
# Return the local file for storing the raw source data. Each call to
|
66
|
-
# result in a timestamped file, so you cannot expect
|
67
|
-
# the same file
|
68
|
-
|
70
|
+
# Return the local file for storing the raw source data. Each call to
|
71
|
+
# this method will result in a timestamped file, so you cannot expect
|
72
|
+
# to call it multiple times and reference the same file
|
73
|
+
#
|
74
|
+
# Optional sequence can be specified if there are multiple source files
|
75
|
+
def local_file(sequence=nil)
|
76
|
+
filename = timestamp.to_s
|
77
|
+
filename += sequence.to_s if sequence
|
78
|
+
|
69
79
|
local_dir = local_directory
|
70
80
|
FileUtils.mkdir_p(local_dir)
|
71
|
-
File.join(local_dir, "#{
|
81
|
+
File.join(local_dir, "#{filename}.csv")
|
82
|
+
end
|
83
|
+
|
84
|
+
# Get the local trigger file that is used to indicate that the file has
|
85
|
+
# been completely written
|
86
|
+
def local_file_trigger(file)
|
87
|
+
Pathname.new(file.to_s + '.trig')
|
88
|
+
end
|
89
|
+
|
90
|
+
# Return true if the source should read locally.
|
91
|
+
def read_locally
|
92
|
+
Engine.read_locally
|
72
93
|
end
|
73
94
|
|
74
95
|
end
|
@@ -22,10 +22,18 @@ module ETL #:nodoc:
|
|
22
22
|
# Other options:
|
23
23
|
# * <tt>:adapter</tt>: The adapter to use (defaults to :mysql)
|
24
24
|
# * <tt>:username</tt>: The database username (defaults to 'root')
|
25
|
-
# * <tt>:password</tt>: The password to the database (defaults to
|
26
|
-
#
|
27
|
-
# * <tt>:
|
28
|
-
#
|
25
|
+
# * <tt>:password</tt>: The password to the database (defaults to
|
26
|
+
# nothing)
|
27
|
+
# * <tt>:host</tt>: The host for the database (defaults to
|
28
|
+
# 'localhost')
|
29
|
+
# * <tt>:join</tt>: Optional join part for the query (ignored unless
|
30
|
+
# specified)
|
31
|
+
# * <tt>:select</tt>: Optional select part for the query (defaults to
|
32
|
+
# '*')
|
33
|
+
# * <tt>:order</tt>: Optional order part for the query (ignored unless
|
34
|
+
# specified)
|
35
|
+
# * <tt>:store_locally</tt>: Set to false to not store a copy of the
|
36
|
+
# source data locally in a flat file (defaults to true)
|
29
37
|
def initialize(control, configuration, definition)
|
30
38
|
super
|
31
39
|
connect
|
@@ -36,41 +44,117 @@ module ETL #:nodoc:
|
|
36
44
|
"#{host}/#{configuration[:database]}/#{configuration[:table]}"
|
37
45
|
end
|
38
46
|
|
39
|
-
# Get the local directory to use, which is a combination of the
|
40
|
-
# the db database name and the db table.
|
47
|
+
# Get the local directory to use, which is a combination of the
|
48
|
+
# local_base, the db hostname, the db database name and the db table.
|
41
49
|
def local_directory
|
42
50
|
File.join(local_base, host, configuration[:database], configuration[:table])
|
43
51
|
end
|
44
52
|
|
45
|
-
#
|
46
|
-
def
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
53
|
+
# Get the join part of the query, defaults to nil
|
54
|
+
def join
|
55
|
+
configuration[:join]
|
56
|
+
end
|
57
|
+
|
58
|
+
# Get the select part of the query, defaults to '*'
|
59
|
+
def select
|
60
|
+
configuration[:select] || '*'
|
61
|
+
end
|
62
|
+
|
63
|
+
# Get the order for the query, defaults to nil
|
64
|
+
def order
|
65
|
+
configuration[:order]
|
66
|
+
end
|
67
|
+
|
68
|
+
# Get the list of columns to read. This is defined in the source
|
69
|
+
# definition as either an Array or Hash
|
70
|
+
def columns
|
71
|
+
case definition
|
72
|
+
when Array
|
73
|
+
definition.collect(&:to_sym)
|
74
|
+
when Hash
|
75
|
+
definition.keys.collect(&:to_sym)
|
76
|
+
else
|
77
|
+
raise "Definition must be either an Array or a Hash"
|
78
|
+
end
|
79
|
+
end
|
80
|
+
|
81
|
+
# Returns each row from the source. If read_locally is specified then
|
82
|
+
# this method will attempt to read from the last stored local file.
|
83
|
+
# If no locally stored file exists or if the trigger file for the last
|
84
|
+
# locally stored file does not exist then this method will raise an
|
85
|
+
# error.
|
86
|
+
def each(&block)
|
87
|
+
if read_locally # Read from the last stored source
|
88
|
+
read_rows(&block)
|
89
|
+
else # Read from the original source
|
90
|
+
if store_locally
|
91
|
+
write_local
|
92
|
+
read_rows(&block)
|
93
|
+
else
|
94
|
+
connection.select_all(query).each do |row|
|
95
|
+
row = Row.new(row.symbolize_keys)
|
96
|
+
yield row
|
56
97
|
end
|
57
98
|
end
|
99
|
+
end
|
100
|
+
end
|
101
|
+
|
102
|
+
private
|
103
|
+
# Read rows from the local cache
|
104
|
+
def read_rows
|
105
|
+
file = local_file
|
106
|
+
|
107
|
+
File.exists?(file) or raise "Local cache file not found"
|
108
|
+
File.exists?(local_file_trigger(file)) or raise "Local cache trigger file not found"
|
109
|
+
|
110
|
+
t = Benchmark.realtime do
|
58
111
|
FasterCSV.open(file, :headers => true).each do |row|
|
59
|
-
result_row =
|
112
|
+
result_row = ETL::Row.new
|
60
113
|
row.each do |header, field|
|
61
114
|
result_row[header.to_sym] = field
|
62
115
|
end
|
63
|
-
#puts "yielding #{result_row.inspect}"
|
64
116
|
yield result_row
|
65
117
|
end
|
66
|
-
|
67
|
-
|
68
|
-
|
118
|
+
end
|
119
|
+
ETL::Engine.average_rows_per_second = ETL::Engine.rows_read / t
|
120
|
+
end
|
121
|
+
|
122
|
+
# Write rows to the local cache
|
123
|
+
def write_local
|
124
|
+
file = local_file
|
125
|
+
|
126
|
+
lines = 0
|
127
|
+
t = Benchmark.realtime do
|
128
|
+
FasterCSV.open(file, 'w') do |f|
|
129
|
+
f << columns
|
130
|
+
connection.select_all(query).each do |row|
|
131
|
+
f << columns.collect { |column| row[column.to_s] }
|
132
|
+
lines += 1
|
133
|
+
end
|
69
134
|
end
|
135
|
+
File.open(local_file_trigger(file), 'w') {|f| }
|
70
136
|
end
|
137
|
+
ETL::Engine.logger.info "Stored locally in #{t}s (avg: #{lines/t} lines/sec)"
|
138
|
+
end
|
139
|
+
|
140
|
+
# Get the query to use
|
141
|
+
def query
|
142
|
+
return @query if @query
|
143
|
+
q = "SELECT #{select} FROM #{configuration[:table]}"
|
144
|
+
q << " #{join}" if join
|
145
|
+
q << " ORDER BY #{order}" if order
|
146
|
+
if ETL::Engine.limit || ETL::Engine.offset
|
147
|
+
options = {}
|
148
|
+
options[:limit] = ETL::Engine.limit if ETL::Engine.limit
|
149
|
+
options[:offset] = ETL::Engine.offset if ETL::Engine.offset
|
150
|
+
connection.add_limit_offset!(q, options)
|
151
|
+
end
|
152
|
+
#q << " LIMIT #{ETL::Engine.limit}" unless ETL::Engine.limit.nil?
|
153
|
+
q = q.gsub(/\n/,' ')
|
154
|
+
ETL::Engine.logger.info "Query: #{q}"
|
155
|
+
@query = q
|
71
156
|
end
|
72
157
|
|
73
|
-
private
|
74
158
|
# Get the database connection to use
|
75
159
|
def connection
|
76
160
|
ETL::Source.connection
|
@@ -99,7 +183,8 @@ module ETL #:nodoc:
|
|
99
183
|
# Options:
|
100
184
|
# * <tt>:adapter</tt>: The adapter to use (defaults to :mysql)
|
101
185
|
# * <tt>:username</tt>: The database username (defaults to 'root')
|
102
|
-
# * <tt>:password</tt>: The password to the database (defaults
|
186
|
+
# * <tt>:password</tt>: The password to the database (defaults
|
187
|
+
# to nothing)
|
103
188
|
# * <tt>:host</tt>: The host for the database (defaults to 'localhost')
|
104
189
|
def connect
|
105
190
|
ETL::Source.establish_connection(
|
@@ -8,16 +8,21 @@ module ETL #:nodoc:
|
|
8
8
|
# Accessor for the underlying parser
|
9
9
|
attr_accessor :parser
|
10
10
|
|
11
|
+
# The source file
|
12
|
+
attr_accessor :file
|
13
|
+
|
11
14
|
# Initialize the source
|
12
15
|
#
|
13
16
|
# Configuration options:
|
14
17
|
# * <tt>:file</tt>: The source file
|
15
|
-
# * <tt>:parser</tt>: One of the following: a parser name as a String or
|
16
|
-
# from Parser, a Hash with :name and
|
17
|
-
#
|
18
|
-
# on
|
19
|
-
#
|
20
|
-
# * <tt>:
|
18
|
+
# * <tt>:parser</tt>: One of the following: a parser name as a String or
|
19
|
+
# symbol, a class which extends from Parser, a Hash with :name and
|
20
|
+
# optionally an :options key. Whether or not the parser uses the
|
21
|
+
# options is dependent on which parser is used. See the documentation
|
22
|
+
# for each parser for information on what options it accepts.
|
23
|
+
# * <tt>:skip_lines</tt>: The number of lines to skip (defaults to 0)
|
24
|
+
# * <tt>:store_locally</tt>: Set to false to not store a copy of the
|
25
|
+
# source data locally for archival
|
21
26
|
def initialize(control, configuration, definition)
|
22
27
|
super
|
23
28
|
configure
|
@@ -25,18 +30,21 @@ module ETL #:nodoc:
|
|
25
30
|
|
26
31
|
# Get a String identifier for the source
|
27
32
|
def to_s
|
28
|
-
|
33
|
+
file
|
29
34
|
end
|
30
35
|
|
31
36
|
# Get the local storage directory
|
32
37
|
def local_directory
|
33
|
-
File.join(local_base, File.basename(
|
38
|
+
File.join(local_base, File.basename(file, File.extname(file)))
|
34
39
|
end
|
35
40
|
|
36
41
|
# Returns each row from the source
|
37
42
|
def each
|
38
43
|
copy_sources if store_locally
|
39
44
|
@parser.each do |row|
|
45
|
+
# TODO skip rows if offset is defined
|
46
|
+
# TODO stop processing if limit is reached
|
47
|
+
row = ETL::Row[row]
|
40
48
|
yield row
|
41
49
|
end
|
42
50
|
end
|
@@ -44,29 +52,34 @@ module ETL #:nodoc:
|
|
44
52
|
private
|
45
53
|
# Copy source data to a local directory structure
|
46
54
|
def copy_sources
|
47
|
-
|
55
|
+
sequence = 0
|
56
|
+
path = Pathname.new(file)
|
48
57
|
path = path.absolute? ? path : Pathname.new(File.dirname(control.file)) + path
|
49
58
|
Pathname.glob(path).each do |f|
|
50
59
|
next if f.directory?
|
51
|
-
|
60
|
+
lf = local_file(sequence)
|
61
|
+
FileUtils.cp(f, lf)
|
62
|
+
File.open(local_file_trigger(lf), 'w') {|f| }
|
63
|
+
sequence += 1
|
52
64
|
end
|
53
65
|
end
|
54
66
|
|
55
67
|
# Configure the source
|
56
68
|
def configure
|
57
|
-
|
69
|
+
@file = configuration[:file]
|
70
|
+
case configuration[:parser]
|
58
71
|
when Class
|
59
|
-
@parser =
|
72
|
+
@parser = configuration[:parser].new(self)
|
60
73
|
when String, Symbol
|
61
|
-
@parser = ETL::Parser::Parser.class_for_name(
|
74
|
+
@parser = ETL::Parser::Parser.class_for_name(configuration[:parser]).new(self)
|
62
75
|
when Hash
|
63
|
-
name =
|
64
|
-
options =
|
76
|
+
name = configuration[:parser][:name]
|
77
|
+
options = configuration[:parser][:options]
|
65
78
|
@parser = ETL::Parser::Parser.class_for_name(name).new(self, options)
|
66
79
|
else
|
67
80
|
raise ControlError, "Configuration option :parser must be a Class, String or Symbol"
|
68
81
|
end
|
69
|
-
@skip_lines =
|
82
|
+
@skip_lines = configuration[:skip_lines] ||= 0
|
70
83
|
end
|
71
84
|
end
|
72
85
|
end
|
data/lib/etl/engine.rb
CHANGED
@@ -10,6 +10,24 @@ module ETL #:nodoc:
|
|
10
10
|
# The main ETL engine class
|
11
11
|
class Engine
|
12
12
|
class << self
|
13
|
+
# Initialization that is run when a job is executed.
|
14
|
+
def init(options={})
|
15
|
+
unless @initialized
|
16
|
+
@limit = options[:limit]
|
17
|
+
@offset = options[:offset]
|
18
|
+
@log_write_mode = 'w' if options[:newlog]
|
19
|
+
@skip_bulk_import = options[:skip_bulk_import]
|
20
|
+
@read_locally = options[:read_locally]
|
21
|
+
options[:config] ||= 'database.yml'
|
22
|
+
database_configuration = YAML::load(ERB.new(IO.read(options[:config])).result + "\n")
|
23
|
+
ActiveRecord::Base.configurations = database_configuration
|
24
|
+
require 'etl/execution'
|
25
|
+
ETL::Execution::Base.establish_connection :etl_execution
|
26
|
+
ETL::Execution::Execution.migrate
|
27
|
+
@initialized = true
|
28
|
+
end
|
29
|
+
end
|
30
|
+
|
13
31
|
# Process the specified control file. Acceptable values for control_file are
|
14
32
|
# * Path to a file
|
15
33
|
# * File object
|
@@ -20,6 +38,12 @@ module ETL #:nodoc:
|
|
20
38
|
|
21
39
|
attr_accessor :timestamped_log
|
22
40
|
|
41
|
+
# Accessor for the log write mode. Default is 'a' for append.
|
42
|
+
attr_accessor :log_write_mode
|
43
|
+
def log_write_mode
|
44
|
+
@log_write_mode ||= 'a'
|
45
|
+
end
|
46
|
+
|
23
47
|
# A logger for the engine
|
24
48
|
attr_accessor :logger
|
25
49
|
|
@@ -28,7 +52,7 @@ module ETL #:nodoc:
|
|
28
52
|
if timestamped_log
|
29
53
|
@logger = Logger.new("etl_#{timestamp}.log")
|
30
54
|
else
|
31
|
-
@logger = Logger.new(File.open('etl.log',
|
55
|
+
@logger = Logger.new(File.open('etl.log', log_write_mode))
|
32
56
|
end
|
33
57
|
@logger.level = Logger::ERROR
|
34
58
|
@logger.formatter = Logger::Formatter.new
|
@@ -50,21 +74,43 @@ module ETL #:nodoc:
|
|
50
74
|
# The current destination
|
51
75
|
attr_accessor :current_destination
|
52
76
|
|
53
|
-
# Set to true to activate realtime activity. This will cause certain
|
54
|
-
# to be printed to STDOUT
|
77
|
+
# Set to true to activate realtime activity. This will cause certain
|
78
|
+
# information messages to be printed to STDOUT
|
55
79
|
attr_accessor :realtime_activity
|
56
80
|
|
81
|
+
# Accessor for the total number of rows read from sources
|
57
82
|
attr_accessor :rows_read
|
58
|
-
|
59
83
|
def rows_read
|
60
84
|
@rows_read ||= 0
|
61
85
|
end
|
62
86
|
|
87
|
+
# Accessor for the total number of rows processed
|
63
88
|
attr_accessor :rows_written
|
64
|
-
|
65
89
|
def rows_written
|
66
90
|
@rows_written ||= 0
|
67
91
|
end
|
92
|
+
|
93
|
+
# Access the current ETL::Execution::Job instance
|
94
|
+
attr_accessor :job
|
95
|
+
|
96
|
+
# The limit on rows to load from the source, useful for testing the ETL
|
97
|
+
# process prior to executing the entire batch. Default value is nil and
|
98
|
+
# indicates that there is no limit
|
99
|
+
attr_accessor :limit
|
100
|
+
|
101
|
+
# The offset for the source to begin at, useful for testing the ETL
|
102
|
+
# process prior to executing the entire batch. Default value is nil and
|
103
|
+
# indicates that there is no offset
|
104
|
+
attr_accessor :offset
|
105
|
+
|
106
|
+
# Set to true to skip all bulk importing
|
107
|
+
attr_accessor :skip_bulk_import
|
108
|
+
|
109
|
+
# Set to true to read locally from the last source cache files
|
110
|
+
attr_accessor :read_locally
|
111
|
+
|
112
|
+
# Accessor for the average rows per second processed
|
113
|
+
attr_accessor :average_rows_per_second
|
68
114
|
end
|
69
115
|
|
70
116
|
# Say the specified message, with a newline
|
@@ -89,6 +135,22 @@ module ETL #:nodoc:
|
|
89
135
|
def errors
|
90
136
|
@errors ||= []
|
91
137
|
end
|
138
|
+
|
139
|
+
# Get a Hash of benchmark values where each value represents the total
|
140
|
+
# amount of time in seconds spent processing in that portion of the ETL
|
141
|
+
# pipeline. Keys include:
|
142
|
+
# * <tt>:transforms</tt>
|
143
|
+
# * <tt>:after_reads</tt>
|
144
|
+
# * <tt>:before_writes</tt>
|
145
|
+
# * <tt>:writes</tt>
|
146
|
+
def benchmarks
|
147
|
+
@benchmarks ||= {
|
148
|
+
:transforms => 0,
|
149
|
+
:after_reads => 0,
|
150
|
+
:before_writes => 0,
|
151
|
+
:writes => 0,
|
152
|
+
}
|
153
|
+
end
|
92
154
|
|
93
155
|
# Process a control file or object. Acceptable values for control are:
|
94
156
|
# * Path to a file
|
@@ -97,6 +159,11 @@ module ETL #:nodoc:
|
|
97
159
|
def process(control)
|
98
160
|
control = ETL::Control::Control.resolve(control)
|
99
161
|
|
162
|
+
ETL::Engine.job = ETL::Execution::Job.create!(
|
163
|
+
:control_file => control.file,
|
164
|
+
:status => 'executing'
|
165
|
+
)
|
166
|
+
|
100
167
|
execute_dependencies(control)
|
101
168
|
|
102
169
|
start_time = Time.now
|
@@ -108,11 +175,22 @@ module ETL #:nodoc:
|
|
108
175
|
sources = control.sources
|
109
176
|
destinations = control.destinations
|
110
177
|
|
178
|
+
say "Skipping bulk import" if Engine.skip_bulk_import
|
179
|
+
|
111
180
|
sources.each do |source|
|
112
181
|
Engine.current_source = source
|
113
182
|
Engine.logger.debug "Processing source #{source}"
|
114
183
|
say "Source: #{source}"
|
184
|
+
say "Limiting enabled: #{Engine.limit}" if Engine.limit != nil
|
185
|
+
say "Offset enabled: #{Engine.offset}" if Engine.offset != nil
|
115
186
|
source.each_with_index do |row, index|
|
187
|
+
# Break out of the row loop if the +Engine.limit+ is specified and
|
188
|
+
# the number of rows read exceeds that value.
|
189
|
+
if Engine.limit != nil && Engine.rows_read >= Engine.limit
|
190
|
+
puts "Reached limit of #{Engine.limit}"
|
191
|
+
break
|
192
|
+
end
|
193
|
+
|
116
194
|
Engine.logger.debug "Row #{index}: #{row.inspect}"
|
117
195
|
Engine.rows_read += 1
|
118
196
|
Engine.current_source_row = index + 1
|
@@ -120,80 +198,85 @@ module ETL #:nodoc:
|
|
120
198
|
say_without_newline "."
|
121
199
|
end
|
122
200
|
|
123
|
-
# At this point a single row may be turned into multiple rows via row
|
124
|
-
# all code after this line should work with the array of
|
125
|
-
# single row
|
201
|
+
# At this point a single row may be turned into multiple rows via row
|
202
|
+
# processors; all code after this line should work with the array of
|
203
|
+
# rows rather than the single row
|
126
204
|
rows = [row]
|
127
205
|
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
206
|
+
benchmarks[:after_reads] += Benchmark.realtime do
|
207
|
+
begin
|
208
|
+
Engine.logger.debug "Processing after read"
|
209
|
+
control.after_read_processors.each do |processor|
|
210
|
+
processed_rows = []
|
211
|
+
rows.each do |row|
|
212
|
+
processed_rows << processor.process(row)
|
213
|
+
end
|
214
|
+
rows = processed_rows.flatten
|
134
215
|
end
|
135
|
-
|
216
|
+
rescue => e
|
217
|
+
msg = "Error processing rows after read from #{Engine.current_source} on line #{Engine.current_source_row}: #{e}"
|
218
|
+
errors << msg
|
219
|
+
Engine.logger.error(msg)
|
220
|
+
exceeded_error_threshold?(control) ? break : next
|
136
221
|
end
|
137
|
-
rescue => e
|
138
|
-
msg = "Error processing rows after read from #{Engine.current_source} on line #{Engine.current_source_row}: #{e}"
|
139
|
-
errors << msg
|
140
|
-
Engine.logger.error(msg)
|
141
|
-
exceeded_error_threshold?(control) ? break : next
|
142
222
|
end
|
143
223
|
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
224
|
+
benchmarks[:transforms] += Benchmark.realtime do
|
225
|
+
begin
|
226
|
+
# execute transforms
|
227
|
+
Engine.logger.debug "Executing transforms"
|
228
|
+
rows.each do |row|
|
229
|
+
control.transforms.each do |transform|
|
230
|
+
name = transform.name.to_sym
|
231
|
+
row[name] = transform.transform(name, row[name], row)
|
232
|
+
end
|
153
233
|
end
|
234
|
+
rescue => e
|
235
|
+
msg = "Error transforming from #{Engine.current_source} on line #{Engine.current_source_row}: #{e}"
|
236
|
+
errors << msg
|
237
|
+
Engine.logger.error(msg)
|
238
|
+
e.backtrace.each { |line| Engine.logger.error(line) }
|
239
|
+
exceeded_error_threshold?(control) ? break : next
|
154
240
|
end
|
155
|
-
rescue => e
|
156
|
-
msg = "Error transforming from #{Engine.current_source} on line #{Engine.current_source_row}: #{e}"
|
157
|
-
errors << msg
|
158
|
-
Engine.logger.error(msg)
|
159
|
-
e.backtrace.each { |line| Engine.logger.error(line) }
|
160
|
-
exceeded_error_threshold?(control) ? break : next
|
161
241
|
end
|
162
242
|
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
243
|
+
benchmarks[:before_writes] += Benchmark.realtime do
|
244
|
+
begin
|
245
|
+
# execute row-level "before write" processing
|
246
|
+
Engine.logger.debug "Processing before write"
|
247
|
+
control.before_write_processors.each do |processor|
|
248
|
+
processed_rows = []
|
249
|
+
rows.each do |row|
|
250
|
+
processed_rows << processor.process(row)
|
251
|
+
end
|
252
|
+
rows = processed_rows.flatten.compact
|
171
253
|
end
|
172
|
-
|
254
|
+
rescue => e
|
255
|
+
msg = "Error processing rows before write from #{Engine.current_source} on line #{Engine.current_source_row}: #{e}"
|
256
|
+
errors << msg
|
257
|
+
Engine.logger.error(msg)
|
258
|
+
e.backtrace.each { |line| Engine.logger.error(line) }
|
259
|
+
exceeded_error_threshold?(control) ? break : next
|
173
260
|
end
|
174
|
-
rescue => e
|
175
|
-
msg = "Error processing rows before write from #{Engine.current_source} on line #{Engine.current_source_row}: #{e}"
|
176
|
-
errors << msg
|
177
|
-
Engine.logger.error(msg)
|
178
|
-
e.backtrace.each { |line| Engine.logger.error(line) }
|
179
|
-
exceeded_error_threshold?(control) ? break : next
|
180
261
|
end
|
181
262
|
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
263
|
+
benchmarks[:writes] += Benchmark.realtime do
|
264
|
+
begin
|
265
|
+
# write the row to the destination
|
266
|
+
destinations.each_with_index do |destination, index|
|
267
|
+
Engine.current_destination = destination
|
268
|
+
rows.each do |row|
|
269
|
+
destination.write(row)
|
270
|
+
Engine.rows_written += 1 if index == 0
|
271
|
+
end
|
189
272
|
end
|
273
|
+
rescue => e
|
274
|
+
msg = "Error writing to #{Engine.current_destination}: #{e}"
|
275
|
+
errors << msg
|
276
|
+
Engine.logger.error msg
|
277
|
+
e.backtrace.each { |line| Engine.logger.error(line) }
|
278
|
+
exceeded_error_threshold?(control) ? break : next
|
190
279
|
end
|
191
|
-
rescue => e
|
192
|
-
msg = "Error writing to #{Engine.current_destination}: #{e}"
|
193
|
-
errors << msg
|
194
|
-
Engine.logger.error msg
|
195
|
-
e.backtrace.each { |line| Engine.logger.error(line) }
|
196
|
-
break if exceeded_error_threshold?(control)
|
197
280
|
end
|
198
281
|
end
|
199
282
|
|
@@ -220,6 +303,20 @@ module ETL #:nodoc:
|
|
220
303
|
say "Wrote #{Engine.rows_written} lines to destinations"
|
221
304
|
end
|
222
305
|
say "Completed #{control.file} in #{distance_of_time_in_words(start_time)} with #{errors.length} errors."
|
306
|
+
say "Processing average: #{Engine.average_rows_per_second} rows/sec)"
|
307
|
+
|
308
|
+
say "Avg after_reads: #{Engine.rows_read/benchmarks[:after_reads]} rows/sec" if benchmarks[:after_reads] > 0
|
309
|
+
say "Avg before_writes: #{Engine.rows_read/benchmarks[:before_writes]} rows/sec" if benchmarks[:before_writes] > 0
|
310
|
+
say "Avg transforms: #{Engine.rows_read/benchmarks[:transforms]} rows/sec" if benchmarks[:transforms] > 0
|
311
|
+
say "Avg writes: #{Engine.rows_read/benchmarks[:writes]} rows/sec" if benchmarks[:writes] > 0
|
312
|
+
|
313
|
+
# ETL::Transform::Transform.benchmarks.each do |klass, t|
|
314
|
+
# say "Avg #{klass}: #{Engine.rows_read/t} rows/sec"
|
315
|
+
# end
|
316
|
+
|
317
|
+
ETL::Engine.job.completed_at = Time.now
|
318
|
+
ETL::Engine.job.status = (errors.length > 0 ? 'completed with errors' : 'completed')
|
319
|
+
ETL::Engine.job.save!
|
223
320
|
end
|
224
321
|
|
225
322
|
private
|
@@ -285,6 +382,10 @@ module ETL #:nodoc:
|
|
285
382
|
s
|
286
383
|
end
|
287
384
|
|
385
|
+
# Get the approximate distance of time in words from the given from_time
|
386
|
+
# to the given to_time. If to_time is not specified then it is set
|
387
|
+
# to Time.now. By default seconds are included...set the include_seconds
|
388
|
+
# argument to false to disable the seconds.
|
288
389
|
def approximate_distance_of_time_in_words(from_time, to_time=Time.now, include_seconds=true)
|
289
390
|
from_time = from_time.to_time if from_time.respond_to?(:to_time)
|
290
391
|
to_time = to_time.to_time if to_time.respond_to?(:to_time)
|