activewarehouse-etl 0.6.1 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. data/CHANGELOG +29 -1
  2. data/LICENSE +7 -0
  3. data/README +58 -12
  4. data/Rakefile +2 -1
  5. data/lib/etl.rb +3 -0
  6. data/lib/etl/commands/etl.rb +35 -1
  7. data/lib/etl/control/control.rb +20 -9
  8. data/lib/etl/control/destination.rb +173 -12
  9. data/lib/etl/control/destination/database_destination.rb +2 -2
  10. data/lib/etl/control/destination/file_destination.rb +25 -2
  11. data/lib/etl/control/source.rb +29 -8
  12. data/lib/etl/control/source/database_source.rb +109 -24
  13. data/lib/etl/control/source/file_source.rb +29 -16
  14. data/lib/etl/engine.rb +164 -63
  15. data/lib/etl/execution.rb +19 -0
  16. data/lib/etl/execution/base.rb +9 -0
  17. data/lib/etl/execution/job.rb +7 -0
  18. data/lib/etl/execution/migration.rb +54 -0
  19. data/lib/etl/execution/record.rb +8 -0
  20. data/lib/etl/generator/surrogate_key_generator.rb +2 -0
  21. data/lib/etl/parser.rb +9 -0
  22. data/lib/etl/parser/parser.rb +5 -2
  23. data/lib/etl/parser/sax_parser.rb +22 -6
  24. data/lib/etl/processor.rb +8 -0
  25. data/lib/etl/processor/bulk_import_processor.rb +32 -4
  26. data/lib/etl/processor/check_exist_processor.rb +69 -0
  27. data/lib/etl/processor/check_unique_processor.rb +35 -0
  28. data/lib/etl/processor/copy_field_processor.rb +20 -4
  29. data/lib/etl/processor/processor.rb +3 -0
  30. data/lib/etl/processor/rename_processor.rb +24 -0
  31. data/lib/etl/processor/row_processor.rb +1 -1
  32. data/lib/etl/processor/sequence_processor.rb +23 -0
  33. data/lib/etl/processor/surrogate_key_processor.rb +31 -0
  34. data/lib/etl/processor/truncate_processor.rb +0 -2
  35. data/lib/etl/row.rb +17 -0
  36. data/lib/etl/screen/row_count_screen.rb +15 -0
  37. data/lib/etl/transform/block_transform.rb +13 -0
  38. data/lib/etl/transform/date_to_string_transform.rb +1 -1
  39. data/lib/etl/transform/decode_transform.rb +1 -1
  40. data/lib/etl/transform/default_transform.rb +6 -1
  41. data/lib/etl/transform/foreign_key_lookup_transform.rb +1 -1
  42. data/lib/etl/transform/hierarchy_lookup_transform.rb +1 -1
  43. data/lib/etl/transform/ordinalize_transform.rb +12 -0
  44. data/lib/etl/transform/sha1_transform.rb +0 -3
  45. data/lib/etl/transform/string_to_date_transform.rb +0 -3
  46. data/lib/etl/transform/string_to_datetime_transform.rb +0 -3
  47. data/lib/etl/transform/string_to_time_transform.rb +0 -3
  48. data/lib/etl/transform/transform.rb +20 -11
  49. data/lib/etl/transform/trim_transform.rb +26 -0
  50. data/lib/etl/transform/type_transform.rb +9 -1
  51. data/lib/etl/version.rb +2 -2
  52. metadata +21 -3
@@ -19,13 +19,13 @@ module ETL #:nodoc:
19
19
  #
20
20
  # Configuration options:
21
21
  # * <tt>:database</tt>: The database name (REQUIRED)
22
- # * <tt>:table<tt>: The table to write to (REQUIRED)
22
+ # * <tt>:table</tt>: The table to write to (REQUIRED)
23
23
  # * <tt>:truncate</tt>: Set to true to truncate before writing (defaults to false)
24
24
  # * <tt>:unique</tt>: Set to true to only insert unique records (defaults to false)
25
25
  # * <tt>:adapter</tt>: The adapter to use (defaults to :mysql)
26
26
  # * <tt>:username</tt>: The database username (defaults to 'root')
27
27
  # * <tt>:password</tt>: The password to the database (defaults to nothing)
28
- # * <tt>:host<tt>: The host for the database (defaults to 'localhost')
28
+ # * <tt>:host</tt>: The host for the database (defaults to 'localhost')
29
29
  # * <tt>:append_rows</tt>: Array of rows to append
30
30
  #
31
31
  # Mapping options:
@@ -1,3 +1,5 @@
1
+ # This source file contains the ETL::Control::FileDestination
2
+
1
3
  module ETL #:nodoc:
2
4
  module Control #:nodoc:
3
5
  # File as the final destination.
@@ -60,6 +62,7 @@ module ETL #:nodoc:
60
62
  def flush
61
63
  #puts "Flushing buffer (#{file}) with #{buffer.length} rows"
62
64
  buffer.flatten.each do |row|
65
+ #puts "row change type: #{row.change_type}"
63
66
  # check to see if this row's compound key constraint already exists
64
67
  # note that the compound key constraint may not utilize virtual fields
65
68
  next unless row_allowed?(row)
@@ -68,11 +71,22 @@ module ETL #:nodoc:
68
71
  add_virtuals!(row)
69
72
 
70
73
  # collect all of the values using the order designated in the configuration
71
- values = order.collect { |name| row[name] }
74
+ values = order.collect do |name|
75
+ value = row[name]
76
+ case value
77
+ when Date, Time, DateTime
78
+ value.to_s(:db)
79
+ else
80
+ value.to_s
81
+ end
82
+ end
83
+
84
+ values.collect! { |v| v.gsub(separator, "\\#{separator}")}
85
+ values.collect! { |v| v.gsub(/\n|\r/, '')}
72
86
 
73
87
  # enclose the value if required
74
88
  if !enclose.nil?
75
- values.collect! { |v| enclose + v.to_s.gsub(/(#{enclose})/, '\\\\\1') + enclose }
89
+ values.collect! { |v| enclose + v.gsub(/(#{enclose})/, '\\\\\1') + enclose }
76
90
  end
77
91
 
78
92
  # write the values joined by the separator defined in the configuration
@@ -81,6 +95,7 @@ module ETL #:nodoc:
81
95
  # write the end-of-line
82
96
  f.write(eol)
83
97
  end
98
+ f.flush
84
99
  buffer.clear
85
100
  #puts "After flush there are #{buffer.length} rows"
86
101
  end
@@ -91,6 +106,14 @@ module ETL #:nodoc:
91
106
  @f ||= open(file, mode)
92
107
  end
93
108
 
109
+ def options
110
+ @options ||= {
111
+ :col_sep => separator,
112
+ :row_sep => eol,
113
+ :force_quotes => !enclose.nil?
114
+ }
115
+ end
116
+
94
117
  # Get the appropriate mode to open the file stream
95
118
  def mode
96
119
  append ? 'a' : 'w'
@@ -14,12 +14,14 @@ module ETL #:nodoc:
14
14
  attr_accessor :definition
15
15
 
16
16
  # Returns true if the source data should be stored locally for archival
17
+ # Default behavior will return true.
17
18
  attr_accessor :store_locally
18
19
 
19
20
  class << self
20
21
  # Convert the name to a Source class.
21
22
  #
22
- # For example if name is :database then this will return a DatabaseSource class
23
+ # For example if name is :database then this will return a
24
+ # DatabaseSource class
23
25
  def class_for_name(name)
24
26
  ETL::Control.const_get("#{name.to_s.classify}Source")
25
27
  end
@@ -31,7 +33,8 @@ module ETL #:nodoc:
31
33
  # * <tt>definition</tt>: The source layout definition
32
34
  #
33
35
  # Configuration options:
34
- # * <tt>:store_locally</tt>: Set to false to not store source data locally (defaults to true)
36
+ # * <tt>:store_locally</tt>: Set to false to not store source data
37
+ # locally (defaults to true)
35
38
  def initialize(control, configuration, definition)
36
39
  @control = control
37
40
  @configuration = configuration
@@ -40,6 +43,7 @@ module ETL #:nodoc:
40
43
  @store_locally = configuration[:store_locally] || true
41
44
  end
42
45
 
46
+ # Get an array of errors that occur during reading from the source
43
47
  def errors
44
48
  @errors ||= []
45
49
  end
@@ -57,18 +61,35 @@ module ETL #:nodoc:
57
61
  @local_base ||= 'source_data'
58
62
  end
59
63
 
60
- # The local directory for storing. This method must be overriden by subclasses
64
+ # The local directory for storing. This method must be overridden by
65
+ # subclasses
61
66
  def local_directory
62
67
  raise "local_directory method is abstract"
63
68
  end
64
69
 
65
- # Return the local file for storing the raw source data. Each call to this method will
66
- # result in a timestamped file, so you cannot expect to call it multiple times and reference
67
- # the same file
68
- def local_file
70
+ # Return the local file for storing the raw source data. Each call to
71
+ # this method will result in a timestamped file, so you cannot expect
72
+ # to call it multiple times and reference the same file
73
+ #
74
+ # Optional sequence can be specified if there are multiple source files
75
+ def local_file(sequence=nil)
76
+ filename = timestamp.to_s
77
+ filename += sequence.to_s if sequence
78
+
69
79
  local_dir = local_directory
70
80
  FileUtils.mkdir_p(local_dir)
71
- File.join(local_dir, "#{timestamp}.csv")
81
+ File.join(local_dir, "#{filename}.csv")
82
+ end
83
+
84
+ # Get the local trigger file that is used to indicate that the file has
85
+ # been completely written
86
+ def local_file_trigger(file)
87
+ Pathname.new(file.to_s + '.trig')
88
+ end
89
+
90
+ # Return true if the source should read locally.
91
+ def read_locally
92
+ Engine.read_locally
72
93
  end
73
94
 
74
95
  end
@@ -22,10 +22,18 @@ module ETL #:nodoc:
22
22
  # Other options:
23
23
  # * <tt>:adapter</tt>: The adapter to use (defaults to :mysql)
24
24
  # * <tt>:username</tt>: The database username (defaults to 'root')
25
- # * <tt>:password</tt>: The password to the database (defaults to nothing)
26
- # * <tt>:host<tt>: The host for the database (defaults to 'localhost')
27
- # * <tt>:store_locally</tt>: Set to false to not store a copy of the source data locally
28
- # in a flat file (defaults to true)
25
+ # * <tt>:password</tt>: The password to the database (defaults to
26
+ # nothing)
27
+ # * <tt>:host</tt>: The host for the database (defaults to
28
+ # 'localhost')
29
+ # * <tt>:join</tt>: Optional join part for the query (ignored unless
30
+ # specified)
31
+ # * <tt>:select</tt>: Optional select part for the query (defaults to
32
+ # '*')
33
+ # * <tt>:order</tt>: Optional order part for the query (ignored unless
34
+ # specified)
35
+ # * <tt>:store_locally</tt>: Set to false to not store a copy of the
36
+ # source data locally in a flat file (defaults to true)
29
37
  def initialize(control, configuration, definition)
30
38
  super
31
39
  connect
@@ -36,41 +44,117 @@ module ETL #:nodoc:
36
44
  "#{host}/#{configuration[:database]}/#{configuration[:table]}"
37
45
  end
38
46
 
39
- # Get the local directory to use, which is a combination of the local_base, the db hostname
40
- # the db database name and the db table.
47
+ # Get the local directory to use, which is a combination of the
48
+ # local_base, the db hostname the db database name and the db table.
41
49
  def local_directory
42
50
  File.join(local_base, host, configuration[:database], configuration[:table])
43
51
  end
44
52
 
45
- # Returns each row from the source
46
- def each
47
- if store_locally
48
- file = local_file
49
- columns = connection.columns(configuration[:table].to_s)
50
- FasterCSV.open(file, 'w') do |f|
51
- f << columns.collect { |column| column.name }
52
- connection.select_all("SELECT * FROM #{configuration[:table]}").each do |row|
53
- values = columns.collect { |column| row[column.name] }
54
- #puts "row: #{values.inspect}"
55
- f << values
53
+ # Get the join part of the query, defaults to nil
54
+ def join
55
+ configuration[:join]
56
+ end
57
+
58
+ # Get the select part of the query, defaults to '*'
59
+ def select
60
+ configuration[:select] || '*'
61
+ end
62
+
63
+ # Get the order for the query, defaults to nil
64
+ def order
65
+ configuration[:order]
66
+ end
67
+
68
+ # Get the list of columns to read. This is defined in the source
69
+ # definition as either an Array or Hash
70
+ def columns
71
+ case definition
72
+ when Array
73
+ definition.collect(&:to_sym)
74
+ when Hash
75
+ definition.keys.collect(&:to_sym)
76
+ else
77
+ raise "Definition must be either an Array or a Hash"
78
+ end
79
+ end
80
+
81
+ # Returns each row from the source. If read_locally is specified then
82
+ # this method will attempt to read from the last stored local file.
83
+ # If no locally stored file exists or if the trigger file for the last
84
+ # locally stored file does not exist then this method will raise an
85
+ # error.
86
+ def each(&block)
87
+ if read_locally # Read from the last stored source
88
+ read_rows(&block)
89
+ else # Read from the original source
90
+ if store_locally
91
+ write_local
92
+ read_rows(&block)
93
+ else
94
+ connection.select_all(query).each do |row|
95
+ row = Row.new(row.symbolize_keys)
96
+ yield row
56
97
  end
57
98
  end
99
+ end
100
+ end
101
+
102
+ private
103
+ # Read rows from the local cache
104
+ def read_rows
105
+ file = local_file
106
+
107
+ File.exists?(file) or raise "Local cache file not found"
108
+ File.exists?(local_file_trigger(file)) or raise "Local cache trigger file not found"
109
+
110
+ t = Benchmark.realtime do
58
111
  FasterCSV.open(file, :headers => true).each do |row|
59
- result_row = {}
112
+ result_row = ETL::Row.new
60
113
  row.each do |header, field|
61
114
  result_row[header.to_sym] = field
62
115
  end
63
- #puts "yielding #{result_row.inspect}"
64
116
  yield result_row
65
117
  end
66
- else
67
- connection.select_all("SELECT * FROM #{configuration[:table]}").each do |row|
68
- yield HashWithIndifferentAccess.new(row)
118
+ end
119
+ ETL::Engine.average_rows_per_second = ETL::Engine.rows_read / t
120
+ end
121
+
122
+ # Write rows to the local cache
123
+ def write_local
124
+ file = local_file
125
+
126
+ lines = 0
127
+ t = Benchmark.realtime do
128
+ FasterCSV.open(file, 'w') do |f|
129
+ f << columns
130
+ connection.select_all(query).each do |row|
131
+ f << columns.collect { |column| row[column.to_s] }
132
+ lines += 1
133
+ end
69
134
  end
135
+ File.open(local_file_trigger(file), 'w') {|f| }
70
136
  end
137
+ ETL::Engine.logger.info "Stored locally in #{t}s (avg: #{lines/t} lines/sec)"
138
+ end
139
+
140
+ # Get the query to use
141
+ def query
142
+ return @query if @query
143
+ q = "SELECT #{select} FROM #{configuration[:table]}"
144
+ q << " #{join}" if join
145
+ q << " ORDER BY #{order}" if order
146
+ if ETL::Engine.limit || ETL::Engine.offset
147
+ options = {}
148
+ options[:limit] = ETL::Engine.limit if ETL::Engine.limit
149
+ options[:offset] = ETL::Engine.offset if ETL::Engine.offset
150
+ connection.add_limit_offset!(q, options)
151
+ end
152
+ #q << " LIMIT #{ETL::Engine.limit}" unless ETL::Engine.limit.nil?
153
+ q = q.gsub(/\n/,' ')
154
+ ETL::Engine.logger.info "Query: #{q}"
155
+ @query = q
71
156
  end
72
157
 
73
- private
74
158
  # Get the database connection to use
75
159
  def connection
76
160
  ETL::Source.connection
@@ -99,7 +183,8 @@ module ETL #:nodoc:
99
183
  # Options:
100
184
  # * <tt>:adapter</tt>: The adapter to use (defaults to :mysql)
101
185
  # * <tt>:username</tt>: The database username (defaults to 'root')
102
- # * <tt>:password</tt>: The password to the database (defaults to nothing)
186
+ # * <tt>:password</tt>: The password to the database (defaults
187
+ # to nothing)
103
188
  # * <tt>:host<tt>: The host for the database (defaults to 'localhost')
104
189
  def connect
105
190
  ETL::Source.establish_connection(
@@ -8,16 +8,21 @@ module ETL #:nodoc:
8
8
  # Accessor for the underlying parser
9
9
  attr_accessor :parser
10
10
 
11
+ # The source file
12
+ attr_accessor :file
13
+
11
14
  # Initialize the source
12
15
  #
13
16
  # Configuration options:
14
17
  # * <tt>:file</tt>: The source file
15
- # * <tt>:parser</tt>: One of the following: a parser name as a String or symbol, a class which extends
16
- # from Parser, a Hash with :name and optionally an :options key. Whether or not the parser uses the
17
- # options is dependent on which parser is used. See the documentation for each parser for information
18
- # on what options it accepts.
19
- # * <tt>:skip_lines<tt>: The number of lines to skip (defaults to 0)
20
- # * <tt>:store_locally</tt>: Set to false to not store a copy of the source data locally for archival
18
+ # * <tt>:parser</tt>: One of the following: a parser name as a String or
19
+ # symbol, a class which extends from Parser, a Hash with :name and
20
+ # optionally an :options key. Whether or not the parser uses the
21
+ # options is dependent on which parser is used. See the documentation
22
+ # for each parser for information on what options it accepts.
23
+ # * <tt>:skip_lines</tt>: The number of lines to skip (defaults to 0)
24
+ # * <tt>:store_locally</tt>: Set to false to not store a copy of the
25
+ # source data locally for archival
21
26
  def initialize(control, configuration, definition)
22
27
  super
23
28
  configure
@@ -25,18 +30,21 @@ module ETL #:nodoc:
25
30
 
26
31
  # Get a String identifier for the source
27
32
  def to_s
28
- configuration[:file]
33
+ file
29
34
  end
30
35
 
31
36
  # Get the local storage directory
32
37
  def local_directory
33
- File.join(local_base, File.basename(configuration[:file], File.extname(configuration[:file])))
38
+ File.join(local_base, File.basename(file, File.extname(file)))
34
39
  end
35
40
 
36
41
  # Returns each row from the source
37
42
  def each
38
43
  copy_sources if store_locally
39
44
  @parser.each do |row|
45
+ # TODO skip rows if offset is defined
46
+ # TODO stop processing if limit is reached
47
+ row = ETL::Row[row]
40
48
  yield row
41
49
  end
42
50
  end
@@ -44,29 +52,34 @@ module ETL #:nodoc:
44
52
  private
45
53
  # Copy source data to a local directory structure
46
54
  def copy_sources
47
- path = Pathname.new(configuration[:file])
55
+ sequence = 0
56
+ path = Pathname.new(file)
48
57
  path = path.absolute? ? path : Pathname.new(File.dirname(control.file)) + path
49
58
  Pathname.glob(path).each do |f|
50
59
  next if f.directory?
51
- FileUtils.cp(f, local_file)
60
+ lf = local_file(sequence)
61
+ FileUtils.cp(f, lf)
62
+ File.open(local_file_trigger(lf), 'w') {|f| }
63
+ sequence += 1
52
64
  end
53
65
  end
54
66
 
55
67
  # Configure the source
56
68
  def configure
57
- case @configuration[:parser]
69
+ @file = configuration[:file]
70
+ case configuration[:parser]
58
71
  when Class
59
- @parser = @configuration[:parser].new(self)
72
+ @parser = configuration[:parser].new(self)
60
73
  when String, Symbol
61
- @parser = ETL::Parser::Parser.class_for_name(@configuration[:parser]).new(self)
74
+ @parser = ETL::Parser::Parser.class_for_name(configuration[:parser]).new(self)
62
75
  when Hash
63
- name = @configuration[:parser][:name]
64
- options = @configuration[:parser][:options]
76
+ name = configuration[:parser][:name]
77
+ options = configuration[:parser][:options]
65
78
  @parser = ETL::Parser::Parser.class_for_name(name).new(self, options)
66
79
  else
67
80
  raise ControlError, "Configuration option :parser must be a Class, String or Symbol"
68
81
  end
69
- @skip_lines = @configuration[:skip_lines] ||= 0
82
+ @skip_lines = configuration[:skip_lines] ||= 0
70
83
  end
71
84
  end
72
85
  end
data/lib/etl/engine.rb CHANGED
@@ -10,6 +10,24 @@ module ETL #:nodoc:
10
10
  # The main ETL engine class
11
11
  class Engine
12
12
  class << self
13
+ # Initialization that is run when a job is executed.
14
+ def init(options={})
15
+ unless @initialized
16
+ @limit = options[:limit]
17
+ @offset = options[:offset]
18
+ @log_write_mode = 'w' if options[:newlog]
19
+ @skip_bulk_import = options[:skip_bulk_import]
20
+ @read_locally = options[:read_locally]
21
+ options[:config] ||= 'database.yml'
22
+ database_configuration = YAML::load(ERB.new(IO.read(options[:config])).result + "\n")
23
+ ActiveRecord::Base.configurations = database_configuration
24
+ require 'etl/execution'
25
+ ETL::Execution::Base.establish_connection :etl_execution
26
+ ETL::Execution::Execution.migrate
27
+ @initialized = true
28
+ end
29
+ end
30
+
13
31
  # Process the specified control file. Acceptable values for control_file are
14
32
  # * Path to a file
15
33
  # * File object
@@ -20,6 +38,12 @@ module ETL #:nodoc:
20
38
 
21
39
  attr_accessor :timestamped_log
22
40
 
41
+ # Accessor for the log write mode. Default is 'a' for append.
42
+ attr_accessor :log_write_mode
43
+ def log_write_mode
44
+ @log_write_mode ||= 'a'
45
+ end
46
+
23
47
  # A logger for the engine
24
48
  attr_accessor :logger
25
49
 
@@ -28,7 +52,7 @@ module ETL #:nodoc:
28
52
  if timestamped_log
29
53
  @logger = Logger.new("etl_#{timestamp}.log")
30
54
  else
31
- @logger = Logger.new(File.open('etl.log', 'a'))
55
+ @logger = Logger.new(File.open('etl.log', log_write_mode))
32
56
  end
33
57
  @logger.level = Logger::ERROR
34
58
  @logger.formatter = Logger::Formatter.new
@@ -50,21 +74,43 @@ module ETL #:nodoc:
50
74
  # The current destination
51
75
  attr_accessor :current_destination
52
76
 
53
- # Set to true to activate realtime activity. This will cause certain information messages
54
- # to be printed to STDOUT
77
+ # Set to true to activate realtime activity. This will cause certain
78
+ # information messages to be printed to STDOUT
55
79
  attr_accessor :realtime_activity
56
80
 
81
+ # Accessor for the total number of rows read from sources
57
82
  attr_accessor :rows_read
58
-
59
83
  def rows_read
60
84
  @rows_read ||= 0
61
85
  end
62
86
 
87
+ # Accessor for the total number of rows processed
63
88
  attr_accessor :rows_written
64
-
65
89
  def rows_written
66
90
  @rows_written ||= 0
67
91
  end
92
+
93
+ # Access the current ETL::Execution::Job instance
94
+ attr_accessor :job
95
+
96
+ # The limit on rows to load from the source, useful for testing the ETL
97
+ # process prior to executing the entire batch. Default value is nil and
98
+ # indicates that there is no limit
99
+ attr_accessor :limit
100
+
101
+ # The offset for the source to begin at, useful for testing the ETL
102
+ # process prior to executing the entire batch. Default value is nil and
103
+ # indicates that there is no offset
104
+ attr_accessor :offset
105
+
106
+ # Set to true to skip all bulk importing
107
+ attr_accessor :skip_bulk_import
108
+
109
+ # Set to true to read locally from the last source cache files
110
+ attr_accessor :read_locally
111
+
112
+ # Accessor for the average rows per second processed
113
+ attr_accessor :average_rows_per_second
68
114
  end
69
115
 
70
116
  # Say the specified message, with a newline
@@ -89,6 +135,22 @@ module ETL #:nodoc:
89
135
  def errors
90
136
  @errors ||= []
91
137
  end
138
+
139
+ # Get a Hash of benchmark values where each value represents the total
140
+ # amount of time in seconds spent processing in that portion of the ETL
141
+ # pipeline. Keys include:
142
+ # * <tt>:transforms</tt>
143
+ # * <tt>:after_reads</tt>
144
+ # * <tt>:before_writes</tt>
145
+ # * <tt>:writes</tt>
146
+ def benchmarks
147
+ @benchmarks ||= {
148
+ :transforms => 0,
149
+ :after_reads => 0,
150
+ :before_writes => 0,
151
+ :writes => 0,
152
+ }
153
+ end
92
154
 
93
155
  # Process a control file or object. Acceptable values for control are:
94
156
  # * Path to a file
@@ -97,6 +159,11 @@ module ETL #:nodoc:
97
159
  def process(control)
98
160
  control = ETL::Control::Control.resolve(control)
99
161
 
162
+ ETL::Engine.job = ETL::Execution::Job.create!(
163
+ :control_file => control.file,
164
+ :status => 'executing'
165
+ )
166
+
100
167
  execute_dependencies(control)
101
168
 
102
169
  start_time = Time.now
@@ -108,11 +175,22 @@ module ETL #:nodoc:
108
175
  sources = control.sources
109
176
  destinations = control.destinations
110
177
 
178
+ say "Skipping bulk import" if Engine.skip_bulk_import
179
+
111
180
  sources.each do |source|
112
181
  Engine.current_source = source
113
182
  Engine.logger.debug "Processing source #{source}"
114
183
  say "Source: #{source}"
184
+ say "Limiting enabled: #{Engine.limit}" if Engine.limit != nil
185
+ say "Offset enabled: #{Engine.offset}" if Engine.offset != nil
115
186
  source.each_with_index do |row, index|
187
+ # Break out of the row loop if the +Engine.limit+ is specified and
188
+ # the number of rows read exceeds that value.
189
+ if Engine.limit != nil && Engine.rows_read >= Engine.limit
190
+ puts "Reached limit of #{Engine.limit}"
191
+ break
192
+ end
193
+
116
194
  Engine.logger.debug "Row #{index}: #{row.inspect}"
117
195
  Engine.rows_read += 1
118
196
  Engine.current_source_row = index + 1
@@ -120,80 +198,85 @@ module ETL #:nodoc:
120
198
  say_without_newline "."
121
199
  end
122
200
 
123
- # At this point a single row may be turned into multiple rows via row processors
124
- # all code after this line should work with the array of rows rather than the
125
- # single row
201
+ # At this point a single row may be turned into multiple rows via row
202
+ # processors all code after this line should work with the array of
203
+ # rows rather than the single row
126
204
  rows = [row]
127
205
 
128
- begin
129
- Engine.logger.debug "Processing after read"
130
- control.after_read_processors.each do |processor|
131
- processed_rows = []
132
- rows.each do |row|
133
- processed_rows << processor.process(row)
206
+ benchmarks[:after_reads] += Benchmark.realtime do
207
+ begin
208
+ Engine.logger.debug "Processing after read"
209
+ control.after_read_processors.each do |processor|
210
+ processed_rows = []
211
+ rows.each do |row|
212
+ processed_rows << processor.process(row)
213
+ end
214
+ rows = processed_rows.flatten
134
215
  end
135
- rows = processed_rows.flatten
216
+ rescue => e
217
+ msg = "Error processing rows after read from #{Engine.current_source} on line #{Engine.current_source_row}: #{e}"
218
+ errors << msg
219
+ Engine.logger.error(msg)
220
+ exceeded_error_threshold?(control) ? break : next
136
221
  end
137
- rescue => e
138
- msg = "Error processing rows after read from #{Engine.current_source} on line #{Engine.current_source_row}: #{e}"
139
- errors << msg
140
- Engine.logger.error(msg)
141
- exceeded_error_threshold?(control) ? break : next
142
222
  end
143
223
 
144
- begin
145
- # execute transforms
146
- Engine.logger.debug "Executing transforms"
147
- rows.each do |row|
148
- row.each do |name, value|
149
- name = name.to_sym
150
- transformers = control.transform(name)
151
- #Engine.logger.debug "Transforms for #{name}: #{transformers.inspect}"
152
- row[name] = ETL::Transform::Transform.transform(name, value, row, transformers)
224
+ benchmarks[:transforms] += Benchmark.realtime do
225
+ begin
226
+ # execute transforms
227
+ Engine.logger.debug "Executing transforms"
228
+ rows.each do |row|
229
+ control.transforms.each do |transform|
230
+ name = transform.name.to_sym
231
+ row[name] = transform.transform(name, row[name], row)
232
+ end
153
233
  end
234
+ rescue => e
235
+ msg = "Error transforming from #{Engine.current_source} on line #{Engine.current_source_row}: #{e}"
236
+ errors << msg
237
+ Engine.logger.error(msg)
238
+ e.backtrace.each { |line| Engine.logger.error(line) }
239
+ exceeded_error_threshold?(control) ? break : next
154
240
  end
155
- rescue => e
156
- msg = "Error transforming from #{Engine.current_source} on line #{Engine.current_source_row}: #{e}"
157
- errors << msg
158
- Engine.logger.error(msg)
159
- e.backtrace.each { |line| Engine.logger.error(line) }
160
- exceeded_error_threshold?(control) ? break : next
161
241
  end
162
242
 
163
-
164
- begin
165
- # execute row-level "before write" processing
166
- Engine.logger.debug "Processing before write"
167
- control.before_write_processors.each do |processor|
168
- processed_rows = []
169
- rows.each do |row|
170
- processed_rows << processor.process(row)
243
+ benchmarks[:before_writes] += Benchmark.realtime do
244
+ begin
245
+ # execute row-level "before write" processing
246
+ Engine.logger.debug "Processing before write"
247
+ control.before_write_processors.each do |processor|
248
+ processed_rows = []
249
+ rows.each do |row|
250
+ processed_rows << processor.process(row)
251
+ end
252
+ rows = processed_rows.flatten.compact
171
253
  end
172
- rows = processed_rows.flatten
254
+ rescue => e
255
+ msg = "Error processing rows before write from #{Engine.current_source} on line #{Engine.current_source_row}: #{e}"
256
+ errors << msg
257
+ Engine.logger.error(msg)
258
+ e.backtrace.each { |line| Engine.logger.error(line) }
259
+ exceeded_error_threshold?(control) ? break : next
173
260
  end
174
- rescue => e
175
- msg = "Error processing rows before write from #{Engine.current_source} on line #{Engine.current_source_row}: #{e}"
176
- errors << msg
177
- Engine.logger.error(msg)
178
- e.backtrace.each { |line| Engine.logger.error(line) }
179
- exceeded_error_threshold?(control) ? break : next
180
261
  end
181
262
 
182
- begin
183
- # write the row to the destination
184
- destinations.each_with_index do |destination, index|
185
- Engine.current_destination = destination
186
- rows.each do |row|
187
- destination.write(row)
188
- Engine.rows_written += 1 if index == 0
263
+ benchmarks[:writes] += Benchmark.realtime do
264
+ begin
265
+ # write the row to the destination
266
+ destinations.each_with_index do |destination, index|
267
+ Engine.current_destination = destination
268
+ rows.each do |row|
269
+ destination.write(row)
270
+ Engine.rows_written += 1 if index == 0
271
+ end
189
272
  end
273
+ rescue => e
274
+ msg = "Error writing to #{Engine.current_destination}: #{e}"
275
+ errors << msg
276
+ Engine.logger.error msg
277
+ e.backtrace.each { |line| Engine.logger.error(line) }
278
+ exceeded_error_threshold?(control) ? break : next
190
279
  end
191
- rescue => e
192
- msg = "Error writing to #{Engine.current_destination}: #{e}"
193
- errors << msg
194
- Engine.logger.error msg
195
- e.backtrace.each { |line| Engine.logger.error(line) }
196
- break if exceeded_error_threshold?(control)
197
280
  end
198
281
  end
199
282
 
@@ -220,6 +303,20 @@ module ETL #:nodoc:
220
303
  say "Wrote #{Engine.rows_written} lines to destinations"
221
304
  end
222
305
  say "Completed #{control.file} in #{distance_of_time_in_words(start_time)} with #{errors.length} errors."
306
+ say "Processing average: #{Engine.average_rows_per_second} rows/sec)"
307
+
308
+ say "Avg after_reads: #{Engine.rows_read/benchmarks[:after_reads]} rows/sec" if benchmarks[:after_reads] > 0
309
+ say "Avg before_writes: #{Engine.rows_read/benchmarks[:before_writes]} rows/sec" if benchmarks[:before_writes] > 0
310
+ say "Avg transforms: #{Engine.rows_read/benchmarks[:transforms]} rows/sec" if benchmarks[:transforms] > 0
311
+ say "Avg writes: #{Engine.rows_read/benchmarks[:writes]} rows/sec" if benchmarks[:writes] > 0
312
+
313
+ # ETL::Transform::Transform.benchmarks.each do |klass, t|
314
+ # say "Avg #{klass}: #{Engine.rows_read/t} rows/sec"
315
+ # end
316
+
317
+ ETL::Engine.job.completed_at = Time.now
318
+ ETL::Engine.job.status = (errors.length > 0 ? 'completed with errors' : 'completed')
319
+ ETL::Engine.job.save!
223
320
  end
224
321
 
225
322
  private
@@ -285,6 +382,10 @@ module ETL #:nodoc:
285
382
  s
286
383
  end
287
384
 
385
+ # Get the approximate distance of time in words from the given from_time
386
+ # to the given to_time. If to_time is not specified then it is set
387
+ # to Time.now. By default seconds are included...set the include_seconds
388
+ # argument to false to disable the seconds.
288
389
  def approximate_distance_of_time_in_words(from_time, to_time=Time.now, include_seconds=true)
289
390
  from_time = from_time.to_time if from_time.respond_to?(:to_time)
290
391
  to_time = to_time.to_time if to_time.respond_to?(:to_time)