activewarehouse-etl 0.6.1 → 0.7.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (52) hide show
  1. data/CHANGELOG +29 -1
  2. data/LICENSE +7 -0
  3. data/README +58 -12
  4. data/Rakefile +2 -1
  5. data/lib/etl.rb +3 -0
  6. data/lib/etl/commands/etl.rb +35 -1
  7. data/lib/etl/control/control.rb +20 -9
  8. data/lib/etl/control/destination.rb +173 -12
  9. data/lib/etl/control/destination/database_destination.rb +2 -2
  10. data/lib/etl/control/destination/file_destination.rb +25 -2
  11. data/lib/etl/control/source.rb +29 -8
  12. data/lib/etl/control/source/database_source.rb +109 -24
  13. data/lib/etl/control/source/file_source.rb +29 -16
  14. data/lib/etl/engine.rb +164 -63
  15. data/lib/etl/execution.rb +19 -0
  16. data/lib/etl/execution/base.rb +9 -0
  17. data/lib/etl/execution/job.rb +7 -0
  18. data/lib/etl/execution/migration.rb +54 -0
  19. data/lib/etl/execution/record.rb +8 -0
  20. data/lib/etl/generator/surrogate_key_generator.rb +2 -0
  21. data/lib/etl/parser.rb +9 -0
  22. data/lib/etl/parser/parser.rb +5 -2
  23. data/lib/etl/parser/sax_parser.rb +22 -6
  24. data/lib/etl/processor.rb +8 -0
  25. data/lib/etl/processor/bulk_import_processor.rb +32 -4
  26. data/lib/etl/processor/check_exist_processor.rb +69 -0
  27. data/lib/etl/processor/check_unique_processor.rb +35 -0
  28. data/lib/etl/processor/copy_field_processor.rb +20 -4
  29. data/lib/etl/processor/processor.rb +3 -0
  30. data/lib/etl/processor/rename_processor.rb +24 -0
  31. data/lib/etl/processor/row_processor.rb +1 -1
  32. data/lib/etl/processor/sequence_processor.rb +23 -0
  33. data/lib/etl/processor/surrogate_key_processor.rb +31 -0
  34. data/lib/etl/processor/truncate_processor.rb +0 -2
  35. data/lib/etl/row.rb +17 -0
  36. data/lib/etl/screen/row_count_screen.rb +15 -0
  37. data/lib/etl/transform/block_transform.rb +13 -0
  38. data/lib/etl/transform/date_to_string_transform.rb +1 -1
  39. data/lib/etl/transform/decode_transform.rb +1 -1
  40. data/lib/etl/transform/default_transform.rb +6 -1
  41. data/lib/etl/transform/foreign_key_lookup_transform.rb +1 -1
  42. data/lib/etl/transform/hierarchy_lookup_transform.rb +1 -1
  43. data/lib/etl/transform/ordinalize_transform.rb +12 -0
  44. data/lib/etl/transform/sha1_transform.rb +0 -3
  45. data/lib/etl/transform/string_to_date_transform.rb +0 -3
  46. data/lib/etl/transform/string_to_datetime_transform.rb +0 -3
  47. data/lib/etl/transform/string_to_time_transform.rb +0 -3
  48. data/lib/etl/transform/transform.rb +20 -11
  49. data/lib/etl/transform/trim_transform.rb +26 -0
  50. data/lib/etl/transform/type_transform.rb +9 -1
  51. data/lib/etl/version.rb +2 -2
  52. metadata +21 -3
@@ -19,13 +19,13 @@ module ETL #:nodoc:
19
19
  #
20
20
  # Configuration options:
21
21
  # * <tt>:database</tt>: The database name (REQUIRED)
22
- # * <tt>:table<tt>: The table to write to (REQUIRED)
22
+ # * <tt>:table</tt>: The table to write to (REQUIRED)
23
23
  # * <tt>:truncate</tt>: Set to true to truncate before writing (defaults to false)
24
24
  # * <tt>:unique</tt>: Set to true to only insert unique records (defaults to false)
25
25
  # * <tt>:adapter</tt>: The adapter to use (defaults to :mysql)
26
26
  # * <tt>:username</tt>: The database username (defaults to 'root')
27
27
  # * <tt>:password</tt>: The password to the database (defaults to nothing)
28
- # * <tt>:host<tt>: The host for the database (defaults to 'localhost')
28
+ # * <tt>:host</tt>: The host for the database (defaults to 'localhost')
29
29
  # * <tt>:append_rows</tt>: Array of rows to append
30
30
  #
31
31
  # Mapping options:
@@ -1,3 +1,5 @@
1
+ # This source file contains the ETL::Control::FileDestination
2
+
1
3
  module ETL #:nodoc:
2
4
  module Control #:nodoc:
3
5
  # File as the final destination.
@@ -60,6 +62,7 @@ module ETL #:nodoc:
60
62
  def flush
61
63
  #puts "Flushing buffer (#{file}) with #{buffer.length} rows"
62
64
  buffer.flatten.each do |row|
65
+ #puts "row change type: #{row.change_type}"
63
66
  # check to see if this row's compound key constraint already exists
64
67
  # note that the compound key constraint may not utilize virtual fields
65
68
  next unless row_allowed?(row)
@@ -68,11 +71,22 @@ module ETL #:nodoc:
68
71
  add_virtuals!(row)
69
72
 
70
73
  # collect all of the values using the order designated in the configuration
71
- values = order.collect { |name| row[name] }
74
+ values = order.collect do |name|
75
+ value = row[name]
76
+ case value
77
+ when Date, Time, DateTime
78
+ value.to_s(:db)
79
+ else
80
+ value.to_s
81
+ end
82
+ end
83
+
84
+ values.collect! { |v| v.gsub(separator, "\\#{separator}")}
85
+ values.collect! { |v| v.gsub(/\n|\r/, '')}
72
86
 
73
87
  # enclose the value if required
74
88
  if !enclose.nil?
75
- values.collect! { |v| enclose + v.to_s.gsub(/(#{enclose})/, '\\\\\1') + enclose }
89
+ values.collect! { |v| enclose + v.gsub(/(#{enclose})/, '\\\\\1') + enclose }
76
90
  end
77
91
 
78
92
  # write the values joined by the separator defined in the configuration
@@ -81,6 +95,7 @@ module ETL #:nodoc:
81
95
  # write the end-of-line
82
96
  f.write(eol)
83
97
  end
98
+ f.flush
84
99
  buffer.clear
85
100
  #puts "After flush there are #{buffer.length} rows"
86
101
  end
@@ -91,6 +106,14 @@ module ETL #:nodoc:
91
106
  @f ||= open(file, mode)
92
107
  end
93
108
 
109
+ def options
110
+ @options ||= {
111
+ :col_sep => separator,
112
+ :row_sep => eol,
113
+ :force_quotes => !enclose.nil?
114
+ }
115
+ end
116
+
94
117
  # Get the appropriate mode to open the file stream
95
118
  def mode
96
119
  append ? 'a' : 'w'
@@ -14,12 +14,14 @@ module ETL #:nodoc:
14
14
  attr_accessor :definition
15
15
 
16
16
  # Returns true if the source data should be stored locally for archival
17
+ # Default behavior will return true.
17
18
  attr_accessor :store_locally
18
19
 
19
20
  class << self
20
21
  # Convert the name to a Source class.
21
22
  #
22
- # For example if name is :database then this will return a DatabaseSource class
23
+ # For example if name is :database then this will return a
24
+ # DatabaseSource class
23
25
  def class_for_name(name)
24
26
  ETL::Control.const_get("#{name.to_s.classify}Source")
25
27
  end
@@ -31,7 +33,8 @@ module ETL #:nodoc:
31
33
  # * <tt>definition</tt>: The source layout definition
32
34
  #
33
35
  # Configuration options:
34
- # * <tt>:store_locally</tt>: Set to false to not store source data locally (defaults to true)
36
+ # * <tt>:store_locally</tt>: Set to false to not store source data
37
+ # locally (defaults to true)
35
38
  def initialize(control, configuration, definition)
36
39
  @control = control
37
40
  @configuration = configuration
@@ -40,6 +43,7 @@ module ETL #:nodoc:
40
43
  @store_locally = configuration[:store_locally] || true
41
44
  end
42
45
 
46
+ # Get an array of errors that occur during reading from the source
43
47
  def errors
44
48
  @errors ||= []
45
49
  end
@@ -57,18 +61,35 @@ module ETL #:nodoc:
57
61
  @local_base ||= 'source_data'
58
62
  end
59
63
 
60
- # The local directory for storing. This method must be overriden by subclasses
64
+ # The local directory for storing. This method must be overriden by
65
+ # subclasses
61
66
  def local_directory
62
67
  raise "local_directory method is abstract"
63
68
  end
64
69
 
65
- # Return the local file for storing the raw source data. Each call to this method will
66
- # result in a timestamped file, so you cannot expect to call it multiple times and reference
67
- # the same file
68
- def local_file
70
+ # Return the local file for storing the raw source data. Each call to
71
+ # this method will result in a timestamped file, so you cannot expect
72
+ # to call it multiple times and reference the same file
73
+ #
74
+ # Optional sequence can be specified if there are multiple source files
75
+ def local_file(sequence=nil)
76
+ filename = timestamp.to_s
77
+ filename += sequence.to_s if sequence
78
+
69
79
  local_dir = local_directory
70
80
  FileUtils.mkdir_p(local_dir)
71
- File.join(local_dir, "#{timestamp}.csv")
81
+ File.join(local_dir, "#{filename}.csv")
82
+ end
83
+
84
+ # Get the local trigger file that is used to indicate that the file has
85
+ # been completely written
86
+ def local_file_trigger(file)
87
+ Pathname.new(file.to_s + '.trig')
88
+ end
89
+
90
+ # Return true if the source should read locally.
91
+ def read_locally
92
+ Engine.read_locally
72
93
  end
73
94
 
74
95
  end
@@ -22,10 +22,18 @@ module ETL #:nodoc:
22
22
  # Other options:
23
23
  # * <tt>:adapter</tt>: The adapter to use (defaults to :mysql)
24
24
  # * <tt>:username</tt>: The database username (defaults to 'root')
25
- # * <tt>:password</tt>: The password to the database (defaults to nothing)
26
- # * <tt>:host<tt>: The host for the database (defaults to 'localhost')
27
- # * <tt>:store_locally</tt>: Set to false to not store a copy of the source data locally
28
- # in a flat file (defaults to true)
25
+ # * <tt>:password</tt>: The password to the database (defaults to
26
+ # nothing)
27
+ # * <tt>:host</tt>: The host for the database (defaults to
28
+ # 'localhost')
29
+ # * <tt>:join</tt>: Optional join part for the query (ignored unless
30
+ # specified)
31
+ # * <tt>:select</tt>: Optional select part for the query (defaults to
32
+ # '*')
33
+ # * <tt>:order</tt>: Optional order part for the query (ignored unless
34
+ # specified)
35
+ # * <tt>:store_locally</tt>: Set to false to not store a copy of the
36
+ # source data locally in a flat file (defaults to true)
29
37
  def initialize(control, configuration, definition)
30
38
  super
31
39
  connect
@@ -36,41 +44,117 @@ module ETL #:nodoc:
36
44
  "#{host}/#{configuration[:database]}/#{configuration[:table]}"
37
45
  end
38
46
 
39
- # Get the local directory to use, which is a combination of the local_base, the db hostname
40
- # the db database name and the db table.
47
+ # Get the local directory to use, which is a combination of the
48
+ # local_base, the db hostname the db database name and the db table.
41
49
  def local_directory
42
50
  File.join(local_base, host, configuration[:database], configuration[:table])
43
51
  end
44
52
 
45
- # Returns each row from the source
46
- def each
47
- if store_locally
48
- file = local_file
49
- columns = connection.columns(configuration[:table].to_s)
50
- FasterCSV.open(file, 'w') do |f|
51
- f << columns.collect { |column| column.name }
52
- connection.select_all("SELECT * FROM #{configuration[:table]}").each do |row|
53
- values = columns.collect { |column| row[column.name] }
54
- #puts "row: #{values.inspect}"
55
- f << values
53
+ # Get the join part of the query, defaults to nil
54
+ def join
55
+ configuration[:join]
56
+ end
57
+
58
+ # Get the select part of the query, defaults to '*'
59
+ def select
60
+ configuration[:select] || '*'
61
+ end
62
+
63
+ # Get the order for the query, defaults to nil
64
+ def order
65
+ configuration[:order]
66
+ end
67
+
68
+ # Get the list of columns to read. This is defined in the source
69
+ # definition as either an Array or Hash
70
+ def columns
71
+ case definition
72
+ when Array
73
+ definition.collect(&:to_sym)
74
+ when Hash
75
+ definition.keys.collect(&:to_sym)
76
+ else
77
+ raise "Definition must be either an Array or a Hash"
78
+ end
79
+ end
80
+
81
+ # Returns each row from the source. If read_locally is specified then
82
+ # this method will attempt to read from the last stored local file.
83
+ # If no locally stored file exists or if the trigger file for the last
84
+ # locally stored file does not exist then this method will raise an
85
+ # error.
86
+ def each(&block)
87
+ if read_locally # Read from the last stored source
88
+ read_rows(&block)
89
+ else # Read from the original source
90
+ if store_locally
91
+ write_local
92
+ read_rows(&block)
93
+ else
94
+ connection.select_all(query).each do |row|
95
+ row = Row.new(row.symbolize_keys)
96
+ yield row
56
97
  end
57
98
  end
99
+ end
100
+ end
101
+
102
+ private
103
+ # Read rows from the local cache
104
+ def read_rows
105
+ file = local_file
106
+
107
+ File.exists?(file) or raise "Local cache file not found"
108
+ File.exists?(local_file_trigger(file)) or raise "Local cache trigger file not found"
109
+
110
+ t = Benchmark.realtime do
58
111
  FasterCSV.open(file, :headers => true).each do |row|
59
- result_row = {}
112
+ result_row = ETL::Row.new
60
113
  row.each do |header, field|
61
114
  result_row[header.to_sym] = field
62
115
  end
63
- #puts "yielding #{result_row.inspect}"
64
116
  yield result_row
65
117
  end
66
- else
67
- connection.select_all("SELECT * FROM #{configuration[:table]}").each do |row|
68
- yield HashWithIndifferentAccess.new(row)
118
+ end
119
+ ETL::Engine.average_rows_per_second = ETL::Engine.rows_read / t
120
+ end
121
+
122
+ # Write rows to the local cache
123
+ def write_local
124
+ file = local_file
125
+
126
+ lines = 0
127
+ t = Benchmark.realtime do
128
+ FasterCSV.open(file, 'w') do |f|
129
+ f << columns
130
+ connection.select_all(query).each do |row|
131
+ f << columns.collect { |column| row[column.to_s] }
132
+ lines += 1
133
+ end
69
134
  end
135
+ File.open(local_file_trigger(file), 'w') {|f| }
70
136
  end
137
+ ETL::Engine.logger.info "Stored locally in #{t}s (avg: #{lines/t} lines/sec)"
138
+ end
139
+
140
+ # Get the query to use
141
+ def query
142
+ return @query if @query
143
+ q = "SELECT #{select} FROM #{configuration[:table]}"
144
+ q << " #{join}" if join
145
+ q << " ORDER BY #{order}" if order
146
+ if ETL::Engine.limit || ETL::Engine.offset
147
+ options = {}
148
+ options[:limit] = ETL::Engine.limit if ETL::Engine.limit
149
+ options[:offset] = ETL::Engine.offset if ETL::Engine.offset
150
+ connection.add_limit_offset!(q, options)
151
+ end
152
+ #q << " LIMIT #{ETL::Engine.limit}" unless ETL::Engine.limit.nil?
153
+ q = q.gsub(/\n/,' ')
154
+ ETL::Engine.logger.info "Query: #{q}"
155
+ @query = q
71
156
  end
72
157
 
73
- private
74
158
  # Get the database connection to use
75
159
  def connection
76
160
  ETL::Source.connection
@@ -99,7 +183,8 @@ module ETL #:nodoc:
99
183
  # Options:
100
184
  # * <tt>:adapter</tt>: The adapter to use (defaults to :mysql)
101
185
  # * <tt>:username</tt>: The database username (defaults to 'root')
102
- # * <tt>:password</tt>: The password to the database (defaults to nothing)
186
+ # * <tt>:password</tt>: The password to the database (defaults
187
+ # to nothing)
103
188
  # * <tt>:host</tt>: The host for the database (defaults to 'localhost')
104
189
  def connect
105
190
  ETL::Source.establish_connection(
@@ -8,16 +8,21 @@ module ETL #:nodoc:
8
8
  # Accessor for the underlying parser
9
9
  attr_accessor :parser
10
10
 
11
+ # The source file
12
+ attr_accessor :file
13
+
11
14
  # Initialize the source
12
15
  #
13
16
  # Configuration options:
14
17
  # * <tt>:file</tt>: The source file
15
- # * <tt>:parser</tt>: One of the following: a parser name as a String or symbol, a class which extends
16
- # from Parser, a Hash with :name and optionally an :options key. Whether or not the parser uses the
17
- # options is dependent on which parser is used. See the documentation for each parser for information
18
- # on what options it accepts.
19
- # * <tt>:skip_lines<tt>: The number of lines to skip (defaults to 0)
20
- # * <tt>:store_locally</tt>: Set to false to not store a copy of the source data locally for archival
18
+ # * <tt>:parser</tt>: One of the following: a parser name as a String or
19
+ # symbol, a class which extends from Parser, a Hash with :name and
20
+ # optionally an :options key. Whether or not the parser uses the
21
+ # options is dependent on which parser is used. See the documentation
22
+ # for each parser for information on what options it accepts.
23
+ # * <tt>:skip_lines</tt>: The number of lines to skip (defaults to 0)
24
+ # * <tt>:store_locally</tt>: Set to false to not store a copy of the
25
+ # source data locally for archival
21
26
  def initialize(control, configuration, definition)
22
27
  super
23
28
  configure
@@ -25,18 +30,21 @@ module ETL #:nodoc:
25
30
 
26
31
  # Get a String identifier for the source
27
32
  def to_s
28
- configuration[:file]
33
+ file
29
34
  end
30
35
 
31
36
  # Get the local storage directory
32
37
  def local_directory
33
- File.join(local_base, File.basename(configuration[:file], File.extname(configuration[:file])))
38
+ File.join(local_base, File.basename(file, File.extname(file)))
34
39
  end
35
40
 
36
41
  # Returns each row from the source
37
42
  def each
38
43
  copy_sources if store_locally
39
44
  @parser.each do |row|
45
+ # TODO skip rows if offset is defined
46
+ # TODO stop processing if limit is reached
47
+ row = ETL::Row[row]
40
48
  yield row
41
49
  end
42
50
  end
@@ -44,29 +52,34 @@ module ETL #:nodoc:
44
52
  private
45
53
  # Copy source data to a local directory structure
46
54
  def copy_sources
47
- path = Pathname.new(configuration[:file])
55
+ sequence = 0
56
+ path = Pathname.new(file)
48
57
  path = path.absolute? ? path : Pathname.new(File.dirname(control.file)) + path
49
58
  Pathname.glob(path).each do |f|
50
59
  next if f.directory?
51
- FileUtils.cp(f, local_file)
60
+ lf = local_file(sequence)
61
+ FileUtils.cp(f, lf)
62
+ File.open(local_file_trigger(lf), 'w') {|f| }
63
+ sequence += 1
52
64
  end
53
65
  end
54
66
 
55
67
  # Configure the source
56
68
  def configure
57
- case @configuration[:parser]
69
+ @file = configuration[:file]
70
+ case configuration[:parser]
58
71
  when Class
59
- @parser = @configuration[:parser].new(self)
72
+ @parser = configuration[:parser].new(self)
60
73
  when String, Symbol
61
- @parser = ETL::Parser::Parser.class_for_name(@configuration[:parser]).new(self)
74
+ @parser = ETL::Parser::Parser.class_for_name(configuration[:parser]).new(self)
62
75
  when Hash
63
- name = @configuration[:parser][:name]
64
- options = @configuration[:parser][:options]
76
+ name = configuration[:parser][:name]
77
+ options = configuration[:parser][:options]
65
78
  @parser = ETL::Parser::Parser.class_for_name(name).new(self, options)
66
79
  else
67
80
  raise ControlError, "Configuration option :parser must be a Class, String or Symbol"
68
81
  end
69
- @skip_lines = @configuration[:skip_lines] ||= 0
82
+ @skip_lines = configuration[:skip_lines] ||= 0
70
83
  end
71
84
  end
72
85
  end
data/lib/etl/engine.rb CHANGED
@@ -10,6 +10,24 @@ module ETL #:nodoc:
10
10
  # The main ETL engine class
11
11
  class Engine
12
12
  class << self
13
+ # Initialization that is run when a job is executed.
14
+ def init(options={})
15
+ unless @initialized
16
+ @limit = options[:limit]
17
+ @offset = options[:offset]
18
+ @log_write_mode = 'w' if options[:newlog]
19
+ @skip_bulk_import = options[:skip_bulk_import]
20
+ @read_locally = options[:read_locally]
21
+ options[:config] ||= 'database.yml'
22
+ database_configuration = YAML::load(ERB.new(IO.read(options[:config])).result + "\n")
23
+ ActiveRecord::Base.configurations = database_configuration
24
+ require 'etl/execution'
25
+ ETL::Execution::Base.establish_connection :etl_execution
26
+ ETL::Execution::Execution.migrate
27
+ @initialized = true
28
+ end
29
+ end
30
+
13
31
  # Process the specified control file. Acceptable values for control_file are
14
32
  # * Path to a file
15
33
  # * File object
@@ -20,6 +38,12 @@ module ETL #:nodoc:
20
38
 
21
39
  attr_accessor :timestamped_log
22
40
 
41
+ # Accessor for the log write mode. Default is 'a' for append.
42
+ attr_accessor :log_write_mode
43
+ def log_write_mode
44
+ @log_write_mode ||= 'a'
45
+ end
46
+
23
47
  # A logger for the engine
24
48
  attr_accessor :logger
25
49
 
@@ -28,7 +52,7 @@ module ETL #:nodoc:
28
52
  if timestamped_log
29
53
  @logger = Logger.new("etl_#{timestamp}.log")
30
54
  else
31
- @logger = Logger.new(File.open('etl.log', 'a'))
55
+ @logger = Logger.new(File.open('etl.log', log_write_mode))
32
56
  end
33
57
  @logger.level = Logger::ERROR
34
58
  @logger.formatter = Logger::Formatter.new
@@ -50,21 +74,43 @@ module ETL #:nodoc:
50
74
  # The current destination
51
75
  attr_accessor :current_destination
52
76
 
53
- # Set to true to activate realtime activity. This will cause certain information messages
54
- # to be printed to STDOUT
77
+ # Set to true to activate realtime activity. This will cause certain
78
+ # information messages to be printed to STDOUT
55
79
  attr_accessor :realtime_activity
56
80
 
81
+ # Accessor for the total number of rows read from sources
57
82
  attr_accessor :rows_read
58
-
59
83
  def rows_read
60
84
  @rows_read ||= 0
61
85
  end
62
86
 
87
+ # Accessor for the total number of rows processed
63
88
  attr_accessor :rows_written
64
-
65
89
  def rows_written
66
90
  @rows_written ||= 0
67
91
  end
92
+
93
+ # Access the current ETL::Execution::Job instance
94
+ attr_accessor :job
95
+
96
+ # The limit on rows to load from the source, useful for testing the ETL
97
+ # process prior to executing the entire batch. Default value is nil and
98
+ # indicates that there is no limit
99
+ attr_accessor :limit
100
+
101
+ # The offset for the source to begin at, useful for testing the ETL
102
+ # process prior to executing the entire batch. Default value is nil and
103
+ # indicates that there is no offset
104
+ attr_accessor :offset
105
+
106
+ # Set to true to skip all bulk importing
107
+ attr_accessor :skip_bulk_import
108
+
109
+ # Set to true to read locally from the last source cache files
110
+ attr_accessor :read_locally
111
+
112
+ # Accessor for the average rows per second processed
113
+ attr_accessor :average_rows_per_second
68
114
  end
69
115
 
70
116
  # Say the specified message, with a newline
@@ -89,6 +135,22 @@ module ETL #:nodoc:
89
135
  def errors
90
136
  @errors ||= []
91
137
  end
138
+
139
+ # Get a Hash of benchmark values where each value represents the total
140
+ # amount of time in seconds spent processing in that portion of the ETL
141
+ # pipeline. Keys include:
142
+ # * <tt>:transforms</tt>
143
+ # * <tt>:after_reads</tt>
144
+ # * <tt>:before_writes</tt>
145
+ # * <tt>:writes</tt>
146
+ def benchmarks
147
+ @benchmarks ||= {
148
+ :transforms => 0,
149
+ :after_reads => 0,
150
+ :before_writes => 0,
151
+ :writes => 0,
152
+ }
153
+ end
92
154
 
93
155
  # Process a control file or object. Acceptable values for control are:
94
156
  # * Path to a file
@@ -97,6 +159,11 @@ module ETL #:nodoc:
97
159
  def process(control)
98
160
  control = ETL::Control::Control.resolve(control)
99
161
 
162
+ ETL::Engine.job = ETL::Execution::Job.create!(
163
+ :control_file => control.file,
164
+ :status => 'executing'
165
+ )
166
+
100
167
  execute_dependencies(control)
101
168
 
102
169
  start_time = Time.now
@@ -108,11 +175,22 @@ module ETL #:nodoc:
108
175
  sources = control.sources
109
176
  destinations = control.destinations
110
177
 
178
+ say "Skipping bulk import" if Engine.skip_bulk_import
179
+
111
180
  sources.each do |source|
112
181
  Engine.current_source = source
113
182
  Engine.logger.debug "Processing source #{source}"
114
183
  say "Source: #{source}"
184
+ say "Limiting enabled: #{Engine.limit}" if Engine.limit != nil
185
+ say "Offset enabled: #{Engine.offset}" if Engine.offset != nil
115
186
  source.each_with_index do |row, index|
187
+ # Break out of the row loop if the +Engine.limit+ is specified and
188
+ # the number of rows read exceeds that value.
189
+ if Engine.limit != nil && Engine.rows_read >= Engine.limit
190
+ puts "Reached limit of #{Engine.limit}"
191
+ break
192
+ end
193
+
116
194
  Engine.logger.debug "Row #{index}: #{row.inspect}"
117
195
  Engine.rows_read += 1
118
196
  Engine.current_source_row = index + 1
@@ -120,80 +198,85 @@ module ETL #:nodoc:
120
198
  say_without_newline "."
121
199
  end
122
200
 
123
- # At this point a single row may be turned into multiple rows via row processors
124
- # all code after this line should work with the array of rows rather than the
125
- # single row
201
+ # At this point a single row may be turned into multiple rows via row
202
+ # processors all code after this line should work with the array of
203
+ # rows rather than the single row
126
204
  rows = [row]
127
205
 
128
- begin
129
- Engine.logger.debug "Processing after read"
130
- control.after_read_processors.each do |processor|
131
- processed_rows = []
132
- rows.each do |row|
133
- processed_rows << processor.process(row)
206
+ benchmarks[:after_reads] += Benchmark.realtime do
207
+ begin
208
+ Engine.logger.debug "Processing after read"
209
+ control.after_read_processors.each do |processor|
210
+ processed_rows = []
211
+ rows.each do |row|
212
+ processed_rows << processor.process(row)
213
+ end
214
+ rows = processed_rows.flatten
134
215
  end
135
- rows = processed_rows.flatten
216
+ rescue => e
217
+ msg = "Error processing rows after read from #{Engine.current_source} on line #{Engine.current_source_row}: #{e}"
218
+ errors << msg
219
+ Engine.logger.error(msg)
220
+ exceeded_error_threshold?(control) ? break : next
136
221
  end
137
- rescue => e
138
- msg = "Error processing rows after read from #{Engine.current_source} on line #{Engine.current_source_row}: #{e}"
139
- errors << msg
140
- Engine.logger.error(msg)
141
- exceeded_error_threshold?(control) ? break : next
142
222
  end
143
223
 
144
- begin
145
- # execute transforms
146
- Engine.logger.debug "Executing transforms"
147
- rows.each do |row|
148
- row.each do |name, value|
149
- name = name.to_sym
150
- transformers = control.transform(name)
151
- #Engine.logger.debug "Transforms for #{name}: #{transformers.inspect}"
152
- row[name] = ETL::Transform::Transform.transform(name, value, row, transformers)
224
+ benchmarks[:transforms] += Benchmark.realtime do
225
+ begin
226
+ # execute transforms
227
+ Engine.logger.debug "Executing transforms"
228
+ rows.each do |row|
229
+ control.transforms.each do |transform|
230
+ name = transform.name.to_sym
231
+ row[name] = transform.transform(name, row[name], row)
232
+ end
153
233
  end
234
+ rescue => e
235
+ msg = "Error transforming from #{Engine.current_source} on line #{Engine.current_source_row}: #{e}"
236
+ errors << msg
237
+ Engine.logger.error(msg)
238
+ e.backtrace.each { |line| Engine.logger.error(line) }
239
+ exceeded_error_threshold?(control) ? break : next
154
240
  end
155
- rescue => e
156
- msg = "Error transforming from #{Engine.current_source} on line #{Engine.current_source_row}: #{e}"
157
- errors << msg
158
- Engine.logger.error(msg)
159
- e.backtrace.each { |line| Engine.logger.error(line) }
160
- exceeded_error_threshold?(control) ? break : next
161
241
  end
162
242
 
163
-
164
- begin
165
- # execute row-level "before write" processing
166
- Engine.logger.debug "Processing before write"
167
- control.before_write_processors.each do |processor|
168
- processed_rows = []
169
- rows.each do |row|
170
- processed_rows << processor.process(row)
243
+ benchmarks[:before_writes] += Benchmark.realtime do
244
+ begin
245
+ # execute row-level "before write" processing
246
+ Engine.logger.debug "Processing before write"
247
+ control.before_write_processors.each do |processor|
248
+ processed_rows = []
249
+ rows.each do |row|
250
+ processed_rows << processor.process(row)
251
+ end
252
+ rows = processed_rows.flatten.compact
171
253
  end
172
- rows = processed_rows.flatten
254
+ rescue => e
255
+ msg = "Error processing rows before write from #{Engine.current_source} on line #{Engine.current_source_row}: #{e}"
256
+ errors << msg
257
+ Engine.logger.error(msg)
258
+ e.backtrace.each { |line| Engine.logger.error(line) }
259
+ exceeded_error_threshold?(control) ? break : next
173
260
  end
174
- rescue => e
175
- msg = "Error processing rows before write from #{Engine.current_source} on line #{Engine.current_source_row}: #{e}"
176
- errors << msg
177
- Engine.logger.error(msg)
178
- e.backtrace.each { |line| Engine.logger.error(line) }
179
- exceeded_error_threshold?(control) ? break : next
180
261
  end
181
262
 
182
- begin
183
- # write the row to the destination
184
- destinations.each_with_index do |destination, index|
185
- Engine.current_destination = destination
186
- rows.each do |row|
187
- destination.write(row)
188
- Engine.rows_written += 1 if index == 0
263
+ benchmarks[:writes] += Benchmark.realtime do
264
+ begin
265
+ # write the row to the destination
266
+ destinations.each_with_index do |destination, index|
267
+ Engine.current_destination = destination
268
+ rows.each do |row|
269
+ destination.write(row)
270
+ Engine.rows_written += 1 if index == 0
271
+ end
189
272
  end
273
+ rescue => e
274
+ msg = "Error writing to #{Engine.current_destination}: #{e}"
275
+ errors << msg
276
+ Engine.logger.error msg
277
+ e.backtrace.each { |line| Engine.logger.error(line) }
278
+ exceeded_error_threshold?(control) ? break : next
190
279
  end
191
- rescue => e
192
- msg = "Error writing to #{Engine.current_destination}: #{e}"
193
- errors << msg
194
- Engine.logger.error msg
195
- e.backtrace.each { |line| Engine.logger.error(line) }
196
- break if exceeded_error_threshold?(control)
197
280
  end
198
281
  end
199
282
 
@@ -220,6 +303,20 @@ module ETL #:nodoc:
220
303
  say "Wrote #{Engine.rows_written} lines to destinations"
221
304
  end
222
305
  say "Completed #{control.file} in #{distance_of_time_in_words(start_time)} with #{errors.length} errors."
306
+ say "Processing average: #{Engine.average_rows_per_second} rows/sec)"
307
+
308
+ say "Avg after_reads: #{Engine.rows_read/benchmarks[:after_reads]} rows/sec" if benchmarks[:after_reads] > 0
309
+ say "Avg before_writes: #{Engine.rows_read/benchmarks[:before_writes]} rows/sec" if benchmarks[:before_writes] > 0
310
+ say "Avg transforms: #{Engine.rows_read/benchmarks[:transforms]} rows/sec" if benchmarks[:transforms] > 0
311
+ say "Avg writes: #{Engine.rows_read/benchmarks[:writes]} rows/sec" if benchmarks[:writes] > 0
312
+
313
+ # ETL::Transform::Transform.benchmarks.each do |klass, t|
314
+ # say "Avg #{klass}: #{Engine.rows_read/t} rows/sec"
315
+ # end
316
+
317
+ ETL::Engine.job.completed_at = Time.now
318
+ ETL::Engine.job.status = (errors.length > 0 ? 'completed with errors' : 'completed')
319
+ ETL::Engine.job.save!
223
320
  end
224
321
 
225
322
  private
@@ -285,6 +382,10 @@ module ETL #:nodoc:
285
382
  s
286
383
  end
287
384
 
385
+ # Get the approximate distance of time in words from the given from_time
386
+ # to the given to_time. If to_time is not specified then it is set
387
+ # to Time.now. By default seconds are included...set the include_seconds
388
+ # argument to false to disable the seconds.
288
389
  def approximate_distance_of_time_in_words(from_time, to_time=Time.now, include_seconds=true)
289
390
  from_time = from_time.to_time if from_time.respond_to?(:to_time)
290
391
  to_time = to_time.to_time if to_time.respond_to?(:to_time)