activewarehouse-etl 0.5.2 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. data/CHANGELOG +41 -13
  2. data/README +1 -1
  3. data/Rakefile +14 -4
  4. data/TODO +17 -1
  5. data/bin/etl +3 -1
  6. data/lib/etl.rb +11 -7
  7. data/lib/etl/commands/etl.rb +0 -1
  8. data/lib/etl/control/control.rb +113 -36
  9. data/lib/etl/control/destination.rb +13 -1
  10. data/lib/etl/control/destination/database_destination.rb +3 -1
  11. data/lib/etl/control/destination/file_destination.rb +5 -2
  12. data/lib/etl/control/source.rb +36 -0
  13. data/lib/etl/control/source/database_source.rb +63 -8
  14. data/lib/etl/control/source/file_source.rb +25 -4
  15. data/lib/etl/engine.rb +128 -14
  16. data/lib/etl/generator/surrogate_key_generator.rb +1 -0
  17. data/lib/etl/http_tools.rb +119 -0
  18. data/lib/etl/parser/apache_combined_log_parser.rb +47 -0
  19. data/lib/etl/parser/sax_parser.rb +18 -6
  20. data/lib/etl/processor.rb +1 -0
  21. data/lib/etl/processor/bulk_import_processor.rb +12 -0
  22. data/lib/etl/processor/hierarchy_exploder_processor.rb +54 -0
  23. data/lib/etl/processor/processor.rb +1 -5
  24. data/lib/etl/processor/row_processor.rb +17 -0
  25. data/lib/etl/transform/date_to_string_transform.rb +1 -1
  26. data/lib/etl/transform/decode_transform.rb +1 -1
  27. data/lib/etl/transform/default_transform.rb +15 -0
  28. data/lib/etl/transform/foreign_key_lookup_transform.rb +1 -1
  29. data/lib/etl/transform/hierarchy_lookup_transform.rb +56 -0
  30. data/lib/etl/transform/sha1_transform.rb +1 -1
  31. data/lib/etl/transform/string_to_date_transform.rb +3 -3
  32. data/lib/etl/transform/string_to_datetime_transform.rb +17 -0
  33. data/lib/etl/transform/string_to_time_transform.rb +14 -0
  34. data/lib/etl/transform/transform.rb +8 -4
  35. data/lib/etl/transform/type_transform.rb +2 -2
  36. data/lib/etl/version.rb +2 -2
  37. metadata +21 -8
  38. data/lib/etl/active_record_ext.rb +0 -1
  39. data/lib/etl/active_record_ext/connection_adapters/mysql_adapter.rb +0 -34
@@ -26,6 +26,7 @@ module ETL #:nodoc:
26
26
  # * <tt>:username</tt>: The database username (defaults to 'root')
27
27
  # * <tt>:password</tt>: The password to the database (defaults to nothing)
28
28
  # * <tt>:host</tt>: The host for the database (defaults to 'localhost')
29
+ # * <tt>:append_rows</tt>: Array of rows to append
29
30
  #
30
31
  # Mapping options:
31
32
  # * <tt>:order</tt>: The order of fields to write (REQUIRED)
@@ -44,7 +45,7 @@ module ETL #:nodoc:
44
45
  conn.transaction do
45
46
  conn.truncate(configuration[:table]) if truncate
46
47
 
47
- buffer.each do |row|
48
+ buffer.flatten.each do |row|
48
49
  # check to see if this row's compound key constraint already exists
49
50
  # note that the compound key constraint may not utilize virtual fields
50
51
  next unless row_allowed?(row)
@@ -69,6 +70,7 @@ module ETL #:nodoc:
69
70
 
70
71
  # Close the connection
71
72
  def close
73
+ buffer << append_rows if append_rows
72
74
  flush
73
75
  ETL::ActiveRecord::Base.connection.disconnect!
74
76
  end
@@ -32,6 +32,7 @@ module ETL #:nodoc:
32
32
  # * <tt>:eol</tt>: End of line marker (default is \n)
33
33
  # * <tt>:enclose</tt>: Enclosure character (default is none)
34
34
  # * <tt>:unique</tt>: Set to true to only write unique records
35
+ # * <tt>:append_rows</tt>: Array of rows to append
35
36
  #
36
37
  # Mapping options:
37
38
  # * <tt>:order</tt>: The order array
@@ -50,14 +51,15 @@ module ETL #:nodoc:
50
51
 
51
52
  # Close the destination. This will flush the buffer and close the underlying stream or connection.
52
53
  def close
54
+ buffer << append_rows if append_rows
53
55
  flush
54
56
  f.close
55
57
  end
56
58
 
57
59
  # Flush the destination buffer
58
60
  def flush
59
- # Engine.logger.debug "Flushing buffer #{buffer}"
60
- buffer.each do |row|
61
+ #puts "Flushing buffer (#{file}) with #{buffer.length} rows"
62
+ buffer.flatten.each do |row|
61
63
  # check to see if this row's compound key constraint already exists
62
64
  # note that the compound key constraint may not utilize virtual fields
63
65
  next unless row_allowed?(row)
@@ -80,6 +82,7 @@ module ETL #:nodoc:
80
82
  f.write(eol)
81
83
  end
82
84
  buffer.clear
85
+ #puts "After flush there are #{buffer.length} rows"
83
86
  end
84
87
 
85
88
  private
@@ -13,6 +13,9 @@ module ETL #:nodoc:
13
13
  # The definition Hash
14
14
  attr_accessor :definition
15
15
 
16
+ # Returns true if the source data should be stored locally for archival
17
+ attr_accessor :store_locally
18
+
16
19
  class << self
17
20
  # Convert the name to a Source class.
18
21
  #
@@ -26,15 +29,48 @@ module ETL #:nodoc:
26
29
  # * <tt>control</tt>: The control object
27
30
  # * <tt>configuration</tt>: The configuration hash
28
31
  # * <tt>definition</tt>: The source layout definition
32
+ #
33
+ # Configuration options:
34
+ # * <tt>:store_locally</tt>: Set to false to not store source data locally (defaults to true)
29
35
  def initialize(control, configuration, definition)
30
36
  @control = control
31
37
  @configuration = configuration
32
38
  @definition = definition
39
+
40
+ @store_locally = configuration[:store_locally] || true
33
41
  end
34
42
 
35
43
  def errors
36
44
  @errors ||= []
37
45
  end
46
+
47
+ # Get a timestamp value as a string
48
+ def timestamp
49
+ Engine.timestamp
50
+ end
51
+
52
+ # The base directory where local files are stored.
53
+ attr_accessor :local_base
54
+
55
+ # Get the local base, defaults to 'source_data'
56
+ def local_base
57
+ @local_base ||= 'source_data'
58
+ end
59
+
60
+ # The local directory for storing. This method must be overridden by subclasses
61
+ def local_directory
62
+ raise "local_directory method is abstract"
63
+ end
64
+
65
+ # Return the local file for storing the raw source data. Each call to this method will
66
+ # result in a timestamped file, so you cannot expect to call it multiple times and reference
67
+ # the same file
68
+ def local_file
69
+ local_dir = local_directory
70
+ FileUtils.mkdir_p(local_dir)
71
+ File.join(local_dir, "#{timestamp}.csv")
72
+ end
73
+
38
74
  end
39
75
  end
40
76
  end
@@ -1,4 +1,10 @@
1
+ require 'fileutils'
2
+
1
3
  module ETL #:nodoc:
4
+ class Source < ::ActiveRecord::Base #:nodoc:
5
+ # Connection for database sources
6
+ end
7
+
2
8
  module Control #:nodoc:
3
9
  # Source object which extracts data from a database using ActiveRecord.
4
10
  class DatabaseSource < Source
@@ -18,24 +24,73 @@ module ETL #:nodoc:
18
24
  # * <tt>:username</tt>: The database username (defaults to 'root')
19
25
  # * <tt>:password</tt>: The password to the database (defaults to nothing)
20
26
  # * <tt>:host</tt>: The host for the database (defaults to 'localhost')
27
+ # * <tt>:store_locally</tt>: Set to false to not store a copy of the source data locally
28
+ # in a flat file (defaults to true)
21
29
  def initialize(control, configuration, definition)
22
30
  super
23
31
  connect
24
32
  end
25
33
 
34
+ # Get a String identifier for the source
26
35
  def to_s
27
- "#{configuration[:host]}/#{configuration[:database]}"
36
+ "#{host}/#{configuration[:database]}/#{configuration[:table]}"
37
+ end
38
+
39
+ # Get the local directory to use, which is a combination of the local_base, the db hostname,
40
+ # the db database name and the db table.
41
+ def local_directory
42
+ File.join(local_base, host, configuration[:database], configuration[:table])
28
43
  end
29
44
 
30
45
  # Returns each row from the source
31
46
  def each
32
- conn = ETL::ActiveRecord::Base.connection
33
- conn.select_all("SELECT * FROM #{configuration[:table]}").each do |row|
34
- yield row
47
+ if store_locally
48
+ file = local_file
49
+ columns = connection.columns(configuration[:table].to_s)
50
+ FasterCSV.open(file, 'w') do |f|
51
+ f << columns.collect { |column| column.name }
52
+ connection.select_all("SELECT * FROM #{configuration[:table]}").each do |row|
53
+ values = columns.collect { |column| row[column.name] }
54
+ #puts "row: #{values.inspect}"
55
+ f << values
56
+ end
57
+ end
58
+ FasterCSV.open(file, :headers => true).each do |row|
59
+ result_row = {}
60
+ row.each do |header, field|
61
+ result_row[header.to_sym] = field
62
+ end
63
+ #puts "yielding #{result_row.inspect}"
64
+ yield result_row
65
+ end
66
+ else
67
+ connection.select_all("SELECT * FROM #{configuration[:table]}").each do |row|
68
+ yield HashWithIndifferentAccess.new(row)
69
+ end
35
70
  end
36
71
  end
37
72
 
38
73
  private
74
+ # Get the database connection to use
75
+ def connection
76
+ ETL::Source.connection
77
+ end
78
+
79
+ # Get the adapter name, defaults to :mysql
80
+ def adapter
81
+ configuration[:adapter] || :mysql
82
+ end
83
+
84
+ # Get the host, defaults to 'localhost'
85
+ def host
86
+ configuration[:host] || "localhost"
87
+ end
88
+
89
+ # Get the username, defaults to 'root'
90
+ def username
91
+ configuration[:username] || 'root'
92
+ end
93
+
39
94
  # Connect to the database.
40
95
  #
41
96
  # Required options:
@@ -47,10 +102,10 @@ module ETL #:nodoc:
47
102
  # * <tt>:password</tt>: The password to the database (defaults to nothing)
48
103
  # * <tt>:host</tt>: The host for the database (defaults to 'localhost')
49
104
  def connect
50
- ETL::ActiveRecord::Base.establish_connection(
51
- :adapter => (configuration[:adapter] || :mysql),
52
- :username => (configuration[:username] || 'root'),
53
- :host => (configuration[:host] || 'localhost'),
105
+ ETL::Source.establish_connection(
106
+ :adapter => adapter,
107
+ :username => username,
108
+ :host => host,
54
109
  :password => configuration[:password],
55
110
  :database => configuration[:database]
56
111
  )
@@ -11,25 +11,46 @@ module ETL #:nodoc:
11
11
  # Initialize the source
12
12
  #
13
13
  # Configuration options:
14
- # * <tt>:parser</tt>: One of the following: a parser name as a String or symbol, a class which extends from Parser,
15
- # a Hash with :name and optionally an :options key. Whether or not the parser uses the options is dependent on
16
- # which parser is used. See the documentation for each parser for information on what options it accepts.
14
+ # * <tt>:file</tt>: The source file
15
+ # * <tt>:parser</tt>: One of the following: a parser name as a String or symbol, a class which extends
16
+ # from Parser, a Hash with :name and optionally an :options key. Whether or not the parser uses the
17
+ # options is dependent on which parser is used. See the documentation for each parser for information
18
+ # on what options it accepts.
17
19
  # * <tt>:skip_lines</tt>: The number of lines to skip (defaults to 0)
20
+ # * <tt>:store_locally</tt>: Set to false to not store a copy of the source data locally for archival
18
21
  def initialize(control, configuration, definition)
19
22
  super
20
23
  configure
21
24
  end
22
25
 
26
+ # Get a String identifier for the source
23
27
  def to_s
24
28
  configuration[:file]
25
29
  end
26
30
 
31
+ # Get the local storage directory
32
+ def local_directory
33
+ File.join(local_base, File.basename(configuration[:file], File.extname(configuration[:file])))
34
+ end
35
+
27
36
  # Returns each row from the source
28
37
  def each
29
- @parser.each { |row| yield row }
38
+ copy_sources if store_locally
39
+ @parser.each do |row|
40
+ yield row
41
+ end
30
42
  end
31
43
 
32
44
  private
45
+ # Copy source data to a local directory structure
46
+ def copy_sources
47
+ source_file = File.join(File.dirname(control.file), configuration[:file])
48
+ Dir.glob(source_file).each do |f|
49
+ next if File.directory?(f)
50
+ FileUtils.cp(f, local_file)
51
+ end
52
+ end
53
+
33
54
  # Configure the source
34
55
  def configure
35
56
  case @configuration[:parser]
data/lib/etl/engine.rb CHANGED
@@ -9,7 +9,6 @@ module ETL #:nodoc:
9
9
 
10
10
  # The main ETL engine class
11
11
  class Engine
12
-
13
12
  class << self
14
13
  # Process the specified control file. Acceptable values for control_file are
15
14
  # * Path to a file
@@ -24,23 +23,49 @@ module ETL #:nodoc:
24
23
 
25
24
  def logger #:nodoc:
26
25
  unless @logger
27
- @logger = Logger.new('etl.log')
26
+ @logger = Logger.new("etl_#{timestamp}.log")
28
27
  @logger.level = Logger::WARN
29
28
  end
30
29
  @logger
31
30
  end
32
31
 
32
+ # Get a timestamp value as a string
33
+ def timestamp
34
+ Time.now.strftime("%Y%m%d%H%M%S")
35
+ end
36
+
37
+ # The current source
33
38
  attr_accessor :current_source
39
+
40
+ # The current source row
34
41
  attr_accessor :current_source_row
42
+
43
+ # The current destination
35
44
  attr_accessor :current_destination
36
45
 
46
+ # Set to true to activate realtime activity. This will cause certain information messages
47
+ # to be printed to STDOUT
37
48
  attr_accessor :realtime_activity
49
+
50
+ attr_accessor :rows_read
51
+
52
+ def rows_read
53
+ @rows_read ||= 0
54
+ end
55
+
56
+ attr_accessor :rows_written
57
+
58
+ def rows_written
59
+ @rows_written ||= 0
60
+ end
38
61
  end
39
62
 
63
+ # Say the specified message, with a newline
40
64
  def say(message)
41
65
  say_without_newline(message + "\n")
42
66
  end
43
67
 
68
+ # Say the specified message without a newline
44
69
  def say_without_newline(message)
45
70
  if Engine.realtime_activity
46
71
  $stdout.print message
@@ -48,10 +73,12 @@ module ETL #:nodoc:
48
73
  end
49
74
  end
50
75
 
76
+ # Say the message on its own line
51
77
  def say_on_own_line(message)
52
78
  say("\n" + message)
53
79
  end
54
80
 
81
+ # Array of errors encountered during execution of the ETL process
55
82
  def errors
56
83
  @errors ||= []
57
84
  end
@@ -61,9 +88,12 @@ module ETL #:nodoc:
61
88
  # * File object
62
89
  # * ETL::Control::Control instance
63
90
  def process(control)
64
- start_time = Time.now
65
91
  control = ETL::Control::Control.resolve(control)
66
92
 
93
+ execute_dependencies(control)
94
+
95
+ start_time = Time.now
96
+
67
97
  Engine.logger.debug "Pre-processing #{control.file}"
68
98
  pre_process(control)
69
99
  Engine.logger.debug "Pre-processing complete"
@@ -76,40 +106,93 @@ module ETL #:nodoc:
76
106
  Engine.logger.debug "Processing source #{source}"
77
107
  say "Source: #{source}"
78
108
  source.each_with_index do |row, index|
109
+ Engine.logger.debug "Row #{index}: #{row.inspect}"
110
+ Engine.rows_read += 1
79
111
  Engine.current_source_row = index + 1
80
112
  if Engine.realtime_activity && index > 0 && index % 1000 == 0
81
113
  say_without_newline "."
82
114
  end
83
115
 
116
+ # At this point a single row may be turned into multiple rows via row processors;
117
+ # all code after this line should work with the array of rows rather than the
118
+ # single row
119
+ rows = [row]
120
+
121
+ begin
122
+ Engine.logger.debug "Processing after read"
123
+ control.after_read_processors.each do |processor|
124
+ processed_rows = []
125
+ rows.each do |row|
126
+ processed_rows << processor.process(row)
127
+ end
128
+ rows = processed_rows.flatten
129
+ end
130
+ rescue => e
131
+ msg = "Error processing rows after read from #{Engine.current_source} on line #{Engine.current_source_row}: #{e}"
132
+ errors << msg
133
+ Engine.logger.error(msg)
134
+ exceeded_error_threshold?(control) ? break : next
135
+ end
136
+
84
137
  begin
85
138
  # execute transforms
86
- row.each do |name, value|
87
- row[name] = ETL::Transform::Transform.transform(name, value, control.transform(name))
139
+ Engine.logger.debug "Executing transforms"
140
+ rows.each do |row|
141
+ row.each do |name, value|
142
+ name = name.to_sym
143
+ transformers = control.transform(name)
144
+ #Engine.logger.debug "Transforms for #{name}: #{transformers.inspect}"
145
+ row[name] = ETL::Transform::Transform.transform(name, value, row, transformers)
146
+ end
88
147
  end
89
148
  rescue => e
90
- msg = "Error transforming from #{source} on line #{index}: #{e}"
149
+ msg = "Error transforming from #{Engine.current_source} on line #{Engine.current_source_row}: #{e}"
91
150
  errors << msg
92
- Engine.logger.error msg
93
- break if exceeded_error_threshold?(control)
151
+ Engine.logger.error(msg)
152
+ e.backtrace.each { |line| Engine.logger.error(line) }
153
+ exceeded_error_threshold?(control) ? break : next
154
+ end
155
+
156
+
157
+ begin
158
+ # execute row-level "before write" processing
159
+ Engine.logger.debug "Processing before write"
160
+ control.before_write_processors.each do |processor|
161
+ processed_rows = []
162
+ rows.each do |row|
163
+ processed_rows << processor.process(row)
164
+ end
165
+ rows = processed_rows.flatten
166
+ end
167
+ rescue => e
168
+ msg = "Error processing rows before write from #{Engine.current_source} on line #{Engine.current_source_row}: #{e}"
169
+ errors << msg
170
+ Engine.logger.error(msg)
171
+ e.backtrace.each { |line| Engine.logger.error(line) }
172
+ exceeded_error_threshold?(control) ? break : next
94
173
  end
95
174
 
96
175
  begin
97
176
  # write the row to the destination
98
- destinations.each do |destination|
177
+ destinations.each_with_index do |destination, index|
99
178
  Engine.current_destination = destination
100
- destination.write(row)
179
+ rows.each do |row|
180
+ destination.write(row)
181
+ Engine.rows_written += 1 if index == 0
182
+ end
101
183
  end
102
- rescue
103
- msg = "Error writing to #{destination} on line #{index}"
184
+ rescue => e
185
+ msg = "Error writing to #{Engine.current_destination}: #{e}"
104
186
  errors << msg
105
187
  Engine.logger.error msg
188
+ e.backtrace.each { |line| Engine.logger.error(line) }
106
189
  break if exceeded_error_threshold?(control)
107
190
  end
108
191
  end
192
+
109
193
  if exceeded_error_threshold?(control)
110
194
  say_on_own_line "Exiting due to exceeding error threshold: #{control.error_threshold}"
111
- else
112
- say_on_own_line "Processed #{Engine.current_source_row} rows in #{distance_of_time_in_words(start_time)} with #{errors} errors."
195
+ return
113
196
  end
114
197
 
115
198
  destinations.each do |destination|
@@ -117,12 +200,23 @@ module ETL #:nodoc:
117
200
  end
118
201
  end
119
202
 
203
+ say_on_own_line "Executing post processes"
120
204
  Engine.logger.debug "Post-processing #{control.file}"
121
205
  post_process(control)
122
206
  Engine.logger.debug "Post-processing complete"
207
+ say "Post-processing complete"
208
+
209
+ if sources.length > 0
210
+ say_on_own_line "Read #{Engine.rows_read} lines from sources"
211
+ end
212
+ if destinations.length > 0
213
+ say "Wrote #{Engine.rows_written} lines to destinations"
214
+ end
215
+ say "Completed #{control.file} in #{distance_of_time_in_words(start_time)} with #{errors.length} errors."
123
216
  end
124
217
 
125
218
  private
219
+ # Return true if the error threshold is exceeded
126
220
  def exceeded_error_threshold?(control)
127
221
  errors.length > control.error_threshold
128
222
  end
@@ -141,6 +235,26 @@ module ETL #:nodoc:
141
235
  end
142
236
  end
143
237
 
238
+ # Execute all dependencies
239
+ def execute_dependencies(control)
240
+ Engine.logger.debug "Executing dependencies"
241
+ control.dependencies.flatten.each do |dependency|
242
+ case dependency
243
+ when Symbol
244
+ f = dependency.to_s + '.ctl'
245
+ Engine.logger.debug "Executing dependency: #{f}"
246
+ say "Executing dependency: #{f}"
247
+ process(f)
248
+ when String
249
+ Engine.logger.debug "Executing dependency: #{f}"
250
+ say "Executing dependency: #{f}"
251
+ process(dependency)
252
+ else
253
+ raise "Invalid dependency type: #{dependency.class}"
254
+ end
255
+ end
256
+ end
257
+
144
258
  # Return the distance of time in words from the given from_time to the specified to_time. If to_time
145
259
  # is not specified then Time.now is used. By default seconds are included...set the include_seconds
146
260
  # argument to false to disable the seconds.