activewarehouse-etl 0.5.2 → 0.6.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (39) hide show
  1. data/CHANGELOG +41 -13
  2. data/README +1 -1
  3. data/Rakefile +14 -4
  4. data/TODO +17 -1
  5. data/bin/etl +3 -1
  6. data/lib/etl.rb +11 -7
  7. data/lib/etl/commands/etl.rb +0 -1
  8. data/lib/etl/control/control.rb +113 -36
  9. data/lib/etl/control/destination.rb +13 -1
  10. data/lib/etl/control/destination/database_destination.rb +3 -1
  11. data/lib/etl/control/destination/file_destination.rb +5 -2
  12. data/lib/etl/control/source.rb +36 -0
  13. data/lib/etl/control/source/database_source.rb +63 -8
  14. data/lib/etl/control/source/file_source.rb +25 -4
  15. data/lib/etl/engine.rb +128 -14
  16. data/lib/etl/generator/surrogate_key_generator.rb +1 -0
  17. data/lib/etl/http_tools.rb +119 -0
  18. data/lib/etl/parser/apache_combined_log_parser.rb +47 -0
  19. data/lib/etl/parser/sax_parser.rb +18 -6
  20. data/lib/etl/processor.rb +1 -0
  21. data/lib/etl/processor/bulk_import_processor.rb +12 -0
  22. data/lib/etl/processor/hierarchy_exploder_processor.rb +54 -0
  23. data/lib/etl/processor/processor.rb +1 -5
  24. data/lib/etl/processor/row_processor.rb +17 -0
  25. data/lib/etl/transform/date_to_string_transform.rb +1 -1
  26. data/lib/etl/transform/decode_transform.rb +1 -1
  27. data/lib/etl/transform/default_transform.rb +15 -0
  28. data/lib/etl/transform/foreign_key_lookup_transform.rb +1 -1
  29. data/lib/etl/transform/hierarchy_lookup_transform.rb +56 -0
  30. data/lib/etl/transform/sha1_transform.rb +1 -1
  31. data/lib/etl/transform/string_to_date_transform.rb +3 -3
  32. data/lib/etl/transform/string_to_datetime_transform.rb +17 -0
  33. data/lib/etl/transform/string_to_time_transform.rb +14 -0
  34. data/lib/etl/transform/transform.rb +8 -4
  35. data/lib/etl/transform/type_transform.rb +2 -2
  36. data/lib/etl/version.rb +2 -2
  37. metadata +21 -8
  38. data/lib/etl/active_record_ext.rb +0 -1
  39. data/lib/etl/active_record_ext/connection_adapters/mysql_adapter.rb +0 -34
@@ -26,6 +26,7 @@ module ETL #:nodoc:
26
26
  # * <tt>:username</tt>: The database username (defaults to 'root')
27
27
  # * <tt>:password</tt>: The password to the database (defaults to nothing)
28
28
  # * <tt>:host</tt>: The host for the database (defaults to 'localhost')
29
+ # * <tt>:append_rows</tt>: Array of rows to append
29
30
  #
30
31
  # Mapping options:
31
32
  # * <tt>:order</tt>: The order of fields to write (REQUIRED)
@@ -44,7 +45,7 @@ module ETL #:nodoc:
44
45
  conn.transaction do
45
46
  conn.truncate(configuration[:table]) if truncate
46
47
 
47
- buffer.each do |row|
48
+ buffer.flatten.each do |row|
48
49
  # check to see if this row's compound key constraint already exists
49
50
  # note that the compound key constraint may not utilize virtual fields
50
51
  next unless row_allowed?(row)
@@ -69,6 +70,7 @@ module ETL #:nodoc:
69
70
 
70
71
  # Close the connection
71
72
  def close
73
+ buffer << append_rows if append_rows
72
74
  flush
73
75
  ETL::ActiveRecord::Base.connection.disconnect!
74
76
  end
@@ -32,6 +32,7 @@ module ETL #:nodoc:
32
32
  # * <tt>:eol</tt>: End of line marker (default is \n)
33
33
  # * <tt>:enclose</tt>: Enclosure character (default is none)
34
34
  # * <tt>:unique</tt>: Set to true to only write unique records
35
+ # * <tt>:append_rows</tt>: Array of rows to append
35
36
  #
36
37
  # Mapping options:
37
38
  # * <tt>:order</tt>: The order array
@@ -50,14 +51,15 @@ module ETL #:nodoc:
50
51
 
51
52
  # Close the destination. This will flush the buffer and close the underlying stream or connection.
52
53
  def close
54
+ buffer << append_rows if append_rows
53
55
  flush
54
56
  f.close
55
57
  end
56
58
 
57
59
  # Flush the destination buffer
58
60
  def flush
59
- # Engine.logger.debug "Flushing buffer #{buffer}"
60
- buffer.each do |row|
61
+ #puts "Flushing buffer (#{file}) with #{buffer.length} rows"
62
+ buffer.flatten.each do |row|
61
63
  # check to see if this row's compound key constraint already exists
62
64
  # note that the compound key constraint may not utilize virtual fields
63
65
  next unless row_allowed?(row)
@@ -80,6 +82,7 @@ module ETL #:nodoc:
80
82
  f.write(eol)
81
83
  end
82
84
  buffer.clear
85
+ #puts "After flush there are #{buffer.length} rows"
83
86
  end
84
87
 
85
88
  private
@@ -13,6 +13,9 @@ module ETL #:nodoc:
13
13
  # The definition Hash
14
14
  attr_accessor :definition
15
15
 
16
+ # Returns true if the source data should be stored locally for archival
17
+ attr_accessor :store_locally
18
+
16
19
  class << self
17
20
  # Convert the name to a Source class.
18
21
  #
@@ -26,15 +29,48 @@ module ETL #:nodoc:
26
29
  # * <tt>control</tt>: The control object
27
30
  # * <tt>configuration</tt>: The configuration hash
28
31
  # * <tt>definition</tt>: The source layout definition
32
+ #
33
+ # Configuration options:
34
+ # * <tt>:store_locally</tt>: Set to false to not store source data locally (defaults to true)
29
35
  def initialize(control, configuration, definition)
30
36
  @control = control
31
37
  @configuration = configuration
32
38
  @definition = definition
39
+
40
+ @store_locally = configuration[:store_locally] || true
33
41
  end
34
42
 
35
43
  def errors
36
44
  @errors ||= []
37
45
  end
46
+
47
+ # Get a timestamp value as a string
48
+ def timestamp
49
+ Engine.timestamp
50
+ end
51
+
52
+ # The base directory where local files are stored.
53
+ attr_accessor :local_base
54
+
55
+ # Get the local base, defaults to 'source_data'
56
+ def local_base
57
+ @local_base ||= 'source_data'
58
+ end
59
+
60
+ # The local directory for storing. This method must be overridden by subclasses
61
+ def local_directory
62
+ raise "local_directory method is abstract"
63
+ end
64
+
65
+ # Return the local file for storing the raw source data. Each call to this method will
66
+ # result in a timestamped file, so you cannot expect to call it multiple times and reference
67
+ # the same file
68
+ def local_file
69
+ local_dir = local_directory
70
+ FileUtils.mkdir_p(local_dir)
71
+ File.join(local_dir, "#{timestamp}.csv")
72
+ end
73
+
38
74
  end
39
75
  end
40
76
  end
@@ -1,4 +1,10 @@
1
+ require 'fileutils'
2
+
1
3
  module ETL #:nodoc:
4
+ class Source < ::ActiveRecord::Base #:nodoc:
5
+ # Connection for database sources
6
+ end
7
+
2
8
  module Control #:nodoc:
3
9
  # Source object which extracts data from a database using ActiveRecord.
4
10
  class DatabaseSource < Source
@@ -18,24 +24,73 @@ module ETL #:nodoc:
18
24
  # * <tt>:username</tt>: The database username (defaults to 'root')
19
25
  # * <tt>:password</tt>: The password to the database (defaults to nothing)
20
26
  # * <tt>:host</tt>: The host for the database (defaults to 'localhost')
27
+ # * <tt>:store_locally</tt>: Set to false to not store a copy of the source data locally
28
+ # in a flat file (defaults to true)
21
29
  def initialize(control, configuration, definition)
22
30
  super
23
31
  connect
24
32
  end
25
33
 
34
+ # Get a String identifier for the source
26
35
  def to_s
27
- "#{configuration[:host]}/#{configuration[:database]}"
36
+ "#{host}/#{configuration[:database]}/#{configuration[:table]}"
37
+ end
38
+
39
+ # Get the local directory to use, which is a combination of the local_base, the db hostname,
40
+ # the db database name and the db table.
41
+ def local_directory
42
+ File.join(local_base, host, configuration[:database], configuration[:table])
28
43
  end
29
44
 
30
45
  # Returns each row from the source
31
46
  def each
32
- conn = ETL::ActiveRecord::Base.connection
33
- conn.select_all("SELECT * FROM #{configuration[:table]}").each do |row|
34
- yield row
47
+ if store_locally
48
+ file = local_file
49
+ columns = connection.columns(configuration[:table].to_s)
50
+ FasterCSV.open(file, 'w') do |f|
51
+ f << columns.collect { |column| column.name }
52
+ connection.select_all("SELECT * FROM #{configuration[:table]}").each do |row|
53
+ values = columns.collect { |column| row[column.name] }
54
+ #puts "row: #{values.inspect}"
55
+ f << values
56
+ end
57
+ end
58
+ FasterCSV.open(file, :headers => true).each do |row|
59
+ result_row = {}
60
+ row.each do |header, field|
61
+ result_row[header.to_sym] = field
62
+ end
63
+ #puts "yielding #{result_row.inspect}"
64
+ yield result_row
65
+ end
66
+ else
67
+ connection.select_all("SELECT * FROM #{configuration[:table]}").each do |row|
68
+ yield HashWithIndifferentAccess.new(row)
69
+ end
35
70
  end
36
71
  end
37
72
 
38
73
  private
74
+ # Get the database connection to use
75
+ def connection
76
+ ETL::Source.connection
77
+ end
78
+
79
+ # Get the adapter name, defaults to :mysql
80
+ def adapter
81
+ configuration[:adapter] || :mysql
82
+ end
83
+
84
+ # Get the host, defaults to 'localhost'
85
+ def host
86
+ configuration[:host] || "localhost"
87
+ end
88
+
89
+ # Get the username, defaults to 'root'
90
+ def username
91
+ configuration[:username] || 'root'
92
+ end
93
+
39
94
  # Connect to the database.
40
95
  #
41
96
  # Required options:
@@ -47,10 +102,10 @@ module ETL #:nodoc:
47
102
  # * <tt>:password</tt>: The password to the database (defaults to nothing)
48
103
  # * <tt>:host</tt>: The host for the database (defaults to 'localhost')
49
104
  def connect
50
- ETL::ActiveRecord::Base.establish_connection(
51
- :adapter => (configuration[:adapter] || :mysql),
52
- :username => (configuration[:username] || 'root'),
53
- :host => (configuration[:host] || 'localhost'),
105
+ ETL::Source.establish_connection(
106
+ :adapter => adapter,
107
+ :username => username,
108
+ :host => host,
54
109
  :password => configuration[:password],
55
110
  :database => configuration[:database]
56
111
  )
@@ -11,25 +11,46 @@ module ETL #:nodoc:
11
11
  # Initialize the source
12
12
  #
13
13
  # Configuration options:
14
- # * <tt>:parser</tt>: One of the following: a parser name as a String or symbol, a class which extends from Parser,
15
- # a Hash with :name and optionally an :options key. Whether or not the parser uses the options is dependent on
16
- # which parser is used. See the documentation for each parser for information on what options it accepts.
14
+ # * <tt>:file</tt>: The source file
15
+ # * <tt>:parser</tt>: One of the following: a parser name as a String or symbol, a class which extends
16
+ # from Parser, a Hash with :name and optionally an :options key. Whether or not the parser uses the
17
+ # options is dependent on which parser is used. See the documentation for each parser for information
18
+ # on what options it accepts.
17
19
  # * <tt>:skip_lines</tt>: The number of lines to skip (defaults to 0)
20
+ # * <tt>:store_locally</tt>: Set to false to not store a copy of the source data locally for archival
18
21
  def initialize(control, configuration, definition)
19
22
  super
20
23
  configure
21
24
  end
22
25
 
26
+ # Get a String identifier for the source
23
27
  def to_s
24
28
  configuration[:file]
25
29
  end
26
30
 
31
+ # Get the local storage directory
32
+ def local_directory
33
+ File.join(local_base, File.basename(configuration[:file], File.extname(configuration[:file])))
34
+ end
35
+
27
36
  # Returns each row from the source
28
37
  def each
29
- @parser.each { |row| yield row }
38
+ copy_sources if store_locally
39
+ @parser.each do |row|
40
+ yield row
41
+ end
30
42
  end
31
43
 
32
44
  private
45
+ # Copy source data to a local directory structure
46
+ def copy_sources
47
+ source_file = File.join(File.dirname(control.file), configuration[:file])
48
+ Dir.glob(source_file).each do |f|
49
+ next if File.directory?(f)
50
+ FileUtils.cp(f, local_file)
51
+ end
52
+ end
53
+
33
54
  # Configure the source
34
55
  def configure
35
56
  case @configuration[:parser]
data/lib/etl/engine.rb CHANGED
@@ -9,7 +9,6 @@ module ETL #:nodoc:
9
9
 
10
10
  # The main ETL engine class
11
11
  class Engine
12
-
13
12
  class << self
14
13
  # Process the specified control file. Acceptable values for control_file are
15
14
  # * Path to a file
@@ -24,23 +23,49 @@ module ETL #:nodoc:
24
23
 
25
24
  def logger #:nodoc:
26
25
  unless @logger
27
- @logger = Logger.new('etl.log')
26
+ @logger = Logger.new("etl_#{timestamp}.log")
28
27
  @logger.level = Logger::WARN
29
28
  end
30
29
  @logger
31
30
  end
32
31
 
32
+ # Get a timestamp value as a string
33
+ def timestamp
34
+ Time.now.strftime("%Y%m%d%H%M%S")
35
+ end
36
+
37
+ # The current source
33
38
  attr_accessor :current_source
39
+
40
+ # The current source row
34
41
  attr_accessor :current_source_row
42
+
43
+ # The current destination
35
44
  attr_accessor :current_destination
36
45
 
46
+ # Set to true to activate realtime activity. This will cause certain information messages
47
+ # to be printed to STDOUT
37
48
  attr_accessor :realtime_activity
49
+
50
+ attr_accessor :rows_read
51
+
52
+ def rows_read
53
+ @rows_read ||= 0
54
+ end
55
+
56
+ attr_accessor :rows_written
57
+
58
+ def rows_written
59
+ @rows_written ||= 0
60
+ end
38
61
  end
39
62
 
63
+ # Say the specified message, with a newline
40
64
  def say(message)
41
65
  say_without_newline(message + "\n")
42
66
  end
43
67
 
68
+ # Say the specified message without a newline
44
69
  def say_without_newline(message)
45
70
  if Engine.realtime_activity
46
71
  $stdout.print message
@@ -48,10 +73,12 @@ module ETL #:nodoc:
48
73
  end
49
74
  end
50
75
 
76
+ # Say the message on its own line
51
77
  def say_on_own_line(message)
52
78
  say("\n" + message)
53
79
  end
54
80
 
81
+ # Array of errors encountered during execution of the ETL process
55
82
  def errors
56
83
  @errors ||= []
57
84
  end
@@ -61,9 +88,12 @@ module ETL #:nodoc:
61
88
  # * File object
62
89
  # * ETL::Control::Control instance
63
90
  def process(control)
64
- start_time = Time.now
65
91
  control = ETL::Control::Control.resolve(control)
66
92
 
93
+ execute_dependencies(control)
94
+
95
+ start_time = Time.now
96
+
67
97
  Engine.logger.debug "Pre-processing #{control.file}"
68
98
  pre_process(control)
69
99
  Engine.logger.debug "Pre-processing complete"
@@ -76,40 +106,93 @@ module ETL #:nodoc:
76
106
  Engine.logger.debug "Processing source #{source}"
77
107
  say "Source: #{source}"
78
108
  source.each_with_index do |row, index|
109
+ Engine.logger.debug "Row #{index}: #{row.inspect}"
110
+ Engine.rows_read += 1
79
111
  Engine.current_source_row = index + 1
80
112
  if Engine.realtime_activity && index > 0 && index % 1000 == 0
81
113
  say_without_newline "."
82
114
  end
83
115
 
116
+ # At this point a single row may be turned into multiple rows via row processors;
117
+ # all code after this line should work with the array of rows rather than the
118
+ # single row
119
+ rows = [row]
120
+
121
+ begin
122
+ Engine.logger.debug "Processing after read"
123
+ control.after_read_processors.each do |processor|
124
+ processed_rows = []
125
+ rows.each do |row|
126
+ processed_rows << processor.process(row)
127
+ end
128
+ rows = processed_rows.flatten
129
+ end
130
+ rescue => e
131
+ msg = "Error processing rows after read from #{Engine.current_source} on line #{Engine.current_source_row}: #{e}"
132
+ errors << msg
133
+ Engine.logger.error(msg)
134
+ exceeded_error_threshold?(control) ? break : next
135
+ end
136
+
84
137
  begin
85
138
  # execute transforms
86
- row.each do |name, value|
87
- row[name] = ETL::Transform::Transform.transform(name, value, control.transform(name))
139
+ Engine.logger.debug "Executing transforms"
140
+ rows.each do |row|
141
+ row.each do |name, value|
142
+ name = name.to_sym
143
+ transformers = control.transform(name)
144
+ #Engine.logger.debug "Transforms for #{name}: #{transformers.inspect}"
145
+ row[name] = ETL::Transform::Transform.transform(name, value, row, transformers)
146
+ end
88
147
  end
89
148
  rescue => e
90
- msg = "Error transforming from #{source} on line #{index}: #{e}"
149
+ msg = "Error transforming from #{Engine.current_source} on line #{Engine.current_source_row}: #{e}"
91
150
  errors << msg
92
- Engine.logger.error msg
93
- break if exceeded_error_threshold?(control)
151
+ Engine.logger.error(msg)
152
+ e.backtrace.each { |line| Engine.logger.error(line) }
153
+ exceeded_error_threshold?(control) ? break : next
154
+ end
155
+
156
+
157
+ begin
158
+ # execute row-level "before write" processing
159
+ Engine.logger.debug "Processing before write"
160
+ control.before_write_processors.each do |processor|
161
+ processed_rows = []
162
+ rows.each do |row|
163
+ processed_rows << processor.process(row)
164
+ end
165
+ rows = processed_rows.flatten
166
+ end
167
+ rescue => e
168
+ msg = "Error processing rows before write from #{Engine.current_source} on line #{Engine.current_source_row}: #{e}"
169
+ errors << msg
170
+ Engine.logger.error(msg)
171
+ e.backtrace.each { |line| Engine.logger.error(line) }
172
+ exceeded_error_threshold?(control) ? break : next
94
173
  end
95
174
 
96
175
  begin
97
176
  # write the row to the destination
98
- destinations.each do |destination|
177
+ destinations.each_with_index do |destination, index|
99
178
  Engine.current_destination = destination
100
- destination.write(row)
179
+ rows.each do |row|
180
+ destination.write(row)
181
+ Engine.rows_written += 1 if index == 0
182
+ end
101
183
  end
102
- rescue
103
- msg = "Error writing to #{destination} on line #{index}"
184
+ rescue => e
185
+ msg = "Error writing to #{Engine.current_destination}: #{e}"
104
186
  errors << msg
105
187
  Engine.logger.error msg
188
+ e.backtrace.each { |line| Engine.logger.error(line) }
106
189
  break if exceeded_error_threshold?(control)
107
190
  end
108
191
  end
192
+
109
193
  if exceeded_error_threshold?(control)
110
194
  say_on_own_line "Exiting due to exceeding error threshold: #{control.error_threshold}"
111
- else
112
- say_on_own_line "Processed #{Engine.current_source_row} rows in #{distance_of_time_in_words(start_time)} with #{errors} errors."
195
+ return
113
196
  end
114
197
 
115
198
  destinations.each do |destination|
@@ -117,12 +200,23 @@ module ETL #:nodoc:
117
200
  end
118
201
  end
119
202
 
203
+ say_on_own_line "Executing post processes"
120
204
  Engine.logger.debug "Post-processing #{control.file}"
121
205
  post_process(control)
122
206
  Engine.logger.debug "Post-processing complete"
207
+ say "Post-processing complete"
208
+
209
+ if sources.length > 0
210
+ say_on_own_line "Read #{Engine.rows_read} lines from sources"
211
+ end
212
+ if destinations.length > 0
213
+ say "Wrote #{Engine.rows_written} lines to destinations"
214
+ end
215
+ say "Completed #{control.file} in #{distance_of_time_in_words(start_time)} with #{errors.length} errors."
123
216
  end
124
217
 
125
218
  private
219
+ # Return true if the error threshold is exceeded
126
220
  def exceeded_error_threshold?(control)
127
221
  errors.length > control.error_threshold
128
222
  end
@@ -141,6 +235,26 @@ module ETL #:nodoc:
141
235
  end
142
236
  end
143
237
 
238
+ # Execute all dependencies
239
+ def execute_dependencies(control)
240
+ Engine.logger.debug "Executing dependencies"
241
+ control.dependencies.flatten.each do |dependency|
242
+ case dependency
243
+ when Symbol
244
+ f = dependency.to_s + '.ctl'
245
+ Engine.logger.debug "Executing dependency: #{f}"
246
+ say "Executing dependency: #{f}"
247
+ process(f)
248
+ when String
249
+ Engine.logger.debug "Executing dependency: #{f}"
250
+ say "Executing dependency: #{f}"
251
+ process(dependency)
252
+ else
253
+ raise "Invalid dependency type: #{dependency.class}"
254
+ end
255
+ end
256
+ end
257
+
144
258
  # Return the distance of time in words from the given from_time to the specified to_time. If to_time
145
259
  # is not specified then Time.now is used. By default seconds are included...set the include_seconds
146
260
  # argument to false to disable the seconds.