activewarehouse-etl 0.9.0 → 0.9.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -47,10 +47,7 @@ module ETL #:nodoc:
47
47
 
48
48
  # Flush the currently buffered data
49
49
  def flush
50
- conn = ETL::Engine.connection(target)
51
50
  conn.transaction do
52
- conn.truncate(table_name) if truncate
53
-
54
51
  buffer.flatten.each do |row|
55
52
  # check to see if this row's compound key constraint already exists
56
53
  # note that the compound key constraint may not utilize virtual fields
@@ -62,10 +59,10 @@ module ETL #:nodoc:
62
59
  names = []
63
60
  values = []
64
61
  order.each do |name|
65
- names << name
62
+ names << "`#{name}`"
66
63
  values << conn.quote(row[name]) # TODO: this is probably not database agnostic
67
64
  end
68
- q = "INSERT INTO #{table_name} (#{names.join(',')}) VALUES (#{values.join(',')})"
65
+ q = "INSERT INTO `#{table_name}` (#{names.join(',')}) VALUES (#{values.join(',')})"
69
66
  ETL::Engine.logger.debug("Executing insert: #{q}")
70
67
  conn.insert(q, "Insert row #{current_row}")
71
68
  @current_row += 1
@@ -81,6 +78,14 @@ module ETL #:nodoc:
81
78
  end
82
79
 
83
80
  private
81
+ def conn
82
+ @conn ||= begin
83
+ conn = ETL::Engine.connection(target)
84
+ conn.truncate(table_name) if truncate
85
+ conn
86
+ end
87
+ end
88
+
84
89
  def table_name
85
90
  ETL::Engine.table(table, ETL::Engine.connection(target))
86
91
  end
@@ -23,7 +23,7 @@ module ETL #:nodoc:
23
23
  # For example if name is :database then this will return a
24
24
  # DatabaseSource class
25
25
  def class_for_name(name)
26
- ETL::Control.const_get("#{name.to_s.classify}Source")
26
+ ETL::Control.const_get("#{name.to_s.camelize}Source")
27
27
  end
28
28
  end
29
29
 
@@ -93,14 +93,8 @@ module ETL #:nodoc:
93
93
  # Get the list of columns to read. This is defined in the source
94
94
  # definition as either an Array or Hash
95
95
  def columns
96
- case definition
97
- when Array
98
- definition.collect(&:to_sym)
99
- when Hash
100
- definition.keys.collect(&:to_sym)
101
- else
102
- raise "Definition must be either an Array or a Hash"
103
- end
96
+ # weird default is required for writing to cache correctly
97
+ @columns ||= query_rows.any? ? query_rows.first.keys : ['']
104
98
  end
105
99
 
106
100
  # Returns each row from the source. If read_locally is specified then
@@ -118,7 +112,7 @@ module ETL #:nodoc:
118
112
  write_local(file)
119
113
  read_rows(file, &block)
120
114
  else
121
- connection.select_all(query).each do |row|
115
+ query_rows.each do |row|
122
116
  row = ETL::Row.new(row.symbolize_keys)
123
117
  row.source = self
124
118
  yield row
@@ -158,7 +152,7 @@ module ETL #:nodoc:
158
152
  t = Benchmark.realtime do
159
153
  FasterCSV.open(file, 'w') do |f|
160
154
  f << columns
161
- connection.select_all(query).each do |row|
155
+ query_rows.each do |row|
162
156
  f << columns.collect { |column| row[column.to_s] }
163
157
  lines += 1
164
158
  end
@@ -204,6 +198,10 @@ module ETL #:nodoc:
204
198
  @query = q
205
199
  end
206
200
 
201
+ def query_rows
202
+ @query_rows ||= connection.select_all(query)
203
+ end
204
+
207
205
  # Get the database connection to use
208
206
  def connection
209
207
  ETL::Engine.connection(target)
@@ -1,3 +1,5 @@
1
+ #Updated by Jack Hong on 04/05/08
2
+
1
3
  module ETL #:nodoc:
2
4
  module CoreExtensions #:nodoc:
3
5
  module Time #:nodoc:
@@ -18,7 +20,7 @@ module ETL #:nodoc:
18
20
  end
19
21
  def fiscal_year_month(offset_month=10)
20
22
  shifted_month = month - (offset_month - 1)
21
- shifted_month += 12 if shifted_month < 0
23
+ shifted_month += 12 if shifted_month <= 0
22
24
  shifted_month
23
25
  end
24
26
  def fiscal_year_quarter(offset_month=10)
@@ -37,4 +39,4 @@ module ETL #:nodoc:
37
39
  end
38
40
  end
39
41
  end
40
- end
42
+ end
@@ -19,7 +19,7 @@ module ETL #:nodoc:
19
19
  # * <tt>:rails_root</tt>: Set to the rails root to boot rails
20
20
  def init(options={})
21
21
  unless @initialized
22
- puts "initializing ETL engine"
22
+ puts "initializing ETL engine\n\n"
23
23
  @limit = options[:limit]
24
24
  @offset = options[:offset]
25
25
  @log_write_mode = 'w' if options[:newlog]
@@ -28,8 +28,8 @@ module ETL #:nodoc:
28
28
  @rails_root = options[:rails_root]
29
29
 
30
30
  require File.join(@rails_root, 'config/environment') if @rails_root
31
-
32
31
  options[:config] ||= 'database.yml'
32
+ options[:config] = 'config/database.yml' unless File.exist?(options[:config])
33
33
  database_configuration = YAML::load(ERB.new(IO.read(options[:config])).result + "\n")
34
34
  ActiveRecord::Base.configurations.merge!(database_configuration)
35
35
  ETL::Base.configurations = database_configuration
@@ -177,9 +177,7 @@ module ETL #:nodoc:
177
177
  if temp_tables[temp_table_name].nil?
178
178
  # Create the temp table and add it to the mapping
179
179
  begin connection.drop_table(temp_table_name); rescue; end
180
- connection.execute(
181
- connection.add_select_into_table(temp_table_name, "SELECT * FROM #{table_name}")
182
- )
180
+ connection.copy_table(table_name, temp_table_name)
183
181
  temp_tables[temp_table_name] = {
184
182
  :table => table_name,
185
183
  :connection => connection
@@ -274,6 +272,7 @@ module ETL #:nodoc:
274
272
  # Process the specified batch file
275
273
  def process_batch(batch)
276
274
  batch = ETL::Batch::Batch.resolve(batch, self)
275
+ say "Processing batch #{batch.file}"
277
276
 
278
277
  ETL::Engine.batch = ETL::Execution::Batch.create!(
279
278
  :batch_file => batch.file,
@@ -290,10 +289,12 @@ module ETL #:nodoc:
290
289
  # Process the specified control file
291
290
  def process_control(control)
292
291
  control = ETL::Control::Control.resolve(control)
292
+ say_on_own_line "Processing control #{control.file}"
293
293
 
294
294
  ETL::Engine.job = ETL::Execution::Job.create!(
295
295
  :control_file => control.file,
296
- :status => 'executing'
296
+ :status => 'executing',
297
+ :batch_id => ETL::Engine.batch ? ETL::Engine.batch.id : nil
297
298
  )
298
299
 
299
300
  execute_dependencies(control)
@@ -357,11 +358,15 @@ module ETL #:nodoc:
357
358
  row[name] = transform.transform(name, row[name], row)
358
359
  end
359
360
  end
361
+ rescue ResolverError => e
362
+ Engine.logger.error(e.message)
363
+ errors << e.message
360
364
  rescue => e
361
365
  msg = "Error transforming from #{Engine.current_source} on line #{Engine.current_source_row}: #{e}"
362
366
  errors << msg
363
367
  Engine.logger.error(msg)
364
368
  e.backtrace.each { |line| Engine.logger.error(line) }
369
+ ensure
365
370
  begin
366
371
  exceeded_error_threshold?(control) ? break : next
367
372
  rescue => inner_error
@@ -422,7 +427,7 @@ module ETL #:nodoc:
422
427
  destination.close
423
428
  end
424
429
 
425
- say_on_own_line "Executing screens"
430
+ say_on_own_line "Executing before post-process screens"
426
431
  begin
427
432
  execute_screens(control)
428
433
  rescue FatalScreenError => e
@@ -443,7 +448,21 @@ module ETL #:nodoc:
443
448
  if destinations.length > 0
444
449
  say "Wrote #{Engine.rows_written} lines to destinations"
445
450
  end
446
- say "Completed #{control.file} in #{distance_of_time_in_words(start_time)} with #{errors.length} errors."
451
+
452
+ say_on_own_line "Executing after post-process screens"
453
+ begin
454
+ execute_screens(control, :after_post_process)
455
+ rescue FatalScreenError => e
456
+ say "Fatal screen error during job execution: #{e.message}"
457
+ exit
458
+ rescue ScreenError => e
459
+ say "Screen error during job execution: #{e.message}"
460
+ return
461
+ else
462
+ say "Screens passed"
463
+ end
464
+
465
+ say_on_own_line "Completed #{control.file} in #{distance_of_time_in_words(start_time)} with #{errors.length} errors."
447
466
  say "Processing average: #{Engine.average_rows_per_second} rows/sec)"
448
467
 
449
468
  say "Avg after_reads: #{Engine.rows_read/benchmarks[:after_reads]} rows/sec" if benchmarks[:after_reads] > 0
@@ -509,9 +528,15 @@ module ETL #:nodoc:
509
528
  end
510
529
 
511
530
  # Execute all screens
512
- def execute_screens(control)
531
+ def execute_screens(control, timing = :before_post_process)
532
+ screens = case timing
533
+ when :after_post_process
534
+ control.after_post_process_screens
535
+ else # default to before post-process screens
536
+ control.screens
537
+ end
513
538
  [:fatal,:error,:warn].each do |type|
514
- control.screens[type].each do |block|
539
+ screens[type].each do |block|
515
540
  begin
516
541
  block.call
517
542
  rescue => e
@@ -4,20 +4,28 @@ module ETL #:nodoc:
4
4
  # for the ETL engine
5
5
  class Migration
6
6
  class << self
7
+ protected
8
+ # Get the schema info table name
9
+ def schema_info_table_name
10
+ ActiveRecord::Migrator.schema_migrations_table_name
11
+ end
12
+ alias :schema_migrations_table_name :schema_info_table_name
13
+
14
+ public
7
15
  # Execute the migrations
8
16
  def migrate
9
- connection.initialize_schema_information
10
- v = connection.select_value("SELECT version FROM #{schema_info_table_name}").to_i
11
- v.upto(target - 1) do |i|
17
+ connection.initialize_schema_migrations_table
18
+ last_migration.upto(target - 1) do |i|
12
19
  __send__("migration_#{i+1}".to_sym)
13
- update_schema_info(i+1)
20
+ connection.assume_migrated_upto_version(i+1)
14
21
  end
15
22
  end
23
+
16
24
  protected
17
- # Get the schema info table name
18
- def schema_info_table_name
19
- ETL::Execution::Base.table_name_prefix + "schema_info" +
20
- ETL::Execution::Base.table_name_suffix
25
+ def last_migration
26
+ connection.select_values(
27
+ "SELECT version FROM #{schema_migrations_table_name}"
28
+ ).map(&:to_i).sort.last || 0
21
29
  end
22
30
 
23
31
  # Get the connection to use during migration
@@ -27,7 +35,7 @@ module ETL #:nodoc:
27
35
 
28
36
  # Get the final target version number
29
37
  def target
30
- 3
38
+ 4
31
39
  end
32
40
 
33
41
  private
@@ -62,6 +70,10 @@ module ETL #:nodoc:
62
70
  connection.add_column :jobs, :batch_id, :integer
63
71
  connection.add_index :jobs, :batch_id
64
72
  end
73
+
74
+ def migration_4
75
+ connection.drop_table :records
76
+ end
65
77
 
66
78
  # Update the schema info table, setting the version value
67
79
  def update_schema_info(version)
@@ -7,7 +7,7 @@ module ETL #:nodoc:
7
7
  #
8
8
  # For example, if name is :surrogate_key then a SurrogateKeyGenerator class is returned
9
9
  def class_for_name(name)
10
- ETL::Generator.const_get("#{name.to_s.classify}Generator")
10
+ ETL::Generator.const_get("#{name.to_s.camelize}Generator")
11
11
  end
12
12
  end
13
13
 
@@ -106,20 +106,34 @@ module HttpTools
106
106
  result
107
107
  end
108
108
 
109
- def parse_uri(uri_string)
110
- if uri_string
109
+ # Parse a URI. If options[:prefix] is set then prepend it to the keys for the hash that
110
+ # is returned.
111
+ def parse_uri(uri_string, options={})
112
+ prefix = options[:prefix] ||= ''
113
+ empty_hash = {
114
+ "#{prefix}scheme".to_sym => nil,
115
+ "#{prefix}host".to_sym => nil,
116
+ "#{prefix}port".to_sym => nil,
117
+ "#{prefix}uri_path".to_sym => nil,
118
+ "#{prefix}domain".to_sym => nil
119
+ }
120
+ if uri_string
111
121
  # attempt to parse uri -- if it's not a valid uri then catch the problem and set everything to nil
112
122
  begin
113
123
  uri = URI.parse(uri_string)
114
- results = {:scheme => uri.scheme, :host => uri.host, :port => uri.port, :uri_path => uri.path}
115
- results[:domain] = $1 if uri.host =~ /\.?([^\.]+\.[^\.]+$)/
124
+ results = {
125
+ "#{prefix}scheme".to_sym => uri.scheme,
126
+ "#{prefix}host".to_sym => uri.host,
127
+ "#{prefix}port".to_sym => uri.port,
128
+ "#{prefix}uri_path".to_sym => uri.path
129
+ }
130
+ results["#{prefix}domain".to_sym] = $1 if uri.host =~ /\.?([^\.]+\.[^\.]+$)/
116
131
  results
117
132
  rescue
118
- {:scheme => nil, :host => nil, :port => nil, :uri_path => nil, :domain => nil}
133
+ empty_hash
119
134
  end
120
-
121
135
  else
122
- {:scheme => nil, :host => nil, :port => nil, :uri_path => nil, :domain => nil}
136
+ empty_hash
123
137
  end
124
138
  end
125
139
  end
@@ -33,9 +33,11 @@ module ETL #:nodoc:
33
33
  #fields[:timestamp] =~ r%{(\d\d)/(\w\w\w)/(\d\d\d\d):(\d\d):(\d\d):(\d\d) -(\d\d\d\d)}
34
34
  d = Date._strptime(fields[:timestamp], '%d/%b/%Y:%H:%M:%S') unless fields[:timestamp].nil?
35
35
  fields[:timestamp] = Time.mktime(d[:year], d[:mon], d[:mday], d[:hour], d[:min], d[:sec], d[:sec_fraction]) unless d.nil?
36
+
37
+ fields[:method], fields[:path] = fields[:request].split(/\s/)
36
38
 
37
39
  fields.merge!(parse_user_agent(fields[:user_agent])) unless fields[:user_agent].nil?
38
- fields.merge!(parse_uri(fields[:referrer]))
40
+ fields.merge!(parse_uri(fields[:referrer], :prefix => 'referrer_'))
39
41
 
40
42
  fields.each do |key, value|
41
43
  fields[key] = nil if value == '-'
@@ -44,7 +44,7 @@ module ETL #:nodoc:
44
44
  ETL::Engine.logger.debug "validating line #{line} in file #{file}"
45
45
  if row.length != fields.length
46
46
  raise_with_info( MismatchError,
47
- "The number of rows from the source (#{row.length}) does not match the number of rows in the definition (#{fields.length})",
47
+ "The number of columns from the source (#{row.length}) does not match the number of columns in the definition (#{fields.length})",
48
48
  line, file
49
49
  )
50
50
  end
@@ -11,7 +11,7 @@ module ETL #:nodoc:
11
11
  # Example:
12
12
  # <tt>class_for_name(:fixed_width)</tt> returns a FixedWidthParser class
13
13
  def class_for_name(name)
14
- ETL::Parser.const_get("#{name.to_s.classify}Parser")
14
+ ETL::Parser.const_get("#{name.to_s.camelize}Parser")
15
15
  end
16
16
  end
17
17
 
@@ -0,0 +1,14 @@
1
+ module ETL
2
+ module Processor
3
+ # This processor is valid both as a RowProcessor (called on each row with after_read) and as a Processor (called once on pre_process or post_process)
4
+ class BlockProcessor < ETL::Processor::RowProcessor
5
+ def initialize(control, configuration)
6
+ super
7
+ @block = configuration[:block]
8
+ end
9
+ def process(row=nil)
10
+ @block.call(row)
11
+ end
12
+ end
13
+ end
14
+ end
@@ -21,6 +21,8 @@ module ETL #:nodoc:
21
21
  attr_accessor :field_enclosure
22
22
  # The line separator (defaults to a newline)
23
23
  attr_accessor :line_separator
24
+ # The string that indicates a NULL (defaults to an empty string)
25
+ attr_accessor :null_string
24
26
 
25
27
  # Initialize the processor.
26
28
  #
@@ -43,6 +45,7 @@ module ETL #:nodoc:
43
45
  @columns = configuration[:columns]
44
46
  @field_separator = (configuration[:field_separator] || ',')
45
47
  @line_separator = (configuration[:line_separator] || "\n")
48
+ @null_string = (configuration[:null_string] || "")
46
49
  @field_enclosure = configuration[:field_enclosure]
47
50
 
48
51
  raise ControlError, "Target must be specified" unless @target
@@ -59,8 +62,9 @@ module ETL #:nodoc:
59
62
  conn.truncate(table_name) if truncate
60
63
  options = {}
61
64
  options[:columns] = columns
62
- if field_separator || field_enclosure
65
+ if field_separator || field_enclosure || line_separator || null_string
63
66
  options[:fields] = {}
67
+ options[:fields][:null_string] = null_string if null_string
64
68
  options[:fields][:delimited_by] = field_separator if field_separator
65
69
  options[:fields][:enclosed_by] = field_enclosure if field_enclosure
66
70
  options[:fields][:terminated_by] = line_separator if line_separator
@@ -64,6 +64,7 @@ module ETL #:nodoc:
64
64
  end
65
65
  end
66
66
  q << conditions.join(" AND ")
67
+ q << " LIMIT 1"
67
68
 
68
69
  #puts "query: #{q}"
69
70
  result = conn.select_one(q)
@@ -0,0 +1,55 @@
1
+ require 'iconv'
2
+
3
+ module ETL #:nodoc:
4
+ module Processor #:nodoc:
5
+ # The encode processor uses Iconv to convert a file from one encoding (eg: utf-8) to another (eg: latin1), line by line.
6
+ class EncodeProcessor < ETL::Processor::Processor
7
+
8
+ # The file to load from
9
+ attr_reader :source_file
10
+ # The file to write to
11
+ attr_reader :target_file
12
+ # The source file encoding
13
+ attr_reader :source_encoding
14
+ # The target file encoding
15
+ attr_reader :target_encoding
16
+
17
+ # Initialize the processor.
18
+ #
19
+ # Configuration options:
20
+ # * <tt>:source_file</tt>: The file to load data from
21
+ # * <tt>:source_encoding</tt>: The source file encoding (eg: 'latin1','utf-8'), as supported by Iconv
22
+ # * <tt>:target_file</tt>: The file to write data to
23
+ # * <tt>:target_encoding</tt>: The target file encoding
24
+ def initialize(control, configuration)
25
+ super
26
+ raise ControlError, "Source file must be specified" if configuration[:source_file].nil?
27
+ raise ControlError, "Target file must be specified" if configuration[:target_file].nil?
28
+ @source_file = File.join(File.dirname(control.file), configuration[:source_file])
29
+ @source_encoding = configuration[:source_encoding]
30
+ @target_file = File.join(File.dirname(control.file), configuration[:target_file])
31
+ @target_encoding = configuration[:target_encoding]
32
+ raise ControlError, "Source and target file cannot currently point to the same file" if source_file == target_file
33
+ begin
34
+ @iconv = Iconv.new(target_encoding,source_encoding)
35
+ rescue Iconv::InvalidEncoding
36
+ raise ControlError, "Either the source encoding '#{source_encoding}' or the target encoding '#{target_encoding}' is not supported"
37
+ end
38
+ end
39
+
40
+ # Execute the processor
41
+ def process
42
+ # operate line by line to handle large files without loading them in-memory
43
+ # could be replaced by a system iconv call when available, for greater performance
44
+ File.open(source_file) do |source|
45
+ #puts "Opening #{target_file}"
46
+ File.open(target_file,'w') do |target|
47
+ source.each_line do |line|
48
+ target << @iconv.iconv(line)
49
+ end
50
+ end
51
+ end
52
+ end
53
+ end
54
+ end
55
+ end