activewarehouse-etl 0.9.0 → 0.9.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -47,10 +47,7 @@ module ETL #:nodoc:
47
47
 
48
48
  # Flush the currently buffered data
49
49
  def flush
50
- conn = ETL::Engine.connection(target)
51
50
  conn.transaction do
52
- conn.truncate(table_name) if truncate
53
-
54
51
  buffer.flatten.each do |row|
55
52
  # check to see if this row's compound key constraint already exists
56
53
  # note that the compound key constraint may not utilize virtual fields
@@ -62,10 +59,10 @@ module ETL #:nodoc:
62
59
  names = []
63
60
  values = []
64
61
  order.each do |name|
65
- names << name
62
+ names << "`#{name}`"
66
63
  values << conn.quote(row[name]) # TODO: this is probably not database agnostic
67
64
  end
68
- q = "INSERT INTO #{table_name} (#{names.join(',')}) VALUES (#{values.join(',')})"
65
+ q = "INSERT INTO `#{table_name}` (#{names.join(',')}) VALUES (#{values.join(',')})"
69
66
  ETL::Engine.logger.debug("Executing insert: #{q}")
70
67
  conn.insert(q, "Insert row #{current_row}")
71
68
  @current_row += 1
@@ -81,6 +78,14 @@ module ETL #:nodoc:
81
78
  end
82
79
 
83
80
  private
81
+ def conn
82
+ @conn ||= begin
83
+ conn = ETL::Engine.connection(target)
84
+ conn.truncate(table_name) if truncate
85
+ conn
86
+ end
87
+ end
88
+
84
89
  def table_name
85
90
  ETL::Engine.table(table, ETL::Engine.connection(target))
86
91
  end
@@ -23,7 +23,7 @@ module ETL #:nodoc:
23
23
  # For example if name is :database then this will return a
24
24
  # DatabaseSource class
25
25
  def class_for_name(name)
26
- ETL::Control.const_get("#{name.to_s.classify}Source")
26
+ ETL::Control.const_get("#{name.to_s.camelize}Source")
27
27
  end
28
28
  end
29
29
 
@@ -93,14 +93,8 @@ module ETL #:nodoc:
93
93
  # Get the list of columns to read. This is defined in the source
94
94
  # definition as either an Array or Hash
95
95
  def columns
96
- case definition
97
- when Array
98
- definition.collect(&:to_sym)
99
- when Hash
100
- definition.keys.collect(&:to_sym)
101
- else
102
- raise "Definition must be either an Array or a Hash"
103
- end
96
+ # weird default is required for writing to cache correctly
97
+ @columns ||= query_rows.any? ? query_rows.first.keys : ['']
104
98
  end
105
99
 
106
100
  # Returns each row from the source. If read_locally is specified then
@@ -118,7 +112,7 @@ module ETL #:nodoc:
118
112
  write_local(file)
119
113
  read_rows(file, &block)
120
114
  else
121
- connection.select_all(query).each do |row|
115
+ query_rows.each do |row|
122
116
  row = ETL::Row.new(row.symbolize_keys)
123
117
  row.source = self
124
118
  yield row
@@ -158,7 +152,7 @@ module ETL #:nodoc:
158
152
  t = Benchmark.realtime do
159
153
  FasterCSV.open(file, 'w') do |f|
160
154
  f << columns
161
- connection.select_all(query).each do |row|
155
+ query_rows.each do |row|
162
156
  f << columns.collect { |column| row[column.to_s] }
163
157
  lines += 1
164
158
  end
@@ -204,6 +198,10 @@ module ETL #:nodoc:
204
198
  @query = q
205
199
  end
206
200
 
201
+ def query_rows
202
+ @query_rows ||= connection.select_all(query)
203
+ end
204
+
207
205
  # Get the database connection to use
208
206
  def connection
209
207
  ETL::Engine.connection(target)
@@ -1,3 +1,5 @@
1
+ #Updated by Jack Hong on 04/05/08
2
+
1
3
  module ETL #:nodoc:
2
4
  module CoreExtensions #:nodoc:
3
5
  module Time #:nodoc:
@@ -18,7 +20,7 @@ module ETL #:nodoc:
18
20
  end
19
21
  def fiscal_year_month(offset_month=10)
20
22
  shifted_month = month - (offset_month - 1)
21
- shifted_month += 12 if shifted_month < 0
23
+ shifted_month += 12 if shifted_month <= 0
22
24
  shifted_month
23
25
  end
24
26
  def fiscal_year_quarter(offset_month=10)
@@ -37,4 +39,4 @@ module ETL #:nodoc:
37
39
  end
38
40
  end
39
41
  end
40
- end
42
+ end
@@ -19,7 +19,7 @@ module ETL #:nodoc:
19
19
  # * <tt>:rails_root</tt>: Set to the rails root to boot rails
20
20
  def init(options={})
21
21
  unless @initialized
22
- puts "initializing ETL engine"
22
+ puts "initializing ETL engine\n\n"
23
23
  @limit = options[:limit]
24
24
  @offset = options[:offset]
25
25
  @log_write_mode = 'w' if options[:newlog]
@@ -28,8 +28,8 @@ module ETL #:nodoc:
28
28
  @rails_root = options[:rails_root]
29
29
 
30
30
  require File.join(@rails_root, 'config/environment') if @rails_root
31
-
32
31
  options[:config] ||= 'database.yml'
32
+ options[:config] = 'config/database.yml' unless File.exist?(options[:config])
33
33
  database_configuration = YAML::load(ERB.new(IO.read(options[:config])).result + "\n")
34
34
  ActiveRecord::Base.configurations.merge!(database_configuration)
35
35
  ETL::Base.configurations = database_configuration
@@ -177,9 +177,7 @@ module ETL #:nodoc:
177
177
  if temp_tables[temp_table_name].nil?
178
178
  # Create the temp table and add it to the mapping
179
179
  begin connection.drop_table(temp_table_name); rescue; end
180
- connection.execute(
181
- connection.add_select_into_table(temp_table_name, "SELECT * FROM #{table_name}")
182
- )
180
+ connection.copy_table(table_name, temp_table_name)
183
181
  temp_tables[temp_table_name] = {
184
182
  :table => table_name,
185
183
  :connection => connection
@@ -274,6 +272,7 @@ module ETL #:nodoc:
274
272
  # Process the specified batch file
275
273
  def process_batch(batch)
276
274
  batch = ETL::Batch::Batch.resolve(batch, self)
275
+ say "Processing batch #{batch.file}"
277
276
 
278
277
  ETL::Engine.batch = ETL::Execution::Batch.create!(
279
278
  :batch_file => batch.file,
@@ -290,10 +289,12 @@ module ETL #:nodoc:
290
289
  # Process the specified control file
291
290
  def process_control(control)
292
291
  control = ETL::Control::Control.resolve(control)
292
+ say_on_own_line "Processing control #{control.file}"
293
293
 
294
294
  ETL::Engine.job = ETL::Execution::Job.create!(
295
295
  :control_file => control.file,
296
- :status => 'executing'
296
+ :status => 'executing',
297
+ :batch_id => ETL::Engine.batch ? ETL::Engine.batch.id : nil
297
298
  )
298
299
 
299
300
  execute_dependencies(control)
@@ -357,11 +358,15 @@ module ETL #:nodoc:
357
358
  row[name] = transform.transform(name, row[name], row)
358
359
  end
359
360
  end
361
+ rescue ResolverError => e
362
+ Engine.logger.error(e.message)
363
+ errors << e.message
360
364
  rescue => e
361
365
  msg = "Error transforming from #{Engine.current_source} on line #{Engine.current_source_row}: #{e}"
362
366
  errors << msg
363
367
  Engine.logger.error(msg)
364
368
  e.backtrace.each { |line| Engine.logger.error(line) }
369
+ ensure
365
370
  begin
366
371
  exceeded_error_threshold?(control) ? break : next
367
372
  rescue => inner_error
@@ -422,7 +427,7 @@ module ETL #:nodoc:
422
427
  destination.close
423
428
  end
424
429
 
425
- say_on_own_line "Executing screens"
430
+ say_on_own_line "Executing before post-process screens"
426
431
  begin
427
432
  execute_screens(control)
428
433
  rescue FatalScreenError => e
@@ -443,7 +448,21 @@ module ETL #:nodoc:
443
448
  if destinations.length > 0
444
449
  say "Wrote #{Engine.rows_written} lines to destinations"
445
450
  end
446
- say "Completed #{control.file} in #{distance_of_time_in_words(start_time)} with #{errors.length} errors."
451
+
452
+ say_on_own_line "Executing after post-process screens"
453
+ begin
454
+ execute_screens(control, :after_post_process)
455
+ rescue FatalScreenError => e
456
+ say "Fatal screen error during job execution: #{e.message}"
457
+ exit
458
+ rescue ScreenError => e
459
+ say "Screen error during job execution: #{e.message}"
460
+ return
461
+ else
462
+ say "Screens passed"
463
+ end
464
+
465
+ say_on_own_line "Completed #{control.file} in #{distance_of_time_in_words(start_time)} with #{errors.length} errors."
447
466
  say "Processing average: #{Engine.average_rows_per_second} rows/sec)"
448
467
 
449
468
  say "Avg after_reads: #{Engine.rows_read/benchmarks[:after_reads]} rows/sec" if benchmarks[:after_reads] > 0
@@ -509,9 +528,15 @@ module ETL #:nodoc:
509
528
  end
510
529
 
511
530
  # Execute all screens
512
- def execute_screens(control)
531
+ def execute_screens(control, timing = :before_post_process)
532
+ screens = case timing
533
+ when :after_post_process
534
+ control.after_post_process_screens
535
+ else # default to before post-process screens
536
+ control.screens
537
+ end
513
538
  [:fatal,:error,:warn].each do |type|
514
- control.screens[type].each do |block|
539
+ screens[type].each do |block|
515
540
  begin
516
541
  block.call
517
542
  rescue => e
@@ -4,20 +4,28 @@ module ETL #:nodoc:
4
4
  # for the ETL engine
5
5
  class Migration
6
6
  class << self
7
+ protected
8
+ # Get the schema info table name
9
+ def schema_info_table_name
10
+ ActiveRecord::Migrator.schema_migrations_table_name
11
+ end
12
+ alias :schema_migrations_table_name :schema_info_table_name
13
+
14
+ public
7
15
  # Execute the migrations
8
16
  def migrate
9
- connection.initialize_schema_information
10
- v = connection.select_value("SELECT version FROM #{schema_info_table_name}").to_i
11
- v.upto(target - 1) do |i|
17
+ connection.initialize_schema_migrations_table
18
+ last_migration.upto(target - 1) do |i|
12
19
  __send__("migration_#{i+1}".to_sym)
13
- update_schema_info(i+1)
20
+ connection.assume_migrated_upto_version(i+1)
14
21
  end
15
22
  end
23
+
16
24
  protected
17
- # Get the schema info table name
18
- def schema_info_table_name
19
- ETL::Execution::Base.table_name_prefix + "schema_info" +
20
- ETL::Execution::Base.table_name_suffix
25
+ def last_migration
26
+ connection.select_values(
27
+ "SELECT version FROM #{schema_migrations_table_name}"
28
+ ).map(&:to_i).sort.last || 0
21
29
  end
22
30
 
23
31
  # Get the connection to use during migration
@@ -27,7 +35,7 @@ module ETL #:nodoc:
27
35
 
28
36
  # Get the final target version number
29
37
  def target
30
- 3
38
+ 4
31
39
  end
32
40
 
33
41
  private
@@ -62,6 +70,10 @@ module ETL #:nodoc:
62
70
  connection.add_column :jobs, :batch_id, :integer
63
71
  connection.add_index :jobs, :batch_id
64
72
  end
73
+
74
+ def migration_4
75
+ connection.drop_table :records
76
+ end
65
77
 
66
78
  # Update the schema info table, setting the version value
67
79
  def update_schema_info(version)
@@ -7,7 +7,7 @@ module ETL #:nodoc:
7
7
  #
8
8
  # For example, if name is :surrogate_key then a SurrogateKeyGenerator class is returned
9
9
  def class_for_name(name)
10
- ETL::Generator.const_get("#{name.to_s.classify}Generator")
10
+ ETL::Generator.const_get("#{name.to_s.camelize}Generator")
11
11
  end
12
12
  end
13
13
 
@@ -106,20 +106,34 @@ module HttpTools
106
106
  result
107
107
  end
108
108
 
109
- def parse_uri(uri_string)
110
- if uri_string
109
+ # Parse a URI. If options[:prefix] is set then prepend it to the keys for the hash that
110
+ # is returned.
111
+ def parse_uri(uri_string, options={})
112
+ prefix = options[:prefix] ||= ''
113
+ empty_hash = {
114
+ "#{prefix}scheme".to_sym => nil,
115
+ "#{prefix}host".to_sym => nil,
116
+ "#{prefix}port".to_sym => nil,
117
+ "#{prefix}uri_path".to_sym => nil,
118
+ "#{prefix}domain".to_sym => nil
119
+ }
120
+ if uri_string
111
121
  #attempt to parse uri --if it's a uri then catch the problem and set everything to nil
112
122
  begin
113
123
  uri = URI.parse(uri_string)
114
- results = {:scheme => uri.scheme, :host => uri.host, :port => uri.port, :uri_path => uri.path}
115
- results[:domain] = $1 if uri.host =~ /\.?([^\.]+\.[^\.]+$)/
124
+ results = {
125
+ "#{prefix}scheme".to_sym => uri.scheme,
126
+ "#{prefix}host".to_sym => uri.host,
127
+ "#{prefix}port".to_sym => uri.port,
128
+ "#{prefix}uri_path".to_sym => uri.path
129
+ }
130
+ results["#{prefix}domain".to_sym] = $1 if uri.host =~ /\.?([^\.]+\.[^\.]+$)/
116
131
  results
117
132
  rescue
118
- {:scheme => nil, :host => nil, :port => nil, :uri_path => nil, :domain => nil}
133
+ empty_hash
119
134
  end
120
-
121
135
  else
122
- {:scheme => nil, :host => nil, :port => nil, :uri_path => nil, :domain => nil}
136
+ empty_hash
123
137
  end
124
138
  end
125
139
  end
@@ -33,9 +33,11 @@ module ETL #:nodoc:
33
33
  #fields[:timestamp] =~ r%{(\d\d)/(\w\w\w)/(\d\d\d\d):(\d\d):(\d\d):(\d\d) -(\d\d\d\d)}
34
34
  d = Date._strptime(fields[:timestamp], '%d/%b/%Y:%H:%M:%S') unless fields[:timestamp].nil?
35
35
  fields[:timestamp] = Time.mktime(d[:year], d[:mon], d[:mday], d[:hour], d[:min], d[:sec], d[:sec_fraction]) unless d.nil?
36
+
37
+ fields[:method], fields[:path] = fields[:request].split(/\s/)
36
38
 
37
39
  fields.merge!(parse_user_agent(fields[:user_agent])) unless fields[:user_agent].nil?
38
- fields.merge!(parse_uri(fields[:referrer]))
40
+ fields.merge!(parse_uri(fields[:referrer], :prefix => 'referrer_'))
39
41
 
40
42
  fields.each do |key, value|
41
43
  fields[key] = nil if value == '-'
@@ -44,7 +44,7 @@ module ETL #:nodoc:
44
44
  ETL::Engine.logger.debug "validating line #{line} in file #{file}"
45
45
  if row.length != fields.length
46
46
  raise_with_info( MismatchError,
47
- "The number of rows from the source (#{row.length}) does not match the number of rows in the definition (#{fields.length})",
47
+ "The number of columns from the source (#{row.length}) does not match the number of columns in the definition (#{fields.length})",
48
48
  line, file
49
49
  )
50
50
  end
@@ -11,7 +11,7 @@ module ETL #:nodoc:
11
11
  # Example:
12
12
  # <tt>class_for_name(:fixed_width)</tt> returns a FixedWidthParser class
13
13
  def class_for_name(name)
14
- ETL::Parser.const_get("#{name.to_s.classify}Parser")
14
+ ETL::Parser.const_get("#{name.to_s.camelize}Parser")
15
15
  end
16
16
  end
17
17
 
@@ -0,0 +1,14 @@
1
+ module ETL
2
+ module Processor
3
+ # This processor is both a valid RowProcessor (called on each row with after_read) or a Processor (called once on pre_process or post_process)
4
+ class BlockProcessor < ETL::Processor::RowProcessor
5
+ def initialize(control, configuration)
6
+ super
7
+ @block = configuration[:block]
8
+ end
9
+ def process(row=nil)
10
+ @block.call(row)
11
+ end
12
+ end
13
+ end
14
+ end
@@ -21,6 +21,8 @@ module ETL #:nodoc:
21
21
  attr_accessor :field_enclosure
22
22
  # The line separator (defaults to a newline)
23
23
  attr_accessor :line_separator
24
+ # The string that indicates a NULL (defaults to an empty string)
25
+ attr_accessor :null_string
24
26
 
25
27
  # Initialize the processor.
26
28
  #
@@ -43,6 +45,7 @@ module ETL #:nodoc:
43
45
  @columns = configuration[:columns]
44
46
  @field_separator = (configuration[:field_separator] || ',')
45
47
  @line_separator = (configuration[:line_separator] || "\n")
48
+ @null_string = (configuration[:null_string] || "")
46
49
  @field_enclosure = configuration[:field_enclosure]
47
50
 
48
51
  raise ControlError, "Target must be specified" unless @target
@@ -59,8 +62,9 @@ module ETL #:nodoc:
59
62
  conn.truncate(table_name) if truncate
60
63
  options = {}
61
64
  options[:columns] = columns
62
- if field_separator || field_enclosure
65
+ if field_separator || field_enclosure || line_separator || null_string
63
66
  options[:fields] = {}
67
+ options[:fields][:null_string] = null_string if null_string
64
68
  options[:fields][:delimited_by] = field_separator if field_separator
65
69
  options[:fields][:enclosed_by] = field_enclosure if field_enclosure
66
70
  options[:fields][:terminated_by] = line_separator if line_separator
@@ -64,6 +64,7 @@ module ETL #:nodoc:
64
64
  end
65
65
  end
66
66
  q << conditions.join(" AND ")
67
+ q << " LIMIT 1"
67
68
 
68
69
  #puts "query: #{q}"
69
70
  result = conn.select_one(q)
@@ -0,0 +1,55 @@
1
+ require 'iconv'
2
+
3
+ module ETL #:nodoc:
4
+ module Processor #:nodoc:
5
+ # The encode processor uses Iconv to convert a file from one encoding (eg: utf-8) to another (eg: latin1), line by line.
6
+ class EncodeProcessor < ETL::Processor::Processor
7
+
8
+ # The file to load from
9
+ attr_reader :source_file
10
+ # The file to write to
11
+ attr_reader :target_file
12
+ # The source file encoding
13
+ attr_reader :source_encoding
14
+ # The target file encoding
15
+ attr_reader :target_encoding
16
+
17
+ # Initialize the processor.
18
+ #
19
+ # Configuration options:
20
+ # * <tt>:source_file</tt>: The file to load data from
21
+ # * <tt>:source_encoding</tt>: The source file encoding (eg: 'latin1','utf-8'), as supported by Iconv
22
+ # * <tt>:target_file</tt>: The file to write data to
23
+ # * <tt>:target_encoding</tt>: The target file encoding
24
+ def initialize(control, configuration)
25
+ super
26
+ raise ControlError, "Source file must be specified" if configuration[:source_file].nil?
27
+ raise ControlError, "Target file must be specified" if configuration[:target_file].nil?
28
+ @source_file = File.join(File.dirname(control.file), configuration[:source_file])
29
+ @source_encoding = configuration[:source_encoding]
30
+ @target_file = File.join(File.dirname(control.file), configuration[:target_file])
31
+ @target_encoding = configuration[:target_encoding]
32
+ raise ControlError, "Source and target file cannot currently point to the same file" if source_file == target_file
33
+ begin
34
+ @iconv = Iconv.new(target_encoding,source_encoding)
35
+ rescue Iconv::InvalidEncoding
36
+ raise ControlError, "Either the source encoding '#{source_encoding}' or the target encoding '#{target_encoding}' is not supported"
37
+ end
38
+ end
39
+
40
+ # Execute the processor
41
+ def process
42
+ # operate line by line to handle large files without loading them in-memory
43
+ # could be replaced by a system iconv call when available, for greater performance
44
+ File.open(source_file) do |source|
45
+ #puts "Opening #{target_file}"
46
+ File.open(target_file,'w') do |target|
47
+ source.each_line do |line|
48
+ target << @iconv.iconv(line)
49
+ end
50
+ end
51
+ end
52
+ end
53
+ end
54
+ end
55
+ end