activewarehouse-etl 0.3.0 → 0.4.0

data/CHANGELOG CHANGED
@@ -8,5 +8,11 @@
  * Added support for generators in destinations
  * Added a SurrogateKeyGenerator for cases where the database doesn't support auto generated surrogate keys
 
- 0.3.0
- * Added support for calculated values in virtual fields with Proc
+ 0.3.0 - Dec 19, 2006
+ * Added support for calculated values in virtual fields with Proc
+
+ 0.4.0 - Jan 11, 2007
+ * Added :skip_lines option to file source configurations, which can be used to skip the first n lines in the source data file
+ * Added better error handling in the delimited parser - an error is now raised if the expected and actual field counts do not match
+ * Added :truncate option for the database destination. Set to true to truncate the table before importing data.
+ * Added support for the :unique => [] option and virtual fields in the database destination
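
All of the 0.4.0 additions above are control-file options. A rough sketch of how they might be combined in a control file follows; the source/destination DSL shape and every literal value are illustrative assumptions, only the option keys (:skip_lines, :truncate, :unique, :virtual) come from this changelog:

    # Hypothetical control file fragment -- names and DSL shape are assumptions.
    source :in, {
      :file       => 'people.csv',
      :parser     => :delimited,
      :skip_lines => 1                   # new in 0.4.0: skip the header line
    }, [
      :first_name, :last_name, :email
    ]

    destination :out, {
      :type     => :database,
      :table    => 'person_dimension',
      :truncate => true,                 # new in 0.4.0: truncate the table before loading
      :unique   => [:email]              # new in 0.4.0: drop rows whose compound key repeats
    }, {
      :order   => [:first_name, :last_name, :email, :loaded_at],
      :virtual => { :loaded_at => Proc.new { Time.now } }
    }
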
data/Rakefile CHANGED
@@ -41,6 +41,7 @@ end
  PKG_FILES = FileList[
    'CHANGELOG',
    'README',
+   'TODO',
    'Rakefile',
    'bin/**/*',
    'doc/**/*',
@@ -116,4 +117,9 @@ task :release => [ :package ] do
      puts release_command
      system(release_command)
    end
+ end
+
+ desc "Publish the API documentation"
+ task :pdoc => [:rdoc] do
+   Rake::SshDirPublisher.new("aeden@rubyforge.org", "/var/www/gforge-projects/activewarehouse/etl/rdoc", "rdoc").upload
  end
data/TODO ADDED
@@ -0,0 +1,9 @@
+ TODO
+
+ * Add built-in support for audit_dimension
+ * Do not rerun the processing if it isn't needed, i.e. the source and control files have not been modified (allow a forced override)
+ * Add built-in FK lookup
+ * Provide greater control in error handling
+ ** Allow an error threshold
+ ** Don't die completely on a parse error; just stop processing the specific file once the error threshold is reached
+ ** Allow the mismatched row length error in the delimited parser to be ignored
data/lib/etl.rb CHANGED
@@ -1,5 +1,9 @@
+ # This source file requires all of the necessary gems and source files for ActiveWarehouse ETL. If you
+ # load this source file all of the other required files and gems will also be brought into the
+ # runtime.
+
  #--
- # Copyright (c) 2006 Anthony Eden
+ # Copyright (c) 2006-2007 Anthony Eden
  #
  # Permission is hereby granted, free of charge, to any person obtaining
  # a copy of this software and associated documentation files (the
@@ -49,6 +53,8 @@ require 'etl/transform'
  require 'etl/processor'
  require 'etl/generator'
 
+ require 'etl/active_record_ext'
+
  module ETL #:nodoc:
    class ETLError < StandardError #:nodoc:
    end
@@ -56,4 +62,6 @@ module ETL #:nodoc:
    end
    class DefinitionError < ControlError #:nodoc:
    end
+   class MismatchError < ETLError #:nodoc:
+   end
  end
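
Because MismatchError is defined directly under the ETL module, code that drives a control file can rescue it by its qualified name. A small sketch (the commented-out run step is a placeholder, not an API from this diff):

    require 'etl'
    begin
      # ... execute a control file that reads a delimited source ...
    rescue ETL::MismatchError => e
      # New in 0.4.0: raised when a delimited row's field count differs from the definition.
      puts e.message
    end
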
data/lib/etl/active_record_ext.rb ADDED
@@ -0,0 +1 @@
+ require 'etl/active_record_ext/connection_adapters/mysql_adapter'
data/lib/etl/active_record_ext/connection_adapters/mysql_adapter.rb ADDED
@@ -0,0 +1,34 @@
+ require 'active_record/connection_adapters/abstract_adapter'
+
+ module ActiveRecord
+   module ConnectionAdapters
+     class MysqlAdapter < AbstractAdapter
+       # Execute a truncate statement on the table. Note that in MySQL a truncate will *NOT* reset
+       # the auto_increment
+       def truncate(table_name)
+         execute("TRUNCATE #{table_name}")
+       end
+
+       # Bulk load the data in the specified file. This implementation always uses the LOCAL keyword
+       # so the file must be found locally, not on the remote server, to be loaded.
+       #
+       # Options:
+       # * <tt>:ignore</tt> -- Ignore the specified number of lines from the source file
+       # * <tt>:columns</tt> -- Array of column names defining the source file column order
+       # * <tt>:fields</tt> -- Hash of options for fields:
+       # ** <tt>:delimited_by</tt> -- The field delimiter
+       # ** <tt>:enclosed_by</tt> -- The field enclosure
+       def bulk_load(file, table_name, options={})
+         q = "LOAD DATA LOCAL INFILE '#{file}' INTO TABLE #{table_name}"
+         if options[:fields]
+           q << " FIELDS"
+           q << " TERMINATED BY '#{options[:fields][:delimited_by]}'" if options[:fields][:delimited_by]
+           q << " ENCLOSED BY '#{options[:fields][:enclosed_by]}'" if options[:fields][:enclosed_by]
+         end
+         q << " IGNORE #{options[:ignore]} LINES" if options[:ignore]
+         q << " (#{options[:columns].join(',')})" if options[:columns]
+         execute(q)
+       end
+     end
+   end
+ end
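
For context, a minimal sketch of calling these adapter additions directly; it assumes an ActiveRecord MySQL connection has already been established, and the file, table, and column names are made up:

    conn = ActiveRecord::Base.connection
    conn.truncate('person_dimension')
    conn.bulk_load('/tmp/person.csv', 'person_dimension',
      :ignore  => 1,                               # skip a header line
      :columns => %w(first_name last_name email),  # column order in the source file
      :fields  => { :delimited_by => ',', :enclosed_by => '"' })
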
data/lib/etl/control/destination.rb CHANGED
@@ -2,7 +2,7 @@ module ETL #:nodoc:
    module Control #:nodoc:
      class Destination
        attr_reader :control, :configuration, :mapping
-       attr_accessor :buffer_size, :current_row
+       attr_accessor :buffer_size, :current_row, :unique
 
        class << self
          def class_for_name(name)
@@ -58,6 +58,41 @@ module ETL #:nodoc:
          end
          order
        end
+
+       # Return true if the row is allowed. The row will not be allowed if the :unique option is specified
+       # in the configuration and the compound key already exists
+       def row_allowed?(row)
+         if unique
+           key = (unique.collect { |k| row[k] }).join('|')
+           return false if compound_key_constraints[key]
+           compound_key_constraints[key] = 1
+         end
+         return true
+       end
+
+       # Get a hash of compound key constraints. This is used to determine if a row can be written when the
+       # unique option is specified
+       def compound_key_constraints
+         @compound_key_constraints ||= {}
+       end
+
+       # Add any virtual fields to the row
+       def add_virtuals!(row)
+         if mapping[:virtual]
+           mapping[:virtual].each do |key,value|
+             # Engine.logger.debug "Mapping virtual #{key}/#{value} for row #{row}"
+             case value
+             when Symbol
+               generators[key] ||= ETL::Generator::Generator.class_for_name(value).new
+               row[key] = generators[key].next
+             when Proc
+               row[key] = value.call(row)
+             else
+               row[key] = value
+             end
+           end
+         end
+       end
      end
    end
  end
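
To make the Symbol/Proc/literal dispatch in add_virtuals! concrete, an illustrative :virtual mapping fragment; the field names are invented, and :surrogate_key as a generator name is an assumption based on the SurrogateKeyGenerator mentioned in the changelog:

    :virtual => {
      :id        => :surrogate_key,               # Symbol: resolved to a generator, advanced once per row
      :loaded_at => Proc.new { |row| Time.now },  # Proc: called with the row, result stored in the field
      :source    => 'orders.csv'                  # anything else: copied into the field as a literal
    }
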
data/lib/etl/control/destination/database_destination.rb CHANGED
@@ -1,19 +1,29 @@
  module ETL
    module Control
      class DatabaseDestination < Destination
-       attr_reader :order
+       attr_reader :order, :truncate
        def initialize(control, configuration, mapping)
          super
+         @truncate = configuration[:truncate] ||= false
+         @unique = configuration[:unique]
          @order = mapping[:order] || order_from_source
          raise ControlError, "Order required in mapping" unless @order
          connect
        end
 
        def flush
-         # TODO: add virtual fields and compound key constraint support like in the FileDestination
          conn = ActiveRecord::Base.connection
          conn.transaction do
+           conn.truncate(configuration[:table]) if truncate
+
            buffer.each do |row|
+             # check to see if this row's compound key constraint already exists
+             # note that the compound key constraint may not utilize virtual fields
+             next unless row_allowed?(row)
+
+             # add any virtual fields
+             add_virtuals!(row)
+
              names = []
              values = []
              order.each do |name|
@@ -28,6 +38,8 @@ module ETL
            buffer.clear
          end
        end
+
+       # Close the connection
        def close
          flush
          ActiveRecord::Base.connection.disconnect!
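
To illustrate the effect of the new :unique option on this destination, an invented example: with :unique => [:first_name, :last_name] the compound key is the joined values (for example "Ann|Lee"), so of the rows below only the first and third are written; the second is dropped by row_allowed? before any virtual fields are added:

    { :first_name => 'Ann', :last_name => 'Lee', :email => 'ann@example.com' }
    { :first_name => 'Ann', :last_name => 'Lee', :email => 'lee.ann@example.com' }  # duplicate compound key
    { :first_name => 'Bob', :last_name => 'Ray', :email => 'bob@example.com' }
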
data/lib/etl/control/destination/file_destination.rb CHANGED
@@ -3,7 +3,7 @@ module ETL #:nodoc:
      # File as the final destination.
      class FileDestination < Destination
        attr_reader :file, :order
-       attr_accessor :append, :separator, :eol, :enclose, :unique
+       attr_accessor :append, :separator, :eol, :enclose
 
        # Initialize the object.
        # * <tt>control</tt>: The Control object
@@ -57,20 +57,6 @@ module ETL #:nodoc:
        end
 
        private
-       # Return true if the row is allowed. The row will not be allowed if the :unique option is specified
-       # in the configuration and the compound key already exists
-       def row_allowed?(row)
-         if unique
-           key = (unique.collect { |k| row[k] }).join('|')
-           return false if compound_key_constraints[key]
-           compound_key_constraints[key] = 1
-         end
-         return true
-       end
-
-       def compound_key_constraints
-         @compound_key_constraints ||= {}
-       end
 
        # Get the open file stream
        def f
@@ -82,22 +68,6 @@ module ETL #:nodoc:
          append ? 'a' : 'w'
        end
 
-       def add_virtuals!(row)
-         if mapping[:virtual]
-           mapping[:virtual].each do |key,value|
-             # Engine.logger.debug "Mapping virtual #{key}/#{value} for row #{row}"
-             case value
-             when Symbol
-               generators[key] ||= ETL::Generator::Generator.class_for_name(value).new
-               row[key] = generators[key].next
-             when Proc
-               row[key] = value.call(row)
-             else
-               row[key] = value
-             end
-           end
-         end
-       end
      end
    end
  end
data/lib/etl/control/source/file_source.rb CHANGED
@@ -1,6 +1,7 @@
  module ETL
    module Control
      class FileSource < Source
+       attr_accessor :skip_lines
        def initialize(control, configuration, definition)
          super
          configure
@@ -13,6 +14,7 @@
        private
        def configure
          @parser = ETL::Parser::Parser.class_for_name(@configuration[:parser]).new(self)
+         @skip_lines = @configuration[:skip_lines] ||= 0
        end
      end
    end
data/lib/etl/parser/delimited_parser.rb CHANGED
@@ -10,12 +10,22 @@ module ETL #:nodoc:
          configure
        end
 
-       # Returns each row
+       # Returns each row.
        def each
          options = {}
          Dir.glob(file).each do |file|
+           ETL::Engine.logger.debug "parsing #{file}"
+           line = 0
+           lines_skipped = 0
            FasterCSV.foreach(file, options) do |raw_row|
+             if lines_skipped < source.skip_lines
+               ETL::Engine.logger.debug "skipping line"
+               lines_skipped += 1
+               next
+             end
+             line += 1
              row = {}
+             validate_row(raw_row, line, file)
              raw_row.each_with_index do |record, index|
                f = fields[index]
                row[f.name] = convert(f.name, record, f.type)
@@ -31,6 +41,16 @@ module ETL #:nodoc:
        end
 
        private
+       def validate_row(row, line, file)
+         ETL::Engine.logger.debug "validating line #{line} in file #{file}"
+         if row.length != fields.length
+           raise_with_info( MismatchError,
+             "The number of fields in the source row (#{row.length}) does not match the number of fields in the definition (#{fields.length})",
+             file, line
+           )
+         end
+       end
+
        def configure
          source.definition.each do |options|
            case options
data/lib/etl/parser/fixed_width_parser.rb CHANGED
@@ -16,7 +16,12 @@
          Dir.glob(file).each do |file|
+           lines_skipped = 0
            open(file).each do |line|
+             if lines_skipped < source.skip_lines
+               lines_skipped += 1
+               next
+             end
              row = {}
              fields.each do |name, f|
                # TODO make strip optional?
                row[name] = convert(name, line[f.field_start, f.field_length].strip, f.type)
              end
data/lib/etl/parser/parser.rb CHANGED
@@ -38,6 +38,10 @@
        def file
          File.join(File.dirname(source.control.file), source.configuration[:file])
        end
+
+       def raise_with_info(error, message, file, line)
+         raise error, "#{message} (line #{line} in #{file})"
+       end
      end
    end
  end
data/lib/etl/processor/bulk_import_processor.rb CHANGED
@@ -1,5 +1,6 @@
  module ETL
    module Processor
+     # Processor which is used to bulk import data into a target database
      class BulkImportProcessor < ETL::Processor::Processor
        attr_reader :file, :target, :truncate, :columns
        def initialize(control, configuration)
@@ -16,21 +17,14 @@
          conn.transaction do
            # TODO: Support all database types
            # Since LOCAL is used this must be allowed by both the client and server
-           conn.execute("TRUNCATE #{target[:table]}") if truncate
-           q = "LOAD DATA LOCAL INFILE '#{file}' INTO TABLE #{target[:table]}"
-           q << " (#{columns.join(',')})" if columns
-           conn.execute(q)
+           conn.truncate(target[:table]) if truncate
+           options = {}
+           options[:columns] = columns
+           conn.bulk_load(file, target[:table], options)
          end
        end
-       private
-       def log
-         unless @log
-           @log = Logger.new(STDOUT)
-           @log.level = Logger::DEBUG
-         end
-         @log
-       end
 
+       private
        # Connect to the database
        def connect
          ActiveRecord::Base.establish_connection(
data/lib/etl/processor/processor.rb CHANGED
@@ -13,6 +13,13 @@
        def configuration
          @configuration
        end
+       def log
+         unless @log
+           @log = Logger.new(STDOUT)
+           @log.level = Logger::DEBUG
+         end
+         @log
+       end
      end
    end
  end
data/lib/etl/processor/truncate_processor.rb ADDED
@@ -0,0 +1,30 @@
+ module ETL
+   module Processor
+     # A processor which will truncate a table. Use as a pre-processor for cleaning out a table
+     # prior to loading
+     class TruncateProcessor < ETL::Processor::Processor
+       attr_reader :file, :table
+       def initialize(control, configuration)
+         super
+         @file = File.join(File.dirname(control.file), configuration[:file])
+         @table = configuration[:table]
+         connect
+       end
+       def process
+         conn = ActiveRecord::Base.connection
+         conn.truncate(table)
+       end
+
+       # Connect to the database
+       def connect
+         ActiveRecord::Base.establish_connection(
+           :adapter => (target[:adapter] || :mysql),
+           :username => (target[:username] || 'root'),
+           :host => (target[:host] || 'localhost'),
+           :password => target[:password],
+           :database => target[:database]
+         )
+       end
+     end
+   end
+ end
data/lib/etl/transform/sha1_transform.rb CHANGED
@@ -7,6 +7,7 @@ module ETL #:nodoc:
        def initialize(control, configuration={})
          super
        end
+       # Transform the value with a SHA1 digest algorithm.
        def transform(value)
          Digest::SHA1.hexdigest(value)
        end
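
A quick usage sketch; instantiating the transform directly and passing nil for the control object is purely illustrative, since the control framework normally constructs transforms itself:

    require 'digest/sha1'
    t = ETL::Transform::Sha1Transform.new(nil)
    t.transform('123-45-6789')   # => 40-character hex SHA-1 digest of the value
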
data/lib/etl/transform/transform.rb CHANGED
@@ -2,6 +2,9 @@ module ETL
    module Transform
      class Transform
        class << self
+         # Transform the specified value using the given transforms. The transforms can either be
+         # Proc objects or objects which inherit from Transform and implement the method <tt>transform(value)</tt>.
+         # Any other object will result in a ControlError being raised.
          def transform(name, value, transforms)
            # logger.debug "Transforming field #{name}" if transforms.length > 0
            transforms.each do |transform|
@@ -20,6 +23,7 @@
 
        attr_reader :control, :configuration
 
+       # Initialize the transform object
        def initialize(control, configuration={})
          @control = control
          @configuration = configuration
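
A minimal sketch of the class-level dispatch described above; the field name and values are invented, and it assumes the method returns the accumulated value after applying each transform in order:

    cleaned = ETL::Transform::Transform.transform(:email, ' Foo@Example.COM ', [
      Proc.new { |v| v.strip },
      Proc.new { |v| v.downcase }
    ])
    # => "foo@example.com"
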
data/lib/etl/version.rb CHANGED
@@ -1,7 +1,7 @@
  module ETL
    module VERSION #:nodoc:
      MAJOR = 0
-     MINOR = 3
+     MINOR = 4
      TINY = 0
 
      STRING = [MAJOR, MINOR, TINY].join('.')
metadata CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.9.0
  specification_version: 1
  name: activewarehouse-etl
  version: !ruby/object:Gem::Version
-   version: 0.3.0
- date: 2006-12-08 00:00:00 -05:00
+   version: 0.4.0
+ date: 2007-01-11 00:00:00 -05:00
  summary: Pure Ruby ETL package.
  require_paths:
  - lib
@@ -31,10 +31,13 @@ authors:
  files:
  - CHANGELOG
  - README
+ - TODO
  - Rakefile
  - bin/etl
  - lib/etl
  - lib/etl.rb
+ - lib/etl/active_record_ext
+ - lib/etl/active_record_ext.rb
  - lib/etl/commands
  - lib/etl/control
  - lib/etl/control.rb
@@ -48,6 +51,8 @@ files:
  - lib/etl/transform
  - lib/etl/transform.rb
  - lib/etl/version.rb
+ - lib/etl/active_record_ext/connection_adapters
+ - lib/etl/active_record_ext/connection_adapters/mysql_adapter.rb
  - lib/etl/commands/etl.rb
  - lib/etl/control/control.rb
  - lib/etl/control/destination
@@ -66,6 +71,7 @@ files:
  - lib/etl/parser/xml_parser.rb
  - lib/etl/processor/bulk_import_processor.rb
  - lib/etl/processor/processor.rb
+ - lib/etl/processor/truncate_processor.rb
  - lib/etl/transform/decode_transform.rb
  - lib/etl/transform/sha1_transform.rb
  - lib/etl/transform/transform.rb