activewarehouse-etl 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/CHANGELOG CHANGED
@@ -8,5 +8,11 @@
8
8
  * Added support for generators in destinations
9
9
  * Added a SurrogateKeyGenerator for cases where the database doesn't support auto generated surrogate keys
10
10
 
11
- 0.3.0
12
- * Added support for calculated values in virtual fields with Proc
11
+ 0.3.0 - Dec 19, 2006
12
+ * Added support for calculated values in virtual fields with Proc
13
+
14
+ 0.4.0 - Jan 11, 2007
15
+ * Added :skip_lines option to file source configurations, which can be used to skip the first n lines in the source data file
16
+ * Added better error handling in delimited parser - an error is now raised if the expected and actual field lengths do not match
17
+ * Added :truncate option for database destination. Set to true to truncate before importing data.
18
+ * Added support for :unique => [] option and virtual fields for the database destination
data/Rakefile CHANGED
@@ -41,6 +41,7 @@ end
41
41
  PKG_FILES = FileList[
42
42
  'CHANGELOG',
43
43
  'README',
44
+ 'TODO',
44
45
  'Rakefile',
45
46
  'bin/**/*',
46
47
  'doc/**/*',
@@ -116,4 +117,9 @@ task :release => [ :package ] do
116
117
  puts release_command
117
118
  system(release_command)
118
119
  end
120
+ end
121
+
122
+ desc "Publish the API documentation"
123
+ task :pdoc => [:rdoc] do
124
+ Rake::SshDirPublisher.new("aeden@rubyforge.org", "/var/www/gforge-projects/activewarehouse/etl/rdoc", "rdoc").upload
119
125
  end
data/TODO ADDED
@@ -0,0 +1,9 @@
1
+ TODO
2
+
3
+ * Add built-in support for audit_dimension
4
+ * Do not rerun the processing if it isn't needed, i.e. the source and control files have not been modified (allow forced override)
5
+ * Add built-in FK lookup
6
+ * Provide greater control in error handling
7
+ ** Allow an error threshold
8
+ ** Don't die completely on a parse error; just stop processing that specific file if the error threshold is reached
9
+ ** Allow mismatch row length error in delimited parser to be ignored
data/lib/etl.rb CHANGED
@@ -1,5 +1,9 @@
1
+ # This source file requires all of the necessary gems and source files for ActiveWarehouse ETL. If you
2
+ # load this source file all of the other required files and gems will also be brought into the
3
+ # runtime.
4
+
1
5
  #--
2
- # Copyright (c) 2006 Anthony Eden
6
+ # Copyright (c) 2006-2007 Anthony Eden
3
7
  #
4
8
  # Permission is hereby granted, free of charge, to any person obtaining
5
9
  # a copy of this software and associated documentation files (the
@@ -49,6 +53,8 @@ require 'etl/transform'
49
53
  require 'etl/processor'
50
54
  require 'etl/generator'
51
55
 
56
+ require 'etl/active_record_ext'
57
+
52
58
  module ETL #:nodoc:
53
59
  class ETLError < StandardError #:nodoc:
54
60
  end
@@ -56,4 +62,6 @@ module ETL #:nodoc:
56
62
  end
57
63
  class DefinitionError < ControlError #:nodoc:
58
64
  end
65
+ class MismatchError < ETLError #:nodoc:
66
+ end
59
67
  end
@@ -0,0 +1 @@
1
+ require 'etl/active_record_ext/connection_adapters/mysql_adapter'
@@ -0,0 +1,34 @@
1
+ require 'active_record/connection_adapters/abstract_adapter'
2
+
3
+ module ActiveRecord
4
+ module ConnectionAdapters
5
+ class MysqlAdapter < AbstractAdapter
6
+ # Execute a truncate statement on the table. Note that in MySQL a truncate will *NOT* reset
7
+ # the auto_increment
8
+ def truncate(table_name)
9
+ execute("TRUNCATE #{table_name}")
10
+ end
11
+
12
+ # Bulk load the data in the specified file. This implementation always uses the LOCAL keyword
13
+ # so the file must be found locally, not on the remote server, to be loaded.
14
+ #
15
+ # Options:
16
+ # * <tt>:ignore</tt> -- Ignore the specified number of lines from the source file
17
+ # * <tt>:columns</tt> -- Array of column names defining the source file column order
18
+ # * <tt>:fields</tt> -- Hash of options for fields:
19
+ # ** <tt>:delimited_by</tt> -- The field delimiter
20
+ # ** <tt>:enclosed_by</tt> -- The field enclosure
21
+ def bulk_load(file, table_name, options={})
22
+ q = "LOAD DATA LOCAL INFILE '#{file}' INTO TABLE #{table_name}"
23
+ if options[:fields]
24
+ q << " FIELDS"
25
+ q << " TERMINATED BY '#{options[:fields][:delimited_by]}'" if options[:fields][:delimited_by]
26
+ q << " ENCLOSED BY '#{options[:fields][:enclosed_by]}'" if options[:fields][:enclosed_by]
27
+ end
28
+ q << " IGNORE #{options[:ignore]} LINES" if options[:ignore]
29
+ q << " (#{options[:columns].join(',')})" if options[:columns]
30
+ execute(q)
31
+ end
32
+ end
33
+ end
34
+ end
@@ -2,7 +2,7 @@ module ETL #:nodoc:
2
2
  module Control #:nodoc:
3
3
  class Destination
4
4
  attr_reader :control, :configuration, :mapping
5
- attr_accessor :buffer_size, :current_row
5
+ attr_accessor :buffer_size, :current_row, :unique
6
6
 
7
7
  class << self
8
8
  def class_for_name(name)
@@ -58,6 +58,41 @@ module ETL #:nodoc:
58
58
  end
59
59
  order
60
60
  end
61
+
62
+ # Return true if the row is allowed. The row will not be allowed if the :unique option is specified
63
+ # in the configuration and the compound key already exists
64
+ def row_allowed?(row)
65
+ if unique
66
+ key = (unique.collect { |k| row[k] }).join('|')
67
+ return false if compound_key_constraints[key]
68
+ compound_key_constraints[key] = 1
69
+ end
70
+ return true
71
+ end
72
+
73
+ # Get a hash of compound key constraints. This is used to determine if a row can be written when the
74
+ # unique option is specified
75
+ def compound_key_constraints
76
+ @compound_key_constraints ||= {}
77
+ end
78
+
79
+ # Add any virtual fields to the row
80
+ def add_virtuals!(row)
81
+ if mapping[:virtual]
82
+ mapping[:virtual].each do |key,value|
83
+ # Engine.logger.debug "Mapping virtual #{key}/#{value} for row #{row}"
84
+ case value
85
+ when Symbol
86
+ generators[key] ||= ETL::Generator::Generator.class_for_name(value).new
87
+ row[key] = generators[key].next
88
+ when Proc
89
+ row[key] = value.call(row)
90
+ else
91
+ row[key] = value
92
+ end
93
+ end
94
+ end
95
+ end
61
96
  end
62
97
  end
63
98
  end
@@ -1,19 +1,29 @@
1
1
  module ETL
2
2
  module Control
3
3
  class DatabaseDestination < Destination
4
- attr_reader :order
4
+ attr_reader :order, :truncate
5
5
  def initialize(control, configuration, mapping)
6
6
  super
7
+ @truncate = configuration[:truncate] ||= false
8
+ @unique = configuration[:unique]
7
9
  @order = mapping[:order] || order_from_source
8
10
  raise ControlError, "Order required in mapping" unless @order
9
11
  connect
10
12
  end
11
13
 
12
14
  def flush
13
- # TODO: add virtual fields and compound key constraint support like in the FileDestination
14
15
  conn = ActiveRecord::Base.connection
15
16
  conn.transaction do
17
+ conn.truncate(configuration[:table]) if truncate
18
+
16
19
  buffer.each do |row|
20
+ # check to see if this row's compound key constraint already exists
21
+ # note that the compound key constraint may not utilize virtual fields
22
+ next unless row_allowed?(row)
23
+
24
+ # add any virtual fields
25
+ add_virtuals!(row)
26
+
17
27
  names = []
18
28
  values = []
19
29
  order.each do |name|
@@ -28,6 +38,8 @@ module ETL
28
38
  buffer.clear
29
39
  end
30
40
  end
41
+
42
+ # Close the connection
31
43
  def close
32
44
  flush
33
45
  ActiveRecord::Base.connection.disconnect!
@@ -3,7 +3,7 @@ module ETL #:nodoc:
3
3
  # File as the final destination.
4
4
  class FileDestination < Destination
5
5
  attr_reader :file, :order
6
- attr_accessor :append, :separator, :eol, :enclose, :unique
6
+ attr_accessor :append, :separator, :eol, :enclose
7
7
 
8
8
  # Initialize the object.
9
9
  # * <tt>control</tt>: The Control object
@@ -57,20 +57,6 @@ module ETL #:nodoc:
57
57
  end
58
58
 
59
59
  private
60
- # Return true if the row is allowed. The row will not be allowed if the :unique option is specified
61
- # in the configuration and the compound key already exists
62
- def row_allowed?(row)
63
- if unique
64
- key = (unique.collect { |k| row[k] }).join('|')
65
- return false if compound_key_constraints[key]
66
- compound_key_constraints[key] = 1
67
- end
68
- return true
69
- end
70
-
71
- def compound_key_constraints
72
- @compound_key_constraints ||= {}
73
- end
74
60
 
75
61
  # Get the open file stream
76
62
  def f
@@ -82,22 +68,6 @@ module ETL #:nodoc:
82
68
  append ? 'a' : 'w'
83
69
  end
84
70
 
85
- def add_virtuals!(row)
86
- if mapping[:virtual]
87
- mapping[:virtual].each do |key,value|
88
- # Engine.logger.debug "Mapping virtual #{key}/#{value} for row #{row}"
89
- case value
90
- when Symbol
91
- generators[key] ||= ETL::Generator::Generator.class_for_name(value).new
92
- row[key] = generators[key].next
93
- when Proc
94
- row[key] = value.call(row)
95
- else
96
- row[key] = value
97
- end
98
- end
99
- end
100
- end
101
71
  end
102
72
  end
103
73
  end
@@ -1,6 +1,7 @@
1
1
  module ETL
2
2
  module Control
3
3
  class FileSource < Source
4
+ attr_accessor :skip_lines
4
5
  def initialize(control, configuration, definition)
5
6
  super
6
7
  configure
@@ -13,6 +14,7 @@ module ETL
13
14
  private
14
15
  def configure
15
16
  @parser = ETL::Parser::Parser.class_for_name(@configuration[:parser]).new(self)
17
+ @skip_lines = @configuration[:skip_lines] ||= 0
16
18
  end
17
19
  end
18
20
  end
@@ -10,12 +10,22 @@ module ETL #:nodoc:
10
10
  configure
11
11
  end
12
12
 
13
- # Returns each row
13
+ # Returns each row.
14
14
  def each
15
15
  options = {}
16
16
  Dir.glob(file).each do |file|
17
+ ETL::Engine.logger.debug "parsing #{file}"
18
+ line = 0
19
+ lines_skipped = 0
17
20
  FasterCSV.foreach(file, options) do |raw_row|
21
+ if lines_skipped < source.skip_lines
22
+ ETL::Engine.logger.debug "skipping line"
23
+ lines_skipped += 1
24
+ next
25
+ end
26
+ line += 1
18
27
  row = {}
28
+ validate_row(raw_row, line, file)
19
29
  raw_row.each_with_index do |record, index|
20
30
  f = fields[index]
21
31
  row[f.name] = convert(f.name, record, f.type)
@@ -31,6 +41,16 @@ module ETL #:nodoc:
31
41
  end
32
42
 
33
43
  private
44
+ def validate_row(row, line, file)
45
+ ETL::Engine.logger.debug "validing line #{line} in file #{file}"
46
+ if row.length != fields.length
47
+ raise_with_info( MismatchError,
48
+ "The number of rows from the source (#{row.length}) does not match the number of rows in the definition (#{fields.length})",
49
+ line, file
50
+ )
51
+ end
52
+ end
53
+
34
54
  def configure
35
55
  source.definition.each do |options|
36
56
  case options
@@ -16,7 +16,12 @@ module ETL #:nodoc:
16
16
  Dir.glob(file).each do |file|
17
17
  open(file).each do |line|
18
18
  row = {}
19
+ lines_skipped = 0
19
20
  fields.each do |name, f|
21
+ if lines_skipped < source.skip_lines
22
+ lines_skipped += 1
23
+ next
24
+ end
20
25
  # TODO make strip optional?
21
26
  row[name] = convert(name, line[f.field_start, f.field_length].strip, f.type)
22
27
  end
@@ -38,6 +38,10 @@ module ETL
38
38
  def file
39
39
  File.join(File.dirname(source.control.file), source.configuration[:file])
40
40
  end
41
+
42
+ def raise_with_info(error, message, file, line)
43
+ raise error, "#{message} (line #{line} in #{file})"
44
+ end
41
45
  end
42
46
  end
43
47
  end
@@ -1,5 +1,6 @@
1
1
  module ETL
2
2
  module Processor
3
+ # Processor which is used to bulk import data into a target database
3
4
  class BulkImportProcessor < ETL::Processor::Processor
4
5
  attr_reader :file, :target, :truncate, :columns
5
6
  def initialize(control, configuration)
@@ -16,21 +17,14 @@ module ETL
16
17
  conn.transaction do
17
18
  # TODO: Support all database types
18
19
  # Since LOCAL is used this must be allowed by both the client and server
19
- conn.execute("TRUNCATE #{target[:table]}") if truncate
20
- q = "LOAD DATA LOCAL INFILE '#{file}' INTO TABLE #{target[:table]}"
21
- q << " (#{columns.join(',')})" if columns
22
- conn.execute(q)
20
+ conn.truncate(target[:table]) if truncate
21
+ options = {}
22
+ options[:columns] = columns
23
+ conn.bulk_load(file, target[:table], options)
23
24
  end
24
25
  end
25
- private
26
- def log
27
- unless @log
28
- @log = Logger.new(STDOUT)
29
- @log.level = Logger::DEBUG
30
- end
31
- @log
32
- end
33
26
 
27
+ private
34
28
  # Connect to the database
35
29
  def connect
36
30
  ActiveRecord::Base.establish_connection(
@@ -13,6 +13,13 @@ module ETL #:nodoc:
13
13
  def configuration
14
14
  @configuration
15
15
  end
16
+ def log
17
+ unless @log
18
+ @log = Logger.new(STDOUT)
19
+ @log.level = Logger::DEBUG
20
+ end
21
+ @log
22
+ end
16
23
  end
17
24
  end
18
25
  end
@@ -0,0 +1,30 @@
1
+ module ETL
2
+ module Processor
3
+ # A processor which will truncate a table. Use as a pre-processor for cleaning out a table
4
+ # prior to loading
5
+ class TruncateProcessor < ETL::Processor::Processor
6
+ attr_reader :file, :table
7
+ def initialize(control, configuration)
8
+ super
9
+ @file = File.join(File.dirname(control.file), configuration[:file])
10
+ @table = configuration[:table]
11
+ connect
12
+ end
13
+ def process
14
+ conn = ActiveRecord::Base.connection
15
+ conn.truncate
16
+ end
17
+
18
+ # Connect to the database
19
+ def connect
20
+ ActiveRecord::Base.establish_connection(
21
+ :adapter => (target[:adapter] || :mysql),
22
+ :username => (target[:username] || 'root'),
23
+ :host => (target[:host] || 'localhost'),
24
+ :password => target[:password],
25
+ :database => target[:database]
26
+ )
27
+ end
28
+ end
29
+ end
30
+ end
@@ -7,6 +7,7 @@ module ETL #:nodoc:
7
7
  def initialize(control, configuration={})
8
8
  super
9
9
  end
10
+ # Transform the value with a SHA1 digest algorithm.
10
11
  def transform(value)
11
12
  Digest::SHA1.hexdigest(value)
12
13
  end
@@ -2,6 +2,9 @@ module ETL
2
2
  module Transform
3
3
  class Transform
4
4
  class << self
5
+ # Transform the specified value using the given transforms. The transforms can either be
6
+ # Proc objects or objects which extend from Transform and implement the method <tt>transform(value)</tt>.
7
+ # Any other object will result in a ControlError being raised.
5
8
  def transform(name, value, transforms)
6
9
  # logger.debug "Transforming field #{name}" if transforms.length > 0
7
10
  transforms.each do |transform|
@@ -20,6 +23,7 @@ module ETL
20
23
 
21
24
  attr_reader :control, :configuration
22
25
 
26
+ # Initialize the transform object
23
27
  def initialize(control, configuration={})
24
28
  @control = control
25
29
  @configuration = configuration
data/lib/etl/version.rb CHANGED
@@ -1,7 +1,7 @@
1
1
  module ETL
2
2
  module VERSION #:nodoc:
3
3
  MAJOR = 0
4
- MINOR = 3
4
+ MINOR = 4
5
5
  TINY = 0
6
6
 
7
7
  STRING = [MAJOR, MINOR, TINY].join('.')
metadata CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.9.0
3
3
  specification_version: 1
4
4
  name: activewarehouse-etl
5
5
  version: !ruby/object:Gem::Version
6
- version: 0.3.0
7
- date: 2006-12-08 00:00:00 -05:00
6
+ version: 0.4.0
7
+ date: 2007-01-11 00:00:00 -05:00
8
8
  summary: Pure Ruby ETL package.
9
9
  require_paths:
10
10
  - lib
@@ -31,10 +31,13 @@ authors:
31
31
  files:
32
32
  - CHANGELOG
33
33
  - README
34
+ - TODO
34
35
  - Rakefile
35
36
  - bin/etl
36
37
  - lib/etl
37
38
  - lib/etl.rb
39
+ - lib/etl/active_record_ext
40
+ - lib/etl/active_record_ext.rb
38
41
  - lib/etl/commands
39
42
  - lib/etl/control
40
43
  - lib/etl/control.rb
@@ -48,6 +51,8 @@ files:
48
51
  - lib/etl/transform
49
52
  - lib/etl/transform.rb
50
53
  - lib/etl/version.rb
54
+ - lib/etl/active_record_ext/connection_adapters
55
+ - lib/etl/active_record_ext/connection_adapters/mysql_adapter.rb
51
56
  - lib/etl/commands/etl.rb
52
57
  - lib/etl/control/control.rb
53
58
  - lib/etl/control/destination
@@ -66,6 +71,7 @@ files:
66
71
  - lib/etl/parser/xml_parser.rb
67
72
  - lib/etl/processor/bulk_import_processor.rb
68
73
  - lib/etl/processor/processor.rb
74
+ - lib/etl/processor/truncate_processor.rb
69
75
  - lib/etl/transform/decode_transform.rb
70
76
  - lib/etl/transform/sha1_transform.rb
71
77
  - lib/etl/transform/transform.rb