activewarehouse-etl 0.1.0 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
data/Rakefile CHANGED
@@ -78,6 +78,8 @@ end
78
78
 
79
79
  Rake::GemPackageTask.new(spec) do |pkg|
80
80
  pkg.gem_spec = spec
81
+ pkg.need_tar = true
82
+ pkg.need_zip = true
81
83
  end
82
84
 
83
85
  desc "Generate code statistics"
data/lib/etl.rb CHANGED
@@ -47,6 +47,7 @@ require 'etl/control'
47
47
  require 'etl/parser'
48
48
  require 'etl/transform'
49
49
  require 'etl/processor'
50
+ require 'etl/generator'
50
51
 
51
52
  module ETL #:nodoc:
52
53
  class ETLError < StandardError #:nodoc:
@@ -42,6 +42,10 @@ module ETL #:nodoc:
42
42
  @buffer ||= []
43
43
  end
44
44
 
45
# Lazily created Hash of generator instances.
# The same Hash object is returned on every call, so entries stored by
# callers persist for the lifetime of this object.
def generators
  @generators || (@generators = {})
end
48
+
45
49
  # Get the order of elements from the source order
46
50
  def order_from_source
47
51
  order = []
@@ -10,6 +10,7 @@ module ETL
10
10
  end
11
11
 
12
12
  def flush
13
+ # TODO: add virtual fields and compound key constraint support like in the FileDestination
13
14
  conn = ActiveRecord::Base.connection
14
15
  conn.transaction do
15
16
  buffer.each do |row|
@@ -17,10 +18,10 @@ module ETL
17
18
  values = []
18
19
  order.each do |name|
19
20
  names << name
20
- values << "'#{row[name]}'"
21
+ values << "'#{row[name]}'" # TODO: this is probably not database agnostic
21
22
  end
22
23
  q = "INSERT INTO #{configuration[:table]} (#{names.join(',')}) VALUES (#{values.join(',')})"
23
- ETL::Engine.logger.debug("Query: #{q}")
24
+ # ETL::Engine.logger.debug("Query: #{q}")
24
25
  conn.execute(q, "Insert row #{current_row}")
25
26
  @current_row += 1
26
27
  end
@@ -3,7 +3,7 @@ module ETL #:nodoc:
3
3
  # File as the final destination.
4
4
  class FileDestination < Destination
5
5
  attr_reader :file, :order
6
- attr_accessor :append, :separator, :eol, :enclose
6
+ attr_accessor :append, :separator, :eol, :enclose, :unique
7
7
 
8
8
  # Initialize the object.
9
9
  # * <tt>control</tt>: The Control object
@@ -15,7 +15,8 @@ module ETL #:nodoc:
15
15
  @append = configuration[:append] ||= false
16
16
  @separator = configuration[:separator] ||= ','
17
17
  @eol = configuration[:eol] ||= "\n"
18
- @enclose = configuration[:enclose] ||= nil
18
+ @enclose = configuration[:enclose]
19
+ @unique = configuration[:unique]
19
20
 
20
21
  @order = mapping[:order] || order_from_source
21
22
  raise ControlError, "Order required in mapping" unless @order
@@ -27,20 +28,49 @@ module ETL #:nodoc:
27
28
  f.close
28
29
  end
29
30
 
31
# Flush the destination buffer.
#
# For every buffered row that passes +row_allowed?+, the virtual fields are
# added, the values are picked in the configured +order+, optionally wrapped
# in the +enclose+ string, joined with +separator+ and written to the file
# followed by +eol+. The buffer is emptied afterwards.
def flush
  # The enclose string is matched literally. Escaping it keeps characters
  # such as "|" or "*" from being interpreted as regular expression syntax.
  enclose_pattern = enclose.nil? ? nil : Regexp.new(Regexp.escape(enclose))

  buffer.each do |row|
    # check to see if this row's compound key constraint already exists
    # note that the compound key constraint may not utilize virtual fields
    next unless row_allowed?(row)

    # add any virtual fields
    add_virtuals!(row)

    # collect all of the values using the order designated in the configuration
    values = order.collect { |name| row[name] }

    # enclose the value if required, prefixing every embedded occurrence of
    # the enclose string with a backslash
    if enclose_pattern
      values.collect! do |v|
        enclose + v.to_s.gsub(enclose_pattern) { |found| "\\#{found}" } + enclose
      end
    end

    # write the values joined by the separator defined in the configuration
    f.write(values.join(separator))

    # write the end-of-line
    f.write(eol)
  end
  buffer.clear
end
42
57
 
43
58
  private
59
# Return true if the row is allowed. The row will not be allowed if the :unique option is specified
# in the configuration and the compound key already exists.
#
# * <tt>row</tt>: the row whose +unique+ fields make up the compound key
#
# The key is the array of stringified field values rather than a single
# delimiter-joined string, so values that themselves contain the delimiter
# (e.g. "a|b","c" versus "a","b|c") are not mistaken for one another.
# Values are still compared by their string form.
def row_allowed?(row)
  return true unless unique

  key = unique.collect { |k| row[k].to_s }
  return false if compound_key_constraints[key]

  # remember the key so later rows with the same values are rejected
  compound_key_constraints[key] = 1
  true
end
69
+
70
# Lazily created Hash holding the compound keys recorded so far.
# The same Hash object is returned on every call.
def compound_key_constraints
  @compound_key_constraints = {} unless @compound_key_constraints
  @compound_key_constraints
end
73
+
44
74
  # Get the open file stream
45
75
  def f
46
76
  @f ||= open(file, mode)
@@ -51,10 +81,16 @@ module ETL #:nodoc:
51
81
  append ? 'a' : 'w'
52
82
  end
53
83
 
54
- def add_virtuals(row)
84
# Add the virtual fields declared under mapping[:virtual] to the row, in place.
#
# * <tt>row</tt>: the row Hash to modify
#
# A Symbol value names a generator: the generator class is looked up once
# per field, cached in +generators+, and its +next+ value is stored in the
# row. Any other value is copied into the row unchanged.
def add_virtuals!(row)
  virtual_fields = mapping[:virtual]
  return unless virtual_fields

  virtual_fields.each do |field, spec|
    if Symbol === spec
      generator = (generators[field] ||= ETL::Generator::Generator.class_for_name(spec).new)
      row[field] = generator.next
    else
      row[field] = spec
    end
  end
end
@@ -0,0 +1,2 @@
1
+ require 'etl/generator/generator'
2
+ Dir[File.dirname(__FILE__) + "/generator/*.rb"].each { |file| require(file) }
@@ -0,0 +1,11 @@
1
module ETL
  module Generator
    # Base class for generators.
    class Generator
      # Resolve a generator class from its short name.
      #
      # * <tt>name</tt>: Symbol or String such as :surrogate_key
      #
      # The name is classified, suffixed with "Generator" and looked up as a
      # constant of ETL::Generator; an unknown name surfaces as the NameError
      # raised by const_get.
      # NOTE(review): String#classify is not core Ruby -- presumably supplied
      # by ActiveSupport; confirm it is loaded before this is called.
      def self.class_for_name(name)
        class_name = "#{name.to_s.classify}Generator"
        ETL::Generator.const_get(class_name)
      end
    end
  end
end
@@ -0,0 +1,10 @@
1
module ETL
  module Generator
    # Generator that hands out sequential integer keys.
    class SurrogateKeyGenerator < Generator
      # Return the next key. The first call on an instance returns 1 and
      # every later call returns the previous value plus one.
      def next
        @surrogate_key = (@surrogate_key || 0) + 1
      end
    end
  end
end
@@ -0,0 +1,66 @@
1
require 'benchmark'
require 'rexml/document'

module ETL
  module Parser
    # Parser that turns XML documents into rows using REXML.
    #
    # The source definition must provide:
    # * <tt>:collection</tt>: XPath selecting the elements that become rows
    # * <tt>:fields</tt>: list of field definitions, each either a Symbol
    #   (used as both the field name and its XPath) or a Hash with
    #   <tt>:name</tt> and optional <tt>:xpath</tt> and <tt>:type</tt>
    class XmlParser < ETL::Parser::Parser
      include Enumerable

      # Initialize the parser
      # * <tt>source</tt>: The Source object
      def initialize(source)
        super
        configure
      end

      # Yields one Hash per element matched by the collection XPath, for
      # every file matched by the +file+ glob. Each Hash maps a field name
      # to the converted text found at that field's XPath.
      def each
        Dir.glob(file).each do |path|
          doc = nil
          elapsed = Benchmark.realtime do
            # Block form so the file handle is closed once the document has
            # been built, instead of being left open.
            doc = File.open(path) { |io| REXML::Document.new(io) }
          end
          Engine.logger.info "XML #{path} parsed in #{elapsed}s"
          doc.elements.each(@collection_xpath) do |element|
            row = {}
            fields.each do |f|
              value = element.text(f.xpath)
              row[f.name] = convert(f.name, value, f.type)
            end
            yield row
          end
        end
      end

      # Get an array of defined fields
      def fields
        @fields ||= []
      end

      private

      # Read the collection XPath and the field definitions from the source
      # definition.
      #
      # Raises a RuntimeError when no collection XPath is given, and a
      # DefinitionError when a field definition is neither a Symbol nor a Hash.
      def configure
        @collection_xpath = source.definition[:collection]
        raise "Collection XPath is required" if @collection_xpath.nil?

        source.definition[:fields].each do |options|
          case options
          when Symbol
            fields << Field.new(options, options.to_s)
          when Hash
            options[:xpath] ||= options[:name]
            # Fall back to :string when no type is given so that both kinds
            # of field definition end up with the same default type; passing
            # nil explicitly would bypass Field's own default.
            fields << Field.new(options[:name], options[:xpath].to_s, options[:type] || :string)
          else
            raise DefinitionError, "Each field definition must either be an symbol or a hash of options for the field"
          end
        end
      end

      # A single field: its name in the row, the XPath its text is read
      # from, and its type (defaults to :string).
      class Field
        attr_reader :name, :xpath, :type

        def initialize(name, xpath, type=:string)
          @name = name
          @xpath = xpath
          @type = type
        end
      end
    end
  end
end
@@ -1,18 +1,25 @@
1
1
  module ETL
2
2
  module Processor
3
3
  class BulkImportProcessor < ETL::Processor::Processor
4
- attr_reader :file, :target
4
+ attr_reader :file, :target, :truncate, :columns
5
5
  def initialize(control, configuration)
6
6
  super
7
7
  @file = File.join(File.dirname(control.file), configuration[:file])
8
8
  @target = configuration[:target]
9
+ @truncate = configuration[:truncate] ||= false
10
+ @columns = configuration[:columns]
9
11
  connect
10
12
  end
11
13
  def process
14
+ # columns = control.destinations.first.order.join(',') # TODO: support multiple destinations?
12
15
  conn = ActiveRecord::Base.connection
13
16
  conn.transaction do
17
+ # TODO: Support all database types
14
18
  # Since LOCAL is used this must be allowed by both the client and server
15
- conn.execute("LOAD DATA LOCAL INFILE '#{file}' INTO TABLE #{target[:table]}")
19
+ conn.execute("TRUNCATE #{target[:table]}") if truncate
20
+ q = "LOAD DATA LOCAL INFILE '#{file}' INTO TABLE #{target[:table]}"
21
+ q << " (#{columns.join(',')})" if columns
22
+ conn.execute(q)
16
23
  end
17
24
  end
18
25
  private
@@ -1,7 +1,7 @@
1
1
  module ETL
2
2
  module VERSION #:nodoc:
3
3
  MAJOR = 0
4
- MINOR = 1
4
+ MINOR = 2
5
5
  TINY = 0
6
6
 
7
7
  STRING = [MAJOR, MINOR, TINY].join('.')
metadata CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.9.0
3
3
  specification_version: 1
4
4
  name: activewarehouse-etl
5
5
  version: !ruby/object:Gem::Version
6
- version: 0.1.0
7
- date: 2006-12-06 00:00:00 -05:00
6
+ version: 0.2.0
7
+ date: 2006-12-07 00:00:00 -05:00
8
8
  summary: Pure Ruby ETL package.
9
9
  require_paths:
10
10
  - lib
@@ -39,6 +39,8 @@ files:
39
39
  - lib/etl/control
40
40
  - lib/etl/control.rb
41
41
  - lib/etl/engine.rb
42
+ - lib/etl/generator
43
+ - lib/etl/generator.rb
42
44
  - lib/etl/parser
43
45
  - lib/etl/parser.rb
44
46
  - lib/etl/processor
@@ -56,9 +58,12 @@ files:
56
58
  - lib/etl/control/destination/file_destination.rb
57
59
  - lib/etl/control/source/database_source.rb
58
60
  - lib/etl/control/source/file_source.rb
61
+ - lib/etl/generator/generator.rb
62
+ - lib/etl/generator/surrogate_key_generator.rb
59
63
  - lib/etl/parser/delimited_parser.rb
60
64
  - lib/etl/parser/fixed_width_parser.rb
61
65
  - lib/etl/parser/parser.rb
66
+ - lib/etl/parser/xml_parser.rb
62
67
  - lib/etl/processor/bulk_import_processor.rb
63
68
  - lib/etl/processor/processor.rb
64
69
  - lib/etl/transform/decode_transform.rb