activewarehouse-etl 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Rakefile CHANGED
@@ -78,6 +78,8 @@ end
78
78
 
79
79
  Rake::GemPackageTask.new(spec) do |pkg|
80
80
  pkg.gem_spec = spec
81
+ pkg.need_tar = true
82
+ pkg.need_zip = true
81
83
  end
82
84
 
83
85
  desc "Generate code statistics"
data/lib/etl.rb CHANGED
@@ -47,6 +47,7 @@ require 'etl/control'
47
47
  require 'etl/parser'
48
48
  require 'etl/transform'
49
49
  require 'etl/processor'
50
+ require 'etl/generator'
50
51
 
51
52
  module ETL #:nodoc:
52
53
  class ETLError < StandardError #:nodoc:
@@ -42,6 +42,10 @@ module ETL #:nodoc:
42
42
  @buffer ||= []
43
43
  end
44
44
 
45
+ def generators
46
+ @generators ||= {}
47
+ end
48
+
45
49
  # Get the order of elements from the source order
46
50
  def order_from_source
47
51
  order = []
@@ -10,6 +10,7 @@ module ETL
10
10
  end
11
11
 
12
12
  def flush
13
+ # TODO: add virtual fields and compound key constraint support like in the FileDestination
13
14
  conn = ActiveRecord::Base.connection
14
15
  conn.transaction do
15
16
  buffer.each do |row|
@@ -17,10 +18,10 @@ module ETL
17
18
  values = []
18
19
  order.each do |name|
19
20
  names << name
20
- values << "'#{row[name]}'"
21
+ values << "'#{row[name]}'" # TODO: this is probably not database agnostic
21
22
  end
22
23
  q = "INSERT INTO #{configuration[:table]} (#{names.join(',')}) VALUES (#{values.join(',')})"
23
- ETL::Engine.logger.debug("Query: #{q}")
24
+ # ETL::Engine.logger.debug("Query: #{q}")
24
25
  conn.execute(q, "Insert row #{current_row}")
25
26
  @current_row += 1
26
27
  end
@@ -3,7 +3,7 @@ module ETL #:nodoc:
3
3
  # File as the final destination.
4
4
  class FileDestination < Destination
5
5
  attr_reader :file, :order
6
- attr_accessor :append, :separator, :eol, :enclose
6
+ attr_accessor :append, :separator, :eol, :enclose, :unique
7
7
 
8
8
  # Initialize the object.
9
9
  # * <tt>control</tt>: The Control object
@@ -15,7 +15,8 @@ module ETL #:nodoc:
15
15
  @append = configuration[:append] ||= false
16
16
  @separator = configuration[:separator] ||= ','
17
17
  @eol = configuration[:eol] ||= "\n"
18
- @enclose = configuration[:enclose] ||= nil
18
+ @enclose = configuration[:enclose]
19
+ @unique = configuration[:unique]
19
20
 
20
21
  @order = mapping[:order] || order_from_source
21
22
  raise ControlError, "Order required in mapping" unless @order
@@ -27,20 +28,49 @@ module ETL #:nodoc:
27
28
  f.close
28
29
  end
29
30
 
31
+ # Flush the destination buffer
30
32
  def flush
31
33
  buffer.each do |row|
32
- add_virtuals(row)
34
+ # check to see if this row's compound key constraint already exists
35
+ # note that the compound key constraint may not utilize virtual fields
36
+ next unless row_allowed?(row)
37
+
38
+ # add any virtual fields
39
+ add_virtuals!(row)
40
+
41
+ # collect all of the values using the order designated in the configuration
33
42
  values = order.collect { |name| row[name] }
43
+
44
+ # enclose the value if required
34
45
  if !enclose.nil?
35
46
  values.collect! { |v| enclose + v.to_s.gsub(/(#{enclose})/, '\\\\\1') + enclose }
36
47
  end
48
+
49
+ # write the values joined by the separator defined in the configuration
37
50
  f.write(values.join(separator))
51
+
52
+ # write the end-of-line
38
53
  f.write(eol)
39
54
  end
40
55
  buffer.clear
41
56
  end
42
57
 
43
58
  private
59
+ # Return true if the row is allowed. The row will not be allowed if the :unique option is specified
60
+ # in the configuration and the compound key already exists
61
+ def row_allowed?(row)
62
+ if unique
63
+ key = (unique.collect { |k| row[k] }).join('|')
64
+ return false if compound_key_constraints[key]
65
+ compound_key_constraints[key] = 1
66
+ end
67
+ return true
68
+ end
69
+
70
+ def compound_key_constraints
71
+ @compound_key_constraints ||= {}
72
+ end
73
+
44
74
  # Get the open file stream
45
75
  def f
46
76
  @f ||= open(file, mode)
@@ -51,10 +81,16 @@ module ETL #:nodoc:
51
81
  append ? 'a' : 'w'
52
82
  end
53
83
 
54
- def add_virtuals(row)
84
+ def add_virtuals!(row)
55
85
  if mapping[:virtual]
56
86
  mapping[:virtual].each do |key,value|
57
- row[key] = value
87
+ case value
88
+ when Symbol
89
+ generators[key] ||= ETL::Generator::Generator.class_for_name(value).new
90
+ row[key] = generators[key].next
91
+ else
92
+ row[key] = value
93
+ end
58
94
  end
59
95
  end
60
96
  end
@@ -0,0 +1,2 @@
1
+ require 'etl/generator/generator'
2
+ Dir[File.dirname(__FILE__) + "/generator/*.rb"].each { |file| require(file) }
@@ -0,0 +1,11 @@
1
+ module ETL
2
+ module Generator
3
+ class Generator
4
+ class << self
5
+ def class_for_name(name)
6
+ ETL::Generator.const_get("#{name.to_s.classify}Generator")
7
+ end
8
+ end
9
+ end
10
+ end
11
+ end
@@ -0,0 +1,10 @@
1
+ module ETL
2
+ module Generator
3
+ class SurrogateKeyGenerator < Generator
4
+ def next
5
+ @surrogate_key ||= 0
6
+ @surrogate_key += 1
7
+ end
8
+ end
9
+ end
10
+ end
@@ -0,0 +1,66 @@
1
+ require 'rexml/document'
2
+
3
+ module ETL
4
+ module Parser
5
+ class XmlParser < ETL::Parser::Parser
6
+ include Enumerable
7
+ # Initialize the parser
8
+ # * <tt>source</tt>: The Source object
9
+ def initialize(source)
10
+ super
11
+ configure
12
+ end
13
+
14
+ # Returns each row
15
+ def each
16
+ Dir.glob(file).each do |file|
17
+ doc = nil
18
+ t = Benchmark.realtime do
19
+ doc = REXML::Document.new(File.new(file))
20
+ end
21
+ Engine.logger.info "XML #{file} parsed in #{t}s"
22
+ doc.elements.each(@collection_xpath) do |element|
23
+ row = {}
24
+ fields.each do |f|
25
+ value = element.text(f.xpath)
26
+ row[f.name] = convert(f.name, value, f.type)
27
+ end
28
+ yield row
29
+ end
30
+ end
31
+ end
32
+
33
+ # Get an array of defined fields
34
+ def fields
35
+ @fields ||= []
36
+ end
37
+
38
+ private
39
+ def configure
40
+ @collection_xpath = source.definition[:collection]
41
+ raise "Collection XPath is required" if @collection_xpath.nil?
42
+
43
+ source.definition[:fields].each do |options|
44
+ case options
45
+ when Symbol
46
+ fields << Field.new(options, options.to_s)
47
+ when Hash
48
+ options[:xpath] ||= options[:name]
49
+ fields << Field.new(options[:name], options[:xpath].to_s, options[:type])
50
+ else
51
+ raise DefinitionError, "Each field definition must either be an symbol or a hash of options for the field"
52
+ end
53
+ end
54
+ end
55
+
56
+ class Field
57
+ attr_reader :name, :xpath, :type
58
+ def initialize(name, xpath, type=:string)
59
+ @name = name
60
+ @xpath = xpath
61
+ @type = type
62
+ end
63
+ end
64
+ end
65
+ end
66
+ end
@@ -1,18 +1,25 @@
1
1
  module ETL
2
2
  module Processor
3
3
  class BulkImportProcessor < ETL::Processor::Processor
4
- attr_reader :file, :target
4
+ attr_reader :file, :target, :truncate, :columns
5
5
  def initialize(control, configuration)
6
6
  super
7
7
  @file = File.join(File.dirname(control.file), configuration[:file])
8
8
  @target = configuration[:target]
9
+ @truncate = configuration[:truncate] ||= false
10
+ @columns = configuration[:columns]
9
11
  connect
10
12
  end
11
13
  def process
14
+ # columns = control.destinations.first.order.join(',') # TODO: support multiple destinations?
12
15
  conn = ActiveRecord::Base.connection
13
16
  conn.transaction do
17
+ # TODO: Support all database types
14
18
  # Since LOCAL is used this must be allowed by both the client and server
15
- conn.execute("LOAD DATA LOCAL INFILE '#{file}' INTO TABLE #{target[:table]}")
19
+ conn.execute("TRUNCATE #{target[:table]}") if truncate
20
+ q = "LOAD DATA LOCAL INFILE '#{file}' INTO TABLE #{target[:table]}"
21
+ q << " (#{columns.join(',')})" if columns
22
+ conn.execute(q)
16
23
  end
17
24
  end
18
25
  private
@@ -1,7 +1,7 @@
1
1
  module ETL
2
2
  module VERSION #:nodoc:
3
3
  MAJOR = 0
4
- MINOR = 1
4
+ MINOR = 2
5
5
  TINY = 0
6
6
 
7
7
  STRING = [MAJOR, MINOR, TINY].join('.')
metadata CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.9.0
3
3
  specification_version: 1
4
4
  name: activewarehouse-etl
5
5
  version: !ruby/object:Gem::Version
6
- version: 0.1.0
7
- date: 2006-12-06 00:00:00 -05:00
6
+ version: 0.2.0
7
+ date: 2006-12-07 00:00:00 -05:00
8
8
  summary: Pure Ruby ETL package.
9
9
  require_paths:
10
10
  - lib
@@ -39,6 +39,8 @@ files:
39
39
  - lib/etl/control
40
40
  - lib/etl/control.rb
41
41
  - lib/etl/engine.rb
42
+ - lib/etl/generator
43
+ - lib/etl/generator.rb
42
44
  - lib/etl/parser
43
45
  - lib/etl/parser.rb
44
46
  - lib/etl/processor
@@ -56,9 +58,12 @@ files:
56
58
  - lib/etl/control/destination/file_destination.rb
57
59
  - lib/etl/control/source/database_source.rb
58
60
  - lib/etl/control/source/file_source.rb
61
+ - lib/etl/generator/generator.rb
62
+ - lib/etl/generator/surrogate_key_generator.rb
59
63
  - lib/etl/parser/delimited_parser.rb
60
64
  - lib/etl/parser/fixed_width_parser.rb
61
65
  - lib/etl/parser/parser.rb
66
+ - lib/etl/parser/xml_parser.rb
62
67
  - lib/etl/processor/bulk_import_processor.rb
63
68
  - lib/etl/processor/processor.rb
64
69
  - lib/etl/transform/decode_transform.rb