activewarehouse-etl 0.1.0 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/Rakefile +2 -0
- data/lib/etl.rb +1 -0
- data/lib/etl/control/destination.rb +4 -0
- data/lib/etl/control/destination/database_destination.rb +3 -2
- data/lib/etl/control/destination/file_destination.rb +41 -5
- data/lib/etl/generator.rb +2 -0
- data/lib/etl/generator/generator.rb +11 -0
- data/lib/etl/generator/surrogate_key_generator.rb +10 -0
- data/lib/etl/parser/xml_parser.rb +66 -0
- data/lib/etl/processor/bulk_import_processor.rb +9 -2
- data/lib/etl/version.rb +1 -1
- metadata +7 -2
data/Rakefile
CHANGED
data/lib/etl.rb
CHANGED
@@ -10,6 +10,7 @@ module ETL
|
|
10
10
|
end
|
11
11
|
|
12
12
|
def flush
|
13
|
+
# TODO: add virtual fields and compound key constraint support like in the FileDestination
|
13
14
|
conn = ActiveRecord::Base.connection
|
14
15
|
conn.transaction do
|
15
16
|
buffer.each do |row|
|
@@ -17,10 +18,10 @@ module ETL
|
|
17
18
|
values = []
|
18
19
|
order.each do |name|
|
19
20
|
names << name
|
20
|
-
values << "'#{row[name]}'"
|
21
|
+
values << "'#{row[name]}'" # TODO: this is probably not database agnostic
|
21
22
|
end
|
22
23
|
q = "INSERT INTO #{configuration[:table]} (#{names.join(',')}) VALUES (#{values.join(',')})"
|
23
|
-
ETL::Engine.logger.debug("Query: #{q}")
|
24
|
+
# ETL::Engine.logger.debug("Query: #{q}")
|
24
25
|
conn.execute(q, "Insert row #{current_row}")
|
25
26
|
@current_row += 1
|
26
27
|
end
|
@@ -3,7 +3,7 @@ module ETL #:nodoc:
|
|
3
3
|
# File as the final destination.
|
4
4
|
class FileDestination < Destination
|
5
5
|
attr_reader :file, :order
|
6
|
-
attr_accessor :append, :separator, :eol, :enclose
|
6
|
+
attr_accessor :append, :separator, :eol, :enclose, :unique
|
7
7
|
|
8
8
|
# Initialize the object.
|
9
9
|
# * <tt>control</tt>: The Control object
|
@@ -15,7 +15,8 @@ module ETL #:nodoc:
|
|
15
15
|
@append = configuration[:append] ||= false
|
16
16
|
@separator = configuration[:separator] ||= ','
|
17
17
|
@eol = configuration[:eol] ||= "\n"
|
18
|
-
@enclose = configuration[:enclose]
|
18
|
+
@enclose = configuration[:enclose]
|
19
|
+
@unique = configuration[:unique]
|
19
20
|
|
20
21
|
@order = mapping[:order] || order_from_source
|
21
22
|
raise ControlError, "Order required in mapping" unless @order
|
@@ -27,20 +28,49 @@ module ETL #:nodoc:
|
|
27
28
|
f.close
|
28
29
|
end
|
29
30
|
|
31
|
+
# Flush the destination buffer
|
30
32
|
def flush
|
31
33
|
buffer.each do |row|
|
32
|
-
|
34
|
+
# check to see if this row's compound key constraint already exists
|
35
|
+
# note that the compound key constraint may not utilize virtual fields
|
36
|
+
next unless row_allowed?(row)
|
37
|
+
|
38
|
+
# add any virtual fields
|
39
|
+
add_virtuals!(row)
|
40
|
+
|
41
|
+
# collect all of the values using the order designated in the configuration
|
33
42
|
values = order.collect { |name| row[name] }
|
43
|
+
|
44
|
+
# enclose the value if required
|
34
45
|
if !enclose.nil?
|
35
46
|
values.collect! { |v| enclose + v.to_s.gsub(/(#{enclose})/, '\\\\\1') + enclose }
|
36
47
|
end
|
48
|
+
|
49
|
+
# write the values joined by the separator defined in the configuration
|
37
50
|
f.write(values.join(separator))
|
51
|
+
|
52
|
+
# write the end-of-line
|
38
53
|
f.write(eol)
|
39
54
|
end
|
40
55
|
buffer.clear
|
41
56
|
end
|
42
57
|
|
43
58
|
private
|
59
|
+
# Return true if the row is allowed. The row will not be allowed if the :unique option is specified
|
60
|
+
# in the configuration and the compound key already exists
|
61
|
+
def row_allowed?(row)
|
62
|
+
if unique
|
63
|
+
key = (unique.collect { |k| row[k] }).join('|')
|
64
|
+
return false if compound_key_constraints[key]
|
65
|
+
compound_key_constraints[key] = 1
|
66
|
+
end
|
67
|
+
return true
|
68
|
+
end
|
69
|
+
|
70
|
+
def compound_key_constraints
|
71
|
+
@compound_key_constraints ||= {}
|
72
|
+
end
|
73
|
+
|
44
74
|
# Get the open file stream
|
45
75
|
def f
|
46
76
|
@f ||= open(file, mode)
|
@@ -51,10 +81,16 @@ module ETL #:nodoc:
|
|
51
81
|
append ? 'a' : 'w'
|
52
82
|
end
|
53
83
|
|
54
|
-
def add_virtuals(row)
|
84
|
+
def add_virtuals!(row)
|
55
85
|
if mapping[:virtual]
|
56
86
|
mapping[:virtual].each do |key,value|
|
57
|
-
|
87
|
+
case value
|
88
|
+
when Symbol
|
89
|
+
generators[key] ||= ETL::Generator::Generator.class_for_name(value).new
|
90
|
+
row[key] = generators[key].next
|
91
|
+
else
|
92
|
+
row[key] = value
|
93
|
+
end
|
58
94
|
end
|
59
95
|
end
|
60
96
|
end
|
@@ -0,0 +1,66 @@
|
|
1
|
+
require 'rexml/document'
|
2
|
+
|
3
|
+
module ETL
|
4
|
+
module Parser
|
5
|
+
class XmlParser < ETL::Parser::Parser
|
6
|
+
include Enumerable
|
7
|
+
# Initialize the parser
|
8
|
+
# * <tt>source</tt>: The Source object
|
9
|
+
def initialize(source)
|
10
|
+
super
|
11
|
+
configure
|
12
|
+
end
|
13
|
+
|
14
|
+
# Returns each row
|
15
|
+
def each
|
16
|
+
Dir.glob(file).each do |file|
|
17
|
+
doc = nil
|
18
|
+
t = Benchmark.realtime do
|
19
|
+
doc = REXML::Document.new(File.new(file))
|
20
|
+
end
|
21
|
+
Engine.logger.info "XML #{file} parsed in #{t}s"
|
22
|
+
doc.elements.each(@collection_xpath) do |element|
|
23
|
+
row = {}
|
24
|
+
fields.each do |f|
|
25
|
+
value = element.text(f.xpath)
|
26
|
+
row[f.name] = convert(f.name, value, f.type)
|
27
|
+
end
|
28
|
+
yield row
|
29
|
+
end
|
30
|
+
end
|
31
|
+
end
|
32
|
+
|
33
|
+
# Get an array of defined fields
|
34
|
+
def fields
|
35
|
+
@fields ||= []
|
36
|
+
end
|
37
|
+
|
38
|
+
private
|
39
|
+
def configure
|
40
|
+
@collection_xpath = source.definition[:collection]
|
41
|
+
raise "Collection XPath is required" if @collection_xpath.nil?
|
42
|
+
|
43
|
+
source.definition[:fields].each do |options|
|
44
|
+
case options
|
45
|
+
when Symbol
|
46
|
+
fields << Field.new(options, options.to_s)
|
47
|
+
when Hash
|
48
|
+
options[:xpath] ||= options[:name]
|
49
|
+
fields << Field.new(options[:name], options[:xpath].to_s, options[:type])
|
50
|
+
else
|
51
|
+
raise DefinitionError, "Each field definition must either be an symbol or a hash of options for the field"
|
52
|
+
end
|
53
|
+
end
|
54
|
+
end
|
55
|
+
|
56
|
+
class Field
|
57
|
+
attr_reader :name, :xpath, :type
|
58
|
+
def initialize(name, xpath, type=:string)
|
59
|
+
@name = name
|
60
|
+
@xpath = xpath
|
61
|
+
@type = type
|
62
|
+
end
|
63
|
+
end
|
64
|
+
end
|
65
|
+
end
|
66
|
+
end
|
@@ -1,18 +1,25 @@
|
|
1
1
|
module ETL
|
2
2
|
module Processor
|
3
3
|
class BulkImportProcessor < ETL::Processor::Processor
|
4
|
-
attr_reader :file, :target
|
4
|
+
attr_reader :file, :target, :truncate, :columns
|
5
5
|
def initialize(control, configuration)
|
6
6
|
super
|
7
7
|
@file = File.join(File.dirname(control.file), configuration[:file])
|
8
8
|
@target = configuration[:target]
|
9
|
+
@truncate = configuration[:truncate] ||= false
|
10
|
+
@columns = configuration[:columns]
|
9
11
|
connect
|
10
12
|
end
|
11
13
|
def process
|
14
|
+
# columns = control.destinations.first.order.join(',') # TODO: support multiple destinations?
|
12
15
|
conn = ActiveRecord::Base.connection
|
13
16
|
conn.transaction do
|
17
|
+
# TODO: Support all database types
|
14
18
|
# Since LOCAL is used this must be allowed by both the client and server
|
15
|
-
conn.execute("
|
19
|
+
conn.execute("TRUNCATE #{target[:table]}") if truncate
|
20
|
+
q = "LOAD DATA LOCAL INFILE '#{file}' INTO TABLE #{target[:table]}"
|
21
|
+
q << " (#{columns.join(',')})" if columns
|
22
|
+
conn.execute(q)
|
16
23
|
end
|
17
24
|
end
|
18
25
|
private
|
data/lib/etl/version.rb
CHANGED
metadata
CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.9.0
|
|
3
3
|
specification_version: 1
|
4
4
|
name: activewarehouse-etl
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version: 0.1.0
|
7
|
-
date: 2006-12-
|
6
|
+
version: 0.2.0
|
7
|
+
date: 2006-12-07 00:00:00 -05:00
|
8
8
|
summary: Pure Ruby ETL package.
|
9
9
|
require_paths:
|
10
10
|
- lib
|
@@ -39,6 +39,8 @@ files:
|
|
39
39
|
- lib/etl/control
|
40
40
|
- lib/etl/control.rb
|
41
41
|
- lib/etl/engine.rb
|
42
|
+
- lib/etl/generator
|
43
|
+
- lib/etl/generator.rb
|
42
44
|
- lib/etl/parser
|
43
45
|
- lib/etl/parser.rb
|
44
46
|
- lib/etl/processor
|
@@ -56,9 +58,12 @@ files:
|
|
56
58
|
- lib/etl/control/destination/file_destination.rb
|
57
59
|
- lib/etl/control/source/database_source.rb
|
58
60
|
- lib/etl/control/source/file_source.rb
|
61
|
+
- lib/etl/generator/generator.rb
|
62
|
+
- lib/etl/generator/surrogate_key_generator.rb
|
59
63
|
- lib/etl/parser/delimited_parser.rb
|
60
64
|
- lib/etl/parser/fixed_width_parser.rb
|
61
65
|
- lib/etl/parser/parser.rb
|
66
|
+
- lib/etl/parser/xml_parser.rb
|
62
67
|
- lib/etl/processor/bulk_import_processor.rb
|
63
68
|
- lib/etl/processor/processor.rb
|
64
69
|
- lib/etl/transform/decode_transform.rb
|