activewarehouse-etl 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Rakefile +2 -0
- data/lib/etl.rb +1 -0
- data/lib/etl/control/destination.rb +4 -0
- data/lib/etl/control/destination/database_destination.rb +3 -2
- data/lib/etl/control/destination/file_destination.rb +41 -5
- data/lib/etl/generator.rb +2 -0
- data/lib/etl/generator/generator.rb +11 -0
- data/lib/etl/generator/surrogate_key_generator.rb +10 -0
- data/lib/etl/parser/xml_parser.rb +66 -0
- data/lib/etl/processor/bulk_import_processor.rb +9 -2
- data/lib/etl/version.rb +1 -1
- metadata +7 -2
data/Rakefile
CHANGED
data/lib/etl.rb
CHANGED
@@ -10,6 +10,7 @@ module ETL
|
|
10
10
|
end
|
11
11
|
|
12
12
|
def flush
|
13
|
+
# TODO: add virtual fields and compound key constraint support like in the FileDestination
|
13
14
|
conn = ActiveRecord::Base.connection
|
14
15
|
conn.transaction do
|
15
16
|
buffer.each do |row|
|
@@ -17,10 +18,10 @@ module ETL
|
|
17
18
|
values = []
|
18
19
|
order.each do |name|
|
19
20
|
names << name
|
20
|
-
values << "'#{row[name]}'"
|
21
|
+
values << "'#{row[name]}'" # TODO: this is probably not database agnostic
|
21
22
|
end
|
22
23
|
q = "INSERT INTO #{configuration[:table]} (#{names.join(',')}) VALUES (#{values.join(',')})"
|
23
|
-
ETL::Engine.logger.debug("Query: #{q}")
|
24
|
+
# ETL::Engine.logger.debug("Query: #{q}")
|
24
25
|
conn.execute(q, "Insert row #{current_row}")
|
25
26
|
@current_row += 1
|
26
27
|
end
|
@@ -3,7 +3,7 @@ module ETL #:nodoc:
|
|
3
3
|
# File as the final destination.
|
4
4
|
class FileDestination < Destination
|
5
5
|
attr_reader :file, :order
|
6
|
-
attr_accessor :append, :separator, :eol, :enclose
|
6
|
+
attr_accessor :append, :separator, :eol, :enclose, :unique
|
7
7
|
|
8
8
|
# Initialize the object.
|
9
9
|
# * <tt>control</tt>: The Control object
|
@@ -15,7 +15,8 @@ module ETL #:nodoc:
|
|
15
15
|
@append = configuration[:append] ||= false
|
16
16
|
@separator = configuration[:separator] ||= ','
|
17
17
|
@eol = configuration[:eol] ||= "\n"
|
18
|
-
@enclose = configuration[:enclose]
|
18
|
+
@enclose = configuration[:enclose]
|
19
|
+
@unique = configuration[:unique]
|
19
20
|
|
20
21
|
@order = mapping[:order] || order_from_source
|
21
22
|
raise ControlError, "Order required in mapping" unless @order
|
@@ -27,20 +28,49 @@ module ETL #:nodoc:
|
|
27
28
|
f.close
|
28
29
|
end
|
29
30
|
|
31
|
+
# Flush the destination buffer
|
30
32
|
# Write every buffered row to the output file, then empty the buffer.
#
# For each row in +buffer+:
# 1. the row is skipped when row_allowed? rejects it;
# 2. add_virtuals! is called so virtual fields are set on the row;
# 3. the values are collected in the configured +order+;
# 4. when +enclose+ is set, each value is wrapped in the enclosure string and
#    every occurrence of the enclosure inside the value is prefixed with a
#    backslash;
# 5. the values are written joined by +separator+, followed by +eol+.
def flush
  buffer.each do |row|
    # check to see if this row's compound key constraint already exists
    # note that the compound key constraint may not utilize virtual fields,
    # because this check runs before they are added
    next unless row_allowed?(row)

    # add any virtual fields
    add_virtuals!(row)

    # collect all of the values using the order designated in the configuration
    values = order.collect { |name| row[name] }

    # enclose the value if required
    unless enclose.nil?
      # A String pattern makes gsub match the enclosure literally, so
      # enclosures such as "|" or "." are not interpreted as regex syntax.
      values.collect! do |v|
        enclose + v.to_s.gsub(enclose) { |found| "\\#{found}" } + enclose
      end
    end

    # write the values joined by the separator defined in the configuration
    f.write(values.join(separator))

    # write the end-of-line
    f.write(eol)
  end
  buffer.clear
end
|
42
57
|
|
43
58
|
private
|
59
|
+
# Return true if the row is allowed. The row will not be allowed if the :unique option is specified
|
60
|
+
# in the configuration and the compound key already exists
|
61
|
+
# Return true if the row is allowed to be written.
#
# When the +unique+ option is set it is treated as a list of field names that
# together form a compound key. The first row seen with a given key is
# allowed and recorded; any later row with the same key is rejected.
# When +unique+ is not set every row is allowed.
#
# * <tt>row</tt>: the row, indexed by field name
def row_allowed?(row)
  if unique
    # Key on the array of per-field strings rather than a '|'-joined string:
    # joining made distinct keys such as ['x|y', 'z'] and ['x', 'y|z']
    # collapse into the same value and wrongly reject the second row.
    key = unique.collect { |k| row[k].to_s }
    return false if compound_key_constraints[key]
    compound_key_constraints[key] = 1
  end
  true
end

# Lazily created Hash of the compound keys that have already been seen.
def compound_key_constraints
  @compound_key_constraints ||= {}
end
|
73
|
+
|
44
74
|
# Get the open file stream
|
45
75
|
def f
|
46
76
|
@f ||= open(file, mode)
|
@@ -51,10 +81,16 @@ module ETL #:nodoc:
|
|
51
81
|
append ? 'a' : 'w'
|
52
82
|
end
|
53
83
|
|
54
|
-
def add_virtuals(row)
|
84
|
+
# Set the virtual fields declared under mapping[:virtual] on the given row,
# modifying the row in place. Does nothing when no virtual fields are mapped.
#
# A Symbol value names a generator: one generator instance is created per
# field on first use, kept in +generators+, and its +next+ value is stored in
# the row. Any other value is stored in the row as-is.
#
# * <tt>row</tt>: the row to modify
def add_virtuals!(row)
  return unless mapping[:virtual]

  mapping[:virtual].each do |field, spec|
    if Symbol === spec
      generators[field] ||= ETL::Generator::Generator.class_for_name(spec).new
      row[field] = generators[field].next
    else
      row[field] = spec
    end
  end
end
|
@@ -0,0 +1,66 @@
|
|
1
|
+
require 'rexml/document'
|
2
|
+
|
3
|
+
module ETL
  module Parser
    # Parser that reads rows out of XML files with REXML.
    #
    # The source definition supplies:
    # * <tt>:collection</tt>: XPath selecting the elements that each become a row (required)
    # * <tt>:fields</tt>: list of field definitions, each either a Symbol
    #   (used as both the field name and its XPath) or a Hash with
    #   <tt>:name</tt>, optional <tt>:xpath</tt> (defaults to the name) and
    #   optional <tt>:type</tt> (defaults to <tt>:string</tt>)
    class XmlParser < ETL::Parser::Parser
      include Enumerable
      # Initialize the parser
      # * <tt>source</tt>: The Source object
      def initialize(source)
        super
        configure
      end

      # Yields each row as a Hash of field name => converted value.
      #
      # Every file matched by the +file+ glob is parsed into a REXML document,
      # the parse time is logged, and one row is yielded per element matched
      # by the collection XPath.
      # NOTE(review): +file+ and +convert+ are not defined in this class;
      # presumably they come from the parent parser - confirm.
      def each
        # the block parameter is named +path+ so it does not shadow the +file+ method
        Dir.glob(file).each do |path|
          doc = nil
          elapsed = Benchmark.realtime do
            # The block form closes the file handle as soon as REXML has
            # finished building the document.
            doc = File.open(path) { |io| REXML::Document.new(io) }
          end
          Engine.logger.info "XML #{path} parsed in #{elapsed}s"
          doc.elements.each(@collection_xpath) do |element|
            row = {}
            fields.each do |f|
              # text of the first node matching the field's XPath, relative to the element
              value = element.text(f.xpath)
              row[f.name] = convert(f.name, value, f.type)
            end
            yield row
          end
        end
      end

      # Get an array of defined fields
      def fields
        @fields ||= []
      end

      private

      # Read the collection XPath and the field definitions from the source
      # definition and populate +fields+.
      #
      # Raises a RuntimeError when no collection XPath is given and a
      # DefinitionError when a field definition is neither a Symbol nor a Hash.
      def configure
        @collection_xpath = source.definition[:collection]
        raise "Collection XPath is required" if @collection_xpath.nil?

        source.definition[:fields].each do |options|
          case options
          when Symbol
            fields << Field.new(options, options.to_s)
          when Hash
            # the XPath defaults to the field name; note this writes the
            # default back into the definition hash
            options[:xpath] ||= options[:name]
            fields << Field.new(options[:name], options[:xpath].to_s, options[:type])
          else
            raise DefinitionError, "Each field definition must either be an symbol or a hash of options for the field"
          end
        end
      end

      # A single field to extract from each collection element.
      class Field
        attr_reader :name, :xpath, :type

        # * <tt>name</tt>: the key used for the field in each row
        # * <tt>xpath</tt>: XPath, relative to the collection element, of the field's text
        # * <tt>type</tt>: the type passed to convert; defaults to <tt>:string</tt>
        def initialize(name, xpath, type=:string)
          @name = name
          @xpath = xpath
          # Hash definitions without :type pass an explicit nil, which would
          # bypass the parameter default; fall back to :string in that case.
          @type = type || :string
        end
      end
    end
  end
end
|
@@ -1,18 +1,25 @@
|
|
1
1
|
module ETL
|
2
2
|
module Processor
|
3
3
|
class BulkImportProcessor < ETL::Processor::Processor
|
4
|
-
attr_reader :file, :target
|
4
|
+
attr_reader :file, :target, :truncate, :columns
|
5
5
|
# Set up the bulk import processor.
#
# * <tt>control</tt>: the control object; the data file is resolved relative
#   to the directory containing <tt>control.file</tt>
# * <tt>configuration</tt>: Hash of options:
#   * <tt>:file</tt>: path of the file to import, relative to the control file
#   * <tt>:target</tt>: the import target
#   * <tt>:truncate</tt>: stored as +truncate+; defaults to false
#   * <tt>:columns</tt>: stored as +columns+; may be nil
def initialize(control, configuration)
  super
  control_dir = File.dirname(control.file)
  @file = File.join(control_dir, configuration[:file])
  @target = configuration[:target]
  # the default is also written back into the configuration hash
  configuration[:truncate] ||= false
  @truncate = configuration[:truncate]
  @columns = configuration[:columns]
  connect
end
|
11
13
|
# Bulk load +file+ into the target table inside a transaction.
#
# When +truncate+ is set, a TRUNCATE statement for the target table is
# executed first. When +columns+ is set, the column list is appended to the
# LOAD DATA statement.
def process
  # columns = control.destinations.first.order.join(',') # TODO: support multiple destinations?
  conn = ActiveRecord::Base.connection
  conn.transaction do
    # TODO: Support all database types
    # Since LOCAL is used this must be allowed by both the client and server
    conn.execute("TRUNCATE #{target[:table]}") if truncate
    # conn.quote renders the path as an escaped SQL string literal, so a
    # quote or backslash in the file path cannot break the statement.
    q = "LOAD DATA LOCAL INFILE #{conn.quote(file)} INTO TABLE #{target[:table]}"
    q << " (#{columns.join(',')})" if columns
    conn.execute(q)
  end
end
|
18
25
|
private
|
data/lib/etl/version.rb
CHANGED
metadata
CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.9.0
|
|
3
3
|
specification_version: 1
|
4
4
|
name: activewarehouse-etl
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version: 0.1.0
|
7
|
-
date: 2006-12-
|
6
|
+
version: 0.2.0
|
7
|
+
date: 2006-12-07 00:00:00 -05:00
|
8
8
|
summary: Pure Ruby ETL package.
|
9
9
|
require_paths:
|
10
10
|
- lib
|
@@ -39,6 +39,8 @@ files:
|
|
39
39
|
- lib/etl/control
|
40
40
|
- lib/etl/control.rb
|
41
41
|
- lib/etl/engine.rb
|
42
|
+
- lib/etl/generator
|
43
|
+
- lib/etl/generator.rb
|
42
44
|
- lib/etl/parser
|
43
45
|
- lib/etl/parser.rb
|
44
46
|
- lib/etl/processor
|
@@ -56,9 +58,12 @@ files:
|
|
56
58
|
- lib/etl/control/destination/file_destination.rb
|
57
59
|
- lib/etl/control/source/database_source.rb
|
58
60
|
- lib/etl/control/source/file_source.rb
|
61
|
+
- lib/etl/generator/generator.rb
|
62
|
+
- lib/etl/generator/surrogate_key_generator.rb
|
59
63
|
- lib/etl/parser/delimited_parser.rb
|
60
64
|
- lib/etl/parser/fixed_width_parser.rb
|
61
65
|
- lib/etl/parser/parser.rb
|
66
|
+
- lib/etl/parser/xml_parser.rb
|
62
67
|
- lib/etl/processor/bulk_import_processor.rb
|
63
68
|
- lib/etl/processor/processor.rb
|
64
69
|
- lib/etl/transform/decode_transform.rb
|