activewarehouse-etl 0.4.0 → 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +26 -14
- data/TODO +4 -2
- data/lib/etl.rb +10 -3
- data/lib/etl/active_record_ext/connection_adapters/mysql_adapter.rb +2 -2
- data/lib/etl/commands/etl.rb +6 -7
- data/lib/etl/control/control.rb +120 -52
- data/lib/etl/control/destination.rb +46 -5
- data/lib/etl/control/destination/database_destination.rb +45 -7
- data/lib/etl/control/destination/file_destination.rb +28 -4
- data/lib/etl/control/source.rb +16 -1
- data/lib/etl/control/source/database_source.rb +35 -5
- data/lib/etl/control/source/file_source.rb +33 -3
- data/lib/etl/engine.rb +129 -9
- data/lib/etl/generator/generator.rb +11 -2
- data/lib/etl/generator/surrogate_key_generator.rb +3 -2
- data/lib/etl/parser/delimited_parser.rb +3 -4
- data/lib/etl/parser/fixed_width_parser.rb +3 -4
- data/lib/etl/parser/parser.rb +7 -1
- data/lib/etl/parser/sax_parser.rb +190 -0
- data/lib/etl/parser/xml_parser.rb +2 -2
- data/lib/etl/processor/bulk_import_processor.rb +4 -4
- data/lib/etl/processor/processor.rb +1 -1
- data/lib/etl/processor/truncate_processor.rb +4 -4
- data/lib/etl/transform/date_to_string_transform.rb +19 -0
- data/lib/etl/transform/decode_transform.rb +15 -1
- data/lib/etl/transform/foreign_key_lookup_transform.rb +53 -0
- data/lib/etl/transform/string_to_date_transform.rb +14 -0
- data/lib/etl/transform/transform.rb +28 -9
- data/lib/etl/transform/type_transform.rb +22 -0
- data/lib/etl/version.rb +2 -2
- metadata +8 -3
@@ -1,11 +1,20 @@
|
|
1
|
-
module ETL
|
2
|
-
module Generator
|
1
|
+
module ETL #:nodoc:
|
2
|
+
module Generator #:nodoc:
|
3
|
+
# Base class for generators.
|
3
4
|
class Generator
  # Look up the generator Class that corresponds to the given name.
  #
  # The name (String or Symbol) is converted with +classify+, suffixed with
  # "Generator" and resolved as a constant of ETL::Generator. For example,
  # :surrogate_key resolves to the SurrogateKeyGenerator class.
  def self.class_for_name(name)
    class_name = "#{name.to_s.classify}Generator"
    ETL::Generator.const_get(class_name)
  end

  # Produce the next value. The base implementation always raises, so every
  # subclass has to provide its own version of this method.
  def next
    raise RuntimeError, "Must be implemented by a subclass"
  end
end
|
10
19
|
end
|
11
20
|
end
|
@@ -2,17 +2,16 @@ module ETL #:nodoc:
|
|
2
2
|
module Parser #:nodoc:
|
3
3
|
# Parses delimited files
|
4
4
|
class DelimitedParser < ETL::Parser::Parser
|
5
|
-
include Enumerable
|
6
5
|
# Initialize the parser
|
7
6
|
# * <tt>source</tt>: The Source object
|
8
|
-
|
7
|
+
# * <tt>options</tt>: Hash of options for the parser, defaults to an empty hash
|
8
|
+
def initialize(source, options={})
|
9
9
|
super
|
10
10
|
configure
|
11
11
|
end
|
12
12
|
|
13
13
|
# Returns each row.
|
14
14
|
def each
|
15
|
-
options = {}
|
16
15
|
Dir.glob(file).each do |file|
|
17
16
|
ETL::Engine.logger.debug "parsing #{file}"
|
18
17
|
line = 0
|
@@ -64,7 +63,7 @@ module ETL #:nodoc:
|
|
64
63
|
end
|
65
64
|
end
|
66
65
|
|
67
|
-
class Field
|
66
|
+
class Field #:nodoc:
|
68
67
|
attr_reader :name, :type
|
69
68
|
def initialize(name, type=:string)
|
70
69
|
@name = name
|
@@ -2,11 +2,10 @@ module ETL #:nodoc:
|
|
2
2
|
module Parser #:nodoc:
|
3
3
|
# Parser for fixed width files
|
4
4
|
class FixedWidthParser < ETL::Parser::Parser
|
5
|
-
include Enumerable
|
6
|
-
|
7
5
|
# Initialize the parser
|
8
6
|
# * <tt>source</tt>: The source object
|
9
|
-
|
7
|
+
# * <tt>options</tt>: Parser options Hash
|
8
|
+
def initialize(source, options={})
|
10
9
|
super
|
11
10
|
configure
|
12
11
|
end
|
@@ -43,7 +42,7 @@ module ETL #:nodoc:
|
|
43
42
|
end
|
44
43
|
end
|
45
44
|
|
46
|
-
class FixedWidthField
|
45
|
+
class FixedWidthField #:nodoc:
|
47
46
|
attr_reader :name, :field_start, :field_end, :field_length, :type
|
48
47
|
def initialize(name, field_start, field_end=nil, field_length=nil, type=nil)
|
49
48
|
@name = name
|
data/lib/etl/parser/parser.rb
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
module ETL
|
2
2
|
module Parser
|
3
3
|
class Parser
|
4
|
+
include Enumerable
|
4
5
|
class << self
|
5
6
|
# Convert the name (string or symbol) to a parser class.
|
6
7
|
#
|
@@ -11,10 +12,15 @@ module ETL
|
|
11
12
|
end
|
12
13
|
end
|
13
14
|
|
15
|
+
# The Source object for the data
|
14
16
|
attr_reader :source
|
15
17
|
|
16
|
-
|
18
|
+
# Options Hash for the parser
|
19
|
+
attr_reader :options
|
20
|
+
|
21
|
+
def initialize(source, options={})
|
17
22
|
@source = source
|
23
|
+
@options = options || {}
|
18
24
|
end
|
19
25
|
|
20
26
|
# Convert the value to the specified type.
|
@@ -0,0 +1,190 @@
|
|
1
|
+
require 'rexml/parsers/sax2parser'
|
2
|
+
require 'rexml/sax2listener'
|
3
|
+
|
4
|
+
module ETL
|
5
|
+
module Parser
|
6
|
+
# Parser which runs every matching file through REXML's SAX2 parser and
# hands the rows assembled by a Listener to the caller's block.
class SaxParser < ETL::Parser::Parser

  # The write trigger causes whatever values are currently specified for the row to be returned.
  # After returning the values will not be cleared, thus allowing for values which are assigned
  # higher in the XML tree to remain in memory.
  attr_accessor :write_trigger

  # Initialize the parser
  # * <tt>source</tt>: The Source object
  # * <tt>options</tt>: Parser options Hash
  def initialize(source, options={})
    super
    configure
  end

  # Yields each row to the given block.
  #
  # The files to read are found by globbing the value returned by +file+
  # (not defined in this class -- presumably supplied elsewhere; confirm).
  # Each file is opened with the block form of File.open so its handle is
  # closed as soon as parsing of that file finishes, even if the parser raises.
  def each(&block)
    Dir.glob(file).each do |path|
      File.open(path) do |io|
        parser = REXML::Parsers::SAX2Parser.new(io)
        parser.listen(Listener.new(self, &block))
        parser.parse
      end
    end
  end

  # The Field objects configured for this parser (empty until configured).
  def fields
    @fields ||= []
  end

  private
  # Read the write trigger and the field definitions from the source
  # definition. Each entry of <tt>:fields</tt> pairs a field name with a path,
  # which is parsed into an XPath::Path.
  def configure
    self.write_trigger = source.definition[:write_trigger]
    # map paths to field names
    source.definition[:fields].each do |name, path|
      fields << Field.new(name, XPath::Path.parse(path))
    end
  end

  # Pairs a field name with the path that supplies its value.
  class Field
    attr_reader :name, :path
    def initialize(name, path)
      @name = name
      @path = path
    end
  end
end
|
54
|
+
|
55
|
+
# SAX2 listener which tracks the current element path, collects text into
# field values and calls the supplied block with a copy of the row whenever
# the parser's write trigger path is closed.
class Listener
  include REXML::SAX2Listener

  # * <tt>parser</tt>: object providing +fields+ and +write_trigger+
  # * <tt>block</tt>: called with a copy of the current row on each write trigger
  #
  # Raises ArgumentError when no block is given.
  def initialize(parser, &block)
    raise ArgumentError, "a block is required to receive rows" unless block
    @parser = parser
    @row = {}
    @value = nil
    @proc = block
  end

  # Append CDATA text to the value being collected. The buffer is created on
  # demand because it is nil before the first text and after every end_element.
  def cdata(text)
    @value ||= ''
    @value << text
  end

  # Append character data to the value being collected. The text is stripped
  # first and ignored when nothing is left.
  def characters(text)
    text = text.strip
    return if text.empty?
    @value ||= ''
    @value << text
  end

  # Start with an empty path for every document.
  def start_document
    @path = XPath::Path.new
  end

  def end_document
  end

  # Push the opened element (with its attributes) onto the current path.
  def start_element(uri, localname, qname, attributes)
    @path.elements << XPath::Element.new(localname, attributes)
  end

  # Store values for every field whose path matches the current path, emit
  # the row if the write trigger matches, then reset the text buffer and pop
  # the closed element off the path.
  def end_element(uri, localname, qname)
    element = @path.elements.last

    @parser.fields.each do |field|
      next unless @path.match?(field.path)
      if field.path.is_attribute?
        @row[field.name] = element.attributes[field.path.attribute]
      else
        @row[field.name] = @value
      end
    end

    # A copy is handed out because @row is kept and reused for later rows.
    @proc.call(@row.clone) if @path.match?(@parser.write_trigger)

    @value = nil
    @path.elements.pop
  end

  # Remember the position reported by the parser.
  def progress(position)
    @position = position
  end
end
|
110
|
+
|
111
|
+
# Minimal path model used to compare the parser's current position in the
# document with configured paths such as "orders/order[id=42]".
module XPath
  # An ordered list of Element objects.
  class Path
    # Splits a path part into its name and the text between the brackets.
    PREDICATE = /(.*)\[(.*)\]/

    attr_accessor :elements

    def initialize
      @elements = []
    end

    def to_s
      @elements.map { |e| e.to_s }.join("/")
    end

    # Returns true if the last part of the path refers to an attribute.
    # An empty path refers to nothing, so it returns false.
    def is_attribute?
      last = elements.last
      !last.nil? && last.attributes.length > 0
    end

    # Return the name of the attribute referenced by the last element in this path. Returns nil if the last element
    # does not reference an attribute.
    #
    # Warning: the path should only reference a single attribute; only the first key of the attributes Hash is
    # returned.
    def attribute
      return nil unless is_attribute?
      elements.last.attributes.keys.first
    end

    # Return true if this XPath::Path matches the given path (a String or a Path). This is a fail-fast match, so
    # the first mismatch will cause the method to return false. Both paths must have the same number of elements,
    # the names must be equal and every attribute pattern of the given path must match the corresponding attribute
    # value of this path.
    def match?(s)
      path = Path.parse(s)
      return false unless path.elements.length == elements.length
      elements.each_with_index do |element, index|
        path_element = path.elements[index]
        return false if element.name != path_element.name
        path_element.attributes.each do |key, value|
          return false unless element.attributes[key] =~ value
        end
      end
      true
    end

    # Parse the string into an XPath::Path object. A Path argument is returned unchanged.
    #
    # Parts are separated by "/". A part may end in a bracketed, comma separated list of <tt>key=value</tt> pairs;
    # each value becomes a Regexp and a pair without a value matches anything (".*").
    def self.parse(s)
      return s if s.is_a?(Path)
      path = Path.new
      s.split('/').each do |part|
        attributes = {}
        predicate = part[PREDICATE, 2]
        name = part.sub(PREDICATE, '\1')
        unless predicate.nil?
          predicate.split(",").each do |pair|
            # Limit the split so that values containing "=" are kept whole.
            key, value = pair.split("=", 2)
            value = ".*" if value.nil? || value.empty?
            attributes[key] = Regexp.new(value)
          end
        end
        path.elements << Element.new(name, attributes)
      end
      path
    end
  end

  # A single step of a Path: an element name plus a Hash of attributes.
  class Element
    attr_reader :name
    attr_reader :attributes

    def initialize(name, attributes={})
      @name = name
      @attributes = attributes
    end

    # Render as "name" or "name[key=value,...]"; Regexp values are shown by their source.
    def to_s
      return "#{name}" if @attributes.empty?
      pairs = @attributes.map do |key, value|
        value = value.source if value.is_a?(Regexp)
        "#{key}=#{value}"
      end
      "#{name}[#{pairs.join(",")}]"
    end
  end
end
|
189
|
+
end
|
190
|
+
end
|
@@ -3,10 +3,10 @@ require 'rexml/document'
|
|
3
3
|
module ETL
|
4
4
|
module Parser
|
5
5
|
class XmlParser < ETL::Parser::Parser
|
6
|
-
include Enumerable
|
7
6
|
# Initialize the parser
|
8
7
|
# * <tt>source</tt>: The Source object
|
9
|
-
|
8
|
+
# * <tt>options</tt>: Parser options Hash
|
9
|
+
def initialize(source, options={})
|
10
10
|
super
|
11
11
|
configure
|
12
12
|
end
|
@@ -1,5 +1,5 @@
|
|
1
|
-
module ETL
|
2
|
-
module Processor
|
1
|
+
module ETL #:nodoc:
|
2
|
+
module Processor #:nodoc:
|
3
3
|
# Processor which is used to bulk import data into a target database
|
4
4
|
class BulkImportProcessor < ETL::Processor::Processor
|
5
5
|
attr_reader :file, :target, :truncate, :columns
|
@@ -13,7 +13,7 @@ module ETL
|
|
13
13
|
end
|
14
14
|
def process
|
15
15
|
# columns = control.destinations.first.order.join(',') # TODO: support multiple destinations?
|
16
|
-
conn = ActiveRecord::Base.connection
|
16
|
+
conn = ETL::ActiveRecord::Base.connection
|
17
17
|
conn.transaction do
|
18
18
|
# TODO: Support all database types
|
19
19
|
# Since LOCAL is used this must be allowed by both the client and server
|
@@ -27,7 +27,7 @@ module ETL
|
|
27
27
|
private
|
28
28
|
# Connect to the database
|
29
29
|
def connect
|
30
|
-
ActiveRecord::Base.establish_connection(
|
30
|
+
ETL::ActiveRecord::Base.establish_connection(
|
31
31
|
:adapter => (target[:adapter] || :mysql),
|
32
32
|
:username => (target[:username] || 'root'),
|
33
33
|
:host => (target[:host] || 'localhost'),
|
@@ -1,5 +1,5 @@
|
|
1
|
-
module ETL
|
2
|
-
module Processor
|
1
|
+
module ETL #:nodoc:
|
2
|
+
module Processor #:nodoc:
|
3
3
|
# A processor which will truncate a table. Use as a pre-processor for cleaning out a table
|
4
4
|
# prior to loading
|
5
5
|
class TruncateProcessor < ETL::Processor::Processor
|
@@ -11,13 +11,13 @@ module ETL
|
|
11
11
|
connect
|
12
12
|
end
|
13
13
|
def process
|
14
|
-
conn = ActiveRecord::Base.connection
|
14
|
+
conn = ETL::ActiveRecord::Base.connection
|
15
15
|
conn.truncate
|
16
16
|
end
|
17
17
|
|
18
18
|
# Connect to the database
|
19
19
|
def connect
|
20
|
-
ActiveRecord::Base.establish_connection(
|
20
|
+
ETL::ActiveRecord::Base.establish_connection(
|
21
21
|
:adapter => (target[:adapter] || :mysql),
|
22
22
|
:username => (target[:username] || 'root'),
|
23
23
|
:host => (target[:host] || 'localhost'),
|
@@ -0,0 +1,19 @@
|
|
1
|
+
module ETL #:nodoc:
|
2
|
+
module Transform #:nodoc:
|
3
|
+
# Transform a Date or Time to a formatted string instance
|
4
|
+
class DateToStringTransform < ETL::Transform::Transform
  # Set up the transform.
  #
  # Recognized configuration keys:
  # * <tt>:format</tt>: The pattern handed to strftime. "%Y-%m-%d" is used when none is given
  def initialize(control, configuration={})
    super
    requested = configuration[:format]
    @format = requested ? requested : "%Y-%m-%d"
  end

  # Return the value formatted by its own strftime method using the configured pattern
  def transform(value)
    pattern = @format
    value.strftime(pattern)
  end
end
|
18
|
+
end
|
19
|
+
end
|
@@ -2,7 +2,18 @@ module ETL #:nodoc:
|
|
2
2
|
module Transform #:nodoc:
|
3
3
|
# Transform which decodes coded values
|
4
4
|
class DecodeTransform < ETL::Transform::Transform
|
5
|
-
attr_accessor :decode_table_path
|
5
|
+
attr_accessor :decode_table_path
|
6
|
+
|
7
|
+
attr_accessor :decode_table_delimiter
|
8
|
+
|
9
|
+
attr_accessor :default_value
|
10
|
+
|
11
|
+
# Initialize the transformer
|
12
|
+
#
|
13
|
+
# Configuration options:
|
14
|
+
# * <tt>:decode_table_path</tt>: The path to the decode table (defaults to 'decode.txt')
|
15
|
+
# * <tt>:decode_table_delimiter</tt>: The decode table delimiter (defaults to ':')
|
16
|
+
# * <tt>:default_value</tt>: The default value to use (defaults to 'No Value')
|
6
17
|
def initialize(control, configuration={})
|
7
18
|
super
|
8
19
|
|
@@ -14,10 +25,13 @@ module ETL #:nodoc:
|
|
14
25
|
@decode_table_delimiter = (configuration[:decode_table_delimiter] || ':')
|
15
26
|
@default_value = (configuration[:default_value] || 'No Value')
|
16
27
|
end
|
28
|
+
|
29
|
+
# Transform the value
|
17
30
|
def transform(value)
|
18
31
|
decode_table[value] || default_value
|
19
32
|
end
|
20
33
|
|
34
|
+
# Get the decode table
|
21
35
|
def decode_table
|
22
36
|
unless @decode_table
|
23
37
|
@decode_table = {}
|
@@ -0,0 +1,53 @@
|
|
1
|
+
module ETL #:nodoc:
|
2
|
+
module Transform #:nodoc:
|
3
|
+
# Transform which looks up the value and replaces it with a foreign key reference
|
4
|
+
class ForeignKeyLookupTransform < ETL::Transform::Transform
  # Set up the foreign key lookup transform.
  #
  # Configuration options:
  # * <tt>:collection</tt>: A Hash of natural keys mapped to surrogate keys. An empty Hash is used when this is
  #   not specified. Keys resolved later are stored in this Hash so they are only resolved once.
  # * <tt>:resolver</tt>: Object or Class which implements the method resolve(value). A Class is instantiated
  #   with no arguments.
  def initialize(control, configuration={})
    super

    @collection = configuration[:collection] || {}
    resolver = configuration[:resolver]
    @resolver = resolver.is_a?(Class) ? resolver.new : resolver
  end

  # Transform the value by resolving it to a foreign key. Known values come straight from the collection;
  # anything else is passed to the resolver and the result is remembered. Raises ResolverError when there is
  # no usable resolver or the resolver returns nothing.
  def transform(value)
    known = @collection[value]
    return known if known

    raise ResolverError, "Foreign key for #{value} not found and no resolver specified" unless @resolver
    unless @resolver.respond_to?(:resolve)
      raise ResolverError, "Resolver does not appear to respond to resolve method"
    end

    resolved = @resolver.resolve(value)
    raise ResolverError, "Unable to resolve #{value} to foreign key" unless resolved
    @collection[value] = resolved
    resolved
  end
end
|
33
|
+
end
|
34
|
+
end
|
35
|
+
|
36
|
+
# Resolver which resolves using ActiveRecord.
|
37
|
+
class ActiveRecordResolver
  # Build a resolver around a model class and the name of one of its finders.
  # The finder is given as a symbol and is called with exactly one argument, e.g.
  #
  #   ActiveRecordResolver.new(Person, :find_by_name)
  def initialize(ar_class, find_method)
    @ar_class, @find_method = ar_class, find_method
  end

  # Look the value up with the configured finder and return the id of the
  # record found, or nil when the finder returns nil.
  def resolve(value)
    record = @ar_class.__send__(@find_method, value)
    return nil if record.nil?
    record.id
  end
end
|