activewarehouse-etl 0.4.0 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +26 -14
- data/TODO +4 -2
- data/lib/etl.rb +10 -3
- data/lib/etl/active_record_ext/connection_adapters/mysql_adapter.rb +2 -2
- data/lib/etl/commands/etl.rb +6 -7
- data/lib/etl/control/control.rb +120 -52
- data/lib/etl/control/destination.rb +46 -5
- data/lib/etl/control/destination/database_destination.rb +45 -7
- data/lib/etl/control/destination/file_destination.rb +28 -4
- data/lib/etl/control/source.rb +16 -1
- data/lib/etl/control/source/database_source.rb +35 -5
- data/lib/etl/control/source/file_source.rb +33 -3
- data/lib/etl/engine.rb +129 -9
- data/lib/etl/generator/generator.rb +11 -2
- data/lib/etl/generator/surrogate_key_generator.rb +3 -2
- data/lib/etl/parser/delimited_parser.rb +3 -4
- data/lib/etl/parser/fixed_width_parser.rb +3 -4
- data/lib/etl/parser/parser.rb +7 -1
- data/lib/etl/parser/sax_parser.rb +190 -0
- data/lib/etl/parser/xml_parser.rb +2 -2
- data/lib/etl/processor/bulk_import_processor.rb +4 -4
- data/lib/etl/processor/processor.rb +1 -1
- data/lib/etl/processor/truncate_processor.rb +4 -4
- data/lib/etl/transform/date_to_string_transform.rb +19 -0
- data/lib/etl/transform/decode_transform.rb +15 -1
- data/lib/etl/transform/foreign_key_lookup_transform.rb +53 -0
- data/lib/etl/transform/string_to_date_transform.rb +14 -0
- data/lib/etl/transform/transform.rb +28 -9
- data/lib/etl/transform/type_transform.rb +22 -0
- data/lib/etl/version.rb +2 -2
- metadata +8 -3
@@ -1,11 +1,20 @@
|
|
1
|
-
module ETL
|
2
|
-
module Generator
|
1
|
+
module ETL #:nodoc:
|
2
|
+
module Generator #:nodoc:
|
3
|
+
# Base class for generators.
|
3
4
|
class Generator
|
4
5
|
class << self
|
6
|
+
# Get the Class for the specified name.
|
7
|
+
#
|
8
|
+
# For example, if name is :surrogate_key then a SurrogateKeyGenerator class is returned
|
5
9
|
def class_for_name(name)
|
6
10
|
ETL::Generator.const_get("#{name.to_s.classify}Generator")
|
7
11
|
end
|
8
12
|
end
|
13
|
+
|
14
|
+
# Generate the next value. This method must be implemented by subclasses
|
15
|
+
def next
|
16
|
+
raise "Must be implemented by a subclass"
|
17
|
+
end
|
9
18
|
end
|
10
19
|
end
|
11
20
|
end
|
@@ -2,17 +2,16 @@ module ETL #:nodoc:
|
|
2
2
|
module Parser #:nodoc:
|
3
3
|
# Parses delimited files
|
4
4
|
class DelimitedParser < ETL::Parser::Parser
|
5
|
-
include Enumerable
|
6
5
|
# Initialize the parser
|
7
6
|
# * <tt>source</tt>: The Source object
|
8
|
-
|
7
|
+
# * <tt>options</tt>: Hash of options for the parser, defaults to an empty hash
|
8
|
+
def initialize(source, options={})
|
9
9
|
super
|
10
10
|
configure
|
11
11
|
end
|
12
12
|
|
13
13
|
# Returns each row.
|
14
14
|
def each
|
15
|
-
options = {}
|
16
15
|
Dir.glob(file).each do |file|
|
17
16
|
ETL::Engine.logger.debug "parsing #{file}"
|
18
17
|
line = 0
|
@@ -64,7 +63,7 @@ module ETL #:nodoc:
|
|
64
63
|
end
|
65
64
|
end
|
66
65
|
|
67
|
-
class Field
|
66
|
+
class Field #:nodoc:
|
68
67
|
attr_reader :name, :type
|
69
68
|
def initialize(name, type=:string)
|
70
69
|
@name = name
|
@@ -2,11 +2,10 @@ module ETL #:nodoc:
|
|
2
2
|
module Parser #:nodoc:
|
3
3
|
# Parser for fixed width files
|
4
4
|
class FixedWidthParser < ETL::Parser::Parser
|
5
|
-
include Enumerable
|
6
|
-
|
7
5
|
# Initialize the parser
|
8
6
|
# * <tt>source</tt>: The source object
|
9
|
-
|
7
|
+
# * <tt>options</tt>: Parser options Hash
|
8
|
+
def initialize(source, options={})
|
10
9
|
super
|
11
10
|
configure
|
12
11
|
end
|
@@ -43,7 +42,7 @@ module ETL #:nodoc:
|
|
43
42
|
end
|
44
43
|
end
|
45
44
|
|
46
|
-
class FixedWidthField
|
45
|
+
class FixedWidthField #:nodoc:
|
47
46
|
attr_reader :name, :field_start, :field_end, :field_length, :type
|
48
47
|
def initialize(name, field_start, field_end=nil, field_length=nil, type=nil)
|
49
48
|
@name = name
|
data/lib/etl/parser/parser.rb
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
module ETL
|
2
2
|
module Parser
|
3
3
|
class Parser
|
4
|
+
include Enumerable
|
4
5
|
class << self
|
5
6
|
# Convert the name (string or symbol) to a parser class.
|
6
7
|
#
|
@@ -11,10 +12,15 @@ module ETL
|
|
11
12
|
end
|
12
13
|
end
|
13
14
|
|
15
|
+
# The Source object for the data
|
14
16
|
attr_reader :source
|
15
17
|
|
16
|
-
|
18
|
+
# Options Hash for the parser
|
19
|
+
attr_reader :options
|
20
|
+
|
21
|
+
def initialize(source, options={})
|
17
22
|
@source = source
|
23
|
+
@options = options || {}
|
18
24
|
end
|
19
25
|
|
20
26
|
# Convert the value to the specified type.
|
@@ -0,0 +1,190 @@
|
|
1
|
+
require 'rexml/parsers/sax2parser'
|
2
|
+
require 'rexml/sax2listener'
|
3
|
+
|
4
|
+
module ETL
|
5
|
+
module Parser
|
6
|
+
class SaxParser < ETL::Parser::Parser
|
7
|
+
|
8
|
+
# The write trigger causes whatever values are currently specified for the row to be returned.
|
9
|
+
# After returning, the values will not be cleared, thus allowing for values which are assigned
|
10
|
+
# higher in the XML tree to remain in memory.
|
11
|
+
attr_accessor :write_trigger
|
12
|
+
|
13
|
+
# Initialize the parser
|
14
|
+
# * <tt>source</tt>: The Source object
|
15
|
+
# * <tt>options</tt>: Parser options Hash
|
16
|
+
def initialize(source, options={})
|
17
|
+
super
|
18
|
+
configure
|
19
|
+
end
|
20
|
+
|
21
|
+
# Returns each row
|
22
|
+
def each(&block)
|
23
|
+
Dir.glob(file).each do |file|
|
24
|
+
parser = REXML::Parsers::SAX2Parser.new(File.new(file))
|
25
|
+
listener = Listener.new(self, &block)
|
26
|
+
parser.listen(listener)
|
27
|
+
parser.parse
|
28
|
+
end
|
29
|
+
end
|
30
|
+
|
31
|
+
def fields
|
32
|
+
@fields ||= []
|
33
|
+
end
|
34
|
+
|
35
|
+
private
|
36
|
+
def configure
|
37
|
+
#puts "write trigger in source.definition: #{source.definition[:write_trigger]}"
|
38
|
+
self.write_trigger = source.definition[:write_trigger]
|
39
|
+
# map paths to field names
|
40
|
+
source.definition[:fields].each do |name, path|
|
41
|
+
#puts "defined field #{name}, path: #{path}"
|
42
|
+
fields << Field.new(name, XPath::Path.parse(path))
|
43
|
+
end
|
44
|
+
end
|
45
|
+
|
46
|
+
class Field
|
47
|
+
attr_reader :name, :path
|
48
|
+
def initialize(name, path)
|
49
|
+
@name = name
|
50
|
+
@path = path
|
51
|
+
end
|
52
|
+
end
|
53
|
+
end
|
54
|
+
|
55
|
+
class Listener
|
56
|
+
include REXML::SAX2Listener
|
57
|
+
def initialize(parser, &block)
|
58
|
+
@parser = parser
|
59
|
+
@row = {}
|
60
|
+
@value = nil
|
61
|
+
@proc = Proc.new(&block)
|
62
|
+
end
|
63
|
+
def cdata(text)
|
64
|
+
@value << text
|
65
|
+
end
|
66
|
+
def characters(text)
|
67
|
+
text = text.strip
|
68
|
+
if (!text.nil? && text != '')
|
69
|
+
@value ||= ''
|
70
|
+
@value << text
|
71
|
+
end
|
72
|
+
end
|
73
|
+
def start_document
|
74
|
+
@path = XPath::Path.new
|
75
|
+
end
|
76
|
+
def end_document
|
77
|
+
|
78
|
+
end
|
79
|
+
def start_element(uri, localname, qname, attributes)
|
80
|
+
@path.elements << XPath::Element.new(localname, attributes)
|
81
|
+
end
|
82
|
+
def end_element(uri, localname, qname)
|
83
|
+
element = @path.elements.last
|
84
|
+
|
85
|
+
@parser.fields.each do |field|
|
86
|
+
#puts "#{@path} match? #{field.path}"
|
87
|
+
if @path.match?(field.path)
|
88
|
+
#puts "field.path: #{field.path}"
|
89
|
+
if field.path.is_attribute?
|
90
|
+
@row[field.name] = element.attributes[field.path.attribute]
|
91
|
+
else
|
92
|
+
@row[field.name] = @value
|
93
|
+
end
|
94
|
+
end
|
95
|
+
end
|
96
|
+
#puts @path.to_s
|
97
|
+
if @path.match?(@parser.write_trigger)
|
98
|
+
#puts "matched: #{@path} =~ #{@parser.write_trigger}"
|
99
|
+
#puts "calling proc with #{@row.inspect}"
|
100
|
+
@proc.call(@row.clone)
|
101
|
+
end
|
102
|
+
|
103
|
+
@value = nil
|
104
|
+
@path.elements.pop
|
105
|
+
end
|
106
|
+
def progress(position)
|
107
|
+
@position = position
|
108
|
+
end
|
109
|
+
end
|
110
|
+
|
111
|
+
module XPath
|
112
|
+
class Path
|
113
|
+
attr_accessor :elements
|
114
|
+
def initialize
|
115
|
+
@elements = []
|
116
|
+
end
|
117
|
+
def to_s
|
118
|
+
@elements.map{ |e| e.to_s }.join("/")
|
119
|
+
end
|
120
|
+
# Returns true if the last part of the path refers to an attribute
|
121
|
+
def is_attribute?
|
122
|
+
elements.last.attributes.length > 0
|
123
|
+
end
|
124
|
+
# Return the name of the attribute referenced by the last element in this path. Returns nil if the last element
|
125
|
+
# does not reference an attribute.
|
126
|
+
#
|
127
|
+
# Warning: the path must only reference a single attribute, otherwise the result of this method will be random,
|
128
|
+
# since attributes are stored in a Hash.
|
129
|
+
def attribute
|
130
|
+
return nil unless is_attribute?
|
131
|
+
elements.last.attributes.keys.first
|
132
|
+
end
|
133
|
+
# Return true if this XPath::Path matches the given path string. This is a fail-fast match, so the first mismatch
|
134
|
+
# will cause the method to return false.
|
135
|
+
def match?(s)
|
136
|
+
path = Path.parse(s)
|
137
|
+
return false unless path.elements.length == elements.length
|
138
|
+
elements.each_with_index do |element, index|
|
139
|
+
path_element = path.elements[index]
|
140
|
+
return false if path_element.nil?
|
141
|
+
return false if element.name != path_element.name
|
142
|
+
path_element.attributes.each do |key, value|
|
143
|
+
return false unless element.attributes[key] =~ value
|
144
|
+
end
|
145
|
+
end
|
146
|
+
return true
|
147
|
+
end
|
148
|
+
|
149
|
+
# Parse the string into an XPath::Path object
|
150
|
+
def self.parse(s)
|
151
|
+
return s if s.is_a?(Path)
|
152
|
+
path = Path.new
|
153
|
+
parts = s.split('/')
|
154
|
+
parts.each_with_index do |part, i|
|
155
|
+
attributes = {}
|
156
|
+
part.gsub!(/(.*)\[(.*)\]/, '\1')
|
157
|
+
if !$2.nil?
|
158
|
+
$2.split(",").each do |pair|
|
159
|
+
key, value = pair.split("=")
|
160
|
+
value = ".*" if value.nil?
|
161
|
+
attributes[key] = Regexp.new(value)
|
162
|
+
end
|
163
|
+
end
|
164
|
+
path.elements << Element.new(part, attributes)
|
165
|
+
end
|
166
|
+
path
|
167
|
+
end
|
168
|
+
end
|
169
|
+
class Element
|
170
|
+
attr_reader :name
|
171
|
+
attr_reader :attributes
|
172
|
+
def initialize(name, attributes={})
|
173
|
+
@name = name
|
174
|
+
@attributes = attributes
|
175
|
+
end
|
176
|
+
def to_s
|
177
|
+
s = "#{name}"
|
178
|
+
if !@attributes.empty?
|
179
|
+
attr_str = @attributes.collect do |key,value|
|
180
|
+
value = value.source if value.is_a?(Regexp)
|
181
|
+
"#{key}=#{value}"
|
182
|
+
end.join(",")
|
183
|
+
s << "[" + attr_str + "]"
|
184
|
+
end
|
185
|
+
s
|
186
|
+
end
|
187
|
+
end
|
188
|
+
end
|
189
|
+
end
|
190
|
+
end
|
@@ -3,10 +3,10 @@ require 'rexml/document'
|
|
3
3
|
module ETL
|
4
4
|
module Parser
|
5
5
|
class XmlParser < ETL::Parser::Parser
|
6
|
-
include Enumerable
|
7
6
|
# Initialize the parser
|
8
7
|
# * <tt>source</tt>: The Source object
|
9
|
-
|
8
|
+
# * <tt>options</tt>: Parser options Hash
|
9
|
+
def initialize(source, options={})
|
10
10
|
super
|
11
11
|
configure
|
12
12
|
end
|
@@ -1,5 +1,5 @@
|
|
1
|
-
module ETL
|
2
|
-
module Processor
|
1
|
+
module ETL #:nodoc:
|
2
|
+
module Processor #:nodoc:
|
3
3
|
# Processor which is used to bulk import data into a target database
|
4
4
|
class BulkImportProcessor < ETL::Processor::Processor
|
5
5
|
attr_reader :file, :target, :truncate, :columns
|
@@ -13,7 +13,7 @@ module ETL
|
|
13
13
|
end
|
14
14
|
def process
|
15
15
|
# columns = control.destinations.first.order.join(',') # TODO: support multiple destinations?
|
16
|
-
conn = ActiveRecord::Base.connection
|
16
|
+
conn = ETL::ActiveRecord::Base.connection
|
17
17
|
conn.transaction do
|
18
18
|
# TODO: Support all database types
|
19
19
|
# Since LOCAL is used this must be allowed by both the client and server
|
@@ -27,7 +27,7 @@ module ETL
|
|
27
27
|
private
|
28
28
|
# Connect to the database
|
29
29
|
def connect
|
30
|
-
ActiveRecord::Base.establish_connection(
|
30
|
+
ETL::ActiveRecord::Base.establish_connection(
|
31
31
|
:adapter => (target[:adapter] || :mysql),
|
32
32
|
:username => (target[:username] || 'root'),
|
33
33
|
:host => (target[:host] || 'localhost'),
|
@@ -1,5 +1,5 @@
|
|
1
|
-
module ETL
|
2
|
-
module Processor
|
1
|
+
module ETL #:nodoc:
|
2
|
+
module Processor #:nodoc:
|
3
3
|
# A processor which will truncate a table. Use as a pre-processor for cleaning out a table
|
4
4
|
# prior to loading
|
5
5
|
class TruncateProcessor < ETL::Processor::Processor
|
@@ -11,13 +11,13 @@ module ETL
|
|
11
11
|
connect
|
12
12
|
end
|
13
13
|
def process
|
14
|
-
conn = ActiveRecord::Base.connection
|
14
|
+
conn = ETL::ActiveRecord::Base.connection
|
15
15
|
conn.truncate
|
16
16
|
end
|
17
17
|
|
18
18
|
# Connect to the database
|
19
19
|
def connect
|
20
|
-
ActiveRecord::Base.establish_connection(
|
20
|
+
ETL::ActiveRecord::Base.establish_connection(
|
21
21
|
:adapter => (target[:adapter] || :mysql),
|
22
22
|
:username => (target[:username] || 'root'),
|
23
23
|
:host => (target[:host] || 'localhost'),
|
@@ -0,0 +1,19 @@
|
|
1
|
+
module ETL #:nodoc:
|
2
|
+
module Transform #:nodoc:
|
3
|
+
# Transform a Date or Time to a formatted string instance
|
4
|
+
class DateToStringTransform < ETL::Transform::Transform
|
5
|
+
# Initialize the transformer.
|
6
|
+
#
|
7
|
+
# Configuration options:
|
8
|
+
# * <tt>:format</tt>: A format passed to strftime. Defaults to %Y-%m-%d
|
9
|
+
def initialize(control, configuration={})
|
10
|
+
super
|
11
|
+
@format = configuration[:format] || "%Y-%m-%d"
|
12
|
+
end
|
13
|
+
# Transform the value using strftime
|
14
|
+
def transform(value)
|
15
|
+
value.strftime(@format)
|
16
|
+
end
|
17
|
+
end
|
18
|
+
end
|
19
|
+
end
|
@@ -2,7 +2,18 @@ module ETL #:nodoc:
|
|
2
2
|
module Transform #:nodoc:
|
3
3
|
# Transform which decodes coded values
|
4
4
|
class DecodeTransform < ETL::Transform::Transform
|
5
|
-
attr_accessor :decode_table_path
|
5
|
+
attr_accessor :decode_table_path
|
6
|
+
|
7
|
+
attr_accessor :decode_table_delimiter
|
8
|
+
|
9
|
+
attr_accessor :default_value
|
10
|
+
|
11
|
+
# Initialize the transformer
|
12
|
+
#
|
13
|
+
# Configuration options:
|
14
|
+
# * <tt>:decode_table_path</tt>: The path to the decode table (defaults to 'decode.txt')
|
15
|
+
# * <tt>:decode_table_delimiter</tt>: The decode table delimiter (defaults to ':')
|
16
|
+
# * <tt>:default_value</tt>: The default value to use (defaults to 'No Value')
|
6
17
|
def initialize(control, configuration={})
|
7
18
|
super
|
8
19
|
|
@@ -14,10 +25,13 @@ module ETL #:nodoc:
|
|
14
25
|
@decode_table_delimiter = (configuration[:decode_table_delimiter] || ':')
|
15
26
|
@default_value = (configuration[:default_value] || 'No Value')
|
16
27
|
end
|
28
|
+
|
29
|
+
# Transform the value
|
17
30
|
def transform(value)
|
18
31
|
decode_table[value] || default_value
|
19
32
|
end
|
20
33
|
|
34
|
+
# Get the decode table
|
21
35
|
def decode_table
|
22
36
|
unless @decode_table
|
23
37
|
@decode_table = {}
|
@@ -0,0 +1,53 @@
|
|
1
|
+
module ETL #:nodoc:
|
2
|
+
module Transform #:nodoc:
|
3
|
+
# Transform which looks up the value and replaces it with a foreign key reference
|
4
|
+
class ForeignKeyLookupTransform < ETL::Transform::Transform
|
5
|
+
# Initialize the foreign key lookup transform.
|
6
|
+
#
|
7
|
+
# Configuration options:
|
8
|
+
# * <tt>:collection</tt>: A Hash of natural keys mapped to surrogate keys. If this is not specified then
|
9
|
+
# an empty Hash will be used. This Hash will be used to cache values that have been resolved already
|
10
|
+
# for future use.
|
11
|
+
# * <tt>:resolver</tt>: Object or Class which implements the method resolve(value)
|
12
|
+
def initialize(control, configuration={})
|
13
|
+
super
|
14
|
+
|
15
|
+
@collection = (configuration[:collection] || {})
|
16
|
+
@resolver = configuration[:resolver]
|
17
|
+
@resolver = @resolver.new if @resolver.is_a?(Class)
|
18
|
+
end
|
19
|
+
|
20
|
+
# Transform the value by resolving it to a foreign key
|
21
|
+
def transform(value)
|
22
|
+
fk = @collection[value]
|
23
|
+
unless fk
|
24
|
+
raise ResolverError, "Foreign key for #{value} not found and no resolver specified" unless @resolver
|
25
|
+
raise ResolverError, "Resolver does not appear to respond to resolve method" unless @resolver.respond_to?(:resolve)
|
26
|
+
fk = @resolver.resolve(value)
|
27
|
+
raise ResolverError, "Unable to resolve #{value} to foreign key" unless fk
|
28
|
+
@collection[value] = fk
|
29
|
+
end
|
30
|
+
fk
|
31
|
+
end
|
32
|
+
end
|
33
|
+
end
|
34
|
+
end
|
35
|
+
|
36
|
+
# Resolver which resolves using ActiveRecord.
|
37
|
+
class ActiveRecordResolver
|
38
|
+
# Initialize the resolver. The ar_class argument should extend from ActiveRecord::Base. The find_method argument
|
39
|
+
# must be a symbol for the finder method used. For example:
|
40
|
+
#
|
41
|
+
# ActiveRecordResolver.new(Person, :find_by_name)
|
42
|
+
#
|
43
|
+
# Note that the find method defined must only take a single argument.
|
44
|
+
def initialize(ar_class, find_method)
|
45
|
+
@ar_class = ar_class
|
46
|
+
@find_method = find_method
|
47
|
+
end
|
48
|
+
# Resolve the value
|
49
|
+
def resolve(value)
|
50
|
+
rec = @ar_class.__send__(@find_method, value)
|
51
|
+
rec.nil? ? nil : rec.id
|
52
|
+
end
|
53
|
+
end
|