activewarehouse-etl 0.4.0 → 0.5.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,11 +1,20 @@
1
- module ETL
2
- module Generator
1
+ module ETL #:nodoc:
2
+ module Generator #:nodoc:
3
+ # Base class for generators.
3
4
  class Generator
4
5
  class << self
6
+ # Get the Class for the specified name.
7
+ #
8
+ # For example, if name is :surrogate_key then a SurrogateKeyGenerator class is returned
5
9
  def class_for_name(name)
6
10
  ETL::Generator.const_get("#{name.to_s.classify}Generator")
7
11
  end
8
12
  end
13
+
14
+ # Generate the next value. This method must be implemented by subclasses
15
+ def next
16
+ raise "Must be implemented by a subclass"
17
+ end
9
18
  end
10
19
  end
11
20
  end
@@ -1,5 +1,6 @@
1
- module ETL
2
- module Generator
1
+ module ETL #:nodoc:
2
+ module Generator #:nodoc:
3
+ # Surrogate key generator.
3
4
  class SurrogateKeyGenerator < Generator
4
5
  def next
5
6
  @surrogate_key ||= 0
@@ -2,17 +2,16 @@ module ETL #:nodoc:
2
2
  module Parser #:nodoc:
3
3
  # Parses delimited files
4
4
  class DelimitedParser < ETL::Parser::Parser
5
- include Enumerable
6
5
  # Initialize the parser
7
6
  # * <tt>source</tt>: The Source object
8
- def initialize(source)
7
+ # * <tt>options</tt>: Hash of options for the parser, defaults to an empty hash
8
+ def initialize(source, options={})
9
9
  super
10
10
  configure
11
11
  end
12
12
 
13
13
  # Returns each row.
14
14
  def each
15
- options = {}
16
15
  Dir.glob(file).each do |file|
17
16
  ETL::Engine.logger.debug "parsing #{file}"
18
17
  line = 0
@@ -64,7 +63,7 @@ module ETL #:nodoc:
64
63
  end
65
64
  end
66
65
 
67
- class Field
66
+ class Field #:nodoc:
68
67
  attr_reader :name, :type
69
68
  def initialize(name, type=:string)
70
69
  @name = name
@@ -2,11 +2,10 @@ module ETL #:nodoc:
2
2
  module Parser #:nodoc:
3
3
  # Parser for fixed width files
4
4
  class FixedWidthParser < ETL::Parser::Parser
5
- include Enumerable
6
-
7
5
  # Initialize the parser
8
6
  # * <tt>source</tt>: The source object
9
- def initialize(source)
7
+ # * <tt>options</tt>: Parser options Hash
8
+ def initialize(source, options={})
10
9
  super
11
10
  configure
12
11
  end
@@ -43,7 +42,7 @@ module ETL #:nodoc:
43
42
  end
44
43
  end
45
44
 
46
- class FixedWidthField
45
+ class FixedWidthField #:nodoc:
47
46
  attr_reader :name, :field_start, :field_end, :field_length, :type
48
47
  def initialize(name, field_start, field_end=nil, field_length=nil, type=nil)
49
48
  @name = name
@@ -1,6 +1,7 @@
1
1
  module ETL
2
2
  module Parser
3
3
  class Parser
4
+ include Enumerable
4
5
  class << self
5
6
  # Convert the name (string or symbol) to a parser class.
6
7
  #
@@ -11,10 +12,15 @@ module ETL
11
12
  end
12
13
  end
13
14
 
15
+ # The Source object for the data
14
16
  attr_reader :source
15
17
 
16
- def initialize(source)
18
+ # Options Hash for the parser
19
+ attr_reader :options
20
+
21
+ def initialize(source, options={})
17
22
  @source = source
23
+ @options = options || {}
18
24
  end
19
25
 
20
26
  # Convert the value to the specified type.
@@ -0,0 +1,190 @@
1
+ require 'rexml/parsers/sax2parser'
2
+ require 'rexml/sax2listener'
3
+
4
+ module ETL
5
+ module Parser
6
+ class SaxParser < ETL::Parser::Parser
7
+
8
+ # The write trigger causes whatever values are currently specified for the row to be returned.
9
+ # After returning, the values will not be cleared, thus allowing for values which are assigned
10
+ # higher in the XML tree to remain in memory.
11
+ attr_accessor :write_trigger
12
+
13
+ # Initialize the parser
14
+ # * <tt>source</tt>: The Source object
15
+ # * <tt>options</tt>: Parser options Hash
16
+ def initialize(source, options={})
17
+ super
18
+ configure
19
+ end
20
+
21
+ # Returns each row
22
+ def each(&block)
23
+ Dir.glob(file).each do |file|
24
+ parser = REXML::Parsers::SAX2Parser.new(File.new(file))
25
+ listener = Listener.new(self, &block)
26
+ parser.listen(listener)
27
+ parser.parse
28
+ end
29
+ end
30
+
31
+ def fields
32
+ @fields ||= []
33
+ end
34
+
35
+ private
36
+ def configure
37
+ #puts "write trigger in source.definition: #{source.definition[:write_trigger]}"
38
+ self.write_trigger = source.definition[:write_trigger]
39
+ # map paths to field names
40
+ source.definition[:fields].each do |name, path|
41
+ #puts "defined field #{name}, path: #{path}"
42
+ fields << Field.new(name, XPath::Path.parse(path))
43
+ end
44
+ end
45
+
46
+ class Field
47
+ attr_reader :name, :path
48
+ def initialize(name, path)
49
+ @name = name
50
+ @path = path
51
+ end
52
+ end
53
+ end
54
+
55
+ class Listener
56
+ include REXML::SAX2Listener
57
+ def initialize(parser, &block)
58
+ @parser = parser
59
+ @row = {}
60
+ @value = nil
61
+ @proc = Proc.new(&block)
62
+ end
63
+ def cdata(text)
64
+ @value << text
65
+ end
66
+ def characters(text)
67
+ text = text.strip
68
+ if (!text.nil? && text != '')
69
+ @value ||= ''
70
+ @value << text
71
+ end
72
+ end
73
+ def start_document
74
+ @path = XPath::Path.new
75
+ end
76
+ def end_document
77
+
78
+ end
79
+ def start_element(uri, localname, qname, attributes)
80
+ @path.elements << XPath::Element.new(localname, attributes)
81
+ end
82
+ def end_element(uri, localname, qname)
83
+ element = @path.elements.last
84
+
85
+ @parser.fields.each do |field|
86
+ #puts "#{@path} match? #{field.path}"
87
+ if @path.match?(field.path)
88
+ #puts "field.path: #{field.path}"
89
+ if field.path.is_attribute?
90
+ @row[field.name] = element.attributes[field.path.attribute]
91
+ else
92
+ @row[field.name] = @value
93
+ end
94
+ end
95
+ end
96
+ #puts @path.to_s
97
+ if @path.match?(@parser.write_trigger)
98
+ #puts "matched: #{@path} =~ #{@parser.write_trigger}"
99
+ #puts "calling proc with #{@row.inspect}"
100
+ @proc.call(@row.clone)
101
+ end
102
+
103
+ @value = nil
104
+ @path.elements.pop
105
+ end
106
+ def progress(position)
107
+ @position = position
108
+ end
109
+ end
110
+
111
+ module XPath
112
+ class Path
113
+ attr_accessor :elements
114
+ def initialize
115
+ @elements = []
116
+ end
117
+ def to_s
118
+ @elements.map{ |e| e.to_s }.join("/")
119
+ end
120
+ # Returns true if the last part of the path refers to an attribute
121
+ def is_attribute?
122
+ elements.last.attributes.length > 0
123
+ end
124
+ # Return the name of the attribute referenced by the last element in this path. Returns nil if the last element
125
+ # does not reference an attribute.
126
+ #
127
+ # Warning: the path must only reference a single attribute, otherwise the result of this method will be random,
128
+ # since attributes are stored in a Hash.
129
+ def attribute
130
+ return nil unless is_attribute?
131
+ elements.last.attributes.keys.first
132
+ end
133
+ # Return true if this XPath::Path matches the given path string. This is a fail-fast match, so the first mismatch
134
+ # will cause the method to return false.
135
+ def match?(s)
136
+ path = Path.parse(s)
137
+ return false unless path.elements.length == elements.length
138
+ elements.each_with_index do |element, index|
139
+ path_element = path.elements[index]
140
+ return false if path_element.nil?
141
+ return false if element.name != path_element.name
142
+ path_element.attributes.each do |key, value|
143
+ return false unless element.attributes[key] =~ value
144
+ end
145
+ end
146
+ return true
147
+ end
148
+
149
+ # Parse the string into an XPath::Path object
150
+ def self.parse(s)
151
+ return s if s.is_a?(Path)
152
+ path = Path.new
153
+ parts = s.split('/')
154
+ parts.each_with_index do |part, i|
155
+ attributes = {}
156
+ part.gsub!(/(.*)\[(.*)\]/, '\1')
157
+ if !$2.nil?
158
+ $2.split(",").each do |pair|
159
+ key, value = pair.split("=")
160
+ value = ".*" if value.nil?
161
+ attributes[key] = Regexp.new(value)
162
+ end
163
+ end
164
+ path.elements << Element.new(part, attributes)
165
+ end
166
+ path
167
+ end
168
+ end
169
+ class Element
170
+ attr_reader :name
171
+ attr_reader :attributes
172
+ def initialize(name, attributes={})
173
+ @name = name
174
+ @attributes = attributes
175
+ end
176
+ def to_s
177
+ s = "#{name}"
178
+ if !@attributes.empty?
179
+ attr_str = @attributes.collect do |key,value|
180
+ value = value.source if value.is_a?(Regexp)
181
+ "#{key}=#{value}"
182
+ end.join(",")
183
+ s << "[" + attr_str + "]"
184
+ end
185
+ s
186
+ end
187
+ end
188
+ end
189
+ end
190
+ end
@@ -3,10 +3,10 @@ require 'rexml/document'
3
3
  module ETL
4
4
  module Parser
5
5
  class XmlParser < ETL::Parser::Parser
6
- include Enumerable
7
6
  # Initialize the parser
8
7
  # * <tt>source</tt>: The Source object
9
- def initialize(source)
8
+ # * <tt>options</tt>: Parser options Hash
9
+ def initialize(source, options={})
10
10
  super
11
11
  configure
12
12
  end
@@ -1,5 +1,5 @@
1
- module ETL
2
- module Processor
1
+ module ETL #:nodoc:
2
+ module Processor #:nodoc:
3
3
  # Processor which is used to bulk import data into a target database
4
4
  class BulkImportProcessor < ETL::Processor::Processor
5
5
  attr_reader :file, :target, :truncate, :columns
@@ -13,7 +13,7 @@ module ETL
13
13
  end
14
14
  def process
15
15
  # columns = control.destinations.first.order.join(',') # TODO: support multiple destinations?
16
- conn = ActiveRecord::Base.connection
16
+ conn = ETL::ActiveRecord::Base.connection
17
17
  conn.transaction do
18
18
  # TODO: Support all database types
19
19
  # Since LOCAL is used this must be allowed by both the client and server
@@ -27,7 +27,7 @@ module ETL
27
27
  private
28
28
  # Connect to the database
29
29
  def connect
30
- ActiveRecord::Base.establish_connection(
30
+ ETL::ActiveRecord::Base.establish_connection(
31
31
  :adapter => (target[:adapter] || :mysql),
32
32
  :username => (target[:username] || 'root'),
33
33
  :host => (target[:host] || 'localhost'),
@@ -1,6 +1,6 @@
1
1
  module ETL #:nodoc:
2
2
  module Processor #:nodoc:
3
- # Base class for pre and post processors
3
+ # Base class for pre and post processors. Subclasses must implement the +process+ method.
4
4
  class Processor
5
5
  def initialize(control, configuration)
6
6
  @control = control
@@ -1,5 +1,5 @@
1
- module ETL
2
- module Processor
1
+ module ETL #:nodoc:
2
+ module Processor #:nodoc:
3
3
  # A processor which will truncate a table. Use as a pre-processor for cleaning out a table
4
4
  # prior to loading
5
5
  class TruncateProcessor < ETL::Processor::Processor
@@ -11,13 +11,13 @@ module ETL
11
11
  connect
12
12
  end
13
13
  def process
14
- conn = ActiveRecord::Base.connection
14
+ conn = ETL::ActiveRecord::Base.connection
15
15
  conn.truncate
16
16
  end
17
17
 
18
18
  # Connect to the database
19
19
  def connect
20
- ActiveRecord::Base.establish_connection(
20
+ ETL::ActiveRecord::Base.establish_connection(
21
21
  :adapter => (target[:adapter] || :mysql),
22
22
  :username => (target[:username] || 'root'),
23
23
  :host => (target[:host] || 'localhost'),
@@ -0,0 +1,19 @@
1
+ module ETL #:nodoc:
2
+ module Transform #:nodoc:
3
+ # Transform a Date or Time to a formatted string instance
4
+ class DateToStringTransform < ETL::Transform::Transform
5
+ # Initialize the transformer.
6
+ #
7
+ # Configuration options:
8
+ # * <tt>:format</tt>: A format passed to strftime. Defaults to %Y-%m-%d
9
+ def initialize(control, configuration={})
10
+ super
11
+ @format = configuration[:format] || "%Y-%m-%d"
12
+ end
13
+ # Transform the value using strftime
14
+ def transform(value)
15
+ value.strftime(@format)
16
+ end
17
+ end
18
+ end
19
+ end
@@ -2,7 +2,18 @@ module ETL #:nodoc:
2
2
  module Transform #:nodoc:
3
3
  # Transform which decodes coded values
4
4
  class DecodeTransform < ETL::Transform::Transform
5
- attr_accessor :decode_table_path, :decode_table_delimiter, :default_value
5
+ attr_accessor :decode_table_path
6
+
7
+ attr_accessor :decode_table_delimiter
8
+
9
+ attr_accessor :default_value
10
+
11
+ # Initialize the transformer
12
+ #
13
+ # Configuration options:
14
+ # * <tt>:decode_table_path</tt>: The path to the decode table (defaults to 'decode.txt')
15
+ # * <tt>:decode_table_delimiter</tt>: The decode table delimiter (defaults to ':')
16
+ # * <tt>:default_value</tt>: The default value to use (defaults to 'No Value')
6
17
  def initialize(control, configuration={})
7
18
  super
8
19
 
@@ -14,10 +25,13 @@ module ETL #:nodoc:
14
25
  @decode_table_delimiter = (configuration[:decode_table_delimiter] || ':')
15
26
  @default_value = (configuration[:default_value] || 'No Value')
16
27
  end
28
+
29
+ # Transform the value
17
30
  def transform(value)
18
31
  decode_table[value] || default_value
19
32
  end
20
33
 
34
+ # Get the decode table
21
35
  def decode_table
22
36
  unless @decode_table
23
37
  @decode_table = {}
@@ -0,0 +1,53 @@
1
+ module ETL #:nodoc:
2
+ module Transform #:nodoc:
3
+ # Transform which looks up the value and replaces it with a foreign key reference
4
+ class ForeignKeyLookupTransform < ETL::Transform::Transform
5
+ # Initialize the foreign key lookup transform.
6
+ #
7
+ # Configuration options:
8
+ # * <tt>:collection</tt>: A Hash of natural keys mapped to surrogate keys. If this is not specified then
9
+ # an empty Hash will be used. This Hash will be used to cache values that have been resolved already
10
+ # for future use.
11
+ # * <tt>:resolver</tt>: Object or Class which implements the method resolve(value)
12
+ def initialize(control, configuration={})
13
+ super
14
+
15
+ @collection = (configuration[:collection] || {})
16
+ @resolver = configuration[:resolver]
17
+ @resolver = @resolver.new if @resolver.is_a?(Class)
18
+ end
19
+
20
+ # Transform the value by resolving it to a foreign key
21
+ def transform(value)
22
+ fk = @collection[value]
23
+ unless fk
24
+ raise ResolverError, "Foreign key for #{value} not found and no resolver specified" unless @resolver
25
+ raise ResolverError, "Resolver does not appear to respond to resolve method" unless @resolver.respond_to?(:resolve)
26
+ fk = @resolver.resolve(value)
27
+ raise ResolverError, "Unable to resolve #{value} to foreign key" unless fk
28
+ @collection[value] = fk
29
+ end
30
+ fk
31
+ end
32
+ end
33
+ end
34
+ end
35
+
36
+ # Resolver which resolves using ActiveRecord.
37
+ class ActiveRecordResolver
38
+ # Initialize the resolver. The ar_class argument should extend from ActiveRecord::Base. The find_method argument
39
+ # must be a symbol for the finder method used. For example:
40
+ #
41
+ # ActiveRecordResolver.new(Person, :find_by_name)
42
+ #
43
+ # Note that the find method defined must only take a single argument.
44
+ def initialize(ar_class, find_method)
45
+ @ar_class = ar_class
46
+ @find_method = find_method
47
+ end
48
+ # Resolve the value
49
+ def resolve(value)
50
+ rec = @ar_class.__send__(@find_method, value)
51
+ rec.nil? ? nil : rec.id
52
+ end
53
+ end