activewarehouse-etl 0.4.0 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,11 +1,20 @@
1
- module ETL
2
- module Generator
1
+ module ETL #:nodoc:
2
+ module Generator #:nodoc:
3
+ # Base class for generators.
3
4
  class Generator
4
5
  class << self
6
+ # Get the Class for the specified name.
7
+ #
8
+ # For example, if name is :surrogate_key then a SurrogateKeyGenerator class is returned
5
9
  def class_for_name(name)
6
10
  ETL::Generator.const_get("#{name.to_s.classify}Generator")
7
11
  end
8
12
  end
13
+
14
+ # Generate the next value. This method must be implemented by subclasses
15
+ def next
16
+ raise "Must be implemented by a subclass"
17
+ end
9
18
  end
10
19
  end
11
20
  end
@@ -1,5 +1,6 @@
1
- module ETL
2
- module Generator
1
+ module ETL #:nodoc:
2
+ module Generator #:nodoc:
3
+ # Surrogate key generator.
3
4
  class SurrogateKeyGenerator < Generator
4
5
  def next
5
6
  @surrogate_key ||= 0
@@ -2,17 +2,16 @@ module ETL #:nodoc:
2
2
  module Parser #:nodoc:
3
3
  # Parses delimited files
4
4
  class DelimitedParser < ETL::Parser::Parser
5
- include Enumerable
6
5
  # Initialize the parser
7
6
  # * <tt>source</tt>: The Source object
8
- def initialize(source)
7
+ # * <tt>options</tt>: Hash of options for the parser, defaults to an empty hash
8
+ def initialize(source, options={})
9
9
  super
10
10
  configure
11
11
  end
12
12
 
13
13
  # Returns each row.
14
14
  def each
15
- options = {}
16
15
  Dir.glob(file).each do |file|
17
16
  ETL::Engine.logger.debug "parsing #{file}"
18
17
  line = 0
@@ -64,7 +63,7 @@ module ETL #:nodoc:
64
63
  end
65
64
  end
66
65
 
67
- class Field
66
+ class Field #:nodoc:
68
67
  attr_reader :name, :type
69
68
  def initialize(name, type=:string)
70
69
  @name = name
@@ -2,11 +2,10 @@ module ETL #:nodoc:
2
2
  module Parser #:nodoc:
3
3
  # Parser for fixed width files
4
4
  class FixedWidthParser < ETL::Parser::Parser
5
- include Enumerable
6
-
7
5
  # Initialize the parser
8
6
  # * <tt>source</tt>: The source object
9
- def initialize(source)
7
+ # * <tt>options</tt>: Parser options Hash
8
+ def initialize(source, options={})
10
9
  super
11
10
  configure
12
11
  end
@@ -43,7 +42,7 @@ module ETL #:nodoc:
43
42
  end
44
43
  end
45
44
 
46
- class FixedWidthField
45
+ class FixedWidthField #:nodoc:
47
46
  attr_reader :name, :field_start, :field_end, :field_length, :type
48
47
  def initialize(name, field_start, field_end=nil, field_length=nil, type=nil)
49
48
  @name = name
@@ -1,6 +1,7 @@
1
1
  module ETL
2
2
  module Parser
3
3
  class Parser
4
+ include Enumerable
4
5
  class << self
5
6
  # Convert the name (string or symbol) to a parser class.
6
7
  #
@@ -11,10 +12,15 @@ module ETL
11
12
  end
12
13
  end
13
14
 
15
+ # The Source object for the data
14
16
  attr_reader :source
15
17
 
16
- def initialize(source)
18
+ # Options Hash for the parser
19
+ attr_reader :options
20
+
21
+ def initialize(source, options={})
17
22
  @source = source
23
+ @options = options || {}
18
24
  end
19
25
 
20
26
  # Convert the value to the specified type.
@@ -0,0 +1,190 @@
1
+ require 'rexml/parsers/sax2parser'
2
+ require 'rexml/sax2listener'
3
+
4
+ module ETL
5
+ module Parser
6
+ class SaxParser < ETL::Parser::Parser
7
+
8
+ # The write trigger causes whatever values are currently specified for the row to be returned.
9
+ # After returning, the values will not be cleared, thus allowing for values which are assigned
10
+ # higher in the XML tree to remain in memory.
11
+ attr_accessor :write_trigger
12
+
13
+ # Initialize the parser
14
+ # * <tt>source</tt>: The Source object
15
+ # * <tt>options</tt>: Parser options Hash
16
+ def initialize(source, options={})
17
+ super
18
+ configure
19
+ end
20
+
21
+ # Returns each row
22
+ def each(&block)
23
+ Dir.glob(file).each do |file|
24
+ parser = REXML::Parsers::SAX2Parser.new(File.new(file))
25
+ listener = Listener.new(self, &block)
26
+ parser.listen(listener)
27
+ parser.parse
28
+ end
29
+ end
30
+
31
+ def fields
32
+ @fields ||= []
33
+ end
34
+
35
+ private
36
+ def configure
37
+ #puts "write trigger in source.definition: #{source.definition[:write_trigger]}"
38
+ self.write_trigger = source.definition[:write_trigger]
39
+ # map paths to field names
40
+ source.definition[:fields].each do |name, path|
41
+ #puts "defined field #{name}, path: #{path}"
42
+ fields << Field.new(name, XPath::Path.parse(path))
43
+ end
44
+ end
45
+
46
+ class Field
47
+ attr_reader :name, :path
48
+ def initialize(name, path)
49
+ @name = name
50
+ @path = path
51
+ end
52
+ end
53
+ end
54
+
55
+ class Listener
56
+ include REXML::SAX2Listener
57
+ def initialize(parser, &block)
58
+ @parser = parser
59
+ @row = {}
60
+ @value = nil
61
+ @proc = Proc.new(&block)
62
+ end
63
+ def cdata(text)
64
+ @value << text
65
+ end
66
+ def characters(text)
67
+ text = text.strip
68
+ if (!text.nil? && text != '')
69
+ @value ||= ''
70
+ @value << text
71
+ end
72
+ end
73
+ def start_document
74
+ @path = XPath::Path.new
75
+ end
76
+ def end_document
77
+
78
+ end
79
+ def start_element(uri, localname, qname, attributes)
80
+ @path.elements << XPath::Element.new(localname, attributes)
81
+ end
82
+ def end_element(uri, localname, qname)
83
+ element = @path.elements.last
84
+
85
+ @parser.fields.each do |field|
86
+ #puts "#{@path} match? #{field.path}"
87
+ if @path.match?(field.path)
88
+ #puts "field.path: #{field.path}"
89
+ if field.path.is_attribute?
90
+ @row[field.name] = element.attributes[field.path.attribute]
91
+ else
92
+ @row[field.name] = @value
93
+ end
94
+ end
95
+ end
96
+ #puts @path.to_s
97
+ if @path.match?(@parser.write_trigger)
98
+ #puts "matched: #{@path} =~ #{@parser.write_trigger}"
99
+ #puts "calling proc with #{@row.inspect}"
100
+ @proc.call(@row.clone)
101
+ end
102
+
103
+ @value = nil
104
+ @path.elements.pop
105
+ end
106
+ def progress(position)
107
+ @position = position
108
+ end
109
+ end
110
+
111
+ module XPath
112
+ class Path
113
+ attr_accessor :elements
114
+ def initialize
115
+ @elements = []
116
+ end
117
+ def to_s
118
+ @elements.map{ |e| e.to_s }.join("/")
119
+ end
120
+ # Returns true if the last part of the path refers to an attribute
121
+ def is_attribute?
122
+ elements.last.attributes.length > 0
123
+ end
124
+ # Return the name of the attribute referenced by the last element in this path. Returns nil if the last element
125
+ # does not reference an attribute.
126
+ #
127
+ # Warning: the path must only reference a single attribute, otherwise the result of this method will be random,
128
+ # since attributes are stored in a Hash.
129
+ def attribute
130
+ return nil unless is_attribute?
131
+ elements.last.attributes.keys.first
132
+ end
133
+ # Return true if this XPath::Path matches the given path string. This is a fail-fast match, so the first mismatch
134
+ # will cause the method to return false.
135
+ def match?(s)
136
+ path = Path.parse(s)
137
+ return false unless path.elements.length == elements.length
138
+ elements.each_with_index do |element, index|
139
+ path_element = path.elements[index]
140
+ return false if path_element.nil?
141
+ return false if element.name != path_element.name
142
+ path_element.attributes.each do |key, value|
143
+ return false unless element.attributes[key] =~ value
144
+ end
145
+ end
146
+ return true
147
+ end
148
+
149
+ # Parse the string into an XPath::Path object
150
+ def self.parse(s)
151
+ return s if s.is_a?(Path)
152
+ path = Path.new
153
+ parts = s.split('/')
154
+ parts.each_with_index do |part, i|
155
+ attributes = {}
156
+ part.gsub!(/(.*)\[(.*)\]/, '\1')
157
+ if !$2.nil?
158
+ $2.split(",").each do |pair|
159
+ key, value = pair.split("=")
160
+ value = ".*" if value.nil?
161
+ attributes[key] = Regexp.new(value)
162
+ end
163
+ end
164
+ path.elements << Element.new(part, attributes)
165
+ end
166
+ path
167
+ end
168
+ end
169
+ class Element
170
+ attr_reader :name
171
+ attr_reader :attributes
172
+ def initialize(name, attributes={})
173
+ @name = name
174
+ @attributes = attributes
175
+ end
176
+ def to_s
177
+ s = "#{name}"
178
+ if !@attributes.empty?
179
+ attr_str = @attributes.collect do |key,value|
180
+ value = value.source if value.is_a?(Regexp)
181
+ "#{key}=#{value}"
182
+ end.join(",")
183
+ s << "[" + attr_str + "]"
184
+ end
185
+ s
186
+ end
187
+ end
188
+ end
189
+ end
190
+ end
@@ -3,10 +3,10 @@ require 'rexml/document'
3
3
  module ETL
4
4
  module Parser
5
5
  class XmlParser < ETL::Parser::Parser
6
- include Enumerable
7
6
  # Initialize the parser
8
7
  # * <tt>source</tt>: The Source object
9
- def initialize(source)
8
+ # * <tt>options</tt>: Parser options Hash
9
+ def initialize(source, options={})
10
10
  super
11
11
  configure
12
12
  end
@@ -1,5 +1,5 @@
1
- module ETL
2
- module Processor
1
+ module ETL #:nodoc:
2
+ module Processor #:nodoc:
3
3
  # Processor which is used to bulk import data into a target database
4
4
  class BulkImportProcessor < ETL::Processor::Processor
5
5
  attr_reader :file, :target, :truncate, :columns
@@ -13,7 +13,7 @@ module ETL
13
13
  end
14
14
  def process
15
15
  # columns = control.destinations.first.order.join(',') # TODO: support multiple destinations?
16
- conn = ActiveRecord::Base.connection
16
+ conn = ETL::ActiveRecord::Base.connection
17
17
  conn.transaction do
18
18
  # TODO: Support all database types
19
19
  # Since LOCAL is used this must be allowed by both the client and server
@@ -27,7 +27,7 @@ module ETL
27
27
  private
28
28
  # Connect to the database
29
29
  def connect
30
- ActiveRecord::Base.establish_connection(
30
+ ETL::ActiveRecord::Base.establish_connection(
31
31
  :adapter => (target[:adapter] || :mysql),
32
32
  :username => (target[:username] || 'root'),
33
33
  :host => (target[:host] || 'localhost'),
@@ -1,6 +1,6 @@
1
1
  module ETL #:nodoc:
2
2
  module Processor #:nodoc:
3
- # Base class for pre and post processors
3
+ # Base class for pre and post processors. Subclasses must implement the +process+ method.
4
4
  class Processor
5
5
  def initialize(control, configuration)
6
6
  @control = control
@@ -1,5 +1,5 @@
1
- module ETL
2
- module Processor
1
+ module ETL #:nodoc:
2
+ module Processor #:nodoc:
3
3
  # A processor which will truncate a table. Use as a pre-processor for cleaning out a table
4
4
  # prior to loading
5
5
  class TruncateProcessor < ETL::Processor::Processor
@@ -11,13 +11,13 @@ module ETL
11
11
  connect
12
12
  end
13
13
  def process
14
- conn = ActiveRecord::Base.connection
14
+ conn = ETL::ActiveRecord::Base.connection
15
15
  conn.truncate
16
16
  end
17
17
 
18
18
  # Connect to the database
19
19
  def connect
20
- ActiveRecord::Base.establish_connection(
20
+ ETL::ActiveRecord::Base.establish_connection(
21
21
  :adapter => (target[:adapter] || :mysql),
22
22
  :username => (target[:username] || 'root'),
23
23
  :host => (target[:host] || 'localhost'),
@@ -0,0 +1,19 @@
1
+ module ETL #:nodoc:
2
+ module Transform #:nodoc:
3
+ # Transform a Date or Time to a formatted string instance
4
+ class DateToStringTransform < ETL::Transform::Transform
5
+ # Initialize the transformer.
6
+ #
7
+ # Configuration options:
8
+ # * <tt>:format</tt>: A format passed to strftime. Defaults to %Y-%m-%d
9
+ def initialize(control, configuration={})
10
+ super
11
+ @format = configuration[:format] || "%Y-%m-%d"
12
+ end
13
+ # Transform the value using strftime
14
+ def transform(value)
15
+ value.strftime(@format)
16
+ end
17
+ end
18
+ end
19
+ end
@@ -2,7 +2,18 @@ module ETL #:nodoc:
2
2
  module Transform #:nodoc:
3
3
  # Transform which decodes coded values
4
4
  class DecodeTransform < ETL::Transform::Transform
5
- attr_accessor :decode_table_path, :decode_table_delimiter, :default_value
5
+ attr_accessor :decode_table_path
6
+
7
+ attr_accessor :decode_table_delimiter
8
+
9
+ attr_accessor :default_value
10
+
11
+ # Initialize the transformer
12
+ #
13
+ # Configuration options:
14
+ # * <tt>:decode_table_path</tt>: The path to the decode table (defaults to 'decode.txt')
15
+ # * <tt>:decode_table_delimiter</tt>: The decode table delimiter (defaults to ':')
16
+ # * <tt>:default_value</tt>: The default value to use (defaults to 'No Value')
6
17
  def initialize(control, configuration={})
7
18
  super
8
19
 
@@ -14,10 +25,13 @@ module ETL #:nodoc:
14
25
  @decode_table_delimiter = (configuration[:decode_table_delimiter] || ':')
15
26
  @default_value = (configuration[:default_value] || 'No Value')
16
27
  end
28
+
29
+ # Transform the value
17
30
  def transform(value)
18
31
  decode_table[value] || default_value
19
32
  end
20
33
 
34
+ # Get the decode table
21
35
  def decode_table
22
36
  unless @decode_table
23
37
  @decode_table = {}
@@ -0,0 +1,53 @@
1
+ module ETL #:nodoc:
2
+ module Transform #:nodoc:
3
+ # Transform which looks up the value and replaces it with a foreign key reference
4
+ class ForeignKeyLookupTransform < ETL::Transform::Transform
5
+ # Initialize the foreign key lookup transform.
6
+ #
7
+ # Configuration options:
8
+ # * <tt>:collection</tt>: A Hash of natural keys mapped to surrogate keys. If this is not specified then
9
+ # an empty Hash will be used. This Hash will be used to cache values that have been resolved already
10
+ # for future use.
11
+ # * <tt>:resolver</tt>: Object or Class which implements the method resolve(value)
12
+ def initialize(control, configuration={})
13
+ super
14
+
15
+ @collection = (configuration[:collection] || {})
16
+ @resolver = configuration[:resolver]
17
+ @resolver = @resolver.new if @resolver.is_a?(Class)
18
+ end
19
+
20
+ # Transform the value by resolving it to a foreign key
21
+ def transform(value)
22
+ fk = @collection[value]
23
+ unless fk
24
+ raise ResolverError, "Foreign key for #{value} not found and no resolver specified" unless @resolver
25
+ raise ResolverError, "Resolver does not appear to respond to resolve method" unless @resolver.respond_to?(:resolve)
26
+ fk = @resolver.resolve(value)
27
+ raise ResolverError, "Unable to resolve #{value} to foreign key" unless fk
28
+ @collection[value] = fk
29
+ end
30
+ fk
31
+ end
32
+ end
33
+ end
34
+ end
35
+
36
+ # Resolver which resolves using ActiveRecord.
37
+ class ActiveRecordResolver
38
+ # Initialize the resolver. The ar_class argument should extend from ActiveRecord::Base. The find_method argument
39
+ # must be a symbol for the finder method used. For example:
40
+ #
41
+ # ActiveRecordResolver.new(Person, :find_by_name)
42
+ #
43
+ # Note that the find method defined must only take a single argument.
44
+ def initialize(ar_class, find_method)
45
+ @ar_class = ar_class
46
+ @find_method = find_method
47
+ end
48
+ # Resolve the value
49
+ def resolve(value)
50
+ rec = @ar_class.__send__(@find_method, value)
51
+ rec.nil? ? nil : rec.id
52
+ end
53
+ end