RubyGems - darrell-activewarehouse-etl - Versions diffs - 0.9.1.4 - Mend

darrell-activewarehouse-etl 0.9.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (83) hide show

data/CHANGELOG +198 -0
data/LICENSE +7 -0
data/README +99 -0
data/Rakefile +175 -0
data/TODO +28 -0
data/bin/etl +28 -0
data/bin/etl.cmd +8 -0
data/examples/database.example.yml +16 -0
data/lib/etl/batch/batch.rb +111 -0
data/lib/etl/batch/directives.rb +55 -0
data/lib/etl/batch.rb +2 -0
data/lib/etl/builder/date_dimension_builder.rb +96 -0
data/lib/etl/builder/time_dimension_builder.rb +31 -0
data/lib/etl/builder.rb +2 -0
data/lib/etl/commands/etl.rb +89 -0
data/lib/etl/control/control.rb +405 -0
data/lib/etl/control/destination/database_destination.rb +97 -0
data/lib/etl/control/destination/file_destination.rb +126 -0
data/lib/etl/control/destination.rb +448 -0
data/lib/etl/control/source/database_source.rb +220 -0
data/lib/etl/control/source/enumerable_source.rb +11 -0
data/lib/etl/control/source/file_source.rb +90 -0
data/lib/etl/control/source/model_source.rb +39 -0
data/lib/etl/control/source.rb +109 -0
data/lib/etl/control.rb +3 -0
data/lib/etl/core_ext/time/calculations.rb +42 -0
data/lib/etl/core_ext/time.rb +5 -0
data/lib/etl/core_ext.rb +1 -0
data/lib/etl/engine.rb +556 -0
data/lib/etl/execution/base.rb +9 -0
data/lib/etl/execution/batch.rb +8 -0
data/lib/etl/execution/job.rb +8 -0
data/lib/etl/execution/migration.rb +85 -0
data/lib/etl/execution.rb +19 -0
data/lib/etl/generator/generator.rb +20 -0
data/lib/etl/generator/surrogate_key_generator.rb +39 -0
data/lib/etl/generator.rb +2 -0
data/lib/etl/http_tools.rb +139 -0
data/lib/etl/parser/apache_combined_log_parser.rb +49 -0
data/lib/etl/parser/delimited_parser.rb +74 -0
data/lib/etl/parser/fixed_width_parser.rb +65 -0
data/lib/etl/parser/parser.rb +41 -0
data/lib/etl/parser/sax_parser.rb +218 -0
data/lib/etl/parser/xml_parser.rb +65 -0
data/lib/etl/parser.rb +11 -0
data/lib/etl/processor/block_processor.rb +14 -0
data/lib/etl/processor/bulk_import_processor.rb +83 -0
data/lib/etl/processor/check_exist_processor.rb +80 -0
data/lib/etl/processor/check_unique_processor.rb +35 -0
data/lib/etl/processor/copy_field_processor.rb +26 -0
data/lib/etl/processor/encode_processor.rb +55 -0
data/lib/etl/processor/hierarchy_exploder_processor.rb +55 -0
data/lib/etl/processor/print_row_processor.rb +12 -0
data/lib/etl/processor/processor.rb +25 -0
data/lib/etl/processor/rename_processor.rb +24 -0
data/lib/etl/processor/require_non_blank_processor.rb +26 -0
data/lib/etl/processor/row_processor.rb +17 -0
data/lib/etl/processor/sequence_processor.rb +23 -0
data/lib/etl/processor/surrogate_key_processor.rb +53 -0
data/lib/etl/processor/truncate_processor.rb +35 -0
data/lib/etl/processor.rb +11 -0
data/lib/etl/row.rb +20 -0
data/lib/etl/screen/row_count_screen.rb +20 -0
data/lib/etl/screen.rb +14 -0
data/lib/etl/transform/block_transform.rb +13 -0
data/lib/etl/transform/date_to_string_transform.rb +20 -0
data/lib/etl/transform/decode_transform.rb +51 -0
data/lib/etl/transform/default_transform.rb +20 -0
data/lib/etl/transform/foreign_key_lookup_transform.rb +169 -0
data/lib/etl/transform/hierarchy_lookup_transform.rb +49 -0
data/lib/etl/transform/ordinalize_transform.rb +12 -0
data/lib/etl/transform/sha1_transform.rb +13 -0
data/lib/etl/transform/string_to_date_transform.rb +16 -0
data/lib/etl/transform/string_to_datetime_transform.rb +14 -0
data/lib/etl/transform/string_to_time_transform.rb +11 -0
data/lib/etl/transform/transform.rb +61 -0
data/lib/etl/transform/trim_transform.rb +26 -0
data/lib/etl/transform/type_transform.rb +35 -0
data/lib/etl/transform.rb +2 -0
data/lib/etl/util.rb +59 -0
data/lib/etl/version.rb +9 -0
data/lib/etl.rb +83 -0
metadata +245 -0

data/lib/etl/generator/generator.rb ADDED Viewed

@@ -0,0 +1,20 @@
+module ETL #:nodoc:
+  module Generator #:nodoc:
+    # Base class for generators.
+    class Generator
+      class << self
+        # Get the Class for the specified name.
+        #
+        # For example, if name is :surrogate_key then a SurrogateKeyGenerator class is returned
+        def class_for_name(name)
+          ETL::Generator.const_get("#{name.to_s.camelize}Generator")
+        end
+      end
+      # Generate the next value. This method must be implemented by subclasses
+      def next
+        raise "Must be implemented by a subclass"
+      end
+    end
+  end
+end

data/lib/etl/generator/surrogate_key_generator.rb ADDED Viewed

@@ -0,0 +1,39 @@
+# This source file contains code for a basic sequential surrogate key generator
+module ETL #:nodoc:
+  module Generator #:nodoc:
+    # Surrogate key generator.
+    class SurrogateKeyGenerator < Generator
+      attr_reader :table
+      attr_reader :target
+      attr_reader :column
+      attr_reader :query
+      # Initialize the generator
+      def initialize(options={})
+        @table = options[:table]
+        @target = options[:target]
+        @column = options[:column] || 'id'
+        @query = options[:query]
+        if table
+          @surrogate_key = ETL::Engine.connection(target).select_value("SELECT max(#{column}) FROM #{table_name}")
+        elsif query
+          @surrogate_key = ETL::Engine.connection(target).select_value(query)
+        end
+        @surrogate_key = 0 if @surrogate_key.blank?
+        @surrogate_key = @surrogate_key.to_i
+      end
+      # Get the next surrogate key
+      def next
+        @surrogate_key ||= 0
+        @surrogate_key += 1
+      end
+      def table_name
+        ETL::Engine.table(table, ETL::Engine.connection(target))
+      end
+    end
+  end
+end

data/lib/etl/generator.rb ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ require 'etl/generator/generator'
2	+ Dir[File.dirname(__FILE__) + "/generator/*.rb"].each { |file| require(file) }

data/lib/etl/http_tools.rb ADDED Viewed

@@ -0,0 +1,139 @@
+require 'uri'
# Utility methods for HTTP: user agent and URI parsing. Intended to be mixed
# into classes that need them.
module HttpTools
  # Parse the given user agent string.
  #
  # Originally based on code from http://gemtacular.com/gems/ParseUserAgent
  #
  # Returns a Hash with the keys :browser, :browser_version_major,
  # :browser_version_minor, :ostype, :os_version and :os. Every value is either
  # a stripped, non-empty String or nil. A nil or "-" user agent yields a Hash
  # whose values are all nil.
  #
  # When several Agent/version tokens are present the last one wins; an
  # "MSIE x.y" property inside the parentheses overrides all of them.
  def parse_user_agent(user_agent)
    user_agent = user_agent.to_s
    browser = browser_version_major = browser_version_minor = nil
    ostype = os = os_version = nil

    # Rewrite "Opera 9.0" as "Opera/9.0" so Opera is picked up like any other
    # Agent/version token below.
    normalized = user_agent.gsub(/(Opera) (\d)/i, '\1/\2')

    # Walk every Agent/version token to set browser and version (MSIE is set
    # later, from the properties).
    normalized.split(/\s+/).each do |token|
      next unless token =~ /\//
      parts = token.split('/')
      browser = parts[0]
      # A token such as "Foo/" has no version part; treat it as empty.
      version = parts[1].to_s
      if browser == 'Firefox'
        browser_version_major = version.slice(0, 3)
        browser_version_minor = version.sub(browser_version_major, '').sub('.', '')
      elsif browser == 'Safari'
        browser_version_major = version.slice(0, 3).to_f < 400 ? '1' : '2'
      else
        browser_version_major = version.slice(0, 1)
      end
    end

    # The properties are whatever sits between the first "(" and the last ")",
    # separated by "; ". Each one is stripped so that trailing whitespace
    # cannot defeat the exact comparisons below.
    detail = user_agent[/\((.*)\)/, 1].to_s.delete('()')
    properties = detail.split(/;\s+/).map { |property| property.strip }

    # Cycle through the properties to set known quantities.
    properties.each do |property|
      if property =~ /^Win/
        ostype = 'Windows'
        os = property
        release = property.split(/ /, 2)[1]
        if release =~ /^NT/
          nt_version = release.split(/ /, 2)[1]
          os_version = case nt_version
                       when '5' then '2000'
                       when '5.1' then 'XP'
                       else nt_version
                       end
        end
      end
      if property == 'Macintosh'
        ostype = 'Macintosh'
        os = property
      end
      if property =~ /OS X/
        ostype = 'Macintosh'
        os_version = 'OS X'
        os = property
      end
      if property =~ /^Linux/
        ostype = 'Linux'
        os = property
      end
      if property =~ /^MSIE/
        browser = 'MSIE'
        browser_version = property.gsub('MSIE ', '').lstrip
        browser_version_major, browser_version_minor = browser_version.split('.')
      end
    end

    result = {
      :browser => browser,
      :browser_version_major => browser_version_major,
      :browser_version_minor => browser_version_minor,
      :ostype => ostype,
      :os_version => os_version,
      :os => os
    }
    # Normalize: nil and whitespace-only values become nil, the rest is stripped.
    result.each do |key, value|
      stripped = value.nil? ? '' : value.strip
      result[key] = stripped.empty? ? nil : stripped
    end
    result
  end

  # Parse a URI into its components.
  #
  # * <tt>uri_string</tt>: the URI to parse; may be nil
  # * <tt>options</tt>: if options[:prefix] is set it is prepended to every
  #   key of the returned Hash. The options Hash itself is not modified.
  #
  # Returns a Hash with the (prefixed) symbol keys scheme, host, port, uri_path
  # and domain. All five keys are always present. When uri_string is nil or
  # cannot be parsed every value is nil. The domain is the last two
  # dot-separated labels of the host, or nil when the host has no dot.
  def parse_uri(uri_string, options={})
    prefix = options[:prefix] || ''
    scheme_key = "#{prefix}scheme".to_sym
    host_key   = "#{prefix}host".to_sym
    port_key   = "#{prefix}port".to_sym
    path_key   = "#{prefix}uri_path".to_sym
    domain_key = "#{prefix}domain".to_sym

    empty_hash = {
      scheme_key => nil,
      host_key => nil,
      port_key => nil,
      path_key => nil,
      domain_key => nil
    }
    return empty_hash unless uri_string

    # Attempt to parse the URI; if it is not a valid URI fall back to the
    # all-nil Hash instead of raising.
    begin
      uri = URI.parse(uri_string)
      {
        scheme_key => uri.scheme,
        host_key => uri.host,
        port_key => uri.port,
        path_key => uri.path,
        domain_key => uri.host.to_s[/\.?([^\.]+\.[^\.]+$)/, 1]
      }
    rescue StandardError
      empty_hash
    end
  end
end

data/lib/etl/parser/apache_combined_log_parser.rb ADDED Viewed

@@ -0,0 +1,49 @@
+module ETL #:nodoc:
+  module Parser #:nodoc:
+    # Parser which can parser the Apache Combined Log Format as defined at
+    # http://httpd.apache.org/docs/2.2/logs.html
+    class ApacheCombinedLogParser < ETL::Parser::Parser
+      include HttpTools
+      def initialize(source, options={})
+        super
+      end
+      def each
+        Dir.glob(file).each do |file|
+          File.open(file).each_line do |line|
+            yield parse(line)
+          end
+        end
+      end
+      def parse(line)
+        # example line:  127.0.0.1 - frank [10/Oct/2000:13:55:36 -0700] "GET /apache_pb.gif HTTP/1.0" 200 2326 "http://www.example.com/start.html" "Mozilla/4.08 [en] (Win98; I ;Nav)"
+        line =~ /^(\S+)\s(\S+)\s(\S+)\s\[([^\]]*)\]\s"([^"]*)"\s(\d*)\s(\d*)\s"([^"]*)"\s"([^"]*)"$/
+        fields = {
+          :ip_address => $1,
+          :identd => $2,
+          :user => $3,
+          :timestamp => $4,
+          :request => $5,
+          :response_code => $6,
+          :bytes => $7,
+          :referrer => $8,
+          :user_agent => $9,
+        }
+        #fields[:timestamp] =~ r%{(\d\d)/(\w\w\w)/(\d\d\d\d):(\d\d):(\d\d):(\d\d) -(\d\d\d\d)}
+        d = Date._strptime(fields[:timestamp], '%d/%b/%Y:%H:%M:%S') unless fields[:timestamp].nil?
+        fields[:timestamp] = Time.mktime(d[:year], d[:mon], d[:mday], d[:hour], d[:min], d[:sec], d[:sec_fraction]) unless d.nil?
+        fields[:method], fields[:path] = fields[:request].split(/\s/)
+        fields.merge!(parse_user_agent(fields[:user_agent])) unless fields[:user_agent].nil?
+        fields.merge!(parse_uri(fields[:referrer], :prefix => 'referrer_'))
+        fields.each do |key, value|
+          fields[key] = nil if value == '-'
+        end
+      end
+    end
+  end
+end

data/lib/etl/parser/delimited_parser.rb ADDED Viewed

@@ -0,0 +1,74 @@
+module ETL #:nodoc:
+  module Parser #:nodoc:
+    # Parses delimited files
+    class DelimitedParser < ETL::Parser::Parser
+      # Initialize the parser
+      # * <tt>source</tt>: The Source object
+      # * <tt>options</tt>: Hash of options for the parser, defaults to an empty hash
+      def initialize(source, options={})
+        super
+        configure
+      end
+      # Returns each row.
+      def each
+        Dir.glob(file).each do |file|
+          ETL::Engine.logger.debug "parsing #{file}"
+          line = 0
+          lines_skipped = 0
+          FasterCSV.foreach(file, options) do |raw_row|
+            if lines_skipped < source.skip_lines
+              ETL::Engine.logger.debug "skipping line"
+              lines_skipped += 1
+              next
+            end
+            line += 1
+            row = {}
+            validate_row(raw_row, line, file)
+            raw_row.each_with_index do |value, index|
+              f = fields[index]
+              row[f.name] = value
+            end
+            yield row
+          end
+        end
+      end
+      # Get an array of defined fields
+      def fields
+        @fields ||= []
+      end
+      private
+      def validate_row(row, line, file)
+        ETL::Engine.logger.debug "validating line #{line} in file #{file}"
+        if row.length != fields.length
+          raise_with_info( MismatchError,
+            "The number of columns from the source (#{row.length}) does not match the number of columns in the definition (#{fields.length})",
+            line, file
+          )
+        end
+      end
+      def configure
+        source.definition.each do |options|
+          case options
+          when Symbol
+            fields << Field.new(options)
+          when Hash
+            fields << Field.new(options[:name])
+          else
+            raise DefinitionError, "Each field definition must either be a symbol or a hash"
+          end
+        end
+      end
+      class Field #:nodoc:
+        attr_reader :name
+        def initialize(name)
+          @name = name
+        end
+      end
+    end
+  end
+end

data/lib/etl/parser/fixed_width_parser.rb ADDED Viewed

@@ -0,0 +1,65 @@
+module ETL #:nodoc:
+  module Parser #:nodoc:
+    # Parser for fixed with files
+    class FixedWidthParser < ETL::Parser::Parser
+      # Initialize the parser
+      # * <tt>source</tt>: The source object
+      # * <tt>options</tt>: Parser options Hash
+      def initialize(source, options={})
+        super
+        configure
+      end
+      # Return each row
+      def each
+        Dir.glob(file).each do |file|
+          open(file).each do |line|
+            row = {}
+            lines_skipped = 0
+            fields.each do |name, f|
+              if lines_skipped < source.skip_lines
+                lines_skipped += 1
+                next
+              end
+              # TODO make strip optional?
+              row[name] = line[f.field_start, f.field_length].strip
+            end
+            yield row
+          end
+        end
+      end
+      # Return a map of defined fields
+      def fields
+        @fields ||= {}
+      end
+      private
+      def configure
+        source.definition.each do |field, options|
+          fields[field] = FixedWidthField.new(
+            options[:name], options[:start], options[:end], options[:length]
+          )
+        end
+      end
+    end
+    class FixedWidthField #:nodoc:
+      attr_reader :name, :field_start, :field_end, :field_length
+      # Initialize the field.
+      def initialize(name, field_start, field_end=nil, field_length=nil)
+        @name = name
+        @field_start = field_start - 1
+        if field_end
+          @field_end = field_end
+          @field_length = @field_end - @field_start
+        elsif field_length
+          @field_length = field_length
+          @field_end = @field_start + @field_length
+        else
+          raise DefinitionError, "Either field_end or field_length required"
+        end
+      end
+    end
+  end
+end

data/lib/etl/parser/parser.rb ADDED Viewed

@@ -0,0 +1,41 @@
+module ETL #:nodoc:
+  module Parser #:nodoc:
+    # Base parser class. Implementation classes must extend this class and implement
+    # the each method. The each method should return each row of the source data as
+    # a Hash.
+    class Parser
+      include Enumerable
+      class << self
+        # Convert the name (string or symbol) to a parser class.
+        #
+        # Example:
+        #   <tt>class_for_name(:fixed_width)</tt> returns a FixedWidthParser class
+        def class_for_name(name)
+          ETL::Parser.const_get("#{name.to_s.camelize}Parser")
+        end
+      end
+      # The Source object for the data
+      attr_reader :source
+      # Options Hash for the parser
+      attr_reader :options
+      def initialize(source, options={})
+        @source = source
+        @options = options || {}
+      end
+      protected
+      def file
+        path = Pathname.new(source.configuration[:file])
+        path = path.absolute? ? path : Pathname.new(File.dirname(source.control.file)) + path
+        path
+      end
+      def raise_with_info(error, message, file, line)
+        raise error, "#{message} (line #{line} in #{file})"
+      end
+    end
+  end
+end

data/lib/etl/parser/sax_parser.rb ADDED Viewed

@@ -0,0 +1,218 @@
+require 'rexml/parsers/sax2parser'
+require 'rexml/sax2listener'
+module ETL #:nodoc:
+  module Parser #:nodoc:
+    # ETL parser implementation which uses SAX to parse XML files.
+    class SaxParser < ETL::Parser::Parser
+      # The write trigger causes whatever values are currently specified for the row to be returned.
+      # After returning the values will not be cleared, thus allowing for values which are assigned
+      # higher in the XML tree to remain in memory.
+      attr_accessor :write_trigger
+      # Initialize the parser
+      # * <tt>source</tt>: The Source object
+      # * <tt>options</tt>: Parser options Hash
+      def initialize(source, options={})
+        super
+        configure
+      end
+      # Returns each row
+      def each(&block)
+        Dir.glob(file).each do |file|
+          parser = REXML::Parsers::SAX2Parser.new(File.new(file))
+          listener = Listener.new(self, &block)
+          parser.listen(listener)
+          parser.parse
+        end
+      end
+      # Get an array of Field objects
+      def fields
+        @fields ||= []
+      end
+      private
+      def configure
+        #puts "write trigger in source.definition: #{source.definition[:write_trigger]}"
+        self.write_trigger = source.definition[:write_trigger]
+        # map paths to field names
+        source.definition[:fields].each do |name, path|
+          #puts "defined field #{name}, path: #{path}"
+          fields << Field.new(name, XPath::Path.parse(path))
+        end
+      end
+      # Class representing a field to be loaded from the source
+      class Field
+        # The name of the field
+        attr_reader :name
+        # The XPath-like path to the field in the XML document
+        attr_reader :path
+        def initialize(name, path) #:nodoc
+          @name = name
+          @path = path
+        end
+      end
+    end
+    class Listener #:nodoc:
+      include REXML::SAX2Listener
+      def initialize(parser, &block)
+        @parser = parser
+        @row = {}
+        @value = nil
+        @proc = Proc.new(&block)
+      end
+      def cdata(text)
+        @value << text
+      end
+      def characters(text)
+        text = text.strip
+        if (!text.nil? && text != '')
+          @value ||= ''
+          @value << text
+        end
+      end
+      def start_document
+        @path = XPath::Path.new
+      end
+      def end_document
+      end
+      def start_element(uri, localname, qname, attributes)
+        element = XPath::Element.new(localname, attributes)
+        @path.elements << element
+        @parser.fields.each do |field|
+          #puts "#{@path} match? #{field.path}"
+          if @path.match?(field.path)
+            #puts "field.path: #{field.path}"
+            if field.path.is_attribute?
+              #puts "setting @row[#{field.name}] to #{element.attributes[field.path.attribute]}"
+              @row[field.name] = element.attributes[field.path.attribute]
+            end
+          end
+        end
+      end
+      def end_element(uri, localname, qname)
+        element = @path.elements.last
+        @parser.fields.each do |field|
+          #puts "#{@path} match? #{field.path}"
+          if @path.match?(field.path)
+            #puts "field.path: #{field.path}"
+            if !field.path.is_attribute?
+              @row[field.name] = @value
+            end
+          end
+        end
+        #puts @path.to_s
+        if @path.match?(@parser.write_trigger)
+          #puts "matched: #{@path} =~ #{@parser.write_trigger}"
+          #puts "calling proc with #{@row.inspect}"
+          @proc.call(@row.clone)
+        end
+        @value = nil
+        @path.elements.pop
+      end
+      def progress(position)
+        @position = position
+      end
+    end
+    # Module which contains classes that are used for XPath-like filtering
+    # on the SAX parser
+    module XPath #:nodoc:
+      class Path #:nodoc:
+        # Get the elements in the path
+        attr_accessor :elements
+        # Initialize
+        def initialize
+          @elements = []
+        end
+        # Convert to a string representation
+        def to_s
+          @elements.map{ |e| e.to_s }.join("/")
+        end
+        # Returns true if the last part of the path refers to an attribute
+        def is_attribute?
+          elements.last.attributes.length > 0
+        end
+        # Return the name of the attribute referenced by the last element in this path. Returns nil if the last element
+        # does not reference an attribute.
+        #
+        # Warning: the path must only reference a single attribute, otherwise the result of this method will be random,
+        # since attributes are stored in a Hash.
+        def attribute
+          return nil unless is_attribute?
+          elements.last.attributes.keys.first
+        end
+        # Return true if this XPath::Path matches the given path string. This is a fail-fast match, so the first mismatch
+        # will cause the method to return false.
+        def match?(s)
+          path = Path.parse(s)
+          return false unless path.elements.length == elements.length
+          elements.each_with_index do |element, index|
+            path_element = path.elements[index]
+            return false if path_element.nil?
+            return false if element.name != path_element.name
+            path_element.attributes.each do |key, value|
+              return false unless element.attributes[key] =~ value
+            end
+          end
+          return true
+        end
+        # Parse the string into an XPath::Path object
+        def self.parse(s)
+          return s if s.is_a?(Path)
+          path = Path.new
+          parts = s.split('/')
+          parts.each_with_index do |part, i|
+            attributes = {}
+            part.gsub!(/(.*)\[(.*)\]/, '\1')
+            if !$2.nil?
+              $2.split(",").each do |pair|
+                key, value = pair.split("=")
+                value = ".*" if value.nil?
+                attributes[key] = Regexp.new(value)
+              end
+            end
+            path.elements << Element.new(part, attributes)
+          end
+          path
+        end
+      end
+      class Element #:nodoc
+        attr_reader :name
+        attr_reader :attributes
+        def initialize(name, attributes={})
+          @name = name
+          @attributes = attributes
+        end
+        def to_s
+          s = "#{name}"
+          if !@attributes.empty?
+            attr_str = @attributes.collect do |key,value|
+              value = value.source if value.is_a?(Regexp)
+              "#{key}=#{value}"
+            end.join(",")
+            s << "[" + attr_str + "]"
+          end
+          s
+        end
+      end
+    end
+  end
+end