davidrichards-etl 0.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +261 -0
- data/VERSION.yml +4 -0
- data/bin/etl +27 -0
- data/lib/all.rb +4 -0
- data/lib/etl/active_record_loader.rb +50 -0
- data/lib/etl/bucket.rb +148 -0
- data/lib/etl/csv_et.rb +64 -0
- data/lib/etl/etl.rb +273 -0
- data/lib/etl/time_bucket.rb +104 -0
- data/lib/etl/xml_et.rb +6 -0
- data/lib/etl.rb +36 -0
- data/lib/helpers/array.rb +11 -0
- data/lib/helpers/observation.rb +10 -0
- data/lib/helpers/open_struct.rb +18 -0
- data/lib/helpers/string.rb +6 -0
- data/lib/helpers/symbol.rb +6 -0
- data/spec/etl/bucket_spec.rb +112 -0
- data/spec/etl/csv_et_spec.rb +43 -0
- data/spec/etl/etl_spec.rb +237 -0
- data/spec/etl/xml_et_spec.rb +50 -0
- data/spec/etl_spec.rb +16 -0
- data/spec/fixtures/test_file.csv +3 -0
- data/spec/helpers/array_spec.rb +13 -0
- data/spec/helpers/observation_spec.rb +22 -0
- data/spec/helpers/open_struct_spec.rb +25 -0
- data/spec/helpers/string_spec.rb +8 -0
- data/spec/helpers/symbol_spec.rb +7 -0
- data/spec/spec_helper.rb +15 -0
- metadata +106 -0
data/lib/etl/csv_et.rb
ADDED
@@ -0,0 +1,64 @@
|
|
1
|
+
require 'open-uri'
require 'fastercsv'

module CSV

  # Extract and transform for CSV files: in context (as a string), from a
  # local file, or from a remote file. Uses FasterCSV and open-uri.
  #
  # Recognized options (via ETL#options):
  #   :source         => a local filename, a URI, or a raw CSV string
  #   :extract_header => when truthy, the first parsed row is shifted off
  #                      and exposed through #header
  #   :parse_with     => hash of FasterCSV options, merged over the
  #                      defaults (:converters => :all)
  class ET < ETL

    # The header row, populated after transform when :extract_header is set.
    attr_reader :header

    after_transform :get_header_conditionally

    protected

    # Pops the first parsed row into @header when :extract_header is set.
    def get_header_conditionally
      @header = @raw.shift if self.options[:extract_header]
    end

    # Attempts to get a string from a file, a uri, or a string.
    # Raises ArgumentError when no strategy produced any data.
    def extract
      obj = self.options.fetch(:source, nil)
      extract_locally(obj) or extract_remotely(obj) or extract_from_string(obj)
      raise ArgumentError, "Could not determine what #{obj.inspect} was. CSV::ET cannot work with this data." unless @raw
    end

    # Handles local filename cases, reading the contents of the file.
    # Guards the type first: File.exist? raises TypeError for nil and
    # other non-path objects, which would preempt the friendlier
    # ArgumentError raised by #extract.
    def extract_locally(filename)
      @raw = File.read(filename) if filename.is_a?(String) && File.exist?(filename)
      ET.logger.info "Extracted the data from the filesystem" if @raw
      @raw ? true : false
    end

    # Handles remote uri cases, reading the remote resource with open-uri,
    # part of the Standard Library. Returns false instead of raising so
    # extraction can fall through to the next strategy.
    def extract_remotely(uri)
      begin
        open(uri) {|f| @raw = f.read}
        ET.logger.info "Extracted the data from a remote location."
        return true
      rescue
        ET.logger.info "Tested whether #{uri} was a remote resource. Failed to read it."
        return false
      end
    end

    # If this is a string, assumes that the contents of the string are CSV contents.
    def extract_from_string(string)
      @raw = string if string.is_a?(String)
      @raw ? true : false
    end

    # Parses the extracted string (@data) with FasterCSV, staging the
    # parsed rows in @raw for the ETL state machine to promote.
    def transform
      opts = self.options.fetch(:parse_with, {})
      ET.logger.info "Parsing the data with FasterCSV and #{default_csv_opts.merge(opts).inspect}"
      @raw = FCSV.parse(@data, default_csv_opts.merge(opts))
    end

    # :converters => :all coerces numeric and date-like fields.
    def default_csv_opts; {:converters => :all}; end
  end

  # Try this out for size:
  # file = CSV::ET.process(:source => 'http://archive.ics.uci.edu/ml/machine-learning-databases/forest-fires/forestfires.csv')

end
|
data/lib/etl/etl.rb
ADDED
@@ -0,0 +1,273 @@
|
|
1
|
+
# State machine with useful callbacks for getting data (Extract,
# Transform, and Loading data) with some support for re-trying failed
# stages of the process. Raise errors liberally if things go wrong, the
# data is being staged and the process can usually be restarted once the
# issue has been addressed.

class ETL

  # Every state the machine can pass through, in processing order.
  VALID_STATES = [:before_extract, :extract, :after_extract, :before_transform, :transform, :after_transform, :before_load, :load, :after_load, :complete].freeze

  # The subset of states that have callbacks attached to them.
  VALID_CALLBACKS = [:before_extract, :after_extract, :before_transform, :after_transform, :before_load, :after_load, :complete].freeze

  # Because we want to interchange these steps on the queueing system
  if defined?(TeguGears) == 'constant'
    include TeguGears
  end

  # Using ActiveSupports callback system
  include ActiveSupport::Callbacks

  class << self

    # Builds an instance and runs it through the whole process, returning
    # the ETL instance (not the data) so callers can inspect #data, #state.
    def process(options={}, &block)
      etl = new
      etl.process(options, &block)
      etl
    end
    alias :call :process

    # Sets up a logger for the class. Respects inheritance, so a different
    # logger will be created for each ETL subclass.
    # Using the standard log levels here: DEBUG < INFO < WARN < ERROR < FATAL
    def logger

      logger_name = (self.to_s + "_logger").to_sym

      # Find and return the cached logger, if it's setup
      logger = read_inheritable_attribute(logger_name)
      return logger if logger

      # Create a logger. Will configure it here and save it in a moment.
      logger = Log4r::Logger.new(self.to_s)

      # Set my default output format
      format = Log4r::PatternFormatter.new(:pattern => "[%l] %d :: %m")

      # Setup a console logger with our formatting
      console = Log4r::StderrOutputter.new 'console'
      console.level = Log4r::WARN
      console.formatter = format

      # Setup a logger to a file with our formatting
      logfile = Log4r::FileOutputter.new('logfile',
        :filename => self.logger_filename,
        :trunc => false,
        :level => Log4r::DEBUG)
      logfile.formatter = format

      # Tell the logger about both outputs.
      logger.add('console','logfile')

      # Store the logger as an inheritable class attribute
      write_inheritable_attribute(logger_name, logger)

      # Return the logger
      logger
    end

    # First tries to get the cached @@logger_root
    # Second, sets the global @@logger_root unless it is cached. Sets it to
    # the best possible place to locate the logs:
    # 1) where log will be from RAILS_ROOT/vendor/gems/etl
    # 2) where log will be in a Rails model
    # 3) where log will be in a Rails lib
    # 4) in the local directory where ETL is being subclassed
    # Third, uses the subclasses stored logger_root, ignoring all the rest
    # if this is found.
    def logger_root
      @@logger_root ||= case
      when File.exist?(File.dirname(__FILE__) + "/../../../../../log")
        File.expand_path(File.dirname(__FILE__) + "/../../../../../log")
      when File.exist?(File.dirname(__FILE__) + "/../../log")
        File.expand_path(File.dirname(__FILE__) + '/../../log')
      when File.exist?(File.dirname(__FILE__) + "/../log")
        File.expand_path(File.dirname(__FILE__) + '/../log')
      when File.exist?(File.dirname(__FILE__) + "/log")
        File.expand_path(File.dirname(__FILE__) + '/log')
      else
        File.expand_path(File.dirname(__FILE__))
      end
      # The subclass-specific root wins over the global default.
      read_inheritable_attribute(:logger_root) || @@logger_root
    end

    # Sets the logger root for the subclass, and sets it globally if this is
    # set on ETL. So, ETL.logger_root = "some location" sets the logger
    # root for all subclasses. This is useful if a lot of ETL is being done,
    # and it needs to be logged in a non-standard place.
    def logger_root=(value)
      write_inheritable_attribute(:logger_root, value)
      @@logger_root = value if self == ETL
    end

    # Full path of this class's log file, one file per (sub)class.
    def logger_filename
      File.join(self.logger_root, "#{self.to_s}.log")
    end
  end

  # A series of callbacks that make the process quite transparent
  define_callbacks :before_extract, :after_extract, :before_transform, :after_transform, :before_load, :after_load

  def initialize
    @state = :before_extract
  end

  # The state of the transform process
  attr_reader :state

  # The data being worked on, after it has successfully completed an
  # extract, transform, or load process.
  attr_reader :data

  # The data generated on a process that didn't complete.
  attr_reader :raw

  # The options to process with. All your code will have access to these
  # options, so things like:
  #
  # :filename => '...', :destination => '...', :converters => :all
  #
  # would all be useful. Your extract, transform, and load methods
  # plus your callbacks can then extract out the information they need
  # to get the job done.
  attr_reader :options

  # An optional block to process with
  attr_reader :block

  # Working towards a universal workflow driver here. The signature is
  # just a hash and a block. That should work for about anything.
  # Safe to call repeatedly: each stage only runs when the machine is in
  # the matching state, so a failed run can be resumed.
  def process(options={}, &block)
    # Only setup the options the first time, the other times we are re-
    # starting the process.
    @options = options unless @options
    @block = block

    self.class.logger.info "Processing #{self.class.to_s}"
    self.class.logger.info "To re-run this process, run: #{self.show_command}"
    self.class.logger.info "Note: Also pass the same block to #{self.class.to_s}" if block

    etl_callback(:before_extract)

    if @state == :extract
      extract
      @state = :after_extract
    end

    etl_callback(:after_extract)

    # To be sure this is after all after_extract callbacks
    process_raw_data

    etl_callback(:before_transform)

    if @state == :transform
      transform
      @state = :after_transform
    end

    etl_callback(:after_transform)

    # To be sure this is after all after_tranform callbacks
    process_raw_data

    etl_callback(:before_load)

    if @state == :load
      load
      @state = :after_load
    end

    etl_callback(:after_load)
    @state
  end

  # Rewinds the state machine to an earlier state so part of the process
  # can be re-run. Raises if the requested state isn't one of
  # VALID_STATES, or if the machine hasn't reached that state yet.
  def reverse_to(state)
    raise ArgumentError, "State must be one of #{VALID_STATES.inspect}" unless VALID_STATES.include?(state)
    loc = VALID_STATES.index(state)
    # Compare against the CURRENT state: the original check tested the
    # target against a list built from the target itself, which was
    # always true, so the guard could never fire.
    achieved = VALID_STATES[0..VALID_STATES.index(@state)]
    raise "Cannot reverse to a state that hasn't been achieved yet." unless achieved.include?(state)
    @state = state
  end

  protected

  def extract
    # Silently do nothing
  end

  def transform
    # Silently do nothing
  end

  def load
    # Silently do nothing
  end

  # Runs a callback, if there is one defined on the class. Advances the
  # state to the next state. Silently ignores the request if the current
  # state isn't the callback being asked for. In this way, we can just
  # call etl_callback several times, and it will advance from one state to
  # the next.
  def etl_callback(callback)
    return false unless self.state == callback
    run_callbacks(callback)
    advance_from(callback)
  end

  # Advances to the next state, only if we are in a valid state.
  def advance_from(callback)
    raise ArgumentError, "State: #{callback} not recognized" unless VALID_CALLBACKS.include?(callback)
    before_state = @state
    # Each state simply advances to its successor in VALID_STATES;
    # :complete is terminal and stays :complete.
    current = VALID_STATES.index(@state)
    @state = VALID_STATES[[current + 1, VALID_STATES.length - 1].min]
    self.class.logger.info "Advanced from #{before_state} to #{@state}"
  end

  # Promotes staged data (@raw) to finished data (@data) and clears the
  # staging area. Only promotes when something was actually staged: the
  # old `defined?(@raw)` test stayed truthy after `@raw = nil`, so a
  # stage with a no-op transform/load clobbered @data with nil.
  def process_raw_data
    @data = @raw unless @raw.nil?
    @raw = nil
  end

  # The command a user could paste to re-run this process.
  def show_command
    "#{self.class.to_s}.process(#{show_parsed_options})"
  end

  # Renders the options hash as Ruby-ish source. Entries are joined with
  # ", " — the original accumulated them with no separator, producing an
  # unusable command for more than one option.
  def show_parsed_options
    self.options.map do |key, value|
      k = key.is_a?(Symbol) ? ":#{key}" : key.to_s
      v = value.is_a?(Symbol) ? ":#{value}" : value.to_s
      "#{k} => #{v}"
    end.join(", ")
  end

end
|
@@ -0,0 +1,104 @@
|
|
1
|
+
# NOTE(review): everything below is commented-out sketch code shipped
# with the gem — it is never executed. Review notes are inline.
#
# # This keeps the state of all observations in a bucket. An observation
# # is expected to be an OpenStruct with an occured_at field set. An
# # Observation class is provided in the helpers directory and is
# # automatically loaded with this gem. This is setup to work well in the
# # observable pattern.
#
# # Uses
# class TimeCachedOpenStruct
#
#   attr_reader :tick_time
#   attr_reader :keep_for
#   attr_reader :cache
#
#   def initialize(opts={})
#     @tick_time = opts.fetch(:tick_time, 1)
#     @keep_for = opts.fetch(:keep_for, self.tick_time * 100)
#     @cache = ... (hash structure)
#   end
#
#   def at(time)
#     self.cache[index_for(time)]
#   end
#
#   protected
#   def index_for(time)
#     ...
#   end
#
#   def round(time)
#   end
# end
#
# require 'observable'
# class TimeBucket
#
#   include Observer
#
#   class << self
#
#     # Works more like a multiton with subclasses. Each subclass gets their
#     # own instance.
#     def instance(opts={})
#       instance = read_inheritable_attribute(:instance)
#       return instance if instance
#       instance = new(opts)
#       write_inheritable_attribute(:instance, instance)
#       instance
#     end
#   end
#
#   # How often the state is broadcast
#   attr_reader :tick_time
#
#   # How long to wait for messages to be gathered in the bucket. If they
#   # are not gathered by this time, they will never be broadcast.
#   attr_reader :delay_time
#
#   # The actual state data, a OpenStruct-based cache with a time-based
#   # eviction_policy and a time-based accessor:
#   # TimeBucket.bucket.at(time_object)
#   attr_reader :bucket
#
#   def initialize(opts={})
#     @tick_time = opts.fetch(:tick_time, 1)
#     @delay_time = opts.fetch(:delay_time, 0.5)
#     keep_time = self.tick_time * 100 + self.delay_time
#     @bucket = TimeCachedOpenStruct.new(:tick_time => self.tick_time, :keep_for => keep_time)
#   end
#
#   # To be called in its own process:
#   # Process.fork { TimeBucket.instance(...).service(@etl) }
#   # @etl is an object that responds to process and can load the consolidated data.
#   def service(etl)
#     sleep self.delay_time
#     loop do
#       changed
#       # NOTE(review): `self.sleep_time` is not defined anywhere in this
#       # sketch — presumably `self.delay_time` was intended; confirm
#       # before resurrecting this code.
#       notify_observers(self.bucket.at(Time.now - self.sleep_time))
#       sleep self.tick_time
#     end
#   end
#
#   def update(obj)
#     observation = infer_observation(obj)
#     self.bucket.merge!(observation)
#   end
#
#   protected
#   def infer_observation(obj)
#     # NOTE(review): `respond_to?(occured_at)` and `respond_to?(observation)`
#     # are missing symbol literals — as written they would call the methods
#     # rather than ask about them; should be `respond_to?(:occured_at)` etc.
#     # (The `occured_at` spelling matches the gem's Observation helper.)
#     if obj.respond_to?(occured_at)
#       obj
#     elsif obj.respond_to?(observation) and obj.observation.occured_at
#       obj.observation
#     elsif obj.is_a?(OpenStruct)
#       obj.occured_at = Time.now
#       obj
#     elsif obj.is_a?(Hash)
#       observation = Observation.new(obj)
#       observation.occured_at = obj.fetch(:occured_at, Time.now)
#       observation
#     else
#       nil
#     end
#   end
# end
|