chicago-etl 0.0.13 → 0.1.0
- data/Gemfile +8 -3
- data/README.rdoc +4 -1
- data/VERSION +1 -1
- data/chicago-etl.gemspec +59 -22
- data/chicago-flow.gemspec +92 -0
- data/lib/chicago/etl/batch.rb +9 -2
- data/lib/chicago/etl/core_extensions.rb +12 -0
- data/lib/chicago/etl/counter.rb +8 -1
- data/lib/chicago/etl/dataset_batch_stage.rb +52 -0
- data/lib/chicago/etl/key_builder.rb +17 -39
- data/lib/chicago/etl/load_dataset_builder.rb +3 -1
- data/lib/chicago/etl/load_pipeline_stage_builder.rb +142 -0
- data/lib/chicago/etl/pipeline.rb +151 -0
- data/lib/chicago/etl/schema_table_sink_factory.rb +74 -0
- data/lib/chicago/etl/screens/column_screen.rb +26 -25
- data/lib/chicago/etl/screens/invalid_element.rb +5 -5
- data/lib/chicago/etl/screens/missing_value.rb +4 -2
- data/lib/chicago/etl/screens/out_of_bounds.rb +2 -0
- data/lib/chicago/etl/table_builder.rb +4 -2
- data/lib/chicago/etl/task_invocation.rb +0 -1
- data/lib/chicago/etl/transformations.rb +128 -0
- data/lib/chicago/etl.rb +39 -8
- data/lib/chicago/flow/array_sink.rb +35 -0
- data/lib/chicago/flow/array_source.rb +15 -0
- data/lib/chicago/flow/dataset_source.rb +23 -0
- data/lib/chicago/flow/errors.rb +14 -0
- data/lib/chicago/flow/filter.rb +15 -0
- data/lib/chicago/flow/mysql.rb +4 -0
- data/lib/chicago/{etl/mysql_load_file_value_transformer.rb → flow/mysql_file_serializer.rb} +7 -4
- data/lib/chicago/flow/mysql_file_sink.rb +68 -0
- data/lib/chicago/flow/null_sink.rb +8 -0
- data/lib/chicago/flow/pipeline_endpoint.rb +15 -0
- data/lib/chicago/flow/pipeline_stage.rb +68 -0
- data/lib/chicago/flow/sink.rb +53 -0
- data/lib/chicago/flow/transformation.rb +169 -0
- data/lib/chicago/flow/transformation_chain.rb +40 -0
- data/spec/etl/batch_spec.rb +2 -1
- data/spec/etl/core_extensions_spec.rb +13 -0
- data/spec/etl/dataset_batch_stage_spec.rb +55 -0
- data/spec/etl/key_builder_spec.rb +25 -83
- data/spec/etl/pipeline_stage_builder_spec.rb +39 -0
- data/spec/etl/schema_table_sink_factory_spec.rb +69 -0
- data/spec/etl/screens/invalid_element_spec.rb +10 -11
- data/spec/etl/screens/missing_value_spec.rb +21 -21
- data/spec/etl/screens/out_of_bounds_spec.rb +21 -29
- data/spec/etl/transformations_spec.rb +109 -0
- data/spec/flow/array_sink_spec.rb +26 -0
- data/spec/flow/array_source_spec.rb +20 -0
- data/spec/flow/dataset_source_spec.rb +15 -0
- data/spec/flow/filter_spec.rb +13 -0
- data/spec/flow/mysql_file_serializer_spec.rb +27 -0
- data/spec/flow/mysql_file_sink_spec.rb +94 -0
- data/spec/flow/mysql_integration_spec.rb +72 -0
- data/spec/flow/pipeline_stage_spec.rb +89 -0
- data/spec/flow/transformation_chain_spec.rb +76 -0
- data/spec/flow/transformation_spec.rb +91 -0
- data/spec/spec_helper.rb +5 -0
- metadata +135 -39
- data/lib/chicago/etl/buffering_insert_writer.rb +0 -36
- data/lib/chicago/etl/mysql_dumpfile.rb +0 -32
- data/lib/chicago/etl/screens/composite_screen.rb +0 -17
- data/lib/chicago/etl/sequel/load_data_infile.rb +0 -141
- data/lib/chicago/etl/sink.rb +0 -61
- data/lib/chicago/etl/transformations/add_insert_timestamp.rb +0 -16
- data/spec/etl/mysql_dumpfile_spec.rb +0 -42
- data/spec/etl/mysql_load_file_value_transformer_spec.rb +0 -27
- data/spec/etl/screens/composite_screen_spec.rb +0 -25
- data/spec/etl/sequel/load_data_infile_expression_spec.rb +0 -60
- data/spec/etl/sequel/load_data_infile_spec.rb +0 -37
- data/spec/etl/sink_spec.rb +0 -7
- data/spec/etl/transformations/add_insert_timestamp_spec.rb +0 -9
data/lib/chicago/etl.rb
CHANGED

@@ -1,35 +1,66 @@
+if RUBY_VERSION.split(".")[1] < "9"
+  require 'fastercsv'
+  CSV = FasterCSV
+else
+  require 'csv'
+end
+
 require 'sequel'
+require 'chicago/flow/errors'
+require 'chicago/flow/transformation'
+require 'chicago/flow/filter'
+require 'chicago/flow/transformation_chain'
+require 'chicago/flow/pipeline_stage'
+require 'chicago/flow/pipeline_endpoint'
+require 'chicago/flow/array_source'
+require 'chicago/flow/dataset_source'
+require 'chicago/flow/sink'
+require 'chicago/flow/array_sink'
+require 'chicago/flow/null_sink'
+require 'chicago/flow/mysql'
 
+require 'chicago/etl/core_extensions'
 require 'chicago/etl/counter'
 require 'chicago/etl/key_builder'
-require 'chicago/etl/
-require 'chicago/etl/
-require 'chicago/etl/buffering_insert_writer'
-require 'chicago/etl/mysql_dumpfile'
-
+require 'chicago/etl/schema_table_sink_factory'
+require 'chicago/etl/transformations'
 require 'chicago/etl/load_dataset_builder'
+require 'chicago/etl/dataset_batch_stage'
+require 'chicago/etl/load_pipeline_stage_builder'
+require 'chicago/etl/pipeline'
 
 # Sequel Extensions
 require 'chicago/etl/sequel/filter_to_etl_batch'
-require 'chicago/etl/sequel/load_data_infile'
 require 'chicago/etl/sequel/dependant_tables'
 
 # Screens
 require 'chicago/etl/screens/column_screen'
-require 'chicago/etl/screens/composite_screen'
 require 'chicago/etl/screens/missing_value'
 require 'chicago/etl/screens/invalid_element'
 require 'chicago/etl/screens/out_of_bounds'
 
 # Transformations
-require 'chicago/etl/transformations/add_insert_timestamp'
 require 'chicago/etl/transformations/uk_post_code'
 require 'chicago/etl/transformations/uk_post_code_field'
 
 module Chicago
+  # Contains classes related to ETL processing.
   module ETL
     autoload :TableBuilder, 'chicago/etl/table_builder.rb'
     autoload :Batch, 'chicago/etl/batch.rb'
     autoload :TaskInvocation, 'chicago/etl/task_invocation.rb'
+
+    # Executes a pipeline stage in the context of an ETL Batch.
+    #
+    # Task execution status is stored in a database etl task
+    # invocations table - this ensures tasks aren't run more than once
+    # within a batch.
+    def self.execute(stage, etl_batch, reextract, logger)
+      etl_batch.perform_task(:load, stage.name) do
+        logger.debug "Starting loading #{stage.name}"
+        stage.execute(etl_batch, reextract)
+        logger.debug "Finished loading #{stage.name}"
+      end
+    end
   end
 end
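A hedged usage sketch of the new ETL.execute entry point, in Ruby. The `pipeline` object and its `stages` accessor are hypothetical stand-ins; the sketch assumes only what the diff shows: a stage responding to #name and #execute(etl_batch, reextract), and a batch with #perform_task.

    require 'logger'

    etl_batch = Chicago::ETL::Batch.instance.start
    logger    = Logger.new(STDOUT)

    # `pipeline.stages` is hypothetical; any enumerable of stage
    # objects would do. Each stage runs at most once per batch.
    pipeline.stages.each do |stage|
      Chicago::ETL.execute(stage, etl_batch, false, logger)
    end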
data/lib/chicago/flow/array_sink.rb
ADDED

@@ -0,0 +1,35 @@
+module Chicago
+  module Flow
+    # An endpoint that stores rows in an Array.
+    #
+    # @api public
+    class ArraySink < Sink
+      # Returns the array of written rows.
+      attr_reader :data
+
+      # The name of this sink.
+      attr_reader :name
+
+      # Creates an ArraySink.
+      #
+      # Optionally you may pass an array of column names if you wish
+      # to use static validation that the correct columns are written
+      # through the pipeline.
+      def initialize(name, fields=[])
+        @name = name
+        @fields = [fields].flatten
+        @data = []
+      end
+
+      # See Sink#<<
+      def <<(row)
+        @data << row.merge(constant_values)
+      end
+
+      # See Sink#truncate
+      def truncate
+        @data.clear
+      end
+    end
+  end
+end
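ArraySink is handy as an in-memory endpoint, for example in tests. A minimal sketch, assuming only what the class above defines (set_constant_values comes from Sink, shown later in this diff):

    sink = Chicago::Flow::ArraySink.new(:output)
    sink.set_constant_values(:etl_batch_id => 42) # merged into every row
    sink << {:name => "Alice"}
    sink.data     # => [{:name => "Alice", :etl_batch_id => 42}]
    sink.truncate
    sink.data     # => []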
data/lib/chicago/flow/dataset_source.rb
ADDED

@@ -0,0 +1,23 @@
+require 'sequel'
+require 'sequel/fast_columns'
+
+module Chicago
+  module Flow
+    # @api public
+    class DatasetSource < PipelineEndpoint
+      attr_reader :dataset
+
+      def initialize(dataset)
+        @dataset = dataset
+      end
+
+      def each
+        @dataset.each {|row| yield row }
+      end
+
+      def fields
+        @dataset.columns
+      end
+    end
+  end
+end
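DatasetSource adapts a Sequel dataset to the pipeline-endpoint interface. A sketch, assuming a Sequel connection and an illustrative users table:

    DB = Sequel.sqlite # in-memory database; any Sequel adapter works
    DB.create_table(:users) { primary_key :id; String :name }
    DB[:users].insert(:name => "Alice")

    source = Chicago::Flow::DatasetSource.new(DB[:users])
    source.fields               # => [:id, :name], via dataset.columns
    source.each {|row| p row }  # yields {:id => 1, :name => "Alice"}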
data/lib/chicago/flow/filter.rb
ADDED

@@ -0,0 +1,15 @@
+module Chicago
+  module Flow
+    # @api public
+    class Filter < Transformation
+      def initialize(stream=:default, &block)
+        super(stream)
+        @block = block || lambda {|row| false }
+      end
+
+      def process_row(row)
+        row if @block.call(row)
+      end
+    end
+  end
+end
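Filter keeps rows for which the block returns a truthy value; with no block, the default lambda returns false and every row is dropped. A sketch:

    adults = Chicago::Flow::Filter.new {|row| row[:age] >= 18 }
    adults.process(:age => 21)  # => {:age => 21}
    adults.process(:age => 12)  # => nil, so the row is dropped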
data/lib/chicago/{etl/mysql_load_file_value_transformer.rb → flow/mysql_file_serializer.rb}
RENAMED

@@ -1,12 +1,15 @@
+require 'date'
+
 module Chicago
-  module
-
+  module Flow
+    # @api private
+    class MysqlFileSerializer
       # Transforms a value to be suitable for use in file in a LOAD
       # DATA INFILE mysql statement.
-      def
+      def serialize(value)
         case value
         when nil
-          "
+          "NULL"
         when true
           "1"
         when false
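From the visible branches, a usage sketch (the false branch is cut off in this hunk; "0" is an assumption symmetric with the true case):

    serializer = Chicago::Flow::MysqlFileSerializer.new
    serializer.serialize(nil)   # => "NULL"
    serializer.serialize(true)  # => "1"
    serializer.serialize(false) # => "0" (assumed; branch truncated above)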
data/lib/chicago/flow/mysql_file_sink.rb
ADDED

@@ -0,0 +1,68 @@
+require 'sequel'
+require 'sequel/load_data_infile'
+require 'tmpdir'
+
+Sequel.extension :core_extensions
+
+module Chicago
+  module Flow
+    # @api public
+    class MysqlFileSink < Sink
+      attr_reader :filepath
+      attr_writer :truncation_strategy
+
+      def initialize(db, table_name, fields, options = {})
+        @fields = [fields].flatten
+        @filepath = options[:filepath] || temp_file(table_name)
+        @serializer = MysqlFileSerializer.new
+        @db = db
+        @table_name = table_name
+        @insert_ignore = !!options[:ignore]
+      end
+
+      def name
+        @table_name
+      end
+
+      def <<(row)
+        csv << fields.map {|c| @serializer.serialize(row[c]) }
+      end
+
+      def close
+        csv.flush
+        load_from_file(filepath)
+        csv.close
+        File.unlink(filepath) if File.exists?(filepath)
+      end
+
+      # Loads data from the file into the MySQL table via LOAD DATA
+      # INFILE, if the file exists and has content.
+      def load_from_file(file)
+        return unless File.size?(file)
+        dataset.load_csv_infile(file, @fields, :set => constant_values)
+      end
+
+      def truncate
+        if @truncation_strategy
+          @truncation_strategy.call
+        else
+          @db[@table_name].truncate
+        end
+      end
+
+      private
+
+      def dataset
+        @insert_ignore ? @db[@table_name].insert_ignore : @db[@table_name]
+      end
+
+      def csv
+        @csv ||= CSV.open(filepath, "w")
+      end
+
+      def temp_file(table_name)
+        File.join(Dir.tmpdir, "#{table_name}.#{rand(1_000_000)}.csv")
+      end
+    end
+  end
+end
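A hedged usage sketch; DB stands for a Sequel MySQL connection and the users table is illustrative. Rows are buffered to a temporary CSV and bulk-loaded on close:

    sink = Chicago::Flow::MysqlFileSink.new(DB, :users, [:id, :name],
                                            :ignore => true)
    sink << {:id => 1, :name => "Alice"}
    sink << {:id => 2, :name => "Bob"}
    sink.close # flushes the CSV, issues LOAD DATA INFILE, deletes the file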
data/lib/chicago/flow/pipeline_stage.rb
ADDED

@@ -0,0 +1,68 @@
+module Chicago
+  module Flow
+    # Co-ordinates iterating over rows provided by a source, passing
+    # them through a transformation chain before writing them to
+    # sink(s).
+    #
+    # @api public
+    class PipelineStage
+      attr_reader :transformation_chain
+
+      def initialize(options={})
+        @sinks = options[:sinks] || {}
+        @transformations = options[:transformations] || []
+        @error_handler = options[:error_handler] || RaisingErrorHandler.new
+        @transformation_chain = TransformationChain.new(*@transformations)
+      end
+
+      # Returns the named sink, if it exists.
+      def sink(name)
+        @sinks[name.to_sym]
+      end
+
+      def sinks
+        @sinks.values
+      end
+
+      def register_sink(name, sink)
+        @sinks[name.to_sym] = sink
+        self
+      end
+
+      def validate_pipeline
+        unless unregistered_sinks.empty?
+          @error_handler.unregistered_sinks(unregistered_sinks)
+        end
+      end
+
+      def execute(source)
+        validate_pipeline
+        sinks.each(&:open)
+        pipe_rows_to_sinks_from(source)
+        sinks.each(&:close)
+      end
+
+      def required_sinks
+        transformation_chain.output_streams | [:default]
+      end
+
+      def unregistered_sinks
+        required_sinks - @sinks.keys
+      end
+
+      private
+
+      def pipe_rows_to_sinks_from(source)
+        source.each do |row|
+          transformation_chain.process(row).each {|row| process_row(row) }
+        end
+        transformation_chain.flush.each {|row| process_row(row) }
+      end
+
+      def process_row(row)
+        stream = row.delete(:_stream) || :default
+        @sinks[stream] << row
+      end
+    end
+  end
+end
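A sketch wiring a stage end to end. It assumes ArraySource (added in this release, though its body is not shown in this diff) wraps an array of row hashes:

    filter = Chicago::Flow::Filter.new {|row| row[:active] }
    stage  = Chicago::Flow::PipelineStage.new(:transformations => [filter])
    stage.register_sink(:default, Chicago::Flow::ArraySink.new(:out))

    source = Chicago::Flow::ArraySource.new([{:active => true},
                                             {:active => false}])
    stage.execute(source)
    stage.sink(:default).data # => [{:active => true}]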
data/lib/chicago/flow/sink.rb
ADDED

@@ -0,0 +1,53 @@
+module Chicago
+  module Flow
+    # The destination for rows passing through a pipeline stage.
+    #
+    # @api public
+    # @abstract
+    class Sink < PipelineEndpoint
+      # Specifies a hash of values that are assumed to apply to all
+      # rows.
+      #
+      # Subclasses should use these constant values appropriately when
+      # writing rows, by merging them with the row or otherwise
+      # ensuring that they end up in the final source this sink
+      # represents.
+      def constant_values
+        @constant_values ||= {}
+      end
+
+      # Sets a number of constant values.
+      def set_constant_values(hash={})
+        constant_values.merge!(hash)
+        self
+      end
+
+      # Performs any operations before writing rows to this sink.
+      #
+      # By default does nothing; may be overridden by subclasses.
+      def open
+      end
+
+      # Performs any operations after writing rows to this sink.
+      #
+      # By default does nothing; may be overridden by subclasses.
+      def close
+      end
+
+      # Writes a row to this sink.
+      #
+      # By default does nothing; may be overridden by subclasses.
+      def <<(row)
+      end
+
+      # Removes all rows from this sink.
+      #
+      # This includes all rows written prior to this particular
+      # execution of a pipeline stage.
+      #
+      # By default does nothing; should be overridden by subclasses.
+      def truncate
+      end
+    end
+  end
+end
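Concrete sinks only need to override the hooks they care about. A minimal sketch of a custom sink writing JSON lines to an IO (illustrative, not part of the gem):

    require 'json'

    class JsonLinesSink < Chicago::Flow::Sink
      def initialize(io)
        @io = io
      end

      # Merge in constant values, as the contract above asks subclasses to do.
      def <<(row)
        @io.puts row.merge(constant_values).to_json
      end
    end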
data/lib/chicago/flow/transformation.rb
ADDED

@@ -0,0 +1,169 @@
+module Chicago
+  module Flow
+    # The key used to store the stream in the row.
+    #
+    # @api private
+    STREAM = :_stream
+
+    # A base class for row transformations.
+    #
+    # Transformations process hash-like rows by filtering or altering
+    # their contents.
+    #
+    # @api public
+    # @abstract Subclass and add a process_row method
+    class Transformation
+      # Creates the transformation.
+      #
+      # This should not be overridden by subclasses - transformations
+      # that need their own arguments should do so by passing named
+      # options.
+      #
+      # @overload initialize(stream, options)
+      #   Specifies this transformation applies to a specific
+      #   stream. Options are specific to the stream subclass
+      # @overload initialize(options)
+      #   As above, but the stream is assumed to be :default
+      def initialize(*args)
+        stream, options = *args
+        if stream.kind_of?(Hash)
+          @stream = :default
+          @options = stream
+        else
+          @stream = stream || :default
+          @options = options || {}
+        end
+
+        ensure_options_present
+      end
+
+      # Returns the required initialization options for this transformation.
+      def self.required_options
+        @required_options ||= []
+      end
+
+      # Returns the fields added by this transformation.
+      def self.added_fields
+        @added_fields ||= []
+      end
+
+      # Returns the fields removed by this transformation.
+      def self.removed_fields
+        @removed_fields ||= []
+      end
+
+      # Specify which options are required in the constructor of
+      # this transformation.
+      def self.requires_options(*options)
+        required_options.concat options.flatten
+      end
+
+      # Specify which fields are added to the row by this
+      # transformation.
+      def self.adds_fields(*fields)
+        added_fields.concat fields.flatten
+      end
+
+      # Specify which fields are removed from the row by this
+      # transformation.
+      def self.removes_fields(*fields)
+        removed_fields.concat fields.flatten
+      end
+
+      # Returns the required initialization options for this transformation.
+      def required_options
+        self.class.required_options
+      end
+
+      # Returns the fields added by this transformation.
+      def added_fields
+        self.class.added_fields
+      end
+
+      # Returns the fields removed by this transformation.
+      def removed_fields
+        self.class.removed_fields
+      end
+
+      def upstream_fields(fields)
+        ((fields + removed_fields) - added_fields).uniq
+      end
+
+      def downstream_fields(fields)
+        ((fields - removed_fields) + added_fields).uniq
+      end
+
+      # Processes a row if the row is on this transformation's stream.
+      #
+      # This should not be overridden by subclasses; override
+      # process_row instead.
+      #
+      # @return Hash if a single row is returned
+      # @return Array<Hash> if multiple rows need to be returned
+      def process(row)
+        applies_to_stream?(row[STREAM]) ? process_row(row) : row
+      end
+
+      # Returns all remaining rows yet to make their way through the
+      # pipeline.
+      #
+      # This should be overridden by subclasses if the transformation
+      # holds back rows as it does processing (to find the maximum
+      # value in a set of rows for example), to ensure that all rows
+      # are written through the pipeline.
+      #
+      # @return Array<Hash> by default an empty array.
+      def flush
+        []
+      end
+
+      # Returns the streams to which this transformation may write
+      # rows.
+      #
+      # By default, transformations are assumed to write only to the
+      # :default stream. Override this in subclasses as necessary.
+      def output_streams
+        [:default]
+      end
+
+      # Returns true if this transformation should be applied to a row
+      # on the target stream.
+      def applies_to_stream?(target_stream)
+        @stream == :all ||
+          (target_stream.nil? && @stream == :default) ||
+          target_stream == @stream
+      end
+
+      protected
+
+      # Performs transformation on the row.
+      #
+      # By default does nothing; override in subclasses. Subclasses
+      # should return either nil, a Hash-like row or an Array of
+      # Hash-like rows.
+      def process_row(row)
+        row
+      end
+
+      # Assigns the row to a stream.
+      #
+      # Will raise an error if the stream is not declared by
+      # overriding output_streams.
+      def assign_stream(row, stream)
+        raise "Stream not declared" unless stream.nil? || output_streams.include?(stream)
+        row[STREAM] = stream if stream
+        row
+      end
+
+      private
+
+      def ensure_options_present
+        missing_keys = required_options - @options.keys
+
+        unless missing_keys.empty?
+          raise ArgumentError.new("The following options are not supplied: " + missing_keys.join(","))
+        end
+      end
+    end
+  end
+end
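A sketch of a custom transformation using the class-level declarations and assign_stream; DeduplicateKeys is illustrative, not part of the gem:

    class DeduplicateKeys < Chicago::Flow::Transformation
      requires_options :key   # DeduplicateKeys.new with no :key raises
      adds_fields :duplicate

      # Duplicates are routed to their own stream, so a stage using this
      # transformation must register a :duplicates sink as well.
      def output_streams
        [:default, :duplicates]
      end

      def process_row(row)
        @seen ||= {}
        key = row[@options[:key]]
        if @seen[key]
          row[:duplicate] = true
          assign_stream(row, :duplicates)
        else
          @seen[key] = true
          row
        end
      end
    end

    dedupe = DeduplicateKeys.new(:key => :id)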
data/lib/chicago/flow/transformation_chain.rb
ADDED

@@ -0,0 +1,40 @@
+module Chicago
+  module Flow
+    # @api private
+    class TransformationChain
+      def initialize(*transforms)
+        @transforms = transforms
+      end
+
+      def output_streams
+        @transforms.inject([]) {|s, t| s | t.output_streams }
+      end
+
+      def process(row)
+        @transforms.inject([row]) do |rows, transform|
+          process_rows(rows, transform)
+        end
+      end
+
+      def flush
+        @transforms.inject([]) do |rows, transform|
+          process_rows(rows, transform) + transform.flush
+        end
+      end
+
+      def upstream_fields(fields)
+        @transforms.inject(fields) {|acc, t| t.upstream_fields(acc) }
+      end
+
+      def downstream_fields(fields)
+        @transforms.inject(fields) {|acc, t| t.downstream_fields(acc) }
+      end
+
+      private
+
+      def process_rows(rows, transform)
+        rows.map {|row| transform.process(row) }.flatten.compact
+      end
+    end
+  end
+end
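Because process_rows flattens and compacts, transformations that drop rows (returning nil) or fan out (returning arrays) compose transparently. A sketch using Filter from earlier in this diff:

    chain = Chicago::Flow::TransformationChain.
      new(Chicago::Flow::Filter.new {|row| row[:n] > 0 },
          Chicago::Flow::Filter.new {|row| row[:n].even? })

    chain.process(:n => 2)  # => [{:n => 2}]
    chain.process(:n => -2) # => [], dropped by the first filter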
data/spec/etl/batch_spec.rb
CHANGED

@@ -15,7 +15,8 @@ describe Chicago::ETL::Batch do
   end
 
   it "should set the start timestamp of the batch to now when created" do
-    ETL::Batch.instance.start.started_at.to_i
+    (ETL::Batch.instance.start.started_at.to_i - Time.now.to_i).abs.
+      should <= 5
   end
 
   it "should have a state of 'Started' when started" do
data/spec/etl/core_extensions_spec.rb
ADDED

@@ -0,0 +1,13 @@
+require 'spec_helper'
+
+describe Hash do
+  it "should have a put method which returns the hash" do
+    {}.put(:a, 1).should == {:a => 1}
+  end
+
+  it "should have a modify existing method that ignores nil values" do
+    {:a => nil}.modify_existing(:a) {|v| 2 }.should == {:a => nil}
+    {:a => 1}.modify_existing(:a) {|v| 2 }.should == {:a => 2}
+    {}.modify_existing(:a) {|r| 2 }.should == {}
+  end
+end
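From this spec, a minimal sketch of what the Hash extensions in lib/chicago/etl/core_extensions.rb presumably look like (the implementation itself is not shown in this diff):

    class Hash
      # Sets a key and returns the hash, for chained construction.
      def put(key, value)
        self[key] = value
        self
      end

      # Applies the block to the value only when the key holds a
      # non-nil value; otherwise returns the hash unchanged.
      def modify_existing(key)
        put(key, yield(self[key])) if self[key]
        self
      end
    end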