RubyGems - chicago-etl - Versions diffs - 0.0.13 → 0.1.0 - Mend

chicago-etl 0.0.13 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (71) hide show

data/Gemfile +8 -3
data/README.rdoc +4 -1
data/VERSION +1 -1
data/chicago-etl.gemspec +59 -22
data/chicago-flow.gemspec +92 -0
data/lib/chicago/etl/batch.rb +9 -2
data/lib/chicago/etl/core_extensions.rb +12 -0
data/lib/chicago/etl/counter.rb +8 -1
data/lib/chicago/etl/dataset_batch_stage.rb +52 -0
data/lib/chicago/etl/key_builder.rb +17 -39
data/lib/chicago/etl/load_dataset_builder.rb +3 -1
data/lib/chicago/etl/load_pipeline_stage_builder.rb +142 -0
data/lib/chicago/etl/pipeline.rb +151 -0
data/lib/chicago/etl/schema_table_sink_factory.rb +74 -0
data/lib/chicago/etl/screens/column_screen.rb +26 -25
data/lib/chicago/etl/screens/invalid_element.rb +5 -5
data/lib/chicago/etl/screens/missing_value.rb +4 -2
data/lib/chicago/etl/screens/out_of_bounds.rb +2 -0
data/lib/chicago/etl/table_builder.rb +4 -2
data/lib/chicago/etl/task_invocation.rb +0 -1
data/lib/chicago/etl/transformations.rb +128 -0
data/lib/chicago/etl.rb +39 -8
data/lib/chicago/flow/array_sink.rb +35 -0
data/lib/chicago/flow/array_source.rb +15 -0
data/lib/chicago/flow/dataset_source.rb +23 -0
data/lib/chicago/flow/errors.rb +14 -0
data/lib/chicago/flow/filter.rb +15 -0
data/lib/chicago/flow/mysql.rb +4 -0
data/lib/chicago/{etl/mysql_load_file_value_transformer.rb → flow/mysql_file_serializer.rb} +7 -4
data/lib/chicago/flow/mysql_file_sink.rb +68 -0
data/lib/chicago/flow/null_sink.rb +8 -0
data/lib/chicago/flow/pipeline_endpoint.rb +15 -0
data/lib/chicago/flow/pipeline_stage.rb +68 -0
data/lib/chicago/flow/sink.rb +53 -0
data/lib/chicago/flow/transformation.rb +169 -0
data/lib/chicago/flow/transformation_chain.rb +40 -0
data/spec/etl/batch_spec.rb +2 -1
data/spec/etl/core_extensions_spec.rb +13 -0
data/spec/etl/dataset_batch_stage_spec.rb +55 -0
data/spec/etl/key_builder_spec.rb +25 -83
data/spec/etl/pipeline_stage_builder_spec.rb +39 -0
data/spec/etl/schema_table_sink_factory_spec.rb +69 -0
data/spec/etl/screens/invalid_element_spec.rb +10 -11
data/spec/etl/screens/missing_value_spec.rb +21 -21
data/spec/etl/screens/out_of_bounds_spec.rb +21 -29
data/spec/etl/transformations_spec.rb +109 -0
data/spec/flow/array_sink_spec.rb +26 -0
data/spec/flow/array_source_spec.rb +20 -0
data/spec/flow/dataset_source_spec.rb +15 -0
data/spec/flow/filter_spec.rb +13 -0
data/spec/flow/mysql_file_serializer_spec.rb +27 -0
data/spec/flow/mysql_file_sink_spec.rb +94 -0
data/spec/flow/mysql_integration_spec.rb +72 -0
data/spec/flow/pipeline_stage_spec.rb +89 -0
data/spec/flow/transformation_chain_spec.rb +76 -0
data/spec/flow/transformation_spec.rb +91 -0
data/spec/spec_helper.rb +5 -0
metadata +135 -39
data/lib/chicago/etl/buffering_insert_writer.rb +0 -36
data/lib/chicago/etl/mysql_dumpfile.rb +0 -32
data/lib/chicago/etl/screens/composite_screen.rb +0 -17
data/lib/chicago/etl/sequel/load_data_infile.rb +0 -141
data/lib/chicago/etl/sink.rb +0 -61
data/lib/chicago/etl/transformations/add_insert_timestamp.rb +0 -16
data/spec/etl/mysql_dumpfile_spec.rb +0 -42
data/spec/etl/mysql_load_file_value_transformer_spec.rb +0 -27
data/spec/etl/screens/composite_screen_spec.rb +0 -25
data/spec/etl/sequel/load_data_infile_expression_spec.rb +0 -60
data/spec/etl/sequel/load_data_infile_spec.rb +0 -37
data/spec/etl/sink_spec.rb +0 -7
data/spec/etl/transformations/add_insert_timestamp_spec.rb +0 -9

data/lib/chicago/etl/load_pipeline_stage_builder.rb ADDED Viewed

@@ -0,0 +1,142 @@
+module Chicago
+  module ETL
+    # Provides DSL methods for specifying the pipeline in an ETL
+    # stage.
+    #
+    # Clients will not normally instantiate this themselves but use it
+    # in the context of defining an ETL stage.
+    class LoadPipelineStageBuilder
+      # @api private
+      KeyMapping = Struct.new(:table, :field)
+      # The ordering of inbuilt transformation and screening steps.
+      TRANSFORMATION_ORDER = [:before_screens,
+                              :screens,
+                              :after_screens,
+                              :before_keys,
+                              :keys,
+                              :after_keys,
+                              :before_final,
+                              :final,
+                              :after_final
+                             ].freeze
+      # @api private
+      def initialize(db, schema_table)
+        @db = db
+        @schema_table = schema_table
+        @sink_factory = SchemaTableSinkFactory.new(@db, @schema_table)
+      end
+      # @api private
+      def build(&block)
+        @load_separately = []
+        @key_mappings    = []
+        @transformations = {}
+        TRANSFORMATION_ORDER.each {|k| @transformations[k] = [] }
+        @ignore_present_rows = false
+        instance_eval &block
+        add_screens
+        add_key_transforms
+        add_final_transforms
+        pipeline_stage = create_pipeline_stage
+        register_additional_sinks(pipeline_stage)
+        pipeline_stage
+      end
+      protected
+      # Ignore rows already present in the target table, rather than
+      # replacing them.
+      def ignore_present_rows
+        @ignore_present_rows = true
+      end
+      # Specify columns that won't be loaded or screened as part of
+      # this pipeline stage
+      def load_separately(*columns)
+        @load_separately += columns
+      end
+      # Add an additional key mapping.
+      def key_mapping(table, field)
+        @key_mappings << KeyMapping.new(table, field)
+      end
+      # Add a transformation before the specified point in the
+      # transformation chain (defined in TRANSFORMATION_ORDER)
+      def before(point_in_transformation_chain, transform)
+        key = "before_#{point_in_transformation_chain}".to_sym
+        @transformations[key] << transform
+      end
+      # Add a transformation after the specified point in the
+      # transformation chain (defined in TRANSFORMATION_ORDER)
+      def after(point_in_transformation_chain, transform)
+        key = "after_#{point_in_transformation_chain}".to_sym
+        @transformations[key] << transform
+      end
+      private
+      def create_pipeline_stage
+        default = @sink_factory.sink(:ignore => @ignore_present_rows,
+                                     :exclude => @load_separately)
+        key_sink = if @schema_table.kind_of?(Chicago::Schema::Dimension)
+                     @sink_factory.key_sink
+                   else
+                     # Facts have no key table to write to.
+                     Flow::NullSink.new
+                   end
+        Flow::PipelineStage.
+          new(:transformations => concat_transformations,
+              :sinks => {
+                :default => default,
+                :dimension_key => key_sink,
+                :error => @sink_factory.error_sink
+              })
+      end
+      def concat_transformations
+        TRANSFORMATION_ORDER.map {|k| @transformations[k] }.flatten
+      end
+      def register_additional_sinks(pipeline_stage)
+        @key_mappings.each do |mapping|
+          sink = @sink_factory.key_sink(:table => mapping.table)
+          pipeline_stage.register_sink(mapping.table, sink)
+        end
+      end
+      def add_screens
+        columns_to_screen = @schema_table.columns.reject do |column|
+          @load_separately.include?(column.name)
+        end
+        @transformations[:screens] = [Screens::MissingValue,
+                                      Screens::InvalidElement,
+                                      Screens::OutOfBounds].map do |klass|
+          klass.for_columns(columns_to_screen)
+        end.flatten
+      end
+      def add_key_transforms
+        @transformations[:keys] << Transformations::AddKey.
+          new(:key_builder => KeyBuilder.for_table(@schema_table, @db))
+        @key_mappings.each do |mapping|
+          @transformations[:keys] << Transformations::DimensionKeyMapping.
+            new(:original_key => mapping.field, :key_table => mapping.table)
+        end
+      end
+      def add_final_transforms
+        @transformations[:final] << Transformations::WrittenRowFilter.new(:key => :id)
+        @transformations[:final] << Transformations::DemultiplexErrors.new
+      end
+    end
+  end
+end

data/lib/chicago/etl/pipeline.rb ADDED Viewed

@@ -0,0 +1,151 @@
+module Chicago
+  module ETL
+    # An ETL pipeline.
+    class Pipeline
+      # Returns all defined dimension load tasks
+      attr_reader :load_dimensions
+      # Returns all defined fact load tasks
+      attr_reader :load_facts
+      # Creates a pipeline for a Schema.
+      def initialize(db, schema)
+        @schema, @db = schema, db
+        @load_dimensions = Chicago::Schema::NamedElementCollection.new
+        @load_facts = Chicago::Schema::NamedElementCollection.new
+      end
+      # Defines a dimension load stage
+      def define_dimension_load(name, options={}, &block)
+        dimension_name = options[:dimension] || name
+        @load_dimensions << build_stage(name,
+                                        @schema.dimension(dimension_name),
+                                        &block)
+      end
+      # Defines a fact load stage
+      def define_fact_load(name, options={}, &block)
+        fact_name = options[:fact] || name
+        @load_facts << build_stage(name, @schema.fact(fact_name), &block)
+      end
+      # Builds a stage, but does not define it.
+      def build_stage(name, schema_table, &block)
+        DatasetBatchStageBuilder.new(@db, schema_table).build(name, &block)
+      end
+    end
+    # Provides DSL methods for building a DataSetBatchStage.
+    #
+    # Clients shouldn't need to instantiate this directly, but instead
+    # call the protected methods in the context of defining a Pipeline
+    class DatasetBatchStageBuilder
+      # @api private
+      def initialize(db, schema_table)
+        @db, @schema_table = db, schema_table
+      end
+      # @api private
+      def build(name, &block)
+        instance_eval &block
+        unless defined? @pipeline_stage
+          pipeline do
+          end
+        end
+        DatasetBatchStage.new(name, @dataset, @pipeline_stage,
+                              :filter_strategy => @filter_strategy,
+                              :truncate_pre_load => @truncate_pre_load)
+      end
+      protected
+      # Specifies that the sinks should be truncated before loading
+      # data.
+      def truncate_pre_load
+        @truncate_pre_load = true
+      end
+      # Specifies that the dataset should never be filtered to the ETL
+      # batch - i.e. it should behave as if reextract was always true
+      def full_reload
+        @filter_strategy = lambda {|dataset, etl_batch| dataset }
+      end
+      # Define elements of the pipeline. See LoadPipelineStageBuilder
+      # for details.
+      def pipeline(&block)
+        @pipeline_stage = LoadPipelineStageBuilder.new(@db, @schema_table).
+          build(&block)
+      end
+      # Defines the dataset, see DatasetBuilder .
+      #
+      # The block must return a Sequel::Dataset.
+      def dataset(&block)
+        @dataset = DatasetBuilder.new(@db).build(&block)
+      end
+      # Define a custom filter strategy for filtering to an ETL batch.
+      def filter_strategy(&block)
+        @filter_strategy = block
+      end
+    end
+    # Provides convenience methods for defining source datasets.
+    class DatasetBuilder
+      attr_reader :db
+      # @api private
+      def initialize(db)
+        @db = db
+      end
+      # @api private
+      def build(&block)
+        instance_eval(&block)
+      end
+      protected
+      def key_field(field, name)
+        :if[{field => nil}, 1, field].as(name)
+      end
+      # Returns a column for use in a Sequel::Dataset#select method to
+      # return a dimension key.
+      #
+      # Takes care of using the key tables correctly, and dealing with
+      # missing dimension values.
+      def dimension_key(name)
+        key_field("keys_dimension_#{name}__dimension_id".to_sym,
+                  "#{name}_dimension_id".to_sym)
+      end
+      # Returns a column for use in a Sequel::Dataset#select method to
+      # return a date dimension key.
+      def date_dimension_column(dimension)
+        :if.sql_function({:id.qualify(dimension) => nil},
+                         1,
+                         :id.qualify(dimension)).
+          as("#{dimension}_dimension_id".to_sym)
+      end
+      # Rounds a monetary value to 2 decimal places.
+      #
+      # By default, natural rounding is used, you can specify either
+      # :up or :down as the direction.
+      #
+      # @deprecated
+      def round(stmt, direction = :none)
+        case direction
+        when :none
+          :round.sql_function(stmt, 2)
+        when :up
+          :ceil.sql_function(stmt * 100) / 100
+        when :down
+          :floor.sql_function(stmt * 100) / 100
+        end
+      end
+    end
+  end
+end

data/lib/chicago/etl/schema_table_sink_factory.rb ADDED Viewed

@@ -0,0 +1,74 @@
+module Chicago
+  module ETL
+    # Builds Sinks for Dimension & Fact tables.
+    class SchemaTableSinkFactory
+      # Creates a new factory.
+      def initialize(db, schema_table)
+        @db, @schema_table = db, schema_table
+      end
+      # Returns a sink to load data into the MySQL table backing the
+      # schema table.
+      #
+      # Pass an :exclude option if you don't want all columns of the
+      # schema table to be loaded via this sink.
+      def sink(options={})
+        Flow::MysqlFileSink.new(@db,
+                                @schema_table.table_name,
+                                load_columns(options[:exclude]),
+                                mysql_options(options))
+      end
+      # Returns a sink to load data into the MySQL table backing the
+      # key table for a Dimension.
+      #
+      # @option options [Symbol] :table - a custom key table name. The
+      #   schema table's key table name will be used otherwise.
+      def key_sink(options={})
+        table = options.delete(:table) || @schema_table.key_table_name
+        sink = Flow::MysqlFileSink.new(@db,
+                                       table,
+                                       [:original_id, :dimension_id],
+                                       mysql_options(options))
+        sink.truncation_strategy = lambda do
+          # No Op - we want to maintain keys to avoid having to sort
+          # out fact tables.
+        end
+        sink
+      end
+      # Returns a sink to load errors generated in the ETL process.
+      def error_sink(options={})
+        sink = Flow::MysqlFileSink.
+          new(@db, :etl_error_log,
+              [:column, :row_id, :error, :severity, :error_detail], mysql_options(options)).
+          set_constant_values(:table => @schema_table.table_name.to_s,
+                              :process_name => "StandardTransformations",
+                              :process_version => 3,
+                              :logged_at => Time.now)
+        sink.truncation_strategy = lambda do
+          @db[:etl_error_log].
+            where(:table => @schema_table.table_name.to_s).delete
+        end
+        sink
+      end
+      private
+      def load_columns(exclude=nil)
+        exclude = [exclude].compact.flatten
+        [:id] + @schema_table.columns.
+          reject {|c| exclude.include?(c.name) }.
+          map {|c| c.database_name }
+      end
+      def mysql_options(options)
+        [:filepath, :ignore].inject({}) do |hsh, k|
+          hsh[k] = options[k] if options.has_key?(k)
+          hsh
+        end
+      end
+    end
+  end
+end

data/lib/chicago/etl/screens/column_screen.rb CHANGED Viewed

@@ -1,53 +1,54 @@
 module Chicago
   module ETL
     module Screens
-      class ColumnScreen
-        attr_reader :column, :table_name
-        def initialize(table_name, column)
-          @table_name = table_name
-          @column = column
-          @error_name = self.class.name.split('::').last.sub(/Screen$/,'').titlecase
+      # @abstract
+      class ColumnScreen < Flow::Transformation
+        def self.for_columns(columns)
+          columns.map {|column|
+            new(:default, :column => column)
+          }
         end
-        def self.for_columns(table_name, columns)
-          screens = columns.map {|column| new(table_name, column) }
-          CompositeScreen.new(screens)
+        def output_streams
+          [:default, :error]
         end
-        def call(row, errors=[])
-          value = row[column.database_name]
-          if applies?(value)
+        def process_row(row)
+          if applies?(row[column.database_name])
             overwrite_value(row)
-            log_error(value, errors)
+            error_row = error(row[column.database_name])
+            if error_row
+              row[:_errors] ||= []
+              row[:_errors] << error_row
+            end
           end
-          [row, errors]
+          row
         end
         def severity
           1
         end
+        def column
+          @options[:column]
+        end
         private
-        def overwrite_value(row)
-          row[column.database_name] = column.default_value
+        def error_name
+          self.class.name.split('::').last.sub(/Screen$/,'').titlecase
         end
-        def log_error(value, errors)
-          errors << error_hash(value)
+        def overwrite_value(row)
+          row[column.database_name] = column.default_value
         end
-        def error_hash(value)
+        def error(value)
           {
-            :process_name => "StandardTransformations",
-            :process_version => 2,
-            :table => table_name.to_s,
             :column => column.database_name.to_s,
             :severity => severity,
-            :error => @error_name
+            :error => error_name
           }
         end

data/lib/chicago/etl/screens/invalid_element.rb CHANGED Viewed

@@ -1,11 +1,11 @@
 module Chicago
   module ETL
     module Screens
+      # Transformation which checks to see if a field's value is in a
+      # column's elements.
       class InvalidElement < ColumnScreen
-        def self.for_columns(table_name, columns)
-          screens = columns.select(&:elements).
-            map {|column| new(table_name, column) }
-          CompositeScreen.new(screens)
+        def self.for_columns(columns)
+          columns.select(&:elements).map {|column| new(:default, :column => column) }
         end
         def severity
@@ -17,7 +17,7 @@ module Chicago
             !column.elements.map(&:downcase).include?(value.to_s.downcase)
         end
-        def error_hash(value)
+        def error(value)
           super(value).
             merge(:error_detail => "'#{value}' is not a valid value.")
         end

data/lib/chicago/etl/screens/missing_value.rb CHANGED Viewed

@@ -1,14 +1,16 @@
 module Chicago
   module ETL
     module Screens
+      # Screen which checks to see if a field is present in the row if
+      # required.
       class MissingValue < ColumnScreen
         def severity
           column.descriptive? ? 1 : 2
         end
-        def log_error(value, errors)
+        def error(value)
           if ! (column.column_type == :boolean || column.optional?)
-            errors << error_hash(value)
+            super(value)
           end
         end

data/lib/chicago/etl/screens/out_of_bounds.rb CHANGED Viewed

@@ -1,6 +1,8 @@
 module Chicago
   module ETL
     module Screens
+      # Screen which checks to see if a column's value is out of
+      # defined bounds.
       class OutOfBounds < ColumnScreen
         def severity
           2

data/lib/chicago/etl/table_builder.rb CHANGED Viewed

@@ -8,11 +8,13 @@ module Chicago
         new(db).build
       end
-      def initialize(db) # :nodoc:
+      # @api private
+      def initialize(db)
         @db = db
       end
-      def build # :nodoc:
+      # @api private
+      def build
         create_table :etl_batches do
           primary_key :id, :type => :integer, :unsigned => true
           timestamp   :started_at, :null => false, :default => :current_timestamp.sql_function

data/lib/chicago/etl/task_invocation.rb CHANGED Viewed

@@ -1,6 +1,5 @@
 module Chicago
   module ETL
     class TaskInvocation < Sequel::Model
       set_dataset :etl_task_invocations
       many_to_one :batch

data/lib/chicago/etl/transformations.rb ADDED Viewed

@@ -0,0 +1,128 @@
+module Chicago
+  module ETL
+    module Transformations
+      # Filters rows so they only get output once, based on a :key.
+      class WrittenRowFilter < Flow::Transformation
+        requires_options :key
+        def initialize(*args)
+          super(*args)
+          @written_rows = Set.new
+        end
+        def process_row(row)
+          key = row[key_field]
+          # puts "Checking on #{key}"
+          unless @written_rows.include?(key)
+            @written_rows << key
+            row
+          end
+        end
+        def key_field
+          @options[:key]
+        end
+      end
+      # Adds an :id field to a row, based on a KeyBuilder.
+      #
+      # Also adds this id as :row_id to any rows in an embedded
+      # :_errors field.
+      #
+      # Pass the :key_builder option to set the KeyBuilder.
+      class AddKey < Flow::Transformation
+        requires_options :key_builder
+        adds_fields :id
+        def output_streams
+          [:default, :dimension_key]
+        end
+        def process_row(row)
+          key, key_row = key_builder.key(row)
+          row[:id] = key
+          (row[:_errors] || []).each {|e| e[:row_id] = row[:id] }
+          if key_row
+            assign_stream(key_row, :dimension_key)
+            [row, key_row]
+          else
+            row
+          end
+        end
+        def key_builder
+          @options[:key_builder]
+        end
+      end
+      # Removes embedded :_errors and puts them on the error stream.
+      class DemultiplexErrors < Flow::Transformation
+        def output_streams
+          [:default, :error]
+        end
+        def process_row(row)
+          errors = (row.delete(:_errors) || []).each do |e|
+            assign_stream(e, :error)
+          end
+          [row] + errors
+        end
+      end
+      # Removes a field from the row, and creates a row on a
+      # designated key stream
+      class DimensionKeyMapping < Flow::Transformation
+        requires_options :original_key, :key_table
+        def removed_fields
+          [original_key]
+        end
+        def output_streams
+          [:default, key_table]
+        end
+        def process_row(row)
+          key_row = {
+            :original_id => row.delete(original_key),
+            :dimension_id => row[:id]
+          }
+          assign_stream(key_row, key_table)
+          [row, key_row]
+        end
+        def original_key
+          @options[:original_key]
+        end
+        def key_table
+          @options[:key_table]
+        end
+      end
+      # Adds a hash of the specified columns as a field in the row.
+      class HashColumns < Flow::Transformation
+        requires_options :columns
+        def process_row(row)
+          str = hash_columns.map {|c| row[c].to_s }.join
+          row.put(output_field, Digest::MD5.hexdigest(str).upcase)
+        end
+        def added_fields
+          [output_field]
+        end
+        def output_field
+          @options[:output_field] || :hash
+        end
+        def hash_columns
+          @options[:columns]
+        end
+      end
+    end
+  end
+end