chicago-etl 0.1.3 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/VERSION +1 -1
- data/chicago-etl.gemspec +5 -5
- data/lib/chicago/etl/dataset_batch_stage.rb +12 -32
- data/lib/chicago/etl/dataset_builder.rb +60 -0
- data/lib/chicago/etl/pipeline.rb +9 -62
- data/lib/chicago/etl/{load_pipeline_stage_builder.rb → schema_sinks_and_transformations_builder.rb} +17 -15
- data/lib/chicago/etl/stage.rb +39 -34
- data/lib/chicago/etl/stage_builder.rb +5 -5
- data/lib/chicago/etl.rb +4 -5
- data/spec/etl/define_dimension_stage_spec.rb +35 -0
- data/spec/etl/define_stage_spec.rb +1 -21
- data/spec/etl/pipeline_stage_builder_spec.rb +2 -2
- data/spec/etl/stage_spec.rb +40 -0
- data/spec/flow/mysql_integration_spec.rb +15 -11
- metadata +7 -7
- data/lib/chicago/flow/pipeline_stage.rb +0 -68
- data/spec/etl/dataset_batch_stage_spec.rb +0 -55
- data/spec/flow/pipeline_stage_spec.rb +0 -89
    
        data/VERSION
    CHANGED
    
    | @@ -1 +1 @@ | |
| 1 | 
            -
            0.1.3
         | 
| 1 | 
            +
            0.1.4
         | 
    
        data/chicago-etl.gemspec
    CHANGED
    
    | @@ -5,7 +5,7 @@ | |
| 5 5 |  | 
| 6 6 | 
             
            Gem::Specification.new do |s|
         | 
| 7 7 | 
             
              s.name = "chicago-etl"
         | 
| 8 | 
            -
              s.version = "0.1.3"
         | 
| 8 | 
            +
              s.version = "0.1.4"
         | 
| 9 9 |  | 
| 10 10 | 
             
              s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
         | 
| 11 11 | 
             
              s.authors = ["Roland Swingler"]
         | 
| @@ -32,10 +32,11 @@ Gem::Specification.new do |s| | |
| 32 32 | 
             
                "lib/chicago/etl/core_extensions.rb",
         | 
| 33 33 | 
             
                "lib/chicago/etl/counter.rb",
         | 
| 34 34 | 
             
                "lib/chicago/etl/dataset_batch_stage.rb",
         | 
| 35 | 
            +
                "lib/chicago/etl/dataset_builder.rb",
         | 
| 35 36 | 
             
                "lib/chicago/etl/key_builder.rb",
         | 
| 36 37 | 
             
                "lib/chicago/etl/load_dataset_builder.rb",
         | 
| 37 | 
            -
                "lib/chicago/etl/load_pipeline_stage_builder.rb",
         | 
| 38 38 | 
             
                "lib/chicago/etl/pipeline.rb",
         | 
| 39 | 
            +
                "lib/chicago/etl/schema_sinks_and_transformations_builder.rb",
         | 
| 39 40 | 
             
                "lib/chicago/etl/schema_table_sink_factory.rb",
         | 
| 40 41 | 
             
                "lib/chicago/etl/screens/column_screen.rb",
         | 
| 41 42 | 
             
                "lib/chicago/etl/screens/invalid_element.rb",
         | 
| @@ -62,7 +63,6 @@ Gem::Specification.new do |s| | |
| 62 63 | 
             
                "lib/chicago/flow/mysql_file_sink.rb",
         | 
| 63 64 | 
             
                "lib/chicago/flow/null_sink.rb",
         | 
| 64 65 | 
             
                "lib/chicago/flow/pipeline_endpoint.rb",
         | 
| 65 | 
            -
                "lib/chicago/flow/pipeline_stage.rb",
         | 
| 66 66 | 
             
                "lib/chicago/flow/sink.rb",
         | 
| 67 67 | 
             
                "lib/chicago/flow/transformation.rb",
         | 
| 68 68 | 
             
                "lib/chicago/flow/transformation_chain.rb",
         | 
| @@ -70,7 +70,7 @@ Gem::Specification.new do |s| | |
| 70 70 | 
             
                "spec/etl/batch_spec.rb",
         | 
| 71 71 | 
             
                "spec/etl/core_extensions_spec.rb",
         | 
| 72 72 | 
             
                "spec/etl/counter_spec.rb",
         | 
| 73 | 
            -
                "spec/etl/dataset_batch_stage_spec.rb",
         | 
| 73 | 
            +
                "spec/etl/define_dimension_stage_spec.rb",
         | 
| 74 74 | 
             
                "spec/etl/define_stage_spec.rb",
         | 
| 75 75 | 
             
                "spec/etl/etl_batch_id_dataset_filter.rb",
         | 
| 76 76 | 
             
                "spec/etl/key_builder_spec.rb",
         | 
| @@ -82,6 +82,7 @@ Gem::Specification.new do |s| | |
| 82 82 | 
             
                "spec/etl/screens/out_of_bounds_spec.rb",
         | 
| 83 83 | 
             
                "spec/etl/sequel/dependant_tables_spec.rb",
         | 
| 84 84 | 
             
                "spec/etl/sequel/filter_to_etl_batch_spec.rb",
         | 
| 85 | 
            +
                "spec/etl/stage_spec.rb",
         | 
| 85 86 | 
             
                "spec/etl/table_builder_spec.rb",
         | 
| 86 87 | 
             
                "spec/etl/task_spec.rb",
         | 
| 87 88 | 
             
                "spec/etl/transformations/deduplicate_rows_spec.rb",
         | 
| @@ -95,7 +96,6 @@ Gem::Specification.new do |s| | |
| 95 96 | 
             
                "spec/flow/mysql_file_serializer_spec.rb",
         | 
| 96 97 | 
             
                "spec/flow/mysql_file_sink_spec.rb",
         | 
| 97 98 | 
             
                "spec/flow/mysql_integration_spec.rb",
         | 
| 98 | 
            -
                "spec/flow/pipeline_stage_spec.rb",
         | 
| 99 99 | 
             
                "spec/flow/transformation_chain_spec.rb",
         | 
| 100 100 | 
             
                "spec/flow/transformation_spec.rb",
         | 
| 101 101 | 
             
                "spec/spec_helper.rb"
         | 
| @@ -4,48 +4,28 @@ module Chicago | |
| 4 4 | 
             
                #
         | 
| 5 5 | 
             
                # Allows deferring constructing a DatasetSource until extract
         | 
| 6 6 | 
             
                # time, so that it can be filtered to an ETL batch appropriately.
         | 
| 7 | 
            -
                class DatasetBatchStage
         | 
| 7 | 
            +
                class DatasetBatchStage < Stage
         | 
| 8 8 | 
             
                  attr_reader :name
         | 
| 9 9 |  | 
| 10 | 
            -
                  def initialize(name,  | 
| 11 | 
            -
                     | 
| 12 | 
            -
                    @ | 
| 13 | 
            -
             | 
| 14 | 
            -
                    @filter_strategy = options[:filter_strategy] || lambda {|dataset, etl_batch|
         | 
| 15 | 
            -
                      dataset.filter_to_etl_batch(etl_batch)
         | 
| 16 | 
            -
                    }
         | 
| 10 | 
            +
                  def initialize(name, options={})
         | 
| 11 | 
            +
                    super
         | 
| 12 | 
            +
                    @filter_strategy = options[:filter_strategy] ||
         | 
| 13 | 
            +
                      lambda { |dataset, etl_batch| @source.filter_to_etl_batch(etl_batch)}
         | 
| 17 14 | 
             
                    @truncate_pre_load = !!options[:truncate_pre_load]
         | 
| 18 | 
            -
             | 
| 15 | 
            +
                 end
         | 
| 19 16 |  | 
| 20 17 | 
             
                  # Executes this ETL stage.
         | 
| 21 18 | 
             
                  #
         | 
| 22 19 | 
             
                  # Configures the dataset and flows rows into the pipeline.
         | 
| 23 20 | 
             
                  def execute(etl_batch, reextract=false)
         | 
| 24 21 | 
             
                    if @truncate_pre_load
         | 
| 25 | 
            -
                       | 
| 26 | 
            -
                    elsif reextract &&  | 
| 27 | 
            -
                       | 
| 28 | 
            -
                    end
         | 
| 29 | 
            -
             | 
| 30 | 
            -
                    pipeline_stage.execute(source(etl_batch, reextract))
         | 
| 31 | 
            -
                  end
         | 
| 32 | 
            -
             | 
| 33 | 
            -
                  # Returns the pipeline for this stage.
         | 
| 34 | 
            -
                  def pipeline_stage
         | 
| 35 | 
            -
                    @pipeline_stage.sink(:default).
         | 
| 36 | 
            -
                      set_constant_values(:_inserted_at => Time.now)
         | 
| 37 | 
            -
                    @pipeline_stage
         | 
| 38 | 
            -
                  end
         | 
| 39 | 
            -
             | 
| 40 | 
            -
                  # Returns a DatasetSource for the provided dataset filtered to
         | 
| 41 | 
            -
                  # the ETL batch as appropriate.
         | 
| 42 | 
            -
                  def source(etl_batch, reextract=false)
         | 
| 43 | 
            -
                    if reextract
         | 
| 44 | 
            -
                      filtered_dataset = @dataset
         | 
| 45 | 
            -
                    else
         | 
| 46 | 
            -
                      filtered_dataset = @filter_strategy.call(@dataset, etl_batch)
         | 
| 22 | 
            +
                      sinks.each {|sink| sink.truncate }
         | 
| 23 | 
            +
                    elsif reextract && sink(:error)
         | 
| 24 | 
            +
                      sink(:error).truncate
         | 
| 47 25 | 
             
                    end
         | 
| 48 | 
            -
                     | 
| 26 | 
            +
                    
         | 
| 27 | 
            +
                    sink(:default).set_constant_values(:_inserted_at => Time.now)
         | 
| 28 | 
            +
                    super
         | 
| 49 29 | 
             
                  end
         | 
| 50 30 | 
             
                end
         | 
| 51 31 | 
             
              end
         | 
| @@ -0,0 +1,60 @@ | |
| 1 | 
            +
            module Chicago
         | 
| 2 | 
            +
              module ETL
         | 
| 3 | 
            +
                # Provides convenience methods for defining source datasets.
         | 
| 4 | 
            +
                class DatasetBuilder
         | 
| 5 | 
            +
                  attr_reader :db
         | 
| 6 | 
            +
             | 
| 7 | 
            +
                  # @api private
         | 
| 8 | 
            +
                  def initialize(db)
         | 
| 9 | 
            +
                    @db = db
         | 
| 10 | 
            +
                  end
         | 
| 11 | 
            +
             | 
| 12 | 
            +
                  # @api private
         | 
| 13 | 
            +
                  def build(&block)
         | 
| 14 | 
            +
                    instance_eval(&block)
         | 
| 15 | 
            +
                  end
         | 
| 16 | 
            +
             | 
| 17 | 
            +
                  protected
         | 
| 18 | 
            +
                  
         | 
| 19 | 
            +
                  def key_field(field, name)
         | 
| 20 | 
            +
                    :if[{field => nil}, 1, field].as(name)
         | 
| 21 | 
            +
                  end
         | 
| 22 | 
            +
                  
         | 
| 23 | 
            +
                  # Returns a column for use in a Sequel::Dataset#select method to
         | 
| 24 | 
            +
                  # return a dimension key.
         | 
| 25 | 
            +
                  #
         | 
| 26 | 
            +
                  # Takes care of using the key tables correctly, and dealing with
         | 
| 27 | 
            +
                  # missing dimension values.
         | 
| 28 | 
            +
                  def dimension_key(name)
         | 
| 29 | 
            +
                    key_field("keys_dimension_#{name}__dimension_id".to_sym,
         | 
| 30 | 
            +
                              "#{name}_dimension_id".to_sym)
         | 
| 31 | 
            +
                  end
         | 
| 32 | 
            +
             | 
| 33 | 
            +
                  # Returns a column for use in a Sequel::Dataset#select method to
         | 
| 34 | 
            +
                  # return a date dimension key.
         | 
| 35 | 
            +
                  def date_dimension_column(dimension)
         | 
| 36 | 
            +
                    :if.sql_function({:id.qualify(dimension) => nil},
         | 
| 37 | 
            +
                                     1, 
         | 
| 38 | 
            +
                                     :id.qualify(dimension)).
         | 
| 39 | 
            +
                      as("#{dimension}_dimension_id".to_sym)
         | 
| 40 | 
            +
                  end
         | 
| 41 | 
            +
             | 
| 42 | 
            +
                  # Rounds a monetary value to 2 decimal places.
         | 
| 43 | 
            +
                  #
         | 
| 44 | 
            +
                  # By default, natural rounding is used, you can specify either
         | 
| 45 | 
            +
                  # :up or :down as the direction.
         | 
| 46 | 
            +
                  #
         | 
| 47 | 
            +
                  # @deprecated
         | 
| 48 | 
            +
                  def round(stmt, direction = :none)
         | 
| 49 | 
            +
                    case direction
         | 
| 50 | 
            +
                    when :none
         | 
| 51 | 
            +
                      :round.sql_function(stmt, 2)
         | 
| 52 | 
            +
                    when :up
         | 
| 53 | 
            +
                      :ceil.sql_function(stmt * 100) / 100
         | 
| 54 | 
            +
                    when :down
         | 
| 55 | 
            +
                      :floor.sql_function(stmt * 100) / 100
         | 
| 56 | 
            +
                    end
         | 
| 57 | 
            +
                  end
         | 
| 58 | 
            +
                end
         | 
| 59 | 
            +
              end
         | 
| 60 | 
            +
            end
         | 
    
        data/lib/chicago/etl/pipeline.rb
    CHANGED
    
    | @@ -61,13 +61,17 @@ module Chicago | |
| 61 61 | 
             
                  # @api private
         | 
| 62 62 | 
             
                  def build(name, &block)
         | 
| 63 63 | 
             
                    instance_eval &block
         | 
| 64 | 
            -
                    unless defined? @ | 
| 64 | 
            +
                    unless defined? @sinks_and_transformations
         | 
| 65 65 | 
             
                      pipeline do
         | 
| 66 66 | 
             
                      end
         | 
| 67 67 | 
             
                    end
         | 
| 68 | 
            -
             | 
| 69 | 
            -
             | 
| 70 | 
            -
             | 
| 68 | 
            +
                     DatasetBatchStage.new(name,
         | 
| 69 | 
            +
                                           :source => @dataset, 
         | 
| 70 | 
            +
                                           :transformations => @sinks_and_transformations[:transformations],
         | 
| 71 | 
            +
                                           :sinks => @sinks_and_transformations[:sinks],
         | 
| 72 | 
            +
                                           :filter_strategy => @filter_strategy,
         | 
| 73 | 
            +
                                           :truncate_pre_load => @truncate_pre_load)
         | 
| 74 | 
            +
             | 
| 71 75 | 
             
                  end
         | 
| 72 76 |  | 
| 73 77 | 
             
                  protected
         | 
| @@ -88,7 +92,7 @@ module Chicago | |
| 88 92 | 
             
                  # for details.
         | 
| 89 93 | 
             
                  # TODO: rename pipeline => transforms below this method
         | 
| 90 94 | 
             
                  def pipeline(&block)
         | 
| 91 | 
            -
                    @ | 
| 95 | 
            +
                    @sinks_and_transformations = SchemaSinksAndTransformationsBuilder.new(@db, @schema_table).
         | 
| 92 96 | 
             
                      build(&block)
         | 
| 93 97 | 
             
                  end
         | 
| 94 98 |  | 
| @@ -106,62 +110,5 @@ module Chicago | |
| 106 110 | 
             
                    @filter_strategy = block
         | 
| 107 111 | 
             
                  end
         | 
| 108 112 | 
             
                end
         | 
| 109 | 
            -
             | 
| 110 | 
            -
                # Provides convenience methods for defining source datasets.
         | 
| 111 | 
            -
                class DatasetBuilder
         | 
| 112 | 
            -
                  attr_reader :db
         | 
| 113 | 
            -
             | 
| 114 | 
            -
                  # @api private
         | 
| 115 | 
            -
                  def initialize(db)
         | 
| 116 | 
            -
                    @db = db
         | 
| 117 | 
            -
                  end
         | 
| 118 | 
            -
             | 
| 119 | 
            -
                  # @api private
         | 
| 120 | 
            -
                  def build(&block)
         | 
| 121 | 
            -
                    instance_eval(&block)
         | 
| 122 | 
            -
                  end
         | 
| 123 | 
            -
             | 
| 124 | 
            -
                  protected
         | 
| 125 | 
            -
                  
         | 
| 126 | 
            -
                  def key_field(field, name)
         | 
| 127 | 
            -
                    :if[{field => nil}, 1, field].as(name)
         | 
| 128 | 
            -
                  end
         | 
| 129 | 
            -
                  
         | 
| 130 | 
            -
                  # Returns a column for use in a Sequel::Dataset#select method to
         | 
| 131 | 
            -
                  # return a dimension key.
         | 
| 132 | 
            -
                  #
         | 
| 133 | 
            -
                  # Takes care of using the key tables correctly, and dealing with
         | 
| 134 | 
            -
                  # missing dimension values.
         | 
| 135 | 
            -
                  def dimension_key(name)
         | 
| 136 | 
            -
                    key_field("keys_dimension_#{name}__dimension_id".to_sym,
         | 
| 137 | 
            -
                              "#{name}_dimension_id".to_sym)
         | 
| 138 | 
            -
                  end
         | 
| 139 | 
            -
             | 
| 140 | 
            -
                  # Returns a column for use in a Sequel::Dataset#select method to
         | 
| 141 | 
            -
                  # return a date dimension key.
         | 
| 142 | 
            -
                  def date_dimension_column(dimension)
         | 
| 143 | 
            -
                    :if.sql_function({:id.qualify(dimension) => nil},
         | 
| 144 | 
            -
                                     1, 
         | 
| 145 | 
            -
                                     :id.qualify(dimension)).
         | 
| 146 | 
            -
                      as("#{dimension}_dimension_id".to_sym)
         | 
| 147 | 
            -
                  end
         | 
| 148 | 
            -
             | 
| 149 | 
            -
                  # Rounds a monetary value to 2 decimal places.
         | 
| 150 | 
            -
                  #
         | 
| 151 | 
            -
                  # By default, natural rounding is used, you can specify either
         | 
| 152 | 
            -
                  # :up or :down as the direction.
         | 
| 153 | 
            -
                  #
         | 
| 154 | 
            -
                  # @deprecated
         | 
| 155 | 
            -
                  def round(stmt, direction = :none)
         | 
| 156 | 
            -
                    case direction
         | 
| 157 | 
            -
                    when :none
         | 
| 158 | 
            -
                      :round.sql_function(stmt, 2)
         | 
| 159 | 
            -
                    when :up
         | 
| 160 | 
            -
                      :ceil.sql_function(stmt * 100) / 100
         | 
| 161 | 
            -
                    when :down
         | 
| 162 | 
            -
                      :floor.sql_function(stmt * 100) / 100
         | 
| 163 | 
            -
                    end
         | 
| 164 | 
            -
                  end
         | 
| 165 | 
            -
                end
         | 
| 166 113 | 
             
              end
         | 
| 167 114 | 
             
            end
         | 
    
        data/lib/chicago/etl/{load_pipeline_stage_builder.rb → schema_sinks_and_transformations_builder.rb}
    RENAMED
    
    | @@ -5,7 +5,7 @@ module Chicago | |
| 5 5 | 
             
                #
         | 
| 6 6 | 
             
                # Clients will not normally instantiate this themselves but use it
         | 
| 7 7 | 
             
                # in the context of defining an ETL stage.
         | 
| 8 | 
            -
                class LoadPipelineStageBuilder
         | 
| 8 | 
            +
                class SchemaSinksAndTransformationsBuilder
         | 
| 9 9 | 
             
                  # @api private
         | 
| 10 10 | 
             
                  KeyMapping = Struct.new(:table, :field)
         | 
| 11 11 |  | 
| @@ -41,9 +41,9 @@ module Chicago | |
| 41 41 | 
             
                    add_screens
         | 
| 42 42 | 
             
                    add_key_transforms
         | 
| 43 43 | 
             
                    add_final_transforms
         | 
| 44 | 
            -
                     | 
| 45 | 
            -
                    register_additional_sinks( | 
| 46 | 
            -
                     | 
| 44 | 
            +
                    sinks_and_transformations = create_sinks_and_transformations
         | 
| 45 | 
            +
                    register_additional_sinks(sinks_and_transformations)
         | 
| 46 | 
            +
                    sinks_and_transformations
         | 
| 47 47 | 
             
                  end
         | 
| 48 48 |  | 
| 49 49 | 
             
                  protected
         | 
| @@ -81,7 +81,7 @@ module Chicago | |
| 81 81 |  | 
| 82 82 | 
             
                  private
         | 
| 83 83 |  | 
| 84 | 
            -
                  def  | 
| 84 | 
            +
                  def create_sinks_and_transformations
         | 
| 85 85 | 
             
                    default = @sink_factory.sink(:ignore => @ignore_present_rows,
         | 
| 86 86 | 
             
                                                 :exclude => @load_separately)
         | 
| 87 87 | 
             
                    key_sink = if @schema_table.kind_of?(Chicago::Schema::Dimension)
         | 
| @@ -90,24 +90,26 @@ module Chicago | |
| 90 90 | 
             
                                 # Facts have no key table to write to.
         | 
| 91 91 | 
             
                                 Flow::NullSink.new
         | 
| 92 92 | 
             
                               end
         | 
| 93 | 
            -
             | 
| 94 | 
            -
                     | 
| 95 | 
            -
                       | 
| 96 | 
            -
             | 
| 97 | 
            -
             | 
| 98 | 
            -
             | 
| 99 | 
            -
             | 
| 100 | 
            -
             | 
| 93 | 
            +
             | 
| 94 | 
            +
                    {
         | 
| 95 | 
            +
                      :transformations => concat_transformations,
         | 
| 96 | 
            +
                      :sinks => {
         | 
| 97 | 
            +
                        :default => default,
         | 
| 98 | 
            +
                        :dimension_key => key_sink,
         | 
| 99 | 
            +
                        :error => @sink_factory.error_sink
         | 
| 100 | 
            +
                      }
         | 
| 101 | 
            +
                    }
         | 
| 101 102 | 
             
                  end
         | 
| 102 103 |  | 
| 103 104 | 
             
                  def concat_transformations
         | 
| 104 105 | 
             
                    TRANSFORMATION_ORDER.map {|k| @transformations[k] }.flatten
         | 
| 105 106 | 
             
                  end
         | 
| 106 107 |  | 
| 107 | 
            -
                  def register_additional_sinks( | 
| 108 | 
            +
                  def register_additional_sinks(sinks_and_transformations)
         | 
| 109 | 
            +
                    sinks = sinks_and_transformations[:sinks]
         | 
| 108 110 | 
             
                    @key_mappings.each do |mapping|
         | 
| 109 111 | 
             
                      sink = @sink_factory.key_sink(:table => mapping.table)
         | 
| 110 | 
            -
                       | 
| 112 | 
            +
                      sinks[mapping.table] = sink
         | 
| 111 113 | 
             
                    end
         | 
| 112 114 | 
             
                  end
         | 
| 113 115 |  | 
    
        data/lib/chicago/etl/stage.rb
    CHANGED
    
    | @@ -1,42 +1,30 @@ | |
| 1 1 | 
             
            module Chicago
         | 
| 2 2 | 
             
              module ETL
         | 
| 3 | 
            +
                # A Stage in the ETL pipeline.
         | 
| 4 | 
            +
                #
         | 
| 5 | 
            +
                # A Stage wires together a Source, 0 or more Transformations and 1
         | 
| 6 | 
            +
                # or more Sinks.
         | 
| 3 7 | 
             
                class Stage
         | 
| 8 | 
            +
                  # Returns the source for this stage.
         | 
| 9 | 
            +
                  attr_reader :source
         | 
| 10 | 
            +
                  
         | 
| 11 | 
            +
                  # Returns the name of this stage.
         | 
| 4 12 | 
             
                  attr_reader :name
         | 
| 5 13 |  | 
| 6 14 | 
             
                  def initialize(name, options={})
         | 
| 7 15 | 
             
                    @name = name
         | 
| 8 | 
            -
                    @source = options | 
| 9 | 
            -
                     | 
| 10 | 
            -
             | 
| 11 | 
            -
                    @sinks = options.fetch(:sinks)
         | 
| 12 | 
            -
                    raise ArgumentError, "Stage #{name} requires at least one sink" if @sinks.empty?
         | 
| 13 | 
            -
             | 
| 14 | 
            -
                    @transformations = options.fetch(:transformations)
         | 
| 15 | 
            -
                    @transformation_chain = Chicago::Flow::TransformationChain.
         | 
| 16 | 
            -
                      new(*@transformations)
         | 
| 17 | 
            -
             | 
| 16 | 
            +
                    @source = options[:source]
         | 
| 17 | 
            +
                    @sinks = options[:sinks]
         | 
| 18 | 
            +
                    @transformations = options[:transformations] || []
         | 
| 18 19 | 
             
                    @filter_strategy = options[:filter_strategy] || 
         | 
| 19 20 | 
             
                      lambda {|source, _| source }
         | 
| 20 | 
            -
                  end
         | 
| 21 21 |  | 
| 22 | 
            -
             | 
| 23 | 
            -
                    modified_source = reextract_and_filter_source(@source, etl_batch, reextract)
         | 
| 24 | 
            -
                    transform_and_load_from(modified_source)
         | 
| 25 | 
            -
                  end
         | 
| 26 | 
            -
                  
         | 
| 27 | 
            -
                  def transform_and_load_from(source)
         | 
| 22 | 
            +
                    validate_arguments
         | 
| 28 23 | 
             
                  end
         | 
| 29 24 |  | 
| 30 | 
            -
                  def  | 
| 31 | 
            -
                     | 
| 32 | 
            -
                      filtered_dataset = source
         | 
| 33 | 
            -
                    else
         | 
| 34 | 
            -
                      filtered_dataset = @filter_strategy.call(source, etl_batch)
         | 
| 35 | 
            -
                    end
         | 
| 36 | 
            -
                    Chicago::Flow::DatasetSource.new(filtered_dataset)
         | 
| 25 | 
            +
                  def execute(etl_batch, reextract=false)
         | 
| 26 | 
            +
                    transform_and_load filtered_source(etl_batch, reextract)
         | 
| 37 27 | 
             
                  end
         | 
| 38 | 
            -
             | 
| 39 | 
            -
                      attr_reader :transformation_chain
         | 
| 40 28 |  | 
| 41 29 | 
             
                  # Returns the named sink, if it exists
         | 
| 42 30 | 
             
                  def sink(name)
         | 
| @@ -46,20 +34,22 @@ module Chicago | |
| 46 34 | 
             
                  def sinks
         | 
| 47 35 | 
             
                    @sinks.values
         | 
| 48 36 | 
             
                  end
         | 
| 37 | 
            +
                  
         | 
| 38 | 
            +
                  def filtered_source(etl_batch, reextract=false)
         | 
| 39 | 
            +
                    filtered_dataset = reextract ? source : 
         | 
| 40 | 
            +
                      @filter_strategy.call(source, etl_batch)
         | 
| 49 41 |  | 
| 50 | 
            -
             | 
| 51 | 
            -
                    @sinks[name.to_sym] = sink
         | 
| 52 | 
            -
                    self
         | 
| 42 | 
            +
                    Chicago::Flow::DatasetSource.new(filtered_dataset)
         | 
| 53 43 | 
             
                  end
         | 
| 54 | 
            -
             | 
| 55 | 
            -
                   | 
| 44 | 
            +
             | 
| 45 | 
            +
                  private
         | 
| 46 | 
            +
             | 
| 47 | 
            +
                  def transform_and_load(source)
         | 
| 56 48 | 
             
                    sinks.each(&:open)
         | 
| 57 49 | 
             
                    pipe_rows_to_sinks_from(source)
         | 
| 58 50 | 
             
                    sinks.each(&:close)
         | 
| 59 51 | 
             
                  end
         | 
| 60 | 
            -
             | 
| 61 | 
            -
                  private
         | 
| 62 | 
            -
                  
         | 
| 52 | 
            +
             | 
| 63 53 | 
             
                  def pipe_rows_to_sinks_from(source)
         | 
| 64 54 | 
             
                    source.each do |row|
         | 
| 65 55 | 
             
                      transformation_chain.process(row).each {|row| process_row(row) }
         | 
| @@ -67,10 +57,25 @@ module Chicago | |
| 67 57 | 
             
                    transformation_chain.flush.each {|row| process_row(row) }
         | 
| 68 58 | 
             
                  end
         | 
| 69 59 |  | 
| 60 | 
            +
                  def transformation_chain
         | 
| 61 | 
            +
                    @transformation_chain ||= Chicago::Flow::TransformationChain.
         | 
| 62 | 
            +
                      new(*@transformations)
         | 
| 63 | 
            +
                  end
         | 
| 64 | 
            +
             | 
| 70 65 | 
             
                  def process_row(row)
         | 
| 71 66 | 
             
                    stream = row.delete(:_stream) || :default
         | 
| 72 67 | 
             
                    @sinks[stream] << row
         | 
| 73 68 | 
             
                  end
         | 
| 69 | 
            +
             | 
| 70 | 
            +
                  def validate_arguments
         | 
| 71 | 
            +
                    if @source.nil?
         | 
| 72 | 
            +
                      raise ArgumentError, "Stage #{@name} requires a source"
         | 
| 73 | 
            +
                    end
         | 
| 74 | 
            +
             | 
| 75 | 
            +
                    if @sinks.blank?
         | 
| 76 | 
            +
                      raise ArgumentError, "Stage #{@name} requires at least one sink"
         | 
| 77 | 
            +
                    end
         | 
| 78 | 
            +
                  end
         | 
| 74 79 | 
             
                end
         | 
| 75 80 | 
             
              end
         | 
| 76 81 | 
             
            end
         | 
| @@ -1,8 +1,6 @@ | |
| 1 1 | 
             
            module Chicago
         | 
| 2 2 | 
             
              module ETL
         | 
| 3 3 | 
             
                class StageBuilder
         | 
| 4 | 
            -
                  attr_reader :sink_factory
         | 
| 5 | 
            -
             | 
| 6 4 | 
             
                  def initialize(db)
         | 
| 7 5 | 
             
                    @db = db
         | 
| 8 6 | 
             
                  end
         | 
| @@ -20,15 +18,17 @@ module Chicago | |
| 20 18 | 
             
                              :filter_strategy => @filter_strategy)
         | 
| 21 19 | 
             
                  end
         | 
| 22 20 |  | 
| 21 | 
            +
                  protected
         | 
| 22 | 
            +
             | 
| 23 23 | 
             
                  def source(&block)
         | 
| 24 24 | 
             
                    @dataset = DatasetBuilder.new(@db).build(&block)
         | 
| 25 25 | 
             
                  end
         | 
| 26 26 |  | 
| 27 | 
            -
                  def transformations( | 
| 28 | 
            -
                    @transformations =  | 
| 27 | 
            +
                  def transformations(&block)
         | 
| 28 | 
            +
                    @transformations = TransformationBuilder.new.build(&block)
         | 
| 29 29 | 
             
                  end
         | 
| 30 30 |  | 
| 31 | 
            -
                  def sinks( | 
| 31 | 
            +
                  def sinks(&block)
         | 
| 32 32 | 
             
                    @sinks = SinkBuilder.new.build(&block)
         | 
| 33 33 | 
             
                  end
         | 
| 34 34 |  | 
    
        data/lib/chicago/etl.rb
    CHANGED
    
    | @@ -10,7 +10,6 @@ require 'chicago/flow/errors' | |
| 10 10 | 
             
            require 'chicago/flow/transformation'
         | 
| 11 11 | 
             
            require 'chicago/flow/filter'
         | 
| 12 12 | 
             
            require 'chicago/flow/transformation_chain'
         | 
| 13 | 
            -
            require 'chicago/flow/pipeline_stage'
         | 
| 14 13 | 
             
            require 'chicago/flow/pipeline_endpoint'
         | 
| 15 14 | 
             
            require 'chicago/flow/array_source'
         | 
| 16 15 | 
             
            require 'chicago/flow/dataset_source'
         | 
| @@ -25,12 +24,12 @@ require 'chicago/etl/key_builder' | |
| 25 24 | 
             
            require 'chicago/etl/schema_table_sink_factory'
         | 
| 26 25 | 
             
            require 'chicago/etl/transformations'
         | 
| 27 26 | 
             
            require 'chicago/etl/load_dataset_builder'
         | 
| 28 | 
            -
            require 'chicago/etl/ | 
| 29 | 
            -
            require 'chicago/etl/load_pipeline_stage_builder'
         | 
| 30 | 
            -
            require 'chicago/etl/pipeline'
         | 
| 31 | 
            -
             | 
| 27 | 
            +
            require 'chicago/etl/dataset_builder'
         | 
| 32 28 | 
             
            require 'chicago/etl/stage'
         | 
| 33 29 | 
             
            require 'chicago/etl/stage_builder'
         | 
| 30 | 
            +
            require 'chicago/etl/dataset_batch_stage'
         | 
| 31 | 
            +
            require 'chicago/etl/schema_sinks_and_transformations_builder'
         | 
| 32 | 
            +
            require 'chicago/etl/pipeline'
         | 
| 34 33 |  | 
| 35 34 | 
             
            # Sequel Extensions
         | 
| 36 35 | 
             
            require 'chicago/etl/sequel/filter_to_etl_batch'
         | 
| @@ -0,0 +1,35 @@ | |
| 1 | 
            +
            require 'spec_helper'
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            describe "creating and running a dimension stage" do
         | 
| 4 | 
            +
              let(:rows) { [{:some_field => "value"}] } 
         | 
| 5 | 
            +
              let(:db) { double(:db).as_null_object }
         | 
| 6 | 
            +
              let(:schema) { 
         | 
| 7 | 
            +
                schema = Chicago::StarSchema.new
         | 
| 8 | 
            +
             | 
| 9 | 
            +
                schema.define_dimension(:test) do
         | 
| 10 | 
            +
                  columns do
         | 
| 11 | 
            +
                    string :foo
         | 
| 12 | 
            +
                  end
         | 
| 13 | 
            +
                end
         | 
| 14 | 
            +
             | 
| 15 | 
            +
                schema
         | 
| 16 | 
            +
              }
         | 
| 17 | 
            +
             | 
| 18 | 
            +
              let(:pipeline) { Chicago::ETL::Pipeline.new(db, schema)}
         | 
| 19 | 
            +
             | 
| 20 | 
            +
              it "glues the source, transformations, and sink correctly" do
         | 
| 21 | 
            +
                pipeline.define_dimension_load(:test) do
         | 
| 22 | 
            +
                  dataset do
         | 
| 23 | 
            +
                    db.test_dataset_method
         | 
| 24 | 
            +
                  end
         | 
| 25 | 
            +
                end
         | 
| 26 | 
            +
             | 
| 27 | 
            +
                pipeline.stages.each do |stage|
         | 
| 28 | 
            +
                  stage.execute(double, true)
         | 
| 29 | 
            +
                end
         | 
| 30 | 
            +
              end
         | 
| 31 | 
            +
             | 
| 32 | 
            +
              it "should set the inserted at time on the dimension"
         | 
| 33 | 
            +
             | 
| 34 | 
            +
              it "truncates the dimension if specified"
         | 
| 35 | 
            +
            end
         | 
| @@ -39,26 +39,6 @@ describe "defining and executing a stage" do | |
| 39 39 | 
             
                stage.sink(:another_stream).data.length.should == 0
         | 
| 40 40 | 
             
              end
         | 
| 41 41 |  | 
| 42 | 
            -
              it "requires sinks" do
         | 
| 43 | 
            -
                expect {
         | 
| 44 | 
            -
                  pipeline.define_stage(:test_stage) do
         | 
| 45 | 
            -
                    source do
         | 
| 46 | 
            -
                      db.test_dataset_method
         | 
| 47 | 
            -
                    end
         | 
| 48 | 
            -
                  end
         | 
| 49 | 
            -
                }.to raise_error(ArgumentError)
         | 
| 50 | 
            -
              end
         | 
| 51 | 
            -
              
         | 
| 52 | 
            -
              it "requires sources" do
         | 
| 53 | 
            -
                expect {
         | 
| 54 | 
            -
                  pipeline.define_stage(:test_stage) do
         | 
| 55 | 
            -
                    sinks do
         | 
| 56 | 
            -
                      add Chicago::Flow::ArraySink.new(:test)
         | 
| 57 | 
            -
                    end
         | 
| 58 | 
            -
                  end
         | 
| 59 | 
            -
                }.to raise_error(ArgumentError)
         | 
| 60 | 
            -
              end
         | 
| 61 | 
            -
             | 
| 62 42 | 
             
              it "glues the source, transformations, and sink correctly" do
         | 
| 63 43 | 
             
                pipeline.define_stage(:test_stage) do
         | 
| 64 44 | 
             
                  source do
         | 
| @@ -90,8 +70,8 @@ describe "defining and executing a stage" do | |
| 90 70 | 
             
              it "allows the source to be filtered via a filter strategy" do
         | 
| 91 71 | 
             
                etl_batch_double = double
         | 
| 92 72 | 
             
                fake_source = []
         | 
| 73 | 
            +
                fake_source.should_receive(:another_dataset_method).and_return([])
         | 
| 93 74 |  | 
| 94 | 
            -
                fake_source.should_receive(:another_dataset_method).and_return([])    
         | 
| 95 75 | 
             
                pipeline.define_stage(:test_stage) do
         | 
| 96 76 | 
             
                  source do
         | 
| 97 77 | 
             
                    fake_source
         | 
| @@ -1,6 +1,6 @@ | |
| 1 1 | 
             
            require 'spec_helper'
         | 
| 2 2 |  | 
| 3 | 
            -
            describe Chicago::ETL:: | 
| 3 | 
            +
            describe Chicago::ETL::SchemaSinksAndTransformationsBuilder do
         | 
| 4 4 | 
             
              let(:dimension) { stub(:dimension).as_null_object }
         | 
| 5 5 | 
             
              let(:db) { stub(:db).as_null_object }
         | 
| 6 6 | 
             
              let(:sink_factory) { stub(:sink_factory).as_null_object }
         | 
| @@ -34,6 +34,6 @@ describe Chicago::ETL::LoadPipelineStageBuilder do | |
| 34 34 | 
             
                  key_mapping :bar, :original_id
         | 
| 35 35 | 
             
                end
         | 
| 36 36 |  | 
| 37 | 
            -
                stage | 
| 37 | 
            +
                stage[:sinks][:bar].should_not be_nil
         | 
| 38 38 | 
             
              end
         | 
| 39 39 | 
             
            end
         | 
| @@ -0,0 +1,40 @@ | |
| 1 | 
            +
            require 'spec_helper'
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            describe Chicago::ETL::Stage do
         | 
| 4 | 
            +
              it "requires a source" do
         | 
| 5 | 
            +
                expect {
         | 
| 6 | 
            +
                  described_class.new(:test,
         | 
| 7 | 
            +
                                      :source => nil,
         | 
| 8 | 
            +
                                      :sinks => {:default => stub(:sink)})
         | 
| 9 | 
            +
                }.to raise_error(ArgumentError)
         | 
| 10 | 
            +
              end
         | 
| 11 | 
            +
             | 
| 12 | 
            +
              it "requires sinks" do
         | 
| 13 | 
            +
                expect {
         | 
| 14 | 
            +
                  described_class.new(:test,
         | 
| 15 | 
            +
                                      :source => stub(:source),
         | 
| 16 | 
            +
                                      :sinks => nil)
         | 
| 17 | 
            +
                }.to raise_error(ArgumentError)
         | 
| 18 | 
            +
              end
         | 
| 19 | 
            +
             | 
| 20 | 
            +
              it "does not filter the dataset if re-extracting" do
         | 
| 21 | 
            +
                stage = described_class.new(:test,
         | 
| 22 | 
            +
                                            :source => stub(:source),
         | 
| 23 | 
            +
                                            :sinks => {:default => stub(:sink)},
         | 
| 24 | 
            +
                                            :filter_strategy => lambda { fail })
         | 
| 25 | 
            +
             | 
| 26 | 
            +
                stage.filtered_source(stub(:etl_batch), true)
         | 
| 27 | 
            +
              end
         | 
| 28 | 
            +
             | 
| 29 | 
            +
              it "opens sinks before writing and closes them afterwards" do
         | 
| 30 | 
            +
                sink = mock(:sink)
         | 
| 31 | 
            +
                sink.should_receive(:open)
         | 
| 32 | 
            +
                sink.should_receive(:close)
         | 
| 33 | 
            +
             | 
| 34 | 
            +
                stage = described_class.new(:test,
         | 
| 35 | 
            +
                                            :source => [],
         | 
| 36 | 
            +
                                            :sinks => {:default => sink})
         | 
| 37 | 
            +
             | 
| 38 | 
            +
                stage.execute(stub(:etl_batch), true)
         | 
| 39 | 
            +
              end
         | 
| 40 | 
            +
            end
         | 
| @@ -44,29 +44,33 @@ describe "Mysql -> Mysql through transformation chain" do | |
| 44 44 |  | 
| 45 45 | 
             
              it "copies data from source to destination" do
         | 
| 46 46 | 
             
                TEST_DB[:source].multi_insert([{:foo => nil, :bin => :unhex.sql_function("1F")},
         | 
| 47 | 
            -
             | 
| 48 | 
            -
             | 
| 47 | 
            +
                                               {:foo => "Hello", :bin => :unhex.sql_function("1F")}])
         | 
| 48 | 
            +
                
         | 
| 49 49 | 
             
                source = Chicago::Flow::DatasetSource.
         | 
| 50 50 | 
             
                  new(TEST_DB[:source].
         | 
| 51 51 | 
             
                      select(:id, :foo, :hex.sql_function(:bin).as(:bin)))
         | 
| 52 | 
            +
             | 
| 53 | 
            +
                transformations = [dup_row.new(:onto => :other)]
         | 
| 54 | 
            +
             | 
| 52 55 | 
             
                sink_1 = Chicago::Flow::MysqlFileSink.
         | 
| 53 56 | 
             
                  new(TEST_DB, :destination, [:id, :foo, :bin])
         | 
| 54 57 | 
             
                sink_2 = Chicago::Flow::ArraySink.new([:id, :foo, :bin])
         | 
| 55 58 |  | 
| 56 | 
            -
                stage = Chicago:: | 
| 57 | 
            -
             | 
| 58 | 
            -
             | 
| 59 | 
            -
             | 
| 60 | 
            -
             | 
| 61 | 
            -
             | 
| 62 | 
            -
             | 
| 59 | 
            +
                stage = Chicago::ETL::Stage.new(:test, 
         | 
| 60 | 
            +
                                                :source => source, 
         | 
| 61 | 
            +
                                                :transformations => transformations, 
         | 
| 62 | 
            +
                                                :sinks => {
         | 
| 63 | 
            +
                                                  :default => sink_1, 
         | 
| 64 | 
            +
                                                  :other => sink_2
         | 
| 65 | 
            +
                                                })
         | 
| 63 66 |  | 
| 64 | 
            -
                stage.execute( | 
| 67 | 
            +
                stage.execute(stub(:etl_batch), true)
         | 
| 65 68 |  | 
| 66 69 | 
             
                expected = [{:id => 1, :foo => nil, :bin => "1F"},
         | 
| 67 70 | 
             
                            {:id => 2, :foo => "Hello", :bin => "1F"}]
         | 
| 68 71 |  | 
| 69 72 | 
             
                sink_2.data.should == expected
         | 
| 70 | 
            -
                TEST_DB[:destination].select(:id, :foo, :hex.sql_function(:bin).as(:bin)). | 
| 73 | 
            +
                TEST_DB[:destination].select(:id, :foo, :hex.sql_function(:bin).as(:bin)).
         | 
| 74 | 
            +
                  all.should == expected
         | 
| 71 75 | 
             
              end
         | 
| 72 76 | 
             
            end
         | 
    
        metadata
    CHANGED
    
    | @@ -1,13 +1,13 @@ | |
| 1 1 | 
             
            --- !ruby/object:Gem::Specification 
         | 
| 2 2 | 
             
            name: chicago-etl
         | 
| 3 3 | 
             
            version: !ruby/object:Gem::Version 
         | 
| 4 | 
            -
              hash:  | 
| 4 | 
            +
              hash: 19
         | 
| 5 5 | 
             
              prerelease: 
         | 
| 6 6 | 
             
              segments: 
         | 
| 7 7 | 
             
              - 0
         | 
| 8 8 | 
             
              - 1
         | 
| 9 | 
            -
              -  | 
| 10 | 
            -
              version: 0.1. | 
| 9 | 
            +
              - 4
         | 
| 10 | 
            +
              version: 0.1.4
         | 
| 11 11 | 
             
            platform: ruby
         | 
| 12 12 | 
             
            authors: 
         | 
| 13 13 | 
             
            - Roland Swingler
         | 
| @@ -243,10 +243,11 @@ files: | |
| 243 243 | 
             
            - lib/chicago/etl/core_extensions.rb
         | 
| 244 244 | 
             
            - lib/chicago/etl/counter.rb
         | 
| 245 245 | 
             
            - lib/chicago/etl/dataset_batch_stage.rb
         | 
| 246 | 
            +
            - lib/chicago/etl/dataset_builder.rb
         | 
| 246 247 | 
             
            - lib/chicago/etl/key_builder.rb
         | 
| 247 248 | 
             
            - lib/chicago/etl/load_dataset_builder.rb
         | 
| 248 | 
            -
            - lib/chicago/etl/load_pipeline_stage_builder.rb
         | 
| 249 249 | 
             
            - lib/chicago/etl/pipeline.rb
         | 
| 250 | 
            +
            - lib/chicago/etl/schema_sinks_and_transformations_builder.rb
         | 
| 250 251 | 
             
            - lib/chicago/etl/schema_table_sink_factory.rb
         | 
| 251 252 | 
             
            - lib/chicago/etl/screens/column_screen.rb
         | 
| 252 253 | 
             
            - lib/chicago/etl/screens/invalid_element.rb
         | 
| @@ -273,7 +274,6 @@ files: | |
| 273 274 | 
             
            - lib/chicago/flow/mysql_file_sink.rb
         | 
| 274 275 | 
             
            - lib/chicago/flow/null_sink.rb
         | 
| 275 276 | 
             
            - lib/chicago/flow/pipeline_endpoint.rb
         | 
| 276 | 
            -
            - lib/chicago/flow/pipeline_stage.rb
         | 
| 277 277 | 
             
            - lib/chicago/flow/sink.rb
         | 
| 278 278 | 
             
            - lib/chicago/flow/transformation.rb
         | 
| 279 279 | 
             
            - lib/chicago/flow/transformation_chain.rb
         | 
| @@ -281,7 +281,7 @@ files: | |
| 281 281 | 
             
            - spec/etl/batch_spec.rb
         | 
| 282 282 | 
             
            - spec/etl/core_extensions_spec.rb
         | 
| 283 283 | 
             
            - spec/etl/counter_spec.rb
         | 
| 284 | 
            -
            - spec/etl/ | 
| 284 | 
            +
            - spec/etl/define_dimension_stage_spec.rb
         | 
| 285 285 | 
             
            - spec/etl/define_stage_spec.rb
         | 
| 286 286 | 
             
            - spec/etl/etl_batch_id_dataset_filter.rb
         | 
| 287 287 | 
             
            - spec/etl/key_builder_spec.rb
         | 
| @@ -293,6 +293,7 @@ files: | |
| 293 293 | 
             
            - spec/etl/screens/out_of_bounds_spec.rb
         | 
| 294 294 | 
             
            - spec/etl/sequel/dependant_tables_spec.rb
         | 
| 295 295 | 
             
            - spec/etl/sequel/filter_to_etl_batch_spec.rb
         | 
| 296 | 
            +
            - spec/etl/stage_spec.rb
         | 
| 296 297 | 
             
            - spec/etl/table_builder_spec.rb
         | 
| 297 298 | 
             
            - spec/etl/task_spec.rb
         | 
| 298 299 | 
             
            - spec/etl/transformations/deduplicate_rows_spec.rb
         | 
| @@ -306,7 +307,6 @@ files: | |
| 306 307 | 
             
            - spec/flow/mysql_file_serializer_spec.rb
         | 
| 307 308 | 
             
            - spec/flow/mysql_file_sink_spec.rb
         | 
| 308 309 | 
             
            - spec/flow/mysql_integration_spec.rb
         | 
| 309 | 
            -
            - spec/flow/pipeline_stage_spec.rb
         | 
| 310 310 | 
             
            - spec/flow/transformation_chain_spec.rb
         | 
| 311 311 | 
             
            - spec/flow/transformation_spec.rb
         | 
| 312 312 | 
             
            - spec/spec_helper.rb
         | 
| @@ -1,68 +0,0 @@ | |
| 1 | 
            -
            module Chicago
         | 
| 2 | 
            -
              module Flow
         | 
| 3 | 
            -
                # Co-ordinates iterating over rows provided by a source, passing
         | 
| 4 | 
            -
                # them through a transformation chain before writing them to
         | 
| 5 | 
            -
                # sink(s).
         | 
| 6 | 
            -
                #
         | 
| 7 | 
            -
                # @api public
         | 
| 8 | 
            -
                class PipelineStage
         | 
| 9 | 
            -
                  attr_reader :transformation_chain
         | 
| 10 | 
            -
                  
         | 
| 11 | 
            -
                  def initialize(options={})
         | 
| 12 | 
            -
                    @sinks  = options[:sinks] || {}
         | 
| 13 | 
            -
                    @transformations = options[:transformations] || []
         | 
| 14 | 
            -
                    @error_handler = options[:error_handler] || RaisingErrorHandler.new
         | 
| 15 | 
            -
                    @transformation_chain = TransformationChain.new(*@transformations)
         | 
| 16 | 
            -
                  end
         | 
| 17 | 
            -
             | 
| 18 | 
            -
                  # Returns the named sink, if it exists
         | 
| 19 | 
            -
                  def sink(name)
         | 
| 20 | 
            -
                    @sinks[name.to_sym]
         | 
| 21 | 
            -
                  end
         | 
| 22 | 
            -
             | 
| 23 | 
            -
                  def sinks
         | 
| 24 | 
            -
                    @sinks.values
         | 
| 25 | 
            -
                  end
         | 
| 26 | 
            -
             | 
| 27 | 
            -
                  def register_sink(name, sink)
         | 
| 28 | 
            -
                    @sinks[name.to_sym] = sink
         | 
| 29 | 
            -
                    self
         | 
| 30 | 
            -
                  end
         | 
| 31 | 
            -
                  
         | 
| 32 | 
            -
                  def validate_pipeline
         | 
| 33 | 
            -
                    unless unregistered_sinks.empty?
         | 
| 34 | 
            -
                      @error_handler.unregistered_sinks(unregistered_sinks)
         | 
| 35 | 
            -
                    end
         | 
| 36 | 
            -
                  end
         | 
| 37 | 
            -
                  
         | 
| 38 | 
            -
                  def execute(source)
         | 
| 39 | 
            -
                    validate_pipeline
         | 
| 40 | 
            -
                    sinks.each(&:open)
         | 
| 41 | 
            -
                    pipe_rows_to_sinks_from(source)
         | 
| 42 | 
            -
                    sinks.each(&:close)
         | 
| 43 | 
            -
                  end
         | 
| 44 | 
            -
             | 
| 45 | 
            -
                  def required_sinks
         | 
| 46 | 
            -
                    transformation_chain.output_streams | [:default]
         | 
| 47 | 
            -
                  end
         | 
| 48 | 
            -
             | 
| 49 | 
            -
                  def unregistered_sinks
         | 
| 50 | 
            -
                    required_sinks - @sinks.keys
         | 
| 51 | 
            -
                  end
         | 
| 52 | 
            -
                  
         | 
| 53 | 
            -
                  private
         | 
| 54 | 
            -
                  
         | 
| 55 | 
            -
                  def pipe_rows_to_sinks_from(source)
         | 
| 56 | 
            -
                    source.each do |row|
         | 
| 57 | 
            -
                      transformation_chain.process(row).each {|row| process_row(row) }
         | 
| 58 | 
            -
                    end
         | 
| 59 | 
            -
                    transformation_chain.flush.each {|row| process_row(row) }
         | 
| 60 | 
            -
                  end
         | 
| 61 | 
            -
             | 
| 62 | 
            -
                  def process_row(row)
         | 
| 63 | 
            -
                    stream = row.delete(:_stream) || :default
         | 
| 64 | 
            -
                    @sinks[stream] << row
         | 
| 65 | 
            -
                  end
         | 
| 66 | 
            -
                end
         | 
| 67 | 
            -
              end
         | 
| 68 | 
            -
            end
         | 
| @@ -1,55 +0,0 @@ | |
| 1 | 
            -
            require 'spec_helper'
         | 
| 2 | 
            -
             | 
| 3 | 
            -
            describe Chicago::ETL::DatasetBatchStage do
         | 
| 4 | 
            -
              let(:pipeline_stage) { mock(:pipeline_stage).as_null_object }
         | 
| 5 | 
            -
              let(:dataset) { mock(:dataset).as_null_object }
         | 
| 6 | 
            -
              let(:stage) { described_class.new(:foo, dataset, pipeline_stage) }
         | 
| 7 | 
            -
              let(:etl_batch) { stub(:etl_batch) }
         | 
| 8 | 
            -
             | 
| 9 | 
            -
              it "has a name" do
         | 
| 10 | 
            -
                stage.name.should == :foo
         | 
| 11 | 
            -
              end
         | 
| 12 | 
            -
             | 
| 13 | 
            -
              it "should set the inserted at time on the default sink" do
         | 
| 14 | 
            -
                sink = Chicago::Flow::ArraySink.new(:foo)
         | 
| 15 | 
            -
                pipeline_stage.stub(:sink).with(:default).and_return(sink)
         | 
| 16 | 
            -
                stage.pipeline_stage.should == pipeline_stage
         | 
| 17 | 
            -
             | 
| 18 | 
            -
                sink.constant_values[:_inserted_at].should_not be_nil
         | 
| 19 | 
            -
              end
         | 
| 20 | 
            -
             | 
| 21 | 
            -
              it "filters the dataset to the batch" do
         | 
| 22 | 
            -
                dataset.should_recieve(:filter_to_etl_batch).with(etl_batch)
         | 
| 23 | 
            -
                stage.source(etl_batch)
         | 
| 24 | 
            -
              end
         | 
| 25 | 
            -
             | 
| 26 | 
            -
              it "does not filter the dataset if re-extracting" do
         | 
| 27 | 
            -
                dataset.should_not_recieve(:filter_to_etl_batch)
         | 
| 28 | 
            -
                stage.source(etl_batch, true)
         | 
| 29 | 
            -
              end
         | 
| 30 | 
            -
             | 
| 31 | 
            -
              it "can filter via a custom strategy" do
         | 
| 32 | 
            -
                dataset.should_not_recieve(:filter_to_etl_batch)
         | 
| 33 | 
            -
             | 
| 34 | 
            -
                filter_strategy = lambda {|ds, batch| ds }
         | 
| 35 | 
            -
                described_class.new(:foo, dataset, pipeline_stage, :filter_strategy => filter_strategy).
         | 
| 36 | 
            -
                  source(etl_batch)
         | 
| 37 | 
            -
              end
         | 
| 38 | 
            -
             | 
| 39 | 
            -
              it "executes the pipeline stage using a DatasetSource" do
         | 
| 40 | 
            -
                pipeline_stage.should_receive(:execute).
         | 
| 41 | 
            -
                  with(kind_of(Chicago::Flow::DatasetSource))
         | 
| 42 | 
            -
                stage.execute(etl_batch, true)
         | 
| 43 | 
            -
              end
         | 
| 44 | 
            -
             | 
| 45 | 
            -
              it "truncates any sinks if truncate_pre_load has been set" do
         | 
| 46 | 
            -
                stage = described_class.new(:foo, dataset, pipeline_stage,
         | 
| 47 | 
            -
                                            :truncate_pre_load => true)
         | 
| 48 | 
            -
             | 
| 49 | 
            -
                sink = Chicago::Flow::ArraySink.new(:output)
         | 
| 50 | 
            -
                sink << {:foo => "foo"}
         | 
| 51 | 
            -
                pipeline_stage.stub(:sinks).and_return([sink])
         | 
| 52 | 
            -
                stage.execute(etl_batch)
         | 
| 53 | 
            -
                sink.data.should == []
         | 
| 54 | 
            -
              end
         | 
| 55 | 
            -
            end
         | 
| @@ -1,89 +0,0 @@ | |
| 1 | 
            -
            require 'spec_helper'
         | 
| 2 | 
            -
             | 
| 3 | 
            -
            describe Chicago::Flow::PipelineStage do
         | 
| 4 | 
            -
              let(:transform) {
         | 
| 5 | 
            -
                Class.new(Chicago::Flow::Transformation) {
         | 
| 6 | 
            -
                  def process_row(row)
         | 
| 7 | 
            -
                    row[:a] += 1
         | 
| 8 | 
            -
                    row
         | 
| 9 | 
            -
                  end
         | 
| 10 | 
            -
                }
         | 
| 11 | 
            -
              }
         | 
| 12 | 
            -
              
         | 
| 13 | 
            -
              let(:add_error) {
         | 
| 14 | 
            -
                Class.new(Chicago::Flow::Transformation) {
         | 
| 15 | 
            -
                  # add_output_stream :error
         | 
| 16 | 
            -
                  def output_streams
         | 
| 17 | 
            -
                    [:default, :error]
         | 
| 18 | 
            -
                  end
         | 
| 19 | 
            -
                  
         | 
| 20 | 
            -
                  def process_row(row)
         | 
| 21 | 
            -
                    [row, {Chicago::Flow::STREAM => :error, :message => "error"}]
         | 
| 22 | 
            -
                  end
         | 
| 23 | 
            -
                }
         | 
| 24 | 
            -
              }
         | 
| 25 | 
            -
             | 
| 26 | 
            -
              let(:sink) { Chicago::Flow::ArraySink.new(:test) }
         | 
| 27 | 
            -
              let(:source) { Chicago::Flow::ArraySource.new([{:a => 1}]) }
         | 
| 28 | 
            -
             | 
| 29 | 
            -
              it "returns all sinks" do
         | 
| 30 | 
            -
                stage = described_class.new.register_sink(:default, sink)
         | 
| 31 | 
            -
                stage.sinks.should == [sink]
         | 
| 32 | 
            -
              end
         | 
| 33 | 
            -
             | 
| 34 | 
            -
              it "returns a sink by name" do
         | 
| 35 | 
            -
                stage = described_class.new.register_sink(:default, sink)
         | 
| 36 | 
            -
                stage.sink(:default).should == sink
         | 
| 37 | 
            -
              end
         | 
| 38 | 
            -
             | 
| 39 | 
            -
              it "reads from source to sink" do
         | 
| 40 | 
            -
                pipeline = described_class.new.register_sink(:default, sink)
         | 
| 41 | 
            -
                pipeline.execute(source)
         | 
| 42 | 
            -
                sink.data.should == [{:a => 1}]
         | 
| 43 | 
            -
              end
         | 
| 44 | 
            -
             | 
| 45 | 
            -
              it "passes rows through transforms" do
         | 
| 46 | 
            -
                pipeline = described_class.new(:transformations => [transform.new]).
         | 
| 47 | 
            -
                  register_sink(:default, sink)
         | 
| 48 | 
            -
                                               
         | 
| 49 | 
            -
                pipeline.execute(source)
         | 
| 50 | 
            -
                sink.data.should == [{:a => 2}]
         | 
| 51 | 
            -
              end
         | 
| 52 | 
            -
             | 
| 53 | 
            -
              it "writes rows to the appropriate sink for their stream, and strips the stream tag" do
         | 
| 54 | 
            -
                error_sink = Chicago::Flow::ArraySink.new(:test)
         | 
| 55 | 
            -
             | 
| 56 | 
            -
                pipeline = described_class.new(:transformations => [add_error.new]).
         | 
| 57 | 
            -
                  register_sink(:default, sink).
         | 
| 58 | 
            -
                  register_sink(:error, error_sink)
         | 
| 59 | 
            -
             | 
| 60 | 
            -
                pipeline.execute(source)
         | 
| 61 | 
            -
                sink.data.should == [{:a => 1}]
         | 
| 62 | 
            -
                error_sink.data.should == [{:message => "error"}]
         | 
| 63 | 
            -
              end
         | 
| 64 | 
            -
             | 
| 65 | 
            -
              it "calls an error handler if sinks are not registered" do
         | 
| 66 | 
            -
                error_handler = mock(:error_handler)
         | 
| 67 | 
            -
                error_handler.should_receive(:unregistered_sinks).
         | 
| 68 | 
            -
                  with([:default, :error])
         | 
| 69 | 
            -
             | 
| 70 | 
            -
                pipeline = described_class.new(:transformations => [add_error.new],
         | 
| 71 | 
            -
                                               :error_handler => error_handler)
         | 
| 72 | 
            -
             | 
| 73 | 
            -
                pipeline.validate_pipeline
         | 
| 74 | 
            -
              end
         | 
| 75 | 
            -
             | 
| 76 | 
            -
              it "by default raises an exception if the pipeline is not valid when executed" do
         | 
| 77 | 
            -
                pipeline = described_class.new(:transformations => [add_error.new])
         | 
| 78 | 
            -
                expect { pipeline.execute(source) }.to raise_error(Chicago::Flow::Error)
         | 
| 79 | 
            -
              end
         | 
| 80 | 
            -
             | 
| 81 | 
            -
              it "opens sinks before writing and closes them afterwards" do
         | 
| 82 | 
            -
                sink = mock(:sink)
         | 
| 83 | 
            -
                pipeline = described_class.new.register_sink(:default, sink)
         | 
| 84 | 
            -
                sink.should_receive(:open)
         | 
| 85 | 
            -
                sink.stub(:<<)
         | 
| 86 | 
            -
                sink.should_receive(:close)
         | 
| 87 | 
            -
                pipeline.execute(source)
         | 
| 88 | 
            -
              end
         | 
| 89 | 
            -
            end
         |