chicago-flow 0.0.1

data/.document ADDED
@@ -0,0 +1,5 @@
+ lib/**/*.rb
+ bin/*
+ -
+ features/**/*.feature
+ LICENSE.txt
data/.rspec ADDED
@@ -0,0 +1 @@
+ --color
data/Gemfile ADDED
@@ -0,0 +1,15 @@
+ source "http://rubygems.org"
+
+ gem "fastercsv", :platform => :ruby_18
+ gem "sequel"
+ gem "sequel_load_data_infile", ">= 0.0.2", :require => "sequel/load_data_infile"
+ gem "sequel_fast_columns", :require => "sequel/fast_columns"
+
+ # Add dependencies to develop your gem here.
+ # Include everything needed to run rake, tests, features, etc.
+ group :development do
+   gem "mysql", "2.8.1"
+   gem "rspec", "~> 2"
+   gem "bundler", "~> 1"
+   gem "jeweler", "~> 1.8.4"
+ end
data/LICENSE.txt ADDED
@@ -0,0 +1,20 @@
+ Copyright (c) 2013 notonthehighstreet.com
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.rdoc ADDED
@@ -0,0 +1,24 @@
+ = chicago-flow
+
+ A dataflow programming model for processing rows of hash-like
+ data. Used in the Chicago Warehouse ETL process.
+
+ == Contributing to chicago-flow
+
+ * Check out the latest master to make sure the feature hasn't been implemented or the bug hasn't been fixed yet.
+ * Check out the issue tracker to make sure someone hasn't already requested and/or contributed it.
+ * Fork the project.
+ * Start a feature/bugfix branch.
+ * Commit and push until you are happy with your contribution.
+ * Make sure to add tests for it. This is important so I don't break it in a future version unintentionally.
+ * Please try not to mess with the Rakefile, version, or history. If you want your own version, or if a change there is otherwise necessary, that is fine, but please isolate it in its own commit so I can cherry-pick around it.
+
+ == Authors
+
+ Roland Swingler (@knaveofdiamonds)
+
+ == Copyright
+
+ Copyright (c) 2013 notonthehighstreet.com. See LICENSE.txt for
+ further details.
+
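The README stops short of a usage example, so here is a minimal sketch of how the pieces added below fit together, assuming only the classes shipped in this release (ArraySource, Filter, ArraySink, PipelineStage). It is illustrative, not taken from the gem's documentation; the row keys are invented.

```ruby
require 'chicago/flow'
include Chicago::Flow

# Rows are plain hashes; a stage reads them from a source, passes them
# through a chain of transformations, and routes them to named sinks.
source = ArraySource.new([{:name => "widget", :active => true},
                          {:name => "gadget", :active => false}])
sink   = ArraySink.new

stage = PipelineStage.new(source,
                          :transformations => [Filter.new {|row| row[:active] }])
stage.register_sink(:default, sink)
stage.execute

sink.data # => [{:name => "widget", :active => true}]
```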
data/Rakefile ADDED
@@ -0,0 +1,33 @@
+ # encoding: utf-8
+
+ require 'rubygems'
+ require 'bundler'
+ begin
+   Bundler.setup(:default, :development)
+ rescue Bundler::BundlerError => e
+   $stderr.puts e.message
+   $stderr.puts "Run `bundle install` to install missing gems"
+   exit e.status_code
+ end
+ require 'rake'
+
+ require 'jeweler'
+ Jeweler::Tasks.new do |gem|
+   gem.name = "chicago-flow"
+   gem.homepage = "http://github.com/notonthehighstreet/chicago-flow"
+   gem.license = "MIT"
+   gem.summary = "Dataflow-style processing for hash-like rows"
+   gem.description = "Dataflow-style processing for hash-like rows"
+   gem.email = "roland.swingler@gmail.com"
+   gem.authors = ["Roland Swingler"]
+   # dependencies defined in Gemfile
+ end
+ Jeweler::RubygemsDotOrgTasks.new
+
+ require 'rspec/core'
+ require 'rspec/core/rake_task'
+ RSpec::Core::RakeTask.new(:spec) do |spec|
+   spec.pattern = FileList['spec/**/*_spec.rb']
+ end
+
+ task :default => :spec
data/VERSION ADDED
@@ -0,0 +1 @@
+ 0.0.1
data/lib/chicago/flow.rb ADDED
@@ -0,0 +1,16 @@
+ if RUBY_VERSION < "1.9"
+   require 'fastercsv'
+   CSV = FasterCSV
+ else
+   require 'csv'
+ end
+
+ require 'chicago/flow/transformation'
+ require 'chicago/flow/filter'
+ require 'chicago/flow/transformation_chain'
+ require 'chicago/flow/pipeline_stage'
+ require 'chicago/flow/pipeline_endpoint'
+ require 'chicago/flow/array_source'
+ require 'chicago/flow/dataset_source'
+ require 'chicago/flow/sink'
+ require 'chicago/flow/array_sink'
data/lib/chicago/flow/array_sink.rb ADDED
@@ -0,0 +1,16 @@
+ module Chicago
+   module Flow
+     class ArraySink < Sink
+       attr_reader :data
+
+       def initialize(fields=[])
+         @fields = [fields].flatten
+         @data = []
+       end
+
+       def <<(row)
+         @data << row.merge(constant_values)
+       end
+     end
+   end
+ end
data/lib/chicago/flow/array_source.rb ADDED
@@ -0,0 +1,14 @@
+ module Chicago
+   module Flow
+     class ArraySource < PipelineEndpoint
+       def initialize(array, fields=[])
+         @fields = [fields].flatten
+         @array = array
+       end
+
+       def each
+         @array.each {|row| yield row }
+       end
+     end
+   end
+ end
data/lib/chicago/flow/dataset_source.rb ADDED
@@ -0,0 +1,20 @@
+ require 'sequel'
+ require 'sequel/fast_columns'
+
+ module Chicago
+   module Flow
+     class DatasetSource < PipelineEndpoint
+       def initialize(dataset)
+         @dataset = dataset
+       end
+
+       def each
+         @dataset.each {|row| yield row }
+       end
+
+       def fields
+         @dataset.columns
+       end
+     end
+   end
+ end
data/lib/chicago/flow/filter.rb ADDED
@@ -0,0 +1,14 @@
+ module Chicago
+   module Flow
+     class Filter < Transformation
+       def initialize(stream=:default, &block)
+         super(stream)
+         @block = block || lambda {|row| false }
+       end
+
+       def process_row(row)
+         row if @block.call(row)
+       end
+     end
+   end
+ end
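To make the filter semantics concrete: the block decides which rows survive, and the default block drops everything. A quick sketch (the hash keys are invented):

```ruby
require 'chicago/flow'

filter = Chicago::Flow::Filter.new {|row| row[:amount].to_i > 0 }
filter.process({:amount => 5}) # => {:amount => 5}
filter.process({:amount => 0}) # => nil, so the row is dropped downstream

# With no block, every row is filtered out.
Chicago::Flow::Filter.new.process({:a => 1}) # => nil
```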
data/lib/chicago/flow/mysql.rb ADDED
@@ -0,0 +1,4 @@
+ require 'sequel'
+ require 'sequel/load_data_infile'
+ require 'chicago/flow/mysql_file_serializer'
+ require 'chicago/flow/mysql_file_sink'
data/lib/chicago/flow/mysql_file_serializer.rb ADDED
@@ -0,0 +1,26 @@
+ require 'date'
+
+ module Chicago
+   module Flow
+     class MysqlFileSerializer
+       # Transforms a value so it is suitable for use in a file
+       # loaded via a MySQL LOAD DATA INFILE statement.
+       def serialize(value)
+         case value
+         when nil
+           "NULL"
+         when true
+           "1"
+         when false
+           "0"
+         when Time, DateTime
+           value.strftime("%Y-%m-%d %H:%M:%S")
+         when Date
+           value.strftime("%Y-%m-%d")
+         else
+           value
+         end
+       end
+     end
+   end
+ end
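The case statement above maps directly onto LOAD DATA INFILE conventions; a quick sketch of the resulting values, assuming lib/ is on the load path:

```ruby
require 'chicago/flow/mysql_file_serializer'

s = Chicago::Flow::MysqlFileSerializer.new
s.serialize(nil)                            # => "NULL"
s.serialize(true)                           # => "1"
s.serialize(false)                          # => "0"
s.serialize(Time.local(2013, 6, 5, 9, 30))  # => "2013-06-05 09:30:00"
s.serialize(Date.new(2013, 6, 5))           # => "2013-06-05"
s.serialize("strings pass through")         # => "strings pass through"
```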
data/lib/chicago/flow/mysql_file_sink.rb ADDED
@@ -0,0 +1,54 @@
+ require 'sequel'
+ require 'sequel/load_data_infile'
+ require 'tmpdir'
+
+ Sequel.extension :core_extensions
+
+ module Chicago
+   module Flow
+     class MysqlFileSink < Sink
+       attr_reader :filepath
+
+       def initialize(db, table_name, fields, options = {})
+         @fields = [fields].flatten
+         @filepath = options[:filepath] || temp_file(table_name)
+         @serializer = MysqlFileSerializer.new
+         @db = db
+         @table_name = table_name
+         @insert_ignore = !!options[:ignore]
+       end
+
+       def <<(row)
+         csv << fields.map {|c| @serializer.serialize(row[c]) }
+       end
+
+       def close
+         csv.flush
+         load_from_file(filepath)
+         csv.close
+         File.unlink(filepath) if File.exists?(filepath)
+       end
+
+       # Loads data from the file into the MySQL table via LOAD DATA
+       # INFILE, if the file exists and has content.
+       def load_from_file(file)
+         return unless File.size?(file)
+         dataset.load_csv_infile(file, @fields, :set => constant_values)
+       end
+
+       private
+
+       def dataset
+         @insert_ignore ? @db[@table_name].insert_ignore : @db[@table_name]
+       end
+
+       def csv
+         @csv ||= CSV.open(filepath, "w")
+       end
+
+       def temp_file(table_name)
+         File.join(Dir.tmpdir, "#{table_name}.#{rand(1_000_000)}.csv")
+       end
+     end
+   end
+ end
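A sketch of the intended lifecycle, with a hypothetical Sequel connection and table; rows are buffered to CSV and only hit MySQL when the sink is closed:

```ruby
require 'chicago/flow'
require 'chicago/flow/mysql'

# Hypothetical connection details.
db = Sequel.connect(:adapter => "mysql", :user => "root",
                    :database => "etl_example")

sink = Chicago::Flow::MysqlFileSink.new(db, :users, [:id, :name],
                                        :ignore => true) # use INSERT IGNORE
sink << {:id => 1, :name => "Ada"}  # serialized and buffered to a temp CSV
sink << {:id => 2, :name => "Bea"}
sink.close                          # LOAD DATA INFILE, then unlink the CSV
```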
data/lib/chicago/flow/pipeline_endpoint.rb ADDED
@@ -0,0 +1,11 @@
+ module Chicago
+   module Flow
+     class PipelineEndpoint
+       attr_reader :fields
+
+       def has_defined_fields?
+         !fields.empty?
+       end
+     end
+   end
+ end
data/lib/chicago/flow/pipeline_stage.rb ADDED
@@ -0,0 +1,60 @@
+ module Chicago
+   module Flow
+     class Error < RuntimeError
+     end
+
+     class RaisingErrorHandler
+       def unregistered_sinks(sinks)
+         raise Error.new("Sinks not registered: #{sinks.join(",")}")
+       end
+     end
+
+     class PipelineStage
+       attr_reader :transformation_chain
+
+       def initialize(source, options={})
+         @source = source
+         @sinks = options[:sinks] || {}
+         @transformations = options[:transformations] || []
+         @error_handler = options[:error_handler] || RaisingErrorHandler.new
+         @transformation_chain = TransformationChain.new(*@transformations)
+       end
+
+       def register_sink(name, sink)
+         @sinks[name.to_sym] = sink
+         self
+       end
+
+       def validate_pipeline
+         unless unregistered_sinks.empty?
+           @error_handler.unregistered_sinks(unregistered_sinks)
+         end
+       end
+
+       def execute
+         validate_pipeline
+         @sinks.values.each(&:open)
+         @source.each do |row|
+           transformation_chain.process(row).each {|row| process_row(row) }
+         end
+         transformation_chain.flush.each {|row| process_row(row) }
+         @sinks.values.each(&:close)
+       end
+
+       def required_sinks
+         transformation_chain.output_streams | [:default]
+       end
+
+       def unregistered_sinks
+         required_sinks - @sinks.keys
+       end
+
+       private
+
+       def process_row(row)
+         stream = row.delete(:_stream) || :default
+         @sinks[stream] << row
+       end
+     end
+   end
+ end
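The routing logic above is easiest to see with a transformation that emits onto a second stream. The ValidateA class below is invented for illustration; the stage API (register_sink, unregistered_sinks, execute) is as defined above:

```ruby
require 'chicago/flow'
include Chicago::Flow

# Sends rows missing :a onto an :error stream instead of :default.
class ValidateA < Transformation
  def output_streams
    [:default, :error]
  end

  def process_row(row)
    row.has_key?(:a) ? row : assign_stream(row, :error)
  end
end

stage = PipelineStage.new(ArraySource.new([{:a => 1}, {:b => 2}]),
                          :transformations => [ValidateA.new])
stage.unregistered_sinks # => [:default, :error]

good, bad = ArraySink.new, ArraySink.new
stage.register_sink(:default, good).register_sink(:error, bad)
stage.execute

good.data # => [{:a => 1}]
bad.data  # => [{:b => 2}], with the :_stream tag stripped on write
```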
data/lib/chicago/flow/sink.rb ADDED
@@ -0,0 +1,34 @@
+ module Chicago
+   module Flow
+     class Sink < PipelineEndpoint
+       # Specifies a hash of values that are assumed to apply to all
+       # rows.
+       #
+       # Subclasses should use these constant values appropriately when
+       # writing rows, by merging them with the row or otherwise
+       # ensuring that they end up in the final source this sink
+       # represents.
+       def constant_values
+         @constant_values ||= {}
+       end
+
+       # Performs any operations before writing rows to this sink.
+       #
+       # By default does nothing; may be overridden by subclasses.
+       def open
+       end
+
+       # Performs any operations after writing rows to this sink.
+       #
+       # By default does nothing; may be overridden by subclasses.
+       def close
+       end
+
+       # Writes a row to this sink.
+       #
+       # By default does nothing; may be overridden by subclasses.
+       def <<(row)
+       end
+     end
+   end
+ end
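The open/&lt;&lt;/close protocol and constant_values are easiest to see in a small subclass. This IOSink is hypothetical, written only to illustrate the contract:

```ruby
require 'chicago/flow'

# A sink that writes tab-separated rows to an IO object.
class IOSink < Chicago::Flow::Sink
  def initialize(io, fields)
    @io = io
    @fields = [fields].flatten
  end

  def open
    @io.puts @fields.join("\t") # header row before any data
  end

  def <<(row)
    row = row.merge(constant_values)
    @io.puts @fields.map {|f| row[f] }.join("\t")
  end

  def close
    @io.flush
  end
end

sink = IOSink.new($stdout, [:a, :b])
sink.constant_values[:b] = 0 # applied to every row written
sink.open
sink << {:a => 1} # prints "1\t0"
sink.close
```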
data/lib/chicago/flow/transformation.rb ADDED
@@ -0,0 +1,78 @@
+ module Chicago
+   module Flow
+     STREAM = :_stream
+
+     class Transformation
+       def initialize(*args)
+         stream, options = *args
+         if stream.kind_of?(Hash)
+           @stream = :default
+           @options = stream
+         else
+           @stream = stream || :default
+           @options = options || {}
+         end
+       end
+
+       class << self
+         attr_reader :added_fields, :removed_fields
+
+         def adds_fields(*fields)
+           @added_fields ||= []
+           @added_fields += fields.flatten
+         end
+
+         def removes_fields(*fields)
+           @removed_fields ||= []
+           @removed_fields += fields.flatten
+         end
+       end
+
+       def added_fields
+         self.class.added_fields || []
+       end
+
+       def removed_fields
+         self.class.removed_fields || []
+       end
+
+       def upstream_fields(fields)
+         ((fields + removed_fields) - added_fields).uniq
+       end
+
+       def downstream_fields(fields)
+         ((fields - removed_fields) + added_fields).uniq
+       end
+
+       def process(row)
+         applies_to_stream?(row[STREAM]) ? process_row(row) : row
+       end
+
+       def flush
+         []
+       end
+
+       def output_streams
+         [:default]
+       end
+
+       def applies_to_stream?(target_stream)
+         @stream == :all ||
+           (target_stream.nil? && @stream == :default) ||
+           target_stream == @stream
+       end
+
+       protected
+
+       def process_row(row)
+         row
+       end
+
+       def assign_stream(row, stream)
+         raise "Stream not declared" unless stream.nil? || output_streams.include?(stream)
+         row[STREAM] = stream if stream
+         row
+       end
+     end
+   end
+ end
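A sketch of how the class-level field declarations and process_row interact, using an invented SplitName transformation:

```ruby
require 'chicago/flow'

class SplitName < Chicago::Flow::Transformation
  adds_fields :first_name, :last_name
  removes_fields :name

  def process_row(row)
    first, last = row.delete(:name).to_s.split(" ", 2)
    row.merge(:first_name => first, :last_name => last)
  end
end

t = SplitName.new
t.process({:name => "Ada Lovelace"})
# => {:first_name => "Ada", :last_name => "Lovelace"}

# Field declarations let a chain reason about schemas in both directions.
t.downstream_fields([:name, :id]) # => [:id, :first_name, :last_name]
t.upstream_fields([:first_name, :last_name, :id]) # => [:id, :name]
```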
data/lib/chicago/flow/transformation_chain.rb ADDED
@@ -0,0 +1,39 @@
+ module Chicago
+   module Flow
+     class TransformationChain
+       def initialize(*transforms)
+         @transforms = transforms
+       end
+
+       def output_streams
+         @transforms.inject([]) {|s, t| s | t.output_streams }
+       end
+
+       def process(row)
+         @transforms.inject([row]) do |rows, transform|
+           process_rows(rows, transform)
+         end
+       end
+
+       def flush
+         @transforms.inject([]) do |rows, transform|
+           process_rows(rows, transform) + transform.flush
+         end
+       end
+
+       def upstream_fields(fields)
+         @transforms.inject(fields) {|fs, t| t.upstream_fields(fs) }
+       end
+
+       def downstream_fields(fields)
+         @transforms.inject(fields) {|fs, t| t.downstream_fields(fs) }
+       end
+
+       private
+
+       def process_rows(rows, transform)
+         rows.map {|row| transform.process(row) }.flatten.compact
+       end
+     end
+   end
+ end
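The flush pass matters once a transformation buffers rows; this Buffer class is invented to show the interaction:

```ruby
require 'chicago/flow'
include Chicago::Flow

# Holds every row back until the chain is flushed.
class Buffer < Transformation
  def process_row(row)
    (@rows ||= []) << row
    nil # nothing emitted during the main pass
  end

  def flush
    @rows || []
  end
end

chain = TransformationChain.new(Buffer.new)
chain.process({:a => 1}) # => []
chain.process({:a => 2}) # => []
chain.flush              # => [{:a => 1}, {:a => 2}]
```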
data/spec/array_sink_spec.rb ADDED
@@ -0,0 +1,14 @@
+ require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
+
+ describe ArraySink do
+   it "stores rows in #data" do
+     subject << {:a => 1}
+     subject.data.should == [{:a => 1}]
+   end
+
+   it "merges constant values into the sink row" do
+     subject.constant_values[:number] = 1
+     subject << {:a => 1}
+     subject.data.should == [{:a => 1, :number => 1}]
+   end
+ end
data/spec/array_source_spec.rb ADDED
@@ -0,0 +1,20 @@
+ require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
+
+ describe ArraySource do
+   it "has an each method that yields rows" do
+     ArraySource.new([{:a => 1}]).each do |row|
+       row.should == {:a => 1}
+     end
+   end
+
+   it "doesn't know about any fields rows have by default" do
+     ArraySource.new([]).fields.should == []
+     ArraySource.new([]).should_not have_defined_fields
+   end
+
+   it "can optionally define which fields will be in rows" do
+     ArraySource.new([], [:a, :b]).fields.should == [:a, :b]
+     ArraySource.new([], :a).fields.should == [:a]
+     ArraySource.new([], :a).should have_defined_fields
+   end
+ end
data/spec/database.yml.dist ADDED
@@ -0,0 +1,4 @@
+ adapter: mysql
+ username: root
+ socket: /var/run/mysqld/mysqld.sock
+ database: chicago_flow_test_db
data/spec/dataset_source_spec.rb ADDED
@@ -0,0 +1,15 @@
+ require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
+
+ describe DatasetSource do
+   let(:dataset) { stub(:dataset) }
+
+   it "delegates #each to the dataset" do
+     dataset.should_receive(:each)
+     described_class.new(dataset).each {|row| }
+   end
+
+   it "gets columns from the dataset" do
+     dataset.should_receive(:columns)
+     described_class.new(dataset).fields
+   end
+ end
data/spec/filter_spec.rb ADDED
@@ -0,0 +1,13 @@
+ require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
+
+ describe Filter do
+   it "filters all rows by default" do
+     subject.process({:a => 1}).should be_nil
+   end
+
+   it "filters rows given a block" do
+     filter = Filter.new {|row| row.has_key?(:a) }
+     filter.process(:a => 1).should == {:a => 1}
+     filter.process(:b => 1).should be_nil
+   end
+ end
data/spec/mysql_file_serializer_spec.rb ADDED
@@ -0,0 +1,27 @@
+ require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
+
+ describe Chicago::Flow::MysqlFileSerializer do
+   it "serializes nil into NULL" do
+     subject.serialize(nil).should == "NULL"
+   end
+
+   it "serializes true into '1'" do
+     subject.serialize(true).should == "1"
+   end
+
+   it "serializes false into '0'" do
+     subject.serialize(false).should == "0"
+   end
+
+   it "serializes times into mysql time format" do
+     subject.serialize(Time.local(2011,01,02,10,30,50)).should == "2011-01-02 10:30:50"
+   end
+
+   it "serializes datetimes into mysql time format" do
+     subject.serialize(DateTime.new(2011,01,02,10,30,50)).should == "2011-01-02 10:30:50"
+   end
+
+   it "serializes dates into mysql date format" do
+     subject.serialize(Date.new(2011,01,02)).should == "2011-01-02"
+   end
+ end
data/spec/mysql_file_sink_spec.rb ADDED
@@ -0,0 +1,77 @@
+ require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
+ require 'sequel'
+
+ describe MysqlFileSink do
+   let(:dataset) { mock(:dataset).as_null_object }
+   let(:db) { mock(:db, :[] => dataset, :schema => []) }
+   let(:csv) { mock(:csv) }
+
+   let(:sink) {
+     described_class.new(db, :table, [:foo], :filepath => "test_file")
+   }
+
+   before :each do
+     CSV.stub(:open).and_return(csv)
+     csv.stub(:<<)
+     csv.stub(:close).and_return(csv)
+     csv.stub(:flush)
+
+     File.stub(:size?).and_return(true)
+   end
+
+   it "writes specified columns to rows in a file" do
+     csv.should_receive(:<<).with([1])
+     sink << {:foo => 1, :bar => 2}
+   end
+
+   it "serializes values before writing to the file" do
+     MysqlFileSerializer.any_instance.should_receive(:serialize).with(1).and_return(1)
+     sink << {:foo => 1}
+   end
+
+   it "has defined fields" do
+     sink.should have_defined_fields
+     sink.fields.should == [:foo]
+   end
+
+   it "loads the csv file into the database when closed" do
+     dataset.should_receive(:load_csv_infile).
+       with("test_file", [:foo], :set => {})
+     sink.close
+   end
+
+   it "uses the :set hash to load constant values" do
+     sink.constant_values[:bar] = 1
+     dataset.should_receive(:load_csv_infile).
+       with("test_file", [:foo], :set => {:bar => 1})
+     sink.close
+   end
+
+   it "does not IGNORE rows by default" do
+     dataset.should_not_receive(:insert_ignore)
+     sink.close
+   end
+
+   it "can specify that INSERT IGNORE should be used" do
+     dataset.should_receive(:insert_ignore)
+     described_class.new(db, :table, [:foo],
+                         :filepath => "test_file", :ignore => true).close
+   end
+
+   it "writes csv to a tempfile if no explicit filepath is given" do
+     described_class.new(db, :table, [:foo]).filepath.should match(/table\.\d+\.csv/)
+   end
+
+   it "doesn't attempt to load data if the file is empty or does not exist" do
+     File.stub(:size?).and_return(false)
+     dataset.should_not_receive(:load_csv_infile)
+     sink.close
+   end
+
+   it "removes the temporary file when closed" do
+     File.stub(:exists?).and_return(true)
+     File.should_receive(:unlink).with("test_file")
+
+     sink.close
+   end
+ end
data/spec/mysql_integration_spec.rb ADDED
@@ -0,0 +1,69 @@
+ require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
+
+ describe "Mysql -> Mysql through transformation chain" do
+   let(:dup_row) {
+     Class.new(Transformation) {
+       def output_streams
+         [:default, @options[:onto]].flatten
+       end
+
+       def process_row(row)
+         new_row = assign_stream(row.dup, @options[:onto])
+         [row, new_row]
+       end
+     }
+   }
+
+   before :all do
+     unless TEST_DB.table_exists?(:source)
+       TEST_DB.create_table(:source) do
+         primary_key :id
+         varchar :foo
+         binary :bin, :size => 1
+       end
+     end
+
+     unless TEST_DB.table_exists?(:destination)
+       TEST_DB.create_table(:destination) do
+         primary_key :id
+         varchar :foo
+         binary :bin, :size => 1
+       end
+     end
+   end
+
+   before :each do
+     TEST_DB[:source].truncate
+     TEST_DB[:destination].truncate
+   end
+
+   after :each do
+     # TEST_DB[:source].truncate
+     # TEST_DB[:destination].truncate
+   end
+
+   it "copies data from source to destination" do
+     TEST_DB[:source].multi_insert([{:foo => nil, :bin => :unhex.sql_function("1F")},
+                                    {:foo => "Hello", :bin => :unhex.sql_function("1F")}])
+
+     source = DatasetSource.new(TEST_DB[:source].select(:id, :foo, :hex.sql_function(:bin).as(:bin)))
+     sink_1 = MysqlFileSink.new(TEST_DB, :destination, [:id, :foo, :bin])
+     sink_2 = ArraySink.new([:id, :foo, :bin])
+
+     stage = PipelineStage.new(source,
+                               :transformations => [dup_row.new(:onto => :other)])
+
+     expect { stage.execute }.to raise_error
+
+     stage.register_sink(:default, sink_1)
+     stage.register_sink(:other, sink_2)
+
+     stage.execute
+
+     expected = [{:id => 1, :foo => nil, :bin => "1F"},
+                 {:id => 2, :foo => "Hello", :bin => "1F"}]
+
+     sink_2.data.should == expected
+     TEST_DB[:destination].select(:id, :foo, :hex.sql_function(:bin).as(:bin)).all.should == expected
+   end
+ end
data/spec/pipeline_stage_spec.rb ADDED
@@ -0,0 +1,81 @@
+ require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
+
+ describe PipelineStage do
+   let(:transform) {
+     Class.new(Transformation) {
+       def process_row(row)
+         row[:a] += 1
+         row
+       end
+     }
+   }
+
+   let(:add_error) {
+     Class.new(Transformation) {
+       # add_output_stream :error
+       def output_streams
+         [:default, :error]
+       end
+
+       def process_row(row)
+         [row, {STREAM => :error, :message => "error"}]
+       end
+     }
+   }
+
+   let(:sink) { ArraySink.new }
+   let(:source) { ArraySource.new([{:a => 1}]) }
+
+   it "reads from source to sink" do
+     pipeline = described_class.new(source).register_sink(:default, sink)
+     pipeline.execute
+     sink.data.should == [{:a => 1}]
+   end
+
+   it "passes rows through transforms" do
+     pipeline = described_class.new(source, :transformations => [transform.new]).
+       register_sink(:default, sink)
+
+     pipeline.execute
+     sink.data.should == [{:a => 2}]
+   end
+
+   it "writes rows to the appropriate sink for their stream, and strips the stream tag" do
+     error_sink = ArraySink.new
+
+     pipeline = described_class.new(source, :transformations => [add_error.new]).
+       register_sink(:default, sink).
+       register_sink(:error, error_sink)
+
+     pipeline.execute
+     sink.data.should == [{:a => 1}]
+     error_sink.data.should == [{:message => "error"}]
+   end
+
+   it "calls an error handler if sinks are not registered" do
+     error_handler = mock(:error_handler)
+     error_handler.should_receive(:unregistered_sinks).with([:default, :error])
+
+     pipeline = described_class.new(source,
+                                    :transformations => [add_error.new],
+                                    :error_handler => error_handler)
+
+     pipeline.validate_pipeline
+   end
+
+   it "by default raises an exception if the pipeline is not valid when executed" do
+     pipeline = described_class.new(source,
+                                    :transformations => [add_error.new])
+
+     expect { pipeline.execute }.to raise_error(Chicago::Flow::Error)
+   end
+
+   it "opens sinks before writing and closes them afterwards" do
+     sink = mock(:sink)
+     pipeline = described_class.new(source).register_sink(:default, sink)
+     sink.should_receive(:open)
+     sink.stub(:<<)
+     sink.should_receive(:close)
+     pipeline.execute
+   end
+ end
data/spec/spec_helper.rb ADDED
@@ -0,0 +1,20 @@
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
+ require 'rspec'
+ require 'yaml'
+ require 'chicago/flow'
+ require 'chicago/flow/mysql'
+
+ unless defined? TEST_DB
+   TEST_DB = Sequel.connect(YAML.load(File.read(File.dirname(__FILE__) + "/database.yml")))
+ end
+
+ include Chicago::Flow
+
+ # Requires supporting files with custom matchers and macros, etc,
+ # in ./support/ and its subdirectories.
+ Dir["#{File.dirname(__FILE__)}/support/**/*.rb"].each {|f| require f}
+
+ RSpec.configure do |config|
+
+ end
data/spec/transformation_chain_spec.rb ADDED
@@ -0,0 +1,75 @@
+ require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
+
+ describe TransformationChain do
+   let(:add_1_to_a) {
+     Class.new(Transformation) {
+       def process_row(row)
+         row[:a] += 1
+         row
+       end
+     }
+   }
+
+   let(:dup_row) {
+     Class.new(Transformation) {
+       def output_streams
+         [:default, @options[:onto]].flatten
+       end
+
+       def process_row(row)
+         new_row = assign_stream(row.dup, @options[:onto])
+         [row, new_row]
+       end
+     }
+   }
+
+   let(:store_until_flush) {
+     Class.new(Chicago::Flow::Transformation) {
+       def process_row(row)
+         @cache ||= []
+         @cache << row
+         nil
+       end
+
+       def flush
+         @cache
+       end
+     }
+   }
+
+   it "chains transformations" do
+     described_class.new(add_1_to_a.new, add_1_to_a.new).process({:a => 1}).
+       should == [{:a => 3}]
+   end
+
+   it "can cope with multiple return rows from transformations" do
+     described_class.new(add_1_to_a.new, dup_row.new, add_1_to_a.new).process({:a => 1}).
+       should == [{:a => 3}, {:a => 3}]
+   end
+
+   it "can cope with a filter returning nil" do
+     described_class.new(Filter.new, dup_row.new, add_1_to_a.new).process({:a => 1}).
+       should == []
+   end
+
+   it "can write to different streams" do
+     described_class.new(dup_row.new(:onto => :other),
+                         add_1_to_a.new).process({:a => 1}).
+       should == [{:a => 2}, {:a => 1, Chicago::Flow::STREAM => :other}]
+   end
+
+   it "knows what streams it writes to as a chain" do
+     described_class.new(dup_row.new(:onto => :other),
+                         add_1_to_a.new).output_streams.should == [:default, :other]
+   end
+
+   it "can flush rows held back by transforms" do
+     chain = described_class.new(store_until_flush.new,
+                                 add_1_to_a.new,
+                                 store_until_flush.new,
+                                 add_1_to_a.new)
+     chain.process({:a => 1}).should == []
+     chain.process({:a => 2}).should == []
+     chain.flush.should == [{:a => 3}, {:a => 4}]
+   end
+ end
data/spec/transformation_spec.rb ADDED
@@ -0,0 +1,73 @@
+ require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
+
+ describe Chicago::Flow::Transformation do
+   let(:add_1_to_a) {
+     Class.new(Chicago::Flow::Transformation) {
+       def process_row(row)
+         row[:a] += 1
+         row
+       end
+     }
+   }
+
+   let(:add_and_remove) {
+     Class.new(Chicago::Flow::Transformation) {
+       adds_fields :b, :c
+       removes_fields :a
+
+       def process_row(row)
+         row.delete(:a)
+         row[:b] = 1
+         row[:c] = 2
+         row
+       end
+     }
+   }
+
+   it "writes to the :default stream by default" do
+     subject.output_streams.should == [:default]
+   end
+
+   it "may apply to a particular stream" do
+     subject.applies_to_stream?(:default).should be_true
+     subject.applies_to_stream?(nil).should be_true
+     described_class.new(:other).applies_to_stream?(:default).should be_false
+     described_class.new(:other).applies_to_stream?(:other).should be_true
+   end
+
+   it "processes a row via #process_row" do
+     add_1_to_a.new.process({:a => 1}).should == {:a => 2}
+   end
+
+   it "passes through rows not on its stream" do
+     add_1_to_a.new(:other).process({:a => 1}).should == {:a => 1}
+   end
+
+   it "can apply to all streams using :all" do
+     add_1_to_a.new(:all).process({:a => 1}).should == {:a => 2}
+     add_1_to_a.new(:all).process({:a => 1, Chicago::Flow::STREAM => :other}).
+       should == {:a => 2, Chicago::Flow::STREAM => :other}
+   end
+
+   it "can be flushed" do
+     subject.flush.should == []
+   end
+
+   it "can specify which fields are added" do
+     add_and_remove.new.added_fields.should == [:b, :c]
+   end
+
+   it "can specify which fields are removed" do
+     add_and_remove.new.removed_fields.should == [:a]
+   end
+
+   it "can calculate downstream fields" do
+     Set.new(add_and_remove.new.downstream_fields([:a, :b, :d])).
+       should == Set.new([:b, :c, :d])
+   end
+
+   it "can calculate upstream fields" do
+     Set.new(add_and_remove.new.upstream_fields([:b, :c, :d])).
+       should == Set.new([:a, :d])
+   end
+ end
metadata ADDED
@@ -0,0 +1,210 @@
+ --- !ruby/object:Gem::Specification
+ name: chicago-flow
+ version: !ruby/object:Gem::Version
+   version: 0.0.1
+   prerelease:
+ platform: ruby
+ authors:
+ - Roland Swingler
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2013-06-05 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: fastercsv
+   requirement: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+ - !ruby/object:Gem::Dependency
+   name: sequel
+   requirement: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+ - !ruby/object:Gem::Dependency
+   name: sequel_load_data_infile
+   requirement: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: 0.0.2
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: 0.0.2
+ - !ruby/object:Gem::Dependency
+   name: sequel_fast_columns
+   requirement: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+ - !ruby/object:Gem::Dependency
+   name: mysql
+   requirement: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - '='
+       - !ruby/object:Gem::Version
+         version: 2.8.1
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - '='
+       - !ruby/object:Gem::Version
+         version: 2.8.1
+ - !ruby/object:Gem::Dependency
+   name: rspec
+   requirement: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: '2'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: '2'
+ - !ruby/object:Gem::Dependency
+   name: bundler
+   requirement: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: '1'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: '1'
+ - !ruby/object:Gem::Dependency
+   name: jeweler
+   requirement: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: 1.8.4
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: 1.8.4
+ description: Dataflow-style processing for hash-like rows
+ email: roland.swingler@gmail.com
+ executables: []
+ extensions: []
+ extra_rdoc_files:
+ - LICENSE.txt
+ - README.rdoc
+ files:
+ - .document
+ - .rspec
+ - Gemfile
+ - LICENSE.txt
+ - README.rdoc
+ - Rakefile
+ - VERSION
+ - lib/chicago/flow.rb
+ - lib/chicago/flow/array_sink.rb
+ - lib/chicago/flow/array_source.rb
+ - lib/chicago/flow/dataset_source.rb
+ - lib/chicago/flow/filter.rb
+ - lib/chicago/flow/mysql.rb
+ - lib/chicago/flow/mysql_file_serializer.rb
+ - lib/chicago/flow/mysql_file_sink.rb
+ - lib/chicago/flow/pipeline_endpoint.rb
+ - lib/chicago/flow/pipeline_stage.rb
+ - lib/chicago/flow/sink.rb
+ - lib/chicago/flow/transformation.rb
+ - lib/chicago/flow/transformation_chain.rb
+ - spec/array_sink_spec.rb
+ - spec/array_source_spec.rb
+ - spec/database.yml.dist
+ - spec/dataset_source_spec.rb
+ - spec/filter_spec.rb
+ - spec/mysql_file_serializer_spec.rb
+ - spec/mysql_file_sink_spec.rb
+ - spec/mysql_integration_spec.rb
+ - spec/pipeline_stage_spec.rb
+ - spec/spec_helper.rb
+ - spec/transformation_chain_spec.rb
+ - spec/transformation_spec.rb
+ homepage: http://github.com/notonthehighstreet/chicago-flow
+ licenses:
+ - MIT
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   none: false
+   requirements:
+   - - ! '>='
+     - !ruby/object:Gem::Version
+       version: '0'
+       segments:
+       - 0
+       hash: -2256243131001367107
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   none: false
+   requirements:
+   - - ! '>='
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubyforge_project:
+ rubygems_version: 1.8.24
+ signing_key:
+ specification_version: 3
+ summary: Dataflow-style processing for hash-like rows
+ test_files: []