chicago-flow 0.0.1

data/.document ADDED
@@ -0,0 +1,5 @@
+ lib/**/*.rb
+ bin/*
+ -
+ features/**/*.feature
+ LICENSE.txt
data/.rspec ADDED
@@ -0,0 +1 @@
+ --color
data/Gemfile ADDED
@@ -0,0 +1,15 @@
+ source "http://rubygems.org"
+
+ gem "fastercsv", :platform => :ruby_18
+ gem "sequel"
+ gem "sequel_load_data_infile", ">= 0.0.2", :require => "sequel/load_data_infile"
+ gem "sequel_fast_columns", :require => "sequel/fast_columns"
+
+ # Add dependencies to develop your gem here.
+ # Include everything needed to run rake, tests, features, etc.
+ group :development do
+   gem "mysql", "2.8.1"
+   gem "rspec", "~> 2"
+   gem "bundler", "~> 1"
+   gem "jeweler", "~> 1.8.4"
+ end
data/LICENSE.txt ADDED
@@ -0,0 +1,20 @@
+ Copyright (c) 2013 notonthehighstreet.com
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.rdoc ADDED
@@ -0,0 +1,24 @@
+ = chicago-flow
+
+ A dataflow programming model for processing rows of hash-like
+ data. Used in the Chicago Warehouse ETL process.
+
+ == Contributing to chicago-flow
+
+ * Check out the latest master to make sure the feature hasn't been implemented or the bug hasn't been fixed yet.
+ * Check out the issue tracker to make sure someone hasn't already requested and/or contributed it.
+ * Fork the project.
+ * Start a feature/bugfix branch.
+ * Commit and push until you are happy with your contribution.
+ * Make sure to add tests for it. This is important so I don't break it in a future version unintentionally.
+ * Please try not to mess with the Rakefile, version, or history. If you want your own version, or if a change there is otherwise necessary, that is fine, but please isolate it in its own commit so I can cherry-pick around it.
+
+ == Authors
+
+ Roland Swingler (@knaveofdiamonds)
+
+ == Copyright
+
+ Copyright (c) 2013 notonthehighstreet.com. See LICENSE.txt for
+ further details.
+
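The README stops short of a usage example, so here is a minimal sketch of how the pieces added below fit together, assuming only the classes shipped in this release (ArraySource, Filter, ArraySink, PipelineStage). It is illustrative, not taken from the gem's documentation; the row keys are invented.

```ruby
require 'chicago/flow'
include Chicago::Flow

# Rows are plain hashes; a stage reads them from a source, passes them
# through a chain of transformations, and routes them to named sinks.
source = ArraySource.new([{:name => "widget", :active => true},
                          {:name => "gadget", :active => false}])
sink   = ArraySink.new

stage = PipelineStage.new(source,
                          :transformations => [Filter.new {|row| row[:active] }])
stage.register_sink(:default, sink)
stage.execute

sink.data # => [{:name => "widget", :active => true}]
```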
data/Rakefile ADDED
@@ -0,0 +1,33 @@
+ # encoding: utf-8
+
+ require 'rubygems'
+ require 'bundler'
+ begin
+   Bundler.setup(:default, :development)
+ rescue Bundler::BundlerError => e
+   $stderr.puts e.message
+   $stderr.puts "Run `bundle install` to install missing gems"
+   exit e.status_code
+ end
+ require 'rake'
+
+ require 'jeweler'
+ Jeweler::Tasks.new do |gem|
+   gem.name = "chicago-flow"
+   gem.homepage = "http://github.com/notonthehighstreet/chicago-flow"
+   gem.license = "MIT"
+   gem.summary = "Dataflow-style processing for hash-like rows"
+   gem.description = "Dataflow-style processing for hash-like rows"
+   gem.email = "roland.swingler@gmail.com"
+   gem.authors = ["Roland Swingler"]
+   # dependencies defined in Gemfile
+ end
+ Jeweler::RubygemsDotOrgTasks.new
+
+ require 'rspec/core'
+ require 'rspec/core/rake_task'
+ RSpec::Core::RakeTask.new(:spec) do |spec|
+   spec.pattern = FileList['spec/**/*_spec.rb']
+ end
+
+ task :default => :spec
data/VERSION ADDED
@@ -0,0 +1 @@
+ 0.0.1
data/lib/chicago/flow.rb ADDED
@@ -0,0 +1,16 @@
+ if RUBY_VERSION < "1.9"
+   require 'fastercsv'
+   CSV = FasterCSV
+ else
+   require 'csv'
+ end
+
+ require 'chicago/flow/transformation'
+ require 'chicago/flow/filter'
+ require 'chicago/flow/transformation_chain'
+ require 'chicago/flow/pipeline_stage'
+ require 'chicago/flow/pipeline_endpoint'
+ require 'chicago/flow/array_source'
+ require 'chicago/flow/dataset_source'
+ require 'chicago/flow/sink'
+ require 'chicago/flow/array_sink'
data/lib/chicago/flow/array_sink.rb ADDED
@@ -0,0 +1,16 @@
+ module Chicago
+   module Flow
+     class ArraySink < Sink
+       attr_reader :data
+
+       def initialize(fields=[])
+         @fields = [fields].flatten
+         @data = []
+       end
+
+       def <<(row)
+         @data << row.merge(constant_values)
+       end
+     end
+   end
+ end
data/lib/chicago/flow/array_source.rb ADDED
@@ -0,0 +1,14 @@
+ module Chicago
+   module Flow
+     class ArraySource < PipelineEndpoint
+       def initialize(array, fields=[])
+         @fields = [fields].flatten
+         @array = array
+       end
+
+       def each
+         @array.each {|row| yield row }
+       end
+     end
+   end
+ end
data/lib/chicago/flow/dataset_source.rb ADDED
@@ -0,0 +1,20 @@
+ require 'sequel'
+ require 'sequel/fast_columns'
+
+ module Chicago
+   module Flow
+     class DatasetSource < PipelineEndpoint
+       def initialize(dataset)
+         @dataset = dataset
+       end
+
+       def each
+         @dataset.each {|row| yield row }
+       end
+
+       def fields
+         @dataset.columns
+       end
+     end
+   end
+ end
data/lib/chicago/flow/filter.rb ADDED
@@ -0,0 +1,14 @@
+ module Chicago
+   module Flow
+     class Filter < Transformation
+       def initialize(stream=:default, &block)
+         super(stream)
+         @block = block || lambda {|row| false }
+       end
+
+       def process_row(row)
+         row if @block.call(row)
+       end
+     end
+   end
+ end
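To make the filter semantics concrete: the block decides which rows survive, and the default block drops everything. A quick sketch (the hash keys are invented):

```ruby
require 'chicago/flow'

filter = Chicago::Flow::Filter.new {|row| row[:amount].to_i > 0 }
filter.process({:amount => 5}) # => {:amount => 5}
filter.process({:amount => 0}) # => nil, so the row is dropped downstream

# With no block, every row is filtered out.
Chicago::Flow::Filter.new.process({:a => 1}) # => nil
```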
data/lib/chicago/flow/mysql.rb ADDED
@@ -0,0 +1,4 @@
+ require 'sequel'
+ require 'sequel/load_data_infile'
+ require 'chicago/flow/mysql_file_serializer'
+ require 'chicago/flow/mysql_file_sink'
data/lib/chicago/flow/mysql_file_serializer.rb ADDED
@@ -0,0 +1,26 @@
+ require 'date'
+
+ module Chicago
+   module Flow
+     class MysqlFileSerializer
+       # Transforms a value so it is suitable for use in a file
+       # loaded via a MySQL LOAD DATA INFILE statement.
+       def serialize(value)
+         case value
+         when nil
+           "NULL"
+         when true
+           "1"
+         when false
+           "0"
+         when Time, DateTime
+           value.strftime("%Y-%m-%d %H:%M:%S")
+         when Date
+           value.strftime("%Y-%m-%d")
+         else
+           value
+         end
+       end
+     end
+   end
+ end
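The case statement above maps directly onto LOAD DATA INFILE conventions; a quick sketch of the resulting values, assuming lib/ is on the load path:

```ruby
require 'chicago/flow/mysql_file_serializer'

s = Chicago::Flow::MysqlFileSerializer.new
s.serialize(nil)                            # => "NULL"
s.serialize(true)                           # => "1"
s.serialize(false)                          # => "0"
s.serialize(Time.local(2013, 6, 5, 9, 30))  # => "2013-06-05 09:30:00"
s.serialize(Date.new(2013, 6, 5))           # => "2013-06-05"
s.serialize("strings pass through")         # => "strings pass through"
```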
data/lib/chicago/flow/mysql_file_sink.rb ADDED
@@ -0,0 +1,54 @@
+ require 'sequel'
+ require 'sequel/load_data_infile'
+ require 'tmpdir'
+
+ Sequel.extension :core_extensions
+
+ module Chicago
+   module Flow
+     class MysqlFileSink < Sink
+       attr_reader :filepath
+
+       def initialize(db, table_name, fields, options = {})
+         @fields = [fields].flatten
+         @filepath = options[:filepath] || temp_file(table_name)
+         @serializer = MysqlFileSerializer.new
+         @db = db
+         @table_name = table_name
+         @insert_ignore = !!options[:ignore]
+       end
+
+       def <<(row)
+         csv << fields.map {|c| @serializer.serialize(row[c]) }
+       end
+
+       def close
+         csv.flush
+         load_from_file(filepath)
+         csv.close
+         File.unlink(filepath) if File.exists?(filepath)
+       end
+
+       # Loads data from the file into the MySQL table via LOAD DATA
+       # INFILE, if the file exists and has content.
+       def load_from_file(file)
+         return unless File.size?(file)
+         dataset.load_csv_infile(file, @fields, :set => constant_values)
+       end
+
+       private
+
+       def dataset
+         @insert_ignore ? @db[@table_name].insert_ignore : @db[@table_name]
+       end
+
+       def csv
+         @csv ||= CSV.open(filepath, "w")
+       end
+
+       def temp_file(table_name)
+         File.join(Dir.tmpdir, "#{table_name}.#{rand(1_000_000)}.csv")
+       end
+     end
+   end
+ end
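A sketch of the intended lifecycle, with a hypothetical Sequel connection and table; rows are buffered to CSV and only hit MySQL when the sink is closed:

```ruby
require 'chicago/flow'
require 'chicago/flow/mysql'

# Hypothetical connection details.
db = Sequel.connect(:adapter => "mysql", :user => "root",
                    :database => "etl_example")

sink = Chicago::Flow::MysqlFileSink.new(db, :users, [:id, :name],
                                        :ignore => true) # use INSERT IGNORE
sink << {:id => 1, :name => "Ada"}  # serialized and buffered to a temp CSV
sink << {:id => 2, :name => "Bea"}
sink.close                          # LOAD DATA INFILE, then unlink the CSV
```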
data/lib/chicago/flow/pipeline_endpoint.rb ADDED
@@ -0,0 +1,11 @@
+ module Chicago
+   module Flow
+     class PipelineEndpoint
+       attr_reader :fields
+
+       def has_defined_fields?
+         !fields.empty?
+       end
+     end
+   end
+ end
data/lib/chicago/flow/pipeline_stage.rb ADDED
@@ -0,0 +1,60 @@
+ module Chicago
+   module Flow
+     class Error < RuntimeError
+     end
+
+     class RaisingErrorHandler
+       def unregistered_sinks(sinks)
+         raise Error.new("Sinks not registered: #{sinks.join(",")}")
+       end
+     end
+
+     class PipelineStage
+       attr_reader :transformation_chain
+
+       def initialize(source, options={})
+         @source = source
+         @sinks = options[:sinks] || {}
+         @transformations = options[:transformations] || []
+         @error_handler = options[:error_handler] || RaisingErrorHandler.new
+         @transformation_chain = TransformationChain.new(*@transformations)
+       end
+
+       def register_sink(name, sink)
+         @sinks[name.to_sym] = sink
+         self
+       end
+
+       def validate_pipeline
+         unless unregistered_sinks.empty?
+           @error_handler.unregistered_sinks(unregistered_sinks)
+         end
+       end
+
+       def execute
+         validate_pipeline
+         @sinks.values.each(&:open)
+         @source.each do |row|
+           transformation_chain.process(row).each {|row| process_row(row) }
+         end
+         transformation_chain.flush.each {|row| process_row(row) }
+         @sinks.values.each(&:close)
+       end
+
+       def required_sinks
+         transformation_chain.output_streams | [:default]
+       end
+
+       def unregistered_sinks
+         required_sinks - @sinks.keys
+       end
+
+       private
+
+       def process_row(row)
+         stream = row.delete(:_stream) || :default
+         @sinks[stream] << row
+       end
+     end
+   end
+ end
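The routing logic above is easiest to see with a transformation that emits onto a second stream. The ValidateA class below is invented for illustration; the stage API (register_sink, unregistered_sinks, execute) is as defined above:

```ruby
require 'chicago/flow'
include Chicago::Flow

# Sends rows missing :a onto an :error stream instead of :default.
class ValidateA < Transformation
  def output_streams
    [:default, :error]
  end

  def process_row(row)
    row.has_key?(:a) ? row : assign_stream(row, :error)
  end
end

stage = PipelineStage.new(ArraySource.new([{:a => 1}, {:b => 2}]),
                          :transformations => [ValidateA.new])
stage.unregistered_sinks # => [:default, :error]

good, bad = ArraySink.new, ArraySink.new
stage.register_sink(:default, good).register_sink(:error, bad)
stage.execute

good.data # => [{:a => 1}]
bad.data  # => [{:b => 2}], with the :_stream tag stripped on write
```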
data/lib/chicago/flow/sink.rb ADDED
@@ -0,0 +1,34 @@
+ module Chicago
+   module Flow
+     class Sink < PipelineEndpoint
+       # Specifies a hash of values that are assumed to apply to all
+       # rows.
+       #
+       # Subclasses should use these constant values appropriately when
+       # writing rows, by merging them with the row or otherwise
+       # ensuring that they end up in the final source this sink
+       # represents.
+       def constant_values
+         @constant_values ||= {}
+       end
+
+       # Performs any operations before writing rows to this sink.
+       #
+       # By default does nothing; may be overridden by subclasses.
+       def open
+       end
+
+       # Performs any operations after writing rows to this sink.
+       #
+       # By default does nothing; may be overridden by subclasses.
+       def close
+       end
+
+       # Writes a row to this sink.
+       #
+       # By default does nothing; may be overridden by subclasses.
+       def <<(row)
+       end
+     end
+   end
+ end
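The open/&lt;&lt;/close protocol and constant_values are easiest to see in a small subclass. This IOSink is hypothetical, written only to illustrate the contract:

```ruby
require 'chicago/flow'

# A sink that writes tab-separated rows to an IO object.
class IOSink < Chicago::Flow::Sink
  def initialize(io, fields)
    @io = io
    @fields = [fields].flatten
  end

  def open
    @io.puts @fields.join("\t") # header row before any data
  end

  def <<(row)
    row = row.merge(constant_values)
    @io.puts @fields.map {|f| row[f] }.join("\t")
  end

  def close
    @io.flush
  end
end

sink = IOSink.new($stdout, [:a, :b])
sink.constant_values[:b] = 0 # applied to every row written
sink.open
sink << {:a => 1} # prints "1\t0"
sink.close
```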
data/lib/chicago/flow/transformation.rb ADDED
@@ -0,0 +1,78 @@
+ module Chicago
+   module Flow
+     STREAM = :_stream
+
+     class Transformation
+       def initialize(*args)
+         stream, options = *args
+         if stream.kind_of?(Hash)
+           @stream = :default
+           @options = stream
+         else
+           @stream = stream || :default
+           @options = options || {}
+         end
+       end
+
+       class << self
+         attr_reader :added_fields, :removed_fields
+
+         def adds_fields(*fields)
+           @added_fields ||= []
+           @added_fields += fields.flatten
+         end
+
+         def removes_fields(*fields)
+           @removed_fields ||= []
+           @removed_fields += fields.flatten
+         end
+       end
+
+       def added_fields
+         self.class.added_fields || []
+       end
+
+       def removed_fields
+         self.class.removed_fields || []
+       end
+
+       def upstream_fields(fields)
+         ((fields + removed_fields) - added_fields).uniq
+       end
+
+       def downstream_fields(fields)
+         ((fields - removed_fields) + added_fields).uniq
+       end
+
+       def process(row)
+         applies_to_stream?(row[STREAM]) ? process_row(row) : row
+       end
+
+       def flush
+         []
+       end
+
+       def output_streams
+         [:default]
+       end
+
+       def applies_to_stream?(target_stream)
+         @stream == :all ||
+           (target_stream.nil? && @stream == :default) ||
+           target_stream == @stream
+       end
+
+       protected
+
+       def process_row(row)
+         row
+       end
+
+       def assign_stream(row, stream)
+         raise "Stream not declared" unless stream.nil? || output_streams.include?(stream)
+         row[STREAM] = stream if stream
+         row
+       end
+     end
+   end
+ end
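A sketch of how the class-level field declarations and process_row interact, using an invented SplitName transformation:

```ruby
require 'chicago/flow'

class SplitName < Chicago::Flow::Transformation
  adds_fields :first_name, :last_name
  removes_fields :name

  def process_row(row)
    first, last = row.delete(:name).to_s.split(" ", 2)
    row.merge(:first_name => first, :last_name => last)
  end
end

t = SplitName.new
t.process({:name => "Ada Lovelace"})
# => {:first_name => "Ada", :last_name => "Lovelace"}

# Field declarations let a chain reason about schemas in both directions.
t.downstream_fields([:name, :id]) # => [:id, :first_name, :last_name]
t.upstream_fields([:first_name, :last_name, :id]) # => [:id, :name]
```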
data/lib/chicago/flow/transformation_chain.rb ADDED
@@ -0,0 +1,39 @@
+ module Chicago
+   module Flow
+     class TransformationChain
+       def initialize(*transforms)
+         @transforms = transforms
+       end
+
+       def output_streams
+         @transforms.inject([]) {|s, t| s | t.output_streams }
+       end
+
+       def process(row)
+         @transforms.inject([row]) do |rows, transform|
+           process_rows(rows, transform)
+         end
+       end
+
+       def flush
+         @transforms.inject([]) do |rows, transform|
+           process_rows(rows, transform) + transform.flush
+         end
+       end
+
+       def upstream_fields(fields)
+         @transforms.inject(fields) {|fs, t| t.upstream_fields(fs) }
+       end
+
+       def downstream_fields(fields)
+         @transforms.inject(fields) {|fs, t| t.downstream_fields(fs) }
+       end
+
+       private
+
+       def process_rows(rows, transform)
+         rows.map {|row| transform.process(row) }.flatten.compact
+       end
+     end
+   end
+ end
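The flush pass matters once a transformation buffers rows; this Buffer class is invented to show the interaction:

```ruby
require 'chicago/flow'
include Chicago::Flow

# Holds every row back until the chain is flushed.
class Buffer < Transformation
  def process_row(row)
    (@rows ||= []) << row
    nil # nothing emitted during the main pass
  end

  def flush
    @rows || []
  end
end

chain = TransformationChain.new(Buffer.new)
chain.process({:a => 1}) # => []
chain.process({:a => 2}) # => []
chain.flush              # => [{:a => 1}, {:a => 2}]
```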
data/spec/array_sink_spec.rb ADDED
@@ -0,0 +1,14 @@
+ require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
+
+ describe ArraySink do
+   it "stores rows in #data" do
+     subject << {:a => 1}
+     subject.data.should == [{:a => 1}]
+   end
+
+   it "merges constant values into the sink row" do
+     subject.constant_values[:number] = 1
+     subject << {:a => 1}
+     subject.data.should == [{:a => 1, :number => 1}]
+   end
+ end
data/spec/array_source_spec.rb ADDED
@@ -0,0 +1,20 @@
+ require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
+
+ describe ArraySource do
+   it "has an each method that yields rows" do
+     ArraySource.new([{:a => 1}]).each do |row|
+       row.should == {:a => 1}
+     end
+   end
+
+   it "doesn't know about any fields rows have by default" do
+     ArraySource.new([]).fields.should == []
+     ArraySource.new([]).should_not have_defined_fields
+   end
+
+   it "can optionally define which fields will be in rows" do
+     ArraySource.new([], [:a, :b]).fields.should == [:a, :b]
+     ArraySource.new([], :a).fields.should == [:a]
+     ArraySource.new([], :a).should have_defined_fields
+   end
+ end
data/spec/database.yml.dist ADDED
@@ -0,0 +1,4 @@
+ adapter: mysql
+ username: root
+ socket: /var/run/mysqld/mysqld.sock
+ database: chicago_flow_test_db
data/spec/dataset_source_spec.rb ADDED
@@ -0,0 +1,15 @@
+ require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
+
+ describe DatasetSource do
+   let(:dataset) { stub(:dataset) }
+
+   it "delegates #each to the dataset" do
+     dataset.should_receive(:each)
+     described_class.new(dataset).each {|row| }
+   end
+
+   it "gets columns from the dataset" do
+     dataset.should_receive(:columns)
+     described_class.new(dataset).fields
+   end
+ end
data/spec/filter_spec.rb ADDED
@@ -0,0 +1,13 @@
+ require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
+
+ describe Filter do
+   it "filters all rows by default" do
+     subject.process({:a => 1}).should be_nil
+   end
+
+   it "filters rows given a block" do
+     filter = Filter.new {|row| row.has_key?(:a) }
+     filter.process(:a => 1).should == {:a => 1}
+     filter.process(:b => 1).should be_nil
+   end
+ end
data/spec/mysql_file_serializer_spec.rb ADDED
@@ -0,0 +1,27 @@
+ require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
+
+ describe Chicago::Flow::MysqlFileSerializer do
+   it "serializes nil into NULL" do
+     subject.serialize(nil).should == "NULL"
+   end
+
+   it "serializes true into '1'" do
+     subject.serialize(true).should == "1"
+   end
+
+   it "serializes false into '0'" do
+     subject.serialize(false).should == "0"
+   end
+
+   it "serializes times into mysql time format" do
+     subject.serialize(Time.local(2011,01,02,10,30,50)).should == "2011-01-02 10:30:50"
+   end
+
+   it "serializes datetimes into mysql time format" do
+     subject.serialize(DateTime.new(2011,01,02,10,30,50)).should == "2011-01-02 10:30:50"
+   end
+
+   it "serializes dates into mysql date format" do
+     subject.serialize(Date.new(2011,01,02)).should == "2011-01-02"
+   end
+ end
data/spec/mysql_file_sink_spec.rb ADDED
@@ -0,0 +1,77 @@
+ require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
+ require 'sequel'
+
+ describe MysqlFileSink do
+   let(:dataset) { mock(:dataset).as_null_object }
+   let(:db) { mock(:db, :[] => dataset, :schema => []) }
+   let(:csv) { mock(:csv) }
+
+   let(:sink) {
+     described_class.new(db, :table, [:foo], :filepath => "test_file")
+   }
+
+   before :each do
+     CSV.stub(:open).and_return(csv)
+     csv.stub(:<<)
+     csv.stub(:close).and_return(csv)
+     csv.stub(:flush)
+
+     File.stub(:size?).and_return(true)
+   end
+
+   it "writes specified columns to rows in a file" do
+     csv.should_receive(:<<).with([1])
+     sink << {:foo => 1, :bar => 2}
+   end
+
+   it "serializes values before writing to the file" do
+     MysqlFileSerializer.any_instance.should_receive(:serialize).with(1).and_return(1)
+     sink << {:foo => 1}
+   end
+
+   it "has defined fields" do
+     sink.should have_defined_fields
+     sink.fields.should == [:foo]
+   end
+
+   it "loads the csv file into the database when closed" do
+     dataset.should_receive(:load_csv_infile).
+       with("test_file", [:foo], :set => {})
+     sink.close
+   end
+
+   it "uses the :set hash to load constant values" do
+     sink.constant_values[:bar] = 1
+     dataset.should_receive(:load_csv_infile).
+       with("test_file", [:foo], :set => {:bar => 1})
+     sink.close
+   end
+
+   it "does not IGNORE rows by default" do
+     dataset.should_not_receive(:insert_ignore)
+     sink.close
+   end
+
+   it "can specify that INSERT IGNORE should be used" do
+     dataset.should_receive(:insert_ignore)
+     described_class.new(db, :table, [:foo],
+                         :filepath => "test_file", :ignore => true).close
+   end
+
+   it "writes csv to a tempfile if no explicit filepath is given" do
+     described_class.new(db, :table, [:foo]).filepath.should match(/table\.\d+\.csv/)
+   end
+
+   it "doesn't attempt to load data if the file is empty or does not exist" do
+     File.stub(:size?).and_return(false)
+     dataset.should_not_receive(:load_csv_infile)
+     sink.close
+   end
+
+   it "removes the temporary file when closed" do
+     File.stub(:exists?).and_return(true)
+     File.should_receive(:unlink).with("test_file")
+
+     sink.close
+   end
+ end
data/spec/mysql_integration_spec.rb ADDED
@@ -0,0 +1,69 @@
+ require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
+
+ describe "Mysql -> Mysql through transformation chain" do
+   let(:dup_row) {
+     Class.new(Transformation) {
+       def output_streams
+         [:default, @options[:onto]].flatten
+       end
+
+       def process_row(row)
+         new_row = assign_stream(row.dup, @options[:onto])
+         [row, new_row]
+       end
+     }
+   }
+
+   before :all do
+     unless TEST_DB.table_exists?(:source)
+       TEST_DB.create_table(:source) do
+         primary_key :id
+         varchar :foo
+         binary :bin, :size => 1
+       end
+     end
+
+     unless TEST_DB.table_exists?(:destination)
+       TEST_DB.create_table(:destination) do
+         primary_key :id
+         varchar :foo
+         binary :bin, :size => 1
+       end
+     end
+   end
+
+   before :each do
+     TEST_DB[:source].truncate
+     TEST_DB[:destination].truncate
+   end
+
+   after :each do
+     # TEST_DB[:source].truncate
+     # TEST_DB[:destination].truncate
+   end
+
+   it "copies data from source to destination" do
+     TEST_DB[:source].multi_insert([{:foo => nil, :bin => :unhex.sql_function("1F")},
+                                    {:foo => "Hello", :bin => :unhex.sql_function("1F")}])
+
+     source = DatasetSource.new(TEST_DB[:source].select(:id, :foo, :hex.sql_function(:bin).as(:bin)))
+     sink_1 = MysqlFileSink.new(TEST_DB, :destination, [:id, :foo, :bin])
+     sink_2 = ArraySink.new([:id, :foo, :bin])
+
+     stage = PipelineStage.new(source,
+                               :transformations => [dup_row.new(:onto => :other)])
+
+     expect { stage.execute }.to raise_error
+
+     stage.register_sink(:default, sink_1)
+     stage.register_sink(:other, sink_2)
+
+     stage.execute
+
+     expected = [{:id => 1, :foo => nil, :bin => "1F"},
+                 {:id => 2, :foo => "Hello", :bin => "1F"}]
+
+     sink_2.data.should == expected
+     TEST_DB[:destination].select(:id, :foo, :hex.sql_function(:bin).as(:bin)).all.should == expected
+   end
+ end
data/spec/pipeline_stage_spec.rb ADDED
@@ -0,0 +1,81 @@
+ require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
+
+ describe PipelineStage do
+   let(:transform) {
+     Class.new(Transformation) {
+       def process_row(row)
+         row[:a] += 1
+         row
+       end
+     }
+   }
+
+   let(:add_error) {
+     Class.new(Transformation) {
+       # add_output_stream :error
+       def output_streams
+         [:default, :error]
+       end
+
+       def process_row(row)
+         [row, {STREAM => :error, :message => "error"}]
+       end
+     }
+   }
+
+   let(:sink) { ArraySink.new }
+   let(:source) { ArraySource.new([{:a => 1}]) }
+
+   it "reads from source to sink" do
+     pipeline = described_class.new(source).register_sink(:default, sink)
+     pipeline.execute
+     sink.data.should == [{:a => 1}]
+   end
+
+   it "passes rows through transforms" do
+     pipeline = described_class.new(source, :transformations => [transform.new]).
+       register_sink(:default, sink)
+
+     pipeline.execute
+     sink.data.should == [{:a => 2}]
+   end
+
+   it "writes rows to the appropriate sink for their stream, and strips the stream tag" do
+     error_sink = ArraySink.new
+
+     pipeline = described_class.new(source, :transformations => [add_error.new]).
+       register_sink(:default, sink).
+       register_sink(:error, error_sink)
+
+     pipeline.execute
+     sink.data.should == [{:a => 1}]
+     error_sink.data.should == [{:message => "error"}]
+   end
+
+   it "calls an error handler if sinks are not registered" do
+     error_handler = mock(:error_handler)
+     error_handler.should_receive(:unregistered_sinks).with([:default, :error])
+
+     pipeline = described_class.new(source,
+                                    :transformations => [add_error.new],
+                                    :error_handler => error_handler)
+
+     pipeline.validate_pipeline
+   end
+
+   it "by default raises an exception if the pipeline is not valid when executed" do
+     pipeline = described_class.new(source,
+                                    :transformations => [add_error.new])
+
+     expect { pipeline.execute }.to raise_error(Chicago::Flow::Error)
+   end
+
+   it "opens sinks before writing and closes them afterwards" do
+     sink = mock(:sink)
+     pipeline = described_class.new(source).register_sink(:default, sink)
+     sink.should_receive(:open)
+     sink.stub(:<<)
+     sink.should_receive(:close)
+     pipeline.execute
+   end
+ end
data/spec/spec_helper.rb ADDED
@@ -0,0 +1,20 @@
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
+ require 'rspec'
+ require 'yaml'
+ require 'chicago/flow'
+ require 'chicago/flow/mysql'
+
+ unless defined? TEST_DB
+   TEST_DB = Sequel.connect(YAML.load(File.read(File.dirname(__FILE__) + "/database.yml")))
+ end
+
+ include Chicago::Flow
+
+ # Requires supporting files with custom matchers and macros, etc,
+ # in ./support/ and its subdirectories.
+ Dir["#{File.dirname(__FILE__)}/support/**/*.rb"].each {|f| require f}
+
+ RSpec.configure do |config|
+
+ end
data/spec/transformation_chain_spec.rb ADDED
@@ -0,0 +1,75 @@
+ require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
+
+ describe TransformationChain do
+   let(:add_1_to_a) {
+     Class.new(Transformation) {
+       def process_row(row)
+         row[:a] += 1
+         row
+       end
+     }
+   }
+
+   let(:dup_row) {
+     Class.new(Transformation) {
+       def output_streams
+         [:default, @options[:onto]].flatten
+       end
+
+       def process_row(row)
+         new_row = assign_stream(row.dup, @options[:onto])
+         [row, new_row]
+       end
+     }
+   }
+
+   let(:store_until_flush) {
+     Class.new(Chicago::Flow::Transformation) {
+       def process_row(row)
+         @cache ||= []
+         @cache << row
+         nil
+       end
+
+       def flush
+         @cache
+       end
+     }
+   }
+
+   it "chains transformations" do
+     described_class.new(add_1_to_a.new, add_1_to_a.new).process({:a => 1}).
+       should == [{:a => 3}]
+   end
+
+   it "can cope with multiple return rows from transformations" do
+     described_class.new(add_1_to_a.new, dup_row.new, add_1_to_a.new).process({:a => 1}).
+       should == [{:a => 3}, {:a => 3}]
+   end
+
+   it "can cope with a filter returning nil" do
+     described_class.new(Filter.new, dup_row.new, add_1_to_a.new).process({:a => 1}).
+       should == []
+   end
+
+   it "can write to different streams" do
+     described_class.new(dup_row.new(:onto => :other),
+                         add_1_to_a.new).process({:a => 1}).
+       should == [{:a => 2}, {:a => 1, Chicago::Flow::STREAM => :other}]
+   end
+
+   it "knows what streams it writes to as a chain" do
+     described_class.new(dup_row.new(:onto => :other),
+                         add_1_to_a.new).output_streams.should == [:default, :other]
+   end
+
+   it "can flush rows held back by transforms" do
+     chain = described_class.new(store_until_flush.new,
+                                 add_1_to_a.new,
+                                 store_until_flush.new,
+                                 add_1_to_a.new)
+     chain.process({:a => 1}).should == []
+     chain.process({:a => 2}).should == []
+     chain.flush.should == [{:a => 3}, {:a => 4}]
+   end
+ end
data/spec/transformation_spec.rb ADDED
@@ -0,0 +1,73 @@
+ require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
+
+ describe Chicago::Flow::Transformation do
+   let(:add_1_to_a) {
+     Class.new(Chicago::Flow::Transformation) {
+       def process_row(row)
+         row[:a] += 1
+         row
+       end
+     }
+   }
+
+   let(:add_and_remove) {
+     Class.new(Chicago::Flow::Transformation) {
+       adds_fields :b, :c
+       removes_fields :a
+
+       def process_row(row)
+         row.delete(:a)
+         row[:b] = 1
+         row[:c] = 2
+         row
+       end
+     }
+   }
+
+   it "writes to the :default stream by default" do
+     subject.output_streams.should == [:default]
+   end
+
+   it "may apply to a particular stream" do
+     subject.applies_to_stream?(:default).should be_true
+     subject.applies_to_stream?(nil).should be_true
+     described_class.new(:other).applies_to_stream?(:default).should be_false
+     described_class.new(:other).applies_to_stream?(:other).should be_true
+   end
+
+   it "processes a row via #process_row" do
+     add_1_to_a.new.process({:a => 1}).should == {:a => 2}
+   end
+
+   it "passes through rows not on its stream" do
+     add_1_to_a.new(:other).process({:a => 1}).should == {:a => 1}
+   end
+
+   it "can apply to all streams using :all" do
+     add_1_to_a.new(:all).process({:a => 1}).should == {:a => 2}
+     add_1_to_a.new(:all).process({:a => 1, Chicago::Flow::STREAM => :other}).
+       should == {:a => 2, Chicago::Flow::STREAM => :other}
+   end
+
+   it "can be flushed" do
+     subject.flush.should == []
+   end
+
+   it "can specify which fields are added" do
+     add_and_remove.new.added_fields.should == [:b, :c]
+   end
+
+   it "can specify which fields are removed" do
+     add_and_remove.new.removed_fields.should == [:a]
+   end
+
+   it "can calculate downstream fields" do
+     Set.new(add_and_remove.new.downstream_fields([:a, :b, :d])).
+       should == Set.new([:b, :c, :d])
+   end
+
+   it "can calculate upstream fields" do
+     Set.new(add_and_remove.new.upstream_fields([:b, :c, :d])).
+       should == Set.new([:a, :d])
+   end
+ end
metadata ADDED
@@ -0,0 +1,210 @@
+ --- !ruby/object:Gem::Specification
+ name: chicago-flow
+ version: !ruby/object:Gem::Version
+   version: 0.0.1
+   prerelease:
+ platform: ruby
+ authors:
+ - Roland Swingler
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2013-06-05 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: fastercsv
+   requirement: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+ - !ruby/object:Gem::Dependency
+   name: sequel
+   requirement: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+ - !ruby/object:Gem::Dependency
+   name: sequel_load_data_infile
+   requirement: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: 0.0.2
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: 0.0.2
+ - !ruby/object:Gem::Dependency
+   name: sequel_fast_columns
+   requirement: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+ - !ruby/object:Gem::Dependency
+   name: mysql
+   requirement: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - '='
+       - !ruby/object:Gem::Version
+         version: 2.8.1
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - '='
+       - !ruby/object:Gem::Version
+         version: 2.8.1
+ - !ruby/object:Gem::Dependency
+   name: rspec
+   requirement: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: '2'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: '2'
+ - !ruby/object:Gem::Dependency
+   name: bundler
+   requirement: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: '1'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: '1'
+ - !ruby/object:Gem::Dependency
+   name: jeweler
+   requirement: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: 1.8.4
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: 1.8.4
+ description: Dataflow-style processing for hash-like rows
+ email: roland.swingler@gmail.com
+ executables: []
+ extensions: []
+ extra_rdoc_files:
+ - LICENSE.txt
+ - README.rdoc
+ files:
+ - .document
+ - .rspec
+ - Gemfile
+ - LICENSE.txt
+ - README.rdoc
+ - Rakefile
+ - VERSION
+ - lib/chicago/flow.rb
+ - lib/chicago/flow/array_sink.rb
+ - lib/chicago/flow/array_source.rb
+ - lib/chicago/flow/dataset_source.rb
+ - lib/chicago/flow/filter.rb
+ - lib/chicago/flow/mysql.rb
+ - lib/chicago/flow/mysql_file_serializer.rb
+ - lib/chicago/flow/mysql_file_sink.rb
+ - lib/chicago/flow/pipeline_endpoint.rb
+ - lib/chicago/flow/pipeline_stage.rb
+ - lib/chicago/flow/sink.rb
+ - lib/chicago/flow/transformation.rb
+ - lib/chicago/flow/transformation_chain.rb
+ - spec/array_sink_spec.rb
+ - spec/array_source_spec.rb
+ - spec/database.yml.dist
+ - spec/dataset_source_spec.rb
+ - spec/filter_spec.rb
+ - spec/mysql_file_serializer_spec.rb
+ - spec/mysql_file_sink_spec.rb
+ - spec/mysql_integration_spec.rb
+ - spec/pipeline_stage_spec.rb
+ - spec/spec_helper.rb
+ - spec/transformation_chain_spec.rb
+ - spec/transformation_spec.rb
+ homepage: http://github.com/notonthehighstreet/chicago-flow
+ licenses:
+ - MIT
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   none: false
+   requirements:
+   - - ! '>='
+     - !ruby/object:Gem::Version
+       version: '0'
+       segments:
+       - 0
+       hash: -2256243131001367107
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   none: false
+   requirements:
+   - - ! '>='
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubyforge_project:
+ rubygems_version: 1.8.24
+ signing_key:
+ specification_version: 3
+ summary: Dataflow-style processing for hash-like rows
+ test_files: []