chicago-etl 0.0.13 → 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (71) hide show
  1. data/Gemfile +8 -3
  2. data/README.rdoc +4 -1
  3. data/VERSION +1 -1
  4. data/chicago-etl.gemspec +59 -22
  5. data/chicago-flow.gemspec +92 -0
  6. data/lib/chicago/etl/batch.rb +9 -2
  7. data/lib/chicago/etl/core_extensions.rb +12 -0
  8. data/lib/chicago/etl/counter.rb +8 -1
  9. data/lib/chicago/etl/dataset_batch_stage.rb +52 -0
  10. data/lib/chicago/etl/key_builder.rb +17 -39
  11. data/lib/chicago/etl/load_dataset_builder.rb +3 -1
  12. data/lib/chicago/etl/load_pipeline_stage_builder.rb +142 -0
  13. data/lib/chicago/etl/pipeline.rb +151 -0
  14. data/lib/chicago/etl/schema_table_sink_factory.rb +74 -0
  15. data/lib/chicago/etl/screens/column_screen.rb +26 -25
  16. data/lib/chicago/etl/screens/invalid_element.rb +5 -5
  17. data/lib/chicago/etl/screens/missing_value.rb +4 -2
  18. data/lib/chicago/etl/screens/out_of_bounds.rb +2 -0
  19. data/lib/chicago/etl/table_builder.rb +4 -2
  20. data/lib/chicago/etl/task_invocation.rb +0 -1
  21. data/lib/chicago/etl/transformations.rb +128 -0
  22. data/lib/chicago/etl.rb +39 -8
  23. data/lib/chicago/flow/array_sink.rb +35 -0
  24. data/lib/chicago/flow/array_source.rb +15 -0
  25. data/lib/chicago/flow/dataset_source.rb +23 -0
  26. data/lib/chicago/flow/errors.rb +14 -0
  27. data/lib/chicago/flow/filter.rb +15 -0
  28. data/lib/chicago/flow/mysql.rb +4 -0
  29. data/lib/chicago/{etl/mysql_load_file_value_transformer.rb → flow/mysql_file_serializer.rb} +7 -4
  30. data/lib/chicago/flow/mysql_file_sink.rb +68 -0
  31. data/lib/chicago/flow/null_sink.rb +8 -0
  32. data/lib/chicago/flow/pipeline_endpoint.rb +15 -0
  33. data/lib/chicago/flow/pipeline_stage.rb +68 -0
  34. data/lib/chicago/flow/sink.rb +53 -0
  35. data/lib/chicago/flow/transformation.rb +169 -0
  36. data/lib/chicago/flow/transformation_chain.rb +40 -0
  37. data/spec/etl/batch_spec.rb +2 -1
  38. data/spec/etl/core_extensions_spec.rb +13 -0
  39. data/spec/etl/dataset_batch_stage_spec.rb +55 -0
  40. data/spec/etl/key_builder_spec.rb +25 -83
  41. data/spec/etl/pipeline_stage_builder_spec.rb +39 -0
  42. data/spec/etl/schema_table_sink_factory_spec.rb +69 -0
  43. data/spec/etl/screens/invalid_element_spec.rb +10 -11
  44. data/spec/etl/screens/missing_value_spec.rb +21 -21
  45. data/spec/etl/screens/out_of_bounds_spec.rb +21 -29
  46. data/spec/etl/transformations_spec.rb +109 -0
  47. data/spec/flow/array_sink_spec.rb +26 -0
  48. data/spec/flow/array_source_spec.rb +20 -0
  49. data/spec/flow/dataset_source_spec.rb +15 -0
  50. data/spec/flow/filter_spec.rb +13 -0
  51. data/spec/flow/mysql_file_serializer_spec.rb +27 -0
  52. data/spec/flow/mysql_file_sink_spec.rb +94 -0
  53. data/spec/flow/mysql_integration_spec.rb +72 -0
  54. data/spec/flow/pipeline_stage_spec.rb +89 -0
  55. data/spec/flow/transformation_chain_spec.rb +76 -0
  56. data/spec/flow/transformation_spec.rb +91 -0
  57. data/spec/spec_helper.rb +5 -0
  58. metadata +135 -39
  59. data/lib/chicago/etl/buffering_insert_writer.rb +0 -36
  60. data/lib/chicago/etl/mysql_dumpfile.rb +0 -32
  61. data/lib/chicago/etl/screens/composite_screen.rb +0 -17
  62. data/lib/chicago/etl/sequel/load_data_infile.rb +0 -141
  63. data/lib/chicago/etl/sink.rb +0 -61
  64. data/lib/chicago/etl/transformations/add_insert_timestamp.rb +0 -16
  65. data/spec/etl/mysql_dumpfile_spec.rb +0 -42
  66. data/spec/etl/mysql_load_file_value_transformer_spec.rb +0 -27
  67. data/spec/etl/screens/composite_screen_spec.rb +0 -25
  68. data/spec/etl/sequel/load_data_infile_expression_spec.rb +0 -60
  69. data/spec/etl/sequel/load_data_infile_spec.rb +0 -37
  70. data/spec/etl/sink_spec.rb +0 -7
  71. data/spec/etl/transformations/add_insert_timestamp_spec.rb +0 -9
@@ -0,0 +1,20 @@
1
require 'spec_helper'

# Specs for ArraySource: an in-memory pipeline source backed by an
# array of row hashes, with optional up-front field declarations.
describe Chicago::Flow::ArraySource do
  it "has an each method that yields rows" do
    source = described_class.new([{:a => 1}])
    source.each do |row|
      row.should == {:a => 1}
    end
  end

  it "doesn't know about any fields rows have by default" do
    empty_source = described_class.new([])
    empty_source.fields.should == []
    empty_source.should_not have_defined_fields
  end

  it "can optionally define which fields will be in rows" do
    # Fields may be given as an array or as a single symbol.
    described_class.new([], [:a, :b]).fields.should == [:a, :b]
    described_class.new([], :a).fields.should == [:a]
    described_class.new([], :a).should have_defined_fields
  end
end
@@ -0,0 +1,15 @@
1
require 'spec_helper'

# Specs for DatasetSource: wraps a Sequel dataset as a pipeline source,
# delegating iteration and field discovery to the dataset.
describe Chicago::Flow::DatasetSource do
  let(:dataset) { stub(:dataset) }

  # Fixed typo in the example description ("delegtate" -> "delegate").
  it "should delegate each to the dataset" do
    dataset.should_receive(:each)
    described_class.new(dataset).each {|row| }
  end

  it "gets columns from the dataset" do
    dataset.should_receive(:columns)
    described_class.new(dataset).fields
  end
end
@@ -0,0 +1,13 @@
1
require 'spec_helper'

# Specs for Filter: a transformation that drops rows unless the
# configured block returns a truthy value.
describe Chicago::Flow::Filter do
  it "filters all rows by default" do
    # With no block, every row is filtered out.
    result = subject.process({:a => 1})
    result.should be_nil
  end

  it "filters rows given a block" do
    filter = described_class.new {|row| row.key?(:a) }
    filter.process(:a => 1).should == {:a => 1}
    filter.process(:b => 1).should be_nil
  end
end
@@ -0,0 +1,27 @@
1
require 'spec_helper'

# Specs for MysqlFileSerializer: converts Ruby values into the string
# forms expected by MySQL's LOAD DATA INFILE.
describe Chicago::Flow::MysqlFileSerializer do
  it "serializes nil into NULL" do
    subject.serialize(nil).should == "NULL"
  end

  it "serializes true into '1'" do
    subject.serialize(true).should == "1"
  end

  it "serializes false into '0'" do
    subject.serialize(false).should == "0"
  end

  # Integer literals are written without leading zeros below: a leading
  # zero makes a Ruby literal octal (and 08/09 would not parse at all).
  it "serializes times into mysql time format" do
    time = Time.local(2011, 1, 2, 10, 30, 50)
    subject.serialize(time).should == "2011-01-02 10:30:50"
  end

  it "serializes datetimes into mysql time format" do
    datetime = DateTime.new(2011, 1, 2, 10, 30, 50)
    subject.serialize(datetime).should == "2011-01-02 10:30:50"
  end

  it "serializes dates into mysql date format" do
    subject.serialize(Date.new(2011, 1, 2)).should == "2011-01-02"
  end
end
@@ -0,0 +1,94 @@
1
require 'spec_helper'
require 'sequel'

# Specs for MysqlFileSink: buffers rows into a CSV file and bulk-loads
# them into a MySQL table (via load_csv_infile) when closed.
describe Chicago::Flow::MysqlFileSink do
  let(:dataset) { mock(:dataset).as_null_object }
  let(:db)      { mock(:db, :[] => dataset, :schema => []) }
  let(:csv)     { mock(:csv) }

  let(:sink) {
    described_class.new(db, :table, [:foo], :filepath => "test_file")
  }

  before :each do
    # Stub out CSV and the filesystem so no real files are touched.
    csv.stub(:<<)
    csv.stub(:flush)
    csv.stub(:close).and_return(csv)
    CSV.stub(:open).and_return(csv)

    File.stub(:size?).and_return(true)
  end

  it "has the same name as the table it is loading into" do
    sink.name.should == :table
  end

  it "writes specified columns to rows in a file" do
    # Only :foo is configured, so :bar is dropped.
    csv.should_receive(:<<).with([1])
    sink << {:foo => 1, :bar => 2}
  end

  it "serializes values before writing to the file" do
    Chicago::Flow::MysqlFileSerializer.any_instance.
      should_receive(:serialize).with(1).and_return(1)
    sink << {:foo => 1}
  end

  it "has defined fields" do
    sink.should have_defined_fields
    sink.fields.should == [:foo]
  end

  it "loads the csv file into the database when closed" do
    dataset.should_receive(:load_csv_infile).
      with("test_file", [:foo], :set => {})
    sink.close
  end

  it "uses the :set hash to load constant values" do
    # set_constant_values returns self so calls can be chained.
    sink.set_constant_values(:bar => 1).should == sink
    dataset.should_receive(:load_csv_infile).
      with("test_file", [:foo], :set => {:bar => 1})
    sink.close
  end

  it "does not IGNORE rows by default" do
    dataset.should_not_receive(:insert_ignore)
    sink.close
  end

  it "can specify that INSERT IGNORE should be used" do
    dataset.should_receive(:insert_ignore)
    ignoring_sink = described_class.new(db, :table, [:foo],
                                        :filepath => "test_file",
                                        :ignore => true)
    ignoring_sink.close
  end

  it "writes csv to a tempfile if no explicit filepath is given" do
    path = described_class.new(db, :table, [:foo]).filepath
    path.should match(/table\.\d+\.csv/)
  end

  it "doesn't attempt to load data if the file is empty or does not exist" do
    File.stub(:size?).and_return(false)
    dataset.should_not_receive(:load_csv_infile)
    sink.close
  end

  it "removes the temporary file when closed" do
    File.stub(:exists?).and_return(true)
    File.should_receive(:unlink).with("test_file")

    sink.close
  end

  it "truncates the table by default" do
    dataset.should_receive(:truncate)
    sink.truncate
  end

  it "can have a truncation strategy set" do
    x = nil
    sink.truncation_strategy = lambda { x = "deleted table" }
    sink.truncate
    x.should == "deleted table"
  end
end
@@ -0,0 +1,72 @@
1
require 'spec_helper'

# Integration spec: rows flow from a MySQL source table through a
# transformation chain that duplicates each row onto a second stream,
# into a MySQL file sink (:default) and an in-memory sink (:other).
# Requires a TEST_DB Sequel connection (see spec_helper).
describe "Mysql -> Mysql through transformation chain" do
  # Transformation that emits each row twice: once on :default and a
  # duplicate tagged onto the stream named by the :onto option.
  let(:dup_row) {
    Class.new(Chicago::Flow::Transformation) {
      def output_streams
        [:default, @options[:onto]].flatten
      end

      def process_row(row)
        new_row = assign_stream(row.dup, @options[:onto])
        [row, new_row]
      end
    }
  }

  before :all do
    unless TEST_DB.table_exists?(:source)
      TEST_DB.create_table(:source) do
        primary_key :id
        varchar :foo
        binary :bin, :size => 1
      end
    end

    unless TEST_DB.table_exists?(:destination)
      TEST_DB.create_table(:destination) do
        primary_key :id
        varchar :foo
        binary :bin, :size => 1
      end
    end
  end

  before :each do
    TEST_DB[:source].truncate
    TEST_DB[:destination].truncate
  end

  after :each do
    TEST_DB[:source].truncate
    TEST_DB[:destination].truncate
  end

  it "copies data from source to destination" do
    TEST_DB[:source].multi_insert([{:foo => nil, :bin => :unhex.sql_function("1F")},
                                   {:foo => "Hello", :bin => :unhex.sql_function("1F")}])

    source = Chicago::Flow::DatasetSource.
      new(TEST_DB[:source].
          select(:id, :foo, :hex.sql_function(:bin).as(:bin)))
    sink_1 = Chicago::Flow::MysqlFileSink.
      new(TEST_DB, :destination, [:id, :foo, :bin])
    sink_2 = Chicago::Flow::ArraySink.new([:id, :foo, :bin])

    stage = Chicago::Flow::PipelineStage.
      new(:transformations => [dup_row.new(:onto => :other)])

    # Executing before sinks are registered must fail. Assert the
    # specific error class (as pipeline_stage_spec does) rather than a
    # bare raise_error, which would also pass on unrelated failures
    # such as a NameError or NoMethodError inside the block.
    expect { stage.execute(source) }.to raise_error(Chicago::Flow::Error)

    stage.register_sink(:default, sink_1)
    stage.register_sink(:other, sink_2)

    stage.execute(source)

    expected = [{:id => 1, :foo => nil, :bin => "1F"},
                {:id => 2, :foo => "Hello", :bin => "1F"}]

    sink_2.data.should == expected
    TEST_DB[:destination].select(:id, :foo, :hex.sql_function(:bin).as(:bin)).all.should == expected
  end
end
@@ -0,0 +1,89 @@
1
require 'spec_helper'

# Specs for PipelineStage: reads rows from a source, runs them through
# a transformation chain, and routes each row to the sink registered
# for its stream.
describe Chicago::Flow::PipelineStage do
  # Simple transformation: increments row[:a].
  let(:transform) {
    Class.new(Chicago::Flow::Transformation) {
      def process_row(row)
        row[:a] += 1
        row
      end
    }
  }

  # Emits the row unchanged plus an extra row on the :error stream.
  let(:add_error) {
    Class.new(Chicago::Flow::Transformation) {
      # add_output_stream :error
      def output_streams
        [:default, :error]
      end

      def process_row(row)
        [row, {Chicago::Flow::STREAM => :error, :message => "error"}]
      end
    }
  }

  let(:sink)   { Chicago::Flow::ArraySink.new(:test) }
  let(:source) { Chicago::Flow::ArraySource.new([{:a => 1}]) }

  it "returns all sinks" do
    stage = described_class.new.register_sink(:default, sink)
    stage.sinks.should == [sink]
  end

  it "returns a sink by name" do
    stage = described_class.new.register_sink(:default, sink)
    stage.sink(:default).should == sink
  end

  it "reads from source to sink" do
    stage = described_class.new.register_sink(:default, sink)
    stage.execute(source)
    sink.data.should == [{:a => 1}]
  end

  it "passes rows through transforms" do
    stage = described_class.new(:transformations => [transform.new]).
      register_sink(:default, sink)

    stage.execute(source)
    sink.data.should == [{:a => 2}]
  end

  it "writes rows to the appropriate sink for their stream, and strips the stream tag" do
    error_sink = Chicago::Flow::ArraySink.new(:test)

    stage = described_class.new(:transformations => [add_error.new]).
      register_sink(:default, sink).
      register_sink(:error, error_sink)

    stage.execute(source)
    sink.data.should == [{:a => 1}]
    error_sink.data.should == [{:message => "error"}]
  end

  it "calls an error handler if sinks are not registered" do
    error_handler = mock(:error_handler)
    error_handler.should_receive(:unregistered_sinks).
      with([:default, :error])

    stage = described_class.new(:transformations => [add_error.new],
                                :error_handler => error_handler)

    stage.validate_pipeline
  end

  it "by default raises an exception if the pipeline is not valid when executed" do
    stage = described_class.new(:transformations => [add_error.new])
    expect { stage.execute(source) }.to raise_error(Chicago::Flow::Error)
  end

  it "opens sinks before writing and closes them afterwards" do
    mock_sink = mock(:sink)
    stage = described_class.new.register_sink(:default, mock_sink)
    mock_sink.should_receive(:open)
    mock_sink.stub(:<<)
    mock_sink.should_receive(:close)
    stage.execute(source)
  end
end
@@ -0,0 +1,76 @@
1
require 'spec_helper'

# Specs for TransformationChain: composes transformations so that the
# output rows of one feed the next, including fan-out, filtering to
# nothing, stream tagging, and flushing held-back rows.
describe Chicago::Flow::TransformationChain do
  let(:add_1_to_a) {
    Class.new(Chicago::Flow::Transformation) {
      def process_row(row)
        row[:a] += 1
        row
      end
    }
  }

  # Duplicates each row onto the stream named by the :onto option.
  let(:dup_row) {
    Class.new(Chicago::Flow::Transformation) {
      def output_streams
        [:default, @options[:onto]].flatten
      end

      def process_row(row)
        new_row = assign_stream(row.dup, @options[:onto])
        [row, new_row]
      end
    }
  }

  # Holds every row back until #flush is called.
  let(:store_until_flush) {
    Class.new(Chicago::Flow::Transformation) {
      def process_row(row)
        @cache ||= []
        @cache << row
        nil
      end

      def flush
        @cache
      end
    }
  }

  it "chains transformations" do
    chain = described_class.new(add_1_to_a.new, add_1_to_a.new)
    chain.process({:a => 1}).should == [{:a => 3}]
  end

  it "can cope with multiple return rows from transformations" do
    chain = described_class.new(add_1_to_a.new, dup_row.new, add_1_to_a.new)
    chain.process({:a => 1}).should == [{:a => 3}, {:a => 3}]
  end

  it "can cope with a filter returning nil" do
    chain = described_class.new(Chicago::Flow::Filter.new,
                                dup_row.new, add_1_to_a.new)
    chain.process({:a => 1}).should == []
  end

  it "can write to different streams" do
    # add_1_to_a only applies to :default, so the duplicated row
    # keeps its original value and its stream tag.
    chain = described_class.new(dup_row.new(:onto => :other), add_1_to_a.new)
    chain.process({:a => 1}).
      should == [{:a => 2}, {:a => 1, Chicago::Flow::STREAM => :other}]
  end

  it "knows what streams it writes to as a chain" do
    chain = described_class.new(dup_row.new(:onto => :other), add_1_to_a.new)
    chain.output_streams.should == [:default, :other]
  end

  it "can flush rows held back by transforms" do
    chain = described_class.new(store_until_flush.new,
                                add_1_to_a.new,
                                store_until_flush.new,
                                add_1_to_a.new)
    chain.process({:a => 1}).should == []
    chain.process({:a => 2}).should == []
    chain.flush.should == [{:a => 3}, {:a => 4}]
  end
end
@@ -0,0 +1,91 @@
1
require 'spec_helper'

# Specs for Transformation: the base class for row transformations.
# Covers stream targeting, field add/remove declarations, upstream and
# downstream field calculation, flushing, and required options.
describe Chicago::Flow::Transformation do
  let(:add_1_to_a) {
    Class.new(described_class) {
      def process_row(row)
        row[:a] += 1
        row
      end
    }
  }

  let(:add_and_remove) {
    Class.new(described_class) {
      adds_fields :b, :c
      removes_fields :a

      def process_row(row)
        row.delete(:a)
        row[:b] = 1
        row[:c] = 2
        row
      end
    }
  }

  it "writes to the :default stream by default" do
    subject.output_streams.should == [:default]
  end

  it "may apply to a particular stream" do
    subject.applies_to_stream?(:default).should be_true
    subject.applies_to_stream?(nil).should be_true
    described_class.new(:other).applies_to_stream?(:default).should be_false
    described_class.new(:other).applies_to_stream?(:other).should be_true
  end

  it "processes a row via #process_row" do
    add_1_to_a.new.process({:a => 1}).should == {:a => 2}
  end

  it "passes through rows not on its stream" do
    add_1_to_a.new(:other).process({:a => 1}).should == {:a => 1}
  end

  it "can apply to all streams using :all" do
    add_1_to_a.new(:all).process({:a => 1}).should == {:a => 2}
    add_1_to_a.new(:all).process({:a => 1, Chicago::Flow::STREAM => :other}).
      should == {:a => 2, Chicago::Flow::STREAM => :other}
  end

  it "can be flushed" do
    subject.flush.should == []
  end

  it "can specify which fields are added" do
    add_and_remove.new.added_fields.should == [:b, :c]
  end

  it "can specify which fields are removed" do
    add_and_remove.new.removed_fields.should == [:a]
  end

  it "can calculate downstream fields" do
    Set.new(add_and_remove.new.downstream_fields([:a, :b, :d])).
      should == Set.new([:b, :c, :d])
  end

  it "can calculate upstream fields" do
    Set.new(add_and_remove.new.upstream_fields([:b, :c, :d])).
      should == Set.new([:a, :d])
  end

  it "has an empty array of added fields by default" do
    subject.added_fields.should == []
  end

  it "has an empty array of removed fields by default" do
    subject.removed_fields.should == []
  end

  it "has an empty array of required options by default" do
    subject.required_options.should == []
  end

  it "can enforce options" do
    klass = Class.new(described_class) { requires_options :foo }
    expect { klass.new }.to raise_error(ArgumentError)
    # A negated raise_error expectation must not name a specific class:
    # `to_not raise_error(ArgumentError)` would pass if ANY other error
    # were raised, hiding real failures (RSpec 3 rejects this form).
    expect { klass.new(:foo => :bar) }.to_not raise_error
  end
end
data/spec/spec_helper.rb CHANGED
@@ -1,3 +1,8 @@
1
# Coverage reporting is only available on Ruby 1.9+, so guard the
# SimpleCov setup by interpreter version (1.8 is still supported).
if RUBY_VERSION.to_f >= 1.9
  require 'simplecov'
  SimpleCov.start { add_filter 'spec' }
end

# Make lib/ and spec/ requirable without installing the gem.
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
$LOAD_PATH.unshift(File.dirname(__FILE__))
require 'rspec'