chicago-etl 0.0.13 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71)
  1. data/Gemfile +8 -3
  2. data/README.rdoc +4 -1
  3. data/VERSION +1 -1
  4. data/chicago-etl.gemspec +59 -22
  5. data/chicago-flow.gemspec +92 -0
  6. data/lib/chicago/etl/batch.rb +9 -2
  7. data/lib/chicago/etl/core_extensions.rb +12 -0
  8. data/lib/chicago/etl/counter.rb +8 -1
  9. data/lib/chicago/etl/dataset_batch_stage.rb +52 -0
  10. data/lib/chicago/etl/key_builder.rb +17 -39
  11. data/lib/chicago/etl/load_dataset_builder.rb +3 -1
  12. data/lib/chicago/etl/load_pipeline_stage_builder.rb +142 -0
  13. data/lib/chicago/etl/pipeline.rb +151 -0
  14. data/lib/chicago/etl/schema_table_sink_factory.rb +74 -0
  15. data/lib/chicago/etl/screens/column_screen.rb +26 -25
  16. data/lib/chicago/etl/screens/invalid_element.rb +5 -5
  17. data/lib/chicago/etl/screens/missing_value.rb +4 -2
  18. data/lib/chicago/etl/screens/out_of_bounds.rb +2 -0
  19. data/lib/chicago/etl/table_builder.rb +4 -2
  20. data/lib/chicago/etl/task_invocation.rb +0 -1
  21. data/lib/chicago/etl/transformations.rb +128 -0
  22. data/lib/chicago/etl.rb +39 -8
  23. data/lib/chicago/flow/array_sink.rb +35 -0
  24. data/lib/chicago/flow/array_source.rb +15 -0
  25. data/lib/chicago/flow/dataset_source.rb +23 -0
  26. data/lib/chicago/flow/errors.rb +14 -0
  27. data/lib/chicago/flow/filter.rb +15 -0
  28. data/lib/chicago/flow/mysql.rb +4 -0
  29. data/lib/chicago/{etl/mysql_load_file_value_transformer.rb → flow/mysql_file_serializer.rb} +7 -4
  30. data/lib/chicago/flow/mysql_file_sink.rb +68 -0
  31. data/lib/chicago/flow/null_sink.rb +8 -0
  32. data/lib/chicago/flow/pipeline_endpoint.rb +15 -0
  33. data/lib/chicago/flow/pipeline_stage.rb +68 -0
  34. data/lib/chicago/flow/sink.rb +53 -0
  35. data/lib/chicago/flow/transformation.rb +169 -0
  36. data/lib/chicago/flow/transformation_chain.rb +40 -0
  37. data/spec/etl/batch_spec.rb +2 -1
  38. data/spec/etl/core_extensions_spec.rb +13 -0
  39. data/spec/etl/dataset_batch_stage_spec.rb +55 -0
  40. data/spec/etl/key_builder_spec.rb +25 -83
  41. data/spec/etl/pipeline_stage_builder_spec.rb +39 -0
  42. data/spec/etl/schema_table_sink_factory_spec.rb +69 -0
  43. data/spec/etl/screens/invalid_element_spec.rb +10 -11
  44. data/spec/etl/screens/missing_value_spec.rb +21 -21
  45. data/spec/etl/screens/out_of_bounds_spec.rb +21 -29
  46. data/spec/etl/transformations_spec.rb +109 -0
  47. data/spec/flow/array_sink_spec.rb +26 -0
  48. data/spec/flow/array_source_spec.rb +20 -0
  49. data/spec/flow/dataset_source_spec.rb +15 -0
  50. data/spec/flow/filter_spec.rb +13 -0
  51. data/spec/flow/mysql_file_serializer_spec.rb +27 -0
  52. data/spec/flow/mysql_file_sink_spec.rb +94 -0
  53. data/spec/flow/mysql_integration_spec.rb +72 -0
  54. data/spec/flow/pipeline_stage_spec.rb +89 -0
  55. data/spec/flow/transformation_chain_spec.rb +76 -0
  56. data/spec/flow/transformation_spec.rb +91 -0
  57. data/spec/spec_helper.rb +5 -0
  58. metadata +135 -39
  59. data/lib/chicago/etl/buffering_insert_writer.rb +0 -36
  60. data/lib/chicago/etl/mysql_dumpfile.rb +0 -32
  61. data/lib/chicago/etl/screens/composite_screen.rb +0 -17
  62. data/lib/chicago/etl/sequel/load_data_infile.rb +0 -141
  63. data/lib/chicago/etl/sink.rb +0 -61
  64. data/lib/chicago/etl/transformations/add_insert_timestamp.rb +0 -16
  65. data/spec/etl/mysql_dumpfile_spec.rb +0 -42
  66. data/spec/etl/mysql_load_file_value_transformer_spec.rb +0 -27
  67. data/spec/etl/screens/composite_screen_spec.rb +0 -25
  68. data/spec/etl/sequel/load_data_infile_expression_spec.rb +0 -60
  69. data/spec/etl/sequel/load_data_infile_spec.rb +0 -37
  70. data/spec/etl/sink_spec.rb +0 -7
  71. data/spec/etl/transformations/add_insert_timestamp_spec.rb +0 -9
data/spec/flow/array_source_spec.rb ADDED
@@ -0,0 +1,20 @@
+ require 'spec_helper'
+
+ describe Chicago::Flow::ArraySource do
+   it "has an each method that yields rows" do
+     described_class.new([{:a => 1}]).each do |row|
+       row.should == {:a => 1}
+     end
+   end
+
+   it "doesn't know about any fields rows have by default" do
+     described_class.new([]).fields.should == []
+     described_class.new([]).should_not have_defined_fields
+   end
+
+   it "can optionally define which fields will be in rows" do
+     described_class.new([], [:a, :b]).fields.should == [:a, :b]
+     described_class.new([], :a).fields.should == [:a]
+     described_class.new([], :a).should have_defined_fields
+   end
+ end
data/spec/flow/dataset_source_spec.rb ADDED
@@ -0,0 +1,15 @@
+ require 'spec_helper'
+
+ describe Chicago::Flow::DatasetSource do
+   let(:dataset) { stub(:dataset) }
+
+   it "should delegtate each to the dataset" do
+     dataset.should_receive(:each)
+     described_class.new(dataset).each {|row| }
+   end
+
+   it "gets columns from the dataset" do
+     dataset.should_receive(:columns)
+     described_class.new(dataset).fields
+   end
+ end
data/spec/flow/filter_spec.rb ADDED
@@ -0,0 +1,13 @@
+ require 'spec_helper'
+
+ describe Chicago::Flow::Filter do
+   it "filters all rows by default" do
+     subject.process({:a => 1}).should be_nil
+   end
+
+   it "filters rows given a block" do
+     filter = described_class.new {|row| row.has_key?(:a) }
+     filter.process(:a => 1).should == {:a => 1}
+     filter.process(:b => 1).should be_nil
+   end
+ end
data/spec/flow/mysql_file_serializer_spec.rb ADDED
@@ -0,0 +1,27 @@
+ require 'spec_helper'
+
+ describe Chicago::Flow::MysqlFileSerializer do
+   it "serializes nil into NULL" do
+     subject.serialize(nil).should == "NULL"
+   end
+
+   it "serializes true into '1'" do
+     subject.serialize(true).should == "1"
+   end
+
+   it "serializes false into '0'" do
+     subject.serialize(false).should == "0"
+   end
+
+   it "serializes times into mysql time format" do
+     subject.serialize(Time.local(2011,01,02,10,30,50)).should == "2011-01-02 10:30:50"
+   end
+
+   it "serializes datetimes into mysql time format" do
+     subject.serialize(DateTime.new(2011,01,02,10,30,50)).should == "2011-01-02 10:30:50"
+   end
+
+   it "serializes dates into mysql date format" do
+     subject.serialize(Date.new(2011,01,02)).should == "2011-01-02"
+   end
+ end
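Note (illustrative, not part of this diff): the serializer above appears to prepare row values before they are written to the CSV file that MySQL later ingests via LOAD DATA INFILE. A minimal sketch, assuming only the no-argument constructor and serialize method shown by the spec:

serializer = Chicago::Flow::MysqlFileSerializer.new
row        = {:paid => true, :paid_at => Time.local(2011, 1, 2, 10, 30, 50)}

# Serialize each value in column order, as a CSV-bound sink might do.
csv_row = [:paid, :paid_at].map { |field| serializer.serialize(row[field]) }
# => ["1", "2011-01-02 10:30:50"]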
data/spec/flow/mysql_file_sink_spec.rb ADDED
@@ -0,0 +1,94 @@
+ require 'spec_helper'
+ require 'sequel'
+
+ describe Chicago::Flow::MysqlFileSink do
+   let(:dataset) { mock(:dataset).as_null_object }
+   let(:db) { mock(:db, :[] => dataset, :schema => []) }
+   let(:csv) { mock(:csv) }
+
+   let(:sink) {
+     described_class.new(db, :table, [:foo], :filepath => "test_file")
+   }
+
+   before :each do
+     CSV.stub(:open).and_return(csv)
+     csv.stub(:<<)
+     csv.stub(:close).and_return(csv)
+     csv.stub(:flush)
+
+     File.stub(:size?).and_return(true)
+   end
+
+   it "has the same name as the table it is loading into" do
+     sink.name.should == :table
+   end
+
+   it "writes specified columns to rows in a file" do
+     csv.should_receive(:<<).with([1])
+     sink << {:foo => 1, :bar => 2}
+   end
+
+   it "serializes values before writing to the file" do
+     Chicago::Flow::MysqlFileSerializer.any_instance.
+       should_receive(:serialize).with(1).and_return(1)
+     sink << {:foo => 1}
+   end
+
+   it "has defined fields" do
+     sink.should have_defined_fields
+     sink.fields.should == [:foo]
+   end
+
+   it "loads the csv file into the database when closed" do
+     dataset.should_receive(:load_csv_infile).
+       with("test_file", [:foo], :set => {})
+     sink.close
+   end
+
+   it "uses the :set hash to load constant values" do
+     sink.set_constant_values(:bar => 1).should == sink
+     dataset.should_receive(:load_csv_infile).
+       with("test_file", [:foo], :set => {:bar => 1})
+     sink.close
+   end
+
+   it "does not IGNORE rows by default" do
+     dataset.should_not_receive(:insert_ignore)
+     sink.close
+   end
+
+   it "can specify that INSERT IGNORE should be used" do
+     dataset.should_receive(:insert_ignore)
+     described_class.new(db, :table, [:foo],
+                         :filepath => "test_file", :ignore => true).close
+   end
+
+   it "writes csv to a tempfile if no explicit filepath is given" do
+     described_class.new(db, :table, [:foo]).filepath.should match(/table\.\d+\.csv/)
+   end
+
+   it "doesn't attempt to load data if the file is empty or does not exist" do
+     File.stub(:size?).and_return(false)
+     dataset.should_not_receive(:load_csv_infile)
+     sink.close
+   end
+
+   it "removes the temporary file when closed" do
+     File.stub(:exists?).and_return(true)
+     File.should_receive(:unlink).with("test_file")
+
+     sink.close
+   end
+
+   it "truncates the table by default" do
+     dataset.should_receive(:truncate)
+     sink.truncate
+   end
+
+   it "can have a truncation strategy set" do
+     x = nil
+     sink.truncation_strategy = lambda { x = "deleted table" }
+     sink.truncate
+     x.should == "deleted table"
+   end
+ end
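Note (illustrative, not part of this diff): read together, these examples outline the sink's public surface — construct it with a Sequel database, a table name and the columns to write; push rows with <<; call close to bulk-load the buffered CSV and clean up. The sketch below follows that pattern; the connection URL, table and column names are assumptions.

require 'sequel'
require 'chicago/flow/mysql'

db   = Sequel.connect("mysql2://localhost/warehouse")      # assumed connection
sink = Chicago::Flow::MysqlFileSink.new(db, :dimension_users,
                                        [:id, :name],
                                        :ignore => true)    # load with INSERT IGNORE
sink.set_constant_values(:etl_batch_id => 42)               # applied via LOAD DATA ... SET

sink << {:id => 1, :name => "Alice", :extra => "dropped"}   # only :id and :name are written
sink << {:id => 2, :name => nil}                            # nil is serialized to NULL
sink.close                                                  # loads the CSV, then removes the temp file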
data/spec/flow/mysql_integration_spec.rb ADDED
@@ -0,0 +1,72 @@
+ require 'spec_helper'
+
+ describe "Mysql -> Mysql through transformation chain" do
+   let(:dup_row) {
+     Class.new(Chicago::Flow::Transformation) {
+       def output_streams
+         [:default, @options[:onto]].flatten
+       end
+
+       def process_row(row)
+         new_row = assign_stream(row.dup, @options[:onto])
+         [row, new_row]
+       end
+     }
+   }
+
+   before :all do
+     unless TEST_DB.table_exists?(:source)
+       TEST_DB.create_table(:source) do
+         primary_key :id
+         varchar :foo
+         binary :bin, :size => 1
+       end
+     end
+
+     unless TEST_DB.table_exists?(:destination)
+       TEST_DB.create_table(:destination) do
+         primary_key :id
+         varchar :foo
+         binary :bin, :size => 1
+       end
+     end
+   end
+
+   before :each do
+     TEST_DB[:source].truncate
+     TEST_DB[:destination].truncate
+   end
+
+   after :each do
+     TEST_DB[:source].truncate
+     TEST_DB[:destination].truncate
+   end
+
+   it "copies data from source to destination" do
+     TEST_DB[:source].multi_insert([{:foo => nil, :bin => :unhex.sql_function("1F")},
+                                    {:foo => "Hello", :bin => :unhex.sql_function("1F")}])
+
+     source = Chicago::Flow::DatasetSource.
+       new(TEST_DB[:source].
+           select(:id, :foo, :hex.sql_function(:bin).as(:bin)))
+     sink_1 = Chicago::Flow::MysqlFileSink.
+       new(TEST_DB, :destination, [:id, :foo, :bin])
+     sink_2 = Chicago::Flow::ArraySink.new([:id, :foo, :bin])
+
+     stage = Chicago::Flow::PipelineStage.
+       new(:transformations => [dup_row.new(:onto => :other)])
+
+     expect { stage.execute(source) }.to raise_error
+
+     stage.register_sink(:default, sink_1)
+     stage.register_sink(:other, sink_2)
+
+     stage.execute(source)
+
+     expected = [{:id => 1, :foo => nil, :bin => "1F"},
+                 {:id => 2, :foo => "Hello", :bin => "1F"}]
+
+     sink_2.data.should == expected
+     TEST_DB[:destination].select(:id, :foo, :hex.sql_function(:bin).as(:bin)).all.should == expected
+   end
+ end
data/spec/flow/pipeline_stage_spec.rb ADDED
@@ -0,0 +1,89 @@
+ require 'spec_helper'
+
+ describe Chicago::Flow::PipelineStage do
+   let(:transform) {
+     Class.new(Chicago::Flow::Transformation) {
+       def process_row(row)
+         row[:a] += 1
+         row
+       end
+     }
+   }
+
+   let(:add_error) {
+     Class.new(Chicago::Flow::Transformation) {
+       # add_output_stream :error
+       def output_streams
+         [:default, :error]
+       end
+
+       def process_row(row)
+         [row, {Chicago::Flow::STREAM => :error, :message => "error"}]
+       end
+     }
+   }
+
+   let(:sink) { Chicago::Flow::ArraySink.new(:test) }
+   let(:source) { Chicago::Flow::ArraySource.new([{:a => 1}]) }
+
+   it "returns all sinks" do
+     stage = described_class.new.register_sink(:default, sink)
+     stage.sinks.should == [sink]
+   end
+
+   it "returns a sink by name" do
+     stage = described_class.new.register_sink(:default, sink)
+     stage.sink(:default).should == sink
+   end
+
+   it "reads from source to sink" do
+     pipeline = described_class.new.register_sink(:default, sink)
+     pipeline.execute(source)
+     sink.data.should == [{:a => 1}]
+   end
+
+   it "passes rows through transforms" do
+     pipeline = described_class.new(:transformations => [transform.new]).
+       register_sink(:default, sink)
+
+     pipeline.execute(source)
+     sink.data.should == [{:a => 2}]
+   end
+
+   it "writes rows to the appropriate sink for their stream, and strips the stream tag" do
+     error_sink = Chicago::Flow::ArraySink.new(:test)
+
+     pipeline = described_class.new(:transformations => [add_error.new]).
+       register_sink(:default, sink).
+       register_sink(:error, error_sink)
+
+     pipeline.execute(source)
+     sink.data.should == [{:a => 1}]
+     error_sink.data.should == [{:message => "error"}]
+   end
+
+   it "calls an error handler if sinks are not registered" do
+     error_handler = mock(:error_handler)
+     error_handler.should_receive(:unregistered_sinks).
+       with([:default, :error])
+
+     pipeline = described_class.new(:transformations => [add_error.new],
+                                    :error_handler => error_handler)
+
+     pipeline.validate_pipeline
+   end
+
+   it "by default raises an exception if the pipeline is not valid when executed" do
+     pipeline = described_class.new(:transformations => [add_error.new])
+     expect { pipeline.execute(source) }.to raise_error(Chicago::Flow::Error)
+   end
+
+   it "opens sinks before writing and closes them afterwards" do
+     sink = mock(:sink)
+     pipeline = described_class.new.register_sink(:default, sink)
+     sink.should_receive(:open)
+     sink.stub(:<<)
+     sink.should_receive(:close)
+     pipeline.execute(source)
+   end
+ end
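Note (illustrative, not part of this diff): the stage wiring exercised above — one sink registered per output stream, rows tagged with Chicago::Flow::STREAM routed to the matching sink, validation failing when a declared stream has no sink — combines roughly as follows. The transformation class and row data here are invented for the example.

# Hypothetical screen-style transformation that routes bad rows to an :error stream.
flag_missing_name = Class.new(Chicago::Flow::Transformation) do
  def output_streams
    [:default, :error]
  end

  def process_row(row)
    return row if row[:name]
    [row, {Chicago::Flow::STREAM => :error, :message => "missing name", :id => row[:id]}]
  end
end

rows   = Chicago::Flow::ArraySource.new([{:id => 1, :name => "Alice"}, {:id => 2, :name => nil}])
good   = Chicago::Flow::ArraySink.new(:good)
errors = Chicago::Flow::ArraySink.new(:errors)

stage = Chicago::Flow::PipelineStage.new(:transformations => [flag_missing_name.new])
stage.register_sink(:default, good)    # every declared output stream needs a sink,
stage.register_sink(:error, errors)    # otherwise execute raises Chicago::Flow::Error

stage.execute(rows)
# good.data   => both rows, with the stream tag stripped
# errors.data => [{:message => "missing name", :id => 2}]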
data/spec/flow/transformation_chain_spec.rb ADDED
@@ -0,0 +1,76 @@
+ require 'spec_helper'
+
+ describe Chicago::Flow::TransformationChain do
+   let(:add_1_to_a) {
+     Class.new(Chicago::Flow::Transformation) {
+       def process_row(row)
+         row[:a] += 1
+         row
+       end
+     }
+   }
+
+   let(:dup_row) {
+     Class.new(Chicago::Flow::Transformation) {
+       def output_streams
+         [:default, @options[:onto]].flatten
+       end
+
+       def process_row(row)
+         new_row = assign_stream(row.dup, @options[:onto])
+         [row, new_row]
+       end
+     }
+   }
+
+   let(:store_until_flush) {
+     Class.new(Chicago::Flow::Transformation) {
+       def process_row(row)
+         @cache ||= []
+         @cache << row
+         nil
+       end
+
+       def flush
+         @cache
+       end
+     }
+   }
+
+   it "chains transformations" do
+     described_class.new(add_1_to_a.new, add_1_to_a.new).process({:a => 1}).
+       should == [{:a => 3}]
+   end
+
+   it "can cope with multiple return rows from transformations" do
+     described_class.new(add_1_to_a.new, dup_row.new, add_1_to_a.new).process({:a => 1}).
+       should == [{:a => 3}, {:a => 3}]
+   end
+
+   it "can cope with a filter returning nil" do
+     described_class.new(Chicago::Flow::Filter.new,
+                         dup_row.new, add_1_to_a.new).process({:a => 1}).
+       should == []
+   end
+
+   it "can write to different streams" do
+     described_class.new(dup_row.new(:onto => :other),
+                         add_1_to_a.new).process({:a => 1}).
+       should == [{:a => 2}, {:a => 1, Chicago::Flow::STREAM => :other}]
+   end
+
+   it "knows what streams it writes to as a chain" do
+     described_class.new(dup_row.new(:onto => :other),
+                         add_1_to_a.new).output_streams.should == [:default, :other]
+   end
+
+   it "can flush rows held back by transforms" do
+     chain = described_class.new(store_until_flush.new,
+                                 add_1_to_a.new,
+                                 store_until_flush.new,
+                                 add_1_to_a.new)
+     chain.process({:a => 1}).should == []
+     chain.process({:a => 2}).should == []
+     chain.flush.should == [{:a => 3}, {:a => 4}]
+   end
+ end
data/spec/flow/transformation_spec.rb ADDED
@@ -0,0 +1,91 @@
+ require 'spec_helper'
+
+ describe Chicago::Flow::Transformation do
+   let(:add_1_to_a) {
+     Class.new(described_class) {
+       def process_row(row)
+         row[:a] += 1
+         row
+       end
+     }
+   }
+
+   let(:add_and_remove) {
+     Class.new(described_class) {
+       adds_fields :b, :c
+       removes_fields :a
+
+       def process_row(row)
+         row.delete(:a)
+         row[:b] = 1
+         row[:c] = 2
+         row
+       end
+     }
+   }
+
+   it "writes to the :default stream by default" do
+     subject.output_streams.should == [:default]
+   end
+
+   it "may apply to a particular stream" do
+     subject.applies_to_stream?(:default).should be_true
+     subject.applies_to_stream?(nil).should be_true
+     described_class.new(:other).applies_to_stream?(:default).should be_false
+     described_class.new(:other).applies_to_stream?(:other).should be_true
+   end
+
+   it "processes a row via #process_row" do
+     add_1_to_a.new.process({:a => 1}).should == {:a => 2}
+   end
+
+   it "passes through rows not on its stream" do
+     add_1_to_a.new(:other).process({:a => 1}).should == {:a => 1}
+   end
+
+   it "can apply to all streams using :all" do
+     add_1_to_a.new(:all).process({:a => 1}).should == {:a => 2}
+     add_1_to_a.new(:all).process({:a => 1, Chicago::Flow::STREAM => :other}).
+       should == {:a => 2, Chicago::Flow::STREAM => :other}
+   end
+
+   it "can be flushed" do
+     subject.flush.should == []
+   end
+
+   it "can specify which fields are added" do
+     add_and_remove.new.added_fields.should == [:b, :c]
+   end
+
+   it "can specify which fields are removed" do
+     add_and_remove.new.removed_fields.should == [:a]
+   end
+
+   it "can calculate downstream fields" do
+     Set.new(add_and_remove.new.downstream_fields([:a, :b, :d])).
+       should == Set.new([:b, :c, :d])
+   end
+
+   it "can calculate upstream fields" do
+     Set.new(add_and_remove.new.upstream_fields([:b, :c, :d])).
+       should == Set.new([:a, :d])
+   end
+
+   it "has an empty array of added fields by default" do
+     subject.added_fields.should == []
+   end
+
+   it "has an empty array of removed fields by default" do
+     subject.removed_fields.should == []
+   end
+
+   it "has an empty array of required options by default" do
+     subject.required_options.should == []
+   end
+
+   it "can enforce options" do
+     klass = Class.new(described_class) { requires_options :foo }
+     expect { klass.new }.to raise_error(ArgumentError)
+     expect { klass.new(:foo => :bar) }.to_not raise_error(ArgumentError)
+   end
+ end
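Note (illustrative, not part of this diff): a hypothetical subclass showing how the class-level declarations exercised above (requires_options, adds_fields, removes_fields) and the @options hash appear to fit together. The class, field names and option are invented.

class SplitFullName < Chicago::Flow::Transformation
  requires_options :default_surname      # new without :default_surname raises ArgumentError
  adds_fields      :first_name, :surname
  removes_fields   :full_name

  def process_row(row)
    full = row.delete(:full_name).to_s
    first, surname = full.split(" ", 2)
    row[:first_name] = first
    row[:surname]    = surname || @options[:default_surname]
    row
  end
end

t = SplitFullName.new(:default_surname => "Unknown")
t.process({:full_name => "Ada Lovelace"})
# => {:first_name => "Ada", :surname => "Lovelace"}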
data/spec/spec_helper.rb CHANGED
@@ -1,3 +1,8 @@
+ if RUBY_VERSION.to_f >= 1.9
+   require 'simplecov'
+   SimpleCov.start { add_filter 'spec' }
+ end
+
  $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
  $LOAD_PATH.unshift(File.dirname(__FILE__))
  require 'rspec'