chicago-etl 0.1.3 → 0.1.4

Sign up to get free protection for your applications and to get access to all the features.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.1.3
1
+ 0.1.4
data/chicago-etl.gemspec CHANGED
@@ -5,7 +5,7 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = "chicago-etl"
8
- s.version = "0.1.3"
8
+ s.version = "0.1.4"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Roland Swingler"]
@@ -32,10 +32,11 @@ Gem::Specification.new do |s|
32
32
  "lib/chicago/etl/core_extensions.rb",
33
33
  "lib/chicago/etl/counter.rb",
34
34
  "lib/chicago/etl/dataset_batch_stage.rb",
35
+ "lib/chicago/etl/dataset_builder.rb",
35
36
  "lib/chicago/etl/key_builder.rb",
36
37
  "lib/chicago/etl/load_dataset_builder.rb",
37
- "lib/chicago/etl/load_pipeline_stage_builder.rb",
38
38
  "lib/chicago/etl/pipeline.rb",
39
+ "lib/chicago/etl/schema_sinks_and_transformations_builder.rb",
39
40
  "lib/chicago/etl/schema_table_sink_factory.rb",
40
41
  "lib/chicago/etl/screens/column_screen.rb",
41
42
  "lib/chicago/etl/screens/invalid_element.rb",
@@ -62,7 +63,6 @@ Gem::Specification.new do |s|
62
63
  "lib/chicago/flow/mysql_file_sink.rb",
63
64
  "lib/chicago/flow/null_sink.rb",
64
65
  "lib/chicago/flow/pipeline_endpoint.rb",
65
- "lib/chicago/flow/pipeline_stage.rb",
66
66
  "lib/chicago/flow/sink.rb",
67
67
  "lib/chicago/flow/transformation.rb",
68
68
  "lib/chicago/flow/transformation_chain.rb",
@@ -70,7 +70,7 @@ Gem::Specification.new do |s|
70
70
  "spec/etl/batch_spec.rb",
71
71
  "spec/etl/core_extensions_spec.rb",
72
72
  "spec/etl/counter_spec.rb",
73
- "spec/etl/dataset_batch_stage_spec.rb",
73
+ "spec/etl/define_dimension_stage_spec.rb",
74
74
  "spec/etl/define_stage_spec.rb",
75
75
  "spec/etl/etl_batch_id_dataset_filter.rb",
76
76
  "spec/etl/key_builder_spec.rb",
@@ -82,6 +82,7 @@ Gem::Specification.new do |s|
82
82
  "spec/etl/screens/out_of_bounds_spec.rb",
83
83
  "spec/etl/sequel/dependant_tables_spec.rb",
84
84
  "spec/etl/sequel/filter_to_etl_batch_spec.rb",
85
+ "spec/etl/stage_spec.rb",
85
86
  "spec/etl/table_builder_spec.rb",
86
87
  "spec/etl/task_spec.rb",
87
88
  "spec/etl/transformations/deduplicate_rows_spec.rb",
@@ -95,7 +96,6 @@ Gem::Specification.new do |s|
95
96
  "spec/flow/mysql_file_serializer_spec.rb",
96
97
  "spec/flow/mysql_file_sink_spec.rb",
97
98
  "spec/flow/mysql_integration_spec.rb",
98
- "spec/flow/pipeline_stage_spec.rb",
99
99
  "spec/flow/transformation_chain_spec.rb",
100
100
  "spec/flow/transformation_spec.rb",
101
101
  "spec/spec_helper.rb"
@@ -4,48 +4,28 @@ module Chicago
4
4
  #
5
5
  # Allows deferring constructing a DatasetSource until extract
6
6
  # time, so that it can be filtered to an ETL batch appropriately.
7
- class DatasetBatchStage
7
+ class DatasetBatchStage < Stage
8
8
  attr_reader :name
9
9
 
10
- def initialize(name, dataset, pipeline_stage, options={})
11
- @name = name
12
- @dataset = dataset
13
- @pipeline_stage = pipeline_stage
14
- @filter_strategy = options[:filter_strategy] || lambda {|dataset, etl_batch|
15
- dataset.filter_to_etl_batch(etl_batch)
16
- }
10
+ def initialize(name, options={})
11
+ super
12
+ @filter_strategy = options[:filter_strategy] ||
13
+ lambda { |dataset, etl_batch| @source.filter_to_etl_batch(etl_batch)}
17
14
  @truncate_pre_load = !!options[:truncate_pre_load]
18
- end
15
+ end
19
16
 
20
17
  # Executes this ETL stage.
21
18
  #
22
19
  # Configures the dataset and flows rows into the pipeline.
23
20
  def execute(etl_batch, reextract=false)
24
21
  if @truncate_pre_load
25
- pipeline_stage.sinks.each {|sink| sink.truncate }
26
- elsif reextract && pipeline_stage.sink(:error)
27
- pipeline_stage.sink(:error).truncate
28
- end
29
-
30
- pipeline_stage.execute(source(etl_batch, reextract))
31
- end
32
-
33
- # Returns the pipeline for this stage.
34
- def pipeline_stage
35
- @pipeline_stage.sink(:default).
36
- set_constant_values(:_inserted_at => Time.now)
37
- @pipeline_stage
38
- end
39
-
40
- # Returns a DatasetSource for the provided dataset filtered to
41
- # the ETL batch as appropriate.
42
- def source(etl_batch, reextract=false)
43
- if reextract
44
- filtered_dataset = @dataset
45
- else
46
- filtered_dataset = @filter_strategy.call(@dataset, etl_batch)
22
+ sinks.each {|sink| sink.truncate }
23
+ elsif reextract && sink(:error)
24
+ sink(:error).truncate
47
25
  end
48
- Chicago::Flow::DatasetSource.new(filtered_dataset)
26
+
27
+ sink(:default).set_constant_values(:_inserted_at => Time.now)
28
+ super
49
29
  end
50
30
  end
51
31
  end
@@ -0,0 +1,60 @@
1
+ module Chicago
2
+ module ETL
3
+ # Provides convenience methods for defining source datasets.
4
+ class DatasetBuilder
5
+ attr_reader :db
6
+
7
+ # @api private
8
+ def initialize(db)
9
+ @db = db
10
+ end
11
+
12
+ # @api private
13
+ def build(&block)
14
+ instance_eval(&block)
15
+ end
16
+
17
+ protected
18
+
19
+ def key_field(field, name)
20
+ :if[{field => nil}, 1, field].as(name)
21
+ end
22
+
23
+ # Returns a column for use in a Sequel::Dataset#select method to
24
+ # return a dimension key.
25
+ #
26
+ # Takes care of using the key tables correctly, and dealing with
27
+ # missing dimension values.
28
+ def dimension_key(name)
29
+ key_field("keys_dimension_#{name}__dimension_id".to_sym,
30
+ "#{name}_dimension_id".to_sym)
31
+ end
32
+
33
+ # Returns a column for use in a Sequel::Dataset#select method to
34
+ # return a date dimension key.
35
+ def date_dimension_column(dimension)
36
+ :if.sql_function({:id.qualify(dimension) => nil},
37
+ 1,
38
+ :id.qualify(dimension)).
39
+ as("#{dimension}_dimension_id".to_sym)
40
+ end
41
+
42
+ # Rounds a monetary value to 2 decimal places.
43
+ #
44
+ # By default, natural rounding is used, you can specify either
45
+ # :up or :down as the direction.
46
+ #
47
+ # @deprecated
48
+ def round(stmt, direction = :none)
49
+ case direction
50
+ when :none
51
+ :round.sql_function(stmt, 2)
52
+ when :up
53
+ :ceil.sql_function(stmt * 100) / 100
54
+ when :down
55
+ :floor.sql_function(stmt * 100) / 100
56
+ end
57
+ end
58
+ end
59
+ end
60
+ end
@@ -61,13 +61,17 @@ module Chicago
61
61
  # @api private
62
62
  def build(name, &block)
63
63
  instance_eval &block
64
- unless defined? @pipeline_stage
64
+ unless defined? @sinks_and_transformations
65
65
  pipeline do
66
66
  end
67
67
  end
68
- DatasetBatchStage.new(name, @dataset, @pipeline_stage,
69
- :filter_strategy => @filter_strategy,
70
- :truncate_pre_load => @truncate_pre_load)
68
+ DatasetBatchStage.new(name,
69
+ :source => @dataset,
70
+ :transformations => @sinks_and_transformations[:transformations],
71
+ :sinks => @sinks_and_transformations[:sinks],
72
+ :filter_strategy => @filter_strategy,
73
+ :truncate_pre_load => @truncate_pre_load)
74
+
71
75
  end
72
76
 
73
77
  protected
@@ -88,7 +92,7 @@ module Chicago
88
92
  # for details.
89
93
  # TODO: rename pipeline => transforms below this method
90
94
  def pipeline(&block)
91
- @pipeline_stage = LoadPipelineStageBuilder.new(@db, @schema_table).
95
+ @sinks_and_transformations = SchemaSinksAndTransformationsBuilder.new(@db, @schema_table).
92
96
  build(&block)
93
97
  end
94
98
 
@@ -106,62 +110,5 @@ module Chicago
106
110
  @filter_strategy = block
107
111
  end
108
112
  end
109
-
110
- # Provides convenience methods for defining source datasets.
111
- class DatasetBuilder
112
- attr_reader :db
113
-
114
- # @api private
115
- def initialize(db)
116
- @db = db
117
- end
118
-
119
- # @api private
120
- def build(&block)
121
- instance_eval(&block)
122
- end
123
-
124
- protected
125
-
126
- def key_field(field, name)
127
- :if[{field => nil}, 1, field].as(name)
128
- end
129
-
130
- # Returns a column for use in a Sequel::Dataset#select method to
131
- # return a dimension key.
132
- #
133
- # Takes care of using the key tables correctly, and dealing with
134
- # missing dimension values.
135
- def dimension_key(name)
136
- key_field("keys_dimension_#{name}__dimension_id".to_sym,
137
- "#{name}_dimension_id".to_sym)
138
- end
139
-
140
- # Returns a column for use in a Sequel::Dataset#select method to
141
- # return a date dimension key.
142
- def date_dimension_column(dimension)
143
- :if.sql_function({:id.qualify(dimension) => nil},
144
- 1,
145
- :id.qualify(dimension)).
146
- as("#{dimension}_dimension_id".to_sym)
147
- end
148
-
149
- # Rounds a monetary value to 2 decimal places.
150
- #
151
- # By default, natural rounding is used, you can specify either
152
- # :up or :down as the direction.
153
- #
154
- # @deprecated
155
- def round(stmt, direction = :none)
156
- case direction
157
- when :none
158
- :round.sql_function(stmt, 2)
159
- when :up
160
- :ceil.sql_function(stmt * 100) / 100
161
- when :down
162
- :floor.sql_function(stmt * 100) / 100
163
- end
164
- end
165
- end
166
113
  end
167
114
  end
@@ -5,7 +5,7 @@ module Chicago
5
5
  #
6
6
  # Clients will not normally instantiate this themselves but use it
7
7
  # in the context of defining an ETL stage.
8
- class LoadPipelineStageBuilder
8
+ class SchemaSinksAndTransformationsBuilder
9
9
  # @api private
10
10
  KeyMapping = Struct.new(:table, :field)
11
11
 
@@ -41,9 +41,9 @@ module Chicago
41
41
  add_screens
42
42
  add_key_transforms
43
43
  add_final_transforms
44
- pipeline_stage = create_pipeline_stage
45
- register_additional_sinks(pipeline_stage)
46
- pipeline_stage
44
+ sinks_and_transformations = create_sinks_and_transformations
45
+ register_additional_sinks(sinks_and_transformations)
46
+ sinks_and_transformations
47
47
  end
48
48
 
49
49
  protected
@@ -81,7 +81,7 @@ module Chicago
81
81
 
82
82
  private
83
83
 
84
- def create_pipeline_stage
84
+ def create_sinks_and_transformations
85
85
  default = @sink_factory.sink(:ignore => @ignore_present_rows,
86
86
  :exclude => @load_separately)
87
87
  key_sink = if @schema_table.kind_of?(Chicago::Schema::Dimension)
@@ -90,24 +90,26 @@ module Chicago
90
90
  # Facts have no key table to write to.
91
91
  Flow::NullSink.new
92
92
  end
93
-
94
- Flow::PipelineStage.
95
- new(:transformations => concat_transformations,
96
- :sinks => {
97
- :default => default,
98
- :dimension_key => key_sink,
99
- :error => @sink_factory.error_sink
100
- })
93
+
94
+ {
95
+ :transformations => concat_transformations,
96
+ :sinks => {
97
+ :default => default,
98
+ :dimension_key => key_sink,
99
+ :error => @sink_factory.error_sink
100
+ }
101
+ }
101
102
  end
102
103
 
103
104
  def concat_transformations
104
105
  TRANSFORMATION_ORDER.map {|k| @transformations[k] }.flatten
105
106
  end
106
107
 
107
- def register_additional_sinks(pipeline_stage)
108
+ def register_additional_sinks(sinks_and_transformations)
109
+ sinks = sinks_and_transformations[:sinks]
108
110
  @key_mappings.each do |mapping|
109
111
  sink = @sink_factory.key_sink(:table => mapping.table)
110
- pipeline_stage.register_sink(mapping.table, sink)
112
+ sinks[mapping.table] = sink
111
113
  end
112
114
  end
113
115
 
@@ -1,42 +1,30 @@
1
1
  module Chicago
2
2
  module ETL
3
+ # A Stage in the ETL pipeline.
4
+ #
5
+ # A Stage wires together a Source, 0 or more Transformations and 1
6
+ # or more Sinks.
3
7
  class Stage
8
+ # Returns the source for this stage.
9
+ attr_reader :source
10
+
11
+ # Returns the name of this stage.
4
12
  attr_reader :name
5
13
 
6
14
  def initialize(name, options={})
7
15
  @name = name
8
- @source = options.fetch(:source)
9
- raise ArgumentError, "Stage #{name} requires a source" unless @source
10
-
11
- @sinks = options.fetch(:sinks)
12
- raise ArgumentError, "Stage #{name} requires at least one sink" if @sinks.empty?
13
-
14
- @transformations = options.fetch(:transformations)
15
- @transformation_chain = Chicago::Flow::TransformationChain.
16
- new(*@transformations)
17
-
16
+ @source = options[:source]
17
+ @sinks = options[:sinks]
18
+ @transformations = options[:transformations] || []
18
19
  @filter_strategy = options[:filter_strategy] ||
19
20
  lambda {|source, _| source }
20
- end
21
21
 
22
- def execute(etl_batch, reextract)
23
- modified_source = reextract_and_filter_source(@source, etl_batch, reextract)
24
- transform_and_load_from(modified_source)
25
- end
26
-
27
- def transform_and_load_from(source)
22
+ validate_arguments
28
23
  end
29
24
 
30
- def reextract_and_filter_source(source, etl_batch, reextract=false)
31
- if reextract
32
- filtered_dataset = source
33
- else
34
- filtered_dataset = @filter_strategy.call(source, etl_batch)
35
- end
36
- Chicago::Flow::DatasetSource.new(filtered_dataset)
25
+ def execute(etl_batch, reextract=false)
26
+ transform_and_load filtered_source(etl_batch, reextract)
37
27
  end
38
-
39
- attr_reader :transformation_chain
40
28
 
41
29
  # Returns the named sink, if it exists
42
30
  def sink(name)
@@ -46,20 +34,22 @@ module Chicago
46
34
  def sinks
47
35
  @sinks.values
48
36
  end
37
+
38
+ def filtered_source(etl_batch, reextract=false)
39
+ filtered_dataset = reextract ? source :
40
+ @filter_strategy.call(source, etl_batch)
49
41
 
50
- def register_sink(name, sink)
51
- @sinks[name.to_sym] = sink
52
- self
42
+ Chicago::Flow::DatasetSource.new(filtered_dataset)
53
43
  end
54
-
55
- def transform_and_load_from(source)
44
+
45
+ private
46
+
47
+ def transform_and_load(source)
56
48
  sinks.each(&:open)
57
49
  pipe_rows_to_sinks_from(source)
58
50
  sinks.each(&:close)
59
51
  end
60
-
61
- private
62
-
52
+
63
53
  def pipe_rows_to_sinks_from(source)
64
54
  source.each do |row|
65
55
  transformation_chain.process(row).each {|row| process_row(row) }
@@ -67,10 +57,25 @@ module Chicago
67
57
  transformation_chain.flush.each {|row| process_row(row) }
68
58
  end
69
59
 
60
+ def transformation_chain
61
+ @transformation_chain ||= Chicago::Flow::TransformationChain.
62
+ new(*@transformations)
63
+ end
64
+
70
65
  def process_row(row)
71
66
  stream = row.delete(:_stream) || :default
72
67
  @sinks[stream] << row
73
68
  end
69
+
70
+ def validate_arguments
71
+ if @source.nil?
72
+ raise ArgumentError, "Stage #{@name} requires a source"
73
+ end
74
+
75
+ if @sinks.blank?
76
+ raise ArgumentError, "Stage #{@name} requires at least one sink"
77
+ end
78
+ end
74
79
  end
75
80
  end
76
81
  end
@@ -1,8 +1,6 @@
1
1
  module Chicago
2
2
  module ETL
3
3
  class StageBuilder
4
- attr_reader :sink_factory
5
-
6
4
  def initialize(db)
7
5
  @db = db
8
6
  end
@@ -20,15 +18,17 @@ module Chicago
20
18
  :filter_strategy => @filter_strategy)
21
19
  end
22
20
 
21
+ protected
22
+
23
23
  def source(&block)
24
24
  @dataset = DatasetBuilder.new(@db).build(&block)
25
25
  end
26
26
 
27
- def transformations(klass=TransformationBuilder, &block)
28
- @transformations = klass.new.build(&block)
27
+ def transformations(&block)
28
+ @transformations = TransformationBuilder.new.build(&block)
29
29
  end
30
30
 
31
- def sinks(options={}, &block)
31
+ def sinks(&block)
32
32
  @sinks = SinkBuilder.new.build(&block)
33
33
  end
34
34
 
data/lib/chicago/etl.rb CHANGED
@@ -10,7 +10,6 @@ require 'chicago/flow/errors'
10
10
  require 'chicago/flow/transformation'
11
11
  require 'chicago/flow/filter'
12
12
  require 'chicago/flow/transformation_chain'
13
- require 'chicago/flow/pipeline_stage'
14
13
  require 'chicago/flow/pipeline_endpoint'
15
14
  require 'chicago/flow/array_source'
16
15
  require 'chicago/flow/dataset_source'
@@ -25,12 +24,12 @@ require 'chicago/etl/key_builder'
25
24
  require 'chicago/etl/schema_table_sink_factory'
26
25
  require 'chicago/etl/transformations'
27
26
  require 'chicago/etl/load_dataset_builder'
28
- require 'chicago/etl/dataset_batch_stage'
29
- require 'chicago/etl/load_pipeline_stage_builder'
30
- require 'chicago/etl/pipeline'
31
-
27
+ require 'chicago/etl/dataset_builder'
32
28
  require 'chicago/etl/stage'
33
29
  require 'chicago/etl/stage_builder'
30
+ require 'chicago/etl/dataset_batch_stage'
31
+ require 'chicago/etl/schema_sinks_and_transformations_builder'
32
+ require 'chicago/etl/pipeline'
34
33
 
35
34
  # Sequel Extensions
36
35
  require 'chicago/etl/sequel/filter_to_etl_batch'
@@ -0,0 +1,35 @@
1
+ require 'spec_helper'
2
+
3
+ describe "creating and running a dimension stage" do
4
+ let(:rows) { [{:some_field => "value"}] }
5
+ let(:db) { double(:db).as_null_object }
6
+ let(:schema) {
7
+ schema = Chicago::StarSchema.new
8
+
9
+ schema.define_dimension(:test) do
10
+ columns do
11
+ string :foo
12
+ end
13
+ end
14
+
15
+ schema
16
+ }
17
+
18
+ let(:pipeline) { Chicago::ETL::Pipeline.new(db, schema)}
19
+
20
+ it "glues the source, transformations, and sink correctly" do
21
+ pipeline.define_dimension_load(:test) do
22
+ dataset do
23
+ db.test_dataset_method
24
+ end
25
+ end
26
+
27
+ pipeline.stages.each do |stage|
28
+ stage.execute(double, true)
29
+ end
30
+ end
31
+
32
+ it "should set the inserted at time on the dimension"
33
+
34
+ it "truncates the dimension if specified"
35
+ end
@@ -39,26 +39,6 @@ describe "defining and executing a stage" do
39
39
  stage.sink(:another_stream).data.length.should == 0
40
40
  end
41
41
 
42
- it "requires sinks" do
43
- expect {
44
- pipeline.define_stage(:test_stage) do
45
- source do
46
- db.test_dataset_method
47
- end
48
- end
49
- }.to raise_error(ArgumentError)
50
- end
51
-
52
- it "requires sources" do
53
- expect {
54
- pipeline.define_stage(:test_stage) do
55
- sinks do
56
- add Chicago::Flow::ArraySink.new(:test)
57
- end
58
- end
59
- }.to raise_error(ArgumentError)
60
- end
61
-
62
42
  it "glues the source, transformations, and sink correctly" do
63
43
  pipeline.define_stage(:test_stage) do
64
44
  source do
@@ -90,8 +70,8 @@ describe "defining and executing a stage" do
90
70
  it "allows the source to be filtered via a filter strategy" do
91
71
  etl_batch_double = double
92
72
  fake_source = []
73
+ fake_source.should_receive(:another_dataset_method).and_return([])
93
74
 
94
- fake_source.should_receive(:another_dataset_method).and_return([])
95
75
  pipeline.define_stage(:test_stage) do
96
76
  source do
97
77
  fake_source
@@ -1,6 +1,6 @@
1
1
  require 'spec_helper'
2
2
 
3
- describe Chicago::ETL::LoadPipelineStageBuilder do
3
+ describe Chicago::ETL::SchemaSinksAndTransformationsBuilder do
4
4
  let(:dimension) { stub(:dimension).as_null_object }
5
5
  let(:db) { stub(:db).as_null_object }
6
6
  let(:sink_factory) { stub(:sink_factory).as_null_object }
@@ -34,6 +34,6 @@ describe Chicago::ETL::LoadPipelineStageBuilder do
34
34
  key_mapping :bar, :original_id
35
35
  end
36
36
 
37
- stage.sink(:bar).should_not be_nil
37
+ stage[:sinks][:bar].should_not be_nil
38
38
  end
39
39
  end
@@ -0,0 +1,40 @@
1
+ require 'spec_helper'
2
+
3
+ describe Chicago::ETL::Stage do
4
+ it "requires a source" do
5
+ expect {
6
+ described_class.new(:test,
7
+ :source => nil,
8
+ :sinks => {:default => stub(:sink)})
9
+ }.to raise_error(ArgumentError)
10
+ end
11
+
12
+ it "requires sinks" do
13
+ expect {
14
+ described_class.new(:test,
15
+ :source => stub(:source),
16
+ :sinks => nil)
17
+ }.to raise_error(ArgumentError)
18
+ end
19
+
20
+ it "does not filter the dataset if re-extracting" do
21
+ stage = described_class.new(:test,
22
+ :source => stub(:source),
23
+ :sinks => {:default => stub(:sink)},
24
+ :filter_strategy => lambda { fail })
25
+
26
+ stage.filtered_source(stub(:etl_batch), true)
27
+ end
28
+
29
+ it "opens sinks before writing and closes them afterwards" do
30
+ sink = mock(:sink)
31
+ sink.should_receive(:open)
32
+ sink.should_receive(:close)
33
+
34
+ stage = described_class.new(:test,
35
+ :source => [],
36
+ :sinks => {:default => sink})
37
+
38
+ stage.execute(stub(:etl_batch), true)
39
+ end
40
+ end
@@ -44,29 +44,33 @@ describe "Mysql -> Mysql through transformation chain" do
44
44
 
45
45
  it "copies data from source to destination" do
46
46
  TEST_DB[:source].multi_insert([{:foo => nil, :bin => :unhex.sql_function("1F")},
47
- {:foo => "Hello", :bin => :unhex.sql_function("1F")}])
48
-
47
+ {:foo => "Hello", :bin => :unhex.sql_function("1F")}])
48
+
49
49
  source = Chicago::Flow::DatasetSource.
50
50
  new(TEST_DB[:source].
51
51
  select(:id, :foo, :hex.sql_function(:bin).as(:bin)))
52
+
53
+ transformations = [dup_row.new(:onto => :other)]
54
+
52
55
  sink_1 = Chicago::Flow::MysqlFileSink.
53
56
  new(TEST_DB, :destination, [:id, :foo, :bin])
54
57
  sink_2 = Chicago::Flow::ArraySink.new([:id, :foo, :bin])
55
58
 
56
- stage = Chicago::Flow::PipelineStage.
57
- new(:transformations => [dup_row.new(:onto => :other)])
58
-
59
- expect { stage.execute(source) }.to raise_error
60
-
61
- stage.register_sink(:default, sink_1)
62
- stage.register_sink(:other, sink_2)
59
+ stage = Chicago::ETL::Stage.new(:test,
60
+ :source => source,
61
+ :transformations => transformations,
62
+ :sinks => {
63
+ :default => sink_1,
64
+ :other => sink_2
65
+ })
63
66
 
64
- stage.execute(source)
67
+ stage.execute(stub(:etl_batch), true)
65
68
 
66
69
  expected = [{:id => 1, :foo => nil, :bin => "1F"},
67
70
  {:id => 2, :foo => "Hello", :bin => "1F"}]
68
71
 
69
72
  sink_2.data.should == expected
70
- TEST_DB[:destination].select(:id, :foo, :hex.sql_function(:bin).as(:bin)).all.should == expected
73
+ TEST_DB[:destination].select(:id, :foo, :hex.sql_function(:bin).as(:bin)).
74
+ all.should == expected
71
75
  end
72
76
  end
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: chicago-etl
3
3
  version: !ruby/object:Gem::Version
4
- hash: 29
4
+ hash: 19
5
5
  prerelease:
6
6
  segments:
7
7
  - 0
8
8
  - 1
9
- - 3
10
- version: 0.1.3
9
+ - 4
10
+ version: 0.1.4
11
11
  platform: ruby
12
12
  authors:
13
13
  - Roland Swingler
@@ -243,10 +243,11 @@ files:
243
243
  - lib/chicago/etl/core_extensions.rb
244
244
  - lib/chicago/etl/counter.rb
245
245
  - lib/chicago/etl/dataset_batch_stage.rb
246
+ - lib/chicago/etl/dataset_builder.rb
246
247
  - lib/chicago/etl/key_builder.rb
247
248
  - lib/chicago/etl/load_dataset_builder.rb
248
- - lib/chicago/etl/load_pipeline_stage_builder.rb
249
249
  - lib/chicago/etl/pipeline.rb
250
+ - lib/chicago/etl/schema_sinks_and_transformations_builder.rb
250
251
  - lib/chicago/etl/schema_table_sink_factory.rb
251
252
  - lib/chicago/etl/screens/column_screen.rb
252
253
  - lib/chicago/etl/screens/invalid_element.rb
@@ -273,7 +274,6 @@ files:
273
274
  - lib/chicago/flow/mysql_file_sink.rb
274
275
  - lib/chicago/flow/null_sink.rb
275
276
  - lib/chicago/flow/pipeline_endpoint.rb
276
- - lib/chicago/flow/pipeline_stage.rb
277
277
  - lib/chicago/flow/sink.rb
278
278
  - lib/chicago/flow/transformation.rb
279
279
  - lib/chicago/flow/transformation_chain.rb
@@ -281,7 +281,7 @@ files:
281
281
  - spec/etl/batch_spec.rb
282
282
  - spec/etl/core_extensions_spec.rb
283
283
  - spec/etl/counter_spec.rb
284
- - spec/etl/dataset_batch_stage_spec.rb
284
+ - spec/etl/define_dimension_stage_spec.rb
285
285
  - spec/etl/define_stage_spec.rb
286
286
  - spec/etl/etl_batch_id_dataset_filter.rb
287
287
  - spec/etl/key_builder_spec.rb
@@ -293,6 +293,7 @@ files:
293
293
  - spec/etl/screens/out_of_bounds_spec.rb
294
294
  - spec/etl/sequel/dependant_tables_spec.rb
295
295
  - spec/etl/sequel/filter_to_etl_batch_spec.rb
296
+ - spec/etl/stage_spec.rb
296
297
  - spec/etl/table_builder_spec.rb
297
298
  - spec/etl/task_spec.rb
298
299
  - spec/etl/transformations/deduplicate_rows_spec.rb
@@ -306,7 +307,6 @@ files:
306
307
  - spec/flow/mysql_file_serializer_spec.rb
307
308
  - spec/flow/mysql_file_sink_spec.rb
308
309
  - spec/flow/mysql_integration_spec.rb
309
- - spec/flow/pipeline_stage_spec.rb
310
310
  - spec/flow/transformation_chain_spec.rb
311
311
  - spec/flow/transformation_spec.rb
312
312
  - spec/spec_helper.rb
@@ -1,68 +0,0 @@
1
- module Chicago
2
- module Flow
3
- # Co-ordinates iterating over rows provided by a source, passing
4
- # them through a transformation chain before writing them to
5
- # sink(s).
6
- #
7
- # @api public
8
- class PipelineStage
9
- attr_reader :transformation_chain
10
-
11
- def initialize(options={})
12
- @sinks = options[:sinks] || {}
13
- @transformations = options[:transformations] || []
14
- @error_handler = options[:error_handler] || RaisingErrorHandler.new
15
- @transformation_chain = TransformationChain.new(*@transformations)
16
- end
17
-
18
- # Returns the named sink, if it exists
19
- def sink(name)
20
- @sinks[name.to_sym]
21
- end
22
-
23
- def sinks
24
- @sinks.values
25
- end
26
-
27
- def register_sink(name, sink)
28
- @sinks[name.to_sym] = sink
29
- self
30
- end
31
-
32
- def validate_pipeline
33
- unless unregistered_sinks.empty?
34
- @error_handler.unregistered_sinks(unregistered_sinks)
35
- end
36
- end
37
-
38
- def execute(source)
39
- validate_pipeline
40
- sinks.each(&:open)
41
- pipe_rows_to_sinks_from(source)
42
- sinks.each(&:close)
43
- end
44
-
45
- def required_sinks
46
- transformation_chain.output_streams | [:default]
47
- end
48
-
49
- def unregistered_sinks
50
- required_sinks - @sinks.keys
51
- end
52
-
53
- private
54
-
55
- def pipe_rows_to_sinks_from(source)
56
- source.each do |row|
57
- transformation_chain.process(row).each {|row| process_row(row) }
58
- end
59
- transformation_chain.flush.each {|row| process_row(row) }
60
- end
61
-
62
- def process_row(row)
63
- stream = row.delete(:_stream) || :default
64
- @sinks[stream] << row
65
- end
66
- end
67
- end
68
- end
@@ -1,55 +0,0 @@
1
- require 'spec_helper'
2
-
3
- describe Chicago::ETL::DatasetBatchStage do
4
- let(:pipeline_stage) { mock(:pipeline_stage).as_null_object }
5
- let(:dataset) { mock(:dataset).as_null_object }
6
- let(:stage) { described_class.new(:foo, dataset, pipeline_stage) }
7
- let(:etl_batch) { stub(:etl_batch) }
8
-
9
- it "has a name" do
10
- stage.name.should == :foo
11
- end
12
-
13
- it "should set the inserted at time on the default sink" do
14
- sink = Chicago::Flow::ArraySink.new(:foo)
15
- pipeline_stage.stub(:sink).with(:default).and_return(sink)
16
- stage.pipeline_stage.should == pipeline_stage
17
-
18
- sink.constant_values[:_inserted_at].should_not be_nil
19
- end
20
-
21
- it "filters the dataset to the batch" do
22
- dataset.should_recieve(:filter_to_etl_batch).with(etl_batch)
23
- stage.source(etl_batch)
24
- end
25
-
26
- it "does not filter the dataset if re-extracting" do
27
- dataset.should_not_recieve(:filter_to_etl_batch)
28
- stage.source(etl_batch, true)
29
- end
30
-
31
- it "can filter via a custom strategy" do
32
- dataset.should_not_recieve(:filter_to_etl_batch)
33
-
34
- filter_strategy = lambda {|ds, batch| ds }
35
- described_class.new(:foo, dataset, pipeline_stage, :filter_strategy => filter_strategy).
36
- source(etl_batch)
37
- end
38
-
39
- it "executes the pipeline stage using a DatasetSource" do
40
- pipeline_stage.should_receive(:execute).
41
- with(kind_of(Chicago::Flow::DatasetSource))
42
- stage.execute(etl_batch, true)
43
- end
44
-
45
- it "truncates any sinks if truncate_pre_load has been set" do
46
- stage = described_class.new(:foo, dataset, pipeline_stage,
47
- :truncate_pre_load => true)
48
-
49
- sink = Chicago::Flow::ArraySink.new(:output)
50
- sink << {:foo => "foo"}
51
- pipeline_stage.stub(:sinks).and_return([sink])
52
- stage.execute(etl_batch)
53
- sink.data.should == []
54
- end
55
- end
@@ -1,89 +0,0 @@
1
- require 'spec_helper'
2
-
3
- describe Chicago::Flow::PipelineStage do
4
- let(:transform) {
5
- Class.new(Chicago::Flow::Transformation) {
6
- def process_row(row)
7
- row[:a] += 1
8
- row
9
- end
10
- }
11
- }
12
-
13
- let(:add_error) {
14
- Class.new(Chicago::Flow::Transformation) {
15
- # add_output_stream :error
16
- def output_streams
17
- [:default, :error]
18
- end
19
-
20
- def process_row(row)
21
- [row, {Chicago::Flow::STREAM => :error, :message => "error"}]
22
- end
23
- }
24
- }
25
-
26
- let(:sink) { Chicago::Flow::ArraySink.new(:test) }
27
- let(:source) { Chicago::Flow::ArraySource.new([{:a => 1}]) }
28
-
29
- it "returns all sinks" do
30
- stage = described_class.new.register_sink(:default, sink)
31
- stage.sinks.should == [sink]
32
- end
33
-
34
- it "returns a sink by name" do
35
- stage = described_class.new.register_sink(:default, sink)
36
- stage.sink(:default).should == sink
37
- end
38
-
39
- it "reads from source to sink" do
40
- pipeline = described_class.new.register_sink(:default, sink)
41
- pipeline.execute(source)
42
- sink.data.should == [{:a => 1}]
43
- end
44
-
45
- it "passes rows through transforms" do
46
- pipeline = described_class.new(:transformations => [transform.new]).
47
- register_sink(:default, sink)
48
-
49
- pipeline.execute(source)
50
- sink.data.should == [{:a => 2}]
51
- end
52
-
53
- it "writes rows to the appropriate sink for their stream, and strips the stream tag" do
54
- error_sink = Chicago::Flow::ArraySink.new(:test)
55
-
56
- pipeline = described_class.new(:transformations => [add_error.new]).
57
- register_sink(:default, sink).
58
- register_sink(:error, error_sink)
59
-
60
- pipeline.execute(source)
61
- sink.data.should == [{:a => 1}]
62
- error_sink.data.should == [{:message => "error"}]
63
- end
64
-
65
- it "calls an error handler if sinks are not registered" do
66
- error_handler = mock(:error_handler)
67
- error_handler.should_receive(:unregistered_sinks).
68
- with([:default, :error])
69
-
70
- pipeline = described_class.new(:transformations => [add_error.new],
71
- :error_handler => error_handler)
72
-
73
- pipeline.validate_pipeline
74
- end
75
-
76
- it "by default raises an exception if the pipeline is not valid when executed" do
77
- pipeline = described_class.new(:transformations => [add_error.new])
78
- expect { pipeline.execute(source) }.to raise_error(Chicago::Flow::Error)
79
- end
80
-
81
- it "opens sinks before writing and closes them afterwards" do
82
- sink = mock(:sink)
83
- pipeline = described_class.new.register_sink(:default, sink)
84
- sink.should_receive(:open)
85
- sink.stub(:<<)
86
- sink.should_receive(:close)
87
- pipeline.execute(source)
88
- end
89
- end