chicago-etl 0.1.3 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/VERSION CHANGED
@@ -1 +1 @@
-0.1.3
+0.1.4
data/chicago-etl.gemspec CHANGED
@@ -5,7 +5,7 @@
 
 Gem::Specification.new do |s|
   s.name = "chicago-etl"
-  s.version = "0.1.3"
+  s.version = "0.1.4"
 
   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
   s.authors = ["Roland Swingler"]
@@ -32,10 +32,11 @@ Gem::Specification.new do |s|
     "lib/chicago/etl/core_extensions.rb",
     "lib/chicago/etl/counter.rb",
     "lib/chicago/etl/dataset_batch_stage.rb",
+    "lib/chicago/etl/dataset_builder.rb",
     "lib/chicago/etl/key_builder.rb",
     "lib/chicago/etl/load_dataset_builder.rb",
-    "lib/chicago/etl/load_pipeline_stage_builder.rb",
     "lib/chicago/etl/pipeline.rb",
+    "lib/chicago/etl/schema_sinks_and_transformations_builder.rb",
     "lib/chicago/etl/schema_table_sink_factory.rb",
     "lib/chicago/etl/screens/column_screen.rb",
     "lib/chicago/etl/screens/invalid_element.rb",
@@ -62,7 +63,6 @@ Gem::Specification.new do |s|
     "lib/chicago/flow/mysql_file_sink.rb",
     "lib/chicago/flow/null_sink.rb",
     "lib/chicago/flow/pipeline_endpoint.rb",
-    "lib/chicago/flow/pipeline_stage.rb",
     "lib/chicago/flow/sink.rb",
     "lib/chicago/flow/transformation.rb",
     "lib/chicago/flow/transformation_chain.rb",
@@ -70,7 +70,7 @@ Gem::Specification.new do |s|
     "spec/etl/batch_spec.rb",
     "spec/etl/core_extensions_spec.rb",
     "spec/etl/counter_spec.rb",
-    "spec/etl/dataset_batch_stage_spec.rb",
+    "spec/etl/define_dimension_stage_spec.rb",
     "spec/etl/define_stage_spec.rb",
     "spec/etl/etl_batch_id_dataset_filter.rb",
     "spec/etl/key_builder_spec.rb",
@@ -82,6 +82,7 @@ Gem::Specification.new do |s|
     "spec/etl/screens/out_of_bounds_spec.rb",
     "spec/etl/sequel/dependant_tables_spec.rb",
     "spec/etl/sequel/filter_to_etl_batch_spec.rb",
+    "spec/etl/stage_spec.rb",
     "spec/etl/table_builder_spec.rb",
     "spec/etl/task_spec.rb",
     "spec/etl/transformations/deduplicate_rows_spec.rb",
@@ -95,7 +96,6 @@ Gem::Specification.new do |s|
     "spec/flow/mysql_file_serializer_spec.rb",
     "spec/flow/mysql_file_sink_spec.rb",
     "spec/flow/mysql_integration_spec.rb",
-    "spec/flow/pipeline_stage_spec.rb",
     "spec/flow/transformation_chain_spec.rb",
     "spec/flow/transformation_spec.rb",
     "spec/spec_helper.rb"
data/lib/chicago/etl/dataset_batch_stage.rb CHANGED
@@ -4,48 +4,28 @@ module Chicago
     #
     # Allows deferring constructing a DatasetSource until extract
     # time, so that it can be filtered to an ETL batch appropriately.
-    class DatasetBatchStage
+    class DatasetBatchStage < Stage
       attr_reader :name
 
-      def initialize(name, dataset, pipeline_stage, options={})
-        @name = name
-        @dataset = dataset
-        @pipeline_stage = pipeline_stage
-        @filter_strategy = options[:filter_strategy] || lambda {|dataset, etl_batch|
-          dataset.filter_to_etl_batch(etl_batch)
-        }
+      def initialize(name, options={})
+        super
+        @filter_strategy = options[:filter_strategy] ||
+          lambda { |dataset, etl_batch| @source.filter_to_etl_batch(etl_batch)}
         @truncate_pre_load = !!options[:truncate_pre_load]
-      end
+      end
 
       # Executes this ETL stage.
       #
       # Configures the dataset and flows rows into the pipeline.
       def execute(etl_batch, reextract=false)
         if @truncate_pre_load
-          pipeline_stage.sinks.each {|sink| sink.truncate }
-        elsif reextract && pipeline_stage.sink(:error)
-          pipeline_stage.sink(:error).truncate
-        end
-
-        pipeline_stage.execute(source(etl_batch, reextract))
-      end
-
-      # Returns the pipeline for this stage.
-      def pipeline_stage
-        @pipeline_stage.sink(:default).
-          set_constant_values(:_inserted_at => Time.now)
-        @pipeline_stage
-      end
-
-      # Returns a DatasetSource for the provided dataset filtered to
-      # the ETL batch as appropriate.
-      def source(etl_batch, reextract=false)
-        if reextract
-          filtered_dataset = @dataset
-        else
-          filtered_dataset = @filter_strategy.call(@dataset, etl_batch)
+          sinks.each {|sink| sink.truncate }
+        elsif reextract && sink(:error)
+          sink(:error).truncate
         end
-        Chicago::Flow::DatasetSource.new(filtered_dataset)
+
+        sink(:default).set_constant_values(:_inserted_at => Time.now)
+        super
      end
    end
  end
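For orientation: DatasetBatchStage now takes the same options hash as its new Stage parent instead of positional dataset/pipeline-stage arguments. A minimal construction sketch, assuming dataset, transformations, and sinks have been built elsewhere (e.g. by the builders below); the stage name is illustrative:

    stage = Chicago::ETL::DatasetBatchStage.
      new(:load_user_dimension,
          :source => dataset,
          :transformations => transformations,  # may be empty
          :sinks => sinks,                      # hash of stream name => sink
          :truncate_pre_load => true)           # truncate sinks before loading

    stage.execute(etl_batch)        # normal run: source filtered to the batch
    stage.execute(etl_batch, true)  # re-extract: the batch filter is skipped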
data/lib/chicago/etl/dataset_builder.rb ADDED
@@ -0,0 +1,60 @@
+module Chicago
+  module ETL
+    # Provides convenience methods for defining source datasets.
+    class DatasetBuilder
+      attr_reader :db
+
+      # @api private
+      def initialize(db)
+        @db = db
+      end
+
+      # @api private
+      def build(&block)
+        instance_eval(&block)
+      end
+
+      protected
+
+      def key_field(field, name)
+        :if[{field => nil}, 1, field].as(name)
+      end
+
+      # Returns a column for use in a Sequel::Dataset#select method to
+      # return a dimension key.
+      #
+      # Takes care of using the key tables correctly, and dealing with
+      # missing dimension values.
+      def dimension_key(name)
+        key_field("keys_dimension_#{name}__dimension_id".to_sym,
+                  "#{name}_dimension_id".to_sym)
+      end
+
+      # Returns a column for use in a Sequel::Dataset#select method to
+      # return a date dimension key.
+      def date_dimension_column(dimension)
+        :if.sql_function({:id.qualify(dimension) => nil},
+                         1,
+                         :id.qualify(dimension)).
+          as("#{dimension}_dimension_id".to_sym)
+      end
+
+      # Rounds a monetary value to 2 decimal places.
+      #
+      # By default, natural rounding is used; you can specify either
+      # :up or :down as the direction.
+      #
+      # @deprecated
+      def round(stmt, direction = :none)
+        case direction
+        when :none
+          :round.sql_function(stmt, 2)
+        when :up
+          :ceil.sql_function(stmt * 100) / 100
+        when :down
+          :floor.sql_function(stmt * 100) / 100
+        end
+      end
+    end
+  end
+end
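DatasetBuilder#build instance-evals its block, so the protected helpers above can be called unqualified inside it (this is how StageBuilder#source drives it below). A sketch, assuming db is a Sequel::Database and that an orders table with these columns exists:

    dataset = Chicago::ETL::DatasetBuilder.new(db).build do
      # dimension_key(:product) selects keys_dimension_product__dimension_id,
      # substituting 1 where the key is NULL
      db[:orders].select(:id, :total, dimension_key(:product))
    end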
data/lib/chicago/etl/load_dataset_builder.rb CHANGED
@@ -61,13 +61,17 @@ module Chicago
       # @api private
       def build(name, &block)
         instance_eval &block
-        unless defined? @pipeline_stage
+        unless defined? @sinks_and_transformations
           pipeline do
           end
         end
-        DatasetBatchStage.new(name, @dataset, @pipeline_stage,
-                              :filter_strategy => @filter_strategy,
-                              :truncate_pre_load => @truncate_pre_load)
+        DatasetBatchStage.new(name,
+                              :source => @dataset,
+                              :transformations => @sinks_and_transformations[:transformations],
+                              :sinks => @sinks_and_transformations[:sinks],
+                              :filter_strategy => @filter_strategy,
+                              :truncate_pre_load => @truncate_pre_load)
+
       end
 
       protected
@@ -88,7 +92,7 @@ module Chicago
       # for details.
       # TODO: rename pipeline => transforms below this method
       def pipeline(&block)
-        @pipeline_stage = LoadPipelineStageBuilder.new(@db, @schema_table).
+        @sinks_and_transformations = SchemaSinksAndTransformationsBuilder.new(@db, @schema_table).
           build(&block)
       end
 
@@ -106,62 +110,5 @@ module Chicago
         @filter_strategy = block
       end
     end
-
-    # Provides convenience methods for defining source datasets.
-    class DatasetBuilder
-      attr_reader :db
-
-      # @api private
-      def initialize(db)
-        @db = db
-      end
-
-      # @api private
-      def build(&block)
-        instance_eval(&block)
-      end
-
-      protected
-
-      def key_field(field, name)
-        :if[{field => nil}, 1, field].as(name)
-      end
-
-      # Returns a column for use in a Sequel::Dataset#select method to
-      # return a dimension key.
-      #
-      # Takes care of using the key tables correctly, and dealing with
-      # missing dimension values.
-      def dimension_key(name)
-        key_field("keys_dimension_#{name}__dimension_id".to_sym,
-                  "#{name}_dimension_id".to_sym)
-      end
-
-      # Returns a column for use in a Sequel::Dataset#select method to
-      # return a date dimension key.
-      def date_dimension_column(dimension)
-        :if.sql_function({:id.qualify(dimension) => nil},
-                         1,
-                         :id.qualify(dimension)).
-          as("#{dimension}_dimension_id".to_sym)
-      end
-
-      # Rounds a monetary value to 2 decimal places.
-      #
-      # By default, natural rounding is used, you can specify either
-      # :up or :down as the direction.
-      #
-      # @deprecated
-      def round(stmt, direction = :none)
-        case direction
-        when :none
-          :round.sql_function(stmt, 2)
-        when :up
-          :ceil.sql_function(stmt * 100) / 100
-        when :down
-          :floor.sql_function(stmt * 100) / 100
-        end
-      end
-    end
   end
 end
data/lib/chicago/etl/load_pipeline_stage_builder.rb → data/lib/chicago/etl/schema_sinks_and_transformations_builder.rb RENAMED
@@ -5,7 +5,7 @@ module Chicago
     #
     # Clients will not normally instantiate this themselves but use it
     # in the context of defining an ETL stage.
-    class LoadPipelineStageBuilder
+    class SchemaSinksAndTransformationsBuilder
       # @api private
       KeyMapping = Struct.new(:table, :field)
 
@@ -41,9 +41,9 @@ module Chicago
         add_screens
         add_key_transforms
         add_final_transforms
-        pipeline_stage = create_pipeline_stage
-        register_additional_sinks(pipeline_stage)
-        pipeline_stage
+        sinks_and_transformations = create_sinks_and_transformations
+        register_additional_sinks(sinks_and_transformations)
+        sinks_and_transformations
       end
 
       protected
@@ -81,7 +81,7 @@ module Chicago
 
       private
 
-      def create_pipeline_stage
+      def create_sinks_and_transformations
         default = @sink_factory.sink(:ignore => @ignore_present_rows,
                                      :exclude => @load_separately)
         key_sink = if @schema_table.kind_of?(Chicago::Schema::Dimension)
@@ -90,24 +90,26 @@ module Chicago
           # Facts have no key table to write to.
           Flow::NullSink.new
         end
-
-        Flow::PipelineStage.
-          new(:transformations => concat_transformations,
-              :sinks => {
-                :default => default,
-                :dimension_key => key_sink,
-                :error => @sink_factory.error_sink
-              })
+
+        {
+          :transformations => concat_transformations,
+          :sinks => {
+            :default => default,
+            :dimension_key => key_sink,
+            :error => @sink_factory.error_sink
+          }
+        }
       end
 
       def concat_transformations
         TRANSFORMATION_ORDER.map {|k| @transformations[k] }.flatten
       end
 
-      def register_additional_sinks(pipeline_stage)
+      def register_additional_sinks(sinks_and_transformations)
+        sinks = sinks_and_transformations[:sinks]
         @key_mappings.each do |mapping|
           sink = @sink_factory.key_sink(:table => mapping.table)
-          pipeline_stage.register_sink(mapping.table, sink)
+          sinks[mapping.table] = sink
         end
       end
 
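The builder now returns a plain hash of transformations and sinks instead of a Flow::PipelineStage. A sketch of the shape that build hands back to LoadDatasetBuilder (key-mapping sinks are added by register_additional_sinks):

    # {
    #   :transformations => [...],  # flattened in TRANSFORMATION_ORDER
    #   :sinks => {
    #     :default       => sink,   # main schema-table sink
    #     :dimension_key => sink,   # key-table sink, or Flow::NullSink for facts
    #     :error         => sink    # from @sink_factory.error_sink
    #     # ...plus one entry per key_mapping table
    #   }
    # }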
data/lib/chicago/etl/stage.rb CHANGED
@@ -1,42 +1,30 @@
 module Chicago
   module ETL
+    # A Stage in the ETL pipeline.
+    #
+    # A Stage wires together a Source, 0 or more Transformations and 1
+    # or more Sinks.
     class Stage
+      # Returns the source for this stage.
+      attr_reader :source
+
+      # Returns the name of this stage.
       attr_reader :name
 
       def initialize(name, options={})
         @name = name
-        @source = options.fetch(:source)
-        raise ArgumentError, "Stage #{name} requires a source" unless @source
-
-        @sinks = options.fetch(:sinks)
-        raise ArgumentError, "Stage #{name} requires at least one sink" if @sinks.empty?
-
-        @transformations = options.fetch(:transformations)
-        @transformation_chain = Chicago::Flow::TransformationChain.
-          new(*@transformations)
-
+        @source = options[:source]
+        @sinks = options[:sinks]
+        @transformations = options[:transformations] || []
         @filter_strategy = options[:filter_strategy] ||
           lambda {|source, _| source }
-      end
 
-      def execute(etl_batch, reextract)
-        modified_source = reextract_and_filter_source(@source, etl_batch, reextract)
-        transform_and_load_from(modified_source)
-      end
-
-      def transform_and_load_from(source)
+        validate_arguments
       end
 
-      def reextract_and_filter_source(source, etl_batch, reextract=false)
-        if reextract
-          filtered_dataset = source
-        else
-          filtered_dataset = @filter_strategy.call(source, etl_batch)
-        end
-        Chicago::Flow::DatasetSource.new(filtered_dataset)
+      def execute(etl_batch, reextract=false)
+        transform_and_load filtered_source(etl_batch, reextract)
       end
-
-      attr_reader :transformation_chain
 
       # Returns the named sink, if it exists
       def sink(name)
@@ -46,20 +34,22 @@ module Chicago
       def sinks
         @sinks.values
       end
+
+      def filtered_source(etl_batch, reextract=false)
+        filtered_dataset = reextract ? source :
+          @filter_strategy.call(source, etl_batch)
 
-      def register_sink(name, sink)
-        @sinks[name.to_sym] = sink
-        self
+        Chicago::Flow::DatasetSource.new(filtered_dataset)
       end
-
-      def transform_and_load_from(source)
+
+      private
+
+      def transform_and_load(source)
         sinks.each(&:open)
         pipe_rows_to_sinks_from(source)
         sinks.each(&:close)
       end
-
-      private
-
+
       def pipe_rows_to_sinks_from(source)
         source.each do |row|
           transformation_chain.process(row).each {|row| process_row(row) }
@@ -67,10 +57,25 @@ module Chicago
         transformation_chain.flush.each {|row| process_row(row) }
       end
 
+      def transformation_chain
+        @transformation_chain ||= Chicago::Flow::TransformationChain.
+          new(*@transformations)
+      end
+
       def process_row(row)
         stream = row.delete(:_stream) || :default
         @sinks[stream] << row
       end
+
+      def validate_arguments
+        if @source.nil?
+          raise ArgumentError, "Stage #{@name} requires a source"
+        end
+
+        if @sinks.blank?
+          raise ArgumentError, "Stage #{@name} requires at least one sink"
+        end
+      end
     end
   end
 end
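A sketch of driving the new Stage directly, using the in-memory chicago-flow endpoints that the specs use; the row data is illustrative, and passing nil for the ETL batch is only reasonable here because reextract = true bypasses the filter strategy:

    require 'chicago/etl'

    sink  = Chicago::Flow::ArraySink.new(:example)
    stage = Chicago::ETL::Stage.new(:example,
                                    :source => [{:name => "widget"}],
                                    :sinks  => {:default => sink})

    # An array works as a source because Stage wraps it in a
    # Chicago::Flow::DatasetSource, which only needs #each.
    stage.execute(nil, true)
    sink.data  # => [{:name => "widget"}]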
data/lib/chicago/etl/stage_builder.rb CHANGED
@@ -1,8 +1,6 @@
 module Chicago
   module ETL
     class StageBuilder
-      attr_reader :sink_factory
-
       def initialize(db)
         @db = db
       end
@@ -20,15 +18,17 @@ module Chicago
                   :filter_strategy => @filter_strategy)
       end
 
+      protected
+
       def source(&block)
         @dataset = DatasetBuilder.new(@db).build(&block)
       end
 
-      def transformations(klass=TransformationBuilder, &block)
-        @transformations = klass.new.build(&block)
+      def transformations(&block)
+        @transformations = TransformationBuilder.new.build(&block)
       end
 
-      def sinks(options={}, &block)
+      def sinks(&block)
         @sinks = SinkBuilder.new.build(&block)
       end
 
data/lib/chicago/etl.rb CHANGED
@@ -10,7 +10,6 @@ require 'chicago/flow/errors'
 require 'chicago/flow/transformation'
 require 'chicago/flow/filter'
 require 'chicago/flow/transformation_chain'
-require 'chicago/flow/pipeline_stage'
 require 'chicago/flow/pipeline_endpoint'
 require 'chicago/flow/array_source'
 require 'chicago/flow/dataset_source'
@@ -25,12 +24,12 @@ require 'chicago/etl/key_builder'
 require 'chicago/etl/schema_table_sink_factory'
 require 'chicago/etl/transformations'
 require 'chicago/etl/load_dataset_builder'
-require 'chicago/etl/dataset_batch_stage'
-require 'chicago/etl/load_pipeline_stage_builder'
-require 'chicago/etl/pipeline'
-
+require 'chicago/etl/dataset_builder'
 require 'chicago/etl/stage'
 require 'chicago/etl/stage_builder'
+require 'chicago/etl/dataset_batch_stage'
+require 'chicago/etl/schema_sinks_and_transformations_builder'
+require 'chicago/etl/pipeline'
 
 # Sequel Extensions
 require 'chicago/etl/sequel/filter_to_etl_batch'
data/spec/etl/define_dimension_stage_spec.rb ADDED
@@ -0,0 +1,35 @@
+require 'spec_helper'
+
+describe "creating and running a dimension stage" do
+  let(:rows) { [{:some_field => "value"}] }
+  let(:db) { double(:db).as_null_object }
+  let(:schema) {
+    schema = Chicago::StarSchema.new
+
+    schema.define_dimension(:test) do
+      columns do
+        string :foo
+      end
+    end
+
+    schema
+  }
+
+  let(:pipeline) { Chicago::ETL::Pipeline.new(db, schema)}
+
+  it "glues the source, transformations, and sink correctly" do
+    pipeline.define_dimension_load(:test) do
+      dataset do
+        db.test_dataset_method
+      end
+    end
+
+    pipeline.stages.each do |stage|
+      stage.execute(double, true)
+    end
+  end
+
+  it "should set the inserted at time on the dimension"
+
+  it "truncates the dimension if specified"
+end
@@ -39,26 +39,6 @@ describe "defining and executing a stage" do
39
39
  stage.sink(:another_stream).data.length.should == 0
40
40
  end
41
41
 
42
- it "requires sinks" do
43
- expect {
44
- pipeline.define_stage(:test_stage) do
45
- source do
46
- db.test_dataset_method
47
- end
48
- end
49
- }.to raise_error(ArgumentError)
50
- end
51
-
52
- it "requires sources" do
53
- expect {
54
- pipeline.define_stage(:test_stage) do
55
- sinks do
56
- add Chicago::Flow::ArraySink.new(:test)
57
- end
58
- end
59
- }.to raise_error(ArgumentError)
60
- end
61
-
62
42
  it "glues the source, transformations, and sink correctly" do
63
43
  pipeline.define_stage(:test_stage) do
64
44
  source do
@@ -90,8 +70,8 @@ describe "defining and executing a stage" do
90
70
  it "allows the source to be filtered via a filter strategy" do
91
71
  etl_batch_double = double
92
72
  fake_source = []
73
+ fake_source.should_receive(:another_dataset_method).and_return([])
93
74
 
94
- fake_source.should_receive(:another_dataset_method).and_return([])
95
75
  pipeline.define_stage(:test_stage) do
96
76
  source do
97
77
  fake_source
data/spec/etl/load_pipeline_stage_builder_spec.rb CHANGED
@@ -1,6 +1,6 @@
 require 'spec_helper'
 
-describe Chicago::ETL::LoadPipelineStageBuilder do
+describe Chicago::ETL::SchemaSinksAndTransformationsBuilder do
   let(:dimension) { stub(:dimension).as_null_object }
   let(:db) { stub(:db).as_null_object }
   let(:sink_factory) { stub(:sink_factory).as_null_object }
@@ -34,6 +34,6 @@ describe Chicago::ETL::LoadPipelineStageBuilder do
       key_mapping :bar, :original_id
     end
 
-    stage.sink(:bar).should_not be_nil
+    stage[:sinks][:bar].should_not be_nil
   end
 end
data/spec/etl/stage_spec.rb ADDED
@@ -0,0 +1,40 @@
+require 'spec_helper'
+
+describe Chicago::ETL::Stage do
+  it "requires a source" do
+    expect {
+      described_class.new(:test,
+                          :source => nil,
+                          :sinks => {:default => stub(:sink)})
+    }.to raise_error(ArgumentError)
+  end
+
+  it "requires sinks" do
+    expect {
+      described_class.new(:test,
+                          :source => stub(:source),
+                          :sinks => nil)
+    }.to raise_error(ArgumentError)
+  end
+
+  it "does not filter the dataset if re-extracting" do
+    stage = described_class.new(:test,
+                                :source => stub(:source),
+                                :sinks => {:default => stub(:sink)},
+                                :filter_strategy => lambda { fail })
+
+    stage.filtered_source(stub(:etl_batch), true)
+  end
+
+  it "opens sinks before writing and closes them afterwards" do
+    sink = mock(:sink)
+    sink.should_receive(:open)
+    sink.should_receive(:close)
+
+    stage = described_class.new(:test,
+                                :source => [],
+                                :sinks => {:default => sink})
+
+    stage.execute(stub(:etl_batch), true)
+  end
+end
@@ -44,29 +44,33 @@ describe "Mysql -> Mysql through transformation chain" do
44
44
 
45
45
  it "copies data from source to destination" do
46
46
  TEST_DB[:source].multi_insert([{:foo => nil, :bin => :unhex.sql_function("1F")},
47
- {:foo => "Hello", :bin => :unhex.sql_function("1F")}])
48
-
47
+ {:foo => "Hello", :bin => :unhex.sql_function("1F")}])
48
+
49
49
  source = Chicago::Flow::DatasetSource.
50
50
  new(TEST_DB[:source].
51
51
  select(:id, :foo, :hex.sql_function(:bin).as(:bin)))
52
+
53
+ transformations = [dup_row.new(:onto => :other)]
54
+
52
55
  sink_1 = Chicago::Flow::MysqlFileSink.
53
56
  new(TEST_DB, :destination, [:id, :foo, :bin])
54
57
  sink_2 = Chicago::Flow::ArraySink.new([:id, :foo, :bin])
55
58
 
56
- stage = Chicago::Flow::PipelineStage.
57
- new(:transformations => [dup_row.new(:onto => :other)])
58
-
59
- expect { stage.execute(source) }.to raise_error
60
-
61
- stage.register_sink(:default, sink_1)
62
- stage.register_sink(:other, sink_2)
59
+ stage = Chicago::ETL::Stage.new(:test,
60
+ :source => source,
61
+ :transformations => transformations,
62
+ :sinks => {
63
+ :default => sink_1,
64
+ :other => sink_2
65
+ })
63
66
 
64
- stage.execute(source)
67
+ stage.execute(stub(:etl_batch), true)
65
68
 
66
69
  expected = [{:id => 1, :foo => nil, :bin => "1F"},
67
70
  {:id => 2, :foo => "Hello", :bin => "1F"}]
68
71
 
69
72
  sink_2.data.should == expected
70
- TEST_DB[:destination].select(:id, :foo, :hex.sql_function(:bin).as(:bin)).all.should == expected
73
+ TEST_DB[:destination].select(:id, :foo, :hex.sql_function(:bin).as(:bin)).
74
+ all.should == expected
71
75
  end
72
76
  end
metadata CHANGED
@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: chicago-etl
 version: !ruby/object:Gem::Version
-  hash: 29
+  hash: 19
   prerelease:
   segments:
   - 0
   - 1
-  - 3
-  version: 0.1.3
+  - 4
+  version: 0.1.4
 platform: ruby
 authors:
 - Roland Swingler
@@ -243,10 +243,11 @@ files:
 - lib/chicago/etl/core_extensions.rb
 - lib/chicago/etl/counter.rb
 - lib/chicago/etl/dataset_batch_stage.rb
+- lib/chicago/etl/dataset_builder.rb
 - lib/chicago/etl/key_builder.rb
 - lib/chicago/etl/load_dataset_builder.rb
-- lib/chicago/etl/load_pipeline_stage_builder.rb
 - lib/chicago/etl/pipeline.rb
+- lib/chicago/etl/schema_sinks_and_transformations_builder.rb
 - lib/chicago/etl/schema_table_sink_factory.rb
 - lib/chicago/etl/screens/column_screen.rb
 - lib/chicago/etl/screens/invalid_element.rb
@@ -273,7 +274,6 @@ files:
 - lib/chicago/flow/mysql_file_sink.rb
 - lib/chicago/flow/null_sink.rb
 - lib/chicago/flow/pipeline_endpoint.rb
-- lib/chicago/flow/pipeline_stage.rb
 - lib/chicago/flow/sink.rb
 - lib/chicago/flow/transformation.rb
 - lib/chicago/flow/transformation_chain.rb
@@ -281,7 +281,7 @@ files:
 - spec/etl/batch_spec.rb
 - spec/etl/core_extensions_spec.rb
 - spec/etl/counter_spec.rb
-- spec/etl/dataset_batch_stage_spec.rb
+- spec/etl/define_dimension_stage_spec.rb
 - spec/etl/define_stage_spec.rb
 - spec/etl/etl_batch_id_dataset_filter.rb
 - spec/etl/key_builder_spec.rb
@@ -293,6 +293,7 @@ files:
 - spec/etl/screens/out_of_bounds_spec.rb
 - spec/etl/sequel/dependant_tables_spec.rb
 - spec/etl/sequel/filter_to_etl_batch_spec.rb
+- spec/etl/stage_spec.rb
 - spec/etl/table_builder_spec.rb
 - spec/etl/task_spec.rb
 - spec/etl/transformations/deduplicate_rows_spec.rb
@@ -306,7 +307,6 @@ files:
 - spec/flow/mysql_file_serializer_spec.rb
 - spec/flow/mysql_file_sink_spec.rb
 - spec/flow/mysql_integration_spec.rb
-- spec/flow/pipeline_stage_spec.rb
 - spec/flow/transformation_chain_spec.rb
 - spec/flow/transformation_spec.rb
 - spec/spec_helper.rb
data/lib/chicago/flow/pipeline_stage.rb DELETED
@@ -1,68 +0,0 @@
-module Chicago
-  module Flow
-    # Co-ordinates iterating over rows provided by a source, passing
-    # them through a transformation chain before writing them to
-    # sink(s).
-    #
-    # @api public
-    class PipelineStage
-      attr_reader :transformation_chain
-
-      def initialize(options={})
-        @sinks = options[:sinks] || {}
-        @transformations = options[:transformations] || []
-        @error_handler = options[:error_handler] || RaisingErrorHandler.new
-        @transformation_chain = TransformationChain.new(*@transformations)
-      end
-
-      # Returns the named sink, if it exists
-      def sink(name)
-        @sinks[name.to_sym]
-      end
-
-      def sinks
-        @sinks.values
-      end
-
-      def register_sink(name, sink)
-        @sinks[name.to_sym] = sink
-        self
-      end
-
-      def validate_pipeline
-        unless unregistered_sinks.empty?
-          @error_handler.unregistered_sinks(unregistered_sinks)
-        end
-      end
-
-      def execute(source)
-        validate_pipeline
-        sinks.each(&:open)
-        pipe_rows_to_sinks_from(source)
-        sinks.each(&:close)
-      end
-
-      def required_sinks
-        transformation_chain.output_streams | [:default]
-      end
-
-      def unregistered_sinks
-        required_sinks - @sinks.keys
-      end
-
-      private
-
-      def pipe_rows_to_sinks_from(source)
-        source.each do |row|
-          transformation_chain.process(row).each {|row| process_row(row) }
-        end
-        transformation_chain.flush.each {|row| process_row(row) }
-      end
-
-      def process_row(row)
-        stream = row.delete(:_stream) || :default
-        @sinks[stream] << row
-      end
-    end
-  end
-end
data/spec/etl/dataset_batch_stage_spec.rb DELETED
@@ -1,55 +0,0 @@
-require 'spec_helper'
-
-describe Chicago::ETL::DatasetBatchStage do
-  let(:pipeline_stage) { mock(:pipeline_stage).as_null_object }
-  let(:dataset) { mock(:dataset).as_null_object }
-  let(:stage) { described_class.new(:foo, dataset, pipeline_stage) }
-  let(:etl_batch) { stub(:etl_batch) }
-
-  it "has a name" do
-    stage.name.should == :foo
-  end
-
-  it "should set the inserted at time on the default sink" do
-    sink = Chicago::Flow::ArraySink.new(:foo)
-    pipeline_stage.stub(:sink).with(:default).and_return(sink)
-    stage.pipeline_stage.should == pipeline_stage
-
-    sink.constant_values[:_inserted_at].should_not be_nil
-  end
-
-  it "filters the dataset to the batch" do
-    dataset.should_recieve(:filter_to_etl_batch).with(etl_batch)
-    stage.source(etl_batch)
-  end
-
-  it "does not filter the dataset if re-extracting" do
-    dataset.should_not_recieve(:filter_to_etl_batch)
-    stage.source(etl_batch, true)
-  end
-
-  it "can filter via a custom strategy" do
-    dataset.should_not_recieve(:filter_to_etl_batch)
-
-    filter_strategy = lambda {|ds, batch| ds }
-    described_class.new(:foo, dataset, pipeline_stage, :filter_strategy => filter_strategy).
-      source(etl_batch)
-  end
-
-  it "executes the pipeline stage using a DatasetSource" do
-    pipeline_stage.should_receive(:execute).
-      with(kind_of(Chicago::Flow::DatasetSource))
-    stage.execute(etl_batch, true)
-  end
-
-  it "truncates any sinks if truncate_pre_load has been set" do
-    stage = described_class.new(:foo, dataset, pipeline_stage,
-                                :truncate_pre_load => true)
-
-    sink = Chicago::Flow::ArraySink.new(:output)
-    sink << {:foo => "foo"}
-    pipeline_stage.stub(:sinks).and_return([sink])
-    stage.execute(etl_batch)
-    sink.data.should == []
-  end
-end
data/spec/flow/pipeline_stage_spec.rb DELETED
@@ -1,89 +0,0 @@
-require 'spec_helper'
-
-describe Chicago::Flow::PipelineStage do
-  let(:transform) {
-    Class.new(Chicago::Flow::Transformation) {
-      def process_row(row)
-        row[:a] += 1
-        row
-      end
-    }
-  }
-
-  let(:add_error) {
-    Class.new(Chicago::Flow::Transformation) {
-      # add_output_stream :error
-      def output_streams
-        [:default, :error]
-      end
-
-      def process_row(row)
-        [row, {Chicago::Flow::STREAM => :error, :message => "error"}]
-      end
-    }
-  }
-
-  let(:sink) { Chicago::Flow::ArraySink.new(:test) }
-  let(:source) { Chicago::Flow::ArraySource.new([{:a => 1}]) }
-
-  it "returns all sinks" do
-    stage = described_class.new.register_sink(:default, sink)
-    stage.sinks.should == [sink]
-  end
-
-  it "returns a sink by name" do
-    stage = described_class.new.register_sink(:default, sink)
-    stage.sink(:default).should == sink
-  end
-
-  it "reads from source to sink" do
-    pipeline = described_class.new.register_sink(:default, sink)
-    pipeline.execute(source)
-    sink.data.should == [{:a => 1}]
-  end
-
-  it "passes rows through transforms" do
-    pipeline = described_class.new(:transformations => [transform.new]).
-      register_sink(:default, sink)
-
-    pipeline.execute(source)
-    sink.data.should == [{:a => 2}]
-  end
-
-  it "writes rows to the appropriate sink for their stream, and strips the stream tag" do
-    error_sink = Chicago::Flow::ArraySink.new(:test)
-
-    pipeline = described_class.new(:transformations => [add_error.new]).
-      register_sink(:default, sink).
-      register_sink(:error, error_sink)
-
-    pipeline.execute(source)
-    sink.data.should == [{:a => 1}]
-    error_sink.data.should == [{:message => "error"}]
-  end
-
-  it "calls an error handler if sinks are not registered" do
-    error_handler = mock(:error_handler)
-    error_handler.should_receive(:unregistered_sinks).
-      with([:default, :error])
-
-    pipeline = described_class.new(:transformations => [add_error.new],
-                                   :error_handler => error_handler)
-
-    pipeline.validate_pipeline
-  end
-
-  it "by default raises an exception if the pipeline is not valid when executed" do
-    pipeline = described_class.new(:transformations => [add_error.new])
-    expect { pipeline.execute(source) }.to raise_error(Chicago::Flow::Error)
-  end
-
-  it "opens sinks before writing and closes them afterwards" do
-    sink = mock(:sink)
-    pipeline = described_class.new.register_sink(:default, sink)
-    sink.should_receive(:open)
-    sink.stub(:<<)
-    sink.should_receive(:close)
-    pipeline.execute(source)
-  end
-end