chicago-etl 0.1.3 → 0.1.4
Sign up to get free protection for your applications and to get access to all the features.
- data/VERSION +1 -1
- data/chicago-etl.gemspec +5 -5
- data/lib/chicago/etl/dataset_batch_stage.rb +12 -32
- data/lib/chicago/etl/dataset_builder.rb +60 -0
- data/lib/chicago/etl/pipeline.rb +9 -62
- data/lib/chicago/etl/{load_pipeline_stage_builder.rb → schema_sinks_and_transformations_builder.rb} +17 -15
- data/lib/chicago/etl/stage.rb +39 -34
- data/lib/chicago/etl/stage_builder.rb +5 -5
- data/lib/chicago/etl.rb +4 -5
- data/spec/etl/define_dimension_stage_spec.rb +35 -0
- data/spec/etl/define_stage_spec.rb +1 -21
- data/spec/etl/pipeline_stage_builder_spec.rb +2 -2
- data/spec/etl/stage_spec.rb +40 -0
- data/spec/flow/mysql_integration_spec.rb +15 -11
- metadata +7 -7
- data/lib/chicago/flow/pipeline_stage.rb +0 -68
- data/spec/etl/dataset_batch_stage_spec.rb +0 -55
- data/spec/flow/pipeline_stage_spec.rb +0 -89
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.1.3
|
1
|
+
0.1.4
|
data/chicago-etl.gemspec
CHANGED
@@ -5,7 +5,7 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = "chicago-etl"
|
8
|
-
s.version = "0.1.3"
|
8
|
+
s.version = "0.1.4"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Roland Swingler"]
|
@@ -32,10 +32,11 @@ Gem::Specification.new do |s|
|
|
32
32
|
"lib/chicago/etl/core_extensions.rb",
|
33
33
|
"lib/chicago/etl/counter.rb",
|
34
34
|
"lib/chicago/etl/dataset_batch_stage.rb",
|
35
|
+
"lib/chicago/etl/dataset_builder.rb",
|
35
36
|
"lib/chicago/etl/key_builder.rb",
|
36
37
|
"lib/chicago/etl/load_dataset_builder.rb",
|
37
|
-
"lib/chicago/etl/load_pipeline_stage_builder.rb",
|
38
38
|
"lib/chicago/etl/pipeline.rb",
|
39
|
+
"lib/chicago/etl/schema_sinks_and_transformations_builder.rb",
|
39
40
|
"lib/chicago/etl/schema_table_sink_factory.rb",
|
40
41
|
"lib/chicago/etl/screens/column_screen.rb",
|
41
42
|
"lib/chicago/etl/screens/invalid_element.rb",
|
@@ -62,7 +63,6 @@ Gem::Specification.new do |s|
|
|
62
63
|
"lib/chicago/flow/mysql_file_sink.rb",
|
63
64
|
"lib/chicago/flow/null_sink.rb",
|
64
65
|
"lib/chicago/flow/pipeline_endpoint.rb",
|
65
|
-
"lib/chicago/flow/pipeline_stage.rb",
|
66
66
|
"lib/chicago/flow/sink.rb",
|
67
67
|
"lib/chicago/flow/transformation.rb",
|
68
68
|
"lib/chicago/flow/transformation_chain.rb",
|
@@ -70,7 +70,7 @@ Gem::Specification.new do |s|
|
|
70
70
|
"spec/etl/batch_spec.rb",
|
71
71
|
"spec/etl/core_extensions_spec.rb",
|
72
72
|
"spec/etl/counter_spec.rb",
|
73
|
-
"spec/etl/dataset_batch_stage_spec.rb",
|
73
|
+
"spec/etl/define_dimension_stage_spec.rb",
|
74
74
|
"spec/etl/define_stage_spec.rb",
|
75
75
|
"spec/etl/etl_batch_id_dataset_filter.rb",
|
76
76
|
"spec/etl/key_builder_spec.rb",
|
@@ -82,6 +82,7 @@ Gem::Specification.new do |s|
|
|
82
82
|
"spec/etl/screens/out_of_bounds_spec.rb",
|
83
83
|
"spec/etl/sequel/dependant_tables_spec.rb",
|
84
84
|
"spec/etl/sequel/filter_to_etl_batch_spec.rb",
|
85
|
+
"spec/etl/stage_spec.rb",
|
85
86
|
"spec/etl/table_builder_spec.rb",
|
86
87
|
"spec/etl/task_spec.rb",
|
87
88
|
"spec/etl/transformations/deduplicate_rows_spec.rb",
|
@@ -95,7 +96,6 @@ Gem::Specification.new do |s|
|
|
95
96
|
"spec/flow/mysql_file_serializer_spec.rb",
|
96
97
|
"spec/flow/mysql_file_sink_spec.rb",
|
97
98
|
"spec/flow/mysql_integration_spec.rb",
|
98
|
-
"spec/flow/pipeline_stage_spec.rb",
|
99
99
|
"spec/flow/transformation_chain_spec.rb",
|
100
100
|
"spec/flow/transformation_spec.rb",
|
101
101
|
"spec/spec_helper.rb"
|
@@ -4,48 +4,28 @@ module Chicago
|
|
4
4
|
#
|
5
5
|
# Allows deferring constructing a DatasetSource until extract
|
6
6
|
# time, so that it can be filtered to an ETL batch appropriately.
|
7
|
-
class DatasetBatchStage
|
7
|
+
class DatasetBatchStage < Stage
|
8
8
|
attr_reader :name
|
9
9
|
|
10
|
-
def initialize(name,
|
11
|
-
|
12
|
-
@
|
13
|
-
|
14
|
-
@filter_strategy = options[:filter_strategy] || lambda {|dataset, etl_batch|
|
15
|
-
dataset.filter_to_etl_batch(etl_batch)
|
16
|
-
}
|
10
|
+
def initialize(name, options={})
|
11
|
+
super
|
12
|
+
@filter_strategy = options[:filter_strategy] ||
|
13
|
+
lambda { |dataset, etl_batch| @source.filter_to_etl_batch(etl_batch)}
|
17
14
|
@truncate_pre_load = !!options[:truncate_pre_load]
|
18
|
-
|
15
|
+
end
|
19
16
|
|
20
17
|
# Executes this ETL stage.
|
21
18
|
#
|
22
19
|
# Configures the dataset and flows rows into the pipeline.
|
23
20
|
def execute(etl_batch, reextract=false)
|
24
21
|
if @truncate_pre_load
|
25
|
-
|
26
|
-
elsif reextract &&
|
27
|
-
|
28
|
-
end
|
29
|
-
|
30
|
-
pipeline_stage.execute(source(etl_batch, reextract))
|
31
|
-
end
|
32
|
-
|
33
|
-
# Returns the pipeline for this stage.
|
34
|
-
def pipeline_stage
|
35
|
-
@pipeline_stage.sink(:default).
|
36
|
-
set_constant_values(:_inserted_at => Time.now)
|
37
|
-
@pipeline_stage
|
38
|
-
end
|
39
|
-
|
40
|
-
# Returns a DatasetSource for the provided dataset filtered to
|
41
|
-
# the ETL batch as appropriate.
|
42
|
-
def source(etl_batch, reextract=false)
|
43
|
-
if reextract
|
44
|
-
filtered_dataset = @dataset
|
45
|
-
else
|
46
|
-
filtered_dataset = @filter_strategy.call(@dataset, etl_batch)
|
22
|
+
sinks.each {|sink| sink.truncate }
|
23
|
+
elsif reextract && sink(:error)
|
24
|
+
sink(:error).truncate
|
47
25
|
end
|
48
|
-
|
26
|
+
|
27
|
+
sink(:default).set_constant_values(:_inserted_at => Time.now)
|
28
|
+
super
|
49
29
|
end
|
50
30
|
end
|
51
31
|
end
|
@@ -0,0 +1,60 @@
|
|
1
|
+
module Chicago
|
2
|
+
module ETL
|
3
|
+
# Provides convenience methods for defining source datasets.
|
4
|
+
class DatasetBuilder
|
5
|
+
attr_reader :db
|
6
|
+
|
7
|
+
# @api private
|
8
|
+
def initialize(db)
|
9
|
+
@db = db
|
10
|
+
end
|
11
|
+
|
12
|
+
# @api private
|
13
|
+
def build(&block)
|
14
|
+
instance_eval(&block)
|
15
|
+
end
|
16
|
+
|
17
|
+
protected
|
18
|
+
|
19
|
+
def key_field(field, name)
|
20
|
+
:if[{field => nil}, 1, field].as(name)
|
21
|
+
end
|
22
|
+
|
23
|
+
# Returns a column for use in a Sequel::Dataset#select method to
|
24
|
+
# return a dimension key.
|
25
|
+
#
|
26
|
+
# Takes care of using the key tables correctly, and dealing with
|
27
|
+
# missing dimension values.
|
28
|
+
def dimension_key(name)
|
29
|
+
key_field("keys_dimension_#{name}__dimension_id".to_sym,
|
30
|
+
"#{name}_dimension_id".to_sym)
|
31
|
+
end
|
32
|
+
|
33
|
+
# Returns a column for use in a Sequel::Dataset#select method to
|
34
|
+
# return a date dimension key.
|
35
|
+
def date_dimension_column(dimension)
|
36
|
+
:if.sql_function({:id.qualify(dimension) => nil},
|
37
|
+
1,
|
38
|
+
:id.qualify(dimension)).
|
39
|
+
as("#{dimension}_dimension_id".to_sym)
|
40
|
+
end
|
41
|
+
|
42
|
+
# Rounds a monetary value to 2 decimal places.
|
43
|
+
#
|
44
|
+
# By default, natural rounding is used, you can specify either
|
45
|
+
# :up or :down as the direction.
|
46
|
+
#
|
47
|
+
# @deprecated
|
48
|
+
def round(stmt, direction = :none)
|
49
|
+
case direction
|
50
|
+
when :none
|
51
|
+
:round.sql_function(stmt, 2)
|
52
|
+
when :up
|
53
|
+
:ceil.sql_function(stmt * 100) / 100
|
54
|
+
when :down
|
55
|
+
:floor.sql_function(stmt * 100) / 100
|
56
|
+
end
|
57
|
+
end
|
58
|
+
end
|
59
|
+
end
|
60
|
+
end
|
data/lib/chicago/etl/pipeline.rb
CHANGED
@@ -61,13 +61,17 @@ module Chicago
|
|
61
61
|
# @api private
|
62
62
|
def build(name, &block)
|
63
63
|
instance_eval &block
|
64
|
-
unless defined? @
|
64
|
+
unless defined? @sinks_and_transformations
|
65
65
|
pipeline do
|
66
66
|
end
|
67
67
|
end
|
68
|
-
|
69
|
-
|
70
|
-
|
68
|
+
DatasetBatchStage.new(name,
|
69
|
+
:source => @dataset,
|
70
|
+
:transformations => @sinks_and_transformations[:transformations],
|
71
|
+
:sinks => @sinks_and_transformations[:sinks],
|
72
|
+
:filter_strategy => @filter_strategy,
|
73
|
+
:truncate_pre_load => @truncate_pre_load)
|
74
|
+
|
71
75
|
end
|
72
76
|
|
73
77
|
protected
|
@@ -88,7 +92,7 @@ module Chicago
|
|
88
92
|
# for details.
|
89
93
|
# TODO: rename pipeline => transforms below this method
|
90
94
|
def pipeline(&block)
|
91
|
-
@
|
95
|
+
@sinks_and_transformations = SchemaSinksAndTransformationsBuilder.new(@db, @schema_table).
|
92
96
|
build(&block)
|
93
97
|
end
|
94
98
|
|
@@ -106,62 +110,5 @@ module Chicago
|
|
106
110
|
@filter_strategy = block
|
107
111
|
end
|
108
112
|
end
|
109
|
-
|
110
|
-
# Provides convenience methods for defining source datasets.
|
111
|
-
class DatasetBuilder
|
112
|
-
attr_reader :db
|
113
|
-
|
114
|
-
# @api private
|
115
|
-
def initialize(db)
|
116
|
-
@db = db
|
117
|
-
end
|
118
|
-
|
119
|
-
# @api private
|
120
|
-
def build(&block)
|
121
|
-
instance_eval(&block)
|
122
|
-
end
|
123
|
-
|
124
|
-
protected
|
125
|
-
|
126
|
-
def key_field(field, name)
|
127
|
-
:if[{field => nil}, 1, field].as(name)
|
128
|
-
end
|
129
|
-
|
130
|
-
# Returns a column for use in a Sequel::Dataset#select method to
|
131
|
-
# return a dimension key.
|
132
|
-
#
|
133
|
-
# Takes care of using the key tables correctly, and dealing with
|
134
|
-
# missing dimension values.
|
135
|
-
def dimension_key(name)
|
136
|
-
key_field("keys_dimension_#{name}__dimension_id".to_sym,
|
137
|
-
"#{name}_dimension_id".to_sym)
|
138
|
-
end
|
139
|
-
|
140
|
-
# Returns a column for use in a Sequel::Dataset#select method to
|
141
|
-
# return a date dimension key.
|
142
|
-
def date_dimension_column(dimension)
|
143
|
-
:if.sql_function({:id.qualify(dimension) => nil},
|
144
|
-
1,
|
145
|
-
:id.qualify(dimension)).
|
146
|
-
as("#{dimension}_dimension_id".to_sym)
|
147
|
-
end
|
148
|
-
|
149
|
-
# Rounds a monetary value to 2 decimal places.
|
150
|
-
#
|
151
|
-
# By default, natural rounding is used, you can specify either
|
152
|
-
# :up or :down as the direction.
|
153
|
-
#
|
154
|
-
# @deprecated
|
155
|
-
def round(stmt, direction = :none)
|
156
|
-
case direction
|
157
|
-
when :none
|
158
|
-
:round.sql_function(stmt, 2)
|
159
|
-
when :up
|
160
|
-
:ceil.sql_function(stmt * 100) / 100
|
161
|
-
when :down
|
162
|
-
:floor.sql_function(stmt * 100) / 100
|
163
|
-
end
|
164
|
-
end
|
165
|
-
end
|
166
113
|
end
|
167
114
|
end
|
data/lib/chicago/etl/{load_pipeline_stage_builder.rb → schema_sinks_and_transformations_builder.rb}
RENAMED
@@ -5,7 +5,7 @@ module Chicago
|
|
5
5
|
#
|
6
6
|
# Clients will not normally instantiate this themselves but use it
|
7
7
|
# in the context of defining an ETL stage.
|
8
|
-
class LoadPipelineStageBuilder
|
8
|
+
class SchemaSinksAndTransformationsBuilder
|
9
9
|
# @api private
|
10
10
|
KeyMapping = Struct.new(:table, :field)
|
11
11
|
|
@@ -41,9 +41,9 @@ module Chicago
|
|
41
41
|
add_screens
|
42
42
|
add_key_transforms
|
43
43
|
add_final_transforms
|
44
|
-
|
45
|
-
register_additional_sinks(
|
46
|
-
|
44
|
+
sinks_and_transformations = create_sinks_and_transformations
|
45
|
+
register_additional_sinks(sinks_and_transformations)
|
46
|
+
sinks_and_transformations
|
47
47
|
end
|
48
48
|
|
49
49
|
protected
|
@@ -81,7 +81,7 @@ module Chicago
|
|
81
81
|
|
82
82
|
private
|
83
83
|
|
84
|
-
def
|
84
|
+
def create_sinks_and_transformations
|
85
85
|
default = @sink_factory.sink(:ignore => @ignore_present_rows,
|
86
86
|
:exclude => @load_separately)
|
87
87
|
key_sink = if @schema_table.kind_of?(Chicago::Schema::Dimension)
|
@@ -90,24 +90,26 @@ module Chicago
|
|
90
90
|
# Facts have no key table to write to.
|
91
91
|
Flow::NullSink.new
|
92
92
|
end
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
93
|
+
|
94
|
+
{
|
95
|
+
:transformations => concat_transformations,
|
96
|
+
:sinks => {
|
97
|
+
:default => default,
|
98
|
+
:dimension_key => key_sink,
|
99
|
+
:error => @sink_factory.error_sink
|
100
|
+
}
|
101
|
+
}
|
101
102
|
end
|
102
103
|
|
103
104
|
def concat_transformations
|
104
105
|
TRANSFORMATION_ORDER.map {|k| @transformations[k] }.flatten
|
105
106
|
end
|
106
107
|
|
107
|
-
def register_additional_sinks(
|
108
|
+
def register_additional_sinks(sinks_and_transformations)
|
109
|
+
sinks = sinks_and_transformations[:sinks]
|
108
110
|
@key_mappings.each do |mapping|
|
109
111
|
sink = @sink_factory.key_sink(:table => mapping.table)
|
110
|
-
|
112
|
+
sinks[mapping.table] = sink
|
111
113
|
end
|
112
114
|
end
|
113
115
|
|
data/lib/chicago/etl/stage.rb
CHANGED
@@ -1,42 +1,30 @@
|
|
1
1
|
module Chicago
|
2
2
|
module ETL
|
3
|
+
# A Stage in the ETL pipeline.
|
4
|
+
#
|
5
|
+
# A Stage wires together a Source, 0 or more Transformations and 1
|
6
|
+
# or more Sinks.
|
3
7
|
class Stage
|
8
|
+
# Returns the source for this stage.
|
9
|
+
attr_reader :source
|
10
|
+
|
11
|
+
# Returns the name of this stage.
|
4
12
|
attr_reader :name
|
5
13
|
|
6
14
|
def initialize(name, options={})
|
7
15
|
@name = name
|
8
|
-
@source = options
|
9
|
-
|
10
|
-
|
11
|
-
@sinks = options.fetch(:sinks)
|
12
|
-
raise ArgumentError, "Stage #{name} requires at least one sink" if @sinks.empty?
|
13
|
-
|
14
|
-
@transformations = options.fetch(:transformations)
|
15
|
-
@transformation_chain = Chicago::Flow::TransformationChain.
|
16
|
-
new(*@transformations)
|
17
|
-
|
16
|
+
@source = options[:source]
|
17
|
+
@sinks = options[:sinks]
|
18
|
+
@transformations = options[:transformations] || []
|
18
19
|
@filter_strategy = options[:filter_strategy] ||
|
19
20
|
lambda {|source, _| source }
|
20
|
-
end
|
21
21
|
|
22
|
-
|
23
|
-
modified_source = reextract_and_filter_source(@source, etl_batch, reextract)
|
24
|
-
transform_and_load_from(modified_source)
|
25
|
-
end
|
26
|
-
|
27
|
-
def transform_and_load_from(source)
|
22
|
+
validate_arguments
|
28
23
|
end
|
29
24
|
|
30
|
-
def
|
31
|
-
|
32
|
-
filtered_dataset = source
|
33
|
-
else
|
34
|
-
filtered_dataset = @filter_strategy.call(source, etl_batch)
|
35
|
-
end
|
36
|
-
Chicago::Flow::DatasetSource.new(filtered_dataset)
|
25
|
+
def execute(etl_batch, reextract=false)
|
26
|
+
transform_and_load filtered_source(etl_batch, reextract)
|
37
27
|
end
|
38
|
-
|
39
|
-
attr_reader :transformation_chain
|
40
28
|
|
41
29
|
# Returns the named sink, if it exists
|
42
30
|
def sink(name)
|
@@ -46,20 +34,22 @@ module Chicago
|
|
46
34
|
def sinks
|
47
35
|
@sinks.values
|
48
36
|
end
|
37
|
+
|
38
|
+
def filtered_source(etl_batch, reextract=false)
|
39
|
+
filtered_dataset = reextract ? source :
|
40
|
+
@filter_strategy.call(source, etl_batch)
|
49
41
|
|
50
|
-
|
51
|
-
@sinks[name.to_sym] = sink
|
52
|
-
self
|
42
|
+
Chicago::Flow::DatasetSource.new(filtered_dataset)
|
53
43
|
end
|
54
|
-
|
55
|
-
|
44
|
+
|
45
|
+
private
|
46
|
+
|
47
|
+
def transform_and_load(source)
|
56
48
|
sinks.each(&:open)
|
57
49
|
pipe_rows_to_sinks_from(source)
|
58
50
|
sinks.each(&:close)
|
59
51
|
end
|
60
|
-
|
61
|
-
private
|
62
|
-
|
52
|
+
|
63
53
|
def pipe_rows_to_sinks_from(source)
|
64
54
|
source.each do |row|
|
65
55
|
transformation_chain.process(row).each {|row| process_row(row) }
|
@@ -67,10 +57,25 @@ module Chicago
|
|
67
57
|
transformation_chain.flush.each {|row| process_row(row) }
|
68
58
|
end
|
69
59
|
|
60
|
+
def transformation_chain
|
61
|
+
@transformation_chain ||= Chicago::Flow::TransformationChain.
|
62
|
+
new(*@transformations)
|
63
|
+
end
|
64
|
+
|
70
65
|
def process_row(row)
|
71
66
|
stream = row.delete(:_stream) || :default
|
72
67
|
@sinks[stream] << row
|
73
68
|
end
|
69
|
+
|
70
|
+
def validate_arguments
|
71
|
+
if @source.nil?
|
72
|
+
raise ArgumentError, "Stage #{@name} requires a source"
|
73
|
+
end
|
74
|
+
|
75
|
+
if @sinks.blank?
|
76
|
+
raise ArgumentError, "Stage #{@name} requires at least one sink"
|
77
|
+
end
|
78
|
+
end
|
74
79
|
end
|
75
80
|
end
|
76
81
|
end
|
@@ -1,8 +1,6 @@
|
|
1
1
|
module Chicago
|
2
2
|
module ETL
|
3
3
|
class StageBuilder
|
4
|
-
attr_reader :sink_factory
|
5
|
-
|
6
4
|
def initialize(db)
|
7
5
|
@db = db
|
8
6
|
end
|
@@ -20,15 +18,17 @@ module Chicago
|
|
20
18
|
:filter_strategy => @filter_strategy)
|
21
19
|
end
|
22
20
|
|
21
|
+
protected
|
22
|
+
|
23
23
|
def source(&block)
|
24
24
|
@dataset = DatasetBuilder.new(@db).build(&block)
|
25
25
|
end
|
26
26
|
|
27
|
-
def transformations(
|
28
|
-
@transformations =
|
27
|
+
def transformations(&block)
|
28
|
+
@transformations = TransformationBuilder.new.build(&block)
|
29
29
|
end
|
30
30
|
|
31
|
-
def sinks(
|
31
|
+
def sinks(&block)
|
32
32
|
@sinks = SinkBuilder.new.build(&block)
|
33
33
|
end
|
34
34
|
|
data/lib/chicago/etl.rb
CHANGED
@@ -10,7 +10,6 @@ require 'chicago/flow/errors'
|
|
10
10
|
require 'chicago/flow/transformation'
|
11
11
|
require 'chicago/flow/filter'
|
12
12
|
require 'chicago/flow/transformation_chain'
|
13
|
-
require 'chicago/flow/pipeline_stage'
|
14
13
|
require 'chicago/flow/pipeline_endpoint'
|
15
14
|
require 'chicago/flow/array_source'
|
16
15
|
require 'chicago/flow/dataset_source'
|
@@ -25,12 +24,12 @@ require 'chicago/etl/key_builder'
|
|
25
24
|
require 'chicago/etl/schema_table_sink_factory'
|
26
25
|
require 'chicago/etl/transformations'
|
27
26
|
require 'chicago/etl/load_dataset_builder'
|
28
|
-
require 'chicago/etl/
|
29
|
-
require 'chicago/etl/load_pipeline_stage_builder'
|
30
|
-
require 'chicago/etl/pipeline'
|
31
|
-
|
27
|
+
require 'chicago/etl/dataset_builder'
|
32
28
|
require 'chicago/etl/stage'
|
33
29
|
require 'chicago/etl/stage_builder'
|
30
|
+
require 'chicago/etl/dataset_batch_stage'
|
31
|
+
require 'chicago/etl/schema_sinks_and_transformations_builder'
|
32
|
+
require 'chicago/etl/pipeline'
|
34
33
|
|
35
34
|
# Sequel Extensions
|
36
35
|
require 'chicago/etl/sequel/filter_to_etl_batch'
|
@@ -0,0 +1,35 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe "creating and running a dimension stage" do
|
4
|
+
let(:rows) { [{:some_field => "value"}] }
|
5
|
+
let(:db) { double(:db).as_null_object }
|
6
|
+
let(:schema) {
|
7
|
+
schema = Chicago::StarSchema.new
|
8
|
+
|
9
|
+
schema.define_dimension(:test) do
|
10
|
+
columns do
|
11
|
+
string :foo
|
12
|
+
end
|
13
|
+
end
|
14
|
+
|
15
|
+
schema
|
16
|
+
}
|
17
|
+
|
18
|
+
let(:pipeline) { Chicago::ETL::Pipeline.new(db, schema)}
|
19
|
+
|
20
|
+
it "glues the source, transformations, and sink correctly" do
|
21
|
+
pipeline.define_dimension_load(:test) do
|
22
|
+
dataset do
|
23
|
+
db.test_dataset_method
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
pipeline.stages.each do |stage|
|
28
|
+
stage.execute(double, true)
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
32
|
+
it "should set the inserted at time on the dimension"
|
33
|
+
|
34
|
+
it "truncates the dimension if specified"
|
35
|
+
end
|
@@ -39,26 +39,6 @@ describe "defining and executing a stage" do
|
|
39
39
|
stage.sink(:another_stream).data.length.should == 0
|
40
40
|
end
|
41
41
|
|
42
|
-
it "requires sinks" do
|
43
|
-
expect {
|
44
|
-
pipeline.define_stage(:test_stage) do
|
45
|
-
source do
|
46
|
-
db.test_dataset_method
|
47
|
-
end
|
48
|
-
end
|
49
|
-
}.to raise_error(ArgumentError)
|
50
|
-
end
|
51
|
-
|
52
|
-
it "requires sources" do
|
53
|
-
expect {
|
54
|
-
pipeline.define_stage(:test_stage) do
|
55
|
-
sinks do
|
56
|
-
add Chicago::Flow::ArraySink.new(:test)
|
57
|
-
end
|
58
|
-
end
|
59
|
-
}.to raise_error(ArgumentError)
|
60
|
-
end
|
61
|
-
|
62
42
|
it "glues the source, transformations, and sink correctly" do
|
63
43
|
pipeline.define_stage(:test_stage) do
|
64
44
|
source do
|
@@ -90,8 +70,8 @@ describe "defining and executing a stage" do
|
|
90
70
|
it "allows the source to be filtered via a filter strategy" do
|
91
71
|
etl_batch_double = double
|
92
72
|
fake_source = []
|
73
|
+
fake_source.should_receive(:another_dataset_method).and_return([])
|
93
74
|
|
94
|
-
fake_source.should_receive(:another_dataset_method).and_return([])
|
95
75
|
pipeline.define_stage(:test_stage) do
|
96
76
|
source do
|
97
77
|
fake_source
|
@@ -1,6 +1,6 @@
|
|
1
1
|
require 'spec_helper'
|
2
2
|
|
3
|
-
describe Chicago::ETL::LoadPipelineStageBuilder do
|
3
|
+
describe Chicago::ETL::SchemaSinksAndTransformationsBuilder do
|
4
4
|
let(:dimension) { stub(:dimension).as_null_object }
|
5
5
|
let(:db) { stub(:db).as_null_object }
|
6
6
|
let(:sink_factory) { stub(:sink_factory).as_null_object }
|
@@ -34,6 +34,6 @@ describe Chicago::ETL::LoadPipelineStageBuilder do
|
|
34
34
|
key_mapping :bar, :original_id
|
35
35
|
end
|
36
36
|
|
37
|
-
stage
|
37
|
+
stage[:sinks][:bar].should_not be_nil
|
38
38
|
end
|
39
39
|
end
|
@@ -0,0 +1,40 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe Chicago::ETL::Stage do
|
4
|
+
it "requires a source" do
|
5
|
+
expect {
|
6
|
+
described_class.new(:test,
|
7
|
+
:source => nil,
|
8
|
+
:sinks => {:default => stub(:sink)})
|
9
|
+
}.to raise_error(ArgumentError)
|
10
|
+
end
|
11
|
+
|
12
|
+
it "requires sinks" do
|
13
|
+
expect {
|
14
|
+
described_class.new(:test,
|
15
|
+
:source => stub(:source),
|
16
|
+
:sinks => nil)
|
17
|
+
}.to raise_error(ArgumentError)
|
18
|
+
end
|
19
|
+
|
20
|
+
it "does not filter the dataset if re-extracting" do
|
21
|
+
stage = described_class.new(:test,
|
22
|
+
:source => stub(:source),
|
23
|
+
:sinks => {:default => stub(:sink)},
|
24
|
+
:filter_strategy => lambda { fail })
|
25
|
+
|
26
|
+
stage.filtered_source(stub(:etl_batch), true)
|
27
|
+
end
|
28
|
+
|
29
|
+
it "opens sinks before writing and closes them afterwards" do
|
30
|
+
sink = mock(:sink)
|
31
|
+
sink.should_receive(:open)
|
32
|
+
sink.should_receive(:close)
|
33
|
+
|
34
|
+
stage = described_class.new(:test,
|
35
|
+
:source => [],
|
36
|
+
:sinks => {:default => sink})
|
37
|
+
|
38
|
+
stage.execute(stub(:etl_batch), true)
|
39
|
+
end
|
40
|
+
end
|
@@ -44,29 +44,33 @@ describe "Mysql -> Mysql through transformation chain" do
|
|
44
44
|
|
45
45
|
it "copies data from source to destination" do
|
46
46
|
TEST_DB[:source].multi_insert([{:foo => nil, :bin => :unhex.sql_function("1F")},
|
47
|
-
|
48
|
-
|
47
|
+
{:foo => "Hello", :bin => :unhex.sql_function("1F")}])
|
48
|
+
|
49
49
|
source = Chicago::Flow::DatasetSource.
|
50
50
|
new(TEST_DB[:source].
|
51
51
|
select(:id, :foo, :hex.sql_function(:bin).as(:bin)))
|
52
|
+
|
53
|
+
transformations = [dup_row.new(:onto => :other)]
|
54
|
+
|
52
55
|
sink_1 = Chicago::Flow::MysqlFileSink.
|
53
56
|
new(TEST_DB, :destination, [:id, :foo, :bin])
|
54
57
|
sink_2 = Chicago::Flow::ArraySink.new([:id, :foo, :bin])
|
55
58
|
|
56
|
-
stage = Chicago::
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
59
|
+
stage = Chicago::ETL::Stage.new(:test,
|
60
|
+
:source => source,
|
61
|
+
:transformations => transformations,
|
62
|
+
:sinks => {
|
63
|
+
:default => sink_1,
|
64
|
+
:other => sink_2
|
65
|
+
})
|
63
66
|
|
64
|
-
stage.execute(
|
67
|
+
stage.execute(stub(:etl_batch), true)
|
65
68
|
|
66
69
|
expected = [{:id => 1, :foo => nil, :bin => "1F"},
|
67
70
|
{:id => 2, :foo => "Hello", :bin => "1F"}]
|
68
71
|
|
69
72
|
sink_2.data.should == expected
|
70
|
-
TEST_DB[:destination].select(:id, :foo, :hex.sql_function(:bin).as(:bin)).
|
73
|
+
TEST_DB[:destination].select(:id, :foo, :hex.sql_function(:bin).as(:bin)).
|
74
|
+
all.should == expected
|
71
75
|
end
|
72
76
|
end
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: chicago-etl
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 19
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 1
|
9
|
-
- 3
|
10
|
-
version: 0.1.3
|
9
|
+
- 4
|
10
|
+
version: 0.1.4
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Roland Swingler
|
@@ -243,10 +243,11 @@ files:
|
|
243
243
|
- lib/chicago/etl/core_extensions.rb
|
244
244
|
- lib/chicago/etl/counter.rb
|
245
245
|
- lib/chicago/etl/dataset_batch_stage.rb
|
246
|
+
- lib/chicago/etl/dataset_builder.rb
|
246
247
|
- lib/chicago/etl/key_builder.rb
|
247
248
|
- lib/chicago/etl/load_dataset_builder.rb
|
248
|
-
- lib/chicago/etl/load_pipeline_stage_builder.rb
|
249
249
|
- lib/chicago/etl/pipeline.rb
|
250
|
+
- lib/chicago/etl/schema_sinks_and_transformations_builder.rb
|
250
251
|
- lib/chicago/etl/schema_table_sink_factory.rb
|
251
252
|
- lib/chicago/etl/screens/column_screen.rb
|
252
253
|
- lib/chicago/etl/screens/invalid_element.rb
|
@@ -273,7 +274,6 @@ files:
|
|
273
274
|
- lib/chicago/flow/mysql_file_sink.rb
|
274
275
|
- lib/chicago/flow/null_sink.rb
|
275
276
|
- lib/chicago/flow/pipeline_endpoint.rb
|
276
|
-
- lib/chicago/flow/pipeline_stage.rb
|
277
277
|
- lib/chicago/flow/sink.rb
|
278
278
|
- lib/chicago/flow/transformation.rb
|
279
279
|
- lib/chicago/flow/transformation_chain.rb
|
@@ -281,7 +281,7 @@ files:
|
|
281
281
|
- spec/etl/batch_spec.rb
|
282
282
|
- spec/etl/core_extensions_spec.rb
|
283
283
|
- spec/etl/counter_spec.rb
|
284
|
-
- spec/etl/dataset_batch_stage_spec.rb
|
284
|
+
- spec/etl/define_dimension_stage_spec.rb
|
285
285
|
- spec/etl/define_stage_spec.rb
|
286
286
|
- spec/etl/etl_batch_id_dataset_filter.rb
|
287
287
|
- spec/etl/key_builder_spec.rb
|
@@ -293,6 +293,7 @@ files:
|
|
293
293
|
- spec/etl/screens/out_of_bounds_spec.rb
|
294
294
|
- spec/etl/sequel/dependant_tables_spec.rb
|
295
295
|
- spec/etl/sequel/filter_to_etl_batch_spec.rb
|
296
|
+
- spec/etl/stage_spec.rb
|
296
297
|
- spec/etl/table_builder_spec.rb
|
297
298
|
- spec/etl/task_spec.rb
|
298
299
|
- spec/etl/transformations/deduplicate_rows_spec.rb
|
@@ -306,7 +307,6 @@ files:
|
|
306
307
|
- spec/flow/mysql_file_serializer_spec.rb
|
307
308
|
- spec/flow/mysql_file_sink_spec.rb
|
308
309
|
- spec/flow/mysql_integration_spec.rb
|
309
|
-
- spec/flow/pipeline_stage_spec.rb
|
310
310
|
- spec/flow/transformation_chain_spec.rb
|
311
311
|
- spec/flow/transformation_spec.rb
|
312
312
|
- spec/spec_helper.rb
|
@@ -1,68 +0,0 @@
|
|
1
|
-
module Chicago
|
2
|
-
module Flow
|
3
|
-
# Co-ordinates iterating over rows provided by a source, passing
|
4
|
-
# them through a transformation chain before writing them to
|
5
|
-
# sink(s).
|
6
|
-
#
|
7
|
-
# @api public
|
8
|
-
class PipelineStage
|
9
|
-
attr_reader :transformation_chain
|
10
|
-
|
11
|
-
def initialize(options={})
|
12
|
-
@sinks = options[:sinks] || {}
|
13
|
-
@transformations = options[:transformations] || []
|
14
|
-
@error_handler = options[:error_handler] || RaisingErrorHandler.new
|
15
|
-
@transformation_chain = TransformationChain.new(*@transformations)
|
16
|
-
end
|
17
|
-
|
18
|
-
# Returns the named sink, if it exists
|
19
|
-
def sink(name)
|
20
|
-
@sinks[name.to_sym]
|
21
|
-
end
|
22
|
-
|
23
|
-
def sinks
|
24
|
-
@sinks.values
|
25
|
-
end
|
26
|
-
|
27
|
-
def register_sink(name, sink)
|
28
|
-
@sinks[name.to_sym] = sink
|
29
|
-
self
|
30
|
-
end
|
31
|
-
|
32
|
-
def validate_pipeline
|
33
|
-
unless unregistered_sinks.empty?
|
34
|
-
@error_handler.unregistered_sinks(unregistered_sinks)
|
35
|
-
end
|
36
|
-
end
|
37
|
-
|
38
|
-
def execute(source)
|
39
|
-
validate_pipeline
|
40
|
-
sinks.each(&:open)
|
41
|
-
pipe_rows_to_sinks_from(source)
|
42
|
-
sinks.each(&:close)
|
43
|
-
end
|
44
|
-
|
45
|
-
def required_sinks
|
46
|
-
transformation_chain.output_streams | [:default]
|
47
|
-
end
|
48
|
-
|
49
|
-
def unregistered_sinks
|
50
|
-
required_sinks - @sinks.keys
|
51
|
-
end
|
52
|
-
|
53
|
-
private
|
54
|
-
|
55
|
-
def pipe_rows_to_sinks_from(source)
|
56
|
-
source.each do |row|
|
57
|
-
transformation_chain.process(row).each {|row| process_row(row) }
|
58
|
-
end
|
59
|
-
transformation_chain.flush.each {|row| process_row(row) }
|
60
|
-
end
|
61
|
-
|
62
|
-
def process_row(row)
|
63
|
-
stream = row.delete(:_stream) || :default
|
64
|
-
@sinks[stream] << row
|
65
|
-
end
|
66
|
-
end
|
67
|
-
end
|
68
|
-
end
|
@@ -1,55 +0,0 @@
|
|
1
|
-
require 'spec_helper'
|
2
|
-
|
3
|
-
describe Chicago::ETL::DatasetBatchStage do
|
4
|
-
let(:pipeline_stage) { mock(:pipeline_stage).as_null_object }
|
5
|
-
let(:dataset) { mock(:dataset).as_null_object }
|
6
|
-
let(:stage) { described_class.new(:foo, dataset, pipeline_stage) }
|
7
|
-
let(:etl_batch) { stub(:etl_batch) }
|
8
|
-
|
9
|
-
it "has a name" do
|
10
|
-
stage.name.should == :foo
|
11
|
-
end
|
12
|
-
|
13
|
-
it "should set the inserted at time on the default sink" do
|
14
|
-
sink = Chicago::Flow::ArraySink.new(:foo)
|
15
|
-
pipeline_stage.stub(:sink).with(:default).and_return(sink)
|
16
|
-
stage.pipeline_stage.should == pipeline_stage
|
17
|
-
|
18
|
-
sink.constant_values[:_inserted_at].should_not be_nil
|
19
|
-
end
|
20
|
-
|
21
|
-
it "filters the dataset to the batch" do
|
22
|
-
dataset.should_recieve(:filter_to_etl_batch).with(etl_batch)
|
23
|
-
stage.source(etl_batch)
|
24
|
-
end
|
25
|
-
|
26
|
-
it "does not filter the dataset if re-extracting" do
|
27
|
-
dataset.should_not_recieve(:filter_to_etl_batch)
|
28
|
-
stage.source(etl_batch, true)
|
29
|
-
end
|
30
|
-
|
31
|
-
it "can filter via a custom strategy" do
|
32
|
-
dataset.should_not_recieve(:filter_to_etl_batch)
|
33
|
-
|
34
|
-
filter_strategy = lambda {|ds, batch| ds }
|
35
|
-
described_class.new(:foo, dataset, pipeline_stage, :filter_strategy => filter_strategy).
|
36
|
-
source(etl_batch)
|
37
|
-
end
|
38
|
-
|
39
|
-
it "executes the pipeline stage using a DatasetSource" do
|
40
|
-
pipeline_stage.should_receive(:execute).
|
41
|
-
with(kind_of(Chicago::Flow::DatasetSource))
|
42
|
-
stage.execute(etl_batch, true)
|
43
|
-
end
|
44
|
-
|
45
|
-
it "truncates any sinks if truncate_pre_load has been set" do
|
46
|
-
stage = described_class.new(:foo, dataset, pipeline_stage,
|
47
|
-
:truncate_pre_load => true)
|
48
|
-
|
49
|
-
sink = Chicago::Flow::ArraySink.new(:output)
|
50
|
-
sink << {:foo => "foo"}
|
51
|
-
pipeline_stage.stub(:sinks).and_return([sink])
|
52
|
-
stage.execute(etl_batch)
|
53
|
-
sink.data.should == []
|
54
|
-
end
|
55
|
-
end
|
@@ -1,89 +0,0 @@
|
|
1
|
-
require 'spec_helper'
|
2
|
-
|
3
|
-
describe Chicago::Flow::PipelineStage do
|
4
|
-
let(:transform) {
|
5
|
-
Class.new(Chicago::Flow::Transformation) {
|
6
|
-
def process_row(row)
|
7
|
-
row[:a] += 1
|
8
|
-
row
|
9
|
-
end
|
10
|
-
}
|
11
|
-
}
|
12
|
-
|
13
|
-
let(:add_error) {
|
14
|
-
Class.new(Chicago::Flow::Transformation) {
|
15
|
-
# add_output_stream :error
|
16
|
-
def output_streams
|
17
|
-
[:default, :error]
|
18
|
-
end
|
19
|
-
|
20
|
-
def process_row(row)
|
21
|
-
[row, {Chicago::Flow::STREAM => :error, :message => "error"}]
|
22
|
-
end
|
23
|
-
}
|
24
|
-
}
|
25
|
-
|
26
|
-
let(:sink) { Chicago::Flow::ArraySink.new(:test) }
|
27
|
-
let(:source) { Chicago::Flow::ArraySource.new([{:a => 1}]) }
|
28
|
-
|
29
|
-
it "returns all sinks" do
|
30
|
-
stage = described_class.new.register_sink(:default, sink)
|
31
|
-
stage.sinks.should == [sink]
|
32
|
-
end
|
33
|
-
|
34
|
-
it "returns a sink by name" do
|
35
|
-
stage = described_class.new.register_sink(:default, sink)
|
36
|
-
stage.sink(:default).should == sink
|
37
|
-
end
|
38
|
-
|
39
|
-
it "reads from source to sink" do
|
40
|
-
pipeline = described_class.new.register_sink(:default, sink)
|
41
|
-
pipeline.execute(source)
|
42
|
-
sink.data.should == [{:a => 1}]
|
43
|
-
end
|
44
|
-
|
45
|
-
it "passes rows through transforms" do
|
46
|
-
pipeline = described_class.new(:transformations => [transform.new]).
|
47
|
-
register_sink(:default, sink)
|
48
|
-
|
49
|
-
pipeline.execute(source)
|
50
|
-
sink.data.should == [{:a => 2}]
|
51
|
-
end
|
52
|
-
|
53
|
-
it "writes rows to the appropriate sink for their stream, and strips the stream tag" do
|
54
|
-
error_sink = Chicago::Flow::ArraySink.new(:test)
|
55
|
-
|
56
|
-
pipeline = described_class.new(:transformations => [add_error.new]).
|
57
|
-
register_sink(:default, sink).
|
58
|
-
register_sink(:error, error_sink)
|
59
|
-
|
60
|
-
pipeline.execute(source)
|
61
|
-
sink.data.should == [{:a => 1}]
|
62
|
-
error_sink.data.should == [{:message => "error"}]
|
63
|
-
end
|
64
|
-
|
65
|
-
it "calls an error handler if sinks are not registered" do
|
66
|
-
error_handler = mock(:error_handler)
|
67
|
-
error_handler.should_receive(:unregistered_sinks).
|
68
|
-
with([:default, :error])
|
69
|
-
|
70
|
-
pipeline = described_class.new(:transformations => [add_error.new],
|
71
|
-
:error_handler => error_handler)
|
72
|
-
|
73
|
-
pipeline.validate_pipeline
|
74
|
-
end
|
75
|
-
|
76
|
-
it "by default raises an exception if the pipeline is not valid when executed" do
|
77
|
-
pipeline = described_class.new(:transformations => [add_error.new])
|
78
|
-
expect { pipeline.execute(source) }.to raise_error(Chicago::Flow::Error)
|
79
|
-
end
|
80
|
-
|
81
|
-
it "opens sinks before writing and closes them afterwards" do
|
82
|
-
sink = mock(:sink)
|
83
|
-
pipeline = described_class.new.register_sink(:default, sink)
|
84
|
-
sink.should_receive(:open)
|
85
|
-
sink.stub(:<<)
|
86
|
-
sink.should_receive(:close)
|
87
|
-
pipeline.execute(source)
|
88
|
-
end
|
89
|
-
end
|