chicago-etl 0.1.3 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/VERSION +1 -1
- data/chicago-etl.gemspec +5 -5
- data/lib/chicago/etl/dataset_batch_stage.rb +12 -32
- data/lib/chicago/etl/dataset_builder.rb +60 -0
- data/lib/chicago/etl/pipeline.rb +9 -62
- data/lib/chicago/etl/{load_pipeline_stage_builder.rb → schema_sinks_and_transformations_builder.rb} +17 -15
- data/lib/chicago/etl/stage.rb +39 -34
- data/lib/chicago/etl/stage_builder.rb +5 -5
- data/lib/chicago/etl.rb +4 -5
- data/spec/etl/define_dimension_stage_spec.rb +35 -0
- data/spec/etl/define_stage_spec.rb +1 -21
- data/spec/etl/pipeline_stage_builder_spec.rb +2 -2
- data/spec/etl/stage_spec.rb +40 -0
- data/spec/flow/mysql_integration_spec.rb +15 -11
- metadata +7 -7
- data/lib/chicago/flow/pipeline_stage.rb +0 -68
- data/spec/etl/dataset_batch_stage_spec.rb +0 -55
- data/spec/flow/pipeline_stage_spec.rb +0 -89
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.1.3
|
1
|
+
0.1.4
|
data/chicago-etl.gemspec
CHANGED
@@ -5,7 +5,7 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = "chicago-etl"
|
8
|
-
s.version = "0.1.3"
|
8
|
+
s.version = "0.1.4"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Roland Swingler"]
|
@@ -32,10 +32,11 @@ Gem::Specification.new do |s|
|
|
32
32
|
"lib/chicago/etl/core_extensions.rb",
|
33
33
|
"lib/chicago/etl/counter.rb",
|
34
34
|
"lib/chicago/etl/dataset_batch_stage.rb",
|
35
|
+
"lib/chicago/etl/dataset_builder.rb",
|
35
36
|
"lib/chicago/etl/key_builder.rb",
|
36
37
|
"lib/chicago/etl/load_dataset_builder.rb",
|
37
|
-
"lib/chicago/etl/load_pipeline_stage_builder.rb",
|
38
38
|
"lib/chicago/etl/pipeline.rb",
|
39
|
+
"lib/chicago/etl/schema_sinks_and_transformations_builder.rb",
|
39
40
|
"lib/chicago/etl/schema_table_sink_factory.rb",
|
40
41
|
"lib/chicago/etl/screens/column_screen.rb",
|
41
42
|
"lib/chicago/etl/screens/invalid_element.rb",
|
@@ -62,7 +63,6 @@ Gem::Specification.new do |s|
|
|
62
63
|
"lib/chicago/flow/mysql_file_sink.rb",
|
63
64
|
"lib/chicago/flow/null_sink.rb",
|
64
65
|
"lib/chicago/flow/pipeline_endpoint.rb",
|
65
|
-
"lib/chicago/flow/pipeline_stage.rb",
|
66
66
|
"lib/chicago/flow/sink.rb",
|
67
67
|
"lib/chicago/flow/transformation.rb",
|
68
68
|
"lib/chicago/flow/transformation_chain.rb",
|
@@ -70,7 +70,7 @@ Gem::Specification.new do |s|
|
|
70
70
|
"spec/etl/batch_spec.rb",
|
71
71
|
"spec/etl/core_extensions_spec.rb",
|
72
72
|
"spec/etl/counter_spec.rb",
|
73
|
-
"spec/etl/dataset_batch_stage_spec.rb",
|
73
|
+
"spec/etl/define_dimension_stage_spec.rb",
|
74
74
|
"spec/etl/define_stage_spec.rb",
|
75
75
|
"spec/etl/etl_batch_id_dataset_filter.rb",
|
76
76
|
"spec/etl/key_builder_spec.rb",
|
@@ -82,6 +82,7 @@ Gem::Specification.new do |s|
|
|
82
82
|
"spec/etl/screens/out_of_bounds_spec.rb",
|
83
83
|
"spec/etl/sequel/dependant_tables_spec.rb",
|
84
84
|
"spec/etl/sequel/filter_to_etl_batch_spec.rb",
|
85
|
+
"spec/etl/stage_spec.rb",
|
85
86
|
"spec/etl/table_builder_spec.rb",
|
86
87
|
"spec/etl/task_spec.rb",
|
87
88
|
"spec/etl/transformations/deduplicate_rows_spec.rb",
|
@@ -95,7 +96,6 @@ Gem::Specification.new do |s|
|
|
95
96
|
"spec/flow/mysql_file_serializer_spec.rb",
|
96
97
|
"spec/flow/mysql_file_sink_spec.rb",
|
97
98
|
"spec/flow/mysql_integration_spec.rb",
|
98
|
-
"spec/flow/pipeline_stage_spec.rb",
|
99
99
|
"spec/flow/transformation_chain_spec.rb",
|
100
100
|
"spec/flow/transformation_spec.rb",
|
101
101
|
"spec/spec_helper.rb"
|
@@ -4,48 +4,28 @@ module Chicago
|
|
4
4
|
#
|
5
5
|
# Allows deferring constructing a DatasetSource until extract
|
6
6
|
# time, so that it can be filtered to an ETL batch appropriately.
|
7
|
-
class DatasetBatchStage
|
7
|
+
class DatasetBatchStage < Stage
|
8
8
|
attr_reader :name
|
9
9
|
|
10
|
-
def initialize(name,
|
11
|
-
|
12
|
-
@
|
13
|
-
|
14
|
-
@filter_strategy = options[:filter_strategy] || lambda {|dataset, etl_batch|
|
15
|
-
dataset.filter_to_etl_batch(etl_batch)
|
16
|
-
}
|
10
|
+
def initialize(name, options={})
|
11
|
+
super
|
12
|
+
@filter_strategy = options[:filter_strategy] ||
|
13
|
+
lambda { |dataset, etl_batch| @source.filter_to_etl_batch(etl_batch)}
|
17
14
|
@truncate_pre_load = !!options[:truncate_pre_load]
|
18
|
-
|
15
|
+
end
|
19
16
|
|
20
17
|
# Executes this ETL stage.
|
21
18
|
#
|
22
19
|
# Configures the dataset and flows rows into the pipeline.
|
23
20
|
def execute(etl_batch, reextract=false)
|
24
21
|
if @truncate_pre_load
|
25
|
-
|
26
|
-
elsif reextract &&
|
27
|
-
|
28
|
-
end
|
29
|
-
|
30
|
-
pipeline_stage.execute(source(etl_batch, reextract))
|
31
|
-
end
|
32
|
-
|
33
|
-
# Returns the pipeline for this stage.
|
34
|
-
def pipeline_stage
|
35
|
-
@pipeline_stage.sink(:default).
|
36
|
-
set_constant_values(:_inserted_at => Time.now)
|
37
|
-
@pipeline_stage
|
38
|
-
end
|
39
|
-
|
40
|
-
# Returns a DatasetSource for the provided dataset filtered to
|
41
|
-
# the ETL batch as appropriate.
|
42
|
-
def source(etl_batch, reextract=false)
|
43
|
-
if reextract
|
44
|
-
filtered_dataset = @dataset
|
45
|
-
else
|
46
|
-
filtered_dataset = @filter_strategy.call(@dataset, etl_batch)
|
22
|
+
sinks.each {|sink| sink.truncate }
|
23
|
+
elsif reextract && sink(:error)
|
24
|
+
sink(:error).truncate
|
47
25
|
end
|
48
|
-
|
26
|
+
|
27
|
+
sink(:default).set_constant_values(:_inserted_at => Time.now)
|
28
|
+
super
|
49
29
|
end
|
50
30
|
end
|
51
31
|
end
|
@@ -0,0 +1,60 @@
|
|
1
|
+
module Chicago
|
2
|
+
module ETL
|
3
|
+
# Provides convenience methods for defining source datasets.
|
4
|
+
class DatasetBuilder
|
5
|
+
attr_reader :db
|
6
|
+
|
7
|
+
# @api private
|
8
|
+
def initialize(db)
|
9
|
+
@db = db
|
10
|
+
end
|
11
|
+
|
12
|
+
# @api private
|
13
|
+
def build(&block)
|
14
|
+
instance_eval(&block)
|
15
|
+
end
|
16
|
+
|
17
|
+
protected
|
18
|
+
|
19
|
+
def key_field(field, name)
|
20
|
+
:if[{field => nil}, 1, field].as(name)
|
21
|
+
end
|
22
|
+
|
23
|
+
# Returns a column for use in a Sequel::Dataset#select method to
|
24
|
+
# return a dimension key.
|
25
|
+
#
|
26
|
+
# Takes care of using the key tables correctly, and dealing with
|
27
|
+
# missing dimension values.
|
28
|
+
def dimension_key(name)
|
29
|
+
key_field("keys_dimension_#{name}__dimension_id".to_sym,
|
30
|
+
"#{name}_dimension_id".to_sym)
|
31
|
+
end
|
32
|
+
|
33
|
+
# Returns a column for use in a Sequel::Dataset#select method to
|
34
|
+
# return a date dimension key.
|
35
|
+
def date_dimension_column(dimension)
|
36
|
+
:if.sql_function({:id.qualify(dimension) => nil},
|
37
|
+
1,
|
38
|
+
:id.qualify(dimension)).
|
39
|
+
as("#{dimension}_dimension_id".to_sym)
|
40
|
+
end
|
41
|
+
|
42
|
+
# Rounds a monetary value to 2 decimal places.
|
43
|
+
#
|
44
|
+
# By default, natural rounding is used, you can specify either
|
45
|
+
# :up or :down as the direction.
|
46
|
+
#
|
47
|
+
# @deprecated
|
48
|
+
def round(stmt, direction = :none)
|
49
|
+
case direction
|
50
|
+
when :none
|
51
|
+
:round.sql_function(stmt, 2)
|
52
|
+
when :up
|
53
|
+
:ceil.sql_function(stmt * 100) / 100
|
54
|
+
when :down
|
55
|
+
:floor.sql_function(stmt * 100) / 100
|
56
|
+
end
|
57
|
+
end
|
58
|
+
end
|
59
|
+
end
|
60
|
+
end
|
data/lib/chicago/etl/pipeline.rb
CHANGED
@@ -61,13 +61,17 @@ module Chicago
|
|
61
61
|
# @api private
|
62
62
|
def build(name, &block)
|
63
63
|
instance_eval &block
|
64
|
-
unless defined? @
|
64
|
+
unless defined? @sinks_and_transformations
|
65
65
|
pipeline do
|
66
66
|
end
|
67
67
|
end
|
68
|
-
|
69
|
-
|
70
|
-
|
68
|
+
DatasetBatchStage.new(name,
|
69
|
+
:source => @dataset,
|
70
|
+
:transformations => @sinks_and_transformations[:transformations],
|
71
|
+
:sinks => @sinks_and_transformations[:sinks],
|
72
|
+
:filter_strategy => @filter_strategy,
|
73
|
+
:truncate_pre_load => @truncate_pre_load)
|
74
|
+
|
71
75
|
end
|
72
76
|
|
73
77
|
protected
|
@@ -88,7 +92,7 @@ module Chicago
|
|
88
92
|
# for details.
|
89
93
|
# TODO: rename pipeline => transforms below this method
|
90
94
|
def pipeline(&block)
|
91
|
-
@
|
95
|
+
@sinks_and_transformations = SchemaSinksAndTransformationsBuilder.new(@db, @schema_table).
|
92
96
|
build(&block)
|
93
97
|
end
|
94
98
|
|
@@ -106,62 +110,5 @@ module Chicago
|
|
106
110
|
@filter_strategy = block
|
107
111
|
end
|
108
112
|
end
|
109
|
-
|
110
|
-
# Provides convenience methods for defining source datasets.
|
111
|
-
class DatasetBuilder
|
112
|
-
attr_reader :db
|
113
|
-
|
114
|
-
# @api private
|
115
|
-
def initialize(db)
|
116
|
-
@db = db
|
117
|
-
end
|
118
|
-
|
119
|
-
# @api private
|
120
|
-
def build(&block)
|
121
|
-
instance_eval(&block)
|
122
|
-
end
|
123
|
-
|
124
|
-
protected
|
125
|
-
|
126
|
-
def key_field(field, name)
|
127
|
-
:if[{field => nil}, 1, field].as(name)
|
128
|
-
end
|
129
|
-
|
130
|
-
# Returns a column for use in a Sequel::Dataset#select method to
|
131
|
-
# return a dimension key.
|
132
|
-
#
|
133
|
-
# Takes care of using the key tables correctly, and dealing with
|
134
|
-
# missing dimension values.
|
135
|
-
def dimension_key(name)
|
136
|
-
key_field("keys_dimension_#{name}__dimension_id".to_sym,
|
137
|
-
"#{name}_dimension_id".to_sym)
|
138
|
-
end
|
139
|
-
|
140
|
-
# Returns a column for use in a Sequel::Dataset#select method to
|
141
|
-
# return a date dimension key.
|
142
|
-
def date_dimension_column(dimension)
|
143
|
-
:if.sql_function({:id.qualify(dimension) => nil},
|
144
|
-
1,
|
145
|
-
:id.qualify(dimension)).
|
146
|
-
as("#{dimension}_dimension_id".to_sym)
|
147
|
-
end
|
148
|
-
|
149
|
-
# Rounds a monetary value to 2 decimal places.
|
150
|
-
#
|
151
|
-
# By default, natural rounding is used, you can specify either
|
152
|
-
# :up or :down as the direction.
|
153
|
-
#
|
154
|
-
# @deprecated
|
155
|
-
def round(stmt, direction = :none)
|
156
|
-
case direction
|
157
|
-
when :none
|
158
|
-
:round.sql_function(stmt, 2)
|
159
|
-
when :up
|
160
|
-
:ceil.sql_function(stmt * 100) / 100
|
161
|
-
when :down
|
162
|
-
:floor.sql_function(stmt * 100) / 100
|
163
|
-
end
|
164
|
-
end
|
165
|
-
end
|
166
113
|
end
|
167
114
|
end
|
data/lib/chicago/etl/{load_pipeline_stage_builder.rb → schema_sinks_and_transformations_builder.rb}
RENAMED
@@ -5,7 +5,7 @@ module Chicago
|
|
5
5
|
#
|
6
6
|
# Clients will not normally instantiate this themselves but use it
|
7
7
|
# in the context of defining an ETL stage.
|
8
|
-
class LoadPipelineStageBuilder
|
8
|
+
class SchemaSinksAndTransformationsBuilder
|
9
9
|
# @api private
|
10
10
|
KeyMapping = Struct.new(:table, :field)
|
11
11
|
|
@@ -41,9 +41,9 @@ module Chicago
|
|
41
41
|
add_screens
|
42
42
|
add_key_transforms
|
43
43
|
add_final_transforms
|
44
|
-
|
45
|
-
register_additional_sinks(
|
46
|
-
|
44
|
+
sinks_and_transformations = create_sinks_and_transformations
|
45
|
+
register_additional_sinks(sinks_and_transformations)
|
46
|
+
sinks_and_transformations
|
47
47
|
end
|
48
48
|
|
49
49
|
protected
|
@@ -81,7 +81,7 @@ module Chicago
|
|
81
81
|
|
82
82
|
private
|
83
83
|
|
84
|
-
def
|
84
|
+
def create_sinks_and_transformations
|
85
85
|
default = @sink_factory.sink(:ignore => @ignore_present_rows,
|
86
86
|
:exclude => @load_separately)
|
87
87
|
key_sink = if @schema_table.kind_of?(Chicago::Schema::Dimension)
|
@@ -90,24 +90,26 @@ module Chicago
|
|
90
90
|
# Facts have no key table to write to.
|
91
91
|
Flow::NullSink.new
|
92
92
|
end
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
93
|
+
|
94
|
+
{
|
95
|
+
:transformations => concat_transformations,
|
96
|
+
:sinks => {
|
97
|
+
:default => default,
|
98
|
+
:dimension_key => key_sink,
|
99
|
+
:error => @sink_factory.error_sink
|
100
|
+
}
|
101
|
+
}
|
101
102
|
end
|
102
103
|
|
103
104
|
def concat_transformations
|
104
105
|
TRANSFORMATION_ORDER.map {|k| @transformations[k] }.flatten
|
105
106
|
end
|
106
107
|
|
107
|
-
def register_additional_sinks(
|
108
|
+
def register_additional_sinks(sinks_and_transformations)
|
109
|
+
sinks = sinks_and_transformations[:sinks]
|
108
110
|
@key_mappings.each do |mapping|
|
109
111
|
sink = @sink_factory.key_sink(:table => mapping.table)
|
110
|
-
|
112
|
+
sinks[mapping.table] = sink
|
111
113
|
end
|
112
114
|
end
|
113
115
|
|
data/lib/chicago/etl/stage.rb
CHANGED
@@ -1,42 +1,30 @@
|
|
1
1
|
module Chicago
|
2
2
|
module ETL
|
3
|
+
# A Stage in the ETL pipeline.
|
4
|
+
#
|
5
|
+
# A Stage wires together a Source, 0 or more Transformations and 1
|
6
|
+
# or more Sinks.
|
3
7
|
class Stage
|
8
|
+
# Returns the source for this stage.
|
9
|
+
attr_reader :source
|
10
|
+
|
11
|
+
# Returns the name of this stage.
|
4
12
|
attr_reader :name
|
5
13
|
|
6
14
|
def initialize(name, options={})
|
7
15
|
@name = name
|
8
|
-
@source = options
|
9
|
-
|
10
|
-
|
11
|
-
@sinks = options.fetch(:sinks)
|
12
|
-
raise ArgumentError, "Stage #{name} requires at least one sink" if @sinks.empty?
|
13
|
-
|
14
|
-
@transformations = options.fetch(:transformations)
|
15
|
-
@transformation_chain = Chicago::Flow::TransformationChain.
|
16
|
-
new(*@transformations)
|
17
|
-
|
16
|
+
@source = options[:source]
|
17
|
+
@sinks = options[:sinks]
|
18
|
+
@transformations = options[:transformations] || []
|
18
19
|
@filter_strategy = options[:filter_strategy] ||
|
19
20
|
lambda {|source, _| source }
|
20
|
-
end
|
21
21
|
|
22
|
-
|
23
|
-
modified_source = reextract_and_filter_source(@source, etl_batch, reextract)
|
24
|
-
transform_and_load_from(modified_source)
|
25
|
-
end
|
26
|
-
|
27
|
-
def transform_and_load_from(source)
|
22
|
+
validate_arguments
|
28
23
|
end
|
29
24
|
|
30
|
-
def
|
31
|
-
|
32
|
-
filtered_dataset = source
|
33
|
-
else
|
34
|
-
filtered_dataset = @filter_strategy.call(source, etl_batch)
|
35
|
-
end
|
36
|
-
Chicago::Flow::DatasetSource.new(filtered_dataset)
|
25
|
+
def execute(etl_batch, reextract=false)
|
26
|
+
transform_and_load filtered_source(etl_batch, reextract)
|
37
27
|
end
|
38
|
-
|
39
|
-
attr_reader :transformation_chain
|
40
28
|
|
41
29
|
# Returns the named sink, if it exists
|
42
30
|
def sink(name)
|
@@ -46,20 +34,22 @@ module Chicago
|
|
46
34
|
def sinks
|
47
35
|
@sinks.values
|
48
36
|
end
|
37
|
+
|
38
|
+
def filtered_source(etl_batch, reextract=false)
|
39
|
+
filtered_dataset = reextract ? source :
|
40
|
+
@filter_strategy.call(source, etl_batch)
|
49
41
|
|
50
|
-
|
51
|
-
@sinks[name.to_sym] = sink
|
52
|
-
self
|
42
|
+
Chicago::Flow::DatasetSource.new(filtered_dataset)
|
53
43
|
end
|
54
|
-
|
55
|
-
|
44
|
+
|
45
|
+
private
|
46
|
+
|
47
|
+
def transform_and_load(source)
|
56
48
|
sinks.each(&:open)
|
57
49
|
pipe_rows_to_sinks_from(source)
|
58
50
|
sinks.each(&:close)
|
59
51
|
end
|
60
|
-
|
61
|
-
private
|
62
|
-
|
52
|
+
|
63
53
|
def pipe_rows_to_sinks_from(source)
|
64
54
|
source.each do |row|
|
65
55
|
transformation_chain.process(row).each {|row| process_row(row) }
|
@@ -67,10 +57,25 @@ module Chicago
|
|
67
57
|
transformation_chain.flush.each {|row| process_row(row) }
|
68
58
|
end
|
69
59
|
|
60
|
+
def transformation_chain
|
61
|
+
@transformation_chain ||= Chicago::Flow::TransformationChain.
|
62
|
+
new(*@transformations)
|
63
|
+
end
|
64
|
+
|
70
65
|
def process_row(row)
|
71
66
|
stream = row.delete(:_stream) || :default
|
72
67
|
@sinks[stream] << row
|
73
68
|
end
|
69
|
+
|
70
|
+
def validate_arguments
|
71
|
+
if @source.nil?
|
72
|
+
raise ArgumentError, "Stage #{@name} requires a source"
|
73
|
+
end
|
74
|
+
|
75
|
+
if @sinks.blank?
|
76
|
+
raise ArgumentError, "Stage #{@name} requires at least one sink"
|
77
|
+
end
|
78
|
+
end
|
74
79
|
end
|
75
80
|
end
|
76
81
|
end
|
@@ -1,8 +1,6 @@
|
|
1
1
|
module Chicago
|
2
2
|
module ETL
|
3
3
|
class StageBuilder
|
4
|
-
attr_reader :sink_factory
|
5
|
-
|
6
4
|
def initialize(db)
|
7
5
|
@db = db
|
8
6
|
end
|
@@ -20,15 +18,17 @@ module Chicago
|
|
20
18
|
:filter_strategy => @filter_strategy)
|
21
19
|
end
|
22
20
|
|
21
|
+
protected
|
22
|
+
|
23
23
|
def source(&block)
|
24
24
|
@dataset = DatasetBuilder.new(@db).build(&block)
|
25
25
|
end
|
26
26
|
|
27
|
-
def transformations(
|
28
|
-
@transformations =
|
27
|
+
def transformations(&block)
|
28
|
+
@transformations = TransformationBuilder.new.build(&block)
|
29
29
|
end
|
30
30
|
|
31
|
-
def sinks(
|
31
|
+
def sinks(&block)
|
32
32
|
@sinks = SinkBuilder.new.build(&block)
|
33
33
|
end
|
34
34
|
|
data/lib/chicago/etl.rb
CHANGED
@@ -10,7 +10,6 @@ require 'chicago/flow/errors'
|
|
10
10
|
require 'chicago/flow/transformation'
|
11
11
|
require 'chicago/flow/filter'
|
12
12
|
require 'chicago/flow/transformation_chain'
|
13
|
-
require 'chicago/flow/pipeline_stage'
|
14
13
|
require 'chicago/flow/pipeline_endpoint'
|
15
14
|
require 'chicago/flow/array_source'
|
16
15
|
require 'chicago/flow/dataset_source'
|
@@ -25,12 +24,12 @@ require 'chicago/etl/key_builder'
|
|
25
24
|
require 'chicago/etl/schema_table_sink_factory'
|
26
25
|
require 'chicago/etl/transformations'
|
27
26
|
require 'chicago/etl/load_dataset_builder'
|
28
|
-
require 'chicago/etl/
|
29
|
-
require 'chicago/etl/load_pipeline_stage_builder'
|
30
|
-
require 'chicago/etl/pipeline'
|
31
|
-
|
27
|
+
require 'chicago/etl/dataset_builder'
|
32
28
|
require 'chicago/etl/stage'
|
33
29
|
require 'chicago/etl/stage_builder'
|
30
|
+
require 'chicago/etl/dataset_batch_stage'
|
31
|
+
require 'chicago/etl/schema_sinks_and_transformations_builder'
|
32
|
+
require 'chicago/etl/pipeline'
|
34
33
|
|
35
34
|
# Sequel Extensions
|
36
35
|
require 'chicago/etl/sequel/filter_to_etl_batch'
|
@@ -0,0 +1,35 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe "creating and running a dimension stage" do
|
4
|
+
let(:rows) { [{:some_field => "value"}] }
|
5
|
+
let(:db) { double(:db).as_null_object }
|
6
|
+
let(:schema) {
|
7
|
+
schema = Chicago::StarSchema.new
|
8
|
+
|
9
|
+
schema.define_dimension(:test) do
|
10
|
+
columns do
|
11
|
+
string :foo
|
12
|
+
end
|
13
|
+
end
|
14
|
+
|
15
|
+
schema
|
16
|
+
}
|
17
|
+
|
18
|
+
let(:pipeline) { Chicago::ETL::Pipeline.new(db, schema)}
|
19
|
+
|
20
|
+
it "glues the source, transformations, and sink correctly" do
|
21
|
+
pipeline.define_dimension_load(:test) do
|
22
|
+
dataset do
|
23
|
+
db.test_dataset_method
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
pipeline.stages.each do |stage|
|
28
|
+
stage.execute(double, true)
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
32
|
+
it "should set the inserted at time on the dimension"
|
33
|
+
|
34
|
+
it "truncates the dimension if specified"
|
35
|
+
end
|
@@ -39,26 +39,6 @@ describe "defining and executing a stage" do
|
|
39
39
|
stage.sink(:another_stream).data.length.should == 0
|
40
40
|
end
|
41
41
|
|
42
|
-
it "requires sinks" do
|
43
|
-
expect {
|
44
|
-
pipeline.define_stage(:test_stage) do
|
45
|
-
source do
|
46
|
-
db.test_dataset_method
|
47
|
-
end
|
48
|
-
end
|
49
|
-
}.to raise_error(ArgumentError)
|
50
|
-
end
|
51
|
-
|
52
|
-
it "requires sources" do
|
53
|
-
expect {
|
54
|
-
pipeline.define_stage(:test_stage) do
|
55
|
-
sinks do
|
56
|
-
add Chicago::Flow::ArraySink.new(:test)
|
57
|
-
end
|
58
|
-
end
|
59
|
-
}.to raise_error(ArgumentError)
|
60
|
-
end
|
61
|
-
|
62
42
|
it "glues the source, transformations, and sink correctly" do
|
63
43
|
pipeline.define_stage(:test_stage) do
|
64
44
|
source do
|
@@ -90,8 +70,8 @@ describe "defining and executing a stage" do
|
|
90
70
|
it "allows the source to be filtered via a filter strategy" do
|
91
71
|
etl_batch_double = double
|
92
72
|
fake_source = []
|
73
|
+
fake_source.should_receive(:another_dataset_method).and_return([])
|
93
74
|
|
94
|
-
fake_source.should_receive(:another_dataset_method).and_return([])
|
95
75
|
pipeline.define_stage(:test_stage) do
|
96
76
|
source do
|
97
77
|
fake_source
|
@@ -1,6 +1,6 @@
|
|
1
1
|
require 'spec_helper'
|
2
2
|
|
3
|
-
describe Chicago::ETL::LoadPipelineStageBuilder do
|
3
|
+
describe Chicago::ETL::SchemaSinksAndTransformationsBuilder do
|
4
4
|
let(:dimension) { stub(:dimension).as_null_object }
|
5
5
|
let(:db) { stub(:db).as_null_object }
|
6
6
|
let(:sink_factory) { stub(:sink_factory).as_null_object }
|
@@ -34,6 +34,6 @@ describe Chicago::ETL::LoadPipelineStageBuilder do
|
|
34
34
|
key_mapping :bar, :original_id
|
35
35
|
end
|
36
36
|
|
37
|
-
stage
|
37
|
+
stage[:sinks][:bar].should_not be_nil
|
38
38
|
end
|
39
39
|
end
|
@@ -0,0 +1,40 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe Chicago::ETL::Stage do
|
4
|
+
it "requires a source" do
|
5
|
+
expect {
|
6
|
+
described_class.new(:test,
|
7
|
+
:source => nil,
|
8
|
+
:sinks => {:default => stub(:sink)})
|
9
|
+
}.to raise_error(ArgumentError)
|
10
|
+
end
|
11
|
+
|
12
|
+
it "requires sinks" do
|
13
|
+
expect {
|
14
|
+
described_class.new(:test,
|
15
|
+
:source => stub(:source),
|
16
|
+
:sinks => nil)
|
17
|
+
}.to raise_error(ArgumentError)
|
18
|
+
end
|
19
|
+
|
20
|
+
it "does not filter the dataset if re-extracting" do
|
21
|
+
stage = described_class.new(:test,
|
22
|
+
:source => stub(:source),
|
23
|
+
:sinks => {:default => stub(:sink)},
|
24
|
+
:filter_strategy => lambda { fail })
|
25
|
+
|
26
|
+
stage.filtered_source(stub(:etl_batch), true)
|
27
|
+
end
|
28
|
+
|
29
|
+
it "opens sinks before writing and closes them afterwards" do
|
30
|
+
sink = mock(:sink)
|
31
|
+
sink.should_receive(:open)
|
32
|
+
sink.should_receive(:close)
|
33
|
+
|
34
|
+
stage = described_class.new(:test,
|
35
|
+
:source => [],
|
36
|
+
:sinks => {:default => sink})
|
37
|
+
|
38
|
+
stage.execute(stub(:etl_batch), true)
|
39
|
+
end
|
40
|
+
end
|
@@ -44,29 +44,33 @@ describe "Mysql -> Mysql through transformation chain" do
|
|
44
44
|
|
45
45
|
it "copies data from source to destination" do
|
46
46
|
TEST_DB[:source].multi_insert([{:foo => nil, :bin => :unhex.sql_function("1F")},
|
47
|
-
|
48
|
-
|
47
|
+
{:foo => "Hello", :bin => :unhex.sql_function("1F")}])
|
48
|
+
|
49
49
|
source = Chicago::Flow::DatasetSource.
|
50
50
|
new(TEST_DB[:source].
|
51
51
|
select(:id, :foo, :hex.sql_function(:bin).as(:bin)))
|
52
|
+
|
53
|
+
transformations = [dup_row.new(:onto => :other)]
|
54
|
+
|
52
55
|
sink_1 = Chicago::Flow::MysqlFileSink.
|
53
56
|
new(TEST_DB, :destination, [:id, :foo, :bin])
|
54
57
|
sink_2 = Chicago::Flow::ArraySink.new([:id, :foo, :bin])
|
55
58
|
|
56
|
-
stage = Chicago::
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
59
|
+
stage = Chicago::ETL::Stage.new(:test,
|
60
|
+
:source => source,
|
61
|
+
:transformations => transformations,
|
62
|
+
:sinks => {
|
63
|
+
:default => sink_1,
|
64
|
+
:other => sink_2
|
65
|
+
})
|
63
66
|
|
64
|
-
stage.execute(source)
|
67
|
+
stage.execute(stub(:etl_batch), true)
|
65
68
|
|
66
69
|
expected = [{:id => 1, :foo => nil, :bin => "1F"},
|
67
70
|
{:id => 2, :foo => "Hello", :bin => "1F"}]
|
68
71
|
|
69
72
|
sink_2.data.should == expected
|
70
|
-
TEST_DB[:destination].select(:id, :foo, :hex.sql_function(:bin).as(:bin)).
|
73
|
+
TEST_DB[:destination].select(:id, :foo, :hex.sql_function(:bin).as(:bin)).
|
74
|
+
all.should == expected
|
71
75
|
end
|
72
76
|
end
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: chicago-etl
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 19
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 1
|
9
|
-
- 3
|
10
|
-
version: 0.1.3
|
9
|
+
- 4
|
10
|
+
version: 0.1.4
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Roland Swingler
|
@@ -243,10 +243,11 @@ files:
|
|
243
243
|
- lib/chicago/etl/core_extensions.rb
|
244
244
|
- lib/chicago/etl/counter.rb
|
245
245
|
- lib/chicago/etl/dataset_batch_stage.rb
|
246
|
+
- lib/chicago/etl/dataset_builder.rb
|
246
247
|
- lib/chicago/etl/key_builder.rb
|
247
248
|
- lib/chicago/etl/load_dataset_builder.rb
|
248
|
-
- lib/chicago/etl/load_pipeline_stage_builder.rb
|
249
249
|
- lib/chicago/etl/pipeline.rb
|
250
|
+
- lib/chicago/etl/schema_sinks_and_transformations_builder.rb
|
250
251
|
- lib/chicago/etl/schema_table_sink_factory.rb
|
251
252
|
- lib/chicago/etl/screens/column_screen.rb
|
252
253
|
- lib/chicago/etl/screens/invalid_element.rb
|
@@ -273,7 +274,6 @@ files:
|
|
273
274
|
- lib/chicago/flow/mysql_file_sink.rb
|
274
275
|
- lib/chicago/flow/null_sink.rb
|
275
276
|
- lib/chicago/flow/pipeline_endpoint.rb
|
276
|
-
- lib/chicago/flow/pipeline_stage.rb
|
277
277
|
- lib/chicago/flow/sink.rb
|
278
278
|
- lib/chicago/flow/transformation.rb
|
279
279
|
- lib/chicago/flow/transformation_chain.rb
|
@@ -281,7 +281,7 @@ files:
|
|
281
281
|
- spec/etl/batch_spec.rb
|
282
282
|
- spec/etl/core_extensions_spec.rb
|
283
283
|
- spec/etl/counter_spec.rb
|
284
|
-
- spec/etl/dataset_batch_stage_spec.rb
|
284
|
+
- spec/etl/define_dimension_stage_spec.rb
|
285
285
|
- spec/etl/define_stage_spec.rb
|
286
286
|
- spec/etl/etl_batch_id_dataset_filter.rb
|
287
287
|
- spec/etl/key_builder_spec.rb
|
@@ -293,6 +293,7 @@ files:
|
|
293
293
|
- spec/etl/screens/out_of_bounds_spec.rb
|
294
294
|
- spec/etl/sequel/dependant_tables_spec.rb
|
295
295
|
- spec/etl/sequel/filter_to_etl_batch_spec.rb
|
296
|
+
- spec/etl/stage_spec.rb
|
296
297
|
- spec/etl/table_builder_spec.rb
|
297
298
|
- spec/etl/task_spec.rb
|
298
299
|
- spec/etl/transformations/deduplicate_rows_spec.rb
|
@@ -306,7 +307,6 @@ files:
|
|
306
307
|
- spec/flow/mysql_file_serializer_spec.rb
|
307
308
|
- spec/flow/mysql_file_sink_spec.rb
|
308
309
|
- spec/flow/mysql_integration_spec.rb
|
309
|
-
- spec/flow/pipeline_stage_spec.rb
|
310
310
|
- spec/flow/transformation_chain_spec.rb
|
311
311
|
- spec/flow/transformation_spec.rb
|
312
312
|
- spec/spec_helper.rb
|
@@ -1,68 +0,0 @@
|
|
1
|
-
module Chicago
|
2
|
-
module Flow
|
3
|
-
# Co-ordinates iterating over rows provided by a source, passing
|
4
|
-
# them through a transformation chain before writing them to
|
5
|
-
# sink(s).
|
6
|
-
#
|
7
|
-
# @api public
|
8
|
-
class PipelineStage
|
9
|
-
attr_reader :transformation_chain
|
10
|
-
|
11
|
-
def initialize(options={})
|
12
|
-
@sinks = options[:sinks] || {}
|
13
|
-
@transformations = options[:transformations] || []
|
14
|
-
@error_handler = options[:error_handler] || RaisingErrorHandler.new
|
15
|
-
@transformation_chain = TransformationChain.new(*@transformations)
|
16
|
-
end
|
17
|
-
|
18
|
-
# Returns the named sink, if it exists
|
19
|
-
def sink(name)
|
20
|
-
@sinks[name.to_sym]
|
21
|
-
end
|
22
|
-
|
23
|
-
def sinks
|
24
|
-
@sinks.values
|
25
|
-
end
|
26
|
-
|
27
|
-
def register_sink(name, sink)
|
28
|
-
@sinks[name.to_sym] = sink
|
29
|
-
self
|
30
|
-
end
|
31
|
-
|
32
|
-
def validate_pipeline
|
33
|
-
unless unregistered_sinks.empty?
|
34
|
-
@error_handler.unregistered_sinks(unregistered_sinks)
|
35
|
-
end
|
36
|
-
end
|
37
|
-
|
38
|
-
def execute(source)
|
39
|
-
validate_pipeline
|
40
|
-
sinks.each(&:open)
|
41
|
-
pipe_rows_to_sinks_from(source)
|
42
|
-
sinks.each(&:close)
|
43
|
-
end
|
44
|
-
|
45
|
-
def required_sinks
|
46
|
-
transformation_chain.output_streams | [:default]
|
47
|
-
end
|
48
|
-
|
49
|
-
def unregistered_sinks
|
50
|
-
required_sinks - @sinks.keys
|
51
|
-
end
|
52
|
-
|
53
|
-
private
|
54
|
-
|
55
|
-
def pipe_rows_to_sinks_from(source)
|
56
|
-
source.each do |row|
|
57
|
-
transformation_chain.process(row).each {|row| process_row(row) }
|
58
|
-
end
|
59
|
-
transformation_chain.flush.each {|row| process_row(row) }
|
60
|
-
end
|
61
|
-
|
62
|
-
def process_row(row)
|
63
|
-
stream = row.delete(:_stream) || :default
|
64
|
-
@sinks[stream] << row
|
65
|
-
end
|
66
|
-
end
|
67
|
-
end
|
68
|
-
end
|
@@ -1,55 +0,0 @@
|
|
1
|
-
require 'spec_helper'
|
2
|
-
|
3
|
-
describe Chicago::ETL::DatasetBatchStage do
|
4
|
-
let(:pipeline_stage) { mock(:pipeline_stage).as_null_object }
|
5
|
-
let(:dataset) { mock(:dataset).as_null_object }
|
6
|
-
let(:stage) { described_class.new(:foo, dataset, pipeline_stage) }
|
7
|
-
let(:etl_batch) { stub(:etl_batch) }
|
8
|
-
|
9
|
-
it "has a name" do
|
10
|
-
stage.name.should == :foo
|
11
|
-
end
|
12
|
-
|
13
|
-
it "should set the inserted at time on the default sink" do
|
14
|
-
sink = Chicago::Flow::ArraySink.new(:foo)
|
15
|
-
pipeline_stage.stub(:sink).with(:default).and_return(sink)
|
16
|
-
stage.pipeline_stage.should == pipeline_stage
|
17
|
-
|
18
|
-
sink.constant_values[:_inserted_at].should_not be_nil
|
19
|
-
end
|
20
|
-
|
21
|
-
it "filters the dataset to the batch" do
|
22
|
-
dataset.should_recieve(:filter_to_etl_batch).with(etl_batch)
|
23
|
-
stage.source(etl_batch)
|
24
|
-
end
|
25
|
-
|
26
|
-
it "does not filter the dataset if re-extracting" do
|
27
|
-
dataset.should_not_recieve(:filter_to_etl_batch)
|
28
|
-
stage.source(etl_batch, true)
|
29
|
-
end
|
30
|
-
|
31
|
-
it "can filter via a custom strategy" do
|
32
|
-
dataset.should_not_recieve(:filter_to_etl_batch)
|
33
|
-
|
34
|
-
filter_strategy = lambda {|ds, batch| ds }
|
35
|
-
described_class.new(:foo, dataset, pipeline_stage, :filter_strategy => filter_strategy).
|
36
|
-
source(etl_batch)
|
37
|
-
end
|
38
|
-
|
39
|
-
it "executes the pipeline stage using a DatasetSource" do
|
40
|
-
pipeline_stage.should_receive(:execute).
|
41
|
-
with(kind_of(Chicago::Flow::DatasetSource))
|
42
|
-
stage.execute(etl_batch, true)
|
43
|
-
end
|
44
|
-
|
45
|
-
it "truncates any sinks if truncate_pre_load has been set" do
|
46
|
-
stage = described_class.new(:foo, dataset, pipeline_stage,
|
47
|
-
:truncate_pre_load => true)
|
48
|
-
|
49
|
-
sink = Chicago::Flow::ArraySink.new(:output)
|
50
|
-
sink << {:foo => "foo"}
|
51
|
-
pipeline_stage.stub(:sinks).and_return([sink])
|
52
|
-
stage.execute(etl_batch)
|
53
|
-
sink.data.should == []
|
54
|
-
end
|
55
|
-
end
|
@@ -1,89 +0,0 @@
|
|
1
|
-
require 'spec_helper'
|
2
|
-
|
3
|
-
describe Chicago::Flow::PipelineStage do
|
4
|
-
let(:transform) {
|
5
|
-
Class.new(Chicago::Flow::Transformation) {
|
6
|
-
def process_row(row)
|
7
|
-
row[:a] += 1
|
8
|
-
row
|
9
|
-
end
|
10
|
-
}
|
11
|
-
}
|
12
|
-
|
13
|
-
let(:add_error) {
|
14
|
-
Class.new(Chicago::Flow::Transformation) {
|
15
|
-
# add_output_stream :error
|
16
|
-
def output_streams
|
17
|
-
[:default, :error]
|
18
|
-
end
|
19
|
-
|
20
|
-
def process_row(row)
|
21
|
-
[row, {Chicago::Flow::STREAM => :error, :message => "error"}]
|
22
|
-
end
|
23
|
-
}
|
24
|
-
}
|
25
|
-
|
26
|
-
let(:sink) { Chicago::Flow::ArraySink.new(:test) }
|
27
|
-
let(:source) { Chicago::Flow::ArraySource.new([{:a => 1}]) }
|
28
|
-
|
29
|
-
it "returns all sinks" do
|
30
|
-
stage = described_class.new.register_sink(:default, sink)
|
31
|
-
stage.sinks.should == [sink]
|
32
|
-
end
|
33
|
-
|
34
|
-
it "returns a sink by name" do
|
35
|
-
stage = described_class.new.register_sink(:default, sink)
|
36
|
-
stage.sink(:default).should == sink
|
37
|
-
end
|
38
|
-
|
39
|
-
it "reads from source to sink" do
|
40
|
-
pipeline = described_class.new.register_sink(:default, sink)
|
41
|
-
pipeline.execute(source)
|
42
|
-
sink.data.should == [{:a => 1}]
|
43
|
-
end
|
44
|
-
|
45
|
-
it "passes rows through transforms" do
|
46
|
-
pipeline = described_class.new(:transformations => [transform.new]).
|
47
|
-
register_sink(:default, sink)
|
48
|
-
|
49
|
-
pipeline.execute(source)
|
50
|
-
sink.data.should == [{:a => 2}]
|
51
|
-
end
|
52
|
-
|
53
|
-
it "writes rows to the appropriate sink for their stream, and strips the stream tag" do
|
54
|
-
error_sink = Chicago::Flow::ArraySink.new(:test)
|
55
|
-
|
56
|
-
pipeline = described_class.new(:transformations => [add_error.new]).
|
57
|
-
register_sink(:default, sink).
|
58
|
-
register_sink(:error, error_sink)
|
59
|
-
|
60
|
-
pipeline.execute(source)
|
61
|
-
sink.data.should == [{:a => 1}]
|
62
|
-
error_sink.data.should == [{:message => "error"}]
|
63
|
-
end
|
64
|
-
|
65
|
-
it "calls an error handler if sinks are not registered" do
|
66
|
-
error_handler = mock(:error_handler)
|
67
|
-
error_handler.should_receive(:unregistered_sinks).
|
68
|
-
with([:default, :error])
|
69
|
-
|
70
|
-
pipeline = described_class.new(:transformations => [add_error.new],
|
71
|
-
:error_handler => error_handler)
|
72
|
-
|
73
|
-
pipeline.validate_pipeline
|
74
|
-
end
|
75
|
-
|
76
|
-
it "by default raises an exception if the pipeline is not valid when executed" do
|
77
|
-
pipeline = described_class.new(:transformations => [add_error.new])
|
78
|
-
expect { pipeline.execute(source) }.to raise_error(Chicago::Flow::Error)
|
79
|
-
end
|
80
|
-
|
81
|
-
it "opens sinks before writing and closes them afterwards" do
|
82
|
-
sink = mock(:sink)
|
83
|
-
pipeline = described_class.new.register_sink(:default, sink)
|
84
|
-
sink.should_receive(:open)
|
85
|
-
sink.stub(:<<)
|
86
|
-
sink.should_receive(:close)
|
87
|
-
pipeline.execute(source)
|
88
|
-
end
|
89
|
-
end
|