chicago-etl 0.0.13 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +8 -3
- data/README.rdoc +4 -1
- data/VERSION +1 -1
- data/chicago-etl.gemspec +59 -22
- data/chicago-flow.gemspec +92 -0
- data/lib/chicago/etl/batch.rb +9 -2
- data/lib/chicago/etl/core_extensions.rb +12 -0
- data/lib/chicago/etl/counter.rb +8 -1
- data/lib/chicago/etl/dataset_batch_stage.rb +52 -0
- data/lib/chicago/etl/key_builder.rb +17 -39
- data/lib/chicago/etl/load_dataset_builder.rb +3 -1
- data/lib/chicago/etl/load_pipeline_stage_builder.rb +142 -0
- data/lib/chicago/etl/pipeline.rb +151 -0
- data/lib/chicago/etl/schema_table_sink_factory.rb +74 -0
- data/lib/chicago/etl/screens/column_screen.rb +26 -25
- data/lib/chicago/etl/screens/invalid_element.rb +5 -5
- data/lib/chicago/etl/screens/missing_value.rb +4 -2
- data/lib/chicago/etl/screens/out_of_bounds.rb +2 -0
- data/lib/chicago/etl/table_builder.rb +4 -2
- data/lib/chicago/etl/task_invocation.rb +0 -1
- data/lib/chicago/etl/transformations.rb +128 -0
- data/lib/chicago/etl.rb +39 -8
- data/lib/chicago/flow/array_sink.rb +35 -0
- data/lib/chicago/flow/array_source.rb +15 -0
- data/lib/chicago/flow/dataset_source.rb +23 -0
- data/lib/chicago/flow/errors.rb +14 -0
- data/lib/chicago/flow/filter.rb +15 -0
- data/lib/chicago/flow/mysql.rb +4 -0
- data/lib/chicago/{etl/mysql_load_file_value_transformer.rb → flow/mysql_file_serializer.rb} +7 -4
- data/lib/chicago/flow/mysql_file_sink.rb +68 -0
- data/lib/chicago/flow/null_sink.rb +8 -0
- data/lib/chicago/flow/pipeline_endpoint.rb +15 -0
- data/lib/chicago/flow/pipeline_stage.rb +68 -0
- data/lib/chicago/flow/sink.rb +53 -0
- data/lib/chicago/flow/transformation.rb +169 -0
- data/lib/chicago/flow/transformation_chain.rb +40 -0
- data/spec/etl/batch_spec.rb +2 -1
- data/spec/etl/core_extensions_spec.rb +13 -0
- data/spec/etl/dataset_batch_stage_spec.rb +55 -0
- data/spec/etl/key_builder_spec.rb +25 -83
- data/spec/etl/pipeline_stage_builder_spec.rb +39 -0
- data/spec/etl/schema_table_sink_factory_spec.rb +69 -0
- data/spec/etl/screens/invalid_element_spec.rb +10 -11
- data/spec/etl/screens/missing_value_spec.rb +21 -21
- data/spec/etl/screens/out_of_bounds_spec.rb +21 -29
- data/spec/etl/transformations_spec.rb +109 -0
- data/spec/flow/array_sink_spec.rb +26 -0
- data/spec/flow/array_source_spec.rb +20 -0
- data/spec/flow/dataset_source_spec.rb +15 -0
- data/spec/flow/filter_spec.rb +13 -0
- data/spec/flow/mysql_file_serializer_spec.rb +27 -0
- data/spec/flow/mysql_file_sink_spec.rb +94 -0
- data/spec/flow/mysql_integration_spec.rb +72 -0
- data/spec/flow/pipeline_stage_spec.rb +89 -0
- data/spec/flow/transformation_chain_spec.rb +76 -0
- data/spec/flow/transformation_spec.rb +91 -0
- data/spec/spec_helper.rb +5 -0
- metadata +135 -39
- data/lib/chicago/etl/buffering_insert_writer.rb +0 -36
- data/lib/chicago/etl/mysql_dumpfile.rb +0 -32
- data/lib/chicago/etl/screens/composite_screen.rb +0 -17
- data/lib/chicago/etl/sequel/load_data_infile.rb +0 -141
- data/lib/chicago/etl/sink.rb +0 -61
- data/lib/chicago/etl/transformations/add_insert_timestamp.rb +0 -16
- data/spec/etl/mysql_dumpfile_spec.rb +0 -42
- data/spec/etl/mysql_load_file_value_transformer_spec.rb +0 -27
- data/spec/etl/screens/composite_screen_spec.rb +0 -25
- data/spec/etl/sequel/load_data_infile_expression_spec.rb +0 -60
- data/spec/etl/sequel/load_data_infile_spec.rb +0 -37
- data/spec/etl/sink_spec.rb +0 -7
- data/spec/etl/transformations/add_insert_timestamp_spec.rb +0 -9
```diff
--- /dev/null
+++ b/data/lib/chicago/etl/load_pipeline_stage_builder.rb
@@ -0,0 +1,142 @@
+module Chicago
+  module ETL
+    # Provides DSL methods for specifying the pipeline in an ETL
+    # stage.
+    #
+    # Clients will not normally instantiate this themselves but use it
+    # in the context of defining an ETL stage.
+    class LoadPipelineStageBuilder
+      # @api private
+      KeyMapping = Struct.new(:table, :field)
+
+      # The ordering of inbuilt transformation and screening steps.
+      TRANSFORMATION_ORDER = [:before_screens,
+                              :screens,
+                              :after_screens,
+                              :before_keys,
+                              :keys,
+                              :after_keys,
+                              :before_final,
+                              :final,
+                              :after_final
+                             ].freeze
+
+      # @api private
+      def initialize(db, schema_table)
+        @db = db
+        @schema_table = schema_table
+        @sink_factory = SchemaTableSinkFactory.new(@db, @schema_table)
+      end
+
+      # @api private
+      def build(&block)
+        @load_separately = []
+        @key_mappings = []
+        @transformations = {}
+        TRANSFORMATION_ORDER.each {|k| @transformations[k] = [] }
+        @ignore_present_rows = false
+
+        instance_eval &block
+
+        add_screens
+        add_key_transforms
+        add_final_transforms
+        pipeline_stage = create_pipeline_stage
+        register_additional_sinks(pipeline_stage)
+        pipeline_stage
+      end
+
+      protected
+
+      # Ignore rows already present in the target table, rather than
+      # replacing them.
+      def ignore_present_rows
+        @ignore_present_rows = true
+      end
+
+      # Specify columns that won't be loaded or screened as part of
+      # this pipeline stage
+      def load_separately(*columns)
+        @load_separately += columns
+      end
+
+      # Add an additional key mapping.
+      def key_mapping(table, field)
+        @key_mappings << KeyMapping.new(table, field)
+      end
+
+      # Add a transformation before the specified point in the
+      # transformation chain (defined in TRANSFORMATION_ORDER)
+      def before(point_in_transformation_chain, transform)
+        key = "before_#{point_in_transformation_chain}".to_sym
+        @transformations[key] << transform
+      end
+
+      # Add a transformation after the specified point in the
+      # transformation chain (defined in TRANSFORMATION_ORDER)
+      def after(point_in_transformation_chain, transform)
+        key = "after_#{point_in_transformation_chain}".to_sym
+        @transformations[key] << transform
+      end
+
+      private
+
+      def create_pipeline_stage
+        default = @sink_factory.sink(:ignore => @ignore_present_rows,
+                                     :exclude => @load_separately)
+        key_sink = if @schema_table.kind_of?(Chicago::Schema::Dimension)
+                     @sink_factory.key_sink
+                   else
+                     # Facts have no key table to write to.
+                     Flow::NullSink.new
+                   end
+
+        Flow::PipelineStage.
+          new(:transformations => concat_transformations,
+              :sinks => {
+                :default => default,
+                :dimension_key => key_sink,
+                :error => @sink_factory.error_sink
+              })
+      end
+
+      def concat_transformations
+        TRANSFORMATION_ORDER.map {|k| @transformations[k] }.flatten
+      end
+
+      def register_additional_sinks(pipeline_stage)
+        @key_mappings.each do |mapping|
+          sink = @sink_factory.key_sink(:table => mapping.table)
+          pipeline_stage.register_sink(mapping.table, sink)
+        end
+      end
+
+      def add_screens
+        columns_to_screen = @schema_table.columns.reject do |column|
+          @load_separately.include?(column.name)
+        end
+
+        @transformations[:screens] = [Screens::MissingValue,
+                                      Screens::InvalidElement,
+                                      Screens::OutOfBounds].map do |klass|
+          klass.for_columns(columns_to_screen)
+        end.flatten
+      end
+
+      def add_key_transforms
+        @transformations[:keys] << Transformations::AddKey.
+          new(:key_builder => KeyBuilder.for_table(@schema_table, @db))
+
+        @key_mappings.each do |mapping|
+          @transformations[:keys] << Transformations::DimensionKeyMapping.
+            new(:original_key => mapping.field, :key_table => mapping.table)
+        end
+      end
+
+      def add_final_transforms
+        @transformations[:final] << Transformations::WrittenRowFilter.new(:key => :id)
+        @transformations[:final] << Transformations::DemultiplexErrors.new
+      end
+    end
+  end
+end
```
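To make the DSL above concrete, here is a sketch of a stage definition as `LoadPipelineStageBuilder#build` would `instance_eval` it. The names `db`, `customers_dimension`, and `UpcaseEmail` are placeholders, not part of this release; the DSL methods (`ignore_present_rows`, `load_separately`, `key_mapping`, `before`) are the ones added in the hunk above.

```ruby
stage = Chicago::ETL::LoadPipelineStageBuilder.new(db, customers_dimension).build do
  ignore_present_rows                    # keep rows already in the table, don't replace
  load_separately :bio                   # don't screen or bulk-load this column here
  key_mapping :keys_customers, :crm_id   # maintain an extra original-key table
  before :keys, UpcaseEmail.new          # custom step: after screens, before key assignment
end
```

The `:before_*`/`:after_*` hooks slot custom transformations around the inbuilt steps in `TRANSFORMATION_ORDER`, so the resulting `Flow::PipelineStage` always runs screens, then key assignment, then the final row filter and error demultiplexing.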
```diff
--- /dev/null
+++ b/data/lib/chicago/etl/pipeline.rb
@@ -0,0 +1,151 @@
+module Chicago
+  module ETL
+    # An ETL pipeline.
+    class Pipeline
+      # Returns all defined dimension load tasks
+      attr_reader :load_dimensions
+
+      # Returns all defined fact load tasks
+      attr_reader :load_facts
+
+      # Creates a pipeline for a Schema.
+      def initialize(db, schema)
+        @schema, @db = schema, db
+        @load_dimensions = Chicago::Schema::NamedElementCollection.new
+        @load_facts = Chicago::Schema::NamedElementCollection.new
+      end
+
+      # Defines a dimension load stage
+      def define_dimension_load(name, options={}, &block)
+        dimension_name = options[:dimension] || name
+        @load_dimensions << build_stage(name,
+                                        @schema.dimension(dimension_name),
+                                        &block)
+      end
+
+      # Defines a fact load stage
+      def define_fact_load(name, options={}, &block)
+        fact_name = options[:fact] || name
+        @load_facts << build_stage(name, @schema.fact(fact_name), &block)
+      end
+
+      # Builds a stage, but does not define it.
+      def build_stage(name, schema_table, &block)
+        DatasetBatchStageBuilder.new(@db, schema_table).build(name, &block)
+      end
+    end
+
+    # Provides DSL methods for building a DataSetBatchStage.
+    #
+    # Clients shouldn't need to instantiate this directly, but instead
+    # call the protected methods in the context of defining a Pipeline
+    class DatasetBatchStageBuilder
+      # @api private
+      def initialize(db, schema_table)
+        @db, @schema_table = db, schema_table
+      end
+
+      # @api private
+      def build(name, &block)
+        instance_eval &block
+        unless defined? @pipeline_stage
+          pipeline do
+          end
+        end
+        DatasetBatchStage.new(name, @dataset, @pipeline_stage,
+                              :filter_strategy => @filter_strategy,
+                              :truncate_pre_load => @truncate_pre_load)
+      end
+
+      protected
+
+      # Specifies that the sinks should be truncated before loading
+      # data.
+      def truncate_pre_load
+        @truncate_pre_load = true
+      end
+
+      # Specifies that the dataset should never be filtered to the ETL
+      # batch - i.e. it should behave as if reextract was always true
+      def full_reload
+        @filter_strategy = lambda {|dataset, etl_batch| dataset }
+      end
+
+      # Define elements of the pipeline. See LoadPipelineStageBuilder
+      # for details.
+      def pipeline(&block)
+        @pipeline_stage = LoadPipelineStageBuilder.new(@db, @schema_table).
+          build(&block)
+      end
+
+      # Defines the dataset, see DatasetBuilder .
+      #
+      # The block must return a Sequel::Dataset.
+      def dataset(&block)
+        @dataset = DatasetBuilder.new(@db).build(&block)
+      end
+
+      # Define a custom filter strategy for filtering to an ETL batch.
+      def filter_strategy(&block)
+        @filter_strategy = block
+      end
+    end
+
+    # Provides convenience methods for defining source datasets.
+    class DatasetBuilder
+      attr_reader :db
+
+      # @api private
+      def initialize(db)
+        @db = db
+      end
+
+      # @api private
+      def build(&block)
+        instance_eval(&block)
+      end
+
+      protected
+
+      def key_field(field, name)
+        :if[{field => nil}, 1, field].as(name)
+      end
+
+      # Returns a column for use in a Sequel::Dataset#select method to
+      # return a dimension key.
+      #
+      # Takes care of using the key tables correctly, and dealing with
+      # missing dimension values.
+      def dimension_key(name)
+        key_field("keys_dimension_#{name}__dimension_id".to_sym,
+                  "#{name}_dimension_id".to_sym)
+      end
+
+      # Returns a column for use in a Sequel::Dataset#select method to
+      # return a date dimension key.
+      def date_dimension_column(dimension)
+        :if.sql_function({:id.qualify(dimension) => nil},
+                         1,
+                         :id.qualify(dimension)).
+          as("#{dimension}_dimension_id".to_sym)
+      end
+
+      # Rounds a monetary value to 2 decimal places.
+      #
+      # By default, natural rounding is used, you can specify either
+      # :up or :down as the direction.
+      #
+      # @deprecated
+      def round(stmt, direction = :none)
+        case direction
+        when :none
+          :round.sql_function(stmt, 2)
+        when :up
+          :ceil.sql_function(stmt * 100) / 100
+        when :down
+          :floor.sql_function(stmt * 100) / 100
+        end
+      end
+    end
+  end
+end
```
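A sketch of how the three builders combine when defining a load. It assumes `db` is a Sequel database, `schema` is a Chicago schema with a `customers` dimension, and `src_customers` is a source table; all of those names are placeholders.

```ruby
pipeline = Chicago::ETL::Pipeline.new(db, schema)

pipeline.define_dimension_load(:customers) do
  full_reload  # never restrict the source dataset to the current ETL batch

  dataset do
    # Evaluated by DatasetBuilder; must return a Sequel::Dataset.
    db[:src_customers].select(:id___original_id, :name, :email)
  end

  pipeline do
    # Evaluated by LoadPipelineStageBuilder (see previous file).
    ignore_present_rows
  end
end
```

If the `pipeline` block is omitted, `DatasetBatchStageBuilder#build` supplies an empty one, so a stage always gets the inbuilt screens, key assignment, and error handling.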
```diff
--- /dev/null
+++ b/data/lib/chicago/etl/schema_table_sink_factory.rb
@@ -0,0 +1,74 @@
+module Chicago
+  module ETL
+    # Builds Sinks for Dimension & Fact tables.
+    class SchemaTableSinkFactory
+      # Creates a new factory.
+      def initialize(db, schema_table)
+        @db, @schema_table = db, schema_table
+      end
+
+      # Returns a sink to load data into the MySQL table backing the
+      # schema table.
+      #
+      # Pass an :exclude option if you don't want all columns of the
+      # schema table to be loaded via this sink.
+      def sink(options={})
+        Flow::MysqlFileSink.new(@db,
+                                @schema_table.table_name,
+                                load_columns(options[:exclude]),
+                                mysql_options(options))
+      end
+
+      # Returns a sink to load data into the MySQL table backing the
+      # key table for a Dimension.
+      #
+      # @option options [Symbol] :table - a custom key table name. The
+      # schema table's key table name will be used otherwise.
+      def key_sink(options={})
+        table = options.delete(:table) || @schema_table.key_table_name
+        sink = Flow::MysqlFileSink.new(@db,
+                                       table,
+                                       [:original_id, :dimension_id],
+                                       mysql_options(options))
+        sink.truncation_strategy = lambda do
+          # No Op - we want to maintain keys to avoid having to sort
+          # out fact tables.
+        end
+        sink
+      end
+
+      # Returns a sink to load errors generated in the ETL process.
+      def error_sink(options={})
+        sink = Flow::MysqlFileSink.
+          new(@db, :etl_error_log,
+              [:column, :row_id, :error, :severity, :error_detail], mysql_options(options)).
+          set_constant_values(:table => @schema_table.table_name.to_s,
+                              :process_name => "StandardTransformations",
+                              :process_version => 3,
+                              :logged_at => Time.now)
+
+        sink.truncation_strategy = lambda do
+          @db[:etl_error_log].
+            where(:table => @schema_table.table_name.to_s).delete
+        end
+        sink
+      end
+
+      private
+
+      def load_columns(exclude=nil)
+        exclude = [exclude].compact.flatten
+        [:id] + @schema_table.columns.
+          reject {|c| exclude.include?(c.name) }.
+          map {|c| c.database_name }
+      end
+
+      def mysql_options(options)
+        [:filepath, :ignore].inject({}) do |hsh, k|
+          hsh[k] = options[k] if options.has_key?(k)
+          hsh
+        end
+      end
+    end
+  end
+end
```
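The three sinks a load stage wires together can also be built directly. In this sketch, `db` stands in for a Sequel database and `product_dimension` for a schema dimension; both are placeholders.

```ruby
factory = Chicago::ETL::SchemaTableSinkFactory.new(db, product_dimension)

main_sink  = factory.sink(:ignore => true,             # keep rows already present
                          :exclude => [:description])  # column loaded elsewhere
key_sink   = factory.key_sink(:table => :keys_products)
error_sink = factory.error_sink                        # writes to etl_error_log
```

Note the asymmetry in truncation strategies: the key sink deliberately never truncates (so fact tables keep valid dimension keys), while the error sink deletes only its own table's rows from the shared `etl_error_log`.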
(In the modified-file hunks below, removed lines whose content was not captured in this extract are shown as […].)

```diff
--- a/data/lib/chicago/etl/screens/column_screen.rb
+++ b/data/lib/chicago/etl/screens/column_screen.rb
@@ -1,53 +1,54 @@
 module Chicago
   module ETL
     module Screens
-      […6 lines not captured…]
-          @error_name = self.class.name.split('::').last.sub(/Screen$/,'').titlecase
+      # @abstract
+      class ColumnScreen < Flow::Transformation
+        def self.for_columns(columns)
+          columns.map {|column|
+            new(:default, :column => column)
+          }
         end
 
-        def […]
-          […]
-          CompositeScreen.new(screens)
+        def output_streams
+          [:default, :error]
         end
 
-        def […]
-          […2 lines not captured…]
-          if applies?(value)
+        def process_row(row)
+          if applies?(row[column.database_name])
             overwrite_value(row)
-            […]
+            error_row = error(row[column.database_name])
+            if error_row
+              row[:_errors] ||= []
+              row[:_errors] << error_row
+            end
           end
 
-          […]
+          row
         end
 
         def severity
           1
         end
 
+        def column
+          @options[:column]
+        end
+
         private
 
-        def […]
-          […]
+        def error_name
+          self.class.name.split('::').last.sub(/Screen$/,'').titlecase
         end
 
-        def […]
-          […]
+        def overwrite_value(row)
+          row[column.database_name] = column.default_value
         end
 
-        def […]
+        def error(value)
           {
-            :process_name => "StandardTransformations",
-            :process_version => 2,
-            :table => table_name.to_s,
             :column => column.database_name.to_s,
             :severity => severity,
-            :error => […]
+            :error => error_name
           }
         end
 
```
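The change above turns screens from standalone objects (previously composed via `CompositeScreen`) into ordinary `Flow::Transformation`s that annotate rows in place. A subclass supplies `applies?` and may override `severity` or `error`; a minimal custom screen under that contract might look like this (`NegativePrice` and `monetary_columns` are illustrative, not part of the gem):

```ruby
# Flags negative values and overwrites them with the column default,
# following ColumnScreen's process_row contract.
class NegativePrice < Chicago::ETL::Screens::ColumnScreen
  def applies?(value)
    value.to_f < 0
  end

  def severity
    2
  end
end

# One screen per column, ready for a stage's :screens slot:
screens = NegativePrice.for_columns(monetary_columns)
```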
```diff
--- a/data/lib/chicago/etl/screens/invalid_element.rb
+++ b/data/lib/chicago/etl/screens/invalid_element.rb
@@ -1,11 +1,11 @@
 module Chicago
   module ETL
     module Screens
+      # Transformation which checks to see if a field's value is in a
+      # column's elements.
       class InvalidElement < ColumnScreen
-        def self.for_columns([…]
-          […]
-          […] map {|column| new(table_name, column) }
-          CompositeScreen.new(screens)
+        def self.for_columns(columns)
+          columns.select(&:elements).map {|column| new(:default, :column => column) }
         end
 
         def severity
@@ -17,7 +17,7 @@ module Chicago
           !column.elements.map(&:downcase).include?(value.to_s.downcase)
         end
 
-        def […]
+        def error(value)
           super(value).
             merge(:error_detail => "'#{value}' is not a valid value.")
         end
```
```diff
--- a/data/lib/chicago/etl/screens/missing_value.rb
+++ b/data/lib/chicago/etl/screens/missing_value.rb
@@ -1,14 +1,16 @@
 module Chicago
   module ETL
     module Screens
+      # Screen which checks to see if a field is present in the row if
+      # required.
       class MissingValue < ColumnScreen
         def severity
           column.descriptive? ? 1 : 2
         end
 
-        def […]
+        def error(value)
           if ! (column.column_type == :boolean || column.optional?)
-            […]
+            super(value)
           end
         end
 
```
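Put together with `ColumnScreen#process_row`, a failing row comes out annotated rather than raising. This sketch assumes `email_column` is a required, non-descriptive column whose `default_value` is an empty string, that `MissingValue#applies?` (defined outside this extract) fires on nil, and that `error_name` titlecases to "Missing Value":

```ruby
screen = Chicago::ETL::Screens::MissingValue.new(:default, :column => email_column)

screen.process_row(:email => nil)
# => {:email => "",
#     :_errors => [{:column => "email", :severity => 2, :error => "Missing Value"}]}
```

The embedded `:_errors` rows are later stripped out and routed to the error sink by `DemultiplexErrors` (see `transformations.rb` below).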
```diff
--- a/data/lib/chicago/etl/table_builder.rb
+++ b/data/lib/chicago/etl/table_builder.rb
@@ -8,11 +8,13 @@ module Chicago
         new(db).build
       end
 
-      […]
+      # @api private
+      def initialize(db)
         @db = db
       end
 
-      […]
+      # @api private
+      def build
         create_table :etl_batches do
           primary_key :id, :type => :integer, :unsigned => true
           timestamp :started_at, :null => false, :default => :current_timestamp.sql_function
```
```diff
--- /dev/null
+++ b/data/lib/chicago/etl/transformations.rb
@@ -0,0 +1,128 @@
+module Chicago
+  module ETL
+    module Transformations
+      # Filters rows so they only get output once, based on a :key.
+      class WrittenRowFilter < Flow::Transformation
+        requires_options :key
+
+        def initialize(*args)
+          super(*args)
+          @written_rows = Set.new
+        end
+
+        def process_row(row)
+          key = row[key_field]
+          # puts "Checking on #{key}"
+          unless @written_rows.include?(key)
+            @written_rows << key
+            row
+          end
+        end
+
+        def key_field
+          @options[:key]
+        end
+      end
+
+      # Adds an :id field to a row, based on a KeyBuilder.
+      #
+      # Also adds this id as :row_id to any rows in an embedded
+      # :_errors field.
+      #
+      # Pass the :key_builder option to set the KeyBuilder.
+      class AddKey < Flow::Transformation
+        requires_options :key_builder
+        adds_fields :id
+
+        def output_streams
+          [:default, :dimension_key]
+        end
+
+        def process_row(row)
+          key, key_row = key_builder.key(row)
+          row[:id] = key
+          (row[:_errors] || []).each {|e| e[:row_id] = row[:id] }
+
+          if key_row
+            assign_stream(key_row, :dimension_key)
+            [row, key_row]
+          else
+            row
+          end
+        end
+
+        def key_builder
+          @options[:key_builder]
+        end
+      end
+
+      # Removes embedded :_errors and puts them on the error stream.
+      class DemultiplexErrors < Flow::Transformation
+        def output_streams
+          [:default, :error]
+        end
+
+        def process_row(row)
+          errors = (row.delete(:_errors) || []).each do |e|
+            assign_stream(e, :error)
+          end
+
+          [row] + errors
+        end
+      end
+
+      # Removes a field from the row, and creates a row on a
+      # designated key stream
+      class DimensionKeyMapping < Flow::Transformation
+        requires_options :original_key, :key_table
+
+        def removed_fields
+          [original_key]
+        end
+
+        def output_streams
+          [:default, key_table]
+        end
+
+        def process_row(row)
+          key_row = {
+            :original_id => row.delete(original_key),
+            :dimension_id => row[:id]
+          }
+          assign_stream(key_row, key_table)
+          [row, key_row]
+        end
+
+        def original_key
+          @options[:original_key]
+        end
+
+        def key_table
+          @options[:key_table]
+        end
+      end
+
+      # Adds a hash of the specified columns as a field in the row.
+      class HashColumns < Flow::Transformation
+        requires_options :columns
+
+        def process_row(row)
+          str = hash_columns.map {|c| row[c].to_s }.join
+          row.put(output_field, Digest::MD5.hexdigest(str).upcase)
+        end
+
+        def added_fields
+          [output_field]
+        end
+
+        def output_field
+          @options[:output_field] || :hash
+        end
+
+        def hash_columns
+          @options[:columns]
+        end
+      end
+    end
+  end
+end
```
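Some of these transformations are usable standalone, which makes the row contract easy to see: `process_row` takes a hash, and returns a row, nil (drop), or an array of rows bound for different streams. A sketch, with rows as plain hashes:

```ruby
filter = Chicago::ETL::Transformations::WrittenRowFilter.new(:key => :id)

filter.process_row(:id => 1, :name => "fred")  # => the row, first time through
filter.process_row(:id => 1, :name => "fred")  # => nil; duplicates are dropped

demux = Chicago::ETL::Transformations::DemultiplexErrors.new
demux.process_row(:id => 1, :_errors => [{:error => "Missing Value"}])
# => [{:id => 1}, {:error => "Missing Value"}]
#    (the error row having been tagged for the :error stream via assign_stream)
```

`WrittenRowFilter` and `DemultiplexErrors` are exactly the pair that `LoadPipelineStageBuilder#add_final_transforms` appends to every stage, so every loaded row is written at most once and its accumulated screen errors end up on the error sink.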