chicago-etl 0.0.13 → 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +8 -3
- data/README.rdoc +4 -1
- data/VERSION +1 -1
- data/chicago-etl.gemspec +59 -22
- data/chicago-flow.gemspec +92 -0
- data/lib/chicago/etl/batch.rb +9 -2
- data/lib/chicago/etl/core_extensions.rb +12 -0
- data/lib/chicago/etl/counter.rb +8 -1
- data/lib/chicago/etl/dataset_batch_stage.rb +52 -0
- data/lib/chicago/etl/key_builder.rb +17 -39
- data/lib/chicago/etl/load_dataset_builder.rb +3 -1
- data/lib/chicago/etl/load_pipeline_stage_builder.rb +142 -0
- data/lib/chicago/etl/pipeline.rb +151 -0
- data/lib/chicago/etl/schema_table_sink_factory.rb +74 -0
- data/lib/chicago/etl/screens/column_screen.rb +26 -25
- data/lib/chicago/etl/screens/invalid_element.rb +5 -5
- data/lib/chicago/etl/screens/missing_value.rb +4 -2
- data/lib/chicago/etl/screens/out_of_bounds.rb +2 -0
- data/lib/chicago/etl/table_builder.rb +4 -2
- data/lib/chicago/etl/task_invocation.rb +0 -1
- data/lib/chicago/etl/transformations.rb +128 -0
- data/lib/chicago/etl.rb +39 -8
- data/lib/chicago/flow/array_sink.rb +35 -0
- data/lib/chicago/flow/array_source.rb +15 -0
- data/lib/chicago/flow/dataset_source.rb +23 -0
- data/lib/chicago/flow/errors.rb +14 -0
- data/lib/chicago/flow/filter.rb +15 -0
- data/lib/chicago/flow/mysql.rb +4 -0
- data/lib/chicago/{etl/mysql_load_file_value_transformer.rb → flow/mysql_file_serializer.rb} +7 -4
- data/lib/chicago/flow/mysql_file_sink.rb +68 -0
- data/lib/chicago/flow/null_sink.rb +8 -0
- data/lib/chicago/flow/pipeline_endpoint.rb +15 -0
- data/lib/chicago/flow/pipeline_stage.rb +68 -0
- data/lib/chicago/flow/sink.rb +53 -0
- data/lib/chicago/flow/transformation.rb +169 -0
- data/lib/chicago/flow/transformation_chain.rb +40 -0
- data/spec/etl/batch_spec.rb +2 -1
- data/spec/etl/core_extensions_spec.rb +13 -0
- data/spec/etl/dataset_batch_stage_spec.rb +55 -0
- data/spec/etl/key_builder_spec.rb +25 -83
- data/spec/etl/pipeline_stage_builder_spec.rb +39 -0
- data/spec/etl/schema_table_sink_factory_spec.rb +69 -0
- data/spec/etl/screens/invalid_element_spec.rb +10 -11
- data/spec/etl/screens/missing_value_spec.rb +21 -21
- data/spec/etl/screens/out_of_bounds_spec.rb +21 -29
- data/spec/etl/transformations_spec.rb +109 -0
- data/spec/flow/array_sink_spec.rb +26 -0
- data/spec/flow/array_source_spec.rb +20 -0
- data/spec/flow/dataset_source_spec.rb +15 -0
- data/spec/flow/filter_spec.rb +13 -0
- data/spec/flow/mysql_file_serializer_spec.rb +27 -0
- data/spec/flow/mysql_file_sink_spec.rb +94 -0
- data/spec/flow/mysql_integration_spec.rb +72 -0
- data/spec/flow/pipeline_stage_spec.rb +89 -0
- data/spec/flow/transformation_chain_spec.rb +76 -0
- data/spec/flow/transformation_spec.rb +91 -0
- data/spec/spec_helper.rb +5 -0
- metadata +135 -39
- data/lib/chicago/etl/buffering_insert_writer.rb +0 -36
- data/lib/chicago/etl/mysql_dumpfile.rb +0 -32
- data/lib/chicago/etl/screens/composite_screen.rb +0 -17
- data/lib/chicago/etl/sequel/load_data_infile.rb +0 -141
- data/lib/chicago/etl/sink.rb +0 -61
- data/lib/chicago/etl/transformations/add_insert_timestamp.rb +0 -16
- data/spec/etl/mysql_dumpfile_spec.rb +0 -42
- data/spec/etl/mysql_load_file_value_transformer_spec.rb +0 -27
- data/spec/etl/screens/composite_screen_spec.rb +0 -25
- data/spec/etl/sequel/load_data_infile_expression_spec.rb +0 -60
- data/spec/etl/sequel/load_data_infile_spec.rb +0 -37
- data/spec/etl/sink_spec.rb +0 -7
- data/spec/etl/transformations/add_insert_timestamp_spec.rb +0 -9
@@ -0,0 +1,142 @@
|
|
1
|
+
module Chicago
|
2
|
+
module ETL
|
3
|
+
# Provides DSL methods for specifying the pipeline in an ETL
# stage.
#
# Clients will not normally instantiate this themselves but use it
# in the context of defining an ETL stage.
class LoadPipelineStageBuilder
  # @api private
  KeyMapping = Struct.new(:table, :field)

  # The ordering of inbuilt transformation and screening steps.
  TRANSFORMATION_ORDER = [:before_screens,
                          :screens,
                          :after_screens,
                          :before_keys,
                          :keys,
                          :after_keys,
                          :before_final,
                          :final,
                          :after_final
                         ].freeze

  # @api private
  def initialize(db, schema_table)
    @db = db
    @schema_table = schema_table
    @sink_factory = SchemaTableSinkFactory.new(@db, @schema_table)
  end

  # Resets the builder state, evaluates the block against this
  # builder (so the protected DSL methods below are available), then
  # appends the inbuilt screens, key and final transformations and
  # returns the resulting pipeline stage.
  #
  # @api private
  def build(&block)
    @load_separately = []
    @key_mappings = []
    @transformations = {}
    TRANSFORMATION_ORDER.each {|k| @transformations[k] = [] }
    @ignore_present_rows = false

    instance_eval(&block)

    add_screens
    add_key_transforms
    add_final_transforms
    pipeline_stage = create_pipeline_stage
    register_additional_sinks(pipeline_stage)
    pipeline_stage
  end

  protected

  # Ignore rows already present in the target table, rather than
  # replacing them.
  def ignore_present_rows
    @ignore_present_rows = true
  end

  # Specify columns that won't be loaded or screened as part of
  # this pipeline stage
  def load_separately(*columns)
    @load_separately += columns
  end

  # Add an additional key mapping.
  def key_mapping(table, field)
    @key_mappings << KeyMapping.new(table, field)
  end

  # Add a transformation before the specified point in the
  # transformation chain (defined in TRANSFORMATION_ORDER)
  #
  # Raises ArgumentError if there is no "before_<point>" slot.
  def before(point_in_transformation_chain, transform)
    transformation_slot(:before, point_in_transformation_chain) << transform
  end

  # Add a transformation after the specified point in the
  # transformation chain (defined in TRANSFORMATION_ORDER)
  #
  # Raises ArgumentError if there is no "after_<point>" slot.
  def after(point_in_transformation_chain, transform)
    transformation_slot(:after, point_in_transformation_chain) << transform
  end

  private

  # Returns the array of transformations registered under
  # "<position>_<point>". An unknown point raises ArgumentError
  # instead of failing later with a NoMethodError on nil.
  def transformation_slot(position, point)
    key = "#{position}_#{point}".to_sym
    @transformations.fetch(key) do
      valid = @transformations.keys.map {|k| k.to_s }.
        select {|k| k.index("#{position}_") == 0 }.
        map {|k| k.sub("#{position}_", "").to_sym }
      raise ArgumentError, "Unknown point in transformation chain: " +
        "#{point.inspect} (expected one of #{valid.inspect})"
    end
  end

  # Builds the Flow::PipelineStage with the default, dimension key
  # and error sinks.
  def create_pipeline_stage
    default = @sink_factory.sink(:ignore => @ignore_present_rows,
                                 :exclude => @load_separately)
    key_sink = if @schema_table.kind_of?(Chicago::Schema::Dimension)
                 @sink_factory.key_sink
               else
                 # Facts have no key table to write to.
                 Flow::NullSink.new
               end

    Flow::PipelineStage.
      new(:transformations => concat_transformations,
          :sinks => {
            :default => default,
            :dimension_key => key_sink,
            :error => @sink_factory.error_sink
          })
  end

  # Flattens the per-slot transformation lists into a single list,
  # following TRANSFORMATION_ORDER.
  def concat_transformations
    TRANSFORMATION_ORDER.map {|k| @transformations[k] }.flatten
  end

  # Registers one key sink per additional key mapping, named after
  # the mapping's table.
  def register_additional_sinks(pipeline_stage)
    @key_mappings.each do |mapping|
      sink = @sink_factory.key_sink(:table => mapping.table)
      pipeline_stage.register_sink(mapping.table, sink)
    end
  end

  # Replaces the :screens slot with the inbuilt screens for every
  # column that is not loaded separately.
  def add_screens
    columns_to_screen = @schema_table.columns.reject do |column|
      @load_separately.include?(column.name)
    end

    @transformations[:screens] = [Screens::MissingValue,
                                  Screens::InvalidElement,
                                  Screens::OutOfBounds].map do |klass|
      klass.for_columns(columns_to_screen)
    end.flatten
  end

  # Appends the AddKey transformation, followed by one
  # DimensionKeyMapping per additional key mapping.
  def add_key_transforms
    @transformations[:keys] << Transformations::AddKey.
      new(:key_builder => KeyBuilder.for_table(@schema_table, @db))

    @key_mappings.each do |mapping|
      @transformations[:keys] << Transformations::DimensionKeyMapping.
        new(:original_key => mapping.field, :key_table => mapping.table)
    end
  end

  # Appends the final inbuilt transformations: filtering on :id,
  # then moving embedded errors to the error stream.
  def add_final_transforms
    @transformations[:final] << Transformations::WrittenRowFilter.new(:key => :id)
    @transformations[:final] << Transformations::DemultiplexErrors.new
  end
end
|
141
|
+
end
|
142
|
+
end
|
@@ -0,0 +1,151 @@
|
|
1
|
+
module Chicago
|
2
|
+
module ETL
|
3
|
+
# An ETL pipeline: collects the dimension and fact load stages
# defined against a schema.
class Pipeline
  # load_dimensions - all defined dimension load tasks
  # load_facts      - all defined fact load tasks
  attr_reader :load_dimensions, :load_facts

  # Creates a pipeline for a Schema.
  def initialize(db, schema)
    @schema = schema
    @db = db
    @load_dimensions = Chicago::Schema::NamedElementCollection.new
    @load_facts = Chicago::Schema::NamedElementCollection.new
  end

  # Defines a dimension load stage.
  #
  # The dimension is looked up in the schema by the :dimension
  # option, falling back to the stage name.
  def define_dimension_load(name, options={}, &block)
    dimension = @schema.dimension(options[:dimension] || name)
    @load_dimensions << build_stage(name, dimension, &block)
  end

  # Defines a fact load stage.
  #
  # The fact is looked up in the schema by the :fact option, falling
  # back to the stage name.
  def define_fact_load(name, options={}, &block)
    fact = @schema.fact(options[:fact] || name)
    @load_facts << build_stage(name, fact, &block)
  end

  # Builds a stage, but does not define it.
  def build_stage(name, schema_table, &block)
    builder = DatasetBatchStageBuilder.new(@db, schema_table)
    builder.build(name, &block)
  end
end
|
37
|
+
|
38
|
+
# Provides DSL methods for building a DataSetBatchStage.
#
# Clients shouldn't need to instantiate this directly, but instead
# call the protected methods in the context of defining a Pipeline
class DatasetBatchStageBuilder
  # @api private
  def initialize(db, schema_table)
    @db, @schema_table = db, schema_table
    # Explicit defaults for the settings #build reads, so that it
    # never touches an unset instance variable when the DSL block
    # leaves them out. @pipeline_stage is deliberately left undefined:
    # #build uses that to detect a missing pipeline definition.
    @dataset = nil
    @filter_strategy = nil
    @truncate_pre_load = nil
  end

  # Evaluates the block against this builder and returns a
  # DatasetBatchStage built from the collected settings. If the block
  # never called #pipeline, an empty pipeline definition is used.
  #
  # @api private
  def build(name, &block)
    instance_eval(&block)
    unless defined?(@pipeline_stage)
      pipeline do
      end
    end
    DatasetBatchStage.new(name, @dataset, @pipeline_stage,
                          :filter_strategy => @filter_strategy,
                          :truncate_pre_load => @truncate_pre_load)
  end

  protected

  # Specifies that the sinks should be truncated before loading
  # data.
  def truncate_pre_load
    @truncate_pre_load = true
  end

  # Specifies that the dataset should never be filtered to the ETL
  # batch - i.e. it should behave as if reextract was always true
  def full_reload
    @filter_strategy = lambda {|dataset, etl_batch| dataset }
  end

  # Define elements of the pipeline. See LoadPipelineStageBuilder
  # for details.
  def pipeline(&block)
    @pipeline_stage = LoadPipelineStageBuilder.new(@db, @schema_table).
      build(&block)
  end

  # Defines the dataset, see DatasetBuilder.
  #
  # The block must return a Sequel::Dataset.
  def dataset(&block)
    @dataset = DatasetBuilder.new(@db).build(&block)
  end

  # Define a custom filter strategy for filtering to an ETL batch.
  def filter_strategy(&block)
    @filter_strategy = block
  end
end
|
93
|
+
|
94
|
+
# Provides convenience methods for defining source datasets.
class DatasetBuilder
  attr_reader :db

  # @api private
  def initialize(db)
    @db = db
  end

  # Evaluates the block against this builder and returns the block's
  # result.
  #
  # @api private
  def build(&block)
    instance_eval(&block)
  end

  protected

  # Returns an aliased IF(field IS NULL, 1, field) style column
  # expression, named +name+.
  #
  # Built with Symbol#sql_function, like date_dimension_column,
  # rather than the :if[...] form: on Ruby 1.9+ Symbol#[] is the core
  # string-slicing method, so :if[...] does not produce an SQL
  # function there.
  def key_field(field, name)
    :if.sql_function({field => nil}, 1, field).as(name)
  end

  # Returns a column for use in a Sequel::Dataset#select method to
  # return a dimension key.
  #
  # Takes care of using the key tables correctly, and dealing with
  # missing dimension values.
  def dimension_key(name)
    key_field("keys_dimension_#{name}__dimension_id".to_sym,
              "#{name}_dimension_id".to_sym)
  end

  # Returns a column for use in a Sequel::Dataset#select method to
  # return a date dimension key.
  def date_dimension_column(dimension)
    :if.sql_function({:id.qualify(dimension) => nil},
                     1,
                     :id.qualify(dimension)).
      as("#{dimension}_dimension_id".to_sym)
  end

  # Rounds a monetary value to 2 decimal places.
  #
  # By default, natural rounding is used, you can specify either
  # :up or :down as the direction. Any other direction raises
  # ArgumentError rather than silently returning nil.
  #
  # @deprecated
  def round(stmt, direction = :none)
    case direction
    when :none
      :round.sql_function(stmt, 2)
    when :up
      :ceil.sql_function(stmt * 100) / 100
    when :down
      :floor.sql_function(stmt * 100) / 100
    else
      raise ArgumentError, "Unknown rounding direction: " +
        "#{direction.inspect} (expected :none, :up or :down)"
    end
  end
end
|
150
|
+
end
|
151
|
+
end
|
@@ -0,0 +1,74 @@
|
|
1
|
+
module Chicago
|
2
|
+
module ETL
|
3
|
+
# Builds Sinks for Dimension & Fact tables.
class SchemaTableSinkFactory
  # Creates a new factory.
  def initialize(db, schema_table)
    @db, @schema_table = db, schema_table
  end

  # Returns a sink to load data into the MySQL table backing the
  # schema table.
  #
  # Pass an :exclude option if you don't want all columns of the
  # schema table to be loaded via this sink.
  def sink(options={})
    Flow::MysqlFileSink.new(@db,
                            @schema_table.table_name,
                            load_columns(options[:exclude]),
                            mysql_options(options))
  end

  # Returns a sink to load data into the MySQL table backing the
  # key table for a Dimension.
  #
  # The options hash passed in is not modified.
  #
  # @option options [Symbol] :table - a custom key table name. The
  #   schema table's key table name will be used otherwise.
  def key_sink(options={})
    # Read rather than delete :table, so the caller's hash is left
    # untouched; mysql_options only forwards the keys it knows.
    table = options[:table] || @schema_table.key_table_name
    sink = Flow::MysqlFileSink.new(@db,
                                   table,
                                   [:original_id, :dimension_id],
                                   mysql_options(options))
    sink.truncation_strategy = lambda do
      # No Op - we want to maintain keys to avoid having to sort
      # out fact tables.
    end
    sink
  end

  # Returns a sink to load errors generated in the ETL process.
  #
  # Truncating this sink deletes only the error log rows recorded
  # against this schema table.
  def error_sink(options={})
    sink = Flow::MysqlFileSink.
      new(@db, :etl_error_log,
          [:column, :row_id, :error, :severity, :error_detail], mysql_options(options)).
      set_constant_values(:table => @schema_table.table_name.to_s,
                          :process_name => "StandardTransformations",
                          :process_version => 3,
                          :logged_at => Time.now)

    sink.truncation_strategy = lambda do
      @db[:etl_error_log].
        where(:table => @schema_table.table_name.to_s).delete
    end
    sink
  end

  private

  # Returns :id followed by the database names of the schema table's
  # columns, minus any whose name is listed in +exclude+ (a single
  # name, an array of names, or nil).
  def load_columns(exclude=nil)
    exclude = [exclude].compact.flatten
    [:id] + @schema_table.columns.
      reject {|c| exclude.include?(c.name) }.
      map {|c| c.database_name }
  end

  # Returns a new hash with only the :filepath and :ignore entries
  # present in +options+.
  def mysql_options(options)
    [:filepath, :ignore].inject({}) do |hsh, k|
      hsh[k] = options[k] if options.has_key?(k)
      hsh
    end
  end
end
|
73
|
+
end
|
74
|
+
end
|
@@ -1,53 +1,54 @@
|
|
1
1
|
module Chicago
|
2
2
|
module ETL
|
3
3
|
module Screens
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
@error_name = self.class.name.split('::').last.sub(/Screen$/,'').titlecase
|
4
|
+
# @abstract
|
5
|
+
class ColumnScreen < Flow::Transformation
|
6
|
+
def self.for_columns(columns)
|
7
|
+
columns.map {|column|
|
8
|
+
new(:default, :column => column)
|
9
|
+
}
|
11
10
|
end
|
12
11
|
|
13
|
-
def
|
14
|
-
|
15
|
-
CompositeScreen.new(screens)
|
12
|
+
def output_streams
|
13
|
+
[:default, :error]
|
16
14
|
end
|
17
15
|
|
18
|
-
def
|
19
|
-
|
20
|
-
|
21
|
-
if applies?(value)
|
16
|
+
def process_row(row)
|
17
|
+
if applies?(row[column.database_name])
|
22
18
|
overwrite_value(row)
|
23
|
-
|
19
|
+
error_row = error(row[column.database_name])
|
20
|
+
if error_row
|
21
|
+
row[:_errors] ||= []
|
22
|
+
row[:_errors] << error_row
|
23
|
+
end
|
24
24
|
end
|
25
25
|
|
26
|
-
|
26
|
+
row
|
27
27
|
end
|
28
28
|
|
29
29
|
def severity
|
30
30
|
1
|
31
31
|
end
|
32
32
|
|
33
|
+
def column
|
34
|
+
@options[:column]
|
35
|
+
end
|
36
|
+
|
33
37
|
private
|
34
38
|
|
35
|
-
def
|
36
|
-
|
39
|
+
def error_name
|
40
|
+
self.class.name.split('::').last.sub(/Screen$/,'').titlecase
|
37
41
|
end
|
38
42
|
|
39
|
-
def
|
40
|
-
|
43
|
+
def overwrite_value(row)
|
44
|
+
row[column.database_name] = column.default_value
|
41
45
|
end
|
42
46
|
|
43
|
-
def
|
47
|
+
def error(value)
|
44
48
|
{
|
45
|
-
:process_name => "StandardTransformations",
|
46
|
-
:process_version => 2,
|
47
|
-
:table => table_name.to_s,
|
48
49
|
:column => column.database_name.to_s,
|
49
50
|
:severity => severity,
|
50
|
-
:error =>
|
51
|
+
:error => error_name
|
51
52
|
}
|
52
53
|
end
|
53
54
|
|
@@ -1,11 +1,11 @@
|
|
1
1
|
module Chicago
|
2
2
|
module ETL
|
3
3
|
module Screens
|
4
|
+
# Transformation which checks to see if a field's value is in a
|
5
|
+
# column's elements.
|
4
6
|
class InvalidElement < ColumnScreen
|
5
|
-
def self.for_columns(
|
6
|
-
|
7
|
-
map {|column| new(table_name, column) }
|
8
|
-
CompositeScreen.new(screens)
|
7
|
+
def self.for_columns(columns)
|
8
|
+
columns.select(&:elements).map {|column| new(:default, :column => column) }
|
9
9
|
end
|
10
10
|
|
11
11
|
def severity
|
@@ -17,7 +17,7 @@ module Chicago
|
|
17
17
|
!column.elements.map(&:downcase).include?(value.to_s.downcase)
|
18
18
|
end
|
19
19
|
|
20
|
-
def
|
20
|
+
def error(value)
|
21
21
|
super(value).
|
22
22
|
merge(:error_detail => "'#{value}' is not a valid value.")
|
23
23
|
end
|
@@ -1,14 +1,16 @@
|
|
1
1
|
module Chicago
|
2
2
|
module ETL
|
3
3
|
module Screens
|
4
|
+
# Screen which checks to see if a field is present in the row if
|
5
|
+
# required.
|
4
6
|
class MissingValue < ColumnScreen
|
5
7
|
def severity
|
6
8
|
column.descriptive? ? 1 : 2
|
7
9
|
end
|
8
10
|
|
9
|
-
def
|
11
|
+
def error(value)
|
10
12
|
if ! (column.column_type == :boolean || column.optional?)
|
11
|
-
|
13
|
+
super(value)
|
12
14
|
end
|
13
15
|
end
|
14
16
|
|
@@ -8,11 +8,13 @@ module Chicago
|
|
8
8
|
new(db).build
|
9
9
|
end
|
10
10
|
|
11
|
-
|
11
|
+
# @api private
|
12
|
+
def initialize(db)
|
12
13
|
@db = db
|
13
14
|
end
|
14
15
|
|
15
|
-
|
16
|
+
# @api private
|
17
|
+
def build
|
16
18
|
create_table :etl_batches do
|
17
19
|
primary_key :id, :type => :integer, :unsigned => true
|
18
20
|
timestamp :started_at, :null => false, :default => :current_timestamp.sql_function
|
@@ -0,0 +1,128 @@
|
|
1
|
+
module Chicago
|
2
|
+
module ETL
|
3
|
+
module Transformations
|
4
|
+
# Filters rows so they only get output once, based on a :key.
#
# The value of the :key field is remembered for every row let
# through; later rows carrying an already-seen value are dropped.
class WrittenRowFilter < Flow::Transformation
  requires_options :key

  def initialize(*args)
    super(*args)
    @written_rows = Set.new
  end

  # Returns the row the first time its key value is seen, nil
  # otherwise.
  def process_row(row)
    # Set#add? returns nil when the value is already in the set.
    row if @written_rows.add?(row[key_field])
  end

  # The name of the field used to identify a row.
  def key_field
    @options[:key]
  end
end
|
26
|
+
|
27
|
+
# Adds an :id field to a row, based on a KeyBuilder.
#
# Also adds this id as :row_id to any rows in an embedded
# :_errors field.
#
# Pass the :key_builder option to set the KeyBuilder.
class AddKey < Flow::Transformation
  requires_options :key_builder
  adds_fields :id

  def output_streams
    [:default, :dimension_key]
  end

  # Returns the row with its :id set. When the key builder also
  # returns a key row, that row is assigned to the :dimension_key
  # stream and returned alongside the row.
  def process_row(row)
    key, key_row = key_builder.key(row)
    row[:id] = key

    embedded_errors = row[:_errors]
    if embedded_errors
      embedded_errors.each {|error_row| error_row[:row_id] = row[:id] }
    end

    return row unless key_row

    assign_stream(key_row, :dimension_key)
    [row, key_row]
  end

  # The KeyBuilder given via the :key_builder option.
  def key_builder
    @options[:key_builder]
  end
end
|
58
|
+
|
59
|
+
# Removes embedded :_errors and puts them on the error stream.
class DemultiplexErrors < Flow::Transformation
  def output_streams
    [:default, :error]
  end

  # Returns the row (without its :_errors field) followed by each
  # embedded error row, the latter assigned to the :error stream.
  def process_row(row)
    error_rows = row.delete(:_errors) || []
    error_rows.each {|error_row| assign_stream(error_row, :error) }

    [row] + error_rows
  end
end
|
73
|
+
|
74
|
+
# Removes a field from the row, and creates a row on a
# designated key stream
class DimensionKeyMapping < Flow::Transformation
  requires_options :original_key, :key_table

  def removed_fields
    [original_key]
  end

  def output_streams
    [:default, key_table]
  end

  # Returns the row, minus the original key field, together with a
  # key row pairing that field's value (:original_id) with the row's
  # :id (:dimension_id). The key row is assigned to the key table's
  # stream.
  def process_row(row)
    original_id = row.delete(original_key)
    key_row = {:original_id => original_id, :dimension_id => row[:id]}

    assign_stream(key_row, key_table)
    [row, key_row]
  end

  # The field to remove from the row, from the :original_key option.
  def original_key
    @options[:original_key]
  end

  # The stream the key row is written to, from the :key_table option.
  def key_table
    @options[:key_table]
  end
end
|
104
|
+
|
105
|
+
# Adds a hash of the specified columns as a field in the row.
#
# The hash is the upper-cased MD5 hex digest of the columns' string
# values, concatenated in the order given by the :columns option.
class HashColumns < Flow::Transformation
  requires_options :columns

  def process_row(row)
    column_values = hash_columns.map {|column| row[column].to_s }
    digest = Digest::MD5.hexdigest(column_values.join).upcase
    row.put(output_field, digest)
  end

  def added_fields
    [output_field]
  end

  # The field the digest is stored in; :hash unless the
  # :output_field option is given.
  def output_field
    @options[:output_field] || :hash
  end

  # The columns to digest, from the :columns option.
  def hash_columns
    @options[:columns]
  end
end
|
126
|
+
end
|
127
|
+
end
|
128
|
+
end
|