chicago-etl 0.0.13 → 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +8 -3
- data/README.rdoc +4 -1
- data/VERSION +1 -1
- data/chicago-etl.gemspec +59 -22
- data/chicago-flow.gemspec +92 -0
- data/lib/chicago/etl/batch.rb +9 -2
- data/lib/chicago/etl/core_extensions.rb +12 -0
- data/lib/chicago/etl/counter.rb +8 -1
- data/lib/chicago/etl/dataset_batch_stage.rb +52 -0
- data/lib/chicago/etl/key_builder.rb +17 -39
- data/lib/chicago/etl/load_dataset_builder.rb +3 -1
- data/lib/chicago/etl/load_pipeline_stage_builder.rb +142 -0
- data/lib/chicago/etl/pipeline.rb +151 -0
- data/lib/chicago/etl/schema_table_sink_factory.rb +74 -0
- data/lib/chicago/etl/screens/column_screen.rb +26 -25
- data/lib/chicago/etl/screens/invalid_element.rb +5 -5
- data/lib/chicago/etl/screens/missing_value.rb +4 -2
- data/lib/chicago/etl/screens/out_of_bounds.rb +2 -0
- data/lib/chicago/etl/table_builder.rb +4 -2
- data/lib/chicago/etl/task_invocation.rb +0 -1
- data/lib/chicago/etl/transformations.rb +128 -0
- data/lib/chicago/etl.rb +39 -8
- data/lib/chicago/flow/array_sink.rb +35 -0
- data/lib/chicago/flow/array_source.rb +15 -0
- data/lib/chicago/flow/dataset_source.rb +23 -0
- data/lib/chicago/flow/errors.rb +14 -0
- data/lib/chicago/flow/filter.rb +15 -0
- data/lib/chicago/flow/mysql.rb +4 -0
- data/lib/chicago/{etl/mysql_load_file_value_transformer.rb → flow/mysql_file_serializer.rb} +7 -4
- data/lib/chicago/flow/mysql_file_sink.rb +68 -0
- data/lib/chicago/flow/null_sink.rb +8 -0
- data/lib/chicago/flow/pipeline_endpoint.rb +15 -0
- data/lib/chicago/flow/pipeline_stage.rb +68 -0
- data/lib/chicago/flow/sink.rb +53 -0
- data/lib/chicago/flow/transformation.rb +169 -0
- data/lib/chicago/flow/transformation_chain.rb +40 -0
- data/spec/etl/batch_spec.rb +2 -1
- data/spec/etl/core_extensions_spec.rb +13 -0
- data/spec/etl/dataset_batch_stage_spec.rb +55 -0
- data/spec/etl/key_builder_spec.rb +25 -83
- data/spec/etl/pipeline_stage_builder_spec.rb +39 -0
- data/spec/etl/schema_table_sink_factory_spec.rb +69 -0
- data/spec/etl/screens/invalid_element_spec.rb +10 -11
- data/spec/etl/screens/missing_value_spec.rb +21 -21
- data/spec/etl/screens/out_of_bounds_spec.rb +21 -29
- data/spec/etl/transformations_spec.rb +109 -0
- data/spec/flow/array_sink_spec.rb +26 -0
- data/spec/flow/array_source_spec.rb +20 -0
- data/spec/flow/dataset_source_spec.rb +15 -0
- data/spec/flow/filter_spec.rb +13 -0
- data/spec/flow/mysql_file_serializer_spec.rb +27 -0
- data/spec/flow/mysql_file_sink_spec.rb +94 -0
- data/spec/flow/mysql_integration_spec.rb +72 -0
- data/spec/flow/pipeline_stage_spec.rb +89 -0
- data/spec/flow/transformation_chain_spec.rb +76 -0
- data/spec/flow/transformation_spec.rb +91 -0
- data/spec/spec_helper.rb +5 -0
- metadata +135 -39
- data/lib/chicago/etl/buffering_insert_writer.rb +0 -36
- data/lib/chicago/etl/mysql_dumpfile.rb +0 -32
- data/lib/chicago/etl/screens/composite_screen.rb +0 -17
- data/lib/chicago/etl/sequel/load_data_infile.rb +0 -141
- data/lib/chicago/etl/sink.rb +0 -61
- data/lib/chicago/etl/transformations/add_insert_timestamp.rb +0 -16
- data/spec/etl/mysql_dumpfile_spec.rb +0 -42
- data/spec/etl/mysql_load_file_value_transformer_spec.rb +0 -27
- data/spec/etl/screens/composite_screen_spec.rb +0 -25
- data/spec/etl/sequel/load_data_infile_expression_spec.rb +0 -60
- data/spec/etl/sequel/load_data_infile_spec.rb +0 -37
- data/spec/etl/sink_spec.rb +0 -7
- data/spec/etl/transformations/add_insert_timestamp_spec.rb +0 -9
data/lib/chicago/etl.rb
CHANGED
@@ -1,35 +1,66 @@
|
|
1
|
+
# Make a CSV constant available on every supported Ruby.
#
# Ruby 1.8 relies on the FasterCSV gem, which is aliased to CSV so the
# rest of the library can refer to a single constant. Later Rubies load
# the 'csv' library directly.
#
# The whole version string is compared, not just the minor component:
# looking only at the minor number misclassifies 2.x and 3.x releases
# (e.g. "2.0.0" has a minor of "0") as pre-1.9.
if RUBY_VERSION < "1.9"
  require 'fastercsv'
  CSV = FasterCSV
else
  require 'csv'
end
|
7
|
+
|
1
8
|
require 'sequel'
|
9
|
+
require 'chicago/flow/errors'
|
10
|
+
require 'chicago/flow/transformation'
|
11
|
+
require 'chicago/flow/filter'
|
12
|
+
require 'chicago/flow/transformation_chain'
|
13
|
+
require 'chicago/flow/pipeline_stage'
|
14
|
+
require 'chicago/flow/pipeline_endpoint'
|
15
|
+
require 'chicago/flow/array_source'
|
16
|
+
require 'chicago/flow/dataset_source'
|
17
|
+
require 'chicago/flow/sink'
|
18
|
+
require 'chicago/flow/array_sink'
|
19
|
+
require 'chicago/flow/null_sink'
|
20
|
+
require 'chicago/flow/mysql'
|
2
21
|
|
22
|
+
require 'chicago/etl/core_extensions'
|
3
23
|
require 'chicago/etl/counter'
|
4
24
|
require 'chicago/etl/key_builder'
|
5
|
-
require 'chicago/etl/
|
6
|
-
require 'chicago/etl/
|
7
|
-
require 'chicago/etl/buffering_insert_writer'
|
8
|
-
require 'chicago/etl/mysql_dumpfile'
|
9
|
-
|
25
|
+
require 'chicago/etl/schema_table_sink_factory'
|
26
|
+
require 'chicago/etl/transformations'
|
10
27
|
require 'chicago/etl/load_dataset_builder'
|
28
|
+
require 'chicago/etl/dataset_batch_stage'
|
29
|
+
require 'chicago/etl/load_pipeline_stage_builder'
|
30
|
+
require 'chicago/etl/pipeline'
|
11
31
|
|
12
32
|
# Sequel Extensions
|
13
33
|
require 'chicago/etl/sequel/filter_to_etl_batch'
|
14
|
-
require 'chicago/etl/sequel/load_data_infile'
|
15
34
|
require 'chicago/etl/sequel/dependant_tables'
|
16
35
|
|
17
36
|
# Screens
|
18
37
|
require 'chicago/etl/screens/column_screen'
|
19
|
-
require 'chicago/etl/screens/composite_screen'
|
20
38
|
require 'chicago/etl/screens/missing_value'
|
21
39
|
require 'chicago/etl/screens/invalid_element'
|
22
40
|
require 'chicago/etl/screens/out_of_bounds'
|
23
41
|
|
24
42
|
# Transformations
|
25
|
-
require 'chicago/etl/transformations/add_insert_timestamp'
|
26
43
|
require 'chicago/etl/transformations/uk_post_code'
|
27
44
|
require 'chicago/etl/transformations/uk_post_code_field'
|
28
45
|
|
29
46
|
module Chicago
  # Namespace for the classes used in ETL processing.
  module ETL
    # These constants are loaded lazily, the first time they are referenced.
    autoload(:TableBuilder, 'chicago/etl/table_builder.rb')
    autoload(:Batch, 'chicago/etl/batch.rb')
    autoload(:TaskInvocation, 'chicago/etl/task_invocation.rb')

    class << self
      # Runs a pipeline stage as the :load task of an ETL batch.
      #
      # The stage is executed inside etl_batch.perform_task, keyed by
      # the stage's name. Per the original notes, task execution
      # status is recorded in a database table of etl task
      # invocations, so that a task is not run more than once within a
      # batch - that bookkeeping is the batch's responsibility, not
      # this method's.
      #
      # A debug message is logged immediately before and after the
      # stage executes.
      #
      # @param stage the stage to run; must respond to name and
      #   execute(etl_batch, reextract)
      # @param etl_batch the batch providing perform_task
      # @param reextract passed through untouched to stage.execute
      # @param logger must respond to debug
      def execute(stage, etl_batch, reextract, logger)
        etl_batch.perform_task(:load, stage.name) {
          logger.debug("Starting loading #{stage.name}")
          stage.execute(etl_batch, reextract)
          logger.debug("Finished loading #{stage.name}")
        }
      end
    end
  end
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
module Chicago
  module Flow
    # A sink that keeps every row written to it in an in-memory Array.
    #
    # @api public
    class ArraySink < Sink
      # data: the rows written so far, in the order they were written.
      # name: the name this sink was constructed with.
      attr_reader :data, :name

      # Builds an empty ArraySink.
      #
      # fields is optional: supply a column name, or an array of
      # column names, to make use of static validation that the
      # correct columns are written through the pipeline. A single
      # name is wrapped into a one-element list.
      def initialize(name, fields=[])
        @data   = []
        @fields = [fields].flatten
        @name   = name
      end

      # Stores a copy of the row with this sink's constant values
      # merged over it. See Sink#<<
      def <<(row)
        @data.push(row.merge(constant_values))
      end

      # Discards every stored row. See Sink#truncate
      def truncate
        @data.clear
      end
    end
  end
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
require 'sequel'
|
2
|
+
require 'sequel/fast_columns'
|
3
|
+
|
4
|
+
module Chicago
  module Flow
    # A pipeline endpoint whose rows are read from a dataset.
    #
    # @api public
    class DatasetSource < PipelineEndpoint
      # The dataset this source was constructed with.
      attr_reader :dataset

      # @param dataset the object rows are read from; it must respond
      #   to each and columns (presumably a Sequel dataset, given the
      #   requires at the top of the file)
      def initialize(dataset)
        @dataset = dataset
      end

      # Yields each row of the underlying dataset to the given block.
      def each
        @dataset.each do |record|
          yield record
        end
      end

      # Returns the column list as reported by the underlying dataset.
      def fields
        @dataset.columns
      end
    end
  end
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
module Chicago
  module Flow
    # A transformation that passes a row on only when a predicate
    # block returns a truthy value for it.
    #
    # @api public
    class Filter < Transformation
      # @param stream the stream this filter applies to; :default
      #   unless specified
      # @param block the predicate called with each row. Without a
      #   block, a predicate that always returns false is used, so
      #   every row on the stream is rejected.
      def initialize(stream=:default, &block)
        super(stream)
        @block = block.nil? ? lambda {|row| false } : block
      end

      # Returns the row when the predicate accepts it, nil otherwise.
      def process_row(row)
        @block.call(row) ? row : nil
      end
    end
  end
end
|
@@ -1,12 +1,15 @@
|
|
1
|
+
require 'date'
|
2
|
+
|
1
3
|
module Chicago
|
2
|
-
module
|
3
|
-
|
4
|
+
module Flow
|
5
|
+
# @api private
|
6
|
+
class MysqlFileSerializer
|
4
7
|
# Transforms a value to be suitable for use in file in a LOAD
|
5
8
|
# DATA INFILE mysql statement.
|
6
|
-
def
|
9
|
+
def serialize(value)
|
7
10
|
case value
|
8
11
|
when nil
|
9
|
-
"
|
12
|
+
"NULL"
|
10
13
|
when true
|
11
14
|
"1"
|
12
15
|
when false
|
@@ -0,0 +1,68 @@
|
|
1
|
+
require 'sequel'
|
2
|
+
require 'sequel/load_data_infile'
|
3
|
+
require 'tmpdir'
|
4
|
+
|
5
|
+
Sequel.extension :core_extensions
|
6
|
+
|
7
|
+
module Chicago
  module Flow
    # A sink that buffers rows in a CSV file and, when closed, loads
    # that file into a database table in a single operation.
    #
    # @api public
    class MysqlFileSink < Sink
      # Path of the CSV file rows are buffered in.
      attr_reader :filepath

      # An optional callable invoked by #truncate in place of the
      # default table truncation.
      attr_writer :truncation_strategy

      # @param db the database handle; indexed by table name to obtain
      #   a dataset
      # @param table_name the table rows are loaded into; also used as
      #   this sink's name
      # @param fields a column name or array of column names, in the
      #   order they are written to the file
      # @param options [Hash]
      #   :filepath - where to buffer rows; a randomly named file in
      #     the system temp directory when omitted
      #   :ignore - when truthy, the load goes through the dataset's
      #     insert_ignore variant
      def initialize(db, table_name, fields, options = {})
        @fields = [fields].flatten
        @filepath = options[:filepath] || temp_file(table_name)
        @serializer = MysqlFileSerializer.new
        @db = db
        @table_name = table_name
        @insert_ignore = !!options[:ignore]
      end

      # The name of this sink: the target table name.
      def name
        @table_name
      end

      # Serializes the row's value for each field and appends the
      # result as one line of the CSV file.
      #
      # NOTE(review): relies on a fields reader that is not defined in
      # this class - presumably inherited; confirm against
      # PipelineEndpoint.
      def <<(row)
        csv << fields.map {|c| @serializer.serialize(row[c]) }
      end

      # Flushes buffered rows to disk, loads the file into the table,
      # then removes the file.
      #
      # The CSV handle is released even when the load raises, so a
      # failed load no longer leaks an open file. As before, the file
      # itself is left in place if the load fails. The handle is also
      # forgotten once closed, so rows written after a close go to a
      # freshly opened file rather than to a closed stream.
      def close
        begin
          csv.flush
          load_from_file(filepath)
        ensure
          close_csv
        end
        # File.exist? rather than File.exists?: the latter was
        # deprecated and is no longer available on current Rubies.
        File.unlink(filepath) if File.exist?(filepath)
      end

      # Loads data from the file into the MySQL table via LOAD DATA
      # INFILE, if the file exists and has content.
      #
      # This sink's constant values are passed as the :set option of
      # the load, rather than being written into every line.
      def load_from_file(file)
        # File.size? is nil for a missing or zero-length file.
        return unless File.size?(file)
        dataset.load_csv_infile(file, @fields, :set => constant_values)
      end

      # Removes all rows from the target table, using the truncation
      # strategy if one has been assigned.
      def truncate
        if @truncation_strategy
          @truncation_strategy.call
        else
          @db[@table_name].truncate
        end
      end

      private

      # The dataset rows are loaded through.
      def dataset
        @insert_ignore ? @db[@table_name].insert_ignore : @db[@table_name]
      end

      # The CSV writer, opened for writing on first use.
      def csv
        @csv ||= CSV.open(filepath, "w")
      end

      # Closes the CSV writer if one was opened. Deliberately does not
      # go through #csv, which would open the file just to close it.
      def close_csv
        @csv.close if @csv
        @csv = nil
      end

      # Builds a randomly suffixed path in the system temp directory.
      def temp_file(table_name)
        File.join(Dir.tmpdir, "#{table_name}.#{rand(1_000_000)}.csv")
      end
    end
  end
end
|
@@ -0,0 +1,68 @@
|
|
1
|
+
module Chicago
  module Flow
    # Drives a single stage of a pipeline: reads rows from a source,
    # runs each through the transformation chain, and writes the
    # results to the sink registered for the row's stream.
    #
    # @api public
    class PipelineStage
      # The chain built from the :transformations option.
      attr_reader :transformation_chain

      # @param options [Hash]
      #   :sinks - a Hash of stream name => sink (empty by default)
      #   :transformations - an Array of transformations, applied in
      #     order (empty by default)
      #   :error_handler - notified of validation failures; a
      #     RaisingErrorHandler by default
      def initialize(options={})
        @sinks           = options[:sinks] || {}
        @transformations = options[:transformations] || []
        @error_handler   = options[:error_handler] || RaisingErrorHandler.new
        @transformation_chain = TransformationChain.new(*@transformations)
      end

      # Looks up the sink registered under name; nil when there is
      # none. The name is converted to a symbol before the lookup.
      def sink(name)
        key = name.to_sym
        @sinks[key]
      end

      # All registered sinks.
      def sinks
        @sinks.values
      end

      # Registers a sink under name (converted to a symbol),
      # replacing any sink already held under that name.
      #
      # @return self, so registrations can be chained
      def register_sink(name, sink)
        key = name.to_sym
        @sinks[key] = sink
        self
      end

      # Reports any required-but-unregistered sinks to the error
      # handler. Does nothing when every required sink is present.
      def validate_pipeline
        return if unregistered_sinks.empty?
        @error_handler.unregistered_sinks(unregistered_sinks)
      end

      # Validates the stage, opens every sink, pipes all rows from
      # source through to the sinks, then closes every sink.
      #
      # @param source must respond to each, yielding rows
      def execute(source)
        validate_pipeline
        sinks.each {|endpoint| endpoint.open }
        pipe_rows_to_sinks_from(source)
        sinks.each {|endpoint| endpoint.close }
      end

      # The stream names that need a sink: every output stream of the
      # transformation chain, plus :default.
      def required_sinks
        (transformation_chain.output_streams + [:default]).uniq
      end

      # The required stream names that have no sink registered.
      def unregistered_sinks
        required_sinks.reject {|stream_name| @sinks.key?(stream_name) }
      end

      private

      # Sends each source row through the chain and on to its sink,
      # then does the same for any rows the chain held back.
      def pipe_rows_to_sinks_from(source)
        source.each do |input_row|
          transformation_chain.process(input_row).each do |output_row|
            process_row(output_row)
          end
        end
        transformation_chain.flush.each do |held_row|
          process_row(held_row)
        end
      end

      # Writes the row to the sink for its stream. The stream marker
      # is removed from the row first; rows without a marker go to
      # the :default sink.
      def process_row(row)
        target = @sinks[row.delete(:_stream) || :default]
        target << row
      end
    end
  end
end
|
@@ -0,0 +1,53 @@
|
|
1
|
+
module Chicago
  module Flow
    # Where rows end up after passing through a pipeline stage.
    #
    # Every operation here is a no-op; concrete sinks override the
    # ones they need.
    #
    # @api public
    # @abstract
    class Sink < PipelineEndpoint
      # The hash of values that are assumed to apply to every row.
      #
      # Subclasses should honour these constant values when writing
      # rows - by merging them into each row, or by otherwise making
      # sure they reach the final destination this sink represents.
      # The hash is created empty on first access.
      def constant_values
        @constant_values ||= {}
      end

      # Adds the given values to the constant values, overwriting any
      # already held under the same keys.
      #
      # @return self
      def set_constant_values(hash={})
        constant_values.update(hash)
        self
      end

      # Hook called before any rows are written to this sink.
      #
      # Does nothing by default; may be overridden by subclasses.
      def open; end

      # Hook called after all rows have been written to this sink.
      #
      # Does nothing by default; may be overridden by subclasses.
      def close; end

      # Writes a row to this sink.
      #
      # Does nothing by default; may be overridden by subclasses.
      def <<(row); end

      # Removes all rows from this sink, including rows written
      # before the current execution of a pipeline stage.
      #
      # Does nothing by default; should be overridden by subclasses.
      def truncate; end
    end
  end
end
|
@@ -0,0 +1,169 @@
|
|
1
|
+
module Chicago
  module Flow
    # The row key under which a row's stream is stored.
    #
    # @api private
    STREAM = :_stream

    # Base class for transformations of hash-like rows.
    #
    # A transformation is bound to a stream and either filters or
    # alters the rows on that stream; rows on other streams pass
    # through untouched.
    #
    # @api public
    # @abstract Subclass and add a process_row method
    class Transformation
      # Builds the transformation.
      #
      # Subclasses should not override this; a transformation that
      # needs arguments of its own should take them as named options.
      #
      # @overload initialize(stream, options)
      #   Binds the transformation to the given stream. The options
      #   are specific to the subclass.
      # @overload initialize(options)
      #   As above, with the stream taken to be :default
      # @raise [ArgumentError] if a required option is not supplied
      def initialize(*args)
        first, second = args

        if first.kind_of?(Hash)
          # A leading hash is the options; no stream was given.
          @stream, @options = :default, first
        else
          @stream  = first || :default
          @options = second || {}
        end

        ensure_options_present
      end

      # Declarations are held per class: each list lives on the class
      # it was declared on.
      class << self
        # The options that must be passed to the constructor.
        def required_options
          @required_options ||= []
        end

        # The fields this transformation adds to a row.
        def added_fields
          @added_fields ||= []
        end

        # The fields this transformation removes from a row.
        def removed_fields
          @removed_fields ||= []
        end

        # Declares options that must be passed to the constructor.
        def requires_options(*options)
          required_options.concat(options.flatten)
        end

        # Declares fields that this transformation adds to a row.
        def adds_fields(*fields)
          added_fields.concat(fields.flatten)
        end

        # Declares fields that this transformation removes from a row.
        def removes_fields(*fields)
          removed_fields.concat(fields.flatten)
        end
      end

      # The options that must be passed to the constructor, as
      # declared on this transformation's class.
      def required_options
        self.class.required_options
      end

      # The fields added to a row, as declared on this
      # transformation's class.
      def added_fields
        self.class.added_fields
      end

      # The fields removed from a row, as declared on this
      # transformation's class.
      def removed_fields
        self.class.removed_fields
      end

      # Given the fields present after this transformation, returns
      # the fields expected before it: the removed fields are put
      # back and the added fields taken away.
      def upstream_fields(fields)
        with_removed = fields + removed_fields
        (with_removed - added_fields).uniq
      end

      # Given the fields present before this transformation, returns
      # the fields present after it: the removed fields are taken
      # away and the added fields appended.
      def downstream_fields(fields)
        without_removed = fields - removed_fields
        (without_removed + added_fields).uniq
      end

      # Transforms the row when it is on this transformation's
      # stream; otherwise hands the row back unchanged.
      #
      # Subclasses should override process_row, not this method.
      #
      # @return Hash if a single row is returned
      # @return Array<Hash> if multiple rows need to be returned
      def process(row)
        return row unless applies_to_stream?(row[STREAM])
        process_row(row)
      end

      # Returns the rows still held by this transformation that have
      # yet to make their way through the pipeline.
      #
      # A subclass that holds rows back while it works (to find the
      # maximum value in a set of rows, for example) should override
      # this so that every row is eventually written through the
      # pipeline.
      #
      # @return Array<Hash> by default an empty array.
      def flush
        []
      end

      # The streams this transformation may write rows to.
      #
      # Only the :default stream unless a subclass overrides this.
      def output_streams
        [:default]
      end

      # Whether this transformation should be applied to a row on
      # the target stream.
      #
      # True when the transformation is bound to :all, when it is
      # bound to :default and the row carries no stream, or when the
      # row's stream is the one it is bound to.
      def applies_to_stream?(target_stream)
        return true if @stream == :all
        return true if target_stream.nil? && @stream == :default
        target_stream == @stream
      end

      protected

      # Transforms a single row.
      #
      # Returns the row untouched by default. Subclass overrides
      # should return nil, a Hash-like row, or an Array of Hash-like
      # rows.
      def process_row(row)
        row
      end

      # Marks the row as belonging to a stream and returns the row.
      # A nil stream leaves the row unmarked.
      #
      # Raises an error if the stream is not one of those declared by
      # output_streams.
      def assign_stream(row, stream)
        undeclared = !stream.nil? && !output_streams.include?(stream)
        raise "Stream not declared" if undeclared

        row[STREAM] = stream if stream
        row
      end

      private

      # Raises ArgumentError, naming the missing keys, unless every
      # required option was supplied.
      def ensure_options_present
        missing_keys = required_options - @options.keys
        return if missing_keys.empty?

        raise ArgumentError, "The following options are not supplied: " + missing_keys.join(",")
      end
    end
  end
end
|
@@ -0,0 +1,40 @@
|
|
1
|
+
module Chicago
  module Flow
    # An ordered list of transformations that rows are passed
    # through, one transformation after another.
    #
    # @api private
    class TransformationChain
      # @param transforms the transformations, in the order rows
      #   should pass through them
      def initialize(*transforms)
        @transforms = transforms
      end

      # Returns the union of the output streams of every
      # transformation in the chain; empty for an empty chain.
      def output_streams
        @transforms.inject([]) {|s, t| s | t.output_streams }
      end

      # Passes the row through each transformation in turn.
      #
      # A transformation may drop a row (by returning nil) or fan it
      # out into several rows; later transformations see whatever the
      # earlier ones produced.
      #
      # @return Array<Hash> the rows that come out of the chain
      def process(row)
        @transforms.inject([row]) do |rows, transform|
          process_rows(rows, transform)
        end
      end

      # Collects the rows still held back by the transformations.
      #
      # Rows flushed from one transformation are processed by every
      # transformation after it, before that transformation's own
      # held rows are appended.
      #
      # @return Array<Hash>
      def flush
        @transforms.inject([]) do |rows, transform|
          process_rows(rows, transform) + transform.flush
        end
      end

      # Given the fields present at the end of the chain, returns the
      # fields expected at its start.
      #
      # The chain is walked from the last transformation back to the
      # first, each one mapping the fields it emits to the fields it
      # needs. The running field list must be the first block
      # argument: inject yields (accumulator, element).
      def upstream_fields(fields)
        @transforms.reverse.inject(fields) do |required, transform|
          transform.upstream_fields(required)
        end
      end

      # Given the fields present at the start of the chain, returns
      # the fields present at its end, by applying each
      # transformation's field changes in order.
      def downstream_fields(fields)
        @transforms.inject(fields) do |available, transform|
          transform.downstream_fields(available)
        end
      end

      private

      # Runs every row through one transformation, flattening
      # multi-row results and discarding dropped (nil) rows.
      def process_rows(rows, transform)
        rows.map {|row| transform.process(row) }.flatten.compact
      end
    end
  end
end
|
data/spec/etl/batch_spec.rb
CHANGED
@@ -15,7 +15,8 @@ describe Chicago::ETL::Batch do
|
|
15
15
|
end
|
16
16
|
|
17
17
|
it "should set the start timestamp of the batch to now when created" do
|
18
|
-
ETL::Batch.instance.start.started_at.to_i
|
18
|
+
(ETL::Batch.instance.start.started_at.to_i - Time.now.to_i).abs.
|
19
|
+
should <= 5
|
19
20
|
end
|
20
21
|
|
21
22
|
it "should have a state of 'Started' when started" do
|
@@ -0,0 +1,13 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
# Covers the Hash core extensions: put and modify_existing.
describe Hash do
  it "should have a put method which returns the hash" do
    result = {}.put(:a, 1)
    result.should == {:a => 1}
  end

  it "should have a modify existing method that ignores nil values" do
    # A key holding nil is left as it is.
    with_nil = {:a => nil}.modify_existing(:a) {|value| 2 }
    with_nil.should == {:a => nil}

    # A key holding a value is replaced by the block's result.
    with_value = {:a => 1}.modify_existing(:a) {|value| 2 }
    with_value.should == {:a => 2}

    # A missing key is not added.
    without_key = {}.modify_existing(:a) {|value| 2 }
    without_key.should == {}
  end
end
|