chicago-etl 0.0.13 → 0.1.0

Files changed (71)
  1. data/Gemfile +8 -3
  2. data/README.rdoc +4 -1
  3. data/VERSION +1 -1
  4. data/chicago-etl.gemspec +59 -22
  5. data/chicago-flow.gemspec +92 -0
  6. data/lib/chicago/etl/batch.rb +9 -2
  7. data/lib/chicago/etl/core_extensions.rb +12 -0
  8. data/lib/chicago/etl/counter.rb +8 -1
  9. data/lib/chicago/etl/dataset_batch_stage.rb +52 -0
  10. data/lib/chicago/etl/key_builder.rb +17 -39
  11. data/lib/chicago/etl/load_dataset_builder.rb +3 -1
  12. data/lib/chicago/etl/load_pipeline_stage_builder.rb +142 -0
  13. data/lib/chicago/etl/pipeline.rb +151 -0
  14. data/lib/chicago/etl/schema_table_sink_factory.rb +74 -0
  15. data/lib/chicago/etl/screens/column_screen.rb +26 -25
  16. data/lib/chicago/etl/screens/invalid_element.rb +5 -5
  17. data/lib/chicago/etl/screens/missing_value.rb +4 -2
  18. data/lib/chicago/etl/screens/out_of_bounds.rb +2 -0
  19. data/lib/chicago/etl/table_builder.rb +4 -2
  20. data/lib/chicago/etl/task_invocation.rb +0 -1
  21. data/lib/chicago/etl/transformations.rb +128 -0
  22. data/lib/chicago/etl.rb +39 -8
  23. data/lib/chicago/flow/array_sink.rb +35 -0
  24. data/lib/chicago/flow/array_source.rb +15 -0
  25. data/lib/chicago/flow/dataset_source.rb +23 -0
  26. data/lib/chicago/flow/errors.rb +14 -0
  27. data/lib/chicago/flow/filter.rb +15 -0
  28. data/lib/chicago/flow/mysql.rb +4 -0
  29. data/lib/chicago/{etl/mysql_load_file_value_transformer.rb → flow/mysql_file_serializer.rb} +7 -4
  30. data/lib/chicago/flow/mysql_file_sink.rb +68 -0
  31. data/lib/chicago/flow/null_sink.rb +8 -0
  32. data/lib/chicago/flow/pipeline_endpoint.rb +15 -0
  33. data/lib/chicago/flow/pipeline_stage.rb +68 -0
  34. data/lib/chicago/flow/sink.rb +53 -0
  35. data/lib/chicago/flow/transformation.rb +169 -0
  36. data/lib/chicago/flow/transformation_chain.rb +40 -0
  37. data/spec/etl/batch_spec.rb +2 -1
  38. data/spec/etl/core_extensions_spec.rb +13 -0
  39. data/spec/etl/dataset_batch_stage_spec.rb +55 -0
  40. data/spec/etl/key_builder_spec.rb +25 -83
  41. data/spec/etl/pipeline_stage_builder_spec.rb +39 -0
  42. data/spec/etl/schema_table_sink_factory_spec.rb +69 -0
  43. data/spec/etl/screens/invalid_element_spec.rb +10 -11
  44. data/spec/etl/screens/missing_value_spec.rb +21 -21
  45. data/spec/etl/screens/out_of_bounds_spec.rb +21 -29
  46. data/spec/etl/transformations_spec.rb +109 -0
  47. data/spec/flow/array_sink_spec.rb +26 -0
  48. data/spec/flow/array_source_spec.rb +20 -0
  49. data/spec/flow/dataset_source_spec.rb +15 -0
  50. data/spec/flow/filter_spec.rb +13 -0
  51. data/spec/flow/mysql_file_serializer_spec.rb +27 -0
  52. data/spec/flow/mysql_file_sink_spec.rb +94 -0
  53. data/spec/flow/mysql_integration_spec.rb +72 -0
  54. data/spec/flow/pipeline_stage_spec.rb +89 -0
  55. data/spec/flow/transformation_chain_spec.rb +76 -0
  56. data/spec/flow/transformation_spec.rb +91 -0
  57. data/spec/spec_helper.rb +5 -0
  58. metadata +135 -39
  59. data/lib/chicago/etl/buffering_insert_writer.rb +0 -36
  60. data/lib/chicago/etl/mysql_dumpfile.rb +0 -32
  61. data/lib/chicago/etl/screens/composite_screen.rb +0 -17
  62. data/lib/chicago/etl/sequel/load_data_infile.rb +0 -141
  63. data/lib/chicago/etl/sink.rb +0 -61
  64. data/lib/chicago/etl/transformations/add_insert_timestamp.rb +0 -16
  65. data/spec/etl/mysql_dumpfile_spec.rb +0 -42
  66. data/spec/etl/mysql_load_file_value_transformer_spec.rb +0 -27
  67. data/spec/etl/screens/composite_screen_spec.rb +0 -25
  68. data/spec/etl/sequel/load_data_infile_expression_spec.rb +0 -60
  69. data/spec/etl/sequel/load_data_infile_spec.rb +0 -37
  70. data/spec/etl/sink_spec.rb +0 -7
  71. data/spec/etl/transformations/add_insert_timestamp_spec.rb +0 -9
data/lib/chicago/etl/load_pipeline_stage_builder.rb
@@ -0,0 +1,142 @@
+module Chicago
+  module ETL
+    # Provides DSL methods for specifying the pipeline in an ETL
+    # stage.
+    #
+    # Clients will not normally instantiate this themselves but use it
+    # in the context of defining an ETL stage.
+    class LoadPipelineStageBuilder
+      # @api private
+      KeyMapping = Struct.new(:table, :field)
+
+      # The ordering of inbuilt transformation and screening steps.
+      TRANSFORMATION_ORDER = [:before_screens,
+                              :screens,
+                              :after_screens,
+                              :before_keys,
+                              :keys,
+                              :after_keys,
+                              :before_final,
+                              :final,
+                              :after_final
+                             ].freeze
+
+      # @api private
+      def initialize(db, schema_table)
+        @db = db
+        @schema_table = schema_table
+        @sink_factory = SchemaTableSinkFactory.new(@db, @schema_table)
+      end
+
+      # @api private
+      def build(&block)
+        @load_separately = []
+        @key_mappings = []
+        @transformations = {}
+        TRANSFORMATION_ORDER.each {|k| @transformations[k] = [] }
+        @ignore_present_rows = false
+
+        instance_eval &block
+
+        add_screens
+        add_key_transforms
+        add_final_transforms
+        pipeline_stage = create_pipeline_stage
+        register_additional_sinks(pipeline_stage)
+        pipeline_stage
+      end
+
+      protected
+
+      # Ignore rows already present in the target table, rather than
+      # replacing them.
+      def ignore_present_rows
+        @ignore_present_rows = true
+      end
+
+      # Specify columns that won't be loaded or screened as part of
+      # this pipeline stage.
+      def load_separately(*columns)
+        @load_separately += columns
+      end
+
+      # Add an additional key mapping.
+      def key_mapping(table, field)
+        @key_mappings << KeyMapping.new(table, field)
+      end
+
+      # Add a transformation before the specified point in the
+      # transformation chain (defined in TRANSFORMATION_ORDER).
+      def before(point_in_transformation_chain, transform)
+        key = "before_#{point_in_transformation_chain}".to_sym
+        @transformations[key] << transform
+      end
+
+      # Add a transformation after the specified point in the
+      # transformation chain (defined in TRANSFORMATION_ORDER).
+      def after(point_in_transformation_chain, transform)
+        key = "after_#{point_in_transformation_chain}".to_sym
+        @transformations[key] << transform
+      end
+
+      private
+
+      def create_pipeline_stage
+        default = @sink_factory.sink(:ignore => @ignore_present_rows,
+                                     :exclude => @load_separately)
+        key_sink = if @schema_table.kind_of?(Chicago::Schema::Dimension)
+                     @sink_factory.key_sink
+                   else
+                     # Facts have no key table to write to.
+                     Flow::NullSink.new
+                   end
+
+        Flow::PipelineStage.
+          new(:transformations => concat_transformations,
+              :sinks => {
+                :default => default,
+                :dimension_key => key_sink,
+                :error => @sink_factory.error_sink
+              })
+      end
+
+      def concat_transformations
+        TRANSFORMATION_ORDER.map {|k| @transformations[k] }.flatten
+      end
+
+      def register_additional_sinks(pipeline_stage)
+        @key_mappings.each do |mapping|
+          sink = @sink_factory.key_sink(:table => mapping.table)
+          pipeline_stage.register_sink(mapping.table, sink)
+        end
+      end
+
+      def add_screens
+        columns_to_screen = @schema_table.columns.reject do |column|
+          @load_separately.include?(column.name)
+        end
+
+        @transformations[:screens] = [Screens::MissingValue,
+                                      Screens::InvalidElement,
+                                      Screens::OutOfBounds].map do |klass|
+          klass.for_columns(columns_to_screen)
+        end.flatten
+      end
+
+      def add_key_transforms
+        @transformations[:keys] << Transformations::AddKey.
+          new(:key_builder => KeyBuilder.for_table(@schema_table, @db))
+
+        @key_mappings.each do |mapping|
+          @transformations[:keys] << Transformations::DimensionKeyMapping.
+            new(:original_key => mapping.field, :key_table => mapping.table)
+        end
+      end
+
+      def add_final_transforms
+        @transformations[:final] << Transformations::WrittenRowFilter.new(:key => :id)
+        @transformations[:final] << Transformations::DemultiplexErrors.new
+      end
+    end
+  end
+end
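
These DSL methods are evaluated with instance_eval inside a stage's pipeline block. A minimal sketch of a stage body using the hooks above, assuming a hypothetical UpcaseName transform and keys_product key table (all names illustrative):

    pipeline do
      ignore_present_rows                 # INSERT IGNORE rather than replace
      load_separately :long_description   # don't load or screen this column here
      key_mapping :keys_product, :original_product_id

      # Slot custom transformations in relative to TRANSFORMATION_ORDER points.
      before :screens, UpcaseName.new
      after  :keys,    AuditTrail.new
    end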
data/lib/chicago/etl/pipeline.rb
@@ -0,0 +1,151 @@
+module Chicago
+  module ETL
+    # An ETL pipeline.
+    class Pipeline
+      # Returns all defined dimension load tasks.
+      attr_reader :load_dimensions
+
+      # Returns all defined fact load tasks.
+      attr_reader :load_facts
+
+      # Creates a pipeline for a Schema.
+      def initialize(db, schema)
+        @schema, @db = schema, db
+        @load_dimensions = Chicago::Schema::NamedElementCollection.new
+        @load_facts = Chicago::Schema::NamedElementCollection.new
+      end
+
+      # Defines a dimension load stage.
+      def define_dimension_load(name, options={}, &block)
+        dimension_name = options[:dimension] || name
+        @load_dimensions << build_stage(name,
+                                        @schema.dimension(dimension_name),
+                                        &block)
+      end
+
+      # Defines a fact load stage.
+      def define_fact_load(name, options={}, &block)
+        fact_name = options[:fact] || name
+        @load_facts << build_stage(name, @schema.fact(fact_name), &block)
+      end
+
+      # Builds a stage, but does not define it.
+      def build_stage(name, schema_table, &block)
+        DatasetBatchStageBuilder.new(@db, schema_table).build(name, &block)
+      end
+    end
+
+    # Provides DSL methods for building a DatasetBatchStage.
+    #
+    # Clients shouldn't need to instantiate this directly, but instead
+    # call the protected methods in the context of defining a Pipeline.
+    class DatasetBatchStageBuilder
+      # @api private
+      def initialize(db, schema_table)
+        @db, @schema_table = db, schema_table
+      end
+
+      # @api private
+      def build(name, &block)
+        instance_eval &block
+        unless defined? @pipeline_stage
+          pipeline do
+          end
+        end
+        DatasetBatchStage.new(name, @dataset, @pipeline_stage,
+                              :filter_strategy => @filter_strategy,
+                              :truncate_pre_load => @truncate_pre_load)
+      end
+
+      protected
+
+      # Specifies that the sinks should be truncated before loading
+      # data.
+      def truncate_pre_load
+        @truncate_pre_load = true
+      end
+
+      # Specifies that the dataset should never be filtered to the ETL
+      # batch - i.e. it should behave as if reextract was always true.
+      def full_reload
+        @filter_strategy = lambda {|dataset, etl_batch| dataset }
+      end
+
+      # Define elements of the pipeline. See LoadPipelineStageBuilder
+      # for details.
+      def pipeline(&block)
+        @pipeline_stage = LoadPipelineStageBuilder.new(@db, @schema_table).
+          build(&block)
+      end
+
+      # Defines the dataset; see DatasetBuilder.
+      #
+      # The block must return a Sequel::Dataset.
+      def dataset(&block)
+        @dataset = DatasetBuilder.new(@db).build(&block)
+      end
+
+      # Define a custom filter strategy for filtering to an ETL batch.
+      def filter_strategy(&block)
+        @filter_strategy = block
+      end
+    end
+
+    # Provides convenience methods for defining source datasets.
+    class DatasetBuilder
+      attr_reader :db
+
+      # @api private
+      def initialize(db)
+        @db = db
+      end
+
+      # @api private
+      def build(&block)
+        instance_eval(&block)
+      end
+
+      protected
+
+      def key_field(field, name)
+        :if[{field => nil}, 1, field].as(name)
+      end
+
+      # Returns a column for use in a Sequel::Dataset#select method to
+      # return a dimension key.
+      #
+      # Takes care of using the key tables correctly, and dealing with
+      # missing dimension values.
+      def dimension_key(name)
+        key_field("keys_dimension_#{name}__dimension_id".to_sym,
+                  "#{name}_dimension_id".to_sym)
+      end
+
+      # Returns a column for use in a Sequel::Dataset#select method to
+      # return a date dimension key.
+      def date_dimension_column(dimension)
+        :if.sql_function({:id.qualify(dimension) => nil},
+                         1,
+                         :id.qualify(dimension)).
+          as("#{dimension}_dimension_id".to_sym)
+      end
+
+      # Rounds a monetary value to 2 decimal places.
+      #
+      # By default, natural rounding is used; you can specify either
+      # :up or :down as the direction.
+      #
+      # @deprecated
+      def round(stmt, direction = :none)
+        case direction
+        when :none
+          :round.sql_function(stmt, 2)
+        when :up
+          :ceil.sql_function(stmt * 100) / 100
+        when :down
+          :floor.sql_function(stmt * 100) / 100
+        end
+      end
+    end
+  end
+end
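
A sketch of how Pipeline, DatasetBatchStageBuilder and DatasetBuilder fit together, assuming db is a Sequel database and schema defines a product dimension (table and column names illustrative):

    pipeline = Chicago::ETL::Pipeline.new(db, schema)

    pipeline.define_dimension_load(:product) do
      dataset do
        db[:source_products].
          select(:id___original_product_id, :name,
                 dimension_key(:manufacturer))
      end

      pipeline do
        key_mapping :keys_product, :original_product_id
      end
    end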
data/lib/chicago/etl/schema_table_sink_factory.rb
@@ -0,0 +1,74 @@
+module Chicago
+  module ETL
+    # Builds Sinks for Dimension & Fact tables.
+    class SchemaTableSinkFactory
+      # Creates a new factory.
+      def initialize(db, schema_table)
+        @db, @schema_table = db, schema_table
+      end
+
+      # Returns a sink to load data into the MySQL table backing the
+      # schema table.
+      #
+      # Pass an :exclude option if you don't want all columns of the
+      # schema table to be loaded via this sink.
+      def sink(options={})
+        Flow::MysqlFileSink.new(@db,
+                                @schema_table.table_name,
+                                load_columns(options[:exclude]),
+                                mysql_options(options))
+      end
+
+      # Returns a sink to load data into the MySQL table backing the
+      # key table for a Dimension.
+      #
+      # @option options [Symbol] :table - a custom key table name. The
+      #   schema table's key table name will be used otherwise.
+      def key_sink(options={})
+        table = options.delete(:table) || @schema_table.key_table_name
+        sink = Flow::MysqlFileSink.new(@db,
+                                       table,
+                                       [:original_id, :dimension_id],
+                                       mysql_options(options))
+        sink.truncation_strategy = lambda do
+          # No op - we want to maintain keys to avoid having to sort
+          # out fact tables.
+        end
+        sink
+      end
+
+      # Returns a sink to load errors generated in the ETL process.
+      def error_sink(options={})
+        sink = Flow::MysqlFileSink.
+          new(@db, :etl_error_log,
+              [:column, :row_id, :error, :severity, :error_detail],
+              mysql_options(options)).
+          set_constant_values(:table => @schema_table.table_name.to_s,
+                              :process_name => "StandardTransformations",
+                              :process_version => 3,
+                              :logged_at => Time.now)

+        sink.truncation_strategy = lambda do
+          @db[:etl_error_log].
+            where(:table => @schema_table.table_name.to_s).delete
+        end
+        sink
+      end
+
+      private
+
+      def load_columns(exclude=nil)
+        exclude = [exclude].compact.flatten
+        [:id] + @schema_table.columns.
+          reject {|c| exclude.include?(c.name) }.
+          map {|c| c.database_name }
+      end
+
+      def mysql_options(options)
+        [:filepath, :ignore].inject({}) do |hsh, k|
+          hsh[k] = options[k] if options.has_key?(k)
+          hsh
+        end
+      end
+    end
+  end
+end
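
A sketch of the sinks a dimension load stage gets from this factory, assuming db is a Sequel database and dimension is a Chicago::Schema::Dimension (option values illustrative):

    factory = Chicago::ETL::SchemaTableSinkFactory.new(db, dimension)

    main_sink  = factory.sink(:ignore => true, :exclude => [:long_description])
    key_sink   = factory.key_sink    # writes [:original_id, :dimension_id] rows
    error_sink = factory.error_sink  # writes screen errors to etl_error_log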
data/lib/chicago/etl/screens/column_screen.rb
@@ -1,53 +1,54 @@
 module Chicago
   module ETL
     module Screens
-      class ColumnScreen
-        attr_reader :column, :table_name
-
-        def initialize(table_name, column)
-          @table_name = table_name
-          @column = column
-          @error_name = self.class.name.split('::').last.sub(/Screen$/,'').titlecase
+      # @abstract
+      class ColumnScreen < Flow::Transformation
+        def self.for_columns(columns)
+          columns.map {|column|
+            new(:default, :column => column)
+          }
         end
 
-        def self.for_columns(table_name, columns)
-          screens = columns.map {|column| new(table_name, column) }
-          CompositeScreen.new(screens)
+        def output_streams
+          [:default, :error]
         end
 
-        def call(row, errors=[])
-          value = row[column.database_name]
-
-          if applies?(value)
+        def process_row(row)
+          if applies?(row[column.database_name])
             overwrite_value(row)
-            log_error(value, errors)
+            error_row = error(row[column.database_name])
+            if error_row
+              row[:_errors] ||= []
+              row[:_errors] << error_row
+            end
           end
 
-          [row, errors]
+          row
         end
 
         def severity
           1
         end
 
+        def column
+          @options[:column]
+        end
+
         private
 
-        def overwrite_value(row)
-          row[column.database_name] = column.default_value
+        def error_name
+          self.class.name.split('::').last.sub(/Screen$/,'').titlecase
        end
 
-        def log_error(value, errors)
-          errors << error_hash(value)
+        def overwrite_value(row)
+          row[column.database_name] = column.default_value
        end
 
-        def error_hash(value)
+        def error(value)
          {
-            :process_name => "StandardTransformations",
-            :process_version => 2,
-            :table => table_name.to_s,
            :column => column.database_name.to_s,
            :severity => severity,
-            :error => @error_name
+            :error => error_name
          }
        end
 
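
The net effect of the rewrite: screens are now ordinary Flow transformations. A sketch of the new contract, assuming a required name column (names illustrative; severity depends on the screen and column):

    screen = Chicago::ETL::Screens::MissingValue.new(:default, :column => name_column)
    row = screen.process_row(:name => nil)
    # The missing value is overwritten with the column default, and the
    # error hash rides along under :_errors until DemultiplexErrors
    # routes it to the :error stream:
    row[:_errors]  # => [{:column => "name", :severity => ..., :error => "Missing Value"}]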
data/lib/chicago/etl/screens/invalid_element.rb
@@ -1,11 +1,11 @@
 module Chicago
   module ETL
     module Screens
+      # Transformation which checks to see if a field's value is in a
+      # column's elements.
       class InvalidElement < ColumnScreen
-        def self.for_columns(table_name, columns)
-          screens = columns.select(&:elements).
-            map {|column| new(table_name, column) }
-          CompositeScreen.new(screens)
+        def self.for_columns(columns)
+          columns.select(&:elements).map {|column| new(:default, :column => column) }
         end
 
         def severity
@@ -17,7 +17,7 @@ module Chicago
           !column.elements.map(&:downcase).include?(value.to_s.downcase)
         end
 
-        def error_hash(value)
+        def error(value)
           super(value).
             merge(:error_detail => "'#{value}' is not a valid value.")
         end
data/lib/chicago/etl/screens/missing_value.rb
@@ -1,14 +1,16 @@
 module Chicago
   module ETL
     module Screens
+      # Screen which checks to see if a field is present in the row
+      # when required.
       class MissingValue < ColumnScreen
         def severity
           column.descriptive? ? 1 : 2
         end
 
-        def log_error(value, errors)
+        def error(value)
           if ! (column.column_type == :boolean || column.optional?)
-            errors << error_hash(value)
+            super(value)
           end
         end
 
data/lib/chicago/etl/screens/out_of_bounds.rb
@@ -1,6 +1,8 @@
 module Chicago
   module ETL
     module Screens
+      # Screen which checks to see if a column's value is out of
+      # defined bounds.
       class OutOfBounds < ColumnScreen
         def severity
           2
data/lib/chicago/etl/table_builder.rb
@@ -8,11 +8,13 @@ module Chicago
       new(db).build
     end
 
-    def initialize(db) # :nodoc:
+    # @api private
+    def initialize(db)
       @db = db
     end
 
-    def build # :nodoc:
+    # @api private
+    def build
       create_table :etl_batches do
         primary_key :id, :type => :integer, :unsigned => true
         timestamp :started_at, :null => false, :default => :current_timestamp.sql_function
data/lib/chicago/etl/task_invocation.rb
@@ -1,6 +1,5 @@
 module Chicago
   module ETL
-
     class TaskInvocation < Sequel::Model
       set_dataset :etl_task_invocations
       many_to_one :batch
data/lib/chicago/etl/transformations.rb
@@ -0,0 +1,128 @@
+module Chicago
+  module ETL
+    module Transformations
+      # Filters rows so they only get output once, based on a :key.
+      class WrittenRowFilter < Flow::Transformation
+        requires_options :key
+
+        def initialize(*args)
+          super(*args)
+          @written_rows = Set.new
+        end
+
+        def process_row(row)
+          key = row[key_field]
+          unless @written_rows.include?(key)
+            @written_rows << key
+            row
+          end
+        end
+
+        def key_field
+          @options[:key]
+        end
+      end
+
+      # Adds an :id field to a row, based on a KeyBuilder.
+      #
+      # Also adds this id as :row_id to any rows in an embedded
+      # :_errors field.
+      #
+      # Pass the :key_builder option to set the KeyBuilder.
+      class AddKey < Flow::Transformation
+        requires_options :key_builder
+        adds_fields :id
+
+        def output_streams
+          [:default, :dimension_key]
+        end
+
+        def process_row(row)
+          key, key_row = key_builder.key(row)
+          row[:id] = key
+          (row[:_errors] || []).each {|e| e[:row_id] = row[:id] }
+
+          if key_row
+            assign_stream(key_row, :dimension_key)
+            [row, key_row]
+          else
+            row
+          end
+        end
+
+        def key_builder
+          @options[:key_builder]
+        end
+      end
+
+      # Removes embedded :_errors and puts them on the error stream.
+      class DemultiplexErrors < Flow::Transformation
+        def output_streams
+          [:default, :error]
+        end
+
+        def process_row(row)
+          errors = (row.delete(:_errors) || []).each do |e|
+            assign_stream(e, :error)
+          end
+
+          [row] + errors
+        end
+      end
+
+      # Removes a field from the row, and creates a row on a
+      # designated key stream.
+      class DimensionKeyMapping < Flow::Transformation
+        requires_options :original_key, :key_table
+
+        def removed_fields
+          [original_key]
+        end
+
+        def output_streams
+          [:default, key_table]
+        end
+
+        def process_row(row)
+          key_row = {
+            :original_id => row.delete(original_key),
+            :dimension_id => row[:id]
+          }
+          assign_stream(key_row, key_table)
+          [row, key_row]
+        end
+
+        def original_key
+          @options[:original_key]
+        end
+
+        def key_table
+          @options[:key_table]
+        end
+      end
+
+      # Adds a hash of the specified columns as a field in the row.
+      class HashColumns < Flow::Transformation
+        requires_options :columns
+
+        def process_row(row)
+          str = hash_columns.map {|c| row[c].to_s }.join
+          row.put(output_field, Digest::MD5.hexdigest(str).upcase)
+        end
+
+        def added_fields
+          [output_field]
+        end
+
+        def output_field
+          @options[:output_field] || :hash
+        end
+
+        def hash_columns
+          @options[:columns]
+        end
+      end
+    end
+  end
+end
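
Of these, HashColumns is the one most likely to appear directly in a stage definition; WrittenRowFilter, AddKey, DimensionKeyMapping and DemultiplexErrors are wired in automatically by LoadPipelineStageBuilder. A sketch of slotting it into a pipeline (column names illustrative):

    pipeline do
      before :screens, Chicago::ETL::Transformations::HashColumns.
        new(:columns => [:first_name, :last_name], :output_field => :name_hash)
    end
    # Each row gains :name_hash, the upcased MD5 hex digest of the
    # concatenated column values.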