chicago-etl 0.0.13 → 0.1.0

This diff shows the changes between publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (71)
  1. data/Gemfile +8 -3
  2. data/README.rdoc +4 -1
  3. data/VERSION +1 -1
  4. data/chicago-etl.gemspec +59 -22
  5. data/chicago-flow.gemspec +92 -0
  6. data/lib/chicago/etl/batch.rb +9 -2
  7. data/lib/chicago/etl/core_extensions.rb +12 -0
  8. data/lib/chicago/etl/counter.rb +8 -1
  9. data/lib/chicago/etl/dataset_batch_stage.rb +52 -0
  10. data/lib/chicago/etl/key_builder.rb +17 -39
  11. data/lib/chicago/etl/load_dataset_builder.rb +3 -1
  12. data/lib/chicago/etl/load_pipeline_stage_builder.rb +142 -0
  13. data/lib/chicago/etl/pipeline.rb +151 -0
  14. data/lib/chicago/etl/schema_table_sink_factory.rb +74 -0
  15. data/lib/chicago/etl/screens/column_screen.rb +26 -25
  16. data/lib/chicago/etl/screens/invalid_element.rb +5 -5
  17. data/lib/chicago/etl/screens/missing_value.rb +4 -2
  18. data/lib/chicago/etl/screens/out_of_bounds.rb +2 -0
  19. data/lib/chicago/etl/table_builder.rb +4 -2
  20. data/lib/chicago/etl/task_invocation.rb +0 -1
  21. data/lib/chicago/etl/transformations.rb +128 -0
  22. data/lib/chicago/etl.rb +39 -8
  23. data/lib/chicago/flow/array_sink.rb +35 -0
  24. data/lib/chicago/flow/array_source.rb +15 -0
  25. data/lib/chicago/flow/dataset_source.rb +23 -0
  26. data/lib/chicago/flow/errors.rb +14 -0
  27. data/lib/chicago/flow/filter.rb +15 -0
  28. data/lib/chicago/flow/mysql.rb +4 -0
  29. data/lib/chicago/{etl/mysql_load_file_value_transformer.rb → flow/mysql_file_serializer.rb} +7 -4
  30. data/lib/chicago/flow/mysql_file_sink.rb +68 -0
  31. data/lib/chicago/flow/null_sink.rb +8 -0
  32. data/lib/chicago/flow/pipeline_endpoint.rb +15 -0
  33. data/lib/chicago/flow/pipeline_stage.rb +68 -0
  34. data/lib/chicago/flow/sink.rb +53 -0
  35. data/lib/chicago/flow/transformation.rb +169 -0
  36. data/lib/chicago/flow/transformation_chain.rb +40 -0
  37. data/spec/etl/batch_spec.rb +2 -1
  38. data/spec/etl/core_extensions_spec.rb +13 -0
  39. data/spec/etl/dataset_batch_stage_spec.rb +55 -0
  40. data/spec/etl/key_builder_spec.rb +25 -83
  41. data/spec/etl/pipeline_stage_builder_spec.rb +39 -0
  42. data/spec/etl/schema_table_sink_factory_spec.rb +69 -0
  43. data/spec/etl/screens/invalid_element_spec.rb +10 -11
  44. data/spec/etl/screens/missing_value_spec.rb +21 -21
  45. data/spec/etl/screens/out_of_bounds_spec.rb +21 -29
  46. data/spec/etl/transformations_spec.rb +109 -0
  47. data/spec/flow/array_sink_spec.rb +26 -0
  48. data/spec/flow/array_source_spec.rb +20 -0
  49. data/spec/flow/dataset_source_spec.rb +15 -0
  50. data/spec/flow/filter_spec.rb +13 -0
  51. data/spec/flow/mysql_file_serializer_spec.rb +27 -0
  52. data/spec/flow/mysql_file_sink_spec.rb +94 -0
  53. data/spec/flow/mysql_integration_spec.rb +72 -0
  54. data/spec/flow/pipeline_stage_spec.rb +89 -0
  55. data/spec/flow/transformation_chain_spec.rb +76 -0
  56. data/spec/flow/transformation_spec.rb +91 -0
  57. data/spec/spec_helper.rb +5 -0
  58. metadata +135 -39
  59. data/lib/chicago/etl/buffering_insert_writer.rb +0 -36
  60. data/lib/chicago/etl/mysql_dumpfile.rb +0 -32
  61. data/lib/chicago/etl/screens/composite_screen.rb +0 -17
  62. data/lib/chicago/etl/sequel/load_data_infile.rb +0 -141
  63. data/lib/chicago/etl/sink.rb +0 -61
  64. data/lib/chicago/etl/transformations/add_insert_timestamp.rb +0 -16
  65. data/spec/etl/mysql_dumpfile_spec.rb +0 -42
  66. data/spec/etl/mysql_load_file_value_transformer_spec.rb +0 -27
  67. data/spec/etl/screens/composite_screen_spec.rb +0 -25
  68. data/spec/etl/sequel/load_data_infile_expression_spec.rb +0 -60
  69. data/spec/etl/sequel/load_data_infile_spec.rb +0 -37
  70. data/spec/etl/sink_spec.rb +0 -7
  71. data/spec/etl/transformations/add_insert_timestamp_spec.rb +0 -9
data/lib/chicago/etl/load_pipeline_stage_builder.rb
@@ -0,0 +1,142 @@
+module Chicago
+  module ETL
+    # Provides DSL methods for specifying the pipeline in an ETL
+    # stage.
+    #
+    # Clients will not normally instantiate this themselves but use it
+    # in the context of defining an ETL stage.
+    class LoadPipelineStageBuilder
+      # @api private
+      KeyMapping = Struct.new(:table, :field)
+
+      # The ordering of inbuilt transformation and screening steps.
+      TRANSFORMATION_ORDER = [:before_screens,
+                              :screens,
+                              :after_screens,
+                              :before_keys,
+                              :keys,
+                              :after_keys,
+                              :before_final,
+                              :final,
+                              :after_final
+                             ].freeze
+
+      # @api private
+      def initialize(db, schema_table)
+        @db = db
+        @schema_table = schema_table
+        @sink_factory = SchemaTableSinkFactory.new(@db, @schema_table)
+      end
+
+      # @api private
+      def build(&block)
+        @load_separately = []
+        @key_mappings = []
+        @transformations = {}
+        TRANSFORMATION_ORDER.each {|k| @transformations[k] = [] }
+        @ignore_present_rows = false
+
+        instance_eval(&block)
+
+        add_screens
+        add_key_transforms
+        add_final_transforms
+        pipeline_stage = create_pipeline_stage
+        register_additional_sinks(pipeline_stage)
+        pipeline_stage
+      end
+
+      protected
+
+      # Ignore rows already present in the target table, rather than
+      # replacing them.
+      def ignore_present_rows
+        @ignore_present_rows = true
+      end
+
+      # Specify columns that won't be loaded or screened as part of
+      # this pipeline stage.
+      def load_separately(*columns)
+        @load_separately += columns
+      end
+
+      # Add an additional key mapping.
+      def key_mapping(table, field)
+        @key_mappings << KeyMapping.new(table, field)
+      end
+
+      # Add a transformation before the specified point in the
+      # transformation chain (defined in TRANSFORMATION_ORDER).
+      def before(point_in_transformation_chain, transform)
+        key = "before_#{point_in_transformation_chain}".to_sym
+        @transformations[key] << transform
+      end
+
+      # Add a transformation after the specified point in the
+      # transformation chain (defined in TRANSFORMATION_ORDER).
+      def after(point_in_transformation_chain, transform)
+        key = "after_#{point_in_transformation_chain}".to_sym
+        @transformations[key] << transform
+      end
+
+      private
+
+      def create_pipeline_stage
+        default = @sink_factory.sink(:ignore => @ignore_present_rows,
+                                     :exclude => @load_separately)
+        key_sink = if @schema_table.kind_of?(Chicago::Schema::Dimension)
+                     @sink_factory.key_sink
+                   else
+                     # Facts have no key table to write to.
+                     Flow::NullSink.new
+                   end
+
+        Flow::PipelineStage.
+          new(:transformations => concat_transformations,
+              :sinks => {
+                :default => default,
+                :dimension_key => key_sink,
+                :error => @sink_factory.error_sink
+              })
+      end
+
+      def concat_transformations
+        TRANSFORMATION_ORDER.map {|k| @transformations[k] }.flatten
+      end
+
+      def register_additional_sinks(pipeline_stage)
+        @key_mappings.each do |mapping|
+          sink = @sink_factory.key_sink(:table => mapping.table)
+          pipeline_stage.register_sink(mapping.table, sink)
+        end
+      end
+
+      def add_screens
+        columns_to_screen = @schema_table.columns.reject do |column|
+          @load_separately.include?(column.name)
+        end
+
+        @transformations[:screens] = [Screens::MissingValue,
+                                      Screens::InvalidElement,
+                                      Screens::OutOfBounds].map do |klass|
+          klass.for_columns(columns_to_screen)
+        end.flatten
+      end
+
+      def add_key_transforms
+        @transformations[:keys] << Transformations::AddKey.
+          new(:key_builder => KeyBuilder.for_table(@schema_table, @db))
+
+        @key_mappings.each do |mapping|
+          @transformations[:keys] << Transformations::DimensionKeyMapping.
+            new(:original_key => mapping.field, :key_table => mapping.table)
+        end
+      end
+
+      def add_final_transforms
+        @transformations[:final] << Transformations::WrittenRowFilter.new(:key => :id)
+        @transformations[:final] << Transformations::DemultiplexErrors.new
+      end
+    end
+  end
+end
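For context, the DSL above is evaluated with instance_eval inside build, so a stage definition might look like the following sketch. The dimension, column and table names, and the custom transform, are hypothetical illustrations, not part of this diff:

    stage = Chicago::ETL::LoadPipelineStageBuilder.
      new(db, schema.dimension(:customer)).      # db: Sequel database, schema: Chicago schema
      build do
        ignore_present_rows                      # keep rows already in the target table
        load_separately :lifetime_value          # exclude from the main load and screening
        key_mapping :keys_legacy_customers, :legacy_id
        after :screens, CustomTransform.new      # hypothetical Flow::Transformation subclass
      end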
data/lib/chicago/etl/pipeline.rb
@@ -0,0 +1,151 @@
+module Chicago
+  module ETL
+    # An ETL pipeline.
+    class Pipeline
+      # Returns all defined dimension load tasks.
+      attr_reader :load_dimensions
+
+      # Returns all defined fact load tasks.
+      attr_reader :load_facts
+
+      # Creates a pipeline for a Schema.
+      def initialize(db, schema)
+        @schema, @db = schema, db
+        @load_dimensions = Chicago::Schema::NamedElementCollection.new
+        @load_facts = Chicago::Schema::NamedElementCollection.new
+      end
+
+      # Defines a dimension load stage.
+      def define_dimension_load(name, options={}, &block)
+        dimension_name = options[:dimension] || name
+        @load_dimensions << build_stage(name,
+                                        @schema.dimension(dimension_name),
+                                        &block)
+      end
+
+      # Defines a fact load stage.
+      def define_fact_load(name, options={}, &block)
+        fact_name = options[:fact] || name
+        @load_facts << build_stage(name, @schema.fact(fact_name), &block)
+      end
+
+      # Builds a stage, but does not define it.
+      def build_stage(name, schema_table, &block)
+        DatasetBatchStageBuilder.new(@db, schema_table).build(name, &block)
+      end
+    end
+
+    # Provides DSL methods for building a DatasetBatchStage.
+    #
+    # Clients shouldn't need to instantiate this directly, but instead
+    # call the protected methods in the context of defining a Pipeline.
+    class DatasetBatchStageBuilder
+      # @api private
+      def initialize(db, schema_table)
+        @db, @schema_table = db, schema_table
+      end
+
+      # @api private
+      def build(name, &block)
+        instance_eval(&block)
+        unless defined? @pipeline_stage
+          pipeline do
+          end
+        end
+        DatasetBatchStage.new(name, @dataset, @pipeline_stage,
+                              :filter_strategy => @filter_strategy,
+                              :truncate_pre_load => @truncate_pre_load)
+      end
+
+      protected
+
+      # Specifies that the sinks should be truncated before loading
+      # data.
+      def truncate_pre_load
+        @truncate_pre_load = true
+      end
+
+      # Specifies that the dataset should never be filtered to the ETL
+      # batch - i.e. it should behave as if reextract was always true.
+      def full_reload
+        @filter_strategy = lambda {|dataset, etl_batch| dataset }
+      end
+
+      # Define elements of the pipeline. See LoadPipelineStageBuilder
+      # for details.
+      def pipeline(&block)
+        @pipeline_stage = LoadPipelineStageBuilder.new(@db, @schema_table).
+          build(&block)
+      end
+
+      # Defines the dataset; see DatasetBuilder.
+      #
+      # The block must return a Sequel::Dataset.
+      def dataset(&block)
+        @dataset = DatasetBuilder.new(@db).build(&block)
+      end
+
+      # Define a custom filter strategy for filtering to an ETL batch.
+      def filter_strategy(&block)
+        @filter_strategy = block
+      end
+    end
+
+    # Provides convenience methods for defining source datasets.
+    class DatasetBuilder
+      attr_reader :db
+
+      # @api private
+      def initialize(db)
+        @db = db
+      end
+
+      # @api private
+      def build(&block)
+        instance_eval(&block)
+      end
+
+      protected
+
+      def key_field(field, name)
+        :if[{field => nil}, 1, field].as(name)
+      end
+
+      # Returns a column for use in a Sequel::Dataset#select method to
+      # return a dimension key.
+      #
+      # Takes care of using the key tables correctly, and dealing with
+      # missing dimension values.
+      def dimension_key(name)
+        key_field("keys_dimension_#{name}__dimension_id".to_sym,
+                  "#{name}_dimension_id".to_sym)
+      end
+
+      # Returns a column for use in a Sequel::Dataset#select method to
+      # return a date dimension key.
+      def date_dimension_column(dimension)
+        :if.sql_function({:id.qualify(dimension) => nil},
+                         1,
+                         :id.qualify(dimension)).
+          as("#{dimension}_dimension_id".to_sym)
+      end
+
+      # Rounds a monetary value to 2 decimal places.
+      #
+      # By default natural rounding is used; you can specify either
+      # :up or :down as the direction.
+      #
+      # @deprecated
+      def round(stmt, direction = :none)
+        case direction
+        when :none
+          :round.sql_function(stmt, 2)
+        when :up
+          :ceil.sql_function(stmt * 100) / 100
+        when :down
+          :floor.sql_function(stmt * 100) / 100
+        end
+      end
+    end
+  end
+end
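Putting the three builders together, a dimension load stage might be defined as in this sketch. The schema, table and column names are hypothetical; the dataset block must return a Sequel::Dataset, and dimension_key is the DatasetBuilder helper defined above:

    pipeline = Chicago::ETL::Pipeline.new(db, schema)

    pipeline.define_dimension_load(:customer) do
      dataset do
        db[:customers].select(:id.as(:original_id),
                              :name,
                              dimension_key(:country))
      end

      pipeline do
        load_separately :address
      end
    end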
data/lib/chicago/etl/schema_table_sink_factory.rb
@@ -0,0 +1,74 @@
+module Chicago
+  module ETL
+    # Builds Sinks for Dimension & Fact tables.
+    class SchemaTableSinkFactory
+      # Creates a new factory.
+      def initialize(db, schema_table)
+        @db, @schema_table = db, schema_table
+      end
+
+      # Returns a sink to load data into the MySQL table backing the
+      # schema table.
+      #
+      # Pass an :exclude option if you don't want all columns of the
+      # schema table to be loaded via this sink.
+      def sink(options={})
+        Flow::MysqlFileSink.new(@db,
+                                @schema_table.table_name,
+                                load_columns(options[:exclude]),
+                                mysql_options(options))
+      end
+
+      # Returns a sink to load data into the MySQL table backing the
+      # key table for a Dimension.
+      #
+      # @option options [Symbol] :table - a custom key table name. The
+      #   schema table's key table name will be used otherwise.
+      def key_sink(options={})
+        table = options.delete(:table) || @schema_table.key_table_name
+        sink = Flow::MysqlFileSink.new(@db,
+                                       table,
+                                       [:original_id, :dimension_id],
+                                       mysql_options(options))
+        sink.truncation_strategy = lambda do
+          # No Op - we want to maintain keys to avoid having to sort
+          # out fact tables.
+        end
+        sink
+      end
+
+      # Returns a sink to load errors generated in the ETL process.
+      def error_sink(options={})
+        sink = Flow::MysqlFileSink.
+          new(@db, :etl_error_log,
+              [:column, :row_id, :error, :severity, :error_detail], mysql_options(options)).
+          set_constant_values(:table => @schema_table.table_name.to_s,
+                              :process_name => "StandardTransformations",
+                              :process_version => 3,
+                              :logged_at => Time.now)

+        sink.truncation_strategy = lambda do
+          @db[:etl_error_log].
+            where(:table => @schema_table.table_name.to_s).delete
+        end
+        sink
+      end
+
+      private
+
+      def load_columns(exclude=nil)
+        exclude = [exclude].compact.flatten
+        [:id] + @schema_table.columns.
+          reject {|c| exclude.include?(c.name) }.
+          map {|c| c.database_name }
+      end
+
+      def mysql_options(options)
+        [:filepath, :ignore].inject({}) do |hsh, k|
+          hsh[k] = options[k] if options.has_key?(k)
+          hsh
+        end
+      end
+    end
+  end
+end
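Used on its own, the factory yields the three sinks that LoadPipelineStageBuilder wires up above. A sketch with a hypothetical dimension name:

    factory = Chicago::ETL::SchemaTableSinkFactory.new(db, schema.dimension(:product))

    main_sink  = factory.sink(:ignore => true, :exclude => [:internal_notes])
    key_sink   = factory.key_sink     # loads [:original_id, :dimension_id]
    error_sink = factory.error_sink   # loads rows into etl_error_log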
data/lib/chicago/etl/screens/column_screen.rb
@@ -1,53 +1,54 @@
 module Chicago
   module ETL
     module Screens
-      class ColumnScreen
-        attr_reader :column, :table_name
-
-        def initialize(table_name, column)
-          @table_name = table_name
-          @column = column
-          @error_name = self.class.name.split('::').last.sub(/Screen$/,'').titlecase
+      # @abstract
+      class ColumnScreen < Flow::Transformation
+        def self.for_columns(columns)
+          columns.map {|column|
+            new(:default, :column => column)
+          }
         end
 
-        def self.for_columns(table_name, columns)
-          screens = columns.map {|column| new(table_name, column) }
-          CompositeScreen.new(screens)
+        def output_streams
+          [:default, :error]
         end
 
-        def call(row, errors=[])
-          value = row[column.database_name]
-
-          if applies?(value)
+        def process_row(row)
+          if applies?(row[column.database_name])
             overwrite_value(row)
-            log_error(value, errors)
+            error_row = error(row[column.database_name])
+            if error_row
+              row[:_errors] ||= []
+              row[:_errors] << error_row
+            end
           end
 
-          [row, errors]
+          row
         end
 
         def severity
           1
         end
 
+        def column
+          @options[:column]
+        end
+
         private
 
-        def overwrite_value(row)
-          row[column.database_name] = column.default_value
+        def error_name
+          self.class.name.split('::').last.sub(/Screen$/,'').titlecase
        end
 
-        def log_error(value, errors)
-          errors << error_hash(value)
+        def overwrite_value(row)
+          row[column.database_name] = column.default_value
         end
 
-        def error_hash(value)
+        def error(value)
           {
-            :process_name => "StandardTransformations",
-            :process_version => 2,
-            :table => table_name.to_s,
             :column => column.database_name.to_s,
             :severity => severity,
-            :error => @error_name
+            :error => error_name
           }
         end
 
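The reworked contract is visible in process_row: instead of appending to a separate errors array, a screen now overwrites the bad value and stashes the error hash under the row's :_errors key, which DemultiplexErrors later moves to the error stream. A sketch, assuming column is a required, non-descriptive Chicago::Schema::Column named name:

    screen = Chicago::ETL::Screens::MissingValue.new(:default, :column => column)
    row = screen.process_row(:name => nil)
    row[:name]     # => the column's default value
    row[:_errors]  # => [{:column => "name", :severity => 2, :error => "Missing Value"}]

The :table, :process_name and :process_version fields formerly added by error_hash now come from the error sink's constant values (see SchemaTableSinkFactory#error_sink above).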
data/lib/chicago/etl/screens/invalid_element.rb
@@ -1,11 +1,11 @@
 module Chicago
   module ETL
     module Screens
+      # Transformation which checks that a field's value is one of
+      # the column's valid elements.
       class InvalidElement < ColumnScreen
-        def self.for_columns(table_name, columns)
-          screens = columns.select(&:elements).
-            map {|column| new(table_name, column) }
-          CompositeScreen.new(screens)
+        def self.for_columns(columns)
+          columns.select(&:elements).map {|column| new(:default, :column => column) }
         end
 
         def severity
@@ -17,7 +17,7 @@ module Chicago
           !column.elements.map(&:downcase).include?(value.to_s.downcase)
         end
 
-        def error_hash(value)
+        def error(value)
           super(value).
             merge(:error_detail => "'#{value}' is not a valid value.")
         end
data/lib/chicago/etl/screens/missing_value.rb
@@ -1,14 +1,16 @@
 module Chicago
   module ETL
     module Screens
+      # Screen which checks that a required field is present in the
+      # row.
       class MissingValue < ColumnScreen
         def severity
           column.descriptive? ? 1 : 2
         end
 
-        def log_error(value, errors)
+        def error(value)
           if ! (column.column_type == :boolean || column.optional?)
-            errors << error_hash(value)
+            super(value)
           end
         end
 
data/lib/chicago/etl/screens/out_of_bounds.rb
@@ -1,6 +1,8 @@
 module Chicago
   module ETL
     module Screens
+      # Screen which checks whether a column's value falls outside
+      # its defined bounds.
       class OutOfBounds < ColumnScreen
         def severity
           2
data/lib/chicago/etl/table_builder.rb
@@ -8,11 +8,13 @@ module Chicago
         new(db).build
       end
 
-      def initialize(db) # :nodoc:
+      # @api private
+      def initialize(db)
        @db = db
      end
 
-      def build # :nodoc:
+      # @api private
+      def build
        create_table :etl_batches do
          primary_key :id, :type => :integer, :unsigned => true
          timestamp :started_at, :null => false, :default => :current_timestamp.sql_function
data/lib/chicago/etl/task_invocation.rb
@@ -1,6 +1,5 @@
 module Chicago
   module ETL
-
     class TaskInvocation < Sequel::Model
       set_dataset :etl_task_invocations
       many_to_one :batch
data/lib/chicago/etl/transformations.rb
@@ -0,0 +1,128 @@
+module Chicago
+  module ETL
+    module Transformations
+      # Filters rows so they only get output once, based on a :key.
+      class WrittenRowFilter < Flow::Transformation
+        requires_options :key
+
+        def initialize(*args)
+          super(*args)
+          @written_rows = Set.new
+        end
+
+        def process_row(row)
+          key = row[key_field]
+          # puts "Checking on #{key}"
+          unless @written_rows.include?(key)
+            @written_rows << key
+            row
+          end
+        end
+
+        def key_field
+          @options[:key]
+        end
+      end
+
+      # Adds an :id field to a row, based on a KeyBuilder.
+      #
+      # Also adds this id as :row_id to any rows in an embedded
+      # :_errors field.
+      #
+      # Pass the :key_builder option to set the KeyBuilder.
+      class AddKey < Flow::Transformation
+        requires_options :key_builder
+        adds_fields :id
+
+        def output_streams
+          [:default, :dimension_key]
+        end
+
+        def process_row(row)
+          key, key_row = key_builder.key(row)
+          row[:id] = key
+          (row[:_errors] || []).each {|e| e[:row_id] = row[:id] }
+
+          if key_row
+            assign_stream(key_row, :dimension_key)
+            [row, key_row]
+          else
+            row
+          end
+        end
+
+        def key_builder
+          @options[:key_builder]
+        end
+      end
+
+      # Removes embedded :_errors and puts them on the error stream.
+      class DemultiplexErrors < Flow::Transformation
+        def output_streams
+          [:default, :error]
+        end
+
+        def process_row(row)
+          errors = (row.delete(:_errors) || []).each do |e|
+            assign_stream(e, :error)
+          end
+
+          [row] + errors
+        end
+      end
+
+      # Removes a field from the row, and creates a row on a
+      # designated key stream.
+      class DimensionKeyMapping < Flow::Transformation
+        requires_options :original_key, :key_table
+
+        def removed_fields
+          [original_key]
+        end
+
+        def output_streams
+          [:default, key_table]
+        end
+
+        def process_row(row)
+          key_row = {
+            :original_id => row.delete(original_key),
+            :dimension_id => row[:id]
+          }
+          assign_stream(key_row, key_table)
+          [row, key_row]
+        end
+
+        def original_key
+          @options[:original_key]
+        end
+
+        def key_table
+          @options[:key_table]
+        end
+      end
+
+      # Adds a hash of the specified columns as a field in the row.
+      class HashColumns < Flow::Transformation
+        requires_options :columns
+
+        def process_row(row)
+          str = hash_columns.map {|c| row[c].to_s }.join
+          row.put(output_field, Digest::MD5.hexdigest(str).upcase)
+        end
+
+        def added_fields
+          [output_field]
+        end
+
+        def output_field
+          @options[:output_field] || :hash
+        end
+
+        def hash_columns
+          @options[:columns]
+        end
+      end
+    end
+  end
+end
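These transformations are plain row-in, rows-out objects, which makes their behaviour easy to sketch. Rows are hashes, and returning nil drops the row; both constructor calls below mirror the ones used in the diff itself:

    filter = Chicago::ETL::Transformations::WrittenRowFilter.new(:key => :id)
    filter.process_row(:id => 1)   # => {:id => 1}  first sighting passes through
    filter.process_row(:id => 1)   # => nil         duplicate key is filtered out

    demux = Chicago::ETL::Transformations::DemultiplexErrors.new
    demux.process_row(:id => 2, :_errors => [{:error => "Missing Value"}])
    # => [{:id => 2}, plus the error hash, tagged for the :error stream]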