factorylabs-activewarehouse-etl 0.9.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84)
  1. data/CHANGELOG +198 -0
  2. data/LICENSE +7 -0
  3. data/README +85 -0
  4. data/Rakefile +153 -0
  5. data/TODO +28 -0
  6. data/bin/etl +28 -0
  7. data/bin/etl.cmd +8 -0
  8. data/examples/database.example.yml +16 -0
  9. data/lib/etl.rb +78 -0
  10. data/lib/etl/batch.rb +2 -0
  11. data/lib/etl/batch/batch.rb +111 -0
  12. data/lib/etl/batch/directives.rb +55 -0
  13. data/lib/etl/builder.rb +2 -0
  14. data/lib/etl/builder/date_dimension_builder.rb +96 -0
  15. data/lib/etl/builder/time_dimension_builder.rb +31 -0
  16. data/lib/etl/commands/etl.rb +89 -0
  17. data/lib/etl/control.rb +3 -0
  18. data/lib/etl/control/control.rb +405 -0
  19. data/lib/etl/control/destination.rb +420 -0
  20. data/lib/etl/control/destination/database_destination.rb +95 -0
  21. data/lib/etl/control/destination/file_destination.rb +124 -0
  22. data/lib/etl/control/source.rb +109 -0
  23. data/lib/etl/control/source/database_source.rb +220 -0
  24. data/lib/etl/control/source/enumerable_source.rb +11 -0
  25. data/lib/etl/control/source/file_source.rb +90 -0
  26. data/lib/etl/control/source/model_source.rb +39 -0
  27. data/lib/etl/core_ext.rb +1 -0
  28. data/lib/etl/core_ext/time.rb +5 -0
  29. data/lib/etl/core_ext/time/calculations.rb +42 -0
  30. data/lib/etl/engine.rb +556 -0
  31. data/lib/etl/execution.rb +20 -0
  32. data/lib/etl/execution/base.rb +9 -0
  33. data/lib/etl/execution/batch.rb +8 -0
  34. data/lib/etl/execution/job.rb +8 -0
  35. data/lib/etl/execution/migration.rb +85 -0
  36. data/lib/etl/execution/record.rb +18 -0
  37. data/lib/etl/generator.rb +2 -0
  38. data/lib/etl/generator/generator.rb +20 -0
  39. data/lib/etl/generator/surrogate_key_generator.rb +39 -0
  40. data/lib/etl/http_tools.rb +139 -0
  41. data/lib/etl/parser.rb +11 -0
  42. data/lib/etl/parser/apache_combined_log_parser.rb +49 -0
  43. data/lib/etl/parser/delimited_parser.rb +74 -0
  44. data/lib/etl/parser/fixed_width_parser.rb +65 -0
  45. data/lib/etl/parser/parser.rb +41 -0
  46. data/lib/etl/parser/sax_parser.rb +218 -0
  47. data/lib/etl/parser/xml_parser.rb +65 -0
  48. data/lib/etl/processor.rb +11 -0
  49. data/lib/etl/processor/block_processor.rb +14 -0
  50. data/lib/etl/processor/bulk_import_processor.rb +81 -0
  51. data/lib/etl/processor/check_exist_processor.rb +80 -0
  52. data/lib/etl/processor/check_unique_processor.rb +35 -0
  53. data/lib/etl/processor/copy_field_processor.rb +26 -0
  54. data/lib/etl/processor/encode_processor.rb +55 -0
  55. data/lib/etl/processor/hierarchy_exploder_processor.rb +55 -0
  56. data/lib/etl/processor/print_row_processor.rb +12 -0
  57. data/lib/etl/processor/processor.rb +25 -0
  58. data/lib/etl/processor/rename_processor.rb +24 -0
  59. data/lib/etl/processor/require_non_blank_processor.rb +26 -0
  60. data/lib/etl/processor/row_processor.rb +17 -0
  61. data/lib/etl/processor/sequence_processor.rb +23 -0
  62. data/lib/etl/processor/surrogate_key_processor.rb +53 -0
  63. data/lib/etl/processor/truncate_processor.rb +35 -0
  64. data/lib/etl/row.rb +20 -0
  65. data/lib/etl/screen.rb +14 -0
  66. data/lib/etl/screen/row_count_screen.rb +20 -0
  67. data/lib/etl/transform.rb +2 -0
  68. data/lib/etl/transform/block_transform.rb +13 -0
  69. data/lib/etl/transform/date_to_string_transform.rb +20 -0
  70. data/lib/etl/transform/decode_transform.rb +51 -0
  71. data/lib/etl/transform/default_transform.rb +20 -0
  72. data/lib/etl/transform/foreign_key_lookup_transform.rb +151 -0
  73. data/lib/etl/transform/hierarchy_lookup_transform.rb +49 -0
  74. data/lib/etl/transform/ordinalize_transform.rb +12 -0
  75. data/lib/etl/transform/sha1_transform.rb +13 -0
  76. data/lib/etl/transform/string_to_date_transform.rb +16 -0
  77. data/lib/etl/transform/string_to_datetime_transform.rb +14 -0
  78. data/lib/etl/transform/string_to_time_transform.rb +11 -0
  79. data/lib/etl/transform/transform.rb +61 -0
  80. data/lib/etl/transform/trim_transform.rb +26 -0
  81. data/lib/etl/transform/type_transform.rb +35 -0
  82. data/lib/etl/util.rb +59 -0
  83. data/lib/etl/version.rb +9 -0
  84. metadata +195 -0
@@ -0,0 +1,3 @@
1
+ require 'etl/control/control'
2
+ require 'etl/control/source'
3
+ require 'etl/control/destination'
@@ -0,0 +1,405 @@
1
+ module ETL #:nodoc:
2
+ module Control #:nodoc:
3
+ # The Context is passed to eval.
4
+ class Context
5
+ require 'test/unit/assertions'
6
+ include Test::Unit::Assertions
7
+ attr_reader :control
8
+
9
+ class << self
10
+ # Create a Context instance
11
+ def create(control)
12
+ Context.new(control).get_binding
13
+ end
14
+ end
15
+
16
+ # Initialize the context
17
+ def initialize(control)
18
+ @control = control
19
+ end
20
+
21
+ # Get the control file
22
+ def file
23
+ control.file
24
+ end
25
+
26
+ # Set the allowed error threshold
27
+ def set_error_threshold(error_threshold)
28
+ control.error_threshold = error_threshold
29
+ end
30
+
31
+ # Define a list of control files that this file depends on. Those control
32
+ # files will be executed prior to this control file. The list may
33
+ # contain symbols that will be converted to file names by calling
34
+ # to_s + '.ctl', or they may be strings in which case they will be used
35
+ # as is
36
+ def depends_on(*args)
37
+ (dependencies << args).flatten!
38
+ end
39
+
40
+ # Get the defined dependencies
41
+ def dependencies
42
+ control.dependencies
43
+ end
44
+
45
+ # Define a source.
46
+ def source(name, configuration={}, definition={})
47
+ if configuration[:type]
48
+ case configuration[:type]
49
+ when Class
50
+ source_class = configuration[:type]
51
+ sources << source_class.new(self, configuration, definition)
52
+ when String, Symbol
53
+ source_class = ETL::Control::Source.class_for_name(configuration[:type])
54
+ sources << source_class.new(self, configuration, definition)
55
+ else
56
+ if configuration[:type].is_a?(ETL::Control::Source)
57
+ sources << configuration[:type]
58
+ else
59
+ raise ControlError, "Type must be a Class, String, Symbol or object extending ETL::Control::Source"
60
+ end
61
+ end
62
+ else
63
+ source_types.each do |source_type|
64
+ if configuration[source_type]
65
+ source_class = ETL::Control::Source.class_for_name(source_type)
66
+ sources << source_class.new(self, configuration, definition)
67
+ break
68
+ end
69
+ raise ControlError, "A source was specified but no matching type was found"
70
+ end
71
+ end
72
+ end
73
+
74
+ # Get the defined source
75
+ def sources
76
+ control.sources
77
+ end
78
+
79
+ # Define a destination
80
+ def destination(name, configuration={}, mapping={})
81
+ if configuration[:type]
82
+ case configuration[:type]
83
+ when Class
84
+ dest_class = configuration[:type]
85
+ destinations << dest_class.new(self, configuration, mapping)
86
+ when String, Symbol
87
+ dest_class = ETL::Control::Destination.class_for_name(configuration[:type])
88
+ destinations << dest_class.new(self, configuration, mapping)
89
+ else
90
+ if configuration[:type].is_a?(ETL::Control::Destination)
91
+ destinations << configuration[:type]
92
+ else
93
+ raise ControlError, "Type must be a Class, String, Symbol or object extending ETL::Control::Destination"
94
+ end
95
+ end
96
+ else
97
+ destination_types.each do |dest_type|
98
+ if configuration[dest_type]
99
+ dest_class = ETL::Control::Destination.class_for_name(dest_type)
100
+ destinations << dest_class.new(self, configuration, mapping)
101
+ break
102
+ end
103
+ raise ControlError, "A destination was specified but no matching destination type was found"
104
+ end
105
+ end
106
+ end
107
+
108
+ # Get the defined destinations
109
+ def destinations
110
+ control.destinations
111
+ end
112
+
113
+ # Define a transform
114
+ def transform(name, transformer=nil, configuration={}, &block)
115
+ if transformer
116
+ case transformer
117
+ when String, Symbol
118
+ class_name = "#{transformer.to_s.camelize}Transform"
119
+ begin
120
+ transform_class = ETL::Transform.const_get(class_name)
121
+ transforms << transform_class.new(self, name, configuration)
122
+ rescue NameError => e
123
+ raise ControlError, "Unable to find transformer #{class_name}: #{e}"
124
+ end
125
+ when Class
126
+ transforms << transformer.new(self, transformer.name, configuration)
127
+ else
128
+ #transformer.class.inspect
129
+ if transformer.is_a?(ETL::Transform::Transform)
130
+ Engine.logger.debug "Adding transformer #{transformer.inspect} for field #{name}"
131
+ t = transformer.dup
132
+ t.name = name
133
+ transforms << t
134
+ else
135
+ raise ControlError, "Transformer must be a String, Symbol, Class or Transform instance"
136
+ end
137
+ end
138
+ elsif block_given?
139
+ transforms << ETL::Transform::BlockTransform.new(self, name, :block => block)
140
+ else
141
+ raise ControlError, "Either a transformer or a block must be specified"
142
+ end
143
+ end
144
+
145
+ # Get the defined transforms
146
+ def transforms
147
+ control.transforms
148
+ end
149
+
150
+ # Define a before post-process screen block. The type argument must be
151
+ # one of :fatal, :error or :warn
152
+ def screen(type, &block)
153
+ screens[type] << block
154
+ end
155
+
156
+ # Get the before post-process screen blocks
157
+ def screens
158
+ control.screens
159
+ end
160
+
161
+ # Define an after post-proces screen block. The type argument must be
162
+ # one of :fatal, :error or :warn
163
+ def after_post_process_screen(type, &block)
164
+ after_post_process_screens[type] << block
165
+ end
166
+
167
+ # Get the after post-process screen blocks
168
+ def after_post_process_screens
169
+ control.after_post_process_screens
170
+ end
171
+
172
+ # Rename the source field to the destination field
173
+ def rename(source, destination)
174
+ after_read :rename, :source => source, :dest => destination
175
+ end
176
+
177
+ # Copy the source field to the destination field
178
+ def copy(source, destination)
179
+ after_read :copy_field, :source => source, :dest => destination
180
+ end
181
+
182
+ protected
183
+ # This method is used to define a processor and insert into the specified processor
184
+ # collection.
185
+ def define_processor(name, processor_collection, configuration, proc)
186
+ case name
187
+ when String, Symbol, nil
188
+ name ||= 'block'
189
+ class_name = "#{name.to_s.camelize}Processor"
190
+ begin
191
+ processor_class = ETL::Processor.const_get(class_name)
192
+ if name == 'block'
193
+ raise ControlError, "A block must be passed for block processor" if proc.nil?
194
+ configuration[:block] = proc
195
+ end
196
+ processor_collection << processor_class.new(self, configuration)
197
+ rescue NameError => e
198
+ raise ControlError, "Unable to find processor #{class_name}: #{e}"
199
+ end
200
+ when Class
201
+ processor_collection << name.new(self, configuration)
202
+ else
203
+ raise ControlError, "The process declaration requires a String, Symbol, Class, or a Block to be passed"
204
+ end
205
+ end
206
+
207
+ public
208
+ # Define an "after read" processor. This must be a row-level processor.
209
+ def after_read(name='block', configuration={}, &block)
210
+ define_processor(name, after_read_processors, configuration, block)
211
+ end
212
+
213
+ # Get the defined "after read" processors
214
+ def after_read_processors
215
+ control.after_read_processors
216
+ end
217
+
218
+ # Define a "before write" processor. This must be a row-level processor.
219
+ def before_write(name='block', configuration={}, &block)
220
+ define_processor(name, before_write_processors, configuration, block)
221
+ end
222
+
223
+ # Get the defined "before write" processors
224
+ def before_write_processors
225
+ control.before_write_processors
226
+ end
227
+
228
+ # Define a pre-processor
229
+ def pre_process(name='block', configuration={}, &block)
230
+ define_processor(name, pre_processors, configuration, block)
231
+ end
232
+
233
+ # Get the defined pre-processors
234
+ def pre_processors
235
+ control.pre_processors
236
+ end
237
+
238
+ # Define a post-processor
239
+ def post_process(name='block', configuration={}, &block)
240
+ define_processor(name, post_processors, configuration, block)
241
+ end
242
+
243
+ # Get the defined post-processors
244
+ def post_processors
245
+ control.post_processors
246
+ end
247
+
248
+ # Get the binding object
249
+ def get_binding
250
+ binding
251
+ end
252
+
253
+ protected
254
+ # Get an array of supported source types
255
+ def source_types
256
+ control.source_types
257
+ end
258
+
259
+ # Get an array of supported destination types
260
+ def destination_types
261
+ control.destination_types
262
+ end
263
+
264
+ end
265
+
266
+ # Object representation of a control file
267
+ class Control
268
+ # The File object
269
+ attr_reader :file
270
+
271
+ # The error threshold
272
+ attr_accessor :error_threshold
273
+
274
+ class << self
275
+ # Parse a control file and return a Control instance
276
+ def parse(control_file)
277
+ control_file = control_file.path if control_file.instance_of?(File)
278
+ control = ETL::Control::Control.new(control_file)
279
+ # TODO: better handling of parser errors. Return the line in the control file where the error occurs.
280
+ eval(IO.readlines(control_file).join("\n"), Context.create(control), control_file)
281
+ control.validate
282
+ control
283
+ end
284
+
285
+ def parse_text(text)
286
+ control = ETL::Control::Control.new(nil)
287
+ eval(text, Context.create(control), 'inline')
288
+ control.validate
289
+ control
290
+ end
291
+
292
+ # Resolve the given object to an ETL::Control::Control instance. Acceptable arguments
293
+ # are:
294
+ # * The path to a control file as a String
295
+ # * A File object referencing the control file
296
+ # * The ETL::Control::Control object (which will just be returned)
297
+ #
298
+ # Raises a ControlError if any other type is given
299
+ def resolve(control)
300
+ case control
301
+ when String
302
+ ETL::Control::Control.parse(File.new(control))
303
+ when File
304
+ ETL::Control::Control.parse(control)
305
+ when ETL::Control::Control
306
+ control
307
+ else
308
+ raise ControlError, "Control must be a String, File or Control object"
309
+ end
310
+ end
311
+ end
312
+
313
+ # Initialize the instance with the given File object
314
+ def initialize(file)
315
+ @file = file
316
+ end
317
+
318
+ # Get a list of dependencies
319
+ def dependencies
320
+ @dependencies ||= []
321
+ end
322
+
323
+ # Get the defined source
324
+ def sources
325
+ @sources ||= []
326
+ end
327
+
328
+ # Get the defined destinations
329
+ def destinations
330
+ @destinations ||= []
331
+ end
332
+
333
+ # Get the transforms with the specified name
334
+ # def transform(name)
335
+ # transforms[name] ||= []
336
+ # end
337
+
338
+ def after_read_processors
339
+ @after_read_processors ||= []
340
+ end
341
+
342
+ # Get all of the "before write" processors
343
+ def before_write_processors
344
+ @before_write_processors ||= []
345
+ end
346
+
347
+ # Get an Array of preprocessors
348
+ def pre_processors
349
+ @pre_processors ||= []
350
+ end
351
+
352
+ # Get an Array of post processors
353
+ def post_processors
354
+ @post_processors ||= []
355
+ end
356
+
357
+ # Get an Array of all transforms for this control
358
+ def transforms
359
+ @transforms ||= []
360
+ end
361
+
362
+ # A hash of the screens executed before post-process
363
+ def screens
364
+ @screens ||= {
365
+ :fatal => [],
366
+ :error => [],
367
+ :warn => []
368
+ }
369
+ end
370
+
371
+ # A hash of the screens executed after post-process
372
+ def after_post_process_screens
373
+ @after_post_process_screens ||= {
374
+ :fatal => [],
375
+ :error => [],
376
+ :warn => []
377
+ }
378
+ end
379
+
380
+ # Get the error threshold. Defaults to 100.
381
+ def error_threshold
382
+ @error_threshold ||= 100
383
+ end
384
+
385
+ # Validate the control file
386
+ def validate
387
+ #unless sources.length > 0
388
+ # raise ControlError, "Configuration must include one of the following for the source: #{source_types.join(',')}"
389
+ #end
390
+ #unless destinations.length > 0
391
+ # raise ControlError, "Configuration must include one of the following for the destination: #{destination_types.join(',')}"
392
+ #end
393
+ end
394
+
395
+ def source_types
396
+ [:file, :database]
397
+ end
398
+
399
+ def destination_types
400
+ [:file, :database]
401
+ end
402
+
403
+ end
404
+ end
405
+ end
@@ -0,0 +1,420 @@
1
+ module ETL #:nodoc:
2
+ module Control #:nodoc:
3
+ # Base class for destinations.
4
+ class Destination
5
+ # Read-only accessor for the ETL::Control::Control instance
6
+ attr_reader :control
7
+
8
+ # Read-only accessor for the configuration Hash
9
+ attr_reader :configuration
10
+
11
+ # Read-only accessor for the destination mapping Hash
12
+ attr_reader :mapping
13
+
14
+ # Accessor to the buffer size
15
+ attr_accessor :buffer_size
16
+
17
+ # Unique flag.
18
+ attr_accessor :unique
19
+
20
+ # A condition for writing
21
+ attr_accessor :condition
22
+
23
+ # An array of rows to append to the destination
24
+ attr_accessor :append_rows
25
+
26
+ class << self
27
+ # Get the destination class for the specified name.
28
+ #
29
+ # For example if name is :database or 'database' then the
30
+ # DatabaseDestination class is returned
31
+ def class_for_name(name)
32
+ ETL::Control.const_get("#{name.to_s.camelize}Destination")
33
+ end
34
+ end
35
+
36
+ # Initialize the destination
37
+ #
38
+ # Arguments:
39
+ # * <tt>control</tt>: The ETL::Control::Control instance
40
+ # * <tt>configuration</tt>: The configuration Hash
41
+ # * <tt>mapping</tt>: The mapping Hash
42
+ #
43
+ # Options:
44
+ # * <tt>:buffer_size</tt>: The output buffer size (default 1000 records)
45
+ # * <tt>:condition</tt>: A conditional proc that must return true for the
46
+ # row to be written
47
+ # * <tt>:append_rows</tt>: An array of rows to append
48
+ def initialize(control, configuration, mapping)
49
+ @control = control
50
+ @configuration = configuration
51
+ @mapping = mapping
52
+ @buffer_size = configuration[:buffer_size] ||= 100
53
+ @condition = configuration[:condition]
54
+ @append_rows = configuration[:append_rows]
55
+ end
56
+
57
+ # Get the current row number
58
+ def current_row
59
+ @current_row ||= 1
60
+ end
61
+
62
+ # Write the given row
63
+ def write(row)
64
+ if @condition.nil? || @condition.call(row)
65
+ process_change(row)
66
+ end
67
+ flush if buffer.length >= buffer_size
68
+ end
69
+
70
+ # Abstract method
71
+ def flush
72
+ raise NotImplementedError, "flush method must be implemented by subclasses"
73
+ end
74
+
75
+ # Abstract method
76
+ def close
77
+ raise NotImplementedError, "close method must be implemented by subclasses"
78
+ end
79
+
80
+ def errors
81
+ @errors ||= []
82
+ end
83
+
84
+ protected
85
+ # Access the buffer
86
+ def buffer
87
+ @buffer ||= []
88
+ end
89
+
90
+ # Access the generators map
91
+ def generators
92
+ @generators ||= {}
93
+ end
94
+
95
+ # Get the order of elements from the source order
96
+ def order_from_source
97
+ order = []
98
+ control.sources.first.definition.each do |item|
99
+ case item
100
+ when Hash
101
+ order << item[:name]
102
+ else
103
+ order << item
104
+ end
105
+ end
106
+ order
107
+ end
108
+
109
+ # Return true if the row is allowed. The row will not be allowed if the
110
+ # :unique option is specified in the configuration and the compound key
111
+ # already exists
112
+ def row_allowed?(row)
113
+ if unique
114
+ key = (unique.collect { |k| row[k] }).join('|')
115
+ return false if compound_key_constraints[key]
116
+ compound_key_constraints[key] = 1
117
+ end
118
+ return true
119
+ end
120
+
121
+ # Get a hash of compound key contraints. This is used to determine if a
122
+ # row can be written when the unique option is specified
123
+ def compound_key_constraints
124
+ @compound_key_constraints ||= {}
125
+ end
126
+
127
+ # Return fields which are Slowly Changing Dimension fields.
128
+ # Uses the scd_fields specified in the configuration. If that's
129
+ # missing, uses all of the row's fields.
130
+ def scd_fields(row)
131
+ @scd_fields ||= configuration[:scd_fields] || row.keys
132
+ end
133
+
134
+ def non_scd_fields(row)
135
+ @non_csd_fields ||= row.keys - natural_key - scd_fields(row) -
136
+ [primary_key, scd_effective_date_field, scd_end_date_field, scd_latest_version_field]
137
+ end
138
+
139
+ def non_evolving_fields
140
+ (Array(configuration[:scd][:non_evolving_fields]) << primary_key).uniq
141
+ end
142
+
143
+ def scd?
144
+ !configuration[:scd].nil?
145
+ end
146
+
147
+ def scd_type
148
+ scd? ? configuration[:scd][:type] : nil
149
+ end
150
+
151
+ # Get the Slowly Changing Dimension effective date field. Defaults to
152
+ # 'effective_date'.
153
+ def scd_effective_date_field
154
+ configuration[:scd][:effective_date_field] || :effective_date if scd?
155
+ end
156
+
157
+ # Get the Slowly Changing Dimension end date field. Defaults to
158
+ # 'end_date'.
159
+ def scd_end_date_field
160
+ configuration[:scd][:end_date_field] || :end_date if scd?
161
+ end
162
+
163
+ # Get the Slowly Changing Dimension latest version field. Defaults to
164
+ # 'latest_version'.
165
+ def scd_latest_version_field
166
+ configuration[:scd][:latest_version_field] || :latest_version if scd?
167
+ end
168
+
169
+ # Return the natural key field names, defaults to []
170
+ def natural_key
171
+ @natural_key ||= determine_natural_key
172
+ end
173
+
174
+ # Get the dimension table if specified
175
+ def dimension_table
176
+ @dimension_table ||= if scd?
177
+ ETL::Engine.table(configuration[:scd][:dimension_table], dimension_target) or raise ConfigurationError, "dimension_table setting required"
178
+ end
179
+ end
180
+
181
+ # Get the dimension target if specified
182
+ def dimension_target
183
+ @dimension_target ||= if scd?
184
+ configuration[:scd][:dimension_target] or raise ConfigurationError, "dimension_target setting required"
185
+ end
186
+ end
187
+
188
+ # Process a row to determine the change type
189
+ def process_change(row)
190
+ ETL::Engine.logger.debug "Processing row: #{row.inspect}"
191
+ return unless row
192
+
193
+ # Change processing can only occur if the natural key exists in the row
194
+ ETL::Engine.logger.debug "Checking for natural key existence"
195
+ unless has_natural_key?(row)
196
+ buffer << row
197
+ return
198
+ end
199
+
200
+ @timestamp = Time.now
201
+
202
+ # See if the scd_fields of the current record have changed
203
+ # from the last time this record was loaded into the data
204
+ # warehouse. If they match then throw away this row (no need
205
+ # to process). If they do not match then the record is an
206
+ # 'update'. If the record doesn't exist then it is an 'insert'
207
+ ETL::Engine.logger.debug "Checking record for SCD change"
208
+ if @existing_row = preexisting_row(row)
209
+ if has_scd_field_changes?(row)
210
+ process_scd_change(row)
211
+ else
212
+ process_scd_match(row)
213
+ end
214
+ else
215
+ schedule_new_record(row)
216
+ end
217
+ end
218
+
219
+ # Add any virtual fields to the row. Virtual rows will get their value
220
+ # from one of the following:
221
+ # * If the mapping is a Class, then an object which implements the next
222
+ # method
223
+ # * If the mapping is a Symbol, then the XGenerator where X is the
224
+ # classified symbol
225
+ # * If the mapping is a Proc, then it will be called with the row
226
+ # * Otherwise the value itself will be assigned to the field
227
+ def add_virtuals!(row)
228
+ if mapping[:virtual]
229
+ mapping[:virtual].each do |key,value|
230
+ # If the row already has the virtual set, assume that's correct
231
+ next if row[key]
232
+ # Engine.logger.debug "Mapping virtual #{key}/#{value} for row #{row}"
233
+ case value
234
+ when Class
235
+ generator = generators[key] ||= value.new
236
+ row[key] = generator.next
237
+ when Symbol
238
+ generator = generators[key] ||= ETL::Generator::Generator.class_for_name(value).new(options)
239
+ row[key] = generator.next
240
+ when Proc
241
+ row[key] = value.call(row)
242
+ else
243
+ if value.is_a?(ETL::Generator::Generator)
244
+ row[key] = value.next
245
+ else
246
+ row[key] = value
247
+ end
248
+ end
249
+ end
250
+ end
251
+ end
252
+
253
+ private
254
+
255
+ # Determine the natural key. This method will always return an array
256
+ # of symbols. The default value is [].
257
+ def determine_natural_key
258
+ Array(configuration[:natural_key]).collect(&:to_sym)
259
+ end
260
+
261
+ # Check whether a natural key has been defined, and if so, whether
262
+ # this row has enough information to do searches based on that natural
263
+ # key.
264
+ #
265
+ # TODO: This should be factored out into
266
+ # ETL::Row#has_all_fields?(field_array) But that's not possible
267
+ # until *all* sources cast to ETL::Row, instead of sometimes
268
+ # using Hash
269
+ def has_natural_key?(row)
270
+ natural_key.any? && natural_key.all? { |key| row.has_key?(key) }
271
+ end
272
+
273
+ # Helper for generating the SQL where clause that allows searching
274
+ # by a natural key
275
+ def natural_key_equality_for_row(row)
276
+ statement = []
277
+ values = []
278
+ natural_key.each do |nk|
279
+ statement << "#{nk} = ?"
280
+ values << row[nk]
281
+ end
282
+ statement = statement.join(" AND ")
283
+ ActiveRecord::Base.send(:sanitize_sql, [statement, *values])
284
+ end
285
+
286
+ # Do all the steps required when a SCD *has* changed. Exact steps
287
+ # depend on what type of SCD we're handling.
288
+ def process_scd_change(row)
289
+ ETL::Engine.logger.debug "SCD fields do not match"
290
+
291
+ if scd_type == 2
292
+ # SCD Type 2: new row should be added and old row should be updated
293
+ ETL::Engine.logger.debug "type 2 SCD"
294
+
295
+ # To update the old row, we delete the version in the database
296
+ # and insert a new expired version
297
+
298
+ # If there is no truncate then the row will exist twice in the database
299
+ delete_outdated_record
300
+
301
+ ETL::Engine.logger.debug "expiring original record"
302
+ @existing_row[scd_end_date_field] = @timestamp
303
+ @existing_row[scd_latest_version_field] = false
304
+
305
+ buffer << @existing_row
306
+
307
+ elsif scd_type == 1
308
+ # SCD Type 1: only the new row should be added
309
+ ETL::Engine.logger.debug "type 1 SCD"
310
+
311
+ # Copy primary key, and other non-evolving fields over from
312
+ # original version of record
313
+ non_evolving_fields.each do |non_evolving_field|
314
+ row[non_evolving_field] = @existing_row[non_evolving_field]
315
+ end
316
+
317
+ # If there is no truncate then the row will exist twice in the database
318
+ delete_outdated_record
319
+ else
320
+ # SCD Type 3: not supported
321
+ ETL::Engine.logger.debug "SCD type #{scd_type} not supported"
322
+ end
323
+
324
+ # In all cases, the latest, greatest version of the record
325
+ # should go into the load
326
+ schedule_new_record(row)
327
+ end
328
+
329
+ # Do all the steps required when a SCD has *not* changed. Exact
330
+ # steps depend on what type of SCD we're handling.
331
+ def process_scd_match(row)
332
+ ETL::Engine.logger.debug "SCD fields match"
333
+
334
+ if scd_type == 2 && has_non_scd_field_changes?(row)
335
+ ETL::Engine.logger.debug "Non-SCD field changes"
336
+ # Copy important data over from original version of record
337
+ row[primary_key] = @existing_row[primary_key]
338
+ row[scd_end_date_field] = @existing_row[scd_end_date_field]
339
+ row[scd_effective_date_field] = @existing_row[scd_effective_date_field]
340
+ row[scd_latest_version_field] = @existing_row[scd_latest_version_field]
341
+
342
+ # If there is no truncate then the row will exist twice in the database
343
+ delete_outdated_record
344
+
345
+ buffer << row
346
+ else
347
+ # The record is totally the same, so skip it
348
+ end
349
+ end
350
+
351
+ # Find the version of this row that already exists in the datawarehouse.
352
+ def preexisting_row(row)
353
+ q = "SELECT * FROM #{dimension_table} WHERE #{natural_key_equality_for_row(row)}"
354
+ q << " AND #{scd_latest_version_field}" if scd_type == 2
355
+
356
+ #puts "looking for original record"
357
+ result = connection.select_one(q)
358
+
359
+ #puts "Result: #{result.inspect}"
360
+
361
+ result ? ETL::Row[result.symbolize_keys!] : nil
362
+ end
363
+
364
+ # Check whether non-scd fields have changed since the last
365
+ # load of this record.
366
+ def has_scd_field_changes?(row)
367
+ scd_fields(row).any? { |csd_field| row[csd_field].to_s != @existing_row[csd_field].to_s }
368
+ end
369
+
370
+ # Check whether non-scd fields have changed since the last
371
+ # load of this record.
372
+ def has_non_scd_field_changes?(row)
373
+ non_scd_fields(row).any? { |non_csd_field| row[non_csd_field].to_s != @existing_row[non_csd_field].to_s }
374
+ end
375
+
376
+ # Grab, or re-use, a database connection for running queries directly
377
+ # during the destination processing.
378
+ def connection
379
+ @conn ||= ETL::Engine.connection(dimension_target)
380
+ end
381
+
382
+ # Utility for removing a row that has outdated information. Note
383
+ # that this deletes directly from the database, even if this is a file
384
+ # destination. It needs to do this because you can't do deletes in a
385
+ # bulk load.
386
+ def delete_outdated_record
387
+ ETL::Engine.logger.debug "deleting old row"
388
+
389
+ q = "DELETE FROM #{dimension_table} WHERE #{primary_key} = #{@existing_row[primary_key]}"
390
+ connection.delete(q)
391
+ end
392
+
393
+ # Schedule the latest, greatest version of the row for insertion
394
+ # into the database
395
+ def schedule_new_record(row)
396
+ ETL::Engine.logger.debug "writing new record"
397
+ if scd_type == 2
398
+ row[scd_effective_date_field] = @timestamp
399
+ row[scd_end_date_field] = '9999-12-31 00:00:00'
400
+ row[scd_latest_version_field] = true
401
+ end
402
+ buffer << row
403
+ end
404
+
405
+ # Get the name of the primary key for this table. Asks the dimension
406
+ # model class for this information, but if that class hasn't been
407
+ # defined, just defaults to :id.
408
+ def primary_key
409
+ return @primary_key if @primary_key
410
+ @primary_key = dimension_table.to_s.camelize.constantize.primary_key.to_sym
411
+ rescue NameError => e
412
+ ETL::Engine.logger.debug "couldn't get primary_key from dimension model class, using default :id"
413
+ @primary_key = :id
414
+ end
415
+
416
+ end
417
+ end
418
+ end
419
+
420
+ Dir[File.dirname(__FILE__) + "/destination/*.rb"].each { |file| require(file) }