aeden-activewarehouse-etl 0.9.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (82)
  1. data/etl/CHANGELOG +190 -0
  2. data/etl/LICENSE +7 -0
  3. data/etl/README +85 -0
  4. data/etl/Rakefile +153 -0
  5. data/etl/TODO +28 -0
  6. data/etl/bin/etl +28 -0
  7. data/etl/bin/etl.cmd +8 -0
  8. data/etl/examples/database.example.yml +16 -0
  9. data/etl/lib/etl.rb +97 -0
  10. data/etl/lib/etl/batch.rb +2 -0
  11. data/etl/lib/etl/batch/batch.rb +111 -0
  12. data/etl/lib/etl/batch/directives.rb +55 -0
  13. data/etl/lib/etl/builder.rb +2 -0
  14. data/etl/lib/etl/builder/date_dimension_builder.rb +96 -0
  15. data/etl/lib/etl/commands/etl.rb +89 -0
  16. data/etl/lib/etl/control.rb +3 -0
  17. data/etl/lib/etl/control/control.rb +403 -0
  18. data/etl/lib/etl/control/destination.rb +420 -0
  19. data/etl/lib/etl/control/destination/database_destination.rb +95 -0
  20. data/etl/lib/etl/control/destination/file_destination.rb +124 -0
  21. data/etl/lib/etl/control/source.rb +109 -0
  22. data/etl/lib/etl/control/source/database_source.rb +220 -0
  23. data/etl/lib/etl/control/source/enumerable_source.rb +11 -0
  24. data/etl/lib/etl/control/source/file_source.rb +90 -0
  25. data/etl/lib/etl/control/source/model_source.rb +39 -0
  26. data/etl/lib/etl/core_ext.rb +1 -0
  27. data/etl/lib/etl/core_ext/time.rb +5 -0
  28. data/etl/lib/etl/core_ext/time/calculations.rb +42 -0
  29. data/etl/lib/etl/engine.rb +552 -0
  30. data/etl/lib/etl/execution.rb +20 -0
  31. data/etl/lib/etl/execution/base.rb +9 -0
  32. data/etl/lib/etl/execution/batch.rb +8 -0
  33. data/etl/lib/etl/execution/job.rb +8 -0
  34. data/etl/lib/etl/execution/migration.rb +85 -0
  35. data/etl/lib/etl/generator.rb +2 -0
  36. data/etl/lib/etl/generator/generator.rb +20 -0
  37. data/etl/lib/etl/generator/surrogate_key_generator.rb +39 -0
  38. data/etl/lib/etl/http_tools.rb +125 -0
  39. data/etl/lib/etl/parser.rb +11 -0
  40. data/etl/lib/etl/parser/apache_combined_log_parser.rb +47 -0
  41. data/etl/lib/etl/parser/delimited_parser.rb +74 -0
  42. data/etl/lib/etl/parser/fixed_width_parser.rb +65 -0
  43. data/etl/lib/etl/parser/parser.rb +41 -0
  44. data/etl/lib/etl/parser/sax_parser.rb +218 -0
  45. data/etl/lib/etl/parser/xml_parser.rb +65 -0
  46. data/etl/lib/etl/processor.rb +11 -0
  47. data/etl/lib/etl/processor/block_processor.rb +14 -0
  48. data/etl/lib/etl/processor/bulk_import_processor.rb +81 -0
  49. data/etl/lib/etl/processor/check_exist_processor.rb +80 -0
  50. data/etl/lib/etl/processor/check_unique_processor.rb +35 -0
  51. data/etl/lib/etl/processor/copy_field_processor.rb +26 -0
  52. data/etl/lib/etl/processor/encode_processor.rb +55 -0
  53. data/etl/lib/etl/processor/hierarchy_exploder_processor.rb +55 -0
  54. data/etl/lib/etl/processor/print_row_processor.rb +12 -0
  55. data/etl/lib/etl/processor/processor.rb +25 -0
  56. data/etl/lib/etl/processor/rename_processor.rb +24 -0
  57. data/etl/lib/etl/processor/require_non_blank_processor.rb +26 -0
  58. data/etl/lib/etl/processor/row_processor.rb +17 -0
  59. data/etl/lib/etl/processor/sequence_processor.rb +23 -0
  60. data/etl/lib/etl/processor/surrogate_key_processor.rb +53 -0
  61. data/etl/lib/etl/processor/truncate_processor.rb +35 -0
  62. data/etl/lib/etl/row.rb +20 -0
  63. data/etl/lib/etl/screen.rb +14 -0
  64. data/etl/lib/etl/screen/row_count_screen.rb +20 -0
  65. data/etl/lib/etl/transform.rb +2 -0
  66. data/etl/lib/etl/transform/block_transform.rb +13 -0
  67. data/etl/lib/etl/transform/date_to_string_transform.rb +20 -0
  68. data/etl/lib/etl/transform/decode_transform.rb +51 -0
  69. data/etl/lib/etl/transform/default_transform.rb +20 -0
  70. data/etl/lib/etl/transform/foreign_key_lookup_transform.rb +122 -0
  71. data/etl/lib/etl/transform/hierarchy_lookup_transform.rb +49 -0
  72. data/etl/lib/etl/transform/ordinalize_transform.rb +12 -0
  73. data/etl/lib/etl/transform/sha1_transform.rb +13 -0
  74. data/etl/lib/etl/transform/string_to_date_transform.rb +16 -0
  75. data/etl/lib/etl/transform/string_to_datetime_transform.rb +14 -0
  76. data/etl/lib/etl/transform/string_to_time_transform.rb +11 -0
  77. data/etl/lib/etl/transform/transform.rb +61 -0
  78. data/etl/lib/etl/transform/trim_transform.rb +26 -0
  79. data/etl/lib/etl/transform/type_transform.rb +35 -0
  80. data/etl/lib/etl/util.rb +59 -0
  81. data/etl/lib/etl/version.rb +9 -0
  82. metadata +193 -0
# Load the control DSL: the Control/Context classes plus the source and
# destination hierarchies that control files instantiate.
require 'etl/control/control'
require 'etl/control/source'
require 'etl/control/destination'
module ETL #:nodoc:
  module Control #:nodoc:
    # The Context is passed to eval. It provides the DSL methods available
    # inside a control (.ctl) file — source, destination, transform, screens
    # and the various processor declarations — and records everything on the
    # underlying Control object.
    class Context
      require 'test/unit/assertions'
      include Test::Unit::Assertions
      # The Control instance this context populates
      attr_reader :control

      class << self
        # Create a Context instance wrapping +control+ and return its
        # binding, suitable for passing to eval.
        def create(control)
          Context.new(control).get_binding
        end
      end

      # Initialize the context with the Control being built
      def initialize(control)
        @control = control
      end

      # Get the control file
      def file
        control.file
      end

      # Set the allowed error threshold
      def set_error_threshold(error_threshold)
        control.error_threshold = error_threshold
      end

      # Define a list of control files that this file depends on. Those control
      # files will be executed prior to this control file. The list may
      # contain symbols that will be converted to file names by calling
      # to_s + '.ctl', or they may be strings in which case they will be used
      # as is
      def depends_on(*args)
        (dependencies << args).flatten!
      end

      # Get the defined dependencies
      def dependencies
        control.dependencies
      end

      # Define a source. The source class is resolved from
      # configuration[:type] — a Class, a String/Symbol naming the type, or a
      # Source instance — or, when :type is absent, from the first
      # source_types key present in the configuration.
      def source(name, configuration={}, definition={})
        if configuration[:type]
          case configuration[:type]
          when Class
            source_class = configuration[:type]
            sources << source_class.new(self, configuration, definition)
          when String, Symbol
            source_class = ETL::Control::Source.class_for_name(configuration[:type])
            sources << source_class.new(self, configuration, definition)
          else
            if configuration[:type].is_a?(ETL::Control::Source)
              sources << configuration[:type]
            else
              raise ControlError, "Type must be a Class, String, Symbol or object extending ETL::Control::Source"
            end
          end
        else
          # BUGFIX: the raise used to sit inside the loop, so a configuration
          # whose matching type was not the *first* entry in source_types
          # raised before ever being checked. Search all types, then raise
          # only if none matched.
          source_type = source_types.find { |type| configuration[type] }
          raise ControlError, "A source was specified but no matching type was found" unless source_type
          source_class = ETL::Control::Source.class_for_name(source_type)
          sources << source_class.new(self, configuration, definition)
        end
      end

      # Get the defined sources
      def sources
        control.sources
      end

      # Define a destination. Resolution mirrors #source, using
      # destination_types when configuration[:type] is absent.
      def destination(name, configuration={}, mapping={})
        if configuration[:type]
          case configuration[:type]
          when Class
            dest_class = configuration[:type]
            destinations << dest_class.new(self, configuration, mapping)
          when String, Symbol
            dest_class = ETL::Control::Destination.class_for_name(configuration[:type])
            destinations << dest_class.new(self, configuration, mapping)
          else
            if configuration[:type].is_a?(ETL::Control::Destination)
              destinations << configuration[:type]
            else
              raise ControlError, "Type must be a Class, String, Symbol or object extending ETL::Control::Destination"
            end
          end
        else
          # BUGFIX: see #source — raise only after every type has been checked.
          dest_type = destination_types.find { |type| configuration[type] }
          raise ControlError, "A destination was specified but no matching destination type was found" unless dest_type
          dest_class = ETL::Control::Destination.class_for_name(dest_type)
          destinations << dest_class.new(self, configuration, mapping)
        end
      end

      # Get the defined destinations
      def destinations
        control.destinations
      end

      # Define a transform. +transformer+ may be a String/Symbol naming an
      # ETL::Transform subclass, a Transform instance (which is duplicated and
      # renamed), or nil when a block is given (wrapped in a BlockTransform).
      def transform(name, transformer=nil, configuration={}, &block)
        if transformer
          case transformer
          when String, Symbol
            class_name = "#{transformer.to_s.camelize}Transform"
            begin
              transform_class = ETL::Transform.const_get(class_name)
              transforms << transform_class.new(self, name, configuration)
            rescue NameError => e
              raise ControlError, "Unable to find transformer #{class_name}: #{e}"
            end
          else
            if transformer.is_a?(ETL::Transform::Transform)
              Engine.logger.debug "Adding transformer #{transformer.inspect} for field #{name}"
              t = transformer.dup
              t.name = name
              transforms << t
            else
              raise ControlError, "Transformer must be a String, Symbol or Transform instance"
            end
          end
        elsif block_given?
          transforms << ETL::Transform::BlockTransform.new(self, name, :block => block)
        else
          raise ControlError, "Either a transformer or a block must be specified"
        end
      end

      # Get the defined transforms
      def transforms
        control.transforms
      end

      # Define a before post-process screen block. The type argument must be
      # one of :fatal, :error or :warn
      def screen(type, &block)
        screens[type] << block
      end

      # Get the before post-process screen blocks
      def screens
        control.screens
      end

      # Define an after post-process screen block. The type argument must be
      # one of :fatal, :error or :warn
      def after_post_process_screen(type, &block)
        after_post_process_screens[type] << block
      end

      # Get the after post-process screen blocks
      def after_post_process_screens
        control.after_post_process_screens
      end

      # Rename the source field to the destination field
      def rename(source, destination)
        after_read :rename, :source => source, :dest => destination
      end

      # Copy the source field to the destination field
      def copy(source, destination)
        after_read :copy_field, :source => source, :dest => destination
      end

      protected
      # This method is used to define a processor and insert it into the
      # specified processor collection. +name+ may be a String/Symbol naming
      # an ETL::Processor subclass (nil defaults to 'block', in which case
      # +proc+ is required), or a Class to instantiate directly.
      def define_processor(name, processor_collection, configuration, proc)
        case name
        when String, Symbol, nil
          name ||= 'block'
          class_name = "#{name.to_s.camelize}Processor"
          begin
            processor_class = ETL::Processor.const_get(class_name)
            if name == 'block'
              raise ControlError, "A block must be passed for block processor" if proc.nil?
              configuration[:block] = proc
            end
            processor_collection << processor_class.new(self, configuration)
          rescue NameError => e
            raise ControlError, "Unable to find processor #{class_name}: #{e}"
          end
        when Class
          processor_collection << name.new(self, configuration)
        else
          raise ControlError, "The process declaration requires a String, Symbol or Class, or a Block to be passed"
        end
      end

      public
      # Define an "after read" processor. This must be a row-level processor.
      def after_read(name='block', configuration={}, &block)
        define_processor(name, after_read_processors, configuration, block)
      end

      # Get the defined "after read" processors
      def after_read_processors
        control.after_read_processors
      end

      # Define a "before write" processor. This must be a row-level processor.
      def before_write(name='block', configuration={}, &block)
        define_processor(name, before_write_processors, configuration, block)
      end

      # Get the defined "before write" processors
      def before_write_processors
        control.before_write_processors
      end

      # Define a pre-processor
      def pre_process(name='block', configuration={}, &block)
        define_processor(name, pre_processors, configuration, block)
      end

      # Get the defined pre-processors
      def pre_processors
        control.pre_processors
      end

      # Define a post-processor
      def post_process(name='block', configuration={}, &block)
        define_processor(name, post_processors, configuration, block)
      end

      # Get the defined post-processors
      def post_processors
        control.post_processors
      end

      # Get the binding object
      def get_binding
        binding
      end

      protected
      # Get an array of supported source types
      def source_types
        control.source_types
      end

      # Get an array of supported destination types
      def destination_types
        control.destination_types
      end

    end

    # Object representation of a control file
    class Control
      # The path of the control file (a String, or nil for inline controls)
      attr_reader :file

      # The error threshold
      attr_accessor :error_threshold

      class << self
        # Parse a control file and return a Control instance
        def parse(control_file)
          control_file = control_file.path if control_file.instance_of?(File)
          control = ETL::Control::Control.new(control_file)
          # TODO: better handling of parser errors. Return the line in the control file where the error occurs.
          # BUGFIX: the previous IO.readlines(f).join("\n") doubled every
          # newline (each line already ends in "\n"), which skewed reported
          # error line numbers. Read the file verbatim instead.
          eval(IO.read(control_file), Context.create(control), control_file)
          control.validate
          control
        end

        # Parse an inline control definition given as a String of DSL text
        def parse_text(text)
          control = ETL::Control::Control.new(nil)
          eval(text, Context.create(control), 'inline')
          control.validate
          control
        end

        # Resolve the given object to an ETL::Control::Control instance. Acceptable arguments
        # are:
        # * The path to a control file as a String
        # * A File object referencing the control file
        # * The ETL::Control::Control object (which will just be returned)
        #
        # Raises a ControlError if any other type is given
        def resolve(control)
          case control
          when String
            # BUGFIX: previously wrapped the path in File.new, opening a file
            # descriptor that was never closed; parse only needs the path.
            ETL::Control::Control.parse(control)
          when File
            ETL::Control::Control.parse(control)
          when ETL::Control::Control
            control
          else
            raise ControlError, "Control must be a String, File or Control object"
          end
        end
      end

      # Initialize the instance with the given file path (or nil for inline)
      def initialize(file)
        @file = file
      end

      # Get a list of dependencies
      def dependencies
        @dependencies ||= []
      end

      # Get the defined sources
      def sources
        @sources ||= []
      end

      # Get the defined destinations
      def destinations
        @destinations ||= []
      end

      # Get the transforms with the specified name
      # def transform(name)
      #   transforms[name] ||= []
      # end

      # Get all of the "after read" processors
      def after_read_processors
        @after_read_processors ||= []
      end

      # Get all of the "before write" processors
      def before_write_processors
        @before_write_processors ||= []
      end

      # Get an Array of preprocessors
      def pre_processors
        @pre_processors ||= []
      end

      # Get an Array of post processors
      def post_processors
        @post_processors ||= []
      end

      # Get an Array of all transforms for this control
      def transforms
        @transforms ||= []
      end

      # A hash of the screens executed before post-process
      def screens
        @screens ||= {
          :fatal => [],
          :error => [],
          :warn => []
        }
      end

      # A hash of the screens executed after post-process
      def after_post_process_screens
        @after_post_process_screens ||= {
          :fatal => [],
          :error => [],
          :warn => []
        }
      end

      # Get the error threshold. Defaults to 100.
      def error_threshold
        @error_threshold ||= 100
      end

      # Validate the control file
      def validate
        #unless sources.length > 0
        #  raise ControlError, "Configuration must include one of the following for the source: #{source_types.join(',')}"
        #end
        #unless destinations.length > 0
        #  raise ControlError, "Configuration must include one of the following for the destination: #{destination_types.join(',')}"
        #end
      end

      # The source type keys recognized by Context#source
      def source_types
        [:file, :database]
      end

      # The destination type keys recognized by Context#destination
      def destination_types
        [:file, :database]
      end

    end
  end
end
module ETL #:nodoc:
  module Control #:nodoc:
    # Base class for destinations. Handles buffering, uniqueness constraints
    # and slowly-changing-dimension (SCD) change detection; subclasses must
    # implement #flush and #close.
    class Destination
      # Read-only accessor for the ETL::Control::Control instance
      attr_reader :control

      # Read-only accessor for the configuration Hash
      attr_reader :configuration

      # Read-only accessor for the destination mapping Hash
      attr_reader :mapping

      # Accessor to the buffer size
      attr_accessor :buffer_size

      # Unique flag.
      attr_accessor :unique

      # A condition for writing
      attr_accessor :condition

      # An array of rows to append to the destination
      attr_accessor :append_rows

      class << self
        # Get the destination class for the specified name.
        #
        # For example if name is :database or 'database' then the
        # DatabaseDestination class is returned
        def class_for_name(name)
          ETL::Control.const_get("#{name.to_s.camelize}Destination")
        end
      end

      # Initialize the destination
      #
      # Arguments:
      # * <tt>control</tt>: The ETL::Control::Control instance
      # * <tt>configuration</tt>: The configuration Hash
      # * <tt>mapping</tt>: The mapping Hash
      #
      # Options:
      # * <tt>:buffer_size</tt>: The output buffer size (default 100 records)
      # * <tt>:condition</tt>: A conditional proc that must return true for the
      #   row to be written
      # * <tt>:append_rows</tt>: An array of rows to append
      def initialize(control, configuration, mapping)
        @control = control
        @configuration = configuration
        @mapping = mapping
        @buffer_size = configuration[:buffer_size] ||= 100
        @condition = configuration[:condition]
        @append_rows = configuration[:append_rows]
      end

      # Get the current row number
      def current_row
        @current_row ||= 1
      end

      # Write the given row, subject to the :condition proc if one was
      # configured. Flushes automatically once the buffer fills.
      def write(row)
        if @condition.nil? || @condition.call(row)
          process_change(row)
        end
        flush if buffer.length >= buffer_size
      end

      # Abstract method
      def flush
        raise NotImplementedError, "flush method must be implemented by subclasses"
      end

      # Abstract method
      def close
        raise NotImplementedError, "close method must be implemented by subclasses"
      end

      # Errors accumulated during writing
      def errors
        @errors ||= []
      end

      protected
      # Access the buffer
      def buffer
        @buffer ||= []
      end

      # Access the generators map
      def generators
        @generators ||= {}
      end

      # Get the order of elements from the source order
      def order_from_source
        order = []
        control.sources.first.definition.each do |item|
          case item
          when Hash
            order << item[:name]
          else
            order << item
          end
        end
        order
      end

      # Return true if the row is allowed. The row will not be allowed if the
      # :unique option is specified in the configuration and the compound key
      # already exists
      def row_allowed?(row)
        if unique
          key = (unique.collect { |k| row[k] }).join('|')
          return false if compound_key_constraints[key]
          compound_key_constraints[key] = 1
        end
        return true
      end

      # Get a hash of compound key constraints. This is used to determine if a
      # row can be written when the unique option is specified
      def compound_key_constraints
        @compound_key_constraints ||= {}
      end

      # Return fields which are Slowly Changing Dimension fields.
      # Uses the scd_fields specified in the configuration. If that's
      # missing, uses all of the row's fields.
      def scd_fields(row)
        @scd_fields ||= configuration[:scd_fields] || row.keys
      end

      # Return the fields of the row that are neither part of the natural key,
      # SCD fields, nor SCD bookkeeping fields.
      def non_scd_fields(row)
        # BUGFIX: the memoization ivar was misspelled @non_csd_fields
        @non_scd_fields ||= row.keys - natural_key - scd_fields(row) -
          [primary_key, scd_effective_date_field, scd_end_date_field, scd_latest_version_field]
      end

      # Fields which are copied over unchanged from the existing record on a
      # type 1 SCD change (always includes the primary key)
      def non_evolving_fields
        (Array(configuration[:scd][:non_evolving_fields]) << primary_key).uniq
      end

      # True when SCD processing is configured
      def scd?
        !configuration[:scd].nil?
      end

      # The configured SCD type (1, 2, ...) or nil when SCD is not configured
      def scd_type
        scd? ? configuration[:scd][:type] : nil
      end

      # Get the Slowly Changing Dimension effective date field. Defaults to
      # 'effective_date'.
      def scd_effective_date_field
        configuration[:scd][:effective_date_field] || :effective_date if scd?
      end

      # Get the Slowly Changing Dimension end date field. Defaults to
      # 'end_date'.
      def scd_end_date_field
        configuration[:scd][:end_date_field] || :end_date if scd?
      end

      # Get the Slowly Changing Dimension latest version field. Defaults to
      # 'latest_version'.
      def scd_latest_version_field
        configuration[:scd][:latest_version_field] || :latest_version if scd?
      end

      # Return the natural key field names, defaults to []
      def natural_key
        @natural_key ||= determine_natural_key
      end

      # Get the dimension table if specified
      def dimension_table
        @dimension_table ||= if scd?
          ETL::Engine.table(configuration[:scd][:dimension_table], dimension_target) or raise ConfigurationError, "dimension_table setting required"
        end
      end

      # Get the dimension target if specified
      def dimension_target
        @dimension_target ||= if scd?
          configuration[:scd][:dimension_target] or raise ConfigurationError, "dimension_target setting required"
        end
      end

      # Process a row to determine the change type
      def process_change(row)
        ETL::Engine.logger.debug "Processing row: #{row.inspect}"
        return unless row

        # Change processing can only occur if the natural key exists in the row
        ETL::Engine.logger.debug "Checking for natural key existence"
        unless has_natural_key?(row)
          buffer << row
          return
        end

        @timestamp = Time.now

        # See if the scd_fields of the current record have changed
        # from the last time this record was loaded into the data
        # warehouse. If they match then throw away this row (no need
        # to process). If they do not match then the record is an
        # 'update'. If the record doesn't exist then it is an 'insert'
        ETL::Engine.logger.debug "Checking record for SCD change"
        if @existing_row = preexisting_row(row)
          if has_scd_field_changes?(row)
            process_scd_change(row)
          else
            process_scd_match(row)
          end
        else
          schedule_new_record(row)
        end
      end

      # Add any virtual fields to the row. Virtual rows will get their value
      # from one of the following:
      # * If the mapping is a Class, then an object which implements the next
      #   method
      # * If the mapping is a Symbol, then the XGenerator where X is the
      #   classified symbol
      # * If the mapping is a Proc, then it will be called with the row
      # * Otherwise the value itself will be assigned to the field
      def add_virtuals!(row)
        if mapping[:virtual]
          mapping[:virtual].each do |key,value|
            # If the row already has the virtual set, assume that's correct
            next if row[key]
            # Engine.logger.debug "Mapping virtual #{key}/#{value} for row #{row}"
            case value
            when Class
              generator = generators[key] ||= value.new
              row[key] = generator.next
            when Symbol
              generator = generators[key] ||= ETL::Generator::Generator.class_for_name(value).new(options)
              row[key] = generator.next
            when Proc
              row[key] = value.call(row)
            else
              if value.is_a?(ETL::Generator::Generator)
                row[key] = value.next
              else
                row[key] = value
              end
            end
          end
        end
      end

      private

      # Determine the natural key. This method will always return an array
      # of symbols. The default value is [].
      def determine_natural_key
        Array(configuration[:natural_key]).collect(&:to_sym)
      end

      # Check whether a natural key has been defined, and if so, whether
      # this row has enough information to do searches based on that natural
      # key.
      #
      # TODO: This should be factored out into
      # ETL::Row#has_all_fields?(field_array) But that's not possible
      # until *all* sources cast to ETL::Row, instead of sometimes
      # using Hash
      def has_natural_key?(row)
        natural_key.any? && natural_key.all? { |key| row.has_key?(key) }
      end

      # Helper for generating the SQL where clause that allows searching
      # by a natural key
      def natural_key_equality_for_row(row)
        statement = []
        values = []
        natural_key.each do |nk|
          statement << "#{nk} = ?"
          values << row[nk]
        end
        statement = statement.join(" AND ")
        ActiveRecord::Base.send(:sanitize_sql, [statement, *values])
      end

      # Do all the steps required when a SCD *has* changed. Exact steps
      # depend on what type of SCD we're handling.
      def process_scd_change(row)
        ETL::Engine.logger.debug "SCD fields do not match"

        if scd_type == 2
          # SCD Type 2: new row should be added and old row should be updated
          ETL::Engine.logger.debug "type 2 SCD"

          # To update the old row, we delete the version in the database
          # and insert a new expired version

          # If there is no truncate then the row will exist twice in the database
          delete_outdated_record

          ETL::Engine.logger.debug "expiring original record"
          @existing_row[scd_end_date_field] = @timestamp
          @existing_row[scd_latest_version_field] = false

          buffer << @existing_row

        elsif scd_type == 1
          # SCD Type 1: only the new row should be added
          ETL::Engine.logger.debug "type 1 SCD"

          # Copy primary key, and other non-evolving fields over from
          # original version of record
          non_evolving_fields.each do |non_evolving_field|
            row[non_evolving_field] = @existing_row[non_evolving_field]
          end

          # If there is no truncate then the row will exist twice in the database
          delete_outdated_record
        else
          # SCD Type 3: not supported
          ETL::Engine.logger.debug "SCD type #{scd_type} not supported"
        end

        # In all cases, the latest, greatest version of the record
        # should go into the load
        schedule_new_record(row)
      end

      # Do all the steps required when a SCD has *not* changed. Exact
      # steps depend on what type of SCD we're handling.
      def process_scd_match(row)
        ETL::Engine.logger.debug "SCD fields match"

        if scd_type == 2 && has_non_scd_field_changes?(row)
          ETL::Engine.logger.debug "Non-SCD field changes"
          # Copy important data over from original version of record
          row[primary_key] = @existing_row[primary_key]
          row[scd_end_date_field] = @existing_row[scd_end_date_field]
          row[scd_effective_date_field] = @existing_row[scd_effective_date_field]
          row[scd_latest_version_field] = @existing_row[scd_latest_version_field]

          # If there is no truncate then the row will exist twice in the database
          delete_outdated_record

          buffer << row
        else
          # The record is totally the same, so skip it
        end
      end

      # Find the version of this row that already exists in the datawarehouse.
      def preexisting_row(row)
        q = "SELECT * FROM #{dimension_table} WHERE #{natural_key_equality_for_row(row)}"
        q << " AND #{scd_latest_version_field}" if scd_type == 2

        #puts "looking for original record"
        result = connection.select_one(q)

        #puts "Result: #{result.inspect}"

        result ? ETL::Row[result.symbolize_keys!] : nil
      end

      # Check whether the SCD fields have changed since the last
      # load of this record.
      def has_scd_field_changes?(row)
        scd_fields(row).any? { |scd_field| row[scd_field].to_s != @existing_row[scd_field].to_s }
      end

      # Check whether non-scd fields have changed since the last
      # load of this record.
      def has_non_scd_field_changes?(row)
        non_scd_fields(row).any? { |non_scd_field| row[non_scd_field].to_s != @existing_row[non_scd_field].to_s }
      end

      # Grab, or re-use, a database connection for running queries directly
      # during the destination processing.
      def connection
        @conn ||= ETL::Engine.connection(dimension_target)
      end

      # Utility for removing a row that has outdated information. Note
      # that this deletes directly from the database, even if this is a file
      # destination. It needs to do this because you can't do deletes in a
      # bulk load.
      def delete_outdated_record
        ETL::Engine.logger.debug "deleting old row"

        # NOTE(review): the primary key value is interpolated directly into
        # the SQL. Safe for the usual integer surrogate keys, but consider
        # quoting/sanitizing if non-numeric primary keys are ever used.
        q = "DELETE FROM #{dimension_table} WHERE #{primary_key} = #{@existing_row[primary_key]}"
        connection.delete(q)
      end

      # Schedule the latest, greatest version of the row for insertion
      # into the database
      def schedule_new_record(row)
        ETL::Engine.logger.debug "writing new record"
        if scd_type == 2
          row[scd_effective_date_field] = @timestamp
          row[scd_end_date_field] = '9999-12-31 00:00:00'
          row[scd_latest_version_field] = true
        end
        buffer << row
      end

      # Get the name of the primary key for this table. Asks the dimension
      # model class for this information, but if that class hasn't been
      # defined, just defaults to :id.
      def primary_key
        return @primary_key if @primary_key
        @primary_key = dimension_table.to_s.camelize.constantize.primary_key.to_sym
      rescue NameError
        ETL::Engine.logger.debug "couldn't get primary_key from dimension model class, using default :id"
        @primary_key = :id
      end

    end
  end
end

Dir[File.dirname(__FILE__) + "/destination/*.rb"].each { |file| require(file) }