activewarehouse-etl 0.9.0 → 0.9.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -78,12 +78,11 @@ def execute
78
78
 
79
79
  ETL::Engine.init(options)
80
80
  ARGV.each do |f|
81
- puts "Processing #{f}"
82
81
  ETL::Engine.realtime_activity = true
83
82
  ETL::Engine.process(f)
84
83
  end
85
84
 
86
- puts "ETL process complete"
85
+ puts "ETL process complete\n\n"
87
86
  end
88
87
  end
89
88
 
@@ -34,7 +34,7 @@ module ETL #:nodoc:
34
34
  # to_s + '.ctl', or they may be strings in which case they will be used
35
35
  # as is
36
36
  def depends_on(*args)
37
- dependencies << args
37
+ (dependencies << args).flatten!
38
38
  end
39
39
 
40
40
  # Get the defined dependencies
@@ -115,13 +115,15 @@ module ETL #:nodoc:
115
115
  if transformer
116
116
  case transformer
117
117
  when String, Symbol
118
- class_name = "#{transformer.to_s.classify}Transform"
118
+ class_name = "#{transformer.to_s.camelize}Transform"
119
119
  begin
120
120
  transform_class = ETL::Transform.const_get(class_name)
121
121
  transforms << transform_class.new(self, name, configuration)
122
122
  rescue NameError => e
123
123
  raise ControlError, "Unable to find transformer #{class_name}: #{e}"
124
124
  end
125
+ when Class
126
+ transforms << transformer.new(self, transformer.name, configuration)
125
127
  else
126
128
  #transformer.class.inspect
127
129
  if transformer.is_a?(ETL::Transform::Transform)
@@ -130,7 +132,7 @@ module ETL #:nodoc:
130
132
  t.name = name
131
133
  transforms << t
132
134
  else
133
- raise ControlError, "Transformer must be a String, Symbol or Transform instance"
135
+ raise ControlError, "Transformer must be a String, Symbol, Class or Transform instance"
134
136
  end
135
137
  end
136
138
  elsif block_given?
@@ -145,17 +147,28 @@ module ETL #:nodoc:
145
147
  control.transforms
146
148
  end
147
149
 
148
- # Define a screen block. The type argument must be one of :fatal, :error
149
- # or :warn
150
+ # Define a before post-process screen block. The type argument must be
151
+ # one of :fatal, :error or :warn
150
152
  def screen(type, &block)
151
153
  screens[type] << block
152
154
  end
153
155
 
154
- # Get the screen blocks
156
+ # Get the before post-process screen blocks
155
157
  def screens
156
158
  control.screens
157
159
  end
158
160
 
161
+ # Define an after post-process screen block. The type argument must be
162
+ # one of :fatal, :error or :warn
163
+ def after_post_process_screen(type, &block)
164
+ after_post_process_screens[type] << block
165
+ end
166
+
167
+ # Get the after post-process screen blocks
168
+ def after_post_process_screens
169
+ control.after_post_process_screens
170
+ end
171
+
159
172
  # Rename the source field to the destination field
160
173
  def rename(source, destination)
161
174
  after_read :rename, :source => source, :dest => destination
@@ -169,12 +182,17 @@ module ETL #:nodoc:
169
182
  protected
170
183
  # This method is used to define a processor and insert into the specified processor
171
184
  # collection.
172
- def define_processor(name, processor_collection, configuration)
185
+ def define_processor(name, processor_collection, configuration, proc)
173
186
  case name
174
- when String, Symbol
175
- class_name = "#{name.to_s.classify}Processor"
187
+ when String, Symbol, nil
188
+ name ||= 'block'
189
+ class_name = "#{name.to_s.camelize}Processor"
176
190
  begin
177
191
  processor_class = ETL::Processor.const_get(class_name)
192
+ if name == 'block'
193
+ raise ControlError, "A block must be passed for block processor" if proc.nil?
194
+ configuration[:block] = proc
195
+ end
178
196
  processor_collection << processor_class.new(self, configuration)
179
197
  rescue NameError => e
180
198
  raise ControlError, "Unable to find processor #{class_name}: #{e}"
@@ -182,14 +200,14 @@ module ETL #:nodoc:
182
200
  when Class
183
201
  processor_collection << name.new(self, configuration)
184
202
  else
185
- raise ControlError, "The process declaration requires a String, Symbol or Class"
203
+ raise ControlError, "The process declaration requires a String, Symbol, Class, or a Block to be passed"
186
204
  end
187
205
  end
188
206
 
189
207
  public
190
208
  # Define an "after read" processor. This must be a row-level processor.
191
- def after_read(name, configuration={})
192
- define_processor(name, after_read_processors, configuration)
209
+ def after_read(name='block', configuration={}, &block)
210
+ define_processor(name, after_read_processors, configuration, block)
193
211
  end
194
212
 
195
213
  # Get the defined "after read" processors
@@ -198,8 +216,8 @@ module ETL #:nodoc:
198
216
  end
199
217
 
200
218
  # Define a "before write" processor. This must be a row-level processor.
201
- def before_write(name, configuration={})
202
- define_processor(name, before_write_processors, configuration)
219
+ def before_write(name='block', configuration={}, &block)
220
+ define_processor(name, before_write_processors, configuration, block)
203
221
  end
204
222
 
205
223
  # Get the defined "before write" processors
@@ -208,8 +226,8 @@ module ETL #:nodoc:
208
226
  end
209
227
 
210
228
  # Define a pre-processor
211
- def pre_process(name, configuration={})
212
- define_processor(name, pre_processors, configuration)
229
+ def pre_process(name='block', configuration={}, &block)
230
+ define_processor(name, pre_processors, configuration, block)
213
231
  end
214
232
 
215
233
  # Get the defined pre-processors
@@ -218,8 +236,8 @@ module ETL #:nodoc:
218
236
  end
219
237
 
220
238
  # Define a post-processor
221
- def post_process(name, configuration={})
222
- define_processor(name, post_processors, configuration)
239
+ def post_process(name='block', configuration={}, &block)
240
+ define_processor(name, post_processors, configuration, block)
223
241
  end
224
242
 
225
243
  # Get the defined post-processors
@@ -341,6 +359,7 @@ module ETL #:nodoc:
341
359
  @transforms ||= []
342
360
  end
343
361
 
362
+ # A hash of the screens executed before post-process
344
363
  def screens
345
364
  @screens ||= {
346
365
  :fatal => [],
@@ -349,6 +368,15 @@ module ETL #:nodoc:
349
368
  }
350
369
  end
351
370
 
371
+ # A hash of the screens executed after post-process
372
+ def after_post_process_screens
373
+ @after_post_process_screens ||= {
374
+ :fatal => [],
375
+ :error => [],
376
+ :warn => []
377
+ }
378
+ end
379
+
352
380
  # Get the error threshold. Defaults to 100.
353
381
  def error_threshold
354
382
  @error_threshold ||= 100
@@ -29,7 +29,7 @@ module ETL #:nodoc:
29
29
  # For example if name is :database or 'database' then the
30
30
  # DatabaseDestination class is returned
31
31
  def class_for_name(name)
32
- ETL::Control.const_get("#{name.to_s.classify}Destination")
32
+ ETL::Control.const_get("#{name.to_s.camelize}Destination")
33
33
  end
34
34
  end
35
35
 
@@ -124,10 +124,20 @@ module ETL #:nodoc:
124
124
  @compound_key_constraints ||= {}
125
125
  end
126
126
 
127
- # Return fields which are Slowly Changing Dimension fields. Return nil
128
- # by default.
129
- def scd_fields
130
- @scd_fields ||= configuration[:scd_fields]
127
+ # Return fields which are Slowly Changing Dimension fields.
128
+ # Uses the scd_fields specified in the configuration. If that's
129
+ # missing, uses all of the row's fields.
130
+ def scd_fields(row)
131
+ @scd_fields ||= configuration[:scd_fields] || row.keys
132
+ end
133
+
134
+ def non_scd_fields(row)
135
+ @non_csd_fields ||= row.keys - natural_key - scd_fields(row) -
136
+ [primary_key, scd_effective_date_field, scd_end_date_field, scd_latest_version_field]
137
+ end
138
+
139
+ def non_evolving_fields
140
+ (Array(configuration[:scd][:non_evolving_fields]) << primary_key).uniq
131
141
  end
132
142
 
133
143
  def scd?
@@ -150,19 +160,29 @@ module ETL #:nodoc:
150
160
  configuration[:scd][:end_date_field] || :end_date if scd?
151
161
  end
152
162
 
153
- # Return the natural key field name, defaults to :id
163
+ # Get the Slowly Changing Dimension latest version field. Defaults to
164
+ # 'latest_version'.
165
+ def scd_latest_version_field
166
+ configuration[:scd][:latest_version_field] || :latest_version if scd?
167
+ end
168
+
169
+ # Return the natural key field names, defaults to []
154
170
  def natural_key
155
171
  @natural_key ||= determine_natural_key
156
172
  end
157
173
 
158
174
  # Get the dimension table if specified
159
175
  def dimension_table
160
- ETL::Engine.table(configuration[:scd][:dimension_table], dimension_target) if scd?
176
+ @dimension_table ||= if scd?
177
+ ETL::Engine.table(configuration[:scd][:dimension_table], dimension_target) or raise ConfigurationError, "dimension_table setting required"
178
+ end
161
179
  end
162
180
 
163
181
  # Get the dimension target if specified
164
182
  def dimension_target
165
- configuration[:scd][:dimension_target] if scd?
183
+ @dimension_target ||= if scd?
184
+ configuration[:scd][:dimension_target] or raise ConfigurationError, "dimension_target setting required"
185
+ end
166
186
  end
167
187
 
168
188
  # Process a row to determine the change type
@@ -172,136 +192,27 @@ module ETL #:nodoc:
172
192
 
173
193
  # Change processing can only occur if the natural key exists in the row
174
194
  ETL::Engine.logger.debug "Checking for natural key existence"
175
- if natural_key.length == 0
195
+ unless has_natural_key?(row)
176
196
  buffer << row
177
197
  return
178
198
  end
179
199
 
180
- natural_key.each do |key|
181
- unless row.has_key?(key)
182
- buffer << row
183
- return
184
- end
185
- end
186
-
187
- ETL::Engine.logger.debug "Checking for SCD fields"
188
- s = String.new
189
- if scd_fields
190
- scd_fields.each { |f| s << row[f].to_s }
191
- else
192
- row.each { |key,value| s << value.to_s }
193
- end
194
-
195
- # apply the CRC to 's' and see if it matches the last
196
- # ETL::Execution::Record with the same natural key. If they match then
197
- # throw away this row (no need to process). If they do not match then
198
- # the record is an 'update'. If the record doesn't exist then it is an
199
- # 'insert'
200
- nk = natural_key.collect{|k|row[k]}.join('|')
201
- require 'zlib'
202
- crc = Zlib.crc32(s)
203
- record = ETL::Execution::Record.find_by_control_file_and_natural_key(control.file, nk)
204
-
205
- timestamp = Time.now
206
-
207
- ETL::Engine.logger.debug "Checking record change type"
208
- if record
209
- if record.crc != crc.to_s
210
- # SCD Type 1: only the new row should be added
211
- # SCD Type 2: both an old and new row should be added
212
- # SCD Type 3: not supported
213
- ETL::Engine.logger.debug "CRC does not match"
214
-
215
- if scd_type == 2
216
- ETL::Engine.logger.debug "type 2 SCD"
217
-
218
- raise ConfigurationError, "dimension_table setting required" unless dimension_table
219
- raise ConfigurationError, "dimension_target setting required" unless dimension_target
220
-
221
- conn = ETL::Engine.connection(dimension_target)
222
-
223
- q = "SELECT * FROM #{dimension_table} WHERE "
224
- q << natural_key.collect { |nk| "#{nk} = '#{row[nk]}'" }.join(" AND ")
225
- #puts "looking for original record"
226
- result = conn.select_one(q)
227
- if result
228
- #puts "Result: #{result.inspect}"
229
- original_record = ETL::Row[result.symbolize_keys!]
230
- original_record[scd_end_date_field] = timestamp
231
- ETL::Engine.logger.debug "writing original record"
232
-
233
- # if there is no truncate then the row will exist twice in the database
234
- # need to figure out how to delete that old record before inserting the
235
- # updated version of the record
236
-
237
- q = "DELETE FROM #{dimension_table} WHERE "
238
- q << natural_key.collect { |nk| "#{nk} = '#{row[nk]}'" }.join(" AND ")
239
-
240
- num_rows_affected = conn.delete(q)
241
- ETL::Engine.logger.debug "deleted old row"
242
-
243
- # do this?
244
- #raise "Should have deleted a single record" if num_rows_affected != 1
245
-
246
- buffer << original_record
247
- end
248
-
249
- row[scd_effective_date_field] = timestamp
250
- row[scd_end_date_field] = '9999-12-31 00:00:00'
251
- elsif scd_type == 1
252
- ETL::Engine.logger.debug "type 1 SCD"
253
- else
254
- ETL::Engine.logger.debug "SCD not specified"
255
- end
256
-
257
- ETL::Engine.logger.debug "writing new record"
258
- buffer << row
200
+ @timestamp = Time.now
201
+
202
+ # See if the scd_fields of the current record have changed
203
+ # from the last time this record was loaded into the data
204
+ # warehouse. If they match then throw away this row (no need
205
+ # to process). If they do not match then the record is an
206
+ # 'update'. If the record doesn't exist then it is an 'insert'
207
+ ETL::Engine.logger.debug "Checking record for SCD change"
208
+ if @existing_row = preexisting_row(row)
209
+ if has_scd_field_changes?(row)
210
+ process_scd_change(row)
259
211
  else
260
- ETL::Engine.logger.debug "CRC matches, skipping"
261
-
262
- raise ConfigurationError, "dimension_table setting required" unless dimension_table
263
- raise ConfigurationError, "dimension_target setting required" unless dimension_target
264
-
265
- conn = ETL::Engine.connection(dimension_target)
266
-
267
- q = "SELECT * FROM #{dimension_table} WHERE "
268
- q << natural_key.collect { |nk| "#{nk} = '#{row[nk]}'" }.join(" AND ")
269
- result = conn.select_one(q)
270
- if result
271
- # This was necessary when truncating and then loading, however I
272
- # am getting reluctant to having the ETL process do the truncation
273
- # as part of the bulk load, favoring using a preprocessor instead.
274
- # buffer << ETL::Row[result.symbolize_keys!]
275
- else
276
- # The record never made it into the database, so add the effective and end date
277
- # and add it into the bulk load file
278
- row[scd_effective_date_field] = timestamp
279
- row[scd_end_date_field] = '9999-12-31 00:00:00'
280
- buffer << row
281
- end
212
+ process_scd_match(row)
282
213
  end
283
214
  else
284
- ETL::Engine.logger.debug "record never loaded"
285
- # Set the effective and end date fields
286
- if scd_type == 2
287
- row[scd_effective_date_field] = timestamp
288
- row[scd_end_date_field] = '9999-12-31 00:00:00'
289
- end
290
-
291
- # Write the row
292
- buffer << row
293
-
294
- # Record the record
295
- if ETL::Engine.job # only record the execution if there is a job
296
- ETL::Execution::Record.time_spent += Benchmark.realtime do
297
- ETL::Execution::Record.create!(
298
- :control_file => control.file,
299
- :natural_key => nk,
300
- :crc => crc,
301
- :job_id => ETL::Engine.job.id
302
- )
303
- end
304
- end
215
+ schedule_new_record(row)
305
216
  end
306
217
  end
307
218
 
@@ -316,6 +227,8 @@ module ETL #:nodoc:
316
227
  def add_virtuals!(row)
317
228
  if mapping[:virtual]
318
229
  mapping[:virtual].each do |key,value|
230
+ # If the row already has the virtual set, assume that's correct
231
+ next if row[key]
319
232
  # Engine.logger.debug "Mapping virtual #{key}/#{value} for row #{row}"
320
233
  case value
321
234
  when Class
@@ -338,18 +251,168 @@ module ETL #:nodoc:
338
251
  end
339
252
 
340
253
  private
254
+
341
255
  # Determine the natural key. This method will always return an array
342
- # of symbols. The default value is [:id].
256
+ # of symbols. The default value is [].
343
257
  def determine_natural_key
344
- case configuration[:natural_key]
345
- when Array
346
- configuration[:natural_key].collect(&:to_sym)
347
- when String, Symbol
348
- [configuration[:natural_key].to_sym]
258
+ Array(configuration[:natural_key]).collect(&:to_sym)
259
+ end
260
+
261
+ # Check whether a natural key has been defined, and if so, whether
262
+ # this row has enough information to do searches based on that natural
263
+ # key.
264
+ #
265
+ # TODO: This should be factored out into
266
+ # ETL::Row#has_all_fields?(field_array) But that's not possible
267
+ # until *all* sources cast to ETL::Row, instead of sometimes
268
+ # using Hash
269
+ def has_natural_key?(row)
270
+ natural_key.any? && natural_key.all? { |key| row.has_key?(key) }
271
+ end
272
+
273
+ # Helper for generating the SQL where clause that allows searching
274
+ # by a natural key
275
+ def natural_key_equality_for_row(row)
276
+ statement = []
277
+ values = []
278
+ natural_key.each do |nk|
279
+ statement << "#{nk} = ?"
280
+ values << row[nk]
281
+ end
282
+ statement = statement.join(" AND ")
283
+ ActiveRecord::Base.send(:sanitize_sql, [statement, *values])
284
+ end
285
+
286
+ # Do all the steps required when a SCD *has* changed. Exact steps
287
+ # depend on what type of SCD we're handling.
288
+ def process_scd_change(row)
289
+ ETL::Engine.logger.debug "SCD fields do not match"
290
+
291
+ if scd_type == 2
292
+ # SCD Type 2: new row should be added and old row should be updated
293
+ ETL::Engine.logger.debug "type 2 SCD"
294
+
295
+ # To update the old row, we delete the version in the database
296
+ # and insert a new expired version
297
+
298
+ # If there is no truncate then the row will exist twice in the database
299
+ delete_outdated_record
300
+
301
+ ETL::Engine.logger.debug "expiring original record"
302
+ @existing_row[scd_end_date_field] = @timestamp
303
+ @existing_row[scd_latest_version_field] = false
304
+
305
+ buffer << @existing_row
306
+
307
+ elsif scd_type == 1
308
+ # SCD Type 1: only the new row should be added
309
+ ETL::Engine.logger.debug "type 1 SCD"
310
+
311
+ # Copy primary key, and other non-evolving fields over from
312
+ # original version of record
313
+ non_evolving_fields.each do |non_evolving_field|
314
+ row[non_evolving_field] = @existing_row[non_evolving_field]
315
+ end
316
+
317
+ # If there is no truncate then the row will exist twice in the database
318
+ delete_outdated_record
349
319
  else
350
- [] # no natural key defined
320
+ # SCD Type 3: not supported
321
+ ETL::Engine.logger.debug "SCD type #{scd_type} not supported"
351
322
  end
323
+
324
+ # In all cases, the latest, greatest version of the record
325
+ # should go into the load
326
+ schedule_new_record(row)
352
327
  end
328
+
329
+ # Do all the steps required when a SCD has *not* changed. Exact
330
+ # steps depend on what type of SCD we're handling.
331
+ def process_scd_match(row)
332
+ ETL::Engine.logger.debug "SCD fields match"
333
+
334
+ if scd_type == 2 && has_non_scd_field_changes?(row)
335
+ ETL::Engine.logger.debug "Non-SCD field changes"
336
+ # Copy important data over from original version of record
337
+ row[primary_key] = @existing_row[primary_key]
338
+ row[scd_end_date_field] = @existing_row[scd_end_date_field]
339
+ row[scd_effective_date_field] = @existing_row[scd_effective_date_field]
340
+ row[scd_latest_version_field] = @existing_row[scd_latest_version_field]
341
+
342
+ # If there is no truncate then the row will exist twice in the database
343
+ delete_outdated_record
344
+
345
+ buffer << row
346
+ else
347
+ # The record is totally the same, so skip it
348
+ end
349
+ end
350
+
351
+ # Find the version of this row that already exists in the datawarehouse.
352
+ def preexisting_row(row)
353
+ q = "SELECT * FROM #{dimension_table} WHERE #{natural_key_equality_for_row(row)}"
354
+ q << " AND #{scd_latest_version_field}" if scd_type == 2
355
+
356
+ #puts "looking for original record"
357
+ result = connection.select_one(q)
358
+
359
+ #puts "Result: #{result.inspect}"
360
+
361
+ result ? ETL::Row[result.symbolize_keys!] : nil
362
+ end
363
+
364
+ # Check whether scd fields have changed since the last
365
+ # load of this record.
366
+ def has_scd_field_changes?(row)
367
+ scd_fields(row).any? { |csd_field| row[csd_field].to_s != @existing_row[csd_field].to_s }
368
+ end
369
+
370
+ # Check whether non-scd fields have changed since the last
371
+ # load of this record.
372
+ def has_non_scd_field_changes?(row)
373
+ non_scd_fields(row).any? { |non_csd_field| row[non_csd_field].to_s != @existing_row[non_csd_field].to_s }
374
+ end
375
+
376
+ # Grab, or re-use, a database connection for running queries directly
377
+ # during the destination processing.
378
+ def connection
379
+ @conn ||= ETL::Engine.connection(dimension_target)
380
+ end
381
+
382
+ # Utility for removing a row that has outdated information. Note
383
+ # that this deletes directly from the database, even if this is a file
384
+ # destination. It needs to do this because you can't do deletes in a
385
+ # bulk load.
386
+ def delete_outdated_record
387
+ ETL::Engine.logger.debug "deleting old row"
388
+
389
+ q = "DELETE FROM #{dimension_table} WHERE #{primary_key} = #{@existing_row[primary_key]}"
390
+ connection.delete(q)
391
+ end
392
+
393
+ # Schedule the latest, greatest version of the row for insertion
394
+ # into the database
395
+ def schedule_new_record(row)
396
+ ETL::Engine.logger.debug "writing new record"
397
+ if scd_type == 2
398
+ row[scd_effective_date_field] = @timestamp
399
+ row[scd_end_date_field] = '9999-12-31 00:00:00'
400
+ row[scd_latest_version_field] = true
401
+ end
402
+ buffer << row
403
+ end
404
+
405
+ # Get the name of the primary key for this table. Asks the dimension
406
+ # model class for this information, but if that class hasn't been
407
+ # defined, just defaults to :id.
408
+ def primary_key
409
+ return @primary_key if @primary_key
410
+ @primary_key = dimension_table.to_s.camelize.constantize.primary_key.to_sym
411
+ rescue NameError => e
412
+ ETL::Engine.logger.debug "couldn't get primary_key from dimension model class, using default :id"
413
+ @primary_key = :id
414
+ end
415
+
353
416
  end
354
417
  end
355
418
  end