activewarehouse-etl 0.9.0 → 0.9.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -78,12 +78,11 @@ def execute
78
78
 
79
79
  ETL::Engine.init(options)
80
80
  ARGV.each do |f|
81
- puts "Processing #{f}"
82
81
  ETL::Engine.realtime_activity = true
83
82
  ETL::Engine.process(f)
84
83
  end
85
84
 
86
- puts "ETL process complete"
85
+ puts "ETL process complete\n\n"
87
86
  end
88
87
  end
89
88
 
@@ -34,7 +34,7 @@ module ETL #:nodoc:
34
34
  # to_s + '.ctl', or they may be strings in which case they will be used
35
35
  # as is
36
36
  def depends_on(*args)
37
- dependencies << args
37
+ (dependencies << args).flatten!
38
38
  end
39
39
 
40
40
  # Get the defined dependencies
@@ -115,13 +115,15 @@ module ETL #:nodoc:
115
115
  if transformer
116
116
  case transformer
117
117
  when String, Symbol
118
- class_name = "#{transformer.to_s.classify}Transform"
118
+ class_name = "#{transformer.to_s.camelize}Transform"
119
119
  begin
120
120
  transform_class = ETL::Transform.const_get(class_name)
121
121
  transforms << transform_class.new(self, name, configuration)
122
122
  rescue NameError => e
123
123
  raise ControlError, "Unable to find transformer #{class_name}: #{e}"
124
124
  end
125
+ when Class
126
+ transforms << transformer.new(self, transformer.name, configuration)
125
127
  else
126
128
  #transformer.class.inspect
127
129
  if transformer.is_a?(ETL::Transform::Transform)
@@ -130,7 +132,7 @@ module ETL #:nodoc:
130
132
  t.name = name
131
133
  transforms << t
132
134
  else
133
- raise ControlError, "Transformer must be a String, Symbol or Transform instance"
135
+ raise ControlError, "Transformer must be a String, Symbol, Class or Transform instance"
134
136
  end
135
137
  end
136
138
  elsif block_given?
@@ -145,17 +147,28 @@ module ETL #:nodoc:
145
147
  control.transforms
146
148
  end
147
149
 
148
- # Define a screen block. The type argument must be one of :fatal, :error
149
- # or :warn
150
+ # Define a before post-process screen block. The type argument must be
151
+ # one of :fatal, :error or :warn
150
152
  def screen(type, &block)
151
153
  screens[type] << block
152
154
  end
153
155
 
154
- # Get the screen blocks
156
+ # Get the before post-process screen blocks
155
157
  def screens
156
158
  control.screens
157
159
  end
158
160
 
161
+ # Define an after post-process screen block. The type argument must be
162
+ # one of :fatal, :error or :warn
163
+ def after_post_process_screen(type, &block)
164
+ after_post_process_screens[type] << block
165
+ end
166
+
167
+ # Get the after post-process screen blocks
168
+ def after_post_process_screens
169
+ control.after_post_process_screens
170
+ end
171
+
159
172
  # Rename the source field to the destination field
160
173
  def rename(source, destination)
161
174
  after_read :rename, :source => source, :dest => destination
@@ -169,12 +182,17 @@ module ETL #:nodoc:
169
182
  protected
170
183
  # This method is used to define a processor and insert into the specified processor
171
184
  # collection.
172
- def define_processor(name, processor_collection, configuration)
185
+ def define_processor(name, processor_collection, configuration, proc)
173
186
  case name
174
- when String, Symbol
175
- class_name = "#{name.to_s.classify}Processor"
187
+ when String, Symbol, nil
188
+ name ||= 'block'
189
+ class_name = "#{name.to_s.camelize}Processor"
176
190
  begin
177
191
  processor_class = ETL::Processor.const_get(class_name)
192
+ if name == 'block'
193
+ raise ControlError, "A block must be passed for block processor" if proc.nil?
194
+ configuration[:block] = proc
195
+ end
178
196
  processor_collection << processor_class.new(self, configuration)
179
197
  rescue NameError => e
180
198
  raise ControlError, "Unable to find processor #{class_name}: #{e}"
@@ -182,14 +200,14 @@ module ETL #:nodoc:
182
200
  when Class
183
201
  processor_collection << name.new(self, configuration)
184
202
  else
185
- raise ControlError, "The process declaration requires a String, Symbol or Class"
203
+ raise ControlError, "The process declaration requires a String, Symbol, Class, or a Block to be passed"
186
204
  end
187
205
  end
188
206
 
189
207
  public
190
208
  # Define an "after read" processor. This must be a row-level processor.
191
- def after_read(name, configuration={})
192
- define_processor(name, after_read_processors, configuration)
209
+ def after_read(name='block', configuration={}, &block)
210
+ define_processor(name, after_read_processors, configuration, block)
193
211
  end
194
212
 
195
213
  # Get the defined "after read" processors
@@ -198,8 +216,8 @@ module ETL #:nodoc:
198
216
  end
199
217
 
200
218
  # Define a "before write" processor. This must be a row-level processor.
201
- def before_write(name, configuration={})
202
- define_processor(name, before_write_processors, configuration)
219
+ def before_write(name='block', configuration={}, &block)
220
+ define_processor(name, before_write_processors, configuration, block)
203
221
  end
204
222
 
205
223
  # Get the defined "before write" processors
@@ -208,8 +226,8 @@ module ETL #:nodoc:
208
226
  end
209
227
 
210
228
  # Define a pre-processor
211
- def pre_process(name, configuration={})
212
- define_processor(name, pre_processors, configuration)
229
+ def pre_process(name='block', configuration={}, &block)
230
+ define_processor(name, pre_processors, configuration, block)
213
231
  end
214
232
 
215
233
  # Get the defined pre-processors
@@ -218,8 +236,8 @@ module ETL #:nodoc:
218
236
  end
219
237
 
220
238
  # Define a post-processor
221
- def post_process(name, configuration={})
222
- define_processor(name, post_processors, configuration)
239
+ def post_process(name='block', configuration={}, &block)
240
+ define_processor(name, post_processors, configuration, block)
223
241
  end
224
242
 
225
243
  # Get the defined post-processors
@@ -341,6 +359,7 @@ module ETL #:nodoc:
341
359
  @transforms ||= []
342
360
  end
343
361
 
362
+ # A hash of the screens executed before post-process
344
363
  def screens
345
364
  @screens ||= {
346
365
  :fatal => [],
@@ -349,6 +368,15 @@ module ETL #:nodoc:
349
368
  }
350
369
  end
351
370
 
371
+ # A hash of the screens executed after post-process
372
+ def after_post_process_screens
373
+ @after_post_process_screens ||= {
374
+ :fatal => [],
375
+ :error => [],
376
+ :warn => []
377
+ }
378
+ end
379
+
352
380
  # Get the error threshold. Defaults to 100.
353
381
  def error_threshold
354
382
  @error_threshold ||= 100
@@ -29,7 +29,7 @@ module ETL #:nodoc:
29
29
  # For example if name is :database or 'database' then the
30
30
  # DatabaseDestination class is returned
31
31
  def class_for_name(name)
32
- ETL::Control.const_get("#{name.to_s.classify}Destination")
32
+ ETL::Control.const_get("#{name.to_s.camelize}Destination")
33
33
  end
34
34
  end
35
35
 
@@ -124,10 +124,20 @@ module ETL #:nodoc:
124
124
  @compound_key_constraints ||= {}
125
125
  end
126
126
 
127
- # Return fields which are Slowly Changing Dimension fields. Return nil
128
- # by default.
129
- def scd_fields
130
- @scd_fields ||= configuration[:scd_fields]
127
+ # Return fields which are Slowly Changing Dimension fields.
128
+ # Uses the scd_fields specified in the configuration. If that's
129
+ # missing, uses all of the row's fields.
130
+ def scd_fields(row)
131
+ @scd_fields ||= configuration[:scd_fields] || row.keys
132
+ end
133
+
134
+ def non_scd_fields(row)
135
+ @non_csd_fields ||= row.keys - natural_key - scd_fields(row) -
136
+ [primary_key, scd_effective_date_field, scd_end_date_field, scd_latest_version_field]
137
+ end
138
+
139
+ def non_evolving_fields
140
+ (Array(configuration[:scd][:non_evolving_fields]) << primary_key).uniq
131
141
  end
132
142
 
133
143
  def scd?
@@ -150,19 +160,29 @@ module ETL #:nodoc:
150
160
  configuration[:scd][:end_date_field] || :end_date if scd?
151
161
  end
152
162
 
153
- # Return the natural key field name, defaults to :id
163
+ # Get the Slowly Changing Dimension latest version field. Defaults to
164
+ # 'latest_version'.
165
+ def scd_latest_version_field
166
+ configuration[:scd][:latest_version_field] || :latest_version if scd?
167
+ end
168
+
169
+ # Return the natural key field names, defaults to []
154
170
  def natural_key
155
171
  @natural_key ||= determine_natural_key
156
172
  end
157
173
 
158
174
  # Get the dimension table if specified
159
175
  def dimension_table
160
- ETL::Engine.table(configuration[:scd][:dimension_table], dimension_target) if scd?
176
+ @dimension_table ||= if scd?
177
+ ETL::Engine.table(configuration[:scd][:dimension_table], dimension_target) or raise ConfigurationError, "dimension_table setting required"
178
+ end
161
179
  end
162
180
 
163
181
  # Get the dimension target if specified
164
182
  def dimension_target
165
- configuration[:scd][:dimension_target] if scd?
183
+ @dimension_target ||= if scd?
184
+ configuration[:scd][:dimension_target] or raise ConfigurationError, "dimension_target setting required"
185
+ end
166
186
  end
167
187
 
168
188
  # Process a row to determine the change type
@@ -172,136 +192,27 @@ module ETL #:nodoc:
172
192
 
173
193
  # Change processing can only occur if the natural key exists in the row
174
194
  ETL::Engine.logger.debug "Checking for natural key existence"
175
- if natural_key.length == 0
195
+ unless has_natural_key?(row)
176
196
  buffer << row
177
197
  return
178
198
  end
179
199
 
180
- natural_key.each do |key|
181
- unless row.has_key?(key)
182
- buffer << row
183
- return
184
- end
185
- end
186
-
187
- ETL::Engine.logger.debug "Checking for SCD fields"
188
- s = String.new
189
- if scd_fields
190
- scd_fields.each { |f| s << row[f].to_s }
191
- else
192
- row.each { |key,value| s << value.to_s }
193
- end
194
-
195
- # apply the CRC to 's' and see if it matches the last
196
- # ETL::Execution::Record with the samenatural key. If they match then
197
- # throw away this row (no need to process). If they do not match then
198
- # the record is an 'update'. If the record doesn't exist then it is an
199
- # 'insert'
200
- nk = natural_key.collect{|k|row[k]}.join('|')
201
- require 'zlib'
202
- crc = Zlib.crc32(s)
203
- record = ETL::Execution::Record.find_by_control_file_and_natural_key(control.file, nk)
204
-
205
- timestamp = Time.now
206
-
207
- ETL::Engine.logger.debug "Checking record change type"
208
- if record
209
- if record.crc != crc.to_s
210
- # SCD Type 1: only the new row should be added
211
- # SCD Type 2: both an old and new row should be added
212
- # SCD Type 3: not supported
213
- ETL::Engine.logger.debug "CRC does not match"
214
-
215
- if scd_type == 2
216
- ETL::Engine.logger.debug "type 2 SCD"
217
-
218
- raise ConfigurationError, "dimension_table setting required" unless dimension_table
219
- raise ConfigurationError, "dimension_target setting required" unless dimension_target
220
-
221
- conn = ETL::Engine.connection(dimension_target)
222
-
223
- q = "SELECT * FROM #{dimension_table} WHERE "
224
- q << natural_key.collect { |nk| "#{nk} = '#{row[nk]}'" }.join(" AND ")
225
- #puts "looking for original record"
226
- result = conn.select_one(q)
227
- if result
228
- #puts "Result: #{result.inspect}"
229
- original_record = ETL::Row[result.symbolize_keys!]
230
- original_record[scd_end_date_field] = timestamp
231
- ETL::Engine.logger.debug "writing original record"
232
-
233
- # if there is no truncate then the row will exist twice in the database
234
- # need to figure out how to delete that old record before inserting the
235
- # updated version of the record
236
-
237
- q = "DELETE FROM #{dimension_table} WHERE "
238
- q << natural_key.collect { |nk| "#{nk} = '#{row[nk]}'" }.join(" AND ")
239
-
240
- num_rows_affected = conn.delete(q)
241
- ETL::Engine.logger.debug "deleted old row"
242
-
243
- # do this?
244
- #raise "Should have deleted a single record" if num_rows_affected != 1
245
-
246
- buffer << original_record
247
- end
248
-
249
- row[scd_effective_date_field] = timestamp
250
- row[scd_end_date_field] = '9999-12-31 00:00:00'
251
- elsif scd_type == 1
252
- ETL::Engine.logger.debug "type 1 SCD"
253
- else
254
- ETL::Engine.logger.debug "SCD not specified"
255
- end
256
-
257
- ETL::Engine.logger.debug "writing new record"
258
- buffer << row
200
+ @timestamp = Time.now
201
+
202
+ # See if the scd_fields of the current record have changed
203
+ # from the last time this record was loaded into the data
204
+ # warehouse. If they match then throw away this row (no need
205
+ # to process). If they do not match then the record is an
206
+ # 'update'. If the record doesn't exist then it is an 'insert'
207
+ ETL::Engine.logger.debug "Checking record for SCD change"
208
+ if @existing_row = preexisting_row(row)
209
+ if has_scd_field_changes?(row)
210
+ process_scd_change(row)
259
211
  else
260
- ETL::Engine.logger.debug "CRC matches, skipping"
261
-
262
- raise ConfigurationError, "dimension_table setting required" unless dimension_table
263
- raise ConfigurationError, "dimension_target setting required" unless dimension_target
264
-
265
- conn = ETL::Engine.connection(dimension_target)
266
-
267
- q = "SELECT * FROM #{dimension_table} WHERE "
268
- q << natural_key.collect { |nk| "#{nk} = '#{row[nk]}'" }.join(" AND ")
269
- result = conn.select_one(q)
270
- if result
271
- # This was necessary when truncating and then loading, however I
272
- # am getting reluctant to having the ETL process do the truncation
273
- # as part of the bulk load, favoring using a preprocessor instead.
274
- # buffer << ETL::Row[result.symbolize_keys!]
275
- else
276
- # The record never made it into the database, so add the effective and end date
277
- # and add it into the bulk load file
278
- row[scd_effective_date_field] = timestamp
279
- row[scd_end_date_field] = '9999-12-31 00:00:00'
280
- buffer << row
281
- end
212
+ process_scd_match(row)
282
213
  end
283
214
  else
284
- ETL::Engine.logger.debug "record never loaded"
285
- # Set the effective and end date fields
286
- if scd_type == 2
287
- row[scd_effective_date_field] = timestamp
288
- row[scd_end_date_field] = '9999-12-31 00:00:00'
289
- end
290
-
291
- # Write the row
292
- buffer << row
293
-
294
- # Record the record
295
- if ETL::Engine.job # only record the execution if there is a job
296
- ETL::Execution::Record.time_spent += Benchmark.realtime do
297
- ETL::Execution::Record.create!(
298
- :control_file => control.file,
299
- :natural_key => nk,
300
- :crc => crc,
301
- :job_id => ETL::Engine.job.id
302
- )
303
- end
304
- end
215
+ schedule_new_record(row)
305
216
  end
306
217
  end
307
218
 
@@ -316,6 +227,8 @@ module ETL #:nodoc:
316
227
  def add_virtuals!(row)
317
228
  if mapping[:virtual]
318
229
  mapping[:virtual].each do |key,value|
230
+ # If the row already has the virtual set, assume that's correct
231
+ next if row[key]
319
232
  # Engine.logger.debug "Mapping virtual #{key}/#{value} for row #{row}"
320
233
  case value
321
234
  when Class
@@ -338,18 +251,168 @@ module ETL #:nodoc:
338
251
  end
339
252
 
340
253
  private
254
+
341
255
  # Determine the natural key. This method will always return an array
342
- # of symbols. The default value is [:id].
256
+ # of symbols. The default value is [].
343
257
  def determine_natural_key
344
- case configuration[:natural_key]
345
- when Array
346
- configuration[:natural_key].collect(&:to_sym)
347
- when String, Symbol
348
- [configuration[:natural_key].to_sym]
258
+ Array(configuration[:natural_key]).collect(&:to_sym)
259
+ end
260
+
261
+ # Check whether a natural key has been defined, and if so, whether
262
+ # this row has enough information to do searches based on that natural
263
+ # key.
264
+ #
265
+ # TODO: This should be factored out into
266
+ # ETL::Row#has_all_fields?(field_array) But that's not possible
267
+ # until *all* sources cast to ETL::Row, instead of sometimes
268
+ # using Hash
269
+ def has_natural_key?(row)
270
+ natural_key.any? && natural_key.all? { |key| row.has_key?(key) }
271
+ end
272
+
273
+ # Helper for generating the SQL where clause that allows searching
274
+ # by a natural key
275
+ def natural_key_equality_for_row(row)
276
+ statement = []
277
+ values = []
278
+ natural_key.each do |nk|
279
+ statement << "#{nk} = ?"
280
+ values << row[nk]
281
+ end
282
+ statement = statement.join(" AND ")
283
+ ActiveRecord::Base.send(:sanitize_sql, [statement, *values])
284
+ end
285
+
286
+ # Do all the steps required when a SCD *has* changed. Exact steps
287
+ # depend on what type of SCD we're handling.
288
+ def process_scd_change(row)
289
+ ETL::Engine.logger.debug "SCD fields do not match"
290
+
291
+ if scd_type == 2
292
+ # SCD Type 2: new row should be added and old row should be updated
293
+ ETL::Engine.logger.debug "type 2 SCD"
294
+
295
+ # To update the old row, we delete the version in the database
296
+ # and insert a new expired version
297
+
298
+ # If there is no truncate then the row will exist twice in the database
299
+ delete_outdated_record
300
+
301
+ ETL::Engine.logger.debug "expiring original record"
302
+ @existing_row[scd_end_date_field] = @timestamp
303
+ @existing_row[scd_latest_version_field] = false
304
+
305
+ buffer << @existing_row
306
+
307
+ elsif scd_type == 1
308
+ # SCD Type 1: only the new row should be added
309
+ ETL::Engine.logger.debug "type 1 SCD"
310
+
311
+ # Copy primary key, and other non-evolving fields over from
312
+ # original version of record
313
+ non_evolving_fields.each do |non_evolving_field|
314
+ row[non_evolving_field] = @existing_row[non_evolving_field]
315
+ end
316
+
317
+ # If there is no truncate then the row will exist twice in the database
318
+ delete_outdated_record
349
319
  else
350
- [] # no natural key defined
320
+ # SCD Type 3: not supported
321
+ ETL::Engine.logger.debug "SCD type #{scd_type} not supported"
351
322
  end
323
+
324
+ # In all cases, the latest, greatest version of the record
325
+ # should go into the load
326
+ schedule_new_record(row)
352
327
  end
328
+
329
+ # Do all the steps required when a SCD has *not* changed. Exact
330
+ # steps depend on what type of SCD we're handling.
331
+ def process_scd_match(row)
332
+ ETL::Engine.logger.debug "SCD fields match"
333
+
334
+ if scd_type == 2 && has_non_scd_field_changes?(row)
335
+ ETL::Engine.logger.debug "Non-SCD field changes"
336
+ # Copy important data over from original version of record
337
+ row[primary_key] = @existing_row[primary_key]
338
+ row[scd_end_date_field] = @existing_row[scd_end_date_field]
339
+ row[scd_effective_date_field] = @existing_row[scd_effective_date_field]
340
+ row[scd_latest_version_field] = @existing_row[scd_latest_version_field]
341
+
342
+ # If there is no truncate then the row will exist twice in the database
343
+ delete_outdated_record
344
+
345
+ buffer << row
346
+ else
347
+ # The record is totally the same, so skip it
348
+ end
349
+ end
350
+
351
+ # Find the version of this row that already exists in the datawarehouse.
352
+ def preexisting_row(row)
353
+ q = "SELECT * FROM #{dimension_table} WHERE #{natural_key_equality_for_row(row)}"
354
+ q << " AND #{scd_latest_version_field}" if scd_type == 2
355
+
356
+ #puts "looking for original record"
357
+ result = connection.select_one(q)
358
+
359
+ #puts "Result: #{result.inspect}"
360
+
361
+ result ? ETL::Row[result.symbolize_keys!] : nil
362
+ end
363
+
364
+ # Check whether scd fields have changed since the last
365
+ # load of this record.
366
+ def has_scd_field_changes?(row)
367
+ scd_fields(row).any? { |csd_field| row[csd_field].to_s != @existing_row[csd_field].to_s }
368
+ end
369
+
370
+ # Check whether non-scd fields have changed since the last
371
+ # load of this record.
372
+ def has_non_scd_field_changes?(row)
373
+ non_scd_fields(row).any? { |non_csd_field| row[non_csd_field].to_s != @existing_row[non_csd_field].to_s }
374
+ end
375
+
376
+ # Grab, or re-use, a database connection for running queries directly
377
+ # during the destination processing.
378
+ def connection
379
+ @conn ||= ETL::Engine.connection(dimension_target)
380
+ end
381
+
382
+ # Utility for removing a row that has outdated information. Note
383
+ # that this deletes directly from the database, even if this is a file
384
+ # destination. It needs to do this because you can't do deletes in a
385
+ # bulk load.
386
+ def delete_outdated_record
387
+ ETL::Engine.logger.debug "deleting old row"
388
+
389
+ q = "DELETE FROM #{dimension_table} WHERE #{primary_key} = #{@existing_row[primary_key]}"
390
+ connection.delete(q)
391
+ end
392
+
393
+ # Schedule the latest, greatest version of the row for insertion
394
+ # into the database
395
+ def schedule_new_record(row)
396
+ ETL::Engine.logger.debug "writing new record"
397
+ if scd_type == 2
398
+ row[scd_effective_date_field] = @timestamp
399
+ row[scd_end_date_field] = '9999-12-31 00:00:00'
400
+ row[scd_latest_version_field] = true
401
+ end
402
+ buffer << row
403
+ end
404
+
405
+ # Get the name of the primary key for this table. Asks the dimension
406
+ # model class for this information, but if that class hasn't been
407
+ # defined, just defaults to :id.
408
+ def primary_key
409
+ return @primary_key if @primary_key
410
+ @primary_key = dimension_table.to_s.camelize.constantize.primary_key.to_sym
411
+ rescue NameError => e
412
+ ETL::Engine.logger.debug "couldn't get primary_key from dimension model class, using default :id"
413
+ @primary_key = :id
414
+ end
415
+
353
416
  end
354
417
  end
355
418
  end