activewarehouse-etl 0.9.0 → 0.9.1
This diff covers the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
- data/CHANGELOG +22 -2
- data/README +12 -0
- data/Rakefile +64 -59
- data/bin/etl +0 -0
- data/bin/etl.cmd +8 -0
- data/examples/database.example.yml +11 -1
- data/lib/etl.rb +9 -21
- data/lib/etl/builder.rb +2 -1
- data/lib/etl/builder/date_dimension_builder.rb +67 -54
- data/lib/etl/builder/time_dimension_builder.rb +31 -0
- data/lib/etl/commands/etl.rb +1 -2
- data/lib/etl/control/control.rb +46 -18
- data/lib/etl/control/destination.rb +201 -138
- data/lib/etl/control/destination/database_destination.rb +10 -5
- data/lib/etl/control/source.rb +1 -1
- data/lib/etl/control/source/database_source.rb +8 -10
- data/lib/etl/core_ext/time/calculations.rb +4 -2
- data/lib/etl/engine.rb +35 -10
- data/lib/etl/execution/migration.rb +21 -9
- data/lib/etl/generator/generator.rb +1 -1
- data/lib/etl/http_tools.rb +21 -7
- data/lib/etl/parser/apache_combined_log_parser.rb +3 -1
- data/lib/etl/parser/delimited_parser.rb +1 -1
- data/lib/etl/parser/parser.rb +1 -1
- data/lib/etl/processor/block_processor.rb +14 -0
- data/lib/etl/processor/bulk_import_processor.rb +5 -1
- data/lib/etl/processor/check_exist_processor.rb +1 -0
- data/lib/etl/processor/encode_processor.rb +55 -0
- data/lib/etl/transform/date_to_string_transform.rb +1 -0
- data/lib/etl/transform/foreign_key_lookup_transform.rb +67 -2
- data/lib/etl/transform/string_to_date_transform.rb +6 -1
- data/lib/etl/transform/string_to_datetime_transform.rb +1 -1
- data/lib/etl/transform/string_to_time_transform.rb +1 -1
- data/lib/etl/version.rb +1 -1
- metadata +94 -78
data/lib/etl/commands/etl.rb
CHANGED
@@ -78,12 +78,11 @@ def execute
 
       ETL::Engine.init(options)
       ARGV.each do |f|
-        puts "Processing #{f}"
         ETL::Engine.realtime_activity = true
         ETL::Engine.process(f)
       end
 
-      puts "ETL process complete"
+      puts "ETL process complete\n\n"
     end
   end
 
data/lib/etl/control/control.rb
CHANGED
@@ -34,7 +34,7 @@ module ETL #:nodoc:
       # to_s + '.ctl', or they may be strings in which case they will be used
       # as is
       def depends_on(*args)
-        dependencies << args
+        (dependencies << args).flatten!
       end
 
       # Get the defined dependencies
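Since depends_on now flattens its arguments, a nested array of control files behaves the same as a plain argument list. A minimal control-file sketch (the file names are placeholders, not from the gem):

# Hypothetical control-file snippet; only the depends_on call itself comes from the diff.
depends_on 'date_dimension.ctl', 'customer_dimension.ctl'
depends_on ['fact_one.ctl', 'fact_two.ctl']   # arrays are now flattened into the dependency list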
@@ -115,13 +115,15 @@ module ETL #:nodoc:
         if transformer
           case transformer
           when String, Symbol
-            class_name = "#{transformer.to_s.
+            class_name = "#{transformer.to_s.camelize}Transform"
             begin
               transform_class = ETL::Transform.const_get(class_name)
               transforms << transform_class.new(self, name, configuration)
             rescue NameError => e
               raise ControlError, "Unable to find transformer #{class_name}: #{e}"
             end
+          when Class
+            transforms << transformer.new(self, transformer.name, configuration)
           else
             #transformer.class.inspect
             if transformer.is_a?(ETL::Transform::Transform)
@@ -130,7 +132,7 @@ module ETL #:nodoc:
               t.name = name
               transforms << t
             else
-              raise ControlError, "Transformer must be a String, Symbol or Transform instance"
+              raise ControlError, "Transformer must be a String, Symbol, Class or Transform instance"
             end
           end
         elsif block_given?
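With the new Class branch, a transform declaration in a control file can reference a transform class directly instead of a string or symbol. A hedged sketch, assuming the usual transform(field, transformer, options) form; DateToStringTransform ships with this gem (see the file list above), while the field name and :format option are assumptions:

# Illustrative control-file snippet; only the argument forms come from the diff.
transform :created_at, :date_to_string, :format => '%Y-%m-%d'                        # String/Symbol, camelized to ETL::Transform::DateToStringTransform
transform :created_at, ETL::Transform::DateToStringTransform, :format => '%Y-%m-%d'  # new in 0.9.1: pass the class itself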
@@ -145,17 +147,28 @@ module ETL #:nodoc:
         control.transforms
       end
 
-      # Define a screen block. The type argument must be
-      # or :warn
+      # Define a before post-process screen block. The type argument must be
+      # one of :fatal, :error or :warn
       def screen(type, &block)
         screens[type] << block
       end
 
-      # Get the screen blocks
+      # Get the before post-process screen blocks
       def screens
         control.screens
       end
 
+      # Define an after post-proces screen block. The type argument must be
+      # one of :fatal, :error or :warn
+      def after_post_process_screen(type, &block)
+        after_post_process_screens[type] << block
+      end
+
+      # Get the after post-process screen blocks
+      def after_post_process_screens
+        control.after_post_process_screens
+      end
+
       # Rename the source field to the destination field
       def rename(source, destination)
         after_read :rename, :source => source, :dest => destination
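The new after_post_process_screen hook mirrors the existing screen DSL but runs after the post-processors. A minimal sketch of how both might appear in a control file; the method names and type symbols come from the diff, while the block bodies and the exact behavior of a failing screen are assumptions:

# Illustrative control-file snippet.
screen(:warn) do
  # evaluated before the post-processors run
  ETL::Engine.logger.warn "about to run post-processors"
end

after_post_process_screen(:fatal) do
  # evaluated after the post-processors, e.g. to verify a bulk import landed;
  # raising inside a :fatal screen is assumed to abort the run
  raise "expected output file is missing" unless File.exist?('/tmp/output.txt')
end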
@@ -169,12 +182,17 @@ module ETL #:nodoc:
       protected
       # This method is used to define a processor and insert into the specified processor
       # collection.
-      def define_processor(name, processor_collection, configuration)
+      def define_processor(name, processor_collection, configuration, proc)
         case name
-        when String, Symbol
-
+        when String, Symbol, nil
+          name ||= 'block'
+          class_name = "#{name.to_s.camelize}Processor"
           begin
             processor_class = ETL::Processor.const_get(class_name)
+            if name == 'block'
+              raise ControlError, "A block must be passed for block processor" if proc.nil?
+              configuration[:block] = proc
+            end
             processor_collection << processor_class.new(self, configuration)
           rescue NameError => e
             raise ControlError, "Unable to find processor #{class_name}: #{e}"
@@ -182,14 +200,14 @@ module ETL #:nodoc:
         when Class
           processor_collection << name.new(self, configuration)
         else
-          raise ControlError, "The process declaration requires a String, Symbol or
+          raise ControlError, "The process declaration requires a String, Symbol, Class, or a Block to be passed"
         end
       end
 
       public
       # Define an "after read" processor. This must be a row-level processor.
-      def after_read(name, configuration={})
-        define_processor(name, after_read_processors, configuration)
+      def after_read(name='block', configuration={}, &block)
+        define_processor(name, after_read_processors, configuration, block)
       end
 
       # Get the defined "after read" processors
@@ -198,8 +216,8 @@ module ETL #:nodoc:
       end
 
       # Define a "before write" processor. This must be a row-level processor.
-      def before_write(name, configuration={})
-        define_processor(name, before_write_processors, configuration)
+      def before_write(name='block', configuration={}, &block)
+        define_processor(name, before_write_processors, configuration, block)
       end
 
       # Get the defined "before write" processors
@@ -208,8 +226,8 @@ module ETL #:nodoc:
       end
 
       # Define a pre-processor
-      def pre_process(name, configuration={})
-        define_processor(name, pre_processors, configuration)
+      def pre_process(name='block', configuration={}, &block)
+        define_processor(name, pre_processors, configuration, block)
       end
 
       # Get the defined pre-processors
@@ -218,8 +236,8 @@ module ETL #:nodoc:
       end
 
       # Define a post-processor
-      def post_process(name, configuration={})
-        define_processor(name, post_processors, configuration)
+      def post_process(name='block', configuration={}, &block)
+        define_processor(name, post_processors, configuration, block)
       end
 
       # Get the defined post-processors
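These changes let pre_process, post_process, after_read and before_write take a bare block, which is wrapped by the new block processor (data/lib/etl/processor/block_processor.rb in the file list). A hedged control-file sketch, assuming row-level blocks are called with the current row; the fields and log messages are invented:

# Illustrative control-file snippet; only the block-taking DSL comes from the diff.
pre_process do
  ETL::Engine.logger.info "starting load"
end

after_read do |row|
  row[:email] = row[:email].to_s.downcase   # row-level processor; mutate and return the row
  row
end

before_write do |row|
  row[:loaded_at] = Time.now
  row
end

post_process do
  ETL::Engine.logger.info "load finished"
end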
@@ -341,6 +359,7 @@ module ETL #:nodoc:
         @transforms ||= []
       end
 
+      # A hash of the screens executed before post-process
       def screens
         @screens ||= {
           :fatal => [],
@@ -349,6 +368,15 @@ module ETL #:nodoc:
         }
       end
 
+      # A hash of the screens executed after post-process
+      def after_post_process_screens
+        @after_post_process_screens ||= {
+          :fatal => [],
+          :error => [],
+          :warn => []
+        }
+      end
+
       # Get the error threshold. Defaults to 100.
       def error_threshold
         @error_threshold ||= 100
data/lib/etl/control/destination.rb
CHANGED
@@ -29,7 +29,7 @@ module ETL #:nodoc:
       # For example if name is :database or 'database' then the
       # DatabaseDestination class is returned
       def class_for_name(name)
-        ETL::Control.const_get("#{name.to_s.
+        ETL::Control.const_get("#{name.to_s.camelize}Destination")
       end
     end
 
@@ -124,10 +124,20 @@ module ETL #:nodoc:
         @compound_key_constraints ||= {}
       end
 
-      # Return fields which are Slowly Changing Dimension fields.
-      #
-
-
+      # Return fields which are Slowly Changing Dimension fields.
+      # Uses the scd_fields specified in the configuration. If that's
+      # missing, uses all of the row's fields.
+      def scd_fields(row)
+        @scd_fields ||= configuration[:scd_fields] || row.keys
+      end
+
+      def non_scd_fields(row)
+        @non_csd_fields ||= row.keys - natural_key - scd_fields(row) -
+          [primary_key, scd_effective_date_field, scd_end_date_field, scd_latest_version_field]
+      end
+
+      def non_evolving_fields
+        (Array(configuration[:scd][:non_evolving_fields]) << primary_key).uniq
       end
 
       def scd?
@@ -150,19 +160,29 @@ module ETL #:nodoc:
         configuration[:scd][:end_date_field] || :end_date if scd?
       end
 
-      #
+      # Get the Slowly Changing Dimension latest version field. Defaults to
+      # 'latest_version'.
+      def scd_latest_version_field
+        configuration[:scd][:latest_version_field] || :latest_version if scd?
+      end
+
+      # Return the natural key field names, defaults to []
       def natural_key
         @natural_key ||= determine_natural_key
       end
 
       # Get the dimension table if specified
       def dimension_table
-
+        @dimension_table ||= if scd?
+          ETL::Engine.table(configuration[:scd][:dimension_table], dimension_target) or raise ConfigurationError, "dimension_table setting required"
+        end
       end
 
       # Get the dimension target if specified
       def dimension_target
-
+        @dimension_target ||= if scd?
+          configuration[:scd][:dimension_target] or raise ConfigurationError, "dimension_target setting required"
+        end
       end
 
       # Process a row to determine the change type
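These accessors read everything from the destination's :scd hash plus the top-level :natural_key and :scd_fields keys. A hedged sketch of a control-file destination using them; the configuration keys shown are the ones referenced in the diff, while the file name, connection name, field names and :type => 2 (assuming scd_type is read from the :scd hash) are illustrative:

# Illustrative control-file destination for a type 2 slowly changing dimension.
destination :out, {
  :file => 'customer_dimension.txt',          # placeholder output file
  :natural_key => [:customer_id],
  :scd_fields  => [:address, :city],          # changes here trigger SCD handling
  :scd => {
    :type => 2,
    :dimension_table => 'customer_dimension', # placeholder table
    :dimension_target => :data_warehouse,     # placeholder connection
    :end_date_field => :end_date,
    :latest_version_field => :latest_version
  }
},
{
  :order => [:id, :customer_id, :address, :city, :effective_date, :end_date, :latest_version]
}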
@@ -172,136 +192,27 @@ module ETL #:nodoc:
 
         # Change processing can only occur if the natural key exists in the row
         ETL::Engine.logger.debug "Checking for natural key existence"
-
+        unless has_natural_key?(row)
           buffer << row
           return
         end
 
-
-
-
-
-
-
-
-        ETL::Engine.logger.debug "Checking for SCD
-
-
-
-        else
-          row.each { |key,value| s << value.to_s }
-        end
-
-        # apply the CRC to 's' and see if it matches the last
-        # ETL::Execution::Record with the samenatural key. If they match then
-        # throw away this row (no need to process). If they do not match then
-        # the record is an 'update'. If the record doesn't exist then it is an
-        # 'insert'
-        nk = natural_key.collect{|k|row[k]}.join('|')
-        require 'zlib'
-        crc = Zlib.crc32(s)
-        record = ETL::Execution::Record.find_by_control_file_and_natural_key(control.file, nk)
-
-        timestamp = Time.now
-
-        ETL::Engine.logger.debug "Checking record change type"
-        if record
-          if record.crc != crc.to_s
-            # SCD Type 1: only the new row should be added
-            # SCD Type 2: both an old and new row should be added
-            # SCD Type 3: not supported
-            ETL::Engine.logger.debug "CRC does not match"
-
-            if scd_type == 2
-              ETL::Engine.logger.debug "type 2 SCD"
-
-              raise ConfigurationError, "dimension_table setting required" unless dimension_table
-              raise ConfigurationError, "dimension_target setting required" unless dimension_target
-
-              conn = ETL::Engine.connection(dimension_target)
-
-              q = "SELECT * FROM #{dimension_table} WHERE "
-              q << natural_key.collect { |nk| "#{nk} = '#{row[nk]}'" }.join(" AND ")
-              #puts "looking for original record"
-              result = conn.select_one(q)
-              if result
-                #puts "Result: #{result.inspect}"
-                original_record = ETL::Row[result.symbolize_keys!]
-                original_record[scd_end_date_field] = timestamp
-                ETL::Engine.logger.debug "writing original record"
-
-                # if there is no truncate then the row will exist twice in the database
-                # need to figure out how to delete that old record before inserting the
-                # updated version of the record
-
-                q = "DELETE FROM #{dimension_table} WHERE "
-                q << natural_key.collect { |nk| "#{nk} = '#{row[nk]}'" }.join(" AND ")
-
-                num_rows_affected = conn.delete(q)
-                ETL::Engine.logger.debug "deleted old row"
-
-                # do this?
-                #raise "Should have deleted a single record" if num_rows_affected != 1
-
-                buffer << original_record
-              end
-
-              row[scd_effective_date_field] = timestamp
-              row[scd_end_date_field] = '9999-12-31 00:00:00'
-            elsif scd_type == 1
-              ETL::Engine.logger.debug "type 1 SCD"
-            else
-              ETL::Engine.logger.debug "SCD not specified"
-            end
-
-            ETL::Engine.logger.debug "writing new record"
-            buffer << row
+        @timestamp = Time.now
+
+        # See if the scd_fields of the current record have changed
+        # from the last time this record was loaded into the data
+        # warehouse. If they match then throw away this row (no need
+        # to process). If they do not match then the record is an
+        # 'update'. If the record doesn't exist then it is an 'insert'
+        ETL::Engine.logger.debug "Checking record for SCD change"
+        if @existing_row = preexisting_row(row)
+          if has_scd_field_changes?(row)
+            process_scd_change(row)
           else
-
-
-            raise ConfigurationError, "dimension_table setting required" unless dimension_table
-            raise ConfigurationError, "dimension_target setting required" unless dimension_target
-
-            conn = ETL::Engine.connection(dimension_target)
-
-            q = "SELECT * FROM #{dimension_table} WHERE "
-            q << natural_key.collect { |nk| "#{nk} = '#{row[nk]}'" }.join(" AND ")
-            result = conn.select_one(q)
-            if result
-              # This was necessary when truncating and then loading, however I
-              # am getting reluctant to having the ETL process do the truncation
-              # as part of the bulk load, favoring using a preprocessor instead.
-              # buffer << ETL::Row[result.symbolize_keys!]
-            else
-              # The record never made it into the database, so add the effective and end date
-              # and add it into the bulk load file
-              row[scd_effective_date_field] = timestamp
-              row[scd_end_date_field] = '9999-12-31 00:00:00'
-              buffer << row
-            end
+            process_scd_match(row)
           end
         else
-
-          # Set the effective and end date fields
-          if scd_type == 2
-            row[scd_effective_date_field] = timestamp
-            row[scd_end_date_field] = '9999-12-31 00:00:00'
-          end
-
-          # Write the row
-          buffer << row
-
-          # Record the record
-          if ETL::Engine.job # only record the execution if there is a job
-            ETL::Execution::Record.time_spent += Benchmark.realtime do
-              ETL::Execution::Record.create!(
-                :control_file => control.file,
-                :natural_key => nk,
-                :crc => crc,
-                :job_id => ETL::Engine.job.id
-              )
-            end
-          end
+          schedule_new_record(row)
         end
       end
 
@@ -316,6 +227,8 @@ module ETL #:nodoc:
       def add_virtuals!(row)
         if mapping[:virtual]
           mapping[:virtual].each do |key,value|
+            # If the row already has the virtual set, assume that's correct
+            next if row[key]
             # Engine.logger.debug "Mapping virtual #{key}/#{value} for row #{row}"
             case value
             when Class
@@ -338,18 +251,168 @@ module ETL #:nodoc:
       end
 
       private
+
       # Determine the natural key. This method will always return an array
-      # of symbols. The default value is [
+      # of symbols. The default value is [].
      def determine_natural_key
-
-
-
-
-
+        Array(configuration[:natural_key]).collect(&:to_sym)
+      end
+
+      # Check whether a natural key has been defined, and if so, whether
+      # this row has enough information to do searches based on that natural
+      # key.
+      #
+      # TODO: This should be factored out into
+      # ETL::Row#has_all_fields?(field_array) But that's not possible
+      # until *all* sources cast to ETL::Row, instead of sometimes
+      # using Hash
+      def has_natural_key?(row)
+        natural_key.any? && natural_key.all? { |key| row.has_key?(key) }
+      end
+
+      # Helper for generating the SQL where clause that allows searching
+      # by a natural key
+      def natural_key_equality_for_row(row)
+        statement = []
+        values = []
+        natural_key.each do |nk|
+          statement << "#{nk} = ?"
+          values << row[nk]
+        end
+        statement = statement.join(" AND ")
+        ActiveRecord::Base.send(:sanitize_sql, [statement, *values])
+      end
+
+      # Do all the steps required when a SCD *has* changed. Exact steps
+      # depend on what type of SCD we're handling.
+      def process_scd_change(row)
+        ETL::Engine.logger.debug "SCD fields do not match"
+
+        if scd_type == 2
+          # SCD Type 2: new row should be added and old row should be updated
+          ETL::Engine.logger.debug "type 2 SCD"
+
+          # To update the old row, we delete the version in the database
+          # and insert a new expired version
+
+          # If there is no truncate then the row will exist twice in the database
+          delete_outdated_record
+
+          ETL::Engine.logger.debug "expiring original record"
+          @existing_row[scd_end_date_field] = @timestamp
+          @existing_row[scd_latest_version_field] = false
+
+          buffer << @existing_row
+
+        elsif scd_type == 1
+          # SCD Type 1: only the new row should be added
+          ETL::Engine.logger.debug "type 1 SCD"
+
+          # Copy primary key, and other non-evolving fields over from
+          # original version of record
+          non_evolving_fields.each do |non_evolving_field|
+            row[non_evolving_field] = @existing_row[non_evolving_field]
+          end
+
+          # If there is no truncate then the row will exist twice in the database
+          delete_outdated_record
         else
-
+          # SCD Type 3: not supported
+          ETL::Engine.logger.debug "SCD type #{scd_type} not supported"
         end
+
+        # In all cases, the latest, greatest version of the record
+        # should go into the load
+        schedule_new_record(row)
       end
+
+      # Do all the steps required when a SCD has *not* changed. Exact
+      # steps depend on what type of SCD we're handling.
+      def process_scd_match(row)
+        ETL::Engine.logger.debug "SCD fields match"
+
+        if scd_type == 2 && has_non_scd_field_changes?(row)
+          ETL::Engine.logger.debug "Non-SCD field changes"
+          # Copy important data over from original version of record
+          row[primary_key] = @existing_row[primary_key]
+          row[scd_end_date_field] = @existing_row[scd_end_date_field]
+          row[scd_effective_date_field] = @existing_row[scd_effective_date_field]
+          row[scd_latest_version_field] = @existing_row[scd_latest_version_field]
+
+          # If there is no truncate then the row will exist twice in the database
+          delete_outdated_record
+
+          buffer << row
+        else
+          # The record is totally the same, so skip it
+        end
+      end
+
+      # Find the version of this row that already exists in the datawarehouse.
+      def preexisting_row(row)
+        q = "SELECT * FROM #{dimension_table} WHERE #{natural_key_equality_for_row(row)}"
+        q << " AND #{scd_latest_version_field}" if scd_type == 2
+
+        #puts "looking for original record"
+        result = connection.select_one(q)
+
+        #puts "Result: #{result.inspect}"
+
+        result ? ETL::Row[result.symbolize_keys!] : nil
+      end
+
+      # Check whether non-scd fields have changed since the last
+      # load of this record.
+      def has_scd_field_changes?(row)
+        scd_fields(row).any? { |csd_field| row[csd_field].to_s != @existing_row[csd_field].to_s }
+      end
+
+      # Check whether non-scd fields have changed since the last
+      # load of this record.
+      def has_non_scd_field_changes?(row)
+        non_scd_fields(row).any? { |non_csd_field| row[non_csd_field].to_s != @existing_row[non_csd_field].to_s }
+      end
+
+      # Grab, or re-use, a database connection for running queries directly
+      # during the destination processing.
+      def connection
+        @conn ||= ETL::Engine.connection(dimension_target)
+      end
+
+      # Utility for removing a row that has outdated information. Note
+      # that this deletes directly from the database, even if this is a file
+      # destination. It needs to do this because you can't do deletes in a
+      # bulk load.
+      def delete_outdated_record
+        ETL::Engine.logger.debug "deleting old row"
+
+        q = "DELETE FROM #{dimension_table} WHERE #{primary_key} = #{@existing_row[primary_key]}"
+        connection.delete(q)
+      end
+
+      # Schedule the latest, greatest version of the row for insertion
+      # into the database
+      def schedule_new_record(row)
+        ETL::Engine.logger.debug "writing new record"
+        if scd_type == 2
+          row[scd_effective_date_field] = @timestamp
+          row[scd_end_date_field] = '9999-12-31 00:00:00'
+          row[scd_latest_version_field] = true
+        end
+        buffer << row
+      end
+
+      # Get the name of the primary key for this table. Asks the dimension
+      # model class for this information, but if that class hasn't been
+      # defined, just defaults to :id.
+      def primary_key
+        return @primary_key if @primary_key
+        @primary_key = dimension_table.to_s.camelize.constantize.primary_key.to_sym
+      rescue NameError => e
+        ETL::Engine.logger.debug "couldn't get primary_key from dimension model class, using default :id"
+        @primary_key = :id
+      end
+
     end
   end
 end