factorylabs-activewarehouse-etl 0.9.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +198 -0
- data/LICENSE +7 -0
- data/README +85 -0
- data/Rakefile +153 -0
- data/TODO +28 -0
- data/bin/etl +28 -0
- data/bin/etl.cmd +8 -0
- data/examples/database.example.yml +16 -0
- data/lib/etl.rb +78 -0
- data/lib/etl/batch.rb +2 -0
- data/lib/etl/batch/batch.rb +111 -0
- data/lib/etl/batch/directives.rb +55 -0
- data/lib/etl/builder.rb +2 -0
- data/lib/etl/builder/date_dimension_builder.rb +96 -0
- data/lib/etl/builder/time_dimension_builder.rb +31 -0
- data/lib/etl/commands/etl.rb +89 -0
- data/lib/etl/control.rb +3 -0
- data/lib/etl/control/control.rb +405 -0
- data/lib/etl/control/destination.rb +420 -0
- data/lib/etl/control/destination/database_destination.rb +95 -0
- data/lib/etl/control/destination/file_destination.rb +124 -0
- data/lib/etl/control/source.rb +109 -0
- data/lib/etl/control/source/database_source.rb +220 -0
- data/lib/etl/control/source/enumerable_source.rb +11 -0
- data/lib/etl/control/source/file_source.rb +90 -0
- data/lib/etl/control/source/model_source.rb +39 -0
- data/lib/etl/core_ext.rb +1 -0
- data/lib/etl/core_ext/time.rb +5 -0
- data/lib/etl/core_ext/time/calculations.rb +42 -0
- data/lib/etl/engine.rb +556 -0
- data/lib/etl/execution.rb +20 -0
- data/lib/etl/execution/base.rb +9 -0
- data/lib/etl/execution/batch.rb +8 -0
- data/lib/etl/execution/job.rb +8 -0
- data/lib/etl/execution/migration.rb +85 -0
- data/lib/etl/execution/record.rb +18 -0
- data/lib/etl/generator.rb +2 -0
- data/lib/etl/generator/generator.rb +20 -0
- data/lib/etl/generator/surrogate_key_generator.rb +39 -0
- data/lib/etl/http_tools.rb +139 -0
- data/lib/etl/parser.rb +11 -0
- data/lib/etl/parser/apache_combined_log_parser.rb +49 -0
- data/lib/etl/parser/delimited_parser.rb +74 -0
- data/lib/etl/parser/fixed_width_parser.rb +65 -0
- data/lib/etl/parser/parser.rb +41 -0
- data/lib/etl/parser/sax_parser.rb +218 -0
- data/lib/etl/parser/xml_parser.rb +65 -0
- data/lib/etl/processor.rb +11 -0
- data/lib/etl/processor/block_processor.rb +14 -0
- data/lib/etl/processor/bulk_import_processor.rb +81 -0
- data/lib/etl/processor/check_exist_processor.rb +80 -0
- data/lib/etl/processor/check_unique_processor.rb +35 -0
- data/lib/etl/processor/copy_field_processor.rb +26 -0
- data/lib/etl/processor/encode_processor.rb +55 -0
- data/lib/etl/processor/hierarchy_exploder_processor.rb +55 -0
- data/lib/etl/processor/print_row_processor.rb +12 -0
- data/lib/etl/processor/processor.rb +25 -0
- data/lib/etl/processor/rename_processor.rb +24 -0
- data/lib/etl/processor/require_non_blank_processor.rb +26 -0
- data/lib/etl/processor/row_processor.rb +17 -0
- data/lib/etl/processor/sequence_processor.rb +23 -0
- data/lib/etl/processor/surrogate_key_processor.rb +53 -0
- data/lib/etl/processor/truncate_processor.rb +35 -0
- data/lib/etl/row.rb +20 -0
- data/lib/etl/screen.rb +14 -0
- data/lib/etl/screen/row_count_screen.rb +20 -0
- data/lib/etl/transform.rb +2 -0
- data/lib/etl/transform/block_transform.rb +13 -0
- data/lib/etl/transform/date_to_string_transform.rb +20 -0
- data/lib/etl/transform/decode_transform.rb +51 -0
- data/lib/etl/transform/default_transform.rb +20 -0
- data/lib/etl/transform/foreign_key_lookup_transform.rb +151 -0
- data/lib/etl/transform/hierarchy_lookup_transform.rb +49 -0
- data/lib/etl/transform/ordinalize_transform.rb +12 -0
- data/lib/etl/transform/sha1_transform.rb +13 -0
- data/lib/etl/transform/string_to_date_transform.rb +16 -0
- data/lib/etl/transform/string_to_datetime_transform.rb +14 -0
- data/lib/etl/transform/string_to_time_transform.rb +11 -0
- data/lib/etl/transform/transform.rb +61 -0
- data/lib/etl/transform/trim_transform.rb +26 -0
- data/lib/etl/transform/type_transform.rb +35 -0
- data/lib/etl/util.rb +59 -0
- data/lib/etl/version.rb +9 -0
- metadata +195 -0
data/lib/etl/control.rb
ADDED
@@ -0,0 +1,405 @@
|
|
1
|
+
module ETL #:nodoc:
|
2
|
+
module Control #:nodoc:
|
3
|
+
# The Context is passed to eval.
|
4
|
+
class Context
|
5
|
+
require 'test/unit/assertions'
|
6
|
+
include Test::Unit::Assertions
|
7
|
+
attr_reader :control
|
8
|
+
|
9
|
+
class << self
|
10
|
+
# Create a Context instance
|
11
|
+
def create(control)
|
12
|
+
Context.new(control).get_binding
|
13
|
+
end
|
14
|
+
end
|
15
|
+
|
16
|
+
# Initialize the context
|
17
|
+
def initialize(control)
|
18
|
+
@control = control
|
19
|
+
end
|
20
|
+
|
21
|
+
# Get the control file
|
22
|
+
def file
|
23
|
+
control.file
|
24
|
+
end
|
25
|
+
|
26
|
+
# Set the allowed error threshold
|
27
|
+
def set_error_threshold(error_threshold)
|
28
|
+
control.error_threshold = error_threshold
|
29
|
+
end
|
30
|
+
|
31
|
+
# Define a list of control files that this file depends on. Those control
|
32
|
+
# files will be executed prior to this control file. The list may
|
33
|
+
# contain symbols that will be converted to file names by calling
|
34
|
+
# to_s + '.ctl', or they may be strings in which case they will be used
|
35
|
+
# as is
|
36
|
+
def depends_on(*args)
|
37
|
+
(dependencies << args).flatten!
|
38
|
+
end
|
39
|
+
|
40
|
+
# Get the defined dependencies
|
41
|
+
def dependencies
|
42
|
+
control.dependencies
|
43
|
+
end
|
44
|
+
|
45
|
+
# Define a source. The source type may be given explicitly through
# configuration[:type] as a Class, a String/Symbol (resolved via
# ETL::Control::Source.class_for_name) or an already-constructed
# ETL::Control::Source instance. Otherwise each supported source type
# (see source_types) is checked against the configuration keys.
#
# Raises a ControlError when the type is unusable or when no supported
# source type matches the configuration.
def source(name, configuration={}, definition={})
  if configuration[:type]
    case configuration[:type]
    when Class
      source_class = configuration[:type]
      sources << source_class.new(self, configuration, definition)
    when String, Symbol
      source_class = ETL::Control::Source.class_for_name(configuration[:type])
      sources << source_class.new(self, configuration, definition)
    else
      if configuration[:type].is_a?(ETL::Control::Source)
        sources << configuration[:type]
      else
        raise ControlError, "Type must be a Class, String, Symbol or object extending ETL::Control::Source"
      end
    end
  else
    # BUGFIX: the raise used to live *inside* this loop, so a control
    # file using any type other than the first entry in source_types
    # raised before later types were even checked. Only raise after the
    # whole scan comes up empty.
    matched = source_types.detect do |source_type|
      if configuration[source_type]
        source_class = ETL::Control::Source.class_for_name(source_type)
        sources << source_class.new(self, configuration, definition)
        true
      end
    end
    raise ControlError, "A source was specified but no matching type was found" unless matched
  end
end
|
73
|
+
|
74
|
+
# Get the defined source
|
75
|
+
def sources
|
76
|
+
control.sources
|
77
|
+
end
|
78
|
+
|
79
|
+
# Define a destination. The destination type may be given explicitly
# through configuration[:type] as a Class, a String/Symbol (resolved via
# ETL::Control::Destination.class_for_name) or an already-constructed
# ETL::Control::Destination instance. Otherwise each supported
# destination type (see destination_types) is checked against the
# configuration keys.
#
# Raises a ControlError when the type is unusable or when no supported
# destination type matches the configuration.
def destination(name, configuration={}, mapping={})
  if configuration[:type]
    case configuration[:type]
    when Class
      dest_class = configuration[:type]
      destinations << dest_class.new(self, configuration, mapping)
    when String, Symbol
      dest_class = ETL::Control::Destination.class_for_name(configuration[:type])
      destinations << dest_class.new(self, configuration, mapping)
    else
      if configuration[:type].is_a?(ETL::Control::Destination)
        destinations << configuration[:type]
      else
        raise ControlError, "Type must be a Class, String, Symbol or object extending ETL::Control::Destination"
      end
    end
  else
    # BUGFIX: the raise used to live *inside* this loop, so a control
    # file using any type other than the first entry in
    # destination_types raised before later types were checked. Only
    # raise after the whole scan comes up empty.
    matched = destination_types.detect do |dest_type|
      if configuration[dest_type]
        dest_class = ETL::Control::Destination.class_for_name(dest_type)
        destinations << dest_class.new(self, configuration, mapping)
        true
      end
    end
    raise ControlError, "A destination was specified but no matching destination type was found" unless matched
  end
end
|
107
|
+
|
108
|
+
# Get the defined destinations
|
109
|
+
def destinations
|
110
|
+
control.destinations
|
111
|
+
end
|
112
|
+
|
113
|
+
# Define a transform
|
114
|
+
def transform(name, transformer=nil, configuration={}, &block)
|
115
|
+
if transformer
|
116
|
+
case transformer
|
117
|
+
when String, Symbol
|
118
|
+
class_name = "#{transformer.to_s.camelize}Transform"
|
119
|
+
begin
|
120
|
+
transform_class = ETL::Transform.const_get(class_name)
|
121
|
+
transforms << transform_class.new(self, name, configuration)
|
122
|
+
rescue NameError => e
|
123
|
+
raise ControlError, "Unable to find transformer #{class_name}: #{e}"
|
124
|
+
end
|
125
|
+
when Class
|
126
|
+
transforms << transformer.new(self, transformer.name, configuration)
|
127
|
+
else
|
128
|
+
#transformer.class.inspect
|
129
|
+
if transformer.is_a?(ETL::Transform::Transform)
|
130
|
+
Engine.logger.debug "Adding transformer #{transformer.inspect} for field #{name}"
|
131
|
+
t = transformer.dup
|
132
|
+
t.name = name
|
133
|
+
transforms << t
|
134
|
+
else
|
135
|
+
raise ControlError, "Transformer must be a String, Symbol, Class or Transform instance"
|
136
|
+
end
|
137
|
+
end
|
138
|
+
elsif block_given?
|
139
|
+
transforms << ETL::Transform::BlockTransform.new(self, name, :block => block)
|
140
|
+
else
|
141
|
+
raise ControlError, "Either a transformer or a block must be specified"
|
142
|
+
end
|
143
|
+
end
|
144
|
+
|
145
|
+
# Get the defined transforms
|
146
|
+
def transforms
|
147
|
+
control.transforms
|
148
|
+
end
|
149
|
+
|
150
|
+
# Define a before post-process screen block. The type argument must be
|
151
|
+
# one of :fatal, :error or :warn
|
152
|
+
def screen(type, &block)
|
153
|
+
screens[type] << block
|
154
|
+
end
|
155
|
+
|
156
|
+
# Get the before post-process screen blocks
|
157
|
+
def screens
|
158
|
+
control.screens
|
159
|
+
end
|
160
|
+
|
161
|
+
# Define an after post-process screen block. The type argument must be
# one of :fatal, :error or :warn. (Fixed comment typo: "post-proces".)
def after_post_process_screen(type, &block)
  after_post_process_screens[type] << block
end
|
166
|
+
|
167
|
+
# Get the after post-process screen blocks
|
168
|
+
def after_post_process_screens
|
169
|
+
control.after_post_process_screens
|
170
|
+
end
|
171
|
+
|
172
|
+
# Rename the source field to the destination field
|
173
|
+
def rename(source, destination)
|
174
|
+
after_read :rename, :source => source, :dest => destination
|
175
|
+
end
|
176
|
+
|
177
|
+
# Copy the source field to the destination field
|
178
|
+
def copy(source, destination)
|
179
|
+
after_read :copy_field, :source => source, :dest => destination
|
180
|
+
end
|
181
|
+
|
182
|
+
protected
|
183
|
+
# This method is used to define a processor and insert into the specified processor
|
184
|
+
# collection.
|
185
|
+
def define_processor(name, processor_collection, configuration, proc)
|
186
|
+
case name
|
187
|
+
when String, Symbol, nil
|
188
|
+
name ||= 'block'
|
189
|
+
class_name = "#{name.to_s.camelize}Processor"
|
190
|
+
begin
|
191
|
+
processor_class = ETL::Processor.const_get(class_name)
|
192
|
+
if name == 'block'
|
193
|
+
raise ControlError, "A block must be passed for block processor" if proc.nil?
|
194
|
+
configuration[:block] = proc
|
195
|
+
end
|
196
|
+
processor_collection << processor_class.new(self, configuration)
|
197
|
+
rescue NameError => e
|
198
|
+
raise ControlError, "Unable to find processor #{class_name}: #{e}"
|
199
|
+
end
|
200
|
+
when Class
|
201
|
+
processor_collection << name.new(self, configuration)
|
202
|
+
else
|
203
|
+
raise ControlError, "The process declaration requires a String, Symbol, Class, or a Block to be passed"
|
204
|
+
end
|
205
|
+
end
|
206
|
+
|
207
|
+
public
|
208
|
+
# Define an "after read" processor. This must be a row-level processor.
|
209
|
+
def after_read(name='block', configuration={}, &block)
|
210
|
+
define_processor(name, after_read_processors, configuration, block)
|
211
|
+
end
|
212
|
+
|
213
|
+
# Get the defined "after read" processors
|
214
|
+
def after_read_processors
|
215
|
+
control.after_read_processors
|
216
|
+
end
|
217
|
+
|
218
|
+
# Define a "before write" processor. This must be a row-level processor.
|
219
|
+
def before_write(name='block', configuration={}, &block)
|
220
|
+
define_processor(name, before_write_processors, configuration, block)
|
221
|
+
end
|
222
|
+
|
223
|
+
# Get the defined "before write" processors
|
224
|
+
def before_write_processors
|
225
|
+
control.before_write_processors
|
226
|
+
end
|
227
|
+
|
228
|
+
# Define a pre-processor
|
229
|
+
def pre_process(name='block', configuration={}, &block)
|
230
|
+
define_processor(name, pre_processors, configuration, block)
|
231
|
+
end
|
232
|
+
|
233
|
+
# Get the defined pre-processors
|
234
|
+
def pre_processors
|
235
|
+
control.pre_processors
|
236
|
+
end
|
237
|
+
|
238
|
+
# Define a post-processor
|
239
|
+
def post_process(name='block', configuration={}, &block)
|
240
|
+
define_processor(name, post_processors, configuration, block)
|
241
|
+
end
|
242
|
+
|
243
|
+
# Get the defined post-processors
|
244
|
+
def post_processors
|
245
|
+
control.post_processors
|
246
|
+
end
|
247
|
+
|
248
|
+
# Get the binding object
|
249
|
+
def get_binding
|
250
|
+
binding
|
251
|
+
end
|
252
|
+
|
253
|
+
protected
|
254
|
+
# Get an array of supported source types
|
255
|
+
def source_types
|
256
|
+
control.source_types
|
257
|
+
end
|
258
|
+
|
259
|
+
# Get an array of supported destination types
|
260
|
+
def destination_types
|
261
|
+
control.destination_types
|
262
|
+
end
|
263
|
+
|
264
|
+
end
|
265
|
+
|
266
|
+
# Object representation of a control file
|
267
|
+
class Control
|
268
|
+
# The File object
|
269
|
+
attr_reader :file
|
270
|
+
|
271
|
+
# The error threshold
|
272
|
+
attr_accessor :error_threshold
|
273
|
+
|
274
|
+
class << self
|
275
|
+
# Parse a control file and return a Control instance.
#
# Accepts either a path String or a File object (in which case its path
# is used). The file contents are evaluated inside a Context binding so
# the control-file DSL methods (source, destination, transform, ...)
# populate the Control instance, which is then validated and returned.
def parse(control_file)
  control_file = control_file.path if control_file.instance_of?(File)
  control = ETL::Control::Control.new(control_file)
  # TODO: better handling of parser errors. Return the line in the
  # control file where the error occurs.
  # BUGFIX: use IO.read. The previous IO.readlines(...).join("\n")
  # doubled every newline (readlines keeps the trailing "\n"), which
  # skewed line numbers reported in eval error backtraces.
  eval(IO.read(control_file), Context.create(control), control_file)
  control.validate
  control
end
|
284
|
+
|
285
|
+
def parse_text(text)
|
286
|
+
control = ETL::Control::Control.new(nil)
|
287
|
+
eval(text, Context.create(control), 'inline')
|
288
|
+
control.validate
|
289
|
+
control
|
290
|
+
end
|
291
|
+
|
292
|
+
# Resolve the given object to an ETL::Control::Control instance. Acceptable arguments
|
293
|
+
# are:
|
294
|
+
# * The path to a control file as a String
|
295
|
+
# * A File object referencing the control file
|
296
|
+
# * The ETL::Control::Control object (which will just be returned)
|
297
|
+
#
|
298
|
+
# Raises a ControlError if any other type is given
|
299
|
+
def resolve(control)
|
300
|
+
case control
|
301
|
+
when String
|
302
|
+
ETL::Control::Control.parse(File.new(control))
|
303
|
+
when File
|
304
|
+
ETL::Control::Control.parse(control)
|
305
|
+
when ETL::Control::Control
|
306
|
+
control
|
307
|
+
else
|
308
|
+
raise ControlError, "Control must be a String, File or Control object"
|
309
|
+
end
|
310
|
+
end
|
311
|
+
end
|
312
|
+
|
313
|
+
# Initialize the instance with the given File object
|
314
|
+
def initialize(file)
|
315
|
+
@file = file
|
316
|
+
end
|
317
|
+
|
318
|
+
# Get a list of dependencies
|
319
|
+
def dependencies
|
320
|
+
@dependencies ||= []
|
321
|
+
end
|
322
|
+
|
323
|
+
# Get the defined source
|
324
|
+
def sources
|
325
|
+
@sources ||= []
|
326
|
+
end
|
327
|
+
|
328
|
+
# Get the defined destinations
|
329
|
+
def destinations
|
330
|
+
@destinations ||= []
|
331
|
+
end
|
332
|
+
|
333
|
+
# Get the transforms with the specified name
|
334
|
+
# def transform(name)
|
335
|
+
# transforms[name] ||= []
|
336
|
+
# end
|
337
|
+
|
338
|
+
def after_read_processors
|
339
|
+
@after_read_processors ||= []
|
340
|
+
end
|
341
|
+
|
342
|
+
# Get all of the "before write" processors
|
343
|
+
def before_write_processors
|
344
|
+
@before_write_processors ||= []
|
345
|
+
end
|
346
|
+
|
347
|
+
# Get an Array of preprocessors
|
348
|
+
def pre_processors
|
349
|
+
@pre_processors ||= []
|
350
|
+
end
|
351
|
+
|
352
|
+
# Get an Array of post processors
|
353
|
+
def post_processors
|
354
|
+
@post_processors ||= []
|
355
|
+
end
|
356
|
+
|
357
|
+
# Get an Array of all transforms for this control
|
358
|
+
def transforms
|
359
|
+
@transforms ||= []
|
360
|
+
end
|
361
|
+
|
362
|
+
# A hash of the screens executed before post-process
|
363
|
+
def screens
|
364
|
+
@screens ||= {
|
365
|
+
:fatal => [],
|
366
|
+
:error => [],
|
367
|
+
:warn => []
|
368
|
+
}
|
369
|
+
end
|
370
|
+
|
371
|
+
# A hash of the screens executed after post-process
|
372
|
+
def after_post_process_screens
|
373
|
+
@after_post_process_screens ||= {
|
374
|
+
:fatal => [],
|
375
|
+
:error => [],
|
376
|
+
:warn => []
|
377
|
+
}
|
378
|
+
end
|
379
|
+
|
380
|
+
# Get the error threshold. Defaults to 100.
|
381
|
+
def error_threshold
|
382
|
+
@error_threshold ||= 100
|
383
|
+
end
|
384
|
+
|
385
|
+
# Validate the control file
|
386
|
+
def validate
|
387
|
+
#unless sources.length > 0
|
388
|
+
# raise ControlError, "Configuration must include one of the following for the source: #{source_types.join(',')}"
|
389
|
+
#end
|
390
|
+
#unless destinations.length > 0
|
391
|
+
# raise ControlError, "Configuration must include one of the following for the destination: #{destination_types.join(',')}"
|
392
|
+
#end
|
393
|
+
end
|
394
|
+
|
395
|
+
def source_types
|
396
|
+
[:file, :database]
|
397
|
+
end
|
398
|
+
|
399
|
+
def destination_types
|
400
|
+
[:file, :database]
|
401
|
+
end
|
402
|
+
|
403
|
+
end
|
404
|
+
end
|
405
|
+
end
|
@@ -0,0 +1,420 @@
|
|
1
|
+
module ETL #:nodoc:
|
2
|
+
module Control #:nodoc:
|
3
|
+
# Base class for destinations.
|
4
|
+
class Destination
|
5
|
+
# Read-only accessor for the ETL::Control::Control instance
|
6
|
+
attr_reader :control
|
7
|
+
|
8
|
+
# Read-only accessor for the configuration Hash
|
9
|
+
attr_reader :configuration
|
10
|
+
|
11
|
+
# Read-only accessor for the destination mapping Hash
|
12
|
+
attr_reader :mapping
|
13
|
+
|
14
|
+
# Accessor to the buffer size
|
15
|
+
attr_accessor :buffer_size
|
16
|
+
|
17
|
+
# Unique flag.
|
18
|
+
attr_accessor :unique
|
19
|
+
|
20
|
+
# A condition for writing
|
21
|
+
attr_accessor :condition
|
22
|
+
|
23
|
+
# An array of rows to append to the destination
|
24
|
+
attr_accessor :append_rows
|
25
|
+
|
26
|
+
class << self
|
27
|
+
# Get the destination class for the specified name.
|
28
|
+
#
|
29
|
+
# For example if name is :database or 'database' then the
|
30
|
+
# DatabaseDestination class is returned
|
31
|
+
def class_for_name(name)
|
32
|
+
ETL::Control.const_get("#{name.to_s.camelize}Destination")
|
33
|
+
end
|
34
|
+
end
|
35
|
+
|
36
|
+
# Initialize the destination
|
37
|
+
#
|
38
|
+
# Arguments:
|
39
|
+
# * <tt>control</tt>: The ETL::Control::Control instance
|
40
|
+
# * <tt>configuration</tt>: The configuration Hash
|
41
|
+
# * <tt>mapping</tt>: The mapping Hash
|
42
|
+
#
|
43
|
+
# Options:
|
44
|
+
# * <tt>:buffer_size</tt>: The output buffer size (default 1000 records)
|
45
|
+
# * <tt>:condition</tt>: A conditional proc that must return true for the
|
46
|
+
# row to be written
|
47
|
+
# * <tt>:append_rows</tt>: An array of rows to append
|
48
|
+
def initialize(control, configuration, mapping)
|
49
|
+
@control = control
|
50
|
+
@configuration = configuration
|
51
|
+
@mapping = mapping
|
52
|
+
@buffer_size = configuration[:buffer_size] ||= 100
|
53
|
+
@condition = configuration[:condition]
|
54
|
+
@append_rows = configuration[:append_rows]
|
55
|
+
end
|
56
|
+
|
57
|
+
# Get the current row number
|
58
|
+
def current_row
|
59
|
+
@current_row ||= 1
|
60
|
+
end
|
61
|
+
|
62
|
+
# Buffer the given row for output. The row is handed to change
# processing only when no :condition proc was configured, or when the
# configured proc returns a truthy value for it. Once the buffer reaches
# buffer_size, flush is invoked.
def write(row)
  process_change(row) if @condition.nil? || @condition.call(row)
  flush if buffer.length >= buffer_size
end
|
69
|
+
|
70
|
+
# Abstract method
|
71
|
+
def flush
|
72
|
+
raise NotImplementedError, "flush method must be implemented by subclasses"
|
73
|
+
end
|
74
|
+
|
75
|
+
# Abstract method
|
76
|
+
def close
|
77
|
+
raise NotImplementedError, "close method must be implemented by subclasses"
|
78
|
+
end
|
79
|
+
|
80
|
+
def errors
|
81
|
+
@errors ||= []
|
82
|
+
end
|
83
|
+
|
84
|
+
protected
|
85
|
+
# Access the buffer
|
86
|
+
def buffer
|
87
|
+
@buffer ||= []
|
88
|
+
end
|
89
|
+
|
90
|
+
# Access the generators map
|
91
|
+
def generators
|
92
|
+
@generators ||= {}
|
93
|
+
end
|
94
|
+
|
95
|
+
# Get the order of elements from the source order
|
96
|
+
def order_from_source
|
97
|
+
order = []
|
98
|
+
control.sources.first.definition.each do |item|
|
99
|
+
case item
|
100
|
+
when Hash
|
101
|
+
order << item[:name]
|
102
|
+
else
|
103
|
+
order << item
|
104
|
+
end
|
105
|
+
end
|
106
|
+
order
|
107
|
+
end
|
108
|
+
|
109
|
+
# Determine whether the row may be written. When the :unique option is
# configured, a compound key is assembled from the row's unique fields
# (joined with '|'); the row is rejected if that key was already seen,
# and recorded otherwise. Without a unique configuration every row is
# allowed.
def row_allowed?(row)
  return true unless unique
  key = unique.map { |field| row[field] }.join('|')
  return false if compound_key_constraints[key]
  compound_key_constraints[key] = 1
  true
end
|
120
|
+
|
121
|
+
# Get a hash of compound key contraints. This is used to determine if a
|
122
|
+
# row can be written when the unique option is specified
|
123
|
+
def compound_key_constraints
|
124
|
+
@compound_key_constraints ||= {}
|
125
|
+
end
|
126
|
+
|
127
|
+
# Return fields which are Slowly Changing Dimension fields.
|
128
|
+
# Uses the scd_fields specified in the configuration. If that's
|
129
|
+
# missing, uses all of the row's fields.
|
130
|
+
def scd_fields(row)
|
131
|
+
@scd_fields ||= configuration[:scd_fields] || row.keys
|
132
|
+
end
|
133
|
+
|
134
|
+
# Return the fields of +row+ that are not part of slowly-changing
# dimension tracking: everything except the natural key, the SCD fields
# and the bookkeeping columns (primary key, effective/end date, latest
# version). NOTE: memoized from the first row seen, like scd_fields.
def non_scd_fields(row)
  # BUGFIX: memoization ivar was misspelled @non_csd_fields, so it never
  # matched the method name; fixed to @non_scd_fields.
  @non_scd_fields ||= row.keys - natural_key - scd_fields(row) -
    [primary_key, scd_effective_date_field, scd_end_date_field, scd_latest_version_field]
end
|
138
|
+
|
139
|
+
def non_evolving_fields
|
140
|
+
(Array(configuration[:scd][:non_evolving_fields]) << primary_key).uniq
|
141
|
+
end
|
142
|
+
|
143
|
+
def scd?
|
144
|
+
!configuration[:scd].nil?
|
145
|
+
end
|
146
|
+
|
147
|
+
def scd_type
|
148
|
+
scd? ? configuration[:scd][:type] : nil
|
149
|
+
end
|
150
|
+
|
151
|
+
# Get the Slowly Changing Dimension effective date field. Defaults to
|
152
|
+
# 'effective_date'.
|
153
|
+
def scd_effective_date_field
|
154
|
+
configuration[:scd][:effective_date_field] || :effective_date if scd?
|
155
|
+
end
|
156
|
+
|
157
|
+
# Get the Slowly Changing Dimension end date field. Defaults to
|
158
|
+
# 'end_date'.
|
159
|
+
def scd_end_date_field
|
160
|
+
configuration[:scd][:end_date_field] || :end_date if scd?
|
161
|
+
end
|
162
|
+
|
163
|
+
# Get the Slowly Changing Dimension latest version field. Defaults to
|
164
|
+
# 'latest_version'.
|
165
|
+
def scd_latest_version_field
|
166
|
+
configuration[:scd][:latest_version_field] || :latest_version if scd?
|
167
|
+
end
|
168
|
+
|
169
|
+
# Return the natural key field names, defaults to []
|
170
|
+
def natural_key
|
171
|
+
@natural_key ||= determine_natural_key
|
172
|
+
end
|
173
|
+
|
174
|
+
# Get the dimension table if specified
|
175
|
+
def dimension_table
|
176
|
+
@dimension_table ||= if scd?
|
177
|
+
ETL::Engine.table(configuration[:scd][:dimension_table], dimension_target) or raise ConfigurationError, "dimension_table setting required"
|
178
|
+
end
|
179
|
+
end
|
180
|
+
|
181
|
+
# Get the dimension target if specified
|
182
|
+
def dimension_target
|
183
|
+
@dimension_target ||= if scd?
|
184
|
+
configuration[:scd][:dimension_target] or raise ConfigurationError, "dimension_target setting required"
|
185
|
+
end
|
186
|
+
end
|
187
|
+
|
188
|
+
# Process a row to determine the change type
|
189
|
+
def process_change(row)
|
190
|
+
ETL::Engine.logger.debug "Processing row: #{row.inspect}"
|
191
|
+
return unless row
|
192
|
+
|
193
|
+
# Change processing can only occur if the natural key exists in the row
|
194
|
+
ETL::Engine.logger.debug "Checking for natural key existence"
|
195
|
+
unless has_natural_key?(row)
|
196
|
+
buffer << row
|
197
|
+
return
|
198
|
+
end
|
199
|
+
|
200
|
+
@timestamp = Time.now
|
201
|
+
|
202
|
+
# See if the scd_fields of the current record have changed
|
203
|
+
# from the last time this record was loaded into the data
|
204
|
+
# warehouse. If they match then throw away this row (no need
|
205
|
+
# to process). If they do not match then the record is an
|
206
|
+
# 'update'. If the record doesn't exist then it is an 'insert'
|
207
|
+
ETL::Engine.logger.debug "Checking record for SCD change"
|
208
|
+
if @existing_row = preexisting_row(row)
|
209
|
+
if has_scd_field_changes?(row)
|
210
|
+
process_scd_change(row)
|
211
|
+
else
|
212
|
+
process_scd_match(row)
|
213
|
+
end
|
214
|
+
else
|
215
|
+
schedule_new_record(row)
|
216
|
+
end
|
217
|
+
end
|
218
|
+
|
219
|
+
# Add any virtual fields to the row. Virtual rows will get their value
|
220
|
+
# from one of the following:
|
221
|
+
# * If the mapping is a Class, then an object which implements the next
|
222
|
+
# method
|
223
|
+
# * If the mapping is a Symbol, then the XGenerator where X is the
|
224
|
+
# classified symbol
|
225
|
+
# * If the mapping is a Proc, then it will be called with the row
|
226
|
+
# * Otherwise the value itself will be assigned to the field
|
227
|
+
def add_virtuals!(row)
|
228
|
+
if mapping[:virtual]
|
229
|
+
mapping[:virtual].each do |key,value|
|
230
|
+
# If the row already has the virtual set, assume that's correct
|
231
|
+
next if row[key]
|
232
|
+
# Engine.logger.debug "Mapping virtual #{key}/#{value} for row #{row}"
|
233
|
+
case value
|
234
|
+
when Class
|
235
|
+
generator = generators[key] ||= value.new
|
236
|
+
row[key] = generator.next
|
237
|
+
when Symbol
|
238
|
+
generator = generators[key] ||= ETL::Generator::Generator.class_for_name(value).new(options)
|
239
|
+
row[key] = generator.next
|
240
|
+
when Proc
|
241
|
+
row[key] = value.call(row)
|
242
|
+
else
|
243
|
+
if value.is_a?(ETL::Generator::Generator)
|
244
|
+
row[key] = value.next
|
245
|
+
else
|
246
|
+
row[key] = value
|
247
|
+
end
|
248
|
+
end
|
249
|
+
end
|
250
|
+
end
|
251
|
+
end
|
252
|
+
|
253
|
+
private
|
254
|
+
|
255
|
+
# Determine the natural key. This method will always return an array
|
256
|
+
# of symbols. The default value is [].
|
257
|
+
def determine_natural_key
|
258
|
+
Array(configuration[:natural_key]).collect(&:to_sym)
|
259
|
+
end
|
260
|
+
|
261
|
+
# Check whether a natural key has been defined, and if so, whether
|
262
|
+
# this row has enough information to do searches based on that natural
|
263
|
+
# key.
|
264
|
+
#
|
265
|
+
# TODO: This should be factored out into
|
266
|
+
# ETL::Row#has_all_fields?(field_array) But that's not possible
|
267
|
+
# until *all* sources cast to ETL::Row, instead of sometimes
|
268
|
+
# using Hash
|
269
|
+
def has_natural_key?(row)
|
270
|
+
natural_key.any? && natural_key.all? { |key| row.has_key?(key) }
|
271
|
+
end
|
272
|
+
|
273
|
+
# Helper for generating the SQL where clause that allows searching
|
274
|
+
# by a natural key
|
275
|
+
def natural_key_equality_for_row(row)
|
276
|
+
statement = []
|
277
|
+
values = []
|
278
|
+
natural_key.each do |nk|
|
279
|
+
statement << "#{nk} = ?"
|
280
|
+
values << row[nk]
|
281
|
+
end
|
282
|
+
statement = statement.join(" AND ")
|
283
|
+
ActiveRecord::Base.send(:sanitize_sql, [statement, *values])
|
284
|
+
end
|
285
|
+
|
286
|
+
# Do all the steps required when a SCD *has* changed. Exact steps
|
287
|
+
# depend on what type of SCD we're handling.
|
288
|
+
def process_scd_change(row)
|
289
|
+
ETL::Engine.logger.debug "SCD fields do not match"
|
290
|
+
|
291
|
+
if scd_type == 2
|
292
|
+
# SCD Type 2: new row should be added and old row should be updated
|
293
|
+
ETL::Engine.logger.debug "type 2 SCD"
|
294
|
+
|
295
|
+
# To update the old row, we delete the version in the database
|
296
|
+
# and insert a new expired version
|
297
|
+
|
298
|
+
# If there is no truncate then the row will exist twice in the database
|
299
|
+
delete_outdated_record
|
300
|
+
|
301
|
+
ETL::Engine.logger.debug "expiring original record"
|
302
|
+
@existing_row[scd_end_date_field] = @timestamp
|
303
|
+
@existing_row[scd_latest_version_field] = false
|
304
|
+
|
305
|
+
buffer << @existing_row
|
306
|
+
|
307
|
+
elsif scd_type == 1
|
308
|
+
# SCD Type 1: only the new row should be added
|
309
|
+
ETL::Engine.logger.debug "type 1 SCD"
|
310
|
+
|
311
|
+
# Copy primary key, and other non-evolving fields over from
|
312
|
+
# original version of record
|
313
|
+
non_evolving_fields.each do |non_evolving_field|
|
314
|
+
row[non_evolving_field] = @existing_row[non_evolving_field]
|
315
|
+
end
|
316
|
+
|
317
|
+
# If there is no truncate then the row will exist twice in the database
|
318
|
+
delete_outdated_record
|
319
|
+
else
|
320
|
+
# SCD Type 3: not supported
|
321
|
+
ETL::Engine.logger.debug "SCD type #{scd_type} not supported"
|
322
|
+
end
|
323
|
+
|
324
|
+
# In all cases, the latest, greatest version of the record
|
325
|
+
# should go into the load
|
326
|
+
schedule_new_record(row)
|
327
|
+
end
|
328
|
+
|
329
|
+
# Do all the steps required when a SCD has *not* changed. Exact
|
330
|
+
# steps depend on what type of SCD we're handling.
|
331
|
+
def process_scd_match(row)
|
332
|
+
ETL::Engine.logger.debug "SCD fields match"
|
333
|
+
|
334
|
+
if scd_type == 2 && has_non_scd_field_changes?(row)
|
335
|
+
ETL::Engine.logger.debug "Non-SCD field changes"
|
336
|
+
# Copy important data over from original version of record
|
337
|
+
row[primary_key] = @existing_row[primary_key]
|
338
|
+
row[scd_end_date_field] = @existing_row[scd_end_date_field]
|
339
|
+
row[scd_effective_date_field] = @existing_row[scd_effective_date_field]
|
340
|
+
row[scd_latest_version_field] = @existing_row[scd_latest_version_field]
|
341
|
+
|
342
|
+
# If there is no truncate then the row will exist twice in the database
|
343
|
+
delete_outdated_record
|
344
|
+
|
345
|
+
buffer << row
|
346
|
+
else
|
347
|
+
# The record is totally the same, so skip it
|
348
|
+
end
|
349
|
+
end
|
350
|
+
|
351
|
+
# Find the version of this row that already exists in the datawarehouse.
|
352
|
+
def preexisting_row(row)
|
353
|
+
q = "SELECT * FROM #{dimension_table} WHERE #{natural_key_equality_for_row(row)}"
|
354
|
+
q << " AND #{scd_latest_version_field}" if scd_type == 2
|
355
|
+
|
356
|
+
#puts "looking for original record"
|
357
|
+
result = connection.select_one(q)
|
358
|
+
|
359
|
+
#puts "Result: #{result.inspect}"
|
360
|
+
|
361
|
+
result ? ETL::Row[result.symbolize_keys!] : nil
|
362
|
+
end
|
363
|
+
|
364
|
+
# Check whether any of the SCD fields have changed since the last load
# of this record, comparing string representations against
# @existing_row. (Fixed comment: it previously claimed to check non-SCD
# fields; also renamed the misspelled csd_field block variable.)
def has_scd_field_changes?(row)
  scd_fields(row).any? { |scd_field| row[scd_field].to_s != @existing_row[scd_field].to_s }
end
|
369
|
+
|
370
|
+
# Check whether any non-SCD fields differ from the previously loaded
# version of this record (@existing_row), comparing values as strings.
def has_non_scd_field_changes?(row)
  non_scd_fields(row).any? do |field|
    row[field].to_s != @existing_row[field].to_s
  end
end
|
375
|
+
|
376
|
+
# Grab, or re-use, a database connection for running queries directly
|
377
|
+
# during the destination processing.
|
378
|
+
def connection
|
379
|
+
@conn ||= ETL::Engine.connection(dimension_target)
|
380
|
+
end
|
381
|
+
|
382
|
+
# Utility for removing a row that has outdated information. Note
|
383
|
+
# that this deletes directly from the database, even if this is a file
|
384
|
+
# destination. It needs to do this because you can't do deletes in a
|
385
|
+
# bulk load.
|
386
|
+
def delete_outdated_record
|
387
|
+
ETL::Engine.logger.debug "deleting old row"
|
388
|
+
|
389
|
+
q = "DELETE FROM #{dimension_table} WHERE #{primary_key} = #{@existing_row[primary_key]}"
|
390
|
+
connection.delete(q)
|
391
|
+
end
|
392
|
+
|
393
|
+
# Schedule the latest, greatest version of the row for insertion
|
394
|
+
# into the database
|
395
|
+
def schedule_new_record(row)
|
396
|
+
ETL::Engine.logger.debug "writing new record"
|
397
|
+
if scd_type == 2
|
398
|
+
row[scd_effective_date_field] = @timestamp
|
399
|
+
row[scd_end_date_field] = '9999-12-31 00:00:00'
|
400
|
+
row[scd_latest_version_field] = true
|
401
|
+
end
|
402
|
+
buffer << row
|
403
|
+
end
|
404
|
+
|
405
|
+
# Get the name of the primary key for this table. Asks the dimension
|
406
|
+
# model class for this information, but if that class hasn't been
|
407
|
+
# defined, just defaults to :id.
|
408
|
+
def primary_key
|
409
|
+
return @primary_key if @primary_key
|
410
|
+
@primary_key = dimension_table.to_s.camelize.constantize.primary_key.to_sym
|
411
|
+
rescue NameError => e
|
412
|
+
ETL::Engine.logger.debug "couldn't get primary_key from dimension model class, using default :id"
|
413
|
+
@primary_key = :id
|
414
|
+
end
|
415
|
+
|
416
|
+
end
|
417
|
+
end
|
418
|
+
end
|
419
|
+
|
420
|
+
Dir[File.dirname(__FILE__) + "/destination/*.rb"].each { |file| require(file) }
|