colincasey-activewarehouse-etl 0.9.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (86) hide show
  1. data/CHANGELOG +198 -0
  2. data/LICENSE +7 -0
  3. data/README +85 -0
  4. data/Rakefile +75 -0
  5. data/TODO +28 -0
  6. data/VERSION.yml +4 -0
  7. data/bin/etl +28 -0
  8. data/bin/etl.cmd +8 -0
  9. data/lib/etl.rb +81 -0
  10. data/lib/etl/batch.rb +2 -0
  11. data/lib/etl/batch/batch.rb +111 -0
  12. data/lib/etl/batch/directives.rb +55 -0
  13. data/lib/etl/builder.rb +2 -0
  14. data/lib/etl/builder/date_dimension_builder.rb +96 -0
  15. data/lib/etl/builder/time_dimension_builder.rb +31 -0
  16. data/lib/etl/commands/etl.rb +89 -0
  17. data/lib/etl/control.rb +3 -0
  18. data/lib/etl/control/control.rb +414 -0
  19. data/lib/etl/control/destination.rb +420 -0
  20. data/lib/etl/control/destination/csv_destination.rb +84 -0
  21. data/lib/etl/control/destination/database_destination.rb +95 -0
  22. data/lib/etl/control/destination/file_destination.rb +124 -0
  23. data/lib/etl/control/destination/yaml_destination.rb +74 -0
  24. data/lib/etl/control/source.rb +109 -0
  25. data/lib/etl/control/source/database_source.rb +220 -0
  26. data/lib/etl/control/source/enumerable_source.rb +11 -0
  27. data/lib/etl/control/source/file_source.rb +90 -0
  28. data/lib/etl/control/source/model_source.rb +39 -0
  29. data/lib/etl/core_ext.rb +1 -0
  30. data/lib/etl/core_ext/time.rb +5 -0
  31. data/lib/etl/core_ext/time/calculations.rb +42 -0
  32. data/lib/etl/engine.rb +574 -0
  33. data/lib/etl/execution.rb +20 -0
  34. data/lib/etl/execution/base.rb +9 -0
  35. data/lib/etl/execution/batch.rb +8 -0
  36. data/lib/etl/execution/job.rb +8 -0
  37. data/lib/etl/execution/migration.rb +85 -0
  38. data/lib/etl/generator.rb +2 -0
  39. data/lib/etl/generator/generator.rb +20 -0
  40. data/lib/etl/generator/surrogate_key_generator.rb +39 -0
  41. data/lib/etl/http_tools.rb +139 -0
  42. data/lib/etl/parser.rb +11 -0
  43. data/lib/etl/parser/apache_combined_log_parser.rb +49 -0
  44. data/lib/etl/parser/delimited_parser.rb +74 -0
  45. data/lib/etl/parser/fixed_width_parser.rb +65 -0
  46. data/lib/etl/parser/parser.rb +41 -0
  47. data/lib/etl/parser/sax_parser.rb +218 -0
  48. data/lib/etl/parser/spreadsheet_parser.rb +114 -0
  49. data/lib/etl/parser/xml_parser.rb +65 -0
  50. data/lib/etl/processor.rb +11 -0
  51. data/lib/etl/processor/block_processor.rb +14 -0
  52. data/lib/etl/processor/bulk_import_processor.rb +81 -0
  53. data/lib/etl/processor/check_exist_processor.rb +80 -0
  54. data/lib/etl/processor/check_unique_processor.rb +35 -0
  55. data/lib/etl/processor/copy_field_processor.rb +26 -0
  56. data/lib/etl/processor/encode_processor.rb +55 -0
  57. data/lib/etl/processor/hierarchy_exploder_processor.rb +55 -0
  58. data/lib/etl/processor/print_row_processor.rb +12 -0
  59. data/lib/etl/processor/processor.rb +25 -0
  60. data/lib/etl/processor/rename_processor.rb +24 -0
  61. data/lib/etl/processor/require_non_blank_processor.rb +26 -0
  62. data/lib/etl/processor/row_processor.rb +17 -0
  63. data/lib/etl/processor/sequence_processor.rb +23 -0
  64. data/lib/etl/processor/surrogate_key_processor.rb +53 -0
  65. data/lib/etl/processor/truncate_processor.rb +35 -0
  66. data/lib/etl/row.rb +20 -0
  67. data/lib/etl/screen.rb +14 -0
  68. data/lib/etl/screen/row_count_screen.rb +20 -0
  69. data/lib/etl/transform.rb +2 -0
  70. data/lib/etl/transform/block_transform.rb +13 -0
  71. data/lib/etl/transform/date_to_string_transform.rb +20 -0
  72. data/lib/etl/transform/decode_transform.rb +51 -0
  73. data/lib/etl/transform/default_transform.rb +20 -0
  74. data/lib/etl/transform/foreign_key_lookup_transform.rb +151 -0
  75. data/lib/etl/transform/hierarchy_lookup_transform.rb +49 -0
  76. data/lib/etl/transform/ordinalize_transform.rb +12 -0
  77. data/lib/etl/transform/sha1_transform.rb +13 -0
  78. data/lib/etl/transform/string_to_date_transform.rb +16 -0
  79. data/lib/etl/transform/string_to_datetime_transform.rb +14 -0
  80. data/lib/etl/transform/string_to_time_transform.rb +11 -0
  81. data/lib/etl/transform/transform.rb +61 -0
  82. data/lib/etl/transform/trim_transform.rb +26 -0
  83. data/lib/etl/transform/type_transform.rb +35 -0
  84. data/lib/etl/util.rb +59 -0
  85. data/lib/etl/version.rb +10 -0
  86. metadata +224 -0
@@ -0,0 +1,420 @@
1
module ETL #:nodoc:
  module Control #:nodoc:
    # Base class for destinations.
    #
    # Rows are collected into an internal buffer via +write+ and flushed to
    # the concrete destination once the buffer reaches +buffer_size+.
    # Subclasses must implement +flush+ and +close+. Slowly Changing
    # Dimension (SCD) handling is driven by the :scd configuration entry.
    class Destination
      # Read-only accessor for the ETL::Control::Control instance
      attr_reader :control

      # Read-only accessor for the configuration Hash
      attr_reader :configuration

      # Read-only accessor for the destination mapping Hash
      attr_reader :mapping

      # Accessor to the buffer size
      attr_accessor :buffer_size

      # Unique flag.
      attr_accessor :unique

      # A condition for writing
      attr_accessor :condition

      # An array of rows to append to the destination
      attr_accessor :append_rows

      class << self
        # Get the destination class for the specified name.
        #
        # For example if name is :database or 'database' then the
        # DatabaseDestination class is returned
        def class_for_name(name)
          ETL::Control.const_get("#{name.to_s.camelize}Destination")
        end
      end

      # Initialize the destination
      #
      # Arguments:
      # * <tt>control</tt>: The ETL::Control::Control instance
      # * <tt>configuration</tt>: The configuration Hash
      # * <tt>mapping</tt>: The mapping Hash
      #
      # Options:
      # * <tt>:buffer_size</tt>: The output buffer size (default 100 records)
      # * <tt>:condition</tt>: A conditional proc that must return true for the
      #   row to be written
      # * <tt>:append_rows</tt>: An array of rows to append
      def initialize(control, configuration, mapping)
        @control = control
        @configuration = configuration
        @mapping = mapping
        # ||= deliberately writes the default back into the configuration
        # Hash so later readers of the configuration see the resolved value
        @buffer_size = configuration[:buffer_size] ||= 100
        @condition = configuration[:condition]
        @append_rows = configuration[:append_rows]
      end

      # Get the current row number
      def current_row
        @current_row ||= 1
      end

      # Write the given row, subject to the :condition proc (if any).
      # Automatically flushes once the buffer reaches the buffer size.
      def write(row)
        if @condition.nil? || @condition.call(row)
          process_change(row)
        end
        flush if buffer.length >= buffer_size
      end

      # Abstract method
      def flush
        raise NotImplementedError, "flush method must be implemented by subclasses"
      end

      # Abstract method
      def close
        raise NotImplementedError, "close method must be implemented by subclasses"
      end

      # Accumulated error messages for this destination
      def errors
        @errors ||= []
      end

      protected
      # Access the buffer
      def buffer
        @buffer ||= []
      end

      # Access the generators map
      def generators
        @generators ||= {}
      end

      # Get the order of elements from the source order
      def order_from_source
        order = []
        control.sources.first.definition.each do |item|
          case item
          when Hash
            order << item[:name]
          else
            order << item
          end
        end
        order
      end

      # Return true if the row is allowed. The row will not be allowed if the
      # :unique option is specified in the configuration and the compound key
      # already exists
      def row_allowed?(row)
        if unique
          key = (unique.collect { |k| row[k] }).join('|')
          return false if compound_key_constraints[key]
          compound_key_constraints[key] = 1
        end
        return true
      end

      # Get a hash of compound key constraints. This is used to determine if
      # a row can be written when the unique option is specified
      def compound_key_constraints
        @compound_key_constraints ||= {}
      end

      # Return fields which are Slowly Changing Dimension fields.
      # Uses the scd_fields specified in the configuration. If that's
      # missing, uses all of the row's fields.
      #
      # NOTE: memoized from the first row seen — all subsequent rows are
      # assumed to share the same keys.
      def scd_fields(row)
        @scd_fields ||= configuration[:scd_fields] || row.keys
      end

      # Return the row's fields that are neither part of the natural key,
      # nor SCD fields, nor SCD bookkeeping (primary key / date / version)
      # fields. Memoized from the first row seen.
      def non_scd_fields(row)
        # memoization ivar was previously misspelled @non_csd_fields
        @non_scd_fields ||= row.keys - natural_key - scd_fields(row) -
          [primary_key, scd_effective_date_field, scd_end_date_field, scd_latest_version_field]
      end

      # Fields copied over unchanged from the existing record during a
      # type-1 SCD change; always includes the primary key
      def non_evolving_fields
        (Array(configuration[:scd][:non_evolving_fields]) << primary_key).uniq
      end

      # True when an :scd configuration entry is present
      def scd?
        !configuration[:scd].nil?
      end

      # The configured SCD type, or nil when this is not an SCD destination
      def scd_type
        scd? ? configuration[:scd][:type] : nil
      end

      # Get the Slowly Changing Dimension effective date field. Defaults to
      # 'effective_date'.
      def scd_effective_date_field
        configuration[:scd][:effective_date_field] || :effective_date if scd?
      end

      # Get the Slowly Changing Dimension end date field. Defaults to
      # 'end_date'.
      def scd_end_date_field
        configuration[:scd][:end_date_field] || :end_date if scd?
      end

      # Get the Slowly Changing Dimension latest version field. Defaults to
      # 'latest_version'.
      def scd_latest_version_field
        configuration[:scd][:latest_version_field] || :latest_version if scd?
      end

      # Return the natural key field names, defaults to []
      def natural_key
        @natural_key ||= determine_natural_key
      end

      # Get the dimension table if specified
      def dimension_table
        @dimension_table ||= if scd?
          ETL::Engine.table(configuration[:scd][:dimension_table], dimension_target) or raise ConfigurationError, "dimension_table setting required"
        end
      end

      # Get the dimension target if specified
      def dimension_target
        @dimension_target ||= if scd?
          configuration[:scd][:dimension_target] or raise ConfigurationError, "dimension_target setting required"
        end
      end

      # Process a row to determine the change type
      def process_change(row)
        ETL::Engine.logger.debug "Processing row: #{row.inspect}"
        return unless row

        # Change processing can only occur if the natural key exists in the row
        ETL::Engine.logger.debug "Checking for natural key existence"
        unless has_natural_key?(row)
          buffer << row
          return
        end

        @timestamp = Time.now

        # See if the scd_fields of the current record have changed
        # from the last time this record was loaded into the data
        # warehouse. If they match then throw away this row (no need
        # to process). If they do not match then the record is an
        # 'update'. If the record doesn't exist then it is an 'insert'
        ETL::Engine.logger.debug "Checking record for SCD change"
        if @existing_row = preexisting_row(row)
          if has_scd_field_changes?(row)
            process_scd_change(row)
          else
            process_scd_match(row)
          end
        else
          schedule_new_record(row)
        end
      end

      # Add any virtual fields to the row. Virtual rows will get their value
      # from one of the following:
      # * If the mapping is a Class, then an object which implements the next
      #   method
      # * If the mapping is a Symbol, then the XGenerator where X is the
      #   classified symbol
      # * If the mapping is a Proc, then it will be called with the row
      # * Otherwise the value itself will be assigned to the field
      def add_virtuals!(row)
        if mapping[:virtual]
          mapping[:virtual].each do |key,value|
            # If the row already has the virtual set, assume that's correct
            next if row[key]
            case value
            when Class
              generator = generators[key] ||= value.new
              row[key] = generator.next
            when Symbol
              generator = generators[key] ||= ETL::Generator::Generator.class_for_name(value).new(options)
              row[key] = generator.next
            when Proc
              row[key] = value.call(row)
            else
              if value.is_a?(ETL::Generator::Generator)
                row[key] = value.next
              else
                row[key] = value
              end
            end
          end
        end
      end

      private

      # Determine the natural key. This method will always return an array
      # of symbols. The default value is [].
      def determine_natural_key
        Array(configuration[:natural_key]).collect(&:to_sym)
      end

      # Check whether a natural key has been defined, and if so, whether
      # this row has enough information to do searches based on that natural
      # key.
      #
      # TODO: This should be factored out into
      # ETL::Row#has_all_fields?(field_array) But that's not possible
      # until *all* sources cast to ETL::Row, instead of sometimes
      # using Hash
      def has_natural_key?(row)
        natural_key.any? && natural_key.all? { |key| row.has_key?(key) }
      end

      # Helper for generating the SQL where clause that allows searching
      # by a natural key
      def natural_key_equality_for_row(row)
        statement = []
        values = []
        natural_key.each do |nk|
          statement << "#{nk} = ?"
          values << row[nk]
        end
        statement = statement.join(" AND ")
        ActiveRecord::Base.send(:sanitize_sql, [statement, *values])
      end

      # Do all the steps required when a SCD *has* changed. Exact steps
      # depend on what type of SCD we're handling.
      def process_scd_change(row)
        ETL::Engine.logger.debug "SCD fields do not match"

        if scd_type == 2
          # SCD Type 2: new row should be added and old row should be updated
          ETL::Engine.logger.debug "type 2 SCD"

          # To update the old row, we delete the version in the database
          # and insert a new expired version

          # If there is no truncate then the row will exist twice in the database
          delete_outdated_record

          ETL::Engine.logger.debug "expiring original record"
          @existing_row[scd_end_date_field] = @timestamp
          @existing_row[scd_latest_version_field] = false

          buffer << @existing_row

        elsif scd_type == 1
          # SCD Type 1: only the new row should be added
          ETL::Engine.logger.debug "type 1 SCD"

          # Copy primary key, and other non-evolving fields over from
          # original version of record
          non_evolving_fields.each do |non_evolving_field|
            row[non_evolving_field] = @existing_row[non_evolving_field]
          end

          # If there is no truncate then the row will exist twice in the database
          delete_outdated_record
        else
          # SCD Type 3: not supported
          ETL::Engine.logger.debug "SCD type #{scd_type} not supported"
        end

        # In all cases, the latest, greatest version of the record
        # should go into the load
        schedule_new_record(row)
      end

      # Do all the steps required when a SCD has *not* changed. Exact
      # steps depend on what type of SCD we're handling.
      def process_scd_match(row)
        ETL::Engine.logger.debug "SCD fields match"

        if scd_type == 2 && has_non_scd_field_changes?(row)
          ETL::Engine.logger.debug "Non-SCD field changes"
          # Copy important data over from original version of record
          row[primary_key] = @existing_row[primary_key]
          row[scd_end_date_field] = @existing_row[scd_end_date_field]
          row[scd_effective_date_field] = @existing_row[scd_effective_date_field]
          row[scd_latest_version_field] = @existing_row[scd_latest_version_field]

          # If there is no truncate then the row will exist twice in the database
          delete_outdated_record

          buffer << row
        else
          # The record is totally the same, so skip it
        end
      end

      # Find the version of this row that already exists in the datawarehouse.
      def preexisting_row(row)
        q = "SELECT * FROM #{dimension_table} WHERE #{natural_key_equality_for_row(row)}"
        q << " AND #{scd_latest_version_field}" if scd_type == 2

        result = connection.select_one(q)

        result ? ETL::Row[result.symbolize_keys!] : nil
      end

      # Check whether SCD fields have changed since the last
      # load of this record.
      def has_scd_field_changes?(row)
        scd_fields(row).any? { |scd_field| row[scd_field].to_s != @existing_row[scd_field].to_s }
      end

      # Check whether non-scd fields have changed since the last
      # load of this record.
      def has_non_scd_field_changes?(row)
        non_scd_fields(row).any? { |non_scd_field| row[non_scd_field].to_s != @existing_row[non_scd_field].to_s }
      end

      # Grab, or re-use, a database connection for running queries directly
      # during the destination processing.
      def connection
        @conn ||= ETL::Engine.connection(dimension_target)
      end

      # Utility for removing a row that has outdated information. Note
      # that this deletes directly from the database, even if this is a file
      # destination. It needs to do this because you can't do deletes in a
      # bulk load.
      def delete_outdated_record
        ETL::Engine.logger.debug "deleting old row"

        # Quote the primary key value instead of interpolating it raw so a
        # non-numeric key cannot break (or inject into) the statement.
        q = "DELETE FROM #{dimension_table} WHERE #{primary_key} = #{connection.quote(@existing_row[primary_key])}"
        connection.delete(q)
      end

      # Schedule the latest, greatest version of the row for insertion
      # into the database
      def schedule_new_record(row)
        ETL::Engine.logger.debug "writing new record"
        if scd_type == 2
          row[scd_effective_date_field] = @timestamp
          row[scd_end_date_field] = '9999-12-31 00:00:00'
          row[scd_latest_version_field] = true
        end
        buffer << row
      end

      # Get the name of the primary key for this table. Asks the dimension
      # model class for this information, but if that class hasn't been
      # defined, just defaults to :id.
      def primary_key
        return @primary_key if @primary_key
        @primary_key = dimension_table.to_s.camelize.constantize.primary_key.to_sym
      rescue NameError => e
        ETL::Engine.logger.debug "couldn't get primary_key from dimension model class, using default :id"
        @primary_key = :id
      end

    end
  end
end

Dir[File.dirname(__FILE__) + "/destination/*.rb"].each { |file| require(file) }
@@ -0,0 +1,84 @@
1
module ETL #:nodoc:
  module Control #:nodoc:
    # Destination which writes rows out to a CSV file via FasterCSV.
    class CsvDestination < Destination
      # The resolved output file path
      attr_reader :file
      # True when appending to an existing file rather than overwriting
      attr_reader :append
      # True while the header row still needs to be written
      attr_reader :headers
      # The column order used when writing each row
      attr_reader :order
      # Initialize the object.
      # * <tt>control</tt>: The Control object
      # * <tt>configuration</tt>: The configuration map
      # * <tt>mapping</tt>: The output mapping
      #
      # Configuration options:
      # * <tt>:file</tt>: The file to write to (REQUIRED)
      # * <tt>:append</tt>: Set to true to append to the file (default is to overwrite)
      # * <tt>:headers</tt>: Set to true to add the headers to the output also (default is true)
      #
      # Mapping options:
      # * <tt>:order</tt>: The order array
      def initialize(control, configuration, mapping={})
        super
        # Resolve the output file relative to the control file's directory
        @file = File.join(File.dirname(control.file), configuration[:file])
        @append = configuration.fetch(:append, false)
        # fetch honours an explicit +:headers => false+; the previous
        # `configuration[:headers] ||= true` silently turned false back
        # into true, making headers impossible to disable.
        @headers = configuration.fetch(:headers, true)
        @order = mapping[:order] || order_from_source
        raise ControlError, "Order required in mapping" unless @order
      end

      # Close the destination. This will flush the buffer and close the underlying stream or connection.
      def close
        flush
        f.close
      end

      # Flush the destination buffer
      def flush
        # Emit the header row exactly once, before the first data row
        if write_header?
          f << order
        end

        buffer.flatten.each do |row|
          # check to see if this row's compound key constraint already exists
          # note that the compound key constraint may not utilize virtual fields
          next unless row_allowed?(row)
          # add any virtual fields
          add_virtuals!(row)
          # collect all of the values using the order designated in the configuration
          values = order.collect do |name|
            value = row[name]
            case value
            when Date, Time, DateTime
              value.to_s(:db)
            else
              value
            end
          end
          # write the values
          f << values
        end
        f.flush
        buffer.clear
      end

      private
      # Get the open file stream
      def f
        @f ||= FasterCSV.open(file, mode)
      end

      # Get the appropriate mode to open the file stream
      def mode
        append ? 'a' : 'w'
      end

      # Returns true exactly once (on the first call while headers are
      # enabled), then flips the flag off so the header is never repeated
      def write_header?
        if headers
          @headers = false
          return true
        end
        return false
      end

    end
  end
end