activewarehouse-etl 0.8.4 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. data/CHANGELOG +98 -62
  2. data/Rakefile +11 -0
  3. data/TODO +2 -1
  4. data/lib/etl.rb +9 -0
  5. data/lib/etl/batch.rb +2 -0
  6. data/lib/etl/batch/batch.rb +111 -0
  7. data/lib/etl/batch/directives.rb +55 -0
  8. data/lib/etl/builder.rb +1 -0
  9. data/lib/etl/builder/date_dimension_builder.rb +83 -0
  10. data/lib/etl/commands/etl.rb +56 -43
  11. data/lib/etl/control/control.rb +58 -9
  12. data/lib/etl/control/destination.rb +29 -4
  13. data/lib/etl/control/destination/database_destination.rb +17 -27
  14. data/lib/etl/control/source/database_source.rb +17 -40
  15. data/lib/etl/control/source/file_source.rb +8 -5
  16. data/lib/etl/control/source/model_source.rb +39 -0
  17. data/lib/etl/core_ext.rb +1 -0
  18. data/lib/etl/core_ext/time.rb +5 -0
  19. data/lib/etl/core_ext/time/calculations.rb +40 -0
  20. data/lib/etl/engine.rb +184 -83
  21. data/lib/etl/execution.rb +1 -0
  22. data/lib/etl/execution/base.rb +1 -1
  23. data/lib/etl/execution/batch.rb +8 -0
  24. data/lib/etl/execution/job.rb +1 -0
  25. data/lib/etl/execution/migration.rb +16 -4
  26. data/lib/etl/generator/surrogate_key_generator.rb +20 -4
  27. data/lib/etl/http_tools.rb +1 -1
  28. data/lib/etl/processor/bulk_import_processor.rb +16 -19
  29. data/lib/etl/processor/check_exist_processor.rb +16 -7
  30. data/lib/etl/processor/hierarchy_exploder_processor.rb +2 -1
  31. data/lib/etl/processor/require_non_blank_processor.rb +26 -0
  32. data/lib/etl/processor/surrogate_key_processor.rb +22 -2
  33. data/lib/etl/processor/truncate_processor.rb +13 -13
  34. data/lib/etl/screen.rb +14 -0
  35. data/lib/etl/screen/row_count_screen.rb +7 -2
  36. data/lib/etl/transform/foreign_key_lookup_transform.rb +15 -5
  37. data/lib/etl/transform/hierarchy_lookup_transform.rb +7 -14
  38. data/lib/etl/util.rb +59 -0
  39. data/lib/etl/version.rb +2 -2
  40. metadata +19 -2
@@ -0,0 +1 @@
1
+ require 'etl/builder/date_dimension_builder'
@@ -0,0 +1,83 @@
1
module ETL #:nodoc:
  module Builder #:nodoc:
    # Builds the rows for a date dimension: one hash per day from start_date
    # through end_date (inclusive), keyed by commonly used date dimension
    # column names.
    class DateDimensionBuilder
      # The date of the first record
      attr_accessor :start_date

      # The date of the last record
      attr_accessor :end_date

      # Collection checked with include? to decide whether a day is a holiday
      attr_accessor :holiday_indicators

      # Weekday indicator labels indexed by Time#wday. The default array
      # begins on Sunday and goes to Saturday.
      cattr_accessor :weekday_indicators
      @@weekday_indicators = ['Weekend','Weekday','Weekday','Weekday','Weekday','Weekday','Weekend']

      # Initialize the builder.
      #
      # * <tt>start_date</tt>: The start date. Defaults to 5 years ago from today.
      # * <tt>end_date</tt>: The end date. Defaults to now.
      def initialize(start_date=Time.now.years_ago(5), end_date=Time.now)
        @holiday_indicators = []
        @start_date = start_date
        @end_date = end_date
      end

      # Returns an array of hashes, one per day, each holding the dimension
      # values for that day keyed by column name. The options argument is
      # accepted but not read.
      def build(options={})
        records = []
        current = start_date.to_time
        until current > end_date.to_time
          records << record_for(current)
          current = current.tomorrow
        end
        records
      end

      private

      # Build the record for a single day.
      #
      # Columns that are not populated yet: day_number_in_epoch,
      # week_number_in_epoch, month_number_in_epoch,
      # last_day_in_week_indicator, last_day_in_month_indicator,
      # calendar_week_ending_date, calendar_half_year, fiscal_half_year.
      def record_for(date)
        quarter_label = "Q#{date.quarter}"
        {
          :date => date.strftime("%m/%d/%Y"),
          :full_date_description => date.strftime("%B %d,%Y"),
          :day_of_week => date.strftime("%A"),
          :day_number_in_calendar_month => date.day,
          :day_number_in_calendar_year => date.yday,
          # same value as the calendar month day; the original author left
          # open whether the fiscal value should differ
          :day_number_in_fiscal_month => date.day,
          :day_number_in_fiscal_year => date.fiscal_year_yday,
          :calendar_week => "Week #{date.week}",
          :calendar_week_number_in_year => date.week,
          :calendar_month_name => date.strftime("%B"),
          :calendar_month_number_in_year => date.month,
          :calendar_year_month => date.strftime("%Y-%m"),
          :calendar_quarter => quarter_label,
          :calendar_quarter_number_in_year => date.quarter,
          :calendar_year_quarter => "#{date.strftime('%Y')}-#{quarter_label}",
          :calendar_year => "#{date.year}",
          :fiscal_week => "FY Week #{date.fiscal_year_week}",
          :fiscal_week_number_in_year => date.fiscal_year_week,
          :fiscal_month => date.fiscal_year_month,
          :fiscal_month_number_in_year => date.fiscal_year_month,
          :fiscal_year_month => "FY#{date.fiscal_year}-" + date.fiscal_year_month.to_s.rjust(2, '0'),
          :fiscal_quarter => "FY Q#{date.fiscal_year_quarter}",
          :fiscal_year_quarter => "FY#{date.fiscal_year}-Q#{date.fiscal_year_quarter}",
          :fiscal_year_quarter_number => date.fiscal_year_quarter,
          :fiscal_year => "FY#{date.fiscal_year}",
          :fiscal_year_number => date.fiscal_year,
          :holiday_indicator => holiday_indicators.include?(date) ? 'Holiday' : 'Nonholiday',
          :weekday_indicator => weekday_indicators[date.wday],
          :selling_season => 'None',
          :major_event => 'None',
          :sql_date_stamp => date
        }
      end
    end
  end
end
@@ -24,54 +24,67 @@
24
24
  require 'benchmark'
25
25
  require 'getoptlong'
26
26
 
27
- opts = GetoptLong.new(
28
- [ '--help', '-h', GetoptLong::NO_ARGUMENT ],
29
- [ '--config', '-c', GetoptLong::REQUIRED_ARGUMENT ],
30
- [ '--limit', '-l', GetoptLong::REQUIRED_ARGUMENT ],
31
- [ '--offset', '-o', GetoptLong::REQUIRED_ARGUMENT],
32
- [ '--newlog', '-n', GetoptLong::NO_ARGUMENT ],
33
- [ '--skip-bulk-import', '-s', GetoptLong::NO_ARGUMENT ],
34
- [ '--read-locally', GetoptLong::NO_ARGUMENT]
35
- )
36
-
37
27
  # Print a usage statement
38
28
  def usage #:nodoc:
39
- puts "Usage: etl ctl_file [ctl_file2 ctl_file3 ...]" # TODO: add the command line options
29
+ puts "Usage: etl file [file file ...]" # TODO: add the command line options
40
30
  end
41
31
 
42
- options = {}
43
- opts.each do |opt, arg|
44
- case opt
45
- when '--help'
46
- usage
47
- when '--config'
48
- options[:config] = arg
49
- when '--limit'
50
- options[:limit] = arg.to_i
51
- when '--offset'
52
- options[:offset] = arg.to_i
53
- when '--newlog'
54
- options[:newlog] = true
55
- when '--skip-bulk-import'
56
- puts "skip bulk import enabled"
57
- options[:skip_bulk_import] = true
58
- when '--read-locally'
59
- puts "read locally enabled"
60
- options[:read_locally] = true
32
# Parse the command line and run the ETL engine over every file left in ARGV
# once the options have been consumed.
#
# --version and --help print their message and return without processing
# anything. When no file is given the usage statement is printed instead.
# An unrecognised option or a missing option argument prints the usage
# statement and exits with status 1 rather than ending in a backtrace.
def execute
  opts = GetoptLong.new(
    [ '--version', '-v', GetoptLong::NO_ARGUMENT ],
    [ '--help', '-h', GetoptLong::NO_ARGUMENT ],
    [ '--config', '-c', GetoptLong::REQUIRED_ARGUMENT ],
    [ '--limit', '-l', GetoptLong::REQUIRED_ARGUMENT ],
    [ '--offset', '-o', GetoptLong::REQUIRED_ARGUMENT ],
    [ '--newlog', '-n', GetoptLong::NO_ARGUMENT ],
    [ '--skip-bulk-import', '-s', GetoptLong::NO_ARGUMENT ],
    [ '--read-locally', GetoptLong::NO_ARGUMENT ],
    [ '--rails-root', GetoptLong::REQUIRED_ARGUMENT ]
  )

  options = {}
  begin
    opts.each do |opt, arg|
      case opt
      when '--version'
        puts "ActiveWarehouse ETL version #{ETL::VERSION::STRING}"
        return
      when '--help'
        usage
        return
      when '--config'
        options[:config] = arg
      when '--limit'
        options[:limit] = arg.to_i
      when '--offset'
        options[:offset] = arg.to_i
      when '--newlog'
        options[:newlog] = true
      when '--skip-bulk-import'
        puts "skip bulk import enabled"
        options[:skip_bulk_import] = true
      when '--read-locally'
        puts "read locally enabled"
        options[:read_locally] = true
      when '--rails-root'
        options[:rails_root] = arg
        puts "rails root set to #{options[:rails_root]}"
      end
    end
  rescue GetoptLong::Error
    # GetoptLong reports the offending option on standard error before it
    # raises, so only the usage statement needs to be added here.
    usage
    exit 1
  end

  if ARGV.empty?
    usage
    return
  end

  puts "Starting ETL process"

  ETL::Engine.init(options)
  ARGV.each do |f|
    puts "Processing #{f}"
    ETL::Engine.realtime_activity = true
    ETL::Engine.process(f)
  end

  puts "ETL process complete"
end
89
+
90
+ execute
@@ -2,6 +2,8 @@ module ETL #:nodoc:
2
2
  module Control #:nodoc:
3
3
  # The Context is passed to eval.
4
4
  class Context
5
+ require 'test/unit/assertions'
6
+ include Test::Unit::Assertions
5
7
  attr_reader :control
6
8
 
7
9
  class << self
@@ -26,10 +28,11 @@ module ETL #:nodoc:
26
28
  control.error_threshold = error_threshold
27
29
  end
28
30
 
29
- # Define a list of control files that this file depends on. Those control files
30
- # will be executed prior to this control file. The list may contain symbols that will
31
- # be converted to file names by calling to_s + '.ctl', or they may be strings in which
32
- # case they will be used as is
31
+ # Define a list of control files that this file depends on. Those control
32
+ # files will be executed prior to this control file. The list may
33
+ # contain symbols that will be converted to file names by calling
34
+ # to_s + '.ctl', or they may be strings in which case they will be used
35
+ # as is
33
36
  def depends_on(*args)
34
37
  dependencies << args
35
38
  end
@@ -53,7 +56,7 @@ module ETL #:nodoc:
53
56
  if configuration[:type].is_a?(ETL::Control::Source)
54
57
  sources << configuration[:type]
55
58
  else
56
- raise "Configuration must extend ETL::Control::Source"
59
+ raise ControlError, "Type must be a Class, String, Symbol or object extending ETL::Control::Source"
57
60
  end
58
61
  end
59
62
  else
@@ -61,7 +64,9 @@ module ETL #:nodoc:
61
64
  if configuration[source_type]
62
65
  source_class = ETL::Control::Source.class_for_name(source_type)
63
66
  sources << source_class.new(self, configuration, definition)
67
+ break
64
68
  end
69
+ raise ControlError, "A source was specified but no matching type was found"
65
70
  end
66
71
  end
67
72
  end
@@ -73,10 +78,29 @@ module ETL #:nodoc:
73
78
 
74
79
  # Define a destination
75
80
  def destination(name, configuration={}, mapping={})
76
- destination_types.each do |dest_type|
77
- if configuration[dest_type]
78
- dest_class = ETL::Control::Destination.class_for_name(dest_type)
81
+ if configuration[:type]
82
+ case configuration[:type]
83
+ when Class
84
+ dest_class = configuration[:type]
79
85
  destinations << dest_class.new(self, configuration, mapping)
86
+ when String, Symbol
87
+ dest_class = ETL::Control::Destination.class_for_name(configuration[:type])
88
+ destinations << dest_class.new(self, configuration, mapping)
89
+ else
90
+ if configuration[:type].is_a?(ETL::Control::Destination)
91
+ destinations << configuration[:type]
92
+ else
93
+ raise ControlError, "Type must be a Class, String, Symbol or object extending ETL::Control::Destination"
94
+ end
95
+ end
96
+ else
97
+ destination_types.each do |dest_type|
98
+ if configuration[dest_type]
99
+ dest_class = ETL::Control::Destination.class_for_name(dest_type)
100
+ destinations << dest_class.new(self, configuration, mapping)
101
+ break
102
+ end
103
+ raise ControlError, "A destination was specified but no matching destination type was found"
80
104
  end
81
105
  end
82
106
  end
@@ -121,6 +145,17 @@ module ETL #:nodoc:
121
145
  control.transforms
122
146
  end
123
147
 
148
# Define a screen block. The type argument must be one of :fatal, :error
# or :warn
#
# Raises ControlError when the control has no screen list for the given
# type, instead of failing with a NoMethodError on nil.
def screen(type, &block)
  blocks = screens[type]
  unless blocks
    raise ControlError, "Unknown screen type #{type.inspect}; must be one of :fatal, :error or :warn"
  end
  blocks << block
end

# Get the screen blocks held by the control, keyed by screen type
def screens
  control.screens
end
158
+
124
159
  # Rename the source field to the destination field
125
160
  def rename(source, destination)
126
161
  after_read :rename, :source => source, :dest => destination
@@ -222,7 +257,6 @@ module ETL #:nodoc:
222
257
  # Parse a control file and return a Control instance
223
258
  def parse(control_file)
224
259
  control_file = control_file.path if control_file.instance_of?(File)
225
- # logger.debug "Parsing control file #{control_file.path}"
226
260
  control = ETL::Control::Control.new(control_file)
227
261
  # TODO: better handling of parser errors. Return the line in the control file where the error occurs.
228
262
  eval(IO.readlines(control_file).join("\n"), Context.create(control), control_file)
@@ -230,6 +264,13 @@ module ETL #:nodoc:
230
264
  control
231
265
  end
232
266
 
267
# Build a Control instance from control definitions given as a string
# rather than read from a file. The control is created without a file, the
# text is evaluated in a fresh Context under the name 'inline', and the
# control is validated before being returned.
#
# NOTE: the text is evaluated as Ruby code, so it must come from a trusted
# source.
def parse_text(text)
  parsed = ETL::Control::Control.new(nil)
  eval(text, Context.create(parsed), 'inline')
  parsed.validate
  parsed
end
273
+
233
274
  # Resolve the given object to an ETL::Control::Control instance. Acceptable arguments
234
275
  # are:
235
276
  # * The path to a control file as a String
@@ -300,6 +341,14 @@ module ETL #:nodoc:
300
341
  @transforms ||= []
301
342
  end
302
343
 
344
# Get the screen blocks, keyed by screen type (:fatal, :error and :warn).
# Every type starts with an empty list; the same hash is returned on each
# call.
def screens
  @screens ||= [:fatal, :error, :warn].inject({}) do |by_type, type|
    by_type[type] = []
    by_type
  end
end
351
+
303
352
  # Get the error threshold. Defaults to 100.
304
353
  def error_threshold
305
354
  @error_threshold ||= 100
@@ -157,7 +157,12 @@ module ETL #:nodoc:
157
157
 
158
158
  # Get the dimension table if specified
159
159
  def dimension_table
160
- configuration[:scd][:dimension_table] if scd?
160
+ ETL::Engine.table(configuration[:scd][:dimension_table], dimension_target) if scd?
161
+ end
162
+
163
+ # Get the dimension target if specified
164
+ def dimension_target
165
+ configuration[:scd][:dimension_target] if scd?
161
166
  end
162
167
 
163
168
  # Process a row to determine the change type
@@ -209,10 +214,16 @@ module ETL #:nodoc:
209
214
 
210
215
  if scd_type == 2
211
216
  ETL::Engine.logger.debug "type 2 SCD"
217
+
218
+ raise ConfigurationError, "dimension_table setting required" unless dimension_table
219
+ raise ConfigurationError, "dimension_target setting required" unless dimension_target
220
+
221
+ conn = ETL::Engine.connection(dimension_target)
222
+
212
223
  q = "SELECT * FROM #{dimension_table} WHERE "
213
224
  q << natural_key.collect { |nk| "#{nk} = '#{row[nk]}'" }.join(" AND ")
214
225
  #puts "looking for original record"
215
- result = ETL::ActiveRecord::Base.connection.select_one(q)
226
+ result = conn.select_one(q)
216
227
  if result
217
228
  #puts "Result: #{result.inspect}"
218
229
  original_record = ETL::Row[result.symbolize_keys!]
@@ -223,6 +234,15 @@ module ETL #:nodoc:
223
234
  # need to figure out how to delete that old record before inserting the
224
235
  # updated version of the record
225
236
 
237
+ q = "DELETE FROM #{dimension_table} WHERE "
238
+ q << natural_key.collect { |nk| "#{nk} = '#{row[nk]}'" }.join(" AND ")
239
+
240
+ num_rows_affected = conn.delete(q)
241
+ ETL::Engine.logger.debug "deleted old row"
242
+
243
+ # do this?
244
+ #raise "Should have deleted a single record" if num_rows_affected != 1
245
+
226
246
  buffer << original_record
227
247
  end
228
248
 
@@ -239,9 +259,14 @@ module ETL #:nodoc:
239
259
  else
240
260
  ETL::Engine.logger.debug "CRC matches, skipping"
241
261
 
262
+ raise ConfigurationError, "dimension_table setting required" unless dimension_table
263
+ raise ConfigurationError, "dimension_target setting required" unless dimension_target
264
+
265
+ conn = ETL::Engine.connection(dimension_target)
266
+
242
267
  q = "SELECT * FROM #{dimension_table} WHERE "
243
268
  q << natural_key.collect { |nk| "#{nk} = '#{row[nk]}'" }.join(" AND ")
244
- result = ETL::ActiveRecord::Base.connection.select_one(q)
269
+ result = conn.select_one(q)
245
270
  if result
246
271
  # This was necessary when truncating and then loading, however I
247
272
  # am getting reluctant to having the ETL process do the truncation
@@ -297,7 +322,7 @@ module ETL #:nodoc:
297
322
  generator = generators[key] ||= value.new
298
323
  row[key] = generator.next
299
324
  when Symbol
300
- generator = generators[key] ||= ETL::Generator::Generator.class_for_name(value).new
325
+ generator = generators[key] ||= ETL::Generator::Generator.class_for_name(value).new(options)
301
326
  row[key] = generator.next
302
327
  when Proc
303
328
  row[key] = value.call(row)
@@ -5,6 +5,12 @@ module ETL #:nodoc:
5
5
  # loader if it is supported with your target database as it will use a much faster load
6
6
  # method.
7
7
  class DatabaseDestination < Destination
8
+ # The target connection
9
+ attr_reader :target
10
+
11
+ # The table
12
+ attr_reader :table
13
+
8
14
  # Specify the order from the source
9
15
  attr_reader :order
10
16
 
@@ -19,31 +25,31 @@ module ETL #:nodoc:
19
25
  #
20
26
  # Configuration options:
21
27
  # * <tt>:database</tt>: The database name (REQUIRED)
28
+ # * <tt>:target</tt>: The target connection (REQUIRED)
22
29
  # * <tt>:table</tt>: The table to write to (REQUIRED)
23
30
  # * <tt>:truncate</tt>: Set to true to truncate before writing (defaults to false)
24
31
  # * <tt>:unique</tt>: Set to true to only insert unique records (defaults to false)
25
- # * <tt>:adapter</tt>: The adapter to use (defaults to :mysql)
26
- # * <tt>:username</tt>: The database username (defaults to 'root')
27
- # * <tt>:password</tt>: The password to the database (defaults to nothing)
28
- # * <tt>:host</tt>: The host for the database (defaults to 'localhost')
29
32
  # * <tt>:append_rows</tt>: Array of rows to append
30
33
  #
31
34
  # Mapping options:
32
35
  # * <tt>:order</tt>: The order of fields to write (REQUIRED)
33
36
  def initialize(control, configuration, mapping={})
34
37
  super
38
+ @target = configuration[:target]
39
+ @table = configuration[:table]
35
40
  @truncate = configuration[:truncate] ||= false
36
41
  @unique = configuration[:unique]
37
42
  @order = mapping[:order] || order_from_source
38
43
  raise ControlError, "Order required in mapping" unless @order
39
- connect
44
+ raise ControlError, "Table required" unless @table
45
+ raise ControlError, "Target required" unless @target
40
46
  end
41
47
 
42
48
  # Flush the currently buffered data
43
49
  def flush
44
- conn = ETL::ActiveRecord::Base.connection
50
+ conn = ETL::Engine.connection(target)
45
51
  conn.transaction do
46
- conn.truncate(configuration[:table]) if truncate
52
+ conn.truncate(table_name) if truncate
47
53
 
48
54
  buffer.flatten.each do |row|
49
55
  # check to see if this row's compound key constraint already exists
@@ -59,7 +65,7 @@ module ETL #:nodoc:
59
65
  names << name
60
66
  values << conn.quote(row[name]) # TODO: this is probably not database agnostic
61
67
  end
62
- q = "INSERT INTO #{configuration[:table]} (#{names.join(',')}) VALUES (#{values.join(',')})"
68
+ q = "INSERT INTO #{table_name} (#{names.join(',')}) VALUES (#{values.join(',')})"
63
69
  ETL::Engine.logger.debug("Executing insert: #{q}")
64
70
  conn.insert(q, "Insert row #{current_row}")
65
71
  @current_row += 1
@@ -72,29 +78,13 @@ module ETL #:nodoc:
72
78
  def close
73
79
  buffer << append_rows if append_rows
74
80
  flush
75
- ETL::ActiveRecord::Base.connection.disconnect!
76
81
  end
77
82
 
78
83
  private
79
- # Connect to the database.
80
- #
81
- # Required options:
82
- # * <tt>:database</tt>: The database name
83
- #
84
- # Options:
85
- # * <tt>:adapter</tt>: The adapter to use (defaults to :mysql)
86
- # * <tt>:username</tt>: The database username (defaults to 'root')
87
- # * <tt>:password</tt>: The password to the database (defaults to nothing)
88
- # * <tt>:host<tt>: The host for the database (defaults to 'localhost')
89
- def connect
90
- ETL::ActiveRecord::Base.establish_connection(
91
- :adapter => (configuration[:adapter] || :mysql),
92
- :username => (configuration[:username] || 'root'),
93
- :host => (configuration[:host] || 'localhost'),
94
- :password => configuration[:password],
95
- :database => configuration[:database]
96
- )
84
# The table name to use in SQL statements: the configured table, resolved
# through ETL::Engine.table using the connection for this destination's
# target.
def table_name
  connection = ETL::Engine.connection(target)
  ETL::Engine.table(table, connection)
end
87
+
98
88
  end
99
89
  end
100
90
  end