activewarehouse-etl 0.8.4 → 0.9.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (40)
  1. data/CHANGELOG +98 -62
  2. data/Rakefile +11 -0
  3. data/TODO +2 -1
  4. data/lib/etl.rb +9 -0
  5. data/lib/etl/batch.rb +2 -0
  6. data/lib/etl/batch/batch.rb +111 -0
  7. data/lib/etl/batch/directives.rb +55 -0
  8. data/lib/etl/builder.rb +1 -0
  9. data/lib/etl/builder/date_dimension_builder.rb +83 -0
  10. data/lib/etl/commands/etl.rb +56 -43
  11. data/lib/etl/control/control.rb +58 -9
  12. data/lib/etl/control/destination.rb +29 -4
  13. data/lib/etl/control/destination/database_destination.rb +17 -27
  14. data/lib/etl/control/source/database_source.rb +17 -40
  15. data/lib/etl/control/source/file_source.rb +8 -5
  16. data/lib/etl/control/source/model_source.rb +39 -0
  17. data/lib/etl/core_ext.rb +1 -0
  18. data/lib/etl/core_ext/time.rb +5 -0
  19. data/lib/etl/core_ext/time/calculations.rb +40 -0
  20. data/lib/etl/engine.rb +184 -83
  21. data/lib/etl/execution.rb +1 -0
  22. data/lib/etl/execution/base.rb +1 -1
  23. data/lib/etl/execution/batch.rb +8 -0
  24. data/lib/etl/execution/job.rb +1 -0
  25. data/lib/etl/execution/migration.rb +16 -4
  26. data/lib/etl/generator/surrogate_key_generator.rb +20 -4
  27. data/lib/etl/http_tools.rb +1 -1
  28. data/lib/etl/processor/bulk_import_processor.rb +16 -19
  29. data/lib/etl/processor/check_exist_processor.rb +16 -7
  30. data/lib/etl/processor/hierarchy_exploder_processor.rb +2 -1
  31. data/lib/etl/processor/require_non_blank_processor.rb +26 -0
  32. data/lib/etl/processor/surrogate_key_processor.rb +22 -2
  33. data/lib/etl/processor/truncate_processor.rb +13 -13
  34. data/lib/etl/screen.rb +14 -0
  35. data/lib/etl/screen/row_count_screen.rb +7 -2
  36. data/lib/etl/transform/foreign_key_lookup_transform.rb +15 -5
  37. data/lib/etl/transform/hierarchy_lookup_transform.rb +7 -14
  38. data/lib/etl/util.rb +59 -0
  39. data/lib/etl/version.rb +2 -2
  40. metadata +19 -2
@@ -0,0 +1 @@
1
+ require 'etl/builder/date_dimension_builder'
@@ -0,0 +1,83 @@
1
+ module ETL #:nodoc:
2
+ module Builder #:nodoc:
3
+ # A builder which will build a data structure which can be used to populate a date dimension using
4
+ # commonly used date dimension columns.
5
+ class DateDimensionBuilder
6
+ # Specify the start date for the first record
7
+ attr_accessor :start_date
8
+
9
+ # Specify the end date for the last record
10
+ attr_accessor :end_date
11
+
12
+ # Define any holiday indicators
13
+ attr_accessor :holiday_indicators
14
+
15
+ # Define the weekday indicators. The default array begins on Sunday and goes to Saturday.
16
+ cattr_accessor :weekday_indicators
17
+ @@weekday_indicators = ['Weekend','Weekday','Weekday','Weekday','Weekday','Weekday','Weekend']
18
+
19
+ # Initialize the builder.
20
+ #
21
+ # * <tt>start_date</tt>: The start date. Defaults to 5 years ago from today.
22
+ # * <tt>end_date</tt>: The end date. Defaults to now.
23
+ def initialize(start_date=Time.now.years_ago(5), end_date=Time.now)
24
+ @start_date = start_date
25
+ @end_date = end_date
26
+ @holiday_indicators = []
27
+ end
28
+
29
+ # Returns an array of hashes representing records in the dimension. The values for each record are
30
+ # accessed by name.
31
+ def build(options={})
32
+ records = []
33
+ date = start_date.to_time
34
+ while date <= end_date.to_time
35
+ record = {}
36
+ record[:date] = date.strftime("%m/%d/%Y")
37
+ record[:full_date_description] = date.strftime("%B %d,%Y")
38
+ record[:day_of_week] = date.strftime("%A")
39
+ #record[:day_number_in_epoch] = date.to_i / 24
40
+ #record[:week_number_in_epoch] = date.to_i / (24 * 7)
41
+ #record[:month_number_in_epoch] = date.to_i / (24 * 7 * 30)
42
+ record[:day_number_in_calendar_month] = date.day
43
+ record[:day_number_in_calendar_year] = date.yday
44
+ record[:day_number_in_fiscal_month] = date.day # should this be different from CY?
45
+ record[:day_number_in_fiscal_year] = date.fiscal_year_yday
46
+ #record[:last_day_in_week_indicator] =
47
+ #record[:last_day_in_month_indicator] =
48
+ #record[:calendar_week_ending_date] =
49
+ record[:calendar_week] = "Week #{date.week}"
50
+ record[:calendar_week_number_in_year] = date.week
51
+ record[:calendar_month_name] = date.strftime("%B")
52
+ record[:calendar_month_number_in_year] = date.month
53
+ record[:calendar_year_month] = date.strftime("%Y-%m")
54
+ record[:calendar_quarter] = "Q#{date.quarter}"
55
+ record[:calendar_quarter_number_in_year] = date.quarter
56
+ record[:calendar_year_quarter] = "#{date.strftime('%Y')}-#{record[:calendar_quarter]}"
57
+ #record[:calendar_half_year] =
58
+ record[:calendar_year] = "#{date.year}"
59
+ record[:fiscal_week] = "FY Week #{date.fiscal_year_week}"
60
+ record[:fiscal_week_number_in_year] = date.fiscal_year_week
61
+ record[:fiscal_month] = date.fiscal_year_month
62
+ record[:fiscal_month_number_in_year] = date.fiscal_year_month
63
+ record[:fiscal_year_month] = "FY#{date.fiscal_year}-" + date.fiscal_year_month.to_s.rjust(2, '0')
64
+ record[:fiscal_quarter] = "FY Q#{date.fiscal_year_quarter}"
65
+ record[:fiscal_year_quarter] = "FY#{date.fiscal_year}-Q#{date.fiscal_year_quarter}"
66
+ record[:fiscal_year_quarter_number] = date.fiscal_year_quarter
67
+ #record[:fiscal_half_year] =
68
+ record[:fiscal_year] = "FY#{date.fiscal_year}"
69
+ record[:fiscal_year_number] = date.fiscal_year
70
+ record[:holiday_indicator] = holiday_indicators.include?(date) ? 'Holiday' : 'Nonholiday'
71
+ record[:weekday_indicator] = weekday_indicators[date.wday]
72
+ record[:selling_season] = 'None'
73
+ record[:major_event] = 'None'
74
+ record[:sql_date_stamp] = date
75
+
76
+ records << record
77
+ date = date.tomorrow
78
+ end
79
+ records
80
+ end
81
+ end
82
+ end
83
+ end
@@ -24,54 +24,67 @@
24
24
  require 'benchmark'
25
25
  require 'getoptlong'
26
26
 
27
- opts = GetoptLong.new(
28
- [ '--help', '-h', GetoptLong::NO_ARGUMENT ],
29
- [ '--config', '-c', GetoptLong::REQUIRED_ARGUMENT ],
30
- [ '--limit', '-l', GetoptLong::REQUIRED_ARGUMENT ],
31
- [ '--offset', '-o', GetoptLong::REQUIRED_ARGUMENT],
32
- [ '--newlog', '-n', GetoptLong::NO_ARGUMENT ],
33
- [ '--skip-bulk-import', '-s', GetoptLong::NO_ARGUMENT ],
34
- [ '--read-locally', GetoptLong::NO_ARGUMENT]
35
- )
36
-
37
27
  # Print a usage statement
38
28
  def usage #:nodoc:
39
- puts "Usage: etl ctl_file [ctl_file2 ctl_file3 ...]" # TODO: add the command line options
29
+ puts "Usage: etl file [file file ...]" # TODO: add the command line options
40
30
  end
41
31
 
42
- options = {}
43
- opts.each do |opt, arg|
44
- case opt
45
- when '--help'
46
- usage
47
- when '--config'
48
- options[:config] = arg
49
- when '--limit'
50
- options[:limit] = arg.to_i
51
- when '--offset'
52
- options[:offset] = arg.to_i
53
- when '--newlog'
54
- options[:newlog] = true
55
- when '--skip-bulk-import'
56
- puts "skip bulk import enabled"
57
- options[:skip_bulk_import] = true
58
- when '--read-locally'
59
- puts "read locally enabled"
60
- options[:read_locally] = true
32
+ def execute
33
+ opts = GetoptLong.new(
34
+ [ '--version', '-v', GetoptLong::NO_ARGUMENT],
35
+ [ '--help', '-h', GetoptLong::NO_ARGUMENT ],
36
+ [ '--config', '-c', GetoptLong::REQUIRED_ARGUMENT ],
37
+ [ '--limit', '-l', GetoptLong::REQUIRED_ARGUMENT ],
38
+ [ '--offset', '-o', GetoptLong::REQUIRED_ARGUMENT],
39
+ [ '--newlog', '-n', GetoptLong::NO_ARGUMENT ],
40
+ [ '--skip-bulk-import', '-s', GetoptLong::NO_ARGUMENT ],
41
+ [ '--read-locally', GetoptLong::NO_ARGUMENT],
42
+ [ '--rails-root', GetoptLong::REQUIRED_ARGUMENT]
43
+ )
44
+
45
+ options = {}
46
+ opts.each do |opt, arg|
47
+ case opt
48
+ when '--version'
49
+ puts "ActiveWarehouse ETL version #{ETL::VERSION::STRING}"
50
+ return
51
+ when '--help'
52
+ usage
53
+ return
54
+ when '--config'
55
+ options[:config] = arg
56
+ when '--limit'
57
+ options[:limit] = arg.to_i
58
+ when '--offset'
59
+ options[:offset] = arg.to_i
60
+ when '--newlog'
61
+ options[:newlog] = true
62
+ when '--skip-bulk-import'
63
+ puts "skip bulk import enabled"
64
+ options[:skip_bulk_import] = true
65
+ when '--read-locally'
66
+ puts "read locally enabled"
67
+ options[:read_locally] = true
68
+ when '--rails-root'
69
+ options[:rails_root] = arg
70
+ puts "rails root set to #{options[:rails_root]}"
71
+ end
61
72
  end
62
- end
63
73
 
64
- if ARGV.length < 1
65
- usage
66
- else
67
- puts "Starting ETL process"
74
+ if ARGV.length < 1
75
+ usage
76
+ else
77
+ puts "Starting ETL process"
68
78
 
69
- ETL::Engine.init(options)
70
- ARGV.each do |f|
71
- puts "Processing #{f}"
72
- ETL::Engine.realtime_activity = true
73
- ETL::Engine.process(f)
74
- end
79
+ ETL::Engine.init(options)
80
+ ARGV.each do |f|
81
+ puts "Processing #{f}"
82
+ ETL::Engine.realtime_activity = true
83
+ ETL::Engine.process(f)
84
+ end
75
85
 
76
- puts "ETL process complete"
77
- end
86
+ puts "ETL process complete"
87
+ end
88
+ end
89
+
90
+ execute
@@ -2,6 +2,8 @@ module ETL #:nodoc:
2
2
  module Control #:nodoc:
3
3
  # The Context is passed to eval.
4
4
  class Context
5
+ require 'test/unit/assertions'
6
+ include Test::Unit::Assertions
5
7
  attr_reader :control
6
8
 
7
9
  class << self
@@ -26,10 +28,11 @@ module ETL #:nodoc:
26
28
  control.error_threshold = error_threshold
27
29
  end
28
30
 
29
- # Define a list of control files that this file depends on. Those control files
30
- # will be executed prior to this control file. The list may contain symbols that will
31
- # be converted to file names by calling to_s + '.ctl', or they may be strings in which
32
- # case they will be used as is
31
+ # Define a list of control files that this file depends on. Those control
32
+ # files will be executed prior to this control file. The list may
33
+ # contain symbols that will be converted to file names by calling
34
+ # to_s + '.ctl', or they may be strings in which case they will be used
35
+ # as is
33
36
  def depends_on(*args)
34
37
  dependencies << args
35
38
  end
@@ -53,7 +56,7 @@ module ETL #:nodoc:
53
56
  if configuration[:type].is_a?(ETL::Control::Source)
54
57
  sources << configuration[:type]
55
58
  else
56
- raise "Configuration must extend ETL::Control::Source"
59
+ raise ControlError, "Type must be a Class, String, Symbol or object extending ETL::Control::Source"
57
60
  end
58
61
  end
59
62
  else
@@ -61,7 +64,9 @@ module ETL #:nodoc:
61
64
  if configuration[source_type]
62
65
  source_class = ETL::Control::Source.class_for_name(source_type)
63
66
  sources << source_class.new(self, configuration, definition)
67
+ break
64
68
  end
69
+ raise ControlError, "A source was specified but no matching type was found"
65
70
  end
66
71
  end
67
72
  end
@@ -73,10 +78,29 @@ module ETL #:nodoc:
73
78
 
74
79
  # Define a destination
75
80
  def destination(name, configuration={}, mapping={})
76
- destination_types.each do |dest_type|
77
- if configuration[dest_type]
78
- dest_class = ETL::Control::Destination.class_for_name(dest_type)
81
+ if configuration[:type]
82
+ case configuration[:type]
83
+ when Class
84
+ dest_class = configuration[:type]
79
85
  destinations << dest_class.new(self, configuration, mapping)
86
+ when String, Symbol
87
+ dest_class = ETL::Control::Destination.class_for_name(configuration[:type])
88
+ destinations << dest_class.new(self, configuration, mapping)
89
+ else
90
+ if configuration[:type].is_a?(ETL::Control::Destination)
91
+ destinations << configuration[:type]
92
+ else
93
+ raise ControlError, "Type must be a Class, String, Symbol or object extending ETL::Control::Destination"
94
+ end
95
+ end
96
+ else
97
+ destination_types.each do |dest_type|
98
+ if configuration[dest_type]
99
+ dest_class = ETL::Control::Destination.class_for_name(dest_type)
100
+ destinations << dest_class.new(self, configuration, mapping)
101
+ break
102
+ end
103
+ raise ControlError, "A destination was specified but no matching destination type was found"
80
104
  end
81
105
  end
82
106
  end
@@ -121,6 +145,17 @@ module ETL #:nodoc:
121
145
  control.transforms
122
146
  end
123
147
 
148
+ # Define a screen block. The type argument must be one of :fatal, :error
149
+ # or :warn
150
+ def screen(type, &block)
151
+ screens[type] << block
152
+ end
153
+
154
+ # Get the screen blocks
155
+ def screens
156
+ control.screens
157
+ end
158
+
124
159
  # Rename the source field to the destination field
125
160
  def rename(source, destination)
126
161
  after_read :rename, :source => source, :dest => destination
@@ -222,7 +257,6 @@ module ETL #:nodoc:
222
257
  # Parse a control file and return a Control instance
223
258
  def parse(control_file)
224
259
  control_file = control_file.path if control_file.instance_of?(File)
225
- # logger.debug "Parsing control file #{control_file.path}"
226
260
  control = ETL::Control::Control.new(control_file)
227
261
  # TODO: better handling of parser errors. Return the line in the control file where the error occurs.
228
262
  eval(IO.readlines(control_file).join("\n"), Context.create(control), control_file)
@@ -230,6 +264,13 @@ module ETL #:nodoc:
230
264
  control
231
265
  end
232
266
 
267
+ def parse_text(text)
268
+ control = ETL::Control::Control.new(nil)
269
+ eval(text, Context.create(control), 'inline')
270
+ control.validate
271
+ control
272
+ end
273
+
233
274
  # Resolve the given object to an ETL::Control::Control instance. Acceptable arguments
234
275
  # are:
235
276
  # * The path to a control file as a String
@@ -300,6 +341,14 @@ module ETL #:nodoc:
300
341
  @transforms ||= []
301
342
  end
302
343
 
344
+ def screens
345
+ @screens ||= {
346
+ :fatal => [],
347
+ :error => [],
348
+ :warn => []
349
+ }
350
+ end
351
+
303
352
  # Get the error threshold. Defaults to 100.
304
353
  def error_threshold
305
354
  @error_threshold ||= 100
@@ -157,7 +157,12 @@ module ETL #:nodoc:
157
157
 
158
158
  # Get the dimension table if specified
159
159
  def dimension_table
160
- configuration[:scd][:dimension_table] if scd?
160
+ ETL::Engine.table(configuration[:scd][:dimension_table], dimension_target) if scd?
161
+ end
162
+
163
+ # Get the dimension target if specified
164
+ def dimension_target
165
+ configuration[:scd][:dimension_target] if scd?
161
166
  end
162
167
 
163
168
  # Process a row to determine the change type
@@ -209,10 +214,16 @@ module ETL #:nodoc:
209
214
 
210
215
  if scd_type == 2
211
216
  ETL::Engine.logger.debug "type 2 SCD"
217
+
218
+ raise ConfigurationError, "dimension_table setting required" unless dimension_table
219
+ raise ConfigurationError, "dimension_target setting required" unless dimension_target
220
+
221
+ conn = ETL::Engine.connection(dimension_target)
222
+
212
223
  q = "SELECT * FROM #{dimension_table} WHERE "
213
224
  q << natural_key.collect { |nk| "#{nk} = '#{row[nk]}'" }.join(" AND ")
214
225
  #puts "looking for original record"
215
- result = ETL::ActiveRecord::Base.connection.select_one(q)
226
+ result = conn.select_one(q)
216
227
  if result
217
228
  #puts "Result: #{result.inspect}"
218
229
  original_record = ETL::Row[result.symbolize_keys!]
@@ -223,6 +234,15 @@ module ETL #:nodoc:
223
234
  # need to figure out how to delete that old record before inserting the
224
235
  # updated version of the record
225
236
 
237
+ q = "DELETE FROM #{dimension_table} WHERE "
238
+ q << natural_key.collect { |nk| "#{nk} = '#{row[nk]}'" }.join(" AND ")
239
+
240
+ num_rows_affected = conn.delete(q)
241
+ ETL::Engine.logger.debug "deleted old row"
242
+
243
+ # do this?
244
+ #raise "Should have deleted a single record" if num_rows_affected != 1
245
+
226
246
  buffer << original_record
227
247
  end
228
248
 
@@ -239,9 +259,14 @@ module ETL #:nodoc:
239
259
  else
240
260
  ETL::Engine.logger.debug "CRC matches, skipping"
241
261
 
262
+ raise ConfigurationError, "dimension_table setting required" unless dimension_table
263
+ raise ConfigurationError, "dimension_target setting required" unless dimension_target
264
+
265
+ conn = ETL::Engine.connection(dimension_target)
266
+
242
267
  q = "SELECT * FROM #{dimension_table} WHERE "
243
268
  q << natural_key.collect { |nk| "#{nk} = '#{row[nk]}'" }.join(" AND ")
244
- result = ETL::ActiveRecord::Base.connection.select_one(q)
269
+ result = conn.select_one(q)
245
270
  if result
246
271
  # This was necessary when truncating and then loading, however I
247
272
  # am getting reluctant to having the ETL process do the truncation
@@ -297,7 +322,7 @@ module ETL #:nodoc:
297
322
  generator = generators[key] ||= value.new
298
323
  row[key] = generator.next
299
324
  when Symbol
300
- generator = generators[key] ||= ETL::Generator::Generator.class_for_name(value).new
325
+ generator = generators[key] ||= ETL::Generator::Generator.class_for_name(value).new(options)
301
326
  row[key] = generator.next
302
327
  when Proc
303
328
  row[key] = value.call(row)
@@ -5,6 +5,12 @@ module ETL #:nodoc:
5
5
  # loader if it is supported with your target database as it will use a much faster load
6
6
  # method.
7
7
  class DatabaseDestination < Destination
8
+ # The target connection
9
+ attr_reader :target
10
+
11
+ # The table
12
+ attr_reader :table
13
+
8
14
  # Specify the order from the source
9
15
  attr_reader :order
10
16
 
@@ -19,31 +25,31 @@ module ETL #:nodoc:
19
25
  #
20
26
  # Configuration options:
21
27
  # * <tt>:database</tt>: The database name (REQUIRED)
28
+ # * <tt>:target</tt>: The target connection (REQUIRED)
22
29
  # * <tt>:table</tt>: The table to write to (REQUIRED)
23
30
  # * <tt>:truncate</tt>: Set to true to truncate before writing (defaults to false)
24
31
  # * <tt>:unique</tt>: Set to true to only insert unique records (defaults to false)
25
- # * <tt>:adapter</tt>: The adapter to use (defaults to :mysql)
26
- # * <tt>:username</tt>: The database username (defaults to 'root')
27
- # * <tt>:password</tt>: The password to the database (defaults to nothing)
28
- # * <tt>:host</tt>: The host for the database (defaults to 'localhost')
29
32
  # * <tt>:append_rows</tt>: Array of rows to append
30
33
  #
31
34
  # Mapping options:
32
35
  # * <tt>:order</tt>: The order of fields to write (REQUIRED)
33
36
  def initialize(control, configuration, mapping={})
34
37
  super
38
+ @target = configuration[:target]
39
+ @table = configuration[:table]
35
40
  @truncate = configuration[:truncate] ||= false
36
41
  @unique = configuration[:unique]
37
42
  @order = mapping[:order] || order_from_source
38
43
  raise ControlError, "Order required in mapping" unless @order
39
- connect
44
+ raise ControlError, "Table required" unless @table
45
+ raise ControlError, "Target required" unless @target
40
46
  end
41
47
 
42
48
  # Flush the currently buffered data
43
49
  def flush
44
- conn = ETL::ActiveRecord::Base.connection
50
+ conn = ETL::Engine.connection(target)
45
51
  conn.transaction do
46
- conn.truncate(configuration[:table]) if truncate
52
+ conn.truncate(table_name) if truncate
47
53
 
48
54
  buffer.flatten.each do |row|
49
55
  # check to see if this row's compound key constraint already exists
@@ -59,7 +65,7 @@ module ETL #:nodoc:
59
65
  names << name
60
66
  values << conn.quote(row[name]) # TODO: this is probably not database agnostic
61
67
  end
62
- q = "INSERT INTO #{configuration[:table]} (#{names.join(',')}) VALUES (#{values.join(',')})"
68
+ q = "INSERT INTO #{table_name} (#{names.join(',')}) VALUES (#{values.join(',')})"
63
69
  ETL::Engine.logger.debug("Executing insert: #{q}")
64
70
  conn.insert(q, "Insert row #{current_row}")
65
71
  @current_row += 1
@@ -72,29 +78,13 @@ module ETL #:nodoc:
72
78
  def close
73
79
  buffer << append_rows if append_rows
74
80
  flush
75
- ETL::ActiveRecord::Base.connection.disconnect!
76
81
  end
77
82
 
78
83
  private
79
- # Connect to the database.
80
- #
81
- # Required options:
82
- # * <tt>:database</tt>: The database name
83
- #
84
- # Options:
85
- # * <tt>:adapter</tt>: The adapter to use (defaults to :mysql)
86
- # * <tt>:username</tt>: The database username (defaults to 'root')
87
- # * <tt>:password</tt>: The password to the database (defaults to nothing)
88
- # * <tt>:host<tt>: The host for the database (defaults to 'localhost')
89
- def connect
90
- ETL::ActiveRecord::Base.establish_connection(
91
- :adapter => (configuration[:adapter] || :mysql),
92
- :username => (configuration[:username] || 'root'),
93
- :host => (configuration[:host] || 'localhost'),
94
- :password => configuration[:password],
95
- :database => configuration[:database]
96
- )
84
+ def table_name
85
+ ETL::Engine.table(table, ETL::Engine.connection(target))
97
86
  end
87
+
98
88
  end
99
89
  end
100
90
  end