activewarehouse-etl 0.8.4 → 0.9.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (40)
  1. data/CHANGELOG +98 -62
  2. data/Rakefile +11 -0
  3. data/TODO +2 -1
  4. data/lib/etl.rb +9 -0
  5. data/lib/etl/batch.rb +2 -0
  6. data/lib/etl/batch/batch.rb +111 -0
  7. data/lib/etl/batch/directives.rb +55 -0
  8. data/lib/etl/builder.rb +1 -0
  9. data/lib/etl/builder/date_dimension_builder.rb +83 -0
  10. data/lib/etl/commands/etl.rb +56 -43
  11. data/lib/etl/control/control.rb +58 -9
  12. data/lib/etl/control/destination.rb +29 -4
  13. data/lib/etl/control/destination/database_destination.rb +17 -27
  14. data/lib/etl/control/source/database_source.rb +17 -40
  15. data/lib/etl/control/source/file_source.rb +8 -5
  16. data/lib/etl/control/source/model_source.rb +39 -0
  17. data/lib/etl/core_ext.rb +1 -0
  18. data/lib/etl/core_ext/time.rb +5 -0
  19. data/lib/etl/core_ext/time/calculations.rb +40 -0
  20. data/lib/etl/engine.rb +184 -83
  21. data/lib/etl/execution.rb +1 -0
  22. data/lib/etl/execution/base.rb +1 -1
  23. data/lib/etl/execution/batch.rb +8 -0
  24. data/lib/etl/execution/job.rb +1 -0
  25. data/lib/etl/execution/migration.rb +16 -4
  26. data/lib/etl/generator/surrogate_key_generator.rb +20 -4
  27. data/lib/etl/http_tools.rb +1 -1
  28. data/lib/etl/processor/bulk_import_processor.rb +16 -19
  29. data/lib/etl/processor/check_exist_processor.rb +16 -7
  30. data/lib/etl/processor/hierarchy_exploder_processor.rb +2 -1
  31. data/lib/etl/processor/require_non_blank_processor.rb +26 -0
  32. data/lib/etl/processor/surrogate_key_processor.rb +22 -2
  33. data/lib/etl/processor/truncate_processor.rb +13 -13
  34. data/lib/etl/screen.rb +14 -0
  35. data/lib/etl/screen/row_count_screen.rb +7 -2
  36. data/lib/etl/transform/foreign_key_lookup_transform.rb +15 -5
  37. data/lib/etl/transform/hierarchy_lookup_transform.rb +7 -14
  38. data/lib/etl/util.rb +59 -0
  39. data/lib/etl/version.rb +2 -2
  40. metadata +19 -2
@@ -14,6 +14,7 @@ module ETL #:nodoc
14
14
  end
15
15
 
16
16
  require 'etl/execution/base'
17
+ require 'etl/execution/batch'
17
18
  require 'etl/execution/job'
18
19
  require 'etl/execution/record'
19
20
  require 'etl/execution/migration'
@@ -1,7 +1,7 @@
1
1
  module ETL #:nodoc:
2
2
  module Execution #:nodoc:
3
3
  # Base class for ETL execution information
4
- class Base < ETL::ActiveRecord::Base
4
+ class Base < ActiveRecord::Base
5
5
  self.abstract_class = true
6
6
  establish_connection :etl_execution
7
7
  end
@@ -0,0 +1,8 @@
1
+ module ETL #:nodoc:
2
+ module Execution #:nodoc:
3
+ # Persistent class representing an ETL batch
4
+ class Batch < Base
5
+ has_many :jobs
6
+ end
7
+ end
8
+ end
@@ -2,6 +2,7 @@ module ETL #:nodoc:
2
2
  module Execution #:nodoc:
3
3
  # Persistent class representing an ETL job
4
4
  class Job < Base
5
+ belongs_to :batch
5
6
  end
6
7
  end
7
8
  end
@@ -8,7 +8,10 @@ module ETL #:nodoc:
8
8
  def migrate
9
9
  connection.initialize_schema_information
10
10
  v = connection.select_value("SELECT version FROM #{schema_info_table_name}").to_i
11
- v.upto(target - 1) { |i| __send__("migration_#{i+1}".to_sym) }
11
+ v.upto(target - 1) do |i|
12
+ __send__("migration_#{i+1}".to_sym)
13
+ update_schema_info(i+1)
14
+ end
12
15
  end
13
16
  protected
14
17
  # Get the schema info table name
@@ -24,7 +27,7 @@ module ETL #:nodoc:
24
27
 
25
28
  # Get the final target version number
26
29
  def target
27
- 2
30
+ 3
28
31
  end
29
32
 
30
33
  private
@@ -41,14 +44,23 @@ module ETL #:nodoc:
41
44
  t.column :crc, :string, :null => false
42
45
  t.column :job_id, :integer, :null => false
43
46
  end
44
- update_schema_info(1)
45
47
  end
46
48
 
47
49
  def migration_2 #:nodoc:
48
50
  connection.add_index :records, :control_file
49
51
  connection.add_index :records, :natural_key
50
52
  connection.add_index :records, :job_id
51
- update_schema_info(2)
53
+ end
54
+
55
+ def migration_3 #:nodoc:
56
+ connection.create_table :batches do |t|
57
+ t.column :batch_file, :string, :null => false
58
+ t.column :created_at, :datetime, :null => false
59
+ t.column :completed_at, :datetime
60
+ t.column :status, :string
61
+ end
62
+ connection.add_column :jobs, :batch_id, :integer
63
+ connection.add_index :jobs, :batch_id
52
64
  end
53
65
 
54
66
  # Update the schema info table, setting the version value
@@ -4,13 +4,25 @@ module ETL #:nodoc:
4
4
  module Generator #:nodoc:
5
5
  # Surrogate key generator.
6
6
  class SurrogateKeyGenerator < Generator
7
+ attr_reader :table
8
+ attr_reader :target
9
+ attr_reader :column
10
+ attr_reader :query
11
+
7
12
  # Initialize the generator
8
13
  def initialize(options={})
9
- if options[:query]
10
- @surrogate_key = ETL::ActiveRecord::Base.connection.select_value(options[:query])
11
- @surrogate_key = 0 if @surrogate_key.blank?
12
- @surrogate_key = @surrogate_key.to_i
14
+ @table = options[:table]
15
+ @target = options[:target]
16
+ @column = options[:column] || 'id'
17
+ @query = options[:query]
18
+
19
+ if table
20
+ @surrogate_key = ETL::Engine.connection(target).select_value("SELECT max(#{column}) FROM #{table_name}")
21
+ elsif query
22
+ @surrogate_key = ETL::Engine.connection(target).select_value(query)
13
23
  end
24
+ @surrogate_key = 0 if @surrogate_key.blank?
25
+ @surrogate_key = @surrogate_key.to_i
14
26
  end
15
27
 
16
28
  # Get the next surrogate key
@@ -18,6 +30,10 @@ module ETL #:nodoc:
18
30
  @surrogate_key ||= 0
19
31
  @surrogate_key += 1
20
32
  end
33
+
34
+ def table_name
35
+ ETL::Engine.table(table, ETL::Engine.connection(target))
36
+ end
21
37
  end
22
38
  end
23
39
  end
@@ -8,7 +8,7 @@ module HttpTools
8
8
  def parse_user_agent(user_agent)
9
9
  if '-' == user_agent
10
10
  #raise 'Invalid User Agent'
11
- puts 'Invalid User Agent'
11
+ #puts 'Invalid User Agent'
12
12
  end
13
13
 
14
14
  browser, browser_version_major, browser_version_minor, ostype, os, os_version = nil
@@ -4,10 +4,13 @@ module ETL #:nodoc:
4
4
  # underlying database driver from ActiveRecord must support the
5
5
  # +bulk_load+ method.
6
6
  class BulkImportProcessor < ETL::Processor::Processor
7
+
7
8
  # The file to load from
8
9
  attr_reader :file
9
- # The target database information (see +initialize+)
10
+ # The target database
10
11
  attr_reader :target
12
+ # The table name
13
+ attr_reader :table
11
14
  # Set to true to truncate
12
15
  attr_reader :truncate
13
16
  # Array of symbols representing the column load order
@@ -23,7 +26,8 @@ module ETL #:nodoc:
23
26
  #
24
27
  # Configuration options:
25
28
  # * <tt>:file</tt>: The file to load data from
26
- # * <tt>:target</tt>: The target connection information
29
+ # * <tt>:target</tt>: The target database
30
+ # * <tt>:table</tt>: The table name
27
31
  # * <tt>:truncate</tt>: Set to true to truncate before loading
28
32
  # * <tt>:columns</tt>: The columns to load in the order they appear in
29
33
  # the bulk data file
@@ -34,23 +38,25 @@ module ETL #:nodoc:
34
38
  super
35
39
  @file = File.join(File.dirname(control.file), configuration[:file])
36
40
  @target = configuration[:target]
41
+ @table = configuration[:table]
37
42
  @truncate = configuration[:truncate] ||= false
38
43
  @columns = configuration[:columns]
39
44
  @field_separator = (configuration[:field_separator] || ',')
40
45
  @line_separator = (configuration[:line_separator] || "\n")
41
46
  @field_enclosure = configuration[:field_enclosure]
42
- connect
47
+
48
+ raise ControlError, "Target must be specified" unless @target
49
+ raise ControlError, "Table must be specified" unless @table
43
50
  end
44
51
 
45
52
  # Execute the processor
46
53
  def process
47
54
  return if ETL::Engine.skip_bulk_import
55
+ return if File.size(file) == 0
48
56
 
49
- conn = ETL::ActiveRecord::Base.connection
57
+ conn = ETL::Engine.connection(target)
50
58
  conn.transaction do
51
- # TODO: Support all database types
52
- # Since LOCAL is used this must be allowed by both the client and server
53
- conn.truncate(target[:table]) if truncate
59
+ conn.truncate(table_name) if truncate
54
60
  options = {}
55
61
  options[:columns] = columns
56
62
  if field_separator || field_enclosure
@@ -59,21 +65,12 @@ module ETL #:nodoc:
59
65
  options[:fields][:enclosed_by] = field_enclosure if field_enclosure
60
66
  options[:fields][:terminated_by] = line_separator if line_separator
61
67
  end
62
- conn.bulk_load(file, target[:table], options)
68
+ conn.bulk_load(file, table_name, options)
63
69
  end
64
70
  end
65
71
 
66
- private
67
- # Connect to the database
68
- def connect
69
- Engine.logger.debug "Connecting to database #{target[:database]}"
70
- ETL::ActiveRecord::Base.establish_connection(
71
- :adapter => (target[:adapter] || :mysql),
72
- :username => (target[:username] || 'root'),
73
- :host => (target[:host] || 'localhost'),
74
- :password => target[:password],
75
- :database => target[:database]
76
- )
72
+ def table_name
73
+ ETL::Engine.table(table, ETL::Engine.connection(target))
77
74
  end
78
75
  end
79
76
  end
@@ -6,6 +6,9 @@ module ETL #:nodoc:
6
6
  # A symbol or array of symbols representing keys that should be skipped
7
7
  attr_accessor :skip
8
8
 
9
+ # The target database
10
+ attr_accessor :target
11
+
9
12
  # The name of the table to check against
10
13
  attr_accessor :table
11
14
 
@@ -26,11 +29,12 @@ module ETL #:nodoc:
26
29
  def initialize(control, configuration)
27
30
  super
28
31
  @skip = configuration[:skip] || []
29
- @table = configuration[:table]
32
+ @target = configuration[:target] || raise(ETL::ControlError, "target must be specified")
33
+ @table = configuration[:table] || raise(ETL::ControlError, "table must be specified")
30
34
  @columns = configuration[:columns]
31
35
 
32
- q = "SELECT COUNT(*) FROM #{table}"
33
- @should_check = ETL::ActiveRecord::Base.connection.select_value(q).to_i > 0
36
+ q = "SELECT COUNT(*) FROM #{table_name}"
37
+ @should_check = ETL::Engine.connection(target).select_value(q).to_i > 0
34
38
  end
35
39
 
36
40
  # Return true if the given key should be skipped
@@ -51,20 +55,25 @@ module ETL #:nodoc:
51
55
  # Process the row
52
56
  def process(row)
53
57
  return row unless should_check?
54
- connection = ETL::ActiveRecord::Base.connection
55
- q = "SELECT * FROM #{table} WHERE "
58
+ conn = ETL::Engine.connection(target)
59
+ q = "SELECT * FROM #{table_name} WHERE "
56
60
  conditions = []
57
61
  row.each do |k,v|
58
62
  if columns.nil? || columns.include?(k.to_sym)
59
- conditions << "#{k} = #{connection.quote(v)}" unless skip?(k.to_sym)
63
+ conditions << "#{k} = #{conn.quote(v)}" unless skip?(k.to_sym)
60
64
  end
61
65
  end
62
66
  q << conditions.join(" AND ")
63
67
 
64
68
  #puts "query: #{q}"
65
- result = connection.select_one(q)
69
+ result = conn.select_one(q)
66
70
  return row if result.nil?
67
71
  end
72
+
73
+ private
74
+ def table_name
75
+ ETL::Engine.table(table, ETL::Engine.connection(target))
76
+ end
68
77
  end
69
78
  end
70
79
  end
@@ -24,8 +24,9 @@ module ETL #:nodoc:
24
24
  # Process the row expanding it into hierarchy values
25
25
  def process(row)
26
26
  rows = []
27
- conn = configuration[:connection]
27
+ target = configuration[:target]
28
28
  table = configuration[:table]
29
+ conn = ETL::Engine.connection(target)
29
30
  build_rows([row[:id]], row[:id], row[:id], row[:parent_id].nil?, 0, rows, table, conn)
30
31
  rows
31
32
  end
@@ -0,0 +1,26 @@
1
+ module ETL #:nodoc:
2
+ module Processor #:nodoc:
3
+ # A processor which requires that the particular fields are non-blank in
4
+ # order for the row to be retained.
5
+ class RequireNonBlankProcessor < ETL::Processor::RowProcessor
6
+ # An array of fields to check
7
+ attr_reader :fields
8
+
9
+ # Initialize the processor
10
+ #
11
+ # Options:
12
+ # * <tt>:fields</tt>: An array of fields to check, for example:
13
+ # [:first_name,:last_name]
14
+ def initialize(control, configuration)
15
+ super
16
+ @fields = configuration[:fields] || []
17
+ end
18
+
19
+ # Process the row.
20
+ def process(row)
21
+ fields.each { |field| return if row[field].blank? }
22
+ row
23
+ end
24
+ end
25
+ end
26
+ end
@@ -2,14 +2,29 @@ module ETL #:nodoc:
2
2
  module Processor #:nodoc:
3
3
  # A row level processor that provides surrogate keys
4
4
  class SurrogateKeyProcessor < ETL::Processor::RowProcessor
5
- attr_accessor :query
6
5
  attr_accessor :destination
6
+ attr_accessor :table
7
+ attr_accessor :column
8
+ attr_accessor :target
7
9
 
8
10
  # Initialize the surrogate key generator
11
+ #
12
+ # Configuration options
13
+ # * <tt>:table</tt>: The table whose highest <tt>:column</tt> value is
14
+ # used to initialize the surrogate key (<tt>:column</tt> defaults to 'id').
15
+ # The old <tt>:query</tt> option is no longer accepted and raises a ControlError.
16
+ # * <tt>:target</tt>: The target connection
17
+ # * <tt>:destination</tt>: The destination column name (defaults to :id)
9
18
  def initialize(control, configuration)
10
19
  super
20
+ @table = configuration[:table]
21
+ @column = configuration[:column] || 'id'
22
+ @target = configuration[:target]
11
23
  if configuration[:query]
12
- @surrogate_key = ETL::ActiveRecord::Base.connection.select_value(configuration[:query])
24
+ raise ControlError, "Query option is no longer value, use :column and :table instead"
25
+ end
26
+ if table
27
+ @surrogate_key = ETL::Engine.connection(target).select_value("SELECT max(#{column}) FROM #{table_name}")
13
28
  end
14
29
  #puts "initial surrogate key: #{@surrogate_key}"
15
30
  @surrogate_key = 0 if @surrogate_key.blank?
@@ -28,6 +43,11 @@ module ETL #:nodoc:
28
43
  row
29
44
  end
30
45
  end
46
+
47
+ private
48
+ def table_name
49
+ ETL::Engine.table(table, ETL::Engine.connection(target))
50
+ end
31
51
  end
32
52
  end
33
53
  end
@@ -6,29 +6,29 @@ module ETL #:nodoc:
6
6
  # Defines the table to truncate
7
7
  attr_reader :table
8
8
 
9
- # Defines the database connection information
9
+ # Defines the database connection to use
10
10
  attr_reader :target
11
+
12
+ # Initialize the processor
13
+ #
14
+ # Options:
15
+ # * <tt>:target</tt>: The target connection
16
+ # * <tt>:table</tt>: The table name
11
17
  def initialize(control, configuration)
12
18
  super
13
19
  #@file = File.join(File.dirname(control.file), configuration[:file])
14
20
  @target = configuration[:target] || {}
15
21
  @table = configuration[:table]
16
- connect
17
22
  end
23
+
18
24
  def process
19
- conn = ETL::ActiveRecord::Base.connection
20
- conn.truncate(table)
25
+ conn = ETL::Engine.connection(target)
26
+ conn.truncate(table_name)
21
27
  end
22
28
 
23
- # Connect to the database
24
- def connect
25
- ETL::ActiveRecord::Base.establish_connection(
26
- :adapter => (target[:adapter] || :mysql),
27
- :username => (target[:username] || 'root'),
28
- :host => (target[:host] || 'localhost'),
29
- :password => target[:password],
30
- :database => target[:database]
31
- )
29
+ private
30
+ def table_name
31
+ ETL::Engine.table(table, ETL::Engine.connection(target))
32
32
  end
33
33
  end
34
34
  end
@@ -0,0 +1,14 @@
1
+ # This source file contains the ETL::Screen module and requires all of the
2
+ # screens
3
+
4
+ module ETL #:nodoc:
5
+ # The ETL::Screen module contains pre-built screens useful for checking the
6
+ # ETL state during execution. Screens may be fatal, which will result in
7
+ # termination of the ETL process, errors, which will result in the
8
+ # termination of just the current ETL control file, or warnings, which will
9
+ # result in a warning message.
10
+ module Screen
11
+ end
12
+ end
13
+
14
+ Dir[File.dirname(__FILE__) + "/screen/*.rb"].each { |file| require(file) }
@@ -4,11 +4,16 @@ module ETL
4
4
  # against the results from some sort of a row count query. If there
5
5
  # is a difference then the screen will not pass
6
6
  class RowCountScreen
7
+ attr_accessor :control, :configuration
7
8
  def initialize(control, configuration={})
8
-
9
+ @control = control
10
+ @configuration = configuration
11
+ execute
9
12
  end
10
13
  def execute
11
-
14
+ unless Engine.rows_written == configuration[:rows]
15
+ raise "Rows written (#{Engine.rows_written}) does not match expected rows (#{configuration[:rows]})"
16
+ end
12
17
  end
13
18
  end
14
19
  end
@@ -46,8 +46,9 @@ class ActiveRecordResolver
46
46
  # The find method to use (as a symbol)
47
47
  attr_accessor :find_method
48
48
 
49
- # Initialize the resolver. The ar_class argument should extend from ActiveRecord::Base. The find_method argument
50
- # must be a symbol for the finder method used. For example:
49
+ # Initialize the resolver. The ar_class argument should extend from
50
+ # ActiveRecord::Base. The find_method argument must be a symbol for the
51
+ # finder method used. For example:
51
52
  #
52
53
  # ActiveRecordResolver.new(Person, :find_by_name)
53
54
  #
@@ -56,6 +57,7 @@ class ActiveRecordResolver
56
57
  @ar_class = ar_class
57
58
  @find_method = find_method
58
59
  end
60
+
59
61
  # Resolve the value
60
62
  def resolve(value)
61
63
  rec = ar_class.__send__(find_method, value)
@@ -64,13 +66,21 @@ class ActiveRecordResolver
64
66
  end
65
67
 
66
68
  class SQLResolver
69
+ # Initialize the SQL resolver. Use the given table and field name to search
70
+ # for the appropriate foreign key. The field should be the name of a natural
71
+ # key that is used to locate the surrogate key for the record.
72
+ #
73
+ # The connection argument is optional. If specified it can be either a symbol
74
+ # referencing a connection defined in the ETL database.yml file or an actual
75
+ # ActiveRecord connection instance. If the connection is not specified then
76
+ # the ActiveRecord::Base.connection will be used.
67
77
  def initialize(table, field, connection=nil)
68
78
  @table = table
69
79
  @field = field
70
- @connection = connection
80
+ @connection = (connection.respond_to?(:quote) ? connection : ETL::Engine.connection(connection)) if connection
81
+ @connection ||= ActiveRecord::Base.connection
71
82
  end
72
83
  def resolve(value)
73
- conn = @connection ||= ActiveRecord::Base.connection
74
- conn.select_value("SELECT id FROM #{table} WHERE #{field} = #{conn.quote(value)}")
84
+ @connection.select_value("SELECT id FROM #{@table} WHERE #{@field} = #{@connection.quote(value)}")
75
85
  end
76
86
  end