activewarehouse-etl 0.8.4 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. data/CHANGELOG +98 -62
  2. data/Rakefile +11 -0
  3. data/TODO +2 -1
  4. data/lib/etl.rb +9 -0
  5. data/lib/etl/batch.rb +2 -0
  6. data/lib/etl/batch/batch.rb +111 -0
  7. data/lib/etl/batch/directives.rb +55 -0
  8. data/lib/etl/builder.rb +1 -0
  9. data/lib/etl/builder/date_dimension_builder.rb +83 -0
  10. data/lib/etl/commands/etl.rb +56 -43
  11. data/lib/etl/control/control.rb +58 -9
  12. data/lib/etl/control/destination.rb +29 -4
  13. data/lib/etl/control/destination/database_destination.rb +17 -27
  14. data/lib/etl/control/source/database_source.rb +17 -40
  15. data/lib/etl/control/source/file_source.rb +8 -5
  16. data/lib/etl/control/source/model_source.rb +39 -0
  17. data/lib/etl/core_ext.rb +1 -0
  18. data/lib/etl/core_ext/time.rb +5 -0
  19. data/lib/etl/core_ext/time/calculations.rb +40 -0
  20. data/lib/etl/engine.rb +184 -83
  21. data/lib/etl/execution.rb +1 -0
  22. data/lib/etl/execution/base.rb +1 -1
  23. data/lib/etl/execution/batch.rb +8 -0
  24. data/lib/etl/execution/job.rb +1 -0
  25. data/lib/etl/execution/migration.rb +16 -4
  26. data/lib/etl/generator/surrogate_key_generator.rb +20 -4
  27. data/lib/etl/http_tools.rb +1 -1
  28. data/lib/etl/processor/bulk_import_processor.rb +16 -19
  29. data/lib/etl/processor/check_exist_processor.rb +16 -7
  30. data/lib/etl/processor/hierarchy_exploder_processor.rb +2 -1
  31. data/lib/etl/processor/require_non_blank_processor.rb +26 -0
  32. data/lib/etl/processor/surrogate_key_processor.rb +22 -2
  33. data/lib/etl/processor/truncate_processor.rb +13 -13
  34. data/lib/etl/screen.rb +14 -0
  35. data/lib/etl/screen/row_count_screen.rb +7 -2
  36. data/lib/etl/transform/foreign_key_lookup_transform.rb +15 -5
  37. data/lib/etl/transform/hierarchy_lookup_transform.rb +7 -14
  38. data/lib/etl/util.rb +59 -0
  39. data/lib/etl/version.rb +2 -2
  40. metadata +19 -2
@@ -14,6 +14,7 @@ module ETL #:nodoc
14
14
  end
15
15
 
16
16
  require 'etl/execution/base'
17
+ require 'etl/execution/batch'
17
18
  require 'etl/execution/job'
18
19
  require 'etl/execution/record'
19
20
  require 'etl/execution/migration'
@@ -1,7 +1,7 @@
1
1
  module ETL #:nodoc:
2
2
  module Execution #:nodoc:
3
3
  # Base class for ETL execution information
4
- class Base < ETL::ActiveRecord::Base
4
+ class Base < ActiveRecord::Base
5
5
  self.abstract_class = true
6
6
  establish_connection :etl_execution
7
7
  end
@@ -0,0 +1,8 @@
1
+ module ETL #:nodoc:
2
+ module Execution #:nodoc:
3
+ # Persistent class representing an ETL batch
4
+ class Batch < Base
5
+ has_many :jobs
6
+ end
7
+ end
8
+ end
@@ -2,6 +2,7 @@ module ETL #:nodoc:
2
2
  module Execution #:nodoc:
3
3
  # Persistent class representing an ETL job
4
4
  class Job < Base
5
+ belongs_to :batch
5
6
  end
6
7
  end
7
8
  end
@@ -8,7 +8,10 @@ module ETL #:nodoc:
8
8
  def migrate
9
9
  connection.initialize_schema_information
10
10
  v = connection.select_value("SELECT version FROM #{schema_info_table_name}").to_i
11
- v.upto(target - 1) { |i| __send__("migration_#{i+1}".to_sym) }
11
+ v.upto(target - 1) do |i|
12
+ __send__("migration_#{i+1}".to_sym)
13
+ update_schema_info(i+1)
14
+ end
12
15
  end
13
16
  protected
14
17
  # Get the schema info table name
@@ -24,7 +27,7 @@ module ETL #:nodoc:
24
27
 
25
28
  # Get the final target version number
26
29
  def target
27
- 2
30
+ 3
28
31
  end
29
32
 
30
33
  private
@@ -41,14 +44,23 @@ module ETL #:nodoc:
41
44
  t.column :crc, :string, :null => false
42
45
  t.column :job_id, :integer, :null => false
43
46
  end
44
- update_schema_info(1)
45
47
  end
46
48
 
47
49
  def migration_2 #:nodoc:
48
50
  connection.add_index :records, :control_file
49
51
  connection.add_index :records, :natural_key
50
52
  connection.add_index :records, :job_id
51
- update_schema_info(2)
53
+ end
54
+
55
+ def migration_3 #:nodoc:
56
+ connection.create_table :batches do |t|
57
+ t.column :batch_file, :string, :null => false
58
+ t.column :created_at, :datetime, :null => false
59
+ t.column :completed_at, :datetime
60
+ t.column :status, :string
61
+ end
62
+ connection.add_column :jobs, :batch_id, :integer
63
+ connection.add_index :jobs, :batch_id
52
64
  end
53
65
 
54
66
  # Update the schema info table, setting the version value
@@ -4,13 +4,25 @@ module ETL #:nodoc:
4
4
  module Generator #:nodoc:
5
5
  # Surrogate key generator.
6
6
  class SurrogateKeyGenerator < Generator
7
+ attr_reader :table
8
+ attr_reader :target
9
+ attr_reader :column
10
+ attr_reader :query
11
+
7
12
  # Initialize the generator
8
13
  def initialize(options={})
9
- if options[:query]
10
- @surrogate_key = ETL::ActiveRecord::Base.connection.select_value(options[:query])
11
- @surrogate_key = 0 if @surrogate_key.blank?
12
- @surrogate_key = @surrogate_key.to_i
14
+ @table = options[:table]
15
+ @target = options[:target]
16
+ @column = options[:column] || 'id'
17
+ @query = options[:query]
18
+
19
+ if table
20
+ @surrogate_key = ETL::Engine.connection(target).select_value("SELECT max(#{column}) FROM #{table_name}")
21
+ elsif query
22
+ @surrogate_key = ETL::Engine.connection(target).select_value(query)
13
23
  end
24
+ @surrogate_key = 0 if @surrogate_key.blank?
25
+ @surrogate_key = @surrogate_key.to_i
14
26
  end
15
27
 
16
28
  # Get the next surrogate key
@@ -18,6 +30,10 @@ module ETL #:nodoc:
18
30
  @surrogate_key ||= 0
19
31
  @surrogate_key += 1
20
32
  end
33
+
34
+ def table_name
35
+ ETL::Engine.table(table, ETL::Engine.connection(target))
36
+ end
21
37
  end
22
38
  end
23
39
  end
@@ -8,7 +8,7 @@ module HttpTools
8
8
  def parse_user_agent(user_agent)
9
9
  if '-' == user_agent
10
10
  #raise 'Invalid User Agent'
11
- puts 'Invalid User Agent'
11
+ #puts 'Invalid User Agent'
12
12
  end
13
13
 
14
14
  browser, browser_version_major, browser_version_minor, ostype, os, os_version = nil
@@ -4,10 +4,13 @@ module ETL #:nodoc:
4
4
  # underlying database driver from ActiveRecord must support the
5
5
  # +bulk_load+ method.
6
6
  class BulkImportProcessor < ETL::Processor::Processor
7
+
7
8
  # The file to load from
8
9
  attr_reader :file
9
- # The target database information (see +initialize+)
10
+ # The target database
10
11
  attr_reader :target
12
+ # The table name
13
+ attr_reader :table
11
14
  # Set to true to truncate
12
15
  attr_reader :truncate
13
16
  # Array of symbols representing the column load order
@@ -23,7 +26,8 @@ module ETL #:nodoc:
23
26
  #
24
27
  # Configuration options:
25
28
  # * <tt>:file</tt>: The file to load data from
26
- # * <tt>:target</tt>: The target connection information
29
+ # * <tt>:target</tt>: The target database
30
+ # * <tt>:table</tt>: The table name
27
31
  # * <tt>:truncate</tt>: Set to true to truncate before loading
28
32
  # * <tt>:columns</tt>: The columns to load in the order they appear in
29
33
  # the bulk data file
@@ -34,23 +38,25 @@ module ETL #:nodoc:
34
38
  super
35
39
  @file = File.join(File.dirname(control.file), configuration[:file])
36
40
  @target = configuration[:target]
41
+ @table = configuration[:table]
37
42
  @truncate = configuration[:truncate] ||= false
38
43
  @columns = configuration[:columns]
39
44
  @field_separator = (configuration[:field_separator] || ',')
40
45
  @line_separator = (configuration[:line_separator] || "\n")
41
46
  @field_enclosure = configuration[:field_enclosure]
42
- connect
47
+
48
+ raise ControlError, "Target must be specified" unless @target
49
+ raise ControlError, "Table must be specified" unless @table
43
50
  end
44
51
 
45
52
  # Execute the processor
46
53
  def process
47
54
  return if ETL::Engine.skip_bulk_import
55
+ return if File.size(file) == 0
48
56
 
49
- conn = ETL::ActiveRecord::Base.connection
57
+ conn = ETL::Engine.connection(target)
50
58
  conn.transaction do
51
- # TODO: Support all database types
52
- # Since LOCAL is used this must be allowed by both the client and server
53
- conn.truncate(target[:table]) if truncate
59
+ conn.truncate(table_name) if truncate
54
60
  options = {}
55
61
  options[:columns] = columns
56
62
  if field_separator || field_enclosure
@@ -59,21 +65,12 @@ module ETL #:nodoc:
59
65
  options[:fields][:enclosed_by] = field_enclosure if field_enclosure
60
66
  options[:fields][:terminated_by] = line_separator if line_separator
61
67
  end
62
- conn.bulk_load(file, target[:table], options)
68
+ conn.bulk_load(file, table_name, options)
63
69
  end
64
70
  end
65
71
 
66
- private
67
- # Connect to the database
68
- def connect
69
- Engine.logger.debug "Connecting to database #{target[:database]}"
70
- ETL::ActiveRecord::Base.establish_connection(
71
- :adapter => (target[:adapter] || :mysql),
72
- :username => (target[:username] || 'root'),
73
- :host => (target[:host] || 'localhost'),
74
- :password => target[:password],
75
- :database => target[:database]
76
- )
72
+ def table_name
73
+ ETL::Engine.table(table, ETL::Engine.connection(target))
77
74
  end
78
75
  end
79
76
  end
@@ -6,6 +6,9 @@ module ETL #:nodoc:
6
6
  # A symbol or array of symbols representing keys that should be skipped
7
7
  attr_accessor :skip
8
8
 
9
+ # The target database
10
+ attr_accessor :target
11
+
9
12
  # The name of the table to check against
10
13
  attr_accessor :table
11
14
 
@@ -26,11 +29,12 @@ module ETL #:nodoc:
26
29
  def initialize(control, configuration)
27
30
  super
28
31
  @skip = configuration[:skip] || []
29
- @table = configuration[:table]
32
+ @target = configuration[:target] || raise(ETL::ControlError, "target must be specified")
33
+ @table = configuration[:table] || raise(ETL::ControlError, "table must be specified")
30
34
  @columns = configuration[:columns]
31
35
 
32
- q = "SELECT COUNT(*) FROM #{table}"
33
- @should_check = ETL::ActiveRecord::Base.connection.select_value(q).to_i > 0
36
+ q = "SELECT COUNT(*) FROM #{table_name}"
37
+ @should_check = ETL::Engine.connection(target).select_value(q).to_i > 0
34
38
  end
35
39
 
36
40
  # Return true if the given key should be skipped
@@ -51,20 +55,25 @@ module ETL #:nodoc:
51
55
  # Process the row
52
56
  def process(row)
53
57
  return row unless should_check?
54
- connection = ETL::ActiveRecord::Base.connection
55
- q = "SELECT * FROM #{table} WHERE "
58
+ conn = ETL::Engine.connection(target)
59
+ q = "SELECT * FROM #{table_name} WHERE "
56
60
  conditions = []
57
61
  row.each do |k,v|
58
62
  if columns.nil? || columns.include?(k.to_sym)
59
- conditions << "#{k} = #{connection.quote(v)}" unless skip?(k.to_sym)
63
+ conditions << "#{k} = #{conn.quote(v)}" unless skip?(k.to_sym)
60
64
  end
61
65
  end
62
66
  q << conditions.join(" AND ")
63
67
 
64
68
  #puts "query: #{q}"
65
- result = connection.select_one(q)
69
+ result = conn.select_one(q)
66
70
  return row if result.nil?
67
71
  end
72
+
73
+ private
74
+ def table_name
75
+ ETL::Engine.table(table, ETL::Engine.connection(target))
76
+ end
68
77
  end
69
78
  end
70
79
  end
@@ -24,8 +24,9 @@ module ETL #:nodoc:
24
24
  # Process the row expanding it into hierarchy values
25
25
  def process(row)
26
26
  rows = []
27
- conn = configuration[:connection]
27
+ target = configuration[:target]
28
28
  table = configuration[:table]
29
+ conn = ETL::Engine.connection(target)
29
30
  build_rows([row[:id]], row[:id], row[:id], row[:parent_id].nil?, 0, rows, table, conn)
30
31
  rows
31
32
  end
@@ -0,0 +1,26 @@
1
+ module ETL #:nodoc:
2
+ module Processor #:nodoc:
3
+ # A processor which requires that the particular fields are non-blank in
4
+ # order for the row to be retained.
5
+ class RequireNonBlankProcessor < ETL::Processor::RowProcessor
6
+ # An array of fields to check
7
+ attr_reader :fields
8
+
9
+ # Initialize the processor
10
+ #
11
+ # Options:
12
+ # * <tt>:fields</tt>: An array of fields to check, for example:
13
+ # [:first_name,:last_name]
14
+ def initialize(control, configuration)
15
+ super
16
+ @fields = configuration[:fields] || []
17
+ end
18
+
19
+ # Process the row.
20
+ def process(row)
21
+ fields.each { |field| return if row[field].blank? }
22
+ row
23
+ end
24
+ end
25
+ end
26
+ end
@@ -2,14 +2,29 @@ module ETL #:nodoc:
2
2
  module Processor #:nodoc:
3
3
  # A row level processor that provides surrogate keys
4
4
  class SurrogateKeyProcessor < ETL::Processor::RowProcessor
5
- attr_accessor :query
6
5
  attr_accessor :destination
6
+ attr_accessor :table
7
+ attr_accessor :column
8
+ attr_accessor :target
7
9
 
8
10
  # Initialize the surrogate key generator
11
+ #
12
+ # Configuration options
13
+ # * <tt>:query</tt>: No longer supported; specifying it raises a
14
+ # ControlError. Use :table and :column (default 'id') with :target
15
+ # to locate the last surrogate key instead.
16
+ # * <tt>:target</tt>: The target connection
17
+ # * <tt>:destination</tt>: The destination column name (defaults to :id)
9
18
  def initialize(control, configuration)
10
19
  super
20
+ @table = configuration[:table]
21
+ @column = configuration[:column] || 'id'
22
+ @target = configuration[:target]
11
23
  if configuration[:query]
12
- @surrogate_key = ETL::ActiveRecord::Base.connection.select_value(configuration[:query])
24
+ raise ControlError, "Query option is no longer value, use :column and :table instead"
25
+ end
26
+ if table
27
+ @surrogate_key = ETL::Engine.connection(target).select_value("SELECT max(#{column}) FROM #{table_name}")
13
28
  end
14
29
  #puts "initial surrogate key: #{@surrogate_key}"
15
30
  @surrogate_key = 0 if @surrogate_key.blank?
@@ -28,6 +43,11 @@ module ETL #:nodoc:
28
43
  row
29
44
  end
30
45
  end
46
+
47
+ private
48
+ def table_name
49
+ ETL::Engine.table(table, ETL::Engine.connection(target))
50
+ end
31
51
  end
32
52
  end
33
53
  end
@@ -6,29 +6,29 @@ module ETL #:nodoc:
6
6
  # Defines the table to truncate
7
7
  attr_reader :table
8
8
 
9
- # Defines the database connection information
9
+ # Defines the database connection to use
10
10
  attr_reader :target
11
+
12
+ # Initialize the processor
13
+ #
14
+ # Options:
15
+ # * <tt>:target</tt>: The target connection
16
+ # * <tt>:table</tt>: The table name
11
17
  def initialize(control, configuration)
12
18
  super
13
19
  #@file = File.join(File.dirname(control.file), configuration[:file])
14
20
  @target = configuration[:target] || {}
15
21
  @table = configuration[:table]
16
- connect
17
22
  end
23
+
18
24
  def process
19
- conn = ETL::ActiveRecord::Base.connection
20
- conn.truncate(table)
25
+ conn = ETL::Engine.connection(target)
26
+ conn.truncate(table_name)
21
27
  end
22
28
 
23
- # Connect to the database
24
- def connect
25
- ETL::ActiveRecord::Base.establish_connection(
26
- :adapter => (target[:adapter] || :mysql),
27
- :username => (target[:username] || 'root'),
28
- :host => (target[:host] || 'localhost'),
29
- :password => target[:password],
30
- :database => target[:database]
31
- )
29
+ private
30
+ def table_name
31
+ ETL::Engine.table(table, ETL::Engine.connection(target))
32
32
  end
33
33
  end
34
34
  end
@@ -0,0 +1,14 @@
1
+ # This source file contains the ETL::Screen module and requires all of the
2
+ # screens
3
+
4
+ module ETL #:nodoc:
5
+ # The ETL::Screen module contains pre-built screens useful for checking the
6
+ # ETL state during execution. Screens may be fatal, which will result in
7
+ # termination of the ETL process, errors, which will result in the
8
+ # termination of just the current ETL control file, or warnings, which will
9
+ # result in a warning message.
10
+ module Screen
11
+ end
12
+ end
13
+
14
+ Dir[File.dirname(__FILE__) + "/screen/*.rb"].each { |file| require(file) }
@@ -4,11 +4,16 @@ module ETL
4
4
  # against the results from some sort of a row count query. If there
5
5
  # is a difference then the screen will not pass
6
6
  class RowCountScreen
7
+ attr_accessor :control, :configuration
7
8
  def initialize(control, configuration={})
8
-
9
+ @control = control
10
+ @configuration = configuration
11
+ execute
9
12
  end
10
13
  def execute
11
-
14
+ unless Engine.rows_written == configuration[:rows]
15
+ raise "Rows written (#{Engine.rows_written}) does not match expected rows (#{configuration[:rows]})"
16
+ end
12
17
  end
13
18
  end
14
19
  end
@@ -46,8 +46,9 @@ class ActiveRecordResolver
46
46
  # The find method to use (as a symbol)
47
47
  attr_accessor :find_method
48
48
 
49
- # Initialize the resolver. The ar_class argument should extend from ActiveRecord::Base. The find_method argument
50
- # must be a symbol for the finder method used. For example:
49
+ # Initialize the resolver. The ar_class argument should extend from
50
+ # ActiveRecord::Base. The find_method argument must be a symbol for the
51
+ # finder method used. For example:
51
52
  #
52
53
  # ActiveRecordResolver.new(Person, :find_by_name)
53
54
  #
@@ -56,6 +57,7 @@ class ActiveRecordResolver
56
57
  @ar_class = ar_class
57
58
  @find_method = find_method
58
59
  end
60
+
59
61
  # Resolve the value
60
62
  def resolve(value)
61
63
  rec = ar_class.__send__(find_method, value)
@@ -64,13 +66,21 @@ class ActiveRecordResolver
64
66
  end
65
67
 
66
68
  class SQLResolver
69
+ # Initialize the SQL resolver. Use the given table and field name to search
70
+ # for the appropriate foreign key. The field should be the name of a natural
71
+ # key that is used to locate the surrogate key for the record.
72
+ #
73
+ # The connection argument is optional. If specified it can be either a symbol
74
+ # referencing a connection defined in the ETL database.yml file or an actual
75
+ # ActiveRecord connection instance. If the connection is not specified then
76
+ # the ActiveRecord::Base.connection will be used.
67
77
  def initialize(table, field, connection=nil)
68
78
  @table = table
69
79
  @field = field
70
- @connection = connection
80
+ @connection = (connection.respond_to?(:quote) ? connection : ETL::Engine.connection(connection)) if connection
81
+ @connection ||= ActiveRecord::Base.connection
71
82
  end
72
83
  def resolve(value)
73
- conn = @connection ||= ActiveRecord::Base.connection
74
- conn.select_value("SELECT id FROM #{table} WHERE #{field} = #{conn.quote(value)}")
84
+ @connection.select_value("SELECT id FROM #{@table} WHERE #{@field} = #{@connection.quote(value)}")
75
85
  end
76
86
  end