activewarehouse-etl 0.8.4 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +98 -62
- data/Rakefile +11 -0
- data/TODO +2 -1
- data/lib/etl.rb +9 -0
- data/lib/etl/batch.rb +2 -0
- data/lib/etl/batch/batch.rb +111 -0
- data/lib/etl/batch/directives.rb +55 -0
- data/lib/etl/builder.rb +1 -0
- data/lib/etl/builder/date_dimension_builder.rb +83 -0
- data/lib/etl/commands/etl.rb +56 -43
- data/lib/etl/control/control.rb +58 -9
- data/lib/etl/control/destination.rb +29 -4
- data/lib/etl/control/destination/database_destination.rb +17 -27
- data/lib/etl/control/source/database_source.rb +17 -40
- data/lib/etl/control/source/file_source.rb +8 -5
- data/lib/etl/control/source/model_source.rb +39 -0
- data/lib/etl/core_ext.rb +1 -0
- data/lib/etl/core_ext/time.rb +5 -0
- data/lib/etl/core_ext/time/calculations.rb +40 -0
- data/lib/etl/engine.rb +184 -83
- data/lib/etl/execution.rb +1 -0
- data/lib/etl/execution/base.rb +1 -1
- data/lib/etl/execution/batch.rb +8 -0
- data/lib/etl/execution/job.rb +1 -0
- data/lib/etl/execution/migration.rb +16 -4
- data/lib/etl/generator/surrogate_key_generator.rb +20 -4
- data/lib/etl/http_tools.rb +1 -1
- data/lib/etl/processor/bulk_import_processor.rb +16 -19
- data/lib/etl/processor/check_exist_processor.rb +16 -7
- data/lib/etl/processor/hierarchy_exploder_processor.rb +2 -1
- data/lib/etl/processor/require_non_blank_processor.rb +26 -0
- data/lib/etl/processor/surrogate_key_processor.rb +22 -2
- data/lib/etl/processor/truncate_processor.rb +13 -13
- data/lib/etl/screen.rb +14 -0
- data/lib/etl/screen/row_count_screen.rb +7 -2
- data/lib/etl/transform/foreign_key_lookup_transform.rb +15 -5
- data/lib/etl/transform/hierarchy_lookup_transform.rb +7 -14
- data/lib/etl/util.rb +59 -0
- data/lib/etl/version.rb +2 -2
- metadata +19 -2
data/lib/etl/execution.rb
CHANGED
data/lib/etl/execution/base.rb
CHANGED
data/lib/etl/execution/job.rb
CHANGED
@@ -8,7 +8,10 @@ module ETL #:nodoc:
|
|
8
8
|
def migrate
|
9
9
|
connection.initialize_schema_information
|
10
10
|
v = connection.select_value("SELECT version FROM #{schema_info_table_name}").to_i
|
11
|
-
v.upto(target - 1)
|
11
|
+
v.upto(target - 1) do |i|
|
12
|
+
__send__("migration_#{i+1}".to_sym)
|
13
|
+
update_schema_info(i+1)
|
14
|
+
end
|
12
15
|
end
|
13
16
|
protected
|
14
17
|
# Get the schema info table name
|
@@ -24,7 +27,7 @@ module ETL #:nodoc:
|
|
24
27
|
|
25
28
|
# Get the final target version number
|
26
29
|
def target
|
27
|
-
|
30
|
+
3
|
28
31
|
end
|
29
32
|
|
30
33
|
private
|
@@ -41,14 +44,23 @@ module ETL #:nodoc:
|
|
41
44
|
t.column :crc, :string, :null => false
|
42
45
|
t.column :job_id, :integer, :null => false
|
43
46
|
end
|
44
|
-
update_schema_info(1)
|
45
47
|
end
|
46
48
|
|
47
49
|
def migration_2 #:nodoc:
|
48
50
|
connection.add_index :records, :control_file
|
49
51
|
connection.add_index :records, :natural_key
|
50
52
|
connection.add_index :records, :job_id
|
51
|
-
|
53
|
+
end
|
54
|
+
|
55
|
+
def migration_3 #:nodoc:
|
56
|
+
connection.create_table :batches do |t|
|
57
|
+
t.column :batch_file, :string, :null => false
|
58
|
+
t.column :created_at, :datetime, :null => false
|
59
|
+
t.column :completed_at, :datetime
|
60
|
+
t.column :status, :string
|
61
|
+
end
|
62
|
+
connection.add_column :jobs, :batch_id, :integer
|
63
|
+
connection.add_index :jobs, :batch_id
|
52
64
|
end
|
53
65
|
|
54
66
|
# Update the schema info table, setting the version value
|
@@ -4,13 +4,25 @@ module ETL #:nodoc:
|
|
4
4
|
module Generator #:nodoc:
|
5
5
|
# Surrogate key generator.
|
6
6
|
class SurrogateKeyGenerator < Generator
|
7
|
+
attr_reader :table
|
8
|
+
attr_reader :target
|
9
|
+
attr_reader :column
|
10
|
+
attr_reader :query
|
11
|
+
|
7
12
|
# Initialize the generator
|
8
13
|
def initialize(options={})
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
14
|
+
@table = options[:table]
|
15
|
+
@target = options[:target]
|
16
|
+
@column = options[:column] || 'id'
|
17
|
+
@query = options[:query]
|
18
|
+
|
19
|
+
if table
|
20
|
+
@surrogate_key = ETL::Engine.connection(target).select_value("SELECT max(#{column}) FROM #{table_name}")
|
21
|
+
elsif query
|
22
|
+
@surrogate_key = ETL::Engine.connection(target).select_value(query)
|
13
23
|
end
|
24
|
+
@surrogate_key = 0 if @surrogate_key.blank?
|
25
|
+
@surrogate_key = @surrogate_key.to_i
|
14
26
|
end
|
15
27
|
|
16
28
|
# Get the next surrogate key
|
@@ -18,6 +30,10 @@ module ETL #:nodoc:
|
|
18
30
|
@surrogate_key ||= 0
|
19
31
|
@surrogate_key += 1
|
20
32
|
end
|
33
|
+
|
34
|
+
def table_name
|
35
|
+
ETL::Engine.table(table, ETL::Engine.connection(target))
|
36
|
+
end
|
21
37
|
end
|
22
38
|
end
|
23
39
|
end
|
data/lib/etl/http_tools.rb
CHANGED
@@ -4,10 +4,13 @@ module ETL #:nodoc:
|
|
4
4
|
# underlying database driver from ActiveRecord must support the
|
5
5
|
# +bulk_load+ method.
|
6
6
|
class BulkImportProcessor < ETL::Processor::Processor
|
7
|
+
|
7
8
|
# The file to load from
|
8
9
|
attr_reader :file
|
9
|
-
# The target database
|
10
|
+
# The target database
|
10
11
|
attr_reader :target
|
12
|
+
# The table name
|
13
|
+
attr_reader :table
|
11
14
|
# Set to true to truncate
|
12
15
|
attr_reader :truncate
|
13
16
|
# Array of symbols representing the column load order
|
@@ -23,7 +26,8 @@ module ETL #:nodoc:
|
|
23
26
|
#
|
24
27
|
# Configuration options:
|
25
28
|
# * <tt>:file</tt>: The file to load data from
|
26
|
-
# * <tt>:target</tt>: The target
|
29
|
+
# * <tt>:target</tt>: The target database
|
30
|
+
# * <tt>:table</tt>: The table name
|
27
31
|
# * <tt>:truncate</tt>: Set to true to truncate before loading
|
28
32
|
# * <tt>:columns</tt>: The columns to load in the order they appear in
|
29
33
|
# the bulk data file
|
@@ -34,23 +38,25 @@ module ETL #:nodoc:
|
|
34
38
|
super
|
35
39
|
@file = File.join(File.dirname(control.file), configuration[:file])
|
36
40
|
@target = configuration[:target]
|
41
|
+
@table = configuration[:table]
|
37
42
|
@truncate = configuration[:truncate] ||= false
|
38
43
|
@columns = configuration[:columns]
|
39
44
|
@field_separator = (configuration[:field_separator] || ',')
|
40
45
|
@line_separator = (configuration[:line_separator] || "\n")
|
41
46
|
@field_enclosure = configuration[:field_enclosure]
|
42
|
-
|
47
|
+
|
48
|
+
raise ControlError, "Target must be specified" unless @target
|
49
|
+
raise ControlError, "Table must be specified" unless @table
|
43
50
|
end
|
44
51
|
|
45
52
|
# Execute the processor
|
46
53
|
def process
|
47
54
|
return if ETL::Engine.skip_bulk_import
|
55
|
+
return if File.size(file) == 0
|
48
56
|
|
49
|
-
conn = ETL::
|
57
|
+
conn = ETL::Engine.connection(target)
|
50
58
|
conn.transaction do
|
51
|
-
|
52
|
-
# Since LOCAL is used this must be allowed by both the client and server
|
53
|
-
conn.truncate(target[:table]) if truncate
|
59
|
+
conn.truncate(table_name) if truncate
|
54
60
|
options = {}
|
55
61
|
options[:columns] = columns
|
56
62
|
if field_separator || field_enclosure
|
@@ -59,21 +65,12 @@ module ETL #:nodoc:
|
|
59
65
|
options[:fields][:enclosed_by] = field_enclosure if field_enclosure
|
60
66
|
options[:fields][:terminated_by] = line_separator if line_separator
|
61
67
|
end
|
62
|
-
conn.bulk_load(file,
|
68
|
+
conn.bulk_load(file, table_name, options)
|
63
69
|
end
|
64
70
|
end
|
65
71
|
|
66
|
-
|
67
|
-
|
68
|
-
def connect
|
69
|
-
Engine.logger.debug "Connecting to database #{target[:database]}"
|
70
|
-
ETL::ActiveRecord::Base.establish_connection(
|
71
|
-
:adapter => (target[:adapter] || :mysql),
|
72
|
-
:username => (target[:username] || 'root'),
|
73
|
-
:host => (target[:host] || 'localhost'),
|
74
|
-
:password => target[:password],
|
75
|
-
:database => target[:database]
|
76
|
-
)
|
72
|
+
def table_name
|
73
|
+
ETL::Engine.table(table, ETL::Engine.connection(target))
|
77
74
|
end
|
78
75
|
end
|
79
76
|
end
|
@@ -6,6 +6,9 @@ module ETL #:nodoc:
|
|
6
6
|
# A symbol or array of symbols representing keys that should be skipped
|
7
7
|
attr_accessor :skip
|
8
8
|
|
9
|
+
# The target database
|
10
|
+
attr_accessor :target
|
11
|
+
|
9
12
|
# The name of the table to check against
|
10
13
|
attr_accessor :table
|
11
14
|
|
@@ -26,11 +29,12 @@ module ETL #:nodoc:
|
|
26
29
|
def initialize(control, configuration)
|
27
30
|
super
|
28
31
|
@skip = configuration[:skip] || []
|
29
|
-
@
|
32
|
+
@target = configuration[:target] || raise(ETL::ControlError, "target must be specified")
|
33
|
+
@table = configuration[:table] || raise(ETL::ControlError, "table must be specified")
|
30
34
|
@columns = configuration[:columns]
|
31
35
|
|
32
|
-
q = "SELECT COUNT(*) FROM #{
|
33
|
-
@should_check = ETL::
|
36
|
+
q = "SELECT COUNT(*) FROM #{table_name}"
|
37
|
+
@should_check = ETL::Engine.connection(target).select_value(q).to_i > 0
|
34
38
|
end
|
35
39
|
|
36
40
|
# Return true if the given key should be skipped
|
@@ -51,20 +55,25 @@ module ETL #:nodoc:
|
|
51
55
|
# Process the row
|
52
56
|
def process(row)
|
53
57
|
return row unless should_check?
|
54
|
-
|
55
|
-
q = "SELECT * FROM #{
|
58
|
+
conn = ETL::Engine.connection(target)
|
59
|
+
q = "SELECT * FROM #{table_name} WHERE "
|
56
60
|
conditions = []
|
57
61
|
row.each do |k,v|
|
58
62
|
if columns.nil? || columns.include?(k.to_sym)
|
59
|
-
conditions << "#{k} = #{
|
63
|
+
conditions << "#{k} = #{conn.quote(v)}" unless skip?(k.to_sym)
|
60
64
|
end
|
61
65
|
end
|
62
66
|
q << conditions.join(" AND ")
|
63
67
|
|
64
68
|
#puts "query: #{q}"
|
65
|
-
result =
|
69
|
+
result = conn.select_one(q)
|
66
70
|
return row if result.nil?
|
67
71
|
end
|
72
|
+
|
73
|
+
private
|
74
|
+
def table_name
|
75
|
+
ETL::Engine.table(table, ETL::Engine.connection(target))
|
76
|
+
end
|
68
77
|
end
|
69
78
|
end
|
70
79
|
end
|
@@ -24,8 +24,9 @@ module ETL #:nodoc:
|
|
24
24
|
# Process the row expanding it into hierarchy values
|
25
25
|
def process(row)
|
26
26
|
rows = []
|
27
|
-
|
27
|
+
target = configuration[:target]
|
28
28
|
table = configuration[:table]
|
29
|
+
conn = ETL::Engine.connection(target)
|
29
30
|
build_rows([row[:id]], row[:id], row[:id], row[:parent_id].nil?, 0, rows, table, conn)
|
30
31
|
rows
|
31
32
|
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
module ETL #:nodoc:
|
2
|
+
module Processor #:nodoc:
|
3
|
+
# A processor which requires that the particular fields are non-blank in
|
4
|
+
# order for the row to be retained.
|
5
|
+
class RequireNonBlankProcessor < ETL::Processor::RowProcessor
|
6
|
+
# An array of fields to check
|
7
|
+
attr_reader :fields
|
8
|
+
|
9
|
+
# Initialize the processor
|
10
|
+
#
|
11
|
+
# Options:
|
12
|
+
# * <tt>:fields</tt>: An array of fields to check, for example:
|
13
|
+
# [:first_name,:last_name]
|
14
|
+
def initialize(control, configuration)
|
15
|
+
super
|
16
|
+
@fields = configuration[:fields] || []
|
17
|
+
end
|
18
|
+
|
19
|
+
# Process the row.
|
20
|
+
def process(row)
|
21
|
+
fields.each { |field| return if row[field].blank? }
|
22
|
+
row
|
23
|
+
end
|
24
|
+
end
|
25
|
+
end
|
26
|
+
end
|
@@ -2,14 +2,29 @@ module ETL #:nodoc:
|
|
2
2
|
module Processor #:nodoc:
|
3
3
|
# A row level processor that provides surrogate keys
|
4
4
|
class SurrogateKeyProcessor < ETL::Processor::RowProcessor
|
5
|
-
attr_accessor :query
|
6
5
|
attr_accessor :destination
|
6
|
+
attr_accessor :table
|
7
|
+
attr_accessor :column
|
8
|
+
attr_accessor :target
|
7
9
|
|
8
10
|
# Initialize the surrogate key generator
|
11
|
+
#
|
12
|
+
# Configuration options
|
13
|
+
# * <tt>:query</tt>: No longer supported; supplying it raises a
|
14
|
+
# ControlError. Use :table and :column instead, together with the
|
15
|
+
# :target connection used to look up the last surrogate key.
|
16
|
+
# * <tt>:target</tt>: The target connection
|
17
|
+
# * <tt>:destination</tt>: The destination column name (defaults to :id)
|
9
18
|
def initialize(control, configuration)
|
10
19
|
super
|
20
|
+
@table = configuration[:table]
|
21
|
+
@column = configuration[:column] || 'id'
|
22
|
+
@target = configuration[:target]
|
11
23
|
if configuration[:query]
|
12
|
-
|
24
|
+
raise ControlError, "Query option is no longer value, use :column and :table instead"
|
25
|
+
end
|
26
|
+
if table
|
27
|
+
@surrogate_key = ETL::Engine.connection(target).select_value("SELECT max(#{column}) FROM #{table_name}")
|
13
28
|
end
|
14
29
|
#puts "initial surrogate key: #{@surrogate_key}"
|
15
30
|
@surrogate_key = 0 if @surrogate_key.blank?
|
@@ -28,6 +43,11 @@ module ETL #:nodoc:
|
|
28
43
|
row
|
29
44
|
end
|
30
45
|
end
|
46
|
+
|
47
|
+
private
|
48
|
+
def table_name
|
49
|
+
ETL::Engine.table(table, ETL::Engine.connection(target))
|
50
|
+
end
|
31
51
|
end
|
32
52
|
end
|
33
53
|
end
|
@@ -6,29 +6,29 @@ module ETL #:nodoc:
|
|
6
6
|
# Defines the table to truncate
|
7
7
|
attr_reader :table
|
8
8
|
|
9
|
-
# Defines the database connection
|
9
|
+
# Defines the database connection to use
|
10
10
|
attr_reader :target
|
11
|
+
|
12
|
+
# Initialize the processor
|
13
|
+
#
|
14
|
+
# Options:
|
15
|
+
# * <tt>:target</tt>: The target connection
|
16
|
+
# * <tt>:table</tt>: The table name
|
11
17
|
def initialize(control, configuration)
|
12
18
|
super
|
13
19
|
#@file = File.join(File.dirname(control.file), configuration[:file])
|
14
20
|
@target = configuration[:target] || {}
|
15
21
|
@table = configuration[:table]
|
16
|
-
connect
|
17
22
|
end
|
23
|
+
|
18
24
|
def process
|
19
|
-
conn = ETL::
|
20
|
-
conn.truncate(
|
25
|
+
conn = ETL::Engine.connection(target)
|
26
|
+
conn.truncate(table_name)
|
21
27
|
end
|
22
28
|
|
23
|
-
|
24
|
-
def
|
25
|
-
ETL::
|
26
|
-
:adapter => (target[:adapter] || :mysql),
|
27
|
-
:username => (target[:username] || 'root'),
|
28
|
-
:host => (target[:host] || 'localhost'),
|
29
|
-
:password => target[:password],
|
30
|
-
:database => target[:database]
|
31
|
-
)
|
29
|
+
private
|
30
|
+
def table_name
|
31
|
+
ETL::Engine.table(table, ETL::Engine.connection(target))
|
32
32
|
end
|
33
33
|
end
|
34
34
|
end
|
data/lib/etl/screen.rb
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
# This source file contains the ETL::Screen module and requires all of the
|
2
|
+
# screens
|
3
|
+
|
4
|
+
module ETL #:nodoc:
|
5
|
+
# The ETL::Screen module contains pre-built screens useful for checking the
|
6
|
+
# ETL state during execution. Screens may be fatal, which will result in
|
7
|
+
# termination of the ETL process, errors, which will result in the
|
8
|
+
# termination of just the current ETL control file, or warnings, which will
|
9
|
+
# result in a warning message.
|
10
|
+
module Screen
|
11
|
+
end
|
12
|
+
end
|
13
|
+
|
14
|
+
Dir[File.dirname(__FILE__) + "/screen/*.rb"].each { |file| require(file) }
|
@@ -4,11 +4,16 @@ module ETL
|
|
4
4
|
# against the results from some sort of a row count query. If there
|
5
5
|
# is a difference then the screen will not pass
|
6
6
|
class RowCountScreen
|
7
|
+
attr_accessor :control, :configuration
|
7
8
|
def initialize(control, configuration={})
|
8
|
-
|
9
|
+
@control = control
|
10
|
+
@configuration = configuration
|
11
|
+
execute
|
9
12
|
end
|
10
13
|
def execute
|
11
|
-
|
14
|
+
unless Engine.rows_written == configuration[:rows]
|
15
|
+
raise "Rows written (#{Engine.rows_written}) does not match expected rows (#{configuration[:rows]})"
|
16
|
+
end
|
12
17
|
end
|
13
18
|
end
|
14
19
|
end
|
@@ -46,8 +46,9 @@ class ActiveRecordResolver
|
|
46
46
|
# The find method to use (as a symbol)
|
47
47
|
attr_accessor :find_method
|
48
48
|
|
49
|
-
# Initialize the resolver. The ar_class argument should extend from
|
50
|
-
# must be a symbol for the
|
49
|
+
# Initialize the resolver. The ar_class argument should extend from
|
50
|
+
# ActiveRecord::Base. The find_method argument must be a symbol for the
|
51
|
+
# finder method used. For example:
|
51
52
|
#
|
52
53
|
# ActiveRecordResolver.new(Person, :find_by_name)
|
53
54
|
#
|
@@ -56,6 +57,7 @@ class ActiveRecordResolver
|
|
56
57
|
@ar_class = ar_class
|
57
58
|
@find_method = find_method
|
58
59
|
end
|
60
|
+
|
59
61
|
# Resolve the value
|
60
62
|
def resolve(value)
|
61
63
|
rec = ar_class.__send__(find_method, value)
|
@@ -64,13 +66,21 @@ class ActiveRecordResolver
|
|
64
66
|
end
|
65
67
|
|
66
68
|
class SQLResolver
|
69
|
+
# Initialize the SQL resolver. Use the given table and field name to search
|
70
|
+
# for the appropriate foreign key. The field should be the name of a natural
|
71
|
+
# key that is used to locate the surrogate key for the record.
|
72
|
+
#
|
73
|
+
# The connection argument is optional. If specified it can be either a symbol
|
74
|
+
# referencing a connection defined in the ETL database.yml file or an actual
|
75
|
+
# ActiveRecord connection instance. If the connection is not specified then
|
76
|
+
# the ActiveRecord::Base.connection will be used.
|
67
77
|
def initialize(table, field, connection=nil)
|
68
78
|
@table = table
|
69
79
|
@field = field
|
70
|
-
@connection = connection
|
80
|
+
@connection = (connection.respond_to?(:quote) ? connection : ETL::Engine.connection(connection)) if connection
|
81
|
+
@connection ||= ActiveRecord::Base.connection
|
71
82
|
end
|
72
83
|
def resolve(value)
|
73
|
-
|
74
|
-
conn.select_value("SELECT id FROM #{table} WHERE #{field} = #{conn.quote(value)}")
|
84
|
+
@connection.select_value("SELECT id FROM #{@table} WHERE #{@field} = #{@connection.quote(value)}")
|
75
85
|
end
|
76
86
|
end
|