activewarehouse-etl 0.6.1 → 0.7.0
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +29 -1
- data/LICENSE +7 -0
- data/README +58 -12
- data/Rakefile +2 -1
- data/lib/etl.rb +3 -0
- data/lib/etl/commands/etl.rb +35 -1
- data/lib/etl/control/control.rb +20 -9
- data/lib/etl/control/destination.rb +173 -12
- data/lib/etl/control/destination/database_destination.rb +2 -2
- data/lib/etl/control/destination/file_destination.rb +25 -2
- data/lib/etl/control/source.rb +29 -8
- data/lib/etl/control/source/database_source.rb +109 -24
- data/lib/etl/control/source/file_source.rb +29 -16
- data/lib/etl/engine.rb +164 -63
- data/lib/etl/execution.rb +19 -0
- data/lib/etl/execution/base.rb +9 -0
- data/lib/etl/execution/job.rb +7 -0
- data/lib/etl/execution/migration.rb +54 -0
- data/lib/etl/execution/record.rb +8 -0
- data/lib/etl/generator/surrogate_key_generator.rb +2 -0
- data/lib/etl/parser.rb +9 -0
- data/lib/etl/parser/parser.rb +5 -2
- data/lib/etl/parser/sax_parser.rb +22 -6
- data/lib/etl/processor.rb +8 -0
- data/lib/etl/processor/bulk_import_processor.rb +32 -4
- data/lib/etl/processor/check_exist_processor.rb +69 -0
- data/lib/etl/processor/check_unique_processor.rb +35 -0
- data/lib/etl/processor/copy_field_processor.rb +20 -4
- data/lib/etl/processor/processor.rb +3 -0
- data/lib/etl/processor/rename_processor.rb +24 -0
- data/lib/etl/processor/row_processor.rb +1 -1
- data/lib/etl/processor/sequence_processor.rb +23 -0
- data/lib/etl/processor/surrogate_key_processor.rb +31 -0
- data/lib/etl/processor/truncate_processor.rb +0 -2
- data/lib/etl/row.rb +17 -0
- data/lib/etl/screen/row_count_screen.rb +15 -0
- data/lib/etl/transform/block_transform.rb +13 -0
- data/lib/etl/transform/date_to_string_transform.rb +1 -1
- data/lib/etl/transform/decode_transform.rb +1 -1
- data/lib/etl/transform/default_transform.rb +6 -1
- data/lib/etl/transform/foreign_key_lookup_transform.rb +1 -1
- data/lib/etl/transform/hierarchy_lookup_transform.rb +1 -1
- data/lib/etl/transform/ordinalize_transform.rb +12 -0
- data/lib/etl/transform/sha1_transform.rb +0 -3
- data/lib/etl/transform/string_to_date_transform.rb +0 -3
- data/lib/etl/transform/string_to_datetime_transform.rb +0 -3
- data/lib/etl/transform/string_to_time_transform.rb +0 -3
- data/lib/etl/transform/transform.rb +20 -11
- data/lib/etl/transform/trim_transform.rb +26 -0
- data/lib/etl/transform/type_transform.rb +9 -1
- data/lib/etl/version.rb +2 -2
- metadata +21 -3
@@ -0,0 +1,19 @@
|
|
1
|
+
module ETL #:nodoc:
  # Classes which store information about ETL execution
  module Execution
    # Entry point for managing the execution data store
    class Execution
      # Migrate the data store. The work is delegated to
      # ETL::Execution::Migration.migrate and its result is returned.
      def self.migrate
        ETL::Execution::Migration.migrate
      end
    end
  end
end
|
15
|
+
|
16
|
+
require 'etl/execution/base'
|
17
|
+
require 'etl/execution/job'
|
18
|
+
require 'etl/execution/record'
|
19
|
+
require 'etl/execution/migration'
|
@@ -0,0 +1,54 @@
|
|
1
|
+
module ETL #:nodoc:
  module Execution #:nodoc:
    # Creates and upgrades the tables that the ETL engine needs for persistent
    # storage of its meta data.
    class Migration
      class << self
        # Execute the migrations. Reads the version currently recorded in the
        # schema info table and runs every migration_N step after it, up to
        # and including +target+.
        def migrate
          connection.initialize_schema_information
          current = connection.select_value("SELECT version FROM #{schema_info_table_name}").to_i
          (current + 1).upto(target) { |step| __send__(:"migration_#{step}") }
        end

        protected

        # Get the schema info table name, built from the table name prefix
        # and suffix configured on ETL::Execution::Base
        def schema_info_table_name
          prefix = ETL::Execution::Base.table_name_prefix
          suffix = ETL::Execution::Base.table_name_suffix
          prefix + "schema_info" + suffix
        end

        # Get the connection to use during migration (memoized)
        def connection
          @connection ||= ETL::Execution::Base.connection
        end

        # Get the final target version number
        def target
          1
        end

        private

        # Version 1: create the jobs and records tables
        def migration_1 #:nodoc:
          connection.create_table(:jobs) do |t|
            t.column(:control_file, :string, :null => false)
            t.column(:created_at, :datetime, :null => false)
            t.column(:completed_at, :datetime)
            t.column(:status, :string)
          end
          connection.create_table(:records) do |t|
            t.column(:control_file, :string, :null => false)
            t.column(:natural_key, :string, :null => false)
            t.column(:crc, :string, :null => false)
            t.column(:job_id, :integer, :null => false)
          end
          update_schema_info(1)
        end

        # Update the schema info table, setting the version value
        def update_schema_info(version)
          connection.update("UPDATE #{schema_info_table_name} SET version = #{version}")
        end
      end
    end
  end
end
|
data/lib/etl/parser.rb
CHANGED
@@ -1,2 +1,11 @@
|
|
1
|
+
# This source file contains the ETL::Parser module and requires all of the files
|
2
|
+
# in the parser directory ending with .rb
|
3
|
+
|
4
|
+
module ETL #:nodoc:
|
5
|
+
# The ETL::Parser module provides various text parsers.
|
6
|
+
module Parser
|
7
|
+
end
|
8
|
+
end
|
9
|
+
|
1
10
|
require 'etl/parser/parser'
|
2
11
|
Dir[File.dirname(__FILE__) + "/parser/*.rb"].each { |file| require(file) }
|
data/lib/etl/parser/parser.rb
CHANGED
@@ -1,5 +1,8 @@
|
|
1
|
-
module ETL
|
2
|
-
module Parser
|
1
|
+
module ETL #:nodoc:
|
2
|
+
module Parser #:nodoc:
|
3
|
+
# Base parser class. Implementation classes must extend this class and implement
|
4
|
+
# the each method. The each method should return each row of the source data as
|
5
|
+
# a Hash.
|
3
6
|
class Parser
|
4
7
|
include Enumerable
|
5
8
|
class << self
|
@@ -29,6 +29,7 @@ module ETL #:nodoc:
|
|
29
29
|
end
|
30
30
|
end
|
31
31
|
|
32
|
+
# Get an array of Field objects
|
32
33
|
def fields
|
33
34
|
@fields ||= []
|
34
35
|
end
|
@@ -44,16 +45,21 @@ module ETL #:nodoc:
|
|
44
45
|
end
|
45
46
|
end
|
46
47
|
|
48
|
+
# Class representing a field to be loaded from the source
|
47
49
|
class Field
|
48
|
-
|
49
|
-
|
50
|
+
# The name of the field
|
51
|
+
attr_reader :name
|
52
|
+
# The XPath-like path to the field in the XML document
|
53
|
+
attr_reader :path
|
54
|
+
|
55
|
+
def initialize(name, path) #:nodoc
|
50
56
|
@name = name
|
51
57
|
@path = path
|
52
58
|
end
|
53
59
|
end
|
54
60
|
end
|
55
61
|
|
56
|
-
class Listener
|
62
|
+
class Listener #:nodoc:
|
57
63
|
include REXML::SAX2Listener
|
58
64
|
def initialize(parser, &block)
|
59
65
|
@parser = parser
|
@@ -120,19 +126,28 @@ module ETL #:nodoc:
|
|
120
126
|
end
|
121
127
|
end
|
122
128
|
|
123
|
-
|
124
|
-
|
129
|
+
# Module which contains classes that are used for XPath-like filtering
|
130
|
+
# on the SAX parser
|
131
|
+
module XPath #:nodoc:
|
132
|
+
class Path #:nodoc:
|
133
|
+
# Get the elements in the path
|
125
134
|
attr_accessor :elements
|
135
|
+
|
136
|
+
# Initialize
|
126
137
|
def initialize
|
127
138
|
@elements = []
|
128
139
|
end
|
140
|
+
|
141
|
+
# Convert to a string representation
|
129
142
|
def to_s
|
130
143
|
@elements.map{ |e| e.to_s }.join("/")
|
131
144
|
end
|
145
|
+
|
132
146
|
# Returns true if the last part of the path refers to an attribute
|
133
147
|
def is_attribute?
|
134
148
|
elements.last.attributes.length > 0
|
135
149
|
end
|
150
|
+
|
136
151
|
# Return the name of the attribute referenced by the last element in this path. Returns nil if the last element
|
137
152
|
# does not reference an attribute.
|
138
153
|
#
|
@@ -142,6 +157,7 @@ module ETL #:nodoc:
|
|
142
157
|
return nil unless is_attribute?
|
143
158
|
elements.last.attributes.keys.first
|
144
159
|
end
|
160
|
+
|
145
161
|
# Return true if this XPath::Path matches the given path string. This is a fail-fast match, so the first mismatch
|
146
162
|
# will cause the method to return false.
|
147
163
|
def match?(s)
|
@@ -178,7 +194,7 @@ module ETL #:nodoc:
|
|
178
194
|
path
|
179
195
|
end
|
180
196
|
end
|
181
|
-
class Element
|
197
|
+
class Element #:nodoc
|
182
198
|
attr_reader :name
|
183
199
|
attr_reader :attributes
|
184
200
|
def initialize(name, attributes={})
|
data/lib/etl/processor.rb
CHANGED
@@ -1,3 +1,11 @@
|
|
1
|
+
# This source file contains the ETL::Processor module and requires all of the processors
|
2
|
+
|
3
|
+
module ETL #:nodoc:
|
4
|
+
# The ETL::Processor module contains row-level and bulk processors
|
5
|
+
module Processor
|
6
|
+
end
|
7
|
+
end
|
8
|
+
|
1
9
|
require 'etl/processor/processor'
|
2
10
|
require 'etl/processor/row_processor'
|
3
11
|
Dir[File.dirname(__FILE__) + "/processor/*.rb"].each { |file| require(file) }
|
@@ -1,11 +1,35 @@
|
|
1
1
|
module ETL #:nodoc:
|
2
2
|
module Processor #:nodoc:
|
3
|
-
# Processor which is used to bulk import data into a target database
|
3
|
+
# Processor which is used to bulk import data into a target database. The
|
4
|
+
# underlying database driver from ActiveRecord must support the
|
5
|
+
# +bulk_load+ method.
|
4
6
|
class BulkImportProcessor < ETL::Processor::Processor
|
5
|
-
|
7
|
+
# The file to load from
|
8
|
+
attr_reader :file
|
9
|
+
# The target database information (see +initialize+)
|
10
|
+
attr_reader :target
|
11
|
+
# Set to true to truncate
|
12
|
+
attr_reader :truncate
|
13
|
+
# Array of symbols representing the column load order
|
14
|
+
attr_reader :columns
|
15
|
+
# The field separator (defaults to a comma)
|
6
16
|
attr_accessor :field_separator
|
17
|
+
# The field enclosure (defaults to nil)
|
7
18
|
attr_accessor :field_enclosure
|
19
|
+
# The line separator (defaults to a newline)
|
8
20
|
attr_accessor :line_separator
|
21
|
+
|
22
|
+
# Initialize the processor.
|
23
|
+
#
|
24
|
+
# Configuration options:
|
25
|
+
# * <tt>:file</tt>: The file to load data from
|
26
|
+
# * <tt>:target</tt>: The target connection information
|
27
|
+
# * <tt>:truncate</tt>: Set to true to truncate before loading
|
28
|
+
# * <tt>:columns</tt>: The columns to load in the order they appear in
|
29
|
+
# the bulk data file
|
30
|
+
# * <tt>:field_separator</tt>: The field separator. Defaults to a comma
|
31
|
+
# * <tt>:line_separator</tt>: The line separator. Defaults to a newline
|
32
|
+
# * <tt>:field_enclosure</tt>: The field enclosure characters
|
9
33
|
def initialize(control, configuration)
|
10
34
|
super
|
11
35
|
@file = File.join(File.dirname(control.file), configuration[:file])
|
@@ -13,12 +37,15 @@ module ETL #:nodoc:
|
|
13
37
|
@truncate = configuration[:truncate] ||= false
|
14
38
|
@columns = configuration[:columns]
|
15
39
|
@field_separator = (configuration[:field_separator] || ',')
|
16
|
-
@line_separator = configuration[:line_separator]
|
40
|
+
@line_separator = (configuration[:line_separator] || "\n")
|
17
41
|
@field_enclosure = configuration[:field_enclosure]
|
18
42
|
connect
|
19
43
|
end
|
44
|
+
|
45
|
+
# Execute the processor
|
20
46
|
def process
|
21
|
-
|
47
|
+
return if ETL::Engine.skip_bulk_import
|
48
|
+
|
22
49
|
conn = ETL::ActiveRecord::Base.connection
|
23
50
|
conn.transaction do
|
24
51
|
# TODO: Support all database types
|
@@ -30,6 +57,7 @@ module ETL #:nodoc:
|
|
30
57
|
options[:fields] = {}
|
31
58
|
options[:fields][:delimited_by] = field_separator if field_separator
|
32
59
|
options[:fields][:enclosed_by] = field_enclosure if field_enclosure
|
60
|
+
options[:fields][:terminated_by] = line_separator if line_separator
|
33
61
|
end
|
34
62
|
conn.bulk_load(file, target[:table], options)
|
35
63
|
end
|
@@ -0,0 +1,69 @@
|
|
1
|
+
module ETL #:nodoc:
  module Processor #:nodoc:
    # A row-level processor that checks if the row already exists in the
    # target table. +process+ returns the row when no matching record is
    # found and nil when one is.
    class CheckExistProcessor < ETL::Processor::RowProcessor
      # A symbol or array of symbols representing keys that should be skipped
      attr_accessor :skip

      # The name of the table to check against
      attr_accessor :table

      # An array of columns representing the natural key
      attr_accessor :columns

      # Is set to true if the processor should execute the check. If there are
      # no rows in the target table then this should return false.
      attr_accessor :should_check

      # Initialize the processor
      #
      # Configuration options:
      # * <tt>:skip</tt>: A symbol or array of column names that should not
      #   be checked (optional)
      # * <tt>:table</tt>: The table name
      # * <tt>:columns</tt>: An array of columns which represent the natural
      #   key (optional; when omitted every column of the row is compared)
      def initialize(control, configuration)
        super
        @skip = configuration[:skip]
        @table = configuration[:table]
        @columns = configuration[:columns]

        # The per-row lookup is only worthwhile when the table has rows.
        q = "SELECT COUNT(*) FROM #{table}"
        @should_check = ActiveRecord::Base.connection.select_value(q).to_i > 0
      end

      # Return true if the given key should be skipped. Names are compared as
      # strings so that symbols and strings are interchangeable. Returns false
      # when no <tt>:skip</tt> option was configured.
      def skip?(key)
        name = key.to_s
        case skip
        when nil
          false
        when Array
          skip.any? { |candidate| candidate.to_s == name }
        else
          skip.to_s == name
        end
      end

      # Return true if the row should be checked
      def should_check?
        @should_check ? true : false
      end

      # Process the row. Returns the row if no record with the same values
      # exists in the target table, nil otherwise. When the check is disabled
      # (see +should_check?+) the row is returned untouched.
      def process(row)
        return row unless should_check?

        conn = ActiveRecord::Base.connection
        conditions = []
        row.each do |k, v|
          next unless columns.nil? || columns.include?(k.to_sym)
          next if skip?(k)
          # Quote through the adapter so values containing quotes cannot
          # break (or alter) the generated SQL.
          conditions << "#{k} = #{conn.quote(v.to_s)}"
        end
        # NOTE(review): if every column is filtered out the WHERE clause is
        # empty and the statement is invalid SQL -- confirm whether that case
        # should raise or let the row through.
        q = "SELECT * FROM #{table} WHERE " + conditions.join(" AND ")

        result = conn.select_one(q)
        result.nil? ? row : nil
      end
    end
  end
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
module ETL #:nodoc:
  module Processor #:nodoc:
    # Row processor that checks whether or not the row has already passed
    # through the ETL processor, using the key fields provided as the keys
    # to check.
    class CheckUniqueProcessor < ETL::Processor::RowProcessor
      # The keys to check
      attr_accessor :keys

      # Initialize the processor
      #
      # Configuration options:
      # * <tt>:keys</tt>: An array of keys to check against
      def initialize(control, configuration)
        super
        @keys = configuration[:keys]
      end

      # A Hash of key combinations that have already been processed. Each
      # entry is keyed by an Array holding the string form of every key
      # field's value, in the order given by +keys+.
      def compound_key_constraints
        @compound_key_constraints ||= {}
      end

      # Process the row. This implementation will only return a row if its
      # key combination has not already been seen; otherwise it returns nil.
      def process(row)
        # Use an Array of values rather than a delimiter-joined String, so
        # values that contain the delimiter cannot make two different
        # combinations look identical. Each value is copied and the Array is
        # frozen so later in-place changes to the row cannot corrupt the
        # Hash lookup.
        key = keys.map { |k| row[k].to_s.dup }.freeze
        return nil if compound_key_constraints[key]

        compound_key_constraints[key] = 1
        row
      end
    end
  end
end
|
@@ -1,8 +1,24 @@
|
|
1
|
-
module ETL
|
2
|
-
module Processor
|
3
|
-
|
1
|
+
module ETL #:nodoc:
|
2
|
+
module Processor #:nodoc:
|
3
|
+
# Row processor that will copy one field to another
|
4
|
+
#
|
5
|
+
# Configuration options:
|
6
|
+
# * <tt>:destination</tt>: The destination field
|
7
|
+
# * <tt>:dest</tt>: Alias for :destination
|
8
|
+
# * <tt>:source</tt>: The source field
|
9
|
+
class CopyFieldProcessor < ETL::Processor::RowProcessor
|
10
|
+
# Process the given row
|
4
11
|
def process(row)
  # :destination wins over its alias :dest when both are configured
  target = configuration[:destination] || configuration[:dest]
  value = row[configuration[:source]]
  # Numeric values and nil are assigned as-is; any other value is duplicated
  # so the two fields do not share one object.
  row[target] = (value.nil? || value.is_a?(Numeric)) ? value : value.dup
  row
end
|
8
24
|
end
|
@@ -7,12 +7,15 @@ module ETL #:nodoc:
|
|
7
7
|
@configuration = configuration
|
8
8
|
end
|
9
9
|
protected
|
10
|
+
# Get the control object
|
10
11
|
def control
|
11
12
|
@control
|
12
13
|
end
|
14
|
+
# Get the configuration Hash
|
13
15
|
def configuration
|
14
16
|
@configuration
|
15
17
|
end
|
18
|
+
# Get the engine logger
|
16
19
|
def log
|
17
20
|
Engine.logger
|
18
21
|
end
|