activewarehouse-etl 0.6.1 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +29 -1
- data/LICENSE +7 -0
- data/README +58 -12
- data/Rakefile +2 -1
- data/lib/etl.rb +3 -0
- data/lib/etl/commands/etl.rb +35 -1
- data/lib/etl/control/control.rb +20 -9
- data/lib/etl/control/destination.rb +173 -12
- data/lib/etl/control/destination/database_destination.rb +2 -2
- data/lib/etl/control/destination/file_destination.rb +25 -2
- data/lib/etl/control/source.rb +29 -8
- data/lib/etl/control/source/database_source.rb +109 -24
- data/lib/etl/control/source/file_source.rb +29 -16
- data/lib/etl/engine.rb +164 -63
- data/lib/etl/execution.rb +19 -0
- data/lib/etl/execution/base.rb +9 -0
- data/lib/etl/execution/job.rb +7 -0
- data/lib/etl/execution/migration.rb +54 -0
- data/lib/etl/execution/record.rb +8 -0
- data/lib/etl/generator/surrogate_key_generator.rb +2 -0
- data/lib/etl/parser.rb +9 -0
- data/lib/etl/parser/parser.rb +5 -2
- data/lib/etl/parser/sax_parser.rb +22 -6
- data/lib/etl/processor.rb +8 -0
- data/lib/etl/processor/bulk_import_processor.rb +32 -4
- data/lib/etl/processor/check_exist_processor.rb +69 -0
- data/lib/etl/processor/check_unique_processor.rb +35 -0
- data/lib/etl/processor/copy_field_processor.rb +20 -4
- data/lib/etl/processor/processor.rb +3 -0
- data/lib/etl/processor/rename_processor.rb +24 -0
- data/lib/etl/processor/row_processor.rb +1 -1
- data/lib/etl/processor/sequence_processor.rb +23 -0
- data/lib/etl/processor/surrogate_key_processor.rb +31 -0
- data/lib/etl/processor/truncate_processor.rb +0 -2
- data/lib/etl/row.rb +17 -0
- data/lib/etl/screen/row_count_screen.rb +15 -0
- data/lib/etl/transform/block_transform.rb +13 -0
- data/lib/etl/transform/date_to_string_transform.rb +1 -1
- data/lib/etl/transform/decode_transform.rb +1 -1
- data/lib/etl/transform/default_transform.rb +6 -1
- data/lib/etl/transform/foreign_key_lookup_transform.rb +1 -1
- data/lib/etl/transform/hierarchy_lookup_transform.rb +1 -1
- data/lib/etl/transform/ordinalize_transform.rb +12 -0
- data/lib/etl/transform/sha1_transform.rb +0 -3
- data/lib/etl/transform/string_to_date_transform.rb +0 -3
- data/lib/etl/transform/string_to_datetime_transform.rb +0 -3
- data/lib/etl/transform/string_to_time_transform.rb +0 -3
- data/lib/etl/transform/transform.rb +20 -11
- data/lib/etl/transform/trim_transform.rb +26 -0
- data/lib/etl/transform/type_transform.rb +9 -1
- data/lib/etl/version.rb +2 -2
- metadata +21 -3
@@ -0,0 +1,19 @@
|
|
1
|
+
module ETL #:nodoc:
  # Classes which store information about ETL execution
  module Execution
    # Entry point for managing the execution data store.
    class Execution
      # Migrate the data store.
      #
      # Delegates to ETL::Execution::Migration.migrate and passes its return
      # value straight through to the caller.
      def self.migrate
        ETL::Execution::Migration.migrate
      end
    end
  end
end
|
15
|
+
|
16
|
+
require 'etl/execution/base'
|
17
|
+
require 'etl/execution/job'
|
18
|
+
require 'etl/execution/record'
|
19
|
+
require 'etl/execution/migration'
|
@@ -0,0 +1,54 @@
|
|
1
|
+
module ETL #:nodoc:
  module Execution #:nodoc:
    # Handles migration of the tables required for persistent storage of
    # meta data for the ETL engine.
    #
    # The current version is read from the schema info table and every
    # +migration_N+ step above it, up to +target+, is invoked in order.
    class Migration
      class << self
        # Execute the pending migrations.
        #
        # Asks the connection to initialize its schema information, reads the
        # stored version (a missing value counts as 0) and then runs each
        # step from version + 1 through +target+.
        def migrate
          connection.initialize_schema_information
          current = connection.select_value("SELECT version FROM #{schema_info_table_name}").to_i
          current.upto(target - 1) do |applied|
            __send__(:"migration_#{applied + 1}")
          end
        end

        protected

        # Get the schema info table name: "schema_info" wrapped in the table
        # name prefix and suffix configured on ETL::Execution::Base.
        def schema_info_table_name
          base = ETL::Execution::Base
          base.table_name_prefix + "schema_info" + base.table_name_suffix
        end

        # Get the connection to use during migration. The connection is
        # fetched from ETL::Execution::Base once and then reused.
        def connection
          @connection ||= ETL::Execution::Base.connection
        end

        # Get the final target version number, i.e. the highest
        # +migration_N+ step that +migrate+ will run.
        def target
          1
        end

        private

        def migration_1 #:nodoc:
          connection.create_table(:jobs) do |t|
            t.column :control_file, :string, :null => false
            t.column :created_at, :datetime, :null => false
            t.column :completed_at, :datetime
            t.column :status, :string
          end
          connection.create_table(:records) do |t|
            t.column :control_file, :string, :null => false
            t.column :natural_key, :string, :null => false
            t.column :crc, :string, :null => false
            t.column :job_id, :integer, :null => false
          end
          update_schema_info(1)
        end

        # Update the schema info table, setting the version value
        def update_schema_info(version)
          connection.update("UPDATE #{schema_info_table_name} SET version = #{version}")
        end
      end
    end
  end
end
|
data/lib/etl/parser.rb
CHANGED
@@ -1,2 +1,11 @@
|
|
1
|
+
# This source file contains the ETL::Parser module and requires all of the files
|
2
|
+
# in the parser directory ending with .rb
|
3
|
+
|
4
|
+
module ETL #:nodoc:
|
5
|
+
# The ETL::Parser module provides various text parsers.
|
6
|
+
module Parser
|
7
|
+
end
|
8
|
+
end
|
9
|
+
|
1
10
|
require 'etl/parser/parser'
|
2
11
|
Dir[File.dirname(__FILE__) + "/parser/*.rb"].each { |file| require(file) }
|
data/lib/etl/parser/parser.rb
CHANGED
@@ -1,5 +1,8 @@
|
|
1
|
-
module ETL
|
2
|
-
module Parser
|
1
|
+
module ETL #:nodoc:
|
2
|
+
module Parser #:nodoc:
|
3
|
+
# Base parser class. Implementation classes must extend this class and implement
|
4
|
+
# the each method. The each method should return each row of the source data as
|
5
|
+
# a Hash.
|
3
6
|
class Parser
|
4
7
|
include Enumerable
|
5
8
|
class << self
|
@@ -29,6 +29,7 @@ module ETL #:nodoc:
|
|
29
29
|
end
|
30
30
|
end
|
31
31
|
|
32
|
+
# Get an array of Field objects
|
32
33
|
def fields
|
33
34
|
@fields ||= []
|
34
35
|
end
|
@@ -44,16 +45,21 @@ module ETL #:nodoc:
|
|
44
45
|
end
|
45
46
|
end
|
46
47
|
|
48
|
+
# Class representing a field to be loaded from the source
|
47
49
|
class Field
|
48
|
-
|
49
|
-
|
50
|
+
# The name of the field
|
51
|
+
attr_reader :name
|
52
|
+
# The XPath-like path to the field in the XML document
|
53
|
+
attr_reader :path
|
54
|
+
|
55
|
+
def initialize(name, path) #:nodoc
|
50
56
|
@name = name
|
51
57
|
@path = path
|
52
58
|
end
|
53
59
|
end
|
54
60
|
end
|
55
61
|
|
56
|
-
class Listener
|
62
|
+
class Listener #:nodoc:
|
57
63
|
include REXML::SAX2Listener
|
58
64
|
def initialize(parser, &block)
|
59
65
|
@parser = parser
|
@@ -120,19 +126,28 @@ module ETL #:nodoc:
|
|
120
126
|
end
|
121
127
|
end
|
122
128
|
|
123
|
-
|
124
|
-
|
129
|
+
# Module which contains classes that are used for XPath-like filtering
|
130
|
+
# on the SAX parser
|
131
|
+
module XPath #:nodoc:
|
132
|
+
class Path #:nodoc:
|
133
|
+
# Get the elements in the path
|
125
134
|
attr_accessor :elements
|
135
|
+
|
136
|
+
# Initialize
|
126
137
|
def initialize
|
127
138
|
@elements = []
|
128
139
|
end
|
140
|
+
|
141
|
+
# Convert to a string representation
|
129
142
|
def to_s
|
130
143
|
@elements.map{ |e| e.to_s }.join("/")
|
131
144
|
end
|
145
|
+
|
132
146
|
# Returns true if the last part of the path refers to an attribute
|
133
147
|
def is_attribute?
|
134
148
|
elements.last.attributes.length > 0
|
135
149
|
end
|
150
|
+
|
136
151
|
# Return the name of the attribute referenced by the last element in this path. Returns nil if the last element
|
137
152
|
# does not reference an attribute.
|
138
153
|
#
|
@@ -142,6 +157,7 @@ module ETL #:nodoc:
|
|
142
157
|
return nil unless is_attribute?
|
143
158
|
elements.last.attributes.keys.first
|
144
159
|
end
|
160
|
+
|
145
161
|
# Return true if this XPath::Path matches the given path string. This is a fail-fast match, so the first mismatch
|
146
162
|
# will cause the method to return false.
|
147
163
|
def match?(s)
|
@@ -178,7 +194,7 @@ module ETL #:nodoc:
|
|
178
194
|
path
|
179
195
|
end
|
180
196
|
end
|
181
|
-
class Element
|
197
|
+
class Element #:nodoc
|
182
198
|
attr_reader :name
|
183
199
|
attr_reader :attributes
|
184
200
|
def initialize(name, attributes={})
|
data/lib/etl/processor.rb
CHANGED
@@ -1,3 +1,11 @@
|
|
1
|
+
# This source file contains the ETL::Processor module and requires all of the processors
|
2
|
+
|
3
|
+
module ETL #:nodoc:
|
4
|
+
# The ETL::Processor module contains row-level and bulk processors
|
5
|
+
module Processor
|
6
|
+
end
|
7
|
+
end
|
8
|
+
|
1
9
|
require 'etl/processor/processor'
|
2
10
|
require 'etl/processor/row_processor'
|
3
11
|
Dir[File.dirname(__FILE__) + "/processor/*.rb"].each { |file| require(file) }
|
@@ -1,11 +1,35 @@
|
|
1
1
|
module ETL #:nodoc:
|
2
2
|
module Processor #:nodoc:
|
3
|
-
# Processor which is used to bulk import data into a target database
|
3
|
+
# Processor which is used to bulk import data into a target database. The
|
4
|
+
# underlying database driver from ActiveRecord must support the
|
5
|
+
# +bulk_load+ method.
|
4
6
|
class BulkImportProcessor < ETL::Processor::Processor
|
5
|
-
|
7
|
+
# The file to load from
|
8
|
+
attr_reader :file
|
9
|
+
# The target database information (see +initialize+)
|
10
|
+
attr_reader :target
|
11
|
+
# Set to true to truncate
|
12
|
+
attr_reader :truncate
|
13
|
+
# Array of symbols representing the column load order
|
14
|
+
attr_reader :columns
|
15
|
+
# The field separator (defaults to a comma)
|
6
16
|
attr_accessor :field_separator
|
17
|
+
# The field enclosure (defaults to nil)
|
7
18
|
attr_accessor :field_enclosure
|
19
|
+
# The line separator (defaults to a newline)
|
8
20
|
attr_accessor :line_separator
|
21
|
+
|
22
|
+
# Initialize the processor.
|
23
|
+
#
|
24
|
+
# Configuration options:
|
25
|
+
# * <tt>:file</tt>: The file to load data from
|
26
|
+
# * <tt>:target</tt>: The target connection information
|
27
|
+
# * <tt>:truncate</tt>: Set to true to truncate before loading
|
28
|
+
# * <tt>:columns</tt>: The columns to load in the order they appear in
|
29
|
+
# the bulk data file
|
30
|
+
# * <tt>:field_separator</tt>: The field separator. Defaults to a comma
|
31
|
+
# * <tt>:line_separator</tt>: The line separator. Defaults to a newline
|
32
|
+
# * <tt>:field_enclosure</tt>: The field enclosure characters
|
9
33
|
def initialize(control, configuration)
|
10
34
|
super
|
11
35
|
@file = File.join(File.dirname(control.file), configuration[:file])
|
@@ -13,12 +37,15 @@ module ETL #:nodoc:
|
|
13
37
|
@truncate = configuration[:truncate] ||= false
|
14
38
|
@columns = configuration[:columns]
|
15
39
|
@field_separator = (configuration[:field_separator] || ',')
|
16
|
-
@line_separator = configuration[:line_separator]
|
40
|
+
@line_separator = (configuration[:line_separator] || "\n")
|
17
41
|
@field_enclosure = configuration[:field_enclosure]
|
18
42
|
connect
|
19
43
|
end
|
44
|
+
|
45
|
+
# Execute the processor
|
20
46
|
def process
|
21
|
-
|
47
|
+
return if ETL::Engine.skip_bulk_import
|
48
|
+
|
22
49
|
conn = ETL::ActiveRecord::Base.connection
|
23
50
|
conn.transaction do
|
24
51
|
# TODO: Support all database types
|
@@ -30,6 +57,7 @@ module ETL #:nodoc:
|
|
30
57
|
options[:fields] = {}
|
31
58
|
options[:fields][:delimited_by] = field_separator if field_separator
|
32
59
|
options[:fields][:enclosed_by] = field_enclosure if field_enclosure
|
60
|
+
options[:fields][:terminated_by] = line_separator if line_separator
|
33
61
|
end
|
34
62
|
conn.bulk_load(file, target[:table], options)
|
35
63
|
end
|
@@ -0,0 +1,69 @@
|
|
1
|
+
module ETL #:nodoc:
|
2
|
+
module Processor #:nodoc:
|
3
|
+
# A row-level processor that checks if the row already exists in the
|
4
|
+
# target table
|
5
|
+
class CheckExistProcessor < ETL::Processor::RowProcessor
|
6
|
+
# A symbol or array of symbols representing keys that should be skipped
|
7
|
+
attr_accessor :skip
|
8
|
+
|
9
|
+
# The name of the table to check against
|
10
|
+
attr_accessor :table
|
11
|
+
|
12
|
+
# An array of columns representing the natural key
|
13
|
+
attr_accessor :columns
|
14
|
+
|
15
|
+
# Is set to true if the processor should execute the check. If there are
|
16
|
+
# no rows in the target table then this should return false.
|
17
|
+
attr_accessor :should_check
|
18
|
+
|
19
|
+
# Initialize the processor
|
20
|
+
# Configuration options:
|
21
|
+
# * <tt>:skip</tt>: A symbol or array of column names that should not
|
22
|
+
# be checked
|
23
|
+
# * <tt>:table</tt>: The table name
|
24
|
+
# * <tt>:columns</tt>: An array of columns which represent the natural
|
25
|
+
# key
|
26
|
+
def initialize(control, configuration)
  super
  @skip    = configuration[:skip]
  @table   = configuration[:table]
  @columns = configuration[:columns]

  # Count the target table once, at construction time: the per-row check is
  # only enabled when the table already holds at least one row.
  row_count = ActiveRecord::Base.connection.select_value("SELECT COUNT(*) FROM #{table}").to_i
  @should_check = row_count > 0
end
|
35
|
+
|
36
|
+
# Return true if the given key should be skipped
|
37
|
+
def skip?(key)
  # +skip+ may be nil (option not configured), a single name, or an array of
  # names. Nothing is skipped when it is nil; previously this case raised
  # NoMethodError from nil.to_sym on the first row processed.
  return false if skip.nil?

  # Compare as symbols in every case so that String and Symbol keys are
  # treated alike, matching how a single (non-array) value was already
  # compared.
  wanted = key.to_sym
  Array(skip).any? { |name| name.to_sym == wanted }
end
|
45
|
+
|
46
|
+
# Return true if the row should be checked
|
47
|
+
def should_check?
  # Coerce the stored flag to a strict true/false.
  !!@should_check
end
|
50
|
+
|
51
|
+
# Process the row
|
52
|
+
def process(row)
  # Returns the row when no matching record is found in +table+ (or when
  # checking is disabled), and nil when a matching record already exists.
  # Raises ArgumentError when the row offers no column to compare on.
  return row unless should_check?

  conn = ActiveRecord::Base.connection
  conditions = []
  row.each do |k, v|
    # Only the natural key columns take part (every column when +columns+
    # is nil), minus anything explicitly skipped.
    next unless columns.nil? || columns.include?(k.to_sym)
    next if skip?(k)
    # Let the adapter quote the value; wrapping it in hand-written quotes
    # broke on values containing an apostrophe and allowed SQL injection.
    conditions << "#{k} = #{conn.quote(v)}"
  end

  # Without any condition the statement would end in a bare "WHERE ", which
  # the database rejects with a syntax error; fail with a clear message.
  if conditions.empty?
    raise ArgumentError, "no columns available in row for existence check against #{table}"
  end

  q = "SELECT * FROM #{table} WHERE " + conditions.join(" AND ")
  conn.select_one(q).nil? ? row : nil
end
|
67
|
+
end
|
68
|
+
end
|
69
|
+
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
module ETL #:nodoc:
|
2
|
+
module Processor #:nodoc:
|
3
|
+
# Row processor that checks whether or not the row has already passed
|
4
|
+
# through the ETL processor, using the key fields provided as the keys
|
5
|
+
# to check.
|
6
|
+
class CheckUniqueProcessor < ETL::Processor::RowProcessor
|
7
|
+
|
8
|
+
# The keys to check
|
9
|
+
attr_accessor :keys
|
10
|
+
|
11
|
+
# Initialize the processor
|
12
|
+
# Configuration options:
|
13
|
+
# * <tt>:keys</tt>: An array of keys to check against
|
14
|
+
def initialize(control, configuration)
|
15
|
+
super
|
16
|
+
@keys = configuration[:keys]
|
17
|
+
end
|
18
|
+
|
19
|
+
# A Hash of keys that have already been processed.
|
20
|
+
def compound_key_constraints
  # Created on first use and reused afterwards.
  @compound_key_constraints = {} unless @compound_key_constraints
  @compound_key_constraints
end
|
23
|
+
|
24
|
+
# Process the row. This implementation will only return a row if
|
25
|
+
# its key combination has not already been seen.
|
26
|
+
def process(row)
  # Returns the row the first time its key combination is seen and nil for
  # every later row carrying the same combination.
  #
  # The compound key is kept as an array of the stringified key values
  # rather than one '|'-joined string. Joining made distinct combinations
  # collide whenever a value itself contained '|' (e.g. ["a|b", "c"] and
  # ["a", "b|c"]), which silently dropped unique rows. Values are still
  # compared by their string form, as before.
  key = keys.collect { |k| row[k].to_s }
  return nil if compound_key_constraints[key]

  compound_key_constraints[key] = 1
  row
end
|
33
|
+
end
|
34
|
+
end
|
35
|
+
end
|
@@ -1,8 +1,24 @@
|
|
1
|
-
module ETL
|
2
|
-
module Processor
|
3
|
-
|
1
|
+
module ETL #:nodoc:
|
2
|
+
module Processor #:nodoc:
|
3
|
+
# Row processor that will copy one field to another
|
4
|
+
#
|
5
|
+
# Configuration options:
|
6
|
+
# * <tt>:destination</tt>: The destination field
|
7
|
+
# * <tt>:dest</tt>: Alias for :destination
|
8
|
+
# * <tt>:source</tt>: The source field
|
9
|
+
class CopyFieldProcessor < ETL::Processor::RowProcessor
|
10
|
+
# Process the given row
|
4
11
|
def process(row)
  # Copies the value of the configured :source field into the field named
  # by :destination (or its alias :dest) and returns the same row.
  destination = (configuration[:destination] || configuration[:dest])
  source_value = row[configuration[:source]]

  # Immediate values are shared as-is; everything else is duplicated so
  # that later in-place changes to one field do not leak into the other.
  # Symbol, true and false are listed alongside Numeric and nil because
  # they cannot be duplicated on every Ruby version and would otherwise
  # raise TypeError from #dup.
  row[destination] =
    case source_value
    when Numeric, Symbol, nil, true, false
      source_value
    else
      source_value.dup
    end
  row
end
|
8
24
|
end
|
@@ -7,12 +7,15 @@ module ETL #:nodoc:
|
|
7
7
|
@configuration = configuration
|
8
8
|
end
|
9
9
|
protected
|
10
|
+
# Get the control object
|
10
11
|
def control
|
11
12
|
@control
|
12
13
|
end
|
14
|
+
# Get the configuration Hash
|
13
15
|
def configuration
|
14
16
|
@configuration
|
15
17
|
end
|
18
|
+
# Get the engine logger
|
16
19
|
def log
|
17
20
|
Engine.logger
|
18
21
|
end
|