activewarehouse-etl-sgonyea 0.9.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +9 -0
- data/0.9-UPGRADE +6 -0
- data/CHANGELOG +236 -0
- data/Gemfile +4 -0
- data/HOW_TO_RELEASE +13 -0
- data/LICENSE +7 -0
- data/README.textile +111 -0
- data/Rakefile +103 -0
- data/TODO +28 -0
- data/active_support_logger.patch +78 -0
- data/activewarehouse-etl.gemspec +36 -0
- data/bin/etl +28 -0
- data/bin/etl.cmd +8 -0
- data/examples/database.example.yml +16 -0
- data/lib/etl.rb +97 -0
- data/lib/etl/batch.rb +2 -0
- data/lib/etl/batch/batch.rb +111 -0
- data/lib/etl/batch/directives.rb +65 -0
- data/lib/etl/builder.rb +2 -0
- data/lib/etl/builder/date_dimension_builder.rb +96 -0
- data/lib/etl/builder/time_dimension_builder.rb +31 -0
- data/lib/etl/commands/etl.rb +89 -0
- data/lib/etl/control.rb +3 -0
- data/lib/etl/control/control.rb +405 -0
- data/lib/etl/control/destination.rb +438 -0
- data/lib/etl/control/destination/csv_destination.rb +113 -0
- data/lib/etl/control/destination/database_destination.rb +97 -0
- data/lib/etl/control/destination/excel_destination.rb +91 -0
- data/lib/etl/control/destination/file_destination.rb +126 -0
- data/lib/etl/control/destination/insert_update_database_destination.rb +136 -0
- data/lib/etl/control/destination/update_database_destination.rb +109 -0
- data/lib/etl/control/destination/yaml_destination.rb +74 -0
- data/lib/etl/control/source.rb +132 -0
- data/lib/etl/control/source/database_source.rb +224 -0
- data/lib/etl/control/source/enumerable_source.rb +11 -0
- data/lib/etl/control/source/file_source.rb +90 -0
- data/lib/etl/control/source/model_source.rb +39 -0
- data/lib/etl/core_ext.rb +1 -0
- data/lib/etl/core_ext/time.rb +5 -0
- data/lib/etl/core_ext/time/calculations.rb +42 -0
- data/lib/etl/engine.rb +582 -0
- data/lib/etl/execution.rb +19 -0
- data/lib/etl/execution/base.rb +8 -0
- data/lib/etl/execution/batch.rb +10 -0
- data/lib/etl/execution/job.rb +8 -0
- data/lib/etl/execution/migration.rb +90 -0
- data/lib/etl/generator.rb +2 -0
- data/lib/etl/generator/generator.rb +20 -0
- data/lib/etl/generator/surrogate_key_generator.rb +39 -0
- data/lib/etl/http_tools.rb +139 -0
- data/lib/etl/parser.rb +11 -0
- data/lib/etl/parser/apache_combined_log_parser.rb +49 -0
- data/lib/etl/parser/csv_parser.rb +93 -0
- data/lib/etl/parser/excel_parser.rb +112 -0
- data/lib/etl/parser/fixed_width_parser.rb +65 -0
- data/lib/etl/parser/nokogiri_xml_parser.rb +83 -0
- data/lib/etl/parser/parser.rb +41 -0
- data/lib/etl/parser/sax_parser.rb +218 -0
- data/lib/etl/parser/xml_parser.rb +65 -0
- data/lib/etl/processor.rb +11 -0
- data/lib/etl/processor/block_processor.rb +14 -0
- data/lib/etl/processor/bulk_import_processor.rb +94 -0
- data/lib/etl/processor/check_exist_processor.rb +80 -0
- data/lib/etl/processor/check_unique_processor.rb +39 -0
- data/lib/etl/processor/copy_field_processor.rb +26 -0
- data/lib/etl/processor/database_join_processor.rb +82 -0
- data/lib/etl/processor/encode_processor.rb +55 -0
- data/lib/etl/processor/ensure_fields_presence_processor.rb +24 -0
- data/lib/etl/processor/escape_csv_processor.rb +77 -0
- data/lib/etl/processor/filter_row_processor.rb +51 -0
- data/lib/etl/processor/ftp_downloader_processor.rb +68 -0
- data/lib/etl/processor/ftp_uploader_processor.rb +65 -0
- data/lib/etl/processor/hierarchy_exploder_processor.rb +55 -0
- data/lib/etl/processor/imapattachment_downloader_processor.rb +91 -0
- data/lib/etl/processor/pop3attachment_downloader_processor.rb +90 -0
- data/lib/etl/processor/print_row_processor.rb +12 -0
- data/lib/etl/processor/processor.rb +25 -0
- data/lib/etl/processor/rename_processor.rb +24 -0
- data/lib/etl/processor/require_non_blank_processor.rb +26 -0
- data/lib/etl/processor/row_processor.rb +27 -0
- data/lib/etl/processor/sequence_processor.rb +23 -0
- data/lib/etl/processor/sftp_downloader_processor.rb +63 -0
- data/lib/etl/processor/sftp_uploader_processor.rb +63 -0
- data/lib/etl/processor/surrogate_key_processor.rb +53 -0
- data/lib/etl/processor/truncate_processor.rb +40 -0
- data/lib/etl/processor/zip_file_processor.rb +27 -0
- data/lib/etl/row.rb +20 -0
- data/lib/etl/screen.rb +14 -0
- data/lib/etl/screen/row_count_screen.rb +20 -0
- data/lib/etl/transform.rb +2 -0
- data/lib/etl/transform/block_transform.rb +13 -0
- data/lib/etl/transform/calculation_transform.rb +71 -0
- data/lib/etl/transform/date_to_string_transform.rb +20 -0
- data/lib/etl/transform/decode_transform.rb +51 -0
- data/lib/etl/transform/default_transform.rb +20 -0
- data/lib/etl/transform/foreign_key_lookup_transform.rb +211 -0
- data/lib/etl/transform/hierarchy_lookup_transform.rb +49 -0
- data/lib/etl/transform/md5_transform.rb +13 -0
- data/lib/etl/transform/ordinalize_transform.rb +14 -0
- data/lib/etl/transform/sha1_transform.rb +13 -0
- data/lib/etl/transform/split_fields_transform.rb +27 -0
- data/lib/etl/transform/string_to_date_time_transform.rb +14 -0
- data/lib/etl/transform/string_to_date_transform.rb +16 -0
- data/lib/etl/transform/string_to_time_transform.rb +11 -0
- data/lib/etl/transform/transform.rb +61 -0
- data/lib/etl/transform/trim_transform.rb +26 -0
- data/lib/etl/transform/type_transform.rb +35 -0
- data/lib/etl/util.rb +59 -0
- data/lib/etl/version.rb +3 -0
- data/test-matrix.yml +10 -0
- data/test/.gitignore +1 -0
- data/test/.ignore +2 -0
- data/test/all.ebf +6 -0
- data/test/apache_combined_log.ctl +11 -0
- data/test/batch_test.rb +41 -0
- data/test/batch_with_error.ebf +6 -0
- data/test/batched1.ctl +0 -0
- data/test/batched2.ctl +0 -0
- data/test/block_processor.ctl +6 -0
- data/test/block_processor_error.ctl +1 -0
- data/test/block_processor_pre_post_process.ctl +4 -0
- data/test/block_processor_remove_rows.ctl +5 -0
- data/test/block_processor_test.rb +38 -0
- data/test/check_exist_processor_test.rb +92 -0
- data/test/check_unique_processor_test.rb +40 -0
- data/test/config/Gemfile.rails-2.3.x +3 -0
- data/test/config/Gemfile.rails-2.3.x.lock +53 -0
- data/test/config/Gemfile.rails-3.0.x +3 -0
- data/test/config/Gemfile.rails-3.0.x.lock +61 -0
- data/test/config/common.rb +29 -0
- data/test/connection/mysql/connection.rb +9 -0
- data/test/connection/mysql/schema.sql +37 -0
- data/test/connection/postgresql/connection.rb +13 -0
- data/test/connection/postgresql/schema.sql +40 -0
- data/test/control_test.rb +43 -0
- data/test/data/apache_combined_log.txt +3 -0
- data/test/data/bulk_import.txt +3 -0
- data/test/data/bulk_import_with_empties.txt +3 -0
- data/test/data/decode.txt +3 -0
- data/test/data/delimited.txt +3 -0
- data/test/data/encode_source_latin1.txt +2 -0
- data/test/data/excel.xls +0 -0
- data/test/data/excel2.xls +0 -0
- data/test/data/fixed_width.txt +3 -0
- data/test/data/multiple_delimited_1.txt +3 -0
- data/test/data/multiple_delimited_2.txt +3 -0
- data/test/data/nokogiri.xml +38 -0
- data/test/data/people.txt +3 -0
- data/test/data/sax.xml +14 -0
- data/test/data/xml.xml +16 -0
- data/test/database_join_processor_test.rb +43 -0
- data/test/date_dimension_builder_test.rb +96 -0
- data/test/delimited.ctl +30 -0
- data/test/delimited_absolute.ctl +31 -0
- data/test/delimited_destination_db.ctl +23 -0
- data/test/delimited_excel.ctl +31 -0
- data/test/delimited_insert_update.ctl +34 -0
- data/test/delimited_update.ctl +34 -0
- data/test/delimited_with_bulk_load.ctl +34 -0
- data/test/destination_test.rb +275 -0
- data/test/directive_test.rb +23 -0
- data/test/encode_processor_test.rb +32 -0
- data/test/engine_test.rb +78 -0
- data/test/ensure_fields_presence_processor_test.rb +28 -0
- data/test/errors.ctl +24 -0
- data/test/etl_test.rb +42 -0
- data/test/excel.ctl +24 -0
- data/test/excel2.ctl +25 -0
- data/test/fixed_width.ctl +35 -0
- data/test/foreign_key_lookup_transform_test.rb +50 -0
- data/test/generator_test.rb +14 -0
- data/test/inline_parser.ctl +17 -0
- data/test/mocks/mock_destination.rb +26 -0
- data/test/mocks/mock_source.rb +25 -0
- data/test/model_source.ctl +14 -0
- data/test/multiple_delimited.ctl +22 -0
- data/test/multiple_source_delimited.ctl +39 -0
- data/test/nokogiri_all.ctl +35 -0
- data/test/nokogiri_select.ctl +35 -0
- data/test/nokogiri_test.rb +35 -0
- data/test/parser_test.rb +224 -0
- data/test/performance/delimited.ctl +30 -0
- data/test/processor_test.rb +44 -0
- data/test/row_processor_test.rb +17 -0
- data/test/sax.ctl +26 -0
- data/test/scd/1.txt +1 -0
- data/test/scd/2.txt +1 -0
- data/test/scd/3.txt +1 -0
- data/test/scd_test.rb +257 -0
- data/test/scd_test_type_1.ctl +43 -0
- data/test/scd_test_type_2.ctl +34 -0
- data/test/screen_test.rb +9 -0
- data/test/screen_test_error.ctl +3 -0
- data/test/screen_test_fatal.ctl +3 -0
- data/test/source_test.rb +154 -0
- data/test/test_helper.rb +37 -0
- data/test/transform_test.rb +101 -0
- data/test/truncate_processor_test.rb +37 -0
- data/test/xml.ctl +31 -0
- metadata +370 -0
@@ -0,0 +1,25 @@
|
|
1
|
+
module ETL #:nodoc:
  module Processor #:nodoc:
    # Common parent of the pre and post processors. It only stores the control
    # object and the configuration it is given; subclasses are expected to
    # implement the +process+ method.
    class Processor
      # Build a processor.
      #
      # * <tt>control</tt>: the control object this processor belongs to
      # * <tt>configuration</tt>: the processor configuration
      #
      # When the instance publicly responds to +after_initialize+, that hook
      # is invoked once both values have been stored.
      def initialize(control, configuration)
        @control, @configuration = control, configuration
        after_initialize if respond_to?(:after_initialize)
      end

      protected

      # The control object given at construction time
      attr_reader :control

      # The configuration given at construction time
      attr_reader :configuration

      # The logger exposed by the engine
      def log
        Engine.logger
      end
    end
  end
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
module ETL #:nodoc:
  module Processor #:nodoc:
    # Row level processor to rename a field in the row.
    #
    # Configuration options:
    # * <tt>:source</tt>: the source field name
    # * <tt>:dest</tt>: The destination field name
    class RenameProcessor < ETL::Processor::RowProcessor
      # Store a copy of the value found under <tt>:source</tt> in the
      # <tt>:dest</tt> field, remove the <tt>:source</tt> field and return
      # the row. A missing source field results in a nil destination value.
      def process(row)
        source = configuration[:source]
        dest = configuration[:dest]
        row[dest] = copy_of(row[source])
        # Renaming a field onto itself must keep the field instead of
        # deleting the value that was just written.
        row.delete(source) unless source == dest
        row
      end

      private

      # Return a duplicate of +value+, or +value+ itself when it is numeric,
      # nil, or otherwise cannot be duplicated.
      def copy_of(value)
        case value
        when Numeric, nil
          value
        else
          begin
            value.dup
          rescue TypeError
            # Some immediates (e.g. symbols and booleans on older Rubies)
            # refuse to be dup'ed; sharing them is safe.
            value
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
module ETL #:nodoc:
  module Processor #:nodoc:
    # Row processor that only retains a row when every configured field
    # holds a non-blank value.
    class RequireNonBlankProcessor < ETL::Processor::RowProcessor
      # The list of fields that have to be non-blank
      attr_reader :fields

      # Set up the processor
      #
      # Options:
      # * <tt>:fields</tt>: An array of fields to check, for example:
      #   [:first_name,:last_name]
      #
      # No fields are checked when the option is absent.
      def initialize(control, configuration)
        super(control, configuration)
        configured = configuration[:fields]
        @fields = configured || []
      end

      # Return the row, or nil as soon as one of the fields is blank.
      def process(row)
        return nil if fields.any? { |name| row[name].blank? }
        row
      end
    end
  end
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
module ETL #:nodoc:
  module Processor #:nodoc:
    # Processor which processes a specific row. Unlike a transformer, which deals with a specific
    # value in the row, row processors can process an entire row at once, which can be used to
    # explode a single row into multiple rows (for example)
    class RowProcessor < Processor
      # Initialize the processor with its control object and configuration
      def initialize(control, configuration)
        super
      end

      # Process the specified row. This method must return the row.
      #
      # Abstract: this base implementation always raises a RuntimeError.
      def process(row)
        # The message names the method subclasses actually have to implement.
        raise "process is an abstract method"
      end

      # Ensure a given row keys include all the provided columns
      # and raise an error using the provided message if it doesn't
      #
      # * <tt>row</tt>: the row to inspect
      # * <tt>columns</tt>: the required keys; nil disables the check
      # * <tt>message</tt>: text appended to the error message
      #
      # Raises ETL::ControlError for the first missing column.
      def ensure_columns_available_in_row!(row, columns, message)
        return if columns.nil?
        columns.each do |k|
          # key? avoids building the full key array once per column
          next if row.key?(k)
          raise(ETL::ControlError, "Row missing required field #{k.inspect} #{message}")
        end
      end
    end
  end
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
module ETL #:nodoc:
  module Processor #:nodoc:
    # Row level processor that numbers rows with an increasing sequence.
    #
    # Configuration options:
    # * <tt>:context</tt>: A context name; each context has its own counter.
    #   When none is given all rows share the counter kept under the nil key.
    # * <tt>:dest</tt>: The destination field name
    class SequenceProcessor < ETL::Processor::RowProcessor
      # Write the next value of the context's sequence (starting at 1) into
      # the destination field and return the row.
      def process(row)
        context = configuration[:context]
        next_value = (sequences[context] || 0) + 1
        sequences[context] = next_value
        row[configuration[:dest]] = next_value
        row
      end

      protected

      # Lazily created Hash mapping a context to its last issued value
      def sequences
        @sequences = {} unless @sequences
        @sequences
      end
    end
  end
end
|
@@ -0,0 +1,63 @@
|
|
1
|
+
optional_require 'net/sftp'
|
2
|
+
|
3
|
+
module ETL
  module Processor
    # Custom processor to download files via SFTP
    class SftpDownloaderProcessor < ETL::Processor::Processor
      attr_reader :host
      attr_reader :port
      attr_reader :remote_dir
      attr_reader :files
      attr_reader :username
      attr_reader :local_dir

      # configuration options include:
      # * host - hostname or IP address of SFTP server (required)
      # * port - port number for SFTP server (default: 22)
      # * remote_dir - remote path on SFTP server (default: /)
      # * files - list of files to download from SFTP server (default: [])
      # * username - username for SFTP server authentication (default: anonymous)
      # * password - password for SFTP server authentication (default: nil)
      # * local_dir - local output directory to save downloaded files (default: '')
      #
      # As an example you might write something like the following in your control process file:
      #   pre_process :sftp_downloader, {
      #     :host => 'sftp.sec.gov',
      #     :remote_dir => 'edgar/Feed/2007/QTR2',
      #     :files => ['20070402.nc.tar.gz', '20070403.nc.tar.gz', '20070404.nc.tar.gz',
      #                '20070405.nc.tar.gz', '20070406.nc.tar.gz'],
      #     :local_dir => '/data/sec/2007/04',
      #   }
      # The above example will anonymously download via SFTP the first week's worth of SEC filing feed data
      # from the second quarter of 2007 and download the files to the local directory +/data/sec/2007/04+.
      def initialize(control, configuration)
        # Let the base class store control/configuration and run its
        # after_initialize hook, like the other processors do.
        super
        @host = configuration[:host]
        @port = configuration[:port] || 22
        @remote_dir = configuration[:remote_dir] || '/'
        @files = configuration[:files] || []
        @username = configuration[:username] || 'anonymous'
        @password = configuration[:password]
        @local_dir = configuration[:local_dir] || ''
      end

      # Open one SFTP session and download every configured file from the
      # remote directory into the local directory.
      def process
        Net::SFTP.start(@host, @username, {:port => @port, :password => @password}) do |conn|
          @files.each do |f|
            conn.download!(remote_file(f), local_file(f))
          end
        end
      end

      private
      attr_accessor :password

      # Path of the named file below the local directory
      def local_file(name)
        File.join(@local_dir, name)
      end

      # Path of the named file below the remote directory
      def remote_file(name)
        File.join(@remote_dir, name)
      end
    end
  end
end
|
@@ -0,0 +1,63 @@
|
|
1
|
+
optional_require 'net/sftp'
|
2
|
+
|
3
|
+
module ETL
  module Processor
    # Custom processor to upload files via SFTP
    class SftpUploaderProcessor < ETL::Processor::Processor
      attr_reader :host
      attr_reader :port
      attr_reader :remote_dir
      attr_reader :files
      attr_reader :username
      attr_reader :local_dir

      # configuration options include:
      # * host - hostname or IP address of SFTP server (required)
      # * port - port number for SFTP server (default: 22)
      # * remote_dir - remote path on SFTP server to upload into (default: /)
      # * files - list of files to upload to SFTP server (default: [])
      # * username - username for SFTP server authentication (default: anonymous)
      # * password - password for SFTP server authentication (default: nil)
      # * local_dir - local directory holding the files to upload (default: '')
      #
      # As an example you might write something like the following in your control process file:
      #   pre_process :sftp_uploader, {
      #     :host => 'sftp.sec.gov',
      #     :remote_dir => 'edgar/Feed/2007/QTR2',
      #     :files => ['20070402.nc.tar.gz', '20070403.nc.tar.gz', '20070404.nc.tar.gz',
      #                '20070405.nc.tar.gz', '20070406.nc.tar.gz'],
      #     :local_dir => '/data/sec/2007/04',
      #   }
      # The above example will upload via SFTP, as the anonymous user, the five listed files from the
      # local directory +/data/sec/2007/04+ to the remote directory +edgar/Feed/2007/QTR2+.
      def initialize(control, configuration)
        # Let the base class store control/configuration and run its
        # after_initialize hook, like the other processors do.
        super
        @host = configuration[:host]
        @port = configuration[:port] || 22
        @remote_dir = configuration[:remote_dir] || '/'
        @files = configuration[:files] || []
        @username = configuration[:username] || 'anonymous'
        @password = configuration[:password]
        @local_dir = configuration[:local_dir] || ''
      end

      # Open one SFTP session and upload every configured file from the
      # local directory into the remote directory.
      def process
        Net::SFTP.start(@host, @username, {:port => @port, :password => @password}) do |conn|
          @files.each do |f|
            conn.upload!(local_file(f), remote_file(f))
          end
        end
      end

      private
      attr_accessor :password

      # Path of the named file below the local directory
      def local_file(name)
        File.join(@local_dir, name)
      end

      # Path of the named file below the remote directory
      def remote_file(name)
        File.join(@remote_dir, name)
      end
    end
  end
end
|
@@ -0,0 +1,53 @@
|
|
1
|
+
module ETL #:nodoc:
  module Processor #:nodoc:
    # A row level processor that provides surrogate keys
    class SurrogateKeyProcessor < ETL::Processor::RowProcessor
      attr_accessor :destination
      attr_accessor :table
      attr_accessor :column
      attr_accessor :target

      # Initialize the surrogate key generator
      #
      # Configuration options
      # * <tt>:table</tt>: If specified, the table whose current maximum key
      #   is used as the starting point. Without it the sequence starts
      #   from 0, so the first generated key is 1.
      # * <tt>:column</tt>: The key column queried in the table (defaults to 'id')
      # * <tt>:target</tt>: The target connection
      # * <tt>:destination</tt>: The destination column name (defaults to :id)
      # * <tt>:query</tt>: No longer supported; passing it raises a
      #   ControlError. Use :column and :table instead.
      def initialize(control, configuration)
        super
        @table = configuration[:table]
        @column = configuration[:column] || 'id'
        @target = configuration[:target]
        if configuration[:query]
          raise ControlError, "Query option is no longer valid, use :column and :table instead"
        end
        if table
          @surrogate_key = ETL::Engine.connection(target).select_value("SELECT max(#{column}) FROM #{table_name}")
        end
        # No table configured, or no usable maximum returned: start from zero
        @surrogate_key = 0 if @surrogate_key.blank?
        @surrogate_key = @surrogate_key.to_i
        @destination = configuration[:destination] || :id
      end

      # Add a surrogate key to the row
      #
      # Returns the row, or nil (without consuming a key) when no row is given.
      def process(row)
        return unless row
        @surrogate_key += 1
        row[destination] = @surrogate_key
        row
      end

      private
      # The table name as resolved by the engine for the target connection
      def table_name
        ETL::Engine.table(table, ETL::Engine.connection(target))
      end
    end
  end
end
|
@@ -0,0 +1,40 @@
|
|
1
|
+
module ETL #:nodoc:
  module Processor #:nodoc:
    # Processor that truncates a table. Typically used as a pre-processor to
    # empty a table before it gets loaded.
    class TruncateProcessor < ETL::Processor::Processor
      # The table that will be truncated
      attr_reader :table

      # The database connection the truncate runs against
      attr_reader :target

      # Set up the processor
      #
      # Options:
      # * <tt>:target</tt>: The target connection
      # * <tt>:table</tt>: The table name
      # * <tt>:options</tt>: Optional truncate options
      def initialize(control, configuration)
        super(control, configuration)
        @table = configuration[:table]
        @options = configuration[:options]
        @target = configuration[:target] || {}
      end

      # Truncate the table on the target connection. On a PostgreSQL
      # connection the options fall back to 'RESTART IDENTITY' when none
      # were configured.
      def process
        connection = ETL::Engine.connection(target)
        postgres = connection.is_a?(ActiveRecord::ConnectionAdapters::PostgreSQLAdapter)
        @options ||= 'RESTART IDENTITY' if postgres
        connection.truncate(table_name, @options)
      end

      private

      # The table name as resolved by the engine for the target connection
      def table_name
        resolved_connection = ETL::Engine.connection(target)
        ETL::Engine.table(table, resolved_connection)
      end
    end
  end
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
optional_require 'zip/zip'
|
2
|
+
|
3
|
+
module ETL
  module Processor
    # Custom processor to zip files
    class ZipFileProcessor < ETL::Processor::Processor
      # Absolute Pathname of the file to zip
      attr_reader :infile
      # Name of the zip archive to write
      attr_reader :destination

      # configuration options include:
      # * infile - File to zip (required). A relative path is resolved
      #   against the current working directory.
      # * destination - Zip file name (default: #{infile}.zip)
      def initialize(control, configuration)
        # Let the base class store control/configuration and run its
        # after_initialize hook, like the other processors do.
        super
        path = Pathname.new(configuration[:infile])
        # Expand relative paths exactly once. Joining the expanded directory
        # with the relative path again would repeat any directory component
        # (e.g. "out/data.csv" would become "<cwd>/out/out/data.csv").
        @infile = path.absolute? ? path : Pathname.new(File.expand_path(configuration[:infile]))
        @destination = configuration[:destination] || "#{infile}.zip"
      end

      # Open (creating if necessary) the destination archive and add the
      # input file to it under its base name.
      def process
        Zip::ZipFile.open(@destination, Zip::ZipFile::CREATE) do |zipfile|
          zipfile.add(@infile.basename, @infile)
        end
      end

    end
  end
end
|
data/lib/etl/row.rb
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
# This source file contains the ETL::Row class.
|
2
|
+
|
3
|
+
module ETL #:nodoc:
  # This class represents a single row currently passing through the ETL pipeline
  class Row < Hash
    # Accessor for the originating source
    attr_accessor :source

    # All change types
    CHANGE_TYPES = [:insert, :update, :delete]

    # Writer for the row's change type. Only the writer is generated here:
    # the reader is defined explicitly below, and generating both would
    # trigger a "method redefined" warning.
    attr_writer :change_type

    # Get the change type, defaults to :insert
    def change_type
      @change_type ||= :insert
    end
  end
end
|
data/lib/etl/screen.rb
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
# This source file contains the ETL::Screen module and requires all of the
|
2
|
+
# screens
|
3
|
+
|
4
|
+
module ETL #:nodoc:
  # Namespace for the pre-built screens, which check the ETL state during
  # execution. A screen may be fatal (the whole ETL process terminates), an
  # error (only the current ETL control file terminates) or a warning (a
  # warning message is produced).
  module Screen
  end
end

# Load every screen implementation found in the screen/ directory next to this file
screen_pattern = "#{File.dirname(__FILE__)}/screen/*.rb"
Dir.glob(screen_pattern).each do |screen_file|
  require(screen_file)
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
module ETL
  module Screen
    # Screen comparing the number of rows written by the engine with the
    # expected count given in the configuration. The screen fails by
    # raising when the two differ.
    class RowCountScreen
      attr_accessor :control
      attr_accessor :configuration

      # Store the control and configuration, then run the check right away.
      #
      # Configuration options:
      # * <tt>:rows</tt>: the expected number of rows written
      def initialize(control, configuration={})
        @control, @configuration = control, configuration
        execute
      end

      # Raise a RuntimeError unless Engine.rows_written equals the
      # configured <tt>:rows</tt> value; return nil otherwise.
      def execute
        return if Engine.rows_written == configuration[:rows]
        raise "Rows written (#{Engine.rows_written}) does not match expected rows (#{configuration[:rows]})"
      end
    end
  end
end
|