activewarehouse-etl-sgonyea 0.9.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +9 -0
- data/0.9-UPGRADE +6 -0
- data/CHANGELOG +236 -0
- data/Gemfile +4 -0
- data/HOW_TO_RELEASE +13 -0
- data/LICENSE +7 -0
- data/README.textile +111 -0
- data/Rakefile +103 -0
- data/TODO +28 -0
- data/active_support_logger.patch +78 -0
- data/activewarehouse-etl.gemspec +36 -0
- data/bin/etl +28 -0
- data/bin/etl.cmd +8 -0
- data/examples/database.example.yml +16 -0
- data/lib/etl.rb +97 -0
- data/lib/etl/batch.rb +2 -0
- data/lib/etl/batch/batch.rb +111 -0
- data/lib/etl/batch/directives.rb +65 -0
- data/lib/etl/builder.rb +2 -0
- data/lib/etl/builder/date_dimension_builder.rb +96 -0
- data/lib/etl/builder/time_dimension_builder.rb +31 -0
- data/lib/etl/commands/etl.rb +89 -0
- data/lib/etl/control.rb +3 -0
- data/lib/etl/control/control.rb +405 -0
- data/lib/etl/control/destination.rb +438 -0
- data/lib/etl/control/destination/csv_destination.rb +113 -0
- data/lib/etl/control/destination/database_destination.rb +97 -0
- data/lib/etl/control/destination/excel_destination.rb +91 -0
- data/lib/etl/control/destination/file_destination.rb +126 -0
- data/lib/etl/control/destination/insert_update_database_destination.rb +136 -0
- data/lib/etl/control/destination/update_database_destination.rb +109 -0
- data/lib/etl/control/destination/yaml_destination.rb +74 -0
- data/lib/etl/control/source.rb +132 -0
- data/lib/etl/control/source/database_source.rb +224 -0
- data/lib/etl/control/source/enumerable_source.rb +11 -0
- data/lib/etl/control/source/file_source.rb +90 -0
- data/lib/etl/control/source/model_source.rb +39 -0
- data/lib/etl/core_ext.rb +1 -0
- data/lib/etl/core_ext/time.rb +5 -0
- data/lib/etl/core_ext/time/calculations.rb +42 -0
- data/lib/etl/engine.rb +582 -0
- data/lib/etl/execution.rb +19 -0
- data/lib/etl/execution/base.rb +8 -0
- data/lib/etl/execution/batch.rb +10 -0
- data/lib/etl/execution/job.rb +8 -0
- data/lib/etl/execution/migration.rb +90 -0
- data/lib/etl/generator.rb +2 -0
- data/lib/etl/generator/generator.rb +20 -0
- data/lib/etl/generator/surrogate_key_generator.rb +39 -0
- data/lib/etl/http_tools.rb +139 -0
- data/lib/etl/parser.rb +11 -0
- data/lib/etl/parser/apache_combined_log_parser.rb +49 -0
- data/lib/etl/parser/csv_parser.rb +93 -0
- data/lib/etl/parser/excel_parser.rb +112 -0
- data/lib/etl/parser/fixed_width_parser.rb +65 -0
- data/lib/etl/parser/nokogiri_xml_parser.rb +83 -0
- data/lib/etl/parser/parser.rb +41 -0
- data/lib/etl/parser/sax_parser.rb +218 -0
- data/lib/etl/parser/xml_parser.rb +65 -0
- data/lib/etl/processor.rb +11 -0
- data/lib/etl/processor/block_processor.rb +14 -0
- data/lib/etl/processor/bulk_import_processor.rb +94 -0
- data/lib/etl/processor/check_exist_processor.rb +80 -0
- data/lib/etl/processor/check_unique_processor.rb +39 -0
- data/lib/etl/processor/copy_field_processor.rb +26 -0
- data/lib/etl/processor/database_join_processor.rb +82 -0
- data/lib/etl/processor/encode_processor.rb +55 -0
- data/lib/etl/processor/ensure_fields_presence_processor.rb +24 -0
- data/lib/etl/processor/escape_csv_processor.rb +77 -0
- data/lib/etl/processor/filter_row_processor.rb +51 -0
- data/lib/etl/processor/ftp_downloader_processor.rb +68 -0
- data/lib/etl/processor/ftp_uploader_processor.rb +65 -0
- data/lib/etl/processor/hierarchy_exploder_processor.rb +55 -0
- data/lib/etl/processor/imapattachment_downloader_processor.rb +91 -0
- data/lib/etl/processor/pop3attachment_downloader_processor.rb +90 -0
- data/lib/etl/processor/print_row_processor.rb +12 -0
- data/lib/etl/processor/processor.rb +25 -0
- data/lib/etl/processor/rename_processor.rb +24 -0
- data/lib/etl/processor/require_non_blank_processor.rb +26 -0
- data/lib/etl/processor/row_processor.rb +27 -0
- data/lib/etl/processor/sequence_processor.rb +23 -0
- data/lib/etl/processor/sftp_downloader_processor.rb +63 -0
- data/lib/etl/processor/sftp_uploader_processor.rb +63 -0
- data/lib/etl/processor/surrogate_key_processor.rb +53 -0
- data/lib/etl/processor/truncate_processor.rb +40 -0
- data/lib/etl/processor/zip_file_processor.rb +27 -0
- data/lib/etl/row.rb +20 -0
- data/lib/etl/screen.rb +14 -0
- data/lib/etl/screen/row_count_screen.rb +20 -0
- data/lib/etl/transform.rb +2 -0
- data/lib/etl/transform/block_transform.rb +13 -0
- data/lib/etl/transform/calculation_transform.rb +71 -0
- data/lib/etl/transform/date_to_string_transform.rb +20 -0
- data/lib/etl/transform/decode_transform.rb +51 -0
- data/lib/etl/transform/default_transform.rb +20 -0
- data/lib/etl/transform/foreign_key_lookup_transform.rb +211 -0
- data/lib/etl/transform/hierarchy_lookup_transform.rb +49 -0
- data/lib/etl/transform/md5_transform.rb +13 -0
- data/lib/etl/transform/ordinalize_transform.rb +14 -0
- data/lib/etl/transform/sha1_transform.rb +13 -0
- data/lib/etl/transform/split_fields_transform.rb +27 -0
- data/lib/etl/transform/string_to_date_time_transform.rb +14 -0
- data/lib/etl/transform/string_to_date_transform.rb +16 -0
- data/lib/etl/transform/string_to_time_transform.rb +11 -0
- data/lib/etl/transform/transform.rb +61 -0
- data/lib/etl/transform/trim_transform.rb +26 -0
- data/lib/etl/transform/type_transform.rb +35 -0
- data/lib/etl/util.rb +59 -0
- data/lib/etl/version.rb +3 -0
- data/test-matrix.yml +10 -0
- data/test/.gitignore +1 -0
- data/test/.ignore +2 -0
- data/test/all.ebf +6 -0
- data/test/apache_combined_log.ctl +11 -0
- data/test/batch_test.rb +41 -0
- data/test/batch_with_error.ebf +6 -0
- data/test/batched1.ctl +0 -0
- data/test/batched2.ctl +0 -0
- data/test/block_processor.ctl +6 -0
- data/test/block_processor_error.ctl +1 -0
- data/test/block_processor_pre_post_process.ctl +4 -0
- data/test/block_processor_remove_rows.ctl +5 -0
- data/test/block_processor_test.rb +38 -0
- data/test/check_exist_processor_test.rb +92 -0
- data/test/check_unique_processor_test.rb +40 -0
- data/test/config/Gemfile.rails-2.3.x +3 -0
- data/test/config/Gemfile.rails-2.3.x.lock +53 -0
- data/test/config/Gemfile.rails-3.0.x +3 -0
- data/test/config/Gemfile.rails-3.0.x.lock +61 -0
- data/test/config/common.rb +29 -0
- data/test/connection/mysql/connection.rb +9 -0
- data/test/connection/mysql/schema.sql +37 -0
- data/test/connection/postgresql/connection.rb +13 -0
- data/test/connection/postgresql/schema.sql +40 -0
- data/test/control_test.rb +43 -0
- data/test/data/apache_combined_log.txt +3 -0
- data/test/data/bulk_import.txt +3 -0
- data/test/data/bulk_import_with_empties.txt +3 -0
- data/test/data/decode.txt +3 -0
- data/test/data/delimited.txt +3 -0
- data/test/data/encode_source_latin1.txt +2 -0
- data/test/data/excel.xls +0 -0
- data/test/data/excel2.xls +0 -0
- data/test/data/fixed_width.txt +3 -0
- data/test/data/multiple_delimited_1.txt +3 -0
- data/test/data/multiple_delimited_2.txt +3 -0
- data/test/data/nokogiri.xml +38 -0
- data/test/data/people.txt +3 -0
- data/test/data/sax.xml +14 -0
- data/test/data/xml.xml +16 -0
- data/test/database_join_processor_test.rb +43 -0
- data/test/date_dimension_builder_test.rb +96 -0
- data/test/delimited.ctl +30 -0
- data/test/delimited_absolute.ctl +31 -0
- data/test/delimited_destination_db.ctl +23 -0
- data/test/delimited_excel.ctl +31 -0
- data/test/delimited_insert_update.ctl +34 -0
- data/test/delimited_update.ctl +34 -0
- data/test/delimited_with_bulk_load.ctl +34 -0
- data/test/destination_test.rb +275 -0
- data/test/directive_test.rb +23 -0
- data/test/encode_processor_test.rb +32 -0
- data/test/engine_test.rb +78 -0
- data/test/ensure_fields_presence_processor_test.rb +28 -0
- data/test/errors.ctl +24 -0
- data/test/etl_test.rb +42 -0
- data/test/excel.ctl +24 -0
- data/test/excel2.ctl +25 -0
- data/test/fixed_width.ctl +35 -0
- data/test/foreign_key_lookup_transform_test.rb +50 -0
- data/test/generator_test.rb +14 -0
- data/test/inline_parser.ctl +17 -0
- data/test/mocks/mock_destination.rb +26 -0
- data/test/mocks/mock_source.rb +25 -0
- data/test/model_source.ctl +14 -0
- data/test/multiple_delimited.ctl +22 -0
- data/test/multiple_source_delimited.ctl +39 -0
- data/test/nokogiri_all.ctl +35 -0
- data/test/nokogiri_select.ctl +35 -0
- data/test/nokogiri_test.rb +35 -0
- data/test/parser_test.rb +224 -0
- data/test/performance/delimited.ctl +30 -0
- data/test/processor_test.rb +44 -0
- data/test/row_processor_test.rb +17 -0
- data/test/sax.ctl +26 -0
- data/test/scd/1.txt +1 -0
- data/test/scd/2.txt +1 -0
- data/test/scd/3.txt +1 -0
- data/test/scd_test.rb +257 -0
- data/test/scd_test_type_1.ctl +43 -0
- data/test/scd_test_type_2.ctl +34 -0
- data/test/screen_test.rb +9 -0
- data/test/screen_test_error.ctl +3 -0
- data/test/screen_test_fatal.ctl +3 -0
- data/test/source_test.rb +154 -0
- data/test/test_helper.rb +37 -0
- data/test/transform_test.rb +101 -0
- data/test/truncate_processor_test.rb +37 -0
- data/test/xml.ctl +31 -0
- metadata +370 -0
@@ -0,0 +1,24 @@
|
|
1
|
+
module ETL
  module Processor
    # Row processor that verifies each configured field is present as a key
    # in every row and raises when any of them is missing.
    class EnsureFieldsPresenceProcessor < ETL::Processor::RowProcessor

      # Initialize the processor.
      #
      # Configuration options:
      # * <tt>:fields</tt>: An array of keys (a single key is also accepted)
      #   whose presence should be verified in each row
      #
      # Raises ControlError when <tt>:fields</tt> is not specified.
      def initialize(control, configuration)
        super
        raise ControlError, ":fields must be specified" unless configuration[:fields]
        # Array() lets a lone key be configured without wrapping it in an array.
        @fields = Array(configuration[:fields])
      end

      # Check that +row+ contains every required key.
      #
      # Returns the row untouched when nothing is missing; raises
      # ETL::ControlError naming the missing and the available keys otherwise.
      def process(row)
        # Use the list validated in the initializer rather than re-reading
        # the raw configuration on every row.
        missing_fields = @fields - row.keys
        unless missing_fields.empty?
          raise ETL::ControlError,
            "Row missing required field(s) #{missing_fields.join(',')} in row. Available fields are : #{row.keys.join(',')}"
        end
        row
      end
    end
  end
end
|
@@ -0,0 +1,77 @@
|
|
1
|
+
require 'fileutils'
|
2
|
+
|
3
|
+
module ETL #:nodoc:
  module Processor #:nodoc:
    # Processor that copies a text file line by line, applying regular
    # expression replacements to each line and optionally dropping lines whose
    # character counts do not match the configured expectations.
    class EscapeCsvProcessor < ETL::Processor::Processor

      # The file to load from
      attr_reader :source_file
      # The file to write to
      attr_reader :target_file
      # whether to use a temporary file or not
      attr_reader :use_temp_file

      # Array of hashes, each with a <tt>:replace</tt> pattern and a <tt>:result</tt> string
      attr_reader :filters
      # Array of hashes, each with a <tt>:char</tt> set and the expected <tt>:count</tt> (or nil)
      attr_reader :charcount

      # Initialize the processor.
      #
      # Configuration options:
      # * <tt>:source_file</tt>: The file to load data from
      # * <tt>:target_file</tt>: The file to write data to
      # * <tt>:file</tt>: short-cut which will set the same value to both source_file and target_file
      #   (the output is written to "<file>.tmp" and then moved over the source)
      # * <tt>:filters</tt>: replacement rules; defaults to replacing the pattern '\"' with '""'
      # * <tt>:charcount</tt>: optional rules; a line is written only when it
      #   contains exactly <tt>:count</tt> occurrences of <tt>:char</tt> for every rule
      #
      # Raises ControlError when the source or target file is missing, or when
      # both resolve to the same path.
      def initialize(control, configuration)
        super
        if configuration[:file]
          @use_temp_file = true
          configuration[:source_file] = configuration[:file]
          configuration[:target_file] = configuration[:file] + '.tmp'
        end
        # Validate before building Pathnames: Pathname.new(nil) raises a
        # TypeError, which would hide these more helpful errors.
        raise ControlError, "Source file must be specified" if configuration[:source_file].nil?
        raise ControlError, "Target file must be specified" if configuration[:target_file].nil?
        @source_file = absolute_path(configuration[:source_file])
        @target_file = absolute_path(configuration[:target_file])
        @filters = configuration[:filters] || [{:replace => '\"', :result => '""'}]
        @charcount = configuration[:charcount]
        raise ControlError, "Source and target file cannot currently point to the same file" if @source_file == @target_file
      end

      # Execute the processor: filter the source file into the target file and,
      # when the <tt>:file</tt> short-cut was used, replace the source with the result.
      def process
        # Block form closes both handles even when a filter raises.
        File.open(@source_file, 'r') do |reader|
          File.open(@target_file, 'w') do |writer|
            reader.each_line do |line|
              escaped = apply_filters(line)
              writer.write(escaped) if expected_char_counts?(escaped)
            end
          end
        end

        if use_temp_file
          FileUtils.rm(source_file)
          FileUtils.mv(target_file, source_file)
        end
      end

      private

      # Resolve +path+ to an absolute Pathname. Relative paths are expanded
      # against the current working directory; absolute paths are kept as given.
      def absolute_path(path)
        pathname = Pathname.new(path)
        pathname.absolute? ? pathname : Pathname.new(File.expand_path(path))
      end

      # Apply every complete filter (both :replace and :result present) in order.
      def apply_filters(line)
        return line if @filters.nil?
        @filters.inject(line) do |text, filter|
          if filter[:replace].nil? || filter[:result].nil?
            text
          else
            text.gsub(Regexp.new(filter[:replace]), filter[:result])
          end
        end
      end

      # True when +line+ satisfies every complete :charcount rule. Incomplete
      # rules (missing :char or :count) are ignored.
      def expected_char_counts?(line)
        return true if @charcount.nil?
        @charcount.all? do |rule|
          rule[:char].nil? || rule[:count].nil? || line.count(rule[:char]) == rule[:count]
        end
      end
    end
  end
end
|
@@ -0,0 +1,51 @@
|
|
1
|
+
module ETL
  module Processor
    # Row processor that evaluates a configured condition against each row.
    # Rows for which the condition holds are dropped (an empty array is
    # returned) unless an <tt>:outtrue</tt> snippet is configured.
    #
    # SECURITY NOTE: the condition operands and the <tt>:outtrue</tt> /
    # <tt>:outfalse</tt> snippets are passed to +eval+. They must only ever
    # come from trusted control files, never from external input.
    class FilterRowProcessor < ETL::Processor::RowProcessor
      attr_reader :condition
      attr_reader :outtrue
      attr_reader :outfalse

      # Initialize the processor.
      #
      # Configuration options:
      # * <tt>:condition</tt>: array of the form [operator, first, second];
      #   either operand may itself be such an array
      # * <tt>:outtrue</tt>: Ruby source evaluated when the condition holds
      # * <tt>:outfalse</tt>: Ruby source evaluated when the condition does not hold
      def initialize(control, configuration)
        @condition = configuration[:condition]
        @outtrue = configuration[:outtrue]
        @outfalse = configuration[:outfalse]
        super
      end

      # Process one row.
      #
      # Returns nil for a nil row, an empty array when the condition holds and
      # no <tt>:outtrue</tt> is configured, and the row itself otherwise. The
      # snippets are evaluated here, so they can refer to the local +row+.
      def process(row)
        return nil if row.nil?

        if eval_condition(row, @condition)
          return [] if @outtrue.nil?

          eval(@outtrue)
        elsif !@outfalse.nil?
          eval(@outfalse)
        end

        row
      end

      private

      # Recursively evaluate +cond+ ([operator, first, second]). Nested array
      # operands are evaluated first and their result is interpolated into the
      # expression. Any StandardError raised while evaluating counts as false.
      #
      # The local names +row+, +cond+, +first+ and +second+ are visible to the
      # evaluated expression, so they must not be renamed.
      def eval_condition(row, cond)
        first = cond[1]
        first = eval_condition(row, first) if first.is_a?(Array)

        second = cond[2]
        second = eval_condition(row, second) if second.is_a?(Array)

        # "!" is a prefix operator; everything else is treated as infix.
        return eval("#{cond[0]}#{first}#{second}") if cond[0] == "!"

        eval("#{first}#{cond[0]}#{second}")
      rescue StandardError
        false
      end

    end
  end
end
|
@@ -0,0 +1,68 @@
|
|
1
|
+
# Written by Susan Potter under open source MIT license.
|
2
|
+
# August 12, 2007.
|
3
|
+
|
4
|
+
require 'net/ftp'
|
5
|
+
|
6
|
+
module ETL
  module Processor
    # Custom processor to download files via FTP
    class FtpDownloaderProcessor < ETL::Processor::Processor
      attr_reader :host
      attr_reader :port
      attr_reader :remote_dir
      attr_reader :files
      attr_reader :username
      attr_reader :local_dir

      # configuration options include:
      # * host - hostname or IP address of FTP server (required)
      # * port - port number for FTP server (default: 21)
      # * remote_dir - remote path on FTP server (default: /)
      # * files - list of files to download from FTP server (default: [])
      # * username - username for FTP server authentication (default: anonymous)
      # * password - password for FTP server authentication (default: nil)
      # * local_dir - local output directory to save downloaded files
      #   (default: '', meaning the current working directory)
      #
      # As an example you might write something like the following in your control process file:
      #  pre_process :ftp_downloader, {
      #    :host => 'ftp.sec.gov',
      #    :remote_dir => 'edgar/Feed/2007/QTR2',
      #    :files => ['20070402.nc.tar.gz', '20070403.nc.tar.gz', '20070404.nc.tar.gz',
      #               '20070405.nc.tar.gz', '20070406.nc.tar.gz'],
      #    :local_dir => '/data/sec/2007/04',
      #  }
      # The above example will anonymously download via FTP the first week's worth of SEC filing feed data
      # from the second quarter of 2007 and download the files to the local directory +/data/sec/2007/04+.
      def initialize(control, configuration)
        # Let the base class record control/configuration, as the other processors do.
        super
        @host = configuration[:host]
        @port = configuration[:port] || 21
        @remote_dir = configuration[:remote_dir] || '/'
        @files = configuration[:files] || []
        @username = configuration[:username] || 'anonymous'
        @password = configuration[:password]
        @local_dir = configuration[:local_dir] || ''
      end

      # Connect, log in and fetch every configured file in binary mode.
      # The connection is closed even when a transfer fails.
      def process
        # Net::FTP.open(host) would already connect on the default port; build
        # an unconnected client so that the configured port is the only one used.
        conn = Net::FTP.new
        begin
          conn.connect(@host, @port)
          conn.login(@username, @password)
          @files.each do |f|
            conn.getbinaryfile(remote_file(f), local_file(f))
          end
        ensure
          conn.close
        end
      end

      private
      attr_accessor :password

      # Local destination path for +name+.
      def local_file(name)
        # File.join('', name) yields "/name" (the filesystem root), so an empty
        # local_dir is treated as "relative to the working directory".
        @local_dir.to_s.empty? ? name : File.join(@local_dir, name)
      end

      # Remote path of +name+ below the configured remote directory.
      def remote_file(name)
        File.join(@remote_dir, name)
      end
    end
  end
end
|
@@ -0,0 +1,65 @@
|
|
1
|
+
require 'net/ftp'
|
2
|
+
|
3
|
+
module ETL
  module Processor
    # Custom processor to upload files via FTP
    class FtpUploaderProcessor < ETL::Processor::Processor
      attr_reader :host
      attr_reader :port
      attr_reader :remote_dir
      attr_reader :files
      attr_reader :username
      attr_reader :local_dir

      # configuration options include:
      # * host - hostname or IP address of FTP server (required)
      # * port - port number for FTP server (default: 21)
      # * remote_dir - remote path on FTP server to upload into (default: /)
      # * files - list of files to upload to the FTP server (default: [])
      # * username - username for FTP server authentication (default: anonymous)
      # * password - password for FTP server authentication (default: nil)
      # * local_dir - local directory containing the files to upload
      #   (default: '', meaning the current working directory)
      #
      # As an example you might write something like the following in your control process file:
      #  pre_process :ftp_uploader, {
      #    :host => 'ftp.sec.gov',
      #    :remote_dir => 'edgar/Feed/2007/QTR2',
      #    :files => ['20070402.nc.tar.gz', '20070403.nc.tar.gz', '20070404.nc.tar.gz',
      #               '20070405.nc.tar.gz', '20070406.nc.tar.gz'],
      #    :local_dir => '/data/sec/2007/04',
      #  }
      # The above example will anonymously upload via FTP the listed files from the local
      # directory +/data/sec/2007/04+ to the remote directory +edgar/Feed/2007/QTR2+.
      def initialize(control, configuration)
        # Let the base class record control/configuration, as the other processors do.
        super
        @host = configuration[:host]
        @port = configuration[:port] || 21
        @remote_dir = configuration[:remote_dir] || '/'
        @files = configuration[:files] || []
        @username = configuration[:username] || 'anonymous'
        @password = configuration[:password]
        @local_dir = configuration[:local_dir] || ''
      end

      # Connect, log in and send every configured file in binary mode.
      # The connection is closed even when a transfer fails.
      def process
        # Net::FTP.open(host) would already connect on the default port; build
        # an unconnected client so that the configured port is the only one used.
        conn = Net::FTP.new
        begin
          conn.connect(@host, @port)
          conn.login(@username, @password)
          @files.each do |f|
            conn.putbinaryfile(local_file(f), remote_file(f))
          end
        ensure
          conn.close
        end
      end

      private
      attr_accessor :password

      # Local source path for +name+.
      def local_file(name)
        # File.join('', name) yields "/name" (the filesystem root), so an empty
        # local_dir is treated as "relative to the working directory".
        @local_dir.to_s.empty? ? name : File.join(@local_dir, name)
      end

      # Remote path of +name+ below the configured remote directory.
      def remote_file(name)
        File.join(@remote_dir, name)
      end
    end
  end
end
|
@@ -0,0 +1,55 @@
|
|
1
|
+
module ETL #:nodoc:
  module Processor #:nodoc:
    # Row-level processor that will convert a single row into multiple rows designed to be inserted
    # into a hierarchy bridge table.
    class HierarchyExploderProcessor < ETL::Processor::RowProcessor
      attr_accessor :id_field
      attr_accessor :parent_id_field

      # Initialize the processor
      #
      # Configuration options:
      # * <tt>:target</tt>: The target passed to ETL::Engine.connection to obtain the connection
      # * <tt>:table</tt>: The table holding the hierarchy to walk
      # * <tt>:id_field</tt>: The name of the id field (defaults to 'id')
      # * <tt>:parent_id_field</tt>: The name of the parent id field (defaults to 'parent_id')
      #
      # TODO: Allow resolver to be implemented in a customizable fashion, i.e. don't rely
      # on AR as the only resolution method.
      def initialize(control, configuration={})
        @id_field = configuration[:id_field] || 'id'
        @parent_id_field = configuration[:parent_id_field] || 'parent_id'
        super
      end

      # Process the row expanding it into hierarchy values.
      #
      # Reads <tt>row[:id]</tt> and <tt>row[:parent_id]</tt> and returns an
      # array with one bridge row for the row itself (level 0) plus one for
      # each descendant found in the configured table.
      def process(row)
        rows = []
        target = configuration[:target]
        table = configuration[:table]
        conn = ETL::Engine.connection(target)
        build_rows([row[:id]], row[:id], row[:id], row[:parent_id].nil?, 0, rows, table, conn)
        rows
      end

      protected
      # Recursive function that will add a row for the current level and then call build_rows
      # for all of the children of the current level.
      #
      # +ids+ are the ids at the current level, +row_id+ is the id of the row being
      # exploded (stored as :parent_id of every bridge row), +root+ tells whether that
      # row is a top-level row and +rows+ is the accumulator the bridge rows are appended to.
      # +parent_id+ is accepted for signature compatibility but is not used.
      #
      # NOTE(review): there is no cycle detection; a cyclic parent/child
      # relationship in the table would presumably recurse without end.
      def build_rows(ids, parent_id, row_id, root, level, rows, table, conn)
        ids.each do |id|
          # Quote the id through the adapter so non-numeric ids produce valid,
          # injection-safe SQL.
          child_ids = conn.select_values(
            "SELECT #{id_field} FROM #{table} WHERE #{parent_id_field} = #{conn.quote(id)}"
          )

          rows << {
            :parent_id => row_id,
            :child_id => id,
            :num_levels_from_parent => level,
            :is_bottom => (child_ids.empty? ? 1 : 0),
            :is_top => (root ? 1 : 0)
          }

          build_rows(child_ids, id, row_id, false, level + 1, rows, table, conn)
        end
      end
    end
  end
end
|
@@ -0,0 +1,91 @@
|
|
1
|
+
optional_require 'net/imap'
|
2
|
+
optional_require 'tmail'
|
3
|
+
|
4
|
+
module ETL
  module Processor
    # Custom processor to download files via Imap Attachment
    class ImapattachmentDownloaderProcessor < ETL::Processor::Processor
      attr_reader :host
      attr_reader :ssl
      attr_reader :port
      attr_reader :delete
      attr_reader :filters
      attr_reader :folder
      attr_reader :username
      attr_reader :local_dir

      # configuration options include:
      # * host - hostname or IP address of IMAP server (required)
      # * ssl - activate encryption (default false)
      # * port - port number for IMAP server (default: 143 or 993)
      # * delete - delete message after reading (default false)
      # * filters - filter mails (default []); an array of the form
      #   [operator, first, second] whose operands may be nested arrays
      # * folder - folder to select mails from (default INBOX)
      # * username - username for IMAP server authentication (default: anonymous)
      # * password - password for IMAP server authentication (default: nil)
      # * local_dir - local output directory to save downloaded files
      #   (default: '', meaning the current working directory)
      #
      # SECURITY NOTE: filter operands are passed to +eval+ and must only come
      # from trusted control files.
      def initialize(control, configuration)
        # Let the base class record control/configuration, as the other processors do.
        super
        @host = configuration[:host]
        @ssl = configuration[:ssl] || false
        # 143 is the standard IMAP port (220 is the unrelated IMAP3 service).
        @port = configuration[:port] || (@ssl ? 993 : 143)
        @delete = configuration[:delete] || false
        @filters = configuration[:filters] || []
        @folder = configuration[:folder] || 'INBOX'
        @username = configuration[:username] || 'anonymous'
        @password = configuration[:password]
        @local_dir = configuration[:local_dir] || ''
      end

      # Save the attachments of every non-deleted message in the folder that
      # passes the filters, optionally flagging those messages as deleted.
      # The connection is dropped even when processing fails.
      def process
        conn = Net::IMAP.new(@host, @port, @ssl)
        begin
          conn.login(@username, @password)

          conn.select(@folder)
          conn.uid_search(["NOT", "DELETED"]).each do |msguuid|
            mail = TMail::Mail.parse(conn.uid_fetch(msguuid, 'RFC822').first.attr['RFC822'])
            next if mail.attachments.blank?
            next unless applyfilter(mail, @filters)

            mail.attachments.each do |attachment|
              # Binary mode so attachment bytes are written untouched.
              File.open(local_file(attachment.original_filename), "wb") do |f|
                f << attachment.gets(nil)
              end
            end

            # The ids come from uid_search, so the UID variant of STORE is
            # required; plain store would address by sequence number.
            conn.uid_store(msguuid, "+FLAGS", [:Deleted]) if @delete
          end
          conn.expunge
          conn.close
        ensure
          conn.disconnect unless conn.disconnected?
        end
      end

      private
      attr_accessor :password

      # Local destination path for the attachment called +name+.
      def local_file(name)
        # The name is chosen by the mail sender: keep only its last component
        # so it cannot point outside the target directory.
        safe_name = File.basename(name)
        # File.join('', name) yields "/name" (the filesystem root), so an empty
        # local_dir is treated as "relative to the working directory".
        @local_dir.to_s.empty? ? safe_name : File.join(@local_dir, safe_name)
      end

      # Evaluate the filter +cond+ ([operator, first, second]) for +mail+.
      # A nil filter or one with fewer than three elements accepts every mail.
      # Nested array operands are evaluated recursively; any StandardError
      # raised while evaluating rejects the mail.
      #
      # The local names +mail+, +cond+, +first+ and +second+ are visible to the
      # evaluated expression, so they must not be renamed.
      def applyfilter(mail, cond)
        return true if cond.nil? || cond.size < 3

        first = cond[1]
        first = applyfilter(mail, first) if first.is_a?(Array)

        second = cond[2]
        second = applyfilter(mail, second) if second.is_a?(Array)

        # "!" is a prefix operator; everything else is treated as infix.
        return eval("#{cond[0]}#{first}#{second}") if cond[0] == "!"

        eval("#{first}#{cond[0]}#{second}")
      rescue StandardError
        false
      end
    end
  end
end
|
@@ -0,0 +1,90 @@
|
|
1
|
+
optional_require 'net/pop'
|
2
|
+
optional_require 'tmail'
|
3
|
+
|
4
|
+
module ETL
  module Processor
    # Custom processor to download files via Pop3 Attachment
    class Pop3attachmentDownloaderProcessor < ETL::Processor::Processor
      attr_reader :host
      attr_reader :ssl
      attr_reader :port
      attr_reader :delete
      attr_reader :filters
      attr_reader :username
      attr_reader :local_dir

      # configuration options include:
      # * host - hostname or IP address of POP3 server (required)
      # * ssl - activate encryption (default false)
      # * port - port number for POP3 server (default: Net::POP3.default_port or Net::POP3.default_pop3s_port)
      # * delete - delete message after reading (default false)
      # * filters - filter mails (default []); an array of the form
      #   [operator, first, second] whose operands may be nested arrays
      # * username - username for POP3 server authentication (default: anonymous)
      # * password - password for POP3 server authentication (default: nil)
      # * local_dir - local output directory to save downloaded files
      #   (default: '', meaning the current working directory)
      #
      # SECURITY NOTE: filter operands are passed to +eval+ and must only come
      # from trusted control files.
      def initialize(control, configuration)
        # Let the base class record control/configuration, as the other processors do.
        super
        @host = configuration[:host]
        @ssl = configuration[:ssl] || false
        @port = configuration[:port] || (@ssl ? Net::POP3.default_pop3s_port : Net::POP3.default_port)
        @delete = configuration[:delete] || false
        @filters = configuration[:filters] || []
        @username = configuration[:username] || 'anonymous'
        @password = configuration[:password]
        @local_dir = configuration[:local_dir] || ''
      end

      # Save the attachments of every message that passes the filters,
      # optionally marking those messages for deletion. The session is
      # finished even when processing a message fails.
      def process
        # NOTE(review): this is a class-level switch that affects every
        # Net::POP3 connection in the process and disables certificate
        # verification. Kept for compatibility; consider a per-connection,
        # verifying setup.
        Net::POP3.enable_ssl(OpenSSL::SSL::VERIFY_NONE) if @ssl
        conn = Net::POP3.new(@host, @port)
        conn.start(@username, @password)
        begin
          conn.each_mail do |message|
            mail = TMail::Mail.parse(message.pop)
            next if mail.attachments.blank?
            next unless applyfilter(mail, @filters)

            mail.attachments.each do |attachment|
              # Binary mode so attachment bytes are written untouched.
              File.open(local_file(attachment.original_filename), "wb") do |f|
                f << attachment.gets(nil)
              end
            end

            message.delete if @delete
          end
        ensure
          conn.finish
        end
      end

      private
      attr_accessor :password

      # Local destination path for the attachment called +name+.
      def local_file(name)
        # The name is chosen by the mail sender: keep only its last component
        # so it cannot point outside the target directory.
        safe_name = File.basename(name)
        # File.join('', name) yields "/name" (the filesystem root), so an empty
        # local_dir is treated as "relative to the working directory".
        @local_dir.to_s.empty? ? safe_name : File.join(@local_dir, safe_name)
      end

      # Evaluate the filter +cond+ ([operator, first, second]) for +mail+.
      # A nil filter or one with fewer than three elements accepts every mail.
      # Nested array operands are evaluated recursively; any StandardError
      # raised while evaluating rejects the mail.
      #
      # The local names +mail+, +cond+, +first+ and +second+ are visible to the
      # evaluated expression, so they must not be renamed.
      def applyfilter(mail, cond)
        return true if cond.nil? || cond.size < 3

        first = cond[1]
        first = applyfilter(mail, first) if first.is_a?(Array)

        second = cond[2]
        second = applyfilter(mail, second) if second.is_a?(Array)

        # "!" is a prefix operator; everything else is treated as infix.
        return eval("#{cond[0]}#{first}#{second}") if cond[0] == "!"

        eval("#{first}#{cond[0]}#{second}")
      rescue StandardError
        false
      end
    end
  end
end
|