activewarehouse-etl 0.9.1 → 0.9.5.rc1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (135) hide show
  1. data/.gitignore +7 -0
  2. data/0.9-UPGRADE +6 -0
  3. data/CHANGELOG +182 -150
  4. data/Gemfile +4 -0
  5. data/HOW_TO_RELEASE +9 -0
  6. data/README +18 -2
  7. data/Rakefile +35 -91
  8. data/active_support_logger.patch +78 -0
  9. data/activewarehouse-etl.gemspec +30 -0
  10. data/lib/etl.rb +10 -2
  11. data/lib/etl/batch/directives.rb +11 -1
  12. data/lib/etl/control/control.rb +2 -2
  13. data/lib/etl/control/destination.rb +27 -7
  14. data/lib/etl/control/destination/database_destination.rb +8 -6
  15. data/lib/etl/control/destination/excel_destination.rb +91 -0
  16. data/lib/etl/control/destination/file_destination.rb +6 -4
  17. data/lib/etl/control/destination/insert_update_database_destination.rb +133 -0
  18. data/lib/etl/control/destination/update_database_destination.rb +109 -0
  19. data/lib/etl/control/source.rb +3 -2
  20. data/lib/etl/control/source/database_source.rb +14 -10
  21. data/lib/etl/control/source/file_source.rb +2 -2
  22. data/lib/etl/engine.rb +17 -15
  23. data/lib/etl/execution.rb +0 -1
  24. data/lib/etl/execution/batch.rb +3 -1
  25. data/lib/etl/execution/migration.rb +5 -0
  26. data/lib/etl/parser/delimited_parser.rb +20 -1
  27. data/lib/etl/parser/excel_parser.rb +112 -0
  28. data/lib/etl/processor/bulk_import_processor.rb +4 -2
  29. data/lib/etl/processor/database_join_processor.rb +68 -0
  30. data/lib/etl/processor/escape_csv_processor.rb +77 -0
  31. data/lib/etl/processor/filter_row_processor.rb +51 -0
  32. data/lib/etl/processor/ftp_downloader_processor.rb +68 -0
  33. data/lib/etl/processor/ftp_uploader_processor.rb +65 -0
  34. data/lib/etl/processor/imapattachment_downloader_processor.rb +91 -0
  35. data/lib/etl/processor/pop3attachment_downloader_processor.rb +90 -0
  36. data/lib/etl/processor/sftp_downloader_processor.rb +63 -0
  37. data/lib/etl/processor/sftp_uploader_processor.rb +63 -0
  38. data/lib/etl/processor/zip_file_processor.rb +27 -0
  39. data/lib/etl/transform/calculation_transform.rb +71 -0
  40. data/lib/etl/transform/foreign_key_lookup_transform.rb +25 -7
  41. data/lib/etl/transform/ordinalize_transform.rb +3 -1
  42. data/lib/etl/transform/split_fields_transform.rb +27 -0
  43. data/lib/etl/version.rb +1 -7
  44. data/test-matrix.yml +10 -0
  45. data/test/.gitignore +1 -0
  46. data/test/.ignore +2 -0
  47. data/test/all.ebf +6 -0
  48. data/test/apache_combined_log.ctl +11 -0
  49. data/test/batch_test.rb +41 -0
  50. data/test/batch_with_error.ebf +6 -0
  51. data/test/batched1.ctl +0 -0
  52. data/test/batched2.ctl +0 -0
  53. data/test/block_processor.ctl +6 -0
  54. data/test/block_processor_error.ctl +1 -0
  55. data/test/block_processor_pre_post_process.ctl +4 -0
  56. data/test/block_processor_remove_rows.ctl +5 -0
  57. data/test/block_processor_test.rb +38 -0
  58. data/test/config/Gemfile.rails-2.3.x +3 -0
  59. data/test/config/Gemfile.rails-2.3.x.lock +38 -0
  60. data/test/config/Gemfile.rails-3.0.x +3 -0
  61. data/test/config/Gemfile.rails-3.0.x.lock +49 -0
  62. data/test/config/common.rb +21 -0
  63. data/test/connection/mysql/connection.rb +9 -0
  64. data/test/connection/mysql/schema.sql +36 -0
  65. data/test/connection/postgresql/connection.rb +13 -0
  66. data/test/connection/postgresql/schema.sql +39 -0
  67. data/test/control_test.rb +43 -0
  68. data/test/data/apache_combined_log.txt +3 -0
  69. data/test/data/bulk_import.txt +3 -0
  70. data/test/data/bulk_import_with_empties.txt +3 -0
  71. data/test/data/decode.txt +3 -0
  72. data/test/data/delimited.txt +3 -0
  73. data/test/data/encode_source_latin1.txt +2 -0
  74. data/test/data/excel.xls +0 -0
  75. data/test/data/excel2.xls +0 -0
  76. data/test/data/fixed_width.txt +3 -0
  77. data/test/data/multiple_delimited_1.txt +3 -0
  78. data/test/data/multiple_delimited_2.txt +3 -0
  79. data/test/data/people.txt +3 -0
  80. data/test/data/sax.xml +14 -0
  81. data/test/data/xml.xml +16 -0
  82. data/test/date_dimension_builder_test.rb +96 -0
  83. data/test/delimited.ctl +30 -0
  84. data/test/delimited_absolute.ctl +33 -0
  85. data/test/delimited_destination_db.ctl +25 -0
  86. data/test/delimited_excel.ctl +31 -0
  87. data/test/delimited_insert_update.ctl +34 -0
  88. data/test/delimited_update.ctl +34 -0
  89. data/test/delimited_with_bulk_load.ctl +34 -0
  90. data/test/destination_test.rb +275 -0
  91. data/test/directive_test.rb +23 -0
  92. data/test/encode_processor_test.rb +32 -0
  93. data/test/engine_test.rb +32 -0
  94. data/test/errors.ctl +24 -0
  95. data/test/etl_test.rb +42 -0
  96. data/test/excel.ctl +24 -0
  97. data/test/excel2.ctl +25 -0
  98. data/test/fixed_width.ctl +35 -0
  99. data/test/generator_test.rb +14 -0
  100. data/test/inline_parser.ctl +17 -0
  101. data/test/mocks/mock_destination.rb +26 -0
  102. data/test/mocks/mock_source.rb +25 -0
  103. data/test/model_source.ctl +14 -0
  104. data/test/multiple_delimited.ctl +22 -0
  105. data/test/multiple_source_delimited.ctl +39 -0
  106. data/test/parser_test.rb +224 -0
  107. data/test/performance/delimited.ctl +30 -0
  108. data/test/processor_test.rb +44 -0
  109. data/test/row_processor_test.rb +17 -0
  110. data/test/sax.ctl +26 -0
  111. data/test/scd/1.txt +1 -0
  112. data/test/scd/2.txt +1 -0
  113. data/test/scd/3.txt +1 -0
  114. data/test/scd_test.rb +257 -0
  115. data/test/scd_test_type_1.ctl +43 -0
  116. data/test/scd_test_type_2.ctl +34 -0
  117. data/test/screen_test.rb +9 -0
  118. data/test/screen_test_error.ctl +3 -0
  119. data/test/screen_test_fatal.ctl +3 -0
  120. data/test/source_test.rb +139 -0
  121. data/test/test_helper.rb +34 -0
  122. data/test/transform_test.rb +101 -0
  123. data/test/vendor/adapter_extensions-0.5.0/CHANGELOG +26 -0
  124. data/test/vendor/adapter_extensions-0.5.0/LICENSE +16 -0
  125. data/test/vendor/adapter_extensions-0.5.0/README +7 -0
  126. data/test/vendor/adapter_extensions-0.5.0/Rakefile +158 -0
  127. data/test/vendor/adapter_extensions-0.5.0/lib/adapter_extensions.rb +12 -0
  128. data/test/vendor/adapter_extensions-0.5.0/lib/adapter_extensions/connection_adapters/abstract_adapter.rb +44 -0
  129. data/test/vendor/adapter_extensions-0.5.0/lib/adapter_extensions/connection_adapters/mysql_adapter.rb +63 -0
  130. data/test/vendor/adapter_extensions-0.5.0/lib/adapter_extensions/connection_adapters/postgresql_adapter.rb +52 -0
  131. data/test/vendor/adapter_extensions-0.5.0/lib/adapter_extensions/connection_adapters/sqlserver_adapter.rb +44 -0
  132. data/test/vendor/adapter_extensions-0.5.0/lib/adapter_extensions/version.rb +10 -0
  133. data/test/xml.ctl +31 -0
  134. metadata +229 -70
  135. data/lib/etl/execution/record.rb +0 -18
@@ -0,0 +1,63 @@
1
+ require 'net/sftp'
2
+
3
module ETL
  module Processor
    # Custom processor to download files via SFTP
    class SftpDownloaderProcessor < ETL::Processor::Processor
      attr_reader :host
      attr_reader :port
      attr_reader :remote_dir
      attr_reader :files
      attr_reader :username
      attr_reader :local_dir

      # configuration options include:
      # * host - hostname or IP address of SFTP server (required)
      # * port - port number for SFTP server (default: 22)
      # * remote_dir - remote path on SFTP server (default: /)
      # * files - list of files to download from SFTP server (default: [])
      # * username - username for SFTP server authentication (default: anonymous)
      # * password - password for SFTP server authentication (default: nil)
      # * local_dir - local output directory to save downloaded files
      #   (default: '', meaning paths relative to the current working directory)
      #
      # As an example you might write something like the following in your control process file:
      #  pre_process :sftp_downloader, {
      #    :host => 'sftp.sec.gov',
      #    :remote_dir => 'edgar/Feed/2007/QTR2',
      #    :files => ['20070402.nc.tar.gz', '20070403.nc.tar.gz', '20070404.nc.tar.gz',
      #               '20070405.nc.tar.gz', '20070406.nc.tar.gz'],
      #    :local_dir => '/data/sec/2007/04',
      #  }
      # The above example will anonymously download via SFTP the first week's worth of SEC filing feed data
      # from the second quarter of 2007 and download the files to the local directory +/data/sec/2007/04+.
      def initialize(control, configuration)
        @host = configuration[:host]
        @port = configuration[:port] || 22
        @remote_dir = configuration[:remote_dir] || '/'
        @files = configuration[:files] || []
        @username = configuration[:username] || 'anonymous'
        @password = configuration[:password]
        @local_dir = configuration[:local_dir] || ''
      end

      # Opens one SFTP session and downloads every configured file, in order,
      # from +remote_dir+ into +local_dir+.
      def process
        Net::SFTP.start(@host, @username, {:port => @port, :password => @password}) do |conn|
          @files.each do |f|
            conn.download!(remote_file(f), local_file(f))
          end
        end
      end

      private
      attr_accessor :password

      # Local path for the given file name. An empty +local_dir+ must not be
      # passed to File.join: File.join('', 'x') is "/x", which would write the
      # download into the filesystem root instead of the working directory.
      def local_file(name)
        @local_dir.to_s.empty? ? name : File.join(@local_dir, name)
      end

      # Remote path for the given file name, relative to +remote_dir+.
      def remote_file(name)
        File.join(@remote_dir, name)
      end
    end
  end
end
@@ -0,0 +1,63 @@
1
+ require 'net/sftp'
2
+
3
module ETL
  module Processor
    # Custom processor to upload files via SFTP
    class SftpUploaderProcessor < ETL::Processor::Processor
      attr_reader :host
      attr_reader :port
      attr_reader :remote_dir
      attr_reader :files
      attr_reader :username
      attr_reader :local_dir

      # configuration options include:
      # * host - hostname or IP address of SFTP server (required)
      # * port - port number for SFTP server (default: 22)
      # * remote_dir - remote path on SFTP server to upload into (default: /)
      # * files - list of files to upload to SFTP server (default: [])
      # * username - username for SFTP server authentication (default: anonymous)
      # * password - password for SFTP server authentication (default: nil)
      # * local_dir - local directory containing the files to upload
      #   (default: '', meaning paths relative to the current working directory)
      #
      # As an example you might write something like the following in your control process file:
      #  pre_process :sftp_uploader, {
      #    :host => 'sftp.sec.gov',
      #    :remote_dir => 'edgar/Feed/2007/QTR2',
      #    :files => ['20070402.nc.tar.gz', '20070403.nc.tar.gz', '20070404.nc.tar.gz',
      #               '20070405.nc.tar.gz', '20070406.nc.tar.gz'],
      #    :local_dir => '/data/sec/2007/04',
      #  }
      # The above example will upload via SFTP the five listed files from the local
      # directory +/data/sec/2007/04+ to the remote directory +edgar/Feed/2007/QTR2+.
      def initialize(control, configuration)
        @host = configuration[:host]
        @port = configuration[:port] || 22
        @remote_dir = configuration[:remote_dir] || '/'
        @files = configuration[:files] || []
        @username = configuration[:username] || 'anonymous'
        @password = configuration[:password]
        @local_dir = configuration[:local_dir] || ''
      end

      # Opens one SFTP session and uploads every configured file, in order,
      # from +local_dir+ into +remote_dir+.
      def process
        Net::SFTP.start(@host, @username, {:port => @port, :password => @password}) do |conn|
          @files.each do |f|
            conn.upload!(local_file(f), remote_file(f))
          end
        end
      end

      private
      attr_accessor :password

      # Local path for the given file name. An empty +local_dir+ must not be
      # passed to File.join: File.join('', 'x') is "/x", which would look for
      # the file in the filesystem root instead of the working directory.
      def local_file(name)
        @local_dir.to_s.empty? ? name : File.join(@local_dir, name)
      end

      # Remote path for the given file name, relative to +remote_dir+.
      def remote_file(name)
        File.join(@remote_dir, name)
      end
    end
  end
end
@@ -0,0 +1,27 @@
1
+ require 'zip/zip'
2
+
3
module ETL
  module Processor
    # Custom processor to zip files
    class ZipFileProcessor < ETL::Processor::Processor
      attr_reader :infile
      attr_reader :destination

      # configuration options include:
      # * infile - File to zip (required). An absolute path is used as given;
      #   a relative path is expanded against the current working directory.
      # * destination - Zip file name (default: #{infile}.zip)
      def initialize(control, configuration)
        path = Pathname.new(configuration[:infile])
        # Expand the relative path itself. Joining it onto the dirname of its
        # own expansion duplicates any directory component
        # ('sub/a.txt' would become '<cwd>/sub/sub/a.txt').
        @infile = path.absolute? ? path : Pathname.new(File.expand_path(configuration[:infile]))
        @destination = configuration[:destination] || "#{infile}.zip"
      end

      # Adds +infile+ to the archive at +destination+ under its base name,
      # creating the archive if it does not exist yet.
      def process
        Zip::ZipFile.open(@destination, Zip::ZipFile::CREATE) do |zipfile|
          zipfile.add(@infile.basename, @infile)
        end
      end

    end
  end
end
@@ -0,0 +1,71 @@
1
require 'time' # Time.parse (used by the "date A" function) is defined by the stdlib 'time' extension

module ETL
  module Transform
    # Transform that computes a field from other fields of the row.
    #
    # Configuration options:
    # * function - one of the strings "A + B", "date A", "trim A", "lower A",
    #   "upper A" or "encoding A"; any other value leaves the row untouched
    # * fields - array of operands. For "A + B" each entry is either a String
    #   literal or a row key; for the other functions the first entry is the
    #   row key to read. "encoding A" additionally passes fields[1] and
    #   fields[2] to Iconv.conv as its first two arguments.
    class CalculationTransform < ETL::Transform::Transform
      attr_reader :function
      attr_reader :fields

      def initialize(control, name, configuration)
        @function = configuration[:function]
        @fields = configuration[:fields]
        super
      end

      # Applies the configured function, stores the result in row[name] and
      # returns row[name]. Returns nil when the row is nil or when the row has
      # no value under the first configured field.
      def transform(name, value, row)
        return nil if row.nil?
        return nil if row[@fields[0]].nil?

        case @function
        when "A + B"
          result = ""
          @fields.each do |field|
            next if field.nil?

            string = ""
            if field.to_s.eql? field
              # String entries are literals; anything else is treated as a row key.
              string = field
              begin
                # SECURITY NOTE(review): the literal is eval'd as a double-quoted
                # string so escapes/interpolation in the control file are expanded.
                # This executes arbitrary Ruby from the configuration - only safe
                # while control files are trusted. The local variable names of
                # this method are visible to the evaluated code, so do not rename
                # them or move this into a helper without checking control files.
                string = eval('"' + field + '"')
              rescue StandardError, SyntaxError
                # Fall back to the raw literal. SyntaxError must be listed
                # explicitly: it is a ScriptError, which a bare rescue does not
                # catch, so a literal containing a double quote used to abort.
              end
            else
              string = row[field]
            end
            next if string.nil?

            # to_s so that non-String row values (numbers, dates) concatenate
            # instead of raising TypeError.
            result = result + string.to_s
          end

          row[name] = result
        when "date A"
          first = row[@fields[0]]
          row[name] = Time.parse(first)
        when "trim A"
          first = row[@fields[0]]
          row[name] = first.strip
        when "lower A"
          first = row[@fields[0]]
          row[name] = first.downcase
        when "upper A"
          first = row[@fields[0]]
          row[name] = first.upcase
        when "encoding A"
          # Bug from ruby 1.8 http://po-ru.com/diary/fixing-invalid-utf-8-in-ruby-revisited/
          # A trailing space is appended before conversion and stripped afterwards.
          first = row[@fields[0]]
          row[name] = Iconv.conv(@fields[1], @fields[2], first + ' ')[0..-2]
        end

        row[name]
      end

    end
  end
end
@@ -95,10 +95,9 @@ class SQLResolver
95
95
  end
96
96
  def resolve(value)
97
97
  if @use_cache
98
- cache[value]
98
+ cache[cache_key(value)]
99
99
  else
100
- q = "SELECT id FROM #{table_name} WHERE #{@field} = #{@connection.quote(value)}"
101
- ETL::Engine.logger.debug("Executing query: #{q}")
100
+ q = "SELECT id FROM #{table_name} WHERE #{wheres(value)}"
102
101
  @connection.select_value(q)
103
102
  end
104
103
  end
@@ -110,11 +109,30 @@ class SQLResolver
110
109
  end
111
110
  def load_cache
112
111
  @use_cache = true
113
- q = "SELECT id, #{@field} FROM #{table_name}"
112
+ q = "SELECT id, #{field.join(', ')} FROM #{table_name}"
114
113
  @connection.select_all(q).each do |record|
115
- cache[record[@field]] = record['id']
114
+ cache[cache_key(record.values_at(*field))] = record['id']
116
115
  end
117
116
  end
117
+
118
+ private
119
# Returns the configured lookup column(s) as an Array. A scalar @field is
# wrapped in a one-element Array the first time this is called and the
# wrapped form is stored back into @field.
def field
  return @field if @field.kind_of?(Array)
  @field = [ @field ]
end
125
+
126
# Builds the cache key for a lookup value. The value is normalised to an
# Array first (the same way +wheres+ does): +load_cache+ stores keys built
# from an Array of column values, so a scalar passed to +resolve+ has to hash
# like the one-element Array or a single-column lookup would never hit the cache.
def cache_key(value)
  value = [ value ] unless value.kind_of?(Array)
  value.hash
end
129
+
130
# Builds the SQL WHERE conditions matching each lookup column against the
# corresponding entry of +value+ (a scalar is treated as a one-element
# Array). Values are quoted through the connection and the conditions are
# joined with AND.
def wheres(value)
  values = value.kind_of?(Array) ? value : [ value ]
  conditions = field.zip(values).collect do |column, column_value|
    "#{column} = #{@connection.quote(column_value)}"
  end
  conditions.join(' AND ')
end
118
136
  end
119
137
 
120
138
  class FlatFileResolver
@@ -132,7 +150,7 @@ class FlatFileResolver
132
150
 
133
151
  # Get the rows from the file specified in the initializer.
134
152
  def rows
135
- @rows ||= FasterCSV.read(@file)
153
+ @rows ||= CSV.read(@file)
136
154
  end
137
155
  protected :rows
138
156
 
@@ -148,4 +166,4 @@ class FlatFileResolver
148
166
  end
149
167
  nil
150
168
  end
151
- end
169
+ end
@@ -1,3 +1,5 @@
1
+ require 'active_support/core_ext/integer/inflections.rb'
2
+
1
3
  module ETL #:nodoc:
2
4
  module Transform #:nodoc:
3
5
  # Transform a number to an ordinalized version using the ActiveSupport ordinalize
@@ -9,4 +11,4 @@ module ETL #:nodoc:
9
11
  end
10
12
  end
11
13
  end
12
- end
14
+ end
@@ -0,0 +1,27 @@
1
module ETL
  module Transform
    # Transform that splits the value of a field on a delimiter and copies the
    # resulting pieces, by position, into other fields of the same row.
    class SplitFieldsTransform < ETL::Transform::Transform
      attr_reader :delimiter, :new_fields

      # Configuration options:
      # * delimiter - the separator handed to String#split (default: ',')
      # * new_fields - row keys that receive the pieces, in order
      def initialize(control, name, configuration)
        @new_fields = configuration[:new_fields]
        @delimiter = configuration[:delimiter] || ','
        super
      end

      # Splits row[name] and assigns piece N to the N-th entry of new_fields
      # (nil when there are fewer pieces than new fields). Returns row[name],
      # or nil when the row or row[name] is nil.
      def transform(name, value, row)
        return nil if row.nil? || row[name].nil?

        pieces = row[name].split(@delimiter)
        @new_fields.each_with_index { |target, position| row[target] = pieces[position] }

        row[name]
      end

    end
  end
end
@@ -1,9 +1,3 @@
1
1
  module ETL#:nodoc:
2
- module VERSION #:nodoc:
3
- MAJOR = 0
4
- MINOR = 9
5
- TINY = 1
6
-
7
- STRING = [MAJOR, MINOR, TINY].join('.')
8
- end
2
+ VERSION = "0.9.5.rc1"
9
3
  end
@@ -0,0 +1,10 @@
1
+ rvm:
2
+ - 1.8.7
3
+ - 1.9.2
4
+ # - jruby-1.6.2
5
+ rails:
6
+ - 2.3.x
7
+ - 3.0.x
8
+ database:
9
+ - mysql
10
+ - postgresql
@@ -0,0 +1 @@
1
+ database*.yml
@@ -0,0 +1,2 @@
1
+ database.yml
2
+ *.txt
@@ -0,0 +1,6 @@
1
+ # This is an ETL Batch File and defines a means for executing
2
+ # a collection of ETL scripts as a single process.
3
+
4
+ use_temp_tables
5
+ run 'batched1.ctl'
6
+ run 'batched2.ctl'
@@ -0,0 +1,11 @@
1
+ source :in, {
2
+ :file => 'data/apache_combined_log.txt',
3
+ :parser => :apache_combined_log
4
+ }
5
+
6
+ destination :out, {
7
+ :file => 'output/apache_combined_log.txt'
8
+ },
9
+ {
10
+ :order => []
11
+ }
@@ -0,0 +1,41 @@
1
+ require File.dirname(__FILE__) + '/test_helper'
2
+
3
# Checks the kinds of argument ETL::Batch::Batch.resolve accepts (a path, an
# open File, an existing Batch instance) and that anything else is rejected.
class BatchTest < Test::Unit::TestCase
  attr_reader :file, :db_yaml, :engine

  def setup
    here = File.dirname(__FILE__)
    @file = here + '/all.ebf'
    @db_yaml = here + '/database.yml'
    @engine = ETL::Engine.new
  end

  def teardown
  end

  # Placeholder: the command-line invocation below is intentionally disabled.
  def test_etl_batch_file
    #`etl #{file} -c #{db_yaml}`
  end

  # A batch can be resolved from a file name.
  def test_batch
    assert_nothing_raised do
      ETL::Batch::Batch.resolve(file, engine).execute
    end
  end

  # A batch can be resolved from an open File.
  def test_batch_with_file
    assert_nothing_raised do
      ETL::Batch::Batch.resolve(File.new(file), engine).execute
    end
  end

  # A batch can be resolved from an already constructed Batch.
  def test_batch_with_batch_object
    assert_nothing_raised do
      existing = ETL::Batch::Batch.new(File.new(file))
      existing.engine = engine
      ETL::Batch::Batch.resolve(existing, engine).execute
    end
  end

  # Any other kind of object is expected to raise a RuntimeError.
  def test_batch_with_object_should_fail
    assert_raise(RuntimeError) do
      ETL::Batch::Batch.resolve(0, engine)
    end
  end
end
@@ -0,0 +1,6 @@
1
+ # This is an ETL Batch File and defines a means for executing
2
+ # a collection of ETL scripts as a single process.
3
+
4
+ use_temp_tables
5
+ run 'delimited_with_bulk_load.ctl'
6
+ run 'screen_test_fatal.ctl'
File without changes
File without changes
@@ -0,0 +1,6 @@
1
+ source :in, { :type => :mock, :name => :block_processed_input }
2
+
3
+ after_read { |row| row[:added_by_after_read] = "after-" +row[:first_name]; row }
4
+ before_write { |row| row[:added_by_before_write] = "Row #{Engine.current_source_row}"; [row,{:new_row => 'added by post_processor'}] }
5
+
6
+ destination :out, { :type => :mock, :name => :block_processed_output }
@@ -0,0 +1 @@
1
+ pre_process { raise ControlError.new( "Cough!") }
@@ -0,0 +1,4 @@
1
+ source :in, { :type => :mock, :name => :another_input }
2
+ pre_process { TestWitness.call("I'm called from pre_process") }
3
+ post_process { TestWitness.call("I'm called from post_process") }
4
+ destination :out, { :type => :mock, :name => :another_output }