etl 0.9.5.rc1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (215) hide show
  1. data/.gitignore +12 -0
  2. data/.yardopts +5 -0
  3. data/0.9-UPGRADE +6 -0
  4. data/CHANGELOG +236 -0
  5. data/Gemfile +4 -0
  6. data/HOW_TO_RELEASE +13 -0
  7. data/LICENSE +7 -0
  8. data/README.textile +111 -0
  9. data/Rakefile +105 -0
  10. data/TODO +28 -0
  11. data/activewarehouse-etl.gemspec +38 -0
  12. data/bin/etl +28 -0
  13. data/bin/etl.cmd +8 -0
  14. data/examples/database.example.yml +16 -0
  15. data/lib/etl.rb +97 -0
  16. data/lib/etl/batch.rb +2 -0
  17. data/lib/etl/batch/batch.rb +111 -0
  18. data/lib/etl/batch/directives.rb +65 -0
  19. data/lib/etl/builder.rb +2 -0
  20. data/lib/etl/builder/date_dimension_builder.rb +96 -0
  21. data/lib/etl/builder/time_dimension_builder.rb +31 -0
  22. data/lib/etl/commands/etl.rb +89 -0
  23. data/lib/etl/control.rb +3 -0
  24. data/lib/etl/control/control.rb +405 -0
  25. data/lib/etl/control/destination.rb +438 -0
  26. data/lib/etl/control/destination/csv_destination.rb +113 -0
  27. data/lib/etl/control/destination/database_destination.rb +97 -0
  28. data/lib/etl/control/destination/excel_destination.rb +91 -0
  29. data/lib/etl/control/destination/file_destination.rb +126 -0
  30. data/lib/etl/control/destination/insert_update_database_destination.rb +136 -0
  31. data/lib/etl/control/destination/update_database_destination.rb +109 -0
  32. data/lib/etl/control/destination/yaml_destination.rb +74 -0
  33. data/lib/etl/control/source.rb +132 -0
  34. data/lib/etl/control/source/database_source.rb +224 -0
  35. data/lib/etl/control/source/enumerable_source.rb +11 -0
  36. data/lib/etl/control/source/file_source.rb +90 -0
  37. data/lib/etl/control/source/model_source.rb +39 -0
  38. data/lib/etl/core_ext.rb +1 -0
  39. data/lib/etl/core_ext/time.rb +5 -0
  40. data/lib/etl/core_ext/time/calculations.rb +42 -0
  41. data/lib/etl/engine.rb +582 -0
  42. data/lib/etl/execution.rb +19 -0
  43. data/lib/etl/execution/base.rb +8 -0
  44. data/lib/etl/execution/batch.rb +10 -0
  45. data/lib/etl/execution/job.rb +8 -0
  46. data/lib/etl/execution/migration.rb +90 -0
  47. data/lib/etl/generator.rb +2 -0
  48. data/lib/etl/generator/generator.rb +20 -0
  49. data/lib/etl/generator/surrogate_key_generator.rb +39 -0
  50. data/lib/etl/http_tools.rb +139 -0
  51. data/lib/etl/parser.rb +11 -0
  52. data/lib/etl/parser/apache_combined_log_parser.rb +49 -0
  53. data/lib/etl/parser/csv_parser.rb +93 -0
  54. data/lib/etl/parser/excel_parser.rb +112 -0
  55. data/lib/etl/parser/fixed_width_parser.rb +65 -0
  56. data/lib/etl/parser/nokogiri_xml_parser.rb +83 -0
  57. data/lib/etl/parser/parser.rb +41 -0
  58. data/lib/etl/parser/sax_parser.rb +218 -0
  59. data/lib/etl/parser/xml_parser.rb +65 -0
  60. data/lib/etl/processor.rb +11 -0
  61. data/lib/etl/processor/block_processor.rb +14 -0
  62. data/lib/etl/processor/bulk_import_processor.rb +94 -0
  63. data/lib/etl/processor/check_exist_processor.rb +80 -0
  64. data/lib/etl/processor/check_unique_processor.rb +39 -0
  65. data/lib/etl/processor/copy_field_processor.rb +26 -0
  66. data/lib/etl/processor/database_join_processor.rb +82 -0
  67. data/lib/etl/processor/encode_processor.rb +55 -0
  68. data/lib/etl/processor/ensure_fields_presence_processor.rb +24 -0
  69. data/lib/etl/processor/escape_csv_processor.rb +77 -0
  70. data/lib/etl/processor/filter_row_processor.rb +51 -0
  71. data/lib/etl/processor/ftp_downloader_processor.rb +68 -0
  72. data/lib/etl/processor/ftp_uploader_processor.rb +65 -0
  73. data/lib/etl/processor/hierarchy_exploder_processor.rb +55 -0
  74. data/lib/etl/processor/imapattachment_downloader_processor.rb +91 -0
  75. data/lib/etl/processor/pop3attachment_downloader_processor.rb +90 -0
  76. data/lib/etl/processor/print_row_processor.rb +12 -0
  77. data/lib/etl/processor/processor.rb +25 -0
  78. data/lib/etl/processor/rename_processor.rb +24 -0
  79. data/lib/etl/processor/require_non_blank_processor.rb +26 -0
  80. data/lib/etl/processor/row_processor.rb +27 -0
  81. data/lib/etl/processor/sequence_processor.rb +23 -0
  82. data/lib/etl/processor/sftp_downloader_processor.rb +63 -0
  83. data/lib/etl/processor/sftp_uploader_processor.rb +63 -0
  84. data/lib/etl/processor/surrogate_key_processor.rb +53 -0
  85. data/lib/etl/processor/truncate_processor.rb +40 -0
  86. data/lib/etl/processor/zip_file_processor.rb +27 -0
  87. data/lib/etl/row.rb +20 -0
  88. data/lib/etl/screen.rb +14 -0
  89. data/lib/etl/screen/row_count_screen.rb +20 -0
  90. data/lib/etl/transform.rb +2 -0
  91. data/lib/etl/transform/block_transform.rb +13 -0
  92. data/lib/etl/transform/calculation_transform.rb +71 -0
  93. data/lib/etl/transform/date_to_string_transform.rb +20 -0
  94. data/lib/etl/transform/decode_transform.rb +51 -0
  95. data/lib/etl/transform/default_transform.rb +20 -0
  96. data/lib/etl/transform/foreign_key_lookup_transform.rb +211 -0
  97. data/lib/etl/transform/hierarchy_lookup_transform.rb +49 -0
  98. data/lib/etl/transform/md5_transform.rb +13 -0
  99. data/lib/etl/transform/ordinalize_transform.rb +14 -0
  100. data/lib/etl/transform/sha1_transform.rb +13 -0
  101. data/lib/etl/transform/split_fields_transform.rb +27 -0
  102. data/lib/etl/transform/string_to_date_time_transform.rb +14 -0
  103. data/lib/etl/transform/string_to_date_transform.rb +16 -0
  104. data/lib/etl/transform/string_to_time_transform.rb +11 -0
  105. data/lib/etl/transform/transform.rb +61 -0
  106. data/lib/etl/transform/trim_transform.rb +26 -0
  107. data/lib/etl/transform/type_transform.rb +35 -0
  108. data/lib/etl/util.rb +59 -0
  109. data/lib/etl/version.rb +3 -0
  110. data/spec/fixtures/all.ebf +6 -0
  111. data/spec/fixtures/apache_combined_log.ctl +11 -0
  112. data/spec/fixtures/batch_with_error.ebf +6 -0
  113. data/spec/fixtures/batched1.ctl +0 -0
  114. data/spec/fixtures/batched2.ctl +0 -0
  115. data/spec/fixtures/block_processor.ctl +6 -0
  116. data/spec/fixtures/block_processor_error.ctl +1 -0
  117. data/spec/fixtures/block_processor_pre_post_process.ctl +4 -0
  118. data/spec/fixtures/block_processor_remove_rows.ctl +5 -0
  119. data/spec/fixtures/data/apache_combined_log.txt +3 -0
  120. data/spec/fixtures/data/bulk_import.txt +3 -0
  121. data/spec/fixtures/data/bulk_import_with_empties.txt +3 -0
  122. data/spec/fixtures/data/decode.txt +3 -0
  123. data/spec/fixtures/data/delimited.txt +3 -0
  124. data/spec/fixtures/data/encode_source_latin1.txt +2 -0
  125. data/spec/fixtures/data/excel.xls +0 -0
  126. data/spec/fixtures/data/excel2.xls +0 -0
  127. data/spec/fixtures/data/fixed_width.txt +3 -0
  128. data/spec/fixtures/data/multiple_delimited_1.txt +3 -0
  129. data/spec/fixtures/data/multiple_delimited_2.txt +3 -0
  130. data/spec/fixtures/data/nokogiri.xml +38 -0
  131. data/spec/fixtures/data/people.txt +3 -0
  132. data/spec/fixtures/data/sax.xml +14 -0
  133. data/spec/fixtures/data/xml.xml +16 -0
  134. data/spec/fixtures/delimited.ctl +30 -0
  135. data/spec/fixtures/delimited_absolute.ctl +31 -0
  136. data/spec/fixtures/delimited_destination_db.ctl +23 -0
  137. data/spec/fixtures/delimited_excel.ctl +31 -0
  138. data/spec/fixtures/delimited_insert_update.ctl +34 -0
  139. data/spec/fixtures/delimited_update.ctl +34 -0
  140. data/spec/fixtures/delimited_with_bulk_load.ctl +34 -0
  141. data/spec/fixtures/errors.ctl +24 -0
  142. data/spec/fixtures/excel.ctl +24 -0
  143. data/spec/fixtures/excel2.ctl +25 -0
  144. data/spec/fixtures/fixed_width.ctl +35 -0
  145. data/spec/fixtures/inline_parser.ctl +17 -0
  146. data/spec/fixtures/model_source.ctl +14 -0
  147. data/spec/fixtures/multiple_delimited.ctl +22 -0
  148. data/spec/fixtures/multiple_source_delimited.ctl +39 -0
  149. data/spec/fixtures/nokogiri_all.ctl +35 -0
  150. data/spec/fixtures/nokogiri_select.ctl +35 -0
  151. data/spec/fixtures/output/.ignore +1 -0
  152. data/spec/fixtures/output/delimited.txt +3 -0
  153. data/spec/fixtures/output/encode_destination_utf-8.txt +2 -0
  154. data/spec/fixtures/output/fixed_width.txt +3 -0
  155. data/spec/fixtures/output/inline_parser.txt +3 -0
  156. data/spec/fixtures/output/multiple_source_delimited.txt +6 -0
  157. data/spec/fixtures/output/test_excel_destination.xls +0 -0
  158. data/spec/fixtures/output/test_file_destination.2.txt +2 -0
  159. data/spec/fixtures/output/test_file_destination.txt +2 -0
  160. data/spec/fixtures/output/test_multiple_unique.txt +1 -0
  161. data/spec/fixtures/output/test_unique.txt +2 -0
  162. data/spec/fixtures/sax.ctl +26 -0
  163. data/spec/fixtures/scd/1.txt +1 -0
  164. data/spec/fixtures/scd/2.txt +1 -0
  165. data/spec/fixtures/scd/3.txt +1 -0
  166. data/spec/fixtures/scd_test_type_1.ctl +43 -0
  167. data/spec/fixtures/scd_test_type_2.ctl +34 -0
  168. data/spec/fixtures/screen_test_error.ctl +3 -0
  169. data/spec/fixtures/screen_test_fatal.ctl +3 -0
  170. data/spec/fixtures/xml.ctl +31 -0
  171. data/spec/quality_spec.rb +11 -0
  172. data/spec/spec_helper.rb +10 -0
  173. data/spec/support/custom_fixtures.rb +54 -0
  174. data/spec/support/custom_matchers.rb +54 -0
  175. data/test-matrix.yml +10 -0
  176. data/test/.gitignore +1 -0
  177. data/test/.ignore +2 -0
  178. data/test/batch_test.rb +41 -0
  179. data/test/block_processor_test.rb +38 -0
  180. data/test/check_exist_processor_test.rb +92 -0
  181. data/test/check_unique_processor_test.rb +40 -0
  182. data/test/config/Gemfile.rails-2.3.x +3 -0
  183. data/test/config/Gemfile.rails-2.3.x.lock +53 -0
  184. data/test/config/Gemfile.rails-3.0.x +3 -0
  185. data/test/config/Gemfile.rails-3.0.x.lock +61 -0
  186. data/test/config/common.rb +29 -0
  187. data/test/connection/mysql/connection.rb +9 -0
  188. data/test/connection/mysql/schema.sql +37 -0
  189. data/test/connection/postgresql/connection.rb +13 -0
  190. data/test/connection/postgresql/schema.sql +40 -0
  191. data/test/control_test.rb +43 -0
  192. data/test/database_join_processor_test.rb +43 -0
  193. data/test/date_dimension_builder_test.rb +96 -0
  194. data/test/destination_test.rb +275 -0
  195. data/test/directive_test.rb +23 -0
  196. data/test/encode_processor_test.rb +32 -0
  197. data/test/engine_test.rb +78 -0
  198. data/test/ensure_fields_presence_processor_test.rb +28 -0
  199. data/test/etl_test.rb +42 -0
  200. data/test/foreign_key_lookup_transform_test.rb +50 -0
  201. data/test/generator_test.rb +14 -0
  202. data/test/mocks/mock_destination.rb +26 -0
  203. data/test/mocks/mock_source.rb +25 -0
  204. data/test/nokogiri_test.rb +35 -0
  205. data/test/parser_test.rb +224 -0
  206. data/test/performance/delimited.ctl +30 -0
  207. data/test/processor_test.rb +44 -0
  208. data/test/row_processor_test.rb +17 -0
  209. data/test/scd_test.rb +257 -0
  210. data/test/screen_test.rb +9 -0
  211. data/test/source_test.rb +154 -0
  212. data/test/test_helper.rb +37 -0
  213. data/test/transform_test.rb +101 -0
  214. data/test/truncate_processor_test.rb +37 -0
  215. metadata +510 -0
@@ -0,0 +1,24 @@
1
module ETL
  module Processor
    # Row processor that checks every configured field is present as a key of
    # each row and raises when one or more of them are absent.
    class EnsureFieldsPresenceProcessor < ETL::Processor::RowProcessor

      # Build the processor.
      #
      # Configuration options:
      # * <tt>:fields</tt>: An array of keys that must exist in every row
      #
      # Raises ControlError when <tt>:fields</tt> is not supplied.
      def initialize(control, configuration)
        super
        @fields = configuration[:fields]
        raise(ControlError, ":fields must be specified") if !@fields
      end

      # Return +row+ untouched when it carries every required key; otherwise
      # raise ETL::ControlError listing the absent keys and the keys the row
      # actually has.
      def process(row)
        absent = configuration[:fields] - row.keys
        return row if absent.empty?

        raise(ETL::ControlError,
          "Row missing required field(s) #{absent.join(',')} in row. Available fields are : #{row.keys.join(',')}")
      end
    end
  end
end
@@ -0,0 +1,77 @@
1
+ require 'fileutils'
2
+
3
module ETL #:nodoc:
  module Processor #:nodoc:
    # Processor that copies a text file line by line into a target file,
    # applying regular-expression replacements to each line and optionally
    # dropping lines whose character counts do not match the configured values.
    class EscapeCsvProcessor < ETL::Processor::Processor

      # The file to load from
      attr_reader :source_file
      # The file to write to
      attr_reader :target_file
      # whether to use a temporary file or not
      attr_reader :use_temp_file

      # Array of <tt>{:replace => pattern, :result => replacement}</tt> hashes
      attr_reader :filters
      # Array of <tt>{:char => characters, :count => expected}</tt> hashes (or nil)
      attr_reader :charcount

      # Initialize the processor.
      #
      # Configuration options:
      # * <tt>:source_file</tt>: The file to load data from
      # * <tt>:target_file</tt>: The file to write data to
      # * <tt>:file</tt>: short-cut which processes the file in place; the output is
      #   written to "<file>.tmp" and then moved over the original
      # * <tt>:filters</tt>: replacements to apply to every line (defaults to turning
      #   a double quote into two double quotes)
      # * <tt>:charcount</tt>: optional character-count rules; a line is dropped as
      #   soon as one rule does not match
      #
      # Raises ControlError when the source or target file is missing, or when
      # both resolve to the same path.
      def initialize(control, configuration)
        super
        if configuration[:file]
          @use_temp_file = true
          configuration[:source_file] = configuration[:file]
          configuration[:target_file] = configuration[:file] + '.tmp'
        end

        # Validate before building Pathnames: Pathname.new(nil) raises TypeError,
        # which previously made these checks unreachable.
        raise ControlError, "Source file must be specified" if configuration[:source_file].nil?
        raise ControlError, "Target file must be specified" if configuration[:target_file].nil?

        @source_file = resolve_path(configuration[:source_file])
        @target_file = resolve_path(configuration[:target_file])
        @filters = configuration[:filters] || [{:replace => '\"', :result => '""'}]
        @charcount = configuration[:charcount]
        raise ControlError, "Source and target file cannot currently point to the same file" if @source_file == @target_file
      end

      # Execute the processor
      def process
        # Block forms guarantee both handles are closed even if a filter raises.
        File.open(@source_file, 'r') do |reader|
          File.open(@target_file, 'w') do |writer|
            reader.each_line do |line|
              escaped = apply_filters(line)
              writer.write(escaped) if char_counts_match?(escaped)
            end
          end
        end

        if use_temp_file
          FileUtils.rm(source_file)
          FileUtils.mv(target_file, source_file)
        end
      end

      private

      # Turn a configured file name into a Pathname. Absolute paths are kept as
      # given; relative ones are expanded against the current working directory
      # (the former dirname + path arithmetic doubled any directory component).
      def resolve_path(file)
        path = Pathname.new(file)
        path.absolute? ? path : Pathname.new(File.expand_path(file))
      end

      # Apply every complete filter (both :replace and :result present) in order.
      def apply_filters(line)
        (@filters || []).inject(line) do |text, filter|
          next text if filter[:replace].nil? || filter[:result].nil?
          text.gsub(Regexp.new(filter[:replace]), filter[:result])
        end
      end

      # True when the line satisfies every complete :charcount rule. Checking
      # stops at the first mismatch, so later rules never see a rejected line.
      def char_counts_match?(line)
        return true if @charcount.nil?
        @charcount.all? do |rule|
          rule[:char].nil? || rule[:count].nil? || line.count(rule[:char]) == rule[:count]
        end
      end
    end
  end
end
@@ -0,0 +1,51 @@
1
module ETL
  module Processor
    # Row processor that evaluates a configured condition for each row and then
    # runs an optional code string for the true or the false outcome.
    #
    # NOTE(review): the condition operands and both outcome strings are passed
    # to +eval+, so they must only come from trusted control files.
    class FilterRowProcessor < ETL::Processor::RowProcessor
      attr_reader :condition
      attr_reader :outtrue
      attr_reader :outfalse

      # Configuration options read here:
      # * <tt>:condition</tt>: array of the form [operator, first, second]; either
      #   operand may itself be such an array
      # * <tt>:outtrue</tt>: code string evaluated when the condition holds
      # * <tt>:outfalse</tt>: code string evaluated when it does not
      def initialize(control, configuration)
        @outfalse = configuration[:outfalse]
        @outtrue = configuration[:outtrue]
        @condition = configuration[:condition]
        super
      end

      # Returns nil for a nil row. When the condition holds and no :outtrue code
      # is configured, returns an empty array; in every other case the matching
      # code string (if any) is evaluated with +row+ in scope and the row is
      # returned.
      def process(row)
        return nil if row.nil?

        if eval_condition(row, @condition)
          return [] if @outtrue.nil?

          eval(@outtrue)
        elsif !@outfalse.nil?
          eval(@outfalse)
        end

        row
      end

      private

      # Evaluate [operator, first, second]. Nested array operands are evaluated
      # first; the pieces are then concatenated into a Ruby expression (operator
      # leading when it is "!", infix otherwise) and evaluated. Any error raised
      # along the way makes the condition false.
      def eval_condition(row, cond)
        first = cond[1].instance_of?(Array) ? eval_condition(row, cond[1]) : cond[1]
        second = cond[2].instance_of?(Array) ? eval_condition(row, cond[2]) : cond[2]

        if cond[0] == "!"
          eval("#{cond[0]}#{first}#{second}")
        else
          eval("#{first}#{cond[0]}#{second}")
        end
      rescue => e
        false
      end

    end
  end
end
@@ -0,0 +1,68 @@
1
+ # Written by Susan Potter under open source MIT license.
2
+ # August 12, 2007.
3
+
4
+ require 'net/ftp'
5
+
6
module ETL
  module Processor
    # Custom processor to download files via FTP
    class FtpDownloaderProcessor < ETL::Processor::Processor
      attr_reader :host
      attr_reader :port
      attr_reader :remote_dir
      attr_reader :files
      attr_reader :username
      attr_reader :local_dir

      # configuration options include:
      # * host - hostname or IP address of FTP server (required)
      # * port - port number for FTP server (default: 21)
      # * remote_dir - remote path on FTP server (default: /)
      # * files - list of files to download from FTP server (default: [])
      # * username - username for FTP server authentication (default: anonymous)
      # * password - password for FTP server authentication (default: nil)
      # * local_dir - local output directory to save downloaded files (default: '')
      #
      # As an example you might write something like the following in your control process file:
      #  pre_process :ftp_downloader, {
      #    :host => 'ftp.sec.gov',
      #    :remote_dir => 'edgar/Feed/2007/QTR2',
      #    :files => ['20070402.nc.tar.gz', '20070403.nc.tar.gz', '20070404.nc.tar.gz',
      #               '20070405.nc.tar.gz', '20070406.nc.tar.gz'],
      #    :local_dir => '/data/sec/2007/04',
      #  }
      # The above example will anonymously download via FTP the first week's worth of SEC filing feed data
      # from the second quarter of 2007 and save the files to the local directory +/data/sec/2007/04+.
      #
      # NOTE(review): unlike the row processors in this library, this initializer
      # does not call +super+; confirm whether the base class needs to be initialized.
      def initialize(control, configuration)
        @host = configuration[:host]
        @port = configuration[:port] || 21
        @remote_dir = configuration[:remote_dir] || '/'
        @files = configuration[:files] || []
        @username = configuration[:username] || 'anonymous'
        @password = configuration[:password]
        @local_dir = configuration[:local_dir] || ''
      end

      # Connect to the configured host and port, log in, and fetch every
      # configured file in binary mode into +local_dir+.
      def process
        # Net::FTP.open(host) would already connect on the default port before
        # the configured port could be applied, so start from an unconnected
        # client and connect exactly once.
        conn = Net::FTP.new
        begin
          conn.connect(@host, @port)
          conn.login(@username, @password)
          @files.each do |f|
            conn.getbinaryfile(remote_file(f), local_file(f))
          end
        ensure
          conn.close unless conn.closed?
        end
      end

      private
      attr_accessor :password

      # Path of +name+ inside the local output directory.
      def local_file(name)
        File.join(@local_dir, name)
      end

      # Path of +name+ inside the remote directory.
      def remote_file(name)
        File.join(@remote_dir, name)
      end
    end
  end
end
@@ -0,0 +1,65 @@
1
+ require 'net/ftp'
2
+
3
module ETL
  module Processor
    # Custom processor to upload files via FTP
    class FtpUploaderProcessor < ETL::Processor::Processor
      attr_reader :host
      attr_reader :port
      attr_reader :remote_dir
      attr_reader :files
      attr_reader :username
      attr_reader :local_dir

      # configuration options include:
      # * host - hostname or IP address of FTP server (required)
      # * port - port number for FTP server (default: 21)
      # * remote_dir - remote path on FTP server to upload into (default: /)
      # * files - list of files to upload to the FTP server (default: [])
      # * username - username for FTP server authentication (default: anonymous)
      # * password - password for FTP server authentication (default: nil)
      # * local_dir - local directory that holds the files to upload (default: '')
      #
      # As an example you might write something like the following in your control process file:
      #  post_process :ftp_uploader, {
      #    :host => 'ftp.example.com',
      #    :remote_dir => 'incoming/2007/04',
      #    :files => ['20070402.nc.tar.gz', '20070403.nc.tar.gz'],
      #    :local_dir => '/data/sec/2007/04',
      #  }
      # The above example will log in anonymously and upload the two listed files from the
      # local directory +/data/sec/2007/04+ into +incoming/2007/04+ on the server.
      #
      # NOTE(review): unlike the row processors in this library, this initializer
      # does not call +super+; confirm whether the base class needs to be initialized.
      def initialize(control, configuration)
        @host = configuration[:host]
        @port = configuration[:port] || 21
        @remote_dir = configuration[:remote_dir] || '/'
        @files = configuration[:files] || []
        @username = configuration[:username] || 'anonymous'
        @password = configuration[:password]
        @local_dir = configuration[:local_dir] || ''
      end

      # Connect to the configured host and port, log in, and send every
      # configured file from +local_dir+ to +remote_dir+ in binary mode.
      def process
        # Net::FTP.open(host) would already connect on the default port before
        # the configured port could be applied, so start from an unconnected
        # client and connect exactly once.
        conn = Net::FTP.new
        begin
          conn.connect(@host, @port)
          conn.login(@username, @password)
          @files.each do |f|
            conn.putbinaryfile(local_file(f), remote_file(f))
          end
        ensure
          conn.close unless conn.closed?
        end
      end

      private
      attr_accessor :password

      # Path of +name+ inside the local directory.
      def local_file(name)
        File.join(@local_dir, name)
      end

      # Path of +name+ inside the remote directory.
      def remote_file(name)
        File.join(@remote_dir, name)
      end
    end
  end
end
@@ -0,0 +1,55 @@
1
module ETL #:nodoc:
  module Processor #:nodoc:
    # Row-level processor that will convert a single row into multiple rows designed to be inserted
    # into a hierarchy bridge table.
    class HierarchyExploderProcessor < ETL::Processor::RowProcessor
      attr_accessor :id_field
      attr_accessor :parent_id_field

      # Initialize the processor
      #
      # Configuration options:
      # * <tt>:target</tt>: The target passed to ETL::Engine.connection to obtain the
      #   database connection
      # * <tt>:table</tt>: The table that holds the hierarchy
      # * <tt>:id_field</tt>: The name of the id column in that table (defaults to 'id')
      # * <tt>:parent_id_field</tt>: The name of the parent id column (defaults to 'parent_id')
      #
      # TODO: Allow resolver to be implemented in a customizable fashion, i.e. don't rely
      # on AR as the only resolution method.
      def initialize(control, configuration={})
        @id_field = configuration[:id_field] || 'id'
        @parent_id_field = configuration[:parent_id_field] || 'parent_id'
        super
      end

      # Process the row expanding it into hierarchy values.
      #
      # The incoming row is read through the fixed keys <tt>:id</tt> and
      # <tt>:parent_id</tt>; the result is an array with one hash for the row
      # itself (level 0) and one for every descendant found in the table.
      def process(row)
        rows = []
        table = configuration[:table]
        conn = ETL::Engine.connection(configuration[:target])
        build_rows([row[:id]], row[:id], row[:id], row[:parent_id].nil?, 0, rows, table, conn)
        rows
      end

      protected
      # Recursive function that will add a row for the current level and then call build_rows
      # for all of the children of the current level.
      #
      # +row_id+ is the id of the row being exploded and becomes :parent_id of
      # every generated row; +root+ is only true for the initial call when the
      # source row had no parent.
      def build_rows(ids, parent_id, row_id, root, level, rows, table, conn)
        ids.each do |id|
          # Quote the value through the connection instead of interpolating it
          # raw, so non-numeric ids produce valid (and safe) SQL.
          child_ids = conn.select_values(
            "SELECT #{id_field} FROM #{table} WHERE #{parent_id_field} = #{conn.quote(id)}"
          )

          rows << {
            :parent_id => row_id,
            :child_id => id,
            :num_levels_from_parent => level,
            :is_bottom => (child_ids.empty? ? 1 : 0),
            :is_top => (root ? 1 : 0),
          }

          build_rows(child_ids, id, row_id, false, level + 1, rows, table, conn)
        end
      end
    end
  end
end
@@ -0,0 +1,91 @@
1
+ optional_require 'net/imap'
2
+ optional_require 'tmail'
3
+
4
module ETL
  module Processor
    # Custom processor to download files via Imap Attachment
    class ImapattachmentDownloaderProcessor < ETL::Processor::Processor
      attr_reader :host
      attr_reader :ssl
      attr_reader :port
      attr_reader :delete
      attr_reader :filters
      attr_reader :folder
      attr_reader :username
      attr_reader :local_dir

      # configuration options include:
      # * host - hostname or IP address of IMAP server (required)
      # * ssl - activate encryption (default false)
      # * port - port number for IMAP server (default: 143, or 993 with ssl)
      # * delete - delete message after reading (default false)
      # * filters - filter mails, as [operator, first, second] where an operand
      #   may itself be such an array (default [], which accepts every mail)
      # * folder - folder to select mails from (default INBOX)
      # * username - username for IMAP server authentication (default: anonymous)
      # * password - password for IMAP server authentication (default: nil)
      # * local_dir - local output directory to save downloaded files (default: '')
      #
      def initialize(control, configuration)
        @host = configuration[:host]
        @ssl = configuration[:ssl] || false
        # 143 is the standard IMAP4 port (the previous default, 220, is IMAP3).
        @port = configuration[:port] || (@ssl ? 993 : 143)
        @delete = configuration[:delete] || false
        @filters = configuration[:filters] || []
        @folder = configuration[:folder] || 'INBOX'
        @username = configuration[:username] || 'anonymous'
        @password = configuration[:password]
        @local_dir = configuration[:local_dir] || ''
      end

      # Log in, select the folder and save the attachments of every non-deleted
      # message that passes the filter. Matching messages are flagged as deleted
      # when :delete is set; the folder is then expunged and closed.
      def process
        conn = Net::IMAP.new(@host, @port, @ssl)
        begin
          conn.login(@username, @password)

          conn.select(@folder)
          conn.uid_search(["NOT", "DELETED"]).each do |msguuid|
            mail = TMail::Mail.parse(conn.uid_fetch(msguuid, 'RFC822').first.attr['RFC822'])
            next if mail.attachments.blank?
            next unless applyfilter(mail, @filters)

            mail.attachments.each do |attachment|
              # Binary mode keeps non-text attachments intact on every platform.
              File.open(local_file(attachment.original_filename), "wb") do |f|
                f << attachment.gets(nil)
              end
            end

            # The ids come from uid_search, so the UID variant of STORE is required.
            conn.uid_store(msguuid, "+FLAGS", [:Deleted]) if @delete
          end
          conn.expunge
          conn.close
        ensure
          # Always release the socket, including when a step above raised.
          conn.disconnect unless conn.disconnected?
        end
      end

      private
      attr_accessor :password

      # Path of +name+ inside the local output directory. Only the base name is
      # used because attachment names come from the mail and must not be able
      # to point outside +local_dir+.
      def local_file(name)
        File.join(@local_dir, File.basename(name))
      end

      # Evaluate the filter [operator, first, second] against +mail+. Fewer
      # than three elements means "no filter" and accepts the mail. Nested array
      # operands are evaluated recursively; the pieces are then concatenated
      # into a Ruby expression (operator leading when it is "!", infix
      # otherwise) and evaluated with +mail+ in scope. Any error makes the
      # filter reject the mail.
      #
      # NOTE(review): the filter is run through +eval+, so it must only come
      # from trusted control files.
      def applyfilter(mail, cond)
        return true if cond.nil? || cond.size < 3

        first = cond[1]
        first = applyfilter(mail, cond[1]) if cond[1].class == Array

        second = cond[2]
        second = applyfilter(mail, cond[2]) if cond[2].class == Array

        return eval("#{cond[0]}#{first}#{second}") if cond[0] == "!"

        eval("#{first}#{cond[0]}#{second}")
      rescue StandardError
        false
      end
    end
  end
end
@@ -0,0 +1,90 @@
1
+ optional_require 'net/pop'
2
+ optional_require 'tmail'
3
+
4
module ETL
  module Processor
    # Custom processor to download files via Pop3 Attachment
    class Pop3attachmentDownloaderProcessor < ETL::Processor::Processor
      attr_reader :host
      attr_reader :ssl
      attr_reader :port
      attr_reader :delete
      attr_reader :filters
      attr_reader :username
      attr_reader :local_dir

      # configuration options include:
      # * host - hostname or IP address of POP3 server (required)
      # * ssl - activate encryption (default false)
      # * port - port number for POP3 server (default: Net::POP3.default_port or Net::POP3.default_pop3s_port)
      # * delete - delete message after reading (default false)
      # * filters - filter mails, as [operator, first, second] where an operand
      #   may itself be such an array (default [], which accepts every mail)
      # * username - username for POP3 server authentication (default: anonymous)
      # * password - password for POP3 server authentication (default: nil)
      # * local_dir - local output directory to save downloaded files (default: '')
      #
      def initialize(control, configuration)
        @host = configuration[:host]
        @ssl = configuration[:ssl] || false
        @port = configuration[:port] || (@ssl ? Net::POP3.default_pop3s_port : Net::POP3.default_port )
        @delete = configuration[:delete] || false
        @filters = configuration[:filters] || []
        @username = configuration[:username] || 'anonymous'
        @password = configuration[:password]
        @local_dir = configuration[:local_dir] || ''
      end

      # Open a session, save the attachments of every message that passes the
      # filter, delete those messages when :delete is set, and end the session.
      def process
        # NOTE(review): this switches SSL on for the Net::POP3 class as a whole
        # and disables certificate verification; confirm that is acceptable.
        Net::POP3.enable_ssl(OpenSSL::SSL::VERIFY_NONE) if @ssl
        conn = Net::POP3.new(@host, @port)
        conn.start(@username, @password)
        begin
          conn.each_mail do |message|
            mail = TMail::Mail.parse(message.pop)
            next if mail.attachments.blank?
            next unless applyfilter(mail, @filters)

            mail.attachments.each do |attachment|
              # Binary mode keeps non-text attachments intact on every platform.
              File.open(local_file(attachment.original_filename), "wb") do |f|
                f << attachment.gets(nil)
              end
            end

            message.delete if @delete
          end
        ensure
          # End the session even when a download or filter step raised.
          conn.finish
        end
      end

      private
      attr_accessor :password

      # Path of +name+ inside the local output directory. Only the base name is
      # used because attachment names come from the mail and must not be able
      # to point outside +local_dir+.
      def local_file(name)
        File.join(@local_dir, File.basename(name))
      end

      # Evaluate the filter [operator, first, second] against +mail+. Fewer
      # than three elements means "no filter" and accepts the mail. Nested array
      # operands are evaluated recursively; the pieces are then concatenated
      # into a Ruby expression (operator leading when it is "!", infix
      # otherwise) and evaluated with +mail+ in scope. Any error makes the
      # filter reject the mail.
      #
      # NOTE(review): the filter is run through +eval+, so it must only come
      # from trusted control files.
      def applyfilter(mail, cond)
        return true if cond.nil? || cond.size < 3

        first = cond[1]
        first = applyfilter(mail, cond[1]) if cond[1].class == Array

        second = cond[2]
        second = applyfilter(mail, cond[2]) if cond[2].class == Array

        return eval("#{cond[0]}#{first}#{second}") if cond[0] == "!"

        eval("#{first}#{cond[0]}#{second}")
      rescue StandardError
        false
      end
    end
  end
end