activewarehouse-etl-sgonyea 0.9.6

Sign up to get free protection for your applications and to get access to all the features.
Files changed (200) hide show
  1. data/.gitignore +9 -0
  2. data/0.9-UPGRADE +6 -0
  3. data/CHANGELOG +236 -0
  4. data/Gemfile +4 -0
  5. data/HOW_TO_RELEASE +13 -0
  6. data/LICENSE +7 -0
  7. data/README.textile +111 -0
  8. data/Rakefile +103 -0
  9. data/TODO +28 -0
  10. data/active_support_logger.patch +78 -0
  11. data/activewarehouse-etl.gemspec +36 -0
  12. data/bin/etl +28 -0
  13. data/bin/etl.cmd +8 -0
  14. data/examples/database.example.yml +16 -0
  15. data/lib/etl.rb +97 -0
  16. data/lib/etl/batch.rb +2 -0
  17. data/lib/etl/batch/batch.rb +111 -0
  18. data/lib/etl/batch/directives.rb +65 -0
  19. data/lib/etl/builder.rb +2 -0
  20. data/lib/etl/builder/date_dimension_builder.rb +96 -0
  21. data/lib/etl/builder/time_dimension_builder.rb +31 -0
  22. data/lib/etl/commands/etl.rb +89 -0
  23. data/lib/etl/control.rb +3 -0
  24. data/lib/etl/control/control.rb +405 -0
  25. data/lib/etl/control/destination.rb +438 -0
  26. data/lib/etl/control/destination/csv_destination.rb +113 -0
  27. data/lib/etl/control/destination/database_destination.rb +97 -0
  28. data/lib/etl/control/destination/excel_destination.rb +91 -0
  29. data/lib/etl/control/destination/file_destination.rb +126 -0
  30. data/lib/etl/control/destination/insert_update_database_destination.rb +136 -0
  31. data/lib/etl/control/destination/update_database_destination.rb +109 -0
  32. data/lib/etl/control/destination/yaml_destination.rb +74 -0
  33. data/lib/etl/control/source.rb +132 -0
  34. data/lib/etl/control/source/database_source.rb +224 -0
  35. data/lib/etl/control/source/enumerable_source.rb +11 -0
  36. data/lib/etl/control/source/file_source.rb +90 -0
  37. data/lib/etl/control/source/model_source.rb +39 -0
  38. data/lib/etl/core_ext.rb +1 -0
  39. data/lib/etl/core_ext/time.rb +5 -0
  40. data/lib/etl/core_ext/time/calculations.rb +42 -0
  41. data/lib/etl/engine.rb +582 -0
  42. data/lib/etl/execution.rb +19 -0
  43. data/lib/etl/execution/base.rb +8 -0
  44. data/lib/etl/execution/batch.rb +10 -0
  45. data/lib/etl/execution/job.rb +8 -0
  46. data/lib/etl/execution/migration.rb +90 -0
  47. data/lib/etl/generator.rb +2 -0
  48. data/lib/etl/generator/generator.rb +20 -0
  49. data/lib/etl/generator/surrogate_key_generator.rb +39 -0
  50. data/lib/etl/http_tools.rb +139 -0
  51. data/lib/etl/parser.rb +11 -0
  52. data/lib/etl/parser/apache_combined_log_parser.rb +49 -0
  53. data/lib/etl/parser/csv_parser.rb +93 -0
  54. data/lib/etl/parser/excel_parser.rb +112 -0
  55. data/lib/etl/parser/fixed_width_parser.rb +65 -0
  56. data/lib/etl/parser/nokogiri_xml_parser.rb +83 -0
  57. data/lib/etl/parser/parser.rb +41 -0
  58. data/lib/etl/parser/sax_parser.rb +218 -0
  59. data/lib/etl/parser/xml_parser.rb +65 -0
  60. data/lib/etl/processor.rb +11 -0
  61. data/lib/etl/processor/block_processor.rb +14 -0
  62. data/lib/etl/processor/bulk_import_processor.rb +94 -0
  63. data/lib/etl/processor/check_exist_processor.rb +80 -0
  64. data/lib/etl/processor/check_unique_processor.rb +39 -0
  65. data/lib/etl/processor/copy_field_processor.rb +26 -0
  66. data/lib/etl/processor/database_join_processor.rb +82 -0
  67. data/lib/etl/processor/encode_processor.rb +55 -0
  68. data/lib/etl/processor/ensure_fields_presence_processor.rb +24 -0
  69. data/lib/etl/processor/escape_csv_processor.rb +77 -0
  70. data/lib/etl/processor/filter_row_processor.rb +51 -0
  71. data/lib/etl/processor/ftp_downloader_processor.rb +68 -0
  72. data/lib/etl/processor/ftp_uploader_processor.rb +65 -0
  73. data/lib/etl/processor/hierarchy_exploder_processor.rb +55 -0
  74. data/lib/etl/processor/imapattachment_downloader_processor.rb +91 -0
  75. data/lib/etl/processor/pop3attachment_downloader_processor.rb +90 -0
  76. data/lib/etl/processor/print_row_processor.rb +12 -0
  77. data/lib/etl/processor/processor.rb +25 -0
  78. data/lib/etl/processor/rename_processor.rb +24 -0
  79. data/lib/etl/processor/require_non_blank_processor.rb +26 -0
  80. data/lib/etl/processor/row_processor.rb +27 -0
  81. data/lib/etl/processor/sequence_processor.rb +23 -0
  82. data/lib/etl/processor/sftp_downloader_processor.rb +63 -0
  83. data/lib/etl/processor/sftp_uploader_processor.rb +63 -0
  84. data/lib/etl/processor/surrogate_key_processor.rb +53 -0
  85. data/lib/etl/processor/truncate_processor.rb +40 -0
  86. data/lib/etl/processor/zip_file_processor.rb +27 -0
  87. data/lib/etl/row.rb +20 -0
  88. data/lib/etl/screen.rb +14 -0
  89. data/lib/etl/screen/row_count_screen.rb +20 -0
  90. data/lib/etl/transform.rb +2 -0
  91. data/lib/etl/transform/block_transform.rb +13 -0
  92. data/lib/etl/transform/calculation_transform.rb +71 -0
  93. data/lib/etl/transform/date_to_string_transform.rb +20 -0
  94. data/lib/etl/transform/decode_transform.rb +51 -0
  95. data/lib/etl/transform/default_transform.rb +20 -0
  96. data/lib/etl/transform/foreign_key_lookup_transform.rb +211 -0
  97. data/lib/etl/transform/hierarchy_lookup_transform.rb +49 -0
  98. data/lib/etl/transform/md5_transform.rb +13 -0
  99. data/lib/etl/transform/ordinalize_transform.rb +14 -0
  100. data/lib/etl/transform/sha1_transform.rb +13 -0
  101. data/lib/etl/transform/split_fields_transform.rb +27 -0
  102. data/lib/etl/transform/string_to_date_time_transform.rb +14 -0
  103. data/lib/etl/transform/string_to_date_transform.rb +16 -0
  104. data/lib/etl/transform/string_to_time_transform.rb +11 -0
  105. data/lib/etl/transform/transform.rb +61 -0
  106. data/lib/etl/transform/trim_transform.rb +26 -0
  107. data/lib/etl/transform/type_transform.rb +35 -0
  108. data/lib/etl/util.rb +59 -0
  109. data/lib/etl/version.rb +3 -0
  110. data/test-matrix.yml +10 -0
  111. data/test/.gitignore +1 -0
  112. data/test/.ignore +2 -0
  113. data/test/all.ebf +6 -0
  114. data/test/apache_combined_log.ctl +11 -0
  115. data/test/batch_test.rb +41 -0
  116. data/test/batch_with_error.ebf +6 -0
  117. data/test/batched1.ctl +0 -0
  118. data/test/batched2.ctl +0 -0
  119. data/test/block_processor.ctl +6 -0
  120. data/test/block_processor_error.ctl +1 -0
  121. data/test/block_processor_pre_post_process.ctl +4 -0
  122. data/test/block_processor_remove_rows.ctl +5 -0
  123. data/test/block_processor_test.rb +38 -0
  124. data/test/check_exist_processor_test.rb +92 -0
  125. data/test/check_unique_processor_test.rb +40 -0
  126. data/test/config/Gemfile.rails-2.3.x +3 -0
  127. data/test/config/Gemfile.rails-2.3.x.lock +53 -0
  128. data/test/config/Gemfile.rails-3.0.x +3 -0
  129. data/test/config/Gemfile.rails-3.0.x.lock +61 -0
  130. data/test/config/common.rb +29 -0
  131. data/test/connection/mysql/connection.rb +9 -0
  132. data/test/connection/mysql/schema.sql +37 -0
  133. data/test/connection/postgresql/connection.rb +13 -0
  134. data/test/connection/postgresql/schema.sql +40 -0
  135. data/test/control_test.rb +43 -0
  136. data/test/data/apache_combined_log.txt +3 -0
  137. data/test/data/bulk_import.txt +3 -0
  138. data/test/data/bulk_import_with_empties.txt +3 -0
  139. data/test/data/decode.txt +3 -0
  140. data/test/data/delimited.txt +3 -0
  141. data/test/data/encode_source_latin1.txt +2 -0
  142. data/test/data/excel.xls +0 -0
  143. data/test/data/excel2.xls +0 -0
  144. data/test/data/fixed_width.txt +3 -0
  145. data/test/data/multiple_delimited_1.txt +3 -0
  146. data/test/data/multiple_delimited_2.txt +3 -0
  147. data/test/data/nokogiri.xml +38 -0
  148. data/test/data/people.txt +3 -0
  149. data/test/data/sax.xml +14 -0
  150. data/test/data/xml.xml +16 -0
  151. data/test/database_join_processor_test.rb +43 -0
  152. data/test/date_dimension_builder_test.rb +96 -0
  153. data/test/delimited.ctl +30 -0
  154. data/test/delimited_absolute.ctl +31 -0
  155. data/test/delimited_destination_db.ctl +23 -0
  156. data/test/delimited_excel.ctl +31 -0
  157. data/test/delimited_insert_update.ctl +34 -0
  158. data/test/delimited_update.ctl +34 -0
  159. data/test/delimited_with_bulk_load.ctl +34 -0
  160. data/test/destination_test.rb +275 -0
  161. data/test/directive_test.rb +23 -0
  162. data/test/encode_processor_test.rb +32 -0
  163. data/test/engine_test.rb +78 -0
  164. data/test/ensure_fields_presence_processor_test.rb +28 -0
  165. data/test/errors.ctl +24 -0
  166. data/test/etl_test.rb +42 -0
  167. data/test/excel.ctl +24 -0
  168. data/test/excel2.ctl +25 -0
  169. data/test/fixed_width.ctl +35 -0
  170. data/test/foreign_key_lookup_transform_test.rb +50 -0
  171. data/test/generator_test.rb +14 -0
  172. data/test/inline_parser.ctl +17 -0
  173. data/test/mocks/mock_destination.rb +26 -0
  174. data/test/mocks/mock_source.rb +25 -0
  175. data/test/model_source.ctl +14 -0
  176. data/test/multiple_delimited.ctl +22 -0
  177. data/test/multiple_source_delimited.ctl +39 -0
  178. data/test/nokogiri_all.ctl +35 -0
  179. data/test/nokogiri_select.ctl +35 -0
  180. data/test/nokogiri_test.rb +35 -0
  181. data/test/parser_test.rb +224 -0
  182. data/test/performance/delimited.ctl +30 -0
  183. data/test/processor_test.rb +44 -0
  184. data/test/row_processor_test.rb +17 -0
  185. data/test/sax.ctl +26 -0
  186. data/test/scd/1.txt +1 -0
  187. data/test/scd/2.txt +1 -0
  188. data/test/scd/3.txt +1 -0
  189. data/test/scd_test.rb +257 -0
  190. data/test/scd_test_type_1.ctl +43 -0
  191. data/test/scd_test_type_2.ctl +34 -0
  192. data/test/screen_test.rb +9 -0
  193. data/test/screen_test_error.ctl +3 -0
  194. data/test/screen_test_fatal.ctl +3 -0
  195. data/test/source_test.rb +154 -0
  196. data/test/test_helper.rb +37 -0
  197. data/test/transform_test.rb +101 -0
  198. data/test/truncate_processor_test.rb +37 -0
  199. data/test/xml.ctl +31 -0
  200. metadata +370 -0
@@ -0,0 +1,65 @@
1
+ require 'rexml/document'
2
+
module ETL
  module Parser
    # Parser that reads XML files with REXML and yields one row per element
    # matched by the collection XPath taken from the source definition.
    class XmlParser < ETL::Parser::Parser
      # Initialize the parser
      # * <tt>source</tt>: The Source object
      # * <tt>options</tt>: Parser options Hash
      #
      # Raises if the source definition has no <tt>:collection</tt> XPath or
      # contains a field definition that is neither a Symbol nor a Hash.
      def initialize(source, options={})
        super
        configure
      end

      # Yields each row as a Hash mapping field name to the text found at the
      # field's XPath, relative to the matched collection element. Every file
      # matching the +file+ glob is parsed in turn.
      def each
        Dir.glob(file).each do |path|
          doc = nil
          elapsed = Benchmark.realtime do
            # The block form of File.open closes the handle as soon as REXML
            # has built the document, instead of leaking one handle per file.
            doc = File.open(path) { |io| REXML::Document.new(io) }
          end
          Engine.logger.info "XML #{path} parsed in #{elapsed}s"
          doc.elements.each(@collection_xpath) do |element|
            row = {}
            fields.each do |f|
              row[f.name] = element.text(f.xpath)
            end
            yield row
          end
        end
      end

      # Get an array of defined fields
      def fields
        @fields ||= []
      end

      private
      # Read the collection XPath and the field definitions from the source
      # definition and populate +fields+.
      def configure
        @collection_xpath = source.definition[:collection]
        raise "Collection XPath is required" if @collection_xpath.nil?

        source.definition[:fields].each do |options|
          case options
          when Symbol
            # A bare symbol is both the field name and its XPath
            fields << Field.new(options, options.to_s)
          when Hash
            # The XPath defaults to the field name; the default is written
            # back into the definition hash
            options[:xpath] ||= options[:name]
            fields << Field.new(options[:name], options[:xpath].to_s)
          else
            raise DefinitionError, "Each field definition must either be an symbol or a hash of options for the field"
          end
        end
      end

      # A field name paired with the XPath used to extract its value
      class Field
        attr_reader :name, :xpath
        def initialize(name, xpath)
          @name = name
          @xpath = xpath
        end
      end
    end
  end
end
@@ -0,0 +1,11 @@
# This source file contains the ETL::Processor module and requires all of the processors

module ETL #:nodoc:
  # The ETL::Processor module contains row-level and bulk processors
  module Processor
  end
end

# The two base classes are loaded first because the concrete processors
# subclass them.
require 'etl/processor/processor'
require 'etl/processor/row_processor'
# Sorted so the remaining processors load in the same order on every platform
# (glob order is not guaranteed on older Rubies).
Dir[File.dirname(__FILE__) + "/processor/*.rb"].sort.each { |file| require(file) }
@@ -0,0 +1,14 @@
module ETL
  module Processor
    # Processor whose work is delegated to a user-supplied block.
    #
    # It is usable both as a RowProcessor (called on each row with after_read)
    # and as a plain Processor (called once on pre_process or post_process),
    # which is why +process+ accepts an optional row.
    class BlockProcessor < ETL::Processor::RowProcessor
      # Initialize the processor.
      #
      # Configuration options:
      # * <tt>:block</tt>: The callable invoked by +process+
      def initialize(control, configuration)
        super
        @block = configuration[:block]
      end

      # Call the configured block with +row+ (nil when no row is given) and
      # return whatever the block returns.
      def process(row=nil)
        @block.call(row)
      end
    end
  end
end
@@ -0,0 +1,94 @@
module ETL #:nodoc:
  module Processor #:nodoc:
    # Processor which is used to bulk import data into a target database. The
    # underlying database driver from ActiveRecord must support the
    # +bulk_load+ method.
    class BulkImportProcessor < ETL::Processor::Processor

      # The file to load from
      attr_reader :file
      # The target database
      attr_reader :target
      # The table name
      attr_reader :table
      # Set to true to truncate
      attr_reader :truncate
      # Array of symbols representing the column load order
      attr_reader :columns
      # The field separator (defaults to a comma)
      attr_accessor :field_separator
      # The field enclosure (defaults to nil)
      attr_accessor :field_enclosure
      # The line separator (defaults to a newline)
      attr_accessor :line_separator
      # The string that indicates a NULL (defaults to an empty string)
      attr_accessor :null_string
      # boolean that indicates disable keys before, then enable after load (MySql only optimization)
      attr_accessor :disable_keys
      # replace existing records, not just insert
      attr_accessor :replace

      # Initialize the processor.
      #
      # Configuration options:
      # * <tt>:file</tt>: The file to load data from (required). A relative
      #   path is resolved against the directory of the control file.
      # * <tt>:target</tt>: The target database (required)
      # * <tt>:table</tt>: The table name (required)
      # * <tt>:truncate</tt>: Set to true to truncate before loading
      # * <tt>:columns</tt>: The columns to load in the order they appear in
      #   the bulk data file
      # * <tt>:field_separator</tt>: The field separator. Defaults to a comma
      # * <tt>:line_separator</tt>: The line separator. Defaults to a newline
      # * <tt>:field_enclosure</tt>: The field enclosure characters
      # * <tt>:null_string</tt>: The string that indicates a NULL. Defaults to
      #   an empty string
      # * <tt>:disable_keys</tt>: Set to true to disable keys before, then enable after load (MySql only optimization)
      # * <tt>:replace</tt>: Set to true to replace existing records, not just
      #   insert
      #
      # Raises ControlError when :file, :target or :table is missing.
      def initialize(control, configuration)
        super
        @target = configuration[:target]
        @table = configuration[:table]

        # Validate before touching the path: Pathname.new(nil) would otherwise
        # fail with an unhelpful TypeError.
        raise ControlError, "File must be specified" unless configuration[:file]
        raise ControlError, "Target must be specified" unless @target
        raise ControlError, "Table must be specified" unless @table

        path = Pathname.new(configuration[:file])
        @file = path.absolute? ? path : Pathname.new(File.dirname(File.expand_path(control.file))) + path

        # Plain || so the caller's configuration hash is left unmodified
        @truncate = configuration[:truncate] || false
        @columns = configuration[:columns]
        @field_separator = (configuration[:field_separator] || ',')
        @line_separator = (configuration[:line_separator] || "\n")
        @null_string = (configuration[:null_string] || "")
        @field_enclosure = configuration[:field_enclosure]
        @disable_keys = configuration[:disable_keys] || false
        @replace = configuration[:replace] || false
      end

      # Execute the processor: optionally truncate the table, then bulk load
      # the file, all inside one transaction on the target connection. Does
      # nothing when bulk import is globally skipped or the file is empty.
      def process
        return if ETL::Engine.skip_bulk_import
        return if File.size(file) == 0

        conn = ETL::Engine.connection(target)
        conn.transaction do
          conn.truncate(table_name) if truncate

          options = {}
          options[:columns] = columns
          options[:disable_keys] = true if disable_keys
          options[:replace] = true if replace

          # Only the formatting settings that are actually set are passed on
          format = {}
          format[:null_string] = null_string if null_string
          format[:delimited_by] = field_separator if field_separator
          format[:enclosed_by] = field_enclosure if field_enclosure
          format[:terminated_by] = line_separator if line_separator
          options[:fields] = format unless format.empty?

          conn.bulk_load(file, table_name, options)
        end
      end

      # The table name as resolved by ETL::Engine.table for the target
      # connection
      def table_name
        ETL::Engine.table(table, ETL::Engine.connection(target))
      end
    end
  end
end
@@ -0,0 +1,80 @@
module ETL #:nodoc:
  module Processor #:nodoc:
    # A row-level processor that checks if the row already exists in the
    # target table
    class CheckExistProcessor < ETL::Processor::RowProcessor
      # A symbol or array of symbols representing keys that should be skipped
      attr_accessor :skip

      # The target database
      attr_accessor :target

      # The name of the table to check against
      attr_accessor :table

      # An array of columns representing the natural key
      attr_accessor :columns

      # Is set to true if the processor should execute the check. If there are
      # no rows in the target table then this should return false.
      attr_accessor :should_check

      # Initialize the processor
      # Configuration options:
      # * <tt>:columns</tt>: An array of symbols for columns that should be included in the query conditions. If this option is not specified then all of the columns in the row will be included in the conditions (unless :skip is specified).
      # * <tt>:skip</tt>: A symbol or array of symbols that should not be included in the existence check. If this option is not specified then all of the columns will be included in the existence check (unless :columns is specified).
      # * <tt>:target</tt>: The target connection
      # * <tt>:table</tt>: The table name
      #
      # Raises ETL::ControlError when :target or :table is missing. The target
      # table is counted once here; the per-row check is only enabled when it
      # already contains rows.
      def initialize(control, configuration)
        super
        @skip = configuration[:skip] || []
        @target = configuration[:target] || raise(ETL::ControlError, "target must be specified")
        @table = configuration[:table] || raise(ETL::ControlError, "table must be specified")
        @columns = configuration[:columns]

        q = "SELECT COUNT(*) FROM #{table_name}"
        @should_check = ETL::Engine.connection(target).select_value(q).to_i > 0
      end

      # Return true if the given key should be skipped
      def skip?(key)
        case skip
        when Array
          skip.include?(key)
        else
          skip.to_sym == key.to_sym
        end
      end

      # Return true if the row should be checked
      def should_check?
        @should_check ? true : false
      end

      # Process the row. Returns the row when no matching record exists in
      # the target table (or when the check is disabled), and nil when a
      # matching record is found.
      def process(row)
        return row unless should_check?
        conn = ETL::Engine.connection(target)
        ensure_columns_available_in_row!(row, columns, 'for existence check')

        conditions = []
        row.each do |k, v|
          next unless columns.nil? || columns.include?(k.to_sym)
          next if skip?(k.to_sym)
          # "col = NULL" never matches in SQL, so nil values must be compared
          # with IS NULL or existing rows would never be detected.
          conditions << (v.nil? ? "#{k} IS NULL" : "#{k} = #{conn.quote(v)}")
        end

        q = "SELECT * FROM #{table_name} WHERE "
        q << conditions.join(" AND ")
        q << " LIMIT 1"

        conn.select_one(q).nil? ? row : nil
      end

      private

      # The table name as resolved by ETL::Engine.table for the target
      # connection
      def table_name
        ETL::Engine.table(table, ETL::Engine.connection(target))
      end
    end
  end
end
@@ -0,0 +1,39 @@
module ETL #:nodoc:
  module Processor #:nodoc:
    # Row processor that checks whether or not the row has already passed
    # through the ETL processor, using the key fields provided as the keys
    # to check.
    class CheckUniqueProcessor < ETL::Processor::RowProcessor

      # The keys to check
      attr_accessor :keys

      # Initialize the processor
      # Configuration options:
      # * <tt>:keys</tt>: An array of keys to check against
      def initialize(control, configuration)
        super
        @keys = configuration[:keys]
      end

      # A Hash of keys that have already been processed.
      def compound_key_constraints
        @compound_key_constraints ||= {}
      end

      # Process the row. This implementation will only return a row if its
      # key combination has not already been seen; otherwise it returns nil.
      #
      # An error will be raised if the row doesn't include the keys.
      def process(row)
        ensure_columns_available_in_row!(row, keys, 'for unicity check')

        # Use the list of stringified values as the compound key. Joining the
        # values with a delimiter made distinct combinations collide whenever
        # a value contained the delimiter ("a|b","c" vs "a","b|c"). The
        # strings are copied and the array frozen so that later in-place
        # changes to the row cannot corrupt the stored hash key.
        key = keys.collect { |k| row[k].to_s.dup }.freeze
        return nil if compound_key_constraints[key]

        compound_key_constraints[key] = 1
        row
      end
    end
  end
end
@@ -0,0 +1,26 @@
module ETL #:nodoc:
  module Processor #:nodoc:
    # Row processor that will copy one field to another
    #
    # Configuration options:
    # * <tt>:destination</tt>: The destination field
    # * <tt>:dest</tt>: Alias for :destination
    # * <tt>:source</tt>: The source field
    class CopyFieldProcessor < ETL::Processor::RowProcessor
      # Copy the source field of +row+ into the destination field and return
      # the row.
      def process(row)
        target_field = configuration[:destination] || configuration[:dest]
        value = row[configuration[:source]]

        # nil and numbers are stored as they are; any other value is
        # duplicated so the two fields do not share one object.
        row[target_field] = (value.nil? || value.is_a?(Numeric)) ? value : value.dup
        row
      end
    end
  end
end
@@ -0,0 +1,82 @@
module ETL
  module Processor
    # Row processor that runs a query against a target connection and copies
    # the configured fields from the query result into the row.
    class DatabaseJoinProcessor < ETL::Processor::RowProcessor
      attr_reader :target
      attr_reader :query
      attr_reader :fields

      # Initialize the processor.
      #
      # Arguments:
      # * <tt>control</tt>: The ETL::Control::Control instance
      # * <tt>configuration</tt>: The configuration Hash
      #
      # Required configuration options:
      # * <tt>:target</tt>: The target connection
      # * <tt>:query</tt>: The join query
      # * <tt>:fields</tt>: The fields to add to the row
      #
      # Raises ControlError when any of the required options is missing.
      def initialize(control, configuration)
        super
        @target = configuration[:target]
        @query = configuration[:query]
        @fields = configuration[:fields]
        raise ControlError, ":target must be specified" unless @target
        raise ControlError, ":query must be specified" unless @query
        raise ControlError, ":fields must be specified" unless @fields
      end

      # Get a String identifier ("host/database") for the target connection
      def to_s
        "#{host}/#{database}"
      end

      # Run the join query for +row+ and copy each configured field from the
      # result into the row (keyed by symbol). Returns the row, or nil when
      # the row is nil. Raises for adapters other than PostgreSQL and MySQL.
      def process(row)
        return nil if row.nil?

        # The query is evaluated as a double-quoted Ruby string so that it can
        # interpolate values from +row+. The local names +row+ and +q+ must
        # stay as they are: they are visible to the evaluated query.
        # NOTE(review): eval executes arbitrary Ruby embedded in the query;
        # the query must only ever come from trusted control files.
        q = @query
        begin
          q = eval('"' + @query + '"')
        rescue => e
          # Best effort: fall back to the raw query, but no longer silently.
          ETL::Engine.logger.warn("Could not interpolate join query, using it as is: #{e.message}")
        end

        ETL::Engine.logger.debug("Executing select: #{q}")
        res = connection.execute(q)

        case connection
        when ActiveRecord::ConnectionAdapters::PostgreSQLAdapter
          res.each { |r| copy_fields(row, r) }
        when ActiveRecord::ConnectionAdapters::MysqlAdapter
          res.each_hash { |r| copy_fields(row, r) }
          res.free
        else raise "Unsupported adapter #{connection.class} for this destination"
        end

        return row
      end

      private
      # Copy every configured field from the result record into the row
      def copy_fields(row, record)
        @fields.each do |field|
          row[field.to_sym] = record[field.to_s]
        end
      end

      # Get the database connection to use
      def connection
        ETL::Engine.connection(target)
      end

      # Get the host, defaults to 'localhost'
      def host
        ETL::Base.configurations[target.to_s]['host'] || 'localhost'
      end

      # Get the database name from the target's configuration
      def database
        ETL::Base.configurations[target.to_s]['database']
      end
    end
  end
end
@@ -0,0 +1,55 @@
1
+ require 'iconv'
2
+
module ETL #:nodoc:
  module Processor #:nodoc:
    # The encode processor uses Iconv to convert a file from one encoding (eg: utf-8) to another (eg: latin1), line by line.
    class EncodeProcessor < ETL::Processor::Processor

      # The file to load from
      attr_reader :source_file
      # The file to write to
      attr_reader :target_file
      # The source file encoding
      attr_reader :source_encoding
      # The target file encoding
      attr_reader :target_encoding

      # Initialize the processor.
      #
      # Configuration options:
      # * <tt>:source_file</tt>: The file to load data from
      # * <tt>:source_encoding</tt>: The source file encoding (eg: 'latin1','utf-8'), as supported by Iconv
      # * <tt>:target_file</tt>: The file to write data to
      # * <tt>:target_encoding</tt>: The target file encoding
      #
      # Both files are resolved against the directory of the control file.
      # Raises ControlError when a file is missing from the configuration,
      # when both files resolve to the same path, or when Iconv rejects one
      # of the encodings.
      def initialize(control, configuration)
        super
        raise ControlError, "Source file must be specified" if configuration[:source_file].nil?
        raise ControlError, "Target file must be specified" if configuration[:target_file].nil?

        base_dir = File.dirname(control.file)
        @source_file = File.join(base_dir, configuration[:source_file])
        @source_encoding = configuration[:source_encoding]
        @target_file = File.join(base_dir, configuration[:target_file])
        @target_encoding = configuration[:target_encoding]
        raise ControlError, "Source and target file cannot currently point to the same file" if source_file == target_file

        begin
          @iconv = Iconv.new(target_encoding,source_encoding)
        rescue Iconv::InvalidEncoding
          raise ControlError, "Either the source encoding '#{source_encoding}' or the target encoding '#{target_encoding}' is not supported"
        end
      end

      # Execute the processor: convert the source file into the target file.
      def process
        # operate line by line to handle large files without loading them in-memory
        # could be replaced by a system iconv call when available, for greater performance
        File.open(source_file) do |input|
          File.open(target_file,'w') do |output|
            input.each_line { |line| output << @iconv.iconv(line) }
          end
        end
      end
    end
  end
end
+ end