etl 0.9.5.rc1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +12 -0
- data/.yardopts +5 -0
- data/0.9-UPGRADE +6 -0
- data/CHANGELOG +236 -0
- data/Gemfile +4 -0
- data/HOW_TO_RELEASE +13 -0
- data/LICENSE +7 -0
- data/README.textile +111 -0
- data/Rakefile +105 -0
- data/TODO +28 -0
- data/activewarehouse-etl.gemspec +38 -0
- data/bin/etl +28 -0
- data/bin/etl.cmd +8 -0
- data/examples/database.example.yml +16 -0
- data/lib/etl.rb +97 -0
- data/lib/etl/batch.rb +2 -0
- data/lib/etl/batch/batch.rb +111 -0
- data/lib/etl/batch/directives.rb +65 -0
- data/lib/etl/builder.rb +2 -0
- data/lib/etl/builder/date_dimension_builder.rb +96 -0
- data/lib/etl/builder/time_dimension_builder.rb +31 -0
- data/lib/etl/commands/etl.rb +89 -0
- data/lib/etl/control.rb +3 -0
- data/lib/etl/control/control.rb +405 -0
- data/lib/etl/control/destination.rb +438 -0
- data/lib/etl/control/destination/csv_destination.rb +113 -0
- data/lib/etl/control/destination/database_destination.rb +97 -0
- data/lib/etl/control/destination/excel_destination.rb +91 -0
- data/lib/etl/control/destination/file_destination.rb +126 -0
- data/lib/etl/control/destination/insert_update_database_destination.rb +136 -0
- data/lib/etl/control/destination/update_database_destination.rb +109 -0
- data/lib/etl/control/destination/yaml_destination.rb +74 -0
- data/lib/etl/control/source.rb +132 -0
- data/lib/etl/control/source/database_source.rb +224 -0
- data/lib/etl/control/source/enumerable_source.rb +11 -0
- data/lib/etl/control/source/file_source.rb +90 -0
- data/lib/etl/control/source/model_source.rb +39 -0
- data/lib/etl/core_ext.rb +1 -0
- data/lib/etl/core_ext/time.rb +5 -0
- data/lib/etl/core_ext/time/calculations.rb +42 -0
- data/lib/etl/engine.rb +582 -0
- data/lib/etl/execution.rb +19 -0
- data/lib/etl/execution/base.rb +8 -0
- data/lib/etl/execution/batch.rb +10 -0
- data/lib/etl/execution/job.rb +8 -0
- data/lib/etl/execution/migration.rb +90 -0
- data/lib/etl/generator.rb +2 -0
- data/lib/etl/generator/generator.rb +20 -0
- data/lib/etl/generator/surrogate_key_generator.rb +39 -0
- data/lib/etl/http_tools.rb +139 -0
- data/lib/etl/parser.rb +11 -0
- data/lib/etl/parser/apache_combined_log_parser.rb +49 -0
- data/lib/etl/parser/csv_parser.rb +93 -0
- data/lib/etl/parser/excel_parser.rb +112 -0
- data/lib/etl/parser/fixed_width_parser.rb +65 -0
- data/lib/etl/parser/nokogiri_xml_parser.rb +83 -0
- data/lib/etl/parser/parser.rb +41 -0
- data/lib/etl/parser/sax_parser.rb +218 -0
- data/lib/etl/parser/xml_parser.rb +65 -0
- data/lib/etl/processor.rb +11 -0
- data/lib/etl/processor/block_processor.rb +14 -0
- data/lib/etl/processor/bulk_import_processor.rb +94 -0
- data/lib/etl/processor/check_exist_processor.rb +80 -0
- data/lib/etl/processor/check_unique_processor.rb +39 -0
- data/lib/etl/processor/copy_field_processor.rb +26 -0
- data/lib/etl/processor/database_join_processor.rb +82 -0
- data/lib/etl/processor/encode_processor.rb +55 -0
- data/lib/etl/processor/ensure_fields_presence_processor.rb +24 -0
- data/lib/etl/processor/escape_csv_processor.rb +77 -0
- data/lib/etl/processor/filter_row_processor.rb +51 -0
- data/lib/etl/processor/ftp_downloader_processor.rb +68 -0
- data/lib/etl/processor/ftp_uploader_processor.rb +65 -0
- data/lib/etl/processor/hierarchy_exploder_processor.rb +55 -0
- data/lib/etl/processor/imapattachment_downloader_processor.rb +91 -0
- data/lib/etl/processor/pop3attachment_downloader_processor.rb +90 -0
- data/lib/etl/processor/print_row_processor.rb +12 -0
- data/lib/etl/processor/processor.rb +25 -0
- data/lib/etl/processor/rename_processor.rb +24 -0
- data/lib/etl/processor/require_non_blank_processor.rb +26 -0
- data/lib/etl/processor/row_processor.rb +27 -0
- data/lib/etl/processor/sequence_processor.rb +23 -0
- data/lib/etl/processor/sftp_downloader_processor.rb +63 -0
- data/lib/etl/processor/sftp_uploader_processor.rb +63 -0
- data/lib/etl/processor/surrogate_key_processor.rb +53 -0
- data/lib/etl/processor/truncate_processor.rb +40 -0
- data/lib/etl/processor/zip_file_processor.rb +27 -0
- data/lib/etl/row.rb +20 -0
- data/lib/etl/screen.rb +14 -0
- data/lib/etl/screen/row_count_screen.rb +20 -0
- data/lib/etl/transform.rb +2 -0
- data/lib/etl/transform/block_transform.rb +13 -0
- data/lib/etl/transform/calculation_transform.rb +71 -0
- data/lib/etl/transform/date_to_string_transform.rb +20 -0
- data/lib/etl/transform/decode_transform.rb +51 -0
- data/lib/etl/transform/default_transform.rb +20 -0
- data/lib/etl/transform/foreign_key_lookup_transform.rb +211 -0
- data/lib/etl/transform/hierarchy_lookup_transform.rb +49 -0
- data/lib/etl/transform/md5_transform.rb +13 -0
- data/lib/etl/transform/ordinalize_transform.rb +14 -0
- data/lib/etl/transform/sha1_transform.rb +13 -0
- data/lib/etl/transform/split_fields_transform.rb +27 -0
- data/lib/etl/transform/string_to_date_time_transform.rb +14 -0
- data/lib/etl/transform/string_to_date_transform.rb +16 -0
- data/lib/etl/transform/string_to_time_transform.rb +11 -0
- data/lib/etl/transform/transform.rb +61 -0
- data/lib/etl/transform/trim_transform.rb +26 -0
- data/lib/etl/transform/type_transform.rb +35 -0
- data/lib/etl/util.rb +59 -0
- data/lib/etl/version.rb +3 -0
- data/spec/fixtures/all.ebf +6 -0
- data/spec/fixtures/apache_combined_log.ctl +11 -0
- data/spec/fixtures/batch_with_error.ebf +6 -0
- data/spec/fixtures/batched1.ctl +0 -0
- data/spec/fixtures/batched2.ctl +0 -0
- data/spec/fixtures/block_processor.ctl +6 -0
- data/spec/fixtures/block_processor_error.ctl +1 -0
- data/spec/fixtures/block_processor_pre_post_process.ctl +4 -0
- data/spec/fixtures/block_processor_remove_rows.ctl +5 -0
- data/spec/fixtures/data/apache_combined_log.txt +3 -0
- data/spec/fixtures/data/bulk_import.txt +3 -0
- data/spec/fixtures/data/bulk_import_with_empties.txt +3 -0
- data/spec/fixtures/data/decode.txt +3 -0
- data/spec/fixtures/data/delimited.txt +3 -0
- data/spec/fixtures/data/encode_source_latin1.txt +2 -0
- data/spec/fixtures/data/excel.xls +0 -0
- data/spec/fixtures/data/excel2.xls +0 -0
- data/spec/fixtures/data/fixed_width.txt +3 -0
- data/spec/fixtures/data/multiple_delimited_1.txt +3 -0
- data/spec/fixtures/data/multiple_delimited_2.txt +3 -0
- data/spec/fixtures/data/nokogiri.xml +38 -0
- data/spec/fixtures/data/people.txt +3 -0
- data/spec/fixtures/data/sax.xml +14 -0
- data/spec/fixtures/data/xml.xml +16 -0
- data/spec/fixtures/delimited.ctl +30 -0
- data/spec/fixtures/delimited_absolute.ctl +31 -0
- data/spec/fixtures/delimited_destination_db.ctl +23 -0
- data/spec/fixtures/delimited_excel.ctl +31 -0
- data/spec/fixtures/delimited_insert_update.ctl +34 -0
- data/spec/fixtures/delimited_update.ctl +34 -0
- data/spec/fixtures/delimited_with_bulk_load.ctl +34 -0
- data/spec/fixtures/errors.ctl +24 -0
- data/spec/fixtures/excel.ctl +24 -0
- data/spec/fixtures/excel2.ctl +25 -0
- data/spec/fixtures/fixed_width.ctl +35 -0
- data/spec/fixtures/inline_parser.ctl +17 -0
- data/spec/fixtures/model_source.ctl +14 -0
- data/spec/fixtures/multiple_delimited.ctl +22 -0
- data/spec/fixtures/multiple_source_delimited.ctl +39 -0
- data/spec/fixtures/nokogiri_all.ctl +35 -0
- data/spec/fixtures/nokogiri_select.ctl +35 -0
- data/spec/fixtures/output/.ignore +1 -0
- data/spec/fixtures/output/delimited.txt +3 -0
- data/spec/fixtures/output/encode_destination_utf-8.txt +2 -0
- data/spec/fixtures/output/fixed_width.txt +3 -0
- data/spec/fixtures/output/inline_parser.txt +3 -0
- data/spec/fixtures/output/multiple_source_delimited.txt +6 -0
- data/spec/fixtures/output/test_excel_destination.xls +0 -0
- data/spec/fixtures/output/test_file_destination.2.txt +2 -0
- data/spec/fixtures/output/test_file_destination.txt +2 -0
- data/spec/fixtures/output/test_multiple_unique.txt +1 -0
- data/spec/fixtures/output/test_unique.txt +2 -0
- data/spec/fixtures/sax.ctl +26 -0
- data/spec/fixtures/scd/1.txt +1 -0
- data/spec/fixtures/scd/2.txt +1 -0
- data/spec/fixtures/scd/3.txt +1 -0
- data/spec/fixtures/scd_test_type_1.ctl +43 -0
- data/spec/fixtures/scd_test_type_2.ctl +34 -0
- data/spec/fixtures/screen_test_error.ctl +3 -0
- data/spec/fixtures/screen_test_fatal.ctl +3 -0
- data/spec/fixtures/xml.ctl +31 -0
- data/spec/quality_spec.rb +11 -0
- data/spec/spec_helper.rb +10 -0
- data/spec/support/custom_fixtures.rb +54 -0
- data/spec/support/custom_matchers.rb +54 -0
- data/test-matrix.yml +10 -0
- data/test/.gitignore +1 -0
- data/test/.ignore +2 -0
- data/test/batch_test.rb +41 -0
- data/test/block_processor_test.rb +38 -0
- data/test/check_exist_processor_test.rb +92 -0
- data/test/check_unique_processor_test.rb +40 -0
- data/test/config/Gemfile.rails-2.3.x +3 -0
- data/test/config/Gemfile.rails-2.3.x.lock +53 -0
- data/test/config/Gemfile.rails-3.0.x +3 -0
- data/test/config/Gemfile.rails-3.0.x.lock +61 -0
- data/test/config/common.rb +29 -0
- data/test/connection/mysql/connection.rb +9 -0
- data/test/connection/mysql/schema.sql +37 -0
- data/test/connection/postgresql/connection.rb +13 -0
- data/test/connection/postgresql/schema.sql +40 -0
- data/test/control_test.rb +43 -0
- data/test/database_join_processor_test.rb +43 -0
- data/test/date_dimension_builder_test.rb +96 -0
- data/test/destination_test.rb +275 -0
- data/test/directive_test.rb +23 -0
- data/test/encode_processor_test.rb +32 -0
- data/test/engine_test.rb +78 -0
- data/test/ensure_fields_presence_processor_test.rb +28 -0
- data/test/etl_test.rb +42 -0
- data/test/foreign_key_lookup_transform_test.rb +50 -0
- data/test/generator_test.rb +14 -0
- data/test/mocks/mock_destination.rb +26 -0
- data/test/mocks/mock_source.rb +25 -0
- data/test/nokogiri_test.rb +35 -0
- data/test/parser_test.rb +224 -0
- data/test/performance/delimited.ctl +30 -0
- data/test/processor_test.rb +44 -0
- data/test/row_processor_test.rb +17 -0
- data/test/scd_test.rb +257 -0
- data/test/screen_test.rb +9 -0
- data/test/source_test.rb +154 -0
- data/test/test_helper.rb +37 -0
- data/test/transform_test.rb +101 -0
- data/test/truncate_processor_test.rb +37 -0
- metadata +510 -0
@@ -0,0 +1,65 @@
|
|
1
|
+
require 'rexml/document'
|
2
|
+
|
3
|
+
module ETL
  module Parser
    # Parser that loads XML files with REXML and yields one row (a Hash) for
    # every element matched by the collection XPath of the source definition.
    class XmlParser < ETL::Parser::Parser
      # Initialize the parser
      # * <tt>source</tt>: The Source object
      # * <tt>options</tt>: Parser options Hash
      #
      # Raises if the source definition has no <tt>:collection</tt> XPath, or
      # if a field definition is neither a Symbol nor a Hash.
      def initialize(source, options={})
        super
        configure
      end

      # Yields each row as a Hash mapping every field name to the text found
      # at that field's XPath, relative to the matched collection element.
      # Every file matched by the +file+ glob is parsed in turn.
      def each
        Dir.glob(file).each do |path|
          doc = nil
          elapsed = Benchmark.realtime do
            # REXML builds the whole tree inside Document.new, so the handle
            # can be closed as soon as the constructor returns. The block form
            # of File.open guarantees that, even when parsing raises.
            doc = File.open(path) { |io| REXML::Document.new(io) }
          end
          Engine.logger.info "XML #{path} parsed in #{elapsed}s"
          doc.elements.each(@collection_xpath) do |element|
            row = {}
            fields.each do |f|
              row[f.name] = element.text(f.xpath)
            end
            yield row
          end
        end
      end

      # Get an array of defined fields
      def fields
        @fields ||= []
      end

      private

      # Reads the collection XPath and the field list from the source
      # definition and populates +fields+.
      def configure
        @collection_xpath = source.definition[:collection]
        raise "Collection XPath is required" if @collection_xpath.nil?

        source.definition[:fields].each do |options|
          case options
          when Symbol
            # A bare symbol is both the field name and its XPath
            fields << Field.new(options, options.to_s)
          when Hash
            # The XPath defaults to the field name (this writes the default
            # back into the definition hash, as the code always has)
            options[:xpath] ||= options[:name]
            fields << Field.new(options[:name], options[:xpath].to_s)
          else
            raise DefinitionError, "Each field definition must either be an symbol or a hash of options for the field"
          end
        end
      end

      # Pairs a row key (+name+) with the XPath used to extract its value.
      class Field
        attr_reader :name, :xpath

        def initialize(name, xpath)
          @name = name
          @xpath = xpath
        end
      end
    end
  end
end
|
@@ -0,0 +1,11 @@
|
|
1
|
+
# This source file contains the ETL::Processor module and requires all of the processors
|
2
|
+
|
3
|
+
module ETL #:nodoc:
  # Namespace for the row-level and bulk processors. It is declared empty
  # here; the individual processor classes reopen it in their own files.
  module Processor; end
end
|
8
|
+
|
9
|
+
require 'etl/processor/processor'
|
10
|
+
require 'etl/processor/row_processor'
|
11
|
+
# Load every processor implementation that sits in the processor/ directory
# next to this file. The list is sorted so the load order is the same on every
# platform; Dir[] only guarantees a sorted result from Ruby 3.0 onwards.
Dir[File.join(File.dirname(__FILE__), "processor", "*.rb")].sort.each { |file| require(file) }
|
@@ -0,0 +1,14 @@
|
|
1
|
+
module ETL
  module Processor
    # This processor is both a valid RowProcessor (called on each row with
    # after_read) or a Processor (called once on pre_process or post_process).
    # All of the work is delegated to the callable passed in the configuration.
    class BlockProcessor < ETL::Processor::RowProcessor
      # Configuration options:
      # * <tt>:block</tt>: The callable invoked by +process+
      def initialize(control, configuration)
        super
        @block = configuration[:block]
      end

      # Invokes the configured block with +row+ and returns whatever the block
      # returns. +row+ is nil when the processor is not used at row level.
      def process(row=nil)
        @block.(row)
      end
    end
  end
end
|
@@ -0,0 +1,94 @@
|
|
1
|
+
module ETL #:nodoc:
  module Processor #:nodoc:
    # Processor which is used to bulk import data into a target database. The
    # underlying database driver from ActiveRecord must support the
    # +bulk_load+ method.
    class BulkImportProcessor < ETL::Processor::Processor

      # The file to load from
      attr_reader :file
      # The target database
      attr_reader :target
      # The table name
      attr_reader :table
      # Set to true to truncate
      attr_reader :truncate
      # Array of symbols representing the column load order
      attr_reader :columns
      # The field separator (defaults to a comma)
      attr_accessor :field_separator
      # The field enclosure (defaults to nil)
      attr_accessor :field_enclosure
      # The line separator (defaults to a newline)
      attr_accessor :line_separator
      # The string that indicates a NULL (defaults to an empty string)
      attr_accessor :null_string
      # boolean that indicates disable keys before, then enable after load (MySql only optimization)
      attr_accessor :disable_keys
      # replace existing records, not just insert
      attr_accessor :replace

      # Initialize the processor.
      #
      # Configuration options:
      # * <tt>:file</tt>: The file to load data from. A relative path is
      #   resolved against the directory of the control file.
      # * <tt>:target</tt>: The target database
      # * <tt>:table</tt>: The table name
      # * <tt>:truncate</tt>: Set to true to truncate before loading
      # * <tt>:columns</tt>: The columns to load in the order they appear in
      #   the bulk data file
      # * <tt>:field_separator</tt>: The field separator. Defaults to a comma
      # * <tt>:line_separator</tt>: The line separator. Defaults to a newline
      # * <tt>:field_enclosure</tt>: The field enclosure characters
      # * <tt>:null_string</tt>: The string that indicates a NULL. Defaults to
      #   an empty string
      # * <tt>:disable_keys</tt>: Set to true to disable keys before, then enable after load (MySql only optimization)
      # * <tt>:replace</tt>: Set to true to replace existing records, not just
      #   insert
      #
      # Raises ControlError when :target, :table or :file is missing.
      def initialize(control, configuration)
        super
        @target = configuration[:target]
        @table = configuration[:table]

        raise ControlError, "Target must be specified" unless @target
        raise ControlError, "Table must be specified" unless @table
        # Without this check a missing :file surfaced as a TypeError raised
        # from Pathname.new(nil)
        raise ControlError, "File must be specified" unless configuration[:file]

        path = Pathname.new(configuration[:file])
        @file = path.absolute? ? path : Pathname.new(File.dirname(File.expand_path(control.file))) + path

        # Plain || keeps the caller's configuration hash untouched
        @truncate = configuration[:truncate] || false
        @columns = configuration[:columns]
        @field_separator = (configuration[:field_separator] || ',')
        @line_separator = (configuration[:line_separator] || "\n")
        @null_string = (configuration[:null_string] || "")
        @field_enclosure = configuration[:field_enclosure]
        @disable_keys = configuration[:disable_keys] || false
        @replace = configuration[:replace] || false
      end

      # Execute the processor: optionally truncate the table, then bulk load
      # the file, both inside a single transaction. Does nothing when bulk
      # imports are disabled on the engine or when the file is empty.
      def process
        return if ETL::Engine.skip_bulk_import
        return if File.size(file) == 0

        conn = ETL::Engine.connection(target)
        conn.transaction do
          conn.truncate(table_name) if truncate

          options = {}
          options[:columns] = columns
          options[:disable_keys] = true if disable_keys
          options[:replace] = true if replace

          # Only pass along the format settings that are actually set; the
          # accessors allow any of them to be cleared after construction.
          field_options = {
            :null_string   => null_string,
            :delimited_by  => field_separator,
            :enclosed_by   => field_enclosure,
            :terminated_by => line_separator
          }.reject { |_, value| !value }
          options[:fields] = field_options unless field_options.empty?

          conn.bulk_load(file, table_name, options)
        end
      end

      # The table name as resolved by the engine for the target connection
      def table_name
        ETL::Engine.table(table, ETL::Engine.connection(target))
      end
    end
  end
end
|
@@ -0,0 +1,80 @@
|
|
1
|
+
module ETL #:nodoc:
  module Processor #:nodoc:
    # A row-level processor that checks if the row already exists in the
    # target table
    class CheckExistProcessor < ETL::Processor::RowProcessor
      # A symbol or array of symbols representing keys that should be skipped
      attr_accessor :skip

      # The target database
      attr_accessor :target

      # The name of the table to check against
      attr_accessor :table

      # An array of columns representing the natural key
      attr_accessor :columns

      # Is set to true if the processor should execute the check. If there are
      # no rows in the target table then this should return false.
      attr_accessor :should_check

      # Initialize the processor
      # Configuration options:
      # * <tt>:columns</tt>: An array of symbols for columns that should be included in the query conditions. If this option is not specified then all of the columns in the row will be included in the conditions (unless :skip is specified).
      # * <tt>:skip</tt>: A symbol or array of symbols that should not be included in the existence check. If this option is not specified then all of the columns will be included in the existence check (unless :columns is specified).
      # * <tt>:target</tt>: The target connection
      # * <tt>:table</tt>: The table name
      #
      # Raises ETL::ControlError when :target or :table is missing. The target
      # table is counted once here; when it is empty, +process+ lets every row
      # through without querying.
      def initialize(control, configuration)
        super
        @skip = configuration[:skip] || []
        @target = configuration[:target] || raise(ETL::ControlError, "target must be specified")
        @table = configuration[:table] || raise(ETL::ControlError, "table must be specified")
        @columns = configuration[:columns]

        q = "SELECT COUNT(*) FROM #{table_name}"
        @should_check = ETL::Engine.connection(target).select_value(q).to_i > 0
      end

      # Return true if the given key should be skipped
      def skip?(key)
        case skip
        when Array
          skip.include?(key)
        else
          skip.to_sym == key.to_sym
        end
      end

      # Return true if the row should be checked
      def should_check?
        @should_check ? true : false
      end

      # Process the row. Returns the row when no matching record exists in the
      # target table (or when the check is disabled), and nil when a matching
      # record is found.
      def process(row)
        return row unless should_check?

        conn = ETL::Engine.connection(target)
        ensure_columns_available_in_row!(row, columns, 'for existence check')

        conditions = []
        row.each do |k, v|
          next unless columns.nil? || columns.include?(k.to_sym)
          next if skip?(k.to_sym)
          # "col = NULL" is never true in SQL, so a nil value has to be
          # compared with IS NULL or the existing row would never be found.
          conditions << (v.nil? ? "#{k} IS NULL" : "#{k} = #{conn.quote(v)}")
        end

        q = "SELECT * FROM #{table_name} WHERE "
        q << conditions.join(" AND ")
        q << " LIMIT 1"

        conn.select_one(q).nil? ? row : nil
      end

      private

      # The table name as resolved by the engine for the target connection
      def table_name
        ETL::Engine.table(table, ETL::Engine.connection(target))
      end
    end
  end
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
module ETL #:nodoc:
  module Processor #:nodoc:
    # Row processor that checks whether or not the row has already passed
    # through the ETL processor, using the key fields provided as the keys
    # to check.
    class CheckUniqueProcessor < ETL::Processor::RowProcessor

      # The keys to check
      attr_accessor :keys

      # Initialize the processor
      # Configuration options:
      # * <tt>:keys</tt>: An array of keys to check against
      def initialize(control, configuration)
        super
        @keys = configuration[:keys]
      end

      # A Hash of keys that have already been processed.
      def compound_key_constraints
        @compound_key_constraints ||= {}
      end

      # Process the row. This implementation will only return a row if its
      # key combination has not already been seen; otherwise it returns nil.
      #
      # An error will be raised if the row doesn't include the keys.
      def process(row)
        ensure_columns_available_in_row!(row, keys, 'for unicity check')

        key = compound_key_for(row)
        return nil if compound_key_constraints[key]

        compound_key_constraints[key] = 1
        row
      end

      private

      # Builds the '|'-separated compound key for the row. Backslashes and
      # pipes inside a value are escaped so that different value combinations
      # (e.g. "a|b","c" and "a","b|c") can never produce the same key. Values
      # without those characters yield the same key string as before.
      def compound_key_for(row)
        keys.map { |k| row[k].to_s.gsub(/[\\|]/) { |char| "\\#{char}" } }.join('|')
      end
    end
  end
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
module ETL #:nodoc:
  module Processor #:nodoc:
    # Row processor that will copy one field to another
    #
    # Configuration options:
    # * <tt>:destination</tt>: The destination field
    # * <tt>:dest</tt>: Alias for :destination
    # * <tt>:source</tt>: The source field
    class CopyFieldProcessor < ETL::Processor::RowProcessor
      # Process the given row: store a copy of the source field's value under
      # the destination key and return the row.
      def process(row)
        destination = (configuration[:destination] || configuration[:dest])
        source_value = row[configuration[:source]]

        row[destination] =
          case source_value
          when Numeric, Symbol, true, false, nil
            # Immutable values are copied by reference. Calling dup on
            # true, false or a Symbol raises TypeError before Ruby 2.4.
            source_value
          else
            # Everything else is duplicated so that later in-place changes
            # to one field do not leak into the other.
            source_value.dup
          end
        row
      end
    end
  end
end
|
@@ -0,0 +1,82 @@
|
|
1
|
+
module ETL
  module Processor
    # Row processor that runs a query against the target connection for each
    # row and copies the configured fields from the query result into the row.
    class DatabaseJoinProcessor < ETL::Processor::RowProcessor
      # Adapter class names, kept as strings so that checking for one adapter
      # does not require the other adapter's class to be loaded.
      POSTGRESQL_ADAPTER = 'ActiveRecord::ConnectionAdapters::PostgreSQLAdapter'
      MYSQL_ADAPTER = 'ActiveRecord::ConnectionAdapters::MysqlAdapter'

      attr_reader :target
      attr_reader :query
      attr_reader :fields

      # Initialize the processor.
      #
      # Arguments:
      # * <tt>control</tt>: The ETL::Control::Control instance
      # * <tt>configuration</tt>: The configuration Hash
      #
      # Required configuration options:
      # * <tt>:target</tt>: The target connection
      # * <tt>:query</tt>: The join query
      # * <tt>:fields</tt>: The fields to add to the row
      #
      # Raises ControlError when any of the required options is missing.
      def initialize(control, configuration)
        super
        @target = configuration[:target]
        @query = configuration[:query]
        @fields = configuration[:fields]
        raise ControlError, ":target must be specified" unless @target
        raise ControlError, ":query must be specified" unless @query
        raise ControlError, ":fields must be specified" unless @fields
      end

      # Get a String identifier for the source
      def to_s
        "#{host}/#{database}"
      end

      # Runs the join query for the row and copies each configured field from
      # every result record into the row (later records overwrite earlier
      # ones). Returns the row, or nil when the row is nil.
      #
      # Raises a RuntimeError for adapters other than PostgreSQL and MySQL.
      def process(row)
        return nil if row.nil?

        q = @query
        begin
          # The query is evaluated as a double-quoted Ruby string so that it
          # can interpolate values from +row+.
          # NOTE(review): this is eval on the configured query string; it must
          # only ever come from a trusted control file.
          q = eval('"' + @query + '"')
        rescue StandardError, SyntaxError => e
          # Best effort: fall back to the query as written, but leave a trace
          # instead of hiding the failure. SyntaxError is listed explicitly
          # because a bare rescue does not catch it.
          ETL::Engine.logger.debug("Could not interpolate join query (#{e.class}: #{e.message}); using it as written")
        end

        ETL::Engine.logger.debug("Executing select: #{q}")
        conn = connection
        res = conn.execute(q)

        if adapter?(conn, POSTGRESQL_ADAPTER)
          res.each { |r| copy_fields(r, row) }
        elsif adapter?(conn, MYSQL_ADAPTER)
          begin
            res.each_hash { |r| copy_fields(r, row) }
          ensure
            # Release the result set even when copying raised
            res.free
          end
        else
          raise "Unsupported adapter #{conn.class} for this destination"
        end

        row
      end

      private

      # Get the database connection to use
      def connection
        ETL::Engine.connection(target)
      end

      # Get the host, defaults to 'localhost'
      def host
        ETL::Base.configurations[target.to_s]['host'] || 'localhost'
      end

      # Get the database name from the target's configuration
      def database
        ETL::Base.configurations[target.to_s]['database']
      end

      # True when the connection's class is, or inherits from, the adapter
      # class with the given name. Comparing names avoids a NameError when
      # that adapter was never loaded.
      def adapter?(conn, class_name)
        conn.class.ancestors.any? { |ancestor| ancestor.name == class_name }
      end

      # Copies each configured field from a result record into the row
      def copy_fields(record, row)
        @fields.each do |field|
          row[field.to_sym] = record[field.to_s]
        end
      end
    end
  end
end
|
@@ -0,0 +1,55 @@
|
|
1
|
+
require 'iconv'
|
2
|
+
|
3
|
+
module ETL #:nodoc:
  module Processor #:nodoc:
    # The encode processor uses Iconv to convert a file from one encoding (eg: utf-8) to another (eg: latin1), line by line.
    class EncodeProcessor < ETL::Processor::Processor

      # The file to load from
      attr_reader :source_file
      # The file to write to
      attr_reader :target_file
      # The source file encoding
      attr_reader :source_encoding
      # The target file encoding
      attr_reader :target_encoding

      # Initialize the processor.
      #
      # Configuration options:
      # * <tt>:source_file</tt>: The file to load data from
      # * <tt>:source_encoding</tt>: The source file encoding (eg: 'latin1','utf-8'), as supported by Iconv
      # * <tt>:target_file</tt>: The file to write data to
      # * <tt>:target_encoding</tt>: The target file encoding
      #
      # Both files are resolved relative to the directory of the control file.
      # Raises ControlError when a file is missing from the configuration,
      # when both files resolve to the same path, or when Iconv rejects one of
      # the encodings.
      def initialize(control, configuration)
        super
        raise ControlError, "Source file must be specified" if configuration[:source_file].nil?
        raise ControlError, "Target file must be specified" if configuration[:target_file].nil?

        base_dir = File.dirname(control.file)
        @source_file = File.join(base_dir, configuration[:source_file])
        @source_encoding = configuration[:source_encoding]
        @target_file = File.join(base_dir, configuration[:target_file])
        @target_encoding = configuration[:target_encoding]
        raise ControlError, "Source and target file cannot currently point to the same file" if source_file == target_file

        begin
          @iconv = Iconv.new(target_encoding,source_encoding)
        rescue Iconv::InvalidEncoding
          raise ControlError, "Either the source encoding '#{source_encoding}' or the target encoding '#{target_encoding}' is not supported"
        end
      end

      # Execute the processor
      def process
        # operate line by line to handle large files without loading them in-memory
        # could be replaced by a system iconv call when available, for greater performance
        File.open(source_file) do |source|
          File.open(target_file,'w') do |target|
            source.each_line do |line|
              target << @iconv.iconv(line)
            end
            # Passing nil resets the converter and returns the bytes needed to
            # bring the output back to its initial shift state. Stateful
            # target encodings need that trailer; for stateless ones it is an
            # empty string. It also leaves the converter clean for a re-run.
            target << @iconv.iconv(nil)
          end
        end
      end
    end
  end
end
|