activewarehouse-etl-sgonyea 0.9.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +9 -0
- data/0.9-UPGRADE +6 -0
- data/CHANGELOG +236 -0
- data/Gemfile +4 -0
- data/HOW_TO_RELEASE +13 -0
- data/LICENSE +7 -0
- data/README.textile +111 -0
- data/Rakefile +103 -0
- data/TODO +28 -0
- data/active_support_logger.patch +78 -0
- data/activewarehouse-etl.gemspec +36 -0
- data/bin/etl +28 -0
- data/bin/etl.cmd +8 -0
- data/examples/database.example.yml +16 -0
- data/lib/etl.rb +97 -0
- data/lib/etl/batch.rb +2 -0
- data/lib/etl/batch/batch.rb +111 -0
- data/lib/etl/batch/directives.rb +65 -0
- data/lib/etl/builder.rb +2 -0
- data/lib/etl/builder/date_dimension_builder.rb +96 -0
- data/lib/etl/builder/time_dimension_builder.rb +31 -0
- data/lib/etl/commands/etl.rb +89 -0
- data/lib/etl/control.rb +3 -0
- data/lib/etl/control/control.rb +405 -0
- data/lib/etl/control/destination.rb +438 -0
- data/lib/etl/control/destination/csv_destination.rb +113 -0
- data/lib/etl/control/destination/database_destination.rb +97 -0
- data/lib/etl/control/destination/excel_destination.rb +91 -0
- data/lib/etl/control/destination/file_destination.rb +126 -0
- data/lib/etl/control/destination/insert_update_database_destination.rb +136 -0
- data/lib/etl/control/destination/update_database_destination.rb +109 -0
- data/lib/etl/control/destination/yaml_destination.rb +74 -0
- data/lib/etl/control/source.rb +132 -0
- data/lib/etl/control/source/database_source.rb +224 -0
- data/lib/etl/control/source/enumerable_source.rb +11 -0
- data/lib/etl/control/source/file_source.rb +90 -0
- data/lib/etl/control/source/model_source.rb +39 -0
- data/lib/etl/core_ext.rb +1 -0
- data/lib/etl/core_ext/time.rb +5 -0
- data/lib/etl/core_ext/time/calculations.rb +42 -0
- data/lib/etl/engine.rb +582 -0
- data/lib/etl/execution.rb +19 -0
- data/lib/etl/execution/base.rb +8 -0
- data/lib/etl/execution/batch.rb +10 -0
- data/lib/etl/execution/job.rb +8 -0
- data/lib/etl/execution/migration.rb +90 -0
- data/lib/etl/generator.rb +2 -0
- data/lib/etl/generator/generator.rb +20 -0
- data/lib/etl/generator/surrogate_key_generator.rb +39 -0
- data/lib/etl/http_tools.rb +139 -0
- data/lib/etl/parser.rb +11 -0
- data/lib/etl/parser/apache_combined_log_parser.rb +49 -0
- data/lib/etl/parser/csv_parser.rb +93 -0
- data/lib/etl/parser/excel_parser.rb +112 -0
- data/lib/etl/parser/fixed_width_parser.rb +65 -0
- data/lib/etl/parser/nokogiri_xml_parser.rb +83 -0
- data/lib/etl/parser/parser.rb +41 -0
- data/lib/etl/parser/sax_parser.rb +218 -0
- data/lib/etl/parser/xml_parser.rb +65 -0
- data/lib/etl/processor.rb +11 -0
- data/lib/etl/processor/block_processor.rb +14 -0
- data/lib/etl/processor/bulk_import_processor.rb +94 -0
- data/lib/etl/processor/check_exist_processor.rb +80 -0
- data/lib/etl/processor/check_unique_processor.rb +39 -0
- data/lib/etl/processor/copy_field_processor.rb +26 -0
- data/lib/etl/processor/database_join_processor.rb +82 -0
- data/lib/etl/processor/encode_processor.rb +55 -0
- data/lib/etl/processor/ensure_fields_presence_processor.rb +24 -0
- data/lib/etl/processor/escape_csv_processor.rb +77 -0
- data/lib/etl/processor/filter_row_processor.rb +51 -0
- data/lib/etl/processor/ftp_downloader_processor.rb +68 -0
- data/lib/etl/processor/ftp_uploader_processor.rb +65 -0
- data/lib/etl/processor/hierarchy_exploder_processor.rb +55 -0
- data/lib/etl/processor/imapattachment_downloader_processor.rb +91 -0
- data/lib/etl/processor/pop3attachment_downloader_processor.rb +90 -0
- data/lib/etl/processor/print_row_processor.rb +12 -0
- data/lib/etl/processor/processor.rb +25 -0
- data/lib/etl/processor/rename_processor.rb +24 -0
- data/lib/etl/processor/require_non_blank_processor.rb +26 -0
- data/lib/etl/processor/row_processor.rb +27 -0
- data/lib/etl/processor/sequence_processor.rb +23 -0
- data/lib/etl/processor/sftp_downloader_processor.rb +63 -0
- data/lib/etl/processor/sftp_uploader_processor.rb +63 -0
- data/lib/etl/processor/surrogate_key_processor.rb +53 -0
- data/lib/etl/processor/truncate_processor.rb +40 -0
- data/lib/etl/processor/zip_file_processor.rb +27 -0
- data/lib/etl/row.rb +20 -0
- data/lib/etl/screen.rb +14 -0
- data/lib/etl/screen/row_count_screen.rb +20 -0
- data/lib/etl/transform.rb +2 -0
- data/lib/etl/transform/block_transform.rb +13 -0
- data/lib/etl/transform/calculation_transform.rb +71 -0
- data/lib/etl/transform/date_to_string_transform.rb +20 -0
- data/lib/etl/transform/decode_transform.rb +51 -0
- data/lib/etl/transform/default_transform.rb +20 -0
- data/lib/etl/transform/foreign_key_lookup_transform.rb +211 -0
- data/lib/etl/transform/hierarchy_lookup_transform.rb +49 -0
- data/lib/etl/transform/md5_transform.rb +13 -0
- data/lib/etl/transform/ordinalize_transform.rb +14 -0
- data/lib/etl/transform/sha1_transform.rb +13 -0
- data/lib/etl/transform/split_fields_transform.rb +27 -0
- data/lib/etl/transform/string_to_date_time_transform.rb +14 -0
- data/lib/etl/transform/string_to_date_transform.rb +16 -0
- data/lib/etl/transform/string_to_time_transform.rb +11 -0
- data/lib/etl/transform/transform.rb +61 -0
- data/lib/etl/transform/trim_transform.rb +26 -0
- data/lib/etl/transform/type_transform.rb +35 -0
- data/lib/etl/util.rb +59 -0
- data/lib/etl/version.rb +3 -0
- data/test-matrix.yml +10 -0
- data/test/.gitignore +1 -0
- data/test/.ignore +2 -0
- data/test/all.ebf +6 -0
- data/test/apache_combined_log.ctl +11 -0
- data/test/batch_test.rb +41 -0
- data/test/batch_with_error.ebf +6 -0
- data/test/batched1.ctl +0 -0
- data/test/batched2.ctl +0 -0
- data/test/block_processor.ctl +6 -0
- data/test/block_processor_error.ctl +1 -0
- data/test/block_processor_pre_post_process.ctl +4 -0
- data/test/block_processor_remove_rows.ctl +5 -0
- data/test/block_processor_test.rb +38 -0
- data/test/check_exist_processor_test.rb +92 -0
- data/test/check_unique_processor_test.rb +40 -0
- data/test/config/Gemfile.rails-2.3.x +3 -0
- data/test/config/Gemfile.rails-2.3.x.lock +53 -0
- data/test/config/Gemfile.rails-3.0.x +3 -0
- data/test/config/Gemfile.rails-3.0.x.lock +61 -0
- data/test/config/common.rb +29 -0
- data/test/connection/mysql/connection.rb +9 -0
- data/test/connection/mysql/schema.sql +37 -0
- data/test/connection/postgresql/connection.rb +13 -0
- data/test/connection/postgresql/schema.sql +40 -0
- data/test/control_test.rb +43 -0
- data/test/data/apache_combined_log.txt +3 -0
- data/test/data/bulk_import.txt +3 -0
- data/test/data/bulk_import_with_empties.txt +3 -0
- data/test/data/decode.txt +3 -0
- data/test/data/delimited.txt +3 -0
- data/test/data/encode_source_latin1.txt +2 -0
- data/test/data/excel.xls +0 -0
- data/test/data/excel2.xls +0 -0
- data/test/data/fixed_width.txt +3 -0
- data/test/data/multiple_delimited_1.txt +3 -0
- data/test/data/multiple_delimited_2.txt +3 -0
- data/test/data/nokogiri.xml +38 -0
- data/test/data/people.txt +3 -0
- data/test/data/sax.xml +14 -0
- data/test/data/xml.xml +16 -0
- data/test/database_join_processor_test.rb +43 -0
- data/test/date_dimension_builder_test.rb +96 -0
- data/test/delimited.ctl +30 -0
- data/test/delimited_absolute.ctl +31 -0
- data/test/delimited_destination_db.ctl +23 -0
- data/test/delimited_excel.ctl +31 -0
- data/test/delimited_insert_update.ctl +34 -0
- data/test/delimited_update.ctl +34 -0
- data/test/delimited_with_bulk_load.ctl +34 -0
- data/test/destination_test.rb +275 -0
- data/test/directive_test.rb +23 -0
- data/test/encode_processor_test.rb +32 -0
- data/test/engine_test.rb +78 -0
- data/test/ensure_fields_presence_processor_test.rb +28 -0
- data/test/errors.ctl +24 -0
- data/test/etl_test.rb +42 -0
- data/test/excel.ctl +24 -0
- data/test/excel2.ctl +25 -0
- data/test/fixed_width.ctl +35 -0
- data/test/foreign_key_lookup_transform_test.rb +50 -0
- data/test/generator_test.rb +14 -0
- data/test/inline_parser.ctl +17 -0
- data/test/mocks/mock_destination.rb +26 -0
- data/test/mocks/mock_source.rb +25 -0
- data/test/model_source.ctl +14 -0
- data/test/multiple_delimited.ctl +22 -0
- data/test/multiple_source_delimited.ctl +39 -0
- data/test/nokogiri_all.ctl +35 -0
- data/test/nokogiri_select.ctl +35 -0
- data/test/nokogiri_test.rb +35 -0
- data/test/parser_test.rb +224 -0
- data/test/performance/delimited.ctl +30 -0
- data/test/processor_test.rb +44 -0
- data/test/row_processor_test.rb +17 -0
- data/test/sax.ctl +26 -0
- data/test/scd/1.txt +1 -0
- data/test/scd/2.txt +1 -0
- data/test/scd/3.txt +1 -0
- data/test/scd_test.rb +257 -0
- data/test/scd_test_type_1.ctl +43 -0
- data/test/scd_test_type_2.ctl +34 -0
- data/test/screen_test.rb +9 -0
- data/test/screen_test_error.ctl +3 -0
- data/test/screen_test_fatal.ctl +3 -0
- data/test/source_test.rb +154 -0
- data/test/test_helper.rb +37 -0
- data/test/transform_test.rb +101 -0
- data/test/truncate_processor_test.rb +37 -0
- data/test/xml.ctl +31 -0
- metadata +370 -0
module ETL #:nodoc:
  module Control #:nodoc:
    # Destination which writes directly to a database. This is useful when you are dealing with
    # a small amount of data. For larger amounts of data you should probably use the bulk
    # loader if it is supported with your target database as it will use a much faster load
    # method.
    class UpdateDatabaseDestination < Destination
      # The target connection
      attr_reader :target

      # The table
      attr_reader :table

      # Specify the order from the source
      attr_reader :order

      # Specify the conditions from the source
      attr_reader :conditions

      # Initialize the database destination
      #
      # * <tt>control</tt>: The ETL::Control::Control instance
      # * <tt>configuration</tt>: The configuration Hash
      # * <tt>mapping</tt>: The mapping
      #
      # Configuration options:
      # * <tt>:database</tt>: The database name (REQUIRED)
      # * <tt>:target</tt>: The target connection (REQUIRED)
      # * <tt>:table</tt>: The table to write to (REQUIRED)
      # * <tt>:unique</tt>: Set to true to only insert unique records (defaults to false)
      # * <tt>:append_rows</tt>: Array of rows to append
      #
      # Mapping options:
      # * <tt>:order</tt>: The order of fields to write (REQUIRED)
      # * <tt>:conditions</tt>: The conditions on the fields to update (REQUIRED)
      #
      # Raises ControlError when any of :target, :table, :order or :conditions
      # is missing.
      def initialize(control, configuration, mapping={})
        super
        @target = configuration[:target]
        @table = configuration[:table]
        # When a unique-field list is given, the SCD effective-date field is
        # appended so slowly-changing-dimension rows are not treated as dupes.
        @unique = configuration[:unique] ? configuration[:unique] + [scd_effective_date_field] : configuration[:unique]
        @unique.uniq! unless @unique.nil?
        @order = mapping[:order] ? mapping[:order] + scd_required_fields : order_from_source
        @order.uniq! unless @order.nil?
        @conditions = mapping[:conditions] ? mapping[:conditions] + scd_required_fields : nil
        @conditions.uniq! unless @conditions.nil?
        raise ControlError, "Conditions required in mapping" unless @conditions
        raise ControlError, "Order required in mapping" unless @order
        raise ControlError, "Table required" unless @table
        raise ControlError, "Target required" unless @target
      end

      # Flush the currently buffered data. Each buffered row that passes
      # row_allowed? is written with a single UPDATE statement inside one
      # transaction; the buffer is cleared afterwards.
      def flush
        conn.transaction do
          buffer.flatten.each do |row|
            # check to see if this row's compound key constraint already exists
            # note that the compound key constraint may not utilize virtual fields
            next unless row_allowed?(row)

            # add any virtual fields
            add_virtuals!(row)

            conditionsfilter = []
            conditions.each do |cond|
              c = " #{cond[:field]} #{cond[:comp]} #{cond[:value]} "
              condition = c
              begin
                # SECURITY NOTE(review): eval re-interpolates the condition
                # string so :value may reference row fields (e.g. '#{row[:id]}').
                # Anything placed in the mapping conditions is executed as Ruby
                # and lands unescaped in the SQL below — never feed untrusted
                # data through this path. On any eval error the raw string is
                # used as-is (the rescue is a deliberate fallback).
                condition = eval('"' + c + '"')
              rescue
              end
              conditionsfilter << condition
            end

            updatevalues = []
            order.each do |name|
              updatevalues << "#{conn.quote_column_name(name)} = #{conn.quote(row[name])}"
            end
            q = "UPDATE #{conn.quote_table_name(table_name)} SET #{updatevalues.join(',')} WHERE #{conditionsfilter.join(' AND ')}"
            ETL::Engine.logger.debug("Executing update: #{q}")
            conn.update(q, "Update row #{current_row}")
            @current_row += 1
          end
          buffer.clear
        end
      end

      # Close the connection: append any configured :append_rows to the
      # buffer, then flush.
      def close
        buffer << append_rows if append_rows
        flush
      end

      private
      # Lazily obtain (and memoize) the target connection from the engine.
      def conn
        @conn ||= begin
          conn = ETL::Engine.connection(target)
          conn
        rescue
          raise RuntimeError, "Problem to connect to db"
        end
      end

      # Resolve the (possibly aliased) table name through the engine.
      def table_name
        ETL::Engine.table(table, ETL::Engine.connection(target))
      end

    end
  end
end
require 'yaml'

module ETL #:nodoc:
  module Control #:nodoc:
    # Destination which serializes each row as a YAML document appended to a
    # single output file.
    class YamlDestination < Destination
      attr_reader :file, :append, :only, :except
      # Initialize the object.
      # * <tt>control</tt>: The Control object
      # * <tt>configuration</tt>: The configuration map
      # * <tt>mapping</tt>: The output mapping
      #
      # Configuration options:
      # * <tt>:file</tt>: The file to write to (REQUIRED)
      # * <tt>:append</tt>: Set to true to append to the file (default is to overwrite)
      # * <tt>:only</tt>: Whitelist of row keys to serialize
      # * <tt>:except</tt>: Blacklist of row keys to skip
      #
      # Raises ControlError when both :only and :except are specified.
      def initialize(control, configuration, mapping={})
        super
        # The output path is resolved relative to the control file's directory.
        @file = File.join(File.dirname(control.file), configuration[:file])
        @append = configuration[:append] ||= false
        @only = configuration[:only]
        @except = configuration[:except]
        raise ControlError, "the :only and :except options must be used separately, do not specify both" if @only && @except
      end

      # Close the destination. This will flush the buffer and close the underlying stream or connection.
      def close
        flush
        f.close
      end

      # Flush the destination buffer: dump each allowed row as YAML and clear
      # the buffer.
      def flush
        #puts "Flushing buffer (#{file}) with #{buffer.length} rows"
        buffer.flatten.each do |row|
          # check to see if this row's compound key constraint already exists
          # note that the compound key constraint may not utilize virtual fields
          next unless row_allowed?(row)
          # add any virtual fields
          add_virtuals!(row)

          yaml = {}
          row.each do |key, value|
            next if only && !only.include?(key)
            next if except && except.include?(key)

            # Temporal values are stored in the db string format so they
            # round-trip consistently.
            case value
              when Date, Time, DateTime
                value = value.to_s(:db)
            end

            yaml[key] = value
          end

          # write the values
          YAML.dump(yaml, f)
        end
        f.flush
        buffer.clear
      end

      private
      # Get the open file stream
      def f
        @f ||= File.open(file, mode)
      end

      # Get the appropriate mode to open the file stream
      def mode
        append ? 'a' : 'w'
      end
    end
  end
end
module ETL #:nodoc:
  module Control #:nodoc:
    # ETL source. Subclasses must implement the <tt>each</tt> method.
    class Source
      include Enumerable

      # The control object
      attr_accessor :control

      # The configuration Hash
      attr_accessor :configuration

      # The definition Hash
      attr_accessor :definition

      # Returns true if the source data should be stored locally for archival
      # Default behavior will return true.
      attr_accessor :store_locally

      class << self
        # Convert the name to a Source class.
        #
        # For example if name is :database then this will return a
        # DatabaseSource class
        def class_for_name(name)
          ETL::Control.const_get("#{name.to_s.camelize}Source")
        end
      end

      # Initialize the Source instance
      # * <tt>control</tt>: The control object
      # * <tt>configuration</tt>: The configuration hash
      # * <tt>definition</tt>: The source layout definition
      #
      # Configuration options:
      # * <tt>:store_locally</tt>: Set to false to not store source data
      #   locally (defaults to true)
      def initialize(control, configuration, definition)
        @control = control
        @configuration = configuration
        @definition = definition

        # nil? check (rather than ||) so an explicit false is honored.
        @store_locally = configuration[:store_locally].nil? ? true : configuration[:store_locally]
      end

      # Get an array of errors that occur during reading from the source
      def errors
        @errors ||= []
      end

      # Get a timestamp value as a string
      def timestamp
        Engine.timestamp
      end

      # The base directory where local files are stored.
      attr_accessor :local_base

      # Get the local base, defaults to 'source_data'
      def local_base
        @local_base ||= 'source_data'
      end

      # The local directory for storing. This method must be overriden by
      # subclasses
      def local_directory
        raise "local_directory method is abstract"
      end

      # Return the local file for storing the raw source data. Each call to
      # this method will result in a timestamped file, so you cannot expect
      # to call it multiple times and reference the same file
      #
      # Optional sequence can be specified if there are multiple source files
      def local_file(sequence=nil)
        filename = timestamp.to_s
        filename += sequence.to_s if sequence

        local_dir = local_directory
        FileUtils.mkdir_p(local_dir)
        # FIX: was the literal "#(unknown).csv" (garbled interpolation) which
        # discarded the timestamped filename built above; restore the
        # YYYYMMDDHHMMSS[seq].csv naming documented for trigger files.
        File.join(local_dir, "#{filename}.csv")
      end

      # Get the last fully written local file
      def last_local_file
        File.join(local_directory, File.basename(last_local_file_trigger, '.trig'))
      end

      # Get the last local file trigger filename using timestamp in filenames.
      # Filename is in the format YYYYMMDDHHMMSS.csv.trig, but in the case of a
      # file source there is an unpadded sequence number before the file
      # extension. This code may not return the correct "last" file in that
      # case (in particular when there are 10 or more source files). However,
      # at this point only the database source calls the method, and it wouldn't
      # make sense for a file source to use it if multiple files are expected
      def last_local_file_trigger
        trig_files = []
        trig_ext = '.csv.trig'

        # Store the basename (without extension) of all files that end in the
        # desired extension
        Dir.glob(File.join(local_directory, "*" + trig_ext)) do |f|
          # Extract the basename of each file with the extension snipped off
          trig_files << File.basename(f, trig_ext) if File.file?(f)
        end

        # Throw an exception if no trigger files are available
        raise "Local cache trigger file not found" if trig_files.empty?

        # Sort trigger file strings and get the last one
        last_trig = trig_files.sort.last

        # Return the file path including extension
        File.join(local_directory, last_trig + trig_ext)
      end

      # Get the local trigger file that is used to indicate that the file has
      # been completely written
      def local_file_trigger(file)
        Pathname.new(file.to_s + '.trig')
      end

      # Return true if the source should read locally.
      def read_locally
        Engine.read_locally
      end

    end
  end
end

Dir[File.dirname(__FILE__) + "/source/*.rb"].each { |file| require(file) }
require 'fileutils'

module ETL #:nodoc:
  class Source < ::ActiveRecord::Base #:nodoc:
    # Connection for database sources
  end

  module Control #:nodoc:
    # Source object which extracts data from a database using ActiveRecord.
    class DatabaseSource < Source
      attr_accessor :target
      attr_accessor :table

      # Initialize the source.
      #
      # Arguments:
      # * <tt>control</tt>: The ETL::Control::Control instance
      # * <tt>configuration</tt>: The configuration Hash
      # * <tt>definition</tt>: The source definition
      #
      # Required configuration options:
      # * <tt>:target</tt>: The target connection
      # * <tt>:table</tt>: The source table name
      # * <tt>:database</tt>: The database name
      #
      # Other options:
      # * <tt>:join</tt>: Optional join part for the query (ignored unless
      #   specified)
      # * <tt>:select</tt>: Optional select part for the query (defaults to
      #   '*')
      # * <tt>:group</tt>: Optional group by part for the query (ignored
      #   unless specified)
      # * <tt>:order</tt>: Optional order part for the query (ignored unless
      #   specified)
      # * <tt>:new_records_only</tt>: Specify the column to use when comparing
      #   timestamps against the last successful ETL job execution for the
      #   current control file.
      # * <tt>:store_locally</tt>: Set to false to not store a copy of the
      #   source data locally in a flat file (defaults to true)
      def initialize(control, configuration, definition)
        super
        @target = configuration[:target]
        @table = configuration[:table]
        # An explicit :query overrides the query built from the options above.
        @query = configuration[:query]
      end

      # Get a String identifier for the source
      def to_s
        "#{host}/#{database}/#{@table}"
      end

      # Get the local directory to use, which is a combination of the
      # local_base, the db hostname the db database name and the db table.
      def local_directory
        File.join(local_base, to_s)
      end

      # Get the join part of the query, defaults to nil
      def join
        configuration[:join]
      end

      # Get the select part of the query, defaults to '*'
      def select
        configuration[:select] || '*'
      end

      # Get the group by part of the query, defaults to nil
      def group
        configuration[:group]
      end

      # Get the order for the query, defaults to nil
      def order
        configuration[:order]
      end

      # Return the column which is used for in the where clause to identify
      # new rows
      def new_records_only
        configuration[:new_records_only]
      end

      # Get the number of rows in the source
      def count(use_cache=true)
        return @count if @count && use_cache
        if @store_locally || read_locally
          @count = count_locally
        else
          @count = connection.select_value(query.gsub(/SELECT .* FROM/, 'SELECT count(1) FROM'))
        end
      end

      # Get the list of columns to read. This is defined in the source
      # definition as either an Array or Hash
      def columns
        # weird default is required for writing to cache correctly
        @columns ||= query_rows.any? ? query_rows.first.keys : ['']
      end

      # Returns each row from the source. If read_locally is specified then
      # this method will attempt to read from the last stored local file.
      # If no locally stored file exists or if the trigger file for the last
      # locally stored file does not exist then this method will raise an
      # error.
      def each(&block)
        if read_locally # Read from the last stored source
          ETL::Engine.logger.debug "Reading from local cache"
          read_rows(last_local_file, &block)
        else # Read from the original source
          if @store_locally
            file = local_file
            write_local(file)
            read_rows(file, &block)
          else
            query_rows.each do |r|
              row = ETL::Row.new()
              r.symbolize_keys.each_pair { |key, value|
                row[key] = value
              }
              row.source = self
              yield row
            end
          end
        end
      end

      private
      # Read rows from the local cache
      def read_rows(file)
        # FIX: File.exists? was removed in Ruby 3.2; File.exist? is the
        # long-standing equivalent and works on all supported versions.
        raise "Local cache file not found" unless File.exist?(file)
        raise "Local cache trigger file not found" unless File.exist?(local_file_trigger(file))

        t = Benchmark.realtime do
          CSV.open(file, :headers => true).each do |row|
            result_row = ETL::Row.new
            result_row.source = self
            row.each do |header, field|
              result_row[header.to_sym] = field
            end
            yield result_row
          end
        end
        ETL::Engine.average_rows_per_second = ETL::Engine.rows_read / t
      end

      # Count the lines of the last locally cached file.
      def count_locally
        counter = 0
        # FIX: File.open without a block leaked the file handle; File.foreach
        # reads line by line and closes the file automatically.
        File.foreach(last_local_file) { |line| counter += 1 }
        counter
      end

      # Write rows to the local cache
      def write_local(file)
        lines = 0
        t = Benchmark.realtime do
          CSV.open(file, 'w') do |f|
            f << columns
            query_rows.each do |row|
              f << columns.collect { |column| row[column.to_s] }
              lines += 1
            end
          end
          # An empty trigger file marks the cache file as fully written.
          File.open(local_file_trigger(file), 'w') {|f| }
        end
        ETL::Engine.logger.info "Stored locally in #{t}s (avg: #{lines/t} lines/sec)"
      end

      # Get the query to use
      def query
        return @query if @query
        q = "SELECT #{select} FROM #{@table}"
        q << " #{join}" if join

        conditions = []
        if new_records_only
          last_completed = ETL::Execution::Job.maximum('created_at',
            :conditions => ['control_file = ? and completed_at is not null', control.file]
          )
          if last_completed
            conditions << "#{new_records_only} > #{connection.quote(last_completed.to_s(:db))}"
          end
        end

        conditions << configuration[:conditions] if configuration[:conditions]
        if conditions.length > 0
          q << " WHERE #{conditions.join(' AND ')}"
        end

        q << " GROUP BY #{group}" if group
        q << " ORDER BY #{order}" if order

        if ETL::Engine.limit || ETL::Engine.offset
          options = {}
          options[:limit] = ETL::Engine.limit if ETL::Engine.limit
          options[:offset] = ETL::Engine.offset if ETL::Engine.offset
          connection.add_limit_offset!(q, options)
        end

        q = q.gsub(/\n/,' ')
        ETL::Engine.logger.info "Query: #{q}"
        @query = q
      end

      # Memoized result set for the built query.
      def query_rows
        @query_rows ||= connection.select_all(query)
      end

      # Get the database connection to use
      def connection
        ETL::Engine.connection(target)
      end

      # Get the host, defaults to 'localhost'
      def host
        ETL::Base.configurations[target.to_s]['host'] || 'localhost'
      end

      # Get the configured database name for the target connection.
      def database
        ETL::Base.configurations[target.to_s]['database']
      end
    end
  end
end