factorylabs-activewarehouse-etl 0.9.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +198 -0
- data/LICENSE +7 -0
- data/README +85 -0
- data/Rakefile +153 -0
- data/TODO +28 -0
- data/bin/etl +28 -0
- data/bin/etl.cmd +8 -0
- data/examples/database.example.yml +16 -0
- data/lib/etl.rb +78 -0
- data/lib/etl/batch.rb +2 -0
- data/lib/etl/batch/batch.rb +111 -0
- data/lib/etl/batch/directives.rb +55 -0
- data/lib/etl/builder.rb +2 -0
- data/lib/etl/builder/date_dimension_builder.rb +96 -0
- data/lib/etl/builder/time_dimension_builder.rb +31 -0
- data/lib/etl/commands/etl.rb +89 -0
- data/lib/etl/control.rb +3 -0
- data/lib/etl/control/control.rb +405 -0
- data/lib/etl/control/destination.rb +420 -0
- data/lib/etl/control/destination/database_destination.rb +95 -0
- data/lib/etl/control/destination/file_destination.rb +124 -0
- data/lib/etl/control/source.rb +109 -0
- data/lib/etl/control/source/database_source.rb +220 -0
- data/lib/etl/control/source/enumerable_source.rb +11 -0
- data/lib/etl/control/source/file_source.rb +90 -0
- data/lib/etl/control/source/model_source.rb +39 -0
- data/lib/etl/core_ext.rb +1 -0
- data/lib/etl/core_ext/time.rb +5 -0
- data/lib/etl/core_ext/time/calculations.rb +42 -0
- data/lib/etl/engine.rb +556 -0
- data/lib/etl/execution.rb +20 -0
- data/lib/etl/execution/base.rb +9 -0
- data/lib/etl/execution/batch.rb +8 -0
- data/lib/etl/execution/job.rb +8 -0
- data/lib/etl/execution/migration.rb +85 -0
- data/lib/etl/execution/record.rb +18 -0
- data/lib/etl/generator.rb +2 -0
- data/lib/etl/generator/generator.rb +20 -0
- data/lib/etl/generator/surrogate_key_generator.rb +39 -0
- data/lib/etl/http_tools.rb +139 -0
- data/lib/etl/parser.rb +11 -0
- data/lib/etl/parser/apache_combined_log_parser.rb +49 -0
- data/lib/etl/parser/delimited_parser.rb +74 -0
- data/lib/etl/parser/fixed_width_parser.rb +65 -0
- data/lib/etl/parser/parser.rb +41 -0
- data/lib/etl/parser/sax_parser.rb +218 -0
- data/lib/etl/parser/xml_parser.rb +65 -0
- data/lib/etl/processor.rb +11 -0
- data/lib/etl/processor/block_processor.rb +14 -0
- data/lib/etl/processor/bulk_import_processor.rb +81 -0
- data/lib/etl/processor/check_exist_processor.rb +80 -0
- data/lib/etl/processor/check_unique_processor.rb +35 -0
- data/lib/etl/processor/copy_field_processor.rb +26 -0
- data/lib/etl/processor/encode_processor.rb +55 -0
- data/lib/etl/processor/hierarchy_exploder_processor.rb +55 -0
- data/lib/etl/processor/print_row_processor.rb +12 -0
- data/lib/etl/processor/processor.rb +25 -0
- data/lib/etl/processor/rename_processor.rb +24 -0
- data/lib/etl/processor/require_non_blank_processor.rb +26 -0
- data/lib/etl/processor/row_processor.rb +17 -0
- data/lib/etl/processor/sequence_processor.rb +23 -0
- data/lib/etl/processor/surrogate_key_processor.rb +53 -0
- data/lib/etl/processor/truncate_processor.rb +35 -0
- data/lib/etl/row.rb +20 -0
- data/lib/etl/screen.rb +14 -0
- data/lib/etl/screen/row_count_screen.rb +20 -0
- data/lib/etl/transform.rb +2 -0
- data/lib/etl/transform/block_transform.rb +13 -0
- data/lib/etl/transform/date_to_string_transform.rb +20 -0
- data/lib/etl/transform/decode_transform.rb +51 -0
- data/lib/etl/transform/default_transform.rb +20 -0
- data/lib/etl/transform/foreign_key_lookup_transform.rb +151 -0
- data/lib/etl/transform/hierarchy_lookup_transform.rb +49 -0
- data/lib/etl/transform/ordinalize_transform.rb +12 -0
- data/lib/etl/transform/sha1_transform.rb +13 -0
- data/lib/etl/transform/string_to_date_transform.rb +16 -0
- data/lib/etl/transform/string_to_datetime_transform.rb +14 -0
- data/lib/etl/transform/string_to_time_transform.rb +11 -0
- data/lib/etl/transform/transform.rb +61 -0
- data/lib/etl/transform/trim_transform.rb +26 -0
- data/lib/etl/transform/type_transform.rb +35 -0
- data/lib/etl/util.rb +59 -0
- data/lib/etl/version.rb +9 -0
- metadata +195 -0
@@ -0,0 +1,95 @@
|
|
1
|
+
module ETL #:nodoc:
|
2
|
+
module Control #:nodoc:
|
3
|
+
# Destination which writes directly to a database. This is useful when you are dealing with
|
4
|
+
# a small amount of data. For larger amounts of data you should probably use the bulk
|
5
|
+
# loader if it is supported with your target database as it will use a much faster load
|
6
|
+
# method.
|
7
|
+
class DatabaseDestination < Destination
  # Name of the connection that rows are written to
  attr_reader :target

  # Name of the destination table
  attr_reader :table

  # Ordered list of field names written for every row
  attr_reader :order

  # Truthy when the destination table is truncated before the first write
  attr_reader :truncate

  # Build a destination that inserts rows one at a time into a table.
  #
  # * <tt>control</tt>: The ETL::Control::Control instance
  # * <tt>configuration</tt>: The configuration Hash
  # * <tt>mapping</tt>: The mapping Hash
  #
  # Configuration options:
  # * <tt>:database</tt>: The database name (documented as REQUIRED; not
  #   read directly by this class)
  # * <tt>:target</tt>: The target connection (REQUIRED)
  # * <tt>:table</tt>: The table to write to (REQUIRED)
  # * <tt>:truncate</tt>: Set to true to truncate before writing (defaults to false)
  # * <tt>:unique</tt>: Set to true to only insert unique records (defaults to false)
  # * <tt>:append_rows</tt>: Array of rows to append
  #
  # Mapping options:
  # * <tt>:order</tt>: The order of fields to write (falls back to the
  #   order derived from the source)
  #
  # Raises ControlError when the order, table or target is missing.
  def initialize(control, configuration, mapping={})
    super
    @target = configuration[:target]
    @table = configuration[:table]
    # The default is written back into the configuration hash as well.
    configuration[:truncate] ||= false
    @truncate = configuration[:truncate]
    @unique = configuration[:unique]
    @order = mapping[:order] || order_from_source
    raise ControlError, "Order required in mapping" unless @order
    raise ControlError, "Table required" unless @table
    raise ControlError, "Target required" unless @target
  end

  # Write every buffered row to the table inside a single transaction and
  # then empty the buffer.
  def flush
    conn.transaction do
      buffer.flatten.each do |row|
        # Skip rows rejected by the compound key constraint. The check runs
        # before virtual fields are added, so it cannot depend on them.
        next unless row_allowed?(row)

        add_virtuals!(row)

        # TODO: backtick quoting is probably not database agnostic
        names = order.collect { |name| "`#{name}`" }
        values = order.collect { |name| conn.quote(row[name]) }
        q = "INSERT INTO `#{table_name}` (#{names.join(',')}) VALUES (#{values.join(',')})"
        ETL::Engine.logger.debug("Executing insert: #{q}")
        conn.insert(q, "Insert row #{current_row}")
        @current_row += 1
      end
      buffer.clear
    end
  end

  # Queue any configured append rows and flush the buffer.
  def close
    buffer << append_rows if append_rows
    flush
  end

  private

  # Lazily obtain the target connection. The table is truncated the first
  # time the connection is requested when truncation is enabled.
  def conn
    return @conn if @conn

    connection = ETL::Engine.connection(target)
    connection.truncate(table_name) if truncate
    @conn = connection
  end

  # Table name as resolved by the engine for the target connection.
  def table_name
    ETL::Engine.table(table, ETL::Engine.connection(target))
  end

end
|
94
|
+
end
|
95
|
+
end
|
@@ -0,0 +1,124 @@
|
|
1
|
+
# This source file contains the ETL::Control::FileDestination
|
2
|
+
|
3
|
+
module ETL #:nodoc:
|
4
|
+
module Control #:nodoc:
|
5
|
+
# File as the final destination.
|
6
|
+
class FileDestination < Destination
  # The path of the file to write to
  attr_reader :file

  # The output order
  attr_reader :order

  # Flag which indicates to append (default is to overwrite)
  attr_accessor :append

  # The separator
  attr_accessor :separator

  # The end of line marker
  attr_accessor :eol

  # The enclosure character
  attr_accessor :enclose

  # Initialize the object.
  # * <tt>control</tt>: The Control object
  # * <tt>configuration</tt>: The configuration map
  # * <tt>mapping</tt>: The output mapping
  #
  # Configuration options:
  # * <tt>:file</tt>: The file to write to, relative to the directory of
  #   the control file (REQUIRED)
  # * <tt>:append</tt>: Set to true to append to the file (default is to overwrite)
  # * <tt>:separator</tt>: Record separator (default is a comma)
  # * <tt>:eol</tt>: End of line marker (default is \n)
  # * <tt>:enclose</tt>: Enclosure character (default is none)
  # * <tt>:unique</tt>: Set to true to only write unique records
  # * <tt>:append_rows</tt>: Array of rows to append
  #
  # Mapping options:
  # * <tt>:order</tt>: The order array (falls back to the order derived
  #   from the source)
  #
  # Raises ControlError when the file or the order is missing.
  def initialize(control, configuration, mapping={})
    super
    # Fail with a ControlError, like the other destinations do for their
    # required options, instead of a TypeError from File.join.
    raise ControlError, "File required" unless configuration[:file]
    @file = File.join(File.dirname(control.file), configuration[:file])
    # The defaults below are written back into the configuration hash.
    @append = configuration[:append] ||= false
    @separator = configuration[:separator] ||= ','
    @eol = configuration[:eol] ||= "\n"
    @enclose = configuration[:enclose]
    @unique = configuration[:unique]

    @order = mapping[:order] || order_from_source
    raise ControlError, "Order required in mapping" unless @order
  end

  # Close the destination. This will flush the buffer and close the
  # underlying file stream.
  def close
    buffer << append_rows if append_rows
    flush
    f.close
  end

  # Flush the destination buffer: write one line per allowed row, flush
  # the stream and empty the buffer.
  def flush
    buffer.flatten.each do |row|
      # check to see if this row's compound key constraint already exists
      # note that the compound key constraint may not utilize virtual fields
      next unless row_allowed?(row)

      # add any virtual fields
      add_virtuals!(row)

      # collect all of the values using the order designated in the configuration
      values = order.collect { |name| escape_value(format_value(row[name])) }

      # enclose the value if required
      values.collect! { |v| enclose_value(v) } unless enclose.nil?

      # write the values joined by the separator, then the end-of-line
      f.write(values.join(separator))
      f.write(eol)
    end
    f.flush
    buffer.clear
  end

  private

  # Convert a field value to its textual form; date and time values use
  # the :db format.
  def format_value(value)
    case value
    when Date, Time, DateTime
      value.to_s(:db)
    else
      value.to_s
    end
  end

  # Double every backslash, prefix every separator with a backslash and
  # strip CR/LF characters. Block-form gsub is used so the replacement is
  # taken literally (a separator such as "&" would otherwise be expanded
  # as the back-reference \&).
  def escape_value(value)
    escaped = value.gsub(/\\/) { '\\\\' }
    escaped = escaped.gsub(separator) { "\\#{separator}" }
    escaped.gsub(/\n|\r/, '')
  end

  # Wrap the value in the enclosure string, backslash-escaping embedded
  # occurrences. The enclosure is matched as a literal string so that
  # regex metacharacters (e.g. "|") behave correctly.
  def enclose_value(value)
    enclose + value.gsub(enclose) { |match| "\\#{match}" } + enclose
  end

  # Get the open file stream
  def f
    @f ||= open(file, mode)
  end

  # CSV-style options derived from the separator, eol and enclosure
  # settings. Not referenced by the methods in this class.
  def options
    @options ||= {
      :col_sep => separator,
      :row_sep => eol,
      :force_quotes => !enclose.nil?
    }
  end

  # Get the appropriate mode to open the file stream
  def mode
    append ? 'a' : 'w'
  end
end
|
123
|
+
end
|
124
|
+
end
|
@@ -0,0 +1,109 @@
|
|
1
|
+
module ETL #:nodoc:
|
2
|
+
module Control #:nodoc:
|
3
|
+
# ETL source. Subclasses must implement the <tt>each</tt> method.
|
4
|
+
class Source
  include Enumerable

  # The control object
  attr_accessor :control

  # The configuration Hash
  attr_accessor :configuration

  # The definition Hash
  attr_accessor :definition

  # True if the source data should be stored locally for archival.
  # Defaults to true.
  attr_accessor :store_locally

  class << self
    # Convert the name to a Source class.
    #
    # For example if name is :database then this will return a
    # DatabaseSource class
    def class_for_name(name)
      ETL::Control.const_get("#{name.to_s.camelize}Source")
    end
  end

  # Initialize the Source instance
  # * <tt>control</tt>: The control object
  # * <tt>configuration</tt>: The configuration hash
  # * <tt>definition</tt>: The source layout definition
  #
  # Configuration options:
  # * <tt>:store_locally</tt>: Set to false to not store source data
  #   locally (defaults to true)
  def initialize(control, configuration, definition)
    @control = control
    @configuration = configuration
    @definition = definition

    # Only fall back to true when the option is absent; `value || true`
    # would silently turn an explicit false into true.
    setting = configuration[:store_locally]
    @store_locally = setting.nil? ? true : setting
  end

  # Get an array of errors that occur during reading from the source
  def errors
    @errors ||= []
  end

  # Get a timestamp value as a string
  def timestamp
    Engine.timestamp
  end

  # Set the base directory where local files are stored.
  attr_writer :local_base

  # Get the local base, defaults to 'source_data'
  def local_base
    @local_base ||= 'source_data'
  end

  # The local directory for storing. This method must be overridden by
  # subclasses
  def local_directory
    raise "local_directory method is abstract"
  end

  # Return the local file for storing the raw source data. Each call to
  # this method will result in a timestamped file, so you cannot expect
  # to call it multiple times and reference the same file. The local
  # directory is created if it does not exist.
  #
  # Optional sequence can be specified if there are multiple source files
  def local_file(sequence=nil)
    filename = timestamp.to_s
    filename += sequence.to_s if sequence

    local_dir = local_directory
    FileUtils.mkdir_p(local_dir)
    File.join(local_dir, "#{filename}.csv")
  end

  # Get the last fully written local file, i.e. the data file belonging
  # to the last trigger file. Raises when no trigger file exists.
  def last_local_file
    trigger = last_local_file_trigger
    raise "No local cache trigger file found in #{local_directory}" unless trigger
    File.join(local_directory, File.basename(trigger, '.trig'))
  end

  # Get the last local file trigger, or nil when there is none. The glob
  # result is sorted explicitly because its order is not guaranteed;
  # "last" therefore means last by file name (presumably the newest
  # timestamp -- verify against Engine.timestamp's format).
  def last_local_file_trigger
    Dir.glob(File.join(local_directory, '*.trig')).sort.last
  end

  # Get the local trigger file that is used to indicate that the file has
  # been completely written
  def local_file_trigger(file)
    Pathname.new(file.to_s + '.trig')
  end

  # Return true if the source should read locally.
  def read_locally
    Engine.read_locally
  end

end
|
106
|
+
end
|
107
|
+
end
|
108
|
+
|
109
|
+
# Load every source implementation shipped in the source/ directory. The
# list is sorted so the load order does not depend on the platform's glob order.
Dir[File.dirname(__FILE__) + "/source/*.rb"].sort.each { |file| require(file) }
|
@@ -0,0 +1,220 @@
|
|
1
|
+
require 'fileutils'
|
2
|
+
|
3
|
+
module ETL #:nodoc:
|
4
|
+
class Source < ::ActiveRecord::Base #:nodoc:
|
5
|
+
# Empty ActiveRecord model; presumably it exists only to hold the connection for database sources (confirm against ETL::Engine).
|
6
|
+
end
|
7
|
+
|
8
|
+
module Control #:nodoc:
|
9
|
+
# Source object which extracts data from a database using ActiveRecord.
|
10
|
+
class DatabaseSource < Source
  # Name of the connection to read from
  attr_accessor :target
  # Name of the source table
  attr_accessor :table

  # Initialize the source.
  #
  # Arguments:
  # * <tt>control</tt>: The ETL::Control::Control instance
  # * <tt>configuration</tt>: The configuration Hash
  # * <tt>definition</tt>: The source definition
  #
  # Required configuration options:
  # * <tt>:target</tt>: The target connection
  # * <tt>:table</tt>: The source table name
  # * <tt>:database</tt>: The database name
  #
  # Other options:
  # * <tt>:join</tt>: Optional join part for the query (ignored unless
  #   specified)
  # * <tt>:select</tt>: Optional select part for the query (defaults to
  #   '*')
  # * <tt>:conditions</tt>: Optional condition added to the where clause
  # * <tt>:group</tt>: Optional group by part for the query (ignored
  #   unless specified)
  # * <tt>:order</tt>: Optional order part for the query (ignored unless
  #   specified)
  # * <tt>:new_records_only</tt>: Specify the column to use when comparing
  #   timestamps against the last successful ETL job execution for the
  #   current control file.
  # * <tt>:store_locally</tt>: Set to false to not store a copy of the
  #   source data locally in a flat file (defaults to true)
  def initialize(control, configuration, definition)
    super
    @target = configuration[:target]
    @table = configuration[:table]
  end

  # Get a String identifier for the source
  def to_s
    "#{host}/#{database}/#{table}"
  end

  # Get the local directory to use, which is a combination of the
  # local_base, the db hostname the db database name and the db table.
  def local_directory
    File.join(local_base, host, database, configuration[:table])
  end

  # Get the join part of the query, defaults to nil
  def join
    configuration[:join]
  end

  # Get the select part of the query, defaults to '*'
  def select
    configuration[:select] || '*'
  end

  # Get the group by part of the query, defaults to nil
  def group
    configuration[:group]
  end

  # Get the order for the query, defaults to nil
  def order
    configuration[:order]
  end

  # Return the column which is used in the where clause to identify
  # new rows
  def new_records_only
    configuration[:new_records_only]
  end

  # Get the number of rows in the source. The value is cached; pass
  # false to force a recount. When data is stored or read locally the
  # count comes from the last local file, otherwise from a count query.
  def count(use_cache=true)
    return @count if @count && use_cache
    if store_locally || read_locally
      @count = count_locally
    else
      @count = connection.select_value(query.gsub(/SELECT .* FROM/, 'SELECT count(1) FROM'))
    end
  end

  # Get the list of columns to read, taken from the keys of the first
  # query result row.
  def columns
    # weird default is required for writing to cache correctly
    @columns ||= query_rows.any? ? query_rows.first.keys : ['']
  end

  # Returns each row from the source. If read_locally is specified then
  # this method will attempt to read from the last stored local file.
  # If no locally stored file exists or if the trigger file for the last
  # locally stored file does not exist then this method will raise an
  # error.
  def each(&block)
    if read_locally # Read from the last stored source
      ETL::Engine.logger.debug "Reading from local cache"
      read_rows(last_local_file, &block)
    else # Read from the original source
      if store_locally
        file = local_file
        write_local(file)
        read_rows(file, &block)
      else
        query_rows.each do |row|
          row = ETL::Row.new(row.symbolize_keys)
          row.source = self
          yield row
        end
      end
    end
  end

  private
  # Read rows from the local cache file, yielding an ETL::Row per CSV
  # record, and update the engine's average rows-per-second figure.
  # Raises when the file or its trigger file is missing.
  def read_rows(file)
    raise "Local cache file not found" unless File.exist?(file)
    raise "Local cache trigger file not found" unless File.exist?(local_file_trigger(file))

    t = Benchmark.realtime do
      # foreach closes the file once iteration finishes (or raises)
      FasterCSV.foreach(file, :headers => true) do |row|
        result_row = ETL::Row.new
        result_row.source = self
        row.each do |header, field|
          result_row[header.to_sym] = field
        end
        yield result_row
      end
    end
    ETL::Engine.average_rows_per_second = ETL::Engine.rows_read / t
  end

  # Count the lines of the last local file.
  # NOTE(review): every line is counted, so the CSV header line is
  # included in the result -- confirm whether callers expect that.
  def count_locally
    counter = 0
    # File.foreach closes the file when done
    File.foreach(last_local_file) { |line| counter += 1 }
    counter
  end

  # Write rows to the local cache: a header line with the column names,
  # one CSV line per query row, and finally an empty trigger file that
  # marks the cache file as completely written.
  def write_local(file)
    lines = 0
    t = Benchmark.realtime do
      FasterCSV.open(file, 'w') do |f|
        f << columns
        query_rows.each do |row|
          f << columns.collect { |column| row[column.to_s] }
          lines += 1
        end
      end
      File.open(local_file_trigger(file), 'w') {|f| }
    end
    ETL::Engine.logger.info "Stored locally in #{t}s (avg: #{lines/t} lines/sec)"
  end

  # Get the query to use. The SQL is assembled once from the
  # configuration and cached.
  def query
    return @query if @query
    q = "SELECT #{select} FROM #{configuration[:table]}"
    q << " #{join}" if join

    conditions = []
    if new_records_only
      # Only fetch rows newer than the last completed job for this control file
      last_completed = ETL::Execution::Job.maximum('created_at',
        :conditions => ['control_file = ? and completed_at is not null', control.file]
      )
      if last_completed
        conditions << "#{new_records_only} > #{connection.quote(last_completed.to_s(:db))}"
      end
    end

    conditions << configuration[:conditions] if configuration[:conditions]
    if conditions.length > 0
      q << " WHERE #{conditions.join(' AND ')}"
    end

    q << " GROUP BY #{group}" if group
    q << " ORDER BY #{order}" if order

    if ETL::Engine.limit || ETL::Engine.offset
      options = {}
      options[:limit] = ETL::Engine.limit if ETL::Engine.limit
      options[:offset] = ETL::Engine.offset if ETL::Engine.offset
      connection.add_limit_offset!(q, options)
    end

    q = q.gsub(/\n/,' ')
    ETL::Engine.logger.info "Query: #{q}"
    @query = q
  end

  # Run the query once and cache the result rows
  def query_rows
    @query_rows ||= connection.select_all(query)
  end

  # Get the database connection to use
  def connection
    ETL::Engine.connection(target)
  end

  # Get the host, defaults to 'localhost'
  def host
    ETL::Base.configurations[target.to_s]['host'] || 'localhost'
  end

  # Get the database name configured for the target connection
  def database
    ETL::Base.configurations[target.to_s]['database']
  end
end
|
219
|
+
end
|
220
|
+
end
|