colincasey-activewarehouse-etl 0.9.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (86) hide show
  1. data/CHANGELOG +198 -0
  2. data/LICENSE +7 -0
  3. data/README +85 -0
  4. data/Rakefile +75 -0
  5. data/TODO +28 -0
  6. data/VERSION.yml +4 -0
  7. data/bin/etl +28 -0
  8. data/bin/etl.cmd +8 -0
  9. data/lib/etl.rb +81 -0
  10. data/lib/etl/batch.rb +2 -0
  11. data/lib/etl/batch/batch.rb +111 -0
  12. data/lib/etl/batch/directives.rb +55 -0
  13. data/lib/etl/builder.rb +2 -0
  14. data/lib/etl/builder/date_dimension_builder.rb +96 -0
  15. data/lib/etl/builder/time_dimension_builder.rb +31 -0
  16. data/lib/etl/commands/etl.rb +89 -0
  17. data/lib/etl/control.rb +3 -0
  18. data/lib/etl/control/control.rb +414 -0
  19. data/lib/etl/control/destination.rb +420 -0
  20. data/lib/etl/control/destination/csv_destination.rb +84 -0
  21. data/lib/etl/control/destination/database_destination.rb +95 -0
  22. data/lib/etl/control/destination/file_destination.rb +124 -0
  23. data/lib/etl/control/destination/yaml_destination.rb +74 -0
  24. data/lib/etl/control/source.rb +109 -0
  25. data/lib/etl/control/source/database_source.rb +220 -0
  26. data/lib/etl/control/source/enumerable_source.rb +11 -0
  27. data/lib/etl/control/source/file_source.rb +90 -0
  28. data/lib/etl/control/source/model_source.rb +39 -0
  29. data/lib/etl/core_ext.rb +1 -0
  30. data/lib/etl/core_ext/time.rb +5 -0
  31. data/lib/etl/core_ext/time/calculations.rb +42 -0
  32. data/lib/etl/engine.rb +574 -0
  33. data/lib/etl/execution.rb +20 -0
  34. data/lib/etl/execution/base.rb +9 -0
  35. data/lib/etl/execution/batch.rb +8 -0
  36. data/lib/etl/execution/job.rb +8 -0
  37. data/lib/etl/execution/migration.rb +85 -0
  38. data/lib/etl/generator.rb +2 -0
  39. data/lib/etl/generator/generator.rb +20 -0
  40. data/lib/etl/generator/surrogate_key_generator.rb +39 -0
  41. data/lib/etl/http_tools.rb +139 -0
  42. data/lib/etl/parser.rb +11 -0
  43. data/lib/etl/parser/apache_combined_log_parser.rb +49 -0
  44. data/lib/etl/parser/delimited_parser.rb +74 -0
  45. data/lib/etl/parser/fixed_width_parser.rb +65 -0
  46. data/lib/etl/parser/parser.rb +41 -0
  47. data/lib/etl/parser/sax_parser.rb +218 -0
  48. data/lib/etl/parser/spreadsheet_parser.rb +114 -0
  49. data/lib/etl/parser/xml_parser.rb +65 -0
  50. data/lib/etl/processor.rb +11 -0
  51. data/lib/etl/processor/block_processor.rb +14 -0
  52. data/lib/etl/processor/bulk_import_processor.rb +81 -0
  53. data/lib/etl/processor/check_exist_processor.rb +80 -0
  54. data/lib/etl/processor/check_unique_processor.rb +35 -0
  55. data/lib/etl/processor/copy_field_processor.rb +26 -0
  56. data/lib/etl/processor/encode_processor.rb +55 -0
  57. data/lib/etl/processor/hierarchy_exploder_processor.rb +55 -0
  58. data/lib/etl/processor/print_row_processor.rb +12 -0
  59. data/lib/etl/processor/processor.rb +25 -0
  60. data/lib/etl/processor/rename_processor.rb +24 -0
  61. data/lib/etl/processor/require_non_blank_processor.rb +26 -0
  62. data/lib/etl/processor/row_processor.rb +17 -0
  63. data/lib/etl/processor/sequence_processor.rb +23 -0
  64. data/lib/etl/processor/surrogate_key_processor.rb +53 -0
  65. data/lib/etl/processor/truncate_processor.rb +35 -0
  66. data/lib/etl/row.rb +20 -0
  67. data/lib/etl/screen.rb +14 -0
  68. data/lib/etl/screen/row_count_screen.rb +20 -0
  69. data/lib/etl/transform.rb +2 -0
  70. data/lib/etl/transform/block_transform.rb +13 -0
  71. data/lib/etl/transform/date_to_string_transform.rb +20 -0
  72. data/lib/etl/transform/decode_transform.rb +51 -0
  73. data/lib/etl/transform/default_transform.rb +20 -0
  74. data/lib/etl/transform/foreign_key_lookup_transform.rb +151 -0
  75. data/lib/etl/transform/hierarchy_lookup_transform.rb +49 -0
  76. data/lib/etl/transform/ordinalize_transform.rb +12 -0
  77. data/lib/etl/transform/sha1_transform.rb +13 -0
  78. data/lib/etl/transform/string_to_date_transform.rb +16 -0
  79. data/lib/etl/transform/string_to_datetime_transform.rb +14 -0
  80. data/lib/etl/transform/string_to_time_transform.rb +11 -0
  81. data/lib/etl/transform/transform.rb +61 -0
  82. data/lib/etl/transform/trim_transform.rb +26 -0
  83. data/lib/etl/transform/type_transform.rb +35 -0
  84. data/lib/etl/util.rb +59 -0
  85. data/lib/etl/version.rb +10 -0
  86. metadata +224 -0
@@ -0,0 +1,95 @@
1
module ETL #:nodoc:
  module Control #:nodoc:
    # Destination which writes directly to a database, one INSERT statement
    # per row. This is useful when you are dealing with a small amount of
    # data. For larger amounts of data you should probably use the bulk
    # loader if it is supported by your target database, as it will use a
    # much faster load method.
    class DatabaseDestination < Destination
      # The target connection
      attr_reader :target

      # The table
      attr_reader :table

      # The order of the fields to write (from the mapping, or derived from
      # the source when the mapping does not specify it)
      attr_reader :order

      # Set to true to truncate the destination table first
      attr_reader :truncate

      # Initialize the database destination
      #
      # * <tt>control</tt>: The ETL::Control::Control instance
      # * <tt>configuration</tt>: The configuration Hash
      # * <tt>mapping</tt>: The mapping
      #
      # Configuration options:
      # * <tt>:database</tt>: The database name (documented as REQUIRED, but
      #   not read by this class)
      # * <tt>:target</tt>: The target connection (REQUIRED)
      # * <tt>:table</tt>: The table to write to (REQUIRED)
      # * <tt>:truncate</tt>: Set to true to truncate before writing (defaults to false)
      # * <tt>:unique</tt>: Set to true to only insert unique records (defaults to false)
      # * <tt>:append_rows</tt>: Array of rows to append
      #
      # Mapping options:
      # * <tt>:order</tt>: The order of fields to write (REQUIRED unless it
      #   can be derived via <tt>order_from_source</tt>)
      #
      # Raises ControlError when the order, table or target is missing.
      def initialize(control, configuration, mapping={})
        super
        @target = configuration[:target]
        @table = configuration[:table]
        # The default is written back into the configuration hash, exactly as
        # before, so later readers of the hash see an explicit false.
        @truncate = configuration[:truncate] ||= false
        @unique = configuration[:unique]
        @order = mapping[:order] || order_from_source
        raise ControlError, "Order required in mapping" unless @order
        raise ControlError, "Table required" unless @table
        raise ControlError, "Target required" unless @target
      end

      # Flush the currently buffered data. Every buffered row is inserted
      # inside a single transaction and the buffer is emptied afterwards.
      def flush
        conn.transaction do
          buffer.flatten.each do |row|
            # check to see if this row's compound key constraint already exists
            # note that the compound key constraint may not utilize virtual fields
            next unless row_allowed?(row)

            # add any virtual fields
            add_virtuals!(row)

            # Identifiers are quoted by the connection adapter rather than
            # with hard-coded MySQL backticks, so the generated statement is
            # valid on whichever database the target connection points at.
            names = order.collect { |name| conn.quote_column_name(name) }
            values = order.collect { |name| conn.quote(row[name]) }

            q = "INSERT INTO #{conn.quote_table_name(table_name)} (#{names.join(',')}) VALUES (#{values.join(',')})"
            ETL::Engine.logger.debug("Executing insert: #{q}")
            conn.insert(q, "Insert row #{current_row}")
            @current_row += 1
          end
          buffer.clear
        end
      end

      # Close the destination: queue any configured append rows and flush.
      def close
        buffer << append_rows if append_rows
        flush
      end

      private

      # Get the (memoized) target connection. The destination table is
      # truncated the first time the connection is obtained when the
      # :truncate option is set.
      def conn
        @conn ||= begin
          conn = ETL::Engine.connection(target)
          conn.truncate(table_name) if truncate
          conn
        end
      end

      # Resolve the table name to write to through ETL::Engine.table.
      def table_name
        ETL::Engine.table(table, ETL::Engine.connection(target))
      end

    end
  end
end
@@ -0,0 +1,124 @@
1
+ # This source file contains the ETL::Control::FileDestination
2
+
3
module ETL #:nodoc:
  module Control #:nodoc:
    # File as the final destination. Rows are written as delimited text, one
    # row per line, with the fields in the configured order.
    class FileDestination < Destination
      # The File to write to
      attr_reader :file

      # The output order
      attr_reader :order

      # Flag which indicates to append (default is to overwrite)
      attr_accessor :append

      # The separator
      attr_accessor :separator

      # The end of line marker
      attr_accessor :eol

      # The enclosure character
      attr_accessor :enclose

      # Initialize the object.
      # * <tt>control</tt>: The Control object
      # * <tt>configuration</tt>: The configuration map
      # * <tt>mapping</tt>: The output mapping
      #
      # Configuration options:
      # * <tt>:file</tt>: The file to write to, relative to the directory of
      #   the control file (REQUIRED)
      # * <tt>:append</tt>: Set to true to append to the file (default is to overwrite)
      # * <tt>:separator</tt>: Record separator (default is a comma)
      # * <tt>:eol</tt>: End of line marker (default is \n)
      # * <tt>:enclose</tt>: Enclosure character (default is none)
      # * <tt>:unique</tt>: Set to true to only write unique records
      # * <tt>:append_rows</tt>: Array of rows to append
      #
      # Mapping options:
      # * <tt>:order</tt>: The order array
      #
      # Raises ControlError when no order is given and none can be derived
      # from the source.
      def initialize(control, configuration, mapping={})
        super
        @file = File.join(File.dirname(control.file), configuration[:file])
        # Defaults are written back into the configuration hash, as before.
        @append = configuration[:append] ||= false
        @separator = configuration[:separator] ||= ','
        @eol = configuration[:eol] ||= "\n"
        @enclose = configuration[:enclose]
        @unique = configuration[:unique]

        @order = mapping[:order] || order_from_source
        raise ControlError, "Order required in mapping" unless @order
      end

      # Close the destination. This will flush the buffer and close the underlying stream or connection.
      def close
        buffer << append_rows if append_rows
        flush
        f.close
      end

      # Flush the destination buffer: write every allowed row to the file,
      # then flush the stream and empty the buffer.
      def flush
        # The enclosure is escaped before being compiled into a pattern so
        # that characters such as | * ( or . are matched literally instead of
        # being interpreted as regular expression syntax.
        enclosure_pattern = enclose.nil? ? nil : /(#{Regexp.escape(enclose)})/

        buffer.flatten.each do |row|
          # check to see if this row's compound key constraint already exists
          # note that the compound key constraint may not utilize virtual fields
          next unless row_allowed?(row)

          # add any virtual fields
          add_virtuals!(row)

          # collect all of the values using the order designated in the configuration
          values = order.collect do |name|
            value = row[name]
            text = case value
                   when Date, Time, DateTime
                     value.to_s(:db)
                   else
                     value.to_s
                   end

            # double every backslash, backslash-escape the separator and
            # strip line breaks so that each row stays on a single line
            text = text.gsub(/\\/, '\\\\\\\\')
            text = text.gsub(separator, "\\#{separator}")
            text = text.gsub(/\n|\r/, '')

            # enclose the value if required, backslash-escaping any
            # enclosure character found inside the value itself
            if enclosure_pattern
              text = enclose + text.gsub(enclosure_pattern, '\\\\\1') + enclose
            end
            text
          end

          # write the values joined by the separator defined in the configuration
          f.write(values.join(separator))

          # write the end-of-line
          f.write(eol)
        end
        f.flush
        buffer.clear
      end

      private

      # Get the open file stream. File.open is used instead of Kernel#open so
      # that a configured path beginning with "|" can never be run as a
      # command.
      def f
        @f ||= File.open(file, mode)
      end

      # CSV-style writer options derived from the separator, end-of-line
      # marker and enclosure settings. Not referenced by the methods of this
      # class.
      def options
        @options ||= {
          :col_sep => separator,
          :row_sep => eol,
          :force_quotes => !enclose.nil?
        }
      end

      # Get the appropriate mode to open the file stream
      def mode
        append ? 'a' : 'w'
      end
    end
  end
end
@@ -0,0 +1,74 @@
1
+ require 'yaml'
2
+
3
module ETL #:nodoc:
  module Control #:nodoc:
    # Destination which dumps each row to a file as a YAML document.
    class YamlDestination < Destination
      attr_reader :file, :append, :only, :except

      # Build the destination.
      # * <tt>control</tt>: The Control object
      # * <tt>configuration</tt>: The configuration map
      # * <tt>mapping</tt>: The output mapping
      #
      # Configuration options:
      # * <tt>:file</tt>: The file to write to, resolved against the
      #   directory of the control file (REQUIRED)
      # * <tt>:append</tt>: Set to true to append to the file (default is to overwrite)
      # * <tt>:only</tt>: When given, only keys included in it are written
      # * <tt>:except</tt>: When given, keys included in it are left out
      #
      # Raises ControlError when both :only and :except are supplied.
      def initialize(control, configuration, mapping={})
        super
        @file = File.join(File.dirname(control.file), configuration[:file])
        @append = configuration[:append] ||= false
        @only = configuration[:only]
        @except = configuration[:except]
        if @only && @except
          raise ControlError, "the :only and :except options must be used seperately, do not specify both"
        end
      end

      # Flush whatever is still buffered, then close the underlying stream.
      def close
        flush
        f.close
      end

      # Write every buffered row that passes the row filter to the file as
      # YAML, then flush the stream and empty the buffer.
      def flush
        buffer.flatten.each do |row|
          # rows rejected by the compound key constraint are skipped; the
          # constraint is checked before virtual fields are added
          next unless row_allowed?(row)
          add_virtuals!(row)

          document = {}
          row.each do |key, value|
            next unless write_key?(key)

            # dates and times are stored in their :db string form
            temporal = [Date, Time, DateTime].any? { |type| type === value }
            document[key] = temporal ? value.to_s(:db) : value
          end

          YAML.dump(document, f)
        end
        f.flush
        buffer.clear
      end

      private

      # True when the key survives the :only / :except filters.
      def write_key?(key)
        return false if only && !only.include?(key)
        return false if except && except.include?(key)
        true
      end

      # The output stream, opened on first use.
      def f
        @f ||= File.open(file, mode)
      end

      # File mode for the stream: append when requested, overwrite otherwise.
      def mode
        append ? 'a' : 'w'
      end
    end
  end
end
@@ -0,0 +1,109 @@
1
module ETL #:nodoc:
  module Control #:nodoc:
    # ETL source. Subclasses must implement the <tt>each</tt> method.
    class Source
      include Enumerable

      # The control object
      attr_accessor :control

      # The configuration Hash
      attr_accessor :configuration

      # The definition Hash
      attr_accessor :definition

      # Returns true if the source data should be stored locally for archival
      # Default behavior will return true.
      attr_accessor :store_locally

      # The base directory where local files are stored. Only the writer is
      # generated here; the reader is defined below so that it can supply a
      # default.
      attr_writer :local_base

      class << self
        # Convert the name to a Source class.
        #
        # For example if name is :database then this will return a
        # DatabaseSource class
        def class_for_name(name)
          ETL::Control.const_get("#{name.to_s.camelize}Source")
        end
      end

      # Initialize the Source instance
      # * <tt>control</tt>: The control object
      # * <tt>configuration</tt>: The configuration hash
      # * <tt>definition</tt>: The source layout definition
      #
      # Configuration options:
      # * <tt>:store_locally</tt>: Set to false to not store source data
      #   locally (defaults to true)
      def initialize(control, configuration, definition)
        @control = control
        @configuration = configuration
        @definition = definition

        # Only fall back to true when the option is absent. The previous
        # `configuration[:store_locally] || true` evaluated to true even when
        # false was given, so local storage could never be switched off.
        store_option = configuration[:store_locally]
        @store_locally = store_option.nil? ? true : store_option
      end

      # Get an array of errors that occur during reading from the source
      def errors
        @errors ||= []
      end

      # Get a timestamp value as a string
      def timestamp
        Engine.timestamp
      end

      # Get the local base, defaults to 'source_data'
      def local_base
        @local_base ||= 'source_data'
      end

      # The local directory for storing. This method must be overriden by
      # subclasses
      def local_directory
        raise "local_directory method is abstract"
      end

      # Return the local file for storing the raw source data. Each call to
      # this method will result in a timestamped file, so you cannot expect
      # to call it multiple times and reference the same file
      #
      # Optional sequence can be specified if there are multiple source files
      #
      # The local directory is created if it does not exist yet.
      def local_file(sequence=nil)
        filename = timestamp.to_s
        filename += sequence.to_s if sequence

        local_dir = local_directory
        FileUtils.mkdir_p(local_dir)
        File.join(local_dir, "#{filename}.csv")
      end

      # Get the last fully written local file, i.e. the path of the newest
      # trigger file with its '.trig' extension removed.
      #
      # NOTE(review): when no trigger file exists the trigger is nil and
      # File.basename raises a TypeError.
      def last_local_file
        File.join(local_directory, File.basename(last_local_file_trigger, '.trig'))
      end

      # Get the last local file trigger, or nil when there is none.
      #
      # The greatest path is selected explicitly instead of relying on the
      # order in which Dir.glob happens to return its matches. This assumes
      # the timestamp-based file names sort chronologically.
      def last_local_file_trigger
        Dir.glob(File.join(local_directory, '*.trig')).max
      end

      # Get the local trigger file that is used to indicate that the file has
      # been completely written
      def local_file_trigger(file)
        Pathname.new(file.to_s + '.trig')
      end

      # Return true if the source should read locally.
      def read_locally
        Engine.read_locally
      end

    end
  end
end
108
+
109
+ Dir[File.dirname(__FILE__) + "/source/*.rb"].each { |file| require(file) }
@@ -0,0 +1,220 @@
1
+ require 'fileutils'
2
+
3
module ETL #:nodoc:
  class Source < ::ActiveRecord::Base #:nodoc:
    # Connection for database sources
  end

  module Control #:nodoc:
    # Source object which extracts data from a database using ActiveRecord.
    class DatabaseSource < Source
      attr_accessor :target
      attr_accessor :table

      # Initialize the source.
      #
      # Arguments:
      # * <tt>control</tt>: The ETL::Control::Control instance
      # * <tt>configuration</tt>: The configuration Hash
      # * <tt>definition</tt>: The source definition
      #
      # Required configuration options:
      # * <tt>:target</tt>: The target connection
      # * <tt>:table</tt>: The source table name
      # * <tt>:database</tt>: The database name
      #
      # Other options:
      # * <tt>:join</tt>: Optional join part for the query (ignored unless
      #   specified)
      # * <tt>:select</tt>: Optional select part for the query (defaults to
      #   '*')
      # * <tt>:group</tt>: Optional group by part for the query (ignored
      #   unless specified)
      # * <tt>:order</tt>: Optional order part for the query (ignored unless
      #   specified)
      # * <tt>:conditions</tt>: Optional SQL fragment added to the WHERE
      #   clause
      # * <tt>:new_records_only</tt>: Specify the column to use when comparing
      #   timestamps against the last successful ETL job execution for the
      #   current control file.
      # * <tt>:store_locally</tt>: Set to false to not store a copy of the
      #   source data locally in a flat file (defaults to true)
      def initialize(control, configuration, definition)
        super
        @target = configuration[:target]
        @table = configuration[:table]
      end

      # Get a String identifier for the source
      def to_s
        "#{host}/#{database}/#{table}"
      end

      # Get the local directory to use, which is a combination of the
      # local_base, the db hostname the db database name and the db table.
      def local_directory
        File.join(local_base, host, database, configuration[:table])
      end

      # Get the join part of the query, defaults to nil
      def join
        configuration[:join]
      end

      # Get the select part of the query, defaults to '*'
      def select
        configuration[:select] || '*'
      end

      # Get the group by part of the query, defaults to nil
      def group
        configuration[:group]
      end

      # Get the order for the query, defaults to nil
      def order
        configuration[:order]
      end

      # Return the column which is used for in the where clause to identify
      # new rows
      def new_records_only
        configuration[:new_records_only]
      end

      # Get the number of rows in the source. The value is cached; pass
      # false to force it to be computed again.
      #
      # When rows are stored or read locally the lines of the last local
      # file are counted, otherwise a count query derived from the source
      # query is executed.
      def count(use_cache=true)
        return @count if @count && use_cache
        if store_locally || read_locally
          @count = count_locally
        else
          @count = connection.select_value(query.gsub(/SELECT .* FROM/, 'SELECT count(1) FROM'))
        end
      end

      # Get the list of columns to read, taken from the keys of the first
      # row returned by the query.
      def columns
        # weird default is required for writing to cache correctly
        @columns ||= query_rows.any? ? query_rows.first.keys : ['']
      end

      # Returns each row from the source. If read_locally is specified then
      # this method will attempt to read from the last stored local file.
      # If no locally stored file exists or if the trigger file for the last
      # locally stored file does not exist then this method will raise an
      # error.
      def each(&block)
        if read_locally # Read from the last stored source
          ETL::Engine.logger.debug "Reading from local cache"
          read_rows(last_local_file, &block)
        elsif store_locally # Store the original source, then read the copy
          file = local_file
          write_local(file)
          read_rows(file, &block)
        else # Read from the original source
          query_rows.each do |row|
            row = ETL::Row.new(row.symbolize_keys)
            row.source = self
            yield row
          end
        end
      end

      private

      # Read rows from the local cache, yielding an ETL::Row for every CSV
      # record. Raises when the file or its trigger file is missing.
      def read_rows(file)
        # File.exist? replaces the deprecated File.exists?, which no longer
        # exists on current Ruby versions.
        raise "Local cache file not found" unless File.exist?(file)
        raise "Local cache trigger file not found" unless File.exist?(local_file_trigger(file))

        t = Benchmark.realtime do
          # foreach closes the file once iteration ends, instead of leaving
          # the handle open as FasterCSV.open(...).each did
          FasterCSV.foreach(file, :headers => true) do |row|
            result_row = ETL::Row.new
            result_row.source = self
            row.each do |header, field|
              result_row[header.to_sym] = field
            end
            yield result_row
          end
        end
        ETL::Engine.average_rows_per_second = ETL::Engine.rows_read / t
      end

      # Count the lines of the last local file.
      #
      # NOTE(review): every physical line is counted, so the header line
      # written by write_local is included - confirm that callers expect
      # this.
      def count_locally
        counter = 0
        # File.foreach closes the file when done; the previous
        # File.open(...).each left the handle open
        File.foreach(last_local_file) { |line| counter += 1 }
        counter
      end

      # Write rows to the local cache. The trigger file is created only
      # after the CSV has been written completely.
      def write_local(file)
        lines = 0
        t = Benchmark.realtime do
          FasterCSV.open(file, 'w') do |f|
            f << columns
            query_rows.each do |row|
              f << columns.collect { |column| row[column.to_s] }
              lines += 1
            end
          end
          File.open(local_file_trigger(file), 'w') {|f| }
        end
        ETL::Engine.logger.info "Stored locally in #{t}s (avg: #{lines/t} lines/sec)"
      end

      # Get the query to use. The SQL is built once and memoized.
      def query
        return @query if @query
        q = "SELECT #{select} FROM #{configuration[:table]}"
        q << " #{join}" if join

        conditions = []
        if new_records_only
          # restrict to rows newer than the last completed job for this
          # control file, when there is one
          last_completed = ETL::Execution::Job.maximum('created_at',
            :conditions => ['control_file = ? and completed_at is not null', control.file]
          )
          if last_completed
            conditions << "#{new_records_only} > #{connection.quote(last_completed.to_s(:db))}"
          end
        end

        conditions << configuration[:conditions] if configuration[:conditions]
        if conditions.length > 0
          q << " WHERE #{conditions.join(' AND ')}"
        end

        q << " GROUP BY #{group}" if group
        q << " ORDER BY #{order}" if order

        if ETL::Engine.limit || ETL::Engine.offset
          options = {}
          options[:limit] = ETL::Engine.limit if ETL::Engine.limit
          options[:offset] = ETL::Engine.offset if ETL::Engine.offset
          connection.add_limit_offset!(q, options)
        end

        q = q.gsub(/\n/,' ')
        ETL::Engine.logger.info "Query: #{q}"
        @query = q
      end

      # Get the rows returned by the query (memoized)
      def query_rows
        @query_rows ||= connection.select_all(query)
      end

      # Get the database connection to use
      def connection
        ETL::Engine.connection(target)
      end

      # Get the host, defaults to 'localhost'
      def host
        ETL::Base.configurations[target.to_s]['host'] || 'localhost'
      end

      # Get the database name configured for the target connection
      def database
        ETL::Base.configurations[target.to_s]['database']
      end
    end
  end
end