activewarehouse-etl 0.7.2 → 0.8.0
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +18 -1
- data/lib/etl/control/destination.rb +21 -10
- data/lib/etl/control/destination/database_destination.rb +2 -2
- data/lib/etl/control/destination/file_destination.rb +1 -1
- data/lib/etl/control/source.rb +10 -0
- data/lib/etl/control/source/database_source.rb +37 -3
- data/lib/etl/control/source/file_source.rb +1 -0
- data/lib/etl/engine.rb +2 -1
- data/lib/etl/execution/base.rb +1 -1
- data/lib/etl/generator/surrogate_key_generator.rb +9 -0
- data/lib/etl/parser/delimited_parser.rb +1 -1
- data/lib/etl/processor/check_exist_processor.rb +3 -3
- data/lib/etl/processor/surrogate_key_processor.rb +3 -1
- data/lib/etl/row.rb +3 -0
- data/lib/etl/version.rb +2 -2
- metadata +2 -2
data/CHANGELOG
CHANGED
@@ -101,4 +101,21 @@
|
|
101
101
|
* Fixed source caching
|
102
102
|
|
103
103
|
0.7.2 - Apr 8, 2007
|
104
|
-
* Fixed quoting bug in CheckExistProcessor
|
104
|
+
* Fixed quoting bug in CheckExistProcessor
|
105
|
+
|
106
|
+
0.8.0 - Apr 12, 2007
|
107
|
+
* Source now available through the current row source accessor.
|
108
|
+
* Added new_records_only configuration option to DatabaseSource. A date field must be specified and
|
109
|
+
only records that are greater than the date value in that field, relative to the last successful
|
110
|
+
execution, will be returned from the source.
|
111
|
+
* Added an (untested) count feature which returns the number of rows for processing.
|
112
|
+
* If no natural key is defined then an empty array will now be used, resulting in the row being
|
113
|
+
written to the output without going through change checks.
|
114
|
+
* Mapping argument in destination is now optional. An empty hash will be used if the mapping
|
115
|
+
hash is not specified. If the mapping hash is not specified then the order will be determined
|
116
|
+
using the originating source's order.
|
117
|
+
* ActiveRecord configurations loaded from database.yml by the etl tool will be merged with
|
118
|
+
ActiveRecord::Base.configurations.
|
119
|
+
* Fixed several bugs in how record change detection was implemented.
|
120
|
+
* Fixed how the read_locally functionality was implemented so that it will find the last
|
121
|
+
completed local source copy using the source's trigger file (untested).
|
@@ -162,10 +162,16 @@ module ETL #:nodoc:
|
|
162
162
|
|
163
163
|
# Process a row to determine the change type
|
164
164
|
def process_change(row)
|
165
|
+
ETL::Engine.logger.debug "Processing row: #{row.inspect}"
|
165
166
|
return unless row
|
166
167
|
|
167
168
|
# Change processing can only occur if the natural key exists in the row
|
168
|
-
|
169
|
+
ETL::Engine.logger.debug "Checking for natural key existence"
|
170
|
+
if natural_key.length == 0
|
171
|
+
buffer << row
|
172
|
+
return
|
173
|
+
end
|
174
|
+
|
169
175
|
natural_key.each do |key|
|
170
176
|
unless row.has_key?(key)
|
171
177
|
buffer << row
|
@@ -173,7 +179,7 @@ module ETL #:nodoc:
|
|
173
179
|
end
|
174
180
|
end
|
175
181
|
|
176
|
-
ETL::Engine.logger.debug "
|
182
|
+
ETL::Engine.logger.debug "Checking for SCD fields"
|
177
183
|
s = String.new
|
178
184
|
if scd_fields
|
179
185
|
scd_fields.each { |f| s << row[f].to_s }
|
@@ -193,7 +199,7 @@ module ETL #:nodoc:
|
|
193
199
|
|
194
200
|
timestamp = Time.now
|
195
201
|
|
196
|
-
ETL::Engine.logger.debug "
|
202
|
+
ETL::Engine.logger.debug "Checking record change type"
|
197
203
|
if record
|
198
204
|
if record.crc != crc.to_s
|
199
205
|
# SCD Type 1: only the new row should be added
|
@@ -206,7 +212,7 @@ module ETL #:nodoc:
|
|
206
212
|
q = "SELECT * FROM #{dimension_table} WHERE "
|
207
213
|
q << natural_key.collect { |nk| "#{nk} = '#{row[nk]}'" }.join(" AND ")
|
208
214
|
#puts "looking for original record"
|
209
|
-
result = ActiveRecord::Base.connection.select_one(q)
|
215
|
+
result = ETL::ActiveRecord::Base.connection.select_one(q)
|
210
216
|
if result
|
211
217
|
#puts "Result: #{result.inspect}"
|
212
218
|
original_record = ETL::Row[result.symbolize_keys!]
|
@@ -230,9 +236,12 @@ module ETL #:nodoc:
|
|
230
236
|
|
231
237
|
q = "SELECT * FROM #{dimension_table} WHERE "
|
232
238
|
q << natural_key.collect { |nk| "#{nk} = '#{row[nk]}'" }.join(" AND ")
|
233
|
-
result = ActiveRecord::Base.connection.select_one(q)
|
239
|
+
result = ETL::ActiveRecord::Base.connection.select_one(q)
|
234
240
|
if result
|
235
|
-
|
241
|
+
# This was necessary when truncating and then loading, however I
|
242
|
+
# am getting reluctant to having the ETL process do the truncation
|
243
|
+
# as part of the bulk load, favoring using a preprocessor instead.
|
244
|
+
# buffer << ETL::Row[result.symbolize_keys!]
|
236
245
|
else
|
237
246
|
# The record never made it into the database, so add the effective and end date
|
238
247
|
# and add it into the bulk load file
|
@@ -262,8 +271,6 @@ module ETL #:nodoc:
|
|
262
271
|
)
|
263
272
|
end
|
264
273
|
end
|
265
|
-
rescue => e
|
266
|
-
puts e
|
267
274
|
end
|
268
275
|
|
269
276
|
# Add any virtual fields to the row. Virtual fields will get their value
|
@@ -288,7 +295,11 @@ module ETL #:nodoc:
|
|
288
295
|
when Proc
|
289
296
|
row[key] = value.call(row)
|
290
297
|
else
|
291
|
-
|
298
|
+
if value.is_a?(ETL::Generator::Generator)
|
299
|
+
row[key] = value.next
|
300
|
+
else
|
301
|
+
row[key] = value
|
302
|
+
end
|
292
303
|
end
|
293
304
|
end
|
294
305
|
end
|
@@ -304,7 +315,7 @@ module ETL #:nodoc:
|
|
304
315
|
when String, Symbol
|
305
316
|
[configuration[:natural_key].to_sym]
|
306
317
|
else
|
307
|
-
[
|
318
|
+
[] # no natural key defined
|
308
319
|
end
|
309
320
|
end
|
310
321
|
end
|
@@ -30,7 +30,7 @@ module ETL #:nodoc:
|
|
30
30
|
#
|
31
31
|
# Mapping options:
|
32
32
|
# * <tt>:order</tt>: The order of fields to write (REQUIRED)
|
33
|
-
def initialize(control, configuration, mapping)
|
33
|
+
def initialize(control, configuration, mapping={})
|
34
34
|
super
|
35
35
|
@truncate = configuration[:truncate] ||= false
|
36
36
|
@unique = configuration[:unique]
|
@@ -60,7 +60,7 @@ module ETL #:nodoc:
|
|
60
60
|
values << "'#{row[name]}'" # TODO: this is probably not database agnostic
|
61
61
|
end
|
62
62
|
q = "INSERT INTO #{configuration[:table]} (#{names.join(',')}) VALUES (#{values.join(',')})"
|
63
|
-
|
63
|
+
ETL::Engine.logger.debug("Executing insert: #{q}")
|
64
64
|
conn.insert(q, "Insert row #{current_row}")
|
65
65
|
@current_row += 1
|
66
66
|
end
|
@@ -38,7 +38,7 @@ module ETL #:nodoc:
|
|
38
38
|
#
|
39
39
|
# Mapping options:
|
40
40
|
# * <tt>:order</tt>: The order array
|
41
|
-
def initialize(control, configuration, mapping)
|
41
|
+
def initialize(control, configuration, mapping={})
|
42
42
|
super
|
43
43
|
@file = File.join(File.dirname(control.file), configuration[:file])
|
44
44
|
@append = configuration[:append] ||= false
|
data/lib/etl/control/source.rb
CHANGED
@@ -81,6 +81,16 @@ module ETL #:nodoc:
|
|
81
81
|
File.join(local_dir, "#{filename}.csv")
|
82
82
|
end
|
83
83
|
|
84
|
+
# Get the last fully written local file
|
85
|
+
def last_local_file
|
86
|
+
File.join(local_directory, File.basename(last_local_file_trigger, '.trig'))
|
87
|
+
end
|
88
|
+
|
89
|
+
# Get the last local file trigger
|
90
|
+
def last_local_file_trigger
|
91
|
+
Dir.glob(File.join(local_directory, '*.trig')).last
|
92
|
+
end
|
93
|
+
|
84
94
|
# Get the local trigger file that is used to indicate that the file has
|
85
95
|
# been completely written
|
86
96
|
def local_file_trigger(file)
|
@@ -65,6 +65,22 @@ module ETL #:nodoc:
|
|
65
65
|
configuration[:order]
|
66
66
|
end
|
67
67
|
|
68
|
+
# Return the column which is used in the where clause to identify
|
69
|
+
# new rows
|
70
|
+
def new_records_only
|
71
|
+
configuration[:new_records_only]
|
72
|
+
end
|
73
|
+
|
74
|
+
# Get the number of rows in the source
|
75
|
+
def count(use_cache=true)
|
76
|
+
return @count if @count && use_cache
|
77
|
+
if store_locally || read_locally
|
78
|
+
@count = count_locally
|
79
|
+
else
|
80
|
+
@count = connection.select_value(query.gsub(/SELECT .* FROM/, 'SELECT count(1) FROM'))
|
81
|
+
end
|
82
|
+
end
|
83
|
+
|
68
84
|
# Get the list of columns to read. This is defined in the source
|
69
85
|
# definition as either an Array or Hash
|
70
86
|
def columns
|
@@ -86,7 +102,7 @@ module ETL #:nodoc:
|
|
86
102
|
def each(&block)
|
87
103
|
if read_locally # Read from the last stored source
|
88
104
|
ETL::Engine.logger.debug "Reading from local cache"
|
89
|
-
read_rows(
|
105
|
+
read_rows(last_local_file, &block)
|
90
106
|
else # Read from the original source
|
91
107
|
if store_locally
|
92
108
|
file = local_file
|
@@ -94,7 +110,8 @@ module ETL #:nodoc:
|
|
94
110
|
read_rows(file, &block)
|
95
111
|
else
|
96
112
|
connection.select_all(query).each do |row|
|
97
|
-
row = Row.new(row.symbolize_keys)
|
113
|
+
row = ETL::Row.new(row.symbolize_keys)
|
114
|
+
row.source = self
|
98
115
|
yield row
|
99
116
|
end
|
100
117
|
end
|
@@ -110,6 +127,7 @@ module ETL #:nodoc:
|
|
110
127
|
t = Benchmark.realtime do
|
111
128
|
FasterCSV.open(file, :headers => true).each do |row|
|
112
129
|
result_row = ETL::Row.new
|
130
|
+
result_row.source = self
|
113
131
|
row.each do |header, field|
|
114
132
|
result_row[header.to_sym] = field
|
115
133
|
end
|
@@ -119,6 +137,12 @@ module ETL #:nodoc:
|
|
119
137
|
ETL::Engine.average_rows_per_second = ETL::Engine.rows_read / t
|
120
138
|
end
|
121
139
|
|
140
|
+
def count_locally
|
141
|
+
counter = 0
|
142
|
+
File.open(last_local_file, 'r').each { |line| counter += 1 }
|
143
|
+
counter
|
144
|
+
end
|
145
|
+
|
122
146
|
# Write rows to the local cache
|
123
147
|
def write_local(file)
|
124
148
|
lines = 0
|
@@ -140,6 +164,16 @@ module ETL #:nodoc:
|
|
140
164
|
return @query if @query
|
141
165
|
q = "SELECT #{select} FROM #{configuration[:table]}"
|
142
166
|
q << " #{join}" if join
|
167
|
+
|
168
|
+
if new_records_only
|
169
|
+
last_completed = ETL::Execution::Job.maximum('created_at',
|
170
|
+
:conditions => ['control_file = ? and completed_at is not null', control.file]
|
171
|
+
)
|
172
|
+
if last_completed
|
173
|
+
q << " WHERE #{new_records_only} > #{connection.quote(last_completed.to_s(:db))}"
|
174
|
+
end
|
175
|
+
end
|
176
|
+
|
143
177
|
q << " ORDER BY #{order}" if order
|
144
178
|
if ETL::Engine.limit || ETL::Engine.offset
|
145
179
|
options = {}
|
@@ -147,7 +181,7 @@ module ETL #:nodoc:
|
|
147
181
|
options[:offset] = ETL::Engine.offset if ETL::Engine.offset
|
148
182
|
connection.add_limit_offset!(q, options)
|
149
183
|
end
|
150
|
-
|
184
|
+
|
151
185
|
q = q.gsub(/\n/,' ')
|
152
186
|
ETL::Engine.logger.info "Query: #{q}"
|
153
187
|
@query = q
|
data/lib/etl/engine.rb
CHANGED
@@ -20,7 +20,8 @@ module ETL #:nodoc:
|
|
20
20
|
@read_locally = options[:read_locally]
|
21
21
|
options[:config] ||= 'database.yml'
|
22
22
|
database_configuration = YAML::load(ERB.new(IO.read(options[:config])).result + "\n")
|
23
|
-
ActiveRecord::Base.configurations = database_configuration
|
23
|
+
ETL::ActiveRecord::Base.configurations = database_configuration
|
24
|
+
ActiveRecord::Base.configurations.merge!(ETL::ActiveRecord::Base.configurations)
|
24
25
|
require 'etl/execution'
|
25
26
|
ETL::Execution::Base.establish_connection :etl_execution
|
26
27
|
ETL::Execution::Execution.migrate
|
data/lib/etl/execution/base.rb
CHANGED
@@ -4,6 +4,15 @@ module ETL #:nodoc:
|
|
4
4
|
module Generator #:nodoc:
|
5
5
|
# Surrogate key generator.
|
6
6
|
class SurrogateKeyGenerator < Generator
|
7
|
+
# Initialize the generator
|
8
|
+
def initialize(options={})
|
9
|
+
if options[:query]
|
10
|
+
@surrogate_key = ETL::ActiveRecord::Base.connection.select_value(options[:query])
|
11
|
+
@surrogate_key = 0 if @surrogate_key.blank?
|
12
|
+
@surrogate_key = @surrogate_key.to_i
|
13
|
+
end
|
14
|
+
end
|
15
|
+
|
7
16
|
# Get the next surrogate key
|
8
17
|
def next
|
9
18
|
@surrogate_key ||= 0
|
@@ -41,7 +41,7 @@ module ETL #:nodoc:
|
|
41
41
|
|
42
42
|
private
|
43
43
|
def validate_row(row, line, file)
|
44
|
-
ETL::Engine.logger.debug "
|
44
|
+
ETL::Engine.logger.debug "validating line #{line} in file #{file}"
|
45
45
|
if row.length != fields.length
|
46
46
|
raise_with_info( MismatchError,
|
47
47
|
"The number of rows from the source (#{row.length}) does not match the number of rows in the definition (#{fields.length})",
|
@@ -30,7 +30,7 @@ module ETL #:nodoc:
|
|
30
30
|
@columns = configuration[:columns]
|
31
31
|
|
32
32
|
q = "SELECT COUNT(*) FROM #{table}"
|
33
|
-
@should_check = ActiveRecord::Base.connection.select_value(q).to_i > 0
|
33
|
+
@should_check = ETL::ActiveRecord::Base.connection.select_value(q).to_i > 0
|
34
34
|
end
|
35
35
|
|
36
36
|
# Return true if the given key should be skipped
|
@@ -51,12 +51,12 @@ module ETL #:nodoc:
|
|
51
51
|
# Process the row
|
52
52
|
def process(row)
|
53
53
|
return row unless should_check?
|
54
|
-
connection = ActiveRecord::Base.connection
|
54
|
+
connection = ETL::ActiveRecord::Base.connection
|
55
55
|
q = "SELECT * FROM #{table} WHERE "
|
56
56
|
conditions = []
|
57
57
|
row.each do |k,v|
|
58
58
|
if columns.nil? || columns.include?(k.to_sym)
|
59
|
-
conditions << "#{k} = #{connection.quote(v)}" unless skip?(k)
|
59
|
+
conditions << "#{k} = #{connection.quote(v)}" unless skip?(k.to_sym)
|
60
60
|
end
|
61
61
|
end
|
62
62
|
q << conditions.join(" AND ")
|
@@ -8,7 +8,9 @@ module ETL #:nodoc:
|
|
8
8
|
# Initialize the surrogate key generator
|
9
9
|
def initialize(control, configuration)
|
10
10
|
super
|
11
|
-
|
11
|
+
if configuration[:query]
|
12
|
+
@surrogate_key = ETL::ActiveRecord::Base.connection.select_value(configuration[:query])
|
13
|
+
end
|
12
14
|
#puts "initial surrogate key: #{@surrogate_key}"
|
13
15
|
@surrogate_key = 0 if @surrogate_key.blank?
|
14
16
|
@surrogate_key = @surrogate_key.to_i
|
data/lib/etl/row.rb
CHANGED
data/lib/etl/version.rb
CHANGED
metadata
CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.9.0.10
|
|
3
3
|
specification_version: 1
|
4
4
|
name: activewarehouse-etl
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version: 0.
|
7
|
-
date: 2007-04-
|
6
|
+
version: 0.8.0
|
7
|
+
date: 2007-04-12 00:00:00 -04:00
|
8
8
|
summary: Pure Ruby ETL package.
|
9
9
|
require_paths:
|
10
10
|
- lib
|