activewarehouse-etl 0.7.2 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +18 -1
- data/lib/etl/control/destination.rb +21 -10
- data/lib/etl/control/destination/database_destination.rb +2 -2
- data/lib/etl/control/destination/file_destination.rb +1 -1
- data/lib/etl/control/source.rb +10 -0
- data/lib/etl/control/source/database_source.rb +37 -3
- data/lib/etl/control/source/file_source.rb +1 -0
- data/lib/etl/engine.rb +2 -1
- data/lib/etl/execution/base.rb +1 -1
- data/lib/etl/generator/surrogate_key_generator.rb +9 -0
- data/lib/etl/parser/delimited_parser.rb +1 -1
- data/lib/etl/processor/check_exist_processor.rb +3 -3
- data/lib/etl/processor/surrogate_key_processor.rb +3 -1
- data/lib/etl/row.rb +3 -0
- data/lib/etl/version.rb +2 -2
- metadata +2 -2
data/CHANGELOG
CHANGED
@@ -101,4 +101,21 @@
|
|
101
101
|
* Fixed source caching
|
102
102
|
|
103
103
|
0.7.2 - Apr 8, 2007
|
104
|
-
* Fixed quoting bug in CheckExistProcessor
|
104
|
+
* Fixed quoting bug in CheckExistProcessor
|
105
|
+
|
106
|
+
0.8.0 - Apr 12, 2007
|
107
|
+
* Source now available through the current row source accessor.
|
108
|
+
* Added new_records_only configuration option to DatabaseSource. A date field must be specified and
|
109
|
+
only records that are greater than the date value in that field, relative to the last successful
|
110
|
+
execution, will be returned from the source.
|
111
|
+
* Added an (untested) count feature which returns the number of rows for processing.
|
112
|
+
* If no natural key is defined then an empty array will now be used, resulting in the row being
|
113
|
+
written to the output without going through change checks.
|
114
|
+
* Mapping argument in destination is now optional. An empty hash will be used if the mapping
|
115
|
+
hash is not specified. If the mapping hash is not specified then the order will be determined
|
116
|
+
using the originating source's order.
|
117
|
+
* ActiveRecord configurations loaded from database.yml by the etl tool will be merged with
|
118
|
+
ActiveRecord::Base.configurations.
|
119
|
+
* Fixed several bugs in how record change detection was implemented.
|
120
|
+
* Fixed how the read_locally functionality was implemented so that it will find the last
|
121
|
+
completed local source copy using the source's trigger file (untested).
|
@@ -162,10 +162,16 @@ module ETL #:nodoc:
|
|
162
162
|
|
163
163
|
# Process a row to determine the change type
|
164
164
|
def process_change(row)
|
165
|
+
ETL::Engine.logger.debug "Processing row: #{row.inspect}"
|
165
166
|
return unless row
|
166
167
|
|
167
168
|
# Change processing can only occur if the natural key exists in the row
|
168
|
-
|
169
|
+
ETL::Engine.logger.debug "Checking for natural key existence"
|
170
|
+
if natural_key.length == 0
|
171
|
+
buffer << row
|
172
|
+
return
|
173
|
+
end
|
174
|
+
|
169
175
|
natural_key.each do |key|
|
170
176
|
unless row.has_key?(key)
|
171
177
|
buffer << row
|
@@ -173,7 +179,7 @@ module ETL #:nodoc:
|
|
173
179
|
end
|
174
180
|
end
|
175
181
|
|
176
|
-
ETL::Engine.logger.debug "
|
182
|
+
ETL::Engine.logger.debug "Checking for SCD fields"
|
177
183
|
s = String.new
|
178
184
|
if scd_fields
|
179
185
|
scd_fields.each { |f| s << row[f].to_s }
|
@@ -193,7 +199,7 @@ module ETL #:nodoc:
|
|
193
199
|
|
194
200
|
timestamp = Time.now
|
195
201
|
|
196
|
-
ETL::Engine.logger.debug "
|
202
|
+
ETL::Engine.logger.debug "Checking record change type"
|
197
203
|
if record
|
198
204
|
if record.crc != crc.to_s
|
199
205
|
# SCD Type 1: only the new row should be added
|
@@ -206,7 +212,7 @@ module ETL #:nodoc:
|
|
206
212
|
q = "SELECT * FROM #{dimension_table} WHERE "
|
207
213
|
q << natural_key.collect { |nk| "#{nk} = '#{row[nk]}'" }.join(" AND ")
|
208
214
|
#puts "looking for original record"
|
209
|
-
result = ActiveRecord::Base.connection.select_one(q)
|
215
|
+
result = ETL::ActiveRecord::Base.connection.select_one(q)
|
210
216
|
if result
|
211
217
|
#puts "Result: #{result.inspect}"
|
212
218
|
original_record = ETL::Row[result.symbolize_keys!]
|
@@ -230,9 +236,12 @@ module ETL #:nodoc:
|
|
230
236
|
|
231
237
|
q = "SELECT * FROM #{dimension_table} WHERE "
|
232
238
|
q << natural_key.collect { |nk| "#{nk} = '#{row[nk]}'" }.join(" AND ")
|
233
|
-
result = ActiveRecord::Base.connection.select_one(q)
|
239
|
+
result = ETL::ActiveRecord::Base.connection.select_one(q)
|
234
240
|
if result
|
235
|
-
|
241
|
+
# This was necessary when truncating and then loading, however I
|
242
|
+
# am getting reluctant to having the ETL process do the truncation
|
243
|
+
# as part of the bulk load, favoring using a preprocessor instead.
|
244
|
+
# buffer << ETL::Row[result.symbolize_keys!]
|
236
245
|
else
|
237
246
|
# The record never made it into the database, so add the effective and end date
|
238
247
|
# and add it into the bulk load file
|
@@ -262,8 +271,6 @@ module ETL #:nodoc:
|
|
262
271
|
)
|
263
272
|
end
|
264
273
|
end
|
265
|
-
rescue => e
|
266
|
-
puts e
|
267
274
|
end
|
268
275
|
|
269
276
|
# Add any virtual fields to the row. Virtual rows will get their value
|
@@ -288,7 +295,11 @@ module ETL #:nodoc:
|
|
288
295
|
when Proc
|
289
296
|
row[key] = value.call(row)
|
290
297
|
else
|
291
|
-
|
298
|
+
if value.is_a?(ETL::Generator::Generator)
|
299
|
+
row[key] = value.next
|
300
|
+
else
|
301
|
+
row[key] = value
|
302
|
+
end
|
292
303
|
end
|
293
304
|
end
|
294
305
|
end
|
@@ -304,7 +315,7 @@ module ETL #:nodoc:
|
|
304
315
|
when String, Symbol
|
305
316
|
[configuration[:natural_key].to_sym]
|
306
317
|
else
|
307
|
-
[
|
318
|
+
[] # no natural key defined
|
308
319
|
end
|
309
320
|
end
|
310
321
|
end
|
@@ -30,7 +30,7 @@ module ETL #:nodoc:
|
|
30
30
|
#
|
31
31
|
# Mapping options:
|
32
32
|
# * <tt>:order</tt>: The order of fields to write (REQUIRED)
|
33
|
-
def initialize(control, configuration, mapping)
|
33
|
+
def initialize(control, configuration, mapping={})
|
34
34
|
super
|
35
35
|
@truncate = configuration[:truncate] ||= false
|
36
36
|
@unique = configuration[:unique]
|
@@ -60,7 +60,7 @@ module ETL #:nodoc:
|
|
60
60
|
values << "'#{row[name]}'" # TODO: this is probably not database agnostic
|
61
61
|
end
|
62
62
|
q = "INSERT INTO #{configuration[:table]} (#{names.join(',')}) VALUES (#{values.join(',')})"
|
63
|
-
|
63
|
+
ETL::Engine.logger.debug("Executing insert: #{q}")
|
64
64
|
conn.insert(q, "Insert row #{current_row}")
|
65
65
|
@current_row += 1
|
66
66
|
end
|
@@ -38,7 +38,7 @@ module ETL #:nodoc:
|
|
38
38
|
#
|
39
39
|
# Mapping options:
|
40
40
|
# * <tt>:order</tt>: The order array
|
41
|
-
def initialize(control, configuration, mapping)
|
41
|
+
def initialize(control, configuration, mapping={})
|
42
42
|
super
|
43
43
|
@file = File.join(File.dirname(control.file), configuration[:file])
|
44
44
|
@append = configuration[:append] ||= false
|
data/lib/etl/control/source.rb
CHANGED
@@ -81,6 +81,16 @@ module ETL #:nodoc:
|
|
81
81
|
File.join(local_dir, "#{filename}.csv")
|
82
82
|
end
|
83
83
|
|
84
|
+
# Get the last fully written local file
|
85
|
+
def last_local_file
|
86
|
+
File.join(local_directory, File.basename(last_local_file_trigger, '.trig'))
|
87
|
+
end
|
88
|
+
|
89
|
+
# Get the last local file trigger
|
90
|
+
def last_local_file_trigger
|
91
|
+
Dir.glob(File.join(local_directory, '*.trig')).last
|
92
|
+
end
|
93
|
+
|
84
94
|
# Get the local trigger file that is used to indicate that the file has
|
85
95
|
# been completely written
|
86
96
|
def local_file_trigger(file)
|
@@ -65,6 +65,22 @@ module ETL #:nodoc:
|
|
65
65
|
configuration[:order]
|
66
66
|
end
|
67
67
|
|
68
|
+
# Return the column which is used in the where clause to identify
|
69
|
+
# new rows
|
70
|
+
def new_records_only
|
71
|
+
configuration[:new_records_only]
|
72
|
+
end
|
73
|
+
|
74
|
+
# Get the number of rows in the source
|
75
|
+
def count(use_cache=true)
|
76
|
+
return @count if @count && use_cache
|
77
|
+
if store_locally || read_locally
|
78
|
+
@count = count_locally
|
79
|
+
else
|
80
|
+
@count = connection.select_value(query.gsub(/SELECT .* FROM/, 'SELECT count(1) FROM'))
|
81
|
+
end
|
82
|
+
end
|
83
|
+
|
68
84
|
# Get the list of columns to read. This is defined in the source
|
69
85
|
# definition as either an Array or Hash
|
70
86
|
def columns
|
@@ -86,7 +102,7 @@ module ETL #:nodoc:
|
|
86
102
|
def each(&block)
|
87
103
|
if read_locally # Read from the last stored source
|
88
104
|
ETL::Engine.logger.debug "Reading from local cache"
|
89
|
-
read_rows(
|
105
|
+
read_rows(last_local_file, &block)
|
90
106
|
else # Read from the original source
|
91
107
|
if store_locally
|
92
108
|
file = local_file
|
@@ -94,7 +110,8 @@ module ETL #:nodoc:
|
|
94
110
|
read_rows(file, &block)
|
95
111
|
else
|
96
112
|
connection.select_all(query).each do |row|
|
97
|
-
row = Row.new(row.symbolize_keys)
|
113
|
+
row = ETL::Row.new(row.symbolize_keys)
|
114
|
+
row.source = self
|
98
115
|
yield row
|
99
116
|
end
|
100
117
|
end
|
@@ -110,6 +127,7 @@ module ETL #:nodoc:
|
|
110
127
|
t = Benchmark.realtime do
|
111
128
|
FasterCSV.open(file, :headers => true).each do |row|
|
112
129
|
result_row = ETL::Row.new
|
130
|
+
result_row.source = self
|
113
131
|
row.each do |header, field|
|
114
132
|
result_row[header.to_sym] = field
|
115
133
|
end
|
@@ -119,6 +137,12 @@ module ETL #:nodoc:
|
|
119
137
|
ETL::Engine.average_rows_per_second = ETL::Engine.rows_read / t
|
120
138
|
end
|
121
139
|
|
140
|
+
def count_locally
|
141
|
+
counter = 0
|
142
|
+
File.open(last_local_file, 'r').each { |line| counter += 1 }
|
143
|
+
counter
|
144
|
+
end
|
145
|
+
|
122
146
|
# Write rows to the local cache
|
123
147
|
def write_local(file)
|
124
148
|
lines = 0
|
@@ -140,6 +164,16 @@ module ETL #:nodoc:
|
|
140
164
|
return @query if @query
|
141
165
|
q = "SELECT #{select} FROM #{configuration[:table]}"
|
142
166
|
q << " #{join}" if join
|
167
|
+
|
168
|
+
if new_records_only
|
169
|
+
last_completed = ETL::Execution::Job.maximum('created_at',
|
170
|
+
:conditions => ['control_file = ? and completed_at is not null', control.file]
|
171
|
+
)
|
172
|
+
if last_completed
|
173
|
+
q << " WHERE #{new_records_only} > #{connection.quote(last_completed.to_s(:db))}"
|
174
|
+
end
|
175
|
+
end
|
176
|
+
|
143
177
|
q << " ORDER BY #{order}" if order
|
144
178
|
if ETL::Engine.limit || ETL::Engine.offset
|
145
179
|
options = {}
|
@@ -147,7 +181,7 @@ module ETL #:nodoc:
|
|
147
181
|
options[:offset] = ETL::Engine.offset if ETL::Engine.offset
|
148
182
|
connection.add_limit_offset!(q, options)
|
149
183
|
end
|
150
|
-
|
184
|
+
|
151
185
|
q = q.gsub(/\n/,' ')
|
152
186
|
ETL::Engine.logger.info "Query: #{q}"
|
153
187
|
@query = q
|
data/lib/etl/engine.rb
CHANGED
@@ -20,7 +20,8 @@ module ETL #:nodoc:
|
|
20
20
|
@read_locally = options[:read_locally]
|
21
21
|
options[:config] ||= 'database.yml'
|
22
22
|
database_configuration = YAML::load(ERB.new(IO.read(options[:config])).result + "\n")
|
23
|
-
ActiveRecord::Base.configurations = database_configuration
|
23
|
+
ETL::ActiveRecord::Base.configurations = database_configuration
|
24
|
+
ActiveRecord::Base.configurations.merge!(ETL::ActiveRecord::Base.configurations)
|
24
25
|
require 'etl/execution'
|
25
26
|
ETL::Execution::Base.establish_connection :etl_execution
|
26
27
|
ETL::Execution::Execution.migrate
|
data/lib/etl/execution/base.rb
CHANGED
@@ -4,6 +4,15 @@ module ETL #:nodoc:
|
|
4
4
|
module Generator #:nodoc:
|
5
5
|
# Surrogate key generator.
|
6
6
|
class SurrogateKeyGenerator < Generator
|
7
|
+
# Initialize the generator
|
8
|
+
def initialize(options={})
|
9
|
+
if options[:query]
|
10
|
+
@surrogate_key = ETL::ActiveRecord::Base.connection.select_value(options[:query])
|
11
|
+
@surrogate_key = 0 if @surrogate_key.blank?
|
12
|
+
@surrogate_key = @surrogate_key.to_i
|
13
|
+
end
|
14
|
+
end
|
15
|
+
|
7
16
|
# Get the next surrogate key
|
8
17
|
def next
|
9
18
|
@surrogate_key ||= 0
|
@@ -41,7 +41,7 @@ module ETL #:nodoc:
|
|
41
41
|
|
42
42
|
private
|
43
43
|
def validate_row(row, line, file)
|
44
|
-
ETL::Engine.logger.debug "
|
44
|
+
ETL::Engine.logger.debug "validating line #{line} in file #{file}"
|
45
45
|
if row.length != fields.length
|
46
46
|
raise_with_info( MismatchError,
|
47
47
|
"The number of rows from the source (#{row.length}) does not match the number of rows in the definition (#{fields.length})",
|
@@ -30,7 +30,7 @@ module ETL #:nodoc:
|
|
30
30
|
@columns = configuration[:columns]
|
31
31
|
|
32
32
|
q = "SELECT COUNT(*) FROM #{table}"
|
33
|
-
@should_check = ActiveRecord::Base.connection.select_value(q).to_i > 0
|
33
|
+
@should_check = ETL::ActiveRecord::Base.connection.select_value(q).to_i > 0
|
34
34
|
end
|
35
35
|
|
36
36
|
# Return true if the given key should be skipped
|
@@ -51,12 +51,12 @@ module ETL #:nodoc:
|
|
51
51
|
# Process the row
|
52
52
|
def process(row)
|
53
53
|
return row unless should_check?
|
54
|
-
connection = ActiveRecord::Base.connection
|
54
|
+
connection = ETL::ActiveRecord::Base.connection
|
55
55
|
q = "SELECT * FROM #{table} WHERE "
|
56
56
|
conditions = []
|
57
57
|
row.each do |k,v|
|
58
58
|
if columns.nil? || columns.include?(k.to_sym)
|
59
|
-
conditions << "#{k} = #{connection.quote(v)}" unless skip?(k)
|
59
|
+
conditions << "#{k} = #{connection.quote(v)}" unless skip?(k.to_sym)
|
60
60
|
end
|
61
61
|
end
|
62
62
|
q << conditions.join(" AND ")
|
@@ -8,7 +8,9 @@ module ETL #:nodoc:
|
|
8
8
|
# Initialize the surrogate key generator
|
9
9
|
def initialize(control, configuration)
|
10
10
|
super
|
11
|
-
|
11
|
+
if configuration[:query]
|
12
|
+
@surrogate_key = ETL::ActiveRecord::Base.connection.select_value(configuration[:query])
|
13
|
+
end
|
12
14
|
#puts "initial surrogate key: #{@surrogate_key}"
|
13
15
|
@surrogate_key = 0 if @surrogate_key.blank?
|
14
16
|
@surrogate_key = @surrogate_key.to_i
|
data/lib/etl/row.rb
CHANGED
data/lib/etl/version.rb
CHANGED
metadata
CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.9.0.10
|
|
3
3
|
specification_version: 1
|
4
4
|
name: activewarehouse-etl
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version: 0.7.2
|
7
|
-
date: 2007-04-08 00:00:00 -04:00
|
6
|
+
version: 0.8.0
|
7
|
+
date: 2007-04-12 00:00:00 -04:00
|
8
8
|
summary: Pure Ruby ETL package.
|
9
9
|
require_paths:
|
10
10
|
- lib
|