activewarehouse-etl 0.7.2 → 0.8.0

data/CHANGELOG CHANGED
@@ -101,4 +101,21 @@
  * Fixed source caching
 
  0.7.2 - Apr 8, 2007
- * Fixed quoting bug in CheckExistProcessor
+ * Fixed quoting bug in CheckExistProcessor
+
+ 0.8.0 - Apr 12, 2007
+ * Source now available through the current row source accessor.
+ * Added new_rows_only configuration option to DatabaseSource. A date field must be specified and
+   only records that are greater than the date value in that field, relative to the last successful
+   execution, will be returned from the source.
+ * Added an (untested) count feature which returns the number of rows for processing.
+ * If no natural key is defined then an empty array will now be used, resulting in the row being
+   written to the output without going through change checks.
+ * Mapping argument in destination is now optional. An empty hash will be used if the mapping
+   hash is not specified. If the mapping hash is not specified then the order will be determined
+   using the originating source's order.
+ * ActiveRecord configurations loaded from database.yml by the etl tool will be merged with
+   ActiveRecord::Base.configurations.
+ * Fixed several bugs in how record change detection was implemented.
+ * Fixed how the read_locally functionality was implemented so that it will find the last
+   completed local source copy using the source's trigger file (untested).
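
The option is listed above as new_rows_only, while the DatabaseSource code further down reads configuration[:new_records_only]. A minimal, hypothetical control-file sketch of these additions follows; the DSL form, connection keys, and the table, column, and file names are assumptions — only :table and :new_records_only appear in the code below.

    # Hypothetical control file fragment: incremental extraction keyed on a
    # date column; only rows newer than the last completed run are selected.
    source :orders, {
      :database => 'operational',          # connection details are assumptions
      :table => 'orders',                  # configuration[:table], used to build the query
      :new_records_only => 'updated_at'    # configuration[:new_records_only]
    }

    # Mapping hash omitted: with 0.8.0 the destination falls back to the
    # originating source's order.
    destination :out, :file => 'orders.csv'
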
@@ -162,10 +162,16 @@ module ETL #:nodoc:
 
  # Process a row to determine the change type
  def process_change(row)
+ ETL::Engine.logger.debug "Processing row: #{row.inspect}"
  return unless row
 
  # Change processing can only occur if the natural key exists in the row
- supports_change = true
+ ETL::Engine.logger.debug "Checking for natural key existence"
+ if natural_key.length == 0
+ buffer << row
+ return
+ end
+
  natural_key.each do |key|
  unless row.has_key?(key)
  buffer << row
@@ -173,7 +179,7 @@ module ETL #:nodoc:
  end
  end
 
- ETL::Engine.logger.debug "checking scd fields"
+ ETL::Engine.logger.debug "Checking for SCD fields"
  s = String.new
  if scd_fields
  scd_fields.each { |f| s << row[f].to_s }
@@ -193,7 +199,7 @@ module ETL #:nodoc:
 
  timestamp = Time.now
 
- ETL::Engine.logger.debug "checking record change type"
+ ETL::Engine.logger.debug "Checking record change type"
  if record
  if record.crc != crc.to_s
  # SCD Type 1: only the new row should be added
@@ -206,7 +212,7 @@ module ETL #:nodoc:
  q = "SELECT * FROM #{dimension_table} WHERE "
  q << natural_key.collect { |nk| "#{nk} = '#{row[nk]}'" }.join(" AND ")
  #puts "looking for original record"
- result = ActiveRecord::Base.connection.select_one(q)
+ result = ETL::ActiveRecord::Base.connection.select_one(q)
  if result
  #puts "Result: #{result.inspect}"
  original_record = ETL::Row[result.symbolize_keys!]
@@ -230,9 +236,12 @@ module ETL #:nodoc:
 
  q = "SELECT * FROM #{dimension_table} WHERE "
  q << natural_key.collect { |nk| "#{nk} = '#{row[nk]}'" }.join(" AND ")
- result = ActiveRecord::Base.connection.select_one(q)
+ result = ETL::ActiveRecord::Base.connection.select_one(q)
  if result
- buffer << ETL::Row[result.symbolize_keys!]
+ # This was necessary when truncating and then loading, however I
+ # am getting reluctant to having the ETL process do the truncation
+ # as part of the bulk load, favoring using a preprocessor instead.
+ # buffer << ETL::Row[result.symbolize_keys!]
  else
  # The record never made it into the database, so add the effective and end date
  # and add it into the bulk load file
@@ -262,8 +271,6 @@ module ETL #:nodoc:
  )
  end
  end
- rescue => e
- puts e
  end
 
  # Add any virtual fields to the row. Virtual rows will get their value
@@ -288,7 +295,11 @@ module ETL #:nodoc:
  when Proc
  row[key] = value.call(row)
  else
- row[key] = value
+ if value.is_a?(ETL::Generator::Generator)
+ row[key] = value.next
+ else
+ row[key] = value
+ end
  end
  end
  end
@@ -304,7 +315,7 @@ module ETL #:nodoc:
  when String, Symbol
  [configuration[:natural_key].to_sym]
  else
- [:id]
+ [] # no natural key defined
  end
  end
  end
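
The practical effect of the natural key fallback change, as a short sketch; the Array branch is implied by the surrounding case expression and is an assumption here, and the field names are illustrative.

    # configuration[:natural_key]        natural_key
    #   [:customer_id, :order_id]    =>  [:customer_id, :order_id]   (assumed Array branch; change checks run)
    #   :customer_id / 'customer_id' =>  [:customer_id]              (change checks run)
    #   not specified                =>  []                          (row goes straight to the output,
    #                                                                 bypassing change detection; was [:id])
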
@@ -30,7 +30,7 @@ module ETL #:nodoc:
  #
  # Mapping options:
  # * <tt>:order</tt>: The order of fields to write (REQUIRED)
- def initialize(control, configuration, mapping)
+ def initialize(control, configuration, mapping={})
  super
  @truncate = configuration[:truncate] ||= false
  @unique = configuration[:unique]
@@ -60,7 +60,7 @@ module ETL #:nodoc:
  values << "'#{row[name]}'" # TODO: this is probably not database agnostic
  end
  q = "INSERT INTO #{configuration[:table]} (#{names.join(',')}) VALUES (#{values.join(',')})"
- # ETL::Engine.logger.debug("Query: #{q}")
+ ETL::Engine.logger.debug("Executing insert: #{q}")
  conn.insert(q, "Insert row #{current_row}")
  @current_row += 1
  end
@@ -38,7 +38,7 @@ module ETL #:nodoc:
  #
  # Mapping options:
  # * <tt>:order</tt>: The order array
- def initialize(control, configuration, mapping)
+ def initialize(control, configuration, mapping={})
  super
  @file = File.join(File.dirname(control.file), configuration[:file])
  @append = configuration[:append] ||= false
@@ -81,6 +81,16 @@ module ETL #:nodoc:
  File.join(local_dir, "#{filename}.csv")
  end
 
+ # Get the last fully written local file
+ def last_local_file
+ File.join(local_directory, File.basename(last_local_file_trigger, '.trig'))
+ end
+
+ # Get the last local file trigger
+ def last_local_file_trigger
+ Dir.glob(File.join(local_directory, '*.trig')).last
+ end
+
  # Get the local trigger file that is used to indicate that the file has
  # been completely written
  def local_file_trigger(file)
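
Roughly how the two new methods resolve the cached copy, assuming the trigger file is the data file name with a .trig suffix appended (file names here are illustrative):

    # local_directory contains, e.g.:
    #   orders.csv
    #   orders.csv.trig            # written once the local copy completed
    # last_local_file_trigger      #=> ".../orders.csv.trig"
    # last_local_file              #=> ".../orders.csv"   (basename with '.trig' stripped)
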
@@ -65,6 +65,22 @@ module ETL #:nodoc:
  configuration[:order]
  end
 
+ # Return the column which is used for in the where clause to identify
+ # new rows
+ def new_records_only
+ configuration[:new_records_only]
+ end
+
+ # Get the number of rows in the source
+ def count(use_cache=true)
+ return @count if @count && use_cache
+ if store_locally || read_locally
+ @count = count_locally
+ else
+ @count = connection.select_value(query.gsub(/SELECT .* FROM/, 'SELECT count(1) FROM'))
+ end
+ end
+
  # Get the list of columns to read. This is defined in the source
  # definition as either an Array or Hash
  def columns
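
A rough usage sketch of the new count method (described as untested in the CHANGELOG); the source variable stands in for a configured DatabaseSource.

    total = source.count         # first call runs SELECT count(1) or counts the local file
    total = source.count         # later calls return the cached value
    total = source.count(false)  # pass false to bypass the cache and re-count
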
@@ -86,7 +102,7 @@ module ETL #:nodoc:
  def each(&block)
  if read_locally # Read from the last stored source
  ETL::Engine.logger.debug "Reading from local cache"
- read_rows(local_file, &block)
+ read_rows(last_local_file, &block)
  else # Read from the original source
  if store_locally
  file = local_file
@@ -94,7 +110,8 @@ module ETL #:nodoc:
  read_rows(file, &block)
  else
  connection.select_all(query).each do |row|
- row = Row.new(row.symbolize_keys)
+ row = ETL::Row.new(row.symbolize_keys)
+ row.source = self
  yield row
  end
  end
@@ -110,6 +127,7 @@ module ETL #:nodoc:
  t = Benchmark.realtime do
  FasterCSV.open(file, :headers => true).each do |row|
  result_row = ETL::Row.new
+ result_row.source = self
  row.each do |header, field|
  result_row[header.to_sym] = field
  end
@@ -119,6 +137,12 @@ module ETL #:nodoc:
  ETL::Engine.average_rows_per_second = ETL::Engine.rows_read / t
  end
 
+ def count_locally
+ counter = 0
+ File.open(last_local_file, 'r').each { |line| counter += 1 }
+ counter
+ end
+
  # Write rows to the local cache
  def write_local(file)
  lines = 0
@@ -140,6 +164,16 @@ module ETL #:nodoc:
  return @query if @query
  q = "SELECT #{select} FROM #{configuration[:table]}"
  q << " #{join}" if join
+
+ if new_records_only
+ last_completed = ETL::Execution::Job.maximum('created_at',
+ :conditions => ['control_file = ? and completed_at is not null', control.file]
+ )
+ if last_completed
+ q << " WHERE #{new_records_only} > #{connection.quote(last_completed.to_s(:db))}"
+ end
+ end
+
  q << " ORDER BY #{order}" if order
  if ETL::Engine.limit || ETL::Engine.offset
  options = {}
@@ -147,7 +181,7 @@ module ETL #:nodoc:
  options[:offset] = ETL::Engine.offset if ETL::Engine.offset
  connection.add_limit_offset!(q, options)
  end
- #q << " LIMIT #{ETL::Engine.limit}" unless ETL::Engine.limit.nil?
+
  q = q.gsub(/\n/,' ')
  ETL::Engine.logger.info "Query: #{q}"
  @query = q
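
Approximately what the query method now produces when new_records_only is configured; the table, column, and timestamp values below are illustrative.

    # With :table => 'orders' and :new_records_only => 'updated_at', and a prior
    # completed execution recorded in ETL::Execution::Job, the query becomes roughly:
    #   SELECT * FROM orders WHERE updated_at > '2007-04-10 12:00:00'
    # If no completed execution exists for the control file, the WHERE clause is
    # omitted and the full table is read.
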
@@ -45,6 +45,7 @@ module ETL #:nodoc:
  # TODO skip rows if offset is defined
  # TODO stop processing if limit is reached
  row = ETL::Row[row]
+ row.source = self
  yield row
  end
  end
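
With the new accessor each row carries a reference back to the source that produced it; a brief sketch of what that exposes (method names other than source itself are taken from the hunks above).

    # Inside a transform or processor:
    row.source         #=> the ETL source instance that yielded this row
    row.source.order   #=> e.g. the order a destination falls back to when no mapping is given
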
data/lib/etl/engine.rb CHANGED
@@ -20,7 +20,8 @@ module ETL #:nodoc:
  @read_locally = options[:read_locally]
  options[:config] ||= 'database.yml'
  database_configuration = YAML::load(ERB.new(IO.read(options[:config])).result + "\n")
- ActiveRecord::Base.configurations = database_configuration
+ ETL::ActiveRecord::Base.configurations = database_configuration
+ ActiveRecord::Base.configurations.merge!(ETL::ActiveRecord::Base.configurations)
  require 'etl/execution'
  ETL::Execution::Base.establish_connection :etl_execution
  ETL::Execution::Execution.migrate
@@ -1,7 +1,7 @@
  module ETL #:nodoc:
  module Execution #:nodoc:
  # Base class for ETL execution information
- class Base < ActiveRecord::Base
+ class Base < ETL::ActiveRecord::Base
  self.abstract_class = true
  establish_connection :etl_execution
  end
@@ -4,6 +4,15 @@ module ETL #:nodoc:
  module Generator #:nodoc:
  # Surrogate key generator.
  class SurrogateKeyGenerator < Generator
+ # Initialize the generator
+ def initialize(options={})
+ if options[:query]
+ @surrogate_key = ETL::ActiveRecord::Base.connection.select_value(options[:query])
+ @surrogate_key = 0 if @surrogate_key.blank?
+ @surrogate_key = @surrogate_key.to_i
+ end
+ end
+
  # Get the next surrogate key
  def next
  @surrogate_key ||= 0
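
A short usage sketch of the new constructor; the query, table, and column names are hypothetical, and without :query the key simply starts at 0 as before.

    # Seed the generator from the current maximum key so new keys continue the sequence.
    gen = ETL::Generator::SurrogateKeyGenerator.new(
      :query => 'SELECT MAX(customer_key) FROM customer_dimension'
    )
    gen.next   # => current maximum + 1, or 1 if the table was empty
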
@@ -41,7 +41,7 @@ module ETL #:nodoc:
 
  private
  def validate_row(row, line, file)
- ETL::Engine.logger.debug "validing line #{line} in file #{file}"
+ ETL::Engine.logger.debug "validating line #{line} in file #{file}"
  if row.length != fields.length
  raise_with_info( MismatchError,
  "The number of rows from the source (#{row.length}) does not match the number of rows in the definition (#{fields.length})",
@@ -30,7 +30,7 @@ module ETL #:nodoc:
  @columns = configuration[:columns]
 
  q = "SELECT COUNT(*) FROM #{table}"
- @should_check = ActiveRecord::Base.connection.select_value(q).to_i > 0
+ @should_check = ETL::ActiveRecord::Base.connection.select_value(q).to_i > 0
  end
 
  # Return true if the given key should be skipped
@@ -51,12 +51,12 @@ module ETL #:nodoc:
  # Process the row
  def process(row)
  return row unless should_check?
- connection = ActiveRecord::Base.connection
+ connection = ETL::ActiveRecord::Base.connection
  q = "SELECT * FROM #{table} WHERE "
  conditions = []
  row.each do |k,v|
  if columns.nil? || columns.include?(k.to_sym)
- conditions << "#{k} = #{connection.quote(v)}" unless skip?(k)
+ conditions << "#{k} = #{connection.quote(v)}" unless skip?(k.to_sym)
  end
  end
  q << conditions.join(" AND ")
@@ -8,7 +8,9 @@ module ETL #:nodoc:
  # Initialize the surrogate key generator
  def initialize(control, configuration)
  super
- @surrogate_key = ActiveRecord::Base.connection.select_value(configuration[:query])
+ if configuration[:query]
+ @surrogate_key = ETL::ActiveRecord::Base.connection.select_value(configuration[:query])
+ end
  #puts "initial surrogate key: #{@surrogate_key}"
  @surrogate_key = 0 if @surrogate_key.blank?
  @surrogate_key = @surrogate_key.to_i
data/lib/etl/row.rb CHANGED
@@ -3,6 +3,9 @@
  module ETL #:nodoc:
  # This class represents a single row currently passing through the ETL pipeline
  class Row < Hash
+ # Accessor for the originating source
+ attr_accessor :source
+
  # All change types
  CHANGE_TYPES = [:insert, :update, :delete]
 
data/lib/etl/version.rb CHANGED
@@ -1,8 +1,8 @@
  module ETL#:nodoc:
  module VERSION #:nodoc:
  MAJOR = 0
- MINOR = 7
- TINY = 2
+ MINOR = 8
+ TINY = 0
 
  STRING = [MAJOR, MINOR, TINY].join('.')
  end
metadata CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.9.0.10
  specification_version: 1
  name: activewarehouse-etl
  version: !ruby/object:Gem::Version
- version: 0.7.2
- date: 2007-04-08 00:00:00 -04:00
+ version: 0.8.0
+ date: 2007-04-12 00:00:00 -04:00
  summary: Pure Ruby ETL package.
  require_paths:
  - lib