activewarehouse-etl 0.7.2 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/CHANGELOG CHANGED
@@ -101,4 +101,21 @@
101
101
  * Fixed source caching
102
102
 
103
103
  0.7.2 - Apr 8, 2007
104
- * Fixed quoting bug in CheckExistProcessor
104
+ * Fixed quoting bug in CheckExistProcessor
105
+
106
+ 0.8.0 - Apr 12, 2007
107
+ * Source now available through the current row source accessor.
108
+ * Added new_records_only configuration option to DatabaseSource. A date field must be specified and
109
+ only records that are greater than the date value in that field, relative to the last successful
110
+ execution, will be returned from the source.
111
+ * Added an (untested) count feature which returns the number of rows for processing.
112
+ * If no natural key is defined then an empty array will now be used, resulting in the row being
113
+ written to the output without going through change checks.
114
+ * Mapping argument in destination is now optional. An empty hash will be used if the mapping
115
+ hash is not specified. If the mapping hash is not specified then the order will be determined
116
+ using the originating source's order.
117
+ * ActiveRecord configurations loaded from database.yml by the etl tool will be merged with
118
+ ActiveRecord::Base.configurations.
119
+ * Fixed several bugs in how record change detection was implemented.
120
+ * Fixed how the read_locally functionality was implemented so that it will find the last
121
+ completed local source copy using the source's trigger file (untested).
@@ -162,10 +162,16 @@ module ETL #:nodoc:
162
162
 
163
163
  # Process a row to determine the change type
164
164
  def process_change(row)
165
+ ETL::Engine.logger.debug "Processing row: #{row.inspect}"
165
166
  return unless row
166
167
 
167
168
  # Change processing can only occur if the natural key exists in the row
168
- supports_change = true
169
+ ETL::Engine.logger.debug "Checking for natural key existence"
170
+ if natural_key.length == 0
171
+ buffer << row
172
+ return
173
+ end
174
+
169
175
  natural_key.each do |key|
170
176
  unless row.has_key?(key)
171
177
  buffer << row
@@ -173,7 +179,7 @@ module ETL #:nodoc:
173
179
  end
174
180
  end
175
181
 
176
- ETL::Engine.logger.debug "checking scd fields"
182
+ ETL::Engine.logger.debug "Checking for SCD fields"
177
183
  s = String.new
178
184
  if scd_fields
179
185
  scd_fields.each { |f| s << row[f].to_s }
@@ -193,7 +199,7 @@ module ETL #:nodoc:
193
199
 
194
200
  timestamp = Time.now
195
201
 
196
- ETL::Engine.logger.debug "checking record change type"
202
+ ETL::Engine.logger.debug "Checking record change type"
197
203
  if record
198
204
  if record.crc != crc.to_s
199
205
  # SCD Type 1: only the new row should be added
@@ -206,7 +212,7 @@ module ETL #:nodoc:
206
212
  q = "SELECT * FROM #{dimension_table} WHERE "
207
213
  q << natural_key.collect { |nk| "#{nk} = '#{row[nk]}'" }.join(" AND ")
208
214
  #puts "looking for original record"
209
- result = ActiveRecord::Base.connection.select_one(q)
215
+ result = ETL::ActiveRecord::Base.connection.select_one(q)
210
216
  if result
211
217
  #puts "Result: #{result.inspect}"
212
218
  original_record = ETL::Row[result.symbolize_keys!]
@@ -230,9 +236,12 @@ module ETL #:nodoc:
230
236
 
231
237
  q = "SELECT * FROM #{dimension_table} WHERE "
232
238
  q << natural_key.collect { |nk| "#{nk} = '#{row[nk]}'" }.join(" AND ")
233
- result = ActiveRecord::Base.connection.select_one(q)
239
+ result = ETL::ActiveRecord::Base.connection.select_one(q)
234
240
  if result
235
- buffer << ETL::Row[result.symbolize_keys!]
241
+ # This was necessary when truncating and then loading, however I
242
+ # am getting reluctant to have the ETL process do the truncation
243
+ # as part of the bulk load, favoring using a preprocessor instead.
244
+ # buffer << ETL::Row[result.symbolize_keys!]
236
245
  else
237
246
  # The record never made it into the database, so add the effective and end date
238
247
  # and add it into the bulk load file
@@ -262,8 +271,6 @@ module ETL #:nodoc:
262
271
  )
263
272
  end
264
273
  end
265
- rescue => e
266
- puts e
267
274
  end
268
275
 
269
276
  # Add any virtual fields to the row. Virtual rows will get their value
@@ -288,7 +295,11 @@ module ETL #:nodoc:
288
295
  when Proc
289
296
  row[key] = value.call(row)
290
297
  else
291
- row[key] = value
298
+ if value.is_a?(ETL::Generator::Generator)
299
+ row[key] = value.next
300
+ else
301
+ row[key] = value
302
+ end
292
303
  end
293
304
  end
294
305
  end
@@ -304,7 +315,7 @@ module ETL #:nodoc:
304
315
  when String, Symbol
305
316
  [configuration[:natural_key].to_sym]
306
317
  else
307
- [:id]
318
+ [] # no natural key defined
308
319
  end
309
320
  end
310
321
  end
@@ -30,7 +30,7 @@ module ETL #:nodoc:
30
30
  #
31
31
  # Mapping options:
32
32
  # * <tt>:order</tt>: The order of fields to write (REQUIRED)
33
- def initialize(control, configuration, mapping)
33
+ def initialize(control, configuration, mapping={})
34
34
  super
35
35
  @truncate = configuration[:truncate] ||= false
36
36
  @unique = configuration[:unique]
@@ -60,7 +60,7 @@ module ETL #:nodoc:
60
60
  values << "'#{row[name]}'" # TODO: this is probably not database agnostic
61
61
  end
62
62
  q = "INSERT INTO #{configuration[:table]} (#{names.join(',')}) VALUES (#{values.join(',')})"
63
- # ETL::Engine.logger.debug("Query: #{q}")
63
+ ETL::Engine.logger.debug("Executing insert: #{q}")
64
64
  conn.insert(q, "Insert row #{current_row}")
65
65
  @current_row += 1
66
66
  end
@@ -38,7 +38,7 @@ module ETL #:nodoc:
38
38
  #
39
39
  # Mapping options:
40
40
  # * <tt>:order</tt>: The order array
41
- def initialize(control, configuration, mapping)
41
+ def initialize(control, configuration, mapping={})
42
42
  super
43
43
  @file = File.join(File.dirname(control.file), configuration[:file])
44
44
  @append = configuration[:append] ||= false
@@ -81,6 +81,16 @@ module ETL #:nodoc:
81
81
  File.join(local_dir, "#{filename}.csv")
82
82
  end
83
83
 
84
+ # Get the last fully written local file
85
+ def last_local_file
86
+ File.join(local_directory, File.basename(last_local_file_trigger, '.trig'))
87
+ end
88
+
89
+ # Get the last local file trigger
90
+ def last_local_file_trigger
91
+ Dir.glob(File.join(local_directory, '*.trig')).last
92
+ end
93
+
84
94
  # Get the local trigger file that is used to indicate that the file has
85
95
  # been completely written
86
96
  def local_file_trigger(file)
@@ -65,6 +65,22 @@ module ETL #:nodoc:
65
65
  configuration[:order]
66
66
  end
67
67
 
68
+ # Return the column which is used in the where clause to identify
69
+ # new rows
70
+ def new_records_only
71
+ configuration[:new_records_only]
72
+ end
73
+
74
+ # Get the number of rows in the source
75
+ def count(use_cache=true)
76
+ return @count if @count && use_cache
77
+ if store_locally || read_locally
78
+ @count = count_locally
79
+ else
80
+ @count = connection.select_value(query.gsub(/SELECT .* FROM/, 'SELECT count(1) FROM'))
81
+ end
82
+ end
83
+
68
84
  # Get the list of columns to read. This is defined in the source
69
85
  # definition as either an Array or Hash
70
86
  def columns
@@ -86,7 +102,7 @@ module ETL #:nodoc:
86
102
  def each(&block)
87
103
  if read_locally # Read from the last stored source
88
104
  ETL::Engine.logger.debug "Reading from local cache"
89
- read_rows(local_file, &block)
105
+ read_rows(last_local_file, &block)
90
106
  else # Read from the original source
91
107
  if store_locally
92
108
  file = local_file
@@ -94,7 +110,8 @@ module ETL #:nodoc:
94
110
  read_rows(file, &block)
95
111
  else
96
112
  connection.select_all(query).each do |row|
97
- row = Row.new(row.symbolize_keys)
113
+ row = ETL::Row.new(row.symbolize_keys)
114
+ row.source = self
98
115
  yield row
99
116
  end
100
117
  end
@@ -110,6 +127,7 @@ module ETL #:nodoc:
110
127
  t = Benchmark.realtime do
111
128
  FasterCSV.open(file, :headers => true).each do |row|
112
129
  result_row = ETL::Row.new
130
+ result_row.source = self
113
131
  row.each do |header, field|
114
132
  result_row[header.to_sym] = field
115
133
  end
@@ -119,6 +137,12 @@ module ETL #:nodoc:
119
137
  ETL::Engine.average_rows_per_second = ETL::Engine.rows_read / t
120
138
  end
121
139
 
140
+ def count_locally
141
+ counter = 0
142
+ File.open(last_local_file, 'r').each { |line| counter += 1 }
143
+ counter
144
+ end
145
+
122
146
  # Write rows to the local cache
123
147
  def write_local(file)
124
148
  lines = 0
@@ -140,6 +164,16 @@ module ETL #:nodoc:
140
164
  return @query if @query
141
165
  q = "SELECT #{select} FROM #{configuration[:table]}"
142
166
  q << " #{join}" if join
167
+
168
+ if new_records_only
169
+ last_completed = ETL::Execution::Job.maximum('created_at',
170
+ :conditions => ['control_file = ? and completed_at is not null', control.file]
171
+ )
172
+ if last_completed
173
+ q << " WHERE #{new_records_only} > #{connection.quote(last_completed.to_s(:db))}"
174
+ end
175
+ end
176
+
143
177
  q << " ORDER BY #{order}" if order
144
178
  if ETL::Engine.limit || ETL::Engine.offset
145
179
  options = {}
@@ -147,7 +181,7 @@ module ETL #:nodoc:
147
181
  options[:offset] = ETL::Engine.offset if ETL::Engine.offset
148
182
  connection.add_limit_offset!(q, options)
149
183
  end
150
- #q << " LIMIT #{ETL::Engine.limit}" unless ETL::Engine.limit.nil?
184
+
151
185
  q = q.gsub(/\n/,' ')
152
186
  ETL::Engine.logger.info "Query: #{q}"
153
187
  @query = q
@@ -45,6 +45,7 @@ module ETL #:nodoc:
45
45
  # TODO skip rows if offset is defined
46
46
  # TODO stop processing if limit is reached
47
47
  row = ETL::Row[row]
48
+ row.source = self
48
49
  yield row
49
50
  end
50
51
  end
data/lib/etl/engine.rb CHANGED
@@ -20,7 +20,8 @@ module ETL #:nodoc:
20
20
  @read_locally = options[:read_locally]
21
21
  options[:config] ||= 'database.yml'
22
22
  database_configuration = YAML::load(ERB.new(IO.read(options[:config])).result + "\n")
23
- ActiveRecord::Base.configurations = database_configuration
23
+ ETL::ActiveRecord::Base.configurations = database_configuration
24
+ ActiveRecord::Base.configurations.merge!(ETL::ActiveRecord::Base.configurations)
24
25
  require 'etl/execution'
25
26
  ETL::Execution::Base.establish_connection :etl_execution
26
27
  ETL::Execution::Execution.migrate
@@ -1,7 +1,7 @@
1
1
  module ETL #:nodoc:
2
2
  module Execution #:nodoc:
3
3
  # Base class for ETL execution information
4
- class Base < ActiveRecord::Base
4
+ class Base < ETL::ActiveRecord::Base
5
5
  self.abstract_class = true
6
6
  establish_connection :etl_execution
7
7
  end
@@ -4,6 +4,15 @@ module ETL #:nodoc:
4
4
  module Generator #:nodoc:
5
5
  # Surrogate key generator.
6
6
  class SurrogateKeyGenerator < Generator
7
+ # Initialize the generator
8
+ def initialize(options={})
9
+ if options[:query]
10
+ @surrogate_key = ETL::ActiveRecord::Base.connection.select_value(options[:query])
11
+ @surrogate_key = 0 if @surrogate_key.blank?
12
+ @surrogate_key = @surrogate_key.to_i
13
+ end
14
+ end
15
+
7
16
  # Get the next surrogate key
8
17
  def next
9
18
  @surrogate_key ||= 0
@@ -41,7 +41,7 @@ module ETL #:nodoc:
41
41
 
42
42
  private
43
43
  def validate_row(row, line, file)
44
- ETL::Engine.logger.debug "validing line #{line} in file #{file}"
44
+ ETL::Engine.logger.debug "validating line #{line} in file #{file}"
45
45
  if row.length != fields.length
46
46
  raise_with_info( MismatchError,
47
47
  "The number of rows from the source (#{row.length}) does not match the number of rows in the definition (#{fields.length})",
@@ -30,7 +30,7 @@ module ETL #:nodoc:
30
30
  @columns = configuration[:columns]
31
31
 
32
32
  q = "SELECT COUNT(*) FROM #{table}"
33
- @should_check = ActiveRecord::Base.connection.select_value(q).to_i > 0
33
+ @should_check = ETL::ActiveRecord::Base.connection.select_value(q).to_i > 0
34
34
  end
35
35
 
36
36
  # Return true if the given key should be skipped
@@ -51,12 +51,12 @@ module ETL #:nodoc:
51
51
  # Process the row
52
52
  def process(row)
53
53
  return row unless should_check?
54
- connection = ActiveRecord::Base.connection
54
+ connection = ETL::ActiveRecord::Base.connection
55
55
  q = "SELECT * FROM #{table} WHERE "
56
56
  conditions = []
57
57
  row.each do |k,v|
58
58
  if columns.nil? || columns.include?(k.to_sym)
59
- conditions << "#{k} = #{connection.quote(v)}" unless skip?(k)
59
+ conditions << "#{k} = #{connection.quote(v)}" unless skip?(k.to_sym)
60
60
  end
61
61
  end
62
62
  q << conditions.join(" AND ")
@@ -8,7 +8,9 @@ module ETL #:nodoc:
8
8
  # Initialize the surrogate key generator
9
9
  def initialize(control, configuration)
10
10
  super
11
- @surrogate_key = ActiveRecord::Base.connection.select_value(configuration[:query])
11
+ if configuration[:query]
12
+ @surrogate_key = ETL::ActiveRecord::Base.connection.select_value(configuration[:query])
13
+ end
12
14
  #puts "initial surrogate key: #{@surrogate_key}"
13
15
  @surrogate_key = 0 if @surrogate_key.blank?
14
16
  @surrogate_key = @surrogate_key.to_i
data/lib/etl/row.rb CHANGED
@@ -3,6 +3,9 @@
3
3
  module ETL #:nodoc:
4
4
  # This class represents a single row currently passing through the ETL pipeline
5
5
  class Row < Hash
6
+ # Accessor for the originating source
7
+ attr_accessor :source
8
+
6
9
  # All change types
7
10
  CHANGE_TYPES = [:insert, :update, :delete]
8
11
 
data/lib/etl/version.rb CHANGED
@@ -1,8 +1,8 @@
1
1
  module ETL#:nodoc:
2
2
  module VERSION #:nodoc:
3
3
  MAJOR = 0
4
- MINOR = 7
5
- TINY = 2
4
+ MINOR = 8
5
+ TINY = 0
6
6
 
7
7
  STRING = [MAJOR, MINOR, TINY].join('.')
8
8
  end
metadata CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.9.0.10
3
3
  specification_version: 1
4
4
  name: activewarehouse-etl
5
5
  version: !ruby/object:Gem::Version
6
- version: 0.7.2
7
- date: 2007-04-08 00:00:00 -04:00
6
+ version: 0.8.0
7
+ date: 2007-04-12 00:00:00 -04:00
8
8
  summary: Pure Ruby ETL package.
9
9
  require_paths:
10
10
  - lib