activewarehouse-etl 0.8.1 → 0.8.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/CHANGELOG CHANGED
@@ -125,4 +125,12 @@
125
125
  * Added :type configuration option to the source directive, allowing the source type to be
126
126
  explicitly specified. The source type can be a string or symbol (in which case the class will
127
127
  be constructed by appending Source to the type name), a class (which will be instantiated
128
- and passed the control, configuration and mapping) and finally an actual Source instance.
128
+ and passed the control, configuration and mapping) and finally an actual Source instance.
129
+
130
+ 0.8.2 -
131
+ * Fixed bug with premature destination closing.
132
+ * Added indexes to execution records table.
133
+ * Added a PrintRowProcessor.
134
+ * Added support for conditions and "group by" in the database source.
135
+ * Added after_initialize hook in Processor base class.
136
+ * Added examples directory.
data/README CHANGED
@@ -65,7 +65,7 @@ Command line options:
65
65
  * <tt>--read-locally</tt>: Read from the local cache (skip source extraction)
66
66
 
67
67
  == Control File Examples
68
- Control file examples can be found in the test directory.
68
+ Control file examples can be found in the examples directory.
69
69
 
70
70
  == Feedback
71
71
  This is a work in progress. Comments should be made on the
data/Rakefile CHANGED
@@ -47,6 +47,7 @@ PKG_FILES = FileList[
47
47
  'bin/**/*',
48
48
  'doc/**/*',
49
49
  'lib/**/*',
50
+ 'examples/**/*',
50
51
  ] - [ 'test' ]
51
52
 
52
53
  spec = Gem::Specification.new do |s|
@@ -0,0 +1,6 @@
1
+ etl_execution:
2
+ adapter: mysql
3
+ username: root
4
+ host: localhost
5
+ database: etl_execution
6
+ encoding: utf8
@@ -263,12 +263,14 @@ module ETL #:nodoc:
263
263
 
264
264
  # Save an execution record for this row
265
265
  if ETL::Engine.job # only record the execution if there is a job
266
- ETL::Execution::Record.create!(
267
- :control_file => control.file,
268
- :natural_key => nk,
269
- :crc => crc,
270
- :job_id => ETL::Engine.job.id
271
- )
266
+ ETL::Execution::Record.time_spent += Benchmark.realtime do
267
+ ETL::Execution::Record.create!(
268
+ :control_file => control.file,
269
+ :natural_key => nk,
270
+ :crc => crc,
271
+ :job_id => ETL::Engine.job.id
272
+ )
273
+ end
272
274
  end
273
275
  end
274
276
  end
@@ -60,6 +60,11 @@ module ETL #:nodoc:
60
60
  configuration[:select] || '*'
61
61
  end
62
62
 
63
+ # Get the GROUP BY clause for the query (the :group configuration value), defaults to nil
64
+ def group
65
+ configuration[:group]
66
+ end
67
+
63
68
  # Get the order for the query, defaults to nil
64
69
  def order
65
70
  configuration[:order]
@@ -165,16 +170,24 @@ module ETL #:nodoc:
165
170
  q = "SELECT #{select} FROM #{configuration[:table]}"
166
171
  q << " #{join}" if join
167
172
 
173
+ conditions = []
168
174
  if new_records_only
169
175
  last_completed = ETL::Execution::Job.maximum('created_at',
170
176
  :conditions => ['control_file = ? and completed_at is not null', control.file]
171
177
  )
172
178
  if last_completed
173
- q << " WHERE #{new_records_only} > #{connection.quote(last_completed.to_s(:db))}"
179
+ conditions << "#{new_records_only} > #{connection.quote(last_completed.to_s(:db))}"
174
180
  end
175
181
  end
176
182
 
183
+ conditions << configuration[:conditions] if configuration[:conditions]
184
+ if conditions.length > 0
185
+ q << " WHERE #{conditions.join(' AND ')}"
186
+ end
187
+
188
+ q << " GROUP BY #{group}" if group
177
189
  q << " ORDER BY #{order}" if order
190
+
178
191
  if ETL::Engine.limit || ETL::Engine.offset
179
192
  options = {}
180
193
  options[:limit] = ETL::Engine.limit if ETL::Engine.limit
@@ -1,9 +1,8 @@
1
- module ETL
2
- module Control
1
+ module ETL #:nodoc:
2
+ module Control #:nodoc:
3
+ # Use an Enumerable as a source
3
4
  class EnumerableSource < ETL::Control::Source
4
- def initialize(control, configuration, definition)
5
- super
6
- end
5
+ # Iterate through the enumerable
7
6
  def each(&block)
8
7
  configuration[:enumerable].each(&block)
9
8
  end
@@ -237,7 +237,11 @@ module ETL #:nodoc:
237
237
  errors << msg
238
238
  Engine.logger.error(msg)
239
239
  e.backtrace.each { |line| Engine.logger.error(line) }
240
- exceeded_error_threshold?(control) ? break : next
240
+ begin
241
+ exceeded_error_threshold?(control) ? break : next
242
+ rescue => inner_error
243
+ puts inner_error
244
+ end
241
245
  end
242
246
  end
243
247
 
@@ -286,9 +290,10 @@ module ETL #:nodoc:
286
290
  return
287
291
  end
288
292
 
289
- destinations.each do |destination|
290
- destination.close
291
- end
293
+ end
294
+
295
+ destinations.each do |destination|
296
+ destination.close
292
297
  end
293
298
 
294
299
  say_on_own_line "Executing post processes"
@@ -310,6 +315,8 @@ module ETL #:nodoc:
310
315
  say "Avg before_writes: #{Engine.rows_read/benchmarks[:before_writes]} rows/sec" if benchmarks[:before_writes] > 0
311
316
  say "Avg transforms: #{Engine.rows_read/benchmarks[:transforms]} rows/sec" if benchmarks[:transforms] > 0
312
317
  say "Avg writes: #{Engine.rows_read/benchmarks[:writes]} rows/sec" if benchmarks[:writes] > 0
318
+
319
+ say "Avg time writing execution records: #{ETL::Execution::Record.average_time_spent}"
313
320
 
314
321
  # ETL::Transform::Transform.benchmarks.each do |klass, t|
315
322
  # say "Avg #{klass}: #{Engine.rows_read/t} rows/sec"
@@ -24,7 +24,7 @@ module ETL #:nodoc:
24
24
 
25
25
  # Get the final target version number
26
26
  def target
27
- 1
27
+ 2
28
28
  end
29
29
 
30
30
  private
@@ -43,6 +43,13 @@ module ETL #:nodoc:
43
43
  end
44
44
  update_schema_info(1)
45
45
  end
46
+
47
# Second migration: index the records table on each of the columns listed
# below, then set the schema version to 2.
def migration_2 #:nodoc:
  [:control_file, :natural_key, :job_id].each do |column|
    connection.add_index :records, column
  end
  update_schema_info(2)
end
46
53
 
47
54
  # Update the schema info table, setting the version value
48
55
  def update_schema_info(version)
@@ -3,6 +3,16 @@ module ETL #:nodoc:
3
3
  # Represents a single record
4
4
  class Record < ETL::Execution::Base
5
5
  belongs_to :table
6
class << self
  # Writer for the accumulated time; callers add to it with +=.
  # Only the writer is generated here because the reader below supplies
  # the default value (attr_accessor would define a reader that is
  # immediately redefined).
  attr_writer :time_spent

  # Accumulated time, starting at 0 until something has been added
  def time_spent
    @time_spent ||= 0
  end

  # ETL::Engine.rows_read divided by the accumulated time, i.e. a rate of
  # rows per unit of time spent. Returns 0 when no time has been
  # accumulated, which avoids dividing by zero.
  def average_time_spent
    return 0 if time_spent == 0
    ETL::Engine.rows_read / time_spent
  end
end
6
16
  end
7
17
  end
8
18
  end
@@ -25,7 +25,7 @@ module ETL #:nodoc:
25
25
  # key
26
26
  def initialize(control, configuration)
27
27
  super
28
- @skip = configuration[:skip]
28
+ @skip = configuration[:skip] || []
29
29
  @table = configuration[:table]
30
30
  @columns = configuration[:columns]
31
31
 
@@ -0,0 +1,12 @@
1
module ETL #:nodoc:
  module Processor #:nodoc:
    # Row processor meant for debugging: it prints every row it is given
    class PrintRowProcessor < ETL::Processor::RowProcessor
      # Print the inspected form of the row and return the row itself
      def process(row)
        dump = row.inspect
        puts dump
        row
      end
    end
  end
end
@@ -5,6 +5,7 @@ module ETL #:nodoc:
5
5
  def initialize(control, configuration)
6
6
  @control = control
7
7
  @configuration = configuration
8
+ after_initialize if respond_to?(:after_initialize)
8
9
  end
9
10
  protected
10
11
  # Get the control object
@@ -27,7 +27,7 @@ module ETL #:nodoc:
27
27
  raise ResolverError, "Foreign key for #{value} not found and no resolver specified" unless resolver
28
28
  raise ResolverError, "Resolver does not appear to respond to resolve method" unless resolver.respond_to?(:resolve)
29
29
  fk = resolver.resolve(value)
30
- raise ResolverError, "Unable to resolve #{value} to foreign key" unless fk
30
+ raise ResolverError, "Unable to resolve #{value} to foreign key for #{name} in row #{ETL::Engine.rows_read}" unless fk
31
31
  @collection[value] = fk
32
32
  end
33
33
  fk
@@ -61,4 +61,16 @@ class ActiveRecordResolver
61
61
  rec = ar_class.__send__(find_method, value)
62
62
  rec.nil? ? nil : rec.id
63
63
  end
64
+ end
65
+
66
# Resolver that looks up a foreign key by querying a table directly with SQL
class SQLResolver
  # Initialize the SQL resolver.
  # * <tt>table</tt>: name of the table to select the id from
  # * <tt>field</tt>: name of the column compared against the value to resolve
  # * <tt>connection</tt>: optional connection to query with; when nil,
  #   ActiveRecord::Base.connection is looked up on the first resolve call
  def initialize(table, field, connection=nil)
    @table = table
    @field = field
    @connection = connection
  end

  # Resolve the value by selecting the id of the row whose field equals it.
  # Returns whatever the connection's select_value returns for that query.
  def resolve(value)
    conn = @connection ||= ActiveRecord::Base.connection
    # Use the instance variables: this class defines no table/field readers,
    # so the bare names would raise NameError.
    conn.select_value("SELECT id FROM #{@table} WHERE #{@field} = #{conn.quote(value)}")
  end
end
@@ -2,7 +2,7 @@ module ETL#:nodoc:
2
2
  module VERSION #:nodoc:
3
3
  MAJOR = 0
4
4
  MINOR = 8
5
- TINY = 1
5
+ TINY = 2
6
6
 
7
7
  STRING = [MAJOR, MINOR, TINY].join('.')
8
8
  end
metadata CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.9.0.10
3
3
  specification_version: 1
4
4
  name: activewarehouse-etl
5
5
  version: !ruby/object:Gem::Version
6
- version: 0.8.1
7
- date: 2007-04-12 00:00:00 -04:00
6
+ version: 0.8.2
7
+ date: 2007-04-15 00:00:00 -04:00
8
8
  summary: Pure Ruby ETL package.
9
9
  require_paths:
10
10
  - lib
@@ -83,6 +83,7 @@ files:
83
83
  - lib/etl/processor/check_unique_processor.rb
84
84
  - lib/etl/processor/copy_field_processor.rb
85
85
  - lib/etl/processor/hierarchy_exploder_processor.rb
86
+ - lib/etl/processor/print_row_processor.rb
86
87
  - lib/etl/processor/processor.rb
87
88
  - lib/etl/processor/rename_processor.rb
88
89
  - lib/etl/processor/row_processor.rb
@@ -104,6 +105,7 @@ files:
104
105
  - lib/etl/transform/transform.rb
105
106
  - lib/etl/transform/trim_transform.rb
106
107
  - lib/etl/transform/type_transform.rb
108
+ - examples/database.example.yml
107
109
  test_files: []
108
110
 
109
111
  rdoc_options: