activewarehouse-etl 0.8.1 → 0.8.2

Sign up to get free protection for your applications and to get access to all the features.
data/CHANGELOG CHANGED
@@ -125,4 +125,12 @@
125
125
  * Added :type configuration option to the source directive, allowing the source type to be
126
126
  explicitly specified. The source type can be a string or symbol (in which case the class will
127
127
  be constructed by appending Source to the type name), a class (which will be instantiated
128
- and passed the control, configuration and mapping) and finally an actual Source instance.
128
+ and passed the control, configuration and mapping) and finally an actual Source instance.
129
+
130
+ 0.8.2 -
131
+ * Fixed bug with premature destination closing.
132
+ * Added indexes to execution records table.
133
+ * Added a PrintRowProcessor.
134
+ * Added support for conditions and "group by" in the database source.
135
+ * Added after_initialize hook in Processor base class.
136
+ * Added examples directory.
data/README CHANGED
@@ -65,7 +65,7 @@ Command line options:
65
65
  * <tt>--read-locally</tt>: Read from the local cache (skip source extraction)
66
66
 
67
67
  == Control File Examples
68
- Control file examples can be found in the test directory.
68
+ Control file examples can be found in the examples directory.
69
69
 
70
70
  == Feedback
71
71
  This is a work in progress. Comments should be made on the
data/Rakefile CHANGED
@@ -47,6 +47,7 @@ PKG_FILES = FileList[
47
47
  'bin/**/*',
48
48
  'doc/**/*',
49
49
  'lib/**/*',
50
+ 'examples/**/*',
50
51
  ] - [ 'test' ]
51
52
 
52
53
  spec = Gem::Specification.new do |s|
@@ -0,0 +1,6 @@
1
+ etl_execution:
2
+ adapter: mysql
3
+ username: root
4
+ host: localhost
5
+ database: etl_execution
6
+ encoding: utf8
@@ -263,12 +263,14 @@ module ETL #:nodoc:
263
263
 
264
264
  # Record the record
265
265
  if ETL::Engine.job # only record the execution if there is a job
266
- ETL::Execution::Record.create!(
267
- :control_file => control.file,
268
- :natural_key => nk,
269
- :crc => crc,
270
- :job_id => ETL::Engine.job.id
271
- )
266
+ ETL::Execution::Record.time_spent += Benchmark.realtime do
267
+ ETL::Execution::Record.create!(
268
+ :control_file => control.file,
269
+ :natural_key => nk,
270
+ :crc => crc,
271
+ :job_id => ETL::Engine.job.id
272
+ )
273
+ end
272
274
  end
273
275
  end
274
276
  end
@@ -60,6 +60,11 @@ module ETL #:nodoc:
60
60
  configuration[:select] || '*'
61
61
  end
62
62
 
63
# GROUP BY expression for the generated query, read from the :group
# configuration option. Returns whatever the configuration holds for
# that key (presumably nil when the option was not supplied).
def group
  configuration[:group]
end
67
+
63
68
  # Get the order for the query, defaults to nil
64
69
  def order
65
70
  configuration[:order]
@@ -165,16 +170,24 @@ module ETL #:nodoc:
165
170
  q = "SELECT #{select} FROM #{configuration[:table]}"
166
171
  q << " #{join}" if join
167
172
 
173
+ conditions = []
168
174
  if new_records_only
169
175
  last_completed = ETL::Execution::Job.maximum('created_at',
170
176
  :conditions => ['control_file = ? and completed_at is not null', control.file]
171
177
  )
172
178
  if last_completed
173
- q << " WHERE #{new_records_only} > #{connection.quote(last_completed.to_s(:db))}"
179
+ conditions << "#{new_records_only} > #{connection.quote(last_completed.to_s(:db))}"
174
180
  end
175
181
  end
176
182
 
183
+ conditions << configuration[:conditions] if configuration[:conditions]
184
+ if conditions.length > 0
185
+ q << " WHERE #{conditions.join(' AND ')}"
186
+ end
187
+
188
+ q << " GROUP BY #{group}" if group
177
189
  q << " ORDER BY #{order}" if order
190
+
178
191
  if ETL::Engine.limit || ETL::Engine.offset
179
192
  options = {}
180
193
  options[:limit] = ETL::Engine.limit if ETL::Engine.limit
@@ -1,9 +1,8 @@
1
- module ETL
2
- module Control
1
+ module ETL #:nodoc:
2
+ module Control #:nodoc:
3
+ # Use an Enumerable as a source
3
4
  class EnumerableSource < ETL::Control::Source
4
- def initialize(control, configuration, definition)
5
- super
6
- end
5
+ # Iterate through the enumerable
7
6
  def each(&block)
8
7
  configuration[:enumerable].each(&block)
9
8
  end
@@ -237,7 +237,11 @@ module ETL #:nodoc:
237
237
  errors << msg
238
238
  Engine.logger.error(msg)
239
239
  e.backtrace.each { |line| Engine.logger.error(line) }
240
- exceeded_error_threshold?(control) ? break : next
240
+ begin
241
+ exceeded_error_threshold?(control) ? break : next
242
+ rescue => inner_error
243
+ puts inner_error
244
+ end
241
245
  end
242
246
  end
243
247
 
@@ -286,9 +290,10 @@ module ETL #:nodoc:
286
290
  return
287
291
  end
288
292
 
289
- destinations.each do |destination|
290
- destination.close
291
- end
293
+ end
294
+
295
+ destinations.each do |destination|
296
+ destination.close
292
297
  end
293
298
 
294
299
  say_on_own_line "Executing post processes"
@@ -310,6 +315,8 @@ module ETL #:nodoc:
310
315
  say "Avg before_writes: #{Engine.rows_read/benchmarks[:before_writes]} rows/sec" if benchmarks[:before_writes] > 0
311
316
  say "Avg transforms: #{Engine.rows_read/benchmarks[:transforms]} rows/sec" if benchmarks[:transforms] > 0
312
317
  say "Avg writes: #{Engine.rows_read/benchmarks[:writes]} rows/sec" if benchmarks[:writes] > 0
318
+
319
+ say "Avg time writing execution records: #{ETL::Execution::Record.average_time_spent}"
313
320
 
314
321
  # ETL::Transform::Transform.benchmarks.each do |klass, t|
315
322
  # say "Avg #{klass}: #{Engine.rows_read/t} rows/sec"
@@ -24,7 +24,7 @@ module ETL #:nodoc:
24
24
 
25
25
  # Get the final target version number
26
26
  def target
27
- 1
27
+ 2
28
28
  end
29
29
 
30
30
  private
@@ -43,6 +43,13 @@ module ETL #:nodoc:
43
43
  end
44
44
  update_schema_info(1)
45
45
  end
46
+
47
# Schema version 2: index the records table on each of its lookup columns,
# then record the new schema version.
def migration_2 #:nodoc:
  [:control_file, :natural_key, :job_id].each do |column|
    connection.add_index :records, column
  end
  update_schema_info(2)
end
46
53
 
47
54
  # Update the schema info table, setting the version value
48
55
  def update_schema_info(version)
@@ -3,6 +3,16 @@ module ETL #:nodoc:
3
3
  # Represents a single record
4
4
  class Record < ETL::Execution::Base
5
5
  belongs_to :table
6
class << self
  # Only the writer is generated; the reader is defined by hand below so
  # that it can lazily default to 0 (generating both with attr_accessor
  # and then redefining the reader triggers a "method redefined" warning).
  attr_writer :time_spent

  # Accumulated time spent, starting at 0 until something is added to it.
  def time_spent
    @time_spent ||= 0
  end

  # Rows read by the engine divided by the accumulated time spent.
  # Note that, despite the name, this is a rate (rows per unit of time),
  # not an average duration. Returns 0 when no time has been recorded,
  # which also avoids dividing by zero.
  def average_time_spent
    return 0 if time_spent == 0
    ETL::Engine.rows_read / time_spent
  end
end
6
16
  end
7
17
  end
8
18
  end
@@ -25,7 +25,7 @@ module ETL #:nodoc:
25
25
  # key
26
26
  def initialize(control, configuration)
27
27
  super
28
- @skip = configuration[:skip]
28
+ @skip = configuration[:skip] || []
29
29
  @table = configuration[:table]
30
30
  @columns = configuration[:columns]
31
31
 
@@ -0,0 +1,12 @@
1
module ETL #:nodoc:
  module Processor #:nodoc:
    # Row processor meant for debugging: it prints every row it is given.
    class PrintRowProcessor < ETL::Processor::RowProcessor
      # Print the inspected form of +row+ and return the row itself so it
      # continues through the pipeline unchanged.
      def process(row)
        inspected = row.inspect
        puts inspected
        row
      end
    end
  end
end
@@ -5,6 +5,7 @@ module ETL #:nodoc:
5
5
  def initialize(control, configuration)
6
6
  @control = control
7
7
  @configuration = configuration
8
+ after_initialize if respond_to?(:after_initialize)
8
9
  end
9
10
  protected
10
11
  # Get the control object
@@ -27,7 +27,7 @@ module ETL #:nodoc:
27
27
  raise ResolverError, "Foreign key for #{value} not found and no resolver specified" unless resolver
28
28
  raise ResolverError, "Resolver does not appear to respond to resolve method" unless resolver.respond_to?(:resolve)
29
29
  fk = resolver.resolve(value)
30
- raise ResolverError, "Unable to resolve #{value} to foreign key" unless fk
30
+ raise ResolverError, "Unable to resolve #{value} to foreign key for #{name} in row #{ETL::Engine.rows_read}" unless fk
31
31
  @collection[value] = fk
32
32
  end
33
33
  fk
@@ -61,4 +61,16 @@ class ActiveRecordResolver
61
61
  rec = ar_class.__send__(find_method, value)
62
62
  rec.nil? ? nil : rec.id
63
63
  end
64
+ end
65
+
66
# Resolves a value to a foreign key by running a direct SQL lookup of the
# +id+ column in a table.
class SQLResolver
  # table::      name of the table to query
  # field::      name of the column compared against the value
  # connection:: object used to run the query; when nil, the
  #              ActiveRecord::Base connection is used on first resolve
  def initialize(table, field, connection=nil)
    @table = table
    @field = field
    @connection = connection
  end

  # Return the id of the row whose +field+ equals +value+, as returned by
  # the connection's select_value.
  #
  # The value is quoted through the connection. The table and field names
  # are interpolated as-is, so they must come from trusted configuration.
  def resolve(value)
    @connection ||= ActiveRecord::Base.connection
    # Use the instance variables: no +table+/+field+ methods are defined on
    # this class, so bare references raise NameError.
    @connection.select_value(
      "SELECT id FROM #{@table} WHERE #{@field} = #{@connection.quote(value)}"
    )
  end
end
@@ -2,7 +2,7 @@ module ETL#:nodoc:
2
2
  module VERSION #:nodoc:
3
3
  MAJOR = 0
4
4
  MINOR = 8
5
- TINY = 1
5
+ TINY = 2
6
6
 
7
7
  STRING = [MAJOR, MINOR, TINY].join('.')
8
8
  end
metadata CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.9.0.10
3
3
  specification_version: 1
4
4
  name: activewarehouse-etl
5
5
  version: !ruby/object:Gem::Version
6
- version: 0.8.1
7
- date: 2007-04-12 00:00:00 -04:00
6
+ version: 0.8.2
7
+ date: 2007-04-15 00:00:00 -04:00
8
8
  summary: Pure Ruby ETL package.
9
9
  require_paths:
10
10
  - lib
@@ -83,6 +83,7 @@ files:
83
83
  - lib/etl/processor/check_unique_processor.rb
84
84
  - lib/etl/processor/copy_field_processor.rb
85
85
  - lib/etl/processor/hierarchy_exploder_processor.rb
86
+ - lib/etl/processor/print_row_processor.rb
86
87
  - lib/etl/processor/processor.rb
87
88
  - lib/etl/processor/rename_processor.rb
88
89
  - lib/etl/processor/row_processor.rb
@@ -104,6 +105,7 @@ files:
104
105
  - lib/etl/transform/transform.rb
105
106
  - lib/etl/transform/trim_transform.rb
106
107
  - lib/etl/transform/type_transform.rb
108
+ - examples/database.example.yml
107
109
  test_files: []
108
110
 
109
111
  rdoc_options: