activewarehouse-etl 0.4.0 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/CHANGELOG CHANGED
@@ -1,18 +1,30 @@
  0.1.0 - Dec 6, 2006
- * Initial release
-
+ * Initial release
+
  0.2.0 - Dec 7, 2006
- * Added an XML parser for source parsing
- * Added support for compound key constraints in destinations via the :unique => [] option
- * Added ability to declare explicit columns in bulk import
- * Added support for generators in destinations
- * Added a SurrogateKeyGenerator for cases where the database doesn't support auto generated surrogate keys
-
+ * Added an XML parser for source parsing
+ * Added support for compound key constraints in destinations via the :unique => [] option
+ * Added ability to declare explicit columns in bulk import
+ * Added support for generators in destinations
+ * Added a SurrogateKeyGenerator for cases where the database doesn't support auto generated surrogate keys
+
  0.3.0 - Dec 19, 2006
- * Added support for calculated values in virtual fields with Proc
-
+ * Added support for calculated values in virtual fields with Proc
+
  0.4.0 - Jan 11, 2006
- * Added :skip_lines option to file source configurations, which can be used to skip the first n lines in the source data file
- * Added better error handling in delimited parser - an error is now raised if the expected and actual field lengths do not match
- * Added :truncate option for database destination. Set to true to truncate before importing data.
- * Added support for :unique => [] option and virtual fields for the database destination
+ * Added :skip_lines option to file source configurations, which can be used to skip the first n lines in the source data file
+ * Added better error handling in delimited parser - an error is now raised if the expected and actual field lengths do not match
+ * Added :truncate option for database destination. Set to true to truncate before importing data.
+ * Added support for :unique => [] option and virtual fields for the database destination
+
+ 0.5.0 -
+ * Changed require_gem to gem and added alias to allow for older versions of rubygems
+ * Added support for Hash in the source configuration where :name => :parser_name defines the parser to use and
+ :options => {} defines options to pass to the parser
+ * Added support for passing a custom Parser class in the source configuration
+ * Removed the need to include Enumerable in each parser implementation
+ * Added new date_to_string and string_to_date transformers
+ * Implemented foreign_key_lookup transform including an ActiveRecordResolver
+ * Added real time activity logging which is called when the etl bin script is invoked
+ * Improved error handling
+ * Default logger level is now WARN
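
The 0.5.0 entries above introduce a Hash form for choosing a source parser. For illustration, a hypothetical control-file excerpt using it might look like the sketch below; the :parser key, the :delimited parser name, and the file and field names are assumptions for the example, not taken from this diff.

    # Sketch only: a file source whose parser is selected by name, with
    # options passed through to that parser (0.5.0 Hash form).
    source :in, {
      :file => 'people.txt',
      :parser => {
        :name => :delimited,
        :options => { :skip_lines => 1 }
      }
    }, [:first_name, :last_name]

    # Per the changelog, a custom Parser class can be given instead of a name:
    # source :in, { :file => 'people.xml', :parser => MyCustomParser }, [:first_name, :last_name]
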
data/TODO CHANGED
@@ -2,8 +2,10 @@ TODO
 
  * Add build-in support for audit_dimension
  * Do not rerun the processing if it isn't needed, i.e. the source and control files have not been modified (allow forced override)
- * Add built-in FK lookup
  * Provide greater control in error handling
  ** Allow a error threshold
  ** Don't die completely if a parse error, just stop processing that specific file if error threshold is reached
- ** Allow mismatch row length error in delimited parser to be ignored
+ ** Allow mismatch row length error in delimited parser to be ignored
+ * Improve error messages throughout, but especially in problems with the control files
+ * Add support for determining if a row should be added, updated or removed vs. just blindly inserting
+ * Add support for paritioned views during the insert process. Use specifiable columns as the trigger columns for determining the data output destination.
data/lib/etl.rb CHANGED
@@ -30,17 +30,22 @@ unless defined?(Logger)
  end
 
  require 'rubygems'
+
+ unless Kernel.respond_to?(:gem)
+ Kernel.send :alias_method, :gem, :require_gem
+ end
+
  unless defined?(ActiveSupport)
- require_gem 'activesupport'
+ gem 'activesupport'
  require 'active_support'
  end
 
  unless defined?(ActiveRecord)
- require_gem 'activerecord'
+ gem 'activerecord'
  require 'active_record'
  end
 
- require_gem 'fastercsv'
+ gem 'fastercsv'
  require 'faster_csv'
 
  $:.unshift(File.dirname(__FILE__))
@@ -64,4 +69,6 @@ module ETL #:nodoc:
  end
  class MismatchError < ETLError #:nodoc:
  end
+ class ResolverError < ETLError #:nodoc:
+ end
  end
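
The Kernel#gem shim added above exists because newer RubyGems releases replaced require_gem with gem; on an older RubyGems that only defines require_gem, a bare call to gem 'activesupport' would raise NoMethodError. A minimal standalone sketch of the same compatibility pattern, using nothing beyond what the diff itself shows:

    require 'rubygems'

    # Older RubyGems only defines Kernel#require_gem; newer ones define Kernel#gem.
    # Alias the old name to the new one so later code can call `gem` unconditionally.
    unless Kernel.respond_to?(:gem)
      Kernel.send :alias_method, :gem, :require_gem
    end

    gem 'activerecord'
    require 'active_record'
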
@@ -1,7 +1,7 @@
  require 'active_record/connection_adapters/abstract_adapter'
 
- module ActiveRecord
- module ConnectionAdapters
+ module ActiveRecord #:nodoc:
+ module ConnectionAdapters #:nodoc:
  class MysqlAdapter < AbstractAdapter
  # Execute a truncate statement on the table. Note that in MySQL a truncate will *NOT* reset
  # the auto_increment
@@ -34,12 +34,11 @@ if ARGV.length < 1
  else
  puts "Starting ETL process"
 
- t = Benchmark.realtime do
- ARGV.each do |f|
- puts "Processing #{f}"
- ETL::Engine.process(f)
- end
+ ARGV.each do |f|
+ puts "Processing #{f}"
+ ETL::Engine.realtime_activity = true
+ ETL::Engine.process(f)
  end
-
- puts "ETL process complete in #{sprintf('%.3f', t)} seconds"
+
+ puts "ETL process complete"
  end
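
The bin-script change above enables the real-time activity logging mentioned in the 0.5.0 changelog before processing each control file. The equivalent programmatic calls, as a minimal sketch (the control-file path is invented for the example):

    require 'etl'

    # Mirror what the etl bin script now does for each control file on ARGV:
    # turn on real-time activity output, then run the ETL process for the file.
    ETL::Engine.realtime_activity = true
    ETL::Engine.process('my_control.ctl')   # hypothetical control file path
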
@@ -1,40 +1,23 @@
  module ETL #:nodoc:
  module Control #:nodoc:
- # Object representation of a control file
- class Control
- attr_reader :file
-
+ # The Context is passed to eval.
+ class Context
  class << self
- # Parse a control file and return a Control instance
- def parse(control_file)
- control_file = control_file.path if control_file.instance_of?(File)
- # logger.debug "Parsing control file #{control_file.path}"
- control = ETL::Control::Control.new(control_file)
- # TODO: better handling of parser errors. Return the line in the control file where the error occurs.
- eval(IO.readlines(control_file).join("\n"), control.get_binding)
- control.validate
- control
- end
-
- def resolve(control)
- case control
- when String
- ETL::Control::Control.parse(File.new(control))
- when File
- ETL::Control::Control.parse(control)
- when ETL::Control::Control
- control
- else
- raise ControlError, "Control must be a String, File or Control object"
- end
+ # Create a Context instance
+ def create(control)
+ Context.new(control).get_binding
  end
  end
 
- def initialize(file)
- @file = file
+ def initialize(control)
+ @control = control
+ end
+
+ def file
+ @control.file
  end
 
- # Define a source
+ # Define a source.
  def source(name, configuration={}, definition={})
  source_types = [:file, :db]
  source_types.each do |source_type|
@@ -47,7 +30,7 @@ module ETL #:nodoc:
 
  # Get the defined source
  def sources
- @sources ||= []
+ @control.sources
  end
 
  # Define a destination
@@ -62,14 +45,19 @@ module ETL #:nodoc:
 
  # Get the defined destinations
  def destinations
- @destinations ||= []
+ @control.destinations
  end
 
  def transform(name, transformer=nil, configuration={}, &block)
  transforms[name] ||= []
  if transformer
- transform_class = ETL::Transform.const_get("#{transformer.to_s.classify}Transform")
- transforms[name] << transform_class.new(self, configuration)
+ class_name = "#{transformer.to_s.classify}Transform"
+ begin
+ transform_class = ETL::Transform.const_get(class_name)
+ transforms[name] << transform_class.new(self, configuration)
+ rescue NameError => e
+ raise ControlError, "Unable to find transformer #{class_name}: #{e}"
+ end
  elsif block_given?
  transforms[name] << block
  else
@@ -77,32 +65,123 @@ module ETL #:nodoc:
  end
  end
 
- def get_transform(name)
- transforms[name] ||= []
+ def transforms
+ @control.transforms
  end
 
  def pre_process(name, configuration={})
- processor_class = ETL::Processor.const_get("#{name.to_s.classify}Processor")
- pre_processors << processor_class.new(self, configuration)
+ class_name = "#{name.to_s.classify}Processor"
+ begin
+ processor_class = ETL::Processor.const_get(class_name)
+ pre_processors << processor_class.new(self, configuration)
+ rescue NameError
+ raise ControlError, "Unable to find preprocessor #{class_name}"
+ end
  end
 
  def pre_processors
- @pre_processors ||= []
+ @control.pre_processors
  end
 
  def post_process(name, configuration={})
- processor_class = ETL::Processor.const_get("#{name.to_s.classify}Processor")
- post_processors << processor_class.new(self, configuration)
+ class_name = "#{name.to_s.classify}Processor"
+ begin
+ processor_class = ETL::Processor.const_get(class_name)
+ post_processors << processor_class.new(self, configuration)
+ rescue NameError
+ raise ControlError, "Unable to find postprocessor #{class_name}"
+ end
  end
 
  def post_processors
- @post_processors ||= []
+ @control.post_processors
  end
 
  def get_binding
  binding
  end
 
+ protected
+ # Get an array of supported source types
+ def source_types
+ [:file, :database]
+ end
+
+ # Get an array of supported destination types
+ def destination_types
+ [:file, :database]
+ end
+
+ end
+
+ # Object representation of a control file
+ class Control
+ # The File object
+ attr_reader :file
+
+ class << self
+ # Parse a control file and return a Control instance
+ def parse(control_file)
+ control_file = control_file.path if control_file.instance_of?(File)
+ # logger.debug "Parsing control file #{control_file.path}"
+ control = ETL::Control::Control.new(control_file)
+ # TODO: better handling of parser errors. Return the line in the control file where the error occurs.
+ eval(IO.readlines(control_file).join("\n"), Context.create(control), control_file)
+ control.validate
+ control
+ end
+
+ # Resolve the given object to an ETL::Control::Control instance. Acceptable arguments
+ # are:
+ # * The path to a control file as a String
+ # * A File object referencing the control file
+ # * The ETL::Control::Control object (which will just be returned)
+ #
+ # Raises a ControlError if any other type is given
+ def resolve(control)
+ case control
+ when String
+ ETL::Control::Control.parse(File.new(control))
+ when File
+ ETL::Control::Control.parse(control)
+ when ETL::Control::Control
+ control
+ else
+ raise ControlError, "Control must be a String, File or Control object"
+ end
+ end
+ end
+
+ # Initialize the instance with the given File object
+ def initialize(file)
+ @file = file
+ end
+
+ # Get the defined source
+ def sources
+ @sources ||= []
+ end
+
+ # Get the defined destinations
+ def destinations
+ @destinations ||= []
+ end
+
+ # Get the transforms with the specified name
+ def transform(name)
+ transforms[name] ||= []
+ end
+
+ # Get an Array of preprocessors
+ def pre_processors
+ @pre_processors ||= []
+ end
+
+ # Get an Array of post processors
+ def post_processors
+ @post_processors ||= []
+ end
+
  # Get a map of all transforms for this control
  def transforms
  @transforms ||= {}
@@ -118,17 +197,6 @@ module ETL #:nodoc:
  end
  end
 
- protected
- # Get an array of supported source types
- def source_types
- [:file, :database]
- end
-
- # Get an array of supported destination types
- def destination_types
- [:file, :database]
- end
-
  end
  end
  end
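
Control files are plain Ruby evaluated against the binding returned by Context.create, so the DSL methods defined above (source, destination, transform, pre_process, post_process) are what a control file calls. A hypothetical control file is sketched below for illustration; every file name, field name, option key, and block signature shown is an assumption, not taken from this diff.

    # people.ctl -- sketch of a control file evaluated via Context.create(control)

    source :in, {
      :file => 'people.txt',                               # hypothetical input file
      :parser => { :name => :delimited, :options => {} }   # 0.5.0 Hash parser form
    }, [:first_name, :last_name, :birth_date]

    destination :out, {
      :file => 'people_out.txt'                            # hypothetical output file
    }, {
      :order => [:first_name, :last_name, :birth_date]     # assumed mapping key
    }

    # A named transform resolves to ETL::Transform::<Name>Transform; with the
    # change above an unknown name now raises ControlError rather than NameError.
    transform :birth_date, :date_to_string, :format => '%Y-%m-%d'   # option name assumed

    # Block transforms are also accepted.
    transform(:last_name) { |value| value.to_s.upcase }

    # Pre/post processors resolve to ETL::Processor::<Name>Processor the same way.
    # pre_process :truncate, {}                            # hypothetical processor name
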
@@ -1,15 +1,41 @@
  module ETL #:nodoc:
  module Control #:nodoc:
+ # Base class for destinations.
  class Destination
- attr_reader :control, :configuration, :mapping
- attr_accessor :buffer_size, :current_row, :unique
+ # Read-only accessor for the ETL::Control::Control instance
+ attr_reader :control
+
+ # Read-only accessor for the configuration Hash
+ attr_reader :configuration
+
+ # Read-only accessor for the destination mapping Hash
+ attr_reader :mapping
+
+ # Accessor to the buffer size
+ attr_accessor :buffer_size
+
+ # Unique flag.
+ attr_accessor :unique
 
  class << self
+ # Get the destination class for the specified name.
+ #
+ # For example if name is :database or 'database' then the DatabaseDestination class
+ # is returned
  def class_for_name(name)
  ETL::Control.const_get("#{name.to_s.classify}Destination")
  end
  end
 
+ # Initialize the destination
+ #
+ # Arguments:
+ # * <tt>control</tt>: The ETL::Control::Control instance
+ # * <tt>configuration</tt>: The configuration Hash
+ # * <tt>mapping</tt>: The mapping Hash
+ #
+ # Options:
+ # * <tt>:buffer_size</tt>: The output buffer size (default 1000 records)
  def initialize(control, configuration, mapping)
  @control = control
  @configuration = configuration
@@ -17,10 +43,12 @@ module ETL #:nodoc:
  @buffer_size = configuration[:buffer_size] ||= 1000
  end
 
+ # Get the current row number
  def current_row
  @current_row ||= 1
  end
 
+ # Write the given row
  def write(row)
  buffer << row
  flush if buffer.length >= buffer_size
@@ -36,11 +64,17 @@ module ETL #:nodoc:
  raise NotImplementedError, "close method must be implemented by subclasses"
  end
 
+ def errors
+ @errors ||= []
+ end
+
  protected
+ # Access the buffer
  def buffer
  @buffer ||= []
  end
 
+ # Access the generators map
  def generators
  @generators ||= {}
  end
@@ -76,15 +110,22 @@ module ETL #:nodoc:
  @compound_key_constraints ||= {}
  end
 
- # Add any virtual fields to the row
+ # Add any virtual fields to the row. Virtual rows will get their value from one of the following:
+ # * If the mapping is a Class, then an object which implements the next method
+ # * If the mapping is a Symbol, then the XGenerator where X is the classified symbol
+ # * If the mapping is a Proc, then it will be called with the row
+ # * Otherwise the value itself will be assigned to the field
  def add_virtuals!(row)
  if mapping[:virtual]
  mapping[:virtual].each do |key,value|
  # Engine.logger.debug "Mapping virtual #{key}/#{value} for row #{row}"
  case value
+ when Class
+ generator = generators[key] ||= value.new
+ row[key] = generator.next
  when Symbol
- generators[key] ||= ETL::Generator::Generator.class_for_name(value).new
- row[key] = generators[key].next
+ generator = generators[key] ||= ETL::Generator::Generator.class_for_name(value).new
+ row[key] = generator.next
  when Proc
  row[key] = value.call(row)
  else