activewarehouse-etl 0.8.4 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. data/CHANGELOG +98 -62
  2. data/Rakefile +11 -0
  3. data/TODO +2 -1
  4. data/lib/etl.rb +9 -0
  5. data/lib/etl/batch.rb +2 -0
  6. data/lib/etl/batch/batch.rb +111 -0
  7. data/lib/etl/batch/directives.rb +55 -0
  8. data/lib/etl/builder.rb +1 -0
  9. data/lib/etl/builder/date_dimension_builder.rb +83 -0
  10. data/lib/etl/commands/etl.rb +56 -43
  11. data/lib/etl/control/control.rb +58 -9
  12. data/lib/etl/control/destination.rb +29 -4
  13. data/lib/etl/control/destination/database_destination.rb +17 -27
  14. data/lib/etl/control/source/database_source.rb +17 -40
  15. data/lib/etl/control/source/file_source.rb +8 -5
  16. data/lib/etl/control/source/model_source.rb +39 -0
  17. data/lib/etl/core_ext.rb +1 -0
  18. data/lib/etl/core_ext/time.rb +5 -0
  19. data/lib/etl/core_ext/time/calculations.rb +40 -0
  20. data/lib/etl/engine.rb +184 -83
  21. data/lib/etl/execution.rb +1 -0
  22. data/lib/etl/execution/base.rb +1 -1
  23. data/lib/etl/execution/batch.rb +8 -0
  24. data/lib/etl/execution/job.rb +1 -0
  25. data/lib/etl/execution/migration.rb +16 -4
  26. data/lib/etl/generator/surrogate_key_generator.rb +20 -4
  27. data/lib/etl/http_tools.rb +1 -1
  28. data/lib/etl/processor/bulk_import_processor.rb +16 -19
  29. data/lib/etl/processor/check_exist_processor.rb +16 -7
  30. data/lib/etl/processor/hierarchy_exploder_processor.rb +2 -1
  31. data/lib/etl/processor/require_non_blank_processor.rb +26 -0
  32. data/lib/etl/processor/surrogate_key_processor.rb +22 -2
  33. data/lib/etl/processor/truncate_processor.rb +13 -13
  34. data/lib/etl/screen.rb +14 -0
  35. data/lib/etl/screen/row_count_screen.rb +7 -2
  36. data/lib/etl/transform/foreign_key_lookup_transform.rb +15 -5
  37. data/lib/etl/transform/hierarchy_lookup_transform.rb +7 -14
  38. data/lib/etl/util.rb +59 -0
  39. data/lib/etl/version.rb +2 -2
  40. metadata +19 -2
data/CHANGELOG CHANGED
@@ -3,29 +3,38 @@
3
3
 
4
4
  0.2.0 - Dec 7, 2006
5
5
  * Added an XML parser for source parsing
6
- * Added support for compound key constraints in destinations via the :unique => [] option
6
+ * Added support for compound key constraints in destinations via the
7
+ :unique => [] option
7
8
  * Added ability to declare explicit columns in bulk import
8
9
  * Added support for generators in destinations
9
- * Added a SurrogateKeyGenerator for cases where the database doesn't support auto generated surrogate keys
10
+ * Added a SurrogateKeyGenerator for cases where the database doesn't support
11
+ auto generated surrogate keys
10
12
 
11
13
  0.3.0 - Dec 19, 2006
12
14
  * Added support for calculated values in virtual fields with Proc
13
15
 
14
16
  0.4.0 - Jan 11, 2007
15
- * Added :skip_lines option to file source configurations, which can be used to skip the first n lines in the source data file
16
- * Added better error handling in delimited parser - an error is now raised if the expected and actual field lengths do not match
17
- * Added :truncate option for database destination. Set to true to truncate before importing data.
18
- * Added support for :unique => [] option and virtual fields for the database destination
17
+ * Added :skip_lines option to file source configurations, which can be used
18
+ to skip the first n lines in the source data file
19
+ * Added better error handling in delimited parser - an error is now raised
20
+ if the expected and actual field lengths do not match
21
+ * Added :truncate option for database destination. Set to true to truncate
22
+ before importing data.
23
+ * Added support for :unique => [] option and virtual fields for the database
24
+ destination
19
25
 
20
26
  0.5.0 - Feb 17, 2007
21
- * Changed require_gem to gem and added alias to allow for older versions of rubygems.
22
- * Added support for Hash in the source configuration where :name => :parser_name defines the parser to use and
23
- :options => {} defines options to pass to the parser.
27
+ * Changed require_gem to gem and added alias to allow for older versions of
28
+ rubygems.
29
+ * Added support for Hash in the source configuration where :name => :parser_name
30
+ defines the parser to use and :options => {} defines options to pass to the
31
+ parser.
24
32
  * Added support for passing a custom Parser class in the source configuration.
25
33
  * Removed the need to include Enumerable in each parser implementation.
26
34
  * Added new date_to_string and string_to_date transformers.
27
35
  * Implemented foreign_key_lookup transform including an ActiveRecordResolver.
28
- * Added real time activity logging which is called when the etl bin script is invoked.
36
+ * Added real time activity logging which is called when the etl bin script is
37
+ invoked.
29
38
  * Improved error handling.
30
39
  * Default logger level is now WARN.
31
40
 
@@ -40,29 +49,36 @@
40
49
  0.6.0 - Mar 8, 2007
41
50
  * Fixed missing method problem in validate in Control class.
42
51
  * Removed control validation for now (source could be code in the control file).
43
- * Transform interface now defined as taking 3 arguments, the field name, field value and the row. This
44
- is not backwards compatible.
52
+ * Transform interface now defined as taking 3 arguments, the field name, field
53
+ value and the row. This is not backwards compatible.
45
54
  * Added HierarchyLookupTransform.
46
- * Added DefaultTransform which will return a specified value if the initial value is blank.
55
+ * Added DefaultTransform which will return a specified value if the initial
56
+ value is blank.
47
57
  * Added row-level processing.
48
- * Added HierarchyExploderProcessor which takes a single hierarchy row and explodes it to multiple rows
49
- as used in a hierarchy bridge.
50
- * Added ApacheCombinedLogParser which parses Apache Combined Log format, including parsing of the
58
+ * Added HierarchyExploderProcessor which takes a single hierarchy row and
59
+ explodes it to multiple rows as used in a hierarchy bridge.
60
+ * Added ApacheCombinedLogParser which parses Apache Combined Log format,
61
+ including parsing of the
51
62
  user agent string and the URI, returning a Hash.
52
- * Fixed bug in SAX parser so that attributes are now set when the start_element event is received.
53
- * Added an HttpTools module which provides some parsing methods (for user agent and URI).
54
- * Database source now uses its own class for establishing an ActiveRecord connection.
63
+ * Fixed bug in SAX parser so that attributes are now set when the start_element
64
+ event is received.
65
+ * Added an HttpTools module which provides some parsing methods (for user agent
66
+ and URI).
67
+ * Database source now uses its own class for establishing an ActiveRecord
68
+ connection.
55
69
  * Log files are now timestamped.
56
70
  * Source files are now archived automatically during the extraction process
57
- * Added a :condition option to the destination configuration Hash that accepts a Proc with a single
58
- argument passed to it (the row).
59
- * Added an :append_rows option to the destination configuration Hash that accepts either a Hash (to
60
- append a single row) or an Array of Hashes (to append multiple rows).
61
- * Only print the read and written row counts if there is at least one source and one destination
62
- respectively.
63
- * Added a depends_on directive that accepts a list of arguments of either strings or symbols. Each
64
- symbol is converted to a string and .ctl is appended; strings are passed through directly. The
65
- dependencies are executed in the order they are specified.
71
+ * Added a :condition option to the destination configuration Hash that accepts
72
+ a Proc with a single argument passed to it (the row).
73
+ * Added an :append_rows option to the destination configuration Hash that
74
+ accepts either a Hash (to append a single row) or an Array of Hashes (to
75
+ append multiple rows).
76
+ * Only print the read and written row counts if there is at least one source
77
+ and one destination respectively.
78
+ * Added a depends_on directive that accepts a list of arguments of either strings
79
+ or symbols. Each symbol is converted to a string and .ctl is appended;
80
+ strings are passed through directly. The dependencies are executed in the order
81
+ they are specified.
66
82
  * The default field separator in the bulk loader is now a comma (was a tab).
67
83
 
68
84
  0.6.1 - Mar 22, 2007
@@ -70,31 +86,36 @@
70
86
  * Added CopyFieldProcessor
71
87
 
72
88
  0.7 - Apr 8, 2007
73
- * Job execution is now tracked in a database. This means that ActiveRecord is required regardless
74
- of the sources being used in the ETL scripts. An example database configuration for the etl can
75
- be found in test/database.example.yml. This file is loaded from either a.) the current working
76
- directory or b.) the location specified using the -c command line argument when running the
77
- etl command.
89
+ * Job execution is now tracked in a database. This means that ActiveRecord is
90
+ required regardless of the sources being used in the ETL scripts. An example
91
+ database configuration for the etl can be found in test/database.example.yml.
92
+ This file is loaded from either a.) the current working directory or b.) the
93
+ location specified using the -c command line argument when running the etl
94
+ command.
78
95
  * etl script now supports the following command line arguments:
79
96
  ** -h or --help: Prints the usage
80
- ** -l or --limit: Specifies a limit for the number of source rows to read, useful for testing
81
- your control files before executing a full ETL process
82
- ** -o or --offset: Specified a start offset for reading from the source, useful for testing your
83
- control files before executing a full ETL process
84
- ** -c or --config: Specify the database.yml file to configure the ETL execution data store
97
+ ** -l or --limit: Specifies a limit for the number of source rows to read,
98
+ useful for testing your control files before executing a full ETL process
99
+ ** -o or --offset: Specifies a start offset for reading from the source, useful
100
+ for testing your control files before executing a full ETL process
101
+ ** -c or --config: Specify the database.yml file to configure the ETL
102
+ execution data store
85
103
  ** -n or --newlog: Write to the logfile rather than appending to it
86
- * Database source now supports specifying the select, join and order parts of the query.
87
- * Database source understands the limit argument specified on the etl command line
104
+ * Database source now supports specifying the select, join and order parts of
105
+ the query.
106
+ * Database source understands the limit argument specified on the etl command
107
+ line
88
108
  * Added CheckExistProcessor
89
109
  * Added CheckUniqueProcessor
90
- * Added SurrogateKeyProcessor. The SurrogateKey processor should be used in conjunction with the
91
- CheckExistProcessor and CheckUniqueProcessor to provide
110
+ * Added SurrogateKeyProcessor. The SurrogateKey processor should be used in
111
+ conjunction with the CheckExistProcessor and CheckUniqueProcessor to provide
112
+ surrogate keys for all dimension records.
92
113
  * Added SequenceProcessor
93
114
  * Added OrdinalizeTransform
94
115
  * Fixed a bug in the trim transform
95
- * Sources now provide a trigger file which can be used to indicate that the original source
96
- data has been completely extracted to the local file system. This is useful if you need to
97
- recover from a failed ETL process.
116
+ * Sources now provide a trigger file which can be used to indicate that the
117
+ original source data has been completely extracted to the local file system.
118
+ This is useful if you need to recover from a failed ETL process.
98
119
  * Updated README
99
120
 
100
121
  0.7.1 - Apr 8, 2007
@@ -105,27 +126,30 @@
105
126
 
106
127
  0.8.0 - Apr 12, 2007
107
128
  * Source now available through the current row source accessor.
108
- * Added new_rows_only configuration option to DatabaseSource. A date field must be specified and
109
- only records that are greater than the date value in that field, relative to the last successful
129
+ * Added new_rows_only configuration option to DatabaseSource. A date field must
130
+ be specified and only records that are greater than the date value in that
131
+ field, relative to the last successful
110
132
  execution, will be returned from the source.
111
- * Added an (untested) count feature which returns the number of rows for processing.
112
- * If no natural key is defined then an empty array will now be used, resulting in the row being
113
- written to the output without going through change checks.
114
- * Mapping argument in destination is now optional. An empty hash will be used if the mapping
115
- hash is not specified. If the mapping hash is not specified then the order will be determined
116
- using the originating source's order.
117
- * ActiveRecord configurations loaded from database.yml by the etl tool will be merged with
118
- ActiveRecord::Base.configurations.
133
+ * Added an (untested) count feature which returns the number of rows for
134
+ processing.
135
+ * If no natural key is defined then an empty array will now be used, resulting
136
+ in the row being written to the output without going through change checks.
137
+ * Mapping argument in destination is now optional. An empty hash will be used
138
+ if the mapping hash is not specified. If the mapping hash is not specified
139
+ then the order will be determined using the originating source's order.
140
+ * ActiveRecord configurations loaded from database.yml by the etl tool will be
141
+ merged with ActiveRecord::Base.configurations.
119
142
  * Fixed several bugs in how record change detection was implemented.
120
- * Fixed how the read_locally functionality was implemented so that it will find that last
121
- completed local source copy using the source's trigger file (untested).
143
+ * Fixed how the read_locally functionality was implemented so that it will find
144
+ the last completed local source copy using the source's trigger file (untested).
122
145
 
123
146
  0.8.1 - Apr 12, 2007
124
147
  * Added EnumerableSource
125
- * Added :type configuration option to the source directive, allowing the source type to be
126
- explicitly specified. The source type can be a string or symbol (in which case the class will
127
- be constructed by appending Source to the type name), a class (which will be instantiated
128
- and passed the control, configuration and mapping) and finally an actual Source instance.
148
+ * Added :type configuration option to the source directive, allowing the source
149
+ type to be explicitly specified. The source type can be a string or symbol
150
+ (in which case the class will be constructed by appending Source to the type
151
+ name), a class (which will be instantiated and passed the control,
152
+ configuration and mapping) and finally an actual Source instance.
129
153
 
130
154
  0.8.2 - April 15, 2007
131
155
  * Fixed bug with premature destination closing.
@@ -139,4 +163,16 @@
139
163
  * Added patches from Andy Triboletti
140
164
 
141
165
  0.8.4 - May 24, 2007
142
- * Added fix for backslash in file writer
166
+ * Added fix for backslash in file writer
167
+
168
+ 0.9.0 -
169
+ * Added support for batch processing through .ebf files. These files are
170
+ essentially control files that apply settings to an entire ETL process.
171
+ * Implemented support for screen blocks. These blocks can be used to test
172
+ the data and raise an error if the screens do not pass.
173
+ * Connections are now cached in a Hash available through
174
+ ETL::Engine.connection(name). This should be used rather than including
175
+ connection information in the control files.
176
+ * Implemented temp table support throughout.
177
+ * DateDimensionBuilder now included in ActiveWarehouse ETL directly.
178
+ * Time calculations for fiscal year now included in ActiveWarehouse ETL.
data/Rakefile CHANGED
@@ -38,6 +38,17 @@ Rake::RDocTask.new(:rdoc) do |rdoc|
38
38
  rdoc.rdoc_files.include('lib/**/*.rb')
39
39
  end
40
40
 
41
+ namespace :rcov do
42
+ desc 'Measures test coverage'
43
+ task :test do
44
+ rm_f 'coverage.data'
45
+ mkdir 'coverage' unless File.exist?('coverage')
46
+ rcov = "rcov --aggregate coverage.data --text-summary -Ilib"
47
+ system("#{rcov} test/*_test.rb")
48
+ system("open coverage/index.html") if PLATFORM['darwin']
49
+ end
50
+ end
51
+
41
52
  PKG_FILES = FileList[
42
53
  'CHANGELOG',
43
54
  'LICENSE',
data/TODO CHANGED
@@ -7,8 +7,9 @@ TODO
7
7
  ** Don't die completely if a parse error, just stop processing that specific file if error threshold is reached
8
8
  ** Allow mismatch row length error in delimited parser to be ignored
9
9
  * Improve error messages throughout, but especially in problems with the control files
10
- * Add support for determining if a row should be added, updated or removed vs. just blindly inserting
11
10
  * Add support for partitioned views during the insert process. Use specifiable columns as the trigger columns for determining the data output destination.
11
+ * Check if a temp table exists and the last job run was successful, in which case skip during the current run
12
+ * Create models for each of the tables in each of the databases defined in ETL::Engine.connections
12
13
 
13
14
  Audit Record
14
15
 
data/lib/etl.rb CHANGED
@@ -57,15 +57,20 @@ end
57
57
 
58
58
  $:.unshift(File.dirname(__FILE__))
59
59
 
60
+ require 'etl/core_ext'
61
+ require 'etl/util'
60
62
  require 'etl/http_tools'
63
+ require 'etl/builder'
61
64
  require 'etl/version'
62
65
  require 'etl/engine'
63
66
  require 'etl/control'
67
+ require 'etl/batch'
64
68
  require 'etl/row'
65
69
  require 'etl/parser'
66
70
  require 'etl/transform'
67
71
  require 'etl/processor'
68
72
  require 'etl/generator'
73
+ require 'etl/screen'
69
74
 
70
75
  module ETL #:nodoc:
71
76
  class ETLError < StandardError #:nodoc:
@@ -78,4 +83,8 @@ module ETL #:nodoc:
78
83
  end
79
84
  class ResolverError < ETLError #:nodoc:
80
85
  end
86
+ class ScreenError < ETLError #:nodoc:
87
+ end
88
+ class FatalScreenError < ScreenError #:nodoc:
89
+ end
81
90
  end
@@ -0,0 +1,2 @@
1
+ require 'etl/batch/batch'
2
+ require 'etl/batch/directives'
@@ -0,0 +1,111 @@
1
+ module ETL #:nodoc:
2
+ module Batch
3
+ class Context
4
+ attr_reader :batch
5
+
6
+ class << self
7
+ # Create a context that is used when evaluating the batch file
8
+ def create(batch)
9
+ Context.new(batch).get_binding
10
+ end
11
+ end
12
+
13
+ def initialize(batch)
14
+ @batch = batch
15
+ end
16
+
17
+ def file
18
+ batch.file
19
+ end
20
+
21
+ def get_binding
22
+ binding
23
+ end
24
+
25
+ def run(file)
26
+ batch.run(File.dirname(self.file) + "/" + file)
27
+ end
28
+
29
+ def use_temp_tables(value=true)
30
+ batch.use_temp_tables(value)
31
+ end
32
+
33
+ end
34
+ class Batch
35
+ attr_accessor :file
36
+ attr_accessor :engine
37
+
38
+ class << self
39
+ # Resolve the given object to an ETL::Control::Control instance. Acceptable arguments
40
+ # are:
41
+ # * The path to a control file as a String
42
+ # * A File object referencing the control file
43
+ # * The ETL::Control::Control object (which will just be returned)
44
+ #
45
+ # Raises a ControlError if any other type is given
46
+ def resolve(batch, engine)
47
+ batch = do_resolve(batch)
48
+ batch.engine = engine
49
+ batch
50
+ end
51
+
52
+ protected
53
+ def parse(batch_file)
54
+ batch_file = batch_file.path if batch_file.instance_of?(File)
55
+ batch = ETL::Batch::Batch.new(batch_file)
56
+ eval(IO.readlines(batch_file).join("\n"), Context.create(batch), batch_file)
57
+ batch
58
+ end
59
+
60
+ def do_resolve(batch)
61
+ case batch
62
+ when String
63
+ ETL::Batch::Batch.parse(File.new(batch))
64
+ when File
65
+ ETL::Batch::Batch.parse(batch)
66
+ when ETL::Batch::Batch
67
+ batch
68
+ else
69
+ raise RuntimeError, "Batch must be a String, File or Batch object"
70
+ end
71
+ end
72
+ end
73
+
74
+ def initialize(file)
75
+ @file = file
76
+ end
77
+
78
+ def run(file)
79
+ directives << Run.new(self, file)
80
+ end
81
+
82
+ def use_temp_tables(value = true)
83
+ directives << UseTempTables.new(self)
84
+ end
85
+
86
+ def execute
87
+ engine.say "Executing batch"
88
+ before_execute
89
+ directives.each do |directive|
90
+ directive.execute
91
+ end
92
+ engine.say "Finishing batch"
93
+ after_execute
94
+ engine.say "Batch complete"
95
+ end
96
+
97
+ def directives
98
+ @directives ||= []
99
+ end
100
+
101
+ def before_execute
102
+
103
+ end
104
+
105
+ def after_execute
106
+ ETL::Engine.finish # TODO: should be moved to the directive?
107
+ ETL::Engine.use_temp_tables = false # reset the temp tables
108
+ end
109
+ end
110
+ end
111
+ end
@@ -0,0 +1,55 @@
1
+ module ETL #:nodoc:
2
+ module Batch #:nodoc:
3
+ # Abstract base class for directives
4
+ class Directive
5
+ # Method to access the batch object
6
+ attr_reader :batch
7
+
8
+ # Initialize the directive with the given batch object
9
+ def initialize(batch)
10
+ @batch = batch
11
+ end
12
+
13
+ # Execute the directive
14
+ def execute
15
+ do_execute
16
+ end
17
+
18
+ protected
19
+ # Implemented by subclasses
20
+ def do_execute
21
+ raise RuntimeError, "Directive must implement do_execute method"
22
+ end
23
+ end
24
+
25
+ # Directive indicating that the specified ETL control file should be
26
+ # run
27
+ class Run < Directive
28
+ # The file to execute
29
+ attr_reader :file
30
+
31
+ # Initialize the directive with the given batch object and file
32
+ def initialize(batch, file)
33
+ super(batch)
34
+ @file = file
35
+ end
36
+
37
+ protected
38
+ # Execute the process
39
+ def do_execute
40
+ batch.engine.process(file)
41
+ end
42
+ end
43
+
44
+ # Directive indicating temp tables should be used.
45
+ class UseTempTables < Directive
46
+ def initialize(batch)
47
+ super(batch)
48
+ end
49
+ protected
50
+ def do_execute
51
+ ETL::Engine.use_temp_tables = true
52
+ end
53
+ end
54
+ end
55
+ end