activewarehouse-etl 0.8.4 → 0.9.0
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +98 -62
- data/Rakefile +11 -0
- data/TODO +2 -1
- data/lib/etl.rb +9 -0
- data/lib/etl/batch.rb +2 -0
- data/lib/etl/batch/batch.rb +111 -0
- data/lib/etl/batch/directives.rb +55 -0
- data/lib/etl/builder.rb +1 -0
- data/lib/etl/builder/date_dimension_builder.rb +83 -0
- data/lib/etl/commands/etl.rb +56 -43
- data/lib/etl/control/control.rb +58 -9
- data/lib/etl/control/destination.rb +29 -4
- data/lib/etl/control/destination/database_destination.rb +17 -27
- data/lib/etl/control/source/database_source.rb +17 -40
- data/lib/etl/control/source/file_source.rb +8 -5
- data/lib/etl/control/source/model_source.rb +39 -0
- data/lib/etl/core_ext.rb +1 -0
- data/lib/etl/core_ext/time.rb +5 -0
- data/lib/etl/core_ext/time/calculations.rb +40 -0
- data/lib/etl/engine.rb +184 -83
- data/lib/etl/execution.rb +1 -0
- data/lib/etl/execution/base.rb +1 -1
- data/lib/etl/execution/batch.rb +8 -0
- data/lib/etl/execution/job.rb +1 -0
- data/lib/etl/execution/migration.rb +16 -4
- data/lib/etl/generator/surrogate_key_generator.rb +20 -4
- data/lib/etl/http_tools.rb +1 -1
- data/lib/etl/processor/bulk_import_processor.rb +16 -19
- data/lib/etl/processor/check_exist_processor.rb +16 -7
- data/lib/etl/processor/hierarchy_exploder_processor.rb +2 -1
- data/lib/etl/processor/require_non_blank_processor.rb +26 -0
- data/lib/etl/processor/surrogate_key_processor.rb +22 -2
- data/lib/etl/processor/truncate_processor.rb +13 -13
- data/lib/etl/screen.rb +14 -0
- data/lib/etl/screen/row_count_screen.rb +7 -2
- data/lib/etl/transform/foreign_key_lookup_transform.rb +15 -5
- data/lib/etl/transform/hierarchy_lookup_transform.rb +7 -14
- data/lib/etl/util.rb +59 -0
- data/lib/etl/version.rb +2 -2
- metadata +19 -2
data/CHANGELOG
CHANGED
@@ -3,29 +3,38 @@
|
|
3
3
|
|
4
4
|
0.2.0 - Dec 7, 2006
|
5
5
|
* Added an XML parser for source parsing
|
6
|
-
* Added support for compound key constraints in destinations via the
|
6
|
+
* Added support for compound key constraints in destinations via the
|
7
|
+
:unique => [] option
|
7
8
|
* Added ability to declare explicit columns in bulk import
|
8
9
|
* Added support for generators in destinations
|
9
|
-
* Added a SurrogateKeyGenerator for cases where the database doesn't support
|
10
|
+
* Added a SurrogateKeyGenerator for cases where the database doesn't support
|
11
|
+
auto generated surrogate keys
|
10
12
|
|
11
13
|
0.3.0 - Dec 19, 2006
|
12
14
|
* Added support for calculated values in virtual fields with Proc
|
13
15
|
|
14
16
|
0.4.0 - Jan 11, 2007
|
15
|
-
* Added :skip_lines option to file source configurations, which can be used
|
16
|
-
|
17
|
-
* Added
|
18
|
-
|
17
|
+
* Added :skip_lines option to file source configurations, which can be used
|
18
|
+
to skip the first n lines in the source data file
|
19
|
+
* Added better error handling in delimited parser - an error is now raised
|
20
|
+
if the expected and actual field lengths do not match
|
21
|
+
* Added :truncate option for database destination. Set to true to truncate
|
22
|
+
before importing data.
|
23
|
+
* Added support for :unique => [] option and virtual fields for the database
|
24
|
+
destination
|
19
25
|
|
20
26
|
0.5.0 - Feb 17, 2007
|
21
|
-
* Changed require_gem to gem and added alias to allow for older versions of
|
22
|
-
|
23
|
-
|
27
|
+
* Changed require_gem to gem and added alias to allow for older versions of
|
28
|
+
rubygems.
|
29
|
+
* Added support for Hash in the source configuration where :name => :parser_name
|
30
|
+
defines the parser to use and :options => {} defines options to pass to the
|
31
|
+
parser.
|
24
32
|
* Added support for passing a custom Parser class in the source configuration.
|
25
33
|
* Removed the need to include Enumerable in each parser implementation.
|
26
34
|
* Added new date_to_string and string_to_date transformers.
|
27
35
|
* Implemented foreign_key_lookup transform including an ActiveRecordResolver.
|
28
|
-
* Added real time activity logging which is called when the etl bin script is
|
36
|
+
* Added real time activity logging which is called when the etl bin script is
|
37
|
+
invoked.
|
29
38
|
* Improved error handling.
|
30
39
|
* Default logger level is now WARN.
|
31
40
|
|
@@ -40,29 +49,36 @@
|
|
40
49
|
0.6.0 - Mar 8, 2007
|
41
50
|
* Fixed missing method problem in validate in Control class.
|
42
51
|
* Removed control validation for now (source could be code in the control file).
|
43
|
-
* Transform interface now defined as taking 3 arguments, the field name, field
|
44
|
-
is not backwards compatible.
|
52
|
+
* Transform interface now defined as taking 3 arguments, the field name, field
|
53
|
+
value and the row. This is not backwards compatible.
|
45
54
|
* Added HierarchyLookupTransform.
|
46
|
-
* Added DefaultTransform which will return a specified value if the initial
|
55
|
+
* Added DefaultTransform which will return a specified value if the initial
|
56
|
+
value is blank.
|
47
57
|
* Added row-level processing.
|
48
|
-
* Added HierarchyExploderProcessor which takes a single hierarchy row and
|
49
|
-
as used in a hierarchy bridge.
|
50
|
-
* Added ApacheCombinedLogParser which parses Apache Combined Log format,
|
58
|
+
* Added HierarchyExploderProcessor which takes a single hierarchy row and
|
59
|
+
explodes it to multiple rows as used in a hierarchy bridge.
|
60
|
+
* Added ApacheCombinedLogParser which parses Apache Combined Log format,
|
61
|
+
including parsing of the
|
51
62
|
user agent string and the URI, returning a Hash.
|
52
|
-
* Fixed bug in SAX parser so that attributes are now set when the start_element
|
53
|
-
|
54
|
-
*
|
63
|
+
* Fixed bug in SAX parser so that attributes are now set when the start_element
|
64
|
+
event is received.
|
65
|
+
* Added an HttpTools module which provides some parsing methods (for user agent
|
66
|
+
and URI).
|
67
|
+
* Database source now uses its own class for establishing an ActiveRecord
|
68
|
+
connection.
|
55
69
|
* Log files are now timestamped.
|
56
70
|
* Source files are now archived automatically during the extraction process
|
57
|
-
* Added a :condition option to the destination configuration Hash that accepts
|
58
|
-
argument passed to it (the row).
|
59
|
-
* Added an :append_rows option to the destination configuration Hash that
|
60
|
-
append a single row) or an Array of Hashes (to
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
71
|
+
* Added a :condition option to the destination configuration Hash that accepts
|
72
|
+
a Proc with a single argument passed to it (the row).
|
73
|
+
* Added an :append_rows option to the destination configuration Hash that
|
74
|
+
accepts either a Hash (to append a single row) or an Array of Hashes (to
|
75
|
+
append multiple rows).
|
76
|
+
* Only print the read and written row counts if there is at least one source
|
77
|
+
and one destination respectively.
|
78
|
+
* Added a depends_on directive that accepts a list of arguments of either strings
|
79
|
+
or symbols. Each symbol is converted to a string and .ctl is appended;
|
80
|
+
strings are passed through directly. The dependencies are executed in the order
|
81
|
+
they are specified.
|
66
82
|
* The default field separator in the bulk loader is now a comma (was a tab).
|
67
83
|
|
68
84
|
0.6.1 - Mar 22, 2007
|
@@ -70,31 +86,36 @@
|
|
70
86
|
* Added CopyFieldProcessor
|
71
87
|
|
72
88
|
0.7 - Apr 8, 2007
|
73
|
-
* Job execution is now tracked in a database. This means that ActiveRecord is
|
74
|
-
of the sources being used in the ETL scripts. An example
|
75
|
-
be found in test/database.example.yml.
|
76
|
-
|
77
|
-
etl
|
89
|
+
* Job execution is now tracked in a database. This means that ActiveRecord is
|
90
|
+
required regardless of the sources being used in the ETL scripts. An example
|
91
|
+
database configuration for the etl can be found in test/database.example.yml.
|
92
|
+
This file is loaded from either a.) the current working directory or b.) the
|
93
|
+
location specified using the -c command line argument when running the etl
|
94
|
+
command.
|
78
95
|
* etl script now supports the following command line arguments:
|
79
96
|
** -h or --help: Prints the usage
|
80
|
-
** -l or --limit: Specifies a limit for the number of source rows to read,
|
81
|
-
your control files before executing a full ETL process
|
82
|
-
** -o or --offset: Specified a start offset for reading from the source, useful
|
83
|
-
control files before executing a full ETL process
|
84
|
-
** -c or --config: Specify the database.yml file to configure the ETL
|
97
|
+
** -l or --limit: Specifies a limit for the number of source rows to read,
|
98
|
+
useful for testing your control files before executing a full ETL process
|
99
|
+
** -o or --offset: Specifies a start offset for reading from the source, useful
|
100
|
+
for testing your control files before executing a full ETL process
|
101
|
+
** -c or --config: Specify the database.yml file to configure the ETL
|
102
|
+
execution data store
|
85
103
|
** -n or --newlog: Write to the logfile rather than appending to it
|
86
|
-
* Database source now supports specifying the select, join and order parts of
|
87
|
-
|
104
|
+
* Database source now supports specifying the select, join and order parts of
|
105
|
+
the query.
|
106
|
+
* Database source understands the limit argument specified on the etl command
|
107
|
+
line
|
88
108
|
* Added CheckExistProcessor
|
89
109
|
* Added CheckUniqueProcessor
|
90
|
-
* Added SurrogateKeyProcessor. The SurrogateKey processor should be used in
|
91
|
-
CheckExistProcessor and CheckUniqueProcessor to provide
|
110
|
+
* Added SurrogateKeyProcessor. The SurrogateKey processor should be used in
|
111
|
+
conjunction with the CheckExistProcessor and CheckUniqueProcessor to provide
|
112
|
+
surrogate keys for all dimension records.
|
92
113
|
* Added SequenceProcessor
|
93
114
|
* Added OrdinalizeTransform
|
94
115
|
* Fixed a bug in the trim transform
|
95
|
-
* Sources now provide a trigger file which can be used to indicate that the
|
96
|
-
data has been completely extracted to the local file system.
|
97
|
-
recover from a failed ETL process.
|
116
|
+
* Sources now provide a trigger file which can be used to indicate that the
|
117
|
+
original source data has been completely extracted to the local file system.
|
118
|
+
This is useful if you need to recover from a failed ETL process.
|
98
119
|
* Updated README
|
99
120
|
|
100
121
|
0.7.1 - Apr 8, 2007
|
@@ -105,27 +126,30 @@
|
|
105
126
|
|
106
127
|
0.8.0 - Apr 12, 2007
|
107
128
|
* Source now available through the current row source accessor.
|
108
|
-
* Added new_rows_only configuration option to DatabaseSource. A date field must
|
109
|
-
only records that are greater than the date value in that
|
129
|
+
* Added new_rows_only configuration option to DatabaseSource. A date field must
|
130
|
+
be specified and only records that are greater than the date value in that
|
131
|
+
field, relative to the last successful
|
110
132
|
execution, will be returned from the source.
|
111
|
-
* Added an (untested) count feature which returns the number of rows for
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
133
|
+
* Added an (untested) count feature which returns the number of rows for
|
134
|
+
processing.
|
135
|
+
* If no natural key is defined then an empty array will now be used, resulting
|
136
|
+
in the row being written to the output without going through change checks.
|
137
|
+
* Mapping argument in destination is now optional. An empty hash will be used
|
138
|
+
if the mapping hash is not specified. If the mapping hash is not specified
|
139
|
+
then the order will be determined using the originating source's order.
|
140
|
+
* ActiveRecord configurations loaded from database.yml by the etl tool will be
|
141
|
+
merged with ActiveRecord::Base.configurations.
|
119
142
|
* Fixed several bugs in how record change detection was implemented.
|
120
|
-
* Fixed how the read_locally functionality was implemented so that it will find
|
121
|
-
completed local source copy using the source's trigger file (untested).
|
143
|
+
* Fixed how the read_locally functionality was implemented so that it will find
|
144
|
+
the last completed local source copy using the source's trigger file (untested).
|
122
145
|
|
123
146
|
0.8.1 - Apr 12, 2007
|
124
147
|
* Added EnumerableSource
|
125
|
-
* Added :type configuration option to the source directive, allowing the source
|
126
|
-
explicitly specified. The source type can be a string or symbol
|
127
|
-
be constructed by appending Source to the type
|
128
|
-
|
148
|
+
* Added :type configuration option to the source directive, allowing the source
|
149
|
+
type to be explicitly specified. The source type can be a string or symbol
|
150
|
+
(in which case the class will be constructed by appending Source to the type
|
151
|
+
name), a class (which will be instantiated and passed the control,
|
152
|
+
configuration and mapping) and finally an actual Source instance.
|
129
153
|
|
130
154
|
0.8.2 - April 15, 2007
|
131
155
|
* Fixed bug with premature destination closing.
|
@@ -139,4 +163,16 @@
|
|
139
163
|
* Added patches from Andy Triboletti
|
140
164
|
|
141
165
|
0.8.4 - May 24, 2007
|
142
|
-
* Added fix for backslash in file writer
|
166
|
+
* Added fix for backslash in file writer
|
167
|
+
|
168
|
+
0.9.0 -
|
169
|
+
* Added support for batch processing through .ebf files. These files are
|
170
|
+
essentially control files that apply settings to an entire ETL process.
|
171
|
+
* Implemented support for screen blocks. These blocks can be used to test
|
172
|
+
the data and raise an error if the screens do not pass.
|
173
|
+
* Connections are now cached in a Hash available through
|
174
|
+
ETL::Engine.connection(name). This should be used rather than including
|
175
|
+
connection information in the control files.
|
176
|
+
* Implemented temp table support throughout.
|
177
|
+
* DateDimensionBuilder now included in ActiveWarehouse ETL directly.
|
178
|
+
* Time calculations for fiscal year now included in ActiveWarehouse ETL.
|
data/Rakefile
CHANGED
@@ -38,6 +38,17 @@ Rake::RDocTask.new(:rdoc) do |rdoc|
|
|
38
38
|
rdoc.rdoc_files.include('lib/**/*.rb')
|
39
39
|
end
|
40
40
|
|
41
|
+
namespace :rcov do
  # Runs the test suite under rcov, aggregating results into coverage.data
  # and printing a text summary. On macOS the HTML report is opened
  # afterwards.
  desc 'Measures test coverage'
  task :test do
    # Start from a clean aggregate so earlier runs do not skew the numbers
    rm_f 'coverage.data'
    mkdir 'coverage' unless File.exist?('coverage')
    rcov = "rcov --aggregate coverage.data --text-summary -Ilib"
    system("#{rcov} test/*_test.rb")
    # RUBY_PLATFORM replaces the obsolete PLATFORM constant, which no longer
    # exists on Ruby 1.9+ and made this line raise NameError
    system("open coverage/index.html") if RUBY_PLATFORM =~ /darwin/
  end
end
|
51
|
+
|
41
52
|
PKG_FILES = FileList[
|
42
53
|
'CHANGELOG',
|
43
54
|
'LICENSE',
|
data/TODO
CHANGED
@@ -7,8 +7,9 @@ TODO
|
|
7
7
|
** Don't die completely if a parse error, just stop processing that specific file if error threshold is reached
|
8
8
|
** Allow mismatch row length error in delimited parser to be ignored
|
9
9
|
* Improve error messages throughout, but especially in problems with the control files
|
10
|
-
* Add support for determining if a row should be added, updated or removed vs. just blindly inserting
|
11
10
|
* Add support for partitioned views during the insert process. Use specifiable columns as the trigger columns for determining the data output destination.
|
11
|
+
* Check if a temp table exists and the last job run was successful, in which case skip during the current run
|
12
|
+
* Create models for each of the tables in each of the databases defined in ETL::Engine.connections
|
12
13
|
|
13
14
|
Audit Record
|
14
15
|
|
data/lib/etl.rb
CHANGED
@@ -57,15 +57,20 @@ end
|
|
57
57
|
|
58
58
|
$:.unshift(File.dirname(__FILE__))
|
59
59
|
|
60
|
+
require 'etl/core_ext'
|
61
|
+
require 'etl/util'
|
60
62
|
require 'etl/http_tools'
|
63
|
+
require 'etl/builder'
|
61
64
|
require 'etl/version'
|
62
65
|
require 'etl/engine'
|
63
66
|
require 'etl/control'
|
67
|
+
require 'etl/batch'
|
64
68
|
require 'etl/row'
|
65
69
|
require 'etl/parser'
|
66
70
|
require 'etl/transform'
|
67
71
|
require 'etl/processor'
|
68
72
|
require 'etl/generator'
|
73
|
+
require 'etl/screen'
|
69
74
|
|
70
75
|
module ETL #:nodoc:
|
71
76
|
class ETLError < StandardError #:nodoc:
|
@@ -78,4 +83,8 @@ module ETL #:nodoc:
|
|
78
83
|
end
|
79
84
|
# ETLError subclass with no behavior of its own.
# NOTE(review): presumably raised by the resolvers used for foreign key
# lookups - confirm against the transform code.
class ResolverError < ETLError #:nodoc:
|
80
85
|
end
|
86
|
+
# ETLError subclass for screen blocks, with no behavior of its own.
# NOTE(review): presumably raised when a screen does not pass - confirm
# against the engine's screen handling.
class ScreenError < ETLError #:nodoc:
|
87
|
+
end
|
88
|
+
# ScreenError subclass with no behavior of its own.
# NOTE(review): the name suggests a screen failure that should abort
# processing - TODO confirm how the engine treats it.
class FatalScreenError < ScreenError #:nodoc:
|
89
|
+
end
|
81
90
|
end
|
data/lib/etl/batch.rb
ADDED
@@ -0,0 +1,111 @@
|
|
1
|
+
module ETL #:nodoc:
|
2
|
+
module Batch
|
3
|
+
# Evaluation context for a batch file. Context.create hands out the binding
# of a fresh instance, so code evaluated against that binding can call the
# instance methods defined here; each one forwards to the wrapped batch.
class Context
  # The batch object that calls are forwarded to
  attr_reader :batch

  # Create a context that is used when evaluating the batch file and
  # return its binding
  def self.create(batch)
    context = Context.new(batch)
    context.get_binding
  end

  # Wrap the given batch
  def initialize(batch)
    @batch = batch
  end

  # Binding of this instance
  def get_binding
    binding
  end

  # The file reported by the wrapped batch
  def file
    batch.file
  end

  # Ask the batch to run +file+, given relative to the directory of the
  # batch's own file
  def run(file)
    # self.file is required here: the +file+ parameter shadows the method
    base_dir = File.dirname(self.file)
    batch.run(base_dir + "/" + file)
  end

  # Forward the temp table setting to the batch
  def use_temp_tables(value=true)
    batch.use_temp_tables(value)
  end
end
|
34
|
+
# A batch: an ordered list of directives collected by evaluating a batch
# file, executed in order through the attached engine.
class Batch
  # Path of the batch file this batch was built from
  attr_accessor :file
  # Engine used to report progress and run directives; set by Batch.resolve
  attr_accessor :engine

  class << self
    # Resolve the given object to an ETL::Batch::Batch instance and attach
    # the engine to it. Acceptable arguments are:
    # * The path to a batch file as a String
    # * A File object referencing the batch file
    # * An ETL::Batch::Batch object (which will just be returned)
    #
    # Raises a RuntimeError if any other type is given
    def resolve(batch, engine)
      resolved = do_resolve(batch)
      resolved.engine = engine
      resolved
    end

    protected

    # Build a Batch for the given batch file (a path String or a File) and
    # evaluate the file's contents in a Context bound to that batch.
    #
    # NOTE: the file is executed as Ruby code via eval, so it must come
    # from a trusted location.
    def parse(batch_file)
      batch_file = batch_file.path if batch_file.is_a?(File)
      batch = ETL::Batch::Batch.new(batch_file)
      # Read the file verbatim. Joining IO.readlines with "\n" doubled every
      # newline, so line numbers reported for errors in the batch file were
      # wrong.
      eval(File.read(batch_file), Context.create(batch), batch_file)
      batch
    end

    # Map the supported argument types onto a Batch instance
    def do_resolve(batch)
      case batch
      when String, File
        # parse accepts a path directly; wrapping the String in File.new
        # only opened a handle that was never closed
        parse(batch)
      when ETL::Batch::Batch
        batch
      else
        raise RuntimeError, "Batch must be a String, File or Batch object"
      end
    end
  end

  # Create a batch for the given batch file path
  def initialize(file)
    @file = file
  end

  # Queue a directive that runs the given control file
  def run(file)
    directives << Run.new(self, file)
  end

  # Queue a directive that turns temp tables on. A false/nil value queues
  # nothing: previously the argument was ignored, so use_temp_tables(false)
  # still enabled temp tables.
  def use_temp_tables(value = true)
    directives << UseTempTables.new(self) if value
  end

  # Execute every queued directive in order, bracketed by the
  # before_execute and after_execute hooks
  def execute
    engine.say "Executing batch"
    before_execute
    directives.each do |directive|
      directive.execute
    end
    engine.say "Finishing batch"
    after_execute
    engine.say "Batch complete"
  end

  # The directives queued so far (lazily initialized)
  def directives
    @directives ||= []
  end

  # Hook called before the directives are executed; intentionally empty
  def before_execute

  end

  # Hook called after the directives are executed
  def after_execute
    ETL::Engine.finish # TODO: should be moved to the directive?
    ETL::Engine.use_temp_tables = false # reset the temp tables
  end
end
|
110
|
+
end
|
111
|
+
end
|
@@ -0,0 +1,55 @@
|
|
1
|
+
module ETL #:nodoc:
|
2
|
+
module Batch #:nodoc:
|
3
|
+
# Abstract base class for directives
|
4
|
+
# Abstract base class for batch directives. Subclasses provide do_execute;
# callers go through execute.
class Directive
  # The batch object this directive belongs to
  attr_reader :batch

  # Store the batch the directive was created for
  def initialize(batch)
    @batch = batch
  end

  # Run the directive by delegating to the subclass hook
  def execute
    do_execute
  end

  # Hook implemented by subclasses; the base version always raises
  # RuntimeError
  def do_execute
    raise "Directive must implement do_execute method"
  end
  protected :do_execute
end
|
24
|
+
|
25
|
+
# Directive indicating that the specified ETL control file should be
|
26
|
+
# run
|
27
|
+
# Directive indicating that the specified ETL control file should be run
class Run < Directive
  # The file to execute
  attr_reader :file

  # Set up the directive for the given batch object and file
  def initialize(batch, file)
    super(batch)
    @file = file
  end

  # Hand the file to the batch's engine for processing
  def do_execute
    engine = batch.engine
    engine.process(file)
  end
  protected :do_execute
end
|
43
|
+
|
44
|
+
# Directive indicating temp tables should be used.
|
45
|
+
# Directive indicating temp tables should be used. Construction is inherited
# unchanged from Directive (a single batch argument).
class UseTempTables < Directive
  # Switch temp tables on at the engine level
  def do_execute
    ETL::Engine.use_temp_tables = true
  end
  protected :do_execute
end
|
54
|
+
end
|
55
|
+
end
|