activewarehouse-etl 0.8.4 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +98 -62
- data/Rakefile +11 -0
- data/TODO +2 -1
- data/lib/etl.rb +9 -0
- data/lib/etl/batch.rb +2 -0
- data/lib/etl/batch/batch.rb +111 -0
- data/lib/etl/batch/directives.rb +55 -0
- data/lib/etl/builder.rb +1 -0
- data/lib/etl/builder/date_dimension_builder.rb +83 -0
- data/lib/etl/commands/etl.rb +56 -43
- data/lib/etl/control/control.rb +58 -9
- data/lib/etl/control/destination.rb +29 -4
- data/lib/etl/control/destination/database_destination.rb +17 -27
- data/lib/etl/control/source/database_source.rb +17 -40
- data/lib/etl/control/source/file_source.rb +8 -5
- data/lib/etl/control/source/model_source.rb +39 -0
- data/lib/etl/core_ext.rb +1 -0
- data/lib/etl/core_ext/time.rb +5 -0
- data/lib/etl/core_ext/time/calculations.rb +40 -0
- data/lib/etl/engine.rb +184 -83
- data/lib/etl/execution.rb +1 -0
- data/lib/etl/execution/base.rb +1 -1
- data/lib/etl/execution/batch.rb +8 -0
- data/lib/etl/execution/job.rb +1 -0
- data/lib/etl/execution/migration.rb +16 -4
- data/lib/etl/generator/surrogate_key_generator.rb +20 -4
- data/lib/etl/http_tools.rb +1 -1
- data/lib/etl/processor/bulk_import_processor.rb +16 -19
- data/lib/etl/processor/check_exist_processor.rb +16 -7
- data/lib/etl/processor/hierarchy_exploder_processor.rb +2 -1
- data/lib/etl/processor/require_non_blank_processor.rb +26 -0
- data/lib/etl/processor/surrogate_key_processor.rb +22 -2
- data/lib/etl/processor/truncate_processor.rb +13 -13
- data/lib/etl/screen.rb +14 -0
- data/lib/etl/screen/row_count_screen.rb +7 -2
- data/lib/etl/transform/foreign_key_lookup_transform.rb +15 -5
- data/lib/etl/transform/hierarchy_lookup_transform.rb +7 -14
- data/lib/etl/util.rb +59 -0
- data/lib/etl/version.rb +2 -2
- metadata +19 -2
data/CHANGELOG
CHANGED
@@ -3,29 +3,38 @@
|
|
3
3
|
|
4
4
|
0.2.0 - Dec 7, 2006
|
5
5
|
* Added an XML parser for source parsing
|
6
|
-
* Added support for compound key constraints in destinations via the
|
6
|
+
* Added support for compound key constraints in destinations via the
|
7
|
+
:unique => [] option
|
7
8
|
* Added ability to declare explicit columns in bulk import
|
8
9
|
* Added support for generators in destinations
|
9
|
-
* Added a SurrogateKeyGenerator for cases where the database doesn't support
|
10
|
+
* Added a SurrogateKeyGenerator for cases where the database doesn't support
|
11
|
+
auto generated surrogate keys
|
10
12
|
|
11
13
|
0.3.0 - Dec 19, 2006
|
12
14
|
* Added support for calculated values in virtual fields with Proc
|
13
15
|
|
14
16
|
0.4.0 - Jan 11, 2007
|
15
|
-
* Added :skip_lines option to file source configurations, which can be used
|
16
|
-
|
17
|
-
* Added
|
18
|
-
|
17
|
+
* Added :skip_lines option to file source configurations, which can be used
|
18
|
+
to skip the first n lines in the source data file
|
19
|
+
* Added better error handling in delimited parser - an error is now raised
|
20
|
+
if the expected and actual field lengths do not match
|
21
|
+
* Added :truncate option for database destination. Set to true to truncate
|
22
|
+
before importing data.
|
23
|
+
* Added support for :unique => [] option and virtual fields for the database
|
24
|
+
destination
|
19
25
|
|
20
26
|
0.5.0 - Feb 17, 2007
|
21
|
-
* Changed require_gem to gem and added alias to allow for older versions of
|
22
|
-
|
23
|
-
|
27
|
+
* Changed require_gem to gem and added alias to allow for older versions of
|
28
|
+
rubygems.
|
29
|
+
* Added support for Hash in the source configuration where :name => :parser_name
|
30
|
+
defines the parser to use and :options => {} defines options to pass to the
|
31
|
+
parser.
|
24
32
|
* Added support for passing a custom Parser class in the source configuration.
|
25
33
|
* Removed the need to include Enumerable in each parser implementation.
|
26
34
|
* Added new date_to_string and string_to_date transformers.
|
27
35
|
* Implemented foreign_key_lookup transform including an ActiveRecordResolver.
|
28
|
-
* Added real time activity logging which is called when the etl bin script is
|
36
|
+
* Added real time activity logging which is called when the etl bin script is
|
37
|
+
invoked.
|
29
38
|
* Improved error handling.
|
30
39
|
* Default logger level is now WARN.
|
31
40
|
|
@@ -40,29 +49,36 @@
|
|
40
49
|
0.6.0 - Mar 8, 2007
|
41
50
|
* Fixed missing method problem in validate in Control class.
|
42
51
|
* Removed control validation for now (source could be code in the control file).
|
43
|
-
* Transform interface now defined as taking 3 arguments, the field name, field
|
44
|
-
is not backwards compatible.
|
52
|
+
* Transform interface now defined as taking 3 arguments, the field name, field
|
53
|
+
value and the row. This is not backwards compatible.
|
45
54
|
* Added HierarchyLookupTransform.
|
46
|
-
* Added DefaultTransform which will return a specified value if the initial
|
55
|
+
* Added DefaultTransform which will return a specified value if the initial
|
56
|
+
value is blank.
|
47
57
|
* Added row-level processing.
|
48
|
-
* Added HierarchyExploderProcessor which takes a single hierarchy row and
|
49
|
-
as used in a hierarchy bridge.
|
50
|
-
* Added ApacheCombinedLogParser which parses Apache Combined Log format,
|
58
|
+
* Added HierarchyExploderProcessor which takes a single hierarchy row and
|
59
|
+
explodes it to multiple rows as used in a hierarchy bridge.
|
60
|
+
* Added ApacheCombinedLogParser which parses Apache Combined Log format,
|
61
|
+
including parsing of the
|
51
62
|
user agent string and the URI, returning a Hash.
|
52
|
-
* Fixed bug in SAX parser so that attributes are now set when the start_element
|
53
|
-
|
54
|
-
*
|
63
|
+
* Fixed bug in SAX parser so that attributes are now set when the start_element
|
64
|
+
event is received.
|
65
|
+
* Added an HttpTools module which provides some parsing methods (for user agent
|
66
|
+
and URI).
|
67
|
+
* Database source now uses its own class for establishing an ActiveRecord
|
68
|
+
connection.
|
55
69
|
* Log files are now timestamped.
|
56
70
|
* Source files are now archived automatically during the extraction process
|
57
|
-
* Added a :condition option to the destination configuration Hash that accepts
|
58
|
-
argument passed to it (the row).
|
59
|
-
* Added an :append_rows option to the destination configuration Hash that
|
60
|
-
append a single row) or an Array of Hashes (to
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
71
|
+
* Added a :condition option to the destination configuration Hash that accepts
|
72
|
+
a Proc with a single argument passed to it (the row).
|
73
|
+
* Added an :append_rows option to the destination configuration Hash that
|
74
|
+
accepts either a Hash (to append a single row) or an Array of Hashes (to
|
75
|
+
append multiple rows).
|
76
|
+
* Only print the read and written row counts if there is at least one source
|
77
|
+
and one destination respectively.
|
78
|
+
* Added a depends_on directive that accepts a list of arguments of either strings
|
79
|
+
or symbols. Each symbol is converted to a string and .ctl is appended;
|
80
|
+
strings are passed through directly. The dependencies are executed in the order
|
81
|
+
they are specified.
|
66
82
|
* The default field separator in the bulk loader is now a comma (was a tab).
|
67
83
|
|
68
84
|
0.6.1 - Mar 22, 2007
|
@@ -70,31 +86,36 @@
|
|
70
86
|
* Added CopyFieldProcessor
|
71
87
|
|
72
88
|
0.7 - Apr 8, 2007
|
73
|
-
* Job execution is now tracked in a database. This means that ActiveRecord is
|
74
|
-
of the sources being used in the ETL scripts. An example
|
75
|
-
be found in test/database.example.yml.
|
76
|
-
|
77
|
-
etl
|
89
|
+
* Job execution is now tracked in a database. This means that ActiveRecord is
|
90
|
+
required regardless of the sources being used in the ETL scripts. An example
|
91
|
+
database configuration for the etl can be found in test/database.example.yml.
|
92
|
+
This file is loaded from either a.) the current working directory or b.) the
|
93
|
+
location specified using the -c command line argument when running the etl
|
94
|
+
command.
|
78
95
|
* etl script now supports the following command line arguments:
|
79
96
|
** -h or --help: Prints the usage
|
80
|
-
** -l or --limit: Specifies a limit for the number of source rows to read,
|
81
|
-
your control files before executing a full ETL process
|
82
|
-
** -o or --offset: Specifies a start offset for reading from the source, useful
|
83
|
-
control files before executing a full ETL process
|
84
|
-
** -c or --config: Specify the database.yml file to configure the ETL
|
97
|
+
** -l or --limit: Specifies a limit for the number of source rows to read,
|
98
|
+
useful for testing your control files before executing a full ETL process
|
99
|
+
** -o or --offset: Specifies a start offset for reading from the source, useful
|
100
|
+
for testing your control files before executing a full ETL process
|
101
|
+
** -c or --config: Specify the database.yml file to configure the ETL
|
102
|
+
execution data store
|
85
103
|
** -n or --newlog: Write to the logfile rather than appending to it
|
86
|
-
* Database source now supports specifying the select, join and order parts of
|
87
|
-
|
104
|
+
* Database source now supports specifying the select, join and order parts of
|
105
|
+
the query.
|
106
|
+
* Database source understands the limit argument specified on the etl command
|
107
|
+
line
|
88
108
|
* Added CheckExistProcessor
|
89
109
|
* Added CheckUniqueProcessor
|
90
|
-
* Added SurrogateKeyProcessor. The SurrogateKey processor should be used in
|
91
|
-
CheckExistProcessor and CheckUniqueProcessor to provide
|
110
|
+
* Added SurrogateKeyProcessor. The SurrogateKey processor should be used in
|
111
|
+
conjunction with the CheckExistProcessor and CheckUniqueProcessor to provide
|
112
|
+
surrogate keys for all dimension records.
|
92
113
|
* Added SequenceProcessor
|
93
114
|
* Added OrdinalizeTransform
|
94
115
|
* Fixed a bug in the trim transform
|
95
|
-
* Sources now provide a trigger file which can be used to indicate that the
|
96
|
-
data has been completely extracted to the local file system.
|
97
|
-
recover from a failed ETL process.
|
116
|
+
* Sources now provide a trigger file which can be used to indicate that the
|
117
|
+
original source data has been completely extracted to the local file system.
|
118
|
+
This is useful if you need to recover from a failed ETL process.
|
98
119
|
* Updated README
|
99
120
|
|
100
121
|
0.7.1 - Apr 8, 2007
|
@@ -105,27 +126,30 @@
|
|
105
126
|
|
106
127
|
0.8.0 - Apr 12, 2007
|
107
128
|
* Source now available through the current row source accessor.
|
108
|
-
* Added new_rows_only configuration option to DatabaseSource. A date field must
|
109
|
-
only records that are greater than the date value in that
|
129
|
+
* Added new_rows_only configuration option to DatabaseSource. A date field must
|
130
|
+
be specified and only records that are greater than the date value in that
|
131
|
+
field, relative to the last successful
|
110
132
|
execution, will be returned from the source.
|
111
|
-
* Added an (untested) count feature which returns the number of rows for
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
133
|
+
* Added an (untested) count feature which returns the number of rows for
|
134
|
+
processing.
|
135
|
+
* If no natural key is defined then an empty array will now be used, resulting
|
136
|
+
in the row being written to the output without going through change checks.
|
137
|
+
* Mapping argument in destination is now optional. An empty hash will be used
|
138
|
+
if the mapping hash is not specified. If the mapping hash is not specified
|
139
|
+
then the order will be determined using the originating source's order.
|
140
|
+
* ActiveRecord configurations loaded from database.yml by the etl tool will be
|
141
|
+
merged with ActiveRecord::Base.configurations.
|
119
142
|
* Fixed several bugs in how record change detection was implemented.
|
120
|
-
* Fixed how the read_locally functionality was implemented so that it will find
|
121
|
-
completed local source copy using the source's trigger file (untested).
|
143
|
+
* Fixed how the read_locally functionality was implemented so that it will find
|
144
|
+
the last completed local source copy using the source's trigger file (untested).
|
122
145
|
|
123
146
|
0.8.1 - Apr 12, 2007
|
124
147
|
* Added EnumerableSource
|
125
|
-
* Added :type configuration option to the source directive, allowing the source
|
126
|
-
explicitly specified. The source type can be a string or symbol
|
127
|
-
be constructed by appending Source to the type
|
128
|
-
|
148
|
+
* Added :type configuration option to the source directive, allowing the source
|
149
|
+
type to be explicitly specified. The source type can be a string or symbol
|
150
|
+
(in which case the class will be constructed by appending Source to the type
|
151
|
+
name), a class (which will be instantiated and passed the control,
|
152
|
+
configuration and mapping) and finally an actual Source instance.
|
129
153
|
|
130
154
|
0.8.2 - April 15, 2007
|
131
155
|
* Fixed bug with premature destination closing.
|
@@ -139,4 +163,16 @@
|
|
139
163
|
* Added patches from Andy Triboletti
|
140
164
|
|
141
165
|
0.8.4 - May 24, 2007
|
142
|
-
* Added fix for backslash in file writer
|
166
|
+
* Added fix for backslash in file writer
|
167
|
+
|
168
|
+
0.9.0 -
|
169
|
+
* Added support for batch processing through .ebf files. These files are
|
170
|
+
essentially control files that apply settings to an entire ETL process.
|
171
|
+
* Implemented support for screen blocks. These blocks can be used to test
|
172
|
+
the data and raise an error if the screens do not pass.
|
173
|
+
* Connections are now cached in a Hash available through
|
174
|
+
ETL::Engine.connection(name). This should be used rather than including
|
175
|
+
connection information in the control files.
|
176
|
+
* Implemented temp table support throughout.
|
177
|
+
* DateDimensionBuilder now included in ActiveWarehouse ETL directly.
|
178
|
+
* Time calculations for fiscal year now included in ActiveWarehouse ETL.
|
data/Rakefile
CHANGED
@@ -38,6 +38,17 @@ Rake::RDocTask.new(:rdoc) do |rdoc|
|
|
38
38
|
rdoc.rdoc_files.include('lib/**/*.rb')
|
39
39
|
end
|
40
40
|
|
41
|
+
namespace :rcov do
  desc 'Measures test coverage'
  # Runs every test/*_test.rb file under rcov, aggregating the results into
  # coverage.data, and opens the HTML report afterwards on Mac OS X.
  task :test do
    # Start from a clean aggregate so earlier runs do not skew the numbers
    rm_f 'coverage.data'
    mkdir 'coverage' unless File.exist?('coverage')
    rcov = "rcov --aggregate coverage.data --text-summary -Ilib"
    system("#{rcov} test/*_test.rb")
    # RUBY_PLATFORM replaces the PLATFORM constant, which was removed in
    # Ruby 1.9 and would raise a NameError here.
    system("open coverage/index.html") if RUBY_PLATFORM =~ /darwin/
  end
end
|
51
|
+
|
41
52
|
PKG_FILES = FileList[
|
42
53
|
'CHANGELOG',
|
43
54
|
'LICENSE',
|
data/TODO
CHANGED
@@ -7,8 +7,9 @@ TODO
|
|
7
7
|
** Don't die completely if a parse error occurs; just stop processing that specific file if the error threshold is reached
|
8
8
|
** Allow mismatch row length error in delimited parser to be ignored
|
9
9
|
* Improve error messages throughout, but especially in problems with the control files
|
10
|
-
* Add support for determining if a row should be added, updated or removed vs. just blindly inserting
|
11
10
|
* Add support for partitioned views during the insert process. Use specifiable columns as the trigger columns for determining the data output destination.
|
11
|
+
* Check if a temp table exists and the last job run was successful, in which case skip during the current run
|
12
|
+
* Create models for each of the tables in each of the databases defined in ETL::Engine.connections
|
12
13
|
|
13
14
|
Audit Record
|
14
15
|
|
data/lib/etl.rb
CHANGED
@@ -57,15 +57,20 @@ end
|
|
57
57
|
|
58
58
|
$:.unshift(File.dirname(__FILE__))
|
59
59
|
|
60
|
+
require 'etl/core_ext'
|
61
|
+
require 'etl/util'
|
60
62
|
require 'etl/http_tools'
|
63
|
+
require 'etl/builder'
|
61
64
|
require 'etl/version'
|
62
65
|
require 'etl/engine'
|
63
66
|
require 'etl/control'
|
67
|
+
require 'etl/batch'
|
64
68
|
require 'etl/row'
|
65
69
|
require 'etl/parser'
|
66
70
|
require 'etl/transform'
|
67
71
|
require 'etl/processor'
|
68
72
|
require 'etl/generator'
|
73
|
+
require 'etl/screen'
|
69
74
|
|
70
75
|
module ETL #:nodoc:
|
71
76
|
class ETLError < StandardError #:nodoc:
|
@@ -78,4 +83,8 @@ module ETL #:nodoc:
|
|
78
83
|
end
|
79
84
|
class ResolverError < ETLError #:nodoc:
|
80
85
|
end
|
86
|
+
class ScreenError < ETLError #:nodoc:
|
87
|
+
end
|
88
|
+
class FatalScreenError < ScreenError #:nodoc:
|
89
|
+
end
|
81
90
|
end
|
data/lib/etl/batch/batch.rb
ADDED
@@ -0,0 +1,111 @@
|
|
1
|
+
module ETL #:nodoc:
|
2
|
+
module Batch
|
3
|
+
# Evaluation context for batch files. An instance wraps a batch object and
# forwards the directive-style calls (+run+, +use_temp_tables+) to it; the
# instance's binding is what a batch file gets evaluated against.
class Context
  # The batch object that receives the forwarded calls
  attr_reader :batch

  # Build a context around +batch+ and return that context's binding,
  # suitable for passing to +eval+.
  def self.create(batch)
    context = Context.new(batch)
    context.get_binding
  end

  # Wrap the given batch object
  def initialize(batch)
    @batch = batch
  end

  # The batch file, as reported by the wrapped batch
  def file
    batch.file
  end

  # The binding of this context instance
  def get_binding
    binding
  end

  # Register +file+ with the batch. The name is appended to the directory
  # of the batch file, separated by a "/".
  def run(file)
    batch_dir = File.dirname(self.file)
    batch.run(batch_dir + "/" + file)
  end

  # Forward the temp table setting to the batch
  def use_temp_tables(value=true)
    batch.use_temp_tables(value)
  end
end
|
34
|
+
# A batch of ETL work described by a batch file. Evaluating the file
# collects directives, which #execute then runs in the order in which they
# were declared.
class Batch
  # Path of the batch file this batch was created from
  attr_accessor :file
  # Engine used while executing the batch; assigned by Batch.resolve
  attr_accessor :engine

  class << self
    # Resolve the given object to an ETL::Batch::Batch instance and attach
    # +engine+ to it. Acceptable values for +batch+ are:
    # * The path to a batch file as a String
    # * A File object referencing the batch file
    # * An ETL::Batch::Batch object (which will just be returned)
    #
    # Raises a RuntimeError if any other type is given.
    def resolve(batch, engine)
      batch = do_resolve(batch)
      batch.engine = engine
      batch
    end

    protected
    # Create a batch for +batch_file+ (a path String or a File) and
    # evaluate the file's contents against a Context wrapping that batch.
    # Returns the populated batch.
    def parse(batch_file)
      batch_file = batch_file.path if batch_file.instance_of?(File)
      batch = new(batch_file)
      # The file is read verbatim so that line numbers reported for errors
      # raised during evaluation match the lines of the batch file.
      # NOTE(review): eval executes the file as Ruby code; batch files must
      # come from a trusted location.
      eval(File.read(batch_file), Context.create(batch), batch_file)
      batch
    end

    # Map the supported argument types onto a Batch instance
    def do_resolve(batch)
      case batch
      when String, File
        # parse accepts a path directly, so no File handle is opened (and
        # left unclosed) just to obtain the path
        parse(batch)
      when ETL::Batch::Batch
        batch
      else
        raise RuntimeError, "Batch must be a String, File or Batch object"
      end
    end
  end

  # Initialize the batch with the path to its batch file
  def initialize(file)
    @file = file
  end

  # Queue a directive that runs the given control file. Returns the
  # directive list.
  def run(file)
    directives << Run.new(self, file)
  end

  # Queue a directive that switches temp tables on. When +value+ is false
  # (or nil) nothing is queued, so the setting is left untouched. Returns
  # the directive list.
  def use_temp_tables(value = true)
    directives << UseTempTables.new(self) if value
    directives
  end

  # Execute every queued directive in declaration order, reporting
  # progress through the engine and calling the before/after hooks.
  def execute
    engine.say "Executing batch"
    before_execute
    directives.each do |directive|
      directive.execute
    end
    engine.say "Finishing batch"
    after_execute
    engine.say "Batch complete"
  end

  # The directives queued so far (created lazily)
  def directives
    @directives ||= []
  end

  # Hook invoked before the directives run; does nothing
  def before_execute

  end

  # Hook invoked after the directives have run
  def after_execute
    ETL::Engine.finish # TODO: should be moved to the directive?
    ETL::Engine.use_temp_tables = false # reset the temp tables
  end
end
|
110
|
+
end
|
111
|
+
end
|
data/lib/etl/batch/directives.rb
ADDED
@@ -0,0 +1,55 @@
|
|
1
|
+
module ETL #:nodoc:
|
2
|
+
module Batch #:nodoc:
|
3
|
+
# Base class for batch directives. Subclasses supply the actual work by
# overriding the protected +do_execute+ hook.
class Directive
  # The batch object this directive was created for
  attr_reader :batch

  # Create a directive bound to the given batch object
  def initialize(batch)
    @batch = batch
  end

  # Run the directive by invoking the +do_execute+ hook
  def execute
    do_execute
  end

  protected

  # Hook for subclasses. The base implementation always raises a
  # RuntimeError.
  def do_execute
    raise "Directive must implement do_execute method"
  end
end
|
24
|
+
|
25
|
+
# Directive that hands a single ETL control file to the batch's engine for
# processing.
class Run < Directive
  # The file passed to the engine when the directive executes
  attr_reader :file

  # Create the directive for the given batch object and file
  def initialize(batch, file)
    super(batch)
    @file = file
  end

  protected

  # Ask the batch's engine to process the file
  def do_execute
    engine = batch.engine
    engine.process(file)
  end
end
|
43
|
+
|
44
|
+
# Directive that turns on temp table usage in ETL::Engine when executed.
# Construction is inherited unchanged from Directive (a single batch
# argument).
class UseTempTables < Directive
  protected

  # Switch the engine-wide temp table flag on
  def do_execute
    ETL::Engine.use_temp_tables = true
  end
end
|
54
|
+
end
|
55
|
+
end
|