activewarehouse-etl 0.6.1 → 0.7.0
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +29 -1
- data/LICENSE +7 -0
- data/README +58 -12
- data/Rakefile +2 -1
- data/lib/etl.rb +3 -0
- data/lib/etl/commands/etl.rb +35 -1
- data/lib/etl/control/control.rb +20 -9
- data/lib/etl/control/destination.rb +173 -12
- data/lib/etl/control/destination/database_destination.rb +2 -2
- data/lib/etl/control/destination/file_destination.rb +25 -2
- data/lib/etl/control/source.rb +29 -8
- data/lib/etl/control/source/database_source.rb +109 -24
- data/lib/etl/control/source/file_source.rb +29 -16
- data/lib/etl/engine.rb +164 -63
- data/lib/etl/execution.rb +19 -0
- data/lib/etl/execution/base.rb +9 -0
- data/lib/etl/execution/job.rb +7 -0
- data/lib/etl/execution/migration.rb +54 -0
- data/lib/etl/execution/record.rb +8 -0
- data/lib/etl/generator/surrogate_key_generator.rb +2 -0
- data/lib/etl/parser.rb +9 -0
- data/lib/etl/parser/parser.rb +5 -2
- data/lib/etl/parser/sax_parser.rb +22 -6
- data/lib/etl/processor.rb +8 -0
- data/lib/etl/processor/bulk_import_processor.rb +32 -4
- data/lib/etl/processor/check_exist_processor.rb +69 -0
- data/lib/etl/processor/check_unique_processor.rb +35 -0
- data/lib/etl/processor/copy_field_processor.rb +20 -4
- data/lib/etl/processor/processor.rb +3 -0
- data/lib/etl/processor/rename_processor.rb +24 -0
- data/lib/etl/processor/row_processor.rb +1 -1
- data/lib/etl/processor/sequence_processor.rb +23 -0
- data/lib/etl/processor/surrogate_key_processor.rb +31 -0
- data/lib/etl/processor/truncate_processor.rb +0 -2
- data/lib/etl/row.rb +17 -0
- data/lib/etl/screen/row_count_screen.rb +15 -0
- data/lib/etl/transform/block_transform.rb +13 -0
- data/lib/etl/transform/date_to_string_transform.rb +1 -1
- data/lib/etl/transform/decode_transform.rb +1 -1
- data/lib/etl/transform/default_transform.rb +6 -1
- data/lib/etl/transform/foreign_key_lookup_transform.rb +1 -1
- data/lib/etl/transform/hierarchy_lookup_transform.rb +1 -1
- data/lib/etl/transform/ordinalize_transform.rb +12 -0
- data/lib/etl/transform/sha1_transform.rb +0 -3
- data/lib/etl/transform/string_to_date_transform.rb +0 -3
- data/lib/etl/transform/string_to_datetime_transform.rb +0 -3
- data/lib/etl/transform/string_to_time_transform.rb +0 -3
- data/lib/etl/transform/transform.rb +20 -11
- data/lib/etl/transform/trim_transform.rb +26 -0
- data/lib/etl/transform/type_transform.rb +9 -1
- data/lib/etl/version.rb +2 -2
- metadata +21 -3
data/CHANGELOG
CHANGED
@@ -67,4 +67,32 @@
|
|
67
67
|
|
68
68
|
0.6.1 - Mar 22, 2007
|
69
69
|
* Added support for absolute paths in file sources
|
70
|
-
* Added CopyFieldProcessor
|
70
|
+
* Added CopyFieldProcessor
|
71
|
+
|
72
|
+
0.7 - Apr 8, 2007
|
73
|
+
* Job execution is now tracked in a database. This means that ActiveRecord is required regardless
|
74
|
+
of the sources being used in the ETL scripts. An example database configuration for the etl can
|
75
|
+
be found in test/database.example.yml. This file is loaded from either a.) the current working
|
76
|
+
directory or b.) the location specified using the -c command line argument when running the
|
77
|
+
etl command.
|
78
|
+
* etl script now supports the following command line arguments:
|
79
|
+
** -h or --help: Prints the usage
|
80
|
+
** -l or --limit: Specifies a limit for the number of source rows to read, useful for testing
|
81
|
+
your control files before executing a full ETL process
|
82
|
+
** -o or --offset: Specifies a start offset for reading from the source, useful for testing your
|
83
|
+
control files before executing a full ETL process
|
84
|
+
** -c or --config: Specify the database.yml file to configure the ETL execution data store
|
85
|
+
** -n or --newlog: Write to the logfile rather than appending to it
|
86
|
+
* Database source now supports specifying the select, join and order parts of the query.
|
87
|
+
* Database source understands the limit argument specified on the etl command line
|
88
|
+
* Added CheckExistProcessor
|
89
|
+
* Added CheckUniqueProcessor
|
90
|
+
* Added SurrogateKeyProcessor. The SurrogateKey processor should be used in conjunction with the
|
91
|
+
CheckExistProcessor and CheckUniqueProcessor to provide
|
92
|
+
* Added SequenceProcessor
|
93
|
+
* Added OrdinalizeTransform
|
94
|
+
* Fixed a bug in the trim transform
|
95
|
+
* Sources now provide a trigger file which can be used to indicate that the original source
|
96
|
+
data has been completely extracted to the local file system. This is useful if you need to
|
97
|
+
recover from a failed ETL process.
|
98
|
+
* Updated README
|
data/LICENSE
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
Copyright (c) 2006-2007 Anthony Eden
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
4
|
+
|
5
|
+
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
6
|
+
|
7
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README
CHANGED
@@ -1,27 +1,73 @@
|
|
1
|
-
Ruby ETL tool.
|
1
|
+
Ruby Extract-Transform-Load (ETL) tool.
|
2
2
|
|
3
3
|
== Features
|
4
4
|
|
5
5
|
Current supported features:
|
6
6
|
|
7
7
|
* ETL Domain Specific Language (DSL) - Control files are specified in a Ruby-based DSL
|
8
|
-
* Multiple source types
|
8
|
+
* Multiple source types. Current supported types:
|
9
|
+
* Fixed-width and delimited text files
|
10
|
+
* XML files through SAX
|
11
|
+
* Apache combined log format
|
9
12
|
* Multiple destination types - file and database destinations
|
10
|
-
* Support for extracting from multiple sources
|
11
|
-
* Support for
|
12
|
-
*
|
13
|
-
*
|
13
|
+
* Support for extracting from multiple sources in a single job
|
14
|
+
* Support for writing to multiple destinations in a single job
|
15
|
+
* A variety of built-in transformations are included:
|
16
|
+
* Date-to-string, string-to-date, string-to-datetime, string-to-timestamp
|
17
|
+
* Type transformation supporting strings, integers, floats and big decimals
|
18
|
+
* Trim
|
19
|
+
* SHA-1
|
20
|
+
* Decode from an external decode file
|
21
|
+
* Default replacement for empty values
|
22
|
+
* Ordinalize
|
23
|
+
* Hierarchy lookup
|
24
|
+
* Foreign key lookup
|
25
|
+
* Ruby blocks
|
26
|
+
* Any custom transformation class
|
27
|
+
* A variety of built-in row-level processors
|
28
|
+
* Check exists processor to determine if the record already exists in the destination database
|
29
|
+
* Check unique processor to determine whether a matching record was processed during this job execution
|
30
|
+
* Copy field
|
31
|
+
* Rename field
|
32
|
+
* Hierarchy exploder which takes a tree structure defined through a parent id and explodes it into a hierarchy bridge table
|
33
|
+
* Surrogate key generator including support for looking up the last surrogate key from the target table using a custom query
|
34
|
+
* Sequence generator including support for context-sensitive sequences where the context can be defined as a combination of fields from the source data
|
35
|
+
* New row-level processors can easily be defined and applied
|
36
|
+
* Pre-processing
|
37
|
+
* Truncate processor
|
38
|
+
* Post-processing
|
39
|
+
* Bulk import using native RDBMS bulk loader tools
|
14
40
|
* Virtual fields - Add a field to the destination data which doesn't exist in the source data
|
41
|
+
* Built in job and record meta data
|
42
|
+
* Support for type 1 and type 2 slowly changing dimensions
|
43
|
+
* Automated effective date and end date time stamping for type 2
|
44
|
+
* CRC checking
|
15
45
|
|
16
|
-
|
17
|
-
|
18
|
-
|
46
|
+
== Dependencies
|
47
|
+
ActiveWarehouse ETL depends on the following gems:
|
19
48
|
* ActiveSupport Gem
|
20
49
|
* ActiveRecord Gem
|
21
50
|
* FasterCSV Gem
|
51
|
+
* AdapterExtensions Gem
|
52
|
+
|
53
|
+
== Usage
|
54
|
+
Once the ActiveWarehouse ETL gem is installed jobs can be invoked using the
|
55
|
+
included `etl` script. The etl script includes several command line options
|
56
|
+
and can process multiple control files at a time.
|
57
|
+
|
58
|
+
Command line options:
|
59
|
+
* <tt>--help, -h</tt>: Display the usage message.
|
60
|
+
* <tt>--config, -c</tt>: Specify a database.yml configuration file to use.
|
61
|
+
* <tt>--limit, -l</tt>: Specify a limit to the number of rows to process. This option is currently only applicable to database sources.
|
62
|
+
* <tt>--offset, -o</tt>: Specify the start offset for reading from the source. This option is currently only applicable to database sources.
|
63
|
+
* <tt>--newlog, -n</tt>: Instruct the engine to create a new ETL log rather than append to the last ETL log.
|
64
|
+
* <tt>--skip-bulk-import, -s</tt>: Skip any bulk imports.
|
65
|
+
* <tt>--read-locally</tt>: Read from the local cache (skip source extraction)
|
22
66
|
|
23
|
-
== Examples
|
24
|
-
|
67
|
+
== Control File Examples
|
68
|
+
Control file examples can be found in the test directory.
|
25
69
|
|
26
70
|
== Feedback
|
27
|
-
This is a work in progress. Comments should be made on the
|
71
|
+
This is a work in progress. Comments should be made on the
|
72
|
+
activewarehouse-discuss mailing list at the moment. Contributions are always
|
73
|
+
welcome.
|
data/Rakefile
CHANGED
@@ -40,6 +40,7 @@ end
|
|
40
40
|
|
41
41
|
PKG_FILES = FileList[
|
42
42
|
'CHANGELOG',
|
43
|
+
'LICENSE',
|
43
44
|
'README',
|
44
45
|
'TODO',
|
45
46
|
'Rakefile',
|
@@ -59,7 +60,7 @@ spec = Gem::Specification.new do |s|
|
|
59
60
|
s.add_dependency('rake', '>= 0.7.1')
|
60
61
|
s.add_dependency('activesupport', '>= 1.3.1')
|
61
62
|
s.add_dependency('activerecord', '>= 1.14.4')
|
62
|
-
s.add_dependency('fastercsv', '>= 1.
|
63
|
+
s.add_dependency('fastercsv', '>= 1.2.0')
|
63
64
|
s.add_dependency('adapter_extensions', '>= 0.1.0')
|
64
65
|
|
65
66
|
s.rdoc_options << '--exclude' << '.'
|
data/lib/etl.rb
CHANGED
@@ -26,6 +26,8 @@
|
|
26
26
|
#++
|
27
27
|
|
28
28
|
require 'logger'
|
29
|
+
require 'yaml'
|
30
|
+
require 'erb'
|
29
31
|
|
30
32
|
require 'rubygems'
|
31
33
|
|
@@ -59,6 +61,7 @@ require 'etl/http_tools'
|
|
59
61
|
require 'etl/version'
|
60
62
|
require 'etl/engine'
|
61
63
|
require 'etl/control'
|
64
|
+
require 'etl/row'
|
62
65
|
require 'etl/parser'
|
63
66
|
require 'etl/transform'
|
64
67
|
require 'etl/processor'
|
data/lib/etl/commands/etl.rb
CHANGED
@@ -22,10 +22,43 @@
|
|
22
22
|
#++
|
23
23
|
|
24
24
|
require 'benchmark'
|
25
|
+
require 'getoptlong'
|
26
|
+
|
27
|
+
opts = GetoptLong.new(
|
28
|
+
[ '--help', '-h', GetoptLong::NO_ARGUMENT ],
|
29
|
+
[ '--config', '-c', GetoptLong::REQUIRED_ARGUMENT ],
|
30
|
+
[ '--limit', '-l', GetoptLong::REQUIRED_ARGUMENT ],
|
31
|
+
[ '--offset', '-o', GetoptLong::REQUIRED_ARGUMENT],
|
32
|
+
[ '--newlog', '-n', GetoptLong::NO_ARGUMENT ],
|
33
|
+
[ '--skip-bulk-import', '-s', GetoptLong::NO_ARGUMENT ],
|
34
|
+
[ '--read-locally', GetoptLong::NO_ARGUMENT]
|
35
|
+
)
|
25
36
|
|
26
37
|
# Print a usage statement
|
27
38
|
def usage #:nodoc:
|
28
|
-
puts "Usage: etl ctl_file [ctl_file2 ctl_file3 ...]"
|
39
|
+
puts "Usage: etl ctl_file [ctl_file2 ctl_file3 ...]" # TODO: add the command line options
|
40
|
+
end
|
41
|
+
|
42
|
+
options = {}
|
43
|
+
opts.each do |opt, arg|
|
44
|
+
case opt
|
45
|
+
when '--help'
|
46
|
+
usage
|
47
|
+
when '--config'
|
48
|
+
options[:config] = arg
|
49
|
+
when '--limit'
|
50
|
+
options[:limit] = arg.to_i
|
51
|
+
when '--offset'
|
52
|
+
options[:offset] = arg.to_i
|
53
|
+
when '--newlog'
|
54
|
+
options[:newlog] = true
|
55
|
+
when '--skip-bulk-import'
|
56
|
+
puts "skip bulk import enabled"
|
57
|
+
options[:skip_bulk_import] = true
|
58
|
+
when '--read-locally'
|
59
|
+
puts "read locally enabled"
|
60
|
+
options[:read_locally] = true
|
61
|
+
end
|
29
62
|
end
|
30
63
|
|
31
64
|
if ARGV.length < 1
|
@@ -33,6 +66,7 @@ if ARGV.length < 1
|
|
33
66
|
else
|
34
67
|
puts "Starting ETL process"
|
35
68
|
|
69
|
+
ETL::Engine.init(options)
|
36
70
|
ARGV.each do |f|
|
37
71
|
puts "Processing #{f}"
|
38
72
|
ETL::Engine.realtime_activity = true
|
data/lib/etl/control/control.rb
CHANGED
@@ -71,14 +71,13 @@ module ETL #:nodoc:
|
|
71
71
|
|
72
72
|
# Define a transform
|
73
73
|
def transform(name, transformer=nil, configuration={}, &block)
|
74
|
-
transforms[name] ||= []
|
75
74
|
if transformer
|
76
75
|
case transformer
|
77
76
|
when String, Symbol
|
78
77
|
class_name = "#{transformer.to_s.classify}Transform"
|
79
78
|
begin
|
80
79
|
transform_class = ETL::Transform.const_get(class_name)
|
81
|
-
transforms
|
80
|
+
transforms << transform_class.new(self, name, configuration)
|
82
81
|
rescue NameError => e
|
83
82
|
raise ControlError, "Unable to find transformer #{class_name}: #{e}"
|
84
83
|
end
|
@@ -86,13 +85,15 @@ module ETL #:nodoc:
|
|
86
85
|
#transformer.class.inspect
|
87
86
|
if transformer.is_a?(ETL::Transform::Transform)
|
88
87
|
Engine.logger.debug "Adding transformer #{transformer.inspect} for field #{name}"
|
89
|
-
|
88
|
+
t = transformer.dup
|
89
|
+
t.name = name
|
90
|
+
transforms << t
|
90
91
|
else
|
91
92
|
raise ControlError, "Transformer must be a String, Symbol or Transform instance"
|
92
93
|
end
|
93
94
|
end
|
94
95
|
elsif block_given?
|
95
|
-
transforms
|
96
|
+
transforms << ETL::Transform::BlockTransform.new(self, name, :block => block)
|
96
97
|
else
|
97
98
|
raise ControlError, "Either a transformer or a block must be specified"
|
98
99
|
end
|
@@ -103,6 +104,16 @@ module ETL #:nodoc:
|
|
103
104
|
control.transforms
|
104
105
|
end
|
105
106
|
|
107
|
+
# Rename the source field to the destination field
|
108
|
+
def rename(source, destination)
|
109
|
+
after_read :rename, :source => source, :dest => destination
|
110
|
+
end
|
111
|
+
|
112
|
+
# Copy the source field to the destination field
|
113
|
+
def copy(source, destination)
|
114
|
+
after_read :copy_field, :source => source, :dest => destination
|
115
|
+
end
|
116
|
+
|
106
117
|
protected
|
107
118
|
# This method is used to define a processor and insert into the specified processor
|
108
119
|
# collection.
|
@@ -244,9 +255,9 @@ module ETL #:nodoc:
|
|
244
255
|
end
|
245
256
|
|
246
257
|
# Get the transforms with the specified name
|
247
|
-
def transform(name)
|
248
|
-
|
249
|
-
|
258
|
+
# def transform(name)
|
259
|
+
# transforms[name] ||= []
|
260
|
+
# end
|
250
261
|
|
251
262
|
def after_read_processors
|
252
263
|
@after_read_processors ||= []
|
@@ -267,9 +278,9 @@ module ETL #:nodoc:
|
|
267
278
|
@post_processors ||= []
|
268
279
|
end
|
269
280
|
|
270
|
-
# Get
|
281
|
+
# Get an Array of all transforms for this control
|
271
282
|
def transforms
|
272
|
-
@transforms ||=
|
283
|
+
@transforms ||= []
|
273
284
|
end
|
274
285
|
|
275
286
|
# Get the error threshold. Defaults to 100.
|
@@ -26,8 +26,8 @@ module ETL #:nodoc:
|
|
26
26
|
class << self
|
27
27
|
# Get the destination class for the specified name.
|
28
28
|
#
|
29
|
-
# For example if name is :database or 'database' then the
|
30
|
-
# is returned
|
29
|
+
# For example if name is :database or 'database' then the
|
30
|
+
# DatabaseDestination class is returned
|
31
31
|
def class_for_name(name)
|
32
32
|
ETL::Control.const_get("#{name.to_s.classify}Destination")
|
33
33
|
end
|
@@ -42,13 +42,14 @@ module ETL #:nodoc:
|
|
42
42
|
#
|
43
43
|
# Options:
|
44
44
|
# * <tt>:buffer_size</tt>: The output buffer size (default 1000 records)
|
45
|
-
# * <tt>:condition</tt>: A conditional proc that must return true for the
|
45
|
+
# * <tt>:condition</tt>: A conditional proc that must return true for the
|
46
|
+
# row to be written
|
46
47
|
# * <tt>:append_rows</tt>: An array of rows to append
|
47
48
|
def initialize(control, configuration, mapping)
|
48
49
|
@control = control
|
49
50
|
@configuration = configuration
|
50
51
|
@mapping = mapping
|
51
|
-
@buffer_size = configuration[:buffer_size] ||=
|
52
|
+
@buffer_size = configuration[:buffer_size] ||= 100
|
52
53
|
@condition = configuration[:condition]
|
53
54
|
@append_rows = configuration[:append_rows]
|
54
55
|
end
|
@@ -61,7 +62,7 @@ module ETL #:nodoc:
|
|
61
62
|
# Write the given row
|
62
63
|
def write(row)
|
63
64
|
if @condition.nil? || @condition.call(row)
|
64
|
-
|
65
|
+
process_change(row)
|
65
66
|
end
|
66
67
|
flush if buffer.length >= buffer_size
|
67
68
|
end
|
@@ -105,8 +106,9 @@ module ETL #:nodoc:
|
|
105
106
|
order
|
106
107
|
end
|
107
108
|
|
108
|
-
# Return true if the row is allowed. The row will not be allowed if the
|
109
|
-
# in the configuration and the compound key
|
109
|
+
# Return true if the row is allowed. The row will not be allowed if the
|
110
|
+
# :unique option is specified in the configuration and the compound key
|
111
|
+
# already exists
|
110
112
|
def row_allowed?(row)
|
111
113
|
if unique
|
112
114
|
key = (unique.collect { |k| row[k] }).join('|')
|
@@ -116,15 +118,160 @@ module ETL #:nodoc:
|
|
116
118
|
return true
|
117
119
|
end
|
118
120
|
|
119
|
-
# Get a hash of compound key contraints. This is used to determine if a
|
120
|
-
# unique option is specified
|
121
|
+
# Get a hash of compound key contraints. This is used to determine if a
|
122
|
+
# row can be written when the unique option is specified
|
121
123
|
def compound_key_constraints
|
122
124
|
@compound_key_constraints ||= {}
|
123
125
|
end
|
124
126
|
|
125
|
-
#
|
126
|
-
#
|
127
|
-
|
127
|
+
# Return fields which are Slowly Changing Dimension fields. Return nil
|
128
|
+
# by default.
|
129
|
+
def scd_fields
|
130
|
+
@scd_fields ||= configuration[:scd_fields]
|
131
|
+
end
|
132
|
+
|
133
|
+
def scd?
|
134
|
+
!configuration[:scd].nil?
|
135
|
+
end
|
136
|
+
|
137
|
+
def scd_type
|
138
|
+
scd? ? configuration[:scd][:type] : nil
|
139
|
+
end
|
140
|
+
|
141
|
+
# Get the Slowly Changing Dimension effective date field. Defaults to
|
142
|
+
# 'effective_date'.
|
143
|
+
def scd_effective_date_field
|
144
|
+
configuration[:scd][:effective_date_field] || :effective_date if scd?
|
145
|
+
end
|
146
|
+
|
147
|
+
# Get the Slowly Changing Dimension end date field. Defaults to
|
148
|
+
# 'end_date'.
|
149
|
+
def scd_end_date_field
|
150
|
+
configuration[:scd][:end_date_field] || :end_date if scd?
|
151
|
+
end
|
152
|
+
|
153
|
+
# Return the natural key field name, defaults to :id
|
154
|
+
def natural_key
|
155
|
+
@natural_key ||= determine_natural_key
|
156
|
+
end
|
157
|
+
|
158
|
+
# Get the dimension table if specified
|
159
|
+
def dimension_table
|
160
|
+
configuration[:scd][:dimension_table] if scd?
|
161
|
+
end
|
162
|
+
|
163
|
+
# Process a row to determine the change type
|
164
|
+
def process_change(row)
|
165
|
+
return unless row
|
166
|
+
|
167
|
+
# Change processing can only occur if the natural key exists in the row
|
168
|
+
supports_change = true
|
169
|
+
natural_key.each do |key|
|
170
|
+
unless row.has_key?(key)
|
171
|
+
buffer << row
|
172
|
+
return
|
173
|
+
end
|
174
|
+
end
|
175
|
+
|
176
|
+
ETL::Engine.logger.debug "checking scd fields"
|
177
|
+
s = String.new
|
178
|
+
if scd_fields
|
179
|
+
scd_fields.each { |f| s << row[f].to_s }
|
180
|
+
else
|
181
|
+
row.each { |key,value| s << value.to_s }
|
182
|
+
end
|
183
|
+
|
184
|
+
# apply the CRC to 's' and see if it matches the last
|
185
|
+
# ETL::Execution::Record with the same natural key. If they match then
|
186
|
+
# throw away this row (no need to process). If they do not match then
|
187
|
+
# the record is an 'update'. If the record doesn't exist then it is an
|
188
|
+
# 'insert'
|
189
|
+
nk = natural_key.collect{|k|row[k]}.join('|')
|
190
|
+
require 'zlib'
|
191
|
+
crc = Zlib.crc32(s)
|
192
|
+
record = ETL::Execution::Record.find_by_control_file_and_natural_key(control.file, nk)
|
193
|
+
|
194
|
+
timestamp = Time.now
|
195
|
+
|
196
|
+
ETL::Engine.logger.debug "checking record change type"
|
197
|
+
if record
|
198
|
+
if record.crc != crc.to_s
|
199
|
+
# SCD Type 1: only the new row should be added
|
200
|
+
# SCD Type 2: both an old and new row should be added
|
201
|
+
# SCD Type 3: not supported
|
202
|
+
ETL::Engine.logger.debug "CRC does not match"
|
203
|
+
|
204
|
+
if scd_type == 2
|
205
|
+
ETL::Engine.logger.debug "type 2 SCD"
|
206
|
+
q = "SELECT * FROM #{dimension_table} WHERE "
|
207
|
+
q << natural_key.collect { |nk| "#{nk} = '#{row[nk]}'" }.join(" AND ")
|
208
|
+
#puts "looking for original record"
|
209
|
+
result = ActiveRecord::Base.connection.select_one(q)
|
210
|
+
if result
|
211
|
+
#puts "Result: #{result.inspect}"
|
212
|
+
original_record = ETL::Row[result.symbolize_keys!]
|
213
|
+
original_record[scd_end_date_field] = timestamp
|
214
|
+
ETL::Engine.logger.debug "writing original record"
|
215
|
+
buffer << original_record
|
216
|
+
end
|
217
|
+
|
218
|
+
row[scd_effective_date_field] = timestamp
|
219
|
+
row[scd_end_date_field] = '9999-12-31 00:00:00'
|
220
|
+
elsif scd_type == 1
|
221
|
+
ETL::Engine.logger.debug "type 1 SCD"
|
222
|
+
else
|
223
|
+
ETL::Engine.logger.debug "SCD not specified"
|
224
|
+
end
|
225
|
+
|
226
|
+
ETL::Engine.logger.debug "writing new record"
|
227
|
+
buffer << row
|
228
|
+
else
|
229
|
+
ETL::Engine.logger.debug "CRC matches, skipping"
|
230
|
+
|
231
|
+
q = "SELECT * FROM #{dimension_table} WHERE "
|
232
|
+
q << natural_key.collect { |nk| "#{nk} = '#{row[nk]}'" }.join(" AND ")
|
233
|
+
result = ActiveRecord::Base.connection.select_one(q)
|
234
|
+
if result
|
235
|
+
buffer << ETL::Row[result.symbolize_keys!]
|
236
|
+
else
|
237
|
+
# The record never made it into the database, so add the effective and end date
|
238
|
+
# and add it into the bulk load file
|
239
|
+
row[scd_effective_date_field] = timestamp
|
240
|
+
row[scd_end_date_field] = '9999-12-31 00:00:00'
|
241
|
+
buffer << row
|
242
|
+
end
|
243
|
+
end
|
244
|
+
else
|
245
|
+
ETL::Engine.logger.debug "record never loaded"
|
246
|
+
# Set the effective and end date fields
|
247
|
+
if scd_type == 2
|
248
|
+
row[scd_effective_date_field] = timestamp
|
249
|
+
row[scd_end_date_field] = '9999-12-31 00:00:00'
|
250
|
+
end
|
251
|
+
|
252
|
+
# Write the row
|
253
|
+
buffer << row
|
254
|
+
|
255
|
+
# Record the record
|
256
|
+
if ETL::Engine.job # only record the execution if there is a job
|
257
|
+
ETL::Execution::Record.create!(
|
258
|
+
:control_file => control.file,
|
259
|
+
:natural_key => nk,
|
260
|
+
:crc => crc,
|
261
|
+
:job_id => ETL::Engine.job.id
|
262
|
+
)
|
263
|
+
end
|
264
|
+
end
|
265
|
+
rescue => e
|
266
|
+
puts e
|
267
|
+
end
|
268
|
+
|
269
|
+
# Add any virtual fields to the row. Virtual rows will get their value
|
270
|
+
# from one of the following:
|
271
|
+
# * If the mapping is a Class, then an object which implements the next
|
272
|
+
# method
|
273
|
+
# * If the mapping is a Symbol, then the XGenerator where X is the
|
274
|
+
# classified symbol
|
128
275
|
# * If the mapping is a Proc, then it will be called with the row
|
129
276
|
# * Otherwise the value itself will be assigned to the field
|
130
277
|
def add_virtuals!(row)
|
@@ -146,6 +293,20 @@ module ETL #:nodoc:
|
|
146
293
|
end
|
147
294
|
end
|
148
295
|
end
|
296
|
+
|
297
|
+
private
|
298
|
+
# Determine the natural key. This method will always return an array
|
299
|
+
# of symbols. The default value is [:id].
|
300
|
+
def determine_natural_key
|
301
|
+
case configuration[:natural_key]
|
302
|
+
when Array
|
303
|
+
configuration[:natural_key].collect(&:to_sym)
|
304
|
+
when String, Symbol
|
305
|
+
[configuration[:natural_key].to_sym]
|
306
|
+
else
|
307
|
+
[:id]
|
308
|
+
end
|
309
|
+
end
|
149
310
|
end
|
150
311
|
end
|
151
312
|
end
|