activewarehouse-etl 0.6.1 → 0.7.0
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +29 -1
- data/LICENSE +7 -0
- data/README +58 -12
- data/Rakefile +2 -1
- data/lib/etl.rb +3 -0
- data/lib/etl/commands/etl.rb +35 -1
- data/lib/etl/control/control.rb +20 -9
- data/lib/etl/control/destination.rb +173 -12
- data/lib/etl/control/destination/database_destination.rb +2 -2
- data/lib/etl/control/destination/file_destination.rb +25 -2
- data/lib/etl/control/source.rb +29 -8
- data/lib/etl/control/source/database_source.rb +109 -24
- data/lib/etl/control/source/file_source.rb +29 -16
- data/lib/etl/engine.rb +164 -63
- data/lib/etl/execution.rb +19 -0
- data/lib/etl/execution/base.rb +9 -0
- data/lib/etl/execution/job.rb +7 -0
- data/lib/etl/execution/migration.rb +54 -0
- data/lib/etl/execution/record.rb +8 -0
- data/lib/etl/generator/surrogate_key_generator.rb +2 -0
- data/lib/etl/parser.rb +9 -0
- data/lib/etl/parser/parser.rb +5 -2
- data/lib/etl/parser/sax_parser.rb +22 -6
- data/lib/etl/processor.rb +8 -0
- data/lib/etl/processor/bulk_import_processor.rb +32 -4
- data/lib/etl/processor/check_exist_processor.rb +69 -0
- data/lib/etl/processor/check_unique_processor.rb +35 -0
- data/lib/etl/processor/copy_field_processor.rb +20 -4
- data/lib/etl/processor/processor.rb +3 -0
- data/lib/etl/processor/rename_processor.rb +24 -0
- data/lib/etl/processor/row_processor.rb +1 -1
- data/lib/etl/processor/sequence_processor.rb +23 -0
- data/lib/etl/processor/surrogate_key_processor.rb +31 -0
- data/lib/etl/processor/truncate_processor.rb +0 -2
- data/lib/etl/row.rb +17 -0
- data/lib/etl/screen/row_count_screen.rb +15 -0
- data/lib/etl/transform/block_transform.rb +13 -0
- data/lib/etl/transform/date_to_string_transform.rb +1 -1
- data/lib/etl/transform/decode_transform.rb +1 -1
- data/lib/etl/transform/default_transform.rb +6 -1
- data/lib/etl/transform/foreign_key_lookup_transform.rb +1 -1
- data/lib/etl/transform/hierarchy_lookup_transform.rb +1 -1
- data/lib/etl/transform/ordinalize_transform.rb +12 -0
- data/lib/etl/transform/sha1_transform.rb +0 -3
- data/lib/etl/transform/string_to_date_transform.rb +0 -3
- data/lib/etl/transform/string_to_datetime_transform.rb +0 -3
- data/lib/etl/transform/string_to_time_transform.rb +0 -3
- data/lib/etl/transform/transform.rb +20 -11
- data/lib/etl/transform/trim_transform.rb +26 -0
- data/lib/etl/transform/type_transform.rb +9 -1
- data/lib/etl/version.rb +2 -2
- metadata +21 -3
@@ -0,0 +1,24 @@
|
|
1
|
+
module ETL #:nodoc:
  module Processor #:nodoc:
    # Row level processor to rename a field in the row.
    #
    # Configuration options:
    # * <tt>:source</tt>: the source field name
    # * <tt>:dest</tt>: The destination field name
    class RenameProcessor < ETL::Processor::RowProcessor
      # Move the value stored under the <tt>:source</tt> field to the
      # <tt>:dest</tt> field, remove the <tt>:source</tt> entry and return
      # the (mutated) row.
      #
      # Numeric and nil values are assigned as-is; any other value is
      # duplicated with +dup+ before being stored under the new name.
      def process(row)
        source = configuration[:source]
        dest = configuration[:dest]

        # Renaming a field to itself is a no-op. Without this guard the
        # delete below would remove the value that was just written.
        return row if source == dest

        source_value = row[source]
        row[dest] = case source_value
                    when Numeric, nil
                      source_value
                    else
                      source_value.dup
                    end
        row.delete(source)
        row
      end
    end
  end
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
module ETL #:nodoc:
  module Processor #:nodoc:
    # Row level processor which writes an incrementing sequence number
    # into each row it processes.
    #
    # Configuration options:
    # * <tt>:context</tt>: A context name. Each distinct context name has its
    #   own counter; if none is specified the counter is kept under the nil
    #   key of this processor instance.
    # * <tt>:dest</tt>: The destination field name
    class SequenceProcessor < ETL::Processor::RowProcessor
      # Advance the counter for the configured context by one, store the new
      # value in the <tt>:dest</tt> field and return the row. The first value
      # handed out for a context is 1.
      def process(row)
        context = configuration[:context]
        counters = sequences
        counters[context] = (counters[context] || 0) + 1
        row[configuration[:dest]] = counters[context]
        row
      end

      protected
      # Lazily created Hash mapping a context name to the last value issued
      def sequences
        @sequences ||= {}
      end
    end
  end
end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
module ETL #:nodoc:
  module Processor #:nodoc:
    # A row level processor that provides surrogate keys
    #
    # Configuration options:
    # * <tt>:query</tt>: Query passed to the ActiveRecord connection's
    #   +select_value+ to obtain the last surrogate key already in use.
    #   A blank result means numbering starts from 0.
    # * <tt>:destination</tt>: The field the key is written to (defaults to :id)
    class SurrogateKeyProcessor < ETL::Processor::RowProcessor
      # The query used to look up the initial surrogate key
      attr_accessor :query
      # The name of the field that receives the surrogate key
      attr_accessor :destination

      # Initialize the surrogate key generator
      def initialize(control, configuration)
        super
        # Previously the accessor was declared but never populated.
        @query = configuration[:query]
        initial_key = ActiveRecord::Base.connection.select_value(@query)
        @surrogate_key = initial_key.blank? ? 0 : initial_key.to_i
        @destination = configuration[:destination] || :id
      end

      # Add a surrogate key to the row. Returns the row, or nil when no row
      # was given (in which case the key counter is left untouched).
      def process(row)
        return unless row

        @surrogate_key += 1
        row[destination] = @surrogate_key
        row
      end
    end
  end
end
|
@@ -3,8 +3,6 @@ module ETL #:nodoc:
|
|
3
3
|
# A processor which will truncate a table. Use as a pre-processor for cleaning out a table
|
4
4
|
# prior to loading
|
5
5
|
class TruncateProcessor < ETL::Processor::Processor
|
6
|
-
#attr_reader :file
|
7
|
-
|
8
6
|
# Defines the table to truncate
|
9
7
|
attr_reader :table
|
10
8
|
|
data/lib/etl/row.rb
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
# This source file contains the ETL::Row class.
|
2
|
+
|
3
|
+
module ETL #:nodoc:
  # This class represents a single row currently passing through the ETL pipeline
  class Row < Hash
    # All change types. Note that this class itself does not validate
    # assigned change types against this list.
    CHANGE_TYPES = [:insert, :update, :delete]

    # Writer for the row's change type. Only the writer is generated here;
    # the reader is written by hand below so it can supply a default, and
    # generating both would redefine the reader.
    attr_writer :change_type

    # Get the change type, defaults to :insert. Because the default is
    # applied with ||=, a change type of nil or false also reads back
    # as :insert.
    def change_type
      @change_type ||= :insert
    end
  end
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
module ETL
  module Screen
    # This screen validates the number of rows which will be bulk loaded
    # against the results from some sort of a row count query. If there
    # is a difference then the screen will not pass
    #
    # NOTE: the comparison itself is not implemented yet; +execute+ is
    # currently a no-op.
    class RowCountScreen
      # The control object and configuration Hash given to the constructor
      attr_reader :control, :configuration

      # Initialize the screen with the given control object and
      # configuration hash. Both are retained so that +execute+ has access
      # to them once it is implemented.
      def initialize(control, configuration={})
        @control = control
        @configuration = configuration
      end

      # Execute the screen. Placeholder: performs no check and returns nil.
      def execute
        nil
      end
    end
  end
end
|
@@ -0,0 +1,13 @@
|
|
1
|
+
module ETL
  module Transform
    # Transform which delegates the work to a callable supplied in the
    # configuration.
    #
    # Configuration options:
    # * <tt>:block</tt>: The object whose +call+ method performs the transform.
    #   It is invoked with the field name, the current value and the row.
    class BlockTransform < ETL::Transform::Transform
      # Initialize the transform and remember the configured block
      def initialize(control, name, configuration)
        super(control, name, configuration)
        @block = configuration[:block]
      end

      # Transform the value by calling the configured block; whatever the
      # block returns is the result.
      def transform(name, value, row)
        @block.call(name, value, row)
      end
    end
  end
end
|
@@ -6,7 +6,7 @@ module ETL #:nodoc:
|
|
6
6
|
#
|
7
7
|
# Configuration options:
|
8
8
|
# * <tt>:format</tt>: A format passed to strftime. Defaults to %Y-%m-%d
|
9
|
-
def initialize(control, configuration={})
|
9
|
+
def initialize(control, name, configuration={})
|
10
10
|
super
|
11
11
|
@format = configuration[:format] || "%Y-%m-%d"
|
12
12
|
end
|
@@ -14,7 +14,7 @@ module ETL #:nodoc:
|
|
14
14
|
# * <tt>:decode_table_path</tt>: The path to the decode table (defaults to 'decode.txt')
|
15
15
|
# * <tt>:decode_table_delimiter</tt>: The decode table delimiter (defaults to ':')
|
16
16
|
# * <tt>:default_value</tt>: The default value to use (defaults to 'No Value')
|
17
|
-
def initialize(control, configuration={})
|
17
|
+
def initialize(control, name, configuration={})
|
18
18
|
super
|
19
19
|
|
20
20
|
if configuration[:decode_table_path]
|
@@ -3,10 +3,15 @@ module ETL #:nodoc:
|
|
3
3
|
# Transform which will replace nil or empty values with a specified value.
|
4
4
|
class DefaultTransform < Transform
|
5
5
|
attr_accessor :default_value
|
6
|
-
|
6
|
+
# Initialize the transform
|
7
|
+
#
|
8
|
+
# Configuration options:
|
9
|
+
# * <tt>:default_value</tt>: The default value to use if the incoming value is blank
|
10
|
+
def initialize(control, name, configuration)
|
7
11
|
super
|
8
12
|
@default_value = configuration[:default_value]
|
9
13
|
end
|
14
|
+
# Transform the value
|
10
15
|
def transform(name, value, row)
|
11
16
|
value.blank? ? default_value : value
|
12
17
|
end
|
@@ -12,7 +12,7 @@ module ETL #:nodoc:
|
|
12
12
|
# an empty Hash will be used. This Hash will be used to cache values that have been resolved already
|
13
13
|
# for future use.
|
14
14
|
# *<tt>:resolver</tt>: Object or Class which implements the method resolve(value)
|
15
|
-
def initialize(control, configuration={})
|
15
|
+
def initialize(control, name, configuration={})
|
16
16
|
super
|
17
17
|
|
18
18
|
@collection = (configuration[:collection] || {})
|
@@ -20,7 +20,7 @@ module ETL #:nodoc:
|
|
20
20
|
# * <tt>:table</tt>: The table to search (required)
|
21
21
|
# * <tt>:connection</tt>: The ActiveRecord adapter (required)
|
22
22
|
# * <tt>:parent_id_field</tt>: The name of the field to use for the parent ID (defaults to :parent_id)
|
23
|
-
def initialize(control, configuration={})
|
23
|
+
def initialize(control, name, configuration={})
|
24
24
|
super
|
25
25
|
@parent_id_field = configuration[:parent_id_field] || :parent_id
|
26
26
|
end
|
@@ -0,0 +1,12 @@
|
|
1
|
+
module ETL #:nodoc:
  module Transform #:nodoc:
    # Transform which turns a number into its ordinalized form by way of
    # the ActiveSupport ordinalize core extension
    class OrdinalizeTransform < ETL::Transform::Transform
      # Return the result of calling +ordinalize+ on the value. The field
      # name and row are not consulted.
      def transform(name, value, row)
        value.ordinalize
      end
    end
  end
end
|
@@ -4,9 +4,6 @@ module ETL #:nodoc:
|
|
4
4
|
module Transform #:nodoc:
|
5
5
|
# Transform which hashes the original value with a SHA-1 hash algorithm
|
6
6
|
class Sha1Transform < ETL::Transform::Transform
|
7
|
-
def initialize(control, configuration={})
|
8
|
-
super
|
9
|
-
end
|
10
7
|
# Transform the value with a SHA1 digest algorithm.
|
11
8
|
def transform(name, value, row)
|
12
9
|
Digest::SHA1.hexdigest(value)
|
@@ -2,9 +2,6 @@ module ETL #:nodoc:
|
|
2
2
|
module Transform #:nodoc:
|
3
3
|
# Transform a String representation of a date to a Date instance
|
4
4
|
class StringToDateTransform < ETL::Transform::Transform
|
5
|
-
def initialize(control, configuration={})
|
6
|
-
super
|
7
|
-
end
|
8
5
|
# Transform the value using Date.parse
|
9
6
|
def transform(name, value, row)
|
10
7
|
Date.parse(value)
|
@@ -2,9 +2,6 @@ module ETL #:nodoc:
|
|
2
2
|
module Transform #:nodoc:
|
3
3
|
# Transform a String representation of a date to a DateTime instance
|
4
4
|
class StringToDateTimeTransform < ETL::Transform::Transform
|
5
|
-
def initialize(control, configuration={})
|
6
|
-
super
|
7
|
-
end
|
8
5
|
# Transform the value using DateTime.parse.
|
9
6
|
#
|
10
7
|
# WARNING: This transform is slow (due to the Ruby implementation), but if you need to
|
@@ -2,9 +2,6 @@ module ETL #:nodoc:
|
|
2
2
|
module Transform #:nodoc:
|
3
3
|
# Transform a String representation of a date to a Time instance
|
4
4
|
class StringToTimeTransform < ETL::Transform::Transform
|
5
|
-
def initialize(control, configuration={})
|
6
|
-
super
|
7
|
-
end
|
8
5
|
# Transform the value using Time.parse
|
9
6
|
def transform(name, value, row)
|
10
7
|
Time.parse(value)
|
@@ -22,25 +22,34 @@ module ETL#:nodoc:
|
|
22
22
|
# Any other object will result in a ControlError being raised.
|
23
23
|
def transform(name, value, row, transforms)
|
24
24
|
transforms.each do |transform|
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
25
|
+
benchmarks[transform.class] ||= 0
|
26
|
+
benchmarks[transform.class] += Benchmark.realtime do
|
27
|
+
Engine.logger.debug "Transforming field #{name} with #{transform.inspect}"
|
28
|
+
case transform
|
29
|
+
when Proc
|
30
|
+
value = transform.call([name, value, row])
|
31
|
+
when Transform
|
32
|
+
value = transform.transform(name, value, row)
|
33
|
+
else
|
34
|
+
raise ControlError, "Unsupported transform configuration type: #{transform}"
|
35
|
+
end
|
33
36
|
end
|
34
37
|
end
|
35
38
|
value
|
36
39
|
end
|
40
|
+
|
41
|
+
def benchmarks
|
42
|
+
@benchmarks ||= {}
|
43
|
+
end
|
37
44
|
end
|
38
45
|
|
39
|
-
attr_reader :control, :configuration
|
46
|
+
attr_reader :control, :name, :configuration
|
40
47
|
|
41
|
-
# Initialize the transform object
|
42
|
-
|
48
|
+
# Initialize the transform object with the given control object, field name and
|
49
|
+
# configuration hash
|
50
|
+
def initialize(control, name, configuration={})
|
43
51
|
@control = control
|
52
|
+
@name = name
|
44
53
|
@configuration = configuration
|
45
54
|
end
|
46
55
|
|
@@ -0,0 +1,26 @@
|
|
1
|
+
module ETL #:nodoc:
  module Transform #:nodoc:
    # Transform which strips whitespace from a string value
    class TrimTransform < ETL::Transform::Transform
      # Initialize the transform.
      #
      # Configuration options:
      # * <tt>:type</tt>: :left, :right or :both. Default is :both
      def initialize(control, name, configuration={})
        super(control, name, configuration)
        requested_type = configuration[:type] || :both
        @type = requested_type.to_sym
      end

      # Transform the value by stripping it on the configured side(s).
      # An unsupported trim type is only detected here, at transform time,
      # and results in a RuntimeError.
      def transform(name, value, row)
        return value.lstrip if @type == :left
        return value.rstrip if @type == :right
        return value.strip if @type == :both

        raise "Trim type, if specified, must be :left, :right or :both"
      end
    end
  end
end
|
@@ -2,7 +2,15 @@ module ETL #:nodoc:
|
|
2
2
|
module Transform #:nodoc:
|
3
3
|
# Transform from one type to another
|
4
4
|
class TypeTransform < ETL::Transform::Transform
|
5
|
-
|
5
|
+
# Initialize the transformer.
|
6
|
+
#
|
7
|
+
# Configuration options:
|
8
|
+
# * <tt>:type</tt>: The type to convert to. Supported types:
|
9
|
+
# ** :string
|
10
|
+
# ** :number,:integer
|
11
|
+
# ** :float
|
12
|
+
# ** :decimal
|
13
|
+
def initialize(control, name, configuration={})
|
6
14
|
super
|
7
15
|
@type = configuration[:type]
|
8
16
|
@significant = configuration[:significant] ||= 0
|
data/lib/etl/version.rb
CHANGED
metadata
CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.9.0.10
|
|
3
3
|
specification_version: 1
|
4
4
|
name: activewarehouse-etl
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version: 0.6.1
|
7
|
-
date: 2007-
|
6
|
+
version: 0.7.0
|
7
|
+
date: 2007-04-08 00:00:00 -04:00
|
8
8
|
summary: Pure Ruby ETL package.
|
9
9
|
require_paths:
|
10
10
|
- lib
|
@@ -30,6 +30,7 @@ authors:
|
|
30
30
|
- Anthony Eden
|
31
31
|
files:
|
32
32
|
- CHANGELOG
|
33
|
+
- LICENSE
|
33
34
|
- README
|
34
35
|
- TODO
|
35
36
|
- Rakefile
|
@@ -40,6 +41,8 @@ files:
|
|
40
41
|
- lib/etl/control
|
41
42
|
- lib/etl/control.rb
|
42
43
|
- lib/etl/engine.rb
|
44
|
+
- lib/etl/execution
|
45
|
+
- lib/etl/execution.rb
|
43
46
|
- lib/etl/generator
|
44
47
|
- lib/etl/generator.rb
|
45
48
|
- lib/etl/http_tools.rb
|
@@ -47,6 +50,8 @@ files:
|
|
47
50
|
- lib/etl/parser.rb
|
48
51
|
- lib/etl/processor
|
49
52
|
- lib/etl/processor.rb
|
53
|
+
- lib/etl/row.rb
|
54
|
+
- lib/etl/screen
|
50
55
|
- lib/etl/transform
|
51
56
|
- lib/etl/transform.rb
|
52
57
|
- lib/etl/version.rb
|
@@ -60,6 +65,10 @@ files:
|
|
60
65
|
- lib/etl/control/destination/file_destination.rb
|
61
66
|
- lib/etl/control/source/database_source.rb
|
62
67
|
- lib/etl/control/source/file_source.rb
|
68
|
+
- lib/etl/execution/base.rb
|
69
|
+
- lib/etl/execution/job.rb
|
70
|
+
- lib/etl/execution/migration.rb
|
71
|
+
- lib/etl/execution/record.rb
|
63
72
|
- lib/etl/generator/generator.rb
|
64
73
|
- lib/etl/generator/surrogate_key_generator.rb
|
65
74
|
- lib/etl/parser/apache_combined_log_parser.rb
|
@@ -69,21 +78,30 @@ files:
|
|
69
78
|
- lib/etl/parser/sax_parser.rb
|
70
79
|
- lib/etl/parser/xml_parser.rb
|
71
80
|
- lib/etl/processor/bulk_import_processor.rb
|
81
|
+
- lib/etl/processor/check_exist_processor.rb
|
82
|
+
- lib/etl/processor/check_unique_processor.rb
|
72
83
|
- lib/etl/processor/copy_field_processor.rb
|
73
84
|
- lib/etl/processor/hierarchy_exploder_processor.rb
|
74
85
|
- lib/etl/processor/processor.rb
|
86
|
+
- lib/etl/processor/rename_processor.rb
|
75
87
|
- lib/etl/processor/row_processor.rb
|
88
|
+
- lib/etl/processor/sequence_processor.rb
|
89
|
+
- lib/etl/processor/surrogate_key_processor.rb
|
76
90
|
- lib/etl/processor/truncate_processor.rb
|
91
|
+
- lib/etl/screen/row_count_screen.rb
|
92
|
+
- lib/etl/transform/block_transform.rb
|
77
93
|
- lib/etl/transform/date_to_string_transform.rb
|
78
94
|
- lib/etl/transform/decode_transform.rb
|
79
95
|
- lib/etl/transform/default_transform.rb
|
80
96
|
- lib/etl/transform/foreign_key_lookup_transform.rb
|
81
97
|
- lib/etl/transform/hierarchy_lookup_transform.rb
|
98
|
+
- lib/etl/transform/ordinalize_transform.rb
|
82
99
|
- lib/etl/transform/sha1_transform.rb
|
83
100
|
- lib/etl/transform/string_to_date_transform.rb
|
84
101
|
- lib/etl/transform/string_to_datetime_transform.rb
|
85
102
|
- lib/etl/transform/string_to_time_transform.rb
|
86
103
|
- lib/etl/transform/transform.rb
|
104
|
+
- lib/etl/transform/trim_transform.rb
|
87
105
|
- lib/etl/transform/type_transform.rb
|
88
106
|
test_files: []
|
89
107
|
|
@@ -133,7 +151,7 @@ dependencies:
|
|
133
151
|
requirements:
|
134
152
|
- - ">="
|
135
153
|
- !ruby/object:Gem::Version
|
136
|
-
version: 1.
|
154
|
+
version: 1.2.0
|
137
155
|
version:
|
138
156
|
- !ruby/object:Gem::Dependency
|
139
157
|
name: adapter_extensions
|