activewarehouse-etl 0.6.1 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +29 -1
- data/LICENSE +7 -0
- data/README +58 -12
- data/Rakefile +2 -1
- data/lib/etl.rb +3 -0
- data/lib/etl/commands/etl.rb +35 -1
- data/lib/etl/control/control.rb +20 -9
- data/lib/etl/control/destination.rb +173 -12
- data/lib/etl/control/destination/database_destination.rb +2 -2
- data/lib/etl/control/destination/file_destination.rb +25 -2
- data/lib/etl/control/source.rb +29 -8
- data/lib/etl/control/source/database_source.rb +109 -24
- data/lib/etl/control/source/file_source.rb +29 -16
- data/lib/etl/engine.rb +164 -63
- data/lib/etl/execution.rb +19 -0
- data/lib/etl/execution/base.rb +9 -0
- data/lib/etl/execution/job.rb +7 -0
- data/lib/etl/execution/migration.rb +54 -0
- data/lib/etl/execution/record.rb +8 -0
- data/lib/etl/generator/surrogate_key_generator.rb +2 -0
- data/lib/etl/parser.rb +9 -0
- data/lib/etl/parser/parser.rb +5 -2
- data/lib/etl/parser/sax_parser.rb +22 -6
- data/lib/etl/processor.rb +8 -0
- data/lib/etl/processor/bulk_import_processor.rb +32 -4
- data/lib/etl/processor/check_exist_processor.rb +69 -0
- data/lib/etl/processor/check_unique_processor.rb +35 -0
- data/lib/etl/processor/copy_field_processor.rb +20 -4
- data/lib/etl/processor/processor.rb +3 -0
- data/lib/etl/processor/rename_processor.rb +24 -0
- data/lib/etl/processor/row_processor.rb +1 -1
- data/lib/etl/processor/sequence_processor.rb +23 -0
- data/lib/etl/processor/surrogate_key_processor.rb +31 -0
- data/lib/etl/processor/truncate_processor.rb +0 -2
- data/lib/etl/row.rb +17 -0
- data/lib/etl/screen/row_count_screen.rb +15 -0
- data/lib/etl/transform/block_transform.rb +13 -0
- data/lib/etl/transform/date_to_string_transform.rb +1 -1
- data/lib/etl/transform/decode_transform.rb +1 -1
- data/lib/etl/transform/default_transform.rb +6 -1
- data/lib/etl/transform/foreign_key_lookup_transform.rb +1 -1
- data/lib/etl/transform/hierarchy_lookup_transform.rb +1 -1
- data/lib/etl/transform/ordinalize_transform.rb +12 -0
- data/lib/etl/transform/sha1_transform.rb +0 -3
- data/lib/etl/transform/string_to_date_transform.rb +0 -3
- data/lib/etl/transform/string_to_datetime_transform.rb +0 -3
- data/lib/etl/transform/string_to_time_transform.rb +0 -3
- data/lib/etl/transform/transform.rb +20 -11
- data/lib/etl/transform/trim_transform.rb +26 -0
- data/lib/etl/transform/type_transform.rb +9 -1
- data/lib/etl/version.rb +2 -2
- metadata +21 -3
@@ -0,0 +1,24 @@
|
|
1
|
+
module ETL #:nodoc:
  module Processor #:nodoc:
    # Row level processor to rename a field in the row.
    #
    # Configuration options:
    # * <tt>:source</tt>: The source field name
    # * <tt>:dest</tt>: The destination field name
    class RenameProcessor < ETL::Processor::RowProcessor
      # Move the value stored under the <tt>:source</tt> field to the
      # <tt>:dest</tt> field and return the row.
      #
      # The source key is removed before the destination key is written, so a
      # configuration in which <tt>:source</tt> and <tt>:dest</tt> name the
      # same field keeps the value instead of deleting it. When the source
      # field is absent the destination field is set to nil.
      def process(row)
        source_value = row[configuration[:source]]
        row.delete(configuration[:source])
        row[configuration[:dest]] = duplicate_value(source_value)
        row
      end

      private

      # Return a copy of +value+, or +value+ itself when it cannot be
      # duplicated (nil, numbers, booleans and symbols raise TypeError from
      # #dup on older Ruby versions).
      def duplicate_value(value)
        value.dup
      rescue TypeError
        value
      end
    end
  end
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
module ETL #:nodoc:
  module Processor #:nodoc:
    # Row level processor which writes the next value of a running counter
    # into each row.
    #
    # Configuration options:
    # * <tt>:context</tt>: A context name. Counters are kept per context name;
    #   when none is given all rows share the counter stored under nil.
    # * <tt>:dest</tt>: The destination field name
    class SequenceProcessor < ETL::Processor::RowProcessor
      # Increment the counter for the configured context, store the new value
      # in the <tt>:dest</tt> field and return the row. The first value is 1.
      def process(row)
        context = configuration[:context]
        counters = sequences
        next_value = (counters[context] || 0) + 1
        counters[context] = next_value
        row[configuration[:dest]] = next_value
        row
      end

      protected

      # Hash of counters held by this processor instance, keyed by context
      def sequences
        @sequences ||= {}
      end
    end
  end
end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
module ETL #:nodoc:
  module Processor #:nodoc:
    # A row level processor that provides surrogate keys
    class SurrogateKeyProcessor < ETL::Processor::RowProcessor
      # The query passed to select_value to obtain the starting key value
      attr_accessor :query
      # The row field which receives the generated key (defaults to :id)
      attr_accessor :destination

      # Initialize the surrogate key generator
      #
      # Configuration options:
      # * <tt>:query</tt>: Query whose single value is the last key already
      #   in use; a blank result starts the sequence from 0
      # * <tt>:destination</tt>: The field to write the key to (defaults to :id)
      def initialize(control, configuration)
        super
        @query = configuration[:query]
        initial_key = ActiveRecord::Base.connection.select_value(@query)
        @surrogate_key = initial_key.blank? ? 0 : initial_key.to_i
        @destination = configuration[:destination] || :id
      end

      # Add a surrogate key to the row. Each call increments the key by one
      # before assigning it. Returns the row, or nil when no row is given.
      def process(row)
        return unless row
        @surrogate_key += 1
        row[destination] = @surrogate_key
        row
      end
    end
  end
end
|
@@ -3,8 +3,6 @@ module ETL #:nodoc:
|
|
3
3
|
# A processor which will truncate a table. Use as a pre-processor for cleaning out a table
|
4
4
|
# prior to loading
|
5
5
|
class TruncateProcessor < ETL::Processor::Processor
|
6
|
-
#attr_reader :file
|
7
|
-
|
8
6
|
# Defines the table to truncate
|
9
7
|
attr_reader :table
|
10
8
|
|
data/lib/etl/row.rb
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
# This source file contains the ETL::Row class.
|
2
|
+
|
3
|
+
module ETL #:nodoc:
  # This class represents a single row currently passing through the ETL pipeline
  class Row < Hash
    # All change types
    CHANGE_TYPES = [:insert, :update, :delete].freeze

    # Set the row's change type. Only the writer is generated here because
    # the reader is defined explicitly below to supply a default.
    attr_writer :change_type

    # Get the change type, defaults to :insert
    def change_type
      @change_type ||= :insert
    end
  end
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
module ETL
  module Screen
    # This screen is intended to validate the number of rows which will be
    # bulk loaded against the results from some sort of a row count query,
    # failing when there is a difference.
    #
    # NOTE: the check itself is not implemented yet; #execute is a no-op.
    class RowCountScreen
      # The control object and configuration hash given to the constructor
      attr_reader :control, :configuration

      # Initialize the screen with the given control object and
      # configuration hash.
      def initialize(control, configuration={})
        @control = control
        @configuration = configuration
      end

      # Execute the screen. Currently performs no check and returns nil.
      def execute
      end
    end
  end
end
|
@@ -0,0 +1,13 @@
|
|
1
|
+
module ETL
  module Transform
    # Transform which delegates to a callable supplied in the configuration.
    #
    # Configuration options:
    # * <tt>:block</tt>: An object responding to +call+; it is invoked with
    #   the field name, the current value and the row (required)
    class BlockTransform < ETL::Transform::Transform
      # Initialize the transform. Raises ControlError when <tt>:block</tt> is
      # missing or not callable, so a misconfiguration is reported up front
      # rather than as a NoMethodError on the first row.
      def initialize(control, name, configuration)
        super
        @block = configuration[:block]
        unless @block.respond_to?(:call)
          raise ControlError, "BlockTransform requires a callable :block option"
        end
      end

      # Transform the value by calling the configured block; the block's
      # return value becomes the new value.
      def transform(name, value, row)
        @block.call(name, value, row)
      end
    end
  end
end
|
@@ -6,7 +6,7 @@ module ETL #:nodoc:
|
|
6
6
|
#
|
7
7
|
# Configuration options:
|
8
8
|
# * <tt>:format</tt>: A format passed to strftime. Defaults to %Y-%m-%d
|
9
|
-
def initialize(control, configuration={})
|
9
|
+
def initialize(control, name, configuration={})
  super
  # Use the configured strftime format, falling back to %Y-%m-%d
  configured_format = configuration[:format]
  @format = configured_format ? configured_format : "%Y-%m-%d"
end
|
@@ -14,7 +14,7 @@ module ETL #:nodoc:
|
|
14
14
|
# * <tt>:decode_table_path</tt>: The path to the decode table (defaults to 'decode.txt')
|
15
15
|
# * <tt>:decode_table_delimiter</tt>: The decode table delimiter (defaults to ':')
|
16
16
|
# * <tt>:default_value</tt>: The default value to use (defaults to 'No Value')
|
17
|
-
def initialize(control, configuration={})
|
17
|
+
def initialize(control, name, configuration={})
|
18
18
|
super
|
19
19
|
|
20
20
|
if configuration[:decode_table_path]
|
@@ -3,10 +3,15 @@ module ETL #:nodoc:
|
|
3
3
|
# Transform which will replace nil or empty values with a specified value.
|
4
4
|
class DefaultTransform < Transform
|
5
5
|
attr_accessor :default_value
|
6
|
-
|
6
|
+
# Initialize the transform
|
7
|
+
#
|
8
|
+
# Configuration options:
|
9
|
+
# * <tt>:default_value</tt>: The default value to use if the incoming value is blank
|
10
|
+
def initialize(control, name, configuration={})
  # configuration defaults to an empty Hash, matching the other transforms
  # shown in this file; the default value is then nil.
  super
  @default_value = configuration[:default_value]
end
|
14
|
+
# Transform the value
|
10
15
|
def transform(name, value, row)
  # Blank values are replaced by the configured default; anything else
  # passes through untouched.
  return default_value if value.blank?
  value
end
|
@@ -12,7 +12,7 @@ module ETL #:nodoc:
|
|
12
12
|
# an empty Hash will be used. This Hash will be used to cache values that have been resolved already
|
13
13
|
# for future use.
|
14
14
|
# *<tt>:resolver</tt>: Object or Class which implements the method resolve(value)
|
15
|
-
def initialize(control, configuration={})
|
15
|
+
def initialize(control, name, configuration={})
|
16
16
|
super
|
17
17
|
|
18
18
|
@collection = (configuration[:collection] || {})
|
@@ -20,7 +20,7 @@ module ETL #:nodoc:
|
|
20
20
|
# * <tt>:table</tt>: The table to search (required)
|
21
21
|
# * <tt>:connection</tt>: The ActiveRecord adapter (required)
|
22
22
|
# * <tt>:parent_id_field</tt>: The name of the field to use for the parent ID (defaults to :parent_id)
|
23
|
-
def initialize(control, configuration={})
|
23
|
+
def initialize(control, name, configuration={})
  super
  # Use the configured parent ID field, falling back to :parent_id
  configured_field = configuration[:parent_id_field]
  @parent_id_field = configured_field ? configured_field : :parent_id
end
|
@@ -0,0 +1,12 @@
|
|
1
|
+
module ETL #:nodoc:
  module Transform #:nodoc:
    # Transform which turns a number into its ordinalized form by calling
    # +ordinalize+ on the value (the ActiveSupport core extension).
    class OrdinalizeTransform < ETL::Transform::Transform
      # Return the ordinalized version of the value. The field name and row
      # are not used.
      def transform(name, value, row)
        ordinalized = value.ordinalize
        ordinalized
      end
    end
  end
end
|
@@ -4,9 +4,6 @@ module ETL #:nodoc:
|
|
4
4
|
module Transform #:nodoc:
|
5
5
|
# Transform which hashes the original value with a SHA-1 hash algorithm
|
6
6
|
class Sha1Transform < ETL::Transform::Transform
|
7
|
-
def initialize(control, configuration={})
|
8
|
-
super
|
9
|
-
end
|
10
7
|
# Transform the value with a SHA1 digest algorithm.
|
11
8
|
def transform(name, value, row)
|
12
9
|
Digest::SHA1.hexdigest(value)
|
@@ -2,9 +2,6 @@ module ETL #:nodoc:
|
|
2
2
|
module Transform #:nodoc:
|
3
3
|
# Transform a String representation of a date to a Date instance
|
4
4
|
class StringToDateTransform < ETL::Transform::Transform
|
5
|
-
def initialize(control, configuration={})
|
6
|
-
super
|
7
|
-
end
|
8
5
|
# Transform the value using Date.parse
|
9
6
|
def transform(name, value, row)
|
10
7
|
Date.parse(value)
|
@@ -2,9 +2,6 @@ module ETL #:nodoc:
|
|
2
2
|
module Transform #:nodoc:
|
3
3
|
# Transform a String representation of a date to a DateTime instance
|
4
4
|
class StringToDateTimeTransform < ETL::Transform::Transform
|
5
|
-
def initialize(control, configuration={})
|
6
|
-
super
|
7
|
-
end
|
8
5
|
# Transform the value using DateTime.parse.
|
9
6
|
#
|
10
7
|
# WARNING: This transform is slow (due to the Ruby implementation), but if you need to
|
@@ -2,9 +2,6 @@ module ETL #:nodoc:
|
|
2
2
|
module Transform #:nodoc:
|
3
3
|
# Transform a String representation of a date to a Time instance
|
4
4
|
class StringToTimeTransform < ETL::Transform::Transform
|
5
|
-
def initialize(control, configuration={})
|
6
|
-
super
|
7
|
-
end
|
8
5
|
# Transform the value using Time.parse
|
9
6
|
def transform(name, value, row)
|
10
7
|
Time.parse(value)
|
@@ -22,25 +22,34 @@ module ETL#:nodoc:
|
|
22
22
|
# Any other object will result in a ControlError being raised.
|
23
23
|
def transform(name, value, row, transforms)
  transforms.each do |transform|
    # Accumulate the time spent per transform class
    benchmarks[transform.class] ||= 0
    totals = benchmarks
    transform_class = transform.class
    # Read the running total before the transform runs, then add the
    # elapsed time to it afterwards.
    running_total = totals[transform_class]
    elapsed = Benchmark.realtime do
      Engine.logger.debug "Transforming field #{name} with #{transform.inspect}"
      value =
        case transform
        when Proc
          # Procs receive the name, value and row as a single array argument
          transform.call([name, value, row])
        when Transform
          transform.transform(name, value, row)
        else
          raise ControlError, "Unsupported transform configuration type: #{transform}"
        end
    end
    totals[transform_class] = running_total + elapsed
  end
  # The value after every transform has been applied in order
  value
end
|
40
|
+
|
41
|
+
# Hash of accumulated timings, created empty on first access
def benchmarks
  @benchmarks || (@benchmarks = {})
end
|
37
44
|
end
|
38
45
|
|
39
|
-
attr_reader :control, :configuration
|
46
|
+
attr_reader :control, :name, :configuration
|
40
47
|
|
41
|
-
# Initialize the transform object
|
42
|
-
|
48
|
+
# Initialize the transform object with the given control object, field name and
|
49
|
+
# configuration hash
|
50
|
+
def initialize(control, name, configuration={})
  # Keep all three arguments for the attribute readers
  @control, @name, @configuration = control, name, configuration
end
|
46
55
|
|
@@ -0,0 +1,26 @@
|
|
1
|
+
module ETL #:nodoc:
  module Transform #:nodoc:
    # Transform which strips whitespace from one or both ends of a string
    class TrimTransform < ETL::Transform::Transform
      # Initialize the transform.
      #
      # Configuration options:
      # * <tt>:type</tt>: :left, :right or :both. Default is :both
      def initialize(control, name, configuration={})
        super
        requested_type = configuration[:type] || :both
        @type = requested_type.to_sym
      end

      # Return the value trimmed on the configured side(s). Raises when the
      # configured type is not one of :left, :right or :both.
      def transform(name, value, row)
        return value.lstrip if @type == :left
        return value.rstrip if @type == :right
        return value.strip if @type == :both
        raise "Trim type, if specified, must be :left, :right or :both"
      end
    end
  end
end
|
@@ -2,7 +2,15 @@ module ETL #:nodoc:
|
|
2
2
|
module Transform #:nodoc:
|
3
3
|
# Transform from one type to another
|
4
4
|
class TypeTransform < ETL::Transform::Transform
|
5
|
-
|
5
|
+
# Initialize the transformer.
|
6
|
+
#
|
7
|
+
# Configuration options:
|
8
|
+
# * <tt>:type</tt>: The type to convert to. Supported types:
|
9
|
+
# ** :string
|
10
|
+
# ** :number,:integer
|
11
|
+
# ** :float
|
12
|
+
# ** :decimal
|
13
|
+
def initialize(control, name, configuration={})
|
6
14
|
super
|
7
15
|
@type = configuration[:type]
|
8
16
|
@significant = configuration[:significant] ||= 0
|
data/lib/etl/version.rb
CHANGED
metadata
CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.9.0.10
|
|
3
3
|
specification_version: 1
|
4
4
|
name: activewarehouse-etl
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version: 0.6.1
|
7
|
-
date: 2007-
|
6
|
+
version: 0.7.0
|
7
|
+
date: 2007-04-08 00:00:00 -04:00
|
8
8
|
summary: Pure Ruby ETL package.
|
9
9
|
require_paths:
|
10
10
|
- lib
|
@@ -30,6 +30,7 @@ authors:
|
|
30
30
|
- Anthony Eden
|
31
31
|
files:
|
32
32
|
- CHANGELOG
|
33
|
+
- LICENSE
|
33
34
|
- README
|
34
35
|
- TODO
|
35
36
|
- Rakefile
|
@@ -40,6 +41,8 @@ files:
|
|
40
41
|
- lib/etl/control
|
41
42
|
- lib/etl/control.rb
|
42
43
|
- lib/etl/engine.rb
|
44
|
+
- lib/etl/execution
|
45
|
+
- lib/etl/execution.rb
|
43
46
|
- lib/etl/generator
|
44
47
|
- lib/etl/generator.rb
|
45
48
|
- lib/etl/http_tools.rb
|
@@ -47,6 +50,8 @@ files:
|
|
47
50
|
- lib/etl/parser.rb
|
48
51
|
- lib/etl/processor
|
49
52
|
- lib/etl/processor.rb
|
53
|
+
- lib/etl/row.rb
|
54
|
+
- lib/etl/screen
|
50
55
|
- lib/etl/transform
|
51
56
|
- lib/etl/transform.rb
|
52
57
|
- lib/etl/version.rb
|
@@ -60,6 +65,10 @@ files:
|
|
60
65
|
- lib/etl/control/destination/file_destination.rb
|
61
66
|
- lib/etl/control/source/database_source.rb
|
62
67
|
- lib/etl/control/source/file_source.rb
|
68
|
+
- lib/etl/execution/base.rb
|
69
|
+
- lib/etl/execution/job.rb
|
70
|
+
- lib/etl/execution/migration.rb
|
71
|
+
- lib/etl/execution/record.rb
|
63
72
|
- lib/etl/generator/generator.rb
|
64
73
|
- lib/etl/generator/surrogate_key_generator.rb
|
65
74
|
- lib/etl/parser/apache_combined_log_parser.rb
|
@@ -69,21 +78,30 @@ files:
|
|
69
78
|
- lib/etl/parser/sax_parser.rb
|
70
79
|
- lib/etl/parser/xml_parser.rb
|
71
80
|
- lib/etl/processor/bulk_import_processor.rb
|
81
|
+
- lib/etl/processor/check_exist_processor.rb
|
82
|
+
- lib/etl/processor/check_unique_processor.rb
|
72
83
|
- lib/etl/processor/copy_field_processor.rb
|
73
84
|
- lib/etl/processor/hierarchy_exploder_processor.rb
|
74
85
|
- lib/etl/processor/processor.rb
|
86
|
+
- lib/etl/processor/rename_processor.rb
|
75
87
|
- lib/etl/processor/row_processor.rb
|
88
|
+
- lib/etl/processor/sequence_processor.rb
|
89
|
+
- lib/etl/processor/surrogate_key_processor.rb
|
76
90
|
- lib/etl/processor/truncate_processor.rb
|
91
|
+
- lib/etl/screen/row_count_screen.rb
|
92
|
+
- lib/etl/transform/block_transform.rb
|
77
93
|
- lib/etl/transform/date_to_string_transform.rb
|
78
94
|
- lib/etl/transform/decode_transform.rb
|
79
95
|
- lib/etl/transform/default_transform.rb
|
80
96
|
- lib/etl/transform/foreign_key_lookup_transform.rb
|
81
97
|
- lib/etl/transform/hierarchy_lookup_transform.rb
|
98
|
+
- lib/etl/transform/ordinalize_transform.rb
|
82
99
|
- lib/etl/transform/sha1_transform.rb
|
83
100
|
- lib/etl/transform/string_to_date_transform.rb
|
84
101
|
- lib/etl/transform/string_to_datetime_transform.rb
|
85
102
|
- lib/etl/transform/string_to_time_transform.rb
|
86
103
|
- lib/etl/transform/transform.rb
|
104
|
+
- lib/etl/transform/trim_transform.rb
|
87
105
|
- lib/etl/transform/type_transform.rb
|
88
106
|
test_files: []
|
89
107
|
|
@@ -133,7 +151,7 @@ dependencies:
|
|
133
151
|
requirements:
|
134
152
|
- - ">="
|
135
153
|
- !ruby/object:Gem::Version
|
136
|
-
version: 1.
|
154
|
+
version: 1.2.0
|
137
155
|
version:
|
138
156
|
- !ruby/object:Gem::Dependency
|
139
157
|
name: adapter_extensions
|