darrell-activewarehouse-etl 0.9.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +198 -0
- data/LICENSE +7 -0
- data/README +99 -0
- data/Rakefile +175 -0
- data/TODO +28 -0
- data/bin/etl +28 -0
- data/bin/etl.cmd +8 -0
- data/examples/database.example.yml +16 -0
- data/lib/etl/batch/batch.rb +111 -0
- data/lib/etl/batch/directives.rb +55 -0
- data/lib/etl/batch.rb +2 -0
- data/lib/etl/builder/date_dimension_builder.rb +96 -0
- data/lib/etl/builder/time_dimension_builder.rb +31 -0
- data/lib/etl/builder.rb +2 -0
- data/lib/etl/commands/etl.rb +89 -0
- data/lib/etl/control/control.rb +405 -0
- data/lib/etl/control/destination/database_destination.rb +97 -0
- data/lib/etl/control/destination/file_destination.rb +126 -0
- data/lib/etl/control/destination.rb +448 -0
- data/lib/etl/control/source/database_source.rb +220 -0
- data/lib/etl/control/source/enumerable_source.rb +11 -0
- data/lib/etl/control/source/file_source.rb +90 -0
- data/lib/etl/control/source/model_source.rb +39 -0
- data/lib/etl/control/source.rb +109 -0
- data/lib/etl/control.rb +3 -0
- data/lib/etl/core_ext/time/calculations.rb +42 -0
- data/lib/etl/core_ext/time.rb +5 -0
- data/lib/etl/core_ext.rb +1 -0
- data/lib/etl/engine.rb +556 -0
- data/lib/etl/execution/base.rb +9 -0
- data/lib/etl/execution/batch.rb +8 -0
- data/lib/etl/execution/job.rb +8 -0
- data/lib/etl/execution/migration.rb +85 -0
- data/lib/etl/execution.rb +19 -0
- data/lib/etl/generator/generator.rb +20 -0
- data/lib/etl/generator/surrogate_key_generator.rb +39 -0
- data/lib/etl/generator.rb +2 -0
- data/lib/etl/http_tools.rb +139 -0
- data/lib/etl/parser/apache_combined_log_parser.rb +49 -0
- data/lib/etl/parser/delimited_parser.rb +74 -0
- data/lib/etl/parser/fixed_width_parser.rb +65 -0
- data/lib/etl/parser/parser.rb +41 -0
- data/lib/etl/parser/sax_parser.rb +218 -0
- data/lib/etl/parser/xml_parser.rb +65 -0
- data/lib/etl/parser.rb +11 -0
- data/lib/etl/processor/block_processor.rb +14 -0
- data/lib/etl/processor/bulk_import_processor.rb +83 -0
- data/lib/etl/processor/check_exist_processor.rb +80 -0
- data/lib/etl/processor/check_unique_processor.rb +35 -0
- data/lib/etl/processor/copy_field_processor.rb +26 -0
- data/lib/etl/processor/encode_processor.rb +55 -0
- data/lib/etl/processor/hierarchy_exploder_processor.rb +55 -0
- data/lib/etl/processor/print_row_processor.rb +12 -0
- data/lib/etl/processor/processor.rb +25 -0
- data/lib/etl/processor/rename_processor.rb +24 -0
- data/lib/etl/processor/require_non_blank_processor.rb +26 -0
- data/lib/etl/processor/row_processor.rb +17 -0
- data/lib/etl/processor/sequence_processor.rb +23 -0
- data/lib/etl/processor/surrogate_key_processor.rb +53 -0
- data/lib/etl/processor/truncate_processor.rb +35 -0
- data/lib/etl/processor.rb +11 -0
- data/lib/etl/row.rb +20 -0
- data/lib/etl/screen/row_count_screen.rb +20 -0
- data/lib/etl/screen.rb +14 -0
- data/lib/etl/transform/block_transform.rb +13 -0
- data/lib/etl/transform/date_to_string_transform.rb +20 -0
- data/lib/etl/transform/decode_transform.rb +51 -0
- data/lib/etl/transform/default_transform.rb +20 -0
- data/lib/etl/transform/foreign_key_lookup_transform.rb +169 -0
- data/lib/etl/transform/hierarchy_lookup_transform.rb +49 -0
- data/lib/etl/transform/ordinalize_transform.rb +12 -0
- data/lib/etl/transform/sha1_transform.rb +13 -0
- data/lib/etl/transform/string_to_date_transform.rb +16 -0
- data/lib/etl/transform/string_to_datetime_transform.rb +14 -0
- data/lib/etl/transform/string_to_time_transform.rb +11 -0
- data/lib/etl/transform/transform.rb +61 -0
- data/lib/etl/transform/trim_transform.rb +26 -0
- data/lib/etl/transform/type_transform.rb +35 -0
- data/lib/etl/transform.rb +2 -0
- data/lib/etl/util.rb +59 -0
- data/lib/etl/version.rb +9 -0
- data/lib/etl.rb +83 -0
- metadata +245 -0
data/lib/etl/row.rb
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
# This source file contains the ETL::Row class.
|
2
|
+
|
3
|
+
module ETL #:nodoc:
  # This class represents a single row currently passing through the ETL
  # pipeline. It is a Hash subclass that additionally carries the originating
  # source and a change type.
  class Row < Hash
    # All change types
    CHANGE_TYPES = [:insert, :update, :delete]

    # Accessor for the originating source
    attr_accessor :source

    # Writer for the row's change type. Only the writer is generated here; the
    # reader is written out below so it can supply a default. (Declaring
    # attr_accessor and then redefining the reader makes Ruby emit a
    # "method redefined" warning.)
    attr_writer :change_type

    # Get the change type, defaults to :insert
    def change_type
      @change_type ||= :insert
    end
  end
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
module ETL
  module Screen
    # Screen which compares the number of rows reported by
    # Engine.rows_written with the expected count given in the
    # configuration under :rows. The screen fails (raises) when the two
    # differ.
    class RowCountScreen
      # The control object and the configuration Hash given to the constructor
      attr_accessor :control, :configuration

      # Store the control and configuration, then run the check immediately.
      #
      # Configuration options:
      # * <tt>:rows</tt>: The number of rows expected to have been written
      def initialize(control, configuration={})
        @control, @configuration = control, configuration
        execute
      end

      # Raise a RuntimeError unless the written row count equals the
      # expected row count.
      def execute
        return if Engine.rows_written == configuration[:rows]
        raise "Rows written (#{Engine.rows_written}) does not match expected rows (#{configuration[:rows]})"
      end
    end
  end
end
|
data/lib/etl/screen.rb
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
# This source file contains the ETL::Screen module and requires all of the
|
2
|
+
# screens
|
3
|
+
|
4
|
+
module ETL #:nodoc:
  # The ETL::Screen module contains pre-built screens useful for checking the
  # ETL state during execution. Screens may be fatal, which will result in
  # termination of the ETL process, errors, which will result in the
  # termination of just the current ETL control file, or warnings, which will
  # result in a warning message.
  module Screen
  end
end

# Require every Ruby file in the screen/ directory next to this file. The glob
# result is sorted so that the load order is the same on every platform and
# Ruby version (Dir[] only sorts its results by default from Ruby 3.0 on).
Dir[File.join(File.dirname(__FILE__), "screen", "*.rb")].sort.each { |file| require(file) }
|
@@ -0,0 +1,13 @@
|
|
1
|
+
module ETL
  module Transform
    # Transform which delegates the work to a callable supplied in the
    # configuration.
    class BlockTransform < ETL::Transform::Transform
      # Initialize the transform.
      #
      # Configuration options:
      # * <tt>:block</tt>: The object whose <tt>call</tt> method performs the transformation
      def initialize(control, name, configuration)
        super(control, name, configuration)
        @block = configuration[:block]
      end

      # Transform the value by calling the configured block with the field
      # name, the value and the row, returning whatever the block returns.
      def transform(name, value, row)
        @block.call(name, value, row)
      end
    end
  end
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
module ETL #:nodoc:
  module Transform #:nodoc:
    # Transform which formats any value responding to strftime (for example
    # a Date or Time) as a String
    class DateToStringTransform < ETL::Transform::Transform
      # Initialize the transformer.
      #
      # Configuration options:
      # * <tt>:format</tt>: A format passed to strftime. Defaults to %Y-%m-%d
      def initialize(control, name, configuration={})
        super(control, name, configuration)
        @format = configuration[:format]
        @format ||= "%Y-%m-%d"
      end

      # Format the value with strftime. Values which do not respond to
      # strftime are returned unchanged.
      def transform(name, value, row)
        value.respond_to?(:strftime) ? value.strftime(@format) : value
      end
    end
  end
end
|
@@ -0,0 +1,51 @@
|
|
1
|
+
module ETL #:nodoc:
  module Transform #:nodoc:
    # Transform which decodes coded values using a decode table read from a
    # delimited text file. Each line of the file has the form
    # <tt>code<delimiter>value</tt>; a line with an empty code sets the
    # default value.
    class DecodeTransform < ETL::Transform::Transform
      # The path to the decode table file
      attr_accessor :decode_table_path

      # The delimiter separating code and value on each line
      attr_accessor :decode_table_delimiter

      # The value returned when a code is not present in the decode table
      attr_accessor :default_value

      # Initialize the transformer
      #
      # Configuration options:
      # * <tt>:decode_table_path</tt>: The path to the decode table, relative to
      #   the directory of the control file (defaults to 'decode.txt')
      # * <tt>:decode_table_delimiter</tt>: The decode table delimiter (defaults to ':')
      # * <tt>:default_value</tt>: The default value to use (defaults to 'No Value')
      def initialize(control, name, configuration={})
        super

        # Resolve the path locally instead of writing it back into the
        # configuration hash, so that a hash shared between several
        # transforms is not joined with the control directory twice.
        path = configuration[:decode_table_path]
        path = File.join(File.dirname(control.file), path) if path

        @decode_table_path = (path || 'decode.txt')
        @decode_table_delimiter = (configuration[:decode_table_delimiter] || ':')
        @default_value = (configuration[:default_value] || 'No Value')
      end

      # Transform the value by looking it up in the decode table. Returns the
      # default value if the code is not in the table.
      def transform(name, value, row)
        decode_table[value] || default_value
      end

      # Get the decode table, loading it from the file on first use. The table
      # is only memoized after the file has been read successfully.
      def decode_table
        @decode_table ||= load_decode_table
      end

      private

      # Read the decode file into a Hash of code => value. File.foreach closes
      # the file once iteration is finished.
      def load_decode_table
        table = {}
        File.foreach(decode_table_path) do |line|
          line = line.strip
          # A blank line carries neither a code nor a default; skipping it
          # keeps it from resetting the default value to nil.
          next if line.empty?
          code, value = line.split(decode_table_delimiter)
          if code && code.length > 0
            table[code] = value
          else
            # An empty code (line starting with the delimiter) sets the default
            @default_value = value
          end
        end
        table
      end
    end
  end
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
module ETL #:nodoc:
  module Transform #:nodoc:
    # Transform which will replace blank values with a specified value.
    # The superclass is spelled out in full, as in the other transforms, so
    # that it cannot be confused with the enclosing ETL::Transform module.
    class DefaultTransform < ETL::Transform::Transform
      # The value substituted for blank input
      attr_accessor :default_value

      # Initialize the transform
      #
      # Configuration options:
      # * <tt>:default_value</tt>: The default value to use if the incoming value is blank
      def initialize(control, name, configuration={})
        super
        @default_value = configuration[:default_value]
      end

      # Transform the value: returns the default value when the incoming
      # value is blank, otherwise the value itself.
      # NOTE(review): blank? is not core Ruby; presumably provided by
      # ActiveSupport -- confirm it is loaded before this transform runs.
      def transform(name, value, row)
        value.blank? ? default_value : value
      end
    end
  end
end
|
@@ -0,0 +1,169 @@
|
|
1
|
+
module ETL #:nodoc:
  module Transform #:nodoc:
    # Transform which looks up the value and replaces it with a foreign key reference
    class ForeignKeyLookupTransform < ETL::Transform::Transform
      # The resolver to use if the foreign key is not found in the collection
      attr_accessor :resolver

      # The default foreign key to use if none is found.
      attr_accessor :default

      # Initialize the foreign key lookup transform.
      #
      # Configuration options:
      # * <tt>:collection</tt>: A Hash of natural keys mapped to surrogate keys. If this is not specified then
      #   an empty Hash will be used. This Hash will be used to cache values that have been resolved already
      #   for future use.
      # * <tt>:resolver</tt>: Object or Class which implements the method resolve(value)
      # * <tt>:default</tt>: A default foreign key to use if no foreign key is found
      # * <tt>:cache</tt>: Whether to ask the resolver to preload its cache (defaults to true).
      #   Pass false to skip the preload.
      def initialize(control, name, configuration={})
        super

        @collection = (configuration[:collection] || {})
        @resolver = configuration[:resolver]
        @resolver = @resolver.new if @resolver.is_a?(Class)
        @default = configuration[:default]

        # Only a missing option falls back to true; an explicit false must be
        # honoured ("||= true" would turn it back into true). The option is
        # read without modifying the caller's configuration hash.
        cache_option = configuration[:cache]
        cache_enabled = cache_option.nil? ? true : cache_option
        if cache_enabled
          if resolver.respond_to?(:load_cache)
            resolver.load_cache
          else
            ETL::Engine.logger.info "#{resolver.class.name} does not support caching"
          end
        end
      end

      # Transform the value by resolving it to a foreign key.
      #
      # The collection is consulted first. On a miss the resolver is asked,
      # falling back to the default; the result is stored in the collection
      # for later rows. Raises ResolverError when there is no usable resolver
      # or when neither the resolver nor the default yields a key.
      def transform(name, value, row)
        fk = @collection[value]
        unless fk
          raise ResolverError, "Foreign key for #{value} not found and no resolver specified" unless resolver
          raise ResolverError, "Resolver does not appear to respond to resolve method" unless resolver.respond_to?(:resolve)
          fk = resolver.resolve(value)
          fk ||= @default
          raise ResolverError, "Unable to resolve #{value} to foreign key for #{name} in row #{ETL::Engine.rows_read}. You may want to specify a :default value." unless fk
          @collection[value] = fk
        end
        fk
      end
    end

    # Alias class name for the ForeignKeyLookupTransform.
    class FkLookupTransform < ForeignKeyLookupTransform; end
  end
end
|
53
|
+
|
54
|
+
# Resolver which finds a record through a finder method on an ActiveRecord
# class and answers with that record's id.
class ActiveRecordResolver
  # The ActiveRecord class to use
  attr_accessor :ar_class

  # The find method to use (as a symbol)
  attr_accessor :find_method

  # Initialize the resolver. The ar_class argument should extend from
  # ActiveRecord::Base. The find_method argument must be a symbol naming the
  # finder method to call. For example:
  #
  #   ActiveRecordResolver.new(Person, :find_by_name)
  #
  # The finder is called with exactly one argument, the value to resolve.
  def initialize(ar_class, find_method)
    @ar_class, @find_method = ar_class, find_method
  end

  # Resolve the value: returns the id of the record returned by the finder,
  # or nil when the finder returns nil.
  def resolve(value)
    record = ar_class.__send__(find_method, value)
    return nil if record.nil?
    record.id
  end
end
|
80
|
+
|
81
|
+
# Resolver which looks up the id of a record in a database table by matching
# one or more natural key fields, either with one query per lookup or from a
# cache preloaded with load_cache.
class SQLResolver
  # Initialize the SQL resolver. Use the given table and field name to search
  # for the appropriate foreign key. The field should be the name of a natural
  # key that is used to locate the surrogate key for the record; an Array of
  # field names may be given to match on several fields.
  #
  # The connection argument is optional. If specified it can be either a symbol
  # referencing a connection defined in the ETL database.yml file or an actual
  # ActiveRecord connection instance. If the connection is not specified then
  # the ActiveRecord::Base.connection will be used.
  def initialize(table, field, connection=nil)
    @table = table
    @field = field
    # Anything that responds to quote is treated as a connection instance;
    # anything else is handed to the engine as a connection name.
    @connection = (connection.respond_to?(:quote) ? connection : ETL::Engine.connection(connection)) if connection
    @connection ||= ActiveRecord::Base.connection
  end

  # Resolve the value (a single value, or an Array with one value per field)
  # to an id. After load_cache has been called only the cache is consulted,
  # and nil is returned for values that are not in it.
  def resolve(value)
    if @use_cache
      cache[cache_key(value)]
    else
      q = "SELECT id FROM #{table_name} WHERE #{wheres(value)}"
      @connection.select_value(q)
    end
  end

  # The table name as returned by ETL::Engine.table for this connection
  def table_name
    ETL::Engine.table(@table, @connection)
  end

  # The cache of natural key values mapped to ids
  def cache
    @cache ||= {}
  end

  # Switch the resolver to cached mode and load the id and natural key
  # field(s) of every record in the table into the cache.
  def load_cache
    @use_cache = true
    # Records are read with String keys (see record['id']), so field names
    # given as Symbols are converted before being used with values_at.
    columns = field.map { |column| column.to_s }
    q = "SELECT id, #{field.join(', ')} FROM #{table_name}"
    @connection.select_all(q).each do |record|
      cache[cache_key(record.values_at(*columns))] = record['id']
    end
  end

  private

  # The natural key field names, always as an Array
  def field
    @field = [ @field ] unless @field.kind_of?(Array)
    @field
  end

  # Build the cache key for a value. A single value is wrapped in an Array so
  # that the key used by resolve matches the key stored by load_cache, which
  # is always built from an Array of field values.
  def cache_key(value)
    value.kind_of?(Array) ? value : [ value ]
  end

  # Build the WHERE conditions matching each field against its quoted value
  def wheres(value)
    value = [ value ] unless value.kind_of?(Array)
    field.zip(value).collect { |a|
      "#{a[0]} = #{@connection.quote(a[1])}"
    }.join(' AND ')
  end
end
|
137
|
+
|
138
|
+
# Resolver which searches the rows of a file read with FasterCSV.read.
class FlatFileResolver
  # Initialize the flat file resolver. Expects a comma-delimited file.
  #
  # The match_index argument is the index of the column compared against the
  # value being resolved; result_field_index is the index of the column whose
  # content is returned for the first matching row.
  def initialize(file, match_index, result_field_index)
    @file, @match_index, @result_field_index = file, match_index, result_field_index
  end

  # Find the first row whose match column equals the given value exactly and
  # return that row's result column. Returns nil if no row matches.
  def resolve(value)
    hit = rows.detect { |candidate| candidate[@match_index] == value }
    hit ? hit[@result_field_index] : nil
  end

  protected

  # Get the rows from the file specified in the initializer. The file is read
  # on first use and the rows are kept for later calls.
  def rows
    @rows ||= FasterCSV.read(@file)
  end
end
|
@@ -0,0 +1,49 @@
|
|
1
|
+
module ETL #:nodoc:
  module Transform #:nodoc:
    # Transform which walks up the hierarchy tree to find a value if the current level's value
    # is nil.
    #
    # TODO: Let the resolver be implemented in a class so different resolution methods are
    # possible.
    class HierarchyLookupTransform < ETL::Transform::Transform
      # The name of the field to use for the parent ID
      attr_accessor :parent_id_field

      # The target connection name
      attr_accessor :target

      # Initialize the transform
      #
      # Configuration options:
      # * <tt>:target</tt>: The target connection name (required)
      # * <tt>:parent_id_field</tt>: The name of the field to use for the parent ID (defaults to :parent_id)
      def initialize(control, name, configuration={})
        super
        @parent_id_field = configuration[:parent_id_field] || :parent_id
        @target = configuration[:target]
      end

      # Transform the value.
      #
      # If the row has no parent ID the value is returned unchanged. Otherwise
      # the parent records are looked up one after another until one of them
      # has a value for the field, the top of the hierarchy is reached, or a
      # parent ID repeats (a cyclic hierarchy, which would otherwise never
      # terminate).
      #
      # NOTE(review): as before, the lookup runs whenever the row has a
      # parent ID, even if the incoming value is not nil -- confirm whether it
      # should be limited to nil values as the class comment suggests.
      def transform(name, value, row)
        parent_id = row[parent_id_field]
        return value unless parent_id

        # TODO: should use more than just the first source out of the control
        table = control.sources.first.configuration[:table]
        visited = []
        loop do
          visited << parent_id
          parent_id, value = lookup(name, table, parent_id, parent_id_field)
          break if value || parent_id.nil? || visited.include?(parent_id)
        end
        value
      end

      # Lookup the parent value. Returns the parent's own parent ID and the
      # parent's value for the field, or [nil, nil] if no record has the
      # given ID.
      def lookup(field, table, parent_id, parent_id_field)
        connection = ETL::Engine.connection(target)
        # Quote the ID through the connection rather than interpolating it raw
        q = "SELECT #{parent_id_field}, #{field} FROM #{table} WHERE id = #{connection.quote(parent_id)}"
        row = connection.select_one(q)
        return nil, nil unless row
        return row[parent_id_field.to_s], row[field.to_s]
      end
    end
  end
end
|
@@ -0,0 +1,12 @@
|
|
1
|
+
module ETL #:nodoc:
  module Transform #:nodoc:
    # Transform which turns a number into its ordinalized form by way of the
    # ordinalize core extension from ActiveSupport
    class OrdinalizeTransform < ETL::Transform::Transform
      # Return the result of calling ordinalize on the value. The field name
      # and row are not used.
      def transform(name, value, row)
        value.ordinalize
      end
    end
  end
end
|
@@ -0,0 +1,13 @@
|
|
1
|
+
require 'digest/sha1'
|
2
|
+
|
3
|
+
module ETL #:nodoc:
  module Transform #:nodoc:
    # Transform which replaces the original value with its SHA-1 digest
    class Sha1Transform < ETL::Transform::Transform
      # Return the hexadecimal SHA-1 digest of the value. The field name and
      # row are not used.
      def transform(name, value, row)
        Digest::SHA1.hexdigest(value)
      end
    end
  end
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
module ETL #:nodoc:
  module Transform #:nodoc:
    # Transform which converts the String representation of a date into a
    # Date instance
    class StringToDateTransform < ETL::Transform::Transform
      # Parse the value with Date.parse. A nil value is returned as is, and
      # so is any value for which parsing raises an error.
      def transform(name, value, row)
        value.nil? ? value : Date.parse(value)
      rescue
        value
      end
    end
  end
end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
module ETL #:nodoc:
  module Transform #:nodoc:
    # Transform which converts the String representation of a date into a
    # DateTime instance
    class StringToDateTimeTransform < ETL::Transform::Transform
      # Parse the value with DateTime.parse; nil stays nil.
      #
      # WARNING: This transform is slow (due to the Ruby implementation). Use
      # it when you need to parse timestamps before or after the values
      # supported by Time.parse.
      def transform(name, value, row)
        value.nil? ? nil : DateTime.parse(value)
      end
    end
  end
end
|
@@ -0,0 +1,11 @@
|
|
1
|
+
module ETL #:nodoc:
  module Transform #:nodoc:
    # Transform which converts the String representation of a date into a
    # Time instance
    class StringToTimeTransform < ETL::Transform::Transform
      # Parse the value with Time.parse; nil stays nil.
      def transform(name, value, row)
        value.nil? ? nil : Time.parse(value)
      end
    end
  end
end
|
@@ -0,0 +1,61 @@
|
|
1
|
+
module ETL#:nodoc:
  module Transform#:nodoc:
    # Base class for transforms.
    #
    # A transform converts one value to another value using some sort of algorithm.
    #
    # A simple transform has two arguments, the field to transform and the name of the transform:
    #
    #   transform :ssn, :sha1
    #
    # Transforms can also be blocks:
    #
    #   transform(:ssn){ |v| v[0,24] }
    #
    # Finally, a transform can include a configuration hash:
    #
    #   transform :sex, :decode, {:decode_table_path => 'delimited_decode.txt'}
    class Transform
      class << self
        # Run the value through each of the given transforms in order and
        # return the final result; the output of one transform is the input
        # of the next.
        #
        # Each transform is either a Proc, which is called with the single
        # Array argument <tt>[name, value, row]</tt>, or a Transform instance,
        # whose <tt>transform(name, value, row)</tt> method is called. Any
        # other object results in a ControlError being raised.
        #
        # The time spent in each transform is added to the benchmarks Hash
        # under the transform's class.
        def transform(name, value, row, transforms)
          transforms.each do |step|
            benchmarks[step.class] ||= 0
            benchmarks[step.class] += Benchmark.realtime do
              Engine.logger.debug "Transforming field #{name} with #{step.inspect}"
              value = case step
                      when Proc then step.call([name, value, row])
                      when Transform then step.transform(name, value, row)
                      else raise ControlError, "Unsupported transform configuration type: #{step}"
                      end
            end
          end
          value
        end

        # Hash of transform class => accumulated run time, as measured by
        # Benchmark.realtime
        def benchmarks
          @benchmarks ||= {}
        end
      end

      # The control object, field name and configuration hash given to the
      # constructor
      attr_reader :control, :name, :configuration

      # Initialize the transform object with the given control object, field
      # name and configuration hash
      def initialize(control, name, configuration={})
        @control, @name, @configuration = control, name, configuration
      end

      # Abstract; subclasses override this to return the transformed value.
      def transform(name, value, row)
        raise "transform is an abstract method"
      end
    end
  end
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
module ETL #:nodoc:
  module Transform #:nodoc:
    # Transform to trim string
    class TrimTransform < ETL::Transform::Transform
      # Initialize the transform.
      #
      # Configuration options:
      # * <tt>:type</tt>: :left, :right or :both. Default is :both
      def initialize(control, name, configuration={})
        super
        @type = (configuration[:type] || :both).to_sym
      end

      # Transform the value by stripping whitespace from the configured
      # side(s). A nil value is returned unchanged instead of raising
      # NoMethodError. Raises a RuntimeError if the configured type is not
      # one of :left, :right or :both.
      def transform(name, value, row)
        # The type is checked first so that a bad configuration is reported
        # even when the value is nil.
        trimmer = case @type
                  when :left then :lstrip
                  when :right then :rstrip
                  when :both then :strip
                  else raise "Trim type, if specified, must be :left, :right or :both"
                  end
        value.nil? ? value : value.__send__(trimmer)
      end
    end
  end
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
module ETL #:nodoc:
  module Transform #:nodoc:
    # Transform from one type to another
    class TypeTransform < ETL::Transform::Transform
      # Initialize the transformer.
      #
      # Configuration options:
      # * <tt>:type</tt>: The type to convert to. Supported types:
      # ** :string
      # ** :number,:integer
      # ** :float
      # ** :decimal
      # * <tt>:significant</tt>: The number of significant digits passed to
      #   BigDecimal for the :decimal type (defaults to 0)
      def initialize(control, name, configuration={})
        super
        @type = configuration[:type]
        # Read the option without writing the default back into the
        # caller's configuration hash.
        @significant = configuration[:significant] || 0
      end

      # Transform the value to the configured type. Raises a RuntimeError if
      # the configured type is not supported.
      def transform(name, value, row)
        case @type
        when :string
          value.to_s
        when :number, :integer
          value.to_i
        when :float
          value.to_f
        when :decimal
          # Kernel#BigDecimal is used because BigDecimal.new has been removed
          # from current releases of the bigdecimal library.
          BigDecimal(value.to_s, @significant)
        else
          raise "Unsupported type: #{@type}"
        end
      end
    end
  end
end
|
data/lib/etl/util.rb
ADDED
@@ -0,0 +1,59 @@
|
|
1
|
+
module ETL
  module Util
    # Return the distance of time in words from the given from_time to the
    # specified to_time, as days, hours, minutes and seconds, for example
    # "1 days, 2 hours, 5 minutes, 30 seconds". Days, hours and minutes are
    # left out when they are zero; seconds are always included. If to_time is
    # not specified then Time.now is used.
    def distance_of_time_in_words(from_time, to_time=Time.now)
      from_time = from_time.to_time if from_time.respond_to?(:to_time)
      to_time = to_time.to_time if to_time.respond_to?(:to_time)
      seconds = (to_time - from_time).round
      distance_in_days, seconds = seconds.divmod(60*60*24)
      distance_in_hours, seconds = seconds.divmod(60*60)
      distance_in_minutes, distance_in_seconds = seconds.divmod(60)

      # Joining the parts puts the same ", " separator after every unit (the
      # days part used to be followed by a bare comma).
      parts = []
      parts << "#{distance_in_days} days" if distance_in_days > 0
      parts << "#{distance_in_hours} hours" if distance_in_hours > 0
      parts << "#{distance_in_minutes} minutes" if distance_in_minutes > 0
      parts << "#{distance_in_seconds} seconds"
      parts.join(', ')
    end

    # Get the approximate distance of time in words from the given from_time
    # to the given to_time. If to_time is not specified then it is set
    # to Time.now. By default seconds are included...set the include_seconds
    # argument to false to disable the seconds.
    def approximate_distance_of_time_in_words(from_time, to_time=Time.now, include_seconds=true)
      from_time = from_time.to_time if from_time.respond_to?(:to_time)
      to_time = to_time.to_time if to_time.respond_to?(:to_time)
      distance_in_minutes = (((to_time - from_time).abs)/60).round
      distance_in_seconds = ((to_time - from_time).abs).round

      case distance_in_minutes
      when 0..1
        return (distance_in_minutes == 0) ? 'less than a minute' : '1 minute' unless include_seconds
        # Under two minutes the wording is chosen from the seconds
        case distance_in_seconds
        when 0..4   then 'less than 5 seconds'
        when 5..9   then 'less than 10 seconds'
        when 10..19 then 'less than 20 seconds'
        when 20..39 then 'half a minute'
        when 40..59 then 'less than a minute'
        else             '1 minute'
        end
      when 2..44           then "#{distance_in_minutes} minutes"
      when 45..89          then 'about 1 hour'
      when 90..1439        then "about #{(distance_in_minutes.to_f / 60.0).round} hours"
      when 1440..2879      then '1 day'
      when 2880..43199     then "#{(distance_in_minutes / 1440).round} days"
      when 43200..86399    then 'about 1 month'
      when 86400..525959   then "#{(distance_in_minutes / 43200).round} months"
      when 525960..1051919 then 'about 1 year'
      else                      "over #{(distance_in_minutes / 525960).round} years"
      end
    end
  end
end
|