factorylabs-activewarehouse-etl 0.9.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84) hide show
  1. data/CHANGELOG +198 -0
  2. data/LICENSE +7 -0
  3. data/README +85 -0
  4. data/Rakefile +153 -0
  5. data/TODO +28 -0
  6. data/bin/etl +28 -0
  7. data/bin/etl.cmd +8 -0
  8. data/examples/database.example.yml +16 -0
  9. data/lib/etl.rb +78 -0
  10. data/lib/etl/batch.rb +2 -0
  11. data/lib/etl/batch/batch.rb +111 -0
  12. data/lib/etl/batch/directives.rb +55 -0
  13. data/lib/etl/builder.rb +2 -0
  14. data/lib/etl/builder/date_dimension_builder.rb +96 -0
  15. data/lib/etl/builder/time_dimension_builder.rb +31 -0
  16. data/lib/etl/commands/etl.rb +89 -0
  17. data/lib/etl/control.rb +3 -0
  18. data/lib/etl/control/control.rb +405 -0
  19. data/lib/etl/control/destination.rb +420 -0
  20. data/lib/etl/control/destination/database_destination.rb +95 -0
  21. data/lib/etl/control/destination/file_destination.rb +124 -0
  22. data/lib/etl/control/source.rb +109 -0
  23. data/lib/etl/control/source/database_source.rb +220 -0
  24. data/lib/etl/control/source/enumerable_source.rb +11 -0
  25. data/lib/etl/control/source/file_source.rb +90 -0
  26. data/lib/etl/control/source/model_source.rb +39 -0
  27. data/lib/etl/core_ext.rb +1 -0
  28. data/lib/etl/core_ext/time.rb +5 -0
  29. data/lib/etl/core_ext/time/calculations.rb +42 -0
  30. data/lib/etl/engine.rb +556 -0
  31. data/lib/etl/execution.rb +20 -0
  32. data/lib/etl/execution/base.rb +9 -0
  33. data/lib/etl/execution/batch.rb +8 -0
  34. data/lib/etl/execution/job.rb +8 -0
  35. data/lib/etl/execution/migration.rb +85 -0
  36. data/lib/etl/execution/record.rb +18 -0
  37. data/lib/etl/generator.rb +2 -0
  38. data/lib/etl/generator/generator.rb +20 -0
  39. data/lib/etl/generator/surrogate_key_generator.rb +39 -0
  40. data/lib/etl/http_tools.rb +139 -0
  41. data/lib/etl/parser.rb +11 -0
  42. data/lib/etl/parser/apache_combined_log_parser.rb +49 -0
  43. data/lib/etl/parser/delimited_parser.rb +74 -0
  44. data/lib/etl/parser/fixed_width_parser.rb +65 -0
  45. data/lib/etl/parser/parser.rb +41 -0
  46. data/lib/etl/parser/sax_parser.rb +218 -0
  47. data/lib/etl/parser/xml_parser.rb +65 -0
  48. data/lib/etl/processor.rb +11 -0
  49. data/lib/etl/processor/block_processor.rb +14 -0
  50. data/lib/etl/processor/bulk_import_processor.rb +81 -0
  51. data/lib/etl/processor/check_exist_processor.rb +80 -0
  52. data/lib/etl/processor/check_unique_processor.rb +35 -0
  53. data/lib/etl/processor/copy_field_processor.rb +26 -0
  54. data/lib/etl/processor/encode_processor.rb +55 -0
  55. data/lib/etl/processor/hierarchy_exploder_processor.rb +55 -0
  56. data/lib/etl/processor/print_row_processor.rb +12 -0
  57. data/lib/etl/processor/processor.rb +25 -0
  58. data/lib/etl/processor/rename_processor.rb +24 -0
  59. data/lib/etl/processor/require_non_blank_processor.rb +26 -0
  60. data/lib/etl/processor/row_processor.rb +17 -0
  61. data/lib/etl/processor/sequence_processor.rb +23 -0
  62. data/lib/etl/processor/surrogate_key_processor.rb +53 -0
  63. data/lib/etl/processor/truncate_processor.rb +35 -0
  64. data/lib/etl/row.rb +20 -0
  65. data/lib/etl/screen.rb +14 -0
  66. data/lib/etl/screen/row_count_screen.rb +20 -0
  67. data/lib/etl/transform.rb +2 -0
  68. data/lib/etl/transform/block_transform.rb +13 -0
  69. data/lib/etl/transform/date_to_string_transform.rb +20 -0
  70. data/lib/etl/transform/decode_transform.rb +51 -0
  71. data/lib/etl/transform/default_transform.rb +20 -0
  72. data/lib/etl/transform/foreign_key_lookup_transform.rb +151 -0
  73. data/lib/etl/transform/hierarchy_lookup_transform.rb +49 -0
  74. data/lib/etl/transform/ordinalize_transform.rb +12 -0
  75. data/lib/etl/transform/sha1_transform.rb +13 -0
  76. data/lib/etl/transform/string_to_date_transform.rb +16 -0
  77. data/lib/etl/transform/string_to_datetime_transform.rb +14 -0
  78. data/lib/etl/transform/string_to_time_transform.rb +11 -0
  79. data/lib/etl/transform/transform.rb +61 -0
  80. data/lib/etl/transform/trim_transform.rb +26 -0
  81. data/lib/etl/transform/type_transform.rb +35 -0
  82. data/lib/etl/util.rb +59 -0
  83. data/lib/etl/version.rb +9 -0
  84. metadata +195 -0
@@ -0,0 +1,35 @@
1
module ETL #:nodoc:
  module Processor #:nodoc:
    # Processor that empties a table. Intended as a pre-processor so that a
    # subsequent load starts from a clean table.
    class TruncateProcessor < ETL::Processor::Processor
      # Name of the table to truncate
      attr_reader :table

      # Connection identifier passed to ETL::Engine.connection
      attr_reader :target

      # Build the processor from its configuration.
      #
      # Options:
      # * <tt>:target</tt>: The target connection (an empty Hash when omitted)
      # * <tt>:table</tt>: The table name
      def initialize(control, configuration)
        super(control, configuration)
        @table = configuration[:table]
        @target = configuration[:target] || {}
      end

      # Truncate the configured table on the target connection.
      def process
        ETL::Engine.connection(target).truncate(table_name)
      end

      private

      # The table name as resolved by ETL::Engine.table for the target connection.
      def table_name
        connection = ETL::Engine.connection(target)
        ETL::Engine.table(table, connection)
      end
    end
  end
end
data/lib/etl/row.rb ADDED
@@ -0,0 +1,20 @@
1
+ # This source file contains the ETL::Row class.
2
+
3
module ETL #:nodoc:
  # This class represents a single row currently passing through the ETL pipeline.
  # It is a Hash (field name => value) that additionally remembers the source it
  # came from and the kind of change it represents.
  class Row < Hash
    # All change types. Frozen so the shared constant cannot be mutated by accident.
    CHANGE_TYPES = [:insert, :update, :delete].freeze

    # Accessor for the originating source
    attr_accessor :source

    # Writer for the row's change type. Only the writer is generated here because
    # the reader below supplies the default; generating both and then redefining
    # the reader triggers a "method redefined" warning.
    attr_writer :change_type

    # Get the change type, defaults to :insert
    def change_type
      @change_type ||= :insert
    end
  end
end
data/lib/etl/screen.rb ADDED
@@ -0,0 +1,14 @@
1
+ # This source file contains the ETL::Screen module and requires all of the
2
+ # screens
3
+
4
module ETL #:nodoc:
  # The ETL::Screen module contains pre-built screens useful for checking the
  # ETL state during execution. Screens may be fatal, which will result in
  # termination of the ETL process; errors, which will result in the
  # termination of just the current ETL control file; or warnings, which will
  # result in a warning message.
  module Screen
  end
end

# Require every screen implementation that sits next to this file. The list is
# sorted because Dir[] does not guarantee an order on every Ruby/platform, and
# a stable load order keeps start-up reproducible.
Dir[File.dirname(__FILE__) + "/screen/*.rb"].sort.each { |file| require(file) }
@@ -0,0 +1,20 @@
1
module ETL
  module Screen
    # Screen which compares Engine.rows_written with the expected count given
    # in configuration[:rows]. The check runs as soon as the screen is
    # constructed and raises when the two numbers differ.
    class RowCountScreen
      attr_accessor :control, :configuration

      # Store the control and configuration, then run the check immediately.
      def initialize(control, configuration={})
        @control = control
        @configuration = configuration
        execute
      end

      # Raise unless the number of rows written equals configuration[:rows].
      def execute
        return if Engine.rows_written == configuration[:rows]
        raise "Rows written (#{Engine.rows_written}) does not match expected rows (#{configuration[:rows]})"
      end
    end
  end
end
@@ -0,0 +1,2 @@
1
# The base class must be loaded first because every transform subclasses it.
require 'etl/transform/transform'
# Load the individual transforms in sorted order; Dir[] alone does not
# guarantee an order on every Ruby/platform.
Dir[File.dirname(__FILE__) + "/transform/*.rb"].sort.each { |file| require(file) }
@@ -0,0 +1,13 @@
1
module ETL
  module Transform
    # Transform that delegates to a callable supplied in the configuration.
    class BlockTransform < ETL::Transform::Transform
      # Configuration options:
      # * <tt>:block</tt>: Object responding to call(name, value, row)
      def initialize(control, name, configuration)
        super(control, name, configuration)
        @block = configuration[:block]
      end

      # Invoke the configured block and return whatever it returns.
      def transform(name, value, row)
        @block.call(name, value, row)
      end
    end
  end
end
@@ -0,0 +1,20 @@
1
module ETL #:nodoc:
  module Transform #:nodoc:
    # Formats a Date or Time (anything responding to strftime) as a String.
    class DateToStringTransform < ETL::Transform::Transform
      # Set up the transform.
      #
      # Configuration options:
      # * <tt>:format</tt>: A format passed to strftime. Defaults to %Y-%m-%d
      def initialize(control, name, configuration={})
        super(control, name, configuration)
        @format = configuration[:format] || "%Y-%m-%d"
      end

      # Format the value with strftime; values that do not respond to strftime
      # are returned unchanged.
      def transform(name, value, row)
        value.respond_to?(:strftime) ? value.strftime(@format) : value
      end
    end
  end
end
@@ -0,0 +1,51 @@
1
module ETL #:nodoc:
  module Transform #:nodoc:
    # Transform which decodes coded values using a delimited lookup file.
    #
    # Each line of the decode table has the form <tt>code<delimiter>value</tt>.
    # A line whose code part is empty supplies the default value.
    class DecodeTransform < ETL::Transform::Transform
      # Path of the decode table file
      attr_accessor :decode_table_path

      # Delimiter separating code and value on each line
      attr_accessor :decode_table_delimiter

      # Value returned when a code is not present in the decode table
      attr_accessor :default_value

      # Initialize the transformer
      #
      # Configuration options:
      # * <tt>:decode_table_path</tt>: The path to the decode table (defaults to 'decode.txt').
      #   When given, it is resolved relative to the directory of the control file.
      # * <tt>:decode_table_delimiter</tt>: The decode table delimiter (defaults to ':')
      # * <tt>:default_value</tt>: The default value to use (defaults to 'No Value')
      def initialize(control, name, configuration={})
        super

        # NOTE: this rewrites the entry in the caller's configuration hash, as
        # the original implementation did.
        if configuration[:decode_table_path]
          configuration[:decode_table_path] = File.join(File.dirname(control.file), configuration[:decode_table_path])
        end

        @decode_table_path = (configuration[:decode_table_path] || 'decode.txt')
        @decode_table_delimiter = (configuration[:decode_table_delimiter] || ':')
        @default_value = (configuration[:default_value] || 'No Value')
      end

      # Transform the value: the decoded value, or the default when the code is unknown
      def transform(name, value, row)
        decode_table[value] || default_value
      end

      # Get the decode table, loading it from decode_table_path on first use.
      def decode_table
        unless @decode_table
          @decode_table = {}
          # File.foreach closes the file when iteration finishes; the previous
          # open(...).each left the handle open.
          File.foreach(decode_table_path) do |line|
            code, value = line.strip.split(decode_table_delimiter)
            if code && code.length > 0
              @decode_table[code] = value
            elsif !value.nil?
              # Only a line that actually carries a value may replace the
              # default; blank lines used to reset it to nil.
              @default_value = value
            end
          end
        end
        @decode_table
      end
    end
  end
end
@@ -0,0 +1,20 @@
1
module ETL #:nodoc:
  module Transform #:nodoc:
    # Substitutes a configured value whenever the incoming value is blank.
    class DefaultTransform < Transform
      attr_accessor :default_value

      # Set up the transform
      #
      # Configuration options:
      # * <tt>:default_value</tt>: The value returned in place of a blank incoming value
      def initialize(control, name, configuration)
        super(control, name, configuration)
        @default_value = configuration[:default_value]
      end

      # Return the default for blank values, otherwise the value itself
      def transform(name, value, row)
        return default_value if value.blank?
        value
      end
    end
  end
end
@@ -0,0 +1,151 @@
1
module ETL #:nodoc:
  module Transform #:nodoc:
    # Transform which looks up the value and replaces it with a foreign key reference
    class ForeignKeyLookupTransform < ETL::Transform::Transform
      # The resolver to use if the foreign key is not found in the collection
      attr_accessor :resolver

      # The default foreign key to use if none is found.
      attr_accessor :default

      # Initialize the foreign key lookup transform.
      #
      # Configuration options:
      # *<tt>:collection</tt>: A Hash of natural keys mapped to surrogate keys. If this is not specified then
      #  an empty Hash will be used. This Hash will be used to cache values that have been resolved already
      #  for future use.
      # *<tt>:resolver</tt>: Object or Class which implements the method resolve(value)
      # *<tt>:default</tt>: A default foreign key to use if no foreign key is found
      # *<tt>:cache</tt>: When true (the default) and the resolver responds to load_cache, the resolver's
      #  cache is preloaded. Pass false to skip preloading.
      def initialize(control, name, configuration={})
        super

        @collection = (configuration[:collection] || {})
        @resolver = configuration[:resolver]
        @resolver = @resolver.new if @resolver.is_a?(Class)
        @default = configuration[:default]

        # Default to caching only when the option was not supplied. The former
        # `configuration[:cache] ||= true` also overwrote an explicit false, so
        # caching could never be switched off.
        configuration[:cache] = true if configuration[:cache].nil?
        preload_resolver_cache if configuration[:cache]
      end

      # Transform the value by resolving it to a foreign key.
      #
      # Looks in the collection first, then asks the resolver, then falls back
      # to the default. Resolved keys are stored in the collection for reuse.
      # Raises ResolverError when there is no usable resolver or no key can be
      # determined.
      def transform(name, value, row)
        fk = @collection[value]
        unless fk
          raise ResolverError, "Foreign key for #{value} not found and no resolver specified" unless resolver
          raise ResolverError, "Resolver does not appear to respond to resolve method" unless resolver.respond_to?(:resolve)
          fk = resolver.resolve(value)
          fk ||= @default
          raise ResolverError, "Unable to resolve #{value} to foreign key for #{name} in row #{ETL::Engine.rows_read}. You may want to specify a :default value." unless fk
          @collection[value] = fk
        end
        fk
      end

      private

      # Ask the resolver to preload its cache, or log that it cannot.
      def preload_resolver_cache
        # Without a resolver there is nothing to preload (and nothing useful to log).
        return if resolver.nil?

        if resolver.respond_to?(:load_cache)
          resolver.load_cache
        else
          ETL::Engine.logger.info "#{resolver.class.name} does not support caching"
        end
      end
    end
    # Alias class name for the ForeignKeyLookupTransform.
    class FkLookupTransform < ForeignKeyLookupTransform; end
  end
end
53
+
54
+ # Resolver which resolves using ActiveRecord.
55
class ActiveRecordResolver
  # The ActiveRecord class to use
  attr_accessor :ar_class

  # The find method to use (as a symbol)
  attr_accessor :find_method

  # Create a resolver for the given class and finder. The ar_class argument
  # should extend from ActiveRecord::Base and find_method must be the symbol
  # of a finder taking exactly one argument. For example:
  #
  #   ActiveRecordResolver.new(Person, :find_by_name)
  def initialize(ar_class, find_method)
    @find_method = find_method
    @ar_class = ar_class
  end

  # Resolve the value: call the finder with it and return the id of the record
  # found, or nil when the finder returns nil.
  def resolve(value)
    record = ar_class.__send__(find_method, value)
    record.id unless record.nil?
  end
end
80
+
81
# Resolver which looks up surrogate keys with SQL, optionally from a preloaded cache.
class SQLResolver
  # Initialize the SQL resolver. Use the given table and field name to search
  # for the appropriate foreign key. The field should be the name of a natural
  # key that is used to locate the surrogate key for the record.
  #
  # The connection argument is optional. If specified it can be either a symbol
  # referencing a connection defined in the ETL database.yml file or an actual
  # ActiveRecord connection instance. If the connection is not specified then
  # the ActiveRecord::Base.connection will be used.
  def initialize(table, field, connection=nil)
    @table = table
    @field = field
    # Anything responding to quote is treated as a ready-made connection;
    # everything else is looked up by name through the engine.
    @connection = (connection.respond_to?(:quote) ? connection : ETL::Engine.connection(connection)) if connection
    @connection ||= ActiveRecord::Base.connection
  end

  # Resolve the value to an id. After load_cache has been called only the
  # cache is consulted (a miss returns nil); otherwise a query is executed.
  def resolve(value)
    if @use_cache
      cache[value]
    else
      q = "SELECT id FROM #{table_name} WHERE #{@field} = #{@connection.quote(value)}"
      ETL::Engine.logger.debug("Executing query: #{q}")
      @connection.select_value(q)
    end
  end

  # The table name as resolved by ETL::Engine.table for this connection.
  def table_name
    ETL::Engine.table(@table, @connection)
  end

  # Natural key => id cache, filled by load_cache.
  def cache
    @cache ||= {}
  end

  # Read every id/natural key pair of the table into the cache and switch
  # resolve over to cache lookups.
  def load_cache
    @use_cache = true
    q = "SELECT id, #{@field} FROM #{table_name}"
    # Result rows are indexed by String column names (as 'id' is below), so a
    # field passed as a Symbol must be converted or every cache key would be nil.
    column = @field.to_s
    @connection.select_all(q).each do |record|
      cache[record[column]] = record['id']
    end
  end
end
119
+
120
class FlatFileResolver
  # Set up the flat file resolver for a comma-delimited file.
  #
  # * file: path of the file to read
  # * match_index: index of the column compared (exactly) with the value to resolve
  # * result_field_index: index of the column returned from the first matching row
  def initialize(file, match_index, result_field_index)
    @file = file
    @match_index = match_index
    @result_field_index = result_field_index
  end

  # Return the result column of the first row whose match column equals the
  # given value, or nil when no row matches.
  def resolve(value)
    hit = rows.find { |candidate| candidate[@match_index] == value }
    hit ? hit[@result_field_index] : nil
  end

  protected

  # The rows of the file given to the initializer, read once and memoized.
  def rows
    @rows ||= FasterCSV.read(@file)
  end
end
@@ -0,0 +1,49 @@
1
module ETL #:nodoc:
  module Transform #:nodoc:
    # Transform which walks up the hierarchy tree, following the parent ID
    # field, until it finds an ancestor with a value for the field.
    #
    # NOTE(review): when the row has a parent ID the first lookup always runs
    # and replaces the incoming value, even if that value is not nil. Confirm
    # whether the lookup was meant to happen only for nil values.
    #
    # TODO: Let the resolver be implemented in a class so different resolution methods are
    # possible.
    class HierarchyLookupTransform < ETL::Transform::Transform
      # The name of the field to use for the parent ID
      attr_accessor :parent_id_field

      # The target connection name
      attr_accessor :target

      # Initialize the transform
      #
      # Configuration options:
      # * <tt>:target</tt>: The target connection name (required)
      # * <tt>:parent_id_field</tt>: The name of the field to use for the parent ID (defaults to :parent_id)
      def initialize(control, name, configuration={})
        super
        @parent_id_field = configuration[:parent_id_field] || :parent_id
        @target = configuration[:target]
      end

      # Transform the value. Rows without a parent ID keep their value;
      # otherwise ancestors are looked up until one yields a value or the chain
      # of parents ends.
      def transform(name, value, row)
        parent_id = row[parent_id_field]
        return value unless parent_id

        # TODO: should use more than just the first source out of the control
        table = control.sources.first.configuration[:table]
        loop do
          parent_id, value = lookup(name, table, parent_id, parent_id_field)
          break if value || parent_id.nil?
        end
        value
      end

      # Lookup the parent value. Returns the pair [parent's parent ID, parent's
      # field value], or [nil, nil] when no row with the given ID exists.
      def lookup(field, table, parent_id, parent_id_field)
        connection = ETL::Engine.connection(target)
        # The ID originates from row data, so quote it rather than interpolating it raw.
        q = "SELECT #{parent_id_field}, #{field} FROM #{table} WHERE id = #{connection.quote(parent_id)}"
        row = connection.select_one(q)
        # A dangling parent reference ends the walk instead of raising on nil.
        return [nil, nil] if row.nil?
        return row[parent_id_field.to_s], row[field.to_s]
      end
    end
  end
end
@@ -0,0 +1,12 @@
1
module ETL #:nodoc:
  module Transform #:nodoc:
    # Transform a number to an ordinalized version using the ActiveSupport ordinalize
    # core extension
    class OrdinalizeTransform < ETL::Transform::Transform
      # Transform the value from a number to an ordinalized number. Values that
      # do not respond to ordinalize (for example nil) are returned unchanged
      # instead of raising NoMethodError.
      def transform(name, value, row)
        return value unless value.respond_to?(:ordinalize)
        value.ordinalize
      end
    end
  end
end
@@ -0,0 +1,13 @@
1
+ require 'digest/sha1'
2
+
3
module ETL #:nodoc:
  module Transform #:nodoc:
    # Transform which hashes the original value with a SHA-1 hash algorithm
    class Sha1Transform < ETL::Transform::Transform
      # Transform the value with a SHA1 digest algorithm and return the hex digest.
      #
      # nil is passed through unchanged, and other values are converted with
      # to_s first, because Digest::SHA1.hexdigest raises TypeError for anything
      # that is not a String.
      def transform(name, value, row)
        return value if value.nil?
        Digest::SHA1.hexdigest(value.to_s)
      end
    end
  end
end
@@ -0,0 +1,16 @@
1
module ETL #:nodoc:
  module Transform #:nodoc:
    # Converts a String representation of a date into a Date instance
    class StringToDateTransform < ETL::Transform::Transform
      # Parse the value with Date.parse. nil is returned as is, and so is any
      # value for which Date.parse raises a StandardError.
      def transform(name, value, row)
        value.nil? ? value : Date.parse(value)
      rescue StandardError
        value
      end
    end
  end
end