activewarehouse-etl 0.6.1 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. data/CHANGELOG +29 -1
  2. data/LICENSE +7 -0
  3. data/README +58 -12
  4. data/Rakefile +2 -1
  5. data/lib/etl.rb +3 -0
  6. data/lib/etl/commands/etl.rb +35 -1
  7. data/lib/etl/control/control.rb +20 -9
  8. data/lib/etl/control/destination.rb +173 -12
  9. data/lib/etl/control/destination/database_destination.rb +2 -2
  10. data/lib/etl/control/destination/file_destination.rb +25 -2
  11. data/lib/etl/control/source.rb +29 -8
  12. data/lib/etl/control/source/database_source.rb +109 -24
  13. data/lib/etl/control/source/file_source.rb +29 -16
  14. data/lib/etl/engine.rb +164 -63
  15. data/lib/etl/execution.rb +19 -0
  16. data/lib/etl/execution/base.rb +9 -0
  17. data/lib/etl/execution/job.rb +7 -0
  18. data/lib/etl/execution/migration.rb +54 -0
  19. data/lib/etl/execution/record.rb +8 -0
  20. data/lib/etl/generator/surrogate_key_generator.rb +2 -0
  21. data/lib/etl/parser.rb +9 -0
  22. data/lib/etl/parser/parser.rb +5 -2
  23. data/lib/etl/parser/sax_parser.rb +22 -6
  24. data/lib/etl/processor.rb +8 -0
  25. data/lib/etl/processor/bulk_import_processor.rb +32 -4
  26. data/lib/etl/processor/check_exist_processor.rb +69 -0
  27. data/lib/etl/processor/check_unique_processor.rb +35 -0
  28. data/lib/etl/processor/copy_field_processor.rb +20 -4
  29. data/lib/etl/processor/processor.rb +3 -0
  30. data/lib/etl/processor/rename_processor.rb +24 -0
  31. data/lib/etl/processor/row_processor.rb +1 -1
  32. data/lib/etl/processor/sequence_processor.rb +23 -0
  33. data/lib/etl/processor/surrogate_key_processor.rb +31 -0
  34. data/lib/etl/processor/truncate_processor.rb +0 -2
  35. data/lib/etl/row.rb +17 -0
  36. data/lib/etl/screen/row_count_screen.rb +15 -0
  37. data/lib/etl/transform/block_transform.rb +13 -0
  38. data/lib/etl/transform/date_to_string_transform.rb +1 -1
  39. data/lib/etl/transform/decode_transform.rb +1 -1
  40. data/lib/etl/transform/default_transform.rb +6 -1
  41. data/lib/etl/transform/foreign_key_lookup_transform.rb +1 -1
  42. data/lib/etl/transform/hierarchy_lookup_transform.rb +1 -1
  43. data/lib/etl/transform/ordinalize_transform.rb +12 -0
  44. data/lib/etl/transform/sha1_transform.rb +0 -3
  45. data/lib/etl/transform/string_to_date_transform.rb +0 -3
  46. data/lib/etl/transform/string_to_datetime_transform.rb +0 -3
  47. data/lib/etl/transform/string_to_time_transform.rb +0 -3
  48. data/lib/etl/transform/transform.rb +20 -11
  49. data/lib/etl/transform/trim_transform.rb +26 -0
  50. data/lib/etl/transform/type_transform.rb +9 -1
  51. data/lib/etl/version.rb +2 -2
  52. metadata +21 -3
@@ -0,0 +1,19 @@
1
module ETL #:nodoc:
  # Classes which store information about ETL execution
  module Execution
    # Entry point for managing the ETL execution data store.
    class Execution
      # Migrate the execution data store by delegating to
      # ETL::Execution::Migration.migrate. Returns whatever that call returns.
      def self.migrate
        ETL::Execution::Migration.migrate
      end
    end
  end
end
15
+
16
+ require 'etl/execution/base'
17
+ require 'etl/execution/job'
18
+ require 'etl/execution/record'
19
+ require 'etl/execution/migration'
@@ -0,0 +1,9 @@
1
module ETL #:nodoc:
  module Execution #:nodoc:
    # Common parent for the models that persist ETL execution information.
    # It is declared abstract, so it has no table of its own, and it opens
    # the connection named :etl_execution for itself and its subclasses.
    class Base < ActiveRecord::Base
      self.abstract_class = true
      establish_connection :etl_execution
    end
  end
end
@@ -0,0 +1,7 @@
1
module ETL #:nodoc:
  module Execution #:nodoc:
    # Persistent model for one ETL job. All behavior is inherited from
    # ETL::Execution::Base; nothing is added here.
    class Job < Base
    end
  end
end
@@ -0,0 +1,54 @@
1
module ETL #:nodoc:
  module Execution #:nodoc:
    # Handles migration of the tables required for persistent storage of
    # meta data for the ETL engine.
    class Migration
      class << self
        # Run every pending migration. The current version is read from the
        # schema info table (a missing value counts as 0) and each
        # migration_N method from current + 1 up to +target+ is invoked in
        # ascending order.
        def migrate
          connection.initialize_schema_information
          current = connection.select_value("SELECT version FROM #{schema_info_table_name}").to_i
          (current + 1).upto(target) { |number| __send__(:"migration_#{number}") }
        end

        protected

        # Name of the schema info table, wrapped in the table name prefix and
        # suffix configured on ETL::Execution::Base.
        def schema_info_table_name
          base = ETL::Execution::Base
          base.table_name_prefix + "schema_info" + base.table_name_suffix
        end

        # The connection used during migration. It is taken from
        # ETL::Execution::Base on first use and memoized afterwards.
        def connection
          @connection ||= ETL::Execution::Base.connection
        end

        # The final version number that +migrate+ brings the store up to.
        def target
          1
        end

        private

        # Version 1: create the jobs and records tables, then record the
        # new schema version.
        def migration_1 #:nodoc:
          connection.create_table :jobs do |jobs|
            jobs.column :control_file, :string, :null => false
            jobs.column :created_at, :datetime, :null => false
            jobs.column :completed_at, :datetime
            jobs.column :status, :string
          end
          connection.create_table :records do |records|
            records.column :control_file, :string, :null => false
            records.column :natural_key, :string, :null => false
            records.column :crc, :string, :null => false
            records.column :job_id, :integer, :null => false
          end
          update_schema_info(1)
        end

        # Store +version+ as the current version in the schema info table.
        def update_schema_info(version)
          connection.update("UPDATE #{schema_info_table_name} SET version = #{version}")
        end
      end
    end
  end
end
@@ -0,0 +1,8 @@
1
module ETL #:nodoc:
  module Execution #:nodoc:
    # Represents a single record
    class Record < ETL::Execution::Base
      # NOTE(review): the records table created by
      # ETL::Execution::Migration has no table_id column and no Table model
      # is visible, so this association looks unusable. It is kept so that
      # the class interface is unchanged -- confirm before removing.
      belongs_to :table

      # The job that handled this record, resolved through the job_id column
      # that ETL::Execution::Migration creates on the records table.
      belongs_to :job
    end
  end
end
@@ -1,3 +1,5 @@
1
+ # This source file contains code for a basic sequential surrogate key generator
2
+
1
3
  module ETL #:nodoc:
2
4
  module Generator #:nodoc:
3
5
  # Surrogate key generator.
data/lib/etl/parser.rb CHANGED
@@ -1,2 +1,11 @@
1
+ # This source file contains the ETL::Parser module and requires all of the files
2
+ # in the parser directory ending with .rb
3
+
4
+ module ETL #:nodoc:
5
+ # The ETL::Parser module provides various text parsers.
6
+ module Parser
7
+ end
8
+ end
9
+
1
10
  require 'etl/parser/parser'
2
11
  Dir[File.dirname(__FILE__) + "/parser/*.rb"].each { |file| require(file) }
@@ -1,5 +1,8 @@
1
- module ETL
2
- module Parser
1
+ module ETL #:nodoc:
2
+ module Parser #:nodoc:
3
+ # Base parser class. Implementation classes must extend this class and implement
4
+ # the each method. The each method should return each row of the source data as
5
+ # a Hash.
3
6
  class Parser
4
7
  include Enumerable
5
8
  class << self
@@ -29,6 +29,7 @@ module ETL #:nodoc:
29
29
  end
30
30
  end
31
31
 
32
+ # Get an array of Field objects
32
33
  def fields
33
34
  @fields ||= []
34
35
  end
@@ -44,16 +45,21 @@ module ETL #:nodoc:
44
45
  end
45
46
  end
46
47
 
48
+ # Class representing a field to be loaded from the source
47
49
  class Field
48
- attr_reader :name, :path
49
- def initialize(name, path)
50
+ # The name of the field
51
+ attr_reader :name
52
+ # The XPath-like path to the field in the XML document
53
+ attr_reader :path
54
+
55
+ def initialize(name, path) #:nodoc
50
56
  @name = name
51
57
  @path = path
52
58
  end
53
59
  end
54
60
  end
55
61
 
56
- class Listener
62
+ class Listener #:nodoc:
57
63
  include REXML::SAX2Listener
58
64
  def initialize(parser, &block)
59
65
  @parser = parser
@@ -120,19 +126,28 @@ module ETL #:nodoc:
120
126
  end
121
127
  end
122
128
 
123
- module XPath
124
- class Path
129
+ # Module which contains classes that are used for XPath-like filtering
130
+ # on the SAX parser
131
+ module XPath #:nodoc:
132
+ class Path #:nodoc:
133
+ # Get the elements in the path
125
134
  attr_accessor :elements
135
+
136
+ # Initialize
126
137
  def initialize
127
138
  @elements = []
128
139
  end
140
+
141
+ # Convert to a string representation
129
142
  def to_s
130
143
  @elements.map{ |e| e.to_s }.join("/")
131
144
  end
145
+
132
146
  # Returns true if the last part of the path refers to an attribute
133
147
  def is_attribute?
134
148
  elements.last.attributes.length > 0
135
149
  end
150
+
136
151
  # Return the name of the attribute referenced by the last element in this path. Returns nil if the last element
137
152
  # does not reference an attribute.
138
153
  #
@@ -142,6 +157,7 @@ module ETL #:nodoc:
142
157
  return nil unless is_attribute?
143
158
  elements.last.attributes.keys.first
144
159
  end
160
+
145
161
  # Return true if this XPath::Path matches the given path string. This is a fail-fast match, so the first mismatch
146
162
  # will cause the method to return false.
147
163
  def match?(s)
@@ -178,7 +194,7 @@ module ETL #:nodoc:
178
194
  path
179
195
  end
180
196
  end
181
- class Element
197
+ class Element #:nodoc
182
198
  attr_reader :name
183
199
  attr_reader :attributes
184
200
  def initialize(name, attributes={})
data/lib/etl/processor.rb CHANGED
@@ -1,3 +1,11 @@
1
+ # This source file contains the ETL::Processor module and requires all of the processors
2
+
3
+ module ETL #:nodoc:
4
+ # The ETL::Processor module contains row-level and bulk processors
5
+ module Processor
6
+ end
7
+ end
8
+
1
9
  require 'etl/processor/processor'
2
10
  require 'etl/processor/row_processor'
3
11
  Dir[File.dirname(__FILE__) + "/processor/*.rb"].each { |file| require(file) }
@@ -1,11 +1,35 @@
1
1
  module ETL #:nodoc:
2
2
  module Processor #:nodoc:
3
- # Processor which is used to bulk import data into a target database
3
+ # Processor which is used to bulk import data into a target database. The
4
+ # underlying database driver from ActiveRecord must support the
5
+ # +bulk_load+ method.
4
6
  class BulkImportProcessor < ETL::Processor::Processor
5
- attr_reader :file, :target, :truncate, :columns
7
+ # The file to load from
8
+ attr_reader :file
9
+ # The target database information (see +initialize+)
10
+ attr_reader :target
11
+ # Set to true to truncate
12
+ attr_reader :truncate
13
+ # Array of symbols representing the column load order
14
+ attr_reader :columns
15
+ # The field separator (defaults to a comma)
6
16
  attr_accessor :field_separator
17
+ # The field enclosure (defaults to nil)
7
18
  attr_accessor :field_enclosure
19
+ # The line separator (defaults to a newline)
8
20
  attr_accessor :line_separator
21
+
22
+ # Initialize the processor.
23
+ #
24
+ # Configuration options:
25
+ # * <tt>:file</tt>: The file to load data from
26
+ # * <tt>:target</tt>: The target connection information
27
+ # * <tt>:truncate</tt>: Set to true to truncate before loading
28
+ # * <tt>:columns</tt>: The columns to load in the order they appear in
29
+ # the bulk data file
30
+ # * <tt>:field_separator</tt>: The field separator. Defaults to a comma
31
+ # * <tt>:line_separator</tt>: The line separator. Defaults to a newline
32
+ # * <tt>:field_enclosure</tt>: The field enclosure characters
9
33
  def initialize(control, configuration)
10
34
  super
11
35
  @file = File.join(File.dirname(control.file), configuration[:file])
@@ -13,12 +37,15 @@ module ETL #:nodoc:
13
37
  @truncate = configuration[:truncate] ||= false
14
38
  @columns = configuration[:columns]
15
39
  @field_separator = (configuration[:field_separator] || ',')
16
- @line_separator = configuration[:line_separator]
40
+ @line_separator = (configuration[:line_separator] || "\n")
17
41
  @field_enclosure = configuration[:field_enclosure]
18
42
  connect
19
43
  end
44
+
45
+ # Execute the processor
20
46
  def process
21
- # columns = control.destinations.first.order.join(',') # TODO: support multiple destinations?
47
+ return if ETL::Engine.skip_bulk_import
48
+
22
49
  conn = ETL::ActiveRecord::Base.connection
23
50
  conn.transaction do
24
51
  # TODO: Support all database types
@@ -30,6 +57,7 @@ module ETL #:nodoc:
30
57
  options[:fields] = {}
31
58
  options[:fields][:delimited_by] = field_separator if field_separator
32
59
  options[:fields][:enclosed_by] = field_enclosure if field_enclosure
60
+ options[:fields][:terminated_by] = line_separator if line_separator
33
61
  end
34
62
  conn.bulk_load(file, target[:table], options)
35
63
  end
@@ -0,0 +1,69 @@
1
module ETL #:nodoc:
  module Processor #:nodoc:
    # A row-level processor that checks if the row already exists in the
    # target table. Rows that are found are dropped (+process+ returns nil);
    # rows that are not found are passed through unchanged.
    class CheckExistProcessor < ETL::Processor::RowProcessor
      # A symbol or array of symbols representing keys that should be skipped
      attr_accessor :skip

      # The name of the table to check against
      attr_accessor :table

      # An array of columns representing the natural key
      attr_accessor :columns

      # Is set to true if the processor should execute the check. If there are
      # no rows in the target table then this should return false.
      attr_accessor :should_check

      # Initialize the processor
      # Configuration options:
      # * <tt>:skip</tt>: A symbol or array of column names that should not
      #   be checked
      # * <tt>:table</tt>: The table name
      # * <tt>:columns</tt>: An array of columns which represent the natural
      #   key
      #
      # The target table is counted once here; when it is empty no per-row
      # queries are issued later.
      def initialize(control, configuration)
        super
        @skip = configuration[:skip]
        @table = configuration[:table]
        @columns = configuration[:columns]

        q = "SELECT COUNT(*) FROM #{table}"
        @should_check = ActiveRecord::Base.connection.select_value(q).to_i > 0
      end

      # Return true if the given key should be skipped. Returns false when no
      # <tt>:skip</tt> option was configured. Names are compared as symbols,
      # so string and symbol spellings of the same column are equivalent.
      def skip?(key)
        return false if skip.nil?
        name = key.to_sym
        case skip
        when Array
          skip.any? { |candidate| candidate.to_sym == name }
        else
          skip.to_sym == name
        end
      end

      # Return true if the row should be checked
      def should_check?
        @should_check ? true : false
      end

      # Process the row. Returns the row when no matching row is found in the
      # target table (or when the check is disabled), and nil when a matching
      # row already exists.
      def process(row)
        return row unless should_check?

        conn = ActiveRecord::Base.connection
        conditions = []
        row.each do |k, v|
          next unless columns.nil? || columns.include?(k.to_sym)
          next if skip?(k)
          # The value is compared as a quoted string literal, as before, but
          # is now escaped by the adapter so embedded quotes cannot break or
          # alter the statement. A nil value still compares against ''.
          conditions << "#{k} = #{conn.quote(v.to_s)}"
        end

        # With nothing to compare there is no meaningful lookup to run, and
        # "... WHERE " on its own is invalid SQL; pass the row through.
        return row if conditions.empty?

        q = "SELECT * FROM #{table} WHERE " + conditions.join(" AND ")
        result = conn.select_one(q)
        return row if result.nil?
        nil
      end
    end
  end
end
@@ -0,0 +1,35 @@
1
module ETL #:nodoc:
  module Processor #:nodoc:
    # Row processor that checks whether or not the row has already passed
    # through the ETL processor, using the key fields provided as the keys
    # to check.
    class CheckUniqueProcessor < ETL::Processor::RowProcessor

      # The keys to check
      attr_accessor :keys

      # Initialize the processor
      # Configuration options:
      # * <tt>:keys</tt>: An array of keys to check against
      def initialize(control, configuration)
        super
        @keys = configuration[:keys]
      end

      # A Hash of key combinations that have already been processed. Each
      # entry is keyed by an Array holding the string form of every key
      # field's value, in the order given by +keys+.
      def compound_key_constraints
        @compound_key_constraints ||= {}
      end

      # Process the row. This implementation will only return a row if its
      # key combination has not already been seen; otherwise it returns nil.
      def process(row)
        # An Array of the stringified values is used as the lookup key rather
        # than a '|'-joined String, so values that themselves contain '|'
        # cannot collide with a different combination of values.
        key = keys.collect { |k| row[k].to_s }
        return nil if compound_key_constraints[key]

        compound_key_constraints[key] = 1
        row
      end
    end
  end
end
@@ -1,8 +1,24 @@
1
- module ETL
2
- module Processor
3
- class CopyField < ETL::Processor::RowProcessor
1
+ module ETL #:nodoc:
2
+ module Processor #:nodoc:
3
+ # Row processor that will copy one field to another
4
+ #
5
+ # Configuration options:
6
+ # * <tt>:destination</tt>: The destination field
7
+ # * <tt>:dest</tt>: Alias for :destination
8
+ # * <tt>:source</tt>: The source field
9
+ class CopyFieldProcessor < ETL::Processor::RowProcessor
10
+ # Process the given row
4
11
  def process(row)
5
- row[configuration[:destination]] = row[configuration[:source]].dup
12
+ destination = (configuration[:destination] || configuration[:dest])
13
+ source_value = row[configuration[:source]]
14
+ case source_value
15
+ when Numeric
16
+ row[destination] = source_value
17
+ when nil
18
+ row[destination] = nil
19
+ else
20
+ row[destination] = source_value.dup
21
+ end
6
22
  row
7
23
  end
8
24
  end
@@ -7,12 +7,15 @@ module ETL #:nodoc:
7
7
  @configuration = configuration
8
8
  end
9
9
  protected
10
+ # Get the control object
10
11
  def control
11
12
  @control
12
13
  end
14
+ # Get the configuration Hash
13
15
  def configuration
14
16
  @configuration
15
17
  end
18
+ # Get the engine logger
16
19
  def log
17
20
  Engine.logger
18
21
  end