activewarehouse-etl 0.6.1 → 0.7.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (52) hide show
  1. data/CHANGELOG +29 -1
  2. data/LICENSE +7 -0
  3. data/README +58 -12
  4. data/Rakefile +2 -1
  5. data/lib/etl.rb +3 -0
  6. data/lib/etl/commands/etl.rb +35 -1
  7. data/lib/etl/control/control.rb +20 -9
  8. data/lib/etl/control/destination.rb +173 -12
  9. data/lib/etl/control/destination/database_destination.rb +2 -2
  10. data/lib/etl/control/destination/file_destination.rb +25 -2
  11. data/lib/etl/control/source.rb +29 -8
  12. data/lib/etl/control/source/database_source.rb +109 -24
  13. data/lib/etl/control/source/file_source.rb +29 -16
  14. data/lib/etl/engine.rb +164 -63
  15. data/lib/etl/execution.rb +19 -0
  16. data/lib/etl/execution/base.rb +9 -0
  17. data/lib/etl/execution/job.rb +7 -0
  18. data/lib/etl/execution/migration.rb +54 -0
  19. data/lib/etl/execution/record.rb +8 -0
  20. data/lib/etl/generator/surrogate_key_generator.rb +2 -0
  21. data/lib/etl/parser.rb +9 -0
  22. data/lib/etl/parser/parser.rb +5 -2
  23. data/lib/etl/parser/sax_parser.rb +22 -6
  24. data/lib/etl/processor.rb +8 -0
  25. data/lib/etl/processor/bulk_import_processor.rb +32 -4
  26. data/lib/etl/processor/check_exist_processor.rb +69 -0
  27. data/lib/etl/processor/check_unique_processor.rb +35 -0
  28. data/lib/etl/processor/copy_field_processor.rb +20 -4
  29. data/lib/etl/processor/processor.rb +3 -0
  30. data/lib/etl/processor/rename_processor.rb +24 -0
  31. data/lib/etl/processor/row_processor.rb +1 -1
  32. data/lib/etl/processor/sequence_processor.rb +23 -0
  33. data/lib/etl/processor/surrogate_key_processor.rb +31 -0
  34. data/lib/etl/processor/truncate_processor.rb +0 -2
  35. data/lib/etl/row.rb +17 -0
  36. data/lib/etl/screen/row_count_screen.rb +15 -0
  37. data/lib/etl/transform/block_transform.rb +13 -0
  38. data/lib/etl/transform/date_to_string_transform.rb +1 -1
  39. data/lib/etl/transform/decode_transform.rb +1 -1
  40. data/lib/etl/transform/default_transform.rb +6 -1
  41. data/lib/etl/transform/foreign_key_lookup_transform.rb +1 -1
  42. data/lib/etl/transform/hierarchy_lookup_transform.rb +1 -1
  43. data/lib/etl/transform/ordinalize_transform.rb +12 -0
  44. data/lib/etl/transform/sha1_transform.rb +0 -3
  45. data/lib/etl/transform/string_to_date_transform.rb +0 -3
  46. data/lib/etl/transform/string_to_datetime_transform.rb +0 -3
  47. data/lib/etl/transform/string_to_time_transform.rb +0 -3
  48. data/lib/etl/transform/transform.rb +20 -11
  49. data/lib/etl/transform/trim_transform.rb +26 -0
  50. data/lib/etl/transform/type_transform.rb +9 -1
  51. data/lib/etl/version.rb +2 -2
  52. metadata +21 -3
@@ -0,0 +1,19 @@
1
+ module ETL #:nodoc:
2
+ # Classes which store information about ETL execution
3
+ module Execution
4
+ # Execution management
5
+ class Execution
6
+ class << self
7
+ # Migrate the data store
8
+ def migrate
9
+ ETL::Execution::Migration.migrate
10
+ end
11
+ end
12
+ end
13
+ end
14
+ end
15
+
16
+ require 'etl/execution/base'
17
+ require 'etl/execution/job'
18
+ require 'etl/execution/record'
19
+ require 'etl/execution/migration'
@@ -0,0 +1,9 @@
1
+ module ETL #:nodoc:
2
+ module Execution #:nodoc:
3
+ # Base class for ETL execution information
4
+ class Base < ActiveRecord::Base
5
+ self.abstract_class = true
6
+ establish_connection :etl_execution
7
+ end
8
+ end
9
+ end
@@ -0,0 +1,7 @@
1
+ module ETL #:nodoc:
2
+ module Execution #:nodoc:
3
+ # Persistent class representing an ETL job
4
+ class Job < Base
5
+ end
6
+ end
7
+ end
@@ -0,0 +1,54 @@
1
+ module ETL #:nodoc:
2
+ module Execution #:nodoc:
3
+ # Handles migration of tables required for persistent storage of meta data
4
+ # for the ETL engine
5
+ class Migration
6
+ class << self
7
+ # Execute the migrations
8
+ def migrate
9
+ connection.initialize_schema_information
10
+ v = connection.select_value("SELECT version FROM #{schema_info_table_name}").to_i
11
+ v.upto(target - 1) { |i| __send__("migration_#{i+1}".to_sym) }
12
+ end
13
+ protected
14
+ # Get the schema info table name
15
+ def schema_info_table_name
16
+ ETL::Execution::Base.table_name_prefix + "schema_info" +
17
+ ETL::Execution::Base.table_name_suffix
18
+ end
19
+
20
+ # Get the connection to use during migration
21
+ def connection
22
+ @connection ||= ETL::Execution::Base.connection
23
+ end
24
+
25
+ # Get the final target version number
26
+ def target
27
+ 1
28
+ end
29
+
30
+ private
31
+ def migration_1 #:nodoc:
32
+ connection.create_table :jobs do |t|
33
+ t.column :control_file, :string, :null => false
34
+ t.column :created_at, :datetime, :null => false
35
+ t.column :completed_at, :datetime
36
+ t.column :status, :string
37
+ end
38
+ connection.create_table :records do |t|
39
+ t.column :control_file, :string, :null => false
40
+ t.column :natural_key, :string, :null => false
41
+ t.column :crc, :string, :null => false
42
+ t.column :job_id, :integer, :null => false
43
+ end
44
+ update_schema_info(1)
45
+ end
46
+
47
+ # Update the schema info table, setting the version value
48
+ def update_schema_info(version)
49
+ connection.update("UPDATE #{schema_info_table_name} SET version = #{version}")
50
+ end
51
+ end
52
+ end
53
+ end
54
+ end
@@ -0,0 +1,8 @@
1
+ module ETL #:nodoc:
2
+ module Execution #:nodoc:
3
+ # Represents a single record
4
+ class Record < ETL::Execution::Base
5
+ belongs_to :table
6
+ end
7
+ end
8
+ end
@@ -1,3 +1,5 @@
1
+ # This source file contains code for a basic sequential surrogate key generator
2
+
1
3
  module ETL #:nodoc:
2
4
  module Generator #:nodoc:
3
5
  # Surrogate key generator.
data/lib/etl/parser.rb CHANGED
@@ -1,2 +1,11 @@
1
+ # This source file contains the ETL::Parser module and requires all of the files
2
+ # in the parser directory ending with .rb
3
+
4
+ module ETL #:nodoc:
5
+ # The ETL::Parser module provides various text parsers.
6
+ module Parser
7
+ end
8
+ end
9
+
1
10
  require 'etl/parser/parser'
2
11
  Dir[File.dirname(__FILE__) + "/parser/*.rb"].each { |file| require(file) }
@@ -1,5 +1,8 @@
1
- module ETL
2
- module Parser
1
+ module ETL #:nodoc:
2
+ module Parser #:nodoc:
3
+ # Base parser class. Implementation classes must extend this class and implement
4
+ # the each method. The each method should return each row of the source data as
5
+ # a Hash.
3
6
  class Parser
4
7
  include Enumerable
5
8
  class << self
@@ -29,6 +29,7 @@ module ETL #:nodoc:
29
29
  end
30
30
  end
31
31
 
32
+ # Get an array of Field objects
32
33
  def fields
33
34
  @fields ||= []
34
35
  end
@@ -44,16 +45,21 @@ module ETL #:nodoc:
44
45
  end
45
46
  end
46
47
 
48
+ # Class representing a field to be loaded from the source
47
49
  class Field
48
- attr_reader :name, :path
49
- def initialize(name, path)
50
+ # The name of the field
51
+ attr_reader :name
52
+ # The XPath-like path to the field in the XML document
53
+ attr_reader :path
54
+
55
+ def initialize(name, path) #:nodoc:
50
56
  @name = name
51
57
  @path = path
52
58
  end
53
59
  end
54
60
  end
55
61
 
56
- class Listener
62
+ class Listener #:nodoc:
57
63
  include REXML::SAX2Listener
58
64
  def initialize(parser, &block)
59
65
  @parser = parser
@@ -120,19 +126,28 @@ module ETL #:nodoc:
120
126
  end
121
127
  end
122
128
 
123
- module XPath
124
- class Path
129
+ # Module which contains classes that are used for XPath-like filtering
130
+ # on the SAX parser
131
+ module XPath #:nodoc:
132
+ class Path #:nodoc:
133
+ # Get the elements in the path
125
134
  attr_accessor :elements
135
+
136
+ # Initialize
126
137
  def initialize
127
138
  @elements = []
128
139
  end
140
+
141
+ # Convert to a string representation
129
142
  def to_s
130
143
  @elements.map{ |e| e.to_s }.join("/")
131
144
  end
145
+
132
146
  # Returns true if the last part of the path refers to an attribute
133
147
  def is_attribute?
134
148
  elements.last.attributes.length > 0
135
149
  end
150
+
136
151
  # Return the name of the attribute referenced by the last element in this path. Returns nil if the last element
137
152
  # does not reference an attribute.
138
153
  #
@@ -142,6 +157,7 @@ module ETL #:nodoc:
142
157
  return nil unless is_attribute?
143
158
  elements.last.attributes.keys.first
144
159
  end
160
+
145
161
  # Return true if this XPath::Path matches the given path string. This is a fail-fast match, so the first mismatch
146
162
  # will cause the method to return false.
147
163
  def match?(s)
@@ -178,7 +194,7 @@ module ETL #:nodoc:
178
194
  path
179
195
  end
180
196
  end
181
- class Element
197
+ class Element #:nodoc:
182
198
  attr_reader :name
183
199
  attr_reader :attributes
184
200
  def initialize(name, attributes={})
data/lib/etl/processor.rb CHANGED
@@ -1,3 +1,11 @@
1
+ # This source file contains the ETL::Processor module and requires all of the processors
2
+
3
+ module ETL #:nodoc:
4
+ # The ETL::Processor module contains row-level and bulk processors
5
+ module Processor
6
+ end
7
+ end
8
+
1
9
  require 'etl/processor/processor'
2
10
  require 'etl/processor/row_processor'
3
11
  Dir[File.dirname(__FILE__) + "/processor/*.rb"].each { |file| require(file) }
@@ -1,11 +1,35 @@
1
1
  module ETL #:nodoc:
2
2
  module Processor #:nodoc:
3
- # Processor which is used to bulk import data into a target database
3
+ # Processor which is used to bulk import data into a target database. The
4
+ # underlying database driver from ActiveRecord must support the
5
+ # +bulk_load+ method.
4
6
  class BulkImportProcessor < ETL::Processor::Processor
5
- attr_reader :file, :target, :truncate, :columns
7
+ # The file to load from
8
+ attr_reader :file
9
+ # The target database information (see +initialize+)
10
+ attr_reader :target
11
+ # Set to true to truncate
12
+ attr_reader :truncate
13
+ # Array of symbols representing the column load order
14
+ attr_reader :columns
15
+ # The field separator (defaults to a comma)
6
16
  attr_accessor :field_separator
17
+ # The field enclosure (defaults to nil)
7
18
  attr_accessor :field_enclosure
19
+ # The line separator (defaults to a newline)
8
20
  attr_accessor :line_separator
21
+
22
+ # Initialize the processor.
23
+ #
24
+ # Configuration options:
25
+ # * <tt>:file</tt>: The file to load data from
26
+ # * <tt>:target</tt>: The target connection information
27
+ # * <tt>:truncate</tt>: Set to true to truncate before loading
28
+ # * <tt>:columns</tt>: The columns to load in the order they appear in
29
+ # the bulk data file
30
+ # * <tt>:field_separator</tt>: The field separator. Defaults to a comma
31
+ # * <tt>:line_separator</tt>: The line separator. Defaults to a newline
32
+ # * <tt>:field_enclosure</tt>: The field enclosure characters
9
33
  def initialize(control, configuration)
10
34
  super
11
35
  @file = File.join(File.dirname(control.file), configuration[:file])
@@ -13,12 +37,15 @@ module ETL #:nodoc:
13
37
  @truncate = configuration[:truncate] ||= false
14
38
  @columns = configuration[:columns]
15
39
  @field_separator = (configuration[:field_separator] || ',')
16
- @line_separator = configuration[:line_separator]
40
+ @line_separator = (configuration[:line_separator] || "\n")
17
41
  @field_enclosure = configuration[:field_enclosure]
18
42
  connect
19
43
  end
44
+
45
+ # Execute the processor
20
46
  def process
21
- # columns = control.destinations.first.order.join(',') # TODO: support multiple destinations?
47
+ return if ETL::Engine.skip_bulk_import
48
+
22
49
  conn = ETL::ActiveRecord::Base.connection
23
50
  conn.transaction do
24
51
  # TODO: Support all database types
@@ -30,6 +57,7 @@ module ETL #:nodoc:
30
57
  options[:fields] = {}
31
58
  options[:fields][:delimited_by] = field_separator if field_separator
32
59
  options[:fields][:enclosed_by] = field_enclosure if field_enclosure
60
+ options[:fields][:terminated_by] = line_separator if line_separator
33
61
  end
34
62
  conn.bulk_load(file, target[:table], options)
35
63
  end
@@ -0,0 +1,69 @@
1
+ module ETL #:nodoc:
2
+ module Processor #:nodoc:
3
+ # A row-level processor that checks if the row already exists in the
4
+ # target table
5
+ class CheckExistProcessor < ETL::Processor::RowProcessor
6
+ # A symbol or array of symbols representing keys that should be skipped
7
+ attr_accessor :skip
8
+
9
+ # The name of the table to check against
10
+ attr_accessor :table
11
+
12
+ # An array of columns representing the natural key
13
+ attr_accessor :columns
14
+
15
+ # Is set to true if the processor should execute the check. If there are
16
+ # no rows in the target table then this should return false.
17
+ attr_accessor :should_check
18
+
19
+ # Initialize the processor
20
+ # Configuration options:
21
+ # * <tt>:skip</tt>: A symbol or array of column names that should not
22
+ # be checked
23
+ # * <tt>:table</tt>: The table name
24
+ # * <tt>:columns</tt>: An array of columns which represent the natural
25
+ # key
26
+ def initialize(control, configuration)
27
+ super
28
+ @skip = configuration[:skip]
29
+ @table = configuration[:table]
30
+ @columns = configuration[:columns]
31
+
32
+ q = "SELECT COUNT(*) FROM #{table}"
33
+ @should_check = ActiveRecord::Base.connection.select_value(q).to_i > 0
34
+ end
35
+
36
+ # Return true if the given key should be skipped
37
+ def skip?(key)
38
+ case skip
39
+ when Array
40
+ skip.include?(key)
41
+ else
42
+ skip.to_sym == key.to_sym
43
+ end
44
+ end
45
+
46
+ # Return true if the row should be checked
47
+ def should_check?
48
+ @should_check ? true : false
49
+ end
50
+
51
+ # Process the row
52
+ def process(row)
53
+ return row unless should_check?
54
+ q = "SELECT * FROM #{table} WHERE "
55
+ conditions = []
56
+ row.each do |k,v|
57
+ if columns.nil? || columns.include?(k.to_sym)
58
+ conditions << "#{k} = '#{v}'" unless skip?(k)
59
+ end
60
+ end
61
+ q << conditions.join(" AND ")
62
+
63
+ #puts "query: #{q}"
64
+ result = ActiveRecord::Base.connection.select_one(q)
65
+ return row if result.nil?
66
+ end
67
+ end
68
+ end
69
+ end
@@ -0,0 +1,35 @@
1
+ module ETL #:nodoc:
2
+ module Processor #:nodoc:
3
+ # Row processor that checks whether or not the row has already passed
4
+ # through the ETL processor, using the key fields provided as the keys
5
+ # to check.
6
+ class CheckUniqueProcessor < ETL::Processor::RowProcessor
7
+
8
+ # The keys to check
9
+ attr_accessor :keys
10
+
11
+ # Initialize the processor
12
+ # Configuration options:
13
+ # * <tt>:keys</tt>: An array of keys to check against
14
+ def initialize(control, configuration)
15
+ super
16
+ @keys = configuration[:keys]
17
+ end
18
+
19
+ # A Hash of keys that have already been processed.
20
+ def compound_key_constraints
21
+ @compound_key_constraints ||= {}
22
+ end
23
+
24
+ # Process the row. This implementation will only return a row if
25
+ # its key combination has not already been seen.
26
+ def process(row)
27
+ key = (keys.collect { |k| row[k] }).join('|')
28
+ unless compound_key_constraints[key]
29
+ compound_key_constraints[key] = 1
30
+ return row
31
+ end
32
+ end
33
+ end
34
+ end
35
+ end
@@ -1,8 +1,24 @@
1
- module ETL
2
- module Processor
3
- class CopyField < ETL::Processor::RowProcessor
1
+ module ETL #:nodoc:
2
+ module Processor #:nodoc:
3
+ # Row processor that will copy one field to another
4
+ #
5
+ # Configuration options:
6
+ # * <tt>:destination</tt>: The destination field
7
+ # * <tt>:dest</tt>: Alias for :destination
8
+ # * <tt>:source</tt>: The source field
9
+ class CopyFieldProcessor < ETL::Processor::RowProcessor
10
+ # Process the given row
4
11
  def process(row)
5
- row[configuration[:destination]] = row[configuration[:source]].dup
12
+ destination = (configuration[:destination] || configuration[:dest])
13
+ source_value = row[configuration[:source]]
14
+ case source_value
15
+ when Numeric
16
+ row[destination] = source_value
17
+ when nil
18
+ row[destination] = nil
19
+ else
20
+ row[destination] = source_value.dup
21
+ end
6
22
  row
7
23
  end
8
24
  end
@@ -7,12 +7,15 @@ module ETL #:nodoc:
7
7
  @configuration = configuration
8
8
  end
9
9
  protected
10
+ # Get the control object
10
11
  def control
11
12
  @control
12
13
  end
14
+ # Get the configuration Hash
13
15
  def configuration
14
16
  @configuration
15
17
  end
18
+ # Get the engine logger
16
19
  def log
17
20
  Engine.logger
18
21
  end