activewarehouse-etl 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/CHANGELOG ADDED
@@ -0,0 +1,2 @@
1
+ 0.1.0 - Dec 6, 2006
2
+ * Initial release
data/README ADDED
@@ -0,0 +1,27 @@
1
+ Ruby ETL parser.
2
+
3
+ == Features
4
+
5
+ Current supported features:
6
+
7
+ * ETL Domain Specific Language (DSL) - Control files are specified in a Ruby-based DSL
8
+ * Multiple source types - fixed-width and delimited text files currently supported
9
+ * Multiple destination types - file and database destinations
10
+ * Support for extracting from multiple sources
11
+ * Support for loading to multiple destinations
12
+ * Extensible transformations - comes with a built-in SHA1 one-way hash transform as an example
13
+ * Pre/post processing - export to files and then post process with the bulk import processor for large amounts of data
14
+ * Virtual fields - Add a field to the destination data which doesn't exist in the source data
15
+
16
+
17
+ == Requirements
18
+
19
+ * ActiveSupport Gem
20
+ * ActiveRecord Gem
21
+ * FasterCSV Gem
22
+
23
+ == Examples
24
+ Examples can be found in the test directory.
25
+
26
+ == Feedback
27
+ This is a work in progress. For now, please send comments to the activewarehouse-discuss mailing list.
data/Rakefile ADDED
@@ -0,0 +1,117 @@
1
+ require 'rake'
2
+ require 'rake/testtask'
3
+ require 'rake/rdoctask'
4
+ require 'rake/packagetask'
5
+ require 'rake/gempackagetask'
6
+ require 'rake/contrib/rubyforgepublisher'
7
+
8
+ require File.join(File.dirname(__FILE__), 'lib/etl', 'version')
9
+
10
# Optional build suffix taken from the PKG_BUILD environment variable and
# appended (after a dot) to the library version.
PKG_BUILD = ENV['PKG_BUILD'] ? '.' + ENV['PKG_BUILD'] : ''
PKG_NAME = 'activewarehouse-etl'
PKG_VERSION = ETL::VERSION::STRING + PKG_BUILD
PKG_FILE_NAME = "#{PKG_NAME}-#{PKG_VERSION}"
PKG_DESTINATION = ENV["PKG_DESTINATION"] || "../#{PKG_NAME}"

RELEASE_NAME = "REL #{PKG_VERSION}"

RUBY_FORGE_PROJECT = "activewarehouse"
RUBY_FORGE_USER = "aeden"

desc 'Default: run unit tests.'
task :default => :test

desc 'Test the ETL application.'
Rake::TestTask.new(:test) do |t|
  t.libs << 'lib'
  t.pattern = 'test/**/*_test.rb'
  t.verbose = true
  # TODO: reset the database
end

desc 'Generate documentation for the ETL application.'
Rake::RDocTask.new(:rdoc) do |rdoc|
  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = 'ActiveWarehouse ETL'
  rdoc.options << '--line-numbers' << '--inline-source'
  rdoc.rdoc_files.include('README')
  rdoc.rdoc_files.include('lib/**/*.rb')
end

# Files that are packaged into the gem.
PKG_FILES = FileList[
  'CHANGELOG',
  'README',
  'Rakefile',
  'bin/**/*',
  'doc/**/*',
  'lib/**/*',
] - [ 'test' ]

spec = Gem::Specification.new do |s|
  s.name = PKG_NAME
  s.version = PKG_VERSION
  s.summary = "Pure Ruby ETL package."
  s.description = <<-EOF
    ActiveWarehouse ETL is a pure Ruby Extract-Transform-Load application for loading data into a database.
  EOF

  s.add_dependency('rake', '>= 0.7.1')
  s.add_dependency('activesupport', '>= 1.3.1.5618')
  s.add_dependency('activerecord', '>= 1.14.4.5618')
  s.add_dependency('fastercsv', '>= 1.0.0')

  s.rdoc_options << '--exclude' << '.'
  s.has_rdoc = false

  # Never ship Subversion bookkeeping files.
  s.files = PKG_FILES.to_a.delete_if { |f| f.include?('.svn') }
  s.require_path = 'lib'

  s.bindir = "bin" # Use these for applications.
  s.executables = ['etl']
  s.default_executable = "etl"

  s.author = "Anthony Eden"
  s.email = "anthonyeden@gmail.com"
  s.homepage = "http://activewarehouse.rubyforge.org/etl"
  s.rubyforge_project = RUBY_FORGE_PROJECT
end

Rake::GemPackageTask.new(spec) do |pkg|
  pkg.gem_spec = spec
end

desc "Generate code statistics"
task :lines do
  total_lines, total_codelines = 0, 0

  FileList["lib/**/*.rb"].each do |file_name|
    next if file_name =~ /vendor/

    lines, codelines = 0, 0
    # The block form closes each file as soon as it has been counted.
    File.open(file_name) do |f|
      while line = f.gets
        lines += 1
        next if line =~ /^\s*$/  # blank line
        next if line =~ /^\s*#/  # comment-only line
        codelines += 1
      end
    end
    puts "L: #{sprintf("%4d", lines)}, LOC #{sprintf("%4d", codelines)} | #{file_name}"

    total_lines += lines
    total_codelines += codelines
  end

  puts "Total: Lines #{total_lines}, LOC #{total_codelines}"
end

desc "Publish the release files to RubyForge."
task :release => [ :package ] do
  `rubyforge login`

  %w( gem tgz zip ).each do |ext|
    release_command = "rubyforge add_release #{RUBY_FORGE_PROJECT} #{PKG_NAME} '#{RELEASE_NAME}' pkg/#{PKG_FILE_NAME}.#{ext}"
    puts release_command
    system(release_command)
  end
end
data/bin/etl ADDED
@@ -0,0 +1,26 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ #--
4
+ # Copyright (c) 2006 Anthony Eden
5
+ #
6
+ # Permission is hereby granted, free of charge, to any person obtaining
7
+ # a copy of this software and associated documentation files (the
8
+ # "Software"), to deal in the Software without restriction, including
9
+ # without limitation the rights to use, copy, modify, merge, publish,
10
+ # distribute, sublicense, and/or sell copies of the Software, and to
11
+ # permit persons to whom the Software is furnished to do so, subject to
12
+ # the following conditions:
13
+ #
14
+ # The above copyright notice and this permission notice shall be
15
+ # included in all copies or substantial portions of the Software.
16
+ #
17
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
18
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
19
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
20
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
21
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
22
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
23
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
24
+ #++
25
+
26
+ require File.dirname(__FILE__) + "/../lib/etl/commands/etl"
data/lib/etl.rb ADDED
@@ -0,0 +1,58 @@
1
+ #--
2
+ # Copyright (c) 2006 Anthony Eden
3
+ #
4
+ # Permission is hereby granted, free of charge, to any person obtaining
5
+ # a copy of this software and associated documentation files (the
6
+ # "Software"), to deal in the Software without restriction, including
7
+ # without limitation the rights to use, copy, modify, merge, publish,
8
+ # distribute, sublicense, and/or sell copies of the Software, and to
9
+ # permit persons to whom the Software is furnished to do so, subject to
10
+ # the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be
13
+ # included in all copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
19
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
20
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
21
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22
+ #++
23
+
24
+ unless defined?(Logger)
25
+ require 'logger'
26
+ end
27
+
28
+ require 'rubygems'
29
+ unless defined?(ActiveSupport)
30
+ require_gem 'activesupport'
31
+ require 'active_support'
32
+ end
33
+
34
+ unless defined?(ActiveRecord)
35
+ require_gem 'activerecord'
36
+ require 'active_record'
37
+ end
38
+
39
+ require_gem 'fastercsv'
40
+ require 'faster_csv'
41
+
42
+ $:.unshift(File.dirname(__FILE__))
43
+
44
+ require 'etl/version'
45
+ require 'etl/engine'
46
+ require 'etl/control'
47
+ require 'etl/parser'
48
+ require 'etl/transform'
49
+ require 'etl/processor'
50
+
51
module ETL #:nodoc:
  # Root of the ETL exception hierarchy.
  class ETLError < StandardError #:nodoc:
  end

  # Error in a control file or in how a control is used.
  class ControlError < ETLError #:nodoc:
  end

  # Error in a source field definition inside a control file.
  class DefinitionError < ControlError #:nodoc:
  end
end
@@ -0,0 +1,45 @@
1
+ #--
2
+ # Copyright (c) 2006 Anthony Eden
3
+ #
4
+ # Permission is hereby granted, free of charge, to any person obtaining
5
+ # a copy of this software and associated documentation files (the
6
+ # "Software"), to deal in the Software without restriction, including
7
+ # without limitation the rights to use, copy, modify, merge, publish,
8
+ # distribute, sublicense, and/or sell copies of the Software, and to
9
+ # permit persons to whom the Software is furnished to do so, subject to
10
+ # the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be
13
+ # included in all copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
19
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
20
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
21
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22
+ #++
23
+
24
+ require 'benchmark'
25
+ require File.dirname(__FILE__) + '/../../etl'
26
+
27
# Print a usage statement describing the expected command-line arguments.
def usage #:nodoc:
  puts "Usage: etl ctl_file [ctl_file2 ctl_file3 ...]"
end

# Command-line driver: every argument is treated as a control file and is
# handed to ETL::Engine in the order given. The elapsed wall-clock time for
# the whole batch is reported at the end.
if ARGV.empty?
  usage
else
  puts "Starting ETL process"

  elapsed = Benchmark.realtime do
    ARGV.each do |control_file|
      puts "Processing #{control_file}"
      ETL::Engine.process(control_file)
    end
  end

  puts "ETL process complete in #{sprintf('%.3f', elapsed)} seconds"
end
@@ -0,0 +1,3 @@
1
+ require 'etl/control/control'
2
+ require 'etl/control/source'
3
+ require 'etl/control/destination'
@@ -0,0 +1,134 @@
1
module ETL #:nodoc:
  module Control #:nodoc:
    # Object representation of a control file.
    #
    # A control file is Ruby code that is evaluated with a Control instance
    # as +self+, so the statements in it (+source+, +destination+,
    # +transform+, +pre_process+, +post_process+) are calls to the methods
    # defined here.
    class Control
      # The path of the control file this object was created for
      attr_reader :file

      class << self
        # Parse a control file and return a validated Control instance.
        # * <tt>control_file</tt>: A path String or an open File
        #
        # Raises ControlError if the control file defines no source or no
        # destination. Errors raised by the control file itself propagate
        # with the control file's own path and line number in the backtrace.
        def parse(control_file)
          control_file = control_file.path if control_file.is_a?(File)
          control = ETL::Control::Control.new(control_file)
          # The control file is executed as Ruby code, so it must come from a
          # trusted location. Passing the path to eval makes syntax and
          # runtime errors point at the right line of the control file.
          eval(File.read(control_file), control.get_binding, control_file)
          control.validate
          control
        end

        # Convert the argument to a Control instance.
        # * <tt>control</tt>: A path String, a File or a Control object
        #
        # Raises ControlError for any other kind of argument.
        def resolve(control)
          case control
          when String, File
            # parse only needs the path, so no File handle is opened here
            ETL::Control::Control.parse(control)
          when ETL::Control::Control
            control
          else
            raise ControlError, "Control must be a String, File or Control object"
          end
        end
      end

      # Initialize the control for the given control file path.
      def initialize(file)
        @file = file
      end

      # Define a source. A source object is created for every supported
      # source type (see +source_types+) that has a key in +configuration+.
      def source(name, configuration={}, definition={})
        source_types.each do |source_type|
          next unless configuration[source_type]
          source_class = ETL::Control::Source.class_for_name(source_type)
          sources << source_class.new(self, configuration, definition)
        end
      end

      # Get the defined sources
      def sources
        @sources ||= []
      end

      # Define a destination. A destination object is created for every
      # supported destination type (see +destination_types+) that has a key
      # in +configuration+.
      def destination(name, configuration={}, mapping={})
        destination_types.each do |dest_type|
          next unless configuration[dest_type]
          dest_class = ETL::Control::Destination.class_for_name(dest_type)
          destinations << dest_class.new(self, configuration, mapping)
        end
      end

      # Get the defined destinations
      def destinations
        @destinations ||= []
      end

      # Define a transform for the named field. Either name a transformer
      # (<tt>:sha1</tt> resolves to ETL::Transform::Sha1Transform) or pass a
      # block. Transforms for the same field accumulate in definition order.
      #
      # Raises ControlError if neither a transformer nor a block is given.
      def transform(name, transformer=nil, configuration={}, &block)
        transforms[name] ||= []
        if transformer
          transform_class = ETL::Transform.const_get("#{transformer.to_s.classify}Transform")
          transforms[name] << transform_class.new(self, configuration)
        elsif block_given?
          transforms[name] << block
        else
          raise ControlError, "Either a transformer or a block must be specified"
        end
      end

      # Get the list of transforms for the named field (empty if none)
      def get_transform(name)
        transforms[name] ||= []
      end

      # Define a processor to run before the sources are read. The name is
      # resolved to a class in ETL::Processor (<tt>:bulk_import</tt> resolves
      # to BulkImportProcessor).
      def pre_process(name, configuration={})
        processor_class = ETL::Processor.const_get("#{name.to_s.classify}Processor")
        pre_processors << processor_class.new(self, configuration)
      end

      # Get the defined pre-processors
      def pre_processors
        @pre_processors ||= []
      end

      # Define a processor to run after all rows have been written. The name
      # is resolved the same way as in +pre_process+.
      def post_process(name, configuration={})
        processor_class = ETL::Processor.const_get("#{name.to_s.classify}Processor")
        post_processors << processor_class.new(self, configuration)
      end

      # Get the defined post-processors
      def post_processors
        @post_processors ||= []
      end

      # Get the binding in which the control file is evaluated
      def get_binding
        binding
      end

      # Get a map of all transforms for this control, keyed by field name
      def transforms
        @transforms ||= {}
      end

      # Validate the control file. Raises ControlError unless at least one
      # source and at least one destination have been defined.
      def validate
        unless sources.length > 0
          raise ControlError, "Configuration must include one of the following for the source: #{source_types.join(',')}"
        end
        unless destinations.length > 0
          raise ControlError, "Configuration must include one of the following for the destination: #{destination_types.join(',')}"
        end
      end

      protected
      # Get an array of supported source types
      def source_types
        [:file, :database]
      end

      # Get an array of supported destination types
      def destination_types
        [:file, :database]
      end

    end
  end
end
@@ -0,0 +1,62 @@
1
module ETL #:nodoc:
  module Control #:nodoc:
    # Base class for destinations. Rows passed to +write+ are collected in a
    # buffer which is handed to +flush+ once it holds +buffer_size+ rows.
    # Subclasses must implement +flush+ and +close+.
    class Destination
      attr_reader :control, :configuration, :mapping
      attr_accessor :buffer_size
      attr_writer :current_row

      class << self
        # Convert the name (string or symbol) to a destination class.
        #
        # Example:
        # <tt>class_for_name(:file)</tt> returns the FileDestination class
        def class_for_name(name)
          ETL::Control.const_get("#{name.to_s.classify}Destination")
        end
      end

      # Initialize the destination
      # * <tt>control</tt>: The Control object
      # * <tt>configuration</tt>: The configuration hash. <tt>:buffer_size</tt>
      #   defaults to 1000 and the default is stored back into the hash.
      # * <tt>mapping</tt>: The output mapping
      def initialize(control, configuration, mapping)
        @control = control
        @configuration = configuration
        @mapping = mapping
        @buffer_size = configuration[:buffer_size] ||= 1000
      end

      # Get the current row number, starting at 1
      def current_row
        @current_row ||= 1
      end

      # Buffer the row and flush once the buffer holds buffer_size rows
      def write(row)
        buffer << row
        flush if buffer.length >= buffer_size
      end

      # Abstract method
      def flush
        raise NotImplementedError, "flush method must be implemented by subclasses"
      end

      # Abstract method
      def close
        raise NotImplementedError, "close method must be implemented by subclasses"
      end

      protected
      # Get the buffer of rows which have not been flushed yet
      def buffer
        @buffer ||= []
      end

      # Get the order of elements from the definition of the first source.
      #
      # An Array definition contributes each symbol, or the <tt>:name</tt> of
      # each hash entry. A Hash definition (as used by fixed-width sources,
      # whose rows are keyed by the definition's keys) contributes its keys.
      def order_from_source
        definition = control.sources.first.definition
        return definition.keys if definition.is_a?(Hash)

        definition.map do |item|
          case item
          when Hash
            item[:name]
          else
            item
          end
        end
      end
    end
  end
end
61
+
62
+ Dir[File.dirname(__FILE__) + "/destination/*.rb"].each { |file| require(file) }
@@ -0,0 +1,47 @@
1
module ETL
  module Control
    # Destination which inserts rows into a database table through the
    # ActiveRecord::Base connection.
    #
    # Configuration keys read here: <tt>:table</tt>, <tt>:adapter</tt>
    # (default :mysql), <tt>:username</tt> (default 'root'), <tt>:host</tt>
    # (default 'localhost'), <tt>:password</tt> and <tt>:database</tt>.
    class DatabaseDestination < Destination
      # The ordered list of field names written as columns
      attr_reader :order

      # Initialize the destination and connect to the database.
      # * <tt>control</tt>: The Control object
      # * <tt>configuration</tt>: The configuration hash
      # * <tt>mapping</tt>: The output mapping; <tt>:order</tt> overrides the
      #   field order taken from the first source
      def initialize(control, configuration, mapping)
        super
        @order = mapping[:order] || order_from_source
        raise ControlError, "Order required in mapping" unless @order
        connect
      end

      # Insert all buffered rows inside a single transaction, then empty the
      # buffer.
      def flush
        conn = ActiveRecord::Base.connection
        conn.transaction do
          buffer.each do |row|
            # Let the adapter quote every value so that quotes and other
            # special characters in the data cannot break the statement.
            values = order.collect { |name| conn.quote(row[name]) }
            q = "INSERT INTO #{configuration[:table]} (#{order.join(',')}) VALUES (#{values.join(',')})"
            ETL::Engine.logger.debug("Query: #{q}")
            conn.execute(q, "Insert row #{current_row}")
            self.current_row = current_row + 1
          end
          buffer.clear
        end
      end

      # Flush the remaining rows and disconnect from the database
      def close
        flush
        ActiveRecord::Base.connection.disconnect!
      end

      private
      # Connect ActiveRecord::Base to the database named in the configuration
      def connect
        ActiveRecord::Base.establish_connection(
          :adapter => (configuration[:adapter] || :mysql),
          :username => (configuration[:username] || 'root'),
          :host => (configuration[:host] || 'localhost'),
          :password => configuration[:password],
          :database => configuration[:database]
        )
      end
    end
  end
end
@@ -0,0 +1,63 @@
1
module ETL #:nodoc:
  module Control #:nodoc:
    # File as the final destination.
    class FileDestination < Destination
      attr_reader :file, :order
      attr_accessor :append, :separator, :eol, :enclose

      # Initialize the object.
      # * <tt>control</tt>: The Control object
      # * <tt>configuration</tt>: The configuration map. <tt>:file</tt> is
      #   resolved relative to the directory of the control file. Optional
      #   keys: <tt>:append</tt> (default false), <tt>:separator</tt>
      #   (default ','), <tt>:eol</tt> (default "\n") and <tt>:enclose</tt>
      #   (default nil, meaning values are not enclosed).
      # * <tt>mapping</tt>: The output mapping. <tt>:order</tt> overrides the
      #   field order taken from the first source; <tt>:virtual</tt> is a map
      #   of field names to constant values added to every row.
      def initialize(control, configuration, mapping)
        super
        @file = File.join(File.dirname(control.file), configuration[:file])
        @append = configuration[:append] ||= false
        @separator = configuration[:separator] ||= ','
        @eol = configuration[:eol] ||= "\n"
        @enclose = configuration[:enclose]

        @order = mapping[:order] || order_from_source
        raise ControlError, "Order required in mapping" unless @order
      end

      # Close the destination. This will flush the buffer and close the underlying stream or connection.
      def close
        flush
        f.close
      end

      # Write all buffered rows to the file, then empty the buffer.
      def flush
        buffer.each do |row|
          add_virtuals(row)
          values = order.collect { |name| row[name] }
          if enclose
            # Prefix every occurrence of the enclosure string inside a value
            # with a backslash. The pattern is a plain String, so enclosure
            # characters such as | or . are matched literally.
            values.collect! do |v|
              enclose + v.to_s.gsub(enclose) { |match| "\\" + match } + enclose
            end
          end
          f.write(values.join(separator))
          f.write(eol)
        end
        buffer.clear
      end

      private
      # Get the open file stream, opening it on first use
      def f
        # File.open instead of Kernel#open: the configured name is always a
        # file path and must never be interpreted as a command to run.
        @f ||= File.open(file, mode)
      end

      # Get the appropriate mode to open the file stream
      def mode
        append ? 'a' : 'w'
      end

      # Add the virtual fields from the mapping to the row
      def add_virtuals(row)
        if mapping[:virtual]
          mapping[:virtual].each do |key,value|
            row[key] = value
          end
        end
      end
    end
  end
end
@@ -0,0 +1,27 @@
1
module ETL #:nodoc:
  module Control #:nodoc:
    # ETL source. Subclasses must implement the <tt>each</tt> method, which
    # Enumerable builds on.
    class Source
      include Enumerable

      attr_accessor :control, :configuration, :definition

      # Convert the name (string or symbol) to a source class.
      #
      # Example:
      # <tt>class_for_name(:file)</tt> returns the FileSource class
      def self.class_for_name(name)
        class_name = "#{name.to_s.classify}Source"
        ETL::Control.const_get(class_name)
      end

      # Initialize the Source instance
      # * <tt>control</tt>: The control object
      # * <tt>configuration</tt>: The configuration hash
      # * <tt>definition</tt>: The source layout definition
      def initialize(control, configuration, definition)
        @control, @configuration, @definition = control, configuration, definition
      end
    end
  end
end
26
+
27
+ Dir[File.dirname(__FILE__) + "/source/*.rb"].each { |file| require(file) }
@@ -0,0 +1,30 @@
1
module ETL
  module Control
    # Source which reads all rows of a database table through the
    # ActiveRecord::Base connection.
    class DatabaseSource < Source
      # Initialize the source and connect to the database.
      # * <tt>control</tt>: The control object
      # * <tt>configuration</tt>: The configuration hash
      # * <tt>definition</tt>: The source layout definition
      def initialize(control, configuration, definition)
        super
        connect
      end

      # Returns each row from the source
      def each
        query = "SELECT * FROM #{configuration[:table]}"
        rows = ActiveRecord::Base.connection.select_all(query)
        rows.each { |row| yield row }
      end

      private
      # Set up the DB connection from the configuration, falling back to a
      # local MySQL server and the root user for missing settings.
      def connect
        settings = {
          :adapter => (configuration[:adapter] || :mysql),
          :username => (configuration[:username] || 'root'),
          :host => (configuration[:host] || 'localhost'),
          :password => configuration[:password],
          :database => configuration[:database]
        }
        ActiveRecord::Base.establish_connection(settings)
      end
    end
  end
end
@@ -0,0 +1,19 @@
1
module ETL
  module Control
    # Source which reads rows from files using the parser named by the
    # <tt>:parser</tt> configuration entry.
    class FileSource < Source
      # Initialize the source and create its parser.
      # * <tt>control</tt>: The control object
      # * <tt>configuration</tt>: The configuration hash
      # * <tt>definition</tt>: The source layout definition
      def initialize(control, configuration, definition)
        super
        configure
      end

      # Returns each row from the source
      def each
        @parser.each do |row|
          yield row
        end
      end

      private
      # Look up the parser class for the configured parser name and
      # instantiate it for this source
      def configure
        parser_class = ETL::Parser::Parser.class_for_name(@configuration[:parser])
        @parser = parser_class.new(self)
      end
    end
  end
end
data/lib/etl/engine.rb ADDED
@@ -0,0 +1,61 @@
1
module ETL
  # Runs an ETL process: pre-processors, then every row of every source is
  # transformed and written to every destination, then post-processors.
  class Engine

    class << self
      # Process a control file or object with a new Engine instance.
      def process(control_file)
        new.process(control_file)
      end

      # Replace the logger used by the ETL library
      attr_writer :logger

      # Get the logger. Unless one has been assigned, a DEBUG level logger
      # writing to etl.log in the working directory is created on first use.
      def logger
        unless @logger
          @logger = Logger.new('etl.log')
          @logger.level = Logger::DEBUG
        end
        @logger
      end
    end

    # Process a control file or object.
    # * <tt>control</tt>: Anything accepted by ETL::Control::Control.resolve
    def process(control)
      control = ETL::Control::Control.resolve(control)

      pre_process(control)

      sources = control.sources
      destinations = control.destinations

      sources.each do |source|
        source.each do |row|
          row.each do |name, value|
            # execute transforms
            row[name] = ETL::Transform::Transform.transform(name, value, control.get_transform(name))
          end
          # write the row to the destination
          destinations.each do |destination|
            destination.write(row)
          end
        end
      end

      # Destinations are shared by all sources, so they are closed only
      # after the last source has been read.
      destinations.each do |destination|
        destination.close
      end

      post_process(control)
    end

    private
    # Run the pre-processors of the control in definition order
    def pre_process(control)
      control.pre_processors.each do |processor|
        processor.process
      end
    end

    # Run the post-processors of the control in definition order
    def post_process(control)
      control.post_processors.each do |processor|
        processor.process
      end
    end
  end
end
data/lib/etl/parser.rb ADDED
@@ -0,0 +1,2 @@
1
+ require 'etl/parser/parser'
2
+ Dir[File.dirname(__FILE__) + "/parser/*.rb"].each { |file| require(file) }
@@ -0,0 +1,56 @@
1
module ETL #:nodoc:
  module Parser #:nodoc:
    # Parses delimited files
    class DelimitedParser < ETL::Parser::Parser
      include Enumerable

      # Initialize the parser
      # * <tt>source</tt>: The Source object
      def initialize(source)
        super
        configure
      end

      # Returns each row as a hash keyed by field name. The source file name
      # is expanded with Dir.glob, so it may match several files.
      #
      # Raises DefinitionError if a line has more columns than there are
      # defined fields.
      def each
        options = {}
        Dir.glob(file).each do |path|
          FasterCSV.foreach(path, options) do |raw_row|
            row = {}
            raw_row.each_with_index do |record, index|
              f = fields[index]
              unless f
                raise DefinitionError, "No field defined for column #{index + 1} in #{path}"
              end
              row[f.name] = convert(f.name, record, f.type)
            end
            yield row
          end
        end
      end

      # Get an array of defined fields
      def fields
        @fields ||= []
      end

      private
      # Build the field list from the source definition. Each entry is
      # either a symbol (a string field of that name) or a hash with
      # <tt>:name</tt> and an optional <tt>:type</tt>.
      def configure
        source.definition.each do |options|
          case options
          when Symbol
            fields << Field.new(options)
          when Hash
            fields << Field.new(options[:name], options[:type] || :string)
          else
            raise DefinitionError, "Each field definition must either be a symbol or a hash"
          end
        end
      end

      # A named and typed column of a delimited file
      class Field
        attr_reader :name, :type
        def initialize(name, type=:string)
          @name = name
          @type = type
        end
      end
    end
  end
end
@@ -0,0 +1,59 @@
1
module ETL #:nodoc:
  module Parser #:nodoc:
    # Parser for fixed-width files
    class FixedWidthParser < ETL::Parser::Parser
      include Enumerable

      # Initialize the parser
      # * <tt>source</tt>: The source object
      def initialize(source)
        super
        configure
      end

      # Return each row as a hash keyed by the keys of the source definition.
      # The source file name is expanded with Dir.glob, so it may match
      # several files. Values are stripped of surrounding whitespace; a field
      # which lies beyond the end of a short line yields an empty string.
      def each
        Dir.glob(file).each do |path|
          # File.foreach closes the file once all lines have been read
          File.foreach(path) do |line|
            row = {}
            fields.each do |name, f|
              # TODO make strip optional?
              raw = line[f.field_start, f.field_length] || ''
              row[name] = convert(name, raw.strip, f.type)
            end
            yield row
          end
        end
      end

      # Return a map of defined fields
      def fields
        @fields ||= {}
      end

      private
      # Build the field map from the source definition, a hash which maps
      # each field key to its options (<tt>:name</tt>, <tt>:start</tt>,
      # <tt>:end</tt>, <tt>:length</tt>, <tt>:type</tt>).
      def configure
        source.definition.each do |field, options|
          fields[field] = FixedWidthField.new(options[:name], options[:start], options[:end], options[:length], options[:type])
        end
      end
    end

    # Position and type of a single field in a fixed-width line
    class FixedWidthField
      attr_reader :name, :field_start, :field_end, :field_length, :type

      # Initialize the field.
      # * <tt>name</tt>: The field name
      # * <tt>field_start</tt>: The 1-based position of the first character
      # * <tt>field_end</tt>: The 1-based position of the last character
      # * <tt>field_length</tt>: The number of characters, used when no
      #   field_end is given
      # * <tt>type</tt>: The type name, :string if nil
      #
      # Raises DefinitionError if field_start is missing or if neither
      # field_end nor field_length is given.
      def initialize(name, field_start, field_end=nil, field_length=nil, type=nil)
        raise DefinitionError, "field_start required" unless field_start
        @name = name
        @type = type || :string
        # stored as a 0-based offset for String#[]
        @field_start = field_start - 1
        if field_end
          @field_end = field_end
          @field_length = @field_end - @field_start
        elsif field_length
          @field_length = field_length
          @field_end = @field_start + @field_length
        else
          raise DefinitionError, "Either field_end or field_length required"
        end
      end
    end
  end
end
@@ -0,0 +1,43 @@
1
module ETL
  module Parser
    # Base class for parsers. Holds the source being parsed and provides
    # value conversion and source file lookup to subclasses.
    class Parser
      # Convert the name (string or symbol) to a parser class.
      #
      # Example:
      # <tt>class_for_name(:fixed_width)</tt> returns a FixedWidthParser class
      def self.class_for_name(name)
        class_name = "#{name.to_s.classify}Parser"
        ETL::Parser.const_get(class_name)
      end

      attr_reader :source

      # Initialize the parser
      # * <tt>source</tt>: The source object
      def initialize(source)
        @source = source
      end

      # Convert the value to the specified type.
      #
      # Parameters:
      # * <tt>name</tt>: The name of the field
      # * <tt>value</tt>: The value
      # * <tt>type</tt>: The type name (:integer, :float, :string)
      #
      # Any type other than :integer or :float returns the value unchanged.
      def convert(name, value, type)
        return value.to_i if :integer == type
        return value.to_f if :float == type
        value
      end

      protected
      # The source file name, resolved relative to the directory of the
      # control file
      def file
        base_dir = File.dirname(source.control.file)
        File.join(base_dir, source.configuration[:file])
      end
    end
  end
end
@@ -0,0 +1,2 @@
1
+ require 'etl/processor/processor'
2
+ Dir[File.dirname(__FILE__) + "/processor/*.rb"].each { |file| require(file) }
@@ -0,0 +1,39 @@
1
module ETL
  module Processor
    # Processor which loads a file into a database table with a
    # LOAD DATA LOCAL INFILE statement.
    class BulkImportProcessor < ETL::Processor::Processor
      attr_reader :file, :target

      # Initialize the processor and connect to the target database.
      # * <tt>control</tt>: The Control object
      # * <tt>configuration</tt>: The configuration hash. <tt>:file</tt> is
      #   resolved relative to the directory of the control file;
      #   <tt>:target</tt> is a hash with <tt>:table</tt>, <tt>:database</tt>
      #   and the optional connection settings <tt>:adapter</tt>,
      #   <tt>:username</tt>, <tt>:host</tt> and <tt>:password</tt>.
      #
      # Raises ControlError if no target is configured.
      def initialize(control, configuration)
        super
        @file = File.join(File.dirname(control.file), configuration[:file])
        @target = configuration[:target]
        raise ControlError, "Target required in configuration" unless @target
        connect
      end

      # Load the file into the target table inside a transaction
      def process
        conn = ActiveRecord::Base.connection
        conn.transaction do
          # Since LOCAL is used this must be allowed by both the client and server.
          # The path is quoted by the adapter so that quote characters in it
          # cannot break the statement.
          conn.execute("LOAD DATA LOCAL INFILE #{conn.quote(file)} INTO TABLE #{target[:table]}")
        end
      end

      private
      # Get a DEBUG level logger writing to standard output
      def log
        unless @log
          @log = Logger.new(STDOUT)
          @log.level = Logger::DEBUG
        end
        @log
      end

      # Connect to the database
      def connect
        ActiveRecord::Base.establish_connection(
          :adapter => (target[:adapter] || :mysql),
          :username => (target[:username] || 'root'),
          :host => (target[:host] || 'localhost'),
          :password => target[:password],
          :database => target[:database]
        )
      end
    end
  end
end
@@ -0,0 +1,18 @@
1
module ETL #:nodoc:
  module Processor #:nodoc:
    # Base class for pre and post processors
    class Processor
      # Initialize the processor
      # * <tt>control</tt>: The Control object
      # * <tt>configuration</tt>: The configuration hash
      def initialize(control, configuration)
        @control, @configuration = control, configuration
      end

      protected
      # Readers for the values passed to the constructor, for use by
      # subclasses only
      attr_reader :control, :configuration
    end
  end
end
@@ -0,0 +1,2 @@
1
+ require 'etl/transform/transform'
2
+ Dir[File.dirname(__FILE__) + "/transform/*.rb"].each { |file| require(file) }
@@ -0,0 +1,37 @@
1
module ETL #:nodoc:
  module Transform #:nodoc:
    # Transform which decodes coded values
    class DecodeTransform < ETL::Transform::Transform
      attr_accessor :decode_table_path, :decode_table_delimiter, :default_value

      # Initialize the transform.
      # * <tt>control</tt>: The Control object
      # * <tt>configuration</tt>: The configuration hash. A given
      #   <tt>:decode_table_path</tt> is resolved relative to the directory of
      #   the control file (and stored back into the hash); without one,
      #   'decode.txt' is used. <tt>:decode_table_delimiter</tt> defaults to
      #   ':' and <tt>:default_value</tt> to 'No Value'.
      def initialize(control, configuration={})
        super

        if configuration[:decode_table_path]
          configuration[:decode_table_path] = File.join(File.dirname(control.file), configuration[:decode_table_path])
        end

        @decode_table_path = (configuration[:decode_table_path] || 'decode.txt')
        @decode_table_delimiter = (configuration[:decode_table_delimiter] || ':')
        @default_value = (configuration[:default_value] || 'No Value')
      end

      # Return the decoded value, or the default value for unknown codes
      def transform(value)
        decode_table[value] || default_value
      end

      # Get the map of codes to values, read from the decode table file on
      # first use. Each line has the form <tt>code:value</tt> (with the
      # configured delimiter); a line with an empty code replaces the
      # default value.
      def decode_table
        unless @decode_table
          # Build into a local first so a failed read is not memoized as an
          # empty table. File.foreach closes the file when done.
          table = {}
          File.foreach(decode_table_path) do |line|
            code, value = line.strip.split(decode_table_delimiter)
            if code && code.length > 0
              table[code] = value
            else
              @default_value = value
            end
          end
          @decode_table = table
        end
        @decode_table
      end
    end
  end
end
@@ -0,0 +1,15 @@
1
+ require 'digest/sha1'
2
+
3
module ETL #:nodoc:
  module Transform #:nodoc:
    # Transform which hashes the original value with a SHA-1 hash algorithm
    class Sha1Transform < ETL::Transform::Transform
      # Initialize the transform
      # * <tt>control</tt>: The Control object
      # * <tt>configuration</tt>: The configuration hash
      def initialize(control, configuration={})
        super
      end

      # Return the hexadecimal SHA-1 digest of the value. The value is
      # converted with to_s first, so values which a parser has already
      # converted to numbers are hashed by their string form (nil is hashed
      # as the empty string).
      def transform(value)
        Digest::SHA1.hexdigest(value.to_s)
      end
    end
  end
end
@@ -0,0 +1,29 @@
1
module ETL
  module Transform
    # Base class for transforms. Subclasses implement an instance method
    # <tt>transform(value)</tt> which returns the transformed value.
    class Transform
      # Run the value through the given transforms, feeding the result of
      # each one into the next, and return the final value.
      # * <tt>name</tt>: The name of the field
      # * <tt>value</tt>: The value to transform
      # * <tt>transforms</tt>: A list of Proc and Transform objects
      #
      # Raises ControlError for a list entry of any other kind.
      def self.transform(name, value, transforms)
        result = value
        transforms.each do |step|
          if step.is_a?(Proc)
            result = step.call(result)
          elsif step.is_a?(Transform)
            result = step.transform(result)
          else
            raise ControlError, "Unsupported transform configuration type: #{step}"
          end
        end
        result
      end

      attr_reader :control, :configuration

      # Initialize the transform
      # * <tt>control</tt>: The Control object
      # * <tt>configuration</tt>: The configuration hash
      def initialize(control, configuration={})
        @control, @configuration = control, configuration
      end
    end
  end
end
@@ -0,0 +1,9 @@
1
module ETL
  # Version of the ETL library, as separate components and as a dotted string
  module VERSION #:nodoc:
    MAJOR, MINOR, TINY = 0, 1, 0

    STRING = "#{MAJOR}.#{MINOR}.#{TINY}"
  end
end
metadata ADDED
@@ -0,0 +1,116 @@
1
+ --- !ruby/object:Gem::Specification
2
+ rubygems_version: 0.9.0
3
+ specification_version: 1
4
+ name: activewarehouse-etl
5
+ version: !ruby/object:Gem::Version
6
+ version: 0.1.0
7
+ date: 2006-12-06 00:00:00 -05:00
8
+ summary: Pure Ruby ETL package.
9
+ require_paths:
10
+ - lib
11
+ email: anthonyeden@gmail.com
12
+ homepage: http://activewarehouse.rubyforge.org/etl
13
+ rubyforge_project: activewarehouse
14
+ description: ActiveWarehouse ETL is a pure Ruby Extract-Transform-Load application for loading data into a database.
15
+ autorequire:
16
+ default_executable: etl
17
+ bindir: bin
18
+ has_rdoc: false
19
+ required_ruby_version: !ruby/object:Gem::Version::Requirement
20
+ requirements:
21
+ - - ">"
22
+ - !ruby/object:Gem::Version
23
+ version: 0.0.0
24
+ version:
25
+ platform: ruby
26
+ signing_key:
27
+ cert_chain:
28
+ post_install_message:
29
+ authors:
30
+ - Anthony Eden
31
+ files:
32
+ - CHANGELOG
33
+ - README
34
+ - Rakefile
35
+ - bin/etl
36
+ - lib/etl
37
+ - lib/etl.rb
38
+ - lib/etl/commands
39
+ - lib/etl/control
40
+ - lib/etl/control.rb
41
+ - lib/etl/engine.rb
42
+ - lib/etl/parser
43
+ - lib/etl/parser.rb
44
+ - lib/etl/processor
45
+ - lib/etl/processor.rb
46
+ - lib/etl/transform
47
+ - lib/etl/transform.rb
48
+ - lib/etl/version.rb
49
+ - lib/etl/commands/etl.rb
50
+ - lib/etl/control/control.rb
51
+ - lib/etl/control/destination
52
+ - lib/etl/control/destination.rb
53
+ - lib/etl/control/source
54
+ - lib/etl/control/source.rb
55
+ - lib/etl/control/destination/database_destination.rb
56
+ - lib/etl/control/destination/file_destination.rb
57
+ - lib/etl/control/source/database_source.rb
58
+ - lib/etl/control/source/file_source.rb
59
+ - lib/etl/parser/delimited_parser.rb
60
+ - lib/etl/parser/fixed_width_parser.rb
61
+ - lib/etl/parser/parser.rb
62
+ - lib/etl/processor/bulk_import_processor.rb
63
+ - lib/etl/processor/processor.rb
64
+ - lib/etl/transform/decode_transform.rb
65
+ - lib/etl/transform/sha1_transform.rb
66
+ - lib/etl/transform/transform.rb
67
+ test_files: []
68
+
69
+ rdoc_options:
70
+ - --exclude
71
+ - .
72
+ extra_rdoc_files: []
73
+
74
+ executables:
75
+ - etl
76
+ extensions: []
77
+
78
+ requirements: []
79
+
80
+ dependencies:
81
+ - !ruby/object:Gem::Dependency
82
+ name: rake
83
+ version_requirement:
84
+ version_requirements: !ruby/object:Gem::Version::Requirement
85
+ requirements:
86
+ - - ">="
87
+ - !ruby/object:Gem::Version
88
+ version: 0.7.1
89
+ version:
90
+ - !ruby/object:Gem::Dependency
91
+ name: activesupport
92
+ version_requirement:
93
+ version_requirements: !ruby/object:Gem::Version::Requirement
94
+ requirements:
95
+ - - ">="
96
+ - !ruby/object:Gem::Version
97
+ version: 1.3.1.5618
98
+ version:
99
+ - !ruby/object:Gem::Dependency
100
+ name: activerecord
101
+ version_requirement:
102
+ version_requirements: !ruby/object:Gem::Version::Requirement
103
+ requirements:
104
+ - - ">="
105
+ - !ruby/object:Gem::Version
106
+ version: 1.14.4.5618
107
+ version:
108
+ - !ruby/object:Gem::Dependency
109
+ name: fastercsv
110
+ version_requirement:
111
+ version_requirements: !ruby/object:Gem::Version::Requirement
112
+ requirements:
113
+ - - ">="
114
+ - !ruby/object:Gem::Version
115
+ version: 1.0.0
116
+ version: