activewarehouse-etl 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/CHANGELOG ADDED
@@ -0,0 +1,2 @@
1
+ 0.1.0 - Dec 6, 2006
2
+ * Initial release
data/README ADDED
@@ -0,0 +1,27 @@
1
+ Ruby ETL parser.
2
+
3
+ == Features
4
+
5
+ Currently supported features:
6
+
7
+ * ETL Domain Specific Language (DSL) - Control files are specified in a Ruby-based DSL
8
+ * Multiple source types - fixed-width and delimited text files currently supported
9
+ * Multiple destination types - file and database destinations
10
+ * Support for extracting from multiple sources
11
+ * Support for loading to multiple destinations
12
+ * Extensible transformations - comes with built in SHA1 one-way hash example
13
+ * Pre/post processing - export to files and then post process with the bulk import processor for large amounts of data
14
+ * Virtual fields - Add a field to the destination data which doesn't exist in the source data
15
+
16
+
17
+ == Requirements
18
+
19
+ * ActiveSupport Gem
20
+ * ActiveRecord Gem
21
+ * FasterCSV Gem
22
+
23
+ == Examples
24
+ Examples can be found in the test directory.
25
+
26
+ == Feedback
27
+ This is a work in progress. Comments should be made on the activewarehouse-discuss mailing list at the moment.
data/Rakefile ADDED
@@ -0,0 +1,117 @@
1
+ require 'rake'
2
+ require 'rake/testtask'
3
+ require 'rake/rdoctask'
4
+ require 'rake/packagetask'
5
+ require 'rake/gempackagetask'
6
+ require 'rake/contrib/rubyforgepublisher'
7
+
8
+ require File.join(File.dirname(__FILE__), 'lib/etl', 'version')
9
+
10
+ PKG_BUILD = ENV['PKG_BUILD'] ? '.' + ENV['PKG_BUILD'] : ''
11
+ PKG_NAME = 'activewarehouse-etl'
12
+ PKG_VERSION = ETL::VERSION::STRING + PKG_BUILD
13
+ PKG_FILE_NAME = "#{PKG_NAME}-#{PKG_VERSION}"
14
+ PKG_DESTINATION = ENV["PKG_DESTINATION"] || "../#{PKG_NAME}"
15
+
16
+ RELEASE_NAME = "REL #{PKG_VERSION}"
17
+
18
+ RUBY_FORGE_PROJECT = "activewarehouse"
19
+ RUBY_FORGE_USER = "aeden"
20
+
21
+ desc 'Default: run unit tests.'
22
+ task :default => :test
23
+
24
+ desc 'Test the ETL application.'
25
+ Rake::TestTask.new(:test) do |t|
26
+ t.libs << 'lib'
27
+ t.pattern = 'test/**/*_test.rb'
28
+ t.verbose = true
29
+ # TODO: reset the database
30
+ end
31
+
32
+ desc 'Generate documentation for the ETL application.'
33
+ Rake::RDocTask.new(:rdoc) do |rdoc|
34
+ rdoc.rdoc_dir = 'rdoc'
35
+ rdoc.title = 'ActiveWarehouse ETL'
36
+ rdoc.options << '--line-numbers' << '--inline-source'
37
+ rdoc.rdoc_files.include('README')
38
+ rdoc.rdoc_files.include('lib/**/*.rb')
39
+ end
40
+
41
+ PKG_FILES = FileList[
42
+ 'CHANGELOG',
43
+ 'README',
44
+ 'Rakefile',
45
+ 'bin/**/*',
46
+ 'doc/**/*',
47
+ 'lib/**/*',
48
+ ] - [ 'test' ]
49
+
50
+ spec = Gem::Specification.new do |s|
51
+ s.name = 'activewarehouse-etl'
52
+ s.version = PKG_VERSION
53
+ s.summary = "Pure Ruby ETL package."
54
+ s.description = <<-EOF
55
+ ActiveWarehouse ETL is a pure Ruby Extract-Transform-Load application for loading data into a database.
56
+ EOF
57
+
58
+ s.add_dependency('rake', '>= 0.7.1')
59
+ s.add_dependency('activesupport', '>= 1.3.1.5618')
60
+ s.add_dependency('activerecord', '>= 1.14.4.5618')
61
+ s.add_dependency('fastercsv', '>= 1.0.0')
62
+
63
+ s.rdoc_options << '--exclude' << '.'
64
+ s.has_rdoc = false
65
+
66
+ s.files = PKG_FILES.to_a.delete_if {|f| f.include?('.svn')}
67
+ s.require_path = 'lib'
68
+
69
+ s.bindir = "bin" # Use these for applications.
70
+ s.executables = ['etl']
71
+ s.default_executable = "etl"
72
+
73
+ s.author = "Anthony Eden"
74
+ s.email = "anthonyeden@gmail.com"
75
+ s.homepage = "http://activewarehouse.rubyforge.org/etl"
76
+ s.rubyforge_project = "activewarehouse"
77
+ end
78
+
79
+ Rake::GemPackageTask.new(spec) do |pkg|
80
+ pkg.gem_spec = spec
81
+ end
82
+
83
desc "Generate code statistics"
# Prints the total line count and the count of non-blank, non-comment lines
# for every Ruby file under lib/ (paths matching "vendor" are skipped),
# followed by the grand totals.
task :lines do
  total_lines = 0
  total_codelines = 0

  FileList["lib/**/*.rb"].each do |file_name|
    next if file_name =~ /vendor/

    lines = 0
    codelines = 0

    # File.foreach closes the file when iteration finishes, so no handle is
    # left open for each file that is scanned.
    File.foreach(file_name) do |line|
      lines += 1
      next if line =~ /^\s*$/ # blank line
      next if line =~ /^\s*#/ # comment-only line
      codelines += 1
    end
    puts "L: #{sprintf("%4d", lines)}, LOC #{sprintf("%4d", codelines)} | #{file_name}"

    total_lines += lines
    total_codelines += codelines
  end

  puts "Total: Lines #{total_lines}, LOC #{total_codelines}"
end
107
+
108
desc "Publish the release files to RubyForge."
# Logs in to RubyForge and uploads each package archive found in pkg/.
# The project, release name and file name come from the constants declared at
# the top of this Rakefile so they cannot drift from the packaging settings.
task :release => [ :package ] do
  `rubyforge login`

  %w( gem tgz zip ).each do |ext|
    package_file = "pkg/#{PKG_FILE_NAME}.#{ext}"

    # Only archive formats that the :package task actually produced can be
    # uploaded; report and skip the rest instead of running a doomed command.
    unless File.exist?(package_file)
      puts "Skipping #{package_file}: file not found"
      next
    end

    release_command = "rubyforge add_release #{RUBY_FORGE_PROJECT} #{PKG_NAME} '#{RELEASE_NAME}' #{package_file}"
    puts release_command
    system(release_command)
  end
end
data/bin/etl ADDED
@@ -0,0 +1,26 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ #--
4
+ # Copyright (c) 2006 Anthony Eden
5
+ #
6
+ # Permission is hereby granted, free of charge, to any person obtaining
7
+ # a copy of this software and associated documentation files (the
8
+ # "Software"), to deal in the Software without restriction, including
9
+ # without limitation the rights to use, copy, modify, merge, publish,
10
+ # distribute, sublicense, and/or sell copies of the Software, and to
11
+ # permit persons to whom the Software is furnished to do so, subject to
12
+ # the following conditions:
13
+ #
14
+ # The above copyright notice and this permission notice shall be
15
+ # included in all copies or substantial portions of the Software.
16
+ #
17
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
18
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
19
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
20
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
21
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
22
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
23
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
24
+ #++
25
+
26
+ require File.dirname(__FILE__) + "/../lib/etl/commands/etl"
data/lib/etl.rb ADDED
@@ -0,0 +1,58 @@
1
+ #--
2
+ # Copyright (c) 2006 Anthony Eden
3
+ #
4
+ # Permission is hereby granted, free of charge, to any person obtaining
5
+ # a copy of this software and associated documentation files (the
6
+ # "Software"), to deal in the Software without restriction, including
7
+ # without limitation the rights to use, copy, modify, merge, publish,
8
+ # distribute, sublicense, and/or sell copies of the Software, and to
9
+ # permit persons to whom the Software is furnished to do so, subject to
10
+ # the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be
13
+ # included in all copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
19
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
20
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
21
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22
+ #++
23
+
24
+ unless defined?(Logger)
25
+ require 'logger'
26
+ end
27
+
28
+ require 'rubygems'
29
+ unless defined?(ActiveSupport)
30
+ require_gem 'activesupport'
31
+ require 'active_support'
32
+ end
33
+
34
+ unless defined?(ActiveRecord)
35
+ require_gem 'activerecord'
36
+ require 'active_record'
37
+ end
38
+
39
+ require_gem 'fastercsv'
40
+ require 'faster_csv'
41
+
42
+ $:.unshift(File.dirname(__FILE__))
43
+
44
+ require 'etl/version'
45
+ require 'etl/engine'
46
+ require 'etl/control'
47
+ require 'etl/parser'
48
+ require 'etl/transform'
49
+ require 'etl/processor'
50
+
51
module ETL #:nodoc:
  # Root of the ETL exception hierarchy
  class ETLError < StandardError; end #:nodoc:

  # Raised for problems with a control file or control object
  class ControlError < ETLError; end #:nodoc:

  # Raised for problems with a source layout definition
  class DefinitionError < ControlError; end #:nodoc:
end
@@ -0,0 +1,45 @@
1
+ #--
2
+ # Copyright (c) 2006 Anthony Eden
3
+ #
4
+ # Permission is hereby granted, free of charge, to any person obtaining
5
+ # a copy of this software and associated documentation files (the
6
+ # "Software"), to deal in the Software without restriction, including
7
+ # without limitation the rights to use, copy, modify, merge, publish,
8
+ # distribute, sublicense, and/or sell copies of the Software, and to
9
+ # permit persons to whom the Software is furnished to do so, subject to
10
+ # the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be
13
+ # included in all copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
19
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
20
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
21
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22
+ #++
23
+
24
+ require 'benchmark'
25
+ require File.dirname(__FILE__) + '/../../etl'
26
+
27
# Print a usage statement
def usage #:nodoc:
  puts "Usage: etl ctl_file [ctl_file2 ctl_file3 ...]"
end

# Process every control file named on the command line, in order, and report
# the total wall-clock time. With no arguments only the usage text is shown.
if ARGV.empty?
  usage
else
  puts "Starting ETL process"

  elapsed = Benchmark.realtime do
    ARGV.each do |control_file|
      puts "Processing #{control_file}"
      ETL::Engine.process(control_file)
    end
  end

  puts "ETL process complete in #{sprintf('%.3f', elapsed)} seconds"
end
@@ -0,0 +1,3 @@
1
+ require 'etl/control/control'
2
+ require 'etl/control/source'
3
+ require 'etl/control/destination'
@@ -0,0 +1,134 @@
1
module ETL #:nodoc:
  module Control #:nodoc:
    # Object representation of a control file
    class Control
      # Path of the control file this object was created for
      attr_reader :file

      class << self
        # Parse a control file and return a validated Control instance.
        # * <tt>control_file</tt>: A path String or a File
        #
        # Raises ControlError if the control file defines no source or no
        # destination.
        def parse(control_file)
          control_file = control_file.path if control_file.is_a?(File)
          control = ETL::Control::Control.new(control_file)
          # NOTE: the control file is executed as Ruby code in the context of
          # the new Control object, so only trusted control files may be used.
          # The text is evaluated exactly as written and the path is passed to
          # eval so that errors report the control file's own name and line.
          eval(File.read(control_file), control.get_binding, control_file)
          control.validate
          control
        end

        # Convert the argument to a Control instance.
        # * <tt>control</tt>: A path String, a File or a Control object
        #
        # Raises ControlError for any other kind of argument.
        def resolve(control)
          case control
          when String
            # parse works on the path, so no File handle needs to be opened
            ETL::Control::Control.parse(control)
          when File
            ETL::Control::Control.parse(control)
          when ETL::Control::Control
            control
          else
            raise ControlError, "Control must be a String, File or Control object"
          end
        end
      end

      # Initialize the object.
      # * <tt>file</tt>: The path of the control file
      def initialize(file)
        @file = file
      end

      # Define a source. One source object is created for each supported
      # source type (see source_types) that has a key in the configuration.
      def source(name, configuration={}, definition={})
        source_types.each do |source_type|
          if configuration[source_type]
            source_class = ETL::Control::Source.class_for_name(source_type)
            sources << source_class.new(self, configuration, definition)
          end
        end
      end

      # Get the defined sources
      def sources
        @sources ||= []
      end

      # Define a destination. One destination object is created for each
      # supported destination type that has a key in the configuration.
      def destination(name, configuration={}, mapping={})
        destination_types.each do |dest_type|
          if configuration[dest_type]
            dest_class = ETL::Control::Destination.class_for_name(dest_type)
            destinations << dest_class.new(self, configuration, mapping)
          end
        end
      end

      # Get the defined destinations
      def destinations
        @destinations ||= []
      end

      # Register a transform for the named field. Either a transformer name
      # (resolved to a <Name>Transform class in ETL::Transform) or a block
      # must be given; otherwise ControlError is raised.
      def transform(name, transformer=nil, configuration={}, &block)
        transforms[name] ||= []
        if transformer
          transform_class = ETL::Transform.const_get("#{transformer.to_s.classify}Transform")
          transforms[name] << transform_class.new(self, configuration)
        elsif block_given?
          transforms[name] << block
        else
          raise ControlError, "Either a transformer or a block must be specified"
        end
      end

      # Get the transforms registered for the named field (empty if none)
      def get_transform(name)
        transforms[name] ||= []
      end

      # Register a pre-processor, resolved to a <Name>Processor class in
      # ETL::Processor
      def pre_process(name, configuration={})
        processor_class = ETL::Processor.const_get("#{name.to_s.classify}Processor")
        pre_processors << processor_class.new(self, configuration)
      end

      # Get the defined pre-processors
      def pre_processors
        @pre_processors ||= []
      end

      # Register a post-processor, resolved to a <Name>Processor class in
      # ETL::Processor
      def post_process(name, configuration={})
        processor_class = ETL::Processor.const_get("#{name.to_s.classify}Processor")
        post_processors << processor_class.new(self, configuration)
      end

      # Get the defined post-processors
      def post_processors
        @post_processors ||= []
      end

      # Get the binding in which the control file is evaluated
      def get_binding
        binding
      end

      # Get a map of all transforms for this control
      def transforms
        @transforms ||= {}
      end

      # Validate the control file. Raises ControlError unless at least one
      # source and at least one destination have been defined.
      def validate
        if sources.empty?
          raise ControlError, "Configuration must include one of the following for the source: #{source_types.join(',')}"
        end
        if destinations.empty?
          raise ControlError, "Configuration must include one of the following for the destination: #{destination_types.join(',')}"
        end
      end

      protected
      # Get an array of supported source types
      def source_types
        [:file, :database]
      end

      # Get an array of supported destination types
      def destination_types
        [:file, :database]
      end

    end
  end
end
@@ -0,0 +1,62 @@
1
module ETL #:nodoc:
  module Control #:nodoc:
    # Base class for destinations. Rows are buffered by write and handed to
    # the subclass through flush. Subclasses must implement <tt>flush</tt>
    # and <tt>close</tt>.
    class Destination
      attr_reader :control, :configuration, :mapping
      attr_accessor :buffer_size
      # The reader for current_row is defined below so it can supply a default
      attr_writer :current_row

      class << self
        # Convert the name (string or symbol) to a destination class, e.g.
        # <tt>class_for_name(:file)</tt> returns the FileDestination class
        def class_for_name(name)
          ETL::Control.const_get("#{name.to_s.classify}Destination")
        end
      end

      # Initialize the destination.
      # * <tt>control</tt>: The Control object
      # * <tt>configuration</tt>: The configuration hash. A default
      #   <tt>:buffer_size</tt> of 1000 is stored in it when none is given.
      # * <tt>mapping</tt>: The output mapping
      def initialize(control, configuration, mapping)
        @control = control
        @configuration = configuration
        @mapping = mapping
        @buffer_size = configuration[:buffer_size] ||= 1000
      end

      # Get the current row number (starts at 1)
      def current_row
        @current_row ||= 1
      end

      # Add the row to the buffer, flushing once the buffer holds
      # buffer_size rows
      def write(row)
        buffer << row
        flush if buffer.length >= buffer_size
      end

      # Abstract method
      def flush
        raise NotImplementedError, "flush method must be implemented by subclasses"
      end

      # Abstract method
      def close
        raise NotImplementedError, "close method must be implemented by subclasses"
      end

      protected
      # Get the buffer of rows waiting to be flushed
      def buffer
        @buffer ||= []
      end

      # Get the order of elements from the first source's definition. Hash
      # entries contribute their <tt>:name</tt>; any other entry is used as is.
      def order_from_source
        control.sources.first.definition.map do |item|
          item.is_a?(Hash) ? item[:name] : item
        end
      end
    end
  end
end
61
+
62
+ Dir[File.dirname(__FILE__) + "/destination/*.rb"].each { |file| require(file) }
@@ -0,0 +1,47 @@
1
module ETL
  module Control
    # Database table as the final destination. Each buffered row is written
    # with an INSERT statement through the ActiveRecord connection.
    class DatabaseDestination < Destination
      attr_reader :order

      # Initialize the object and connect to the database.
      # * <tt>control</tt>: The Control object
      # * <tt>configuration</tt>: The configuration map (<tt>:table</tt> plus
      #   the connection settings used by connect)
      # * <tt>mapping</tt>: The output mapping; <tt>:order</tt> lists the
      #   columns to write and defaults to the order of the first source
      def initialize(control, configuration, mapping)
        super
        @order = mapping[:order] || order_from_source
        raise ControlError, "Order required in mapping" unless @order
        connect
      end

      # Insert all buffered rows inside a single transaction, then empty the
      # buffer.
      def flush
        conn = ActiveRecord::Base.connection
        conn.transaction do
          buffer.each do |row|
            # Let the adapter quote each value so that embedded quote
            # characters cannot break (or inject into) the statement and nil
            # is written as NULL.
            values = order.map { |name| conn.quote(row[name]) }
            q = "INSERT INTO #{configuration[:table]} (#{order.join(',')}) VALUES (#{values.join(',')})"
            ETL::Engine.logger.debug("Query: #{q}")
            conn.execute(q, "Insert row #{current_row}")
            self.current_row = current_row + 1
          end
          buffer.clear
        end
      end

      # Close the destination. This will flush the buffer and disconnect from
      # the database.
      def close
        flush
        ActiveRecord::Base.connection.disconnect!
      end

      private
      # Connect to the database. Defaults: mysql adapter, user root on
      # localhost.
      def connect
        ActiveRecord::Base.establish_connection(
          :adapter => (configuration[:adapter] || :mysql),
          :username => (configuration[:username] || 'root'),
          :host => (configuration[:host] || 'localhost'),
          :password => configuration[:password],
          :database => configuration[:database]
        )
      end
    end
  end
end
@@ -0,0 +1,63 @@
1
module ETL #:nodoc:
  module Control #:nodoc:
    # File as the final destination.
    class FileDestination < Destination
      attr_reader :file, :order
      attr_accessor :append, :separator, :eol, :enclose

      # Initialize the object.
      # * <tt>control</tt>: The Control object
      # * <tt>configuration</tt>: The configuration map. <tt>:file</tt> is
      #   resolved relative to the control file's directory; defaults are
      #   <tt>:append</tt> false, <tt>:separator</tt> ',', <tt>:eol</tt> "\n"
      #   and no <tt>:enclose</tt> string.
      # * <tt>mapping</tt>: The output mapping; <tt>:order</tt> lists the
      #   fields to write and defaults to the order of the first source
      def initialize(control, configuration, mapping)
        super
        @file = File.join(File.dirname(control.file), configuration[:file])
        @append = configuration[:append] ||= false
        @separator = configuration[:separator] ||= ','
        @eol = configuration[:eol] ||= "\n"
        @enclose = configuration[:enclose] ||= nil

        @order = mapping[:order] || order_from_source
        raise ControlError, "Order required in mapping" unless @order
      end

      # Close the destination. This will flush the buffer and close the underlying stream or connection.
      def close
        flush
        f.close
      end

      # Write all buffered rows to the file, one line per row, then empty the
      # buffer. When an enclosure string is configured every value is wrapped
      # in it and occurrences of it inside the value are backslash-escaped.
      def flush
        # Escape the enclosure so that characters with a special meaning in a
        # regular expression (e.g. '|' or '.') are matched literally.
        enclosure_pattern = enclose.nil? ? nil : /(#{Regexp.escape(enclose)})/
        buffer.each do |row|
          add_virtuals(row)
          values = order.collect { |name| row[name] }
          if enclosure_pattern
            values.collect! { |v| enclose + v.to_s.gsub(enclosure_pattern, '\\\\\1') + enclose }
          end
          f.write(values.join(separator))
          f.write(eol)
        end
        buffer.clear
      end

      private
      # Get the open file stream. File.open is used rather than Kernel#open so
      # that the configured path is always treated as a plain file name.
      def f
        @f ||= File.open(file, mode)
      end

      # Get the appropriate mode to open the file stream
      def mode
        append ? 'a' : 'w'
      end

      # Copy the virtual fields from the mapping (if any) into the row
      def add_virtuals(row)
        if mapping[:virtual]
          mapping[:virtual].each do |key,value|
            row[key] = value
          end
        end
      end
    end
  end
end
@@ -0,0 +1,27 @@
1
module ETL #:nodoc:
  module Control #:nodoc:
    # ETL source. Subclasses must implement the <tt>each</tt> method.
    class Source
      include Enumerable

      attr_accessor :control
      attr_accessor :configuration
      attr_accessor :definition

      # Convert the name (string or symbol) to a source class, e.g.
      # <tt>class_for_name(:file)</tt> returns the FileSource class
      def self.class_for_name(name)
        class_name = "#{name.to_s.classify}Source"
        ETL::Control.const_get(class_name)
      end

      # Initialize the Source instance
      # * <tt>control</tt>: The control object
      # * <tt>configuration</tt>: The configuration hash
      # * <tt>definition</tt>: The source layout definition
      def initialize(control, configuration, definition)
        @control, @configuration, @definition = control, configuration, definition
      end
    end
  end
end
26
+
27
+ Dir[File.dirname(__FILE__) + "/source/*.rb"].each { |file| require(file) }
@@ -0,0 +1,30 @@
1
module ETL
  module Control
    # Database table as the source. Every row of the configured
    # <tt>:table</tt> is selected through the ActiveRecord connection.
    class DatabaseSource < Source
      # Initialize the source and connect to the database.
      def initialize(control, configuration, definition)
        super
        connect
      end

      # Returns each row from the source
      def each
        query = "SELECT * FROM #{configuration[:table]}"
        rows = ActiveRecord::Base.connection.select_all(query)
        rows.each { |row| yield row }
      end

      private
      # Set up the DB connection. Defaults: mysql adapter, user root on
      # localhost.
      def connect
        settings = {
          :adapter => (configuration[:adapter] || :mysql),
          :username => (configuration[:username] || 'root'),
          :host => (configuration[:host] || 'localhost'),
          :password => configuration[:password],
          :database => configuration[:database]
        }
        ActiveRecord::Base.establish_connection(settings)
      end
    end
  end
end
@@ -0,0 +1,19 @@
1
module ETL
  module Control
    # File as the source. Reading is delegated to the parser named by the
    # <tt>:parser</tt> configuration entry.
    class FileSource < Source
      # Initialize the source and build its parser.
      def initialize(control, configuration, definition)
        super
        configure
      end

      # Returns each row from the source
      def each
        @parser.each do |row|
          yield row
        end
      end

      private
      # Look up the parser class by name and instantiate it for this source
      def configure
        parser_class = ETL::Parser::Parser.class_for_name(@configuration[:parser])
        @parser = parser_class.new(self)
      end
    end
  end
end
data/lib/etl/engine.rb ADDED
@@ -0,0 +1,61 @@
1
module ETL
  # Runs an ETL process: pre-processors, then every row of every source
  # through the transforms and into every destination, then post-processors.
  class Engine

    class << self
      # Process a control file or object with a new engine instance.
      def process(control_file)
        new.process(control_file)
      end

      # Allows the default logger to be replaced
      attr_writer :logger

      # Get the logger. Unless one has been assigned, a DEBUG-level logger
      # writing to 'etl.log' is created on first use.
      def logger
        unless @logger
          @logger = Logger.new('etl.log')
          @logger.level = Logger::DEBUG
        end
        @logger
      end
    end

    # Process a control file or object.
    # * <tt>control</tt>: Anything accepted by ETL::Control::Control.resolve
    def process(control)
      control = ETL::Control::Control.resolve(control)

      pre_process(control)

      sources = control.sources
      destinations = control.destinations

      sources.each do |source|
        source.each do |row|
          row.each do |name, value|
            # execute transforms
            row[name] = ETL::Transform::Transform.transform(name, value, control.get_transform(name))
          end
          # write the row to the destination
          destinations.each do |destination|
            destination.write(row)
          end
        end
      end

      # Close the destinations only after every source has been read;
      # closing them per source would leave later sources writing to
      # destinations that are already closed.
      destinations.each do |destination|
        destination.close
      end

      post_process(control)
    end

    private
    # Run the control's pre-processors in the order they were defined
    def pre_process(control)
      control.pre_processors.each do |processor|
        processor.process
      end
    end

    # Run the control's post-processors in the order they were defined
    def post_process(control)
      control.post_processors.each do |processor|
        processor.process
      end
    end
  end
end
data/lib/etl/parser.rb ADDED
@@ -0,0 +1,2 @@
1
+ require 'etl/parser/parser'
2
+ Dir[File.dirname(__FILE__) + "/parser/*.rb"].each { |file| require(file) }
@@ -0,0 +1,56 @@
1
module ETL #:nodoc:
  module Parser #:nodoc:
    # Parses delimited files
    class DelimitedParser < ETL::Parser::Parser
      include Enumerable

      # Initialize the parser
      # * <tt>source</tt>: The Source object
      def initialize(source)
        super
        configure
      end

      # Returns each row as a hash of field name to converted value. Every
      # file matching the source's file pattern is read in turn.
      def each
        options = {}
        Dir.glob(file).each do |path|
          FasterCSV.foreach(path, options) do |raw_row|
            row = {}
            raw_row.each_with_index do |record, position|
              field = fields[position]
              row[field.name] = convert(field.name, record, field.type)
            end
            yield row
          end
        end
      end

      # Get an array of defined fields
      def fields
        @fields ||= []
      end

      private
      # Build the field list from the source definition. A symbol defines a
      # field by name only; a hash supplies <tt>:name</tt> and <tt>:type</tt>.
      def configure
        source.definition.each do |options|
          fields << case options
                    when Symbol
                      Field.new(options)
                    when Hash
                      Field.new(options[:name], options[:type])
                    else
                      raise DefinitionError, "Each field definition must either be a symbol or a hash"
                    end
        end
      end

      # A named, typed column of a delimited file
      class Field
        attr_reader :name, :type

        def initialize(name, type=:string)
          @name, @type = name, type
        end
      end
    end
  end
end
@@ -0,0 +1,59 @@
1
module ETL #:nodoc:
  module Parser #:nodoc:
    # Parser for fixed width files
    class FixedWidthParser < ETL::Parser::Parser
      include Enumerable

      # Initialize the parser
      # * <tt>source</tt>: The source object
      def initialize(source)
        super
        configure
      end

      # Return each row as a hash of field name to converted value. Every
      # file matching the source's file pattern is read line by line.
      def each
        Dir.glob(file).each do |path|
          # File.foreach closes the file once all lines have been read, so no
          # handle is left open per input file.
          File.foreach(path) do |line|
            row = {}
            fields.each do |name, f|
              # TODO make strip optional?
              row[name] = convert(name, line[f.field_start, f.field_length].strip, f.type)
            end
            yield row
          end
        end
      end

      # Return a map of defined fields
      def fields
        @fields ||= {}
      end

      private
      # Build the field map from the source definition, keyed by field
      def configure
        source.definition.each do |field, options|
          fields[field] = FixedWidthField.new(options[:name], options[:start], options[:end], options[:length], options[:type])
        end
      end
    end

    # A field of a fixed width file. The start position is given 1-based and
    # stored 0-based; either the end position or the length must be given.
    class FixedWidthField
      attr_reader :name, :field_start, :field_end, :field_length, :type

      # Raises DefinitionError when neither field_end nor field_length is
      # given.
      def initialize(name, field_start, field_end=nil, field_length=nil, type=nil)
        @name = name
        @type = type ||= :string
        @field_start = field_start - 1
        if field_end
          @field_end = field_end
          @field_length = @field_end - @field_start
        elsif field_length
          @field_length = field_length
          @field_end = @field_start + @field_length
        else
          raise DefinitionError, "Either field_end or field_length required"
        end
      end
    end
  end
end
@@ -0,0 +1,43 @@
1
module ETL
  module Parser
    # Base class for parsers. Holds the source being parsed and provides
    # value conversion and file path resolution to subclasses.
    class Parser
      # Convert the name (string or symbol) to a parser class.
      #
      # Example:
      # <tt>class_for_name(:fixed_width)</tt> returns a FixedWidthParser class
      def self.class_for_name(name)
        class_name = "#{name.to_s.classify}Parser"
        ETL::Parser.const_get(class_name)
      end

      attr_reader :source

      def initialize(source)
        @source = source
      end

      # Convert the value to the specified type.
      #
      # Parameters:
      # * <tt>name</tt>: The name of the field
      # * <tt>value</tt>: The value
      # * <tt>type</tt>: The type name (:integer, :float, :string)
      #
      # Any type other than :integer or :float returns the value unchanged.
      def convert(name, value, type)
        return value.to_i if type == :integer
        return value.to_f if type == :float
        value
      end

      protected
      # The configured file name, resolved relative to the directory of the
      # control file
      def file
        control_dir = File.dirname(source.control.file)
        File.join(control_dir, source.configuration[:file])
      end
    end
  end
end
@@ -0,0 +1,2 @@
1
+ require 'etl/processor/processor'
2
+ Dir[File.dirname(__FILE__) + "/processor/*.rb"].each { |file| require(file) }
@@ -0,0 +1,39 @@
1
module ETL
  module Processor
    # Processor which loads a file into a database table with a
    # LOAD DATA LOCAL INFILE statement
    class BulkImportProcessor < ETL::Processor::Processor
      attr_reader :file, :target

      # Initialize the processor and connect to the target database.
      # * <tt>control</tt>: The Control object
      # * <tt>configuration</tt>: The configuration map. <tt>:file</tt> is
      #   resolved relative to the control file's directory; <tt>:target</tt>
      #   holds the table name and connection settings.
      def initialize(control, configuration)
        super
        control_dir = File.dirname(control.file)
        @file = File.join(control_dir, configuration[:file])
        @target = configuration[:target]
        connect
      end

      # Execute the bulk load inside a transaction
      def process
        connection = ActiveRecord::Base.connection
        connection.transaction do
          # Since LOCAL is used this must be allowed by both the client and server
          statement = "LOAD DATA LOCAL INFILE '#{file}' INTO TABLE #{target[:table]}"
          connection.execute(statement)
        end
      end

      private
      # Get a DEBUG-level logger writing to STDOUT, created on first use
      def log
        @log ||= begin
          stdout_logger = Logger.new(STDOUT)
          stdout_logger.level = Logger::DEBUG
          stdout_logger
        end
      end

      # Connect to the database
      def connect
        settings = {
          :adapter => (target[:adapter] || :mysql),
          :username => (target[:username] || 'root'),
          :host => (target[:host] || 'localhost'),
          :password => target[:password],
          :database => target[:database]
        }
        ActiveRecord::Base.establish_connection(settings)
      end
    end
  end
end
@@ -0,0 +1,18 @@
1
module ETL #:nodoc:
  module Processor #:nodoc:
    # Base class for pre and post processors
    class Processor
      # * <tt>control</tt>: The Control object
      # * <tt>configuration</tt>: The configuration hash
      def initialize(control, configuration)
        @control, @configuration = control, configuration
      end

      protected
      # Readers are protected: only the processor hierarchy may use them
      attr_reader :control, :configuration
    end
  end
end
@@ -0,0 +1,2 @@
1
+ require 'etl/transform/transform'
2
+ Dir[File.dirname(__FILE__) + "/transform/*.rb"].each { |file| require(file) }
@@ -0,0 +1,37 @@
1
module ETL #:nodoc:
  module Transform #:nodoc:
    # Transform which decodes coded values
    class DecodeTransform < ETL::Transform::Transform
      attr_accessor :decode_table_path, :decode_table_delimiter, :default_value

      # Initialize the transform.
      # * <tt>control</tt>: The Control object
      # * <tt>configuration</tt>: The configuration hash
      #
      # Configuration options:
      # * <tt>:decode_table_path</tt>: Decode table file, resolved relative
      #   to the control file's directory (default 'decode.txt', unresolved)
      # * <tt>:decode_table_delimiter</tt>: Code/value delimiter (default ':')
      # * <tt>:default_value</tt>: Result for unknown codes (default
      #   'No Value')
      def initialize(control, configuration={})
        super

        if configuration[:decode_table_path]
          configuration[:decode_table_path] = File.join(File.dirname(control.file), configuration[:decode_table_path])
        end

        @decode_table_path = (configuration[:decode_table_path] || 'decode.txt')
        @decode_table_delimiter = (configuration[:decode_table_delimiter] || ':')
        @default_value = (configuration[:default_value] || 'No Value')
      end

      # Return the decoded value, or the default value for an unknown code
      def transform(value)
        decode_table[value] || default_value
      end

      # Get the code to value map, loaded from the decode table file on first
      # use. A line whose code is empty replaces the default value.
      def decode_table
        unless @decode_table
          @decode_table = {}
          # File.foreach closes the file once every line has been read
          File.foreach(decode_table_path) do |line|
            entry = line.strip
            # A blank line carries neither code nor value; skipping it keeps
            # it from wiping out the default value.
            next if entry.empty?

            # Split on the first delimiter only so that a value may itself
            # contain the delimiter.
            code, value = entry.split(decode_table_delimiter, 2)
            if code && code.length > 0
              @decode_table[code] = value
            else
              @default_value = value
            end
          end
        end
        @decode_table
      end
    end
  end
end
@@ -0,0 +1,15 @@
1
+ require 'digest/sha1'
2
+
3
module ETL #:nodoc:
  module Transform #:nodoc:
    # Transform which hashes the original value with a SHA-1 hash algorithm.
    # Construction is inherited unchanged from ETL::Transform::Transform.
    class Sha1Transform < ETL::Transform::Transform
      # Return the hexadecimal SHA-1 digest of the value
      def transform(value)
        Digest::SHA1.hexdigest(value)
      end
    end
  end
end
@@ -0,0 +1,29 @@
1
module ETL
  module Transform
    # Base class for transforms. Also provides the class-level entry point
    # that runs a value through a chain of transforms.
    class Transform
      class << self
        # Apply each transform, in order, to the value and return the result.
        # * <tt>name</tt>: The field name
        # * <tt>value</tt>: The value to transform
        # * <tt>transforms</tt>: Procs (invoked with call) and/or Transform
        #   instances (invoked with transform)
        #
        # Raises ControlError for any other kind of entry.
        def transform(name, value, transforms)
          transforms.inject(value) do |current, transform|
            case transform
            when Proc
              transform.call(current)
            when Transform
              transform.transform(current)
            else
              raise ControlError, "Unsupported transform configuration type: #{transform}"
            end
          end
        end
      end

      attr_reader :control, :configuration

      # * <tt>control</tt>: The Control object
      # * <tt>configuration</tt>: The configuration hash
      def initialize(control, configuration={})
        @control, @configuration = control, configuration
      end
    end
  end
end
@@ -0,0 +1,9 @@
1
module ETL
  # Version of the ETL package, as numeric parts and as a dotted string
  module VERSION #:nodoc:
    MAJOR, MINOR, TINY = 0, 1, 0

    STRING = "#{MAJOR}.#{MINOR}.#{TINY}"
  end
end
metadata ADDED
@@ -0,0 +1,116 @@
1
+ --- !ruby/object:Gem::Specification
2
+ rubygems_version: 0.9.0
3
+ specification_version: 1
4
+ name: activewarehouse-etl
5
+ version: !ruby/object:Gem::Version
6
+ version: 0.1.0
7
+ date: 2006-12-06 00:00:00 -05:00
8
+ summary: Pure Ruby ETL package.
9
+ require_paths:
10
+ - lib
11
+ email: anthonyeden@gmail.com
12
+ homepage: http://activewarehouse.rubyforge.org/etl
13
+ rubyforge_project: activewarehouse
14
+ description: ActiveWarehouse ETL is a pure Ruby Extract-Transform-Load application for loading data into a database.
15
+ autorequire:
16
+ default_executable: etl
17
+ bindir: bin
18
+ has_rdoc: false
19
+ required_ruby_version: !ruby/object:Gem::Version::Requirement
20
+ requirements:
21
+ - - ">"
22
+ - !ruby/object:Gem::Version
23
+ version: 0.0.0
24
+ version:
25
+ platform: ruby
26
+ signing_key:
27
+ cert_chain:
28
+ post_install_message:
29
+ authors:
30
+ - Anthony Eden
31
+ files:
32
+ - CHANGELOG
33
+ - README
34
+ - Rakefile
35
+ - bin/etl
36
+ - lib/etl
37
+ - lib/etl.rb
38
+ - lib/etl/commands
39
+ - lib/etl/control
40
+ - lib/etl/control.rb
41
+ - lib/etl/engine.rb
42
+ - lib/etl/parser
43
+ - lib/etl/parser.rb
44
+ - lib/etl/processor
45
+ - lib/etl/processor.rb
46
+ - lib/etl/transform
47
+ - lib/etl/transform.rb
48
+ - lib/etl/version.rb
49
+ - lib/etl/commands/etl.rb
50
+ - lib/etl/control/control.rb
51
+ - lib/etl/control/destination
52
+ - lib/etl/control/destination.rb
53
+ - lib/etl/control/source
54
+ - lib/etl/control/source.rb
55
+ - lib/etl/control/destination/database_destination.rb
56
+ - lib/etl/control/destination/file_destination.rb
57
+ - lib/etl/control/source/database_source.rb
58
+ - lib/etl/control/source/file_source.rb
59
+ - lib/etl/parser/delimited_parser.rb
60
+ - lib/etl/parser/fixed_width_parser.rb
61
+ - lib/etl/parser/parser.rb
62
+ - lib/etl/processor/bulk_import_processor.rb
63
+ - lib/etl/processor/processor.rb
64
+ - lib/etl/transform/decode_transform.rb
65
+ - lib/etl/transform/sha1_transform.rb
66
+ - lib/etl/transform/transform.rb
67
+ test_files: []
68
+
69
+ rdoc_options:
70
+ - --exclude
71
+ - .
72
+ extra_rdoc_files: []
73
+
74
+ executables:
75
+ - etl
76
+ extensions: []
77
+
78
+ requirements: []
79
+
80
+ dependencies:
81
+ - !ruby/object:Gem::Dependency
82
+ name: rake
83
+ version_requirement:
84
+ version_requirements: !ruby/object:Gem::Version::Requirement
85
+ requirements:
86
+ - - ">="
87
+ - !ruby/object:Gem::Version
88
+ version: 0.7.1
89
+ version:
90
+ - !ruby/object:Gem::Dependency
91
+ name: activesupport
92
+ version_requirement:
93
+ version_requirements: !ruby/object:Gem::Version::Requirement
94
+ requirements:
95
+ - - ">="
96
+ - !ruby/object:Gem::Version
97
+ version: 1.3.1.5618
98
+ version:
99
+ - !ruby/object:Gem::Dependency
100
+ name: activerecord
101
+ version_requirement:
102
+ version_requirements: !ruby/object:Gem::Version::Requirement
103
+ requirements:
104
+ - - ">="
105
+ - !ruby/object:Gem::Version
106
+ version: 1.14.4.5618
107
+ version:
108
+ - !ruby/object:Gem::Dependency
109
+ name: fastercsv
110
+ version_requirement:
111
+ version_requirements: !ruby/object:Gem::Version::Requirement
112
+ requirements:
113
+ - - ">="
114
+ - !ruby/object:Gem::Version
115
+ version: 1.0.0
116
+ version: