activewarehouse-etl 0.3.0 → 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +8 -2
- data/Rakefile +6 -0
- data/TODO +9 -0
- data/lib/etl.rb +9 -1
- data/lib/etl/active_record_ext.rb +1 -0
- data/lib/etl/active_record_ext/connection_adapters/mysql_adapter.rb +34 -0
- data/lib/etl/control/destination.rb +36 -1
- data/lib/etl/control/destination/database_destination.rb +14 -2
- data/lib/etl/control/destination/file_destination.rb +1 -31
- data/lib/etl/control/source/file_source.rb +2 -0
- data/lib/etl/parser/delimited_parser.rb +21 -1
- data/lib/etl/parser/fixed_width_parser.rb +5 -0
- data/lib/etl/parser/parser.rb +4 -0
- data/lib/etl/processor/bulk_import_processor.rb +6 -12
- data/lib/etl/processor/processor.rb +7 -0
- data/lib/etl/processor/truncate_processor.rb +30 -0
- data/lib/etl/transform/sha1_transform.rb +1 -0
- data/lib/etl/transform/transform.rb +4 -0
- data/lib/etl/version.rb +1 -1
- metadata +8 -2
data/CHANGELOG
CHANGED
@@ -8,5 +8,11 @@
|
|
8
8
|
* Added support for generators in destinations
|
9
9
|
* Added a SurrogateKeyGenerator for cases where the database doesn't support auto generated surrogate keys
|
10
10
|
|
11
|
-
0.3.0
|
12
|
-
* Added support for calculated values in virtual fields with Proc
|
11
|
+
0.3.0 - Dec 19, 2006
|
12
|
+
* Added support for calculated values in virtual fields with Proc
|
13
|
+
|
14
|
+
0.4.0 - Jan 11, 2007
|
15
|
+
* Added :skip_lines option to file source configurations, which can be used to skip the first n lines in the source data file
|
16
|
+
* Added better error handling in delimited parser - an error is now raised if the expected and actual field lengths do not match
|
17
|
+
* Added :truncate option for database destination. Set to true to truncate before importing data.
|
18
|
+
* Added support for :unique => [] option and virtual fields for the database destination
|
data/Rakefile
CHANGED
@@ -41,6 +41,7 @@ end
|
|
41
41
|
PKG_FILES = FileList[
|
42
42
|
'CHANGELOG',
|
43
43
|
'README',
|
44
|
+
'TODO',
|
44
45
|
'Rakefile',
|
45
46
|
'bin/**/*',
|
46
47
|
'doc/**/*',
|
@@ -116,4 +117,9 @@ task :release => [ :package ] do
|
|
116
117
|
puts release_command
|
117
118
|
system(release_command)
|
118
119
|
end
|
120
|
+
end
|
121
|
+
|
122
|
+
desc "Publish the API documentation"
|
123
|
+
task :pdoc => [:rdoc] do
|
124
|
+
Rake::SshDirPublisher.new("aeden@rubyforge.org", "/var/www/gforge-projects/activewarehouse/etl/rdoc", "rdoc").upload
|
119
125
|
end
|
data/TODO
ADDED
@@ -0,0 +1,9 @@
|
|
1
|
+
TODO
|
2
|
+
|
3
|
+
* Add built-in support for audit_dimension
|
4
|
+
* Do not rerun the processing if it isn't needed, i.e. the source and control files have not been modified (allow forced override)
|
5
|
+
* Add built-in FK lookup
|
6
|
+
* Provide greater control in error handling
|
7
|
+
** Allow an error threshold
|
8
|
+
** Don't die completely on a parse error; just stop processing that specific file if the error threshold is reached
|
9
|
+
** Allow mismatch row length error in delimited parser to be ignored
|
data/lib/etl.rb
CHANGED
@@ -1,5 +1,9 @@
|
|
1
|
+
# This source file requires all of the necessary gems and source files for ActiveWarehouse ETL. If you
|
2
|
+
# load this source file all of the other required files and gems will also be brought into the
|
3
|
+
# runtime.
|
4
|
+
|
1
5
|
#--
|
2
|
-
# Copyright (c) 2006 Anthony Eden
|
6
|
+
# Copyright (c) 2006-2007 Anthony Eden
|
3
7
|
#
|
4
8
|
# Permission is hereby granted, free of charge, to any person obtaining
|
5
9
|
# a copy of this software and associated documentation files (the
|
@@ -49,6 +53,8 @@ require 'etl/transform'
|
|
49
53
|
require 'etl/processor'
|
50
54
|
require 'etl/generator'
|
51
55
|
|
56
|
+
require 'etl/active_record_ext'
|
57
|
+
|
52
58
|
module ETL #:nodoc:
|
53
59
|
class ETLError < StandardError #:nodoc:
|
54
60
|
end
|
@@ -56,4 +62,6 @@ module ETL #:nodoc:
|
|
56
62
|
end
|
57
63
|
class DefinitionError < ControlError #:nodoc:
|
58
64
|
end
|
65
|
+
class MismatchError < ETLError #:nodoc:
|
66
|
+
end
|
59
67
|
end
|
@@ -0,0 +1 @@
|
|
1
|
+
require 'etl/active_record_ext/connection_adapters/mysql_adapter'
|
@@ -0,0 +1,34 @@
|
|
1
|
+
require 'active_record/connection_adapters/abstract_adapter'
|
2
|
+
|
3
|
+
module ActiveRecord
  module ConnectionAdapters
    # ETL additions to the MySQL connection adapter: table truncation and
    # bulk loading via LOAD DATA LOCAL INFILE.
    class MysqlAdapter < AbstractAdapter
      # Issue a TRUNCATE statement against +table_name+.
      #
      # NOTE(review): the original author states that in MySQL a truncate will
      # *NOT* reset the auto_increment -- confirm for the MySQL version and
      # storage engine in use.
      def truncate(table_name)
        execute("TRUNCATE #{table_name}")
      end

      # Bulk load the contents of +file+ into +table_name+. The LOCAL keyword is
      # always used, so the file must be found locally, not on the remote
      # server, to be loaded.
      #
      # Options:
      # * <tt>:ignore</tt> -- Ignore the specified number of lines from the source file
      # * <tt>:columns</tt> -- Array of column names defining the source file column order
      # * <tt>:fields</tt> -- Hash of options for fields:
      # ** <tt>:delimited_by</tt> -- The field delimiter
      # ** <tt>:enclosed_by</tt> -- The field enclosure
      def bulk_load(file, table_name, options={})
        # Collect the statement clause by clause; they are joined with single spaces.
        clauses = ["LOAD DATA LOCAL INFILE '#{file}' INTO TABLE #{table_name}"]

        field_options = options[:fields]
        if field_options
          clauses << "FIELDS"
          clauses << "TERMINATED BY '#{field_options[:delimited_by]}'" if field_options[:delimited_by]
          clauses << "ENCLOSED BY '#{field_options[:enclosed_by]}'" if field_options[:enclosed_by]
        end

        clauses << "IGNORE #{options[:ignore]} LINES" if options[:ignore]
        clauses << "(#{options[:columns].join(',')})" if options[:columns]

        execute(clauses.join(' '))
      end
    end
  end
end
|
@@ -2,7 +2,7 @@ module ETL #:nodoc:
|
|
2
2
|
module Control #:nodoc:
|
3
3
|
class Destination
|
4
4
|
attr_reader :control, :configuration, :mapping
|
5
|
-
attr_accessor :buffer_size, :current_row
|
5
|
+
attr_accessor :buffer_size, :current_row, :unique
|
6
6
|
|
7
7
|
class << self
|
8
8
|
def class_for_name(name)
|
@@ -58,6 +58,41 @@ module ETL #:nodoc:
|
|
58
58
|
end
|
59
59
|
order
|
60
60
|
end
|
61
|
+
|
62
|
+
# Return true if the row is allowed. The row will not be allowed if the :unique option is specified
# in the configuration and the compound key already exists.
#
# * <tt>row</tt>: The row (indexed by field name) about to be written
#
# The compound key is the array of stringified values of the +unique+ fields. An array is used
# rather than a delimiter-joined string so that values which themselves contain the delimiter
# cannot make two different rows look like duplicates (e.g. 'x|y','z' versus 'x','y|z').
# Values are still compared by their string form.
def row_allowed?(row)
  return true unless unique

  key = unique.collect { |k| row[k].to_s }
  return false if compound_key_constraints[key]

  # remember the key so later rows with the same compound key are rejected
  compound_key_constraints[key] = 1
  true
end
|
72
|
+
|
73
|
+
# Get the hash of compound key constraints, creating it on first use. This is used to
# determine if a row can be written when the unique option is specified.
def compound_key_constraints
  @compound_key_constraints = {} unless @compound_key_constraints
  @compound_key_constraints
end
|
78
|
+
|
79
|
+
# Add any virtual fields to the row (the row is modified in place).
#
# Each entry of <tt>mapping[:virtual]</tt> is resolved according to its value:
# * Symbol -- names a generator class; one generator instance is kept per field and
#   its +next+ value is written to the row
# * Proc -- called with the row; the result is written to the row
# * anything else -- written to the row as-is
def add_virtuals!(row)
  virtuals = mapping[:virtual]
  return unless virtuals

  virtuals.each do |name, definition|
    # Engine.logger.debug "Mapping virtual #{name}/#{definition} for row #{row}"
    row[name] = case definition
                when Symbol
                  (generators[name] ||= ETL::Generator::Generator.class_for_name(definition).new).next
                when Proc
                  definition.call(row)
                else
                  definition
                end
  end
end
|
61
96
|
end
|
62
97
|
end
|
63
98
|
end
|
@@ -1,19 +1,29 @@
|
|
1
1
|
module ETL
|
2
2
|
module Control
|
3
3
|
class DatabaseDestination < Destination
|
4
|
-
attr_reader :order
|
4
|
+
attr_reader :order, :truncate
|
5
5
|
def initialize(control, configuration, mapping)
|
6
6
|
super
|
7
|
+
@truncate = configuration[:truncate] ||= false
|
8
|
+
@unique = configuration[:unique]
|
7
9
|
@order = mapping[:order] || order_from_source
|
8
10
|
raise ControlError, "Order required in mapping" unless @order
|
9
11
|
connect
|
10
12
|
end
|
11
13
|
|
12
14
|
def flush
|
13
|
-
# TODO: add virtual fields and compound key constraint support like in the FileDestination
|
14
15
|
conn = ActiveRecord::Base.connection
|
15
16
|
conn.transaction do
|
17
|
+
conn.truncate(configuration[:table]) if truncate
|
18
|
+
|
16
19
|
buffer.each do |row|
|
20
|
+
# check to see if this row's compound key constraint already exists
|
21
|
+
# note that the compound key constraint may not utilize virtual fields
|
22
|
+
next unless row_allowed?(row)
|
23
|
+
|
24
|
+
# add any virtual fields
|
25
|
+
add_virtuals!(row)
|
26
|
+
|
17
27
|
names = []
|
18
28
|
values = []
|
19
29
|
order.each do |name|
|
@@ -28,6 +38,8 @@ module ETL
|
|
28
38
|
buffer.clear
|
29
39
|
end
|
30
40
|
end
|
41
|
+
|
42
|
+
# Close the connection
|
31
43
|
def close
|
32
44
|
flush
|
33
45
|
ActiveRecord::Base.connection.disconnect!
|
@@ -3,7 +3,7 @@ module ETL #:nodoc:
|
|
3
3
|
# File as the final destination.
|
4
4
|
class FileDestination < Destination
|
5
5
|
attr_reader :file, :order
|
6
|
-
attr_accessor :append, :separator, :eol, :enclose
|
6
|
+
attr_accessor :append, :separator, :eol, :enclose
|
7
7
|
|
8
8
|
# Initialize the object.
|
9
9
|
# * <tt>control</tt>: The Control object
|
@@ -57,20 +57,6 @@ module ETL #:nodoc:
|
|
57
57
|
end
|
58
58
|
|
59
59
|
private
|
60
|
-
# Return true if the row is allowed. The row will not be allowed if the :unique option is specified
|
61
|
-
# in the configuration and the compound key already exists
|
62
|
-
def row_allowed?(row)
|
63
|
-
if unique
|
64
|
-
key = (unique.collect { |k| row[k] }).join('|')
|
65
|
-
return false if compound_key_constraints[key]
|
66
|
-
compound_key_constraints[key] = 1
|
67
|
-
end
|
68
|
-
return true
|
69
|
-
end
|
70
|
-
|
71
|
-
def compound_key_constraints
|
72
|
-
@compound_key_constraints ||= {}
|
73
|
-
end
|
74
60
|
|
75
61
|
# Get the open file stream
|
76
62
|
def f
|
@@ -82,22 +68,6 @@ module ETL #:nodoc:
|
|
82
68
|
append ? 'a' : 'w'
|
83
69
|
end
|
84
70
|
|
85
|
-
def add_virtuals!(row)
|
86
|
-
if mapping[:virtual]
|
87
|
-
mapping[:virtual].each do |key,value|
|
88
|
-
# Engine.logger.debug "Mapping virtual #{key}/#{value} for row #{row}"
|
89
|
-
case value
|
90
|
-
when Symbol
|
91
|
-
generators[key] ||= ETL::Generator::Generator.class_for_name(value).new
|
92
|
-
row[key] = generators[key].next
|
93
|
-
when Proc
|
94
|
-
row[key] = value.call(row)
|
95
|
-
else
|
96
|
-
row[key] = value
|
97
|
-
end
|
98
|
-
end
|
99
|
-
end
|
100
|
-
end
|
101
71
|
end
|
102
72
|
end
|
103
73
|
end
|
@@ -1,6 +1,7 @@
|
|
1
1
|
module ETL
|
2
2
|
module Control
|
3
3
|
class FileSource < Source
|
4
|
+
attr_accessor :skip_lines
|
4
5
|
def initialize(control, configuration, definition)
|
5
6
|
super
|
6
7
|
configure
|
@@ -13,6 +14,7 @@ module ETL
|
|
13
14
|
private
|
14
15
|
def configure
|
15
16
|
@parser = ETL::Parser::Parser.class_for_name(@configuration[:parser]).new(self)
|
17
|
+
@skip_lines = @configuration[:skip_lines] ||= 0
|
16
18
|
end
|
17
19
|
end
|
18
20
|
end
|
@@ -10,12 +10,22 @@ module ETL #:nodoc:
|
|
10
10
|
configure
|
11
11
|
end
|
12
12
|
|
13
|
-
# Returns each row
|
13
|
+
# Returns each row.
|
14
14
|
def each
|
15
15
|
options = {}
|
16
16
|
Dir.glob(file).each do |file|
|
17
|
+
ETL::Engine.logger.debug "parsing #{file}"
|
18
|
+
line = 0
|
19
|
+
lines_skipped = 0
|
17
20
|
FasterCSV.foreach(file, options) do |raw_row|
|
21
|
+
if lines_skipped < source.skip_lines
|
22
|
+
ETL::Engine.logger.debug "skipping line"
|
23
|
+
lines_skipped += 1
|
24
|
+
next
|
25
|
+
end
|
26
|
+
line += 1
|
18
27
|
row = {}
|
28
|
+
validate_row(raw_row, line, file)
|
19
29
|
raw_row.each_with_index do |record, index|
|
20
30
|
f = fields[index]
|
21
31
|
row[f.name] = convert(f.name, record, f.type)
|
@@ -31,6 +41,16 @@ module ETL #:nodoc:
|
|
31
41
|
end
|
32
42
|
|
33
43
|
private
|
44
|
+
# Check that a raw row has as many fields as the source definition.
#
# * <tt>row</tt>: The raw row as read from the source file
# * <tt>line</tt>: The line number of the row, used for error reporting
# * <tt>file</tt>: The file being parsed, used for error reporting
#
# On a mismatch the error is reported through +raise_with_info+ with a MismatchError
# (presumably raising it -- +raise_with_info+ is defined outside this block).
def validate_row(row, line, file)
  ETL::Engine.logger.debug "validating line #{line} in file #{file}"
  return if row.length == fields.length

  # the comparison is between field counts, so the message talks about fields, not rows
  raise_with_info(
    MismatchError,
    "The number of fields from the source (#{row.length}) does not match the number of fields in the definition (#{fields.length})",
    line, file
  )
end
|
53
|
+
|
34
54
|
def configure
|
35
55
|
source.definition.each do |options|
|
36
56
|
case options
|
@@ -16,7 +16,12 @@ module ETL #:nodoc:
|
|
16
16
|
Dir.glob(file).each do |file|
|
17
17
|
open(file).each do |line|
|
18
18
|
row = {}
|
19
|
+
lines_skipped = 0
|
19
20
|
fields.each do |name, f|
|
21
|
+
if lines_skipped < source.skip_lines
|
22
|
+
lines_skipped += 1
|
23
|
+
next
|
24
|
+
end
|
20
25
|
# TODO make strip optional?
|
21
26
|
row[name] = convert(name, line[f.field_start, f.field_length].strip, f.type)
|
22
27
|
end
|
data/lib/etl/parser/parser.rb
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
module ETL
|
2
2
|
module Processor
|
3
|
+
# Processor which is used to bulk import data into a target database
|
3
4
|
class BulkImportProcessor < ETL::Processor::Processor
|
4
5
|
attr_reader :file, :target, :truncate, :columns
|
5
6
|
def initialize(control, configuration)
|
@@ -16,21 +17,14 @@ module ETL
|
|
16
17
|
conn.transaction do
|
17
18
|
# TODO: Support all database types
|
18
19
|
# Since LOCAL is used this must be allowed by both the client and server
|
19
|
-
conn.
|
20
|
-
|
21
|
-
|
22
|
-
conn.
|
20
|
+
conn.truncate(target[:table]) if truncate
|
21
|
+
options = {}
|
22
|
+
options[:columns] = columns
|
23
|
+
conn.bulk_load(file, target[:table], options)
|
23
24
|
end
|
24
25
|
end
|
25
|
-
private
|
26
|
-
def log
|
27
|
-
unless @log
|
28
|
-
@log = Logger.new(STDOUT)
|
29
|
-
@log.level = Logger::DEBUG
|
30
|
-
end
|
31
|
-
@log
|
32
|
-
end
|
33
26
|
|
27
|
+
private
|
34
28
|
# Connect to the database
|
35
29
|
def connect
|
36
30
|
ActiveRecord::Base.establish_connection(
|
@@ -0,0 +1,30 @@
|
|
1
|
+
module ETL
  module Processor
    # A processor which will truncate a table. Use as a pre-processor for cleaning out a table
    # prior to loading.
    #
    # Configuration options read by this class:
    # * <tt>:table</tt>: The name of the table to truncate
    # * <tt>:target</tt>: Hash of connection settings (:adapter, :username, :host, :password,
    #   :database) -- NOTE(review): assumed to live under the :target key because +connect+
    #   reads them through +target+; confirm against the control file conventions
    # * <tt>:file</tt>: Optional file name, resolved relative to the control file
    class TruncateProcessor < ETL::Processor::Processor
      attr_reader :file, :table, :target

      # Initialize the processor and connect to the database.
      # * <tt>control</tt>: The Control object
      # * <tt>configuration</tt>: The configuration Hash
      def initialize(control, configuration)
        super
        # A file is not needed to truncate a table, so only resolve it when one is
        # configured; File.join would raise if handed nil.
        if configuration[:file]
          @file = File.join(File.dirname(control.file), configuration[:file])
        end
        @table = configuration[:table]
        # +connect+ reads its settings through +target+, so make sure it is always a Hash
        @target ||= configuration[:target] || {}
        connect
      end

      # Truncate the configured table
      def process
        conn = ActiveRecord::Base.connection
        # the adapter's truncate requires the table name
        conn.truncate(table)
      end

      # Connect to the database
      def connect
        ActiveRecord::Base.establish_connection(
          :adapter => (target[:adapter] || :mysql),
          :username => (target[:username] || 'root'),
          :host => (target[:host] || 'localhost'),
          :password => target[:password],
          :database => target[:database]
        )
      end
    end
  end
end
|
@@ -2,6 +2,9 @@ module ETL
|
|
2
2
|
module Transform
|
3
3
|
class Transform
|
4
4
|
class << self
|
5
|
+
# Transform the specified value using the given transforms. The transforms can either be
|
6
|
+
# Proc objects or objects which extend from Transform and implement the method <tt>transform(value)</tt>.
|
7
|
+
# Any other object will result in a ControlError being raised.
|
5
8
|
def transform(name, value, transforms)
|
6
9
|
# logger.debug "Transforming field #{name}" if transforms.length > 0
|
7
10
|
transforms.each do |transform|
|
@@ -20,6 +23,7 @@ module ETL
|
|
20
23
|
|
21
24
|
attr_reader :control, :configuration
|
22
25
|
|
26
|
+
# Initialize the transform object
|
23
27
|
def initialize(control, configuration={})
|
24
28
|
@control = control
|
25
29
|
@configuration = configuration
|
data/lib/etl/version.rb
CHANGED
metadata
CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.9.0
|
|
3
3
|
specification_version: 1
|
4
4
|
name: activewarehouse-etl
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version: 0.
|
7
|
-
date:
|
6
|
+
version: 0.4.0
|
7
|
+
date: 2007-01-11 00:00:00 -05:00
|
8
8
|
summary: Pure Ruby ETL package.
|
9
9
|
require_paths:
|
10
10
|
- lib
|
@@ -31,10 +31,13 @@ authors:
|
|
31
31
|
files:
|
32
32
|
- CHANGELOG
|
33
33
|
- README
|
34
|
+
- TODO
|
34
35
|
- Rakefile
|
35
36
|
- bin/etl
|
36
37
|
- lib/etl
|
37
38
|
- lib/etl.rb
|
39
|
+
- lib/etl/active_record_ext
|
40
|
+
- lib/etl/active_record_ext.rb
|
38
41
|
- lib/etl/commands
|
39
42
|
- lib/etl/control
|
40
43
|
- lib/etl/control.rb
|
@@ -48,6 +51,8 @@ files:
|
|
48
51
|
- lib/etl/transform
|
49
52
|
- lib/etl/transform.rb
|
50
53
|
- lib/etl/version.rb
|
54
|
+
- lib/etl/active_record_ext/connection_adapters
|
55
|
+
- lib/etl/active_record_ext/connection_adapters/mysql_adapter.rb
|
51
56
|
- lib/etl/commands/etl.rb
|
52
57
|
- lib/etl/control/control.rb
|
53
58
|
- lib/etl/control/destination
|
@@ -66,6 +71,7 @@ files:
|
|
66
71
|
- lib/etl/parser/xml_parser.rb
|
67
72
|
- lib/etl/processor/bulk_import_processor.rb
|
68
73
|
- lib/etl/processor/processor.rb
|
74
|
+
- lib/etl/processor/truncate_processor.rb
|
69
75
|
- lib/etl/transform/decode_transform.rb
|
70
76
|
- lib/etl/transform/sha1_transform.rb
|
71
77
|
- lib/etl/transform/transform.rb
|