activewarehouse-etl 0.3.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +8 -2
- data/Rakefile +6 -0
- data/TODO +9 -0
- data/lib/etl.rb +9 -1
- data/lib/etl/active_record_ext.rb +1 -0
- data/lib/etl/active_record_ext/connection_adapters/mysql_adapter.rb +34 -0
- data/lib/etl/control/destination.rb +36 -1
- data/lib/etl/control/destination/database_destination.rb +14 -2
- data/lib/etl/control/destination/file_destination.rb +1 -31
- data/lib/etl/control/source/file_source.rb +2 -0
- data/lib/etl/parser/delimited_parser.rb +21 -1
- data/lib/etl/parser/fixed_width_parser.rb +5 -0
- data/lib/etl/parser/parser.rb +4 -0
- data/lib/etl/processor/bulk_import_processor.rb +6 -12
- data/lib/etl/processor/processor.rb +7 -0
- data/lib/etl/processor/truncate_processor.rb +30 -0
- data/lib/etl/transform/sha1_transform.rb +1 -0
- data/lib/etl/transform/transform.rb +4 -0
- data/lib/etl/version.rb +1 -1
- metadata +8 -2
data/CHANGELOG
CHANGED
@@ -8,5 +8,11 @@
|
|
8
8
|
* Added support for generators in destinations
|
9
9
|
* Added a SurrogateKeyGenerator for cases where the database doesn't support auto generated surrogate keys
|
10
10
|
|
11
|
-
0.3.0
|
12
|
-
* Added support for calculated values in virtual fields with Proc
|
11
|
+
0.3.0 - Dec 19, 2006
|
12
|
+
* Added support for calculated values in virtual fields with Proc
|
13
|
+
|
14
|
+
0.4.0 - Jan 11, 2007
|
15
|
+
* Added :skip_lines option to file source configurations, which can be used to skip the first n lines in the source data file
|
16
|
+
* Added better error handling in delimited parser - an error is now raised if the expected and actual field lengths do not match
|
17
|
+
* Added :truncate option for database destination. Set to true to truncate before importing data.
|
18
|
+
* Added support for :unique => [] option and virtual fields for the database destination
|
data/Rakefile
CHANGED
@@ -41,6 +41,7 @@ end
|
|
41
41
|
PKG_FILES = FileList[
|
42
42
|
'CHANGELOG',
|
43
43
|
'README',
|
44
|
+
'TODO',
|
44
45
|
'Rakefile',
|
45
46
|
'bin/**/*',
|
46
47
|
'doc/**/*',
|
@@ -116,4 +117,9 @@ task :release => [ :package ] do
|
|
116
117
|
puts release_command
|
117
118
|
system(release_command)
|
118
119
|
end
|
120
|
+
end
|
121
|
+
|
122
|
+
desc "Publish the API documentation"
|
123
|
+
task :pdoc => [:rdoc] do
|
124
|
+
Rake::SshDirPublisher.new("aeden@rubyforge.org", "/var/www/gforge-projects/activewarehouse/etl/rdoc", "rdoc").upload
|
119
125
|
end
|
data/TODO
ADDED
@@ -0,0 +1,9 @@
|
|
1
|
+
TODO
|
2
|
+
|
3
|
+
* Add built-in support for audit_dimension
|
4
|
+
* Do not rerun the processing if it isn't needed, i.e. the source and control files have not been modified (allow forced override)
|
5
|
+
* Add built-in FK lookup
|
6
|
+
* Provide greater control in error handling
|
7
|
+
** Allow an error threshold
|
8
|
+
** Don't die completely on a parse error; just stop processing that specific file if the error threshold is reached
|
9
|
+
** Allow mismatch row length error in delimited parser to be ignored
|
data/lib/etl.rb
CHANGED
@@ -1,5 +1,9 @@
|
|
1
|
+
# This source file requires all of the necessary gems and source files for ActiveWarehouse ETL. If you
|
2
|
+
# load this source file, all of the other required files and gems will also be brought into the
|
3
|
+
# runtime.
|
4
|
+
|
1
5
|
#--
|
2
|
-
# Copyright (c) 2006 Anthony Eden
|
6
|
+
# Copyright (c) 2006-2007 Anthony Eden
|
3
7
|
#
|
4
8
|
# Permission is hereby granted, free of charge, to any person obtaining
|
5
9
|
# a copy of this software and associated documentation files (the
|
@@ -49,6 +53,8 @@ require 'etl/transform'
|
|
49
53
|
require 'etl/processor'
|
50
54
|
require 'etl/generator'
|
51
55
|
|
56
|
+
require 'etl/active_record_ext'
|
57
|
+
|
52
58
|
module ETL #:nodoc:
|
53
59
|
class ETLError < StandardError #:nodoc:
|
54
60
|
end
|
@@ -56,4 +62,6 @@ module ETL #:nodoc:
|
|
56
62
|
end
|
57
63
|
class DefinitionError < ControlError #:nodoc:
|
58
64
|
end
|
65
|
+
class MismatchError < ETLError #:nodoc:
|
66
|
+
end
|
59
67
|
end
|
@@ -0,0 +1 @@
|
|
1
|
+
require 'etl/active_record_ext/connection_adapters/mysql_adapter'
|
@@ -0,0 +1,34 @@
|
|
1
|
+
require 'active_record/connection_adapters/abstract_adapter'
|
2
|
+
|
3
|
+
module ActiveRecord
|
4
|
+
module ConnectionAdapters
|
5
|
+
class MysqlAdapter < AbstractAdapter
|
6
|
+
# Execute a truncate statement on the table. Note that in MySQL a truncate will *NOT* reset
|
7
|
+
# the auto_increment
|
8
|
+
def truncate(table_name)
|
9
|
+
execute("TRUNCATE #{table_name}")
|
10
|
+
end
|
11
|
+
|
12
|
+
# Bulk load the data in the specified file. This implementation always uses the LOCAL keyword
|
13
|
+
# so the file must be found locally, not on the remote server, to be loaded.
|
14
|
+
#
|
15
|
+
# Options:
|
16
|
+
# * <tt>:ignore</tt> -- Ignore the specified number of lines from the source file
|
17
|
+
# * <tt>:columns</tt> -- Array of column names defining the source file column order
|
18
|
+
# * <tt>:fields</tt> -- Hash of options for fields:
|
19
|
+
# ** <tt>:delimited_by</tt> -- The field delimiter
|
20
|
+
# ** <tt>:enclosed_by</tt> -- The field enclosure
|
21
|
+
def bulk_load(file, table_name, options={})
|
22
|
+
q = "LOAD DATA LOCAL INFILE '#{file}' INTO TABLE #{table_name}"
|
23
|
+
if options[:fields]
|
24
|
+
q << " FIELDS"
|
25
|
+
q << " TERMINATED BY '#{options[:fields][:delimited_by]}'" if options[:fields][:delimited_by]
|
26
|
+
q << " ENCLOSED BY '#{options[:fields][:enclosed_by]}'" if options[:fields][:enclosed_by]
|
27
|
+
end
|
28
|
+
q << " IGNORE #{options[:ignore]} LINES" if options[:ignore]
|
29
|
+
q << " (#{options[:columns].join(',')})" if options[:columns]
|
30
|
+
execute(q)
|
31
|
+
end
|
32
|
+
end
|
33
|
+
end
|
34
|
+
end
|
@@ -2,7 +2,7 @@ module ETL #:nodoc:
|
|
2
2
|
module Control #:nodoc:
|
3
3
|
class Destination
|
4
4
|
attr_reader :control, :configuration, :mapping
|
5
|
-
attr_accessor :buffer_size, :current_row
|
5
|
+
attr_accessor :buffer_size, :current_row, :unique
|
6
6
|
|
7
7
|
class << self
|
8
8
|
def class_for_name(name)
|
@@ -58,6 +58,41 @@ module ETL #:nodoc:
|
|
58
58
|
end
|
59
59
|
order
|
60
60
|
end
|
61
|
+
|
62
|
+
# Return true if the row is allowed. The row will not be allowed if the :unique option is specified
|
63
|
+
# in the configuration and the compound key already exists
|
64
|
+
def row_allowed?(row)
|
65
|
+
if unique
|
66
|
+
key = (unique.collect { |k| row[k] }).join('|')
|
67
|
+
return false if compound_key_constraints[key]
|
68
|
+
compound_key_constraints[key] = 1
|
69
|
+
end
|
70
|
+
return true
|
71
|
+
end
|
72
|
+
|
73
|
+
# Get a hash of compound key constraints. This is used to determine if a row can be written when the
|
74
|
+
# unique option is specified
|
75
|
+
def compound_key_constraints
|
76
|
+
@compound_key_constraints ||= {}
|
77
|
+
end
|
78
|
+
|
79
|
+
# Add any virtual fields to the row
|
80
|
+
def add_virtuals!(row)
|
81
|
+
if mapping[:virtual]
|
82
|
+
mapping[:virtual].each do |key,value|
|
83
|
+
# Engine.logger.debug "Mapping virtual #{key}/#{value} for row #{row}"
|
84
|
+
case value
|
85
|
+
when Symbol
|
86
|
+
generators[key] ||= ETL::Generator::Generator.class_for_name(value).new
|
87
|
+
row[key] = generators[key].next
|
88
|
+
when Proc
|
89
|
+
row[key] = value.call(row)
|
90
|
+
else
|
91
|
+
row[key] = value
|
92
|
+
end
|
93
|
+
end
|
94
|
+
end
|
95
|
+
end
|
61
96
|
end
|
62
97
|
end
|
63
98
|
end
|
@@ -1,19 +1,29 @@
|
|
1
1
|
module ETL
|
2
2
|
module Control
|
3
3
|
class DatabaseDestination < Destination
|
4
|
-
attr_reader :order
|
4
|
+
attr_reader :order, :truncate
|
5
5
|
def initialize(control, configuration, mapping)
|
6
6
|
super
|
7
|
+
@truncate = configuration[:truncate] ||= false
|
8
|
+
@unique = configuration[:unique]
|
7
9
|
@order = mapping[:order] || order_from_source
|
8
10
|
raise ControlError, "Order required in mapping" unless @order
|
9
11
|
connect
|
10
12
|
end
|
11
13
|
|
12
14
|
def flush
|
13
|
-
# TODO: add virtual fields and compound key constraint support like in the FileDestination
|
14
15
|
conn = ActiveRecord::Base.connection
|
15
16
|
conn.transaction do
|
17
|
+
conn.truncate(configuration[:table]) if truncate
|
18
|
+
|
16
19
|
buffer.each do |row|
|
20
|
+
# check to see if this row's compound key constraint already exists
|
21
|
+
# note that the compound key constraint may not utilize virtual fields
|
22
|
+
next unless row_allowed?(row)
|
23
|
+
|
24
|
+
# add any virtual fields
|
25
|
+
add_virtuals!(row)
|
26
|
+
|
17
27
|
names = []
|
18
28
|
values = []
|
19
29
|
order.each do |name|
|
@@ -28,6 +38,8 @@ module ETL
|
|
28
38
|
buffer.clear
|
29
39
|
end
|
30
40
|
end
|
41
|
+
|
42
|
+
# Close the connection
|
31
43
|
def close
|
32
44
|
flush
|
33
45
|
ActiveRecord::Base.connection.disconnect!
|
@@ -3,7 +3,7 @@ module ETL #:nodoc:
|
|
3
3
|
# File as the final destination.
|
4
4
|
class FileDestination < Destination
|
5
5
|
attr_reader :file, :order
|
6
|
-
attr_accessor :append, :separator, :eol, :enclose
|
6
|
+
attr_accessor :append, :separator, :eol, :enclose
|
7
7
|
|
8
8
|
# Initialize the object.
|
9
9
|
# * <tt>control</tt>: The Control object
|
@@ -57,20 +57,6 @@ module ETL #:nodoc:
|
|
57
57
|
end
|
58
58
|
|
59
59
|
private
|
60
|
-
# Return true if the row is allowed. The row will not be allowed if the :unique option is specified
|
61
|
-
# in the configuration and the compound key already exists
|
62
|
-
def row_allowed?(row)
|
63
|
-
if unique
|
64
|
-
key = (unique.collect { |k| row[k] }).join('|')
|
65
|
-
return false if compound_key_constraints[key]
|
66
|
-
compound_key_constraints[key] = 1
|
67
|
-
end
|
68
|
-
return true
|
69
|
-
end
|
70
|
-
|
71
|
-
def compound_key_constraints
|
72
|
-
@compound_key_constraints ||= {}
|
73
|
-
end
|
74
60
|
|
75
61
|
# Get the open file stream
|
76
62
|
def f
|
@@ -82,22 +68,6 @@ module ETL #:nodoc:
|
|
82
68
|
append ? 'a' : 'w'
|
83
69
|
end
|
84
70
|
|
85
|
-
def add_virtuals!(row)
|
86
|
-
if mapping[:virtual]
|
87
|
-
mapping[:virtual].each do |key,value|
|
88
|
-
# Engine.logger.debug "Mapping virtual #{key}/#{value} for row #{row}"
|
89
|
-
case value
|
90
|
-
when Symbol
|
91
|
-
generators[key] ||= ETL::Generator::Generator.class_for_name(value).new
|
92
|
-
row[key] = generators[key].next
|
93
|
-
when Proc
|
94
|
-
row[key] = value.call(row)
|
95
|
-
else
|
96
|
-
row[key] = value
|
97
|
-
end
|
98
|
-
end
|
99
|
-
end
|
100
|
-
end
|
101
71
|
end
|
102
72
|
end
|
103
73
|
end
|
@@ -1,6 +1,7 @@
|
|
1
1
|
module ETL
|
2
2
|
module Control
|
3
3
|
class FileSource < Source
|
4
|
+
attr_accessor :skip_lines
|
4
5
|
def initialize(control, configuration, definition)
|
5
6
|
super
|
6
7
|
configure
|
@@ -13,6 +14,7 @@ module ETL
|
|
13
14
|
private
|
14
15
|
def configure
|
15
16
|
@parser = ETL::Parser::Parser.class_for_name(@configuration[:parser]).new(self)
|
17
|
+
@skip_lines = @configuration[:skip_lines] ||= 0
|
16
18
|
end
|
17
19
|
end
|
18
20
|
end
|
@@ -10,12 +10,22 @@ module ETL #:nodoc:
|
|
10
10
|
configure
|
11
11
|
end
|
12
12
|
|
13
|
-
# Returns each row
|
13
|
+
# Returns each row.
|
14
14
|
def each
|
15
15
|
options = {}
|
16
16
|
Dir.glob(file).each do |file|
|
17
|
+
ETL::Engine.logger.debug "parsing #{file}"
|
18
|
+
line = 0
|
19
|
+
lines_skipped = 0
|
17
20
|
FasterCSV.foreach(file, options) do |raw_row|
|
21
|
+
if lines_skipped < source.skip_lines
|
22
|
+
ETL::Engine.logger.debug "skipping line"
|
23
|
+
lines_skipped += 1
|
24
|
+
next
|
25
|
+
end
|
26
|
+
line += 1
|
18
27
|
row = {}
|
28
|
+
validate_row(raw_row, line, file)
|
19
29
|
raw_row.each_with_index do |record, index|
|
20
30
|
f = fields[index]
|
21
31
|
row[f.name] = convert(f.name, record, f.type)
|
@@ -31,6 +41,16 @@ module ETL #:nodoc:
|
|
31
41
|
end
|
32
42
|
|
33
43
|
private
|
44
|
+
def validate_row(row, line, file)
|
45
|
+
ETL::Engine.logger.debug "validing line #{line} in file #{file}"
|
46
|
+
if row.length != fields.length
|
47
|
+
raise_with_info( MismatchError,
|
48
|
+
"The number of rows from the source (#{row.length}) does not match the number of rows in the definition (#{fields.length})",
|
49
|
+
line, file
|
50
|
+
)
|
51
|
+
end
|
52
|
+
end
|
53
|
+
|
34
54
|
def configure
|
35
55
|
source.definition.each do |options|
|
36
56
|
case options
|
@@ -16,7 +16,12 @@ module ETL #:nodoc:
|
|
16
16
|
Dir.glob(file).each do |file|
|
17
17
|
open(file).each do |line|
|
18
18
|
row = {}
|
19
|
+
lines_skipped = 0
|
19
20
|
fields.each do |name, f|
|
21
|
+
if lines_skipped < source.skip_lines
|
22
|
+
lines_skipped += 1
|
23
|
+
next
|
24
|
+
end
|
20
25
|
# TODO make strip optional?
|
21
26
|
row[name] = convert(name, line[f.field_start, f.field_length].strip, f.type)
|
22
27
|
end
|
data/lib/etl/parser/parser.rb
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
module ETL
|
2
2
|
module Processor
|
3
|
+
# Processor which is used to bulk import data into a target database
|
3
4
|
class BulkImportProcessor < ETL::Processor::Processor
|
4
5
|
attr_reader :file, :target, :truncate, :columns
|
5
6
|
def initialize(control, configuration)
|
@@ -16,21 +17,14 @@ module ETL
|
|
16
17
|
conn.transaction do
|
17
18
|
# TODO: Support all database types
|
18
19
|
# Since LOCAL is used this must be allowed by both the client and server
|
19
|
-
conn.
|
20
|
-
|
21
|
-
|
22
|
-
conn.
|
20
|
+
conn.truncate(target[:table]) if truncate
|
21
|
+
options = {}
|
22
|
+
options[:columns] = columns
|
23
|
+
conn.bulk_load(file, target[:table], options)
|
23
24
|
end
|
24
25
|
end
|
25
|
-
private
|
26
|
-
def log
|
27
|
-
unless @log
|
28
|
-
@log = Logger.new(STDOUT)
|
29
|
-
@log.level = Logger::DEBUG
|
30
|
-
end
|
31
|
-
@log
|
32
|
-
end
|
33
26
|
|
27
|
+
private
|
34
28
|
# Connect to the database
|
35
29
|
def connect
|
36
30
|
ActiveRecord::Base.establish_connection(
|
@@ -0,0 +1,30 @@
|
|
1
|
+
module ETL
|
2
|
+
module Processor
|
3
|
+
# A processor which will truncate a table. Use as a pre-processor for cleaning out a table
|
4
|
+
# prior to loading
|
5
|
+
class TruncateProcessor < ETL::Processor::Processor
|
6
|
+
attr_reader :file, :table
|
7
|
+
def initialize(control, configuration)
|
8
|
+
super
|
9
|
+
@file = File.join(File.dirname(control.file), configuration[:file])
|
10
|
+
@table = configuration[:table]
|
11
|
+
connect
|
12
|
+
end
|
13
|
+
def process
|
14
|
+
conn = ActiveRecord::Base.connection
|
15
|
+
conn.truncate
|
16
|
+
end
|
17
|
+
|
18
|
+
# Connect to the database
|
19
|
+
def connect
|
20
|
+
ActiveRecord::Base.establish_connection(
|
21
|
+
:adapter => (target[:adapter] || :mysql),
|
22
|
+
:username => (target[:username] || 'root'),
|
23
|
+
:host => (target[:host] || 'localhost'),
|
24
|
+
:password => target[:password],
|
25
|
+
:database => target[:database]
|
26
|
+
)
|
27
|
+
end
|
28
|
+
end
|
29
|
+
end
|
30
|
+
end
|
@@ -2,6 +2,9 @@ module ETL
|
|
2
2
|
module Transform
|
3
3
|
class Transform
|
4
4
|
class << self
|
5
|
+
# Transform the specified value using the given transforms. The transforms can either be
|
6
|
+
# Proc objects or objects which extend from Transform and implement the method <tt>transform(value)</tt>.
|
7
|
+
# Any other object will result in a ControlError being raised.
|
5
8
|
def transform(name, value, transforms)
|
6
9
|
# logger.debug "Transforming field #{name}" if transforms.length > 0
|
7
10
|
transforms.each do |transform|
|
@@ -20,6 +23,7 @@ module ETL
|
|
20
23
|
|
21
24
|
attr_reader :control, :configuration
|
22
25
|
|
26
|
+
# Initialize the transform object
|
23
27
|
def initialize(control, configuration={})
|
24
28
|
@control = control
|
25
29
|
@configuration = configuration
|
data/lib/etl/version.rb
CHANGED
metadata
CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.9.0
|
|
3
3
|
specification_version: 1
|
4
4
|
name: activewarehouse-etl
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version: 0.
|
7
|
-
date:
|
6
|
+
version: 0.4.0
|
7
|
+
date: 2007-01-11 00:00:00 -05:00
|
8
8
|
summary: Pure Ruby ETL package.
|
9
9
|
require_paths:
|
10
10
|
- lib
|
@@ -31,10 +31,13 @@ authors:
|
|
31
31
|
files:
|
32
32
|
- CHANGELOG
|
33
33
|
- README
|
34
|
+
- TODO
|
34
35
|
- Rakefile
|
35
36
|
- bin/etl
|
36
37
|
- lib/etl
|
37
38
|
- lib/etl.rb
|
39
|
+
- lib/etl/active_record_ext
|
40
|
+
- lib/etl/active_record_ext.rb
|
38
41
|
- lib/etl/commands
|
39
42
|
- lib/etl/control
|
40
43
|
- lib/etl/control.rb
|
@@ -48,6 +51,8 @@ files:
|
|
48
51
|
- lib/etl/transform
|
49
52
|
- lib/etl/transform.rb
|
50
53
|
- lib/etl/version.rb
|
54
|
+
- lib/etl/active_record_ext/connection_adapters
|
55
|
+
- lib/etl/active_record_ext/connection_adapters/mysql_adapter.rb
|
51
56
|
- lib/etl/commands/etl.rb
|
52
57
|
- lib/etl/control/control.rb
|
53
58
|
- lib/etl/control/destination
|
@@ -66,6 +71,7 @@ files:
|
|
66
71
|
- lib/etl/parser/xml_parser.rb
|
67
72
|
- lib/etl/processor/bulk_import_processor.rb
|
68
73
|
- lib/etl/processor/processor.rb
|
74
|
+
- lib/etl/processor/truncate_processor.rb
|
69
75
|
- lib/etl/transform/decode_transform.rb
|
70
76
|
- lib/etl/transform/sha1_transform.rb
|
71
77
|
- lib/etl/transform/transform.rb
|