activewarehouse-etl 0.5.2 → 0.6.0
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +41 -13
- data/README +1 -1
- data/Rakefile +14 -4
- data/TODO +17 -1
- data/bin/etl +3 -1
- data/lib/etl.rb +11 -7
- data/lib/etl/commands/etl.rb +0 -1
- data/lib/etl/control/control.rb +113 -36
- data/lib/etl/control/destination.rb +13 -1
- data/lib/etl/control/destination/database_destination.rb +3 -1
- data/lib/etl/control/destination/file_destination.rb +5 -2
- data/lib/etl/control/source.rb +36 -0
- data/lib/etl/control/source/database_source.rb +63 -8
- data/lib/etl/control/source/file_source.rb +25 -4
- data/lib/etl/engine.rb +128 -14
- data/lib/etl/generator/surrogate_key_generator.rb +1 -0
- data/lib/etl/http_tools.rb +119 -0
- data/lib/etl/parser/apache_combined_log_parser.rb +47 -0
- data/lib/etl/parser/sax_parser.rb +18 -6
- data/lib/etl/processor.rb +1 -0
- data/lib/etl/processor/bulk_import_processor.rb +12 -0
- data/lib/etl/processor/hierarchy_exploder_processor.rb +54 -0
- data/lib/etl/processor/processor.rb +1 -5
- data/lib/etl/processor/row_processor.rb +17 -0
- data/lib/etl/transform/date_to_string_transform.rb +1 -1
- data/lib/etl/transform/decode_transform.rb +1 -1
- data/lib/etl/transform/default_transform.rb +15 -0
- data/lib/etl/transform/foreign_key_lookup_transform.rb +1 -1
- data/lib/etl/transform/hierarchy_lookup_transform.rb +56 -0
- data/lib/etl/transform/sha1_transform.rb +1 -1
- data/lib/etl/transform/string_to_date_transform.rb +3 -3
- data/lib/etl/transform/string_to_datetime_transform.rb +17 -0
- data/lib/etl/transform/string_to_time_transform.rb +14 -0
- data/lib/etl/transform/transform.rb +8 -4
- data/lib/etl/transform/type_transform.rb +2 -2
- data/lib/etl/version.rb +2 -2
- metadata +21 -8
- data/lib/etl/active_record_ext.rb +0 -1
- data/lib/etl/active_record_ext/connection_adapters/mysql_adapter.rb +0 -34
@@ -0,0 +1,119 @@
|
|
1
|
+
require 'uri'
|
2
|
+
|
3
|
+
# Module which has utility methods for HTTP.
#
# The methods are plain instance methods, so the module is meant to be mixed
# into a class (or object) that needs them.
module HttpTools
  # Parse the given user agent string.
  #
  # Code adapted from http://gemtacular.com/gems/ParseUserAgent
  #
  # Returns a Hash with the keys <tt>:browser</tt>, <tt>:browser_version_major</tt>,
  # <tt>:browser_version_minor</tt>, <tt>:ostype</tt>, <tt>:os_version</tt> and <tt>:os</tt>.
  # Every value is either a stripped String or nil when it could not be
  # determined (or was blank).
  #
  # Raises a RuntimeError ('Invalid User Agent') if +user_agent+ is nil or '-'.
  def parse_user_agent(user_agent)
    raise 'Invalid User Agent' if user_agent.nil? || '-' == user_agent

    browser = browser_version_major = browser_version_minor = nil
    ostype = os = os_version = nil

    # Fix Opera: it may identify itself as "Opera 8.50" (space instead of a
    # slash). Rewrite that to "Opera/8.50" so it is recognised as an
    # Agent/version token below.
    user_agent = user_agent.gsub(/Opera (\d)/i, 'Opera/\1')

    # Grab all Agent/version strings as 'agents': every whitespace separated
    # token that contains a slash.
    agents = user_agent.split(/\s+/).select { |token| token.include?('/') }

    # Cycle through the agents to set browser and version. The last agent in
    # the string wins (MSIE is set later, from the properties).
    agents.each do |agent|
      parts = agent.split('/')
      browser = parts[0]
      # A token such as "Foo/" has no version part; treat it as empty rather
      # than failing on nil.
      version = parts[1].to_s
      if browser == 'Firefox'
        browser_version_major = version.slice(0, 3)
        browser_version_minor = version.sub(browser_version_major, '').sub('.', '')
      elsif browser == 'Safari'
        # Numbers below 400 are reported as major version 1, everything else
        # as major version 2.
        browser_version_major = version.slice(0, 3).to_f < 400 ? '1' : '2'
      else
        browser_version_major = version.slice(0, 1)
      end
    end

    # Grab all of the properties (within parens) by removing every token that
    # appears outside of the parentheses from the full string.
    detail = user_agent
    user_agent.gsub(/\((.*)\)/, '').split(/\s/).each { |part| detail = detail.gsub(part, '') }
    detail = detail.gsub('(', '').gsub(')', '').lstrip
    properties = detail.split(/;\s+/)

    # Cycle through the properties to set known quantities.
    properties.each do |property|
      if property =~ /^Win/
        ostype = 'Windows'
        os = property
        # "Windows NT 5.1" => "NT 5.1" => "5.1"
        nt = property.split(/ /, 2)[1]
        if nt && nt =~ /^NT/
          nt_version = nt.split(/ /, 2)[1]
          os_version = case nt_version
                       when '5' then '2000'
                       when '5.1' then 'XP'
                       else nt_version
                       end
        end
      end
      if property == 'Macintosh'
        ostype = 'Macintosh'
        os = property
      end
      if property =~ /OS X/
        ostype = 'Macintosh'
        os_version = 'OS X'
        os = property
      end
      if property =~ /^Linux/
        ostype = 'Linux'
        os = property
      end
      if property =~ /^MSIE/
        browser = 'MSIE'
        browser_version = property.gsub('MSIE ', '').lstrip
        browser_version_major, browser_version_minor = browser_version.split('.')
      end
    end

    result = {
      :browser => browser,
      :browser_version_major => browser_version_major,
      :browser_version_minor => browser_version_minor,
      :ostype => ostype,
      :os_version => os_version,
      :os => os,
    }
    # Normalise: strip surrounding whitespace and turn blank values into nil.
    # Done with core String methods only, so no extension library is needed.
    result.each do |key, value|
      stripped = value.to_s.strip
      result[key] = stripped.empty? ? nil : stripped
    end
    result
  end

  # Split +uri_string+ into its components.
  #
  # Returns a Hash with the keys <tt>:scheme</tt>, <tt>:host</tt>, <tt>:port</tt>,
  # <tt>:uri_path</tt> and <tt>:domain</tt>. <tt>:domain</tt> is the last two
  # dot-separated labels of the host (e.g. "example.com" for "www.example.com")
  # and is nil when the host is missing or has no dot. All values are nil when
  # +uri_string+ is nil.
  #
  # Errors raised by URI.parse for malformed input are not rescued.
  def parse_uri(uri_string)
    unless uri_string
      return {:scheme => nil, :host => nil, :port => nil, :uri_path => nil, :domain => nil}
    end

    uri = URI.parse(uri_string)
    {
      :scheme => uri.scheme,
      :host => uri.host,
      :port => uri.port,
      :uri_path => uri.path,
      # Always present, so both branches return the same set of keys.
      :domain => (uri.host && uri.host[/\.?([^\.]+\.[^\.]+$)/, 1]),
    }
  end
end
|
@@ -0,0 +1,47 @@
|
|
1
|
+
require 'date'

module ETL #:nodoc:
  module Parser #:nodoc:
    # Parser which can parse the Apache Combined Log Format as defined at
    # http://httpd.apache.org/docs/2.2/logs.html
    class ApacheCombinedLogParser < ETL::Parser::Parser
      include HttpTools

      # Matches one log line. Captures, in order: ip address, identd, user,
      # timestamp, request, response code, bytes, referrer and user agent.
      LINE_PATTERN = /^(\S+)\s(\S+)\s(\S+)\s\[([^\]]*)\]\s"([^"]*)"\s(\d*)\s(\d*)\s"([^"]*)"\s"([^"]*)"$/

      # Initialize the parser; all arguments are passed on to the superclass.
      def initialize(source, options={})
        super
      end

      # Yield one parsed row (see #parse) for every line of every file that
      # matches the +file+ glob pattern.
      def each
        Dir.glob(file).each do |path|
          # File.foreach closes the file once iteration finishes (or fails),
          # so no handle is left open per log file.
          File.foreach(path) { |line| yield parse(line) }
        end
      end

      # Parse a single log line into a Hash of fields.
      #
      # The Hash contains <tt>:ip_address</tt>, <tt>:identd</tt>, <tt>:user</tt>,
      # <tt>:timestamp</tt> (a Time), <tt>:request</tt>, <tt>:response_code</tt>,
      # <tt>:bytes</tt>, <tt>:referrer</tt> and <tt>:user_agent</tt>, merged with
      # the results of parse_user_agent and parse_uri (for the referrer).
      # Any value equal to '-' is replaced with nil.
      #
      # Raises a RuntimeError if the line does not match the expected format.
      # Errors raised by parse_user_agent and parse_uri are not rescued.
      def parse(line)
        # example line: 127.0.0.1 - frank [10/Oct/2000:13:55:36 -0700] "GET /apache_pb.gif HTTP/1.0" 200 2326 "http://www.example.com/start.html" "Mozilla/4.08 [en] (Win98; I ;Nav)"
        match = LINE_PATTERN.match(line)
        unless match
          raise "Unable to parse line as Apache Combined Log Format: #{line.inspect}"
        end

        fields = {
          :ip_address => match[1],
          :identd => match[2],
          :user => match[3],
          :timestamp => match[4],
          :request => match[5],
          :response_code => match[6],
          :bytes => match[7],
          :referrer => match[8],
          :user_agent => match[9],
        }

        # The format string stops after the seconds, so the UTC offset in the
        # log entry is not interpreted; Time.mktime builds a local time.
        d = Date._strptime(fields[:timestamp], '%d/%b/%Y:%H:%M:%S')
        fields[:timestamp] = Time.mktime(d[:year], d[:mon], d[:mday], d[:hour], d[:min], d[:sec], d[:sec_fraction])

        fields.merge!(parse_user_agent(fields[:user_agent]))
        fields.merge!(parse_uri(fields[:referrer]))

        # '-' is the log format's placeholder for "no value".
        fields.each do |key, value|
          fields[key] = nil if value == '-'
        end
        fields
      end

    end
  end
end
|
@@ -1,8 +1,9 @@
|
|
1
1
|
require 'rexml/parsers/sax2parser'
|
2
2
|
require 'rexml/sax2listener'
|
3
3
|
|
4
|
-
module ETL
|
5
|
-
module Parser
|
4
|
+
module ETL #:nodoc:
|
5
|
+
module Parser #:nodoc:
|
6
|
+
# ETL parser implementation which uses SAX to parse XML files.
|
6
7
|
class SaxParser < ETL::Parser::Parser
|
7
8
|
|
8
9
|
# The write trigger causes whatever values are currently specified for the row to be returned.
|
@@ -77,7 +78,19 @@ module ETL
|
|
77
78
|
|
78
79
|
end
|
79
80
|
def start_element(uri, localname, qname, attributes)
|
80
|
-
|
81
|
+
element = XPath::Element.new(localname, attributes)
|
82
|
+
@path.elements << element
|
83
|
+
|
84
|
+
@parser.fields.each do |field|
|
85
|
+
#puts "#{@path} match? #{field.path}"
|
86
|
+
if @path.match?(field.path)
|
87
|
+
#puts "field.path: #{field.path}"
|
88
|
+
if field.path.is_attribute?
|
89
|
+
#puts "setting @row[#{field.name}] to #{element.attributes[field.path.attribute]}"
|
90
|
+
@row[field.name] = element.attributes[field.path.attribute]
|
91
|
+
end
|
92
|
+
end
|
93
|
+
end
|
81
94
|
end
|
82
95
|
def end_element(uri, localname, qname)
|
83
96
|
element = @path.elements.last
|
@@ -86,13 +99,12 @@ module ETL
|
|
86
99
|
#puts "#{@path} match? #{field.path}"
|
87
100
|
if @path.match?(field.path)
|
88
101
|
#puts "field.path: #{field.path}"
|
89
|
-
if field.path.is_attribute?
|
90
|
-
@row[field.name] = element.attributes[field.path.attribute]
|
91
|
-
else
|
102
|
+
if !field.path.is_attribute?
|
92
103
|
@row[field.name] = @value
|
93
104
|
end
|
94
105
|
end
|
95
106
|
end
|
107
|
+
|
96
108
|
#puts @path.to_s
|
97
109
|
if @path.match?(@parser.write_trigger)
|
98
110
|
#puts "matched: #{@path} =~ #{@parser.write_trigger}"
|
data/lib/etl/processor.rb
CHANGED
@@ -3,12 +3,18 @@ module ETL #:nodoc:
|
|
3
3
|
# Processor which is used to bulk import data into a target database
|
4
4
|
class BulkImportProcessor < ETL::Processor::Processor
|
5
5
|
attr_reader :file, :target, :truncate, :columns
|
6
|
+
attr_accessor :field_separator
|
7
|
+
attr_accessor :field_enclosure
|
8
|
+
attr_accessor :line_separator
|
6
9
|
def initialize(control, configuration)
|
7
10
|
super
|
8
11
|
@file = File.join(File.dirname(control.file), configuration[:file])
|
9
12
|
@target = configuration[:target]
|
10
13
|
@truncate = configuration[:truncate] ||= false
|
11
14
|
@columns = configuration[:columns]
|
15
|
+
@field_separator = (configuration[:field_separator] || ',')
|
16
|
+
@line_separator = configuration[:line_separator]
|
17
|
+
@field_enclosure = configuration[:field_enclosure]
|
12
18
|
connect
|
13
19
|
end
|
14
20
|
def process
|
@@ -20,6 +26,11 @@ module ETL #:nodoc:
|
|
20
26
|
conn.truncate(target[:table]) if truncate
|
21
27
|
options = {}
|
22
28
|
options[:columns] = columns
|
29
|
+
if field_separator || field_enclosure
|
30
|
+
options[:fields] = {}
|
31
|
+
options[:fields][:delimited_by] = field_separator if field_separator
|
32
|
+
options[:fields][:enclosed_by] = field_enclosure if field_enclosure
|
33
|
+
end
|
23
34
|
conn.bulk_load(file, target[:table], options)
|
24
35
|
end
|
25
36
|
end
|
@@ -27,6 +38,7 @@ module ETL #:nodoc:
|
|
27
38
|
private
|
28
39
|
# Connect to the database
|
29
40
|
def connect
|
41
|
+
Engine.logger.debug "Connecting to database #{target[:database]}"
|
30
42
|
ETL::ActiveRecord::Base.establish_connection(
|
31
43
|
:adapter => (target[:adapter] || :mysql),
|
32
44
|
:username => (target[:username] || 'root'),
|
@@ -0,0 +1,54 @@
|
|
1
|
+
module ETL #:nodoc:
  module Processor #:nodoc:
    # Row-level processor which explodes one row into the set of rows needed
    # for a hierarchy bridge table: one row for the node itself plus one row
    # for each of its descendants.
    class HierarchyExploderProcessor < ETL::Processor::RowProcessor
      attr_accessor :id_field, :parent_id_field

      # Initialize the processor
      #
      # Configuration options:
      # * <tt>:connection</tt>: The ActiveRecord adapter connection
      # * <tt>:table</tt>: The name of the table which is queried for children
      # * <tt>:id_field</tt>: The name of the id field (defaults to 'id')
      # * <tt>:parent_id_field</tt>: The name of the parent id field (defaults to 'parent_id')
      #
      # TODO: Allow resolver to be implemented in a customizable fashion, i.e. don't rely
      # on AR as the only resolution method.
      def initialize(control, configuration={})
        @id_field = configuration[:id_field] || 'id'
        @parent_id_field = configuration[:parent_id_field] || 'parent_id'
        super
      end

      # Expand the row into an Array of bridge rows (Hashes) and return it.
      #
      # NOTE(review): the row is always read through :id and :parent_id; the
      # configured id_field / parent_id_field are only used when building the
      # SQL in build_rows -- confirm that this is intended.
      def process(row)
        exploded = []
        root_id = row[:id]
        is_root = row[:parent_id].nil?
        build_rows([root_id], root_id, root_id, is_root, 0, exploded,
          configuration[:table], configuration[:connection])
        exploded
      end

      protected
      # Append one bridge row to +rows+ for every id in +ids+ and then recurse
      # into the children of that id, one level further down.
      #
      # +row_id+ is written as :parent_id of every generated row, +level+ as
      # :num_levels_from_parent and +root+ decides the :is_top flag. The
      # +parent_id+ argument is not read by this method.
      def build_rows(ids, parent_id, row_id, root, level, rows, table, conn)
        top_flag = root ? 1 : 0
        ids.each do |id|
          child_ids = conn.select_values("SELECT #{id_field} FROM #{table} WHERE #{parent_id_field} = #{id}")
          bottom_flag = child_ids.empty? ? 1 : 0

          rows << {
            :parent_id => row_id,
            :child_id => id,
            :num_levels_from_parent => level,
            :is_bottom => bottom_flag,
            :is_top => top_flag,
          }

          # Descendants are never the top of the hierarchy.
          build_rows(child_ids, id, row_id, false, level + 1, rows, table, conn)
        end
      end
    end
  end
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
module ETL #:nodoc:
  module Processor #:nodoc:
    # Processor which processes a specific row. Unlike a transformer, which deals with a specific
    # value in the row, row processors can process an entire row at once, which can be used to
    # explode a single row into multiple rows (for example)
    class RowProcessor < Processor
      # Initialize the processor; both arguments are passed on to the superclass.
      def initialize(control, configuration)
        super
      end

      # Process the specified row. Subclasses must override this method; the
      # implementation here always raises a RuntimeError.
      def process(row)
        # The message names the method which actually has to be overridden.
        raise "process is an abstract method"
      end
    end
  end
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
module ETL #:nodoc:
  module Transform #:nodoc:
    # Transform which substitutes a configured default for blank values.
    class DefaultTransform < Transform
      # The value returned in place of a blank value
      attr_accessor :default_value

      # Initialize the transform
      #
      # Configuration options:
      # * <tt>:default_value</tt>: The value to use when the incoming value is blank
      def initialize(control, configuration)
        super
        @default_value = configuration[:default_value]
      end

      # Return +value+ unchanged unless it is blank, in which case the
      # default value is returned. +name+ and +row+ are not used.
      def transform(name, value, row)
        return value unless value.blank?
        default_value
      end
    end
  end
end
|
@@ -21,7 +21,7 @@ module ETL #:nodoc:
|
|
21
21
|
end
|
22
22
|
|
23
23
|
# Transform the value by resolving it to a foreign key
|
24
|
-
def transform(value)
|
24
|
+
def transform(name, value, row)
|
25
25
|
fk = @collection[value]
|
26
26
|
unless fk
|
27
27
|
raise ResolverError, "Foreign key for #{value} not found and no resolver specified" unless resolver
|
@@ -0,0 +1,56 @@
|
|
1
|
+
module ETL #:nodoc:
  module Transform #:nodoc:
    # Transform which walks up the hierarchy tree to find a value if the current level's value
    # is nil.
    #
    # Configuration options:
    # * <tt>:table</tt>: The name of the table to use for lookup (required)
    # * <tt>:connection</tt>: The database adapter connection (required)
    # * <tt>:parent_id_field</tt>: The name of the parent ID field (defaults to :parent_id)
    #
    # TODO: Let the resolver be implemented in a class so different resolution methods are
    # possible.
    class HierarchyLookupTransform < ETL::Transform::Transform
      # The name of the field to use for the parent ID
      attr_accessor :parent_id_field

      # Initialize the transform
      #
      # Configuration options:
      # * <tt>:table</tt>: The table to search (required)
      # * <tt>:connection</tt>: The ActiveRecord adapter (required)
      # * <tt>:parent_id_field</tt>: The name of the field to use for the parent ID (defaults to :parent_id)
      def initialize(control, configuration={})
        super
        @parent_id_field = configuration[:parent_id_field] || :parent_id
      end

      # Transform the value.
      #
      # When the row has a parent id, the ancestors are looked up one by one
      # until one of them has a value for +name+ or the top of the hierarchy
      # is reached; the result of that walk is returned. Rows without a parent
      # id keep their incoming +value+.
      #
      # NOTE(review): a non-nil incoming +value+ is replaced by the ancestor's
      # value whenever the row has a parent id, although the class description
      # speaks of nil values only -- confirm which behaviour is intended.
      def transform(name, value, row)
        parent_id = row[parent_id_field]
        if parent_id
          # TODO: should use more than just the first source out of the control
          table = control.sources.first.configuration[:table]
          loop do
            parent_id, value = lookup(name, table, parent_id, parent_id_field)
            break if value || parent_id.nil?
          end
        end
        value
      end

      # Lookup the parent value. Note that this method requires that configuration[:connection]
      # is specified.
      #
      # Returns two values: the parent id of the row whose id is +parent_id+
      # and that row's value for +field+. Both are nil if no such row exists,
      # which ends the walk in #transform.
      #
      # Raises ETL::ControlError if the configuration has no :connection key.
      def lookup(field, table, parent_id, parent_id_field)
        unless configuration.has_key?(:connection)
          raise ETL::ControlError, "The configuration hash must include the database connection"
        end

        q = "SELECT #{parent_id_field}, #{field} FROM #{table} WHERE id = #{parent_id}"
        row = configuration[:connection].select_one(q)
        # A dangling parent reference yields no row; treat it as the top of
        # the hierarchy instead of failing on nil.
        return nil, nil if row.nil?
        return row[parent_id_field.to_s], row[field.to_s]
      end
    end
  end
end
|