activewarehouse-etl 0.5.2 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +41 -13
- data/README +1 -1
- data/Rakefile +14 -4
- data/TODO +17 -1
- data/bin/etl +3 -1
- data/lib/etl.rb +11 -7
- data/lib/etl/commands/etl.rb +0 -1
- data/lib/etl/control/control.rb +113 -36
- data/lib/etl/control/destination.rb +13 -1
- data/lib/etl/control/destination/database_destination.rb +3 -1
- data/lib/etl/control/destination/file_destination.rb +5 -2
- data/lib/etl/control/source.rb +36 -0
- data/lib/etl/control/source/database_source.rb +63 -8
- data/lib/etl/control/source/file_source.rb +25 -4
- data/lib/etl/engine.rb +128 -14
- data/lib/etl/generator/surrogate_key_generator.rb +1 -0
- data/lib/etl/http_tools.rb +119 -0
- data/lib/etl/parser/apache_combined_log_parser.rb +47 -0
- data/lib/etl/parser/sax_parser.rb +18 -6
- data/lib/etl/processor.rb +1 -0
- data/lib/etl/processor/bulk_import_processor.rb +12 -0
- data/lib/etl/processor/hierarchy_exploder_processor.rb +54 -0
- data/lib/etl/processor/processor.rb +1 -5
- data/lib/etl/processor/row_processor.rb +17 -0
- data/lib/etl/transform/date_to_string_transform.rb +1 -1
- data/lib/etl/transform/decode_transform.rb +1 -1
- data/lib/etl/transform/default_transform.rb +15 -0
- data/lib/etl/transform/foreign_key_lookup_transform.rb +1 -1
- data/lib/etl/transform/hierarchy_lookup_transform.rb +56 -0
- data/lib/etl/transform/sha1_transform.rb +1 -1
- data/lib/etl/transform/string_to_date_transform.rb +3 -3
- data/lib/etl/transform/string_to_datetime_transform.rb +17 -0
- data/lib/etl/transform/string_to_time_transform.rb +14 -0
- data/lib/etl/transform/transform.rb +8 -4
- data/lib/etl/transform/type_transform.rb +2 -2
- data/lib/etl/version.rb +2 -2
- metadata +21 -8
- data/lib/etl/active_record_ext.rb +0 -1
- data/lib/etl/active_record_ext/connection_adapters/mysql_adapter.rb +0 -34
@@ -0,0 +1,119 @@
|
|
1
|
+
require 'uri'
|
2
|
+
|
3
|
+
# Module which has utility methods for HTTP.
|
4
|
+
module HttpTools
  # Parse the given user agent string into browser and operating system
  # details.
  #
  # Code taken from http://gemtacular.com/gems/ParseUserAgent
  #
  # user_agent - The raw User-Agent string.
  #
  # Returns a Hash with the keys :browser, :browser_version_major,
  # :browser_version_minor, :ostype, :os_version and :os. Every value is
  # either a stripped String or nil (nil when the detail could not be
  # determined or was blank).
  #
  # Raises RuntimeError ('Invalid User Agent') when user_agent is '-'.
  def parse_user_agent(user_agent)
    if '-' == user_agent
      raise 'Invalid User Agent'
    end

    browser, browser_version_major, browser_version_minor, ostype, os, os_version = nil

    # grab all Agent/version strings as 'agents'
    agents = user_agent.split(/\s+/).select { |token| token =~ /\// }

    # cycle through the agents to set browser and version (MSIE is set later);
    # the last usable agent wins
    agents.each do |agent|
      name, version = agent.split('/')
      # Tokens such as "Foo/" (no version) or "+http://host/..." (empty second
      # part) are not name/version pairs; skipping them avoids calling String
      # methods on nil and avoids reporting a URL fragment as the browser.
      next if name.nil? || name.empty? || version.nil? || version.empty?

      browser = name
      if browser == 'Firefox'
        browser_version_major = version.slice(0,3)
        browser_version_minor = version.sub(browser_version_major,'').sub('.','')
      elsif browser == 'Safari'
        # Safari reports a build number; builds below 400 are mapped to major
        # version 1, everything else to 2
        browser_version_major = version.slice(0,3).to_f < 400 ? '1' : '2'
      else
        browser_version_major = version.slice(0,1)
      end
    end

    # grab all of the properties (within parens)
    # should be in relation to the agent if possible
    detail = user_agent
    # remove every token that lives outside the parentheses, leaving only the
    # parenthesised detail section(s)
    user_agent.gsub(/\((.*)\)/,'').split(/\s/).each {|part| detail = detail.gsub(part,'')}
    detail = detail.gsub('(','').gsub(')','').lstrip
    properties = detail.split(/;\s+/)

    # cycle through the properties to set known quantities
    properties.each do |property|
      if property =~ /^Win/
        ostype = 'Windows'
        os = property
        nt_part = property.split(/ /,2)[1]
        if nt_part =~ /^NT/
          nt_version = nt_part.split(/ /,2)[1]
          os_version = case nt_version
                       when '5'   then '2000'
                       when '5.1' then 'XP'
                       else nt_version
                       end
        end
      end
      if property == 'Macintosh'
        ostype = 'Macintosh'
        os = property
      end
      if property =~ /OS X/
        ostype = 'Macintosh'
        os_version = 'OS X'
        os = property
      end
      if property =~ /^Linux/
        ostype = 'Linux'
        os = property
      end
      if property =~ /^MSIE/
        browser = 'MSIE'
        browser_version = property.gsub('MSIE ','').lstrip
        browser_version_major,browser_version_minor = browser_version.split('.')
      end
    end

    result = {
      :browser => browser,
      :browser_version_major => browser_version_major,
      :browser_version_minor => browser_version_minor,
      :ostype => ostype,
      :os_version => os_version,
      :os => os,
    }
    # Normalise every value: nil and whitespace-only strings become nil,
    # everything else is stripped. Uses only core String methods so the module
    # does not depend on ActiveSupport's blank?.
    result.each do |key, value|
      stripped = value.to_s.strip
      result[key] = stripped.empty? ? nil : stripped
    end
    result
  end

  # Split a URI string into its components.
  #
  # uri_string - The URI as a String, or nil.
  #
  # Returns a Hash with the keys :scheme, :host, :port, :uri_path and
  # :domain. All five keys are always present; every value is nil when
  # uri_string is nil, and :domain is nil when the host does not contain at
  # least two dot-separated labels.
  #
  # Raises whatever URI.parse raises for a malformed URI.
  def parse_uri(uri_string)
    results = {:scheme => nil, :host => nil, :port => nil, :uri_path => nil, :domain => nil}
    return results unless uri_string

    uri = URI.parse(uri_string)
    results[:scheme] = uri.scheme
    results[:host] = uri.host
    results[:port] = uri.port
    results[:uri_path] = uri.path
    # the domain is the last two dot-separated labels of the host
    results[:domain] = $1 if uri.host =~ /\.?([^\.]+\.[^\.]+$)/
    results
  end
end
|
@@ -0,0 +1,47 @@
|
|
1
|
+
module ETL #:nodoc:
  module Parser #:nodoc:
    # Parser which can parse the Apache Combined Log Format as defined at
    # http://httpd.apache.org/docs/2.2/logs.html
    class ApacheCombinedLogParser < ETL::Parser::Parser
      include HttpTools

      # Initialize the parser. Both arguments are passed straight through to
      # the superclass.
      def initialize(source, options={})
        super
      end

      # Yield one parsed Hash (see #parse) for every line of every file that
      # matches the +file+ glob pattern.
      # NOTE(review): +file+ is not defined in this class; presumably it is
      # provided by ETL::Parser::Parser -- confirm.
      def each
        Dir.glob(file).each do |path|
          # The block form of File.open closes the handle once the file has
          # been read, even if the consumer's block raises.
          File.open(path) do |io|
            io.each_line do |line|
              yield parse(line)
            end
          end
        end
      end

      # Parse a single log line.
      #
      # line - One line in Apache Combined Log Format.
      #
      # Returns a Hash with the keys :ip_address, :identd, :user, :timestamp
      # (a Time), :request, :response_code, :bytes, :referrer and :user_agent,
      # merged with the results of parse_uri for the referrer and, when a user
      # agent was logged, parse_user_agent. Any value equal to '-' is
      # replaced with nil.
      def parse(line)
        # example line: 127.0.0.1 - frank [10/Oct/2000:13:55:36 -0700] "GET /apache_pb.gif HTTP/1.0" 200 2326 "http://www.example.com/start.html" "Mozilla/4.08 [en] (Win98; I ;Nav)"
        line =~ /^(\S+)\s(\S+)\s(\S+)\s\[([^\]]*)\]\s"([^"]*)"\s(\d*)\s(\d*)\s"([^"]*)"\s"([^"]*)"$/
        fields = {
          :ip_address => $1,
          :identd => $2,
          :user => $3,
          :timestamp => $4,
          :request => $5,
          :response_code => $6,
          :bytes => $7,
          :referrer => $8,
          :user_agent => $9,
        }
        # The format string has no %z, so the trailing UTC offset in the log
        # entry is not applied; Time.mktime builds the time in the local zone.
        d = Date._strptime(fields[:timestamp], '%d/%b/%Y:%H:%M:%S')
        fields[:timestamp] = Time.mktime(d[:year], d[:mon], d[:mday], d[:hour], d[:min], d[:sec], d[:sec_fraction])

        # parse_user_agent raises on '-', which is the same placeholder that
        # is converted to nil below, so such lines are left without user agent
        # details instead of aborting the whole run.
        user_agent = fields[:user_agent]
        fields.merge!(parse_user_agent(user_agent)) unless user_agent == '-'
        fields.merge!(parse_uri(fields[:referrer]))

        fields.each do |key, value|
          fields[key] = nil if value == '-'
        end
        fields
      end

    end
  end
end
|
@@ -1,8 +1,9 @@
|
|
1
1
|
require 'rexml/parsers/sax2parser'
|
2
2
|
require 'rexml/sax2listener'
|
3
3
|
|
4
|
-
module ETL
|
5
|
-
module Parser
|
4
|
+
module ETL #:nodoc:
|
5
|
+
module Parser #:nodoc:
|
6
|
+
# ETL parser implementation which uses SAX to parse XML files.
|
6
7
|
class SaxParser < ETL::Parser::Parser
|
7
8
|
|
8
9
|
# The write trigger causes whatever values are currently specified for the row to be returned.
|
@@ -77,7 +78,19 @@ module ETL
|
|
77
78
|
|
78
79
|
end
|
79
80
|
# SAX callback invoked for an opening tag. Pushes a new XPath::Element for
# the tag onto the current path, then copies attribute values into the
# current row for every parser field whose path matches the current path
# and refers to an attribute.
def start_element(uri, localname, qname, attributes)
  current = XPath::Element.new(localname, attributes)
  @path.elements << current

  @parser.fields.each do |field|
    next unless @path.match?(field.path)
    next unless field.path.is_attribute?

    @row[field.name] = current.attributes[field.path.attribute]
  end
end
|
82
95
|
def end_element(uri, localname, qname)
|
83
96
|
element = @path.elements.last
|
@@ -86,13 +99,12 @@ module ETL
|
|
86
99
|
#puts "#{@path} match? #{field.path}"
|
87
100
|
if @path.match?(field.path)
|
88
101
|
#puts "field.path: #{field.path}"
|
89
|
-
if field.path.is_attribute?
|
90
|
-
@row[field.name] = element.attributes[field.path.attribute]
|
91
|
-
else
|
102
|
+
if !field.path.is_attribute?
|
92
103
|
@row[field.name] = @value
|
93
104
|
end
|
94
105
|
end
|
95
106
|
end
|
107
|
+
|
96
108
|
#puts @path.to_s
|
97
109
|
if @path.match?(@parser.write_trigger)
|
98
110
|
#puts "matched: #{@path} =~ #{@parser.write_trigger}"
|
data/lib/etl/processor.rb
CHANGED
@@ -3,12 +3,18 @@ module ETL #:nodoc:
|
|
3
3
|
# Processor which is used to bulk import data into a target database
|
4
4
|
class BulkImportProcessor < ETL::Processor::Processor
|
5
5
|
attr_reader :file, :target, :truncate, :columns
|
6
|
+
attr_accessor :field_separator
|
7
|
+
attr_accessor :field_enclosure
|
8
|
+
attr_accessor :line_separator
|
6
9
|
# Initialize the processor and connect to the target database.
#
# Configuration keys read here:
# * <tt>:file</tt>: The file to load, resolved relative to the directory of the control file
# * <tt>:target</tt>: The target configuration
# * <tt>:truncate</tt>: Stored as the truncate flag (defaults to false)
# * <tt>:columns</tt>: The columns option passed to the bulk load
# * <tt>:field_separator</tt>: The field separator (defaults to ',')
# * <tt>:line_separator</tt>: The line separator
# * <tt>:field_enclosure</tt>: The field enclosure
def initialize(control, configuration)
  super
  control_dir = File.dirname(control.file)
  @file = File.join(control_dir, configuration[:file])
  @target = configuration[:target]
  # ||= also writes the false default back into the configuration hash
  @truncate = (configuration[:truncate] ||= false)
  @columns = configuration[:columns]
  @field_separator = configuration[:field_separator] || ','
  @line_separator = configuration[:line_separator]
  @field_enclosure = configuration[:field_enclosure]
  connect
end
|
14
20
|
def process
|
@@ -20,6 +26,11 @@ module ETL #:nodoc:
|
|
20
26
|
conn.truncate(target[:table]) if truncate
|
21
27
|
options = {}
|
22
28
|
options[:columns] = columns
|
29
|
+
if field_separator || field_enclosure
|
30
|
+
options[:fields] = {}
|
31
|
+
options[:fields][:delimited_by] = field_separator if field_separator
|
32
|
+
options[:fields][:enclosed_by] = field_enclosure if field_enclosure
|
33
|
+
end
|
23
34
|
conn.bulk_load(file, target[:table], options)
|
24
35
|
end
|
25
36
|
end
|
@@ -27,6 +38,7 @@ module ETL #:nodoc:
|
|
27
38
|
private
|
28
39
|
# Connect to the database
|
29
40
|
def connect
|
41
|
+
Engine.logger.debug "Connecting to database #{target[:database]}"
|
30
42
|
ETL::ActiveRecord::Base.establish_connection(
|
31
43
|
:adapter => (target[:adapter] || :mysql),
|
32
44
|
:username => (target[:username] || 'root'),
|
@@ -0,0 +1,54 @@
|
|
1
|
+
module ETL #:nodoc:
  module Processor #:nodoc:
    # Row-level processor which turns one row into several rows, each shaped
    # for insertion into a hierarchy bridge table.
    class HierarchyExploderProcessor < ETL::Processor::RowProcessor
      attr_accessor :id_field, :parent_id_field

      # Initialize the processor
      #
      # Configuration options:
      # * <tt>:connection</tt>: The ActiveRecord adapter connection
      # * <tt>:table</tt>: The table queried for child ids
      # * <tt>:id_field</tt>: The name of the id field (defaults to 'id')
      # * <tt>:parent_id_field</tt>: The name of the parent id field (defaults to 'parent_id')
      #
      # TODO: Allow resolver to be implemented in a customizable fashion, i.e. don't rely
      # on AR as the only resolution method.
      def initialize(control, configuration={})
        @id_field = configuration[:id_field] || 'id'
        @parent_id_field = configuration[:parent_id_field] || 'parent_id'
        super
      end

      # Explode the row into an Array of bridge rows: one for the row itself
      # (level 0) followed by one for each of its descendants. The row's own
      # id and parent id are read from row[:id] and row[:parent_id].
      def process(row)
        exploded = []
        connection = configuration[:connection]
        table_name = configuration[:table]
        own_id = row[:id]
        is_root = row[:parent_id].nil?
        build_rows([own_id], own_id, own_id, is_root, 0, exploded, table_name, connection)
        exploded
      end

      protected
      # Recursively append a bridge row for each id at the current level,
      # then descend into that id's children (depth first). Every emitted
      # row carries row_id as its :parent_id, so all rows point back at the
      # row being exploded.
      def build_rows(ids, parent_id, row_id, root, level, rows, table, conn)
        top_flag = root ? 1 : 0
        ids.each do |id|
          child_ids = conn.select_values("SELECT #{id_field} FROM #{table} WHERE #{parent_id_field} = #{id}")
          bottom_flag = child_ids.empty? ? 1 : 0

          rows << {
            :parent_id => row_id,
            :child_id => id,
            :num_levels_from_parent => level,
            :is_bottom => bottom_flag,
            :is_top => top_flag,
          }

          build_rows(child_ids, id, row_id, false, level + 1, rows, table, conn)
        end
      end
    end
  end
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
module ETL #:nodoc:
  module Processor #:nodoc:
    # Processor which processes a specific row. Unlike a transformer, which deals with a specific
    # value in the row, row processors can process an entire row at once, which can be used to
    # explode a single row into multiple rows (for example)
    class RowProcessor < Processor
      # Initialize the processor. Both arguments are passed through to the
      # superclass.
      def initialize(control, configuration)
        super
      end

      # Process the specified row. This base implementation always raises a
      # RuntimeError; subclasses are expected to override it.
      def process(row)
        # The message names the method that actually has to be overridden.
        raise "process is an abstract method"
      end
    end
  end
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
module ETL #:nodoc:
  module Transform #:nodoc:
    # Transform which substitutes a configured default for blank values.
    class DefaultTransform < Transform
      # The value returned in place of a blank input
      attr_accessor :default_value

      # Initialize the transform
      #
      # Configuration options:
      # * <tt>:default_value</tt>: The replacement for blank values
      def initialize(control, configuration)
        super
        @default_value = configuration[:default_value]
      end

      # Return the default value when the given value is blank, otherwise
      # return the value untouched.
      def transform(name, value, row)
        return default_value if value.blank?
        value
      end
    end
  end
end
|
@@ -21,7 +21,7 @@ module ETL #:nodoc:
|
|
21
21
|
end
|
22
22
|
|
23
23
|
# Transform the value by resolving it to a foreign key
|
24
|
-
def transform(value)
|
24
|
+
def transform(name, value, row)
|
25
25
|
fk = @collection[value]
|
26
26
|
unless fk
|
27
27
|
raise ResolverError, "Foreign key for #{value} not found and no resolver specified" unless resolver
|
@@ -0,0 +1,56 @@
|
|
1
|
+
module ETL #:nodoc:
  module Transform #:nodoc:
    # Transform which walks up the hierarchy tree to find a value if the current level's value
    # is nil.
    #
    # Configuration options:
    # * <tt>:table</tt>: The name of the table to use for lookup (required)
    # * <tt>:connection</tt>: The database adapter connection (required)
    # * <tt>:parent_id_field</tt>: The name of the parent ID field (defaults to :parent_id)
    #
    # TODO: Let the resolver be implemented in a class so different resolution methods are
    # possible.
    class HierarchyLookupTransform < ETL::Transform::Transform
      # The name of the field to use for the parent ID
      attr_accessor :parent_id_field

      # Initialize the transform
      #
      # Configuration options:
      # * <tt>:table</tt>: The table to search (required)
      # * <tt>:connection</tt>: The ActiveRecord adapter (required)
      # * <tt>:parent_id_field</tt>: The name of the field to use for the parent ID (defaults to :parent_id)
      def initialize(control, configuration={})
        super
        @parent_id_field = configuration[:parent_id_field] || :parent_id
      end

      # Transform the value.
      #
      # When the row has no parent id the value is returned unchanged.
      # Otherwise ancestors are looked up one at a time, starting with the
      # parent, until one of them supplies a value or the chain ends; the
      # result of the last lookup (possibly nil) is returned.
      #
      # NOTE(review): when the row has a parent, the incoming value is replaced
      # by the looked-up one even if it was not nil, although the class comment
      # describes the walk as happening only for nil values -- confirm which
      # behaviour is intended.
      def transform(name, value, row)
        parent_id = row[parent_id_field]
        return value unless parent_id

        loop do
          # TODO: should use more than just the first source out of the control
          parent_id, value = lookup(name,
            control.sources.first.configuration[:table], parent_id, parent_id_field)
          break if value || parent_id.nil?
        end
        value
      end

      # Lookup the parent value. Note that this method requires that configuration[:connection]
      # is specified
      #
      # Returns a two element Array: the parent's own parent id and the
      # parent's value for +field+. Both are nil when no row with the given
      # id exists, which ends the walk in #transform.
      #
      # Raises ETL::ControlError when the configuration has no :connection.
      def lookup(field, table, parent_id, parent_id_field)
        unless configuration.has_key?(:connection)
          raise ETL::ControlError, "The configuration hash must include the database connection"
        end

        q = "SELECT #{parent_id_field}, #{field} FROM #{table} WHERE id = #{parent_id}"
        row = configuration[:connection].select_one(q)
        # select_one yields nil when the query matches no row (dangling parent
        # id); report "no parent, no value" instead of indexing into nil.
        return nil, nil unless row
        return row[parent_id_field.to_s], row[field.to_s]
      end
    end
  end
end
|