activewarehouse-etl 0.5.2 → 0.6.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (39) hide show
  1. data/CHANGELOG +41 -13
  2. data/README +1 -1
  3. data/Rakefile +14 -4
  4. data/TODO +17 -1
  5. data/bin/etl +3 -1
  6. data/lib/etl.rb +11 -7
  7. data/lib/etl/commands/etl.rb +0 -1
  8. data/lib/etl/control/control.rb +113 -36
  9. data/lib/etl/control/destination.rb +13 -1
  10. data/lib/etl/control/destination/database_destination.rb +3 -1
  11. data/lib/etl/control/destination/file_destination.rb +5 -2
  12. data/lib/etl/control/source.rb +36 -0
  13. data/lib/etl/control/source/database_source.rb +63 -8
  14. data/lib/etl/control/source/file_source.rb +25 -4
  15. data/lib/etl/engine.rb +128 -14
  16. data/lib/etl/generator/surrogate_key_generator.rb +1 -0
  17. data/lib/etl/http_tools.rb +119 -0
  18. data/lib/etl/parser/apache_combined_log_parser.rb +47 -0
  19. data/lib/etl/parser/sax_parser.rb +18 -6
  20. data/lib/etl/processor.rb +1 -0
  21. data/lib/etl/processor/bulk_import_processor.rb +12 -0
  22. data/lib/etl/processor/hierarchy_exploder_processor.rb +54 -0
  23. data/lib/etl/processor/processor.rb +1 -5
  24. data/lib/etl/processor/row_processor.rb +17 -0
  25. data/lib/etl/transform/date_to_string_transform.rb +1 -1
  26. data/lib/etl/transform/decode_transform.rb +1 -1
  27. data/lib/etl/transform/default_transform.rb +15 -0
  28. data/lib/etl/transform/foreign_key_lookup_transform.rb +1 -1
  29. data/lib/etl/transform/hierarchy_lookup_transform.rb +56 -0
  30. data/lib/etl/transform/sha1_transform.rb +1 -1
  31. data/lib/etl/transform/string_to_date_transform.rb +3 -3
  32. data/lib/etl/transform/string_to_datetime_transform.rb +17 -0
  33. data/lib/etl/transform/string_to_time_transform.rb +14 -0
  34. data/lib/etl/transform/transform.rb +8 -4
  35. data/lib/etl/transform/type_transform.rb +2 -2
  36. data/lib/etl/version.rb +2 -2
  37. metadata +21 -8
  38. data/lib/etl/active_record_ext.rb +0 -1
  39. data/lib/etl/active_record_ext/connection_adapters/mysql_adapter.rb +0 -34
@@ -2,6 +2,7 @@ module ETL #:nodoc:
2
2
  module Generator #:nodoc:
3
3
  # Surrogate key generator.
4
4
  class SurrogateKeyGenerator < Generator
5
+ # Get the next surrogate key
5
6
  def next
6
7
  @surrogate_key ||= 0
7
8
  @surrogate_key += 1
@@ -0,0 +1,119 @@
require 'uri'

# Module which has utility methods for HTTP.
module HttpTools
  # Parse the given user agent string.
  #
  # Code taken from http://gemtacular.com/gems/ParseUserAgent
  #
  # Returns a Hash with the keys <tt>:browser</tt>, <tt>:browser_version_major</tt>,
  # <tt>:browser_version_minor</tt>, <tt>:ostype</tt>, <tt>:os_version</tt> and <tt>:os</tt>.
  # Values are stripped strings, or nil when the value could not be determined.
  #
  # Raises a RuntimeError ('Invalid User Agent') if the user agent is nil or '-'.
  def parse_user_agent(user_agent)
    raise 'Invalid User Agent' if user_agent.nil? || user_agent == '-'

    browser = browser_version_major = browser_version_minor = nil
    ostype = os = os_version = nil

    # fix Opera: rewrite "Opera 8.50" as "Opera/8.50" so that it is picked up
    # as an Agent/version token below
    user_agent = user_agent.gsub(/Opera (\d)/i, 'Opera/\1')

    # cycle through all Agent/version tokens to set browser and version
    # (MSIE is set later); the last token wins
    user_agent.split(/\s+/).each do |token|
      next unless token.include?('/')

      name, version = token.split('/')
      version = version.to_s # a token such as "Foo/" has no version part
      browser = name
      case name
      when 'Firefox'
        browser_version_major = version.slice(0, 3)
        browser_version_minor = version.sub(browser_version_major, '').sub('.', '')
      when 'Safari'
        # Safari reports its build number; builds below 400 are treated as 1.x
        browser_version_major = version.slice(0, 3).to_f < 400 ? '1' : '2'
      else
        browser_version_major = version.slice(0, 1)
      end
    end

    # grab all of the properties (within parens) by removing every token
    # found outside of the parens from the user agent string
    detail = user_agent
    user_agent.gsub(/\((.*)\)/, '').split(/\s/).each { |part| detail = detail.gsub(part, '') }
    properties = detail.delete('()').lstrip.split(/;\s+/)

    # cycle through the properties to set known quantities
    properties.each do |property|
      if property =~ /^Win/
        ostype = 'Windows'
        os = property
        nt = property.split(/ /, 2)[1]
        if nt =~ /^NT/
          nt_version = nt.split(/ /, 2)[1]
          os_version = case nt_version
                       when '5', '5.0' then '2000'
                       when '5.1' then 'XP'
                       else nt_version
                       end
        end
      end
      if property == 'Macintosh'
        ostype = 'Macintosh'
        os = property
      end
      if property =~ /OS X/
        ostype = 'Macintosh'
        os_version = 'OS X'
        os = property
      end
      if property =~ /^Linux/
        ostype = 'Linux'
        os = property
      end
      if property =~ /^MSIE/
        browser = 'MSIE'
        browser_version = property.gsub('MSIE ', '').lstrip
        browser_version_major, browser_version_minor = browser_version.split('.')
      end
    end

    result = {
      :browser => browser,
      :browser_version_major => browser_version_major,
      :browser_version_minor => browser_version_minor,
      :ostype => ostype,
      :os_version => os_version,
      :os => os,
    }
    # normalize: strip surrounding whitespace and turn blank values into nil
    result.each do |key, value|
      stripped = value.to_s.strip
      result[key] = stripped.empty? ? nil : stripped
    end
    result
  end

  # Parse the given URI string.
  #
  # Returns a Hash with the keys <tt>:scheme</tt>, <tt>:host</tt>, <tt>:port</tt>,
  # <tt>:uri_path</tt> and <tt>:domain</tt> (the last two dot-separated labels of
  # the host). All values are nil when +uri_string+ is nil; <tt>:domain</tt> is
  # nil when the host has fewer than two labels.
  #
  # Raises URI::InvalidURIError if the string cannot be parsed.
  def parse_uri(uri_string)
    results = {:scheme => nil, :host => nil, :port => nil, :uri_path => nil, :domain => nil}
    return results unless uri_string

    uri = URI.parse(uri_string)
    results[:scheme] = uri.scheme
    results[:host] = uri.host
    results[:port] = uri.port
    results[:uri_path] = uri.path

    domain_match = /\.?([^\.]+\.[^\.]+$)/.match(uri.host.to_s)
    results[:domain] = domain_match[1] if domain_match
    results
  end
end
@@ -0,0 +1,47 @@
module ETL #:nodoc:
  module Parser #:nodoc:
    # Parser which can parse the Apache Combined Log Format as defined at
    # http://httpd.apache.org/docs/2.2/logs.html
    class ApacheCombinedLogParser < ETL::Parser::Parser
      include HttpTools

      # Matches a single line in the combined log format. Example line:
      # 127.0.0.1 - frank [10/Oct/2000:13:55:36 -0700] "GET /apache_pb.gif HTTP/1.0" 200 2326 "http://www.example.com/start.html" "Mozilla/4.08 [en] (Win98; I ;Nav)"
      LINE_PATTERN = /^(\S+)\s(\S+)\s(\S+)\s\[([^\]]*)\]\s"([^"]*)"\s(\d*)\s(\d*)\s"([^"]*)"\s"([^"]*)"$/

      # Values merged into the row when the request carried no user agent ("-"),
      # so that every row exposes the same set of keys.
      UNKNOWN_USER_AGENT = {
        :browser => nil,
        :browser_version_major => nil,
        :browser_version_minor => nil,
        :ostype => nil,
        :os_version => nil,
        :os => nil,
      }.freeze

      # Initialize the parser
      def initialize(source, options={})
        super
      end

      # Yields one parsed row (see +parse+) for each line of each file matching
      # the +file+ glob. Every file is closed once it has been read.
      def each
        Dir.glob(file).each do |path|
          File.open(path) do |io|
            io.each_line { |line| yield parse(line) }
          end
        end
      end

      # Parse a single log line into a Hash of fields: the raw log fields
      # (:ip_address, :identd, :user, :timestamp, :request, :response_code,
      # :bytes, :referrer, :user_agent) plus the values returned by
      # parse_user_agent and parse_uri. Any field whose value is '-' is set to nil.
      #
      # Raises a RuntimeError if the line is not in the combined log format.
      def parse(line)
        match = LINE_PATTERN.match(line)
        raise "Unable to parse log line: #{line.inspect}" unless match

        fields = {
          :ip_address => match[1],
          :identd => match[2],
          :user => match[3],
          :timestamp => match[4],
          :request => match[5],
          :response_code => match[6],
          :bytes => match[7],
          :referrer => match[8],
          :user_agent => match[9],
        }

        # NOTE: the UTC offset in the log timestamp is not part of the format
        # string, so the time is interpreted in the local time zone.
        d = Date._strptime(fields[:timestamp], '%d/%b/%Y:%H:%M:%S')
        fields[:timestamp] = Time.mktime(d[:year], d[:mon], d[:mday], d[:hour], d[:min], d[:sec], d[:sec_fraction])

        # "-" means no user agent was logged; parse_user_agent raises for it,
        # which would otherwise abort the whole run on a perfectly valid line.
        if fields[:user_agent] == '-'
          fields.merge!(UNKNOWN_USER_AGENT)
        else
          fields.merge!(parse_user_agent(fields[:user_agent]))
        end
        fields.merge!(parse_uri(fields[:referrer]))

        fields.each do |key, value|
          fields[key] = nil if value == '-'
        end
        fields
      end

    end
  end
end
+ end
@@ -1,8 +1,9 @@
1
1
  require 'rexml/parsers/sax2parser'
2
2
  require 'rexml/sax2listener'
3
3
 
4
- module ETL
5
- module Parser
4
+ module ETL #:nodoc:
5
+ module Parser #:nodoc:
6
+ # ETL parser implementation which uses SAX to parse XML files.
6
7
  class SaxParser < ETL::Parser::Parser
7
8
 
8
9
  # The write trigger causes whatever values are currently specified for the row to be returned.
@@ -77,7 +78,19 @@ module ETL
77
78
 
78
79
  end
79
80
  def start_element(uri, localname, qname, attributes)
80
- @path.elements << XPath::Element.new(localname, attributes)
81
+ element = XPath::Element.new(localname, attributes)
82
+ @path.elements << element
83
+
84
+ @parser.fields.each do |field|
85
+ #puts "#{@path} match? #{field.path}"
86
+ if @path.match?(field.path)
87
+ #puts "field.path: #{field.path}"
88
+ if field.path.is_attribute?
89
+ #puts "setting @row[#{field.name}] to #{element.attributes[field.path.attribute]}"
90
+ @row[field.name] = element.attributes[field.path.attribute]
91
+ end
92
+ end
93
+ end
81
94
  end
82
95
  def end_element(uri, localname, qname)
83
96
  element = @path.elements.last
@@ -86,13 +99,12 @@ module ETL
86
99
  #puts "#{@path} match? #{field.path}"
87
100
  if @path.match?(field.path)
88
101
  #puts "field.path: #{field.path}"
89
- if field.path.is_attribute?
90
- @row[field.name] = element.attributes[field.path.attribute]
91
- else
102
+ if !field.path.is_attribute?
92
103
  @row[field.name] = @value
93
104
  end
94
105
  end
95
106
  end
107
+
96
108
  #puts @path.to_s
97
109
  if @path.match?(@parser.write_trigger)
98
110
  #puts "matched: #{@path} =~ #{@parser.write_trigger}"
data/lib/etl/processor.rb CHANGED
@@ -1,2 +1,3 @@
1
1
  require 'etl/processor/processor'
2
+ require 'etl/processor/row_processor'
2
3
  Dir[File.dirname(__FILE__) + "/processor/*.rb"].each { |file| require(file) }
@@ -3,12 +3,18 @@ module ETL #:nodoc:
3
3
  # Processor which is used to bulk import data into a target database
4
4
  class BulkImportProcessor < ETL::Processor::Processor
5
5
  attr_reader :file, :target, :truncate, :columns
6
+ attr_accessor :field_separator
7
+ attr_accessor :field_enclosure
8
+ attr_accessor :line_separator
6
9
  def initialize(control, configuration)
7
10
  super
8
11
  @file = File.join(File.dirname(control.file), configuration[:file])
9
12
  @target = configuration[:target]
10
13
  @truncate = configuration[:truncate] ||= false
11
14
  @columns = configuration[:columns]
15
+ @field_separator = (configuration[:field_separator] || ',')
16
+ @line_separator = configuration[:line_separator]
17
+ @field_enclosure = configuration[:field_enclosure]
12
18
  connect
13
19
  end
14
20
  def process
@@ -20,6 +26,11 @@ module ETL #:nodoc:
20
26
  conn.truncate(target[:table]) if truncate
21
27
  options = {}
22
28
  options[:columns] = columns
29
+ if field_separator || field_enclosure
30
+ options[:fields] = {}
31
+ options[:fields][:delimited_by] = field_separator if field_separator
32
+ options[:fields][:enclosed_by] = field_enclosure if field_enclosure
33
+ end
23
34
  conn.bulk_load(file, target[:table], options)
24
35
  end
25
36
  end
@@ -27,6 +38,7 @@ module ETL #:nodoc:
27
38
  private
28
39
  # Connect to the database
29
40
  def connect
41
+ Engine.logger.debug "Connecting to database #{target[:database]}"
30
42
  ETL::ActiveRecord::Base.establish_connection(
31
43
  :adapter => (target[:adapter] || :mysql),
32
44
  :username => (target[:username] || 'root'),
@@ -0,0 +1,54 @@
module ETL #:nodoc:
  module Processor #:nodoc:
    # Row-level processor that will convert a single row into multiple rows designed to be inserted
    # into a hierarchy bridge table.
    class HierarchyExploderProcessor < ETL::Processor::RowProcessor
      attr_accessor :id_field
      attr_accessor :parent_id_field

      # Initialize the processor
      #
      # Configuration options:
      # * <tt>:connection</tt>: The ActiveRecord adapter connection
      # * <tt>:table</tt>: The name of the table which holds the hierarchy
      # * <tt>:id_field</tt>: The name of the id field (defaults to 'id')
      # * <tt>:parent_id_field</tt>: The name of the parent id field (defaults to 'parent_id')
      #
      # TODO: Allow resolver to be implemented in a customizable fashion, i.e. don't rely
      # on AR as the only resolution method.
      def initialize(control, configuration={})
        @id_field = configuration[:id_field] || 'id'
        @parent_id_field = configuration[:parent_id_field] || 'parent_id'
        super
      end

      # Process the row expanding it into hierarchy values.
      #
      # Returns an array of bridge rows: one for the row itself (level 0) and one
      # for each of its descendants. The row is read using the configured id and
      # parent id field names (as symbols), so the defaults read row[:id] and
      # row[:parent_id].
      def process(row)
        rows = []
        conn = configuration[:connection]
        table = configuration[:table]
        id = row[id_field.to_sym]
        root = row[parent_id_field.to_sym].nil?
        build_rows([id], id, id, root, 0, rows, table, conn)
        rows
      end

      protected
      # Recursive function that will add a row for the current level and then call build_rows
      # for all of the children of the current level.
      #
      # +row_id+ is the id of the row being exploded and is used as :parent_id of
      # every generated row; +parent_id+ is not used here.
      #
      # NOTE(review): ids are interpolated into the SQL as-is and a cyclic
      # hierarchy would recurse without end - confirm the data rules this out.
      def build_rows(ids, parent_id, row_id, root, level, rows, table, conn)
        ids.each do |id|
          child_ids = conn.select_values("SELECT #{id_field} FROM #{table} WHERE #{parent_id_field} = #{id}")

          rows << {
            :parent_id => row_id,
            :child_id => id,
            :num_levels_from_parent => level,
            :is_bottom => (child_ids.empty? ? 1 : 0),
            :is_top => (root ? 1 : 0),
          }

          build_rows(child_ids, id, row_id, false, level + 1, rows, table, conn)
        end
      end
    end
  end
end
@@ -14,11 +14,7 @@ module ETL #:nodoc:
14
14
  @configuration
15
15
  end
16
16
  def log
17
- unless @log
18
- @log = Logger.new(STDOUT)
19
- @log.level = Logger::DEBUG
20
- end
21
- @log
17
+ Engine.logger
22
18
  end
23
19
  end
24
20
  end
@@ -0,0 +1,17 @@
module ETL #:nodoc:
  module Processor #:nodoc:
    # Processor which processes a specific row. Unlike a transformer, which deals with a specific
    # value in the row, row processors can process an entire row at once, which can be used to
    # explode a single row into multiple rows (for example)
    class RowProcessor < Processor
      # Initialize the processor
      def initialize(control, configuration)
        super
      end
      # Process the specified row. Subclasses must override this method; the
      # base implementation always raises a RuntimeError.
      def process(row)
        raise "process is an abstract method"
      end
    end
  end
end
@@ -11,7 +11,7 @@ module ETL #:nodoc:
11
11
  @format = configuration[:format] || "%Y-%m-%d"
12
12
  end
13
13
  # Transform the value using strftime
14
- def transform(value)
14
+ def transform(name, value, row)
15
15
  value.strftime(@format)
16
16
  end
17
17
  end
@@ -27,7 +27,7 @@ module ETL #:nodoc:
27
27
  end
28
28
 
29
29
  # Transform the value
30
- def transform(value)
30
+ def transform(name, value, row)
31
31
  decode_table[value] || default_value
32
32
  end
33
33
 
@@ -0,0 +1,15 @@
module ETL #:nodoc:
  module Transform #:nodoc:
    # Transform which substitutes a configured default for values that are blank.
    class DefaultTransform < Transform
      # The replacement used when the incoming value is blank
      attr_accessor :default_value

      # Initialize the transform
      #
      # Configuration options:
      # * <tt>:default_value</tt>: The value to use in place of blank values
      def initialize(control, configuration)
        super
        @default_value = configuration[:default_value]
      end

      # Return the default value if the given value is blank, otherwise the
      # value itself.
      def transform(name, value, row)
        return default_value if value.blank?
        value
      end
    end
  end
end
@@ -21,7 +21,7 @@ module ETL #:nodoc:
21
21
  end
22
22
 
23
23
  # Transform the value by resolving it to a foreign key
24
- def transform(value)
24
+ def transform(name, value, row)
25
25
  fk = @collection[value]
26
26
  unless fk
27
27
  raise ResolverError, "Foreign key for #{value} not found and no resolver specified" unless resolver
@@ -0,0 +1,56 @@
module ETL #:nodoc:
  module Transform #:nodoc:
    # Transform which walks up the hierarchy tree to find a value if the current level's value
    # is nil.
    #
    # Configuration options:
    # * <tt>:table</tt>: The name of the table to use for lookup (required)
    # * <tt>:connection</tt>: The database adapter connection (required)
    # * <tt>:parent_id_field</tt>: The name of the parent ID field (defaults to :parent_id)
    #
    # TODO: Let the resolver be implemented in a class so different resolution methods are
    # possible.
    class HierarchyLookupTransform < ETL::Transform::Transform
      # The name of the field to use for the parent ID
      attr_accessor :parent_id_field

      # Initialize the transform
      #
      # Configuration options:
      # * <tt>:table</tt>: The table to search (required)
      # * <tt>:connection</tt>: The ActiveRecord adapter (required)
      # * <tt>:parent_id_field</tt>: The name of the field to use for the parent ID (defaults to :parent_id)
      def initialize(control, configuration={})
        super
        @parent_id_field = configuration[:parent_id_field] || :parent_id
      end

      # Transform the value.
      #
      # A non-nil value is returned unchanged. Otherwise the ancestors of the row
      # are visited, nearest first, and the first value found for the field is
      # returned; nil is returned if no ancestor has one.
      def transform(name, value, row)
        return value unless value.nil?

        parent_id = row[parent_id_field]
        return value unless parent_id

        # TODO: should use more than just the first source out of the control
        table = control.sources.first.configuration[:table]
        visited = {}
        until value || parent_id.nil?
          # stop instead of looping forever if the hierarchy contains a cycle
          break if visited[parent_id]
          visited[parent_id] = true
          parent_id, value = lookup(name, table, parent_id, parent_id_field)
        end
        value
      end

      # Lookup the parent value. Note that this method requires that configuration[:connection]
      # is specified.
      #
      # Returns the pair [parent id, field value] of the row whose id is +parent_id+,
      # or [nil, nil] if there is no such row.
      #
      # Raises ETL::ControlError if no connection is configured.
      def lookup(field, table, parent_id, parent_id_field)
        unless configuration.has_key?(:connection)
          raise ETL::ControlError, "The configuration hash must include the database connection"
        end

        q = "SELECT #{parent_id_field}, #{field} FROM #{table} WHERE id = #{parent_id}"
        row = configuration[:connection].select_one(q)
        # a dangling parent id ends the walk rather than raising NoMethodError
        return nil, nil unless row
        return row[parent_id_field.to_s], row[field.to_s]
      end
    end
  end
end