activewarehouse-etl 0.5.2 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. data/CHANGELOG +41 -13
  2. data/README +1 -1
  3. data/Rakefile +14 -4
  4. data/TODO +17 -1
  5. data/bin/etl +3 -1
  6. data/lib/etl.rb +11 -7
  7. data/lib/etl/commands/etl.rb +0 -1
  8. data/lib/etl/control/control.rb +113 -36
  9. data/lib/etl/control/destination.rb +13 -1
  10. data/lib/etl/control/destination/database_destination.rb +3 -1
  11. data/lib/etl/control/destination/file_destination.rb +5 -2
  12. data/lib/etl/control/source.rb +36 -0
  13. data/lib/etl/control/source/database_source.rb +63 -8
  14. data/lib/etl/control/source/file_source.rb +25 -4
  15. data/lib/etl/engine.rb +128 -14
  16. data/lib/etl/generator/surrogate_key_generator.rb +1 -0
  17. data/lib/etl/http_tools.rb +119 -0
  18. data/lib/etl/parser/apache_combined_log_parser.rb +47 -0
  19. data/lib/etl/parser/sax_parser.rb +18 -6
  20. data/lib/etl/processor.rb +1 -0
  21. data/lib/etl/processor/bulk_import_processor.rb +12 -0
  22. data/lib/etl/processor/hierarchy_exploder_processor.rb +54 -0
  23. data/lib/etl/processor/processor.rb +1 -5
  24. data/lib/etl/processor/row_processor.rb +17 -0
  25. data/lib/etl/transform/date_to_string_transform.rb +1 -1
  26. data/lib/etl/transform/decode_transform.rb +1 -1
  27. data/lib/etl/transform/default_transform.rb +15 -0
  28. data/lib/etl/transform/foreign_key_lookup_transform.rb +1 -1
  29. data/lib/etl/transform/hierarchy_lookup_transform.rb +56 -0
  30. data/lib/etl/transform/sha1_transform.rb +1 -1
  31. data/lib/etl/transform/string_to_date_transform.rb +3 -3
  32. data/lib/etl/transform/string_to_datetime_transform.rb +17 -0
  33. data/lib/etl/transform/string_to_time_transform.rb +14 -0
  34. data/lib/etl/transform/transform.rb +8 -4
  35. data/lib/etl/transform/type_transform.rb +2 -2
  36. data/lib/etl/version.rb +2 -2
  37. metadata +21 -8
  38. data/lib/etl/active_record_ext.rb +0 -1
  39. data/lib/etl/active_record_ext/connection_adapters/mysql_adapter.rb +0 -34
@@ -2,6 +2,7 @@ module ETL #:nodoc:
2
2
  module Generator #:nodoc:
3
3
  # Surrogate key generator.
4
4
  class SurrogateKeyGenerator < Generator
5
+ # Get the next surrogate key
5
6
  def next
6
7
  @surrogate_key ||= 0
7
8
  @surrogate_key += 1
@@ -0,0 +1,119 @@
1
+ require 'uri'
2
+
3
# Module which has utility methods for HTTP.
module HttpTools
  # Parse the given user agent string.
  #
  # Returns a Hash with the keys <tt>:browser</tt>, <tt>:browser_version_major</tt>,
  # <tt>:browser_version_minor</tt>, <tt>:ostype</tt>, <tt>:os_version</tt> and <tt>:os</tt>.
  # Every value is either a stripped, non-empty String or nil when it could not
  # be determined.
  #
  # Raises a RuntimeError ('Invalid User Agent') if the user agent is '-'.
  #
  # Code taken from http://gemtacular.com/gems/ParseUserAgent
  def parse_user_agent(user_agent)
    raise 'Invalid User Agent' if '-' == user_agent

    browser = browser_version_major = browser_version_minor = nil
    ostype = os = os_version = nil

    # fix Opera: rewrite "Opera 9" as "Opera/9" so that it is picked up as an
    # Agent/version token below (Perl original: s/Opera (\d)/Opera\/$1/i)
    normalized = user_agent.gsub(/Opera (\d)/i, 'Opera/\1')

    # grab all Agent/version strings as 'agents'
    agents = normalized.split(/\s+/).select { |token| token =~ /\// }

    # cycle through the agents to set browser and version (MSIE is set later);
    # the last agent token wins
    agents.each do |agent|
      parts = agent.split('/')
      browser = parts[0]
      # a token such as "Firefox/" has no version part
      version = parts[1].to_s
      if browser == 'Firefox'
        browser_version_major = version.slice(0, 3)
        browser_version_minor = version.sub(browser_version_major, '').sub('.', '')
      elsif browser == 'Safari'
        browser_version_major = version.slice(0, 3).to_f < 400 ? '1' : '2'
      else
        browser_version_major = version.slice(0, 1)
      end
    end

    # grab all of the properties (within parens)
    # should be in relation to the agent if possible
    #
    # Every whitespace-separated token found outside the outermost parens is
    # removed from the string, leaving only the parenthesised detail.
    detail = user_agent
    user_agent.gsub(/\((.*)\)/, '').split(/\s/).each do |part|
      detail = detail.gsub(part, '')
    end
    detail = detail.gsub('(', '').gsub(')', '').lstrip
    properties = detail.split(/;\s+/)

    # cycle through the properties to set known quantities
    properties.each do |property|
      if property =~ /^Win/
        ostype = 'Windows'
        os = property
        release = property.split(/ /, 2)[1].to_s
        if release =~ /^NT/
          nt_version = release.split(/ /, 2)[1]
          os_version = case nt_version
                       when '5', '5.0' then '2000'
                       when '5.1' then 'XP'
                       else nt_version
                       end
        end
      end
      if property == 'Macintosh'
        ostype = 'Macintosh'
        os = property
      end
      if property =~ /OS X/
        ostype = 'Macintosh'
        os_version = 'OS X'
        os = property
      end
      if property =~ /^Linux/
        ostype = 'Linux'
        os = property
      end
      if property =~ /^MSIE/
        browser = 'MSIE'
        browser_version = property.gsub('MSIE ', '').lstrip
        browser_version_major, browser_version_minor = browser_version.split('.')
      end
    end

    result = {
      :browser => browser,
      :browser_version_major => browser_version_major,
      :browser_version_minor => browser_version_minor,
      :ostype => ostype,
      :os_version => os_version,
      :os => os,
    }
    # normalize: strip surrounding whitespace and turn empty values into nil
    result.keys.each do |key|
      text = result[key].to_s.strip
      result[key] = text.empty? ? nil : text
    end
    result
  end

  # Parse the given URI string.
  #
  # Returns a Hash with the keys <tt>:scheme</tt>, <tt>:host</tt>, <tt>:port</tt>,
  # <tt>:uri_path</tt> and <tt>:domain</tt>. The domain is the last two
  # dot-separated labels of the host, or nil when the host has no such suffix.
  # All values are nil when +uri_string+ is nil or false.
  #
  # The string is handed to URI.parse unchanged, so any error URI.parse raises
  # propagates to the caller.
  def parse_uri(uri_string)
    results = {:scheme => nil, :host => nil, :port => nil, :uri_path => nil, :domain => nil}
    return results unless uri_string

    uri = URI.parse(uri_string)
    results[:scheme] = uri.scheme
    results[:host] = uri.host
    results[:port] = uri.port
    results[:uri_path] = uri.path
    results[:domain] = uri.host.to_s[/\.?([^\.]+\.[^\.]+$)/, 1]
    results
  end
end
@@ -0,0 +1,47 @@
1
module ETL #:nodoc:
  module Parser #:nodoc:
    # Parser which can parse the Apache Combined Log Format as defined at
    # http://httpd.apache.org/docs/2.2/logs.html
    class ApacheCombinedLogParser < ETL::Parser::Parser
      include HttpTools

      # Initialize the parser; all arguments are passed through to the superclass.
      def initialize(source, options={})
        super
      end

      # Yield one parsed row (see #parse) for every line of every file matching
      # the +file+ glob pattern.
      def each
        Dir.glob(file).each do |path|
          # block form so the file handle is closed once the file has been read
          File.open(path) do |io|
            io.each_line { |line| yield parse(line) }
          end
        end
      end

      # Parse a single log line into a Hash of fields.
      #
      # The hash contains :ip_address, :identd, :user, :timestamp (a Time),
      # :request, :response_code, :bytes, :referrer and :user_agent, merged with
      # the results of parse_user_agent and parse_uri (applied to the referrer).
      # Any value equal to '-' is replaced with nil.
      def parse(line)
        # example line: 127.0.0.1 - frank [10/Oct/2000:13:55:36 -0700] "GET /apache_pb.gif HTTP/1.0" 200 2326 "http://www.example.com/start.html" "Mozilla/4.08 [en] (Win98; I ;Nav)"
        line =~ /^(\S+)\s(\S+)\s(\S+)\s\[([^\]]*)\]\s"([^"]*)"\s(\d*)\s(\d*)\s"([^"]*)"\s"([^"]*)"$/
        fields = {
          :ip_address => $1,
          :identd => $2,
          :user => $3,
          :timestamp => $4,
          :request => $5,
          :response_code => $6,
          :bytes => $7,
          :referrer => $8,
          :user_agent => $9,
        }
        # The strptime format does not include the UTC offset, so the Time is
        # built from the wall-clock fields only.
        #fields[:timestamp] =~ r%{(\d\d)/(\w\w\w)/(\d\d\d\d):(\d\d):(\d\d):(\d\d) -(\d\d\d\d)}
        d = Date._strptime(fields[:timestamp], '%d/%b/%Y:%H:%M:%S')
        fields[:timestamp] = Time.mktime(d[:year], d[:mon], d[:mday], d[:hour], d[:min], d[:sec], d[:sec_fraction])

        fields.merge!(parse_user_agent(fields[:user_agent]))
        fields.merge!(parse_uri(fields[:referrer]))

        fields.each do |key, value|
          fields[key] = nil if value == '-'
        end
        fields
      end

    end
  end
end
@@ -1,8 +1,9 @@
1
1
  require 'rexml/parsers/sax2parser'
2
2
  require 'rexml/sax2listener'
3
3
 
4
- module ETL
5
- module Parser
4
+ module ETL #:nodoc:
5
+ module Parser #:nodoc:
6
+ # ETL parser implementation which uses SAX to parse XML files.
6
7
  class SaxParser < ETL::Parser::Parser
7
8
 
8
9
  # The write trigger causes whatever values are currently specified for the row to be returned.
@@ -77,7 +78,19 @@ module ETL
77
78
 
78
79
  end
79
80
  def start_element(uri, localname, qname, attributes)
80
- @path.elements << XPath::Element.new(localname, attributes)
81
+ element = XPath::Element.new(localname, attributes)
82
+ @path.elements << element
83
+
84
+ @parser.fields.each do |field|
85
+ #puts "#{@path} match? #{field.path}"
86
+ if @path.match?(field.path)
87
+ #puts "field.path: #{field.path}"
88
+ if field.path.is_attribute?
89
+ #puts "setting @row[#{field.name}] to #{element.attributes[field.path.attribute]}"
90
+ @row[field.name] = element.attributes[field.path.attribute]
91
+ end
92
+ end
93
+ end
81
94
  end
82
95
  def end_element(uri, localname, qname)
83
96
  element = @path.elements.last
@@ -86,13 +99,12 @@ module ETL
86
99
  #puts "#{@path} match? #{field.path}"
87
100
  if @path.match?(field.path)
88
101
  #puts "field.path: #{field.path}"
89
- if field.path.is_attribute?
90
- @row[field.name] = element.attributes[field.path.attribute]
91
- else
102
+ if !field.path.is_attribute?
92
103
  @row[field.name] = @value
93
104
  end
94
105
  end
95
106
  end
107
+
96
108
  #puts @path.to_s
97
109
  if @path.match?(@parser.write_trigger)
98
110
  #puts "matched: #{@path} =~ #{@parser.write_trigger}"
data/lib/etl/processor.rb CHANGED
@@ -1,2 +1,3 @@
1
1
  require 'etl/processor/processor'
2
+ require 'etl/processor/row_processor'
2
3
  Dir[File.dirname(__FILE__) + "/processor/*.rb"].each { |file| require(file) }
@@ -3,12 +3,18 @@ module ETL #:nodoc:
3
3
  # Processor which is used to bulk import data into a target database
4
4
  class BulkImportProcessor < ETL::Processor::Processor
5
5
  attr_reader :file, :target, :truncate, :columns
6
+ attr_accessor :field_separator
7
+ attr_accessor :field_enclosure
8
+ attr_accessor :line_separator
6
9
  def initialize(control, configuration)
7
10
  super
8
11
  @file = File.join(File.dirname(control.file), configuration[:file])
9
12
  @target = configuration[:target]
10
13
  @truncate = configuration[:truncate] ||= false
11
14
  @columns = configuration[:columns]
15
+ @field_separator = (configuration[:field_separator] || ',')
16
+ @line_separator = configuration[:line_separator]
17
+ @field_enclosure = configuration[:field_enclosure]
12
18
  connect
13
19
  end
14
20
  def process
@@ -20,6 +26,11 @@ module ETL #:nodoc:
20
26
  conn.truncate(target[:table]) if truncate
21
27
  options = {}
22
28
  options[:columns] = columns
29
+ if field_separator || field_enclosure
30
+ options[:fields] = {}
31
+ options[:fields][:delimited_by] = field_separator if field_separator
32
+ options[:fields][:enclosed_by] = field_enclosure if field_enclosure
33
+ end
23
34
  conn.bulk_load(file, target[:table], options)
24
35
  end
25
36
  end
@@ -27,6 +38,7 @@ module ETL #:nodoc:
27
38
  private
28
39
  # Connect to the database
29
40
  def connect
41
+ Engine.logger.debug "Connecting to database #{target[:database]}"
30
42
  ETL::ActiveRecord::Base.establish_connection(
31
43
  :adapter => (target[:adapter] || :mysql),
32
44
  :username => (target[:username] || 'root'),
@@ -0,0 +1,54 @@
1
module ETL #:nodoc:
  module Processor #:nodoc:
    # Row-level processor which explodes a single row into the multiple rows needed to
    # populate a hierarchy bridge table.
    class HierarchyExploderProcessor < ETL::Processor::RowProcessor
      attr_accessor :id_field, :parent_id_field

      # Initialize the processor
      #
      # Configuration options:
      # * <tt>:connection</tt>: The ActiveRecord adapter connection
      # * <tt>:table</tt>: The name of the table which is queried for child rows
      # * <tt>:id_field</tt>: The name of the id field (defaults to 'id')
      # * <tt>:parent_id_field</tt>: The name of the parent id field (defaults to 'parent_id')
      #
      # TODO: Allow resolver to be implemented in a customizable fashion, i.e. don't rely
      # on AR as the only resolution method.
      def initialize(control, configuration={})
        @id_field = configuration[:id_field] || 'id'
        @parent_id_field = configuration[:parent_id_field] || 'parent_id'
        super
      end

      # Explode the row into hierarchy rows: one for the row itself (level 0) followed
      # by one for each of its descendants. Returns the array of generated rows.
      def process(row)
        exploded = []
        top_id = row[:id]
        is_root = row[:parent_id].nil?
        build_rows([top_id], top_id, top_id, is_root, 0, exploded,
          configuration[:table], configuration[:connection])
        exploded
      end

      protected
      # Recursively append a bridge row for each of the given ids and then descend into
      # the children of each id, one level deeper. Every generated row uses row_id as its
      # :parent_id; only the initial call may flag rows as top level.
      def build_rows(ids, parent_id, row_id, root, level, rows, table, conn)
        top_flag = root ? 1 : 0
        ids.each do |current_id|
          sql = "SELECT #{id_field} FROM #{table} WHERE #{parent_id_field} = #{current_id}"
          children = conn.select_values(sql)
          bottom_flag = children.empty? ? 1 : 0

          rows << {
            :parent_id => row_id,
            :child_id => current_id,
            :num_levels_from_parent => level,
            :is_bottom => bottom_flag,
            :is_top => top_flag,
          }

          build_rows(children, current_id, row_id, false, level + 1, rows, table, conn)
        end
      end
    end
  end
end
@@ -14,11 +14,7 @@ module ETL #:nodoc:
14
14
  @configuration
15
15
  end
16
16
  def log
17
- unless @log
18
- @log = Logger.new(STDOUT)
19
- @log.level = Logger::DEBUG
20
- end
21
- @log
17
+ Engine.logger
22
18
  end
23
19
  end
24
20
  end
@@ -0,0 +1,17 @@
1
module ETL #:nodoc:
  module Processor #:nodoc:
    # Processor which processes a specific row. Unlike a transformer, which deals with a specific
    # value in the row, row processors can process an entire row at once, which can be used to
    # explode a single row into multiple rows (for example)
    class RowProcessor < Processor
      # Initialize the processor; both arguments are passed through to the superclass.
      def initialize(control, configuration)
        super
      end

      # Process the specified row. This implementation always raises; subclasses
      # are expected to override it.
      def process(row)
        raise "process is an abstract method"
      end
    end
  end
end
@@ -11,7 +11,7 @@ module ETL #:nodoc:
11
11
  @format = configuration[:format] || "%Y-%m-%d"
12
12
  end
13
13
  # Transform the value using strftime
14
- def transform(value)
14
+ def transform(name, value, row)
15
15
  value.strftime(@format)
16
16
  end
17
17
  end
@@ -27,7 +27,7 @@ module ETL #:nodoc:
27
27
  end
28
28
 
29
29
  # Transform the value
30
- def transform(value)
30
+ def transform(name, value, row)
31
31
  decode_table[value] || default_value
32
32
  end
33
33
 
@@ -0,0 +1,15 @@
1
module ETL #:nodoc:
  module Transform #:nodoc:
    # Transform which substitutes a configured default for blank (nil or empty) values.
    class DefaultTransform < Transform
      # The replacement used for blank values
      attr_accessor :default_value

      # Initialize the transform
      #
      # Configuration options:
      # * <tt>:default_value</tt>: The value to use in place of a blank value
      def initialize(control, configuration)
        super
        @default_value = configuration[:default_value]
      end

      # Return the default value when the given value is blank, otherwise the value itself.
      def transform(name, value, row)
        return default_value if value.blank?
        value
      end
    end
  end
end
@@ -21,7 +21,7 @@ module ETL #:nodoc:
21
21
  end
22
22
 
23
23
  # Transform the value by resolving it to a foreign key
24
- def transform(value)
24
+ def transform(name, value, row)
25
25
  fk = @collection[value]
26
26
  unless fk
27
27
  raise ResolverError, "Foreign key for #{value} not found and no resolver specified" unless resolver
@@ -0,0 +1,56 @@
1
module ETL #:nodoc:
  module Transform #:nodoc:
    # Transform which walks up the hierarchy tree to find a value if the current level's value
    # is nil.
    #
    # Configuration options:
    # * <tt>:connection</tt>: The database adapter connection (required)
    # * <tt>:parent_id_field</tt>: The name of the parent ID field (defaults to :parent_id)
    #
    # The table which is searched is taken from the configuration of the control's
    # first source.
    #
    # TODO: Let the resolver be implemented in a class so different resolution methods are
    # possible.
    class HierarchyLookupTransform < ETL::Transform::Transform
      # The name of the field to use for the parent ID
      attr_accessor :parent_id_field

      # Initialize the transform
      #
      # Configuration options:
      # * <tt>:connection</tt>: The ActiveRecord adapter (required)
      # * <tt>:parent_id_field</tt>: The name of the field to use for the parent ID (defaults to :parent_id)
      def initialize(control, configuration={})
        super
        @parent_id_field = configuration[:parent_id_field] || :parent_id
      end

      # Transform the value.
      #
      # When the row has no parent ID the value is returned unchanged. Otherwise the
      # parent is looked up and its value is used; while that value is nil the lookup
      # continues with the next ancestor until a value is found or there is no parent left.
      def transform(name, value, row)
        parent_id = row[parent_id_field]
        return value unless parent_id

        # TODO: should use more than just the first source out of the control
        table = control.sources.first.configuration[:table]
        loop do
          parent_id, value = lookup(name, table, parent_id, parent_id_field)
          break if value || parent_id.nil?
        end
        value
      end

      # Lookup the parent value. Note that this method requires that configuration[:connection]
      # is specified, otherwise an ETL::ControlError is raised.
      #
      # Returns a two element array of the parent's own parent ID and the parent's value for
      # the field. Both are nil when no row with the given ID exists, which ends the walk
      # up the hierarchy.
      def lookup(field, table, parent_id, parent_id_field)
        unless configuration.has_key?(:connection)
          raise ETL::ControlError, "The configuration hash must include the database connection"
        end

        q = "SELECT #{parent_id_field}, #{field} FROM #{table} WHERE id = #{parent_id}"
        row = configuration[:connection].select_one(q)
        # a dangling parent ID yields no row
        return [nil, nil] if row.nil?
        return row[parent_id_field.to_s], row[field.to_s]
      end
    end
  end
end