factorylabs-activewarehouse-etl 0.9.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84) hide show
  1. data/CHANGELOG +198 -0
  2. data/LICENSE +7 -0
  3. data/README +85 -0
  4. data/Rakefile +153 -0
  5. data/TODO +28 -0
  6. data/bin/etl +28 -0
  7. data/bin/etl.cmd +8 -0
  8. data/examples/database.example.yml +16 -0
  9. data/lib/etl.rb +78 -0
  10. data/lib/etl/batch.rb +2 -0
  11. data/lib/etl/batch/batch.rb +111 -0
  12. data/lib/etl/batch/directives.rb +55 -0
  13. data/lib/etl/builder.rb +2 -0
  14. data/lib/etl/builder/date_dimension_builder.rb +96 -0
  15. data/lib/etl/builder/time_dimension_builder.rb +31 -0
  16. data/lib/etl/commands/etl.rb +89 -0
  17. data/lib/etl/control.rb +3 -0
  18. data/lib/etl/control/control.rb +405 -0
  19. data/lib/etl/control/destination.rb +420 -0
  20. data/lib/etl/control/destination/database_destination.rb +95 -0
  21. data/lib/etl/control/destination/file_destination.rb +124 -0
  22. data/lib/etl/control/source.rb +109 -0
  23. data/lib/etl/control/source/database_source.rb +220 -0
  24. data/lib/etl/control/source/enumerable_source.rb +11 -0
  25. data/lib/etl/control/source/file_source.rb +90 -0
  26. data/lib/etl/control/source/model_source.rb +39 -0
  27. data/lib/etl/core_ext.rb +1 -0
  28. data/lib/etl/core_ext/time.rb +5 -0
  29. data/lib/etl/core_ext/time/calculations.rb +42 -0
  30. data/lib/etl/engine.rb +556 -0
  31. data/lib/etl/execution.rb +20 -0
  32. data/lib/etl/execution/base.rb +9 -0
  33. data/lib/etl/execution/batch.rb +8 -0
  34. data/lib/etl/execution/job.rb +8 -0
  35. data/lib/etl/execution/migration.rb +85 -0
  36. data/lib/etl/execution/record.rb +18 -0
  37. data/lib/etl/generator.rb +2 -0
  38. data/lib/etl/generator/generator.rb +20 -0
  39. data/lib/etl/generator/surrogate_key_generator.rb +39 -0
  40. data/lib/etl/http_tools.rb +139 -0
  41. data/lib/etl/parser.rb +11 -0
  42. data/lib/etl/parser/apache_combined_log_parser.rb +49 -0
  43. data/lib/etl/parser/delimited_parser.rb +74 -0
  44. data/lib/etl/parser/fixed_width_parser.rb +65 -0
  45. data/lib/etl/parser/parser.rb +41 -0
  46. data/lib/etl/parser/sax_parser.rb +218 -0
  47. data/lib/etl/parser/xml_parser.rb +65 -0
  48. data/lib/etl/processor.rb +11 -0
  49. data/lib/etl/processor/block_processor.rb +14 -0
  50. data/lib/etl/processor/bulk_import_processor.rb +81 -0
  51. data/lib/etl/processor/check_exist_processor.rb +80 -0
  52. data/lib/etl/processor/check_unique_processor.rb +35 -0
  53. data/lib/etl/processor/copy_field_processor.rb +26 -0
  54. data/lib/etl/processor/encode_processor.rb +55 -0
  55. data/lib/etl/processor/hierarchy_exploder_processor.rb +55 -0
  56. data/lib/etl/processor/print_row_processor.rb +12 -0
  57. data/lib/etl/processor/processor.rb +25 -0
  58. data/lib/etl/processor/rename_processor.rb +24 -0
  59. data/lib/etl/processor/require_non_blank_processor.rb +26 -0
  60. data/lib/etl/processor/row_processor.rb +17 -0
  61. data/lib/etl/processor/sequence_processor.rb +23 -0
  62. data/lib/etl/processor/surrogate_key_processor.rb +53 -0
  63. data/lib/etl/processor/truncate_processor.rb +35 -0
  64. data/lib/etl/row.rb +20 -0
  65. data/lib/etl/screen.rb +14 -0
  66. data/lib/etl/screen/row_count_screen.rb +20 -0
  67. data/lib/etl/transform.rb +2 -0
  68. data/lib/etl/transform/block_transform.rb +13 -0
  69. data/lib/etl/transform/date_to_string_transform.rb +20 -0
  70. data/lib/etl/transform/decode_transform.rb +51 -0
  71. data/lib/etl/transform/default_transform.rb +20 -0
  72. data/lib/etl/transform/foreign_key_lookup_transform.rb +151 -0
  73. data/lib/etl/transform/hierarchy_lookup_transform.rb +49 -0
  74. data/lib/etl/transform/ordinalize_transform.rb +12 -0
  75. data/lib/etl/transform/sha1_transform.rb +13 -0
  76. data/lib/etl/transform/string_to_date_transform.rb +16 -0
  77. data/lib/etl/transform/string_to_datetime_transform.rb +14 -0
  78. data/lib/etl/transform/string_to_time_transform.rb +11 -0
  79. data/lib/etl/transform/transform.rb +61 -0
  80. data/lib/etl/transform/trim_transform.rb +26 -0
  81. data/lib/etl/transform/type_transform.rb +35 -0
  82. data/lib/etl/util.rb +59 -0
  83. data/lib/etl/version.rb +9 -0
  84. metadata +195 -0
@@ -0,0 +1,20 @@
1
module ETL #:nodoc:
  # Classes which store information about ETL execution
  module Execution
    # Entry point for managing the execution meta-data store.
    class Execution
      # Migrate the data store by delegating to
      # ETL::Execution::Migration.migrate; that call's result is returned.
      def self.migrate
        ETL::Execution::Migration.migrate
      end
    end
  end
end
15
+
16
+ require 'etl/execution/base'
17
+ require 'etl/execution/batch'
18
+ require 'etl/execution/job'
19
+ require 'etl/execution/record'
20
+ require 'etl/execution/migration'
@@ -0,0 +1,9 @@
1
module ETL #:nodoc:
  module Execution #:nodoc:
    # Abstract ActiveRecord parent class for the ETL execution information
    # models. It is flagged as abstract and establishes its connection from
    # the configuration named :etl_execution.
    class Base < ActiveRecord::Base
      self.abstract_class = true
      establish_connection(:etl_execution)
    end
  end
end
@@ -0,0 +1,8 @@
1
module ETL #:nodoc:
  module Execution #:nodoc:
    # Persistent model for one ETL batch; declares a has_many association
    # named :jobs.
    class Batch < Base
      has_many(:jobs)
    end
  end
end
@@ -0,0 +1,8 @@
1
module ETL #:nodoc:
  module Execution #:nodoc:
    # Persistent model for one ETL job; declares a belongs_to association
    # named :batch.
    class Job < Base
      belongs_to(:batch)
    end
  end
end
@@ -0,0 +1,85 @@
1
module ETL #:nodoc:
  module Execution #:nodoc:
    # Handles migration of the tables required for persistent storage of meta
    # data for the ETL engine.
    #
    # Each schema step is a private +migration_N+ singleton method; +migrate+
    # runs the steps numbered above the highest version already recorded, up
    # to and including +target+.
    class Migration
      class << self
        # Execute the outstanding migrations in ascending order, recording
        # each version through the connection right after its step has run.
        def migrate
          connection.initialize_schema_migrations_table
          last_migration.upto(target - 1) do |applied|
            pending = applied + 1
            __send__(:"migration_#{pending}")
            connection.assume_migrated_upto_version(pending)
          end
        end

        protected

        # Get the schema info table name, as reported by
        # ActiveRecord::Migrator.schema_migrations_table_name.
        def schema_info_table_name
          ActiveRecord::Migrator.schema_migrations_table_name
        end
        alias_method :schema_migrations_table_name, :schema_info_table_name

        # Highest version stored in the schema migrations table, or 0 when
        # the table holds no rows.
        def last_migration
          recorded = connection.select_values(
            "SELECT version FROM #{schema_migrations_table_name}"
          )
          recorded.map { |version| version.to_i }.max || 0
        end

        # Get the connection to use during migration (memoized from
        # ETL::Execution::Base.connection).
        def connection
          @connection ||= ETL::Execution::Base.connection
        end

        # Get the final target version number
        def target
          4
        end

        private

        # Step 1: create the jobs and records tables.
        def migration_1 #:nodoc:
          connection.create_table(:jobs) do |jobs|
            jobs.column(:control_file, :string, :null => false)
            jobs.column(:created_at, :datetime, :null => false)
            jobs.column(:completed_at, :datetime)
            jobs.column(:status, :string)
          end
          connection.create_table(:records) do |records|
            records.column(:control_file, :string, :null => false)
            records.column(:natural_key, :string, :null => false)
            records.column(:crc, :string, :null => false)
            records.column(:job_id, :integer, :null => false)
          end
        end

        # Step 2: index the records table.
        def migration_2 #:nodoc:
          [:control_file, :natural_key, :job_id].each do |indexed_column|
            connection.add_index(:records, indexed_column)
          end
        end

        # Step 3: create the batches table and link jobs to it.
        def migration_3 #:nodoc:
          connection.create_table(:batches) do |batches|
            batches.column(:batch_file, :string, :null => false)
            batches.column(:created_at, :datetime, :null => false)
            batches.column(:completed_at, :datetime)
            batches.column(:status, :string)
          end
          connection.add_column(:jobs, :batch_id, :integer)
          connection.add_index(:jobs, :batch_id)
        end

        # Step 4: drop the records table.
        def migration_4 #:nodoc:
          connection.drop_table(:records)
        end

        # Update the schema info table, setting the version value.
        # NOTE(review): nothing inside this class calls this method.
        def update_schema_info(version)
          connection.update("UPDATE #{schema_info_table_name} SET version = #{version}")
        end
      end
    end
  end
end
@@ -0,0 +1,18 @@
1
module ETL #:nodoc:
  module Execution #:nodoc:
    # Represents a single record
    class Record < ETL::Execution::Base
      belongs_to(:table)

      class << self
        # Writer for the class-level time counter; the reader is defined
        # explicitly below so that it can default to 0.
        attr_writer :time_spent

        # Time spent so far, defaulting to 0 until a value is assigned.
        def time_spent
          @time_spent ||= 0
        end

        # Returns 0 while no time has been recorded; otherwise
        # ETL::Engine.rows_read divided by time_spent.
        # NOTE(review): despite the name this yields rows per unit of time,
        # not time per row - confirm which one callers expect.
        def average_time_spent
          spent = time_spent
          spent == 0 ? 0 : ETL::Engine.rows_read / spent
        end
      end
    end
  end
end
@@ -0,0 +1,2 @@
1
+ require 'etl/generator/generator'
2
+ Dir[File.dirname(__FILE__) + "/generator/*.rb"].each { |file| require(file) }
@@ -0,0 +1,20 @@
1
module ETL #:nodoc:
  module Generator #:nodoc:
    # Base class for generators.
    class Generator
      # Get the Class for the specified name.
      #
      # The name is camelized, suffixed with "Generator" and looked up inside
      # ETL::Generator; for example :surrogate_key resolves to
      # SurrogateKeyGenerator.
      def self.class_for_name(name)
        class_name = "#{name.to_s.camelize}Generator"
        ETL::Generator.const_get(class_name)
      end

      # Generate the next value. Subclasses must override this method; the
      # base implementation always raises.
      def next
        raise RuntimeError, "Must be implemented by a subclass"
      end
    end
  end
end
@@ -0,0 +1,39 @@
1
+ # This source file contains code for a basic sequential surrogate key generator
2
+
3
module ETL #:nodoc:
  module Generator #:nodoc:
    # Surrogate key generator producing sequential integer keys.
    class SurrogateKeyGenerator < Generator
      attr_reader :table, :target, :column, :query

      # Initialize the generator
      #
      # Options read from the hash:
      # * <tt>:table</tt>: when set, the counter starts from max(column) of
      #   that table
      # * <tt>:target</tt>: passed to ETL::Engine.connection to obtain the
      #   connection
      # * <tt>:column</tt>: column used for the max lookup, defaults to 'id'
      # * <tt>:query</tt>: used when no table is given; its select_value
      #   result seeds the counter
      #
      # With neither table nor query, or a blank lookup result, the counter
      # starts at 0.
      def initialize(options={})
        @table  = options[:table]
        @target = options[:target]
        @column = options[:column] || 'id'
        @query  = options[:query]

        seed =
          if table
            ETL::Engine.connection(target).select_value("SELECT max(#{column}) FROM #{table_name}")
          elsif query
            ETL::Engine.connection(target).select_value(query)
          end
        @surrogate_key = seed.blank? ? 0 : seed.to_i
      end

      # Get the next surrogate key (the counter after incrementing it by one)
      def next
        @surrogate_key = (@surrogate_key || 0) + 1
      end

      # Table name as resolved by ETL::Engine.table for the target connection
      def table_name
        ETL::Engine.table(table, ETL::Engine.connection(target))
      end
    end
  end
end
@@ -0,0 +1,139 @@
1
+ require 'uri'
2
+
3
+ # Module which has utility methods for HTTP.
4
module HttpTools
  # Parse the given user agent string
  #
  # Code taken from http://gemtacular.com/gems/ParseUserAgent
  #
  # * <tt>user_agent</tt>: the raw User-Agent header value (a String)
  #
  # Returns a Hash with the keys :browser, :browser_version_major,
  # :browser_version_minor, :ostype, :os_version and :os. Values that could
  # not be determined, or that are blank, are nil; the rest are stripped
  # strings. Relies on Object#blank? being available (ActiveSupport).
  def parse_user_agent(user_agent)
    browser, browser_version_major, browser_version_minor, ostype, os, os_version = nil

    # fix Opera: rewrite "Opera 9.x" as "Opera/9.x" so that it is picked up
    # as an Agent/version token below. The result has to be used for the rest
    # of the parsing, otherwise the normalisation has no effect.
    user_agent = user_agent.gsub(/Opera (\d)/, 'Opera/\1')

    # cycle through all Agent/version tokens to set browser and version; the
    # last token wins (MSIE is set later, from the properties)
    user_agent.split(/\s+/).each do |token|
      next unless token =~ /\//
      name, version = token.split('/')
      # Tokens such as "Foo/" or "/" carry no version; skip them instead of
      # failing on a nil version.
      next if version.nil?

      browser = name
      if browser == 'Firefox'
        browser_version_major = version.slice(0, 3)
        browser_version_minor = version.sub(browser_version_major, '').sub('.', '')
      elsif browser == 'Safari'
        # Safari reports a build number; builds below 400 count as major
        # version 1, everything else as 2.
        browser_version_major = version.slice(0, 3).to_f < 400 ? '1' : '2'
      else
        browser_version_major = version.slice(0, 1)
      end
    end

    # grab all of the properties (within parens): every token found outside
    # the parenthesised section is removed from the string, then the parens
    # themselves are dropped
    detail = user_agent
    user_agent.gsub(/\((.*)\)/, '').split(/\s/).each { |part| detail = detail.gsub(part, '') }
    detail = detail.gsub('(', '').gsub(')', '').lstrip
    properties = detail.split(/;\s+/)

    # cycle through the properties to set known quantities
    properties.each do |property|
      if property =~ /^Win/
        ostype = 'Windows'
        os = property
        release = property.split(/ /, 2)[1]
        if release.to_s =~ /^NT/
          nt_version = release.split(/ /, 2)[1]
          os_version = case nt_version
                       when '5'   then '2000'
                       when '5.1' then 'XP'
                       else nt_version
                       end
        end
      end
      if property == 'Macintosh'
        ostype = 'Macintosh'
        os = property
      end
      if property =~ /OS X/
        ostype = 'Macintosh'
        os_version = 'OS X'
        os = property
      end
      if property =~ /^Linux/
        ostype = 'Linux'
        os = property
      end
      if property =~ /^MSIE/
        browser = 'MSIE'
        browser_version = property.gsub('MSIE ', '').lstrip
        browser_version_major, browser_version_minor = browser_version.split('.')
      end
    end

    result = {
      :browser => browser,
      :browser_version_major => browser_version_major,
      :browser_version_minor => browser_version_minor,
      :ostype => ostype,
      :os_version => os_version,
      :os => os,
    }
    # Normalise: blank values become nil, everything else is stripped.
    result.each do |key, value|
      result[key] = value.blank? ? nil : value.strip
    end
    result
  end

  # Parse a URI. If options[:prefix] is set then prepend it to the keys for the hash that
  # is returned.
  #
  # Returns a Hash with the (prefixed) symbol keys scheme, host, port,
  # uri_path and domain. All five keys are always present; every value is nil
  # when uri_string is nil/false or cannot be parsed, and domain is nil when
  # the host does not contain two dot-separated labels. The options hash
  # passed in is not modified.
  def parse_uri(uri_string, options={})
    prefix = options[:prefix] || ''
    empty_hash = {
      "#{prefix}scheme".to_sym => nil,
      "#{prefix}host".to_sym => nil,
      "#{prefix}port".to_sym => nil,
      "#{prefix}uri_path".to_sym => nil,
      "#{prefix}domain".to_sym => nil
    }
    return empty_hash unless uri_string

    # attempt to parse the uri -- if it's not a valid uri then catch the
    # problem and set everything to nil
    begin
      uri = URI.parse(uri_string)
      host = uri.host
      {
        "#{prefix}scheme".to_sym => uri.scheme,
        "#{prefix}host".to_sym => host,
        "#{prefix}port".to_sym => uri.port,
        "#{prefix}uri_path".to_sym => uri.path,
        # last two labels of the host, e.g. "example.com" for "www.example.com"
        "#{prefix}domain".to_sym => host && host[/\.?([^\.]+\.[^\.]+$)/, 1]
      }
    rescue
      empty_hash
    end
  end
end
data/lib/etl/parser.rb ADDED
@@ -0,0 +1,11 @@
1
+ # This source file contains the ETL::Parser module and requires all of the files
2
+ # in the parser directory ending with .rb
3
+
4
module ETL #:nodoc:
  # The ETL::Parser module is the namespace for the various text parsers.
  module Parser; end
end
9
+
10
+ require 'etl/parser/parser'
11
+ Dir[File.dirname(__FILE__) + "/parser/*.rb"].each { |file| require(file) }
@@ -0,0 +1,49 @@
1
module ETL #:nodoc:
  module Parser #:nodoc:
    # Parser which can parse the Apache Combined Log Format as defined at
    # http://httpd.apache.org/docs/2.2/logs.html
    class ApacheCombinedLogParser < ETL::Parser::Parser
      include HttpTools

      # Initialize the parser; all arguments are handed to the superclass.
      def initialize(source, options={})
        super
      end

      # Yield one parsed Hash (see #parse) for each line of every file that
      # matches the +file+ glob. +file+ is not defined in this class;
      # presumably it is provided by the superclass.
      def each
        Dir.glob(file).each do |path|
          # Block form of File.open so the handle is closed once the file has
          # been read, even when the consumer raises.
          File.open(path) do |io|
            io.each_line { |line| yield parse(line) }
          end
        end
      end

      # Parse a single log line into a Hash.
      #
      # The Hash holds :ip_address, :identd, :user, :timestamp (converted to
      # a Time when it can be parsed), :request, :response_code, :bytes,
      # :referrer, :user_agent, :method and :path, merged with the results of
      # parse_user_agent (when a user agent is present) and of parse_uri for
      # the referrer (keys prefixed with referrer_). Any value equal to '-'
      # is replaced with nil. A line that does not match the format yields
      # nil for all of the captured fields rather than raising.
      def parse(line)
        # example line: 127.0.0.1 - frank [10/Oct/2000:13:55:36 -0700] "GET /apache_pb.gif HTTP/1.0" 200 2326 "http://www.example.com/start.html" "Mozilla/4.08 [en] (Win98; I ;Nav)"
        line =~ /^(\S+)\s(\S+)\s(\S+)\s\[([^\]]*)\]\s"([^"]*)"\s(\d*)\s(\d*)\s"([^"]*)"\s"([^"]*)"$/
        fields = {
          :ip_address => $1,
          :identd => $2,
          :user => $3,
          :timestamp => $4,
          :request => $5,
          :response_code => $6,
          :bytes => $7,
          :referrer => $8,
          :user_agent => $9,
        }

        # Only the date and time portion is parsed; the trailing zone offset
        # is not part of the format string.
        d = Date._strptime(fields[:timestamp], '%d/%b/%Y:%H:%M:%S') unless fields[:timestamp].nil?
        fields[:timestamp] = Time.mktime(d[:year], d[:mon], d[:mday], d[:hour], d[:min], d[:sec], d[:sec_fraction]) unless d.nil?

        # to_s keeps an unmatched line (nil request) from raising; both
        # values are then nil, like the other fields of such a line.
        fields[:method], fields[:path] = fields[:request].to_s.split(/\s/)

        fields.merge!(parse_user_agent(fields[:user_agent])) unless fields[:user_agent].nil?
        fields.merge!(parse_uri(fields[:referrer], :prefix => 'referrer_'))

        # '-' is the log format's placeholder for a missing value
        fields.each do |key, value|
          fields[key] = nil if value == '-'
        end
        fields
      end

    end
  end
end
@@ -0,0 +1,74 @@
1
module ETL #:nodoc:
  module Parser #:nodoc:
    # Parses delimited files
    class DelimitedParser < ETL::Parser::Parser
      # Initialize the parser
      # * <tt>source</tt>: The Source object
      # * <tt>options</tt>: Hash of options for the parser, defaults to an empty hash
      #
      # After the superclass has been initialized the field list is built
      # from the source definition.
      def initialize(source, options={})
        super
        configure
      end

      # Returns each row.
      #
      # Every file matching the +file+ glob is read with FasterCSV. The first
      # source.skip_lines rows of each file are skipped; every following row
      # is validated against the field definitions and yielded as a Hash
      # keyed by field name.
      def each
        Dir.glob(file).each do |path|
          ETL::Engine.logger.debug "parsing #{path}"
          row_number = 0
          skipped = 0
          FasterCSV.foreach(path, options) do |values|
            if skipped < source.skip_lines
              ETL::Engine.logger.debug "skipping line"
              skipped += 1
            else
              row_number += 1
              validate_row(values, row_number, path)
              record = {}
              values.each_with_index do |value, position|
                record[fields[position].name] = value
              end
              yield record
            end
          end
        end
      end

      # Get an array of defined fields
      def fields
        @fields ||= []
      end

      private

      # Raise (through raise_with_info) a MismatchError when the row does not
      # have exactly as many columns as there are defined fields.
      def validate_row(row, line, file)
        ETL::Engine.logger.debug "validating line #{line} in file #{file}"
        return if row.length == fields.length
        raise_with_info(
          MismatchError,
          "The number of columns from the source (#{row.length}) does not match the number of columns in the definition (#{fields.length})",
          line, file
        )
      end

      # Build the field list from source.definition: a Symbol names the field
      # directly, a Hash supplies the name under :name, and anything else
      # raises a DefinitionError.
      def configure
        source.definition.each do |definition|
          field_name = case definition
                       when Symbol then definition
                       when Hash then definition[:name]
                       else
                         raise DefinitionError, "Each field definition must either be a symbol or a hash"
                       end
          fields << Field.new(field_name)
        end
      end

      # Simple holder for the name of one column.
      class Field #:nodoc:
        attr_reader :name

        def initialize(name)
          @name = name
        end
      end
    end
  end
end