activewarehouse-etl 0.4.0 → 0.5.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,7 +1,34 @@
1
- module ETL
2
- module Control
1
+ module ETL #:nodoc:
2
+ module Control #:nodoc:
3
+ # Destination which writes directly to a database. This is useful when you are dealing with
4
+ # a small amount of data. For larger amounts of data you should probably use the bulk
5
+ # loader if it is supported with your target database as it will use a much faster load
6
+ # method.
3
7
  class DatabaseDestination < Destination
4
- attr_reader :order, :truncate
8
+ # Specify the order from the source
9
+ attr_reader :order
10
+
11
+ # Set to true to truncate the destination table first
12
+ attr_reader :truncate
13
+
14
+ # Initialize the database destination
15
+ #
16
+ # * <tt>control</tt>: The ETL::Control::Control instance
17
+ # * <tt>configuration</tt>: The configuration Hash
18
+ # * <tt>mapping</tt>: The mapping
19
+ #
20
+ # Configuration options:
21
+ # * <tt>:database</tt>: The database name (REQUIRED)
22
+ # * <tt>:table</tt>: The table to write to (REQUIRED)
23
+ # * <tt>:truncate</tt>: Set to true to truncate before writing (defaults to false)
24
+ # * <tt>:unique</tt>: Set to true to only insert unique records (defaults to false)
25
+ # * <tt>:adapter</tt>: The adapter to use (defaults to :mysql)
26
+ # * <tt>:username</tt>: The database username (defaults to 'root')
27
+ # * <tt>:password</tt>: The password to the database (defaults to nothing)
28
+ # * <tt>:host</tt>: The host for the database (defaults to 'localhost')
29
+ #
30
+ # Mapping options:
31
+ # * <tt>:order</tt>: The order of fields to write (REQUIRED)
5
32
  def initialize(control, configuration, mapping)
6
33
  super
7
34
  @truncate = configuration[:truncate] ||= false
@@ -11,8 +38,9 @@ module ETL
11
38
  connect
12
39
  end
13
40
 
41
+ # Flush the currently buffered data
14
42
  def flush
15
- conn = ActiveRecord::Base.connection
43
+ conn = ETL::ActiveRecord::Base.connection
16
44
  conn.transaction do
17
45
  conn.truncate(configuration[:table]) if truncate
18
46
 
@@ -32,7 +60,7 @@ module ETL
32
60
  end
33
61
  q = "INSERT INTO #{configuration[:table]} (#{names.join(',')}) VALUES (#{values.join(',')})"
34
62
  # ETL::Engine.logger.debug("Query: #{q}")
35
- conn.execute(q, "Insert row #{current_row}")
63
+ conn.insert(q, "Insert row #{current_row}")
36
64
  @current_row += 1
37
65
  end
38
66
  buffer.clear
@@ -42,12 +70,22 @@ module ETL
42
70
  # Close the connection
43
71
  def close
44
72
  flush
45
- ActiveRecord::Base.connection.disconnect!
73
+ ETL::ActiveRecord::Base.connection.disconnect!
46
74
  end
47
75
 
48
76
  private
77
+ # Connect to the database.
78
+ #
79
+ # Required options:
80
+ # * <tt>:database</tt>: The database name
81
+ #
82
+ # Options:
83
+ # * <tt>:adapter</tt>: The adapter to use (defaults to :mysql)
84
+ # * <tt>:username</tt>: The database username (defaults to 'root')
85
+ # * <tt>:password</tt>: The password to the database (defaults to nothing)
86
+ # * <tt>:host</tt>: The host for the database (defaults to 'localhost')
49
87
  def connect
50
- ActiveRecord::Base.establish_connection(
88
+ ETL::ActiveRecord::Base.establish_connection(
51
89
  :adapter => (configuration[:adapter] || :mysql),
52
90
  :username => (configuration[:username] || 'root'),
53
91
  :host => (configuration[:host] || 'localhost'),
@@ -2,13 +2,39 @@ module ETL #:nodoc:
2
2
  module Control #:nodoc:
3
3
  # File as the final destination.
4
4
  class FileDestination < Destination
5
- attr_reader :file, :order
6
- attr_accessor :append, :separator, :eol, :enclose
5
+ # The File to write to
6
+ attr_reader :file
7
+
8
+ # The output order
9
+ attr_reader :order
10
+
11
+ # Flag which indicates to append (default is to overwrite)
12
+ attr_accessor :append
13
+
14
+ # The separator
15
+ attr_accessor :separator
16
+
17
+ # The end of line marker
18
+ attr_accessor :eol
19
+
20
+ # The enclosure character
21
+ attr_accessor :enclose
7
22
 
8
23
  # Initialize the object.
9
24
  # * <tt>control</tt>: The Control object
10
25
  # * <tt>configuration</tt>: The configuration map
11
26
  # * <tt>mapping</tt>: The output mapping
27
+ #
28
+ # Configuration options:
29
+ # * <tt>:file</tt>: The file to write to (REQUIRED)
30
+ # * <tt>:append</tt>: Set to true to append to the file (default is to overwrite)
31
+ # * <tt>:separator</tt>: Record separator (default is a comma)
32
+ # * <tt>:eol</tt>: End of line marker (default is \n)
33
+ # * <tt>:enclose</tt>: Enclosure character (default is none)
34
+ # * <tt>:unique</tt>: Set to true to only write unique records
35
+ #
36
+ # Mapping options:
37
+ # * <tt>:order</tt>: The order array
12
38
  def initialize(control, configuration, mapping)
13
39
  super
14
40
  @file = File.join(File.dirname(control.file), configuration[:file])
@@ -57,7 +83,6 @@ module ETL #:nodoc:
57
83
  end
58
84
 
59
85
  private
60
-
61
86
  # Get the open file stream
62
87
  def f
63
88
  @f ||= open(file, mode)
@@ -67,7 +92,6 @@ module ETL #:nodoc:
67
92
  def mode
68
93
  append ? 'a' : 'w'
69
94
  end
70
-
71
95
  end
72
96
  end
73
97
  end
@@ -3,9 +3,20 @@ module ETL #:nodoc:
3
3
  # ETL source. Subclasses must implement the <tt>each</tt> method.
4
4
  class Source
5
5
  include Enumerable
6
- attr_accessor :control, :configuration, :definition
6
+
7
+ # The control object
8
+ attr_accessor :control
9
+
10
+ # The configuration Hash
11
+ attr_accessor :configuration
12
+
13
+ # The definition Hash
14
+ attr_accessor :definition
7
15
 
8
16
  class << self
17
+ # Convert the name to a Source class.
18
+ #
19
+ # For example if name is :database then this will return a DatabaseSource class
9
20
  def class_for_name(name)
10
21
  ETL::Control.const_get("#{name.to_s.classify}Source")
11
22
  end
@@ -20,6 +31,10 @@ module ETL #:nodoc:
20
31
  @configuration = configuration
21
32
  @definition = definition
22
33
  end
34
+
35
+ def errors
36
+ @errors ||= []
37
+ end
23
38
  end
24
39
  end
25
40
  end
@@ -1,23 +1,53 @@
1
- module ETL
2
- module Control
1
+ module ETL #:nodoc:
2
+ module Control #:nodoc:
3
+ # Source object which extracts data from a database using ActiveRecord.
3
4
  class DatabaseSource < Source
5
+ # Initialize the source.
6
+ #
7
+ # Arguments:
8
+ # * <tt>control</tt>: The ETL::Control::Control instance
9
+ # * <tt>configuration</tt>: The configuration Hash
10
+ # * <tt>definition</tt>: The source definition
11
+ #
12
+ # Required configuration options:
13
+ # * <tt>:table</tt>: The source table name
14
+ # * <tt>:database</tt>: The database name
15
+ #
16
+ # Other options:
17
+ # * <tt>:adapter</tt>: The adapter to use (defaults to :mysql)
18
+ # * <tt>:username</tt>: The database username (defaults to 'root')
19
+ # * <tt>:password</tt>: The password to the database (defaults to nothing)
20
+ # * <tt>:host</tt>: The host for the database (defaults to 'localhost')
4
21
  def initialize(control, configuration, definition)
5
22
  super
6
23
  connect
7
24
  end
8
25
 
26
+ def to_s
27
+ "#{configuration[:host]}/#{configuration[:database]}"
28
+ end
29
+
9
30
  # Returns each row from the source
10
31
  def each
11
- conn = ActiveRecord::Base.connection
32
+ conn = ETL::ActiveRecord::Base.connection
12
33
  conn.select_all("SELECT * FROM #{configuration[:table]}").each do |row|
13
34
  yield row
14
35
  end
15
36
  end
16
37
 
17
38
  private
39
+ # Connect to the database.
40
+ #
41
+ # Required options:
42
+ # * <tt>:database</tt>: The database name
43
+ #
44
+ # Options:
45
+ # * <tt>:adapter</tt>: The adapter to use (defaults to :mysql)
46
+ # * <tt>:username</tt>: The database username (defaults to 'root')
47
+ # * <tt>:password</tt>: The password to the database (defaults to nothing)
48
+ # * <tt>:host</tt>: The host for the database (defaults to 'localhost')
18
49
  def connect
19
- # set up the DB connection
20
- ActiveRecord::Base.establish_connection(
50
+ ETL::ActiveRecord::Base.establish_connection(
21
51
  :adapter => (configuration[:adapter] || :mysql),
22
52
  :username => (configuration[:username] || 'root'),
23
53
  :host => (configuration[:host] || 'localhost'),
@@ -1,19 +1,49 @@
1
- module ETL
2
- module Control
1
+ module ETL #:nodoc:
2
+ module Control #:nodoc:
3
+ # A File source.
3
4
  class FileSource < Source
5
+ # The number of lines to skip, default is 0
4
6
  attr_accessor :skip_lines
7
+
8
+ # Accessor for the underlying parser
9
+ attr_accessor :parser
10
+
11
+ # Initialize the source
12
+ #
13
+ # Configuration options:
14
+ # * <tt>:parser</tt>: One of the following: a parser name as a String or symbol, a class which extends from Parser,
15
+ # a Hash with :name and optionally an :options key. Whether or not the parser uses the options is dependent on
16
+ # which parser is used. See the documentation for each parser for information on what options it accepts.
17
+ # * <tt>:skip_lines</tt>: The number of lines to skip (defaults to 0)
5
18
  def initialize(control, configuration, definition)
6
19
  super
7
20
  configure
8
21
  end
22
+
23
+ def to_s
24
+ configuration[:file]
25
+ end
26
+
9
27
  # Returns each row from the source
10
28
  def each
11
29
  @parser.each { |row| yield row }
12
30
  end
13
31
 
14
32
  private
33
+ # Configure the source
15
34
  def configure
16
- @parser = ETL::Parser::Parser.class_for_name(@configuration[:parser]).new(self)
35
+ case @configuration[:parser]
36
+ when Class
37
+ @parser = @configuration[:parser].new(self)
38
+ when String, Symbol
39
+ @parser = ETL::Parser::Parser.class_for_name(@configuration[:parser]).new(self)
40
+ when Hash
41
+ name = @configuration[:parser][:name]
42
+ options = @configuration[:parser][:options]
43
+ @parser = ETL::Parser::Parser.class_for_name(name).new(self, options)
44
+ else
45
+ raise ControlError, "Configuration option :parser must be a Class, String or Symbol"
46
+ end
17
47
  @skip_lines = @configuration[:skip_lines] ||= 0
18
48
  end
19
49
  end
@@ -1,61 +1,181 @@
1
- module ETL
1
+ module ETL #:nodoc:
2
+ module ActiveRecord #:nodoc:
3
+ # Base class which is used for ActiveRecord connections. This is necessary
4
+ # since AR connections are tied to the class, and using ActiveRecord::Base
5
+ # directly can cause problems if the connection is closed.
6
+ class Base < ::ActiveRecord::Base
7
+ end
8
+ end
9
+
10
+ # The main ETL engine class
2
11
  class Engine
3
12
 
4
13
  class << self
14
+ # Process the specified control file. Acceptable values for control_file are
15
+ # * Path to a file
16
+ # * File object
17
+ # * ETL::Control::Control instance
5
18
  def process(control_file)
6
19
  new().process(control_file)
7
20
  end
8
21
 
22
+ # A logger for the engine
9
23
  attr_accessor :logger
10
24
 
11
- def logger
25
+ def logger #:nodoc:
12
26
  unless @logger
13
27
  @logger = Logger.new('etl.log')
14
- @logger.level = Logger::DEBUG
28
+ @logger.level = Logger::WARN
15
29
  end
16
30
  @logger
17
31
  end
32
+
33
+ attr_accessor :current_source
34
+ attr_accessor :current_source_row
35
+ attr_accessor :current_destination
36
+
37
+ attr_accessor :realtime_activity
38
+ end
39
+
40
+ def say(message)
41
+ say_without_newline(message + "\n")
42
+ end
43
+
44
+ def say_without_newline(message)
45
+ if Engine.realtime_activity
46
+ $stdout.print message
47
+ $stdout.flush
48
+ end
49
+ end
50
+
51
+ def say_on_own_line(message)
52
+ say("\n" + message)
18
53
  end
19
54
 
20
- # Process a control file or object.
55
+ # Process a control file or object. Acceptable values for control are:
56
+ # * Path to a file
57
+ # * File object
58
+ # * ETL::Control::Control instance
21
59
  def process(control)
60
+ start_time = Time.now
22
61
  control = ETL::Control::Control.resolve(control)
23
62
 
63
+ Engine.logger.debug "Pre-processing #{control.file}"
24
64
  pre_process(control)
65
+ Engine.logger.debug "Pre-processing complete"
25
66
 
26
67
  sources = control.sources
27
68
  destinations = control.destinations
28
69
 
29
70
  sources.each do |source|
71
+ Engine.current_source = source
72
+ Engine.logger.debug "Processing source #{source}"
73
+ say "Source: #{source}"
30
74
  source.each_with_index do |row, index|
31
- row.each do |name, value|
75
+ Engine.current_source_row = index + 1
76
+ if Engine.realtime_activity && index % 1000 == 0
77
+ say_without_newline "."
78
+ end
79
+
80
+ begin
32
81
  # execute transforms
33
- row[name] = ETL::Transform::Transform.transform(name, value, control.get_transform(name))
82
+ row.each do |name, value|
83
+ row[name] = ETL::Transform::Transform.transform(name, value, control.transform(name))
84
+ end
85
+ rescue => e
86
+ msg = "Error transforming from #{source} on line #{index}: #{e}"
87
+ source.errors << msg
88
+ Engine.logger.error msg
34
89
  end
35
- # write the row to the destination
36
- destinations.each do |destination|
37
- destination.write(row)
90
+
91
+ begin
92
+ # write the row to the destination
93
+ destinations.each do |destination|
94
+ Engine.current_destination = destination
95
+ destination.write(row)
96
+ end
97
+ rescue
98
+ msg = "Error writing to #{destination} on line #{index}"
99
+ destination.errors << msg
100
+ Engine.logger.error msg
38
101
  end
39
102
  end
103
+ say_on_own_line "Processed #{Engine.current_source_row} rows in #{distance_of_time_in_words(start_time)}"
40
104
  destinations.each do |destination|
41
105
  destination.close
42
106
  end
43
107
  end
44
108
 
109
+ Engine.logger.debug "Post-processing #{control.file}"
45
110
  post_process(control)
111
+ Engine.logger.debug "Post-processing complete"
46
112
  end
47
113
 
48
114
  private
115
+ # Execute all preprocessors
49
116
  def pre_process(control)
50
117
  control.pre_processors.each do |processor|
51
118
  processor.process
52
119
  end
53
120
  end
54
121
 
122
+ # Execute all postprocessors
55
123
  def post_process(control)
56
124
  control.post_processors.each do |processor|
57
125
  processor.process
58
126
  end
59
127
  end
128
+
129
+ # Return the distance of time in words from the given from_time to the specified to_time. If to_time
130
+ # is not specified then Time.now is used. Seconds are always included in the result; this
131
+ # method takes no include_seconds argument.
132
+ def distance_of_time_in_words(from_time, to_time=Time.now)
133
+ from_time = from_time.to_time if from_time.respond_to?(:to_time)
134
+ to_time = to_time.to_time if to_time.respond_to?(:to_time)
135
+ seconds = (to_time - from_time).round
136
+ distance_in_days = (seconds/(60*60*24)).round
137
+ seconds = seconds % (60*60*24)
138
+ distance_in_hours = (seconds/(60*60)).round
139
+ seconds = seconds % (60*60)
140
+ distance_in_minutes = (seconds/60).round
141
+ seconds = seconds % 60
142
+ distance_in_seconds = seconds
143
+
144
+ s = ''
145
+ s << "#{distance_in_days} days," if distance_in_days > 0
146
+ s << "#{distance_in_hours} hours, " if distance_in_hours > 0
147
+ s << "#{distance_in_minutes} minutes, " if distance_in_minutes > 0
148
+ s << "#{distance_in_seconds} seconds"
149
+ s
150
+ end
151
+
152
+ def approximate_distance_of_time_in_words(from_time, to_time=Time.now, include_seconds=true)
153
+ from_time = from_time.to_time if from_time.respond_to?(:to_time)
154
+ to_time = to_time.to_time if to_time.respond_to?(:to_time)
155
+ distance_in_minutes = (((to_time - from_time).abs)/60).round
156
+ distance_in_seconds = ((to_time - from_time).abs).round
157
+
158
+ case distance_in_minutes
159
+ when 0..1
160
+ return (distance_in_minutes == 0) ? 'less than a minute' : '1 minute' unless include_seconds
161
+ case distance_in_seconds
162
+ when 0..4 then 'less than 5 seconds'
163
+ when 5..9 then 'less than 10 seconds'
164
+ when 10..19 then 'less than 20 seconds'
165
+ when 20..39 then 'half a minute'
166
+ when 40..59 then 'less than a minute'
167
+ else '1 minute'
168
+ end
169
+ when 2..44 then "#{distance_in_minutes} minutes"
170
+ when 45..89 then 'about 1 hour'
171
+ when 90..1439 then "about #{(distance_in_minutes.to_f / 60.0).round} hours"
172
+ when 1440..2879 then '1 day'
173
+ when 2880..43199 then "#{(distance_in_minutes / 1440).round} days"
174
+ when 43200..86399 then 'about 1 month'
175
+ when 86400..525959 then "#{(distance_in_minutes / 43200).round} months"
176
+ when 525960..1051919 then 'about 1 year'
177
+ else "over #{(distance_in_minutes / 525960).round} years"
178
+ end
179
+ end
60
180
  end
61
181
  end