activewarehouse-etl 0.4.0 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,7 +1,34 @@
1
- module ETL
2
- module Control
1
+ module ETL #:nodoc:
2
+ module Control #:nodoc:
3
+ # Destination which writes directly to a database. This is useful when you are dealing with
4
+ # a small amount of data. For larger amounts of data you should probably use the bulk
5
+ # loader if it is supported with your target database as it will use a much faster load
6
+ # method.
3
7
  class DatabaseDestination < Destination
4
- attr_reader :order, :truncate
8
+ # Specify the order from the source
9
+ attr_reader :order
10
+
11
+ # Set to true to truncate the destination table first
12
+ attr_reader :truncate
13
+
14
+ # Initialize the database destination
15
+ #
16
+ # * <tt>control</tt>: The ETL::Control::Control instance
17
+ # * <tt>configuration</tt>: The configuration Hash
18
+ # * <tt>mapping</tt>: The mapping
19
+ #
20
+ # Configuration options:
21
+ # * <tt>:database</tt>: The database name (REQUIRED)
22
+ # * <tt>:table</tt>: The table to write to (REQUIRED)
23
+ # * <tt>:truncate</tt>: Set to true to truncate before writing (defaults to false)
24
+ # * <tt>:unique</tt>: Set to true to only insert unique records (defaults to false)
25
+ # * <tt>:adapter</tt>: The adapter to use (defaults to :mysql)
26
+ # * <tt>:username</tt>: The database username (defaults to 'root')
27
+ # * <tt>:password</tt>: The password to the database (defaults to nothing)
28
+ # * <tt>:host</tt>: The host for the database (defaults to 'localhost')
29
+ #
30
+ # Mapping options:
31
+ # * <tt>:order</tt>: The order of fields to write (REQUIRED)
5
32
  def initialize(control, configuration, mapping)
6
33
  super
7
34
  @truncate = configuration[:truncate] ||= false
@@ -11,8 +38,9 @@ module ETL
11
38
  connect
12
39
  end
13
40
 
41
+ # Flush the currently buffered data
14
42
  def flush
15
- conn = ActiveRecord::Base.connection
43
+ conn = ETL::ActiveRecord::Base.connection
16
44
  conn.transaction do
17
45
  conn.truncate(configuration[:table]) if truncate
18
46
 
@@ -32,7 +60,7 @@ module ETL
32
60
  end
33
61
  q = "INSERT INTO #{configuration[:table]} (#{names.join(',')}) VALUES (#{values.join(',')})"
34
62
  # ETL::Engine.logger.debug("Query: #{q}")
35
- conn.execute(q, "Insert row #{current_row}")
63
+ conn.insert(q, "Insert row #{current_row}")
36
64
  @current_row += 1
37
65
  end
38
66
  buffer.clear
@@ -42,12 +70,22 @@ module ETL
42
70
  # Close the connection
43
71
  def close
44
72
  flush
45
- ActiveRecord::Base.connection.disconnect!
73
+ ETL::ActiveRecord::Base.connection.disconnect!
46
74
  end
47
75
 
48
76
  private
77
+ # Connect to the database.
78
+ #
79
+ # Required options:
80
+ # * <tt>:database</tt>: The database name
81
+ #
82
+ # Options:
83
+ # * <tt>:adapter</tt>: The adapter to use (defaults to :mysql)
84
+ # * <tt>:username</tt>: The database username (defaults to 'root')
85
+ # * <tt>:password</tt>: The password to the database (defaults to nothing)
86
+ # * <tt>:host</tt>: The host for the database (defaults to 'localhost')
49
87
  def connect
50
- ActiveRecord::Base.establish_connection(
88
+ ETL::ActiveRecord::Base.establish_connection(
51
89
  :adapter => (configuration[:adapter] || :mysql),
52
90
  :username => (configuration[:username] || 'root'),
53
91
  :host => (configuration[:host] || 'localhost'),
@@ -2,13 +2,39 @@ module ETL #:nodoc:
2
2
  module Control #:nodoc:
3
3
  # File as the final destination.
4
4
  class FileDestination < Destination
5
- attr_reader :file, :order
6
- attr_accessor :append, :separator, :eol, :enclose
5
+ # The File to write to
6
+ attr_reader :file
7
+
8
+ # The output order
9
+ attr_reader :order
10
+
11
+ # Flag which indicates to append (default is to overwrite)
12
+ attr_accessor :append
13
+
14
+ # The separator
15
+ attr_accessor :separator
16
+
17
+ # The end of line marker
18
+ attr_accessor :eol
19
+
20
+ # The enclosure character
21
+ attr_accessor :enclose
7
22
 
8
23
  # Initialize the object.
9
24
  # * <tt>control</tt>: The Control object
10
25
  # * <tt>configuration</tt>: The configuration map
11
26
  # * <tt>mapping</tt>: The output mapping
27
+ #
28
+ # Configuration options:
29
+ # * <tt>:file</tt>: The file to write to (REQUIRED)
30
+ # * <tt>:append</tt>: Set to true to append to the file (default is to overwrite)
31
+ # * <tt>:separator</tt>: Record separator (default is a comma)
32
+ # * <tt>:eol</tt>: End of line marker (default is \n)
33
+ # * <tt>:enclose</tt>: Enclosure character (default is none)
34
+ # * <tt>:unique</tt>: Set to true to only write unique records
35
+ #
36
+ # Mapping options:
37
+ # * <tt>:order</tt>: The order array
12
38
  def initialize(control, configuration, mapping)
13
39
  super
14
40
  @file = File.join(File.dirname(control.file), configuration[:file])
@@ -57,7 +83,6 @@ module ETL #:nodoc:
57
83
  end
58
84
 
59
85
  private
60
-
61
86
  # Get the open file stream
62
87
  def f
63
88
  @f ||= open(file, mode)
@@ -67,7 +92,6 @@ module ETL #:nodoc:
67
92
  def mode
68
93
  append ? 'a' : 'w'
69
94
  end
70
-
71
95
  end
72
96
  end
73
97
  end
@@ -3,9 +3,20 @@ module ETL #:nodoc:
3
3
  # ETL source. Subclasses must implement the <tt>each</tt> method.
4
4
  class Source
5
5
  include Enumerable
6
- attr_accessor :control, :configuration, :definition
6
+
7
+ # The control object
8
+ attr_accessor :control
9
+
10
+ # The configuration Hash
11
+ attr_accessor :configuration
12
+
13
+ # The definition Hash
14
+ attr_accessor :definition
7
15
 
8
16
  class << self
17
+ # Convert the name to a Source class.
18
+ #
19
+ # For example if name is :database then this will return a DatabaseSource class
9
20
  def class_for_name(name)
10
21
  ETL::Control.const_get("#{name.to_s.classify}Source")
11
22
  end
@@ -20,6 +31,10 @@ module ETL #:nodoc:
20
31
  @configuration = configuration
21
32
  @definition = definition
22
33
  end
34
+
35
+ def errors
36
+ @errors ||= []
37
+ end
23
38
  end
24
39
  end
25
40
  end
@@ -1,23 +1,53 @@
1
- module ETL
2
- module Control
1
+ module ETL #:nodoc:
2
+ module Control #:nodoc:
3
+ # Source object which extracts data from a database using ActiveRecord.
3
4
  class DatabaseSource < Source
5
+ # Initialize the source.
6
+ #
7
+ # Arguments:
8
+ # * <tt>control</tt>: The ETL::Control::Control instance
9
+ # * <tt>configuration</tt>: The configuration Hash
10
+ # * <tt>definition</tt>: The source definition
11
+ #
12
+ # Required configuration options:
13
+ # * <tt>:table</tt>: The source table name
14
+ # * <tt>:database</tt>: The database name
15
+ #
16
+ # Other options:
17
+ # * <tt>:adapter</tt>: The adapter to use (defaults to :mysql)
18
+ # * <tt>:username</tt>: The database username (defaults to 'root')
19
+ # * <tt>:password</tt>: The password to the database (defaults to nothing)
20
+ # * <tt>:host</tt>: The host for the database (defaults to 'localhost')
4
21
  def initialize(control, configuration, definition)
5
22
  super
6
23
  connect
7
24
  end
8
25
 
26
+ def to_s
27
+ "#{configuration[:host]}/#{configuration[:database]}"
28
+ end
29
+
9
30
  # Returns each row from the source
10
31
  def each
11
- conn = ActiveRecord::Base.connection
32
+ conn = ETL::ActiveRecord::Base.connection
12
33
  conn.select_all("SELECT * FROM #{configuration[:table]}").each do |row|
13
34
  yield row
14
35
  end
15
36
  end
16
37
 
17
38
  private
39
+ # Connect to the database.
40
+ #
41
+ # Required options:
42
+ # * <tt>:database</tt>: The database name
43
+ #
44
+ # Options:
45
+ # * <tt>:adapter</tt>: The adapter to use (defaults to :mysql)
46
+ # * <tt>:username</tt>: The database username (defaults to 'root')
47
+ # * <tt>:password</tt>: The password to the database (defaults to nothing)
48
+ # * <tt>:host</tt>: The host for the database (defaults to 'localhost')
18
49
  def connect
19
- # set up the DB connection
20
- ActiveRecord::Base.establish_connection(
50
+ ETL::ActiveRecord::Base.establish_connection(
21
51
  :adapter => (configuration[:adapter] || :mysql),
22
52
  :username => (configuration[:username] || 'root'),
23
53
  :host => (configuration[:host] || 'localhost'),
@@ -1,19 +1,49 @@
1
- module ETL
2
- module Control
1
+ module ETL #:nodoc:
2
+ module Control #:nodoc:
3
+ # A File source.
3
4
  class FileSource < Source
5
+ # The number of lines to skip, default is 0
4
6
  attr_accessor :skip_lines
7
+
8
+ # Accessor for the underlying parser
9
+ attr_accessor :parser
10
+
11
+ # Initialize the source
12
+ #
13
+ # Configuration options:
14
+ # * <tt>:parser</tt>: One of the following: a parser name as a String or symbol, a class which extends from Parser,
15
+ # a Hash with :name and optionally an :options key. Whether or not the parser uses the options is dependent on
16
+ # which parser is used. See the documentation for each parser for information on what options it accepts.
17
+ # * <tt>:skip_lines</tt>: The number of lines to skip (defaults to 0)
5
18
  def initialize(control, configuration, definition)
6
19
  super
7
20
  configure
8
21
  end
22
+
23
+ def to_s
24
+ configuration[:file]
25
+ end
26
+
9
27
  # Returns each row from the source
10
28
  def each
11
29
  @parser.each { |row| yield row }
12
30
  end
13
31
 
14
32
  private
33
+ # Configure the source
15
34
  def configure
16
- @parser = ETL::Parser::Parser.class_for_name(@configuration[:parser]).new(self)
35
+ case @configuration[:parser]
36
+ when Class
37
+ @parser = @configuration[:parser].new(self)
38
+ when String, Symbol
39
+ @parser = ETL::Parser::Parser.class_for_name(@configuration[:parser]).new(self)
40
+ when Hash
41
+ name = @configuration[:parser][:name]
42
+ options = @configuration[:parser][:options]
43
+ @parser = ETL::Parser::Parser.class_for_name(name).new(self, options)
44
+ else
45
+ raise ControlError, "Configuration option :parser must be a Class, String or Symbol"
46
+ end
17
47
  @skip_lines = @configuration[:skip_lines] ||= 0
18
48
  end
19
49
  end
@@ -1,61 +1,181 @@
1
- module ETL
1
+ module ETL #:nodoc:
2
+ module ActiveRecord #:nodoc:
3
+ # Base class which is used for ActiveRecord connections. This is necessary
4
+ # since AR connections are tied to the class, and using ActiveRecord::Base
5
+ # directly can cause problems if the connection is closed.
6
+ class Base < ::ActiveRecord::Base
7
+ end
8
+ end
9
+
10
+ # The main ETL engine class
2
11
  class Engine
3
12
 
4
13
  class << self
14
+ # Process the specified control file. Acceptable values for control_file are
15
+ # * Path to a file
16
+ # * File object
17
+ # * ETL::Control::Control instance
5
18
  def process(control_file)
6
19
  new().process(control_file)
7
20
  end
8
21
 
22
+ # A logger for the engine
9
23
  attr_accessor :logger
10
24
 
11
- def logger
25
+ def logger #:nodoc:
12
26
  unless @logger
13
27
  @logger = Logger.new('etl.log')
14
- @logger.level = Logger::DEBUG
28
+ @logger.level = Logger::WARN
15
29
  end
16
30
  @logger
17
31
  end
32
+
33
+ attr_accessor :current_source
34
+ attr_accessor :current_source_row
35
+ attr_accessor :current_destination
36
+
37
+ attr_accessor :realtime_activity
38
+ end
39
+
40
+ def say(message)
41
+ say_without_newline(message + "\n")
42
+ end
43
+
44
+ def say_without_newline(message)
45
+ if Engine.realtime_activity
46
+ $stdout.print message
47
+ $stdout.flush
48
+ end
49
+ end
50
+
51
+ def say_on_own_line(message)
52
+ say("\n" + message)
18
53
  end
19
54
 
20
- # Process a control file or object.
55
+ # Process a control file or object. Acceptable values for control are:
56
+ # * Path to a file
57
+ # * File object
58
+ # * ETL::Control::Control instance
21
59
  def process(control)
60
+ start_time = Time.now
22
61
  control = ETL::Control::Control.resolve(control)
23
62
 
63
+ Engine.logger.debug "Pre-processing #{control.file}"
24
64
  pre_process(control)
65
+ Engine.logger.debug "Pre-processing complete"
25
66
 
26
67
  sources = control.sources
27
68
  destinations = control.destinations
28
69
 
29
70
  sources.each do |source|
71
+ Engine.current_source = source
72
+ Engine.logger.debug "Processing source #{source}"
73
+ say "Source: #{source}"
30
74
  source.each_with_index do |row, index|
31
- row.each do |name, value|
75
+ Engine.current_source_row = index + 1
76
+ if Engine.realtime_activity && index % 1000 == 0
77
+ say_without_newline "."
78
+ end
79
+
80
+ begin
32
81
  # execute transforms
33
- row[name] = ETL::Transform::Transform.transform(name, value, control.get_transform(name))
82
+ row.each do |name, value|
83
+ row[name] = ETL::Transform::Transform.transform(name, value, control.transform(name))
84
+ end
85
+ rescue => e
86
+ msg = "Error transforming from #{source} on line #{index}: #{e}"
87
+ source.errors << msg
88
+ Engine.logger.error msg
34
89
  end
35
- # write the row to the destination
36
- destinations.each do |destination|
37
- destination.write(row)
90
+
91
+ begin
92
+ # write the row to the destination
93
+ destinations.each do |destination|
94
+ Engine.current_destination = destination
95
+ destination.write(row)
96
+ end
97
+ rescue
98
+ msg = "Error writing to #{destination} on line #{index}"
99
+ destination.errors << msg
100
+ Engine.logger.error msg
38
101
  end
39
102
  end
103
+ say_on_own_line "Processed #{Engine.current_source_row} rows in #{distance_of_time_in_words(start_time)}"
40
104
  destinations.each do |destination|
41
105
  destination.close
42
106
  end
43
107
  end
44
108
 
109
+ Engine.logger.debug "Post-processing #{control.file}"
45
110
  post_process(control)
111
+ Engine.logger.debug "Post-processing complete"
46
112
  end
47
113
 
48
114
  private
115
+ # Execute all preprocessors
49
116
  def pre_process(control)
50
117
  control.pre_processors.each do |processor|
51
118
  processor.process
52
119
  end
53
120
  end
54
121
 
122
+ # Execute all postprocessors
55
123
  def post_process(control)
56
124
  control.post_processors.each do |processor|
57
125
  processor.process
58
126
  end
59
127
  end
128
+
129
+ # Return the distance of time in words from the given from_time to the specified to_time. If to_time
130
+ # is not specified then Time.now is used. The result is an exact breakdown: days, hours and minutes
131
+ # are listed only when greater than zero, and the seconds component is always included.
132
+ def distance_of_time_in_words(from_time, to_time=Time.now)
133
+ from_time = from_time.to_time if from_time.respond_to?(:to_time)
134
+ to_time = to_time.to_time if to_time.respond_to?(:to_time)
135
+ seconds = (to_time - from_time).round
136
+ distance_in_days = (seconds/(60*60*24)).round
137
+ seconds = seconds % (60*60*24)
138
+ distance_in_hours = (seconds/(60*60)).round
139
+ seconds = seconds % (60*60)
140
+ distance_in_minutes = (seconds/60).round
141
+ seconds = seconds % 60
142
+ distance_in_seconds = seconds
143
+
144
+ s = ''
145
+ s << "#{distance_in_days} days," if distance_in_days > 0
146
+ s << "#{distance_in_hours} hours, " if distance_in_hours > 0
147
+ s << "#{distance_in_minutes} minutes, " if distance_in_minutes > 0
148
+ s << "#{distance_in_seconds} seconds"
149
+ s
150
+ end
151
+
152
+ def approximate_distance_of_time_in_words(from_time, to_time=Time.now, include_seconds=true)
153
+ from_time = from_time.to_time if from_time.respond_to?(:to_time)
154
+ to_time = to_time.to_time if to_time.respond_to?(:to_time)
155
+ distance_in_minutes = (((to_time - from_time).abs)/60).round
156
+ distance_in_seconds = ((to_time - from_time).abs).round
157
+
158
+ case distance_in_minutes
159
+ when 0..1
160
+ return (distance_in_minutes == 0) ? 'less than a minute' : '1 minute' unless include_seconds
161
+ case distance_in_seconds
162
+ when 0..4 then 'less than 5 seconds'
163
+ when 5..9 then 'less than 10 seconds'
164
+ when 10..19 then 'less than 20 seconds'
165
+ when 20..39 then 'half a minute'
166
+ when 40..59 then 'less than a minute'
167
+ else '1 minute'
168
+ end
169
+ when 2..44 then "#{distance_in_minutes} minutes"
170
+ when 45..89 then 'about 1 hour'
171
+ when 90..1439 then "about #{(distance_in_minutes.to_f / 60.0).round} hours"
172
+ when 1440..2879 then '1 day'
173
+ when 2880..43199 then "#{(distance_in_minutes / 1440).round} days"
174
+ when 43200..86399 then 'about 1 month'
175
+ when 86400..525959 then "#{(distance_in_minutes / 43200).round} months"
176
+ when 525960..1051919 then 'about 1 year'
177
+ else "over #{(distance_in_minutes / 525960).round} years"
178
+ end
179
+ end
60
180
  end
61
181
  end