davidrichards-etl 0.0.4

data/lib/etl/csv_et.rb ADDED
@@ -0,0 +1,64 @@
+ require 'open-uri'
+ require 'fastercsv'
+
+ module CSV
+
+   # Extract and transform for CSV files: from a string passed in directly,
+   # from a local file, or from a remote file. Uses FasterCSV and open-uri.
+   class ET < ETL
+
+     attr_reader :header
+
+     after_transform :get_header_conditionally
+
+     protected
+
+     def get_header_conditionally
+       @header = @raw.shift if self.options[:extract_header]
+     end
+
+     # Attempts to get the raw contents from a local file, a remote URI, or a string
+     def extract
+       obj = self.options.fetch(:source, nil)
+       extract_locally(obj) or extract_remotely(obj) or extract_from_string(obj)
+       raise ArgumentError, "Could not determine what #{obj.inspect} was. CSV::ET cannot work with this data." unless @raw
+     end
+
+     # Handles local filename cases, reading the contents of the file.
+     def extract_locally(filename)
+       @raw = File.read(filename) if filename and File.exist?(filename)
+       ET.logger.info "Extracted the data from the filesystem" if @raw
+       @raw ? true : false
+     end
+
+     # Handles remote uri cases, reading the remote resource with open-uri, part of the Standard Library
+     def extract_remotely(uri)
+       begin
+         open(uri) {|f| @raw = f.read}
+         ET.logger.info "Extracted the data from a remote location."
+         return true
+       rescue
+         ET.logger.info "Tested whether #{uri} was a remote resource. Failed to read it."
+         return false
+       end
+     end
+
+     # If this is a string, assumes that the contents of the string are CSV contents.
+     def extract_from_string(string)
+       @raw = string if string.is_a?(String)
+       @raw ? true : false
+     end
+
+     def transform
+       opts = self.options.fetch(:parse_with, {})
+       ET.logger.info "Parsing the data with FasterCSV and #{default_csv_opts.merge(opts).inspect}"
+       @raw = FCSV.parse(@data, default_csv_opts.merge(opts))
+     end
+
+     def default_csv_opts; {:converters => :all}; end
+   end
+
+   # Try this out for size:
+   # file = CSV::ET.process(:source => 'http://archive.ics.uci.edu/ml/machine-learning-databases/forest-fires/forestfires.csv')
+
+ end
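
A quick usage sketch, based only on the options this class reads (:source, :extract_header, and :parse_with); the require path and the local file path are assumptions, and FasterCSV must be installed:

  require 'etl'   # assumes the gem exposes ETL and CSV::ET under this path

  # Remote CSV, keeping the first row as a header
  et = CSV::ET.process(:source => 'http://archive.ics.uci.edu/ml/machine-learning-databases/forest-fires/forestfires.csv',
                       :extract_header => true)
  et.header   # => the first row of the file
  et.data     # => the remaining rows, parsed by FasterCSV with :converters => :all

  # Local file (hypothetical path), overriding the FasterCSV parse options
  CSV::ET.process(:source => 'data/forestfires.csv',
                  :parse_with => {:converters => :integer})
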
data/lib/etl/etl.rb ADDED
@@ -0,0 +1,273 @@
+ # State machine with useful callbacks for getting data (Extract,
+ # Transform, and Load) with some support for retrying failed
+ # stages of the process. Raise errors liberally if things go wrong; the
+ # data is being staged, and the process can usually be restarted once the
+ # issue has been addressed.
+
+ class ETL
+
+   VALID_STATES = [:before_extract, :extract, :after_extract, :before_transform, :transform, :after_transform, :before_load, :load, :after_load, :complete].freeze
+   VALID_CALLBACKS = [:before_extract, :after_extract, :before_transform, :after_transform, :before_load, :after_load, :complete].freeze
+
+   # Because we want to interchange these steps on the queueing system
+   if defined?(TeguGears) == 'constant'
+     include TeguGears
+   end
+
+   # Using ActiveSupport's callback system
+   include ActiveSupport::Callbacks
+
+   class << self
+
+     def process(options={}, &block)
+       etl = new
+       etl.process(options, &block)
+       etl
+     end
+     alias :call :process
+
+     # Sets up a logger for the class. Respects inheritance, so a different
+     # logger will be created for each ETL subclass.
+     # Using the standard log levels here: DEBUG < INFO < WARN < ERROR < FATAL
+     def logger
+
+       logger_name = (self.to_s + "_logger").to_sym
+
+       # Find and return the cached logger, if it's already set up
+       logger = read_inheritable_attribute(logger_name)
+       return logger if logger
+
+       # Create a logger. Will configure it here and save it in a moment.
+       logger = Log4r::Logger.new(self.to_s)
+
+       # Set my default output format
+       format = Log4r::PatternFormatter.new(:pattern => "[%l] %d :: %m")
+
+       # Set up a console logger with our formatting
+       console = Log4r::StderrOutputter.new 'console'
+       console.level = Log4r::WARN
+       console.formatter = format
+
+       # Set up a file logger with our formatting
+       logfile = Log4r::FileOutputter.new('logfile',
+         :filename => self.logger_filename,
+         :trunc => false,
+         :level => Log4r::DEBUG)
+       logfile.formatter = format
+
+       # Tell the logger about both outputs.
+       logger.add('console','logfile')
+
+       # Store the logger as an inheritable class attribute
+       write_inheritable_attribute(logger_name, logger)
+
+       # Return the logger
+       logger
+     end
+
+     # First tries to get the cached @@logger_root.
+     # Second, sets the global @@logger_root unless it is cached, picking
+     # the best available place to keep the logs:
+     #   1) the Rails log directory, when installed in RAILS_ROOT/vendor/gems/etl
+     #   2) the Rails log directory, when used from a Rails model
+     #   3) the Rails log directory, when used from a Rails lib
+     #   4) the local directory where ETL is being subclassed
+     # Third, uses the subclass's stored logger_root, ignoring all the rest
+     # if this is found.
+     def logger_root
+       @@logger_root ||= case
+       when File.exist?(File.dirname(__FILE__) + "/../../../../../log")
+         File.expand_path(File.dirname(__FILE__) + "/../../../../../log")
+       when File.exist?(File.dirname(__FILE__) + "/../../log")
+         File.expand_path(File.dirname(__FILE__) + '/../../log')
+       when File.exist?(File.dirname(__FILE__) + "/../log")
+         File.expand_path(File.dirname(__FILE__) + '/../log')
+       when File.exist?(File.dirname(__FILE__) + "/log")
+         File.expand_path(File.dirname(__FILE__) + '/log')
+       else
+         File.expand_path(File.dirname(__FILE__))
+       end
+       logger_root = read_inheritable_attribute(:logger_root) || @@logger_root
+     end
+
+     # Sets the logger root for the subclass, and sets it globally if this is
+     # set on ETL. So, ETL.logger_root = "some location" sets the logger
+     # root for all subclasses. This is useful if a lot of ETL is being done,
+     # and it needs to be logged in a non-standard place.
+     def logger_root=(value)
+       write_inheritable_attribute(:logger_root, value)
+       @@logger_root = value if self == ETL
+     end
+
+     def logger_filename
+       File.join(self.logger_root, "#{self.to_s}.log")
+     end
+   end
+
+   # A series of callbacks that make the process quite transparent
+   define_callbacks :before_extract, :after_extract, :before_transform, :after_transform, :before_load, :after_load
+
+   def initialize
+     @state = :before_extract
+   end
+
+   # The state of the ETL process
+   attr_reader :state
+
+   # The data being worked on, after it has successfully completed an
+   # extract, transform, or load process.
+   attr_reader :data
+
+   # The intermediate data generated by a stage that hasn't completed yet.
+   attr_reader :raw
+
+   # The options to process with. All your code will have access to these
+   # options, so things like:
+   #
+   #   :filename => '...', :destination => '...', :converters => :all
+   #
+   # would all be useful. Your extract, transform, and load methods
+   # plus your callbacks can then extract the information they need
+   # to get the job done.
+   attr_reader :options
+
+   # An optional block to process with
+   attr_reader :block
+
+   # Working towards a universal workflow driver here. The signature is
+   # just a hash and a block. That should work for almost anything.
+   def process(options={}, &block)
+     # Only set up the options the first time; on later calls we are
+     # restarting the process.
+     @options = options unless @options
+     @block = block
+
+     self.class.logger.info "Processing #{self.class.to_s}"
+     self.class.logger.info "To re-run this process, run: #{self.show_command}"
+     self.class.logger.info "Note: Also pass the same block to #{self.class.to_s}" if block
+
+     etl_callback(:before_extract)
+
+     if @state == :extract
+       extract
+       @state = :after_extract
+     end
+
+     etl_callback(:after_extract)
+
+     # To be sure this is after all after_extract callbacks
+     process_raw_data
+
+     etl_callback(:before_transform)
+
+     if @state == :transform
+       transform
+       @state = :after_transform
+     end
+
+     etl_callback(:after_transform)
+
+     # To be sure this is after all after_transform callbacks
+     process_raw_data
+
+     etl_callback(:before_load)
+
+     if @state == :load
+       load
+       @state = :after_load
+     end
+
+     etl_callback(:after_load)
+     @state
+   end
+
+   def reverse_to(state)
+     raise ArgumentError, "State must be one of #{VALID_STATES.inspect}" unless VALID_STATES.include?(state)
+     loc = VALID_STATES.index(@state)
+     achieved_states = VALID_STATES[0..loc]
+     raise "Cannot reverse to a state that hasn't been achieved yet." unless achieved_states.include?(state)
+     @state = state
+   end
+
+   protected
+
+   def extract
+     # Silently do nothing
+   end
+
+   def transform
+     # Silently do nothing
+   end
+
+   def load
+     # Silently do nothing
+   end
+
+   # Runs a callback, if there is one defined on the class. Advances the
+   # state to the next state. Silently ignores the request if the current
+   # state isn't the callback being asked for. In this way, we can just
+   # call etl_callback several times, and it will advance from one state to
+   # the next.
+   def etl_callback(callback)
+     return false unless self.state == callback
+     run_callbacks(callback)
+     advance_from(callback)
+   end
+
+   # Advances to the next state, only if we are in a valid state.
+   def advance_from(callback)
+
+     raise ArgumentError, "State: #{callback} not recognized" unless VALID_CALLBACKS.include?(callback)
+     before_state = @state
+     @state = case @state
+     when :before_extract
+       :extract
+     when :extract
+       :after_extract
+     when :after_extract
+       :before_transform
+     when :before_transform
+       :transform
+     when :transform
+       :after_transform
+     when :after_transform
+       :before_load
+     when :before_load
+       :load
+     when :load
+       :after_load
+     when :after_load
+       :complete
+     when :complete
+       :complete
+     end
+
+     self.class.logger.info "Advanced from #{before_state} to #{@state}"
+
+   end
+
+   def process_raw_data
+     @data = @raw unless @raw.nil?
+     @raw = nil
+   end
+
+   def show_command
+     "#{self.class.to_s}.process(#{show_parsed_options})"
+   end
+
+   def show_parsed_options
+     self.options.inject("") do |str, e|
+       if e.first.is_a?(Symbol) and e.last.is_a?(Symbol)
+         str << ":#{e.first} => :#{e.last}"
+       elsif e.first.is_a?(Symbol)
+         str << ":#{e.first} => #{e.last}"
+       elsif e.last.is_a?(Symbol)
+         str << "#{e.first} => :#{e.last}"
+       else
+         str << "#{e.first} => #{e.last}"
+       end
+       str
+     end
+   end
+
+ end
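
For orientation, here is a minimal sketch of how a subclass drives this state machine; the class name and data are hypothetical, and Log4r plus ActiveSupport::Callbacks are assumed to be loaded, as the class itself expects:

  class SquaresETL < ETL
    # Registered the same way CSV::ET registers get_header_conditionally
    after_extract :log_count

    protected

    def extract
      @raw = [1, 2, 3, 4]              # stage the raw data
    end

    def transform
      @raw = @data.map { |n| n * n }   # @data holds the extracted values by now
    end

    def load
      SquaresETL.logger.info "Would write #{@data.inspect} somewhere useful"
    end

    def log_count
      SquaresETL.logger.info "Extracted #{@raw.size} values"
    end
  end

  etl = SquaresETL.process   # runs extract -> transform -> load, ending in :complete
  etl.data                   # => [1, 4, 9, 16]

  # After fixing whatever broke a stage, rewind and run the rest again:
  etl.reverse_to(:transform)
  etl.process

The reverse_to/process pair is what the header comment means by restarting the process once an issue has been addressed.
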
@@ -0,0 +1,104 @@
+ # # This keeps the state of all observations in a bucket. An observation
+ # # is expected to be an OpenStruct with an occured_at field set. An
+ # # Observation class is provided in the helpers directory and is
+ # # automatically loaded with this gem. This is set up to work well in the
+ # # observable pattern.
+ #
+ # # Uses
+ # class TimeCachedOpenStruct
+ #
+ #   attr_reader :tick_time
+ #   attr_reader :keep_for
+ #   attr_reader :cache
+ #
+ #   def initialize(opts={})
+ #     @tick_time = opts.fetch(:tick_time, 1)
+ #     @keep_for = opts.fetch(:keep_for, self.tick_time * 100)
+ #     @cache = ... (hash structure)
+ #   end
+ #
+ #   def at(time)
+ #     self.cache[index_for(time)]
+ #   end
+ #
+ #   protected
+ #   def index_for(time)
+ #     ...
+ #   end
+ #
+ #   def round(time)
+ #   end
+ # end
+ #
+ # require 'observable'
+ # class TimeBucket
+ #
+ #   include Observer
+ #
+ #   class << self
+ #
+ #     # Works more like a multiton with subclasses. Each subclass gets its
+ #     # own instance.
+ #     def instance(opts={})
+ #       instance = read_inheritable_attribute(:instance)
+ #       return instance if instance
+ #       instance = new(opts)
+ #       write_inheritable_attribute(:instance, instance)
+ #       instance
+ #     end
+ #   end
+ #
+ #   # How often the state is broadcast
+ #   attr_reader :tick_time
+ #
+ #   # How long to wait for messages to be gathered in the bucket. If they
+ #   # are not gathered by this time, they will never be broadcast.
+ #   attr_reader :delay_time
+ #
+ #   # The actual state data, an OpenStruct-based cache with a time-based
+ #   # eviction policy and a time-based accessor:
+ #   #   TimeBucket.bucket.at(time_object)
+ #   attr_reader :bucket
+ #
+ #   def initialize(opts={})
+ #     @tick_time = opts.fetch(:tick_time, 1)
+ #     @delay_time = opts.fetch(:delay_time, 0.5)
+ #     keep_time = self.tick_time * 100 + self.delay_time
+ #     @bucket = TimeCachedOpenStruct.new(:tick_time => self.tick_time, :keep_for => keep_time)
+ #   end
+ #
+ #   # To be called in its own process:
+ #   #   Process.fork { TimeBucket.instance(...).service(@etl) }
+ #   # @etl is an object that responds to process and can load the consolidated data.
+ #   def service(etl)
+ #     sleep self.delay_time
+ #     loop do
+ #       changed
+ #       notify_observers(self.bucket.at(Time.now - self.sleep_time))
+ #       sleep self.tick_time
+ #     end
+ #   end
+ #
+ #   def update(obj)
+ #     observation = infer_observation(obj)
+ #     self.bucket.merge!(observation)
+ #   end
+ #
+ #   protected
+ #   def infer_observation(obj)
+ #     if obj.respond_to?(occured_at)
+ #       obj
+ #     elsif obj.respond_to?(observation) and obj.observation.occured_at
+ #       obj.observation
+ #     elsif obj.is_a?(OpenStruct)
+ #       obj.occured_at = Time.now
+ #       obj
+ #     elsif obj.is_a?(Hash)
+ #       observation = Observation.new(obj)
+ #       observation.occured_at = obj.fetch(:occured_at, Time.now)
+ #       observation
+ #     else
+ #       nil
+ #     end
+ #   end
+ # end
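
A side note on the observer wiring this draft leans on: Ruby's standard library provides the pattern as require 'observer' with the Observable mixin, which differs from the draft's require 'observable' / include Observer. A minimal, self-contained sketch of that mechanism, independent of the gem's own classes:

  require 'observer'
  require 'ostruct'

  class TickSource
    include Observable                # provides changed, notify_observers, add_observer

    def tick(observation)
      changed                         # mark the state dirty so notify_observers fires
      notify_observers(observation)
    end
  end

  class Printer
    def update(observation)           # observers only need to respond to #update
      puts "Got an observation from #{observation.occured_at}"
    end
  end

  source = TickSource.new
  source.add_observer(Printer.new)
  source.tick(OpenStruct.new(:occured_at => Time.now))
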
data/lib/etl/xml_et.rb ADDED
@@ -0,0 +1,6 @@
+ module XML #:nodoc:
+
+   # Works with XML. May want to break this up into several utilities.
+   class ET < ETL
+   end
+ end