davidrichards-etl 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/etl/csv_et.rb ADDED
@@ -0,0 +1,64 @@
1
+ require 'open-uri'
2
+ require 'fastercsv'
3
+
4
+ module CSV
5
+
6
+ # Extract and transform for CSV files: in context (as a string), from a
7
+ # local file, or from a remote file. Uses FasterCSV and open-uri
8
+ class ET < ETL
9
+
10
+ attr_reader :header
11
+
12
+ after_transform :get_header_conditionally
13
+
14
+ protected
15
+
16
# Registered above as an after_transform callback. When the
# :extract_header option is truthy, removes the first element of @raw and
# keeps it in @header (readable through the header reader).
#
# Returns the removed element, or nil when the option is not set.
def get_header_conditionally
  return unless self.options[:extract_header]

  @header = @raw.shift
end
19
+
20
+ # Attempts to get a string from a file, a uri, or a string
21
# Reads the :source option (nil when absent) and offers it to each strategy
# in turn: local file, then remote resource, then literal string. The
# chain stops at the first strategy that returns true.
#
# Raises ArgumentError when no strategy left anything in @raw.
def extract
  source = self.options.fetch(:source, nil)
  extract_locally(source) || extract_remotely(source) || extract_from_string(source)
  return if @raw

  raise ArgumentError, "Could not determine what #{source.inspect} was. CSV::ET cannot work with this data."
end
26
+
27
+ # Handles local filename cases, reading the contents of the file.
28
# filename - candidate path. Anything that cannot be a path (nil, a Hash,
#            a string containing a NUL byte, ...) is treated as "not a
#            local file" rather than letting File.exist? raise TypeError
#            or ArgumentError, so the caller can move on to its other
#            extraction strategies.
#
# Returns true when @raw holds data afterwards, false otherwise.
def extract_locally(filename)
  # Pathname-like objects expose to_path; plain strings expose to_str.
  candidate = filename.respond_to?(:to_path) ? filename.to_path : filename
  usable = candidate.respond_to?(:to_str) && !candidate.to_str.include?("\0")

  @raw = File.read(filename) if usable && File.exist?(filename)
  ET.logger.info "Extracted the data from from filesystem" if @raw
  @raw ? true : false
end
33
+
34
+ # Handles remote uri cases, reading the remote resource with open-uri, part of the Standard Library
35
# uri - a String or URI object that may name a remote resource.
#
# The value is parsed as a URI and read through the #open method that
# open-uri adds to the URI classes it supports. It is deliberately NOT
# handed to Kernel#open: that method treats a string starting with "|" as
# a command to run and any other non-URL string as a local path, which is
# unsafe for a source that may be arbitrary CSV text.
#
# Any failure (unparsable text, unsupported scheme, network error) is
# logged at info level and reported as false so the caller can try its
# next strategy.
#
# Returns true when the resource was read into @raw, false otherwise.
def extract_remotely(uri)
  begin
    target = uri.is_a?(URI::Generic) ? uri : URI.parse(uri.to_s)
    # Kernel#open is private, so respond_to? is only true for URI classes
    # that open-uri has extended with a public #open.
    if target.respond_to?(:open)
      target.open { |f| @raw = f.read }
      ET.logger.info "Extracted the data from a remote location."
      return true
    end
  rescue
    # Best-effort probe: fall through to the failure log below.
  end

  ET.logger.info "Tested whether #{uri} was a remote resource. Failed to read it."
  false
end
45
+
46
+ # If this is a string, assumes that the contents of the string are CSV contents.
47
# Stores +string+ in @raw when it is a String; any other value leaves @raw
# as it was.
#
# Returns true when @raw is truthy afterwards (including when it was
# already set before the call), false otherwise.
def extract_from_string(string)
  @raw = string if String === string
  !!@raw
end
51
+
52
# Parses @data with FCSV.parse and stores the result in @raw.
#
# The parser options are default_csv_opts overlaid with whatever Hash was
# given under the :parse_with option (empty when absent); the merged
# options are logged at info level before parsing.
def transform
  overrides = self.options.fetch(:parse_with, {})
  csv_opts = default_csv_opts.merge(overrides)
  ET.logger.info "Parsing the data with FasterCSV and #{csv_opts.inspect}"
  @raw = FCSV.parse(@data, csv_opts)
end
57
+
58
# Baseline parser options that transform merges the :parse_with option
# into. A fresh Hash is built on every call.
def default_csv_opts
  {:converters => :all}
end
59
+ end
60
+
61
+ # Try this out for size:
62
+ # file = CSV::ET.process(:source => 'http://archive.ics.uci.edu/ml/machine-learning-databases/forest-fires/forestfires.csv')
63
+
64
+ end
data/lib/etl/etl.rb ADDED
@@ -0,0 +1,273 @@
1
+ # State machine with useful callbacks for getting data (Extract,
2
+ # Transform, and Loading data) with some support for re-trying failed
3
+ # stages of the process. Raise errors liberally if things go wrong, the
4
+ # data is being staged and the process can usually be restarted once the
5
+ # issue has been addressed.
6
+
7
+ class ETL
8
+
9
+ VALID_STATES = [:before_extract, :extract, :after_extract, :before_transform, :transform, :after_transform, :before_load, :load, :after_load, :complete].freeze
10
+ VALID_CALLBACKS = [:before_extract, :after_extract, :before_transform, :after_transform, :before_load, :after_load, :complete].freeze
11
+
12
+ # Because we want to interchange these steps on the queueing system
13
+ if defined?(TeguGears) == 'constant'
14
+ include TeguGears
15
+ end
16
+
17
+ # Using ActiveSupport's callback system
18
+ include ActiveSupport::Callbacks
19
+
20
+ class << self
21
+
22
# Class-level convenience: builds a new instance, runs its #process with
# the given options and block, and returns the instance itself (not the
# value returned by the instance's #process).
def process(options={}, &block)
  new.tap { |etl| etl.process(options, &block) }
end
27
+ alias :call :process
28
+
29
+ # Sets up a logger for the class. Respects inheritance, so a different
30
+ # logger will be created for each ETL subclass.
31
+ # Using the standard log levels here: DEBUG < INFO < WARN < ERROR < FATAL
32
# The logger is cached as an inheritable attribute keyed by
# "<ClassName>_logger"; once built, later calls return the cached object.
def logger
  cache_key = :"#{self}_logger"

  existing = read_inheritable_attribute(cache_key)
  return existing if existing

  # Nothing cached yet: build and configure a logger named after the class.
  fresh = Log4r::Logger.new(self.to_s)

  # One pattern shared by both outputters.
  layout = Log4r::PatternFormatter.new(:pattern => "[%l] %d :: %m")

  # Stderr outputter registered as 'console', level set to Log4r::WARN.
  stderr_out = Log4r::StderrOutputter.new 'console'
  stderr_out.level = Log4r::WARN
  stderr_out.formatter = layout

  # File outputter registered as 'logfile', writing to logger_filename at
  # Log4r::DEBUG level.
  file_out = Log4r::FileOutputter.new('logfile',
    :filename => self.logger_filename,
    :trunc => false,
    :level => Log4r::DEBUG)
  file_out.formatter = layout

  # The outputters are attached by the names given above.
  fresh.add('console','logfile')

  write_inheritable_attribute(cache_key, fresh)
  fresh
end
67
+
68
+ # First tries to get the cached @@logger_root
69
+ # Second, sets the global @@logger_root unless it is cached. Sets it to
70
+ # the best possible place to locate the logs:
71
+ # 1) where log will be from RAILS_ROOT/vendor/gems/etl
72
+ # 2) where log will be in a Rails model
73
+ # 3) where log will be in a Rails lib
74
+ # 4) in the local directory where ETL is being subclassed
75
+ # Third, uses the subclass's stored logger_root, ignoring all the rest
76
+ # if this is found.
77
# Returns the directory log files are written to: the value stored for
# this class through logger_root= when there is one, otherwise the shared
# @@logger_root default.
#
# The shared default is computed once. Candidate "log" directories are
# probed relative to this file, outermost first; the first that exists is
# used, and this file's own directory is the fallback.
def logger_root
  @@logger_root ||= begin
    here = File.dirname(__FILE__)
    candidates = ["/../../../../../log", "/../../log", "/../log", "/log"]
    match = candidates.find { |suffix| File.exist?(here + suffix) }
    File.expand_path(match ? here + match : here)
  end

  read_inheritable_attribute(:logger_root) || @@logger_root
end
92
+
93
+ # Sets the logger root for the subclass, and sets it globally if this is
94
+ # set on ETL. So, ETL.logger_root = "some location" sets the logger
95
+ # root for all subclasses. This is useful if a lot of ETL is being done,
96
+ # and it needs to be logged in a non-standard place.
97
# value - directory to log into.
#
# Always records the value as this class's inheritable :logger_root
# attribute; when the receiver is ETL itself, the shared @@logger_root
# default is replaced as well.
def logger_root=(value)
  write_inheritable_attribute(:logger_root, value)
  return unless self == ETL

  @@logger_root = value
end
101
+
102
# Path of this class's log file: "<ClassName>.log" inside logger_root.
def logger_filename
  basename = "#{self}.log"
  File.join(logger_root, basename)
end
105
+ end
106
+
107
+ # A series of callbacks that make the process quite transparent
108
+ define_callbacks :before_extract, :after_extract, :before_transform, :after_transform, :before_load, :after_load
109
+
110
# A new instance starts in the :before_extract state, the first entry of
# VALID_STATES, so the first call to #process begins with the
# before_extract callbacks.
def initialize
  @state = :before_extract
end
113
+
114
+ # The state of the transform process
115
+ attr_reader :state
116
+
117
+ # The data being worked on, after it has successfully completed an
118
+ # extract, transform, or load process.
119
+ attr_reader :data
120
+
121
+ # The data generated on a process that didn't complete.
122
+ attr_reader :raw
123
+
124
+ # The options to process with. All your code will have access to these
125
+ # options, so things like:
126
+ #
127
+ # :filename => '...', :destination => '...', :converters => :all
128
+ #
129
+ # would all be useful. Your extract, transform, and load methods
130
+ # plus your callbacks can then extract out the information they need
131
+ # to get the job done.
132
+ attr_reader :options
133
+
134
+ # An optional block to process with
135
+ attr_reader :block
136
+
137
+ # Working towards a universal workflow driver here. The signature is
138
+ # just a hash and a block. That should work for about anything.
139
# Drives the state machine from the current state towards :complete.
#
# options - Hash kept in @options on the first call only; later calls
#           (restarts) keep the original options.
# block   - optional block, stored in @block on every call.
#
# For each stage (extract, transform, load) the sequence is: run the
# before_* callback, run the stage method if the state has reached it, run
# the after_* callback. After the extract and transform stages,
# process_raw_data runs once all after_* callbacks are done. Steps whose
# state does not match the current state are skipped, which is what lets a
# restarted process pick up where it stopped.
#
# Returns the state the machine ended in.
def process(options={}, &block)
  @options ||= options
  @block = block

  self.class.logger.info "Processing #{self.class.to_s}"
  self.class.logger.info "To re-run this process, run: #{show_command}"
  self.class.logger.info "Note: Also pass the same block to #{self.class.to_s}" if block

  [:extract, :transform, :load].each do |stage|
    etl_callback(:"before_#{stage}")

    if @state == stage
      send(stage)
      @state = :"after_#{stage}"
    end

    etl_callback(:"after_#{stage}")

    # Only extract and transform are followed by a raw-data hand-over.
    process_raw_data unless stage == :load
  end

  @state
end
183
+
184
# Moves the state machine back to an earlier (or the current) state so the
# following #process call re-runs from there.
#
# state - one of VALID_STATES.
#
# Raises ArgumentError when +state+ is not a valid state, and RuntimeError
# when +state+ lies after the current state. The reachable set is measured
# from the CURRENT state; slicing up to the requested state would always
# contain it and make the check meaningless.
#
# Returns the new state.
def reverse_to(state)
  raise ArgumentError, "State must be one of #{VALID_STATES.inspect}" unless VALID_STATES.include?(state)

  # An unrecognised current state is treated as the very beginning.
  current = VALID_STATES.index(@state) || 0
  reached = VALID_STATES[0..current]
  raise "Cannot reverse to a state that hasn't been acheived yet." unless reached.include?(state)

  @state = state
end
191
+
192
+ protected
193
+
194
# Default extract stage: does nothing and returns nil. CSV::ET overrides
# it with a real implementation.
def extract
  nil
end
197
+
198
# Default transform stage: does nothing and returns nil. CSV::ET overrides
# it with a real implementation.
def transform
  nil
end
201
+
202
# Default load stage: does nothing and returns nil. Note that, being
# defined on ETL, this takes precedence over Kernel#load for ETL instances.
def load
  nil
end
205
+
206
+ # Runs a callback, if there is one defined on the class. Advances the
207
+ # state to the next state. Silently ignores the request if the current
208
+ # state isn't the callback being asked for. In this way, we can just
209
+ # call etl_callback several times, and it will advance from one state to
210
+ # the next.
211
# callback - the callback/state name to run.
#
# Returns false without doing anything when the current state differs
# from +callback+; otherwise runs the callbacks and returns whatever
# advance_from returns.
def etl_callback(callback)
  if state == callback
    run_callbacks(callback)
    advance_from(callback)
  else
    false
  end
end
216
+
217
+ # Advances to the next state, only if we are in a valid state.
218
# callback - must be one of VALID_CALLBACKS, otherwise ArgumentError is
#            raised. It is only validated; the transition itself is
#            chosen from the current @state.
#
# Each state moves to its successor in the extract -> transform -> load
# sequence, :complete stays :complete, and an unrecognised @state becomes
# nil. The transition is logged at info level.
def advance_from(callback)
  unless VALID_CALLBACKS.include?(callback)
    raise ArgumentError, "State: #{callback} not recognized"
  end

  successor = {
    :before_extract   => :extract,
    :extract          => :after_extract,
    :after_extract    => :before_transform,
    :before_transform => :transform,
    :transform        => :after_transform,
    :after_transform  => :before_load,
    :before_load      => :load,
    :load             => :after_load,
    :after_load       => :complete,
    :complete         => :complete
  }

  previous = @state
  @state = successor[previous]

  self.class.logger.info "Advanced from #{previous} to #{@state}"
end
248
+
249
# Promotes the output of the stage that just finished: when @raw holds
# something, it becomes @data; @raw is then cleared for the next stage.
#
# The check is on the VALUE of @raw, not on whether the variable is
# defined. This method itself assigns @raw = nil, so after the first call
# the variable is always defined, and a defined?-based guard would
# overwrite @data with nil whenever a stage produced no new raw data
# (for example a no-op transform).
def process_raw_data
  @data = @raw unless @raw.nil?
  @raw = nil
end
253
+
254
# Builds the "<ClassName>.process(<options>)" text that #process logs as
# the way to re-run this job.
def show_command
  receiver = self.class.to_s
  arguments = show_parsed_options
  "#{receiver}.process(#{arguments})"
end
257
+
258
# Renders the options Hash as the argument list of a Ruby call, e.g.
#   :source => "data.csv", :extract_header => true
#
# Keys and values are rendered with #inspect and pairs are joined with
# ", ", so strings keep their quotes and the text can be pasted back into
# a process(...) call. Returns an empty String when there are no options.
def show_parsed_options
  (self.options || {}).map { |key, value| "#{key.inspect} => #{value.inspect}" }.join(", ")
end
272
+
273
+ end
@@ -0,0 +1,104 @@
1
+ # # This keeps the state of all observations in a bucket. An observation
2
+ # # is expected to be an OpenStruct with an occured_at field set. An
3
+ # # Observation class is provided in the helpers directory and is
4
+ # # automatically loaded with this gem. This is setup to work well in the
5
+ # # observable pattern.
6
+ #
7
+ # # Uses
8
+ # class TimeCachedOpenStruct
9
+ #
10
+ # attr_reader :tick_time
11
+ # attr_reader :keep_for
12
+ # attr_reader :cache
13
+ #
14
+ # def initialize(opts={})
15
+ # @tick_time = opts.fetch(:tick_time, 1)
16
+ # @keep_for = opts.fetch(:keep_for, self.tick_time * 100)
17
+ # @cache = ... (hash structure)
18
+ # end
19
+ #
20
+ # def at(time)
21
+ # self.cache[index_for(time)]
22
+ # end
23
+ #
24
+ # protected
25
+ # def index_for(time)
26
+ # ...
27
+ # end
28
+ #
29
+ # def round(time)
30
+ # end
31
+ # end
32
+ #
33
+ # require 'observable'
34
+ # class TimeBucket
35
+ #
36
+ # include Observer
37
+ #
38
+ # class << self
39
+ #
40
+ # # Works more like a multiton with subclasses. Each subclass gets their
41
+ # # own instance.
42
+ # def instance(opts={})
43
+ # instance = read_inheritable_attribute(:instance)
44
+ # return instance if instance
45
+ # instance = new(opts)
46
+ # write_inheritable_attribute(:instance, instance)
47
+ # instance
48
+ # end
49
+ # end
50
+ #
51
+ # # How often the state is broadcast
52
+ # attr_reader :tick_time
53
+ #
54
+ # # How long to wait for messages to be gathered in the bucket. If they
55
+ # # are not gathered by this time, they will never be broadcast.
56
+ # attr_reader :delay_time
57
+ #
58
+ # # The actual state data, a OpenStruct-based cache with a time-based
59
+ # # eviction_policy and a time-based accessor:
60
+ # # TimeBucket.bucket.at(time_object)
61
+ # attr_reader :bucket
62
+ #
63
+ # def initialize(opts={})
64
+ # @tick_time = opts.fetch(:tick_time, 1)
65
+ # @delay_time = opts.fetch(:delay_time, 0.5)
66
+ # keep_time = self.tick_time * 100 + self.delay_time
67
+ # @bucket = TimeCachedOpenStruct.new(:tick_time => self.tick_time, :keep_for => keep_time)
68
+ # end
69
+ #
70
+ # # To be called in its own process:
71
+ # # Process.fork { TimeBucket.instance(...).service(@etl) }
72
+ # # @etl is an object that responds to process and can load the consolidated data.
73
+ # def service(etl)
74
+ # sleep self.delay_time
75
+ # loop do
76
+ # changed
77
+ # notify_observers(self.bucket.at(Time.now - self.sleep_time))
78
+ # sleep self.tick_time
79
+ # end
80
+ # end
81
+ #
82
+ # def update(obj)
83
+ # observation = infer_observation(obj)
84
+ # self.bucket.merge!(observation)
85
+ # end
86
+ #
87
+ # protected
88
+ # def infer_observation(obj)
89
+ # if obj.respond_to?(occured_at)
90
+ # obj
91
+ # elsif obj.respond_to?(observation) and obj.observation.occured_at
92
+ # obj.observation
93
+ # elsif obj.is_a?(OpenStruct)
94
+ # obj.occured_at = Time.now
95
+ # obj
96
+ # elsif obj.is_a?(Hash)
97
+ # observation = Observation.new(obj)
98
+ # observation.occured_at = obj.fetch(:occured_at, Time.now)
99
+ # observation
100
+ # else
101
+ # nil
102
+ # end
103
+ # end
104
+ # end
data/lib/etl/xml_et.rb ADDED
@@ -0,0 +1,6 @@
1
module XML #:nodoc:

  # Extract/transform class for XML sources. It currently adds nothing to
  # ETL, so it inherits the no-op extract, transform and load stages. May
  # want to break this up into several utilities.
  class ET < ETL; end
end