davidrichards-etl 0.0.4
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +261 -0
- data/VERSION.yml +4 -0
- data/bin/etl +27 -0
- data/lib/all.rb +4 -0
- data/lib/etl/active_record_loader.rb +50 -0
- data/lib/etl/bucket.rb +148 -0
- data/lib/etl/csv_et.rb +64 -0
- data/lib/etl/etl.rb +273 -0
- data/lib/etl/time_bucket.rb +104 -0
- data/lib/etl/xml_et.rb +6 -0
- data/lib/etl.rb +36 -0
- data/lib/helpers/array.rb +11 -0
- data/lib/helpers/observation.rb +10 -0
- data/lib/helpers/open_struct.rb +18 -0
- data/lib/helpers/string.rb +6 -0
- data/lib/helpers/symbol.rb +6 -0
- data/spec/etl/bucket_spec.rb +112 -0
- data/spec/etl/csv_et_spec.rb +43 -0
- data/spec/etl/etl_spec.rb +237 -0
- data/spec/etl/xml_et_spec.rb +50 -0
- data/spec/etl_spec.rb +16 -0
- data/spec/fixtures/test_file.csv +3 -0
- data/spec/helpers/array_spec.rb +13 -0
- data/spec/helpers/observation_spec.rb +22 -0
- data/spec/helpers/open_struct_spec.rb +25 -0
- data/spec/helpers/string_spec.rb +8 -0
- data/spec/helpers/symbol_spec.rb +7 -0
- data/spec/spec_helper.rb +15 -0
- metadata +106 -0
data/lib/etl/csv_et.rb
ADDED
@@ -0,0 +1,64 @@
|
|
1
|
+
require 'open-uri'
require 'fastercsv'

module CSV

  # Extract and transform for CSV files: in context (as a string), from a
  # local file, or from a remote file. Uses FasterCSV and open-uri.
  #
  # Recognized options (via the inherited ETL options hash):
  #   :source         => filename, URI, or raw CSV string to extract
  #   :extract_header => when truthy, the first parsed row is shifted off
  #                      and exposed through #header after the transform
  #   :parse_with     => FasterCSV options merged over #default_csv_opts
  class ET < ETL

    # The first parsed row, captured when :extract_header was requested.
    attr_reader :header

    after_transform :get_header_conditionally

    protected

    # Shifts the first row of the parsed data into @header, if requested.
    def get_header_conditionally
      @header = @raw.shift if self.options[:extract_header]
    end

    # Attempts to get a string from a file, a uri, or a string.
    # Raises ArgumentError when no extraction strategy produced data.
    def extract
      obj = self.options.fetch(:source, nil)
      extract_locally(obj) or extract_remotely(obj) or extract_from_string(obj)
      raise ArgumentError, "Could not determine what #{obj.inspect} was. CSV::ET cannot work with this data." unless @raw
    end

    # Handles local filename cases, reading the contents of the file.
    # Guards against non-String sources (e.g. nil): File.exist? raises a
    # TypeError on non-String arguments, which would abort extraction
    # before the remote and in-context strategies had a chance to run.
    def extract_locally(filename)
      @raw = File.read(filename) if filename.is_a?(String) and File.exist?(filename)
      ET.logger.info "Extracted the data from the filesystem" if @raw
      @raw ? true : false
    end

    # Handles remote uri cases, reading the remote resource with open-uri, part of the Standard Library
    def extract_remotely(uri)
      begin
        open(uri) {|f| @raw = f.read}
        ET.logger.info "Extracted the data from a remote location."
        return true
      rescue
        ET.logger.info "Tested whether #{uri} was a remote resource. Failed to read it."
        return false
      end
    end

    # If this is a string, assumes that the contents of the string are CSV contents.
    def extract_from_string(string)
      @raw = string if string.is_a?(String)
      @raw ? true : false
    end

    # Parses the extracted string with FasterCSV, merging any caller-supplied
    # :parse_with options over the defaults.
    def transform
      opts = self.options.fetch(:parse_with, {})
      ET.logger.info "Parsing the data with FasterCSV and #{default_csv_opts.merge(opts).inspect}"
      @raw = FCSV.parse(@data, default_csv_opts.merge(opts))
    end

    # Default FasterCSV options: auto-convert numeric and date-like fields.
    def default_csv_opts; {:converters => :all}; end
  end

  # Try this out for size:
  # file = CSV::ET.process(:source => 'http://archive.ics.uci.edu/ml/machine-learning-databases/forest-fires/forestfires.csv')

end
|
data/lib/etl/etl.rb
ADDED
@@ -0,0 +1,273 @@
|
|
1
|
+
# State machine with useful callbacks for getting data (Extract,
# Transform, and Loading data) with some support for re-trying failed
# stages of the process. Raise errors liberally if things go wrong, the
# data is being staged and the process can usually be restarted once the
# issue has been addressed.

class ETL

  # Every state of the workflow, in processing order.
  VALID_STATES = [:before_extract, :extract, :after_extract, :before_transform, :transform, :after_transform, :before_load, :load, :after_load, :complete].freeze

  # The states that may carry user-registered callbacks.
  VALID_CALLBACKS = [:before_extract, :after_extract, :before_transform, :after_transform, :before_load, :after_load, :complete].freeze

  # Because we want to interchange these steps on the queueing system
  if defined?(TeguGears) == 'constant'
    include TeguGears
  end

  # Using ActiveSupports callback system
  include ActiveSupport::Callbacks

  class << self

    # Convenience driver: builds an instance, runs the whole workflow with
    # the given options and optional block, and returns the instance.
    def process(options={}, &block)
      etl = new
      etl.process(options, &block)
      etl
    end
    alias :call :process

    # Sets up a logger for the class. Respects inheritance, so a different
    # logger will be created for each ETL subclass.
    # Using the standard log levels here: DEBUG < INFO < WARN < ERROR < FATAL
    def logger

      logger_name = (self.to_s + "_logger").to_sym

      # Find and return the cached logger, if it's setup
      logger = read_inheritable_attribute(logger_name)
      return logger if logger

      # Create a logger. Will configure it here and save it in a moment.
      logger = Log4r::Logger.new(self.to_s)

      # Set my default output format
      format = Log4r::PatternFormatter.new(:pattern => "[%l] %d :: %m")

      # Setup a console logger with our formatting
      console = Log4r::StderrOutputter.new 'console'
      console.level = Log4r::WARN
      console.formatter = format

      # Setup a logger to a file with our formatting
      logfile = Log4r::FileOutputter.new('logfile',
        :filename => self.logger_filename,
        :trunc => false,
        :level => Log4r::DEBUG)
      logfile.formatter = format

      # Tell the logger about both outputs.
      logger.add('console','logfile')

      # Store the logger as an inheritable class attribute
      write_inheritable_attribute(logger_name, logger)

      # Return the logger
      logger
    end

    # First tries to get the cached @@logger_root
    # Second, sets the global @@logger_root unless it is cached. Sets it to
    # the best possible place to locate the logs:
    # 1) where log will be from RAILS_ROOT/vendor/gems/etl
    # 2) where log will be in a Rails model
    # 3) where log will be in a Rails lib
    # 4) in the local directory where ETL is being subclassed
    # Third, uses the subclasses stored logger_root, ignoring all the rest
    # if this is found.
    def logger_root
      @@logger_root ||= case
      when File.exist?(File.dirname(__FILE__) + "/../../../../../log")
        File.expand_path(File.dirname(__FILE__) + "/../../../../../log")
      when File.exist?(File.dirname(__FILE__) + "/../../log")
        File.expand_path(File.dirname(__FILE__) + '/../../log')
      when File.exist?(File.dirname(__FILE__) + "/../log")
        File.expand_path(File.dirname(__FILE__) + '/../log')
      when File.exist?(File.dirname(__FILE__) + "/log")
        File.expand_path(File.dirname(__FILE__) + '/log')
      else
        File.expand_path(File.dirname(__FILE__))
      end
      # A subclass's stored logger_root wins over the global default.
      read_inheritable_attribute(:logger_root) || @@logger_root
    end

    # Sets the logger root for the subclass, and sets it globally if this is
    # set on ETL. So, ETL.logger_root = "some location" sets the logger
    # root for all subclasses. This is useful if a lot of ETL is being done,
    # and it needs to be logged in a non-standard place.
    def logger_root=(value)
      write_inheritable_attribute(:logger_root, value)
      @@logger_root = value if self == ETL
    end

    # Full path of this class's log file, under logger_root.
    def logger_filename
      File.join(self.logger_root, "#{self.to_s}.log")
    end
  end

  # A series of callbacks that make the process quite transparent
  define_callbacks :before_extract, :after_extract, :before_transform, :after_transform, :before_load, :after_load

  def initialize
    @state = :before_extract
  end

  # The state of the transform process
  attr_reader :state

  # The data being worked on, after it has successfully completed an
  # extract, transform, or load process.
  attr_reader :data

  # The data generated on a process that didn't complete.
  attr_reader :raw

  # The options to process with. All your code will have access to these
  # options, so things like:
  #
  # :filename => '...', :destination => '...', :converters => :all
  #
  # would all be useful. Your extract, transform, and load methods
  # plus your callbacks can then extract out the information they need
  # to get the job done.
  attr_reader :options

  # An optional block to process with
  attr_reader :block

  # Working towards a universal workflow driver here. The signature is
  # just a hash and a block. That should work for about anything.
  def process(options={}, &block)
    # Only setup the options the first time, the other times we are re-
    # starting the process.
    @options = options unless @options
    @block = block

    self.class.logger.info "Processing #{self.class.to_s}"
    self.class.logger.info "To re-run this process, run: #{self.show_command}"
    self.class.logger.info "Note: Also pass the same block to #{self.class.to_s}" if block

    etl_callback(:before_extract)

    if @state == :extract
      extract
      @state = :after_extract
    end

    etl_callback(:after_extract)

    # To be sure this is after all after_extract callbacks
    process_raw_data

    etl_callback(:before_transform)

    if @state == :transform
      transform
      @state = :after_transform
    end

    etl_callback(:after_transform)

    # To be sure this is after all after_tranform callbacks
    process_raw_data

    etl_callback(:before_load)

    if @state == :load
      load
      @state = :after_load
    end

    etl_callback(:after_load)
    @state
  end

  # Rewinds the state machine to an earlier, already-achieved state so a
  # failed stage can be re-run. Raises ArgumentError for unknown states and
  # RuntimeError when asked to "reverse" forward past the current state.
  def reverse_to(state)
    raise ArgumentError, "State must be one of #{VALID_STATES.inspect}" unless VALID_STATES.include?(state)
    # The achieved states are those up to and including the *current* state,
    # so the membership test must be anchored on @state, not on the target
    # (anchoring on the target made the guard tautologically true).
    achieved_states = VALID_STATES[0..(VALID_STATES.index(@state) || 0)]
    raise "Cannot reverse to a state that hasn't been achieved yet." unless achieved_states.include?(state)
    @state = state
  end

  protected

  # Extract stage hook; subclasses override this to acquire data into @raw.
  def extract
    # Silently do nothing
  end

  # Transform stage hook; subclasses override this to reshape @data into @raw.
  def transform
    # Silently do nothing
  end

  # Load stage hook; subclasses override this to deliver @data somewhere.
  def load
    # Silently do nothing
  end

  # Runs a callback, if there is one defined on the class. Advances the
  # state to the next state. Silently ignores the request if the current
  # state isn't the callback being asked for. In this way, we can just
  # call etl_callback several times, and it will advance from one state to
  # the next.
  def etl_callback(callback)
    return false unless self.state == callback
    run_callbacks(callback)
    advance_from(callback)
  end

  # Advances to the next state, only if we are in a valid state.
  def advance_from(callback)
    raise ArgumentError, "State: #{callback} not recognized" unless VALID_CALLBACKS.include?(callback)
    before_state = @state
    # Each state's successor is simply the next entry in VALID_STATES;
    # :complete is terminal and stays put.
    current = VALID_STATES.index(@state)
    @state = VALID_STATES[current + 1] || :complete if current
    self.class.logger.info "Advanced from #{before_state} to #{@state}"
  end

  # Promotes staged @raw data to @data. Only overwrite @data when a stage
  # actually produced raw output: a no-op stage (the default extract /
  # transform / load) must not clobber previously promoted data with nil,
  # which the old defined?(@raw) test allowed once @raw had ever been set.
  def process_raw_data
    @data = @raw if @raw
    @raw = nil
  end

  # The command a user could run to reproduce this process.
  def show_command
    "#{self.class.to_s}.process(#{show_parsed_options})"
  end

  # Renders the options hash as Ruby-ish source. Pairs are joined with
  # commas so the logged "re-run this" command is actually valid syntax.
  def show_parsed_options
    self.options.map do |key, value|
      formatted_key = key.is_a?(Symbol) ? ":#{key}" : key.to_s
      formatted_value = value.is_a?(Symbol) ? ":#{value}" : value.to_s
      "#{formatted_key} => #{formatted_value}"
    end.join(", ")
  end

end
|
@@ -0,0 +1,104 @@
|
|
1
|
+
# # This keeps the state of all observations in a bucket. An observation
|
2
|
+
# # is expected to be an OpenStruct with an occured_at field set. An
|
3
|
+
# # Observation class is provided in the helpers directory and is
|
4
|
+
# # automatically loaded with this gem. This is setup to work well in the
|
5
|
+
# # observable pattern.
|
6
|
+
#
|
7
|
+
# # Uses
|
8
|
+
# class TimeCachedOpenStruct
|
9
|
+
#
|
10
|
+
# attr_reader :tick_time
|
11
|
+
# attr_reader :keep_for
|
12
|
+
# attr_reader :cache
|
13
|
+
#
|
14
|
+
# def initialize(opts={})
|
15
|
+
# @tick_time = opts.fetch(:tick_time, 1)
|
16
|
+
# @keep_for = opts.fetch(:keep_for, self.tick_time * 100)
|
17
|
+
# @cache = ... (hash structure)
|
18
|
+
# end
|
19
|
+
#
|
20
|
+
# def at(time)
|
21
|
+
# self.cache[index_for(time)]
|
22
|
+
# end
|
23
|
+
#
|
24
|
+
# protected
|
25
|
+
# def index_for(time)
|
26
|
+
# ...
|
27
|
+
# end
|
28
|
+
#
|
29
|
+
# def round(time)
|
30
|
+
# end
|
31
|
+
# end
|
32
|
+
#
|
33
|
+
# require 'observable'
|
34
|
+
# class TimeBucket
|
35
|
+
#
|
36
|
+
# include Observer
|
37
|
+
#
|
38
|
+
# class << self
|
39
|
+
#
|
40
|
+
# # Works more like a multiton with subclasses. Each subclass gets their
|
41
|
+
# # own instance.
|
42
|
+
# def instance(opts={})
|
43
|
+
# instance = read_inheritable_attribute(:instance)
|
44
|
+
# return instance if instance
|
45
|
+
# instance = new(opts)
|
46
|
+
# write_inheritable_attribute(:instance, instance)
|
47
|
+
# instance
|
48
|
+
# end
|
49
|
+
# end
|
50
|
+
#
|
51
|
+
# # How often the state is broadcast
|
52
|
+
# attr_reader :tick_time
|
53
|
+
#
|
54
|
+
# # How long to wait for messages to be gathered in the bucket. If they
|
55
|
+
# # are not gathered by this time, they will never be broadcast.
|
56
|
+
# attr_reader :delay_time
|
57
|
+
#
|
58
|
+
# # The actual state data, a OpenStruct-based cache with a time-based
|
59
|
+
# # eviction_policy and a time-based accessor:
|
60
|
+
# # TimeBucket.bucket.at(time_object)
|
61
|
+
# attr_reader :bucket
|
62
|
+
#
|
63
|
+
# def initialize(opts={})
|
64
|
+
# @tick_time = opts.fetch(:tick_time, 1)
|
65
|
+
# @delay_time = opts.fetch(:delay_time, 0.5)
|
66
|
+
# keep_time = self.tick_time * 100 + self.delay_time
|
67
|
+
# @bucket = TimeCachedOpenStruct.new(:tick_time => self.tick_time, :keep_for => keep_time)
|
68
|
+
# end
|
69
|
+
#
|
70
|
+
# # To be called in its own process:
|
71
|
+
# # Process.fork { TimeBucket.instance(...).service(@etl) }
|
72
|
+
# # @etl is an object that responds to process and can load the consolidated data.
|
73
|
+
# def service(etl)
|
74
|
+
# sleep self.delay_time
|
75
|
+
# loop do
|
76
|
+
# changed
|
77
|
+
# notify_observers(self.bucket.at(Time.now - self.delay_time))
|
78
|
+
# sleep self.tick_time
|
79
|
+
# end
|
80
|
+
# end
|
81
|
+
#
|
82
|
+
# def update(obj)
|
83
|
+
# observation = infer_observation(obj)
|
84
|
+
# self.bucket.merge!(observation)
|
85
|
+
# end
|
86
|
+
#
|
87
|
+
# protected
|
88
|
+
# def infer_observation(obj)
|
89
|
+
# if obj.respond_to?(:occured_at)
|
90
|
+
# obj
|
91
|
+
# elsif obj.respond_to?(:observation) and obj.observation.occured_at
|
92
|
+
# obj.observation
|
93
|
+
# elsif obj.is_a?(OpenStruct)
|
94
|
+
# obj.occured_at = Time.now
|
95
|
+
# obj
|
96
|
+
# elsif obj.is_a?(Hash)
|
97
|
+
# observation = Observation.new(obj)
|
98
|
+
# observation.occured_at = obj.fetch(:occured_at, Time.now)
|
99
|
+
# observation
|
100
|
+
# else
|
101
|
+
# nil
|
102
|
+
# end
|
103
|
+
# end
|
104
|
+
# end
|