davidrichards-etl 0.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +261 -0
- data/VERSION.yml +4 -0
- data/bin/etl +27 -0
- data/lib/all.rb +4 -0
- data/lib/etl/active_record_loader.rb +50 -0
- data/lib/etl/bucket.rb +148 -0
- data/lib/etl/csv_et.rb +64 -0
- data/lib/etl/etl.rb +273 -0
- data/lib/etl/time_bucket.rb +104 -0
- data/lib/etl/xml_et.rb +6 -0
- data/lib/etl.rb +36 -0
- data/lib/helpers/array.rb +11 -0
- data/lib/helpers/observation.rb +10 -0
- data/lib/helpers/open_struct.rb +18 -0
- data/lib/helpers/string.rb +6 -0
- data/lib/helpers/symbol.rb +6 -0
- data/spec/etl/bucket_spec.rb +112 -0
- data/spec/etl/csv_et_spec.rb +43 -0
- data/spec/etl/etl_spec.rb +237 -0
- data/spec/etl/xml_et_spec.rb +50 -0
- data/spec/etl_spec.rb +16 -0
- data/spec/fixtures/test_file.csv +3 -0
- data/spec/helpers/array_spec.rb +13 -0
- data/spec/helpers/observation_spec.rb +22 -0
- data/spec/helpers/open_struct_spec.rb +25 -0
- data/spec/helpers/string_spec.rb +8 -0
- data/spec/helpers/symbol_spec.rb +7 -0
- data/spec/spec_helper.rb +15 -0
- metadata +106 -0
data/lib/etl/csv_et.rb
ADDED
@@ -0,0 +1,64 @@
|
|
1
|
+
require 'open-uri'
require 'fastercsv'

module CSV

  # Extract and transform for CSV files: in context (as a string), from a
  # local file, or from a remote file. Uses FasterCSV and open-uri.
  #
  # Recognized options (via ETL#options):
  #   :source         => a local filename, a URI, or a raw CSV string
  #   :extract_header => when truthy, the first parsed row is shifted off
  #                      and exposed through #header
  #   :parse_with     => hash of FasterCSV options, merged over the
  #                      defaults (:converters => :all)
  class ET < ETL

    # The header row, populated after transform when :extract_header is set.
    attr_reader :header

    after_transform :get_header_conditionally

    protected

    # Pops the first parsed row into @header when :extract_header is set.
    def get_header_conditionally
      @header = @raw.shift if self.options[:extract_header]
    end

    # Attempts to get a string from a file, a uri, or a string.
    # Raises ArgumentError when no strategy produced any data.
    def extract
      obj = self.options.fetch(:source, nil)
      extract_locally(obj) or extract_remotely(obj) or extract_from_string(obj)
      raise ArgumentError, "Could not determine what #{obj.inspect} was. CSV::ET cannot work with this data." unless @raw
    end

    # Handles local filename cases, reading the contents of the file.
    # Guards the type first: File.exist? raises TypeError for nil and
    # other non-path objects, which would preempt the friendlier
    # ArgumentError raised by #extract.
    def extract_locally(filename)
      @raw = File.read(filename) if filename.is_a?(String) && File.exist?(filename)
      ET.logger.info "Extracted the data from the filesystem" if @raw
      @raw ? true : false
    end

    # Handles remote uri cases, reading the remote resource with open-uri,
    # part of the Standard Library. Returns false instead of raising so
    # extraction can fall through to the next strategy.
    def extract_remotely(uri)
      begin
        open(uri) {|f| @raw = f.read}
        ET.logger.info "Extracted the data from a remote location."
        return true
      rescue
        ET.logger.info "Tested whether #{uri} was a remote resource. Failed to read it."
        return false
      end
    end

    # If this is a string, assumes that the contents of the string are CSV contents.
    def extract_from_string(string)
      @raw = string if string.is_a?(String)
      @raw ? true : false
    end

    # Parses the extracted string (@data) with FasterCSV, staging the
    # parsed rows in @raw for the ETL state machine to promote.
    def transform
      opts = self.options.fetch(:parse_with, {})
      ET.logger.info "Parsing the data with FasterCSV and #{default_csv_opts.merge(opts).inspect}"
      @raw = FCSV.parse(@data, default_csv_opts.merge(opts))
    end

    # :converters => :all coerces numeric and date-like fields.
    def default_csv_opts; {:converters => :all}; end
  end

  # Try this out for size:
  # file = CSV::ET.process(:source => 'http://archive.ics.uci.edu/ml/machine-learning-databases/forest-fires/forestfires.csv')

end
|
data/lib/etl/etl.rb
ADDED
@@ -0,0 +1,273 @@
|
|
1
|
+
# State machine with useful callbacks for getting data (Extract,
# Transform, and Loading data) with some support for re-trying failed
# stages of the process. Raise errors liberally if things go wrong, the
# data is being staged and the process can usually be restarted once the
# issue has been addressed.

class ETL

  # Every state the machine can pass through, in processing order.
  VALID_STATES = [:before_extract, :extract, :after_extract, :before_transform, :transform, :after_transform, :before_load, :load, :after_load, :complete].freeze

  # The subset of states that have callbacks attached to them.
  VALID_CALLBACKS = [:before_extract, :after_extract, :before_transform, :after_transform, :before_load, :after_load, :complete].freeze

  # Because we want to interchange these steps on the queueing system
  if defined?(TeguGears) == 'constant'
    include TeguGears
  end

  # Using ActiveSupports callback system
  include ActiveSupport::Callbacks

  class << self

    # Builds an instance and runs it through the whole process, returning
    # the ETL instance (not the data) so callers can inspect #data, #state.
    def process(options={}, &block)
      etl = new
      etl.process(options, &block)
      etl
    end
    alias :call :process

    # Sets up a logger for the class. Respects inheritance, so a different
    # logger will be created for each ETL subclass.
    # Using the standard log levels here: DEBUG < INFO < WARN < ERROR < FATAL
    def logger

      logger_name = (self.to_s + "_logger").to_sym

      # Find and return the cached logger, if it's setup
      logger = read_inheritable_attribute(logger_name)
      return logger if logger

      # Create a logger. Will configure it here and save it in a moment.
      logger = Log4r::Logger.new(self.to_s)

      # Set my default output format
      format = Log4r::PatternFormatter.new(:pattern => "[%l] %d :: %m")

      # Setup a console logger with our formatting
      console = Log4r::StderrOutputter.new 'console'
      console.level = Log4r::WARN
      console.formatter = format

      # Setup a logger to a file with our formatting
      logfile = Log4r::FileOutputter.new('logfile',
        :filename => self.logger_filename,
        :trunc => false,
        :level => Log4r::DEBUG)
      logfile.formatter = format

      # Tell the logger about both outputs.
      logger.add('console','logfile')

      # Store the logger as an inheritable class attribute
      write_inheritable_attribute(logger_name, logger)

      # Return the logger
      logger
    end

    # First tries to get the cached @@logger_root
    # Second, sets the global @@logger_root unless it is cached. Sets it to
    # the best possible place to locate the logs:
    # 1) where log will be from RAILS_ROOT/vendor/gems/etl
    # 2) where log will be in a Rails model
    # 3) where log will be in a Rails lib
    # 4) in the local directory where ETL is being subclassed
    # Third, uses the subclasses stored logger_root, ignoring all the rest
    # if this is found.
    def logger_root
      @@logger_root ||= case
      when File.exist?(File.dirname(__FILE__) + "/../../../../../log")
        File.expand_path(File.dirname(__FILE__) + "/../../../../../log")
      when File.exist?(File.dirname(__FILE__) + "/../../log")
        File.expand_path(File.dirname(__FILE__) + '/../../log')
      when File.exist?(File.dirname(__FILE__) + "/../log")
        File.expand_path(File.dirname(__FILE__) + '/../log')
      when File.exist?(File.dirname(__FILE__) + "/log")
        File.expand_path(File.dirname(__FILE__) + '/log')
      else
        File.expand_path(File.dirname(__FILE__))
      end
      # The subclass-specific root wins over the global default.
      read_inheritable_attribute(:logger_root) || @@logger_root
    end

    # Sets the logger root for the subclass, and sets it globally if this is
    # set on ETL. So, ETL.logger_root = "some location" sets the logger
    # root for all subclasses. This is useful if a lot of ETL is being done,
    # and it needs to be logged in a non-standard place.
    def logger_root=(value)
      write_inheritable_attribute(:logger_root, value)
      @@logger_root = value if self == ETL
    end

    # Full path of this class's log file, one file per (sub)class.
    def logger_filename
      File.join(self.logger_root, "#{self.to_s}.log")
    end
  end

  # A series of callbacks that make the process quite transparent
  define_callbacks :before_extract, :after_extract, :before_transform, :after_transform, :before_load, :after_load

  def initialize
    @state = :before_extract
  end

  # The state of the transform process
  attr_reader :state

  # The data being worked on, after it has successfully completed an
  # extract, transform, or load process.
  attr_reader :data

  # The data generated on a process that didn't complete.
  attr_reader :raw

  # The options to process with. All your code will have access to these
  # options, so things like:
  #
  # :filename => '...', :destination => '...', :converters => :all
  #
  # would all be useful. Your extract, transform, and load methods
  # plus your callbacks can then extract out the information they need
  # to get the job done.
  attr_reader :options

  # An optional block to process with
  attr_reader :block

  # Working towards a universal workflow driver here. The signature is
  # just a hash and a block. That should work for about anything.
  # Safe to call repeatedly: each stage only runs when the machine is in
  # the matching state, so a failed run can be resumed.
  def process(options={}, &block)
    # Only setup the options the first time, the other times we are re-
    # starting the process.
    @options = options unless @options
    @block = block

    self.class.logger.info "Processing #{self.class.to_s}"
    self.class.logger.info "To re-run this process, run: #{self.show_command}"
    self.class.logger.info "Note: Also pass the same block to #{self.class.to_s}" if block

    etl_callback(:before_extract)

    if @state == :extract
      extract
      @state = :after_extract
    end

    etl_callback(:after_extract)

    # To be sure this is after all after_extract callbacks
    process_raw_data

    etl_callback(:before_transform)

    if @state == :transform
      transform
      @state = :after_transform
    end

    etl_callback(:after_transform)

    # To be sure this is after all after_tranform callbacks
    process_raw_data

    etl_callback(:before_load)

    if @state == :load
      load
      @state = :after_load
    end

    etl_callback(:after_load)
    @state
  end

  # Rewinds the state machine to an earlier state so part of the process
  # can be re-run. Raises if the requested state isn't one of
  # VALID_STATES, or if the machine hasn't reached that state yet.
  def reverse_to(state)
    raise ArgumentError, "State must be one of #{VALID_STATES.inspect}" unless VALID_STATES.include?(state)
    loc = VALID_STATES.index(state)
    # Compare against the CURRENT state: the original check tested the
    # target against a list built from the target itself, which was
    # always true, so the guard could never fire.
    achieved = VALID_STATES[0..VALID_STATES.index(@state)]
    raise "Cannot reverse to a state that hasn't been achieved yet." unless achieved.include?(state)
    @state = state
  end

  protected

  def extract
    # Silently do nothing
  end

  def transform
    # Silently do nothing
  end

  def load
    # Silently do nothing
  end

  # Runs a callback, if there is one defined on the class. Advances the
  # state to the next state. Silently ignores the request if the current
  # state isn't the callback being asked for. In this way, we can just
  # call etl_callback several times, and it will advance from one state to
  # the next.
  def etl_callback(callback)
    return false unless self.state == callback
    run_callbacks(callback)
    advance_from(callback)
  end

  # Advances to the next state, only if we are in a valid state.
  def advance_from(callback)
    raise ArgumentError, "State: #{callback} not recognized" unless VALID_CALLBACKS.include?(callback)
    before_state = @state
    # Each state simply advances to its successor in VALID_STATES;
    # :complete is terminal and stays :complete.
    current = VALID_STATES.index(@state)
    @state = VALID_STATES[[current + 1, VALID_STATES.length - 1].min]
    self.class.logger.info "Advanced from #{before_state} to #{@state}"
  end

  # Promotes staged data (@raw) to finished data (@data) and clears the
  # staging area. Only promotes when something was actually staged: the
  # old `defined?(@raw)` test stayed truthy after `@raw = nil`, so a
  # stage with a no-op transform/load clobbered @data with nil.
  def process_raw_data
    @data = @raw unless @raw.nil?
    @raw = nil
  end

  # The command a user could paste to re-run this process.
  def show_command
    "#{self.class.to_s}.process(#{show_parsed_options})"
  end

  # Renders the options hash as Ruby-ish source. Entries are joined with
  # ", " — the original accumulated them with no separator, producing an
  # unusable command for more than one option.
  def show_parsed_options
    self.options.map do |key, value|
      k = key.is_a?(Symbol) ? ":#{key}" : key.to_s
      v = value.is_a?(Symbol) ? ":#{value}" : value.to_s
      "#{k} => #{v}"
    end.join(", ")
  end

end
|
@@ -0,0 +1,104 @@
|
|
1
|
+
# NOTE(review): everything below is commented-out sketch code shipped
# with the gem — it is never executed. Review notes are inline.
#
# # This keeps the state of all observations in a bucket. An observation
# # is expected to be an OpenStruct with an occured_at field set. An
# # Observation class is provided in the helpers directory and is
# # automatically loaded with this gem. This is setup to work well in the
# # observable pattern.
#
# # Uses
# class TimeCachedOpenStruct
#
#   attr_reader :tick_time
#   attr_reader :keep_for
#   attr_reader :cache
#
#   def initialize(opts={})
#     @tick_time = opts.fetch(:tick_time, 1)
#     @keep_for = opts.fetch(:keep_for, self.tick_time * 100)
#     @cache = ... (hash structure)
#   end
#
#   def at(time)
#     self.cache[index_for(time)]
#   end
#
#   protected
#   def index_for(time)
#     ...
#   end
#
#   def round(time)
#   end
# end
#
# require 'observable'
# class TimeBucket
#
#   include Observer
#
#   class << self
#
#     # Works more like a multiton with subclasses. Each subclass gets their
#     # own instance.
#     def instance(opts={})
#       instance = read_inheritable_attribute(:instance)
#       return instance if instance
#       instance = new(opts)
#       write_inheritable_attribute(:instance, instance)
#       instance
#     end
#   end
#
#   # How often the state is broadcast
#   attr_reader :tick_time
#
#   # How long to wait for messages to be gathered in the bucket. If they
#   # are not gathered by this time, they will never be broadcast.
#   attr_reader :delay_time
#
#   # The actual state data, a OpenStruct-based cache with a time-based
#   # eviction_policy and a time-based accessor:
#   # TimeBucket.bucket.at(time_object)
#   attr_reader :bucket
#
#   def initialize(opts={})
#     @tick_time = opts.fetch(:tick_time, 1)
#     @delay_time = opts.fetch(:delay_time, 0.5)
#     keep_time = self.tick_time * 100 + self.delay_time
#     @bucket = TimeCachedOpenStruct.new(:tick_time => self.tick_time, :keep_for => keep_time)
#   end
#
#   # To be called in its own process:
#   # Process.fork { TimeBucket.instance(...).service(@etl) }
#   # @etl is an object that responds to process and can load the consolidated data.
#   def service(etl)
#     sleep self.delay_time
#     loop do
#       changed
#       # NOTE(review): `self.sleep_time` is not defined anywhere in this
#       # sketch — presumably `self.delay_time` was intended; confirm
#       # before resurrecting this code.
#       notify_observers(self.bucket.at(Time.now - self.sleep_time))
#       sleep self.tick_time
#     end
#   end
#
#   def update(obj)
#     observation = infer_observation(obj)
#     self.bucket.merge!(observation)
#   end
#
#   protected
#   def infer_observation(obj)
#     # NOTE(review): `respond_to?(occured_at)` and `respond_to?(observation)`
#     # are missing symbol literals — as written they would call the methods
#     # rather than ask about them; should be `respond_to?(:occured_at)` etc.
#     # (The `occured_at` spelling matches the gem's Observation helper.)
#     if obj.respond_to?(occured_at)
#       obj
#     elsif obj.respond_to?(observation) and obj.observation.occured_at
#       obj.observation
#     elsif obj.is_a?(OpenStruct)
#       obj.occured_at = Time.now
#       obj
#     elsif obj.is_a?(Hash)
#       observation = Observation.new(obj)
#       observation.occured_at = obj.fetch(:occured_at, Time.now)
#       observation
#     else
#       nil
#     end
#   end
# end
|