davidrichards-etl 0.0.4
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +261 -0
- data/VERSION.yml +4 -0
- data/bin/etl +27 -0
- data/lib/all.rb +4 -0
- data/lib/etl/active_record_loader.rb +50 -0
- data/lib/etl/bucket.rb +148 -0
- data/lib/etl/csv_et.rb +64 -0
- data/lib/etl/etl.rb +273 -0
- data/lib/etl/time_bucket.rb +104 -0
- data/lib/etl/xml_et.rb +6 -0
- data/lib/etl.rb +36 -0
- data/lib/helpers/array.rb +11 -0
- data/lib/helpers/observation.rb +10 -0
- data/lib/helpers/open_struct.rb +18 -0
- data/lib/helpers/string.rb +6 -0
- data/lib/helpers/symbol.rb +6 -0
- data/spec/etl/bucket_spec.rb +112 -0
- data/spec/etl/csv_et_spec.rb +43 -0
- data/spec/etl/etl_spec.rb +237 -0
- data/spec/etl/xml_et_spec.rb +50 -0
- data/spec/etl_spec.rb +16 -0
- data/spec/fixtures/test_file.csv +3 -0
- data/spec/helpers/array_spec.rb +13 -0
- data/spec/helpers/observation_spec.rb +22 -0
- data/spec/helpers/open_struct_spec.rb +25 -0
- data/spec/helpers/string_spec.rb +8 -0
- data/spec/helpers/symbol_spec.rb +7 -0
- data/spec/spec_helper.rb +15 -0
- metadata +106 -0
data/lib/etl.rb
ADDED
@@ -0,0 +1,36 @@
|
|
1
|
+
# TODO:
|
2
|
+
# Test this (a bit of a bugger, because I need to fail at every callback and make sure that I can recover.)
|
3
|
+
# Get the logging done and demonstrated, because an ETL process without good logging really is useless.
|
4
|
+
# Include a logging example for syslog-ng and syslog
|
5
|
+
# Work through some bucket thoughts that I was having this morning: how to take random percepts and create consolidated snapshots of an environment at a point in time. This is driven from the belief maintenance systems, but certainly needs to be worked out.
|
6
|
+
# Figure out if TeguGears really should be doing this. Come back to how I'll parallelize this process. Demonstrate running this in parallel.
|
7
|
+
|
8
|
+
require 'rubygems'
|
9
|
+
require 'activesupport'
|
10
|
+
require 'ostruct'
|
11
|
+
require 'log4r'
|
12
|
+
require 'fileutils'
|
13
|
+
|
14
|
+
# Tries to activate and require an optional gem, ignoring it when unavailable.
#
# name - String name of the gem, which is also the path handed to require.
#
# Returns whatever `require` returns when the gem loads, or nil when the gem
# (or its main file) cannot be found.
def load_gem_casually(name)
  gem name
  require name
rescue LoadError
  # Do nothing if this is not available. It's a convenience, not a requirement.
  # Gem::LoadError (raised by `gem`) is a subclass of LoadError, so this single
  # clause also covers a plain LoadError raised by `require`.
  nil
end
|
22
|
+
|
23
|
+
# Optional integrations: loaded when installed, silently skipped otherwise.
load_gem_casually('tegu_gears')
load_gem_casually('data_frame')
load_gem_casually('babel_icious')

# Absolute path of the directory holding this file.
etl_lib_dir = File.expand_path(File.dirname(__FILE__))

# Require every helper. The list is sorted because Dir.glob does not guarantee
# an order on every Ruby version/platform, and a stable order keeps loading
# reproducible.
Dir.glob(File.join(etl_lib_dir, 'helpers', '*.rb')).sort.each { |file| require file }

# Make 'etl/...' requirable; skip the entry if it is already on the load path.
$LOAD_PATH.unshift(etl_lib_dir) unless $LOAD_PATH.include?(etl_lib_dir)
|
30
|
+
|
31
|
+
# Error type for the extract stage.
class ExtractError < StandardError
end

# Error type for the transform stage.
class TransformError < StandardError
end

# Error type for the load stage.
# Note, LoadError is already used (by Ruby itself), hence the different name.
class LoadingError < StandardError
end
|
35
|
+
|
36
|
+
require 'etl/etl'
|
@@ -0,0 +1,11 @@
|
|
1
|
+
class Array # :nodoc:
  # Returns a new array holding each element's to_underscore_sym value.
  # From Carl Youngblood's excellent SBN gem: http://sbn.rubyforge.org/
  def symbolize_values
    map { |element| element.to_underscore_sym }
  end

  # In-place variant: replaces each element with its to_underscore_sym value
  # and returns the receiver.
  # From Carl Youngblood's excellent SBN gem: http://sbn.rubyforge.org/
  def symbolize_values!
    map! { |element| element.to_underscore_sym }
  end
end
|
@@ -0,0 +1,10 @@
|
|
1
|
+
# Generic OpenStruct that stamps occured_at with the time of initialization.
class Observation < OpenStruct
  # Need to know when the observation was recorded to batch observations
  attr_accessor :occured_at

  # Records the current time, then passes every argument on to OpenStruct.
  def initialize(*args)
    @occured_at = Time.now
    super
  end
end
|
@@ -0,0 +1,112 @@
|
|
1
|
+
require File.join(File.dirname(__FILE__), "/../spec_helper")
|
2
|
+
require 'etl/bucket'
|
3
|
+
|
4
|
+
# Specification for Bucket: adding records from hashes, OpenStructs and
# Structs, filtering, dumping, and converting the contents to other objects.
describe Bucket do

  before(:all) do
    # Simple class that keeps whatever arguments it was constructed with.
    class A
      def initialize(*args)
        @value = args
      end
      attr_reader :value
    end

    S = Struct.new(:this)
  end

  before do
    @b = Bucket.new
    @h = {:this => 1}
    @o = OpenStruct.new(:this => 1)
    @s = S.new(1)
    @b1 = Bucket.new(@h)
  end

  it "should create a hash for storing raw, unordered data" do
    @b.raw_data.should be_is_a(Hash)
  end

  it "should be able to add a record with a hash" do
    @b.add(@h)
    @b.filtered_data.should == @h
  end

  it "should be able to add a record with an OpenStruct" do
    @b.add(@o)
    @b.filtered_data.should == @h
  end

  it "should be able to add a record with a Struct" do
    @b.add(@s)
    @b.filtered_data.should == @h
  end

  it "should be able to override values" do
    @b.add(@h)
    @b.add(:this => 2)
    @b.filtered_data.should == {:this => 2}
  end

  it "should create a way to setup labels" do
    a = [:three, :two, :one]
    @b.labels = a
    # each (not all?) so every label is checked no matter what `should` returns.
    @b.labels.each { |l| a.should be_include(l) }
  end

  it "should be constructable with a hash" do
    b = Bucket.new(@h)
    b.filtered_data.should == @h
  end

  it "should be constructable with an OpenStruct" do
    b = Bucket.new(@o)
    b.filtered_data.should == @h
  end

  it "should be constructable with a Struct" do
    b = Bucket.new(@s)
    b.filtered_data.should == @h
  end

  it "should be able to dump the contents of the bucket" do
    @b1.dump.should == @h
    @b1.raw_data.should == {}
  end

  it "should be able to take an arbitrary filter" do
    b = Bucket.new(@h) { |_h| :not_the_data }
    b.raw_data.should == @h
    b.filtered_data.should eql(:not_the_data)
  end

  it "should be able to return an array" do
    @b1.to_a.should eql([1])
  end

  it "should be able to return a hash" do
    @b1.to_hash.should == @h
  end

  it "should be able to return any object that initializes with the bucket values" do
    a = @b1.to_obj(A)
    a.value.should eql(@b1.to_a)
  end

  it "should be able to return a Struct" do
    s = @b1.to_struct(S)
    s.this.should eql(1)
  end

  it "should be able to return an OpenStruct" do
    o = @b1.to_open_struct
    o.table.should == @h
  end

  it "should be able to constrain and order keys, silently ignoring data that isn't white listed" do
    h = {:ones => 1, :twos => 2, :threes => 3}
    @b.white_list = [:ones, :twos, :threes]
    @b.add :ones => 1, :twos => 2, :threes => 3, :fours => 4
    @b.filtered_data.should == h
    @b.to_a.should eql([1,2,3])
  end
end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
require File.join(File.dirname(__FILE__), "/../spec_helper")
|
2
|
+
require 'etl/csv_et'
|
3
|
+
|
4
|
+
# Specification for CSV::ET: extracting CSV from a file path or from raw CSV
# text, optional header extraction, and passing parser options through.
describe CSV::ET do

  before do
    @csv_file = File.expand_path("#{File.dirname(__FILE__)}/../fixtures/test_file.csv")
  end

  it "should be able to transform a csv file into an array of arrays" do
    @etl = CSV::ET.process(:source => @csv_file)
    @etl.data.should be_is_a(Array)
    @etl.data.size.should eql(3)
    @etl.data.first.should eql(["some", "data", "here"])
    @etl.data.last.should eql([4,5,6])
  end

  it "should be able to transform csv data into an array of arrays" do
    content = File.read(@csv_file)
    @etl = CSV::ET.process(:source => content)
    @etl.data.should be_is_a(Array)
    @etl.data.size.should eql(3)
    @etl.data.first.should eql(["some", "data", "here"])
    @etl.data.last.should eql([4,5,6])
  end

  it "should be able to pull the header out of the extracted data" do
    @etl = CSV::ET.process(:source => @csv_file, :extract_header => true)
    @etl.header.should eql(["some", "data", "here"])
    @etl.data.first.should eql([1,2,3])
  end

  it "should be able to use the FasterCSV options" do
    FasterCSV::Converters[:foo] = lambda { |_f| :foo }
    begin
      @etl = CSV::ET.process(
        :source => @csv_file,
        :extract_header => true,
        :parse_with => {:converters => :foo}
      )
      @etl.header.should eql([:foo, :foo, :foo])
      @etl.data.first.should eql([:foo, :foo, :foo])
    ensure
      # Unregister the throwaway converter so it cannot leak into other examples.
      FasterCSV::Converters.delete(:foo)
    end
  end
end
|
@@ -0,0 +1,237 @@
|
|
1
|
+
require File.join(File.dirname(__FILE__), "/../spec_helper")
|
2
|
+
|
3
|
+
# Specification for ETL: the list of valid states, the class-level entry
# points and logger setup, and how a run moves through its stages.
describe ETL do

  # Remove the log file written while the examples ran.
  after(:all) do
    FileUtils.rm_f(ETL.logger_filename)
  end

  it "should have a series of valid states" do
    ETL::VALID_STATES.should eql([
      :before_extract,
      :extract,
      :after_extract,
      :before_transform,
      :transform,
      :after_transform,
      :before_load,
      :load,
      :after_load,
      :complete
    ])
  end

  context "Class Methods" do
    it "should be able to process the ETL class" do
      val = ETL.process
      val.should be_is_a(ETL)
      val.state.should eql(:complete)
    end

    it "should be able to run call as an alias to process" do
      val = ETL.call
      val.should be_is_a(ETL)
      val.state.should eql(:complete)
    end

    it "should have a logger" do
      ETL.logger.should be_is_a(Log4r::Logger)
      ETL.logger.name.should eql('ETL')
    end

    it "should have a console logger" do
      cl = ETL.logger.outputters.find {|l| l.is_a?(Log4r::StderrOutputter)}
      cl.name.should eql('console')
      cl.level.should eql(Log4r::WARN)
      cl.formatter.should be_is_a(Log4r::PatternFormatter)
      cl.formatter.pattern.should eql("[%l] %d :: %m")
    end

    it "should have a file logger" do
      fl = ETL.logger.outputters.find {|l| l.is_a?(Log4r::FileOutputter)}
      fl.name.should eql('logfile')
      # The dot is escaped so only a literal "ETL.log" suffix matches.
      fl.filename.should match(/ETL\.log$/)
      fl.formatter.pattern.should eql("[%l] %d :: %m")
    end

    it "should log a script to duplicate the ETL" do
      ETL.process(:funny => :options)
      r = Regexp.new(Regexp.escape("ETL.process(:funny => :options)"))
      logger_contents.should match(r)
    end

  end

  it "should have a beginning state of :before_extract" do
    ETL.new.state.should eql(:before_extract)
  end

  it "should have data and raw readers" do
    e = ETL.new
    e.should be_respond_to(:data)
    e.should be_respond_to(:raw)
  end

  context "Process" do
    it "should call each transition" do
      PostBoard.reset
      CheckTransitions.process
      PostBoard.board.should eql([:before_extract, :extract, :after_extract, :before_transform, :transform, :after_transform, :before_load, :load, :after_load])
    end

    it "should use raw as a data holding bucket, useful for using post-transactional validations" do
      PostBoard.reset
      ShowRaw.process
      PostBoard.board.should eql([nil, :extract, :extract, nil, :transform, :transform, nil, :load, :load])
    end

    it "should convert raw to data after each step" do
      PostBoard.reset
      ShowData.process
      PostBoard.board.should eql([nil, nil, nil, :extract, :extract, :extract, :transform, :transform, :transform])
    end

    it "should be able to reverse back to a prior state and restart" do
      PostBoard.reset
      counter = ShowCounter.new
      counter.process
      PostBoard.board.last.should eql(9)
      counter.reverse_to(:transform)
      counter.process
      PostBoard.board.last.should eql(14)
    end

    it "should move data in @raw to @data at every stage" do
      etl = ExplicitRawToDataShow.new
      etl.process
      etl.data.should eql(2)
    end

  end
end
|
112
|
+
|
113
|
+
# Class-level list that collects posted values so they can be inspected later.
# The list lives in a class-level instance variable rather than a @@class
# variable, so it is not shared across an inheritance tree.
class PostBoard
  class << self
    # Appends value to the board and returns the board.
    def post(value)
      board << value
    end

    # Returns the current board, creating an empty one on first use.
    def board
      @board ||= []
    end

    # Replaces the board with a fresh empty list and returns it.
    def reset
      @board = []
    end
  end
end
|
128
|
+
|
129
|
+
# Base fixture for the ETL tests. Registers :post_state on every before_/after_
# callback and calls post_state with the stage name from each stage method.
# Subclasses must implement post_state with an optional parameter.
class Demo < ETL
  [
    :before_extract,
    :after_extract,
    :before_transform,
    :after_transform,
    :before_load,
    :after_load
  ].each { |callback| send(callback, :post_state) }

  def extract
    post_state(:extract)
  end

  def transform
    post_state(:transform)
  end

  def load
    post_state(:load)
  end
end
|
151
|
+
|
152
|
+
# Doesn't do much but mark that the states were passed: posts the given value,
# or the current state when none is given.
class CheckTransitions < Demo
  def post_state(s=nil)
    PostBoard.post(s || state)
  end
end
|
159
|
+
|
160
|
+
# Marks the value of raw at every transition. Each stage stores its own name
# in @raw and posts what the raw reader returns.
class ShowRaw < Demo

  def extract
    @raw = :extract
    post_state(raw)
  end

  def transform
    @raw = :transform
    post_state(raw)
  end

  def load
    @raw = :load
    post_state(raw)
  end

  # Posts the given value, or raw when none is given.
  def post_state(s=nil)
    PostBoard.post(s || raw)
  end
end
|
183
|
+
|
184
|
+
# Marks the value of data at every transition. Each stage stores its own name
# in @raw but posts what the data reader returns.
class ShowData < Demo

  def extract
    @raw = :extract
    post_state(data)
  end

  def transform
    @raw = :transform
    post_state(data)
  end

  def load
    @raw = :load
    post_state(data)
  end

  # Posts the given value, or data when none is given.
  def post_state(s=nil)
    PostBoard.post(s || data)
  end
end
|
206
|
+
|
207
|
+
# Posts a running count: every post_state call bumps the counter and posts it.
class ShowCounter < Demo

  # Current count, starting at 0.
  def count
    @count ||= 0
  end

  # Increments the count by one and returns the new value.
  def advance_count
    @count = count + 1
  end

  def extract
    post_state
  end
  # transform and load behave exactly like extract.
  alias_method :transform, :extract
  alias_method :load, :extract

  def post_state
    advance_count
    PostBoard.post(count)
  end
end
|
228
|
+
|
229
|
+
# Fixture whose transform reads @data after extract only wrote @raw.
class ExplicitRawToDataShow < ETL
  # Stores 1 in @raw.
  def extract
    @raw = 1
  end

  # Stores @data plus one in @raw.
  def transform
    incremented = @data + 1
    @raw = incremented
  end
end
|