davidrichards-etl 0.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +261 -0
- data/VERSION.yml +4 -0
- data/bin/etl +27 -0
- data/lib/all.rb +4 -0
- data/lib/etl/active_record_loader.rb +50 -0
- data/lib/etl/bucket.rb +148 -0
- data/lib/etl/csv_et.rb +64 -0
- data/lib/etl/etl.rb +273 -0
- data/lib/etl/time_bucket.rb +104 -0
- data/lib/etl/xml_et.rb +6 -0
- data/lib/etl.rb +36 -0
- data/lib/helpers/array.rb +11 -0
- data/lib/helpers/observation.rb +10 -0
- data/lib/helpers/open_struct.rb +18 -0
- data/lib/helpers/string.rb +6 -0
- data/lib/helpers/symbol.rb +6 -0
- data/spec/etl/bucket_spec.rb +112 -0
- data/spec/etl/csv_et_spec.rb +43 -0
- data/spec/etl/etl_spec.rb +237 -0
- data/spec/etl/xml_et_spec.rb +50 -0
- data/spec/etl_spec.rb +16 -0
- data/spec/fixtures/test_file.csv +3 -0
- data/spec/helpers/array_spec.rb +13 -0
- data/spec/helpers/observation_spec.rb +22 -0
- data/spec/helpers/open_struct_spec.rb +25 -0
- data/spec/helpers/string_spec.rb +8 -0
- data/spec/helpers/symbol_spec.rb +7 -0
- data/spec/spec_helper.rb +15 -0
- metadata +106 -0
data/lib/etl.rb
ADDED
@@ -0,0 +1,36 @@
|
|
1
|
+
# TODO:
|
2
|
+
# Test this (a bit of a bugger, because I need to fail at every callback and make sure that I can recover.)
|
3
|
+
# Get the logging done and demonstrated, because an ETL process without good logging really is useless.
|
4
|
+
# Include a logging example for syslog-ng and syslog
|
5
|
+
# Work through some bucket thoughts that I was having this morning: how to take random percepts and create consolidated snapshots of an environment at a point in time. This is driven from the belief maintenance systems, but certainly needs to be worked out.
|
6
|
+
# Figure out if TeguGears really should be doing this. Come back to how I'll parallelize this process. Demonstrate running this in parallel.
|
7
|
+
|
8
|
+
require 'rubygems'
|
9
|
+
require 'activesupport'
|
10
|
+
require 'ostruct'
|
11
|
+
require 'log4r'
|
12
|
+
require 'fileutils'
|
13
|
+
|
14
|
+
# Tries to activate and require an optional gem, quietly giving up when it
# cannot be loaded. Gems loaded this way are a convenience, not a requirement.
#
# name - the gem name; the same string is handed to +require+.
#
# Returns whatever +require+ returns when the gem loads, or nil when the gem
# (or a file of that name inside it) is not available.
def load_gem_casually(name)
  gem name
  require name
rescue LoadError
  # Gem::LoadError (raised by +gem+) is a subclass of LoadError, so this one
  # clause covers both "gem not installed" and "gem present but +require+
  # cannot find a file with that name".
  nil
end
|
22
|
+
|
23
|
+
load_gem_casually('tegu_gears')
|
24
|
+
load_gem_casually('data_frame')
|
25
|
+
load_gem_casually('babel_icious')
|
26
|
+
|
27
|
+
Dir.glob("#{File.dirname(__FILE__)}/helpers/*.rb").each { |file| require file }
|
28
|
+
|
29
|
+
$:.unshift(File.dirname(__FILE__))
|
30
|
+
|
31
|
+
class ExtractError < StandardError; end
|
32
|
+
class TransformError < StandardError; end
|
33
|
+
# Note, LoadError is already used.
|
34
|
+
class LoadingError < StandardError; end
|
35
|
+
|
36
|
+
require 'etl/etl'
|
@@ -0,0 +1,11 @@
|
|
1
|
+
class Array # :nodoc:
  # Builds a new array by calling +to_underscore_sym+ on every element; the
  # receiver is left untouched.
  # From Carl Youngblood's excellent SBN gem: http://sbn.rubyforge.org/
  def symbolize_values
    map(&:to_underscore_sym)
  end

  # Same conversion as +symbolize_values+, but replaces the elements of the
  # receiver in place and returns the receiver.
  # From Carl Youngblood's excellent SBN gem: http://sbn.rubyforge.org/
  def symbolize_values!
    map!(&:to_underscore_sym)
  end
end
|
@@ -0,0 +1,10 @@
|
|
1
|
+
# Generic OpenStruct that also remembers when it was built: +occured_at+ is
# set to Time.now during initialization and can be read or reassigned later.
class Observation < OpenStruct
  # Need to know when the observation was recorded to batch observations
  attr_accessor :occured_at

  # Stamps the creation time, then passes every argument on to OpenStruct.
  def initialize(*args)
    @occured_at = Time.now
    super(*args)
  end
end
|
@@ -0,0 +1,112 @@
|
|
1
|
+
require File.join(File.dirname(__FILE__), "/../spec_helper")
|
2
|
+
require 'etl/bucket'
|
3
|
+
|
4
|
+
describe Bucket do
|
5
|
+
|
6
|
+
before(:all) do
|
7
|
+
class A
|
8
|
+
def initialize(*args)
|
9
|
+
@value = args
|
10
|
+
end
|
11
|
+
attr_reader :value
|
12
|
+
end
|
13
|
+
|
14
|
+
S = Struct.new(:this)
|
15
|
+
end
|
16
|
+
|
17
|
+
before do
|
18
|
+
@b = Bucket.new
|
19
|
+
@h = {:this => 1}
|
20
|
+
@o = OpenStruct.new(:this => 1)
|
21
|
+
@s = S.new(1)
|
22
|
+
@b1 = Bucket.new(@h)
|
23
|
+
end
|
24
|
+
|
25
|
+
it "should create a hash for storing raw, unordered data" do
|
26
|
+
@b.raw_data.should be_is_a(Hash)
|
27
|
+
end
|
28
|
+
|
29
|
+
it "should be able to add a record with a hash" do
|
30
|
+
@b.add(@h)
|
31
|
+
@b.filtered_data.should == @h
|
32
|
+
end
|
33
|
+
|
34
|
+
it "should be able to add a record with an OpenStruct" do
|
35
|
+
@b.add(@o)
|
36
|
+
@b.filtered_data.should == @h
|
37
|
+
end
|
38
|
+
|
39
|
+
it "should be able to add a record with a Struct" do
|
40
|
+
@b.add(@s)
|
41
|
+
@b.filtered_data.should == @h
|
42
|
+
end
|
43
|
+
|
44
|
+
it "should be able to override values" do
|
45
|
+
@b.add(@h)
|
46
|
+
@b.add(:this => 2)
|
47
|
+
@b.filtered_data.should == {:this => 2}
|
48
|
+
end
|
49
|
+
|
50
|
+
it "should create a way to setup labels" do
|
51
|
+
a = [:three, :two, :one]
|
52
|
+
@b.labels = a
|
53
|
+
@b.labels.all? {|l| a.should be_include(l)}
|
54
|
+
end
|
55
|
+
|
56
|
+
it "should be constructable with a hash" do
|
57
|
+
b = Bucket.new(@h)
|
58
|
+
b.filtered_data.should == @h
|
59
|
+
end
|
60
|
+
|
61
|
+
it "should be constructable with an OpenStruct" do
|
62
|
+
b = Bucket.new(@o)
|
63
|
+
b.filtered_data.should == @h
|
64
|
+
end
|
65
|
+
|
66
|
+
it "should be constructable with a Struct" do
|
67
|
+
b = Bucket.new(@s)
|
68
|
+
b.filtered_data.should == @h
|
69
|
+
end
|
70
|
+
|
71
|
+
it "should be able to dump the contents of the bucket" do
|
72
|
+
@b1.dump.should == @h
|
73
|
+
@b1.raw_data.should == {}
|
74
|
+
end
|
75
|
+
|
76
|
+
it "should be able to take an arbitrary filter" do
|
77
|
+
b = Bucket.new(@h) {|h| :not_the_data}
|
78
|
+
b.raw_data.should == @h
|
79
|
+
b.filtered_data.should eql(:not_the_data)
|
80
|
+
end
|
81
|
+
|
82
|
+
it "should be able to return an array" do
|
83
|
+
@b1.to_a.should eql([1])
|
84
|
+
end
|
85
|
+
|
86
|
+
it "should be able to return a hash" do
|
87
|
+
@b1.to_hash.should == @h
|
88
|
+
end
|
89
|
+
|
90
|
+
it "should be able to return any object that initializes with the bucket values" do
|
91
|
+
a = @b1.to_obj(A)
|
92
|
+
a.value.should eql(@b1.to_a)
|
93
|
+
end
|
94
|
+
|
95
|
+
it "should be able to return a Struct" do
|
96
|
+
s = @b1.to_struct(S)
|
97
|
+
s.this.should eql(1)
|
98
|
+
end
|
99
|
+
|
100
|
+
it "should be able to return an OpenStruct" do
|
101
|
+
o = @b1.to_open_struct
|
102
|
+
o.table.should == @h
|
103
|
+
end
|
104
|
+
|
105
|
+
it "should be able to constrain and order keys, silently ignoring data that isn't white listed" do
|
106
|
+
h = {:ones => 1, :twos => 2, :threes => 3}
|
107
|
+
@b.white_list = [:ones, :twos, :threes]
|
108
|
+
@b.add :ones => 1, :twos => 2, :threes => 3, :fours => 4
|
109
|
+
@b.filtered_data.should == h
|
110
|
+
@b.to_a.should eql([1,2,3])
|
111
|
+
end
|
112
|
+
end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
require File.join(File.dirname(__FILE__), "/../spec_helper")
|
2
|
+
require 'etl/csv_et'
|
3
|
+
|
4
|
+
describe CSV::ET do
|
5
|
+
|
6
|
+
before do
|
7
|
+
@csv_file = File.expand_path("#{File.dirname(__FILE__)}/../fixtures/test_file.csv")
|
8
|
+
end
|
9
|
+
|
10
|
+
it "should be able to transform a csv file into an array of arrays" do
|
11
|
+
@etl = CSV::ET.process(:source => @csv_file)
|
12
|
+
@etl.data.should be_is_a(Array)
|
13
|
+
@etl.data.size.should eql(3)
|
14
|
+
@etl.data.first.should eql(["some", "data", "here"])
|
15
|
+
@etl.data.last.should eql([4,5,6])
|
16
|
+
end
|
17
|
+
|
18
|
+
it "should be able to transforrm csv data into an array of arrays" do
|
19
|
+
content = File.read(@csv_file)
|
20
|
+
@etl = CSV::ET.process(:source => content)
|
21
|
+
@etl.data.should be_is_a(Array)
|
22
|
+
@etl.data.size.should eql(3)
|
23
|
+
@etl.data.first.should eql(["some", "data", "here"])
|
24
|
+
@etl.data.last.should eql([4,5,6])
|
25
|
+
end
|
26
|
+
|
27
|
+
it "should be able to pull the header out of the extracted data" do
|
28
|
+
@etl = CSV::ET.process(:source => @csv_file, :extract_header => true)
|
29
|
+
@etl.header.should eql(["some", "data", "here"])
|
30
|
+
@etl.data.first.should eql([1,2,3])
|
31
|
+
end
|
32
|
+
|
33
|
+
it "should be able to use the FasterCSV options" do
|
34
|
+
FasterCSV::Converters[:foo] = lambda{|f| :foo }
|
35
|
+
@etl = CSV::ET.process(
|
36
|
+
:source => @csv_file,
|
37
|
+
:extract_header => true,
|
38
|
+
:parse_with => {:converters => :foo}
|
39
|
+
)
|
40
|
+
@etl.header.should eql([:foo, :foo, :foo])
|
41
|
+
@etl.data.first.should eql([:foo, :foo, :foo])
|
42
|
+
end
|
43
|
+
end
|
@@ -0,0 +1,237 @@
|
|
1
|
+
require File.join(File.dirname(__FILE__), "/../spec_helper")
|
2
|
+
|
3
|
+
describe ETL do
|
4
|
+
|
5
|
+
after(:all) do
|
6
|
+
FileUtils.rm_f(ETL.logger_filename)
|
7
|
+
end
|
8
|
+
|
9
|
+
it "should have a series of valid states" do
|
10
|
+
ETL::VALID_STATES.should eql([
|
11
|
+
:before_extract,
|
12
|
+
:extract,
|
13
|
+
:after_extract,
|
14
|
+
:before_transform,
|
15
|
+
:transform,
|
16
|
+
:after_transform,
|
17
|
+
:before_load,
|
18
|
+
:load,
|
19
|
+
:after_load,
|
20
|
+
:complete
|
21
|
+
])
|
22
|
+
end
|
23
|
+
|
24
|
+
context "Class Methods" do
|
25
|
+
it "should be able to process the ETL class" do
|
26
|
+
val = ETL.process
|
27
|
+
val.should be_is_a(ETL)
|
28
|
+
val.state.should eql(:complete)
|
29
|
+
end
|
30
|
+
|
31
|
+
it "should be able to run call as an alias to process" do
|
32
|
+
val = ETL.call
|
33
|
+
val.should be_is_a(ETL)
|
34
|
+
val.state.should eql(:complete)
|
35
|
+
end
|
36
|
+
|
37
|
+
it "should have a logger" do
|
38
|
+
ETL.logger.should be_is_a(Log4r::Logger)
|
39
|
+
ETL.logger.name.should eql('ETL')
|
40
|
+
end
|
41
|
+
|
42
|
+
it "should have a console logger" do
|
43
|
+
cl = ETL.logger.outputters.find {|l| l.is_a?(Log4r::StderrOutputter)}
|
44
|
+
cl.name.should eql('console')
|
45
|
+
cl.level.should eql(Log4r::WARN)
|
46
|
+
cl.formatter.should be_is_a(Log4r::PatternFormatter)
|
47
|
+
cl.formatter.pattern.should eql("[%l] %d :: %m")
|
48
|
+
end
|
49
|
+
|
50
|
+
it "should have a file logger" do
|
51
|
+
fl = ETL.logger.outputters.find {|l| l.is_a?(Log4r::FileOutputter)}
|
52
|
+
fl.name.should eql('logfile')
|
53
|
+
fl.filename.should match(/ETL.log$/)
|
54
|
+
fl.formatter.pattern.should eql("[%l] %d :: %m")
|
55
|
+
end
|
56
|
+
|
57
|
+
it "should log a script to duplicate the ETL" do
|
58
|
+
ETL.process(:funny => :options)
|
59
|
+
r = Regexp.new(Regexp.escape("ETL.process(:funny => :options)"))
|
60
|
+
logger_contents.should match(r)
|
61
|
+
end
|
62
|
+
|
63
|
+
end
|
64
|
+
|
65
|
+
it "should have a beginning state of :before_extract" do
|
66
|
+
ETL.new.state.should eql(:before_extract)
|
67
|
+
end
|
68
|
+
|
69
|
+
it "should have data and raw readers" do
|
70
|
+
e = ETL.new
|
71
|
+
e.should be_respond_to(:data)
|
72
|
+
e.should be_respond_to(:raw)
|
73
|
+
end
|
74
|
+
|
75
|
+
context "Process" do
|
76
|
+
it "should call each transition" do
|
77
|
+
PostBoard.reset
|
78
|
+
CheckTransitions.process
|
79
|
+
PostBoard.board.should eql([:before_extract, :extract, :after_extract, :before_transform, :transform, :after_transform, :before_load, :load, :after_load])
|
80
|
+
end
|
81
|
+
|
82
|
+
it "should use raw as a data holding bucket, useful for using post-transactional validations" do
|
83
|
+
PostBoard.reset
|
84
|
+
ShowRaw.process
|
85
|
+
PostBoard.board.should eql([nil, :extract, :extract, nil, :transform, :transform, nil, :load, :load])
|
86
|
+
end
|
87
|
+
|
88
|
+
it "should convert raw to data after each step" do
|
89
|
+
PostBoard.reset
|
90
|
+
ShowData.process
|
91
|
+
PostBoard.board.should eql([nil, nil, nil, :extract, :extract, :extract, :transform, :transform, :transform])
|
92
|
+
end
|
93
|
+
|
94
|
+
it "should be able to reverse back to a prior state and restart" do
|
95
|
+
PostBoard.reset
|
96
|
+
counter = ShowCounter.new
|
97
|
+
counter.process
|
98
|
+
PostBoard.board.last.should eql(9)
|
99
|
+
counter.reverse_to(:transform)
|
100
|
+
counter.process
|
101
|
+
PostBoard.board.last.should eql(14)
|
102
|
+
end
|
103
|
+
|
104
|
+
it "should move data in @raw to @data at every stage" do
|
105
|
+
etl = ExplicitRawToDataShow.new
|
106
|
+
etl.process
|
107
|
+
etl.data.should eql(2)
|
108
|
+
end
|
109
|
+
|
110
|
+
end
|
111
|
+
end
|
112
|
+
|
113
|
+
# Spec helper: a shared scratch list that the demo ETL classes post values to,
# so examples can assert on what was posted and in which order.
class PostBoard
  class << self
    # Appends +value+ to the board and returns the board.
    def post(value)
      board << value
    end

    # The list of posted values, created empty on first use.
    # Stored in a class-level instance variable instead of a @@class variable,
    # so the state belongs to PostBoard alone and is not shared down the
    # inheritance tree.
    def board
      @board ||= []
    end

    # Replaces the board with a fresh empty list and returns it.
    def reset
      @board = []
    end
  end
end
|
128
|
+
|
129
|
+
# Setting up for various ETL tests. Subclasses must implement post_state with
# an optional parameter (Demo itself only calls it).
class Demo < ETL
  # Register post_state on the before/after callback of each stage, in the
  # order: extract, transform, load.
  [:extract, :transform, :load].each do |stage|
    send("before_#{stage}", :post_state)
    send("after_#{stage}", :post_state)
  end

  # Each stage reports its own name through post_state.
  def extract
    post_state :extract
  end

  def transform
    post_state :transform
  end

  def load
    post_state :load
  end
end
|
151
|
+
|
152
|
+
# Doesn't do much but mark that the states were passed: posts the given value
# to PostBoard, or the current state when none (or nil) is given.
class CheckTransitions < Demo
  def post_state(s=nil)
    PostBoard.post(s || state)
  end
end
|
159
|
+
|
160
|
+
# Marks the value of raw at every transition: each stage assigns its own name
# to @raw and posts what the raw reader returns.
class ShowRaw < Demo
  def extract
    @raw = :extract
    post_state raw
  end

  def transform
    @raw = :transform
    post_state raw
  end

  def load
    @raw = :load
    post_state raw
  end

  # Posts the given value, falling back to raw when none (or nil) is given.
  def post_state(s=nil)
    PostBoard.post(s || raw)
  end
end
|
183
|
+
|
184
|
+
# Posts the value of data at every transition: each stage assigns its own name
# to @raw and posts what the data reader returns.
class ShowData < Demo
  def extract
    @raw = :extract
    post_state data
  end

  def transform
    @raw = :transform
    post_state data
  end

  def load
    @raw = :load
    post_state data
  end

  # Posts the given value, falling back to data when none (or nil) is given.
  def post_state(s=nil)
    PostBoard.post(s || data)
  end
end
|
206
|
+
|
207
|
+
# Counts how many times post_state has run on this instance and posts the
# running total to PostBoard each time.
class ShowCounter < Demo
  # Current total, starting at 0 on first use.
  def count
    @count ||= 0
  end

  # Adds one to the total.
  def advance_count
    @count = count + 1
  end

  # Bumps the counter, then posts the new total.
  def post_state
    advance_count
    PostBoard.post(count)
  end

  # Every stage does nothing but post the counter.
  def extract
    post_state
  end
  alias_method :transform, :extract
  alias_method :load, :extract
end
|
228
|
+
|
229
|
+
# Fixture for the "move data in @raw to @data at every stage" example:
# extract seeds @raw, and transform derives the next @raw from @data.
class ExplicitRawToDataShow < ETL
  # Seeds the raw value.
  def extract
    @raw = 1
  end

  # Reads @data and stores that value plus one as the new raw value.
  def transform
    @raw = @data + 1
  end
end
|