davidrichards-etl 0.0.4

Sign up to get free protection for your applications and to get access to all the features.
data/lib/etl.rb ADDED
@@ -0,0 +1,36 @@
1
+ # TODO:
2
+ # Test this (a bit of a bugger, because I need to fail at every callback and make sure that I can recover.)
3
+ # Get the logging done and demonstrated, because an ETL process without good logging really is useless.
4
+ # Include a logging example for syslog-ng and syslog
5
+ # Work through some bucket thoughts that I was having this morning: how to take random percepts and create consolidated snapshots of an environment at a point in time. This is driven from the belief maintenance systems, but certainly needs to be worked out.
6
+ # Figure out if TeguGears really should be doing this. Come back to how I'll parallelize this process. Demonstrate running this in parallel.
7
+
8
+ require 'rubygems'
9
+ require 'activesupport'
10
+ require 'ostruct'
11
+ require 'log4r'
12
+ require 'fileutils'
13
+
14
# Activates and requires an optional gem, quietly skipping it when it cannot
# be loaded. These gems are a convenience, not a requirement.
#
# name - the gem name; the same string is passed to +require+.
#
# Returns whatever +require+ returns when the gem loads, or nil when the gem
# could not be activated or required.
def load_gem_casually(name)
  gem name
  require name
rescue LoadError
  # Gem::LoadError (gem not installed) is a subclass of LoadError, so this one
  # clause also covers a gem that activates but ships no file called +name+.
  nil
end
22
+
23
# Optional integrations: each one is loaded only when it is available.
%w[tegu_gears data_frame babel_icious].each do |optional_gem|
  load_gem_casually(optional_gem)
end

# Require every Ruby file found in the helpers directory next to this file.
Dir["#{File.dirname(__FILE__)}/helpers/*.rb"].each { |helper| require helper }

# Put this file's directory at the front of the load path.
$LOAD_PATH.unshift(File.dirname(__FILE__))
30
+
31
# Error classes for the ETL process. Each is a plain StandardError subclass
# with no behavior of its own.
class ExtractError < StandardError
end

class TransformError < StandardError
end

# Named LoadingError because Ruby already defines LoadError.
class LoadingError < StandardError
end
35
+
36
+ require 'etl/etl'
@@ -0,0 +1,11 @@
1
class Array # :nodoc:
  # Returns a new array built by calling +to_underscore_sym+ on every element.
  # The receiver is left unchanged.
  # From Carl Youngblood's excellent SBN gem: http://sbn.rubyforge.org/
  def symbolize_values
    map(&:to_underscore_sym)
  end

  # In-place variant: replaces every element with its +to_underscore_sym+
  # result and returns the receiver.
  # From Carl Youngblood's excellent SBN gem: http://sbn.rubyforge.org/
  def symbolize_values!
    map!(&:to_underscore_sym)
  end
end
@@ -0,0 +1,10 @@
1
# Generic OpenStruct whose occured_at is set to the time of initialization.
class Observation < OpenStruct
  # Need to know when the observation was recorded to batch observations.
  attr_accessor :occured_at

  # Stamps the instance with Time.now, then passes every argument on to
  # OpenStruct#initialize.
  def initialize(*args)
    @occured_at = Time.now
    super(*args)
  end
end
@@ -0,0 +1,18 @@
1
# Hash-style conveniences for OpenStruct, all built on its @table hash.
class OpenStruct
  # Public reader for the @table hash.
  attr_reader :table

  # The keys of the table.
  def keys
    table.keys
  end

  # The values of the table.
  def values
    table.values
  end

  # True when +key+ is one of the table's keys.
  def include?(key)
    table.key?(key)
  end
end
@@ -0,0 +1,6 @@
1
class String # :nodoc:
  # Turns the string into an underscored symbol: titleize it, remove every
  # run of whitespace, underscore the result, and convert that to a symbol.
  # From Carl Youngblood's excellent SBN gem: http://sbn.rubyforge.org/
  def to_underscore_sym
    squeezed_title = titleize.gsub(/\s+/, '')
    squeezed_title.underscore.to_sym
  end
end
@@ -0,0 +1,6 @@
1
class Symbol # :nodoc:
  # Turns the symbol into an underscored symbol: take its string form,
  # titleize it, remove every run of whitespace, underscore the result, and
  # convert that back to a symbol.
  # From Carl Youngblood's excellent SBN gem: http://sbn.rubyforge.org/
  def to_underscore_sym
    title = to_s.titleize
    title.gsub(/\s+/, '').underscore.to_sym
  end
end
@@ -0,0 +1,112 @@
1
+ require File.join(File.dirname(__FILE__), "/../spec_helper")
2
+ require 'etl/bucket'
3
+
4
# Specs for Bucket: filling it from hashes, OpenStructs and Structs,
# filtering what it holds, and exporting the contents in several shapes.
describe Bucket do

  # Throwaway types used by the conversion examples below.
  before(:all) do
    class A
      attr_reader :value

      def initialize(*args)
        @value = args
      end
    end

    S = Struct.new(:this)
  end

  # A fresh empty bucket (@b), a pre-filled bucket (@b1), and the same
  # single-key record expressed as a Hash, an OpenStruct and a Struct.
  before do
    @b = Bucket.new
    @h = {:this => 1}
    @o = OpenStruct.new(:this => 1)
    @s = S.new(1)
    @b1 = Bucket.new(@h)
  end

  it "should create a hash for storing raw, unordered data" do
    @b.raw_data.should be_is_a(Hash)
  end

  it "should be able to add a record with a hash" do
    @b.add(@h)
    @b.filtered_data.should == @h
  end

  it "should be able to add a record with an OpenStruct" do
    @b.add(@o)
    @b.filtered_data.should == @h
  end

  it "should be able to add a record with a Struct" do
    @b.add(@s)
    @b.filtered_data.should == @h
  end

  it "should be able to override values" do
    @b.add(@h)
    @b.add(:this => 2)
    @b.filtered_data.should == {:this => 2}
  end

  it "should create a way to setup labels" do
    expected = [:three, :two, :one]
    @b.labels = expected
    @b.labels.each { |label| expected.should be_include(label) }
  end

  it "should be constructable with a hash" do
    bucket = Bucket.new(@h)
    bucket.filtered_data.should == @h
  end

  it "should be constructable with an OpenStruct" do
    bucket = Bucket.new(@o)
    bucket.filtered_data.should == @h
  end

  it "should be constructable with a Struct" do
    bucket = Bucket.new(@s)
    bucket.filtered_data.should == @h
  end

  it "should be able to dump the contents of the bucket" do
    @b1.dump.should == @h
    @b1.raw_data.should == {}
  end

  it "should be able to take an arbitrary filter" do
    bucket = Bucket.new(@h) { |_data| :not_the_data }
    bucket.raw_data.should == @h
    bucket.filtered_data.should eql(:not_the_data)
  end

  it "should be able to return an array" do
    @b1.to_a.should eql([1])
  end

  it "should be able to return a hash" do
    @b1.to_hash.should == @h
  end

  it "should be able to return any object that initializes with the bucket values" do
    built = @b1.to_obj(A)
    built.value.should eql(@b1.to_a)
  end

  it "should be able to return a Struct" do
    struct = @b1.to_struct(S)
    struct.this.should eql(1)
  end

  it "should be able to return an OpenStruct" do
    open_struct = @b1.to_open_struct
    open_struct.table.should == @h
  end

  it "should be able to constrain and order keys, silently ignoring data that isn't white listed" do
    allowed = {:ones => 1, :twos => 2, :threes => 3}
    @b.white_list = [:ones, :twos, :threes]
    @b.add(:ones => 1, :twos => 2, :threes => 3, :fours => 4)
    @b.filtered_data.should == allowed
    @b.to_a.should eql([1, 2, 3])
  end
end
@@ -0,0 +1,43 @@
1
+ require File.join(File.dirname(__FILE__), "/../spec_helper")
2
+ require 'etl/csv_et'
3
+
4
# Specs for CSV::ET: reading rows from a CSV file path or from raw CSV text,
# optional header extraction, and passing parse options through to FasterCSV.
describe CSV::ET do

  before do
    @csv_file = File.expand_path("#{File.dirname(__FILE__)}/../fixtures/test_file.csv")
  end

  it "should be able to transform a csv file into an array of arrays" do
    @etl = CSV::ET.process(:source => @csv_file)
    @etl.data.should be_is_a(Array)
    @etl.data.size.should eql(3)
    @etl.data.first.should eql(["some", "data", "here"])
    @etl.data.last.should eql([4,5,6])
  end

  it "should be able to transform csv data into an array of arrays" do
    content = File.read(@csv_file)
    @etl = CSV::ET.process(:source => content)
    @etl.data.should be_is_a(Array)
    @etl.data.size.should eql(3)
    @etl.data.first.should eql(["some", "data", "here"])
    @etl.data.last.should eql([4,5,6])
  end

  it "should be able to pull the header out of the extracted data" do
    @etl = CSV::ET.process(:source => @csv_file, :extract_header => true)
    @etl.header.should eql(["some", "data", "here"])
    @etl.data.first.should eql([1,2,3])
  end

  it "should be able to use the FasterCSV options" do
    FasterCSV::Converters[:foo] = lambda{|f| :foo }
    begin
      @etl = CSV::ET.process(
        :source => @csv_file,
        :extract_header => true,
        :parse_with => {:converters => :foo}
      )
      @etl.header.should eql([:foo, :foo, :foo])
      @etl.data.first.should eql([:foo, :foo, :foo])
    ensure
      # Converters is a constant shared by the whole process; remove the
      # throwaway converter so it cannot leak into other examples.
      FasterCSV::Converters.delete(:foo)
    end
  end
end
@@ -0,0 +1,237 @@
1
+ require File.join(File.dirname(__FILE__), "/../spec_helper")
2
+
3
# Specs for the ETL base class: its list of states, the class-level entry
# points and loggers, and how a run moves through the callbacks.
describe ETL do

  # Remove the log file once the whole group has run.
  after(:all) do
    FileUtils.rm_f(ETL.logger_filename)
  end

  it "should have a series of valid states" do
    expected_states = [
      :before_extract,
      :extract,
      :after_extract,
      :before_transform,
      :transform,
      :after_transform,
      :before_load,
      :load,
      :after_load,
      :complete
    ]
    ETL::VALID_STATES.should eql(expected_states)
  end

  context "Class Methods" do
    it "should be able to process the ETL class" do
      etl = ETL.process
      etl.should be_is_a(ETL)
      etl.state.should eql(:complete)
    end

    it "should be able to run call as an alias to process" do
      etl = ETL.call
      etl.should be_is_a(ETL)
      etl.state.should eql(:complete)
    end

    it "should have a logger" do
      ETL.logger.should be_is_a(Log4r::Logger)
      ETL.logger.name.should eql('ETL')
    end

    it "should have a console logger" do
      console = ETL.logger.outputters.find { |outputter| outputter.is_a?(Log4r::StderrOutputter) }
      console.name.should eql('console')
      console.level.should eql(Log4r::WARN)
      console.formatter.should be_is_a(Log4r::PatternFormatter)
      console.formatter.pattern.should eql("[%l] %d :: %m")
    end

    it "should have a file logger" do
      logfile = ETL.logger.outputters.find { |outputter| outputter.is_a?(Log4r::FileOutputter) }
      logfile.name.should eql('logfile')
      logfile.filename.should match(/ETL.log$/)
      logfile.formatter.pattern.should eql("[%l] %d :: %m")
    end

    it "should log a script to duplicate the ETL" do
      ETL.process(:funny => :options)
      script_pattern = Regexp.new(Regexp.escape("ETL.process(:funny => :options)"))
      logger_contents.should match(script_pattern)
    end

  end

  it "should have a beginning state of :before_extract" do
    ETL.new.state.should eql(:before_extract)
  end

  it "should have data and raw readers" do
    etl = ETL.new
    etl.should be_respond_to(:data)
    etl.should be_respond_to(:raw)
  end

  context "Process" do
    it "should call each transition" do
      PostBoard.reset
      CheckTransitions.process
      PostBoard.board.should eql([
        :before_extract, :extract, :after_extract,
        :before_transform, :transform, :after_transform,
        :before_load, :load, :after_load
      ])
    end

    it "should use raw as a data holding bucket, useful for using post-transactional validations" do
      PostBoard.reset
      ShowRaw.process
      PostBoard.board.should eql([
        nil, :extract, :extract,
        nil, :transform, :transform,
        nil, :load, :load
      ])
    end

    it "should convert raw to data after each step" do
      PostBoard.reset
      ShowData.process
      PostBoard.board.should eql([
        nil, nil, nil,
        :extract, :extract, :extract,
        :transform, :transform, :transform
      ])
    end

    it "should be able to reverse back to a prior state and restart" do
      PostBoard.reset
      counter = ShowCounter.new
      counter.process
      PostBoard.board.last.should eql(9)
      counter.reverse_to(:transform)
      counter.process
      PostBoard.board.last.should eql(14)
    end

    it "should move data in @raw to @data at every stage" do
      etl = ExplicitRawToDataShow.new
      etl.process
      etl.data.should eql(2)
    end

  end
end
112
+
113
# A class-level scratch list. The demo ETL classes post values to it so the
# specs can inspect what was recorded and in which order.
#
# The list lives in a class-level instance variable rather than a class
# variable, so it belongs to PostBoard alone and is not shared with
# subclasses.
class PostBoard
  class << self
    # Appends +value+ to the board and returns the board.
    def post(value)
      board << value
    end

    # The current list of posted values, created empty on first use.
    def board
      @board ||= []
    end

    # Replaces the board with a new empty array and returns it.
    def reset
      @board = []
    end
  end
end
128
+
129
# Setting up for various ETL tests. Subclasses must implement post_state
# with an optional parameter.
class Demo < ETL
  # Run post_state before and after each of the three stages.
  before_extract(:post_state)
  after_extract(:post_state)
  before_transform(:post_state)
  after_transform(:post_state)
  before_load(:post_state)
  after_load(:post_state)

  # Each stage posts its own name.
  def extract
    post_state(:extract)
  end

  def transform
    post_state(:transform)
  end

  def load
    post_state(:load)
  end

end
151
+
152
# Doesn't do much but mark that the states were passed.
class CheckTransitions < Demo
  # Posts +s+ when given, otherwise the current state.
  def post_state(s=nil)
    PostBoard.post(s || state)
  end
end
159
+
160
# Marks the value of raw at every transition.
class ShowRaw < Demo

  # Each stage stores its own name in @raw, then posts what raw returns.
  def extract
    @raw = :extract
    post_state(raw)
  end

  def transform
    @raw = :transform
    post_state(raw)
  end

  def load
    @raw = :load
    post_state(raw)
  end

  # Posts +s+ when given, otherwise the current value of raw.
  def post_state(s=nil)
    PostBoard.post(s || raw)
  end
end
183
+
184
# Marks the value of data at every transition.
class ShowData < Demo

  # Each stage stores its own name in @raw, then posts what data returns.
  def extract
    @raw = :extract
    post_state(data)
  end

  def transform
    @raw = :transform
    post_state(data)
  end

  def load
    @raw = :load
    post_state(data)
  end

  # Posts +s+ when given, otherwise the current value of data.
  def post_state(s=nil)
    PostBoard.post(s || data)
  end
end
206
+
207
# Posts a running count: every post_state call bumps the counter by one and
# posts the new value.
class ShowCounter < Demo

  # The current count, starting at zero.
  def count
    @count ||= 0
  end

  # Increments the count by one.
  def advance_count
    @count = count + 1
  end

  # Bumps the counter, then posts the new total.
  def post_state
    advance_count
    PostBoard.post(count)
  end

  # All three stages do the same thing: post the next count.
  def extract
    post_state
  end
  alias_method :transform, :extract
  alias_method :load, :extract
end
228
+
229
# Fixture whose stages write only to @raw; transform reads @data to build
# its result.
class ExplicitRawToDataShow < ETL
  # Seeds @raw with 1.
  def extract
    @raw = 1
  end

  # Stores @data plus one in @raw.
  def transform
    incremented = @data + 1
    @raw = incremented
  end
end