davidrichards-etl 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/etl.rb ADDED
@@ -0,0 +1,36 @@
1
+ # TODO:
2
+ # Test this (a bit of a bugger, because I need to fail at every callback and make sure that I can recover.)
3
+ # Get the logging done and demonstrated, because an ETL process without good logging really is useless.
4
+ # Include a logging example for syslog-ng and syslog
5
+ # Work through some bucket thoughts that I was having this morning: how to take random percepts and create consolidated snapshots of an environment at a point in time. This is driven from the belief maintenance systems, but certainly needs to be worked out.
6
+ # Figure out if TeguGears really should be doing this. Come back to how I'll parallelize this process. Demonstrate running this in parallel.
7
+
8
+ require 'rubygems'
9
+ require 'activesupport'
10
+ require 'ostruct'
11
+ require 'log4r'
12
+ require 'fileutils'
13
+
14
# Tries to activate and require an optional gem, carrying on quietly when it
# is not available. These gems are a convenience, not a requirement.
#
# name - the gem name; the same string is passed to require.
#
# Returns whatever require returns when the library loads, or nil when the
# gem or its library file cannot be found.
def load_gem_casually(name)
  gem name
  require name
rescue LoadError
  # Gem::LoadError (gem not installed) is a subclass of LoadError, so this
  # clause also covers a gem that activates but whose file cannot be required.
  nil
end
22
+
23
+ load_gem_casually('tegu_gears')
24
+ load_gem_casually('data_frame')
25
+ load_gem_casually('babel_icious')
26
+
27
+ Dir.glob("#{File.dirname(__FILE__)}/helpers/*.rb").each { |file| require file }
28
+
29
+ $:.unshift(File.dirname(__FILE__))
30
+
31
# Error type named for the extract stage (no raise site is visible in this
# file; presumably raised when extraction fails).
class ExtractError < StandardError
end

# Error type named for the transform stage (raise site not visible here).
class TransformError < StandardError
end

# Error type named for the load stage. Note, LoadError is already used, hence
# the name LoadingError.
class LoadingError < StandardError
end
35
+
36
+ require 'etl/etl'
@@ -0,0 +1,11 @@
1
class Array # :nodoc:
  # Builds a new array by calling to_underscore_sym on every element; the
  # receiver is left unchanged.
  # From Carl Youngblood's excellent SBN gem: http://sbn.rubyforge.org/
  def symbolize_values
    map(&:to_underscore_sym)
  end

  # In-place counterpart of symbolize_values: replaces every element with the
  # result of its to_underscore_sym.
  # From Carl Youngblood's excellent SBN gem: http://sbn.rubyforge.org/
  def symbolize_values!
    map!(&:to_underscore_sym)
  end
end
@@ -0,0 +1,10 @@
1
# A generic OpenStruct that also records when it was created: occured_at is
# set to the current time during initialization.
class Observation < OpenStruct
  # When the observation was recorded; needed to batch observations.
  attr_accessor :occured_at

  def initialize(*args)
    # Stamp the creation time, then hand the arguments on to OpenStruct.
    @occured_at = Time.now
    super(*args)
  end
end
+ end
@@ -0,0 +1,18 @@
1
# Hash-style conveniences for OpenStruct, built on its internal attribute table.
class OpenStruct

  # Returns the Hash held in @table (the live object, not a copy).
  def table
    @table
  end

  # Returns the keys of the attribute table.
  def keys
    table.keys
  end

  # Returns the values of the attribute table.
  def values
    table.values
  end

  # Returns true when key names an entry in the attribute table.
  #
  # OpenStruct stores attribute names as symbols, so a String key is converted
  # before the lookup; any other object is looked up as given.
  def include?(key)
    key = key.to_sym if key.is_a?(String)
    table.key?(key)
  end
end
+ end
@@ -0,0 +1,6 @@
1
class String # :nodoc:
  # Converts the string to a symbol: titleize it, strip all whitespace, then
  # underscore the result. Relies on String#titleize and String#underscore,
  # which are not core Ruby (presumably supplied by ActiveSupport).
  # From Carl Youngblood's excellent SBN gem: http://sbn.rubyforge.org/
  def to_underscore_sym
    squashed = titleize.gsub(/\s+/, '')
    squashed.underscore.to_sym
  end
end
+ end
@@ -0,0 +1,6 @@
1
class Symbol # :nodoc:
  # Converts the symbol's name the same way String#to_underscore_sym does:
  # titleize, strip all whitespace, underscore, and turn back into a symbol.
  # Relies on String#titleize and String#underscore, which are not core Ruby
  # (presumably supplied by ActiveSupport).
  # From Carl Youngblood's excellent SBN gem: http://sbn.rubyforge.org/
  def to_underscore_sym
    squashed = to_s.titleize.gsub(/\s+/, '')
    squashed.underscore.to_sym
  end
end
+ end
@@ -0,0 +1,112 @@
1
+ require File.join(File.dirname(__FILE__), "/../spec_helper")
2
+ require 'etl/bucket'
3
+
4
# Specs for Bucket: adding records from a Hash, OpenStruct or Struct, filtering,
# white listing, dumping, and converting the contents to other shapes.
describe Bucket do

  before(:all) do
    # Minimal class that just remembers the arguments it was constructed with.
    class A
      def initialize(*args)
        @value = args
      end
      attr_reader :value
    end

    S = Struct.new(:this)
  end

  before do
    # The same single-entry record in each supported input shape.
    @b = Bucket.new
    @h = {:this => 1}
    @o = OpenStruct.new(:this => 1)
    @s = S.new(1)
    @b1 = Bucket.new(@h)
  end

  it "should create a hash for storing raw, unordered data" do
    @b.raw_data.should be_is_a(Hash)
  end

  it "should be able to add a record with a hash" do
    @b.add(@h)
    @b.filtered_data.should == @h
  end

  it "should be able to add a record with an OpenStruct" do
    @b.add(@o)
    @b.filtered_data.should == @h
  end

  it "should be able to add a record with a Struct" do
    @b.add(@s)
    @b.filtered_data.should == @h
  end

  it "should be able to override values" do
    @b.add(@h)
    @b.add(:this => 2)
    @b.filtered_data.should == {:this => 2}
  end

  it "should create a way to setup labels" do
    a = [:three, :two, :one]
    @b.labels = a
    # each rather than all?: the expectation is run for its side effect, and
    # all? would stop at the first label if should ever returned a falsy value.
    # NOTE(review): this still passes vacuously when labels is empty.
    @b.labels.each {|l| a.should be_include(l)}
  end

  it "should be constructable with a hash" do
    b = Bucket.new(@h)
    b.filtered_data.should == @h
  end

  it "should be constructable with an OpenStruct" do
    b = Bucket.new(@o)
    b.filtered_data.should == @h
  end

  it "should be constructable with a Struct" do
    b = Bucket.new(@s)
    b.filtered_data.should == @h
  end

  it "should be able to dump the contents of the bucket" do
    # dump hands back the contents and leaves raw_data empty.
    @b1.dump.should == @h
    @b1.raw_data.should == {}
  end

  it "should be able to take an arbitrary filter" do
    # The block's return value becomes filtered_data; raw_data is untouched.
    b = Bucket.new(@h) {|h| :not_the_data}
    b.raw_data.should == @h
    b.filtered_data.should eql(:not_the_data)
  end

  it "should be able to return an array" do
    @b1.to_a.should eql([1])
  end

  it "should be able to return a hash" do
    @b1.to_hash.should == @h
  end

  it "should be able to return any object that initializes with the bucket values" do
    a = @b1.to_obj(A)
    a.value.should eql(@b1.to_a)
  end

  it "should be able to return a Struct" do
    s = @b1.to_struct(S)
    s.this.should eql(1)
  end

  it "should be able to return an OpenStruct" do
    o = @b1.to_open_struct
    o.table.should == @h
  end

  it "should be able to constrain and order keys, silently ignoring data that isn't white listed" do
    h = {:ones => 1, :twos => 2, :threes => 3}
    @b.white_list = [:ones, :twos, :threes]
    @b.add :ones => 1, :twos => 2, :threes => 3, :fours => 4
    @b.filtered_data.should == h
    @b.to_a.should eql([1,2,3])
  end
end
@@ -0,0 +1,43 @@
1
+ require File.join(File.dirname(__FILE__), "/../spec_helper")
2
+ require 'etl/csv_et'
3
+
4
# Specs for CSV::ET: extracting CSV (from a file path or from raw content) into
# an array of arrays, optionally pulling out the header row.
describe CSV::ET do

  before do
    @csv_file = File.expand_path("#{File.dirname(__FILE__)}/../fixtures/test_file.csv")
  end

  it "should be able to transform a csv file into an array of arrays" do
    @etl = CSV::ET.process(:source => @csv_file)
    @etl.data.should be_is_a(Array)
    @etl.data.size.should eql(3)
    @etl.data.first.should eql(["some", "data", "here"])
    @etl.data.last.should eql([4,5,6])
  end

  it "should be able to transform csv data into an array of arrays" do
    # Same expectations as above, but the source is the file's content
    # rather than its path.
    content = File.read(@csv_file)
    @etl = CSV::ET.process(:source => content)
    @etl.data.should be_is_a(Array)
    @etl.data.size.should eql(3)
    @etl.data.first.should eql(["some", "data", "here"])
    @etl.data.last.should eql([4,5,6])
  end

  it "should be able to pull the header out of the extracted data" do
    @etl = CSV::ET.process(:source => @csv_file, :extract_header => true)
    @etl.header.should eql(["some", "data", "here"])
    @etl.data.first.should eql([1,2,3])
  end

  it "should be able to use the FasterCSV options" do
    # Register a throwaway converter that maps every field to :foo.
    FasterCSV::Converters[:foo] = lambda{|f| :foo }
    begin
      @etl = CSV::ET.process(
        :source => @csv_file,
        :extract_header => true,
        :parse_with => {:converters => :foo}
      )
      @etl.header.should eql([:foo, :foo, :foo])
      @etl.data.first.should eql([:foo, :foo, :foo])
    ensure
      # Converters is global state; remove the entry so it cannot leak into
      # other examples.
      FasterCSV::Converters.delete(:foo)
    end
  end
end
+ end
@@ -0,0 +1,237 @@
1
+ require File.join(File.dirname(__FILE__), "/../spec_helper")
2
+
3
# Specs for the ETL base class: its state list, class-level entry points,
# logger configuration, and how process moves through the callbacks.
describe ETL do

  # Delete the file at ETL.logger_filename once the whole group has run.
  after(:all) do
    FileUtils.rm_f(ETL.logger_filename)
  end

  it "should have a series of valid states" do
    expected_states = [
      :before_extract, :extract, :after_extract,
      :before_transform, :transform, :after_transform,
      :before_load, :load, :after_load,
      :complete
    ]
    ETL::VALID_STATES.should eql(expected_states)
  end

  context "Class Methods" do
    it "should be able to process the ETL class" do
      processed = ETL.process
      processed.should be_is_a(ETL)
      processed.state.should eql(:complete)
    end

    it "should be able to run call as an alias to process" do
      processed = ETL.call
      processed.should be_is_a(ETL)
      processed.state.should eql(:complete)
    end

    it "should have a logger" do
      logger = ETL.logger
      logger.should be_is_a(Log4r::Logger)
      logger.name.should eql('ETL')
    end

    it "should have a console logger" do
      console = ETL.logger.outputters.detect { |outputter| outputter.is_a?(Log4r::StderrOutputter) }
      console.name.should eql('console')
      console.level.should eql(Log4r::WARN)
      console.formatter.should be_is_a(Log4r::PatternFormatter)
      console.formatter.pattern.should eql("[%l] %d :: %m")
    end

    it "should have a file logger" do
      logfile = ETL.logger.outputters.detect { |outputter| outputter.is_a?(Log4r::FileOutputter) }
      logfile.name.should eql('logfile')
      logfile.filename.should match(/ETL.log$/)
      logfile.formatter.pattern.should eql("[%l] %d :: %m")
    end

    it "should log a script to duplicate the ETL" do
      ETL.process(:funny => :options)
      # Match the call text literally; escape it so the parentheses and dots
      # are not treated as regexp syntax.
      script = Regexp.new(Regexp.escape("ETL.process(:funny => :options)"))
      logger_contents.should match(script)
    end

  end

  it "should have a beginning state of :before_extract" do
    ETL.new.state.should eql(:before_extract)
  end

  it "should have data and raw readers" do
    etl = ETL.new
    etl.should be_respond_to(:data)
    etl.should be_respond_to(:raw)
  end

  context "Process" do
    it "should call each transition" do
      PostBoard.reset
      CheckTransitions.process
      visited = [
        :before_extract, :extract, :after_extract,
        :before_transform, :transform, :after_transform,
        :before_load, :load, :after_load
      ]
      PostBoard.board.should eql(visited)
    end

    it "should use raw as a data holding bucket, useful for using post-transactional validations" do
      PostBoard.reset
      ShowRaw.process
      raw_seen = [nil, :extract, :extract, nil, :transform, :transform, nil, :load, :load]
      PostBoard.board.should eql(raw_seen)
    end

    it "should convert raw to data after each step" do
      PostBoard.reset
      ShowData.process
      data_seen = [nil, nil, nil, :extract, :extract, :extract, :transform, :transform, :transform]
      PostBoard.board.should eql(data_seen)
    end

    it "should be able to reverse back to a prior state and restart" do
      PostBoard.reset
      counter = ShowCounter.new
      # A full run posts nine times.
      counter.process
      PostBoard.board.last.should eql(9)
      # Rewinding to :transform and running again posts five more times.
      counter.reverse_to(:transform)
      counter.process
      PostBoard.board.last.should eql(14)
    end

    it "should move data in @raw to @data at every stage" do
      etl = ExplicitRawToDataShow.new
      etl.process
      etl.data.should eql(2)
    end

  end
end
112
+
113
# A shared scratch list that the demo ETL classes post values to, so the specs
# can inspect the order in which things happened.
class PostBoard
  class << self
    # Appends value to the board and returns the board.
    def post(value)
      board << value
    end

    # Returns the list of posted values, creating it on first use.
    # Held in a class-level instance variable rather than a class variable,
    # so it belongs to PostBoard alone.
    def board
      @board ||= []
    end

    # Replaces the board with a fresh empty list.
    def reset
      @board = []
    end
  end
end
128
+
129
# Setting up for various ETL tests. Subclasses must implement post_state with
# an optional parameter.
class Demo < ETL
  # Hand :post_state to each before/after hook macro, in pipeline order.
  # The macros themselves come from ETL, which is not visible here.
  [
    :before_extract, :after_extract,
    :before_transform, :after_transform,
    :before_load, :after_load
  ].each do |hook|
    send(hook, :post_state)
  end

  # Each stage reports its own name through post_state.
  def extract
    post_state(:extract)
  end

  def transform
    post_state(:transform)
  end

  def load
    post_state(:load)
  end

end
151
+
152
# Doesn't do much but mark that the states were passed: posts the given value,
# or the current state when none is given.
class CheckTransitions < Demo
  def post_state(s=nil)
    PostBoard.post(s || state)
  end
end
159
+
160
# Marks the value of raw at every transition.
class ShowRaw < Demo

  # Each stage stores its own name in @raw and posts what the raw reader
  # returns.
  def extract
    @raw = :extract
    post_state(raw)
  end

  def transform
    @raw = :transform
    post_state(raw)
  end

  def load
    @raw = :load
    post_state(raw)
  end

  # Posts the given value, falling back to raw when it is nil or false.
  def post_state(s=nil)
    PostBoard.post(s || raw)
  end
end
183
+
184
# Marks the value of data at every transition, while each stage writes its own
# name into @raw.
class ShowData < Demo

  def extract
    @raw = :extract
    post_state(data)
  end

  def transform
    @raw = :transform
    post_state(data)
  end

  def load
    @raw = :load
    post_state(data)
  end

  # Posts the given value, falling back to data when it is nil or false.
  def post_state(s=nil)
    PostBoard.post(s || data)
  end
end
206
+
207
# Posts a running counter: every post_state call bumps the count by one and
# posts the new value.
class ShowCounter < Demo

  # Current count, starting from zero.
  def count
    @count ||= 0
  end

  # Increments the count and returns the new value.
  def advance_count
    @count = count + 1
  end

  def post_state
    advance_count
    PostBoard.post(count)
  end

  # All three stages do the same thing: post the next count.
  def extract
    post_state
  end
  alias_method :transform, :extract
  alias_method :load, :extract
end
228
+
229
# Demo whose stages communicate only through @raw and @data: extract writes
# @raw, and transform reads @data to compute the next @raw.
class ExplicitRawToDataShow < ETL
  # Seed the pipeline with 1.
  def extract
    @raw = 1
  end

  # Reads @data and stores one more than it in @raw. This only works if @data
  # has been given a value by the time transform runs.
  def transform
    @raw = @data + 1
  end
end