ETL 0.0.1 → 1.0.0.rc

data/Rakefile CHANGED
@@ -1,2 +1,28 @@
  #!/usr/bin/env rake
  require "bundler/gem_tasks"
+ begin
+   require 'rspec/core/rake_task'
+
+   RSpec::Core::RakeTask.new(:spec) do |t|
+     t.rspec_opts = '-b'
+   end
+
+   task default: :spec
+ rescue LoadError
+   $stderr.puts "rspec not available, spec task not provided"
+ end
+
+ begin
+   require 'cane/rake_task'
+
+   desc "Run cane to check quality metrics"
+   Cane::RakeTask.new(:quality) do |cane|
+     cane.abc_max = 10
+     cane.style_glob = "lib/**/*.rb"
+     cane.no_doc = true
+   end
+
+   task :default => :quality
+ rescue LoadError
+   warn "cane not available, quality task not provided."
+ end
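
Not part of the diff: both guarded blocks above add to the same :default task, so a bare `rake` runs :spec and then :quality when both gems are installed, and each block degrades to a warning if its gem is missing. A hypothetical sketch (in a Rakefile) of how Rake accumulates prerequisites rather than redefining the task:

  # Hypothetical task bodies, for illustration only.
  task(:spec)    { puts "running specs" }
  task(:quality) { puts "running cane" }

  task default: :spec
  task default: :quality
  # `rake --prereqs` would now list: default => spec, quality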
@@ -0,0 +1,24 @@
+ # -*- encoding: utf-8 -*-
+ require File.expand_path('../lib/etl/version', __FILE__)
+
+ Gem::Specification.new do |gem|
+   gem.authors       = ["Jeff Iacono"]
+   gem.email         = ["iacono@squareup.com"]
+   gem.description   = %q{Extract, Transform, and Load (ETL) ruby wrapper}
+   gem.summary       = %q{Extract, Transform, and Load (ETL) ruby wrapper. Supports basic and iterative ETL operations.}
+   gem.homepage      = "https://github.com/square/ETL"
+
+   gem.files         = `git ls-files`.split($\)
+   gem.executables   = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
+   gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
+   gem.name          = "ETL"
+   gem.require_paths = ["lib"]
+   gem.version       = ETL::VERSION
+
+   gem.add_runtime_dependency "activesupport", [">= 3.2.3"]
+
+   gem.add_development_dependency "rake"
+   gem.add_development_dependency "cane"
+   gem.add_development_dependency "mysql2"
+   gem.add_development_dependency "rspec", [">= 2"]
+ end
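
Not part of the diff: given the gemspec above (gem.name "ETL", require path "lib", main file etl.rb), a consumer would presumably pull the gem in like this. The Gemfile line assumes the gem is published under the name "ETL"; the connection object is a placeholder.

  # Gemfile (sketch)
  gem 'ETL'

  # application code
  require 'etl'
  etl = ETL.new connection: my_mysql2_client  # my_mysql2_client: any object responding to #query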
@@ -0,0 +1,195 @@
+ require 'etl/version'
+ require 'etl/helpers'
+ require 'logger'
+ require 'date'
+ require 'time'
+
+ class ETL
+   include Helpers
+
+   attr_accessor :description
+   attr_accessor :connection
+   attr_reader :logger
+
+   ORDERED_ETL_OPERATIONS = [
+     :ensure_destination,
+     :before_etl,
+     :etl,
+     :after_etl
+   ]
+
+   ITERATOR_OPERATIONS = [
+     :start,
+     :step,
+     :stop
+   ]
+
+   def initialize attributes = {}
+     attributes.keys.uniq.each do |attribute|
+       self.send "#{attribute}=", attributes[attribute]
+     end
+     default_logger! unless attributes.keys.include?(:logger)
+   end
+
+   def config &block
+     yield self if block_given?
+     self
+   end
+
+   def logger= logger
+     @logger = logger
+   end
+
+   # A little metaprogramming to consolidate the generation of our SQL
+   # generating / querying methods. Note that we don't metaprogram the etl
+   # operation as it's a little more complex.
+   #
+   # This will produce methods of the form:
+   #
+   #   def [name] *args, &block
+   #     if block_given?
+   #       @[name] = block
+   #     else
+   #       @[name].call self, *args if @[name]
+   #     end
+   #   end
+   #
+   # for each method name in the array.
+   (ORDERED_ETL_OPERATIONS - [:etl]).each do |method|
+     define_method method do |*args, &block|
+       if block
+         instance_variable_set("@#{method}", block)
+       else
+         instance_variable_get("@#{method}").
+           call(self, *args) if instance_variable_get("@#{method}")
+       end
+     end
+   end
+
+   def etl *args, &block
+     if block_given?
+       @etl = block
+     else
+       if iterate?
+         if @etl
+           current = start
+           @etl.call self, cast(current), cast(current += step) while stop >= current
+         end
+       else
+         @etl.call self, *args if @etl
+       end
+     end
+   end
+
+   # A little more metaprogramming to consolidate the generation of
+   # our SQL generating / querying methods.
+   #
+   # This will produce methods of the form:
+   #
+   #   def [method] *args, &block
+   #     if block
+   #       @_[method]_block = block
+   #     else
+   #       # cache block's result
+   #       if defined? @[method]
+   #         @[method]
+   #       else
+   #         @[method] = @_[method]_block.call(self, *args)
+   #       end
+   #     end
+   #   end
+   #
+   # for each method name in the array.
+   ITERATOR_OPERATIONS.each do |method|
+     define_method method do |*args, &block|
+       if block
+         instance_variable_set("@_#{method}_block", block)
+       else
+         if instance_variable_defined?("@#{method}")
+           instance_variable_get("@#{method}")
+         else
+           instance_variable_set("@#{method}",
+             instance_variable_get("@_#{method}_block")
+               .call(self, *args))
+         end
+       end
+     end
+   end
+
+   def run options = {}
+     (ORDERED_ETL_OPERATIONS - [*options[:except]]).each do |method|
+       send method
+     end
+   end
+
+   def query sql
+     time_and_log(sql: sql) do
+       connection.query sql
+     end
+   end
+
+   private
+
+   def iterate?
+     ITERATOR_OPERATIONS.all? do |method|
+       instance_variable_defined?("@_#{method}_block")
+     end
+   end
+
+   def default_logger!
+     @logger = default_logger
+   end
+
+   def logger?
+     !!@logger
+   end
+
+   def info data = {}
+     logger.info data.merge(emitter: self) if logger?
+   end
+
+   def debug data = {}
+     logger.debug data.merge(emitter: self) if logger?
+   end
+
+   def default_logger
+     ::Logger.new(STDOUT).tap do |logger|
+       logger.formatter = proc do |severity, datetime, progname, msg|
+         lead = "[#{datetime}] #{severity} #{msg[:event_type]}"
+         desc = "\"#{msg[:emitter].description || 'no description given'}\""
+         desc += " (object #{msg[:emitter].object_id})"
+
+         case msg[:event_type]
+         when :query_start
+           "#{lead} for #{desc}\n#{msg[:sql]}\n"
+         when :query_complete
+           "#{lead} for #{desc} runtime: #{msg[:runtime]}s\n"
+         else
+           "#{msg}"
+         end
+       end
+     end
+   end
+
+   def time_and_log data = {}, &block
+     start_runtime = Time.now
+     debug data.merge(event_type: :query_start)
+     retval = yield
+     info data.merge(event_type: :query_complete,
+                     runtime: Time.now - start_runtime)
+     retval
+   end
+
+   # NOTE: If you need to handle more data types you can add to the case
+   # statement. If you need to handle entirely different sets of casting
+   # depending on database engine, you can modify #cast to take a "type" arg
+   # and then determine which caster to route the arg through.
+   def cast arg
+     case arg
+     when Date then arg.strftime("%Y-%m-%d")
+     when Time then arg.strftime("%Y-%m-%d %H:%M:%S")
+     else
+       arg
+     end
+   end
+ end
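
Not part of the diff: a minimal usage sketch of the class above. The four ORDERED_ETL_OPERATIONS are configured as blocks and then executed in order by #run. Table and column names here are illustrative; the connection can be any object responding to #query, such as a Mysql2::Client.

  require 'mysql2'
  require 'etl'

  # Illustrative database and table names only.
  etl = ETL.new connection: Mysql2::Client.new(host: 'localhost', database: 'analytics')

  etl.config do |etl|
    etl.ensure_destination do |etl|
      etl.query "CREATE TABLE IF NOT EXISTS daily_totals (name VARCHAR(10), total INT)"
    end

    etl.before_etl { |etl| etl.query "DELETE FROM events WHERE amount < 0" }

    etl.etl do |etl|
      etl.query "REPLACE INTO daily_totals SELECT name, SUM(amount) FROM events GROUP BY name"
    end

    etl.after_etl { |etl| etl.query "UPDATE daily_totals SET total = 0 WHERE total IS NULL" }
  end

  etl.run                              # runs ensure_destination, before_etl, etl, after_etl in order
  etl.run except: :ensure_destination  # or skip selected operations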
@@ -0,0 +1,57 @@
+ class ETL
+   module Helpers
+     # max_for returns the max value of the given column in the specified
+     # database.table. If there is no current max, we fall back to a default
+     # value via COALESCE. You can pass a :default_floor value, or the method
+     # will try to derive one for you.
+     #
+     # Note: we try to detect whether a date return type is wanted via the
+     # #date? check. If so, we wrap the whole SELECT clause in DATE() so the
+     # result is cast accordingly.
+     def max_for options = {}
+       database = options[:database]
+       table    = options[:table]
+       column   = options[:column]
+
+       default_value = options[:default_floor] ||
+                       default_floor_for(column)
+
+       if date? default_value
+         default_value = "DATE('#{default_value}')"
+         caster = ->(str) { "DATE(#{str})" }
+       end
+
+       max_sql_clause = "COALESCE(MAX(#{table}.#{column}), #{default_value})"
+       max_sql_clause = caster.(max_sql_clause) if caster
+
+       sql = <<-EOS
+         SELECT #{max_sql_clause} AS the_max
+         FROM #{database}.#{table}
+       EOS
+       sql += " WHERE #{options[:conditions]}" if options[:conditions]
+
+       query(sql).to_a.first['the_max']
+     end
+
+     private
+
+     def date? val
+       val =~ /^\d{4}-\d{1,2}-\d{1,2}( \d{2}:\d{2}:\d{2}( ((-|\+)\d+)| UTC)?)?$/
+     end
+
+     def default_floor_for column
+       case column
+       when /_at$/
+         return '1970-01-01'
+       when /_date$/
+         return '1970-01-01'
+       when /(^id$|_id$)/
+         return 0
+       else
+         raise ArgumentError, "could not determine a default for #{column}"
+       end
+     end
+   end
+ end
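
Not part of the diff: a short sketch of how #max_for is typically used to derive an iteration floor. Database, table, and column names are illustrative; behavior follows the helper above, which infers a floor for *_at, *_date, and id-style columns and otherwise requires an explicit :default_floor.

  # Assumes etl is an ETL instance wired to a MySQL connection and that the
  # illustrative tables/columns exist.
  etl.max_for database: :analytics,
              table: :events,
              column: :created_at            # "_at" column => derived floor of '1970-01-01'

  etl.max_for database: :analytics,
              table: :events,
              column: :amount,
              default_floor: 0               # non-standard column needs an explicit floor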
@@ -0,0 +1,3 @@
+ class ETL
+   VERSION = "1.0.0.rc"
+ end
@@ -0,0 +1,622 @@
+ require 'mysql2'
+ require 'active_support/time'
+ require 'etl'
+
+ def test_connection
+   Mysql2::Client.new host: 'localhost', username: 'root', database: 'etl_test'
+ end
+
+ def reset_test_env connection, &block
+   connection.query %[DROP DATABASE IF EXISTS etl_test]
+   connection.query %[CREATE DATABASE etl_test]
+   connection.query %[USE etl_test]
+
+   if block_given?
+     yield connection
+   else
+     connection.query %[
+       CREATE TABLE etl_source (
+         id INT NOT NULL
+         , name VARCHAR(10)
+         , amount INT(11) DEFAULT 0
+         , PRIMARY KEY (id))]
+
+     connection.query %[
+       INSERT INTO etl_test.etl_source (id, name, amount)
+       VALUES
+         (1, 'Jeff', 100),
+         (2, 'Ryan', 50),
+         (3, 'Jack', 75),
+         (4, 'Jeff', 10),
+         (5, 'Jack', 45),
+         (6, 'Nick', -90),
+         (7, 'Nick', 90)
+     ]
+   end
+ end
+
+ describe ETL do
+   let(:logger) { nil }
+
+   describe "#logger=" do
+     let(:etl) { described_class.new connection: stub }
+
+     it 'assigns' do
+       logger = stub
+       etl.logger = logger
+       etl.logger.should == logger
+     end
+   end
+
+   describe '#max_for' do
+     let(:connection) { test_connection }
+     let(:etl) { described_class.new connection: connection, logger: logger }
+
+     before do
+       client = Mysql2::Client.new host: 'localhost', username: 'root'
+       client.query %[DROP DATABASE IF EXISTS etl_test]
+       client.query %[CREATE DATABASE etl_test]
+       client.query %[USE etl_test]
+       client.query %[
+         CREATE TABLE IF NOT EXISTS etl_source (
+           id INT(11) NOT NULL AUTO_INCREMENT
+           , name VARCHAR(10)
+           , amount INT(11) DEFAULT 0
+           , the_date DATE DEFAULT NULL
+           , the_null_date DATE DEFAULT NULL
+           , the_time_at DATETIME DEFAULT NULL
+           , the_null_time_at DATETIME DEFAULT NULL
+           , PRIMARY KEY (id))]
+
+       client.query %[
+         INSERT INTO etl_source (
+           name
+           , amount
+           , the_date
+           , the_null_date
+           , the_time_at
+           , the_null_time_at
+         ) VALUES
+           ('Jeff', 100, '2012-01-02', NULL, '2012-01-02 00:00:01', NULL)
+           , ('Ryan', 50, '2012-01-01', NULL, '2012-01-01 00:00:00', NULL)
+           , ('Jack', 75, '2012-01-01', NULL, '2012-01-01 00:00:00', NULL)
+           , ('Jeff', 10, '2012-01-01', NULL, '2012-01-01 00:00:00', NULL)
+           , ('Jack', 45, '2012-01-01', NULL, '2012-01-01 00:00:00', NULL)
+           , ('Nick', -90, '2012-01-01', NULL, '2012-01-01 00:00:00', NULL)
+           , ('Nick', 90, '2012-01-01', NULL, '2012-01-01 00:00:00', NULL)]
+
+       client.close
+     end
+
+     after { connection.close }
+
+     it "finds the max for dates" do
+       etl.max_for(database: :etl_test,
+                   table: :etl_source,
+                   column: :the_date).should == Date.parse('2012-01-02')
+     end
+
+     it "defaults to the beginning of time date when a max date cannot be found" do
+       etl.max_for(database: :etl_test,
+                   table: :etl_source,
+                   column: :the_null_date).should == Date.parse('1970-01-01')
+     end
+
+     it "defaults to the specified default floor when a max date cannot be found" do
+       etl.max_for(database: :etl_test,
+                   table: :etl_source,
+                   column: :the_null_date,
+                   default_floor: '2011-01-01').should == Date.parse('2011-01-01')
+     end
+
+     it "finds the max for datetimes" do
+       etl.max_for(database: :etl_test,
+                   table: :etl_source,
+                   column: :the_time_at).should == Date.parse('2012-01-02')
+     end
+
+     it "defaults to the beginning of time when a max datetime cannot be found" do
+       etl.max_for(database: :etl_test,
+                   table: :etl_source,
+                   column: :the_null_time_at).should == Date.parse('1970-01-01 00:00:00')
+     end
+
+     it "defaults to the specified default floor when a max datetime cannot be found" do
+       etl.max_for(database: :etl_test,
+                   table: :etl_source,
+                   column: :the_null_time_at,
+                   default_floor: '2011-01-01 00:00:00').should == Date.parse('2011-01-01 00:00:00')
+     end
+
+     it "raises an error if a non-standard column is supplied with no default floor" do
+       expect {
+         etl.max_for database: :etl_test,
+                     table: :etl_source,
+                     column: :amount
+       }.to raise_exception
+     end
+
+     it "finds the max for a non-standard column, using the default floor" do
+       etl.max_for(database: :etl_test,
+                   table: :etl_source,
+                   column: :amount,
+                   default_floor: 0).should == 100
+     end
+   end
+
+   describe '#run' do
+     let(:connection) { test_connection }
+     let(:etl) { described_class.new connection: connection, logger: logger }
+
+     before do
+       client = Mysql2::Client.new host: 'localhost', username: 'root'
+       client.query %[DROP DATABASE IF EXISTS etl_test]
+       client.query %[CREATE DATABASE etl_test]
+       client.query %[USE etl_test]
+       client.query %[
+         CREATE TABLE IF NOT EXISTS etl_source (
+           id INT(11) NOT NULL AUTO_INCREMENT
+           , name VARCHAR(10)
+           , amount INT(11) DEFAULT 0
+           , PRIMARY KEY (id))]
+
+       client.query %[
+         INSERT INTO etl_source (name, amount)
+         VALUES
+           ('Jeff', 100),
+           ('Ryan', 50),
+           ('Jack', 75),
+           ('Jeff', 10),
+           ('Jack', 45),
+           ('Nick', -90),
+           ('Nick', 90)]
+
+       client.close
+     end
+
+     it "executes the specified sql in the appropriate order" do
+       etl.ensure_destination do |etl|
+         etl.query %[
+           CREATE TABLE IF NOT EXISTS etl_destination (
+             name VARCHAR(10)
+             , total_amount INT(11) DEFAULT 0
+             , PRIMARY KEY (name))]
+       end
+
+       etl.before_etl do |etl|
+         etl.query "DELETE FROM etl_source WHERE amount < 0"
+       end
+
+       etl.etl do |etl|
+         etl.query %[
+           REPLACE INTO etl_destination
+           SELECT name, SUM(amount) FROM etl_source
+           GROUP BY name]
+       end
+
+       etl.after_etl do |etl|
+         etl.query %[
+           UPDATE etl_destination
+           SET name = CONCAT("SUPER ", name)
+           WHERE total_amount > 115]
+       end
+
+       etl.run
+
+       connection
+         .query("SELECT * FROM etl_destination ORDER BY total_amount DESC")
+         .to_a
+         .should == [
+           {'name' => 'SUPER Jack', 'total_amount' => 120},
+           {'name' => 'Jeff', 'total_amount' => 110},
+           {'name' => 'Nick', 'total_amount' => 90},
+           {'name' => 'Ryan', 'total_amount' => 50}]
+     end
+   end
+
+   describe '#run with operations specified for exclusion' do
+     let(:connection) { stub }
+     let(:etl) { described_class.new connection: connection, logger: logger }
+
+     it "does not call the specified method" do
+       etl.ensure_destination {}
+       etl.should_not_receive(:ensure_destination)
+       etl.run except: :ensure_destination
+     end
+   end
+
+   context "with iteration" do
+     describe '#run over full table' do
+       let(:connection) { test_connection }
+       let(:etl) { described_class.new connection: connection, logger: logger }
+
+       before { reset_test_env connection }
+       after { connection.close }
+
+       it "executes the specified sql in the appropriate order and ETLs properly" do
+         etl.ensure_destination do |etl|
+           etl.query %[
+             CREATE TABLE etl_destination (
+               id INT NOT NULL
+               , name VARCHAR(10)
+               , amount INT(11) DEFAULT 0
+               , PRIMARY KEY (id))]
+         end
+
+         etl.before_etl do |etl|
+           etl.query "DELETE FROM etl_source WHERE amount < 0"
+         end
+
+         etl.start do |etl|
+           etl.query(
+             "SELECT COALESCE(MAX(id), 0) AS the_start FROM etl_destination"
+           ).to_a.first['the_start']
+         end
+
+         etl.step do
+           1
+         end
+
+         etl.stop do |etl|
+           etl.query(
+             "SELECT MAX(id) AS the_stop FROM etl_source"
+           ).to_a.first['the_stop']
+         end
+
+         etl.etl do |etl, lbound, ubound|
+           etl.query %[
+             REPLACE INTO etl_destination
+             SELECT id, name, amount FROM etl_source s
+             WHERE s.id >= #{lbound}
+               AND s.id < #{ubound}]
+         end
+
+         etl.after_etl do |etl|
+           etl.query %[
+             UPDATE etl_destination
+             SET name = CONCAT("SUPER ", name)
+             WHERE id <= 1]
+         end
+
+         etl.run
+
+         connection
+           .query("SELECT * FROM etl_destination ORDER BY id ASC")
+           .to_a
+           .should == [
+             {'id' => 1, 'name' => 'SUPER Jeff', 'amount' => 100},
+             {'id' => 2, 'name' => 'Ryan', 'amount' => 50},
+             {'id' => 3, 'name' => 'Jack', 'amount' => 75},
+             {'id' => 4, 'name' => 'Jeff', 'amount' => 10},
+             {'id' => 5, 'name' => 'Jack', 'amount' => 45},
+             {'id' => 7, 'name' => 'Nick', 'amount' => 90}]
+       end
+     end
+
+     describe '#run over part of table' do
+       let(:connection) { test_connection }
+       let(:etl) { described_class.new connection: connection, logger: logger }
+
+       before { reset_test_env connection }
+       after { connection.close }
+
+       it "executes the specified sql in the appropriate order and ETLs properly" do
+         etl.ensure_destination do |etl|
+           etl.query %[
+             CREATE TABLE etl_destination (
+               id INT NOT NULL
+               , name VARCHAR(10)
+               , amount INT(11) DEFAULT 0
+               , PRIMARY KEY (id))]
+         end
+
+         etl.before_etl do |etl|
+           etl.query "DELETE FROM etl_source WHERE amount < 0"
+         end
+
+         etl.start do
+           4
+         end
+
+         etl.step do
+           1
+         end
+
+         etl.stop do |etl|
+           etl.query(
+             "SELECT MAX(id) AS the_stop FROM etl_source"
+           ).to_a.first['the_stop']
+         end
+
+         etl.etl do |etl, lbound, ubound|
+           etl.query %[
+             REPLACE INTO etl_destination
+             SELECT id, name, amount FROM etl_source s
+             WHERE s.id >= #{lbound}
+               AND s.id < #{ubound}]
+         end
+
+         etl.run
+
+         connection
+           .query("SELECT * FROM etl_destination ORDER BY id ASC")
+           .to_a.should == [
+             {'id' => 4, 'name' => 'Jeff', 'amount' => 10},
+             {'id' => 5, 'name' => 'Jack', 'amount' => 45},
+             {'id' => 7, 'name' => 'Nick', 'amount' => 90}]
+       end
+     end
+
+     describe "#run over gappy data" do
+       let(:connection) { test_connection }
+       let(:etl) { described_class.new connection: connection, logger: logger }
+
+       before do
+         reset_test_env(connection) do |connection|
+           connection.query %[
+             CREATE TABLE etl_source (
+               id INT NOT NULL
+               , name VARCHAR(10)
+               , amount INT(11) DEFAULT 0
+               , PRIMARY KEY (id))]
+
+           connection.query %[
+             INSERT INTO etl_source (id, name, amount)
+             VALUES
+               (1, 'Jeff', 100),
+               (2, 'Ryan', 50),
+               (13, 'Jack', 75),
+               (14, 'Jeff', 10),
+               (15, 'Jack', 45),
+               (16, 'Nick', -90),
+               (17, 'Nick', 90)]
+         end
+       end
+
+       after { connection.close }
+
+       it "executes the specified sql in the appropriate order without getting stuck" do
+         etl.ensure_destination do |etl|
+           etl.query %[
+             CREATE TABLE etl_destination (
+               id INT NOT NULL
+               , name VARCHAR(10)
+               , amount INT(11) DEFAULT 0
+               , PRIMARY KEY (id))]
+         end
+
+         etl.before_etl do |etl|
+           etl.query "DELETE FROM etl_source WHERE amount < 0"
+         end
+
+         etl.start do |etl|
+           1
+         end
+
+         etl.step do
+           1
+         end
+
+         etl.stop do |etl|
+           etl.query(
+             "SELECT MAX(id) AS the_stop FROM etl_source"
+           ).to_a.first['the_stop']
+         end
+
+         etl.etl do |etl, lbound, ubound|
+           etl.query %[
+             REPLACE INTO etl_destination
+             SELECT
+               id
+               , name
+               , amount
+             FROM etl_source s
+             WHERE s.id >= #{lbound}
+               AND s.id < #{ubound}]
+         end
+
+         etl.run
+
+         connection
+           .query("SELECT * FROM etl_destination ORDER BY id ASC")
+           .to_a
+           .should == [
+             {'id' => 1, 'name' => 'Jeff', 'amount' => 100},
+             {'id' => 2, 'name' => 'Ryan', 'amount' => 50},
+             {'id' => 13, 'name' => 'Jack', 'amount' => 75},
+             {'id' => 14, 'name' => 'Jeff', 'amount' => 10},
+             {'id' => 15, 'name' => 'Jack', 'amount' => 45},
+             {'id' => 17, 'name' => 'Nick', 'amount' => 90}]
+       end
+     end
+
+     describe "#run over date data" do
+       let(:connection) { test_connection }
+       let(:etl) { described_class.new connection: connection, logger: logger }
+
+       before do
+         reset_test_env(connection) do |connection|
+           connection.query %[
+             CREATE TABLE etl_source (
+               the_date DATE NOT NULL
+               , name VARCHAR(10)
+               , amount INT(11) DEFAULT 0)]
+
+           connection.query %[
+             INSERT INTO etl_source (the_date, name, amount)
+             VALUES
+               ('2012-01-01', 'Jeff', 100),
+               ('2012-01-01', 'Ryan', 50),
+               ('2012-01-01', 'Jack', 75),
+               ('2012-01-01', 'Jeff', 10),
+               ('2012-01-02', 'Jack', 45),
+               ('2012-01-02', 'Nick', -90),
+               ('2012-01-02', 'Nick', 90)]
+         end
+       end
+
+       after { connection.close }
+
+       it "executes the specified sql in the appropriate order and ETLs properly" do
+         etl.ensure_destination do |etl|
+           etl.query %[
+             CREATE TABLE etl_destination (
+               the_date DATE NOT NULL
+               , name VARCHAR(10)
+               , total_amount INT(11) DEFAULT 0
+               , PRIMARY KEY (the_date, name))]
+         end
+
+         etl.before_etl do |etl|
+           etl.query "DELETE FROM etl_source WHERE amount < 0"
+         end
+
+         etl.start do |etl|
+           etl.query(%[
+             SELECT COALESCE(MAX(the_date), DATE('2012-01-01')) AS the_start
+             FROM etl_destination
+           ]).to_a.first['the_start']
+         end
+
+         etl.step do
+           1.day
+         end
+
+         etl.stop do |etl|
+           etl.query(
+             "SELECT MAX(the_date) AS the_stop FROM etl_source"
+           ).to_a.first['the_stop']
+         end
+
+         etl.etl do |etl, lbound, ubound|
+           etl.query %[
+             REPLACE INTO etl_destination
+             SELECT
+               the_date
+               , name
+               , SUM(amount) AS total_amount
+             FROM etl_source s
+             WHERE s.the_date >= '#{lbound}'
+               AND s.the_date < '#{ubound}'
+             GROUP BY
+               the_date
+               , name]
+         end
+
+         etl.run
+
+         connection
+           .query(%[
+             SELECT
+               the_date
+               , name
+               , total_amount
+             FROM
+               etl_destination
+             ORDER BY
+               the_date ASC
+               , name ASC
+           ]).to_a
+           .should == [
+             {'the_date' => Date.parse('2012-01-01'), 'name' => 'Jack', 'total_amount' => 75},
+             {'the_date' => Date.parse('2012-01-01'), 'name' => 'Jeff', 'total_amount' => 110},
+             {'the_date' => Date.parse('2012-01-01'), 'name' => 'Ryan', 'total_amount' => 50},
+             {'the_date' => Date.parse('2012-01-02'), 'name' => 'Jack', 'total_amount' => 45},
+             {'the_date' => Date.parse('2012-01-02'), 'name' => 'Nick', 'total_amount' => 90}]
+       end
+     end
+
+     describe "#run over datetime data" do
+       let(:connection) { test_connection }
+       let(:etl) { described_class.new connection: connection, logger: logger }
+
+       before do
+         reset_test_env(connection) do |connection|
+           connection.query %[
+             CREATE TABLE etl_source (
+               the_datetime DATETIME NOT NULL
+               , name VARCHAR(10)
+               , amount INT(11) DEFAULT 0)]
+
+           connection.query %[
+             INSERT INTO etl_source (the_datetime, name, amount)
+             VALUES
+               ('2011-12-31 23:59:59', 'Jeff', 100),
+               ('2012-01-01 00:01:00', 'Ryan', 50),
+               ('2012-01-01 00:01:01', 'Jack', 75),
+               ('2012-01-01 00:01:02', 'Jeff', 10),
+               ('2012-01-02 00:02:00', 'Jack', 45),
+               ('2012-01-02 00:02:01', 'Nick', -90),
+               ('2012-01-02 00:02:02', 'Nick', 90)]
+         end
+       end
+
+       after { connection.close }
+
+       it "executes the specified sql in the appropriate order and ETLs properly" do
+         etl.ensure_destination do |etl|
+           etl.query %[
+             CREATE TABLE etl_destination (
+               the_datetime DATETIME NOT NULL
+               , name VARCHAR(10)
+               , amount INT(11) DEFAULT 0
+               , PRIMARY KEY (the_datetime, name))]
+         end
+
+         etl.before_etl do |etl|
+           etl.query "DELETE FROM etl_source WHERE amount < 0"
+         end
+
+         etl.start do |etl|
+           etl.query(%[
+             SELECT CAST(COALESCE(MAX(the_datetime), '2012-01-01 00:00:00') AS DATETIME) AS the_start
+             FROM etl_destination
+           ]).to_a.first['the_start']
+         end
+
+         etl.step do
+           1.minute
+         end
+
+         etl.stop do |etl|
+           etl.query(
+             "SELECT MAX(the_datetime) AS the_stop FROM etl_source"
+           ).to_a.first['the_stop']
+         end
+
+         etl.etl do |etl, lbound, ubound|
+           etl.query %[
+             REPLACE INTO etl_destination
+             SELECT
+               the_datetime
+               , name
+               , amount
+             FROM etl_source s
+             WHERE s.the_datetime >= '#{lbound}'
+               AND s.the_datetime < '#{ubound}']
+         end
+
+         etl.run
+
+         connection
+           .query(%[
+             SELECT
+               the_datetime
+               , name
+               , amount
+             FROM
+               etl_destination
+             ORDER BY
+               the_datetime ASC
+               , name ASC
+           ]).to_a
+           .should == [
+             {'the_datetime' => Time.parse('2012-01-01 00:01:00'), 'name' => 'Ryan', 'amount' => 50},
+             {'the_datetime' => Time.parse('2012-01-01 00:01:01'), 'name' => 'Jack', 'amount' => 75},
+             {'the_datetime' => Time.parse('2012-01-01 00:01:02'), 'name' => 'Jeff', 'amount' => 10},
+             {'the_datetime' => Time.parse('2012-01-02 00:02:00'), 'name' => 'Jack', 'amount' => 45},
+             {'the_datetime' => Time.parse('2012-01-02 00:02:02'), 'name' => 'Nick', 'amount' => 90}]
+       end
+     end
+   end
+ end
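
Not part of the diff: the suite above talks to a real MySQL server. test_connection assumes a local server reachable as root with no password and permission to drop and recreate the etl_test database, and it relies on the development dependencies from the gemspec (rspec, mysql2) plus activesupport for the 1.day / 1.minute step durations. Distilled from these specs, an iterative, date-windowed job would presumably look like this (table and column names are illustrative):

  require 'mysql2'
  require 'active_support/time'
  require 'etl'

  # Illustrative database and table names only.
  etl = ETL.new connection: Mysql2::Client.new(host: 'localhost', database: 'analytics')

  etl.start do |etl|
    etl.query(
      "SELECT COALESCE(MAX(the_date), DATE('2012-01-01')) AS the_start FROM warehouse_table"
    ).to_a.first['the_start']
  end

  etl.step { 1.day }   # ActiveSupport duration; ETL#cast turns each Date bound into a string

  etl.stop do |etl|
    etl.query("SELECT MAX(the_date) AS the_stop FROM source_table").to_a.first['the_stop']
  end

  etl.etl do |etl, lbound, ubound|
    etl.query %[
      REPLACE INTO warehouse_table
      SELECT the_date, name, SUM(amount)
      FROM source_table
      WHERE the_date >= '#{lbound}' AND the_date < '#{ubound}'
      GROUP BY the_date, name]
  end

  etl.run  # iterates from start to stop in one-day windows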