ETL 0.0.1 → 1.0.0.rc

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Rakefile CHANGED
@@ -1,2 +1,28 @@
1
1
  #!/usr/bin/env rake
2
2
  require "bundler/gem_tasks"
3
+ begin
4
+ require 'rspec/core/rake_task'
5
+
6
+ RSpec::Core::RakeTask.new(:spec) do |t|
7
+ t.rspec_opts = '-b'
8
+ end
9
+
10
+ task default: :spec
11
+ rescue LoadError
12
+ $stderr.puts "rspec not available, spec task not provided"
13
+ end
14
+
15
+ begin
16
+ require 'cane/rake_task'
17
+
18
+ desc "Run cane to check quality metrics"
19
+ Cane::RakeTask.new(:quality) do |cane|
20
+ cane.abc_max = 10
21
+ cane.style_glob = "lib/**/*.rb"
22
+ cane.no_doc = true
23
+ end
24
+
25
+ task :default => :quality
26
+ rescue LoadError
27
+ warn "cane not available, quality task not provided."
28
+ end
@@ -0,0 +1,24 @@
1
# -*- encoding: utf-8 -*-
require File.expand_path('../lib/etl/version', __FILE__)

# Gem packaging metadata for the ETL gem.
Gem::Specification.new do |gem|
  gem.authors = ["Jeff Iacono"]
  gem.email = ["iacono@squareup.com"]
  gem.description = %q{Extract, Transform, and Load (ETL) ruby wrapper}
  gem.summary = %q{Extract, Transform, and Load (ETL) ruby wrapper. Supports basic and iterative ETL operations.}
  gem.homepage = "https://github.com/square/ETL"

  # `git ls-files` prints one path per line. Split on newlines explicitly
  # instead of on $\ (nil by default, which splits on any whitespace and would
  # break paths that contain spaces).
  gem.files = `git ls-files`.split("\n")
  gem.executables = gem.files.grep(%r{^bin/}).map { |f| File.basename(f) }
  gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
  gem.name = "ETL"
  gem.require_paths = ["lib"]
  gem.version = ETL::VERSION

  gem.add_runtime_dependency "activesupport", [">= 3.2.3"]

  gem.add_development_dependency "rake"
  gem.add_development_dependency "cane"
  gem.add_development_dependency "mysql2"
  gem.add_development_dependency "rspec", [">= 2"]
end
@@ -0,0 +1,195 @@
1
+ require 'etl/version'
2
+ require 'etl/helpers'
3
+ require 'logger'
4
+ require 'date'
5
+ require 'time'
6
+
7
+ class ETL
8
+ include Helpers
9
+
10
+ attr_accessor :description
11
+ attr_accessor :connection
12
+ attr_reader :logger
13
+
14
# Lifecycle operations in the order #run sends them. Frozen so the shared
# list cannot be mutated by accident.
ORDERED_ETL_OPERATIONS = [
  :ensure_destination,
  :before_etl,
  :etl,
  :after_etl
].freeze

# Operations that drive iteration. #etl iterates only once a block has been
# registered for every one of them (see #iterate?).
ITERATOR_OPERATIONS = [
  :start,
  :step,
  :stop
].freeze
26
+
27
# Builds an ETL object from a hash of attributes.
#
# Each key is assigned through its "<key>=" writer. When no logger is
# supplied at all, the default logger is installed; an explicit
# `logger: nil` is respected and leaves logging disabled.
#
# attributes - Hash of attribute name (Symbol or String) to value.
def initialize attributes = {}
  attributes.each do |attribute, value|
    send "#{attribute}=", value
  end

  # Accept the key in either form: a String 'logger' key is assigned by the
  # loop above and must not then be clobbered by the default.
  logger_given = attributes.key?(:logger) || attributes.key?('logger')
  default_logger! unless logger_given
end
33
+
34
# Yields the receiver to the given block (if any) so it can be configured
# in place, and always returns the receiver.
def config
  tap { |etl| yield etl if block_given? }
end
38
+
39
# Replaces the logger used by #info and #debug. Assigning nil (or false)
# disables logging, since #logger? then reports false.
def logger= new_logger
  @logger = new_logger
end
42
+
43
# Generates the simple lifecycle callbacks (every ordered operation except
# :etl, which needs iteration support and is written out by hand).
#
# Each generated method behaves like:
#
#   def [name] *args, &block
#     if block_given?
#       @[name] = block                      # register the callback
#     else
#       @[name].call self, *args if @[name]  # run it, if one was registered
#     end
#   end
(ORDERED_ETL_OPERATIONS - [:etl]).each do |operation|
  ivar = "@#{operation}"

  define_method operation do |*args, &block|
    if block
      instance_variable_set(ivar, block)
    else
      callback = instance_variable_get(ivar)
      callback.call(self, *args) if callback
    end
  end
end
68
+
69
# Registers or runs the main ETL step.
#
# With a block: stores it as the ETL step and returns it.
#
# Without a block: runs the stored step (returns nil when none is stored).
#   * Non-iterative mode: calls the step once with (self, *args) and returns
#     its result.
#   * Iterative mode (start, step and stop blocks all registered): calls the
#     step with (self, lower, upper) for successive windows
#     [current, current + step) while stop >= current. Both bounds are passed
#     through #cast. Returns nil.
#
# Raises ArgumentError when adding step to the current bound does not move
# it forward, since the loop could otherwise never terminate.
def etl *args, &block
  return @etl = block if block
  return unless @etl
  return @etl.call(self, *args) unless iterate?

  current = start
  # A nil bound (e.g. a start/stop block that selected MAX() from an empty
  # table) means there is no range to walk; comparing against nil would
  # raise NoMethodError.
  return if current.nil? || stop.nil?

  while stop >= current
    upper = current + step
    unless upper > current
      raise ArgumentError,
            "step must advance the iteration (got #{step.inspect})"
    end

    @etl.call self, cast(current), cast(upper)
    current = upper
  end

  nil
end
83
+
84
# Generates the iterator operations (start, step, stop).
#
# Each generated method behaves like:
#
#   def [method] *args, &block
#     if block
#       @_[method]_block = block   # register the producer, drop stale cache
#     elsif defined? @[method]
#       @[method]                  # cached result of an earlier call
#     elsif @_[method]_block
#       @[method] = @_[method]_block.call(self, *args)
#     end                          # nil when no producer is registered
#   end
#
# The producer's result is cached, so it runs at most once per registration.
ITERATOR_OPERATIONS.each do |operation|
  block_ivar = "@_#{operation}_block"
  value_ivar = "@#{operation}"

  define_method operation do |*args, &block|
    if block
      # A new producer invalidates whatever the previous one computed.
      if instance_variable_defined?(value_ivar)
        remove_instance_variable(value_ivar)
      end
      instance_variable_set(block_ivar, block)
    elsif instance_variable_defined?(value_ivar)
      instance_variable_get(value_ivar)
    elsif (producer = instance_variable_get(block_ivar))
      instance_variable_set(value_ivar, producer.call(self, *args))
    end
    # Falls through to nil when nothing is registered, matching the lifecycle
    # callbacks instead of raising NoMethodError on nil.
  end
end
118
+
119
# Sends each operation in ORDERED_ETL_OPERATIONS to the receiver, in order.
#
# options - Hash; :except may hold one operation name or an Array of names
#           to leave out of this run.
#
# Returns the Array of operations that were sent.
def run options = {}
  skipped = Array(options[:except])
  operations = ORDERED_ETL_OPERATIONS - skipped

  operations.each { |operation| send operation }
end
124
+
125
# Runs sql on the configured connection, wrapped in #time_and_log so the
# statement and its runtime are reported to the logger.
#
# Returns whatever connection.query returns.
def query sql
  time_and_log(sql: sql) { connection.query sql }
end
130
+
131
+ private
132
+
133
# True only when a block has been registered for every iterator operation
# (start, step and stop); #etl uses this to pick iterative mode.
def iterate?
  ITERATOR_OPERATIONS
    .map { |operation| "@_#{operation}_block" }
    .all? { |ivar| instance_variable_defined?(ivar) }
end
138
+
139
# Installs the logger built by #default_logger.
def default_logger!
  @logger = default_logger
end

# Whether a logger is currently set.
def logger?
  @logger ? true : false
end

# Logs data at INFO level, tagged with the emitting object. No-op without
# a logger.
def info data = {}
  return unless logger?

  logger.info data.merge(emitter: self)
end

# Logs data at DEBUG level, tagged with the emitting object. No-op without
# a logger.
def debug data = {}
  return unless logger?

  logger.debug data.merge(emitter: self)
end
154
+
155
# Builds the logger used when none is supplied: a ::Logger on STDOUT with a
# formatter that understands the event hashes emitted by #time_and_log.
#
#   :query_start    -> header line followed by the SQL
#   :query_complete -> header line with the runtime in seconds
#   anything else   -> the message itself (plain strings included)
#
# Every formatted entry ends with a newline.
def default_logger
  ::Logger.new(STDOUT).tap do |logger|
    logger.formatter = proc do |severity, datetime, _progname, msg|
      # Only hashes carry an event type. Indexing a String with a Symbol
      # raises, so plain messages must skip straight to the fallback.
      event_type = msg[:event_type] if msg.is_a?(Hash)

      case event_type
      when :query_start, :query_complete
        emitter = msg[:emitter]
        description = emitter.description if emitter.respond_to?(:description)

        lead = "[#{datetime}] #{severity} #{event_type}"
        desc = "\"#{description || 'no description given'}\""
        desc += " (object #{emitter.object_id})"

        if event_type == :query_start
          "#{lead} for #{desc}\n#{msg[:sql]}\n"
        else
          "#{lead} for #{desc} runtime: #{msg[:runtime]}s\n"
        end
      else
        "#{msg}\n"
      end
    end
  end
end
173
+
174
# Runs the given block, logging a :query_start event (debug) before it and
# a :query_complete event (info) carrying the elapsed wall-clock seconds
# after it.
#
# data - Hash merged into both log events.
#
# Returns the block's result.
def time_and_log data = {}
  started_at = Time.now
  debug data.merge(event_type: :query_start)

  yield.tap do
    elapsed = Time.now - started_at
    info data.merge(event_type: :query_complete, runtime: elapsed)
  end
end
182
+
183
# Converts an iteration bound into the value handed to the etl block:
# date/time objects become SQL-style strings, anything else passes through
# untouched.
#
# NOTE: If you need to handle more data type casting you can add branches to
# the case statement. If you need entirely different sets of casting
# depending on database engine, you can modify #cast to take a "type" arg and
# then determine which caster to route the arg through.
def cast arg
  case arg
  # DateTime is a subclass of Date, so it has to be matched before Date or
  # its time of day would be dropped.
  when DateTime, Time then arg.strftime("%Y-%m-%d %H:%M:%S")
  when Date then arg.strftime("%Y-%m-%d")
  else
    arg
  end
end
195
+ end
@@ -0,0 +1,57 @@
1
+ class ETL
2
+ module Helpers
3
# Returns the maximum value of a column, as read from database.table through
# #query. When the table has no maximum, COALESCE substitutes a floor value.
#
# options - Hash with:
#   :database      - database name
#   :table         - table name
#   :column        - column to take the MAX of
#   :default_floor - floor to fall back on (optional; derived from the
#                    column name via #default_floor_for when omitted)
#   :conditions    - SQL fragment appended as a WHERE clause (optional)
#
# When the floor looks like a date (see #date?), both the floor and the whole
# expression are wrapped in DATE(...) so the result is cast accordingly.
#
# The option values are interpolated into the SQL as-is.
def max_for options = {}
  database, table, column = options.values_at(:database, :table, :column)
  floor = options[:default_floor] || default_floor_for(column)

  max_sql_clause =
    if date? floor
      "DATE(COALESCE(MAX(#{table}.#{column}), DATE('#{floor}')))"
    else
      "COALESCE(MAX(#{table}.#{column}), #{floor})"
    end

  sql = <<-EOS
    SELECT #{max_sql_clause} AS the_max
    FROM #{database}.#{table}
  EOS
  sql += " WHERE #{options[:conditions]}" if options[:conditions]

  query(sql).to_a.first['the_max']
end
37
+
38
+ private
39
+
40
# Whether val reads as a date ("YYYY-M-D"), optionally followed by a
# "HH:MM:SS" time and then by a numeric offset or " UTC".
#
# val is converted with #to_s first, so non-String floors (Integer, nil,
# Date, Time) are handled without relying on Object#=~. The pattern is
# anchored to the whole string (\A / \z) because a matching value is later
# interpolated into SQL by #max_for.
#
# Returns the match position (0) when it matches, nil otherwise.
def date? val
  val.to_s =~ /\A\d{4}-\d{1,2}-\d{1,2}( \d{2}:\d{2}:\d{2}( ((-|\+)\d+)| UTC)?)?\z/
end
43
+
44
# Derives a floor value from a column's name:
#
#   *_at, *_date -> '1970-01-01'
#   id, *_id     -> 0
#
# Raises ArgumentError for any other column name.
def default_floor_for column
  case column
  when /_at$/, /_date$/ then '1970-01-01'
  when /(^id$|_id$)/ then 0
  else
    raise ArgumentError, "could not determine a default for #{column}"
  end
end
56
+ end
57
+ end
@@ -0,0 +1,3 @@
1
class ETL
  # Gem version, read by the gemspec. Frozen so it cannot be mutated at
  # runtime.
  VERSION = "1.0.0.rc".freeze
end
@@ -0,0 +1,622 @@
1
+ require 'mysql2'
2
+ require 'active_support/time'
3
+ require 'etl'
4
+
5
# Opens a new Mysql2 client as root on localhost, using the etl_test
# database.
def test_connection
  settings = { host: 'localhost', username: 'root', database: 'etl_test' }
  Mysql2::Client.new settings
end
8
+
9
# Drops and recreates the etl_test database on the given connection and
# switches to it.
#
# With a block, the connection is yielded so the caller can build its own
# fixtures. Without one, a default etl_source table (id, name, amount) is
# created and seeded with seven rows.
def reset_test_env connection
  [
    %[DROP DATABASE IF EXISTS etl_test],
    %[CREATE DATABASE etl_test],
    %[USE etl_test]
  ].each { |statement| connection.query statement }

  return yield(connection) if block_given?

  connection.query %[
    CREATE TABLE etl_source (
      id INT NOT NULL
    , name VARCHAR(10)
    , amount INT(11) DEFAULT 0
    , PRIMARY KEY (id))]

  connection.query %[
    INSERT INTO etl_test.etl_source (id, name, amount)
    VALUES
      (1, 'Jeff', 100),
      (2, 'Ryan', 50),
      (3, 'Jack', 75),
      (4, 'Jeff', 10),
      (5, 'Jack', 45),
      (6, 'Nick', -90),
      (7, 'Nick', 90)
  ]
end
37
+
38
+ describe ETL do
39
+ let(:logger) { nil }
40
+
41
+ describe "#logger=" do
42
+ let(:etl) { described_class.new connection: stub }
43
+
44
+ it 'assigns' do
45
+ logger = stub
46
+ etl.logger = logger
47
+ etl.logger.should == logger
48
+ end
49
+ end
50
+
51
+ describe '#max_for' do
52
+ let(:connection) { test_connection }
53
+ let(:etl) { described_class.new connection: connection, logger: logger }
54
+
55
+ before do
56
+ client = Mysql2::Client.new host: 'localhost', username: 'root'
57
+ client.query %[DROP DATABASE IF EXISTS etl_test]
58
+ client.query %[CREATE DATABASE etl_test]
59
+ client.query %[USE etl_test]
60
+ client.query %[
61
+ CREATE TABLE IF NOT EXISTS etl_source (
62
+ id INT(11) NOT NULL AUTO_INCREMENT
63
+ , name VARCHAR(10)
64
+ , amount INT(11) DEFAULT 0
65
+ , the_date DATE DEFAULT NULL
66
+ , the_null_date DATE DEFAULT NULL
67
+ , the_time_at DATETIME DEFAULT NULL
68
+ , the_null_time_at DATETIME DEFAULT NULL
69
+ , PRIMARY KEY (id))]
70
+
71
+ client.query %[
72
+ INSERT INTO etl_source (
73
+ name
74
+ , amount
75
+ , the_date
76
+ , the_null_date
77
+ , the_time_at
78
+ , the_null_time_at
79
+ ) VALUES
80
+ ('Jeff', 100, '2012-01-02', NULL, '2012-01-02 00:00:01', NULL)
81
+ , ('Ryan', 50, '2012-01-01', NULL, '2012-01-01 00:00:00', NULL)
82
+ , ('Jack', 75, '2012-01-01', NULL, '2012-01-01 00:00:00', NULL)
83
+ , ('Jeff', 10, '2012-01-01', NULL, '2012-01-01 00:00:00', NULL)
84
+ , ('Jack', 45, '2012-01-01', NULL, '2012-01-01 00:00:00', NULL)
85
+ , ('Nick', -90, '2012-01-01', NULL, '2012-01-01 00:00:00', NULL)
86
+ , ('Nick', 90, '2012-01-01', NULL, '2012-01-01 00:00:00', NULL)]
87
+
88
+ client.close
89
+ end
90
+
91
+ after { connection.close }
92
+
93
+ it "finds the max for dates" do
94
+ etl.max_for(database: :etl_test,
95
+ table: :etl_source,
96
+ column: :the_date).should == Date.parse('2012-01-02')
97
+ end
98
+
99
+ it "defaults to the beginning of time date when a max date cannot be found" do
100
+ etl.max_for(database: :etl_test,
101
+ table: :etl_source,
102
+ column: :the_null_date).should == Date.parse('1970-01-01')
103
+ end
104
+
105
+ it "defaults to the specified default floor when a max date cannot be found" do
106
+ etl.max_for(database: :etl_test,
107
+ table: :etl_source,
108
+ column: :the_null_date,
109
+ default_floor: '2011-01-01').should == Date.parse('2011-01-01')
110
+ end
111
+
112
+ it "finds the max for datetimes" do
113
+ etl.max_for(database: :etl_test,
114
+ table: :etl_source,
115
+ column: :the_time_at).should == Date.parse('2012-01-02')
116
+ end
117
+
118
+ it "defaults to the beginning of time when a max datetime cannot be found" do
119
+ etl.max_for(database: :etl_test,
120
+ table: :etl_source,
121
+ column: :the_null_time_at).should == Date.parse('1970-01-01 00:00:00')
122
+ end
123
+
124
+ it "defaults to the specified default floor when a max datetime cannot be found" do
125
+ etl.max_for(database: :etl_test,
126
+ table: :etl_source,
127
+ column: :the_null_time_at,
128
+ default_floor: '2011-01-01 00:00:00').should == Date.parse('2011-01-01 00:00:00')
129
+ end
130
+
131
it "raises an error if a non-standard column is supplied with no default floor" do
  # Pin the error class: a bare raise_exception would also pass on an
  # unrelated failure such as a NoMethodError or a connection problem.
  expect {
    etl.max_for database: :etl_test,
                table: :etl_source,
                column: :amount
  }.to raise_exception(ArgumentError)
end
138
+
139
+ it "finds the max for a non-standard column, using the default floor" do
140
+ etl.max_for(database: :etl_test,
141
+ table: :etl_source,
142
+ column: :amount,
143
+ default_floor: 0).should == 100
144
+ end
145
+ end
146
+
147
+ describe '#run' do
148
+ let(:connection) { test_connection }
149
+ let(:etl) { described_class.new connection: connection, logger: logger }
150
+
151
+ before do
152
+ client = Mysql2::Client.new host: 'localhost', username: 'root'
153
+ client.query %[DROP DATABASE IF EXISTS etl_test]
154
+ client.query %[CREATE DATABASE etl_test]
155
+ client.query %[USE etl_test]
156
+ client.query %[
157
+ CREATE TABLE IF NOT EXISTS etl_source (
158
+ id INT(11) NOT NULL AUTO_INCREMENT
159
+ , name VARCHAR(10)
160
+ , amount INT(11) DEFAULT 0
161
+ , PRIMARY KEY (id))]
162
+
163
+ client.query %[
164
+ INSERT INTO etl_source (name, amount)
165
+ VALUES
166
+ ('Jeff', 100),
167
+ ('Ryan', 50),
168
+ ('Jack', 75),
169
+ ('Jeff', 10),
170
+ ('Jack', 45),
171
+ ('Nick', -90),
172
+ ('Nick', 90)]
173
+
174
+ client.close
175
+ end
176
+
177
+ it "executes the specified sql in the appropriate order" do
178
+ etl.ensure_destination do |etl|
179
+ etl.query %[
180
+ CREATE TABLE IF NOT EXISTS etl_destination (
181
+ name VARCHAR(10)
182
+ , total_amount INT(11) DEFAULT 0
183
+ , PRIMARY KEY (name))]
184
+ end
185
+
186
+ etl.before_etl do |etl|
187
+ etl.query "DELETE FROM etl_source WHERE amount < 0"
188
+ end
189
+
190
+ etl.etl do |etl|
191
+ etl.query %[
192
+ REPLACE INTO etl_destination
193
+ SELECT name, SUM(amount) FROM etl_source
194
+ GROUP BY name]
195
+ end
196
+
197
+ etl.after_etl do |etl|
198
+ etl.query %[
199
+ UPDATE etl_destination
200
+ SET name = CONCAT("SUPER ", name)
201
+ WHERE total_amount > 115]
202
+ end
203
+
204
+ etl.run
205
+
206
+ connection
207
+ .query("SELECT * FROM etl_destination ORDER BY total_amount DESC")
208
+ .to_a
209
+ .should == [
210
+ {'name' => 'SUPER Jack', 'total_amount' => 120},
211
+ {'name' => 'Jeff', 'total_amount' => 110},
212
+ {'name' => 'Nick', 'total_amount' => 90},
213
+ {'name' => 'Ryan', 'total_amount' => 50}]
214
+ end
215
+ end
216
+
217
+ describe '#run with operations specified for exclusion' do
218
+ let(:connection) { stub }
219
+ let(:etl) { described_class.new connection: connection, logger: logger }
220
+
221
+ it "does not call the specified method" do
222
+ etl.ensure_destination {}
223
+ etl.should_not_receive(:ensure_destination)
224
+ etl.run except: :ensure_destination
225
+ end
226
+ end
227
+
228
+ context "with iteration" do
229
+ describe '#run over full table' do
230
+ let(:connection) { test_connection }
231
+ let(:etl) { described_class.new connection: connection, logger: logger }
232
+
233
+ before { reset_test_env connection }
234
+ after { connection.close }
235
+
236
+ it "executes the specified sql in the appropriate order and ETLs properly" do
237
+ etl.ensure_destination do |etl|
238
+ etl.query %[
239
+ CREATE TABLE etl_destination (
240
+ id INT NOT NULL
241
+ , name VARCHAR(10)
242
+ , amount INT(11) DEFAULT 0
243
+ , PRIMARY KEY (id))]
244
+ end
245
+
246
+ etl.before_etl do |etl|
247
+ etl.query "DELETE FROM etl_source WHERE amount < 0"
248
+ end
249
+
250
+ etl.start do |etl|
251
+ etl.query(
252
+ "SELECT COALESCE(MAX(id), 0) AS the_start FROM etl_destination"
253
+ ).to_a.first['the_start']
254
+ end
255
+
256
+ etl.step do
257
+ 1
258
+ end
259
+
260
+ etl.stop do |etl|
261
+ etl.query(
262
+ "SELECT MAX(id) AS the_stop FROM etl_source"
263
+ ).to_a.first['the_stop']
264
+ end
265
+
266
+ etl.etl do |etl, lbound, ubound|
267
+ etl.query %[
268
+ REPLACE INTO etl_destination
269
+ SELECT id, name, amount FROM etl_source s
270
+ WHERE s.id >= #{lbound}
271
+ AND s.id < #{ubound}]
272
+ end
273
+
274
+ etl.after_etl do |etl|
275
+ etl.query %[
276
+ UPDATE etl_destination
277
+ SET name = CONCAT("SUPER ", name)
278
+ WHERE id <= 1]
279
+ end
280
+
281
+ etl.run
282
+
283
+ connection
284
+ .query("SELECT * FROM etl_destination ORDER BY id ASC")
285
+ .to_a
286
+ .should == [
287
+ {'id' => 1, 'name' => 'SUPER Jeff', 'amount' => 100},
288
+ {'id' => 2, 'name' => 'Ryan', 'amount' => 50},
289
+ {'id' => 3, 'name' => 'Jack', 'amount' => 75},
290
+ {'id' => 4, 'name' => 'Jeff', 'amount' => 10},
291
+ {'id' => 5, 'name' => 'Jack', 'amount' => 45},
292
+ {'id' => 7, 'name' => 'Nick', 'amount' => 90}]
293
+ end
294
+ end
295
+
296
+ describe '#run over part of table' do
297
+ let(:connection) { test_connection }
298
+ let(:etl) { described_class.new connection: connection, logger: logger }
299
+
300
+ before { reset_test_env connection }
301
+ after { connection.close }
302
+
303
+ it "executes the specified sql in the appropriate order and ETLs properly" do
304
+ etl.ensure_destination do |etl|
305
+ etl.query %[
306
+ CREATE TABLE etl_destination (
307
+ id INT NOT NULL
308
+ , name VARCHAR(10)
309
+ , amount INT(11) DEFAULT 0
310
+ , PRIMARY KEY (id))]
311
+ end
312
+
313
+ etl.before_etl do |etl|
314
+ etl.query "DELETE FROM etl_source WHERE amount < 0"
315
+ end
316
+
317
+ etl.start do
318
+ 4
319
+ end
320
+
321
+ etl.step do
322
+ 1
323
+ end
324
+
325
+ etl.stop do |etl|
326
+ etl.query(
327
+ "SELECT MAX(id) AS the_stop FROM etl_source"
328
+ ).to_a.first['the_stop']
329
+ end
330
+
331
+ etl.etl do |etl, lbound, ubound|
332
+ etl.query %[
333
+ REPLACE INTO etl_destination
334
+ SELECT id, name, amount FROM etl_source s
335
+ WHERE s.id >= #{lbound}
336
+ AND s.id < #{ubound}]
337
+ end
338
+
339
+ etl.run
340
+
341
+ connection
342
+ .query("SELECT * FROM etl_destination ORDER BY id ASC")
343
+ .to_a.should == [
344
+ {'id' => 4, 'name' => 'Jeff', 'amount' => 10},
345
+ {'id' => 5, 'name' => 'Jack', 'amount' => 45},
346
+ {'id' => 7, 'name' => 'Nick', 'amount' => 90}]
347
+ end
348
+ end
349
+
350
+ describe "#run over gappy data" do
351
+ let(:connection) { test_connection }
352
+ let(:etl) { described_class.new connection: connection, logger: logger }
353
+
354
+ before do
355
+ reset_test_env(connection) do |connection|
356
+ connection.query %[
357
+ CREATE TABLE etl_source (
358
+ id INT NOT NULL
359
+ , name VARCHAR(10)
360
+ , amount INT(11) DEFAULT 0
361
+ , PRIMARY KEY (id))]
362
+
363
+ connection.query %[
364
+ INSERT INTO etl_source (id, name, amount)
365
+ VALUES
366
+ (1, 'Jeff', 100),
367
+ (2, 'Ryan', 50),
368
+ (13, 'Jack', 75),
369
+ (14, 'Jeff', 10),
370
+ (15, 'Jack', 45),
371
+ (16, 'Nick', -90),
372
+ (17, 'Nick', 90)]
373
+ end
374
+ end
375
+
376
+ after { connection.close }
377
+
378
+ it "executes the specified sql in the appropriate order without getting stuck" do
379
+ etl.ensure_destination do |etl|
380
+ etl.query %[
381
+ CREATE TABLE etl_destination (
382
+ id INT NOT NULL
383
+ , name VARCHAR(10)
384
+ , amount INT(11) DEFAULT 0
385
+ , PRIMARY KEY (id))]
386
+ end
387
+
388
+ etl.before_etl do |etl|
389
+ etl.query "DELETE FROM etl_source WHERE amount < 0"
390
+ end
391
+
392
+ etl.start do |etl|
393
+ 1
394
+ end
395
+
396
+ etl.step do
397
+ 1
398
+ end
399
+
400
+ etl.stop do |etl|
401
+ etl.query(
402
+ "SELECT MAX(id) AS the_stop FROM etl_source"
403
+ ).to_a.first['the_stop']
404
+ end
405
+
406
+ etl.etl do |etl, lbound, ubound|
407
+ etl.query %[
408
+ REPLACE INTO etl_destination
409
+ SELECT
410
+ id
411
+ , name
412
+ , amount
413
+ FROM etl_source s
414
+ WHERE s.id >= #{lbound}
415
+ AND s.id < #{ubound}]
416
+ end
417
+
418
+ etl.run
419
+
420
+ connection
421
+ .query("SELECT * FROM etl_destination ORDER BY id ASC")
422
+ .to_a
423
+ .should == [
424
+ {'id' => 1, 'name' => 'Jeff', 'amount' => 100},
425
+ {'id' => 2, 'name' => 'Ryan', 'amount' => 50},
426
+ {'id' => 13, 'name' => 'Jack', 'amount' => 75},
427
+ {'id' => 14, 'name' => 'Jeff', 'amount' => 10},
428
+ {'id' => 15, 'name' => 'Jack', 'amount' => 45},
429
+ {'id' => 17, 'name' => 'Nick', 'amount' => 90}]
430
+ end
431
+ end
432
+
433
+ describe "#run over date data" do
434
+ let(:connection) { test_connection }
435
+ let(:etl) { described_class.new connection: connection, logger: logger }
436
+
437
+ before do
438
+ reset_test_env(connection) do |connection|
439
+ connection.query %[
440
+ CREATE TABLE etl_source (
441
+ the_date DATE NOT NULL
442
+ , name VARCHAR(10)
443
+ , amount INT(11) DEFAULT 0)]
444
+
445
+ connection.query %[
446
+ INSERT INTO etl_source (the_date, name, amount)
447
+ VALUES
448
+ ('2012-01-01', 'Jeff', 100),
449
+ ('2012-01-01', 'Ryan', 50),
450
+ ('2012-01-01', 'Jack', 75),
451
+ ('2012-01-01', 'Jeff', 10),
452
+ ('2012-01-02', 'Jack', 45),
453
+ ('2012-01-02', 'Nick', -90),
454
+ ('2012-01-02', 'Nick', 90)]
455
+ end
456
+ end
457
+
458
+ after { connection.close }
459
+
460
+ it "executes the specified sql in the appropriate order and ETLs properly" do
461
+ etl.ensure_destination do |etl|
462
+ etl.query %[
463
+ CREATE TABLE etl_destination (
464
+ the_date DATE NOT NULL
465
+ , name VARCHAR(10)
466
+ , total_amount INT(11) DEFAULT 0
467
+ , PRIMARY KEY (the_date, name))]
468
+ end
469
+
470
+ etl.before_etl do |etl|
471
+ etl.query "DELETE FROM etl_source WHERE amount < 0"
472
+ end
473
+
474
+ etl.start do |etl|
475
+ etl.query(%[
476
+ SELECT COALESCE(MAX(the_date), DATE('2012-01-01')) AS the_start
477
+ FROM etl_destination
478
+ ]).to_a.first['the_start']
479
+ end
480
+
481
+ etl.step do
482
+ 1.day
483
+ end
484
+
485
+ etl.stop do |etl|
486
+ etl.query(
487
+ "SELECT MAX(the_date) AS the_stop FROM etl_source"
488
+ ).to_a.first['the_stop']
489
+ end
490
+
491
+ etl.etl do |etl, lbound, ubound|
492
+ etl.query %[
493
+ REPLACE INTO etl_destination
494
+ SELECT
495
+ the_date
496
+ , name
497
+ , SUM(amount) AS total_amount
498
+ FROM etl_source s
499
+ WHERE s.the_date >= '#{lbound}'
500
+ AND s.the_date < '#{ubound}'
501
+ GROUP BY
502
+ the_date
503
+ , name]
504
+ end
505
+
506
+ etl.run
507
+
508
+ connection
509
+ .query(%[
510
+ SELECT
511
+ the_date
512
+ , name
513
+ , total_amount
514
+ FROM
515
+ etl_destination
516
+ ORDER BY
517
+ the_date ASC
518
+ , name ASC
519
+ ]).to_a
520
+ .should == [
521
+ {'the_date' => Date.parse('2012-01-01'), 'name' => 'Jack', 'total_amount' => 75},
522
+ {'the_date' => Date.parse('2012-01-01'), 'name' => 'Jeff', 'total_amount' => 110},
523
+ {'the_date' => Date.parse('2012-01-01'), 'name' => 'Ryan', 'total_amount' => 50},
524
+ {'the_date' => Date.parse('2012-01-02'), 'name' => 'Jack', 'total_amount' => 45},
525
+ {'the_date' => Date.parse('2012-01-02'), 'name' => 'Nick', 'total_amount' => 90}]
526
+ end
527
+ end
528
+
529
+ describe "#run over datetime data" do
530
+ let(:connection) { test_connection }
531
+ let(:etl) { described_class.new connection: connection, logger: logger }
532
+
533
+ before do
534
+ reset_test_env(connection) do |connection|
535
+ connection.query %[
536
+ CREATE TABLE etl_source (
537
+ the_datetime DATETIME NOT NULL
538
+ , name VARCHAR(10)
539
+ , amount INT(11) DEFAULT 0)]
540
+
541
+ connection.query %[
542
+ INSERT INTO etl_source (the_datetime, name, amount)
543
+ VALUES
544
+ ('2011-12-31 23:59:59', 'Jeff', 100),
545
+ ('2012-01-01 00:01:00', 'Ryan', 50),
546
+ ('2012-01-01 00:01:01', 'Jack', 75),
547
+ ('2012-01-01 00:01:02', 'Jeff', 10),
548
+ ('2012-01-02 00:02:00', 'Jack', 45),
549
+ ('2012-01-02 00:02:01', 'Nick', -90),
550
+ ('2012-01-02 00:02:02', 'Nick', 90)]
551
+ end
552
+ end
553
+
554
+ after { connection.close }
555
+
556
+ it "executes the specified sql in the appropriate order and ETLs properly" do
557
+ etl.ensure_destination do |etl|
558
+ etl.query %[
559
+ CREATE TABLE etl_destination (
560
+ the_datetime DATETIME NOT NULL
561
+ , name VARCHAR(10)
562
+ , amount INT(11) DEFAULT 0
563
+ , PRIMARY KEY (the_datetime, name))]
564
+ end
565
+
566
+ etl.before_etl do |etl|
567
+ etl.query "DELETE FROM etl_source WHERE amount < 0"
568
+ end
569
+
570
+ etl.start do |etl|
571
+ etl.query(%[
572
+ SELECT CAST(COALESCE(MAX(the_datetime), '2012-01-01 00:00:00') AS DATETIME) AS the_start
573
+ FROM etl_destination
574
+ ]).to_a.first['the_start']
575
+ end
576
+
577
+ etl.step do
578
+ 1.minute
579
+ end
580
+
581
+ etl.stop do |etl|
582
+ etl.query(
583
+ "SELECT MAX(the_datetime) AS the_stop FROM etl_source"
584
+ ).to_a.first['the_stop']
585
+ end
586
+
587
+ etl.etl do |etl, lbound, ubound|
588
+ etl.query %[
589
+ REPLACE INTO etl_destination
590
+ SELECT
591
+ the_datetime
592
+ , name
593
+ , amount
594
+ FROM etl_source s
595
+ WHERE s.the_datetime >= '#{lbound}'
596
+ AND s.the_datetime < '#{ubound}']
597
+ end
598
+
599
+ etl.run
600
+
601
+ connection
602
+ .query(%[
603
+ SELECT
604
+ the_datetime
605
+ , name
606
+ , amount
607
+ FROM
608
+ etl_destination
609
+ ORDER BY
610
+ the_datetime ASC
611
+ , name ASC
612
+ ]).to_a
613
+ .should == [
614
+ {'the_datetime' => Time.parse('2012-01-01 00:01:00'), 'name' => 'Ryan', 'amount' => 50},
615
+ {'the_datetime' => Time.parse('2012-01-01 00:01:01'), 'name' => 'Jack', 'amount' => 75},
616
+ {'the_datetime' => Time.parse('2012-01-01 00:01:02'), 'name' => 'Jeff', 'amount' => 10},
617
+ {'the_datetime' => Time.parse('2012-01-02 00:02:00'), 'name' => 'Jack', 'amount' => 45},
618
+ {'the_datetime' => Time.parse('2012-01-02 00:02:02'), 'name' => 'Nick', 'amount' => 90}]
619
+ end
620
+ end
621
+ end
622
+ end