ETL 0.0.1 → 1.0.0.rc
This diff compares the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
- data/.gitignore +1 -0
- data/.rspec +2 -0
- data/.travis.yml +3 -0
- data/CONTRIBUTING.md +14 -0
- data/Gemfile +1 -1
- data/LICENSE +10 -19
- data/README.md +364 -8
- data/Rakefile +26 -0
- data/etl.gemspec +24 -0
- data/lib/etl.rb +195 -0
- data/lib/etl/helpers.rb +57 -0
- data/lib/etl/version.rb +3 -0
- data/spec/etl_spec.rb +622 -0
- metadata +101 -13
- data/ETL.gemspec +0 -17
- data/lib/ETL.rb +0 -5
- data/lib/ETL/version.rb +0 -3
data/Rakefile
CHANGED
@@ -1,2 +1,28 @@
 #!/usr/bin/env rake
 require "bundler/gem_tasks"
+begin
+  require 'rspec/core/rake_task'
+
+  RSpec::Core::RakeTask.new(:spec) do |t|
+    t.rspec_opts = '-b'
+  end
+
+  task default: :spec
+rescue LoadError
+  $stderr.puts "rspec not available, spec task not provided"
+end
+
+begin
+  require 'cane/rake_task'
+
+  desc "Run cane to check quality metrics"
+  Cane::RakeTask.new(:quality) do |cane|
+    cane.abc_max = 10
+    cane.style_glob = "lib/**/*.rb"
+    cane.no_doc = true
+  end
+
+  task :default => :quality
+rescue LoadError
+  warn "cane not available, quality task not provided."
+end
data/etl.gemspec
ADDED
@@ -0,0 +1,24 @@
+# -*- encoding: utf-8 -*-
+require File.expand_path('../lib/etl/version', __FILE__)
+
+Gem::Specification.new do |gem|
+  gem.authors       = ["Jeff Iacono"]
+  gem.email         = ["iacono@squareup.com"]
+  gem.description   = %q{Extract, Transform, and Load (ETL) ruby wrapper}
+  gem.summary       = %q{Extract, Transform, and Load (ETL) ruby wrapper. Supports basic and iterative ETL operations.}
+  gem.homepage      = "https://github.com/square/ETL"
+
+  gem.files         = `git ls-files`.split($\)
+  gem.executables   = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
+  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
+  gem.name          = "ETL"
+  gem.require_paths = ["lib"]
+  gem.version       = ETL::VERSION
+
+  gem.add_runtime_dependency "activesupport", [">= 3.2.3"]
+
+  gem.add_development_dependency "rake"
+  gem.add_development_dependency "cane"
+  gem.add_development_dependency "mysql2"
+  gem.add_development_dependency "rspec", [">= 2"]
+end
data/lib/etl.rb
ADDED
@@ -0,0 +1,195 @@
+require 'etl/version'
+require 'etl/helpers'
+require 'logger'
+require 'date'
+require 'time'
+
+class ETL
+  include Helpers
+
+  attr_accessor :description
+  attr_accessor :connection
+  attr_reader :logger
+
+  ORDERED_ETL_OPERATIONS = [
+    :ensure_destination,
+    :before_etl,
+    :etl,
+    :after_etl
+  ]
+
+  ITERATOR_OPERATIONS = [
+    :start,
+    :step,
+    :stop
+  ]
+
+  def initialize attributes = {}
+    attributes.keys.uniq.each do |attribute|
+      self.send "#{attribute}=", attributes[attribute]
+    end
+    default_logger! unless attributes.keys.include?(:logger)
+  end
+
+  def config &block
+    yield self if block_given?
+    self
+  end
+
+  def logger= logger
+    @logger = logger
+  end
+
+  # A little metaprogramming to consolidate the generation of our sql
+  # generating / querying methods. Note that we don't metaprogram the etl
+  # operation as it's a little more complex.
+  #
+  # This will produce methods of the form:
+  #
+  #   def [name] *args, &block
+  #     if block_given?
+  #       @[name] = block
+  #     else
+  #       @[name].call self, *args if @[name]
+  #     end
+  #   end
+  #
+  # for any given method name included in the array
+  (ORDERED_ETL_OPERATIONS - [:etl]).each do |method|
+    define_method method do |*args, &block|
+      if block
+        instance_variable_set("@#{method}", block)
+      else
+        instance_variable_get("@#{method}").
+          call(self, *args) if instance_variable_get("@#{method}")
+      end
+    end
+  end
+
+  def etl *args, &block
+    if block_given?
+      @etl = block
+    else
+      if iterate?
+        if @etl
+          current = start
+          @etl.call self, cast(current), cast(current += step) while stop >= current
+        end
+      else
+        @etl.call self, *args if @etl
+      end
+    end
+  end
+
+  # A little more metaprogramming to consolidate the generation of
+  # our sql generating / querying methods.
+  #
+  # This will produce methods of the form:
+  #
+  #   def [method] *args, &block
+  #     if block
+  #       @_[method]_block = block
+  #     else
+  #       # cache block's result
+  #       if defined? @[method]
+  #         @[method]
+  #       else
+  #         @[method] = @_[method]_block.call(self, *args)
+  #       end
+  #     end
+  #   end
+  #
+  # for any given method name included in the array
+  ITERATOR_OPERATIONS.each do |method|
+    define_method method do |*args, &block|
+      if block
+        instance_variable_set("@_#{method}_block", block)
+      else
+        if instance_variable_defined?("@#{method}")
+          instance_variable_get("@#{method}")
+        else
+          instance_variable_set("@#{method}",
+            instance_variable_get("@_#{method}_block")
+              .call(self, *args))
+        end
+      end
+    end
+  end
+
+  def run options = {}
+    (ORDERED_ETL_OPERATIONS - [*options[:except]]).each do |method|
+      send method
+    end
+  end
+
+  def query sql
+    time_and_log(sql: sql) do
+      connection.query sql
+    end
+  end
+
+  private
+
+  def iterate?
+    ITERATOR_OPERATIONS.all? do |method|
+      instance_variable_defined?("@_#{method}_block")
+    end
+  end
+
+  def default_logger!
+    @logger = default_logger
+  end
+
+  def logger?
+    !!@logger
+  end
+
+  def info data = {}
+    logger.info data.merge(emitter: self) if logger?
+  end
+
+  def debug data = {}
+    logger.debug data.merge(emitter: self) if logger?
+  end
+
+  def default_logger
+    ::Logger.new(STDOUT).tap do |logger|
+      logger.formatter = proc do |severity, datetime, progname, msg|
+        lead = "[#{datetime}] #{severity} #{msg[:event_type]}"
+        desc = "\"#{msg[:emitter].description || 'no description given'}\""
+        desc += " (object #{msg[:emitter].object_id})"
+
+        case msg[:event_type]
+        when :query_start
+          "#{lead} for #{desc}\n#{msg[:sql]}\n"
+        when :query_complete
+          "#{lead} for #{desc} runtime: #{msg[:runtime]}s\n"
+        else
+          "#{msg}"
+        end
+      end
+    end
+  end
+
+  def time_and_log data = {}, &block
+    start_runtime = Time.now
+    debug data.merge(event_type: :query_start)
+    retval = yield
+    info data.merge(event_type: :query_complete,
+                    runtime: Time.now - start_runtime)
+    retval
+  end
+
+  # NOTE: If you need to handle more data type casting you can add a
+  # case statement. If you need to be able to handle entirely different sets
+  # of casting depending on database engine, you can modify #cast to take a
+  # "type" arg and then determine which caster to route the arg through
+  def cast arg
+    case arg
+    when Date then arg.strftime("%Y-%m-%d")
+    when Time then arg.strftime("%Y-%m-%d %H:%M:%S")
+    else
+      arg
+    end
+  end
+end
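The class above is block-driven: #run walks ORDERED_ETL_OPERATIONS in order, and once all three of start, step, and stop are registered, the etl block is invoked repeatedly with [cast(current), cast(current + step)) bounds until current passes stop. A minimal, non-iterative usage sketch follows (not part of the package; the table and column names are hypothetical, and a Mysql2-style connection responding to #query is assumed):

    require 'mysql2'
    require 'etl'

    # Hypothetical connection details, for illustration only.
    etl = ETL.new description: 'nightly totals',
                  connection: Mysql2::Client.new(host: 'localhost',
                                                 username: 'root',
                                                 database: 'etl_test')

    etl.ensure_destination do |etl|
      # Hypothetical destination table.
      etl.query %[
        CREATE TABLE IF NOT EXISTS etl_totals (
          name VARCHAR(10)
          , total_amount INT(11) DEFAULT 0
          , PRIMARY KEY (name))]
    end

    etl.etl do |etl|
      etl.query %[
        REPLACE INTO etl_totals
        SELECT name, SUM(amount) FROM etl_source
        GROUP BY name]
    end

    etl.run  # ensure_destination -> before_etl -> etl -> after_etl

The spec file further down exercises both this basic form and the iterative start/step/stop form against a live MySQL instance.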
data/lib/etl/helpers.rb
ADDED
@@ -0,0 +1,57 @@
+class ETL
+  module Helpers
+    # max_for returns the max value for the passed in column as found in the
+    # specified database.table. If there is not currently a max, we use COALESCE
+    # and a default value. You can specify a :default_floor value or the method
+    # will try to derive it for you.
+    #
+    # Note: we try to detect if we want a date return type via the #date?
+    # check.
+    #
+    # If this is found we wrap the whole SELECT clause in a DATE so it is cast
+    # accordingly.
+    def max_for options = {}
+      database = options[:database]
+      table    = options[:table]
+      column   = options[:column]
+
+      default_value = options[:default_floor] ||
+                        default_floor_for(column)
+
+      if date? default_value
+        default_value = "DATE('#{default_value}')"
+        caster = ->(str) { "DATE(#{str})" }
+      end
+
+      max_sql_clause = "COALESCE(MAX(#{table}.#{column}), #{default_value})"
+      max_sql_clause = caster.(max_sql_clause) if caster
+
+      sql = <<-EOS
+        SELECT #{max_sql_clause} AS the_max
+        FROM #{database}.#{table}
+      EOS
+      sql += " WHERE #{options[:conditions]}" if options[:conditions]
+
+      query(sql).to_a.first['the_max']
+    end
+
+    private
+
+    def date? val
+      val =~ /^\d{4}-\d{1,2}-\d{1,2}( \d{2}:\d{2}:\d{2}( ((-|\+)\d+)| UTC)?)?$/
+    end
+
+    def default_floor_for column
+      case column
+      when /_at$/
+        return '1970-01-01'
+      when /_date$/
+        return '1970-01-01'
+      when /(^id$|_id$)/
+        return 0
+      else
+        raise ArgumentError, "could not determine a default for #{column}"
+      end
+    end
+  end
+end
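In practice max_for supplies the floor for an iterative run's start block. A quick orientation sketch (again not part of the package; the column name is illustrative) — per default_floor_for above, a *_at or *_date column falls back to '1970-01-01' and an id column to 0 when the table is empty:

    # Illustrative call; :conditions is optional and appended as a WHERE clause.
    floor = etl.max_for database: :etl_test,
                        table: :etl_source,
                        column: :updated_at,
                        conditions: "name IS NOT NULL"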
data/lib/etl/version.rb
ADDED
data/spec/etl_spec.rb
ADDED
@@ -0,0 +1,622 @@
+require 'mysql2'
+require 'active_support/time'
+require 'etl'
+
+def test_connection
+  Mysql2::Client.new host: 'localhost', username: 'root', database: 'etl_test'
+end
+
+def reset_test_env connection, &block
+  connection.query %[DROP DATABASE IF EXISTS etl_test]
+  connection.query %[CREATE DATABASE etl_test]
+  connection.query %[USE etl_test]
+
+  if block_given?
+    yield connection
+  else
+    connection.query %[
+      CREATE TABLE etl_source (
+        id INT NOT NULL
+        , name VARCHAR(10)
+        , amount INT(11) DEFAULT 0
+        , PRIMARY KEY (id))]
+
+    connection.query %[
+      INSERT INTO etl_test.etl_source (id, name, amount)
+      VALUES
+      (1, 'Jeff', 100),
+      (2, 'Ryan', 50),
+      (3, 'Jack', 75),
+      (4, 'Jeff', 10),
+      (5, 'Jack', 45),
+      (6, 'Nick', -90),
+      (7, 'Nick', 90)
+    ]
+  end
+end
+
+describe ETL do
+  let(:logger) { nil }
+
+  describe "#logger=" do
+    let(:etl) { described_class.new connection: stub }
+
+    it 'assigns' do
+      logger = stub
+      etl.logger = logger
+      etl.logger.should == logger
+    end
+  end
+
+  describe '#max_for' do
+    let(:connection) { test_connection }
+    let(:etl) { described_class.new connection: connection, logger: logger }
+
+    before do
+      client = Mysql2::Client.new host: 'localhost', username: 'root'
+      client.query %[DROP DATABASE IF EXISTS etl_test]
+      client.query %[CREATE DATABASE etl_test]
+      client.query %[USE etl_test]
+      client.query %[
+        CREATE TABLE IF NOT EXISTS etl_source (
+          id INT(11) NOT NULL AUTO_INCREMENT
+          , name VARCHAR(10)
+          , amount INT(11) DEFAULT 0
+          , the_date DATE DEFAULT NULL
+          , the_null_date DATE DEFAULT NULL
+          , the_time_at DATETIME DEFAULT NULL
+          , the_null_time_at DATETIME DEFAULT NULL
+          , PRIMARY KEY (id))]
+
+      client.query %[
+        INSERT INTO etl_source (
+          name
+          , amount
+          , the_date
+          , the_null_date
+          , the_time_at
+          , the_null_time_at
+        ) VALUES
+        ('Jeff', 100, '2012-01-02', NULL, '2012-01-02 00:00:01', NULL)
+        , ('Ryan', 50, '2012-01-01', NULL, '2012-01-01 00:00:00', NULL)
+        , ('Jack', 75, '2012-01-01', NULL, '2012-01-01 00:00:00', NULL)
+        , ('Jeff', 10, '2012-01-01', NULL, '2012-01-01 00:00:00', NULL)
+        , ('Jack', 45, '2012-01-01', NULL, '2012-01-01 00:00:00', NULL)
+        , ('Nick', -90, '2012-01-01', NULL, '2012-01-01 00:00:00', NULL)
+        , ('Nick', 90, '2012-01-01', NULL, '2012-01-01 00:00:00', NULL)]
+
+      client.close
+    end
+
+    after { connection.close }
+
+    it "finds the max for dates" do
+      etl.max_for(database: :etl_test,
+                  table: :etl_source,
+                  column: :the_date).should == Date.parse('2012-01-02')
+    end
+
+    it "defaults to the beginning of time date when a max date cannot be found" do
+      etl.max_for(database: :etl_test,
+                  table: :etl_source,
+                  column: :the_null_date).should == Date.parse('1970-01-01')
+    end
+
+    it "defaults to the specified default floor when a max date cannot be found" do
+      etl.max_for(database: :etl_test,
+                  table: :etl_source,
+                  column: :the_null_date,
+                  default_floor: '2011-01-01').should == Date.parse('2011-01-01')
+    end
+
+    it "finds the max for datetimes" do
+      etl.max_for(database: :etl_test,
+                  table: :etl_source,
+                  column: :the_time_at).should == Date.parse('2012-01-02')
+    end
+
+    it "defaults to the beginning of time when a max datetime cannot be found" do
+      etl.max_for(database: :etl_test,
+                  table: :etl_source,
+                  column: :the_null_time_at).should == Date.parse('1970-01-01 00:00:00')
+    end
+
+    it "defaults to the specified default floor when a max datetime cannot be found" do
+      etl.max_for(database: :etl_test,
+                  table: :etl_source,
+                  column: :the_null_time_at,
+                  default_floor: '2011-01-01 00:00:00').should == Date.parse('2011-01-01 00:00:00')
+    end
+
+    it "raises an error if a non-standard column is supplied with no default floor" do
+      expect {
+        etl.max_for database: :etl_test,
+                    table: :etl_source,
+                    column: :amount
+      }.to raise_exception
+    end
+
+    it "finds the max for a non-standard column, using the default floor" do
+      etl.max_for(database: :etl_test,
+                  table: :etl_source,
+                  column: :amount,
+                  default_floor: 0).should == 100
+    end
+  end
+
+  describe '#run' do
+    let(:connection) { test_connection }
+    let(:etl) { described_class.new connection: connection, logger: logger }
+
+    before do
+      client = Mysql2::Client.new host: 'localhost', username: 'root'
+      client.query %[DROP DATABASE IF EXISTS etl_test]
+      client.query %[CREATE DATABASE etl_test]
+      client.query %[USE etl_test]
+      client.query %[
+        CREATE TABLE IF NOT EXISTS etl_source (
+          id INT(11) NOT NULL AUTO_INCREMENT
+          , name VARCHAR(10)
+          , amount INT(11) DEFAULT 0
+          , PRIMARY KEY (id))]
+
+      client.query %[
+        INSERT INTO etl_source (name, amount)
+        VALUES
+        ('Jeff', 100),
+        ('Ryan', 50),
+        ('Jack', 75),
+        ('Jeff', 10),
+        ('Jack', 45),
+        ('Nick', -90),
+        ('Nick', 90)]
+
+      client.close
+    end
+
+    it "executes the specified sql in the appropriate order" do
+      etl.ensure_destination do |etl|
+        etl.query %[
+          CREATE TABLE IF NOT EXISTS etl_destination (
+            name VARCHAR(10)
+            , total_amount INT(11) DEFAULT 0
+            , PRIMARY KEY (name))]
+      end
+
+      etl.before_etl do |etl|
+        etl.query "DELETE FROM etl_source WHERE amount < 0"
+      end
+
+      etl.etl do |etl|
+        etl.query %[
+          REPLACE INTO etl_destination
+          SELECT name, SUM(amount) FROM etl_source
+          GROUP BY name]
+      end
+
+      etl.after_etl do |etl|
+        etl.query %[
+          UPDATE etl_destination
+          SET name = CONCAT("SUPER ", name)
+          WHERE total_amount > 115]
+      end
+
+      etl.run
+
+      connection
+        .query("SELECT * FROM etl_destination ORDER BY total_amount DESC")
+        .to_a
+        .should == [
+          {'name' => 'SUPER Jack', 'total_amount' => 120},
+          {'name' => 'Jeff', 'total_amount' => 110},
+          {'name' => 'Nick', 'total_amount' => 90},
+          {'name' => 'Ryan', 'total_amount' => 50}]
+    end
+  end
+
+  describe '#run with operations specified for exclusion' do
+    let(:connection) { stub }
+    let(:etl) { described_class.new connection: connection, logger: logger }
+
+    it "does not call the specified method" do
+      etl.ensure_destination {}
+      etl.should_not_receive(:ensure_destination)
+      etl.run except: :ensure_destination
+    end
+  end
+
+  context "with iteration" do
+    describe '#run over full table' do
+      let(:connection) { test_connection }
+      let(:etl) { described_class.new connection: connection, logger: logger }
+
+      before { reset_test_env connection }
+      after { connection.close }
+
+      it "executes the specified sql in the appropriate order and ETLs properly" do
+        etl.ensure_destination do |etl|
+          etl.query %[
+            CREATE TABLE etl_destination (
+              id INT NOT NULL
+              , name VARCHAR(10)
+              , amount INT(11) DEFAULT 0
+              , PRIMARY KEY (id))]
+        end
+
+        etl.before_etl do |etl|
+          etl.query "DELETE FROM etl_source WHERE amount < 0"
+        end
+
+        etl.start do |etl|
+          etl.query(
+            "SELECT COALESCE(MAX(id), 0) AS the_start FROM etl_destination"
+          ).to_a.first['the_start']
+        end
+
+        etl.step do
+          1
+        end
+
+        etl.stop do |etl|
+          etl.query(
+            "SELECT MAX(id) AS the_stop FROM etl_source"
+          ).to_a.first['the_stop']
+        end
+
+        etl.etl do |etl, lbound, ubound|
+          etl.query %[
+            REPLACE INTO etl_destination
+            SELECT id, name, amount FROM etl_source s
+            WHERE s.id >= #{lbound}
+            AND s.id < #{ubound}]
+        end
+
+        etl.after_etl do |etl|
+          etl.query %[
+            UPDATE etl_destination
+            SET name = CONCAT("SUPER ", name)
+            WHERE id <= 1]
+        end
+
+        etl.run
+
+        connection
+          .query("SELECT * FROM etl_destination ORDER BY id ASC")
+          .to_a
+          .should == [
+            {'id' => 1, 'name' => 'SUPER Jeff', 'amount' => 100},
+            {'id' => 2, 'name' => 'Ryan', 'amount' => 50},
+            {'id' => 3, 'name' => 'Jack', 'amount' => 75},
+            {'id' => 4, 'name' => 'Jeff', 'amount' => 10},
+            {'id' => 5, 'name' => 'Jack', 'amount' => 45},
+            {'id' => 7, 'name' => 'Nick', 'amount' => 90}]
+      end
+    end
+
+    describe '#run over part of table' do
+      let(:connection) { test_connection }
+      let(:etl) { described_class.new connection: connection, logger: logger }
+
+      before { reset_test_env connection }
+      after { connection.close }
+
+      it "executes the specified sql in the appropriate order and ETLs properly" do
+        etl.ensure_destination do |etl|
+          etl.query %[
+            CREATE TABLE etl_destination (
+              id INT NOT NULL
+              , name VARCHAR(10)
+              , amount INT(11) DEFAULT 0
+              , PRIMARY KEY (id))]
+        end
+
+        etl.before_etl do |etl|
+          etl.query "DELETE FROM etl_source WHERE amount < 0"
+        end
+
+        etl.start do
+          4
+        end
+
+        etl.step do
+          1
+        end
+
+        etl.stop do |etl|
+          etl.query(
+            "SELECT MAX(id) AS the_stop FROM etl_source"
+          ).to_a.first['the_stop']
+        end
+
+        etl.etl do |etl, lbound, ubound|
+          etl.query %[
+            REPLACE INTO etl_destination
+            SELECT id, name, amount FROM etl_source s
+            WHERE s.id >= #{lbound}
+            AND s.id < #{ubound}]
+        end
+
+        etl.run
+
+        connection
+          .query("SELECT * FROM etl_destination ORDER BY id ASC")
+          .to_a.should == [
+            {'id' => 4, 'name' => 'Jeff', 'amount' => 10},
+            {'id' => 5, 'name' => 'Jack', 'amount' => 45},
+            {'id' => 7, 'name' => 'Nick', 'amount' => 90}]
+      end
+    end
+
+    describe "#run over gappy data" do
+      let(:connection) { test_connection }
+      let(:etl) { described_class.new connection: connection, logger: logger }
+
+      before do
+        reset_test_env(connection) do |connection|
+          connection.query %[
+            CREATE TABLE etl_source (
+              id INT NOT NULL
+              , name VARCHAR(10)
+              , amount INT(11) DEFAULT 0
+              , PRIMARY KEY (id))]
+
+          connection.query %[
+            INSERT INTO etl_source (id, name, amount)
+            VALUES
+            (1, 'Jeff', 100),
+            (2, 'Ryan', 50),
+            (13, 'Jack', 75),
+            (14, 'Jeff', 10),
+            (15, 'Jack', 45),
+            (16, 'Nick', -90),
+            (17, 'Nick', 90)]
+        end
+      end
+
+      after { connection.close }
+
+      it "executes the specified sql in the appropriate order without getting stuck" do
+        etl.ensure_destination do |etl|
+          etl.query %[
+            CREATE TABLE etl_destination (
+              id INT NOT NULL
+              , name VARCHAR(10)
+              , amount INT(11) DEFAULT 0
+              , PRIMARY KEY (id))]
+        end
+
+        etl.before_etl do |etl|
+          etl.query "DELETE FROM etl_source WHERE amount < 0"
+        end
+
+        etl.start do |etl|
+          1
+        end
+
+        etl.step do
+          1
+        end
+
+        etl.stop do |etl|
+          etl.query(
+            "SELECT MAX(id) AS the_stop FROM etl_source"
+          ).to_a.first['the_stop']
+        end
+
+        etl.etl do |etl, lbound, ubound|
+          etl.query %[
+            REPLACE INTO etl_destination
+            SELECT
+              id
+              , name
+              , amount
+            FROM etl_source s
+            WHERE s.id >= #{lbound}
+            AND s.id < #{ubound}]
+        end
+
+        etl.run
+
+        connection
+          .query("SELECT * FROM etl_destination ORDER BY id ASC")
+          .to_a
+          .should == [
+            {'id' => 1, 'name' => 'Jeff', 'amount' => 100},
+            {'id' => 2, 'name' => 'Ryan', 'amount' => 50},
+            {'id' => 13, 'name' => 'Jack', 'amount' => 75},
+            {'id' => 14, 'name' => 'Jeff', 'amount' => 10},
+            {'id' => 15, 'name' => 'Jack', 'amount' => 45},
+            {'id' => 17, 'name' => 'Nick', 'amount' => 90}]
+      end
+    end
+
+    describe "#run over date data" do
+      let(:connection) { test_connection }
+      let(:etl) { described_class.new connection: connection, logger: logger }
+
+      before do
+        reset_test_env(connection) do |connection|
+          connection.query %[
+            CREATE TABLE etl_source (
+              the_date DATE NOT NULL
+              , name VARCHAR(10)
+              , amount INT(11) DEFAULT 0)]
+
+          connection.query %[
+            INSERT INTO etl_source (the_date, name, amount)
+            VALUES
+            ('2012-01-01', 'Jeff', 100),
+            ('2012-01-01', 'Ryan', 50),
+            ('2012-01-01', 'Jack', 75),
+            ('2012-01-01', 'Jeff', 10),
+            ('2012-01-02', 'Jack', 45),
+            ('2012-01-02', 'Nick', -90),
+            ('2012-01-02', 'Nick', 90)]
+        end
+      end
+
+      after { connection.close }
+
+      it "executes the specified sql in the appropriate order and ETLs properly" do
+        etl.ensure_destination do |etl|
+          etl.query %[
+            CREATE TABLE etl_destination (
+              the_date DATE NOT NULL
+              , name VARCHAR(10)
+              , total_amount INT(11) DEFAULT 0
+              , PRIMARY KEY (the_date, name))]
+        end
+
+        etl.before_etl do |etl|
+          etl.query "DELETE FROM etl_source WHERE amount < 0"
+        end
+
+        etl.start do |etl|
+          etl.query(%[
+            SELECT COALESCE(MAX(the_date), DATE('2012-01-01')) AS the_start
+            FROM etl_destination
+          ]).to_a.first['the_start']
+        end
+
+        etl.step do
+          1.day
+        end
+
+        etl.stop do |etl|
+          etl.query(
+            "SELECT MAX(the_date) AS the_stop FROM etl_source"
+          ).to_a.first['the_stop']
+        end
+
+        etl.etl do |etl, lbound, ubound|
+          etl.query %[
+            REPLACE INTO etl_destination
+            SELECT
+              the_date
+              , name
+              , SUM(amount) AS total_amount
+            FROM etl_source s
+            WHERE s.the_date >= '#{lbound}'
+            AND s.the_date < '#{ubound}'
+            GROUP BY
+              the_date
+              , name]
+        end
+
+        etl.run
+
+        connection
+          .query(%[
+            SELECT
+              the_date
+              , name
+              , total_amount
+            FROM
+              etl_destination
+            ORDER BY
+              the_date ASC
+              , name ASC
+          ]).to_a
+          .should == [
+            {'the_date' => Date.parse('2012-01-01'), 'name' => 'Jack', 'total_amount' => 75},
+            {'the_date' => Date.parse('2012-01-01'), 'name' => 'Jeff', 'total_amount' => 110},
+            {'the_date' => Date.parse('2012-01-01'), 'name' => 'Ryan', 'total_amount' => 50},
+            {'the_date' => Date.parse('2012-01-02'), 'name' => 'Jack', 'total_amount' => 45},
+            {'the_date' => Date.parse('2012-01-02'), 'name' => 'Nick', 'total_amount' => 90}]
+      end
+    end
+
+    describe "#run over datetime data" do
+      let(:connection) { test_connection }
+      let(:etl) { described_class.new connection: connection, logger: logger }
+
+      before do
+        reset_test_env(connection) do |connection|
+          connection.query %[
+            CREATE TABLE etl_source (
+              the_datetime DATETIME NOT NULL
+              , name VARCHAR(10)
+              , amount INT(11) DEFAULT 0)]
+
+          connection.query %[
+            INSERT INTO etl_source (the_datetime, name, amount)
+            VALUES
+            ('2011-12-31 23:59:59', 'Jeff', 100),
+            ('2012-01-01 00:01:00', 'Ryan', 50),
+            ('2012-01-01 00:01:01', 'Jack', 75),
+            ('2012-01-01 00:01:02', 'Jeff', 10),
+            ('2012-01-02 00:02:00', 'Jack', 45),
+            ('2012-01-02 00:02:01', 'Nick', -90),
+            ('2012-01-02 00:02:02', 'Nick', 90)]
+        end
+      end
+
+      after { connection.close }
+
+      it "executes the specified sql in the appropriate order and ETLs properly" do
+        etl.ensure_destination do |etl|
+          etl.query %[
+            CREATE TABLE etl_destination (
+              the_datetime DATETIME NOT NULL
+              , name VARCHAR(10)
+              , amount INT(11) DEFAULT 0
+              , PRIMARY KEY (the_datetime, name))]
+        end
+
+        etl.before_etl do |etl|
+          etl.query "DELETE FROM etl_source WHERE amount < 0"
+        end
+
+        etl.start do |etl|
+          etl.query(%[
+            SELECT CAST(COALESCE(MAX(the_datetime), '2012-01-01 00:00:00') AS DATETIME) AS the_start
+            FROM etl_destination
+          ]).to_a.first['the_start']
+        end
+
+        etl.step do
+          1.minute
+        end
+
+        etl.stop do |etl|
+          etl.query(
+            "SELECT MAX(the_datetime) AS the_stop FROM etl_source"
+          ).to_a.first['the_stop']
+        end
+
+        etl.etl do |etl, lbound, ubound|
+          etl.query %[
+            REPLACE INTO etl_destination
+            SELECT
+              the_datetime
+              , name
+              , amount
+            FROM etl_source s
+            WHERE s.the_datetime >= '#{lbound}'
+            AND s.the_datetime < '#{ubound}']
+        end
+
+        etl.run
+
+        connection
+          .query(%[
+            SELECT
+              the_datetime
+              , name
+              , amount
+            FROM
+              etl_destination
+            ORDER BY
+              the_datetime ASC
+              , name ASC
+          ]).to_a
+          .should == [
+            {'the_datetime' => Time.parse('2012-01-01 00:01:00'), 'name' => 'Ryan', 'amount' => 50},
+            {'the_datetime' => Time.parse('2012-01-01 00:01:01'), 'name' => 'Jack', 'amount' => 75},
+            {'the_datetime' => Time.parse('2012-01-01 00:01:02'), 'name' => 'Jeff', 'amount' => 10},
+            {'the_datetime' => Time.parse('2012-01-02 00:02:00'), 'name' => 'Jack', 'amount' => 45},
+            {'the_datetime' => Time.parse('2012-01-02 00:02:02'), 'name' => 'Nick', 'amount' => 90}]
+      end
+    end
+  end
+end