ETL 0.0.1 → 1.0.0.rc
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +1 -0
- data/.rspec +2 -0
- data/.travis.yml +3 -0
- data/CONTRIBUTING.md +14 -0
- data/Gemfile +1 -1
- data/LICENSE +10 -19
- data/README.md +364 -8
- data/Rakefile +26 -0
- data/etl.gemspec +24 -0
- data/lib/etl.rb +195 -0
- data/lib/etl/helpers.rb +57 -0
- data/lib/etl/version.rb +3 -0
- data/spec/etl_spec.rb +622 -0
- metadata +101 -13
- data/ETL.gemspec +0 -17
- data/lib/ETL.rb +0 -5
- data/lib/ETL/version.rb +0 -3
data/.gitignore
CHANGED
data/.rspec
ADDED
data/.travis.yml
ADDED
data/CONTRIBUTING.md
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
Contributing
|
2
|
+
============
|
3
|
+
|
4
|
+
If you would like to contribute code to ETL you can do so through GitHub by
|
5
|
+
forking the repository and sending a pull request.
|
6
|
+
|
7
|
+
When submitting code, please make every effort to follow existing conventions
|
8
|
+
and style in order to keep the code as readable as possible.
|
9
|
+
|
10
|
+
Before your code can be accepted into the project you must also sign the
|
11
|
+
[Individual Contributor License Agreement (CLA)][1].
|
12
|
+
|
13
|
+
|
14
|
+
[1]: https://spreadsheets.google.com/spreadsheet/viewform?formkey=dDViT2xzUHAwRkI3X3k5Z0lQM091OGc6MQ&ndplr=1
|
data/Gemfile
CHANGED
data/LICENSE
CHANGED
@@ -1,22 +1,13 @@
|
|
1
|
-
Copyright
|
1
|
+
Copyright 2013 Square Inc.
|
2
2
|
|
3
|
-
|
3
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
you may not use this file except in compliance with the License.
|
5
|
+
You may obtain a copy of the License at
|
4
6
|
|
5
|
-
|
6
|
-
a copy of this software and associated documentation files (the
|
7
|
-
"Software"), to deal in the Software without restriction, including
|
8
|
-
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
-
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
-
permit persons to whom the Software is furnished to do so, subject to
|
11
|
-
the following conditions:
|
7
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
12
8
|
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
-
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
-
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
-
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
-
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
9
|
+
Unless required by applicable law or agreed to in writing, software
|
10
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
See the License for the specific language governing permissions and
|
13
|
+
limitations under the License.
|
data/README.md
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
# ETL
|
2
2
|
|
3
|
-
|
3
|
+
Extract, transform, and load data with Ruby!
|
4
4
|
|
5
5
|
## Installation
|
6
6
|
|
@@ -16,14 +16,370 @@ Or install it yourself as:
|
|
16
16
|
|
17
17
|
$ gem install ETL
|
18
18
|
|
19
|
-
##
|
19
|
+
## ETL Dependencies
|
20
20
|
|
21
|
-
|
21
|
+
ETL depends on having a database connection object that __must__ respond
|
22
|
+
to `#query`. The [mysql2](https://github.com/brianmario/mysql2) gem is a good option.
|
23
|
+
You can also proxy another library using Ruby's `SimpleDelegator` and add a `#query`
|
24
|
+
method if need be.
|
25
|
+
|
26
|
+
The gem comes bundled with a default logger. If you'd like to write your own
|
27
|
+
just make sure that it implements `#debug` and `#info`. For more information
|
28
|
+
on what is logged and when, view the [logger details](#logger-details).
|
29
|
+
|
30
|
+
### Basic ETL
|
31
|
+
|
32
|
+
Assume that we have a database connection represented by `connection`.
|
33
|
+
|
34
|
+
To run a basic ETL that is composed of sequential SQL statements, start by
|
35
|
+
creating a new ETL instance:
|
36
|
+
|
37
|
+
```ruby
|
38
|
+
etl = ETL.new(description: "a description of what this ETL does",
|
39
|
+
connection: connection)
|
40
|
+
```
|
41
|
+
which can then be configured:
|
42
|
+
|
43
|
+
```ruby
|
44
|
+
etl.config do |etl|
|
45
|
+
etl.ensure_destination do |etl|
|
46
|
+
# For most ETLs you may want to ensure that the destination exists, so the
|
47
|
+
# #ensure_destination block is ideally suited to fulfill this requirement.
|
48
|
+
#
|
49
|
+
# By way of example:
|
50
|
+
#
|
51
|
+
etl.query %[
|
52
|
+
CREATE TABLE IF NOT EXISTS some_database.some_destination_table (
|
53
|
+
user_id INT UNSIGNED NOT NULL,
|
54
|
+
created_date DATE NOT NULL,
|
55
|
+
total_amount INT SIGNED NOT NULL,
|
56
|
+
message VARCHAR(100) DEFAULT NULL,
|
57
|
+
PRIMARY KEY (user_id),
|
58
|
+
KEY (user_id, created_date),
|
59
|
+
KEY (created_date)
|
60
|
+
)]
|
61
|
+
end
|
62
|
+
|
63
|
+
etl.before_etl do |etl|
|
64
|
+
# All pre-ETL work is performed in this block.
|
65
|
+
#
|
66
|
+
# This can be thought of as a before-ETL hook that will fire only once. When
|
67
|
+
# you are not leveraging the ETL iteration capabilities, the value of this
|
68
|
+
# block vs the #etl block is not very clear. We will see how and when to
|
69
|
+
# leverage this block effectively when we introduce iteration.
|
70
|
+
#
|
71
|
+
# As an example, let's say we want to get rid of all entries that have an
|
72
|
+
# amount less than zero before moving on to our actual etl:
|
73
|
+
#
|
74
|
+
etl.query %[DELETE FROM some_database.some_source_table WHERE amount < 0]
|
75
|
+
end
|
76
|
+
|
77
|
+
etl.etl do |etl|
|
78
|
+
# Here is where the magic happens! This block contains the main ETL
|
79
|
+
# operation.
|
80
|
+
#
|
81
|
+
# For example:
|
82
|
+
#
|
83
|
+
etl.query %[
|
84
|
+
REPLACE INTO some_database.some_destination_table
|
85
|
+
SELECT
|
86
|
+
user_id
|
87
|
+
, DATE(created_at) AS created_date
|
88
|
+
, SUM(amount) AS total_amount
|
89
|
+
FROM
|
90
|
+
some_database.some_source_table sst
|
91
|
+
GROUP BY
|
92
|
+
sst.user_id
|
93
|
+
, DATE(sst.created_at)]
|
94
|
+
end
|
95
|
+
|
96
|
+
etl.after_etl do |etl|
|
97
|
+
# All post-ETL work is performed in this block.
|
98
|
+
#
|
99
|
+
# Again, to finish up with an example:
|
100
|
+
#
|
101
|
+
etl.query %[
|
102
|
+
UPDATE some_database.some_destination_table
|
103
|
+
SET message = "WOW"
|
104
|
+
WHERE total_amount > 100]
|
105
|
+
end
|
106
|
+
end
|
107
|
+
```
|
108
|
+
|
109
|
+
At this point it is possible to run the ETL instance via:
|
110
|
+
|
111
|
+
```ruby
|
112
|
+
etl.run
|
113
|
+
```
|
114
|
+
which executes `#ensure_destination`, `#before_etl`, `#etl`, and `#after_etl` in
|
115
|
+
that order.
|
116
|
+
|
117
|
+
### ETL with iteration
|
118
|
+
|
119
|
+
To add in iteration, simply supply `#start`, `#step`, and `#stop` blocks. This
|
120
|
+
is useful when dealing with large data sets or when executing queries that,
|
121
|
+
while optimized, are still slow.
|
122
|
+
|
123
|
+
Again, to kick things off:
|
124
|
+
|
125
|
+
```ruby
|
126
|
+
etl = ETL.new(description: "a description of what this ETL does",
|
127
|
+
connection: connection)
|
128
|
+
```
|
129
|
+
|
130
|
+
where `connection` is the same as described above.
|
131
|
+
|
132
|
+
Next we can configure the ETL:
|
133
|
+
|
134
|
+
```ruby
|
135
|
+
# assuming we have the ETL instance from above
|
136
|
+
etl.config do |etl|
|
137
|
+
etl.ensure_destination do |etl|
|
138
|
+
# For most ETLs you may want to ensure that the destination exists, so the
|
139
|
+
# #ensure_destination block is ideally suited to fulfill this requirement.
|
140
|
+
#
|
141
|
+
# By way of example:
|
142
|
+
#
|
143
|
+
etl.query %[
|
144
|
+
CREATE TABLE IF NOT EXISTS some_database.some_destination_table (
|
145
|
+
user_id INT UNSIGNED NOT NULL,
|
146
|
+
created_date DATE NOT NULL,
|
147
|
+
total_amount INT SIGNED NOT NULL,
|
148
|
+
message VARCHAR(100) DEFAULT NULL,
|
149
|
+
PRIMARY KEY (user_id),
|
150
|
+
KEY (user_id, created_date),
|
151
|
+
KEY (created_date)
|
152
|
+
)]
|
153
|
+
end
|
154
|
+
|
155
|
+
etl.before_etl do |etl|
|
156
|
+
# All pre-ETL work is performed in this block.
|
157
|
+
#
|
158
|
+
# Now that we are leveraging iteration the #before_etl block becomes
|
159
|
+
# more useful as a way to execute an operation once before we begin
|
160
|
+
# our iteration.
|
161
|
+
#
|
162
|
+
# As an example, let's say we want to get rid of all entries that have an
|
163
|
+
# amount less than zero before moving on to our actual etl:
|
164
|
+
#
|
165
|
+
etl.query %[
|
166
|
+
DELETE FROM some_database.some_source_table
|
167
|
+
WHERE amount < 0]
|
168
|
+
end
|
169
|
+
|
170
|
+
etl.start do |etl|
|
171
|
+
# This defines where the ETL should start. This can be a flat number
|
172
|
+
# or date, or even SQL / other code can be executed to produce a starting
|
173
|
+
# value.
|
174
|
+
#
|
175
|
+
# Usually, this is the last known entry for the destination table with
|
176
|
+
# some sensible default if the destination does not yet contain data.
|
177
|
+
#
|
178
|
+
# As an example:
|
179
|
+
#
|
180
|
+
res = etl.query %[
|
181
|
+
SELECT COALESCE(MAX(created_date), '1970-01-01') AS the_max
|
182
|
+
FROM some_database.some_destination_table]
|
183
|
+
|
184
|
+
res.to_a.first['the_max']
|
185
|
+
end
|
186
|
+
|
187
|
+
etl.step do |etl|
|
188
|
+
# The step block defines the size of the iteration block. To iterate by
|
189
|
+
# ten records, the step block should be set to return 10.
|
190
|
+
#
|
191
|
+
# As an alternative example, to set the iteration to go 10,000 units
|
192
|
+
# at a time, the following value should be provided:
|
193
|
+
#
|
194
|
+
# 10_000 (Note: an underscore is used for readability)
|
195
|
+
#
|
196
|
+
# As an example, to iterate 7 days at a time:
|
197
|
+
#
|
198
|
+
7.days
|
199
|
+
end
|
200
|
+
|
201
|
+
etl.stop do |etl|
|
202
|
+
# The stop block defines when the iteration should halt.
|
203
|
+
# Again, this can be a flat value or code. Either way, one value *must* be
|
204
|
+
# returned.
|
205
|
+
#
|
206
|
+
# As a flat value:
|
207
|
+
#
|
208
|
+
# 1_000_000
|
209
|
+
#
|
210
|
+
# Or a date value:
|
211
|
+
#
|
212
|
+
# Time.now.to_date
|
213
|
+
#
|
214
|
+
# Or as a code example:
|
215
|
+
#
|
216
|
+
res = etl.query %[
|
217
|
+
SELECT DATE(MAX(created_at)) AS the_max
|
218
|
+
FROM some_database.some_source_table]
|
219
|
+
|
220
|
+
res.to_a.first['the_max']
|
221
|
+
end
|
222
|
+
|
223
|
+
etl.etl do |etl, lbound, ubound|
|
224
|
+
# The etl block is the main part of the framework. Note: there are
|
225
|
+
# two extra args with the iterator this time around: "lbound" and "ubound"
|
226
|
+
#
|
227
|
+
# "lbound" is the lower bound of the current iteration. When iterating
|
228
|
+
# from 0 to 10 and stepping by 2, the lbound would equal 2 on the
|
229
|
+
# second iteration.
|
230
|
+
#
|
231
|
+
# "ubound" is the upper bound of the current iteration. In continuing with the
|
232
|
+
# example above, when iterating from 0 to 10 and stepping by 2, the ubound would
|
233
|
+
# equal 4 on the second iteration.
|
234
|
+
#
|
235
|
+
# These args can be used to "window" SQL queries or other code operations.
|
236
|
+
#
|
237
|
+
# As a first example, to iterate over a set of ids:
|
238
|
+
#
|
239
|
+
# etl.query %[
|
240
|
+
# REPLACE INTO some_database.some_destination_table
|
241
|
+
# SELECT
|
242
|
+
# user_id
|
243
|
+
# , SUM(amount) AS total_amount
|
244
|
+
# FROM
|
245
|
+
# some_database.some_source_table sst
|
246
|
+
# WHERE
|
247
|
+
# sst.user_id > #{lbound} AND sst.user_id <= #{ubound}
|
248
|
+
# GROUP BY
|
249
|
+
# sst.user_id]
|
250
|
+
#
|
251
|
+
# To "window" a SQL query using dates:
|
252
|
+
#
|
253
|
+
etl.query %[
|
254
|
+
REPLACE INTO some_database.some_destination_table
|
255
|
+
SELECT
|
256
|
+
DATE(created_at)
|
257
|
+
, SUM(amount) AS total_amount
|
258
|
+
FROM
|
259
|
+
some_database.some_source_table sst
|
260
|
+
WHERE
|
261
|
+
-- Note the usage of quotes surrounding the lbound and ubound vars.
|
262
|
+
-- This is required when dealing with dates / datetimes
|
263
|
+
sst.created_at >= '#{lbound}' AND sst.created_at < '#{ubound}'
|
264
|
+
GROUP BY
|
265
|
+
sst.user_id]
|
266
|
+
|
267
|
+
# Note that there is no sql sanitization here so there is *potential* for SQL
|
268
|
+
# injection. That being said you'll likely be using this gem in an internal
|
269
|
+
# tool so hopefully your co-workers are not looking to sabotage your ETL
|
270
|
+
# pipeline. Just be aware of this and handle it as you see fit.
|
271
|
+
end
|
272
|
+
|
273
|
+
etl.after_etl do |etl|
|
274
|
+
# All post-ETL work is performed in this block.
|
275
|
+
#
|
276
|
+
# Again, to finish up with an example:
|
277
|
+
#
|
278
|
+
etl.query %[
|
279
|
+
UPDATE some_database.some_destination_table
|
280
|
+
SET message = "WOW"
|
281
|
+
WHERE total_amount > 100]
|
282
|
+
end
|
283
|
+
end
|
284
|
+
```
|
285
|
+
|
286
|
+
At this point it is possible to run the ETL instance via:
|
287
|
+
|
288
|
+
```ruby
|
289
|
+
etl.run
|
290
|
+
```
|
291
|
+
which executes `#ensure_destination`, `#before_etl`, `#etl`, and `#after_etl` in
|
292
|
+
that order.
|
293
|
+
|
294
|
+
Note that `#etl` executes `#start` and `#stop` once and memoizes the result for
|
295
|
+
each. It then begins to iterate from what `#start` evaluated to up until what `#stop`
|
296
|
+
evaluated to by what `#step` evaluates to.
|
297
|
+
|
298
|
+
## Logger Details
|
299
|
+
|
300
|
+
A logger must support two methods: `#debug` and `#info`.
|
301
|
+
|
302
|
+
Both methods should accept a single hash argument. The argument will contain:
|
303
|
+
|
304
|
+
- `:emitter` => a reference to the ETL instance's `self`
|
305
|
+
- `:event_type` => a symbol that identifies the type of event being logged. You
|
306
|
+
can use this value to derive which other data you'll have available
|
307
|
+
|
308
|
+
When `:event_type` is equal to `:query_start`, you'll have the following
|
309
|
+
available in the hash argument:
|
310
|
+
|
311
|
+
- `:sql` => the sql that is going to be run
|
312
|
+
|
313
|
+
These events are logged at the debug level.
|
314
|
+
|
315
|
+
When `:event_type` is equal to `:query_complete`, you'll have the following
|
316
|
+
available in the hash argument:
|
317
|
+
|
318
|
+
- `:sql` => the sql that was run
|
319
|
+
- `:runtime` => how long the query took to execute
|
320
|
+
|
321
|
+
These events are logged at the info level.
|
322
|
+
|
323
|
+
Following from this you could implement a simple logger as:
|
324
|
+
|
325
|
+
```ruby
|
326
|
+
class PutsLogger
|
327
|
+
def info data
|
328
|
+
@data = data
|
329
|
+
write!
|
330
|
+
end
|
331
|
+
|
332
|
+
def debug data
|
333
|
+
@data = data
|
334
|
+
write!
|
335
|
+
end
|
336
|
+
|
337
|
+
private
|
338
|
+
|
339
|
+
def write!
|
340
|
+
case (event_type = @data.delete(:event_type))
|
341
|
+
when :query_start
|
342
|
+
output = "#{@data[:emitter].description} is about to run\n"
|
343
|
+
output += "#{@data[:sql]}\n"
|
344
|
+
when :query_complete
|
345
|
+
output = "#{@data[:emitter].description} executed:\n"
|
346
|
+
output += "#{@data[:sql]}\n"
|
347
|
+
output += "query completed at #{Time.now} and took #{@data[:runtime]}s\n"
|
348
|
+
else
|
349
|
+
output = "no special logging for #{event_type} event_type yet\n"
|
350
|
+
end
|
351
|
+
puts output
|
352
|
+
@data = nil
|
353
|
+
end
|
354
|
+
end
|
355
|
+
```
|
22
356
|
|
23
357
|
## Contributing
|
24
358
|
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
359
|
+
If you would like to contribute code to ETL you can do so through GitHub by
|
360
|
+
forking the repository and sending a pull request.
|
361
|
+
|
362
|
+
When submitting code, please make every effort to follow existing conventions
|
363
|
+
and style in order to keep the code as readable as possible.
|
364
|
+
|
365
|
+
Before your code can be accepted into the project you must also sign the
|
366
|
+
[Individual Contributor License Agreement (CLA)][1].
|
367
|
+
|
368
|
+
|
369
|
+
[1]: https://spreadsheets.google.com/spreadsheet/viewform?formkey=dDViT2xzUHAwRkI3X3k5Z0lQM091OGc6MQ&ndplr=1
|
370
|
+
|
371
|
+
## License
|
372
|
+
|
373
|
+
Copyright 2013 Square Inc.
|
374
|
+
|
375
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
376
|
+
you may not use this file except in compliance with the License.
|
377
|
+
You may obtain a copy of the License at
|
378
|
+
|
379
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
380
|
+
|
381
|
+
Unless required by applicable law or agreed to in writing, software
|
382
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
383
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
384
|
+
See the License for the specific language governing permissions and
|
385
|
+
limitations under the License.
|