ETL 1.0.0 → 1.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/README.md CHANGED
@@ -35,6 +35,16 @@ To run a basic ETL that is composed of sequential SQL statements, start by
35
35
  creating a new ETL instance:
36
36
 
37
37
  ```ruby
38
+ # setting connection at the class level
39
+ ETL.connection = connection
40
+
41
+ etl = ETL.new(description: "a description of what this ETL does")
42
+ ```
43
+
44
+ or
45
+
46
+ ```ruby
47
+ # setting connection at the instance level
38
48
  etl = ETL.new(description: "a description of what this ETL does",
39
49
  connection: connection)
40
50
  ```
@@ -50,13 +60,12 @@ etl.config do |etl|
50
60
  #
51
61
  etl.query %[
52
62
  CREATE TABLE IF NOT EXISTS some_database.some_destination_table (
53
- user_id INT UNSIGNED NOT NULL,
54
- created_date DATE NOT NULL,
55
- total_amount INT SIGNED NOT NULL,
56
- message VARCHAR(100) DEFAULT NULL,
57
- PRIMARY KEY (user_id),
58
- KEY (user_id, created_date),
59
- KEY (created_date)
63
+ user_id INT UNSIGNED NOT NULL
64
+ , created_date DATE NOT NULL
65
+ , total_amount INT SIGNED NOT NULL
66
+ , message VARCHAR(100) DEFAULT NULL
67
+ , PRIMARY KEY (user_id, created_date)
68
+ , KEY (created_date)
60
69
  )]
61
70
  end
62
71
 
@@ -81,8 +90,11 @@ etl.config do |etl|
81
90
  # For example:
82
91
  #
83
92
  etl.query %[
84
- REPLACE INTO some_database.some_destination_table
85
- SELECT
93
+ REPLACE INTO some_database.some_destination_table (
94
+ user_id
95
+ , created_date
96
+ , total_amount
97
+ ) SELECT
86
98
  user_id
87
99
  , DATE(created_at) AS created_date
88
100
  , SUM(amount) AS total_amount
@@ -90,7 +102,7 @@ etl.config do |etl|
90
102
  some_database.some_source_table sst
91
103
  GROUP BY
92
104
  sst.user_id
93
- , sst.DATE(created_at)]
105
+ , DATE(sst.created_at)]
94
106
  end
95
107
 
96
108
  etl.after_etl do |etl|
@@ -142,13 +154,12 @@ etl.config do |etl|
142
154
  #
143
155
  etl.query %[
144
156
  CREATE TABLE IF NOT EXISTS some_database.some_destination_table (
145
- user_id INT UNSIGNED NOT NULL,
146
- created_date DATE NOT NULL,
147
- total_amount INT SIGNED NOT NULL,
148
- message VARCHAR(100) DEFAULT NULL,
149
- PRIMARY KEY (user_id),
150
- KEY (user_id, created_date),
151
- KEY (created_date)
157
+ user_id INT UNSIGNED NOT NULL
158
+ , created_date DATE NOT NULL
159
+ , total_amount INT SIGNED NOT NULL
160
+ , message VARCHAR(100) DEFAULT NULL
161
+ , PRIMARY KEY (user_id, created_date)
162
+ , KEY (created_date)
152
163
  )]
153
164
  end
154
165
 
@@ -177,8 +188,11 @@ etl.config do |etl|
177
188
  #
178
189
  # As an example:
179
190
  #
191
+ # Note that we cast the default date as a DATE. If we don't, it will be
192
+ # treated as a string and our iterator will fail under the hood when testing
193
+ # if it is complete.
180
194
  res = etl.query %[
181
- SELECT COALESCE(MAX(created_date), '1970-01-01') AS the_max
195
+ SELECT COALESCE(MAX(created_date), DATE('2010-01-01')) AS the_max
182
196
  FROM some_database.some_destination_table]
183
197
 
184
198
  res.to_a.first['the_max']
@@ -195,7 +209,7 @@ etl.config do |etl|
195
209
  #
196
210
  # As an example, to iterate 7 days at a time:
197
211
  #
198
- 7.days
212
+ 7
199
213
  end
200
214
 
201
215
  etl.stop do |etl|
@@ -237,24 +251,33 @@ etl.config do |etl|
237
251
  # As a first example, to iterate over a set of ids:
238
252
  #
239
253
  # etl.query %[
240
- # REPLACE INTO some_database.some_destination_table
241
- # SELECT
242
- # user_id
243
- # , SUM(amount) AS total_amount
254
+ # REPLACE INTO some_database.some_destination_table (
255
+ # created_date
256
+ # , user_id
257
+ # , total_amount
258
+ # ) SELECT
259
+ # DATE(sst.created_at) AS created_date
260
+ # , sst.user_id
261
+ # , SUM(sst.amount) AS total_amount
244
262
  # FROM
245
263
  # some_database.some_source_table sst
246
264
  # WHERE
247
265
  # sst.user_id > #{lbound} AND sst.user_id <= #{ubound}
248
266
  # GROUP BY
249
- # sst.user_id]
267
+ # DATE(sst.created_at)
268
+ # , sst.user_id]
250
269
  #
251
270
  # To "window" a SQL query using dates:
252
271
  #
253
272
  etl.query %[
254
- REPLACE INTO some_database.some_destination_table
255
- SELECT
256
- DATE(created_at)
257
- , SUM(amount) AS total_amount
273
+ REPLACE INTO some_database.some_destination_table (
274
+ created_date
275
+ , user_id
276
+ , total_amount
277
+ ) SELECT
278
+ DATE(sst.created_at) AS created_date
279
+ , sst.user_id
280
+ , SUM(sst.amount) AS total_amount
258
281
  FROM
259
282
  some_database.some_source_table sst
260
283
  WHERE
@@ -262,7 +285,8 @@ etl.config do |etl|
262
285
  -- This is required when dealing with dates / datetimes
263
286
  sst.created_at >= '#{lbound}' AND sst.created_at < '#{ubound}'
264
287
  GROUP BY
265
- sst.user_id]
288
+ DATE(sst.created_at)
289
+ , sst.user_id]
266
290
 
267
291
  # Note that there is no sql sanitization here so there is *potential* for SQL
268
292
  # injection. That being said you'll likely be using this gem in an internal
@@ -295,6 +319,13 @@ Note that `#etl` executes `#start` and `#stop` once and memoizes the result for
295
319
  each. It then begins to iterate from what `#start` evaluated to up until what `#stop`
296
320
  evaluated to by what `#step` evaluates to.
297
321
 
322
+ ## Examples
323
+
324
+ There are two examples found in `./examples` that demonstrate the basic ETL and
325
+ iteration ETL. Each file uses the [mysql2](https://github.com/brianmario/mysql2)
326
+ gem and reads / writes data to localhost using the root user with no password.
327
+ Adjust as needed.
328
+
298
329
  ## Logger Details
299
330
 
300
331
  A logger must support two methods: `#info` and `#warn`.
@@ -0,0 +1,115 @@
1
+ require 'mysql2'
2
+ require 'ETL'
3
+
4
+ connection = Mysql2::Client.new host: 'localhost',
5
+ username: 'root',
6
+ password: '',
7
+ database: 'some_database'
8
+
9
+ # set up the source database
10
+ connection.query %[
11
+ CREATE DATABASE IF NOT EXISTS some_database]
12
+
13
+ connection.query %[
14
+ CREATE TABLE IF NOT EXISTS some_database.some_source_table (
15
+ user_id INT NOT NULL
16
+ , created_at DATETIME NOT NULL
17
+ , amount INT NOT NULL)]
18
+
19
+ connection.query %[
20
+ TRUNCATE some_database.some_source_table]
21
+
22
+ connection.query %[
23
+ INSERT INTO some_database.some_source_table (
24
+ user_id
25
+ , created_at
26
+ , amount
27
+ ) VALUES
28
+ (1, UTC_TIMESTAMP, 100)
29
+ , (2, UTC_TIMESTAMP - INTERVAL 3 DAY, 200)
30
+ , (2, UTC_TIMESTAMP - INTERVAL 3 DAY, 400)
31
+ , (2, UTC_TIMESTAMP - INTERVAL 3 DAY, 600)
32
+ , (3, UTC_TIMESTAMP - INTERVAL 3 DAY, 600)
33
+ , (3, UTC_TIMESTAMP - INTERVAL 3 DAY, -100)
34
+ , (3, UTC_TIMESTAMP - INTERVAL 3 DAY, 200)
35
+ , (3, UTC_TIMESTAMP - INTERVAL 4 DAY, 200)]
36
+
37
+ # set up the ETL
38
+ etl = ETL.new(description: "a description of what this ETL does",
39
+ connection: connection)
40
+
41
+ # configure ETL
42
+ etl.config do |etl|
43
+ etl.ensure_destination do |etl|
44
+ # For most ETLs you may want to ensure that the destination exists, so the
45
+ # #ensure_destination block is ideally suited to fulfill this requirement.
46
+ #
47
+ # By way of example:
48
+ #
49
+ etl.query %[
50
+ CREATE TABLE IF NOT EXISTS some_database.some_destination_table (
51
+ user_id INT UNSIGNED NOT NULL
52
+ , created_date DATE NOT NULL
53
+ , total_amount INT SIGNED NOT NULL
54
+ , message VARCHAR(100) DEFAULT NULL
55
+ , PRIMARY KEY (user_id, created_date)
56
+ , KEY (created_date)
57
+ )]
58
+ end
59
+
60
+ etl.before_etl do |etl|
61
+ # All pre-ETL work is performed in this block.
62
+ #
63
+ # This can be thought of as a before-ETL hook that will fire only once. When
64
+ # you are not leveraging the ETL iteration capabilities, the value of this
65
+ # block vs the #etl block is not very clear. We will see how and when to
66
+ # leverage this block effectively when we introduce iteration.
67
+ #
68
+ # As an example, let's say we want to get rid of all entries that have an
69
+ # amount less than zero before moving on to our actual etl:
70
+ #
71
+ etl.query %[DELETE FROM some_database.some_source_table WHERE amount < 0]
72
+ end
73
+
74
+ etl.etl do |etl|
75
+ # Here is where the magic happens! This block contains the main ETL
76
+ # operation.
77
+ #
78
+ # For example:
79
+ #
80
+ etl.query %[
81
+ REPLACE INTO some_database.some_destination_table (
82
+ user_id
83
+ , created_date
84
+ , total_amount
85
+ ) SELECT
86
+ sst.user_id
87
+ , DATE(sst.created_at) AS created_date
88
+ , SUM(sst.amount) AS total_amount
89
+ FROM
90
+ some_database.some_source_table sst
91
+ GROUP BY
92
+ sst.user_id
93
+ , DATE(sst.created_at)]
94
+ end
95
+
96
+ etl.after_etl do |etl|
97
+ # All post-ETL work is performed in this block.
98
+ #
99
+ # Again, to finish up with an example:
100
+ #
101
+ etl.query %[
102
+ UPDATE some_database.some_destination_table
103
+ SET message = "WOW"
104
+ WHERE total_amount > 100]
105
+ end
106
+ end
107
+
108
+ # ship it
109
+ etl.run
110
+
111
+ puts %[
112
+ ETL complete. Now go have a look at some_database.some_destination_table
113
+ That was build from some_database.some_source_table using the above ETL configuration.
114
+
115
+ SELECT * FROM some_database.some_destination_table;]
@@ -0,0 +1,209 @@
1
+ require 'mysql2'
2
+ require 'ETL'
3
+
4
+ connection = Mysql2::Client.new host: 'localhost',
5
+ username: 'root',
6
+ password: '',
7
+ database: 'some_database'
8
+
9
+ # set up the source database:
10
+ connection.query %[
11
+ CREATE DATABASE IF NOT EXISTS some_database]
12
+
13
+ connection.query %[
14
+ CREATE TABLE IF NOT EXISTS some_database.some_source_table (
15
+ user_id INT NOT NULL
16
+ , created_at DATETIME NOT NULL
17
+ , amount INT NOT NULL)]
18
+
19
+ connection.query %[
20
+ TRUNCATE some_database.some_source_table]
21
+
22
+ connection.query %[
23
+ INSERT INTO some_database.some_source_table (
24
+ user_id
25
+ , created_at
26
+ , amount
27
+ ) VALUES
28
+ (1, UTC_TIMESTAMP, 100)
29
+ , (2, UTC_TIMESTAMP - INTERVAL 3 DAY, 200)
30
+ , (2, UTC_TIMESTAMP - INTERVAL 3 DAY, 400)
31
+ , (2, UTC_TIMESTAMP - INTERVAL 3 DAY, 600)
32
+ , (3, UTC_TIMESTAMP - INTERVAL 3 DAY, 600)
33
+ , (3, UTC_TIMESTAMP - INTERVAL 3 DAY, -100)
34
+ , (3, UTC_TIMESTAMP - INTERVAL 3 DAY, 200)
35
+ , (3, UTC_TIMESTAMP - INTERVAL 4 DAY, 200)]
36
+
37
+ # set up the ETL
38
+ etl = ETL.new(description: "a description of what this ETL does",
39
+ connection: connection)
40
+
41
+ # configure it
42
+ etl.config do |etl|
43
+ etl.ensure_destination do |etl|
44
+ # For most ETLs you may want to ensure that the destination exists, so the
45
+ # #ensure_destination block is ideally suited to fulfill this requirement.
46
+ #
47
+ # By way of example:
48
+ #
49
+ etl.query %[
50
+ CREATE TABLE IF NOT EXISTS some_database.some_destination_table (
51
+ user_id INT UNSIGNED NOT NULL
52
+ , created_date DATE NOT NULL
53
+ , total_amount INT SIGNED NOT NULL
54
+ , message VARCHAR(100) DEFAULT NULL
55
+ , PRIMARY KEY (user_id, created_date)
56
+ , KEY (created_date)
57
+ )]
58
+ end
59
+
60
+ etl.before_etl do |etl|
61
+ # All pre-ETL work is performed in this block.
62
+ #
63
+ # Now that we are leveraging iteration the #before_etl block becomes
64
+ # more useful as a way to execute an operation once before we begin
65
+ # our iteration.
66
+ #
67
+ # As an example, let's say we want to get rid of all entries that have an
68
+ # amount less than zero before moving on to our actual etl:
69
+ #
70
+ etl.query %[
71
+ DELETE FROM some_database.some_source_table
72
+ WHERE amount < 0]
73
+ end
74
+
75
+ etl.start do |etl|
76
+ # This defines where the ETL should start. This can be a flat number
77
+ # or date, or even SQL / other code can be executed to produce a starting
78
+ # value.
79
+ #
80
+ # Usually, this is the last known entry for the destination table with
81
+ # some sensible default if the destination does not yet contain data.
82
+ #
83
+ # As an example:
84
+ #
85
+ # Note that we cast the default date as a DATE. If we don't, it will be
86
+ # treated as a string and our iterator will fail under the hood when testing
87
+ # if it is complete.
88
+ res = etl.query %[
89
+ SELECT COALESCE(MAX(created_date), DATE('2010-01-01')) AS the_max
90
+ FROM some_database.some_destination_table]
91
+
92
+ res.to_a.first['the_max']
93
+ end
94
+
95
+ etl.step do |etl|
96
+ # The step block defines the size of the iteration block. To iterate by
97
+ # ten records, the step block should be set to return 10.
98
+ #
99
+ # As an alternative example, to set the iteration to go 10,000 units
100
+ # at a time, the following value should be provided:
101
+ #
102
+ # 10_000 (Note: an underscore is used for readability)
103
+ #
104
+ # As an example, to iterate 7 days at a time:
105
+ #
106
+ 7
107
+ end
108
+
109
+ etl.stop do |etl|
110
+ # The stop block defines when the iteration should halt.
111
+ # Again, this can be a flat value or code. Either way, one value *must* be
112
+ # returned.
113
+ #
114
+ # As a flat value:
115
+ #
116
+ # 1_000_000
117
+ #
118
+ # Or a date value:
119
+ #
120
+ # Time.now.to_date
121
+ #
122
+ # Or as a code example:
123
+ #
124
+ res = etl.query %[
125
+ SELECT DATE(MAX(created_at)) AS the_max
126
+ FROM some_database.some_source_table]
127
+
128
+ res.to_a.first['the_max']
129
+ end
130
+
131
+ etl.etl do |etl, lbound, ubound|
132
+ # The etl block is the main part of the framework. Note: there are
133
+ # two extra args with the iterator this time around: "lbound" and "ubound"
134
+ #
135
+ # "lbound" is the lower bound of the current iteration. When iterating
136
+ # from 0 to 10 and stepping by 2, the lbound would equal 2 on the
137
+ # second iteration.
138
+ #
139
+ # "ubound" is the upper bound of the current iteration. In continuing with the
140
+ # example above, when iterating from 0 to 10 and stepping by 2, the ubound would
141
+ # equal 4 on the second iteration.
142
+ #
143
+ # These args can be used to "window" SQL queries or other code operations.
144
+ #
145
+ # As a first example, to iterate over a set of ids:
146
+ #
147
+ # etl.query %[
148
+ # REPLACE INTO some_database.some_destination_table (
149
+ # created_date
150
+ # , user_id
151
+ # , total_amount
152
+ # ) SELECT
153
+ # DATE(sst.created_at) AS created_date
154
+ # , sst.user_id
155
+ # , SUM(sst.amount) AS total_amount
156
+ # FROM
157
+ # some_database.some_source_table sst
158
+ # WHERE
159
+ # sst.user_id > #{lbound} AND sst.user_id <= #{ubound}
160
+ # GROUP BY
161
+ # DATE(sst.created_at)
162
+ # , sst.user_id]
163
+ #
164
+ # To "window" a SQL query using dates:
165
+ #
166
+ etl.query %[
167
+ REPLACE INTO some_database.some_destination_table (
168
+ created_date
169
+ , user_id
170
+ , total_amount
171
+ ) SELECT
172
+ DATE(sst.created_at) AS created_date
173
+ , sst.user_id
174
+ , SUM(sst.amount) AS total_amount
175
+ FROM
176
+ some_database.some_source_table sst
177
+ WHERE
178
+ -- Note the usage of quotes surrounding the lbound and ubound vars.
179
+ -- This is required when dealing with dates / datetimes
180
+ sst.created_at >= '#{lbound}' AND sst.created_at < '#{ubound}'
181
+ GROUP BY
182
+ DATE(sst.created_at)
183
+ , sst.user_id]
184
+
185
+ # Note that there is no sql sanitization here so there is *potential* for SQL
186
+ # injection. That being said you'll likely be using this gem in an internal
187
+ # tool so hopefully your co-workers are not looking to sabotage your ETL
188
+ # pipeline. Just be aware of this and handle it as you see fit.
189
+ end
190
+
191
+ etl.after_etl do |etl|
192
+ # All post-ETL work is performed in this block.
193
+ #
194
+ # Again, to finish up with an example:
195
+ #
196
+ etl.query %[
197
+ UPDATE some_database.some_destination_table
198
+ SET message = "WOW"
199
+ WHERE total_amount > 100]
200
+ end
201
+ end
202
+
203
+ etl.run
204
+
205
+ puts %[
206
+ ETL complete. Now go have a look at some_database.some_destination_table
207
+ That was build from some_database.some_source_table using the above ETL configuration.
208
+
209
+ SELECT * FROM some_database.some_destination_table;]
data/lib/etl.rb CHANGED
@@ -24,9 +24,21 @@ class ETL
24
24
  :stop
25
25
  ]
26
26
 
27
+ def self.connection= connection
28
+ @connection = connection
29
+ end
30
+
31
+ def self.connection
32
+ @connection
33
+ end
34
+
35
+ def self.defaults
36
+ {connection: @connection}
37
+ end
38
+
27
39
  def initialize attributes = {}
28
- attributes.keys.uniq.each do |attribute|
29
- self.send "#{attribute}=", attributes[attribute]
40
+ self.class.defaults.merge(attributes).each do |key, value|
41
+ self.send "#{key}=", value
30
42
  end
31
43
  default_logger! unless attributes.keys.include?(:logger)
32
44
  end
@@ -23,7 +23,7 @@ class ETL
23
23
  caster = ->(str) { "DATE(#{str})" }
24
24
  end
25
25
 
26
- max_sql_clause = "COALESCE(MAX(#{table}.#{column}), #{default_value})"
26
+ max_sql_clause = "IFNULL(MAX(#{table}.#{column}), #{default_value})"
27
27
  max_sql_clause = caster.(max_sql_clause) if caster
28
28
 
29
29
  sql = <<-EOS
@@ -1,3 +1,3 @@
1
1
  class ETL
2
- VERSION = "1.0.0"
2
+ VERSION = "1.1.0"
3
3
  end
@@ -38,6 +38,25 @@ end
38
38
  describe ETL do
39
39
  let(:logger) { nil }
40
40
 
41
+ describe ".connection=" do
42
+ let(:class_level_connection) { stub('class_level_connection') }
43
+
44
+ it "sets the #connection for all instances" do
45
+ ETL.connection = class_level_connection
46
+ etl = ETL.new
47
+ expect(etl.connection).to eq class_level_connection
48
+ end
49
+
50
+ it "allows instance-level overrides" do
51
+ instance_level_connection = stub('instance_level_connection')
52
+ ETL.connection = class_level_connection
53
+ etl_with_connection_override = ETL.new connection: instance_level_connection
54
+ etl = ETL.new
55
+ expect(etl.connection).to eq class_level_connection
56
+ expect(etl_with_connection_override.connection).to eq instance_level_connection
57
+ end
58
+ end
59
+
41
60
  describe "#logger=" do
42
61
  let(:etl) { described_class.new connection: stub }
43
62
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ETL
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.0
4
+ version: 1.1.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-03-14 00:00:00.000000000 Z
12
+ date: 2013-06-13 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: activesupport
@@ -108,6 +108,8 @@ files:
108
108
  - README.md
109
109
  - Rakefile
110
110
  - etl.gemspec
111
+ - examples/basic_etl.rb
112
+ - examples/iterator_etl.rb
111
113
  - lib/etl.rb
112
114
  - lib/etl/helpers.rb
113
115
  - lib/etl/version.rb