ETL 1.0.0 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.md CHANGED
@@ -35,6 +35,16 @@ To run a basic ETL that is composed of sequential SQL statements, start by
35
35
  creating a new ETL instance:
36
36
 
37
37
  ```ruby
38
+ # setting connection at the class level
39
+ ETL.connection = connection
40
+
41
+ etl = ETL.new(description: "a description of what this ETL does")
42
+ ```
43
+
44
+ or
45
+
46
+ ```ruby
47
+ # setting connection at the instance level
38
48
  etl = ETL.new(description: "a description of what this ETL does",
39
49
  connection: connection)
40
50
  ```
@@ -50,13 +60,12 @@ etl.config do |etl|
50
60
  #
51
61
  etl.query %[
52
62
  CREATE TABLE IF NOT EXISTS some_database.some_destination_table (
53
- user_id INT UNSIGNED NOT NULL,
54
- created_date DATE NOT NULL,
55
- total_amount INT SIGNED NOT NULL,
56
- message VARCHAR(100) DEFAULT NULL,
57
- PRIMARY KEY (user_id),
58
- KEY (user_id, created_date),
59
- KEY (created_date)
63
+ user_id INT UNSIGNED NOT NULL
64
+ , created_date DATE NOT NULL
65
+ , total_amount INT SIGNED NOT NULL
66
+ , message VARCHAR(100) DEFAULT NULL
67
+ , PRIMARY KEY (user_id, created_date)
68
+ , KEY (created_date)
60
69
  )]
61
70
  end
62
71
 
@@ -81,8 +90,11 @@ etl.config do |etl|
81
90
  # For example:
82
91
  #
83
92
  etl.query %[
84
- REPLACE INTO some_database.some_destination_table
85
- SELECT
93
+ REPLACE INTO some_database.some_destination_table (
94
+ user_id
95
+ , created_date
96
+ , total_amount
97
+ ) SELECT
86
98
  user_id
87
99
  , DATE(created_at) AS created_date
88
100
  , SUM(amount) AS total_amount
@@ -90,7 +102,7 @@ etl.config do |etl|
90
102
  some_database.some_source_table sst
91
103
  GROUP BY
92
104
  sst.user_id
93
- , sst.DATE(created_at)]
105
+ , DATE(sst.created_at)]
94
106
  end
95
107
 
96
108
  etl.after_etl do |etl|
@@ -142,13 +154,12 @@ etl.config do |etl|
142
154
  #
143
155
  etl.query %[
144
156
  CREATE TABLE IF NOT EXISTS some_database.some_destination_table (
145
- user_id INT UNSIGNED NOT NULL,
146
- created_date DATE NOT NULL,
147
- total_amount INT SIGNED NOT NULL,
148
- message VARCHAR(100) DEFAULT NULL,
149
- PRIMARY KEY (user_id),
150
- KEY (user_id, created_date),
151
- KEY (created_date)
157
+ user_id INT UNSIGNED NOT NULL
158
+ , created_date DATE NOT NULL
159
+ , total_amount INT SIGNED NOT NULL
160
+ , message VARCHAR(100) DEFAULT NULL
161
+ , PRIMARY KEY (user_id, created_date)
162
+ , KEY (created_date)
152
163
  )]
153
164
  end
154
165
 
@@ -177,8 +188,11 @@ etl.config do |etl|
177
188
  #
178
189
  # As an example:
179
190
  #
191
+ # Note that we cast the default date as a DATE. If we don't, it will be
192
+ # treated as a string and our iterator will fail under the hood when testing
193
+ # if it is complete.
180
194
  res = etl.query %[
181
- SELECT COALESCE(MAX(created_date), '1970-01-01') AS the_max
195
+ SELECT COALESCE(MAX(created_date), DATE('2010-01-01')) AS the_max
182
196
  FROM some_database.some_destination_table]
183
197
 
184
198
  res.to_a.first['the_max']
@@ -195,7 +209,7 @@ etl.config do |etl|
195
209
  #
196
210
  # As an example, to iterate 7 days at a time:
197
211
  #
198
- 7.days
212
+ 7
199
213
  end
200
214
 
201
215
  etl.stop do |etl|
@@ -237,24 +251,33 @@ etl.config do |etl|
237
251
  # As a first example, to iterate over a set of ids:
238
252
  #
239
253
  # etl.query %[
240
- # REPLACE INTO some_database.some_destination_table
241
- # SELECT
242
- # user_id
243
- # , SUM(amount) AS total_amount
254
+ # REPLACE INTO some_database.some_destination_table (
255
+ # created_date
256
+ # , user_id
257
+ # , total_amount
258
+ # ) SELECT
259
+ # DATE(sst.created_at) AS created_date
260
+ # , sst.user_id
261
+ # , SUM(sst.amount) AS total_amount
244
262
  # FROM
245
263
  # some_database.some_source_table sst
246
264
  # WHERE
247
265
  # sst.user_id > #{lbound} AND sst.user_id <= #{ubound}
248
266
  # GROUP BY
249
- # sst.user_id]
267
+ # DATE(sst.created_at)
268
+ # , sst.user_id]
250
269
  #
251
270
  # To "window" a SQL query using dates:
252
271
  #
253
272
  etl.query %[
254
- REPLACE INTO some_database.some_destination_table
255
- SELECT
256
- DATE(created_at)
257
- , SUM(amount) AS total_amount
273
+ REPLACE INTO some_database.some_destination_table (
274
+ created_date
275
+ , user_id
276
+ , total_amount
277
+ ) SELECT
278
+ DATE(sst.created_at) AS created_date
279
+ , sst.user_id
280
+ , SUM(sst.amount) AS total_amount
258
281
  FROM
259
282
  some_database.some_source_table sst
260
283
  WHERE
@@ -262,7 +285,8 @@ etl.config do |etl|
262
285
  -- This is required when dealing with dates / datetimes
263
286
  sst.created_at >= '#{lbound}' AND sst.created_at < '#{ubound}'
264
287
  GROUP BY
265
- sst.user_id]
288
+ DATE(sst.created_at)
289
+ , sst.user_id]
266
290
 
267
291
  # Note that there is no sql sanitization here so there is *potential* for SQL
268
292
  # injection. That being said you'll likely be using this gem in an internal
@@ -295,6 +319,13 @@ Note that `#etl` executes `#start` and `#stop` once and memoizes the result for
295
319
  each. It then begins to iterate from what `#start` evaluated to up until what `#stop`
296
320
  evaluated to by what `#step` evaluates to.
297
321
 
322
+ ## Examples
323
+
324
+ There are two examples found in `./examples` that demonstrate the basic ETL and
325
+ iteration ETL. Each file uses the [mysql2](https://github.com/brianmario/mysql2)
326
+ gem and reads / writes data to localhost using the root user with no password.
327
+ Adjust as needed.
328
+
298
329
  ## Logger Details
299
330
 
300
331
  A logger must support two methods: `#info` and `#warn`.
@@ -0,0 +1,115 @@
1
+ require 'mysql2'
2
+ require 'ETL'
3
+
4
+ connection = Mysql2::Client.new host: 'localhost',
5
+ username: 'root',
6
+ password: '',
7
+ database: 'some_database'
8
+
9
+ # set up the source database
10
+ connection.query %[
11
+ CREATE DATABASE IF NOT EXISTS some_database]
12
+
13
+ connection.query %[
14
+ CREATE TABLE IF NOT EXISTS some_database.some_source_table (
15
+ user_id INT NOT NULL
16
+ , created_at DATETIME NOT NULL
17
+ , amount INT NOT NULL)]
18
+
19
+ connection.query %[
20
+ TRUNCATE some_database.some_source_table]
21
+
22
+ connection.query %[
23
+ INSERT INTO some_database.some_source_table (
24
+ user_id
25
+ , created_at
26
+ , amount
27
+ ) VALUES
28
+ (1, UTC_TIMESTAMP, 100)
29
+ , (2, UTC_TIMESTAMP - INTERVAL 3 DAY, 200)
30
+ , (2, UTC_TIMESTAMP - INTERVAL 3 DAY, 400)
31
+ , (2, UTC_TIMESTAMP - INTERVAL 3 DAY, 600)
32
+ , (3, UTC_TIMESTAMP - INTERVAL 3 DAY, 600)
33
+ , (3, UTC_TIMESTAMP - INTERVAL 3 DAY, -100)
34
+ , (3, UTC_TIMESTAMP - INTERVAL 3 DAY, 200)
35
+ , (3, UTC_TIMESTAMP - INTERVAL 4 DAY, 200)]
36
+
37
+ # set up the ETL
38
+ etl = ETL.new(description: "a description of what this ETL does",
39
+ connection: connection)
40
+
41
+ # configure ETL
42
+ etl.config do |etl|
43
+ etl.ensure_destination do |etl|
44
+ # For most ETLs you may want to ensure that the destination exists, so the
45
+ # #ensure_destination block is ideally suited to fulfill this requirement.
46
+ #
47
+ # By way of example:
48
+ #
49
+ etl.query %[
50
+ CREATE TABLE IF NOT EXISTS some_database.some_destination_table (
51
+ user_id INT UNSIGNED NOT NULL
52
+ , created_date DATE NOT NULL
53
+ , total_amount INT SIGNED NOT NULL
54
+ , message VARCHAR(100) DEFAULT NULL
55
+ , PRIMARY KEY (user_id, created_date)
56
+ , KEY (created_date)
57
+ )]
58
+ end
59
+
60
+ etl.before_etl do |etl|
61
+ # All pre-ETL work is performed in this block.
62
+ #
63
+ # This can be thought of as a before-ETL hook that will fire only once. When
64
+ # you are not leveraging the ETL iteration capabilities, the value of this
65
+ # block vs the #etl block is not very clear. We will see how and when to
66
+ # leverage this block effectively when we introduce iteration.
67
+ #
68
+ # As an example, let's say we want to get rid of all entries that have an
69
+ # amount less than zero before moving on to our actual etl:
70
+ #
71
+ etl.query %[DELETE FROM some_database.some_source_table WHERE amount < 0]
72
+ end
73
+
74
+ etl.etl do |etl|
75
+ # Here is where the magic happens! This block contains the main ETL
76
+ # operation.
77
+ #
78
+ # For example:
79
+ #
80
+ etl.query %[
81
+ REPLACE INTO some_database.some_destination_table (
82
+ user_id
83
+ , created_date
84
+ , total_amount
85
+ ) SELECT
86
+ sst.user_id
87
+ , DATE(sst.created_at) AS created_date
88
+ , SUM(sst.amount) AS total_amount
89
+ FROM
90
+ some_database.some_source_table sst
91
+ GROUP BY
92
+ sst.user_id
93
+ , DATE(sst.created_at)]
94
+ end
95
+
96
+ etl.after_etl do |etl|
97
+ # All post-ETL work is performed in this block.
98
+ #
99
+ # Again, to finish up with an example:
100
+ #
101
+ etl.query %[
102
+ UPDATE some_database.some_destination_table
103
+ SET message = "WOW"
104
+ WHERE total_amount > 100]
105
+ end
106
+ end
107
+
108
+ # ship it
109
+ etl.run
110
+
111
+ puts %[
112
+ ETL complete. Now go have a look at some_database.some_destination_table
113
+ That was built from some_database.some_source_table using the above ETL configuration.
114
+
115
+ SELECT * FROM some_database.some_destination_table;]
@@ -0,0 +1,209 @@
1
+ require 'mysql2'
2
+ require 'ETL'
3
+
4
+ connection = Mysql2::Client.new host: 'localhost',
5
+ username: 'root',
6
+ password: '',
7
+ database: 'some_database'
8
+
9
+ # set up the source database:
10
+ connection.query %[
11
+ CREATE DATABASE IF NOT EXISTS some_database]
12
+
13
+ connection.query %[
14
+ CREATE TABLE IF NOT EXISTS some_database.some_source_table (
15
+ user_id INT NOT NULL
16
+ , created_at DATETIME NOT NULL
17
+ , amount INT NOT NULL)]
18
+
19
+ connection.query %[
20
+ TRUNCATE some_database.some_source_table]
21
+
22
+ connection.query %[
23
+ INSERT INTO some_database.some_source_table (
24
+ user_id
25
+ , created_at
26
+ , amount
27
+ ) VALUES
28
+ (1, UTC_TIMESTAMP, 100)
29
+ , (2, UTC_TIMESTAMP - INTERVAL 3 DAY, 200)
30
+ , (2, UTC_TIMESTAMP - INTERVAL 3 DAY, 400)
31
+ , (2, UTC_TIMESTAMP - INTERVAL 3 DAY, 600)
32
+ , (3, UTC_TIMESTAMP - INTERVAL 3 DAY, 600)
33
+ , (3, UTC_TIMESTAMP - INTERVAL 3 DAY, -100)
34
+ , (3, UTC_TIMESTAMP - INTERVAL 3 DAY, 200)
35
+ , (3, UTC_TIMESTAMP - INTERVAL 4 DAY, 200)]
36
+
37
+ # set up the ETL
38
+ etl = ETL.new(description: "a description of what this ETL does",
39
+ connection: connection)
40
+
41
+ # configure it
42
+ etl.config do |etl|
43
+ etl.ensure_destination do |etl|
44
+ # For most ETLs you may want to ensure that the destination exists, so the
45
+ # #ensure_destination block is ideally suited to fulfill this requirement.
46
+ #
47
+ # By way of example:
48
+ #
49
+ etl.query %[
50
+ CREATE TABLE IF NOT EXISTS some_database.some_destination_table (
51
+ user_id INT UNSIGNED NOT NULL
52
+ , created_date DATE NOT NULL
53
+ , total_amount INT SIGNED NOT NULL
54
+ , message VARCHAR(100) DEFAULT NULL
55
+ , PRIMARY KEY (user_id, created_date)
56
+ , KEY (created_date)
57
+ )]
58
+ end
59
+
60
+ etl.before_etl do |etl|
61
+ # All pre-ETL work is performed in this block.
62
+ #
63
+ # Now that we are leveraging iteration the #before_etl block becomes
64
+ # more useful as a way to execute an operation once before we begin
65
+ # our iteration.
66
+ #
67
+ # As an example, let's say we want to get rid of all entries that have an
68
+ # amount less than zero before moving on to our actual etl:
69
+ #
70
+ etl.query %[
71
+ DELETE FROM some_database.some_source_table
72
+ WHERE amount < 0]
73
+ end
74
+
75
+ etl.start do |etl|
76
+ # This defines where the ETL should start. This can be a flat number
77
+ # or date, or even SQL / other code can be executed to produce a starting
78
+ # value.
79
+ #
80
+ # Usually, this is the last known entry for the destination table with
81
+ # some sensible default if the destination does not yet contain data.
82
+ #
83
+ # As an example:
84
+ #
85
+ # Note that we cast the default date as a DATE. If we don't, it will be
86
+ # treated as a string and our iterator will fail under the hood when testing
87
+ # if it is complete.
88
+ res = etl.query %[
89
+ SELECT COALESCE(MAX(created_date), DATE('2010-01-01')) AS the_max
90
+ FROM some_database.some_destination_table]
91
+
92
+ res.to_a.first['the_max']
93
+ end
94
+
95
+ etl.step do |etl|
96
+ # The step block defines the size of the iteration block. To iterate by
97
+ # ten records, the step block should be set to return 10.
98
+ #
99
+ # As an alternative example, to set the iteration to go 10,000 units
100
+ # at a time, the following value should be provided:
101
+ #
102
+ # 10_000 (Note: an underscore is used for readability)
103
+ #
104
+ # As an example, to iterate 7 days at a time:
105
+ #
106
+ 7
107
+ end
108
+
109
+ etl.stop do |etl|
110
+ # The stop block defines when the iteration should halt.
111
+ # Again, this can be a flat value or code. Either way, one value *must* be
112
+ # returned.
113
+ #
114
+ # As a flat value:
115
+ #
116
+ # 1_000_000
117
+ #
118
+ # Or a date value:
119
+ #
120
+ # Time.now.to_date
121
+ #
122
+ # Or as a code example:
123
+ #
124
+ res = etl.query %[
125
+ SELECT DATE(MAX(created_at)) AS the_max
126
+ FROM some_database.some_source_table]
127
+
128
+ res.to_a.first['the_max']
129
+ end
130
+
131
+ etl.etl do |etl, lbound, ubound|
132
+ # The etl block is the main part of the framework. Note: there are
133
+ # two extra args with the iterator this time around: "lbound" and "ubound"
134
+ #
135
+ # "lbound" is the lower bound of the current iteration. When iterating
136
+ # from 0 to 10 and stepping by 2, the lbound would equal 2 on the
137
+ # second iteration.
138
+ #
139
+ # "ubound" is the upper bound of the current iteration. In continuing with the
140
+ # example above, when iterating from 0 to 10 and stepping by 2, the ubound would
141
+ # equal 4 on the second iteration.
142
+ #
143
+ # These args can be used to "window" SQL queries or other code operations.
144
+ #
145
+ # As a first example, to iterate over a set of ids:
146
+ #
147
+ # etl.query %[
148
+ # REPLACE INTO some_database.some_destination_table (
149
+ # created_date
150
+ # , user_id
151
+ # , total_amount
152
+ # ) SELECT
153
+ # DATE(sst.created_at) AS created_date
154
+ # , sst.user_id
155
+ # , SUM(sst.amount) AS total_amount
156
+ # FROM
157
+ # some_database.some_source_table sst
158
+ # WHERE
159
+ # sst.user_id > #{lbound} AND sst.user_id <= #{ubound}
160
+ # GROUP BY
161
+ # DATE(sst.created_at)
162
+ # , sst.user_id]
163
+ #
164
+ # To "window" a SQL query using dates:
165
+ #
166
+ etl.query %[
167
+ REPLACE INTO some_database.some_destination_table (
168
+ created_date
169
+ , user_id
170
+ , total_amount
171
+ ) SELECT
172
+ DATE(sst.created_at) AS created_date
173
+ , sst.user_id
174
+ , SUM(sst.amount) AS total_amount
175
+ FROM
176
+ some_database.some_source_table sst
177
+ WHERE
178
+ -- Note the usage of quotes surrounding the lbound and ubound vars.
179
+ -- This is required when dealing with dates / datetimes
180
+ sst.created_at >= '#{lbound}' AND sst.created_at < '#{ubound}'
181
+ GROUP BY
182
+ DATE(sst.created_at)
183
+ , sst.user_id]
184
+
185
+ # Note that there is no sql sanitization here so there is *potential* for SQL
186
+ # injection. That being said you'll likely be using this gem in an internal
187
+ # tool so hopefully your co-workers are not looking to sabotage your ETL
188
+ # pipeline. Just be aware of this and handle it as you see fit.
189
+ end
190
+
191
+ etl.after_etl do |etl|
192
+ # All post-ETL work is performed in this block.
193
+ #
194
+ # Again, to finish up with an example:
195
+ #
196
+ etl.query %[
197
+ UPDATE some_database.some_destination_table
198
+ SET message = "WOW"
199
+ WHERE total_amount > 100]
200
+ end
201
+ end
202
+
203
+ etl.run
204
+
205
+ puts %[
206
+ ETL complete. Now go have a look at some_database.some_destination_table
207
+ That was built from some_database.some_source_table using the above ETL configuration.
208
+
209
+ SELECT * FROM some_database.some_destination_table;]
data/lib/etl.rb CHANGED
@@ -24,9 +24,21 @@ class ETL
24
24
  :stop
25
25
  ]
26
26
 
27
+ def self.connection= connection
28
+ @connection = connection
29
+ end
30
+
31
+ def self.connection
32
+ @connection
33
+ end
34
+
35
+ def self.defaults
36
+ {connection: @connection}
37
+ end
38
+
27
39
  def initialize attributes = {}
28
- attributes.keys.uniq.each do |attribute|
29
- self.send "#{attribute}=", attributes[attribute]
40
+ self.class.defaults.merge(attributes).each do |key, value|
41
+ self.send "#{key}=", value
30
42
  end
31
43
  default_logger! unless attributes.keys.include?(:logger)
32
44
  end
@@ -23,7 +23,7 @@ class ETL
23
23
  caster = ->(str) { "DATE(#{str})" }
24
24
  end
25
25
 
26
- max_sql_clause = "COALESCE(MAX(#{table}.#{column}), #{default_value})"
26
+ max_sql_clause = "IFNULL(MAX(#{table}.#{column}), #{default_value})"
27
27
  max_sql_clause = caster.(max_sql_clause) if caster
28
28
 
29
29
  sql = <<-EOS
@@ -1,3 +1,3 @@
1
1
  class ETL
2
- VERSION = "1.0.0"
2
+ VERSION = "1.1.0"
3
3
  end
@@ -38,6 +38,25 @@ end
38
38
  describe ETL do
39
39
  let(:logger) { nil }
40
40
 
41
+ describe ".connection=" do
42
+ let(:class_level_connection) { stub('class_level_connection') }
43
+
44
+ it "sets the #connection for all instances" do
45
+ ETL.connection = class_level_connection
46
+ etl = ETL.new
47
+ expect(etl.connection).to eq class_level_connection
48
+ end
49
+
50
+ it "allows instance-level overrides" do
51
+ instance_level_connection = stub('instance_level_connection')
52
+ ETL.connection = class_level_connection
53
+ etl_with_connection_override = ETL.new connection: instance_level_connection
54
+ etl = ETL.new
55
+ expect(etl.connection).to eq class_level_connection
56
+ expect(etl_with_connection_override.connection).to eq instance_level_connection
57
+ end
58
+ end
59
+
41
60
  describe "#logger=" do
42
61
  let(:etl) { described_class.new connection: stub }
43
62
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ETL
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.0
4
+ version: 1.1.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-03-14 00:00:00.000000000 Z
12
+ date: 2013-06-13 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: activesupport
@@ -108,6 +108,8 @@ files:
108
108
  - README.md
109
109
  - Rakefile
110
110
  - etl.gemspec
111
+ - examples/basic_etl.rb
112
+ - examples/iterator_etl.rb
111
113
  - lib/etl.rb
112
114
  - lib/etl/helpers.rb
113
115
  - lib/etl/version.rb