RubyGems - ETL - Versions diffs - 1.0.0 → 1.1.0 - Mend

ETL 1.0.0 → 1.1.0

Files changed (8) hide show

data/README.md CHANGED

@@ -35,6 +35,16 @@ To run a basic ETL that is composed of sequential SQL statements, start by
 creating a new ETL instance:
 ```ruby
+# setting connection at the class level
+ETL.connection = connection
+etl = ETL.new(description: "a description of what this ETL does")
+```
+or
+```ruby
+# setting connection at the instance level
 etl = ETL.new(description: "a description of what this ETL does",
               connection:  connection)
 ```
@@ -50,13 +60,12 @@ etl.config do |etl|
     #
     etl.query %[
       CREATE TABLE IF NOT EXISTS some_database.some_destination_table (
-        user_id INT UNSIGNED NOT NULL,
-        created_date DATE NOT NULL,
-        total_amount INT SIGNED NOT NULL,
-        message VARCHAR(100) DEFAULT NULL,
-        PRIMARY KEY (user_id),
-        KEY (user_id, created_date),
-        KEY (created_date)
+          user_id INT UNSIGNED NOT NULL
+        , created_date DATE NOT NULL
+        , total_amount INT SIGNED NOT NULL
+        , message VARCHAR(100) DEFAULT NULL
+        , PRIMARY KEY (user_id, created_date)
+        , KEY (created_date)
       )]
   end
@@ -81,8 +90,11 @@ etl.config do |etl|
     # For example:
     #
     etl.query %[
-      REPLACE INTO some_database.some_destination_table
-      SELECT
+      REPLACE INTO some_database.some_destination_table (
+          user_id
+        , created_date
+        , total_amount
+      ) SELECT
           user_id
         , DATE(created_at) AS created_date
         , SUM(amount) AS total_amount
@@ -90,7 +102,7 @@ etl.config do |etl|
         some_database.some_source_table sst
       GROUP BY
           sst.user_id
-        , sst.DATE(created_at)]
+        , DATE(sst.created_at)]
   end
   etl.after_etl do |etl|
@@ -142,13 +154,12 @@ etl.config do |etl|
     #
     etl.query %[
       CREATE TABLE IF NOT EXISTS some_database.some_destination_table (
-        user_id INT UNSIGNED NOT NULL,
-        created_date DATE NOT NULL,
-        total_amount INT SIGNED NOT NULL,
-        message VARCHAR(100) DEFAULT NULL,
-        PRIMARY KEY (user_id),
-        KEY (user_id, created_date),
-        KEY (created_date)
+          user_id INT UNSIGNED NOT NULL
+        , created_date DATE NOT NULL
+        , total_amount INT SIGNED NOT NULL
+        , message VARCHAR(100) DEFAULT NULL
+        , PRIMARY KEY (user_id, created_date)
+        , KEY (created_date)
       )]
   end
@@ -177,8 +188,11 @@ etl.config do |etl|
     #
     # As an example:
     #
+    # Note that we cast the default date as a DATE. If we don't, it will be
+    # treated as a string and our iterator will fail under the hood when testing
+    # if it is complete.
     res = etl.query %[
-      SELECT COALESCE(MAX(created_date), '1970-01-01') AS the_max
+      SELECT COALESCE(MAX(created_date), DATE('2010-01-01')) AS the_max
       FROM some_database.some_destination_table]
     res.to_a.first['the_max']
@@ -195,7 +209,7 @@ etl.config do |etl|
     #
     # As an example, to iterate 7 days at a time:
     #
-    7.days
+    7
   end
   etl.stop do |etl|
@@ -237,24 +251,33 @@ etl.config do |etl|
     # As a first example, to iterate over a set of ids:
     #
     #   etl.query %[
-    #     REPLACE INTO some_database.some_destination_table
-    #     SELECT
-    #         user_id
-    #       , SUM(amount) AS total_amount
+    #     REPLACE INTO some_database.some_destination_table (
+    #         created_date
+    #       , user_id
+    #       , total_amount
+    #     ) SELECT
+    #         DATE(sst.created_at) AS created_date
+    #       , sst.user_id
+    #       , SUM(sst.amount) AS total_amount
     #     FROM
     #       some_database.some_source_table sst
     #     WHERE
     #       sst.user_id > #{lbound} AND sst.user_id <= #{ubound}
     #     GROUP BY
-    #       sst.user_id]
+    #         DATE(sst.created_at)
+    #       , sst.user_id]
     #
     # To "window" a SQL query using dates:
     #
     etl.query %[
-      REPLACE INTO some_database.some_destination_table
-      SELECT
-          DATE(created_at)
-        , SUM(amount) AS total_amount
+      REPLACE INTO some_database.some_destination_table (
+          created_date
+        , user_id
+        , total_amount
+      ) SELECT
+          DATE(sst.created_at) AS created_date
+        , sst.user_id
+        , SUM(sst.amount) AS total_amount
       FROM
         some_database.some_source_table sst
       WHERE
@@ -262,7 +285,8 @@ etl.config do |etl|
         -- This is required when dealing with dates / datetimes
         sst.created_at >= '#{lbound}' AND sst.created_at < '#{ubound}'
       GROUP BY
-        sst.user_id]
+          DATE(sst.created_at)
+        , sst.user_id]
     # Note that there is no SQL sanitization here so there is *potential* for SQL
     # injection. That being said you'll likely be using this gem in an internal
@@ -295,6 +319,13 @@ Note that `#etl` executes `#start` and `#stop` once and memoizes the result for
 each. It then iterates from the value `#start` evaluated to up to the value
 `#stop` evaluated to, in increments of the value `#step` evaluates to.
+## Examples
+There are two examples found in `./examples` that demonstrate the basic ETL and
+iteration ETL. Each file uses the [mysql2](https://github.com/brianmario/mysql2)
+gem and reads / writes data to localhost using the root user with no password.
+Adjust as needed.
 ## Logger Details
 A logger must support two methods: `#info` and `#warn`.

data/examples/basic_etl.rb ADDED

@@ -0,0 +1,115 @@
+require 'mysql2'
+require 'ETL'
+connection = Mysql2::Client.new host:     'localhost',
+                                username: 'root',
+                                password: '',
+                                database: 'some_database'
+# set up the source database
+connection.query %[
+  CREATE DATABASE IF NOT EXISTS some_database]
+connection.query %[
+  CREATE TABLE IF NOT EXISTS some_database.some_source_table (
+      user_id INT NOT NULL
+    , created_at DATETIME NOT NULL
+    , amount INT NOT NULL)]
+connection.query %[
+  TRUNCATE some_database.some_source_table]
+connection.query %[
+  INSERT INTO some_database.some_source_table (
+      user_id
+    , created_at
+    , amount
+  ) VALUES
+      (1, UTC_TIMESTAMP, 100)
+    , (2, UTC_TIMESTAMP - INTERVAL 3 DAY, 200)
+    , (2, UTC_TIMESTAMP - INTERVAL 3 DAY, 400)
+    , (2, UTC_TIMESTAMP - INTERVAL 3 DAY, 600)
+    , (3, UTC_TIMESTAMP - INTERVAL 3 DAY, 600)
+    , (3, UTC_TIMESTAMP - INTERVAL 3 DAY, -100)
+    , (3, UTC_TIMESTAMP - INTERVAL 3 DAY, 200)
+    , (3, UTC_TIMESTAMP - INTERVAL 4 DAY, 200)]
+# set up the ETL
+etl = ETL.new(description: "a description of what this ETL does",
+              connection:  connection)
+# configure ETL
+etl.config do |etl|
+  etl.ensure_destination do |etl|
+    # For most ETLs you may want to ensure that the destination exists, so the
+    # #ensure_destination block is ideally suited to fulfill this requirement.
+    #
+    # By way of example:
+    #
+    etl.query %[
+      CREATE TABLE IF NOT EXISTS some_database.some_destination_table (
+          user_id INT UNSIGNED NOT NULL
+        , created_date DATE NOT NULL
+        , total_amount INT SIGNED NOT NULL
+        , message VARCHAR(100) DEFAULT NULL
+        , PRIMARY KEY (user_id, created_date)
+        , KEY (created_date)
+      )]
+  end
+  etl.before_etl do |etl|
+    # All pre-ETL work is performed in this block.
+    #
+    # This can be thought of as a before-ETL hook that will fire only once. When
+    # you are not leveraging the ETL iteration capabilities, the value of this
+    # block vs the #etl block is not very clear. We will see how and when to
+    # leverage this block effectively when we introduce iteration.
+    #
+    # As an example, let's say we want to get rid of all entries that have an
+    # amount less than zero before moving on to our actual etl:
+    #
+    etl.query %[DELETE FROM some_database.some_source_table WHERE amount < 0]
+  end
+  etl.etl do |etl|
+    # Here is where the magic happens! This block contains the main ETL
+    # operation.
+    #
+    # For example:
+    #
+    etl.query %[
+      REPLACE INTO some_database.some_destination_table (
+          user_id
+        , created_date
+        , total_amount
+      ) SELECT
+          sst.user_id
+        , DATE(sst.created_at) AS created_date
+        , SUM(sst.amount) AS total_amount
+      FROM
+        some_database.some_source_table sst
+      GROUP BY
+          sst.user_id
+        , DATE(sst.created_at)]
+  end
+  etl.after_etl do |etl|
+    # All post-ETL work is performed in this block.
+    #
+    # Again, to finish up with an example:
+    #
+    etl.query %[
+      UPDATE some_database.some_destination_table
+      SET message = "WOW"
+      WHERE total_amount > 100]
+  end
+end
+# ship it
+etl.run
+puts %[
+ETL complete. Now go have a look at some_database.some_destination_table
+That was build from some_database.some_source_table using the above ETL configuration.
+SELECT * FROM some_database.some_destination_table;]

data/examples/iterator_etl.rb ADDED

@@ -0,0 +1,209 @@
+require 'mysql2'
+require 'ETL'
+connection = Mysql2::Client.new host:     'localhost',
+                                username: 'root',
+                                password: '',
+                                database: 'some_database'
+# set up the source database:
+connection.query %[
+  CREATE DATABASE IF NOT EXISTS some_database]
+connection.query %[
+  CREATE TABLE IF NOT EXISTS some_database.some_source_table (
+      user_id INT NOT NULL
+    , created_at DATETIME NOT NULL
+    , amount INT NOT NULL)]
+connection.query %[
+  TRUNCATE some_database.some_source_table]
+connection.query %[
+  INSERT INTO some_database.some_source_table (
+      user_id
+    , created_at
+    , amount
+  ) VALUES
+      (1, UTC_TIMESTAMP, 100)
+    , (2, UTC_TIMESTAMP - INTERVAL 3 DAY, 200)
+    , (2, UTC_TIMESTAMP - INTERVAL 3 DAY, 400)
+    , (2, UTC_TIMESTAMP - INTERVAL 3 DAY, 600)
+    , (3, UTC_TIMESTAMP - INTERVAL 3 DAY, 600)
+    , (3, UTC_TIMESTAMP - INTERVAL 3 DAY, -100)
+    , (3, UTC_TIMESTAMP - INTERVAL 3 DAY, 200)
+    , (3, UTC_TIMESTAMP - INTERVAL 4 DAY, 200)]
+# set up the ETL
+etl = ETL.new(description: "a description of what this ETL does",
+              connection:  connection)
+# configure it
+etl.config do |etl|
+  etl.ensure_destination do |etl|
+    # For most ETLs you may want to ensure that the destination exists, so the
+    # #ensure_destination block is ideally suited to fulfill this requirement.
+    #
+    # By way of example:
+    #
+    etl.query %[
+      CREATE TABLE IF NOT EXISTS some_database.some_destination_table (
+          user_id INT UNSIGNED NOT NULL
+        , created_date DATE NOT NULL
+        , total_amount INT SIGNED NOT NULL
+        , message VARCHAR(100) DEFAULT NULL
+        , PRIMARY KEY (user_id, created_date)
+        , KEY (created_date)
+      )]
+  end
+  etl.before_etl do |etl|
+    # All pre-ETL work is performed in this block.
+    #
+    # Now that we are leveraging iteration the #before_etl block becomes
+    # more useful as a way to execute an operation once before we begin
+    # our iteration.
+    #
+    # As an example, let's say we want to get rid of all entries that have an
+    # amount less than zero before moving on to our actual etl:
+    #
+    etl.query %[
+      DELETE FROM some_database.some_source_table
+      WHERE amount < 0]
+  end
+  etl.start do |etl|
+    # This defines where the ETL should start. This can be a flat number
+    # or date, or even SQL / other code can be executed to produce a starting
+    # value.
+    #
+    # Usually, this is the last known entry for the destination table with
+    # some sensible default if the destination does not yet contain data.
+    #
+    # As an example:
+    #
+    # Note that we cast the default date as a DATE. If we don't, it will be
+    # treated as a string and our iterator will fail under the hood when testing
+    # if it is complete.
+    res = etl.query %[
+      SELECT COALESCE(MAX(created_date), DATE('2010-01-01')) AS the_max
+      FROM some_database.some_destination_table]
+    res.to_a.first['the_max']
+  end
+  etl.step do |etl|
+    # The step block defines the size of the iteration block. To iterate by
+    # ten records, the step block should be set to return 10.
+    #
+    # As an alternative example, to set the iteration to go 10,000 units
+    # at a time, the following value should be provided:
+    #
+    #   10_000 (Note: an underscore is used for readability)
+    #
+    # As an example, to iterate 7 days at a time:
+    #
+    7
+  end
+  etl.stop do |etl|
+    # The stop block defines when the iteration should halt.
+    # Again, this can be a flat value or code. Either way, one value *must* be
+    # returned.
+    #
+    # As a flat value:
+    #
+    #   1_000_000
+    #
+    # Or a date value:
+    #
+    #   Time.now.to_date
+    #
+    # Or as a code example:
+    #
+    res = etl.query %[
+      SELECT DATE(MAX(created_at)) AS the_max
+      FROM some_database.some_source_table]
+    res.to_a.first['the_max']
+  end
+  etl.etl do |etl, lbound, ubound|
+    # The etl block is the main part of the framework. Note: there are
+    # two extra args with the iterator this time around: "lbound" and "ubound"
+    #
+    # "lbound" is the lower bound of the current iteration. When iterating
+    # from 0 to 10 and stepping by 2, the lbound would equal 2 on the
+    # second iteration.
+    #
+    # "ubound" is the upper bound of the current iteration. In continuing with the
+    # example above, when iterating from 0 to 10 and stepping by 2, the ubound would
+    # equal 4 on the second iteration.
+    #
+    # These args can be used to "window" SQL queries or other code operations.
+    #
+    # As a first example, to iterate over a set of ids:
+    #
+    #   etl.query %[
+    #     REPLACE INTO some_database.some_destination_table (
+    #         created_date
+    #       , user_id
+    #       , total_amount
+    #     ) SELECT
+    #         DATE(sst.created_at) AS created_date
+    #       , sst.user_id
+    #       , SUM(sst.amount) AS total_amount
+    #     FROM
+    #       some_database.some_source_table sst
+    #     WHERE
+    #       sst.user_id > #{lbound} AND sst.user_id <= #{ubound}
+    #     GROUP BY
+    #         DATE(sst.created_at)
+    #       , sst.user_id]
+    #
+    # To "window" a SQL query using dates:
+    #
+    etl.query %[
+      REPLACE INTO some_database.some_destination_table (
+          created_date
+        , user_id
+        , total_amount
+      ) SELECT
+          DATE(sst.created_at) AS created_date
+        , sst.user_id
+        , SUM(sst.amount) AS total_amount
+      FROM
+        some_database.some_source_table sst
+      WHERE
+        -- Note the usage of quotes surrounding the lbound and ubound vars.
+        -- This is is required when dealing with dates / datetimes
+        sst.created_at >= '#{lbound}' AND sst.created_at < '#{ubound}'
+      GROUP BY
+          DATE(sst.created_at)
+        , sst.user_id]
+    # Note that there is no sql sanitization here so there is *potential* for SQL
+    # injection. That being said you'll likely be using this gem in an internal
+    # tool so hopefully your co-workers are not looking to sabotage your ETL
+    # pipeline. Just be aware of this and handle it as you see fit.
+  end
+  etl.after_etl do |etl|
+    # All post-ETL work is performed in this block.
+    #
+    # Again, to finish up with an example:
+    #
+    etl.query %[
+      UPDATE some_database.some_destination_table
+      SET message = "WOW"
+      WHERE total_amount > 100]
+  end
+end
+etl.run
+puts %[
+ETL complete. Now go have a look at some_database.some_destination_table
+That was build from some_database.some_source_table using the above ETL configuration.
+SELECT * FROM some_database.some_destination_table;]

data/lib/etl.rb CHANGED

@@ -24,9 +24,21 @@ class ETL
     :stop
   ]
+  def self.connection= connection
+    @connection = connection
+  end
+  def self.connection
+    @connection
+  end
+  def self.defaults
+    {connection: @connection}
+  end
   def initialize attributes = {}
-    attributes.keys.uniq.each do |attribute|
-      self.send "#{attribute}=", attributes[attribute]
+    self.class.defaults.merge(attributes).each do |key, value|
+      self.send "#{key}=", value
     end
     default_logger! unless attributes.keys.include?(:logger)
   end

data/lib/etl/helpers.rb CHANGED

@@ -23,7 +23,7 @@ class ETL
         caster = ->(str) { "DATE(#{str})" }
       end
-      max_sql_clause = "COALESCE(MAX(#{table}.#{column}), #{default_value})"
+      max_sql_clause = "IFNULL(MAX(#{table}.#{column}), #{default_value})"
       max_sql_clause = caster.(max_sql_clause) if caster
       sql = <<-EOS

data/lib/etl/version.rb CHANGED

@@ -1,3 +1,3 @@
 class ETL
-  VERSION = "1.0.0"
+  VERSION = "1.1.0"
 end

data/spec/etl_spec.rb CHANGED

@@ -38,6 +38,25 @@ end
 describe ETL do
   let(:logger) { nil }
+  describe ".connection=" do
+    let(:class_level_connection) { stub('class_level_connection') }
+    it "sets the #connection for all instances" do
+      ETL.connection = class_level_connection
+      etl = ETL.new
+      expect(etl.connection).to eq class_level_connection
+    end
+    it "allows instance-level overrides" do
+      instance_level_connection = stub('instance_level_connection')
+      ETL.connection = class_level_connection
+      etl_with_connection_override = ETL.new connection: instance_level_connection
+      etl = ETL.new
+      expect(etl.connection).to eq class_level_connection
+      expect(etl_with_connection_override.connection).to eq instance_level_connection
+    end
+  end
   describe "#logger=" do
     let(:etl) { described_class.new connection: stub }

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: ETL
 version: !ruby/object:Gem::Version
-  version: 1.0.0
+  version: 1.1.0
   prerelease:
 platform: ruby
 authors:
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-03-14 00:00:00.000000000 Z
+date: 2013-06-13 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: activesupport
@@ -108,6 +108,8 @@ files:
 - README.md
 - Rakefile
 - etl.gemspec
+- examples/basic_etl.rb
+- examples/iterator_etl.rb
 - lib/etl.rb
 - lib/etl/helpers.rb
 - lib/etl/version.rb