RubyGems - redshifter - Versions diffs - 0.3.0 - Mend

redshifter 0.3.0

Files changed (27) hide show

checksums.yaml +7 -0
data/.gitignore +4 -0
data/.rspec +3 -0
data/.travis.yml +4 -0
data/Gemfile +4 -0
data/LICENSE.txt +21 -0
data/README.md +147 -0
data/Rakefile +6 -0
data/bin/console +14 -0
data/bin/setup +7 -0
data/lib/redshifter.rb +15 -0
data/lib/redshifter/config.rb +41 -0
data/lib/redshifter/extract_and_replace_redshift_table.rb +33 -0
data/lib/redshifter/extract_and_update_redshift_table.rb +33 -0
data/lib/redshifter/job/update_redshift_table_job.rb +15 -0
data/lib/redshifter/table.rb +109 -0
data/lib/redshifter/tasks.rb +25 -0
data/lib/redshifter/util/create_or_replace_table.rb +65 -0
data/lib/redshifter/util/extract_and_transform_updates.rb +81 -0
data/lib/redshifter/util/redshift.rb +18 -0
data/lib/redshifter/util/s3.rb +92 -0
data/lib/redshifter/util/s3_manifest_writer.rb +36 -0
data/lib/redshifter/util/table_config_validator.rb +65 -0
data/lib/redshifter/util/update_table.rb +88 -0
data/lib/redshifter/version.rb +3 -0
data/redshifter.gemspec +30 -0
metadata +180 -0

checksums.yaml ADDED Viewed

@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: 8760ba2ad72e5e668d9f3c4eb6b70d04d86b15e9
+  data.tar.gz: 8eb1c3fa50e558f9cfac14ae1d4c606754d0bdf7
+SHA512:
+  metadata.gz: d1e972e73eea10034797bf4713185a845e9f8f1c61227b1db480c355aeda7a232aa8c2caa50a2cdb3e95243ec94ece2a5acc35383cbb016f35bb6f63ba150099
+  data.tar.gz: 6ed7ad01c948ee874b558e0f13a1009249feb95b7365330557e3d5a6bd754426c47092fa3ff53788f5b92af4c53a2cdea0792ba82586428d3f46b4a97d33a685

data/.gitignore ADDED Viewed

@@ -0,0 +1,4 @@
+.rvmrc
+*.gem
+Gemfile.lock
+tmp

data/.rspec ADDED Viewed

@@ -0,0 +1,3 @@
+--format documentation
+--color
+--require 'spec_helper'

data/.travis.yml ADDED Viewed

@@ -0,0 +1,4 @@
+language: ruby
+rvm:
+ - 2.2.1
+ - 2.2.3

data/Gemfile ADDED Viewed

@@ -0,0 +1,4 @@
+source 'https://rubygems.org'
+# Specify your gem's dependencies in redshifter.gemspec
+gemspec

data/LICENSE.txt ADDED Viewed

@@ -0,0 +1,21 @@
+The MIT License (MIT)
+Copyright (c) 2015 Apartment List, Inc
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.

data/README.md ADDED Viewed

@@ -0,0 +1,147 @@
+# Redshifter
+Provides a resque job and rake tasks to facilitate ETL (Extract Transform Load) processing of Postgres tables for export to a Redshift cluster.
+Specifically provides:
+ 1) Create/Replace job to replace all model data in Redshift
+ 2) Update job updates all records created or updated since the last update ran
+Limitations:
+* deleted records are NOT synced to Redshift by the update job
+Feature Roadmap:
+* store last runtime locally instead of using potentially costly redshift query
+## Versions
+0.2.4 - New config format; update and replace rake tasks available
+0.3.0 - Public version
+## Installation
+Add this line to your application's Gemfile:
+```ruby
+gem 'redshifter'
+```
+And then execute:
+    $ bundle
+Or install it yourself as:
+    $ gem install redshifter
+## Usage
+### Setup Redshifter in a Rails initializer
+```ruby
+# config/initializers/redshifter.rb
+Redshifter.setup do |config|
+  # path in your app available for writing temp files
+  config.temp_directory_path = File.expand_path('../../tmp', __FILE__)
+  # path to redshifter table config
+  config.table_config_path = File.expand_path('../../redshifter.rb', __FILE__)
+  # redshift user should have access to create tables in the specified schema
+  config.redshift_username = 'your_app_user'
+  config.redshift_password = 'p@ssw0rd'
+  config.redshift_host = 'app.host.without.protocol.com'
+  config.redshift_port = 5439
+  config.redshift_database = 'database_name'
+  config.redshift_schema = 'a_schema'
+  # AWS user should be allowed full access to the specified bucket
+  config.aws_access_key_id = '<AWS user access key ID>'
+  config.aws_secret_access_key = '<AWS user secret access key>'
+  config.s3_bucket = 'a_redshifter_bucket'
+  # Heroku user must be a member of the app and have privileges to
+  # start new dynos
+  config.heroku_api_key = '<Heroku user api key>'
+  config.heroku_app_name = 'name of the app on heroku'
+end
+```
+### Require Redshifter tasks in your Rakefile
+```ruby
+# Rakefile
+# ...
+require 'redshifter/tasks'
+# ...
+```
+### Create a config file describing the tables, columns and transforms for export to redshift
+```ruby
+# config/redshifter.rb
+Redshifter.config.tables = {
+  'books_with_export_at' => {
+    # [required] Source *table* name, not the Rails model name
+    source_table_name: 'books',
+    # [required] Prefixing your redshift table with its source is recommended
+    redshift_table_name: 'app_name_books',
+    # [required] Columns with Redshift datatypes to create; may differ from source DB
+    redshift_columns: {
+      'id' => 'INTEGER',
+      'title' => 'VARCHAR(128)',
+      'published_at' => 'TIMESTAMP',
+      'updated_at' => 'TIMESTAMP',
+      'exported_at' => 'TIMESTAMP'
+    },
+    # [optional] SQL statements to transform or populate redshift columns from
+    # source DB. By default, redshift columns will be populated from source
+    # column with the same name. Column key must exist in redshift_columns.
+    # If a matching source column does not exist you MUST specify it here.
+    source_column_transforms: {
+      'title' => "lower(title)",
+      'published_at' => 'first_edition_published_at',
+      'exported_at' => 'now()'
+    },
+    # [required] valid values: KEY, EVEN, ALL
+    redshift_distribution_style: 'KEY',
+    # [required, if redshift_distribution_style: 'KEY'] distribution key column
+    # name MUST be present in redshift_columns.keys
+    redshift_distribution_key: 'id',
+    # [optional] valid values: COMPOUND, INTERLEAVED; If omitted the Redshift
+    # table DDL statement will not specify the sort style and Redshift will
+    # implicitly default to COMPOUND style.
+    redshift_sort_style: 'INTERLEAVED',
+    # [required] Column names MUST be present in redshift_columns.keys; Max
+    # length of 8 when using INTERLEAVED sort style, and 400 when using
+    # COMPOUND sort style.
+    redshift_sort_keys: ['published_at'],
+    # [optional] Used for query planning in Redshift
+    redshift_primary_key: 'id'
+  }
+}
+```
+### Run redshifter:replace rake task for each table you want to export
+```
+$ rake redshifter:replace[books_with_export_at]
+```
+### Schedule a Redshifter::Job::UpdateRedshiftTableJob resque job for each table you want to export updates for
+For example, add an entry like the following to `resque_schedule.yml` to run the update job once a day at 10:00pm
+```YAML
+# config/resque_schedule.yml
+etl_books_to_redshift:
+  cron: "0 22 * * *"
+  class: 'Redshifter::Job::UpdateRedshiftTableJob'
+  args: 'books_with_export_at'
+  description: 'Export the books table to Redshift'
+```
+## Development
+After checking out the repo, run `bin/setup` to install dependencies. Then, run `bin/console` for an interactive prompt that will allow you to experiment.
+To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release` to create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
+## Contributing
+1. Fork it ( https://github.com/[my-github-username]/redshifter/fork )
+2. Create your feature branch (`git checkout -b my-new-feature`)
+3. Commit your changes (`git commit -am 'Add some feature'`)
+4. Push to the branch (`git push origin my-new-feature`)
+5. Create a new Pull Request

data/Rakefile ADDED Viewed

@@ -0,0 +1,6 @@
+require "bundler/gem_tasks"
+require "rspec/core/rake_task"
+RSpec::Core::RakeTask.new(:spec)
+task default: :spec

data/bin/console ADDED Viewed

@@ -0,0 +1,14 @@
+#!/usr/bin/env ruby
+require "bundler/setup"
+require "redshifter"
+# You can add fixtures and/or initialization code here to make experimenting
+# with your gem easier. You can also use a different console, if you like.
+# (If you use this, don't forget to add pry to your Gemfile!)
+# require "pry"
+# Pry.start
+require "irb"
+IRB.start

data/bin/setup ADDED Viewed

@@ -0,0 +1,7 @@
+#!/bin/bash
+set -euo pipefail
+IFS=$'\n\t'
+bundle install
+# Do any other automated setup that you need to do here

data/lib/redshifter.rb ADDED Viewed

@@ -0,0 +1,15 @@
+require 'redshifter/version'
+require 'redshifter/config'
+require 'redshifter/table'
+require 'redshifter/util/table_config_validator'
+require 'redshifter/util/redshift'
+require 'redshifter/util/s3'
+require 'redshifter/util/extract_and_transform_updates'
+require 'redshifter/util/s3_manifest_writer'
+require 'redshifter/util/create_or_replace_table'
+require 'redshifter/util/update_table'
+require 'redshifter/extract_and_replace_redshift_table'
+require 'redshifter/extract_and_update_redshift_table'
+require 'redshifter/job/update_redshift_table_job'

data/lib/redshifter/config.rb ADDED Viewed

@@ -0,0 +1,41 @@
+module Redshifter
+  class << self
+    def setup
+      yield config if block_given?
+      load_export_table_definitions
+      config_dynosaur
+    end
+    def config
+      @config ||= Struct.new(:tables,
+                             :redshift_username,
+                             :redshift_password,
+                             :redshift_host,
+                             :redshift_port,
+                             :redshift_database,
+                             :redshift_schema,
+                             :aws_access_key_id,
+                             :aws_secret_access_key,
+                             :s3_bucket,
+                             :heroku_api_key,
+                             :heroku_app_name,
+                             :temp_directory_path,
+                             :table_config_path).new
+    end
+    private
+    def load_export_table_definitions
+      load config.table_config_path
+    end
+    def config_dynosaur
+      require 'dynosaur'
+      Dynosaur::Client::HerokuClient.configure do |config|
+        config.api_key = Redshifter.config.heroku_api_key
+        config.app_name = Redshifter.config.heroku_app_name
+      end
+    end
+  end
+end

data/lib/redshifter/extract_and_replace_redshift_table.rb ADDED Viewed

@@ -0,0 +1,33 @@
+module Redshifter
+  class ExtractAndReplaceRedshiftTable
+    def initialize(table, s3_util = Util::S3.new)
+      @table = table
+      @s3_util = s3_util
+    end
+    def run
+      extracted_s3_urls = Util::ExtractAndTransformUpdates
+        .new(table: table,
+             since: Table::EPOCH_TIMESTAMP,
+             s3_util: s3_util
+        ).run
+      if extracted_s3_urls.any?
+        manifest_url = Util::S3ManifestWriter
+          .new(file_name: "#{SecureRandom.uuid}.manifest",
+               file_urls: extracted_s3_urls,
+               s3_util: s3_util
+          ).run
+        Util::CreateOrReplaceTable
+          .new(table: table,
+               manifest_url: manifest_url
+          ).run
+      end
+    end
+    private
+    attr_reader :table, :s3_util
+  end
+end

data/lib/redshifter/extract_and_update_redshift_table.rb ADDED Viewed

@@ -0,0 +1,33 @@
+module Redshifter
+  class ExtractAndUpdateRedshiftTable
+    def initialize(table, s3_util = Util::S3.new)
+      @table = table
+      @s3_util = s3_util
+    end
+    def run
+      extracted_s3_urls = Util::ExtractAndTransformUpdates
+                            .new(table: table,
+                                 since: table.redshift_last_update,
+                                 s3_util: s3_util
+                            ).run
+      if extracted_s3_urls.any?
+        manifest_url = Util::S3ManifestWriter
+                         .new(file_name: "#{SecureRandom.uuid}.manifest",
+                              file_urls: extracted_s3_urls,
+                              s3_util: s3_util
+                         ).run
+        Util::UpdateTable
+          .new(table: table,
+               manifest_url: manifest_url
+          ).run
+      end
+    end
+    private
+    attr_reader :table, :s3_util
+  end
+end

data/lib/redshifter/job/update_redshift_table_job.rb ADDED Viewed

@@ -0,0 +1,15 @@
+require 'dynosaur'
+module Redshifter
+  module Job
+    class UpdateRedshiftTableJob
+      @queue = :low
+      def self.perform(table_config_key)
+        dyno = Dynosaur::Process::Heroku
+                 .new(task: 'redshifter:update', args: [table_config_key])
+        dyno.start
+      end
+    end
+  end
+end

data/lib/redshifter/table.rb ADDED Viewed

@@ -0,0 +1,109 @@
+module Redshifter
+  class Table
+    EPOCH_TIMESTAMP = '1970-01-01 00:00:00'
+    def initialize(config)
+      Util::TableConfigValidator.new(config).validate!
+      @source_table_name = config[:source_table_name]
+      @redshift_table_name = config[:redshift_table_name]
+      @redshift_columns = config[:redshift_columns]
+      @source_column_transforms = config[:source_column_transforms] || {}
+      @redshift_distribution_style = config[:redshift_distribution_style]
+      @redshift_distribution_key = config[:redshift_distribution_key]
+      @redshift_sort_keys = config[:redshift_sort_keys]
+      @redshift_sort_style = config[:redshift_sort_style]
+      @redshift_primary_key = config[:redshift_primary_key]
+    end
+    attr_reader :source_table_name, :redshift_table_name
+    def redshift_column_names
+      redshift_columns.keys
+    end
+    def source_column_statements
+      redshift_columns.keys.map do |v|
+        return v unless source_column_transforms
+        source_column_transforms[v] || v
+      end
+    end
+    def redshift_schema
+      Redshifter.config.redshift_schema
+    end
+    def redshift_table_ddl(table_name = redshift_table_name)
+      <<-QUERY.squish
+          CREATE TABLE
+            #{redshift_schema}.#{table_name}(
+            #{redshift_columns.map { |k, v| "#{k} #{v}" }.join(', ')}#{primary_key_statement}
+          )
+          #{dist_statement}
+          #{sortkey_statement};
+      QUERY
+    end
+    # returns unix epoch timestamp literal if table does not exist or table
+    # exist with zero rows.  Otherwise returns timestamp literal of most
+    # recently updated row in the analytics table
+    def redshift_last_update
+      conn = Util::Redshift.connect
+      table_presence_query = <<-QUERY.squish
+            SELECT EXISTS(
+              SELECT 1
+              FROM   information_schema.tables
+              WHERE  table_schema = '#{redshift_schema}'
+              AND    table_name = '#{redshift_table_name}')
+      QUERY
+      # Redshift does not allow pg catalog table and user table to be accessed
+      # in the same query
+      # http://docs.aws.amazon.com/redshift/latest/dg/c_sql-functions-leader-node.html
+      table_present = conn.exec(table_presence_query).getvalue(0, 0)
+      if table_present == 't'
+        conn.exec(
+          <<-QUERY.squish
+              SELECT COALESCE(MAX(updated_at), TIMESTAMP '#{EPOCH_TIMESTAMP}')
+              FROM #{redshift_schema}.#{redshift_table_name}
+          QUERY
+        ).getvalue(0, 0)
+      else
+        EPOCH_TIMESTAMP
+      end
+    end
+    private
+    attr_reader :redshift_columns,
+                :source_column_transforms,
+                :redshift_distribution_key,
+                :redshift_sort_keys,
+                :redshift_distribution_style,
+                :redshift_sort_style,
+                :redshift_primary_key
+    def dist_statement
+      output = "DISTSTYLE #{redshift_distribution_style}"
+      if redshift_distribution_style == 'KEY'
+        output << " DISTKEY(#{redshift_distribution_key})"
+      end
+      output
+    end
+    def sortkey_statement
+      output = ''
+      output << "#{redshift_sort_style} " if redshift_sort_style
+      output << "SORTKEY(#{redshift_sort_keys.join(', ')})"
+      output
+    end
+    def primary_key_statement
+      redshift_primary_key ? ", PRIMARY KEY(#{redshift_primary_key})" : ''
+    end
+  end
+end

data/lib/redshifter/tasks.rb ADDED Viewed

@@ -0,0 +1,25 @@
+module Redshifter
+  class Tasks
+    include Rake::DSL if defined? Rake::DSL
+    def install_tasks
+      namespace :redshifter do
+        desc 'Create or replace an extracted table in Redshift'
+        task :replace, [:table_config_key] => :environment do |_task, args|
+          table_config = Redshifter.config.tables[args[:table_config_key]]
+          table = Redshifter::Table.new(table_config)
+          Redshifter::ExtractAndReplaceRedshiftTable.new(table).run
+        end
+        desc 'Update an extracted table in Redshift'
+        task :update, [:table_config_key] => :environment do |_task, args|
+          table_config = Redshifter.config.tables[args[:table_config_key]]
+          table = Redshifter::Table.new(table_config)
+          Redshifter::ExtractAndUpdateRedshiftTable.new(table).run
+        end
+      end
+    end
+  end
+end
+Redshifter::Tasks.new.install_tasks

data/lib/redshifter/util/create_or_replace_table.rb ADDED Viewed

@@ -0,0 +1,65 @@
+module Redshifter
+  module Util
+    # Drops and recreates a table's Redshift table, loads it from an S3
+    # manifest and grants read access to the readonly group.
+    class CreateOrReplaceTable
+      # table        - supplies schema, table name, DDL and column names
+      # manifest_url - URL interpolated into the COPY ... FROM clause
+      def initialize(table:, manifest_url:)
+        @table = table
+        @manifest_url = manifest_url
+      end
+      # Executes both SQL strings, in order, inside conn.transaction.
+      # conn defaults to a new connection from Redshift.connect.
+      def run(conn = Redshift.connect)
+        conn.transaction do |within_transaction|
+          [
+            drop_and_create_table_sql,
+            grant_readonly_permissions_sql
+          ].each do |query|
+            within_transaction.exec(query)
+          end
+        end
+      end
+      private
+      attr_reader :table, :manifest_url
+      # creates or replaces an existing table from an s3 manifest of gzipped pipe
+      # delimited files.
+      #
+      # COPY is the most efficient way to load data into redshift. When COPY is
+      # run on a new table with zero rows it automatically runs statistics
+      # (ANALYZE) and sorts and distributes data (VACUUM). Using the manifest
+      # allows COPY command to run in parallel.
+      #
+      # SQL Notes:
+      #  CSV QUOTE AS '"' -- only way to define quote character for multiline
+      #                   -- column data (even though file is not comma separated)
+      #  DELIMITER '|'    -- this is the default separator, just being explicit
+      #  TIMEFORMAT 'YYYY-MM-DD HH:MI:SS'  -- being explicit instead of auto detect
+      def drop_and_create_table_sql
+        <<-QUERY.squish
+        DROP TABLE IF EXISTS #{table.redshift_schema}.#{table.redshift_table_name};
+        #{table.redshift_table_ddl}
+        COPY #{table.redshift_schema}.#{table.redshift_table_name}
+          (#{table.redshift_column_names.join(', ')})
+        FROM '#{manifest_url}'
+        CREDENTIALS 'aws_access_key_id=#{Redshifter.config.aws_access_key_id};aws_secret_access_key=#{Redshifter.config.aws_secret_access_key}'
+        CSV QUOTE AS '"'
+        DELIMITER '|'
+        GZIP
+        TIMEFORMAT 'YYYY-MM-DD HH:MI:SS'
+        NULL AS '#{ExtractAndTransformUpdates::NULL_CHARACTER}'
+        MANIFEST;
+        QUERY
+      end
+      # Grants SELECT and REFERENCES on the table to the group named readonly.
+      def grant_readonly_permissions_sql
+        <<-QUERY.squish
+        GRANT SELECT, REFERENCES
+          ON TABLE #{table.redshift_schema}.#{table.redshift_table_name}
+          TO GROUP readonly;
+        QUERY
+      end
+    end
+  end
+end

data/lib/redshifter/util/extract_and_transform_updates.rb ADDED Viewed

@@ -0,0 +1,81 @@
+require 'csv'
+module Redshifter
+  module Util
+    class  ExtractAndTransformUpdates
+      # Character used to represent a NULL value in the CSV. We cannot use an
+      # empty string to represent NULL, because then any tables with empty string
+      # values would have those converted to NULL in the Redshift tables. This
+      # needs to be a character we do not expect to appear as a value in any of
+      # the tables that are in the ETL process.
+      NULL_CHARACTER = '∅'
+      def initialize(table:, since:, s3_util:)
+        @table = table
+        @since = since
+        @s3_util = s3_util
+      end
+      # Writes pipe delimited 'CSV' files to S3 of updated records.
+      # Returns a list of internal s3 URLs created
+      def run(batch_size: 1000)
+        uploaded_s3_urls = []
+        run_name = SecureRandom.uuid
+        transform_in_batches(table.source_column_statements,
+                             batch_size: batch_size) do |rows, batch|
+          csv_rows = rows.map(&method(:csv_row))
+          uploaded_s3_urls << s3_util.upload_file(
+            file_name: "#{table.redshift_table_name}_updates_#{run_name}_#{batch}.txt",
+            body: csv_rows.join,
+            gzip: true)
+        end
+        uploaded_s3_urls
+      end
+      private
+      attr_reader :table, :since, :s3_util
+      def csv_row(row)
+        row.map! { |value| value.nil? ? NULL_CHARACTER : value }
+        CSV.generate_line(row, col_sep: '|')
+      end
+      def transform_in_batches(*column_transforms, batch_size: 1000)
+        # guarantee id is present in the first column for batch functionality
+        select_column_transforms = column_transforms.dup.unshift('id as id_for_batching')
+        batch_start_id = 1
+        batch_count = 0
+        loop do
+          rows = ActiveRecord::Base.connection_pool.with_connection do |conn|
+            conn.exec_query(select_batch_sql(columns: select_column_transforms,
+                                             batch_size: batch_size,
+                                             start_id: batch_start_id)
+            )
+          end.rows
+          break if rows.empty?
+          # get the id from the first column position where it was injected
+          last_id = rows.last[0].to_i
+          # remove the injected id from the first column
+          rows.map! { |row| row[1..-1] }
+          yield rows, batch_count
+          break if rows.size < batch_size
+          batch_start_id = last_id + 1
+          batch_count += 1
+        end
+      end
+      def select_batch_sql(columns:, batch_size:, start_id:)
+        "select #{columns.join(', ')} from #{table.source_table_name} where updated_at >= '#{since}' AND id >= #{start_id} ORDER BY id ASC limit #{batch_size}"
+      end
+    end
+  end
+end

data/lib/redshifter/util/redshift.rb ADDED Viewed

@@ -0,0 +1,18 @@
+module Redshifter
+  module Util
+    module Redshift
+      def self.connect
+        PG.connect(
+          {
+            host: Redshifter.config.redshift_host,
+            port: Redshifter.config.redshift_port,
+            dbname: Redshifter.config.redshift_database,
+            user: Redshifter.config.redshift_username,
+            password: Redshifter.config.redshift_password,
+            sslmode: 'require'
+          }
+        )
+      end
+    end
+  end
+end

data/lib/redshifter/util/s3.rb ADDED Viewed

@@ -0,0 +1,92 @@
+require 'fog'
+require 'tmpdir'
+module Redshifter
+  module Util
+    # Uploads content to the configured S3 bucket by first writing it to a
+    # temporary file (optionally gzipped) under the configured temp directory.
+    class S3
+      # file_name - base name of the temp file (".gz" is appended when gzip)
+      # body      - content to write
+      # gzip      - when true the content is gzip-compressed before upload
+      # Returns the "s3://bucket/key" URL of the created file.
+      def upload_file(file_name:, body:, gzip: false)
+        s3_url = ''
+        # the temp dir is created inside the app temp directory
+        Dir.mktmpdir('redshifter', ensure_app_tmp_directory) do |temp_dir|
+          temp_file = write_temp_file(File.join(temp_dir, file_name), body, gzip)
+          File.open(temp_file) do |file|
+            s3_file = bucket.files.create(file_options(file, gzip))
+            s3_url = internal_file_url(s3_file.key)
+          end
+        end
+        s3_url
+      end
+      private
+      def internal_file_url(file_name)
+        "s3://#{bucket_name}/#{file_name}"
+      end
+      # Memoized Fog storage connection built from the configured AWS keys.
+      def conn
+        @conn ||= Fog::Storage.new(
+          provider: 'AWS',
+          aws_access_key_id: Redshifter.config.aws_access_key_id,
+          aws_secret_access_key: Redshifter.config.aws_secret_access_key
+        )
+      end
+      # Memoized bucket lookup.
+      def bucket
+        @bucket ||= conn.directories.get(bucket_name)
+      end
+      def bucket_name
+        Redshifter.config.s3_bucket
+      end
+      # Options passed to bucket.files.create. The key is the temp file's base
+      # name; the gzip-specific options are merged in only when gzip is truthy.
+      def file_options(file, gzip)
+        s3_file_options = {
+          key: File.basename(file),
+          body: file,
+          public: false
+        }
+        s3_file_options.merge!(
+          content_encoding: 'ASCII-8BIT',
+          compression_mime_type: 'application/x-gzip'
+        ) if gzip
+        s3_file_options
+      end
+      # Creates the configured temp directory when missing and returns its path.
+      def ensure_app_tmp_directory
+        tmp_path = Redshifter.config.temp_directory_path
+        Dir.mkdir(tmp_path) unless File.directory?(tmp_path)
+        tmp_path
+      end
+      # Dispatches to the gzipped or plain writer. Both return the value of
+      # their File.open block, i.e. the File object they wrote to.
+      def write_temp_file(file_path, body, gzip)
+        if gzip
+          write_temp_gzipped_file(file_path, body)
+        else
+          write_temp_uncompressed_file(file_path, body)
+        end
+      end
+      def write_temp_uncompressed_file(file_path, body)
+        File.open(file_path, 'w') do |file|
+          file << body
+          file
+        end
+      end
+      # Writes body gzip-compressed to "<file_path>.gz".
+      def write_temp_gzipped_file(file_path, body)
+        gz_file_path = file_path + '.gz'
+        File.open(gz_file_path, 'w', encoding: 'ASCII-8BIT') do |compressed_file|
+          compressed_file.sync = true
+          gzip = Zlib::GzipWriter.new(compressed_file)
+          gzip.write(body)
+          gzip.close # Important: Without this call, gzip headers won't be written
+          compressed_file
+        end
+      end
+    end
+  end
+end

data/lib/redshifter/util/s3_manifest_writer.rb ADDED Viewed

@@ -0,0 +1,36 @@
+module Redshifter
+  module Util
+    class S3ManifestWriter
+      def initialize(file_name:, file_urls:, s3_util:)
+        @file_name = file_name
+        @file_urls = file_urls
+        @s3_util = s3_util
+      end
+      # Uploads a s3 manifest file that requires all files be processed with
+      # mandatory: true attribute per file.
+      # Returns internal s3 URL for the manifest
+      def run
+        s3_util.upload_file(
+          file_name: file_name,
+          body: JSON.generate(manifest),
+          gzip: false
+        )
+      end
+      private
+      attr_reader :file_name, :file_urls, :s3_util
+      def manifest
+        { entries: manifest_files }
+      end
+      def manifest_files
+        file_urls.map do |child_url|
+          { url: child_url, mandatory: true }
+        end
+      end
+    end
+  end
+end

data/lib/redshifter/util/table_config_validator.rb ADDED Viewed

@@ -0,0 +1,65 @@
+module Redshifter
+  module Util
+    class TableConfigValidator
+      REQUIRED_KEYS = [:source_table_name,
+                       :redshift_table_name,
+                       :redshift_columns,
+                       :redshift_distribution_style,
+                       :redshift_sort_keys]
+      # hash format: { required_key: { when_this_key_and_value: 'present'} }
+      CONDITIONALLY_REQUIRED_KEYS = {
+        redshift_distribution_key: { redshift_distribution_style: 'KEY' }
+      }
+      # Use a nil value when the key is optional, but has allowable values
+      # when it is present. e.g. {optional_key: ['valid1', 'valid2', nil]}
+      ALLOWABLE_VALUES = {
+        redshift_distribution_style: ['KEY', 'ALL', 'EVEN'],
+        redshift_sort_style: ['COMPOUND', 'INTERLEAVED', nil]
+      }
+      def initialize(config)
+        @config = config
+      end
+      def validate!
+        raise 'invalid table config' unless valid?
+      end
+      def valid?
+        all_required_keys_present? &&
+          all_conditionally_required_keys_present? &&
+          all_validated_values_allowed?
+      end
+      private
+      attr_reader :config
+      def all_required_keys_present?
+        ((REQUIRED_KEYS & config.keys) == REQUIRED_KEYS)
+      end
+      def all_conditionally_required_keys_present?
+        conditional_keys_presence = []
+        CONDITIONALLY_REQUIRED_KEYS.each do |required_k, condition_kv|
+          if (config.select { |k, v| k == condition_kv.keys.first } == condition_kv)
+            conditional_keys_presence << config.include?(required_k)
+          end
+        end
+        conditional_keys_presence.all?
+      end
+      def all_validated_values_allowed?
+        allowed_values_presence = []
+        ALLOWABLE_VALUES.each do |k, v|
+          allowed_values_presence << v.include?(config[k])
+        end
+        allowed_values_presence.all?
+      end
+    end
+  end
+end

data/lib/redshifter/util/update_table.rb ADDED Viewed

@@ -0,0 +1,88 @@
+module Redshifter
+  module Util
+    # Merges rows from an S3 manifest into an existing Redshift table by
+    # staging them in a temp table, replacing matching ids, and re-analyzing.
+    class UpdateTable
+      # table        - supplies schema, table name, DDL and column names
+      # manifest_url - URL interpolated into the COPY ... FROM clause
+      def initialize(table:, manifest_url:)
+        @table = table
+        @manifest_url = manifest_url
+      end
+      # Executes the four SQL strings, in order, inside conn.transaction.
+      # conn defaults to a new connection from Redshift.connect.
+      def run(conn = Redshift.connect)
+        conn.transaction do |within_transaction|
+          [
+            create_and_load_temp_table_sql,
+            upsert_changes_sql,
+            cleanup_temp_table_sql,
+            analyze_updated_table_sql
+          ].each do |query|
+            within_transaction.exec(query)
+          end
+        end
+      end
+      private
+      attr_reader :table, :manifest_url
+      # Name of the staging table: the destination table name plus "_temp".
+      def analytics_temp_table
+        "#{table.redshift_table_name}_temp"
+      end
+      # creates a temp table from original table DDL because it's the
+      # most efficient way to get a table that most closely mimics the destination.
+      # http://docs.aws.amazon.com/redshift/latest/dg/performing-a-deep-copy.html
+      #
+      # Imports rows from an s3 manifest of gzipped pipe delimited files.
+      # Using the manifest allows COPY command to run in parallel.
+      #
+      # SQL Notes:
+      #  CSV QUOTE AS '"' -- only way to define quote character for multiline
+      #                   -- column data (even though file is not comma separated)
+      #  DELIMITER '|'    -- this is the default separator, just being explicit
+      #  TIMEFORMAT 'YYYY-MM-DD HH:MI:SS'  -- being explicit instead of auto detect
+      #  compupdate off   -- save time by not compressing columns; it's temporary
+      #  statupdate off   -- save time by not running statistics; it's temporary
+      def create_and_load_temp_table_sql
+        <<-QUERY.squish
+        #{table.redshift_table_ddl(analytics_temp_table)}
+        COPY #{table.redshift_schema}.#{analytics_temp_table}
+          (#{table.redshift_column_names.join(', ')})
+        FROM '#{manifest_url}'
+        CREDENTIALS 'aws_access_key_id=#{Redshifter.config.aws_access_key_id};aws_secret_access_key=#{Redshifter.config.aws_secret_access_key}'
+        CSV QUOTE AS '"'
+        DELIMITER '|'
+        GZIP
+        TIMEFORMAT 'YYYY-MM-DD HH:MI:SS'
+        NULL AS '#{ExtractAndTransformUpdates::NULL_CHARACTER}'
+        MANIFEST
+        compupdate off
+        statupdate off;
+        QUERY
+      end
+      # Replaces existing rows with updated row in a single transaction using AWS
+      # recommended method
+      # http://docs.aws.amazon.com/redshift/latest/dg/merge-replacing-existing-rows.html
+      #
+      # Rows are matched on the id column of both tables.
+      def upsert_changes_sql
+        <<-QUERY.squish
+        DELETE FROM #{table.redshift_schema}.#{table.redshift_table_name}
+        USING #{table.redshift_schema}.#{analytics_temp_table}
+        WHERE #{table.redshift_schema}.#{table.redshift_table_name}.id
+              = #{table.redshift_schema}.#{analytics_temp_table}.id;
+        INSERT INTO #{table.redshift_schema}.#{table.redshift_table_name}
+          SELECT * FROM #{table.redshift_schema}.#{analytics_temp_table};
+        QUERY
+      end
+      # Drops the staging table once its rows have been merged.
+      def cleanup_temp_table_sql
+        "drop table #{table.redshift_schema}.#{analytics_temp_table};"
+      end
+      # analyze recomputes table statistics for efficient querying after change
+      def analyze_updated_table_sql
+        "analyze #{table.redshift_schema}.#{table.redshift_table_name};"
+      end
+    end
+  end
+end

data/lib/redshifter/version.rb ADDED Viewed

@@ -0,0 +1,3 @@
+# Defines the released version of the redshifter gem.
+module Redshifter
+  # Version string; redshifter.gemspec assigns it to spec.version.
+  VERSION = "0.3.0"
+end

data/redshifter.gemspec ADDED Viewed

@@ -0,0 +1,30 @@
+# coding: utf-8
+lib = File.expand_path('../lib', __FILE__)
+$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+require 'redshifter/version'
+Gem::Specification.new do |spec|
+  spec.name          = 'redshifter'
+  spec.version       = Redshifter::VERSION
+  spec.authors       = ['Justin Richard']
+  spec.email         = ['justin@apartmentlist.com']
+  spec.summary       = %q{ETL processing jobs to exporting Rails model tables to Redshift}
+  spec.homepage      = 'https://github.com/apartmentlist/redshifter'
+  spec.files         = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
+  spec.require_paths = ['lib']
+  spec.add_runtime_dependency 'dynosaur', '~> 0'
+  spec.add_runtime_dependency 'fog', '~> 1.36.0'
+  # mime-types now an explicit dependency of fog-core >=1.35.0
+  # fog 1.36.0 has a loose dependency on fog-core "~> 1.32" that causes this
+  # dependency change to bubble up to redshifter
+  spec.add_runtime_dependency 'mime-types'
+  spec.add_runtime_dependency 'pg', '~> 0.18'
+  spec.add_development_dependency 'bundler'
+  spec.add_development_dependency 'rake', '~> 10.0'
+  spec.add_development_dependency 'pry-byebug', '~> 3'
+  spec.add_development_dependency 'rspec', '~> 3.3'
+end

metadata ADDED Viewed

@@ -0,0 +1,180 @@
+--- !ruby/object:Gem::Specification
+name: redshifter
+version: !ruby/object:Gem::Version
+  version: 0.3.0
+platform: ruby
+authors:
+- Justin Richard
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2015-12-04 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: dynosaur
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: fog
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 1.36.0
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 1.36.0
+- !ruby/object:Gem::Dependency
+  name: mime-types
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: pg
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0.18'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0.18'
+- !ruby/object:Gem::Dependency
+  name: bundler
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: rake
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '10.0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '10.0'
+- !ruby/object:Gem::Dependency
+  name: pry-byebug
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '3'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '3'
+- !ruby/object:Gem::Dependency
+  name: rspec
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '3.3'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '3.3'
+description:
+email:
+- justin@apartmentlist.com
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- ".gitignore"
+- ".rspec"
+- ".travis.yml"
+- Gemfile
+- LICENSE.txt
+- README.md
+- Rakefile
+- bin/console
+- bin/setup
+- lib/redshifter.rb
+- lib/redshifter/config.rb
+- lib/redshifter/extract_and_replace_redshift_table.rb
+- lib/redshifter/extract_and_update_redshift_table.rb
+- lib/redshifter/job/update_redshift_table_job.rb
+- lib/redshifter/table.rb
+- lib/redshifter/tasks.rb
+- lib/redshifter/util/create_or_replace_table.rb
+- lib/redshifter/util/extract_and_transform_updates.rb
+- lib/redshifter/util/redshift.rb
+- lib/redshifter/util/s3.rb
+- lib/redshifter/util/s3_manifest_writer.rb
+- lib/redshifter/util/table_config_validator.rb
+- lib/redshifter/util/update_table.rb
+- lib/redshifter/version.rb
+- redshifter.gemspec
+homepage: https://github.com/apartmentlist/redshifter
+licenses: []
+metadata: {}
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project:
+rubygems_version: 2.4.5
+signing_key:
+specification_version: 4
+summary: ETL processing jobs to exporting Rails model tables to Redshift
+test_files: []