dataduck 0.2.0

checksums.yaml.gz ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA1:
+   metadata.gz: 0f87bbaf674b1943242d3ea173a5e34fe00e0724
+   data.tar.gz: b8fbacadd9323ab917498712c8d4f39f1f5ca907
+ SHA512:
+   metadata.gz: 40bbfce9c990d1542236c59967c31fe3fe5982c84bed12ccaf604c7ce15f2cebc5432b865dfedac5e95607dc37f20e0d681462ee9e7936e30ffdce8361688c96
+   data.tar.gz: fdc25e1ddf3a00faeceb13f11f4b7452f4085b3c0e5ca805137cc21174727aabec052dd335c371cd21c78db7ca0af7ef4e5a93f2876df9abfff531ad90bc8612
data/.gitignore ADDED
@@ -0,0 +1,14 @@
+ /.bundle/
+ /.yardoc
+ /Gemfile.lock
+ /_yardoc/
+ /coverage/
+ /doc/
+ /pkg/
+ /spec/reports/
+ /tmp/
+ log
+ .DS_Store
+ *.lock
+ .idea
+ *.gem
data/.rspec ADDED
@@ -0,0 +1,2 @@
+ --format documentation
+ --color
data/.ruby-version ADDED
@@ -0,0 +1 @@
+ ruby-2.1.2
data/Gemfile ADDED
@@ -0,0 +1,4 @@
+ source 'https://rubygems.org'
+
+ # Specify your gem's dependencies in dataduck.gemspec
+ gemspec
data/README.md ADDED
@@ -0,0 +1,87 @@
+ # DataDuck ETL
+
+ ##### Set up in under 5 minutes
+
+ DataDuck ETL is probably the quickest extract-transform-load framework to set up. If you want to set up a data warehouse, give DataDuck ETL a try.
+
+ ##### Extract-transform-load to Amazon Redshift
+
+ DataDuck ETL is currently focused on loading to Amazon Redshift (through Amazon S3).
+
+ ![DataDuck ETL](static/logo.png "DataDuck ETL")
+
+ ## Installation
+
+ ##### Example project
+
+ See [https://github.com/DataDuckETL/DataDuck/tree/master/examples/example](https://github.com/DataDuckETL/DataDuck/tree/master/examples/example) for an example project setup.
+
+ ##### Instructions for using DataDuck ETL
+
+ Create a new project, then add the following to your Gemfile:
+
+ ```ruby
+ gem 'dataduck', :git => 'git://github.com/DataDuckETL/DataDuck.git'
+ ```
+
+ Then execute:
+
+     $ bundle install
+
+ Finally, run the quickstart command:
+
+     $ dataduck quickstart
+
+ The quickstart wizard will ask you for credentials to your database, then create the basic setup for your project. After the setup, run your project's ETL with `ruby src/main.rb`.
+
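For reference, here's a minimal sketch of a working `src/main.rb`, mirroring the example project bundled with this gem (the `MyCompanyETL` class name and `:main_destination` destination name are placeholders to adapt):

```ruby
require 'rubygems'
require 'bundler/setup'
Bundler.require

# DataDuck::ETL autoloads any table classes it finds in src/tables/,
# so tables only need to be defined in that directory.
class MyCompanyETL < DataDuck::ETL
  # Must match a destination name in config/secret/<environment>.yml
  destination :main_destination
end

etl = MyCompanyETL.new
etl.process!
```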
+ If you'd like to run this regularly, such as every night, it's recommended to use the [whenever](https://github.com/javan/whenever) gem to manage a cron job that runs the ETL.
+
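As a sketch (assuming the whenever gem is installed, and with `/path/to/your/project` as a placeholder for your project directory), the corresponding `config/schedule.rb` could look like:

```ruby
# config/schedule.rb -- run `whenever --update-crontab` to install the cron entry.
every 1.day, at: '2:00 am' do
  command "cd /path/to/your/project && bundle exec ruby src/main.rb"
end
```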
+ ## Documentation
+
+ Tables are defined in their own file under /src/tables. Here's an example table:
+
+ ```ruby
+ class Decks < DataDuck::Table
+   source :my_database, ["id", "name", "user_id", "cards",
+       "num_wins", "num_losses", "created_at", "updated_at",
+       "is_drafted", "num_draft_wins", "num_draft_losses"]
+
+   transforms :calculate_num_totals
+
+   validates :validates_num_total
+
+   output({
+     :id => :integer,
+     :name => :string,
+     :user_id => :integer,
+     :num_wins => :integer,
+     :num_losses => :integer,
+     :num_total => :integer,
+     :num_draft_total => :integer,
+     :created_at => :datetime,
+     :updated_at => :datetime,
+     :is_drafted => :boolean,
+     # Note that num_draft_wins and num_draft_losses
+     # are not included in the output, but are used in
+     # the transformation.
+   })
+
+   def calculate_num_totals(row)
+     row[:num_total] = row[:num_wins] + row[:num_losses]
+     row[:num_draft_total] = row[:num_draft_wins] + row[:num_draft_losses]
+     row
+   end
+
+   def validates_num_total(row)
+     return "Deck id #{ row[:id] } has negative value #{ row[:num_total] } for num_total." if row[:num_total] < 0
+   end
+ end
+ ```
+
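A note on the method contracts above: a transform receives a row hash and must return it (the returned value replaces the row), while a validation returns an error string for a bad row and `nil` otherwise. A standalone sketch with a hypothetical row:

```ruby
# Hypothetical row as extracted from the source.
row = { id: 7, num_wins: 3, num_losses: 2, num_draft_wins: 1, num_draft_losses: 0 }

# Transform: columns omitted from the output schema (num_draft_wins,
# num_draft_losses) are still readable here; they just won't be loaded.
row[:num_total] = row[:num_wins] + row[:num_losses]                    # => 5
row[:num_draft_total] = row[:num_draft_wins] + row[:num_draft_losses] # => 1

# Validation: any truthy return value is collected as an error.
error = row[:num_total] < 0 ? "num_total is negative" : nil           # => nil
```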
+ ## Contributing
+
+ To contribute, get in touch at http://DataDuckETL.com/ so that we can share the [Contributor License Agreement (CLA)](https://en.wikipedia.org/wiki/Contributor_License_Agreement) with you, then create a pull request.
+
+ ## License
+
+ Get in touch or visit [http://dataducketl.com/licensing](http://dataducketl.com/licensing) for licensing details.
data/Rakefile ADDED
@@ -0,0 +1,6 @@
+ require "bundler/gem_tasks"
+ require "rspec/core/rake_task"
+
+ RSpec::Core::RakeTask.new(:spec)
+
+ task :default => :spec
data/bin/console ADDED
@@ -0,0 +1,14 @@
+ #!/usr/bin/env ruby
+
+ require "bundler/setup"
+ require "dataduck"
+
+ # You can add fixtures and/or initialization code here to make experimenting
+ # with your gem easier. You can also use a different console, if you like.
+
+ # (If you use this, don't forget to add pry to your Gemfile!)
+ # require "pry"
+ # Pry.start
+
+ require "irb"
+ IRB.start
data/bin/dataduck ADDED
@@ -0,0 +1,6 @@
+ #!/usr/bin/env ruby
+
+ require_relative "../lib/dataduck"
+ require_relative "../lib/dataduck/commands"
+
+ DataDuck::Commands.route_command(ARGV)
data/bin/setup ADDED
@@ -0,0 +1,7 @@
+ #!/bin/bash
+ set -euo pipefail
+ IFS=$'\n\t'
+
+ bundle install
+
+ # Do any other automated setup that you need to do here
data/dataduck.gemspec ADDED
@@ -0,0 +1,28 @@
+ # coding: utf-8
+ lib = File.expand_path('../lib', __FILE__)
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+ require 'dataduck/version'
+
+ Gem::Specification.new do |spec|
+   spec.authors = ["Jeff Pickhardt"]
+   spec.description = "A straightforward, effective ETL framework."
+   spec.email = ["pickhardt@gmail.com", "admin@dataducketl.com"]
+   spec.executables = ["dataduck"]
+   spec.homepage = "http://dataducketl.com/"
+   spec.name = "dataduck"
+   spec.summary = "A straightforward, effective ETL framework."
+   spec.version = DataDuck::VERSION
+
+   spec.files = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
+   spec.bindir = "bin"
+   spec.require_paths = ["lib"]
+
+   spec.add_development_dependency "bundler", "~> 1.6"
+   spec.add_development_dependency "rake", "~> 10.0"
+   spec.add_development_dependency "rspec", "~> 3.3"
+
+   spec.add_runtime_dependency "sequel", '~> 4.19'
+   spec.add_runtime_dependency "pg", '~> 0.16'
+   spec.add_runtime_dependency "aws-sdk", "~> 2.0"
+   spec.add_runtime_dependency "sequel-redshift"
+ end
data/examples/example/.gitignore ADDED
@@ -0,0 +1,5 @@
+ .idea
+ .DS_Store
+ config/secrets.yml
+ config/secret
+ *.lock
data/examples/example/.ruby-version ADDED
@@ -0,0 +1 @@
+ ruby-2.1.2
data/examples/example/Gemfile ADDED
@@ -0,0 +1,5 @@
+ source 'https://rubygems.org'
+
+ gem 'dataduck', :git => 'git://github.com/DataDuckETL/DataDuck.git'
+
+ ruby '2.1.2'
data/examples/example/README.md ADDED
@@ -0,0 +1,11 @@
+ # DataDuck ETL Example
+
+ This is an example project showing how to set up [DataDuck ETL](http://dataducketl.com/).
+
+ # Instructions
+
+ Copy /config/replace_me.yml to /config/secret/development.yml, then replace the placeholder secrets with your AWS and database connection details.
+
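In shell terms, from the project root that's (a sketch; adjust the target filename for your environment):

    $ mkdir -p config/secret
    $ cp config/replace_me.yml config/secret/development.yml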
+ For each table you want to import, create a table file in /src/tables. You can use /src/tables/games.rb and /src/tables/users.rb as examples. (Be sure to delete, modify, or rename games.rb and users.rb, though; otherwise DataDuck ETL will try to load them.)
+
+ For further help, reach out at [http://dataducketl.com/](http://dataducketl.com/).
data/examples/example/config/replace_me.yml ADDED
@@ -0,0 +1,22 @@
+ # Move this file to /config/secret/development.yml and /config/secret/production.yml
+ destinations:
+   main_destination:
+     type: redshift
+     aws_key: YOUR_AWS_KEY
+     aws_secret: YOUR_AWS_SECRET
+     s3_bucket: YOUR_BUCKET
+     s3_region: YOUR_BUCKET_REGION
+     host: redshift.somekeygoeshere.us-west-2.redshift.amazonaws.com
+     port: 5439
+     database: main
+     schema: public
+     username: YOUR_USERNAME
+     password: YOUR_PASSWORD
+ sources:
+   my_database:
+     type: postgresql
+     host: some.host.goes.here.com
+     database: db_name_goes_here
+     port: 5522
+     username: some_username_goes_here_probably_read_only
+     password: some_password_goes_here
data/examples/example/src/main.rb ADDED
@@ -0,0 +1,13 @@
+ require 'rubygems'
+ require 'bundler/setup'
+ Bundler.require
+
+ require_relative "tables/games"
+ require_relative "tables/users"
+
+ class MyCompanyETL < DataDuck::ETL
+   destination :main_destination
+ end
+
+ etl = MyCompanyETL.new
+ etl.process!
data/examples/example/src/tables/games.rb ADDED
@@ -0,0 +1,10 @@
+ class Games < DataDuck::Table
+   source :my_database, [:id, :first_user_id, :second_user_id, :game_type]
+
+   output({
+     :id => :integer,
+     :first_user_id => :integer,
+     :second_user_id => :integer,
+     :game_type => :string,
+   })
+ end
data/examples/example/src/tables/users.rb ADDED
@@ -0,0 +1,16 @@
+ class Users < DataDuck::Table
+   source :my_database, [:id, :username, :rating, :credits]
+
+   validate :non_negative_credits
+
+   output({
+     :id => :integer,
+     :username => :string,
+     :rating => :integer,
+     :credits => :integer,
+   })
+
+   def non_negative_credits(row)
+     return "User id #{ row[:id] } has negative value of #{ row[:credits] } for credits." if row[:credits] < 0
+   end
+ end
data/lib/dataduck.rb ADDED
@@ -0,0 +1,29 @@
+ Dir[File.dirname(__FILE__) + '/helpers/*.rb'].each do |file|
+   require file
+ end
+
+ Dir[File.dirname(__FILE__) + '/dataduck/*.rb'].each do |file|
+   require file
+ end
+
+ require 'yaml'
+
+ module DataDuck
+   extend ModuleVars
+
+   ENV['DATADUCK_ENV'] ||= "development"
+   create_module_var("environment", ENV['DATADUCK_ENV'])
+
+   spec = Gem::Specification.find_by_name("dataduck")
+   create_module_var("gem_root", spec.gem_dir)
+
+   create_module_var("project_root", Dir.getwd)
+   create_module_var("config", {})
+
+   dd_env_path = DataDuck.project_root + "/config/secret/#{ ENV['DATADUCK_ENV'] }.yml"
+   env_config = File.exist?(dd_env_path) ? YAML.load_file(dd_env_path) : {}
+   DataDuck.config.merge!(env_config)
+
+   create_module_var("sources", {})
+   create_module_var("destinations", {})
+ end
data/lib/dataduck/commands.rb ADDED
@@ -0,0 +1,166 @@
+ require 'erb'
+ require 'yaml'
+ require 'fileutils'
+ require 'io/console' # provides IO#noecho, used when reading the password
+
+ module DataDuck
+   class Commands
+     class Namespace
+       def initialize(hash = {})
+         hash.each do |key, value|
+           singleton_class.send(:define_method, key) { value }
+         end
+       end
+
+       def get_binding
+         binding
+       end
+     end
+
+     def self.acceptable_commands
+       ['console', 'quickstart']
+     end
+
+     def self.route_command(args)
+       if args.length == 0
+         return DataDuck::Commands.help
+       end
+
+       command = args[0]
+       if !Commands.acceptable_commands.include?(command)
+         puts "No such command: #{ command }"
+         return DataDuck::Commands.help
+       end
+
+       DataDuck::Commands.public_send(command)
+     end
+
+     def self.console
+       require "irb"
+       IRB.start
+     end
+
+     def self.help
+       puts "Usage: dataduck [#{ DataDuck::Commands.acceptable_commands.join(' | ') }]"
+     end
+
+     def self.quickstart
+       puts "Welcome to DataDuck!"
+       puts "This quickstart wizard will create your application, assuming the source is a Postgres database and the destination is an Amazon Redshift data warehouse."
+
+       puts "Enter the source (Postgres database) hostname:"
+       source_host = STDIN.gets.strip
+
+       puts "Enter the name of the database when connecting to #{ source_host }:"
+       source_database = STDIN.gets.strip
+
+       puts "Enter the source's port:"
+       source_port = STDIN.gets.strip.to_i
+
+       puts "Enter the username:"
+       source_username = STDIN.gets.strip
+
+       puts "Enter the password:"
+       source_password = STDIN.noecho(&:gets).chomp
+
+       db_source = DataDuck::PostgresqlSource.new({
+         'type' => 'postgresql',
+         'host' => source_host,
+         'database' => source_database,
+         'port' => source_port,
+         'username' => source_username,
+         'password' => source_password,
+       })
+
+       puts "Connecting to source database..."
+       table_names = db_source.table_names
+       puts "Connection successful. Detected #{ table_names.length } tables."
+       puts "Creating scaffolding..."
+       table_names.each do |table_name|
+         DataDuck::Commands.quickstart_create_table(table_name, db_source)
+       end
+
+       config_obj = {
+         'sources' => {
+           'my_database' => {
+             'type' => 'postgresql',
+             'host' => source_host,
+             'database' => source_database,
+             'port' => source_port,
+             'username' => source_username,
+             'password' => source_password,
+           }
+         },
+         'destinations' => {
+           'my_destination' => {
+             'type' => 'redshift',
+             'aws_key' => 'YOUR_AWS_KEY',
+             'aws_secret' => 'YOUR_AWS_SECRET',
+             's3_bucket' => 'YOUR_BUCKET',
+             's3_region' => 'YOUR_BUCKET_REGION',
+             'host' => 'redshift.somekeygoeshere.us-west-2.redshift.amazonaws.com',
+             'port' => 5439,
+             'database' => 'main',
+             'schema' => 'public',
+             'username' => 'YOUR_USERNAME',
+             'password' => 'YOUR_PASSWORD',
+           }
+         }
+       }
+
+       DataDuck::Commands.quickstart_save_file("#{ DataDuck.project_root }/config/secret/#{ DataDuck.environment }.yml", config_obj.to_yaml)
+       DataDuck::Commands.quickstart_save_main
+       DataDuck::Commands.quickstart_update_gitignore
+
+       puts "Quickstart complete!"
+       puts "You still need to edit your config/secret/*.yml file with your AWS and Redshift credentials."
+       puts "Run your ETL with: ruby src/main.rb"
+     end
+
+     def self.quickstart_update_gitignore
+       main_gitignore_path = "#{ DataDuck.project_root }/.gitignore"
+       FileUtils.touch(main_gitignore_path)
+
+       secret_gitignore_path = "#{ DataDuck.project_root }/config/secret/.gitignore"
+       FileUtils.touch(secret_gitignore_path)
+       output = File.open(secret_gitignore_path, "w")
+       output << '[^.]*'
+       output.close
+     end
+
+     def self.quickstart_create_table(table_name, db)
+       columns = []
+       schema = db.schema(table_name)
+       schema.each do |property_schema|
+         property_name = property_schema[0]
+         property_type = property_schema[1][:type]
+         commented_out = ['ssn', 'socialsecurity', 'password', 'encrypted_password', 'salt', 'password_salt', 'pw'].include?(property_name.to_s.downcase)
+         columns << [property_name.to_s, property_type.to_s, commented_out]
+       end
+
+       table_name = table_name.to_s.downcase
+       table_name_camelcased = table_name.split('_').collect(&:capitalize).join
+       namespace = Namespace.new(table_name: table_name_camelcased, columns: columns)
+       template = File.read("#{ DataDuck.gem_root }/lib/templates/quickstart/table.rb.erb")
+       result = ERB.new(template).result(namespace.get_binding)
+       DataDuck::Commands.quickstart_save_file("#{ DataDuck.project_root }/src/tables/#{ table_name }.rb", result)
+     end
+
+     def self.quickstart_save_file(output_path_full, contents)
+       *output_path, output_filename = output_path_full.split('/')
+       output_path = output_path.join("/")
+       FileUtils::mkdir_p(output_path)
+
+       output = File.open(output_path_full, "w")
+       output << contents
+       output.close
+     end
+
+     def self.quickstart_save_main
+       namespace = Namespace.new
+       template = File.read("#{ DataDuck.gem_root }/lib/templates/quickstart/main.rb.erb")
+       result = ERB.new(template).result(namespace.get_binding)
+       DataDuck::Commands.quickstart_save_file("#{ DataDuck.project_root }/src/main.rb", result)
+     end
+   end
+ end
data/lib/dataduck/destination.rb ADDED
@@ -0,0 +1,40 @@
+ module DataDuck
+   class Destination
+     def self.destination_config(name)
+       if DataDuck.config['destinations'].nil? || DataDuck.config['destinations'][name.to_s].nil?
+         raise Exception.new("Could not find destination #{ name } in destinations configs.")
+       end
+
+       DataDuck.config['destinations'][name.to_s]
+     end
+
+     def load_tables!(tables)
+       raise Exception.new("Must implement load_tables! in subclass")
+     end
+
+     def before_all_loads!(tables)
+       # Hook for subclasses, e.g. setup work before any tables load.
+     end
+
+     def after_all_loads!(tables)
+       # Hook for subclasses, e.g. cleanup after all tables load.
+     end
+
+     def self.destination(destination_name)
+       destination_name = destination_name.to_s
+
+       if DataDuck.destinations[destination_name]
+         return DataDuck.destinations[destination_name]
+       end
+
+       destination_configuration = DataDuck::Destination.destination_config(destination_name)
+       destination_type = destination_configuration['type']
+       if destination_type == "redshift"
+         DataDuck.destinations[destination_name] = DataDuck::RedshiftDestination.new(destination_configuration)
+         return DataDuck.destinations[destination_name]
+       else
+         raise ArgumentError.new("Unknown type '#{ destination_type }' for destination #{ destination_name }.")
+       end
+     end
+   end
+ end
data/lib/dataduck/etl.rb ADDED
@@ -0,0 +1,49 @@
+ require_relative 'redshift_destination.rb'
+
+ module DataDuck
+   class ETL
+     class << self
+       attr_accessor :destinations
+     end
+
+     def self.destination(destination_name)
+       self.destinations ||= []
+       self.destinations << DataDuck::Destination.destination(destination_name)
+     end
+
+     def initialize(options = {})
+       @tables = options[:tables] || []
+
+       @autoload_tables = options[:autoload_tables].nil? ? true : options[:autoload_tables]
+       if @autoload_tables
+         Dir[DataDuck.project_root + "/src/tables/*.rb"].each do |file|
+           table_name_underscores = file.split("/").last.gsub(".rb", "")
+           table_name_camelized = DataDuck::Util.underscore_to_camelcase(table_name_underscores)
+           require file
+           table = Object.const_get(table_name_camelized)
+           if table <= DataDuck::Table
+             @tables << table
+           end
+         end
+       end
+     end
+
+     def process!
+       puts "Processing ETL..."
+
+       table_instances = []
+       @tables.each do |table_class|
+         table_instance = table_class.new
+         table_instances << table_instance
+         table_instance.extract!
+         table_instance.transform!
+       end
+
+       self.class.destinations.each do |destination|
+         destination.before_all_loads!(table_instances)
+         destination.load_tables!(table_instances)
+         destination.after_all_loads!(table_instances)
+       end
+     end
+   end
+ end
data/lib/dataduck/mysql_source.rb ADDED
@@ -0,0 +1,11 @@
+ require_relative 'sql_db_source.rb'
+
+ require 'sequel'
+
+ module DataDuck
+   class MysqlSource < DataDuck::SqlDbSource
+     def db_type
+       'mysql'
+     end
+   end
+ end
data/lib/dataduck/postgresql_source.rb ADDED
@@ -0,0 +1,11 @@
+ require_relative 'sql_db_source.rb'
+
+ require 'sequel'
+
+ module DataDuck
+   class PostgresqlSource < DataDuck::SqlDbSource
+     def db_type
+       'postgres'
+     end
+   end
+ end
data/lib/dataduck/redshift_destination.rb ADDED
@@ -0,0 +1,177 @@
+ require_relative 'destination.rb'
+
+ module DataDuck
+   class RedshiftDestination < DataDuck::Destination
+     def initialize(config)
+       @aws_key = config['aws_key']
+       @aws_secret = config['aws_secret']
+       @s3_bucket = config['s3_bucket']
+       @s3_region = config['s3_region']
+       @host = config['host']
+       @port = config['port']
+       @database = config['database']
+       @schema = config['schema']
+       @username = config['username']
+       @password = config['password']
+       @redshift_connection = nil
+     end
+
+     def connection
+       @redshift_connection ||= Sequel.connect("redshift://#{ @username }:#{ @password }@#{ @host }:#{ @port }/#{ @database }" +
+           "?force_standard_strings=f",
+         :client_min_messages => '',
+         :force_standard_strings => false
+       )
+     end
+
+     def copy_query(table, s3_path)
+       properties_joined_string = "\"#{ table.output_column_names.join('","') }\""
+       query_fragments = []
+       query_fragments << "COPY #{ self.staging_table_name(table) } (#{ properties_joined_string })"
+       query_fragments << "FROM '#{ s3_path }'"
+       query_fragments << "CREDENTIALS 'aws_access_key_id=#{ @aws_key };aws_secret_access_key=#{ @aws_secret }'"
+       query_fragments << "REGION '#{ @s3_region }'"
+       query_fragments << "CSV TRUNCATECOLUMNS ACCEPTINVCHARS EMPTYASNULL"
+       query_fragments << "DATEFORMAT 'auto'"
+       return query_fragments.join(" ")
+     end
+
+     def create_columns_on_data_warehouse!(table)
+       columns = get_columns_in_data_warehouse(table)
+       column_names = columns.map { |col| col[:name].to_s }
+       table.output_schema.map do |name, data_type|
+         if !column_names.include?(name.to_s)
+           redshift_data_type = data_type.to_s
+           redshift_data_type = 'varchar(255)' if redshift_data_type == 'string'
+           self.run_query("ALTER TABLE #{ table.name } ADD #{ name } #{ redshift_data_type }")
+         end
+       end
+     end
+
+     def create_table_query(table, table_name = nil)
+       table_name ||= table.name
+       props_array = table.output_schema.map do |name, data_type|
+         redshift_data_type = data_type.to_s
+         redshift_data_type = 'varchar(255)' if redshift_data_type == 'string'
+         "\"#{ name }\" #{ redshift_data_type }"
+       end
+       props_string = props_array.join(', ')
+       "CREATE TABLE IF NOT EXISTS #{ table_name } (#{ props_string })"
+     end
+
+     def create_output_table_on_data_warehouse!(table)
+       self.run_query(self.create_table_query(table))
+       self.create_columns_on_data_warehouse!(table)
+     end
+
+     def create_staging_table!(table)
+       table_name = self.staging_table_name(table)
+       self.drop_staging_table!(table)
+       self.run_query(self.create_table_query(table, table_name))
+     end
+
+     def data_as_csv_string(data, property_names)
+       data_string_components = [] # for performance reasons, join strings this way
+       data.each do |result|
+         property_names.each_with_index do |property_name, index|
+           value = result[property_name.to_sym]
+
+           if index == 0
+             data_string_components << '"'
+           end
+
+           data_string_components << DataDuck::RedshiftDestination.value_to_string(value)
+
+           if index == property_names.length - 1
+             data_string_components << '"'
+           else
+             data_string_components << '","'
+           end
+         end
+         data_string_components << "\n"
+       end
+
+       return data_string_components.join
+     end
+
+     def drop_staging_table!(table)
+       self.run_query("DROP TABLE IF EXISTS #{ self.staging_table_name(table) }")
+     end
+
+     def get_columns_in_data_warehouse(table)
+       # "column" is a reserved word, so it must be quoted in the query.
+       query = "SELECT pg_table_def.\"column\" AS name, type AS data_type, distkey, sortkey FROM pg_table_def WHERE tablename='#{ table.name }'"
+       results = self.run_query(query)
+
+       columns = []
+       results.each do |result|
+         columns << {
+           name: result[:name],
+           data_type: result[:data_type],
+           distkey: result[:distkey],
+           sortkey: result[:sortkey]
+         }
+       end
+
+       return columns
+     end
+
+     def merge_from_staging!(table)
+       # Following guidelines in http://docs.aws.amazon.com/redshift/latest/dg/merge-examples.html
+       staging_name = self.staging_table_name(table)
+       delete_query = "DELETE FROM #{ table.name } USING #{ staging_name } WHERE #{ table.name }.id = #{ staging_name }.id" # TODO: allow custom or multiple keys
+       self.run_query(delete_query)
+       insert_query = "INSERT INTO #{ table.name } (\"#{ table.output_column_names.join('","') }\") SELECT \"#{ table.output_column_names.join('","') }\" FROM #{ staging_name }"
+       self.run_query(insert_query)
+     end
+
+     def run_query(sql)
+       self.connection[sql].map { |elem| elem }
+     end
+
+     def staging_table_name(table)
+       "zz_dataduck_#{ table.name }"
+     end
+
+     def upload_table_to_s3!(table)
+       now_epoch = Time.now.to_i.to_s
+       filepath = "pending/#{ table.name.downcase }_#{ now_epoch }.csv"
+
+       table_csv = self.data_as_csv_string(table.data, table.output_column_names)
+
+       s3_obj = S3Object.new(filepath, table_csv, @aws_key, @aws_secret,
+         @s3_bucket, @s3_region)
+       s3_obj.upload!
+       return s3_obj
+     end
+
+     def before_all_loads!(tables)
+       # No setup needed before loading; hook kept for the Destination interface.
+     end
+
+     def load_tables!(tables)
+       tables.each do |table|
+         puts "Loading table #{ table.name }..."
+         s3_object = self.upload_table_to_s3!(table)
+         self.create_staging_table!(table)
+         self.create_output_table_on_data_warehouse!(table)
+         self.run_query(self.copy_query(table, s3_object.s3_path))
+         self.merge_from_staging!(table)
+         self.drop_staging_table!(table)
+       end
+     end
+
+     def after_all_loads!(tables)
+       # No cleanup needed after loading; hook kept for the Destination interface.
+     end
+
+     def self.value_to_string(value)
+       string_value = ''
+       if value.respond_to? :to_s
+         string_value = value.to_s
+       end
+       string_value.gsub!('"', '""')
+       return string_value
+     end
+   end
+ end
data/lib/dataduck/s3_object.rb ADDED
@@ -0,0 +1,67 @@
+ require 'aws-sdk'
+
+ module DataDuck
+   class S3Object
+     def initialize(path, contents, aws_key, aws_secret, bucket, region, options={})
+       @path = path
+       @contents = contents
+       @options = options
+       @aws_key = aws_key
+       @aws_secret = aws_secret
+       @bucket = bucket
+       @region = region
+     end
+
+     def upload!
+       s3 = Aws::S3::Client.new(
+         region: @region,
+         access_key_id: @aws_key,
+         secret_access_key: @aws_secret,
+       )
+
+       put_hash = @options.merge({
+         acl: 'private',
+         bucket: @bucket,
+         body: @contents,
+         key: self.full_path,
+         server_side_encryption: 'AES256',
+       })
+
+       attempts = 0
+       begin
+         attempts += 1
+         s3.put_object(put_hash)
+       rescue StandardError => e
+         # Retry transient upload failures, then re-raise once retries are exhausted.
+         retry if attempts < S3Object.max_retries
+         raise e
+       end
+     end
+
+     def full_path
+       'dataduck/' + @path
+     end
+
+     def s3_path
+       "s3://#{ @bucket }/#{ full_path }"
+     end
+
+     def self.max_retries
+       3
+     end
+
+     def self.regions
+       [
+         { name: 'US Standard - N. Virginia', region: 'us-east-1' },
+         { name: 'US West - N. California', region: 'us-west-1' },
+         { name: 'US West - Oregon', region: 'us-west-2' },
+         { name: 'EU - Ireland', region: 'eu-west-1' },
+         { name: 'EU - Frankfurt', region: 'eu-central-1' },
+         { name: 'Asia Pacific - Singapore', region: 'ap-southeast-1' },
+         { name: 'Asia Pacific - Sydney', region: 'ap-southeast-2' },
+         { name: 'Asia Pacific - Tokyo', region: 'ap-northeast-1' },
+         { name: 'South America - Sao Paulo', region: 'sa-east-1' },
+       ]
+     end
+   end
+ end
data/lib/dataduck/source.rb ADDED
@@ -0,0 +1,46 @@
+ module DataDuck
+
+   class Source
+     def self.source_config(name)
+       if DataDuck.config['sources'].nil? || DataDuck.config['sources'][name.to_s].nil?
+         raise Exception.new("Could not find source #{ name } in source configs.")
+       end
+
+       DataDuck.config['sources'][name.to_s]
+     end
+
+     def self.source(name)
+       name = name.to_s
+
+       if DataDuck.sources[name]
+         return DataDuck.sources[name]
+       end
+
+       configuration = DataDuck::Source.source_config(name)
+       source_type = configuration['type']
+
+       if source_type == "postgresql"
+         DataDuck.sources[name] = DataDuck::PostgresqlSource.new(configuration)
+         return DataDuck.sources[name]
+       else
+         raise ArgumentError.new("Unknown type '#{ source_type }' for source #{ name }.")
+       end
+     end
+
+     def connection
+       raise Exception.new("Must implement connection in subclass.")
+     end
+
+     def query(sql)
+       raise Exception.new("Must implement query in subclass.")
+     end
+
+     def schema(table_name)
+       self.connection.schema(table_name)
+     end
+
+     def self.skip_these_table_names
+       [:delayed_jobs, :schema_migrations]
+     end
+   end
+ end
data/lib/dataduck/sql_db_source.rb ADDED
@@ -0,0 +1,41 @@
+ require_relative 'source.rb'
+
+ require 'sequel'
+
+ module DataDuck
+   class SqlDbSource < DataDuck::Source
+     def initialize(data)
+       @host = data['host']
+       @port = data['port']
+       @username = data['username']
+       @password = data['password']
+       @database = data['database']
+       @initialized_db_type = data['db_type']
+     end
+
+     def connection
+       @connection ||= Sequel.connect(
+         adapter: self.db_type,
+         user: @username,
+         host: @host,
+         database: @database,
+         password: @password,
+         port: @port
+       )
+     end
+
+     def db_type
+       return @initialized_db_type if @initialized_db_type
+
+       raise Exception.new("Abstract method db_type must be overwritten by subclass, or passed as data when initializing.")
+     end
+
+     def table_names
+       self.connection.tables.reject { |table| DataDuck::Source.skip_these_table_names.include?(table) }
+     end
+
+     def query(sql)
+       self.connection.fetch(sql).all
+     end
+   end
+ end
data/lib/dataduck/table.rb ADDED
@@ -0,0 +1,82 @@
+ module DataDuck
+   class Table
+     class << self
+       attr_accessor :sources
+       attr_accessor :output_schema
+       attr_accessor :actions
+     end
+
+     attr_accessor :data
+     attr_accessor :errors
+
+     def self.transforms(transformation_name)
+       self.actions ||= []
+       self.actions << [:transform, transformation_name]
+     end
+     singleton_class.send(:alias_method, :transform, :transforms)
+
+     def self.validates(validation_name)
+       self.actions ||= []
+       self.actions << [:validate, validation_name]
+     end
+     singleton_class.send(:alias_method, :validate, :validates)
+
+     def self.source(source_name, source_data = [])
+       self.sources ||= {}
+       source = DataDuck::Source.source(source_name)
+       self.sources[source] = source_data
+     end
+
+     def self.output(schema)
+       self.output_schema ||= {}
+       self.output_schema.merge!(schema)
+     end
+
+     def actions
+       self.class.actions
+     end
+
+     def output_schema
+       self.class.output_schema
+     end
+
+     def output_column_names
+       self.class.output_schema.keys.sort
+     end
+
+     def extract!
+       puts "Extracting table #{ self.name }..."
+
+       self.errors ||= []
+       self.data = []
+       self.class.sources.each_pair do |source, source_columns|
+         import_query = "SELECT \"#{ source_columns.sort.join('","') }\" FROM #{ self.name }"
+         results = source.query(import_query)
+         self.data = results
+       end
+       self.data
+     end
+
+     def transform!
+       puts "Transforming table #{ self.name }..."
+
+       self.errors ||= []
+       self.actions.each do |action|
+         action_type = action[0]
+         action_method_name = action[1]
+         if action_type == :transform
+           self.data.map! { |row| self.public_send(action_method_name, row) }
+         elsif action_type == :validate
+           self.data.each do |row|
+             error = self.public_send(action_method_name, row)
+             self.errors << error if error
+           end
+         end
+       end
+     end
+
+     def name
+       DataDuck::Util.camelcase_to_underscore(self.class.name)
+     end
+   end
+ end
data/lib/dataduck/util.rb ADDED
@@ -0,0 +1,15 @@
+ module DataDuck
+   class Util
+     def self.underscore_to_camelcase(str)
+       str.split('_').map{ |chunk| chunk.capitalize }.join
+     end
+
+     def self.camelcase_to_underscore(str)
+       str.gsub(/::/, '/')
+          .gsub(/([A-Z]+)([A-Z][a-z])/,'\1_\2')
+          .gsub(/([a-z\d])([A-Z])/,'\1_\2')
+          .tr("-", "_")
+          .downcase
+     end
+   end
+ end
data/lib/dataduck/version.rb ADDED
@@ -0,0 +1,6 @@
+ module DataDuck
+   VERSION_MAJOR = 0
+   VERSION_MINOR = 2
+   VERSION_PATCH = 0
+   VERSION = [VERSION_MAJOR, VERSION_MINOR, VERSION_PATCH].join('.')
+ end
data/lib/helpers/module_vars.rb ADDED
@@ -0,0 +1,19 @@
+ module ModuleVars
+   def define_class_method(name, &block)
+     (class << self; self; end).instance_eval do
+       define_method(name, &block)
+     end
+   end
+
+   def create_module_var(name, val = nil)
+     class_variable_set("@@#{ name }", val)
+
+     define_class_method(name) do
+       class_variable_get("@@#{ name }")
+     end
+
+     define_class_method("#{name}=") do |set_to|
+       class_variable_set("@@#{ name }", set_to)
+     end
+   end
+ end
data/lib/templates/quickstart/main.rb.erb ADDED
@@ -0,0 +1,10 @@
+ require 'rubygems'
+ require 'bundler/setup'
+ Bundler.require
+
+ class MyETL < DataDuck::ETL
+   destination :my_destination
+ end
+
+ etl = MyETL.new
+ etl.process!
data/lib/templates/quickstart/table.rb.erb ADDED
@@ -0,0 +1,7 @@
+ class <%= table_name %> < DataDuck::Table
+   source :my_database, ["<%= columns.map { |col| col[0] }.join('", "') %>"]
+
+   output({<% columns.each do |col| %>
+     <%= '# ' if col[2] %>:<%= col[0] %> => :<%= col[1] %>,<% end %>
+   })
+ end
data/static/logo.png ADDED
Binary file
metadata ADDED
@@ -0,0 +1,178 @@
+ --- !ruby/object:Gem::Specification
+ name: dataduck
+ version: !ruby/object:Gem::Version
+   version: 0.2.0
+ platform: ruby
+ authors:
+ - Jeff Pickhardt
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2015-10-10 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: bundler
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.6'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.6'
+ - !ruby/object:Gem::Dependency
+   name: rake
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '10.0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '10.0'
+ - !ruby/object:Gem::Dependency
+   name: rspec
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '3.3'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '3.3'
+ - !ruby/object:Gem::Dependency
+   name: sequel
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '4.19'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '4.19'
+ - !ruby/object:Gem::Dependency
+   name: pg
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '0.16'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '0.16'
+ - !ruby/object:Gem::Dependency
+   name: aws-sdk
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '2.0'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '2.0'
+ - !ruby/object:Gem::Dependency
+   name: sequel-redshift
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+ description: A straightforward, effective ETL framework.
+ email:
+ - pickhardt@gmail.com
+ - admin@dataducketl.com
+ executables:
+ - dataduck
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - ".gitignore"
+ - ".rspec"
+ - ".ruby-version"
+ - Gemfile
+ - README.md
+ - Rakefile
+ - bin/console
+ - bin/dataduck
+ - bin/setup
+ - dataduck.gemspec
+ - examples/example/.gitignore
+ - examples/example/.ruby-version
+ - examples/example/Gemfile
+ - examples/example/README.md
+ - examples/example/config/replace_me.yml
+ - examples/example/src/main.rb
+ - examples/example/src/tables/games.rb
+ - examples/example/src/tables/users.rb
+ - lib/dataduck.rb
+ - lib/dataduck/commands.rb
+ - lib/dataduck/destination.rb
+ - lib/dataduck/etl.rb
+ - lib/dataduck/mysql_source.rb
+ - lib/dataduck/postgresql_source.rb
+ - lib/dataduck/redshift_destination.rb
+ - lib/dataduck/s3_object.rb
+ - lib/dataduck/source.rb
+ - lib/dataduck/sql_db_source.rb
+ - lib/dataduck/table.rb
+ - lib/dataduck/util.rb
+ - lib/dataduck/version.rb
+ - lib/helpers/module_vars.rb
+ - lib/templates/quickstart/main.rb.erb
+ - lib/templates/quickstart/table.rb.erb
+ - static/logo.png
+ homepage: http://dataducketl.com/
+ licenses: []
+ metadata: {}
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubyforge_project:
+ rubygems_version: 2.4.8
+ signing_key:
+ specification_version: 4
+ summary: A straightforward, effective ETL framework.
+ test_files: []