dataduck 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 0f87bbaf674b1943242d3ea173a5e34fe00e0724
4
+ data.tar.gz: b8fbacadd9323ab917498712c8d4f39f1f5ca907
5
+ SHA512:
6
+ metadata.gz: 40bbfce9c990d1542236c59967c31fe3fe5982c84bed12ccaf604c7ce15f2cebc5432b865dfedac5e95607dc37f20e0d681462ee9e7936e30ffdce8361688c96
7
+ data.tar.gz: fdc25e1ddf3a00faeceb13f11f4b7452f4085b3c0e5ca805137cc21174727aabec052dd335c371cd21c78db7ca0af7ef4e5a93f2876df9abfff531ad90bc8612
@@ -0,0 +1,14 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /Gemfile.lock
4
+ /_yardoc/
5
+ /coverage/
6
+ /doc/
7
+ /pkg/
8
+ /spec/reports/
9
+ /tmp/
10
+ log
11
+ .DS_Store
12
+ *.lock
13
+ .idea
14
+ *.gem
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --format documentation
2
+ --color
@@ -0,0 +1 @@
1
+ ruby-2.1.2
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in dataduck.gemspec
4
+ gemspec
@@ -0,0 +1,87 @@
1
+ # DataDuck ETL
2
+
3
+ ##### Set up in under 5 minutes
4
+
5
+ DataDuck ETL is probably the quickest extract-transform-load framework to set up. If you want to set up a data warehouse, give DataDuck ETL a try.
6
+
7
+ ##### Extract-transform-load to Amazon Redshift
8
+
9
+ DataDuck ETL is currently focused on loading to Amazon Redshift (through Amazon S3).
10
+
11
+ ![DataDuck ETL](static/logo.png "DataDuck ETL")
12
+
13
+ ## Installation
14
+
15
+ ##### Example project
16
+
17
+ See [https://github.com/DataDuckETL/DataDuck/tree/master/examples/example](https://github.com/DataDuckETL/DataDuck/tree/master/examples/example) for an example project setup.
18
+
19
+ ##### Instructions for using DataDuck ETL
20
+
21
+ Create a new project, then add the following to your Gemfile:
22
+
23
+ ```ruby
24
+ gem 'dataduck', :git => 'git://github.com/DataDuckETL/DataDuck.git'
25
+ ```
26
+
27
+ Then execute:
28
+
29
+ $ bundle install
30
+
31
+ Finally, run the quickstart command:
32
+
33
+ $ dataduck quickstart
34
+
35
+ The quickstart wizard will ask you for credentials to your database, then create the basic setup for your project. After the setup, your project's ETL can be run by running `ruby src/main.rb`
36
+
37
+ If you'd like to run this regularly, such as every night, it's recommended to use the [whenever](https://github.com/javan/whenever) gem to manage a cron job to regularly run the ETL.
38
+
39
+ ## Documentation
40
+
41
+ Tables are defined in their own file under /src/tables. Here's an example table:
42
+
43
+ ```ruby
44
+ class Decks < DataDuck::Table
45
+ source :my_database, ["id", "name", "user_id", "cards",
46
+ "num_wins", "num_losses", "created_at", "updated_at",
47
+ "is_drafted", "num_draft_wins", "num_draft_losses"]
48
+
49
+ transforms :calculate_num_totals
50
+
51
+ validates :validates_num_total
52
+
53
+ output({
54
+ :id => :integer,
55
+ :name => :string,
56
+ :user_id => :integer,
57
+ :num_wins => :integer,
58
+ :num_losses => :integer,
59
+ :num_total => :integer,
60
+ :num_draft_total => :integer,
61
+ :created_at => :datetime,
62
+ :updated_at => :datetime,
63
+ :is_drafted => :boolean,
64
+ # Note that num_draft_wins and num_draft_losses
65
+ # are not included in the output, but are used in
66
+ # the transformation.
67
+ })
68
+
69
+ def calculate_num_totals(row)
70
+ row[:num_total] = row[:num_wins] + row[:num_losses]
71
+ row[:num_draft_total] = row[:num_draft_wins] + row[:num_draft_losses]
72
+ row
73
+ end
74
+
75
+ def validates_num_total(row)
76
+ return "Deck id #{ row[:id] } has negative value #{ row[:num_total] } for num_total." if row[:num_total] < 0
77
+ end
78
+ end
79
+ ```
80
+
81
+ ## Contributing
82
+
83
+ To contribute, get in touch at http://DataDuckETL.com/ so that we can share the [Contributor License Agreement (CLA)](https://en.wikipedia.org/wiki/Contributor_License_Agreement) with you, then create a pull request.
84
+
85
+ ## License
86
+
87
+ Get in touch or visit [http://dataducketl.com/licensing](http://dataducketl.com/licensing) for licensing.
@@ -0,0 +1,6 @@
1
+ require "bundler/gem_tasks"
2
+ require "rspec/core/rake_task"
3
+
4
+ RSpec::Core::RakeTask.new(:spec)
5
+
6
+ task :default => :spec
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "bundler/setup"
4
+ require "dataduck"
5
+
6
+ # You can add fixtures and/or initialization code here to make experimenting
7
+ # with your gem easier. You can also use a different console, if you like.
8
+
9
+ # (If you use this, don't forget to add pry to your Gemfile!)
10
+ # require "pry"
11
+ # Pry.start
12
+
13
+ require "irb"
14
+ IRB.start
@@ -0,0 +1,6 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require_relative "../lib/dataduck"
4
+ require_relative "../lib/dataduck/commands"
5
+
6
+ DataDuck::Commands.route_command(ARGV)
@@ -0,0 +1,7 @@
1
+ #!/bin/bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+
5
+ bundle install
6
+
7
+ # Do any other automated setup that you need to do here
@@ -0,0 +1,28 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'dataduck/version'
5
+
6
+ Gem::Specification.new do |spec|
7
+ spec.authors = ["Jeff Pickhardt"]
8
+ spec.description = "A straightforward, effective ETL framework."
9
+ spec.email = ["pickhardt@gmail.com", "admin@dataducketl.com"]
10
+ spec.executables = ["dataduck"]
11
+ spec.homepage = "http://dataducketl.com/"
12
+ spec.name = "dataduck"
13
+ spec.summary = "A straightforward, effective ETL framework."
14
+ spec.version = DataDuck::VERSION
15
+
16
+ spec.files = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
17
+ spec.bindir = "bin"
18
+ spec.require_paths = ["lib"]
19
+
20
+ spec.add_development_dependency "bundler", "~> 1.6"
21
+ spec.add_development_dependency "rake", "~> 10.0"
22
+ spec.add_development_dependency "rspec", "~> 3.3"
23
+
24
+ spec.add_runtime_dependency "sequel", '~> 4.19'
25
+ spec.add_runtime_dependency "pg", '~> 0.16'
26
+ spec.add_runtime_dependency "aws-sdk", "~> 2.0"
27
+ spec.add_runtime_dependency "sequel-redshift"
28
+ end
@@ -0,0 +1,5 @@
1
+ .idea
2
+ .DS_Store
3
+ config/secrets.yml
4
+ config/secret
5
+ *.lock
@@ -0,0 +1 @@
1
+ ruby-2.1.2
@@ -0,0 +1,5 @@
1
+ source 'https://rubygems.org'
2
+
3
+ gem 'dataduck', :git => 'git://github.com/DataDuckETL/DataDuck.git'
4
+
5
+ ruby '2.1.2'
@@ -0,0 +1,11 @@
1
+ # DataDuck ETL Example
2
+
3
+ This gives an example project showing how to set up [DataDuck ETL](http://dataducketl.com/)
4
+
5
+ # Instructions
6
+
7
+ Copy /config/replace_me.yml to /config/secret/development.yml, then replace the secrets with your AWS and DB connection details.
8
+
9
+ For each table you want to import, create a table file in /src/tables. You can use /src/tables/games.rb and /src/tables/users.rb as examples. (You should also delete, modify, or rename games.rb and users.rb, by the way, otherwise DataDuck ETL will try to load them.)
10
+
11
+ For further help, reach out at [http://dataducketl.com/](http://dataducketl.com/)
@@ -0,0 +1,22 @@
1
+ # Move this file to /config/secret/development.yml and /config/secret/production.yml
2
+ destinations:
3
+ main_destination:
4
+ type: redshift
5
+ aws_key: YOUR_AWS_KEY
6
+ aws_secret: YOUR_AWS_SECRET
7
+ s3_bucket: YOUR_BUCKET
8
+ s3_region: YOUR_BUCKET_REGION
9
+ host: redshift.somekeygoeshere.us-west-2.redshift.amazonaws.com
10
+ port: 5439
11
+ database: main
12
+ schema: public
13
+ username: YOUR_USERNAME
14
+ password: YOUR_PASSWORD
15
+ sources:
16
+ my_database:
17
+ type: postgresql
18
+ host: some.host.goes.here.com
19
+ database: db_name_goes_here
20
+ port: 5522
21
+ username: some_username_goes_here_probably_read_only
22
+ password: some_password_goes_here
@@ -0,0 +1,13 @@
1
+ require 'rubygems'
2
+ require 'bundler/setup'
3
+ Bundler.require
4
+
5
+ require_relative "tables/games"
6
+ require_relative "tables/users"
7
+
8
+ class MyCompanyETL < DataDuck::ETL
9
+ destination :main_destination
10
+ end
11
+
12
+ etl = MyCompanyETL.new
13
+ etl.process!
@@ -0,0 +1,10 @@
1
+ class Games < DataDuck::Table
2
+ source :my_database, [:id, :first_user_id, :second_user_id, :game_type]
3
+
4
+ output({
5
+ :id => :integer,
6
+ :first_user_id => :integer,
7
+ :second_user_id => :integer,
8
+ :game_type => :string,
9
+ })
10
+ end
@@ -0,0 +1,16 @@
1
+ class Users < DataDuck::Table
2
+ source :my_database, [:id, :username, :rating, :credits]
3
+
4
+ validate :non_negative_credits
5
+
6
+ columns({
7
+ :id => :integer,
8
+ :username => :string,
9
+ :rating => :integer,
10
+ :credits => :integer,
11
+ })
12
+
13
+ def non_negative_credits(row)
14
+ return "User id #{ row[:id] } has negative value of #{ row[:credits] } for credits." if row[:credits] < 0
15
+ end
16
+ end
@@ -0,0 +1,29 @@
1
+ Dir[File.dirname(__FILE__) + '/helpers/*.rb'].each do |file|
2
+ require file
3
+ end
4
+
5
+ Dir[File.dirname(__FILE__) + '/dataduck/*.rb'].each do |file|
6
+ require file
7
+ end
8
+
9
+ require 'yaml'
10
+
11
+ module DataDuck
12
+ extend ModuleVars
13
+
14
+ ENV['DATADUCK_ENV'] ||= "development"
15
+ create_module_var("environment", ENV['DATADUCK_ENV'])
16
+
17
+ spec = Gem::Specification.find_by_name("dataduck")
18
+ create_module_var("gem_root", spec.gem_dir)
19
+
20
+ create_module_var("project_root", Dir.getwd)
21
+ create_module_var("config", {})
22
+
23
+ dd_env_path = DataDuck.project_root + "/config/secret/#{ ENV['DATADUCK_ENV'] }.yml"
24
+ env_config = File.exist?(dd_env_path) ? YAML.load_file(dd_env_path) : {}
25
+ DataDuck.config.merge!(env_config)
26
+
27
+ create_module_var("sources", {})
28
+ create_module_var("destinations", {})
29
+ end
@@ -0,0 +1,165 @@
1
+ require 'erb'
2
+ require 'yaml'
3
+ require 'fileutils'
4
+
5
+ module DataDuck
6
+ class Commands
7
+ class Namespace
8
+ def initialize(hash = {})
9
+ hash.each do |key, value|
10
+ singleton_class.send(:define_method, key) { value }
11
+ end
12
+ end
13
+
14
+ def get_binding
15
+ binding
16
+ end
17
+ end
18
+
19
+ def self.acceptable_commands
20
+ ['console', 'quickstart']
21
+ end
22
+
23
+ def self.route_command(args)
24
+ if args.length == 0
25
+ return DataDuck::Commands.help
26
+ end
27
+
28
+ command = args[0]
29
+ if !Commands.acceptable_commands.include?(command)
30
+ puts "No such command: #{ command }"
31
+ return DataDuck::Commands.help
32
+ end
33
+
34
+ DataDuck::Commands.public_send(command)
35
+ end
36
+
37
+ def self.console
38
+ require "irb"
39
+ IRB.start
40
+ end
41
+
42
+ def self.help
43
+ puts "Usage: dataduck commandname"
44
+ end
45
+
46
+ def self.quickstart
47
+ puts "Welcome to DataDuck!"
48
+ puts "This quickstart wizard will create your application, assuming the source is a Postgres database and the destination is an Amazon Redshift data warehouse."
49
+
50
+ puts "Enter the source (Postgres database) hostname:"
51
+ source_host = STDIN.gets.strip
52
+
53
+ puts "Enter the name of the database when connecting to #{ source_host }:"
54
+ source_database = STDIN.gets.strip
55
+
56
+ puts "Enter the source's port:"
57
+ source_port = STDIN.gets.strip.to_i
58
+
59
+ puts "Enter the username:"
60
+ source_username = STDIN.gets.strip
61
+
62
+ puts "Enter the password:"
63
+ source_password = STDIN.noecho(&:gets).chomp
64
+
65
+ db_source = DataDuck::PostgresqlSource.new({
66
+ 'type' => 'postgresql',
67
+ 'host' => source_host,
68
+ 'database' => source_database,
69
+ 'port' => source_port,
70
+ 'username' => source_username,
71
+ 'password' => source_password,
72
+ })
73
+
74
+ puts "Connecting to source database..."
75
+ table_names = db_source.table_names
76
+ puts "Connection successful. Detected #{ table_names.length } tables."
77
+ puts "Creating scaffolding..."
78
+ table_names.each do |table_name|
79
+ DataDuck::Commands.quickstart_create_table(table_name, db_source)
80
+ end
81
+
82
+ config_obj = {
83
+ 'sources' => {
84
+ 'my_database' => {
85
+ 'type' => 'postgresql',
86
+ 'host' => source_host,
87
+ 'database' => source_database,
88
+ 'port' => source_port,
89
+ 'username' => source_username,
90
+ 'password' => source_password,
91
+ }
92
+ },
93
+ 'destinations' => {
94
+ 'my_destination' => {
95
+ 'type' => 'redshift',
96
+ 'aws_key' => 'YOUR_AWS_KEY',
97
+ 'aws_secret' => 'YOUR_AWS_SECRET',
98
+ 's3_bucket' => 'YOUR_BUCKET',
99
+ 's3_region' => 'YOUR_BUCKET_REGION',
100
+ 'host' => 'redshift.somekeygoeshere.us-west-2.redshift.amazonaws.com',
101
+ 'port' => 5439,
102
+ 'database' => 'main',
103
+ 'schema' => 'public',
104
+ 'username' => 'YOUR_USERNAME',
105
+ 'password' => 'YOUR_PASSWORD',
106
+ }
107
+ }
108
+ }
109
+
110
+ DataDuck::Commands.quickstart_save_file("#{ DataDuck.project_root }/config/secret/#{ DataDuck.environment }.yml", config_obj.to_yaml)
111
+ DataDuck::Commands.quickstart_save_main
112
+ DataDuck::Commands.quickstart_update_gitignore
113
+
114
+ puts "Quickstart complete!"
115
+ puts "You still need to edit your config/secret/*.yml file with your AWS and Redshift credentials."
116
+ puts "Run your ETL with: ruby src/main.rb"
117
+ end
118
+
119
+ def self.quickstart_update_gitignore
120
+ main_gitignore_path = "#{ DataDuck.project_root }/.gitignore"
121
+ FileUtils.touch(main_gitignore_path)
122
+
123
+ secret_gitignore_path = "#{ DataDuck.project_root }/config/secret/.gitignore"
124
+ FileUtils.touch(secret_gitignore_path)
125
+ output = File.open(secret_gitignore_path, "w")
126
+ output << '[^.]*'
127
+ output.close
128
+ end
129
+
130
+ def self.quickstart_create_table(table_name, db)
131
+ columns = []
132
+ schema = db.schema(table_name)
133
+ schema.each do |property_schema|
134
+ property_name = property_schema[0]
135
+ property_type = property_schema[1][:type]
136
+ commented_out = ['ssn', 'socialsecurity', 'password', 'encrypted_password', 'salt', 'password_salt', 'pw'].include?(property_name.to_s.downcase)
137
+ columns << [property_name.to_s, property_type.to_s, commented_out]
138
+ end
139
+
140
+ table_name = table_name.to_s.downcase
141
+ table_name_camelcased = table_name.split('_').collect(&:capitalize).join
142
+ namespace = Namespace.new(table_name: table_name_camelcased, columns: columns)
143
+ template = File.open("#{ DataDuck.gem_root }/lib/templates/quickstart/table.rb.erb", 'r').read
144
+ result = ERB.new(template).result(namespace.get_binding)
145
+ DataDuck::Commands.quickstart_save_file("#{ DataDuck.project_root }/src/tables/#{ table_name }.rb", result)
146
+ end
147
+
148
+ def self.quickstart_save_file(output_path_full, contents)
149
+ *output_path, output_filename = output_path_full.split('/')
150
+ output_path = output_path.join("/")
151
+ FileUtils::mkdir_p(output_path)
152
+
153
+ output = File.open(output_path_full, "w")
154
+ output << contents
155
+ output.close
156
+ end
157
+
158
+ def self.quickstart_save_main
159
+ namespace = Namespace.new
160
+ template = File.open("#{ DataDuck.gem_root }/lib/templates/quickstart/main.rb.erb", 'r').read
161
+ result = ERB.new(template).result(namespace.get_binding)
162
+ DataDuck::Commands.quickstart_save_file("#{ DataDuck.project_root }/src/main.rb", result)
163
+ end
164
+ end
165
+ end
@@ -0,0 +1,40 @@
1
+ module DataDuck
2
+ class Destination
3
+ def self.destination_config(name)
4
+ if DataDuck.config['destinations'].nil? || DataDuck.config['destinations'][name.to_s].nil?
5
+ raise Exception.new("Could not find destination #{ name } in destinations configs.")
6
+ end
7
+
8
+ DataDuck.config['destinations'][name.to_s]
9
+ end
10
+
11
+ def load_tables!(tables)
12
+ raise Exception.new("Must implement load_tables! in subclass")
13
+ end
14
+
15
+ def before_all_loads!
16
+
17
+ end
18
+
19
+ def after_all_loads!
20
+ # e.g. cleanup
21
+ end
22
+
23
+ def self.destination(destination_name)
24
+ destination_name = destination_name.to_s
25
+
26
+ if DataDuck.destinations[destination_name]
27
+ return DataDuck.destinations[destination_name]
28
+ end
29
+
30
+ destination_configuration = DataDuck::Destination.destination_config(destination_name)
31
+ destination_type = destination_configuration['type']
32
+ if destination_type == "redshift"
33
+ DataDuck.destinations[destination_name] = DataDuck::RedshiftDestination.new(destination_configuration)
34
+ return DataDuck.destinations[destination_name]
35
+ else
36
+ raise ArgumentError.new("Unknown type '#{ destination_type }' for destination #{ destination_name }.")
37
+ end
38
+ end
39
+ end
40
+ end
@@ -0,0 +1,49 @@
1
+ require_relative 'redshift_destination.rb'
2
+
3
+ module DataDuck
4
+ class ETL
5
+ class << self
6
+ attr_accessor :destinations
7
+ end
8
+
9
+ def self.destination(destination_name)
10
+ self.destinations ||= []
11
+ self.destinations << DataDuck::Destination.destination(destination_name)
12
+ end
13
+
14
+ def initialize(options = {})
15
+ @tables = options[:tables] || []
16
+
17
+ @autoload_tables = options[:autoload_tables].nil? ? true : options[:autoload_tables]
18
+ if @autoload_tables
19
+ Dir[DataDuck.project_root + "/src/tables/*.rb"].each do |file|
20
+ table_name_underscores = file.split("/").last.gsub(".rb", "")
21
+ table_name_camelized = DataDuck::Util.underscore_to_camelcase(table_name_underscores)
22
+ require file
23
+ table = Object.const_get(table_name_camelized)
24
+ if table <= DataDuck::Table
25
+ @tables << table
26
+ end
27
+ end
28
+ end
29
+ end
30
+
31
+ def process!
32
+ puts "Processing ETL..."
33
+
34
+ table_instances = []
35
+ @tables.each do |table_class|
36
+ table_instance = table_class.new
37
+ table_instances << table_instance
38
+ table_instance.extract!
39
+ table_instance.transform!
40
+ end
41
+
42
+ self.class.destinations.each do |destination|
43
+ destination.before_all_loads!(table_instances)
44
+ destination.load_tables!(table_instances)
45
+ destination.after_all_loads!(table_instances)
46
+ end
47
+ end
48
+ end
49
+ end
@@ -0,0 +1,11 @@
1
+ require_relative 'sql_db_source.rb'
2
+
3
+ require 'sequel'
4
+
5
+ module DataDuck
6
+ class MysqlSource < DataDuck::SqlDbSource
7
+ def db_type
8
+ 'mysql'
9
+ end
10
+ end
11
+ end
@@ -0,0 +1,11 @@
1
+ require_relative 'sql_db_source.rb'
2
+
3
+ require 'sequel'
4
+
5
+ module DataDuck
6
+ class PostgresqlSource < DataDuck::SqlDbSource
7
+ def db_type
8
+ 'postgres'
9
+ end
10
+ end
11
+ end
@@ -0,0 +1,176 @@
1
+ require_relative 'destination.rb'
2
+
3
+ module DataDuck
4
+ class RedshiftDestination < DataDuck::Destination
5
+ def initialize(config)
6
+ @aws_key = config['aws_key']
7
+ @aws_secret = config['aws_secret']
8
+ @s3_bucket = config['s3_bucket']
9
+ @s3_region = config['s3_region']
10
+ @host = config['host']
11
+ @port = config['port']
12
+ @database = config['database']
13
+ @schema = config['schema']
14
+ @username = config['username']
15
+ @password = config['password']
16
+ @redshift_connection = nil
17
+ end
18
+
19
+ def connection
20
+ @redshift_connection ||= Sequel.connect("redshift://#{ @username }:#{ @password }@#{ @host }:#{ @port }/#{ @database }" +
21
+ "?force_standard_strings=f",
22
+ :client_min_messages => '',
23
+ :force_standard_strings => false
24
+ )
25
+ end
26
+
27
+ def copy_query(table, s3_path)
28
+ properties_joined_string = "\"#{ table.output_column_names.join('","') }\""
29
+ query_fragments = []
30
+ query_fragments << "COPY #{ self.staging_table_name(table) } (#{ properties_joined_string })"
31
+ query_fragments << "FROM '#{ s3_path }'"
32
+ query_fragments << "CREDENTIALS 'aws_access_key_id=#{ @aws_key };aws_secret_access_key=#{ @aws_secret }'"
33
+ query_fragments << "REGION '#{ @s3_region }'"
34
+ query_fragments << "CSV TRUNCATECOLUMNS ACCEPTINVCHARS EMPTYASNULL"
35
+ query_fragments << "DATEFORMAT 'auto'"
36
+ return query_fragments.join(" ")
37
+ end
38
+
39
+ def create_columns_on_data_warehouse!(table)
40
+ columns = get_columns_in_data_warehouse(table)
41
+ column_names = columns.map { |col| col[:name].to_s }
42
+ table.output_schema.map do |name, data_type|
43
+ if !column_names.include?(name.to_s)
44
+ redshift_data_type = data_type.to_s
45
+ redshift_data_type = 'varchar(255)' if redshift_data_type == 'string'
46
+ self.run_query("ALTER TABLE #{ table.name } ADD #{ name } #{ redshift_data_type }")
47
+ end
48
+ end
49
+ end
50
+
51
+ def create_table_query(table, table_name = nil)
52
+ table_name ||= table.name
53
+ props_array = table.output_schema.map do |name, data_type|
54
+ redshift_data_type = data_type.to_s
55
+ redshift_data_type = 'varchar(255)' if redshift_data_type == 'string'
56
+ "\"#{ name }\" #{ redshift_data_type }"
57
+ end
58
+ props_string = props_array.join(', ')
59
+ "CREATE TABLE IF NOT EXISTS #{ table_name } (#{ props_string })"
60
+ end
61
+
62
+ def create_output_table_on_data_warehouse!(table)
63
+ self.run_query(self.create_table_query(table))
64
+ self.create_columns_on_data_warehouse!(table)
65
+ end
66
+
67
+ def create_staging_table!(table)
68
+ table_name = self.staging_table_name(table)
69
+ self.drop_staging_table!(table)
70
+ self.run_query(self.create_table_query(table, table_name))
71
+ end
72
+
73
+ def data_as_csv_string(data, property_names)
74
+ data_string_components = [] # for performance reasons, join strings this way
75
+ data.each do |result|
76
+ property_names.each_with_index do |property_name, index|
77
+ value = result[property_name.to_sym]
78
+
79
+ if index == 0
80
+ data_string_components << '"'
81
+ end
82
+
83
+ data_string_components << DataDuck::RedshiftDestination.value_to_string(value)
84
+
85
+ if index == property_names.length - 1
86
+ data_string_components << '"'
87
+ else
88
+ data_string_components << '","'
89
+ end
90
+ end
91
+ data_string_components << "\n"
92
+ end
93
+
94
+ return data_string_components.join
95
+ end
96
+
97
+ def drop_staging_table!(table)
98
+ self.run_query("DROP TABLE IF EXISTS #{ self.staging_table_name(table) }")
99
+ end
100
+
101
+ def get_columns_in_data_warehouse(table)
102
+ query = "SELECT pg_table_def.column as name, type as data_type, distkey, sortkey FROM pg_table_def WHERE tablename='#{ table.name }'"
103
+ results = self.run_query(query)
104
+
105
+ columns = []
106
+ results.each do |result|
107
+ columns << {
108
+ name: result[:name],
109
+ data_type: result[:data_type],
110
+ distkey: result[:distkey],
111
+ sortkey: result[:sortkey]
112
+ }
113
+ end
114
+
115
+ return columns
116
+ end
117
+
118
+ def merge_from_staging!(table)
119
+ # Following guidelines in http://docs.aws.amazon.com/redshift/latest/dg/merge-examples.html
120
+ staging_name = self.staging_table_name(table)
121
+ delete_query = "DELETE FROM #{ table.name } USING #{ staging_name } WHERE #{ table.name }.id = #{ staging_name }.id" # TODO allow custom or multiple keys
122
+ self.run_query(delete_query)
123
+ insert_query = "INSERT INTO #{ table.name } (\"#{ table.output_column_names.join('","') }\") SELECT \"#{ table.output_column_names.join('","') }\" FROM #{ staging_name }"
124
+ self.run_query(insert_query)
125
+ end
126
+
127
+ def run_query(sql)
128
+ self.connection[sql].map { |elem| elem }
129
+ end
130
+
131
+ def staging_table_name(table)
132
+ "zz_dataduck_#{ table.name }"
133
+ end
134
+
135
+ def upload_table_to_s3!(table)
136
+ now_epoch = Time.now.to_i.to_s
137
+ filepath = "pending/#{ table.name.downcase }_#{ now_epoch }.csv"
138
+
139
+ table_csv = self.data_as_csv_string(table.data, table.output_column_names)
140
+
141
+ s3_obj = S3Object.new(filepath, table_csv, @aws_key, @aws_secret,
142
+ @s3_bucket, @s3_region)
143
+ s3_obj.upload!
144
+ return s3_obj
145
+ end
146
+
147
+ def before_all_loads!(tables)
148
+
149
+ end
150
+
151
+ def load_tables!(tables)
152
+ tables.each do |table|
153
+ puts "Loading table #{ table.name }..."
154
+ s3_object = self.upload_table_to_s3!(table)
155
+ self.create_staging_table!(table)
156
+ self.create_output_table_on_data_warehouse!(table)
157
+ self.run_query(self.copy_query(table, s3_object.s3_path))
158
+ self.merge_from_staging!(table)
159
+ self.drop_staging_table!(table)
160
+ end
161
+ end
162
+
163
+ def after_all_loads!(tables)
164
+
165
+ end
166
+
167
+ def self.value_to_string(value)
168
+ string_value = ''
169
+ if value.respond_to? :to_s
170
+ string_value = value.to_s
171
+ end
172
+ string_value.gsub!('"', '""')
173
+ return string_value
174
+ end
175
+ end
176
+ end
@@ -0,0 +1,71 @@
1
+ require 'aws-sdk'
2
+
3
+ module DataDuck
4
+ class S3Object
5
+ def initialize(path, contents, aws_key, aws_secret, bucket, region, options={})
6
+ @path = path
7
+ @contents = contents
8
+ @options = options
9
+ @aws_key = aws_key
10
+ @aws_secret = aws_secret
11
+ @bucket = bucket
12
+ @region = region
13
+ end
14
+
15
+ def upload!
16
+ s3 = Aws::S3::Client.new(
17
+ region: @region,
18
+ access_key_id: @aws_key,
19
+ secret_access_key: @aws_secret,
20
+ )
21
+
22
+ attempts = 0
23
+
24
+ while attempts <= S3Object.max_retries
25
+ attempts += 1
26
+ put_hash = @options.merge({
27
+ acl: 'private',
28
+ bucket: @bucket,
29
+ body: @contents,
30
+ key: self.full_path,
31
+ server_side_encryption: 'AES256',
32
+ })
33
+ begin
34
+ response = s3.put_object(put_hash)
35
+ rescue Exception => e
36
+ if attempts == S3Object.max_retries
37
+ throw e
38
+ end
39
+ end
40
+ end
41
+
42
+ response
43
+ end
44
+
45
+ def full_path
46
+ 'dataduck/' + @path
47
+ end
48
+
49
+ def s3_path
50
+ "s3://#{ @bucket }/#{ full_path }"
51
+ end
52
+
53
+ def self.max_retries
54
+ 3
55
+ end
56
+
57
+ def self.regions
58
+ [
59
+ { name: 'US Standard - N. Virginia', region: 'us-east-1' },
60
+ { name: 'US West - N. California', region: 'us-west-1' },
61
+ { name: 'US West - Oregon', region: 'us-west-2' },
62
+ { name: 'EU - Ireland', region: 'eu-west-1' },
63
+ { name: 'EU - Frankfurt', region: 'eu-central-1' },
64
+ { name: 'Asia Pacific - Singapore', region: 'ap-southeast-1' },
65
+ { name: 'Asia Pacific - Sydney', region: 'ap-southeast-2' },
66
+ { name: 'Asia Pacific - Tokyo', region: 'ap-northeast-1' },
67
+ { name: 'South America - Sao Paulo', region: 'sa-east-1' },
68
+ ]
69
+ end
70
+ end
71
+ end
@@ -0,0 +1,46 @@
1
+ module DataDuck
2
+
3
+ class Source
4
+ def self.source_config(name)
5
+ if DataDuck.config['sources'].nil? || DataDuck.config['sources'][name.to_s].nil?
6
+ raise Exception.new("Could not find source #{ name } in source configs.")
7
+ end
8
+
9
+ DataDuck.config['sources'][name.to_s]
10
+ end
11
+
12
+ def self.source(name)
13
+ name = name.to_s
14
+
15
+ if DataDuck.sources[name]
16
+ return DataDuck.sources[name]
17
+ end
18
+
19
+ configuration = DataDuck::Source.source_config(name)
20
+ source_type = configuration['type']
21
+
22
+ if source_type == "postgresql"
23
+ DataDuck.sources[name] = DataDuck::PostgresqlSource.new(configuration)
24
+ return DataDuck.sources[name]
25
+ else
26
+ raise ArgumentError.new("Unknown type '#{ source_type }' for source #{ name }.")
27
+ end
28
+ end
29
+
30
+ def connection
31
+ raise Exception.new("Must implement connection in subclass.")
32
+ end
33
+
34
+ def query
35
+ raise Exception.new("Must implement query in subclass.")
36
+ end
37
+
38
+ def schema(table_name)
39
+ self.connection.schema(table_name)
40
+ end
41
+
42
+ def self.skip_these_table_names
43
+ [:delayed_jobs, :schema_migrations]
44
+ end
45
+ end
46
+ end
@@ -0,0 +1,41 @@
1
+ require_relative 'source.rb'
2
+
3
+ require 'sequel'
4
+
5
+ module DataDuck
6
+ class SqlDbSource < DataDuck::Source
7
+ def initialize(data)
8
+ @host = data['host']
9
+ @port = data['port']
10
+ @username = data['username']
11
+ @password = data['password']
12
+ @database = data['database']
13
+ @initialized_db_type = data['db_type']
14
+ end
15
+
16
+ def connection
17
+ @connection ||= Sequel.connect(
18
+ adapter: self.db_type,
19
+ user: @username,
20
+ host: @host,
21
+ database: @database,
22
+ password: @password,
23
+ port: @port
24
+ )
25
+ end
26
+
27
+ def db_type
28
+ return @initialized_db_type if @initialized_db_type
29
+
30
+ raise Exception.new("Abstract method db_type must be overwritten by subclass, or passed as data when initializing.")
31
+ end
32
+
33
+ def table_names
34
+ self.connection.tables.map { |table| DataDuck::Source.skip_these_table_names.include?(table) ? nil : table }.compact
35
+ end
36
+
37
+ def query(sql)
38
+ self.connection.fetch(sql).all
39
+ end
40
+ end
41
+ end
@@ -0,0 +1,82 @@
1
+ module DataDuck
2
+ class Table
3
+ class << self
4
+ attr_accessor :sources
5
+ attr_accessor :output_schema
6
+ attr_accessor :actions
7
+ attr_accessor :errors
8
+ end
9
+
10
+ attr_accessor :data
11
+
12
+ def self.transforms(transformation_name)
13
+ self.actions ||= []
14
+ self.actions << [:transform, transformation_name]
15
+ end
16
+ singleton_class.send(:alias_method, :transform, :transforms)
17
+
18
+ def self.validates(validation_name)
19
+ self.actions ||= []
20
+ self.actions << [:validate, validation_name]
21
+ end
22
+ singleton_class.send(:alias_method, :validate, :validates)
23
+
24
+ def self.source(source_name, source_data = [])
25
+ self.sources ||= {}
26
+ source = DataDuck::Source.source(source_name)
27
+ self.sources[source] = source_data
28
+ end
29
+
30
+ def self.output(schema)
31
+ self.output_schema ||= {}
32
+ self.output_schema.merge!(schema)
33
+ end
34
+
35
+ def actions
36
+ self.class.actions
37
+ end
38
+
39
+ def output_schema
40
+ self.class.output_schema
41
+ end
42
+
43
+ def output_column_names
44
+ self.class.output_schema.keys.sort
45
+ end
46
+
47
+ def extract!
48
+ puts "Extracting table #{ self.name }..."
49
+
50
+ self.errors ||= []
51
+ self.data = []
52
+ self.class.sources.each_pair do |source, source_columns|
53
+ import_query = "SELECT \"#{ source_columns.sort.join('","') }\" FROM #{ self.name }"
54
+ results = source.query(import_query)
55
+ self.data = results
56
+ end
57
+ self.data
58
+ end
59
+
60
+ def transform!
61
+ puts "Transforming table #{ self.name }..."
62
+
63
+ self.errors ||= []
64
+ self.actions.each do |action|
65
+ action_type = action[0]
66
+ action_method_name = action[1]
67
+ if action_type == :transform
68
+ self.data.map! { |row| self.public_send(action_method_name, row) }
69
+ elsif action_type == :validate
70
+ self.data.each do |row|
71
+ error = self.public_send(action_method_name, row)
72
+ self.errors << error if !error.blank?
73
+ end
74
+ end
75
+ end
76
+ end
77
+
78
+ def name
79
+ DataDuck::Util.camelcase_to_underscore(self.class.name)
80
+ end
81
+ end
82
+ end
@@ -0,0 +1,15 @@
1
+ module DataDuck
2
+ class Util
3
+ def self.underscore_to_camelcase(str)
4
+ str.split('_').map{ |chunk| chunk.capitalize }.join
5
+ end
6
+
7
+ def self.camelcase_to_underscore(str)
8
+ str.gsub(/::/, '/')
9
+ .gsub(/([A-Z]+)([A-Z][a-z])/,'\1_\2')
10
+ .gsub(/([a-z\d])([A-Z])/,'\1_\2')
11
+ .tr("-", "_")
12
+ .downcase
13
+ end
14
+ end
15
+ end
@@ -0,0 +1,6 @@
1
+ module DataDuck
2
+ VERSION_MAJOR = 0
3
+ VERSION_MINOR = 2
4
+ VERSION_PATCH = 0
5
+ VERSION = [VERSION_MAJOR, VERSION_MINOR, VERSION_PATCH].join('.')
6
+ end
@@ -0,0 +1,19 @@
1
+ module ModuleVars
2
+ def define_class_method(name, &block)
3
+ (class << self; self; end).instance_eval do
4
+ define_method(name, &block)
5
+ end
6
+ end
7
+
8
+ def create_module_var(name, val = nil)
9
+ class_variable_set("@@#{ name }", val)
10
+
11
+ define_class_method(name) do
12
+ class_variable_get("@@#{ name }")
13
+ end
14
+
15
+ define_class_method("#{name}=") do |set_to|
16
+ class_variable_set("@@#{ name }", set_to)
17
+ end
18
+ end
19
+ end
@@ -0,0 +1,10 @@
1
+ require 'rubygems'
2
+ require 'bundler/setup'
3
+ Bundler.require
4
+
5
+ class MyETL < DataDuck::ETL
6
+ destination :my_destination
7
+ end
8
+
9
+ etl = MyETL.new
10
+ etl.process!
@@ -0,0 +1,7 @@
1
+ class <%= table_name %> < DataDuck::Table
2
+ source :my_database, ["<%= columns.map { |col| col[0] }.join('", "') %>"]
3
+
4
+ output({<% columns.each do |col| %>
5
+ <%= '# ' if col[2] %>:<%= col[0] %> => :<%= col[1] %>,<% end %>
6
+ })
7
+ end
Binary file
metadata ADDED
@@ -0,0 +1,178 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: dataduck
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.2.0
5
+ platform: ruby
6
+ authors:
7
+ - Jeff Pickhardt
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-10-10 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.6'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.6'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '10.0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '10.0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rspec
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '3.3'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '3.3'
55
+ - !ruby/object:Gem::Dependency
56
+ name: sequel
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '4.19'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '4.19'
69
+ - !ruby/object:Gem::Dependency
70
+ name: pg
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '0.16'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '0.16'
83
+ - !ruby/object:Gem::Dependency
84
+ name: aws-sdk
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - "~>"
88
+ - !ruby/object:Gem::Version
89
+ version: '2.0'
90
+ type: :runtime
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
95
+ - !ruby/object:Gem::Version
96
+ version: '2.0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: sequel-redshift
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ">="
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ type: :runtime
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ">="
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
111
+ description: A straightforward, effective ETL framework.
112
+ email:
113
+ - pickhardt@gmail.com
114
+ - admin@dataducketl.com
115
+ executables:
116
+ - dataduck
117
+ extensions: []
118
+ extra_rdoc_files: []
119
+ files:
120
+ - ".gitignore"
121
+ - ".rspec"
122
+ - ".ruby-version"
123
+ - Gemfile
124
+ - README.md
125
+ - Rakefile
126
+ - bin/console
127
+ - bin/dataduck
128
+ - bin/setup
129
+ - dataduck.gemspec
130
+ - examples/example/.gitignore
131
+ - examples/example/.ruby-version
132
+ - examples/example/Gemfile
133
+ - examples/example/README.md
134
+ - examples/example/config/replace_me.yml
135
+ - examples/example/src/main.rb
136
+ - examples/example/src/tables/games.rb
137
+ - examples/example/src/tables/users.rb
138
+ - lib/dataduck.rb
139
+ - lib/dataduck/commands.rb
140
+ - lib/dataduck/destination.rb
141
+ - lib/dataduck/etl.rb
142
+ - lib/dataduck/mysql_source.rb
143
+ - lib/dataduck/postgresql_source.rb
144
+ - lib/dataduck/redshift_destination.rb
145
+ - lib/dataduck/s3_object.rb
146
+ - lib/dataduck/source.rb
147
+ - lib/dataduck/sql_db_source.rb
148
+ - lib/dataduck/table.rb
149
+ - lib/dataduck/util.rb
150
+ - lib/dataduck/version.rb
151
+ - lib/helpers/module_vars.rb
152
+ - lib/templates/quickstart/main.rb.erb
153
+ - lib/templates/quickstart/table.rb.erb
154
+ - static/logo.png
155
+ homepage: http://dataducketl.com/
156
+ licenses: []
157
+ metadata: {}
158
+ post_install_message:
159
+ rdoc_options: []
160
+ require_paths:
161
+ - lib
162
+ required_ruby_version: !ruby/object:Gem::Requirement
163
+ requirements:
164
+ - - ">="
165
+ - !ruby/object:Gem::Version
166
+ version: '0'
167
+ required_rubygems_version: !ruby/object:Gem::Requirement
168
+ requirements:
169
+ - - ">="
170
+ - !ruby/object:Gem::Version
171
+ version: '0'
172
+ requirements: []
173
+ rubyforge_project:
174
+ rubygems_version: 2.4.8
175
+ signing_key:
176
+ specification_version: 4
177
+ summary: A straightforward, effective ETL framework.
178
+ test_files: []