dataduck 0.2.0

checksums.yaml.gz ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA1:
+   metadata.gz: 0f87bbaf674b1943242d3ea173a5e34fe00e0724
+   data.tar.gz: b8fbacadd9323ab917498712c8d4f39f1f5ca907
+ SHA512:
+   metadata.gz: 40bbfce9c990d1542236c59967c31fe3fe5982c84bed12ccaf604c7ce15f2cebc5432b865dfedac5e95607dc37f20e0d681462ee9e7936e30ffdce8361688c96
+   data.tar.gz: fdc25e1ddf3a00faeceb13f11f4b7452f4085b3c0e5ca805137cc21174727aabec052dd335c371cd21c78db7ca0af7ef4e5a93f2876df9abfff531ad90bc8612
data/.gitignore ADDED
@@ -0,0 +1,14 @@
+ /.bundle/
+ /.yardoc
+ /Gemfile.lock
+ /_yardoc/
+ /coverage/
+ /doc/
+ /pkg/
+ /spec/reports/
+ /tmp/
+ log
+ .DS_Store
+ *.lock
+ .idea
+ *.gem
data/.rspec ADDED
@@ -0,0 +1,2 @@
+ --format documentation
+ --color
data/.ruby-version ADDED
@@ -0,0 +1 @@
+ ruby-2.1.2
data/Gemfile ADDED
@@ -0,0 +1,4 @@
+ source 'https://rubygems.org'
+
+ # Specify your gem's dependencies in dataduck.gemspec
+ gemspec
data/README.md ADDED
@@ -0,0 +1,87 @@
+ # DataDuck ETL
+
+ ##### Set up in under 5 minutes
+
+ DataDuck ETL is probably the quickest extract-transform-load framework to set up. If you want to set up a data warehouse, give DataDuck ETL a try.
+
+ ##### Extract-transform-load to Amazon Redshift
+
+ DataDuck ETL is currently focused on loading to Amazon Redshift (through Amazon S3).
+
+ ![DataDuck ETL](static/logo.png "DataDuck ETL")
+
+ ## Installation
+
+ ##### Example project
+
+ See [https://github.com/DataDuckETL/DataDuck/tree/master/examples/example](https://github.com/DataDuckETL/DataDuck/tree/master/examples/example) for an example project setup.
+
+ ##### Instructions for using DataDuck ETL
+
+ Create a new project, then add the following to your Gemfile:
+
+ ```ruby
+ gem 'dataduck', :git => 'git://github.com/DataDuckETL/DataDuck.git'
+ ```
+
+ Then execute:
+
+     $ bundle install
+
+ Finally, run the quickstart command:
+
+     $ dataduck quickstart
+
+ The quickstart wizard will ask you for credentials to your database, then create the basic setup for your project. After the setup, run your project's ETL with `ruby src/main.rb`.
+
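For reference, here's a minimal sketch of a working `src/main.rb`, mirroring the example project bundled with this gem (the `MyCompanyETL` class name and `:main_destination` destination name are placeholders to adapt):

```ruby
require 'rubygems'
require 'bundler/setup'
Bundler.require

# DataDuck::ETL autoloads any table classes it finds in src/tables/,
# so tables only need to be defined in that directory.
class MyCompanyETL < DataDuck::ETL
  # Must match a destination name in config/secret/<environment>.yml
  destination :main_destination
end

etl = MyCompanyETL.new
etl.process!
```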
+ If you'd like to run this regularly, such as every night, it's recommended to use the [whenever](https://github.com/javan/whenever) gem to manage a cron job that runs the ETL.
+
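As a sketch (assuming the whenever gem is installed, and with `/path/to/your/project` as a placeholder for your project directory), the corresponding `config/schedule.rb` could look like:

```ruby
# config/schedule.rb -- run `whenever --update-crontab` to install the cron entry.
every 1.day, at: '2:00 am' do
  command "cd /path/to/your/project && bundle exec ruby src/main.rb"
end
```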
+ ## Documentation
+
+ Tables are defined in their own file under /src/tables. Here's an example table:
+
+ ```ruby
+ class Decks < DataDuck::Table
+   source :my_database, ["id", "name", "user_id", "cards",
+       "num_wins", "num_losses", "created_at", "updated_at",
+       "is_drafted", "num_draft_wins", "num_draft_losses"]
+
+   transforms :calculate_num_totals
+
+   validates :validates_num_total
+
+   output({
+     :id => :integer,
+     :name => :string,
+     :user_id => :integer,
+     :num_wins => :integer,
+     :num_losses => :integer,
+     :num_total => :integer,
+     :num_draft_total => :integer,
+     :created_at => :datetime,
+     :updated_at => :datetime,
+     :is_drafted => :boolean,
+     # Note that num_draft_wins and num_draft_losses
+     # are not included in the output, but are used in
+     # the transformation.
+   })
+
+   def calculate_num_totals(row)
+     row[:num_total] = row[:num_wins] + row[:num_losses]
+     row[:num_draft_total] = row[:num_draft_wins] + row[:num_draft_losses]
+     row
+   end
+
+   def validates_num_total(row)
+     return "Deck id #{ row[:id] } has negative value #{ row[:num_total] } for num_total." if row[:num_total] < 0
+   end
+ end
+ ```
+
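A note on the method contracts above: a transform receives a row hash and must return it (the returned value replaces the row), while a validation returns an error string for a bad row and `nil` otherwise. A standalone sketch with a hypothetical row:

```ruby
# Hypothetical row as extracted from the source.
row = { id: 7, num_wins: 3, num_losses: 2, num_draft_wins: 1, num_draft_losses: 0 }

# Transform: columns omitted from the output schema (num_draft_wins,
# num_draft_losses) are still readable here; they just won't be loaded.
row[:num_total] = row[:num_wins] + row[:num_losses]                    # => 5
row[:num_draft_total] = row[:num_draft_wins] + row[:num_draft_losses] # => 1

# Validation: any truthy return value is collected as an error.
error = row[:num_total] < 0 ? "num_total is negative" : nil           # => nil
```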
+ ## Contributing
+
+ To contribute, get in touch at http://DataDuckETL.com/ so that we can share the [Contributor License Agreement (CLA)](https://en.wikipedia.org/wiki/Contributor_License_Agreement) with you, then create a pull request.
+
+ ## License
+
+ Get in touch or visit [http://dataducketl.com/licensing](http://dataducketl.com/licensing) for licensing details.
data/Rakefile ADDED
@@ -0,0 +1,6 @@
+ require "bundler/gem_tasks"
+ require "rspec/core/rake_task"
+
+ RSpec::Core::RakeTask.new(:spec)
+
+ task :default => :spec
data/bin/console ADDED
@@ -0,0 +1,14 @@
+ #!/usr/bin/env ruby
+
+ require "bundler/setup"
+ require "dataduck"
+
+ # You can add fixtures and/or initialization code here to make experimenting
+ # with your gem easier. You can also use a different console, if you like.
+
+ # (If you use this, don't forget to add pry to your Gemfile!)
+ # require "pry"
+ # Pry.start
+
+ require "irb"
+ IRB.start
data/bin/dataduck ADDED
@@ -0,0 +1,6 @@
+ #!/usr/bin/env ruby
+
+ require_relative "../lib/dataduck"
+ require_relative "../lib/dataduck/commands"
+
+ DataDuck::Commands.route_command(ARGV)
data/bin/setup ADDED
@@ -0,0 +1,7 @@
+ #!/bin/bash
+ set -euo pipefail
+ IFS=$'\n\t'
+
+ bundle install
+
+ # Do any other automated setup that you need to do here
data/dataduck.gemspec ADDED
@@ -0,0 +1,28 @@
+ # coding: utf-8
+ lib = File.expand_path('../lib', __FILE__)
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+ require 'dataduck/version'
+
+ Gem::Specification.new do |spec|
+   spec.authors = ["Jeff Pickhardt"]
+   spec.description = "A straightforward, effective ETL framework."
+   spec.email = ["pickhardt@gmail.com", "admin@dataducketl.com"]
+   spec.executables = ["dataduck"]
+   spec.homepage = "http://dataducketl.com/"
+   spec.name = "dataduck"
+   spec.summary = "A straightforward, effective ETL framework."
+   spec.version = DataDuck::VERSION
+
+   spec.files = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
+   spec.bindir = "bin"
+   spec.require_paths = ["lib"]
+
+   spec.add_development_dependency "bundler", "~> 1.6"
+   spec.add_development_dependency "rake", "~> 10.0"
+   spec.add_development_dependency "rspec", "~> 3.3"
+
+   spec.add_runtime_dependency "sequel", '~> 4.19'
+   spec.add_runtime_dependency "pg", '~> 0.16'
+   spec.add_runtime_dependency "aws-sdk", "~> 2.0"
+   spec.add_runtime_dependency "sequel-redshift"
+ end
data/examples/example/.gitignore ADDED
@@ -0,0 +1,5 @@
+ .idea
+ .DS_Store
+ config/secrets.yml
+ config/secret
+ *.lock
data/examples/example/.ruby-version ADDED
@@ -0,0 +1 @@
+ ruby-2.1.2
data/examples/example/Gemfile ADDED
@@ -0,0 +1,5 @@
+ source 'https://rubygems.org'
+
+ gem 'dataduck', :git => 'git://github.com/DataDuckETL/DataDuck.git'
+
+ ruby '2.1.2'
data/examples/example/README.md ADDED
@@ -0,0 +1,11 @@
+ # DataDuck ETL Example
+
+ This is an example project showing how to set up [DataDuck ETL](http://dataducketl.com/).
+
+ # Instructions
+
+ Copy /config/replace_me.yml to /config/secret/development.yml, then replace the placeholder secrets with your AWS and database connection details.
+
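In shell terms, from the project root that's (a sketch; adjust the target filename for your environment):

    $ mkdir -p config/secret
    $ cp config/replace_me.yml config/secret/development.yml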
+ For each table you want to import, create a table file in /src/tables. You can use /src/tables/games.rb and /src/tables/users.rb as examples. (Be sure to delete, modify, or rename games.rb and users.rb, though; otherwise DataDuck ETL will try to load them.)
+
+ For further help, reach out at [http://dataducketl.com/](http://dataducketl.com/).
data/examples/example/config/replace_me.yml ADDED
@@ -0,0 +1,22 @@
+ # Move this file to /config/secret/development.yml and /config/secret/production.yml
+ destinations:
+   main_destination:
+     type: redshift
+     aws_key: YOUR_AWS_KEY
+     aws_secret: YOUR_AWS_SECRET
+     s3_bucket: YOUR_BUCKET
+     s3_region: YOUR_BUCKET_REGION
+     host: redshift.somekeygoeshere.us-west-2.redshift.amazonaws.com
+     port: 5439
+     database: main
+     schema: public
+     username: YOUR_USERNAME
+     password: YOUR_PASSWORD
+ sources:
+   my_database:
+     type: postgresql
+     host: some.host.goes.here.com
+     database: db_name_goes_here
+     port: 5522
+     username: some_username_goes_here_probably_read_only
+     password: some_password_goes_here
data/examples/example/src/main.rb ADDED
@@ -0,0 +1,13 @@
+ require 'rubygems'
+ require 'bundler/setup'
+ Bundler.require
+
+ require_relative "tables/games"
+ require_relative "tables/users"
+
+ class MyCompanyETL < DataDuck::ETL
+   destination :main_destination
+ end
+
+ etl = MyCompanyETL.new
+ etl.process!
data/examples/example/src/tables/games.rb ADDED
@@ -0,0 +1,10 @@
+ class Games < DataDuck::Table
+   source :my_database, [:id, :first_user_id, :second_user_id, :game_type]
+
+   output({
+     :id => :integer,
+     :first_user_id => :integer,
+     :second_user_id => :integer,
+     :game_type => :string,
+   })
+ end
data/examples/example/src/tables/users.rb ADDED
@@ -0,0 +1,16 @@
+ class Users < DataDuck::Table
+   source :my_database, [:id, :username, :rating, :credits]
+
+   validate :non_negative_credits
+
+   output({
+     :id => :integer,
+     :username => :string,
+     :rating => :integer,
+     :credits => :integer,
+   })
+
+   def non_negative_credits(row)
+     return "User id #{ row[:id] } has negative value of #{ row[:credits] } for credits." if row[:credits] < 0
+   end
+ end
data/lib/dataduck.rb ADDED
@@ -0,0 +1,29 @@
+ Dir[File.dirname(__FILE__) + '/helpers/*.rb'].each do |file|
+   require file
+ end
+
+ Dir[File.dirname(__FILE__) + '/dataduck/*.rb'].each do |file|
+   require file
+ end
+
+ require 'yaml'
+
+ module DataDuck
+   extend ModuleVars
+
+   ENV['DATADUCK_ENV'] ||= "development"
+   create_module_var("environment", ENV['DATADUCK_ENV'])
+
+   spec = Gem::Specification.find_by_name("dataduck")
+   create_module_var("gem_root", spec.gem_dir)
+
+   create_module_var("project_root", Dir.getwd)
+   create_module_var("config", {})
+
+   dd_env_path = DataDuck.project_root + "/config/secret/#{ ENV['DATADUCK_ENV'] }.yml"
+   env_config = File.exist?(dd_env_path) ? YAML.load_file(dd_env_path) : {}
+   DataDuck.config.merge!(env_config)
+
+   create_module_var("sources", {})
+   create_module_var("destinations", {})
+ end
data/lib/dataduck/commands.rb ADDED
@@ -0,0 +1,166 @@
+ require 'erb'
+ require 'yaml'
+ require 'fileutils'
+ require 'io/console' # provides IO#noecho, used when reading the password
+
+ module DataDuck
+   class Commands
+     class Namespace
+       def initialize(hash = {})
+         hash.each do |key, value|
+           singleton_class.send(:define_method, key) { value }
+         end
+       end
+
+       def get_binding
+         binding
+       end
+     end
+
+     def self.acceptable_commands
+       ['console', 'quickstart']
+     end
+
+     def self.route_command(args)
+       if args.length == 0
+         return DataDuck::Commands.help
+       end
+
+       command = args[0]
+       if !Commands.acceptable_commands.include?(command)
+         puts "No such command: #{ command }"
+         return DataDuck::Commands.help
+       end
+
+       DataDuck::Commands.public_send(command)
+     end
+
+     def self.console
+       require "irb"
+       IRB.start
+     end
+
+     def self.help
+       puts "Usage: dataduck [#{ DataDuck::Commands.acceptable_commands.join(' | ') }]"
+     end
+
+     def self.quickstart
+       puts "Welcome to DataDuck!"
+       puts "This quickstart wizard will create your application, assuming the source is a Postgres database and the destination is an Amazon Redshift data warehouse."
+
+       puts "Enter the source (Postgres database) hostname:"
+       source_host = STDIN.gets.strip
+
+       puts "Enter the name of the database when connecting to #{ source_host }:"
+       source_database = STDIN.gets.strip
+
+       puts "Enter the source's port:"
+       source_port = STDIN.gets.strip.to_i
+
+       puts "Enter the username:"
+       source_username = STDIN.gets.strip
+
+       puts "Enter the password:"
+       source_password = STDIN.noecho(&:gets).chomp
+
+       db_source = DataDuck::PostgresqlSource.new({
+         'type' => 'postgresql',
+         'host' => source_host,
+         'database' => source_database,
+         'port' => source_port,
+         'username' => source_username,
+         'password' => source_password,
+       })
+
+       puts "Connecting to source database..."
+       table_names = db_source.table_names
+       puts "Connection successful. Detected #{ table_names.length } tables."
+       puts "Creating scaffolding..."
+       table_names.each do |table_name|
+         DataDuck::Commands.quickstart_create_table(table_name, db_source)
+       end
+
+       config_obj = {
+         'sources' => {
+           'my_database' => {
+             'type' => 'postgresql',
+             'host' => source_host,
+             'database' => source_database,
+             'port' => source_port,
+             'username' => source_username,
+             'password' => source_password,
+           }
+         },
+         'destinations' => {
+           'my_destination' => {
+             'type' => 'redshift',
+             'aws_key' => 'YOUR_AWS_KEY',
+             'aws_secret' => 'YOUR_AWS_SECRET',
+             's3_bucket' => 'YOUR_BUCKET',
+             's3_region' => 'YOUR_BUCKET_REGION',
+             'host' => 'redshift.somekeygoeshere.us-west-2.redshift.amazonaws.com',
+             'port' => 5439,
+             'database' => 'main',
+             'schema' => 'public',
+             'username' => 'YOUR_USERNAME',
+             'password' => 'YOUR_PASSWORD',
+           }
+         }
+       }
+
+       DataDuck::Commands.quickstart_save_file("#{ DataDuck.project_root }/config/secret/#{ DataDuck.environment }.yml", config_obj.to_yaml)
+       DataDuck::Commands.quickstart_save_main
+       DataDuck::Commands.quickstart_update_gitignore
+
+       puts "Quickstart complete!"
+       puts "You still need to edit your config/secret/*.yml file with your AWS and Redshift credentials."
+       puts "Run your ETL with: ruby src/main.rb"
+     end
+
+     def self.quickstart_update_gitignore
+       main_gitignore_path = "#{ DataDuck.project_root }/.gitignore"
+       FileUtils.touch(main_gitignore_path)
+
+       secret_gitignore_path = "#{ DataDuck.project_root }/config/secret/.gitignore"
+       FileUtils.touch(secret_gitignore_path)
+       output = File.open(secret_gitignore_path, "w")
+       output << '[^.]*'
+       output.close
+     end
+
+     def self.quickstart_create_table(table_name, db)
+       columns = []
+       schema = db.schema(table_name)
+       schema.each do |property_schema|
+         property_name = property_schema[0]
+         property_type = property_schema[1][:type]
+         commented_out = ['ssn', 'socialsecurity', 'password', 'encrypted_password', 'salt', 'password_salt', 'pw'].include?(property_name.to_s.downcase)
+         columns << [property_name.to_s, property_type.to_s, commented_out]
+       end
+
+       table_name = table_name.to_s.downcase
+       table_name_camelcased = table_name.split('_').collect(&:capitalize).join
+       namespace = Namespace.new(table_name: table_name_camelcased, columns: columns)
+       template = File.read("#{ DataDuck.gem_root }/lib/templates/quickstart/table.rb.erb")
+       result = ERB.new(template).result(namespace.get_binding)
+       DataDuck::Commands.quickstart_save_file("#{ DataDuck.project_root }/src/tables/#{ table_name }.rb", result)
+     end
+
+     def self.quickstart_save_file(output_path_full, contents)
+       *output_path, output_filename = output_path_full.split('/')
+       output_path = output_path.join("/")
+       FileUtils::mkdir_p(output_path)
+
+       output = File.open(output_path_full, "w")
+       output << contents
+       output.close
+     end
+
+     def self.quickstart_save_main
+       namespace = Namespace.new
+       template = File.read("#{ DataDuck.gem_root }/lib/templates/quickstart/main.rb.erb")
+       result = ERB.new(template).result(namespace.get_binding)
+       DataDuck::Commands.quickstart_save_file("#{ DataDuck.project_root }/src/main.rb", result)
+     end
+   end
+ end
data/lib/dataduck/destination.rb ADDED
@@ -0,0 +1,40 @@
+ module DataDuck
+   class Destination
+     def self.destination_config(name)
+       if DataDuck.config['destinations'].nil? || DataDuck.config['destinations'][name.to_s].nil?
+         raise Exception.new("Could not find destination #{ name } in destinations configs.")
+       end
+
+       DataDuck.config['destinations'][name.to_s]
+     end
+
+     def load_tables!(tables)
+       raise Exception.new("Must implement load_tables! in subclass")
+     end
+
+     def before_all_loads!(tables)
+       # Hook for subclasses, e.g. setup work before any tables load.
+     end
+
+     def after_all_loads!(tables)
+       # Hook for subclasses, e.g. cleanup after all tables load.
+     end
+
+     def self.destination(destination_name)
+       destination_name = destination_name.to_s
+
+       if DataDuck.destinations[destination_name]
+         return DataDuck.destinations[destination_name]
+       end
+
+       destination_configuration = DataDuck::Destination.destination_config(destination_name)
+       destination_type = destination_configuration['type']
+       if destination_type == "redshift"
+         DataDuck.destinations[destination_name] = DataDuck::RedshiftDestination.new(destination_configuration)
+         return DataDuck.destinations[destination_name]
+       else
+         raise ArgumentError.new("Unknown type '#{ destination_type }' for destination #{ destination_name }.")
+       end
+     end
+   end
+ end
data/lib/dataduck/etl.rb ADDED
@@ -0,0 +1,49 @@
+ require_relative 'redshift_destination.rb'
+
+ module DataDuck
+   class ETL
+     class << self
+       attr_accessor :destinations
+     end
+
+     def self.destination(destination_name)
+       self.destinations ||= []
+       self.destinations << DataDuck::Destination.destination(destination_name)
+     end
+
+     def initialize(options = {})
+       @tables = options[:tables] || []
+
+       @autoload_tables = options[:autoload_tables].nil? ? true : options[:autoload_tables]
+       if @autoload_tables
+         Dir[DataDuck.project_root + "/src/tables/*.rb"].each do |file|
+           table_name_underscores = file.split("/").last.gsub(".rb", "")
+           table_name_camelized = DataDuck::Util.underscore_to_camelcase(table_name_underscores)
+           require file
+           table = Object.const_get(table_name_camelized)
+           if table <= DataDuck::Table
+             @tables << table
+           end
+         end
+       end
+     end
+
+     def process!
+       puts "Processing ETL..."
+
+       table_instances = []
+       @tables.each do |table_class|
+         table_instance = table_class.new
+         table_instances << table_instance
+         table_instance.extract!
+         table_instance.transform!
+       end
+
+       self.class.destinations.each do |destination|
+         destination.before_all_loads!(table_instances)
+         destination.load_tables!(table_instances)
+         destination.after_all_loads!(table_instances)
+       end
+     end
+   end
+ end
data/lib/dataduck/mysql_source.rb ADDED
@@ -0,0 +1,11 @@
+ require_relative 'sql_db_source.rb'
+
+ require 'sequel'
+
+ module DataDuck
+   class MysqlSource < DataDuck::SqlDbSource
+     def db_type
+       'mysql'
+     end
+   end
+ end
data/lib/dataduck/postgresql_source.rb ADDED
@@ -0,0 +1,11 @@
+ require_relative 'sql_db_source.rb'
+
+ require 'sequel'
+
+ module DataDuck
+   class PostgresqlSource < DataDuck::SqlDbSource
+     def db_type
+       'postgres'
+     end
+   end
+ end
data/lib/dataduck/redshift_destination.rb ADDED
@@ -0,0 +1,177 @@
+ require_relative 'destination.rb'
+
+ module DataDuck
+   class RedshiftDestination < DataDuck::Destination
+     def initialize(config)
+       @aws_key = config['aws_key']
+       @aws_secret = config['aws_secret']
+       @s3_bucket = config['s3_bucket']
+       @s3_region = config['s3_region']
+       @host = config['host']
+       @port = config['port']
+       @database = config['database']
+       @schema = config['schema']
+       @username = config['username']
+       @password = config['password']
+       @redshift_connection = nil
+     end
+
+     def connection
+       @redshift_connection ||= Sequel.connect("redshift://#{ @username }:#{ @password }@#{ @host }:#{ @port }/#{ @database }" +
+           "?force_standard_strings=f",
+         :client_min_messages => '',
+         :force_standard_strings => false
+       )
+     end
+
+     def copy_query(table, s3_path)
+       properties_joined_string = "\"#{ table.output_column_names.join('","') }\""
+       query_fragments = []
+       query_fragments << "COPY #{ self.staging_table_name(table) } (#{ properties_joined_string })"
+       query_fragments << "FROM '#{ s3_path }'"
+       query_fragments << "CREDENTIALS 'aws_access_key_id=#{ @aws_key };aws_secret_access_key=#{ @aws_secret }'"
+       query_fragments << "REGION '#{ @s3_region }'"
+       query_fragments << "CSV TRUNCATECOLUMNS ACCEPTINVCHARS EMPTYASNULL"
+       query_fragments << "DATEFORMAT 'auto'"
+       return query_fragments.join(" ")
+     end
+
+     def create_columns_on_data_warehouse!(table)
+       columns = get_columns_in_data_warehouse(table)
+       column_names = columns.map { |col| col[:name].to_s }
+       table.output_schema.map do |name, data_type|
+         if !column_names.include?(name.to_s)
+           redshift_data_type = data_type.to_s
+           redshift_data_type = 'varchar(255)' if redshift_data_type == 'string'
+           self.run_query("ALTER TABLE #{ table.name } ADD #{ name } #{ redshift_data_type }")
+         end
+       end
+     end
+
+     def create_table_query(table, table_name = nil)
+       table_name ||= table.name
+       props_array = table.output_schema.map do |name, data_type|
+         redshift_data_type = data_type.to_s
+         redshift_data_type = 'varchar(255)' if redshift_data_type == 'string'
+         "\"#{ name }\" #{ redshift_data_type }"
+       end
+       props_string = props_array.join(', ')
+       "CREATE TABLE IF NOT EXISTS #{ table_name } (#{ props_string })"
+     end
+
+     def create_output_table_on_data_warehouse!(table)
+       self.run_query(self.create_table_query(table))
+       self.create_columns_on_data_warehouse!(table)
+     end
+
+     def create_staging_table!(table)
+       table_name = self.staging_table_name(table)
+       self.drop_staging_table!(table)
+       self.run_query(self.create_table_query(table, table_name))
+     end
+
+     def data_as_csv_string(data, property_names)
+       data_string_components = [] # for performance reasons, join strings this way
+       data.each do |result|
+         property_names.each_with_index do |property_name, index|
+           value = result[property_name.to_sym]
+
+           if index == 0
+             data_string_components << '"'
+           end
+
+           data_string_components << DataDuck::RedshiftDestination.value_to_string(value)
+
+           if index == property_names.length - 1
+             data_string_components << '"'
+           else
+             data_string_components << '","'
+           end
+         end
+         data_string_components << "\n"
+       end
+
+       return data_string_components.join
+     end
+
+     def drop_staging_table!(table)
+       self.run_query("DROP TABLE IF EXISTS #{ self.staging_table_name(table) }")
+     end
+
+     def get_columns_in_data_warehouse(table)
+       # "column" is a reserved word, so it must be quoted in the query.
+       query = "SELECT pg_table_def.\"column\" AS name, type AS data_type, distkey, sortkey FROM pg_table_def WHERE tablename='#{ table.name }'"
+       results = self.run_query(query)
+
+       columns = []
+       results.each do |result|
+         columns << {
+           name: result[:name],
+           data_type: result[:data_type],
+           distkey: result[:distkey],
+           sortkey: result[:sortkey]
+         }
+       end
+
+       return columns
+     end
+
+     def merge_from_staging!(table)
+       # Following guidelines in http://docs.aws.amazon.com/redshift/latest/dg/merge-examples.html
+       staging_name = self.staging_table_name(table)
+       delete_query = "DELETE FROM #{ table.name } USING #{ staging_name } WHERE #{ table.name }.id = #{ staging_name }.id" # TODO: allow custom or multiple keys
+       self.run_query(delete_query)
+       insert_query = "INSERT INTO #{ table.name } (\"#{ table.output_column_names.join('","') }\") SELECT \"#{ table.output_column_names.join('","') }\" FROM #{ staging_name }"
+       self.run_query(insert_query)
+     end
+
+     def run_query(sql)
+       self.connection[sql].map { |elem| elem }
+     end
+
+     def staging_table_name(table)
+       "zz_dataduck_#{ table.name }"
+     end
+
+     def upload_table_to_s3!(table)
+       now_epoch = Time.now.to_i.to_s
+       filepath = "pending/#{ table.name.downcase }_#{ now_epoch }.csv"
+
+       table_csv = self.data_as_csv_string(table.data, table.output_column_names)
+
+       s3_obj = S3Object.new(filepath, table_csv, @aws_key, @aws_secret,
+         @s3_bucket, @s3_region)
+       s3_obj.upload!
+       return s3_obj
+     end
+
+     def before_all_loads!(tables)
+       # No setup needed before loading; hook kept for the Destination interface.
+     end
+
+     def load_tables!(tables)
+       tables.each do |table|
+         puts "Loading table #{ table.name }..."
+         s3_object = self.upload_table_to_s3!(table)
+         self.create_staging_table!(table)
+         self.create_output_table_on_data_warehouse!(table)
+         self.run_query(self.copy_query(table, s3_object.s3_path))
+         self.merge_from_staging!(table)
+         self.drop_staging_table!(table)
+       end
+     end
+
+     def after_all_loads!(tables)
+       # No cleanup needed after loading; hook kept for the Destination interface.
+     end
+
+     def self.value_to_string(value)
+       string_value = ''
+       if value.respond_to? :to_s
+         string_value = value.to_s
+       end
+       string_value.gsub!('"', '""')
+       return string_value
+     end
+   end
+ end
data/lib/dataduck/s3_object.rb ADDED
@@ -0,0 +1,67 @@
+ require 'aws-sdk'
+
+ module DataDuck
+   class S3Object
+     def initialize(path, contents, aws_key, aws_secret, bucket, region, options={})
+       @path = path
+       @contents = contents
+       @options = options
+       @aws_key = aws_key
+       @aws_secret = aws_secret
+       @bucket = bucket
+       @region = region
+     end
+
+     def upload!
+       s3 = Aws::S3::Client.new(
+         region: @region,
+         access_key_id: @aws_key,
+         secret_access_key: @aws_secret,
+       )
+
+       put_hash = @options.merge({
+         acl: 'private',
+         bucket: @bucket,
+         body: @contents,
+         key: self.full_path,
+         server_side_encryption: 'AES256',
+       })
+
+       attempts = 0
+       begin
+         attempts += 1
+         s3.put_object(put_hash)
+       rescue StandardError => e
+         # Retry transient upload failures, then re-raise once retries are exhausted.
+         retry if attempts < S3Object.max_retries
+         raise e
+       end
+     end
+
+     def full_path
+       'dataduck/' + @path
+     end
+
+     def s3_path
+       "s3://#{ @bucket }/#{ full_path }"
+     end
+
+     def self.max_retries
+       3
+     end
+
+     def self.regions
+       [
+         { name: 'US Standard - N. Virginia', region: 'us-east-1' },
+         { name: 'US West - N. California', region: 'us-west-1' },
+         { name: 'US West - Oregon', region: 'us-west-2' },
+         { name: 'EU - Ireland', region: 'eu-west-1' },
+         { name: 'EU - Frankfurt', region: 'eu-central-1' },
+         { name: 'Asia Pacific - Singapore', region: 'ap-southeast-1' },
+         { name: 'Asia Pacific - Sydney', region: 'ap-southeast-2' },
+         { name: 'Asia Pacific - Tokyo', region: 'ap-northeast-1' },
+         { name: 'South America - Sao Paulo', region: 'sa-east-1' },
+       ]
+     end
+   end
+ end
data/lib/dataduck/source.rb ADDED
@@ -0,0 +1,46 @@
+ module DataDuck
+
+   class Source
+     def self.source_config(name)
+       if DataDuck.config['sources'].nil? || DataDuck.config['sources'][name.to_s].nil?
+         raise Exception.new("Could not find source #{ name } in source configs.")
+       end
+
+       DataDuck.config['sources'][name.to_s]
+     end
+
+     def self.source(name)
+       name = name.to_s
+
+       if DataDuck.sources[name]
+         return DataDuck.sources[name]
+       end
+
+       configuration = DataDuck::Source.source_config(name)
+       source_type = configuration['type']
+
+       if source_type == "postgresql"
+         DataDuck.sources[name] = DataDuck::PostgresqlSource.new(configuration)
+         return DataDuck.sources[name]
+       else
+         raise ArgumentError.new("Unknown type '#{ source_type }' for source #{ name }.")
+       end
+     end
+
+     def connection
+       raise Exception.new("Must implement connection in subclass.")
+     end
+
+     def query(sql)
+       raise Exception.new("Must implement query in subclass.")
+     end
+
+     def schema(table_name)
+       self.connection.schema(table_name)
+     end
+
+     def self.skip_these_table_names
+       [:delayed_jobs, :schema_migrations]
+     end
+   end
+ end
data/lib/dataduck/sql_db_source.rb ADDED
@@ -0,0 +1,41 @@
+ require_relative 'source.rb'
+
+ require 'sequel'
+
+ module DataDuck
+   class SqlDbSource < DataDuck::Source
+     def initialize(data)
+       @host = data['host']
+       @port = data['port']
+       @username = data['username']
+       @password = data['password']
+       @database = data['database']
+       @initialized_db_type = data['db_type']
+     end
+
+     def connection
+       @connection ||= Sequel.connect(
+         adapter: self.db_type,
+         user: @username,
+         host: @host,
+         database: @database,
+         password: @password,
+         port: @port
+       )
+     end
+
+     def db_type
+       return @initialized_db_type if @initialized_db_type
+
+       raise Exception.new("Abstract method db_type must be overwritten by subclass, or passed as data when initializing.")
+     end
+
+     def table_names
+       self.connection.tables.reject { |table| DataDuck::Source.skip_these_table_names.include?(table) }
+     end
+
+     def query(sql)
+       self.connection.fetch(sql).all
+     end
+   end
+ end
data/lib/dataduck/table.rb ADDED
@@ -0,0 +1,82 @@
+ module DataDuck
+   class Table
+     class << self
+       attr_accessor :sources
+       attr_accessor :output_schema
+       attr_accessor :actions
+     end
+
+     attr_accessor :data
+     attr_accessor :errors
+
+     def self.transforms(transformation_name)
+       self.actions ||= []
+       self.actions << [:transform, transformation_name]
+     end
+     singleton_class.send(:alias_method, :transform, :transforms)
+
+     def self.validates(validation_name)
+       self.actions ||= []
+       self.actions << [:validate, validation_name]
+     end
+     singleton_class.send(:alias_method, :validate, :validates)
+
+     def self.source(source_name, source_data = [])
+       self.sources ||= {}
+       source = DataDuck::Source.source(source_name)
+       self.sources[source] = source_data
+     end
+
+     def self.output(schema)
+       self.output_schema ||= {}
+       self.output_schema.merge!(schema)
+     end
+
+     def actions
+       self.class.actions
+     end
+
+     def output_schema
+       self.class.output_schema
+     end
+
+     def output_column_names
+       self.class.output_schema.keys.sort
+     end
+
+     def extract!
+       puts "Extracting table #{ self.name }..."
+
+       self.errors ||= []
+       self.data = []
+       self.class.sources.each_pair do |source, source_columns|
+         import_query = "SELECT \"#{ source_columns.sort.join('","') }\" FROM #{ self.name }"
+         results = source.query(import_query)
+         self.data = results
+       end
+       self.data
+     end
+
+     def transform!
+       puts "Transforming table #{ self.name }..."
+
+       self.errors ||= []
+       self.actions.each do |action|
+         action_type = action[0]
+         action_method_name = action[1]
+         if action_type == :transform
+           self.data.map! { |row| self.public_send(action_method_name, row) }
+         elsif action_type == :validate
+           self.data.each do |row|
+             error = self.public_send(action_method_name, row)
+             self.errors << error if error
+           end
+         end
+       end
+     end
+
+     def name
+       DataDuck::Util.camelcase_to_underscore(self.class.name)
+     end
+   end
+ end
data/lib/dataduck/util.rb ADDED
@@ -0,0 +1,15 @@
+ module DataDuck
+   class Util
+     def self.underscore_to_camelcase(str)
+       str.split('_').map{ |chunk| chunk.capitalize }.join
+     end
+
+     def self.camelcase_to_underscore(str)
+       str.gsub(/::/, '/')
+          .gsub(/([A-Z]+)([A-Z][a-z])/,'\1_\2')
+          .gsub(/([a-z\d])([A-Z])/,'\1_\2')
+          .tr("-", "_")
+          .downcase
+     end
+   end
+ end
data/lib/dataduck/version.rb ADDED
@@ -0,0 +1,6 @@
+ module DataDuck
+   VERSION_MAJOR = 0
+   VERSION_MINOR = 2
+   VERSION_PATCH = 0
+   VERSION = [VERSION_MAJOR, VERSION_MINOR, VERSION_PATCH].join('.')
+ end
data/lib/helpers/module_vars.rb ADDED
@@ -0,0 +1,19 @@
+ module ModuleVars
+   def define_class_method(name, &block)
+     (class << self; self; end).instance_eval do
+       define_method(name, &block)
+     end
+   end
+
+   def create_module_var(name, val = nil)
+     class_variable_set("@@#{ name }", val)
+
+     define_class_method(name) do
+       class_variable_get("@@#{ name }")
+     end
+
+     define_class_method("#{name}=") do |set_to|
+       class_variable_set("@@#{ name }", set_to)
+     end
+   end
+ end
data/lib/templates/quickstart/main.rb.erb ADDED
@@ -0,0 +1,10 @@
+ require 'rubygems'
+ require 'bundler/setup'
+ Bundler.require
+
+ class MyETL < DataDuck::ETL
+   destination :my_destination
+ end
+
+ etl = MyETL.new
+ etl.process!
data/lib/templates/quickstart/table.rb.erb ADDED
@@ -0,0 +1,7 @@
+ class <%= table_name %> < DataDuck::Table
+   source :my_database, ["<%= columns.map { |col| col[0] }.join('", "') %>"]
+
+   output({<% columns.each do |col| %>
+     <%= '# ' if col[2] %>:<%= col[0] %> => :<%= col[1] %>,<% end %>
+   })
+ end
data/static/logo.png ADDED
Binary file
metadata ADDED
@@ -0,0 +1,178 @@
+ --- !ruby/object:Gem::Specification
+ name: dataduck
+ version: !ruby/object:Gem::Version
+   version: 0.2.0
+ platform: ruby
+ authors:
+ - Jeff Pickhardt
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2015-10-10 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: bundler
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.6'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.6'
+ - !ruby/object:Gem::Dependency
+   name: rake
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '10.0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '10.0'
+ - !ruby/object:Gem::Dependency
+   name: rspec
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '3.3'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '3.3'
+ - !ruby/object:Gem::Dependency
+   name: sequel
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '4.19'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '4.19'
+ - !ruby/object:Gem::Dependency
+   name: pg
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '0.16'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '0.16'
+ - !ruby/object:Gem::Dependency
+   name: aws-sdk
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '2.0'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '2.0'
+ - !ruby/object:Gem::Dependency
+   name: sequel-redshift
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+ description: A straightforward, effective ETL framework.
+ email:
+ - pickhardt@gmail.com
+ - admin@dataducketl.com
+ executables:
+ - dataduck
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - ".gitignore"
+ - ".rspec"
+ - ".ruby-version"
+ - Gemfile
+ - README.md
+ - Rakefile
+ - bin/console
+ - bin/dataduck
+ - bin/setup
+ - dataduck.gemspec
+ - examples/example/.gitignore
+ - examples/example/.ruby-version
+ - examples/example/Gemfile
+ - examples/example/README.md
+ - examples/example/config/replace_me.yml
+ - examples/example/src/main.rb
+ - examples/example/src/tables/games.rb
+ - examples/example/src/tables/users.rb
+ - lib/dataduck.rb
+ - lib/dataduck/commands.rb
+ - lib/dataduck/destination.rb
+ - lib/dataduck/etl.rb
+ - lib/dataduck/mysql_source.rb
+ - lib/dataduck/postgresql_source.rb
+ - lib/dataduck/redshift_destination.rb
+ - lib/dataduck/s3_object.rb
+ - lib/dataduck/source.rb
+ - lib/dataduck/sql_db_source.rb
+ - lib/dataduck/table.rb
+ - lib/dataduck/util.rb
+ - lib/dataduck/version.rb
+ - lib/helpers/module_vars.rb
+ - lib/templates/quickstart/main.rb.erb
+ - lib/templates/quickstart/table.rb.erb
+ - static/logo.png
+ homepage: http://dataducketl.com/
+ licenses: []
+ metadata: {}
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubyforge_project:
+ rubygems_version: 2.4.8
+ signing_key:
+ specification_version: 4
+ summary: A straightforward, effective ETL framework.
+ test_files: []