redshift_etl 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: dba6f0ba7cea3305daeaa95a294202898e7629b0
4
+ data.tar.gz: 65557d8fbdab5bc5f5b4e71da8076204ecc67b94
5
+ SHA512:
6
+ metadata.gz: 336794a6b640282754b8aab124819c2593df3d5e00f4a210bc9cdbec5e1ff910ffa7f60d75fb8fe9461d152b272d80980d2c23b0989c292b06d78bffcee82b46
7
+ data.tar.gz: 8021e32c16bc1dc3f3a4e161b89171e5261a84e84b1191ce9e2a9be013652ea1b466597b34e5eb5fa795bbf43c3ebfaa2e3c958d56a226307c13a951acbd1751
data/Gemfile ADDED
@@ -0,0 +1,2 @@
1
+ source 'http://rubygems.org'
2
+ gemspec
data/Gemfile.lock ADDED
@@ -0,0 +1,42 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ redshift_etl (0.0.1)
5
+ activesupport
6
+ aws-sdk (~> 1)
7
+ pg
8
+ que (= 0.9.0)
9
+ sequel
10
+
11
+ GEM
12
+ remote: http://rubygems.org/
13
+ specs:
14
+ activesupport (4.2.3)
15
+ i18n (~> 0.7)
16
+ json (~> 1.7, >= 1.7.7)
17
+ minitest (~> 5.1)
18
+ thread_safe (~> 0.3, >= 0.3.4)
19
+ tzinfo (~> 1.1)
20
+ aws-sdk (1.64.0)
21
+ aws-sdk-v1 (= 1.64.0)
22
+ aws-sdk-v1 (1.64.0)
23
+ json (~> 1.4)
24
+ nokogiri (>= 1.4.4)
25
+ i18n (0.7.0)
26
+ json (1.8.3)
27
+ mini_portile (0.6.2)
28
+ minitest (5.8.0)
29
+ nokogiri (1.6.6.2)
30
+ mini_portile (~> 0.6.0)
31
+ pg (0.18.2)
32
+ que (0.9.0)
33
+ sequel (4.25.0)
34
+ thread_safe (0.3.5)
35
+ tzinfo (1.2.2)
36
+ thread_safe (~> 0.1)
37
+
38
+ PLATFORMS
39
+ ruby
40
+
41
+ DEPENDENCIES
42
+ redshift_etl!
data/TODO ADDED
@@ -0,0 +1,29 @@
1
+ - Use Sequel instead of shell+psql+sql to dump / load data
2
+ - Use ruby gem for s3 uploads
3
+ - Use proper temp directory / file handling
4
+ - TMPDIR should be configurable via envvar if that's not automatic
5
+ - Fail on errors, don't silently fail and continue (probably handled by using gems instead of shell commands)
6
+ - Support incremental uploads
7
+ - Add way to automatically run (passing in the time period we want to process updates for)
8
+ - Use a temporary s3 folder
9
+ - when upload completes, move to a backup place
10
+ - on finish, delete temporary folder (and files)
11
+
12
+ + Add way to automatically generate the config files based off schema
13
+ + More tables!
14
+ + Do a load to a temp table, then replace table with new one (or whatever redshift docs say to do)
15
+
16
+
17
+ sql generator changes:
18
+ - switch to sequel
19
+ - trim text length to 64k
20
+
21
+ incremental:
22
+ - specify the column to do the timestamp on
23
+ - specify the primary key to do the replacement on
24
+ - pass this information into the yaml generator
25
+ - extract changes:
26
+ * change the filename to include the day (today at 12 am to current time)
27
+ * query should select on the timestamp
28
+ - load changes:
29
+ * instead of drop/create/rename, do a 'delete from .. where primary key = ..; insert into ..'
data/go.sh ADDED
@@ -0,0 +1,6 @@
#!/bin/bash
# Runs bin/redshift_etl.rb once for every YAML config matching config/*/*.yml.
# Each run is wrapped in `envdir /etc/env/redshift`
# (presumably to load the AWS / Redshift environment variables -- confirm).

for file in config/*/*.yml
do
  # When nothing matches, bash leaves the pattern as a literal string;
  # skip it rather than hand a non-existent path to the ETL script.
  [ -e "$file" ] || continue

  # Quoted so a config path containing spaces stays a single argument.
  envdir /etc/env/redshift bundle exec /usr/local/bin/ruby bin/redshift_etl.rb "$file"
done
@@ -0,0 +1,8 @@
1
+ require_relative './redshift_etl/config'
2
+ require_relative './redshift_etl/extract'
3
+ require_relative './redshift_etl/transform'
4
+ require_relative './redshift_etl/load'
5
+ require_relative './redshift_etl/archive'
6
+
# Top-level namespace for the ETL stages; the stage modules themselves are
# defined in the files required above.
module RedshiftETL; end
@@ -0,0 +1,15 @@
module RedshiftETL
  # Promotes the files uploaded under the run's temporary S3 prefix to the
  # table's final S3 prefix.
  module Archive
    # config - object exposing #s3_bucket, #final_s3_path and #temp_s3_path.
    #
    # Returns whatever iterating the temporary-prefix collection returns.
    def self.call(config)
      bucket = config.s3_bucket
      destination_prefix = config.final_s3_path

      # Clear out whatever an earlier run left at the final location.
      # NOTE(review): the original comment said "if doing a full update", but
      # nothing here checks config.incremental_updates? -- confirm intent.
      bucket.objects.with_prefix(destination_prefix).delete_all

      # Relocate each staged object, keeping only its base file name.
      bucket.objects.with_prefix(config.temp_s3_path).each do |staged|
        staged.move_to("#{destination_prefix}#{File.basename(staged.key)}")
      end
    end
  end
end
@@ -0,0 +1,114 @@
1
+ require 'yaml'
2
+ require 'active_support'
3
+ require 'active_support/core_ext'
4
+ require 'securerandom'
5
+ require 'time'
6
+ require 'aws-sdk'
7
+
module RedshiftETL
  # Settings for one table's ETL run: values read from a YAML file, plus the
  # Redshift URL and AWS credentials read from environment variables.
  class Config
    # file_path - path to the YAML config file.
    #
    # Raises RuntimeError ("plz provide a file") when no path is given, and
    # RuntimeError ("Did you provide a valid YAML file?") when the file cannot
    # be read, cannot be parsed, or does not contain a YAML mapping.
    def initialize(file_path)
      # Checked here, outside load_settings, so that method's rescue cannot
      # replace this message with the generic YAML one.
      fail "plz provide a file" if file_path.to_s.strip.empty?
      @config = load_settings(file_path)
    end

    # S3 key prefix (unique per Config instance) that extracted files are
    # uploaded under.
    def temp_s3_path
      "tmp/#{redshift_schema}/#{redshift_table}/#{temp_key}/"
    end

    # S3 key prefix for the table's final file location.
    def final_s3_path
      "files/#{redshift_schema}/#{redshift_table}/"
    end

    def enabled?
      @config["enabled"]
    end

    def disabled?
      !enabled?
    end

    def primary_key
      @config["primary_key"]
    end

    def incremental_updates?
      @config["incremental_updates"]
    end

    def update_timestamp_column
      @config["update_timestamp_column"]
    end

    def query
      @config["query"]
    end

    def table_name
      @config["table_name"]
    end

    def connection_string
      @config["connection_string"]
    end

    # The configured "filename" value with a ".csv" suffix appended.
    def filename
      @config["filename"] + '.csv'
    end

    def redshift_table
      @config["redshift_table"]
    end

    def redshift_schema
      @config["redshift_schema"]
    end

    # The "columns" entry of the YAML file; other code in this gem calls
    # #keys and iterates |column, type| pairs on it.
    def columns
      @config["columns"]
    end

    def redshift_connection
      ENV['REDSHIFT_DATABASE_URL']
    end

    def s3_access_key_id
      ENV['AWS_ACCESS_KEY_ID']
    end

    def s3_secret_access_key
      ENV['AWS_SECRET_ACCESS_KEY']
    end

    def s3_region
      "us-east-1"
    end

    def s3_bucket_name
      "tanga-redshift"
    end

    # Raises RuntimeError naming the first required environment variable
    # that is missing or blank.
    def check!
      fail "plz set AWS_SECRET_ACCESS_KEY" if s3_secret_access_key.blank?
      fail "plz set REDSHIFT_DATABASE_URL" if redshift_connection.blank?
      fail "plz set AWS_ACCESS_KEY_ID" if s3_access_key_id.blank?
    end

    # Builds a new S3 client from the environment credentials on every call.
    def s3
      AWS::S3.new(access_key_id: s3_access_key_id, secret_access_key: s3_secret_access_key)
    end

    def s3_bucket
      s3.buckets[s3_bucket_name]
    end

    private

    # Reads and parses the YAML file. The block form of File.open closes the
    # handle even when parsing raises.
    def load_settings(file_path)
      settings = File.open(file_path) { |io| YAML.load(io) }
      # An empty or scalar document would only blow up later inside an
      # accessor; reject it here so it is reported as a bad config file.
      fail "expected a YAML mapping, got #{settings.class}" unless settings.is_a?(Hash)
      settings
    rescue => error
      puts error
      fail "Did you provide a valid YAML file?"
    end

    # Memoized so every temp_s3_path call on this instance agrees.
    def temp_key
      @temp_key ||= Time.now.iso8601 + SecureRandom.hex[0..7]
    end
  end
end
@@ -0,0 +1,15 @@
1
+ require_relative './config'
2
+ require_relative './extract/database_fetcher'
3
+ require_relative './extract/file_maker'
4
+
# Copies stuff from the database into a CSV file.
module RedshiftETL
  module Extract
    # Builds a DatabaseFetcher and a FileMaker from the same config and lets
    # the file maker consume the fetcher.
    #
    # Returns whatever FileMaker#run returns.
    def self.call(config)
      source = DatabaseFetcher.new(config)
      FileMaker.new(config).run(source)
    end
  end
end
@@ -0,0 +1,43 @@
1
+ require 'sequel'
2
+
module RedshiftETL
  module Extract
    # Copies data from a database in CSV format.
    class DatabaseFetcher
      attr_reader :config

      def initialize(config)
        @config = config
      end

      # Yields each line produced by copy_table (requested as CSV with the
      # 'header' option) to the caller's block.
      def lines
        sequel do |connection|
          source = dataset_to_copy(connection)
          connection.copy_table(source, format: :csv, options: 'header') do |csv_line|
            begin
              yield csv_line
            rescue => error
              # copy_table rescues errors, hard to debug without rescuing them ourselves.
              puts error.inspect
              puts error.backtrace
              raise
            end
          end
        end
      end

      # Opens a connection to config.connection_string, sets the session time
      # zone to UTC, then hands the connection to the given block.
      def sequel(&block)
        Sequel.connect(config.connection_string) do |connection|
          connection << "set time zone 'UTC'"
          block.call(connection)
        end
      end

      # Dataset selecting exactly the configured columns from the configured
      # source table.
      def dataset_to_copy(db)
        source_table = db[config.table_name.to_sym]
        # TODO truncate to 65k for varchar columns
        # TODO maybe later, if citext, convert to lower(column)
        wanted = config.columns.keys.map { |column_name| column_name.to_sym }

        source_table.select(*wanted)
      end
    end
  end
end
@@ -0,0 +1,59 @@
1
+ require 'zlib'
2
+ require 'tempfile'
3
+
module RedshiftETL
  module Extract
    # Extracts lines from the data source and puts the CSV files
    # on S3.
    class FileMaker
      # Number of lines written to a chunk before it is uploaded and a new
      # chunk is started.
      DEFAULT_LINES_PER_FILE = 1_000_000

      attr_reader :config

      # config         - object exposing #table_name, #filename, #temp_s3_path
      #                  and #s3_bucket.
      # lines_per_file - rotation threshold; defaults to the value that was
      #                  previously hard-coded.
      #
      # Raises ArgumentError when lines_per_file is not a positive Integer.
      def initialize(config, lines_per_file: DEFAULT_LINES_PER_FILE)
        unless lines_per_file.is_a?(Integer) && lines_per_file > 0
          fail ArgumentError, "lines_per_file must be a positive Integer"
        end

        @config = config
        @lines_per_file = lines_per_file
        @file_index = 0
        @line_index = 0
      end

      # Streams every line from data_source (anything whose #lines yields
      # strings) into gzipped chunk files and uploads each chunk.
      def run(data_source)
        new_file!
        data_source.lines do |line|
          process_line(line)
        end
        process_file!
      end

      private

      def process_line(line)
        @line_index += 1

        # The first line ever seen is kept as the header and repeated at the
        # top of every later chunk.
        @header ||= line
        @gzip.write(line)

        if @line_index == @lines_per_file
          process_file!
          new_file!
          @gzip.write(@header)
        end
      end

      # Starts a fresh temp file and gzip writer and resets the line counter.
      def new_file!
        @temp_file = Tempfile.new("#{config.table_name}.#{@file_index}.")
        @gzip = Zlib::GzipWriter.open(@temp_file.path)
        @file_index += 1
        @line_index = 0
      end

      # Finishes the current chunk: flush the gzip stream, upload, then delete
      # the temp file.
      def process_file!
        @gzip.close
        upload_to_s3
        @temp_file.close!
        GC.start
      end

      def upload_to_s3
        s3_path = config.temp_s3_path + "#{config.filename}.#{@file_index}.gz"
        # Block form so the read handle is closed after every upload instead
        # of leaking one descriptor per chunk; binary mode because the payload
        # is gzip data.
        File.open(@temp_file.path, 'rb') do |io|
          config.s3_bucket.objects[s3_path].write(io)
        end
      end
    end
  end
end
@@ -0,0 +1,72 @@
1
+ #!/usr/bin/env ruby
2
+
# Loads stuff from CSV files into Redshift
module RedshiftETL
  # TODO: break into some objects
  module Load
    def self.call(config)
      load_into_redshift(config)
    end

    # Builds the SQL for one table load and runs each statement, in order, on
    # a connection to config.redshift_connection, printing each statement's
    # result.
    #
    # The data is always copied into a freshly created staging table first.
    # With incremental updates the staged rows replace rows with the same
    # primary key in the target table; otherwise the staging table is renamed
    # over the target table.
    def self.load_into_redshift(config)
      sql_statements = [
        "create schema if not exists #{config.redshift_schema};",
        "set search_path=#{config.redshift_schema};",
        # Uses the same helper as every other staging reference so the name
        # cannot drift from what create_table / copy operate on.
        "drop table if exists #{staging_table_name(config)};",
        create_table(config),
        copy_statement(config)
      ]

      if config.incremental_updates?
        sql_statements << merge_statement(config)
      else
        sql_statements.concat(swap_statements(config))
      end

      Sequel.connect(config.redshift_connection + "?force_standard_strings=f&client_min_messages=") do |db|
        sql_statements.each do |statement|
          print db[statement].get
        end
      end
    end

    # "create table if not exists" statement for the staging table.
    def self.create_table(config)
      create_table_statement(config, staging_table_name(config))
    end

    # "create table if not exists" statement for table_name, with one
    # "<column> <type>" entry per pair in config.columns.
    def self.create_table_statement(config, table_name)
      column_list = config.columns.map { |column, type| "#{column} #{type}" }.join(', ')
      "create table if not exists #{table_name} (#{column_list}); "
    end

    def self.staging_table_name(config)
      config.redshift_table + "_staging"
    end

    # COPY statement loading the gzipped CSV files under the run's temporary
    # S3 prefix into the staging table.
    def self.copy_statement(config)
      "
        copy #{staging_table_name(config)} from 's3://#{config.s3_bucket_name}/#{config.temp_s3_path}'
        credentials 'aws_access_key_id=#{config.s3_access_key_id};aws_secret_access_key=#{config.s3_secret_access_key}'
        csv
        gzip
        ignoreheader 1
        trimblanks
        statupdate on
        truncatecolumns
        timeformat 'auto'; "
    end

    # Single transaction that creates the target table if needed, deletes the
    # target rows whose primary key appears in staging, inserts the staged
    # rows and drops the staging table.
    def self.merge_statement(config)
      "
        begin;

        #{create_table_statement(config, config.redshift_table)}

        delete from #{config.redshift_table}
        using #{staging_table_name(config)}
        where #{config.redshift_table}.#{config.primary_key} = #{staging_table_name(config)}.#{config.primary_key};

        insert into #{config.redshift_table}
        select * from #{staging_table_name(config)};

        drop table #{staging_table_name(config)};

        commit;
      "
    end

    # Full refresh: drop the target table, then rename staging into its place.
    def self.swap_statements(config)
      [
        "drop table if exists #{config.redshift_table};",
        "alter table #{staging_table_name(config)} rename to #{config.redshift_table};"
      ]
    end

    private_class_method :copy_statement, :merge_statement, :swap_statements
  end
end
@@ -0,0 +1,7 @@
1
+ require_relative './config'
2
+ require_relative './extract'
3
+ require_relative './load'
4
+
# Manual smoke run: extract, then load, using one fixed table config, and
# print what each stage returns.
settings = RedshiftETL::Config.new('config/tanga/base_skus.yml')
[RedshiftETL::Extract, RedshiftETL::Load].each do |stage|
  puts stage.call(settings)
end
@@ -0,0 +1,7 @@
module RedshiftETL
  # Placeholder for a transform stage; it currently does nothing.
  module Transform
    class << self
      # Accepts the config and ignores it.
      #
      # Returns nil.
      def call(_config)
        # Empty for now.
        nil
      end
    end
  end
end
data/redshift.gemspec ADDED
@@ -0,0 +1,18 @@
$LOAD_PATH.push File.expand_path("../lib", __FILE__)

# Gem description and runtime dependencies for redshift_etl.
Gem::Specification.new do |spec|
  spec.name          = "redshift_etl"
  spec.version       = "0.0.1"
  spec.authors       = ["Joe Van Dyk"]
  spec.email         = ["joe@tanga.com"]
  spec.homepage      = "https://www.tanga.com"
  spec.summary       = "redshift etl"
  spec.require_paths = ["lib"]

  # Package every git-tracked file except those under the listed top-level
  # directories.
  tracked_files = `git ls-files -z`.split("\x0")
  spec.files = tracked_files.reject { |path| path.match(%r{^(test|spec|features|config|bin)/}) }

  # Declared in the same order as before; each entry is name plus optional
  # version requirement.
  [
    ['que', '0.9.0'],
    ['activesupport'],
    ['sequel'],
    ['pg'],
    ['aws-sdk', '~> 1']
  ].each { |dependency| spec.add_dependency(*dependency) }
end
metadata ADDED
@@ -0,0 +1,128 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: redshift_etl
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Joe Van Dyk
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-08-11 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: que
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - '='
18
+ - !ruby/object:Gem::Version
19
+ version: 0.9.0
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - '='
25
+ - !ruby/object:Gem::Version
26
+ version: 0.9.0
27
+ - !ruby/object:Gem::Dependency
28
+ name: activesupport
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: sequel
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: pg
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: aws-sdk
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '1'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '1'
83
+ description:
84
+ email:
85
+ - joe@tanga.com
86
+ executables: []
87
+ extensions: []
88
+ extra_rdoc_files: []
89
+ files:
90
+ - Gemfile
91
+ - Gemfile.lock
92
+ - TODO
93
+ - go.sh
94
+ - lib/redshift_etl.rb
95
+ - lib/redshift_etl/archive.rb
96
+ - lib/redshift_etl/config.rb
97
+ - lib/redshift_etl/extract.rb
98
+ - lib/redshift_etl/extract/database_fetcher.rb
99
+ - lib/redshift_etl/extract/file_maker.rb
100
+ - lib/redshift_etl/load.rb
101
+ - lib/redshift_etl/test.rb
102
+ - lib/redshift_etl/transform.rb
103
+ - redshift.gemspec
104
+ homepage: https://www.tanga.com
105
+ licenses: []
106
+ metadata: {}
107
+ post_install_message:
108
+ rdoc_options: []
109
+ require_paths:
110
+ - lib
111
+ required_ruby_version: !ruby/object:Gem::Requirement
112
+ requirements:
113
+ - - ">="
114
+ - !ruby/object:Gem::Version
115
+ version: '0'
116
+ required_rubygems_version: !ruby/object:Gem::Requirement
117
+ requirements:
118
+ - - ">="
119
+ - !ruby/object:Gem::Version
120
+ version: '0'
121
+ requirements: []
122
+ rubyforge_project:
123
+ rubygems_version: 2.4.5
124
+ signing_key:
125
+ specification_version: 4
126
+ summary: redshift etl
127
+ test_files: []
128
+ has_rdoc: