redshift_etl 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: dba6f0ba7cea3305daeaa95a294202898e7629b0
4
+ data.tar.gz: 65557d8fbdab5bc5f5b4e71da8076204ecc67b94
5
+ SHA512:
6
+ metadata.gz: 336794a6b640282754b8aab124819c2593df3d5e00f4a210bc9cdbec5e1ff910ffa7f60d75fb8fe9461d152b272d80980d2c23b0989c292b06d78bffcee82b46
7
+ data.tar.gz: 8021e32c16bc1dc3f3a4e161b89171e5261a84e84b1191ce9e2a9be013652ea1b466597b34e5eb5fa795bbf43c3ebfaa2e3c958d56a226307c13a951acbd1751
data/Gemfile ADDED
@@ -0,0 +1,2 @@
1
+ source 'http://rubygems.org'
2
+ gemspec
data/Gemfile.lock ADDED
@@ -0,0 +1,42 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ redshift_etl (0.0.1)
5
+ activesupport
6
+ aws-sdk (~> 1)
7
+ pg
8
+ que (= 0.9.0)
9
+ sequel
10
+
11
+ GEM
12
+ remote: http://rubygems.org/
13
+ specs:
14
+ activesupport (4.2.3)
15
+ i18n (~> 0.7)
16
+ json (~> 1.7, >= 1.7.7)
17
+ minitest (~> 5.1)
18
+ thread_safe (~> 0.3, >= 0.3.4)
19
+ tzinfo (~> 1.1)
20
+ aws-sdk (1.64.0)
21
+ aws-sdk-v1 (= 1.64.0)
22
+ aws-sdk-v1 (1.64.0)
23
+ json (~> 1.4)
24
+ nokogiri (>= 1.4.4)
25
+ i18n (0.7.0)
26
+ json (1.8.3)
27
+ mini_portile (0.6.2)
28
+ minitest (5.8.0)
29
+ nokogiri (1.6.6.2)
30
+ mini_portile (~> 0.6.0)
31
+ pg (0.18.2)
32
+ que (0.9.0)
33
+ sequel (4.25.0)
34
+ thread_safe (0.3.5)
35
+ tzinfo (1.2.2)
36
+ thread_safe (~> 0.1)
37
+
38
+ PLATFORMS
39
+ ruby
40
+
41
+ DEPENDENCIES
42
+ redshift_etl!
data/TODO ADDED
@@ -0,0 +1,29 @@
1
+ - Use Sequel instead of shell+psql+sql to dump / load data
2
+ - Use ruby gem for s3 uploads
3
+ - Use proper temp directory / file handling
4
+ - TMPDIR should be configurable via envvar if that's not automatic
5
+ - Fail on errors, don't silently fail and continue (probably handled by using gems instead of shell commands)
6
+ - Support incremental uploads
7
+ - Add way to automatically run (passing in the time period we want to process updates for)
8
+ - Use a temporary s3 folder
9
+ - when upload completes, move to a backup place
10
+ - on finish, delete temporary folder (and files)
11
+
12
+ + Add way to automatically generate the config files based off schema
13
+ + More tables!
14
+ + Do a load to a temp table, then replace table with new one (or whatever redshift docs say to do)
15
+
16
+
17
+ sql generator changes:
18
+ - switch to sequel
19
+ - trim text length to 64k
20
+
21
+ incremental:
22
+ - specify the column to do the timestamp on
23
+ - specify the primary key to do the replacement on
24
+ - pass this information into the yaml generator
25
+ - extract changes:
26
+ * change the filename to include the day (today at 12 am to current time)
27
+ * query should select on the timestamp
28
+ - load changes:
29
+ * instead of drop/create/rename, do a 'delete from where primary key=..; insert into..'
data/go.sh ADDED
@@ -0,0 +1,6 @@
1
+ #!/bin/bash
2
+ # Run the ETL script once for each YAML file matching config/*/*.yml.
3
+ for file in config/*/*.yml
4
+ do
5
+ envdir /etc/env/redshift bundle exec /usr/local/bin/ruby bin/redshift_etl.rb "$file" # quoted: an unquoted $file is word-split, which breaks paths containing spaces
6
+ done
@@ -0,0 +1,8 @@
1
+ require_relative './redshift_etl/config'
2
+ require_relative './redshift_etl/extract'
3
+ require_relative './redshift_etl/transform'
4
+ require_relative './redshift_etl/load'
5
+ require_relative './redshift_etl/archive'
6
+
7
+ module RedshiftETL
8
+ end
@@ -0,0 +1,15 @@
1
+ module RedshiftETL
2
+ module Archive
3
+ def self.call(config) # Replaces the objects under config.final_s3_path with the objects currently under config.temp_s3_path.
4
+ # Delete everything stored under the final prefix. NOTE(review): this runs unconditionally; config.incremental_updates? is not checked here, so confirm it is only invoked for full updates.
5
+ config.s3_bucket.objects.with_prefix(config.final_s3_path).delete_all
6
+
7
+ # Move each object from the temp prefix to the final prefix, keeping only its base filename.
8
+ config.s3_bucket.objects.with_prefix(config.temp_s3_path).each do |object|
9
+ filename = File.basename(object.key)
10
+ new_location = config.final_s3_path + filename
11
+ object.move_to(new_location)
12
+ end
13
+ end
14
+ end
15
+ end
@@ -0,0 +1,114 @@
1
+ require 'yaml'
2
+ require 'active_support'
3
+ require 'active_support/core_ext'
4
+ require 'securerandom'
5
+ require 'time'
6
+ require 'aws-sdk'
7
+
8
+ module RedshiftETL
9
+ class Config
10
+ def initialize(file_path)
11
+ fail "plz provide a file" if file_path.blank?
12
+ @config = YAML.load(File.open(file_path))
13
+ rescue
14
+ puts $!
15
+ fail "Did you provide a valid YAML file?"
16
+ end
17
+
18
+ def temp_s3_path
19
+ "tmp/#{redshift_schema}/#{redshift_table}/#{temp_key}/"
20
+ end
21
+
22
+ def final_s3_path
23
+ "files/#{redshift_schema}/#{redshift_table}/"
24
+ end
25
+
26
+ def enabled?
27
+ @config["enabled"]
28
+ end
29
+
30
+ def disabled?
31
+ !enabled?
32
+ end
33
+
34
+ def primary_key
35
+ @config["primary_key"]
36
+ end
37
+
38
+ def incremental_updates?
39
+ @config["incremental_updates"]
40
+ end
41
+
42
+ def update_timestamp_column
43
+ @config["update_timestamp_column"]
44
+ end
45
+
46
+ def query
47
+ @config["query"]
48
+ end
49
+
50
+ def table_name
51
+ @config["table_name"]
52
+ end
53
+
54
+ def connection_string
55
+ @config["connection_string"]
56
+ end
57
+
58
+ def filename
59
+ @config["filename"] + '.csv'
60
+ end
61
+
62
+ def redshift_table
63
+ @config["redshift_table"]
64
+ end
65
+
66
+ def redshift_schema
67
+ @config["redshift_schema"]
68
+ end
69
+
70
+ def columns
71
+ @config["columns"]
72
+ end
73
+
74
+ def redshift_connection
75
+ ENV['REDSHIFT_DATABASE_URL']
76
+ end
77
+
78
+ def s3_access_key_id
79
+ ENV['AWS_ACCESS_KEY_ID']
80
+ end
81
+
82
+ def s3_secret_access_key
83
+ ENV['AWS_SECRET_ACCESS_KEY']
84
+ end
85
+
86
+ def s3_region
87
+ "us-east-1"
88
+ end
89
+
90
+ def s3_bucket_name
91
+ "tanga-redshift"
92
+ end
93
+
94
+ def check!
95
+ fail "plz set AWS_SECRET_ACCESS_KEY" if s3_secret_access_key.blank?
96
+ fail "plz set REDSHIFT_DATABASE_URL" if redshift_connection.blank?
97
+ fail "plz set AWS_ACCESS_KEY_ID" if s3_access_key_id.blank?
98
+ end
99
+
100
+ def s3
101
+ AWS::S3.new(access_key_id: s3_access_key_id, secret_access_key: s3_secret_access_key)
102
+ end
103
+
104
+ def s3_bucket
105
+ s3.buckets[s3_bucket_name]
106
+ end
107
+
108
+ private
109
+
110
+ def temp_key # Memoized in @temp_key, so temp_s3_path stays the same across calls on this Config instance.
111
+ @temp_key ||= Time.now.iso8601 + SecureRandom.hex[0..7] # ISO 8601 timestamp followed by the first 8 characters of a random hex string
112
+ end
113
+ end
114
+ end
@@ -0,0 +1,15 @@
1
+ require_relative './config'
2
+ require_relative './extract/database_fetcher'
3
+ require_relative './extract/file_maker'
4
+
5
+ # Copies stuff from the database into a CSV file.
6
+ module RedshiftETL
7
+ module Extract
8
+ def self.call(config)
9
+ db = DatabaseFetcher.new(config)
10
+ file_maker = FileMaker.new(config)
11
+
12
+ file_maker.run(db)
13
+ end
14
+ end
15
+ end
@@ -0,0 +1,43 @@
1
+ require 'sequel'
2
+
3
+ module RedshiftETL
4
+ module Extract
5
+ # Copies data from a database in CSV format.
6
+ class DatabaseFetcher
7
+ attr_reader :config
8
+ def initialize(config)
9
+ @config = config
10
+ end
11
+
12
+ def lines
13
+ sequel do |db|
14
+ db.copy_table(dataset_to_copy(db), format: :csv, options: 'header') do |line|
15
+ begin
16
+ yield line
17
+ rescue # copy_table rescues errors, hard to debug without rescuing them ourselves.
18
+ puts $!.inspect
19
+ puts $!.backtrace
20
+ raise
21
+ end
22
+ end
23
+ end
24
+ end
25
+
26
+ def sequel(&block)
27
+ Sequel.connect(config.connection_string) do |db|
28
+ db << "set time zone 'UTC'"
29
+ block.call(db)
30
+ end
31
+ end
32
+
33
+ def dataset_to_copy(db)
34
+ table = db[config.table_name.to_sym]
35
+ # TODO truncate to 65k for varchar columns
36
+ # TODO maybe later, if citext, convert to lower(column)
37
+ columns_to_copy = config.columns.keys.map(&:to_sym)
38
+
39
+ table.select(*columns_to_copy)
40
+ end
41
+ end
42
+ end
43
+ end
@@ -0,0 +1,59 @@
1
+ require 'zlib'
2
+ require 'tempfile'
3
+
4
+ module RedshiftETL
5
+ module Extract
6
+ # Extracts lines from the data source and puts the CSV files
7
+ # on S3.
8
+ class FileMaker
9
+ attr_reader :config
10
+ def initialize(config)
11
+ @config = config
12
+ @file_index = 0
13
+ @line_index = 0
14
+ end
15
+
16
+ def run(data_source)
17
+ new_file!
18
+ data_source.lines do |line|
19
+ process_line(line)
20
+ end
21
+ process_file!
22
+ end
23
+
24
+ private
25
+
26
+ def process_line(line)
27
+ @line_index += 1
28
+
29
+ @header ||= line
30
+ @gzip.write(line)
31
+
32
+ if @line_index == 1_000_000
33
+ process_file!
34
+ new_file!
35
+ @gzip.write(@header)
36
+ end
37
+ end
38
+
39
+ def new_file!
40
+ @temp_file = Tempfile.new("#{config.table_name}.#{@file_index}.")
41
+ @gzip = Zlib::GzipWriter.open(@temp_file.path)
42
+ @file_index += 1
43
+ @line_index = 0
44
+ end
45
+
46
+ def process_file!
47
+ @gzip.close
48
+ upload_to_s3
49
+ @temp_file.close!
50
+ GC.start
51
+ end
52
+
53
+ def upload_to_s3
54
+ s3_path = config.temp_s3_path + "#{config.filename}.#{@file_index}.gz"
55
+ config.s3_bucket.objects[s3_path].write(File.open(@temp_file))
56
+ end
57
+ end
58
+ end
59
+ end
@@ -0,0 +1,72 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ # Loads stuff from CSV files into Redshift
4
+ module RedshiftETL
5
+ # TODO: break into some objects
6
+ module Load
7
+ def self.call(config)
8
+ load_into_redshift(config)
9
+ end
10
+
11
+ def self.load_into_redshift(config)
12
+ sql_statements = []
13
+ sql_statements << "create schema if not exists #{config.redshift_schema};"
14
+ sql_statements << "set search_path=#{config.redshift_schema};"
15
+ sql_statements << "drop table if exists #{config.redshift_table}_staging;"
16
+ sql_statements << create_table(config)
17
+
18
+ sql_statements << "
19
+ copy #{staging_table_name(config)} from 's3://#{config.s3_bucket_name}/#{config.temp_s3_path}'
20
+ credentials 'aws_access_key_id=#{config.s3_access_key_id};aws_secret_access_key=#{config.s3_secret_access_key}'
21
+ csv
22
+ gzip
23
+ ignoreheader 1
24
+ trimblanks
25
+ statupdate on
26
+ truncatecolumns
27
+ timeformat 'auto'; "
28
+
29
+ if config.incremental_updates?
30
+ sql_statements << "
31
+ begin;
32
+
33
+ #{create_table_statement(config, config.redshift_table)}
34
+
35
+ delete from #{config.redshift_table}
36
+ using #{staging_table_name(config)}
37
+ where #{config.redshift_table}.#{config.primary_key} = #{staging_table_name(config)}.#{config.primary_key};
38
+
39
+ insert into #{config.redshift_table}
40
+ select * from #{staging_table_name(config)};
41
+
42
+ drop table #{staging_table_name(config)};
43
+
44
+ commit;
45
+ "
46
+ else
47
+ sql_statements << "drop table if exists #{ config.redshift_table};"
48
+ sql_statements << "alter table #{staging_table_name(config)} rename to #{config.redshift_table};"
49
+ end
50
+
51
+ Sequel.connect(config.redshift_connection + "?force_standard_strings=f&client_min_messages=") do |db|
52
+ sql_statements.each do |statement|
53
+ print db[statement].get
54
+ end
55
+ end
56
+ end
57
+
58
+ def self.create_table(config)
59
+ create_table_statement(config, staging_table_name(config))
60
+ end
61
+
62
+ def self.create_table_statement(config, table_name)
63
+ sql = "create table if not exists #{table_name} ("
64
+ sql << config.columns.map { |column, type| "#{column} #{type}" }.join(', ')
65
+ sql << "); "
66
+ end
67
+
68
+ def self.staging_table_name(config)
69
+ config.redshift_table + "_staging"
70
+ end
71
+ end
72
+ end
@@ -0,0 +1,7 @@
1
+ require_relative './config'
2
+ require_relative './extract'
3
+ require_relative './load'
4
+
5
+ config = RedshiftETL::Config.new('config/tanga/base_skus.yml')
6
+ puts RedshiftETL::Extract.call(config)
7
+ puts RedshiftETL::Load.call(config)
@@ -0,0 +1,7 @@
1
+ module RedshiftETL
2
+ module Transform
3
+ def self.call(_config)
4
+ # Empty for now.
5
+ end
6
+ end
7
+ end
data/redshift.gemspec ADDED
@@ -0,0 +1,18 @@
1
+ $:.push File.expand_path("../lib", __FILE__)
2
+
3
+ # Describe your gem and declare its dependencies:
4
+ Gem::Specification.new do |s|
5
+ s.name = "redshift_etl"
6
+ s.version = "0.0.1"
7
+ s.authors = ["Joe Van Dyk"]
8
+ s.email = ["joe@tanga.com"]
9
+ s.homepage = "https://www.tanga.com"
10
+ s.summary = "redshift etl"
11
+ s.require_paths = ["lib"]
12
+ s.files = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features|config|bin)/}) }
13
+ s.add_dependency 'que', '0.9.0'
14
+ s.add_dependency 'activesupport'
15
+ s.add_dependency 'sequel'
16
+ s.add_dependency 'pg'
17
+ s.add_dependency 'aws-sdk', '~> 1'
18
+ end
metadata ADDED
@@ -0,0 +1,128 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: redshift_etl
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Joe Van Dyk
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-08-11 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: que
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - '='
18
+ - !ruby/object:Gem::Version
19
+ version: 0.9.0
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - '='
25
+ - !ruby/object:Gem::Version
26
+ version: 0.9.0
27
+ - !ruby/object:Gem::Dependency
28
+ name: activesupport
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: sequel
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: pg
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: aws-sdk
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '1'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '1'
83
+ description:
84
+ email:
85
+ - joe@tanga.com
86
+ executables: []
87
+ extensions: []
88
+ extra_rdoc_files: []
89
+ files:
90
+ - Gemfile
91
+ - Gemfile.lock
92
+ - TODO
93
+ - go.sh
94
+ - lib/redshift_etl.rb
95
+ - lib/redshift_etl/archive.rb
96
+ - lib/redshift_etl/config.rb
97
+ - lib/redshift_etl/extract.rb
98
+ - lib/redshift_etl/extract/database_fetcher.rb
99
+ - lib/redshift_etl/extract/file_maker.rb
100
+ - lib/redshift_etl/load.rb
101
+ - lib/redshift_etl/test.rb
102
+ - lib/redshift_etl/transform.rb
103
+ - redshift.gemspec
104
+ homepage: https://www.tanga.com
105
+ licenses: []
106
+ metadata: {}
107
+ post_install_message:
108
+ rdoc_options: []
109
+ require_paths:
110
+ - lib
111
+ required_ruby_version: !ruby/object:Gem::Requirement
112
+ requirements:
113
+ - - ">="
114
+ - !ruby/object:Gem::Version
115
+ version: '0'
116
+ required_rubygems_version: !ruby/object:Gem::Requirement
117
+ requirements:
118
+ - - ">="
119
+ - !ruby/object:Gem::Version
120
+ version: '0'
121
+ requirements: []
122
+ rubyforge_project:
123
+ rubygems_version: 2.4.5
124
+ signing_key:
125
+ specification_version: 4
126
+ summary: redshift etl
127
+ test_files: []
128
+ has_rdoc: