redshifter 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +4 -0
- data/.rspec +3 -0
- data/.travis.yml +4 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +21 -0
- data/README.md +147 -0
- data/Rakefile +6 -0
- data/bin/console +14 -0
- data/bin/setup +7 -0
- data/lib/redshifter.rb +15 -0
- data/lib/redshifter/config.rb +41 -0
- data/lib/redshifter/extract_and_replace_redshift_table.rb +33 -0
- data/lib/redshifter/extract_and_update_redshift_table.rb +33 -0
- data/lib/redshifter/job/update_redshift_table_job.rb +15 -0
- data/lib/redshifter/table.rb +109 -0
- data/lib/redshifter/tasks.rb +25 -0
- data/lib/redshifter/util/create_or_replace_table.rb +65 -0
- data/lib/redshifter/util/extract_and_transform_updates.rb +81 -0
- data/lib/redshifter/util/redshift.rb +18 -0
- data/lib/redshifter/util/s3.rb +92 -0
- data/lib/redshifter/util/s3_manifest_writer.rb +36 -0
- data/lib/redshifter/util/table_config_validator.rb +65 -0
- data/lib/redshifter/util/update_table.rb +88 -0
- data/lib/redshifter/version.rb +3 -0
- data/redshifter.gemspec +30 -0
- metadata +180 -0
checksums.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
---
|
|
2
|
+
SHA1:
|
|
3
|
+
metadata.gz: 8760ba2ad72e5e668d9f3c4eb6b70d04d86b15e9
|
|
4
|
+
data.tar.gz: 8eb1c3fa50e558f9cfac14ae1d4c606754d0bdf7
|
|
5
|
+
SHA512:
|
|
6
|
+
metadata.gz: d1e972e73eea10034797bf4713185a845e9f8f1c61227b1db480c355aeda7a232aa8c2caa50a2cdb3e95243ec94ece2a5acc35383cbb016f35bb6f63ba150099
|
|
7
|
+
data.tar.gz: 6ed7ad01c948ee874b558e0f13a1009249feb95b7365330557e3d5a6bd754426c47092fa3ff53788f5b92af4c53a2cdea0792ba82586428d3f46b4a97d33a685
|
data/.gitignore
ADDED
data/.rspec
ADDED
data/.travis.yml
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
The MIT License (MIT)
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2015 Apartment List, Inc
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
|
13
|
+
all copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
21
|
+
THE SOFTWARE.
|
data/README.md
ADDED
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
# Redshifter
|
|
2
|
+
|
|
3
|
+
Provides a resque job and rake tasks to facilitate ETL (Extract Transform Load) processing of Postgres tables for export to a Redshift cluster.
|
|
4
|
+
|
|
5
|
+
Specifically provides:
|
|
6
|
+
1) Create/Replace job to replace all model data in Redshift
|
|
7
|
+
2) Update job to upsert all records created or updated since the last update ran
|
|
8
|
+
|
|
9
|
+
Limitations:
|
|
10
|
+
* deleted records are NOT synced to Redshift by the update job
|
|
11
|
+
|
|
12
|
+
Feature Roadmap:
|
|
13
|
+
* store last runtime locally instead of using potentially costly redshift query
|
|
14
|
+
|
|
15
|
+
## Versions
|
|
16
|
+
0.2.4 - New config format; update and replace rake tasks available
|
|
17
|
+
0.3.0 - Public version
|
|
18
|
+
|
|
19
|
+
## Installation
|
|
20
|
+
|
|
21
|
+
Add this line to your application's Gemfile:
|
|
22
|
+
|
|
23
|
+
```ruby
|
|
24
|
+
gem 'redshifter'
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
And then execute:
|
|
28
|
+
|
|
29
|
+
$ bundle
|
|
30
|
+
|
|
31
|
+
Or install it yourself as:
|
|
32
|
+
|
|
33
|
+
$ gem install redshifter
|
|
34
|
+
|
|
35
|
+
## Usage
|
|
36
|
+
|
|
37
|
+
### Setup Redshifter in a Rails initializer
|
|
38
|
+
```ruby
|
|
39
|
+
# config/initializers/redshifter.rb
|
|
40
|
+
Redshifter.setup do |config|
|
|
41
|
+
# path in your app available for writing temp files
|
|
42
|
+
config.temp_directory_path = File.expand_path('../../tmp', __FILE__)
|
|
43
|
+
# path to redshifter table config
|
|
44
|
+
config.table_config_path = File.expand_path('../../redshifter.rb', __FILE__)
|
|
45
|
+
# redshift user should have access to create tables in the specified schema
|
|
46
|
+
config.redshift_username = 'your_app_user'
|
|
47
|
+
config.redshift_password = 'p@ssw0rd'
|
|
48
|
+
config.redshift_host = 'app.host.without.protocol.com'
|
|
49
|
+
config.redshift_port = 5439
|
|
50
|
+
config.redshift_database = 'database_name'
|
|
51
|
+
config.redshift_schema = 'a_schema'
|
|
52
|
+
# AWS user should be allowed full access to the specified bucket
|
|
53
|
+
config.aws_access_key_id = '<AWS user access key ID>'
|
|
54
|
+
config.aws_secret_access_key = '<AWS user secret access key>'
|
|
55
|
+
config.s3_bucket = 'a_redshifter_bucket'
|
|
56
|
+
# Heroku user must be a member of the app and have privileges to
|
|
57
|
+
# start new dynos
|
|
58
|
+
config.heroku_api_key = '<Heroku user api key>'
|
|
59
|
+
config.heroku_app_name = 'name of the app on heroku'
|
|
60
|
+
end
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
### Require Redshifter tasks in your Rakefile
|
|
64
|
+
```ruby
|
|
65
|
+
# Rakefile
|
|
66
|
+
# ...
|
|
67
|
+
require 'redshifter/tasks'
|
|
68
|
+
# ...
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
### Create a config file describing the tables, columns and transforms for export to redshift
|
|
72
|
+
```ruby
|
|
73
|
+
# config/redshifter.rb
|
|
74
|
+
Redshifter.config.tables = {
|
|
75
|
+
'books_with_export_at' => {
|
|
76
|
+
# [required] Source *table* name, not the Rails model name
|
|
77
|
+
source_table_name: 'books',
|
|
78
|
+
# [required] Prefixing your redshift table with its source is recommended
|
|
79
|
+
redshift_table_name: 'app_name_books',
|
|
80
|
+
# [required] Columns with Redshift datatypes to create; may differ from source DB
|
|
81
|
+
redshift_columns: {
|
|
82
|
+
'id' => 'INTEGER',
|
|
83
|
+
'title' => 'VARCHAR(128)',
|
|
84
|
+
'published_at' => 'TIMESTAMP',
|
|
85
|
+
'updated_at' => 'TIMESTAMP',
|
|
86
|
+
'exported_at' => 'TIMESTAMP'
|
|
87
|
+
},
|
|
88
|
+
# [optional] SQL statements to transform or populate redshift columns from
|
|
89
|
+
# source DB. By default, redshift columns will be populated from source
|
|
90
|
+
# column with the same name. Column key must exist in redshift_columns.
|
|
91
|
+
# If a matching source column does not exist you MUST specify it here.
|
|
92
|
+
source_column_transforms: {
|
|
93
|
+
'title' => "lower(title)",
|
|
94
|
+
'published_at' => 'first_edition_published_at',
|
|
95
|
+
'exported_at' => 'now()'
|
|
96
|
+
},
|
|
97
|
+
# [required] valid values: KEY, EVEN, ALL
|
|
98
|
+
redshift_distribution_style: 'KEY',
|
|
99
|
+
# [required, if redshift_distribution_style: 'KEY'] distribution key column
|
|
100
|
+
# name MUST be present in redshift_columns.keys
|
|
101
|
+
redshift_distribution_key: 'id',
|
|
102
|
+
# [optional] valid values: COMPOUND, INTERLEAVED; If omitted the Redshift
|
|
103
|
+
# table DDL statement will not specify the sort style and Redshift will
|
|
104
|
+
# implicitly default to COMPOUND style.
|
|
105
|
+
redshift_sort_style: 'INTERLEAVED',
|
|
106
|
+
# [required] Column names MUST be present in redshift_columns.keys; Max
|
|
107
|
+
# length of 8 when using INTERLEAVED sort style, and 400 when using
|
|
108
|
+
# COMPOUND sort style.
|
|
109
|
+
redshift_sort_keys: ['published_at'],
|
|
110
|
+
# [optional] Used for query planning in Redshift
|
|
111
|
+
redshift_primary_key: 'id'
|
|
112
|
+
}
|
|
113
|
+
}
|
|
114
|
+
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
### Run redshifter:replace rake task for each table you want to export
|
|
118
|
+
```
|
|
119
|
+
$ rake redshifter:replace[books_with_export_at]
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
### Schedule a Redshifter::Job::UpdateRedshiftTableJob resque job for each table you want to export updates for
|
|
123
|
+
|
|
124
|
+
Schedule the job in `resque_schedule.yml`; the example below runs it once a day at 10:00pm:
|
|
125
|
+
```YAML
|
|
126
|
+
# config/resque_schedule.yml
|
|
127
|
+
|
|
128
|
+
etl_books_to_redshift:
|
|
129
|
+
cron: "0 22 * * *"
|
|
130
|
+
class: 'Redshifter::Job::UpdateRedshiftTableJob'
|
|
131
|
+
args: 'books_with_export_at'
|
|
132
|
+
description: 'Export the books table to Redshift'
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
## Development
|
|
136
|
+
|
|
137
|
+
After checking out the repo, run `bin/setup` to install dependencies. Then, run `bin/console` for an interactive prompt that will allow you to experiment.
|
|
138
|
+
|
|
139
|
+
To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release` to create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
|
|
140
|
+
|
|
141
|
+
## Contributing
|
|
142
|
+
|
|
143
|
+
1. Fork it ( https://github.com/[my-github-username]/redshifter/fork )
|
|
144
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
|
145
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
|
146
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
|
147
|
+
5. Create a new Pull Request
|
data/Rakefile
ADDED
data/bin/console
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
|
|
3
|
+
require "bundler/setup"
|
|
4
|
+
require "redshifter"
|
|
5
|
+
|
|
6
|
+
# You can add fixtures and/or initialization code here to make experimenting
|
|
7
|
+
# with your gem easier. You can also use a different console, if you like.
|
|
8
|
+
|
|
9
|
+
# (If you use this, don't forget to add pry to your Gemfile!)
|
|
10
|
+
# require "pry"
|
|
11
|
+
# Pry.start
|
|
12
|
+
|
|
13
|
+
require "irb"
|
|
14
|
+
IRB.start
|
data/bin/setup
ADDED
data/lib/redshifter.rb
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
require 'redshifter/version'
|
|
2
|
+
require 'redshifter/config'
|
|
3
|
+
require 'redshifter/table'
|
|
4
|
+
|
|
5
|
+
require 'redshifter/util/table_config_validator'
|
|
6
|
+
require 'redshifter/util/redshift'
|
|
7
|
+
require 'redshifter/util/s3'
|
|
8
|
+
require 'redshifter/util/extract_and_transform_updates'
|
|
9
|
+
require 'redshifter/util/s3_manifest_writer'
|
|
10
|
+
require 'redshifter/util/create_or_replace_table'
|
|
11
|
+
require 'redshifter/util/update_table'
|
|
12
|
+
|
|
13
|
+
require 'redshifter/extract_and_replace_redshift_table'
|
|
14
|
+
require 'redshifter/extract_and_update_redshift_table'
|
|
15
|
+
require 'redshifter/job/update_redshift_table_job'
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
module Redshifter
  # Entry point for configuring the gem.
  #
  # Yields the settings object to the optional block, then loads the table
  # definitions file named by +table_config_path+ and hands the Heroku
  # credentials over to Dynosaur.
  def self.setup
    yield(config) if block_given?
    load_export_table_definitions
    config_dynosaur
  end

  # Memoized settings object: an anonymous Struct instance with one accessor
  # per supported setting. All values start out as nil.
  def self.config
    @config ||= begin
      setting_names = %i[
        tables
        redshift_username
        redshift_password
        redshift_host
        redshift_port
        redshift_database
        redshift_schema
        aws_access_key_id
        aws_secret_access_key
        s3_bucket
        heroku_api_key
        heroku_app_name
        temp_directory_path
        table_config_path
      ]
      Struct.new(*setting_names).new
    end
  end

  # Evaluates the Ruby file that assigns Redshifter.config.tables.
  def self.load_export_table_definitions
    load(config.table_config_path)
  end
  private_class_method :load_export_table_definitions

  # Passes the Heroku credentials from our settings to Dynosaur. The gem is
  # required here, at setup time, rather than when this file is loaded.
  def self.config_dynosaur
    require 'dynosaur'

    Dynosaur::Client::HerokuClient.configure do |heroku|
      heroku.api_key = config.heroku_api_key
      heroku.app_name = config.heroku_app_name
    end
  end
  private_class_method :config_dynosaur
end
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
module Redshifter
  # Full reload of one table: extracts every source row to S3, writes a
  # manifest listing the uploaded files and then drops, recreates and loads
  # the Redshift table from that manifest.
  class ExtractAndReplaceRedshiftTable
    def initialize(table, s3_util = Util::S3.new)
      @table = table
      @s3_util = s3_util
    end

    # Returns nil without touching Redshift when the extract step uploaded
    # nothing; otherwise returns whatever Util::CreateOrReplaceTable#run returns.
    def run
      data_file_urls = extract_everything
      return unless data_file_urls.any?

      manifest_url = write_manifest(data_file_urls)
      Util::CreateOrReplaceTable.new(table: table, manifest_url: manifest_url).run
    end

    private

    attr_reader :table, :s3_util

    # Extracts all rows by asking for everything updated since the epoch.
    def extract_everything
      extractor = Util::ExtractAndTransformUpdates.new(
        table: table,
        since: Table::EPOCH_TIMESTAMP,
        s3_util: s3_util
      )
      extractor.run
    end

    # Uploads a uniquely named manifest for the given data files and returns
    # the value produced by the manifest writer.
    def write_manifest(data_file_urls)
      writer = Util::S3ManifestWriter.new(
        file_name: "#{SecureRandom.uuid}.manifest",
        file_urls: data_file_urls,
        s3_util: s3_util
      )
      writer.run
    end
  end
end
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
module Redshifter
  # Incremental sync of one table: extracts the source rows updated since the
  # table's last Redshift update to S3, writes a manifest listing the
  # uploaded files and then applies them through Util::UpdateTable.
  class ExtractAndUpdateRedshiftTable
    def initialize(table, s3_util = Util::S3.new)
      @table = table
      @s3_util = s3_util
    end

    # Returns nil without touching Redshift when the extract step uploaded
    # nothing; otherwise returns whatever Util::UpdateTable#run returns.
    def run
      changed_file_urls = extract_changes
      return unless changed_file_urls.any?

      manifest_url = write_manifest(changed_file_urls)
      Util::UpdateTable.new(table: table, manifest_url: manifest_url).run
    end

    private

    attr_reader :table, :s3_util

    # Extracts rows updated at or after the timestamp reported by the table.
    def extract_changes
      extractor = Util::ExtractAndTransformUpdates.new(
        table: table,
        since: table.redshift_last_update,
        s3_util: s3_util
      )
      extractor.run
    end

    # Uploads a uniquely named manifest for the given data files and returns
    # the value produced by the manifest writer.
    def write_manifest(changed_file_urls)
      writer = Util::S3ManifestWriter.new(
        file_name: "#{SecureRandom.uuid}.manifest",
        file_urls: changed_file_urls,
        s3_util: s3_util
      )
      writer.run
    end
  end
end
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
require 'dynosaur'
|
|
2
|
+
|
|
3
|
+
module Redshifter
  module Job
    # Resque job that starts a Dynosaur Heroku process running the
    # `redshifter:update` rake task for a single table config key.
    class UpdateRedshiftTableJob
      @queue = :low

      class << self
        # table_config_key - key of the table entry in Redshifter.config.tables.
        # Returns the result of starting the Dynosaur process.
        def perform(table_config_key)
          Dynosaur::Process::Heroku
            .new(task: 'redshifter:update', args: [table_config_key])
            .start
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
module Redshifter
  # Describes one exported table: where its rows come from, how each Redshift
  # column is populated, and the DDL used to create it in Redshift.
  class Table
    EPOCH_TIMESTAMP = '1970-01-01 00:00:00'

    # config - Hash for a single entry of Redshifter.config.tables.
    #
    # Raises (via Util::TableConfigValidator#validate!) when the config is
    # invalid.
    def initialize(config)
      Util::TableConfigValidator.new(config).validate!

      @source_table_name = config[:source_table_name]
      @redshift_table_name = config[:redshift_table_name]
      @redshift_columns = config[:redshift_columns]
      @source_column_transforms = config[:source_column_transforms] || {}
      @redshift_distribution_style = config[:redshift_distribution_style]
      @redshift_distribution_key = config[:redshift_distribution_key]
      @redshift_sort_keys = config[:redshift_sort_keys]
      @redshift_sort_style = config[:redshift_sort_style]
      @redshift_primary_key = config[:redshift_primary_key]
    end

    attr_reader :source_table_name, :redshift_table_name

    # Names of the Redshift columns, in configured order.
    def redshift_column_names
      redshift_columns.keys
    end

    # One source SQL expression per Redshift column, in column order: the
    # configured transform when there is one, otherwise the column name.
    #
    # source_column_transforms always holds a Hash (see #initialize), so no
    # nil guard is needed here. The previous guard used `return` inside the
    # block, which would have returned a single column name from the whole
    # method instead of an Array.
    def source_column_statements
      redshift_column_names.map do |column|
        source_column_transforms[column] || column
      end
    end

    def redshift_schema
      Redshifter.config.redshift_schema
    end

    # CREATE TABLE statement for this table's columns, keys and styles.
    # table_name defaults to the configured Redshift table name; callers can
    # pass another name to create a same-shaped table.
    def redshift_table_ddl(table_name = redshift_table_name)
      <<-QUERY.squish
        CREATE TABLE
          #{redshift_schema}.#{table_name}(
            #{redshift_columns.map { |k, v| "#{k} #{v}" }.join(', ')}#{primary_key_statement}
          )
          #{dist_statement}
          #{sortkey_statement};
      QUERY
    end

    # Returns the unix epoch timestamp literal if the table does not exist or
    # exists with zero rows. Otherwise returns the timestamp literal of the
    # most recently updated row in the analytics table.
    #
    # The connection opened here is always closed before returning, including
    # when a query raises.
    def redshift_last_update
      conn = Util::Redshift.connect

      table_presence_query = <<-QUERY.squish
        SELECT EXISTS(
          SELECT 1
          FROM information_schema.tables
          WHERE table_schema = '#{redshift_schema}'
          AND table_name = '#{redshift_table_name}')
      QUERY

      # Redshift does not allow pg catalog table and user table to be accessed
      # in the same query
      # http://docs.aws.amazon.com/redshift/latest/dg/c_sql-functions-leader-node.html
      table_present = conn.exec(table_presence_query).getvalue(0, 0)
      return EPOCH_TIMESTAMP unless table_present == 't'

      last_update_query = <<-QUERY.squish
        SELECT COALESCE(MAX(updated_at), TIMESTAMP '#{EPOCH_TIMESTAMP}')
        FROM #{redshift_schema}.#{redshift_table_name}
      QUERY

      conn.exec(last_update_query).getvalue(0, 0)
    ensure
      conn&.close
    end

    private

    attr_reader :redshift_columns,
                :source_column_transforms,
                :redshift_distribution_key,
                :redshift_sort_keys,
                :redshift_distribution_style,
                :redshift_sort_style,
                :redshift_primary_key

    # DISTSTYLE clause; the DISTKEY is appended only for the KEY style.
    def dist_statement
      statement = "DISTSTYLE #{redshift_distribution_style}"
      return statement unless redshift_distribution_style == 'KEY'

      "#{statement} DISTKEY(#{redshift_distribution_key})"
    end

    # SORTKEY clause, prefixed with the sort style when one is configured.
    def sortkey_statement
      sort_key = "SORTKEY(#{redshift_sort_keys.join(', ')})"
      redshift_sort_style ? "#{redshift_sort_style} #{sort_key}" : sort_key
    end

    # Trailing PRIMARY KEY column constraint, or an empty string when no
    # primary key is configured.
    def primary_key_statement
      redshift_primary_key ? ", PRIMARY KEY(#{redshift_primary_key})" : ''
    end
  end
end
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
module Redshifter
  # Defines the `redshifter:replace` and `redshifter:update` rake tasks.
  class Tasks
    include Rake::DSL if defined? Rake::DSL

    # Registers both tasks under the :redshifter namespace. Each task takes a
    # single argument, the key of a table entry in Redshifter.config.tables,
    # and depends on the :environment task.
    def install_tasks
      namespace :redshifter do
        desc 'Create or replace an extracted table in Redshift'
        task :replace, [:table_config_key] => :environment do |_task, args|
          table = build_table(args[:table_config_key])
          Redshifter::ExtractAndReplaceRedshiftTable.new(table).run
        end

        desc 'Update an extracted table in Redshift'
        task :update, [:table_config_key] => :environment do |_task, args|
          table = build_table(args[:table_config_key])
          Redshifter::ExtractAndUpdateRedshiftTable.new(table).run
        end
      end
    end

    private

    # Looks up the table config for the given key and wraps it in a Table.
    #
    # Raises ArgumentError naming the offending key and the configured keys
    # when the key is missing or unknown, instead of passing nil on to
    # Table.new where it surfaces as an unrelated NoMethodError.
    def build_table(table_config_key)
      tables = Redshifter.config.tables || {}
      table_config = tables.fetch(table_config_key) do
        raise ArgumentError,
              "unknown redshifter table config key #{table_config_key.inspect}; " \
              "configured keys: #{tables.keys.inspect}"
      end

      Redshifter::Table.new(table_config)
    end
  end
end
|
|
24
|
+
|
|
25
|
+
Redshifter::Tasks.new.install_tasks
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
module Redshifter
  module Util
    # Drops, recreates and bulk loads one Redshift table from an S3 manifest,
    # then grants read access on it.
    class CreateOrReplaceTable
      def initialize(table:, manifest_url:)
        @table = table
        @manifest_url = manifest_url
      end

      # Executes the reload statement followed by the grant statement inside
      # a single transaction on conn.
      def run(conn = Redshift.connect)
        conn.transaction do |txn|
          statements = [drop_and_create_table_sql, grant_readonly_permissions_sql]
          statements.each { |sql| txn.exec(sql) }
        end
      end

      private

      attr_reader :table, :manifest_url

      # Schema-qualified name of the destination table.
      def qualified_table_name
        "#{table.redshift_schema}.#{table.redshift_table_name}"
      end

      # Creates or replaces an existing table from an s3 manifest of gzipped
      # pipe delimited files.
      #
      # COPY is the most efficient way to load data into redshift. When COPY
      # is run on a new table with zero rows it automatically runs statistics
      # (ANALYZE) and sorts and distributes data (VACUUM). Using the manifest
      # allows the COPY command to run in parallel.
      #
      # SQL notes:
      #   CSV QUOTE AS '"' -- only way to define the quote character for
      #                    -- multiline column data (even though the file is
      #                    -- not comma separated)
      #   DELIMITER '|' -- explicit rather than relying on the default
      #   TIMEFORMAT 'YYYY-MM-DD HH:MI:SS' -- explicit instead of auto detect
      def drop_and_create_table_sql
        target = qualified_table_name
        access_key = Redshifter.config.aws_access_key_id
        secret_key = Redshifter.config.aws_secret_access_key

        <<~QUERY.squish
          DROP TABLE IF EXISTS #{target};

          #{table.redshift_table_ddl}

          COPY #{target}
          (#{table.redshift_column_names.join(', ')})
          FROM '#{manifest_url}'
          CREDENTIALS 'aws_access_key_id=#{access_key};aws_secret_access_key=#{secret_key}'
          CSV QUOTE AS '"'
          DELIMITER '|'
          GZIP
          TIMEFORMAT 'YYYY-MM-DD HH:MI:SS'
          NULL AS '#{ExtractAndTransformUpdates::NULL_CHARACTER}'
          MANIFEST;
        QUERY
      end

      # Grants SELECT and REFERENCES on the table to the readonly group.
      def grant_readonly_permissions_sql
        <<~QUERY.squish
          GRANT SELECT, REFERENCES
          ON TABLE #{qualified_table_name}
          TO GROUP readonly;
        QUERY
      end
    end
  end
end
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
require 'csv'
|
|
2
|
+
|
|
3
|
+
module Redshifter
  module Util
    # Reads updated rows from the source table in id-ordered batches and
    # uploads each batch to S3 as a gzipped, pipe delimited file.
    class ExtractAndTransformUpdates
      # Character used to represent a NULL value in the CSV. An empty string
      # cannot stand in for NULL, because then genuine empty string values
      # would be turned into NULL in the Redshift tables. This has to be a
      # character we do not expect to appear as a value in any table that
      # goes through the ETL process.
      NULL_CHARACTER = '∅'

      def initialize(table:, since:, s3_util:)
        @table = table
        @since = since
        @s3_util = s3_util
      end

      # Writes pipe delimited 'CSV' files of updated records to S3, one file
      # per batch of at most batch_size rows.
      # Returns the list of values returned by s3_util.upload_file, in batch
      # order.
      def run(batch_size: 1000)
        run_name = SecureRandom.uuid
        uploaded_s3_urls = []

        transform_in_batches(table.source_column_statements, batch_size: batch_size) do |rows, batch|
          lines = rows.map { |row| csv_row(row) }
          upload_name = "#{table.redshift_table_name}_updates_#{run_name}_#{batch}.txt"

          uploaded_s3_urls << s3_util.upload_file(file_name: upload_name, body: lines.join, gzip: true)
        end

        uploaded_s3_urls
      end

      private

      attr_reader :table, :since, :s3_util

      # Replaces nil fields with NULL_CHARACTER (in place) and renders the
      # row as one pipe separated CSV line.
      def csv_row(row)
        row.map! { |field| field.nil? ? NULL_CHARACTER : field }
        CSV.generate_line(row, col_sep: '|')
      end

      # Yields [rows, batch_number] for successive batches. Batches are keyed
      # on the source id: each query starts at the id following the last row
      # of the previous batch, beginning at id 1.
      def transform_in_batches(*column_transforms, batch_size: 1000)
        # guarantee id is present in the first column for batch functionality
        select_columns = ['id as id_for_batching', *column_transforms]
        next_start_id = 1
        batch_number = 0

        loop do
          batch_sql = select_batch_sql(columns: select_columns,
                                       batch_size: batch_size,
                                       start_id: next_start_id)
          rows = fetch_rows(batch_sql)
          break if rows.empty?

          # the injected id sits in the first column; remember the last one,
          # then strip that column from every row
          highest_id = rows.last[0].to_i
          rows.map! { |row| row[1..-1] }

          yield rows, batch_number

          # a short batch means the source has no more matching rows
          break if rows.size < batch_size

          next_start_id = highest_id + 1
          batch_number += 1
        end
      end

      # Runs the query on a pooled ActiveRecord connection and returns its rows.
      def fetch_rows(sql)
        result = ActiveRecord::Base.connection_pool.with_connection do |db|
          db.exec_query(sql)
        end
        result.rows
      end

      def select_batch_sql(columns:, batch_size:, start_id:)
        "select #{columns.join(', ')} from #{table.source_table_name} where updated_at >= '#{since}' AND id >= #{start_id} ORDER BY id ASC limit #{batch_size}"
      end
    end
  end
end
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
module Redshifter
  module Util
    # Opens connections to the configured Redshift cluster.
    module Redshift
      # Returns a new PG connection built from the redshift_* settings in
      # Redshifter.config. SSL is always required.
      def self.connect
        settings = Redshifter.config
        connection_options = {
          host: settings.redshift_host,
          port: settings.redshift_port,
          dbname: settings.redshift_database,
          user: settings.redshift_username,
          password: settings.redshift_password,
          sslmode: 'require'
        }

        PG.connect(connection_options)
      end
    end
  end
end
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
require 'fog'
|
|
2
|
+
require 'tmpdir'
|
|
3
|
+
|
|
4
|
+
module Redshifter
  module Util
    # Uploads generated files to the configured S3 bucket through Fog.
    class S3
      # Writes body to a temporary file (gzipped when gzip is true, in which
      # case '.gz' is appended to the name), uploads it as a private object
      # keyed by the file's base name, and removes the temporary directory.
      #
      # Returns the internal "s3://bucket/key" URL of the uploaded object.
      def upload_file(file_name:, body:, gzip: false)
        Dir.mktmpdir('redshifter', ensure_app_tmp_directory) do |temp_dir|
          temp_path = write_temp_file(File.join(temp_dir, file_name), body, gzip)

          File.open(temp_path) do |file|
            s3_file = bucket.files.create(file_options(file, gzip))
            internal_file_url(s3_file.key)
          end
        end
      end

      private

      def internal_file_url(file_name)
        "s3://#{bucket_name}/#{file_name}"
      end

      def conn
        @conn ||= Fog::Storage.new(
          provider: 'AWS',
          aws_access_key_id: Redshifter.config.aws_access_key_id,
          aws_secret_access_key: Redshifter.config.aws_secret_access_key
        )
      end

      def bucket
        @bucket ||= conn.directories.get(bucket_name)
      end

      def bucket_name
        Redshifter.config.s3_bucket
      end

      # Options passed to Fog when creating the object; gzipped uploads get
      # the extra encoding/mime-type entries.
      def file_options(file, gzip)
        options = {
          key: File.basename(file),
          body: file,
          public: false
        }
        return options unless gzip

        options.merge(
          content_encoding: 'ASCII-8BIT',
          compression_mime_type: 'application/x-gzip'
        )
      end

      # Makes sure the configured temp directory exists and returns its path.
      # mkdir_p is used so that a configured path whose parent directories do
      # not exist yet (e.g. "tmp/redshifter" in a fresh checkout) is created
      # instead of failing.
      def ensure_app_tmp_directory
        require 'fileutils'

        tmp_path = Redshifter.config.temp_directory_path
        FileUtils.mkdir_p(tmp_path)
        tmp_path
      end

      # Writes body to disk and returns the path of the file that was written.
      def write_temp_file(file_path, body, gzip)
        if gzip
          write_temp_gzipped_file(file_path, body)
        else
          write_temp_uncompressed_file(file_path, body)
        end
      end

      def write_temp_uncompressed_file(file_path, body)
        File.write(file_path, body)
        file_path
      end

      # The block form closes the writer even when the write raises; closing
      # is what flushes the remaining gzip data to the file.
      def write_temp_gzipped_file(file_path, body)
        require 'zlib'

        gz_file_path = "#{file_path}.gz"
        Zlib::GzipWriter.open(gz_file_path) { |gz| gz.write(body) }
        gz_file_path
      end
    end
  end
end
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
module Redshifter
  module Util
    # Builds and uploads an S3 manifest file listing a set of data file URLs.
    class S3ManifestWriter
      def initialize(file_name:, file_urls:, s3_util:)
        @file_name = file_name
        @file_urls = file_urls
        @s3_util = s3_util
      end

      # Uploads the manifest, uncompressed, as JSON. Every listed file is
      # flagged `mandatory: true`.
      # Returns whatever s3_util.upload_file returns for the manifest.
      def run
        manifest_json = JSON.generate(manifest)
        s3_util.upload_file(file_name: file_name, body: manifest_json, gzip: false)
      end

      private

      attr_reader :file_name, :file_urls, :s3_util

      # Manifest document: one entry per data file URL, in the given order.
      def manifest
        entries = file_urls.map { |url| { url: url, mandatory: true } }
        { entries: entries }
      end
    end
  end
end
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
module Redshifter
  module Util
    # Checks a single table config Hash for required keys and allowed values.
    class TableConfigValidator
      REQUIRED_KEYS = %i[
        source_table_name
        redshift_table_name
        redshift_columns
        redshift_distribution_style
        redshift_sort_keys
      ]

      # hash format: { required_key: { when_this_key_and_value: 'present'} }
      CONDITIONALLY_REQUIRED_KEYS = {
        redshift_distribution_key: { redshift_distribution_style: 'KEY' }
      }

      # Include nil in the list when the key is optional but restricted to
      # certain values when present. e.g. {optional_key: ['valid1', 'valid2', nil]}
      ALLOWABLE_VALUES = {
        redshift_distribution_style: %w[KEY ALL EVEN],
        redshift_sort_style: ['COMPOUND', 'INTERLEAVED', nil]
      }

      def initialize(config)
        @config = config
      end

      # Raises RuntimeError ('invalid table config') unless the config is
      # valid; returns nil otherwise.
      def validate!
        return if valid?

        raise 'invalid table config'
      end

      # True when every required key is present, every conditionally required
      # key is present where its condition holds, and every restricted key
      # has an allowed value.
      def valid?
        return false unless all_required_keys_present?
        return false unless all_conditionally_required_keys_present?

        all_validated_values_allowed?
      end

      private

      attr_reader :config

      def all_required_keys_present?
        (REQUIRED_KEYS - config.keys).empty?
      end

      # A conditional key is only demanded when the config's entry for the
      # condition's key matches the condition exactly.
      def all_conditionally_required_keys_present?
        CONDITIONALLY_REQUIRED_KEYS.all? do |required_key, condition|
          trigger_key = condition.keys.first
          condition_met = config.select { |key, _value| key == trigger_key } == condition

          !condition_met || config.include?(required_key)
        end
      end

      def all_validated_values_allowed?
        ALLOWABLE_VALUES.all? do |key, allowed_values|
          allowed_values.include?(config[key])
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
module Redshifter
  module Util
    # Applies a batch of extracted rows, described by an S3 manifest, to an
    # existing Redshift table: the rows are staged in a scratch table, swapped
    # into the destination by id, the scratch table is dropped and the
    # destination's statistics are refreshed.
    class UpdateTable
      # table        - describes the destination; must respond to
      #                redshift_schema, redshift_table_name,
      #                redshift_column_names and redshift_table_ddl(name).
      # manifest_url - location of the S3 manifest handed to COPY.
      def initialize(table:, manifest_url:)
        @manifest_url = manifest_url
        @table = table
      end

      # Builds the four statements and executes them, in order, inside the
      # block given to conn.transaction.
      #
      # conn - connection to use; defaults to the result of Redshift.connect.
      def run(conn = Redshift.connect)
        conn.transaction do |txn|
          statements = [
            create_and_load_temp_table_sql,
            upsert_changes_sql,
            cleanup_temp_table_sql,
            analyze_updated_table_sql
          ]
          statements.each { |sql| txn.exec(sql) }
        end
      end

      private

      attr_reader :table, :manifest_url

      # Name of the scratch table the incoming rows are staged in.
      def analytics_temp_table
        "#{table.redshift_table_name}_temp"
      end

      # Prefixes +name+ with the destination schema.
      def qualified(name)
        "#{table.redshift_schema}.#{name}"
      end

      # Creates the scratch table from the original table DDL, because that is
      # the most efficient way to get a table that most closely mimics the
      # destination.
      # http://docs.aws.amazon.com/redshift/latest/dg/performing-a-deep-copy.html
      #
      # Then imports rows from an S3 manifest of gzipped, pipe-delimited
      # files. Using the manifest allows the COPY command to run in parallel.
      #
      # SQL notes:
      #   CSV QUOTE AS '"' -- only way to define the quote character for
      #                    -- multiline column data (even though the file is
      #                    -- not comma separated)
      #   DELIMITER '|'    -- this is the default separator, just being explicit
      #   TIMEFORMAT 'YYYY-MM-DD HH:MI:SS' -- explicit instead of auto detect
      #   compupdate off   -- save time by not compressing columns; it's temporary
      #   statupdate off   -- save time by not running statistics; it's temporary
      def create_and_load_temp_table_sql
        temp_table_ddl = table.redshift_table_ddl(analytics_temp_table)
        copy_target = qualified(analytics_temp_table)
        column_list = table.redshift_column_names.join(', ')
        credentials = "aws_access_key_id=#{Redshifter.config.aws_access_key_id};" \
                      "aws_secret_access_key=#{Redshifter.config.aws_secret_access_key}"
        null_marker = ExtractAndTransformUpdates::NULL_CHARACTER

        <<-QUERY.squish
          #{temp_table_ddl}

          COPY #{copy_target}
          (#{column_list})
          FROM '#{manifest_url}'
          CREDENTIALS '#{credentials}'
          CSV QUOTE AS '"'
          DELIMITER '|'
          GZIP
          TIMEFORMAT 'YYYY-MM-DD HH:MI:SS'
          NULL AS '#{null_marker}'
          MANIFEST
          compupdate off
          statupdate off;
        QUERY
      end

      # Replaces existing rows with their updated versions using the method
      # AWS recommends: delete destination rows whose id appears in the
      # scratch table, then insert everything from the scratch table.
      # http://docs.aws.amazon.com/redshift/latest/dg/merge-replacing-existing-rows.html
      def upsert_changes_sql
        destination = qualified(table.redshift_table_name)
        staging = qualified(analytics_temp_table)

        <<-QUERY.squish
          DELETE FROM #{destination}
          USING #{staging}
          WHERE #{destination}.id = #{staging}.id;

          INSERT INTO #{destination}
          SELECT * FROM #{staging};
        QUERY
      end

      # Drops the scratch table once its rows have been merged.
      def cleanup_temp_table_sql
        "drop table #{qualified(analytics_temp_table)};"
      end

      # ANALYZE recomputes table statistics for efficient querying after the
      # change.
      def analyze_updated_table_sql
        "analyze #{qualified(table.redshift_table_name)};"
      end
    end
  end
end
|
data/redshifter.gemspec
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
# coding: utf-8
# Gem specification for redshifter: package metadata, packaged files and
# dependencies.

# Put lib/ on the load path so the version constant can be required below.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'redshifter/version'

Gem::Specification.new do |spec|
  spec.name = 'redshifter'
  spec.version = Redshifter::VERSION
  spec.authors = ['Justin Richard']
  spec.email = ['justin@apartmentlist.com']

  spec.summary = %q{ETL processing jobs for exporting Rails model tables to Redshift}
  spec.homepage = 'https://github.com/apartmentlist/redshifter'

  # Package every git-tracked file except those under test/, spec/ or features/.
  spec.files = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
  spec.require_paths = ['lib']

  spec.add_runtime_dependency 'dynosaur', '~> 0'
  spec.add_runtime_dependency 'fog', '~> 1.36.0'
  # mime-types is now an explicit dependency of fog-core >= 1.35.0.
  # fog 1.36.0 has a loose dependency on fog-core "~> 1.32", which causes this
  # dependency change to bubble up to redshifter.
  spec.add_runtime_dependency 'mime-types'
  spec.add_runtime_dependency 'pg', '~> 0.18'

  spec.add_development_dependency 'bundler'
  spec.add_development_dependency 'rake', '~> 10.0'
  spec.add_development_dependency 'pry-byebug', '~> 3'
  spec.add_development_dependency 'rspec', '~> 3.3'
end
|
metadata
ADDED
|
@@ -0,0 +1,180 @@
|
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
|
+
name: redshifter
|
|
3
|
+
version: !ruby/object:Gem::Version
|
|
4
|
+
version: 0.3.0
|
|
5
|
+
platform: ruby
|
|
6
|
+
authors:
|
|
7
|
+
- Justin Richard
|
|
8
|
+
autorequire:
|
|
9
|
+
bindir: bin
|
|
10
|
+
cert_chain: []
|
|
11
|
+
date: 2015-12-04 00:00:00.000000000 Z
|
|
12
|
+
dependencies:
|
|
13
|
+
- !ruby/object:Gem::Dependency
|
|
14
|
+
name: dynosaur
|
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
|
16
|
+
requirements:
|
|
17
|
+
- - "~>"
|
|
18
|
+
- !ruby/object:Gem::Version
|
|
19
|
+
version: '0'
|
|
20
|
+
type: :runtime
|
|
21
|
+
prerelease: false
|
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
23
|
+
requirements:
|
|
24
|
+
- - "~>"
|
|
25
|
+
- !ruby/object:Gem::Version
|
|
26
|
+
version: '0'
|
|
27
|
+
- !ruby/object:Gem::Dependency
|
|
28
|
+
name: fog
|
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
|
30
|
+
requirements:
|
|
31
|
+
- - "~>"
|
|
32
|
+
- !ruby/object:Gem::Version
|
|
33
|
+
version: 1.36.0
|
|
34
|
+
type: :runtime
|
|
35
|
+
prerelease: false
|
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
37
|
+
requirements:
|
|
38
|
+
- - "~>"
|
|
39
|
+
- !ruby/object:Gem::Version
|
|
40
|
+
version: 1.36.0
|
|
41
|
+
- !ruby/object:Gem::Dependency
|
|
42
|
+
name: mime-types
|
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
|
44
|
+
requirements:
|
|
45
|
+
- - ">="
|
|
46
|
+
- !ruby/object:Gem::Version
|
|
47
|
+
version: '0'
|
|
48
|
+
type: :runtime
|
|
49
|
+
prerelease: false
|
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
51
|
+
requirements:
|
|
52
|
+
- - ">="
|
|
53
|
+
- !ruby/object:Gem::Version
|
|
54
|
+
version: '0'
|
|
55
|
+
- !ruby/object:Gem::Dependency
|
|
56
|
+
name: pg
|
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
|
58
|
+
requirements:
|
|
59
|
+
- - "~>"
|
|
60
|
+
- !ruby/object:Gem::Version
|
|
61
|
+
version: '0.18'
|
|
62
|
+
type: :runtime
|
|
63
|
+
prerelease: false
|
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
65
|
+
requirements:
|
|
66
|
+
- - "~>"
|
|
67
|
+
- !ruby/object:Gem::Version
|
|
68
|
+
version: '0.18'
|
|
69
|
+
- !ruby/object:Gem::Dependency
|
|
70
|
+
name: bundler
|
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
|
72
|
+
requirements:
|
|
73
|
+
- - ">="
|
|
74
|
+
- !ruby/object:Gem::Version
|
|
75
|
+
version: '0'
|
|
76
|
+
type: :development
|
|
77
|
+
prerelease: false
|
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
79
|
+
requirements:
|
|
80
|
+
- - ">="
|
|
81
|
+
- !ruby/object:Gem::Version
|
|
82
|
+
version: '0'
|
|
83
|
+
- !ruby/object:Gem::Dependency
|
|
84
|
+
name: rake
|
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
|
86
|
+
requirements:
|
|
87
|
+
- - "~>"
|
|
88
|
+
- !ruby/object:Gem::Version
|
|
89
|
+
version: '10.0'
|
|
90
|
+
type: :development
|
|
91
|
+
prerelease: false
|
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
93
|
+
requirements:
|
|
94
|
+
- - "~>"
|
|
95
|
+
- !ruby/object:Gem::Version
|
|
96
|
+
version: '10.0'
|
|
97
|
+
- !ruby/object:Gem::Dependency
|
|
98
|
+
name: pry-byebug
|
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
|
100
|
+
requirements:
|
|
101
|
+
- - "~>"
|
|
102
|
+
- !ruby/object:Gem::Version
|
|
103
|
+
version: '3'
|
|
104
|
+
type: :development
|
|
105
|
+
prerelease: false
|
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
107
|
+
requirements:
|
|
108
|
+
- - "~>"
|
|
109
|
+
- !ruby/object:Gem::Version
|
|
110
|
+
version: '3'
|
|
111
|
+
- !ruby/object:Gem::Dependency
|
|
112
|
+
name: rspec
|
|
113
|
+
requirement: !ruby/object:Gem::Requirement
|
|
114
|
+
requirements:
|
|
115
|
+
- - "~>"
|
|
116
|
+
- !ruby/object:Gem::Version
|
|
117
|
+
version: '3.3'
|
|
118
|
+
type: :development
|
|
119
|
+
prerelease: false
|
|
120
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
121
|
+
requirements:
|
|
122
|
+
- - "~>"
|
|
123
|
+
- !ruby/object:Gem::Version
|
|
124
|
+
version: '3.3'
|
|
125
|
+
description:
|
|
126
|
+
email:
|
|
127
|
+
- justin@apartmentlist.com
|
|
128
|
+
executables: []
|
|
129
|
+
extensions: []
|
|
130
|
+
extra_rdoc_files: []
|
|
131
|
+
files:
|
|
132
|
+
- ".gitignore"
|
|
133
|
+
- ".rspec"
|
|
134
|
+
- ".travis.yml"
|
|
135
|
+
- Gemfile
|
|
136
|
+
- LICENSE.txt
|
|
137
|
+
- README.md
|
|
138
|
+
- Rakefile
|
|
139
|
+
- bin/console
|
|
140
|
+
- bin/setup
|
|
141
|
+
- lib/redshifter.rb
|
|
142
|
+
- lib/redshifter/config.rb
|
|
143
|
+
- lib/redshifter/extract_and_replace_redshift_table.rb
|
|
144
|
+
- lib/redshifter/extract_and_update_redshift_table.rb
|
|
145
|
+
- lib/redshifter/job/update_redshift_table_job.rb
|
|
146
|
+
- lib/redshifter/table.rb
|
|
147
|
+
- lib/redshifter/tasks.rb
|
|
148
|
+
- lib/redshifter/util/create_or_replace_table.rb
|
|
149
|
+
- lib/redshifter/util/extract_and_transform_updates.rb
|
|
150
|
+
- lib/redshifter/util/redshift.rb
|
|
151
|
+
- lib/redshifter/util/s3.rb
|
|
152
|
+
- lib/redshifter/util/s3_manifest_writer.rb
|
|
153
|
+
- lib/redshifter/util/table_config_validator.rb
|
|
154
|
+
- lib/redshifter/util/update_table.rb
|
|
155
|
+
- lib/redshifter/version.rb
|
|
156
|
+
- redshifter.gemspec
|
|
157
|
+
homepage: https://github.com/apartmentlist/redshifter
|
|
158
|
+
licenses: []
|
|
159
|
+
metadata: {}
|
|
160
|
+
post_install_message:
|
|
161
|
+
rdoc_options: []
|
|
162
|
+
require_paths:
|
|
163
|
+
- lib
|
|
164
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
|
165
|
+
requirements:
|
|
166
|
+
- - ">="
|
|
167
|
+
- !ruby/object:Gem::Version
|
|
168
|
+
version: '0'
|
|
169
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
170
|
+
requirements:
|
|
171
|
+
- - ">="
|
|
172
|
+
- !ruby/object:Gem::Version
|
|
173
|
+
version: '0'
|
|
174
|
+
requirements: []
|
|
175
|
+
rubyforge_project:
|
|
176
|
+
rubygems_version: 2.4.5
|
|
177
|
+
signing_key:
|
|
178
|
+
specification_version: 4
|
|
179
|
+
summary: ETL processing jobs to exporting Rails model tables to Redshift
|
|
180
|
+
test_files: []
|