redshifter 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 8760ba2ad72e5e668d9f3c4eb6b70d04d86b15e9
4
+ data.tar.gz: 8eb1c3fa50e558f9cfac14ae1d4c606754d0bdf7
5
+ SHA512:
6
+ metadata.gz: d1e972e73eea10034797bf4713185a845e9f8f1c61227b1db480c355aeda7a232aa8c2caa50a2cdb3e95243ec94ece2a5acc35383cbb016f35bb6f63ba150099
7
+ data.tar.gz: 6ed7ad01c948ee874b558e0f13a1009249feb95b7365330557e3d5a6bd754426c47092fa3ff53788f5b92af4c53a2cdea0792ba82586428d3f46b4a97d33a685
data/.gitignore ADDED
@@ -0,0 +1,4 @@
1
+ .rvmrc
2
+ *.gem
3
+ Gemfile.lock
4
+ tmp
data/.rspec ADDED
@@ -0,0 +1,3 @@
1
+ --format documentation
2
+ --color
3
+ --require 'spec_helper'
data/.travis.yml ADDED
@@ -0,0 +1,4 @@
1
+ language: ruby
2
+ rvm:
3
+ - 2.2.1
4
+ - 2.2.3
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in redshifter.gemspec
4
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2015 Apartment List, Inc
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,147 @@
1
+ # Redshifter
2
+
3
+ Provides a resque job and rake tasks to facilitate ETL (Extract Transform Load) processing of Postgres tables for export to a Redshift cluster.
4
+
5
+ Specifically provides:
6
+ 1) Create/Replace job to replace all model data in Redshift
7
+ 2) Update job updates all records created or updated since the last update ran
8
+
9
+ Limitations:
10
+ * deleted records are NOT synced to Redshift by the update job
11
+
12
+ Feature Roadmap:
13
+ * store last runtime locally instead of using potentially costly redshift query
14
+
15
+ ## Versions
16
+ 0.2.4 - New config format; update and replace rake tasks available
17
+ 0.3.0 - Public version
18
+
19
+ ## Installation
20
+
21
+ Add this line to your application's Gemfile:
22
+
23
+ ```ruby
24
+ gem 'redshifter'
25
+ ```
26
+
27
+ And then execute:
28
+
29
+ $ bundle
30
+
31
+ Or install it yourself as:
32
+
33
+ $ gem install redshifter
34
+
35
+ ## Usage
36
+
37
+ ### Setup Redshifter in a Rails initializer
38
+ ```ruby
39
+ # config/initializers/redshifter.rb
40
+ Redshifter.setup do |config|
41
+ # path in your app available for writing temp files
42
+ config.temp_directory_path = File.expand_path('../../tmp', __FILE__)
43
+ # path to redshifter table config
44
+ config.table_config_path = File.expand_path('../../redshifter.rb', __FILE__)
45
+ # redshift user should have access to create tables in the specified schema
46
+ config.redshift_username = 'your_app_user'
47
+ config.redshift_password = 'p@ssw0rd'
48
+ config.redshift_host = 'app.host.without.protocol.com'
49
+ config.redshift_port = 5439
50
+ config.redshift_database = 'database_name'
51
+ config.redshift_schema = 'a_schema'
52
+ # AWS user should be allowed full access to the specified bucket
53
+ config.aws_access_key_id = '<AWS user access key ID>'
54
+ config.aws_secret_access_key = '<AWS user secret access key>'
55
+ config.s3_bucket = 'a_redshifter_bucket'
56
+ # Heroku user must be a member of the app and have privileges to
57
+ # start new dynos
58
+ config.heroku_api_key = '<Heroku user api key>'
59
+ config.heroku_app_name = 'name of the app on heroku'
60
+ end
61
+ ```
62
+
63
+ ### Require Redshifter tasks in your Rakefile
64
+ ```ruby
65
+ # Rakefile
66
+ # ...
67
+ require 'redshifter/tasks'
68
+ # ...
69
+ ```
70
+
71
+ ### Create a config file describing the tables, columns and transforms for export to redshift
72
+ ```ruby
73
+ # config/redshifter.rb
74
+ Redshifter.config.tables = {
75
+ 'books_with_export_at' => {
76
+ # [required] Source *table* name, not the Rails model name
77
+ source_table_name: 'books',
78
+ # [required] Prefixing your redshift table with its source is recommended
79
+ redshift_table_name: 'app_name_books',
80
+ # [required] Columns with Redshift datatypes to create; may differ from source DB
81
+ redshift_columns: {
82
+ 'id' => 'INTEGER',
83
+ 'title' => 'VARCHAR(128)',
84
+ 'published_at' => 'TIMESTAMP',
85
+ 'updated_at' => 'TIMESTAMP',
86
+ 'exported_at' => 'TIMESTAMP'
87
+ },
88
+ # [optional] SQL statements to transform or populate redshift columns from
89
+ # source DB. By default, redshift columns will be populated from source
90
+ # column with the same name. Column key must exist in redshift_columns.
91
+ # If a matching source column does not exist you MUST specify it here.
92
+ source_column_transforms: {
93
+ 'title' => "lower(title)",
94
+ 'published_at' => 'first_edition_published_at',
95
+ 'exported_at' => 'now()'
96
+ },
97
+ # [required] valid values: KEY, EVEN, ALL
98
+ redshift_distribution_style: 'KEY',
99
+ # [required, if redshift_distribution_style: 'KEY'] distribution key column
100
+ # name MUST be present in redshift_columns.keys
101
+ redshift_distribution_key: 'id',
102
+ # [optional] valid values: COMPOUND, INTERLEAVED; If omitted the Redshift
103
+ # table DDL statement will not specify the sort style and Redshift will
104
+ # implicitly default to COMPOUND style.
105
+ redshift_sort_style: 'INTERLEAVED',
106
+ # [required] Column names MUST be present in redshift_columns.keys; Max
107
+ # length of 8 when using INTERLEAVED sort style, and 400 when using
108
+ # COMPOUND sort style.
109
+ redshift_sort_keys: ['published_at'],
110
+ # [optional] Used for query planning in Redshift
111
+ redshift_primary_key: 'id'
112
+ }
113
+ }
114
+
115
+ ```
116
+
117
+ ### Run redshifter:replace rake task for each table you want to export
118
+ ```
119
+ $ rake redshifter:replace[books_with_export_at]
120
+ ```
121
+
122
+ ### Schedule a Redshifter::Job::UpdateRedshiftTableJob resque job for each table whose updates you want to export
123
+
124
+ For example, add an entry like the following to `resque_schedule.yml` to run the update job once a day at 10:00pm:
125
+ ```YAML
126
+ # config/resque_schedule.yml
127
+
128
+ etl_books_to_redshift:
129
+ cron: "0 22 * * *"
130
+ class: 'Redshifter::Job::UpdateRedshiftTableJob'
131
+ args: 'books_with_export_at'
132
+ description: 'Export the books table to Redshift'
133
+ ```
134
+
135
+ ## Development
136
+
137
+ After checking out the repo, run `bin/setup` to install dependencies. Then, run `bin/console` for an interactive prompt that will allow you to experiment.
138
+
139
+ To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release` to create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
140
+
141
+ ## Contributing
142
+
143
+ 1. Fork it ( https://github.com/[my-github-username]/redshifter/fork )
144
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
145
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
146
+ 4. Push to the branch (`git push origin my-new-feature`)
147
+ 5. Create a new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,6 @@
1
+ require "bundler/gem_tasks"
2
+ require "rspec/core/rake_task"
3
+
4
+ RSpec::Core::RakeTask.new(:spec)
5
+
6
+ task default: :spec
data/bin/console ADDED
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "bundler/setup"
4
+ require "redshifter"
5
+
6
+ # You can add fixtures and/or initialization code here to make experimenting
7
+ # with your gem easier. You can also use a different console, if you like.
8
+
9
+ # (If you use this, don't forget to add pry to your Gemfile!)
10
+ # require "pry"
11
+ # Pry.start
12
+
13
+ require "irb"
14
+ IRB.start
data/bin/setup ADDED
@@ -0,0 +1,7 @@
1
+ #!/bin/bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+
5
+ bundle install
6
+
7
+ # Do any other automated setup that you need to do here
data/lib/redshifter.rb ADDED
@@ -0,0 +1,15 @@
1
+ require 'redshifter/version'
2
+ require 'redshifter/config'
3
+ require 'redshifter/table'
4
+
5
+ require 'redshifter/util/table_config_validator'
6
+ require 'redshifter/util/redshift'
7
+ require 'redshifter/util/s3'
8
+ require 'redshifter/util/extract_and_transform_updates'
9
+ require 'redshifter/util/s3_manifest_writer'
10
+ require 'redshifter/util/create_or_replace_table'
11
+ require 'redshifter/util/update_table'
12
+
13
+ require 'redshifter/extract_and_replace_redshift_table'
14
+ require 'redshifter/extract_and_update_redshift_table'
15
+ require 'redshifter/job/update_redshift_table_job'
@@ -0,0 +1,41 @@
1
+ module Redshifter
2
+ class << self
3
+ def setup
4
+ yield config if block_given?
5
+ load_export_table_definitions
6
+ config_dynosaur
7
+ end
8
+
9
+ def config
10
+ @config ||= Struct.new(:tables,
11
+ :redshift_username,
12
+ :redshift_password,
13
+ :redshift_host,
14
+ :redshift_port,
15
+ :redshift_database,
16
+ :redshift_schema,
17
+ :aws_access_key_id,
18
+ :aws_secret_access_key,
19
+ :s3_bucket,
20
+ :heroku_api_key,
21
+ :heroku_app_name,
22
+ :temp_directory_path,
23
+ :table_config_path).new
24
+ end
25
+
26
+ private
27
+
28
+ def load_export_table_definitions
29
+ load config.table_config_path
30
+ end
31
+
32
+ def config_dynosaur
33
+ require 'dynosaur'
34
+
35
+ Dynosaur::Client::HerokuClient.configure do |config|
36
+ config.api_key = Redshifter.config.heroku_api_key
37
+ config.app_name = Redshifter.config.heroku_app_name
38
+ end
39
+ end
40
+ end
41
+ end
@@ -0,0 +1,33 @@
1
+ module Redshifter
2
+ class ExtractAndReplaceRedshiftTable
3
+ def initialize(table, s3_util = Util::S3.new)
4
+ @table = table
5
+ @s3_util = s3_util
6
+ end
7
+
8
+ def run
9
+ extracted_s3_urls = Util::ExtractAndTransformUpdates
10
+ .new(table: table,
11
+ since: Table::EPOCH_TIMESTAMP,
12
+ s3_util: s3_util
13
+ ).run
14
+
15
+ if extracted_s3_urls.any?
16
+ manifest_url = Util::S3ManifestWriter
17
+ .new(file_name: "#{SecureRandom.uuid}.manifest",
18
+ file_urls: extracted_s3_urls,
19
+ s3_util: s3_util
20
+ ).run
21
+
22
+ Util::CreateOrReplaceTable
23
+ .new(table: table,
24
+ manifest_url: manifest_url
25
+ ).run
26
+ end
27
+ end
28
+
29
+ private
30
+
31
+ attr_reader :table, :s3_util
32
+ end
33
+ end
@@ -0,0 +1,33 @@
1
+ module Redshifter
2
+ class ExtractAndUpdateRedshiftTable
3
+ def initialize(table, s3_util = Util::S3.new)
4
+ @table = table
5
+ @s3_util = s3_util
6
+ end
7
+
8
+ def run
9
+ extracted_s3_urls = Util::ExtractAndTransformUpdates
10
+ .new(table: table,
11
+ since: table.redshift_last_update,
12
+ s3_util: s3_util
13
+ ).run
14
+
15
+ if extracted_s3_urls.any?
16
+ manifest_url = Util::S3ManifestWriter
17
+ .new(file_name: "#{SecureRandom.uuid}.manifest",
18
+ file_urls: extracted_s3_urls,
19
+ s3_util: s3_util
20
+ ).run
21
+
22
+ Util::UpdateTable
23
+ .new(table: table,
24
+ manifest_url: manifest_url
25
+ ).run
26
+ end
27
+ end
28
+
29
+ private
30
+
31
+ attr_reader :table, :s3_util
32
+ end
33
+ end
@@ -0,0 +1,15 @@
1
+ require 'dynosaur'
2
+
3
+ module Redshifter
4
+ module Job
5
+ class UpdateRedshiftTableJob
6
+ @queue = :low
7
+
8
+ def self.perform(table_config_key)
9
+ dyno = Dynosaur::Process::Heroku
10
+ .new(task: 'redshifter:update', args: [table_config_key])
11
+ dyno.start
12
+ end
13
+ end
14
+ end
15
+ end
@@ -0,0 +1,109 @@
1
+ module Redshifter
2
+ class Table
3
+ EPOCH_TIMESTAMP = '1970-01-01 00:00:00'
4
+
5
+ def initialize(config)
6
+ Util::TableConfigValidator.new(config).validate!
7
+
8
+ @source_table_name = config[:source_table_name]
9
+ @redshift_table_name = config[:redshift_table_name]
10
+ @redshift_columns = config[:redshift_columns]
11
+ @source_column_transforms = config[:source_column_transforms] || {}
12
+ @redshift_distribution_style = config[:redshift_distribution_style]
13
+ @redshift_distribution_key = config[:redshift_distribution_key]
14
+ @redshift_sort_keys = config[:redshift_sort_keys]
15
+ @redshift_sort_style = config[:redshift_sort_style]
16
+ @redshift_primary_key = config[:redshift_primary_key]
17
+ end
18
+
19
+ attr_reader :source_table_name, :redshift_table_name
20
+
21
+ def redshift_column_names
22
+ redshift_columns.keys
23
+ end
24
+
25
# Returns an Array with one source-side SQL expression per Redshift column,
# in the same order as redshift_columns.keys. A column that has an entry in
# source_column_transforms uses that expression; every other column is
# selected by its own name.
#
# Always returns an Array, including when no transforms are configured. A
# `return` inside the map block would exit the whole method with a single
# String, so the fallback is handled with a local default instead.
def source_column_statements
  transforms = source_column_transforms || {}
  redshift_columns.keys.map { |column_name| transforms[column_name] || column_name }
end
31
+
32
+ def redshift_schema
33
+ Redshifter.config.redshift_schema
34
+ end
35
+
36
+ def redshift_table_ddl(table_name = redshift_table_name)
37
+ <<-QUERY.squish
38
+ CREATE TABLE
39
+ #{redshift_schema}.#{table_name}(
40
+ #{redshift_columns.map { |k, v| "#{k} #{v}" }.join(', ')}#{primary_key_statement}
41
+ )
42
+ #{dist_statement}
43
+ #{sortkey_statement};
44
+ QUERY
45
+ end
46
+
47
# Returns the timestamp literal to use as the lower bound of the next update.
#
# Returns the unix epoch timestamp literal if the Redshift table does not
# exist or exists with zero rows. Otherwise returns the timestamp literal of
# the most recently updated row (MAX(updated_at)) in the Redshift table.
#
# The connection opened here is closed before returning (also when a query
# raises), so repeated calls do not leave connections open.
def redshift_last_update
  conn = Util::Redshift.connect

  table_presence_query = <<-QUERY.squish
    SELECT EXISTS(
      SELECT 1
      FROM information_schema.tables
      WHERE table_schema = '#{redshift_schema}'
      AND table_name = '#{redshift_table_name}')
  QUERY

  # Redshift does not allow pg catalog table and user table to be accessed
  # in the same query
  # http://docs.aws.amazon.com/redshift/latest/dg/c_sql-functions-leader-node.html
  table_present = conn.exec(table_presence_query).getvalue(0, 0)
  return EPOCH_TIMESTAMP unless table_present == 't'

  last_update_query = <<-QUERY.squish
    SELECT COALESCE(MAX(updated_at), TIMESTAMP '#{EPOCH_TIMESTAMP}')
    FROM #{redshift_schema}.#{redshift_table_name}
  QUERY

  conn.exec(last_update_query).getvalue(0, 0)
ensure
  # conn is nil when connecting itself failed; respond_to? covers that case.
  conn.close if conn.respond_to?(:close)
end
77
+
78
+ private
79
+
80
+ attr_reader :redshift_columns,
81
+ :source_column_transforms,
82
+ :redshift_distribution_key,
83
+ :redshift_sort_keys,
84
+ :redshift_distribution_style,
85
+ :redshift_sort_style,
86
+ :redshift_primary_key
87
+
88
+ def dist_statement
89
+ output = "DISTSTYLE #{redshift_distribution_style}"
90
+ if redshift_distribution_style == 'KEY'
91
+ output << " DISTKEY(#{redshift_distribution_key})"
92
+ end
93
+
94
+ output
95
+ end
96
+
97
+ def sortkey_statement
98
+ output = ''
99
+ output << "#{redshift_sort_style} " if redshift_sort_style
100
+ output << "SORTKEY(#{redshift_sort_keys.join(', ')})"
101
+
102
+ output
103
+ end
104
+
105
+ def primary_key_statement
106
+ redshift_primary_key ? ", PRIMARY KEY(#{redshift_primary_key})" : ''
107
+ end
108
+ end
109
+ end
@@ -0,0 +1,25 @@
1
+ module Redshifter
2
+ class Tasks
3
+ include Rake::DSL if defined? Rake::DSL
4
+
5
+ def install_tasks
6
+ namespace :redshifter do
7
+ desc 'Create or replace an extracted table in Redshift'
8
+ task :replace, [:table_config_key] => :environment do |_task, args|
9
+ table_config = Redshifter.config.tables[args[:table_config_key]]
10
+ table = Redshifter::Table.new(table_config)
11
+ Redshifter::ExtractAndReplaceRedshiftTable.new(table).run
12
+ end
13
+
14
+ desc 'Update an extracted table in Redshift'
15
+ task :update, [:table_config_key] => :environment do |_task, args|
16
+ table_config = Redshifter.config.tables[args[:table_config_key]]
17
+ table = Redshifter::Table.new(table_config)
18
+ Redshifter::ExtractAndUpdateRedshiftTable.new(table).run
19
+ end
20
+ end
21
+ end
22
+ end
23
+ end
24
+
25
+ Redshifter::Tasks.new.install_tasks
@@ -0,0 +1,65 @@
1
+ module Redshifter
2
+ module Util
3
+ class CreateOrReplaceTable
4
+ def initialize(table:, manifest_url:)
5
+ @table = table
6
+ @manifest_url = manifest_url
7
+ end
8
+
9
+ def run(conn = Redshift.connect)
10
+ conn.transaction do |within_transaction|
11
+ [
12
+ drop_and_create_table_sql,
13
+ grant_readonly_permissions_sql
14
+ ].each do |query|
15
+ within_transaction.exec(query)
16
+ end
17
+ end
18
+ end
19
+
20
+ private
21
+
22
+ attr_reader :table, :manifest_url
23
+
24
+ # creates or replaces an existing table from an s3 manifest of gzipped pipe
25
+ # delimited files.
26
+ #
27
+ # COPY is the most efficient way to load data into redshift. When COPY is
28
+ # run on a new table with zero rows it automatically runs statistics
29
+ # (ANALYZE) and sorts and distributes data (VACUUM). Using the manifest
30
+ # allows COPY command to run in parallel.
31
+ #
32
+ #SQL Notes:
33
+ # CSV QUOTE AS '"' -- only way to define quote character for multiline
34
+ # -- column data (even though file is not comma separated)
35
+ # DELIMITER '|' -- this is the default separator, just being explicit
36
+ # TIMEFORMAT 'YYYY-MM-DD HH:MI:SS' -- being explicit instead of auto detect
37
+ def drop_and_create_table_sql
38
+ <<-QUERY.squish
39
+ DROP TABLE IF EXISTS #{table.redshift_schema}.#{table.redshift_table_name};
40
+
41
+ #{table.redshift_table_ddl}
42
+
43
+ COPY #{table.redshift_schema}.#{table.redshift_table_name}
44
+ (#{table.redshift_column_names.join(', ')})
45
+ FROM '#{manifest_url}'
46
+ CREDENTIALS 'aws_access_key_id=#{Redshifter.config.aws_access_key_id};aws_secret_access_key=#{Redshifter.config.aws_secret_access_key}'
47
+ CSV QUOTE AS '"'
48
+ DELIMITER '|'
49
+ GZIP
50
+ TIMEFORMAT 'YYYY-MM-DD HH:MI:SS'
51
+ NULL AS '#{ExtractAndTransformUpdates::NULL_CHARACTER}'
52
+ MANIFEST;
53
+ QUERY
54
+ end
55
+
56
+ def grant_readonly_permissions_sql
57
+ <<-QUERY.squish
58
+ GRANT SELECT, REFERENCES
59
+ ON TABLE #{table.redshift_schema}.#{table.redshift_table_name}
60
+ TO GROUP readonly;
61
+ QUERY
62
+ end
63
+ end
64
+ end
65
+ end
@@ -0,0 +1,81 @@
1
+ require 'csv'
2
+
3
require 'securerandom'

module Redshifter
  module Util
    # Extracts rows updated at or after `since` from the source table in
    # id-ordered batches, renders each batch as pipe delimited 'CSV' text and
    # uploads it (gzipped) through the given s3 utility.
    class ExtractAndTransformUpdates
      # Character used to represent a NULL value in the CSV. We cannot use an
      # empty string to represent NULL, because then any tables with empty string
      # values would have those converted to NULL in the Redshift tables. This
      # needs to be a character we do not expect to appear as a value in any of
      # the tables that are in the ETL process.
      NULL_CHARACTER = '∅'

      # table   - object responding to source_table_name, redshift_table_name
      #           and source_column_statements
      # since   - timestamp literal interpolated into the `updated_at >=` filter
      # s3_util - object responding to upload_file(file_name:, body:, gzip:)
      def initialize(table:, since:, s3_util:)
        @table = table
        @since = since
        @s3_util = s3_util
      end

      # Writes pipe delimited 'CSV' files to S3 of updated records, one file
      # per batch of at most batch_size rows.
      # Returns a list of the values returned by s3_util.upload_file (the
      # internal s3 URLs created); empty when no rows matched.
      def run(batch_size: 1000)
        run_name = SecureRandom.uuid
        uploaded_s3_urls = []

        transform_in_batches(table.source_column_statements,
                             batch_size: batch_size) do |rows, batch|
          uploaded_s3_urls << s3_util.upload_file(
            file_name: "#{table.redshift_table_name}_updates_#{run_name}_#{batch}.txt",
            body: rows.map { |row| csv_row(row) }.join,
            gzip: true
          )
        end

        uploaded_s3_urls
      end

      private

      attr_reader :table, :since, :s3_util

      # Renders one row as a pipe delimited line, substituting NULL_CHARACTER
      # for nil values. Builds a new array rather than mutating the row.
      def csv_row(row)
        values = row.map { |value| value.nil? ? NULL_CHARACTER : value }
        CSV.generate_line(values, col_sep: '|')
      end

      # Yields [rows, batch_index] for successive id-ordered batches until a
      # batch comes back empty or short. Accepts the column expressions either
      # as separate arguments or as a single array; both are flattened into one
      # select list.
      def transform_in_batches(*column_transforms, batch_size: 1000)
        # guarantee id is present in the first column for batch functionality
        select_columns = ['id as id_for_batching'] + column_transforms.flatten
        batch_start_id = 1
        batch_count = 0

        loop do
          result = ActiveRecord::Base.connection_pool.with_connection do |conn|
            conn.exec_query(select_batch_sql(columns: select_columns,
                                             batch_size: batch_size,
                                             start_id: batch_start_id))
          end
          rows = result.rows

          break if rows.empty?

          # get the id from the first column position where it was injected
          last_id = rows.last.first.to_i
          # remove the injected id from the first column
          rows = rows.map { |row| row[1..-1] }

          yield rows, batch_count

          # a short batch means the source has no more matching rows
          break if rows.size < batch_size

          batch_start_id = last_id + 1
          batch_count += 1
        end
      end

      # Builds the SELECT for one batch: rows with updated_at >= since and
      # id >= start_id, ordered by id, limited to batch_size.
      def select_batch_sql(columns:, batch_size:, start_id:)
        "select #{columns.join(', ')} from #{table.source_table_name} where updated_at >= '#{since}' AND id >= #{start_id} ORDER BY id ASC limit #{batch_size}"
      end
    end
  end
end
@@ -0,0 +1,18 @@
1
+ module Redshifter
2
+ module Util
3
+ module Redshift
4
+ def self.connect
5
+ PG.connect(
6
+ {
7
+ host: Redshifter.config.redshift_host,
8
+ port: Redshifter.config.redshift_port,
9
+ dbname: Redshifter.config.redshift_database,
10
+ user: Redshifter.config.redshift_username,
11
+ password: Redshifter.config.redshift_password,
12
+ sslmode: 'require'
13
+ }
14
+ )
15
+ end
16
+ end
17
+ end
18
+ end
@@ -0,0 +1,92 @@
1
+ require 'fog'
2
+ require 'tmpdir'
3
+
4
+ module Redshifter
5
+ module Util
6
+ class S3
7
+ def upload_file(file_name:, body:, gzip: false)
8
+ s3_url = ''
9
+
10
+ Dir.mktmpdir('redshifter', ensure_app_tmp_directory) do |temp_dir|
11
+ temp_file = write_temp_file(File.join(temp_dir, file_name), body, gzip)
12
+ File.open(temp_file) do |file|
13
+ s3_file = bucket.files.create(file_options(file, gzip))
14
+ s3_url = internal_file_url(s3_file.key)
15
+ end
16
+ end
17
+
18
+ s3_url
19
+ end
20
+
21
+ private
22
+
23
+ def internal_file_url(file_name)
24
+ "s3://#{bucket_name}/#{file_name}"
25
+ end
26
+
27
+ def conn
28
+ @conn ||= Fog::Storage.new(
29
+ provider: 'AWS',
30
+ aws_access_key_id: Redshifter.config.aws_access_key_id,
31
+ aws_secret_access_key: Redshifter.config.aws_secret_access_key
32
+ )
33
+ end
34
+
35
+ def bucket
36
+ @bucket ||= conn.directories.get(bucket_name)
37
+ end
38
+
39
+ def bucket_name
40
+ Redshifter.config.s3_bucket
41
+ end
42
+
43
+ def file_options(file, gzip)
44
+ s3_file_options = {
45
+ key: File.basename(file),
46
+ body: file,
47
+ public: false
48
+ }
49
+
50
+ s3_file_options.merge!(
51
+ content_encoding: 'ASCII-8BIT',
52
+ compression_mime_type: 'application/x-gzip'
53
+ ) if gzip
54
+
55
+ s3_file_options
56
+ end
57
+
58
+ def ensure_app_tmp_directory
59
+ tmp_path = Redshifter.config.temp_directory_path
60
+ Dir.mkdir(tmp_path) unless File.directory?(tmp_path)
61
+ tmp_path
62
+ end
63
+
64
+ def write_temp_file(file_path, body, gzip)
65
+ if gzip
66
+ write_temp_gzipped_file(file_path, body)
67
+ else
68
+ write_temp_uncompressed_file(file_path, body)
69
+ end
70
+ end
71
+
72
+ def write_temp_uncompressed_file(file_path, body)
73
+ File.open(file_path, 'w') do |file|
74
+ file << body
75
+ file
76
+ end
77
+ end
78
+
79
+ def write_temp_gzipped_file(file_path, body)
80
+ gz_file_path = file_path + '.gz'
81
+ File.open(gz_file_path, 'w', encoding: 'ASCII-8BIT') do |compressed_file|
82
+ compressed_file.sync = true
83
+ gzip = Zlib::GzipWriter.new(compressed_file)
84
+ gzip.write(body)
85
+ gzip.close # Important: Without this call, gzip headers won't be written
86
+ compressed_file
87
+ end
88
+ end
89
+
90
+ end
91
+ end
92
+ end
@@ -0,0 +1,36 @@
1
require 'json'

module Redshifter
  module Util
    # Builds a manifest listing a set of file URLs and uploads it as an
    # uncompressed JSON document through the given s3 utility.
    class S3ManifestWriter
      # file_name - name to upload the manifest under
      # file_urls - enumerable of URLs to list as manifest entries
      # s3_util   - object responding to upload_file(file_name:, body:, gzip:)
      def initialize(file_name:, file_urls:, s3_util:)
        @file_name = file_name
        @file_urls = file_urls
        @s3_util = s3_util
      end

      # Uploads a s3 manifest file that requires all files be processed with
      # mandatory: true attribute per file.
      # Returns whatever s3_util.upload_file returns (the internal s3 URL for
      # the manifest).
      def run
        s3_util.upload_file(
          file_name: file_name,
          body: JSON.generate(manifest),
          gzip: false
        )
      end

      private

      attr_reader :file_name, :file_urls, :s3_util

      # Top-level manifest document: { entries: [...] }.
      def manifest
        { entries: manifest_files }
      end

      # One manifest entry per file URL, each flagged as mandatory.
      def manifest_files
        file_urls.map { |child_url| { url: child_url, mandatory: true } }
      end
    end
  end
end
@@ -0,0 +1,65 @@
1
module Redshifter
  module Util
    # Validates a table export config hash: required keys, keys that become
    # required when another key has a given value, and keys restricted to a
    # fixed set of values.
    class TableConfigValidator
      REQUIRED_KEYS = [:source_table_name,
                       :redshift_table_name,
                       :redshift_columns,
                       :redshift_distribution_style,
                       :redshift_sort_keys].freeze

      # hash format: { required_key: { when_this_key_and_value: 'present'} }
      CONDITIONALLY_REQUIRED_KEYS = {
        redshift_distribution_key: { redshift_distribution_style: 'KEY' }
      }.freeze

      # Use a nil value when the key is optional, but has allowable values
      # when it is present. e.g. {optional_key: ['valid1', 'valid2', nil]}
      ALLOWABLE_VALUES = {
        redshift_distribution_style: ['KEY', 'ALL', 'EVEN'],
        redshift_sort_style: ['COMPOUND', 'INTERLEAVED', nil]
      }.freeze

      # config - the table config hash to check. May be nil (for example when
      #          a table key was not found in the configured tables); that is
      #          reported as invalid rather than raising NoMethodError.
      def initialize(config)
        @config = config
      end

      # Raises RuntimeError ('invalid table config') unless the config is valid.
      def validate!
        raise 'invalid table config' unless valid?
      end

      # Returns true when the config is hash-like and passes every check,
      # false otherwise.
      def valid?
        hash_like_config? &&
          all_required_keys_present? &&
          all_conditionally_required_keys_present? &&
          all_validated_values_allowed?
      end

      private

      attr_reader :config

      # A nil or otherwise non-hash config cannot answer the key lookups below.
      def hash_like_config?
        config.respond_to?(:keys)
      end

      def all_required_keys_present?
        REQUIRED_KEYS.all? { |key| config.keys.include?(key) }
      end

      # A conditionally required key must be present whenever every key/value
      # pair of its condition is present in the config.
      def all_conditionally_required_keys_present?
        CONDITIONALLY_REQUIRED_KEYS.all? do |required_key, condition|
          !condition_met?(condition) || config.include?(required_key)
        end
      end

      def condition_met?(condition)
        condition.all? { |key, value| config.include?(key) && config[key] == value }
      end

      # A missing key reads as nil, so optional keys list nil as allowable.
      def all_validated_values_allowed?
        ALLOWABLE_VALUES.all? { |key, allowed| allowed.include?(config[key]) }
      end
    end
  end
end
@@ -0,0 +1,88 @@
1
+ module Redshifter
2
+ module Util
3
+ class UpdateTable
4
+ def initialize(table:, manifest_url:)
5
+ @table = table
6
+ @manifest_url = manifest_url
7
+ end
8
+
9
+ def run(conn = Redshift.connect)
10
+ conn.transaction do |within_transaction|
11
+ [
12
+ create_and_load_temp_table_sql,
13
+ upsert_changes_sql,
14
+ cleanup_temp_table_sql,
15
+ analyze_updated_table_sql
16
+ ].each do |query|
17
+ within_transaction.exec(query)
18
+ end
19
+ end
20
+ end
21
+
22
+ private
23
+
24
+ attr_reader :table, :manifest_url
25
+
26
+ def analytics_temp_table
27
+ "#{table.redshift_table_name}_temp"
28
+ end
29
+
30
+ #creates a temp table from original table DDL because it's the
31
+ # most efficient way to get a table that most closely mimics the destination.
32
+ # http://docs.aws.amazon.com/redshift/latest/dg/performing-a-deep-copy.html
33
+ #
34
+ # Imports rows from an s3 manifest of gzipped pipe delimited files.
35
+ # Using the manifest allows COPY command to run in parallel.
36
+ #
37
+ #SQL Notes:
38
+ # CSV QUOTE AS '"' -- only way to define quote character for multiline
39
+ # -- column data (even though file is not comma separated)
40
+ # DELIMITER '|' -- this is the default separator, just being explicit
41
+ # TIMEFORMAT 'YYYY-MM-DD HH:MI:SS' -- being explicit instead of auto detect
42
+ # compupdate off -- save time by not compressing columns; it's temporary
43
+ # statupdate off -- save time by not running statistics; it's temporary
44
+ def create_and_load_temp_table_sql
45
+ <<-QUERY.squish
46
+ #{table.redshift_table_ddl(analytics_temp_table)}
47
+
48
+ COPY #{table.redshift_schema}.#{analytics_temp_table}
49
+ (#{table.redshift_column_names.join(', ')})
50
+ FROM '#{manifest_url}'
51
+ CREDENTIALS 'aws_access_key_id=#{Redshifter.config.aws_access_key_id};aws_secret_access_key=#{Redshifter.config.aws_secret_access_key}'
52
+ CSV QUOTE AS '"'
53
+ DELIMITER '|'
54
+ GZIP
55
+ TIMEFORMAT 'YYYY-MM-DD HH:MI:SS'
56
+ NULL AS '#{ExtractAndTransformUpdates::NULL_CHARACTER}'
57
+ MANIFEST
58
+ compupdate off
59
+ statupdate off;
60
+ QUERY
61
+ end
62
+
63
+ # Replaces existing rows with updated row in a single transaction using AWS
64
+ # recommended method
65
+ # http://docs.aws.amazon.com/redshift/latest/dg/merge-replacing-existing-rows.html
66
+ def upsert_changes_sql
67
+ <<-QUERY.squish
68
+ DELETE FROM #{table.redshift_schema}.#{table.redshift_table_name}
69
+ USING #{table.redshift_schema}.#{analytics_temp_table}
70
+ WHERE #{table.redshift_schema}.#{table.redshift_table_name}.id
71
+ = #{table.redshift_schema}.#{analytics_temp_table}.id;
72
+
73
+ INSERT INTO #{table.redshift_schema}.#{table.redshift_table_name}
74
+ SELECT * FROM #{table.redshift_schema}.#{analytics_temp_table};
75
+ QUERY
76
+ end
77
+
78
+ def cleanup_temp_table_sql
79
+ "drop table #{table.redshift_schema}.#{analytics_temp_table};"
80
+ end
81
+
82
+ # analyze recomputes table statistics for efficient querying after change
83
+ def analyze_updated_table_sql
84
+ "analyze #{table.redshift_schema}.#{table.redshift_table_name};"
85
+ end
86
+ end
87
+ end
88
+ end
@@ -0,0 +1,3 @@
1
+ module Redshifter
2
+ VERSION = "0.3.0"
3
+ end
@@ -0,0 +1,30 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'redshifter/version'
5
+
6
+ Gem::Specification.new do |spec|
7
+ spec.name = 'redshifter'
8
+ spec.version = Redshifter::VERSION
9
+ spec.authors = ['Justin Richard']
10
+ spec.email = ['justin@apartmentlist.com']
11
+
12
+ spec.summary = %q{ETL processing jobs to exporting Rails model tables to Redshift}
13
+ spec.homepage = 'https://github.com/apartmentlist/redshifter'
14
+
15
+ spec.files = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
16
+ spec.require_paths = ['lib']
17
+
18
+ spec.add_runtime_dependency 'dynosaur', '~> 0'
19
+ spec.add_runtime_dependency 'fog', '~> 1.36.0'
20
+ # mime-types now an explicit dependency of fog-core >=1.35.0
21
+ # fog 1.36.0 has a loose dependency on fog-core "~> 1.32" that causes this
22
+ # dependency change to bubble up to redshifter
23
+ spec.add_runtime_dependency 'mime-types'
24
+ spec.add_runtime_dependency 'pg', '~> 0.18'
25
+
26
+ spec.add_development_dependency 'bundler'
27
+ spec.add_development_dependency 'rake', '~> 10.0'
28
+ spec.add_development_dependency 'pry-byebug', '~> 3'
29
+ spec.add_development_dependency 'rspec', '~> 3.3'
30
+ end
metadata ADDED
@@ -0,0 +1,180 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: redshifter
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.3.0
5
+ platform: ruby
6
+ authors:
7
+ - Justin Richard
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-12-04 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: dynosaur
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: fog
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: 1.36.0
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: 1.36.0
41
+ - !ruby/object:Gem::Dependency
42
+ name: mime-types
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: pg
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '0.18'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '0.18'
69
+ - !ruby/object:Gem::Dependency
70
+ name: bundler
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: rake
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - "~>"
88
+ - !ruby/object:Gem::Version
89
+ version: '10.0'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
95
+ - !ruby/object:Gem::Version
96
+ version: '10.0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: pry-byebug
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - "~>"
102
+ - !ruby/object:Gem::Version
103
+ version: '3'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - "~>"
109
+ - !ruby/object:Gem::Version
110
+ version: '3'
111
+ - !ruby/object:Gem::Dependency
112
+ name: rspec
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - "~>"
116
+ - !ruby/object:Gem::Version
117
+ version: '3.3'
118
+ type: :development
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - "~>"
123
+ - !ruby/object:Gem::Version
124
+ version: '3.3'
125
+ description:
126
+ email:
127
+ - justin@apartmentlist.com
128
+ executables: []
129
+ extensions: []
130
+ extra_rdoc_files: []
131
+ files:
132
+ - ".gitignore"
133
+ - ".rspec"
134
+ - ".travis.yml"
135
+ - Gemfile
136
+ - LICENSE.txt
137
+ - README.md
138
+ - Rakefile
139
+ - bin/console
140
+ - bin/setup
141
+ - lib/redshifter.rb
142
+ - lib/redshifter/config.rb
143
+ - lib/redshifter/extract_and_replace_redshift_table.rb
144
+ - lib/redshifter/extract_and_update_redshift_table.rb
145
+ - lib/redshifter/job/update_redshift_table_job.rb
146
+ - lib/redshifter/table.rb
147
+ - lib/redshifter/tasks.rb
148
+ - lib/redshifter/util/create_or_replace_table.rb
149
+ - lib/redshifter/util/extract_and_transform_updates.rb
150
+ - lib/redshifter/util/redshift.rb
151
+ - lib/redshifter/util/s3.rb
152
+ - lib/redshifter/util/s3_manifest_writer.rb
153
+ - lib/redshifter/util/table_config_validator.rb
154
+ - lib/redshifter/util/update_table.rb
155
+ - lib/redshifter/version.rb
156
+ - redshifter.gemspec
157
+ homepage: https://github.com/apartmentlist/redshifter
158
+ licenses: []
159
+ metadata: {}
160
+ post_install_message:
161
+ rdoc_options: []
162
+ require_paths:
163
+ - lib
164
+ required_ruby_version: !ruby/object:Gem::Requirement
165
+ requirements:
166
+ - - ">="
167
+ - !ruby/object:Gem::Version
168
+ version: '0'
169
+ required_rubygems_version: !ruby/object:Gem::Requirement
170
+ requirements:
171
+ - - ">="
172
+ - !ruby/object:Gem::Version
173
+ version: '0'
174
+ requirements: []
175
+ rubyforge_project:
176
+ rubygems_version: 2.4.5
177
+ signing_key:
178
+ specification_version: 4
179
+ summary: ETL processing jobs for exporting Rails model tables to Redshift
180
+ test_files: []