redshifter 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 8760ba2ad72e5e668d9f3c4eb6b70d04d86b15e9
4
+ data.tar.gz: 8eb1c3fa50e558f9cfac14ae1d4c606754d0bdf7
5
+ SHA512:
6
+ metadata.gz: d1e972e73eea10034797bf4713185a845e9f8f1c61227b1db480c355aeda7a232aa8c2caa50a2cdb3e95243ec94ece2a5acc35383cbb016f35bb6f63ba150099
7
+ data.tar.gz: 6ed7ad01c948ee874b558e0f13a1009249feb95b7365330557e3d5a6bd754426c47092fa3ff53788f5b92af4c53a2cdea0792ba82586428d3f46b4a97d33a685
data/.gitignore ADDED
@@ -0,0 +1,4 @@
1
+ .rvmrc
2
+ *.gem
3
+ Gemfile.lock
4
+ tmp
data/.rspec ADDED
@@ -0,0 +1,3 @@
1
+ --format documentation
2
+ --color
3
+ --require 'spec_helper'
data/.travis.yml ADDED
@@ -0,0 +1,4 @@
1
+ language: ruby
2
+ rvm:
3
+ - 2.2.1
4
+ - 2.2.3
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in redshifter.gemspec
4
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2015 Apartment List, Inc
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,147 @@
1
+ # Redshifter
2
+
3
+ Provides a resque job and rake tasks to facilitate ETL (Extract Transform Load) processing of Postgres tables for export to a Redshift cluster.
4
+
5
+ Specifically provides:
6
+ 1) Create/Replace job to replace all model data in Redshift
7
+ 2) Update job updates all records created or updated since the last update ran
8
+
9
+ Limitations:
10
+ * deleted records are NOT synced to Redshift by the update job
11
+
12
+ Feature Roadmap:
13
+ * store last runtime locally instead of using potentially costly redshift query
14
+
15
+ ## Versions
16
+ 0.2.4 - New config format; update and replace rake tasks available
17
+ 0.3.0 - Public version
18
+
19
+ ## Installation
20
+
21
+ Add this line to your application's Gemfile:
22
+
23
+ ```ruby
24
+ gem 'redshifter'
25
+ ```
26
+
27
+ And then execute:
28
+
29
+ $ bundle
30
+
31
+ Or install it yourself as:
32
+
33
+ $ gem install redshifter
34
+
35
+ ## Usage
36
+
37
+ ### Setup Redshifter in a Rails initializer
38
+ ```ruby
39
+ # config/initializers/redshifter.rb
40
+ Redshifter.setup do |config|
41
+ # path in your app available for writing temp files
42
+ config.temp_directory_path = File.expand_path('../../tmp', __FILE__)
43
+ # path to redshifter table config
44
+ config.table_config_path = File.expand_path('../../redshifter.rb', __FILE__)
45
+ # redshift user should have access to create tables in the specified schema
46
+ config.redshift_username = 'your_app_user'
47
+ config.redshift_password = 'p@ssw0rd'
48
+ config.redshift_host = 'app.host.without.protocol.com'
49
+ config.redshift_port = 5439
50
+ config.redshift_database = 'database_name'
51
+ config.redshift_schema = 'a_schema'
52
+ # AWS user should be allowed full access to the specified bucket
53
+ config.aws_access_key_id = '<AWS user access key ID>'
54
+ config.aws_secret_access_key = '<AWS user secret access key>'
55
+ config.s3_bucket = 'a_redshifter_bucket'
56
+ # Heroku user must be a member of the app and have privileges to
57
+ # start new dynos
58
+ config.heroku_api_key = '<Heroku user api key>'
59
+ config.heroku_app_name = 'name of the app on heroku'
60
+ end
61
+ ```
62
+
63
+ ### Require Redshifter tasks in your Rakefile
64
+ ```ruby
65
+ # Rakefile
66
+ # ...
67
+ require 'redshifter/tasks'
68
+ # ...
69
+ ```
70
+
71
+ ### Create a config file describing the tables, columns and transforms for export to redshift
72
+ ```ruby
73
+ # config/redshifter.rb
74
+ Redshifter.config.tables = {
75
+ 'books_with_export_at' => {
76
+ # [required] Source *table* name, not the Rails model name
77
+ source_table_name: 'books',
78
+ # [required] Prefixing your redshift table with its source is recommended
79
+ redshift_table_name: 'app_name_books',
80
+ # [required] Columns with Redshift datatypes to create; may differ from source DB
81
+ redshift_columns: {
82
+ 'id' => 'INTEGER',
83
+ 'title' => 'VARCHAR(128)',
84
+ 'published_at' => 'TIMESTAMP',
85
+ 'updated_at' => 'TIMESTAMP',
86
+ 'exported_at' => 'TIMESTAMP'
87
+ },
88
+ # [optional] SQL statements to transform or populate redshift columns from
89
+ # source DB. By default, redshift columns will be populated from source
90
+ # column with the same name. Column key must exist in redshift_columns.
91
+ # If a matching source column does not exist you MUST specify it here.
92
+ source_column_transforms: {
93
+ 'title' => "lower(title)",
94
+ 'published_at' => 'first_edition_published_at',
95
+ 'exported_at' => 'now()'
96
+ },
97
+ # [required] valid values: KEY, EVEN, ALL
98
+ redshift_distribution_style: 'KEY',
99
+ # [required, if redshift_distribution_style: 'KEY'] distribution key column
100
+ # name MUST be present in redshift_columns.keys
101
+ redshift_distribution_key: 'id',
102
+ # [optional] valid values: COMPOUND, INTERLEAVED; If omitted the Redshift
103
+ # table DDL statement will not specify the sort style and Redshift will
104
+ # implicitly default to COMPOUND style.
105
+ redshift_sort_style: 'INTERLEAVED',
106
+ # [required] Column names MUST be present in redshift_columns.keys; Max
107
+ # length of 8 when using INTERLEAVED sort style, and 400 when using
108
+ # COMPOUND sort style.
109
+ redshift_sort_keys: ['published_at'],
110
+ # [optional] Used for query planning in Redshift
111
+ redshift_primary_key: 'id'
112
+ }
113
+ }
114
+
115
+ ```
116
+
117
+ ### Run redshifter:replace rake task for each table you want to export
118
+ ```
119
+ $ rake redshifter:replace[books_with_export_at]
120
+ ```
121
+
122
+ ### Schedule a Redshifter::Job::UpdateRedshiftTableJob resque job for each table you want to export updates for
123
+
124
+ For example, add the following entry to `resque_schedule.yml` to run the update job for a table once a day at 10:00pm:
125
+ ```YAML
126
+ # config/resque_schedule.yml
127
+
128
+ etl_books_to_redshift:
129
+ cron: "0 22 * * *"
130
+ class: 'Redshifter::Job::UpdateRedshiftTableJob'
131
+ args: 'books_with_export_at'
132
+ description: 'Export the books table to Redshift'
133
+ ```
134
+
135
+ ## Development
136
+
137
+ After checking out the repo, run `bin/setup` to install dependencies. Then, run `bin/console` for an interactive prompt that will allow you to experiment.
138
+
139
+ To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release` to create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
140
+
141
+ ## Contributing
142
+
143
+ 1. Fork it ( https://github.com/apartmentlist/redshifter/fork )
144
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
145
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
146
+ 4. Push to the branch (`git push origin my-new-feature`)
147
+ 5. Create a new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,6 @@
1
+ require "bundler/gem_tasks"
2
+ require "rspec/core/rake_task"
3
+
4
+ RSpec::Core::RakeTask.new(:spec)
5
+
6
+ task default: :spec
data/bin/console ADDED
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "bundler/setup"
4
+ require "redshifter"
5
+
6
+ # You can add fixtures and/or initialization code here to make experimenting
7
+ # with your gem easier. You can also use a different console, if you like.
8
+
9
+ # (If you use this, don't forget to add pry to your Gemfile!)
10
+ # require "pry"
11
+ # Pry.start
12
+
13
+ require "irb"
14
+ IRB.start
data/bin/setup ADDED
@@ -0,0 +1,7 @@
1
+ #!/bin/bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+
5
+ bundle install
6
+
7
+ # Do any other automated setup that you need to do here
data/lib/redshifter.rb ADDED
@@ -0,0 +1,15 @@
1
+ require 'redshifter/version'
2
+ require 'redshifter/config'
3
+ require 'redshifter/table'
4
+
5
+ require 'redshifter/util/table_config_validator'
6
+ require 'redshifter/util/redshift'
7
+ require 'redshifter/util/s3'
8
+ require 'redshifter/util/extract_and_transform_updates'
9
+ require 'redshifter/util/s3_manifest_writer'
10
+ require 'redshifter/util/create_or_replace_table'
11
+ require 'redshifter/util/update_table'
12
+
13
+ require 'redshifter/extract_and_replace_redshift_table'
14
+ require 'redshifter/extract_and_update_redshift_table'
15
+ require 'redshifter/job/update_redshift_table_job'
@@ -0,0 +1,41 @@
1
# Top-level namespace of the gem. Holds the process-wide settings object and
# the one-time setup hook.
module Redshifter
  class << self
    # Configures the gem.
    #
    # Yields the settings object (when a block is given), then loads the table
    # definitions file named by `config.table_config_path`, then passes the
    # Heroku credentials on to Dynosaur.
    #
    # Raises ArgumentError when `table_config_path` has not been set.
    def setup
      yield config if block_given?
      load_export_table_definitions
      config_dynosaur
    end

    # Returns the memoized settings Struct. Every member starts out nil.
    def config
      @config ||= Struct.new(:tables,
                             :redshift_username,
                             :redshift_password,
                             :redshift_host,
                             :redshift_port,
                             :redshift_database,
                             :redshift_schema,
                             :aws_access_key_id,
                             :aws_secret_access_key,
                             :s3_bucket,
                             :heroku_api_key,
                             :heroku_app_name,
                             :temp_directory_path,
                             :table_config_path).new
    end

    private

    # Evaluates the table definitions file. Fails fast with a readable message
    # instead of the TypeError that `load nil` would raise.
    def load_export_table_definitions
      path = config.table_config_path
      if path.nil?
        raise ArgumentError,
              'Redshifter.config.table_config_path is not set; ' \
              'assign it in the Redshifter.setup block'
      end

      load path
    end

    # Copies the Heroku credentials into the Dynosaur client configuration.
    # dynosaur is required here rather than at file load time.
    def config_dynosaur
      require 'dynosaur'

      # The block parameter is deliberately not called `config`, so it cannot
      # be confused with Redshifter.config.
      Dynosaur::Client::HerokuClient.configure do |dynosaur_config|
        dynosaur_config.api_key = Redshifter.config.heroku_api_key
        dynosaur_config.app_name = Redshifter.config.heroku_app_name
      end
    end
  end
end
@@ -0,0 +1,33 @@
1
module Redshifter
  # Full refresh of one table: extracts every source row (everything updated
  # since the unix epoch) to S3, writes a manifest listing the uploaded files,
  # and has Redshift drop, recreate and reload the table from that manifest.
  class ExtractAndReplaceRedshiftTable
    # table   - the table description to export
    # s3_util - uploader used for the data files and the manifest
    def initialize(table, s3_util = Util::S3.new)
      @s3_util = s3_util
      @table = table
    end

    # Returns nil without touching Redshift when nothing was extracted,
    # otherwise the result of the table replacement.
    def run
      data_file_urls = extract_all_rows
      return unless data_file_urls.any?

      manifest_url = upload_manifest(data_file_urls)
      Util::CreateOrReplaceTable.new(table: table, manifest_url: manifest_url).run
    end

    private

    attr_reader :table, :s3_util

    # Uploads the whole source table and returns the created S3 URLs.
    def extract_all_rows
      extractor = Util::ExtractAndTransformUpdates.new(table: table,
                                                       since: Table::EPOCH_TIMESTAMP,
                                                       s3_util: s3_util)
      extractor.run
    end

    # Writes a uniquely named manifest for the given files; returns its URL.
    def upload_manifest(data_file_urls)
      writer = Util::S3ManifestWriter.new(file_name: "#{SecureRandom.uuid}.manifest",
                                          file_urls: data_file_urls,
                                          s3_util: s3_util)
      writer.run
    end
  end
end
@@ -0,0 +1,33 @@
1
module Redshifter
  # Incremental refresh of one table: extracts the source rows updated since
  # the newest row already in Redshift, uploads them to S3 with a manifest,
  # and merges them into the existing Redshift table.
  class ExtractAndUpdateRedshiftTable
    # table   - the table description to export
    # s3_util - uploader used for the data files and the manifest
    def initialize(table, s3_util = Util::S3.new)
      @s3_util = s3_util
      @table = table
    end

    # Returns nil without touching the Redshift table when no rows changed,
    # otherwise the result of the merge.
    def run
      changed_file_urls = extract_changed_rows
      return unless changed_file_urls.any?

      manifest_url = upload_manifest(changed_file_urls)
      Util::UpdateTable.new(table: table, manifest_url: manifest_url).run
    end

    private

    attr_reader :table, :s3_util

    # Uploads rows changed since the last Redshift update; returns S3 URLs.
    def extract_changed_rows
      extractor = Util::ExtractAndTransformUpdates.new(table: table,
                                                       since: table.redshift_last_update,
                                                       s3_util: s3_util)
      extractor.run
    end

    # Writes a uniquely named manifest for the given files; returns its URL.
    def upload_manifest(changed_file_urls)
      writer = Util::S3ManifestWriter.new(file_name: "#{SecureRandom.uuid}.manifest",
                                          file_urls: changed_file_urls,
                                          s3_util: s3_util)
      writer.run
    end
  end
end
@@ -0,0 +1,15 @@
1
+ require 'dynosaur'
2
+
3
module Redshifter
  module Job
    # Background job (queue :low) that does not run the export itself: it asks
    # Dynosaur to start a Heroku process running the `redshifter:update` rake
    # task for the given table.
    class UpdateRedshiftTableJob
      @queue = :low

      class << self
        # table_config_key - key of the table in Redshifter.config.tables,
        #                    passed through as the rake task argument.
        def perform(table_config_key)
          Dynosaur::Process::Heroku
            .new(task: 'redshifter:update', args: [table_config_key])
            .start
        end
      end
    end
  end
end
@@ -0,0 +1,109 @@
1
module Redshifter
  # Describes one exported table: where its rows come from, how each Redshift
  # column is computed from the source, and how the Redshift table is created.
  class Table
    # Timestamp literal used as "beginning of time" for full extracts and when
    # the Redshift table has no rows yet.
    EPOCH_TIMESTAMP = '1970-01-01 00:00:00'

    # config - one entry of Redshifter.config.tables (symbol keys).
    #
    # Raises when Util::TableConfigValidator rejects the config.
    def initialize(config)
      Util::TableConfigValidator.new(config).validate!

      @source_table_name = config[:source_table_name]
      @redshift_table_name = config[:redshift_table_name]
      @redshift_columns = config[:redshift_columns]
      # optional; default to "no transforms" so lookups below never hit nil
      @source_column_transforms = config[:source_column_transforms] || {}
      @redshift_distribution_style = config[:redshift_distribution_style]
      @redshift_distribution_key = config[:redshift_distribution_key]
      @redshift_sort_keys = config[:redshift_sort_keys]
      @redshift_sort_style = config[:redshift_sort_style]
      @redshift_primary_key = config[:redshift_primary_key]
    end

    attr_reader :source_table_name, :redshift_table_name

    # Returns the Redshift column names in configured order.
    def redshift_column_names
      redshift_columns.keys
    end

    # Returns one source SQL expression per Redshift column, in the same order
    # as redshift_column_names: the configured transform when there is one,
    # otherwise the column name itself. Always returns an Array.
    def source_column_statements
      redshift_column_names.map do |column|
        source_column_transforms[column] || column
      end
    end

    # Returns the Redshift schema shared by all exported tables.
    def redshift_schema
      Redshifter.config.redshift_schema
    end

    # Returns the single-line CREATE TABLE statement for this table.
    #
    # table_name - name to create; defaults to the configured Redshift table
    #              name (callers pass another name to build a staging copy).
    def redshift_table_ddl(table_name = redshift_table_name)
      column_definitions =
        redshift_columns.map { |name, type| "#{name} #{type}" }.join(', ')

      <<-QUERY.squish
        CREATE TABLE
          #{redshift_schema}.#{table_name}(
            #{column_definitions}#{primary_key_statement}
          )
          #{dist_statement}
          #{sortkey_statement};
      QUERY
    end

    # Returns the unix epoch timestamp literal if the Redshift table does not
    # exist or exists with zero rows. Otherwise returns the timestamp literal
    # of the most recently updated row in the Redshift table.
    #
    # The connection opened here is always closed before returning.
    def redshift_last_update
      conn = Util::Redshift.connect
      return EPOCH_TIMESTAMP unless redshift_table_present?(conn)

      conn.exec(<<-QUERY.squish).getvalue(0, 0)
        SELECT COALESCE(MAX(updated_at), TIMESTAMP '#{EPOCH_TIMESTAMP}')
        FROM #{redshift_schema}.#{redshift_table_name}
      QUERY
    ensure
      conn.close if conn
    end

    private

    attr_reader :redshift_columns,
                :source_column_transforms,
                :redshift_distribution_key,
                :redshift_sort_keys,
                :redshift_distribution_style,
                :redshift_sort_style,
                :redshift_primary_key

    # Checks the catalog for the Redshift table. This is a separate query
    # because Redshift does not allow pg catalog tables and user tables to be
    # accessed in the same query:
    # http://docs.aws.amazon.com/redshift/latest/dg/c_sql-functions-leader-node.html
    def redshift_table_present?(conn)
      table_presence_query = <<-QUERY.squish
        SELECT EXISTS(
          SELECT 1
          FROM information_schema.tables
          WHERE table_schema = '#{redshift_schema}'
          AND table_name = '#{redshift_table_name}')
      QUERY

      # the boolean result is compared against the text value 't'
      conn.exec(table_presence_query).getvalue(0, 0) == 't'
    end

    # DISTSTYLE clause; the DISTKEY is only emitted for the KEY style.
    def dist_statement
      statement = "DISTSTYLE #{redshift_distribution_style}"
      return statement unless redshift_distribution_style == 'KEY'

      "#{statement} DISTKEY(#{redshift_distribution_key})"
    end

    # SORTKEY clause, prefixed with the sort style when one is configured.
    def sortkey_statement
      sort_key = "SORTKEY(#{redshift_sort_keys.join(', ')})"
      redshift_sort_style ? "#{redshift_sort_style} #{sort_key}" : sort_key
    end

    # Trailing ", PRIMARY KEY(...)" column-list entry, or '' when unset.
    def primary_key_statement
      redshift_primary_key ? ", PRIMARY KEY(#{redshift_primary_key})" : ''
    end
  end
end
@@ -0,0 +1,25 @@
1
module Redshifter
  # Defines the `redshifter:replace` and `redshifter:update` rake tasks. Both
  # take the key of a table in Redshifter.config.tables as their argument and
  # depend on an `environment` task.
  class Tasks
    include Rake::DSL if defined? Rake::DSL

    # Registers both tasks under the `redshifter` namespace.
    def install_tasks
      namespace :redshifter do
        desc 'Create or replace an extracted table in Redshift'
        task :replace, [:table_config_key] => :environment do |_task, args|
          table = table_for(args[:table_config_key])
          Redshifter::ExtractAndReplaceRedshiftTable.new(table).run
        end

        desc 'Update an extracted table in Redshift'
        task :update, [:table_config_key] => :environment do |_task, args|
          table = table_for(args[:table_config_key])
          Redshifter::ExtractAndUpdateRedshiftTable.new(table).run
        end
      end
    end

    private

    # Builds the Table for a configured key.
    #
    # Raises ArgumentError naming the key when it is missing or unknown,
    # rather than handing nil to Table.new.
    def table_for(table_config_key)
      table_config = (Redshifter.config.tables || {})[table_config_key]
      if table_config.nil?
        raise ArgumentError,
              "unknown table config key #{table_config_key.inspect}; " \
              'expected a key of Redshifter.config.tables'
      end

      Redshifter::Table.new(table_config)
    end
  end
end
24
+
25
+ Redshifter::Tasks.new.install_tasks
@@ -0,0 +1,65 @@
1
module Redshifter
  module Util
    # Drops and recreates a Redshift table, reloads it from an S3 manifest and
    # grants read access, all inside one transaction.
    class CreateOrReplaceTable
      # table        - table description providing schema, name, DDL, columns
      # manifest_url - internal S3 URL of the manifest listing the data files
      def initialize(table:, manifest_url:)
        @manifest_url = manifest_url
        @table = table
      end

      # conn - connection to run on; a new Redshift connection by default.
      def run(conn = Redshift.connect)
        statements = [drop_and_create_table_sql, grant_readonly_permissions_sql]

        conn.transaction do |transaction|
          statements.each { |sql| transaction.exec(sql) }
        end
      end

      private

      attr_reader :table, :manifest_url

      # Schema-qualified name of the destination table.
      def qualified_table_name
        "#{table.redshift_schema}.#{table.redshift_table_name}"
      end

      # Creates or replaces an existing table from an S3 manifest of gzipped,
      # pipe-delimited files.
      #
      # COPY is the most efficient way to load data into Redshift. When COPY
      # is run on a new table with zero rows it automatically runs statistics
      # (ANALYZE) and sorts and distributes data (VACUUM). Using the manifest
      # allows the COPY command to run in parallel.
      #
      # SQL notes:
      #   CSV QUOTE AS '"' -- only way to define the quote character for
      #                    -- multiline column data (even though the file is
      #                    -- not comma separated)
      #   DELIMITER '|'    -- this is the default separator, just being explicit
      #   TIMEFORMAT 'YYYY-MM-DD HH:MI:SS' -- explicit instead of auto detect
      def drop_and_create_table_sql
        aws = Redshifter.config

        [
          "DROP TABLE IF EXISTS #{qualified_table_name};",
          table.redshift_table_ddl,
          "COPY #{qualified_table_name}",
          "(#{table.redshift_column_names.join(', ')})",
          "FROM '#{manifest_url}'",
          "CREDENTIALS 'aws_access_key_id=#{aws.aws_access_key_id};aws_secret_access_key=#{aws.aws_secret_access_key}'",
          %q(CSV QUOTE AS '"'),
          "DELIMITER '|'",
          'GZIP',
          "TIMEFORMAT 'YYYY-MM-DD HH:MI:SS'",
          "NULL AS '#{ExtractAndTransformUpdates::NULL_CHARACTER}'",
          'MANIFEST;'
        ].join(' ').squish
      end

      # Lets members of the `readonly` group query the recreated table.
      def grant_readonly_permissions_sql
        [
          'GRANT SELECT, REFERENCES',
          "ON TABLE #{qualified_table_name}",
          'TO GROUP readonly;'
        ].join(' ').squish
      end
    end
  end
end
@@ -0,0 +1,81 @@
1
+ require 'csv'
2
+
3
module Redshifter
  module Util
    # Reads the rows of a source table changed since a given timestamp, in
    # id-ordered batches, and uploads each batch to S3 as a gzipped,
    # pipe-delimited file.
    class ExtractAndTransformUpdates
      # Stand-in written to the files for NULL values. An empty string cannot
      # be used, because then genuine empty strings would be loaded as NULL in
      # Redshift. It has to be a character that is not expected to appear as a
      # value in any exported table.
      NULL_CHARACTER = '∅'

      # table   - table description (source table name, column statements)
      # since   - timestamp literal; only rows with updated_at >= since are read
      # s3_util - uploader responding to upload_file(file_name:, body:, gzip:)
      def initialize(table:, since:, s3_util:)
        @s3_util = s3_util
        @since = since
        @table = table
      end

      # Uploads one file per batch of changed rows.
      # Returns the list of internal S3 URLs that were created.
      def run(batch_size: 1000)
        run_id = SecureRandom.uuid
        s3_urls = []

        each_batch(table.source_column_statements, batch_size) do |batch_rows, batch_number|
          s3_urls << s3_util.upload_file(
            file_name: "#{table.redshift_table_name}_updates_#{run_id}_#{batch_number}.txt",
            body: batch_rows.map { |row| csv_line(row) }.join,
            gzip: true
          )
        end

        s3_urls
      end

      private

      attr_reader :table, :since, :s3_util

      # One pipe-delimited line for a row, with nils written as NULL_CHARACTER.
      def csv_line(row)
        values = row.map { |value| value.nil? ? NULL_CHARACTER : value }
        CSV.generate_line(values, col_sep: '|')
      end

      # Yields [rows, batch_number] for successive id ranges until a batch
      # comes back empty or short. Batching starts at id 1.
      def each_batch(column_statements, batch_size)
        # the id is selected first so the next batch knows where to resume
        select_columns = ['id as id_for_batching', *column_statements]
        next_id = 1
        batch_number = 0

        loop do
          fetched = fetch_rows(select_columns, batch_size, next_id)
          break if fetched.empty?

          resume_after = fetched.last[0].to_i
          # hand over the rows without the injected leading id column
          yield fetched.map { |row| row[1..-1] }, batch_number
          break if fetched.size < batch_size

          next_id = resume_after + 1
          batch_number += 1
        end
      end

      # Runs one batch query on a pooled ActiveRecord connection.
      def fetch_rows(select_columns, batch_size, start_id)
        sql = select_batch_sql(columns: select_columns,
                               batch_size: batch_size,
                               start_id: start_id)
        result = ActiveRecord::Base.connection_pool.with_connection do |conn|
          conn.exec_query(sql)
        end
        result.rows
      end

      def select_batch_sql(columns:, batch_size:, start_id:)
        "select #{columns.join(', ')} from #{table.source_table_name} where updated_at >= '#{since}' AND id >= #{start_id} ORDER BY id ASC limit #{batch_size}"
      end
    end
  end
end
@@ -0,0 +1,18 @@
1
module Redshifter
  module Util
    # Connection factory for the configured Redshift cluster.
    module Redshift
      class << self
        # Opens and returns a new PG connection built from Redshifter.config.
        # SSL is always required.
        def connect
          settings = Redshifter.config
          connection_options = {
            host: settings.redshift_host,
            port: settings.redshift_port,
            dbname: settings.redshift_database,
            user: settings.redshift_username,
            password: settings.redshift_password,
            sslmode: 'require'
          }

          PG.connect(connection_options)
        end
      end
    end
  end
end
@@ -0,0 +1,92 @@
1
+ require 'fog'
2
+ require 'tmpdir'
3
+
4
module Redshifter
  module Util
    # Uploads generated files to the configured S3 bucket. Each upload is
    # first written to a throwaway directory under the app's temp directory.
    #
    # NOTE(review): this class uses Zlib, but the visible requires for this
    # file are only 'fog' and 'tmpdir' -- presumably something else loads
    # zlib; confirm.
    class S3
      # Writes `body` to a temp file (gzipped when `gzip` is true, which also
      # appends ".gz" to the name), uploads it as a private object keyed by
      # the file's base name, and removes the temp directory again.
      #
      # Returns the internal "s3://bucket/key" URL of the uploaded object.
      def upload_file(file_name:, body:, gzip: false)
        Dir.mktmpdir('redshifter', ensure_app_tmp_directory) do |temp_dir|
          temp_path = write_temp_file(File.join(temp_dir, file_name), body, gzip)

          # binary mode so gzipped bytes are read back untouched
          File.open(temp_path, 'rb') do |file|
            s3_file = bucket.files.create(file_options(file, gzip))
            internal_file_url(s3_file.key)
          end
        end
      end

      private

      def internal_file_url(file_name)
        "s3://#{bucket_name}/#{file_name}"
      end

      # Memoized Fog storage connection using the configured AWS credentials.
      def conn
        @conn ||= Fog::Storage.new(
          provider: 'AWS',
          aws_access_key_id: Redshifter.config.aws_access_key_id,
          aws_secret_access_key: Redshifter.config.aws_secret_access_key
        )
      end

      # Memoized handle on the configured bucket.
      def bucket
        @bucket ||= conn.directories.get(bucket_name)
      end

      def bucket_name
        Redshifter.config.s3_bucket
      end

      # Options passed to the bucket's `files.create` for an open file.
      def file_options(file, gzip)
        s3_file_options = {
          key: File.basename(file.path),
          body: file,
          public: false
        }
        return s3_file_options unless gzip

        s3_file_options.merge(
          content_encoding: 'ASCII-8BIT',
          compression_mime_type: 'application/x-gzip'
        )
      end

      # Creates the app temp directory when missing; returns its path.
      def ensure_app_tmp_directory
        tmp_path = Redshifter.config.temp_directory_path
        Dir.mkdir(tmp_path) unless File.directory?(tmp_path)
        tmp_path
      end

      # Returns the path of the file that was written.
      def write_temp_file(file_path, body, gzip)
        if gzip
          write_temp_gzipped_file(file_path, body)
        else
          write_temp_uncompressed_file(file_path, body)
        end
      end

      def write_temp_uncompressed_file(file_path, body)
        File.open(file_path, 'w') { |file| file << body }
        file_path
      end

      # The block form closes the writer (flushing the gzip trailer) even when
      # writing raises.
      def write_temp_gzipped_file(file_path, body)
        gz_file_path = "#{file_path}.gz"
        Zlib::GzipWriter.open(gz_file_path) { |gz| gz.write(body) }
        gz_file_path
      end
    end
  end
end
@@ -0,0 +1,36 @@
1
module Redshifter
  module Util
    # Builds and uploads the manifest that tells Redshift which S3 files to
    # load.
    class S3ManifestWriter
      # file_name - name of the manifest object to create
      # file_urls - internal S3 URLs of the data files to list
      # s3_util   - uploader responding to upload_file(file_name:, body:, gzip:)
      def initialize(file_name:, file_urls:, s3_util:)
        @s3_util = s3_util
        @file_urls = file_urls
        @file_name = file_name
      end

      # Uploads the manifest uncompressed. Every listed file is marked
      # `mandatory: true`.
      # Returns the internal S3 URL of the manifest.
      def run
        s3_util.upload_file(file_name: file_name, body: manifest_json, gzip: false)
      end

      private

      attr_reader :file_name, :file_urls, :s3_util

      # JSON document of the form {"entries":[{"url":...,"mandatory":true}]}.
      def manifest_json
        entries = file_urls.map { |file_url| { url: file_url, mandatory: true } }
        JSON.generate(entries: entries)
      end
    end
  end
end
@@ -0,0 +1,65 @@
1
module Redshifter
  module Util
    # Checks one table config hash (symbol keys) before a Table is built from
    # it: required keys, conditionally required keys and allowed values.
    class TableConfigValidator
      REQUIRED_KEYS = [:source_table_name,
                       :redshift_table_name,
                       :redshift_columns,
                       :redshift_distribution_style,
                       :redshift_sort_keys].freeze

      # hash format: { required_key: { when_this_key_and_value: 'present'} }
      CONDITIONALLY_REQUIRED_KEYS = {
        redshift_distribution_key: { redshift_distribution_style: 'KEY' }
      }.freeze

      # Use a nil value when the key is optional, but has allowable values
      # when it is present. e.g. {optional_key: ['valid1', 'valid2', nil]}
      ALLOWABLE_VALUES = {
        redshift_distribution_style: ['KEY', 'ALL', 'EVEN'],
        redshift_sort_style: ['COMPOUND', 'INTERLEAVED', nil]
      }.freeze

      def initialize(config)
        @config = config
      end

      # Returns nil when the config is valid.
      #
      # Raises RuntimeError otherwise. The message starts with
      # "invalid table config" and lists every failed rule.
      def validate!
        problems = validation_errors
        return if problems.empty?

        raise "invalid table config: #{problems.join('; ')}"
      end

      # Returns true when every rule passes, false otherwise (including when
      # the config is nil or not hash-like).
      def valid?
        validation_errors.empty?
      end

      private

      attr_reader :config

      # Human-readable description of each failed rule; empty when valid.
      def validation_errors
        unless config.respond_to?(:keys)
          return ["expected a Hash of table settings, got #{config.inspect}"]
        end

        missing_required_key_errors +
          missing_conditional_key_errors +
          disallowed_value_errors
      end

      def missing_required_key_errors
        missing = REQUIRED_KEYS - config.keys
        missing.empty? ? [] : ["missing required keys: #{missing.join(', ')}"]
      end

      # A conditionally required key only has to be present when the config
      # contains the listed key with exactly the listed value.
      def missing_conditional_key_errors
        unmet = CONDITIONALLY_REQUIRED_KEYS.select do |required_key, condition|
          condition_met = condition.all? do |key, value|
            config.include?(key) && config[key] == value
          end
          condition_met && !config.include?(required_key)
        end

        unmet.map do |required_key, condition|
          key, value = condition.first
          "#{required_key} is required when #{key} is #{value.inspect}"
        end
      end

      def disallowed_value_errors
        rejected = ALLOWABLE_VALUES.reject do |key, allowed|
          allowed.include?(config[key])
        end

        rejected.map do |key, allowed|
          "#{key} must be one of #{allowed.compact.join(', ')} (got #{config[key].inspect})"
        end
      end
    end
  end
end
@@ -0,0 +1,88 @@
1
module Redshifter
  module Util
    # Merges changed rows from an S3 manifest into an existing Redshift table:
    # load into a staging table, replace matching rows by id, drop the staging
    # table and refresh statistics, all inside one transaction.
    class UpdateTable
      # table        - table description providing schema, name, DDL, columns
      # manifest_url - internal S3 URL of the manifest listing the data files
      def initialize(table:, manifest_url:)
        @manifest_url = manifest_url
        @table = table
      end

      # conn - connection to run on; a new Redshift connection by default.
      def run(conn = Redshift.connect)
        statements = [
          create_and_load_temp_table_sql,
          upsert_changes_sql,
          cleanup_temp_table_sql,
          analyze_updated_table_sql
        ]

        conn.transaction do |transaction|
          statements.each { |sql| transaction.exec(sql) }
        end
      end

      private

      attr_reader :table, :manifest_url

      # Unqualified name of the staging table.
      def analytics_temp_table
        "#{table.redshift_table_name}_temp"
      end

      # Schema-qualified name of the destination table.
      def qualified_target
        "#{table.redshift_schema}.#{table.redshift_table_name}"
      end

      # Schema-qualified name of the staging table.
      def qualified_staging
        "#{table.redshift_schema}.#{analytics_temp_table}"
      end

      # Creates the staging table from the destination table's own DDL because
      # that is the most efficient way to get a table that most closely mimics
      # the destination.
      # http://docs.aws.amazon.com/redshift/latest/dg/performing-a-deep-copy.html
      #
      # Imports rows from an S3 manifest of gzipped, pipe-delimited files.
      # Using the manifest allows the COPY command to run in parallel.
      #
      # SQL notes:
      #   CSV QUOTE AS '"' -- only way to define the quote character for
      #                    -- multiline column data (even though the file is
      #                    -- not comma separated)
      #   DELIMITER '|'    -- this is the default separator, just being explicit
      #   TIMEFORMAT 'YYYY-MM-DD HH:MI:SS' -- explicit instead of auto detect
      #   compupdate off   -- save time by not compressing columns; it's temporary
      #   statupdate off   -- save time by not running statistics; it's temporary
      def create_and_load_temp_table_sql
        aws = Redshifter.config

        [
          table.redshift_table_ddl(analytics_temp_table),
          "COPY #{qualified_staging}",
          "(#{table.redshift_column_names.join(', ')})",
          "FROM '#{manifest_url}'",
          "CREDENTIALS 'aws_access_key_id=#{aws.aws_access_key_id};aws_secret_access_key=#{aws.aws_secret_access_key}'",
          %q(CSV QUOTE AS '"'),
          "DELIMITER '|'",
          'GZIP',
          "TIMEFORMAT 'YYYY-MM-DD HH:MI:SS'",
          "NULL AS '#{ExtractAndTransformUpdates::NULL_CHARACTER}'",
          'MANIFEST',
          'compupdate off',
          'statupdate off;'
        ].join(' ').squish
      end

      # Replaces existing rows with their updated versions using the AWS
      # recommended delete-then-insert merge; rows are matched on `id`.
      # http://docs.aws.amazon.com/redshift/latest/dg/merge-replacing-existing-rows.html
      def upsert_changes_sql
        [
          "DELETE FROM #{qualified_target}",
          "USING #{qualified_staging}",
          "WHERE #{qualified_target}.id",
          "= #{qualified_staging}.id;",
          "INSERT INTO #{qualified_target}",
          "SELECT * FROM #{qualified_staging};"
        ].join(' ').squish
      end

      def cleanup_temp_table_sql
        "drop table #{qualified_staging};"
      end

      # analyze recomputes table statistics for efficient querying after change
      def analyze_updated_table_sql
        "analyze #{qualified_target};"
      end
    end
  end
end
@@ -0,0 +1,3 @@
1
# Gem namespace; this file only pins the release number.
module Redshifter
  # Version of the gem, read by the gemspec.
  VERSION = '0.3.0'
end
@@ -0,0 +1,30 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'redshifter/version'
5
+
6
# Gem metadata for redshifter. The version comes from
# lib/redshifter/version.rb, required above.
Gem::Specification.new do |spec|
  spec.name          = 'redshifter'
  spec.version       = Redshifter::VERSION
  spec.authors       = ['Justin Richard']
  spec.email         = ['justin@apartmentlist.com']

  spec.summary       = %q{ETL processing jobs to exporting Rails model tables to Redshift}
  spec.homepage      = 'https://github.com/apartmentlist/redshifter'
  # matches the MIT text shipped in LICENSE.txt
  spec.license       = 'MIT'

  # Package every git-tracked file except tests, specs and features.
  spec.files         = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
  spec.require_paths = ['lib']

  spec.add_runtime_dependency 'dynosaur', '~> 0'
  spec.add_runtime_dependency 'fog', '~> 1.36.0'
  # mime-types now an explicit dependency of fog-core >=1.35.0
  # fog 1.36.0 has a loose dependency on fog-core "~> 1.32" that causes this
  # dependency change to bubble up to redshifter
  spec.add_runtime_dependency 'mime-types'
  spec.add_runtime_dependency 'pg', '~> 0.18'

  spec.add_development_dependency 'bundler'
  spec.add_development_dependency 'rake', '~> 10.0'
  spec.add_development_dependency 'pry-byebug', '~> 3'
  spec.add_development_dependency 'rspec', '~> 3.3'
end
metadata ADDED
@@ -0,0 +1,180 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: redshifter
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.3.0
5
+ platform: ruby
6
+ authors:
7
+ - Justin Richard
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-12-04 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: dynosaur
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: fog
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: 1.36.0
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: 1.36.0
41
+ - !ruby/object:Gem::Dependency
42
+ name: mime-types
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: pg
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '0.18'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '0.18'
69
+ - !ruby/object:Gem::Dependency
70
+ name: bundler
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: rake
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - "~>"
88
+ - !ruby/object:Gem::Version
89
+ version: '10.0'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
95
+ - !ruby/object:Gem::Version
96
+ version: '10.0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: pry-byebug
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - "~>"
102
+ - !ruby/object:Gem::Version
103
+ version: '3'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - "~>"
109
+ - !ruby/object:Gem::Version
110
+ version: '3'
111
+ - !ruby/object:Gem::Dependency
112
+ name: rspec
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - "~>"
116
+ - !ruby/object:Gem::Version
117
+ version: '3.3'
118
+ type: :development
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - "~>"
123
+ - !ruby/object:Gem::Version
124
+ version: '3.3'
125
+ description:
126
+ email:
127
+ - justin@apartmentlist.com
128
+ executables: []
129
+ extensions: []
130
+ extra_rdoc_files: []
131
+ files:
132
+ - ".gitignore"
133
+ - ".rspec"
134
+ - ".travis.yml"
135
+ - Gemfile
136
+ - LICENSE.txt
137
+ - README.md
138
+ - Rakefile
139
+ - bin/console
140
+ - bin/setup
141
+ - lib/redshifter.rb
142
+ - lib/redshifter/config.rb
143
+ - lib/redshifter/extract_and_replace_redshift_table.rb
144
+ - lib/redshifter/extract_and_update_redshift_table.rb
145
+ - lib/redshifter/job/update_redshift_table_job.rb
146
+ - lib/redshifter/table.rb
147
+ - lib/redshifter/tasks.rb
148
+ - lib/redshifter/util/create_or_replace_table.rb
149
+ - lib/redshifter/util/extract_and_transform_updates.rb
150
+ - lib/redshifter/util/redshift.rb
151
+ - lib/redshifter/util/s3.rb
152
+ - lib/redshifter/util/s3_manifest_writer.rb
153
+ - lib/redshifter/util/table_config_validator.rb
154
+ - lib/redshifter/util/update_table.rb
155
+ - lib/redshifter/version.rb
156
+ - redshifter.gemspec
157
+ homepage: https://github.com/apartmentlist/redshifter
158
+ licenses: []
159
+ metadata: {}
160
+ post_install_message:
161
+ rdoc_options: []
162
+ require_paths:
163
+ - lib
164
+ required_ruby_version: !ruby/object:Gem::Requirement
165
+ requirements:
166
+ - - ">="
167
+ - !ruby/object:Gem::Version
168
+ version: '0'
169
+ required_rubygems_version: !ruby/object:Gem::Requirement
170
+ requirements:
171
+ - - ">="
172
+ - !ruby/object:Gem::Version
173
+ version: '0'
174
+ requirements: []
175
+ rubyforge_project:
176
+ rubygems_version: 2.4.5
177
+ signing_key:
178
+ specification_version: 4
179
+ summary: ETL processing jobs to exporting Rails model tables to Redshift
180
+ test_files: []