redshifter 0.6.2 → 0.7.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +4 -0
- data/lib/redshifter.rb +1 -0
- data/lib/redshifter/extract_and_replace_redshift_table.rb +7 -6
- data/lib/redshifter/extract_and_update_redshift_table.rb +9 -7
- data/lib/redshifter/job/append_redshift_table_job.rb +15 -0
- data/lib/redshifter/table.rb +2 -2
- data/lib/redshifter/tasks.rb +12 -1
- data/lib/redshifter/util/extract_and_transform_updates.rb +4 -3
- data/lib/redshifter/util/redshift.rb +2 -0
- data/lib/redshifter/version.rb +1 -1
- metadata +4 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 99e068ccc5128f42b216208140cea7df8703f014
|
4
|
+
data.tar.gz: 93eb9194c02f2cc2800e54548052c4ab745226d7
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 60c06243a08e41617ec8851baed2e79eb1ba4f88127ea1403197043201054dad59c9e270dff03db1126d5414fb14417b1ac28a65b008e8fb7bd48f4a464da8ed
|
7
|
+
data.tar.gz: 726b925959807d2108435e583742fd01db36cb2c19998797be8e709a294f40135d8854f510a42b6a0a7a28e1f44756e307e583df5e966713d886ac6e2459f85d
|
data/README.md
CHANGED
@@ -81,6 +81,7 @@ Redshifter.config.tables = {
|
|
81
81
|
redshift_columns: {
|
82
82
|
'id' => 'INTEGER',
|
83
83
|
'title' => 'VARCHAR(128)',
|
84
|
+
'json_value' => 'BOOLEAN',
|
84
85
|
'published_at' => 'TIMESTAMP',
|
85
86
|
'updated_at' => 'TIMESTAMP',
|
86
87
|
'exported_at' => 'TIMESTAMP'
|
@@ -89,8 +90,11 @@ Redshifter.config.tables = {
|
|
89
90
|
# source DB. By default, redshift columns will be populated from source
|
90
91
|
# column with the same name. Column key must exist in redshift_columns.
|
91
92
|
# If a matching source column does not exist you MUST specify it here.
|
93
|
+
#
|
94
|
+
# Extract json values as text (using ->>), then cast non-strings to appropriate type
|
92
95
|
source_column_transforms: {
|
93
96
|
'title' => "lower(title)",
|
97
|
+
'json_value' => "cast(json_column->>'json_key' as BOOLEAN)",
|
94
98
|
'published_at' => 'first_edition_published_at',
|
95
99
|
'exported_at' => 'now()'
|
96
100
|
},
|
data/lib/redshifter.rb
CHANGED
@@ -12,5 +12,6 @@ require 'redshifter/util/update_table'
|
|
12
12
|
|
13
13
|
require 'redshifter/extract_and_replace_redshift_table'
|
14
14
|
require 'redshifter/extract_and_update_redshift_table'
|
15
|
+
require 'redshifter/job/append_redshift_table_job'
|
15
16
|
require 'redshifter/job/replace_redshift_table_job'
|
16
17
|
require 'redshifter/job/update_redshift_table_job'
|
@@ -1,17 +1,18 @@
|
|
1
1
|
module Redshifter
|
2
2
|
class ExtractAndReplaceRedshiftTable
|
3
|
-
def initialize(table, s3_util
|
3
|
+
def initialize(table, s3_util: Util::S3.new)
|
4
4
|
@table = table
|
5
5
|
@s3_util = s3_util
|
6
6
|
end
|
7
7
|
|
8
8
|
def run
|
9
9
|
Redshifter.config.logger.info "Extracting rows in batches ..."
|
10
|
-
extracted_s3_urls = Util::ExtractAndTransformUpdates
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
10
|
+
extracted_s3_urls = Util::ExtractAndTransformUpdates.new(
|
11
|
+
table: table,
|
12
|
+
since: Table::EPOCH_TIMESTAMP,
|
13
|
+
s3_util: s3_util,
|
14
|
+
timestamp_column: 'updated_at'
|
15
|
+
).run
|
15
16
|
|
16
17
|
Redshifter.config.logger.info "Extracted #{extracted_s3_urls.size} batches. "
|
17
18
|
|
@@ -1,17 +1,19 @@
|
|
1
1
|
module Redshifter
|
2
2
|
class ExtractAndUpdateRedshiftTable
|
3
|
-
def initialize(table, s3_util
|
3
|
+
def initialize(table, timestamp_column:, s3_util: Util::S3.new)
|
4
4
|
@table = table
|
5
5
|
@s3_util = s3_util
|
6
|
+
@timestamp_column = timestamp_column
|
6
7
|
end
|
7
8
|
|
8
9
|
def run
|
9
10
|
Redshifter.config.logger.info "Extracting rows in batches ..."
|
10
|
-
extracted_s3_urls = Util::ExtractAndTransformUpdates
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
11
|
+
extracted_s3_urls = Util::ExtractAndTransformUpdates.new(
|
12
|
+
table: table,
|
13
|
+
since: table.redshift_max_timestamp(timestamp_column),
|
14
|
+
s3_util: s3_util,
|
15
|
+
timestamp_column: timestamp_column
|
16
|
+
).run
|
15
17
|
|
16
18
|
if extracted_s3_urls.any?
|
17
19
|
Redshifter.config.logger.info "Writing manifest file to S3 ..."
|
@@ -33,6 +35,6 @@ module Redshifter
|
|
33
35
|
|
34
36
|
private
|
35
37
|
|
36
|
-
attr_reader :table, :
|
38
|
+
attr_reader :s3_util, :table, :timestamp_column
|
37
39
|
end
|
38
40
|
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
require 'dynosaur'
|
2
|
+
|
3
|
+
module Redshifter
|
4
|
+
module Job
|
5
|
+
class AppendRedshiftTableJob
|
6
|
+
@queue = :low
|
7
|
+
|
8
|
+
def self.perform(table_config_key)
|
9
|
+
dyno = Dynosaur::Process::Heroku
|
10
|
+
.new(task: 'redshifter:append', args: [table_config_key])
|
11
|
+
dyno.start
|
12
|
+
end
|
13
|
+
end
|
14
|
+
end
|
15
|
+
end
|
data/lib/redshifter/table.rb
CHANGED
@@ -48,7 +48,7 @@ module Redshifter
|
|
48
48
|
# returns unix epoch timestamp literal if table does not exist or table
|
49
49
|
# exists with zero rows. Otherwise returns timestamp literal of most
|
50
50
|
# recently updated row in the analytics table
|
51
|
-
def
|
51
|
+
def redshift_max_timestamp(timestamp_column)
|
52
52
|
conn = Util::Redshift.connect
|
53
53
|
|
54
54
|
table_presence_query = <<-QUERY.squish
|
@@ -67,7 +67,7 @@ module Redshifter
|
|
67
67
|
if table_present == 't'
|
68
68
|
conn.exec(
|
69
69
|
<<-QUERY.squish
|
70
|
-
SELECT COALESCE(MAX(
|
70
|
+
SELECT COALESCE(MAX(#{timestamp_column}), TIMESTAMP '#{EPOCH_TIMESTAMP}')
|
71
71
|
FROM #{redshift_schema}.#{redshift_table_name}
|
72
72
|
QUERY
|
73
73
|
).getvalue(0, 0)
|
data/lib/redshifter/tasks.rb
CHANGED
@@ -4,6 +4,15 @@ module Redshifter
|
|
4
4
|
|
5
5
|
def install_tasks
|
6
6
|
namespace :redshifter do
|
7
|
+
desc 'Append an extracted table to Redshift'
|
8
|
+
task :append, [:table_config_key] => :environment do |_task, args|
|
9
|
+
table_config = Redshifter.config.tables[args[:table_config_key]]
|
10
|
+
table = Redshifter::Table.new(table_config)
|
11
|
+
Redshifter::ExtractAndUpdateRedshiftTable
|
12
|
+
.new(table, timestamp_column: 'created_at')
|
13
|
+
.run
|
14
|
+
end
|
15
|
+
|
7
16
|
desc 'Create or replace an extracted table in Redshift'
|
8
17
|
task :replace, [:table_config_key] => :environment do |_task, args|
|
9
18
|
table_config = Redshifter.config.tables[args[:table_config_key]]
|
@@ -15,7 +24,9 @@ module Redshifter
|
|
15
24
|
task :update, [:table_config_key] => :environment do |_task, args|
|
16
25
|
table_config = Redshifter.config.tables[args[:table_config_key]]
|
17
26
|
table = Redshifter::Table.new(table_config)
|
18
|
-
Redshifter::ExtractAndUpdateRedshiftTable
|
27
|
+
Redshifter::ExtractAndUpdateRedshiftTable
|
28
|
+
.new(table, timestamp_column: 'updated_at')
|
29
|
+
.run
|
19
30
|
end
|
20
31
|
end
|
21
32
|
end
|
@@ -10,10 +10,11 @@ module Redshifter
|
|
10
10
|
# the tables that are in the ETL process.
|
11
11
|
NULL_CHARACTER = '∅'
|
12
12
|
|
13
|
-
def initialize(table:, since:, s3_util:)
|
13
|
+
def initialize(table:, since:, s3_util:, timestamp_column:)
|
14
14
|
@table = table
|
15
15
|
@since = since
|
16
16
|
@s3_util = s3_util
|
17
|
+
@timestamp_column = timestamp_column
|
17
18
|
end
|
18
19
|
|
19
20
|
# Writes pipe-delimited 'CSV' files of updated records to S3.
|
@@ -37,7 +38,7 @@ module Redshifter
|
|
37
38
|
|
38
39
|
private
|
39
40
|
|
40
|
-
attr_reader :table, :since, :s3_util
|
41
|
+
attr_reader :table, :since, :s3_util, :timestamp_column
|
41
42
|
|
42
43
|
def csv_row(row)
|
43
44
|
row.map! { |value| value.nil? ? NULL_CHARACTER : value }
|
@@ -76,7 +77,7 @@ module Redshifter
|
|
76
77
|
end
|
77
78
|
|
78
79
|
def select_batch_sql(columns:, batch_size:, start_id:)
|
79
|
-
"SELECT #{columns.join(', ')} FROM #{table.source_table_name} WHERE (#{table.source_table_filter}) AND
|
80
|
+
"SELECT #{columns.join(', ')} FROM #{table.source_table_name} WHERE (#{table.source_table_filter}) AND #{timestamp_column} >= '#{since}' AND id >= #{start_id} ORDER BY id ASC limit #{batch_size}"
|
80
81
|
end
|
81
82
|
end
|
82
83
|
end
|
data/lib/redshifter/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: redshifter
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.6.2
|
4
|
+
version: 0.7.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Justin Richard
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2017-06-16 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: dynosaur
|
@@ -142,6 +142,7 @@ files:
|
|
142
142
|
- lib/redshifter/config.rb
|
143
143
|
- lib/redshifter/extract_and_replace_redshift_table.rb
|
144
144
|
- lib/redshifter/extract_and_update_redshift_table.rb
|
145
|
+
- lib/redshifter/job/append_redshift_table_job.rb
|
145
146
|
- lib/redshifter/job/replace_redshift_table_job.rb
|
146
147
|
- lib/redshifter/job/update_redshift_table_job.rb
|
147
148
|
- lib/redshifter/table.rb
|
@@ -174,9 +175,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
174
175
|
version: '0'
|
175
176
|
requirements: []
|
176
177
|
rubyforge_project:
|
177
|
-
rubygems_version: 2.
|
178
|
+
rubygems_version: 2.6.10
|
178
179
|
signing_key:
|
179
180
|
specification_version: 4
|
180
181
|
summary: ETL processing jobs to exporting Rails model tables to Redshift
|
181
182
|
test_files: []
|
182
|
-
has_rdoc:
|