redshifter 0.6.2 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +4 -0
- data/lib/redshifter.rb +1 -0
- data/lib/redshifter/extract_and_replace_redshift_table.rb +7 -6
- data/lib/redshifter/extract_and_update_redshift_table.rb +9 -7
- data/lib/redshifter/job/append_redshift_table_job.rb +15 -0
- data/lib/redshifter/table.rb +2 -2
- data/lib/redshifter/tasks.rb +12 -1
- data/lib/redshifter/util/extract_and_transform_updates.rb +4 -3
- data/lib/redshifter/util/redshift.rb +2 -0
- data/lib/redshifter/version.rb +1 -1
- metadata +4 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 99e068ccc5128f42b216208140cea7df8703f014
+  data.tar.gz: 93eb9194c02f2cc2800e54548052c4ab745226d7
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 60c06243a08e41617ec8851baed2e79eb1ba4f88127ea1403197043201054dad59c9e270dff03db1126d5414fb14417b1ac28a65b008e8fb7bd48f4a464da8ed
+  data.tar.gz: 726b925959807d2108435e583742fd01db36cb2c19998797be8e709a294f40135d8854f510a42b6a0a7a28e1f44756e307e583df5e966713d886ac6e2459f85d
data/README.md
CHANGED
@@ -81,6 +81,7 @@ Redshifter.config.tables = {
     redshift_columns: {
       'id' => 'INTEGER',
       'title' => 'VARCHAR(128)',
+      'json_value' => 'BOOLEAN',
       'published_at' => 'TIMESTAMP',
       'updated_at' => 'TIMESTAMP',
       'exported_at' => 'TIMESTAMP'
@@ -89,8 +90,11 @@ Redshifter.config.tables = {
     # source DB. By default, redshift columns will be populated from source
     # column with the same name. Column key must exist in redshift_columns.
     # If a matching source column does not exist you MUST specify it here.
+    #
+    # Extract json values as text (using ->>), then cast non-strings to appropriate type
     source_column_transforms: {
       'title' => "lower(title)",
+      'json_value' => "cast(json_column->>'json_key' as BOOLEAN)",
       'published_at' => 'first_edition_published_at',
       'exported_at' => 'now()'
     },
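The comments above describe the fallback rule: a Redshift column is filled from the source column of the same name unless a transform is given. A minimal sketch of that rule (illustrative only, not the gem's internal code):

    # Each Redshift column uses its transform when one is defined, otherwise its own name.
    redshift_columns = {
      'id' => 'INTEGER', 'title' => 'VARCHAR(128)', 'json_value' => 'BOOLEAN',
      'published_at' => 'TIMESTAMP', 'updated_at' => 'TIMESTAMP', 'exported_at' => 'TIMESTAMP'
    }
    source_column_transforms = {
      'title' => "lower(title)",
      'json_value' => "cast(json_column->>'json_key' as BOOLEAN)",
      'published_at' => 'first_edition_published_at',
      'exported_at' => 'now()'
    }
    select_expressions = redshift_columns.keys.map do |column|
      source_column_transforms.fetch(column, column)
    end
    # => ["id", "lower(title)", "cast(json_column->>'json_key' as BOOLEAN)",
    #     "first_edition_published_at", "updated_at", "now()"]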
data/lib/redshifter.rb
CHANGED
@@ -12,5 +12,6 @@ require 'redshifter/util/update_table'
 
 require 'redshifter/extract_and_replace_redshift_table'
 require 'redshifter/extract_and_update_redshift_table'
+require 'redshifter/job/append_redshift_table_job'
 require 'redshifter/job/replace_redshift_table_job'
 require 'redshifter/job/update_redshift_table_job'

data/lib/redshifter/extract_and_replace_redshift_table.rb
CHANGED
@@ -1,17 +1,18 @@
 module Redshifter
   class ExtractAndReplaceRedshiftTable
-    def initialize(table, s3_util
+    def initialize(table, s3_util: Util::S3.new)
       @table = table
       @s3_util = s3_util
     end
 
     def run
       Redshifter.config.logger.info "Extracting rows in batches ..."
-      extracted_s3_urls = Util::ExtractAndTransformUpdates
-
-
-
-
+      extracted_s3_urls = Util::ExtractAndTransformUpdates.new(
+        table: table,
+        since: Table::EPOCH_TIMESTAMP,
+        s3_util: s3_util,
+        timestamp_column: 'updated_at'
+      ).run
 
       Redshifter.config.logger.info "Extracted #{extracted_s3_urls.size} batches. "
 
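A minimal usage sketch of the reworked constructor, following the table construction pattern the rake tasks in this diff use ('posts' is a placeholder config key):

    table = Redshifter::Table.new(Redshifter.config.tables['posts'])
    Redshifter::ExtractAndReplaceRedshiftTable.new(table).run  # s3_util defaults to Util::S3.new
    # Rebuilds the table in full: extraction starts at Table::EPOCH_TIMESTAMP, keyed on updated_at.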
data/lib/redshifter/extract_and_update_redshift_table.rb
CHANGED
@@ -1,17 +1,19 @@
 module Redshifter
   class ExtractAndUpdateRedshiftTable
-    def initialize(table, s3_util
+    def initialize(table, timestamp_column:, s3_util: Util::S3.new)
       @table = table
       @s3_util = s3_util
+      @timestamp_column = timestamp_column
     end
 
     def run
       Redshifter.config.logger.info "Extracting rows in batches ..."
-      extracted_s3_urls = Util::ExtractAndTransformUpdates
-
-
-
-
+      extracted_s3_urls = Util::ExtractAndTransformUpdates.new(
+        table: table,
+        since: table.redshift_max_timestamp(timestamp_column),
+        s3_util: s3_util,
+        timestamp_column: timestamp_column
+      ).run
 
       if extracted_s3_urls.any?
         Redshifter.config.logger.info "Writing manifest file to S3 ..."
@@ -33,6 +35,6 @@ module Redshifter
 
     private
 
-    attr_reader :table, :
+    attr_reader :s3_util, :table, :timestamp_column
   end
 end
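Making the timestamp column a required keyword is what lets this single class drive both the existing update flow and the new append flow. A usage sketch mirroring the rake tasks below ('posts' is a placeholder config key):

    table = Redshifter::Table.new(Redshifter.config.tables['posts'])
    # Update flow: re-extract rows whose updated_at is at or after the newest updated_at in Redshift
    Redshifter::ExtractAndUpdateRedshiftTable.new(table, timestamp_column: 'updated_at').run
    # Append flow: extract rows whose created_at is at or after the newest created_at in Redshift
    Redshifter::ExtractAndUpdateRedshiftTable.new(table, timestamp_column: 'created_at').run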
data/lib/redshifter/job/append_redshift_table_job.rb
ADDED
@@ -0,0 +1,15 @@
+require 'dynosaur'
+
+module Redshifter
+  module Job
+    class AppendRedshiftTableJob
+      @queue = :low
+
+      def self.perform(table_config_key)
+        dyno = Dynosaur::Process::Heroku
+          .new(task: 'redshifter:append', args: [table_config_key])
+        dyno.start
+      end
+    end
+  end
+end
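The `@queue`/`self.perform` shape follows the Resque job convention; assuming Resque is the queue backend (an assumption — the diff only shows the job class), enqueueing would look roughly like this:

    # Assumption: Resque as the job backend; 'posts' is a placeholder table config key.
    Resque.enqueue(Redshifter::Job::AppendRedshiftTableJob, 'posts')
    # perform then boots a one-off Heroku dyno via dynosaur to run the
    # redshifter:append rake task with that key.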
data/lib/redshifter/table.rb
CHANGED
@@ -48,7 +48,7 @@ module Redshifter
     # returns unix epoch timestamp literal if table does not exist or table
     # exist with zero rows. Otherwise returns timestamp literal of most
     # recently updated row in the analytics table
-    def
+    def redshift_max_timestamp(timestamp_column)
       conn = Util::Redshift.connect
 
       table_presence_query = <<-QUERY.squish
@@ -67,7 +67,7 @@ module Redshifter
       if table_present == 't'
         conn.exec(
           <<-QUERY.squish
-            SELECT COALESCE(MAX(
+            SELECT COALESCE(MAX(#{timestamp_column}), TIMESTAMP '#{EPOCH_TIMESTAMP}')
             FROM #{redshift_schema}.#{redshift_table_name}
           QUERY
         ).getvalue(0, 0)
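A quick sketch of the new signature in use (schema and table names are placeholders; the fallback is whatever `EPOCH_TIMESTAMP` is defined as):

    # For a table configured with redshift_schema 'analytics' and redshift_table_name
    # 'posts', the interpolated query reads roughly:
    #   SELECT COALESCE(MAX(updated_at), TIMESTAMP '<EPOCH_TIMESTAMP>') FROM analytics.posts
    table.redshift_max_timestamp('updated_at')
    # => newest updated_at in Redshift, or the epoch literal if the table is absent or empty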
data/lib/redshifter/tasks.rb
CHANGED
@@ -4,6 +4,15 @@ module Redshifter
 
     def install_tasks
       namespace :redshifter do
+        desc 'Append an extracted table to Redshift'
+        task :append, [:table_config_key] => :environment do |_task, args|
+          table_config = Redshifter.config.tables[args[:table_config_key]]
+          table = Redshifter::Table.new(table_config)
+          Redshifter::ExtractAndUpdateRedshiftTable
+            .new(table, timestamp_column: 'created_at')
+            .run
+        end
+
         desc 'Create or replace an extracted table in Redshift'
         task :replace, [:table_config_key] => :environment do |_task, args|
           table_config = Redshifter.config.tables[args[:table_config_key]]
@@ -15,7 +24,9 @@ module Redshifter
         task :update, [:table_config_key] => :environment do |_task, args|
           table_config = Redshifter.config.tables[args[:table_config_key]]
           table = Redshifter::Table.new(table_config)
-          Redshifter::ExtractAndUpdateRedshiftTable
+          Redshifter::ExtractAndUpdateRedshiftTable
+            .new(table, timestamp_column: 'updated_at')
+            .run
         end
       end
     end
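With the new task in place, the three entry points line up as follows ('posts' is a placeholder table config key):

    # bundle exec rake redshifter:append[posts]    # incremental, keyed on created_at
    # bundle exec rake redshifter:update[posts]    # incremental, keyed on updated_at
    # bundle exec rake redshifter:replace[posts]   # create or replace the whole table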
data/lib/redshifter/util/extract_and_transform_updates.rb
CHANGED
@@ -10,10 +10,11 @@ module Redshifter
       # the tables that are in the ETL process.
       NULL_CHARACTER = '∅'
 
-      def initialize(table:, since:, s3_util:)
+      def initialize(table:, since:, s3_util:, timestamp_column:)
         @table = table
         @since = since
         @s3_util = s3_util
+        @timestamp_column = timestamp_column
       end
 
       # Writes pipe delimited 'CSV' files to S3 of updated records.
@@ -37,7 +38,7 @@
 
       private
 
-      attr_reader :table, :since, :s3_util
+      attr_reader :table, :since, :s3_util, :timestamp_column
 
       def csv_row(row)
         row.map! { |value| value.nil? ? NULL_CHARACTER : value }
@@ -76,7 +77,7 @@
       end
 
       def select_batch_sql(columns:, batch_size:, start_id:)
-        "SELECT #{columns.join(', ')} FROM #{table.source_table_name} WHERE (#{table.source_table_filter}) AND
+        "SELECT #{columns.join(', ')} FROM #{table.source_table_name} WHERE (#{table.source_table_filter}) AND #{timestamp_column} >= '#{since}' AND id >= #{start_id} ORDER BY id ASC limit #{batch_size}"
       end
     end
   end
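With illustrative values substituted (a sketch — the table name, filter, and literal values are placeholders, not taken from the gem), the extended `select_batch_sql` yields a query shaped like this:

    select_batch_sql(columns: ['id', 'lower(title)'], batch_size: 10_000, start_id: 0)
    # => "SELECT id, lower(title) FROM posts WHERE (published_at IS NOT NULL)
    #     AND updated_at >= '2017-01-01 00:00:00' AND id >= 0 ORDER BY id ASC limit 10000"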
data/lib/redshifter/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: redshifter
 version: !ruby/object:Gem::Version
-  version: 0.6.2
+  version: 0.7.0
 platform: ruby
 authors:
 - Justin Richard
 autorequire:
 bindir: bin
 cert_chain: []
-date:
+date: 2017-06-16 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: dynosaur
@@ -142,6 +142,7 @@ files:
 - lib/redshifter/config.rb
 - lib/redshifter/extract_and_replace_redshift_table.rb
 - lib/redshifter/extract_and_update_redshift_table.rb
+- lib/redshifter/job/append_redshift_table_job.rb
 - lib/redshifter/job/replace_redshift_table_job.rb
 - lib/redshifter/job/update_redshift_table_job.rb
 - lib/redshifter/table.rb
@@ -174,9 +175,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
       version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 2.
+rubygems_version: 2.6.10
 signing_key:
 specification_version: 4
 summary: ETL processing jobs to exporting Rails model tables to Redshift
 test_files: []
-has_rdoc: