redshifter 0.6.2 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: eb2c60961160a7bb4203faa3798ca442128db1fe
4
- data.tar.gz: 60e54cd36a888621165e6544e8ea37b4c4d0b17a
3
+ metadata.gz: 99e068ccc5128f42b216208140cea7df8703f014
4
+ data.tar.gz: 93eb9194c02f2cc2800e54548052c4ab745226d7
5
5
  SHA512:
6
- metadata.gz: 41f10099834f83582c5f9e67af1a088902c1143b75768adecded997658b3a8843fb5aaa4a02d934d51dbdd0e4e21a4ffa9c7580282ce4af156601ec950c8e869
7
- data.tar.gz: e7e56defe7b36942aaa6e3ad9e27ed2eb2e3fc64cb3d25581656492b8b68f81399b101d526ceee641894d748c53144d0f2533e9f8176afeadb1704c33525804f
6
+ metadata.gz: 60c06243a08e41617ec8851baed2e79eb1ba4f88127ea1403197043201054dad59c9e270dff03db1126d5414fb14417b1ac28a65b008e8fb7bd48f4a464da8ed
7
+ data.tar.gz: 726b925959807d2108435e583742fd01db36cb2c19998797be8e709a294f40135d8854f510a42b6a0a7a28e1f44756e307e583df5e966713d886ac6e2459f85d
data/README.md CHANGED
@@ -81,6 +81,7 @@ Redshifter.config.tables = {
81
81
  redshift_columns: {
82
82
  'id' => 'INTEGER',
83
83
  'title' => 'VARCHAR(128)',
84
+ 'json_value' => 'BOOLEAN',
84
85
  'published_at' => 'TIMESTAMP',
85
86
  'updated_at' => 'TIMESTAMP',
86
87
  'exported_at' => 'TIMESTAMP'
@@ -89,8 +90,11 @@ Redshifter.config.tables = {
89
90
  # source DB. By default, redshift columns will be populated from source
90
91
  # column with the same name. Column key must exist in redshift_columns.
91
92
  # If a matching source column does not exist you MUST specify it here.
93
+ #
94
+ # Extract json values as text (using ->>), then cast non-strings to appropriate type
92
95
  source_column_transforms: {
93
96
  'title' => "lower(title)",
97
+ 'json_value' => "cast(json_column->>'json_key' as BOOLEAN)",
94
98
  'published_at' => 'first_edition_published_at',
95
99
  'exported_at' => 'now()'
96
100
  },
@@ -12,5 +12,6 @@ require 'redshifter/util/update_table'
12
12
 
13
13
  require 'redshifter/extract_and_replace_redshift_table'
14
14
  require 'redshifter/extract_and_update_redshift_table'
15
+ require 'redshifter/job/append_redshift_table_job'
15
16
  require 'redshifter/job/replace_redshift_table_job'
16
17
  require 'redshifter/job/update_redshift_table_job'
@@ -1,17 +1,18 @@
1
1
  module Redshifter
2
2
  class ExtractAndReplaceRedshiftTable
3
- def initialize(table, s3_util = Util::S3.new)
3
+ def initialize(table, s3_util: Util::S3.new)
4
4
  @table = table
5
5
  @s3_util = s3_util
6
6
  end
7
7
 
8
8
  def run
9
9
  Redshifter.config.logger.info "Extracting rows in batches ..."
10
- extracted_s3_urls = Util::ExtractAndTransformUpdates
11
- .new(table: table,
12
- since: Table::EPOCH_TIMESTAMP,
13
- s3_util: s3_util
14
- ).run
10
+ extracted_s3_urls = Util::ExtractAndTransformUpdates.new(
11
+ table: table,
12
+ since: Table::EPOCH_TIMESTAMP,
13
+ s3_util: s3_util,
14
+ timestamp_column: 'updated_at'
15
+ ).run
15
16
 
16
17
  Redshifter.config.logger.info "Extracted #{extracted_s3_urls.size} batches. "
17
18
 
@@ -1,17 +1,19 @@
1
1
  module Redshifter
2
2
  class ExtractAndUpdateRedshiftTable
3
- def initialize(table, s3_util = Util::S3.new)
3
+ def initialize(table, timestamp_column:, s3_util: Util::S3.new)
4
4
  @table = table
5
5
  @s3_util = s3_util
6
+ @timestamp_column = timestamp_column
6
7
  end
7
8
 
8
9
  def run
9
10
  Redshifter.config.logger.info "Extracting rows in batches ..."
10
- extracted_s3_urls = Util::ExtractAndTransformUpdates
11
- .new(table: table,
12
- since: table.redshift_last_update,
13
- s3_util: s3_util
14
- ).run
11
+ extracted_s3_urls = Util::ExtractAndTransformUpdates.new(
12
+ table: table,
13
+ since: table.redshift_max_timestamp(timestamp_column),
14
+ s3_util: s3_util,
15
+ timestamp_column: timestamp_column
16
+ ).run
15
17
 
16
18
  if extracted_s3_urls.any?
17
19
  Redshifter.config.logger.info "Writing manifest file to S3 ..."
@@ -33,6 +35,6 @@ module Redshifter
33
35
 
34
36
  private
35
37
 
36
- attr_reader :table, :s3_util
38
+ attr_reader :s3_util, :table, :timestamp_column
37
39
  end
38
40
  end
@@ -0,0 +1,15 @@
1
+ require 'dynosaur'
2
+
3
+ module Redshifter
4
+ module Job
5
+ class AppendRedshiftTableJob
6
+ @queue = :low
7
+
8
+ def self.perform(table_config_key)
9
+ dyno = Dynosaur::Process::Heroku
10
+ .new(task: 'redshifter:append', args: [table_config_key])
11
+ dyno.start
12
+ end
13
+ end
14
+ end
15
+ end
@@ -48,7 +48,7 @@ module Redshifter
48
48
  # returns unix epoch timestamp literal if table does not exist or table
49
49
  # exist with zero rows. Otherwise returns timestamp literal of most
50
50
  # recently updated row in the analytics table
51
- def redshift_last_update
51
+ def redshift_max_timestamp(timestamp_column)
52
52
  conn = Util::Redshift.connect
53
53
 
54
54
  table_presence_query = <<-QUERY.squish
@@ -67,7 +67,7 @@ module Redshifter
67
67
  if table_present == 't'
68
68
  conn.exec(
69
69
  <<-QUERY.squish
70
- SELECT COALESCE(MAX(updated_at), TIMESTAMP '#{EPOCH_TIMESTAMP}')
70
+ SELECT COALESCE(MAX(#{timestamp_column}), TIMESTAMP '#{EPOCH_TIMESTAMP}')
71
71
  FROM #{redshift_schema}.#{redshift_table_name}
72
72
  QUERY
73
73
  ).getvalue(0, 0)
@@ -4,6 +4,15 @@ module Redshifter
4
4
 
5
5
  def install_tasks
6
6
  namespace :redshifter do
7
+ desc 'Append an extracted table to Redshift'
8
+ task :append, [:table_config_key] => :environment do |_task, args|
9
+ table_config = Redshifter.config.tables[args[:table_config_key]]
10
+ table = Redshifter::Table.new(table_config)
11
+ Redshifter::ExtractAndUpdateRedshiftTable
12
+ .new(table, timestamp_column: 'created_at')
13
+ .run
14
+ end
15
+
7
16
  desc 'Create or replace an extracted table in Redshift'
8
17
  task :replace, [:table_config_key] => :environment do |_task, args|
9
18
  table_config = Redshifter.config.tables[args[:table_config_key]]
@@ -15,7 +24,9 @@ module Redshifter
15
24
  task :update, [:table_config_key] => :environment do |_task, args|
16
25
  table_config = Redshifter.config.tables[args[:table_config_key]]
17
26
  table = Redshifter::Table.new(table_config)
18
- Redshifter::ExtractAndUpdateRedshiftTable.new(table).run
27
+ Redshifter::ExtractAndUpdateRedshiftTable
28
+ .new(table, timestamp_column: 'updated_at')
29
+ .run
19
30
  end
20
31
  end
21
32
  end
@@ -10,10 +10,11 @@ module Redshifter
10
10
  # the tables that are in the ETL process.
11
11
  NULL_CHARACTER = '∅'
12
12
 
13
- def initialize(table:, since:, s3_util:)
13
+ def initialize(table:, since:, s3_util:, timestamp_column:)
14
14
  @table = table
15
15
  @since = since
16
16
  @s3_util = s3_util
17
+ @timestamp_column = timestamp_column
17
18
  end
18
19
 
19
20
  # Writes pipe delimited 'CSV' files to S3 of updated records.
@@ -37,7 +38,7 @@ module Redshifter
37
38
 
38
39
  private
39
40
 
40
- attr_reader :table, :since, :s3_util
41
+ attr_reader :table, :since, :s3_util, :timestamp_column
41
42
 
42
43
  def csv_row(row)
43
44
  row.map! { |value| value.nil? ? NULL_CHARACTER : value }
@@ -76,7 +77,7 @@ module Redshifter
76
77
  end
77
78
 
78
79
  def select_batch_sql(columns:, batch_size:, start_id:)
79
- "SELECT #{columns.join(', ')} FROM #{table.source_table_name} WHERE (#{table.source_table_filter}) AND updated_at >= '#{since}' AND id >= #{start_id} ORDER BY id ASC limit #{batch_size}"
80
+ "SELECT #{columns.join(', ')} FROM #{table.source_table_name} WHERE (#{table.source_table_filter}) AND #{timestamp_column} >= '#{since}' AND id >= #{start_id} ORDER BY id ASC limit #{batch_size}"
80
81
  end
81
82
  end
82
83
  end
@@ -1,3 +1,5 @@
1
+ require 'pg'
2
+
1
3
  module Redshifter
2
4
  module Util
3
5
  module Redshift
@@ -1,3 +1,3 @@
1
1
  module Redshifter
2
- VERSION = "0.6.2"
2
+ VERSION = "0.7.0"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: redshifter
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.6.2
4
+ version: 0.7.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Justin Richard
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-07-07 00:00:00.000000000 Z
11
+ date: 2017-06-16 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: dynosaur
@@ -142,6 +142,7 @@ files:
142
142
  - lib/redshifter/config.rb
143
143
  - lib/redshifter/extract_and_replace_redshift_table.rb
144
144
  - lib/redshifter/extract_and_update_redshift_table.rb
145
+ - lib/redshifter/job/append_redshift_table_job.rb
145
146
  - lib/redshifter/job/replace_redshift_table_job.rb
146
147
  - lib/redshifter/job/update_redshift_table_job.rb
147
148
  - lib/redshifter/table.rb
@@ -174,9 +175,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
174
175
  version: '0'
175
176
  requirements: []
176
177
  rubyforge_project:
177
- rubygems_version: 2.4.8
178
+ rubygems_version: 2.6.10
178
179
  signing_key:
179
180
  specification_version: 4
180
181
  summary: ETL processing jobs to exporting Rails model tables to Redshift
181
182
  test_files: []
182
- has_rdoc: