redshifter 0.6.2 → 0.7.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: eb2c60961160a7bb4203faa3798ca442128db1fe
-  data.tar.gz: 60e54cd36a888621165e6544e8ea37b4c4d0b17a
+  metadata.gz: 99e068ccc5128f42b216208140cea7df8703f014
+  data.tar.gz: 93eb9194c02f2cc2800e54548052c4ab745226d7
 SHA512:
-  metadata.gz: 41f10099834f83582c5f9e67af1a088902c1143b75768adecded997658b3a8843fb5aaa4a02d934d51dbdd0e4e21a4ffa9c7580282ce4af156601ec950c8e869
-  data.tar.gz: e7e56defe7b36942aaa6e3ad9e27ed2eb2e3fc64cb3d25581656492b8b68f81399b101d526ceee641894d748c53144d0f2533e9f8176afeadb1704c33525804f
+  metadata.gz: 60c06243a08e41617ec8851baed2e79eb1ba4f88127ea1403197043201054dad59c9e270dff03db1126d5414fb14417b1ac28a65b008e8fb7bd48f4a464da8ed
+  data.tar.gz: 726b925959807d2108435e583742fd01db36cb2c19998797be8e709a294f40135d8854f510a42b6a0a7a28e1f44756e307e583df5e966713d886ac6e2459f85d
data/README.md CHANGED
@@ -81,6 +81,7 @@ Redshifter.config.tables = {
     redshift_columns: {
       'id' => 'INTEGER',
       'title' => 'VARCHAR(128)',
+      'json_value' => 'BOOLEAN',
       'published_at' => 'TIMESTAMP',
       'updated_at' => 'TIMESTAMP',
       'exported_at' => 'TIMESTAMP'
@@ -89,8 +90,11 @@ Redshifter.config.tables = {
     # source DB. By default, redshift columns will be populated from source
     # column with the same name. Column key must exist in redshift_columns.
     # If a matching source column does not exist you MUST specify it here.
+    #
+    # Extract json values as text (using ->>), then cast non-strings to appropriate type
     source_column_transforms: {
       'title' => "lower(title)",
+      'json_value' => "cast(json_column->>'json_key' as BOOLEAN)",
       'published_at' => 'first_edition_published_at',
       'exported_at' => 'now()'
     },
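The new json_value entries above pair a BOOLEAN Redshift column with a transform that pulls the value out of a Postgres JSON column. A minimal sketch of how that pairing might sit inside one table entry — the 'posts' key and the json_column/json_key names are illustrative placeholders, not part of the gem:

# Sketch only: 'posts', json_column and json_key are hypothetical names; only
# the redshift_columns / source_column_transforms keys come from the README.
Redshifter.config.tables = {
  'posts' => {
    redshift_columns: {
      'id'         => 'INTEGER',
      # the JSON value gets a plain typed column on the Redshift side ...
      'json_value' => 'BOOLEAN'
    },
    source_column_transforms: {
      # ... and is extracted as text with ->> and cast during extraction
      'json_value' => "cast(json_column->>'json_key' as BOOLEAN)"
    }
  }
}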
@@ -12,5 +12,6 @@ require 'redshifter/util/update_table'
 
 require 'redshifter/extract_and_replace_redshift_table'
 require 'redshifter/extract_and_update_redshift_table'
+require 'redshifter/job/append_redshift_table_job'
 require 'redshifter/job/replace_redshift_table_job'
 require 'redshifter/job/update_redshift_table_job'
lib/redshifter/extract_and_replace_redshift_table.rb CHANGED
@@ -1,17 +1,18 @@
 module Redshifter
   class ExtractAndReplaceRedshiftTable
-    def initialize(table, s3_util = Util::S3.new)
+    def initialize(table, s3_util: Util::S3.new)
       @table = table
       @s3_util = s3_util
     end
 
     def run
       Redshifter.config.logger.info "Extracting rows in batches ..."
-      extracted_s3_urls = Util::ExtractAndTransformUpdates
-        .new(table: table,
-             since: Table::EPOCH_TIMESTAMP,
-             s3_util: s3_util
-        ).run
+      extracted_s3_urls = Util::ExtractAndTransformUpdates.new(
+        table: table,
+        since: Table::EPOCH_TIMESTAMP,
+        s3_util: s3_util,
+        timestamp_column: 'updated_at'
+      ).run
 
       Redshifter.config.logger.info "Extracted #{extracted_s3_urls.size} batches. "
 
lib/redshifter/extract_and_update_redshift_table.rb CHANGED
@@ -1,17 +1,19 @@
 module Redshifter
   class ExtractAndUpdateRedshiftTable
-    def initialize(table, s3_util = Util::S3.new)
+    def initialize(table, timestamp_column:, s3_util: Util::S3.new)
       @table = table
       @s3_util = s3_util
+      @timestamp_column = timestamp_column
     end
 
     def run
       Redshifter.config.logger.info "Extracting rows in batches ..."
-      extracted_s3_urls = Util::ExtractAndTransformUpdates
-        .new(table: table,
-             since: table.redshift_last_update,
-             s3_util: s3_util
-        ).run
+      extracted_s3_urls = Util::ExtractAndTransformUpdates.new(
+        table: table,
+        since: table.redshift_max_timestamp(timestamp_column),
+        s3_util: s3_util,
+        timestamp_column: timestamp_column
+      ).run
 
       if extracted_s3_urls.any?
         Redshifter.config.logger.info "Writing manifest file to S3 ..."
@@ -33,6 +35,6 @@ module Redshifter
 
     private
 
-    attr_reader :table, :s3_util
+    attr_reader :s3_util, :table, :timestamp_column
   end
 end
lib/redshifter/job/append_redshift_table_job.rb ADDED
@@ -0,0 +1,15 @@
+require 'dynosaur'
+
+module Redshifter
+  module Job
+    class AppendRedshiftTableJob
+      @queue = :low
+
+      def self.perform(table_config_key)
+        dyno = Dynosaur::Process::Heroku
+          .new(task: 'redshifter:append', args: [table_config_key])
+        dyno.start
+      end
+    end
+  end
+end
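The @queue = :low instance variable follows the Resque job convention, so this new job would presumably be enqueued through Resque like the existing replace/update jobs; a hedged usage sketch (Resque and the 'posts' key are assumptions, not shown in this diff):

require 'resque'

# Assumes Resque (suggested by @queue = :low) and a hypothetical 'posts'
# table config key.
Resque.enqueue(Redshifter::Job::AppendRedshiftTableJob, 'posts')
# A worker then runs AppendRedshiftTableJob.perform('posts'), which boots a
# one-off Heroku dyno via dynosaur to run the redshifter:append task with
# that config key.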
lib/redshifter/table.rb CHANGED
@@ -48,7 +48,7 @@ module Redshifter
     # returns unix epoch timestamp literal if table does not exist or table
     # exist with zero rows. Otherwise returns timestamp literal of most
     # recently updated row in the analytics table
-    def redshift_last_update
+    def redshift_max_timestamp(timestamp_column)
       conn = Util::Redshift.connect
 
       table_presence_query = <<-QUERY.squish
@@ -67,7 +67,7 @@ module Redshifter
       if table_present == 't'
         conn.exec(
           <<-QUERY.squish
-            SELECT COALESCE(MAX(updated_at), TIMESTAMP '#{EPOCH_TIMESTAMP}')
+            SELECT COALESCE(MAX(#{timestamp_column}), TIMESTAMP '#{EPOCH_TIMESTAMP}')
             FROM #{redshift_schema}.#{redshift_table_name}
           QUERY
         ).getvalue(0, 0)
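With the column now parameterised, the watermark query issued by redshift_max_timestamp depends on which timestamp the caller passes in. Roughly, under assumed schema and table names:

# Sketch: 'posts' and the analytics schema are placeholder names.
table = Redshifter::Table.new(Redshifter.config.tables['posts'])

# With timestamp_column = 'created_at' the interpolated query becomes roughly:
#   SELECT COALESCE(MAX(created_at), TIMESTAMP '<EPOCH_TIMESTAMP>')
#   FROM analytics.posts
# so the epoch literal is returned only when the table is missing or empty.
table.redshift_max_timestamp('created_at')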
@@ -4,6 +4,15 @@ module Redshifter
 
     def install_tasks
       namespace :redshifter do
+        desc 'Append an extracted table to Redshift'
+        task :append, [:table_config_key] => :environment do |_task, args|
+          table_config = Redshifter.config.tables[args[:table_config_key]]
+          table = Redshifter::Table.new(table_config)
+          Redshifter::ExtractAndUpdateRedshiftTable
+            .new(table, timestamp_column: 'created_at')
+            .run
+        end
+
         desc 'Create or replace an extracted table in Redshift'
         task :replace, [:table_config_key] => :environment do |_task, args|
           table_config = Redshifter.config.tables[args[:table_config_key]]
@@ -15,7 +24,9 @@ module Redshifter
       task :update, [:table_config_key] => :environment do |_task, args|
         table_config = Redshifter.config.tables[args[:table_config_key]]
         table = Redshifter::Table.new(table_config)
-        Redshifter::ExtractAndUpdateRedshiftTable.new(table).run
+        Redshifter::ExtractAndUpdateRedshiftTable
+          .new(table, timestamp_column: 'updated_at')
+          .run
       end
     end
   end
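Assuming the tasks have been installed via install_tasks in the host app's Rakefile (they depend on the :environment task, so a Rails-style app is implied), the three tasks could be driven like this; 'posts' is a placeholder config key, and from a shell the equivalent is e.g. rake "redshifter:append[posts]":

# Assumes the redshifter:* tasks are already defined via install_tasks;
# 'posts' is a hypothetical table config key.
Rake::Task['redshifter:append'].invoke('posts')   # incremental load keyed on created_at
Rake::Task['redshifter:update'].invoke('posts')   # incremental load keyed on updated_at
Rake::Task['redshifter:replace'].invoke('posts')  # full extract-and-replace of the table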
@@ -10,10 +10,11 @@ module Redshifter
       # the tables that are in the ETL process.
       NULL_CHARACTER = '∅'
 
-      def initialize(table:, since:, s3_util:)
+      def initialize(table:, since:, s3_util:, timestamp_column:)
         @table = table
         @since = since
         @s3_util = s3_util
+        @timestamp_column = timestamp_column
       end
 
       # Writes pipe delimited 'CSV' files to S3 of updated records.
@@ -37,7 +38,7 @@ module Redshifter
 
       private
 
-      attr_reader :table, :since, :s3_util
+      attr_reader :table, :since, :s3_util, :timestamp_column
 
       def csv_row(row)
         row.map! { |value| value.nil? ? NULL_CHARACTER : value }
@@ -76,7 +77,7 @@ module Redshifter
       end
 
       def select_batch_sql(columns:, batch_size:, start_id:)
-        "SELECT #{columns.join(', ')} FROM #{table.source_table_name} WHERE (#{table.source_table_filter}) AND updated_at >= '#{since}' AND id >= #{start_id} ORDER BY id ASC limit #{batch_size}"
+        "SELECT #{columns.join(', ')} FROM #{table.source_table_name} WHERE (#{table.source_table_filter}) AND #{timestamp_column} >= '#{since}' AND id >= #{start_id} ORDER BY id ASC limit #{batch_size}"
       end
     end
   end
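For reference, a hedged illustration of how select_batch_sql assembles its string now that the timestamp column is injectable; every concrete value below is a placeholder chosen for the example:

# Illustrative only: placeholder values showing how the pieces interpolate.
timestamp_column = 'created_at'
since            = '2017-01-01 00:00:00'
start_id         = 0
batch_size       = 1000
columns          = %w[id title json_column created_at]

sql = "SELECT #{columns.join(', ')} FROM posts WHERE (published = true) " \
      "AND #{timestamp_column} >= '#{since}' AND id >= #{start_id} " \
      "ORDER BY id ASC limit #{batch_size}"
# => "SELECT id, title, json_column, created_at FROM posts WHERE (published = true)
#     AND created_at >= '2017-01-01 00:00:00' AND id >= 0 ORDER BY id ASC limit 1000"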
@@ -1,3 +1,5 @@
+require 'pg'
+
 module Redshifter
   module Util
     module Redshift
@@ -1,3 +1,3 @@
 module Redshifter
-  VERSION = "0.6.2"
+  VERSION = "0.7.0"
 end
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: redshifter
 version: !ruby/object:Gem::Version
-  version: 0.6.2
+  version: 0.7.0
 platform: ruby
 authors:
 - Justin Richard
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-07-07 00:00:00.000000000 Z
+date: 2017-06-16 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: dynosaur
@@ -142,6 +142,7 @@ files:
 - lib/redshifter/config.rb
 - lib/redshifter/extract_and_replace_redshift_table.rb
 - lib/redshifter/extract_and_update_redshift_table.rb
+- lib/redshifter/job/append_redshift_table_job.rb
 - lib/redshifter/job/replace_redshift_table_job.rb
 - lib/redshifter/job/update_redshift_table_job.rb
 - lib/redshifter/table.rb
@@ -174,9 +175,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
       version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 2.4.8
+rubygems_version: 2.6.10
 signing_key:
 specification_version: 4
 summary: ETL processing jobs to exporting Rails model tables to Redshift
 test_files: []
-has_rdoc: