bigshift 0.3.2 → 0.4.0
- checksums.yaml +4 -4
- data/README.md +16 -8
- data/lib/bigshift.rb +2 -1
- data/lib/bigshift/big_query/table.rb +2 -5
- data/lib/bigshift/cli.rb +65 -30
- data/lib/bigshift/cloud_storage_transfer.rb +4 -4
- data/lib/bigshift/redshift_table_schema.rb +12 -5
- data/lib/bigshift/redshift_unloader.rb +1 -0
- data/lib/bigshift/version.rb +1 -1
- metadata +18 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 60b5d96dd6c068f548e07446f970f8520a332516
+  data.tar.gz: 8216e56b7ce9ae684e1d17e2fb89a64c77a4157b
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: f1fbeea6fcb26d64a3416f376b5b324fcd086f2c62d1878589eefd9be18437a8ae2ea9d47116cd485ff0b86dd0c263ba9ad797f135581968ff237edd7d1e939b
+  data.tar.gz: 763dd96b31254c70b1d596500ee0ed28f7f9c4025ff3272051b9ffb018618742908ff52f0b0c35bac887ba0f970b0f791091138d293009364673d3c370efa18a
data/README.md
CHANGED
@@ -36,7 +36,11 @@ Running `bigshift` without any arguments, or with `--help` will show the options
 
 ### GCP credentials
 
-
+You can provide GCP credentials either with the environment variable `GOOGLE_APPLICATION_CREDENTIALS` or with the `--gcp-credentials` argument. These must be a path to a JSON file that contains a public/private key pair for a GCP user. The best way to obtain this is to create a new service account and choose JSON as the key type when prompted. See the [GCP documentation](https://cloud.google.com/docs/authentication/production#obtaining_and_providing_service_account_credentials_manually) for more information.
+
+If Bigshift is run directly on Compute Engine, Kubernetes Engine or App Engine flexible environment, the embedded service account will be used instead. Please note the service account will need to have the `cloud-platform` authorization scope as detailed in the [Storage Transfer Service documentation](https://cloud.google.com/storage-transfer/docs/create-client#scope).
+
+If you haven't used Storage Transfer Service with your destination bucket before it might not have the right permissions setup, see below under [Troubleshooting](#insufficientpermissionswhentransferringtogcs) for more information.
 
 ### AWS credentials
 
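
> The resolution order described above is also what the new CLI code further down in this diff implements: an explicit JSON key file wins, then `GOOGLE_APPLICATION_CREDENTIALS`, and finally the machine's embedded service account. Below is a minimal standalone sketch of that logic, assuming the `googleauth` and `google-api-client` gems the project already uses; `key_path` is an illustrative variable, not a BigShift option.

```ruby
require 'googleauth'
require 'google/apis/storagetransfer_v1'

# Path given on the command line, or the environment variable fallback
key_path = ARGV[0] || ENV['GOOGLE_APPLICATION_CREDENTIALS']

credentials =
  if key_path
    # Explicit service account key (what --gcp-credentials points at)
    Google::Auth::ServiceAccountCredentials.make_creds(
      json_key_io: File.open(key_path),
      scope: Google::Apis::StoragetransferV1::AUTH_CLOUD_PLATFORM
    )
  else
    # Embedded service account on Compute Engine, GKE or App Engine flexible
    Google::Auth::GCECredentials.new
  end

credentials.fetch_access_token!
puts 'GCP credentials resolved'
```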
@@ -163,19 +167,23 @@ The certificates used by the Google APIs might not be installed on your system,
 export SSL_CERT_FILE="$(find $GEM_HOME/gems -name 'google-api-client-*' | tail -n 1)/lib/cacerts.pem"
 ```
 
-###
+### I get errors when the data is loaded into BigQuery
+
+This could be anything, but it could be things that aren't escaped properly when the data is dumped from Redshift. Try figuring out from the errors where the problem is and what the data looks like and open an issue. The more you can figure out yourself the more likely it is that you will get help. No one wants to trawl through your data, make an effort.
 
-
+### Insufficient permissions when transferring to GCS
 
-
+The Google Storage bucket needs permissions for the Storage Transfer service's Service Account to write to it. If you haven't used Storage Transfer service with this bucket before the bucket might not have the necessary permissions set up.
 
-
+The easiest way for now to get that ID applied is to just create a manual Transfer request through the UI at which point you will have the permission automatically applied to the bucket.
 
-
+You can verify that this has been set up by inspecting the permissions for your bucket and check that there is a user with a name like `storage-transfer-<ID>@partnercontent.gserviceaccount.com` that is set up as a writer.
 
-
+If the permission on the bucket isn't there, the Storage Transfer service won't be able to find the bucket and will fail. You might see an error like "Failed to obtain the location of the destination Google Cloud Storage (GCS) bucket due to insufficient permissions".
 
-
+### I get a NoMethodError: undefined method 'match' for nil:NilClass
+
+This appears to be a bug in the AWS SDK that manifests when your [AWS credentials](#aws-credentials) have not been properly specified.
 
 # Copyright
 
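
> The last troubleshooting entry usually comes down to credential resolution, and the cli.rb hunks later in this diff show the order BigShift uses: an explicit credentials file first, otherwise the SDK's default provider chain. Here is a small sketch of that behaviour; `aws_credentials.yml` is an illustrative file name, but the `access_key_id`, `secret_access_key` and `region` keys match what the CLI reads.

```ruby
require 'yaml'
require 'aws-sdk-s3'

config = File.exist?('aws_credentials.yml') ? YAML.load(File.read('aws_credentials.yml')) : nil

if config
  # Explicit credentials file, same keys the CLI expects
  credentials = Aws::Credentials.new(*config.values_at('access_key_id', 'secret_access_key'))
  region = config['region']
else
  # No file: fall back to the SDK's default provider chain
  # (environment variables, shared credentials file, instance profile, ...)
  credentials = Aws::CredentialProviderChain.new.resolve
  region = ENV['AWS_REGION'] || ENV['AWS_DEFAULT_REGION']
end

raise 'AWS region not specified' unless region

s3 = Aws::S3::Client.new(region: region, credentials: credentials)
puts "Using AWS region #{s3.config.region}"
```

> If the provider chain comes back empty, client calls fail in opaque ways, which is consistent with the README's note that the `NoMethodError` shows up when credentials are missing.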
data/lib/bigshift.rb
CHANGED
data/lib/bigshift/big_query/table.rb
CHANGED
@@ -18,6 +18,7 @@ module BigShift
         load_configuration[:source_format] = 'CSV'
         load_configuration[:field_delimiter] = '\t'
         load_configuration[:quote] = '"'
+        load_configuration[:allow_quoted_newlines] = true
         load_configuration[:destination_table] = @table_data.table_reference
         load_configuration[:max_bad_records] = options[:max_bad_records] if options[:max_bad_records]
         job = Google::Apis::BigqueryV2::Job.new(
@@ -36,11 +37,7 @@ module BigShift
         else
           job.status.errors.each do |error|
             message = %<Load error: "#{error.message}">
-            if error.location
-              file, line, field = error.location.split('/').map { |s| s.split(':').last.strip }
-              message << " at file #{file}, line #{line}"
-              message << ", field #{field}" if field
-            end
+            message << " in #{error.location}" if error.location
             @logger.debug(message)
           end
           raise job.status.error_result.message
data/lib/bigshift/cli.rb
CHANGED
@@ -25,12 +25,18 @@ module BigShift
     end
 
     def run
-
-
-
-
-
-
+      begin
+        setup
+        unload
+        transfer
+        load
+        cleanup
+        nil
+      rescue Aws::Errors::MissingRegionError, Aws::Sigv4::Errors::MissingCredentialsError => e
+        raise CliError.new('AWS configuration missing or malformed: ' + e.message, e.backtrace, @usage)
+      rescue Signet::AuthorizationError => e
+        raise CliError.new('GCP configuration missing or malformed: ' + e.message, e.backtrace, @usage)
+      end
     end
 
     private
@@ -43,12 +49,15 @@ module BigShift
       @config = parse_args(@argv)
       @factory = @factory_factory.call(@config)
       @logger = @factory.logger
+      @logger.debug('Setup complete')
     end
 
     def unload
       if run?(:unload)
+        @logger.debug('Running unload')
         s3_uri = "s3://#{@config[:s3_bucket_name]}/#{s3_table_prefix}"
         @factory.redshift_unloader.unload_to(@config[:rs_schema_name], @config[:rs_table_name], s3_uri, allow_overwrite: false, compression: @config[:compression])
+        @logger.debug('Unload complete')
       else
         @logger.debug('Skipping unload')
       end
@@ -57,8 +66,10 @@ module BigShift
 
     def transfer
       if run?(:transfer)
+        @logger.debug('Running transfer')
         description = "bigshift-#{@config[:rs_database_name]}-#{@config[:rs_schema_name]}-#{@config[:rs_table_name]}-#{Time.now.utc.strftime('%Y%m%dT%H%M')}"
         @factory.cloud_storage_transfer.copy_to_cloud_storage(@unload_manifest, @config[:cs_bucket_name], description: description, allow_overwrite: false)
+        @logger.debug('Transfer complete')
       else
         @logger.debug('Skipping transfer')
       end
@@ -66,6 +77,7 @@ module BigShift
 
     def load
       if run?(:load)
+        @logger.debug('Querying Redshift schema')
         rs_table_schema = @factory.redshift_table_schema
         bq_dataset = @factory.big_query_dataset
         bq_table = bq_dataset.table(@config[:bq_table_id]) || bq_dataset.create_table(@config[:bq_table_id])
@@ -74,7 +86,9 @@ module BigShift
         options[:schema] = rs_table_schema.to_big_query
         options[:allow_overwrite] = true
         options[:max_bad_records] = @config[:max_bad_records] if @config[:max_bad_records]
+        @logger.debug('Running load')
         bq_table.load(gcs_uri, options)
+        @logger.debug('Load complete')
       else
         @logger.debug('Skipping load')
       end
@@ -82,7 +96,9 @@ module BigShift
 
     def cleanup
       if run?(:cleanup)
+        @logger.debug('Running cleanup')
         @factory.cleaner.cleanup(@unload_manifest, @config[:cs_bucket_name])
+        @logger.debug('Cleanup complete')
       else
         @logger.debug('Skipping cleanup')
       end
@@ -96,7 +112,7 @@ module BigShift
     ].freeze
 
     ARGUMENTS = [
-      ['--gcp-credentials', 'PATH', String, :gcp_credentials_path,
+      ['--gcp-credentials', 'PATH', String, :gcp_credentials_path, nil],
       ['--aws-credentials', 'PATH', String, :aws_credentials_path, nil],
       ['--rs-credentials', 'PATH', String, :rs_credentials_path, :required],
       ['--rs-database', 'DB_NAME', String, :rs_database_name, :required],
@@ -125,6 +141,9 @@ module BigShift
       rescue OptionParser::InvalidOption => e
         config_errors << e.message
       end
+      if !config[:gcp_credentials_path] && ENV['GOOGLE_APPLICATION_CREDENTIALS']
+        config[:gcp_credentials_path] = ENV['GOOGLE_APPLICATION_CREDENTIALS']
+      end
       %w[gcp aws rs].each do |prefix|
         if (path = config["#{prefix}_credentials_path".to_sym]) && File.exist?(path)
           config["#{prefix}_credentials".to_sym] = YAML.load(File.read(path))
@@ -144,8 +163,9 @@ module BigShift
       else
         config[:steps] = STEPS
       end
+      @usage = parser.to_s
       unless config_errors.empty?
-        raise CliError.new('Configuration missing or malformed', config_errors,
+        raise CliError.new('Configuration missing or malformed', config_errors, @usage)
       end
       config
     end
@@ -171,19 +191,19 @@ module BigShift
     end
 
     def redshift_unloader
-      @redshift_unloader ||= RedshiftUnloader.new(
+      @redshift_unloader ||= RedshiftUnloader.new(create_rs_connection, aws_credentials, logger: logger)
     end
 
     def cloud_storage_transfer
-      @cloud_storage_transfer ||= CloudStorageTransfer.new(cs_transfer_service,
+      @cloud_storage_transfer ||= CloudStorageTransfer.new(cs_transfer_service, gcp_project, aws_credentials, logger: logger)
     end
 
     def redshift_table_schema
-      @redshift_table_schema ||= RedshiftTableSchema.new(@config[:rs_schema_name], @config[:rs_table_name],
+      @redshift_table_schema ||= RedshiftTableSchema.new(@config[:rs_schema_name], @config[:rs_table_name], create_rs_connection)
     end
 
     def big_query_dataset
-      @big_query_dataset ||= BigQuery::Dataset.new(bq_service,
+      @big_query_dataset ||= BigQuery::Dataset.new(bq_service, gcp_project, @config[:bq_dataset_id], logger: logger)
     end
 
     def cleaner
@@ -207,8 +227,8 @@ module BigShift
 
     private
 
-    def
-
+    def create_rs_connection
+      rs_connection = PG.connect(
         host: @config[:rs_credentials]['host'],
         port: @config[:rs_credentials]['port'],
         dbname: @config[:rs_database_name],
|
         password: @config[:rs_credentials]['password'],
         sslmode: 'require'
       )
-      socket = Socket.for_fd(
+      socket = Socket.for_fd(rs_connection.socket)
       socket.setsockopt(Socket::SOL_SOCKET, Socket::SO_KEEPALIVE, 1)
       socket.setsockopt(Socket::IPPROTO_TCP, Socket::TCP_KEEPCNT, 5)
       socket.setsockopt(Socket::IPPROTO_TCP, Socket::TCP_KEEPINTVL, 2)
       socket.setsockopt(Socket::IPPROTO_TCP, Socket::TCP_KEEPIDLE, 2) if defined?(Socket::TCP_KEEPIDLE)
-
-
+      rs_connection.exec("SET search_path = \"#{@config[:rs_schema_name]}\"")
+      rs_connection
     end
 
     def cs_transfer_service
@@ -254,29 +274,44 @@ module BigShift
         if @config[:aws_credentials]
           credentials = Aws::Credentials.new(*@config[:aws_credentials].values_at('access_key_id', 'secret_access_key'))
         else
-          credentials =
-        end
-        if (credentials = Aws::CredentialProviderChain.new(credentials).resolve)
-          credentials
-        else
-          raise 'No AWS credentials found'
+          credentials = Aws::CredentialProviderChain.new.resolve
         end
       end
     end
 
     def aws_region
-      @aws_region ||=
+      @aws_region ||= begin
+        if @config[:aws_credentials]
+          region = @config[:aws_credentials]['region']
+        else
+          region = ENV['AWS_REGION'] || ENV['AWS_DEFAULT_REGION']
+        end
+
+        if !region
+          raise BigShiftError.new('AWS Region not specified')
+        end
+      end
     end
 
-    def
-      @config[:gcp_credentials]
+    def gcp_project
+      if @config[:gcp_credentials]
+        @config[:gcp_credentials]['project_id']
+      else
+        Google::Cloud.env.project_id
+      end
     end
 
     def gcp_credentials
-      @gcp_credentials ||=
-
-
-
+      @gcp_credentials ||= begin
+        if @config[:gcp_credentials]
+          credentials = Google::Auth::ServiceAccountCredentials.make_creds(
+            json_key_io: StringIO.new(JSON.dump(@config[:gcp_credentials])),
+            scope: Google::Apis::StoragetransferV1::AUTH_CLOUD_PLATFORM
+          )
+        else
+          credentials = Google::Auth::GCECredentials.new
+        end
+      end
     end
   end
 end
data/lib/bigshift/cloud_storage_transfer.rb
CHANGED
@@ -24,15 +24,15 @@ module BigShift
     DEFAULT_POLL_INTERVAL = 30
 
     def create_transfer_job(unload_manifest, cloud_storage_bucket, description, allow_overwrite)
-
+      soon = @clock.now.utc + 60
       Google::Apis::StoragetransferV1::TransferJob.new(
         description: description,
         project_id: @project_id,
         status: 'ENABLED',
         schedule: Google::Apis::StoragetransferV1::Schedule.new(
-          schedule_start_date: Google::Apis::StoragetransferV1::Date.new(year:
-          schedule_end_date: Google::Apis::StoragetransferV1::Date.new(year:
-          start_time_of_day: Google::Apis::StoragetransferV1::TimeOfDay.new(hours:
+          schedule_start_date: Google::Apis::StoragetransferV1::Date.new(year: soon.year, month: soon.month, day: soon.day),
+          schedule_end_date: Google::Apis::StoragetransferV1::Date.new(year: soon.year, month: soon.month, day: soon.day),
+          start_time_of_day: Google::Apis::StoragetransferV1::TimeOfDay.new(hours: soon.hour, minutes: soon.min)
         ),
         transfer_spec: Google::Apis::StoragetransferV1::TransferSpec.new(
           aws_s3_data_source: Google::Apis::StoragetransferV1::AwsS3Data.new(
data/lib/bigshift/redshift_table_schema.rb
CHANGED
@@ -8,7 +8,17 @@ module BigShift
 
     def columns
       @columns ||= begin
-
+        query = %{
+          SELECT "column", "type", "notnull"
+          FROM pg_table_def ptd, information_schema.columns isc
+          WHERE ptd.schemaname = isc.table_schema
+          AND ptd.tablename = isc.table_name
+          AND ptd.column = isc.column_name
+          AND schemaname = $1
+          AND tablename = $2
+          ORDER BY ordinal_position
+        }.gsub(/\s+/, ' ').strip
+        rows = @redshift_connection.exec_params(query, [@schema_name, @table_name])
         if rows.count == 0
           raise sprintf('Table %s for schema %s not found', @table_name.inspect, @schema_name.inspect)
         else
@@ -18,7 +28,6 @@ module BigShift
           nullable = row['notnull'] == 'f'
           Column.new(name, type, nullable)
         end
-        columns.sort_by!(&:name)
         columns
       end
     end
@@ -51,12 +60,10 @@ module BigShift
 
       def to_sql
         case @type
-        when /^numeric/, /int/, /^double/, 'real'
+        when /^numeric/, /int/, /^double/, 'real', /^timestamp/
          sprintf('"%s"', @name)
         when /^character/
          sprintf(%q<('"' || REPLACE(REPLACE(REPLACE("%s", '"', '""'), '\\n', '\\\\n'), '\\r', '\\\\r') || '"')>, @name)
-        when /^timestamp/
-         sprintf('(EXTRACT(epoch FROM "%s") + EXTRACT(milliseconds FROM "%s")/1000.0)', @name, @name)
         when 'date'
          sprintf(%q<(TO_CHAR("%s", 'YYYY-MM-DD'))>, @name)
         when 'boolean'
data/lib/bigshift/redshift_unloader.rb
CHANGED
@@ -20,6 +20,7 @@ module BigShift
       unload_sql << %q< DELIMITER '\t'>
       unload_sql << %q< GZIP> if options[:compression] || options[:compression].nil?
       unload_sql << %q< ALLOWOVERWRITE> if options[:allow_overwrite]
+      unload_sql << %q< MAXFILESIZE 3.9 GB>
       @logger.info(sprintf('Unloading Redshift table %s to %s', table_name, s3_uri))
       @redshift_connection.exec(unload_sql)
       @logger.info(sprintf('Unload of %s complete', table_name))
data/lib/bigshift/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: bigshift
 version: !ruby/object:Gem::Version
-  version: 0.
+  version: 0.4.0
 platform: ruby
 authors:
 - Theo Hultberg
 autorequire:
 bindir: bin
 cert_chain: []
-date:
+date: 2019-01-20 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: pg
@@ -53,7 +53,21 @@ dependencies:
       - !ruby/object:Gem::Version
         version: '0'
 - !ruby/object:Gem::Dependency
-  name:
+  name: google-cloud-env
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: aws-sdk-s3
   requirement: !ruby/object:Gem::Requirement
     requirements:
     - - ">="
@@ -112,7 +126,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
       version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 2.
+rubygems_version: 2.6.14
 signing_key:
 specification_version: 4
 summary: A tool for moving tables from Redshift to BigQuery