bigquery_migration 0.1.0.pre1
- checksums.yaml +7 -0
- data/.gitignore +10 -0
- data/.rspec +2 -0
- data/.travis.yml +4 -0
- data/CODE_OF_CONDUCT.md +49 -0
- data/Gemfile +3 -0
- data/LICENSE.txt +19 -0
- data/README.md +107 -0
- data/Rakefile +10 -0
- data/bigquery_migration.gemspec +31 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/example/example.yml +22 -0
- data/example/schema.json +22 -0
- data/exe/bq_migrate +4 -0
- data/lib/bigquery_migration.rb +29 -0
- data/lib/bigquery_migration/action.rb +85 -0
- data/lib/bigquery_migration/action_runner.rb +60 -0
- data/lib/bigquery_migration/bigquery_wrapper.rb +675 -0
- data/lib/bigquery_migration/cli.rb +105 -0
- data/lib/bigquery_migration/config_loader.rb +51 -0
- data/lib/bigquery_migration/error.rb +6 -0
- data/lib/bigquery_migration/hash_util.rb +35 -0
- data/lib/bigquery_migration/logger.rb +45 -0
- data/lib/bigquery_migration/schema.rb +388 -0
- data/lib/bigquery_migration/time_with_zone.rb +38 -0
- data/lib/bigquery_migration/version.rb +3 -0
- metadata +183 -0
data/lib/bigquery_migration/action_runner.rb
@@ -0,0 +1,60 @@
require_relative 'config_loader'
require_relative 'error'
require_relative 'action'
require_relative 'hash_util'

class BigqueryMigration
  class ActionRunner
    attr_reader :config, :config_path, :opts

    def initialize(config_path = nil, opts = {})
      @config_path = config_path
      @opts = opts
      config = ConfigLoader.new(@config_path, opts[:vars]).load
      @config = HashUtil.deep_symbolize_keys(config)
      validate_config!
    end

    def run
      success, responses = run_actions
      { success: success, dry_run: @opts[:dry_run], actions: responses }
    end

    def run_actions
      success = true
      responses = []

      @config[:actions].each do |action_config|
        _success, result = Action.new(action_config, @opts).run
        response = action_config.merge({'result' => result})
        responses << response
        unless _success
          success = false
          break
        end
      end

      [success, responses]
    end

    def validate_config!
      unless config.is_a?(Hash)
        raise ConfigError, "config file format has to be YAML Hash"
      end

      unless config[:actions]
        raise ConfigError, "config must have `actions` key"
      end

      unless config[:actions].is_a?(Array)
        raise ConfigError, "config[:actions] must be an Array"
      end

      config[:actions].each do |action_config|
        unless action_config[:action]
          raise ConfigError, "Elements of `config[:actions]` must have `action` key"
        end
      end
    end
  end
end
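
For orientation before the wrapper below, a minimal sketch of driving ActionRunner directly. The per-action keys other than `action` are hypothetical (action.rb is not reproduced in this diff), and the require line assumes the gem's lib/ directory is on the load path:

    require 'bigquery_migration'  # assumed to pull in ActionRunner; otherwise require 'bigquery_migration/action_runner'

    # The config file must parse to a Hash with an `actions` Array whose elements
    # each carry an `action` key -- exactly what validate_config! enforces.
    # `vars` is handed to ConfigLoader; `dry_run: true` is passed through to each Action.
    runner = BigqueryMigration::ActionRunner.new('config.yml', vars: {}, dry_run: true)
    result = runner.run
    result[:success]  # => false as soon as one action fails (run_actions stops at the first failure)
    result[:actions]  # => the action configs, each merged with a 'result' entry
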
data/lib/bigquery_migration/bigquery_wrapper.rb
@@ -0,0 +1,675 @@
require 'csv'
require 'json'
require_relative 'schema'
require_relative 'error'
require_relative 'time_with_zone'
require_relative 'hash_util'
require 'google/apis/bigquery_v2'
require 'google/api_client/auth/key_utils'

class BigqueryMigration
  class BigqueryWrapper
    attr_reader :config

    def logger
      BigqueryMigration.logger
    end

    def initialize(config, opts = {})
      @config = HashUtil.deep_symbolize_keys(config)
      @opts = HashUtil.deep_symbolize_keys(opts)
      configure
    end

    def configure
      if json_keyfile = config[:json_keyfile]
        config[:json_key] =
          case json_keyfile
          when String
            File.read(json_keyfile)
          when Hash
            json_keyfile[:content]
          else
            raise ConfigError.new "Unsupported json_keyfile type"
          end
      else
        config[:json_key] = {
          project_id: config[:project_id],
          service_email: config[:service_email],
          private_key: config[:private_key],
        }.to_json
      end

      if config[:json_key]
        begin
          jsonkey_params = JSON.parse(config[:json_key])
        rescue => e
          raise ConfigError.new "json_keyfile is not a JSON file"
        end
      end

      if jsonkey_params
        config[:project] ||= jsonkey_params['project_id']
      end

      config[:retries] ||= 5
      config[:timeout_sec] ||= 300
      config[:open_timeout_sec] ||= 300
    end

    def project
      @project ||= config[:project] || raise(ConfigError, '`project` is required.')
    end

    def dataset
      @dataset ||= config[:dataset] || raise(ConfigError, '`dataset` is required.')
    end

    def table
      @table ||= config[:table] || raise(ConfigError, '`table` is required.')
    end

    def job_status_polling_interval
      @job_status_polling_interval ||= config[:job_status_polling_interval] || 5
    end

    def job_status_max_polling_time
      @job_status_max_polling_time ||= config[:job_status_polling_time] || 3600
    end

    def dry_run?
      @opts[:dry_run]
    end

    def head
      dry_run? ? '(DRY-RUN) ' : '(EXECUTE) '
    end

    def client
      return @cached_client if @cached_client && @cached_client_expiration > Time.now

      client = Google::Apis::BigqueryV2::BigqueryService.new
      client.request_options.retries = config[:retries]
      client.request_options.timeout_sec = config[:timeout_sec]
      client.request_options.open_timeout_sec = config[:open_timeout_sec]
      logger.debug { "client_options: #{client.client_options.to_h}" }
      logger.debug { "request_options: #{client.request_options.to_h}" }

      scope = "https://www.googleapis.com/auth/bigquery"

      key = StringIO.new(config[:json_key])
      auth = Google::Auth::ServiceAccountCredentials.make_creds(json_key_io: key, scope: scope)
      client.authorization = auth

      @cached_client_expiration = Time.now + 1800
      @cached_client = client
    end

    def existing_columns
      begin
        result = get_table
        response = result[:responses][:get_table]
        response.schema.fields.map {|column| column.to_h }
      rescue NotFoundError
        return []
      end
    end

    def get_dataset(dataset: nil)
      dataset ||= self.dataset
      begin
        logger.info { "Get dataset... #{project}:#{dataset}" }
        response = client.get_dataset(project, dataset)
      rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
        if e.status_code == 404
          raise NotFoundError, "Dataset #{project}:#{dataset} is not found"
        end

        response = {status_code: e.status_code, message: e.message, error_class: e.class}
        raise Error, "Failed to get_dataset(#{project}, #{dataset}), response:#{response}"
      end

      { responses: { get_dataset: response } }
    end

    def insert_dataset(dataset: nil, reference: nil)
      dataset ||= self.dataset
      begin
        logger.info { "#{head}Insert (create) dataset... #{project}:#{dataset}" }
        hint = {}
        if reference
          # get_dataset wraps the API response under [:responses][:get_dataset]
          response = get_dataset(dataset: reference)[:responses][:get_dataset]
          hint = { access: response.access }
        end
        body = {
          dataset_reference: {
            project_id: project,
            dataset_id: dataset,
          },
        }.merge(hint)
        opts = {}
        logger.debug { "#{head}insert_dataset(#{project}, #{body}, #{opts})" }
        unless dry_run?
          response = client.insert_dataset(project, body, opts)
        end
      rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
        if e.status_code == 409 && /Already Exists:/ =~ e.message
          # ignore 'Already Exists' error
          return {}
        end

        response = {status_code: e.status_code, message: e.message, error_class: e.class}
        raise Error, "Failed to insert_dataset(#{project}, #{body}, #{opts}), response:#{response}"
      end

      { responses: { insert_dataset: response } }
    end
    alias :create_dataset :insert_dataset

    def get_table(dataset: nil, table: nil)
      dataset ||= self.dataset
      table ||= self.table
      begin
        logger.debug { "Get table... #{project}:#{dataset}.#{table}" }
        response = client.get_table(project, dataset, table)
      rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
        if e.status_code == 404 # not found
          raise NotFoundError, "Table #{project}:#{dataset}.#{table} is not found"
        end

        response = {status_code: e.status_code, message: e.message, error_class: e.class}
        raise Error, "Failed to get_table(#{project}, #{dataset}, #{table}), response:#{response}"
      end

      { responses: { get_table: response } }
    end

    def insert_table(dataset: nil, table: nil, columns: )
      dataset ||= self.dataset
      table ||= self.table
      schema = Schema.new(columns)

      begin
        logger.info { "#{head}Insert (create) table... #{project}:#{dataset}.#{table}" }
        body = {
          table_reference: {
            table_id: table,
          },
          schema: {
            fields: schema,
          }
        }
        opts = {}
        logger.debug { "#{head}insert_table(#{project}, #{dataset}, #{body}, #{opts})" }
        unless dry_run?
          response = client.insert_table(project, dataset, body, opts)
        end
      rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
        if e.status_code == 409 && /Already Exists:/ =~ e.message
          # ignore 'Already Exists' error
          return {}
        end

        response = {status_code: e.status_code, message: e.message, error_class: e.class}
        raise Error, "Failed to insert_table(#{project}, #{dataset}, #{body}, #{opts}), response:#{response}"
      end

      { responses: { insert_table: response } }
    end
    alias :create_table :insert_table

    def delete_table(dataset: nil, table: nil)
      dataset ||= self.dataset
      table ||= self.table

      begin
        logger.info { "#{head}Delete (drop) table... #{project}:#{dataset}.#{table}" }
        unless dry_run?
          client.delete_table(project, dataset, table) # no response
          success = true
        end
      rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
        if e.status_code == 404 && /Not found:/ =~ e.message
          # ignore 'Not Found' error
          return {}
        end

        response = {status_code: e.status_code, message: e.message, error_class: e.class}
        raise Error, "Failed to delete_table(#{project}, #{dataset}, #{table}), response:#{response}"
      end

      { success: success }
    end
    alias :drop_table :delete_table

    def list_tables(dataset: nil, max_results: 999999)
      dataset ||= self.dataset

      tables = []
      begin
        logger.info { "List tables... #{project}:#{dataset}" }
        response = client.list_tables(project, dataset, max_results: max_results)
        while true
          _tables = (response.tables || []).map { |t| t.table_reference.table_id.to_s }
          tables.concat(_tables)
          if next_page_token = response.next_page_token
            response = client.list_tables(project, dataset, page_token: next_page_token, max_results: max_results)
          else
            break
          end
        end
      rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
        if e.status_code == 404 && /Not found:/ =~ e.message
          raise NotFoundError, "Dataset #{project}:#{dataset} is not found"
        end

        response = {status_code: e.status_code, message: e.message, error_class: e.class}
        logger.error { "list_tables(#{project}, #{dataset}), response:#{response}" }
        raise Error, "failed to list tables #{project}:#{dataset}, response:#{response}"
      end

      { tables: tables }
    end

    def purge_tables(dataset: nil, table_prefix: , suffix_format: , purge_before: , timezone: nil)
      dataset ||= self.dataset
      timezone ||= Time.now.strftime('%z')

      before_tables = list_tables[:tables]

      purge_before_t = TimeWithZone.strptime_with_zone(purge_before, suffix_format, timezone)
      tables = before_tables.select do |tbl|
        suffix = tbl.gsub(table_prefix, '')
        begin
          suffix_t = TimeWithZone.strptime_with_zone(suffix, suffix_format, timezone)
        rescue
          next
        end
        # skip if different from the suffix_format
        next if suffix_t.strftime(suffix_format) != suffix
        suffix_t <= purge_before_t
      end

      tables.each do |_table|
        delete_table(table: _table)
        # If you make more than 100 requests per second, throttling might occur.
        # See https://cloud.google.com/bigquery/quota-policy#apirequests
        sleep 1
      end

      { delete_tables: tables }
    end

    # rows:
    #   - id: 1
    #     type: one
    #     record:
    #       child1: 'child1'
    #       child2: 'child2'
    #   - id: 2
    #     type: two
    #     record:
    #       child1: 'child3'
    #       child2: 'child4'
    def insert_all_table_data(dataset: nil, table: nil, rows: )
      dataset ||= self.dataset
      table ||= self.table

      begin
        logger.info { "#{head}insertAll tableData... #{project}:#{dataset}.#{table}" }
        body = {
          rows: rows.map {|row| { json: row } },
        }
        opts = {}
        unless dry_run?
          response = client.insert_all_table_data(project, dataset, table, body, opts)
        end
      rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
        if e.status_code == 404 # not found
          raise NotFoundError, "Table #{project}:#{dataset}.#{table} is not found"
        end

        response = {status_code: e.status_code, message: e.message, error_class: e.class}
        logger.error {
          "insert_all_table_data(#{project}, #{dataset}, #{table}, #{opts}), response:#{response}"
        }
        raise Error, "failed to insert_all table_data #{project}:#{dataset}.#{table}, response:#{response}"
      end

      { responses: { insert_all_table_data: response } }
    end

    # @return Hash result of list table_data
    #
    # Example:
    # {
    #   columns:
    #     [
    #       {
    #         name: id,
    #         type: INTEGER
    #       },
    #       {
    #         name: type,
    #         type: STRING
    #       },
    #       {
    #         name: record.child1,
    #         type: STRING
    #       },
    #       {
    #         name: record.child2,
    #         type: STRING
    #       }
    #     ],
    #   values:
    #     [
    #       [2,"two","child3","child4"],
    #       [1,"one","child1","child2"]
    #     ],
    #   total_rows: 2
    # }
    def list_table_data(dataset: nil, table: nil, max_results: 100)
      dataset ||= self.dataset
      table ||= self.table

      begin
        logger.info { "list_table_data(#{project}, #{dataset}, #{table}, max_results: #{max_results})" }
        response = client.list_table_data(project, dataset, table, max_results: max_results)
      rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
        if e.status_code == 404 # not found
          raise NotFoundError, "Table #{project}:#{dataset}.#{table} is not found"
        end

        response = {status_code: e.status_code, message: e.message, error_class: e.class}
        logger.error { "list_table_data(#{project}, #{dataset}, #{table}, max_results: #{max_results})" }
        raise Error, "Failed to list table_data #{project}:#{dataset}.#{table}, response:#{response}"
      end

      flattened_columns = Schema.new(existing_columns).flattened_columns.map do |name, column|
        {name: name}.merge!(column)
      end
      if rows = response.to_h[:rows]
        flattened_values = flatten_values(rows)
      end

      {
        total_rows: response.total_rows,
        columns: flattened_columns,
        values: flattened_values,
        response: {
          list_table_data: response,
        }
      }
    end

    private def flatten_values(rows)
      rows.map do |r|
        if r.key?(:f)
          r[:f].map do |f|
            if f[:v].respond_to?(:key?) && f[:v].key?(:f)
              flatten_values(f[:v][:f])
            else
              f[:v]
            end
          end.flatten
        else
          r[:v]
        end
      end
    end
    def patch_table(dataset: nil, table: nil, columns: nil, add_columns: nil)
      dataset ||= self.dataset
      table ||= self.table

      if columns.nil? and add_columns.nil?
        raise ArgumentError, 'patch_table: `columns` or `add_columns` is required'
      end

      before_columns = existing_columns
      if columns # if already given
        schema = Schema.new(columns)
      else
        schema = Schema.new(add_columns)
        schema.reverse_merge!(before_columns)
      end
      schema.validate_permitted_operations!(before_columns)

      begin
        logger.info { "#{head}Patch table... #{project}:#{dataset}.#{table}" }
        fields = schema.map {|column| HashUtil.deep_symbolize_keys(column) }
        body = {
          schema: {
            fields: fields,
          }
        }
        opts = {}
        logger.debug { "#{head}patch_table(#{project}, #{dataset}, #{table}, #{body}, options: #{opts})" }
        unless dry_run?
          response = client.patch_table(project, dataset, table, body, options: opts)
        end
      rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
        if e.status_code == 404 # not found
          raise NotFoundError, "Table #{project}:#{dataset}.#{table} is not found"
        end

        response = {status_code: e.status_code, message: e.message, error_class: e.class}
        logger.error {
          "patch_table(#{project}, #{dataset}, #{table}, #{body}, options: #{opts}), response:#{response}"
        }
        raise Error, "Failed to patch table #{project}:#{dataset}.#{table}, response:#{response}"
      end

      after_columns = existing_columns

      {
        before_columns: before_columns,
        after_columns: after_columns,
        responses: { patch_table: response },
      }
    end
    alias :add_column :patch_table

    def copy_table(destination_table:, destination_dataset: nil, source_table: nil, source_dataset: nil, write_disposition: 'WRITE_TRUNCATE')
      source_table ||= self.table
      source_dataset ||= self.dataset
      destination_dataset ||= source_dataset

      body = {
        configuration: {
          copy: {
            create_disposition: 'CREATE_IF_NEEDED',
            write_disposition: write_disposition,
            source_table: {
              project_id: project,
              dataset_id: source_dataset,
              table_id: source_table,
            },
            destination_table: {
              project_id: project,
              dataset_id: destination_dataset,
              table_id: destination_table,
            },
          }
        }
      }
      opts = {}

      logger.info { "#{head}insert_job(#{project}, #{body}, #{opts})" }
      unless dry_run?
        response = client.insert_job(project, body, opts)
        get_response = wait_load('copy', response)
      end

      {
        responses: {
          insert_job: response,
          last_get_job: get_response,
        }
      }
    end

    def insert_select(query:, destination_table: nil, destination_dataset: nil, write_disposition: 'WRITE_TRUNCATE')
      destination_table ||= self.table
      destination_dataset ||= self.dataset

      body = {
        configuration: {
          query: {
            allow_large_results: true,
            flatten_results: false,
            write_disposition: write_disposition,
            query: query,
            destination_table: {
              project_id: self.project,
              dataset_id: destination_dataset,
              table_id: destination_table,
            },
          }
        }
      }
      opts = {}

      logger.info { "#{head}insert_job(#{project}, #{body}, #{opts})" }
      unless dry_run?
        response = client.insert_job(project, body, opts)
        get_response = wait_load('query', response)
      end

      {
        responses: {
          insert_job: response,
          last_get_job: get_response,
        }
      }
    end
    private def wait_load(kind, response)
      started = Time.now

      wait_interval = self.job_status_polling_interval
      max_polling_time = self.job_status_max_polling_time
      _response = response

      while true
        job_id = _response.job_reference.job_id
        elapsed = Time.now - started
        status = _response.status.state
        if status == "DONE"
          logger.info {
            "#{kind} job completed... " \
            "job id:[#{job_id}] elapsed_time:#{elapsed.to_f}sec status:[#{status}]"
          }
          break
        elsif elapsed.to_i > max_polling_time
          message = "Checking #{kind} job status... " \
            "job id:[#{job_id}] elapsed_time:#{elapsed.to_f}sec status:[TIMEOUT]"
          logger.info { message }
          raise JobTimeoutError.new(message)
        else
          logger.info {
            "Checking #{kind} job status... " \
            "job id:[#{job_id}] elapsed_time:#{elapsed.to_f}sec status:[#{status}]"
          }
          sleep wait_interval
          _response = client.get_job(project, job_id)
        end
      end

      # cf. http://www.rubydoc.info/github/google/google-api-ruby-client/Google/Apis/BigqueryV2/JobStatus#errors-instance_method
      # `errors` returns Array<Google::Apis::BigqueryV2::ErrorProto> if any error exists.
      # Otherwise, this returns nil.
      if _errors = _response.status.errors
        raise Error, "Failed during waiting a job, get_job(#{project}, #{job_id}), errors:#{_errors.map(&:to_h)}"
      end

      _response
    end
    def drop_column(table: nil, columns: nil, drop_columns: nil, backup_dataset: nil, backup_table: nil)
      table ||= self.table
      backup_dataset ||= self.dataset
      if columns.nil? and drop_columns.nil?
        raise ArgumentError, '`drop_columns` or `columns` is required'
      end

      result = { responses: {} }

      before_columns = existing_columns

      if columns # if already given
        schema = Schema.new(columns)
      else
        schema = Schema.new(existing_columns)
        schema.reject_columns!(drop_columns)
      end
      if schema.empty? && !dry_run?
        raise Error, 'No column remains'
      end

      schema.validate_permitted_operations!(before_columns)

      unless backup_dataset == self.dataset
        create_dataset(dataset: backup_dataset)
      end

      if backup_table
        _result = copy_table(source_table: table, destination_table: backup_table, destination_dataset: backup_dataset)
        result[:responses].merge!(_result[:responses])
      end

      unless (add_columns = schema.diff_columns_by_name(before_columns)).empty?
        _result = patch_table(add_columns: add_columns)
        result[:responses].merge!(_result[:responses])
      end

      query_fields = schema.build_query_fields(before_columns)
      query = "SELECT #{query_fields.join(',')} FROM [#{dataset}.#{table}]"
      _result = insert_select(query: query, destination_table: table)
      result[:responses].merge!(_result[:responses])

      after_columns = existing_columns

      result.merge!({before_columns: before_columns, after_columns: after_columns})
    end

    def migrate_table(table: nil, schema_file: nil, columns: nil, backup_dataset: nil, backup_table: nil)
      table ||= self.table
      backup_dataset ||= self.dataset

      if schema_file.nil? and columns.nil?
        raise ArgumentError, '`schema_file` or `columns` is required'
      end
      if schema_file
        columns = HashUtil.deep_symbolize_keys(JSON.parse(File.read(schema_file)))
      end
      Schema.validate_columns!(columns)

      before_columns = existing_columns

      result = {}
      if before_columns.empty?
        result = create_table(table: table, columns: columns)
      else
        add_columns  = Schema.diff_columns(before_columns, columns)
        drop_columns = Schema.diff_columns(columns, before_columns)

        if !drop_columns.empty?
          drop_column(table: table, columns: columns,
                      backup_dataset: backup_dataset, backup_table: backup_table)
        elsif !add_columns.empty?
          add_column(table: table, columns: columns)
        end
      end

      after_columns = existing_columns

      if after_columns.empty? and !dry_run?
        raise Error, "after_columns is empty. " \
          "before_columns: #{before_columns}, after_columns: #{after_columns}, columns: #{columns}"
      end

      result.merge!( before_columns: before_columns, after_columns: after_columns )
    end
  end
end
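
For the wrapper itself, a minimal dry-run sketch. The keyfile path, dataset, table, and schema file names are placeholders, and `require 'bigquery_migration'` is assumed to set up BigqueryMigration.logger, which the wrapper's logger method relies on:

    require 'bigquery_migration'

    config = {
      json_keyfile: '/path/to/service_account.json',  # a String path, or { content: '...' }
      dataset: 'my_dataset',
      table: 'my_table',
      # project is taken from the key file's project_id when not given explicitly
    }
    wrapper = BigqueryMigration::BigqueryWrapper.new(config, dry_run: true)

    # migrate_table diffs the live schema against schema.json: it creates the table
    # when missing, add_column()s new fields, and drop_column()s removed ones by
    # rewriting the table through insert_select (optionally backing it up first).
    result = wrapper.migrate_table(schema_file: 'schema.json')
    p result[:before_columns]
    p result[:after_columns]
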