google-cloud-bigquery 1.21.2

Files changed (44)
  1. checksums.yaml +7 -0
  2. data/.yardopts +16 -0
  3. data/AUTHENTICATION.md +158 -0
  4. data/CHANGELOG.md +397 -0
  5. data/CODE_OF_CONDUCT.md +40 -0
  6. data/CONTRIBUTING.md +188 -0
  7. data/LICENSE +201 -0
  8. data/LOGGING.md +27 -0
  9. data/OVERVIEW.md +463 -0
  10. data/TROUBLESHOOTING.md +31 -0
  11. data/lib/google-cloud-bigquery.rb +139 -0
  12. data/lib/google/cloud/bigquery.rb +145 -0
  13. data/lib/google/cloud/bigquery/argument.rb +197 -0
  14. data/lib/google/cloud/bigquery/convert.rb +383 -0
  15. data/lib/google/cloud/bigquery/copy_job.rb +316 -0
  16. data/lib/google/cloud/bigquery/credentials.rb +50 -0
  17. data/lib/google/cloud/bigquery/data.rb +526 -0
  18. data/lib/google/cloud/bigquery/dataset.rb +2845 -0
  19. data/lib/google/cloud/bigquery/dataset/access.rb +1021 -0
  20. data/lib/google/cloud/bigquery/dataset/list.rb +162 -0
  21. data/lib/google/cloud/bigquery/encryption_configuration.rb +123 -0
  22. data/lib/google/cloud/bigquery/external.rb +2432 -0
  23. data/lib/google/cloud/bigquery/extract_job.rb +368 -0
  24. data/lib/google/cloud/bigquery/insert_response.rb +180 -0
  25. data/lib/google/cloud/bigquery/job.rb +657 -0
  26. data/lib/google/cloud/bigquery/job/list.rb +162 -0
  27. data/lib/google/cloud/bigquery/load_job.rb +1704 -0
  28. data/lib/google/cloud/bigquery/model.rb +740 -0
  29. data/lib/google/cloud/bigquery/model/list.rb +164 -0
  30. data/lib/google/cloud/bigquery/project.rb +1655 -0
  31. data/lib/google/cloud/bigquery/project/list.rb +161 -0
  32. data/lib/google/cloud/bigquery/query_job.rb +1695 -0
  33. data/lib/google/cloud/bigquery/routine.rb +1108 -0
  34. data/lib/google/cloud/bigquery/routine/list.rb +165 -0
  35. data/lib/google/cloud/bigquery/schema.rb +564 -0
  36. data/lib/google/cloud/bigquery/schema/field.rb +668 -0
  37. data/lib/google/cloud/bigquery/service.rb +589 -0
  38. data/lib/google/cloud/bigquery/standard_sql.rb +495 -0
  39. data/lib/google/cloud/bigquery/table.rb +3340 -0
  40. data/lib/google/cloud/bigquery/table/async_inserter.rb +520 -0
  41. data/lib/google/cloud/bigquery/table/list.rb +172 -0
  42. data/lib/google/cloud/bigquery/time.rb +65 -0
  43. data/lib/google/cloud/bigquery/version.rb +22 -0
  44. metadata +297 -0
@@ -0,0 +1,589 @@
+ # Copyright 2015 Google LLC
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # https://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+
+ require "google/cloud/bigquery/version"
+ require "google/cloud/bigquery/convert"
+ require "google/cloud/errors"
+ require "google/apis/bigquery_v2"
+ require "pathname"
+ require "securerandom"
+ require "mini_mime"
+ require "date"
+
+ module Google
+   module Cloud
+     module Bigquery
+       ##
+       # @private Represents the BigQuery service and API calls.
+       class Service
+         ##
+         # Alias to the Google Client API module
+         API = Google::Apis::BigqueryV2
+
+         # @private
+         attr_accessor :project
+
+         # @private
+         attr_accessor :credentials
+
+         # @private
+         attr_reader :retries, :timeout, :host
+
+         ##
+         # Creates a new Service instance.
+         def initialize project, credentials, retries: nil, timeout: nil, host: nil
+           @project = project
+           @credentials = credentials
+           @retries = retries
+           @timeout = timeout
+           @host = host
+         end
+
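+         ##
+         # Builds and memoizes the underlying BigqueryService client. Client-level
+         # retries are disabled here because #execute (below) applies its own
+         # backoff for idempotent calls.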
+         def service
+           return mocked_service if mocked_service
+           @service ||= begin
+             service = API::BigqueryService.new
+             service.client_options.application_name = "gcloud-ruby"
+             service.client_options.application_version = Google::Cloud::Bigquery::VERSION
+             service.client_options.open_timeout_sec = timeout
+             service.client_options.read_timeout_sec = timeout
+             service.client_options.send_timeout_sec = timeout
+             service.request_options.retries = 0 # handle retries in #execute
+             service.request_options.header ||= {}
+             service.request_options.header["x-goog-api-client"] = \
+               "gl-ruby/#{RUBY_VERSION} gccl/#{Google::Cloud::Bigquery::VERSION}"
+             service.authorization = @credentials.client
+             service.root_url = host if host
+             service
+           end
+         end
+         attr_accessor :mocked_service
+
+         def project_service_account
+           service.get_project_service_account project
+         end
+
+         ##
+         # Lists all datasets in the specified project to which you have
+         # been granted the READER dataset role.
+         def list_datasets all: nil, filter: nil, max: nil, token: nil
+           # The list operation is considered idempotent
+           execute backoff: true do
+             service.list_datasets @project, all: all, filter: filter, max_results: max, page_token: token
+           end
+         end
+
+         ##
+         # Returns the dataset specified by datasetId.
+         def get_dataset dataset_id
+           # The get operation is considered idempotent
+           execute backoff: true do
+             service.get_dataset @project, dataset_id
+           end
+         end
+
+         ##
+         # Creates a new empty dataset.
+         def insert_dataset new_dataset_gapi
+           execute { service.insert_dataset @project, new_dataset_gapi }
+         end
+
+         ##
+         # Updates information in an existing dataset, only replacing
+         # fields that are provided in the submitted dataset resource.
+         def patch_dataset dataset_id, patched_dataset_gapi
+           patch_with_backoff = false
+           options = {}
+           if patched_dataset_gapi.etag
+             options[:header] = { "If-Match" => patched_dataset_gapi.etag }
+             # The patch with etag operation is considered idempotent
+             patch_with_backoff = true
+           end
+           execute backoff: patch_with_backoff do
+             service.patch_dataset @project, dataset_id, patched_dataset_gapi, options: options
+           end
+         end
+
+         ##
+         # Deletes the dataset specified by the datasetId value.
+         # Before you can delete a dataset, you must delete all its tables,
+         # either manually or by specifying force: true in options.
+         # Immediately after deletion, you can create another dataset with
+         # the same name.
+         def delete_dataset dataset_id, force = nil
+           execute do
+             service.delete_dataset @project, dataset_id, delete_contents: force
+           end
+         end
+
+         ##
+         # Lists all tables in the specified dataset.
+         # Requires the READER dataset role.
+         def list_tables dataset_id, max: nil, token: nil
+           # The list operation is considered idempotent
+           execute backoff: true do
+             service.list_tables @project, dataset_id, max_results: max, page_token: token
+           end
+         end
+
+         def get_project_table project_id, dataset_id, table_id
+           # The get operation is considered idempotent
+           execute backoff: true do
+             service.get_table project_id, dataset_id, table_id
+           end
+         end
+
+         ##
+         # Gets the specified table resource by table ID.
+         # This method does not return the data in the table,
+         # it only returns the table resource,
+         # which describes the structure of this table.
+         def get_table dataset_id, table_id
+           # The get operation is considered idempotent
+           execute backoff: true do
+             get_project_table @project, dataset_id, table_id
+           end
+         end
+
+         ##
+         # Creates a new, empty table in the dataset.
+         def insert_table dataset_id, new_table_gapi
+           execute { service.insert_table @project, dataset_id, new_table_gapi }
+         end
+
+         ##
+         # Updates information in an existing table, replacing fields that
+         # are provided in the submitted table resource.
+         def patch_table dataset_id, table_id, patched_table_gapi
+           patch_with_backoff = false
+           options = {}
+           if patched_table_gapi.etag
+             options[:header] = { "If-Match" => patched_table_gapi.etag }
+             # The patch with etag operation is considered idempotent
+             patch_with_backoff = true
+           end
+           execute backoff: patch_with_backoff do
+             service.patch_table @project, dataset_id, table_id, patched_table_gapi, options: options
+           end
+         end
+
+         ##
+         # Deletes the table specified by tableId from the dataset.
+         # If the table contains data, all the data will be deleted.
+         def delete_table dataset_id, table_id
+           execute { service.delete_table @project, dataset_id, table_id }
+         end
+
+         ##
+         # Retrieves data from the table.
+         def list_tabledata dataset_id, table_id, max: nil, token: nil, start: nil
+           # The list operation is considered idempotent
+           execute backoff: true do
+             json_txt = service.list_table_data \
+               @project, dataset_id, table_id,
+               max_results: max,
+               page_token: token,
+               start_index: start,
+               options: { skip_deserialization: true }
+             JSON.parse json_txt, symbolize_names: true
+           end
+         end
+
+         def insert_tabledata dataset_id, table_id, rows, insert_ids: nil, ignore_unknown: nil, skip_invalid: nil
+           json_rows = Array(rows).map { |row| Convert.to_json_row row }
+           insert_tabledata_json_rows dataset_id, table_id, json_rows, insert_ids: insert_ids,
+                                                                       ignore_unknown: ignore_unknown,
+                                                                       skip_invalid: skip_invalid
+         end
+
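+         # Pairs each row with an insertId: a caller-supplied ID, a random UUID
+         # when none is given, or no insertId at all when :skip is passed.
+         # BigQuery uses the insertId for best-effort de-duplication on retries.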
+         def insert_tabledata_json_rows dataset_id, table_id, json_rows, insert_ids: nil, ignore_unknown: nil,
+                                        skip_invalid: nil
+           rows_and_ids = Array(json_rows).zip Array(insert_ids)
+           insert_rows = rows_and_ids.map do |json_row, insert_id|
+             if insert_id == :skip
+               { json: json_row }
+             else
+               insert_id ||= SecureRandom.uuid
+               {
+                 insertId: insert_id,
+                 json: json_row
+               }
+             end
+           end
+
+           insert_req = {
+             rows: insert_rows,
+             ignoreUnknownValues: ignore_unknown,
+             skipInvalidRows: skip_invalid
+           }.to_json
+
+           # The insertAll with insertId operation is considered idempotent
+           execute backoff: true do
+             service.insert_all_table_data(
+               @project, dataset_id, table_id, insert_req,
+               options: { skip_serialization: true }
+             )
+           end
+         end
+
+         ##
+         # Lists all models in the specified dataset.
+         # Requires the READER dataset role.
+         def list_models dataset_id, max: nil, token: nil
+           options = { skip_deserialization: true }
+           # The list operation is considered idempotent
+           execute backoff: true do
+             json_txt = service.list_models @project, dataset_id, max_results: max, page_token: token, options: options
+             JSON.parse json_txt, symbolize_names: true
+           end
+         end
+
+         # Gets the specified model resource by model ID.
+         # This method does not return the data in the model,
+         # it only returns the model resource,
+         # which describes the structure of this model.
+         def get_model dataset_id, model_id
+           # The get operation is considered idempotent
+           execute backoff: true do
+             json_txt = service.get_model @project, dataset_id, model_id, options: { skip_deserialization: true }
+             JSON.parse json_txt, symbolize_names: true
+           end
+         end
+
+         ##
+         # Updates information in an existing model, replacing fields that
+         # are provided in the submitted model resource.
+         def patch_model dataset_id, model_id, patched_model_gapi, etag = nil
+           patch_with_backoff = false
+           options = { skip_deserialization: true }
+           if etag
+             options[:header] = { "If-Match" => etag }
+             # The patch with etag operation is considered idempotent
+             patch_with_backoff = true
+           end
+           execute backoff: patch_with_backoff do
+             json_txt = service.patch_model @project, dataset_id, model_id, patched_model_gapi, options: options
+             JSON.parse json_txt, symbolize_names: true
+           end
+         end
+
+         ##
+         # Deletes the model specified by modelId from the dataset.
+         # If the model contains data, all the data will be deleted.
+         def delete_model dataset_id, model_id
+           execute { service.delete_model @project, dataset_id, model_id }
+         end
+
+         ##
+         # Creates a new routine in the dataset.
+         def insert_routine dataset_id, new_routine_gapi
+           execute { service.insert_routine @project, dataset_id, new_routine_gapi }
+         end
+
+         ##
+         # Lists all routines in the specified dataset.
+         # Requires the READER dataset role.
+         # Unless readMask is set in the request, only the following fields are populated:
+         # etag, projectId, datasetId, routineId, routineType, creationTime, lastModifiedTime, and language.
+         def list_routines dataset_id, max: nil, token: nil, filter: nil
+           # The list operation is considered idempotent
+           execute backoff: true do
+             service.list_routines @project, dataset_id, max_results: max,
+                                                         page_token: token,
+                                                         filter: filter
+           end
+         end
+
+         ##
+         # Gets the specified routine resource by routine ID.
+         def get_routine dataset_id, routine_id
+           # The get operation is considered idempotent
+           execute backoff: true do
+             service.get_routine @project, dataset_id, routine_id
+           end
+         end
+
+         ##
+         # Updates information in an existing routine, replacing the entire routine resource.
+         def update_routine dataset_id, routine_id, new_routine_gapi
+           update_with_backoff = false
+           options = {}
+           if new_routine_gapi.etag
+             options[:header] = { "If-Match" => new_routine_gapi.etag }
+             # The update with etag operation is considered idempotent
+             update_with_backoff = true
+           end
+           execute backoff: update_with_backoff do
+             service.update_routine @project, dataset_id, routine_id, new_routine_gapi, options: options
+           end
+         end
+
+         ##
+         # Deletes the routine specified by routine_id from the dataset.
+         def delete_routine dataset_id, routine_id
+           execute { service.delete_routine @project, dataset_id, routine_id }
+         end
+
+         ##
+         # Lists all jobs in the specified project to which you have
+         # been granted the READER job role.
+         def list_jobs all: nil, token: nil, max: nil, filter: nil, min_created_at: nil, max_created_at: nil,
+                       parent_job_id: nil
+           # The list operation is considered idempotent
+           min_creation_time = Convert.time_to_millis min_created_at
+           max_creation_time = Convert.time_to_millis max_created_at
+           execute backoff: true do
+             service.list_jobs @project, all_users: all, max_results: max,
+                                         page_token: token, projection: "full", state_filter: filter,
+                                         min_creation_time: min_creation_time, max_creation_time: max_creation_time,
+                                         parent_job_id: parent_job_id
+           end
+         end
+
+         ##
+         # Cancels the job specified by jobId.
+         def cancel_job job_id, location: nil
+           # The BigQuery team has told us cancelling is considered idempotent
+           execute backoff: true do
+             service.cancel_job @project, job_id, location: location
+           end
+         end
+
+         ##
+         # Returns the job specified by jobId.
+         def get_job job_id, location: nil
+           # The get operation is considered idempotent
+           execute backoff: true do
+             service.get_job @project, job_id, location: location
+           end
+         end
+
+         def insert_job config, location: nil
+           job_object = API::Job.new job_reference: job_ref_from(nil, nil, location: location), configuration: config
+           # Jobs have generated id, so this operation is considered idempotent
+           execute backoff: true do
+             service.insert_job @project, job_object
+           end
+         end
+
+         def query_job query_job_gapi
+           execute backoff: true do
+             service.insert_job @project, query_job_gapi
+           end
+         end
+
+         ##
+         # Returns the query data for the job
+         def job_query_results job_id, location: nil, max: nil, token: nil, start: nil, timeout: nil
+           # The get operation is considered idempotent
+           execute backoff: true do
+             service.get_job_query_results @project, job_id,
+                                           location: location,
+                                           max_results: max,
+                                           page_token: token,
+                                           start_index: start,
+                                           timeout_ms: timeout
+           end
+         end
+
+         def copy_table copy_job_gapi
+           execute backoff: true do
+             service.insert_job @project, copy_job_gapi
+           end
+         end
+
+         def extract_table extract_job_gapi
+           execute backoff: true do
+             service.insert_job @project, extract_job_gapi
+           end
+         end
+
+         def load_table_gs_url load_job_gapi
+           execute backoff: true do
+             service.insert_job @project, load_job_gapi
+           end
+         end
+
+         def load_table_file file, load_job_gapi
+           execute backoff: true do
+             service.insert_job @project, load_job_gapi, upload_source: file, content_type: mime_type_for(file)
+           end
+         end
+
+         def self.get_table_ref table, default_ref: nil
+           if table.respond_to? :table_ref
+             table.table_ref
+           else
+             table_ref_from_s table, default_ref: default_ref
+           end
+         end
+
+         ##
+         # Extracts at least a `tbl` group, and possibly `dts` and `prj` groups,
+         # from strings in the formats "my_table", "my_dataset.my_table", or
+         # "my-project:my_dataset.my_table", then merges in project_id and
+         # dataset_id from the default table ref if they are missing.
+         #
+         # The regex matches both Standard SQL
+         # ("bigquery-public-data.samples.shakespeare") and Legacy SQL
+         # ("bigquery-public-data:samples.shakespeare").
+         def self.table_ref_from_s str, default_ref: {}
+           str = str.to_s
+           m = /\A(((?<prj>\S*)(:|\.))?(?<dts>\S*)\.)?(?<tbl>\S*)\z/.match str
+           raise ArgumentError, "unable to identify table from #{str.inspect}" unless m
+           str_table_ref_hash = {
+             project_id: m["prj"],
+             dataset_id: m["dts"],
+             table_id: m["tbl"]
+           }.delete_if { |_, v| v.nil? }
+           str_table_ref_hash = default_ref.to_h.merge str_table_ref_hash
+           ref = Google::Apis::BigqueryV2::TableReference.new str_table_ref_hash
+           validate_table_ref ref
+           ref
+         end
+
+         def self.validate_table_ref table_ref
+           [:project_id, :dataset_id, :table_id].each do |f|
+             raise ArgumentError, "TableReference is missing #{f}" if table_ref.send(f).nil?
+           end
+         end
+
+         ##
+         # Lists all projects to which you have been granted any project role.
+         def list_projects max: nil, token: nil
+           execute backoff: true do
+             service.list_projects max_results: max, page_token: token
+           end
+         end
+
+         # If no job_id or prefix is given, always generate a client-side job ID
+         # anyway, for idempotent retry in the google-api-client layer.
+         # See https://cloud.google.com/bigquery/docs/managing-jobs#generate-jobid
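+         # Generated IDs are the prefix (default "job_") followed by 28 random
+         # URL-safe Base64 characters from #generate_id below.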
+         def job_ref_from job_id, prefix, location: nil
+           prefix ||= "job_"
+           job_id ||= "#{prefix}#{generate_id}"
+           job_ref = API::JobReference.new project_id: @project, job_id: job_id
+           # BigQuery does not allow nil location, but missing is ok.
+           job_ref.location = location if location
+           job_ref
+         end
+
+         # Returns a DatasetReference API object for the given dataset, which may
+         # be an object that responds to #dataset_id or a plain dataset ID string.
+         def dataset_ref_from dts, pjt = nil
+           return nil if dts.nil?
+           if dts.respond_to? :dataset_id
+             Google::Apis::BigqueryV2::DatasetReference.new(
+               project_id: (pjt || dts.project_id || @project),
+               dataset_id: dts.dataset_id
+             )
+           else
+             Google::Apis::BigqueryV2::DatasetReference.new(
+               project_id: (pjt || @project),
+               dataset_id: dts
+             )
+           end
+         end
+
+         def inspect
+           "#{self.class}(#{@project})"
+         end
+
+         protected
+
+         # Generates a random string similar to the BigQuery service job IDs.
+         def generate_id
+           SecureRandom.urlsafe_base64 21
+         end
+
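+         # Looks up the MIME type from the file name, e.g. "data.csv" yields
+         # "text/csv"; returns nil when the type is unknown or lookup fails.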
+         def mime_type_for file
+           mime_type = MiniMime.lookup_by_filename Pathname(file).to_path
+           return nil if mime_type.nil?
+           mime_type.content_type
+         rescue StandardError
+           nil
+         end
+
+         def execute backoff: nil
+           if backoff
+             Backoff.new(retries: retries).execute { yield }
+           else
+             yield
+           end
+         rescue Google::Apis::Error => e
+           raise Google::Cloud::Error.from_error e
+         end
+
+         class Backoff
+           class << self
+             attr_accessor :retries
+             attr_accessor :reasons
+             attr_accessor :backoff
+           end
+           self.retries = 5
+           self.reasons = ["rateLimitExceeded", "backendError"]
+           self.backoff = lambda do |retries|
+             # Max delay is 32 seconds
+             # See "Back-off Requirements" here:
+             # https://cloud.google.com/bigquery/sla
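+             # Delays double with each attempt: 1s, 2s, 4s, 8s, 16s, then 32s.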
+             retries = 5 if retries > 5
+             delay = 2**retries
+             sleep delay
+           end
+
+           def initialize retries: nil, reasons: nil, backoff: nil
+             @retries = (retries || Backoff.retries).to_i
+             @reasons = (reasons || Backoff.reasons).to_a
+             @backoff = backoff || Backoff.backoff
+           end
+
+           def execute
+             current_retries = 0
+             loop do
+               begin
+                 return yield
+               rescue Google::Apis::Error => e
+                 raise e unless retry? e.body, current_retries
+
+                 @backoff.call current_retries
+                 current_retries += 1
+               end
+             end
+           end
+
+           protected
+
+           def retry? result, current_retries #:nodoc:
+             if current_retries < @retries
+               return true if retry_error_reason? result
+             end
+             false
+           end
+
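+           # Retries only when every error in the response body carries a
+           # retryable reason (by default "rateLimitExceeded" or "backendError").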
+           def retry_error_reason? err_body
+             err_hash = JSON.parse err_body
+             json_errors = Array err_hash["error"]["errors"]
+             return false if json_errors.empty?
+             json_errors.each do |json_error|
+               return false unless @reasons.include? json_error["reason"]
+             end
+             true
+           rescue StandardError
+             false
+           end
+         end
+       end
+     end
+   end
+ end
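
Since Service is marked @private, applications reach it through the public client rather than calling it directly. A minimal sketch of that call path, with "my-project" and "my_dataset" as placeholder names:

    require "google/cloud/bigquery"

    # Public entry point; builds a Service internally with the project,
    # credentials, retries, and timeout settings.
    bigquery = Google::Cloud::Bigquery.new project_id: "my-project"

    # Delegates to Service#get_dataset, which wraps the API call in
    # execute(backoff: true) because GET is idempotent.
    dataset = bigquery.dataset "my_dataset"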