google-cloud-bigquery 1.21.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. checksums.yaml +7 -0
  2. data/.yardopts +16 -0
  3. data/AUTHENTICATION.md +158 -0
  4. data/CHANGELOG.md +397 -0
  5. data/CODE_OF_CONDUCT.md +40 -0
  6. data/CONTRIBUTING.md +188 -0
  7. data/LICENSE +201 -0
  8. data/LOGGING.md +27 -0
  9. data/OVERVIEW.md +463 -0
  10. data/TROUBLESHOOTING.md +31 -0
  11. data/lib/google-cloud-bigquery.rb +139 -0
  12. data/lib/google/cloud/bigquery.rb +145 -0
  13. data/lib/google/cloud/bigquery/argument.rb +197 -0
  14. data/lib/google/cloud/bigquery/convert.rb +383 -0
  15. data/lib/google/cloud/bigquery/copy_job.rb +316 -0
  16. data/lib/google/cloud/bigquery/credentials.rb +50 -0
  17. data/lib/google/cloud/bigquery/data.rb +526 -0
  18. data/lib/google/cloud/bigquery/dataset.rb +2845 -0
  19. data/lib/google/cloud/bigquery/dataset/access.rb +1021 -0
  20. data/lib/google/cloud/bigquery/dataset/list.rb +162 -0
  21. data/lib/google/cloud/bigquery/encryption_configuration.rb +123 -0
  22. data/lib/google/cloud/bigquery/external.rb +2432 -0
  23. data/lib/google/cloud/bigquery/extract_job.rb +368 -0
  24. data/lib/google/cloud/bigquery/insert_response.rb +180 -0
  25. data/lib/google/cloud/bigquery/job.rb +657 -0
  26. data/lib/google/cloud/bigquery/job/list.rb +162 -0
  27. data/lib/google/cloud/bigquery/load_job.rb +1704 -0
  28. data/lib/google/cloud/bigquery/model.rb +740 -0
  29. data/lib/google/cloud/bigquery/model/list.rb +164 -0
  30. data/lib/google/cloud/bigquery/project.rb +1655 -0
  31. data/lib/google/cloud/bigquery/project/list.rb +161 -0
  32. data/lib/google/cloud/bigquery/query_job.rb +1695 -0
  33. data/lib/google/cloud/bigquery/routine.rb +1108 -0
  34. data/lib/google/cloud/bigquery/routine/list.rb +165 -0
  35. data/lib/google/cloud/bigquery/schema.rb +564 -0
  36. data/lib/google/cloud/bigquery/schema/field.rb +668 -0
  37. data/lib/google/cloud/bigquery/service.rb +589 -0
  38. data/lib/google/cloud/bigquery/standard_sql.rb +495 -0
  39. data/lib/google/cloud/bigquery/table.rb +3340 -0
  40. data/lib/google/cloud/bigquery/table/async_inserter.rb +520 -0
  41. data/lib/google/cloud/bigquery/table/list.rb +172 -0
  42. data/lib/google/cloud/bigquery/time.rb +65 -0
  43. data/lib/google/cloud/bigquery/version.rb +22 -0
  44. metadata +297 -0
data/lib/google/cloud/bigquery/service.rb
@@ -0,0 +1,589 @@
+ # Copyright 2015 Google LLC
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     https://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+
+ require "google/cloud/bigquery/version"
+ require "google/cloud/bigquery/convert"
+ require "google/cloud/errors"
+ require "google/apis/bigquery_v2"
+ require "pathname"
+ require "securerandom"
+ require "mini_mime"
+ require "date"
+
+ module Google
+   module Cloud
+     module Bigquery
+       ##
+       # @private Represents the Bigquery service and API calls.
+       class Service
+         ##
+         # Alias to the Google Client API module
+         API = Google::Apis::BigqueryV2
+
+         # @private
+         attr_accessor :project
+
+         # @private
+         attr_accessor :credentials
+
+         # @private
+         attr_reader :retries, :timeout, :host
+
+         ##
+         # Creates a new Service instance.
+         def initialize project, credentials, retries: nil, timeout: nil, host: nil
+           @project = project
+           @credentials = credentials
+           @retries = retries
+           @timeout = timeout
+           @host = host
+         end
+
+         def service
+           return mocked_service if mocked_service
+           @service ||= begin
+             service = API::BigqueryService.new
+             service.client_options.application_name = "gcloud-ruby"
+             service.client_options.application_version = Google::Cloud::Bigquery::VERSION
+             service.client_options.open_timeout_sec = timeout
+             service.client_options.read_timeout_sec = timeout
+             service.client_options.send_timeout_sec = timeout
+             service.request_options.retries = 0 # handle retries in #execute
+             service.request_options.header ||= {}
+             service.request_options.header["x-goog-api-client"] = \
+               "gl-ruby/#{RUBY_VERSION} gccl/#{Google::Cloud::Bigquery::VERSION}"
+             service.authorization = @credentials.client
+             service.root_url = host if host
+             service
+           end
+         end
+         attr_accessor :mocked_service
+
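As an aside, a minimal sketch of how the gem's upper layers construct this wrapper; the project name and option values here are hypothetical, and the real call sites live in google-cloud-bigquery.rb and project.rb:

    # Hypothetical construction; Service expects a credentials object that responds to #client.
    creds = Google::Cloud::Bigquery::Credentials.default
    service = Google::Cloud::Bigquery::Service.new "my-project", creds, retries: 3, timeout: 60
    bq_api = service.service # lazily builds and caches the configured API::BigqueryService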
+         def project_service_account
+           service.get_project_service_account project
+         end
+
+         ##
+         # Lists all datasets in the specified project to which you have
+         # been granted the READER dataset role.
+         def list_datasets all: nil, filter: nil, max: nil, token: nil
+           # The list operation is considered idempotent
+           execute backoff: true do
+             service.list_datasets @project, all: all, filter: filter, max_results: max, page_token: token
+           end
+         end
+
+         ##
+         # Returns the dataset specified by datasetID.
+         def get_dataset dataset_id
+           # The get operation is considered idempotent
+           execute backoff: true do
+             service.get_dataset @project, dataset_id
+           end
+         end
+
+         ##
+         # Creates a new empty dataset.
+         def insert_dataset new_dataset_gapi
+           execute { service.insert_dataset @project, new_dataset_gapi }
+         end
+
+         ##
+         # Updates information in an existing dataset, only replacing
+         # fields that are provided in the submitted dataset resource.
+         def patch_dataset dataset_id, patched_dataset_gapi
+           patch_with_backoff = false
+           options = {}
+           if patched_dataset_gapi.etag
+             options[:header] = { "If-Match" => patched_dataset_gapi.etag }
+             # The patch with etag operation is considered idempotent
+             patch_with_backoff = true
+           end
+           execute backoff: patch_with_backoff do
+             service.patch_dataset @project, dataset_id, patched_dataset_gapi, options: options
+           end
+         end
+
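Note the pattern in patch_dataset (repeated below in patch_table, patch_model, and update_routine): retry with backoff is only enabled when the resource carries an etag, because the If-Match header turns the PATCH into a conditional request that the server rejects (HTTP 412) rather than re-applies if the first attempt already succeeded. A schematic sketch of that decision, with a hypothetical gapi object:

    options = {}
    options[:header] = { "If-Match" => gapi.etag } if gapi.etag
    retry_safe = !options.empty? # only back off and retry when the request is conditional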
+         ##
+         # Deletes the dataset specified by the datasetId value.
+         # Before you can delete a dataset, you must delete all its tables,
+         # either manually or by specifying force: true in options.
+         # Immediately after deletion, you can create another dataset with
+         # the same name.
+         def delete_dataset dataset_id, force = nil
+           execute do
+             service.delete_dataset @project, dataset_id, delete_contents: force
+           end
+         end
+
+         ##
+         # Lists all tables in the specified dataset.
+         # Requires the READER dataset role.
+         def list_tables dataset_id, max: nil, token: nil
+           # The list operation is considered idempotent
+           execute backoff: true do
+             service.list_tables @project, dataset_id, max_results: max, page_token: token
+           end
+         end
+
+         def get_project_table project_id, dataset_id, table_id
+           # The get operation is considered idempotent
+           execute backoff: true do
+             service.get_table project_id, dataset_id, table_id
+           end
+         end
+
+         ##
+         # Gets the specified table resource by table ID.
+         # This method does not return the data in the table,
+         # it only returns the table resource,
+         # which describes the structure of this table.
+         def get_table dataset_id, table_id
+           # The get operation is considered idempotent
+           execute backoff: true do
+             get_project_table @project, dataset_id, table_id
+           end
+         end
+
+         ##
+         # Creates a new, empty table in the dataset.
+         def insert_table dataset_id, new_table_gapi
+           execute { service.insert_table @project, dataset_id, new_table_gapi }
+         end
+
+         ##
+         # Updates information in an existing table, replacing fields that
+         # are provided in the submitted table resource.
+         def patch_table dataset_id, table_id, patched_table_gapi
+           patch_with_backoff = false
+           options = {}
+           if patched_table_gapi.etag
+             options[:header] = { "If-Match" => patched_table_gapi.etag }
+             # The patch with etag operation is considered idempotent
+             patch_with_backoff = true
+           end
+           execute backoff: patch_with_backoff do
+             service.patch_table @project, dataset_id, table_id, patched_table_gapi, options: options
+           end
+         end
+
+         ##
+         # Deletes the table specified by tableId from the dataset.
+         # If the table contains data, all the data will be deleted.
+         def delete_table dataset_id, table_id
+           execute { service.delete_table @project, dataset_id, table_id }
+         end
+
+         ##
+         # Retrieves data from the table.
+         def list_tabledata dataset_id, table_id, max: nil, token: nil, start: nil
+           # The list operation is considered idempotent
+           execute backoff: true do
+             json_txt = service.list_table_data \
+               @project, dataset_id, table_id,
+               max_results: max,
+               page_token: token,
+               start_index: start,
+               options: { skip_deserialization: true }
+             JSON.parse json_txt, symbolize_names: true
+           end
+         end
+
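Here skip_deserialization: true tells google-api-client to hand back the raw JSON response body instead of instantiating API::TableDataList objects; parsing it directly with symbolized keys is markedly cheaper for large pages of rows. A sketch of the resulting shape, with made-up values:

    data = JSON.parse json_txt, symbolize_names: true
    data[:rows].first[:f].map { |cell| cell[:v] } # => e.g. ["alice", "42"]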
+         def insert_tabledata dataset_id, table_id, rows, insert_ids: nil, ignore_unknown: nil, skip_invalid: nil
+           json_rows = Array(rows).map { |row| Convert.to_json_row row }
+           insert_tabledata_json_rows dataset_id, table_id, json_rows, insert_ids: insert_ids,
+                                      ignore_unknown: ignore_unknown,
+                                      skip_invalid: skip_invalid
+         end
+
+         def insert_tabledata_json_rows dataset_id, table_id, json_rows, insert_ids: nil, ignore_unknown: nil,
+                                        skip_invalid: nil
+           rows_and_ids = Array(json_rows).zip Array(insert_ids)
+           insert_rows = rows_and_ids.map do |json_row, insert_id|
+             if insert_id == :skip
+               { json: json_row }
+             else
+               insert_id ||= SecureRandom.uuid
+               {
+                 insertId: insert_id,
+                 json: json_row
+               }
+             end
+           end
+
+           insert_req = {
+             rows: insert_rows,
+             ignoreUnknownValues: ignore_unknown,
+             skipInvalidRows: skip_invalid
+           }.to_json
+
+           # The insertAll with insertId operation is considered idempotent
+           execute backoff: true do
+             service.insert_all_table_data(
+               @project, dataset_id, table_id, insert_req,
+               options: { skip_serialization: true }
+             )
+           end
+         end
+
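The zip above pairs each row with its insert ID: an explicit ID is sent as insertId, the :skip sentinel omits insertId entirely (opting that row out of best-effort dedup), and nil falls back to a fresh UUID. For illustration, with hypothetical rows:

    json_rows  = [{ name: "a" }, { name: "b" }, { name: "c" }]
    insert_ids = ["id-1", :skip, nil]
    # => rows sent as {insertId: "id-1", json: ...}, {json: ...}, {insertId: "<random uuid>", json: ...}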
+         ##
+         # Lists all models in the specified dataset.
+         # Requires the READER dataset role.
+         def list_models dataset_id, max: nil, token: nil
+           options = { skip_deserialization: true }
+           # The list operation is considered idempotent
+           execute backoff: true do
+             json_txt = service.list_models @project, dataset_id, max_results: max, page_token: token, options: options
+             JSON.parse json_txt, symbolize_names: true
+           end
+         end
+
+         # Gets the specified model resource by model ID.
+         # This method does not return the data in the model,
+         # it only returns the model resource,
+         # which describes the structure of this model.
+         def get_model dataset_id, model_id
+           # The get operation is considered idempotent
+           execute backoff: true do
+             json_txt = service.get_model @project, dataset_id, model_id, options: { skip_deserialization: true }
+             JSON.parse json_txt, symbolize_names: true
+           end
+         end
+
+         ##
+         # Updates information in an existing model, replacing fields that
+         # are provided in the submitted model resource.
+         def patch_model dataset_id, model_id, patched_model_gapi, etag = nil
+           patch_with_backoff = false
+           options = { skip_deserialization: true }
+           if etag
+             options[:header] = { "If-Match" => etag }
+             # The patch with etag operation is considered idempotent
+             patch_with_backoff = true
+           end
+           execute backoff: patch_with_backoff do
+             json_txt = service.patch_model @project, dataset_id, model_id, patched_model_gapi, options: options
+             JSON.parse json_txt, symbolize_names: true
+           end
+         end
+
+         ##
+         # Deletes the model specified by modelId from the dataset.
+         # If the model contains data, all the data will be deleted.
+         def delete_model dataset_id, model_id
+           execute { service.delete_model @project, dataset_id, model_id }
+         end
+
+         ##
+         # Creates a new routine in the dataset.
+         def insert_routine dataset_id, new_routine_gapi
+           execute { service.insert_routine @project, dataset_id, new_routine_gapi }
+         end
+
+         ##
+         # Lists all routines in the specified dataset.
+         # Requires the READER dataset role.
+         # Unless readMask is set in the request, only the following fields are populated:
+         # etag, projectId, datasetId, routineId, routineType, creationTime, lastModifiedTime, and language.
+         def list_routines dataset_id, max: nil, token: nil, filter: nil
+           # The list operation is considered idempotent
+           execute backoff: true do
+             service.list_routines @project, dataset_id, max_results: max,
+                                   page_token: token,
+                                   filter: filter
+           end
+         end
+
+         ##
+         # Gets the specified routine resource by routine ID.
+         def get_routine dataset_id, routine_id
+           # The get operation is considered idempotent
+           execute backoff: true do
+             service.get_routine @project, dataset_id, routine_id
+           end
+         end
+
+         ##
+         # Updates information in an existing routine, replacing the entire routine resource.
+         def update_routine dataset_id, routine_id, new_routine_gapi
+           update_with_backoff = false
+           options = {}
+           if new_routine_gapi.etag
+             options[:header] = { "If-Match" => new_routine_gapi.etag }
+             # The update with etag operation is considered idempotent
+             update_with_backoff = true
+           end
+           execute backoff: update_with_backoff do
+             service.update_routine @project, dataset_id, routine_id, new_routine_gapi, options: options
+           end
+         end
+
+         ##
+         # Deletes the routine specified by routine_id from the dataset.
+         def delete_routine dataset_id, routine_id
+           execute { service.delete_routine @project, dataset_id, routine_id }
+         end
+
+         ##
+         # Lists all jobs in the specified project to which you have
+         # been granted the READER job role.
+         def list_jobs all: nil, token: nil, max: nil, filter: nil, min_created_at: nil, max_created_at: nil,
+                       parent_job_id: nil
+           # The list operation is considered idempotent
+           min_creation_time = Convert.time_to_millis min_created_at
+           max_creation_time = Convert.time_to_millis max_created_at
+           execute backoff: true do
+             service.list_jobs @project, all_users: all, max_results: max,
+                               page_token: token, projection: "full", state_filter: filter,
+                               min_creation_time: min_creation_time, max_creation_time: max_creation_time,
+                               parent_job_id: parent_job_id
+           end
+         end
+
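Convert.time_to_millis (defined in convert.rb, also shipped in this release) normalizes the Time-like min_created_at/max_created_at values into the epoch-millisecond integers the Jobs API expects; roughly:

    t = Time.utc 2020, 5, 20
    (t.to_f * 1000).round # => 1589932800000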
+         ##
+         # Cancel the job specified by jobId.
+         def cancel_job job_id, location: nil
+           # The BigQuery team has told us cancelling is considered idempotent
+           execute backoff: true do
+             service.cancel_job @project, job_id, location: location
+           end
+         end
+
+         ##
+         # Returns the job specified by jobID.
+         def get_job job_id, location: nil
+           # The get operation is considered idempotent
+           execute backoff: true do
+             service.get_job @project, job_id, location: location
+           end
+         end
+
+         def insert_job config, location: nil
+           job_object = API::Job.new job_reference: job_ref_from(nil, nil, location: location), configuration: config
+           # Jobs have generated id, so this operation is considered idempotent
+           execute backoff: true do
+             service.insert_job @project, job_object
+           end
+         end
+
+         def query_job query_job_gapi
+           execute backoff: true do
+             service.insert_job @project, query_job_gapi
+           end
+         end
+
+         ##
+         # Returns the query data for the job
+         def job_query_results job_id, location: nil, max: nil, token: nil, start: nil, timeout: nil
+           # The get operation is considered idempotent
+           execute backoff: true do
+             service.get_job_query_results @project, job_id,
+                                           location: location,
+                                           max_results: max,
+                                           page_token: token,
+                                           start_index: start,
+                                           timeout_ms: timeout
+           end
+         end
+
+         def copy_table copy_job_gapi
+           execute backoff: true do
+             service.insert_job @project, copy_job_gapi
+           end
+         end
+
+         def extract_table extract_job_gapi
+           execute backoff: true do
+             service.insert_job @project, extract_job_gapi
+           end
+         end
+
+         def load_table_gs_url load_job_gapi
+           execute backoff: true do
+             service.insert_job @project, load_job_gapi
+           end
+         end
+
+         def load_table_file file, load_job_gapi
+           execute backoff: true do
+             service.insert_job @project, load_job_gapi, upload_source: file, content_type: mime_type_for(file)
+           end
+         end
+
+         def self.get_table_ref table, default_ref: nil
+           if table.respond_to? :table_ref
+             table.table_ref
+           else
+             table_ref_from_s table, default_ref: default_ref
+           end
+         end
+
+         ##
+         # Extracts at least `tbl` group, and possibly `dts` and `prj` groups,
+         # from strings in the formats: "my_table", "my_dataset.my_table", or
+         # "my-project:my_dataset.my_table". Then merges project_id and
+         # dataset_id from the default table ref if they are missing.
+         #
+         # The regex matches both Standard SQL
+         # ("bigquery-public-data.samples.shakespeare") and Legacy SQL
+         # ("bigquery-public-data:samples.shakespeare").
+         def self.table_ref_from_s str, default_ref: {}
+           str = str.to_s
+           m = /\A(((?<prj>\S*)(:|\.))?(?<dts>\S*)\.)?(?<tbl>\S*)\z/.match str
+           raise ArgumentError, "unable to identify table from #{str.inspect}" unless m
+           str_table_ref_hash = {
+             project_id: m["prj"],
+             dataset_id: m["dts"],
+             table_id: m["tbl"]
+           }.delete_if { |_, v| v.nil? }
+           str_table_ref_hash = default_ref.to_h.merge str_table_ref_hash
+           ref = Google::Apis::BigqueryV2::TableReference.new str_table_ref_hash
+           validate_table_ref ref
+           ref
+         end
+
+         def self.validate_table_ref table_ref
+           [:project_id, :dataset_id, :table_id].each do |f|
+             raise ArgumentError, "TableReference is missing #{f}" if table_ref.send(f).nil?
+           end
+         end
+
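Concretely, all three accepted spellings resolve to the same kind of TableReference, with missing parts merged in from default_ref:

    Service.table_ref_from_s "bigquery-public-data:samples.shakespeare"
    # => project_id: "bigquery-public-data", dataset_id: "samples", table_id: "shakespeare"
    Service.table_ref_from_s "my_table", default_ref: { project_id: "my-project", dataset_id: "my_dataset" }
    # => project_id: "my-project", dataset_id: "my_dataset", table_id: "my_table"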
+         ##
+         # Lists all projects to which you have been granted any project role.
+         def list_projects max: nil, token: nil
+           execute backoff: true do
+             service.list_projects max_results: max, page_token: token
+           end
+         end
+
+         # If no job_id or prefix is given, always generate a client-side job ID
+         # anyway, for idempotent retry in the google-api-client layer.
+         # See https://cloud.google.com/bigquery/docs/managing-jobs#generate-jobid
+         def job_ref_from job_id, prefix, location: nil
+           prefix ||= "job_"
+           job_id ||= "#{prefix}#{generate_id}"
+           job_ref = API::JobReference.new project_id: @project, job_id: job_id
+           # BigQuery does not allow nil location, but missing is ok.
+           job_ref.location = location if location
+           job_ref
+         end
+
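So a nil job_id comes back as something like job_<21 random URL-safe bytes>; because the client, not the server, picks the ID, resubmitting the same jobReference after a network failure is deduplicated instead of creating a second job. For example (the random suffixes are illustrative):

    job_ref_from nil, nil                       # => JobReference with job_id "job_x7f..."
    job_ref_from nil, "import_", location: "EU" # => JobReference with job_id "import_..." and location "EU"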
+         # API object for dataset.
+         def dataset_ref_from dts, pjt = nil
+           return nil if dts.nil?
+           if dts.respond_to? :dataset_id
+             Google::Apis::BigqueryV2::DatasetReference.new(
+               project_id: (pjt || dts.project_id || @project),
+               dataset_id: dts.dataset_id
+             )
+           else
+             Google::Apis::BigqueryV2::DatasetReference.new(
+               project_id: (pjt || @project),
+               dataset_id: dts
+             )
+           end
+         end
+
+         def inspect
+           "#{self.class}(#{@project})"
+         end
+
+         protected
+
+         # Generate a random string similar to the BigQuery service job IDs.
+         def generate_id
+           SecureRandom.urlsafe_base64 21
+         end
+
+         def mime_type_for file
+           mime_type = MiniMime.lookup_by_filename Pathname(file).to_path
+           return nil if mime_type.nil?
+           mime_type.content_type
+         rescue StandardError
+           nil
+         end
+
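mime_type_for leans on the mini_mime gem and deliberately degrades to nil on any failure, letting the upload fall back to a generic content type. For example:

    MiniMime.lookup_by_filename("rows.csv").content_type # => "text/csv"
    MiniMime.lookup_by_filename "Rakefile"               # => nil, so the method returns nil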
+         def execute backoff: nil
+           if backoff
+             Backoff.new(retries: retries).execute { yield }
+           else
+             yield
+           end
+         rescue Google::Apis::Error => e
+           raise Google::Cloud::Error.from_error e
+         end
+
+         class Backoff
+           class << self
+             attr_accessor :retries
+             attr_accessor :reasons
+             attr_accessor :backoff
+           end
+           self.retries = 5
+           self.reasons = ["rateLimitExceeded", "backendError"]
+           self.backoff = lambda do |retries|
+             # Max delay is 32 seconds
+             # See "Back-off Requirements" here:
+             # https://cloud.google.com/bigquery/sla
+             retries = 5 if retries > 5
+             delay = 2**retries
+             sleep delay
+           end
+
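With current_retries starting at 0, the default lambda sleeps 1, 2, 4, 8, then 16 seconds across the default five retries; the cap at 2**5 = 32 seconds only comes into play when a caller configures more retries:

    (0..6).map { |r| 2**[r, 5].min } # => [1, 2, 4, 8, 16, 32, 32]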
+           def initialize retries: nil, reasons: nil, backoff: nil
+             @retries = (retries || Backoff.retries).to_i
+             @reasons = (reasons || Backoff.reasons).to_a
+             @backoff = backoff || Backoff.backoff
+           end
+
+           def execute
+             current_retries = 0
+             loop do
+               begin
+                 return yield
+               rescue Google::Apis::Error => e
+                 raise e unless retry? e.body, current_retries
+
+                 @backoff.call current_retries
+                 current_retries += 1
+               end
+             end
+           end
+
+           protected
+
+           def retry? result, current_retries #:nodoc:
+             if current_retries < @retries
+               return true if retry_error_reason? result
+             end
+             false
+           end
+
+           def retry_error_reason? err_body
+             err_hash = JSON.parse err_body
+             json_errors = Array err_hash["error"]["errors"]
+             return false if json_errors.empty?
+             json_errors.each do |json_error|
+               return false unless @reasons.include? json_error["reason"]
+             end
+             true
+           rescue StandardError
+             false
+           end
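Every error in the payload must carry an allow-listed reason; a single non-retryable entry vetoes the retry. Two illustrative bodies:

    retry_error_reason? '{"error":{"errors":[{"reason":"backendError"}]}}'
    # => true
    retry_error_reason? '{"error":{"errors":[{"reason":"backendError"},{"reason":"invalid"}]}}'
    # => false ("invalid" is not in reasons)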
+         end
+       end
+     end
+   end
+ end