google-cloud-bigquery 0.20.0

# Copyright 2015 Google Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


require "google/cloud/bigquery/version"
require "google/cloud/errors"
require "google/apis/bigquery_v2"
require "pathname"
require "digest/md5"
require "mime/types"

module Google
  module Cloud
    module Bigquery
      ##
      # @private Represents the Bigquery service and API calls.
      class Service
        ##
        # Alias to the Google Client API module
        API = Google::Apis::BigqueryV2

        # @private
        attr_accessor :project

        # @private
        attr_accessor :credentials

        ##
        # Creates a new Service instance.
        def initialize project, credentials, retries: nil, timeout: nil
          @project = project
          @credentials = credentials
          @service = API::BigqueryService.new
          @service.client_options.application_name = "google-cloud-bigquery"
          @service.client_options.application_version = \
            Google::Cloud::Bigquery::VERSION
          @service.request_options.retries = retries || 3
          @service.request_options.timeout_sec = timeout if timeout
          @service.authorization = @credentials.client
        end
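
        # A minimal construction sketch (not part of the original source; the
        # project id, keyfile path, and option values are illustrative). The
        # credentials object is only assumed to respond to #client with an
        # authorized client, as #initialize above requires.
        #
        #   creds   = Google::Cloud::Bigquery::Credentials.new "/path/to/keyfile.json"
        #   service = Google::Cloud::Bigquery::Service.new "my-project", creds,
        #                                                  retries: 5, timeout: 120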

        def service
          return mocked_service if mocked_service
          @service
        end
        attr_accessor :mocked_service

        ##
        # Lists all datasets in the specified project to which you have
        # been granted the READER dataset role.
        def list_datasets options = {}
          execute do
            service.list_datasets \
              @project, all: options[:all], max_results: options[:max],
              page_token: options[:token]
          end
        end
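
        # Illustrative paging call (hypothetical values); the returned list
        # object is assumed to expose next_page_token, per the BigQuery API:
        #
        #   page = service.list_datasets max: 50
        #   more = service.list_datasets max: 50, token: page.next_page_token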

        ##
        # Returns the dataset specified by datasetID.
        def get_dataset dataset_id
          execute { service.get_dataset @project, dataset_id }
        end

        ##
        # Creates a new empty dataset.
        def insert_dataset new_dataset_gapi
          execute { service.insert_dataset @project, new_dataset_gapi }
        end

        ##
        # Updates information in an existing dataset, only replacing
        # fields that are provided in the submitted dataset resource.
        def patch_dataset dataset_id, patched_dataset_gapi
          execute do
            service.patch_dataset @project, dataset_id, patched_dataset_gapi
          end
        end

        ##
        # Deletes the dataset specified by the datasetId value.
        # Before you can delete a dataset, you must delete all its tables,
        # either manually or by specifying force: true in options.
        # Immediately after deletion, you can create another dataset with
        # the same name.
        def delete_dataset dataset_id, force = nil
          execute do
            service.delete_dataset @project, dataset_id, delete_contents: force
          end
        end
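
        # For example (hypothetical dataset id), removing a dataset together
        # with any tables it still contains:
        #
        #   service.delete_dataset "my_dataset", true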

        ##
        # Lists all tables in the specified dataset.
        # Requires the READER dataset role.
        def list_tables dataset_id, options = {}
          execute do
            service.list_tables @project, dataset_id,
                                max_results: options[:max],
                                page_token: options[:token]
          end
        end

        def get_project_table project_id, dataset_id, table_id
          execute { service.get_table project_id, dataset_id, table_id }
        end

        ##
        # Gets the specified table resource by table ID.
        # This method does not return the data in the table,
        # it only returns the table resource,
        # which describes the structure of this table.
        def get_table dataset_id, table_id
          execute { get_project_table @project, dataset_id, table_id }
        end

        ##
        # Creates a new, empty table in the dataset.
        def insert_table dataset_id, new_table_gapi
          execute { service.insert_table @project, dataset_id, new_table_gapi }
        end

        ##
        # Updates information in an existing table, replacing fields that
        # are provided in the submitted table resource.
        def patch_table dataset_id, table_id, patched_table_gapi
          execute do
            service.patch_table @project, dataset_id, table_id,
                                patched_table_gapi
          end
        end

        ##
        # Deletes the table specified by tableId from the dataset.
        # If the table contains data, all the data will be deleted.
        def delete_table dataset_id, table_id
          execute { service.delete_table @project, dataset_id, table_id }
        end

        ##
        # Retrieves data from the table.
        def list_tabledata dataset_id, table_id, options = {}
          execute do
            service.list_table_data @project, dataset_id, table_id,
                                    max_results: options.delete(:max),
                                    page_token: options.delete(:token),
                                    start_index: options.delete(:start)
          end
        end

        def insert_tabledata dataset_id, table_id, rows, options = {}
          insert_rows = Array(rows).map do |row|
            Google::Apis::BigqueryV2::InsertAllTableDataRequest::Row.new(
              insert_id: Digest::MD5.base64digest(row.inspect),
              # Hash[row.map{|(k,v)| [k.to_s,v]}] for Hash<String,Object>
              json: row
            )
          end
          insert_req = Google::Apis::BigqueryV2::InsertAllTableDataRequest.new(
            rows: insert_rows,
            ignore_unknown_values: options[:ignore_unknown],
            skip_invalid_rows: options[:skip_invalid]
          )

          execute do
            service.insert_all_table_data(
              @project, dataset_id, table_id, insert_req)
          end
        end
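
        # A usage sketch for the streaming insert above (values are
        # illustrative). Each row is a Hash that becomes the row's JSON
        # payload, with an insert_id derived from the row itself:
        #
        #   rows = [{ name: "Alice", score: 98 }, { name: "Bob", score: 71 }]
        #   service.insert_tabledata "my_dataset", "scores", rows,
        #                            skip_invalid: true, ignore_unknown: true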

        ##
        # Lists all jobs in the specified project to which you have
        # been granted the READER job role.
        def list_jobs options = {}
          execute do
            service.list_jobs \
              @project, all_users: options[:all], max_results: options[:max],
              page_token: options[:token], projection: "full",
              state_filter: options[:filter]
          end
        end

        ##
        # Returns the job specified by jobID.
        def get_job job_id
          execute { service.get_job @project, job_id }
        end

        def insert_job config
          job_object = API::Job.new(
            configuration: config
          )
          execute { service.insert_job @project, job_object }
        end

        def query_job query, options = {}
          config = query_table_config(query, options)
          execute { service.insert_job @project, config }
        end

        def query query, options = {}
          execute { service.query_job @project, query_config(query, options) }
        end
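
        # Two hypothetical invocations: #query_job inserts an asynchronous
        # query job, while #query issues a synchronous jobs.query request
        # (service.query_job is the generated client method for that call,
        # despite the similar name):
        #
        #   job  = service.query_job "SELECT COUNT(*) FROM my_dataset.events"
        #   data = service.query "SELECT 1", max: 1, timeout: 10_000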

        ##
        # Returns the query data for the job
        def job_query_results job_id, options = {}
          execute do
            service.get_job_query_results @project,
                                          job_id,
                                          max_results: options.delete(:max),
                                          page_token: options.delete(:token),
                                          start_index: options.delete(:start),
                                          timeout_ms: options.delete(:timeout)
          end
        end

        def copy_table source, target, options = {}
          execute do
            service.insert_job @project, copy_table_config(
              source, target, options)
          end
        end

        def extract_table table, storage_files, options = {}
          execute do
            service.insert_job \
              @project, extract_table_config(table, storage_files, options)
          end
        end

        def load_table_gs_url dataset_id, table_id, url, options = {}
          execute do
            service.insert_job \
              @project, load_table_url_config(dataset_id, table_id,
                                              url, options)
          end
        end

        def load_table_file dataset_id, table_id, file, options = {}
          execute do
            service.insert_job \
              @project, load_table_file_config(
                dataset_id, table_id, file, options),
              upload_source: file, content_type: mime_type_for(file)
          end
        end
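
        # Hypothetical load examples, one from a Cloud Storage URL and one
        # from a local file path (bucket, file names, and options are
        # illustrative):
        #
        #   service.load_table_gs_url "my_dataset", "my_table",
        #                             "gs://my-bucket/data.csv",
        #                             create: :needed, write: :truncate
        #   service.load_table_file "my_dataset", "my_table", "data.json"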

        ##
        # Extracts at least `tbl` group, and possibly `dts` and `prj` groups,
        # from strings in the formats: "my_table", "my_dataset.my_table", or
        # "my-project:my_dataset.my_table". Then merges project_id and
        # dataset_id from the default table if they are missing.
        def self.table_ref_from_s str, default_table_ref
          str = str.to_s
          m = /\A(((?<prj>\S*):)?(?<dts>\S*)\.)?(?<tbl>\S*)\z/.match str
          unless m
            fail ArgumentError, "unable to identify table from #{str.inspect}"
          end
          str_table_ref_hash = {
            project_id: m["prj"],
            dataset_id: m["dts"],
            table_id: m["tbl"]
          }.delete_if { |_, v| v.nil? }
          new_table_ref_hash = default_table_ref.to_h.merge str_table_ref_hash
          Google::Apis::BigqueryV2::TableReference.new new_table_ref_hash
        end
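
        # A worked example of the parsing above (hypothetical identifiers);
        # fields missing from the string fall back to the default reference:
        #
        #   default = Google::Apis::BigqueryV2::TableReference.new(
        #     project_id: "my-project", dataset_id: "my_dataset",
        #     table_id: "ignored")
        #   ref = Service.table_ref_from_s "other_ds.events", default
        #   ref.project_id # => "my-project"
        #   ref.dataset_id # => "other_ds"
        #   ref.table_id   # => "events"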

        def inspect
          "#{self.class}(#{@project})"
        end

        protected

        def table_ref_from tbl
          return nil if tbl.nil?
          API::TableReference.new(
            project_id: tbl.project_id,
            dataset_id: tbl.dataset_id,
            table_id: tbl.table_id
          )
        end

        def dataset_ref_from dts, pjt = nil
          return nil if dts.nil?
          if dts.respond_to? :dataset_id
            API::DatasetReference.new(
              project_id: (pjt || dts.project_id || @project),
              dataset_id: dts.dataset_id
            )
          else
            API::DatasetReference.new(
              project_id: (pjt || @project),
              dataset_id: dts
            )
          end
        end
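
        # Sketch of the two branches above (illustrative values): a
        # Dataset-like object contributes both ids, while a bare string is
        # treated as a dataset id in the service's own project.
        #
        #   dataset_ref_from "my_dataset"
        #   # => DatasetReference with project_id: @project, dataset_id: "my_dataset"
        #   dataset_ref_from dataset, "other-project"
        #   # => DatasetReference with project_id: "other-project", dataset_id: dataset.dataset_id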

        def load_table_file_opts dataset_id, table_id, file, options = {}
          path = Pathname(file).to_path
          {
            destination_table: Google::Apis::BigqueryV2::TableReference.new(
              project_id: @project, dataset_id: dataset_id, table_id: table_id),
            create_disposition: create_disposition(options[:create]),
            write_disposition: write_disposition(options[:write]),
            source_format: source_format(path, options[:format]),
            projection_fields: projection_fields(options[:projection_fields]),
            allow_jagged_rows: options[:jagged_rows],
            allow_quoted_newlines: options[:quoted_newlines],
            encoding: options[:encoding], field_delimiter: options[:delimiter],
            ignore_unknown_values: options[:ignore_unknown],
            max_bad_records: options[:max_bad_records], quote: options[:quote],
            schema: options[:schema], skip_leading_rows: options[:skip_leading]
          }.delete_if { |_, v| v.nil? }
        end

        def load_table_file_config dataset_id, table_id, file, options = {}
          load_opts = load_table_file_opts dataset_id, table_id, file, options
          API::Job.new(
            configuration: API::JobConfiguration.new(
              load: API::JobConfigurationLoad.new(load_opts),
              dry_run: options[:dryrun]
            )
          )
        end

        def load_table_url_opts dataset_id, table_id, url, options = {}
          {
            destination_table: Google::Apis::BigqueryV2::TableReference.new(
              project_id: @project, dataset_id: dataset_id, table_id: table_id),
            source_uris: Array(url),
            create_disposition: create_disposition(options[:create]),
            write_disposition: write_disposition(options[:write]),
            source_format: source_format(url, options[:format]),
            projection_fields: projection_fields(options[:projection_fields]),
            allow_jagged_rows: options[:jagged_rows],
            allow_quoted_newlines: options[:quoted_newlines],
            encoding: options[:encoding], field_delimiter: options[:delimiter],
            ignore_unknown_values: options[:ignore_unknown],
            max_bad_records: options[:max_bad_records], quote: options[:quote],
            schema: options[:schema], skip_leading_rows: options[:skip_leading]
          }.delete_if { |_, v| v.nil? }
        end

        def load_table_url_config dataset_id, table_id, url, options = {}
          load_opts = load_table_url_opts dataset_id, table_id, url, options
          API::Job.new(
            configuration: API::JobConfiguration.new(
              load: API::JobConfigurationLoad.new(load_opts),
              dry_run: options[:dryrun]
            )
          )
        end

        ##
        # Job description for query job
        def query_table_config query, options
          dest_table = table_ref_from options[:table]
          default_dataset = dataset_ref_from options[:dataset]
          API::Job.new(
            configuration: API::JobConfiguration.new(
              query: API::JobConfigurationQuery.new(
                query: query,
                # tableDefinitions: { ... },
                priority: priority_value(options[:priority]),
                use_query_cache: options[:cache],
                destination_table: dest_table,
                create_disposition: create_disposition(options[:create]),
                write_disposition: write_disposition(options[:write]),
                allow_large_results: options[:large_results],
                flatten_results: options[:flatten],
                default_dataset: default_dataset
              )
            )
          )
        end

        def query_config query, options = {}
          dataset_config = dataset_ref_from options[:dataset], options[:project]

          API::QueryRequest.new(
            query: query,
            max_results: options[:max],
            default_dataset: dataset_config,
            timeout_ms: options[:timeout],
            dry_run: options[:dryrun],
            use_query_cache: options[:cache]
          )
        end

        ##
        # Job description for copy job
        def copy_table_config source, target, options = {}
          API::Job.new(
            configuration: API::JobConfiguration.new(
              copy: API::JobConfigurationTableCopy.new(
                source_table: source,
                destination_table: target,
                create_disposition: create_disposition(options[:create]),
                write_disposition: write_disposition(options[:write])
              ),
              dry_run: options[:dryrun]
            )
          )
        end

        def extract_table_config table, storage_files, options = {}
          storage_urls = Array(storage_files).map do |url|
            url.respond_to?(:to_gs_url) ? url.to_gs_url : url
          end
          dest_format = source_format storage_urls.first, options[:format]
          API::Job.new(
            configuration: API::JobConfiguration.new(
              extract: API::JobConfigurationExtract.new(
                destination_uris: Array(storage_urls),
                source_table: table,
                destination_format: dest_format,
                compression: options[:compression],
                field_delimiter: options[:delimiter],
                print_header: options[:header]
              ),
              dry_run: options[:dryrun]
            )
          )
        end

        def create_disposition str
          { "create_if_needed" => "CREATE_IF_NEEDED",
            "createifneeded" => "CREATE_IF_NEEDED",
            "if_needed" => "CREATE_IF_NEEDED",
            "needed" => "CREATE_IF_NEEDED",
            "create_never" => "CREATE_NEVER",
            "createnever" => "CREATE_NEVER",
            "never" => "CREATE_NEVER" }[str.to_s.downcase]
        end

        def write_disposition str
          { "write_truncate" => "WRITE_TRUNCATE",
            "writetruncate" => "WRITE_TRUNCATE",
            "truncate" => "WRITE_TRUNCATE",
            "write_append" => "WRITE_APPEND",
            "writeappend" => "WRITE_APPEND",
            "append" => "WRITE_APPEND",
            "write_empty" => "WRITE_EMPTY",
            "writeempty" => "WRITE_EMPTY",
            "empty" => "WRITE_EMPTY" }[str.to_s.downcase]
        end

        def priority_value str
          { "batch" => "BATCH",
            "interactive" => "INTERACTIVE" }[str.to_s.downcase]
        end
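
        # How these lookups behave (hypothetical inputs): symbols and strings
        # are normalized via to_s.downcase, and unrecognized values fall
        # through to nil, so they are later dropped by delete_if.
        #
        #   create_disposition :needed    # => "CREATE_IF_NEEDED"
        #   write_disposition "truncate"  # => "WRITE_TRUNCATE"
        #   priority_value nil            # => nil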

        def source_format path, format
          val = { "csv" => "CSV",
                  "json" => "NEWLINE_DELIMITED_JSON",
                  "newline_delimited_json" => "NEWLINE_DELIMITED_JSON",
                  "avro" => "AVRO",
                  "datastore" => "DATASTORE_BACKUP",
                  "datastore_backup" => "DATASTORE_BACKUP"
                }[format.to_s.downcase]
          return val unless val.nil?
          return nil if path.nil?
          return "CSV" if path.end_with? ".csv"
          return "NEWLINE_DELIMITED_JSON" if path.end_with? ".json"
          return "AVRO" if path.end_with? ".avro"
          return "DATASTORE_BACKUP" if path.end_with? ".backup_info"
          nil
        end
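
        # Illustrative resolutions (hypothetical paths): an explicit format
        # wins, otherwise the file extension decides, and anything else
        # yields nil.
        #
        #   source_format "gs://my-bucket/data", :avro  # => "AVRO"
        #   source_format "exports/rows.json", nil      # => "NEWLINE_DELIMITED_JSON"
        #   source_format "exports/rows.parquet", nil   # => nil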

        def projection_fields array_or_str
          Array(array_or_str) unless array_or_str.nil?
        end

        def mime_type_for file
          mime_type = MIME::Types.of(Pathname(file).to_path).first.to_s
          return nil if mime_type.empty?
          mime_type
        rescue
          nil
        end
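
        # For example (hypothetical paths): the MIME lookup is best-effort,
        # and anything unrecognized or unreadable simply returns nil.
        #
        #   mime_type_for "data.csv"      # => "text/csv"
        #   mime_type_for "data.unknown"  # => nil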

        def execute
          yield
        rescue Google::Apis::Error => e
          raise Google::Cloud::Error.from_error(e)
        end
      end
    end
  end
end