google-cloud-bigquery 1.20.0 → 1.23.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -20,15 +20,17 @@ module Google
20
20
  # # ExtractJob
21
21
  #
22
22
  # A {Job} subclass representing an export operation that may be performed
23
- # on a {Table}. A ExtractJob instance is created when you call
24
- # {Table#extract_job}.
23
+ # on a {Table} or {Model}. An ExtractJob instance is returned when you call
24
+ # {Project#extract_job}, {Table#extract_job} or {Model#extract_job}.
25
25
  #
26
26
  # @see https://cloud.google.com/bigquery/docs/exporting-data
27
- # Exporting Data From BigQuery
27
+ # Exporting table data
28
+ # @see https://cloud.google.com/bigquery-ml/docs/exporting-models
29
+ # Exporting models
28
30
  # @see https://cloud.google.com/bigquery/docs/reference/v2/jobs Jobs API
29
31
  # reference
30
32
  #
31
- # @example
33
+ # @example Export table data
32
34
  # require "google/cloud/bigquery"
33
35
  #
34
36
  # bigquery = Google::Cloud::Bigquery.new
@@ -40,6 +42,18 @@ module Google
40
42
  # extract_job.wait_until_done!
41
43
  # extract_job.done? #=> true
42
44
  #
45
+ # @example Export a model
46
+ # require "google/cloud/bigquery"
47
+ #
48
+ # bigquery = Google::Cloud::Bigquery.new
49
+ # dataset = bigquery.dataset "my_dataset"
50
+ # model = dataset.model "my_model"
51
+ #
52
+ # extract_job = model.extract_job "gs://my-bucket/#{model.model_id}"
53
+ #
54
+ # extract_job.wait_until_done!
55
+ # extract_job.done? #=> true
56
+ #
43
57
  class ExtractJob < Job
44
58
  ##
45
59
  # The URI or URIs representing the Google Cloud Storage files to which
@@ -49,71 +63,130 @@ module Google
49
63
  end
50
64
 
51
65
  ##
52
- # The table from which the data is exported. This is the table upon
53
- # which {Table#extract_job} was called.
66
+ # The table or model which is exported.
54
67
  #
55
- # @return [Table] A table instance.
68
+ # @return [Table, Model, nil] A table or model instance, or `nil`.
56
69
  #
57
70
  def source
58
- table = @gapi.configuration.extract.source_table
59
- return nil unless table
60
- retrieve_table table.project_id, table.dataset_id, table.table_id
71
+ if (table = @gapi.configuration.extract.source_table)
72
+ retrieve_table table.project_id, table.dataset_id, table.table_id
73
+ elsif (model = @gapi.configuration.extract.source_model)
74
+ retrieve_model model.project_id, model.dataset_id, model.model_id
75
+ end
61
76
  end
62
77
 
63
78
  ##
64
- # Checks if the export operation compresses the data using gzip. The
65
- # default is `false`.
79
+ # Whether the source of the export job is a table. See {#source}.
66
80
  #
67
- # @return [Boolean] `true` when `GZIP`, `false` otherwise.
81
+ # @return [Boolean] `true` when the source is a table, `false`
82
+ # otherwise.
83
+ #
84
+ def table?
85
+ !@gapi.configuration.extract.source_table.nil?
86
+ end
87
+
88
+ ##
89
+ # Whether the source of the export job is a model. See {#source}.
90
+ #
91
+ # @return [Boolean] `true` when the source is a model, `false`
92
+ # otherwise.
93
+ #
94
+ def model?
95
+ !@gapi.configuration.extract.source_model.nil?
96
+ end
97
+
98
+ ##
99
+ # Checks if the export operation compresses the data using gzip. The
100
+ # default is `false`. Not applicable when extracting models.
68
101
  #
102
+ # @return [Boolean] `true` when `GZIP`, `false` if not `GZIP` or not a
103
+ # table extraction.
69
104
  def compression?
105
+ return false unless table?
70
106
  val = @gapi.configuration.extract.compression
71
107
  val == "GZIP"
72
108
  end
73
109
 
74
110
  ##
75
- # Checks if the destination format for the data is [newline-delimited
76
- # JSON](http://jsonlines.org/). The default is `false`.
111
+ # Checks if the destination format for the table data is [newline-delimited
112
+ # JSON](http://jsonlines.org/). The default is `false`. Not applicable when
113
+ # extracting models.
77
114
  #
78
- # @return [Boolean] `true` when `NEWLINE_DELIMITED_JSON`, `false`
79
- # otherwise.
115
+ # @return [Boolean] `true` when `NEWLINE_DELIMITED_JSON`, `false` if not
116
+ # `NEWLINE_DELIMITED_JSON` or not a table extraction.
80
117
  #
81
118
  def json?
119
+ return false unless table?
82
120
  val = @gapi.configuration.extract.destination_format
83
121
  val == "NEWLINE_DELIMITED_JSON"
84
122
  end
85
123
 
86
124
  ##
87
- # Checks if the destination format for the data is CSV. Tables with
125
+ # Checks if the destination format for the table data is CSV. Tables with
88
126
  # nested or repeated fields cannot be exported as CSV. The default is
89
- # `true`.
127
+ # `true` for tables. Not applicable when extracting models.
90
128
  #
91
- # @return [Boolean] `true` when `CSV`, `false` otherwise.
129
+ # @return [Boolean] `true` when `CSV`, or `false` if not `CSV` or not a
130
+ # table extraction.
92
131
  #
93
132
  def csv?
133
+ return false unless table?
94
134
  val = @gapi.configuration.extract.destination_format
95
135
  return true if val.nil?
96
136
  val == "CSV"
97
137
  end
98
138
 
99
139
  ##
100
- # Checks if the destination format for the data is
101
- # [Avro](http://avro.apache.org/). The default is `false`.
140
+ # Checks if the destination format for the table data is
141
+ # [Avro](http://avro.apache.org/). The default is `false`. Not applicable
142
+ # when extracting models.
102
143
  #
103
- # @return [Boolean] `true` when `AVRO`, `false` otherwise.
144
+ # @return [Boolean] `true` when `AVRO`, `false` if not `AVRO` or not a
145
+ # table extraction.
104
146
  #
105
147
  def avro?
148
+ return false unless table?
106
149
  val = @gapi.configuration.extract.destination_format
107
150
  val == "AVRO"
108
151
  end
109
152
 
153
+ ##
154
+ # Checks if the destination format for the model is TensorFlow SavedModel.
155
+ # The default is `true` for models. Not applicable when extracting tables.
156
+ #
157
+ # @return [Boolean] `true` when `ML_TF_SAVED_MODEL`, `false` if not
158
+ # `ML_TF_SAVED_MODEL` or not a model extraction.
159
+ #
160
+ def ml_tf_saved_model?
161
+ return false unless model?
162
+ val = @gapi.configuration.extract.destination_format
163
+ return true if val.nil?
164
+ val == "ML_TF_SAVED_MODEL"
165
+ end
166
+
167
+ ##
168
+ # Checks if the destination format for the model is XGBoost. The default
169
+ # is `false`. Not applicable when extracting tables.
170
+ #
171
+ # @return [Boolean] `true` when `ML_XGBOOST_BOOSTER`, `false` if not
172
+ # `ML_XGBOOST_BOOSTER` or not a model extraction.
173
+ #
174
+ def ml_xgboost_booster?
175
+ return false unless model?
176
+ val = @gapi.configuration.extract.destination_format
177
+ val == "ML_XGBOOST_BOOSTER"
178
+ end
179
+
110
180
  ##
111
181
  # The character or symbol the operation uses to delimit fields in the
112
- # exported data. The default is a comma (,).
182
+ # exported data. The default is a comma (,) for tables. Not applicable
183
+ # when extracting models.
113
184
  #
114
- # @return [String] A string containing the character, such as `","`.
185
+ # @return [String, nil] A string containing the character, such as `","`,
186
+ # `nil` if not a table extraction.
115
187
  #
116
188
  def delimiter
189
+ return unless table?
117
190
  val = @gapi.configuration.extract.field_delimiter
118
191
  val = "," if val.nil?
119
192
  val
@@ -121,12 +194,13 @@ module Google
121
194
 
122
195
  ##
123
196
  # Checks if the exported data contains a header row. The default is
124
- # `true`.
197
+ # `true` for tables. Not applicable when extracting models.
125
198
  #
126
199
  # @return [Boolean] `true` when the print header configuration is
127
- # present or `nil`, `false` otherwise.
200
+ # present or `nil`, `false` if disabled or not a table extraction.
128
201
  #
129
202
  def print_header?
203
+ return false unless table?
130
204
  val = @gapi.configuration.extract.print_header
131
205
  val = true if val.nil?
132
206
  val
@@ -159,12 +233,14 @@ module Google
159
233
  # whether to enable extracting applicable column types (such as
160
234
  # `TIMESTAMP`) to their corresponding AVRO logical types
161
235
  # (`timestamp-micros`), instead of only using their raw types
162
- # (`avro-long`).
236
+ # (`avro-long`). Not applicable when extracting models.
163
237
  #
164
238
  # @return [Boolean] `true` when applicable column types will use their
165
- # corresponding AVRO logical types, `false` otherwise.
239
+ # corresponding AVRO logical types, `false` if not enabled or not a
240
+ # table extraction.
166
241
  #
167
242
  def use_avro_logical_types?
243
+ return false unless table?
168
244
  @gapi.configuration.extract.use_avro_logical_types
169
245
  end
170
246
 
@@ -182,19 +258,24 @@ module Google
182
258
  #
183
259
  # @return [Google::Cloud::Bigquery::ExtractJob::Updater] A job
184
260
  # configuration object for setting extract options.
185
- def self.from_options service, table, storage_files, options
261
+ def self.from_options service, source, storage_files, options
186
262
  job_ref = service.job_ref_from options[:job_id], options[:prefix]
187
263
  storage_urls = Array(storage_files).map do |url|
188
264
  url.respond_to?(:to_gs_url) ? url.to_gs_url : url
189
265
  end
190
266
  options[:format] ||= Convert.derive_source_format storage_urls.first
267
+ extract_config = Google::Apis::BigqueryV2::JobConfigurationExtract.new(
268
+ destination_uris: Array(storage_urls)
269
+ )
270
+ if source.is_a? Google::Apis::BigqueryV2::TableReference
271
+ extract_config.source_table = source
272
+ elsif source.is_a? Google::Apis::BigqueryV2::ModelReference
273
+ extract_config.source_model = source
274
+ end
191
275
  job = Google::Apis::BigqueryV2::Job.new(
192
276
  job_reference: job_ref,
193
277
  configuration: Google::Apis::BigqueryV2::JobConfiguration.new(
194
- extract: Google::Apis::BigqueryV2::JobConfigurationExtract.new(
195
- destination_uris: Array(storage_urls),
196
- source_table: table
197
- ),
278
+ extract: extract_config,
198
279
  dry_run: options[:dryrun]
199
280
  )
200
281
  )
@@ -253,7 +334,7 @@ module Google
253
334
  end
254
335
 
255
336
  ##
256
- # Sets the compression type.
337
+ # Sets the compression type. Not applicable when extracting models.
257
338
  #
258
339
  # @param [String] value The compression type to use for exported
259
340
  # files. Possible values include `GZIP` and `NONE`. The default
@@ -265,7 +346,7 @@ module Google
265
346
  end
266
347
 
267
348
  ##
268
- # Sets the field delimiter.
349
+ # Sets the field delimiter. Not applicable when extracting models.
269
350
  #
270
351
  # @param [String] value Delimiter to use between fields in the
271
352
  # exported data. Default is <code>,</code>.
@@ -276,14 +357,21 @@ module Google
276
357
  end
277
358
 
278
359
  ##
279
- # Sets the destination file format. The default value is `csv`.
360
+ # Sets the destination file format. The default value for
361
+ # tables is `csv`. Tables with nested or repeated fields cannot be
362
+ # exported as CSV. The default value for models is `ml_tf_saved_model`.
280
363
  #
281
- # The following values are supported:
364
+ # Supported values for tables:
282
365
  #
283
366
  # * `csv` - CSV
284
367
  # * `json` - [Newline-delimited JSON](http://jsonlines.org/)
285
368
  # * `avro` - [Avro](http://avro.apache.org/)
286
369
  #
370
+ # Supported values for models:
371
+ #
372
+ # * `ml_tf_saved_model` - TensorFlow SavedModel
373
+ # * `ml_xgboost_booster` - XGBoost Booster
374
+ #
287
375
  # @param [String] new_format The new destination format.
288
376
  #
289
377
  # @!group Attributes
@@ -293,7 +381,8 @@ module Google
293
381
  end
294
382
 
295
383
  ##
296
- # Print a header row in the exported file.
384
+ # Print a header row in the exported file. Not applicable when
385
+ # extracting models.
297
386
  #
298
387
  # @param [Boolean] value Whether to print out a header row in the
299
388
  # results. Default is `true`.
@@ -307,12 +396,21 @@ module Google
307
396
  # Sets the labels to use for the job.
308
397
  #
309
398
  # @param [Hash] value A hash of user-provided labels associated with
310
- # the job. You can use these to organize and group your jobs. Label
311
- # keys and values can be no longer than 63 characters, can only
312
- # contain lowercase letters, numeric characters, underscores and
313
- # dashes. International characters are allowed. Label values are
314
- # optional. Label keys must start with a letter and each label in
315
- # the list must have a different key.
399
+ # the job. You can use these to organize and group your jobs.
400
+ #
401
+ # The labels applied to a resource must meet the following requirements:
402
+ #
403
+ # * Each resource can have multiple labels, up to a maximum of 64.
404
+ # * Each label must be a key-value pair.
405
+ # * Keys have a minimum length of 1 character and a maximum length of
406
+ # 63 characters, and cannot be empty. Values can be empty, and have
407
+ # a maximum length of 63 characters.
408
+ # * Keys and values can contain only lowercase letters, numeric characters,
409
+ # underscores, and dashes. All characters must use UTF-8 encoding, and
410
+ # international characters are allowed.
411
+ # * The key portion of a label must be unique. However, you can use the
412
+ # same key with multiple resources.
413
+ # * Keys must start with a lowercase letter or international character.
316
414
  #
317
415
  # @!group Attributes
318
416
  #
@@ -362,6 +460,16 @@ module Google
362
460
  @gapi
363
461
  end
364
462
  end
463
+
464
+ protected
465
+
466
+ def retrieve_model project_id, dataset_id, model_id
467
+ ensure_service!
468
+ gapi = service.get_project_model project_id, dataset_id, model_id
469
+ Model.from_gapi_json gapi, service
470
+ rescue Google::Cloud::NotFoundError
471
+ nil
472
+ end
365
473
  end
366
474
  end
367
475
  end
@@ -197,6 +197,72 @@ module Google
197
197
  Convert.millis_to_time @gapi.statistics.end_time
198
198
  end
199
199
 
200
+ ##
201
+ # The number of child jobs executed.
202
+ #
203
+ # @return [Integer] The number of child jobs executed.
204
+ #
205
+ def num_child_jobs
206
+ @gapi.statistics.num_child_jobs || 0
207
+ end
208
+
209
+ ##
210
+ # If this is a child job, the id of the parent.
211
+ #
212
+ # @return [String, nil] The ID of the parent job, or `nil` if not a child job.
213
+ #
214
+ def parent_job_id
215
+ @gapi.statistics.parent_job_id
216
+ end
217
+
218
+ ##
219
+ # The statistics including stack frames for a child job of a script.
220
+ #
221
+ # @return [Google::Cloud::Bigquery::Job::ScriptStatistics, nil] The script statistics, or `nil` if the job is
222
+ # not a child job.
223
+ #
224
+ # @example
225
+ # require "google/cloud/bigquery"
226
+ #
227
+ # bigquery = Google::Cloud::Bigquery.new
228
+ #
229
+ # multi_statement_sql = <<~SQL
230
+ # -- Declare a variable to hold names as an array.
231
+ # DECLARE top_names ARRAY<STRING>;
232
+ # -- Build an array of the top 100 names from the year 2017.
233
+ # SET top_names = (
234
+ # SELECT ARRAY_AGG(name ORDER BY number DESC LIMIT 100)
235
+ # FROM `bigquery-public-data.usa_names.usa_1910_current`
236
+ # WHERE year = 2017
237
+ # );
238
+ # -- Which names appear as words in Shakespeare's plays?
239
+ # SELECT
240
+ # name AS shakespeare_name
241
+ # FROM UNNEST(top_names) AS name
242
+ # WHERE name IN (
243
+ # SELECT word
244
+ # FROM `bigquery-public-data.samples.shakespeare`
245
+ # );
246
+ # SQL
247
+ #
248
+ # job = bigquery.query_job multi_statement_sql
249
+ #
250
+ # job.wait_until_done!
251
+ #
252
+ # child_jobs = bigquery.jobs parent_job: job
253
+ #
254
+ # child_jobs.each do |child_job|
255
+ # script_statistics = child_job.script_statistics
256
+ # puts script_statistics.evaluation_kind
257
+ # script_statistics.stack_frames.each do |stack_frame|
258
+ # puts stack_frame.text
259
+ # end
260
+ # end
261
+ #
262
+ def script_statistics
263
+ ScriptStatistics.from_gapi @gapi.statistics.script_statistics if @gapi.statistics.script_statistics
264
+ end
265
+
200
266
  ##
201
267
  # The configuration for the job. Returns a hash.
202
268
  #
@@ -423,6 +489,138 @@ module Google
423
489
  end
424
490
  end
425
491
 
492
+ ##
493
+ # Represents statistics for a child job of a script.
494
+ #
495
+ # @attr_reader [String] evaluation_kind Indicates the type of child job. Possible values include `STATEMENT` and
496
+ # `EXPRESSION`.
497
+ # @attr_reader [Array<Google::Cloud::Bigquery::Job::ScriptStackFrame>] stack_frames Stack trace where the
498
+ # current evaluation happened. Shows line/column/procedure name of each frame on the stack at the point where
499
+ # the current evaluation happened. The leaf frame is first, the primary script is last.
500
+ #
501
+ # @example
502
+ # require "google/cloud/bigquery"
503
+ #
504
+ # bigquery = Google::Cloud::Bigquery.new
505
+ #
506
+ # multi_statement_sql = <<~SQL
507
+ # -- Declare a variable to hold names as an array.
508
+ # DECLARE top_names ARRAY<STRING>;
509
+ # -- Build an array of the top 100 names from the year 2017.
510
+ # SET top_names = (
511
+ # SELECT ARRAY_AGG(name ORDER BY number DESC LIMIT 100)
512
+ # FROM `bigquery-public-data.usa_names.usa_1910_current`
513
+ # WHERE year = 2017
514
+ # );
515
+ # -- Which names appear as words in Shakespeare's plays?
516
+ # SELECT
517
+ # name AS shakespeare_name
518
+ # FROM UNNEST(top_names) AS name
519
+ # WHERE name IN (
520
+ # SELECT word
521
+ # FROM `bigquery-public-data.samples.shakespeare`
522
+ # );
523
+ # SQL
524
+ #
525
+ # job = bigquery.query_job multi_statement_sql
526
+ #
527
+ # job.wait_until_done!
528
+ #
529
+ # child_jobs = bigquery.jobs parent_job: job
530
+ #
531
+ # child_jobs.each do |child_job|
532
+ # script_statistics = child_job.script_statistics
533
+ # puts script_statistics.evaluation_kind
534
+ # script_statistics.stack_frames.each do |stack_frame|
535
+ # puts stack_frame.text
536
+ # end
537
+ # end
538
+ #
539
+ class ScriptStatistics
540
+ attr_reader :evaluation_kind, :stack_frames
541
+
542
+ ##
543
+ # @private Creates a new ScriptStatistics instance.
544
+ def initialize evaluation_kind, stack_frames
545
+ @evaluation_kind = evaluation_kind
546
+ @stack_frames = stack_frames
547
+ end
548
+
549
+ ##
550
+ # @private New ScriptStatistics from a statistics.script_statistics object.
551
+ def self.from_gapi gapi
552
+ frames = Array(gapi.stack_frames).map { |g| ScriptStackFrame.from_gapi g }
553
+ new gapi.evaluation_kind, frames
554
+ end
555
+ end
556
+
557
+ ##
558
+ # Represents a stack frame showing the line/column/procedure name where the current evaluation happened.
559
+ #
560
+ # @attr_reader [Integer] start_line One-based start line.
561
+ # @attr_reader [Integer] start_column One-based start column.
562
+ # @attr_reader [Integer] end_line One-based end line.
563
+ # @attr_reader [Integer] end_column One-based end column.
564
+ # @attr_reader [String] text Text of the current statement/expression.
565
+ #
566
+ # @example
567
+ # require "google/cloud/bigquery"
568
+ #
569
+ # bigquery = Google::Cloud::Bigquery.new
570
+ #
571
+ # multi_statement_sql = <<~SQL
572
+ # -- Declare a variable to hold names as an array.
573
+ # DECLARE top_names ARRAY<STRING>;
574
+ # -- Build an array of the top 100 names from the year 2017.
575
+ # SET top_names = (
576
+ # SELECT ARRAY_AGG(name ORDER BY number DESC LIMIT 100)
577
+ # FROM `bigquery-public-data.usa_names.usa_1910_current`
578
+ # WHERE year = 2017
579
+ # );
580
+ # -- Which names appear as words in Shakespeare's plays?
581
+ # SELECT
582
+ # name AS shakespeare_name
583
+ # FROM UNNEST(top_names) AS name
584
+ # WHERE name IN (
585
+ # SELECT word
586
+ # FROM `bigquery-public-data.samples.shakespeare`
587
+ # );
588
+ # SQL
589
+ #
590
+ # job = bigquery.query_job multi_statement_sql
591
+ #
592
+ # job.wait_until_done!
593
+ #
594
+ # child_jobs = bigquery.jobs parent_job: job
595
+ #
596
+ # child_jobs.each do |child_job|
597
+ # script_statistics = child_job.script_statistics
598
+ # puts script_statistics.evaluation_kind
599
+ # script_statistics.stack_frames.each do |stack_frame|
600
+ # puts stack_frame.text
601
+ # end
602
+ # end
603
+ #
604
+ class ScriptStackFrame
605
+ attr_reader :start_line, :start_column, :end_line, :end_column, :text
606
+
607
+ ##
608
+ # @private Creates a new ScriptStackFrame instance.
609
+ def initialize start_line, start_column, end_line, end_column, text
610
+ @start_line = start_line
611
+ @start_column = start_column
612
+ @end_line = end_line
613
+ @end_column = end_column
614
+ @text = text
615
+ end
616
+
617
+ ##
618
+ # @private New ScriptStackFrame from a statistics.script_statistics.stack_frames[] element.
619
+ def self.from_gapi gapi
620
+ new gapi.start_line, gapi.start_column, gapi.end_line, gapi.end_column, gapi.text
621
+ end
622
+ end
623
+
426
624
  protected
427
625
 
428
626
  ##