RubyGems - google-cloud-bigquery - Versions diffs - 1.19.0 → 1.22.0 - Mend

google-cloud-bigquery 1.19.0 → 1.22.0

Files changed (21) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +42 -0
data/TROUBLESHOOTING.md +2 -8
data/lib/google-cloud-bigquery.rb +8 -2
data/lib/google/cloud/bigquery/convert.rb +3 -1
data/lib/google/cloud/bigquery/copy_job.rb +15 -6
data/lib/google/cloud/bigquery/data.rb +12 -0
data/lib/google/cloud/bigquery/dataset.rb +119 -28
data/lib/google/cloud/bigquery/external.rb +24 -0
data/lib/google/cloud/bigquery/extract_job.rb +153 -45
data/lib/google/cloud/bigquery/job.rb +198 -0
data/lib/google/cloud/bigquery/load_job.rb +271 -32
data/lib/google/cloud/bigquery/model.rb +164 -8
data/lib/google/cloud/bigquery/project.rb +242 -84
data/lib/google/cloud/bigquery/query_job.rb +277 -6
data/lib/google/cloud/bigquery/schema.rb +2 -2
data/lib/google/cloud/bigquery/service.rb +16 -12
data/lib/google/cloud/bigquery/table.rb +371 -83
data/lib/google/cloud/bigquery/time.rb +6 -0
data/lib/google/cloud/bigquery/version.rb +1 -1
metadata +7 -7

data/lib/google/cloud/bigquery/extract_job.rb CHANGED

@@ -20,15 +20,17 @@ module Google
       # # ExtractJob
       #
       # A {Job} subclass representing an export operation that may be performed
-      # on a {Table}. A ExtractJob instance is created when you call
-      # {Table#extract_job}.
+      # on a {Table} or {Model}. An ExtractJob instance is returned when you call
+      # {Project#extract_job}, {Table#extract_job} or {Model#extract_job}.
       #
       # @see https://cloud.google.com/bigquery/docs/exporting-data
-      #   Exporting Data From BigQuery
+      #   Exporting table data
+      # @see https://cloud.google.com/bigquery-ml/docs/exporting-models
+      #   Exporting models
       # @see https://cloud.google.com/bigquery/docs/reference/v2/jobs Jobs API
       #   reference
       #
-      # @example
+      # @example Export table data
       #   require "google/cloud/bigquery"
       #
       #   bigquery = Google::Cloud::Bigquery.new
@@ -40,6 +42,18 @@ module Google
       #   extract_job.wait_until_done!
       #   extract_job.done? #=> true
       #
+      # @example Export a model
+      #   require "google/cloud/bigquery"
+      #
+      #   bigquery = Google::Cloud::Bigquery.new
+      #   dataset = bigquery.dataset "my_dataset"
+      #   model = dataset.model "my_model"
+      #
+      #   extract_job = model.extract_job "gs://my-bucket/#{model.model_id}"
+      #
+      #   extract_job.wait_until_done!
+      #   extract_job.done? #=> true
+      #
       class ExtractJob < Job
         ##
         # The URI or URIs representing the Google Cloud Storage files to which
@@ -49,71 +63,130 @@ module Google
         end
         ##
-        # The table from which the data is exported. This is the table upon
-        # which {Table#extract_job} was called.
+        # The table or model which is exported.
         #
-        # @return [Table] A table instance.
+        # @return [Table, Model, nil] A table or model instance, or `nil`.
         #
         def source
-          table = @gapi.configuration.extract.source_table
-          return nil unless table
-          retrieve_table table.project_id, table.dataset_id, table.table_id
+          if (table = @gapi.configuration.extract.source_table)
+            retrieve_table table.project_id, table.dataset_id, table.table_id
+          elsif (model = @gapi.configuration.extract.source_model)
+            retrieve_model model.project_id, model.dataset_id, model.model_id
+          end
         end
         ##
-        # Checks if the export operation compresses the data using gzip. The
-        # default is `false`.
+        # Whether the source of the export job is a table. See {#source}.
         #
-        # @return [Boolean] `true` when `GZIP`, `false` otherwise.
+        # @return [Boolean] `true` when the source is a table, `false`
+        #   otherwise.
+        #
+        def table?
+          !@gapi.configuration.extract.source_table.nil?
+        end
+        ##
+        # Whether the source of the export job is a model. See {#source}.
+        #
+        # @return [Boolean] `true` when the source is a model, `false`
+        #   otherwise.
+        #
+        def model?
+          !@gapi.configuration.extract.source_model.nil?
+        end
+        ##
+        # Checks if the export operation compresses the data using gzip. The
+        # default is `false`. Not applicable when extracting models.
         #
+        # @return [Boolean] `true` when `GZIP`, `false` if not `GZIP` or not a
+        #   table extraction.
         def compression?
+          return false unless table?
           val = @gapi.configuration.extract.compression
           val == "GZIP"
         end
         ##
-        # Checks if the destination format for the data is [newline-delimited
-        # JSON](http://jsonlines.org/). The default is `false`.
+        # Checks if the destination format for the table data is [newline-delimited
+        # JSON](http://jsonlines.org/). The default is `false`. Not applicable when
+        # extracting models.
         #
-        # @return [Boolean] `true` when `NEWLINE_DELIMITED_JSON`, `false`
-        #   otherwise.
+        # @return [Boolean] `true` when `NEWLINE_DELIMITED_JSON`, `false` if not
+        #   `NEWLINE_DELIMITED_JSON` or not a table extraction.
         #
         def json?
+          return false unless table?
           val = @gapi.configuration.extract.destination_format
           val == "NEWLINE_DELIMITED_JSON"
         end
         ##
-        # Checks if the destination format for the data is CSV. Tables with
+        # Checks if the destination format for the table data is CSV. Tables with
         # nested or repeated fields cannot be exported as CSV. The default is
-        # `true`.
+        # `true` for tables. Not applicable when extracting models.
         #
-        # @return [Boolean] `true` when `CSV`, `false` otherwise.
+        # @return [Boolean] `true` when `CSV`, or `false` if not `CSV` or not a
+        #   table extraction.
         #
         def csv?
+          return false unless table?
           val = @gapi.configuration.extract.destination_format
           return true if val.nil?
           val == "CSV"
         end
         ##
-        # Checks if the destination format for the data is
-        # [Avro](http://avro.apache.org/). The default is `false`.
+        # Checks if the destination format for the table data is
+        # [Avro](http://avro.apache.org/). The default is `false`. Not applicable
+        # when extracting models.
         #
-        # @return [Boolean] `true` when `AVRO`, `false` otherwise.
+        # @return [Boolean] `true` when `AVRO`, `false` if not `AVRO` or not a
+        #   table extraction.
         #
         def avro?
+          return false unless table?
           val = @gapi.configuration.extract.destination_format
           val == "AVRO"
         end
+        ##
+        # Checks if the destination format for the model is TensorFlow SavedModel.
+        # The default is `true` for models. Not applicable when extracting tables.
+        #
+        # @return [Boolean] `true` when `ML_TF_SAVED_MODEL`, `false` if not
+        #   `ML_TF_SAVED_MODEL` or not a model extraction.
+        #
+        def ml_tf_saved_model?
+          return false unless model?
+          val = @gapi.configuration.extract.destination_format
+          return true if val.nil?
+          val == "ML_TF_SAVED_MODEL"
+        end
+        ##
+        # Checks if the destination format for the model is XGBoost. The default
+        # is `false`. Not applicable when extracting tables.
+        #
+        # @return [Boolean] `true` when `ML_XGBOOST_BOOSTER`, `false` if not
+        #   `ML_XGBOOST_BOOSTER` or not a model extraction.
+        #
+        def ml_xgboost_booster?
+          return false unless model?
+          val = @gapi.configuration.extract.destination_format
+          val == "ML_XGBOOST_BOOSTER"
+        end
         ##
         # The character or symbol the operation uses to delimit fields in the
-        # exported data. The default is a comma (,).
+        # exported data. The default is a comma (,) for tables. Not applicable
+        # when extracting models.
         #
-        # @return [String] A string containing the character, such as `","`.
+        # @return [String, nil] A string containing the character, such as `","`,
+        #   `nil` if not a table extraction.
         #
         def delimiter
+          return unless table?
           val = @gapi.configuration.extract.field_delimiter
           val = "," if val.nil?
           val
@@ -121,12 +194,13 @@ module Google
         ##
         # Checks if the exported data contains a header row. The default is
-        # `true`.
+        # `true` for tables. Not applicable when extracting models.
         #
         # @return [Boolean] `true` when the print header configuration is
-        #   present or `nil`, `false` otherwise.
+        #   present or `nil`, `false` if disabled or not a table extraction.
         #
         def print_header?
+          return false unless table?
           val = @gapi.configuration.extract.print_header
           val = true if val.nil?
           val
@@ -159,12 +233,14 @@ module Google
         # whether to enable extracting applicable column types (such as
         # `TIMESTAMP`) to their corresponding AVRO logical types
         # (`timestamp-micros`), instead of only using their raw types
-        # (`avro-long`).
+        # (`avro-long`). Not applicable when extracting models.
         #
         # @return [Boolean] `true` when applicable column types will use their
-        #   corresponding AVRO logical types, `false` otherwise.
+        #   corresponding AVRO logical types, `false` if not enabled or not a
+        #   table extraction.
         #
         def use_avro_logical_types?
+          return false unless table?
           @gapi.configuration.extract.use_avro_logical_types
         end
@@ -182,19 +258,24 @@ module Google
           #
           # @return [Google::Cloud::Bigquery::ExtractJob::Updater] A job
           #   configuration object for setting extract options.
-          def self.from_options service, table, storage_files, options
+          def self.from_options service, source, storage_files, options
             job_ref = service.job_ref_from options[:job_id], options[:prefix]
             storage_urls = Array(storage_files).map do |url|
               url.respond_to?(:to_gs_url) ? url.to_gs_url : url
             end
             options[:format] ||= Convert.derive_source_format storage_urls.first
+            extract_config = Google::Apis::BigqueryV2::JobConfigurationExtract.new(
+              destination_uris: Array(storage_urls)
+            )
+            if source.is_a? Google::Apis::BigqueryV2::TableReference
+              extract_config.source_table = source
+            elsif source.is_a? Google::Apis::BigqueryV2::ModelReference
+              extract_config.source_model = source
+            end
             job = Google::Apis::BigqueryV2::Job.new(
               job_reference: job_ref,
               configuration: Google::Apis::BigqueryV2::JobConfiguration.new(
-                extract: Google::Apis::BigqueryV2::JobConfigurationExtract.new(
-                  destination_uris: Array(storage_urls),
-                  source_table:     table
-                ),
+                extract: extract_config,
                 dry_run: options[:dryrun]
               )
             )
@@ -253,7 +334,7 @@ module Google
           end
           ##
-          # Sets the compression type.
+          # Sets the compression type. Not applicable when extracting models.
           #
           # @param [String] value The compression type to use for exported
           #   files. Possible values include `GZIP` and `NONE`. The default
@@ -265,7 +346,7 @@ module Google
           end
           ##
-          # Sets the field delimiter.
+          # Sets the field delimiter. Not applicable when extracting models.
           #
           # @param [String] value Delimiter to use between fields in the
           #   exported data. Default is <code>,</code>.
@@ -276,14 +357,21 @@ module Google
           end
           ##
-          # Sets the destination file format. The default value is `csv`.
+          # Sets the destination file format. The default value for
+          # tables is `csv`. Tables with nested or repeated fields cannot be
+          # exported as CSV. The default value for models is `ml_tf_saved_model`.
           #
-          # The following values are supported:
+          # Supported values for tables:
           #
           # * `csv` - CSV
           # * `json` - [Newline-delimited JSON](http://jsonlines.org/)
           # * `avro` - [Avro](http://avro.apache.org/)
           #
+          # Supported values for models:
+          #
+          # * `ml_tf_saved_model` - TensorFlow SavedModel
+          # * `ml_xgboost_booster` - XGBoost Booster
+          #
           # @param [String] new_format The new destination format.
           #
           # @!group Attributes
@@ -293,7 +381,8 @@ module Google
           end
           ##
-          # Print a header row in the exported file.
+          # Print a header row in the exported file. Not applicable when
+          # extracting models.
           #
           # @param [Boolean] value Whether to print out a header row in the
           #   results. Default is `true`.
@@ -307,12 +396,21 @@ module Google
           # Sets the labels to use for the job.
           #
           # @param [Hash] value A hash of user-provided labels associated with
-          #   the job. You can use these to organize and group your jobs. Label
-          #   keys and values can be no longer than 63 characters, can only
-          #   contain lowercase letters, numeric characters, underscores and
-          #   dashes. International characters are allowed. Label values are
-          #   optional. Label keys must start with a letter and each label in
-          #   the list must have a different key.
+          #   the job. You can use these to organize and group your jobs.
+          #
+          #   The labels applied to a resource must meet the following requirements:
+          #
+          #   * Each resource can have multiple labels, up to a maximum of 64.
+          #   * Each label must be a key-value pair.
+          #   * Keys have a minimum length of 1 character and a maximum length of
+          #     63 characters, and cannot be empty. Values can be empty, and have
+          #     a maximum length of 63 characters.
+          #   * Keys and values can contain only lowercase letters, numeric characters,
+          #     underscores, and dashes. All characters must use UTF-8 encoding, and
+          #     international characters are allowed.
+          #   * The key portion of a label must be unique. However, you can use the
+          #     same key with multiple resources.
+          #   * Keys must start with a lowercase letter or international character.
           #
           # @!group Attributes
           #
@@ -362,6 +460,16 @@ module Google
             @gapi
           end
         end
+        protected
+        def retrieve_model project_id, dataset_id, model_id
+          ensure_service!
+          gapi = service.get_project_model project_id, dataset_id, model_id
+          Model.from_gapi_json gapi, service
+        rescue Google::Cloud::NotFoundError
+          nil
+        end
       end
     end
   end

data/lib/google/cloud/bigquery/job.rb CHANGED

@@ -197,6 +197,72 @@ module Google
           Convert.millis_to_time @gapi.statistics.end_time
         end
+        ##
+        # The number of child jobs executed.
+        #
+        # @return [Integer] The number of child jobs executed.
+        #
+        def num_child_jobs
+          @gapi.statistics.num_child_jobs || 0
+        end
+        ##
+        # If this is a child job, the id of the parent.
+        #
+        # @return [String, nil] The ID of the parent job, or `nil` if not a child job.
+        #
+        def parent_job_id
+          @gapi.statistics.parent_job_id
+        end
+        ##
+        # The statistics including stack frames for a child job of a script.
+        #
+        # @return [Google::Cloud::Bigquery::Job::ScriptStatistics, nil] The script statistics, or `nil` if the job is
+        #   not a child job.
+        #
+        # @example
+        #   require "google/cloud/bigquery"
+        #
+        #   bigquery = Google::Cloud::Bigquery.new
+        #
+        #   multi_statement_sql = <<~SQL
+        #     -- Declare a variable to hold names as an array.
+        #     DECLARE top_names ARRAY<STRING>;
+        #     -- Build an array of the top 100 names from the year 2017.
+        #     SET top_names = (
+        #     SELECT ARRAY_AGG(name ORDER BY number DESC LIMIT 100)
+        #     FROM `bigquery-public-data.usa_names.usa_1910_current`
+        #     WHERE year = 2017
+        #     );
+        #     -- Which names appear as words in Shakespeare's plays?
+        #     SELECT
+        #     name AS shakespeare_name
+        #     FROM UNNEST(top_names) AS name
+        #     WHERE name IN (
+        #     SELECT word
+        #     FROM `bigquery-public-data.samples.shakespeare`
+        #     );
+        #   SQL
+        #
+        #   job = bigquery.query_job multi_statement_sql
+        #
+        #   job.wait_until_done!
+        #
+        #   child_jobs = bigquery.jobs parent_job: job
+        #
+        #   child_jobs.each do |child_job|
+        #     script_statistics = child_job.script_statistics
+        #     puts script_statistics.evaluation_kind
+        #     script_statistics.stack_frames.each do |stack_frame|
+        #       puts stack_frame.text
+        #     end
+        #   end
+        #
+        def script_statistics
+          ScriptStatistics.from_gapi @gapi.statistics.script_statistics if @gapi.statistics.script_statistics
+        end
         ##
         # The configuration for the job. Returns a hash.
         #
@@ -423,6 +489,138 @@ module Google
           end
         end
+        ##
+        # Represents statistics for a child job of a script.
+        #
+        # @attr_reader [String] evaluation_kind Indicates the type of child job. Possible values include `STATEMENT` and
+        #   `EXPRESSION`.
+        # @attr_reader [Array<Google::Cloud::Bigquery::Job::ScriptStackFrame>] stack_frames Stack trace where the
+        #   current evaluation happened. Shows line/column/procedure name of each frame on the stack at the point where
+        #   the current evaluation happened. The leaf frame is first, the primary script is last.
+        #
+        # @example
+        #   require "google/cloud/bigquery"
+        #
+        #   bigquery = Google::Cloud::Bigquery.new
+        #
+        #   multi_statement_sql = <<~SQL
+        #     -- Declare a variable to hold names as an array.
+        #     DECLARE top_names ARRAY<STRING>;
+        #     -- Build an array of the top 100 names from the year 2017.
+        #     SET top_names = (
+        #     SELECT ARRAY_AGG(name ORDER BY number DESC LIMIT 100)
+        #     FROM `bigquery-public-data.usa_names.usa_1910_current`
+        #     WHERE year = 2017
+        #     );
+        #     -- Which names appear as words in Shakespeare's plays?
+        #     SELECT
+        #     name AS shakespeare_name
+        #     FROM UNNEST(top_names) AS name
+        #     WHERE name IN (
+        #     SELECT word
+        #     FROM `bigquery-public-data.samples.shakespeare`
+        #     );
+        #   SQL
+        #
+        #   job = bigquery.query_job multi_statement_sql
+        #
+        #   job.wait_until_done!
+        #
+        #   child_jobs = bigquery.jobs parent_job: job
+        #
+        #   child_jobs.each do |child_job|
+        #     script_statistics = child_job.script_statistics
+        #     puts script_statistics.evaluation_kind
+        #     script_statistics.stack_frames.each do |stack_frame|
+        #       puts stack_frame.text
+        #     end
+        #   end
+        #
+        class ScriptStatistics
+          attr_reader :evaluation_kind, :stack_frames
+          ##
+          # @private Creates a new ScriptStatistics instance.
+          def initialize evaluation_kind, stack_frames
+            @evaluation_kind = evaluation_kind
+            @stack_frames = stack_frames
+          end
+          ##
+          # @private New ScriptStatistics from a statistics.script_statistics object.
+          def self.from_gapi gapi
+            frames = Array(gapi.stack_frames).map { |g| ScriptStackFrame.from_gapi g }
+            new gapi.evaluation_kind, frames
+          end
+        end
+        ##
+        # Represents a stack frame showing the line/column/procedure name where the current evaluation happened.
+        #
+        # @attr_reader [Integer] start_line One-based start line.
+        # @attr_reader [Integer] start_column One-based start column.
+        # @attr_reader [Integer] end_line One-based end line.
+        # @attr_reader [Integer] end_column One-based end column.
+        # @attr_reader [String] text Text of the current statement/expression.
+        #
+        # @example
+        #   require "google/cloud/bigquery"
+        #
+        #   bigquery = Google::Cloud::Bigquery.new
+        #
+        #   multi_statement_sql = <<~SQL
+        #     -- Declare a variable to hold names as an array.
+        #     DECLARE top_names ARRAY<STRING>;
+        #     -- Build an array of the top 100 names from the year 2017.
+        #     SET top_names = (
+        #     SELECT ARRAY_AGG(name ORDER BY number DESC LIMIT 100)
+        #     FROM `bigquery-public-data.usa_names.usa_1910_current`
+        #     WHERE year = 2017
+        #     );
+        #     -- Which names appear as words in Shakespeare's plays?
+        #     SELECT
+        #     name AS shakespeare_name
+        #     FROM UNNEST(top_names) AS name
+        #     WHERE name IN (
+        #     SELECT word
+        #     FROM `bigquery-public-data.samples.shakespeare`
+        #     );
+        #   SQL
+        #
+        #   job = bigquery.query_job multi_statement_sql
+        #
+        #   job.wait_until_done!
+        #
+        #   child_jobs = bigquery.jobs parent_job: job
+        #
+        #   child_jobs.each do |child_job|
+        #     script_statistics = child_job.script_statistics
+        #     puts script_statistics.evaluation_kind
+        #     script_statistics.stack_frames.each do |stack_frame|
+        #       puts stack_frame.text
+        #     end
+        #   end
+        #
+        class ScriptStackFrame
+          attr_reader :start_line, :start_column, :end_line, :end_column, :text
+          ##
+          # @private Creates a new ScriptStackFrame instance.
+          def initialize start_line, start_column, end_line, end_column, text
+            @start_line = start_line
+            @start_column = start_column
+            @end_line = end_line
+            @end_column = end_column
+            @text = text
+          end
+          ##
+          # @private New ScriptStackFrame from a statistics.script_statistics[].stack_frames element.
+          def self.from_gapi gapi
+            new gapi.start_line, gapi.start_column, gapi.end_line, gapi.end_column, gapi.text
+          end
+        end
         protected
         ##