google-cloud-bigquery 1.21.2 → 1.26.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +54 -0
- data/CONTRIBUTING.md +1 -1
- data/lib/google-cloud-bigquery.rb +1 -0
- data/lib/google/cloud/bigquery.rb +1 -1
- data/lib/google/cloud/bigquery/convert.rb +3 -1
- data/lib/google/cloud/bigquery/copy_job.rb +15 -6
- data/lib/google/cloud/bigquery/dataset.rb +43 -20
- data/lib/google/cloud/bigquery/dataset/access.rb +293 -16
- data/lib/google/cloud/bigquery/external.rb +328 -3
- data/lib/google/cloud/bigquery/extract_job.rb +154 -50
- data/lib/google/cloud/bigquery/load_job.rb +197 -34
- data/lib/google/cloud/bigquery/model.rb +164 -8
- data/lib/google/cloud/bigquery/policy.rb +431 -0
- data/lib/google/cloud/bigquery/project.rb +137 -68
- data/lib/google/cloud/bigquery/query_job.rb +24 -12
- data/lib/google/cloud/bigquery/service.rb +50 -11
- data/lib/google/cloud/bigquery/table.rb +174 -37
- data/lib/google/cloud/bigquery/version.rb +1 -1
- metadata +7 -6
data/lib/google/cloud/bigquery/external.rb

@@ -52,6 +52,24 @@ module Google
   # # Retrieve the next page of results
   # data = data.next if data.next?
   #
+  # @example Hive partitioning options:
+  #   require "google/cloud/bigquery"
+  #
+  #   bigquery = Google::Cloud::Bigquery.new
+  #
+  #   gcs_uri = "gs://cloud-samples-data/bigquery/hive-partitioning-samples/autolayout/*"
+  #   source_uri_prefix = "gs://cloud-samples-data/bigquery/hive-partitioning-samples/autolayout/"
+  #   external_data = bigquery.external gcs_uri, format: :parquet do |ext|
+  #     ext.hive_partitioning_mode = :auto
+  #     ext.hive_partitioning_require_partition_filter = true
+  #     ext.hive_partitioning_source_uri_prefix = source_uri_prefix
+  #   end
+  #
+  #   external_data.hive_partitioning? #=> true
+  #   external_data.hive_partitioning_mode #=> "AUTO"
+  #   external_data.hive_partitioning_require_partition_filter? #=> true
+  #   external_data.hive_partitioning_source_uri_prefix #=> source_uri_prefix
+  #
   module External
     ##
     # @private New External from URLs and format
@@ -79,7 +97,8 @@ module Google
     # @private Determine source_format from inputs
     def self.source_format_for urls, format
       val = {
-        "csv"
+        "csv" => "CSV",
+        "avro" => "AVRO",
         "json" => "NEWLINE_DELIMITED_JSON",
         "newline_delimited_json" => "NEWLINE_DELIMITED_JSON",
         "sheets" => "GOOGLE_SHEETS",
@@ -87,7 +106,9 @@ module Google
         "datastore" => "DATASTORE_BACKUP",
         "backup" => "DATASTORE_BACKUP",
         "datastore_backup" => "DATASTORE_BACKUP",
-        "bigtable" => "BIGTABLE"
+        "bigtable" => "BIGTABLE",
+        "orc" => "ORC",
+        "parquet" => "PARQUET"
       }[format.to_s.downcase]
       return val unless val.nil?
       Array(urls).each do |url|
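For reference, the lookup this expanded table performs is a plain case-insensitive hash fetch, with a `nil` result falling through to URL-based detection. A minimal standalone sketch (the `FORMATS` constant and `resolve_format` helper are illustrative names, not part of the gem):

```ruby
# Mirrors the mapping above: symbols or strings are downcased and looked
# up; unknown formats return nil, which triggers URL-based detection.
FORMATS = {
  "csv"     => "CSV",
  "avro"    => "AVRO",
  "orc"     => "ORC",
  "parquet" => "PARQUET"
}.freeze

def resolve_format format
  FORMATS[format.to_s.downcase]
end

resolve_format :parquet #=> "PARQUET"
resolve_format "ORC"    #=> "ORC"
resolve_format :sql     #=> nil (falls back to inspecting the URLs)
```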
@@ -110,7 +131,7 @@ module Google
       when "GOOGLE_SHEETS" then External::SheetsSource
       when "BIGTABLE" then External::BigtableSource
       else
-        # AVRO
+        # AVRO, DATASTORE_BACKUP, PARQUET
         External::DataSource
       end
     end
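Because ORC and PARQUET fall through to the `else` branch, `bigquery.external` wraps them in the generic `External::DataSource` class, which carries the new hive partitioning accessors. A short sketch, assuming a placeholder bucket:

```ruby
require "google/cloud/bigquery"

bigquery = Google::Cloud::Bigquery.new

# PARQUET has no format-specific subclass, so the generic DataSource
# is returned by the class dispatch shown above.
ext = bigquery.external "gs://my-bucket/data/*", format: :parquet
ext.class    #=> Google::Cloud::Bigquery::External::DataSource
ext.parquet? #=> true
ext.orc?     #=> false
```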
@@ -148,6 +169,24 @@ module Google
   # # Retrieve the next page of results
   # data = data.next if data.next?
   #
+  # @example Hive partitioning options:
+  #   require "google/cloud/bigquery"
+  #
+  #   bigquery = Google::Cloud::Bigquery.new
+  #
+  #   gcs_uri = "gs://cloud-samples-data/bigquery/hive-partitioning-samples/autolayout/*"
+  #   source_uri_prefix = "gs://cloud-samples-data/bigquery/hive-partitioning-samples/autolayout/"
+  #   external_data = bigquery.external gcs_uri, format: :parquet do |ext|
+  #     ext.hive_partitioning_mode = :auto
+  #     ext.hive_partitioning_require_partition_filter = true
+  #     ext.hive_partitioning_source_uri_prefix = source_uri_prefix
+  #   end
+  #
+  #   external_data.hive_partitioning? #=> true
+  #   external_data.hive_partitioning_mode #=> "AUTO"
+  #   external_data.hive_partitioning_require_partition_filter? #=> true
+  #   external_data.hive_partitioning_source_uri_prefix #=> source_uri_prefix
+  #
   class DataSource
     ##
     # @private The Google API Client object.
@@ -302,6 +341,52 @@ module Google
       @gapi.source_format == "BIGTABLE"
     end
 
+    ##
+    # Whether the data format is "ORC".
+    #
+    # @return [Boolean]
+    #
+    # @example
+    #   require "google/cloud/bigquery"
+    #
+    #   bigquery = Google::Cloud::Bigquery.new
+    #
+    #   gcs_uri = "gs://cloud-samples-data/bigquery/hive-partitioning-samples/autolayout/*"
+    #   source_uri_prefix = "gs://cloud-samples-data/bigquery/hive-partitioning-samples/autolayout/"
+    #   external_data = bigquery.external gcs_uri, format: :orc do |ext|
+    #     ext.hive_partitioning_mode = :auto
+    #     ext.hive_partitioning_source_uri_prefix = source_uri_prefix
+    #   end
+    #   external_data.format #=> "ORC"
+    #   external_data.orc? #=> true
+    #
+    def orc?
+      @gapi.source_format == "ORC"
+    end
+
+    ##
+    # Whether the data format is "PARQUET".
+    #
+    # @return [Boolean]
+    #
+    # @example
+    #   require "google/cloud/bigquery"
+    #
+    #   bigquery = Google::Cloud::Bigquery.new
+    #
+    #   gcs_uri = "gs://cloud-samples-data/bigquery/hive-partitioning-samples/autolayout/*"
+    #   source_uri_prefix = "gs://cloud-samples-data/bigquery/hive-partitioning-samples/autolayout/"
+    #   external_data = bigquery.external gcs_uri, format: :parquet do |ext|
+    #     ext.hive_partitioning_mode = :auto
+    #     ext.hive_partitioning_source_uri_prefix = source_uri_prefix
+    #   end
+    #   external_data.format #=> "PARQUET"
+    #   external_data.parquet? #=> true
+    #
+    def parquet?
+      @gapi.source_format == "PARQUET"
+    end
+
     ##
     # The fully-qualified URIs that point to your data in Google Cloud.
     # For Google Cloud Storage URIs: Each URI can contain one '*' wildcard
@@ -536,6 +621,246 @@ module Google
       @gapi.max_bad_records = new_max_bad_records
     end
 
+    ###
+    # Checks if hive partitioning options are set.
+    #
+    # Not all storage formats support hive partitioning. Requesting hive partitioning on an unsupported format
+    # will lead to an error. Currently supported types include: `avro`, `csv`, `json`, `orc` and `parquet`.
+    # If your data is stored in ORC or Parquet on Cloud Storage, see [Querying columnar formats on Cloud
+    # Storage](https://cloud.google.com/bigquery/pricing#columnar_formats_pricing).
+    #
+    # @return [Boolean] `true` when hive partitioning options are set, or `false` otherwise.
+    #
+    # @example
+    #   require "google/cloud/bigquery"
+    #
+    #   bigquery = Google::Cloud::Bigquery.new
+    #
+    #   gcs_uri = "gs://cloud-samples-data/bigquery/hive-partitioning-samples/autolayout/*"
+    #   source_uri_prefix = "gs://cloud-samples-data/bigquery/hive-partitioning-samples/autolayout/"
+    #   external_data = bigquery.external gcs_uri, format: :parquet do |ext|
+    #     ext.hive_partitioning_mode = :auto
+    #     ext.hive_partitioning_require_partition_filter = true
+    #     ext.hive_partitioning_source_uri_prefix = source_uri_prefix
+    #   end
+    #
+    #   external_data.hive_partitioning? #=> true
+    #   external_data.hive_partitioning_mode #=> "AUTO"
+    #   external_data.hive_partitioning_require_partition_filter? #=> true
+    #   external_data.hive_partitioning_source_uri_prefix #=> source_uri_prefix
+    #
+    def hive_partitioning?
+      !@gapi.hive_partitioning_options.nil?
+    end
+
+    ###
+    # The mode of hive partitioning to use when reading data. The following modes are supported:
+    #
+    # 1. `AUTO`: automatically infer partition key name(s) and type(s).
+    # 2. `STRINGS`: automatically infer partition key name(s). All types are interpreted as strings.
+    # 3. `CUSTOM`: partition key schema is encoded in the source URI prefix.
+    #
+    # @return [String, nil] The mode of hive partitioning, or `nil` if not set.
+    #
+    # @example
+    #   require "google/cloud/bigquery"
+    #
+    #   bigquery = Google::Cloud::Bigquery.new
+    #
+    #   gcs_uri = "gs://cloud-samples-data/bigquery/hive-partitioning-samples/autolayout/*"
+    #   source_uri_prefix = "gs://cloud-samples-data/bigquery/hive-partitioning-samples/autolayout/"
+    #   external_data = bigquery.external gcs_uri, format: :parquet do |ext|
+    #     ext.hive_partitioning_mode = :auto
+    #     ext.hive_partitioning_require_partition_filter = true
+    #     ext.hive_partitioning_source_uri_prefix = source_uri_prefix
+    #   end
+    #
+    #   external_data.hive_partitioning? #=> true
+    #   external_data.hive_partitioning_mode #=> "AUTO"
+    #   external_data.hive_partitioning_require_partition_filter? #=> true
+    #   external_data.hive_partitioning_source_uri_prefix #=> source_uri_prefix
+    #
+    def hive_partitioning_mode
+      @gapi.hive_partitioning_options.mode if hive_partitioning?
+    end
+
+    ##
+    # Sets the mode of hive partitioning to use when reading data. The following modes are supported:
+    #
+    # 1. `auto`: automatically infer partition key name(s) and type(s).
+    # 2. `strings`: automatically infer partition key name(s). All types are interpreted as strings.
+    # 3. `custom`: partition key schema is encoded in the source URI prefix.
+    #
+    # Not all storage formats support hive partitioning. Requesting hive partitioning on an unsupported format
+    # will lead to an error. Currently supported types include: `avro`, `csv`, `json`, `orc` and `parquet`.
+    # If your data is stored in ORC or Parquet on Cloud Storage, see [Querying columnar formats on Cloud
+    # Storage](https://cloud.google.com/bigquery/pricing#columnar_formats_pricing).
+    #
+    # See {#format}, {#hive_partitioning_require_partition_filter=} and {#hive_partitioning_source_uri_prefix=}.
+    #
+    # @param [String, Symbol] mode The mode of hive partitioning to use when reading data.
+    #
+    # @example
+    #   require "google/cloud/bigquery"
+    #
+    #   bigquery = Google::Cloud::Bigquery.new
+    #
+    #   gcs_uri = "gs://cloud-samples-data/bigquery/hive-partitioning-samples/autolayout/*"
+    #   source_uri_prefix = "gs://cloud-samples-data/bigquery/hive-partitioning-samples/autolayout/"
+    #   external_data = bigquery.external gcs_uri, format: :parquet do |ext|
+    #     ext.hive_partitioning_mode = :auto
+    #     ext.hive_partitioning_require_partition_filter = true
+    #     ext.hive_partitioning_source_uri_prefix = source_uri_prefix
+    #   end
+    #
+    #   external_data.hive_partitioning? #=> true
+    #   external_data.hive_partitioning_mode #=> "AUTO"
+    #   external_data.hive_partitioning_require_partition_filter? #=> true
+    #   external_data.hive_partitioning_source_uri_prefix #=> source_uri_prefix
+    #
+    def hive_partitioning_mode= mode
+      @gapi.hive_partitioning_options ||= Google::Apis::BigqueryV2::HivePartitioningOptions.new
+      @gapi.hive_partitioning_options.mode = mode.to_s.upcase
+    end
+
+    ###
+    # Whether queries over the table using this external data source require a partition filter that can be used
+    # for partition elimination to be specified. Note that this field should only be true when creating a
+    # permanent external table or querying a temporary external table.
+    #
+    # @return [Boolean] `true` when queries over this table require a partition filter, or `false` otherwise.
+    #
+    # @example
+    #   require "google/cloud/bigquery"
+    #
+    #   bigquery = Google::Cloud::Bigquery.new
+    #
+    #   gcs_uri = "gs://cloud-samples-data/bigquery/hive-partitioning-samples/autolayout/*"
+    #   source_uri_prefix = "gs://cloud-samples-data/bigquery/hive-partitioning-samples/autolayout/"
+    #   external_data = bigquery.external gcs_uri, format: :parquet do |ext|
+    #     ext.hive_partitioning_mode = :auto
+    #     ext.hive_partitioning_require_partition_filter = true
+    #     ext.hive_partitioning_source_uri_prefix = source_uri_prefix
+    #   end
+    #
+    #   external_data.hive_partitioning? #=> true
+    #   external_data.hive_partitioning_mode #=> "AUTO"
+    #   external_data.hive_partitioning_require_partition_filter? #=> true
+    #   external_data.hive_partitioning_source_uri_prefix #=> source_uri_prefix
+    #
+    def hive_partitioning_require_partition_filter?
+      return false unless hive_partitioning?
+      !@gapi.hive_partitioning_options.require_partition_filter.nil?
+    end
+
+    ##
+    # Sets whether queries over the table using this external data source require a partition filter
+    # that can be used for partition elimination to be specified.
+    #
+    # See {#format}, {#hive_partitioning_mode=} and {#hive_partitioning_source_uri_prefix=}.
+    #
+    # @param [Boolean] require_partition_filter `true` if a partition filter must be specified, `false` otherwise.
+    #
+    # @example
+    #   require "google/cloud/bigquery"
+    #
+    #   bigquery = Google::Cloud::Bigquery.new
+    #
+    #   gcs_uri = "gs://cloud-samples-data/bigquery/hive-partitioning-samples/autolayout/*"
+    #   source_uri_prefix = "gs://cloud-samples-data/bigquery/hive-partitioning-samples/autolayout/"
+    #   external_data = bigquery.external gcs_uri, format: :parquet do |ext|
+    #     ext.hive_partitioning_mode = :auto
+    #     ext.hive_partitioning_require_partition_filter = true
+    #     ext.hive_partitioning_source_uri_prefix = source_uri_prefix
+    #   end
+    #
+    #   external_data.hive_partitioning? #=> true
+    #   external_data.hive_partitioning_mode #=> "AUTO"
+    #   external_data.hive_partitioning_require_partition_filter? #=> true
+    #   external_data.hive_partitioning_source_uri_prefix #=> source_uri_prefix
+    #
+    def hive_partitioning_require_partition_filter= require_partition_filter
+      @gapi.hive_partitioning_options ||= Google::Apis::BigqueryV2::HivePartitioningOptions.new
+      @gapi.hive_partitioning_options.require_partition_filter = require_partition_filter
+    end
+
+    ###
+    # The common prefix for all source uris when hive partition detection is requested. The prefix must end
+    # immediately before the partition key encoding begins. For example, consider files following this data
+    # layout:
+    #
+    # ```
+    # gs://bucket/path_to_table/dt=2019-01-01/country=BR/id=7/file.avro
+    # gs://bucket/path_to_table/dt=2018-12-31/country=CA/id=3/file.avro
+    # ```
+    #
+    # When hive partitioning is requested with either `AUTO` or `STRINGS` mode, the common prefix can be either of
+    # `gs://bucket/path_to_table` or `gs://bucket/path_to_table/` (trailing slash does not matter).
+    #
+    # @return [String, nil] The common prefix for all source uris, or `nil` if not set.
+    #
+    # @example
+    #   require "google/cloud/bigquery"
+    #
+    #   bigquery = Google::Cloud::Bigquery.new
+    #
+    #   gcs_uri = "gs://cloud-samples-data/bigquery/hive-partitioning-samples/autolayout/*"
+    #   source_uri_prefix = "gs://cloud-samples-data/bigquery/hive-partitioning-samples/autolayout/"
+    #   external_data = bigquery.external gcs_uri, format: :parquet do |ext|
+    #     ext.hive_partitioning_mode = :auto
+    #     ext.hive_partitioning_require_partition_filter = true
+    #     ext.hive_partitioning_source_uri_prefix = source_uri_prefix
+    #   end
+    #
+    #   external_data.hive_partitioning? #=> true
+    #   external_data.hive_partitioning_mode #=> "AUTO"
+    #   external_data.hive_partitioning_require_partition_filter? #=> true
+    #   external_data.hive_partitioning_source_uri_prefix #=> source_uri_prefix
+    #
+    def hive_partitioning_source_uri_prefix
+      @gapi.hive_partitioning_options.source_uri_prefix if hive_partitioning?
+    end
+
+    ##
+    # Sets the common prefix for all source uris when hive partition detection is requested. The prefix must end
+    # immediately before the partition key encoding begins. For example, consider files following this data
+    # layout:
+    #
+    # ```
+    # gs://bucket/path_to_table/dt=2019-01-01/country=BR/id=7/file.avro
+    # gs://bucket/path_to_table/dt=2018-12-31/country=CA/id=3/file.avro
+    # ```
+    #
+    # When hive partitioning is requested with either `AUTO` or `STRINGS` mode, the common prefix can be either of
+    # `gs://bucket/path_to_table` or `gs://bucket/path_to_table/` (trailing slash does not matter).
+    #
+    # See {#format}, {#hive_partitioning_mode=} and {#hive_partitioning_require_partition_filter=}.
+    #
+    # @param [String] source_uri_prefix The common prefix for all source uris.
+    #
+    # @example
+    #   require "google/cloud/bigquery"
+    #
+    #   bigquery = Google::Cloud::Bigquery.new
+    #
+    #   gcs_uri = "gs://cloud-samples-data/bigquery/hive-partitioning-samples/autolayout/*"
+    #   source_uri_prefix = "gs://cloud-samples-data/bigquery/hive-partitioning-samples/autolayout/"
+    #   external_data = bigquery.external gcs_uri, format: :parquet do |ext|
+    #     ext.hive_partitioning_mode = :auto
+    #     ext.hive_partitioning_require_partition_filter = true
+    #     ext.hive_partitioning_source_uri_prefix = source_uri_prefix
+    #   end
+    #
+    #   external_data.hive_partitioning? #=> true
+    #   external_data.hive_partitioning_mode #=> "AUTO"
+    #   external_data.hive_partitioning_require_partition_filter? #=> true
+    #   external_data.hive_partitioning_source_uri_prefix #=> source_uri_prefix
+    #
+    def hive_partitioning_source_uri_prefix= source_uri_prefix
+      @gapi.hive_partitioning_options ||= Google::Apis::BigqueryV2::HivePartitioningOptions.new
+      @gapi.hive_partitioning_options.source_uri_prefix = source_uri_prefix
+    end
+
     ##
     # @private Google API Client object.
     def to_gapi
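The accessors above pair with the gem's existing `external:` query option. A sketch of querying hive-partitioned Parquet data through a temporary external table, reusing the sample bucket from the docs above (the `my_ext_table` alias is a placeholder):

```ruby
require "google/cloud/bigquery"

bigquery = Google::Cloud::Bigquery.new

gcs_uri = "gs://cloud-samples-data/bigquery/hive-partitioning-samples/autolayout/*"
external_data = bigquery.external gcs_uri, format: :parquet do |ext|
  ext.hive_partitioning_mode = :auto # stored as "AUTO"
  ext.hive_partitioning_source_uri_prefix =
    "gs://cloud-samples-data/bigquery/hive-partitioning-samples/autolayout/"
end

# Query the configured source as a temporary external table.
data = bigquery.query "SELECT * FROM my_ext_table LIMIT 10",
                      external: { my_ext_table: external_data }
data.each { |row| puts row }
```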
data/lib/google/cloud/bigquery/extract_job.rb

@@ -20,15 +20,17 @@ module Google
   # # ExtractJob
   #
   # A {Job} subclass representing an export operation that may be performed
-  # on a {Table}. A ExtractJob instance is
-  # {Table#extract_job}.
+  # on a {Table} or {Model}. A ExtractJob instance is returned when you call
+  # {Project#extract_job}, {Table#extract_job} or {Model#extract_job}.
   #
   # @see https://cloud.google.com/bigquery/docs/exporting-data
-  #   Exporting
+  #   Exporting table data
+  # @see https://cloud.google.com/bigquery-ml/docs/exporting-models
+  #   Exporting models
   # @see https://cloud.google.com/bigquery/docs/reference/v2/jobs Jobs API
   #   reference
   #
-  # @example
+  # @example Export table data
   #   require "google/cloud/bigquery"
   #
   #   bigquery = Google::Cloud::Bigquery.new
@@ -40,6 +42,18 @@ module Google
   #   extract_job.wait_until_done!
   #   extract_job.done? #=> true
   #
+  # @example Export a model
+  #   require "google/cloud/bigquery"
+  #
+  #   bigquery = Google::Cloud::Bigquery.new
+  #   dataset = bigquery.dataset "my_dataset"
+  #   model = dataset.model "my_model"
+  #
+  #   extract_job = model.extract_job "gs://my-bucket/#{model.model_id}"
+  #
+  #   extract_job.wait_until_done!
+  #   extract_job.done? #=> true
+  #
   class ExtractJob < Job
     ##
     # The URI or URIs representing the Google Cloud Storage files to which
@@ -49,71 +63,126 @@ module Google
     end
 
     ##
-    # The table
-    # which {Table#extract_job} was called.
+    # The table or model which is exported.
     #
-    # @return [Table] A table instance
+    # @return [Table, Model, nil] A table or model instance, or `nil`.
     #
     def source
-      table = @gapi.configuration.extract.source_table
-
-
+      if (table = @gapi.configuration.extract.source_table)
+        retrieve_table table.project_id, table.dataset_id, table.table_id
+      elsif (model = @gapi.configuration.extract.source_model)
+        retrieve_model model.project_id, model.dataset_id, model.model_id
+      end
     end
 
     ##
-    #
-    # default is `false`.
+    # Whether the source of the export job is a table. See {#source}.
     #
-    # @return [Boolean] `true` when
+    # @return [Boolean] `true` when the source is a table, `false`
+    #   otherwise.
     #
-    def
-
-      val == "GZIP"
+    def table?
+      !@gapi.configuration.extract.source_table.nil?
     end
 
     ##
-    #
-    # JSON](http://jsonlines.org/). The default is `false`.
+    # Whether the source of the export job is a model. See {#source}.
     #
-    # @return [Boolean] `true` when
+    # @return [Boolean] `true` when the source is a model, `false`
     #   otherwise.
     #
+    def model?
+      !@gapi.configuration.extract.source_model.nil?
+    end
+
+    ##
+    # Checks if the export operation compresses the data using gzip. The
+    # default is `false`. Not applicable when extracting models.
+    #
+    # @return [Boolean] `true` when `GZIP`, `false` if not `GZIP` or not a
+    #   table extraction.
+    def compression?
+      return false unless table?
+      @gapi.configuration.extract.compression == "GZIP"
+    end
+
+    ##
+    # Checks if the destination format for the table data is [newline-delimited
+    # JSON](http://jsonlines.org/). The default is `false`. Not applicable when
+    # extracting models.
+    #
+    # @return [Boolean] `true` when `NEWLINE_DELIMITED_JSON`, `false` if not
+    #   `NEWLINE_DELIMITED_JSON` or not a table extraction.
+    #
     def json?
-
-
+      return false unless table?
+      @gapi.configuration.extract.destination_format == "NEWLINE_DELIMITED_JSON"
     end
 
     ##
-    # Checks if the destination format for the data is CSV. Tables with
+    # Checks if the destination format for the table data is CSV. Tables with
     # nested or repeated fields cannot be exported as CSV. The default is
-    # `true
+    # `true` for tables. Not applicable when extracting models.
     #
-    # @return [Boolean] `true` when `CSV`, `false`
+    # @return [Boolean] `true` when `CSV`, or `false` if not `CSV` or not a
+    #   table extraction.
    #
     def csv?
+      return false unless table?
       val = @gapi.configuration.extract.destination_format
       return true if val.nil?
       val == "CSV"
     end
 
     ##
-    # Checks if the destination format for the data is
-    # [Avro](http://avro.apache.org/). The default is `false`.
+    # Checks if the destination format for the table data is
+    # [Avro](http://avro.apache.org/). The default is `false`. Not applicable
+    # when extracting models.
     #
-    # @return [Boolean] `true` when `AVRO`, `false`
+    # @return [Boolean] `true` when `AVRO`, `false` if not `AVRO` or not a
+    #   table extraction.
     #
     def avro?
+      return false unless table?
+      @gapi.configuration.extract.destination_format == "AVRO"
+    end
+
+    ##
+    # Checks if the destination format for the model is TensorFlow SavedModel.
+    # The default is `true` for models. Not applicable when extracting tables.
+    #
+    # @return [Boolean] `true` when `ML_TF_SAVED_MODEL`, `false` if not
+    #   `ML_TF_SAVED_MODEL` or not a model extraction.
+    #
+    def ml_tf_saved_model?
+      return false unless model?
       val = @gapi.configuration.extract.destination_format
-
+      return true if val.nil?
+      val == "ML_TF_SAVED_MODEL"
+    end
+
+    ##
+    # Checks if the destination format for the model is XGBoost. The default
+    # is `false`. Not applicable when extracting tables.
+    #
+    # @return [Boolean] `true` when `ML_XGBOOST_BOOSTER`, `false` if not
+    #   `ML_XGBOOST_BOOSTER` or not a model extraction.
+    #
+    def ml_xgboost_booster?
+      return false unless model?
+      @gapi.configuration.extract.destination_format == "ML_XGBOOST_BOOSTER"
     end
 
     ##
     # The character or symbol the operation uses to delimit fields in the
-    # exported data. The default is a comma (,).
+    # exported data. The default is a comma (,) for tables. Not applicable
+    # when extracting models.
     #
-    # @return [String] A string containing the character, such as `","
+    # @return [String, nil] A string containing the character, such as `","`,
+    #   `nil` if not a table extraction.
     #
     def delimiter
+      return unless table?
       val = @gapi.configuration.extract.field_delimiter
       val = "," if val.nil?
       val
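Putting the new predicates together: once a model extract finishes, the job reports its source kind, and the table-only readers now guard themselves rather than raising. A sketch reusing the placeholder names from the example docs above:

```ruby
require "google/cloud/bigquery"

bigquery = Google::Cloud::Bigquery.new
model = bigquery.dataset("my_dataset").model "my_model"

extract_job = model.extract_job "gs://my-bucket/#{model.model_id}"
extract_job.wait_until_done!

extract_job.model?             #=> true
extract_job.table?             #=> false
extract_job.source.class       #=> Google::Cloud::Bigquery::Model
extract_job.ml_tf_saved_model? #=> true (the default format for models)

# Table-only accessors short-circuit for model extractions:
extract_job.compression? #=> false
extract_job.delimiter    #=> nil
```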
@@ -121,12 +190,13 @@ module Google
 
     ##
     # Checks if the exported data contains a header row. The default is
-    # `true
+    # `true` for tables. Not applicable when extracting models.
     #
     # @return [Boolean] `true` when the print header configuration is
-    #   present or `nil`, `false`
+    #   present or `nil`, `false` if disabled or not a table extraction.
     #
     def print_header?
+      return false unless table?
       val = @gapi.configuration.extract.print_header
       val = true if val.nil?
       val
@@ -159,12 +229,14 @@ module Google
     # whether to enable extracting applicable column types (such as
     # `TIMESTAMP`) to their corresponding AVRO logical types
     # (`timestamp-micros`), instead of only using their raw types
-    # (`avro-long`).
+    # (`avro-long`). Not applicable when extracting models.
     #
     # @return [Boolean] `true` when applicable column types will use their
-    #   corresponding AVRO logical types, `false`
+    #   corresponding AVRO logical types, `false` if not enabled or not a
+    #   table extraction.
     #
     def use_avro_logical_types?
+      return false unless table?
       @gapi.configuration.extract.use_avro_logical_types
     end
 
@@ -182,19 +254,24 @@ module Google
     #
     # @return [Google::Cloud::Bigquery::ExtractJob::Updater] A job
     #   configuration object for setting query options.
-    def self.from_options service,
+    def self.from_options service, source, storage_files, options
       job_ref = service.job_ref_from options[:job_id], options[:prefix]
       storage_urls = Array(storage_files).map do |url|
         url.respond_to?(:to_gs_url) ? url.to_gs_url : url
       end
       options[:format] ||= Convert.derive_source_format storage_urls.first
+      extract_config = Google::Apis::BigqueryV2::JobConfigurationExtract.new(
+        destination_uris: Array(storage_urls)
+      )
+      if source.is_a? Google::Apis::BigqueryV2::TableReference
+        extract_config.source_table = source
+      elsif source.is_a? Google::Apis::BigqueryV2::ModelReference
+        extract_config.source_model = source
+      end
       job = Google::Apis::BigqueryV2::Job.new(
         job_reference: job_ref,
         configuration: Google::Apis::BigqueryV2::JobConfiguration.new(
-          extract:
-          destination_uris: Array(storage_urls),
-          source_table: table
-          ),
+          extract: extract_config,
           dry_run: options[:dryrun]
         )
       )
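Both public entry points now funnel through `from_options`; the type of the reference passed in decides whether `source_table` or `source_model` is populated on the extract configuration. A sketch with placeholder names:

```ruby
require "google/cloud/bigquery"

bigquery = Google::Cloud::Bigquery.new
dataset = bigquery.dataset "my_dataset"

# A TableReference source populates extract.source_table ...
table_job = dataset.table("my_table").extract_job "gs://my-bucket/my_table.csv"
table_job.table? #=> true

# ... while a ModelReference source populates extract.source_model.
model_job = dataset.model("my_model").extract_job "gs://my-bucket/my_model"
model_job.model? #=> true
```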
@@ -253,7 +330,7 @@ module Google
     end
 
     ##
-    # Sets the compression type.
+    # Sets the compression type. Not applicable when extracting models.
     #
     # @param [String] value The compression type to use for exported
     #   files. Possible values include `GZIP` and `NONE`. The default
@@ -265,7 +342,7 @@ module Google
     end
 
     ##
-    # Sets the field delimiter.
+    # Sets the field delimiter. Not applicable when extracting models.
     #
     # @param [String] value Delimiter to use between fields in the
     #   exported data. Default is <code>,</code>.
@@ -276,14 +353,21 @@ module Google
     end
 
     ##
-    # Sets the destination file format. The default value
+    # Sets the destination file format. The default value for
+    # tables is `csv`. Tables with nested or repeated fields cannot be
+    # exported as CSV. The default value for models is `ml_tf_saved_model`.
     #
-    #
+    # Supported values for tables:
     #
     # * `csv` - CSV
     # * `json` - [Newline-delimited JSON](http://jsonlines.org/)
     # * `avro` - [Avro](http://avro.apache.org/)
     #
+    # Supported values for models:
+    #
+    # * `ml_tf_saved_model` - TensorFlow SavedModel
+    # * `ml_xgboost_booster` - XGBoost Booster
+    #
     # @param [String] new_format The new source format.
     #
     # @!group Attributes
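A sketch of selecting the XGBoost serialization through the updater block, assuming `Model#extract_job` yields the updater the same way `Table#extract_job` does (names are placeholders):

```ruby
require "google/cloud/bigquery"

bigquery = Google::Cloud::Bigquery.new
model = bigquery.dataset("my_dataset").model "my_model"

extract_job = model.extract_job "gs://my-bucket/my_model" do |job|
  job.format = "ml_xgboost_booster" # instead of the default ml_tf_saved_model
end

extract_job.wait_until_done!
extract_job.ml_xgboost_booster? #=> true
```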
@@ -293,7 +377,8 @@ module Google
     end
 
     ##
-    # Print a header row in the exported file.
+    # Print a header row in the exported file. Not applicable when
+    # extracting models.
     #
     # @param [Boolean] value Whether to print out a header row in the
     #   results. Default is `true`.
@@ -307,12 +392,21 @@ module Google
     # Sets the labels to use for the job.
     #
     # @param [Hash] value A hash of user-provided labels associated with
-    #   the job. You can use these to organize and group your jobs.
-    #
-    #
-    #
-    #
-    #
+    #   the job. You can use these to organize and group your jobs.
+    #
+    #   The labels applied to a resource must meet the following requirements:
+    #
+    #   * Each resource can have multiple labels, up to a maximum of 64.
+    #   * Each label must be a key-value pair.
+    #   * Keys have a minimum length of 1 character and a maximum length of
+    #     63 characters, and cannot be empty. Values can be empty, and have
+    #     a maximum length of 63 characters.
+    #   * Keys and values can contain only lowercase letters, numeric characters,
+    #     underscores, and dashes. All characters must use UTF-8 encoding, and
+    #     international characters are allowed.
+    #   * The key portion of a label must be unique. However, you can use the
+    #     same key with multiple resources.
+    #   * Keys must start with a lowercase letter or international character.
     #
     # @!group Attributes
     #
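A sketch of attaching labels that satisfy the rules above (bucket and dataset names are placeholders):

```ruby
require "google/cloud/bigquery"

bigquery = Google::Cloud::Bigquery.new
table = bigquery.dataset("my_dataset").table "my_table"

extract_job = table.extract_job "gs://my-bucket/my_table.csv" do |job|
  # Lowercase keys starting with a letter; values may be empty.
  job.labels = { "env" => "production", "team" => "analytics" }
end
```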
@@ -362,6 +456,16 @@ module Google
         @gapi
       end
     end
+
+    protected
+
+    def retrieve_model project_id, dataset_id, model_id
+      ensure_service!
+      gapi = service.get_project_model project_id, dataset_id, model_id
+      Model.from_gapi_json gapi, service
+    rescue Google::Cloud::NotFoundError
+      nil
+    end
   end
   end
   end