google-cloud-bigquery 1.21.2 → 1.26.0
- checksums.yaml +4 -4
- data/CHANGELOG.md +54 -0
- data/CONTRIBUTING.md +1 -1
- data/lib/google-cloud-bigquery.rb +1 -0
- data/lib/google/cloud/bigquery.rb +1 -1
- data/lib/google/cloud/bigquery/convert.rb +3 -1
- data/lib/google/cloud/bigquery/copy_job.rb +15 -6
- data/lib/google/cloud/bigquery/dataset.rb +43 -20
- data/lib/google/cloud/bigquery/dataset/access.rb +293 -16
- data/lib/google/cloud/bigquery/external.rb +328 -3
- data/lib/google/cloud/bigquery/extract_job.rb +154 -50
- data/lib/google/cloud/bigquery/load_job.rb +197 -34
- data/lib/google/cloud/bigquery/model.rb +164 -8
- data/lib/google/cloud/bigquery/policy.rb +431 -0
- data/lib/google/cloud/bigquery/project.rb +137 -68
- data/lib/google/cloud/bigquery/query_job.rb +24 -12
- data/lib/google/cloud/bigquery/service.rb +50 -11
- data/lib/google/cloud/bigquery/table.rb +174 -37
- data/lib/google/cloud/bigquery/version.rb +1 -1
- metadata +7 -6
data/lib/google/cloud/bigquery/external.rb:

@@ -52,6 +52,24 @@ module Google
       #   # Retrieve the next page of results
       #   data = data.next if data.next?
       #
+      # @example Hive partitioning options:
+      #   require "google/cloud/bigquery"
+      #
+      #   bigquery = Google::Cloud::Bigquery.new
+      #
+      #   gcs_uri = "gs://cloud-samples-data/bigquery/hive-partitioning-samples/autolayout/*"
+      #   source_uri_prefix = "gs://cloud-samples-data/bigquery/hive-partitioning-samples/autolayout/"
+      #   external_data = bigquery.external gcs_uri, format: :parquet do |ext|
+      #     ext.hive_partitioning_mode = :auto
+      #     ext.hive_partitioning_require_partition_filter = true
+      #     ext.hive_partitioning_source_uri_prefix = source_uri_prefix
+      #   end
+      #
+      #   external_data.hive_partitioning? #=> true
+      #   external_data.hive_partitioning_mode #=> "AUTO"
+      #   external_data.hive_partitioning_require_partition_filter? #=> true
+      #   external_data.hive_partitioning_source_uri_prefix #=> source_uri_prefix
+      #
       module External
         ##
         # @private New External from URLs and format
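For orientation, the new hive partitioning options plug into the gem's existing `external:` query option. A minimal sketch; the bucket, data layout, and `sales` alias are illustrative assumptions, not part of this release:

```ruby
require "google/cloud/bigquery"

bigquery = Google::Cloud::Bigquery.new

# Hypothetical hive-partitioned Parquet layout:
#   gs://my-bucket/sales/dt=2020-01-01/country=BR/data.parquet
gcs_uri           = "gs://my-bucket/sales/*"
source_uri_prefix = "gs://my-bucket/sales/"

external_data = bigquery.external gcs_uri, format: :parquet do |ext|
  ext.hive_partitioning_mode = :auto # infer partition key names and types
  ext.hive_partitioning_source_uri_prefix = source_uri_prefix
end

# The inferred partition keys (dt, country) are queryable like columns.
data = bigquery.query "SELECT * FROM sales WHERE dt = '2020-01-01'",
                      external: { sales: external_data }
data.each { |row| puts row[:country] }
```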
@@ -79,7 +97,8 @@ module Google
        # @private Determine source_format from inputs
        def self.source_format_for urls, format
          val = {
-           "csv" => "CSV",
+           "csv" => "CSV",
+           "avro" => "AVRO",
            "json" => "NEWLINE_DELIMITED_JSON",
            "newline_delimited_json" => "NEWLINE_DELIMITED_JSON",
            "sheets" => "GOOGLE_SHEETS",
@@ -87,7 +106,9 @@ module Google
            "datastore" => "DATASTORE_BACKUP",
            "backup" => "DATASTORE_BACKUP",
            "datastore_backup" => "DATASTORE_BACKUP",
-           "bigtable" => "BIGTABLE"
+           "bigtable" => "BIGTABLE",
+           "orc" => "ORC",
+           "parquet" => "PARQUET"
          }[format.to_s.downcase]
          return val unless val.nil?
          Array(urls).each do |url|
@@ -110,7 +131,7 @@ module Google
          when "GOOGLE_SHEETS" then External::SheetsSource
          when "BIGTABLE" then External::BigtableSource
          else
-           # AVRO
+           # AVRO, DATASTORE_BACKUP, PARQUET
            External::DataSource
          end
        end
@@ -148,6 +169,24 @@ module Google
        #   # Retrieve the next page of results
        #   data = data.next if data.next?
        #
+       # @example Hive partitioning options:
+       #   require "google/cloud/bigquery"
+       #
+       #   bigquery = Google::Cloud::Bigquery.new
+       #
+       #   gcs_uri = "gs://cloud-samples-data/bigquery/hive-partitioning-samples/autolayout/*"
+       #   source_uri_prefix = "gs://cloud-samples-data/bigquery/hive-partitioning-samples/autolayout/"
+       #   external_data = bigquery.external gcs_uri, format: :parquet do |ext|
+       #     ext.hive_partitioning_mode = :auto
+       #     ext.hive_partitioning_require_partition_filter = true
+       #     ext.hive_partitioning_source_uri_prefix = source_uri_prefix
+       #   end
+       #
+       #   external_data.hive_partitioning? #=> true
+       #   external_data.hive_partitioning_mode #=> "AUTO"
+       #   external_data.hive_partitioning_require_partition_filter? #=> true
+       #   external_data.hive_partitioning_source_uri_prefix #=> source_uri_prefix
+       #
        class DataSource
          ##
          # @private The Google API Client object.
@@ -302,6 +341,52 @@ module Google
            @gapi.source_format == "BIGTABLE"
          end
 
+          ##
+          # Whether the data format is "ORC".
+          #
+          # @return [Boolean]
+          #
+          # @example
+          #   require "google/cloud/bigquery"
+          #
+          #   bigquery = Google::Cloud::Bigquery.new
+          #
+          #   gcs_uri = "gs://cloud-samples-data/bigquery/hive-partitioning-samples/autolayout/*"
+          #   source_uri_prefix = "gs://cloud-samples-data/bigquery/hive-partitioning-samples/autolayout/"
+          #   external_data = bigquery.external gcs_uri, format: :orc do |ext|
+          #     ext.hive_partitioning_mode = :auto
+          #     ext.hive_partitioning_source_uri_prefix = source_uri_prefix
+          #   end
+          #   external_data.format #=> "ORC"
+          #   external_data.orc? #=> true
+          #
+          def orc?
+            @gapi.source_format == "ORC"
+          end
+
+          ##
+          # Whether the data format is "PARQUET".
+          #
+          # @return [Boolean]
+          #
+          # @example
+          #   require "google/cloud/bigquery"
+          #
+          #   bigquery = Google::Cloud::Bigquery.new
+          #
+          #   gcs_uri = "gs://cloud-samples-data/bigquery/hive-partitioning-samples/autolayout/*"
+          #   source_uri_prefix = "gs://cloud-samples-data/bigquery/hive-partitioning-samples/autolayout/"
+          #   external_data = bigquery.external gcs_uri, format: :parquet do |ext|
+          #     ext.hive_partitioning_mode = :auto
+          #     ext.hive_partitioning_source_uri_prefix = source_uri_prefix
+          #   end
+          #   external_data.format #=> "PARQUET"
+          #   external_data.parquet? #=> true
+          #
+          def parquet?
+            @gapi.source_format == "PARQUET"
+          end
+
          ##
          # The fully-qualified URIs that point to your data in Google Cloud.
          # For Google Cloud Storage URIs: Each URI can contain one '*' wildcard
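These predicates mirror the existing `csv?`/`json?`/`avro?` family. A minimal sketch; the bucket path is an illustrative assumption:

```ruby
require "google/cloud/bigquery"

bigquery = Google::Cloud::Bigquery.new

external_data = bigquery.external "gs://my-bucket/logs/*", format: :orc

external_data.format   #=> "ORC"
external_data.orc?     #=> true
external_data.parquet? #=> false
```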
@@ -536,6 +621,246 @@ module Google
            @gapi.max_bad_records = new_max_bad_records
          end
 
+          ###
+          # Checks if hive partitioning options are set.
+          #
+          # Not all storage formats support hive partitioning. Requesting hive partitioning on an unsupported format
+          # will lead to an error. Currently supported types include: `avro`, `csv`, `json`, `orc` and `parquet`.
+          # If your data is stored in ORC or Parquet on Cloud Storage, see [Querying columnar formats on Cloud
+          # Storage](https://cloud.google.com/bigquery/pricing#columnar_formats_pricing).
+          #
+          # @return [Boolean] `true` when hive partitioning options are set, or `false` otherwise.
+          #
+          # @example
+          #   require "google/cloud/bigquery"
+          #
+          #   bigquery = Google::Cloud::Bigquery.new
+          #
+          #   gcs_uri = "gs://cloud-samples-data/bigquery/hive-partitioning-samples/autolayout/*"
+          #   source_uri_prefix = "gs://cloud-samples-data/bigquery/hive-partitioning-samples/autolayout/"
+          #   external_data = bigquery.external gcs_uri, format: :parquet do |ext|
+          #     ext.hive_partitioning_mode = :auto
+          #     ext.hive_partitioning_require_partition_filter = true
+          #     ext.hive_partitioning_source_uri_prefix = source_uri_prefix
+          #   end
+          #
+          #   external_data.hive_partitioning? #=> true
+          #   external_data.hive_partitioning_mode #=> "AUTO"
+          #   external_data.hive_partitioning_require_partition_filter? #=> true
+          #   external_data.hive_partitioning_source_uri_prefix #=> source_uri_prefix
+          #
+          def hive_partitioning?
+            !@gapi.hive_partitioning_options.nil?
+          end
+
+          ###
+          # The mode of hive partitioning to use when reading data. The following modes are supported:
+          #
+          # 1. `AUTO`: automatically infer partition key name(s) and type(s).
+          # 2. `STRINGS`: automatically infer partition key name(s). All types are interpreted as strings.
+          # 3. `CUSTOM`: partition key schema is encoded in the source URI prefix.
+          #
+          # @return [String, nil] The mode of hive partitioning, or `nil` if not set.
+          #
+          # @example
+          #   require "google/cloud/bigquery"
+          #
+          #   bigquery = Google::Cloud::Bigquery.new
+          #
+          #   gcs_uri = "gs://cloud-samples-data/bigquery/hive-partitioning-samples/autolayout/*"
+          #   source_uri_prefix = "gs://cloud-samples-data/bigquery/hive-partitioning-samples/autolayout/"
+          #   external_data = bigquery.external gcs_uri, format: :parquet do |ext|
+          #     ext.hive_partitioning_mode = :auto
+          #     ext.hive_partitioning_require_partition_filter = true
+          #     ext.hive_partitioning_source_uri_prefix = source_uri_prefix
+          #   end
+          #
+          #   external_data.hive_partitioning? #=> true
+          #   external_data.hive_partitioning_mode #=> "AUTO"
+          #   external_data.hive_partitioning_require_partition_filter? #=> true
+          #   external_data.hive_partitioning_source_uri_prefix #=> source_uri_prefix
+          #
+          def hive_partitioning_mode
+            @gapi.hive_partitioning_options.mode if hive_partitioning?
+          end
+
+          ##
+          # Sets the mode of hive partitioning to use when reading data. The following modes are supported:
+          #
+          # 1. `auto`: automatically infer partition key name(s) and type(s).
+          # 2. `strings`: automatically infer partition key name(s). All types are interpreted as strings.
+          # 3. `custom`: partition key schema is encoded in the source URI prefix.
+          #
+          # Not all storage formats support hive partitioning. Requesting hive partitioning on an unsupported format
+          # will lead to an error. Currently supported types include: `avro`, `csv`, `json`, `orc` and `parquet`.
+          # If your data is stored in ORC or Parquet on Cloud Storage, see [Querying columnar formats on Cloud
+          # Storage](https://cloud.google.com/bigquery/pricing#columnar_formats_pricing).
+          #
+          # See {#format}, {#hive_partitioning_require_partition_filter=} and {#hive_partitioning_source_uri_prefix=}.
+          #
+          # @param [String, Symbol] mode The mode of hive partitioning to use when reading data.
+          #
+          # @example
+          #   require "google/cloud/bigquery"
+          #
+          #   bigquery = Google::Cloud::Bigquery.new
+          #
+          #   gcs_uri = "gs://cloud-samples-data/bigquery/hive-partitioning-samples/autolayout/*"
+          #   source_uri_prefix = "gs://cloud-samples-data/bigquery/hive-partitioning-samples/autolayout/"
+          #   external_data = bigquery.external gcs_uri, format: :parquet do |ext|
+          #     ext.hive_partitioning_mode = :auto
+          #     ext.hive_partitioning_require_partition_filter = true
+          #     ext.hive_partitioning_source_uri_prefix = source_uri_prefix
+          #   end
+          #
+          #   external_data.hive_partitioning? #=> true
+          #   external_data.hive_partitioning_mode #=> "AUTO"
+          #   external_data.hive_partitioning_require_partition_filter? #=> true
+          #   external_data.hive_partitioning_source_uri_prefix #=> source_uri_prefix
+          #
+          def hive_partitioning_mode= mode
+            @gapi.hive_partitioning_options ||= Google::Apis::BigqueryV2::HivePartitioningOptions.new
+            @gapi.hive_partitioning_options.mode = mode.to_s.upcase
+          end
+
+          ###
+          # Whether queries over the table using this external data source require a partition filter that can be used
+          # for partition elimination to be specified. Note that this field should only be true when creating a
+          # permanent external table or querying a temporary external table.
+          #
+          # @return [Boolean] `true` when queries over this table require a partition filter, or `false` otherwise.
+          #
+          # @example
+          #   require "google/cloud/bigquery"
+          #
+          #   bigquery = Google::Cloud::Bigquery.new
+          #
+          #   gcs_uri = "gs://cloud-samples-data/bigquery/hive-partitioning-samples/autolayout/*"
+          #   source_uri_prefix = "gs://cloud-samples-data/bigquery/hive-partitioning-samples/autolayout/"
+          #   external_data = bigquery.external gcs_uri, format: :parquet do |ext|
+          #     ext.hive_partitioning_mode = :auto
+          #     ext.hive_partitioning_require_partition_filter = true
+          #     ext.hive_partitioning_source_uri_prefix = source_uri_prefix
+          #   end
+          #
+          #   external_data.hive_partitioning? #=> true
+          #   external_data.hive_partitioning_mode #=> "AUTO"
+          #   external_data.hive_partitioning_require_partition_filter? #=> true
+          #   external_data.hive_partitioning_source_uri_prefix #=> source_uri_prefix
+          #
+          def hive_partitioning_require_partition_filter?
+            return false unless hive_partitioning?
+            !@gapi.hive_partitioning_options.require_partition_filter.nil?
+          end
+
+          ##
+          # Sets whether queries over the table using this external data source require a partition filter
+          # that can be used for partition elimination to be specified.
+          #
+          # See {#format}, {#hive_partitioning_mode=} and {#hive_partitioning_source_uri_prefix=}.
+          #
+          # @param [Boolean] require_partition_filter `true` if a partition filter must be specified, `false` otherwise.
+          #
+          # @example
+          #   require "google/cloud/bigquery"
+          #
+          #   bigquery = Google::Cloud::Bigquery.new
+          #
+          #   gcs_uri = "gs://cloud-samples-data/bigquery/hive-partitioning-samples/autolayout/*"
+          #   source_uri_prefix = "gs://cloud-samples-data/bigquery/hive-partitioning-samples/autolayout/"
+          #   external_data = bigquery.external gcs_uri, format: :parquet do |ext|
+          #     ext.hive_partitioning_mode = :auto
+          #     ext.hive_partitioning_require_partition_filter = true
+          #     ext.hive_partitioning_source_uri_prefix = source_uri_prefix
+          #   end
+          #
+          #   external_data.hive_partitioning? #=> true
+          #   external_data.hive_partitioning_mode #=> "AUTO"
+          #   external_data.hive_partitioning_require_partition_filter? #=> true
+          #   external_data.hive_partitioning_source_uri_prefix #=> source_uri_prefix
+          #
+          def hive_partitioning_require_partition_filter= require_partition_filter
+            @gapi.hive_partitioning_options ||= Google::Apis::BigqueryV2::HivePartitioningOptions.new
+            @gapi.hive_partitioning_options.require_partition_filter = require_partition_filter
+          end
+
+          ###
+          # The common prefix for all source uris when hive partition detection is requested. The prefix must end
+          # immediately before the partition key encoding begins. For example, consider files following this data
+          # layout:
+          #
+          # ```
+          # gs://bucket/path_to_table/dt=2019-01-01/country=BR/id=7/file.avro
+          # gs://bucket/path_to_table/dt=2018-12-31/country=CA/id=3/file.avro
+          # ```
+          #
+          # When hive partitioning is requested with either `AUTO` or `STRINGS` mode, the common prefix can be either of
+          # `gs://bucket/path_to_table` or `gs://bucket/path_to_table/` (trailing slash does not matter).
+          #
+          # @return [String, nil] The common prefix for all source uris, or `nil` if not set.
+          #
+          # @example
+          #   require "google/cloud/bigquery"
+          #
+          #   bigquery = Google::Cloud::Bigquery.new
+          #
+          #   gcs_uri = "gs://cloud-samples-data/bigquery/hive-partitioning-samples/autolayout/*"
+          #   source_uri_prefix = "gs://cloud-samples-data/bigquery/hive-partitioning-samples/autolayout/"
+          #   external_data = bigquery.external gcs_uri, format: :parquet do |ext|
+          #     ext.hive_partitioning_mode = :auto
+          #     ext.hive_partitioning_require_partition_filter = true
+          #     ext.hive_partitioning_source_uri_prefix = source_uri_prefix
+          #   end
+          #
+          #   external_data.hive_partitioning? #=> true
+          #   external_data.hive_partitioning_mode #=> "AUTO"
+          #   external_data.hive_partitioning_require_partition_filter? #=> true
+          #   external_data.hive_partitioning_source_uri_prefix #=> source_uri_prefix
+          #
+          def hive_partitioning_source_uri_prefix
+            @gapi.hive_partitioning_options.source_uri_prefix if hive_partitioning?
+          end
+
+          ##
+          # Sets the common prefix for all source uris when hive partition detection is requested. The prefix must end
+          # immediately before the partition key encoding begins. For example, consider files following this data
+          # layout:
+          #
+          # ```
+          # gs://bucket/path_to_table/dt=2019-01-01/country=BR/id=7/file.avro
+          # gs://bucket/path_to_table/dt=2018-12-31/country=CA/id=3/file.avro
+          # ```
+          #
+          # When hive partitioning is requested with either `AUTO` or `STRINGS` mode, the common prefix can be either of
+          # `gs://bucket/path_to_table` or `gs://bucket/path_to_table/` (trailing slash does not matter).
+          #
+          # See {#format}, {#hive_partitioning_mode=} and {#hive_partitioning_require_partition_filter=}.
+          #
+          # @param [String] source_uri_prefix The common prefix for all source uris.
+          #
+          # @example
+          #   require "google/cloud/bigquery"
+          #
+          #   bigquery = Google::Cloud::Bigquery.new
+          #
+          #   gcs_uri = "gs://cloud-samples-data/bigquery/hive-partitioning-samples/autolayout/*"
+          #   source_uri_prefix = "gs://cloud-samples-data/bigquery/hive-partitioning-samples/autolayout/"
+          #   external_data = bigquery.external gcs_uri, format: :parquet do |ext|
+          #     ext.hive_partitioning_mode = :auto
+          #     ext.hive_partitioning_require_partition_filter = true
+          #     ext.hive_partitioning_source_uri_prefix = source_uri_prefix
+          #   end
+          #
+          #   external_data.hive_partitioning? #=> true
+          #   external_data.hive_partitioning_mode #=> "AUTO"
+          #   external_data.hive_partitioning_require_partition_filter? #=> true
+          #   external_data.hive_partitioning_source_uri_prefix #=> source_uri_prefix
+          #
+          def hive_partitioning_source_uri_prefix= source_uri_prefix
+            @gapi.hive_partitioning_options ||= Google::Apis::BigqueryV2::HivePartitioningOptions.new
+            @gapi.hive_partitioning_options.source_uri_prefix = source_uri_prefix
+          end
+
          ##
          # @private Google API Client object.
          def to_gapi
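The `CUSTOM` mode described above expects the partition key schema to be spelled out in the prefix, using BigQuery's `{key:TYPE}` encoding. A sketch under that assumption; the bucket and keys are illustrative:

```ruby
require "google/cloud/bigquery"

bigquery = Google::Cloud::Bigquery.new

# CUSTOM mode: the partition key schema is declared in the prefix itself
# (bucket and layout are hypothetical).
external_data = bigquery.external "gs://my-bucket/sales/*", format: :parquet do |ext|
  ext.hive_partitioning_mode = :custom
  ext.hive_partitioning_source_uri_prefix =
    "gs://my-bucket/sales/{dt:DATE}/{country:STRING}"
end

external_data.hive_partitioning_mode #=> "CUSTOM"
```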
data/lib/google/cloud/bigquery/extract_job.rb:

@@ -20,15 +20,17 @@ module Google
      # # ExtractJob
      #
      # A {Job} subclass representing an export operation that may be performed
-     # on a {Table}. A ExtractJob instance is
-     # {Table#extract_job}.
+     # on a {Table} or {Model}. A ExtractJob instance is returned when you call
+     # {Project#extract_job}, {Table#extract_job} or {Model#extract_job}.
      #
      # @see https://cloud.google.com/bigquery/docs/exporting-data
-     #   Exporting
+     #   Exporting table data
+     # @see https://cloud.google.com/bigquery-ml/docs/exporting-models
+     #   Exporting models
      # @see https://cloud.google.com/bigquery/docs/reference/v2/jobs Jobs API
      #   reference
      #
-     # @example
+     # @example Export table data
      #   require "google/cloud/bigquery"
      #
      #   bigquery = Google::Cloud::Bigquery.new
@@ -40,6 +42,18 @@ module Google
      #   extract_job.wait_until_done!
      #   extract_job.done? #=> true
      #
+     # @example Export a model
+     #   require "google/cloud/bigquery"
+     #
+     #   bigquery = Google::Cloud::Bigquery.new
+     #   dataset = bigquery.dataset "my_dataset"
+     #   model = dataset.model "my_model"
+     #
+     #   extract_job = model.extract_job "gs://my-bucket/#{model.model_id}"
+     #
+     #   extract_job.wait_until_done!
+     #   extract_job.done? #=> true
+     #
      class ExtractJob < Job
        ##
        # The URI or URIs representing the Google Cloud Storage files to which
@@ -49,71 +63,126 @@ module Google
        end
 
        ##
-       # The table
-       # which {Table#extract_job} was called.
+       # The table or model which is exported.
        #
-       # @return [Table] A table instance
+       # @return [Table, Model, nil] A table or model instance, or `nil`.
        #
        def source
-         table = @gapi.configuration.extract.source_table
-
-
+         if (table = @gapi.configuration.extract.source_table)
+           retrieve_table table.project_id, table.dataset_id, table.table_id
+         elsif (model = @gapi.configuration.extract.source_model)
+           retrieve_model model.project_id, model.dataset_id, model.model_id
+         end
        end
 
        ##
-       #
-       # default is `false`.
+       # Whether the source of the export job is a table. See {#source}.
        #
-       # @return [Boolean] `true` when
+       # @return [Boolean] `true` when the source is a table, `false`
+       #   otherwise.
        #
-       def
-
-         val == "GZIP"
+       def table?
+         !@gapi.configuration.extract.source_table.nil?
        end
 
        ##
-       #
-       # JSON](http://jsonlines.org/). The default is `false`.
+       # Whether the source of the export job is a model. See {#source}.
        #
-       # @return [Boolean] `true` when
+       # @return [Boolean] `true` when the source is a model, `false`
        #   otherwise.
        #
+       def model?
+         !@gapi.configuration.extract.source_model.nil?
+       end
+
+       ##
+       # Checks if the export operation compresses the data using gzip. The
+       # default is `false`. Not applicable when extracting models.
+       #
+       # @return [Boolean] `true` when `GZIP`, `false` if not `GZIP` or not a
+       #   table extraction.
+       def compression?
+         return false unless table?
+         @gapi.configuration.extract.compression == "GZIP"
+       end
+
+       ##
+       # Checks if the destination format for the table data is [newline-delimited
+       # JSON](http://jsonlines.org/). The default is `false`. Not applicable when
+       # extracting models.
+       #
+       # @return [Boolean] `true` when `NEWLINE_DELIMITED_JSON`, `false` if not
+       #   `NEWLINE_DELIMITED_JSON` or not a table extraction.
+       #
        def json?
-
-
+         return false unless table?
+         @gapi.configuration.extract.destination_format == "NEWLINE_DELIMITED_JSON"
        end
 
        ##
-       # Checks if the destination format for the data is CSV. Tables with
+       # Checks if the destination format for the table data is CSV. Tables with
        # nested or repeated fields cannot be exported as CSV. The default is
-       # `true
+       # `true` for tables. Not applicable when extracting models.
        #
-       # @return [Boolean] `true` when `CSV`, `false`
+       # @return [Boolean] `true` when `CSV`, or `false` if not `CSV` or not a
+       #   table extraction.
        #
        def csv?
+         return false unless table?
          val = @gapi.configuration.extract.destination_format
          return true if val.nil?
          val == "CSV"
        end
 
        ##
-       # Checks if the destination format for the data is
-       # [Avro](http://avro.apache.org/). The default is `false`.
+       # Checks if the destination format for the table data is
+       # [Avro](http://avro.apache.org/). The default is `false`. Not applicable
+       # when extracting models.
        #
-       # @return [Boolean] `true` when `AVRO`, `false`
+       # @return [Boolean] `true` when `AVRO`, `false` if not `AVRO` or not a
+       #   table extraction.
        #
        def avro?
+         return false unless table?
+         @gapi.configuration.extract.destination_format == "AVRO"
+       end
+
+       ##
+       # Checks if the destination format for the model is TensorFlow SavedModel.
+       # The default is `true` for models. Not applicable when extracting tables.
+       #
+       # @return [Boolean] `true` when `ML_TF_SAVED_MODEL`, `false` if not
+       #   `ML_TF_SAVED_MODEL` or not a model extraction.
+       #
+       def ml_tf_saved_model?
+         return false unless model?
          val = @gapi.configuration.extract.destination_format
-
+         return true if val.nil?
+         val == "ML_TF_SAVED_MODEL"
+       end
+
+       ##
+       # Checks if the destination format for the model is XGBoost. The default
+       # is `false`. Not applicable when extracting tables.
+       #
+       # @return [Boolean] `true` when `ML_XGBOOST_BOOSTER`, `false` if not
+       #   `ML_XGBOOST_BOOSTER` or not a model extraction.
+       #
+       def ml_xgboost_booster?
+         return false unless model?
+         @gapi.configuration.extract.destination_format == "ML_XGBOOST_BOOSTER"
        end
 
        ##
        # The character or symbol the operation uses to delimit fields in the
-       # exported data. The default is a comma (,).
+       # exported data. The default is a comma (,) for tables. Not applicable
+       # when extracting models.
        #
-       # @return [String] A string containing the character, such as `","
+       # @return [String, nil] A string containing the character, such as `","`,
+       #   `nil` if not a table extraction.
        #
        def delimiter
+         return unless table?
          val = @gapi.configuration.extract.field_delimiter
          val = "," if val.nil?
          val
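Together, `source`, `table?`, and `model?` let a caller inspect an extract job without knowing in advance what it exported. A minimal sketch; the job ID is an illustrative assumption:

```ruby
require "google/cloud/bigquery"

bigquery = Google::Cloud::Bigquery.new
job = bigquery.job "my_extract_job_id" # hypothetical job ID

if job.is_a?(Google::Cloud::Bigquery::ExtractJob) && job.model?
  # #source now resolves to a Model for model exports.
  puts "exported model: #{job.source.model_id}"
  puts "TensorFlow SavedModel? #{job.ml_tf_saved_model?}"
elsif job.is_a?(Google::Cloud::Bigquery::ExtractJob) && job.table?
  puts "exported table: #{job.source.table_id} (CSV? #{job.csv?})"
end
```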
@@ -121,12 +190,13 @@ module Google
 
        ##
        # Checks if the exported data contains a header row. The default is
-       # `true
+       # `true` for tables. Not applicable when extracting models.
        #
        # @return [Boolean] `true` when the print header configuration is
-       #   present or `nil`, `false`
+       #   present or `nil`, `false` if disabled or not a table extraction.
        #
        def print_header?
+         return false unless table?
          val = @gapi.configuration.extract.print_header
          val = true if val.nil?
          val
@@ -159,12 +229,14 @@ module Google
        #   whether to enable extracting applicable column types (such as
        #   `TIMESTAMP`) to their corresponding AVRO logical types
        #   (`timestamp-micros`), instead of only using their raw types
-       #   (`avro-long`).
+       #   (`avro-long`). Not applicable when extracting models.
        #
        # @return [Boolean] `true` when applicable column types will use their
-       #   corresponding AVRO logical types, `false`
+       #   corresponding AVRO logical types, `false` if not enabled or not a
+       #   table extraction.
        #
        def use_avro_logical_types?
+         return false unless table?
          @gapi.configuration.extract.use_avro_logical_types
        end
 
@@ -182,19 +254,24 @@ module Google
        #
        # @return [Google::Cloud::Bigquery::ExtractJob::Updater] A job
        #   configuration object for setting query options.
-       def self.from_options service,
+       def self.from_options service, source, storage_files, options
          job_ref = service.job_ref_from options[:job_id], options[:prefix]
          storage_urls = Array(storage_files).map do |url|
            url.respond_to?(:to_gs_url) ? url.to_gs_url : url
          end
          options[:format] ||= Convert.derive_source_format storage_urls.first
+         extract_config = Google::Apis::BigqueryV2::JobConfigurationExtract.new(
+           destination_uris: Array(storage_urls)
+         )
+         if source.is_a? Google::Apis::BigqueryV2::TableReference
+           extract_config.source_table = source
+         elsif source.is_a? Google::Apis::BigqueryV2::ModelReference
+           extract_config.source_model = source
+         end
          job = Google::Apis::BigqueryV2::Job.new(
            job_reference: job_ref,
            configuration: Google::Apis::BigqueryV2::JobConfiguration.new(
-             extract:
-               destination_uris: Array(storage_urls),
-               source_table: table
-             ),
+             extract: extract_config,
              dry_run: options[:dryrun]
            )
          )
@@ -253,7 +330,7 @@ module Google
        end
 
        ##
-       # Sets the compression type.
+       # Sets the compression type. Not applicable when extracting models.
        #
        # @param [String] value The compression type to use for exported
        #   files. Possible values include `GZIP` and `NONE`. The default
@@ -265,7 +342,7 @@ module Google
        end
 
        ##
-       # Sets the field delimiter.
+       # Sets the field delimiter. Not applicable when extracting models.
        #
        # @param [String] value Delimiter to use between fields in the
        #   exported data. Default is <code>,</code>.
@@ -276,14 +353,21 @@ module Google
        end
 
        ##
-       # Sets the destination file format. The default value
+       # Sets the destination file format. The default value for
+       # tables is `csv`. Tables with nested or repeated fields cannot be
+       # exported as CSV. The default value for models is `ml_tf_saved_model`.
        #
-       #
+       # Supported values for tables:
        #
        # * `csv` - CSV
        # * `json` - [Newline-delimited JSON](http://jsonlines.org/)
        # * `avro` - [Avro](http://avro.apache.org/)
        #
+       # Supported values for models:
+       #
+       # * `ml_tf_saved_model` - TensorFlow SavedModel
+       # * `ml_xgboost_booster` - XGBoost Booster
+       #
        # @param [String] new_format The new source format.
        #
        # @!group Attributes
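A model export can request the XGBoost serialization through the updater. This sketch assumes `Model#extract_job` yields the updater in block form the same way `Table#extract_job` does; the dataset and model names are illustrative:

```ruby
require "google/cloud/bigquery"

bigquery = Google::Cloud::Bigquery.new
dataset  = bigquery.dataset "my_dataset"
model    = dataset.model "my_model"

# Request the XGBoost serialization instead of the default SavedModel.
extract_job = model.extract_job "gs://my-bucket/#{model.model_id}" do |job|
  job.format = "ml_xgboost_booster"
end

extract_job.wait_until_done!
extract_job.ml_xgboost_booster? #=> true
```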
@@ -293,7 +377,8 @@ module Google
        end
 
        ##
-       # Print a header row in the exported file.
+       # Print a header row in the exported file. Not applicable when
+       # extracting models.
        #
        # @param [Boolean] value Whether to print out a header row in the
        #   results. Default is `true`.
@@ -307,12 +392,21 @@ module Google
        # Sets the labels to use for the job.
        #
        # @param [Hash] value A hash of user-provided labels associated with
-       #   the job. You can use these to organize and group your jobs.
-       #
-       #
-       #
-       #
-       #
+       #   the job. You can use these to organize and group your jobs.
+       #
+       #   The labels applied to a resource must meet the following requirements:
+       #
+       #   * Each resource can have multiple labels, up to a maximum of 64.
+       #   * Each label must be a key-value pair.
+       #   * Keys have a minimum length of 1 character and a maximum length of
+       #     63 characters, and cannot be empty. Values can be empty, and have
+       #     a maximum length of 63 characters.
+       #   * Keys and values can contain only lowercase letters, numeric characters,
+       #     underscores, and dashes. All characters must use UTF-8 encoding, and
+       #     international characters are allowed.
+       #   * The key portion of a label must be unique. However, you can use the
+       #     same key with multiple resources.
+       #   * Keys must start with a lowercase letter or international character.
        #
        # @!group Attributes
        #
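The same label constraints apply when labels are set through the block form of `Table#extract_job`. A minimal sketch with illustrative names:

```ruby
require "google/cloud/bigquery"

bigquery = Google::Cloud::Bigquery.new
dataset  = bigquery.dataset "my_dataset"
table    = dataset.table "my_table"

extract_job = table.extract_job "gs://my-bucket/my_table.csv" do |job|
  # Keys and values must follow the requirements listed above.
  job.labels = { "env" => "prod", "team" => "analytics" }
end
extract_job.wait_until_done!
```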
@@ -362,6 +456,16 @@ module Google
            @gapi
          end
        end
+
+       protected
+
+       def retrieve_model project_id, dataset_id, model_id
+         ensure_service!
+         gapi = service.get_project_model project_id, dataset_id, model_id
+         Model.from_gapi_json gapi, service
+       rescue Google::Cloud::NotFoundError
+         nil
+       end
      end
    end
  end