google-cloud-bigquery 1.21.1 → 1.27.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -45,9 +45,30 @@ module Google
45
45
  # data = bigquery.query "SELECT * FROM my_ext_table",
46
46
  # external: { my_ext_table: csv_table }
47
47
  #
48
+ # # Iterate over the first page of results
48
49
  # data.each do |row|
49
50
  # puts row[:name]
50
51
  # end
52
+ # # Retrieve the next page of results
53
+ # data = data.next if data.next?
54
+ #
55
+ # @example Hive partitioning options:
56
+ # require "google/cloud/bigquery"
57
+ #
58
+ # bigquery = Google::Cloud::Bigquery.new
59
+ #
60
+ # gcs_uri = "gs://cloud-samples-data/bigquery/hive-partitioning-samples/autolayout/*"
61
+ # source_uri_prefix = "gs://cloud-samples-data/bigquery/hive-partitioning-samples/autolayout/"
62
+ # external_data = bigquery.external gcs_uri, format: :parquet do |ext|
63
+ # ext.hive_partitioning_mode = :auto
64
+ # ext.hive_partitioning_require_partition_filter = true
65
+ # ext.hive_partitioning_source_uri_prefix = source_uri_prefix
66
+ # end
67
+ #
68
+ # external_data.hive_partitioning? #=> true
69
+ # external_data.hive_partitioning_mode #=> "AUTO"
70
+ # external_data.hive_partitioning_require_partition_filter? #=> true
71
+ # external_data.hive_partitioning_source_uri_prefix #=> source_uri_prefix
51
72
  #
52
73
  module External
53
74
  ##
@@ -76,7 +97,8 @@ module Google
76
97
  # @private Determine source_format from inputs
77
98
  def self.source_format_for urls, format
78
99
  val = {
79
- "csv" => "CSV", "avro" => "AVRO",
100
+ "csv" => "CSV",
101
+ "avro" => "AVRO",
80
102
  "json" => "NEWLINE_DELIMITED_JSON",
81
103
  "newline_delimited_json" => "NEWLINE_DELIMITED_JSON",
82
104
  "sheets" => "GOOGLE_SHEETS",
@@ -84,7 +106,9 @@ module Google
84
106
  "datastore" => "DATASTORE_BACKUP",
85
107
  "backup" => "DATASTORE_BACKUP",
86
108
  "datastore_backup" => "DATASTORE_BACKUP",
87
- "bigtable" => "BIGTABLE"
109
+ "bigtable" => "BIGTABLE",
110
+ "orc" => "ORC",
111
+ "parquet" => "PARQUET"
88
112
  }[format.to_s.downcase]
89
113
  return val unless val.nil?
90
114
  Array(urls).each do |url|
@@ -107,7 +131,7 @@ module Google
107
131
  when "GOOGLE_SHEETS" then External::SheetsSource
108
132
  when "BIGTABLE" then External::BigtableSource
109
133
  else
110
- # AVRO and DATASTORE_BACKUP
134
+ # AVRO, DATASTORE_BACKUP, ORC, PARQUET
111
135
  External::DataSource
112
136
  end
113
137
  end
@@ -138,9 +162,30 @@ module Google
138
162
  # data = bigquery.query "SELECT * FROM my_ext_table",
139
163
  # external: { my_ext_table: avro_table }
140
164
  #
165
+ # # Iterate over the first page of results
141
166
  # data.each do |row|
142
167
  # puts row[:name]
143
168
  # end
169
+ # # Retrieve the next page of results
170
+ # data = data.next if data.next?
171
+ #
172
+ # @example Hive partitioning options:
173
+ # require "google/cloud/bigquery"
174
+ #
175
+ # bigquery = Google::Cloud::Bigquery.new
176
+ #
177
+ # gcs_uri = "gs://cloud-samples-data/bigquery/hive-partitioning-samples/autolayout/*"
178
+ # source_uri_prefix = "gs://cloud-samples-data/bigquery/hive-partitioning-samples/autolayout/"
179
+ # external_data = bigquery.external gcs_uri, format: :parquet do |ext|
180
+ # ext.hive_partitioning_mode = :auto
181
+ # ext.hive_partitioning_require_partition_filter = true
182
+ # ext.hive_partitioning_source_uri_prefix = source_uri_prefix
183
+ # end
184
+ #
185
+ # external_data.hive_partitioning? #=> true
186
+ # external_data.hive_partitioning_mode #=> "AUTO"
187
+ # external_data.hive_partitioning_require_partition_filter? #=> true
188
+ # external_data.hive_partitioning_source_uri_prefix #=> source_uri_prefix
144
189
  #
145
190
  class DataSource
146
191
  ##
@@ -296,6 +341,52 @@ module Google
296
341
  @gapi.source_format == "BIGTABLE"
297
342
  end
298
343
 
344
+ ##
345
+ # Whether the data format is "ORC".
346
+ #
347
+ # @return [Boolean]
348
+ #
349
+ # @example
350
+ # require "google/cloud/bigquery"
351
+ #
352
+ # bigquery = Google::Cloud::Bigquery.new
353
+ #
354
+ # gcs_uri = "gs://cloud-samples-data/bigquery/hive-partitioning-samples/autolayout/*"
355
+ # source_uri_prefix = "gs://cloud-samples-data/bigquery/hive-partitioning-samples/autolayout/"
356
+ # external_data = bigquery.external gcs_uri, format: :orc do |ext|
357
+ # ext.hive_partitioning_mode = :auto
358
+ # ext.hive_partitioning_source_uri_prefix = source_uri_prefix
359
+ # end
360
+ # external_data.format #=> "ORC"
361
+ # external_data.orc? #=> true
362
+ #
363
+ def orc? # predicate: is this external source's format ORC?
364
+ @gapi.source_format == "ORC" # exact, case-sensitive comparison with the format string held on the API object
365
+ end
366
+
367
+ ##
368
+ # Whether the data format is "PARQUET".
369
+ #
370
+ # @return [Boolean]
371
+ #
372
+ # @example
373
+ # require "google/cloud/bigquery"
374
+ #
375
+ # bigquery = Google::Cloud::Bigquery.new
376
+ #
377
+ # gcs_uri = "gs://cloud-samples-data/bigquery/hive-partitioning-samples/autolayout/*"
378
+ # source_uri_prefix = "gs://cloud-samples-data/bigquery/hive-partitioning-samples/autolayout/"
379
+ # external_data = bigquery.external gcs_uri, format: :parquet do |ext|
380
+ # ext.hive_partitioning_mode = :auto
381
+ # ext.hive_partitioning_source_uri_prefix = source_uri_prefix
382
+ # end
383
+ # external_data.format #=> "PARQUET"
384
+ # external_data.parquet? #=> true
385
+ #
386
+ def parquet? # predicate: is this external source's format PARQUET?
387
+ @gapi.source_format == "PARQUET" # exact, case-sensitive comparison with the format string held on the API object
388
+ end
389
+
299
390
  ##
300
391
  # The fully-qualified URIs that point to your data in Google Cloud.
301
392
  # For Google Cloud Storage URIs: Each URI can contain one '*' wildcard
@@ -530,6 +621,246 @@ module Google
530
621
  @gapi.max_bad_records = new_max_bad_records
531
622
  end
532
623
 
624
+ ###
625
+ # Checks if hive partitioning options are set.
626
+ #
627
+ # Not all storage formats support hive partitioning. Requesting hive partitioning on an unsupported format
628
+ # will lead to an error. Currently supported types include: `avro`, `csv`, `json`, `orc` and `parquet`.
629
+ # If your data is stored in ORC or Parquet on Cloud Storage, see [Querying columnar formats on Cloud
630
+ # Storage](https://cloud.google.com/bigquery/pricing#columnar_formats_pricing).
631
+ #
632
+ # @return [Boolean] `true` when hive partitioning options are set, or `false` otherwise.
633
+ #
634
+ # @example
635
+ # require "google/cloud/bigquery"
636
+ #
637
+ # bigquery = Google::Cloud::Bigquery.new
638
+ #
639
+ # gcs_uri = "gs://cloud-samples-data/bigquery/hive-partitioning-samples/autolayout/*"
640
+ # source_uri_prefix = "gs://cloud-samples-data/bigquery/hive-partitioning-samples/autolayout/"
641
+ # external_data = bigquery.external gcs_uri, format: :parquet do |ext|
642
+ # ext.hive_partitioning_mode = :auto
643
+ # ext.hive_partitioning_require_partition_filter = true
644
+ # ext.hive_partitioning_source_uri_prefix = source_uri_prefix
645
+ # end
646
+ #
647
+ # external_data.hive_partitioning? #=> true
648
+ # external_data.hive_partitioning_mode #=> "AUTO"
649
+ # external_data.hive_partitioning_require_partition_filter? #=> true
650
+ # external_data.hive_partitioning_source_uri_prefix #=> source_uri_prefix
651
+ #
652
+ def hive_partitioning?
653
+ !@gapi.hive_partitioning_options.nil? # true whenever an options object is present; the hive_partitioning_* setters create one on first use
654
+ end
655
+
656
+ ###
657
+ # The mode of hive partitioning to use when reading data. The following modes are supported:
658
+ #
659
+ # 1. `AUTO`: automatically infer partition key name(s) and type(s).
660
+ # 2. `STRINGS`: automatically infer partition key name(s). All types are interpreted as strings.
661
+ # 3. `CUSTOM`: partition key schema is encoded in the source URI prefix.
662
+ #
663
+ # @return [String, nil] The mode of hive partitioning, or `nil` if not set.
664
+ #
665
+ # @example
666
+ # require "google/cloud/bigquery"
667
+ #
668
+ # bigquery = Google::Cloud::Bigquery.new
669
+ #
670
+ # gcs_uri = "gs://cloud-samples-data/bigquery/hive-partitioning-samples/autolayout/*"
671
+ # source_uri_prefix = "gs://cloud-samples-data/bigquery/hive-partitioning-samples/autolayout/"
672
+ # external_data = bigquery.external gcs_uri, format: :parquet do |ext|
673
+ # ext.hive_partitioning_mode = :auto
674
+ # ext.hive_partitioning_require_partition_filter = true
675
+ # ext.hive_partitioning_source_uri_prefix = source_uri_prefix
676
+ # end
677
+ #
678
+ # external_data.hive_partitioning? #=> true
679
+ # external_data.hive_partitioning_mode #=> "AUTO"
680
+ # external_data.hive_partitioning_require_partition_filter? #=> true
681
+ # external_data.hive_partitioning_source_uri_prefix #=> source_uri_prefix
682
+ #
683
+ def hive_partitioning_mode
684
+ @gapi.hive_partitioning_options.mode if hive_partitioning? # guarded: evaluates to nil instead of raising when no options object exists
685
+ end
686
+
687
+ ##
688
+ # Sets the mode of hive partitioning to use when reading data. The following modes are supported:
689
+ #
690
+ # 1. `auto`: automatically infer partition key name(s) and type(s).
691
+ # 2. `strings`: automatically infer partition key name(s). All types are interpreted as strings.
692
+ # 3. `custom`: partition key schema is encoded in the source URI prefix.
693
+ #
694
+ # Not all storage formats support hive partitioning. Requesting hive partitioning on an unsupported format
695
+ # will lead to an error. Currently supported types include: `avro`, `csv`, `json`, `orc` and `parquet`.
696
+ # If your data is stored in ORC or Parquet on Cloud Storage, see [Querying columnar formats on Cloud
697
+ # Storage](https://cloud.google.com/bigquery/pricing#columnar_formats_pricing).
698
+ #
699
+ # See {#format}, {#hive_partitioning_require_partition_filter=} and {#hive_partitioning_source_uri_prefix=}.
700
+ #
701
+ # @param [String, Symbol] mode The mode of hive partitioning to use when reading data.
702
+ #
703
+ # @example
704
+ # require "google/cloud/bigquery"
705
+ #
706
+ # bigquery = Google::Cloud::Bigquery.new
707
+ #
708
+ # gcs_uri = "gs://cloud-samples-data/bigquery/hive-partitioning-samples/autolayout/*"
709
+ # source_uri_prefix = "gs://cloud-samples-data/bigquery/hive-partitioning-samples/autolayout/"
710
+ # external_data = bigquery.external gcs_uri, format: :parquet do |ext|
711
+ # ext.hive_partitioning_mode = :auto
712
+ # ext.hive_partitioning_require_partition_filter = true
713
+ # ext.hive_partitioning_source_uri_prefix = source_uri_prefix
714
+ # end
715
+ #
716
+ # external_data.hive_partitioning? #=> true
717
+ # external_data.hive_partitioning_mode #=> "AUTO"
718
+ # external_data.hive_partitioning_require_partition_filter? #=> true
719
+ # external_data.hive_partitioning_source_uri_prefix #=> source_uri_prefix
720
+ #
721
+ def hive_partitioning_mode= mode
722
+ @gapi.hive_partitioning_options ||= Google::Apis::BigqueryV2::HivePartitioningOptions.new # create the options object on first use; keep an existing one
723
+ @gapi.hive_partitioning_options.mode = mode.to_s.upcase # symbols and any-case strings are stored upper-cased (:auto -> "AUTO"); no validation here. NOTE(review): nil is stored as "" -- confirm intended
724
+ end
725
+
726
+ ###
727
+ # Whether queries over the table using this external data source require a partition filter that can be used
728
+ # for partition elimination to be specified. Note that this field should only be true when creating a
729
+ # permanent external table or querying a temporary external table.
730
+ #
731
+ # @return [Boolean] `true` when queries over this table require a partition filter, or `false` otherwise.
732
+ #
733
+ # @example
734
+ # require "google/cloud/bigquery"
735
+ #
736
+ # bigquery = Google::Cloud::Bigquery.new
737
+ #
738
+ # gcs_uri = "gs://cloud-samples-data/bigquery/hive-partitioning-samples/autolayout/*"
739
+ # source_uri_prefix = "gs://cloud-samples-data/bigquery/hive-partitioning-samples/autolayout/"
740
+ # external_data = bigquery.external gcs_uri, format: :parquet do |ext|
741
+ # ext.hive_partitioning_mode = :auto
742
+ # ext.hive_partitioning_require_partition_filter = true
743
+ # ext.hive_partitioning_source_uri_prefix = source_uri_prefix
744
+ # end
745
+ #
746
+ # external_data.hive_partitioning? #=> true
747
+ # external_data.hive_partitioning_mode #=> "AUTO"
748
+ # external_data.hive_partitioning_require_partition_filter? #=> true
749
+ # external_data.hive_partitioning_source_uri_prefix #=> source_uri_prefix
750
+ #
751
+ def hive_partitioning_require_partition_filter?
752
+ return false unless hive_partitioning?
753
+ !@gapi.hive_partitioning_options.require_partition_filter.nil?
754
+ end
755
+
756
+ ##
757
+ # Sets whether queries over the table using this external data source require a partition filter
758
+ # that can be used for partition elimination to be specified.
759
+ #
760
+ # See {#format}, {#hive_partitioning_mode=} and {#hive_partitioning_source_uri_prefix=}.
761
+ #
762
+ # @param [Boolean] require_partition_filter `true` if a partition filter must be specified, `false` otherwise.
763
+ #
764
+ # @example
765
+ # require "google/cloud/bigquery"
766
+ #
767
+ # bigquery = Google::Cloud::Bigquery.new
768
+ #
769
+ # gcs_uri = "gs://cloud-samples-data/bigquery/hive-partitioning-samples/autolayout/*"
770
+ # source_uri_prefix = "gs://cloud-samples-data/bigquery/hive-partitioning-samples/autolayout/"
771
+ # external_data = bigquery.external gcs_uri, format: :parquet do |ext|
772
+ # ext.hive_partitioning_mode = :auto
773
+ # ext.hive_partitioning_require_partition_filter = true
774
+ # ext.hive_partitioning_source_uri_prefix = source_uri_prefix
775
+ # end
776
+ #
777
+ # external_data.hive_partitioning? #=> true
778
+ # external_data.hive_partitioning_mode #=> "AUTO"
779
+ # external_data.hive_partitioning_require_partition_filter? #=> true
780
+ # external_data.hive_partitioning_source_uri_prefix #=> source_uri_prefix
781
+ #
782
+ def hive_partitioning_require_partition_filter= require_partition_filter
783
+ @gapi.hive_partitioning_options ||= Google::Apis::BigqueryV2::HivePartitioningOptions.new # create the options object on first use; keep an existing one
784
+ @gapi.hive_partitioning_options.require_partition_filter = require_partition_filter # stored exactly as given; no Boolean coercion is applied here
785
+ end
786
+
787
+ ###
788
+ # The common prefix for all source uris when hive partition detection is requested. The prefix must end
789
+ # immediately before the partition key encoding begins. For example, consider files following this data
790
+ # layout:
791
+ #
792
+ # ```
793
+ # gs://bucket/path_to_table/dt=2019-01-01/country=BR/id=7/file.avro
794
+ # gs://bucket/path_to_table/dt=2018-12-31/country=CA/id=3/file.avro
795
+ # ```
796
+ #
797
+ # When hive partitioning is requested with either `AUTO` or `STRINGS` mode, the common prefix can be either of
798
+ # `gs://bucket/path_to_table` or `gs://bucket/path_to_table/` (trailing slash does not matter).
799
+ #
800
+ # @return [String, nil] The common prefix for all source uris, or `nil` if not set.
801
+ #
802
+ # @example
803
+ # require "google/cloud/bigquery"
804
+ #
805
+ # bigquery = Google::Cloud::Bigquery.new
806
+ #
807
+ # gcs_uri = "gs://cloud-samples-data/bigquery/hive-partitioning-samples/autolayout/*"
808
+ # source_uri_prefix = "gs://cloud-samples-data/bigquery/hive-partitioning-samples/autolayout/"
809
+ # external_data = bigquery.external gcs_uri, format: :parquet do |ext|
810
+ # ext.hive_partitioning_mode = :auto
811
+ # ext.hive_partitioning_require_partition_filter = true
812
+ # ext.hive_partitioning_source_uri_prefix = source_uri_prefix
813
+ # end
814
+ #
815
+ # external_data.hive_partitioning? #=> true
816
+ # external_data.hive_partitioning_mode #=> "AUTO"
817
+ # external_data.hive_partitioning_require_partition_filter? #=> true
818
+ # external_data.hive_partitioning_source_uri_prefix #=> source_uri_prefix
819
+ #
820
+ def hive_partitioning_source_uri_prefix
821
+ @gapi.hive_partitioning_options.source_uri_prefix if hive_partitioning? # guarded: evaluates to nil instead of raising when no options object exists
822
+ end
823
+
824
+ ##
825
+ # Sets the common prefix for all source uris when hive partition detection is requested. The prefix must end
826
+ # immediately before the partition key encoding begins. For example, consider files following this data
827
+ # layout:
828
+ #
829
+ # ```
830
+ # gs://bucket/path_to_table/dt=2019-01-01/country=BR/id=7/file.avro
831
+ # gs://bucket/path_to_table/dt=2018-12-31/country=CA/id=3/file.avro
832
+ # ```
833
+ #
834
+ # When hive partitioning is requested with either `AUTO` or `STRINGS` mode, the common prefix can be either of
835
+ # `gs://bucket/path_to_table` or `gs://bucket/path_to_table/` (trailing slash does not matter).
836
+ #
837
+ # See {#format}, {#hive_partitioning_mode=} and {#hive_partitioning_require_partition_filter=}.
838
+ #
839
+ # @param [String] source_uri_prefix The common prefix for all source uris.
840
+ #
841
+ # @example
842
+ # require "google/cloud/bigquery"
843
+ #
844
+ # bigquery = Google::Cloud::Bigquery.new
845
+ #
846
+ # gcs_uri = "gs://cloud-samples-data/bigquery/hive-partitioning-samples/autolayout/*"
847
+ # source_uri_prefix = "gs://cloud-samples-data/bigquery/hive-partitioning-samples/autolayout/"
848
+ # external_data = bigquery.external gcs_uri, format: :parquet do |ext|
849
+ # ext.hive_partitioning_mode = :auto
850
+ # ext.hive_partitioning_require_partition_filter = true
851
+ # ext.hive_partitioning_source_uri_prefix = source_uri_prefix
852
+ # end
853
+ #
854
+ # external_data.hive_partitioning? #=> true
855
+ # external_data.hive_partitioning_mode #=> "AUTO"
856
+ # external_data.hive_partitioning_require_partition_filter? #=> true
857
+ # external_data.hive_partitioning_source_uri_prefix #=> source_uri_prefix
858
+ #
859
+ def hive_partitioning_source_uri_prefix= source_uri_prefix
860
+ @gapi.hive_partitioning_options ||= Google::Apis::BigqueryV2::HivePartitioningOptions.new # create the options object on first use; keep an existing one
861
+ @gapi.hive_partitioning_options.source_uri_prefix = source_uri_prefix # stored verbatim; trailing slash is not normalized here
862
+ end
863
+
533
864
  ##
534
865
  # @private Google API Client object.
535
866
  def to_gapi
@@ -575,9 +906,12 @@ module Google
575
906
  # data = bigquery.query "SELECT * FROM my_ext_table",
576
907
  # external: { my_ext_table: csv_table }
577
908
  #
909
+ # # Iterate over the first page of results
578
910
  # data.each do |row|
579
911
  # puts row[:name]
580
912
  # end
913
+ # # Retrieve the next page of results
914
+ # data = data.next if data.next?
581
915
  #
582
916
  class CsvSource < External::DataSource
583
917
  ##
@@ -1037,9 +1371,12 @@ module Google
1037
1371
  # data = bigquery.query "SELECT * FROM my_ext_table",
1038
1372
  # external: { my_ext_table: json_table }
1039
1373
  #
1374
+ # # Iterate over the first page of results
1040
1375
  # data.each do |row|
1041
1376
  # puts row[:name]
1042
1377
  # end
1378
+ # # Retrieve the next page of results
1379
+ # data = data.next if data.next?
1043
1380
  #
1044
1381
  class JsonSource < External::DataSource
1045
1382
  ##
@@ -1173,9 +1510,12 @@ module Google
1173
1510
  # data = bigquery.query "SELECT * FROM my_ext_table",
1174
1511
  # external: { my_ext_table: sheets_table }
1175
1512
  #
1513
+ # # Iterate over the first page of results
1176
1514
  # data.each do |row|
1177
1515
  # puts row[:name]
1178
1516
  # end
1517
+ # # Retrieve the next page of results
1518
+ # data = data.next if data.next?
1179
1519
  #
1180
1520
  class SheetsSource < External::DataSource
1181
1521
  ##
@@ -1318,9 +1658,12 @@ module Google
1318
1658
  # data = bigquery.query "SELECT * FROM my_ext_table",
1319
1659
  # external: { my_ext_table: bigtable_table }
1320
1660
  #
1661
+ # # Iterate over the first page of results
1321
1662
  # data.each do |row|
1322
1663
  # puts row[:name]
1323
1664
  # end
1665
+ # # Retrieve the next page of results
1666
+ # data = data.next if data.next?
1324
1667
  #
1325
1668
  class BigtableSource < External::DataSource
1326
1669
  ##
@@ -1516,9 +1859,12 @@ module Google
1516
1859
  # data = bigquery.query "SELECT * FROM my_ext_table",
1517
1860
  # external: { my_ext_table: bigtable_table }
1518
1861
  #
1862
+ # # Iterate over the first page of results
1519
1863
  # data.each do |row|
1520
1864
  # puts row[:name]
1521
1865
  # end
1866
+ # # Retrieve the next page of results
1867
+ # data = data.next if data.next?
1522
1868
  #
1523
1869
  class ColumnFamily
1524
1870
  ##
@@ -2053,9 +2399,12 @@ module Google
2053
2399
  # data = bigquery.query "SELECT * FROM my_ext_table",
2054
2400
  # external: { my_ext_table: bigtable_table }
2055
2401
  #
2402
+ # # Iterate over the first page of results
2056
2403
  # data.each do |row|
2057
2404
  # puts row[:name]
2058
2405
  # end
2406
+ # # Retrieve the next page of results
2407
+ # data = data.next if data.next?
2059
2408
  #
2060
2409
  class Column
2061
2410
  ##