google-cloud-bigquery 1.21.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. checksums.yaml +7 -0
  2. data/.yardopts +16 -0
  3. data/AUTHENTICATION.md +158 -0
  4. data/CHANGELOG.md +397 -0
  5. data/CODE_OF_CONDUCT.md +40 -0
  6. data/CONTRIBUTING.md +188 -0
  7. data/LICENSE +201 -0
  8. data/LOGGING.md +27 -0
  9. data/OVERVIEW.md +463 -0
  10. data/TROUBLESHOOTING.md +31 -0
  11. data/lib/google-cloud-bigquery.rb +139 -0
  12. data/lib/google/cloud/bigquery.rb +145 -0
  13. data/lib/google/cloud/bigquery/argument.rb +197 -0
  14. data/lib/google/cloud/bigquery/convert.rb +383 -0
  15. data/lib/google/cloud/bigquery/copy_job.rb +316 -0
  16. data/lib/google/cloud/bigquery/credentials.rb +50 -0
  17. data/lib/google/cloud/bigquery/data.rb +526 -0
  18. data/lib/google/cloud/bigquery/dataset.rb +2845 -0
  19. data/lib/google/cloud/bigquery/dataset/access.rb +1021 -0
  20. data/lib/google/cloud/bigquery/dataset/list.rb +162 -0
  21. data/lib/google/cloud/bigquery/encryption_configuration.rb +123 -0
  22. data/lib/google/cloud/bigquery/external.rb +2432 -0
  23. data/lib/google/cloud/bigquery/extract_job.rb +368 -0
  24. data/lib/google/cloud/bigquery/insert_response.rb +180 -0
  25. data/lib/google/cloud/bigquery/job.rb +657 -0
  26. data/lib/google/cloud/bigquery/job/list.rb +162 -0
  27. data/lib/google/cloud/bigquery/load_job.rb +1704 -0
  28. data/lib/google/cloud/bigquery/model.rb +740 -0
  29. data/lib/google/cloud/bigquery/model/list.rb +164 -0
  30. data/lib/google/cloud/bigquery/project.rb +1655 -0
  31. data/lib/google/cloud/bigquery/project/list.rb +161 -0
  32. data/lib/google/cloud/bigquery/query_job.rb +1695 -0
  33. data/lib/google/cloud/bigquery/routine.rb +1108 -0
  34. data/lib/google/cloud/bigquery/routine/list.rb +165 -0
  35. data/lib/google/cloud/bigquery/schema.rb +564 -0
  36. data/lib/google/cloud/bigquery/schema/field.rb +668 -0
  37. data/lib/google/cloud/bigquery/service.rb +589 -0
  38. data/lib/google/cloud/bigquery/standard_sql.rb +495 -0
  39. data/lib/google/cloud/bigquery/table.rb +3340 -0
  40. data/lib/google/cloud/bigquery/table/async_inserter.rb +520 -0
  41. data/lib/google/cloud/bigquery/table/list.rb +172 -0
  42. data/lib/google/cloud/bigquery/time.rb +65 -0
  43. data/lib/google/cloud/bigquery/version.rb +22 -0
  44. metadata +297 -0
data/lib/google/cloud/bigquery/table/async_inserter.rb
@@ -0,0 +1,520 @@
# Copyright 2017 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


require "google/cloud/bigquery/convert"
require "monitor"
require "concurrent"
require "securerandom"

module Google
  module Cloud
    module Bigquery
      class Table
        ##
        # # AsyncInserter
        #
        # Used to insert multiple rows in batches into a table. See
        # {Google::Cloud::Bigquery::Table#insert_async}.
        #
        # @example
        #   require "google/cloud/bigquery"
        #
        #   bigquery = Google::Cloud::Bigquery.new
        #   dataset = bigquery.dataset "my_dataset"
        #   table = dataset.table "my_table"
        #   inserter = table.insert_async do |result|
        #     if result.error?
        #       log_error result.error
        #     else
        #       log_insert "inserted #{result.insert_count} rows " \
        #         "with #{result.error_count} errors"
        #     end
        #   end
        #
        #   rows = [
        #     { "first_name" => "Alice", "age" => 21 },
        #     { "first_name" => "Bob", "age" => 22 }
        #   ]
        #   inserter.insert rows
        #
        #   inserter.stop.wait!
        #
        # @attr_reader [Integer] max_bytes The maximum size of rows to be
        #   collected before the batch is inserted. Default is 10,000,000
        #   (10MB).
        # @attr_reader [Integer] max_rows The maximum number of rows to be
        #   collected before the batch is inserted. Default is 500.
        # @attr_reader [Numeric] interval The number of seconds to collect rows
        #   before the batch is inserted. Default is 10.
        # @attr_reader [Integer] threads The number of threads used to insert
        #   rows. Default is 4.
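        #
        # @example Tuning the batching limits (a sketch; the values shown are illustrative)
        #   require "google/cloud/bigquery"
        #
        #   bigquery = Google::Cloud::Bigquery.new
        #   dataset = bigquery.dataset "my_dataset"
        #   table = dataset.table "my_table"
        #
        #   # Flush smaller batches more often: at most 100 rows or 1 MB,
        #   # whichever comes first, and at least every 5 seconds.
        #   inserter = table.insert_async max_bytes: 1_000_000,
        #                                 max_rows: 100,
        #                                 interval: 5 do |result|
        #     log_error result.error if result.error?
        #   end
        #
        #   inserter.insert [{ "first_name" => "Carol", "age" => 23 }]
        #   inserter.stop.wait!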
        #
        class AsyncInserter
          include MonitorMixin

          attr_reader :max_bytes, :max_rows, :interval, :threads
          ##
          # @private Implementation accessors
          attr_reader :table, :batch

          ##
          # @private
          def initialize table, skip_invalid: nil, ignore_unknown: nil, max_bytes: 10_000_000, max_rows: 500,
                         interval: 10, threads: 4, &block
            # init MonitorMixin
            super()

            @table = table
            @skip_invalid = skip_invalid
            @ignore_unknown = ignore_unknown

            @max_bytes = max_bytes
            @max_rows = max_rows
            @interval = interval
            @threads = threads
            @callback = block

            @batch = nil

            @thread_pool = Concurrent::ThreadPoolExecutor.new max_threads: @threads

            @cond = new_cond
          end

          ##
          # Adds rows to the async inserter to be inserted. Rows will be
          # collected in batches and inserted together.
          # See {Google::Cloud::Bigquery::Table#insert_async}.
          #
          # Because BigQuery's streaming API is designed for high insertion
          # rates, modifications to the underlying table metadata are eventually
          # consistent when interacting with the streaming system. In most cases
          # metadata changes are propagated within minutes, but during this
          # period API responses may reflect the inconsistent state of the
          # table.
          #
          # @see https://cloud.google.com/bigquery/streaming-data-into-bigquery
          #   Streaming Data Into BigQuery
          #
          # @see https://cloud.google.com/bigquery/troubleshooting-errors#metadata-errors-for-streaming-inserts
          #   BigQuery Troubleshooting: Metadata errors for streaming inserts
          #
          # @param [Hash, Array<Hash>] rows A hash object or array of hash
          #   objects containing the data.
          # @param [Array<String|Symbol>, Symbol] insert_ids A unique ID for each row. BigQuery uses this property to
          #   detect duplicate insertion requests on a best-effort basis. For more information, see [data
          #   consistency](https://cloud.google.com/bigquery/streaming-data-into-bigquery#dataconsistency). Optional. If
          #   not provided, the client library will assign a UUID to each row before the request is sent.
          #
          #   The value `:skip` can be provided to skip the generation of IDs for all rows, or to skip the generation of
          #   an ID for a specific row in the array.
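          #
          # @example Supplying insert IDs for best-effort deduplication (a sketch; the IDs shown are illustrative)
          #   require "google/cloud/bigquery"
          #
          #   bigquery = Google::Cloud::Bigquery.new
          #   dataset = bigquery.dataset "my_dataset"
          #   table = dataset.table "my_table"
          #   inserter = table.insert_async
          #
          #   rows = [
          #     { "first_name" => "Alice", "age" => 21 },
          #     { "first_name" => "Bob", "age" => 22 }
          #   ]
          #   # One ID per row; pass `:skip` instead to disable ID generation.
          #   inserter.insert rows, insert_ids: ["alice-1", "bob-1"]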
          #
          def insert rows, insert_ids: nil
            return nil if rows.nil?
            return nil if rows.is_a?(Array) && rows.empty?
            rows, insert_ids = validate_insert_args rows, insert_ids

            synchronize do
              rows.zip(Array(insert_ids)).each do |row, insert_id|
                if @batch.nil?
                  @batch = Batch.new max_bytes: @max_bytes, max_rows: @max_rows
                  @batch.insert row, insert_id
                else
                  unless @batch.try_insert row, insert_id
                    push_batch_request!

                    @batch = Batch.new max_bytes: @max_bytes, max_rows: @max_rows
                    @batch.insert row, insert_id
                  end
                end

                @batch_created_at ||= ::Time.now
                @background_thread ||= Thread.new { run_background }

                push_batch_request! if @batch.ready?
              end

              @cond.signal
            end

            true
          end

          ##
          # Begins the process of stopping the inserter. Rows already in the
          # queue will be inserted, but no new rows can be added. Use {#wait!}
          # to block until the inserter is fully stopped and all pending rows
          # have been inserted.
          #
          # @return [AsyncInserter] returns self so calls can be chained.
          #
          def stop
            synchronize do
              break if @stopped

              @stopped = true
              push_batch_request!
              @cond.signal
            end

            self
          end

          ##
          # Blocks until the inserter is fully stopped, all pending rows
          # have been inserted, and all callbacks have completed. Does not stop
          # the inserter. To stop the inserter, first call {#stop} and then
          # call {#wait!} to block until the inserter is stopped.
          #
          # @return [AsyncInserter] returns self so calls can be chained.
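          #
          # @example Stopping with a shutdown deadline (a sketch; the timeout value is illustrative)
          #   # Flush pending rows, then wait at most 60 seconds for the
          #   # thread pool to drain before returning.
          #   inserter.stop.wait! 60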
          #
          def wait! timeout = nil
            synchronize do
              @thread_pool.shutdown
              @thread_pool.wait_for_termination timeout
            end

            self
          end

          ##
          # Forces all rows in the current batch to be inserted immediately.
          #
          # @return [AsyncInserter] returns self so calls can be chained.
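          #
          # @example Flushing without waiting for the interval (a sketch)
          #   inserter.insert [{ "first_name" => "Dan", "age" => 24 }]
          #   # Push the current batch now instead of waiting up to
          #   # `interval` seconds or for the batch to fill.
          #   inserter.flush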
          #
          def flush
            synchronize do
              push_batch_request!
              @cond.signal
            end

            self
          end

          ##
          # Whether the inserter has been started.
          #
          # @return [boolean] `true` when started, `false` otherwise.
          #
          def started?
            !stopped?
          end

          ##
          # Whether the inserter has been stopped.
          #
          # @return [boolean] `true` when stopped, `false` otherwise.
          #
          def stopped?
            synchronize { @stopped }
          end

          protected

          def validate_insert_args rows, insert_ids
            rows = [rows] if rows.is_a? Hash
            raise ArgumentError, "No rows provided" if rows.empty?

            insert_ids = Array.new(rows.count) { :skip } if insert_ids == :skip
            insert_ids = Array insert_ids
            if insert_ids.count.positive? && insert_ids.count != rows.count
              raise ArgumentError, "insert_ids must be the same size as rows"
            end

            [rows, insert_ids]
          end

          def run_background
            synchronize do
              until @stopped
                if @batch.nil?
                  @cond.wait
                  next
                end

                time_since_first_publish = ::Time.now - @batch_created_at
                if time_since_first_publish < @interval
                  # still waiting for the interval to insert the batch...
                  timeout = @interval - time_since_first_publish
                  @cond.wait timeout
                else
                  # interval met, insert the batch...
                  push_batch_request!
                  @cond.wait
                end
              end
            end
          end

          def push_batch_request!
            return unless @batch

            orig_rows = @batch.rows
            json_rows = @batch.json_rows
            insert_ids = @batch.insert_ids
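            # The insert request runs on the thread pool, so the user-supplied
            # callback is invoked on a pool thread, not the caller's thread.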
            Concurrent::Future.new executor: @thread_pool do
              begin
                raise ArgumentError, "No rows provided" if json_rows.empty?
                options = { skip_invalid: @skip_invalid, ignore_unknown: @ignore_unknown, insert_ids: insert_ids }
                insert_resp = @table.service.insert_tabledata_json_rows(
                  @table.dataset_id, @table.table_id, json_rows, options
                )
                result = Result.new InsertResponse.from_gapi(orig_rows, insert_resp)
              rescue StandardError => e
                result = Result.new nil, e
              ensure
                @callback&.call result
              end
            end.execute

            @batch = nil
            @batch_created_at = nil
          end

          ##
          # @private
          class Batch
            attr_reader :max_bytes, :max_rows, :rows, :json_rows, :insert_ids

            def initialize max_bytes: 10_000_000, max_rows: 500
              @max_bytes = max_bytes
              @max_rows = max_rows
              @rows = []
              @json_rows = []
              @insert_ids = []
              # The default request byte size overhead is 63.
              # "{\"rows\":[],\"ignoreUnknownValues\":false,
              # \"skipInvalidRows\":false}".bytesize #=> 63
              @current_bytes = 63
            end

            def insert row, insert_id
              insert_id ||= SecureRandom.uuid
              json_row = to_json_row row

              insert_rows_bytes row, json_row, insert_id, addl_bytes_for(json_row, insert_id)
            end

            def try_insert row, insert_id
              insert_id ||= SecureRandom.uuid
              json_row = to_json_row row
              addl_bytes = addl_bytes_for json_row, insert_id

              return false if @current_bytes + addl_bytes >= @max_bytes
              return false if @rows.count + 1 >= @max_rows

              insert_rows_bytes row, json_row, insert_id, addl_bytes
              true
            end

            def ready?
              @current_bytes >= @max_bytes || rows.count >= @max_rows
            end

            private

            def insert_rows_bytes row, json_row, insert_id, addl_bytes
              @rows << row
              @json_rows << json_row
              @insert_ids << insert_id if insert_id
              @current_bytes += addl_bytes
            end

            def to_json_row row
              Convert.to_json_row row
            end

            def addl_bytes_for json_row, insert_id
              if insert_id == :skip
                # "{\"json\":},".bytesize #=> 10
                10 + json_row.to_json.bytesize
              else
                # "{\"insertId\":\"\",\"json\":},".bytesize #=> 24
                24 + json_row.to_json.bytesize + insert_id.bytesize
              end
            end
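
            # A worked example of the accounting above (values illustrative):
            # for the row { "name" => "Alice" }, the JSON payload
            # {"name":"Alice"} is 16 bytes and a generated UUID insert ID is
            # 36 bytes, so the row adds 24 + 16 + 36 = 76 bytes to the batch.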
          end

          ##
          # AsyncInserter::Result
          #
          # Represents the result from BigQuery, including any error
          # encountered, when data is asynchronously inserted into a table for
          # near-immediate querying. See {Dataset#insert_async} and
          # {Table#insert_async}.
          #
          # @see https://cloud.google.com/bigquery/streaming-data-into-bigquery
          #   Streaming Data Into BigQuery
          #
          # @attr_reader [Google::Cloud::Bigquery::InsertResponse, nil]
          #   insert_response The response from the insert operation if no
          #   error was encountered, or `nil` if the insert operation
          #   encountered an error.
          # @attr_reader [Error, nil] error The error from the insert operation
          #   if any error was encountered, otherwise `nil`.
          #
          # @example
          #   require "google/cloud/bigquery"
          #
          #   bigquery = Google::Cloud::Bigquery.new
          #   dataset = bigquery.dataset "my_dataset"
          #   table = dataset.table "my_table"
          #   inserter = table.insert_async do |result|
          #     if result.error?
          #       log_error result.error
          #     else
          #       log_insert "inserted #{result.insert_count} rows " \
          #         "with #{result.error_count} errors"
          #     end
          #   end
          #
          #   rows = [
          #     { "first_name" => "Alice", "age" => 21 },
          #     { "first_name" => "Bob", "age" => 22 }
          #   ]
          #   inserter.insert rows
          #
          #   inserter.stop.wait!
          #
          class Result
            # @private
            def initialize insert_response, error = nil
              @insert_response = insert_response
              @error = error
            end

            attr_reader :insert_response, :error

            ##
            # Checks if an error is present, meaning that the insert operation
            # encountered an error. Use {#error} to access the error. For
            # row-level errors, see {#success?} and {#insert_errors}.
            #
            # @return [Boolean] `true` when an error is present, `false`
            #   otherwise.
            #
            def error?
              !error.nil?
            end

            ##
            # Checks if the error count for row-level errors is zero, meaning
            # that all of the rows were inserted. Use {#insert_errors} to access
            # the row-level errors. To check for and access any operation-level
            # error, use {#error?} and {#error}.
            #
            # @return [Boolean, nil] `true` when the error count is zero,
            #   `false` when the error count is positive, or `nil` if the insert
            #   operation encountered an error.
            #
            def success?
              return nil if error?
              insert_response.success?
            end

            ##
            # The count of rows in the response, minus the count of errors for
            # rows that were not inserted.
            #
            # @return [Integer, nil] The number of rows inserted, or `nil` if
            #   the insert operation encountered an error.
            #
            def insert_count
              return nil if error?
              insert_response.insert_count
            end

            ##
            # The count of errors for rows that were not inserted.
            #
            # @return [Integer, nil] The number of errors, or `nil` if the
            #   insert operation encountered an error.
            #
            def error_count
              return nil if error?
              insert_response.error_count
            end

            ##
            # The error objects for rows that were not inserted.
            #
            # @return [Array<InsertError>, nil] An array containing error
            #   objects, or `nil` if the insert operation encountered an error.
            #
            def insert_errors
              return nil if error?
              insert_response.insert_errors
            end

            ##
            # The rows that were not inserted.
            #
            # @return [Array<Hash>, nil] An array of hash objects containing the
            #   row data, or `nil` if the insert operation encountered an error.
            #
            def error_rows
              return nil if error?
              insert_response.error_rows
            end

            ##
            # Returns the error object for a row that was not inserted.
            #
            # @param [Hash] row A hash containing the data for a row.
            #
            # @return [InsertError, nil] An error object, `nil` if no error is
            #   found in the response for the row, or `nil` if the insert
            #   operation encountered an error.
            #
            def insert_error_for row
              return nil if error?
              insert_response.insert_error_for row
            end

            ##
            # Returns the error hashes for a row that was not inserted. Each
            # error hash contains the following keys: `reason`, `location`,
            # `debugInfo`, and `message`.
            #
            # @param [Hash, nil] row A hash containing the data for a row.
            #
            # @return [Array<Hash>, nil] An array of error hashes, `nil` if no
            #   errors are found in the response for the row, or `nil` if the
            #   insert operation encountered an error.
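            #
            # @example Inspecting row-level error reasons in the callback (a sketch)
            #   inserter = table.insert_async do |result|
            #     unless result.error? || result.success?
            #       result.error_rows.each do |row|
            #         reasons = result.errors_for(row).map { |e| e["reason"] }
            #         log_error "row #{result.index_for row} failed: #{reasons.join ", "}"
            #       end
            #     end
            #   end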
            #
            def errors_for row
              return nil if error?
              insert_response.errors_for row
            end

            ##
            # Returns the index for a row that was not inserted.
            #
            # @param [Hash, nil] row A hash containing the data for a row.
            #
            # @return [Integer, nil] The index of the row, `nil` if no error is
            #   found in the response for the row, or `nil` if the insert
            #   operation encountered an error.
            #
            def index_for row
              return nil if error?
              insert_response.index_for row
            end
          end
        end
      end
    end
  end
end