google-cloud-bigquery 1.21.2

Files changed (44)
  1. checksums.yaml +7 -0
  2. data/.yardopts +16 -0
  3. data/AUTHENTICATION.md +158 -0
  4. data/CHANGELOG.md +397 -0
  5. data/CODE_OF_CONDUCT.md +40 -0
  6. data/CONTRIBUTING.md +188 -0
  7. data/LICENSE +201 -0
  8. data/LOGGING.md +27 -0
  9. data/OVERVIEW.md +463 -0
  10. data/TROUBLESHOOTING.md +31 -0
  11. data/lib/google-cloud-bigquery.rb +139 -0
  12. data/lib/google/cloud/bigquery.rb +145 -0
  13. data/lib/google/cloud/bigquery/argument.rb +197 -0
  14. data/lib/google/cloud/bigquery/convert.rb +383 -0
  15. data/lib/google/cloud/bigquery/copy_job.rb +316 -0
  16. data/lib/google/cloud/bigquery/credentials.rb +50 -0
  17. data/lib/google/cloud/bigquery/data.rb +526 -0
  18. data/lib/google/cloud/bigquery/dataset.rb +2845 -0
  19. data/lib/google/cloud/bigquery/dataset/access.rb +1021 -0
  20. data/lib/google/cloud/bigquery/dataset/list.rb +162 -0
  21. data/lib/google/cloud/bigquery/encryption_configuration.rb +123 -0
  22. data/lib/google/cloud/bigquery/external.rb +2432 -0
  23. data/lib/google/cloud/bigquery/extract_job.rb +368 -0
  24. data/lib/google/cloud/bigquery/insert_response.rb +180 -0
  25. data/lib/google/cloud/bigquery/job.rb +657 -0
  26. data/lib/google/cloud/bigquery/job/list.rb +162 -0
  27. data/lib/google/cloud/bigquery/load_job.rb +1704 -0
  28. data/lib/google/cloud/bigquery/model.rb +740 -0
  29. data/lib/google/cloud/bigquery/model/list.rb +164 -0
  30. data/lib/google/cloud/bigquery/project.rb +1655 -0
  31. data/lib/google/cloud/bigquery/project/list.rb +161 -0
  32. data/lib/google/cloud/bigquery/query_job.rb +1695 -0
  33. data/lib/google/cloud/bigquery/routine.rb +1108 -0
  34. data/lib/google/cloud/bigquery/routine/list.rb +165 -0
  35. data/lib/google/cloud/bigquery/schema.rb +564 -0
  36. data/lib/google/cloud/bigquery/schema/field.rb +668 -0
  37. data/lib/google/cloud/bigquery/service.rb +589 -0
  38. data/lib/google/cloud/bigquery/standard_sql.rb +495 -0
  39. data/lib/google/cloud/bigquery/table.rb +3340 -0
  40. data/lib/google/cloud/bigquery/table/async_inserter.rb +520 -0
  41. data/lib/google/cloud/bigquery/table/list.rb +172 -0
  42. data/lib/google/cloud/bigquery/time.rb +65 -0
  43. data/lib/google/cloud/bigquery/version.rb +22 -0
  44. metadata +297 -0
@@ -0,0 +1,520 @@
+ # Copyright 2017 Google LLC
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     https://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+
+ require "google/cloud/bigquery/convert"
+ require "monitor"
+ require "concurrent"
+ require "securerandom"
+
+ module Google
+   module Cloud
+     module Bigquery
+       class Table
+         ##
+         # # AsyncInserter
+         #
+         # Used to insert multiple rows in batches to a table. See
+         # {Google::Cloud::Bigquery::Table#insert_async}.
+         #
+         # @example
+         #   require "google/cloud/bigquery"
+         #
+         #   bigquery = Google::Cloud::Bigquery.new
+         #   dataset = bigquery.dataset "my_dataset"
+         #   table = dataset.table "my_table"
+         #   inserter = table.insert_async do |result|
+         #     if result.error?
+         #       log_error result.error
+         #     else
+         #       log_insert "inserted #{result.insert_count} rows " \
+         #         "with #{result.error_count} errors"
+         #     end
+         #   end
+         #
+         #   rows = [
+         #     { "first_name" => "Alice", "age" => 21 },
+         #     { "first_name" => "Bob", "age" => 22 }
+         #   ]
+         #   inserter.insert rows
+         #
+         #   inserter.stop.wait!
+         #
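+         # @example Tuning the batching thresholds (the values here are
+         #   illustrative, not recommendations):
+         #   require "google/cloud/bigquery"
+         #
+         #   bigquery = Google::Cloud::Bigquery.new
+         #   table = bigquery.dataset("my_dataset").table "my_table"
+         #
+         #   # Send a batch every 100 rows, 1 MB, or 5 seconds, whichever
+         #   # limit is reached first.
+         #   inserter = table.insert_async max_rows: 100,
+         #                                 max_bytes: 1_000_000,
+         #                                 interval: 5
+         #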
+         # @attr_reader [Integer] max_bytes The maximum size of rows to be
+         #   collected before the batch is inserted. Default is 10,000,000
+         #   (10MB).
+         # @attr_reader [Integer] max_rows The maximum number of rows to be
+         #   collected before the batch is inserted. Default is 500.
+         # @attr_reader [Numeric] interval The number of seconds to collect rows
+         #   before the batch is inserted. Default is 10.
+         # @attr_reader [Integer] threads The number of threads used to insert
+         #   rows. Default is 4.
+         #
+         class AsyncInserter
+           include MonitorMixin
+
+           attr_reader :max_bytes, :max_rows, :interval, :threads
+           ##
+           # @private Implementation accessors
+           attr_reader :table, :batch
+
+           ##
+           # @private
+           def initialize table, skip_invalid: nil, ignore_unknown: nil, max_bytes: 10_000_000, max_rows: 500,
+                          interval: 10, threads: 4, &block
+             # init MonitorMixin
+             super()
+
+             @table = table
+             @skip_invalid = skip_invalid
+             @ignore_unknown = ignore_unknown
+
+             @max_bytes = max_bytes
+             @max_rows = max_rows
+             @interval = interval
+             @threads = threads
+             @callback = block
+
+             @batch = nil
+
+             @thread_pool = Concurrent::ThreadPoolExecutor.new max_threads: @threads
+
+             @cond = new_cond
+           end
+
+           ##
+           # Adds rows to the async inserter to be inserted. Rows will be
+           # collected in batches and inserted together.
+           # See {Google::Cloud::Bigquery::Table#insert_async}.
+           #
+           # Because BigQuery's streaming API is designed for high insertion
+           # rates, modifications to the underlying table metadata are eventually
+           # consistent when interacting with the streaming system. In most cases
+           # metadata changes are propagated within minutes, but during this
+           # period API responses may reflect the inconsistent state of the
+           # table.
+           #
+           # @see https://cloud.google.com/bigquery/streaming-data-into-bigquery
+           #   Streaming Data Into BigQuery
+           #
+           # @see https://cloud.google.com/bigquery/troubleshooting-errors#metadata-errors-for-streaming-inserts
+           #   BigQuery Troubleshooting: Metadata errors for streaming inserts
+           #
+           # @param [Hash, Array<Hash>] rows A hash object or array of hash
+           #   objects containing the data.
+           # @param [Array<String|Symbol>, Symbol] insert_ids A unique ID for each row. BigQuery uses this property to
+           #   detect duplicate insertion requests on a best-effort basis. For more information, see [data
+           #   consistency](https://cloud.google.com/bigquery/streaming-data-into-bigquery#dataconsistency). Optional. If
+           #   not provided, the client library will assign a UUID to each row before the request is sent.
+           #
+           #   The value `:skip` can be provided to skip the generation of IDs for all rows, or to skip the generation of
+           #   an ID for a specific row in the array.
+           #
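+           # @example Providing explicit insert IDs (the IDs shown are
+           #   illustrative):
+           #   rows = [
+           #     { "first_name" => "Alice", "age" => 21 },
+           #     { "first_name" => "Bob", "age" => 22 }
+           #   ]
+           #   inserter.insert rows, insert_ids: ["alice-1", "bob-1"]
+           #
+           # @example Skipping insert ID generation for all rows:
+           #   inserter.insert rows, insert_ids: :skip
+           #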
+           def insert rows, insert_ids: nil
+             return nil if rows.nil?
+             return nil if rows.is_a?(Array) && rows.empty?
+             rows, insert_ids = validate_insert_args rows, insert_ids
+
+             synchronize do
+               rows.zip(Array(insert_ids)).each do |row, insert_id|
+                 if @batch.nil?
+                   @batch = Batch.new max_bytes: @max_bytes, max_rows: @max_rows
+                   @batch.insert row, insert_id
+                 else
+                   unless @batch.try_insert row, insert_id
+                     push_batch_request!
+
+                     @batch = Batch.new max_bytes: @max_bytes, max_rows: @max_rows
+                     @batch.insert row, insert_id
+                   end
+                 end
+
+                 @batch_created_at ||= ::Time.now
+                 @background_thread ||= Thread.new { run_background }
+
+                 push_batch_request! if @batch.ready?
+               end
+
+               @cond.signal
+             end
+
+             true
+           end
+
+           ##
+           # Begins the process of stopping the inserter. Rows already in the
+           # queue will be inserted, but no new rows can be added. Use {#wait!}
+           # to block until the inserter is fully stopped and all pending rows
+           # have been inserted.
+           #
+           # @return [AsyncInserter] returns self so calls can be chained.
+           #
+           def stop
+             synchronize do
+               break if @stopped
+
+               @stopped = true
+               push_batch_request!
+               @cond.signal
+             end
+
+             self
+           end
+
+           ##
+           # Blocks until the inserter is fully stopped, all pending rows
+           # have been inserted, and all callbacks have completed. Does not stop
+           # the inserter. To stop the inserter, first call {#stop} and then
+           # call {#wait!} to block until the inserter is stopped.
+           #
+           # @param [Numeric, nil] timeout The maximum number of seconds to
+           #   wait for the pending inserts to complete. Waits indefinitely
+           #   when `nil`.
+           #
+           # @return [AsyncInserter] returns self so calls can be chained.
+           #
+           def wait! timeout = nil
+             synchronize do
+               @thread_pool.shutdown
+               @thread_pool.wait_for_termination timeout
+             end
+
+             self
+           end
+
+           ##
+           # Forces all rows in the current batch to be inserted immediately.
+           #
+           # @return [AsyncInserter] returns self so calls can be chained.
+           #
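+           # @example
+           #   inserter.insert rows
+           #   inserter.flush # send the current batch now, without stopping
+           #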
+           def flush
+             synchronize do
+               push_batch_request!
+               @cond.signal
+             end
+
+             self
+           end
+
+           ##
+           # Whether the inserter has been started.
+           #
+           # @return [Boolean] `true` when started, `false` otherwise.
+           #
+           def started?
+             !stopped?
+           end
+
+           ##
+           # Whether the inserter has been stopped.
+           #
+           # @return [Boolean] `true` when stopped, `false` otherwise.
+           #
+           def stopped?
+             synchronize { @stopped }
+           end
+
+           protected
+
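+           # Normalizes a single Hash row into an Array, expands
+           # `insert_ids: :skip` into one `:skip` marker per row, and raises
+           # ArgumentError when explicit insert_ids don't line up with the
+           # rows one-to-one.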
+           def validate_insert_args rows, insert_ids
+             rows = [rows] if rows.is_a? Hash
+             raise ArgumentError, "No rows provided" if rows.empty?
+
+             insert_ids = Array.new(rows.count) { :skip } if insert_ids == :skip
+             insert_ids = Array insert_ids
+             if insert_ids.count.positive? && insert_ids.count != rows.count
+               raise ArgumentError, "insert_ids must be the same size as rows"
+             end
+
+             [rows, insert_ids]
+           end
+
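+           # Runs on a dedicated background thread. Waits until the configured
+           # interval has elapsed for the current batch, then pushes the batch
+           # to the thread pool for insertion; sleeps on the condition variable
+           # while there is nothing to do.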
+           def run_background
+             synchronize do
+               until @stopped
+                 if @batch.nil?
+                   @cond.wait
+                   next
+                 end
+
+                 time_since_first_publish = ::Time.now - @batch_created_at
+                 if time_since_first_publish < @interval
+                   # still waiting for the interval to insert the batch...
+                   timeout = @interval - time_since_first_publish
+                   @cond.wait timeout
+                 else
+                   # interval met, insert the batch...
+                   push_batch_request!
+                   @cond.wait
+                 end
+               end
+             end
+           end
+
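+           # Snapshots the current batch and hands it to the thread pool as a
+           # Concurrent::Future, so the API call and the user callback run off
+           # the calling thread. Expects to be called while holding the monitor
+           # (from within synchronize).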
+           def push_batch_request!
+             return unless @batch
+
+             orig_rows = @batch.rows
+             json_rows = @batch.json_rows
+             insert_ids = @batch.insert_ids
+             Concurrent::Future.new executor: @thread_pool do
+               begin
+                 raise ArgumentError, "No rows provided" if json_rows.empty?
+                 options = { skip_invalid: @skip_invalid, ignore_unknown: @ignore_unknown, insert_ids: insert_ids }
+                 insert_resp = @table.service.insert_tabledata_json_rows(
+                   @table.dataset_id, @table.table_id, json_rows, options
+                 )
+                 result = Result.new InsertResponse.from_gapi(orig_rows, insert_resp)
+               rescue StandardError => e
+                 result = Result.new nil, e
+               ensure
+                 @callback&.call result
+               end
+             end.execute
+
+             @batch = nil
+             @batch_created_at = nil
+           end
+
+           ##
+           # @private
+           class Batch
+             attr_reader :max_bytes, :max_rows, :rows, :json_rows, :insert_ids
+
+             def initialize max_bytes: 10_000_000, max_rows: 500
+               @max_bytes = max_bytes
+               @max_rows = max_rows
+               @rows = []
+               @json_rows = []
+               @insert_ids = []
+               # The default request byte size overhead is 63.
+               # "{\"rows\":[],\"ignoreUnknownValues\":false,
+               # \"skipInvalidRows\":false}".bytesize #=> 63
+               @current_bytes = 63
+             end
+
+             def insert row, insert_id
+               insert_id ||= SecureRandom.uuid
+               json_row = to_json_row row
+
+               insert_rows_bytes row, json_row, insert_id, addl_bytes_for(json_row, insert_id)
+             end
+
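+             # Like #insert, but first checks whether adding the row would push
+             # the batch past max_bytes or max_rows. Returns false, leaving the
+             # batch unchanged, when the row does not fit.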
+             def try_insert row, insert_id
+               insert_id ||= SecureRandom.uuid
+               json_row = to_json_row row
+               addl_bytes = addl_bytes_for json_row, insert_id
+
+               return false if @current_bytes + addl_bytes >= @max_bytes
+               return false if @rows.count + 1 >= @max_rows
+
+               insert_rows_bytes row, json_row, insert_id, addl_bytes
+               true
+             end
+
+             def ready?
+               @current_bytes >= @max_bytes || rows.count >= @max_rows
+             end
+
+             private
+
+             def insert_rows_bytes row, json_row, insert_id, addl_bytes
+               @rows << row
+               @json_rows << json_row
+               @insert_ids << insert_id if insert_id
+               @current_bytes += addl_bytes
+             end
+
+             def to_json_row row
+               Convert.to_json_row row
+             end
+
+             def addl_bytes_for json_row, insert_id
+               if insert_id == :skip
+                 # "{\"json\":},".bytesize #=> 10
+                 10 + json_row.to_json.bytesize
+               else
+                 # "{\"insertId\":\"\",\"json\":},".bytesize #=> 24
+                 24 + json_row.to_json.bytesize + insert_id.bytesize
+               end
+             end
+           end
+
+           ##
+           # AsyncInserter::Result
+           #
+           # Represents the result from BigQuery, including any error
+           # encountered, when data is asynchronously inserted into a table for
+           # near-immediate querying. See {Dataset#insert_async} and
+           # {Table#insert_async}.
+           #
+           # @see https://cloud.google.com/bigquery/streaming-data-into-bigquery
+           #   Streaming Data Into BigQuery
+           #
+           # @attr_reader [Google::Cloud::Bigquery::InsertResponse, nil]
+           #   insert_response The response from the insert operation if no
+           #   error was encountered, or `nil` if the insert operation
+           #   encountered an error.
+           # @attr_reader [Error, nil] error The error from the insert operation
+           #   if any error was encountered, otherwise `nil`.
+           #
+           # @example
+           #   require "google/cloud/bigquery"
+           #
+           #   bigquery = Google::Cloud::Bigquery.new
+           #   dataset = bigquery.dataset "my_dataset"
+           #   table = dataset.table "my_table"
+           #   inserter = table.insert_async do |result|
+           #     if result.error?
+           #       log_error result.error
+           #     else
+           #       log_insert "inserted #{result.insert_count} rows " \
+           #         "with #{result.error_count} errors"
+           #     end
+           #   end
+           #
+           #   rows = [
+           #     { "first_name" => "Alice", "age" => 21 },
+           #     { "first_name" => "Bob", "age" => 22 }
+           #   ]
+           #   inserter.insert rows
+           #
+           #   inserter.stop.wait!
+           #
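+           # @example Inspecting row-level failures in the callback (a sketch;
+           #   `log_error` is a placeholder as in the example above):
+           #   inserter = table.insert_async do |result|
+           #     if result.error?
+           #       log_error result.error
+           #     elsif !result.success?
+           #       result.insert_errors.each do |insert_error|
+           #         log_error "row #{insert_error.index}: #{insert_error.errors}"
+           #       end
+           #     end
+           #   end
+           #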
+           class Result
+             # @private
+             def initialize insert_response, error = nil
+               @insert_response = insert_response
+               @error = error
+             end
+
+             attr_reader :insert_response, :error
+
+             ##
+             # Checks if an error is present, meaning that the insert operation
+             # encountered an error. Use {#error} to access the error. For
+             # row-level errors, see {#success?} and {#insert_errors}.
+             #
+             # @return [Boolean] `true` when an error is present, `false`
+             #   otherwise.
+             #
+             def error?
+               !error.nil?
+             end
+
+             ##
+             # Checks if the error count for row-level errors is zero, meaning
+             # that all of the rows were inserted. Use {#insert_errors} to access
+             # the row-level errors. To check for and access any operation-level
+             # error, use {#error?} and {#error}.
+             #
+             # @return [Boolean, nil] `true` when the error count is zero,
+             #   `false` when the error count is positive, or `nil` if the insert
+             #   operation encountered an error.
+             #
+             def success?
+               return nil if error?
+               insert_response.success?
+             end
+
+             ##
+             # The count of rows in the response, minus the count of errors for
+             # rows that were not inserted.
+             #
+             # @return [Integer, nil] The number of rows inserted, or `nil` if
+             #   the insert operation encountered an error.
+             #
+             def insert_count
+               return nil if error?
+               insert_response.insert_count
+             end
+
+             ##
+             # The count of errors for rows that were not inserted.
+             #
+             # @return [Integer, nil] The number of errors, or `nil` if the
+             #   insert operation encountered an error.
+             #
+             def error_count
+               return nil if error?
+               insert_response.error_count
+             end
+
+             ##
+             # The error objects for rows that were not inserted.
+             #
+             # @return [Array<InsertError>, nil] An array containing error
+             #   objects, or `nil` if the insert operation encountered an error.
+             #
+             def insert_errors
+               return nil if error?
+               insert_response.insert_errors
+             end
+
+             ##
+             # The rows that were not inserted.
+             #
+             # @return [Array<Hash>, nil] An array of hash objects containing the
+             #   row data, or `nil` if the insert operation encountered an error.
+             #
+             def error_rows
+               return nil if error?
+               insert_response.error_rows
+             end
+
+             ##
+             # Returns the error object for a row that was not inserted.
+             #
+             # @param [Hash] row A hash containing the data for a row.
+             #
+             # @return [InsertError, nil] An error object, `nil` if no error is
+             #   found in the response for the row, or `nil` if the insert
+             #   operation encountered an error.
+             #
+             def insert_error_for row
+               return nil if error?
+               insert_response.insert_error_for row
+             end
+
+             ##
+             # Returns the error hashes for a row that was not inserted. Each
+             # error hash contains the following keys: `reason`, `location`,
+             # `debugInfo`, and `message`.
+             #
+             # @param [Hash, nil] row A hash containing the data for a row.
+             #
+             # @return [Array<Hash>, nil] An array of error hashes, `nil` if no
+             #   errors are found in the response for the row, or `nil` if the
+             #   insert operation encountered an error.
+             #
+             def errors_for row
+               return nil if error?
+               insert_response.errors_for row
+             end
+
+             ##
+             # Returns the index for a row that was not inserted.
+             #
+             # @param [Hash, nil] row A hash containing the data for a row.
+             #
+             # @return [Integer, nil] The index of the row, `nil` if the row is
+             #   not found in the response, or `nil` if the insert operation
+             #   encountered an error.
+             #
+             def index_for row
+               return nil if error?
+               insert_response.index_for row
+             end
+           end
+         end
+       end
+     end
+   end
+ end