google-cloud-bigquery 0.20.0

checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA1:
+   metadata.gz: 59023c44b0659ebbbfe3154585a39e06177f690f
+   data.tar.gz: 8577d120217e1134d3e9077f94982edb64882780
+ SHA512:
+   metadata.gz: c265d96170b9ff9080617bab8c14e2ae0a2a0093a2fe526664545bc8c97d1ac159000d71d3b56616ee125f0f63f9ae24f51828b280c6a2bd06e045d08c6dc5db
+   data.tar.gz: bd886ccf5f8a66ce36249e28e81e2a5f24299ba3537331d88852abb95812653e2bff5661b65fe450c9598598425af52b2c6cba063762ce4c741ce6594de1450e
lib/google-cloud-bigquery.rb ADDED
@@ -0,0 +1,122 @@
+ # Copyright 2016 Google Inc. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ ##
+ # This file is here to be autorequired by bundler, so that the .bigquery and
+ # #bigquery methods can be available, but the library and all dependencies
+ # won't be loaded until required and used.
+
+
+ gem "google-cloud-core"
+ require "google/cloud"
+
+ module Google
+   module Cloud
+     ##
+     # Creates a new object for connecting to the BigQuery service.
+     # Each call creates a new connection.
+     #
+     # For more information on connecting to Google Cloud, see the
+     # [Authentication
+     # Guide](https://googlecloudplatform.github.io/google-cloud-ruby/#/docs/guides/authentication).
+     #
+     # @param [String, Array<String>] scope The OAuth 2.0 scopes controlling the
+     #   set of resources and operations that the connection can access. See
+     #   [Using OAuth 2.0 to Access Google
+     #   APIs](https://developers.google.com/identity/protocols/OAuth2).
+     #
+     #   The default scope is:
+     #
+     #   * `https://www.googleapis.com/auth/bigquery`
+     # @param [Integer] retries Number of times to retry requests on server
+     #   error. The default value is `3`. Optional.
+     # @param [Integer] timeout Default request timeout in seconds. Optional.
+     #
+     # @return [Google::Cloud::Bigquery::Project]
+     #
+     # @example
+     #   require "google/cloud"
+     #
+     #   gcloud = Google::Cloud.new
+     #   bigquery = gcloud.bigquery
+     #   dataset = bigquery.dataset "my_dataset"
+     #   table = dataset.table "my_table"
+     #   table.data.each do |row|
+     #     puts row
+     #   end
+     #
+     # @example The default scope can be overridden with the `scope` option:
+     #   require "google/cloud"
+     #
+     #   gcloud = Google::Cloud.new
+     #   platform_scope = "https://www.googleapis.com/auth/cloud-platform"
+     #   bigquery = gcloud.bigquery scope: platform_scope
+     #
+     def bigquery scope: nil, retries: nil, timeout: nil
+       Google::Cloud.bigquery @project, @keyfile, scope: scope,
+                              retries: (retries || @retries),
+                              timeout: (timeout || @timeout)
+     end
+
+     ##
+     # Creates a new `Project` instance connected to the BigQuery service.
+     # Each call creates a new connection.
+     #
+     # For more information on connecting to Google Cloud, see the
+     # [Authentication
+     # Guide](https://googlecloudplatform.github.io/google-cloud-ruby/#/docs/guides/authentication).
+     #
+     # @param [String] project Identifier for a BigQuery project. If not present,
+     #   the default project for the credentials is used.
+     # @param [String, Hash] keyfile Keyfile downloaded from Google Cloud. If a
+     #   file path is given, the file must be readable.
+     # @param [String, Array<String>] scope The OAuth 2.0 scopes controlling the
+     #   set of resources and operations that the connection can access. See
+     #   [Using OAuth 2.0 to Access Google
+     #   APIs](https://developers.google.com/identity/protocols/OAuth2).
+     #
+     #   The default scope is:
+     #
+     #   * `https://www.googleapis.com/auth/bigquery`
+     # @param [Integer] retries Number of times to retry requests on server
+     #   error. The default value is `3`. Optional.
+     # @param [Integer] timeout Default request timeout in seconds. Optional.
+     #
+     # @return [Google::Cloud::Bigquery::Project]
+     #
+     # @example
+     #   require "google/cloud/bigquery"
+     #
+     #   bigquery = Google::Cloud.bigquery
+     #   dataset = bigquery.dataset "my_dataset"
+     #   table = dataset.table "my_table"
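+     #
+     # @example Passing a project ID and keyfile path (illustrative values):
+     #   require "google/cloud/bigquery"
+     #
+     #   # The project ID and keyfile path below are placeholders.
+     #   bigquery = Google::Cloud.bigquery "my-todo-project",
+     #                                     "/path/to/keyfile.json"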
+     #
+     def self.bigquery project = nil, keyfile = nil, scope: nil, retries: nil,
+                       timeout: nil
+       require "google/cloud/bigquery"
+       project ||= Google::Cloud::Bigquery::Project.default_project
+       project = project.to_s # Always cast to a string
+       fail ArgumentError, "project is missing" if project.empty?
+
+       if keyfile.nil?
+         credentials = Google::Cloud::Bigquery::Credentials.default scope: scope
+       else
+         credentials = Google::Cloud::Bigquery::Credentials.new(
+           keyfile, scope: scope)
+       end
+
+       Google::Cloud::Bigquery::Project.new(
+         Google::Cloud::Bigquery::Service.new(
+           project, credentials, retries: retries, timeout: timeout))
+     end
+   end
+ end
lib/google/cloud/bigquery.rb ADDED
@@ -0,0 +1,353 @@
+ # Copyright 2015 Google Inc. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+
+ require "google-cloud-bigquery"
+ require "google/cloud/bigquery/project"
+
+ module Google
+   module Cloud
+     ##
+     # # Google Cloud BigQuery
+     #
+     # Google Cloud BigQuery enables super-fast, SQL-like queries against massive
+     # datasets, using the processing power of Google's infrastructure. To learn
+     # more, read [What is
+     # BigQuery?](https://cloud.google.com/bigquery/what-is-bigquery).
+     #
+     # The goal of google-cloud is to provide an API that is comfortable to
+     # Rubyists. Authentication is handled by {Google::Cloud#bigquery}. You can
+     # provide the project and credential information to connect to the BigQuery
+     # service, or if you are running on Google Compute Engine this configuration
+     # is taken care of for you. You can read more about the options for
+     # connecting in the [Authentication
+     # Guide](https://googlecloudplatform.github.io/google-cloud-ruby/#/docs/guides/authentication).
+     #
+     # To help you get started quickly, the first few examples below use a public
+     # dataset provided by Google. As soon as you have [signed
+     # up](https://cloud.google.com/bigquery/sign-up) to use BigQuery, and
+     # provided that you stay in the free tier for queries, you should be able to
+     # run these first examples without the need to set up billing or to load
+     # data (although we'll show you how to do that too).
+     #
+     # ## Listing Datasets and Tables
+     #
+     # A BigQuery project holds datasets, which in turn hold tables. Assuming
+     # that you have not yet created datasets or tables in your own project,
+     # let's connect to Google's `publicdata` project, and see what you find.
+     #
+     # ```ruby
+     # require "google/cloud"
+     #
+     # gcloud = Google::Cloud.new "publicdata"
+     # bigquery = gcloud.bigquery
+     #
+     # bigquery.datasets.count #=> 1
+     # bigquery.datasets.first.dataset_id #=> "samples"
+     #
+     # dataset = bigquery.datasets.first
+     # tables = dataset.tables
+     #
+     # tables.count #=> 7
+     # tables.map &:table_id #=> [..., "shakespeare", "trigrams", "wikipedia"]
+     # ```
+     #
+     # In addition to listing all datasets and tables in the project, you can
+     # also retrieve individual datasets and tables by ID. Let's look at the
+     # structure of the `shakespeare` table, which contains an entry for every
+     # word in every play written by Shakespeare.
+     #
+     # ```ruby
+     # require "google/cloud"
+     #
+     # gcloud = Google::Cloud.new "publicdata"
+     # bigquery = gcloud.bigquery
+     #
+     # dataset = bigquery.dataset "samples"
+     # table = dataset.table "shakespeare"
+     #
+     # table.headers #=> ["word", "word_count", "corpus", "corpus_date"]
+     # table.rows_count #=> 164656
+     # ```
+     #
+     # Now that you know the column names for the Shakespeare table, you can
+     # write and run a query.
+     #
+     # ## Running queries
+     #
+     # BigQuery offers both synchronous and asynchronous methods, as explained in
+     # [Querying Data](https://cloud.google.com/bigquery/querying-data).
+     #
+     # ### Synchronous queries
+     #
+     # Let's start with the simpler synchronous approach. Notice that this time
+     # you are connecting using your own default project. This is necessary for
+     # running a query, since queries need to be able to create tables to hold
+     # results.
+     #
+     # ```ruby
+     # require "google/cloud"
+     #
+     # gcloud = Google::Cloud.new
+     # bigquery = gcloud.bigquery
+     #
+     # sql = "SELECT TOP(word, 50) as word, COUNT(*) as count " +
+     #       "FROM publicdata:samples.shakespeare"
+     # data = bigquery.query sql
+     #
+     # data.count #=> 50
+     # data.next? #=> false
+     # data.first #=> {"word"=>"you", "count"=>42}
+     # ```
+     #
+     # The `TOP` function shown above is just one of a variety of functions
+     # offered by BigQuery. See the [Query
+     # Reference](https://cloud.google.com/bigquery/query-reference) for a full
+     # listing.
+     #
+     # ### Asynchronous queries
+     #
+     # Because you probably should not block while longer-running BigQuery
+     # operations complete, including queries as well as importing, exporting,
+     # and copying data, the BigQuery API enables you to manage such operations
+     # as jobs. In the asynchronous approach to running a query, an instance of
+     # {Google::Cloud::Bigquery::QueryJob} is returned, rather than an instance
+     # of {Google::Cloud::Bigquery::QueryData}.
+     #
+     # ```ruby
+     # require "google/cloud"
+     #
+     # gcloud = Google::Cloud.new
+     # bigquery = gcloud.bigquery
+     #
+     # sql = "SELECT TOP(word, 50) as word, COUNT(*) as count " +
+     #       "FROM publicdata:samples.shakespeare"
+     # job = bigquery.query_job sql
+     #
+     # job.wait_until_done!
+     # if !job.failed?
+     #   job.query_results.each do |row|
+     #     puts row["word"]
+     #   end
+     # end
+     # ```
+     #
+     # Once you have determined that the job is done and has not failed, you can
+     # obtain an instance of {Google::Cloud::Bigquery::QueryData} by calling
+     # {Google::Cloud::Bigquery::QueryJob#query_results}. The query results for
+     # both of the above examples are stored in temporary tables with a lifetime
+     # of about 24 hours. See the final example below for a demonstration of how
+     # to store query results in a permanent table.
+     #
+     # ## Creating Datasets and Tables
+     #
+     # The first thing you need to do in a new BigQuery project is to create a
+     # {Google::Cloud::Bigquery::Dataset}. Datasets hold tables and control
+     # access to them.
+     #
+     # ```ruby
+     # require "google/cloud"
+     #
+     # gcloud = Google::Cloud.new
+     # bigquery = gcloud.bigquery
+     # dataset = bigquery.create_dataset "my_dataset"
+     # ```
+     #
+     # Now that you have a dataset, you can use it to create a table. Every table
+     # is defined by a schema that may contain nested and repeated fields. The
+     # example below shows a schema with a repeated record field named
+     # `cities_lived`. (For more information about nested and repeated fields,
+     # see [Preparing Data for
+     # BigQuery](https://cloud.google.com/bigquery/preparing-data-for-bigquery).)
+     #
+     # ```ruby
+     # require "google/cloud"
+     #
+     # gcloud = Google::Cloud.new
+     # bigquery = gcloud.bigquery
+     # dataset = bigquery.dataset "my_dataset"
+     #
+     # table = dataset.create_table "people" do |schema|
+     #   schema.string "first_name", mode: :required
+     #   schema.record "cities_lived", mode: :repeated do |nested_schema|
+     #     nested_schema.string "place", mode: :required
+     #     nested_schema.integer "number_of_years", mode: :required
+     #   end
+     # end
+     # ```
+     #
+     # Because of the repeated field in this schema, we cannot use the CSV format
+     # to load data into the table.
+     #
+     # ## Loading records
+     #
+     # In addition to CSV, data can be imported from files that are formatted as
+     # [Newline-delimited JSON](http://jsonlines.org/) or
+     # [Avro](http://avro.apache.org/), or from a Google Cloud Datastore backup.
+     # It can also be "streamed" into BigQuery.
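+     #
+     # Such files can also be loaded directly from Google Cloud Storage by
+     # passing their URL to `load`. A minimal sketch (the bucket and file names
+     # are illustrative, not from a real project):
+     #
+     # ```ruby
+     # # "my-bucket" and "people.json" are placeholders for your own data
+     # load_job = table.load "gs://my-bucket/people.json", format: "json"
+     # load_job.wait_until_done!
+     # ```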
+     #
+     # To follow along with these examples, you will need to set up billing on
+     # the [Google Developers Console](https://console.developers.google.com).
+     #
+     # ### Streaming records
+     #
+     # For situations in which you want new data to be available for querying as
+     # soon as possible, inserting individual records directly from your Ruby
+     # application is a great approach.
+     #
+     # ```ruby
+     # require "google/cloud"
+     #
+     # gcloud = Google::Cloud.new
+     # bigquery = gcloud.bigquery
+     # dataset = bigquery.dataset "my_dataset"
+     # table = dataset.table "people"
+     #
+     # rows = [
+     #   {
+     #     "first_name" => "Anna",
+     #     "cities_lived" => [
+     #       {
+     #         "place" => "Stockholm",
+     #         "number_of_years" => 2
+     #       }
+     #     ]
+     #   },
+     #   {
+     #     "first_name" => "Bob",
+     #     "cities_lived" => [
+     #       {
+     #         "place" => "Seattle",
+     #         "number_of_years" => 5
+     #       },
+     #       {
+     #         "place" => "Austin",
+     #         "number_of_years" => 6
+     #       }
+     #     ]
+     #   }
+     # ]
+     # table.insert rows
+     # ```
+     #
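+     # To confirm that every row was accepted, you can inspect the result of
+     # the call. A minimal sketch, assuming the response object returned by
+     # `insert` reports per-row errors via `success?` and `insert_errors`:
+     #
+     # ```ruby
+     # response = table.insert rows
+     # # Assumes the response exposes #success? and #insert_errors
+     # unless response.success?
+     #   response.insert_errors.each do |insert_error|
+     #     puts insert_error.row.inspect
+     #     puts insert_error.errors.inspect
+     #   end
+     # end
+     # ```
+     #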
+     # There are some trade-offs involved with streaming, so be sure to read the
+     # discussion of data consistency in [Streaming Data Into
+     # BigQuery](https://cloud.google.com/bigquery/streaming-data-into-bigquery).
+     #
+     # ### Uploading a file
+     #
+     # To follow along with this example, please download the
+     # [names.zip](http://www.ssa.gov/OACT/babynames/names.zip) archive from the
+     # U.S. Social Security Administration. Inside the archive you will find over
+     # 100 files containing baby name records since the year 1880. A PDF file
+     # also contained in the archive specifies the schema used below.
+     #
+     # ```ruby
+     # require "google/cloud"
+     #
+     # gcloud = Google::Cloud.new
+     # bigquery = gcloud.bigquery
+     # dataset = bigquery.dataset "my_dataset"
+     # table = dataset.create_table "baby_names" do |schema|
+     #   schema.string "name", mode: :required
+     #   schema.string "sex", mode: :required
+     #   schema.integer "number", mode: :required
+     # end
+     #
+     # file = File.open "names/yob2014.txt"
+     # load_job = table.load file, format: "csv"
+     # ```
+     #
+     # Because the names data, although formatted as CSV, is distributed in files
+     # with a `.txt` extension, this example explicitly passes the `format`
+     # option in order to demonstrate how to handle such situations. Because CSV
+     # is the default format for load operations, the option is not actually
+     # necessary. For JSON saved with a `.txt` extension, however, it would be.
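+     #
+     # For example, newline-delimited JSON saved with a `.txt` extension could
+     # be loaded like this (a sketch; the file name is illustrative):
+     #
+     # ```ruby
+     # # "people_data.txt" is a placeholder for a JSON file of your own
+     # file = File.open "people_data.txt"
+     # load_job = table.load file, format: "json"
+     # ```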
+     #
+     # ## Exporting query results to Google Cloud Storage
+     #
+     # The example below shows how to pass the `table` option with a query in
+     # order to store results in a permanent table. It also shows how to export
+     # the result data to a Google Cloud Storage file. In order to follow along,
+     # you will need to enable the Google Cloud Storage API in addition to
+     # setting up billing.
+     #
+     # ```ruby
+     # require "google/cloud"
+     # require "securerandom"
+     #
+     # gcloud = Google::Cloud.new
+     # bigquery = gcloud.bigquery
+     # dataset = bigquery.dataset "my_dataset"
+     # source_table = dataset.table "baby_names"
+     # result_table = dataset.create_table "baby_names_results"
+     #
+     # sql = "SELECT name, number as count " +
+     #       "FROM baby_names " +
+     #       "WHERE name CONTAINS 'Sam' " +
+     #       "ORDER BY count DESC"
+     # query_job = dataset.query_job sql, table: result_table
+     #
+     # query_job.wait_until_done!
+     #
+     # if !query_job.failed?
+     #
+     #   storage = gcloud.storage
+     #   bucket_id = "bigquery-exports-#{SecureRandom.uuid}"
+     #   bucket = storage.create_bucket bucket_id
+     #   extract_url = "gs://#{bucket.id}/baby-names-sam.csv"
+     #
+     #   extract_job = result_table.extract extract_url
+     #
+     #   extract_job.wait_until_done!
+     #
+     #   # Download to local filesystem
+     #   bucket.files.first.download "baby-names-sam.csv"
+     #
+     # end
+     # ```
+     #
+     # If a table you wish to export contains a large amount of data, you can
+     # pass a wildcard URI to export to multiple files (for sharding), or an
+     # array of URIs (for partitioning), or both. See [Exporting Data From
+     # BigQuery](https://cloud.google.com/bigquery/exporting-data-from-bigquery)
+     # for details.
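+     #
+     # For example, a wildcard URI shards the export across multiple numbered
+     # files. A minimal sketch, reusing `result_table` and `bucket` from the
+     # example above:
+     #
+     # ```ruby
+     # # BigQuery replaces the * with a sequential file number
+     # extract_url = "gs://#{bucket.id}/baby-names-sam-*.csv"
+     # extract_job = result_table.extract extract_url
+     # extract_job.wait_until_done!
+     # ```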
+     #
+     # ## Configuring retries and timeout
+     #
+     # You can configure how many times API requests may be automatically
+     # retried. When an API request fails, the response will be inspected to see
+     # if the request meets criteria indicating that it may succeed on retry,
+     # such as `500` and `503` status codes or a specific internal error code
+     # such as `rateLimitExceeded`. If it meets the criteria, the request will be
+     # retried after a delay. If the retried request fails again, the delay will
+     # be increased before each subsequent attempt, until the `retries` limit is
+     # reached.
+     #
+     # You can also set the request `timeout` value in seconds.
+     #
+     # ```ruby
+     # require "google/cloud"
+     #
+     # gcloud = Google::Cloud.new
+     # bigquery = gcloud.bigquery retries: 10, timeout: 120
+     # ```
+     #
+     # See the [BigQuery error
+     # table](https://cloud.google.com/bigquery/troubleshooting-errors#errortable)
+     # for a list of error conditions.
+     #
+     module Bigquery
+     end
+   end
+ end