embulk-input-bigquery 0.0.2 → 0.0.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/README.md +49 -1
- data/Rakefile +1 -1
- data/embulk-input-bigquery.gemspec +13 -12
- data/lib/embulk/input/bigquery.rb +164 -55
- data/lib/embulk/input/bigquery/version.rb +1 -1
- metadata +4 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: adc126def78ac278dafebe7ad7bf5830ad7f4f29
|
4
|
+
data.tar.gz: caa6d2d3500b9051889f8039d9bb42b5f6cbf13a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8d459d42c1d9c5c995f35010298657fc79df8fbc03eedfa26692f34090163fb28a60a6c1c73029c4e50ecd3a2595f5c85241b569aec07bf69d711f5b15c6059f
|
7
|
+
data.tar.gz: a670a9fde47bd8ca7cfb0412b35cd2c581549ba74788b8bbd96e32360d38f0a6d00e9b1434331ff2adbf593afe8c24450099856d18f05a9f3565b4be270a0c56
|
data/.gitignore
CHANGED
data/README.md
CHANGED
@@ -19,6 +19,7 @@ in:
|
|
19
19
|
columns:
|
20
20
|
- {name: price, type: long}
|
21
21
|
- {name: category_id, type: string}
|
22
|
+
max: 2000
|
22
23
|
out:
|
23
24
|
type: stdout
|
24
25
|
```
|
@@ -30,7 +31,7 @@ in:
|
|
30
31
|
type: bigquery
|
31
32
|
project: 'project-name'
|
32
33
|
keyfile: '/home/hogehoge/bigquery-keyfile.json'
|
33
|
-
|
34
|
+
sql_erb: 'SELECT price,category_id FROM [ecsite.products_<%= params["date"].strftime("%Y%m") %>] GROUP BY category_id'
|
34
35
|
erb_params:
|
35
36
|
date: "require 'date'; (Date.today - 1)"
|
36
37
|
columns:
|
@@ -39,3 +40,50 @@ in:
|
|
39
40
|
- {name: month, type: timestamp, format: '%Y-%m', eval: 'require "time"; Time.parse(params["date"]).to_i'}
|
40
41
|
```
|
41
42
|
|
43
|
+
### Determine columns from query results if columns definition is empty
|
44
|
+
|
45
|
+
```
|
46
|
+
in:
|
47
|
+
type: bigquery
|
48
|
+
project: 'project-name'
|
49
|
+
keyfile: '/home/hogehoge/bigquery-keyfile.json'
|
50
|
+
sql: 'SELECT price,category_id FROM [ecsite.products] GROUP BY category_id'
|
51
|
+
out:
|
52
|
+
type: stdout
|
53
|
+
```
|
54
|
+
|
55
|
+
### Embed keyfile content as string into config
|
56
|
+
|
57
|
+
```
|
58
|
+
in:
|
59
|
+
type: bigquery
|
60
|
+
project: 'project-name'
|
61
|
+
keyfile:
|
62
|
+
content: |
|
63
|
+
{
|
64
|
+
"type": "service_account",
|
65
|
+
"project_id": "example-project",
|
66
|
+
"private_key_id": "1234567890ABCDEFG",
|
67
|
+
"private_key": "**************************************",
|
68
|
+
"client_email": "example-project@hogehoge.gserviceaccount.com",
|
69
|
+
"client_id": "12345678901234567890",
|
70
|
+
"auth_uri": "https://accounts.google.com/o/oauth2/auth",
|
71
|
+
"token_uri": "https://accounts.google.com/o/oauth2/token",
|
72
|
+
"auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
|
73
|
+
"client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/hogehoge.gcp.iam.gserviceaccount.com"
|
74
|
+
}
|
75
|
+
```
|
76
|
+
|
77
|
+
|
78
|
+
## Optional Configuration
|
79
|
+
This plugin uses the [`google-cloud`](https://github.com/GoogleCloudPlatform/google-cloud-ruby) gem (Google Cloud Client Library for Ruby) and queries data using [the synchronous method](https://github.com/GoogleCloudPlatform/google-cloud-ruby/blob/master/google-cloud-bigquery/lib/google/cloud/bigquery/project.rb#L281).
|
80
|
+
Therefore, some optional configuration items follow the semantics of the Google Cloud Client Library.
|
81
|
+
|
82
|
+
- [max](https://github.com/GoogleCloudPlatform/google-cloud-ruby/blob/master/google-cloud-bigquery/lib/google/cloud/bigquery/project.rb#L315) :
|
83
|
+
- default value: **null**; a null value is interpreted as [no maximum row count](https://github.com/GoogleCloudPlatform/google-cloud-ruby/blob/master/google-cloud-bigquery/lib/google/cloud/bigquery/project.rb#L319) by the Google Cloud Client Library.
|
84
|
+
- [cache](https://github.com/GoogleCloudPlatform/google-cloud-ruby/blob/master/google-cloud-bigquery/lib/google/cloud/bigquery/project.rb#L331) :
|
85
|
+
- default value: **null**; a null value is interpreted as [true](https://github.com/GoogleCloudPlatform/google-cloud-ruby/blob/master/google-cloud-bigquery/lib/google/cloud/bigquery/project.rb#L333) by the Google Cloud Client Library.
|
86
|
+
- [standard_sql](https://github.com/GoogleCloudPlatform/google-cloud-ruby/blob/master/google-cloud-bigquery/lib/google/cloud/bigquery/project.rb#L343):
|
87
|
+
- default value: **null**; a null value is interpreted as [true](https://github.com/GoogleCloudPlatform/google-cloud-ruby/blob/master/google-cloud-bigquery/lib/google/cloud/bigquery/project.rb#L351) by the Google Cloud Client Library.
|
88
|
+
- [legacy_sql](https://github.com/GoogleCloudPlatform/google-cloud-ruby/blob/master/google-cloud-bigquery/lib/google/cloud/bigquery/project.rb#L353):
|
89
|
+
- default value: **null**; a null value is interpreted as [false](https://github.com/GoogleCloudPlatform/google-cloud-ruby/blob/master/google-cloud-bigquery/lib/google/cloud/bigquery/project.rb#L361) by the Google Cloud Client Library.
|
data/Rakefile
CHANGED
@@ -1 +1 @@
|
|
1
|
-
require
|
1
|
+
require 'bundler/gem_tasks'
|
@@ -1,24 +1,25 @@
|
|
1
1
|
# coding: utf-8

# Gem specification for embulk-input-bigquery.

# `$INPUT_RECORD_SEPARATOR` is only defined once the English library is
# loaded; without it the global is nil and `split` silently falls back to
# splitting on any whitespace.
require 'English'

lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'embulk/input/bigquery/version'

Gem::Specification.new do |spec|
  spec.name          = 'embulk-input-bigquery'
  spec.version       = Embulk::Input::Bigquery::VERSION
  spec.authors       = ['Takeru Narita']
  spec.email         = ['naritano77@gmail.com']
  spec.description   = 'embulk input plugin from bigquery.'
  spec.summary       = 'Embulk input plugin from bigquery.'
  spec.homepage      = 'https://github.com/medjed/embulk-input-bigquery'
  spec.license       = 'MIT'

  # Package every file tracked by git, one path per output line.
  spec.files         = `git ls-files`.split($INPUT_RECORD_SEPARATOR)
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ['lib']

  spec.add_development_dependency 'bundler', '~> 1.3'
  spec.add_development_dependency 'rake'
  spec.add_dependency 'google-cloud-bigquery', '~> 0.23'
end
|
@@ -1,63 +1,172 @@
|
|
1
|
-
require
|
2
|
-
require
|
1
|
+
require 'embulk/input/bigquery/version'
|
2
|
+
require 'google/cloud/bigquery'
|
3
3
|
require 'erb'
|
4
4
|
|
5
5
|
module Embulk
|
6
6
|
module Input
|
7
7
|
class InputBigquery < InputPlugin
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
8
|
+
Plugin.register_input('bigquery', self)
|
9
|
+
|
10
|
+
# Loader for the `keyfile` option. It supports the two forms accepted by
# org.embulk.spi.unit.LocalFile: a file path, or the file content inline.
#
#   keyfile: '/path/to/keyfile.json'
#
#   keyfile:
#     content: |
#       { "type": "service_account", ... }
class LocalFile
  # Resolve a `keyfile` config value.
  #
  # @param v [String, Hash, nil] a file path, or a mapping whose 'content'
  #   entry holds the keyfile JSON text
  # @return [String, Hash, nil] the path unchanged when given a String; the
  #   parsed JSON document (a Hash, not a JSON string) when given a mapping;
  #   nil for any other value
  # @raise [ArgumentError] when a mapping is given without a 'content' entry
  # @raise [JSON::ParserError] when 'content' is not valid JSON
  def self.load(v)
    case v
    when String
      v
    when Hash
      content = v['content']
      # Fail with an explicit message instead of the opaque TypeError that
      # JSON.parse(nil) would raise.
      raise ArgumentError, "keyfile must be a file path or a mapping with a 'content' key" if content.nil?

      JSON.parse(content)
    end
  end
end
|
23
|
+
|
24
|
+
# Build the task and the output schema from the plugin config, then hand
# control to Embulk through `resume`.
#
# The query comes from `sql`, or is rendered from the `sql_erb` template
# with the evaluated `erb_params`. When `columns` is not configured, the
# query is submitted here and the schema is taken from its results.
#
# @param config the plugin configuration (`in:` section)
# @param control block forwarded to `resume`
# @return whatever `resume` returns
# @raise [ArgumentError] when neither `sql` nor `sql_erb` is configured
def self.transaction(config, &control)
  sql = config[:sql]
  # Keep this local named `params`: ERB templates reference it through
  # `binding` (e.g. `<%= params["date"] %>`).
  params = {}
  unless sql
    sql_erb = config[:sql_erb]
    raise ArgumentError, "either 'sql' or 'sql_erb' is required" unless sql_erb

    # NOTE(review): each erb_params value is executed as Ruby code via
    # `eval`. This is only safe while the config file is fully trusted.
    (config[:erb_params] || {}).each do |k, v|
      params[k] = eval(v)
    end

    sql = ERB.new(sql_erb).result(binding)
  end

  task = {
    project: config[:project],
    # `param` takes an options hash; `default: nil` makes keyfile optional.
    keyfile: config.param(:keyfile, LocalFile, default: nil),
    sql: sql,
    params: params,
    option: {
      max: config[:max],
      cache: config[:cache],
      standard_sql: config[:standard_sql],
      legacy_sql: config[:legacy_sql]
    }
  }

  if config[:columns]
    task[:columns] = config[:columns]
  else
    # No schema configured: run the query now and derive it from the results.
    # The job id is kept so that `run` can reuse the finished job.
    bq = Google::Cloud::Bigquery.new(project: task[:project], keyfile: task[:keyfile])
    task[:job_id], task[:columns] = determine_columns_by_query_results(sql, task[:option], bq)
  end

  columns = task[:columns].each_with_index.map do |c, i|
    Column.new(i, c['name'], c['type'].to_sym)
  end

  resume(task, columns, 1, &control)
end
|
65
|
+
|
66
|
+
# Run the input tasks by yielding to Embulk's control block.
#
# @param task [Hash] the task definition passed to the block
# @param columns [Array] the output schema columns passed to the block
# @param count [Integer] the number of tasks passed to the block
# @return [Hash] an empty config diff
def self.resume(task, columns, count, &control)
  # The task reports returned by the block are not used.
  yield(task, columns, count)

  {}
end
|
71
|
+
|
72
|
+
# Execute the query (or fetch the results of the job already started in
# `transaction`) and feed every row to the page builder.
#
# @return [Hash] an empty task report
def run
  bq = Google::Cloud::Bigquery.new(project: @task[:project], keyfile: @task[:keyfile])
  # Not used directly here: column `eval` snippets reference `params`
  # through `binding`.
  params = @task[:params]
  option = keys_to_sym(@task[:option])

  rows = if @task[:job_id].nil?
           bq.query(@task[:sql], **option)
         else
           # The query already ran while determining the columns; reuse its job.
           bq.job(@task[:job_id]).query_results(max: option[:max])
         end

  @task[:columns] = values_to_sym(@task[:columns], 'name')

  rows.all do |row|
    record = @task[:columns].map do |c|
      val = row[c['name'].to_sym]
      # NOTE(review): `eval` executes Ruby code taken from the config; `val`,
      # `row` and `params` are visible to the snippet. Trusted configs only.
      val = eval(c['eval'], binding) if c['eval']

      as_serializable(val)
    end

    @page_builder.add(record)
  end
  @page_builder.finish
  {}
end
|
99
|
+
|
100
|
+
# Submit the query as a job, wait for it to finish, and derive the column
# definitions from the schema of its results.
#
# @param sql [String] the query to run
# @param option [Hash] query options with Symbol keys; `:max` is removed
#   before the job is submitted
# @param bigquery_client the BigQuery client used to submit the job
# @return [Array] a pair `[job_id, columns]`, where `columns` is an Array of
#   `{'name' => ..., 'type' => ...}` Hashes
# @raise [RuntimeError] when a result field has a type that
#   `embulk_column_type` does not map
def self.determine_columns_by_query_results(sql, option, bigquery_client)
  Embulk.logger.info 'determine columns using the getQueryResults API instead of the config.yml'

  # NOTE(review): `max` is dropped here, presumably because it limits result
  # rows rather than the job itself — confirm against query_job.
  job_option = option.reject { |name, _| name == :max }
  job = bigquery_client.query_job(sql, **job_option)

  Embulk.logger.info 'waiting for the query job to complete to get schema from query results'
  job.wait_until_done!

  Embulk.logger.info "completed: job_id=#{job.job_id}"
  # Only the schema is needed, so request zero rows.
  result = job.query_results(max: 0)

  columns = result.fields.map do |f|
    { 'name' => f.name, 'type' => embulk_column_type(f.type) }
  end
  Embulk.logger.info "determined columns: #{columns.inspect}"

  [job.job_id, columns]
end
|
123
|
+
|
124
|
+
# Map a BigQuery field type name to the Embulk column type.
#
# @param bq_data_type [String] BigQuery type name, e.g. 'INTEGER' or 'INT64'
# @return [Symbol] one of :boolean, :long, :double, :string, :timestamp
# @raise [RuntimeError] for 'RECORD' and 'BYTES' (unsupported) and for any
#   type name not listed here (unknown)
def self.embulk_column_type(bq_data_type)
  case bq_data_type
  when 'BOOLEAN', 'BOOL'
    :boolean
  when 'INTEGER', 'INT64'
    :long
  when 'FLOAT', 'FLOAT64'
    :double
  when 'STRING', 'DATETIME', 'DATE', 'TIME'
    # Date and time values without a zone are emitted as strings.
    :string
  when 'TIMESTAMP'
    :timestamp
  when 'RECORD', 'BYTES'
    raise "unsupported type #{bq_data_type.inspect}"
  else
    raise "unknown type #{bq_data_type.inspect}"
  end
end
|
142
|
+
|
143
|
+
# Return a new Hash with every key converted to a Symbol.
#
# @param hash [Hash] a Hash whose keys respond to `to_sym`
# @return [Hash] a new Hash with Symbol keys and the same values
def keys_to_sym(hash)
  hash.each_with_object({}) do |(key, value), ret|
    ret[key.to_sym] = value
  end
end
|
150
|
+
|
151
|
+
# Convert the value stored under `key` to a Symbol in each Hash.
#
# The given Hashes are not modified; copies are returned.
#
# @param hashs [Array<Hash>] the Hashes to convert
# @param key the key whose value is converted with `to_sym`
# @return [Array<Hash>] copies of the Hashes with `key` mapped to a Symbol
def values_to_sym(hashs, key)
  hashs.map do |h|
    h.merge(key => h[key].to_sym)
  end
end
|
157
|
+
|
158
|
+
# Convert a value read from a BigQuery row into one the page builder accepts.
#
# @param v the value to convert
# @return [Object] `v.value` for a BigQuery Time; a
#   'YYYY-MM-DD HH:MM:SS.ffffff' String for a DateTime; a 'YYYY-MM-DD' String
#   for a Date; `v` itself otherwise
def as_serializable(v)
  case v
  when ::Google::Cloud::Bigquery::Time
    v.value
  when DateTime
    # DateTime is a subclass of Date, so it must be matched first.
    v.strftime('%Y-%m-%d %H:%M:%S.%6N')
  when Date
    v.strftime('%Y-%m-%d')
  else
    v
  end
end
|
61
170
|
end
|
62
171
|
end
|
63
172
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk-input-bigquery
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Takeru Narita
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-
|
11
|
+
date: 2017-12-11 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -67,7 +67,7 @@ files:
|
|
67
67
|
- embulk-input-bigquery.gemspec
|
68
68
|
- lib/embulk/input/bigquery.rb
|
69
69
|
- lib/embulk/input/bigquery/version.rb
|
70
|
-
homepage:
|
70
|
+
homepage: https://github.com/medjed/embulk-input-bigquery
|
71
71
|
licenses:
|
72
72
|
- MIT
|
73
73
|
metadata: {}
|
@@ -87,7 +87,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
87
87
|
version: '0'
|
88
88
|
requirements: []
|
89
89
|
rubyforge_project:
|
90
|
-
rubygems_version: 2.
|
90
|
+
rubygems_version: 2.6.11
|
91
91
|
signing_key:
|
92
92
|
specification_version: 4
|
93
93
|
summary: Embulk input plugin from bigquery.
|