embulk-input-bigquery 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/README.md +49 -1
- data/Rakefile +1 -1
- data/embulk-input-bigquery.gemspec +13 -12
- data/lib/embulk/input/bigquery.rb +164 -55
- data/lib/embulk/input/bigquery/version.rb +1 -1
- metadata +4 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: adc126def78ac278dafebe7ad7bf5830ad7f4f29
|
4
|
+
data.tar.gz: caa6d2d3500b9051889f8039d9bb42b5f6cbf13a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8d459d42c1d9c5c995f35010298657fc79df8fbc03eedfa26692f34090163fb28a60a6c1c73029c4e50ecd3a2595f5c85241b569aec07bf69d711f5b15c6059f
|
7
|
+
data.tar.gz: a670a9fde47bd8ca7cfb0412b35cd2c581549ba74788b8bbd96e32360d38f0a6d00e9b1434331ff2adbf593afe8c24450099856d18f05a9f3565b4be270a0c56
|
data/.gitignore
CHANGED
data/README.md
CHANGED
@@ -19,6 +19,7 @@ in:
|
|
19
19
|
columns:
|
20
20
|
- {name: price, type: long}
|
21
21
|
- {name: category_id, type: string}
|
22
|
+
max: 2000
|
22
23
|
out:
|
23
24
|
type: stdout
|
24
25
|
```
|
@@ -30,7 +31,7 @@ in:
|
|
30
31
|
type: bigquery
|
31
32
|
project: 'project-name'
|
32
33
|
keyfile: '/home/hogehoge/bigquery-keyfile.json'
|
33
|
-
|
34
|
+
sql_erb: 'SELECT price,category_id FROM [ecsite.products_<%= params["date"].strftime("%Y%m") %>] GROUP BY category_id'
|
34
35
|
erb_params:
|
35
36
|
date: "require 'date'; (Date.today - 1)"
|
36
37
|
columns:
|
@@ -39,3 +40,50 @@ in:
|
|
39
40
|
- {name: month, type: timestamp, format: '%Y-%m', eval: 'require "time"; Time.parse(params["date"]).to_i'}
|
40
41
|
```
|
41
42
|
|
43
|
+
### Determine columns from query results if the columns definition is empty
|
44
|
+
|
45
|
+
```
|
46
|
+
in:
|
47
|
+
type: bigquery
|
48
|
+
project: 'project-name'
|
49
|
+
keyfile: '/home/hogehoge/bigquery-keyfile.json'
|
50
|
+
sql: 'SELECT price,category_id FROM [ecsite.products] GROUP BY category_id'
|
51
|
+
out:
|
52
|
+
type: stdout
|
53
|
+
```
|
54
|
+
|
55
|
+
### Embed keyfile content as string into config
|
56
|
+
|
57
|
+
```
|
58
|
+
in:
|
59
|
+
type: bigquery
|
60
|
+
project: 'project-name'
|
61
|
+
keyfile:
|
62
|
+
content: |
|
63
|
+
{
|
64
|
+
"type": "service_account",
|
65
|
+
"project_id": "example-project",
|
66
|
+
"private_key_id": "1234567890ABCDEFG",
|
67
|
+
"private_key": "**************************************",
|
68
|
+
"client_email": "example-project@hogehoge.gserviceaccount.com",
|
69
|
+
"client_id": "12345678901234567890",
|
70
|
+
"auth_uri": "https://accounts.google.com/o/oauth2/auth",
|
71
|
+
"token_uri": "https://accounts.google.com/o/oauth2/token",
|
72
|
+
"auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
|
73
|
+
"client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/hogehoge.gcp.iam.gserviceaccount.com"
|
74
|
+
}
|
75
|
+
```
|
76
|
+
|
77
|
+
|
78
|
+
## Optional Configuration
|
79
|
+
This plugin uses the [`google-cloud` gem (Google Cloud Client Library for Ruby)](https://github.com/GoogleCloudPlatform/google-cloud-ruby) and queries data using [the synchronous method](https://github.com/GoogleCloudPlatform/google-cloud-ruby/blob/master/google-cloud-bigquery/lib/google/cloud/bigquery/project.rb#L281).
|
80
|
+
Therefore, the optional configuration items below follow the semantics of the corresponding options in the Google Cloud Client Library.
|
81
|
+
|
82
|
+
- [max](https://github.com/GoogleCloudPlatform/google-cloud-ruby/blob/master/google-cloud-bigquery/lib/google/cloud/bigquery/project.rb#L315) :
|
83
|
+
- default value: **null**. A null value is interpreted as [no maximum row count](https://github.com/GoogleCloudPlatform/google-cloud-ruby/blob/master/google-cloud-bigquery/lib/google/cloud/bigquery/project.rb#L319) by the Google Cloud Client Library.
|
84
|
+
- [cache](https://github.com/GoogleCloudPlatform/google-cloud-ruby/blob/master/google-cloud-bigquery/lib/google/cloud/bigquery/project.rb#L331) :
|
85
|
+
- default value: **null**. A null value is interpreted as [true](https://github.com/GoogleCloudPlatform/google-cloud-ruby/blob/master/google-cloud-bigquery/lib/google/cloud/bigquery/project.rb#L333) by the Google Cloud Client Library.
|
86
|
+
- [standard_sql](https://github.com/GoogleCloudPlatform/google-cloud-ruby/blob/master/google-cloud-bigquery/lib/google/cloud/bigquery/project.rb#L343):
|
87
|
+
- default value: **null**. A null value is interpreted as [true](https://github.com/GoogleCloudPlatform/google-cloud-ruby/blob/master/google-cloud-bigquery/lib/google/cloud/bigquery/project.rb#L351) by the Google Cloud Client Library.
|
88
|
+
- [legacy_sql](https://github.com/GoogleCloudPlatform/google-cloud-ruby/blob/master/google-cloud-bigquery/lib/google/cloud/bigquery/project.rb#L353):
|
89
|
+
- default value: **null**. A null value is interpreted as [false](https://github.com/GoogleCloudPlatform/google-cloud-ruby/blob/master/google-cloud-bigquery/lib/google/cloud/bigquery/project.rb#L361) by the Google Cloud Client Library.
|
data/Rakefile
CHANGED
@@ -1 +1 @@
|
|
1
|
-
require
|
1
|
+
require 'bundler/gem_tasks'
|
@@ -1,24 +1,25 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
|
2
3
|
lib = File.expand_path('../lib', __FILE__)
|
3
4
|
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
5
|
require 'embulk/input/bigquery/version'
|
5
6
|
|
6
7
|
Gem::Specification.new do |spec|
|
7
|
-
spec.name =
|
8
|
+
spec.name = 'embulk-input-bigquery'
|
8
9
|
spec.version = Embulk::Input::Bigquery::VERSION
|
9
|
-
spec.authors = [
|
10
|
-
spec.email = [
|
11
|
-
spec.description =
|
12
|
-
spec.summary =
|
13
|
-
spec.homepage =
|
14
|
-
spec.license =
|
10
|
+
spec.authors = ['Takeru Narita']
|
11
|
+
spec.email = ['naritano77@gmail.com']
|
12
|
+
spec.description = 'embulk input plugin from bigquery.'
|
13
|
+
spec.summary = 'Embulk input plugin from bigquery.'
|
14
|
+
spec.homepage = 'https://github.com/medjed/embulk-input-bigquery'
|
15
|
+
spec.license = 'MIT'
|
15
16
|
|
16
|
-
spec.files = `git ls-files`.split(
|
17
|
+
spec.files = `git ls-files`.split($INPUT_RECORD_SEPARATOR)
|
17
18
|
spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
|
18
19
|
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
19
|
-
spec.require_paths = [
|
20
|
+
spec.require_paths = ['lib']
|
20
21
|
|
21
|
-
spec.add_development_dependency
|
22
|
-
spec.add_development_dependency
|
23
|
-
spec.add_dependency
|
22
|
+
spec.add_development_dependency 'bundler', '~> 1.3'
|
23
|
+
spec.add_development_dependency 'rake'
|
24
|
+
spec.add_dependency 'google-cloud-bigquery', '~> 0.23'
|
24
25
|
end
|
@@ -1,63 +1,172 @@
|
|
1
|
-
require
|
2
|
-
require
|
1
|
+
require 'embulk/input/bigquery/version'
|
2
|
+
require 'google/cloud/bigquery'
|
3
3
|
require 'erb'
|
4
4
|
|
5
5
|
module Embulk
|
6
6
|
module Input
|
7
7
|
class InputBigquery < InputPlugin
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
8
|
+
Plugin.register_input('bigquery', self)
|
9
|
+
|
10
|
+
# Supports configuring the keyfile either by file path or by embedded content, in the manner of org.embulk.spi.unit.LocalFile
|
11
|
+
# keyfile:
|
12
|
+
# content: |
|
13
|
+
class LocalFile
|
14
|
+
# Returns a String value unchanged, or the parsed JSON of v['content'] when given a Hash
|
15
|
+
def self.load(v)
|
16
|
+
if v.is_a?(String)
|
17
|
+
v
|
18
|
+
elsif v.is_a?(Hash)
|
19
|
+
JSON.parse(v['content'])
|
20
|
+
end
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
24
|
+
def self.transaction(config, &control)
|
25
|
+
sql = config[:sql]
|
26
|
+
params = {}
|
27
|
+
unless sql
|
28
|
+
sql_erb = config[:sql_erb]
|
29
|
+
erb = ERB.new(sql_erb)
|
30
|
+
erb_params = config[:erb_params]
|
31
|
+
erb_params.each do |k, v|
|
32
|
+
params[k] = eval(v)
|
33
|
+
end
|
34
|
+
|
35
|
+
sql = erb.result(binding)
|
36
|
+
end
|
37
|
+
|
38
|
+
task = {
|
39
|
+
project: config[:project],
|
40
|
+
keyfile: config.param(:keyfile, LocalFile, nil),
|
41
|
+
sql: sql,
|
42
|
+
params: params,
|
43
|
+
option: {
|
44
|
+
max: config[:max],
|
45
|
+
cache: config[:cache],
|
46
|
+
standard_sql: config[:standard_sql],
|
47
|
+
legacy_sql: config[:legacy_sql]
|
48
|
+
}
|
49
|
+
}
|
50
|
+
|
51
|
+
if config[:columns]
|
52
|
+
task[:columns] = config[:columns]
|
53
|
+
else
|
54
|
+
bq = Google::Cloud::Bigquery.new(project: task[:project], keyfile: task[:keyfile])
|
55
|
+
task[:job_id], task[:columns] = determine_columns_by_query_results(sql, task[:option], bq)
|
56
|
+
end
|
57
|
+
|
58
|
+
columns = []
|
59
|
+
task[:columns].each_with_index do |c, i|
|
60
|
+
columns << Column.new(i, c['name'], c['type'].to_sym)
|
61
|
+
end
|
62
|
+
|
63
|
+
resume(task, columns, 1, &control)
|
64
|
+
end
|
65
|
+
|
66
|
+
def self.resume(task, columns, count, &control)
|
67
|
+
task_reports = yield(task, columns, count)
|
68
|
+
|
69
|
+
next_config_diff = {}
|
70
|
+
end
|
71
|
+
|
72
|
+
def run
|
73
|
+
bq = Google::Cloud::Bigquery.new(project: task[:project], keyfile: task[:keyfile])
|
74
|
+
params = @task[:params]
|
75
|
+
option = keys_to_sym(@task[:option])
|
76
|
+
|
77
|
+
rows = if @task[:job_id].nil?
|
78
|
+
bq.query(@task[:sql], **option)
|
79
|
+
else
|
80
|
+
bq.job(@task[:job_id]).query_results(max: option[:max])
|
81
|
+
end
|
82
|
+
|
83
|
+
@task[:columns] = values_to_sym(@task[:columns], 'name')
|
84
|
+
|
85
|
+
rows.all do |row|
|
86
|
+
columns = []
|
87
|
+
@task[:columns].each do |c|
|
88
|
+
val = row[c['name'].to_sym]
|
89
|
+
val = eval(c['eval'], binding) if c['eval']
|
90
|
+
|
91
|
+
columns << as_serializable(val)
|
92
|
+
end
|
93
|
+
|
94
|
+
@page_builder.add(columns)
|
95
|
+
end
|
96
|
+
@page_builder.finish
|
97
|
+
{}
|
98
|
+
end
|
99
|
+
|
100
|
+
def self.determine_columns_by_query_results(sql, option, bigquery_client)
|
101
|
+
Embulk.logger.info 'determine columns using the getQueryResults API instead of the config.yml'
|
102
|
+
|
103
|
+
filtered_option = option.dup
|
104
|
+
filtered_option.delete(:max)
|
105
|
+
job = bigquery_client.query_job(sql, **filtered_option)
|
106
|
+
|
107
|
+
Embulk.logger.info 'waiting for the query job to complete to get schema from query results'
|
108
|
+
job.wait_until_done!
|
109
|
+
|
110
|
+
Embulk.logger.info "completed: job_id=#{job.job_id}"
|
111
|
+
result = job.query_results(max: 0)
|
112
|
+
|
113
|
+
columns = result.fields.map do |f|
|
114
|
+
{
|
115
|
+
'name' => f.name,
|
116
|
+
'type' => embulk_column_type(f.type)
|
117
|
+
}
|
118
|
+
end
|
119
|
+
Embulk.logger.info "determined columns: #{columns.inspect}"
|
120
|
+
|
121
|
+
[job.job_id, columns]
|
122
|
+
end
|
123
|
+
|
124
|
+
def self.embulk_column_type(bq_data_type)
|
125
|
+
case bq_data_type
|
126
|
+
when 'BOOLEAN', 'BOOL'
|
127
|
+
:boolean
|
128
|
+
when 'INTEGER', 'INT64'
|
129
|
+
:long
|
130
|
+
when 'FLOAT', 'FLOAT64'
|
131
|
+
:double
|
132
|
+
when 'STRING', 'DATETIME', 'DATE', 'TIME'
|
133
|
+
:string
|
134
|
+
when 'TIMESTAMP'
|
135
|
+
:timestamp
|
136
|
+
when 'RECORD', 'BYTES'
|
137
|
+
raise "unsupported type #{bq_data_type.inspect}"
|
138
|
+
else
|
139
|
+
raise "unknown type #{bq_data_type.inspect}"
|
140
|
+
end
|
141
|
+
end
|
142
|
+
|
143
|
+
def keys_to_sym(hash)
|
144
|
+
ret = {}
|
145
|
+
hash.each do |key, value|
|
146
|
+
ret[key.to_sym] = value
|
147
|
+
end
|
148
|
+
ret
|
149
|
+
end
|
150
|
+
|
151
|
+
def values_to_sym(hashs, key)
|
152
|
+
hashs.map do |h|
|
153
|
+
h[key] = h[key].to_sym
|
154
|
+
h
|
155
|
+
end
|
156
|
+
end
|
157
|
+
|
158
|
+
def as_serializable(v)
|
159
|
+
case v
|
160
|
+
when ::Google::Cloud::Bigquery::Time
|
161
|
+
v.value
|
162
|
+
when DateTime
|
163
|
+
v.strftime('%Y-%m-%d %H:%M:%S.%6N')
|
164
|
+
when Date
|
165
|
+
v.strftime('%Y-%m-%d')
|
166
|
+
else
|
167
|
+
v
|
168
|
+
end
|
169
|
+
end
|
61
170
|
end
|
62
171
|
end
|
63
172
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk-input-bigquery
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Takeru Narita
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-
|
11
|
+
date: 2017-12-11 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -67,7 +67,7 @@ files:
|
|
67
67
|
- embulk-input-bigquery.gemspec
|
68
68
|
- lib/embulk/input/bigquery.rb
|
69
69
|
- lib/embulk/input/bigquery/version.rb
|
70
|
-
homepage:
|
70
|
+
homepage: https://github.com/medjed/embulk-input-bigquery
|
71
71
|
licenses:
|
72
72
|
- MIT
|
73
73
|
metadata: {}
|
@@ -87,7 +87,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
87
87
|
version: '0'
|
88
88
|
requirements: []
|
89
89
|
rubyforge_project:
|
90
|
-
rubygems_version: 2.
|
90
|
+
rubygems_version: 2.6.11
|
91
91
|
signing_key:
|
92
92
|
specification_version: 4
|
93
93
|
summary: Embulk input plugin from bigquery.
|