embulk-input-bigquery 0.0.2 → 0.0.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: b7f8b54f5c0bf602236407494d3c0bf35257e04d
4
- data.tar.gz: bd1116a46dc6016dc58cc536d69e1b71c0f80242
3
+ metadata.gz: adc126def78ac278dafebe7ad7bf5830ad7f4f29
4
+ data.tar.gz: caa6d2d3500b9051889f8039d9bb42b5f6cbf13a
5
5
  SHA512:
6
- metadata.gz: dfcfe921546bc8d89e2df091056247f3a1b1a1eb17590eea4f512d7eb3f8856fe5f778c071b281f75e8349a4af93e2a8a3131aeadbf5b8893acb2bbea93fb547
7
- data.tar.gz: 9f00dd42dc3219ab1f75a445609cc73419f36c87c4ef5ab98e00954bc990c8560f42f3fc75208037d542ef3767c54b6356932f9bdb68ff45286c0007d4b05e85
6
+ metadata.gz: 8d459d42c1d9c5c995f35010298657fc79df8fbc03eedfa26692f34090163fb28a60a6c1c73029c4e50ecd3a2595f5c85241b569aec07bf69d711f5b15c6059f
7
+ data.tar.gz: a670a9fde47bd8ca7cfb0412b35cd2c581549ba74788b8bbd96e32360d38f0a6d00e9b1434331ff2adbf593afe8c24450099856d18f05a9f3565b4be270a0c56
data/.gitignore CHANGED
@@ -15,3 +15,4 @@ spec/reports
15
15
  test/tmp
16
16
  test/version_tmp
17
17
  tmp
18
+ vendor
data/README.md CHANGED
@@ -19,6 +19,7 @@ in:
19
19
  columns:
20
20
  - {name: price, type: long}
21
21
  - {name: category_id, type: string}
22
+ max: 2000
22
23
  out:
23
24
  type: stdout
24
25
  ```
@@ -30,7 +31,7 @@ in:
30
31
  type: bigquery
31
32
  project: 'project-name'
32
33
  keyfile: '/home/hogehoge/bigquery-keyfile.json'
33
- sql: 'SELECT price,category_id FROM [ecsite.products_<%= params["date"].strftime("%Y%m") %>] GROUP BY category_id'
34
+ sql_erb: 'SELECT price,category_id FROM [ecsite.products_<%= params["date"].strftime("%Y%m") %>] GROUP BY category_id'
34
35
  erb_params:
35
36
  date: "require 'date'; (Date.today - 1)"
36
37
  columns:
@@ -39,3 +40,50 @@ in:
39
40
  - {name: month, type: timestamp, format: '%Y-%m', eval: 'require "time"; Time.parse(params["date"]).to_i'}
40
41
  ```
41
42
 
43
+ ### Determine columns from query results if columns definition is empty
44
+
45
+ ```
46
+ in:
47
+ type: bigquery
48
+ project: 'project-name'
49
+ keyfile: '/home/hogehoge/bigquery-keyfile.json'
50
+ sql: 'SELECT price,category_id FROM [ecsite.products] GROUP BY category_id'
51
+ out:
52
+ type: stdout
53
+ ```
54
+
55
+ ### Embed keyfile content as string into config
56
+
57
+ ```
58
+ in:
59
+ type: bigquery
60
+ project: 'project-name'
61
+ keyfile:
62
+ content: |
63
+ {
64
+ "type": "service_account",
65
+ "project_id": "example-project",
66
+ "private_key_id": "1234567890ABCDEFG",
67
+ "private_key": "**************************************",
68
+ "client_email": "example-project@hogehoge.gserviceaccount.com",
69
+ "client_id": "12345678901234567890",
70
+ "auth_uri": "https://accounts.google.com/o/oauth2/auth",
71
+ "token_uri": "https://accounts.google.com/o/oauth2/token",
72
+ "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
73
+ "client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/hogehoge.gcp.iam.gserviceaccount.com"
74
+ }
75
+ ```
76
+
77
+
78
+ ## Optional Configuration
79
+ This plugin uses the [`google-cloud` gem (Google Cloud Client Library for Ruby)](https://github.com/GoogleCloudPlatform/google-cloud-ruby) and queries data using [the synchronous method](https://github.com/GoogleCloudPlatform/google-cloud-ruby/blob/master/google-cloud-bigquery/lib/google/cloud/bigquery/project.rb#L281).
80
+ Therefore, some optional configuration items follow the behavior of the Google Cloud Client Library.
81
+
82
+ - [max](https://github.com/GoogleCloudPlatform/google-cloud-ruby/blob/master/google-cloud-bigquery/lib/google/cloud/bigquery/project.rb#L315) :
83
+ - default value: **null**; a null value is interpreted as [no maximum row count](https://github.com/GoogleCloudPlatform/google-cloud-ruby/blob/master/google-cloud-bigquery/lib/google/cloud/bigquery/project.rb#L319) by the Google Cloud Client Library.
84
+ - [cache](https://github.com/GoogleCloudPlatform/google-cloud-ruby/blob/master/google-cloud-bigquery/lib/google/cloud/bigquery/project.rb#L331) :
85
+ - default value: **null**; a null value is interpreted as [true](https://github.com/GoogleCloudPlatform/google-cloud-ruby/blob/master/google-cloud-bigquery/lib/google/cloud/bigquery/project.rb#L333) by the Google Cloud Client Library.
86
+ - [standard_sql](https://github.com/GoogleCloudPlatform/google-cloud-ruby/blob/master/google-cloud-bigquery/lib/google/cloud/bigquery/project.rb#L343):
87
+ - default value: **null**; a null value is interpreted as [true](https://github.com/GoogleCloudPlatform/google-cloud-ruby/blob/master/google-cloud-bigquery/lib/google/cloud/bigquery/project.rb#L351) by the Google Cloud Client Library.
88
+ - [legacy_sql](https://github.com/GoogleCloudPlatform/google-cloud-ruby/blob/master/google-cloud-bigquery/lib/google/cloud/bigquery/project.rb#L353):
89
+ - default value: **null**; a null value is interpreted as [false](https://github.com/GoogleCloudPlatform/google-cloud-ruby/blob/master/google-cloud-bigquery/lib/google/cloud/bigquery/project.rb#L361) by the Google Cloud Client Library.
data/Rakefile CHANGED
@@ -1 +1 @@
1
- require "bundler/gem_tasks"
1
+ require 'bundler/gem_tasks'
@@ -1,24 +1,25 @@
1
1
  # coding: utf-8
2
+
2
3
  lib = File.expand_path('../lib', __FILE__)
3
4
  $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
5
  require 'embulk/input/bigquery/version'
5
6
 
6
7
  Gem::Specification.new do |spec|
7
- spec.name = "embulk-input-bigquery"
8
+ spec.name = 'embulk-input-bigquery'
8
9
  spec.version = Embulk::Input::Bigquery::VERSION
9
- spec.authors = ["Takeru Narita"]
10
- spec.email = ["naritano77@gmail.com"]
11
- spec.description = %q{embulk input plugin from bigquery.}
12
- spec.summary = %q{Embulk input plugin from bigquery.}
13
- spec.homepage = ""
14
- spec.license = "MIT"
10
+ spec.authors = ['Takeru Narita']
11
+ spec.email = ['naritano77@gmail.com']
12
+ spec.description = 'embulk input plugin from bigquery.'
13
+ spec.summary = 'Embulk input plugin from bigquery.'
14
+ spec.homepage = 'https://github.com/medjed/embulk-input-bigquery'
15
+ spec.license = 'MIT'
15
16
 
16
- spec.files = `git ls-files`.split($/)
17
+ spec.files = `git ls-files`.split($INPUT_RECORD_SEPARATOR)
17
18
  spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
18
19
  spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
19
- spec.require_paths = ["lib"]
20
+ spec.require_paths = ['lib']
20
21
 
21
- spec.add_development_dependency "bundler", "~> 1.3"
22
- spec.add_development_dependency "rake"
23
- spec.add_dependency "google-cloud-bigquery", '~> 0.23'
22
+ spec.add_development_dependency 'bundler', '~> 1.3'
23
+ spec.add_development_dependency 'rake'
24
+ spec.add_dependency 'google-cloud-bigquery', '~> 0.23'
24
25
  end
@@ -1,63 +1,172 @@
1
- require "embulk/input/bigquery/version"
2
- require "google/cloud/bigquery"
1
+ require 'embulk/input/bigquery/version'
2
+ require 'google/cloud/bigquery'
3
3
  require 'erb'
4
4
 
5
5
  module Embulk
6
6
  module Input
7
7
  class InputBigquery < InputPlugin
8
- Plugin.register_input('bigquery', self)
9
-
10
- def self.transaction(config, &control)
11
- sql = config[:sql]
12
- params = {}
13
- unless sql
14
- sql_erb = config[:sql_erb]
15
- erb = ERB.new(sql_erb)
16
- erb_params = config[:erb_params]
17
- erb_params.each do |k, v|
18
- params[k] = eval(v)
19
- end
20
-
21
- sql = erb.result(binding)
22
- end
23
-
24
- task = {
25
- project: config[:project],
26
- keyfile: config[:keyfile],
27
- sql: sql,
28
- columns: config[:columns],
29
- params: params
30
- }
31
-
32
- columns = []
33
- config[:columns].each_with_index do |c, i|
34
- columns << Column.new(i, c['name'], c['type'].to_sym)
35
- end
36
-
37
- yield(task, columns, 1)
38
-
39
- return {}
40
- end
41
-
42
- def run
43
- bq = Google::Cloud::Bigquery.new(project: @task[:project], keyfile: @task[:keyfile])
44
- params = @task[:params]
45
- rows = bq.query(@task[:sql])
46
- rows.each do |row|
47
- columns = []
48
- @task[:columns].each do |c|
49
- val = row[c['name']]
50
- if c['eval']
51
- val = eval(c['eval'], binding)
52
- end
53
- columns << val
54
- end
55
-
56
- @page_builder.add(columns)
57
- end
58
- @page_builder.finish
59
- return {}
60
- end
8
+ Plugin.register_input('bigquery', self)
9
+
10
+ # Supports a keyfile given either as a file path or as inline content, in the manner of org.embulk.spi.unit.LocalFile, e.g.:
11
+ # keyfile:
12
+ # content: |
13
+ class LocalFile
14
+ # Returns a String argument unchanged; for a Hash, returns its 'content' value parsed as JSON
15
+ def self.load(v)
16
+ if v.is_a?(String)
17
+ v
18
+ elsif v.is_a?(Hash)
19
+ JSON.parse(v['content'])
20
+ end
21
+ end
22
+ end
23
+
24
+ def self.transaction(config, &control)
25
+ sql = config[:sql]
26
+ params = {}
27
+ unless sql
28
+ sql_erb = config[:sql_erb]
29
+ erb = ERB.new(sql_erb)
30
+ erb_params = config[:erb_params]
31
+ erb_params.each do |k, v|
32
+ params[k] = eval(v)
33
+ end
34
+
35
+ sql = erb.result(binding)
36
+ end
37
+
38
+ task = {
39
+ project: config[:project],
40
+ keyfile: config.param(:keyfile, LocalFile, nil),
41
+ sql: sql,
42
+ params: params,
43
+ option: {
44
+ max: config[:max],
45
+ cache: config[:cache],
46
+ standard_sql: config[:standard_sql],
47
+ legacy_sql: config[:legacy_sql]
48
+ }
49
+ }
50
+
51
+ if config[:columns]
52
+ task[:columns] = config[:columns]
53
+ else
54
+ bq = Google::Cloud::Bigquery.new(project: task[:project], keyfile: task[:keyfile])
55
+ task[:job_id], task[:columns] = determine_columns_by_query_results(sql, task[:option], bq)
56
+ end
57
+
58
+ columns = []
59
+ task[:columns].each_with_index do |c, i|
60
+ columns << Column.new(i, c['name'], c['type'].to_sym)
61
+ end
62
+
63
+ resume(task, columns, 1, &control)
64
+ end
65
+
66
+ def self.resume(task, columns, count, &control)
67
+ task_reports = yield(task, columns, count)
68
+
69
+ next_config_diff = {}
70
+ end
71
+
72
+ def run
73
+ bq = Google::Cloud::Bigquery.new(project: task[:project], keyfile: task[:keyfile])
74
+ params = @task[:params]
75
+ option = keys_to_sym(@task[:option])
76
+
77
+ rows = if @task[:job_id].nil?
78
+ bq.query(@task[:sql], **option)
79
+ else
80
+ bq.job(@task[:job_id]).query_results(max: option[:max])
81
+ end
82
+
83
+ @task[:columns] = values_to_sym(@task[:columns], 'name')
84
+
85
+ rows.all do |row|
86
+ columns = []
87
+ @task[:columns].each do |c|
88
+ val = row[c['name'].to_sym]
89
+ val = eval(c['eval'], binding) if c['eval']
90
+
91
+ columns << as_serializable(val)
92
+ end
93
+
94
+ @page_builder.add(columns)
95
+ end
96
+ @page_builder.finish
97
+ {}
98
+ end
99
+
100
+ def self.determine_columns_by_query_results(sql, option, bigquery_client)
101
+ Embulk.logger.info 'determine columns using the getQueryResults API instead of the config.yml'
102
+
103
+ filtered_option = option.dup
104
+ filtered_option.delete(:max)
105
+ job = bigquery_client.query_job(sql, **filtered_option)
106
+
107
+ Embulk.logger.info 'waiting for the query job to complete to get schema from query results'
108
+ job.wait_until_done!
109
+
110
+ Embulk.logger.info "completed: job_id=#{job.job_id}"
111
+ result = job.query_results(max: 0)
112
+
113
+ columns = result.fields.map do |f|
114
+ {
115
+ 'name' => f.name,
116
+ 'type' => embulk_column_type(f.type)
117
+ }
118
+ end
119
+ Embulk.logger.info "determined columns: #{columns.inspect}"
120
+
121
+ [job.job_id, columns]
122
+ end
123
+
124
+ def self.embulk_column_type(bq_data_type)
125
+ case bq_data_type
126
+ when 'BOOLEAN', 'BOOL'
127
+ :boolean
128
+ when 'INTEGER', 'INT64'
129
+ :long
130
+ when 'FLOAT', 'FLOAT64'
131
+ :double
132
+ when 'STRING', 'DATETIME', 'DATE', 'TIME'
133
+ :string
134
+ when 'TIMESTAMP'
135
+ :timestamp
136
+ when 'RECORD', 'BYTES'
137
+ raise "unsupported type #{bq_data_type.inspect}"
138
+ else
139
+ raise "unknown type #{bq_data_type.inspect}"
140
+ end
141
+ end
142
+
143
+ def keys_to_sym(hash)
144
+ ret = {}
145
+ hash.each do |key, value|
146
+ ret[key.to_sym] = value
147
+ end
148
+ ret
149
+ end
150
+
151
+ def values_to_sym(hashs, key)
152
+ hashs.map do |h|
153
+ h[key] = h[key].to_sym
154
+ h
155
+ end
156
+ end
157
+
158
+ def as_serializable(v)
159
+ case v
160
+ when ::Google::Cloud::Bigquery::Time
161
+ v.value
162
+ when DateTime
163
+ v.strftime('%Y-%m-%d %H:%M:%S.%6N')
164
+ when Date
165
+ v.strftime('%Y-%m-%d')
166
+ else
167
+ v
168
+ end
169
+ end
61
170
  end
62
171
  end
63
172
  end
@@ -1,7 +1,7 @@
1
1
  module Embulk
2
2
  module Input
3
3
  module Bigquery
4
- VERSION = "0.0.2"
4
+ VERSION = '0.0.3'.freeze
5
5
  end
6
6
  end
7
7
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: embulk-input-bigquery
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.0.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Takeru Narita
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-02-01 00:00:00.000000000 Z
11
+ date: 2017-12-11 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -67,7 +67,7 @@ files:
67
67
  - embulk-input-bigquery.gemspec
68
68
  - lib/embulk/input/bigquery.rb
69
69
  - lib/embulk/input/bigquery/version.rb
70
- homepage: ''
70
+ homepage: https://github.com/medjed/embulk-input-bigquery
71
71
  licenses:
72
72
  - MIT
73
73
  metadata: {}
@@ -87,7 +87,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
87
87
  version: '0'
88
88
  requirements: []
89
89
  rubyforge_project:
90
- rubygems_version: 2.2.0
90
+ rubygems_version: 2.6.11
91
91
  signing_key:
92
92
  specification_version: 4
93
93
  summary: Embulk input plugin from bigquery.