embulk-input-bigquery 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: b7f8b54f5c0bf602236407494d3c0bf35257e04d
4
- data.tar.gz: bd1116a46dc6016dc58cc536d69e1b71c0f80242
3
+ metadata.gz: adc126def78ac278dafebe7ad7bf5830ad7f4f29
4
+ data.tar.gz: caa6d2d3500b9051889f8039d9bb42b5f6cbf13a
5
5
  SHA512:
6
- metadata.gz: dfcfe921546bc8d89e2df091056247f3a1b1a1eb17590eea4f512d7eb3f8856fe5f778c071b281f75e8349a4af93e2a8a3131aeadbf5b8893acb2bbea93fb547
7
- data.tar.gz: 9f00dd42dc3219ab1f75a445609cc73419f36c87c4ef5ab98e00954bc990c8560f42f3fc75208037d542ef3767c54b6356932f9bdb68ff45286c0007d4b05e85
6
+ metadata.gz: 8d459d42c1d9c5c995f35010298657fc79df8fbc03eedfa26692f34090163fb28a60a6c1c73029c4e50ecd3a2595f5c85241b569aec07bf69d711f5b15c6059f
7
+ data.tar.gz: a670a9fde47bd8ca7cfb0412b35cd2c581549ba74788b8bbd96e32360d38f0a6d00e9b1434331ff2adbf593afe8c24450099856d18f05a9f3565b4be270a0c56
data/.gitignore CHANGED
@@ -15,3 +15,4 @@ spec/reports
15
15
  test/tmp
16
16
  test/version_tmp
17
17
  tmp
18
+ vendor
data/README.md CHANGED
@@ -19,6 +19,7 @@ in:
19
19
  columns:
20
20
  - {name: price, type: long}
21
21
  - {name: category_id, type: string}
22
+ max: 2000
22
23
  out:
23
24
  type: stdout
24
25
  ```
@@ -30,7 +31,7 @@ in:
30
31
  type: bigquery
31
32
  project: 'project-name'
32
33
  keyfile: '/home/hogehoge/bigquery-keyfile.json'
33
- sql: 'SELECT price,category_id FROM [ecsite.products_<%= params["date"].strftime("%Y%m") %>] GROUP BY category_id'
34
+ sql_erb: 'SELECT price,category_id FROM [ecsite.products_<%= params["date"].strftime("%Y%m") %>] GROUP BY category_id'
34
35
  erb_params:
35
36
  date: "require 'date'; (Date.today - 1)"
36
37
  columns:
@@ -39,3 +40,50 @@ in:
39
40
  - {name: month, type: timestamp, format: '%Y-%m', eval: 'require "time"; Time.parse(params["date"]).to_i'}
40
41
  ```
41
42
 
43
+ ### Determine columns from query results if the columns definition is omitted
44
+
45
+ ```
46
+ in:
47
+ type: bigquery
48
+ project: 'project-name'
49
+ keyfile: '/home/hogehoge/bigquery-keyfile.json'
50
+ sql: 'SELECT price,category_id FROM [ecsite.products] GROUP BY category_id'
51
+ out:
52
+ type: stdout
53
+ ```
54
+
55
+ ### Embed keyfile content as string into config
56
+
57
+ ```
58
+ in:
59
+ type: bigquery
60
+ project: 'project-name'
61
+ keyfile:
62
+ content: |
63
+ {
64
+ "type": "service_account",
65
+ "project_id": "example-project",
66
+ "private_key_id": "1234567890ABCDEFG",
67
+ "private_key": "**************************************",
68
+ "client_email": "example-project@hogehoge.gserviceaccount.com",
69
+ "client_id": "12345678901234567890",
70
+ "auth_uri": "https://accounts.google.com/o/oauth2/auth",
71
+ "token_uri": "https://accounts.google.com/o/oauth2/token",
72
+ "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
73
+ "client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/hogehoge.gcp.iam.gserviceaccount.com"
74
+ }
75
+ ```
76
+
77
+
78
+ ## Optional Configuration
79
+ This plugin uses the gem [`google-cloud(Google Cloud Client Library for Ruby)`](https://github.com/GoogleCloudPlatform/google-cloud-ruby) and queries data using [the synchronous method](https://github.com/GoogleCloudPlatform/google-cloud-ruby/blob/master/google-cloud-bigquery/lib/google/cloud/bigquery/project.rb#L281).
80
+ Therefore, some optional configuration items follow the corresponding options of the Google Cloud Client Library.
81
+
82
+ - [max](https://github.com/GoogleCloudPlatform/google-cloud-ruby/blob/master/google-cloud-bigquery/lib/google/cloud/bigquery/project.rb#L315) :
83
+ - default value: **null**; a null value is interpreted as [no maximum row count](https://github.com/GoogleCloudPlatform/google-cloud-ruby/blob/master/google-cloud-bigquery/lib/google/cloud/bigquery/project.rb#L319) by the Google Cloud Client Library.
84
+ - [cache](https://github.com/GoogleCloudPlatform/google-cloud-ruby/blob/master/google-cloud-bigquery/lib/google/cloud/bigquery/project.rb#L331) :
85
+ - default value: **null**; a null value is interpreted as [true](https://github.com/GoogleCloudPlatform/google-cloud-ruby/blob/master/google-cloud-bigquery/lib/google/cloud/bigquery/project.rb#L333) by the Google Cloud Client Library.
86
+ - [standard_sql](https://github.com/GoogleCloudPlatform/google-cloud-ruby/blob/master/google-cloud-bigquery/lib/google/cloud/bigquery/project.rb#L343):
87
+ - default value: **null**; a null value is interpreted as [true](https://github.com/GoogleCloudPlatform/google-cloud-ruby/blob/master/google-cloud-bigquery/lib/google/cloud/bigquery/project.rb#L351) by the Google Cloud Client Library.
88
+ - [legacy_sql](https://github.com/GoogleCloudPlatform/google-cloud-ruby/blob/master/google-cloud-bigquery/lib/google/cloud/bigquery/project.rb#L353):
89
+ - default value: **null**; a null value is interpreted as [false](https://github.com/GoogleCloudPlatform/google-cloud-ruby/blob/master/google-cloud-bigquery/lib/google/cloud/bigquery/project.rb#L361) by the Google Cloud Client Library.
data/Rakefile CHANGED
@@ -1 +1 @@
1
- require "bundler/gem_tasks"
1
+ require 'bundler/gem_tasks'
@@ -1,24 +1,25 @@
1
1
  # coding: utf-8
2
+
2
3
  lib = File.expand_path('../lib', __FILE__)
3
4
  $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
5
  require 'embulk/input/bigquery/version'
5
6
 
6
7
  Gem::Specification.new do |spec|
7
- spec.name = "embulk-input-bigquery"
8
+ spec.name = 'embulk-input-bigquery'
8
9
  spec.version = Embulk::Input::Bigquery::VERSION
9
- spec.authors = ["Takeru Narita"]
10
- spec.email = ["naritano77@gmail.com"]
11
- spec.description = %q{embulk input plugin from bigquery.}
12
- spec.summary = %q{Embulk input plugin from bigquery.}
13
- spec.homepage = ""
14
- spec.license = "MIT"
10
+ spec.authors = ['Takeru Narita']
11
+ spec.email = ['naritano77@gmail.com']
12
+ spec.description = 'embulk input plugin from bigquery.'
13
+ spec.summary = 'Embulk input plugin from bigquery.'
14
+ spec.homepage = 'https://github.com/medjed/embulk-input-bigquery'
15
+ spec.license = 'MIT'
15
16
 
16
- spec.files = `git ls-files`.split($/)
17
+ spec.files = `git ls-files`.split($INPUT_RECORD_SEPARATOR)
17
18
  spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
18
19
  spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
19
- spec.require_paths = ["lib"]
20
+ spec.require_paths = ['lib']
20
21
 
21
- spec.add_development_dependency "bundler", "~> 1.3"
22
- spec.add_development_dependency "rake"
23
- spec.add_dependency "google-cloud-bigquery", '~> 0.23'
22
+ spec.add_development_dependency 'bundler', '~> 1.3'
23
+ spec.add_development_dependency 'rake'
24
+ spec.add_dependency 'google-cloud-bigquery', '~> 0.23'
24
25
  end
@@ -1,63 +1,172 @@
1
- require "embulk/input/bigquery/version"
2
- require "google/cloud/bigquery"
1
+ require 'embulk/input/bigquery/version'
2
+ require 'google/cloud/bigquery'
3
3
  require 'erb'
4
4
 
5
5
  module Embulk
6
6
  module Input
7
7
  class InputBigquery < InputPlugin
8
- Plugin.register_input('bigquery', self)
9
-
10
- def self.transaction(config, &control)
11
- sql = config[:sql]
12
- params = {}
13
- unless sql
14
- sql_erb = config[:sql_erb]
15
- erb = ERB.new(sql_erb)
16
- erb_params = config[:erb_params]
17
- erb_params.each do |k, v|
18
- params[k] = eval(v)
19
- end
20
-
21
- sql = erb.result(binding)
22
- end
23
-
24
- task = {
25
- project: config[:project],
26
- keyfile: config[:keyfile],
27
- sql: sql,
28
- columns: config[:columns],
29
- params: params
30
- }
31
-
32
- columns = []
33
- config[:columns].each_with_index do |c, i|
34
- columns << Column.new(i, c['name'], c['type'].to_sym)
35
- end
36
-
37
- yield(task, columns, 1)
38
-
39
- return {}
40
- end
41
-
42
- def run
43
- bq = Google::Cloud::Bigquery.new(project: @task[:project], keyfile: @task[:keyfile])
44
- params = @task[:params]
45
- rows = bq.query(@task[:sql])
46
- rows.each do |row|
47
- columns = []
48
- @task[:columns].each do |c|
49
- val = row[c['name']]
50
- if c['eval']
51
- val = eval(c['eval'], binding)
52
- end
53
- columns << val
54
- end
55
-
56
- @page_builder.add(columns)
57
- end
58
- @page_builder.finish
59
- return {}
60
- end
8
+ Plugin.register_input('bigquery', self)
9
+
10
+ # support config by file path or content which supported by org.embulk.spi.unit.LocalFile
11
+ # keyfile:
12
+ # content: |
13
+ class LocalFile
14
+ # return JSON string
15
+ def self.load(v)
16
+ if v.is_a?(String)
17
+ v
18
+ elsif v.is_a?(Hash)
19
+ JSON.parse(v['content'])
20
+ end
21
+ end
22
+ end
23
+
24
+ def self.transaction(config, &control)
25
+ sql = config[:sql]
26
+ params = {}
27
+ unless sql
28
+ sql_erb = config[:sql_erb]
29
+ erb = ERB.new(sql_erb)
30
+ erb_params = config[:erb_params]
31
+ erb_params.each do |k, v|
32
+ params[k] = eval(v)
33
+ end
34
+
35
+ sql = erb.result(binding)
36
+ end
37
+
38
+ task = {
39
+ project: config[:project],
40
+ keyfile: config.param(:keyfile, LocalFile, nil),
41
+ sql: sql,
42
+ params: params,
43
+ option: {
44
+ max: config[:max],
45
+ cache: config[:cache],
46
+ standard_sql: config[:standard_sql],
47
+ legacy_sql: config[:legacy_sql]
48
+ }
49
+ }
50
+
51
+ if config[:columns]
52
+ task[:columns] = config[:columns]
53
+ else
54
+ bq = Google::Cloud::Bigquery.new(project: task[:project], keyfile: task[:keyfile])
55
+ task[:job_id], task[:columns] = determine_columns_by_query_results(sql, task[:option], bq)
56
+ end
57
+
58
+ columns = []
59
+ task[:columns].each_with_index do |c, i|
60
+ columns << Column.new(i, c['name'], c['type'].to_sym)
61
+ end
62
+
63
+ resume(task, columns, 1, &control)
64
+ end
65
+
66
+ def self.resume(task, columns, count, &control)
67
+ task_reports = yield(task, columns, count)
68
+
69
+ next_config_diff = {}
70
+ end
71
+
72
+ def run
73
+ bq = Google::Cloud::Bigquery.new(project: task[:project], keyfile: task[:keyfile])
74
+ params = @task[:params]
75
+ option = keys_to_sym(@task[:option])
76
+
77
+ rows = if @task[:job_id].nil?
78
+ bq.query(@task[:sql], **option)
79
+ else
80
+ bq.job(@task[:job_id]).query_results(max: option[:max])
81
+ end
82
+
83
+ @task[:columns] = values_to_sym(@task[:columns], 'name')
84
+
85
+ rows.all do |row|
86
+ columns = []
87
+ @task[:columns].each do |c|
88
+ val = row[c['name'].to_sym]
89
+ val = eval(c['eval'], binding) if c['eval']
90
+
91
+ columns << as_serializable(val)
92
+ end
93
+
94
+ @page_builder.add(columns)
95
+ end
96
+ @page_builder.finish
97
+ {}
98
+ end
99
+
100
+ def self.determine_columns_by_query_results(sql, option, bigquery_client)
101
+ Embulk.logger.info 'determine columns using the getQueryResults API instead of the config.yml'
102
+
103
+ filtered_option = option.dup
104
+ filtered_option.delete(:max)
105
+ job = bigquery_client.query_job(sql, **filtered_option)
106
+
107
+ Embulk.logger.info 'waiting for the query job to complete to get schema from query results'
108
+ job.wait_until_done!
109
+
110
+ Embulk.logger.info "completed: job_id=#{job.job_id}"
111
+ result = job.query_results(max: 0)
112
+
113
+ columns = result.fields.map do |f|
114
+ {
115
+ 'name' => f.name,
116
+ 'type' => embulk_column_type(f.type)
117
+ }
118
+ end
119
+ Embulk.logger.info "determined columns: #{columns.inspect}"
120
+
121
+ [job.job_id, columns]
122
+ end
123
+
124
+ def self.embulk_column_type(bq_data_type)
125
+ case bq_data_type
126
+ when 'BOOLEAN', 'BOOL'
127
+ :boolean
128
+ when 'INTEGER', 'INT64'
129
+ :long
130
+ when 'FLOAT', 'FLOAT64'
131
+ :double
132
+ when 'STRING', 'DATETIME', 'DATE', 'TIME'
133
+ :string
134
+ when 'TIMESTAMP'
135
+ :timestamp
136
+ when 'RECORD', 'BYTES'
137
+ raise "unsupported type #{bq_data_type.inspect}"
138
+ else
139
+ raise "unknown type #{bq_data_type.inspect}"
140
+ end
141
+ end
142
+
143
+ def keys_to_sym(hash)
144
+ ret = {}
145
+ hash.each do |key, value|
146
+ ret[key.to_sym] = value
147
+ end
148
+ ret
149
+ end
150
+
151
+ def values_to_sym(hashs, key)
152
+ hashs.map do |h|
153
+ h[key] = h[key].to_sym
154
+ h
155
+ end
156
+ end
157
+
158
+ def as_serializable(v)
159
+ case v
160
+ when ::Google::Cloud::Bigquery::Time
161
+ v.value
162
+ when DateTime
163
+ v.strftime('%Y-%m-%d %H:%M:%S.%6N')
164
+ when Date
165
+ v.strftime('%Y-%m-%d')
166
+ else
167
+ v
168
+ end
169
+ end
61
170
  end
62
171
  end
63
172
  end
@@ -1,7 +1,7 @@
1
1
  module Embulk
2
2
  module Input
3
3
  module Bigquery
4
- VERSION = "0.0.2"
4
+ VERSION = '0.0.3'.freeze
5
5
  end
6
6
  end
7
7
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: embulk-input-bigquery
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.0.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Takeru Narita
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-02-01 00:00:00.000000000 Z
11
+ date: 2017-12-11 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -67,7 +67,7 @@ files:
67
67
  - embulk-input-bigquery.gemspec
68
68
  - lib/embulk/input/bigquery.rb
69
69
  - lib/embulk/input/bigquery/version.rb
70
- homepage: ''
70
+ homepage: https://github.com/medjed/embulk-input-bigquery
71
71
  licenses:
72
72
  - MIT
73
73
  metadata: {}
@@ -87,7 +87,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
87
87
  version: '0'
88
88
  requirements: []
89
89
  rubyforge_project:
90
- rubygems_version: 2.2.0
90
+ rubygems_version: 2.6.11
91
91
  signing_key:
92
92
  specification_version: 4
93
93
  summary: Embulk input plugin from bigquery.