embulk-input-big-query-async 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: e1e61ab475f7f94c2420064083bb8faf71e03ca29c31342cd4f288fd0af323f6
4
+ data.tar.gz: a88983894e55334d9a09a45c239a493ed362ab71d0ecf6d46c6df5887b915eef
5
+ SHA512:
6
+ metadata.gz: 7fd62497ffc4bbbd314368abe870a62ea3cf2746133361d572704ef906050f30907380436f640d7941ac71a4c15af41ebc40d94adfdd2140fcdd0cb2bc9aaa58
7
+ data.tar.gz: 0e2ff2c50ceba30a3c0c352898452257a6fb3b2a11d3df8695fad6a5703adbd5fab04979ef703df0cd1206200ef13ab3d0bd5f79ee36ebb268d2ebb3549ffcce
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
# Resolve every gem from the public RubyGems index.
source "https://rubygems.org"

# All dependencies (runtime and development) are declared in
# embulk-input-big-query-async.gemspec; Bundler loads them from there.
gemspec
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2017 TODO: Write your name
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,83 @@
1
+ # Embulk::Input::Bigquery
2
+
3
+ This is an Embulk input plugin that reads data from BigQuery.
4
+
5
+ ## Installation
6
+
7
+ Install it yourself as:
8
+
9
+ $ embulk gem install embulk-input-big-query-async
10
+
11
+ ## Usage
12
+
13
+ ```
14
+ in:
15
+ type: big-query-async
16
+ project: 'project-name'
17
+ keyfile: '/home/hogehoge/bigquery-keyfile.json'
18
+ sql: 'SELECT price,category_id FROM [ecsite.products] GROUP BY category_id'
19
+ columns:
20
+ - {name: price, type: long}
21
+ - {name: category_id, type: string}
22
+ max: 2000
23
+ synchronous_method: true
24
+ out:
25
+ type: stdout
26
+ ```
27
+
28
+ If the table name changes (for example, it embeds a date), build the query with `sql_erb` and `erb_params`:
29
+
30
+ ```
31
+ in:
32
+ type: big-query-async
33
+ project: 'project-name'
34
+ keyfile: '/home/hogehoge/bigquery-keyfile.json'
35
+ sql_erb: 'SELECT price,category_id FROM [ecsite.products_<%= params["date"].strftime("%Y%m") %>] GROUP BY category_id'
36
+ erb_params:
37
+ date: "require 'date'; (Date.today - 1)"
38
+ columns:
39
+ - {name: price, type: long}
40
+ - {name: category_id, type: long}
41
+ - {name: month, type: timestamp, format: '%Y-%m', eval: 'require "time"; Time.parse(params["date"]).to_i'}
42
+ ```
43
+
44
+ ## Optional Configuration
45
+ This plugin uses the gem [`google-cloud(Google Cloud Client Library for Ruby)`](https://github.com/GoogleCloudPlatform/google-cloud-ruby) and queries data using the synchronous method or the asynchronous method.
46
+ Therefore some optional configuration items comply with the Google Cloud Client Library.
47
+
48
+ ### optional bigquery parameter
49
+
50
+ Details of the following parameters are [here](https://github.com/GoogleCloudPlatform/google-cloud-ruby/blob/master/google-cloud-bigquery/lib/google/cloud/bigquery/project.rb).
51
+
52
+ - max :
53
+ - default value : **null** and null value is interpreted as no maximum row count in the Google Cloud Client Library. This param is supported only by the synchronous method.
54
+ - cache :
55
+ - default value : **null** and null value is interpreted as true in the Google Cloud Client Library.
56
+ - timeout :
57
+ - default value : **null** and null value is interpreted as 10000 milliseconds in the Google Cloud Client Library. This param is supported only by the synchronous method.
58
+ - dryrun :
59
+ - default value : **null** and null value is interpreted as false in the Google Cloud Client Library. This param is supported only by the synchronous method.
60
+ - standard_sql :
61
+ - default value : **null** and null value is interpreted as true in the Google Cloud Client Library.
62
+ - legacy_sql :
63
+ - default value : **null** and null value is interpreted as false in the Google Cloud Client Library.
64
+ - large_results :
65
+ - default value : **null** and null value is interpreted as false in the Google Cloud Client Library. This param is supported only by the asynchronous method.
66
+ - write :
67
+ - default value : **null** and null value is interpreted as empty in the Google Cloud Client Library. This param is supported only by the asynchronous method.
68
+
69
+ ### the bigquery method
70
+ Big query library in Google Cloud Client Library has [two methods](https://github.com/GoogleCloudPlatform/google-cloud-ruby/blob/master/google-cloud-bigquery/lib/google/cloud/bigquery/project.rb) for query.
71
+
72
+ The default method in this plugin is synchronous_method.
73
+ The logic that selects the query method is [here](https://github.com/ykoyano/embulk-input-bigquery/blob/master/lib/embulk/input/bigquery.rb#L41).
74
+
75
+ - synchronous_method:
76
+ - type : boolean
77
+ - default value : **null**
78
+ - This method uses `query` method in the Google Cloud Client Library.
79
+ - It should be noted that the number of records for `query` method is **limited**. Therefore, if you get many records, you should use `query_job` method with asynchronous_method option.
80
+ - asynchronous_method:
81
+ - type : boolean
82
+ - default value : **null**
83
+ - This method uses `query_job` method in the Google Cloud Client Library.
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
@@ -0,0 +1,24 @@
1
# coding: utf-8
# Gem specification for the embulk-input-big-query-async plugin.
# Puts ./lib on the load path so the version constant can be read without
# the gem being installed.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'embulk/input/big-query-async/version'

Gem::Specification.new do |spec|
  spec.name          = "embulk-input-big-query-async"
  spec.version       = Embulk::Input::Bigqueryasync::VERSION
  spec.authors       = ["Angelos Alexopoulos"]
  spec.email         = ["alexopoulos7@gmail.com"]
  spec.description   = %q{embulk input plugin from bigquery.}
  spec.summary       = %q{Embulk input plugin from bigquery.}
  spec.homepage      = ""
  spec.license       = "MIT"

  # Package every git-tracked file except previously built gems under pkg/:
  # shipping an old .gem inside the new gem only bloats the package.
  spec.files         = `git ls-files`.split("\n").reject { |f| f.start_with?('pkg/') }
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  spec.add_development_dependency "bundler", "~> 1.3"
  spec.add_development_dependency "rake"
  spec.add_dependency "google-cloud-bigquery", '~> 0.26.0'
end
@@ -0,0 +1,146 @@
1
require "embulk/input/big-query-async/version"
require "google/cloud/bigquery"
require 'erb'

module Embulk
  module Input
    # Embulk input plugin that runs a single BigQuery query and emits every
    # result row as an Embulk record.
    #
    # The query is executed either with the client library's `query` method
    # (synchronous, the default) or with `query_job` (asynchronous), depending
    # on the `synchronous_method` / `asynchronous_method` config options.
    class InputBigquery < InputPlugin
      Plugin.register_input('big-query-async', self)

      # Decides which query method is used. Shared by .transaction and #run so
      # both always agree on which option set applies.
      #
      # @param sync  [Boolean, nil] value of the `synchronous_method` option
      # @param async [Boolean, nil] value of the `asynchronous_method` option
      # @return [Boolean] true when the synchronous `query` method is used
      def self.synchronous?(sync, async)
        sync || !async ? true : false
      end

      # Builds the task definition from the plugin config and hands it to
      # Embulk together with the output column schema.
      #
      # @param config [Hash-like] plugin configuration (read with symbol keys)
      # @yield [task, columns, task_count] passed on to Embulk; one task is run
      # @return [Hash] always an empty hash
      # @raise [ArgumentError] when neither `sql` nor `sql_erb` is configured
      def self.transaction(config, &control)
        sql = config[:sql]
        params = {}
        unless sql
          sql_erb = config[:sql_erb]
          raise ArgumentError, "either 'sql' or 'sql_erb' must be configured" unless sql_erb

          # NOTE(review): every erb_params value is Ruby source executed with
          # `eval`, so the config file must be trusted. `erb_params` is optional.
          (config[:erb_params] || {}).each do |k, v|
            params[k] = eval(v)
          end

          # The template is rendered with this method's binding so it can
          # reference the `params` local.
          sql = ERB.new(sql_erb).result(binding)
        end

        task = {
          project: config[:project],
          keyfile: config[:keyfile],
          sql: sql,
          columns: config[:columns],
          params: params,
          synchronous_method: config[:synchronous_method],
          asynchronous_method: config[:asynchronous_method],
          dataset: config[:dataset],
          table: config[:table],
          option: {
            cache: config[:cache],
            standard_sql: config[:standard_sql],
            legacy_sql: config[:legacy_sql],
          }
        }

        # Only pass the options that the selected query method accepts.
        if synchronous?(task[:synchronous_method], task[:asynchronous_method])
          task[:option].merge!(
            max: config[:max],
            timeout: config[:timeout],
            dryrun: config[:dryrun]
          )
        else
          task[:option].merge!(
            large_results: config[:large_results],
            write: config[:write]
          )
        end

        columns = config[:columns].each_with_index.map do |c, i|
          Column.new(i, c['name'], c['type'].to_sym)
        end

        yield(task, columns, 1)

        return {}
      end

      # Executes the query described by @task and feeds all rows to the page
      # builder.
      #
      # @return [Hash] always an empty hash
      # @raise [RuntimeError] when the configured dataset cannot be found or
      #   the asynchronous job fails
      def run
        bq = Google::Cloud::Bigquery.new(project: @task[:project], keyfile: @task[:keyfile])
        @task[:columns] = values_to_sym(@task[:columns], 'name')
        option = keys_to_sym(@task[:option])

        if self.class.synchronous?(@task[:synchronous_method], @task[:asynchronous_method])
          run_synchronous_query(bq, option)
        else
          if @task[:dataset]
            dataset = bq.dataset(@task[:dataset])
            raise "BigQuery dataset '#{@task[:dataset]}' was not found" if dataset.nil?

            # Reuse the table when it already exists, otherwise create it.
            option[:table] = dataset.table(@task[:table]) || dataset.create_table(@task[:table])
          end
          run_asynchronous_query(bq, option)
        end

        @page_builder.finish
        return {}
      end

      # Runs the query with the client's `query` method and adds each row.
      #
      # @param bq     BigQuery client created in #run
      # @param option [Hash{Symbol=>Object}] keyword options for `query`
      def run_synchronous_query(bq, option)
        bq.query(@task[:sql], **option).each do |row|
          @page_builder.add(extract_record(row))
        end
      end

      # Runs the query with the client's `query_job` method, waits for the job
      # and adds every row of every result page.
      #
      # @param bq     BigQuery client created in #run
      # @param option [Hash{Symbol=>Object}] keyword options for `query_job`
      # @raise [RuntimeError] when the job finished with an error; silently
      #   returning here would report a successful run with zero rows
      def run_asynchronous_query(bq, option)
        job = bq.query_job(@task[:sql], **option)
        job.wait_until_done!
        raise "BigQuery query job failed: #{job.error.inspect}" if job.failed?

        results = job.query_results
        while results
          results.each do |row|
            @page_builder.add(extract_record(row))
          end
          results = results.next
        end
      end

      # Converts one result row into the list of values for an Embulk record,
      # in the order of the configured columns.
      #
      # @param row result row, indexed by the (symbolized) column name
      # @return [Array] one value per configured column
      def extract_record(row)
        # Kept as a local on purpose: per-column `eval` snippets reference
        # `params` (as well as `row` and `val`) through the binding below.
        params = @task[:params]

        @task[:columns].map do |c|
          val = row[c['name']]
          # NOTE(review): `eval` executes Ruby source taken from the config
          # file, so the config must be trusted.
          val = eval(c['eval'], binding) if c['eval']
          val
        end
      end

      # Converts the value stored under +key+ to a symbol in every hash.
      # The hashes are modified in place.
      #
      # @param hashs [Array<Hash>] hashes to update
      # @param key   [Object] key whose value is symbolized
      # @return [Array<Hash>] the same hashes
      def values_to_sym(hashs, key)
        hashs.map do |h|
          h[key] = h[key].to_sym
          h
        end
      end

      # Returns a copy of +hash+ whose keys are symbols, so it can be splatted
      # as keyword arguments.
      #
      # @param hash [Hash] hash with string or symbol keys
      # @return [Hash{Symbol=>Object}]
      def keys_to_sym(hash)
        hash.each_with_object({}) do |(key, value), ret|
          ret[key.to_sym] = value
        end
      end
    end
  end
end
@@ -0,0 +1,7 @@
1
module Embulk
  module Input
    # Namespace that carries the gem version read by the gemspec.
    module Bigqueryasync
      # Released version of embulk-input-big-query-async. Frozen so the
      # shared constant cannot be mutated by accident.
      VERSION = "0.0.2".freeze
    end
  end
end
metadata ADDED
@@ -0,0 +1,93 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: embulk-input-big-query-async
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.2
5
+ platform: ruby
6
+ authors:
7
+ - Angelos Alexopoulos
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2020-04-02 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.3'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.3'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: google-cloud-bigquery
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: 0.26.0
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: 0.26.0
55
+ description: embulk input plugin from bigquery.
56
+ email:
57
+ - alexopoulos7@gmail.com
58
+ executables: []
59
+ extensions: []
60
+ extra_rdoc_files: []
61
+ files:
62
+ - Gemfile
63
+ - LICENSE.txt
64
+ - README.md
65
+ - Rakefile
66
+ - embulk-input-big-query-async.gemspec
67
+ - lib/embulk/input/big-query-async.rb
68
+ - lib/embulk/input/big-query-async/version.rb
69
+ - pkg/embulk-input-big-query-async-0.0.1.gem
70
+ homepage: ''
71
+ licenses:
72
+ - MIT
73
+ metadata: {}
74
+ post_install_message:
75
+ rdoc_options: []
76
+ require_paths:
77
+ - lib
78
+ required_ruby_version: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ required_rubygems_version: !ruby/object:Gem::Requirement
84
+ requirements:
85
+ - - ">="
86
+ - !ruby/object:Gem::Version
87
+ version: '0'
88
+ requirements: []
89
+ rubygems_version: 3.0.3
90
+ signing_key:
91
+ specification_version: 4
92
+ summary: Embulk input plugin from bigquery.
93
+ test_files: []