RubyGems - embulk-input-bigquery - Versions diffs - 0.0.2 → 0.0.3 - Mend

embulk-input-bigquery 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

checksums.yaml +4 -4
data/.gitignore +1 -0
data/README.md +49 -1
data/Rakefile +1 -1
data/embulk-input-bigquery.gemspec +13 -12
data/lib/embulk/input/bigquery.rb +164 -55
data/lib/embulk/input/bigquery/version.rb +1 -1
metadata +4 -4

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: b7f8b54f5c0bf602236407494d3c0bf35257e04d
-  data.tar.gz: bd1116a46dc6016dc58cc536d69e1b71c0f80242
+  metadata.gz: adc126def78ac278dafebe7ad7bf5830ad7f4f29
+  data.tar.gz: caa6d2d3500b9051889f8039d9bb42b5f6cbf13a
 SHA512:
-  metadata.gz: dfcfe921546bc8d89e2df091056247f3a1b1a1eb17590eea4f512d7eb3f8856fe5f778c071b281f75e8349a4af93e2a8a3131aeadbf5b8893acb2bbea93fb547
-  data.tar.gz: 9f00dd42dc3219ab1f75a445609cc73419f36c87c4ef5ab98e00954bc990c8560f42f3fc75208037d542ef3767c54b6356932f9bdb68ff45286c0007d4b05e85
+  metadata.gz: 8d459d42c1d9c5c995f35010298657fc79df8fbc03eedfa26692f34090163fb28a60a6c1c73029c4e50ecd3a2595f5c85241b569aec07bf69d711f5b15c6059f
+  data.tar.gz: a670a9fde47bd8ca7cfb0412b35cd2c581549ba74788b8bbd96e32360d38f0a6d00e9b1434331ff2adbf593afe8c24450099856d18f05a9f3565b4be270a0c56

data/.gitignore CHANGED Viewed

@@ -15,3 +15,4 @@ spec/reports
 test/tmp
 test/version_tmp
 tmp
+vendor

data/README.md CHANGED Viewed

@@ -19,6 +19,7 @@ in:
   columns:
     - {name: price, type: long}
     - {name: category_id, type: string}
+  max: 2000
 out:
   type: stdout
 ```
@@ -30,7 +31,7 @@ in:
   type: bigquery
   project: 'project-name'
   keyfile: '/home/hogehoge/bigquery-keyfile.json'
-  sql: 'SELECT price,category_id FROM [ecsite.products_<%= params["date"].strftime("%Y%m")  %>] GROUP BY category_id'
+  sql_erb: 'SELECT price,category_id FROM [ecsite.products_<%= params["date"].strftime("%Y%m")  %>] GROUP BY category_id'
   erb_params:
     date: "require 'date'; (Date.today - 1)"
   columns:
@@ -39,3 +40,50 @@ in:
     - {name: month, type: timestamp, format: '%Y-%m', eval: 'require "time"; Time.parse(params["date"]).to_i'}
 ```
+### Determine columns from query results if columns definition is empty
+```
+in:
+  type: bigquery
+  project: 'project-name'
+  keyfile: '/home/hogehoge/bigquery-keyfile.json'
+  sql: 'SELECT price,category_id FROM [ecsite.products] GROUP BY category_id'
+out:
+  type: stdout
+```
+### Embed keyfile content as string into config
+```
+in:
+  type: bigquery
+  project: 'project-name'
+  keyfile:
+    content: |
+      {
+        "type": "service_account",
+        "project_id": "example-project",
+        "private_key_id": "1234567890ABCDEFG",
+        "private_key": "**************************************",
+        "client_email": "example-project@hogehoge.gserviceaccount.com",
+        "client_id": "12345678901234567890",
+        "auth_uri": "https://accounts.google.com/o/oauth2/auth",
+        "token_uri": "https://accounts.google.com/o/oauth2/token",
+        "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
+        "client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/hogehoge.gcp.iam.gserviceaccount.com"
+      }
+```
+## Optional Configuration
+This plugin uses the gem [`google-cloud(Google Cloud Client Library for Ruby)`](https://github.com/GoogleCloudPlatform/google-cloud-ruby) and queries data using [the synchronous method](https://github.com/GoogleCloudPlatform/google-cloud-ruby/blob/master/google-cloud-bigquery/lib/google/cloud/bigquery/project.rb#L281).
+Therefore, some optional configuration items follow the behavior of the Google Cloud Client Library.
+- [max](https://github.com/GoogleCloudPlatform/google-cloud-ruby/blob/master/google-cloud-bigquery/lib/google/cloud/bigquery/project.rb#L315) :
+  - default value: **null**; a null value is interpreted as [no maximum row count](https://github.com/GoogleCloudPlatform/google-cloud-ruby/blob/master/google-cloud-bigquery/lib/google/cloud/bigquery/project.rb#L319) in the Google Cloud Client Library.
+- [cache](https://github.com/GoogleCloudPlatform/google-cloud-ruby/blob/master/google-cloud-bigquery/lib/google/cloud/bigquery/project.rb#L331) :
+  - default value: **null**; a null value is interpreted as [true](https://github.com/GoogleCloudPlatform/google-cloud-ruby/blob/master/google-cloud-bigquery/lib/google/cloud/bigquery/project.rb#L333) in the Google Cloud Client Library.
+- [standard_sql](https://github.com/GoogleCloudPlatform/google-cloud-ruby/blob/master/google-cloud-bigquery/lib/google/cloud/bigquery/project.rb#L343):
+  - default value: **null**; a null value is interpreted as [true](https://github.com/GoogleCloudPlatform/google-cloud-ruby/blob/master/google-cloud-bigquery/lib/google/cloud/bigquery/project.rb#L351) in the Google Cloud Client Library.
+- [legacy_sql](https://github.com/GoogleCloudPlatform/google-cloud-ruby/blob/master/google-cloud-bigquery/lib/google/cloud/bigquery/project.rb#L353):
+  - default value: **null**; a null value is interpreted as [false](https://github.com/GoogleCloudPlatform/google-cloud-ruby/blob/master/google-cloud-bigquery/lib/google/cloud/bigquery/project.rb#L361) in the Google Cloud Client Library.

data/Rakefile CHANGED Viewed

	@@ -1 +1 @@
1	- require "bundler/gem_tasks"
1	+ require 'bundler/gem_tasks'

data/embulk-input-bigquery.gemspec CHANGED Viewed

@@ -1,24 +1,25 @@
 # coding: utf-8
 lib = File.expand_path('../lib', __FILE__)
 $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
 require 'embulk/input/bigquery/version'
 Gem::Specification.new do |spec| # gem metadata, file list and dependencies for embulk-input-bigquery
-  spec.name          = "embulk-input-bigquery"
+  spec.name          = 'embulk-input-bigquery'
   spec.version       = Embulk::Input::Bigquery::VERSION
-  spec.authors       = ["Takeru Narita"]
-  spec.email         = ["naritano77@gmail.com"]
-  spec.description   = %q{embulk input plugin from bigquery.}
-  spec.summary       = %q{Embulk input plugin from bigquery.}
-  spec.homepage      = ""
-  spec.license       = "MIT"
+  spec.authors       = ['Takeru Narita']
+  spec.email         = ['naritano77@gmail.com']
+  spec.description   = 'embulk input plugin from bigquery.'
+  spec.summary       = 'Embulk input plugin from bigquery.'
+  spec.homepage      = 'https://github.com/medjed/embulk-input-bigquery'
+  spec.license       = 'MIT'
-  spec.files         = `git ls-files`.split($/)
+  spec.files         = `git ls-files`.split($INPUT_RECORD_SEPARATOR) # NOTE(review): this alias of $/ is only defined after require 'English', which is not visible in this hunk; confirm it is loaded
   spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
   spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
-  spec.require_paths = ["lib"]
+  spec.require_paths = ['lib']
-  spec.add_development_dependency "bundler", "~> 1.3"
-  spec.add_development_dependency "rake"
-  spec.add_dependency "google-cloud-bigquery", '~> 0.23'
+  spec.add_development_dependency 'bundler', '~> 1.3'
+  spec.add_development_dependency 'rake'
+  spec.add_dependency 'google-cloud-bigquery', '~> 0.23'
 end

data/lib/embulk/input/bigquery.rb CHANGED Viewed

@@ -1,63 +1,172 @@
-require "embulk/input/bigquery/version"
-require "google/cloud/bigquery"
+require 'embulk/input/bigquery/version'
+require 'google/cloud/bigquery'
 require 'erb'
 module Embulk
   module Input
     class InputBigquery < InputPlugin
-			Plugin.register_input('bigquery', self)
-			def self.transaction(config, &control)
-				sql = config[:sql]
-				params = {}
-				unless sql
-					sql_erb = config[:sql_erb]
-					erb = ERB.new(sql_erb)
-					erb_params = config[:erb_params]
-					erb_params.each do |k, v|
-						params[k] = eval(v)
-					end
-					sql = erb.result(binding)
-				end
-				task = {
-					project: config[:project],
-					keyfile: config[:keyfile],
-					sql: sql,
-					columns: config[:columns],
-					params: params
-				}
-				columns = []
-				config[:columns].each_with_index do |c, i|
-					columns << Column.new(i, c['name'], c['type'].to_sym)
-				end
-				yield(task, columns, 1)
-				return {}
-			end
-			def run
-				bq = Google::Cloud::Bigquery.new(project: @task[:project], keyfile: @task[:keyfile])
-				params = @task[:params]
-				rows = bq.query(@task[:sql])
-				rows.each do |row|
-					columns = []
-					@task[:columns].each do |c|
-						val = row[c['name']]
-						if c['eval']
-							val = eval(c['eval'], binding)
-						end
-						columns << val
-					end
-					@page_builder.add(columns)
-				end
-				@page_builder.finish
-				return {}
-			end
+      Plugin.register_input('bigquery', self)
+      # Accepts the keyfile config either as a file path (String) or as inline content (Hash), as supported by org.embulk.spi.unit.LocalFile:
+      # keyfile:
+      #   content: |
+      class LocalFile
+        # Returns a String unchanged, the parsed JSON of v['content'] for a Hash, and nil for anything else.
+        def self.load(v)
+          if v.is_a?(String) # presumably a path to the keyfile; confirm against callers
+            v
+          elsif v.is_a?(Hash)
+            JSON.parse(v['content']) # NOTE(review): no require 'json' is visible in this hunk; confirm JSON is loaded elsewhere
+          end
+        end
+      end
+      def self.transaction(config, &control) # Builds the task hash and the column list from config, then hands both to resume.
+        sql = config[:sql]
+        params = {}
+        unless sql # no literal sql configured: render it from the sql_erb template instead
+          sql_erb = config[:sql_erb]
+          erb = ERB.new(sql_erb)
+          erb_params = config[:erb_params]
+          erb_params.each do |k, v|
+            params[k] = eval(v) # NOTE(review): evaluates Ruby source taken from the config; the config must be trusted
+          end
+          sql = erb.result(binding) # binding makes the local params hash visible inside the template
+        end
+        task = {
+          project: config[:project],
+          keyfile: config.param(:keyfile, LocalFile, nil), # presumably converted through LocalFile.load, nil when absent; confirm
+          sql: sql,
+          params: params,
+          option: {
+            max: config[:max],
+            cache: config[:cache],
+            standard_sql: config[:standard_sql],
+            legacy_sql: config[:legacy_sql]
+          }
+        }
+        if config[:columns]
+          task[:columns] = config[:columns]
+        else # no columns configured: run the query as a job and derive the columns from its result fields
+          bq = Google::Cloud::Bigquery.new(project: task[:project], keyfile: task[:keyfile])
+          task[:job_id], task[:columns] = determine_columns_by_query_results(sql, task[:option], bq)
+        end
+        columns = []
+        task[:columns].each_with_index do |c, i|
+          columns << Column.new(i, c['name'], c['type'].to_sym)
+        end
+        resume(task, columns, 1, &control) # the count argument is fixed at 1
+      end
+      def self.resume(task, columns, count, &control) # Yields task, columns and count to the given block and returns an empty hash.
+        task_reports = yield(task, columns, count) # the block's result is assigned but not used afterwards
+        next_config_diff = {} # value of this assignment is the method's return value
+      end
+      def run # Fetches the query rows, adds one record per row to @page_builder, and returns an empty hash.
+        bq = Google::Cloud::Bigquery.new(project: task[:project], keyfile: task[:keyfile]) # NOTE(review): reads task here but @task below; presumably an accessor inherited from InputPlugin, confirm
+        params = @task[:params] # not referenced directly below; stays reachable from the eval'd column expressions through binding
+        option = keys_to_sym(@task[:option])
+        rows = if @task[:job_id].nil? # job_id is only set when transaction derived the columns from a query job
+                 bq.query(@task[:sql], **option)
+               else # reuse the results of that existing job instead of issuing the query again
+                 bq.job(@task[:job_id]).query_results(max: option[:max])
+               end
+        @task[:columns] = values_to_sym(@task[:columns], 'name')
+        rows.all do |row|
+          columns = []
+          @task[:columns].each do |c|
+            val = row[c['name'].to_sym]
+            val = eval(c['eval'], binding) if c['eval'] # optional per-column Ruby expression; it can read row, val and params
+            columns << as_serializable(val)
+          end
+          @page_builder.add(columns)
+        end
+        @page_builder.finish
+        {}
+      end
+      def self.determine_columns_by_query_results(sql, option, bigquery_client) # Runs sql as a query job and returns [job_id, columns] built from the result fields.
+        Embulk.logger.info 'determine columns using the getQueryResults API instead of the config.yml'
+        filtered_option = option.dup # copy so the caller's option hash is left untouched
+        filtered_option.delete(:max) # max is not forwarded to query_job; presumably unsupported there, confirm
+        job = bigquery_client.query_job(sql, **filtered_option)
+        Embulk.logger.info 'waiting for the query job to complete to get schema from query results'
+        job.wait_until_done!
+        Embulk.logger.info "completed: job_id=#{job.job_id}"
+        result = job.query_results(max: 0) # max: 0 presumably fetches the schema without any rows; confirm
+        columns = result.fields.map do |f|
+          {
+            'name' => f.name,
+            'type' => embulk_column_type(f.type)
+          }
+        end
+        Embulk.logger.info "determined columns: #{columns.inspect}"
+        [job.job_id, columns]
+      end
+      def self.embulk_column_type(bq_data_type) # Maps a BigQuery type name to a column type symbol; raises on unsupported or unknown names.
+        case bq_data_type
+        when 'BOOLEAN', 'BOOL'
+          :boolean
+        when 'INTEGER', 'INT64'
+          :long
+        when 'FLOAT', 'FLOAT64'
+          :double
+        when 'STRING', 'DATETIME', 'DATE', 'TIME' # these date/time types are mapped to strings, not timestamps
+          :string
+        when 'TIMESTAMP'
+          :timestamp
+        when 'RECORD', 'BYTES' # recognized type names that are explicitly rejected
+          raise "unsupported type #{bq_data_type.inspect}"
+        else
+          raise "unknown type #{bq_data_type.inspect}"
+        end
+      end
+      def keys_to_sym(hash) # Returns a new hash with every key converted via to_sym; values are kept as they are.
+        ret = {}
+        hash.each do |key, value|
+          ret[key.to_sym] = value
+        end
+        ret
+      end
+      def values_to_sym(hashs, key) # Converts h[key] to a symbol in each hash and returns an array of those same hashes.
+        hashs.map do |h|
+          h[key] = h[key].to_sym # mutates the passed-in hash in place
+          h
+        end
+      end
+      def as_serializable(v) # Converts BigQuery Time, DateTime and Date values; returns anything else unchanged.
+        case v
+        when ::Google::Cloud::Bigquery::Time
+          v.value # presumably the time-of-day string held by the wrapper; confirm
+        when DateTime # must be matched before Date because DateTime is a subclass of Date
+          v.strftime('%Y-%m-%d %H:%M:%S.%6N')
+        when Date
+          v.strftime('%Y-%m-%d')
+        else
+          v
+        end
+      end
     end
   end
 end

data/lib/embulk/input/bigquery/version.rb CHANGED Viewed

@@ -1,7 +1,7 @@
 module Embulk
   module Input
     module Bigquery
-      VERSION = "0.0.2"
+      VERSION = '0.0.3'.freeze # gem version, referenced by the gemspec; frozen so the string cannot be mutated
     end
   end
 end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: embulk-input-bigquery
 version: !ruby/object:Gem::Version
-  version: 0.0.2
+  version: 0.0.3
 platform: ruby
 authors:
 - Takeru Narita
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2017-02-01 00:00:00.000000000 Z
+date: 2017-12-11 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -67,7 +67,7 @@ files:
 - embulk-input-bigquery.gemspec
 - lib/embulk/input/bigquery.rb
 - lib/embulk/input/bigquery/version.rb
-homepage: ''
+homepage: https://github.com/medjed/embulk-input-bigquery
 licenses:
 - MIT
 metadata: {}
@@ -87,7 +87,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
       version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 2.2.0
+rubygems_version: 2.6.11
 signing_key:
 specification_version: 4
 summary: Embulk input plugin from bigquery.