embulk-output-bigquery 0.2.3 → 0.3.0.pre1

Files changed (63)
  1. checksums.yaml +4 -4
  2. data/.gitignore +6 -12
  3. data/CHANGELOG.md +18 -0
  4. data/Gemfile +8 -0
  5. data/LICENSE.txt +20 -0
  6. data/README.md +165 -39
  7. data/Rakefile +11 -0
  8. data/embulk-output-bigquery.gemspec +20 -0
  9. data/example/config_client_options.yml +33 -0
  10. data/example/config_csv.yml +30 -0
  11. data/example/config_delete_in_advance.yml +29 -0
  12. data/example/config_expose_errors.yml +30 -0
  13. data/example/config_guess_from_embulk_schema.yml +29 -0
  14. data/example/config_guess_with_column_options.yml +40 -0
  15. data/example/config_gzip.yml +30 -0
  16. data/example/config_jsonl.yml +30 -0
  17. data/example/config_mode_append.yml +30 -0
  18. data/example/config_mode_append_direct.yml +30 -0
  19. data/example/config_payload_column.yml +20 -0
  20. data/example/config_payload_column_index.yml +20 -0
  21. data/example/config_prevent_duplicate_insert.yml +30 -0
  22. data/example/config_replace.yml +30 -0
  23. data/example/config_replace_backup.yml +32 -0
  24. data/example/config_skip_file_generation.yml +32 -0
  25. data/example/config_table_strftime.yml +30 -0
  26. data/example/config_template_table.yml +21 -0
  27. data/example/config_uncompressed.yml +30 -0
  28. data/example/config_with_rehearsal.yml +32 -0
  29. data/example/example.csv +17 -0
  30. data/example/example.jsonl +16 -0
  31. data/example/example.yml +30 -0
  32. data/example/json_key.json +12 -0
  33. data/example/nested_example.jsonl +16 -0
  34. data/example/schema.json +30 -0
  35. data/example/schema_expose_errors.json +30 -0
  36. data/lib/embulk/output/bigquery.rb +388 -3
  37. data/lib/embulk/output/bigquery/bigquery_client.rb +396 -0
  38. data/lib/embulk/output/bigquery/file_writer.rb +103 -0
  39. data/lib/embulk/output/bigquery/helper.rb +78 -0
  40. data/lib/embulk/output/bigquery/value_converter_factory.rb +292 -0
  41. data/test/helper.rb +13 -0
  42. data/test/test_bigquery_client.rb +166 -0
  43. data/test/test_configure.rb +254 -0
  44. data/test/test_example.rb +34 -0
  45. data/test/test_file_writer.rb +129 -0
  46. data/test/test_helper.rb +103 -0
  47. data/test/test_transaction.rb +129 -0
  48. data/test/test_value_converter_factory.rb +316 -0
  49. metadata +114 -45
  50. data/build.gradle +0 -80
  51. data/config/checkstyle/checkstyle.xml +0 -128
  52. data/config/checkstyle/default.xml +0 -108
  53. data/gradle/wrapper/gradle-wrapper.jar +0 -0
  54. data/gradle/wrapper/gradle-wrapper.properties +0 -6
  55. data/gradlew +0 -164
  56. data/gradlew.bat +0 -90
  57. data/settings.gradle +0 -2
  58. data/src/main/java/org/embulk/output/BigqueryAuthentication.java +0 -117
  59. data/src/main/java/org/embulk/output/BigqueryOutputPlugin.java +0 -508
  60. data/src/main/java/org/embulk/output/BigqueryWriter.java +0 -575
  61. data/src/test/java/org/embulk/output/TestBigqueryAuthentication.java +0 -5
  62. data/src/test/java/org/embulk/output/TestBigqueryOutputPlugin.java +0 -5
  63. data/src/test/java/org/embulk/output/TestBigqueryWriter.java +0 -5
data/lib/embulk/output/bigquery/helper.rb ADDED
@@ -0,0 +1,78 @@
+ require 'digest/md5'
+
+ module Embulk
+   module Output
+     class Bigquery < OutputPlugin
+       class Helper
+         def self.bq_type_from_embulk_type(embulk_type)
+           case embulk_type
+           when :boolean then 'BOOLEAN'
+           when :long then 'INTEGER'
+           when :double then 'FLOAT'
+           when :string then 'STRING'
+           when :timestamp then 'TIMESTAMP'
+           when :json then 'STRING' # NOTE: Default is not RECORD since it requires `fields`
+           else raise ArgumentError, "embulk type #{embulk_type} is not supported"
+           end
+         end
+
+         # @return [Hash] name => column_option.
+         # ToDo: recursively map fields?
+         def self.column_options_map(column_options)
+           (column_options || {}).map do |column_option|
+             [column_option['name'], column_option]
+           end.to_h
+         end
+
+         def self.fields_from_embulk_schema(task, schema)
+           column_options_map = self.column_options_map(task['column_options'])
+           schema.map do |column|
+             column_name = column[:name]
+             embulk_type = column[:type]
+             column_option = column_options_map[column_name] || {}
+             {}.tap do |field|
+               field[:name] = column_name
+               field[:type] = (column_option['type'] || bq_type_from_embulk_type(embulk_type)).upcase
+               field[:mode] = column_option['mode'] if column_option['mode']
+               field[:fields] = deep_symbolize_keys(column_option['fields']) if column_option['fields']
+             end
+           end
+         end
+
+         def self.deep_symbolize_keys(obj)
+           if obj.is_a?(Hash)
+             obj.inject({}) do |options, (key, value)|
+               options[(key.to_sym rescue key) || key] = deep_symbolize_keys(value)
+               options
+             end
+           elsif obj.is_a?(Array)
+             obj.map {|value| deep_symbolize_keys(value) }
+           else
+             obj
+           end
+         end
+
+         def self.create_job_id(task, path, table, fields)
+           elements = [
+             Digest::MD5.file(path).hexdigest,
+             task['dataset'],
+             table,
+             fields,
+             task['source_format'],
+             task['max_bad_records'],
+             task['field_delimiter'],
+             task['encoding'],
+             task['ignore_unknown_values'],
+             task['allow_quoted_newlines'],
+           ]
+
+           str = elements.map(&:to_s).join('')
+           md5 = Digest::MD5.hexdigest(str)
+           job_id = "embulk_job_#{md5}"
+           Embulk.logger.debug { "embulk-output-bigquery: create_job_id(#{path}, #{table}) #=> #{job_id}" }
+           job_id
+         end
+       end
+     end
+   end
+ end
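
A minimal usage sketch of the new Helper (hypothetical values; the plain hashes below stand in for Embulk's Schema/Column objects, which respond to [:name] and [:type] the same way, and an Embulk runtime is assumed to be loaded as in test/helper.rb):

    # Hypothetical illustration; not part of the gem or its diff.
    require 'embulk/output/bigquery/helper'

    task = {
      'dataset'        => 'my_dataset',
      'source_format'  => 'NEWLINE_DELIMITED_JSON',
      'column_options' => [
        {'name' => 'payload', 'type' => 'RECORD',
         'fields' => [{'name' => 'id', 'type' => 'INTEGER'}]},
      ],
    }
    schema = [
      {name: 'id',      type: :long},
      {name: 'payload', type: :json},
    ]

    fields = Embulk::Output::Bigquery::Helper.fields_from_embulk_schema(task, schema)
    # => [{:name=>"id", :type=>"INTEGER"},
    #     {:name=>"payload", :type=>"RECORD", :fields=>[{:name=>"id", :type=>"INTEGER"}]}]

    # The same fields feed the deterministic load-job id (path must exist, so shown commented out):
    # Embulk::Output::Bigquery::Helper.create_job_id(task, '/path/to/part.jsonl', 'my_table', fields)
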
data/lib/embulk/output/bigquery/value_converter_factory.rb ADDED
@@ -0,0 +1,292 @@
+ require 'time'
+ require 'tzinfo'
+ require 'json'
+ require_relative 'helper'
+
+ module Embulk
+   module Output
+     class Bigquery < OutputPlugin
+       class ValueConverterFactory
+         class NotSupportedType < StandardError; end
+         class TypeCastError < StandardError; end
+
+         # ref. https://cloud.google.com/bigquery/preparing-data-for-bigquery
+
+         DEFAULT_TIMESTAMP_FORMAT = "%Y-%m-%d %H:%M:%S.%6N" # BigQuery timestamp format
+         DEFAULT_TIMEZONE = "UTC"
+
+         # @param [Hash] task
+         # @option task [String] default_timestamp_format
+         # @option task [String] default_timezone
+         # @option task [Hash] column_options user defined column types
+         # @param [Schema] schema embulk defined column types
+         # @return [Array] an array whose index is column_index, and value is its converter (Proc)
+         def self.create_converters(task, schema)
+           column_options_map = Helper.column_options_map(task['column_options'])
+           default_timestamp_format = task['default_timestamp_format']
+           default_timezone = task['default_timezone']
+           schema.map do |column|
+             column_name = column[:name]
+             embulk_type = column[:type]
+             column_option = column_options_map[column_name] || {}
+             self.new(
+               embulk_type, column_option['type'],
+               timestamp_format: column_option['timestamp_format'],
+               timezone: column_option['timezone'],
+               strict: column_option['strict'],
+               default_timestamp_format: default_timestamp_format,
+               default_timezone: default_timezone,
+             ).create_converter
+           end
+         end
+
+         attr_reader :embulk_type, :type, :timestamp_format, :timezone, :zone_offset, :strict
+
+         def initialize(
+           embulk_type, type = nil,
+           timestamp_format: nil, timezone: nil, strict: nil,
+           default_timestamp_format: DEFAULT_TIMESTAMP_FORMAT,
+           default_timezone: DEFAULT_TIMEZONE
+         )
+           @embulk_type = embulk_type
+           @type = (type || Helper.bq_type_from_embulk_type(embulk_type)).upcase
+           @timestamp_format = timestamp_format
+           @default_timestamp_format = default_timestamp_format
+           @timezone = timezone || default_timezone
+           @zone_offset = get_zone_offset(@timezone) if @timezone
+           @strict = strict.nil? ? true : strict
+         end
+
+         def create_converter
+           case embulk_type
+           when :boolean then boolean_converter
+           when :long then long_converter
+           when :double then double_converter
+           when :string then string_converter
+           when :timestamp then timestamp_converter
+           when :json then json_converter
+           else raise NotSupportedType, "embulk type #{embulk_type} is not supported"
+           end
+         end
+
+         def with_typecast_error(val)
+           begin
+             yield(val)
+           rescue => e
+             raise_typecast_error(val)
+           end
+         end
+
+         def raise_typecast_error(val)
+           message = "cannot cast #{@embulk_type} `#{val}` to #{@type}"
+           if @strict
+             raise TypeCastError, message
+           else
+             Embulk.logger.trace { message }
+             return nil
+           end
+         end
+
+         def boolean_converter
+           case type
+           when 'BOOLEAN'
+             Proc.new {|val|
+               val
+             }
+           when 'STRING'
+             Proc.new {|val|
+               next nil if val.nil?
+               val.to_s
+             }
+           else
+             raise NotSupportedType, "cannot take column type #{type} for boolean column"
+           end
+         end
+
+         def long_converter
+           case type
+           when 'BOOLEAN'
+             Proc.new {|val|
+               next nil if val.nil?
+               next true if val == 1
+               next false if val == 0
+               raise_typecast_error(val)
+             }
+           when 'INTEGER'
+             Proc.new {|val|
+               val
+             }
+           when 'FLOAT'
+             Proc.new {|val|
+               next nil if val.nil?
+               val.to_f
+             }
+           when 'STRING'
+             Proc.new {|val|
+               next nil if val.nil?
+               val.to_s
+             }
+           when 'TIMESTAMP'
+             Proc.new {|val|
+               next nil if val.nil?
+               val # BigQuery supports UNIX timestamp
+             }
+           else
+             raise NotSupportedType, "cannot take column type #{type} for long column"
+           end
+         end
+
+         def double_converter
+           case type
+           when 'INTEGER'
+             Proc.new {|val|
+               next nil if val.nil?
+               val.to_i
+             }
+           when 'FLOAT'
+             Proc.new {|val|
+               val
+             }
+           when 'STRING'
+             Proc.new {|val|
+               next nil if val.nil?
+               val.to_s
+             }
+           when 'TIMESTAMP'
+             Proc.new {|val|
+               next nil if val.nil?
+               val # BigQuery supports UNIX timestamp
+             }
+           else
+             raise NotSupportedType, "cannot take column type #{type} for double column"
+           end
+         end
+
+         def string_converter
+           case type
+           when 'BOOLEAN'
+             Proc.new {|val|
+               next nil if val.nil?
+               next true if val == 'true'.freeze
+               next false if val == 'false'.freeze
+               raise_typecast_error(val)
+             }
+           when 'INTEGER'
+             Proc.new {|val|
+               next nil if val.nil?
+               with_typecast_error(val) do |val|
+                 Integer(val)
+               end
+             }
+           when 'FLOAT'
+             Proc.new {|val|
+               next nil if val.nil?
+               with_typecast_error(val) do |val|
+                 Float(val)
+               end
+             }
+           when 'STRING'
+             Proc.new {|val|
+               val
+             }
+           when 'TIMESTAMP'
+             if @timestamp_format
+               Proc.new {|val|
+                 next nil if val.nil?
+                 with_typecast_error(val) do |val|
+                   strptime_with_zone(val, @timestamp_format, zone_offset).to_f
+                 end
+               }
+             else
+               Proc.new {|val|
+                 next nil if val.nil?
+                 val # Users must take care of the BigQuery timestamp format themselves
+               }
+             end
+           when 'RECORD'
+             Proc.new {|val|
+               next nil if val.nil?
+               with_typecast_error(val) do |val|
+                 JSON.parse(val)
+               end
+             }
+           else
+             raise NotSupportedType, "cannot take column type #{type} for string column"
+           end
+         end
+
+         def timestamp_converter
+           case type
+           when 'INTEGER'
+             Proc.new {|val|
+               next nil if val.nil?
+               val.to_i
+             }
+           when 'FLOAT'
+             Proc.new {|val|
+               next nil if val.nil?
+               val.to_f
+             }
+           when 'STRING'
+             _timestamp_format = @timestamp_format || @default_timestamp_format
+             Proc.new {|val|
+               next nil if val.nil?
+               with_typecast_error(val) do |val|
+                 val.localtime(zone_offset).strftime(_timestamp_format)
+               end
+             }
+           when 'TIMESTAMP'
+             Proc.new {|val|
+               next nil if val.nil?
+               val.to_f # BigQuery supports UNIX timestamp
+             }
+           else
+             raise NotSupportedType, "cannot take column type #{type} for timestamp column"
+           end
+         end
+
+         # ToDo: recursive conversion
+         def json_converter
+           case type
+           when 'STRING'
+             Proc.new {|val|
+               next nil if val.nil?
+               val.to_json
+             }
+           when 'RECORD'
+             Proc.new {|val|
+               val
+             }
+           else
+             raise NotSupportedType, "cannot take column type #{type} for json column"
+           end
+         end
+
+         private
+
+         # [+-]HH:MM, [+-]HHMM, [+-]HH
+         NUMERIC_PATTERN = %r{\A[+-]\d\d(:?\d\d)?\z}
+
+         # Region/Zone, Region/Zone/Zone
+         NAME_PATTERN = %r{\A[^/]+/[^/]+(/[^/]+)?\z}
+
+         def strptime_with_zone(date, timestamp_format, zone_offset)
+           time = Time.strptime(date, timestamp_format)
+           utc_offset = time.utc_offset
+           time.localtime(zone_offset) + utc_offset - zone_offset
+         end
+
+         def get_zone_offset(timezone)
+           if NUMERIC_PATTERN === timezone
+             Time.zone_offset(timezone)
+           elsif NAME_PATTERN === timezone || 'UTC' == timezone
+             tz = TZInfo::Timezone.get(timezone)
+             tz.period_for_utc(Time.now).utc_total_offset
+           else
+             raise ArgumentError, "timezone format is invalid: #{timezone}"
+           end
+         end
+       end
+     end
+   end
+ end
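
A minimal sketch of the converters the new ValueConverterFactory hands back (illustrative values; an Embulk runtime is assumed to be loaded as in test/helper.rb, since cast failures are logged through Embulk.logger when strict is false):

    # Hypothetical illustration; not part of the gem or its diff.
    require 'embulk/output/bigquery/value_converter_factory'

    factory = Embulk::Output::Bigquery::ValueConverterFactory

    to_int = factory.new(:string, 'INTEGER').create_converter
    to_int.call('123')    # => 123
    to_int.call(nil)      # => nil
    # to_int.call('abc')  # raises TypeCastError because strict defaults to true

    lenient = factory.new(:string, 'INTEGER', strict: false).create_converter
    lenient.call('abc')   # => nil, and the failure is logged at trace level

    # A string column cast to TIMESTAMP with an explicit format and timezone
    to_ts = factory.new(:string, 'TIMESTAMP',
                        timestamp_format: '%Y-%m-%d %H:%M:%S',
                        timezone: 'Asia/Tokyo').create_converter
    to_ts.call('2016-02-26 00:00:00')  # => 1456412400.0 (UNIX seconds, interpreted as JST)
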
data/test/helper.rb ADDED
@@ -0,0 +1,13 @@
+ #!/usr/bin/env ruby
+
+ require 'test/unit'
+ require 'test/unit/rr'
+
+ # require 'embulk/java/bootstrap'
+ require 'embulk'
+ Embulk.setup
+ Embulk.logger = Embulk::Logger.new('/dev/null')
+
+ APP_ROOT = File.expand_path('../', __dir__)
+ EXAMPLE_ROOT = File.expand_path('../example', __dir__)
+ TEST_ROOT = File.expand_path(File.dirname(__FILE__))
data/test/test_bigquery_client.rb ADDED
@@ -0,0 +1,166 @@
+ require_relative './helper'
+ require 'embulk/output/bigquery/bigquery_client'
+ require 'csv'
+
+ # 1. Prepare /tmp/your-project-000.json
+ # 2. CONNECT=1 bundle exec ruby test/test_bigquery_client.rb
+
+ if ENV['CONNECT']
+   module Embulk
+     class Output::Bigquery
+       class TestBigqueryClient < Test::Unit::TestCase
+         class << self
+           def startup
+             FileUtils.mkdir_p('tmp')
+           end
+
+           def shutdown
+             FileUtils.rm_rf('tmp')
+           end
+         end
+
+         def client(task = {})
+           task = least_task.merge(task)
+           BigqueryClient.new(task, schema)
+         end
+
+         def least_task
+           {
+             'project' => JSON.parse(File.read('/tmp/your-project-000.json'))['project_id'],
+             'dataset' => 'your_dataset_name',
+             'table' => 'your_table_name',
+             'auth_method' => 'json_key',
+             'json_keyfile' => '/tmp/your-project-000.json',
+             'retries' => 3,
+             'timeout_sec' => 300,
+             'open_timeout_sec' => 300,
+             'job_status_max_polling_time' => 3600,
+             'job_status_polling_interval' => 10,
+             'source_format' => 'CSV'
+           }
+         end
+
+         def schema
+           Schema.new([
+             Column.new({index: 0, name: 'boolean', type: :boolean}),
+             Column.new({index: 1, name: 'long', type: :long}),
+             Column.new({index: 2, name: 'double', type: :double}),
+             Column.new({index: 3, name: 'string', type: :string}),
+             Column.new({index: 4, name: 'timestamp', type: :timestamp}),
+             Column.new({index: 5, name: 'json', type: :json}),
+           ])
+         end
+
+         def record
+           [true,1,1.1,'1',Time.parse("2016-02-26 +00:00"),'{"foo":"bar"}']
+         end
+
+         sub_test_case "client" do
+           def test_json_keyfile
+             assert_nothing_raised { BigqueryClient.new(least_task, schema).client }
+           end
+
+           def test_p12_keyfile
+             # pending
+           end
+         end
+
+         sub_test_case "create_dataset" do
+           def test_create_dataset
+             assert_nothing_raised { client.create_dataset }
+           end
+
+           def test_create_dataset_with_reference
+             response = client.get_dataset
+             any_instance_of(BigqueryClient) do |obj|
+               mock(obj).get_dataset('your_dataset_name') { response }
+             end
+             assert_nothing_raised do
+               client.create_dataset('your_dataset_name_old', reference: 'your_dataset_name')
+             end
+           end
+         end
+
+         sub_test_case "get_dataset" do
+           def test_get_dataset
+             assert_nothing_raised { client.create_dataset }
+             assert_nothing_raised { client.get_dataset }
+           end
+
+           def test_get_dataset_not_found
+             assert_raise(NotFoundError) {
+               client.get_dataset('something_does_not_exist')
+             }
+           end
+         end
+
+         sub_test_case "create_table" do
+           def test_create_table
+             client.delete_table('your_table_name')
+             assert_nothing_raised { client.create_table('your_table_name') }
+           end
+
+           def test_create_table_already_exists
+             assert_nothing_raised { client.create_table('your_table_name') }
+           end
+         end
+
+         sub_test_case "delete_table" do
+           def test_delete_table
+             client.create_table('your_table_name')
+             assert_nothing_raised { client.delete_table('your_table_name') }
+           end
+
+           def test_delete_table_not_found
+             assert_nothing_raised { client.delete_table('your_table_name') }
+           end
+         end
+
+         sub_test_case "get_table" do
+           def test_get_table
+             client.create_table('your_table_name')
+             assert_nothing_raised { client.get_table('your_table_name') }
+           end
+
+           def test_get_table_not_found
+             client.delete_table('your_table_name')
+             assert_raise(NotFoundError) {
+               client.get_table('your_table_name')
+             }
+           end
+         end
+
+         sub_test_case "fields" do
+           def test_fields_from_table
+             client.create_table('your_table_name')
+             fields = client.fields_from_table('your_table_name')
+             expected = [
+               {:type=>"BOOLEAN", :name=>"boolean"},
+               {:type=>"INTEGER", :name=>"long"},
+               {:type=>"FLOAT", :name=>"double"},
+               {:type=>"STRING", :name=>"string"},
+               {:type=>"TIMESTAMP", :name=>"timestamp"},
+               {:type=>"STRING", :name=>"json"},
+             ]
+             assert_equal expected, fields
+           end
+         end
+
+         sub_test_case "copy" do
+           def test_create_table
+             client.create_table('your_table_name')
+             assert_nothing_raised { client.copy('your_table_name', 'your_table_name_old') }
+           end
+         end
+
+         sub_test_case "load" do
+           def test_load
+             client.create_table('your_table_name')
+             File.write("tmp/your_file_name.csv", record.to_csv)
+             assert_nothing_raised { client.load("tmp/your_file_name.csv", 'your_table_name') }
+           end
+         end
+       end
+     end
+   end
+ end
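
For orientation, a hedged sketch (hypothetical, not part of the test suite) of running a whole schema through ValueConverterFactory.create_converters to convert one record's values; the plain hashes stand in for Embulk's Schema/Column objects, and the default format/timezone keys are passed explicitly here rather than coming from the plugin's config defaults:

    # Hypothetical illustration; assumes an Embulk runtime is loaded as in test/helper.rb.
    require 'embulk/output/bigquery/value_converter_factory'

    task = {
      'default_timestamp_format' => '%Y-%m-%d %H:%M:%S.%6N',
      'default_timezone'         => 'UTC',
    }
    schema = [
      {name: 'boolean', type: :boolean},
      {name: 'long',    type: :long},
      {name: 'string',  type: :string},
      {name: 'json',    type: :json},
    ]
    converters = Embulk::Output::Bigquery::ValueConverterFactory.create_converters(task, schema)

    record = [true, 1, '1', {'foo' => 'bar'}]
    record.each_with_index.map {|val, i| converters[i].call(val) }
    # => [true, 1, "1", "{\"foo\":\"bar\"}"]  # the json column is serialized to a STRING by default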