bigquery_migration 0.1.0.pre1
- checksums.yaml +7 -0
- data/.gitignore +10 -0
- data/.rspec +2 -0
- data/.travis.yml +4 -0
- data/CODE_OF_CONDUCT.md +49 -0
- data/Gemfile +3 -0
- data/LICENSE.txt +19 -0
- data/README.md +107 -0
- data/Rakefile +10 -0
- data/bigquery_migration.gemspec +31 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/example/example.yml +22 -0
- data/example/schema.json +22 -0
- data/exe/bq_migrate +4 -0
- data/lib/bigquery_migration.rb +29 -0
- data/lib/bigquery_migration/action.rb +85 -0
- data/lib/bigquery_migration/action_runner.rb +60 -0
- data/lib/bigquery_migration/bigquery_wrapper.rb +675 -0
- data/lib/bigquery_migration/cli.rb +105 -0
- data/lib/bigquery_migration/config_loader.rb +51 -0
- data/lib/bigquery_migration/error.rb +6 -0
- data/lib/bigquery_migration/hash_util.rb +35 -0
- data/lib/bigquery_migration/logger.rb +45 -0
- data/lib/bigquery_migration/schema.rb +388 -0
- data/lib/bigquery_migration/time_with_zone.rb +38 -0
- data/lib/bigquery_migration/version.rb +3 -0
- metadata +183 -0
data/lib/bigquery_migration/action_runner.rb
@@ -0,0 +1,60 @@
require_relative 'config_loader'
require_relative 'error'
require_relative 'action'
require_relative 'hash_util'

class BigqueryMigration
  class ActionRunner
    attr_reader :config, :config_path, :opts

    def initialize(config_path = nil, opts = {})
      @config_path = config_path
      @opts = opts
      config = ConfigLoader.new(@config_path, opts[:vars]).load
      @config = HashUtil.deep_symbolize_keys(config)
      validate_config!
    end

    def run
      success, responses = run_actions
      { success: success, dry_run: @opts[:dry_run], actions: responses }
    end

    def run_actions
      success = true
      responses = []

      @config[:actions].each do |action_config|
        _success, result = Action.new(action_config, @opts).run
        response = action_config.merge({'result' => result})
        responses << response
        unless _success
          success = false
          break
        end
      end

      [success, responses]
    end

    def validate_config!
      unless config.is_a?(Hash)
        raise ConfigError, "config file format has to be YAML Hash"
      end

      unless config[:actions]
        raise ConfigError, "config must have `actions` key"
      end

      unless config[:actions].is_a?(Array)
        raise ConfigError, "config[:actions] must be an Array"
      end

      config[:actions].each do |action_config|
        unless action_config[:action]
          raise ConfigError, "Elements of `config[:actions]` must have `action` key"
        end
      end
    end
  end
end
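
For orientation before the wrapper below, a minimal sketch of driving ActionRunner directly. The per-action keys other than `action` are hypothetical (action.rb is not reproduced in this diff), and the require line assumes the gem's lib/ directory is on the load path:

    require 'bigquery_migration'  # assumed to pull in ActionRunner; otherwise require 'bigquery_migration/action_runner'

    # The config file must parse to a Hash with an `actions` Array whose elements
    # each carry an `action` key -- exactly what validate_config! enforces.
    # `vars` is handed to ConfigLoader; `dry_run: true` is passed through to each Action.
    runner = BigqueryMigration::ActionRunner.new('config.yml', vars: {}, dry_run: true)
    result = runner.run
    result[:success]  # => false as soon as one action fails (run_actions stops at the first failure)
    result[:actions]  # => the action configs, each merged with a 'result' entry
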
data/lib/bigquery_migration/bigquery_wrapper.rb
@@ -0,0 +1,675 @@
require 'csv'
require 'json'
require_relative 'schema'
require_relative 'error'
require_relative 'time_with_zone'
require_relative 'hash_util'
require 'google/apis/bigquery_v2'
require 'google/api_client/auth/key_utils'

class BigqueryMigration
  class BigqueryWrapper
    attr_reader :config

    def logger
      BigqueryMigration.logger
    end

    def initialize(config, opts = {})
      @config = HashUtil.deep_symbolize_keys(config)
      @opts = HashUtil.deep_symbolize_keys(opts)
      configure
    end

    def configure
      if json_keyfile = config[:json_keyfile]
        config[:json_key] =
          case json_keyfile
          when String
            File.read(json_keyfile)
          when Hash
            json_keyfile[:content]
          else
            raise ConfigError.new "Unsupported json_keyfile type"
          end
      else
        config[:json_key] = {
          project_id: config[:project_id],
          service_email: config[:service_email],
          private_key: config[:private_key],
        }.to_json
      end

      if config[:json_key]
        begin
          jsonkey_params = JSON.parse(config[:json_key])
        rescue => e
          raise ConfigError.new "json_keyfile is not a JSON file"
        end
      end

      if jsonkey_params
        config[:project] ||= jsonkey_params['project_id']
      end

      config[:retries] ||= 5
      config[:timeout_sec] ||= 300
      config[:open_timeout_sec] ||= 300
    end

    def project
      @project ||= config[:project] || raise(ConfigError, '`project` is required.')
    end

    def dataset
      @dataset ||= config[:dataset] || raise(ConfigError, '`dataset` is required.')
    end

    def table
      @table ||= config[:table] || raise(ConfigError, '`table` is required.')
    end

    def job_status_polling_interval
      @job_status_polling_interval ||= config[:job_status_polling_interval] || 5
    end

    def job_status_max_polling_time
      @job_status_max_polling_time ||= config[:job_status_polling_time] || 3600
    end

    def dry_run?
      @opts[:dry_run]
    end

    def head
      dry_run? ? '(DRY-RUN) ' : '(EXECUTE) '
    end

    def client
      return @cached_client if @cached_client && @cached_client_expiration > Time.now

      client = Google::Apis::BigqueryV2::BigqueryService.new
      client.request_options.retries = config[:retries]
      client.request_options.timeout_sec = config[:timeout_sec]
      client.request_options.open_timeout_sec = config[:open_timeout_sec]
      logger.debug { "client_options: #{client.client_options.to_h}" }
      logger.debug { "request_options: #{client.request_options.to_h}" }

      scope = "https://www.googleapis.com/auth/bigquery"

      key = StringIO.new(config[:json_key])
      auth = Google::Auth::ServiceAccountCredentials.make_creds(json_key_io: key, scope: scope)
      client.authorization = auth

      @cached_client_expiration = Time.now + 1800
      @cached_client = client
    end

    def existing_columns
      begin
        result = get_table
        response = result[:responses][:get_table]
        response.schema.fields.map {|column| column.to_h }
      rescue NotFoundError
        return []
      end
    end

    def get_dataset(dataset: nil)
      dataset ||= self.dataset
      begin
        logger.info { "Get dataset... #{project}:#{dataset}" }
        response = client.get_dataset(project, dataset)
      rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
        if e.status_code == 404
          raise NotFoundError, "Dataset #{project}:#{dataset} is not found"
        end

        response = {status_code: e.status_code, message: e.message, error_class: e.class}
        raise Error, "Failed to get_dataset(#{project}, #{dataset}), response:#{response}"
      end

      { responses: { get_dataset: response } }
    end

    def insert_dataset(dataset: nil, reference: nil)
      dataset ||= self.dataset
      begin
        logger.info { "#{head}Insert (create) dataset... #{project}:#{dataset}" }
        hint = {}
        if reference
          # get_dataset wraps the API response under [:responses][:get_dataset]
          response = get_dataset(dataset: reference)[:responses][:get_dataset]
          hint = { access: response.access }
        end
        body = {
          dataset_reference: {
            project_id: project,
            dataset_id: dataset,
          },
        }.merge(hint)
        opts = {}
        logger.debug { "#{head}insert_dataset(#{project}, #{body}, #{opts})" }
        unless dry_run?
          response = client.insert_dataset(project, body, opts)
        end
      rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
        if e.status_code == 409 && /Already Exists:/ =~ e.message
          # ignore 'Already Exists' error
          return {}
        end

        response = {status_code: e.status_code, message: e.message, error_class: e.class}
        raise Error, "Failed to insert_dataset(#{project}, #{body}, #{opts}), response:#{response}"
      end

      { responses: { insert_dataset: response } }
    end
    alias :create_dataset :insert_dataset

    def get_table(dataset: nil, table: nil)
      dataset ||= self.dataset
      table ||= self.table
      begin
        logger.debug { "Get table... #{project}:#{dataset}.#{table}" }
        response = client.get_table(project, dataset, table)
      rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
        if e.status_code == 404 # not found
          raise NotFoundError, "Table #{project}:#{dataset}.#{table} is not found"
        end

        response = {status_code: e.status_code, message: e.message, error_class: e.class}
        raise Error, "Failed to get_table(#{project}, #{dataset}, #{table}), response:#{response}"
      end

      { responses: { get_table: response } }
    end

    def insert_table(dataset: nil, table: nil, columns: )
      dataset ||= self.dataset
      table ||= self.table
      schema = Schema.new(columns)

      begin
        logger.info { "#{head}Insert (create) table... #{project}:#{dataset}.#{table}" }
        body = {
          table_reference: {
            table_id: table,
          },
          schema: {
            fields: schema,
          }
        }
        opts = {}
        logger.debug { "#{head}insert_table(#{project}, #{dataset}, #{body}, #{opts})" }
        unless dry_run?
          response = client.insert_table(project, dataset, body, opts)
        end
      rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
        if e.status_code == 409 && /Already Exists:/ =~ e.message
          # ignore 'Already Exists' error
          return {}
        end

        response = {status_code: e.status_code, message: e.message, error_class: e.class}
        raise Error, "Failed to insert_table(#{project}, #{dataset}, #{body}, #{opts}), response:#{response}"
      end

      { responses: { insert_table: response } }
    end
    alias :create_table :insert_table

    def delete_table(dataset: nil, table: nil)
      dataset ||= self.dataset
      table ||= self.table

      begin
        logger.info { "#{head}Delete (drop) table... #{project}:#{dataset}.#{table}" }
        unless dry_run?
          client.delete_table(project, dataset, table) # no response
          success = true
        end
      rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
        if e.status_code == 404 && /Not found:/ =~ e.message
          # ignore 'Not Found' error
          return {}
        end

        response = {status_code: e.status_code, message: e.message, error_class: e.class}
        raise Error, "Failed to delete_table(#{project}, #{dataset}, #{table}), response:#{response}"
      end

      { success: success }
    end
    alias :drop_table :delete_table

    def list_tables(dataset: nil, max_results: 999999)
      dataset ||= self.dataset

      tables = []
      begin
        logger.info { "List tables... #{project}:#{dataset}" }
        response = client.list_tables(project, dataset, max_results: max_results)
        while true
          _tables = (response.tables || []).map { |t| t.table_reference.table_id.to_s }
          tables.concat(_tables)
          if next_page_token = response.next_page_token
            response = client.list_tables(project, dataset, page_token: next_page_token, max_results: max_results)
          else
            break
          end
        end
      rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
        if e.status_code == 404 && /Not found:/ =~ e.message
          raise NotFoundError, "Dataset #{project}:#{dataset} is not found"
        end

        response = {status_code: e.status_code, message: e.message, error_class: e.class}
        logger.error { "list_tables(#{project}, #{dataset}), response:#{response}" }
        raise Error, "failed to list tables #{project}:#{dataset}, response:#{response}"
      end

      { tables: tables }
    end

    def purge_tables(dataset: nil, table_prefix: , suffix_format: , purge_before: , timezone: nil)
      dataset ||= self.dataset
      timezone ||= Time.now.strftime('%z')

      before_tables = list_tables[:tables]

      purge_before_t = TimeWithZone.strptime_with_zone(purge_before, suffix_format, timezone)
      tables = before_tables.select do |tbl|
        suffix = tbl.gsub(table_prefix, '')
        begin
          suffix_t = TimeWithZone.strptime_with_zone(suffix, suffix_format, timezone)
        rescue
          next
        end
        # skip if different from the suffix_format
        next if suffix_t.strftime(suffix_format) != suffix
        suffix_t <= purge_before_t
      end

      tables.each do |_table|
        delete_table(table: _table)
        # If you make more than 100 requests per second, throttling might occur.
        # See https://cloud.google.com/bigquery/quota-policy#apirequests
        sleep 1
      end

      { delete_tables: tables }
    end

    # rows:
    #   - id: 1
    #     type: one
    #     record:
    #       child1: 'child1'
    #       child2: 'child2'
    #   - id: 2
    #     type: two
    #     record:
    #       child1: 'child3'
    #       child2: 'child4'
    def insert_all_table_data(dataset: nil, table: nil, rows: )
      dataset ||= self.dataset
      table ||= self.table

      begin
        logger.info { "#{head}insertAll tableData... #{project}:#{dataset}.#{table}" }
        body = {
          rows: rows.map {|row| { json: row } },
        }
        opts = {}
        unless dry_run?
          response = client.insert_all_table_data(project, dataset, table, body, opts)
        end
      rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
        if e.status_code == 404 # not found
          raise NotFoundError, "Table #{project}:#{dataset}.#{table} is not found"
        end

        response = {status_code: e.status_code, message: e.message, error_class: e.class}
        logger.error {
          "insert_all_table_data(#{project}, #{dataset}, #{table}, #{opts}), response:#{response}"
        }
        raise Error, "failed to insert_all table_data #{project}:#{dataset}.#{table}, response:#{response}"
      end

      { responses: { insert_all_table_data: response } }
    end

    # @return Hash result of list table_data
    #
    # Example:
    # {
    #   columns:
    #     [
    #       {
    #         name: id,
    #         type: INTEGER
    #       },
    #       {
    #         name: type,
    #         type: STRING
    #       },
    #       {
    #         name: record.child1,
    #         type: STRING
    #       },
    #       {
    #         name: record.child2,
    #         type: STRING
    #       }
    #     ],
    #   values:
    #     [
    #       [2,"two","child3","child4"],
    #       [1,"one","child1","child2"]
    #     ],
    #   total_rows: 2
    # }
    def list_table_data(dataset: nil, table: nil, max_results: 100)
      dataset ||= self.dataset
      table ||= self.table

      begin
        logger.info { "list_table_data(#{project}, #{dataset}, #{table}, max_results: #{max_results})" }
        response = client.list_table_data(project, dataset, table, max_results: max_results)
      rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
        if e.status_code == 404 # not found
          raise NotFoundError, "Table #{project}:#{dataset}.#{table} is not found"
        end

        response = {status_code: e.status_code, message: e.message, error_class: e.class}
        logger.error { "list_table_data(#{project}, #{dataset}, #{table}, max_results: #{max_results})" }
        raise Error, "Failed to list table_data #{project}:#{dataset}.#{table}, response:#{response}"
      end

      flattened_columns = Schema.new(existing_columns).flattened_columns.map do |name, column|
        {name: name}.merge!(column)
      end
      if rows = response.to_h[:rows]
        flattened_values = flatten_values(rows)
      end

      {
        total_rows: response.total_rows,
        columns: flattened_columns,
        values: flattened_values,
        response: {
          list_table_data: response,
        }
      }
    end

    private def flatten_values(rows)
      rows.map do |r|
        if r.key?(:f)
          r[:f].map do |f|
            if f[:v].respond_to?(:key?) && f[:v].key?(:f)
              flatten_values(f[:v][:f])
            else
              f[:v]
            end
          end.flatten
        else
          r[:v]
        end
      end
    end
    def patch_table(dataset: nil, table: nil, columns: nil, add_columns: nil)
      dataset ||= self.dataset
      table ||= self.table

      if columns.nil? and add_columns.nil?
        raise ArgumentError, 'patch_table: `columns` or `add_columns` is required'
      end

      before_columns = existing_columns
      if columns # if already given
        schema = Schema.new(columns)
      else
        schema = Schema.new(add_columns)
        schema.reverse_merge!(before_columns)
      end
      schema.validate_permitted_operations!(before_columns)

      begin
        logger.info { "#{head}Patch table... #{project}:#{dataset}.#{table}" }
        fields = schema.map {|column| HashUtil.deep_symbolize_keys(column) }
        body = {
          schema: {
            fields: fields,
          }
        }
        opts = {}
        logger.debug { "#{head}patch_table(#{project}, #{dataset}, #{table}, #{body}, options: #{opts})" }
        unless dry_run?
          response = client.patch_table(project, dataset, table, body, options: opts)
        end
      rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
        if e.status_code == 404 # not found
          raise NotFoundError, "Table #{project}:#{dataset}.#{table} is not found"
        end

        response = {status_code: e.status_code, message: e.message, error_class: e.class}
        logger.error {
          "patch_table(#{project}, #{dataset}, #{table}, #{body}, options: #{opts}), response:#{response}"
        }
        raise Error, "Failed to patch table #{project}:#{dataset}.#{table}, response:#{response}"
      end

      after_columns = existing_columns

      {
        before_columns: before_columns,
        after_columns: after_columns,
        responses: { patch_table: response },
      }
    end
    alias :add_column :patch_table

    def copy_table(destination_table:, destination_dataset: nil, source_table: nil, source_dataset: nil, write_disposition: 'WRITE_TRUNCATE')
      source_table ||= self.table
      source_dataset ||= self.dataset
      destination_dataset ||= source_dataset

      body = {
        configuration: {
          copy: {
            create_disposition: 'CREATE_IF_NEEDED',
            write_disposition: write_disposition,
            source_table: {
              project_id: project,
              dataset_id: source_dataset,
              table_id: source_table,
            },
            destination_table: {
              project_id: project,
              dataset_id: destination_dataset,
              table_id: destination_table,
            },
          }
        }
      }
      opts = {}

      logger.info { "#{head}insert_job(#{project}, #{body}, #{opts})" }
      unless dry_run?
        response = client.insert_job(project, body, opts)
        get_response = wait_load('copy', response)
      end

      {
        responses: {
          insert_job: response,
          last_get_job: get_response,
        }
      }
    end

    def insert_select(query:, destination_table: nil, destination_dataset: nil, write_disposition: 'WRITE_TRUNCATE')
      destination_table ||= self.table
      destination_dataset ||= self.dataset

      body = {
        configuration: {
          query: {
            allow_large_results: true,
            flatten_results: false,
            write_disposition: write_disposition,
            query: query,
            destination_table: {
              project_id: self.project,
              dataset_id: destination_dataset,
              table_id: destination_table,
            },
          }
        }
      }
      opts = {}

      logger.info { "#{head}insert_job(#{project}, #{body}, #{opts})" }
      unless dry_run?
        response = client.insert_job(project, body, opts)
        get_response = wait_load('query', response)
      end

      {
        responses: {
          insert_job: response,
          last_get_job: get_response,
        }
      }
    end
    private def wait_load(kind, response)
      started = Time.now

      wait_interval = self.job_status_polling_interval
      max_polling_time = self.job_status_max_polling_time
      _response = response

      while true
        job_id = _response.job_reference.job_id
        elapsed = Time.now - started
        status = _response.status.state
        if status == "DONE"
          logger.info {
            "#{kind} job completed... " \
            "job id:[#{job_id}] elapsed_time:#{elapsed.to_f}sec status:[#{status}]"
          }
          break
        elsif elapsed.to_i > max_polling_time
          message = "Checking #{kind} job status... " \
            "job id:[#{job_id}] elapsed_time:#{elapsed.to_f}sec status:[TIMEOUT]"
          logger.info { message }
          raise JobTimeoutError.new(message)
        else
          logger.info {
            "Checking #{kind} job status... " \
            "job id:[#{job_id}] elapsed_time:#{elapsed.to_f}sec status:[#{status}]"
          }
          sleep wait_interval
          _response = client.get_job(project, job_id)
        end
      end

      # cf. http://www.rubydoc.info/github/google/google-api-ruby-client/Google/Apis/BigqueryV2/JobStatus#errors-instance_method
      # `errors` returns Array<Google::Apis::BigqueryV2::ErrorProto> if any error exists.
      # Otherwise, this returns nil.
      if _errors = _response.status.errors
        raise Error, "Failed during waiting a job, get_job(#{project}, #{job_id}), errors:#{_errors.map(&:to_h)}"
      end

      _response
    end
    def drop_column(table: nil, columns: nil, drop_columns: nil, backup_dataset: nil, backup_table: nil)
      table ||= self.table
      backup_dataset ||= self.dataset
      if columns.nil? and drop_columns.nil?
        raise ArgumentError, '`drop_columns` or `columns` is required'
      end

      result = { responses: {} }

      before_columns = existing_columns

      if columns # if already given
        schema = Schema.new(columns)
      else
        schema = Schema.new(existing_columns)
        schema.reject_columns!(drop_columns)
      end
      if schema.empty? && !dry_run?
        raise Error, 'No column remains'
      end

      schema.validate_permitted_operations!(before_columns)

      unless backup_dataset == self.dataset
        create_dataset(dataset: backup_dataset)
      end

      if backup_table
        _result = copy_table(source_table: table, destination_table: backup_table, destination_dataset: backup_dataset)
        result[:responses].merge!(_result[:responses])
      end

      unless (add_columns = schema.diff_columns_by_name(before_columns)).empty?
        _result = patch_table(add_columns: add_columns)
        result[:responses].merge!(_result[:responses])
      end

      query_fields = schema.build_query_fields(before_columns)
      query = "SELECT #{query_fields.join(',')} FROM [#{dataset}.#{table}]"
      _result = insert_select(query: query, destination_table: table)
      result[:responses].merge!(_result[:responses])

      after_columns = existing_columns

      result.merge!({before_columns: before_columns, after_columns: after_columns})
    end

    def migrate_table(table: nil, schema_file: nil, columns: nil, backup_dataset: nil, backup_table: nil)
      table ||= self.table
      backup_dataset ||= self.dataset

      if schema_file.nil? and columns.nil?
        raise ArgumentError, '`schema_file` or `columns` is required'
      end
      if schema_file
        columns = HashUtil.deep_symbolize_keys(JSON.parse(File.read(schema_file)))
      end
      Schema.validate_columns!(columns)

      before_columns = existing_columns

      result = {}
      if before_columns.empty?
        result = create_table(table: table, columns: columns)
      else
        add_columns  = Schema.diff_columns(before_columns, columns)
        drop_columns = Schema.diff_columns(columns, before_columns)

        if !drop_columns.empty?
          drop_column(table: table, columns: columns,
                      backup_dataset: backup_dataset, backup_table: backup_table)
        elsif !add_columns.empty?
          add_column(table: table, columns: columns)
        end
      end

      after_columns = existing_columns

      if after_columns.empty? and !dry_run?
        raise Error, "after_columns is empty. " \
          "before_columns: #{before_columns}, after_columns: #{after_columns}, columns: #{columns}"
      end

      result.merge!( before_columns: before_columns, after_columns: after_columns )
    end
  end
end
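
For the wrapper itself, a minimal dry-run sketch. The keyfile path, dataset, table, and schema file names are placeholders, and `require 'bigquery_migration'` is assumed to set up BigqueryMigration.logger, which the wrapper's logger method relies on:

    require 'bigquery_migration'

    config = {
      json_keyfile: '/path/to/service_account.json',  # a String path, or { content: '...' }
      dataset: 'my_dataset',
      table: 'my_table',
      # project is taken from the key file's project_id when not given explicitly
    }
    wrapper = BigqueryMigration::BigqueryWrapper.new(config, dry_run: true)

    # migrate_table diffs the live schema against schema.json: it creates the table
    # when missing, add_column()s new fields, and drop_column()s removed ones by
    # rewriting the table through insert_select (optionally backing it up first).
    result = wrapper.migrate_table(schema_file: 'schema.json')
    p result[:before_columns]
    p result[:after_columns]
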