bigquery_migration 0.1.0.pre1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,105 @@
1
require 'thor'
require 'json'
require 'bigquery_migration'
require_relative 'action_runner'
require_relative 'hash_util'

class BigqueryMigration
  # Thor-based command line interface for bigquery_migration.
  class CLI < Thor
    # Exit with a non-zero status when thor fails to parse or dispatch.
    # cf. http://qiita.com/KitaitiMakoto/items/c6b9d6311c20a3cc21f9
    def self.exit_on_failure?
      true
    end

    # `run` is reserved by thor, we have to use def _run
    map "run" => "_run"

    option :config_path, aliases: ['-c'], type: :string,
      default: 'config.yml'
    option :log_level, aliases: ['-l'], type: :string,
      desc: 'Log level such as fatal, error, warn, info, or debug',
      default: 'info'
    option :log, type: :string,
      desc: 'Output log to a file',
      default: 'STDOUT'
    option :stdout, type: :string,
      desc: 'Redirect STDOUT to a file',
      default: 'STDOUT'
    option :stderr, type: :string,
      desc: 'Redirect STDERR to a file',
      default: 'STDERR'
    option :exec, type: :boolean,
      desc: 'Execute or dry-run (Default: dry-run)',
      default: false
    option :vars, type: :hash,
      desc: 'Variables used in ERB, thor hash format'
    option :output, aliases: ['-o'], type: :string,
      desc: 'Output result yaml to a file',
      default: 'STDOUT'

    desc 'run <config.yml>', 'run bigquery_migration'
    # Runs the migration described by the config file. Dry-run unless --exec.
    # Exits 1 when the action runner reports failure.
    def _run(config_path)
      opts = options.merge(dry_run: !options[:exec])

      init_logger
      reopen_stdout
      reopen_stderr

      result = ActionRunner.new(config_path, opts).run
      open_output do |io|
        io.puts mask_secret(HashUtil.deep_stringify_keys(result).to_yaml)
        logger.info { "DRY-RUN has finished. Use --exec option to run." } if opts[:dry_run]
      end
      exit(1) unless result[:success]
    end

    private

    # The process-wide logger installed by init_logger.
    def logger
      BigqueryMigration.logger
    end

    # Builds a BigqueryMigration::Logger from --log/--log-level and installs it.
    def init_logger
      new_logger = BigqueryMigration::Logger.new(options[:log])
      new_logger.level = options[:log_level]
      BigqueryMigration.logger = new_logger
    end

    # Redirects $stdout to the --stdout file unless it is the literal 'STDOUT'.
    def reopen_stdout
      $stdout.reopen(options[:stdout]) if options[:stdout] != 'STDOUT'
      $stdout.sync = true
    end

    # Redirects $stderr to the --stderr file unless it is the literal 'STDERR'.
    def reopen_stderr
      $stderr.reopen(options[:stderr]) if options[:stderr] != 'STDERR'
      $stderr.sync = true
    end

    # Yields the IO selected by --output: $stdout, $stderr, or a freshly
    # opened (truncated) file that is closed after the block returns.
    def open_output
      case (output = options[:output])
      when 'STDOUT'
        yield($stdout)
      when 'STDERR'
        yield($stderr)
      else
        File.open(output, 'w') { |io| yield(io) }
      end
    end

    # Masks secret-looking YAML values in place: any key ending in
    # `password`/`key`, plus PEM private key bodies.
    def mask_secret(yaml_string)
      %w(password key).each do |secret|
        yaml_string.gsub!(/([^ ]*#{secret}): .*$/, '\1: xxxxx')
      end
      yaml_string.gsub!(/(-----BEGIN\s+PRIVATE\s+KEY-----)[0-9A-Za-z+\/=\s\\]+(-----END\s+PRIVATE\s+KEY-----)/m, '\1 xxxxx \2')
      yaml_string
    end
  end
end
@@ -0,0 +1,51 @@
1
require 'set'
require 'yaml'
require 'erb'
require 'ostruct'

class BigqueryMigration
  # Loads a YAML (optionally ERB-templated) configuration file.
  #
  # ERB templates can call `include_file('path')` to inline another file,
  # resolved relative to the including file. Each file may only be ERB-rendered
  # once per loader, which guards against include cycles.
  class ConfigLoader
    attr_reader :config_path, :namespace

    # Raised when the same file is ERB-included twice.
    class AlreadyIncluded < ::StandardError; end
    # Backward-compatible alias for the historical, misspelled constant name;
    # existing `rescue AlreayIncluded` call sites keep working.
    AlreayIncluded = AlreadyIncluded

    # @param config_path [String] path to config.yml or config.yml.erb
    # @param vars [Hash] variables exposed to the ERB template via #namespace
    def initialize(config_path, vars = {})
      @config_path = File.expand_path(config_path)
      @included_files = Set.new
      @namespace = OpenStruct.new(vars)

      unless @namespace.respond_to?(:include_file)
        itself = self
        # ToDo: better way?
        @namespace.define_singleton_method(:include_file) do |path|
          # Resolve `path` relative to the file whose template invoked us.
          # NOTE(review): this parses `caller` backtrace text; the format can
          # change between Ruby versions — verify on interpreter upgrades.
          caller_path = caller[0][/^([^:]+):\d+:in `[^']*'$/, 1]
          abs_path = File.expand_path(path, File.dirname(caller_path))
          if File.extname(path) == '.erb'
            itself.load_erb(abs_path)
          else
            File.read(abs_path)
          end
        end
      end
    end

    # Parses the config file and returns the deserialized YAML
    # (rendering ERB first when the file has an .erb extension).
    def load
      if File.extname(config_path) == '.erb'
        YAML.load(load_erb(config_path))
      else
        YAML.load(File.read(config_path))
      end
    end

    # Renders the ERB template at `path` in #namespace's binding.
    # @raise [AlreadyIncluded] if `path` was already rendered by this loader
    def load_erb(path = config_path)
      unless @included_files.add?(path)
        raise AlreadyIncluded, "#{path} was included twice"
      end

      raw = File.read(path)
      # FIX: the positional trim-mode argument (ERB.new(raw, nil, "-")) has
      # been deprecated since Ruby 2.6; use the trim_mode keyword instead.
      erb = ERB.new(raw, trim_mode: "-")
      erb.filename = path
      erb.result(namespace.instance_eval { binding })
    end
  end
end
@@ -0,0 +1,6 @@
1
class BigqueryMigration
  # Error hierarchy: every library error descends from BigqueryMigration::Error,
  # so callers can rescue the single base class.
  Error = Class.new(StandardError)

  # Invalid or unsupported configuration / schema definition.
  ConfigError = Class.new(Error)

  # Job-timeout condition (name-derived; confirm semantics at raise sites).
  JobTimeoutError = Class.new(Error)

  # Missing-resource condition (name-derived; confirm semantics at raise sites).
  NotFoundError = Class.new(Error)
end
@@ -0,0 +1,35 @@
1
class BigqueryMigration
  # Recursive key-conversion helpers for nested Hash/Array structures.
  class HashUtil
    # Returns a copy of `hash` with every Hash key converted to a Symbol,
    # descending through nested Hashes and Arrays. Any non-collection value
    # (including a non-collection top-level argument) is returned unchanged.
    def self.deep_symbolize_keys(hash)
      case hash
      when Hash
        hash.each_with_object({}) do |(key, val), acc|
          acc[key.to_sym] = deep_symbolize_keys(val)
        end
      when Array
        hash.map { |val| deep_symbolize_keys(val) }
      else
        hash
      end
    end

    # Returns a copy of `hash` with every Hash key converted to a String,
    # descending through nested Hashes and Arrays. Any non-collection value
    # is returned unchanged.
    def self.deep_stringify_keys(hash)
      case hash
      when Hash
        hash.each_with_object({}) do |(key, val), acc|
          acc[key.to_s] = deep_stringify_keys(val)
        end
      when Array
        hash.map { |val| deep_stringify_keys(val) }
      else
        hash
      end
    end
  end
end
@@ -0,0 +1,45 @@
1
require 'logger'
# FIX: Time#iso8601 is provided by the stdlib 'time' extension; without this
# require, LogFormatter#call raised NoMethodError unless another file had
# happened to load it first.
require 'time'

class BigqueryMigration
  # Formats log lines as "<iso8601 time> [SEVERITY] <message>\n".
  class LogFormatter
    FORMAT = "%s [%s] %s\n"

    def initialize(opts={})
    end

    # ::Logger formatter interface. `progname` is accepted but not rendered.
    def call(severity, time, progname, msg)
      # FIX: route severity through format_severity — it was defined but
      # never called (currently an identity mapping, kept as a seam).
      FORMAT % [format_datetime(time), format_severity(severity), format_message(msg)]
    end

    private
    def format_datetime(time)
      time.iso8601
    end

    def format_severity(severity)
      severity
    end

    # Exceptions render as "Class (message)" plus an indented backtrace;
    # everything else renders with #to_s.
    def format_message(message)
      case message
      when ::Exception
        e = message
        # FIX: an exception that was never raised has a nil backtrace.
        "#{e.class} (#{e.message})\n #{(e.backtrace || []).join("\n ")}"
      else
        message.to_s
      end
    end
  end

  # ::Logger subclass that accepts the literal string 'STDOUT' as a device
  # and installs LogFormatter by default.
  class Logger < ::Logger
    def initialize(logdev, shift_age = 0, shift_size = 1048576)
      logdev = STDOUT if logdev == 'STDOUT'
      super(logdev, shift_age, shift_size)
      @formatter = LogFormatter.new
    end

    # Raw write to the underlying device, bypassing severity and formatting.
    def write(msg)
      @logdev.write msg
    end
  end
end
@@ -0,0 +1,388 @@
1
require 'csv'
require 'json'
require 'set'
require_relative 'error'
4
+
5
class BigqueryMigration
  # A BigQuery table schema: an Array of column hashes shaped like
  #   { name: 'id', type: 'INTEGER', mode: 'NULLABLE', fields: [...] }
  # where :fields is present only for RECORD columns. Columns are normalized
  # (symbol keys, upcased type, defaulted mode) and validated on construction.
  class Schema < ::Array
    ALLOWED_FIELD_TYPES = Set.new(['STRING', 'INTEGER', 'FLOAT', 'BOOLEAN', 'RECORD', 'TIMESTAMP'])
    ALLOWED_FIELD_MODES = Set.new(['NULLABLE', 'REQUIRED', 'REPEATED'])

    def initialize(columns = [])
      normalized = self.class.normalize_columns(columns)
      super(normalized)
      validate_columns!
    end

    def find_column_by_name(name)
      self.class.find_column_by_name(self, name)
    end

    def validate_columns!
      self.class.validate_columns!(self)
    end

    def validate_permitted_operations!(source_columns)
      target_columns = self
      self.class.validate_permitted_operations!(source_columns, target_columns)
    end

    def normalize_columns
      self.class.normalize_columns(self)
    end

    def shallow_normalize_columns
      self.class.shallow_normalize_columns(self)
    end

    def shallow_normalize_columns!
      # BUGFIX: previously called `shallow_normalize_column!(self)` (singular),
      # which treated the whole schema array as a single column hash and
      # corrupted it. Delegate to the plural class method.
      self.class.shallow_normalize_columns!(self)
    end

    def flattened_columns
      self.class.flattened_columns(self)
    end

    def equals?(source_columns)
      self.class.equals?(source_columns, self)
    end

    # self - source_columns
    def diff_columns(source_columns)
      self.class.diff_columns(source_columns, self)
    end

    # diff with only column names
    # self - source_columns
    def diff_columns_by_name(source_columns)
      self.class.diff_columns_by_name(source_columns, self)
    end

    # A.merge!(B) => B overwrites A
    # A.reverse_merge!(B) => A overwrites B, but A is modified
    def reverse_merge!(source_columns)
      self.class.reverse_merge!(source_columns, self)
    end

    def reject_columns!(drop_columns)
      self.class.reject_columns!(drop_columns, self)
    end

    def build_query_fields(source_columns)
      self.class.build_query_fields(source_columns, self)
    end

    class << self
      # The name must contain only letters (a-z, A-Z), numbers (0-9), or underscores (_),
      # and must start with a letter or underscore. The maximum length is 128 characters.
      def validate_name!(name)
        # \z (not \Z) so a name with a trailing newline is rejected too
        unless name =~ /\A[a-zA-Z_]+\w*\z/
          raise ConfigError, "Column name `#{name}` is invalid format"
        end
        # BUGFIX: was `name.length < 128`, which wrongly rejected names of
        # exactly 128 characters despite the documented maximum of 128.
        unless name.length <= 128
          raise ConfigError, "Column name `#{name}` must be 128 characters or less"
        end
      end

      def validate_type!(type)
        unless ALLOWED_FIELD_TYPES.include?(type)
          raise ConfigError, "Column type `#{type}` is not allowed type"
        end
      end

      def validate_mode!(mode)
        unless ALLOWED_FIELD_MODES.include?(mode)
          raise ConfigError, "Column mode `#{mode}` is not allowed mode"
        end
      end

      # Validates name/type/mode of every column, recursing into RECORD fields.
      def validate_columns!(columns)
        columns.each do |column|
          validate_name!(column[:name])
          validate_type!(column[:type])
          validate_mode!(column[:mode]) if column[:mode]

          if column[:type] == 'RECORD'
            validate_columns!(column[:fields])
          end
        end
      end

      def find_column_by_name(columns, name)
        (columns || []).find { |c| c[:name] == name }
      end

      # validates permitted changes from old schema to new schema
      def validate_permitted_operations!(source_columns, target_columns)
        flattened_source_columns = flattened_columns(normalize_columns(source_columns))
        flattened_target_columns = flattened_columns(normalize_columns(target_columns))

        flattened_target_columns.keys.each do |flattened_name|
          next unless flattened_source_columns.key?(flattened_name)
          validate_permitted_operations_for_type!(
            flattened_source_columns[flattened_name],
            flattened_target_columns[flattened_name]
          )
          validate_permitted_operations_for_mode!(
            flattened_source_columns[flattened_name],
            flattened_target_columns[flattened_name]
          )
        end
      end

      # @param [Hash] source_column
      # @param [Hash] target_column
      #
      # Disallowed conversion rule is as follows:
      #
      # type: RECORD => type: others
      # mode: REPEATED => change type
      #
      def validate_permitted_operations_for_type!(source_column, target_column)
        source_column = shallow_normalize_column(source_column)
        target_column = shallow_normalize_column(target_column)

        msg = "(#{source_column.to_h} => #{target_column.to_h})"
        if source_column[:type] == 'RECORD'
          if target_column[:type] != 'RECORD'
            raise ConfigError, "`RECORD` can not be changed #{msg}"
          end
        end
        if source_column[:mode] and source_column[:mode] == 'REPEATED'
          if source_column[:type] != target_column[:type]
            raise ConfigError, "`REPEATED` mode column's type can not be changed #{msg}"
          end
        end
      end

      # @param [Hash] source_column
      # @param [Hash] target_column
      #
      # Allowed conversion rule is as follows:
      #
      # (new)    => NULLABLE, REPEATED
      # NULLABLE => NULLABLE
      # REQUIRED => REQUIRED, NULLABLE
      # REPEATED => REPEATED
      def validate_permitted_operations_for_mode!(source_column, target_column)
        source_column = shallow_normalize_column(source_column)
        target_column = shallow_normalize_column(target_column)
        source_mode = source_column[:mode]
        target_mode = target_column[:mode]

        return if source_mode == target_mode
        msg = "(#{source_column.to_h} => #{target_column.to_h})"

        case source_mode
        when nil
          if target_mode == 'REQUIRED'
            raise ConfigError, "Newly adding a `REQUIRED` column is not allowed #{msg}"
          end
        when 'NULLABLE'
          raise ConfigError, "`NULLABLE` column can not be changed #{msg}"
        when 'REQUIRED'
          if target_mode == 'REPEATED'
            raise ConfigError, "`REQUIRED` column can not be changed to `REPEATED` #{msg}"
          end
        when 'REPEATED'
          raise ConfigError, "`REPEATED` column can not be changed #{msg}"
        end
      end

      # Shallow-normalizes every column and recurses into RECORD fields.
      def normalize_columns(columns)
        columns = shallow_normalize_columns(columns)
        columns.map do |column|
          if column[:type] == 'RECORD' and column[:fields]
            column[:fields] = normalize_columns(column[:fields])
          end
          column
        end
      end

      def shallow_normalize_columns(columns)
        columns.map {|column| shallow_normalize_column(column) }
      end

      def shallow_normalize_columns!(columns)
        columns.each {|column| shallow_normalize_column!(column) }
        columns
      end

      # Non-destructive single-column normalization (works on a shallow dup).
      def shallow_normalize_column(column)
        shallow_normalize_column!(column.dup)
      end

      # Symbolizes keys, upcases :type, defaults :mode to 'NULLABLE' — in place.
      def shallow_normalize_column!(column)
        symbolize_keys!(column)
        column[:type] = column[:type].upcase if column[:type]
        column[:mode] ||= 'NULLABLE'
        column[:mode] = column[:mode].upcase
        column
      end

      def symbolize_keys!(column)
        new_column = column.map do |key, val|
          [key.to_sym, val]
        end.to_h
        column.replace(new_column)
      end

      # @param [Array] columns
      # [{
      #   name: 'citiesLived',
      #   type: 'RECORD',
      #   fields: [
      #     {
      #       name: 'place', type: 'RECORD',
      #       fields: [
      #         { name: 'city', type: 'STRING' }, { name: 'postcode', type: 'STRING' }
      #       ]
      #     },
      #     { name: 'yearsLived', type: 'INTEGER' }
      #   ]
      # }]
      # @return Hash
      # {
      #   'citiesLived.place.city' => {
      #     type: 'STRING'
      #   },
      #   'citiesLived.place.postcode' => {
      #     type: 'STRING'
      #   },
      #   'citiesLived.yearsLived' => {
      #     type: 'INTEGER'
      #   }
      # }
      def flattened_columns(columns, parent_name: nil)
        result = {}
        columns.each do |column|
          column_name = parent_name.nil? ? column[:name] : "#{parent_name}.#{column[:name]}"
          if column[:type].upcase != 'RECORD'
            result[column_name] = {}.tap do |value|
              value[:type] = column[:type]
              value[:mode] = column[:mode] if column[:mode]
            end
          else
            result.merge!(flattened_columns(column[:fields], parent_name: column_name))
          end
        end
        result
      end

      def equals?(source_columns, target_columns)
        diff_columns(source_columns, target_columns).empty? and \
          diff_columns(target_columns, source_columns).empty?
      end

      # target_columns - source_columns
      def diff_columns(source_columns, target_columns)
        _target_columns = shallow_normalize_columns(target_columns)
        _source_columns = shallow_normalize_columns(source_columns)
        diff_columns = _target_columns - _source_columns # shallow diff

        diff_columns.map do |target_column|
          t = target_column
          source_column = find_column_by_name(_source_columns, target_column[:name])
          next t unless source_column
          next t unless target_column[:type] == 'RECORD' and source_column[:type] == 'RECORD'
          next t unless target_column[:fields] and source_column[:fields]
          # recursive diff for RECORD columns
          diff_fields = diff_columns(source_column[:fields], target_column[:fields])
          next nil if diff_fields.empty? # remove
          target_column[:fields] = diff_fields
          target_column
        end.compact
      end

      # diff with only column_names
      # target_columns - source_columns
      def diff_columns_by_name(source_columns, target_columns)
        _target_columns = shallow_normalize_columns(target_columns)
        _source_columns = shallow_normalize_columns(source_columns)
        diff_columns = _target_columns - _source_columns # shallow diff

        diff_columns.map do |target_column|
          t = target_column
          source_column = find_column_by_name(_source_columns, target_column[:name])
          next t unless source_column
          next nil unless target_column[:type] == 'RECORD' and source_column[:type] == 'RECORD'
          next nil unless target_column[:fields] and source_column[:fields]
          # recursive diff for RECORD columns
          diff_fields = diff_columns_by_name(source_column[:fields], target_column[:fields])
          next nil if diff_fields.empty? # remove
          target_column[:fields] = diff_fields
          target_column
        end.compact
      end

      # 1. target_column[:mode] ||= source_column[:mode] || 'NULLABLE' (not overwrite, but set if does not exist)
      # 2. Add into target_columns if a source column does not exist in target_columns
      #
      # @param [Array] source_columns
      # @param [Array] target_columns
      def reverse_merge!(source_columns, target_columns)
        shallow_normalize_columns!(source_columns)
        shallow_normalize_columns!(target_columns)

        source_columns.map do |source_column|
          if target_column = find_column_by_name(target_columns, source_column[:name])
            target_column[:mode] ||= source_column[:mode] || 'NULLABLE'
            target_column[:type] ||= source_column[:type] # should never be happened
            # Recursive merge fields of `RECORD` type
            if target_column[:type] == 'RECORD' and target_column[:fields] and source_column[:fields]
              reverse_merge!(source_column[:fields], target_column[:fields])
            end
          else
            target_column = source_column.dup
            target_column[:mode] ||= 'NULLABLE'
            target_columns << target_column
          end
        end
        target_columns
      end

      # Removes every column listed in drop_columns from target_columns,
      # following dotted paths into RECORD fields. Mutates target_columns.
      def reject_columns!(drop_columns, target_columns)
        flattened_drop_columns = flattened_columns(drop_columns)

        flattened_drop_columns.keys.each do |flattened_name|
          # paths like a %w(citiesLived place city child1)
          paths = flattened_name.split('.')
          # object_id of fields and target_columns are different.
          # But the internal elements refer to the same ones
          fields = target_columns
          paths.each do |path|
            # The last element of the path does not have the fields
            next if path == paths.last
            # find recursively
            column = fields.find { |f| f[:name] == path }
            next if column.nil?
            fields = column[:fields]
          end

          unless fields.empty?
            fields.delete_if { |f| f[:name] == paths.last }
          end
        end
        target_columns
      end

      # Builds SELECT field expressions that cast columns existing in the
      # source schema to the target type, and pass new columns through bare.
      def build_query_fields(source_columns, target_columns)
        flattened_source_columns = flattened_columns(source_columns)
        flattened_target_columns = flattened_columns(target_columns)

        query_fields = flattened_target_columns.map do |flattened_name, flattened_target_column|
          flattened_source_column = flattened_source_columns[flattened_name]
          target_type = flattened_target_column[:type].upcase

          if flattened_source_column
            "#{target_type}(#{flattened_name}) AS #{flattened_name}"
          else
            flattened_name
            # MEMO: NULL cast like "#{target_type}(NULL) AS #{flattened_name}" breaks RECORD columns as
            #   INTEGER(NULL) AS add_record.add_record.add_column1 => add_record_add_record_add_column1
            # We have to add columns with patch_table beforehand
          end
        end
      end
    end
  end
end