pgsync 0.5.5 → 0.6.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pgsync might be problematic; see the package's advisory details on the registry page for more information.

@@ -0,0 +1,325 @@
1
+ module PgSync
2
+ class Task
3
+ include Utils
4
+
5
+ attr_reader :source, :destination, :config, :table, :opts
6
+ attr_accessor :from_columns, :to_columns, :from_sequences, :to_sequences, :to_primary_key
7
+
8
+ def initialize(source:, destination:, config:, table:, opts:)
9
+ @source = source
10
+ @destination = destination
11
+ @config = config
12
+ @table = table
13
+ @opts = opts
14
+ @from_sequences = []
15
+ @to_sequences = []
16
+ end
17
+
18
+ def quoted_table
19
+ quote_ident_full(table)
20
+ end
21
+
22
+ def perform
23
+ with_notices do
24
+ handle_errors do
25
+ maybe_disable_triggers do
26
+ sync_data
27
+ end
28
+ end
29
+ end
30
+ end
31
+
32
+ def from_fields
33
+ @from_fields ||= from_columns.map { |c| c[:name] }
34
+ end
35
+
36
+ def to_fields
37
+ @to_fields ||= to_columns.map { |c| c[:name] }
38
+ end
39
+
40
+ def shared_fields
41
+ @shared_fields ||= to_fields & from_fields
42
+ end
43
+
44
+ def shared_sequences
45
+ @shared_sequences ||= to_sequences & from_sequences
46
+ end
47
+
48
+ def notes
49
+ notes = []
50
+ if shared_fields.empty?
51
+ notes << "No fields to copy"
52
+ else
53
+ extra_fields = to_fields - from_fields
54
+ notes << "Extra columns: #{extra_fields.join(", ")}" if extra_fields.any?
55
+
56
+ missing_fields = from_fields - to_fields
57
+ notes << "Missing columns: #{missing_fields.join(", ")}" if missing_fields.any?
58
+
59
+ extra_sequences = to_sequences - from_sequences
60
+ notes << "Extra sequences: #{extra_sequences.join(", ")}" if extra_sequences.any?
61
+
62
+ missing_sequences = from_sequences - to_sequences
63
+ notes << "Missing sequences: #{missing_sequences.join(", ")}" if missing_sequences.any?
64
+
65
+ from_types = from_columns.map { |c| [c[:name], c[:type]] }.to_h
66
+ to_types = to_columns.map { |c| [c[:name], c[:type]] }.to_h
67
+ different_types = []
68
+ shared_fields.each do |field|
69
+ if from_types[field] != to_types[field]
70
+ different_types << "#{field} (#{from_types[field]} -> #{to_types[field]})"
71
+ end
72
+ end
73
+ notes << "Different column types: #{different_types.join(", ")}" if different_types.any?
74
+ end
75
+ notes
76
+ end
77
+
78
+ def sync_data
79
+ raise Error, "This should never happen. Please file a bug." if shared_fields.empty?
80
+
81
+ sql_clause = String.new("")
82
+ sql_clause << " #{opts[:sql]}" if opts[:sql]
83
+
84
+ bad_fields = opts[:no_rules] ? [] : config["data_rules"]
85
+ primary_key = to_primary_key
86
+ copy_fields = shared_fields.map { |f| f2 = bad_fields.to_a.find { |bf, _| rule_match?(table, f, bf) }; f2 ? "#{apply_strategy(f2[1], table, f, primary_key)} AS #{quote_ident(f)}" : "#{quoted_table}.#{quote_ident(f)}" }.join(", ")
87
+ fields = shared_fields.map { |f| quote_ident(f) }.join(", ")
88
+
89
+ copy_to_command = "COPY (SELECT #{copy_fields} FROM #{quoted_table}#{sql_clause}) TO STDOUT"
90
+ if opts[:in_batches]
91
+ raise Error, "No primary key" if primary_key.empty?
92
+ primary_key = primary_key.first
93
+
94
+ destination.truncate(table) if opts[:truncate]
95
+
96
+ from_max_id = source.max_id(table, primary_key)
97
+ to_max_id = destination.max_id(table, primary_key) + 1
98
+
99
+ if to_max_id == 1
100
+ from_min_id = source.min_id(table, primary_key)
101
+ to_max_id = from_min_id if from_min_id > 0
102
+ end
103
+
104
+ starting_id = to_max_id
105
+ batch_size = opts[:batch_size]
106
+
107
+ i = 1
108
+ batch_count = ((from_max_id - starting_id + 1) / batch_size.to_f).ceil
109
+
110
+ while starting_id <= from_max_id
111
+ where = "#{quote_ident(primary_key)} >= #{starting_id} AND #{quote_ident(primary_key)} < #{starting_id + batch_size}"
112
+ log " #{i}/#{batch_count}: #{where}"
113
+
114
+ # TODO be smarter for advance sql clauses
115
+ batch_sql_clause = " #{sql_clause.length > 0 ? "#{sql_clause} AND" : "WHERE"} #{where}"
116
+
117
+ batch_copy_to_command = "COPY (SELECT #{copy_fields} FROM #{quoted_table}#{batch_sql_clause}) TO STDOUT"
118
+ copy(batch_copy_to_command, dest_table: table, dest_fields: fields)
119
+
120
+ starting_id += batch_size
121
+ i += 1
122
+
123
+ if opts[:sleep] && starting_id <= from_max_id
124
+ sleep(opts[:sleep])
125
+ end
126
+ end
127
+ elsif !opts[:truncate] && (opts[:overwrite] || opts[:preserve] || !sql_clause.empty?)
128
+ raise Error, "No primary key" if primary_key.empty?
129
+
130
+ # create a temp table
131
+ temp_table = "pgsync_#{rand(1_000_000_000)}"
132
+ destination.execute("CREATE TEMPORARY TABLE #{quote_ident_full(temp_table)} AS TABLE #{quoted_table} WITH NO DATA")
133
+
134
+ # load data
135
+ copy(copy_to_command, dest_table: temp_table, dest_fields: fields)
136
+
137
+ on_conflict = primary_key.map { |pk| quote_ident(pk) }.join(", ")
138
+ action =
139
+ if opts[:preserve]
140
+ "NOTHING"
141
+ else # overwrite or sql clause
142
+ setter = shared_fields.reject { |f| primary_key.include?(f) }.map { |f| "#{quote_ident(f)} = EXCLUDED.#{quote_ident(f)}" }
143
+ "UPDATE SET #{setter.join(", ")}"
144
+ end
145
+ destination.execute("INSERT INTO #{quoted_table} (SELECT * FROM #{quote_ident_full(temp_table)}) ON CONFLICT (#{on_conflict}) DO #{action}")
146
+ else
147
+ # use delete instead of truncate for foreign keys
148
+ if opts[:defer_constraints] || opts[:defer_constraints_v2]
149
+ destination.execute("DELETE FROM #{quoted_table}")
150
+ else
151
+ destination.truncate(table)
152
+ end
153
+ copy(copy_to_command, dest_table: table, dest_fields: fields)
154
+ end
155
+
156
+ # update sequences
157
+ shared_sequences.each do |seq|
158
+ value = source.last_value(seq)
159
+ destination.execute("SELECT setval(#{escape(quote_ident_full(seq))}, #{escape(value)})")
160
+ end
161
+
162
+ {status: "success"}
163
+ end
164
+
165
+ private
166
+
167
+ def with_notices
168
+ notices = []
169
+ [source, destination].each do |data_source|
170
+ data_source.send(:conn).set_notice_processor do |message|
171
+ notices << message.strip
172
+ end
173
+ end
174
+ result = yield
175
+ result[:notices] = notices if result
176
+ result
177
+ ensure
178
+ # clear notice processor
179
+ [source, destination].each do |data_source|
180
+ data_source.send(:conn).set_notice_processor
181
+ end
182
+ end
183
+
184
+ # TODO add retries
185
+ def handle_errors
186
+ yield
187
+ rescue => e
188
+ raise e if opts[:debug]
189
+
190
+ message =
191
+ case e
192
+ when PG::ConnectionBad
193
+ # likely fine to show simplified message here
194
+ # the full message will be shown when first trying to connect
195
+ "Connection failed"
196
+ when PG::Error
197
+ e.message.sub("ERROR: ", "")
198
+ when Error
199
+ e.message
200
+ else
201
+ "#{e.class.name}: #{e.message}"
202
+ end
203
+
204
+ {status: "error", message: message}
205
+ end
206
+
207
+ def copy(source_command, dest_table:, dest_fields:)
208
+ destination_command = "COPY #{quote_ident_full(dest_table)} (#{dest_fields}) FROM STDIN"
209
+
210
+ source.log_sql(source_command)
211
+ destination.log_sql(destination_command)
212
+
213
+ destination.conn.copy_data(destination_command) do
214
+ source.conn.copy_data(source_command) do
215
+ while (row = source.conn.get_copy_data)
216
+ destination.conn.put_copy_data(row)
217
+ end
218
+ end
219
+ end
220
+ end
221
+
222
+ # TODO better performance
223
+ def rule_match?(table, column, rule)
224
+ regex = Regexp.new('\A' + Regexp.escape(rule).gsub('\*','[^\.]*') + '\z')
225
+ regex.match(column) || regex.match("#{table.name}.#{column}") || regex.match("#{table.schema}.#{table.name}.#{column}")
226
+ end
227
+
228
+ # TODO wildcard rules
229
+ def apply_strategy(rule, table, column, primary_key)
230
+ if rule.is_a?(Hash)
231
+ if rule.key?("value")
232
+ escape(rule["value"])
233
+ elsif rule.key?("statement")
234
+ rule["statement"]
235
+ else
236
+ raise Error, "Unknown rule #{rule.inspect} for column #{column}"
237
+ end
238
+ else
239
+ case rule
240
+ when "untouched"
241
+ quote_ident(column)
242
+ when "unique_email"
243
+ "'email' || #{quoted_primary_key(table, primary_key, rule)}::text || '@example.org'"
244
+ when "unique_phone"
245
+ "(#{quoted_primary_key(table, primary_key, rule)}::bigint + 1000000000)::text"
246
+ when "unique_secret"
247
+ "'secret' || #{quoted_primary_key(table, primary_key, rule)}::text"
248
+ when "random_int", "random_number"
249
+ "(RANDOM() * 100)::int"
250
+ when "random_date"
251
+ "date '1970-01-01' + (RANDOM() * 10000)::int"
252
+ when "random_time"
253
+ "NOW() - (RANDOM() * 100000000)::int * INTERVAL '1 second'"
254
+ when "random_ip"
255
+ "(1 + RANDOM() * 254)::int::text || '.0.0.1'"
256
+ when "random_letter"
257
+ "chr(65 + (RANDOM() * 26)::int)"
258
+ when "random_string"
259
+ "RIGHT(MD5(RANDOM()::text), 10)"
260
+ when "null", nil
261
+ "NULL"
262
+ else
263
+ raise Error, "Unknown rule #{rule} for column #{column}"
264
+ end
265
+ end
266
+ end
267
+
268
+ def quoted_primary_key(table, primary_key, rule)
269
+ raise Error, "Single column primary key required for this data rule: #{rule}" unless primary_key.size == 1
270
+ "#{quoted_table}.#{quote_ident(primary_key.first)}"
271
+ end
272
+
273
+ def maybe_disable_triggers
274
+ if opts[:disable_integrity] || opts[:disable_integrity_v2] || opts[:disable_user_triggers]
275
+ destination.transaction do
276
+ triggers = destination.triggers(table)
277
+ triggers.select! { |t| t["enabled"] == "t" }
278
+ internal_triggers, user_triggers = triggers.partition { |t| t["internal"] == "t" }
279
+ integrity_triggers = internal_triggers.select { |t| t["integrity"] == "t" }
280
+ restore_triggers = []
281
+
282
+ # both --disable-integrity options require superuser privileges
283
+ # however, only v2 works on Amazon RDS, which added specific support for it
284
+ # https://aws.amazon.com/about-aws/whats-new/2014/11/10/amazon-rds-postgresql-read-replicas/
285
+ #
286
+ # session_replication_role disables more than foreign keys (like triggers and rules)
287
+ # this is probably fine, but keep the current default for now
288
+ if opts[:disable_integrity_v2] || (opts[:disable_integrity] && rds?)
289
+ # SET LOCAL lasts until the end of the transaction
290
+ # https://www.postgresql.org/docs/current/sql-set.html
291
+ destination.execute("SET LOCAL session_replication_role = replica")
292
+ elsif opts[:disable_integrity]
293
+ integrity_triggers.each do |trigger|
294
+ destination.execute("ALTER TABLE #{quoted_table} DISABLE TRIGGER #{quote_ident(trigger["name"])}")
295
+ end
296
+ restore_triggers.concat(integrity_triggers)
297
+ end
298
+
299
+ if opts[:disable_user_triggers]
300
+ # important!
301
+ # rely on Postgres to disable user triggers
302
+ # we don't want to accidentally disable non-user triggers if logic above is off
303
+ destination.execute("ALTER TABLE #{quoted_table} DISABLE TRIGGER USER")
304
+ restore_triggers.concat(user_triggers)
305
+ end
306
+
307
+ result = yield
308
+
309
+ # restore triggers that were previously enabled
310
+ restore_triggers.each do |trigger|
311
+ destination.execute("ALTER TABLE #{quoted_table} ENABLE TRIGGER #{quote_ident(trigger["name"])}")
312
+ end
313
+
314
+ result
315
+ end
316
+ else
317
+ yield
318
+ end
319
+ end
320
+
321
+ def rds?
322
+ destination.execute("SELECT name, setting FROM pg_settings WHERE name LIKE 'rds.%'").any?
323
+ end
324
+ end
325
+ end
@@ -0,0 +1,237 @@
1
+ module PgSync
2
+ class TaskResolver
3
+ include Utils
4
+
5
+ attr_reader :args, :opts, :source, :destination, :config, :first_schema, :notes
6
+
7
+ def initialize(args:, opts:, source:, destination:, config:, first_schema:)
8
+ @args = args
9
+ @opts = opts
10
+ @source = source
11
+ @destination = destination
12
+ @config = config
13
+ @groups = config["groups"] || {}
14
+ @first_schema = first_schema
15
+ @notes = []
16
+ end
17
+
18
+ def tasks
19
+ tasks = []
20
+
21
+ # get lists from args
22
+ groups, tables = process_args
23
+
24
+ # expand groups into tasks
25
+ groups.each do |group|
26
+ tasks.concat(group_to_tasks(group))
27
+ end
28
+
29
+ # expand tables into tasks
30
+ tables.each do |table|
31
+ tasks.concat(table_to_tasks(table))
32
+ end
33
+
34
+ # get default if none given
35
+ if !opts[:groups] && !opts[:tables] && args.size == 0
36
+ tasks.concat(default_tasks)
37
+ end
38
+
39
+ # resolve any tables that need it
40
+ tasks.each do |task|
41
+ task[:table] = fully_resolve(task[:table])
42
+ end
43
+
44
+ tasks
45
+ end
46
+
47
+ def group?(group)
48
+ @groups.key?(group)
49
+ end
50
+
51
+ private
52
+
53
+ def group_to_tasks(value)
54
+ group, param = value.split(":", 2)
55
+ raise Error, "Group not found: #{group}" unless group?(group)
56
+
57
+ @groups[group].map do |table|
58
+ table_sql = nil
59
+ if table.is_a?(Array)
60
+ table, table_sql = table
61
+ end
62
+
63
+ {
64
+ table: to_table(table),
65
+ sql: expand_sql(table_sql, param)
66
+ }
67
+ end
68
+ end
69
+
70
+ def table_to_tasks(value)
71
+ raise Error, "Cannot use parameters with tables" if value.include?(":")
72
+
73
+ tables =
74
+ if value.include?("*")
75
+ regex = Regexp.new('\A' + Regexp.escape(value).gsub('\*','[^\.]*') + '\z')
76
+ shared_tables.select { |t| regex.match(t.full_name) || regex.match(t.name) }
77
+ else
78
+ [to_table(value)]
79
+ end
80
+
81
+ tables.map do |table|
82
+ {
83
+ table: table,
84
+ sql: sql_arg # doesn't support params
85
+ }
86
+ end
87
+ end
88
+
89
+ # treats identifiers as if they were quoted (Users == "Users")
90
+ # this is different from Postgres (Users == "users")
91
+ #
92
+ # TODO add support for quoted identifiers like "my.schema"."my.table"
93
+ # so it's possible to specify identifiers with "." in them
94
+ def to_table(value)
95
+ parts = value.split(".")
96
+ case parts.size
97
+ when 1
98
+ # unknown schema
99
+ Table.new(nil, parts[0])
100
+ when 2
101
+ Table.new(*parts)
102
+ else
103
+ raise Error, "Cannot resolve table: #{value}"
104
+ end
105
+ end
106
+
107
+ def default_tasks
108
+ shared_tables.map do |table|
109
+ {
110
+ table: table
111
+ }
112
+ end
113
+ end
114
+
115
+ # tables that exists in both source and destination
116
+ # used when no tables specified, or a wildcard
117
+ # removes excluded tables and filters by schema
118
+ def shared_tables
119
+ tables = filter_tables(source.tables)
120
+
121
+ unless opts[:schema_only] || opts[:schema_first]
122
+ from_tables = tables
123
+ to_tables = filter_tables(destination.tables)
124
+
125
+ extra_tables = to_tables - from_tables
126
+ notes << "Extra tables: #{extra_tables.map { |t| friendly_name(t) }.join(", ")}" if extra_tables.any?
127
+
128
+ missing_tables = from_tables - to_tables
129
+ notes << "Missing tables: #{missing_tables.map { |t| friendly_name(t) }.join(", ")}" if missing_tables.any?
130
+
131
+ tables &= to_tables
132
+ end
133
+
134
+ tables
135
+ end
136
+
137
+ def filter_tables(tables)
138
+ tables = tables.dup
139
+
140
+ unless opts[:all_schemas]
141
+ # could support wildcard schemas as well
142
+ schemas = Set.new(opts[:schemas] ? to_arr(opts[:schemas]) : source.search_path)
143
+ tables.select! { |t| schemas.include?(t.schema) }
144
+ end
145
+
146
+ to_arr(opts[:exclude]).each do |value|
147
+ if value.include?("*")
148
+ regex = Regexp.new('\A' + Regexp.escape(value).gsub('\*','[^\.]*') + '\z')
149
+ tables.reject! { |t| regex.match(t.full_name) || regex.match(t.name) }
150
+ else
151
+ tables -= [fully_resolve(to_table(value), error: false)].compact
152
+ end
153
+ end
154
+
155
+ tables
156
+ end
157
+
158
+ def process_args
159
+ groups = to_arr(opts[:groups])
160
+ tables = to_arr(opts[:tables])
161
+ if args[0]
162
+ # could be a group, table, or mix
163
+ to_arr(args[0]).each do |value|
164
+ if group?(value.split(":", 2)[0])
165
+ groups << value
166
+ else
167
+ tables << value
168
+ end
169
+ end
170
+ end
171
+ [groups, tables]
172
+ end
173
+
174
+ def no_schema_tables
175
+ @no_schema_tables ||= begin
176
+ search_path_index = source.search_path.map.with_index.to_h
177
+ source.tables.group_by(&:name).map do |group, t2|
178
+ [group, t2.select { |t| search_path_index[t.schema] }.sort_by { |t| search_path_index[t.schema] }.first]
179
+ end.to_h
180
+ end
181
+ end
182
+
183
+ # for tables without a schema, find the table in the search path
184
+ def fully_resolve(table, error: true)
185
+ return table if table.schema
186
+ resolved_table = no_schema_tables[table.name]
187
+ raise Error, "Table not found in source: #{table.name}" if !resolved_table && error
188
+ resolved_table
189
+ end
190
+
191
+ # parse command line arguments and YAML
192
+ def to_arr(value)
193
+ if value.is_a?(Array)
194
+ value
195
+ else
196
+ # Split by commas, but don't use commas inside double quotes
197
+ # https://stackoverflow.com/questions/21105360/regex-find-comma-not-inside-quotes
198
+ value.to_s.split(/(?!\B"[^"]*),(?![^"]*"\B)/)
199
+ end
200
+ end
201
+
202
+ def sql_arg
203
+ args[1]
204
+ end
205
+
206
+ def expand_sql(sql, param)
207
+ # command line option takes precedence over group option
208
+ sql = sql_arg if sql_arg
209
+
210
+ return unless sql
211
+
212
+ # vars must match \w
213
+ missing_vars = sql.scan(/{\w+}/).map { |v| v[1..-2] }
214
+
215
+ vars = {}
216
+ if param
217
+ vars["id"] = cast(param)
218
+ vars["1"] = cast(param)
219
+ end
220
+
221
+ sql = sql.dup
222
+ vars.each do |k, v|
223
+ # only sub if in var list
224
+ sql.gsub!("{#{k}}", cast(v)) if missing_vars.delete(k)
225
+ end
226
+
227
+ raise Error, "Missing variables: #{missing_vars.uniq.join(", ")}" if missing_vars.any?
228
+
229
+ sql
230
+ end
231
+
232
+ # TODO quote vars in next major version
233
+ def cast(value)
234
+ value.to_s.gsub(/\A\"|\"\z/, '')
235
+ end
236
+ end
237
+ end