pgsync 0.5.2 → 0.6.1

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of pgsync might be problematic. Click here for more details.

@@ -0,0 +1,315 @@
1
module PgSync
  # Syncs a single table from the source database to the destination
  # database: streams rows over COPY (optionally in primary-key batches),
  # rewrites columns matched by "data_rules" with obfuscation expressions,
  # and brings shared sequences up to the source's values.
  class Task
    include Utils

    attr_reader :source, :destination, :config, :table, :opts
    # column metadata ([{name:, type:}, ...]) is assigned externally
    # (presumably by the caller that builds tasks) before perform is called
    attr_accessor :from_columns, :to_columns

    # source/destination - data-source objects wrapping PG connections
    # config - parsed configuration hash (read here for "data_rules")
    # table - table object (responds to schema/name; see rule_match?)
    # opts - option hash controlling batching, truncation, rules, etc.
    def initialize(source:, destination:, config:, table:, opts:)
      @source = source
      @destination = destination
      @config = config
      @table = table
      @opts = opts
    end

    # Fully quoted schema.table identifier for use in SQL.
    def quoted_table
      quote_ident_full(table)
    end

    # Entry point: runs the sync wrapped in notice capture, error-to-result
    # conversion, and (when requested) trigger disabling. Returns a result
    # hash ({status: "success"} or {status: "error", message: ...}).
    def perform
      with_notices do
        handle_errors do
          maybe_disable_triggers do
            sync_data
          end
        end
      end
    end

    # Column names present in the source table (memoized).
    def from_fields
      @from_fields ||= from_columns.map { |c| c[:name] }
    end

    # Column names present in the destination table (memoized).
    def to_fields
      @to_fields ||= to_columns.map { |c| c[:name] }
    end

    # Columns present on both sides — only these are copied.
    def shared_fields
      @shared_fields ||= to_fields & from_fields
    end

    # Sequences backing the shared columns on the source side;
    # empty when sequence syncing is disabled via opts[:no_sequences].
    def from_sequences
      @from_sequences ||= opts[:no_sequences] ? [] : source.sequences(table, shared_fields)
    end

    # Same as from_sequences, but for the destination.
    def to_sequences
      @to_sequences ||= opts[:no_sequences] ? [] : destination.sequences(table, shared_fields)
    end

    # Sequences present on both sides — only these get setval'd after copy.
    def shared_sequences
      @shared_sequences ||= to_sequences & from_sequences
    end

    # Human-readable warnings about schema drift between the two tables:
    # extra/missing columns, extra/missing sequences, and shared columns
    # whose declared types differ.
    def notes
      notes = []
      if shared_fields.empty?
        notes << "No fields to copy"
      else
        extra_fields = to_fields - from_fields
        notes << "Extra columns: #{extra_fields.join(", ")}" if extra_fields.any?

        missing_fields = from_fields - to_fields
        notes << "Missing columns: #{missing_fields.join(", ")}" if missing_fields.any?

        extra_sequences = to_sequences - from_sequences
        notes << "Extra sequences: #{extra_sequences.join(", ")}" if extra_sequences.any?

        missing_sequences = from_sequences - to_sequences
        notes << "Missing sequences: #{missing_sequences.join(", ")}" if missing_sequences.any?

        from_types = from_columns.map { |c| [c[:name], c[:type]] }.to_h
        to_types = to_columns.map { |c| [c[:name], c[:type]] }.to_h
        different_types = []
        shared_fields.each do |field|
          if from_types[field] != to_types[field]
            different_types << "#{field} (#{from_types[field]} -> #{to_types[field]})"
          end
        end
        notes << "Different column types: #{different_types.join(", ")}" if different_types.any?
      end
      notes
    end

    # Copies the table's data. Three strategies, chosen from opts:
    # 1. in_batches: copy primary-key ranges of batch_size rows, resuming
    #    past the destination's current max id
    # 2. overwrite/preserve/sql (without truncate): load into a temp table,
    #    then INSERT ... ON CONFLICT into the real table
    # 3. default: truncate (or DELETE when constraints are deferred) and
    #    copy everything
    # Afterwards, shared sequences are set to the source's values.
    # Returns {status: "success"}; errors propagate to handle_errors.
    def sync_data
      # callers are expected to skip tables with nothing to copy
      raise Error, "This should never happen. Please file a bug." if shared_fields.empty?

      # String.new: unfrozen buffer so << works under frozen_string_literal
      sql_clause = String.new("")
      sql_clause << " #{opts[:sql]}" if opts[:sql]

      # data rules map matching columns to obfuscation SQL expressions
      bad_fields = opts[:no_rules] ? [] : config["data_rules"]
      primary_key = destination.primary_key(table)
      # for each shared column: use the rule's expression (aliased back to
      # the column name) when a rule matches, otherwise select it verbatim
      copy_fields = shared_fields.map { |f| f2 = bad_fields.to_a.find { |bf, _| rule_match?(table, f, bf) }; f2 ? "#{apply_strategy(f2[1], table, f, primary_key)} AS #{quote_ident(f)}" : "#{quoted_table}.#{quote_ident(f)}" }.join(", ")
      fields = shared_fields.map { |f| quote_ident(f) }.join(", ")

      # capture source sequence values up front so they can be applied
      # to the destination after the rows are copied
      seq_values = {}
      shared_sequences.each do |seq|
        seq_values[seq] = source.last_value(seq)
      end

      copy_to_command = "COPY (SELECT #{copy_fields} FROM #{quoted_table}#{sql_clause}) TO STDOUT"
      if opts[:in_batches]
        raise Error, "No primary key" if primary_key.empty?
        # batching only supports a single-column primary key
        primary_key = primary_key.first

        destination.truncate(table) if opts[:truncate]

        from_max_id = source.max_id(table, primary_key)
        # resume one past whatever the destination already has
        to_max_id = destination.max_id(table, primary_key) + 1

        if to_max_id == 1
          # destination is empty; start from the source's min id instead
          from_min_id = source.min_id(table, primary_key)
          to_max_id = from_min_id if from_min_id > 0
        end

        starting_id = to_max_id
        batch_size = opts[:batch_size]

        i = 1
        batch_count = ((from_max_id - starting_id + 1) / batch_size.to_f).ceil

        # copy half-open id ranges [starting_id, starting_id + batch_size)
        while starting_id <= from_max_id
          where = "#{quote_ident(primary_key)} >= #{starting_id} AND #{quote_ident(primary_key)} < #{starting_id + batch_size}"
          log " #{i}/#{batch_count}: #{where}"

          # TODO be smarter for advance sql clauses
          batch_sql_clause = " #{sql_clause.length > 0 ? "#{sql_clause} AND" : "WHERE"} #{where}"

          batch_copy_to_command = "COPY (SELECT #{copy_fields} FROM #{quoted_table}#{batch_sql_clause}) TO STDOUT"
          copy(batch_copy_to_command, dest_table: table, dest_fields: fields)

          starting_id += batch_size
          i += 1

          # optional throttle between batches (skipped after the last one)
          if opts[:sleep] && starting_id <= from_max_id
            sleep(opts[:sleep])
          end
        end
      elsif !opts[:truncate] && (opts[:overwrite] || opts[:preserve] || !sql_clause.empty?)
        # upsert path: needs a primary key for ON CONFLICT
        raise Error, "No primary key" if primary_key.empty?

        # create a temp table
        temp_table = "pgsync_#{rand(1_000_000_000)}"
        destination.execute("CREATE TEMPORARY TABLE #{quote_ident_full(temp_table)} AS TABLE #{quoted_table} WITH NO DATA")

        # load data
        copy(copy_to_command, dest_table: temp_table, dest_fields: fields)

        on_conflict = primary_key.map { |pk| quote_ident(pk) }.join(", ")
        action =
          if opts[:preserve]
            # keep existing destination rows untouched
            "NOTHING"
          else # overwrite or sql clause
            setter = shared_fields.reject { |f| primary_key.include?(f) }.map { |f| "#{quote_ident(f)} = EXCLUDED.#{quote_ident(f)}" }
            "UPDATE SET #{setter.join(", ")}"
          end
        destination.execute("INSERT INTO #{quoted_table} (SELECT * FROM #{quote_ident_full(temp_table)}) ON CONFLICT (#{on_conflict}) DO #{action}")
      else
        # use delete instead of truncate for foreign keys
        if opts[:defer_constraints]
          destination.execute("DELETE FROM #{quoted_table}")
        else
          destination.truncate(table)
        end
        copy(copy_to_command, dest_table: table, dest_fields: fields)
      end
      # bring destination sequences in line with the source
      seq_values.each do |seq, value|
        destination.execute("SELECT setval(#{escape(seq)}, #{escape(value)})")
      end

      {status: "success"}
    end

    private

    # Captures Postgres NOTICE messages raised on either connection while
    # the block runs and attaches them to the result hash under :notices.
    # The default notice processor is always restored afterwards.
    def with_notices
      notices = []
      [source, destination].each do |data_source|
        data_source.send(:conn).set_notice_processor do |message|
          notices << message.strip
        end
      end
      result = yield
      result[:notices] = notices if result
      result
    ensure
      # clear notice processor
      [source, destination].each do |data_source|
        data_source.send(:conn).set_notice_processor
      end
    end

    # TODO add retries
    # Converts exceptions from the block into an error result hash so one
    # failed table doesn't abort the whole run. With opts[:debug], the
    # original exception is re-raised instead.
    def handle_errors
      yield
    rescue => e
      raise e if opts[:debug]

      message =
        case e
        when PG::ConnectionBad
          # likely fine to show simplified message here
          # the full message will be shown when first trying to connect
          "Connection failed"
        when PG::Error
          e.message.sub("ERROR: ", "")
        when Error
          e.message
        else
          "#{e.class.name}: #{e.message}"
        end

      {status: "error", message: message}
    end

    # Streams rows from the source connection's COPY ... TO STDOUT straight
    # into the destination connection's COPY ... FROM STDIN, row by row,
    # without buffering the whole table in memory.
    def copy(source_command, dest_table:, dest_fields:)
      destination_command = "COPY #{quote_ident_full(dest_table)} (#{dest_fields}) FROM STDIN"
      destination.conn.copy_data(destination_command) do
        source.conn.copy_data(source_command) do
          while (row = source.conn.get_copy_data)
            destination.conn.put_copy_data(row)
          end
        end
      end
    end

    # TODO better performance
    # Matches a data-rule pattern against a column. "*" in the rule matches
    # any run of characters except ".". The rule may target the bare column,
    # "table.column", or "schema.table.column".
    def rule_match?(table, column, rule)
      regex = Regexp.new('\A' + Regexp.escape(rule).gsub('\*','[^\.]*') + '\z')
      regex.match(column) || regex.match("#{table.name}.#{column}") || regex.match("#{table.schema}.#{table.name}.#{column}")
    end

    # TODO wildcard rules
    # Maps a data rule to the SQL expression selected in place of the raw
    # column. Hash rules supply a literal "value" or raw "statement";
    # string rules pick a named obfuscation strategy. The unique_* rules
    # derive deterministic fake values from the primary key.
    def apply_strategy(rule, table, column, primary_key)
      if rule.is_a?(Hash)
        if rule.key?("value")
          escape(rule["value"])
        elsif rule.key?("statement")
          rule["statement"]
        else
          raise Error, "Unknown rule #{rule.inspect} for column #{column}"
        end
      else
        case rule
        when "untouched"
          quote_ident(column)
        when "unique_email"
          "'email' || #{quoted_primary_key(table, primary_key, rule)}::text || '@example.org'"
        when "unique_phone"
          "(#{quoted_primary_key(table, primary_key, rule)}::bigint + 1000000000)::text"
        when "unique_secret"
          "'secret' || #{quoted_primary_key(table, primary_key, rule)}::text"
        when "random_int", "random_number"
          "(RANDOM() * 100)::int"
        when "random_date"
          "date '1970-01-01' + (RANDOM() * 10000)::int"
        when "random_time"
          "NOW() - (RANDOM() * 100000000)::int * INTERVAL '1 second'"
        when "random_ip"
          "(1 + RANDOM() * 254)::int::text || '.0.0.1'"
        when "random_letter"
          "chr(65 + (RANDOM() * 26)::int)"
        when "random_string"
          "RIGHT(MD5(RANDOM()::text), 10)"
        when "null", nil
          "NULL"
        else
          raise Error, "Unknown rule #{rule} for column #{column}"
        end
      end
    end

    # Quoted table.pk expression for unique_* rules; those rules require a
    # single-column primary key to derive values from.
    def quoted_primary_key(table, primary_key, rule)
      raise Error, "Single column primary key required for this data rule: #{rule}" unless primary_key.size == 1
      "#{quoted_table}.#{quote_ident(primary_key.first)}"
    end

    # When opts[:disable_integrity] and/or opts[:disable_user_triggers] are
    # set, disables the matching currently-enabled destination triggers for
    # the duration of the block (inside one transaction) and re-enables
    # exactly those triggers afterwards. Otherwise just yields.
    def maybe_disable_triggers
      if opts[:disable_integrity] || opts[:disable_user_triggers]
        destination.transaction do
          triggers = destination.triggers(table)
          # only consider triggers that are currently enabled
          triggers.select! { |t| t["enabled"] == "t" }
          internal_triggers, user_triggers = triggers.partition { |t| t["internal"] == "t" }
          integrity_triggers = internal_triggers.select { |t| t["integrity"] == "t" }
          restore_triggers = []

          if opts[:disable_integrity]
            integrity_triggers.each do |trigger|
              destination.execute("ALTER TABLE #{quoted_table} DISABLE TRIGGER #{quote_ident(trigger["name"])}")
            end
            restore_triggers.concat(integrity_triggers)
          end

          if opts[:disable_user_triggers]
            # important!
            # rely on Postgres to disable user triggers
            # we don't want to accidentally disable non-user triggers if logic above is off
            destination.execute("ALTER TABLE #{quoted_table} DISABLE TRIGGER USER")
            restore_triggers.concat(user_triggers)
          end

          result = yield

          # restore triggers that were previously enabled
          restore_triggers.each do |trigger|
            destination.execute("ALTER TABLE #{quoted_table} ENABLE TRIGGER #{quote_ident(trigger["name"])}")
          end

          result
        end
      else
        yield
      end
    end
  end
end
@@ -0,0 +1,235 @@
1
module PgSync
  # Expands command-line arguments, configured groups, and wildcard
  # patterns into the concrete list of per-table sync tasks.
  class TaskResolver
    include Utils

    # first_schema is stored but not referenced in this file —
    # presumably read by callers; verify against the rest of the gem
    attr_reader :args, :opts, :source, :destination, :config, :first_schema, :notes

    # args - positional CLI arguments (args[0] groups/tables, args[1] sql)
    # opts - option hash (:groups, :tables, :exclude, :schemas, ...)
    # source/destination - data-source objects used to list tables
    # config - parsed config hash (read here for "groups")
    def initialize(args:, opts:, source:, destination:, config:, first_schema:)
      @args = args
      @opts = opts
      @source = source
      @destination = destination
      @config = config
      @groups = config["groups"] || {}
      @first_schema = first_schema
      @notes = []
    end

    # Builds the task list: each task is a hash with :table (a Table,
    # fully resolved to a schema) and, for group/table tasks, :sql.
    # Falls back to all shared tables when nothing was specified.
    def tasks
      tasks = []

      # get lists from args
      groups, tables = process_args

      # expand groups into tasks
      groups.each do |group|
        tasks.concat(group_to_tasks(group))
      end

      # expand tables into tasks
      tables.each do |table|
        tasks.concat(table_to_tasks(table))
      end

      # get default if none given
      if !opts[:groups] && !opts[:tables] && args.size == 0
        tasks.concat(default_tasks)
      end

      # resolve any tables that need it
      tasks.each do |task|
        task[:table] = fully_resolve(task[:table])
      end

      tasks
    end

    # True when the given name is a configured group.
    def group?(group)
      @groups.key?(group)
    end

    private

    # Expands "group" or "group:param" into one task per table in the
    # group. Group entries may be [table, sql] pairs; the param is
    # substituted into the sql via expand_sql.
    def group_to_tasks(value)
      group, param = value.split(":", 2)
      raise Error, "Group not found: #{group}" unless group?(group)

      @groups[group].map do |table|
        table_sql = nil
        if table.is_a?(Array)
          table, table_sql = table
        end

        {
          table: to_table(table),
          sql: expand_sql(table_sql, param)
        }
      end
    end

    # Expands a table argument into tasks. A "*" wildcard matches any run
    # of characters except "." against shared tables (by full name or bare
    # name); otherwise the value names a single table. ":" params are only
    # supported for groups.
    def table_to_tasks(value)
      raise Error, "Cannot use parameters with tables" if value.include?(":")

      tables =
        if value.include?("*")
          regex = Regexp.new('\A' + Regexp.escape(value).gsub('\*','[^\.]*') + '\z')
          shared_tables.select { |t| regex.match(t.full_name) || regex.match(t.name) }
        else
          [to_table(value)]
        end

      tables.map do |table|
        {
          table: table,
          sql: sql_arg # doesn't support params
        }
      end
    end

    # treats identifiers as if they were quoted (Users == "Users")
    # this is different from Postgres (Users == "users")
    #
    # TODO add support for quoted identifiers like "my.schema"."my.table"
    # so it's possible to specify identifiers with "." in them
    #
    # Parses "table" (schema resolved later) or "schema.table" into a Table.
    def to_table(value)
      parts = value.split(".")
      case parts.size
      when 1
        # unknown schema
        Table.new(nil, parts[0])
      when 2
        Table.new(*parts)
      else
        raise Error, "Cannot resolve table: #{value}"
      end
    end

    # One task (no :sql) per shared table — used when nothing is specified.
    def default_tasks
      shared_tables.map do |table|
        {
          table: table
        }
      end
    end

    # tables that exists in both source and destination
    # used when no tables specified, or a wildcard
    # removes excluded tables and filters by schema
    def shared_tables
      tables = filter_tables(source.tables)

      unless opts[:schema_only] || opts[:schema_first]
        from_tables = tables
        to_tables = filter_tables(destination.tables)

        extra_tables = to_tables - from_tables
        notes << "Extra tables: #{extra_tables.map { |t| friendly_name(t) }.join(", ")}" if extra_tables.any?

        missing_tables = from_tables - to_tables
        notes << "Missing tables: #{missing_tables.map { |t| friendly_name(t) }.join(", ")}" if missing_tables.any?

        # keep only tables present on both sides
        tables &= to_tables
      end

      tables
    end

    # Restricts a table list to the requested schemas (defaulting to the
    # source's search path) and removes opts[:exclude] entries, which may
    # contain "*" wildcards.
    def filter_tables(tables)
      tables = tables.dup

      unless opts[:all_schemas]
        # could support wildcard schemas as well
        schemas = Set.new(opts[:schemas] ? to_arr(opts[:schemas]) : source.search_path)
        tables.select! { |t| schemas.include?(t.schema) }
      end

      to_arr(opts[:exclude]).each do |value|
        if value.include?("*")
          regex = Regexp.new('\A' + Regexp.escape(value).gsub('\*','[^\.]*') + '\z')
          tables.reject! { |t| regex.match(t.full_name) || regex.match(t.name) }
        else
          tables -= [fully_resolve(to_table(value))]
        end
      end

      tables
    end

    # Splits input into group names and table names. Positional args are
    # ambiguous, so each is treated as a group when its name (before any
    # ":param") matches a configured group, otherwise as a table.
    def process_args
      groups = to_arr(opts[:groups])
      tables = to_arr(opts[:tables])
      if args[0]
        # could be a group, table, or mix
        to_arr(args[0]).each do |value|
          if group?(value.split(":", 2)[0])
            groups << value
          else
            tables << value
          end
        end
      end
      [groups, tables]
    end

    # Map of bare table name -> Table, picking the table from the earliest
    # schema in the source's search path when the name exists in several.
    def no_schema_tables
      @no_schema_tables ||= begin
        search_path_index = source.search_path.map.with_index.to_h
        source.tables.group_by(&:name).map do |group, t2|
          [group, t2.select { |t| search_path_index[t.schema] }.sort_by { |t| search_path_index[t.schema] }.first]
        end.to_h
      end
    end

    # for tables without a schema, find the table in the search path
    def fully_resolve(table)
      return table if table.schema
      no_schema_tables[table.name] || (raise Error, "Table not found in source: #{table.name}")
    end

    # parse command line arguments and YAML
    def to_arr(value)
      if value.is_a?(Array)
        value
      else
        # Split by commas, but don't use commas inside double quotes
        # https://stackoverflow.com/questions/21105360/regex-find-comma-not-inside-quotes
        value.to_s.split(/(?!\B"[^"]*),(?![^"]*"\B)/)
      end
    end

    # The second positional CLI argument, used as a per-table sql clause.
    def sql_arg
      args[1]
    end

    # Substitutes {id}/{1} placeholders in a group's sql clause with the
    # group param. Raises when the clause contains placeholders that no
    # supplied variable fills.
    def expand_sql(sql, param)
      # command line option takes precedence over group option
      sql = sql_arg if sql_arg

      return unless sql

      # vars must match \w
      # starts as every placeholder found; entries are deleted as they are
      # substituted, so whatever remains is genuinely missing
      missing_vars = sql.scan(/{\w+}/).map { |v| v[1..-2] }

      vars = {}
      if param
        vars["id"] = cast(param)
        vars["1"] = cast(param)
      end

      sql = sql.dup
      vars.each do |k, v|
        # only sub if in var list
        sql.gsub!("{#{k}}", cast(v)) if missing_vars.delete(k)
      end

      raise Error, "Missing variables: #{missing_vars.uniq.join(", ")}" if missing_vars.any?

      sql
    end

    # TODO quote vars in next major version
    # Strips surrounding double quotes from a variable value.
    def cast(value)
      value.to_s.gsub(/\A\"|\"\z/, '')
    end
  end
end