pcrd 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +24 -0
- data/LICENSE +21 -0
- data/README.md +614 -0
- data/bin/pcrd +7 -0
- data/lib/pcrd/advisory_lock.rb +50 -0
- data/lib/pcrd/apply/engine.rb +184 -0
- data/lib/pcrd/apply/worker.rb +97 -0
- data/lib/pcrd/backfill/batch.rb +158 -0
- data/lib/pcrd/backfill/engine.rb +153 -0
- data/lib/pcrd/checkpoint/store.rb +217 -0
- data/lib/pcrd/cli.rb +274 -0
- data/lib/pcrd/commands/analyze.rb +125 -0
- data/lib/pcrd/commands/cleanup.rb +112 -0
- data/lib/pcrd/commands/demo.rb +152 -0
- data/lib/pcrd/commands/readiness.rb +30 -0
- data/lib/pcrd/commands/status.rb +129 -0
- data/lib/pcrd/commands/verify.rb +172 -0
- data/lib/pcrd/config/add_column.rb +7 -0
- data/lib/pcrd/config/analyze_config.rb +8 -0
- data/lib/pcrd/config/column_spec.rb +10 -0
- data/lib/pcrd/config/connection.rb +7 -0
- data/lib/pcrd/config/cutover_config.rb +7 -0
- data/lib/pcrd/config/load_error.rb +7 -0
- data/lib/pcrd/config/loader.rb +158 -0
- data/lib/pcrd/config/migrate_config.rb +21 -0
- data/lib/pcrd/config/root.rb +9 -0
- data/lib/pcrd/config/schema.rb +62 -0
- data/lib/pcrd/config/table.rb +9 -0
- data/lib/pcrd/config/verify_config.rb +7 -0
- data/lib/pcrd/config.rb +7 -0
- data/lib/pcrd/connection/client.rb +129 -0
- data/lib/pcrd/connection/error.rb +7 -0
- data/lib/pcrd/connection/replication.rb +108 -0
- data/lib/pcrd/cutover/orchestrator.rb +108 -0
- data/lib/pcrd/cutover/sequences.rb +138 -0
- data/lib/pcrd/demo/generator.rb +214 -0
- data/lib/pcrd/demo/schema.rb +154 -0
- data/lib/pcrd/error.rb +12 -0
- data/lib/pcrd/migration/orchestrator.rb +272 -0
- data/lib/pcrd/monitor/lag.rb +107 -0
- data/lib/pcrd/options.rb +15 -0
- data/lib/pcrd/output/analyze_printer.rb +173 -0
- data/lib/pcrd/output/cutover_printer.rb +128 -0
- data/lib/pcrd/output/preflight_printer.rb +119 -0
- data/lib/pcrd/output/readiness_printer.rb +72 -0
- data/lib/pcrd/preflight.rb +331 -0
- data/lib/pcrd/readiness/manifest.rb +201 -0
- data/lib/pcrd/replication/consumer.rb +235 -0
- data/lib/pcrd/replication/error.rb +10 -0
- data/lib/pcrd/replication/pgoutput/messages.rb +68 -0
- data/lib/pcrd/replication/pgoutput/parser.rb +316 -0
- data/lib/pcrd/reporter/console.rb +46 -0
- data/lib/pcrd/reporter/null.rb +14 -0
- data/lib/pcrd/schema/column.rb +59 -0
- data/lib/pcrd/schema/ddl.rb +71 -0
- data/lib/pcrd/schema/diff_entry.rb +36 -0
- data/lib/pcrd/schema/differ.rb +175 -0
- data/lib/pcrd/schema/object_reader.rb +187 -0
- data/lib/pcrd/schema/packer.rb +90 -0
- data/lib/pcrd/schema/reader.rb +118 -0
- data/lib/pcrd/schema/setup.rb +143 -0
- data/lib/pcrd/schema/setup_error.rb +9 -0
- data/lib/pcrd/schema/table_not_found.rb +8 -0
- data/lib/pcrd/schema/type_registry.rb +116 -0
- data/lib/pcrd/sql.rb +55 -0
- data/lib/pcrd/transform/row_transformer.rb +69 -0
- data/lib/pcrd/transform/type_map.rb +209 -0
- data/lib/pcrd/transform/validator.rb +106 -0
- data/lib/pcrd/version.rb +5 -0
- data/lib/pcrd.rb +11 -0
- metadata +231 -0
|
@@ -0,0 +1,158 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "yaml"
|
|
4
|
+
|
|
5
|
+
module Pcrd
|
|
6
|
+
module Config
|
|
7
|
+
MIGRATE_DEFAULTS = {
|
|
8
|
+
batch_size: 10_000,
|
|
9
|
+
lag_threshold_bytes: 1_048_576, # 1 MB
|
|
10
|
+
checkpoint_db: "./pcrd_checkpoint.sqlite3"
|
|
11
|
+
}.freeze
|
|
12
|
+
|
|
13
|
+
VERIFY_DEFAULTS = { sample_size: 1_000 }.freeze
|
|
14
|
+
CUTOVER_DEFAULTS = { sequence_buffer: 1_000, lag_drain_timeout: 300 }.freeze
|
|
15
|
+
|
|
16
|
+
class Loader
|
|
17
|
+
# Convenience entry point: instantiates a Loader for +path+ and runs it.
# Returns a Config::Root. Raises Config::LoadError on any problem.
def self.load(path)
  loader = new(path)
  loader.load
end
|
|
21
|
+
|
|
22
|
+
# Remembers the config file location; nothing is read until #load is called.
#
# path - path of the YAML config file.
def initialize(path)
  @path = path
end
|
|
25
|
+
|
|
26
|
+
# Runs the whole pipeline: read the file, parse the YAML, validate the
# parsed data, then build the config object from it.
def load
  parse_yaml(read_file).then do |data|
    validate!(data)
    build(data)
  end
end
|
|
32
|
+
|
|
33
|
+
private
|
|
34
|
+
|
|
35
|
+
# Returns the raw text of the config file at @path.
# A missing or unreadable file is reported as LoadError with the path
# included in the message.
def read_file
  File.read(@path)
rescue Errno::ENOENT, Errno::EACCES => e
  message =
    case e
    when Errno::ENOENT then "Config file not found: #{@path}"
    else "Cannot read config file (permission denied): #{@path}"
    end
  raise LoadError, message
end
|
|
42
|
+
|
|
43
|
+
# Parses the raw YAML text and returns a Hash with symbolized keys.
#
# Every parsing problem is reported as LoadError so callers only have one
# error type to handle:
#   * syntax errors (Psych::SyntaxError),
#   * anything else safe_load refuses (Psych::DisallowedClass for values
#     such as dates or custom tags, Psych::BadAlias for aliases),
#   * documents whose top level is not a mapping (an empty file, a bare
#     scalar, a list), which the later build steps cannot index by key.
def parse_yaml(raw)
  data = YAML.safe_load(raw, symbolize_names: true)
  return data if data.is_a?(Hash)

  raise LoadError,
        "Config file must contain a YAML mapping at the top level (got #{data.class})"
rescue Psych::SyntaxError => e
  raise LoadError, "Config file has invalid YAML: #{e.message}"
rescue Psych::Exception => e
  # DisallowedClass / BadAlias are not SyntaxErrors but are still config problems.
  raise LoadError, "Config file has unsupported YAML: #{e.message}"
end
|
|
48
|
+
|
|
49
|
+
# Checks the parsed data against Schema::DEFINITION.
# Returns nil when it is valid; otherwise raises LoadError listing one
# "path: message" line per schema error.
def validate!(data)
  outcome = Schema::DEFINITION.call(data)
  return if outcome.success?

  lines = outcome.errors.messages.map { |issue| " #{issue.path.join(".")}: #{issue.text}" }
  raise LoadError, "Config file is invalid:\n#{lines.join("\n")}"
end
|
|
58
|
+
|
|
59
|
+
# Assembles the Root object from validated data. Only :source is always
# built; each other section is built when present and left nil otherwise.
def build(data)
  target_raw, migrate_raw, analyze_raw, verify_raw, cutover_raw =
    data.values_at(:target, :migrate, :analyze, :verify, :cutover)

  Root.new(
    source: build_connection(data[:source], env_prefix: "SOURCE"),
    target: (build_connection(target_raw, env_prefix: "TARGET") if target_raw),
    migrate: (build_migrate(migrate_raw) if migrate_raw),
    analyze: (build_analyze(analyze_raw) if analyze_raw),
    verify: (build_verify(verify_raw) if verify_raw),
    cutover: (build_cutover(cutover_raw) if cutover_raw),
    path: @path
  )
end
|
|
70
|
+
|
|
71
|
+
# Builds a Connection from one connection section of the config.
#
# The password comes from the config when given, otherwise from the
# PCRD_<env_prefix>_PASSWORD environment variable. When neither is set it
# stays nil, which falls back to .pgpass / PGPASSWORD at connection time.
# The port defaults to 5432.
def build_connection(raw, env_prefix:)
  secret = raw[:password] || ENV["PCRD_#{env_prefix}_PASSWORD"]
  attributes = {
    host: raw[:host],
    port: raw.fetch(:port, 5432),
    database: raw[:database],
    user: raw[:user],
    password: secret
  }
  Connection.new(**attributes)
end
|
|
83
|
+
|
|
84
|
+
# Builds the MigrateConfig for the `migrate` section.
#
# Slot and publication names default to "pcrd_<base>" / "pcrd_pub_<base>",
# where <base> comes from derive_slot_base. The remaining tunables fall
# back to MIGRATE_DEFAULTS, and max_rows_per_second stays nil when absent.
def build_migrate(raw)
  base = derive_slot_base(raw[:tables])
  defaults = MIGRATE_DEFAULTS
  table_entries = raw[:tables] || []

  MigrateConfig.new(
    replication_slot: raw.fetch(:replication_slot, "pcrd_#{base}"),
    publication: raw.fetch(:publication, "pcrd_pub_#{base}"),
    checkpoint_db: raw.fetch(:checkpoint_db, defaults[:checkpoint_db]),
    batch_size: raw.fetch(:batch_size, defaults[:batch_size]),
    lag_threshold_bytes: raw.fetch(:lag_threshold_bytes, defaults[:lag_threshold_bytes]),
    max_rows_per_second: raw[:max_rows_per_second],
    tables: table_entries.map { |entry| build_table(entry) }
  )
end
|
|
96
|
+
|
|
97
|
+
# Builds a Table from one entry of `migrate.tables`.
# Missing `columns` / `add_columns` are treated as empty, and
# optimize_column_order defaults to false.
def build_table(raw)
  column_overrides = raw[:columns] || {}
  additions = raw[:add_columns] || []

  Table.new(
    name: raw[:name],
    optimize_column_order: raw.fetch(:optimize_column_order, false),
    columns: build_column_specs(column_overrides),
    add_columns: additions.map { |entry| build_add_column(entry) }
  )
end
|
|
105
|
+
|
|
106
|
+
# Converts the `columns` mapping (column name => options) into a Hash of
# String column name => ColumnSpec. A nil options value is treated as an
# empty spec; each spec is checked by validate_column_spec! first.
def build_column_specs(raw_columns)
  by_name = raw_columns.transform_keys(&:to_s)

  by_name.each_with_object({}) do |(column, options), specs|
    options ||= {}
    validate_column_spec!(options)
    specs[column] = ColumnSpec.new(
      type: options[:type]&.to_s,
      rename: options[:rename]&.to_s,
      drop: options.fetch(:drop, false)
    )
  end
end
|
|
117
|
+
|
|
118
|
+
# Structural check for one column spec from `migrate.tables[].columns`.
#
# The schema only verifies that `columns` is a hash, so the per-column
# values reach this method unchecked. A scalar or list value (for example
# `id: bigint` instead of `id: { type: bigint }`) is rejected here with
# LoadError instead of failing later with a TypeError from `spec[:drop]`.
#
# Raises LoadError when the spec is not a mapping, or when it combines
# `drop: true` with `type` or `rename`. Returns nil otherwise.
def validate_column_spec!(spec)
  unless spec.is_a?(Hash)
    raise LoadError,
          "A column spec must be a mapping of options (type, rename, drop), got #{spec.class}"
  end
  return unless spec[:drop] && (spec[:type] || spec[:rename])

  raise LoadError,
        "A column spec cannot combine `drop: true` with `type` or `rename`"
end
|
|
124
|
+
|
|
125
|
+
# Builds an AddColumn from one `add_columns` entry; a missing :default
# becomes nil.
def build_add_column(raw)
  column_name, column_type, default_value = raw.values_at(:name, :type, :default)
  AddColumn.new(name: column_name, type: column_type, default: default_value)
end
|
|
132
|
+
|
|
133
|
+
# Builds the AnalyzeConfig; table names are stringified, and a missing
# `tables` key is passed through as nil.
def build_analyze(raw)
  names = raw[:tables]
  names = names.map { |table| table.to_s } unless names.nil?
  AnalyzeConfig.new(tables: names)
end
|
|
136
|
+
|
|
137
|
+
# Builds the VerifyConfig, falling back to VERIFY_DEFAULTS for sample_size.
def build_verify(raw)
  sample = raw.fetch(:sample_size) { VERIFY_DEFAULTS[:sample_size] }
  VerifyConfig.new(sample_size: sample)
end
|
|
142
|
+
|
|
143
|
+
# Builds the CutoverConfig, falling back to CUTOVER_DEFAULTS for any key
# the config file leaves out.
def build_cutover(raw)
  buffer = raw.fetch(:sequence_buffer) { CUTOVER_DEFAULTS[:sequence_buffer] }
  drain_timeout = raw.fetch(:lag_drain_timeout) { CUTOVER_DEFAULTS[:lag_drain_timeout] }
  CutoverConfig.new(sequence_buffer: buffer, lag_drain_timeout: drain_timeout)
end
|
|
149
|
+
|
|
150
|
+
# Derives a short stable name for the replication slot / publication
# from the first table name when not explicitly configured.
#
# Non-word characters become "_" and the result is lowercased. The base is
# capped at 58 characters because build_migrate prefixes it with "pcrd_" to
# form the slot name, and Connection::Replication only accepts slot names
# of at most 63 characters. Names that already fit are returned unchanged,
# so previously derived slot names stay the same.
#
# Returns "migration" when there is no table to derive a name from.
def derive_slot_base(tables)
  first = tables&.first&.dig(:name)
  return "migration" unless first

  first.gsub(/\W/, "_").downcase[0, 58]
end
|
|
156
|
+
end
|
|
157
|
+
end
|
|
158
|
+
end
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Pcrd
|
|
4
|
+
module Config
|
|
5
|
+
# Immutable settings for the `migrate` section of the config.
MigrateConfig = Data.define(
  *%i[
    replication_slot
    publication
    checkpoint_db
    batch_size
    lag_threshold_bytes
    tables
    max_rows_per_second
  ]
) do
  # max_rows_per_second is optional (nil = unthrottled). Giving it a default
  # here keeps callers and configs that omit the key working.
  def initialize(max_rows_per_second: nil, **required)
    super(max_rows_per_second: max_rows_per_second, **required)
  end
end
|
|
20
|
+
end
|
|
21
|
+
end
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Pcrd
|
|
4
|
+
module Config
|
|
5
|
+
# Top-level config object returned by Config::Loader.load.
# Only source is always present; target, migrate, analyze, verify and
# cutover are optional (may be nil). path is the config file location.
Root = Data.define(*%i[source target migrate analyze verify cutover path])
|
|
8
|
+
end
|
|
9
|
+
end
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "dry-schema"
|
|
4
|
+
|
|
5
|
+
module Pcrd
|
|
6
|
+
module Config
|
|
7
|
+
module Schema
|
|
8
|
+
# Validates a YAML config hash (already natively typed by YAML.safe_load).
# Uses Dry::Schema.define (no coercion) since YAML gives us real Ruby types.
#
# Sections: `source` is required; `target`, `migrate`, `analyze`, `verify`
# and `cutover` are optional.
#
# NOTE(review): Config::Loader#build_migrate also reads
# `migrate.max_rows_per_second`, which is not declared below, so that value
# appears to reach MigrateConfig unvalidated. Confirm whether it should be
# declared here.
DEFINITION = Dry::Schema.define do
  # Source database connection.
  required(:source).hash do
    required(:host).filled(:string)
    optional(:port).value(:integer, gt?: 0, lt?: 65_536)
    required(:database).filled(:string)
    required(:user).filled(:string)
    optional(:password).maybe(:string)
  end

  # Target database connection: same shape as source.
  optional(:target).hash do
    required(:host).filled(:string)
    optional(:port).value(:integer, gt?: 0, lt?: 65_536)
    required(:database).filled(:string)
    required(:user).filled(:string)
    optional(:password).maybe(:string)
  end

  optional(:migrate).hash do
    optional(:replication_slot).filled(:string)
    optional(:publication).filled(:string)
    optional(:checkpoint_db).filled(:string)
    optional(:batch_size).value(:integer, gt?: 0)
    optional(:lag_threshold_bytes).value(:integer, gt?: 0)
    optional(:tables).array(:hash) do
      required(:name).filled(:string)
      optional(:optimize_column_order).value(:bool)
      # columns: dynamic keys (column names) — validated structurally in Loader
      optional(:columns).value(:hash)
      optional(:add_columns).array(:hash) do
        required(:name).filled(:string)
        required(:type).filled(:string)
        optional(:default).maybe(:string)
      end
    end
  end

  optional(:analyze).hash do
    # nil means "use tables from migrate section"
    optional(:tables).array(:string)
  end

  optional(:verify).hash do
    optional(:sample_size).value(:integer, gt?: 0)
  end

  optional(:cutover).hash do
    # sequence_buffer may be zero; lag_drain_timeout must be positive.
    optional(:sequence_buffer).value(:integer, gteq?: 0)
    optional(:lag_drain_timeout).value(:integer, gt?: 0)
  end
end
|
|
60
|
+
end # module Schema
|
|
61
|
+
end
|
|
62
|
+
end
|
data/lib/pcrd/config.rb
ADDED
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "pg"
|
|
4
|
+
|
|
5
|
+
module Pcrd
|
|
6
|
+
module Connection
|
|
7
|
+
class Client
|
|
8
|
+
# Conservative per-session defaults applied to every connection.
|
|
9
|
+
#
|
|
10
|
+
# application_name — identifies pcrd in pg_stat_activity
|
|
11
|
+
# lock_timeout=5s — fail fast instead of blocking
|
|
12
|
+
# production behind a lock (DDL etc.)
|
|
13
|
+
# idle_in_transaction_session_timeout — release locks if a transaction is
|
|
14
|
+
# =60s left open idle (e.g. a stalled tool)
|
|
15
|
+
# statement_timeout=0 — DISABLED on purpose: backfill COPY
|
|
16
|
+
# and large batches run for a long
|
|
17
|
+
# time and must not be killed
|
|
18
|
+
#
|
|
19
|
+
# Override per pool via `settings:`; values are GUC strings (units allowed).
|
|
20
|
+
DEFAULT_SESSION_SETTINGS = {
|
|
21
|
+
"application_name" => "pcrd",
|
|
22
|
+
"lock_timeout" => "5s",
|
|
23
|
+
"idle_in_transaction_session_timeout" => "60s",
|
|
24
|
+
"statement_timeout" => "0"
|
|
25
|
+
}.freeze
|
|
26
|
+
|
|
27
|
+
attr_reader :session_settings
|
|
28
|
+
|
|
29
|
+
# config   - connection settings object (host, port, database, user,
#            password are read from it when connecting).
# settings - per-session overrides merged on top of
#            DEFAULT_SESSION_SETTINGS.
#
# No connection is opened here; it is established on first use.
def initialize(config, settings: {})
  @conn = nil
  @config = config
  @session_settings = DEFAULT_SESSION_SETTINGS.merge(settings)
end
|
|
34
|
+
|
|
35
|
+
# For parameterized queries (SELECT, INSERT with $1 placeholders).
# Any PG::Error triggers reset_connection! and is re-raised as Error with
# the same message.
def exec(sql, params = [])
  pg = connection
  pg.exec_params(sql, params)
rescue PG::Error => failure
  reset_connection!
  raise Error, failure.message
end
|
|
42
|
+
|
|
43
|
+
# For DDL and multi-statement SQL where no parameter substitution is needed.
# Any PG::Error triggers reset_connection! and is re-raised as Error with
# the same message.
def exec_sql(sql)
  pg = connection
  pg.exec(sql)
rescue PG::Error => failure
  reset_connection!
  raise Error, failure.message
end
|
|
50
|
+
|
|
51
|
+
# For COPY ... FROM STDIN. Yields the raw PG::Connection so the caller
# can call conn.put_copy_data(line) inside the block.
#
# A failed COPY can leave the connection mid-COPY or in an aborted
# transaction, so on PG::Error the connection is reset (like exec/exec_sql)
# before the failure is re-raised as Error.
def copy_data(sql)
  connection.copy_data(sql) do
    yield connection
  end
rescue PG::Error => failure
  reset_connection!
  raise Error, failure.message
end
|
|
61
|
+
|
|
62
|
+
# Quotes +name+ as an SQL identifier by delegating to the underlying
# connection's quote_ident (the connection is opened on demand).
def quote_ident(name) = connection.quote_ident(name)
|
|
65
|
+
|
|
66
|
+
# Escapes +val+ (converted with to_s) as an SQL literal by delegating to
# the underlying connection's escape_literal.
def escape_literal(val) = connection.escape_literal(val.to_s)
|
|
69
|
+
|
|
70
|
+
# Runs the block between BEGIN and COMMIT and returns the block's value.
# On any StandardError a ROLLBACK is attempted (its own failure is
# ignored) and the original error is re-raised.
def transaction
  exec("BEGIN")
  value = yield
  exec("COMMIT")
  value
rescue StandardError
  begin
    exec("ROLLBACK")
  rescue StandardError
    nil
  end
  raise
end
|
|
79
|
+
|
|
80
|
+
# Closes the underlying connection, if any, and forgets it so the next
# call reconnects.
def close
  @conn.close unless @conn.nil?
  @conn = nil
end
|
|
84
|
+
|
|
85
|
+
# Truthy when a connection object exists and has not been finished.
# Falsy (nil) when no connection has been opened yet.
def connected?
  handle = @conn
  handle ? !handle.finished? : handle
end
|
|
88
|
+
|
|
89
|
+
# libpq options string that applies the session settings at connect time
# (-c key=value), so they are in force for the very first statement.
# application_name is excluded here — it is passed as the dedicated
# connect parameter because a -c value is overridden by libpq's
# fallback_application_name.
#
# libpq splits the options string on whitespace, so whitespace and
# backslashes inside a value are backslash-escaped. Without this a setting
# such as "search_path" => "a, b" would be cut into separate arguments.
def session_options
  @session_settings.filter_map do |key, value|
    next if key == "application_name"

    escaped = value.to_s.gsub(/[\\\s]/) { |char| "\\#{char}" }
    "-c #{key}=#{escaped}"
  end.join(" ")
end
|
|
100
|
+
|
|
101
|
+
private
|
|
102
|
+
|
|
103
|
+
# Returns the live connection, connecting first when none is open.
def connection
  return @conn if connected?

  @conn = connect
end
|
|
107
|
+
|
|
108
|
+
# Opens a new PG connection from @config, passing application_name as its
# own connect parameter and the remaining session settings through the
# libpq options string. PG::ConnectionBad is re-raised as Error naming the
# host, port and database.
def connect
  params = {
    host: @config.host,
    port: @config.port,
    dbname: @config.database,
    user: @config.user,
    password: @config.password,
    application_name: @session_settings["application_name"],
    options: session_options
  }
  PG.connect(**params)
rescue PG::ConnectionBad => e
  raise Error, "Cannot connect to #{@config.host}:#{@config.port}/#{@config.database}: #{e.message}"
end
|
|
121
|
+
|
|
122
|
+
# Best-effort recovery after a failed statement. Never raises.
#
# If the connection is still healthy it may be in an aborted transaction,
# so a ROLLBACK is attempted to let subsequent commands on the same
# connection proceed.
#
# If the connection is not usable (already finished, status other than
# CONNECTION_OK, or the ROLLBACK itself fails) it is closed and forgotten.
# Otherwise connected? would keep reporting it as open and every later
# call would reuse the dead handle instead of reconnecting.
def reset_connection!
  conn = @conn
  return unless conn

  begin
    healthy = !conn.finished? && conn.status == PG::CONNECTION_OK
    conn.exec("ROLLBACK") if healthy
  rescue StandardError
    healthy = false
  end
  return if healthy

  begin
    conn.finish unless conn.finished?
  rescue StandardError
    nil
  end
  @conn = nil
end
|
|
127
|
+
end
|
|
128
|
+
end
|
|
129
|
+
end
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "pg"
|
|
4
|
+
|
|
5
|
+
module Pcrd
|
|
6
|
+
module Connection
|
|
7
|
+
# Manages a PostgreSQL logical replication connection.
|
|
8
|
+
#
|
|
9
|
+
# Opened with replication: 'database' so the server accepts streaming
|
|
10
|
+
# replication protocol commands. Use open → start_replication, then
|
|
11
|
+
# poll with get_copy_data / respond with put_copy_data.
|
|
12
|
+
class Replication
|
|
13
|
+
# START_REPLICATION is replication-protocol SQL, not ordinary SQL: the
|
|
14
|
+
# slot name and LSN are interpolated as bare tokens, so they must be
|
|
15
|
+
# validated rather than quoted. Slot names follow PostgreSQL's own rule
|
|
16
|
+
# (lowercase letters, digits, underscore; max 63). LSN is the standard
|
|
17
|
+
# hex/hex form. Both are config/checkpoint-derived, so validate them.
|
|
18
|
+
SLOT_NAME_RE = /\A[a-z0-9_]{1,63}\z/
|
|
19
|
+
LSN_RE = %r{\A[0-9A-Fa-f]{1,8}/[0-9A-Fa-f]{1,8}\z}
|
|
20
|
+
|
|
21
|
+
# config - connection settings object (host, port, database, user,
#          password are read from it by #open).
#
# The connection itself is not opened until #open is called.
def initialize(config)
  @config, @conn = config, nil
end
|
|
25
|
+
|
|
26
|
+
# Opens the replication connection (replication: "database") and returns
# self. Any PG error, connection failures included, is re-raised as Error
# naming the host, port and database.
def open
  params = {
    host: @config.host,
    port: @config.port,
    dbname: @config.database,
    user: @config.user,
    password: @config.password,
    application_name: "pcrd-replication",
    replication: "database"
  }
  @conn = PG.connect(**params)
  self
rescue PG::ConnectionBad, PG::Error => e
  raise Error, "Replication connection failed to " \
               "#{@config.host}:#{@config.port}/#{@config.database}: #{e.message}"
end
|
|
41
|
+
|
|
42
|
+
# Sends START_REPLICATION and enters COPY streaming mode.
# Uses send_query + get_result (not exec) so the CopyBoth response is
# handled correctly and the connection is left in streaming copy mode.
#
# get_result does not raise for a failed command; it returns a result
# carrying the error status. The result is therefore checked explicitly so
# that a server-side failure (unknown slot, unknown publication, ...)
# surfaces as Error instead of being mistaken for a started stream.
#
# slot_name - replication slot name; validated, then interpolated bare.
# pub_name  - publication name; single quotes are doubled for the literal.
# start_lsn - LSN to start from, in hex/hex form (default "0/0").
#
# Returns self. Raises Error on invalid arguments or any PG failure.
def start_replication(slot_name:, pub_name:, start_lsn: "0/0")
  validate_slot_name!(slot_name)
  validate_lsn!(start_lsn)
  pub_id = pub_name.gsub("'", "''")

  @conn.send_query(
    "START_REPLICATION SLOT #{slot_name} LOGICAL #{start_lsn} " \
    "(proto_version '1', publication_names '#{pub_id}')"
  )
  result = @conn.get_result # reads CopyBothResponse; puts connection in copy mode
  result&.check # raises PG::Error when the server rejected the command
  self
rescue PG::Error => e
  raise Error, "START_REPLICATION failed: #{e.message}"
end
|
|
59
|
+
|
|
60
|
+
# Waits up to `timeout` seconds for data on the replication socket.
# Returns whatever the socket IO's wait_readable returns: a truthy value
# when data is available, a falsy one when the timeout expired.
def wait_readable(timeout)
  socket = @conn.socket_io
  socket.wait_readable(timeout)
end
|
|
65
|
+
|
|
66
|
+
# Returns a String (message bytes), nil (no data yet), or false (stream ended).
# Call after wait_readable returns true, or after consume_input.
# Pending input is consumed first, then one message is requested from the
# connection with get_copy_data(true). PG errors are re-raised as Error.
def get_copy_data
  pg = @conn
  pg.consume_input
  pg.get_copy_data(true)
rescue PG::Error => failure
  raise Error, failure.message
end
|
|
74
|
+
|
|
75
|
+
# Sends a client message (keepalive response) to the server.
# PG errors are re-raised as Error with the same message.
def put_copy_data(data)
  pg = @conn
  pg.put_copy_data(data)
rescue PG::Error => failure
  raise Error, failure.message
end
|
|
81
|
+
|
|
82
|
+
# Finishes the replication connection, if any, and forgets it.
def close
  @conn.finish unless @conn.nil?
  @conn = nil
end
|
|
86
|
+
|
|
87
|
+
# Truthy when a connection object exists and has not been finished.
# Falsy (nil) before #open and after #close.
def connected?
  handle = @conn
  handle ? !handle.finished? : handle
end
|
|
90
|
+
|
|
91
|
+
private
|
|
92
|
+
|
|
93
|
+
# Raises Error unless +slot_name+ (compared as a String) matches
# SLOT_NAME_RE. Returns nil for a valid name.
def validate_slot_name!(slot_name)
  unless SLOT_NAME_RE.match?(slot_name.to_s)
    raise Error,
          "Invalid replication slot name #{slot_name.inspect}: must be 1-63 " \
          "characters of lowercase letters, digits, or underscores."
  end
end
|
|
100
|
+
|
|
101
|
+
# Raises Error unless +lsn+ (compared as a String) matches LSN_RE, i.e.
# the hex/hex form. Returns nil for a valid LSN.
def validate_lsn!(lsn)
  unless LSN_RE.match?(lsn.to_s)
    raise Error, "Invalid start LSN #{lsn.inspect}: expected hex/hex form like \"0/0\"."
  end
end
|
|
106
|
+
end
|
|
107
|
+
end
|
|
108
|
+
end
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Pcrd
|
|
4
|
+
module Cutover
|
|
5
|
+
# Orchestrates the cutover sequence.
|
|
6
|
+
#
|
|
7
|
+
# Preconditions (operator's responsibility):
|
|
8
|
+
# - Application is in maintenance mode (writes to source have stopped)
|
|
9
|
+
# - pcrd migrate is running in streaming mode (or was cleanly stopped)
|
|
10
|
+
#
|
|
11
|
+
# Steps:
|
|
12
|
+
# 1. Verify the migration is at a cuttable phase (backfill complete)
|
|
13
|
+
# 2. Drain remaining replication lag to zero (with timeout)
|
|
14
|
+
# 3. Advance sequences on target
|
|
15
|
+
# 4. Verify row counts match
|
|
16
|
+
# 5. Print cutover report and "READY" signal
|
|
17
|
+
class Orchestrator
|
|
18
|
+
# Outcome of a cutover run.
#
#   passed           - overall verdict computed by #run
#   row_counts       - Hash<table_name, {source:, target:}>
#   sequence_results - Array<Sequences::SequenceResult>
#   lag_at_cutover   - Integer bytes
#   warnings         - Array<String>
Result = Data.define(*%i[passed row_counts sequence_results lag_at_cutover warnings])
|
|
25
|
+
|
|
26
|
+
# source_pool - client used for queries against the source database
# target_pool - client used for queries against the target database
# config      - loaded configuration (its migrate and cutover sections
#               are read during #run)
def initialize(source_pool:, target_pool:, config:)
  @source, @target, @config = source_pool, target_pool, config
end
|
|
31
|
+
|
|
32
|
+
# Runs the full cutover sequence.
# on_progress: optional Proc called with a status string during drain
#
# Returns a Result. `passed` is true only when every table's row count was
# actually read on both sides and the two counts are equal. A table whose
# count could not be verified is recorded as { source: nil, target: nil }
# and must not pass: nil == nil is not evidence that the tables match.
# Lag that could not be drained to zero is reported in `warnings` (and in
# `lag_at_cutover`) so it stays visible even without an on_progress
# callback.
def run(on_progress: nil)
  warnings = []
  table_names = (@config.migrate&.tables || []).map(&:name)

  # 1. Drain remaining lag
  on_progress&.call("Draining replication lag...")
  lag = drain_lag(table_names, on_progress: on_progress)
  if lag && lag > 0
    warnings << "replication lag was not fully drained (#{lag} bytes remaining)"
  end

  # 2. Advance sequences
  on_progress&.call("Advancing target sequences...")
  seq_results = Sequences.new(
    source_pool: @source,
    target_pool: @target,
    safety_buffer: @config.cutover&.sequence_buffer || 1_000
  ).advance(table_names)

  # 3. Row count verification
  on_progress&.call("Verifying row counts...")
  row_counts = verify_counts(table_names, warnings)

  passed = row_counts.all? do |_, counts|
    !counts[:source].nil? && counts[:source] == counts[:target]
  end

  Result.new(
    passed: passed,
    row_counts: row_counts,
    sequence_results: seq_results,
    lag_at_cutover: lag,
    warnings: warnings
  )
end
|
|
64
|
+
|
|
65
|
+
private
|
|
66
|
+
|
|
67
|
+
# Polls the replication lag once per second until it reaches zero, the
# slot stops reporting a value, or the configured timeout elapses.
#
# table_names - accepted for interface compatibility; not used here.
# on_progress - optional callable that receives a status line per poll.
#
# The timeout comes from cutover.lag_drain_timeout (default 300 seconds)
# and is measured on the monotonic clock, so wall-clock adjustments cannot
# shorten or extend the drain window.
#
# Returns the last observed lag in bytes: 0 when drained, when no slot is
# configured, or when the lag is unknown; otherwise the remaining lag at
# the moment the timeout expired.
def drain_lag(table_names, on_progress:)
  slot_name = @config.migrate&.replication_slot
  return 0 unless slot_name

  timeout = @config.cutover&.lag_drain_timeout || 300
  clock = -> { Process.clock_gettime(Process::CLOCK_MONOTONIC) }
  deadline = clock.call + timeout
  lag_monitor = Monitor::Lag.new(source_pool: @source, slot_name: slot_name)

  loop do
    lag = lag_monitor.lag_bytes
    on_progress&.call(" Lag: #{lag ? "#{lag} bytes" : "unknown"}")

    return 0 unless lag # slot may have been dropped
    return lag if lag.zero?

    if clock.call > deadline
      on_progress&.call(" Warning: lag did not reach zero within #{timeout}s (#{lag} bytes remaining)")
      return lag
    end

    sleep 1
  end
end
|
|
90
|
+
|
|
91
|
+
# Counts the rows of every table on source and target.
#
# Returns a Hash of table name => { source:, target: }. A mismatch adds a
# message to +warnings+. When a count query raises Connection::Error the
# table is recorded with nil counts and a warning explains why.
def verify_counts(table_names, warnings)
  counts = {}

  table_names.each do |table|
    begin
      src_rows, tgt_rows = [@source, @target].map do |pool|
        pool.exec("SELECT COUNT(*) FROM #{pool.quote_ident(table)}")[0]["count"].to_i
      end

      counts[table] = { source: src_rows, target: tgt_rows }
      unless src_rows == tgt_rows
        warnings << "#{table}: row count mismatch (source=#{src_rows}, target=#{tgt_rows})"
      end
    rescue Connection::Error => e
      warnings << "#{table}: could not verify row count: #{e.message}"
      counts[table] = { source: nil, target: nil }
    end
  end

  counts
end
|
|
106
|
+
end
|
|
107
|
+
end
|
|
108
|
+
end
|