pcrd 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +24 -0
  3. data/LICENSE +21 -0
  4. data/README.md +614 -0
  5. data/bin/pcrd +7 -0
  6. data/lib/pcrd/advisory_lock.rb +50 -0
  7. data/lib/pcrd/apply/engine.rb +184 -0
  8. data/lib/pcrd/apply/worker.rb +97 -0
  9. data/lib/pcrd/backfill/batch.rb +158 -0
  10. data/lib/pcrd/backfill/engine.rb +153 -0
  11. data/lib/pcrd/checkpoint/store.rb +217 -0
  12. data/lib/pcrd/cli.rb +274 -0
  13. data/lib/pcrd/commands/analyze.rb +125 -0
  14. data/lib/pcrd/commands/cleanup.rb +112 -0
  15. data/lib/pcrd/commands/demo.rb +152 -0
  16. data/lib/pcrd/commands/readiness.rb +30 -0
  17. data/lib/pcrd/commands/status.rb +129 -0
  18. data/lib/pcrd/commands/verify.rb +172 -0
  19. data/lib/pcrd/config/add_column.rb +7 -0
  20. data/lib/pcrd/config/analyze_config.rb +8 -0
  21. data/lib/pcrd/config/column_spec.rb +10 -0
  22. data/lib/pcrd/config/connection.rb +7 -0
  23. data/lib/pcrd/config/cutover_config.rb +7 -0
  24. data/lib/pcrd/config/load_error.rb +7 -0
  25. data/lib/pcrd/config/loader.rb +158 -0
  26. data/lib/pcrd/config/migrate_config.rb +21 -0
  27. data/lib/pcrd/config/root.rb +9 -0
  28. data/lib/pcrd/config/schema.rb +62 -0
  29. data/lib/pcrd/config/table.rb +9 -0
  30. data/lib/pcrd/config/verify_config.rb +7 -0
  31. data/lib/pcrd/config.rb +7 -0
  32. data/lib/pcrd/connection/client.rb +129 -0
  33. data/lib/pcrd/connection/error.rb +7 -0
  34. data/lib/pcrd/connection/replication.rb +108 -0
  35. data/lib/pcrd/cutover/orchestrator.rb +108 -0
  36. data/lib/pcrd/cutover/sequences.rb +138 -0
  37. data/lib/pcrd/demo/generator.rb +214 -0
  38. data/lib/pcrd/demo/schema.rb +154 -0
  39. data/lib/pcrd/error.rb +12 -0
  40. data/lib/pcrd/migration/orchestrator.rb +272 -0
  41. data/lib/pcrd/monitor/lag.rb +107 -0
  42. data/lib/pcrd/options.rb +15 -0
  43. data/lib/pcrd/output/analyze_printer.rb +173 -0
  44. data/lib/pcrd/output/cutover_printer.rb +128 -0
  45. data/lib/pcrd/output/preflight_printer.rb +119 -0
  46. data/lib/pcrd/output/readiness_printer.rb +72 -0
  47. data/lib/pcrd/preflight.rb +331 -0
  48. data/lib/pcrd/readiness/manifest.rb +201 -0
  49. data/lib/pcrd/replication/consumer.rb +235 -0
  50. data/lib/pcrd/replication/error.rb +10 -0
  51. data/lib/pcrd/replication/pgoutput/messages.rb +68 -0
  52. data/lib/pcrd/replication/pgoutput/parser.rb +316 -0
  53. data/lib/pcrd/reporter/console.rb +46 -0
  54. data/lib/pcrd/reporter/null.rb +14 -0
  55. data/lib/pcrd/schema/column.rb +59 -0
  56. data/lib/pcrd/schema/ddl.rb +71 -0
  57. data/lib/pcrd/schema/diff_entry.rb +36 -0
  58. data/lib/pcrd/schema/differ.rb +175 -0
  59. data/lib/pcrd/schema/object_reader.rb +187 -0
  60. data/lib/pcrd/schema/packer.rb +90 -0
  61. data/lib/pcrd/schema/reader.rb +118 -0
  62. data/lib/pcrd/schema/setup.rb +143 -0
  63. data/lib/pcrd/schema/setup_error.rb +9 -0
  64. data/lib/pcrd/schema/table_not_found.rb +8 -0
  65. data/lib/pcrd/schema/type_registry.rb +116 -0
  66. data/lib/pcrd/sql.rb +55 -0
  67. data/lib/pcrd/transform/row_transformer.rb +69 -0
  68. data/lib/pcrd/transform/type_map.rb +209 -0
  69. data/lib/pcrd/transform/validator.rb +106 -0
  70. data/lib/pcrd/version.rb +5 -0
  71. data/lib/pcrd.rb +11 -0
  72. metadata +231 -0
@@ -0,0 +1,158 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "yaml"
4
+
5
+ module Pcrd
6
+ module Config
7
MIGRATE_DEFAULTS = {
  batch_size: 10_000,
  lag_threshold_bytes: 1_048_576, # 1 MB
  checkpoint_db: "./pcrd_checkpoint.sqlite3"
}.freeze

VERIFY_DEFAULTS = { sample_size: 1_000 }.freeze
CUTOVER_DEFAULTS = { sequence_buffer: 1_000, lag_drain_timeout: 300 }.freeze

# Reads a YAML config file, validates it against Schema::DEFINITION and
# builds the immutable Config value objects from it.
class Loader
  # PostgreSQL truncates/rejects identifiers longer than 63 bytes, so names
  # derived from a table name are capped at this length.
  MAX_IDENTIFIER_LENGTH = 63

  # Returns a Config::Root. Raises Config::LoadError on any problem.
  def self.load(path)
    new(path).load
  end

  def initialize(path)
    @path = path
  end

  # Runs the read -> parse -> validate -> build pipeline for @path.
  def load
    raw = read_file
    data = parse_yaml(raw)
    validate!(data)
    build(data)
  end

  private

  # Returns the file contents; every filesystem failure becomes a LoadError.
  def read_file
    File.read(@path)
  rescue Errno::ENOENT
    raise LoadError, "Config file not found: #{@path}"
  rescue Errno::EACCES
    raise LoadError, "Cannot read config file (permission denied): #{@path}"
  rescue SystemCallError => e
    # e.g. the path is a directory (EISDIR) or the read failed (EIO).
    raise LoadError, "Cannot read config file #{@path}: #{e.message}"
  end

  # Parses the YAML text into a Hash with symbol keys.
  #
  # An empty (or comment-only) document yields an empty Hash so that schema
  # validation can report the missing keys. Any other non-mapping document
  # and any Psych failure (syntax error, alias, disallowed tag) is reported
  # as a LoadError instead of leaking a Psych exception.
  def parse_yaml(raw)
    data = YAML.safe_load(raw, symbolize_names: true) || {}
    return data if data.is_a?(Hash)

    raise LoadError,
          "Config file must contain a YAML mapping at the top level, got #{data.class}"
  rescue Psych::SyntaxError => e
    raise LoadError, "Config file has invalid YAML: #{e.message}"
  rescue Psych::Exception => e
    raise LoadError, "Config file uses unsupported YAML: #{e.message}"
  end

  # Raises LoadError listing every schema violation, one per line.
  def validate!(data)
    result = Schema::DEFINITION.call(data)
    return if result.success?

    messages = result.errors.messages.map do |msg|
      " #{msg.path.join(".")}: #{msg.text}"
    end
    raise LoadError, "Config file is invalid:\n#{messages.join("\n")}"
  end

  # Assembles the Root object; every section except source is optional.
  def build(data)
    Root.new(
      source: build_connection(data[:source], env_prefix: "SOURCE"),
      target: data[:target] ? build_connection(data[:target], env_prefix: "TARGET") : nil,
      migrate: data[:migrate] ? build_migrate(data[:migrate]) : nil,
      analyze: data[:analyze] ? build_analyze(data[:analyze]) : nil,
      verify: data[:verify] ? build_verify(data[:verify]) : nil,
      cutover: data[:cutover] ? build_cutover(data[:cutover]) : nil,
      path: @path
    )
  end

  # The password comes from the config, then PCRD_<prefix>_PASSWORD; when
  # both are absent it stays nil (falls back to .pgpass / PGPASSWORD at
  # connection time).
  def build_connection(raw, env_prefix:)
    Connection.new(
      host: raw[:host],
      port: raw.fetch(:port, 5432),
      database: raw[:database],
      user: raw[:user],
      password: raw[:password] || ENV["PCRD_#{env_prefix}_PASSWORD"]
    )
  end

  def build_migrate(raw)
    slot_base = derive_slot_base(raw[:tables])
    MigrateConfig.new(
      replication_slot: raw.fetch(:replication_slot) { derived_identifier("pcrd_", slot_base) },
      publication: raw.fetch(:publication) { derived_identifier("pcrd_pub_", slot_base) },
      checkpoint_db: raw.fetch(:checkpoint_db, MIGRATE_DEFAULTS[:checkpoint_db]),
      batch_size: raw.fetch(:batch_size, MIGRATE_DEFAULTS[:batch_size]),
      lag_threshold_bytes: raw.fetch(:lag_threshold_bytes, MIGRATE_DEFAULTS[:lag_threshold_bytes]),
      max_rows_per_second: raw[:max_rows_per_second],
      tables: (raw[:tables] || []).map { build_table(_1) }
    )
  end

  def build_table(raw)
    Table.new(
      name: raw[:name],
      optimize_column_order: raw.fetch(:optimize_column_order, false),
      columns: build_column_specs(raw[:columns] || {}),
      add_columns: (raw[:add_columns] || []).map { build_add_column(_1) }
    )
  end

  # Converts the `columns:` mapping into Hash<String, ColumnSpec>.
  # A column listed with no value (nil) gets an all-default spec.
  def build_column_specs(raw_columns)
    raw_columns.to_h do |column, spec|
      name = column.to_s
      spec ||= {}
      validate_column_spec!(spec, column: name)
      column_spec = ColumnSpec.new(
        type: spec[:type]&.to_s,
        rename: spec[:rename]&.to_s,
        drop: spec.fetch(:drop, false)
      )
      [name, column_spec]
    end
  end

  # The schema only checks that `columns` is a hash, so the per-column
  # shape is checked here. Raises LoadError when the spec is not a mapping
  # or when it combines `drop` with `type`/`rename`.
  def validate_column_spec!(spec, column: nil)
    suffix = column ? " (column: #{column})" : ""

    unless spec.is_a?(Hash)
      raise LoadError,
            "A column spec must be a mapping of `type`, `rename` and/or `drop`, " \
            "got #{spec.class}#{suffix}"
    end
    return unless spec[:drop] && (spec[:type] || spec[:rename])

    raise LoadError,
          "A column spec cannot combine `drop: true` with `type` or `rename`#{suffix}"
  end

  def build_add_column(raw)
    AddColumn.new(
      name: raw[:name],
      type: raw[:type],
      default: raw[:default]
    )
  end

  def build_analyze(raw)
    AnalyzeConfig.new(tables: raw[:tables]&.map(&:to_s))
  end

  def build_verify(raw)
    VerifyConfig.new(
      sample_size: raw.fetch(:sample_size, VERIFY_DEFAULTS[:sample_size])
    )
  end

  def build_cutover(raw)
    CutoverConfig.new(
      sequence_buffer: raw.fetch(:sequence_buffer, CUTOVER_DEFAULTS[:sequence_buffer]),
      lag_drain_timeout: raw.fetch(:lag_drain_timeout, CUTOVER_DEFAULTS[:lag_drain_timeout])
    )
  end

  # Derives a short stable name for the replication slot / publication
  # from the first table name when not explicitly configured. Every
  # non-word character becomes "_" and the result is lower-cased.
  def derive_slot_base(tables)
    first = tables&.first&.dig(:name)
    first ? first.gsub(/\W/, "_").downcase : "migration"
  end

  # Joins prefix and base and caps the result at the identifier limit, so a
  # long table name cannot produce an over-long derived name. Names that
  # already fit are returned unchanged.
  def derived_identifier(prefix, base)
    "#{prefix}#{base}"[0, MAX_IDENTIFIER_LENGTH]
  end
end
157
+ end
158
+ end
@@ -0,0 +1,21 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Pcrd
4
+ module Config
5
# Immutable settings of the `migrate` section.
MigrateConfig = Data.define(
  *%i[replication_slot publication checkpoint_db batch_size lag_threshold_bytes tables max_rows_per_second]
) do
  # max_rows_per_second may be omitted; it then defaults to nil, which the
  # original comment describes as "unthrottled". All other members are
  # still required.
  def initialize(max_rows_per_second: nil, **attributes)
    super(**attributes, max_rows_per_second:)
  end
end
20
+ end
21
+ end
@@ -0,0 +1,9 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Pcrd
4
+ module Config
5
# Value object handed back by Config::Loader.load; it bundles every config
# section plus the path the config was read from. Only `source` is always
# present: target, migrate, analyze, verify and cutover may each be nil.
Root = Data.define(*%i[source target migrate analyze verify cutover path])
8
+ end
9
+ end
@@ -0,0 +1,62 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "dry-schema"
4
+
5
+ module Pcrd
6
+ module Config
7
module Schema
  # Validates a YAML config hash (already natively typed by YAML.safe_load).
  # Uses Dry::Schema.define (no coercion) since YAML gives us real Ruby types.
  #
  # Only `source` is mandatory; every other section may be omitted.
  DEFINITION = Dry::Schema.define do
    required(:source).hash do
      required(:host).filled(:string)
      optional(:port).value(:integer, gt?: 0, lt?: 65_536)
      required(:database).filled(:string)
      required(:user).filled(:string)
      optional(:password).maybe(:string)
    end

    optional(:target).hash do
      required(:host).filled(:string)
      optional(:port).value(:integer, gt?: 0, lt?: 65_536)
      required(:database).filled(:string)
      required(:user).filled(:string)
      optional(:password).maybe(:string)
    end

    optional(:migrate).hash do
      optional(:replication_slot).filled(:string)
      optional(:publication).filled(:string)
      optional(:checkpoint_db).filled(:string)
      optional(:batch_size).value(:integer, gt?: 0)
      optional(:lag_threshold_bytes).value(:integer, gt?: 0)
      # Read by Loader#build_migrate; nil (or omitting the key) means no
      # throttling. Without this rule the value was never validated.
      # NOTE(review): assumes the throttle is integer-valued — confirm.
      optional(:max_rows_per_second).maybe(:integer, gt?: 0)
      optional(:tables).array(:hash) do
        required(:name).filled(:string)
        optional(:optimize_column_order).value(:bool)
        # columns: dynamic keys (column names) — validated structurally in Loader
        optional(:columns).value(:hash)
        optional(:add_columns).array(:hash) do
          required(:name).filled(:string)
          required(:type).filled(:string)
          optional(:default).maybe(:string)
        end
      end
    end

    optional(:analyze).hash do
      # nil means "use tables from migrate section"
      optional(:tables).array(:string)
    end

    optional(:verify).hash do
      optional(:sample_size).value(:integer, gt?: 0)
    end

    optional(:cutover).hash do
      # A zero buffer is allowed; the drain timeout must be positive.
      optional(:sequence_buffer).value(:integer, gteq?: 0)
      optional(:lag_drain_timeout).value(:integer, gt?: 0)
    end
  end
end # module Schema
61
+ end
62
+ end
@@ -0,0 +1,9 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Pcrd
4
+ module Config
5
# One table entry of the migrate section.
#
# columns     - Hash mapping a source column name to its ColumnSpec
# add_columns - Array of AddColumn
Table = Data.define(*%i[name optimize_column_order columns add_columns])
8
+ end
9
+ end
@@ -0,0 +1,7 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Pcrd
4
+ module Config
5
# Immutable settings of the `verify` section; sample_size is its only member.
VerifyConfig = Data.define(
  :sample_size
)
6
+ end
7
+ end
@@ -0,0 +1,7 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Pcrd
4
+ module Config
5
# File name of the default pcrd config.
# NOTE(review): presumably what the CLI loads when no path is given — confirm.
DEFAULT_CONFIG_FILE = "pcrd.config.yml"
6
+ end
7
+ end
@@ -0,0 +1,129 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "pg"
4
+
5
+ module Pcrd
6
+ module Connection
7
# Lazily-connected wrapper around a single PG::Connection. Every PG failure
# is re-raised as Connection::Error after the connection has been put back
# into a usable state (or discarded so the next call reconnects).
class Client
  # Conservative per-session defaults applied to every connection.
  #
  #   application_name                    — identifies pcrd in pg_stat_activity
  #   lock_timeout=5s                     — fail fast instead of blocking
  #                                         production behind a lock (DDL etc.)
  #   idle_in_transaction_session_timeout — release locks if a transaction is
  #     =60s                                left open idle (e.g. a stalled tool)
  #   statement_timeout=0                 — DISABLED on purpose: backfill COPY
  #                                         and large batches run for a long
  #                                         time and must not be killed
  #
  # Override per pool via `settings:`; values are GUC strings (units allowed).
  DEFAULT_SESSION_SETTINGS = {
    "application_name" => "pcrd",
    "lock_timeout" => "5s",
    "idle_in_transaction_session_timeout" => "60s",
    "statement_timeout" => "0"
  }.freeze

  attr_reader :session_settings

  # config   - object responding to host, port, database, user, password
  # settings - Hash of GUC name => value merged over the defaults
  def initialize(config, settings: {})
    @config = config
    @session_settings = DEFAULT_SESSION_SETTINGS.merge(settings)
    @conn = nil
  end

  # For parameterized queries (SELECT, INSERT with $1 placeholders).
  def exec(sql, params = [])
    connection.exec_params(sql, params)
  rescue PG::Error => e
    reset_connection!
    raise Error, e.message
  end

  # For DDL and multi-statement SQL where no parameter substitution is needed.
  def exec_sql(sql)
    connection.exec(sql)
  rescue PG::Error => e
    reset_connection!
    raise Error, e.message
  end

  # For COPY ... FROM STDIN. Yields the raw PG::Connection so the caller
  # can call conn.put_copy_data(line) inside the block.
  def copy_data(sql)
    connection.copy_data(sql) { yield connection }
  rescue PG::Error => e
    # A failed COPY can leave the connection mid-COPY or in an aborted
    # transaction; reset it like exec/exec_sql so the next call is usable.
    reset_connection!
    raise Error, e.message
  end

  def quote_ident(name)
    connection.quote_ident(name)
  end

  def escape_literal(val)
    connection.escape_literal(val.to_s)
  end

  # Runs the block between BEGIN and COMMIT and returns the block's value.
  # On any StandardError the transaction is rolled back (best effort) and
  # the original error is re-raised.
  def transaction
    exec("BEGIN")
    result = yield
    exec("COMMIT")
    result
  rescue StandardError
    # Roll back on the existing connection only; never reconnect just to
    # issue a ROLLBACK, and never let a rollback failure mask the cause.
    reset_connection!
    raise
  end

  def close
    @conn&.close
    @conn = nil
  end

  def connected?
    @conn && !@conn.finished?
  end

  # libpq options string that applies the session settings at connect time
  # (-c key=value), so they are in force for the very first statement.
  # application_name is excluded here — it is passed as the dedicated
  # connect parameter because a -c value is overridden by libpq's
  # fallback_application_name.
  #
  # libpq splits the options string on whitespace, so whitespace and
  # backslashes inside a value are backslash-escaped.
  def session_options
    @session_settings
      .reject { |key, _| key == "application_name" }
      .map { |key, value| "-c #{key}=#{escape_option_value(value)}" }
      .join(" ")
  end

  private

  def connection
    @conn = connect unless connected?
    @conn
  end

  def connect
    PG.connect(
      host: @config.host,
      port: @config.port,
      dbname: @config.database,
      user: @config.user,
      password: @config.password,
      application_name: @session_settings["application_name"],
      options: session_options
    )
  rescue PG::ConnectionBad => e
    raise Error, "Cannot connect to #{@config.host}:#{@config.port}/#{@config.database}: #{e.message}"
  end

  def escape_option_value(value)
    value.to_s.gsub(/[\\\s]/) { |char| "\\#{char}" }
  end

  # Best-effort recovery after a failed command.
  #
  # If the connection is in an aborted transaction, attempt a rollback so
  # subsequent commands on the same connection can proceed. If the link
  # itself is gone (closed, or status no longer CONNECTION_OK) the handle is
  # discarded so that the next call reconnects instead of failing forever.
  def reset_connection!
    return unless @conn

    begin
      @conn.exec("ROLLBACK") unless @conn.finished?
    rescue StandardError
      nil # rollback is best effort; a dead link is handled below
    end
    discard_connection! if @conn.finished? || @conn.status != PG::CONNECTION_OK
  end

  # Closes (if still open) and forgets the current connection.
  def discard_connection!
    @conn.close unless @conn.finished?
  rescue StandardError
    nil
  ensure
    @conn = nil
  end
end
128
+ end
129
+ end
@@ -0,0 +1,7 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Pcrd
4
+ module Connection
5
# Error raised by Connection::Client and Connection::Replication; it carries
# the message of the underlying PG failure.
class Error < Pcrd::Error
end
6
+ end
7
+ end
@@ -0,0 +1,108 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "pg"
4
+
5
+ module Pcrd
6
+ module Connection
7
+ # Manages a PostgreSQL logical replication connection.
8
+ #
9
+ # Opened with replication: 'database' so the server accepts streaming
10
+ # replication protocol commands. Use open → start_replication, then
11
+ # poll with get_copy_data / respond with put_copy_data.
12
class Replication
  # START_REPLICATION is replication-protocol SQL, not ordinary SQL: the
  # slot name and LSN are interpolated as bare tokens, so they must be
  # validated rather than quoted. Slot names follow PostgreSQL's own rule
  # (lowercase letters, digits, underscore; max 63). LSN is the standard
  # hex/hex form. Both are config/checkpoint-derived, so validate them.
  SLOT_NAME_RE = /\A[a-z0-9_]{1,63}\z/
  LSN_RE = %r{\A[0-9A-Fa-f]{1,8}/[0-9A-Fa-f]{1,8}\z}

  # config - object responding to host, port, database, user, password
  def initialize(config)
    @config = config
    @conn = nil
  end

  # Opens the connection with replication: "database". Returns self.
  # Raises Connection::Error when the connection cannot be established.
  def open
    @conn = PG.connect(
      host: @config.host,
      port: @config.port,
      dbname: @config.database,
      user: @config.user,
      password: @config.password,
      application_name: "pcrd-replication",
      replication: "database"
    )
    self
  rescue PG::Error => e # PG::ConnectionBad is a PG::Error
    raise Error, "Replication connection failed to " \
                 "#{@config.host}:#{@config.port}/#{@config.database}: #{e.message}"
  end

  # Sends START_REPLICATION and enters COPY streaming mode.
  # Uses send_query + get_result (not exec) so the CopyBoth response is
  # handled correctly and the connection is left in streaming copy mode.
  #
  # Raises Connection::Error when the slot name or LSN is malformed, when
  # the connection is not open, or when the server rejects the command.
  def start_replication(slot_name:, pub_name:, start_lsn: "0/0")
    ensure_open!
    validate_slot_name!(slot_name)
    validate_lsn!(start_lsn)
    pub_id = pub_name.to_s.gsub("'", "''")

    @conn.send_query(
      "START_REPLICATION SLOT #{slot_name} LOGICAL #{start_lsn} " \
      "(proto_version '1', publication_names '#{pub_id}')"
    )
    # get_result reads the CopyBothResponse and puts the connection in copy
    # mode, but it never raises for a server-side failure — the error is
    # only recorded on the result. check turns it into a PG::Error so that
    # e.g. a missing slot is not mistaken for a running stream.
    @conn.get_result&.check
    self
  rescue PG::Error => e
    raise Error, "START_REPLICATION failed: #{e.message}"
  end

  # Waits up to `timeout` seconds for data on the replication socket.
  # Returns a truthy value if data is available, nil if the timeout expired.
  def wait_readable(timeout)
    ensure_open!
    @conn.socket_io.wait_readable(timeout)
  end

  # Non-blocking read of the next replication message.
  # Returns a String (message bytes), false (no complete message available
  # yet), or nil (the copy stream has ended).
  # Call after wait_readable reported data.
  def get_copy_data
    ensure_open!
    @conn.consume_input
    @conn.get_copy_data(true)
  rescue PG::Error => e
    raise Error, e.message
  end

  # Sends a client message (keepalive response) to the server.
  def put_copy_data(data)
    ensure_open!
    @conn.put_copy_data(data)
  rescue PG::Error => e
    raise Error, e.message
  end

  def close
    @conn&.finish
    @conn = nil
  end

  def connected?
    @conn && !@conn.finished?
  end

  private

  # Fails with a Connection::Error (instead of a NoMethodError on nil) when
  # a streaming method is called before open or after close.
  def ensure_open!
    return if @conn

    raise Error, "Replication connection is not open; call open first."
  end

  def validate_slot_name!(slot_name)
    return if slot_name.to_s.match?(SLOT_NAME_RE)

    raise Error,
          "Invalid replication slot name #{slot_name.inspect}: must be 1-63 " \
          "characters of lowercase letters, digits, or underscores."
  end

  def validate_lsn!(lsn)
    return if lsn.to_s.match?(LSN_RE)

    raise Error, "Invalid start LSN #{lsn.inspect}: expected hex/hex form like \"0/0\"."
  end
end
107
+ end
108
+ end
@@ -0,0 +1,108 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Pcrd
4
+ module Cutover
5
+ # Orchestrates the cutover sequence.
6
+ #
7
+ # Preconditions (operator's responsibility):
8
+ # - Application is in maintenance mode (writes to source have stopped)
9
+ # - pcrd migrate is running in streaming mode (or was cleanly stopped)
10
+ #
11
+ # Steps:
12
+ # 1. Verify the migration is at a cuttable phase (backfill complete)
13
+ # 2. Drain remaining replication lag to zero (with timeout)
14
+ # 3. Advance sequences on target
15
+ # 4. Verify row counts match
16
+ # 5. Print cutover report and "READY" signal
17
# Drains replication lag, advances target sequences and compares row counts.
# NOTE(review): this class does not itself check that the backfill phase has
# completed; that precondition must be enforced by the caller.
class Orchestrator
  Result = Data.define(
    :passed,           # true only when every table was counted and matches
    :row_counts,       # Hash<table_name, {source:, target:}>
    :sequence_results, # Array<Sequences::SequenceResult>
    :lag_at_cutover,   # Integer bytes
    :warnings          # Array<String>
  )

  def initialize(source_pool:, target_pool:, config:)
    @source = source_pool
    @target = target_pool
    @config = config
  end

  # Runs the full cutover sequence and returns a Result.
  # on_progress: optional Proc called with a status string during drain
  def run(on_progress: nil)
    warnings = []
    table_names = (@config.migrate&.tables || []).map(&:name)

    # 1. Drain remaining lag
    on_progress&.call("Draining replication lag...")
    lag = drain_lag(warnings, on_progress: on_progress)

    # 2. Advance sequences
    on_progress&.call("Advancing target sequences...")
    seq_results = Sequences.new(
      source_pool: @source,
      target_pool: @target,
      safety_buffer: @config.cutover&.sequence_buffer || 1_000
    ).advance(table_names)

    # 3. Row count verification
    on_progress&.call("Verifying row counts...")
    row_counts = verify_counts(table_names, warnings)

    Result.new(
      passed: counts_match?(row_counts),
      row_counts: row_counts,
      sequence_results: seq_results,
      lag_at_cutover: lag,
      warnings: warnings
    )
  end

  private

  # A table whose count could not be read is stored as {source: nil,
  # target: nil}; nil == nil must not be treated as a successful match.
  def counts_match?(row_counts)
    row_counts.all? do |_, counts|
      !counts[:source].nil? && counts[:source] == counts[:target]
    end
  end

  # Polls the slot's lag once per second until it reaches zero or the
  # configured timeout elapses. Returns the last observed lag in bytes
  # (0 when no slot is configured or the lag is unknown).
  def drain_lag(warnings, on_progress:)
    slot_name = @config.migrate&.replication_slot
    return 0 unless slot_name

    timeout = @config.cutover&.lag_drain_timeout || 300
    # Monotonic clock: the deadline must not move with wall-clock changes.
    deadline = monotonic_now + timeout
    lag_monitor = Monitor::Lag.new(source_pool: @source, slot_name: slot_name)

    loop do
      lag = lag_monitor.lag_bytes
      on_progress&.call(" Lag: #{lag ? "#{lag} bytes" : "unknown"}")

      unless lag # slot may have been dropped
        warnings << "replication lag for slot #{slot_name} could not be determined"
        return 0
      end
      return lag if lag.zero?

      if monotonic_now > deadline
        on_progress&.call(" Warning: lag did not reach zero within #{timeout}s (#{lag} bytes remaining)")
        # Keep the warning in the result too, not only in the progress feed.
        warnings << "replication lag did not reach zero within #{timeout}s (#{lag} bytes remaining)"
        return lag
      end

      sleep 1
    end
  end

  def monotonic_now
    Process.clock_gettime(Process::CLOCK_MONOTONIC)
  end

  # Counts rows of every table on both sides. Mismatches and query failures
  # are appended to warnings; a failed table is recorded with nil counts.
  def verify_counts(table_names, warnings)
    table_names.each_with_object({}) do |name, counts|
      src_count = @source.exec("SELECT COUNT(*) FROM #{@source.quote_ident(name)}")[0]["count"].to_i
      tgt_count = @target.exec("SELECT COUNT(*) FROM #{@target.quote_ident(name)}")[0]["count"].to_i

      counts[name] = { source: src_count, target: tgt_count }

      if src_count != tgt_count
        warnings << "#{name}: row count mismatch (source=#{src_count}, target=#{tgt_count})"
      end
    rescue Connection::Error => e
      warnings << "#{name}: could not verify row count: #{e.message}"
      counts[name] = { source: nil, target: nil }
    end
  end
end
107
+ end
108
+ end