pg_easy_replicate 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,308 @@
1
+ # frozen_string_literal: true
2
+
3
module PgEasyReplicate
  # Orchestrates logical replication between the source and target databases:
  # creates/drops publications (source) and subscriptions (target), watches
  # replication lag, and performs the final switchover (source made
  # read-only, sequences refreshed on the target).
  class Orchestrate
    extend Helper

    # Maximum WAL lag in bytes (~200kb) tolerated before switchover proceeds.
    # Defined on the class (not inside `class << self`) so callers can also
    # reach them as Orchestrate::DEFAULT_LAG.
    DEFAULT_LAG = 200_000
    # Seconds to sleep between lag polls.
    DEFAULT_WAIT = 5

    class << self
      # Sets up the publication/subscription pair for a group and records the
      # group. On any failure the partial setup is torn down, the group is
      # marked failed, and the process aborts with the error message.
      #
      # options: :group_name, :tables (comma-separated string), :schema
      def start_sync(options)
        PgEasyReplicate.assert_config

        create_publication(
          group_name: options[:group_name],
          conn_string: source_db_url,
        )

        add_tables_to_publication(
          group_name: options[:group_name],
          tables: options[:tables],
          conn_string: source_db_url,
          schema: options[:schema],
        )

        create_subscription(
          group_name: options[:group_name],
          source_conn_string: secondary_source_db_url || source_db_url,
          target_conn_string: target_db_url,
        )

        Group.create(
          name: options[:group_name],
          table_names: options[:tables],
          schema_name: options[:schema],
          started_at: Time.now.utc,
        )
      rescue => e
        stop_sync(
          group_name: options[:group_name],
          source_conn_string: source_db_url,
          target_conn_string: target_db_url,
        )

        if Group.find(options[:group_name])
          # UTC for consistency with every other timestamp recorded here
          # (was Time.now, i.e. local time).
          Group.update(
            group_name: options[:group_name],
            failed_at: Time.now.utc,
          )
        else
          Group.create(
            name: options[:group_name],
            table_names: options[:tables],
            schema_name: options[:schema],
            started_at: Time.now.utc,
            failed_at: Time.now.utc,
          )
        end

        abort_with("Starting sync failed: #{e.message}")
      end

      # Creates an (empty) publication for the group on the given database.
      # NOTE(review): group_name is interpolated unquoted into SQL — assumed
      # to be validated/sanitized upstream (CLI); confirm before accepting
      # untrusted input.
      def create_publication(group_name:, conn_string:)
        logger.info(
          "Setting up publication",
          { publication_name: publication_name(group_name) },
        )
        Query.run(
          query: "create publication #{publication_name(group_name)}",
          connection_url: conn_string,
        )
      end

      # Adds the given tables (comma-separated string) to the group's
      # publication. When no tables are supplied, every table in the schema
      # is added.
      def add_tables_to_publication(
        schema:,
        group_name:,
        conn_string:,
        tables: ""
      )
        logger.info(
          "Adding tables to publication",
          { publication_name: publication_name(group_name) },
        )
        tables = tables&.split(",") || []
        unless tables.size > 0
          tables = list_all_tables(schema: schema, conn_string: conn_string)
        end

        tables.map do |table_name|
          Query.run(
            query:
              "ALTER PUBLICATION #{publication_name(group_name)} ADD TABLE \"#{table_name}\"",
            connection_url: conn_string,
            schema: schema,
          )
        end
      end

      # Returns the names of all tables in the given schema.
      def list_all_tables(schema:, conn_string:)
        Query
          .run(
            query:
              "SELECT table_name FROM information_schema.tables WHERE table_schema = '#{schema}'",
            connection_url: conn_string,
          )
          .map(&:values)
          .flatten
      end

      # Drops the group's publication, if present.
      def drop_publication(group_name:, conn_string:)
        logger.info(
          "Dropping publication",
          { publication_name: publication_name(group_name) },
        )
        Query.run(
          query: "DROP PUBLICATION IF EXISTS #{publication_name(group_name)}",
          connection_url: conn_string,
        )
      end

      # Creates the subscription on the target pointing at the source's
      # publication. Runs outside a transaction — CREATE SUBSCRIPTION cannot
      # run inside one.
      def create_subscription(
        group_name:,
        source_conn_string:,
        target_conn_string:
      )
        logger.info(
          "Setting up subscription",
          {
            publication_name: publication_name(group_name),
            subscription_name: subscription_name(group_name),
          },
        )

        Query.run(
          query:
            "CREATE SUBSCRIPTION #{subscription_name(group_name)} CONNECTION '#{source_conn_string}' PUBLICATION #{publication_name(group_name)}",
          connection_url: target_conn_string,
          transaction: false,
        )
      rescue Sequel::DatabaseError => e
        # A statement timeout here usually means the target cannot reach the
        # source at all — surface a targeted hint instead of a raw error.
        if e.message.include?("canceling statement due to statement timeout")
          abort_with(
            "Subscription creation failed, please ensure both databases are in the same network region: #{e.message}",
          )
        end

        raise
      end

      # Drops the group's subscription on the target, if present. Runs
      # outside a transaction (DROP SUBSCRIPTION restriction).
      def drop_subscription(group_name:, target_conn_string:)
        logger.info(
          "Dropping subscription",
          {
            publication_name: publication_name(group_name),
            subscription_name: subscription_name(group_name),
          },
        )
        Query.run(
          query: "DROP SUBSCRIPTION IF EXISTS #{subscription_name(group_name)}",
          connection_url: target_conn_string,
          transaction: false,
        )
      end

      # Tears down both sides of the replication set up by start_sync.
      def stop_sync(target_conn_string:, source_conn_string:, group_name:)
        PgEasyReplicate.assert_config

        logger.info(
          "Stopping sync",
          {
            publication_name: publication_name(group_name),
            subscription_name: subscription_name(group_name),
          },
        )
        drop_publication(
          group_name: group_name,
          conn_string: source_conn_string,
        )
        drop_subscription(
          group_name: group_name,
          target_conn_string: target_conn_string,
        )
      end

      # Cuts traffic over to the target: waits for lag to drain below
      # lag_delta_size, makes the source read-only, waits for full catchup,
      # refreshes sequences on the target, and drops the subscription.
      # Restores source connections and aborts on any failure.
      def switchover(
        group_name:,
        source_conn_string: source_db_url,
        target_conn_string: target_db_url,
        lag_delta_size: DEFAULT_LAG
      )
        PgEasyReplicate.assert_config
        group = Group.find(group_name)

        watch_lag(group_name: group_name, lag: lag_delta_size)
        revoke_connections_on_source_db(group_name)
        wait_for_remaining_catchup(group_name)
        refresh_sequences(
          conn_string: target_conn_string,
          schema: group[:schema_name],
        )
        mark_switchover_complete(group_name)
        drop_subscription(
          group_name: group_name,
          target_conn_string: target_conn_string,
        )
      rescue => e
        restore_connections_on_source_db(group_name)

        abort_with("Switchover sync failed: #{e.message}")
      end

      # Blocks until write/flush/replay lag are all at or below `lag` bytes,
      # polling every `wait_time` seconds. Checks are skipped until every
      # table reports the "replicating" state.
      def watch_lag(group_name:, wait_time: DEFAULT_WAIT, lag: DEFAULT_LAG)
        logger.info("Watching lag stats")

        loop do
          sleep(wait_time)

          unless Stats.all_tables_replicating?(group_name)
            logger.debug(
              "All tables haven't reached replicating state, skipping check",
            )
            next
          end

          lag_stat = Stats.lag_stats(group_name).first
          # Lag columns can be NULL until the subscriber reports progress.
          if lag_stat[:write_lag].nil? || lag_stat[:flush_lag].nil? ||
               lag_stat[:replay_lag].nil?
            next
          end

          logger.debug("Current lag stats: #{lag_stat}")

          below_write_lag = lag_stat[:write_lag] <= lag
          below_flush_lag = lag_stat[:flush_lag] <= lag
          below_replay_lag = lag_stat[:replay_lag] <= lag

          break if below_write_lag && below_flush_lag && below_replay_lag
        end

        # Report the threshold actually used (was hard-coded to DEFAULT_LAG,
        # which was wrong for wait_for_remaining_catchup's lag: 0).
        logger.info("Lag below #{lag} bytes. Continuing...")
      end

      # Waits for all remaining WAL to be flushed/replayed (zero lag).
      def wait_for_remaining_catchup(group_name)
        logger.info("Waiting for remaining WAL to get flushed")

        watch_lag(group_name: group_name, lag: 0, wait_time: 0.2)

        logger.info("Caught up on remaining WAL lag")
      end

      # Makes the source DB user read-only and terminates its open sessions
      # so no new writes land on the source during switchover.
      def revoke_connections_on_source_db(group_name)
        logger.info(
          "Lag is now below #{DEFAULT_LAG}, marking source DB to read only",
        )

        alter_sql =
          "ALTER USER #{db_user(source_db_url)} set default_transaction_read_only = true"
        Query.run(query: alter_sql, connection_url: source_db_url)

        kill_sql =
          "SELECT pg_terminate_backend(pg_stat_activity.pid) FROM pg_stat_activity WHERE usename = '#{db_user(source_db_url)}';"

        Query.run(query: kill_sql, connection_url: source_db_url)
      end

      # Reverses revoke_connections_on_source_db (used when switchover fails).
      def restore_connections_on_source_db(group_name)
        logger.info("Restoring connections")

        alter_sql =
          "ALTER USER #{db_user(source_db_url)} set default_transaction_read_only = false"
        Query.run(query: alter_sql, connection_url: source_db_url)
      end

      # Bumps every sequence to MAX(owning column) so inserts after
      # switchover don't collide with rows copied during replication
      # (sequence values themselves are not replicated logically).
      def refresh_sequences(conn_string:, schema: nil)
        logger.info("Refreshing sequences")
        sql = <<~SQL
          DO $$
          DECLARE
            i TEXT;
          BEGIN
            FOR i IN (
              SELECT 'SELECT SETVAL('
                || quote_literal(quote_ident(PGT.schemaname) || '.' || quote_ident(S.relname))
                || ', COALESCE(MAX(' ||quote_ident(C.attname)|| '), 1) ) FROM '
                || quote_ident(PGT.schemaname)|| '.'||quote_ident(T.relname)|| ';'
              FROM pg_class AS S,
                pg_depend AS D,
                pg_class AS T,
                pg_attribute AS C,
                pg_tables AS PGT
              WHERE S.relkind = 'S'
                AND S.oid = D.objid
                AND D.refobjid = T.oid
                AND D.refobjid = C.attrelid
                AND D.refobjsubid = C.attnum
                AND T.relname = PGT.tablename
            ) LOOP
              EXECUTE i;
            END LOOP;
          END $$;
        SQL

        Query.run(query: sql, connection_url: conn_string, schema: schema)
      end

      # Records switchover completion on the group row.
      def mark_switchover_complete(group_name)
        # UTC for consistency with the other timestamps in this class.
        Group.update(
          group_name: group_name,
          switchover_completed_at: Time.now.utc,
        )
      end
    end
  end
end
@@ -0,0 +1,49 @@
1
+ # frozen_string_literal: true
2
+
3
module PgEasyReplicate
  # Thin wrapper around Sequel for running ad-hoc SQL against either
  # database, connecting as the internal replication user by default.
  class Query
    extend Helper

    class << self
      # Runs `query` against `connection_url` and returns the rows as an
      # array of hashes. The statement runs inside a transaction by default;
      # pass transaction: false for statements that cannot run in one
      # (e.g. CREATE/DROP SUBSCRIPTION). A 5s statement_timeout is set on
      # the session; since the connection is always closed in `ensure`, the
      # setting cannot leak to other work.
      def run(
        query:,
        connection_url:,
        user: internal_user_name,
        schema: nil,
        transaction: true
      )
        conn =
          connect(connection_url: connection_url, schema: schema, user: user)
        if transaction
          conn.transaction do
            conn.run("SET search_path to #{schema}") if schema
            conn.run("SET statement_timeout to '5s'")
            conn.fetch(query).to_a
          end
        else
          conn.run("SET search_path to #{schema}") if schema
          conn.run("SET statement_timeout to '5s'")
          conn.fetch(query).to_a
        end
      ensure
        # Previously this called conn.fetch("RESET statement_timeout") after
        # an explicit disconnect — Sequel datasets are lazy, so that fetch
        # never executed, and the connection was disconnected twice.
        # Closing the connection is sufficient: the session-scoped timeout
        # dies with it.
        conn&.disconnect
      end

      # Opens a Sequel connection as the given user; SQL is logged when the
      # DEBUG env var is set.
      def connect(connection_url:, user: internal_user_name, schema: nil)
        c =
          Sequel.connect(
            connection_url,
            user: user,
            logger: ENV.fetch("DEBUG", nil) ? logger : nil,
            search_path: schema,
          )
        logger.debug("Connection established")
        c
      end
    end
  end
end
@@ -0,0 +1,128 @@
1
+ # frozen_string_literal: true
2
+
3
module PgEasyReplicate
  # Read-only reporting on a group's replication state: WAL lag, replication
  # slots, per-table copy state, and message receipt timestamps.
  class Stats
    # pg_subscription_rel.srsubstate codes mapped to readable names.
    REPLICATION_STATE_MAP = {
      "i" => "initializing",
      "d" => "data_is_being_copied",
      "f" => "finished_table_copy",
      "s" => "synchronized",
      "r" => "replicating",
    }.freeze
    extend Helper

    class << self
      # Aggregates every stat for the group into a single hash.
      def object(group_name)
        PgEasyReplicate.assert_config
        stats = replication_stats(group_name)
        group = Group.find(group_name)
        {
          lag_stats: lag_stats(group_name),
          replication_slots: pg_replication_slots(group_name),
          replication_stats: stats,
          replication_stats_count_by_state:
            replication_stats_count_by_state(stats),
          message_lsn_receipts: message_lsn_receipts(group_name),
          sync_started_at: group[:started_at],
          sync_failed_at: group[:failed_at],
          switchover_completed_at: group[:switchover_completed_at],
        }
      end

      # Pretty-prints the stats object as JSON to stdout.
      def print(group_name)
        puts JSON.pretty_generate(object(group_name))
      end

      # Prints the stats object every second, forever (Ctrl-C to stop).
      def follow(group_name)
        loop do
          print(group_name)
          sleep(1)
        end
      end

      # Write/flush/replay lag in bytes for the group's walsender, read from
      # pg_stat_replication on the source.
      def lag_stats(group_name)
        sql = <<~SQL
          SELECT pid,
                 client_addr,
                 usename as user_name,
                 application_name,
                 state,
                 sync_state,
                 pg_wal_lsn_diff(sent_lsn, write_lsn) AS write_lag,
                 pg_wal_lsn_diff(sent_lsn, flush_lsn) AS flush_lag,
                 pg_wal_lsn_diff(sent_lsn, replay_lsn) AS replay_lag
          FROM pg_stat_replication
          WHERE application_name = '#{subscription_name(group_name)}';
        SQL

        Query.run(query: sql, connection_url: source_db_url)
      end

      # The group's replication slot row on the source.
      def pg_replication_slots(group_name)
        sql = <<~SQL
          select * from pg_replication_slots WHERE slot_name = '#{subscription_name(group_name)}';
        SQL

        Query.run(query: sql, connection_url: source_db_url)
      end

      # Per-table replication state for the group's subscription (queried on
      # the target), with srsubstate codes translated via
      # REPLICATION_STATE_MAP.
      def replication_stats(group_name)
        sql = <<~SQL
          SELECT
            s.subname AS subscription_name,
            c.relnamespace :: regnamespace :: text as table_schema,
            c.relname as table_name,
            rel.srsubstate as replication_state
          FROM
            pg_catalog.pg_subscription s
            JOIN pg_catalog.pg_subscription_rel rel ON rel.srsubid = s.oid
            JOIN pg_catalog.pg_class c on c.oid = rel.srrelid
          WHERE s.subname = '#{subscription_name(group_name)}'
        SQL

        Query
          .run(query: sql, connection_url: target_db_url)
          .each do |obj|
            obj[:replication_state] = REPLICATION_STATE_MAP[
              obj[:replication_state]
            ]
          end
      end

      # True when at least one table is reported and every table is in the
      # "replicating" state. Reuses replication_stats_count_by_state instead
      # of duplicating the tally logic.
      def all_tables_replicating?(group_name)
        counts =
          replication_stats_count_by_state(replication_stats(group_name))
        counts.keys == [REPLICATION_STATE_MAP["r"]]
      end

      # Tally of tables per replication state, e.g. {"replicating" => 3}.
      def replication_stats_count_by_state(stats)
        stats
          .each
          .with_object(Hash.new(0)) do |state, counts|
            counts[state[:replication_state]] += 1
          end
      end

      # LSN receipt/send timestamps from pg_stat_subscription on the target.
      def message_lsn_receipts(group_name)
        sql = <<~SQL
          select
            received_lsn,
            last_msg_send_time,
            last_msg_receipt_time,
            latest_end_lsn,
            latest_end_time
          from
            pg_catalog.pg_stat_subscription
          WHERE subname = '#{subscription_name(group_name)}'
        SQL
        Query.run(query: sql, connection_url: target_db_url)
      end
    end
  end
end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
module PgEasyReplicate
  # Gem version; also attached to every structured log line.
  VERSION = "0.1.0"
end
@@ -0,0 +1,197 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "json"
4
+ require "ougai"
5
+ require "lockbox"
6
+ require "pg"
7
+ require "sequel"
8
+
9
+ require "pg_easy_replicate/helper"
10
+ require "pg_easy_replicate/version"
11
+ require "pg_easy_replicate/query"
12
+ require "pg_easy_replicate/orchestrate"
13
+ require "pg_easy_replicate/stats"
14
+ require "pg_easy_replicate/group"
15
+ require "pg_easy_replicate/cli"
16
+
17
+ Sequel.default_timezone = :utc
18
module PgEasyReplicate
  # Gem-wide error class.
  class Error < StandardError
  end

  extend Helper

  class << self
    # Memoized snapshot of replication-relevant settings (wal_level,
    # worker/slot limits) and superuser status for both databases.
    # Aborts when either DB URL env var is unset or settings can't be read.
    def config
      abort_with("SOURCE_DB_URL is missing") if source_db_url.nil?
      abort_with("TARGET_DB_URL is missing") if target_db_url.nil?
      @config ||=
        begin
          q =
            "select name, setting from pg_settings where name in ('max_wal_senders', 'max_worker_processes', 'wal_level', 'max_replication_slots', 'max_logical_replication_workers');"

          {
            source_db_is_superuser: is_super_user?(source_db_url),
            target_db_is_superuser: is_super_user?(target_db_url),
            source_db:
              Query.run(
                query: q,
                connection_url: source_db_url,
                user: db_user(source_db_url),
              ),
            target_db:
              Query.run(
                query: q,
                connection_url: target_db_url,
                user: db_user(target_db_url),
              ),
          }
        rescue => e
          abort_with("Unable to check config: #{e.message}")
        end
    end

    # Aborts unless both DBs run wal_level=logical and both connecting
    # users are superusers — the preconditions for logical replication.
    def assert_config
      unless assert_wal_level_logical(config.dig(:source_db))
        abort_with("WAL_LEVEL should be LOGICAL on source DB")
      end

      unless assert_wal_level_logical(config.dig(:target_db))
        abort_with("WAL_LEVEL should be LOGICAL on target DB")
      end

      unless config.dig(:source_db_is_superuser)
        abort_with("User on source database should be a superuser")
      end

      return if config.dig(:target_db_is_superuser)
      abort_with("User on target database should be a superuser")
    end

    # Creates the internal schema, the replication users on both databases,
    # and the groups bookkeeping table. Aborts on any failure.
    def bootstrap(options)
      assert_config
      logger.info("Setting up schema")
      setup_schema

      logger.info("Setting up replication user on source database")
      create_user(conn_string: source_db_url, group_name: options[:group_name])

      logger.info("Setting up replication user on target database")
      create_user(conn_string: target_db_url, group_name: options[:group_name])

      logger.info("Setting up groups tables")
      Group.setup
    rescue => e
      abort_with("Unable to bootstrap: #{e.message}")
    end

    # Tears down the groups table and, depending on options, the
    # publication/subscription (:sync or :everything), schema and users
    # (:everything). Users are dropped last since earlier steps connect
    # as them.
    def cleanup(options)
      logger.info("Dropping groups table")
      Group.drop

      if options[:everything]
        logger.info("Dropping schema")
        drop_schema
      end

      if options[:everything] || options[:sync]
        Orchestrate.drop_publication(
          group_name: options[:group_name],
          conn_string: source_db_url,
        )

        Orchestrate.drop_subscription(
          group_name: options[:group_name],
          target_conn_string: target_db_url,
        )
      end

      if options[:everything]
        # Drop users at last
        logger.info("Dropping replication user on source database")
        drop_user(conn_string: source_db_url, group_name: options[:group_name])

        logger.info("Dropping replication user on target database")
        drop_user(conn_string: target_db_url, group_name: options[:group_name])
      end
    rescue => e
      abort_with("Unable to cleanup: #{e.message}")
    end

    # Drops the internal schema (and everything in it) on the source.
    def drop_schema
      Query.run(
        query: "DROP SCHEMA IF EXISTS #{internal_schema_name} CASCADE",
        connection_url: source_db_url,
        schema: internal_schema_name,
      )
    end

    # Creates the internal schema on the source and grants the source DB
    # user usage/create on it.
    def setup_schema
      sql = <<~SQL
        create schema if not exists #{internal_schema_name};
        grant usage on schema #{internal_schema_name} to #{db_user(source_db_url)};
        grant create on schema #{internal_schema_name} to #{db_user(source_db_url)};
      SQL

      Query.run(
        query: sql,
        connection_url: source_db_url,
        schema: internal_schema_name,
        # Connect as the source DB's own user: this runs against the source,
        # where the target URL's user is not guaranteed to exist
        # (was db_user(target_db_url)).
        user: db_user(source_db_url),
      )
    end

    # Structured JSON logger; the DEBUG env var raises verbosity to TRACE.
    def logger
      @logger ||=
        begin
          logger = Ougai::Logger.new($stdout)
          logger.level =
            ENV["DEBUG"] ? Ougai::Logger::TRACE : Ougai::Logger::INFO
          logger.with_fields = { version: PgEasyReplicate::VERSION }
          logger
        end
    end

    private

    # Truthy when the given pg_settings rows contain wal_level=logical.
    def assert_wal_level_logical(db_config)
      db_config&.find do |r|
        r.dig(:name) == "wal_level" && r.dig(:setting) == "logical"
      end
    end

    # Whether the URL's own user is a superuser on that database.
    def is_super_user?(url)
      Query.run(
        query:
          "select usesuper from pg_user where usename = '#{db_user(url)}';",
        connection_url: url,
        # Connect as the user belonging to the URL being checked — was
        # db_user(target_db_url), which broke the check against the source.
        user: db_user(url),
      ).first[
        :usesuper
      ]
    end

    # (Re)creates the internal replication role on the given database.
    def create_user(conn_string:, group_name:)
      # Use the connection string's password for the new role — was
      # connection_info(conn_string)[:user], which set the role's password
      # to the username.
      password = connection_info(conn_string)[:password]
      sql = <<~SQL
        drop role if exists #{internal_user_name};
        create role #{internal_user_name} with password '#{password}' login superuser createdb createrole;
      SQL

      Query.run(
        query: sql,
        connection_url: conn_string,
        # Run as the owner of this connection string, matching drop_user —
        # was always db_user(target_db_url), even on the source DB.
        user: db_user(conn_string),
      )
    end

    # Drops the internal replication role on the given database.
    def drop_user(conn_string:, group_name:)
      sql = "drop role if exists #{internal_user_name};"
      Query.run(
        query: sql,
        connection_url: conn_string,
        user: db_user(conn_string),
      )
    end
  end
end
data/package.json ADDED
@@ -0,0 +1,13 @@
1
+ {
2
+ "name": "pg-easy-replicate",
3
+ "version": "1.0.0",
4
+ "main": "index.js",
5
+ "repository": "git@github.com:shayonj/pg_easy_replicate.git",
6
+ "author": "Shayon Mukherjee <shayonj@gmail.com>",
7
+ "license": "MIT",
8
+ "private": true,
9
+ "dependencies": {
10
+ "@prettier/plugin-ruby": "^3.2.2",
11
+ "prettier": "^2.8.8"
12
+ }
13
+ }