pgsync 0.5.5 → 0.6.4

This diff shows the content of publicly available package versions as published to their respective public registries, and is provided for informational purposes only.

Potentially problematic release: this version of pgsync might be problematic.

@@ -0,0 +1,28 @@
+# minimal class to keep schema and table name separate
+module PgSync
+  class Table
+    attr_reader :schema, :name
+
+    def initialize(schema, name)
+      @schema = schema
+      @name = name
+    end
+
+    def full_name
+      "#{schema}.#{name}"
+    end
+
+    def eql?(other)
+      other.schema == schema && other.name == name
+    end
+
+    # override hash when overriding eql?
+    def hash
+      [schema, name].hash
+    end
+
+    def to_s
+      full_name
+    end
+  end
+end
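
The new Table class overrides both eql? and hash, so two instances built from the same schema and table name behave as a single Hash key, which is what lets the query helpers later in this diff group rows with group_by { |r| Table.new(...) } and look them up via source_columns[task.table]. A minimal sketch of that behavior (assuming the class above is loaded; the table names are made up):

  table_a = PgSync::Table.new("public", "users")
  table_b = PgSync::Table.new("public", "users")

  table_a.eql?(table_b)  # => true
  table_a.full_name      # => "public.users"

  # distinct instances act as the same Hash key
  columns = {table_a => ["id", "email"]}
  columns[table_b]       # => ["id", "email"]
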
@@ -2,274 +2,321 @@ module PgSync
   class TableSync
     include Utils
 
-    attr_reader :source, :destination
+    attr_reader :source, :destination, :tasks, :opts, :resolver
 
-    def initialize(source:, destination:)
+    def initialize(source:, destination:, tasks:, opts:, resolver:)
       @source = source
       @destination = destination
+      @tasks = tasks
+      @opts = opts
+      @resolver = resolver
     end
 
-    def sync(config, table, opts)
-      maybe_disable_triggers(table, opts) do
-        sync_data(config, table, opts)
-      end
-    end
+    def perform
+      confirm_tables_exist(destination, tasks, "destination")
 
-    def sync_data(config, table, opts)
-      start_time = Time.now
+      add_columns
 
-      from_fields = source.columns(table)
-      to_fields = destination.columns(table)
-      shared_fields = to_fields & from_fields
-      extra_fields = to_fields - from_fields
-      missing_fields = from_fields - to_fields
+      add_primary_keys
 
-      if opts[:no_sequences]
-        from_sequences = []
-        to_sequences = []
-      else
-        from_sequences = source.sequences(table, shared_fields)
-        to_sequences = destination.sequences(table, shared_fields)
-      end
+      add_sequences unless opts[:no_sequences]
 
-      shared_sequences = to_sequences & from_sequences
-      extra_sequences = to_sequences - from_sequences
-      missing_sequences = from_sequences - to_sequences
+      show_notes
 
-      sql_clause = String.new("")
-      sql_clause << " #{opts[:sql]}" if opts[:sql]
+      # don't sync tables with no shared fields
+      # we show a warning message above
+      run_tasks(tasks.reject { |task| task.shared_fields.empty? })
+    end
 
-      notes = []
-      notes << "Extra columns: #{extra_fields.join(", ")}" if extra_fields.any?
-      notes << "Missing columns: #{missing_fields.join(", ")}" if missing_fields.any?
-      notes << "Extra sequences: #{extra_sequences.join(", ")}" if extra_sequences.any?
-      notes << "Missing sequences: #{missing_sequences.join(", ")}" if missing_sequences.any?
+    def add_columns
+      source_columns = columns(source)
+      destination_columns = columns(destination)
 
-      return {status: "success", message: "No fields to copy"} if shared_fields.empty?
+      tasks.each do |task|
+        task.from_columns = source_columns[task.table] || []
+        task.to_columns = destination_columns[task.table] || []
+      end
+    end
 
-      bad_fields = opts[:no_rules] ? [] : config["data_rules"]
-      primary_key = destination.primary_key(table)
-      copy_fields = shared_fields.map { |f| f2 = bad_fields.to_a.find { |bf, _| rule_match?(table, f, bf) }; f2 ? "#{apply_strategy(f2[1], table, f, primary_key)} AS #{quote_ident(f)}" : "#{quote_ident_full(table)}.#{quote_ident(f)}" }.join(", ")
-      fields = shared_fields.map { |f| quote_ident(f) }.join(", ")
+    def add_primary_keys
+      destination_primary_keys = primary_keys(destination)
 
-      seq_values = {}
-      shared_sequences.each do |seq|
-        seq_values[seq] = source.last_value(seq)
+      tasks.each do |task|
+        task.to_primary_key = destination_primary_keys[task.table] || []
       end
+    end
 
-      copy_to_command = "COPY (SELECT #{copy_fields} FROM #{quote_ident_full(table)}#{sql_clause}) TO STDOUT"
-      if opts[:in_batches]
-        raise Error, "Cannot use --overwrite with --in-batches" if opts[:overwrite]
-        raise Error, "No primary key" unless primary_key
+    def add_sequences
+      source_sequences = sequences(source)
+      destination_sequences = sequences(destination)
 
-        destination.truncate(table) if opts[:truncate]
+      tasks.each do |task|
+        shared_columns = Set.new(task.shared_fields)
 
-        from_max_id = source.max_id(table, primary_key)
-        to_max_id = destination.max_id(table, primary_key) + 1
+        task.from_sequences = (source_sequences[task.table] || []).select { |s| shared_columns.include?(s.column) }
+        task.to_sequences = (destination_sequences[task.table] || []).select { |s| shared_columns.include?(s.column) }
+      end
+    end
 
-        if to_max_id == 1
-          from_min_id = source.min_id(table, primary_key)
-          to_max_id = from_min_id if from_min_id > 0
-        end
+    def sequences(data_source)
+      query = <<~SQL
+        SELECT
+          nt.nspname as schema,
+          t.relname as table,
+          a.attname as column,
+          n.nspname as sequence_schema,
+          s.relname as sequence
+        FROM
+          pg_class s
+        INNER JOIN
+          pg_depend d ON d.objid = s.oid
+        INNER JOIN
+          pg_class t ON d.objid = s.oid AND d.refobjid = t.oid
+        INNER JOIN
+          pg_attribute a ON (d.refobjid, d.refobjsubid) = (a.attrelid, a.attnum)
+        INNER JOIN
+          pg_namespace n ON n.oid = s.relnamespace
+        INNER JOIN
+          pg_namespace nt ON nt.oid = t.relnamespace
+        WHERE
+          s.relkind = 'S'
+      SQL
+      data_source.execute(query).group_by { |r| Table.new(r["schema"], r["table"]) }.map do |k, v|
+        [k, v.map { |r| Sequence.new(r["sequence_schema"], r["sequence"], column: r["column"]) }]
+      end.to_h
+    end
 
-        starting_id = to_max_id
-        batch_size = opts[:batch_size]
+    def primary_keys(data_source)
+      # https://stackoverflow.com/a/20537829
+      # TODO can simplify with array_position in Postgres 9.5+
+      query = <<~SQL
+        SELECT
+          nspname AS schema,
+          relname AS table,
+          pg_attribute.attname AS column,
+          format_type(pg_attribute.atttypid, pg_attribute.atttypmod),
+          pg_attribute.attnum,
+          pg_index.indkey
+        FROM
+          pg_index, pg_class, pg_attribute, pg_namespace
+        WHERE
+          indrelid = pg_class.oid AND
+          pg_class.relnamespace = pg_namespace.oid AND
+          pg_attribute.attrelid = pg_class.oid AND
+          pg_attribute.attnum = any(pg_index.indkey) AND
+          indisprimary
+      SQL
+      data_source.execute(query).group_by { |r| Table.new(r["schema"], r["table"]) }.map do |k, v|
+        [k, v.sort_by { |r| r["indkey"].split(" ").index(r["attnum"]) }.map { |r| r["column"] }]
+      end.to_h
+    end
 
-        i = 1
-        batch_count = ((from_max_id - starting_id + 1) / batch_size.to_f).ceil
+    def show_notes
+      # for tables
+      resolver.notes.each do |note|
+        warning note
+      end
 
-        while starting_id <= from_max_id
-          where = "#{quote_ident(primary_key)} >= #{starting_id} AND #{quote_ident(primary_key)} < #{starting_id + batch_size}"
-          log "  #{i}/#{batch_count}: #{where}"
+      # for columns and sequences
+      tasks.each do |task|
+        task.notes.each do |note|
+          warning "#{task_name(task)}: #{note}"
+        end
+      end
 
-          # TODO be smarter for advance sql clauses
-          batch_sql_clause = " #{sql_clause.length > 0 ? "#{sql_clause} AND" : "WHERE"} #{where}"
+      # for non-deferrable constraints
+      if opts[:defer_constraints]
+        constraints = non_deferrable_constraints(destination)
+        constraints = tasks.flat_map { |t| constraints[t.table] || [] }
+        warning "Non-deferrable constraints: #{constraints.join(", ")}" if constraints.any?
+      end
+    end
 
-          batch_copy_to_command = "COPY (SELECT #{copy_fields} FROM #{quote_ident_full(table)}#{batch_sql_clause}) TO STDOUT"
-          copy(batch_copy_to_command, dest_table: table, dest_fields: fields)
+    def columns(data_source)
+      query = <<~SQL
+        SELECT
+          table_schema AS schema,
+          table_name AS table,
+          column_name AS column,
+          data_type AS type
+        FROM
+          information_schema.columns
+        ORDER BY 1, 2, 3
+      SQL
+      data_source.execute(query).group_by { |r| Table.new(r["schema"], r["table"]) }.map do |k, v|
+        [k, v.map { |r| {name: r["column"], type: r["type"]} }]
+      end.to_h
+    end
 
-          starting_id += batch_size
-          i += 1
+    def non_deferrable_constraints(data_source)
+      query = <<~SQL
+        SELECT
+          table_schema AS schema,
+          table_name AS table,
+          constraint_name
+        FROM
+          information_schema.table_constraints
+        WHERE
+          constraint_type = 'FOREIGN KEY' AND
+          is_deferrable = 'NO'
+      SQL
+      data_source.execute(query).group_by { |r| Table.new(r["schema"], r["table"]) }.map do |k, v|
+        [k, v.map { |r| r["constraint_name"] }]
+      end.to_h
+    end
 
-          if opts[:sleep] && starting_id <= from_max_id
-            sleep(opts[:sleep])
-          end
-        end
-      elsif !opts[:truncate] && (opts[:overwrite] || opts[:preserve] || !sql_clause.empty?)
-        raise Error, "No primary key" unless primary_key
+    def run_tasks(tasks, &block)
+      notices = []
+      failed_tables = []
+      started_at = {}
 
-        # create a temp table
-        temp_table = "pgsync_#{rand(1_000_000_000)}"
-        destination.execute("CREATE TEMPORARY TABLE #{quote_ident_full(temp_table)} AS TABLE #{quote_ident_full(table)} WITH NO DATA")
+      show_spinners = output.tty? && !opts[:in_batches] && !opts[:debug]
+      if show_spinners
+        spinners = TTY::Spinner::Multi.new(format: :dots, output: output)
+        task_spinners = {}
+      end
 
-        # load data
-        copy(copy_to_command, dest_table: temp_table, dest_fields: fields)
+      start = lambda do |task, i|
+        message = ":spinner #{display_item(task)}"
 
-        if opts[:preserve]
-          # insert into
-          destination.execute("INSERT INTO #{quote_ident_full(table)} (SELECT * FROM #{quote_ident_full(temp_table)} WHERE NOT EXISTS (SELECT 1 FROM #{quote_ident_full(table)} WHERE #{quote_ident_full(table)}.#{quote_ident(primary_key)} = #{quote_ident_full(temp_table)}.#{quote_ident(primary_key)}))")
-        else
-          destination.transaction do
-            destination.execute("DELETE FROM #{quote_ident_full(table)} WHERE #{quote_ident(primary_key)} IN (SELECT #{quote_ident(primary_key)} FROM #{quote_ident_full(temp_table)})")
-            destination.execute("INSERT INTO #{quote_ident_full(table)} (SELECT * FROM #{quote_ident(temp_table)})")
-          end
+        if show_spinners
+          spinner = spinners.register(message)
+          spinner.auto_spin
+          task_spinners[task] = spinner
+        elsif opts[:in_batches]
+          log message.sub(":spinner", "⠋")
         end
-      else
-        # use delete instead of truncate for foreign keys
-        if opts[:defer_constraints]
-          destination.execute("DELETE FROM #{quote_ident_full(table)}")
-        else
-          destination.truncate(table)
-        end
-        copy(copy_to_command, dest_table: table, dest_fields: fields)
-      end
-      seq_values.each do |seq, value|
-        destination.execute("SELECT setval(#{escape(seq)}, #{escape(value)})")
+
+        started_at[task] = Time.now
       end
 
-      message = nil
-      message = notes.join(", ") if notes.any?
-
-      {status: "success", message: message, time: (Time.now - start_time).round(1)}
-    rescue => e
-      message =
-        case e
-        when PG::ConnectionBad
-          # likely fine to show simplified message here
-          # the full message will be shown when first trying to connect
-          "Connection failed"
-        when PG::Error
-          e.message.sub("ERROR: ", "")
-        when Error
-          e.message
-        else
-          "#{e.class.name}: #{e.message}"
-        end
+      finish = lambda do |task, i, result|
+        time = (Time.now - started_at[task]).round(1)
 
-      {status: "error", message: message}
-    end
+        success = result[:status] == "success"
 
-    private
-
-    def copy(source_command, dest_table:, dest_fields:)
-      destination_command = "COPY #{quote_ident_full(dest_table)} (#{dest_fields}) FROM STDIN"
-      destination.conn.copy_data(destination_command) do
-        source.conn.copy_data(source_command) do
-          while (row = source.conn.get_copy_data)
-            destination.conn.put_copy_data(row)
+        message =
+          if result[:message]
+            "(#{result[:message].lines.first.to_s.strip})"
+          else
+            "- #{time}s"
           end
-        end
-      end
-    end
 
-    # TODO better performance
-    def rule_match?(table, column, rule)
-      regex = Regexp.new('\A' + Regexp.escape(rule).gsub('\*','[^\.]*') + '\z')
-      regex.match(column) || regex.match("#{table.split(".", 2)[-1]}.#{column}") || regex.match("#{table}.#{column}")
-    end
+        notices.concat(result[:notices])
 
-    # TODO wildcard rules
-    def apply_strategy(rule, table, column, primary_key)
-      if rule.is_a?(Hash)
-        if rule.key?("value")
-          escape(rule["value"])
-        elsif rule.key?("statement")
-          rule["statement"]
+        if show_spinners
+          spinner = task_spinners[task]
+          if success
+            spinner.success(message)
+          else
+            spinner.error(message)
+          end
         else
-          raise Error, "Unknown rule #{rule.inspect} for column #{column}"
+          status = success ? "✔" : "✖"
+          log [status, display_item(task), message].join(" ")
         end
-      else
-        case rule
-        when "untouched"
-          quote_ident(column)
-        when "unique_email"
-          "'email' || #{quoted_primary_key(table, primary_key, rule)}::text || '@example.org'"
-        when "unique_phone"
-          "(#{quoted_primary_key(table, primary_key, rule)}::bigint + 1000000000)::text"
-        when "unique_secret"
-          "'secret' || #{quoted_primary_key(table, primary_key, rule)}::text"
-        when "random_int", "random_number"
-          "(RANDOM() * 100)::int"
-        when "random_date"
-          "date '1970-01-01' + (RANDOM() * 10000)::int"
-        when "random_time"
-          "NOW() - (RANDOM() * 100000000)::int * INTERVAL '1 second'"
-        when "random_ip"
-          "(1 + RANDOM() * 254)::int::text || '.0.0.1'"
-        when "random_letter"
-          "chr(65 + (RANDOM() * 26)::int)"
-        when "random_string"
-          "RIGHT(MD5(RANDOM()::text), 10)"
-        when "null", nil
-          "NULL"
-        else
-          raise Error, "Unknown rule #{rule} for column #{column}"
+
+        unless success
+          failed_tables << task_name(task)
+          fail_sync(failed_tables) if opts[:fail_fast]
         end
       end
-    end
 
-    def quoted_primary_key(table, primary_key, rule)
-      raise "Primary key required for this data rule: #{rule}" unless primary_key
-      "#{quote_ident_full(table)}.#{quote_ident(primary_key)}"
-    end
+      options = {start: start, finish: finish}
 
-    def quote_ident_full(ident)
-      ident.split(".").map { |v| quote_ident(v) }.join(".")
-    end
+      jobs = opts[:jobs]
 
-    def quote_ident(value)
-      PG::Connection.quote_ident(value)
-    end
+      # disable multiple jobs for defer constraints and disable integrity
+      # so we can use a transaction to ensure a consistent snapshot
+      if opts[:debug] || opts[:in_batches] || opts[:defer_constraints] || opts[:defer_constraints_v2] || opts[:disable_integrity] || opts[:disable_integrity_v2]
+        warning "--jobs ignored" if jobs
+        jobs = 0
+      end
 
-    def escape(value)
-      if value.is_a?(String)
-        "'#{quote_string(value)}'"
+      if windows?
+        options[:in_threads] = jobs || 4
       else
-        value
+        options[:in_processes] = jobs if jobs
       end
-    end
 
-    # activerecord
-    def quote_string(s)
-      s.gsub(/\\/, '\&\&').gsub(/'/, "''")
+      maybe_defer_constraints do
+        # could try to use `raise Parallel::Kill` to fail faster with --fail-fast
+        # see `fast_faster` branch
+        # however, need to make sure connections are cleaned up properly
+        Parallel.each(tasks, **options) do |task|
+          source.reconnect_if_needed
+          destination.reconnect_if_needed
+
+          task.perform
+        end
+      end
+
+      notices.each do |notice|
+        warning notice
+      end
+
+      fail_sync(failed_tables) if failed_tables.any?
     end
 
-    def maybe_disable_triggers(table, opts)
-      if opts[:disable_integrity] || opts[:disable_user_triggers]
+    # TODO add option to open transaction on source when manually specifying order of tables
+    def maybe_defer_constraints
+      if opts[:disable_integrity] || opts[:disable_integrity_v2]
+        # create a transaction on the source
+        # to ensure we get a consistent snapshot
+        source.transaction do
+          yield
+        end
+      elsif opts[:defer_constraints] || opts[:defer_constraints_v2]
         destination.transaction do
-          triggers = destination.triggers(table)
-          triggers.select! { |t| t["enabled"] == "t" }
-          internal_triggers, user_triggers = triggers.partition { |t| t["internal"] == "t" }
-          integrity_triggers = internal_triggers.select { |t| t["integrity"] == "t" }
-          restore_triggers = []
-
-          if opts[:disable_integrity]
-            integrity_triggers.each do |trigger|
-              destination.execute("ALTER TABLE #{quote_ident_full(table)} DISABLE TRIGGER #{quote_ident(trigger["name"])}")
+          if opts[:defer_constraints_v2]
+            table_constraints = non_deferrable_constraints(destination)
+            table_constraints.each do |table, constraints|
+              constraints.each do |constraint|
+                destination.execute("ALTER TABLE #{quote_ident_full(table)} ALTER CONSTRAINT #{quote_ident(constraint)} DEFERRABLE")
+              end
             end
-            restore_triggers.concat(integrity_triggers)
-          end
-
-          if opts[:disable_user_triggers]
-            # important!
-            # rely on Postgres to disable user triggers
-            # we don't want to accidentally disable non-user triggers if logic above is off
-            destination.execute("ALTER TABLE #{quote_ident_full(table)} DISABLE TRIGGER USER")
-            restore_triggers.concat(user_triggers)
           end
 
-          result = yield
+          destination.execute("SET CONSTRAINTS ALL DEFERRED")
 
-          # restore triggers that were previously enabled
-          restore_triggers.each do |trigger|
-            destination.execute("ALTER TABLE #{quote_ident_full(table)} ENABLE TRIGGER #{quote_ident(trigger["name"])}")
+          # create a transaction on the source
+          # to ensure we get a consistent snapshot
+          source.transaction do
+            yield
           end
 
-          result
+          # set them back
+          # there are 3 modes: DEFERRABLE INITIALLY DEFERRED, DEFERRABLE INITIALLY IMMEDIATE, and NOT DEFERRABLE
+          # we only update NOT DEFERRABLE
+          # https://www.postgresql.org/docs/current/sql-set-constraints.html
+          if opts[:defer_constraints_v2]
+            destination.execute("SET CONSTRAINTS ALL IMMEDIATE")
+
+            table_constraints.each do |table, constraints|
+              constraints.each do |constraint|
+                destination.execute("ALTER TABLE #{quote_ident_full(table)} ALTER CONSTRAINT #{quote_ident(constraint)} NOT DEFERRABLE")
+              end
            end
          end
        end
      else
        yield
      end
    end
+
+    def fail_sync(failed_tables)
+      raise Error, "Sync failed for #{failed_tables.size} table#{failed_tables.size == 1 ? nil : "s"}: #{failed_tables.join(", ")}"
+    end
+
+    def display_item(item)
+      messages = []
+      messages << task_name(item)
+      messages << item.opts[:sql] if item.opts[:sql]
+      messages.join(" ")
+    end
+
+    def windows?
+      Gem.win_platform?
+    end
  end
 end
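
In run_tasks above, parallelism is handed to the Parallel gem: processes on Unix, threads on Windows (where fork is unavailable), and jobs is forced to 0 whenever a single transaction has to wrap the whole sync, since zero workers makes Parallel run every task serially in the calling process and connection. A small standalone sketch of the same option-picking pattern (the task list is hypothetical; Parallel.each and Gem.win_platform? are the real APIs):

  require "parallel"

  tasks = ["public.users", "public.orders"]  # hypothetical task list
  jobs = nil                                 # e.g. opts[:jobs]

  options = {}
  if Gem.win_platform?
    options[:in_threads] = jobs || 4         # fork is unavailable on Windows
  else
    options[:in_processes] = jobs if jobs    # otherwise Parallel picks a default
  end

  # in_processes: 0 (the jobs = 0 case above) runs tasks serially in the
  # current process, so one open transaction covers the entire loop
  Parallel.each(tasks, **options) do |task|
    puts "syncing #{task}"
  end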