pgsync 0.5.5 → 0.6.0


Potentially problematic release: this version of pgsync might be problematic.

@@ -0,0 +1,28 @@
+# minimal class to keep schema and table name separate
+module PgSync
+  class Table
+    attr_reader :schema, :name
+
+    def initialize(schema, name)
+      @schema = schema
+      @name = name
+    end
+
+    def full_name
+      "#{schema}.#{name}"
+    end
+
+    def eql?(other)
+      other.schema == schema && other.name == name
+    end
+
+    # override hash when overriding eql?
+    def hash
+      [schema, name].hash
+    end
+
+    def to_s
+      full_name
+    end
+  end
+end
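Note: because Table overrides both eql? and hash, instances behave as value objects, so they work as Hash keys and Set members. A quick illustrative sketch (not part of the release):

    require "set"

    a = PgSync::Table.new("public", "users")
    b = PgSync::Table.new("public", "users")

    a.eql?(b)        # => true
    a.hash == b.hash # => true

    # equal keys collapse, which the column lookups below rely on
    columns = {a => ["id", "email"]}
    columns[b]           # => ["id", "email"]
    Set.new([a, b]).size # => 1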
@@ -2,274 +2,204 @@ module PgSync
   class TableSync
     include Utils
 
-    attr_reader :source, :destination
+    attr_reader :source, :destination, :tasks, :opts, :resolver
 
-    def initialize(source:, destination:)
+    def initialize(source:, destination:, tasks:, opts:, resolver:)
       @source = source
       @destination = destination
+      @tasks = tasks
+      @opts = opts
+      @resolver = resolver
     end
 
-    def sync(config, table, opts)
-      maybe_disable_triggers(table, opts) do
-        sync_data(config, table, opts)
-      end
-    end
-
-    def sync_data(config, table, opts)
-      start_time = Time.now
-
-      from_fields = source.columns(table)
-      to_fields = destination.columns(table)
-      shared_fields = to_fields & from_fields
-      extra_fields = to_fields - from_fields
-      missing_fields = from_fields - to_fields
-
-      if opts[:no_sequences]
-        from_sequences = []
-        to_sequences = []
-      else
-        from_sequences = source.sequences(table, shared_fields)
-        to_sequences = destination.sequences(table, shared_fields)
-      end
-
-      shared_sequences = to_sequences & from_sequences
-      extra_sequences = to_sequences - from_sequences
-      missing_sequences = from_sequences - to_sequences
+    def perform
+      confirm_tables_exist(destination, tasks, "destination")
 
-      sql_clause = String.new("")
-      sql_clause << " #{opts[:sql]}" if opts[:sql]
+      add_columns
 
-      notes = []
-      notes << "Extra columns: #{extra_fields.join(", ")}" if extra_fields.any?
-      notes << "Missing columns: #{missing_fields.join(", ")}" if missing_fields.any?
-      notes << "Extra sequences: #{extra_sequences.join(", ")}" if extra_sequences.any?
-      notes << "Missing sequences: #{missing_sequences.join(", ")}" if missing_sequences.any?
+      show_notes
 
-      return {status: "success", message: "No fields to copy"} if shared_fields.empty?
+      # don't sync tables with no shared fields
+      # we show a warning message above
+      run_tasks(tasks.reject { |task| task.shared_fields.empty? })
+    end
 
-      bad_fields = opts[:no_rules] ? [] : config["data_rules"]
-      primary_key = destination.primary_key(table)
-      copy_fields = shared_fields.map { |f| f2 = bad_fields.to_a.find { |bf, _| rule_match?(table, f, bf) }; f2 ? "#{apply_strategy(f2[1], table, f, primary_key)} AS #{quote_ident(f)}" : "#{quote_ident_full(table)}.#{quote_ident(f)}" }.join(", ")
-      fields = shared_fields.map { |f| quote_ident(f) }.join(", ")
+    # TODO only query specific tables
+    # TODO add sequences, primary keys, etc
+    def add_columns
+      source_columns = columns(source)
+      destination_columns = columns(destination)
 
-      seq_values = {}
-      shared_sequences.each do |seq|
-        seq_values[seq] = source.last_value(seq)
+      tasks.each do |task|
+        task.from_columns = source_columns[task.table] || []
+        task.to_columns = destination_columns[task.table] || []
       end
+    end
 
-      copy_to_command = "COPY (SELECT #{copy_fields} FROM #{quote_ident_full(table)}#{sql_clause}) TO STDOUT"
-      if opts[:in_batches]
-        raise Error, "Cannot use --overwrite with --in-batches" if opts[:overwrite]
-        raise Error, "No primary key" unless primary_key
-
-        destination.truncate(table) if opts[:truncate]
-
-        from_max_id = source.max_id(table, primary_key)
-        to_max_id = destination.max_id(table, primary_key) + 1
+    def show_notes
+      # for tables
+      resolver.notes.each do |note|
+        warning note
+      end
 
-        if to_max_id == 1
-          from_min_id = source.min_id(table, primary_key)
-          to_max_id = from_min_id if from_min_id > 0
+      # for columns and sequences
+      tasks.each do |task|
+        task.notes.each do |note|
+          warning "#{task_name(task)}: #{note}"
         end
+      end
 
-        starting_id = to_max_id
-        batch_size = opts[:batch_size]
-
-        i = 1
-        batch_count = ((from_max_id - starting_id + 1) / batch_size.to_f).ceil
+      # for non-deferrable constraints
+      if opts[:defer_constraints]
+        constraints = non_deferrable_constraints(destination)
+        constraints = tasks.flat_map { |t| constraints[t.table] || [] }
+        warning "Non-deferrable constraints: #{constraints.join(", ")}" if constraints.any?
+      end
+    end
 
-        while starting_id <= from_max_id
-          where = "#{quote_ident(primary_key)} >= #{starting_id} AND #{quote_ident(primary_key)} < #{starting_id + batch_size}"
-          log " #{i}/#{batch_count}: #{where}"
+    def columns(data_source)
+      query = <<~SQL
+        SELECT
+          table_schema AS schema,
+          table_name AS table,
+          column_name AS column,
+          data_type AS type
+        FROM
+          information_schema.columns
+        ORDER BY 1, 2, 3
+      SQL
+      data_source.execute(query).group_by { |r| Table.new(r["schema"], r["table"]) }.map do |k, v|
+        [k, v.map { |r| {name: r["column"], type: r["type"]} }]
+      end.to_h
+    end
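Note: since the group_by keys above are Table values, lookups with equal schema/table pairs hit the same entry. An illustrative sketch of the shape columns builds, with made-up rows standing in for data_source.execute:

    rows = [
      {"schema" => "public", "table" => "users", "column" => "id", "type" => "integer"},
      {"schema" => "public", "table" => "users", "column" => "email", "type" => "text"}
    ]

    rows.group_by { |r| PgSync::Table.new(r["schema"], r["table"]) }.map do |k, v|
      [k, v.map { |r| {name: r["column"], type: r["type"]} }]
    end.to_h
    # => one entry, keyed by the public.users Table, holding two column hashes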
 
-        # TODO be smarter for advance sql clauses
-        batch_sql_clause = " #{sql_clause.length > 0 ? "#{sql_clause} AND" : "WHERE"} #{where}"
+    def non_deferrable_constraints(data_source)
+      query = <<~SQL
+        SELECT
+          table_schema AS schema,
+          table_name AS table,
+          constraint_name
+        FROM
+          information_schema.table_constraints
+        WHERE
+          constraint_type = 'FOREIGN KEY' AND
+          is_deferrable = 'NO'
+      SQL
+      data_source.execute(query).group_by { |r| Table.new(r["schema"], r["table"]) }.map do |k, v|
+        [k, v.map { |r| r["constraint_name"] }]
+      end.to_h
+    end
 
-        batch_copy_to_command = "COPY (SELECT #{copy_fields} FROM #{quote_ident_full(table)}#{batch_sql_clause}) TO STDOUT"
-        copy(batch_copy_to_command, dest_table: table, dest_fields: fields)
+    def run_tasks(tasks, &block)
+      notices = []
+      failed_tables = []
 
-        starting_id += batch_size
-        i += 1
+      spinners = TTY::Spinner::Multi.new(format: :dots, output: output)
+      task_spinners = {}
+      started_at = {}
 
-        if opts[:sleep] && starting_id <= from_max_id
-          sleep(opts[:sleep])
-        end
+      start = lambda do |task, i|
+        message = ":spinner #{display_item(task)}"
+        spinner = spinners.register(message)
+        if opts[:in_batches]
+          # log instead of spin for non-tty
+          log message.sub(":spinner", "⠋")
+        else
+          spinner.auto_spin
         end
-      elsif !opts[:truncate] && (opts[:overwrite] || opts[:preserve] || !sql_clause.empty?)
-        raise Error, "No primary key" unless primary_key
-
-        # create a temp table
-        temp_table = "pgsync_#{rand(1_000_000_000)}"
-        destination.execute("CREATE TEMPORARY TABLE #{quote_ident_full(temp_table)} AS TABLE #{quote_ident_full(table)} WITH NO DATA")
+        task_spinners[task] = spinner
+        started_at[task] = Time.now
+      end
 
-        # load data
-        copy(copy_to_command, dest_table: temp_table, dest_fields: fields)
+      finish = lambda do |task, i, result|
+        spinner = task_spinners[task]
+        time = (Time.now - started_at[task]).round(1)
 
-        if opts[:preserve]
-          # insert into
-          destination.execute("INSERT INTO #{quote_ident_full(table)} (SELECT * FROM #{quote_ident_full(temp_table)} WHERE NOT EXISTS (SELECT 1 FROM #{quote_ident_full(table)} WHERE #{quote_ident_full(table)}.#{quote_ident(primary_key)} = #{quote_ident_full(temp_table)}.#{quote_ident(primary_key)}))")
-        else
-          destination.transaction do
-            destination.execute("DELETE FROM #{quote_ident_full(table)} WHERE #{quote_ident(primary_key)} IN (SELECT #{quote_ident(primary_key)} FROM #{quote_ident_full(temp_table)})")
-            destination.execute("INSERT INTO #{quote_ident_full(table)} (SELECT * FROM #{quote_ident(temp_table)})")
+        message =
+          if result[:message]
+            "(#{result[:message].lines.first.to_s.strip})"
+          else
+            "- #{time}s"
           end
-        end
-      else
-        # use delete instead of truncate for foreign keys
-        if opts[:defer_constraints]
-          destination.execute("DELETE FROM #{quote_ident_full(table)}")
-        else
-          destination.truncate(table)
-        end
-        copy(copy_to_command, dest_table: table, dest_fields: fields)
-      end
-      seq_values.each do |seq, value|
-        destination.execute("SELECT setval(#{escape(seq)}, #{escape(value)})")
-      end
 
-      message = nil
-      message = notes.join(", ") if notes.any?
-
-      {status: "success", message: message, time: (Time.now - start_time).round(1)}
-    rescue => e
-      message =
-        case e
-        when PG::ConnectionBad
-          # likely fine to show simplified message here
-          # the full message will be shown when first trying to connect
-          "Connection failed"
-        when PG::Error
-          e.message.sub("ERROR: ", "")
-        when Error
-          e.message
+        notices.concat(result[:notices])
+
+        if result[:status] == "success"
+          spinner.success(message)
         else
-          "#{e.class.name}: #{e.message}"
+          spinner.error(message)
+          failed_tables << task_name(task)
+          fail_sync(failed_tables) if opts[:fail_fast]
         end
 
-      {status: "error", message: message}
-    end
-
-    private
-
-    def copy(source_command, dest_table:, dest_fields:)
-      destination_command = "COPY #{quote_ident_full(dest_table)} (#{dest_fields}) FROM STDIN"
-      destination.conn.copy_data(destination_command) do
-        source.conn.copy_data(source_command) do
-          while (row = source.conn.get_copy_data)
-            destination.conn.put_copy_data(row)
-          end
+        unless spinner.send(:tty?)
+          status = result[:status] == "success" ? "✔" : "✖"
+          log [status, display_item(task), message].join(" ")
         end
       end
-    end
 
-    # TODO better performance
-    def rule_match?(table, column, rule)
-      regex = Regexp.new('\A' + Regexp.escape(rule).gsub('\*','[^\.]*') + '\z')
-      regex.match(column) || regex.match("#{table.split(".", 2)[-1]}.#{column}") || regex.match("#{table}.#{column}")
-    end
+      options = {start: start, finish: finish}
 
-    # TODO wildcard rules
-    def apply_strategy(rule, table, column, primary_key)
-      if rule.is_a?(Hash)
-        if rule.key?("value")
-          escape(rule["value"])
-        elsif rule.key?("statement")
-          rule["statement"]
-        else
-          raise Error, "Unknown rule #{rule.inspect} for column #{column}"
-        end
-      else
-        case rule
-        when "untouched"
-          quote_ident(column)
-        when "unique_email"
-          "'email' || #{quoted_primary_key(table, primary_key, rule)}::text || '@example.org'"
-        when "unique_phone"
-          "(#{quoted_primary_key(table, primary_key, rule)}::bigint + 1000000000)::text"
-        when "unique_secret"
-          "'secret' || #{quoted_primary_key(table, primary_key, rule)}::text"
-        when "random_int", "random_number"
-          "(RANDOM() * 100)::int"
-        when "random_date"
-          "date '1970-01-01' + (RANDOM() * 10000)::int"
-        when "random_time"
-          "NOW() - (RANDOM() * 100000000)::int * INTERVAL '1 second'"
-        when "random_ip"
-          "(1 + RANDOM() * 254)::int::text || '.0.0.1'"
-        when "random_letter"
-          "chr(65 + (RANDOM() * 26)::int)"
-        when "random_string"
-          "RIGHT(MD5(RANDOM()::text), 10)"
-        when "null", nil
-          "NULL"
-        else
-          raise Error, "Unknown rule #{rule} for column #{column}"
-        end
+      jobs = opts[:jobs]
+      if opts[:debug] || opts[:in_batches] || opts[:defer_constraints]
+        warning "--jobs ignored" if jobs
+        jobs = 0
       end
-    end
 
-    def quoted_primary_key(table, primary_key, rule)
-      raise "Primary key required for this data rule: #{rule}" unless primary_key
-      "#{quote_ident_full(table)}.#{quote_ident(primary_key)}"
-    end
+      if windows?
+        options[:in_threads] = jobs || 4
+      else
+        options[:in_processes] = jobs if jobs
+      end
 
-    def quote_ident_full(ident)
-      ident.split(".").map { |v| quote_ident(v) }.join(".")
-    end
+      maybe_defer_constraints do
+        # could try to use `raise Parallel::Kill` to fail faster with --fail-fast
+        # see `fast_faster` branch
+        # however, need to make sure connections are cleaned up properly
+        Parallel.each(tasks, **options) do |task|
+          source.reconnect_if_needed
+          destination.reconnect_if_needed
 
-    def quote_ident(value)
-      PG::Connection.quote_ident(value)
-    end
+          task.perform
+        end
+      end
 
-    def escape(value)
-      if value.is_a?(String)
-        "'#{quote_string(value)}'"
-      else
-        value
+      notices.each do |notice|
+        warning notice
       end
-    end
 
-    # activerecord
-    def quote_string(s)
-      s.gsub(/\\/, '\&\&').gsub(/'/, "''")
+      fail_sync(failed_tables) if failed_tables.any?
     end
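Note: start and finish are hooks from the parallel gem; both run in the parent process around each work item, which is why the spinner bookkeeping lives in them rather than in the worker block. A toy sketch of the pattern, with trivial items in place of pgsync's tasks:

    require "parallel"

    start  = lambda { |item, _i| puts "starting #{item}" }             # parent process
    finish = lambda { |item, _i, result| puts "#{item} => #{result}" } # parent process

    # in_processes: 0 disables parallelism, as --debug, --in-batches,
    # and --defer-constraints force above
    Parallel.each([1, 2, 3], in_processes: 2, start: start, finish: finish) do |item|
      item * 10 # the block's return value is what finish receives
    end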
 
-    def maybe_disable_triggers(table, opts)
-      if opts[:disable_integrity] || opts[:disable_user_triggers]
+    def maybe_defer_constraints
+      if opts[:defer_constraints]
         destination.transaction do
-          triggers = destination.triggers(table)
-          triggers.select! { |t| t["enabled"] == "t" }
-          internal_triggers, user_triggers = triggers.partition { |t| t["internal"] == "t" }
-          integrity_triggers = internal_triggers.select { |t| t["integrity"] == "t" }
-          restore_triggers = []
-
-          if opts[:disable_integrity]
-            integrity_triggers.each do |trigger|
-              destination.execute("ALTER TABLE #{quote_ident_full(table)} DISABLE TRIGGER #{quote_ident(trigger["name"])}")
-            end
-            restore_triggers.concat(integrity_triggers)
-          end
-
-          if opts[:disable_user_triggers]
-            # important!
-            # rely on Postgres to disable user triggers
-            # we don't want to accidentally disable non-user triggers if logic above is off
-            destination.execute("ALTER TABLE #{quote_ident_full(table)} DISABLE TRIGGER USER")
-            restore_triggers.concat(user_triggers)
-          end
-
-          result = yield
+          destination.execute("SET CONSTRAINTS ALL DEFERRED")
 
-          # restore triggers that were previously enabled
-          restore_triggers.each do |trigger|
-            destination.execute("ALTER TABLE #{quote_ident_full(table)} ENABLE TRIGGER #{quote_ident(trigger["name"])}")
+          # create a transaction on the source
+          # to ensure we get a consistent snapshot
+          source.transaction do
+            yield
           end
-
-          result
         end
       else
         yield
       end
     end
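Note: SET CONSTRAINTS ALL DEFERRED only takes effect inside a transaction and only for constraints declared DEFERRABLE, hence the non-deferrable constraints warning in show_notes. A minimal sketch with the pg gem (the database and tables are hypothetical, with a DEFERRABLE foreign key from comments to posts):

    require "pg"

    conn = PG.connect(dbname: "example") # hypothetical database
    conn.transaction do |c|
      c.exec("SET CONSTRAINTS ALL DEFERRED")
      # the child row goes in first; the FK check waits until COMMIT
      c.exec("INSERT INTO comments (id, post_id) VALUES (1, 10)")
      c.exec("INSERT INTO posts (id) VALUES (10)")
    end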
+
+    def fail_sync(failed_tables)
+      raise Error, "Sync failed for #{failed_tables.size} table#{failed_tables.size == 1 ? nil : "s"}: #{failed_tables.join(", ")}"
+    end
+
+    def display_item(item)
+      messages = []
+      messages << task_name(item)
+      messages << item.opts[:sql] if item.opts[:sql]
+      messages.join(" ")
+    end
+
+    def windows?
+      Gem.win_platform?
+    end
   end
 end
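Note: the net effect of this diff is that TableSync no longer syncs one table at a time via sync(config, table, opts); it now takes the whole task list up front and runs perform once. A hypothetical sketch of the new wiring (the source, destination, tasks, and resolver objects are assumptions based on the diff, not shown here):

    sync = PgSync::TableSync.new(
      source: source,           # origin database
      destination: destination, # target database
      tasks: tasks,             # one task per table to sync
      opts: {jobs: 4, defer_constraints: true},
      resolver: resolver        # supplies table-level notes for show_notes
    )
    sync.perform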