pgsync 0.5.5 → 0.6.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pgsync might be problematic; see the package's advisory details on the registry page for more information.

@@ -0,0 +1,325 @@
1
+ module PgSync
2
+ class Task
3
+ include Utils
4
+
5
+ attr_reader :source, :destination, :config, :table, :opts
6
+ attr_accessor :from_columns, :to_columns, :from_sequences, :to_sequences, :to_primary_key
7
+
8
+ def initialize(source:, destination:, config:, table:, opts:)
9
+ @source = source
10
+ @destination = destination
11
+ @config = config
12
+ @table = table
13
+ @opts = opts
14
+ @from_sequences = []
15
+ @to_sequences = []
16
+ end
17
+
18
+ def quoted_table
19
+ quote_ident_full(table)
20
+ end
21
+
22
+ def perform
23
+ with_notices do
24
+ handle_errors do
25
+ maybe_disable_triggers do
26
+ sync_data
27
+ end
28
+ end
29
+ end
30
+ end
31
+
32
+ def from_fields
33
+ @from_fields ||= from_columns.map { |c| c[:name] }
34
+ end
35
+
36
+ def to_fields
37
+ @to_fields ||= to_columns.map { |c| c[:name] }
38
+ end
39
+
40
+ def shared_fields
41
+ @shared_fields ||= to_fields & from_fields
42
+ end
43
+
44
+ def shared_sequences
45
+ @shared_sequences ||= to_sequences & from_sequences
46
+ end
47
+
48
+ def notes
49
+ notes = []
50
+ if shared_fields.empty?
51
+ notes << "No fields to copy"
52
+ else
53
+ extra_fields = to_fields - from_fields
54
+ notes << "Extra columns: #{extra_fields.join(", ")}" if extra_fields.any?
55
+
56
+ missing_fields = from_fields - to_fields
57
+ notes << "Missing columns: #{missing_fields.join(", ")}" if missing_fields.any?
58
+
59
+ extra_sequences = to_sequences - from_sequences
60
+ notes << "Extra sequences: #{extra_sequences.join(", ")}" if extra_sequences.any?
61
+
62
+ missing_sequences = from_sequences - to_sequences
63
+ notes << "Missing sequences: #{missing_sequences.join(", ")}" if missing_sequences.any?
64
+
65
+ from_types = from_columns.map { |c| [c[:name], c[:type]] }.to_h
66
+ to_types = to_columns.map { |c| [c[:name], c[:type]] }.to_h
67
+ different_types = []
68
+ shared_fields.each do |field|
69
+ if from_types[field] != to_types[field]
70
+ different_types << "#{field} (#{from_types[field]} -> #{to_types[field]})"
71
+ end
72
+ end
73
+ notes << "Different column types: #{different_types.join(", ")}" if different_types.any?
74
+ end
75
+ notes
76
+ end
77
+
78
+ def sync_data
79
+ raise Error, "This should never happen. Please file a bug." if shared_fields.empty?
80
+
81
+ sql_clause = String.new("")
82
+ sql_clause << " #{opts[:sql]}" if opts[:sql]
83
+
84
+ bad_fields = opts[:no_rules] ? [] : config["data_rules"]
85
+ primary_key = to_primary_key
86
+ copy_fields = shared_fields.map { |f| f2 = bad_fields.to_a.find { |bf, _| rule_match?(table, f, bf) }; f2 ? "#{apply_strategy(f2[1], table, f, primary_key)} AS #{quote_ident(f)}" : "#{quoted_table}.#{quote_ident(f)}" }.join(", ")
87
+ fields = shared_fields.map { |f| quote_ident(f) }.join(", ")
88
+
89
+ copy_to_command = "COPY (SELECT #{copy_fields} FROM #{quoted_table}#{sql_clause}) TO STDOUT"
90
+ if opts[:in_batches]
91
+ raise Error, "No primary key" if primary_key.empty?
92
+ primary_key = primary_key.first
93
+
94
+ destination.truncate(table) if opts[:truncate]
95
+
96
+ from_max_id = source.max_id(table, primary_key)
97
+ to_max_id = destination.max_id(table, primary_key) + 1
98
+
99
+ if to_max_id == 1
100
+ from_min_id = source.min_id(table, primary_key)
101
+ to_max_id = from_min_id if from_min_id > 0
102
+ end
103
+
104
+ starting_id = to_max_id
105
+ batch_size = opts[:batch_size]
106
+
107
+ i = 1
108
+ batch_count = ((from_max_id - starting_id + 1) / batch_size.to_f).ceil
109
+
110
+ while starting_id <= from_max_id
111
+ where = "#{quote_ident(primary_key)} >= #{starting_id} AND #{quote_ident(primary_key)} < #{starting_id + batch_size}"
112
+ log " #{i}/#{batch_count}: #{where}"
113
+
114
+ # TODO be smarter for advance sql clauses
115
+ batch_sql_clause = " #{sql_clause.length > 0 ? "#{sql_clause} AND" : "WHERE"} #{where}"
116
+
117
+ batch_copy_to_command = "COPY (SELECT #{copy_fields} FROM #{quoted_table}#{batch_sql_clause}) TO STDOUT"
118
+ copy(batch_copy_to_command, dest_table: table, dest_fields: fields)
119
+
120
+ starting_id += batch_size
121
+ i += 1
122
+
123
+ if opts[:sleep] && starting_id <= from_max_id
124
+ sleep(opts[:sleep])
125
+ end
126
+ end
127
+ elsif !opts[:truncate] && (opts[:overwrite] || opts[:preserve] || !sql_clause.empty?)
128
+ raise Error, "No primary key" if primary_key.empty?
129
+
130
+ # create a temp table
131
+ temp_table = "pgsync_#{rand(1_000_000_000)}"
132
+ destination.execute("CREATE TEMPORARY TABLE #{quote_ident_full(temp_table)} AS TABLE #{quoted_table} WITH NO DATA")
133
+
134
+ # load data
135
+ copy(copy_to_command, dest_table: temp_table, dest_fields: fields)
136
+
137
+ on_conflict = primary_key.map { |pk| quote_ident(pk) }.join(", ")
138
+ action =
139
+ if opts[:preserve]
140
+ "NOTHING"
141
+ else # overwrite or sql clause
142
+ setter = shared_fields.reject { |f| primary_key.include?(f) }.map { |f| "#{quote_ident(f)} = EXCLUDED.#{quote_ident(f)}" }
143
+ "UPDATE SET #{setter.join(", ")}"
144
+ end
145
+ destination.execute("INSERT INTO #{quoted_table} (SELECT * FROM #{quote_ident_full(temp_table)}) ON CONFLICT (#{on_conflict}) DO #{action}")
146
+ else
147
+ # use delete instead of truncate for foreign keys
148
+ if opts[:defer_constraints] || opts[:defer_constraints_v2]
149
+ destination.execute("DELETE FROM #{quoted_table}")
150
+ else
151
+ destination.truncate(table)
152
+ end
153
+ copy(copy_to_command, dest_table: table, dest_fields: fields)
154
+ end
155
+
156
+ # update sequences
157
+ shared_sequences.each do |seq|
158
+ value = source.last_value(seq)
159
+ destination.execute("SELECT setval(#{escape(quote_ident_full(seq))}, #{escape(value)})")
160
+ end
161
+
162
+ {status: "success"}
163
+ end
164
+
165
+ private
166
+
167
+ def with_notices
168
+ notices = []
169
+ [source, destination].each do |data_source|
170
+ data_source.send(:conn).set_notice_processor do |message|
171
+ notices << message.strip
172
+ end
173
+ end
174
+ result = yield
175
+ result[:notices] = notices if result
176
+ result
177
+ ensure
178
+ # clear notice processor
179
+ [source, destination].each do |data_source|
180
+ data_source.send(:conn).set_notice_processor
181
+ end
182
+ end
183
+
184
+ # TODO add retries
185
+ def handle_errors
186
+ yield
187
+ rescue => e
188
+ raise e if opts[:debug]
189
+
190
+ message =
191
+ case e
192
+ when PG::ConnectionBad
193
+ # likely fine to show simplified message here
194
+ # the full message will be shown when first trying to connect
195
+ "Connection failed"
196
+ when PG::Error
197
+ e.message.sub("ERROR: ", "")
198
+ when Error
199
+ e.message
200
+ else
201
+ "#{e.class.name}: #{e.message}"
202
+ end
203
+
204
+ {status: "error", message: message}
205
+ end
206
+
207
+ def copy(source_command, dest_table:, dest_fields:)
208
+ destination_command = "COPY #{quote_ident_full(dest_table)} (#{dest_fields}) FROM STDIN"
209
+
210
+ source.log_sql(source_command)
211
+ destination.log_sql(destination_command)
212
+
213
+ destination.conn.copy_data(destination_command) do
214
+ source.conn.copy_data(source_command) do
215
+ while (row = source.conn.get_copy_data)
216
+ destination.conn.put_copy_data(row)
217
+ end
218
+ end
219
+ end
220
+ end
221
+
222
+ # TODO better performance
223
+ def rule_match?(table, column, rule)
224
+ regex = Regexp.new('\A' + Regexp.escape(rule).gsub('\*','[^\.]*') + '\z')
225
+ regex.match(column) || regex.match("#{table.name}.#{column}") || regex.match("#{table.schema}.#{table.name}.#{column}")
226
+ end
227
+
228
+ # TODO wildcard rules
229
+ def apply_strategy(rule, table, column, primary_key)
230
+ if rule.is_a?(Hash)
231
+ if rule.key?("value")
232
+ escape(rule["value"])
233
+ elsif rule.key?("statement")
234
+ rule["statement"]
235
+ else
236
+ raise Error, "Unknown rule #{rule.inspect} for column #{column}"
237
+ end
238
+ else
239
+ case rule
240
+ when "untouched"
241
+ quote_ident(column)
242
+ when "unique_email"
243
+ "'email' || #{quoted_primary_key(table, primary_key, rule)}::text || '@example.org'"
244
+ when "unique_phone"
245
+ "(#{quoted_primary_key(table, primary_key, rule)}::bigint + 1000000000)::text"
246
+ when "unique_secret"
247
+ "'secret' || #{quoted_primary_key(table, primary_key, rule)}::text"
248
+ when "random_int", "random_number"
249
+ "(RANDOM() * 100)::int"
250
+ when "random_date"
251
+ "date '1970-01-01' + (RANDOM() * 10000)::int"
252
+ when "random_time"
253
+ "NOW() - (RANDOM() * 100000000)::int * INTERVAL '1 second'"
254
+ when "random_ip"
255
+ "(1 + RANDOM() * 254)::int::text || '.0.0.1'"
256
+ when "random_letter"
257
+ "chr(65 + (RANDOM() * 26)::int)"
258
+ when "random_string"
259
+ "RIGHT(MD5(RANDOM()::text), 10)"
260
+ when "null", nil
261
+ "NULL"
262
+ else
263
+ raise Error, "Unknown rule #{rule} for column #{column}"
264
+ end
265
+ end
266
+ end
267
+
268
+ def quoted_primary_key(table, primary_key, rule)
269
+ raise Error, "Single column primary key required for this data rule: #{rule}" unless primary_key.size == 1
270
+ "#{quoted_table}.#{quote_ident(primary_key.first)}"
271
+ end
272
+
273
+ def maybe_disable_triggers
274
+ if opts[:disable_integrity] || opts[:disable_integrity_v2] || opts[:disable_user_triggers]
275
+ destination.transaction do
276
+ triggers = destination.triggers(table)
277
+ triggers.select! { |t| t["enabled"] == "t" }
278
+ internal_triggers, user_triggers = triggers.partition { |t| t["internal"] == "t" }
279
+ integrity_triggers = internal_triggers.select { |t| t["integrity"] == "t" }
280
+ restore_triggers = []
281
+
282
+ # both --disable-integrity options require superuser privileges
283
+ # however, only v2 works on Amazon RDS, which added specific support for it
284
+ # https://aws.amazon.com/about-aws/whats-new/2014/11/10/amazon-rds-postgresql-read-replicas/
285
+ #
286
+ # session_replication_role disables more than foreign keys (like triggers and rules)
287
+ # this is probably fine, but keep the current default for now
288
+ if opts[:disable_integrity_v2] || (opts[:disable_integrity] && rds?)
289
+ # SET LOCAL lasts until the end of the transaction
290
+ # https://www.postgresql.org/docs/current/sql-set.html
291
+ destination.execute("SET LOCAL session_replication_role = replica")
292
+ elsif opts[:disable_integrity]
293
+ integrity_triggers.each do |trigger|
294
+ destination.execute("ALTER TABLE #{quoted_table} DISABLE TRIGGER #{quote_ident(trigger["name"])}")
295
+ end
296
+ restore_triggers.concat(integrity_triggers)
297
+ end
298
+
299
+ if opts[:disable_user_triggers]
300
+ # important!
301
+ # rely on Postgres to disable user triggers
302
+ # we don't want to accidentally disable non-user triggers if logic above is off
303
+ destination.execute("ALTER TABLE #{quoted_table} DISABLE TRIGGER USER")
304
+ restore_triggers.concat(user_triggers)
305
+ end
306
+
307
+ result = yield
308
+
309
+ # restore triggers that were previously enabled
310
+ restore_triggers.each do |trigger|
311
+ destination.execute("ALTER TABLE #{quoted_table} ENABLE TRIGGER #{quote_ident(trigger["name"])}")
312
+ end
313
+
314
+ result
315
+ end
316
+ else
317
+ yield
318
+ end
319
+ end
320
+
321
+ def rds?
322
+ destination.execute("SELECT name, setting FROM pg_settings WHERE name LIKE 'rds.%'").any?
323
+ end
324
+ end
325
+ end
@@ -0,0 +1,237 @@
1
+ module PgSync
2
+ class TaskResolver
3
+ include Utils
4
+
5
+ attr_reader :args, :opts, :source, :destination, :config, :first_schema, :notes
6
+
7
+ def initialize(args:, opts:, source:, destination:, config:, first_schema:)
8
+ @args = args
9
+ @opts = opts
10
+ @source = source
11
+ @destination = destination
12
+ @config = config
13
+ @groups = config["groups"] || {}
14
+ @first_schema = first_schema
15
+ @notes = []
16
+ end
17
+
18
+ def tasks
19
+ tasks = []
20
+
21
+ # get lists from args
22
+ groups, tables = process_args
23
+
24
+ # expand groups into tasks
25
+ groups.each do |group|
26
+ tasks.concat(group_to_tasks(group))
27
+ end
28
+
29
+ # expand tables into tasks
30
+ tables.each do |table|
31
+ tasks.concat(table_to_tasks(table))
32
+ end
33
+
34
+ # get default if none given
35
+ if !opts[:groups] && !opts[:tables] && args.size == 0
36
+ tasks.concat(default_tasks)
37
+ end
38
+
39
+ # resolve any tables that need it
40
+ tasks.each do |task|
41
+ task[:table] = fully_resolve(task[:table])
42
+ end
43
+
44
+ tasks
45
+ end
46
+
47
+ def group?(group)
48
+ @groups.key?(group)
49
+ end
50
+
51
+ private
52
+
53
+ def group_to_tasks(value)
54
+ group, param = value.split(":", 2)
55
+ raise Error, "Group not found: #{group}" unless group?(group)
56
+
57
+ @groups[group].map do |table|
58
+ table_sql = nil
59
+ if table.is_a?(Array)
60
+ table, table_sql = table
61
+ end
62
+
63
+ {
64
+ table: to_table(table),
65
+ sql: expand_sql(table_sql, param)
66
+ }
67
+ end
68
+ end
69
+
70
+ def table_to_tasks(value)
71
+ raise Error, "Cannot use parameters with tables" if value.include?(":")
72
+
73
+ tables =
74
+ if value.include?("*")
75
+ regex = Regexp.new('\A' + Regexp.escape(value).gsub('\*','[^\.]*') + '\z')
76
+ shared_tables.select { |t| regex.match(t.full_name) || regex.match(t.name) }
77
+ else
78
+ [to_table(value)]
79
+ end
80
+
81
+ tables.map do |table|
82
+ {
83
+ table: table,
84
+ sql: sql_arg # doesn't support params
85
+ }
86
+ end
87
+ end
88
+
89
+ # treats identifiers as if they were quoted (Users == "Users")
90
+ # this is different from Postgres (Users == "users")
91
+ #
92
+ # TODO add support for quoted identifiers like "my.schema"."my.table"
93
+ # so it's possible to specify identifiers with "." in them
94
+ def to_table(value)
95
+ parts = value.split(".")
96
+ case parts.size
97
+ when 1
98
+ # unknown schema
99
+ Table.new(nil, parts[0])
100
+ when 2
101
+ Table.new(*parts)
102
+ else
103
+ raise Error, "Cannot resolve table: #{value}"
104
+ end
105
+ end
106
+
107
+ def default_tasks
108
+ shared_tables.map do |table|
109
+ {
110
+ table: table
111
+ }
112
+ end
113
+ end
114
+
115
+ # tables that exists in both source and destination
116
+ # used when no tables specified, or a wildcard
117
+ # removes excluded tables and filters by schema
118
+ def shared_tables
119
+ tables = filter_tables(source.tables)
120
+
121
+ unless opts[:schema_only] || opts[:schema_first]
122
+ from_tables = tables
123
+ to_tables = filter_tables(destination.tables)
124
+
125
+ extra_tables = to_tables - from_tables
126
+ notes << "Extra tables: #{extra_tables.map { |t| friendly_name(t) }.join(", ")}" if extra_tables.any?
127
+
128
+ missing_tables = from_tables - to_tables
129
+ notes << "Missing tables: #{missing_tables.map { |t| friendly_name(t) }.join(", ")}" if missing_tables.any?
130
+
131
+ tables &= to_tables
132
+ end
133
+
134
+ tables
135
+ end
136
+
137
+ def filter_tables(tables)
138
+ tables = tables.dup
139
+
140
+ unless opts[:all_schemas]
141
+ # could support wildcard schemas as well
142
+ schemas = Set.new(opts[:schemas] ? to_arr(opts[:schemas]) : source.search_path)
143
+ tables.select! { |t| schemas.include?(t.schema) }
144
+ end
145
+
146
+ to_arr(opts[:exclude]).each do |value|
147
+ if value.include?("*")
148
+ regex = Regexp.new('\A' + Regexp.escape(value).gsub('\*','[^\.]*') + '\z')
149
+ tables.reject! { |t| regex.match(t.full_name) || regex.match(t.name) }
150
+ else
151
+ tables -= [fully_resolve(to_table(value), error: false)].compact
152
+ end
153
+ end
154
+
155
+ tables
156
+ end
157
+
158
+ def process_args
159
+ groups = to_arr(opts[:groups])
160
+ tables = to_arr(opts[:tables])
161
+ if args[0]
162
+ # could be a group, table, or mix
163
+ to_arr(args[0]).each do |value|
164
+ if group?(value.split(":", 2)[0])
165
+ groups << value
166
+ else
167
+ tables << value
168
+ end
169
+ end
170
+ end
171
+ [groups, tables]
172
+ end
173
+
174
+ def no_schema_tables
175
+ @no_schema_tables ||= begin
176
+ search_path_index = source.search_path.map.with_index.to_h
177
+ source.tables.group_by(&:name).map do |group, t2|
178
+ [group, t2.select { |t| search_path_index[t.schema] }.sort_by { |t| search_path_index[t.schema] }.first]
179
+ end.to_h
180
+ end
181
+ end
182
+
183
+ # for tables without a schema, find the table in the search path
184
+ def fully_resolve(table, error: true)
185
+ return table if table.schema
186
+ resolved_table = no_schema_tables[table.name]
187
+ raise Error, "Table not found in source: #{table.name}" if !resolved_table && error
188
+ resolved_table
189
+ end
190
+
191
+ # parse command line arguments and YAML
192
+ def to_arr(value)
193
+ if value.is_a?(Array)
194
+ value
195
+ else
196
+ # Split by commas, but don't use commas inside double quotes
197
+ # https://stackoverflow.com/questions/21105360/regex-find-comma-not-inside-quotes
198
+ value.to_s.split(/(?!\B"[^"]*),(?![^"]*"\B)/)
199
+ end
200
+ end
201
+
202
+ def sql_arg
203
+ args[1]
204
+ end
205
+
206
+ def expand_sql(sql, param)
207
+ # command line option takes precedence over group option
208
+ sql = sql_arg if sql_arg
209
+
210
+ return unless sql
211
+
212
+ # vars must match \w
213
+ missing_vars = sql.scan(/{\w+}/).map { |v| v[1..-2] }
214
+
215
+ vars = {}
216
+ if param
217
+ vars["id"] = cast(param)
218
+ vars["1"] = cast(param)
219
+ end
220
+
221
+ sql = sql.dup
222
+ vars.each do |k, v|
223
+ # only sub if in var list
224
+ sql.gsub!("{#{k}}", cast(v)) if missing_vars.delete(k)
225
+ end
226
+
227
+ raise Error, "Missing variables: #{missing_vars.uniq.join(", ")}" if missing_vars.any?
228
+
229
+ sql
230
+ end
231
+
232
+ # TODO quote vars in next major version
233
+ def cast(value)
234
+ value.to_s.gsub(/\A\"|\"\z/, '')
235
+ end
236
+ end
237
+ end