pgsync 0.5.2 → 0.6.1

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of pgsync might be problematic. Click here for more details.

@@ -0,0 +1,315 @@
1
module PgSync
  # Syncs a single table from the source database to the destination
  # database: streams rows over COPY (optionally in primary-key batches),
  # rewrites columns matched by "data_rules" with obfuscation expressions,
  # and brings shared sequences up to the source's values.
  class Task
    include Utils

    attr_reader :source, :destination, :config, :table, :opts
    # column metadata ([{name:, type:}, ...]) is assigned externally
    # (presumably by the caller that builds tasks) before perform is called
    attr_accessor :from_columns, :to_columns

    # source/destination - data-source objects wrapping PG connections
    # config - parsed configuration hash (read here for "data_rules")
    # table - table object (responds to schema/name; see rule_match?)
    # opts - option hash controlling batching, truncation, rules, etc.
    def initialize(source:, destination:, config:, table:, opts:)
      @source = source
      @destination = destination
      @config = config
      @table = table
      @opts = opts
    end

    # Fully quoted schema.table identifier for use in SQL.
    def quoted_table
      quote_ident_full(table)
    end

    # Entry point: runs the sync wrapped in notice capture, error-to-result
    # conversion, and (when requested) trigger disabling. Returns a result
    # hash ({status: "success"} or {status: "error", message: ...}).
    def perform
      with_notices do
        handle_errors do
          maybe_disable_triggers do
            sync_data
          end
        end
      end
    end

    # Column names present in the source table (memoized).
    def from_fields
      @from_fields ||= from_columns.map { |c| c[:name] }
    end

    # Column names present in the destination table (memoized).
    def to_fields
      @to_fields ||= to_columns.map { |c| c[:name] }
    end

    # Columns present on both sides — only these are copied.
    def shared_fields
      @shared_fields ||= to_fields & from_fields
    end

    # Sequences backing the shared columns on the source side;
    # empty when sequence syncing is disabled via opts[:no_sequences].
    def from_sequences
      @from_sequences ||= opts[:no_sequences] ? [] : source.sequences(table, shared_fields)
    end

    # Same as from_sequences, but for the destination.
    def to_sequences
      @to_sequences ||= opts[:no_sequences] ? [] : destination.sequences(table, shared_fields)
    end

    # Sequences present on both sides — only these get setval'd after copy.
    def shared_sequences
      @shared_sequences ||= to_sequences & from_sequences
    end

    # Human-readable warnings about schema drift between the two tables:
    # extra/missing columns, extra/missing sequences, and shared columns
    # whose declared types differ.
    def notes
      notes = []
      if shared_fields.empty?
        notes << "No fields to copy"
      else
        extra_fields = to_fields - from_fields
        notes << "Extra columns: #{extra_fields.join(", ")}" if extra_fields.any?

        missing_fields = from_fields - to_fields
        notes << "Missing columns: #{missing_fields.join(", ")}" if missing_fields.any?

        extra_sequences = to_sequences - from_sequences
        notes << "Extra sequences: #{extra_sequences.join(", ")}" if extra_sequences.any?

        missing_sequences = from_sequences - to_sequences
        notes << "Missing sequences: #{missing_sequences.join(", ")}" if missing_sequences.any?

        from_types = from_columns.map { |c| [c[:name], c[:type]] }.to_h
        to_types = to_columns.map { |c| [c[:name], c[:type]] }.to_h
        different_types = []
        shared_fields.each do |field|
          if from_types[field] != to_types[field]
            different_types << "#{field} (#{from_types[field]} -> #{to_types[field]})"
          end
        end
        notes << "Different column types: #{different_types.join(", ")}" if different_types.any?
      end
      notes
    end

    # Copies the table's data. Three strategies, chosen from opts:
    # 1. in_batches: copy primary-key ranges of batch_size rows, resuming
    #    past the destination's current max id
    # 2. overwrite/preserve/sql (without truncate): load into a temp table,
    #    then INSERT ... ON CONFLICT into the real table
    # 3. default: truncate (or DELETE when constraints are deferred) and
    #    copy everything
    # Afterwards, shared sequences are set to the source's values.
    # Returns {status: "success"}; errors propagate to handle_errors.
    def sync_data
      # callers are expected to skip tables with nothing to copy
      raise Error, "This should never happen. Please file a bug." if shared_fields.empty?

      # String.new: unfrozen buffer so << works under frozen_string_literal
      sql_clause = String.new("")
      sql_clause << " #{opts[:sql]}" if opts[:sql]

      # data rules map matching columns to obfuscation SQL expressions
      bad_fields = opts[:no_rules] ? [] : config["data_rules"]
      primary_key = destination.primary_key(table)
      # for each shared column: use the rule's expression (aliased back to
      # the column name) when a rule matches, otherwise select it verbatim
      copy_fields = shared_fields.map { |f| f2 = bad_fields.to_a.find { |bf, _| rule_match?(table, f, bf) }; f2 ? "#{apply_strategy(f2[1], table, f, primary_key)} AS #{quote_ident(f)}" : "#{quoted_table}.#{quote_ident(f)}" }.join(", ")
      fields = shared_fields.map { |f| quote_ident(f) }.join(", ")

      # capture source sequence values up front so they can be applied
      # to the destination after the rows are copied
      seq_values = {}
      shared_sequences.each do |seq|
        seq_values[seq] = source.last_value(seq)
      end

      copy_to_command = "COPY (SELECT #{copy_fields} FROM #{quoted_table}#{sql_clause}) TO STDOUT"
      if opts[:in_batches]
        raise Error, "No primary key" if primary_key.empty?
        # batching only supports a single-column primary key
        primary_key = primary_key.first

        destination.truncate(table) if opts[:truncate]

        from_max_id = source.max_id(table, primary_key)
        # resume one past whatever the destination already has
        to_max_id = destination.max_id(table, primary_key) + 1

        if to_max_id == 1
          # destination is empty; start from the source's min id instead
          from_min_id = source.min_id(table, primary_key)
          to_max_id = from_min_id if from_min_id > 0
        end

        starting_id = to_max_id
        batch_size = opts[:batch_size]

        i = 1
        batch_count = ((from_max_id - starting_id + 1) / batch_size.to_f).ceil

        # copy half-open id ranges [starting_id, starting_id + batch_size)
        while starting_id <= from_max_id
          where = "#{quote_ident(primary_key)} >= #{starting_id} AND #{quote_ident(primary_key)} < #{starting_id + batch_size}"
          log " #{i}/#{batch_count}: #{where}"

          # TODO be smarter for advance sql clauses
          batch_sql_clause = " #{sql_clause.length > 0 ? "#{sql_clause} AND" : "WHERE"} #{where}"

          batch_copy_to_command = "COPY (SELECT #{copy_fields} FROM #{quoted_table}#{batch_sql_clause}) TO STDOUT"
          copy(batch_copy_to_command, dest_table: table, dest_fields: fields)

          starting_id += batch_size
          i += 1

          # optional throttle between batches (skipped after the last one)
          if opts[:sleep] && starting_id <= from_max_id
            sleep(opts[:sleep])
          end
        end
      elsif !opts[:truncate] && (opts[:overwrite] || opts[:preserve] || !sql_clause.empty?)
        # upsert path: needs a primary key for ON CONFLICT
        raise Error, "No primary key" if primary_key.empty?

        # create a temp table
        temp_table = "pgsync_#{rand(1_000_000_000)}"
        destination.execute("CREATE TEMPORARY TABLE #{quote_ident_full(temp_table)} AS TABLE #{quoted_table} WITH NO DATA")

        # load data
        copy(copy_to_command, dest_table: temp_table, dest_fields: fields)

        on_conflict = primary_key.map { |pk| quote_ident(pk) }.join(", ")
        action =
          if opts[:preserve]
            # keep existing destination rows untouched
            "NOTHING"
          else # overwrite or sql clause
            setter = shared_fields.reject { |f| primary_key.include?(f) }.map { |f| "#{quote_ident(f)} = EXCLUDED.#{quote_ident(f)}" }
            "UPDATE SET #{setter.join(", ")}"
          end
        destination.execute("INSERT INTO #{quoted_table} (SELECT * FROM #{quote_ident_full(temp_table)}) ON CONFLICT (#{on_conflict}) DO #{action}")
      else
        # use delete instead of truncate for foreign keys
        if opts[:defer_constraints]
          destination.execute("DELETE FROM #{quoted_table}")
        else
          destination.truncate(table)
        end
        copy(copy_to_command, dest_table: table, dest_fields: fields)
      end
      # bring destination sequences in line with the source
      seq_values.each do |seq, value|
        destination.execute("SELECT setval(#{escape(seq)}, #{escape(value)})")
      end

      {status: "success"}
    end

    private

    # Captures Postgres NOTICE messages raised on either connection while
    # the block runs and attaches them to the result hash under :notices.
    # The default notice processor is always restored afterwards.
    def with_notices
      notices = []
      [source, destination].each do |data_source|
        data_source.send(:conn).set_notice_processor do |message|
          notices << message.strip
        end
      end
      result = yield
      result[:notices] = notices if result
      result
    ensure
      # clear notice processor
      [source, destination].each do |data_source|
        data_source.send(:conn).set_notice_processor
      end
    end

    # TODO add retries
    # Converts exceptions from the block into an error result hash so one
    # failed table doesn't abort the whole run. With opts[:debug], the
    # original exception is re-raised instead.
    def handle_errors
      yield
    rescue => e
      raise e if opts[:debug]

      message =
        case e
        when PG::ConnectionBad
          # likely fine to show simplified message here
          # the full message will be shown when first trying to connect
          "Connection failed"
        when PG::Error
          e.message.sub("ERROR: ", "")
        when Error
          e.message
        else
          "#{e.class.name}: #{e.message}"
        end

      {status: "error", message: message}
    end

    # Streams rows from the source connection's COPY ... TO STDOUT straight
    # into the destination connection's COPY ... FROM STDIN, row by row,
    # without buffering the whole table in memory.
    def copy(source_command, dest_table:, dest_fields:)
      destination_command = "COPY #{quote_ident_full(dest_table)} (#{dest_fields}) FROM STDIN"
      destination.conn.copy_data(destination_command) do
        source.conn.copy_data(source_command) do
          while (row = source.conn.get_copy_data)
            destination.conn.put_copy_data(row)
          end
        end
      end
    end

    # TODO better performance
    # Matches a data-rule pattern against a column. "*" in the rule matches
    # any run of characters except ".". The rule may target the bare column,
    # "table.column", or "schema.table.column".
    def rule_match?(table, column, rule)
      regex = Regexp.new('\A' + Regexp.escape(rule).gsub('\*','[^\.]*') + '\z')
      regex.match(column) || regex.match("#{table.name}.#{column}") || regex.match("#{table.schema}.#{table.name}.#{column}")
    end

    # TODO wildcard rules
    # Maps a data rule to the SQL expression selected in place of the raw
    # column. Hash rules supply a literal "value" or raw "statement";
    # string rules pick a named obfuscation strategy. The unique_* rules
    # derive deterministic fake values from the primary key.
    def apply_strategy(rule, table, column, primary_key)
      if rule.is_a?(Hash)
        if rule.key?("value")
          escape(rule["value"])
        elsif rule.key?("statement")
          rule["statement"]
        else
          raise Error, "Unknown rule #{rule.inspect} for column #{column}"
        end
      else
        case rule
        when "untouched"
          quote_ident(column)
        when "unique_email"
          "'email' || #{quoted_primary_key(table, primary_key, rule)}::text || '@example.org'"
        when "unique_phone"
          "(#{quoted_primary_key(table, primary_key, rule)}::bigint + 1000000000)::text"
        when "unique_secret"
          "'secret' || #{quoted_primary_key(table, primary_key, rule)}::text"
        when "random_int", "random_number"
          "(RANDOM() * 100)::int"
        when "random_date"
          "date '1970-01-01' + (RANDOM() * 10000)::int"
        when "random_time"
          "NOW() - (RANDOM() * 100000000)::int * INTERVAL '1 second'"
        when "random_ip"
          "(1 + RANDOM() * 254)::int::text || '.0.0.1'"
        when "random_letter"
          "chr(65 + (RANDOM() * 26)::int)"
        when "random_string"
          "RIGHT(MD5(RANDOM()::text), 10)"
        when "null", nil
          "NULL"
        else
          raise Error, "Unknown rule #{rule} for column #{column}"
        end
      end
    end

    # Quoted table.pk expression for unique_* rules; those rules require a
    # single-column primary key to derive values from.
    def quoted_primary_key(table, primary_key, rule)
      raise Error, "Single column primary key required for this data rule: #{rule}" unless primary_key.size == 1
      "#{quoted_table}.#{quote_ident(primary_key.first)}"
    end

    # When opts[:disable_integrity] and/or opts[:disable_user_triggers] are
    # set, disables the matching currently-enabled destination triggers for
    # the duration of the block (inside one transaction) and re-enables
    # exactly those triggers afterwards. Otherwise just yields.
    def maybe_disable_triggers
      if opts[:disable_integrity] || opts[:disable_user_triggers]
        destination.transaction do
          triggers = destination.triggers(table)
          # only consider triggers that are currently enabled
          triggers.select! { |t| t["enabled"] == "t" }
          internal_triggers, user_triggers = triggers.partition { |t| t["internal"] == "t" }
          integrity_triggers = internal_triggers.select { |t| t["integrity"] == "t" }
          restore_triggers = []

          if opts[:disable_integrity]
            integrity_triggers.each do |trigger|
              destination.execute("ALTER TABLE #{quoted_table} DISABLE TRIGGER #{quote_ident(trigger["name"])}")
            end
            restore_triggers.concat(integrity_triggers)
          end

          if opts[:disable_user_triggers]
            # important!
            # rely on Postgres to disable user triggers
            # we don't want to accidentally disable non-user triggers if logic above is off
            destination.execute("ALTER TABLE #{quoted_table} DISABLE TRIGGER USER")
            restore_triggers.concat(user_triggers)
          end

          result = yield

          # restore triggers that were previously enabled
          restore_triggers.each do |trigger|
            destination.execute("ALTER TABLE #{quoted_table} ENABLE TRIGGER #{quote_ident(trigger["name"])}")
          end

          result
        end
      else
        yield
      end
    end
  end
end
@@ -0,0 +1,235 @@
1
module PgSync
  # Expands command-line arguments, configured groups, and wildcard
  # patterns into the concrete list of per-table sync tasks.
  class TaskResolver
    include Utils

    # first_schema is stored but not referenced in this file —
    # presumably read by callers; verify against the rest of the gem
    attr_reader :args, :opts, :source, :destination, :config, :first_schema, :notes

    # args - positional CLI arguments (args[0] groups/tables, args[1] sql)
    # opts - option hash (:groups, :tables, :exclude, :schemas, ...)
    # source/destination - data-source objects used to list tables
    # config - parsed config hash (read here for "groups")
    def initialize(args:, opts:, source:, destination:, config:, first_schema:)
      @args = args
      @opts = opts
      @source = source
      @destination = destination
      @config = config
      @groups = config["groups"] || {}
      @first_schema = first_schema
      @notes = []
    end

    # Builds the task list: each task is a hash with :table (a Table,
    # fully resolved to a schema) and, for group/table tasks, :sql.
    # Falls back to all shared tables when nothing was specified.
    def tasks
      tasks = []

      # get lists from args
      groups, tables = process_args

      # expand groups into tasks
      groups.each do |group|
        tasks.concat(group_to_tasks(group))
      end

      # expand tables into tasks
      tables.each do |table|
        tasks.concat(table_to_tasks(table))
      end

      # get default if none given
      if !opts[:groups] && !opts[:tables] && args.size == 0
        tasks.concat(default_tasks)
      end

      # resolve any tables that need it
      tasks.each do |task|
        task[:table] = fully_resolve(task[:table])
      end

      tasks
    end

    # True when the given name is a configured group.
    def group?(group)
      @groups.key?(group)
    end

    private

    # Expands "group" or "group:param" into one task per table in the
    # group. Group entries may be [table, sql] pairs; the param is
    # substituted into the sql via expand_sql.
    def group_to_tasks(value)
      group, param = value.split(":", 2)
      raise Error, "Group not found: #{group}" unless group?(group)

      @groups[group].map do |table|
        table_sql = nil
        if table.is_a?(Array)
          table, table_sql = table
        end

        {
          table: to_table(table),
          sql: expand_sql(table_sql, param)
        }
      end
    end

    # Expands a table argument into tasks. A "*" wildcard matches any run
    # of characters except "." against shared tables (by full name or bare
    # name); otherwise the value names a single table. ":" params are only
    # supported for groups.
    def table_to_tasks(value)
      raise Error, "Cannot use parameters with tables" if value.include?(":")

      tables =
        if value.include?("*")
          regex = Regexp.new('\A' + Regexp.escape(value).gsub('\*','[^\.]*') + '\z')
          shared_tables.select { |t| regex.match(t.full_name) || regex.match(t.name) }
        else
          [to_table(value)]
        end

      tables.map do |table|
        {
          table: table,
          sql: sql_arg # doesn't support params
        }
      end
    end

    # treats identifiers as if they were quoted (Users == "Users")
    # this is different from Postgres (Users == "users")
    #
    # TODO add support for quoted identifiers like "my.schema"."my.table"
    # so it's possible to specify identifiers with "." in them
    #
    # Parses "table" (schema resolved later) or "schema.table" into a Table.
    def to_table(value)
      parts = value.split(".")
      case parts.size
      when 1
        # unknown schema
        Table.new(nil, parts[0])
      when 2
        Table.new(*parts)
      else
        raise Error, "Cannot resolve table: #{value}"
      end
    end

    # One task (no :sql) per shared table — used when nothing is specified.
    def default_tasks
      shared_tables.map do |table|
        {
          table: table
        }
      end
    end

    # tables that exists in both source and destination
    # used when no tables specified, or a wildcard
    # removes excluded tables and filters by schema
    def shared_tables
      tables = filter_tables(source.tables)

      unless opts[:schema_only] || opts[:schema_first]
        from_tables = tables
        to_tables = filter_tables(destination.tables)

        extra_tables = to_tables - from_tables
        notes << "Extra tables: #{extra_tables.map { |t| friendly_name(t) }.join(", ")}" if extra_tables.any?

        missing_tables = from_tables - to_tables
        notes << "Missing tables: #{missing_tables.map { |t| friendly_name(t) }.join(", ")}" if missing_tables.any?

        # keep only tables present on both sides
        tables &= to_tables
      end

      tables
    end

    # Restricts a table list to the requested schemas (defaulting to the
    # source's search path) and removes opts[:exclude] entries, which may
    # contain "*" wildcards.
    def filter_tables(tables)
      tables = tables.dup

      unless opts[:all_schemas]
        # could support wildcard schemas as well
        schemas = Set.new(opts[:schemas] ? to_arr(opts[:schemas]) : source.search_path)
        tables.select! { |t| schemas.include?(t.schema) }
      end

      to_arr(opts[:exclude]).each do |value|
        if value.include?("*")
          regex = Regexp.new('\A' + Regexp.escape(value).gsub('\*','[^\.]*') + '\z')
          tables.reject! { |t| regex.match(t.full_name) || regex.match(t.name) }
        else
          tables -= [fully_resolve(to_table(value))]
        end
      end

      tables
    end

    # Splits input into group names and table names. Positional args are
    # ambiguous, so each is treated as a group when its name (before any
    # ":param") matches a configured group, otherwise as a table.
    def process_args
      groups = to_arr(opts[:groups])
      tables = to_arr(opts[:tables])
      if args[0]
        # could be a group, table, or mix
        to_arr(args[0]).each do |value|
          if group?(value.split(":", 2)[0])
            groups << value
          else
            tables << value
          end
        end
      end
      [groups, tables]
    end

    # Map of bare table name -> Table, picking the table from the earliest
    # schema in the source's search path when the name exists in several.
    def no_schema_tables
      @no_schema_tables ||= begin
        search_path_index = source.search_path.map.with_index.to_h
        source.tables.group_by(&:name).map do |group, t2|
          [group, t2.select { |t| search_path_index[t.schema] }.sort_by { |t| search_path_index[t.schema] }.first]
        end.to_h
      end
    end

    # for tables without a schema, find the table in the search path
    def fully_resolve(table)
      return table if table.schema
      no_schema_tables[table.name] || (raise Error, "Table not found in source: #{table.name}")
    end

    # parse command line arguments and YAML
    def to_arr(value)
      if value.is_a?(Array)
        value
      else
        # Split by commas, but don't use commas inside double quotes
        # https://stackoverflow.com/questions/21105360/regex-find-comma-not-inside-quotes
        value.to_s.split(/(?!\B"[^"]*),(?![^"]*"\B)/)
      end
    end

    # The second positional CLI argument, used as a per-table sql clause.
    def sql_arg
      args[1]
    end

    # Substitutes {id}/{1} placeholders in a group's sql clause with the
    # group param. Raises when the clause contains placeholders that no
    # supplied variable fills.
    def expand_sql(sql, param)
      # command line option takes precedence over group option
      sql = sql_arg if sql_arg

      return unless sql

      # vars must match \w
      # starts as every placeholder found; entries are deleted as they are
      # substituted, so whatever remains is genuinely missing
      missing_vars = sql.scan(/{\w+}/).map { |v| v[1..-2] }

      vars = {}
      if param
        vars["id"] = cast(param)
        vars["1"] = cast(param)
      end

      sql = sql.dup
      vars.each do |k, v|
        # only sub if in var list
        sql.gsub!("{#{k}}", cast(v)) if missing_vars.delete(k)
      end

      raise Error, "Missing variables: #{missing_vars.uniq.join(", ")}" if missing_vars.any?

      sql
    end

    # TODO quote vars in next major version
    # Strips surrounding double quotes from a variable value.
    def cast(value)
      value.to_s.gsub(/\A\"|\"\z/, '')
    end
  end
end