exwiw 0.5.1 → 0.5.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +15 -0
- data/README.md +3 -2
- data/docs/optimization-notes.md +126 -0
- data/docs/optimize-mongodb-export-with-native-ext.md +229 -0
- data/docs/plans/2026-05-15-insert-000-schema-file.md +4 -4
- data/docs/plans/2026-05-16-mongodb-from-clean-scenario.md +8 -8
- data/docs/plans/2026-05-22-postgres-copy-mode-scenario-test.md +7 -7
- data/docs/plans/2026-05-31-ids-column-for-sql-adapters.md +1 -1
- data/docs/plans/2026-06-19-mongodb-export-remove-parallelism-native-ext.md +70 -0
- data/lib/exwiw/adapter/mongodb_adapter.rb +208 -43
- data/lib/exwiw/adapter/postgresql_adapter.rb +18 -1
- data/lib/exwiw/adapter.rb +10 -0
- data/lib/exwiw/determine_table_processing_order.rb +142 -25
- data/lib/exwiw/explain_runner.rb +1 -1
- data/lib/exwiw/runner.rb +25 -7
- data/lib/exwiw/version.rb +1 -1
- data/mise.toml +2 -2
- metadata +4 -1
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
require 'json'
|
|
4
|
+
require 'set'
|
|
4
5
|
|
|
5
6
|
# NOTE: This adapter consumes MongodbCollectionConfig (`fields` instead of
|
|
6
7
|
# `columns`, plus `embedded_in`). Top-level collections are dumped as one
|
|
@@ -13,6 +14,57 @@ module Exwiw
|
|
|
13
14
|
Exwiw::MongodbCollectionConfig
|
|
14
15
|
end
|
|
15
16
|
|
|
17
|
+
# Streaming, lazily-evaluated substitute for the fully materialized array
# that #execute previously returned. The Mongo view is wrapped instead of
# being drained with `.to_a`, so documents are only pulled as the consumer
# iterates (the Runner is expected to go through `each_slice`).
#
# Interface offered to the consumer:
#   - #size / #length: record count, obtained from a `count_documents` call
#     on the view (memoized) instead of by walking the cursor.
#   - #each (and everything Enumerable builds on it): one pass over the
#     view. For every propagation key the document's value is recorded
#     BEFORE the document is yielded (i.e. before the caller can mask it);
#     the recorded values are stored into the shared state hash under the
#     collection name only after the pass has finished.
#
# Contract note: state capture is deferred until the result is consumed. A
# caller that only needs the state must still iterate (e.g. `.to_a`), and a
# pass that is abandoned part-way publishes nothing.
class StreamingResult
  include Enumerable

  # @param view       object responding to #each and #count_documents
  # @param collection name under which captured values are published
  # @param keys       document fields whose values are captured
  # @param state      hash that receives `{ key => [values...] }`
  def initialize(view:, collection:, keys:, state:)
    @view = view
    @collection = collection
    @keys = keys
    @state = state
  end

  # Record count as reported by the view; queried once, then cached.
  def size
    @size ||= @view.count_documents
  end
  alias length size

  # Yields every document of the view in order. Without a block, returns a
  # sized Enumerator. Returns self after a completed pass.
  def each
    return to_enum(:each) { size } unless block_given?

    harvested = {}
    @keys.each { |key| harvested[key] = [] }

    @view.each do |document|
      # Capture first: the consumer may rewrite the document in place.
      @keys.each { |key| harvested[key] << document[key] }
      yield document
    end

    @state[@collection] = harvested
    self
  end
end
|
|
67
|
+
|
|
16
68
|
def initialize(connection_config, logger)
|
|
17
69
|
super
|
|
18
70
|
@state = {}
|
|
@@ -71,16 +123,7 @@ module Exwiw
|
|
|
71
123
|
{ config.primary_key => { "$in" => coerce_ids(dump_target.ids) } }
|
|
72
124
|
end
|
|
73
125
|
else
|
|
74
|
-
config
|
|
75
|
-
# Constrain by the parent field this FK actually references
|
|
76
|
-
# (`relation.references`, default the parent primary_key). The
|
|
77
|
-
# values were captured from that field's documents in #execute, so
|
|
78
|
-
# their BSON type already matches the stored FK — no coercion.
|
|
79
|
-
values = parent_state_for(relation, config_by_name)
|
|
80
|
-
next if values.nil? || values.empty?
|
|
81
|
-
|
|
82
|
-
acc[relation.foreign_key] = { "$in" => values }
|
|
83
|
-
end
|
|
126
|
+
related_collection_filter(config, config_by_name)
|
|
84
127
|
end
|
|
85
128
|
|
|
86
129
|
Exwiw::MongoQuery::Find.new(
|
|
@@ -94,22 +137,24 @@ module Exwiw
|
|
|
94
137
|
# Builds the Mongo find for `query` and returns it wrapped in a
# StreamingResult; nothing is fetched here.
#
# The propagation keys come from @propagation_keys (set by the build_query
# call for this same collection). If that was never set — e.g. execute is
# driven in isolation from a test — the query's primary key is used.
# The values for those keys are captured while the result is streamed and
# land in @state once the pass completes (see StreamingResult).
def execute(query)
  @logger.debug(" Executing Mongo find on '#{query.collection}': filter=#{query.filter.inspect} projection=#{query.projection.inspect}")

  filtered = db[query.collection].find(query.filter)
  projected = filtered.projection(query.projection)
  view = projected.comment(query_comment_text("collection=#{query.collection}"))

  keys = @propagation_keys
  keys ||= [query.primary_key]

  StreamingResult.new(view: view, collection: query.collection, keys: keys, state: @state)
end
|
|
114
159
|
|
|
115
160
|
# NOTE: relies on @embedded_children_by_parent set by a prior build_query
|
|
@@ -118,9 +163,9 @@ module Exwiw
|
|
|
118
163
|
# to_bulk_insert (SQL adapters don't need it). Safe in Runner, fragile in
|
|
119
164
|
# tests — call build_query first.
|
|
120
165
|
# Serializes `rows` as JSONL: each document is masked in place using the
# mask plan for `config`, converted with #extended_json, and emitted as one
# JSON line. Lines are joined with "\n" (no trailing newline).
def to_bulk_insert(rows, config)
  compiled_plan = mask_plan(config)

  json_lines = rows.map do |document|
    apply_mask_plan!(document, compiled_plan)
    JSON.generate(extended_json(document))
  end

  json_lines.join("\n")
end
|
|
@@ -143,6 +188,20 @@ module Exwiw
|
|
|
143
188
|
'jsonl'
|
|
144
189
|
end
|
|
145
190
|
|
|
191
|
+
# Upper bound on the number of documents serialized in one go when the
# collection config does not specify its own bulk_insert_chunk_size.
# A MongoDB dump is one JSONL line per document; without chunking the whole
# collection's output would be built as a single large string while the
# result set is still alive. Chunking lets each slice be written out and
# its serialized string (plus the transient extended-JSON trees) released
# before the next slice is built.
DEFAULT_BULK_INSERT_CHUNK_SIZE = 1_000

# Chunk size applied when a collection config sets none.
def default_bulk_insert_chunk_size
  DEFAULT_BULK_INSERT_CHUNK_SIZE
end
|
|
204
|
+
|
|
146
205
|
# Extension string for the schema output ('js').
# NOTE(review): presumably consumed when naming the mongosh-applicable
# schema file — confirm against the caller.
def schema_output_extension
  'js'
end
|
|
@@ -160,6 +219,14 @@ module Exwiw
|
|
|
160
219
|
|
|
161
220
|
collections = ordered_tables.reject(&:embedded?)
|
|
162
221
|
|
|
222
|
+
# Index listing targets a specific collection, and MongoDB raises
|
|
223
|
+
# NamespaceNotFound (code 26) for one that does not exist. The schema may
|
|
224
|
+
# declare collections absent from this database (schema/DB drift, or a
|
|
225
|
+
# sparse dev DB), so resolve the set that actually exists up front and emit
|
|
226
|
+
# indexes only for those. `createCollection` is still emitted for every
|
|
227
|
+
# config below, so the target schema is created in full regardless.
|
|
228
|
+
existing_collections = db.database.collection_names.to_set
|
|
229
|
+
|
|
163
230
|
File.open(output_path, 'w') do |file|
|
|
164
231
|
file.puts("// Auto-generated by exwiw. Apply with: mongosh \"$MONGODB_URI\" #{File.basename(output_path)}")
|
|
165
232
|
file.puts
|
|
@@ -172,6 +239,11 @@ module Exwiw
|
|
|
172
239
|
|
|
173
240
|
collections.each do |config|
|
|
174
241
|
name = config.name
|
|
242
|
+
unless existing_collections.include?(name)
|
|
243
|
+
@logger.debug(" Collection '#{name}' is not present in the source database; emitting no indexes.")
|
|
244
|
+
next
|
|
245
|
+
end
|
|
246
|
+
|
|
175
247
|
indexes = db[name].indexes.to_a.reject { |idx| idx['name'] == '_id_' }
|
|
176
248
|
indexes.each do |idx|
|
|
177
249
|
key = idx['key']
|
|
@@ -274,6 +346,39 @@ module Exwiw
|
|
|
274
346
|
([config.primary_key] + referenced).uniq
|
|
275
347
|
end
|
|
276
348
|
|
|
349
|
+
# Scoping filter for a non-target collection, derived from the ids captured
# for its belongs_to parents. Every belongs_to whose parent yielded values
# adds `foreign_key => { "$in" => values }`; the values come from
# #parent_state_for (the parent field the FK references), so they are used
# as captured, without coercion.
#
# A belongs_to whose parent yielded nothing (nil or empty) adds no
# constraint — the parent matched nothing, or is not dumped on its own
# (e.g. an embedded collection, or one excluded from the run).
#
# Outcomes:
#   - at least one constraint            -> that filter
#   - no belongs_to at all               -> `{}` (reference/master data,
#                                            dumped in full)
#   - belongs_to present, none usable    -> warn and return a filter that
#     matches nothing, because falling back to `{}` would dump the ENTIRE
#     collection across every scope.
private def related_collection_filter(config, config_by_name)
  constraints = {}
  config.belongs_tos.each do |relation|
    parent_values = parent_state_for(relation, config_by_name)
    unless parent_values.nil? || parent_values.empty?
      constraints[relation.foreign_key] = { "$in" => parent_values }
    end
  end

  return constraints if !constraints.empty? || config.belongs_tos.none?

  @logger.warn(
    " Collection '#{config.name}' has belongs_to but no parent produced ids to scope by " \
    "(parents matched nothing, or are not dumped on their own such as embedded collections). " \
    "Constraining it to match no rows to avoid an unscoped full-collection dump."
  )
  { config.primary_key => { "$in" => [] } }
end
|
|
381
|
+
|
|
277
382
|
# The captured parent-collection values a child belongs_to should be
|
|
278
383
|
# constrained by: the values of the parent field the FK references
|
|
279
384
|
# (`relation.references`, default the parent primary_key). nil when the
|
|
@@ -287,41 +392,101 @@ module Exwiw
|
|
|
287
392
|
parent_fields[reference_field]
|
|
288
393
|
end
|
|
289
394
|
|
|
290
|
-
|
|
291
|
-
|
|
395
|
+
# Masking plan for one collection config, built once and applied to every
# document of that collection.
#   masked_fields - list of `[field_name, template_segments]`, one entry per
#                   field that carries a `replace_with`
#   embedded      - list of EmbeddedMask, one per embedded child
MaskPlan = Struct.new(:masked_fields, :embedded)

# Mask for one embedded child with its parent path already split:
#   prefix - path segments of the containers to descend into
#   last   - name of the field holding the subdocument(s)
#   plan   - the child's own MaskPlan
EmbeddedMask = Struct.new(:prefix, :last, :plan)
|
|
405
|
+
|
|
406
|
+
# Returns the MaskPlan for `config`, building it on first use and serving
# the cached instance afterwards.
#
# Masking touches every document and every embedded subdocument, so the
# per-config decisions (which fields carry a `replace_with`, how each
# template splits into segments, where embedded children live) are compiled
# once here; #apply_mask_plan! is then left with only the per-document work.
#
# The cache is keyed by config name, which assumes names are unique within
# a run and configs do not change mid-dump. Building a plan relies on
# @embedded_children_by_parent, set by the build_query call that precedes
# to_bulk_insert.
private def mask_plan(config)
  @mask_plans ||= {}
  @mask_plans[config.name] ||= build_mask_plan(config)
end
|
|
421
|
+
|
|
422
|
+
# Compiles the MaskPlan for `config` (uncached; see #mask_plan):
#   - every field with a `replace_with` becomes `[name, compiled template]`
#   - every embedded child becomes an EmbeddedMask whose dotted
#     `embedded_in.path` is split into leading container segments and the
#     final field name, paired with the child's own recursively built plan.
private def build_mask_plan(config)
  replaceable = config.fields.select { |field| field.replace_with }
  masked_fields = replaceable.map do |field|
    [field.name, compile_template(field.replace_with)]
  end

  embedded = embedded_children_of(config).map do |child|
    segments = child.embedded_in.path.split(".")
    EmbeddedMask.new(segments[0...-1], segments.last, build_mask_plan(child))
  end

  MaskPlan.new(masked_fields, embedded)
end
|
|
300
434
|
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
435
|
+
# Masks `doc` in place according to a precompiled MaskPlan.
#
# First every masked field is overwritten with its rendered template. Then,
# for each embedded child, the prefix path is followed through nested
# hashes; if it leads to a Hash, the value under `last` is masked with the
# child's plan — each Hash element when it is an Array, or the value itself
# when it is a Hash. Anything else (missing path, non-Hash container,
# scalar value, non-Hash array element) is left untouched.
private def apply_mask_plan!(doc, plan)
  plan.masked_fields.each do |(field_name, template)|
    doc[field_name] = render_template(template, doc)
  end

  plan.embedded.each do |embedded_mask|
    holder = doc
    embedded_mask.prefix.each do |segment|
      holder = holder.is_a?(Hash) ? holder[segment] : nil
    end
    next unless holder.is_a?(Hash)

    target = holder[embedded_mask.last]
    if target.is_a?(Array)
      target.each do |element|
        apply_mask_plan!(element, embedded_mask.plan) if element.is_a?(Hash)
      end
    elsif target.is_a?(Hash)
      apply_mask_plan!(target, embedded_mask.plan)
    end
  end
end
|
|
309
453
|
|
|
310
|
-
|
|
311
|
-
|
|
454
|
+
PLACEHOLDER_PATTERN = /\{([^{}]+)\}/

# Pre-parses a `replace_with` template into an ordered list of segments so
# that rendering is plain concatenation rather than a per-document regex
# substitution. Intended to run once per masked field at plan-build time.
#
# Segment forms:
#   - String      literal text, passed through unchanged
#   - [ref]       one-element Array marking a `{ref}` placeholder
#
# Empty literals are never emitted, so "" compiles to [].
private def compile_template(template)
  cursor = 0
  parts = []

  loop do
    hit = PLACEHOLDER_PATTERN.match(template, cursor)
    break if hit.nil?

    start, finish = hit.offset(0)
    # Literal text between the previous placeholder and this one, if any.
    parts.push(template[cursor...start]) if start > cursor
    parts.push([hit[1]])
    cursor = finish
  end

  # Trailing literal after the last placeholder.
  parts.push(template[cursor..]) unless cursor >= template.length
  parts
end
|
|
313
474
|
|
|
314
|
-
private def
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
when Hash then yield value
|
|
475
|
+
# Renders compiled template segments against `doc` into a new String.
# String segments are appended as-is; a one-element Array segment `[ref]`
# is replaced by `doc[ref].to_s`, or by nothing when `doc` has no such key.
private def render_template(segments, doc)
  segments.each_with_object(+'') do |segment, rendered|
    unless segment.is_a?(Array)
      rendered << segment
      next
    end

    key = segment.first
    rendered << (doc.key?(key) ? doc[key].to_s : '')
  end
end
|
|
487
|
+
|
|
488
|
+
# Embedded child configs registered under `parent_config`'s name in
# @embedded_children_by_parent; an empty array when there are none.
private def embedded_children_of(parent_config)
  children_by_parent = @embedded_children_by_parent
  children_by_parent.fetch(parent_config.name) { [] }
end
|
|
326
491
|
|
|
327
492
|
private def extended_json(doc)
|
|
@@ -65,7 +65,18 @@ module Exwiw
|
|
|
65
65
|
ext_ddl = extensions.map do |extname, schema|
|
|
66
66
|
stmt = "CREATE EXTENSION IF NOT EXISTS #{connection.quote_ident(extname)}"
|
|
67
67
|
stmt += " SCHEMA #{connection.quote_ident(schema)}" unless schema == "public"
|
|
68
|
-
|
|
68
|
+
# Best-effort prepend: a restore target that genuinely cannot create the
|
|
69
|
+
# extension should not abort the whole restore. Two such cases are caught:
|
|
70
|
+
# feature_not_supported (0A000) -- the extension's binaries are unavailable
|
|
71
|
+
# invalid_schema_name (3F000) -- the extension's required schema is absent
|
|
72
|
+
# insufficient_privilege (42501) is deliberately NOT caught: a restore role
|
|
73
|
+
# lacking CREATE privilege is a misconfiguration to fix, not to skip silently.
|
|
74
|
+
# The skip is re-raised as a WARNING so it surfaces in the restore logs
|
|
75
|
+
# instead of vanishing.
|
|
76
|
+
warning = connection.escape_literal("exwiw: skipped CREATE EXTENSION #{extname} (SQLSTATE %): %")
|
|
77
|
+
"DO $$ BEGIN #{stmt}; " \
|
|
78
|
+
"EXCEPTION WHEN feature_not_supported OR invalid_schema_name THEN " \
|
|
79
|
+
"RAISE WARNING #{warning}, SQLSTATE, SQLERRM; END $$;"
|
|
69
80
|
end.join("\n") + "\n\n"
|
|
70
81
|
@logger.debug(" Found #{extensions.size} extension(s) to prepend.")
|
|
71
82
|
stdout = ext_ddl + stdout
|
|
@@ -382,11 +393,17 @@ module Exwiw
|
|
|
382
393
|
end
|
|
383
394
|
|
|
384
395
|
private def query_extensions
|
|
396
|
+
# Skip plpgsql (always present) and managed-platform bookkeeping extensions
|
|
397
|
+
# (google_*/rds_*/aiven_*). pglogical is also skipped: it is a logical-
|
|
398
|
+
# replication mechanism of the source, not part of the data being copied,
|
|
399
|
+
# and its dedicated `pglogical` schema is typically absent on the restore
|
|
400
|
+
# target — so prepending CREATE EXTENSION for it only breaks the restore.
|
|
385
401
|
sql = <<~SQL
|
|
386
402
|
SELECT e.extname, n.nspname
|
|
387
403
|
FROM pg_extension e
|
|
388
404
|
JOIN pg_namespace n ON n.oid = e.extnamespace
|
|
389
405
|
WHERE e.extname != 'plpgsql'
|
|
406
|
+
AND e.extname != 'pglogical'
|
|
390
407
|
AND e.extname NOT LIKE 'google\\_%' ESCAPE '\\'
|
|
391
408
|
AND e.extname NOT LIKE 'rds\\_%' ESCAPE '\\'
|
|
392
409
|
AND e.extname NOT LIKE 'aiven\\_%' ESCAPE '\\'
|
data/lib/exwiw/adapter.rb
CHANGED
|
@@ -113,6 +113,16 @@ module Exwiw
|
|
|
113
113
|
raise NotImplementedError, "COPY format is not supported by #{self.class.name}"
|
|
114
114
|
end
|
|
115
115
|
|
|
116
|
+
# Bulk-insert chunk size used when a table config leaves it unset.
#
# Each chunk is streamed to the output file by the Runner, so a non-nil
# value caps how much serialized output (and how many short-lived
# intermediate objects) exist in memory at one time. The base
# implementation returns nil, which keeps SQL adapters at one statement per
# table as before. Adapters that build large per-row output (e.g. MongoDB
# JSONL) override this with a positive value.
def default_bulk_insert_chunk_size
  nil
end
|
|
125
|
+
|
|
116
126
|
# Run the database-specific EXPLAIN for the given query and return the
|
|
117
127
|
# output as a single string for `explain` subcommand to print.
|
|
118
128
|
# SQL adapters override; MongodbAdapter currently raises.
|
|
@@ -1,35 +1,58 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
require "set"
|
|
4
|
+
|
|
3
5
|
module Exwiw
  # Orders tables so that each one comes after the belongs_to parents it
  # depends on, breaking genuine cycles (with a warning) instead of failing.
  module DetermineTableProcessingOrder
    module_function

    # @param tables [Array<Exwiw::TableConfig>] tables
    # @param logger [Logger, nil] receives a warning when a cycle has to be broken
    # @return [Array<String>] sorted table names
    def run(tables, logger: nil)
      return tables.map(&:name) if tables.size < 2

      pending = {}
      tables.each { |table| pending[table.name] = table }

      # Only dependencies on tables that are part of this run can block.
      # A belongs_to aimed at anything else (an embedded MongoDB collection,
      # a table excluded from the run) is ignored for ordering; otherwise it
      # would never resolve and would look like a circular dependency.
      present_names = pending.keys.to_set

      emitted = Set.new
      sequence = []

      until pending.empty?
        batch = pending.values.select do |table|
          unresolved_dependencies(table, present_names, emitted).empty?
        end

        if batch.empty?
          # Tables remain but none is unblocked: the belongs_to graph has a
          # real cycle. Emit a single cycle member (see pick_cycle_victim)
          # rather than aborting, and warn about the dropped constraint.
          victim = pick_cycle_victim(pending.values, present_names, emitted)
          warn_cycle_break(logger, victim, unresolved_dependencies(victim, present_names, emitted))
          batch = [victim]
        end

        # Acyclic path: every currently unblocked table, in insertion order.
        # Cycle-break path: exactly the one chosen victim.
        batch.each do |table|
          sequence << table.name
          emitted << table.name
          pending.delete(table.name)
        end
      end

      sequence
    end

    # Names of the tables `table` belongs to. Polymorphic belongs_to entries
    # are expected to have been expanded per concrete target by schema
    # generation, so every entry is a plain table name.
    def compute_table_dependencies(table)
      table.belongs_tos.map(&:table_name)
    end

    # Dependencies that still block `table`: targets that take part in this
    # run and have not been ordered yet. A self-reference never blocks.
    private_class_method def unresolved_dependencies(table, present_names, ordered)
      compute_table_dependencies(table).uniq.reject do |dep|
        dep == table.name || ordered.include?(dep) || !present_names.include?(dep)
      end
    end

    # Picks the table to emit when ordering is stuck. Only members of a
    # non-trivial strongly-connected component of the unresolved-dependency
    # graph qualify, so a table that merely waits on a cycle is never moved
    # ahead of its parent. Members that already have an ordered parent are
    # preferred (their extraction stays constrained); ties are broken by
    # fewest unresolved dependencies, then by name.
    private_class_method def pick_cycle_victim(remaining, present_names, ordered)
      blockers = {}
      remaining.each do |table|
        blockers[table.name] = unresolved_dependencies(table, present_names, ordered)
      end
      cycle_names = strongly_connected_members(blockers)

      in_cycle = remaining.select { |table| cycle_names.include?(table.name) }
      in_cycle = remaining if in_cycle.empty? # defensive; a stall implies a cycle

      with_parent = in_cycle.select { |table| ordered_parent?(table, present_names, ordered) }
      shortlist = with_parent.empty? ? in_cycle : with_parent

      shortlist.min_by { |table| [blockers[table.name].size, table.name] }
    end

    # Whether `table` has a belongs_to (other than to itself) whose target
    # takes part in this run and has already been ordered.
    private_class_method def ordered_parent?(table, present_names, ordered)
      compute_table_dependencies(table).any? do |dep|
        ordered.include?(dep) && present_names.include?(dep) && dep != table.name
      end
    end

    # Names that belong to a strongly-connected component of size > 1 in
    # `adjacency` (table name -> unresolved dependency names). Iterative
    # Tarjan with an explicit frame stack; roots and edges are walked in name
    # order for determinism. Self-edges are absent from the adjacency, so a
    # single-node component is never reported.
    private_class_method def strongly_connected_members(adjacency)
      discovery = {}
      lowlink = {}
      stacked = {}
      component_stack = []
      next_index = 0
      cyclic = Set.new
      sorted_edges = adjacency.transform_values(&:sort)

      adjacency.keys.sort.each do |root|
        next if discovery.key?(root)

        # Each frame is [node, index of the next edge to examine].
        frames = [[root, 0]]
        until frames.empty?
          frame = frames.last
          current, cursor = frame

          if cursor.zero?
            # First visit of this node.
            discovery[current] = next_index
            lowlink[current] = next_index
            next_index += 1
            component_stack.push(current)
            stacked[current] = true
          end

          edges = sorted_edges[current] || []
          if cursor < edges.size
            frame[1] += 1
            target = edges[cursor]
            next unless adjacency.key?(target) # ignore edges leaving the remaining set

            if discovery.key?(target)
              lowlink[current] = [lowlink[current], discovery[target]].min if stacked[target]
            else
              frames.push([target, 0])
            end
          else
            # All edges handled: close the component if `current` is its root.
            if lowlink[current] == discovery[current]
              component = []
              loop do
                popped = component_stack.pop
                stacked[popped] = false
                component << popped
                break if popped == current
              end
              cyclic.merge(component) if component.size > 1
            end

            frames.pop
            unless frames.empty?
              parent = frames.last[0]
              lowlink[parent] = [lowlink[parent], lowlink[current]].min
            end
          end
        end
      end

      cyclic
    end

    # Logs which table was ordered ahead of which parents; no-op when no
    # logger was supplied.
    private_class_method def warn_cycle_break(logger, victim, dropped)
      logger&.warn(
        "Circular belongs_to dependency detected. Breaking it by ordering " \
        "'#{victim.name}' before its parent table(s): #{dropped.join(', ')}. The dropped " \
        "relationship is not enforced while ordering, so '#{victim.name}' is extracted " \
        "without that parent constraint (the mongodb adapter may then match a superset of " \
        "rows; SQL output may not load in foreign-key order). To break the cycle explicitly " \
        "instead, mark one of the belongs_to entries forming it with `ignore: true`."
      )
    end
  end
end
|
data/lib/exwiw/explain_runner.rb
CHANGED
|
@@ -30,7 +30,7 @@ module Exwiw
|
|
|
30
30
|
QueryAstBuilder.validate_scope!(dumpable_configs, table_by_name, @dump_target, @logger)
|
|
31
31
|
|
|
32
32
|
@logger.debug("Determining table processing order...")
|
|
33
|
-
ordered_table_names = DetermineTableProcessingOrder.run(dumpable_configs)
|
|
33
|
+
ordered_table_names = DetermineTableProcessingOrder.run(dumpable_configs, logger: @logger)
|
|
34
34
|
|
|
35
35
|
total_size = ordered_table_names.size
|
|
36
36
|
ordered_table_names.each_with_index do |table_name, idx|
|