exwiw 0.5.1 → 0.5.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require 'json'
4
+ require 'set'
4
5
 
5
6
  # NOTE: This adapter consumes MongodbCollectionConfig (`fields` instead of
6
7
  # `columns`, plus `embedded_in`). Top-level collections are dumped as one
@@ -13,6 +14,57 @@ module Exwiw
13
14
  Exwiw::MongodbCollectionConfig
14
15
  end
15
16
 
17
+ # A lazy, streaming stand-in for the materialized result array #execute
18
+ # used to return. Wrapping the Mongo cursor (instead of `.to_a`) keeps the
19
+ # dump's dominant memory cost — the full result set — off the heap: the
20
+ # Runner pulls documents through `each_slice`, so at most one chunk of
21
+ # documents (plus the small propagation-key arrays) is resident at a time,
22
+ # even for large or embed-heavy collections.
23
+ #
24
+ # It satisfies the two things the Runner asks of an execute result:
25
+ # - #size: the record count, used to skip empty collections and to log.
26
+ # Answered with a `count_documents` query (which only walks index
27
+ # entries, far cheaper than fetching every document) rather than by
28
+ # draining the cursor.
29
+ # - #each (via Enumerable / each_slice): a single streaming pass over the
30
+ # cursor. While streaming it captures — per propagation key, BEFORE
31
+ # handing the document to the caller's masking — the values downstream
32
+ # children will `$in`-match against, publishing them into @state once
33
+ # the pass completes.
34
+ #
35
+ # Contract note: unlike the old `.to_a` execute, which populated @state
36
+ # eagerly, this defers state capture until the result is consumed. The
37
+ # Runner always fully consumes a non-empty result before any child
38
+ # collection is processed, so propagation is unaffected; a caller that only
39
+ # needs @state must iterate the result (e.g. `.to_a`).
40
+ class StreamingResult
41
+ include Enumerable
42
+
43
+ def initialize(view:, collection:, keys:, state:)
44
+ @view = view
45
+ @collection = collection
46
+ @keys = keys
47
+ @state = state
48
+ end
49
+
50
+ def size
51
+ @size ||= @view.count_documents
52
+ end
53
+ alias length size
54
+
55
+ def each
56
+ return enum_for(:each) { size } unless block_given?
57
+
58
+ captured = @keys.each_with_object({}) { |key, acc| acc[key] = [] }
59
+ @view.each do |doc|
60
+ @keys.each { |key| captured[key] << doc[key] }
61
+ yield doc
62
+ end
63
+ @state[@collection] = captured
64
+ self
65
+ end
66
+ end
67
+
16
68
  def initialize(connection_config, logger)
17
69
  super
18
70
  @state = {}
@@ -71,16 +123,7 @@ module Exwiw
71
123
  { config.primary_key => { "$in" => coerce_ids(dump_target.ids) } }
72
124
  end
73
125
  else
74
- config.belongs_tos.each_with_object({}) do |relation, acc|
75
- # Constrain by the parent field this FK actually references
76
- # (`relation.references`, default the parent primary_key). The
77
- # values were captured from that field's documents in #execute, so
78
- # their BSON type already matches the stored FK — no coercion.
79
- values = parent_state_for(relation, config_by_name)
80
- next if values.nil? || values.empty?
81
-
82
- acc[relation.foreign_key] = { "$in" => values }
83
- end
126
+ related_collection_filter(config, config_by_name)
84
127
  end
85
128
 
86
129
  Exwiw::MongoQuery::Find.new(
@@ -94,22 +137,24 @@ module Exwiw
94
137
  def execute(query)
95
138
  @logger.debug(" Executing Mongo find on '#{query.collection}': filter=#{query.filter.inspect} projection=#{query.projection.inspect}")
96
139
 
97
- docs = db[query.collection]
140
+ view = db[query.collection]
98
141
  .find(query.filter)
99
142
  .projection(query.projection)
100
143
  .comment(query_comment_text("collection=#{query.collection}"))
101
- .to_a
102
144
 
103
- # Stash, per referenced field, the values children will `$in`-match
104
- # against. @propagation_keys is set by the build_query call for this same
145
+ # Per referenced field, the values children will `$in`-match against.
146
+ # @propagation_keys is set by the build_query call for this same
105
147
  # collection; fall back to the primary key if execute is driven without a
106
148
  # preceding build_query (e.g. in isolation from a test).
107
149
  keys = @propagation_keys || [query.primary_key]
108
- @state[query.collection] = keys.each_with_object({}) do |key, acc|
109
- acc[key] = docs.map { |doc| doc[key] }
110
- end
111
150
 
112
- docs
151
+ # Return a streaming view of the result set rather than `.to_a`-ing the
152
+ # whole collection into memory. The Runner pulls documents through
153
+ # `each_slice`, so only one chunk's worth is resident at a time even for
154
+ # large / embed-heavy collections — the dump's dominant memory cost. The
155
+ # propagation-key values are captured as the cursor streams and published
156
+ # into @state once the pass completes (see StreamingResult).
157
+ StreamingResult.new(view: view, collection: query.collection, keys: keys, state: @state)
113
158
  end
114
159
 
115
160
  # NOTE: relies on @embedded_children_by_parent set by a prior build_query
@@ -118,9 +163,9 @@ module Exwiw
118
163
  # to_bulk_insert (SQL adapters don't need it). Safe in Runner, fragile in
119
164
  # tests — call build_query first.
120
165
  def to_bulk_insert(rows, config)
166
+ plan = mask_plan(config)
121
167
  rows.map do |doc|
122
- apply_replace_with!(doc, config)
123
- apply_embedded_masking!(doc, config)
168
+ apply_mask_plan!(doc, plan)
124
169
  JSON.generate(extended_json(doc))
125
170
  end.join("\n")
126
171
  end
@@ -143,6 +188,20 @@ module Exwiw
143
188
  'jsonl'
144
189
  end
145
190
 
191
+ # Bound how many documents are serialized at once when a collection config
192
+ # carries no explicit bulk_insert_chunk_size. A MongoDB dump is one JSONL
193
+ # line per document and, without chunking, the Runner would materialize the
194
+ # entire collection's output as a single giant string while the full
195
+ # in-memory result set is still alive — doubling peak memory on large or
196
+ # embed-heavy collections. Chunking lets the Runner stream each slice to the
197
+ # file and release its serialized string (and the transient extended-JSON
198
+ # trees) before building the next.
199
+ DEFAULT_BULK_INSERT_CHUNK_SIZE = 1_000
200
+
201
+ def default_bulk_insert_chunk_size
202
+ DEFAULT_BULK_INSERT_CHUNK_SIZE
203
+ end
204
+
146
205
  def schema_output_extension
147
206
  'js'
148
207
  end
@@ -160,6 +219,14 @@ module Exwiw
160
219
 
161
220
  collections = ordered_tables.reject(&:embedded?)
162
221
 
222
+ # Index listing targets a specific collection, and MongoDB raises
223
+ # NamespaceNotFound (code 26) for one that does not exist. The schema may
224
+ # declare collections absent from this database (schema/DB drift, or a
225
+ # sparse dev DB), so resolve the set that actually exists up front and emit
226
+ # indexes only for those. `createCollection` is still emitted for every
227
+ # config below, so the target schema is created in full regardless.
228
+ existing_collections = db.database.collection_names.to_set
229
+
163
230
  File.open(output_path, 'w') do |file|
164
231
  file.puts("// Auto-generated by exwiw. Apply with: mongosh \"$MONGODB_URI\" #{File.basename(output_path)}")
165
232
  file.puts
@@ -172,6 +239,11 @@ module Exwiw
172
239
 
173
240
  collections.each do |config|
174
241
  name = config.name
242
+ unless existing_collections.include?(name)
243
+ @logger.debug(" Collection '#{name}' is not present in the source database; emitting no indexes.")
244
+ next
245
+ end
246
+
175
247
  indexes = db[name].indexes.to_a.reject { |idx| idx['name'] == '_id_' }
176
248
  indexes.each do |idx|
177
249
  key = idx['key']
@@ -274,6 +346,39 @@ module Exwiw
274
346
  ([config.primary_key] + referenced).uniq
275
347
  end
276
348
 
349
+ # Build the scoping filter for a non-target collection from its belongs_to
350
+ # parents' captured ids. Each belongs_to is constrained by the parent field
351
+ # the FK references (`relation.references`, default the parent primary_key);
352
+ # the values were captured from that field in #execute, so their BSON type
353
+ # already matches the stored FK — no coercion.
354
+ #
355
+ # A belongs_to whose parent produced no ids contributes no constraint:
356
+ # either the parent matched nothing, or it is not dumped here (e.g. an
357
+ # embedded collection, or one excluded from the run). If that leaves the
358
+ # filter empty even though the collection HAS belongs_to, the collection
359
+ # cannot be scoped from the dump target — and falling back to an empty `{}`
360
+ # filter would scan and dump the ENTIRE collection across every scope. That
361
+ # is never what a scoped extraction wants, so constrain it to match nothing
362
+ # and warn instead. (A collection with no belongs_to at all is genuine
363
+ # reference/master data and is still dumped in full via `{}`.)
364
+ private def related_collection_filter(config, config_by_name)
365
+ filter = config.belongs_tos.each_with_object({}) do |relation, acc|
366
+ values = parent_state_for(relation, config_by_name)
367
+ next if values.nil? || values.empty?
368
+
369
+ acc[relation.foreign_key] = { "$in" => values }
370
+ end
371
+
372
+ return filter unless filter.empty? && config.belongs_tos.any?
373
+
374
+ @logger.warn(
375
+ " Collection '#{config.name}' has belongs_to but no parent produced ids to scope by " \
376
+ "(parents matched nothing, or are not dumped on their own such as embedded collections). " \
377
+ "Constraining it to match no rows to avoid an unscoped full-collection dump."
378
+ )
379
+ { config.primary_key => { "$in" => [] } }
380
+ end
381
+
277
382
  # The captured parent-collection values a child belongs_to should be
278
383
  # constrained by: the values of the parent field the FK references
279
384
  # (`relation.references`, default the parent primary_key). nil when the
@@ -287,41 +392,101 @@ module Exwiw
287
392
  parent_fields[reference_field]
288
393
  end
289
394
 
290
- private def apply_replace_with!(doc, config)
291
- config.fields.each do |field|
395
+ # A masking plan compiled once per collection config and reused for every
396
+ # document of that collection. `masked_fields` is `[field_name,
397
+ # template_segments]` for each field carrying a `replace_with`;
398
+ # `embedded` is one EmbeddedMask per embedded child.
399
+ MaskPlan = Struct.new(:masked_fields, :embedded)
400
+
401
+ # A pre-resolved embedded-child mask: the parent path split once into
402
+ # `prefix` (the containers to descend into) and `last` (the field holding
403
+ # the subdocument(s)), plus the child's own MaskPlan.
404
+ EmbeddedMask = Struct.new(:prefix, :last, :plan)
405
+
406
+ # Build (or fetch) the cached MaskPlan for `config`. Masking runs over every
407
+ # document AND every embedded subdocument, so for an embed-heavy collection
408
+ # the same per-config decisions — which fields carry a `replace_with`, how
409
+ # each template splits into segments, where the embedded children live —
410
+ # were previously recomputed tens of times per document. Compiling them once
411
+ # per config lets #apply_mask_plan! do nothing but the work that actually
412
+ # varies per document (rendering templates, descending into subdocuments),
413
+ # so the savings grow with the number of embedded subdocuments.
414
+ #
415
+ # Cached by config name: names are unique within a run and the configs do
416
+ # not mutate mid-dump. Relies on @embedded_children_by_parent, set by the
417
+ # build_query call that always precedes to_bulk_insert (see #to_bulk_insert).
418
+ private def mask_plan(config)
419
+ (@mask_plans ||= {})[config.name] ||= build_mask_plan(config)
420
+ end
421
+
422
+ private def build_mask_plan(config)
423
+ masked_fields = config.fields.each_with_object([]) do |field, acc|
292
424
  next unless field.replace_with
293
425
 
294
- doc[field.name] = field.replace_with.gsub(/\{([^{}]+)\}/) do
295
- ref = Regexp.last_match(1)
296
- (doc.key?(ref) ? doc[ref] : nil).to_s
297
- end
426
+ acc << [field.name, compile_template(field.replace_with)]
427
+ end
428
+ embedded = embedded_children_of(config).map do |child|
429
+ *prefix, last = child.embedded_in.path.split(".")
430
+ EmbeddedMask.new(prefix, last, build_mask_plan(child))
298
431
  end
432
+ MaskPlan.new(masked_fields, embedded)
299
433
  end
300
434
 
301
- private def apply_embedded_masking!(doc, parent_config)
302
- embedded_children_of(parent_config).each do |child|
303
- walk(doc, child.embedded_in.path) do |subdoc|
304
- apply_replace_with!(subdoc, child)
305
- apply_embedded_masking!(subdoc, child)
435
+ # Apply a precompiled MaskPlan to a document in place: render each masked
436
+ # field, then descend into each embedded child (recursing into its own
437
+ # plan). Equivalent to the old apply_replace_with! + apply_embedded_masking!
438
+ # pair, with all per-config lookups hoisted into the plan.
439
+ private def apply_mask_plan!(doc, plan)
440
+ plan.masked_fields.each do |name, segments|
441
+ doc[name] = render_template(segments, doc)
442
+ end
443
+ plan.embedded.each do |child|
444
+ container = child.prefix.reduce(doc) { |acc, seg| acc.is_a?(Hash) ? acc[seg] : nil }
445
+ next unless container.is_a?(Hash)
446
+
447
+ case (value = container[child.last])
448
+ when Array then value.each { |sub| apply_mask_plan!(sub, child.plan) if sub.is_a?(Hash) }
449
+ when Hash then apply_mask_plan!(value, child.plan)
306
450
  end
307
451
  end
308
452
  end
309
453
 
310
- private def embedded_children_of(parent_config)
311
- @embedded_children_by_parent.fetch(parent_config.name, [])
454
+ PLACEHOLDER_PATTERN = /\{([^{}]+)\}/
455
+
456
+ # Split a `replace_with` template into a flat list of segments (called once
457
+ # per masked field at plan-build time, see #build_mask_plan). A segment is
458
+ # either a literal String or a 1-element Array `[ref]` marking a `{ref}`
459
+ # placeholder. #render_template then concatenates them, skipping the regex
460
+ # scan / block / `Regexp.last_match` a per-document `gsub` would repeat (~2.5x
461
+ # faster per field). The segment walk reproduces the old gsub byte-for-byte
462
+ # (missing keys render as "", literals pass through unchanged).
463
+ private def compile_template(template)
464
+ segments = []
465
+ pos = 0
466
+ while (md = PLACEHOLDER_PATTERN.match(template, pos))
467
+ segments << template[pos...md.begin(0)] if md.begin(0) > pos
468
+ segments << [md[1]]
469
+ pos = md.end(0)
470
+ end
471
+ segments << template[pos..] if pos < template.length
472
+ segments
312
473
  end
313
474
 
314
- private def walk(doc, dotted_path)
315
- segments = dotted_path.split(".")
316
- *prefix, last = segments
317
- container = prefix.reduce(doc) { |acc, seg| acc.is_a?(Hash) ? acc[seg] : nil }
318
- return unless container.is_a?(Hash)
319
-
320
- value = container[last]
321
- case value
322
- when Array then value.each { |sub| yield sub if sub.is_a?(Hash) }
323
- when Hash then yield value
475
+ private def render_template(segments, doc)
476
+ out = +''
477
+ segments.each do |seg|
478
+ if seg.is_a?(Array)
479
+ ref = seg[0]
480
+ out << (doc.key?(ref) ? doc[ref] : nil).to_s
481
+ else
482
+ out << seg
483
+ end
324
484
  end
485
+ out
486
+ end
487
+
488
+ private def embedded_children_of(parent_config)
489
+ @embedded_children_by_parent.fetch(parent_config.name, [])
325
490
  end
326
491
 
327
492
  private def extended_json(doc)
@@ -65,7 +65,18 @@ module Exwiw
65
65
  ext_ddl = extensions.map do |extname, schema|
66
66
  stmt = "CREATE EXTENSION IF NOT EXISTS #{connection.quote_ident(extname)}"
67
67
  stmt += " SCHEMA #{connection.quote_ident(schema)}" unless schema == "public"
68
- "DO $$ BEGIN #{stmt}; EXCEPTION WHEN feature_not_supported THEN NULL; END $$;"
68
+ # Best-effort prepend: a restore target that genuinely cannot create the
69
+ # extension should not abort the whole restore. Two such cases are caught:
70
+ # feature_not_supported (0A000) -- the extension's binaries are unavailable
71
+ # invalid_schema_name (3F000) -- the extension's required schema is absent
72
+ # insufficient_privilege (42501) is deliberately NOT caught: a restore role
73
+ # lacking CREATE privilege is a misconfiguration to fix, not to skip silently.
74
+ # The skip is reported as a WARNING so it surfaces in the restore logs
75
+ # instead of vanishing.
76
+ warning = connection.escape_literal("exwiw: skipped CREATE EXTENSION #{extname} (SQLSTATE %): %")
77
+ "DO $$ BEGIN #{stmt}; " \
78
+ "EXCEPTION WHEN feature_not_supported OR invalid_schema_name THEN " \
79
+ "RAISE WARNING #{warning}, SQLSTATE, SQLERRM; END $$;"
69
80
  end.join("\n") + "\n\n"
70
81
  @logger.debug(" Found #{extensions.size} extension(s) to prepend.")
71
82
  stdout = ext_ddl + stdout
@@ -382,11 +393,17 @@ module Exwiw
382
393
  end
383
394
 
384
395
  private def query_extensions
396
+ # Skip plpgsql (always present) and managed-platform bookkeeping extensions
397
+ # (google_*/rds_*/aiven_*). pglogical is also skipped: it is a logical-
398
+ # replication mechanism of the source, not part of the data being copied,
399
+ # and its dedicated `pglogical` schema is typically absent on the restore
400
+ # target — so prepending CREATE EXTENSION for it only breaks the restore.
385
401
  sql = <<~SQL
386
402
  SELECT e.extname, n.nspname
387
403
  FROM pg_extension e
388
404
  JOIN pg_namespace n ON n.oid = e.extnamespace
389
405
  WHERE e.extname != 'plpgsql'
406
+ AND e.extname != 'pglogical'
390
407
  AND e.extname NOT LIKE 'google\\_%' ESCAPE '\\'
391
408
  AND e.extname NOT LIKE 'rds\\_%' ESCAPE '\\'
392
409
  AND e.extname NOT LIKE 'aiven\\_%' ESCAPE '\\'
data/lib/exwiw/adapter.rb CHANGED
@@ -113,6 +113,16 @@ module Exwiw
113
113
  raise NotImplementedError, "COPY format is not supported by #{self.class.name}"
114
114
  end
115
115
 
116
+ # Default bulk-insert chunk size when a table config does not set one.
117
+ # The Runner streams each chunk straight to the output file, so a non-nil
118
+ # value here bounds how much serialized output (and how many transient
119
+ # intermediate objects) live in memory at once. SQL adapters keep nil
120
+ # (one statement per table, as before); adapters whose output is large
121
+ # and built per-row (e.g. MongoDB JSONL) override with a positive value.
122
+ def default_bulk_insert_chunk_size
123
+ nil
124
+ end
125
+
116
126
  # Run the database-specific EXPLAIN for the given query and return the
117
127
  # output as a single string for `explain` subcommand to print.
118
128
  # SQL adapters override; MongodbAdapter currently raises.
@@ -1,35 +1,58 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require "set"
4
+
3
5
  module Exwiw
4
6
  module DetermineTableProcessingOrder
5
7
  module_function
6
8
 
7
9
  # @param tables [Array<Exwiw::TableConfig>] tables
10
+ # @param logger [Logger, nil] receives a warning when a cycle has to be broken
8
11
  # @return [Array<String>] sorted table names
9
- def run(tables)
12
+ def run(tables, logger: nil)
10
13
  return tables.map(&:name) if tables.size < 2
11
14
 
12
15
  ordered_table_names = []
16
+ ordered = Set.new
13
17
 
14
18
  table_by_name = tables.each_with_object({}) do |table, acc|
15
19
  acc[table.name] = table
16
20
  end
17
21
 
22
+ # Only belongs_to relations whose target is also in this run constrain the
23
+ # order. A belongs_to pointing at a table that is not being processed here
24
+ # — e.g. an embedded MongoDB collection (masked through its parent, never
25
+ # dumped on its own) or any table excluded from the run — is not something
26
+ # we can or need to order against, so it must never block resolution.
27
+ # Without this, such a dependency would stay unresolved forever and
28
+ # masquerade as a circular dependency, freezing every table that
29
+ # (transitively) references it.
30
+ present_names = table_by_name.keys.to_set
31
+
18
32
  loop do
19
33
  break if table_by_name.empty?
20
34
 
21
- tables_with_no_dependencies = table_by_name.values.select do |table|
22
- not_resolved_names = compute_table_dependencies(table) - ordered_table_names - [table.name]
23
-
24
- not_resolved_names.empty?
35
+ resolvable = table_by_name.values.select do |table|
36
+ unresolved_dependencies(table, present_names, ordered).empty?
25
37
  end
26
38
 
27
- if tables_with_no_dependencies.empty?
28
- raise ArgumentError, build_cycle_error_message(table_by_name, ordered_table_names)
39
+ if resolvable.empty?
40
+ # No table has all its (in-run) dependencies satisfied, yet tables
41
+ # remain: the belongs_to graph has a genuine cycle and no strict
42
+ # topological order exists. Rather than aborting the whole export, break
43
+ # the cycle by emitting one cycle member; see pick_cycle_victim for how
44
+ # the member is chosen. Warn so the dropped constraint is visible.
45
+ victim = pick_cycle_victim(table_by_name.values, present_names, ordered)
46
+ warn_cycle_break(logger, victim, unresolved_dependencies(victim, present_names, ordered))
47
+ resolvable = [victim]
29
48
  end
30
49
 
31
- tables_with_no_dependencies.each do |table|
50
+ # In the normal (acyclic) path, emit every currently-resolvable table in
51
+ # insertion order — preserving the historical ordering the snapshot specs
52
+ # depend on. The cycle-break path emits exactly its single chosen victim.
53
+ resolvable.each do |table|
32
54
  ordered_table_names << table.name
55
+ ordered << table.name
33
56
  table_by_name.delete(table.name)
34
57
  end
35
58
  end
@@ -37,30 +60,124 @@ module Exwiw
37
60
  ordered_table_names
38
61
  end
39
62
 
63
+ # The belongs_to target table names of `table`. A polymorphic belongs_to is
64
+ # expanded into one entry per concrete target by schema generation, so each
65
+ # entry is a plain table name here.
40
66
  def compute_table_dependencies(table)
41
- table.belongs_tos.each_with_object([]) do |relation, acc|
42
- acc << relation.table_name
67
+ table.belongs_tos.map(&:table_name)
68
+ end
69
+
70
+ # The dependencies still blocking `table`: belongs_to targets that are part
71
+ # of this run, not yet ordered, and not the table itself (a self-referential
72
+ # belongs_to never blocks).
73
+ private_class_method def unresolved_dependencies(table, present_names, ordered)
74
+ compute_table_dependencies(table).uniq.select do |dep|
75
+ present_names.include?(dep) && !ordered.include?(dep) && dep != table.name
43
76
  end
44
77
  end
45
78
 
46
- # When no table can be resolved but some remain, the belongs_to graph
47
- # contains a cycle (e.g. A belongs_to B and B belongs_to A). A topological
48
- # order cannot exist, so report the offending tables instead of looping
49
- # forever.
50
- private_class_method def cycle_diagnostics(table_by_name, ordered_table_names)
51
- table_by_name.values.map do |table|
52
- unresolved = (compute_table_dependencies(table) - ordered_table_names - [table.name]).uniq
53
- " #{table.name} -> #{unresolved.join(', ')}"
79
+ # Choose the next table to emit when the order is stuck in a cycle. Only
80
+ # genuine cycle members are eligible — a table in a non-trivial
81
+ # strongly-connected component of the unresolved-dependency subgraph — so an
82
+ # acyclic table that merely waits on a cycle is never reordered ahead of its
83
+ # parent. Among the members, prefer one that still has at least one
84
+ # already-ordered parent, so its extraction stays constrained instead of
85
+ # collapsing to "match every row" (a cross-scope over-extraction risk for the
86
+ # mongodb adapter); break remaining ties by fewest unresolved dependencies,
87
+ # then by name, for determinism.
88
+ private_class_method def pick_cycle_victim(remaining, present_names, ordered)
89
+ adjacency = remaining.each_with_object({}) do |table, acc|
90
+ acc[table.name] = unresolved_dependencies(table, present_names, ordered)
91
+ end
92
+ cyclic_names = strongly_connected_members(adjacency)
93
+
94
+ candidates = remaining.select { |table| cyclic_names.include?(table.name) }
95
+ candidates = remaining if candidates.empty? # defensive; a stall implies a cycle
96
+
97
+ anchored = candidates.select { |table| ordered_parent?(table, present_names, ordered) }
98
+ pool = anchored.empty? ? candidates : anchored
99
+
100
+ pool.min_by { |table| [unresolved_dependencies(table, present_names, ordered).size, table.name] }
101
+ end
102
+
103
+ # True when `table` has a belongs_to whose target was already ordered, so its
104
+ # extraction filter will be constrained rather than an unscoped full scan.
105
+ private_class_method def ordered_parent?(table, present_names, ordered)
106
+ compute_table_dependencies(table).any? do |dep|
107
+ dep != table.name && present_names.include?(dep) && ordered.include?(dep)
54
108
  end
55
109
  end
56
110
 
57
- private_class_method def build_cycle_error_message(table_by_name, ordered_table_names)
58
- "Circular belongs_to dependency detected among tables: " \
59
- "#{table_by_name.keys.sort.join(', ')}. " \
60
- "A processing order cannot be determined. " \
61
- "Remove one of the belongs_to entries forming the cycle.\n" \
62
- "Unresolved dependencies:\n" \
63
- "#{cycle_diagnostics(table_by_name, ordered_table_names).join("\n")}"
111
+ # Names belonging to a non-trivial strongly-connected component (size > 1) of
112
+ # `adjacency` (table name -> unresolved dependency names), i.e. the genuine
113
+ # cycle participants. Iterative Tarjan; nodes and edges are visited in name
114
+ # order so the result is deterministic. Self-edges are already excluded from
115
+ # the adjacency, so a size-1 component is never a cycle.
116
+ private_class_method def strongly_connected_members(adjacency)
117
+ index = {}
118
+ low = {}
119
+ on_stack = {}
120
+ stack = []
121
+ counter = 0
122
+ members = Set.new
123
+ neighbors = adjacency.each_with_object({}) { |(name, deps), acc| acc[name] = deps.sort }
124
+
125
+ adjacency.keys.sort.each do |start|
126
+ next if index.key?(start)
127
+
128
+ work = [[start, 0]]
129
+ until work.empty?
130
+ node, edge_i = work.last
131
+ if edge_i.zero?
132
+ index[node] = counter
133
+ low[node] = counter
134
+ counter += 1
135
+ stack.push(node)
136
+ on_stack[node] = true
137
+ end
138
+
139
+ adj = neighbors[node] || []
140
+ if edge_i < adj.size
141
+ work.last[1] += 1
142
+ w = adj[edge_i]
143
+ next unless adjacency.key?(w) # ignore edges leaving the remaining set
144
+
145
+ if index.key?(w)
146
+ low[node] = [low[node], index[w]].min if on_stack[w]
147
+ else
148
+ work.push([w, 0])
149
+ end
150
+ else
151
+ if low[node] == index[node]
152
+ component = []
153
+ loop do
154
+ w = stack.pop
155
+ on_stack[w] = false
156
+ component << w
157
+ break if w == node
158
+ end
159
+ members.merge(component) if component.size > 1
160
+ end
161
+ work.pop
162
+ low[work.last[0]] = [low[work.last[0]], low[node]].min unless work.empty?
163
+ end
164
+ end
165
+ end
166
+
167
+ members
168
+ end
169
+
170
+ private_class_method def warn_cycle_break(logger, victim, dropped)
171
+ return if logger.nil?
172
+
173
+ logger.warn(
174
+ "Circular belongs_to dependency detected. Breaking it by ordering " \
175
+ "'#{victim.name}' before its parent table(s): #{dropped.join(', ')}. The dropped " \
176
+ "relationship is not enforced while ordering, so '#{victim.name}' is extracted " \
177
+ "without that parent constraint (the mongodb adapter may then match a superset of " \
178
+ "rows; SQL output may not load in foreign-key order). To break the cycle explicitly " \
179
+ "instead, mark one of the belongs_to entries forming it with `ignore: true`."
180
+ )
64
181
  end
65
182
  end
66
183
  end
@@ -30,7 +30,7 @@ module Exwiw
30
30
  QueryAstBuilder.validate_scope!(dumpable_configs, table_by_name, @dump_target, @logger)
31
31
 
32
32
  @logger.debug("Determining table processing order...")
33
- ordered_table_names = DetermineTableProcessingOrder.run(dumpable_configs)
33
+ ordered_table_names = DetermineTableProcessingOrder.run(dumpable_configs, logger: @logger)
34
34
 
35
35
  total_size = ordered_table_names.size
36
36
  ordered_table_names.each_with_index do |table_name, idx|