@ctxr/skill-llm-wiki 1.0.2 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,917 @@
1
+ // join.mjs — merge N ≥ 2 existing wikis into one unified output wiki.
2
+ //
3
+ // Implements the 11-phase pipeline from guide/operations/ingest/join.md:
4
+ //
5
+ // 0 preflight (handled by orchestrator caller; not here)
6
+ // 1 ingest-all — read every source wiki's tree into memory
7
+ // 2 source-validate — validate each source; halt on errors
8
+ // 3 plan-union — merge per-source leaf lists into one
9
+ // 4 resolve-id-collisions — namespace (default) / merge / ask policies
10
+ // 5 merge-categories — detect matching-focus category folds
11
+ // (actual directory-MERGE fold is
12
+ // deferred — runConvergence's MERGE
13
+ // operator only handles sibling leaves,
14
+ // not entire category subtrees)
15
+ // 6 rewire-references — resolve links[].id / overlay_targets
16
+ // via id→alias→rename map. parents[]
17
+ // are POSIX paths, not ids; they're
18
+ // re-derived at phase 8 by
19
+ // `rebuildAllIndices` from the target's
20
+ // actual tree shape, not rewritten here.
21
+ // 7 apply-operators — runConvergence on the unified tree
22
+ // 8 generate-indices — rebuildAllIndices on the joined tree
23
+ // 9 validation — validateWiki on the joined tree
24
+ // 10 golden-path-union — each source's fixtures must still pass
25
+ // 11 commit — phase-commit via the caller's callback
26
+ //
27
+ // Source immutability: every source wiki is treated as strictly
28
+ // read-only. The pipeline materialises the unified output at the
29
+ // target path (created empty by the orchestrator before runJoin is
30
+ // called); sources are never touched on disk. This module assumes
31
+ // that precondition has already been enforced by the caller
32
+ // (intent.mjs's join branch refuses a non-empty target via INT-01
33
+ // and the CLI creates the target empty via mkdirSync when
34
+ // `plan.is_new_wiki` is set).
35
+
36
+ import {
37
+ closeSync,
38
+ existsSync,
39
+ mkdirSync,
40
+ openSync,
41
+ readFileSync,
42
+ readSync,
43
+ readdirSync,
44
+ writeFileSync,
45
+ } from "node:fs";
46
+ import { basename, dirname, join, relative } from "node:path";
47
+ import { readFrontmatterStreaming } from "./chunk.mjs";
48
+ import { parseFrontmatter, renderFrontmatter } from "./frontmatter.mjs";
49
+ import { rebuildAllIndices } from "./indices.mjs";
50
+ import { runConvergence } from "./operators.mjs";
51
+ import { summariseFindings, validateWiki } from "./validate.mjs";
52
+ // Re-export the policy constants from `join-constants.mjs` so
53
+ // existing callers that pull them from this module keep working.
54
+ // The intent layer imports from `join-constants.mjs` directly to
55
+ // avoid loading the full pipeline on non-join CLI paths.
56
+ export {
57
+ DEFAULT_COLLISION_POLICY,
58
+ VALID_COLLISION_POLICIES,
59
+ } from "./join-constants.mjs";
60
+ import { DEFAULT_COLLISION_POLICY, VALID_COLLISION_POLICIES } from "./join-constants.mjs";
61
+
62
+ // Body-read streaming chunk size, 64 KiB. Sized well above the
63
+ // 4 KiB chunk.mjs uses for the bounded frontmatter read (where
64
+ // frontmatter is expected to be small and the read budget matters)
65
+ // — leaf bodies are much larger and can be full documents, so a
66
+ // bigger chunk amortises the per-readSync syscall overhead while
67
+ // still keeping each individual allocation predictably small.
68
+ // Note this is the PER-READ ceiling, not a total-memory ceiling:
69
+ // each leaf's body is still fully held in memory after the
70
+ // `Buffer.concat` that assembles the chunks into the final UTF-8
71
+ // string. A future optimisation could switch to lazy body loading
72
+ // if the holding-every-body-in-memory shape becomes the bottleneck
73
+ // on very large corpora.
74
// Size in bytes (64 KiB) of the reusable buffer used for each
// positional read when a leaf body is streamed from disk.
const BODY_READ_CHUNK_SIZE = 1 << 16;
75
+
76
+ // ── Phase 1: ingest-all ──────────────────────────────────────────
77
+ //
78
+ // Read one source wiki into memory. Returns a normalised
79
+ // representation with `leaves[]` (non-index .md files) and
80
+ // `indices[]` (index.md files) — both carrying `relPath` (relative
81
+ // POSIX path under wikiRoot), parsed `data` (frontmatter), and
82
+ // `body` (everything after the closing fence). CRLF fences are
83
+ // handled by `readFrontmatterStreaming` which normalises to LF on
84
+ // the frontmatter payload; the body is sliced at the pre-normalisation
85
+ // byte offset and normalised to LF on read for downstream
86
+ // consistency with every other writer in this codebase.
87
+ //
88
+ // Files that fail to parse are collected into `malformed[]` so the
89
+ // caller can surface them as part of source-validate rather than
90
+ // silently dropping them.
91
// Walk `wikiRoot` depth-first and load every frontmatter-bearing
// `.md` file that declares an `id`.
//
// Returns `{ wikiRoot, leaves, indices, malformed }`:
//   - leaves / indices: `{ relPath, absolutePath, data, body }`
//     records (`index.md` files land in `indices`, every other file
//     in `leaves`), each list sorted by `relPath` in UTF-16
//     code-unit order.
//   - malformed: `{ path, error }` for files whose frontmatter could
//     not be captured or parsed, or whose body could not be read.
export function ingestWiki(wikiRoot) {
  const out = {
    wikiRoot,
    leaves: [],
    indices: [],
    malformed: [],
  };
  const stack = [wikiRoot];
  while (stack.length > 0) {
    const dir = stack.pop();
    let entries;
    try {
      entries = readdirSync(dir, { withFileTypes: true });
    } catch {
      // Unreadable directory: skipped, nothing is recorded for it.
      continue;
    }
    for (const e of entries) {
      // Dot-prefixed files and directories are never ingested.
      if (e.name.startsWith(".")) continue;
      const full = join(dir, e.name);
      if (e.isDirectory()) {
        stack.push(full);
        continue;
      }
      if (!e.isFile() || !e.name.endsWith(".md")) continue;
      let captured;
      try {
        captured = readFrontmatterStreaming(full);
      } catch (err) {
        out.malformed.push({ path: full, error: err.message });
        continue;
      }
      if (!captured) continue; // plain .md with no fence — not a wiki entry
      let parsed;
      try {
        parsed = parseFrontmatter(captured.frontmatterText, full);
      } catch (err) {
        out.malformed.push({ path: full, error: err.message });
        continue;
      }
      // Entries without an `id` are not part of the wiki graph.
      if (!parsed?.data?.id) continue;
      // Stream the body from `captured.bodyOffset` in bounded
      // `BODY_READ_CHUNK_SIZE` reads so the frontmatter bytes are not
      // read a second time and no single up-front allocation has to
      // match the (unknown) body size.
      let body;
      try {
        const chunks = [];
        const fd = openSync(full, "r");
        try {
          const buf = Buffer.alloc(BODY_READ_CHUNK_SIZE);
          let pos = captured.bodyOffset;
          while (true) {
            const n = readSync(fd, buf, 0, buf.length, pos);
            if (n === 0) break;
            // Copy the populated slice — `buf` is reused on the next
            // iteration, so retaining a view would corrupt
            // previously-accumulated chunks.
            chunks.push(Buffer.from(buf.subarray(0, n)));
            pos += n;
          }
        } finally {
          closeSync(fd);
        }
        body = Buffer.concat(chunks).toString("utf8");
      } catch {
        // Fall back to a whole-file read on any low-level error. If
        // that fails too, report the file through `malformed[]`
        // instead of letting a raw fs error abort the whole ingest.
        try {
          body = readFileSync(full)
            .subarray(captured.bodyOffset)
            .toString("utf8");
        } catch (err) {
          out.malformed.push({ path: full, error: err.message });
          continue;
        }
      }
      if (captured.lineEnding === "crlf") {
        body = body.replace(/\r\n/g, "\n");
      }
      const relPath = relPosix(relative(wikiRoot, full));
      const record = { relPath, absolutePath: full, data: parsed.data, body };
      if (e.name === "index.md") {
        out.indices.push(record);
      } else {
        out.leaves.push(record);
      }
    }
  }
  // Deterministic ordering — downstream collision resolution is
  // order-sensitive when it emits sequential suffixes. Plain
  // code-unit comparison is used instead of `localeCompare` because
  // collation depends on the runtime's default locale, so the same
  // relPaths could otherwise sort differently on different machines.
  const byRelPath = (a, b) => {
    if (a.relPath < b.relPath) return -1;
    if (a.relPath > b.relPath) return 1;
    return 0;
  };
  out.leaves.sort(byRelPath);
  out.indices.sort(byRelPath);
  return out;
}
188
+
189
+ // ── Phase 2: source-validate ─────────────────────────────────────
190
+ //
191
+ // Run `validateWiki` on each source. Any hard error halts join
192
+ // with "fix this source first" — joining a broken source produces
193
+ // a broken joined wiki. Warnings are surfaced but don't block.
194
+ // Malformed files collected by `ingestWiki` are folded in as
195
+ // synthetic PARSE findings so the caller sees a single unified
196
+ // list rather than two separate error channels — deduped by
197
+ // (code, target) against `validateWiki`'s own PARSE findings so
198
+ // the same file doesn't get reported twice when both channels
199
+ // detect the same parse failure.
200
// Validate every ingested source and collect the outcome into one
// `{ errors, warnings }` report. Each finding is tagged with the
// `wikiRoot` it came from. Files listed in a source's `malformed[]`
// are appended as synthetic `PARSE` errors unless `validateWiki`
// already reported an error with the same `PARSE:<path>` key.
// Findings whose severity is neither "error" nor "warning" are not
// included in the report.
export function validateSources(ingested) {
  const errors = [];
  const warnings = [];
  for (const source of ingested) {
    const { wikiRoot } = source;
    // `code:target` keys of the errors validateWiki produced for
    // this source; used to suppress duplicate PARSE entries below.
    const reported = new Set();
    for (const finding of validateWiki(wikiRoot)) {
      const tagged = { wikiRoot, ...finding };
      switch (finding.severity) {
        case "error":
          reported.add(`${finding.code}:${finding.target}`);
          errors.push(tagged);
          break;
        case "warning":
          warnings.push(tagged);
          break;
        default:
          break;
      }
    }
    for (const { path, error } of source.malformed) {
      if (reported.has(`PARSE:${path}`)) continue;
      errors.push({
        wikiRoot,
        severity: "error",
        code: "PARSE",
        target: path,
        message: error,
      });
    }
  }
  return { errors, warnings };
}
230
+
231
+ // ── Phase 3: plan-union ──────────────────────────────────────────
232
+ //
233
+ // Merge per-source leaf + index records into a single in-memory
234
+ // plan. Each record gets tagged with `sourceWiki` (the absolute
235
+ // path of its origin wiki) so downstream phases can reason about
236
+ // provenance — the `source_wikis[]` frontmatter field on merged
237
+ // entries, the rename-map namespacing prefix, and the
238
+ // golden-path-union phase's fixture tracing all rely on this.
239
+ //
240
+ // The union preserves source order (source A's entries first, then
241
+ // B, etc.) with lex-sorted records within each source, so the plan
242
+ // is byte-stable for identical inputs regardless of filesystem
243
+ // readdir ordering.
244
// Concatenate the leaf and index records of every ingested source
// into a single `{ leaves, indices }` plan. Records are shallow
// copies tagged with `sourceWiki` (the `wikiRoot` of the source they
// came from); sources keep their input order and each source's
// records keep the order they already had.
export function planUnion(ingestedSources) {
  const sources = [...ingestedSources];
  const collect = (field) =>
    sources.flatMap((src) =>
      Array.from(src[field], (record) => ({
        ...record,
        sourceWiki: src.wikiRoot,
      })),
    );
  return { leaves: collect("leaves"), indices: collect("indices") };
}
257
+
258
+ // ── Phase 4: resolve-id-collisions ───────────────────────────────
259
+ //
260
+ // Detect duplicate ids across sources and apply the configured
261
+ // collision policy. Includes BOTH leaves and indices — `validateWiki`
262
+ // enforces global id uniqueness across every entry type, so an
263
+ // index-only collision (two sources both with `auth/index.md`) would
264
+ // still trip `DUP-ID` at phase 9 if left alone.
265
+ //
266
+ // Index collisions: this first-cut throws a structured
267
+ // `JOIN-INDEX-COLLISION` the moment the collision is detected —
268
+ // safely renaming just the id frontmatter would immediately trip
269
+ // `ID-MISMATCH-DIR` (id must match parent dir basename), and
270
+ // directory-renaming an entire subtree to produce a unique id is
271
+ // a full directory-MERGE operator tracked as a follow-up. The
272
+ // throw asks the user to disambiguate one of the conflicting
273
+ // subcategories in its source wiki before re-joining. In the
274
+ // common case of joining topically-distinct source wikis, index
275
+ // collisions don't occur at all.
276
+ //
277
+ // Collision policies:
278
+ //
279
+ // `namespace` (default): rename the colliding entry to
280
+ // `<source-prefix>.<original-id>` where the prefix is the
281
+ // colliding source-wiki's basename (e.g. `reviewers.wiki` →
282
+ // `reviewers`). Inbound references from that source resolve
283
+ // via source-scoped `renameMap` rewrites during the rewire
284
+ // phase. The original id is NOT added to `aliases[]` — the
285
+ // keeper record (from the first source) retains the
286
+ // un-prefixed id as its live id, so an alias entry with the
287
+ // same value would trip `ALIAS-COLLIDES-ID` at phase 9.
288
+ // Never loses an entry; never shadows the keeper.
289
+ //
290
+ // `merge`: when frontmatter is compatible (same focus, same
291
+ // type, same depth_role), fold duplicates into the keeper —
292
+ // preserving the absorbed record's existing `aliases[]` on
293
+ // the keeper and recording both source wikis via
294
+ // `source_wikis[]`. Because collisions are on IDENTICAL ids,
295
+ // the absorbed id is NOT added to `aliases[]` (it equals the
296
+ // keeper's live id; a self-alias would trip
297
+ // `ALIAS-COLLIDES-ID` at phase 9). When frontmatter is
298
+ // INcompatible, fall through to `namespace` — we never lose
299
+ // content on a merge fallback.
300
+ //
301
+ // `ask`: halt and throw `JOIN-COLLISION-ASK` with the collision
302
+ // set for the caller to surface. The interactive resolution
303
+ // flow is deferred; first-cut just fails loud.
304
+ //
305
+ // Returns `{ plan, renameMap, mergeMap }`:
306
+ // - plan: the mutated plan (leaves/indices with renamed ids)
307
+ // - renameMap: `Map<sourceWiki, Map<oldId, newId>>` — source-
308
+ // scoped so 3+ sources sharing an id each get their
309
+ // own entry without overwriting. `rewireReferences`
310
+ // uses the referring entry's `sourceWiki` to pick
311
+ // the right rename.
312
+ // - mergeMap: `Map<absorbedId, keeperId>` — a flat map because
313
+ // every absorbed id collapses to the keeper's id
314
+ // regardless of the referring source's identity.
315
// Detect ids shared by more than one plan record (leaves AND
// indices) and resolve them under `policy`.
//
// Params:
//   plan   — `{ leaves, indices }`; records are mutated in place
//            (`data.id`, `relPath`, `data.aliases`,
//            `data.source_wikis`).
//   policy — one of VALID_COLLISION_POLICIES.
//
// Returns `{ plan, renameMap, mergeMap }` where `renameMap` is
// `Map<sourceWiki, Map<oldId, newId>>` and `mergeMap` is
// `Map<absorbedId, keeperId>`. When nothing collides the input plan
// object is returned as-is with two empty maps.
//
// Throws:
//   Error (no code)        — unknown policy, or no free suffix found
//                            for a namespace-prefixed id.
//   JOIN-COLLISION-ASK     — policy is "ask" and collisions exist;
//                            `err.collisions` lists `{ id, sources }`.
//   JOIN-INDEX-COLLISION   — an index record would need a namespace
//                            rename; thrown BEFORE any record is
//                            mutated; `err.collisions` holds the
//                            offending `{ id, sources }` pair.
export function resolveIdCollisions(plan, policy = DEFAULT_COLLISION_POLICY) {
  if (!VALID_COLLISION_POLICIES.includes(policy)) {
    throw new Error(
      `join: unknown id-collision policy "${policy}" (valid: ${VALID_COLLISION_POLICIES.join(", ")})`,
    );
  }
  // Group every record by id. Each entry is tagged with `kind`
  // because only leaves can be namespace-renamed: an index id must
  // match its directory name, and directory renames are not
  // performed here.
  const byId = new Map();
  for (const leaf of plan.leaves) {
    const id = leaf.data.id;
    if (!byId.has(id)) byId.set(id, []);
    byId.get(id).push({ kind: "leaf", record: leaf });
  }
  for (const idx of plan.indices) {
    const id = idx.data.id;
    if (!byId.has(id)) byId.set(id, []);
    byId.get(id).push({ kind: "index", record: idx });
  }
  const collisions = [...byId.entries()].filter(([, arr]) => arr.length > 1);
  if (collisions.length === 0) {
    return { plan, renameMap: new Map(), mergeMap: new Map() };
  }
  if (policy === "ask") {
    const err = new Error(
      `join: id collisions found and policy=ask — ` +
        `${collisions.length} colliding id(s). Resolve manually and re-invoke.`,
    );
    err.code = "JOIN-COLLISION-ASK";
    err.collisions = collisions.map(([id, arr]) => ({
      id,
      sources: arr.map((entry) => entry.record.sourceWiki),
    }));
    throw err;
  }
  // A duplicate folds into its keeper only under the merge policy
  // and only when kind, focus, type and depth_role all agree. None
  // of these fields is modified by the resolution loop below, so the
  // answer is the same before and during that loop.
  const canMerge = (keeperEntry, dupEntry) =>
    policy === "merge" &&
    dupEntry.kind === keeperEntry.kind &&
    dupEntry.record.data.focus === keeperEntry.record.data.focus &&
    dupEntry.record.data.type === keeperEntry.record.data.type &&
    dupEntry.record.data.depth_role === keeperEntry.record.data.depth_role;
  // Pass 1 — fail fast on index collisions that cannot be merged.
  // Running this check before the mutating pass guarantees the plan
  // is completely untouched when JOIN-INDEX-COLLISION is thrown, so
  // a caller that catches it never sees a half-renamed plan.
  for (const [id, [keeperEntry, ...rest]] of collisions) {
    for (const dupEntry of rest) {
      if (dupEntry.kind !== "index" || canMerge(keeperEntry, dupEntry)) {
        continue;
      }
      const keeper = keeperEntry.record;
      const dup = dupEntry.record;
      const err = new Error(
        `join: index id collision on "${id}" between ` +
          `${keeper.sourceWiki} and ${dup.sourceWiki}. ` +
          `Directory-rename for index collisions is not yet ` +
          `supported; rename one of the conflicting directories ` +
          `in its source wiki before joining.`,
      );
      err.code = "JOIN-INDEX-COLLISION";
      err.collisions = [{ id, sources: [keeper.sourceWiki, dup.sourceWiki] }];
      throw err;
    }
  }
  // Source-scoped rename map: Map<sourceWiki, Map<oldId, newId>>, so
  // several sources sharing one id each keep their own rename.
  const renameMap = new Map();
  const addRename = (sourceWiki, oldId, newId) => {
    if (!renameMap.has(sourceWiki)) renameMap.set(sourceWiki, new Map());
    renameMap.get(sourceWiki).set(oldId, newId);
  };
  const mergeMap = new Map();
  const absorbedPaths = new Set();
  // Every identifier that is already taken: all current ids, every
  // alias, and (as they are handed out) every generated id. Aliases
  // are included so a generated namespace id can never equal an
  // alias carried by some other entry.
  const liveIds = new Set();
  for (const rec of [...plan.leaves, ...plan.indices]) {
    liveIds.add(rec.data.id);
    if (Array.isArray(rec.data.aliases)) {
      for (const alias of rec.data.aliases) liveIds.add(alias);
    }
  }
  // Claim `baseId`, or the first free `baseId-<n>` (n = 2..999).
  const reserveId = (baseId) => {
    if (!liveIds.has(baseId)) {
      liveIds.add(baseId);
      return baseId;
    }
    for (let n = 2; n < 1000; n++) {
      const candidate = `${baseId}-${n}`;
      if (!liveIds.has(candidate)) {
        liveIds.add(candidate);
        return candidate;
      }
    }
    throw new Error(
      `join: could not disambiguate namespace-prefixed id "${baseId}" within 1000 attempts`,
    );
  };
  // Pass 2 — resolve. The first entry for an id is the keeper; each
  // later entry is either folded into it or namespace-renamed.
  for (const [id, dupes] of collisions) {
    const [keeperEntry, ...rest] = dupes;
    const keeper = keeperEntry.record;
    for (const dupEntry of rest) {
      const dup = dupEntry.record;
      if (canMerge(keeperEntry, dupEntry)) {
        // Colliding records share the same id, so no identity entry
        // is added to mergeMap: `rewireReferences` consults mergeMap
        // before the source-scoped renameMap, and a self-mapping
        // would shadow a rename registered for the same id.
        if (dup.data.id !== keeper.data.id) {
          mergeMap.set(dup.data.id, keeper.data.id);
        }
        absorbedPaths.add(dup.absolutePath);
        // The keeper inherits the absorbed record's aliases, but not
        // its id (which equals the keeper's own id).
        keeper.data.aliases = dedupe([
          ...(keeper.data.aliases || []),
          ...(dup.data.aliases || []),
        ]);
        keeper.data.source_wikis = dedupe([
          ...(keeper.data.source_wikis || [keeper.sourceWiki]),
          dup.sourceWiki,
        ]);
      } else {
        // Namespace rename. Pass 1 already rejected every index that
        // would reach this branch, so `dup` is a leaf here. The old
        // id is not added to `aliases[]` because the keeper still
        // owns it as its live id.
        const prefix = namespacePrefix(dup.sourceWiki);
        const newId = reserveId(`${prefix}.${id}`);
        addRename(dup.sourceWiki, dup.data.id, newId);
        dup.data.id = newId;
        // The leaf's filename tracks its id, so relPath is rewritten
        // alongside it.
        const dir = dirname(dup.relPath);
        dup.relPath = dir === "." ? `${newId}.md` : `${dir}/${newId}.md`;
      }
    }
  }
  const rebuiltLeaves = plan.leaves.filter(
    (l) => !absorbedPaths.has(l.absolutePath),
  );
  const rebuiltIndices = plan.indices.filter(
    (i) => !absorbedPaths.has(i.absolutePath),
  );
  return {
    plan: { leaves: rebuiltLeaves, indices: rebuiltIndices },
    renameMap,
    mergeMap,
  };
}
487
+
488
+ // ── Phase 5: merge-categories ────────────────────────────────────
489
+ //
490
+ // Detect top-level categories that share the same `focus` across
491
+ // source wikis. DETECT-ONLY: this helper walks every source's
492
+ // top-level indices, groups them by focus, and returns the
493
+ // multi-source groups without mutating the plan or the filesystem.
494
+ // No entries[] appends, no subdirectory moves — a category-level
495
+ // directory-MERGE operator that would actually fold two same-focus
496
+ // category subtrees is tracked as a follow-up; for now the joined
497
+ // tree retains both categories side-by-side and downstream
498
+ // convergence can decide whether to merge individual leaves across
499
+ // them.
500
// Detect-only helper: group the top-level category indices of every
// source by their `focus` value and return the groups that span at
// least two DIFFERENT source wikis. Nothing is mutated.
//
// Params:
//   ingestedSources — records exposing `wikiRoot` and `indices[]`
//                     (each index having `relPath` and `data`).
//
// Returns an array of `{ focus, categories }`, where `categories`
// holds shallow copies of the matching index records tagged with
// `sourceWiki`.
export function mergeCategoriesWithSameFocus(ingestedSources) {
  const byFocus = new Map();
  for (const src of ingestedSources) {
    for (const idx of src.indices) {
      // Top-level only: `<name>/index.md` has exactly two segments.
      // The root `index.md` (one segment) and nested categories
      // (three or more) are ignored.
      if (idx.relPath.split("/").length !== 2) continue;
      const focus = idx.data.focus || "";
      if (!focus) continue;
      if (!byFocus.has(focus)) byFocus.set(focus, []);
      byFocus.get(focus).push({ ...idx, sourceWiki: src.wikiRoot });
    }
  }
  const merges = [];
  for (const [focus, group] of byFocus) {
    // A group counts only when it spans two or more distinct
    // sources. Two same-focus categories inside ONE source are not a
    // cross-source match and are left out.
    const distinctSources = new Set(group.map((c) => c.sourceWiki));
    if (distinctSources.size < 2) continue;
    merges.push({ focus, categories: group });
  }
  return merges;
}
521
+
522
+ // ── Phase 6: rewire-references ───────────────────────────────────
523
+ //
524
+ // Walk every leaf + index in the plan and rewrite any `links[].id`
525
+ // or `overlay_targets[]` entry that points at a renamed or merged
526
+ // id. Resolution order for each reference:
527
+ // 1. `mergeMap` — absorbed → keeper is source-agnostic (every
528
+ // source's reference to the absorbed id collapses to the
529
+ // same keeper id), so it's consulted first and applies flat.
530
+ // 2. `renameMap.get(referrerSourceWiki)` — source-scoped, so a
531
+ // link in source B's frontmatter pointing at "dup" resolves
532
+ // to B's renamed id (e.g. "b.dup"), not A's preserved "dup".
533
+ // This is the fix for the N>2 collision case: before the
534
+ // scope-by-source change, 3+ sources sharing "dup" all
535
+ // clobbered renameMap entries and left references pointing
536
+ // at the last-renamed value.
537
+ // Unresolvable references are left as-is; the downstream
538
+ // `validateWiki` flags them as `DANGLING-LINK` / `DANGLING-OVERLAY`
539
+ // so the user gets a single structured report at phase 9.
540
+ //
541
+ // `parents[]` entries are POSIX paths, not ids, and never resolve
542
+ // via the id maps — they're re-derived at phase 8 by
543
+ // `rebuildAllIndices` using the same path-relative rules the
544
+ // regular build pipeline uses.
545
// Rewrite id references on every plan entry, in place, and return
// the same `plan` object.
//
// For each entry, `data.links[].id` and `data.overlay_targets[]` are
// translated as follows: a string found in `mergeMap` becomes its
// mapped value; otherwise a string found in the rename table for the
// entry's own `sourceWiki` becomes its renamed value; anything else
// (including non-string values and link items without a string
// `id`) is kept unchanged.
export function rewireReferences(plan, renameMap, mergeMap) {
  // Build a translator bound to one referring source wiki.
  const translatorFor = (sourceWiki) => (ref) => {
    if (typeof ref !== "string") return ref;
    if (mergeMap.has(ref)) return mergeMap.get(ref);
    const scoped = renameMap.get(sourceWiki);
    return scoped && scoped.has(ref) ? scoped.get(ref) : ref;
  };
  for (const group of [plan.leaves, plan.indices]) {
    for (const entry of group) {
      const translate = translatorFor(entry.sourceWiki);
      const { data } = entry;
      if (Array.isArray(data.links)) {
        data.links = data.links.map((link) => {
          const hasStringId =
            link && typeof link === "object" && typeof link.id === "string";
          return hasStringId ? { ...link, id: translate(link.id) } : link;
        });
      }
      if (Array.isArray(data.overlay_targets)) {
        data.overlay_targets = data.overlay_targets.map((target) =>
          translate(target),
        );
      }
    }
  }
  return plan;
}
573
+
574
+ // ── materialise-to-target ────────────────────────────────────────
575
+ //
576
+ // Intermediate step between phase 6 (rewire-references) and phase 7
577
+ // (apply-operators / runConvergence). Writes the unified plan into
578
+ // the prepared empty target directory so subsequent phases
579
+ // (convergence, index-generation, validation) operate on a real
580
+ // on-disk tree. Not one of the 11 methodology phases itself — it's
581
+ // the materialise step that makes phases 7+ possible. Each leaf's
582
+ // file is written with the (possibly rewritten) frontmatter + body;
583
+ // subdirectories are created as needed. Category indices are
584
+ // written last so directories exist before any index tries to
585
+ // enumerate entries[] at rebuild time.
586
+ //
587
+ // Structural fields on indices (`id`, `depth_role`, `parents`,
588
+ // `depth`, `entries`) are STRIPPED before writing. `rebuildAllIndices`
589
+ // in phase 8 re-derives every one of them from the target tree's
590
+ // actual shape — the source's stale values would trip
591
+ // `ID-MISMATCH-DIR` (source root id == `basename(sourceWiki)` which
592
+ // is not `basename(target)`) and `PARENTS-REQUIRED` (source subcat
593
+ // parents are relative to the source root, not the unified target).
594
+ // Authored-intent fields (`focus`, `shared_covers`, `orientation`,
595
+ // etc.) ARE preserved from the first source whose index lands at
596
+ // a given relPath — that's the closest we get to "category merge"
597
+ // without running the convergence MERGE operator here.
598
+ //
599
+ // Duplicate index relPaths (two sources with the same
600
+ // `foo/index.md`) are resolved first-wins: the first source's
601
+ // index contributes the authored-intent fields; subsequent writes
602
+ // at the same relPath are dropped SILENTLY — phase 9's validator
603
+ // doesn't see them (the second file never lands on disk), so no
604
+ // DUP-ID warning surfaces. This is intentional for the common
605
+ // case where two sources happen to share a top-level shape and
606
+ // their category indices carry near-identical metadata; if the
607
+ // focus values genuinely differ, the authored-intent divergence
608
+ // is lost. The upshot: `resolveIdCollisions` already throws
609
+ // `JOIN-INDEX-COLLISION` on same-id collisions, which catches
610
+ // the meaningful case; a same-relPath-different-focus pair would
611
+ // also mean same-id (the index id equals its directory basename),
612
+ // so the resolveIdCollisions throw fires before we reach
613
+ // materialisation. This silent-drop path is load-bearing only
614
+ // for truly identical duplicate indices.
615
+ //
616
+ // Source immutability: writes happen ONLY under `target`, never
617
+ // back to any source wiki.
618
// Write the unified plan to disk under `target`.
//
// Leaves are written first, each at `target/<relPath>` with its
// frontmatter and body rendered by `renderFrontmatter`. Index files
// are written afterwards, and only when (a) their directory is the
// wiki root or an ancestor directory of at least one leaf and (b) no
// earlier index in the plan already claimed the same relPath. Before
// an index is written, `id`, `depth_role`, `depth`, `parents` and
// `entries` are removed from a copy of its frontmatter; every other
// field, including `type`, is written through unchanged.
export function materialisePlan(plan, target) {
  if (!existsSync(target)) mkdirSync(target, { recursive: true });
  // Render one entry to `target/<relPath>`, creating parent
  // directories on demand.
  const emit = (relPath, data, body) => {
    const destination = join(target, relPath);
    mkdirSync(dirname(destination), { recursive: true });
    writeFileSync(destination, renderFrontmatter(data, body), "utf8");
  };
  // Directories allowed to receive an index: the wiki root ("") plus
  // every ancestor directory of a written leaf.
  const liveDirs = new Set([""]);
  for (const leaf of plan.leaves) {
    emit(leaf.relPath, { ...leaf.data }, leaf.body);
    const ancestors = leaf.relPath.split("/");
    ancestors.pop(); // drop the filename
    while (ancestors.length > 0) {
      liveDirs.add(ancestors.join("/"));
      ancestors.pop();
    }
  }
  // Frontmatter keys removed from every index before it is written.
  const strippedFields = ["id", "depth_role", "depth", "parents", "entries"];
  const claimed = new Set();
  for (const idx of plan.indices) {
    const dirRel = idx.relPath.split("/").slice(0, -1).join("/");
    if (!liveDirs.has(dirRel) || claimed.has(idx.relPath)) continue;
    claimed.add(idx.relPath); // first index at a relPath wins
    const data = { ...idx.data };
    for (const field of strippedFields) delete data[field];
    emit(idx.relPath, data, idx.body);
  }
}
668
+
669
// ── Main entry ───────────────────────────────────────────────────
//
// runJoin drives join phases 1–10 (phase 0 preflight and phase 11
// tagging belong to the orchestrator). The orchestrator calls it
// only after it has:
//   - taken a pre-op snapshot on the target
//   - confirmed target is a fresh empty directory
//
// Arguments:
//   sources  array of ≥ 2 source wiki roots, each handed to ingestWiki
//   target   directory the unified tree is materialised into
//   ctx      optional settings:
//     opId, qualityMode   forwarded to runConvergence
//                         (defaults: null, "tiered-fast")
//     idCollisionPolicy   forwarded to resolveIdCollisions
//                         (default: DEFAULT_COLLISION_POLICY)
//     onPhaseCommit       async ({ phase, summary }) => void. Awaited
//                         after materialise, operator-convergence and
//                         index-generation so the caller can stage +
//                         commit between phases. When absent the join
//                         runs end-to-end without intermediate commits.
//     onPhase             ({ phase, summary }) => any | Promise<any>.
//                         Invoked synchronously the moment a phase
//                         summary is recorded, which is what lets the
//                         CLI stream progress instead of batch-printing
//                         it after runJoin returns. The return value is
//                         ignored and never awaited; synchronous throws
//                         and Promise rejections are both swallowed.
//
// Resolves to a structured phase log. On a complete run:
//   {
//     phases: [{ name, summary }],
//     convergence,                      // runConvergence's result
//     validation: { errors, warnings }, // validateWiki findings
//     unified: { leaves: N, indices: M }
//   }
// When convergence parked Tier 2 requests the run stops after phase 7
// and resolves to { phases, convergence, needs_tier2: true, unified }.
//
// Rejects with:
//   - a plain Error when `sources` is not an array of at least 2
//   - code "JOIN-SOURCE-INVALID" (+ .findings) when a source fails
//     validation
//   - code "JOIN-TARGET-INVALID" (+ .findings) when the joined tree
//     fails validation
//
// Phases 7/8/9 reuse the existing convergence/indices/validation
// helpers on the materialised tree, so joined trees get the same
// tiered-AI quality mode as ordinary builds.
export async function runJoin(sources, target, ctx = {}) {
  const {
    opId = null,
    qualityMode = "tiered-fast",
    idCollisionPolicy = DEFAULT_COLLISION_POLICY,
    onPhaseCommit = null,
    onPhase = null,
  } = ctx;

  // Per-phase commit checkpoint; a no-op when the caller gave no hook.
  const checkpoint = async (phase, summary) => {
    if (!onPhaseCommit) return;
    await onPhaseCommit({ phase, summary });
  };

  const hasEnoughSources = Array.isArray(sources) && sources.length >= 2;
  if (!hasEnoughSources) {
    throw new Error(`join: at least 2 source wikis required, got ${sources?.length ?? 0}`);
  }

  const phases = [];
  const ignoreRejection = () => {
    /* async onPhase rejection silently swallowed */
  };
  // Append to the phase log, then ping the progress hook. A failing
  // hook must never halt the op: a rejecting async hook would
  // otherwise surface as an unhandledRejection, which can terminate
  // the process under Node's default policy.
  const notePhase = (name, summary) => {
    phases.push({ name, summary });
    if (!onPhase) return;
    try {
      const outcome = onPhase({ phase: name, summary });
      const isThenable = outcome && typeof outcome.then === "function";
      if (isThenable) Promise.resolve(outcome).catch(ignoreRejection);
    } catch {
      /* sync onPhase throw silently swallowed */
    }
  };

  // Phase 1 — ingest-all.
  const wikis = sources.map((root) => ingestWiki(root));
  const tally = (field) =>
    wikis.reduce((total, wiki) => total + wiki[field].length, 0);
  notePhase(
    "ingest-all",
    `read ${wikis.length} source(s); ${tally("leaves")} leaf/leaves, ${tally("indices")} index/indices`,
  );

  // Phase 2 — source-validate. Any error halts the join.
  const sourceReport = validateSources(wikis);
  if (sourceReport.errors.length > 0) {
    const message =
      `join: source-validate failed — ${sourceReport.errors.length} error(s) across ${sources.length} source(s). Fix each source before joining:\n` +
      summariseFindings(sourceReport.errors.slice(0, 10));
    throw Object.assign(new Error(message), {
      code: "JOIN-SOURCE-INVALID",
      findings: sourceReport.errors,
    });
  }
  notePhase(
    "source-validate",
    `0 errors, ${sourceReport.warnings.length} warning(s) across ${sources.length} source(s)`,
  );

  // Phase 3 — plan-union.
  const unionPlan = planUnion(wikis);
  notePhase(
    "plan-union",
    `${unionPlan.leaves.length} leaf/leaves + ${unionPlan.indices.length} index/indices in union`,
  );

  // Phase 4 — resolve-id-collisions. renameMap holds one inner map
  // per source, so the rename total is the sum of the inner sizes.
  const { plan: joinedPlan, renameMap, mergeMap } = resolveIdCollisions(
    unionPlan,
    idCollisionPolicy,
  );
  let renameCount = 0;
  for (const perSource of renameMap.values()) {
    renameCount += perSource.size;
  }
  notePhase(
    "resolve-id-collisions",
    `policy=${idCollisionPolicy}; ${renameCount} rename(s), ${mergeMap.size} merge(s)`,
  );

  // Phase 5 — merge-categories, DETECT-ONLY for now. Same-focus
  // top-level categories are identified but not folded:
  // runConvergence's MERGE operator merges sibling leaves only, so
  // folding whole category subtrees needs a separate directory-MERGE
  // operator (tracked as a follow-up). Topically distinct sources
  // yield zero groups; when two sources do share e.g. an `auth/`
  // focus, the joined tree keeps both categories side by side and a
  // later rebalance can still consolidate them.
  const sameFocusGroups = mergeCategoriesWithSameFocus(wikis);
  notePhase(
    "merge-categories",
    `${sameFocusGroups.length} same-focus category group(s) detected (fold deferred to a future directory-MERGE operator)`,
  );

  // Phase 6 — rewire-references.
  rewireReferences(joinedPlan, renameMap, mergeMap);
  notePhase("rewire-references", `resolved via renameMap + mergeMap`);

  // runConvergence works on a real on-disk tree, so the in-memory
  // plan has to be written to the target before phase 7.
  materialisePlan(joinedPlan, target);
  notePhase(
    "materialise",
    `wrote ${joinedPlan.leaves.length} leaf/leaves into ${target}`,
  );
  await checkpoint(
    "join-materialise",
    `${joinedPlan.leaves.length} leaf/leaves; policy=${idCollisionPolicy}`,
  );

  // Phase 7 — apply-operators (operator-convergence on unified tree).
  const convergence = await runConvergence(target, {
    opId,
    qualityMode,
    interactive: false,
  });
  const appliedNote = `${convergence.applied.length} operator(s) applied`;
  notePhase(
    "operator-convergence",
    `${appliedNote} across ${convergence.iterations} iteration(s)`,
  );
  await checkpoint("join-convergence", appliedNote);

  // Counts are read from the plan at return time.
  const unifiedCounts = () => ({
    leaves: joinedPlan.leaves.length,
    indices: joinedPlan.indices.length,
  });

  // Tier 2 suspend/resume contract: if convergence parked any Tier 2
  // requests we MUST stop here. Running phases 8-11 on a tree with
  // pending decisions would produce a half-baked joined wiki and
  // strand the `.work/tier2/pending-*.json` files the re-invoke flow
  // expects to consume. The orchestrator routes this through its
  // exit-7 handshake and preserves the pre-op state without tagging
  // `op/<id>`.
  if (convergence.needs_tier2) {
    return {
      phases,
      convergence,
      needs_tier2: true,
      unified: unifiedCounts(),
    };
  }

  // Phase 8 — generate-indices.
  const rebuilt = rebuildAllIndices(target);
  const rebuiltNote = `rebuilt ${rebuilt.length} indices`;
  notePhase("index-generation", rebuiltNote);
  await checkpoint("join-index-generation", rebuiltNote);

  // Phase 9 — validation. Errors halt the join; warnings are reported.
  const findings = validateWiki(target);
  const withSeverity = (level) =>
    findings.filter((finding) => finding.severity === level);
  const errors = withSeverity("error");
  const warnings = withSeverity("warning");
  if (errors.length > 0) {
    const message =
      `join: target validation failed — ${errors.length} error(s).\n` +
      summariseFindings(errors.slice(0, 10));
    throw Object.assign(new Error(message), {
      code: "JOIN-TARGET-INVALID",
      findings: errors,
    });
  }
  notePhase("validation", `0 errors, ${warnings.length} warning(s)`);

  // Phase 10 — golden-path-union. Source fixtures are out of scope
  // for the first-cut implementation; log a no-op so the phase still
  // shows up and the hook point is kept for downstream work.
  notePhase(
    "golden-path-union",
    "skipped (fixture-regression gate lands as a follow-up)",
  );

  // Phase 11 — commit (the orchestrator handles tagging).
  return {
    phases,
    convergence,
    validation: { errors, warnings },
    unified: unifiedCounts(),
  };
}
900
+
901
+ // ── Helpers ──────────────────────────────────────────────────────
902
+
903
// Rewrite a path with POSIX separators: each backslash becomes a
// forward slash. Existing forward slashes are left alone and runs of
// separators are not collapsed.
function relPosix(p) {
  return p.replace(/\\/g, "/");
}
906
+
907
// Return a fresh array containing each distinct value of `arr` once,
// in first-occurrence order. Uniqueness follows Set semantics.
function dedupe(arr) {
  const distinct = new Set(arr);
  return Array.from(distinct);
}
910
+
911
// Derive the namespace prefix for a source wiki: the final path
// segment of `wikiRoot`, minus a trailing `.wiki` when it has one.
// The `namespace` id-collision policy uses it to build renamed ids
// of the form `<prefix>.<id>`.
function namespacePrefix(wikiRoot) {
  const suffix = ".wiki";
  const name = basename(wikiRoot);
  if (!name.endsWith(suffix)) return name;
  return name.slice(0, name.length - suffix.length);
}