@ctxr/skill-llm-wiki 1.0.2 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,726 @@
1
+ // soft-dag.mjs — post-convergence DAG soft-parent synthesis.
2
+ //
3
+ // Runs when the caller passes `--soft-dag-parents` on build or
4
+ // rebuild. For each routable leaf, compares the leaf's TF-IDF vector
5
+ // against every candidate category-directory's aggregate vector;
6
+ // directories whose cosine similarity meets the threshold become
7
+ // SOFT parents. The leaf's `parents[]` frontmatter is rewritten with
8
+ // the primary parent FIRST (always `"index.md"` for leaves — the
9
+ // path is POSIX-relative to the LEAF's own directory, and the leaf
10
+ // sits in the same directory as its direct-parent index.md by
11
+ // construction; `"../index.md"` is the subcategory-index shape, not
12
+ // a leaf shape), followed by one entry per chosen soft parent,
13
+ // likewise POSIX-relative to the same origin (the leaf's directory).
14
+ //
15
+ // Downstream, `applySoftParentEntries` re-walks the tree after index
16
+ // generation and appends each leaf's record into every soft-parent
17
+ // index's `entries[]`. The rebuilder never moves files on disk — a
18
+ // leaf's physical location remains under its primary parent; only
19
+ // the leaf's `parents[]` pointer array and every claimed parent's
20
+ // `entries[]` expand.
21
+ //
22
+ // Determinism: lex-sorted leaf iteration, lex-sorted candidate-dir
23
+ // iteration inside each leaf's pass, lex-sorted frontmatter
24
+ // serialisation. Two runs on the same tree produce byte-identical
25
+ // output.
26
+ //
27
+ // Threshold + cap: a cosine similarity ≥ `SOFT_PARENT_AFFINITY_THRESHOLD`
28
+ // is required for a candidate to qualify; the top
29
+ // `SOFT_PARENT_MAX_PER_LEAF` qualifying candidates per leaf are kept.
30
+ // Ranking is descending cosine with POSIX-path ascending as a
31
+ // deterministic tie-break.
32
+ //
33
+ // Subcommand scope: build + rebuild only. Intent validation rejects
34
+ // the flag elsewhere via `INT-16a` for the same reasons the balance
35
+ // flags reject in non-{build,rebuild} (see intent.mjs).
36
+
37
+ import {
38
+ existsSync,
39
+ readFileSync,
40
+ readdirSync,
41
+ realpathSync,
42
+ renameSync,
43
+ writeFileSync,
44
+ } from "node:fs";
45
+ import { basename, dirname, join, relative, resolve, sep } from "node:path";
46
+ import { readFrontmatterStreaming } from "./chunk.mjs";
47
+ import { parseFrontmatter, renderFrontmatter } from "./frontmatter.mjs";
48
+ import {
49
+ buildComparisonModelFromTexts,
50
+ cosine,
51
+ entryText,
52
+ tfidfVector,
53
+ } from "./similarity.mjs";
54
+
55
+ // Minimum cosine similarity between a leaf and a candidate category
56
+ // directory for the category to qualify as a soft parent. Calibrated
57
+ // against similarity.mjs's Tier-0 thresholds: the `TIER0_DECISIVE_DIFFERENT
58
+ // = 0.30` floor marks "definitely unrelated" at pairwise-leaf scale,
59
+ // and the `TIER0_DECISIVE_SAME = 0.85` ceiling marks "definitely same
60
+ // topic". Soft parents want the middle-of-the-band "clearly related
61
+ // but not identical" zone — ~0.35 is empirically the lowest point
62
+ // where a category-vs-leaf cosine consistently reflects topical
63
+ // overlap (rather than accidental token reuse). A two-aggregate
64
+ // comparison inflates average cosine slightly vs pairwise, so we
65
+ // sit above DECISIVE_DIFFERENT by about one standard deviation of
66
+ // background noise.
67
+ export const SOFT_PARENT_AFFINITY_THRESHOLD = 0.35;
68
+
69
+ // Cap on soft parents per leaf (primary parent not counted toward the
70
+ // cap). Three soft parents + one primary = max four index locations a
71
+ // single leaf appears in. Chosen on the same token-economy reasoning
72
+ // as Phase X.5's fan-out target: a Claude navigator reading one
73
+ // leaf's parents[] tolerates a handful of entries before signal
74
+ // quality drops. Higher caps dilute the "this is where the leaf
75
+ // belongs" hint into noise.
76
+ export const SOFT_PARENT_MAX_PER_LEAF = 3;
77
+
78
+ // Walk the wiki and collect every routable leaf's absolute path +
79
+ // parsed frontmatter. Uses readdir directly (not `listChildren`) so
80
+ // pre-bootstrap category dirs — directories created by Phase 3 draft
81
+ // that don't have `index.md` yet — are still descended into. Leaves
82
+ // themselves are validated with the same frontmatter-must-have-id
83
+ // discipline `listChildren` uses. Dot-prefixed entries are skipped
84
+ // under the blanket pipeline rule.
85
+ //
86
+ // `withBody` controls read mode:
87
+ // - `true` (default for `runSoftDagParents`): `readFileSync` +
88
+ // `parseFrontmatter` so the caller can write leaves back via
89
+ // `renderFrontmatter(data, body)` preserving the body bytes.
90
+ // - `false` (used by `applySoftParentEntries`): bounded
91
+ // `readFrontmatterStreaming` so the walk only pays the
92
+ // frontmatter-byte cost, not the full-file-bytes cost. Matters
93
+ // at the 596-leaf consumer-corpus scale where bodies can dwarf
94
+ // frontmatter.
95
/**
 * Depth-first walk of `wikiRoot` collecting every non-index `.md` file whose
 * frontmatter parses and carries a truthy `id`.
 *
 * @param {string} wikiRoot - Directory the walk starts from.
 * @param {boolean} [withBody=true] - When true, each record also carries the
 *   file body (everything after `bodyOffset`), LF-normalised for files whose
 *   frontmatter capture reports `lineEnding === "crlf"`.
 * @returns {Array<{path: string, data: object, body?: string}>} Records in
 *   walk order; callers sort them before use.
 */
function collectAllLeaves(wikiRoot, withBody = true) {
  // Load a single candidate file. Any read/parse failure, a falsy
  // frontmatter capture, or frontmatter without a truthy `id` yields null
  // so the walk can skip the file and keep going.
  const loadLeaf = (filePath) => {
    let frontmatter;
    let body;
    try {
      const captured = readFrontmatterStreaming(filePath);
      if (!captured) return null;
      frontmatter = parseFrontmatter(captured.frontmatterText, filePath);
      if (withBody) {
        // Slice on the raw bytes so the byte offset reported by the
        // streaming reader is applied before UTF-8 decoding.
        body = readFileSync(filePath)
          .subarray(captured.bodyOffset)
          .toString("utf8");
        if (captured.lineEnding === "crlf") {
          // Rewritten leaves are LF-only; fold CRLF pairs to LF.
          body = body.split("\r\n").join("\n");
        }
      }
    } catch {
      return null;
    }
    if (!frontmatter?.data?.id) return null;
    return withBody
      ? { path: filePath, data: frontmatter.data, body }
      : { path: filePath, data: frontmatter.data };
  };

  const leaves = [];
  const pending = [wikiRoot];
  while (pending.length > 0) {
    const current = pending.pop();
    let dirents;
    try {
      dirents = readdirSync(current, { withFileTypes: true });
    } catch {
      // Unreadable directory: skip it, keep walking the rest.
      continue;
    }
    for (const dirent of dirents) {
      const { name } = dirent;
      if (name.startsWith(".")) continue;
      const childPath = join(current, name);
      if (dirent.isDirectory()) {
        pending.push(childPath);
        continue;
      }
      const isLeafCandidate =
        dirent.isFile() && name.endsWith(".md") && name !== "index.md";
      if (!isLeafCandidate) continue;
      const leaf = loadLeaf(childPath);
      if (leaf !== null) leaves.push(leaf);
    }
  }
  return leaves;
}
167
+
168
+ // Group already-collected leaves by their direct-parent directory.
169
+ // Consumed by `collectCandidateDirs` (routable-leaf check) and
170
+ // `buildCategoryText` (aggregate text) so neither has to call
171
+ // `listChildren` again per directory. One pass at the top of
172
+ // `runSoftDagParents` feeds both downstream helpers; the previous
173
+ // layout did two full-tree walks (one in collectCandidateDirs via
174
+ // listChildren, one per candidate in buildCategoryText) on top of
175
+ // the full-leaf collection, which was wasteful on large corpora.
176
/**
 * Bucket leaf records by the directory that contains them.
 *
 * @param {Array<{path: string}>} leaves - Leaf records carrying a `path`.
 * @returns {Map<string, Array<object>>} `dirname(leaf.path)` → leaves in that
 *   directory, in input order; keys appear in first-seen order.
 */
function groupLeavesByDir(leaves) {
  const byDir = new Map();
  for (const leaf of leaves) {
    const parentDir = dirname(leaf.path);
    const bucket = byDir.get(parentDir);
    if (bucket === undefined) {
      byDir.set(parentDir, [leaf]);
    } else {
      bucket.push(leaf);
    }
  }
  return byDir;
}
186
+
187
+ // Walk the wiki and collect every non-dot category directory (any
188
+ // directory that could be a soft-parent target). The wiki root is
189
+ // included since leaves from deep subtrees can claim the root as a
190
+ // soft parent (the typed "this is also broadly relevant to the root
191
+ // topic" pointer). A category is eligible as a soft-parent target if
192
+ // it has an `index.md` OR at least one ROUTABLE leaf directly
193
+ // underneath. Routability is decided from the pre-collected
194
+ // `leavesByDir` map (derived from `collectAllLeaves`), not from a
195
+ // fresh `listChildren` call per dir — `listChildren` would parse
196
+ // frontmatter for every `.md` again, duplicating I/O already done.
197
/**
 * Walk `wikiRoot` and list every directory that may act as a soft-parent
 * target: the root itself, any directory holding a regular `index.md` file,
 * and any directory with at least one entry in `leavesByDir`.
 *
 * @param {string} wikiRoot - Directory the walk starts from (always included).
 * @param {Map<string, Array<object>>} leavesByDir - Directory → leaf records.
 * @returns {string[]} Eligible directories in walk order (callers sort).
 */
function collectCandidateDirs(wikiRoot, leavesByDir) {
  const eligible = [];
  const pending = [wikiRoot];
  while (pending.length > 0) {
    const current = pending.pop();
    let dirents;
    try {
      dirents = readdirSync(current, { withFileTypes: true });
    } catch {
      // Unreadable directory: neither a candidate nor descended into.
      continue;
    }
    // Dot-prefixed entries are invisible to the whole pass.
    const visible = dirents.filter((d) => !d.name.startsWith("."));
    const childDirs = visible
      .filter((d) => d.isDirectory())
      .map((d) => join(current, d.name));
    const hasIndex = visible.some(
      (d) => !d.isDirectory() && d.isFile() && d.name === "index.md",
    );
    const leafCount = leavesByDir.get(current)?.length ?? 0;
    if (current === wikiRoot || hasIndex || leafCount > 0) {
      eligible.push(current);
    }
    for (const child of childDirs) pending.push(child);
  }
  return eligible;
}
230
+
231
+ // Build the aggregate semantic text for a candidate category
232
+ // directory. Includes the directory's `index.md` frontmatter
233
+ // (`focus`, `covers`, `tags`, `domains` — the full set `entryText`
234
+ // uses, with focus doubled for emphasis) when present PLUS each
235
+ // routable leaf directly under it via `entryText`. Descendant
236
+ // leaves are deliberately NOT included — soft parents claim leaves
237
+ // on direct topical overlap, not on transitive subtree content, so
238
+ // aggregating across depths would let a leaf latch onto a root
239
+ // category it only matches through a deeply nested cousin. Takes
240
+ // the pre-grouped `leavesByDir` map so no second `listChildren`
241
+ // pass is needed here.
242
+ //
243
+ // Index reads go through `readFrontmatterStreaming` rather than a
244
+ // full `readFileSync` — we only need frontmatter fields to build
245
+ // `entryText`. Authored orientation blocks can be long, and this
246
+ // phase only scores against frontmatter signals; reading the body
247
+ // would be pure waste. `readIndex` from indices.mjs loads full
248
+ // bodies, which is why it's avoided here.
249
/**
 * Assemble the text used to vectorise a candidate category directory: the
 * `entryText` of its `index.md` frontmatter (when readable) followed by the
 * `entryText` of every leaf registered for that directory.
 *
 * @param {string} dir - Candidate category directory.
 * @param {Map<string, Array<{data: object}>>} leavesByDir - Directory → leaves.
 * @returns {string} Space-joined, trimmed aggregate text (may be empty).
 */
function buildCategoryText(dir, leavesByDir) {
  const indexPath = join(dir, "index.md");
  const indexParts = [];
  try {
    const captured = readFrontmatterStreaming(indexPath);
    if (captured) {
      const frontmatter = parseFrontmatter(captured.frontmatterText, indexPath);
      if (frontmatter.data) indexParts.push(entryText(frontmatter.data));
    }
  } catch {
    // A missing or malformed index.md is tolerated here; the directory is
    // then described by its direct leaves alone.
  }
  const directLeaves = leavesByDir.get(dir) ?? [];
  const leafParts = directLeaves.map((leaf) => entryText(leaf.data));
  return [...indexParts, ...leafParts].join(" ").trim();
}
268
+
269
+ // Normalise an absolute path to a POSIX-separator string relative to
270
+ // `fromDir`. Matches Phase X.5's `posixSortKey` discipline: on
271
+ // Windows `path.relative` emits `\\` which would pollute
272
+ // `parents[]` strings with OS-specific separators and break
273
+ // cross-platform byte-reproducibility. Soft-parent paths are POSIX
274
+ // in on-disk form.
275
/**
 * `path.relative` with the result expressed using `/` separators.
 *
 * @param {string} fromDir - Directory the result is relative to.
 * @param {string} toPath - Path to express relative to `fromDir`.
 * @returns {string} Relative path using `/` regardless of the OS separator.
 */
function posixRelative(fromDir, toPath) {
  const nativeRel = relative(fromDir, toPath);
  if (sep === "/") return nativeRel;
  // Non-POSIX host: swap the native separator for "/".
  return nativeRel.split(sep).join("/");
}
279
+
280
+ // Given a leaf at absolute `leafPath` and a soft-parent candidate
281
+ // `targetDir` (also absolute), produce the POSIX-relative path from
282
+ // the leaf's own directory (which holds its direct-parent `index.md`) to `targetDir/index.md`.
283
+ // Balance's flatten pass keeps `parents[]` relative to the direct
284
+ // parent's `index.md` (see `applyBalanceFlatten` doc); the
285
+ // convention is the same for soft parents.
286
+ //
287
+ // Examples (POSIX):
288
+ // leaf=wiki/a/l.md, target=wiki/a → "index.md"
289
+ // leaf=wiki/a/l.md, target=wiki/b → "../b/index.md"
290
+ // leaf=wiki/a/b/l.md, target=wiki/ → "../../index.md"
291
/**
 * Express `targetDir/index.md` as a `/`-separated path relative to the
 * directory containing the leaf.
 *
 * @param {string} leafPath - Path of the leaf file.
 * @param {string} targetDir - Soft-parent candidate directory.
 * @returns {string} e.g. `"../b/index.md"`.
 */
function relativeParentPath(leafPath, targetDir) {
  const targetIndex = join(targetDir, "index.md");
  return posixRelative(dirname(leafPath), targetIndex);
}
295
+
296
+ // Score a leaf against every candidate directory (excluding the
297
+ // leaf's own direct parent — that's the primary, not a soft parent).
298
+ // Returns an array of `{ dir, cosine }` sorted by cosine descending,
299
+ // POSIX-path ascending as a deterministic tie-break. Only scores at
300
+ // or above the caller's `threshold` are included.
301
+ //
302
+ // Threshold is passed as a parameter rather than hard-coded so an
303
+ // override via `ctx.threshold` in `runSoftDagParents` takes effect
304
+ // AT THIS FILTER. An earlier draft hard-coded the constant here and
305
+ // re-filtered post-facto — if the override was LOWER than the
306
+ // constant, candidates in the window [override, constant) were
307
+ // dropped at scoring time and couldn't be reinstated.
308
/**
 * Rank candidate directories as soft parents for one leaf.
 *
 * @param {{path: string}} leaf - Leaf record; its own directory is excluded.
 * @param {*} leafVector - The leaf's vector, passed straight to `cosine`.
 * @param {Iterable<string>} candidates - Candidate directory paths.
 * @param {Map<string, *>} categoryVectors - Directory → category vector;
 *   directories without a vector are skipped.
 * @param {string} wikiRoot - Root used to derive the tie-break key.
 * @param {number} threshold - Minimum similarity (inclusive) to qualify.
 * @returns {Array<{dir: string, cosine: number}>} Qualifying candidates,
 *   highest similarity first; ties ordered by `/`-separated path relative
 *   to `wikiRoot`.
 */
function scoreCandidates(
  leaf,
  leafVector,
  candidates,
  categoryVectors,
  wikiRoot,
  threshold,
) {
  const primaryDir = dirname(leaf.path);
  const scored = [];
  // Tie-break keys are computed once per qualifying candidate instead of on
  // every comparator call.
  const sortKeys = new Map();
  for (const dir of candidates) {
    if (dir === primaryDir) continue;
    const catVec = categoryVectors.get(dir);
    if (!catVec) continue;
    const sim = cosine(leafVector, catVec);
    // `NaN < threshold` is false, so a bare `<` test would let a NaN score
    // through the filter and into the ranking. Reject NaN explicitly.
    if (Number.isNaN(sim) || sim < threshold) continue;
    scored.push({ dir, cosine: sim });
    sortKeys.set(dir, posixRelative(wikiRoot, dir));
  }
  scored.sort((a, b) => {
    if (b.cosine !== a.cosine) return b.cosine - a.cosine;
    // Deterministic lex tie-break via POSIX-normalised relative path.
    return sortKeys.get(a.dir).localeCompare(sortKeys.get(b.dir));
  });
  return scored;
}
335
+
336
+ // Resolve the PRIMARY parent path-string for a leaf. parents[] is
337
+ // POSIX-relative to the LEAF's directory. The primary parent is
338
+ // the leaf's direct-parent `index.md`, which sits in the same
339
+ // directory as the leaf — so the path-string is always `"index.md"`
340
+ // regardless of the leaf's depth. This matches the convention
341
+ // `rebuildIndex` derives and the shape `applyBalanceFlatten` relies
342
+ // on (see the doc comment there — promoting a subtree preserves
343
+ // every relative parents[] entry by construction because they're
344
+ // all anchored at the leaf's own dir).
345
/**
 * Path string recorded as the first `parents[]` entry of every leaf.
 *
 * @returns {string} Always `"index.md"`.
 */
function primaryParentPath() {
  const primaryIndex = "index.md";
  return primaryIndex;
}
348
+
349
+ // Atomic write: materialise to `<path>.tmp` then rename into place.
350
+ // Matches `indices.mjs::atomicWriteFile`'s discipline — a crash or
351
+ // SIGKILL between writeFileSync and renameSync leaves EITHER the
352
+ // old file intact OR the temp file orphaned, never a partially-
353
+ // written target. Both leaf rewrites (`rewriteLeafParents`) and
354
+ // index rewrites (`applySoftParentEntries`) route through this so
355
+ // the soft-DAG phase matches the durability expectations the rest
356
+ // of the index-generation pipeline sets.
357
/**
 * Write `content` to `<targetPath>.tmp` as UTF-8, then rename that file over
 * `targetPath`.
 *
 * @param {string} targetPath - Final destination path.
 * @param {string} content - Text to write.
 * @returns {void}
 */
function atomicWriteFile(targetPath, content) {
  const stagingPath = `${targetPath}.tmp`;
  writeFileSync(stagingPath, content, { encoding: "utf8" });
  renameSync(stagingPath, targetPath);
}
362
+
363
+ // Rewrite the leaf's frontmatter with an expanded `parents[]` array.
364
+ // Primary parent first, soft parents after in score order. The body
365
+ // is written back exactly as `collectAllLeaves` captured it (LF-normalised for CRLF-fenced leaves); only the frontmatter is re-serialised.
366
/**
 * Re-render a leaf with `parents` replaced by `parentsArray` and write it
 * back to the leaf's path via `atomicWriteFile`.
 *
 * @param {{path: string, data: object, body: string}} leaf - Collected leaf.
 * @param {string[]} parentsArray - New `parents[]` value.
 * @returns {void}
 */
function rewriteLeafParents(leaf, parentsArray) {
  const { data, body, path: leafPath } = leaf;
  // Spread keeps every other frontmatter field; `parents` is overridden.
  const updated = { ...data, parents: parentsArray };
  atomicWriteFile(leafPath, renderFrontmatter(updated, body));
}
371
+
372
+ // Main entry point. Returns a summary of work done; the caller
373
+ // (orchestrator Phase 4.4) records it in the phase log.
374
+ //
375
+ // Shape:
376
+ // {
377
+ // leavesProcessed: number,
378
+ // softParentsAdded: number, // total across all leaves
379
+ // perLeaf: Map<leafPath, string[]> // soft-parent paths per leaf
380
+ // // (empty array if none qualified)
381
+ // }
382
+ //
383
+ // Contract with the caller:
384
+ // - `wikiRoot` must point at a valid wiki root (has root index.md
385
+ // or is in pre-bootstrap state from Phase 3 draft — both are
386
+ // tolerated by `collectAllLeaves` + `collectCandidateDirs`).
387
+ // - `ctx.threshold` and `ctx.maxPerLeaf` may override the exported
388
+ // defaults for tests that want deterministic boundary behaviour.
389
+ // - No commits: the orchestrator's phase infrastructure handles
390
+ // git-add + git-commit around this call.
391
/**
 * Compute soft parents for every leaf under `wikiRoot` and rewrite each
 * leaf's `parents[]` as `[primary, ...softParents]`.
 *
 * @param {string} wikiRoot - Root directory of the wiki.
 * @param {{threshold?: number, maxPerLeaf?: number}} [ctx] - Optional
 *   overrides for the exported threshold and per-leaf cap defaults.
 * @returns {Promise<{leavesProcessed: number, softParentsAdded: number,
 *   perLeaf: Map<string, string[]>}>} Summary of the pass; `perLeaf` maps
 *   each leaf path to the soft-parent paths chosen for it (possibly empty).
 */
export async function runSoftDagParents(wikiRoot, ctx = {}) {
  const {
    threshold = SOFT_PARENT_AFFINITY_THRESHOLD,
    maxPerLeaf = SOFT_PARENT_MAX_PER_LEAF,
  } = ctx;

  // Shared ordering: compare by `/`-separated path relative to the root.
  const byWikiPath = (a, b) =>
    posixRelative(wikiRoot, a).localeCompare(posixRelative(wikiRoot, b));

  const leaves = collectAllLeaves(wikiRoot);
  leaves.sort((a, b) => byWikiPath(a.path, b.path));
  if (leaves.length === 0) {
    return { leavesProcessed: 0, softParentsAdded: 0, perLeaf: new Map() };
  }

  // Leaves grouped once; both the candidate walk and the category-text
  // builder consume the same map.
  const leavesByDir = groupLeavesByDir(leaves);
  const candidateDirs = collectCandidateDirs(wikiRoot, leavesByDir);
  candidateDirs.sort(byWikiPath);

  // One corpus: leaf texts first, then one text per candidate directory in
  // `candidateDirs` order. Index `i` of the model's token lists therefore
  // maps to leaf `i` for `i < leaves.length`, and to candidate
  // `i - leaves.length` afterwards.
  const leafTexts = leaves.map((leaf) => entryText(leaf.data));
  const categoryTexts = candidateDirs.map((dir) =>
    buildCategoryText(dir, leavesByDir),
  );
  const model = buildComparisonModelFromTexts([...leafTexts, ...categoryTexts]);
  const vectorAt = (i) => tfidfVector(model.tokenLists[i], model.idfMap);

  const leafVectors = new Map(
    leaves.map((leaf, i) => [leaf.path, vectorAt(i)]),
  );
  const categoryVectors = new Map(
    candidateDirs.map((dir, j) => [dir, vectorAt(leaves.length + j)]),
  );

  const perLeaf = new Map();
  let softParentsAdded = 0;
  for (const leaf of leaves) {
    const ranked = scoreCandidates(
      leaf,
      leafVectors.get(leaf.path),
      candidateDirs,
      categoryVectors,
      wikiRoot,
      threshold,
    );
    // Keep only the best `maxPerLeaf` and express each relative to the leaf.
    const softParents = ranked
      .slice(0, maxPerLeaf)
      .map(({ dir }) => relativeParentPath(leaf.path, dir));
    rewriteLeafParents(leaf, [primaryParentPath(), ...softParents]);
    perLeaf.set(leaf.path, softParents);
    softParentsAdded += softParents.length;
  }

  return { leavesProcessed: leaves.length, softParentsAdded, perLeaf };
}
477
+
478
+ // Post-index-rebuild pass: for every leaf claiming a soft parent,
479
+ // append the leaf's `entries[]` record to each claimed parent
480
+ // directory's `index.md`. `rebuildAllIndices` only places a leaf in
481
+ // its direct-parent `index.md`; this pass extends the DAG view so a
482
+ // Claude navigator arriving at any claimed parent sees the leaf in
483
+ // that parent's `entries[]`.
484
+ //
485
+ // The leaf's `parents[]` is the ground truth: we never invent claims.
486
+ // The pass walks every leaf, resolves each non-primary `parents[]`
487
+ // entry to an absolute index path, reads the target `index.md`,
488
+ // appends a minimal entry record (mirroring `rebuildIndex`'s shape),
489
+ // and re-writes the index. Records already present (same `id`) are
490
+ // skipped so the pass is idempotent — running it twice on the same
491
+ // tree produces the same bytes.
492
/**
 * For every leaf whose `parents[]` has entries beyond the first, append an
 * entry record for that leaf to each referenced `index.md` that exists,
 * parses, and is a managed index (`type: index` with a non-empty string id).
 *
 * @param {string} wikiRoot - Root directory of the wiki.
 * @returns {{indicesTouched: number, softEntriesAdded: number}} Counts of
 *   index files actually rewritten and entry records actually added.
 */
export function applySoftParentEntries(wikiRoot) {
  // Frontmatter-only collection: leaves are never rewritten by this pass.
  const leaves = collectAllLeaves(wikiRoot, /* withBody */ false);
  leaves.sort((a, b) =>
    posixRelative(wikiRoot, a.path).localeCompare(
      posixRelative(wikiRoot, b.path),
    ),
  );

  // Phase 1: plan. Queue one record per (leaf, soft parent) pair, keyed by
  // the absolute path of the target index.
  const pendingByIndex = new Map(); // indexPath → Array<record>
  for (const leaf of leaves) {
    const { parents } = leaf.data;
    if (!Array.isArray(parents) || parents.length <= 1) continue;
    const leafDir = dirname(leaf.path);
    const baseRecord = buildEntryRecord(leaf);
    // Entry 0 is the primary parent; only the rest are soft claims.
    for (const rel of parents.slice(1)) {
      if (typeof rel !== "string" || rel.length === 0) continue;
      const indexPath = normaliseIndexPath(leafDir, rel, wikiRoot);
      if (!indexPath || !existsSync(indexPath)) continue;
      // `file` is anchored at the TARGET index's directory and uses the
      // OS-native separator from `path.relative`.
      const entry = {
        ...baseRecord,
        file: relative(dirname(indexPath), leaf.path),
      };
      const queued = pendingByIndex.get(indexPath);
      if (queued === undefined) {
        pendingByIndex.set(indexPath, [entry]);
      } else {
        queued.push(entry);
      }
    }
  }

  // Read and validate one target index. Returns the parsed document, or
  // null when the file cannot be read/parsed or is not a managed index.
  const readManagedIndex = (indexPath) => {
    let parsed;
    try {
      parsed = parseFrontmatter(readFileSync(indexPath, "utf8"), indexPath);
    } catch {
      return null;
    }
    const data = parsed?.data;
    if (!data) return null;
    if (data.type !== "index") return null;
    if (typeof data.id !== "string" || data.id.length === 0) return null;
    return parsed;
  };

  // Phase 2: commit. Counters reflect writes that actually happened.
  let indicesTouched = 0;
  let softEntriesAdded = 0;
  for (const [indexPath, appends] of pendingByIndex) {
    const parsed = readManagedIndex(indexPath);
    if (parsed === null) continue;

    const existing = Array.isArray(parsed.data.entries)
      ? parsed.data.entries
      : [];
    const seenIds = new Set();
    for (const entry of existing) {
      const id = entry?.id;
      if (id) seenIds.add(id);
    }

    // Keep only records whose id is not already listed (or queued twice).
    const fresh = [];
    for (const rec of appends) {
      if (!rec.id || seenIds.has(rec.id)) continue;
      seenIds.add(rec.id);
      fresh.push(rec);
    }
    if (fresh.length === 0) continue; // nothing new → leave the file alone

    const merged = existing.concat(fresh);
    // Ids are coerced to string so a non-string id in a hand-edited index
    // cannot make `localeCompare` throw.
    merged.sort((a, b) =>
      String(a?.id ?? "").localeCompare(String(b?.id ?? "")),
    );
    parsed.data.entries = merged;
    atomicWriteFile(indexPath, renderFrontmatter(parsed.data, parsed.body));
    indicesTouched += 1;
    softEntriesAdded += fresh.length;
  }

  return { indicesTouched, softEntriesAdded };
}
617
+
618
+ // Build a minimal `entries[]` record for a leaf, matching the shape
619
+ // `indices.mjs::rebuildIndex` produces. The `file` field is left
620
+ // absent — the caller (`applySoftParentEntries`) always sets it to
621
+ // `path.relative(targetDir, leaf.path)` per call site because each
622
+ // soft parent index lives in a different directory and the relative
623
+ // link must anchor to THAT index's directory, not the leaf's own.
624
+ // Using OS-native `path.relative` (not a POSIX normaliser) matches
625
+ // what `indices.mjs::rebuildIndex` produces for the primary-parent
626
+ // entry, avoiding mixed `\` + `/` separators within the same
627
+ // `entries[]` on Windows.
628
/**
 * Build the base `entries[]` record for a leaf from its frontmatter.
 * `file` is intentionally absent; the caller adds it per target index.
 *
 * @param {{data: object}} leaf - Leaf record with parsed frontmatter.
 * @returns {{id: *, type: *, focus: *, tags?: *, overlay_targets?: *}}
 *   `type` falls back to `"primary"` and `focus` to `""` when nullish;
 *   `tags` / `overlay_targets` are copied only when truthy.
 */
function buildEntryRecord(leaf) {
  const {
    id,
    type,
    focus,
    tags,
    overlay_targets: overlayTargets,
  } = leaf.data;
  return {
    id,
    type: type ?? "primary",
    focus: focus ?? "",
    ...(tags ? { tags } : {}),
    ...(overlayTargets ? { overlay_targets: overlayTargets } : {}),
  };
}
638
+
639
+ // Resolve a POSIX-relative `parents[]` entry like `"../b/index.md"`
640
+ // to an absolute filesystem path, anchored at the leaf's direct
641
+ // parent directory. Returns null for obviously malformed entries
642
+ // (absolute paths, entries that escape above wikiRoot, non-index.md
643
+ // endings). Defensive: malformed claims are skipped rather than
644
+ // crashing the phase — soft-dag synthesis is best-effort, and a
645
+ // bad parents[] entry typically indicates manual frontmatter edits
646
+ // that downstream validation will surface.
647
+ //
648
+ // Path-traversal guard: a crafted entry like
649
+ // "../../../../somewhere/index.md" must not let this phase read or
650
+ // write files outside the wiki tree. Resolve both the candidate path
651
+ // and the wiki root to canonical absolute form, then confirm the
652
+ // candidate sits under the wikiRoot prefix. Reject otherwise — this
653
+ // is a defense-in-depth check alongside validate's DUP-ID /
654
+ // ALIAS-COLLIDES-ID; a hostile leaf's parents[] shouldn't be able to
655
+ // mutate arbitrary filesystem paths even transiently.
656
+ //
657
+ // Two guards fire here:
658
+ //
659
+ // 1. Lexical guard on `resolve(leafDir, nativeRel)` prefix.
660
+ // Rejects pure `..`-traversal that would escape the wikiRoot
661
+ // prefix on disk without touching the filesystem.
662
+ // 2. Symlink-aware guard on `realpathSync`. `readFileSync` /
663
+ // `writeFileSync` FOLLOW symlinks, so a symlinked index.md
664
+ // inside the wiki pointing at an external file would bypass
665
+ // guard (1) even though the lexical path sits inside the
666
+ // wikiRoot prefix. `realpathSync` resolves the whole chain
667
+ // (including intermediate symlinked directories); the
668
+ // resolved target must still sit under the wikiRoot realpath
669
+ // for the claim to be accepted. Only fires when the target
670
+ // already exists — realpath throws ENOENT on a new index, and
671
+ // the caller's `existsSync` branch below handles that case.
672
/**
 * Resolve a relative `parents[]` entry to an absolute `index.md` path inside
 * the wiki, or reject it.
 *
 * @param {string} leafDir - Directory the entry is relative to.
 * @param {*} rel - The `parents[]` entry; anything but a non-empty relative
 *   string is rejected.
 * @param {string} wikiRoot - Root the resolved path must stay within.
 * @returns {string|null} Absolute path, or null when the entry is not a
 *   string, is empty, looks absolute, does not end in `index.md`, or lands
 *   outside `wikiRoot` (lexically, or after symlink resolution when the
 *   target exists).
 */
function normaliseIndexPath(leafDir, rel, wikiRoot) {
  if (typeof rel !== "string" || rel.length === 0) return null;

  // parents[] is relative-only: refuse POSIX-absolute, backslash-rooted
  // (including UNC) and drive-letter forms up front.
  const looksAbsolute =
    rel.startsWith("/") || rel.startsWith("\\") || /^[a-zA-Z]:/.test(rel);
  if (looksAbsolute) return null;

  // Entries are stored with "/" separators; convert for the host OS.
  const nativeRel = sep === "/" ? rel : rel.split("/").join(sep);
  const abs = resolve(leafDir, nativeRel);
  if (basename(abs) !== "index.md") return null;

  // True when `candidate` is `root` itself or sits under `root + sep`. The
  // separator is only appended when missing, so a root of "/" yields the
  // prefix "/" rather than "//".
  const isWithin = (candidate, root) => {
    if (candidate === root) return true;
    const prefix = root.endsWith(sep) ? root : root + sep;
    return candidate.startsWith(prefix);
  };

  // Guard 1: lexical containment, no filesystem access.
  const rootExact = resolve(wikiRoot);
  if (!isWithin(abs, rootExact)) return null;

  // Guard 2: symlink-aware containment, only possible when the target
  // exists. A non-existent target passes through on guard 1 alone.
  if (!existsSync(abs)) return abs;
  try {
    return isWithin(realpathSync(abs), realpathSync(rootExact)) ? abs : null;
  } catch {
    return null; // realpath failure → reject defensively
  }
}