@ctxr/skill-llm-wiki 1.0.2 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +118 -0
- package/README.md +2 -2
- package/guide/cli.md +3 -2
- package/guide/substrate/operators.md +1 -1
- package/guide/substrate/tiered-ai.md +6 -5
- package/guide/ux/user-intent.md +1 -1
- package/package.json +4 -2
- package/scripts/cli.mjs +92 -2
- package/scripts/lib/balance.mjs +579 -0
- package/scripts/lib/cluster-detect.mjs +482 -4
- package/scripts/lib/contract.mjs +31 -3
- package/scripts/lib/decision-log.mjs +121 -15
- package/scripts/lib/heal.mjs +5 -0
- package/scripts/lib/intent.mjs +370 -4
- package/scripts/lib/join-constants.mjs +22 -0
- package/scripts/lib/join.mjs +917 -0
- package/scripts/lib/nest-applier.mjs +395 -32
- package/scripts/lib/operators.mjs +472 -38
- package/scripts/lib/orchestrator.mjs +419 -12
- package/scripts/lib/root-containment.mjs +351 -0
- package/scripts/lib/similarity-cache.mjs +115 -20
- package/scripts/lib/similarity.mjs +11 -0
- package/scripts/lib/soft-dag.mjs +726 -0
- package/scripts/lib/tiered.mjs +42 -18
- package/scripts/lib/validate.mjs +22 -0
|
@@ -0,0 +1,726 @@
|
|
|
1
|
+
// soft-dag.mjs — post-convergence DAG soft-parent synthesis.
|
|
2
|
+
//
|
|
3
|
+
// Runs when the caller passes `--soft-dag-parents` on build or
|
|
4
|
+
// rebuild. For each routable leaf, compares the leaf's TF-IDF vector
|
|
5
|
+
// against every candidate category-directory's aggregate vector;
|
|
6
|
+
// directories whose cosine similarity meets the threshold become
|
|
7
|
+
// SOFT parents. The leaf's `parents[]` frontmatter is rewritten with
|
|
8
|
+
// the primary parent FIRST (always `"index.md"` for leaves — the
|
|
9
|
+
// path is POSIX-relative to the LEAF's own directory, and the leaf
|
|
10
|
+
// sits in the same directory as its direct-parent index.md by
|
|
11
|
+
// construction; `"../index.md"` is the subcategory-index shape, not
|
|
12
|
+
// a leaf shape), followed by one entry per chosen soft parent,
|
|
13
|
+
// likewise POSIX-relative to the same origin (the leaf's directory).
|
|
14
|
+
//
|
|
15
|
+
// Downstream, `applySoftParentEntries` re-walks the tree after index
|
|
16
|
+
// generation and appends each leaf's record into every soft-parent
|
|
17
|
+
// index's `entries[]`. The rebuilder never moves files on disk — a
|
|
18
|
+
// leaf's physical location remains under its primary parent; only
|
|
19
|
+
// the leaf's `parents[]` pointer array and every claimed parent's
|
|
20
|
+
// `entries[]` expand.
|
|
21
|
+
//
|
|
22
|
+
// Determinism: lex-sorted leaf iteration, lex-sorted candidate-dir
|
|
23
|
+
// iteration inside each leaf's pass, lex-sorted frontmatter
|
|
24
|
+
// serialisation. Two runs on the same tree produce byte-identical
|
|
25
|
+
// output.
|
|
26
|
+
//
|
|
27
|
+
// Threshold + cap: a cosine similarity ≥ `SOFT_PARENT_AFFINITY_THRESHOLD`
|
|
28
|
+
// is required for a candidate to qualify; the top
|
|
29
|
+
// `SOFT_PARENT_MAX_PER_LEAF` qualifying candidates per leaf are kept.
|
|
30
|
+
// Ranking is descending cosine with POSIX-path ascending as a
|
|
31
|
+
// deterministic tie-break.
|
|
32
|
+
//
|
|
33
|
+
// Subcommand scope: build + rebuild only. Intent validation rejects
|
|
34
|
+
// the flag elsewhere via `INT-16a` for the same reasons the balance
|
|
35
|
+
// flags reject in non-{build,rebuild} (see intent.mjs).
|
|
36
|
+
|
|
37
|
+
import {
|
|
38
|
+
existsSync,
|
|
39
|
+
readFileSync,
|
|
40
|
+
readdirSync,
|
|
41
|
+
realpathSync,
|
|
42
|
+
renameSync,
|
|
43
|
+
writeFileSync,
|
|
44
|
+
} from "node:fs";
|
|
45
|
+
import { basename, dirname, join, relative, resolve, sep } from "node:path";
|
|
46
|
+
import { readFrontmatterStreaming } from "./chunk.mjs";
|
|
47
|
+
import { parseFrontmatter, renderFrontmatter } from "./frontmatter.mjs";
|
|
48
|
+
import {
|
|
49
|
+
buildComparisonModelFromTexts,
|
|
50
|
+
cosine,
|
|
51
|
+
entryText,
|
|
52
|
+
tfidfVector,
|
|
53
|
+
} from "./similarity.mjs";
|
|
54
|
+
|
|
55
|
+
// Minimum cosine similarity between a leaf and a candidate category
|
|
56
|
+
// directory for the category to qualify as a soft parent. Calibrated
|
|
57
|
+
// against similarity.mjs's Tier-0 thresholds: the `TIER0_DECISIVE_DIFFERENT
|
|
58
|
+
// = 0.30` floor marks "definitely unrelated" at pairwise-leaf scale,
|
|
59
|
+
// and the `TIER0_DECISIVE_SAME = 0.85` ceiling marks "definitely same
|
|
60
|
+
// topic". Soft parents want the middle-of-the-band "clearly related
|
|
61
|
+
// but not identical" zone — ~0.35 is empirically the lowest point
|
|
62
|
+
// where a category-vs-leaf cosine consistently reflects topical
|
|
63
|
+
// overlap (rather than accidental token reuse). A two-aggregate
|
|
64
|
+
// comparison inflates average cosine slightly vs pairwise, so we
|
|
65
|
+
// sit above DECISIVE_DIFFERENT by about one standard deviation of
|
|
66
|
+
// background noise.
|
|
67
|
+
// Default affinity floor: `runSoftDagParents` falls back to this value
// when the caller does not supply `ctx.threshold`.
export const SOFT_PARENT_AFFINITY_THRESHOLD = 0.35;
|
|
68
|
+
|
|
69
|
+
// Cap on soft parents per leaf (primary parent not counted toward the
|
|
70
|
+
// cap). Three soft parents + one primary = max four index locations a
|
|
71
|
+
// single leaf appears in. Chosen on the same token-economy reasoning
|
|
72
|
+
// as Phase X.5's fan-out target: a Claude navigator reading one
|
|
73
|
+
// leaf's parents[] tolerates a handful of entries before signal
|
|
74
|
+
// quality drops. Higher caps dilute the "this is where the leaf
|
|
75
|
+
// belongs" hint into noise.
|
|
76
|
+
// Default per-leaf cap: `runSoftDagParents` falls back to this value
// when the caller does not supply `ctx.maxPerLeaf`.
export const SOFT_PARENT_MAX_PER_LEAF = 3;
|
|
77
|
+
|
|
78
|
+
// Walk the wiki and collect every routable leaf's absolute path +
|
|
79
|
+
// parsed frontmatter. Uses readdir directly (not `listChildren`) so
|
|
80
|
+
// pre-bootstrap category dirs — directories created by Phase 3 draft
|
|
81
|
+
// that don't have `index.md` yet — are still descended into. Leaves
|
|
82
|
+
// themselves are validated with the same frontmatter-must-have-id
|
|
83
|
+
// discipline `listChildren` uses. Dot-prefixed entries are skipped
|
|
84
|
+
// under the blanket pipeline rule.
|
|
85
|
+
//
|
|
86
|
+
// `withBody` controls read mode:
|
|
87
|
+
// - `true` (default for `runSoftDagParents`): `readFileSync` +
|
|
88
|
+
// `parseFrontmatter` so the caller can write leaves back via
|
|
89
|
+
// `renderFrontmatter(data, body)` preserving the body bytes.
|
|
90
|
+
// - `false` (used by `applySoftParentEntries`): bounded
|
|
91
|
+
// `readFrontmatterStreaming` so the walk only pays the
|
|
92
|
+
// frontmatter-byte cost, not the full-file-bytes cost. Matters
|
|
93
|
+
// at the 596-leaf consumer-corpus scale where bodies can dwarf
|
|
94
|
+
// frontmatter.
|
|
95
|
+
// Depth-first walk of `wikiRoot` returning one record per routable
// leaf (`*.md`, not `index.md`, frontmatter carries a truthy `id`).
//
//   wikiRoot  absolute path of the directory to walk.
//   withBody  when true (default) each record also carries the leaf
//             body as a UTF-8 string; when false only the frontmatter
//             is read.
//
// Returns `Array<{ path, data, body? }>` in walk order (callers sort).
// Dot-prefixed entries, unreadable directories and leaves whose
// frontmatter cannot be read or parsed are skipped silently.
function collectAllLeaves(wikiRoot, withBody = true) {
  const out = [];
  const stack = [wikiRoot];
  while (stack.length > 0) {
    const dir = stack.pop();
    let entries;
    try {
      entries = readdirSync(dir, { withFileTypes: true });
    } catch {
      // Unreadable / vanished directory: best-effort walk, keep going.
      continue;
    }
    for (const e of entries) {
      if (e.name.startsWith(".")) continue;
      const full = join(dir, e.name);
      if (e.isDirectory()) {
        stack.push(full);
        continue;
      }
      if (!e.isFile()) continue;
      if (!e.name.endsWith(".md")) continue;
      if (e.name === "index.md") continue;
      let parsed;
      let body;
      try {
        // Both modes go through `readFrontmatterStreaming` for the
        // frontmatter text + byte offset. It normalises CRLF → LF on
        // the frontmatter payload so `parseFrontmatter` (which only
        // recognises an LF fence) sees the expected form; reading the
        // file and parsing it directly would silently drop
        // CRLF-fenced leaves.
        const captured = readFrontmatterStreaming(full);
        if (!captured) continue;
        parsed = parseFrontmatter(captured.frontmatterText, full);
        if (withBody) {
          // Cut the body out of the raw bytes at the original byte
          // offset (the index just after the CLOSING fence) so
          // multi-byte characters at the fence boundary are not
          // corrupted. `subarray` returns the same view `slice` did,
          // without relying on the deprecated `Buffer#slice`.
          const raw = readFileSync(full);
          body = raw.subarray(captured.bodyOffset).toString("utf8");
          // A CRLF-fenced file yields a body starting with "\r\n".
          // `renderFrontmatter` only checks `body.startsWith("\n")`
          // when deciding whether to prepend the separator newline,
          // so a CRLF-leading body would produce "\n\r\n" at the
          // fence boundary. Normalise the whole body to LF: on-disk
          // output of this pipeline is LF-only.
          if (captured.lineEnding === "crlf") {
            body = body.replace(/\r\n/g, "\n");
          }
        }
      } catch {
        // Unreadable or malformed leaf: skip it, keep walking.
        continue;
      }
      if (!parsed?.data?.id) continue;
      out.push(
        withBody
          ? { path: full, data: parsed.data, body }
          : { path: full, data: parsed.data },
      );
    }
  }
  return out;
}
|
|
167
|
+
|
|
168
|
+
// Group already-collected leaves by their direct-parent directory.
|
|
169
|
+
// Consumed by `collectCandidateDirs` (routable-leaf check) and
|
|
170
|
+
// `buildCategoryText` (aggregate text) so neither has to call
|
|
171
|
+
// `listChildren` again per directory. One pass at the top of
|
|
172
|
+
// `runSoftDagParents` feeds both downstream helpers; the previous
|
|
173
|
+
// layout did two full-tree walks (one in collectCandidateDirs via
|
|
174
|
+
// listChildren, one per candidate in buildCategoryText) on top of
|
|
175
|
+
// the full-leaf collection, which was wasteful on large corpora.
|
|
176
|
+
// Bucket leaf records by the directory that physically contains them.
//
//   leaves  array of records carrying an absolute `path`.
//
// Returns `Map<dirPath, leaf[]>`; keys appear in first-seen order and
// each bucket keeps the input order of its leaves.
function groupLeavesByDir(leaves) {
  const byDir = new Map();
  for (const leaf of leaves) {
    const parentDir = dirname(leaf.path);
    const bucket = byDir.get(parentDir);
    if (bucket) {
      bucket.push(leaf);
    } else {
      byDir.set(parentDir, [leaf]);
    }
  }
  return byDir;
}
|
|
186
|
+
|
|
187
|
+
// Walk the wiki and collect every non-dot category directory (any
|
|
188
|
+
// directory that could be a soft-parent target). The wiki root is
|
|
189
|
+
// included since leaves from deep subtrees can claim the root as a
|
|
190
|
+
// soft parent (the typed "this is also broadly relevant to the root
|
|
191
|
+
// topic" pointer). A category is eligible as a soft-parent target if
|
|
192
|
+
// it has an `index.md` OR at least one ROUTABLE leaf directly
|
|
193
|
+
// underneath. Routability is decided from the pre-collected
|
|
194
|
+
// `leavesByDir` map (derived from `collectAllLeaves`), not from a
|
|
195
|
+
// fresh `listChildren` call per dir — `listChildren` would parse
|
|
196
|
+
// frontmatter for every `.md` again, duplicating I/O already done.
|
|
197
|
+
// Depth-first walk returning every directory that may act as a
// soft-parent target: the wiki root itself, any directory holding an
// `index.md`, and any directory with at least one routable leaf
// according to `leavesByDir`.
//
//   wikiRoot     absolute path of the wiki root.
//   leavesByDir  Map<dirPath, leaf[]> as built by `groupLeavesByDir`.
//
// Returns absolute directory paths in walk order (callers sort).
// Dot-prefixed entries are ignored; unreadable directories are skipped.
function collectCandidateDirs(wikiRoot, leavesByDir) {
  const eligible = [];
  const pending = [wikiRoot];
  while (pending.length > 0) {
    const current = pending.pop();
    let dirents;
    try {
      dirents = readdirSync(current, { withFileTypes: true });
    } catch {
      continue;
    }
    const childDirs = [];
    let indexPresent = false;
    for (const dirent of dirents) {
      if (dirent.name.startsWith(".")) continue;
      if (dirent.isDirectory()) {
        childDirs.push(join(current, dirent.name));
      } else if (dirent.isFile() && dirent.name === "index.md") {
        indexPresent = true;
      }
    }
    const routableLeafCount = leavesByDir.get(current)?.length ?? 0;
    const qualifies =
      current === wikiRoot || indexPresent || routableLeafCount > 0;
    if (qualifies) eligible.push(current);
    // Children are queued even when the current directory does not
    // qualify, so qualifying descendants are still found.
    for (const child of childDirs) pending.push(child);
  }
  return eligible;
}
|
|
230
|
+
|
|
231
|
+
// Build the aggregate semantic text for a candidate category
|
|
232
|
+
// directory. Includes the directory's `index.md` frontmatter
|
|
233
|
+
// (`focus`, `covers`, `tags`, `domains` — the full set `entryText`
|
|
234
|
+
// uses, with focus doubled for emphasis) when present PLUS each
|
|
235
|
+
// routable leaf directly under it via `entryText`. Descendant
|
|
236
|
+
// leaves are deliberately NOT included — soft parents claim leaves
|
|
237
|
+
// on direct topical overlap, not on transitive subtree content, so
|
|
238
|
+
// aggregating across depths would let a leaf latch onto a root
|
|
239
|
+
// category it only matches through a deeply nested cousin. Takes
|
|
240
|
+
// the pre-grouped `leavesByDir` map so no second `listChildren`
|
|
241
|
+
// pass is needed here.
|
|
242
|
+
//
|
|
243
|
+
// Index reads go through `readFrontmatterStreaming` rather than a
|
|
244
|
+
// full `readFileSync` — we only need frontmatter fields to build
|
|
245
|
+
// `entryText`. Authored orientation blocks can be long, and this
|
|
246
|
+
// phase only scores against frontmatter signals; reading the body
|
|
247
|
+
// would be pure waste. `readIndex` from indices.mjs loads full
|
|
248
|
+
// bodies, which is why it's avoided here.
|
|
249
|
+
// Assemble the text a candidate directory is scored with: the
// `entryText` of its own `index.md` frontmatter (when it can be read
// and parsed) followed by the `entryText` of each leaf listed for
// that directory in `leavesByDir`.
//
//   dir          absolute path of the candidate directory.
//   leavesByDir  Map<dirPath, leaf[]>; only the bucket for `dir` is used.
//
// Returns the space-joined, trimmed text ("" when nothing contributes).
function buildCategoryText(dir, leavesByDir) {
  const indexPath = join(dir, "index.md");
  const fragments = [];
  try {
    const captured = readFrontmatterStreaming(indexPath);
    if (captured) {
      const parsed = parseFrontmatter(captured.frontmatterText, indexPath);
      if (parsed.data) fragments.push(entryText(parsed.data));
    }
  } catch {
    // A missing or malformed index.md is tolerated (pre-bootstrap
    // category, fence mismatch): the directory is then described by
    // its leaves alone and downstream validation reports real issues.
  }
  const directLeaves = leavesByDir.get(dir) ?? [];
  for (const leaf of directLeaves) {
    fragments.push(entryText(leaf.data));
  }
  const combined = fragments.join(" ");
  return combined.trim();
}
|
|
268
|
+
|
|
269
|
+
// Normalise an absolute path to a POSIX-separator string relative to
|
|
270
|
+
// `fromDir`. Matches Phase X.5's `posixSortKey` discipline: on
|
|
271
|
+
// Windows `path.relative` emits `\\` which would pollute
|
|
272
|
+
// `parents[]` strings with OS-specific separators and break
|
|
273
|
+
// cross-platform byte-reproducibility. Soft-parent paths are POSIX
|
|
274
|
+
// in on-disk form.
|
|
275
|
+
// Relative path from `fromDir` to `toPath`, always spelled with "/"
// separators regardless of the host platform.
//
//   fromDir  directory the result is relative to.
//   toPath   path to express relative to `fromDir`.
//
// Returns the POSIX-separator relative path ("" when both are equal).
function posixRelative(fromDir, toPath) {
  const nativeRel = relative(fromDir, toPath);
  if (sep === "/") {
    // Already POSIX-shaped on this platform.
    return nativeRel;
  }
  return nativeRel.split(sep).join("/");
}
|
|
279
|
+
|
|
280
|
+
// Given a leaf at absolute `leafPath` and a soft-parent candidate
|
|
281
|
+
// `targetDir` (also absolute), produce the POSIX-relative path from
|
|
282
|
+
// the leaf's direct-parent `index.md` to `targetDir/index.md`.
|
|
283
|
+
// Balance's flatten pass keeps `parents[]` relative to the direct
|
|
284
|
+
// parent's `index.md` (see `applyBalanceFlatten` doc); the
|
|
285
|
+
// convention is the same for soft parents.
|
|
286
|
+
//
|
|
287
|
+
// Examples (POSIX):
|
|
288
|
+
// leaf=wiki/a/l.md, target=wiki/a → "index.md"
|
|
289
|
+
// leaf=wiki/a/l.md, target=wiki/b → "../b/index.md"
|
|
290
|
+
// leaf=wiki/a/b/l.md, target=wiki/ → "../../index.md"
|
|
291
|
+
// `parents[]` path-string for a soft parent: the POSIX-relative path
// from the directory containing `leafPath` to `targetDir/index.md`.
//
//   leafPath   absolute path of the leaf file.
//   targetDir  absolute path of the soft-parent directory.
//
// Returns e.g. "../b/index.md" for leaf `wiki/a/l.md`, target `wiki/b`.
function relativeParentPath(leafPath, targetDir) {
  const targetIndex = join(targetDir, "index.md");
  return posixRelative(dirname(leafPath), targetIndex);
}
|
|
295
|
+
|
|
296
|
+
// Score a leaf against every candidate directory (excluding the
|
|
297
|
+
// leaf's own direct parent — that's the primary, not a soft parent).
|
|
298
|
+
// Returns an array of `{ dir, cosine }` sorted by cosine descending,
|
|
299
|
+
// POSIX-path ascending as a deterministic tie-break. Only scores at
|
|
300
|
+
// or above the caller's `threshold` are included.
|
|
301
|
+
//
|
|
302
|
+
// Threshold is passed as a parameter rather than hard-coded so an
|
|
303
|
+
// override via `ctx.threshold` in `runSoftDagParents` takes effect
|
|
304
|
+
// AT THIS FILTER. An earlier draft hard-coded the constant here and
|
|
305
|
+
// re-filtered post-facto — if the override was LOWER than the
|
|
306
|
+
// constant, candidates in the window [override, constant) were
|
|
307
|
+
// dropped at scoring time and couldn't be reinstated.
|
|
308
|
+
// Rank the candidate directories a leaf may claim as soft parents.
//
//   leaf             leaf record; only `leaf.path` is read.
//   leafVector       the leaf's TF-IDF vector.
//   candidates       absolute candidate directory paths.
//   categoryVectors  Map<dirPath, vector>; dirs without a vector are skipped.
//   wikiRoot         absolute wiki root, used only for the tie-break key.
//   threshold        minimum similarity (inclusive) for a candidate to qualify.
//
// Returns `Array<{ dir, cosine }>` sorted by cosine descending, then by
// POSIX path relative to `wikiRoot` ascending. The leaf's own directory
// (its primary parent) is never returned.
function scoreCandidates(
  leaf,
  leafVector,
  candidates,
  categoryVectors,
  wikiRoot,
  threshold,
) {
  const primaryDir = dirname(leaf.path);
  const scored = [];
  for (const dir of candidates) {
    if (dir === primaryDir) continue;
    const catVec = categoryVectors.get(dir);
    if (!catVec) continue;
    const sim = cosine(leafVector, catVec);
    // Written as `!(sim >= threshold)` rather than `sim < threshold`
    // so that a NaN similarity (should `cosine` ever yield one) is
    // rejected here instead of slipping through: every comparison
    // with NaN is false, and a NaN score would make the comparator
    // below return NaN and leave the ranking unspecified.
    if (!(sim >= threshold)) continue;
    scored.push({ dir, cosine: sim });
  }
  scored.sort((a, b) => {
    if (b.cosine !== a.cosine) return b.cosine - a.cosine;
    // Deterministic lex tie-break via POSIX-normalised relative path.
    const aKey = posixRelative(wikiRoot, a.dir);
    const bKey = posixRelative(wikiRoot, b.dir);
    return aKey.localeCompare(bKey);
  });
  return scored;
}
|
|
335
|
+
|
|
336
|
+
// Resolve the PRIMARY parent path-string for a leaf. parents[] is
|
|
337
|
+
// POSIX-relative to the LEAF's directory. The primary parent is
|
|
338
|
+
// the leaf's direct-parent `index.md`, which sits in the same
|
|
339
|
+
// directory as the leaf — so the path-string is always `"index.md"`
|
|
340
|
+
// regardless of the leaf's depth. This matches the convention
|
|
341
|
+
// `rebuildIndex` derives and the shape `applyBalanceFlatten` relies
|
|
342
|
+
// on (see the doc comment there — promoting a subtree preserves
|
|
343
|
+
// every relative parents[] entry by construction because they're
|
|
344
|
+
// all anchored at the leaf's own dir).
|
|
345
|
+
// Path-string of a leaf's primary parent. The primary parent is the
// `index.md` sitting next to the leaf, so the value does not depend
// on the leaf and is always the same.
//
// Returns the constant string "index.md".
function primaryParentPath() {
  const siblingIndex = "index.md";
  return siblingIndex;
}
|
|
348
|
+
|
|
349
|
+
// Atomic write: materialise to `<path>.tmp` then rename into place.
|
|
350
|
+
// Matches `indices.mjs::atomicWriteFile`'s discipline — a crash or
|
|
351
|
+
// SIGKILL between writeFileSync and renameSync leaves EITHER the
|
|
352
|
+
// old file intact OR the temp file orphaned, never a partially-
|
|
353
|
+
// written target. Both leaf rewrites (`rewriteLeafParents`) and
|
|
354
|
+
// index rewrites (`applySoftParentEntries`) route through this so
|
|
355
|
+
// the soft-DAG phase matches the durability expectations the rest
|
|
356
|
+
// of the index-generation pipeline sets.
|
|
357
|
+
// Write `content` to `targetPath` via a sibling temp file: the bytes
// are first written to `<targetPath>.tmp` as UTF-8, then the temp
// file is renamed over the target.
//
//   targetPath  destination file path.
//   content     text to write.
//
// Errors from either filesystem call propagate to the caller.
function atomicWriteFile(targetPath, content) {
  const stagingPath = `${targetPath}.tmp`;
  writeFileSync(stagingPath, content, "utf8");
  renameSync(stagingPath, targetPath);
}
|
|
362
|
+
|
|
363
|
+
// Rewrite the leaf's frontmatter with an expanded `parents[]` array.
|
|
364
|
+
// Primary parent first, soft parents after in score order. The body
|
|
365
|
+
// is written back exactly as collected (`collectAllLeaves` has already
// normalised CRLF bodies to LF); only the frontmatter is re-serialised.
|
|
366
|
+
// Persist a leaf with a replaced `parents[]` array.
//
//   leaf          record with `path`, `data` (frontmatter) and `body`.
//   parentsArray  the full new `parents[]` value, primary first.
//
// `leaf.data` itself is not mutated: a shallow copy carrying the new
// `parents` is rendered together with `leaf.body` and written to
// `leaf.path` through `atomicWriteFile`.
function rewriteLeafParents(leaf, parentsArray) {
  const updatedFrontmatter = Object.assign({}, leaf.data, {
    parents: parentsArray,
  });
  const fileText = renderFrontmatter(updatedFrontmatter, leaf.body);
  atomicWriteFile(leaf.path, fileText);
}
|
|
371
|
+
|
|
372
|
+
// Main entry point. Returns a summary of work done; the caller
|
|
373
|
+
// (orchestrator Phase 4.4) records it in the phase log.
|
|
374
|
+
//
|
|
375
|
+
// Shape:
|
|
376
|
+
// {
|
|
377
|
+
// leavesProcessed: number,
|
|
378
|
+
// softParentsAdded: number, // total across all leaves
|
|
379
|
+
// perLeaf: Map<leafPath, string[]> // soft-parent paths per leaf
|
|
380
|
+
// // (empty array if none qualified)
|
|
381
|
+
// }
|
|
382
|
+
//
|
|
383
|
+
// Contract with the caller:
|
|
384
|
+
// - `wikiRoot` must point at a valid wiki root (has root index.md
|
|
385
|
+
// or is in pre-bootstrap state from Phase 3 draft — both are
|
|
386
|
+
// tolerated by `collectAllLeaves` + `collectCandidateDirs`).
|
|
387
|
+
// - `ctx.threshold` and `ctx.maxPerLeaf` may override the exported
|
|
388
|
+
// defaults for tests that want deterministic boundary behaviour.
|
|
389
|
+
// - No commits: the orchestrator's phase infrastructure handles
|
|
390
|
+
// git-add + git-commit around this call.
|
|
391
|
+
// Soft-parent synthesis entry point: score every routable leaf
// against every candidate directory and rewrite each leaf's
// `parents[]` as [primary, ...soft parents].
//
//   wikiRoot        absolute path of the wiki root.
//   ctx.threshold   minimum similarity for a soft parent
//                   (default `SOFT_PARENT_AFFINITY_THRESHOLD`).
//   ctx.maxPerLeaf  cap on soft parents per leaf
//                   (default `SOFT_PARENT_MAX_PER_LEAF`); negative
//                   values are treated as 0.
//
// Resolves to `{ leavesProcessed, softParentsAdded, perLeaf }` where
// `perLeaf` maps each leaf path to its soft-parent path-strings (an
// empty array when none qualified). Every collected leaf is rewritten,
// including leaves that end up with the primary parent only.
export async function runSoftDagParents(wikiRoot, ctx = {}) {
  const {
    threshold = SOFT_PARENT_AFFINITY_THRESHOLD,
    maxPerLeaf = SOFT_PARENT_MAX_PER_LEAF,
  } = ctx;
  // `slice(0, n)` reads a negative `n` as an offset from the END of
  // the array, so an override such as -1 would keep "all but the last
  // candidate" instead of none. Clamp negatives to 0; every other
  // value reaches `slice` unchanged.
  const softParentCap = maxPerLeaf < 0 ? 0 : maxPerLeaf;

  // Lex-sorted leaf iteration keeps the run deterministic regardless
  // of filesystem enumeration order.
  const leaves = collectAllLeaves(wikiRoot);
  leaves.sort((a, b) =>
    posixRelative(wikiRoot, a.path).localeCompare(
      posixRelative(wikiRoot, b.path),
    ),
  );
  if (leaves.length === 0) {
    return { leavesProcessed: 0, softParentsAdded: 0, perLeaf: new Map() };
  }

  // One group-by-dir pass over the already-collected leaves feeds
  // both `collectCandidateDirs` (routable-leaf check) and
  // `buildCategoryText` (aggregate text), so leaf frontmatter is not
  // parsed a second time. `collectCandidateDirs` still lists the
  // directories themselves and `buildCategoryText` still reads each
  // candidate's `index.md` frontmatter.
  const leavesByDir = groupLeavesByDir(leaves);
  const candidateDirs = collectCandidateDirs(wikiRoot, leavesByDir);
  candidateDirs.sort((a, b) =>
    posixRelative(wikiRoot, a).localeCompare(posixRelative(wikiRoot, b)),
  );

  // One corpus over all leaves AND all candidate-category texts, so
  // leaf and category vectors share a single IDF basis. Both kinds of
  // text are already `entryText` output and are handed over as plain
  // strings to `buildComparisonModelFromTexts`.
  const leafTexts = leaves.map((l) => entryText(l.data));
  const categoryTexts = candidateDirs.map((dir) =>
    buildCategoryText(dir, leavesByDir),
  );
  const model = buildComparisonModelFromTexts([...leafTexts, ...categoryTexts]);

  // Token lists come back in corpus order: leaves first, then the
  // candidate directories in their sorted order.
  const leafVectors = new Map();
  for (let i = 0; i < leaves.length; i++) {
    leafVectors.set(
      leaves[i].path,
      tfidfVector(model.tokenLists[i], model.idfMap),
    );
  }
  const categoryVectors = new Map();
  for (let i = 0; i < candidateDirs.length; i++) {
    categoryVectors.set(
      candidateDirs[i],
      tfidfVector(model.tokenLists[leaves.length + i], model.idfMap),
    );
  }

  const perLeaf = new Map();
  let softParentsAdded = 0;
  for (const leaf of leaves) {
    const scored = scoreCandidates(
      leaf,
      leafVectors.get(leaf.path),
      candidateDirs,
      categoryVectors,
      wikiRoot,
      threshold,
    );
    const chosen = scored.slice(0, softParentCap);
    const softParents = chosen.map((c) => relativeParentPath(leaf.path, c.dir));
    rewriteLeafParents(leaf, [primaryParentPath(), ...softParents]);
    perLeaf.set(leaf.path, softParents);
    softParentsAdded += softParents.length;
  }

  return {
    leavesProcessed: leaves.length,
    softParentsAdded,
    perLeaf,
  };
}
|
|
477
|
+
|
|
478
|
+
// Post-index-rebuild pass: for every leaf claiming a soft parent,
|
|
479
|
+
// append the leaf's `entries[]` record to each claimed parent
|
|
480
|
+
// directory's `index.md`. `rebuildAllIndices` only places a leaf in
|
|
481
|
+
// its direct-parent `index.md`; this pass extends the DAG view so a
|
|
482
|
+
// Claude navigator arriving at any claimed parent sees the leaf in
|
|
483
|
+
// that parent's `entries[]`.
|
|
484
|
+
//
|
|
485
|
+
// The leaf's `parents[]` is the ground truth: we never invent claims.
|
|
486
|
+
// The pass walks every leaf, resolves each non-primary `parents[]`
|
|
487
|
+
// entry to an absolute index path, reads the target `index.md`,
|
|
488
|
+
// appends a minimal entry record (mirroring `rebuildIndex`'s shape),
|
|
489
|
+
// and re-writes the index. Records already present (same `id`) are
|
|
490
|
+
// skipped so the pass is idempotent — running it twice on the same
|
|
491
|
+
// tree produces the same bytes.
|
|
492
|
+
// Propagate soft-parent claims into the claimed indices: for every
// leaf whose `parents[]` has more than one entry, add the leaf's
// entry record to the `entries[]` of each non-primary parent's
// `index.md`.
//
//   wikiRoot  absolute path of the wiki root.
//
// Returns `{ indicesTouched, softEntriesAdded }`, counting only the
// index files actually rewritten and the records actually added.
// Records whose `id` is already present in an index are not added
// again, and an index that gains nothing is not rewritten.
export function applySoftParentEntries(wikiRoot) {
  // Frontmatter-only collection: leaves are never rewritten here, so
  // their bodies do not need to be held in memory.
  const allLeaves = collectAllLeaves(wikiRoot, /* withBody */ false);
  // Sorted iteration so the grouping below does not depend on the
  // filesystem's enumeration order.
  allLeaves.sort((left, right) =>
    posixRelative(wikiRoot, left.path).localeCompare(
      posixRelative(wikiRoot, right.path),
    ),
  );

  // Phase 1: plan. indexPath → records to append, so each index is
  // read and written at most once in phase 2.
  const pendingByIndex = new Map();
  for (const leaf of allLeaves) {
    const claimed = Array.isArray(leaf.data.parents) ? leaf.data.parents : [];
    if (claimed.length <= 1) continue; // primary-only, nothing to do
    const leafDir = dirname(leaf.path);
    const baseRecord = buildEntryRecord(leaf);
    // Position 0 is the primary parent; every later entry is soft.
    for (let pos = 1; pos < claimed.length; pos += 1) {
      const relPath = claimed[pos];
      if (typeof relPath !== "string" || relPath.length === 0) continue;
      const indexPath = normaliseIndexPath(leafDir, relPath, wikiRoot);
      if (!indexPath || !existsSync(indexPath)) continue;
      // `file` is relative to the TARGET index's directory and uses
      // OS-native `path.relative` (not `posixRelative`) to match the
      // separators `indices.mjs::rebuildIndex` emits for primary
      // entries in the same `entries[]` array.
      const record = {
        ...baseRecord,
        file: relative(dirname(indexPath), leaf.path),
      };
      const queued = pendingByIndex.get(indexPath);
      if (queued) {
        queued.push(record);
      } else {
        pendingByIndex.set(indexPath, [record]);
      }
    }
  }

  // Phase 2: commit. The counters track real writes, not planned
  // appends, so reruns and unparseable indices are not over-reported.
  const stats = { indicesTouched: 0, softEntriesAdded: 0 };
  for (const [indexPath, records] of pendingByIndex) {
    // A target index that cannot be read or parsed is skipped rather
    // than aborting the whole pass; downstream validation reports it.
    let parsed;
    try {
      parsed = parseFrontmatter(readFileSync(indexPath, "utf8"), indexPath);
    } catch {
      continue;
    }
    const meta = parsed?.data;
    if (!meta) continue;
    // Only managed indices are touched: the frontmatter must say
    // `type: index` and carry a non-empty string `id`.
    const isManagedIndex =
      meta.type === "index" &&
      typeof meta.id === "string" &&
      meta.id.length > 0;
    if (!isManagedIndex) continue;

    const current = Array.isArray(meta.entries) ? meta.entries : [];
    const knownIds = new Set(current.map((e) => e?.id).filter(Boolean));
    // De-dupe by id, both against what the index already lists and
    // against repeated claims within this batch.
    const additions = [];
    for (const rec of records) {
      if (!rec.id || knownIds.has(rec.id)) continue;
      knownIds.add(rec.id);
      additions.push(rec);
    }
    if (additions.length === 0) continue; // no change

    // Lex-sort by id so the on-disk order is stable across runs. Ids
    // are coerced to strings first so a non-string id in a hand-edited
    // index cannot make `localeCompare` throw.
    const merged = current.concat(additions);
    merged.sort((a, b) =>
      String(a?.id ?? "").localeCompare(String(b?.id ?? "")),
    );
    meta.entries = merged;
    atomicWriteFile(indexPath, renderFrontmatter(meta, parsed.body));
    stats.indicesTouched += 1;
    stats.softEntriesAdded += additions.length;
  }

  return stats;
}
|
|
617
|
+
|
|
618
|
+
// Build a minimal `entries[]` record for a leaf, matching the shape
|
|
619
|
+
// `indices.mjs::rebuildIndex` produces. The `file` field is left
|
|
620
|
+
// absent — the caller (`applySoftParentEntries`) always sets it to
|
|
621
|
+
// `path.relative(targetDir, leaf.path)` per call site because each
|
|
622
|
+
// soft parent index lives in a different directory and the relative
|
|
623
|
+
// link must anchor to THAT index's directory, not the leaf's own.
|
|
624
|
+
// Using OS-native `path.relative` (not a POSIX normaliser) matches
|
|
625
|
+
// what `indices.mjs::rebuildIndex` produces for the primary-parent
|
|
626
|
+
// entry, avoiding mixed `\` + `/` separators within the same
|
|
627
|
+
// `entries[]` on Windows.
|
|
628
|
+
// Assemble the minimal `entries[]` record for one leaf from its
// frontmatter (`leaf.data`).
//
//   leaf — object whose `data` holds the leaf's parsed frontmatter.
//
// Returns a fresh object with `id`, `type` (defaulting to "primary")
// and `focus` (defaulting to ""), plus `tags` / `overlay_targets`
// when the frontmatter carries truthy values for them. No `file`
// key is set here.
function buildEntryRecord(leaf) {
  const {
    id,
    type,
    focus,
    tags,
    overlay_targets: overlayTargets,
  } = leaf.data;
  // Only null/undefined fall back to the defaults; any other value
  // (including an empty string) is carried through unchanged.
  const entry = { id, type: type ?? "primary", focus: focus ?? "" };
  // Optional fields are copied by reference and only when truthy, so
  // absent keys stay absent in the rendered entry.
  if (tags) {
    entry.tags = tags;
  }
  if (overlayTargets) {
    entry.overlay_targets = overlayTargets;
  }
  return entry;
}
|
|
638
|
+
|
|
639
|
+
// Resolve a POSIX-relative `parents[]` entry like `"../b/index.md"`
|
|
640
|
+
// to an absolute filesystem path, anchored at the leaf's direct
|
|
641
|
+
// parent directory. Returns null for obviously malformed entries
|
|
642
|
+
// (absolute paths, entries that escape above wikiRoot, non-index.md
|
|
643
|
+
// endings). Defensive: malformed claims are skipped rather than
|
|
644
|
+
// crashing the phase — soft-dag synthesis is best-effort, and a
|
|
645
|
+
// bad parents[] entry typically indicates manual frontmatter edits
|
|
646
|
+
// that downstream validation will surface.
|
|
647
|
+
//
|
|
648
|
+
// Path-traversal guard: a crafted entry like
|
|
649
|
+
// "../../../../somewhere/index.md" must not let this phase read or
|
|
650
|
+
// write files outside the wiki tree. Resolve both the candidate path
|
|
651
|
+
// and the wiki root to canonical absolute form, then confirm the
|
|
652
|
+
// candidate sits under the wikiRoot prefix. Reject otherwise — this
|
|
653
|
+
// is a defense-in-depth check alongside validate's DUP-ID /
|
|
654
|
+
// ALIAS-COLLIDES-ID; a hostile leaf's parents[] shouldn't be able to
|
|
655
|
+
// mutate arbitrary filesystem paths even transiently.
|
|
656
|
+
//
|
|
657
|
+
// Two guards fire here:
|
|
658
|
+
//
|
|
659
|
+
// 1. Lexical guard on `resolve(leafDir, nativeRel)` prefix.
|
|
660
|
+
// Rejects pure `..`-traversal that would escape the wikiRoot
|
|
661
|
+
// prefix on disk without touching the filesystem.
|
|
662
|
+
// 2. Symlink-aware guard on `realpathSync`. `readFileSync` /
|
|
663
|
+
// `writeFileSync` FOLLOW symlinks, so a symlinked index.md
|
|
664
|
+
// inside the wiki pointing at an external file would bypass
|
|
665
|
+
// guard (1) even though the lexical path sits inside the
|
|
666
|
+
// wikiRoot prefix. `realpathSync` resolves the whole chain
|
|
667
|
+
// (including intermediate symlinked directories); the
|
|
668
|
+
// resolved target must still sit under the wikiRoot realpath
|
|
669
|
+
// for the claim to be accepted. Only fires when the target
|
|
670
|
+
// already exists — realpath throws ENOENT on a new index, and
|
|
671
|
+
// the caller's `existsSync` branch below handles that case.
|
|
672
|
+
// Resolve one `parents[]` claim to the absolute path of an `index.md`
// that lies strictly inside the wiki tree.
//
//   leafDir  — directory the relative claim is anchored at.
//   rel      — the claim itself; expected to be a relative,
//              POSIX-separated path whose last segment is `index.md`.
//   wikiRoot — root directory of the wiki; the result must sit
//              strictly below it.
//
// Returns the lexically resolved absolute path (NOT the realpath), or
// null when the claim is malformed, escapes the wiki root, or — for a
// target that already exists — resolves through symlinks to a
// location outside the wiki root. A malformed `rel` yields null
// rather than an exception.
function normaliseIndexPath(leafDir, rel, wikiRoot) {
  if (typeof rel !== "string" || rel.length === 0) return null;
  // Reject every absolute-path form — parents[] is ALWAYS relative.
  //   POSIX absolute:        "/foo/bar"
  //   Drive-letter absolute: "C:/foo/bar" or "C:\\foo\\bar"
  //   Windows root-relative: "\\foo\\bar" (resolves from the current drive)
  //   UNC path:              "\\\\server\\share\\foo"
  // The containment guard below would catch most of these, but an
  // adversarial fixture (or a wikiRoot that is itself a UNC path)
  // could slip a `\\server\\...` form through prefix comparison, so
  // failing closed here keeps the "relative only" contract explicit
  // without relying on downstream behaviour.
  if (rel.startsWith("/") || rel.startsWith("\\")) return null;
  if (/^[a-zA-Z]:/.test(rel)) return null;
  // Soft-parent convention: POSIX-style separators. Normalise to
  // OS-native for filesystem operations.
  const nativeRel = sep === "/" ? rel : rel.split("/").join(sep);
  const abs = resolve(leafDir, nativeRel);
  // Only index.md entries are valid parents.
  if (basename(abs) !== "index.md") return null;
  // Guard 1: lexical containment of the resolved path. Append `sep`
  // only when the resolved root doesn't already end in one, so a
  // filesystem-root wikiRoot (`"/"`) yields the prefix `"/"` rather
  // than a degenerate `"//"`. The check is strict: a path EQUAL to
  // the root is the wiki directory itself, never an index file in
  // it, so it is rejected too (this matters when the root directory
  // happens to be named `index.md`).
  const rootExact = resolve(wikiRoot);
  const rootPrefix = rootExact.endsWith(sep) ? rootExact : rootExact + sep;
  if (!abs.startsWith(rootPrefix)) return null;
  // Guard 2: symlink-aware containment. Only applies when the target
  // exists (realpath throws on ENOENT) — otherwise every brand-new
  // target would be rejected.
  if (existsSync(abs)) {
    try {
      // `realpathSync` resolves the full symlink chain, including
      // intermediate symlinked directories. We don't care whether
      // the final component itself is a symlink — only where
      // filesystem operations would actually land.
      const realAbs = realpathSync(abs);
      const realRoot = realpathSync(rootExact);
      const realRootPrefix = realRoot.endsWith(sep) ? realRoot : realRoot + sep;
      // Strict again: an `index.md` that resolves to the wiki root
      // directory itself is not a usable index file.
      if (!realAbs.startsWith(realRootPrefix)) return null;
    } catch {
      return null; // realpath failure → reject defensively
    }
  }
  return abs;
}
|