@ctxr/skill-llm-wiki 1.0.1 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +118 -0
- package/README.md +2 -2
- package/SKILL.md +7 -0
- package/guide/cli.md +6 -4
- package/guide/consumers/index.md +106 -0
- package/guide/consumers/quickstart.md +96 -0
- package/guide/consumers/recipes/ci-gate.md +125 -0
- package/guide/consumers/recipes/dated-wiki.md +131 -0
- package/guide/consumers/recipes/format-gate.md +126 -0
- package/guide/consumers/recipes/post-write-heal.md +125 -0
- package/guide/consumers/recipes/skill-absent.md +111 -0
- package/guide/consumers/recipes/subject-wiki.md +110 -0
- package/guide/consumers/recipes/testing.md +149 -0
- package/guide/index.md +9 -0
- package/guide/substrate/operators.md +1 -1
- package/guide/substrate/tiered-ai.md +6 -5
- package/guide/ux/user-intent.md +6 -5
- package/package.json +9 -3
- package/scripts/cli.mjs +565 -15
- package/scripts/lib/balance.mjs +579 -0
- package/scripts/lib/cluster-detect.mjs +482 -4
- package/scripts/lib/contract.mjs +257 -0
- package/scripts/lib/decision-log.mjs +121 -15
- package/scripts/lib/heal.mjs +167 -0
- package/scripts/lib/init.mjs +210 -0
- package/scripts/lib/intent.mjs +370 -4
- package/scripts/lib/join-constants.mjs +22 -0
- package/scripts/lib/join.mjs +917 -0
- package/scripts/lib/json-envelope.mjs +190 -0
- package/scripts/lib/nest-applier.mjs +395 -32
- package/scripts/lib/operators.mjs +472 -38
- package/scripts/lib/orchestrator.mjs +419 -12
- package/scripts/lib/root-containment.mjs +351 -0
- package/scripts/lib/similarity-cache.mjs +115 -20
- package/scripts/lib/similarity.mjs +11 -0
- package/scripts/lib/soft-dag.mjs +726 -0
- package/scripts/lib/templates.mjs +78 -0
- package/scripts/lib/tiered.mjs +42 -18
- package/scripts/lib/validate.mjs +22 -0
- package/scripts/lib/where.mjs +71 -0
- package/scripts/testkit/assert-frontmatter.mjs +171 -0
- package/scripts/testkit/cli-run.mjs +95 -0
- package/scripts/testkit/make-wiki-fixture.mjs +301 -0
- package/scripts/testkit/stub-skill.mjs +107 -0
- package/templates/adrs.llmwiki.layout.yaml +33 -0
- package/templates/plans.llmwiki.layout.yaml +34 -0
- package/templates/regressions.llmwiki.layout.yaml +34 -0
- package/templates/reports.llmwiki.layout.yaml +33 -0
- package/templates/runbooks.llmwiki.layout.yaml +33 -0
- package/templates/sessions.llmwiki.layout.yaml +34 -0
|
@@ -0,0 +1,351 @@
|
|
|
1
|
+
// root-containment.mjs — enforce "no leaves at wiki root" invariant.
|
|
2
|
+
//
|
|
3
|
+
// Runs as Phase 4.4.5 (between soft-DAG synthesis and review) so the
|
|
4
|
+
// containment commit participates in the `--review` diff — users can
|
|
5
|
+
// drop/abort individual containment moves exactly like they can drop
|
|
6
|
+
// any other tree-mutating phase's commits.
|
|
7
|
+
// Walks `wikiRoot`, collects every direct-child `.md` file (i.e., a
|
|
8
|
+
// leaf that sits at the wiki root itself — depth 0 per `depthOf`,
|
|
9
|
+
// one level above any subcategory) other than `index.md`, and moves
|
|
10
|
+
// each into its own semantically-named
|
|
11
|
+
// subcategory derived from the leaf's own TF-IDF distinguishing
|
|
12
|
+
// tokens. A stub `<slug>/index.md` is written so the new category
|
|
13
|
+
// is routable; Phase 5's `rebuildAllIndices` populates the stub's
|
|
14
|
+
// `entries[]` on the next pass.
|
|
15
|
+
//
|
|
16
|
+
// Why a single-member category rather than a shared "uncategorised"
|
|
17
|
+
// bucket: every reviewer leaf has `focus` / `covers` / `tags` that
|
|
18
|
+
// describe some coherent topic, so the honest answer to "where does
|
|
19
|
+
// this belong?" is "in its own tight category named after what it
|
|
20
|
+
// is." A shared bucket label admits defeat about something the data
|
|
21
|
+
// already tells us; a per-outlier slug preserves the semantic signal.
|
|
22
|
+
// If the corpus later grows a topically-adjacent leaf, future builds'
|
|
23
|
+
// convergence + balance may nest both into an existing category — a
|
|
24
|
+
// single-member start state is a valid transient, not a permanent
|
|
25
|
+
// scar.
|
|
26
|
+
//
|
|
27
|
+
// Slug uniqueness is enforced via `resolveNestSlug` + the full-wiki
|
|
28
|
+
// forbidden-id index from PR #5. A generated slug that happens to
|
|
29
|
+
// collide with an existing subcategory basename, leaf id, or alias
|
|
30
|
+
// elsewhere in the tree gets the `-group` / `-group-N` fallback
|
|
31
|
+
// treatment.
|
|
32
|
+
//
|
|
33
|
+
// parents[] rewrite on the moved leaf:
|
|
34
|
+
// - Primary parent: stays `"index.md"`. The leaf's new direct
|
|
35
|
+
// parent (`<slug>/index.md`) is same-dir-as-leaf, so the
|
|
36
|
+
// POSIX-relative path string doesn't change even though its
|
|
37
|
+
// semantic target moves from root-index to subcategory-index.
|
|
38
|
+
// Same convention `applyBalanceFlatten` leveraged when moving a
|
|
39
|
+
// subtree up one level (PR #8).
|
|
40
|
+
// - Soft parents (if any): paths that were relative to the old
|
|
41
|
+
// leaf-dir (wiki root) gain a "../" prefix because the leaf now
|
|
42
|
+
// sits one level deeper. Example: `"b/index.md"` → `"../b/index.md"`.
|
|
43
|
+
//
|
|
44
|
+
// Determinism: outlier iteration is lex-sorted by filename, so two
|
|
45
|
+
// runs on the same set of outliers produce byte-identical slug
|
|
46
|
+
// assignment order (matters for `-group-N` collision tie-breaks).
|
|
47
|
+
// `generateDeterministicSlug` + `deterministicPurpose` are both
|
|
48
|
+
// byte-stable across member order.
|
|
49
|
+
|
|
50
|
+
import {
|
|
51
|
+
existsSync,
|
|
52
|
+
mkdirSync,
|
|
53
|
+
readFileSync,
|
|
54
|
+
readdirSync,
|
|
55
|
+
renameSync,
|
|
56
|
+
writeFileSync,
|
|
57
|
+
} from "node:fs";
|
|
58
|
+
import { basename, join } from "node:path";
|
|
59
|
+
import { readFrontmatterStreaming } from "./chunk.mjs";
|
|
60
|
+
import {
|
|
61
|
+
buildSiblingIdfContext,
|
|
62
|
+
deterministicPurpose,
|
|
63
|
+
generateDeterministicSlug,
|
|
64
|
+
} from "./cluster-detect.mjs";
|
|
65
|
+
import { parseFrontmatter, renderFrontmatter } from "./frontmatter.mjs";
|
|
66
|
+
import { buildWikiForbiddenIndex, resolveNestSlug } from "./nest-applier.mjs";
|
|
67
|
+
|
|
68
|
+
// Walk the wiki root and return outlier leaves — non-index `.md`
|
|
69
|
+
// files sitting directly at the wiki root (depth 0 per `depthOf`).
|
|
70
|
+
// Each item is `{ path, data }`
|
|
71
|
+
// with parsed frontmatter so the caller can feed directly into
|
|
72
|
+
// `generateDeterministicSlug`. Files whose frontmatter fails to
|
|
73
|
+
// parse are skipped silently — the validator will surface them
|
|
74
|
+
// separately.
|
|
75
|
+
// Collect the outlier leaves: non-index `.md` files that sit directly
// in `wikiRoot` and whose frontmatter carries a truthy `id`.
//
// Returns an array of `{ path, data }` records, where `path` is
// `join(wikiRoot, name)` and `data` is the parsed frontmatter, or `[]`
// when `wikiRoot` cannot be listed. Skipped silently: non-files,
// names not ending in `.md`, `index.md`, dotfiles, files for which
// `readFrontmatterStreaming` returns `null` (no frontmatter fence),
// files whose frontmatter read/parse throws, and files without an `id`.
//
// Ordering: records are sorted by filename with a plain code-unit
// comparison. `localeCompare` is deliberately NOT used here — its
// result depends on the runtime's default locale / ICU data, so the
// same set of outliers could be visited in a different order on a
// differently-configured machine, which would break the "same input →
// same slug assignment order" guarantee this sort exists to provide.
function collectRootLeaves(wikiRoot) {
  let dirents;
  try {
    dirents = readdirSync(wikiRoot, { withFileTypes: true });
  } catch {
    // Missing or unreadable root: nothing to contain.
    return [];
  }

  const leaves = [];
  for (const dirent of dirents) {
    const { name } = dirent;
    const isCandidate =
      dirent.isFile() &&
      name.endsWith(".md") &&
      name !== "index.md" &&
      !name.startsWith(".");
    if (!isCandidate) continue;

    const full = join(wikiRoot, name);
    try {
      const captured = readFrontmatterStreaming(full);
      if (captured === null) continue; // no frontmatter fence → plain md
      const { data } = parseFrontmatter(captured.frontmatterText, full);
      if (!data?.id) continue; // unroutable, skip
      leaves.push({ path: full, data });
    } catch {
      // Unreadable / malformed frontmatter is skipped on purpose.
      continue;
    }
  }

  // Locale-independent, byte-stable ordering by filename.
  leaves.sort((a, b) => {
    const nameA = basename(a.path);
    const nameB = basename(b.path);
    if (nameA < nameB) return -1;
    if (nameA > nameB) return 1;
    return 0;
  });
  return leaves;
}
|
|
104
|
+
|
|
105
|
+
// Walk the wiki root and return the other root-direct children
|
|
106
|
+
// (leaves + subcategory indices) as an IDF sibling corpus for
|
|
107
|
+
// `generateDeterministicSlug`. The slug algorithm ranks a leaf's
|
|
108
|
+
// tokens by distinctiveness vs these siblings, so the corpus must
|
|
109
|
+
// include every other top-level entry the slug should discriminate
|
|
110
|
+
// against. Passing plain leaves won't tell the IDF ranker that
|
|
111
|
+
// "cache" appears in 7 subcategories; passing subcategory `index.md`
|
|
112
|
+
// frontmatters does.
|
|
113
|
+
// Gather the sibling corpus at the wiki root: every root-level leaf
// (non-index `.md` file) plus the `index.md` of every root-level
// directory that has one. Each record is `{ path, data }` with the
// parsed frontmatter; entries without a truthy `id`, without a
// frontmatter fence, or whose frontmatter throws while being read are
// left out. Dot-prefixed entries are ignored, and the leaf whose path
// equals `excludePath` is omitted. Returns `[]` when `wikiRoot`
// cannot be listed.
function collectRootSiblings(wikiRoot, excludePath) {
  let dirents;
  try {
    dirents = readdirSync(wikiRoot, { withFileTypes: true });
  } catch {
    return [];
  }

  // Map a directory entry to the frontmatter file it contributes, or
  // `null` when the entry contributes nothing.
  const candidatePathFor = (dirent) => {
    if (dirent.name.startsWith(".")) return null;
    const abs = join(wikiRoot, dirent.name);
    const isLeaf =
      dirent.isFile() &&
      dirent.name.endsWith(".md") &&
      dirent.name !== "index.md";
    if (isLeaf) {
      return abs === excludePath ? null : abs;
    }
    if (dirent.isDirectory()) {
      // A subcategory is represented by its own index.md frontmatter.
      const nestedIndex = join(abs, "index.md");
      return existsSync(nestedIndex) ? nestedIndex : null;
    }
    return null;
  };

  const siblings = [];
  for (const dirent of dirents) {
    const target = candidatePathFor(dirent);
    if (target === null) continue;
    try {
      const captured = readFrontmatterStreaming(target);
      if (captured === null) continue;
      const { data } = parseFrontmatter(captured.frontmatterText, target);
      if (data?.id) siblings.push({ path: target, data });
    } catch {
      /* skip malformed */
    }
  }
  return siblings;
}
|
|
151
|
+
|
|
152
|
+
// Rewrite parents[] after a one-level-down move. Primary parent
|
|
153
|
+
// (first entry) stays "index.md" because the leaf's new direct
|
|
154
|
+
// parent sits in the same dir as the leaf itself. Every other entry
|
|
155
|
+
// gains a "../" prefix because paths that were relative to the old
|
|
156
|
+
// leaf-dir (wiki root) are now one level too shallow.
|
|
157
|
+
//
|
|
158
|
+
// Exception: a parent entry that already starts with "../" is a
|
|
159
|
+
// depth-contract violation on the input — a root-level leaf has no
|
|
160
|
+
// legitimate parent above wikiRoot to reference. Blindly prepending
|
|
161
|
+
// "../" would turn the already-malformed "../foo" into "../../foo",
|
|
162
|
+
// escaping the wiki root outright. Preserve the (already-malformed)
|
|
163
|
+
// entry byte-identical instead, and let validation surface it
|
|
164
|
+
// post-containment under its normal parent-path rules.
|
|
165
|
+
// Adjust the `parents[]` frontmatter of a leaf that has just been moved
// one directory deeper, then rewrite the file in place.
//
// Rules applied to each `parents` entry:
//   - non-string entries are kept as-is;
//   - a first entry equal to "index.md" stays "index.md";
//   - entries already starting with "../" are kept byte-identical;
//   - every other entry gains a "../" prefix.
//
// The function is a silent no-op when the frontmatter cannot be read
// or parsed, when there is no parsed data, or when `parents` is
// missing / not an array / empty. Frontmatter is read through
// `readFrontmatterStreaming`; the body is taken from the raw file
// bytes starting at the reported `bodyOffset`, and CRLF is normalised
// to LF in the body when the helper reports `lineEnding === "crlf"`.
function rewriteParentsAfterContainment(leafPath) {
  let captured;
  let parsed;
  try {
    captured = readFrontmatterStreaming(leafPath);
    if (!captured) return;
    parsed = parseFrontmatter(captured.frontmatterText, leafPath);
  } catch {
    return;
  }

  const frontmatter = parsed?.data;
  if (!frontmatter) return;
  const { parents } = frontmatter;
  if (!Array.isArray(parents) || parents.length === 0) return;

  // One-level-down adjustment for a single parents[] entry.
  const shiftDown = (entry, position) => {
    if (typeof entry !== "string") return entry;
    // Same-directory primary reference survives the move unchanged.
    if (position === 0 && entry === "index.md") return "index.md";
    // Already climbing out of the leaf's directory: leave untouched
    // rather than pushing it one level further up.
    if (entry.startsWith("../")) return entry;
    return `../${entry}`;
  };
  frontmatter.parents = parents.map(shiftDown);

  // Slice on bytes (not characters) so the offset reported by the
  // streaming reader lines up regardless of multi-byte content.
  const fileBytes = readFileSync(leafPath);
  const bodyText = fileBytes.subarray(captured.bodyOffset).toString("utf8");
  const body =
    captured.lineEnding === "crlf" ? bodyText.replace(/\r\n/g, "\n") : bodyText;
  writeFileSync(leafPath, renderFrontmatter(frontmatter, body), "utf8");
}
|
|
220
|
+
|
|
221
|
+
// Write the stub `<slug>/index.md` for a newly-minted single-member
|
|
222
|
+
// subcategory. The stub inherits the member's topical signature via
|
|
223
|
+
// `deterministicPurpose` so a Claude navigator reading it
|
|
224
|
+
// immediately sees what's inside. `rebuildAllIndices` in Phase 5
|
|
225
|
+
// will populate the `entries[]` field on the next pass; we don't
|
|
226
|
+
// pre-seed it here.
|
|
227
|
+
//
|
|
228
|
+
// `parents: ["../index.md"]` is pre-seeded on the stub so the
|
|
229
|
+
// intermediate root-containment commit satisfies `PARENTS-REQUIRED`
|
|
230
|
+
// BEFORE Phase 5's rebuild runs. Without this, a reviewer who later
|
|
231
|
+
// drops the Phase-5 commit via `git revert` (the `--review` drop
|
|
232
|
+
// flow) would leave a tree with a parentless subcategory index —
|
|
233
|
+
// the dropped-state validate would fire `PARENTS-REQUIRED` on every
|
|
234
|
+
// stub X.11 created. `rebuildAllIndices` line 185 only fills
|
|
235
|
+
// `data.parents` when it's unset, so the seeded value survives the
|
|
236
|
+
// Phase 5 pass byte-identical.
|
|
237
|
+
// Create `<targetDir>/index.md` for a freshly-created single-member
// subcategory and return `{ path, data }` describing what was written.
//
// The stub's frontmatter uses `slug` as its `id`, is typed as a
// subcategory index, and points at "../index.md" as its parent. Its
// `focus` is the result of `deterministicPurpose([leaf])`, falling
// back to the leaf's own `focus` and finally to the empty string. The
// body is a single heading naming the slug. `entries[]` is not
// written here.
function writeStubIndex(targetDir, slug, leaf) {
  const stubPath = join(targetDir, "index.md");
  const focus = deterministicPurpose([leaf]) || leaf.data.focus || "";
  const frontmatter = {
    id: slug,
    type: "index",
    depth_role: "subcategory",
    focus,
    parents: ["../index.md"],
    generator: "skill-llm-wiki/v1",
  };
  const rendered = renderFrontmatter(frontmatter, `\n# ${slug}\n`);
  writeFileSync(stubPath, rendered, "utf8");
  // Hand back exactly what was written so the caller can keep its
  // in-memory view aligned with the file on disk.
  return { path: stubPath, data: frontmatter };
}
|
|
255
|
+
|
|
256
|
+
// Main entry. Returns a summary for the orchestrator phase log.
|
|
257
|
+
//
|
|
258
|
+
// Shape:
|
|
259
|
+
// {
|
|
260
|
+
// outliers: number, // root leaves detected
|
|
261
|
+
// moved: number, // successfully contained
|
|
262
|
+
// operations: [{ from, to, slug }]
|
|
263
|
+
// }
|
|
264
|
+
//
|
|
265
|
+
// Contract:
|
|
266
|
+
// - Zero outliers → no mkdir, no writes, returns { outliers: 0 }.
|
|
267
|
+
// - Each outlier lands in its OWN subcategory (never a shared
|
|
268
|
+
// bucket).
|
|
269
|
+
// - Slug derivation is deterministic (`generateDeterministicSlug`
|
|
270
|
+
// + `resolveNestSlug`'s collision fallback).
|
|
271
|
+
// - parents[] rewrite handled per moved leaf.
|
|
272
|
+
// Entry point: move every root-level outlier leaf into its own new
// subcategory directory and report what happened.
//
// Resolves to `{ outliers, moved, operations }`, where `operations`
// lists `{ from, to, slug }` per moved leaf in processing order. With
// no outliers it returns zero counts without touching the filesystem.
//
// Per outlier, in this order: build the sibling corpus without the
// outlier itself, derive a slug, resolve collisions against the
// wiki-wide forbidden-id index, create the target directory, move the
// leaf, rewrite its parents[], write the stub index, then record the
// new slug and swap the outlier for its stub in the in-memory sibling
// corpus so later outliers see it.
//
// Throws (as a rejected promise) if the resolved target directory
// already exists.
export async function runRootContainment(wikiRoot) {
  const outliers = collectRootLeaves(wikiRoot);
  if (outliers.length === 0) {
    return { outliers: 0, moved: 0, operations: [] };
  }

  // Both lookups are built once and updated incrementally below
  // instead of being recomputed from disk for every outlier.
  const wikiIndex = buildWikiForbiddenIndex(wikiRoot);
  let siblings = collectRootSiblings(wikiRoot, /* excludePath */ null);
  const operations = [];

  for (const outlier of outliers) {
    const isOtherEntry = (sibling) => sibling.path !== outlier.path;

    // The outlier is still at root, so keep it out of its own corpus.
    const corpus = siblings.filter(isOtherEntry);
    let idfMap;
    if (corpus.length > 0) {
      idfMap = buildSiblingIdfContext(corpus);
    }
    const slug = generateDeterministicSlug([outlier], corpus, {
      precomputedIdf: idfMap,
    });
    const resolvedSlug = resolveNestSlug(
      slug,
      { leaves: [outlier], parent_dir: wikiRoot },
      wikiRoot,
      { wikiIndex },
    );

    const targetDir = join(wikiRoot, resolvedSlug);
    if (existsSync(targetDir)) {
      // Defensive: fail loudly instead of merging into or overwriting
      // a directory that is already there.
      throw new Error(
        `root-containment: target ${targetDir} already exists for outlier ${basename(outlier.path)} — slug resolution leaked a collision`,
      );
    }

    mkdirSync(targetDir);
    const newLeafPath = join(targetDir, basename(outlier.path));
    renameSync(outlier.path, newLeafPath);
    rewriteParentsAfterContainment(newLeafPath);
    const stubRecord = writeStubIndex(targetDir, resolvedSlug, outlier);

    // Reserve the slug and replace the moved leaf with its stub in the
    // sibling corpus.
    wikiIndex.add(resolvedSlug);
    siblings = [...siblings.filter(isOtherEntry), stubRecord];
    operations.push({ from: outlier.path, to: newLeafPath, slug: resolvedSlug });
  }

  return {
    outliers: outliers.length,
    moved: operations.length,
    operations,
  };
}
|
|
@@ -3,12 +3,29 @@
|
|
|
3
3
|
// (b,a) resolve to the same entry. Invalidated implicitly when either
|
|
4
4
|
// entry's hash changes — the key simply doesn't match anymore.
|
|
5
5
|
//
|
|
6
|
-
// Cache entries are JSON files under
|
|
7
|
-
//
|
|
8
|
-
//
|
|
6
|
+
// Cache entries are JSON files under
|
|
7
|
+
// `<wiki>/.llmwiki/similarity-cache/<shard>/<rest>.json`, where
|
|
8
|
+
// `<shard>` is the first `CACHE_SHARD_PREFIX_LEN` hex chars of the
|
|
9
|
+
// 32-char cache key and `<rest>` is the remaining
|
|
10
|
+
// `32 - CACHE_SHARD_PREFIX_LEN` chars. The default
|
|
11
|
+
// `CACHE_SHARD_PREFIX_LEN = 2` gives 16² = 256 shards, which keeps
|
|
12
|
+
// each shard dir's inode count ~cacheSize/256 — a 178k-pair
|
|
13
|
+
// corpus has ~700 entries per shard instead of 178k in a single
|
|
14
|
+
// flat directory.
|
|
15
|
+
// APFS/ext4/ZFS directory lookups degrade with entry count once
|
|
16
|
+
// the VFS dirent cache overflows (~10k on typical kernels), so
|
|
17
|
+
// sharding turns the per-lookup cost from O(log N)-with-large-N
|
|
18
|
+
// back into O(log N)-with-small-N. Same pattern as `.git/objects`.
|
|
19
|
+
//
|
|
9
20
|
// The payload carries the tier, similarity, decision, and the tier
|
|
10
21
|
// at which the decision was resolved — tests read it back to verify
|
|
11
22
|
// caching prevented redundant work.
|
|
23
|
+
//
|
|
24
|
+
// Pre-sharding caches: the old flat layout (`<cacheDir>/<key>.json`)
|
|
25
|
+
// is NOT auto-migrated. This is a pure perf cache — if a user
|
|
26
|
+
// upgrades and the cache invalidates, the next build recomputes
|
|
27
|
+
// everything once and fills the sharded layout. No user data at
|
|
28
|
+
// stake; nothing to preserve.
|
|
12
29
|
|
|
13
30
|
import { createHash } from "node:crypto";
|
|
14
31
|
import {
|
|
@@ -26,8 +43,8 @@ export function cacheDir(wikiRoot) {
|
|
|
26
43
|
return join(wikiRoot, ".llmwiki", "similarity-cache");
|
|
27
44
|
}
|
|
28
45
|
|
|
29
|
-
// Deterministic filename for a hash pair. Hash prefixes are
|
|
30
|
-
// so the lookup is symmetric regardless of argument order.
|
|
46
|
+
// Deterministic filename stem for a hash pair. Hash prefixes are
|
|
47
|
+
// sorted so the lookup is symmetric regardless of argument order.
|
|
31
48
|
export function cacheKey(hashA, hashB) {
|
|
32
49
|
if (!hashA || !hashB) {
|
|
33
50
|
throw new Error("similarity-cache: both hashes must be non-empty strings");
|
|
@@ -42,8 +59,19 @@ export function cacheKey(hashA, hashB) {
|
|
|
42
59
|
.slice(0, 32);
|
|
43
60
|
}
|
|
44
61
|
|
|
62
|
+
// Number of hex chars taken from the start of the cache key as the
|
|
63
|
+
// shard directory name. 2 → 256 shards, which keeps every shard dir
|
|
64
|
+
// below 1k entries for workloads up to ~256k pairs. Changing this
|
|
65
|
+
// constant invalidates the cache layout for existing wikis, but
|
|
66
|
+
// since similarity-cache is purely an optimisation the next build
|
|
67
|
+
// simply rebuilds the populated shards.
|
|
68
|
+
export const CACHE_SHARD_PREFIX_LEN = 2;
|
|
69
|
+
|
|
45
70
|
// Resolve the on-disk location of the cache entry for a hash pair:
// `<cacheDir>/<shard>/<rest>.json`, where `<shard>` is the first
// `CACHE_SHARD_PREFIX_LEN` characters of the pair's cache key and
// `<rest>` is the remainder of that key.
export function cacheEntryPath(wikiRoot, hashA, hashB) {
  const fullKey = cacheKey(hashA, hashB);
  const shardName = fullKey.slice(0, CACHE_SHARD_PREFIX_LEN);
  const fileName = `${fullKey.slice(CACHE_SHARD_PREFIX_LEN)}.json`;
  return join(cacheDir(wikiRoot), shardName, fileName);
}
|
|
48
76
|
|
|
49
77
|
// Read a cached decision. Returns null on miss or malformed file.
|
|
@@ -72,13 +100,24 @@ export function readCached(wikiRoot, hashA, hashB) {
|
|
|
72
100
|
}
|
|
73
101
|
|
|
74
102
|
// Write a decision to the cache atomically (temp-file + rename).
|
|
103
|
+
// The shard subdirectory is created on demand, so callers don't
|
|
104
|
+
// need to pre-create it. `mkdirSync({ recursive: true })` is
|
|
105
|
+
// idempotent and cheap on hot shards (kernel caches the parent's
|
|
106
|
+
// dentry).
|
|
75
107
|
export function writeCached(wikiRoot, hashA, hashB, decision) {
|
|
76
108
|
if (!decision || typeof decision !== "object") {
|
|
77
109
|
throw new Error("similarity-cache: decision must be an object");
|
|
78
110
|
}
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
111
|
+
// Compute the key once — previously we hashed twice (once via
|
|
112
|
+
// cacheEntryPath + once inline for the shard slice). The hot
|
|
113
|
+
// write path of a 596-leaf sweep calls this ~178k times, so
|
|
114
|
+
// halving the sha256 work is a small-but-cheap win.
|
|
115
|
+
const key = cacheKey(hashA, hashB);
|
|
116
|
+
const shard = key.slice(0, CACHE_SHARD_PREFIX_LEN);
|
|
117
|
+
const rest = key.slice(CACHE_SHARD_PREFIX_LEN);
|
|
118
|
+
const shardDir = join(cacheDir(wikiRoot), shard);
|
|
119
|
+
const path = join(shardDir, rest + ".json");
|
|
120
|
+
mkdirSync(shardDir, { recursive: true });
|
|
82
121
|
const payload = JSON.stringify(
|
|
83
122
|
{
|
|
84
123
|
tier: decision.tier,
|
|
@@ -97,30 +136,86 @@ export function writeCached(wikiRoot, hashA, hashB, decision) {
|
|
|
97
136
|
|
|
98
137
|
// Remove every cache file. Used by tests and by `startCorpus` via
|
|
99
138
|
// the orchestrator when the corpus changes materially. Safe to call
|
|
100
|
-
// when the cache dir doesn't exist.
|
|
139
|
+
// when the cache dir doesn't exist. Walks every shard directory
|
|
140
|
+
// plus the top-level (tolerates pre-sharding flat caches from
|
|
141
|
+
// before the layout change — they get cleared on first clear-call).
|
|
142
|
+
//
|
|
143
|
+
// `readdirSync({ withFileTypes: true })` returns `Dirent` entries
|
|
144
|
+
// that already carry `isDirectory()` / `isFile()` metadata, so we
|
|
145
|
+
// can branch on type without a per-entry `statSync` syscall. On a
|
|
146
|
+
// pre-sharding flat cache with 178k entries that's 178k syscalls
|
|
147
|
+
// saved on the first post-upgrade clear.
|
|
101
148
|
// Delete all cached entries under the wiki's cache directory and
// return how many `.json` files were removed.
//
// Handles both layouts: `.json` files inside first-level (shard)
// directories, and `.json` files directly in the cache directory.
// After a shard's `.json` files are removed, the shard directory
// itself is removed recursively. Other top-level files are left
// alone. Every removal is best-effort: a failure is swallowed and the
// walk continues. Returns 0 when the cache directory does not exist
// or cannot be listed.
export function clearCache(wikiRoot) {
  const root = cacheDir(wikiRoot);
  if (!existsSync(root)) return 0;

  let removed = 0;
  const isJsonFile = (dirent) => dirent.isFile() && dirent.name.endsWith(".json");
  // Remove one file, counting it only when the removal did not throw.
  const removeCounted = (target) => {
    try {
      rmSync(target, { force: true });
      removed++;
    } catch {
      /* best-effort */
    }
  };

  let topLevel;
  try {
    topLevel = readdirSync(root, { withFileTypes: true });
  } catch {
    return 0;
  }

  for (const dirent of topLevel) {
    const entryPath = join(root, dirent.name);
    if (dirent.isDirectory()) {
      // Shard directory: count its .json files one by one, then drop
      // the directory as a whole.
      try {
        const shardEntries = readdirSync(entryPath, { withFileTypes: true });
        for (const sub of shardEntries) {
          if (isJsonFile(sub)) removeCounted(join(entryPath, sub.name));
        }
        rmSync(entryPath, { force: true, recursive: true });
      } catch {
        /* best-effort */
      }
      continue;
    }
    if (isJsonFile(dirent)) {
      // Flat-layout entry sitting directly in the cache directory.
      removeCounted(entryPath);
    }
  }
  return removed;
}
|
|
116
190
|
|
|
117
|
-
// Count cached entries — convenience for tests and metrics.
|
|
191
|
+
// Count cached entries — convenience for tests and metrics. Walks
|
|
192
|
+
// every shard directory; also counts any pre-sharding flat entries
|
|
193
|
+
// if they exist (so a pre-upgrade cache still reports meaningful
|
|
194
|
+
// size until the user runs a build that regenerates). Uses
|
|
195
|
+
// `withFileTypes: true` for the same per-syscall saving as
|
|
196
|
+
// `clearCache`.
|
|
118
197
|
// Count cached entries: `.json` files inside first-level (shard)
// directories plus `.json` files sitting directly in the cache
// directory (flat layout). Returns 0 when the cache directory is
// missing or cannot be listed; a shard that cannot be listed is
// skipped rather than failing the whole count.
export function cacheSize(wikiRoot) {
  const root = cacheDir(wikiRoot);
  if (!existsSync(root)) return 0;

  const isJsonFile = (dirent) => dirent.isFile() && dirent.name.endsWith(".json");

  let topLevel;
  try {
    topLevel = readdirSync(root, { withFileTypes: true });
  } catch {
    return 0;
  }

  let total = 0;
  for (const dirent of topLevel) {
    if (!dirent.isDirectory()) {
      if (isJsonFile(dirent)) total++; // pre-sharding flat entry
      continue;
    }
    try {
      const shardEntries = readdirSync(join(root, dirent.name), { withFileTypes: true });
      for (const sub of shardEntries) {
        if (isJsonFile(sub)) total++;
      }
    } catch {
      /* best-effort */
    }
  }
  return total;
}
|
|
@@ -92,6 +92,17 @@ export function computeIdf(tokenLists) {
|
|
|
92
92
|
// every pair — the difference between O(N³) and O(N²) work.
|
|
93
93
|
// Build the comparison model for a list of entries: each entry is
// turned into its text via `entryText`, and the resulting strings are
// handed to `buildComparisonModelFromTexts`, whose return value is
// passed through unchanged.
export function buildComparisonModel(entries) {
  return buildComparisonModelFromTexts(
    entries.map((entry) => entryText(entry)),
  );
}
|
|
97
|
+
|
|
98
|
+
// Build the same model shape directly from pre-assembled text
|
|
99
|
+
// strings. Useful when the caller has already done its own
|
|
100
|
+
// aggregation (e.g. `soft-dag.mjs` concatenates multiple leaves'
|
|
101
|
+
// `entryText` outputs into a single "category text") and passing
|
|
102
|
+
// the result back through `entryText` would either double-count
|
|
103
|
+
// the doubled-focus treatment or simply waste tokenisation work.
|
|
104
|
+
// No `entryText` roundtrip — the text goes straight to tokenisation.
|
|
105
|
+
export function buildComparisonModelFromTexts(texts) {
|
|
95
106
|
const tokenLists = texts.map((t) => tokenize(t));
|
|
96
107
|
const idfMap = computeIdf(tokenLists);
|
|
97
108
|
return { texts, tokenLists, idfMap };
|