@ctxr/skill-llm-wiki 1.0.1 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. package/CHANGELOG.md +118 -0
  2. package/README.md +2 -2
  3. package/SKILL.md +7 -0
  4. package/guide/cli.md +6 -4
  5. package/guide/consumers/index.md +106 -0
  6. package/guide/consumers/quickstart.md +96 -0
  7. package/guide/consumers/recipes/ci-gate.md +125 -0
  8. package/guide/consumers/recipes/dated-wiki.md +131 -0
  9. package/guide/consumers/recipes/format-gate.md +126 -0
  10. package/guide/consumers/recipes/post-write-heal.md +125 -0
  11. package/guide/consumers/recipes/skill-absent.md +111 -0
  12. package/guide/consumers/recipes/subject-wiki.md +110 -0
  13. package/guide/consumers/recipes/testing.md +149 -0
  14. package/guide/index.md +9 -0
  15. package/guide/substrate/operators.md +1 -1
  16. package/guide/substrate/tiered-ai.md +6 -5
  17. package/guide/ux/user-intent.md +6 -5
  18. package/package.json +9 -3
  19. package/scripts/cli.mjs +565 -15
  20. package/scripts/lib/balance.mjs +579 -0
  21. package/scripts/lib/cluster-detect.mjs +482 -4
  22. package/scripts/lib/contract.mjs +257 -0
  23. package/scripts/lib/decision-log.mjs +121 -15
  24. package/scripts/lib/heal.mjs +167 -0
  25. package/scripts/lib/init.mjs +210 -0
  26. package/scripts/lib/intent.mjs +370 -4
  27. package/scripts/lib/join-constants.mjs +22 -0
  28. package/scripts/lib/join.mjs +917 -0
  29. package/scripts/lib/json-envelope.mjs +190 -0
  30. package/scripts/lib/nest-applier.mjs +395 -32
  31. package/scripts/lib/operators.mjs +472 -38
  32. package/scripts/lib/orchestrator.mjs +419 -12
  33. package/scripts/lib/root-containment.mjs +351 -0
  34. package/scripts/lib/similarity-cache.mjs +115 -20
  35. package/scripts/lib/similarity.mjs +11 -0
  36. package/scripts/lib/soft-dag.mjs +726 -0
  37. package/scripts/lib/templates.mjs +78 -0
  38. package/scripts/lib/tiered.mjs +42 -18
  39. package/scripts/lib/validate.mjs +22 -0
  40. package/scripts/lib/where.mjs +71 -0
  41. package/scripts/testkit/assert-frontmatter.mjs +171 -0
  42. package/scripts/testkit/cli-run.mjs +95 -0
  43. package/scripts/testkit/make-wiki-fixture.mjs +301 -0
  44. package/scripts/testkit/stub-skill.mjs +107 -0
  45. package/templates/adrs.llmwiki.layout.yaml +33 -0
  46. package/templates/plans.llmwiki.layout.yaml +34 -0
  47. package/templates/regressions.llmwiki.layout.yaml +34 -0
  48. package/templates/reports.llmwiki.layout.yaml +33 -0
  49. package/templates/runbooks.llmwiki.layout.yaml +33 -0
  50. package/templates/sessions.llmwiki.layout.yaml +34 -0
@@ -0,0 +1,351 @@
1
+ // root-containment.mjs — enforce "no leaves at wiki root" invariant.
2
+ //
3
+ // Runs as Phase 4.4.5 (between soft-DAG synthesis and review) so the
4
+ // containment commit participates in the `--review` diff — users can
5
+ // drop/abort individual containment moves exactly like they can drop
6
+ // any other tree-mutating phase's commits.
7
+ // Walks `wikiRoot`, collects every direct-child `.md` file (i.e., a
8
+ // leaf that sits at the wiki root itself — depth 0 per `depthOf`,
9
+ // one level above any subcategory) other than `index.md`, and moves
10
+ // each into its own semantically-named
11
+ // subcategory derived from the leaf's own TF-IDF distinguishing
12
+ // tokens. A stub `<slug>/index.md` is written so the new category
13
+ // is routable; Phase 5's `rebuildAllIndices` populates the stub's
14
+ // `entries[]` on the next pass.
15
+ //
16
+ // Why a single-member category rather than a shared "uncategorised"
17
+ // bucket: every reviewer leaf has `focus` / `covers` / `tags` that
18
+ // describe some coherent topic, so the honest answer to "where does
19
+ // this belong?" is "in its own tight category named after what it
20
+ // is." A shared bucket label admits defeat about something the data
21
+ // already tells us; a per-outlier slug preserves the semantic signal.
22
+ // If the corpus later grows a topically-adjacent leaf, future builds'
23
+ // convergence + balance may nest both into an existing category — a
24
+ // single-member start state is a valid transient, not a permanent
25
+ // scar.
26
+ //
27
+ // Slug uniqueness is enforced via `resolveNestSlug` + the full-wiki
28
+ // forbidden-id index from PR #5. A generated slug that happens to
29
+ // collide with an existing subcategory basename, leaf id, or alias
30
+ // elsewhere in the tree gets the `-group` / `-group-N` fallback
31
+ // treatment.
32
+ //
33
+ // parents[] rewrite on the moved leaf:
34
+ // - Primary parent: stays `"index.md"`. The leaf's new direct
35
+ // parent (`<slug>/index.md`) is same-dir-as-leaf, so the
36
+ // POSIX-relative path string doesn't change even though its
37
+ // semantic target moves from root-index to subcategory-index.
38
+ // Same convention `applyBalanceFlatten` leveraged when moving a
39
+ // subtree up one level (PR #8).
40
+ // - Soft parents (if any): paths that were relative to the old
41
+ // leaf-dir (wiki root) gain a "../" prefix because the leaf now
42
+ // sits one level deeper. Example: `"b/index.md"` → `"../b/index.md"`.
43
+ //
44
+ // Determinism: outlier iteration is lex-sorted by filename, so two
45
+ // runs on the same set of outliers produce byte-identical slug
46
+ // assignment order (matters for `-group-N` collision tie-breaks).
47
+ // `generateDeterministicSlug` + `deterministicPurpose` are both
48
+ // byte-stable across member order.
49
+
50
+ import {
51
+ existsSync,
52
+ mkdirSync,
53
+ readFileSync,
54
+ readdirSync,
55
+ renameSync,
56
+ writeFileSync,
57
+ } from "node:fs";
58
+ import { basename, join } from "node:path";
59
+ import { readFrontmatterStreaming } from "./chunk.mjs";
60
+ import {
61
+ buildSiblingIdfContext,
62
+ deterministicPurpose,
63
+ generateDeterministicSlug,
64
+ } from "./cluster-detect.mjs";
65
+ import { parseFrontmatter, renderFrontmatter } from "./frontmatter.mjs";
66
+ import { buildWikiForbiddenIndex, resolveNestSlug } from "./nest-applier.mjs";
67
+
68
+ // Walk the wiki root and return outlier leaves — non-index `.md`
69
+ // files sitting directly at the wiki root (depth 0 per `depthOf`).
70
+ // Each item is `{ path, data }`
71
+ // with parsed frontmatter so the caller can feed directly into
72
+ // `generateDeterministicSlug`. Files whose frontmatter fails to
73
+ // parse are skipped silently — the validator will surface them
74
+ // separately.
75
/**
 * Collect the outlier leaves that sit directly at the wiki root.
 *
 * An outlier is a regular file whose name ends in `.md`, is not
 * `index.md`, does not start with ".", and whose frontmatter parses
 * to an object carrying a truthy `id`.
 *
 * @param {string} wikiRoot - Directory to scan (direct children only).
 * @returns {Array<{path: string, data: object}>} Outliers ordered by
 *   filename. Empty when `wikiRoot` cannot be read.
 */
function collectRootLeaves(wikiRoot) {
  let entries;
  try {
    entries = readdirSync(wikiRoot, { withFileTypes: true });
  } catch {
    // Missing or unreadable root: nothing to contain.
    return [];
  }
  const out = [];
  for (const e of entries) {
    if (!e.isFile()) continue;
    if (!e.name.endsWith(".md")) continue;
    if (e.name === "index.md") continue;
    if (e.name.startsWith(".")) continue;
    const full = join(wikiRoot, e.name);
    try {
      const captured = readFrontmatterStreaming(full);
      if (captured === null) continue; // no frontmatter fence → plain md
      const { data } = parseFrontmatter(captured.frontmatterText, full);
      if (!data?.id) continue; // unroutable, skip
      out.push({ path: full, data });
    } catch {
      // Deliberate best-effort: files whose frontmatter cannot be read
      // or parsed are skipped here rather than aborting the scan.
      continue;
    }
  }
  // Order by filename using plain code-unit comparison rather than
  // `localeCompare`. `localeCompare` consults the host's locale/ICU
  // data, so two machines could order the same filenames differently
  // (e.g. mixed case or punctuation), which would change which outlier
  // wins a slug-collision tie-break. Code-unit order is the same on
  // every host.
  out.sort((a, b) => {
    const nameA = basename(a.path);
    const nameB = basename(b.path);
    if (nameA < nameB) return -1;
    if (nameA > nameB) return 1;
    return 0;
  });
  return out;
}
104
+
105
+ // Walk the wiki root and return the other root-direct children
106
+ // (leaves + subcategory indices) as an IDF sibling corpus for
107
+ // `generateDeterministicSlug`. The slug algorithm ranks a leaf's
108
+ // tokens by distinctiveness vs these siblings, so the corpus must
109
+ // include every other top-level entry the slug should discriminate
110
+ // against. Passing plain leaves won't tell the IDF ranker that
111
+ // "cache" appears in 7 subcategories; passing subcategory `index.md`
112
+ // frontmatters does.
113
/**
 * Gather the frontmatter of every routable direct child of the wiki
 * root: root-level `.md` leaves (except `index.md` and `excludePath`)
 * plus the `index.md` of each immediate subdirectory.
 *
 * @param {string} wikiRoot - Directory to scan.
 * @param {string|null} excludePath - Absolute leaf path to leave out.
 * @returns {Array<{path: string, data: object}>} Records whose
 *   frontmatter carries a truthy `id`; empty when the root is unreadable.
 */
function collectRootSiblings(wikiRoot, excludePath) {
  let dirents;
  try {
    dirents = readdirSync(wikiRoot, { withFileTypes: true });
  } catch {
    return [];
  }

  const siblings = [];

  // Parse the frontmatter at `absPath` and record it when it has an id.
  // Unreadable or malformed files are skipped.
  const recordIfRoutable = (absPath) => {
    try {
      const captured = readFrontmatterStreaming(absPath);
      const data =
        captured === null
          ? null
          : parseFrontmatter(captured.frontmatterText, absPath).data;
      if (data?.id) siblings.push({ path: absPath, data });
    } catch {
      /* skip malformed */
    }
  };

  for (const dirent of dirents) {
    const { name } = dirent;
    if (name.startsWith(".")) continue;
    const full = join(wikiRoot, name);

    const isRootLeaf =
      dirent.isFile() && name.endsWith(".md") && name !== "index.md";
    if (isRootLeaf) {
      if (full !== excludePath) recordIfRoutable(full);
      continue;
    }

    if (!dirent.isDirectory()) continue;
    // A subcategory contributes its own index.md frontmatter.
    const indexPath = join(full, "index.md");
    if (existsSync(indexPath)) recordIfRoutable(indexPath);
  }
  return siblings;
}
151
+
152
+ // Rewrite parents[] after a one-level-down move. Primary parent
153
+ // (first entry) stays "index.md" because the leaf's new direct
154
+ // parent sits in the same dir as the leaf itself. Every other entry
155
+ // gains a "../" prefix because paths that were relative to the old
156
+ // leaf-dir (wiki root) are now one level too shallow.
157
+ //
158
+ // Exception: a parent entry that already starts with "../" is a
159
+ // depth-contract violation on the input — a root-level leaf has no
160
+ // legitimate parent above wikiRoot to reference. Blindly prepending
161
+ // "../" would turn the already-malformed "../foo" into "../../foo",
162
+ // escaping the wiki root outright. Preserve the (already-malformed)
163
+ // entry byte-identical instead, and let validation surface it
164
+ // post-containment under its normal parent-path rules.
165
/**
 * Rewrite the `parents[]` frontmatter of a leaf that has just been
 * moved one directory level down, then write the file back in place.
 *
 * Rules applied to each string entry:
 *   - index 0 equal to "index.md" is kept (same-dir reference);
 *   - entries already starting with "../" are kept byte-identical, so
 *     an already-escaping path is not pushed further out;
 *   - every other entry gains a "../" prefix.
 * Non-string entries pass through untouched.
 *
 * Returns without writing when the file cannot be read, has no
 * frontmatter, fails to parse, or has no (or an empty) `parents` array.
 *
 * @param {string} leafPath - Path of the leaf at its NEW location.
 * @returns {void}
 */
function rewriteParentsAfterContainment(leafPath) {
  // Read through the streaming helper, the same reader the discovery
  // pass uses, so a leaf that was discovered is also rewritable here.
  let captured;
  try {
    captured = readFrontmatterStreaming(leafPath);
  } catch {
    return;
  }
  if (!captured) return;

  let parsed;
  try {
    parsed = parseFrontmatter(captured.frontmatterText, leafPath);
  } catch {
    return;
  }
  if (!parsed?.data) return;

  const parents = Array.isArray(parsed.data.parents) ? parsed.data.parents : [];
  if (parents.length === 0) return;

  parsed.data.parents = parents.map((p, i) => {
    if (typeof p !== "string") return p;
    // Primary parent "index.md" is a same-dir reference that survives
    // the move unchanged.
    if (i === 0 && p === "index.md") return "index.md";
    // Already-escaping entries are preserved as-is; prefixing again
    // would only push them further outside the tree.
    if (p.startsWith("../")) return p;
    return "../" + p;
  });

  // Take the body as a byte range starting at `bodyOffset`, so
  // multi-byte characters ahead of the body cannot shift the cut.
  // `subarray` replaces the deprecated `Buffer#slice`; both return a
  // view over the same bytes.
  const raw = readFileSync(leafPath);
  let body = raw.subarray(captured.bodyOffset).toString("utf8");
  if (captured.lineEnding === "crlf") {
    // Normalise a CRLF body to LF before it is re-rendered, so the
    // file does not end up with mixed line endings.
    body = body.replace(/\r\n/g, "\n");
  }
  writeFileSync(leafPath, renderFrontmatter(parsed.data, body), "utf8");
}
220
+
221
+ // Write the stub `<slug>/index.md` for a newly-minted single-member
222
+ // subcategory. The stub inherits the member's topical signature via
223
+ // `deterministicPurpose` so a Claude navigator reading it
224
+ // immediately sees what's inside. `rebuildAllIndices` in Phase 5
225
+ // will populate the `entries[]` field on the next pass; we don't
226
+ // pre-seed it here.
227
+ //
228
+ // `parents: ["../index.md"]` is pre-seeded on the stub so the
229
+ // intermediate root-containment commit satisfies `PARENTS-REQUIRED`
230
+ // BEFORE Phase 5's rebuild runs. Without this, a reviewer who later
231
+ // drops the Phase-5 commit via `git revert` (the `--review` drop
232
+ // flow) would leave a tree with a parentless subcategory index —
233
+ // the dropped-state validate would fire `PARENTS-REQUIRED` on every
234
+ // stub X.11 created. `rebuildAllIndices` line 185 only fills
235
+ // `data.parents` when it's unset, so the seeded value survives the
236
+ // Phase 5 pass byte-identical.
237
/**
 * Write the stub `index.md` for a freshly created single-member
 * subcategory and hand back the record that was written.
 *
 * @param {string} targetDir - Directory of the new subcategory.
 * @param {string} slug - Subcategory id; also used as the stub heading.
 * @param {{path: string, data: object}} leaf - The member leaf; its
 *   frontmatter supplies the fallback `focus`.
 * @returns {{path: string, data: object}} Stub path and the exact
 *   frontmatter object rendered into it.
 */
function writeStubIndex(targetDir, slug, leaf) {
  // Focus preference: derived purpose, then the leaf's own focus, then "".
  const focus = deterministicPurpose([leaf]) || leaf.data.focus || "";
  const stub = {
    path: join(targetDir, "index.md"),
    data: {
      id: slug,
      type: "index",
      depth_role: "subcategory",
      focus,
      // Seeded up front so the stub already names its parent index.
      parents: ["../index.md"],
      generator: "skill-llm-wiki/v1",
    },
  };
  const heading = `\n# ${slug}\n`;
  writeFileSync(stub.path, renderFrontmatter(stub.data, heading), "utf8");
  return stub;
}
255
+
256
+ // Main entry. Returns a summary for the orchestrator phase log.
257
+ //
258
+ // Shape:
259
+ // {
260
+ // outliers: number, // root leaves detected
261
+ // moved: number, // successfully contained
262
+ // operations: [{ from, to, slug }]
263
+ // }
264
+ //
265
+ // Contract:
266
+ // - Zero outliers → no mkdir, no writes, returns { outliers: 0 }.
267
+ // - Each outlier lands in its OWN subcategory (never a shared
268
+ // bucket).
269
+ // - Slug derivation is deterministic (`generateDeterministicSlug`
270
+ // + `resolveNestSlug`'s collision fallback).
271
+ // - parents[] rewrite handled per moved leaf.
272
/**
 * Move every root-level outlier leaf into its own new subcategory.
 *
 * For each outlier, in the order `collectRootLeaves` returns them:
 * derive a slug against the current sibling corpus, resolve collisions,
 * create the directory, move the leaf, rewrite its `parents[]`, and
 * write the stub index.
 *
 * @param {string} wikiRoot - Root directory of the wiki.
 * @returns {Promise<{outliers: number, moved: number,
 *   operations: Array<{from: string, to: string, slug: string}>}>}
 * @throws {Error} When the resolved target directory already exists.
 */
export async function runRootContainment(wikiRoot) {
  const rootLeaves = collectRootLeaves(wikiRoot);
  if (rootLeaves.length === 0) {
    return { outliers: 0, moved: 0, operations: [] };
  }

  // Built once; each resolved slug is added so later outliers cannot
  // reuse it.
  const wikiIndex = buildWikiForbiddenIndex(wikiRoot);

  // Sibling corpus is read once and then maintained in memory: a moved
  // outlier is dropped and its new stub index appended.
  let corpus = collectRootSiblings(wikiRoot, /* excludePath */ null);
  const operations = [];

  for (const outlier of rootLeaves) {
    const from = outlier.path;
    const isOther = (s) => s.path !== from;

    // The outlier itself is still in the corpus; exclude it before
    // ranking its tokens against its siblings.
    const others = corpus.filter(isOther);
    const precomputedIdf =
      others.length > 0 ? buildSiblingIdfContext(others) : undefined;
    const slug = generateDeterministicSlug([outlier], others, {
      precomputedIdf,
    });
    const resolvedSlug = resolveNestSlug(
      slug,
      { leaves: [outlier], parent_dir: wikiRoot },
      wikiRoot,
      { wikiIndex },
    );

    const targetDir = join(wikiRoot, resolvedSlug);
    if (existsSync(targetDir)) {
      // Defensive: fail loudly instead of writing into an existing dir.
      throw new Error(
        `root-containment: target ${targetDir} already exists for outlier ${basename(outlier.path)} — slug resolution leaked a collision`,
      );
    }

    mkdirSync(targetDir);
    const to = join(targetDir, basename(from));
    renameSync(from, to);
    rewriteParentsAfterContainment(to);
    const stubRecord = writeStubIndex(targetDir, resolvedSlug, outlier);
    wikiIndex.add(resolvedSlug);

    // Incremental corpus update for the next outlier.
    corpus = corpus.filter(isOther);
    corpus.push(stubRecord);
    operations.push({ from, to, slug: resolvedSlug });
  }

  return {
    outliers: rootLeaves.length,
    moved: operations.length,
    operations,
  };
}
@@ -3,12 +3,29 @@
3
3
  // (b,a) resolve to the same entry. Invalidated implicitly when either
4
4
  // entry's hash changes — the key simply doesn't match anymore.
5
5
  //
6
- // Cache entries are JSON files under `<wiki>/.llmwiki/similarity-cache/`.
7
- // One file per pair. The filename is derived from the sorted hashes
8
- // with sha256 collapsing to keep the name short and filesystem-safe.
6
+ // Cache entries are JSON files under
7
+ // `<wiki>/.llmwiki/similarity-cache/<shard>/<rest>.json`, where
8
+ // `<shard>` is the first `CACHE_SHARD_PREFIX_LEN` hex chars of the
9
+ // 32-char cache key and `<rest>` is the remaining
10
+ // `32 - CACHE_SHARD_PREFIX_LEN` chars. The default
11
+ // `CACHE_SHARD_PREFIX_LEN = 2` gives 16² = 256 shards, which keeps
12
+ // each shard dir's inode count ~cacheSize/256 — a 178k-pair
13
+ // corpus has ~700 entries per shard instead of 178k in a single
14
+ // flat directory.
15
+ // APFS/ext4/ZFS directory lookups degrade with entry count once
16
+ // the VFS dirent cache overflows (~10k on typical kernels), so
17
+ // sharding turns the per-lookup cost from O(log N)-with-large-N
18
+ // back into O(log N)-with-small-N. Same pattern as `.git/objects`.
19
+ //
9
20
  // The payload carries the tier, similarity, decision, and the tier
10
21
  // at which the decision was resolved — tests read it back to verify
11
22
  // caching prevented redundant work.
23
+ //
24
+ // Pre-sharding caches: the old flat layout (`<cacheDir>/<key>.json`)
25
+ // is NOT auto-migrated. This is a pure perf cache — if a user
26
+ // upgrades and the cache invalidates, the next build recomputes
27
+ // everything once and fills the sharded layout. No user data at
28
+ // stake; nothing to preserve.
12
29
 
13
30
  import { createHash } from "node:crypto";
14
31
  import {
@@ -26,8 +43,8 @@ export function cacheDir(wikiRoot) {
26
43
  return join(wikiRoot, ".llmwiki", "similarity-cache");
27
44
  }
28
45
 
29
- // Deterministic filename for a hash pair. Hash prefixes are sorted
30
- // so the lookup is symmetric regardless of argument order.
46
+ // Deterministic filename stem for a hash pair. Hash prefixes are
47
+ // sorted so the lookup is symmetric regardless of argument order.
31
48
  export function cacheKey(hashA, hashB) {
32
49
  if (!hashA || !hashB) {
33
50
  throw new Error("similarity-cache: both hashes must be non-empty strings");
@@ -42,8 +59,19 @@ export function cacheKey(hashA, hashB) {
42
59
  .slice(0, 32);
43
60
  }
44
61
 
62
+ // Number of hex chars taken from the start of the cache key as the
63
+ // shard directory name. 2 → 256 shards, which keeps every shard dir
64
+ // below 1k entries for workloads up to ~256k pairs. Changing this
65
+ // constant invalidates the cache layout for existing wikis, but
66
+ // since similarity-cache is purely an optimisation the next build
67
+ // simply rebuilds the populated shards.
68
+ export const CACHE_SHARD_PREFIX_LEN = 2;
69
+
45
70
  export function cacheEntryPath(wikiRoot, hashA, hashB) {
46
- return join(cacheDir(wikiRoot), cacheKey(hashA, hashB) + ".json");
71
+ const key = cacheKey(hashA, hashB);
72
+ const shard = key.slice(0, CACHE_SHARD_PREFIX_LEN);
73
+ const rest = key.slice(CACHE_SHARD_PREFIX_LEN);
74
+ return join(cacheDir(wikiRoot), shard, rest + ".json");
47
75
  }
48
76
 
49
77
  // Read a cached decision. Returns null on miss or malformed file.
@@ -72,13 +100,24 @@ export function readCached(wikiRoot, hashA, hashB) {
72
100
  }
73
101
 
74
102
  // Write a decision to the cache atomically (temp-file + rename).
103
+ // The shard subdirectory is created on demand, so callers don't
104
+ // need to pre-create it. `mkdirSync({ recursive: true })` is
105
+ // idempotent and cheap on hot shards (kernel caches the parent's
106
+ // dentry).
75
107
  export function writeCached(wikiRoot, hashA, hashB, decision) {
76
108
  if (!decision || typeof decision !== "object") {
77
109
  throw new Error("similarity-cache: decision must be an object");
78
110
  }
79
- const dir = cacheDir(wikiRoot);
80
- mkdirSync(dir, { recursive: true });
81
- const path = cacheEntryPath(wikiRoot, hashA, hashB);
111
+ // Compute the key once — previously we hashed twice (once via
112
+ // cacheEntryPath + once inline for the shard slice). The hot
113
+ // write path of a 596-leaf sweep calls this ~178k times, so
114
+ // halving the sha256 work is a small-but-cheap win.
115
+ const key = cacheKey(hashA, hashB);
116
+ const shard = key.slice(0, CACHE_SHARD_PREFIX_LEN);
117
+ const rest = key.slice(CACHE_SHARD_PREFIX_LEN);
118
+ const shardDir = join(cacheDir(wikiRoot), shard);
119
+ const path = join(shardDir, rest + ".json");
120
+ mkdirSync(shardDir, { recursive: true });
82
121
  const payload = JSON.stringify(
83
122
  {
84
123
  tier: decision.tier,
@@ -97,30 +136,86 @@ export function writeCached(wikiRoot, hashA, hashB, decision) {
97
136
 
98
137
  // Remove every cache file. Used by tests and by `startCorpus` via
99
138
  // the orchestrator when the corpus changes materially. Safe to call
100
- // when the cache dir doesn't exist.
139
+ // when the cache dir doesn't exist. Walks every shard directory
140
+ // plus the top-level (tolerates pre-sharding flat caches from
141
+ // before the layout change — they get cleared on first clear-call).
142
+ //
143
+ // `readdirSync({ withFileTypes: true })` returns `Dirent` entries
144
+ // that already carry `isDirectory()` / `isFile()` metadata, so we
145
+ // can branch on type without a per-entry `statSync` syscall. On a
146
+ // pre-sharding flat cache with 178k entries that's 178k syscalls
147
+ // saved on the first post-upgrade clear.
101
148
  export function clearCache(wikiRoot) {
102
149
  const dir = cacheDir(wikiRoot);
103
150
  if (!existsSync(dir)) return 0;
104
151
  let count = 0;
105
- for (const name of readdirSync(dir)) {
106
- if (!name.endsWith(".json")) continue;
107
- try {
108
- rmSync(join(dir, name), { force: true });
109
- count++;
110
- } catch {
111
- /* best-effort */
152
+ let entries;
153
+ try {
154
+ entries = readdirSync(dir, { withFileTypes: true });
155
+ } catch {
156
+ return 0;
157
+ }
158
+ for (const dirent of entries) {
159
+ const entryPath = join(dir, dirent.name);
160
+ if (dirent.isDirectory()) {
161
+ // Shard subdirectory — clear every .json beneath it, then
162
+ // the directory itself. Best-effort; a shard that's locked
163
+ // or mid-write doesn't abort the whole clear.
164
+ try {
165
+ for (const sub of readdirSync(entryPath, { withFileTypes: true })) {
166
+ if (!sub.isFile() || !sub.name.endsWith(".json")) continue;
167
+ try {
168
+ rmSync(join(entryPath, sub.name), { force: true });
169
+ count++;
170
+ } catch {
171
+ /* best-effort */
172
+ }
173
+ }
174
+ rmSync(entryPath, { force: true, recursive: true });
175
+ } catch {
176
+ /* best-effort */
177
+ }
178
+ } else if (dirent.isFile() && dirent.name.endsWith(".json")) {
179
+ // Pre-sharding flat entry — clear in place.
180
+ try {
181
+ rmSync(entryPath, { force: true });
182
+ count++;
183
+ } catch {
184
+ /* best-effort */
185
+ }
112
186
  }
113
187
  }
114
188
  return count;
115
189
  }
116
190
 
117
- // Count cached entries — convenience for tests and metrics.
191
+ // Count cached entries — convenience for tests and metrics. Walks
192
+ // every shard directory; also counts any pre-sharding flat entries
193
+ // if they exist (so a pre-upgrade cache still reports meaningful
194
+ // size until the user runs a build that regenerates). Uses
195
+ // `withFileTypes: true` for the same per-syscall saving as
196
+ // `clearCache`.
118
197
  export function cacheSize(wikiRoot) {
119
198
  const dir = cacheDir(wikiRoot);
120
199
  if (!existsSync(dir)) return 0;
121
200
  let n = 0;
122
- for (const name of readdirSync(dir)) {
123
- if (name.endsWith(".json")) n++;
201
+ let entries;
202
+ try {
203
+ entries = readdirSync(dir, { withFileTypes: true });
204
+ } catch {
205
+ return 0;
206
+ }
207
+ for (const dirent of entries) {
208
+ if (dirent.isDirectory()) {
209
+ try {
210
+ for (const sub of readdirSync(join(dir, dirent.name), { withFileTypes: true })) {
211
+ if (sub.isFile() && sub.name.endsWith(".json")) n++;
212
+ }
213
+ } catch {
214
+ /* best-effort */
215
+ }
216
+ } else if (dirent.isFile() && dirent.name.endsWith(".json")) {
217
+ n++; // pre-sharding flat entry
218
+ }
124
219
  }
125
220
  return n;
126
221
  }
@@ -92,6 +92,17 @@ export function computeIdf(tokenLists) {
92
92
  // every pair — the difference between O(N³) and O(N²) work.
93
93
  export function buildComparisonModel(entries) {
94
94
  const texts = entries.map((e) => entryText(e));
95
+ return buildComparisonModelFromTexts(texts);
96
+ }
97
+
98
+ // Build the same model shape directly from pre-assembled text
99
+ // strings. Useful when the caller has already done its own
100
+ // aggregation (e.g. `soft-dag.mjs` concatenates multiple leaves'
101
+ // `entryText` outputs into a single "category text") and passing
102
+ // the result back through `entryText` would either double-count
103
+ // the doubled-focus treatment or simply waste tokenisation work.
104
+ // No `entryText` roundtrip — the text goes straight to tokenisation.
105
+ export function buildComparisonModelFromTexts(texts) {
95
106
  const tokenLists = texts.map((t) => tokenize(t));
96
107
  const idfMap = computeIdf(tokenLists);
97
108
  return { texts, tokenLists, idfMap };