@ctxr/skill-llm-wiki 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75) hide show
  1. package/CHANGELOG.md +134 -0
  2. package/LICENSE +21 -0
  3. package/README.md +484 -0
  4. package/SKILL.md +252 -0
  5. package/guide/basics/concepts.md +74 -0
  6. package/guide/basics/index.md +45 -0
  7. package/guide/basics/schema.md +140 -0
  8. package/guide/cli.md +256 -0
  9. package/guide/correctness/index.md +45 -0
  10. package/guide/correctness/invariants.md +89 -0
  11. package/guide/correctness/safety.md +96 -0
  12. package/guide/history/diff.md +110 -0
  13. package/guide/history/hidden-git.md +130 -0
  14. package/guide/history/index.md +52 -0
  15. package/guide/history/remote-sync.md +113 -0
  16. package/guide/index.md +134 -0
  17. package/guide/isolation/coexistence.md +134 -0
  18. package/guide/isolation/index.md +44 -0
  19. package/guide/isolation/scale.md +251 -0
  20. package/guide/layout/in-place-mode.md +97 -0
  21. package/guide/layout/index.md +53 -0
  22. package/guide/layout/layout-contract.md +131 -0
  23. package/guide/layout/layout-modes.md +115 -0
  24. package/guide/operations/index.md +76 -0
  25. package/guide/operations/ingest/build.md +75 -0
  26. package/guide/operations/ingest/extend.md +61 -0
  27. package/guide/operations/ingest/index.md +54 -0
  28. package/guide/operations/ingest/join.md +65 -0
  29. package/guide/operations/maintain/fix.md +66 -0
  30. package/guide/operations/maintain/index.md +47 -0
  31. package/guide/operations/maintain/rebuild.md +86 -0
  32. package/guide/operations/validate.md +48 -0
  33. package/guide/substrate/index.md +47 -0
  34. package/guide/substrate/operators.md +96 -0
  35. package/guide/substrate/tiered-ai.md +363 -0
  36. package/guide/ux/index.md +44 -0
  37. package/guide/ux/preflight.md +150 -0
  38. package/guide/ux/user-intent.md +135 -0
  39. package/package.json +55 -0
  40. package/scripts/cli.mjs +893 -0
  41. package/scripts/commands/remote.mjs +93 -0
  42. package/scripts/commands/review.mjs +253 -0
  43. package/scripts/commands/sync.mjs +84 -0
  44. package/scripts/lib/chunk.mjs +421 -0
  45. package/scripts/lib/cluster-detect.mjs +516 -0
  46. package/scripts/lib/decision-log.mjs +343 -0
  47. package/scripts/lib/draft.mjs +158 -0
  48. package/scripts/lib/embeddings.mjs +366 -0
  49. package/scripts/lib/frontmatter.mjs +497 -0
  50. package/scripts/lib/git-commands.mjs +155 -0
  51. package/scripts/lib/git.mjs +486 -0
  52. package/scripts/lib/gitignore.mjs +62 -0
  53. package/scripts/lib/history.mjs +331 -0
  54. package/scripts/lib/indices.mjs +510 -0
  55. package/scripts/lib/ingest.mjs +258 -0
  56. package/scripts/lib/intent.mjs +713 -0
  57. package/scripts/lib/interactive.mjs +99 -0
  58. package/scripts/lib/migrate.mjs +126 -0
  59. package/scripts/lib/nest-applier.mjs +260 -0
  60. package/scripts/lib/operators.mjs +1365 -0
  61. package/scripts/lib/orchestrator.mjs +718 -0
  62. package/scripts/lib/paths.mjs +197 -0
  63. package/scripts/lib/preflight.mjs +213 -0
  64. package/scripts/lib/provenance.mjs +672 -0
  65. package/scripts/lib/quality-metric.mjs +269 -0
  66. package/scripts/lib/query-fixture.mjs +71 -0
  67. package/scripts/lib/rollback.mjs +95 -0
  68. package/scripts/lib/shape-check.mjs +172 -0
  69. package/scripts/lib/similarity-cache.mjs +126 -0
  70. package/scripts/lib/similarity.mjs +230 -0
  71. package/scripts/lib/snapshot.mjs +54 -0
  72. package/scripts/lib/source-frontmatter.mjs +85 -0
  73. package/scripts/lib/tier2-protocol.mjs +470 -0
  74. package/scripts/lib/tiered.mjs +453 -0
  75. package/scripts/lib/validate.mjs +362 -0
@@ -0,0 +1,516 @@
1
+ // cluster-detect.mjs — multi-signal cluster detection for NEST.
2
+ //
3
+ // Given a set of leaves at a single depth (one directory's worth
4
+ // of children), compute an affinity matrix using several signals,
5
+ // find candidate clusters as connected components under a
6
+ // threshold, and propose NEST applications. Every proposal is
7
+ // named by asking Tier 2 (cluster_name kind) — we never invent
8
+ // names from keyword shortcuts, because the whole point of Tier 2
9
+ // is to let the sub-agent exercise judgment at naming time.
10
+ //
11
+ // Signals used for the affinity matrix:
12
+ //
13
+ // 1. Tier 0 TF-IDF cosine on focus + covers + tags
14
+ // 2. Tier 1 embedding cosine on focus + covers + first ~1 KB of body
15
+ // 3. Authored tag overlap (Jaccard)
16
+ // 4. Authored activation.keyword_matches overlap (Jaccard)
17
+ //
18
+ // Each signal is normalized into [0, 1] and summed with
19
+ // configurable weights. The default weights are:
20
+ //
21
+ // tier0: 0.25
22
+ // tier1: 0.40
23
+ // tag_jaccard: 0.20
24
+ // act_jaccard: 0.15
25
+ //
26
+ // The affinity matrix is symmetric and zero-diagonal. Clustering
27
+ // finds connected components using a union-find over edges whose
28
+ // affinity meets or exceeds a threshold. We try every value in
29
+ // CANDIDATE_THRESHOLDS (0.10 through 0.46) and keep the partition
30
+ // with the best shape score (see partitionShapeScore); routing_cost
31
+ // from the quality metric stays the final gate downstream — so the
32
+ // threshold is not hand-tuned for any specific corpus.
33
+ //
34
+ // Justification for the threshold range:
35
+ //
36
+ // 0.30 is a weak floor: Tier 1 alone needs to be borderline-
37
+ // related for this to fire when the other signals are
38
+ // absent. Below this is noise territory.
39
+ //
40
+ // 0.38 is the midpoint of the Tier 1 decisive-same threshold
41
+ // (0.80) and the Tier 1 decisive-different threshold
42
+ // (0.45). Anything above this is more likely related than
43
+ // unrelated under pure embedding evidence.
44
+ //
45
+ // 0.46 mirrors the Tier 1 decisive-different threshold
46
+ // projected through the weighted combination — signals
47
+ // stronger than this level in combination are a strong
48
+ // cluster indicator.
49
+ //
50
+ // Picking the best threshold is a per-directory scoring loop over
51
+ // fixed candidates, not a per-corpus hand-set constant.
52
+ //
53
+ // Cluster size constraints:
54
+ //
55
+ // - Minimum cluster size: 3. Two-leaf clusters are better
56
+ // handled by MERGE or by the existing pairwise-merge path.
57
+ // - Maximum cluster size: 8. Larger clusters likely hide a
58
+ // multi-level structure that convergence will discover by
59
+ // iterating.
60
+ // - Single-cluster preference: if every leaf in the parent
61
+ // ends up in the SAME cluster, we reject the proposal — the
62
+ // parent is already a coherent category by itself.
63
+
64
+ import { readFileSync } from "node:fs";
65
+ import {
66
+ embed as tier1Embed,
67
+ embeddingCosine,
68
+ } from "./embeddings.mjs";
69
+ import {
70
+ buildComparisonModel,
71
+ cosine,
72
+ entryText,
73
+ tfidfVector,
74
+ tokenize,
75
+ } from "./similarity.mjs";
76
+ import { makeRequest } from "./tier2-protocol.mjs";
77
+
78
+ // Weights for the affinity fusion. Exported so tests can swap them.
79
+ export const DEFAULT_AFFINITY_WEIGHTS = Object.freeze({
80
+ tier0: 0.25,
81
+ tier1: 0.40,
82
+ tag_jaccard: 0.20,
83
+ act_jaccard: 0.15,
84
+ });
85
+
86
+ // Candidate thresholds the detector tries. A WIDE range is the
87
+ // point of the aggressive scan: sparse-signal corpora (hand-
88
+ // authored wikis with modest pairwise similarities) will never
89
+ // cross the 0.30+ band on tf-idf-dominated affinity, so if the
90
+ // skill only scanned the conservative range it would leave every
91
+ // natural grouping flat. We push the floor down to 0.10 — "any
92
+ // detectable overlap is a candidate" — and let Tier 2 + the
93
+ // quality-metric gate filter out the noise.
94
+ //
95
+ // Why 0.10 as the floor: it's well above pure-noise affinity
96
+ // (random 300-token corpora score in the 0.00–0.04 range on tf-
97
+ // idf cosine) but low enough that a single shared tag + modest
98
+ // embedding overlap will cross it. The math proposal still has
99
+ // to pass Tier 2's nest_decision gate AND improve the routing-
100
+ // cost metric before any NEST is applied, so a false positive
101
+ // at 0.10 is caught at two later gates.
102
+ //
103
+ // Thresholds remain corpus-independent; picking a different one
104
+ // per corpus would make the algorithm non-deterministic.
105
+ export const CANDIDATE_THRESHOLDS = Object.freeze([
106
+ 0.10, 0.15, 0.20, 0.25, 0.30, 0.38, 0.46,
107
+ ]);
108
+
109
+ // Minimum cluster size is split by proposal source:
110
+ //
111
+ // - Math-only candidates need at least 3 members. Pairwise
112
+ // similarity on two leaves is a noisy signal — a random TF-IDF
113
+ // collision between two terse frontmatters can score > 0.10 and
114
+ // would clutter the tree with false 2-member nests.
115
+ //
116
+ // - Tier 2-proposed clusters can have 2 members. Tier 2 is a
117
+ // structural judgment call by a language model that has read
118
+ // both frontmatters; it can defend a pair on conceptual grounds
119
+ // (e.g. "invariants + safety are the correctness substrate")
120
+ // even when the math-based similarity alone wouldn't be
121
+ // decisive. Pairing Tier 2's structural judgment with the
122
+ // relaxed metric gate (5% regression tolerance) is how the
123
+ // engine surfaces maximum nesting on hand-authored corpora with
124
+ // heterogeneous terse frontmatters.
125
+ //
126
+ // Consumers choose the right constant based on proposal source.
127
+ // `MIN_CLUSTER_SIZE` is kept as the math default for backwards
128
+ // compatibility; size-2 Tier 2 clusters flow through a separate
129
+ // path that uses `MIN_TIER2_CLUSTER_SIZE`.
130
+ export const MIN_CLUSTER_SIZE = 3;
131
+ export const MIN_MATH_CLUSTER_SIZE = 3;
132
+ export const MIN_TIER2_CLUSTER_SIZE = 2;
133
+ export const MAX_CLUSTER_SIZE = 8;
134
+
135
+ // Reject partitions where a single component swallows more than
136
+ // this fraction of the leaves. The "one giant blob" case is
137
+ // usually a noise floor hit and is structurally useless.
138
+ export const GIANT_BLOB_FRACTION = 0.75;
139
+
140
+ // Read the first ~1 KB of a leaf's body for the Tier 1 signal.
141
+ // We skip the frontmatter (between the first two `---` lines)
142
+ // and take a prefix of the remaining bytes. Short-body leaves
143
+ // return their whole body.
144
+ function readBodySample(leafPath, maxBytes = 1024) {
145
+ try {
146
+ const raw = readFileSync(leafPath, "utf8");
147
+ // Strip frontmatter fence if present.
148
+ if (raw.startsWith("---\n")) {
149
+ const end = raw.indexOf("\n---\n", 4);
150
+ if (end !== -1) {
151
+ const body = raw.slice(end + 5);
152
+ return body.slice(0, maxBytes);
153
+ }
154
+ }
155
+ return raw.slice(0, maxBytes);
156
+ } catch {
157
+ return "";
158
+ }
159
+ }
160
+
161
+ // Jaccard similarity between two sets (or arrays). Returns 0 for
162
+ // empty inputs so the contribution to the affinity is zero when
163
+ // the authored metadata is absent.
164
+ function jaccard(a, b) {
165
+ const sa = new Set(a || []);
166
+ const sb = new Set(b || []);
167
+ if (sa.size === 0 && sb.size === 0) return 0;
168
+ let inter = 0;
169
+ for (const x of sa) if (sb.has(x)) inter++;
170
+ const uni = sa.size + sb.size - inter;
171
+ if (uni === 0) return 0;
172
+ return inter / uni;
173
+ }
174
+
175
+ function activationKeywords(data) {
176
+ if (!data || typeof data !== "object") return [];
177
+ const act = data.activation;
178
+ if (act && Array.isArray(act.keyword_matches)) {
179
+ return act.keyword_matches.map((k) => String(k).toLowerCase());
180
+ }
181
+ return [];
182
+ }
183
+
184
+ // Build the affinity matrix. Each entry [i][j] is the weighted
185
+ // fusion of Tier 0, Tier 1, tag-Jaccard and activation-Jaccard
186
+ // for leaves i and j.
187
+ //
188
+ // Tier 1 embeddings are generated once per leaf and cached on
189
+ // disk via `embed()`, so calling this repeatedly on overlapping
190
+ // leaf sets is cheap after the first run.
191
+ export async function computeAffinityMatrix(wikiRoot, leaves, opts = {}) {
192
+ const { weights = DEFAULT_AFFINITY_WEIGHTS } = opts;
193
+ const n = leaves.length;
194
+ const matrix = Array.from({ length: n }, () => new Float64Array(n));
195
+
196
+ // Precompute Tier 0 once across the whole leaf set.
197
+ const corpus = leaves.map((l) => l.data);
198
+ const model = buildComparisonModel(corpus);
199
+
200
+ // Precompute Tier 1 vectors for each leaf. The "text" for the
201
+ // embedding is focus + covers + tags + first ~1 KB of body.
202
+ const tier1Texts = leaves.map((leaf) => {
203
+ const d = leaf.data;
204
+ const parts = [];
205
+ if (d.focus) parts.push(d.focus);
206
+ if (Array.isArray(d.covers)) parts.push(d.covers.join(" "));
207
+ if (Array.isArray(d.tags)) parts.push(d.tags.join(" "));
208
+ parts.push(readBodySample(leaf.path));
209
+ return parts.join("\n\n");
210
+ });
211
+ const tier1Vectors = await Promise.all(
212
+ tier1Texts.map((t) => tier1Embed(wikiRoot, t)),
213
+ );
214
+
215
+ // Precompute Tier 0 tf-idf vectors per leaf (against the
216
+ // precomputed model).
217
+ const tier0Vectors = leaves.map((leaf) => {
218
+ const text = entryText(leaf.data);
219
+ const tokens = tokenize(text);
220
+ return tfidfVector(tokens, model.idfMap);
221
+ });
222
+
223
+ for (let i = 0; i < n; i++) {
224
+ for (let j = i + 1; j < n; j++) {
225
+ const t0 = cosine(tier0Vectors[i], tier0Vectors[j]);
226
+ const t1 = embeddingCosine(tier1Vectors[i], tier1Vectors[j]);
227
+ const tagJ = jaccard(
228
+ leaves[i].data.tags || [],
229
+ leaves[j].data.tags || [],
230
+ );
231
+ const actJ = jaccard(
232
+ activationKeywords(leaves[i].data),
233
+ activationKeywords(leaves[j].data),
234
+ );
235
+ const affinity =
236
+ weights.tier0 * clamp01(t0) +
237
+ weights.tier1 * clamp01(t1) +
238
+ weights.tag_jaccard * tagJ +
239
+ weights.act_jaccard * actJ;
240
+ matrix[i][j] = affinity;
241
+ matrix[j][i] = affinity;
242
+ }
243
+ }
244
+ return matrix;
245
+ }
246
+
247
+ function clamp01(x) {
248
+ if (!Number.isFinite(x)) return 0;
249
+ if (x < 0) return 0;
250
+ if (x > 1) return 1;
251
+ return x;
252
+ }
253
+
254
+ // Find connected components under a threshold using union-find.
255
+ // Returns an array of components, each is an array of leaf
256
+ // indices.
257
+ export function findComponents(matrix, threshold) {
258
+ const n = matrix.length;
259
+ const parent = Array.from({ length: n }, (_, i) => i);
260
+ const find = (x) => {
261
+ while (parent[x] !== x) {
262
+ parent[x] = parent[parent[x]];
263
+ x = parent[x];
264
+ }
265
+ return x;
266
+ };
267
+ const union = (a, b) => {
268
+ const ra = find(a);
269
+ const rb = find(b);
270
+ if (ra !== rb) parent[ra] = rb;
271
+ };
272
+ for (let i = 0; i < n; i++) {
273
+ for (let j = i + 1; j < n; j++) {
274
+ if (matrix[i][j] >= threshold) union(i, j);
275
+ }
276
+ }
277
+ const groups = new Map();
278
+ for (let i = 0; i < n; i++) {
279
+ const r = find(i);
280
+ if (!groups.has(r)) groups.set(r, []);
281
+ groups.get(r).push(i);
282
+ }
283
+ return Array.from(groups.values());
284
+ }
285
+
286
+ // Given a partition and the affinity matrix, produce a shape
287
+ // score used to pick the BEST threshold for a directory. The
288
+ // convergence loop still uses the real routing_cost metric as
289
+ // the final gate; this score is only for comparing candidate
290
+ // thresholds against each other inside one directory.
291
+ //
292
+ // Scoring components:
293
+ //
294
+ // coverage — fraction of leaves that landed in an
295
+ // acceptable-size component (3–8). Higher
296
+ // is better.
297
+ // cluster_count — number of acceptable-size components.
298
+ // Favour partitions that surface MULTIPLE
299
+ // clusters over ones that find only one.
300
+ // cohesion — average intra-cluster affinity across all
301
+ // acceptable components. Higher is better.
302
+ // giant_penalty — partitions where any component holds more
303
+ // than GIANT_BLOB_FRACTION of the leaves
304
+ // score 0. That's the "everything lumped
305
+ // together" degenerate case.
306
+ //
307
+ // All four components are combined into a single scalar:
308
+ //
309
+ // score = coverage * (1 + 0.25 * (cluster_count - 1)) * (0.5 + 0.5 * cohesion)
310
+ //
311
+ // The + 0.25 * (cluster_count - 1) multiplier rewards partitions
312
+ // that find 2+ clusters — we WANT multiple clusters per pass when
313
+ // possible, since the alternative is doing nothing at this depth.
314
+ // The cohesion multiplier keeps the score sensitive to average
315
+ // intra-cluster strength even when coverage is equal across
316
+ // thresholds, so "two tight triangles at threshold 0.30" beats
317
+ // "two loose triangles at threshold 0.10" when coverage ties.
318
+ export function partitionShapeScore(partition, n, matrix = null) {
319
+ if (partition.length === 1 && partition[0].length === n) return 0;
320
+ const accepted = partition.filter(
321
+ (c) => c.length >= MIN_CLUSTER_SIZE && c.length <= MAX_CLUSTER_SIZE,
322
+ );
323
+ if (accepted.length === 0) return 0;
324
+ // Giant-blob rejection — any single component (even outside
325
+ // the accepted band) that holds more than 75% of leaves kills
326
+ // the score entirely.
327
+ for (const c of partition) {
328
+ if (c.length / n > GIANT_BLOB_FRACTION) return 0;
329
+ }
330
+ const totalInClusters = accepted.reduce((a, c) => a + c.length, 0);
331
+ const coverage = totalInClusters / Math.max(1, n);
332
+ const clusterMultiplier = 1 + 0.25 * (accepted.length - 1);
333
+ let cohesion = 0;
334
+ if (matrix) {
335
+ let pairSum = 0;
336
+ let pairCount = 0;
337
+ for (const component of accepted) {
338
+ for (let i = 0; i < component.length; i++) {
339
+ for (let j = i + 1; j < component.length; j++) {
340
+ pairSum += matrix[component[i]][component[j]];
341
+ pairCount++;
342
+ }
343
+ }
344
+ }
345
+ cohesion = pairCount > 0 ? pairSum / pairCount : 0;
346
+ }
347
+ const cohesionMultiplier = matrix ? (0.5 + 0.5 * cohesion) : 1;
348
+ return coverage * clusterMultiplier * cohesionMultiplier;
349
+ }
350
+
351
+ // Build a NEST proposal for a single component. Returns a proposal
352
+ // object carrying:
353
+ //
354
+ // {
355
+ // operator: "NEST",
356
+ // leaves: [<leaf>, <leaf>, ...] (the cluster members)
357
+ // naming_request: Tier 2 cluster_name request (to be queued)
358
+ // resolved_slug: optional, set when fixture/runtime provides
359
+ // average_affinity: confidence proxy
360
+ // }
361
+ //
362
+ // The caller is responsible for enqueueing the naming request via
363
+ // tier2-protocol and, when the answer is available, invoking the
364
+ // NEST applier.
365
+ export function buildNestProposal(componentLeaves, matrix, componentIndices) {
366
+ // Average pairwise affinity within the component.
367
+ let sum = 0;
368
+ let count = 0;
369
+ for (let i = 0; i < componentIndices.length; i++) {
370
+ for (let j = i + 1; j < componentIndices.length; j++) {
371
+ sum += matrix[componentIndices[i]][componentIndices[j]];
372
+ count++;
373
+ }
374
+ }
375
+ const avg = count > 0 ? sum / count : 0;
376
+
377
+ // Build the Tier 2 naming request.
378
+ const inputs = {
379
+ leaves: componentLeaves.map((leaf) => ({
380
+ id: leaf.data.id,
381
+ focus: leaf.data.focus || "",
382
+ covers: leaf.data.covers || [],
383
+ tags: leaf.data.tags || [],
384
+ })),
385
+ };
386
+ const request = makeRequest("cluster_name", {
387
+ prompt:
388
+ "These leaves are candidates for grouping into a single subcategory. " +
389
+ "Return a short kebab-case slug (one or two words, e.g., 'history' or " +
390
+ "'layout-modes') and a one-line purpose. The slug must be a valid " +
391
+ "directory name and should describe the CONCEPTUAL grouping. If the " +
392
+ "leaves are clearly unrelated, return decision 'reject' with a reason.",
393
+ inputs,
394
+ });
395
+
396
+ // Companion nest_decision request: "should these N leaves
397
+ // actually nest together, or stay flat?" The convergence loop
398
+ // uses this as a mandatory GO/NO-GO gate on math-proposed
399
+ // clusters — no math-only proposal is ever applied without a
400
+ // Tier 2 nest_decision returning "nest". Tier-2-proposed
401
+ // clusters (from propose_structure) skip this gate since
402
+ // Tier 2 already approved them structurally.
403
+ const gateRequest = makeRequest("nest_decision", {
404
+ prompt:
405
+ "Given these N sibling leaves, should they be grouped " +
406
+ "together under a new parent subcategory? Answer 'nest' " +
407
+ "if they share a defensible conceptual grouping that would " +
408
+ "meaningfully improve routing, 'keep_flat' if the overlap " +
409
+ "is incidental and nesting would add noise, or " +
410
+ "'undecidable' if you cannot tell from the frontmatter.",
411
+ inputs,
412
+ });
413
+
414
+ return {
415
+ operator: "NEST",
416
+ leaves: componentLeaves,
417
+ naming_request: request,
418
+ gate_request: gateRequest,
419
+ average_affinity: avg,
420
+ size: componentLeaves.length,
421
+ };
422
+ }
423
+
424
+ // Build a propose_structure request for a whole directory. Used
425
+ // by the convergence loop as the FIRST pass on every directory:
426
+ // Tier 2 gets first dibs on the optimal partition. The response
427
+ // carries an array of subcategories (slug + purpose + member
428
+ // ids) plus siblings that should stay at root level. See
429
+ // tier2-protocol.mjs::TIER2_DEFAULTS.propose_structure for the
430
+ // response schema.
431
+ export function buildProposeStructureRequest(relativeDir, leaves) {
432
+ const inputs = {
433
+ directory: relativeDir || ".",
434
+ leaves: leaves.map((leaf) => ({
435
+ id: leaf.data.id,
436
+ focus: leaf.data.focus || "",
437
+ covers: Array.isArray(leaf.data.covers) ? leaf.data.covers : [],
438
+ tags: Array.isArray(leaf.data.tags) ? leaf.data.tags : [],
439
+ activation_keywords: activationKeywords(leaf.data),
440
+ })),
441
+ };
442
+ return makeRequest("propose_structure", {
443
+ prompt:
444
+ "Given these N leaves in a directory, propose the optimal " +
445
+ "nested structure. Group related leaves into named " +
446
+ "subcategories (slug + purpose + member ids). Leaves that " +
447
+ "genuinely stand alone should be reported as siblings. " +
448
+ "Favour nesting over flatness whenever 2+ leaves share a " +
449
+ "defensible conceptual grouping — err on the side of " +
450
+ "nesting if in doubt. Return STRICT JSON matching the " +
451
+ "response_schema; do not include any commentary.",
452
+ inputs,
453
+ });
454
+ }
455
+
456
+ // Detect all NEST proposals for a single parent directory's
457
+ // leaves. Tries each candidate threshold (aggressive range), picks
458
+ // the best by shape score, and emits a proposal for each
459
+ // acceptable component. If NO threshold produces a usable
460
+ // partition, returns an array carrying a single `empty_partition:
461
+ // true` marker proposal so the caller can trigger a whole-
462
+ // directory Tier 2 `propose_structure` escalation.
463
+ //
464
+ // `opts.returnEmptyMarker = false` suppresses the empty-partition
465
+ // marker and returns `[]` instead — used by tests and the
466
+ // cluster_name unit tests that don't want the marker in their
467
+ // output.
468
+ export async function detectClusters(wikiRoot, leaves, opts = {}) {
469
+ const { returnEmptyMarker = true } = opts;
470
+ if (leaves.length < MIN_CLUSTER_SIZE) return [];
471
+ const matrix = await computeAffinityMatrix(wikiRoot, leaves, opts);
472
+ let bestPartition = null;
473
+ let bestScore = -1;
474
+ let bestThreshold = null;
475
+ for (const t of CANDIDATE_THRESHOLDS) {
476
+ const parts = findComponents(matrix, t);
477
+ const score = partitionShapeScore(parts, leaves.length, matrix);
478
+ if (score > bestScore) {
479
+ bestScore = score;
480
+ bestPartition = parts;
481
+ bestThreshold = t;
482
+ }
483
+ }
484
+ if (!bestPartition || bestScore <= 0) {
485
+ if (returnEmptyMarker) {
486
+ return [
487
+ {
488
+ operator: "NEST",
489
+ empty_partition: true,
490
+ leaves,
491
+ reason:
492
+ "aggressive threshold scan [" +
493
+ CANDIDATE_THRESHOLDS.join(", ") +
494
+ "] produced no acceptable partition — escalate to propose_structure",
495
+ },
496
+ ];
497
+ }
498
+ return [];
499
+ }
500
+ const proposals = [];
501
+ for (const component of bestPartition) {
502
+ if (component.length < MIN_CLUSTER_SIZE) continue;
503
+ if (component.length > MAX_CLUSTER_SIZE) continue;
504
+ // Reject single-cluster-everything case
505
+ if (component.length === leaves.length) continue;
506
+ const componentLeaves = component.map((i) => leaves[i]);
507
+ const proposal = buildNestProposal(componentLeaves, matrix, component);
508
+ proposal.threshold = bestThreshold;
509
+ proposal.source = "math";
510
+ proposals.push(proposal);
511
+ }
512
+ // Sort by average_affinity descending so the strongest proposal
513
+ // is applied first each iteration.
514
+ proposals.sort((a, b) => b.average_affinity - a.average_affinity);
515
+ return proposals;
516
+ }