@ctxr/skill-llm-wiki 1.0.2 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,10 +3,18 @@
3
3
  // Given a set of leaves at a single depth (one directory's worth
4
4
  // of children), compute an affinity matrix using several signals,
5
5
  // find candidate clusters as connected components under a
6
- // threshold, and propose NEST applications. Every proposal is
7
- // named by asking Tier 2 (cluster_name kind) — we never invent
8
- // names from keyword shortcuts, because the whole point of Tier 2
9
- // is to let the sub-agent exercise judgment at naming time.
6
+ // threshold, and propose NEST applications.
7
+ //
8
+ // Cluster naming depends on the active quality mode:
9
+ // - tiered-fast / claude-first: proposals are named by asking
10
+ // Tier 2 (the `cluster_name` request kind), because the point
11
+ // of Tier 2 is to let the sub-agent exercise judgment at
12
+ // naming time.
13
+ // - deterministic: naming is derived locally from member
14
+ // frontmatters via `generateDeterministicSlug` +
15
+ // `deterministicPurpose`, bypassing the `cluster_name` request
16
+ // entirely so the mode's "no LLM in the loop" contract holds
17
+ // end-to-end. See these helpers' doc comments for the algorithm.
10
18
  //
11
19
  // Signals used for the affinity matrix:
12
20
  //
@@ -68,6 +76,7 @@ import {
68
76
  } from "./embeddings.mjs";
69
77
  import {
70
78
  buildComparisonModel,
79
+ computeIdf,
71
80
  cosine,
72
81
  entryText,
73
82
  tfidfVector,
@@ -137,6 +146,45 @@ export const MAX_CLUSTER_SIZE = 8;
137
146
  // usually a noise floor hit and is structurally useless.
138
147
  export const GIANT_BLOB_FRACTION = 0.75;
139
148
 
149
+ // ── Coarse-partition pre-pass for flat large-diverse directories ──
150
+ //
151
+ // The HAC path above (`findComponents` + `partitionShapeScore`) is
152
+ // tuned for FINE-GRAINED sub-clustering inside already-bounded
153
+ // directories: it maximises the count of 3-8-size components at
154
+ // some candidate threshold. On a flat 600-leaf root that's the
155
+ // wrong optimisation — the best partition at any threshold is
156
+ // dominated by one giant component plus many singletons, and the
157
+ // handful of 3-8-size clusters that do emerge score poorly.
158
+ // Practical symptom: a 596-leaf hand-authored corpus observed in
159
+ // the field produced zero NEST proposals during convergence under
160
+ // `--quality-mode deterministic`, which left the balance phase to
161
+ // carve categories linearly and hit its 20-iter cap far short of
162
+ // convergence.
163
+ //
164
+ // The coarse-partition pre-pass uses deterministic K-means (farthest-
165
+ // first init + mean-member-similarity assignment) to force K top-
166
+ // level clusters when the directory's leaf count exceeds
167
+ // `COARSE_PARTITION_THRESHOLD`. K is chosen as
168
+ // `ceil(N / COARSE_TARGET_CLUSTER_SIZE)` so the average cluster
169
+ // lands around the `COARSE_TARGET_CLUSTER_SIZE` mark. Clusters
170
+ // smaller than `MIN_CLUSTER_SIZE` or larger than
171
+ // `MAX_COARSE_CLUSTER_SIZE` are rejected post-hoc — small ones
172
+ // aren't worth nesting (the `MIN_CLUSTER_SIZE` floor) and giant
173
+ // ones are usually noise-floor hits that would themselves need
174
+ // sub-clustering (the `MAX_COARSE_CLUSTER_SIZE` ceiling, 30, is
175
+ // ~4× the target so only egregiously-concentrated clusters get
176
+ // pruned — the rest pass through and balance enforcement can
177
+ // refine them in a second pass if `--fanout-target` is tight).
178
+ //
179
+ // Determinism: all ordering uses lex-first tie-breaking (first
180
+ // seed is always index 0, subsequent seeds via farthest-first,
181
+ // members iterate in leaf-array order). Two runs on the same
182
+ // corpus produce byte-identical cluster membership.
183
+ export const COARSE_PARTITION_THRESHOLD = 50; // detectClusters dispatches to the coarse path when leaves.length is strictly greater than this
184
+ export const COARSE_TARGET_CLUSTER_SIZE = 8; // divisor in K = ceil(N / COARSE_TARGET_CLUSTER_SIZE)
185
+ export const MAX_COARSE_CLUSTER_SIZE = 30; // coarse clusters with more members than this are dropped
186
+ export const COARSE_KMEANS_MAX_ITERS = 20; // upper bound on reassignment passes in detectCoarseClusters
187
+
140
188
  // Read the first ~1 KB of a leaf's body for the Tier 1 signal.
141
189
  // We skip the frontmatter (between the first two `---` lines)
142
190
  // and take a prefix of the remaining bytes. Short-body leaves
@@ -453,6 +501,161 @@ export function buildProposeStructureRequest(relativeDir, leaves) {
453
501
  });
454
502
  }
455
503
 
504
+ // Coarse-partition K-means for flat large-diverse directories.
505
+ // Called from `detectClusters` when `leaves.length` exceeds
506
+ // `COARSE_PARTITION_THRESHOLD`. The HAC path used for ≤-threshold
507
+ // directories can't produce usable 3-8-sized clusters on a flat
508
+ // 600-leaf root (see the constant block at the top of the file);
509
+ // this function forces K top-level clusters via deterministic
510
+ // K-means with farthest-first seed init.
511
+ //
512
+ // Algorithm:
513
+ //
514
+ // 1. Compute the same NxN affinity matrix `detectClusters` uses
515
+ // (Tier 0 + Tier 1 blend via `computeAffinityMatrix`), or reuse
516
+ // `opts.precomputedMatrix` when the caller supplies one. The
517
+ // HAC path never builds its own matrix when we dispatch here.
518
+ //
519
+ // 2. Pick K = ceil(N / COARSE_TARGET_CLUSTER_SIZE) seeds via
520
+ // farthest-first selection. First seed is leaves[0] (lex-first
521
+ // by the caller's ordering). Each subsequent seed maximises
522
+ // its minimum similarity-distance (1 - max(sim-to-existing))
523
+ // so seeds spread across the similarity space.
524
+ //
525
+ // 3. Iterate assignment: each leaf → cluster whose current
526
+ // members have the highest MEAN similarity to it. Using mean
527
+ // member similarity rather than vector-centroid distance lets
528
+ // us work with the existing `matrix` directly — no need to
529
+ // expose or recompute per-leaf vectors. Stops when assignments
530
+ // stop changing or the iteration cap fires.
531
+ //
532
+ // 4. Build proposals via `buildNestProposal`. Clusters smaller
533
+ // than `MIN_CLUSTER_SIZE` or larger than
534
+ // `MAX_COARSE_CLUSTER_SIZE` are rejected (small: not worth
535
+ // nesting; giant: noise-floor concentration, leave to a
536
+ // second pass or to balance enforcement).
537
+ //
538
+ // Returns an array of NEST proposals in
539
+ // `(average_affinity desc, member-path asc)` order. Returns `[]`
540
+ // if no cluster passed filters. `detectClusters` returns that empty
540
+ // array as-is (no HAC fallback); other callers may choose otherwise.
542
+ export async function detectCoarseClusters(wikiRoot, leaves, opts = {}) {
543
+ if (leaves.length < MIN_CLUSTER_SIZE) return [];
544
+ const matrix =
545
+ opts.precomputedMatrix ??
546
+ (await computeAffinityMatrix(wikiRoot, leaves, opts));
547
+ const N = leaves.length;
548
+ const K = Math.min(
549
+ Math.ceil(N / COARSE_TARGET_CLUSTER_SIZE),
550
+ // Guard: K cannot exceed N (degenerate) or produce clusters
551
+ // smaller than MIN on average. ceil(N / TARGET) hits both
552
+ // floors naturally, but pin the upper bound so a user tuning
553
+ // TARGET down to 1 doesn't blow up.
554
+ Math.floor(N / MIN_CLUSTER_SIZE),
555
+ );
556
+ if (K < 2) return []; // nothing meaningful to partition into
557
+
558
+ // Step 1: deterministic farthest-first seeds. First seed is the
559
+ // lex-first leaf (index 0). Each subsequent seed maximises its
560
+ // minimum similarity-distance (1 - max(sim-to-any-existing-seed))
561
+ // so seeds don't pile up in a dense region of the affinity graph.
562
+ // Ties broken by index-ascending, preserving determinism.
563
+ const seeds = [0];
564
+ while (seeds.length < K) {
565
+ let bestIdx = -1;
566
+ let bestMinDist = -1;
567
+ for (let i = 0; i < N; i++) {
568
+ if (seeds.includes(i)) continue;
569
+ let maxSimToSeed = -Infinity;
570
+ for (const s of seeds) {
571
+ if (matrix[i][s] > maxSimToSeed) maxSimToSeed = matrix[i][s];
572
+ }
573
+ const minDistToSeed = 1 - maxSimToSeed;
574
+ if (minDistToSeed > bestMinDist) {
575
+ bestMinDist = minDistToSeed;
576
+ bestIdx = i;
577
+ }
578
+ }
579
+ if (bestIdx === -1) break;
580
+ seeds.push(bestIdx);
581
+ }
582
+
583
+ // Step 2: initial assignment = nearest-seed (max similarity).
584
+ const assignments = new Array(N);
585
+ for (let i = 0; i < N; i++) {
586
+ let bestK = 0;
587
+ let bestSim = -Infinity;
588
+ for (let k = 0; k < seeds.length; k++) {
589
+ const sim = matrix[i][seeds[k]];
590
+ if (sim > bestSim) {
591
+ bestSim = sim;
592
+ bestK = k;
593
+ }
594
+ }
595
+ assignments[i] = bestK;
596
+ }
597
+
598
+ // Step 3: iterate. Each leaf re-assigns to the cluster whose
599
+ // current members have the highest mean similarity to it.
600
+ // Converges in a handful of iterations on most corpora; the
601
+ // COARSE_KMEANS_MAX_ITERS cap is defensive against pathological
602
+ // oscillation.
603
+ for (let iter = 0; iter < COARSE_KMEANS_MAX_ITERS; iter++) {
604
+ const members = Array.from({ length: seeds.length }, () => []);
605
+ for (let i = 0; i < N; i++) members[assignments[i]].push(i);
606
+ let changed = false;
607
+ for (let i = 0; i < N; i++) {
608
+ let bestK = assignments[i];
609
+ let bestMean = -Infinity;
610
+ for (let k = 0; k < seeds.length; k++) {
611
+ const mem = members[k];
612
+ if (mem.length === 0) continue;
613
+ let sum = 0;
614
+ for (const m of mem) sum += matrix[i][m];
615
+ const mean = sum / mem.length;
616
+ if (mean > bestMean) {
617
+ bestMean = mean;
618
+ bestK = k;
619
+ }
620
+ }
621
+ if (bestK !== assignments[i]) {
622
+ assignments[i] = bestK;
623
+ changed = true;
624
+ }
625
+ }
626
+ if (!changed) break;
627
+ }
628
+
629
+ // Step 4: build proposals from each non-trivial cluster.
630
+ const proposals = [];
631
+ for (let k = 0; k < seeds.length; k++) {
632
+ const componentIndices = [];
633
+ for (let i = 0; i < N; i++) {
634
+ if (assignments[i] === k) componentIndices.push(i);
635
+ }
636
+ if (componentIndices.length < MIN_CLUSTER_SIZE) continue;
637
+ if (componentIndices.length > MAX_COARSE_CLUSTER_SIZE) continue;
638
+ if (componentIndices.length === N) continue; // single-cluster-everything
639
+ const componentLeaves = componentIndices.map((i) => leaves[i]);
640
+ const proposal = buildNestProposal(componentLeaves, matrix, componentIndices);
641
+ proposal.threshold = null; // n/a for K-means; left null to signal coarse-mode
642
+ proposal.source = "math-coarse";
643
+ proposals.push(proposal);
644
+ }
645
+ // Deterministic sort: highest-affinity clusters first, ties
646
+ // broken by the lex-first member path so the on-disk apply
647
+ // order is stable across runs.
648
+ proposals.sort((a, b) => {
649
+ if (b.average_affinity !== a.average_affinity) {
650
+ return b.average_affinity - a.average_affinity;
651
+ }
652
+ const aKey = a.leaves?.[0]?.path ?? "";
653
+ const bKey = b.leaves?.[0]?.path ?? "";
654
+ return aKey.localeCompare(bKey);
655
+ });
656
+ return proposals;
657
+ }
658
+
456
659
  // Detect all NEST proposals for a single parent directory's
457
660
  // leaves. Tries each candidate threshold (aggressive range), picks
458
661
  // the best by shape score, and emits a proposal for each
@@ -465,9 +668,32 @@ export function buildProposeStructureRequest(relativeDir, leaves) {
465
668
  // marker and returns `[]` instead — used by tests and the
466
669
  // cluster_name unit tests that don't want the marker in their
467
670
  // output.
671
+ //
672
+ // Dispatch: for directories above `COARSE_PARTITION_THRESHOLD`
673
+ // leaves, skip the HAC path entirely and run the coarse K-means
674
+ // partitioner. The HAC path's shape-score optimiser is tuned for
675
+ // fine-grained sub-clustering (3-8-size components), which can't
676
+ // structure a flat large-diverse root — see the constant block.
677
+ // Coarse clusters returned in the same shape the HAC path would
678
+ // emit, so downstream (operators.mjs::tryClusterNestIteration,
679
+ // balance.mjs::runBalance) is untouched.
468
680
  export async function detectClusters(wikiRoot, leaves, opts = {}) {
469
681
  const { returnEmptyMarker = true } = opts;
470
682
  if (leaves.length < MIN_CLUSTER_SIZE) return [];
683
+
684
+ // Coarse-partition dispatch for flat large-diverse roots. This
685
+ // path doesn't honour `returnEmptyMarker` (no empty-partition
686
+ // marker is emitted) because Tier 2's propose_structure is the
687
+ // wrong tool for these inputs anyway — the LLM would be asked
688
+ // to partition 500+ leaves in one shot, which is both a huge
689
+ // token cost and typically produces worse structure than the
690
+ // deterministic K-means. If coarse produces zero valid clusters,
691
+ // return empty; the caller (balance / operators) handles zero-
692
+ // proposal days gracefully.
693
+ if (leaves.length > COARSE_PARTITION_THRESHOLD) {
694
+ return detectCoarseClusters(wikiRoot, leaves, opts);
695
+ }
696
+
471
697
  const matrix = await computeAffinityMatrix(wikiRoot, leaves, opts);
472
698
  let bestPartition = null;
473
699
  let bestScore = -1;
@@ -514,3 +740,255 @@ export async function detectClusters(wikiRoot, leaves, opts = {}) {
514
740
  proposals.sort((a, b) => b.average_affinity - a.average_affinity);
515
741
  return proposals;
516
742
  }
743
+
744
+ // Deterministic slug generator for the `deterministic` quality mode.
745
+ // Given a cluster's member leaves and optional corpus context (for
746
+ // IDF), returns a reproducible kebab-case slug derived from the
747
+ // members' frontmatter terms alone — no LLM, no network, no
748
+ // randomness. Repeated invocations on the same inputs always return
749
+ // the same slug; shuffling the member order never changes the output.
750
+ //
751
+ // Algorithm:
752
+ //
753
+ // 1. Build a TF-IDF vector over each member's `entryText` (focus +
754
+ // covers + tags + domains) using the supplied corpus context
755
+ // for IDF weighting. Without context, members form their own
756
+ // micro-corpus — less semantically interesting but still
757
+ // deterministic.
758
+ // 2. Sum the per-member vectors (weights stay dominated by terms
759
+ // that are rare in the corpus but common inside the cluster —
760
+ // exactly the "distinguishing" terms we want in the slug).
761
+ // 3. Rank terms by (weight desc, term asc). The lex tie-break is
762
+ // the ONLY source of determinism when two terms share a weight.
763
+ // 4. Walk the ranked list, taking the first 1–2 terms that are
764
+ // valid slug components (lowercase, ≥ 2 chars, start with a
765
+ // letter, pass the `SLUG_RE` check when joined).
766
+ // 5. If still no valid slug (terse frontmatters, every top term
767
+ // numeric/short), fall back to a 7-hex-char content hash of
768
+ // the sorted member ids — deterministic in its inputs, but NOT
769
+ // globally unique. Seven hex characters is ~28 bits of entropy
770
+ // from a truncated FNV-1a-32 output, so hash collisions are
771
+ // mathematically possible (~0.2% collision chance at 1000 distinct
772
+ // clusters per the birthday bound). That's fine at this layer:
773
+ // the caller passes every slug — hash-derived or term-derived —
774
+ // through `resolveNestSlug` next, which auto-suffixes any
775
+ // collision with an existing id / alias / directory basename
776
+ // into the `-group`/`-group-N` deterministic sequence. The hash
777
+ // fallback just needs to be reproducible from the same inputs,
778
+ // not collision-free across the whole corpus.
779
+ //
780
+ // The caller (operators.mjs::tryClusterNestIteration) passes the
781
+ // result through `resolveNestSlug` so collisions with existing ids
782
+ // auto-suffix deterministically.
783
+ //
784
+ // `opts.precomputedIdf` lets the caller share an IDF map across
785
+ // sibling clusters in the same directory — cuts the per-candidate
786
+ // cost from `O(|corpus|)` tokenization + IDF to `O(|cluster|)`
787
+ // tokenization alone. Semantically identical to a fresh derivation
788
+ // from the passed `corpusContext`; pass whichever you already have.
789
+ export function generateDeterministicSlug(
790
+ componentLeaves,
791
+ corpusContext,
792
+ opts = {},
793
+ ) {
794
+ // Sort members by a stable key BEFORE building text/token lists.
795
+ // Floating-point summation is order-sensitive, so an unsorted input
796
+ // could theoretically flip near-tie ordering under shuffled input.
797
+ // Sorting on leaf id (path fallback for tests that omit id) removes
798
+ // that entire class of ambiguity at trivial cost.
799
+ // First, normalise each member: accept a leaf wrapper `{ path, data }`
800
+ // or a plain frontmatter object (the shape corpusContext also
801
+ // tolerates below). Without this, a caller passing plain frontmatter
802
+ // would hit `entryText(undefined)` for every member, producing empty
803
+ // token lists and collapsing every such cluster onto the identical
804
+ // `cluster-<hash>` fallback — so multiple unrelated clusters could
805
+ // end up with the same slug. Symmetrising with the corpusContext
806
+ // path closes that footgun.
807
+ const normalisedMembers = componentLeaves.map((leaf) => ({
808
+ data: leaf?.data ?? leaf,
809
+ path: leaf?.path,
810
+ }));
811
+ const stableMembers = [...normalisedMembers].sort((a, b) => { // sorts a copy; normalisedMembers keeps caller order
812
+ const ka = a?.data?.id ?? a?.path ?? "";
813
+ const kb = b?.data?.id ?? b?.path ?? "";
814
+ return ka < kb ? -1 : ka > kb ? 1 : 0;
815
+ });
816
+ const tokenLists = stableMembers.map((leaf) => tokenize(entryText(leaf.data)));
817
+ // IDF context: precomputed > corpusContext > cluster itself.
818
+ const idfMap =
819
+ opts.precomputedIdf ??
820
+ (corpusContext && corpusContext.length > 0
821
+ ? computeIdf(
822
+ corpusContext.map((e) => tokenize(entryText(e.data ?? e))),
823
+ )
824
+ : computeIdf(tokenLists));
825
+ // Per-member tf-idf, then sum into a single cluster-wide vector.
826
+ // Stable member order + lex tie-break on the final ranking below
827
+ // means the output is byte-identical regardless of caller-side
828
+ // ordering.
829
+ const sum = new Map();
830
+ for (const tokens of tokenLists) {
831
+ const vec = tfidfVector(tokens, idfMap);
832
+ for (const [term, weight] of vec) {
833
+ sum.set(term, (sum.get(term) ?? 0) + weight);
834
+ }
835
+ }
836
+ // Rank: weight desc, term asc (lex tie-break → determinism).
837
+ const ranked = Array.from(sum.entries()).sort((a, b) => {
838
+ if (b[1] !== a[1]) return b[1] - a[1];
839
+ return a[0] < b[0] ? -1 : a[0] > b[0] ? 1 : 0;
840
+ });
841
+
842
+ const SLUG_RE = /^[a-z][a-z0-9-]{0,63}$/; // whole slug: leading letter, at most 64 chars in total
843
+ const VALID_TOKEN = /^[a-z][a-z0-9]*$/; // NOTE(review): also accepts 1-char terms; the documented "≥ 2 chars" minimum would have to come from tokenize — confirm
844
+ // Collect up to MAX_TOKENS_TO_CONSIDER ranked tokens, bounded to
845
+ // keep the O(n²) pair search below fast on corpora with many
846
+ // distinct terms. The pair search checks `C(n, 2) = n·(n−1)/2`
847
+ // combinations, so with n=16 that's at most 120 candidate slugs
848
+ // to test — trivial, and far more than practical frontmatters
849
+ // actually supply in their combined focus + covers + tags
850
+ // token bag.
851
+ const MAX_TOKENS_TO_CONSIDER = 16;
852
+ const takeable = [];
853
+ for (const [term] of ranked) {
854
+ if (!VALID_TOKEN.test(term)) continue;
855
+ takeable.push(term);
856
+ if (takeable.length >= MAX_TOKENS_TO_CONSIDER) break;
857
+ }
858
+ // Priority 1: highest-ranked TWO tokens that, when joined with "-",
859
+ // produce a valid SLUG_RE slug. The outer loop walks rank-first;
860
+ // the inner loop fills the second slot. Because both axes march
861
+ // top-to-bottom in ranked order, the first valid combo we find is
862
+ // the lexicographically-first (i, j) rank pair: it favours the top
863
+ // first term, not necessarily the highest combined weight.
864
+ //
865
+ // Bugfix vs. the v1 impl, which stopped after the top 2 ranked
866
+ // tokens and fell back to the hash whenever that specific combo
867
+ // overflowed SLUG_RE's 64-char cap. Walking further ranked terms
868
+ // surfaces a valid slug in every case where member frontmatters
869
+ // supply at least one kebab-compatible short pair, instead of
870
+ // producing an opaque `cluster-<hash>` when a valid slug was
871
+ // reachable just one rank away.
872
+ for (let i = 0; i < takeable.length; i++) {
873
+ for (let j = i + 1; j < takeable.length; j++) {
874
+ const candidate = `${takeable[i]}-${takeable[j]}`;
875
+ if (SLUG_RE.test(candidate)) return candidate;
876
+ }
877
+ }
878
+ // Priority 2: highest-ranked SINGLE token that passes SLUG_RE.
879
+ // Walks in ranked order, so the top-ranked valid token wins.
880
+ for (const term of takeable) {
881
+ if (SLUG_RE.test(term)) return term;
882
+ }
883
+ // Deterministic hash fallback — member ids sorted lex, hashed.
884
+ // Use the normalisedMembers we built earlier so plain-frontmatter
885
+ // callers get a stable hash too (their id lives at `.data.id` after
886
+ // normalisation, not `.id` directly).
887
+ const sortedIds = normalisedMembers
888
+ .map((leaf) => leaf?.data?.id ?? leaf?.path ?? "")
889
+ .filter(Boolean)
890
+ .sort();
891
+ const hash = hashString(sortedIds.join("|")).slice(0, 7); // first 7 of the 8 hex digits hashString returns
892
+ return `cluster-${hash}`;
893
+ }
894
+
895
+ // Build an IDF map over a sibling leaf set once, for reuse across
896
+ // multiple `generateDeterministicSlug` calls on clusters within the
897
+ // same parent directory. Every candidate cluster under a given parent
898
+ // shares the same corpus context, so computing IDF once per directory
899
+ // — rather than once per candidate — is strictly better for any
900
+ // directory with ≥ 2 candidate clusters. Drop the return value into
901
+ // `generateDeterministicSlug(.., .., { precomputedIdf: idfMap })`.
902
+ export function buildSiblingIdfContext(siblings) {
903
+ const tokenLists = siblings.map((leaf) =>
904
+ tokenize(entryText(leaf?.data ?? leaf)), // accepts `{ data }` leaf wrappers or plain frontmatter objects
905
+ );
906
+ return computeIdf(tokenLists); // one IDF map over the tokens of every sibling
907
+ }
908
+
909
+ // Maximum number of cover phrases to splice into a multi-member
910
+ // cluster's synthesised focus. Four is a soft cap that keeps the
911
+ // resulting string short enough to read at a glance while still
912
+ // telling the orchestrator the cluster is multi-topic.
913
+ const PURPOSE_MAX_COVERS = 4;
914
+
915
+ // Deterministic purpose for the NEST stub's `focus:` field.
916
+ //
917
+ // Tiered policy:
918
+ // - Single-member cluster: return the member's own `focus` directly
919
+ // — concise + accurate, e.g. for an X.11 root-containment outlier
920
+ // whose folder will hold exactly one leaf.
921
+ // - Multi-member cluster: aggregate the top-N cover phrases across
922
+ // members ranked by frequency desc, then lex asc, joined with
923
+ // "; ". This is the corrected behaviour: previously the
924
+ // algorithm returned just the lex-first highest-count cover,
925
+ // which on coarse k-means clusters of diverse content (where no
926
+ // cover appears in multiple members) collapsed to "the
927
+ // alphabetically-first cover of any member" — producing
928
+ // misleading single-leaf focus strings on multi-topic clusters
929
+ // (see X.11 wiki output where an 8-leaf ops/observability
930
+ // cluster's focus read "Action items without owner or deadline",
931
+ // which was just one detail of one member).
932
+ // - Multi-member cluster with no covers anywhere: fall back to the
933
+ // focus of the member whose id sorts first. Still deterministic,
934
+ // still driven by member content alone.
935
+ //
936
+ // Accepts either `{ path, data }` leaf wrappers or plain frontmatter
937
+ // objects. Input is normalised via `leaf?.data ?? leaf` at the top so
938
+ // this helper matches `generateDeterministicSlug` + `buildSiblingIdfContext`'s
939
+ // API shape — callers can pass whichever form they already have
940
+ // without getting silent empty results for the plain-object path.
941
+ export function deterministicPurpose(componentLeaves) {
942
+ const normalised = componentLeaves.map((leaf) => leaf?.data ?? leaf); // unwrap `{ data }` leaf wrappers; plain objects pass through
943
+ if (normalised.length === 1) {
944
+ return typeof normalised[0]?.focus === "string" ? normalised[0].focus : "";
945
+ }
946
+ const counts = new Map(); // trimmed cover phrase -> number of members that list it
947
+ for (const data of normalised) {
948
+ const covers = Array.isArray(data?.covers) ? data.covers : [];
949
+ const seenInLeaf = new Set();
950
+ for (const cover of covers) {
951
+ const key = typeof cover === "string" ? cover.trim() : "";
952
+ if (!key || seenInLeaf.has(key)) continue; // skip blanks, non-strings, and repeats within one member
953
+ seenInLeaf.add(key);
954
+ counts.set(key, (counts.get(key) ?? 0) + 1);
955
+ }
956
+ }
957
+ if (counts.size > 0) {
958
+ const ranked = Array.from(counts.entries()).sort((a, b) => {
959
+ if (b[1] !== a[1]) return b[1] - a[1];
960
+ return a[0] < b[0] ? -1 : a[0] > b[0] ? 1 : 0;
961
+ });
962
+ const top = ranked.slice(0, PURPOSE_MAX_COVERS).map(([cover]) => cover);
963
+ return top.length === 1 ? top[0] : top.join("; ");
964
+ }
965
+ const sorted = normalised
966
+ .map((data) => ({
967
+ id: data?.id ?? "",
968
+ // Mirror the single-member branch: only accept string focus
969
+ // values, normalise everything else to "". Without this, a
970
+ // hand-authored leaf with a non-string `focus:` (e.g. a YAML
971
+ // number `0` or `false`) would propagate through and make
972
+ // `deterministicPurpose()` return a non-string, breaking the
973
+ // documented contract that this helper always returns a
974
+ // string.
975
+ focus: typeof data?.focus === "string" ? data.focus : "",
976
+ }))
977
+ .filter((x) => x.id) // members without an id are ignored by this fallback
978
+ .sort((a, b) => (a.id < b.id ? -1 : a.id > b.id ? 1 : 0));
979
+ return sorted[0]?.focus ?? ""; // "" when no member carries an id
980
+ }
981
+
982
+ // Local helper — a simple, stable non-crypto hash. `createHash` would
983
+ // be fine but adds a node:crypto import and is overkill for a 7-char
984
+ // slug suffix. FNV-1a 32-bit is widely used, stable across Node
985
+ // versions, and deterministic on the same string input.
986
+ function hashString(str) {
987
+ let h = 0x811c9dc5 >>> 0; // FNV-1a 32-bit offset basis
988
+ for (let i = 0; i < str.length; i++) {
989
+ h ^= str.charCodeAt(i); // mixes in one UTF-16 code unit per step
990
+ h = Math.imul(h, 0x01000193) >>> 0; // multiply by the 32-bit FNV prime, keep the result unsigned
991
+ }
992
+ return h.toString(16).padStart(8, "0"); // always 8 lowercase hex digits
993
+ }
994
+
@@ -65,8 +65,29 @@ const FRONTMATTER_SCHEMA = {
65
65
  shared_covers: { kind: "string[]" },
66
66
  // Only present (and required) when type === "overlay".
67
67
  overlay_targets: { kind: "string[]" },
68
- links: { kind: "string[]" },
68
+ links: {
69
+ kind: "object[]",
70
+ description: "Cross-leaf references; each entry carries an `id` (and optional metadata) — see scripts/lib/join.mjs for how runtime code reads link.id.",
71
+ },
72
+ // Consumer-defined fields (e.g. skill-code-review's
73
+ // `dimensions`, `audit_surface`, `languages`, `tools`) are
74
+ // carried through rebuilds via the deny-list forwarding in
75
+ // draft.mjs; their VALUES are preserved (not dropped). Exact
76
+ // bytes can change because the renderer applies canonical
77
+ // top-level key ordering and YAML formatting. The contract
78
+ // here describes only the fields the wiki framework itself
79
+ // reads / writes; consumers ship their own schemas alongside.
69
80
  },
81
+ // Reserved fields that the rebuild ALWAYS re-derives from the
82
+ // target-tree position, regardless of what the author wrote.
83
+ // `parents` is NOT in this set — it's hand-authored when the
84
+ // soft-DAG layout requires it (the drafter picks the authored
85
+ // value over the heuristic).
86
+ reserved: ["id", "type", "depth_role", "source"],
87
+ // Deny-list semantics for everything else: any authored field not
88
+ // in `reserved` flows through verbatim. This keeps the wiki
89
+ // framework agnostic to consumer-specific schemas.
90
+ pass_through_authored: true,
70
91
  },
71
92
  index: {
72
93
  required: ["id", "type", "depth_role", "focus"],
@@ -135,6 +156,9 @@ const SUBCOMMANDS = {
135
156
  "--layout-mode",
136
157
  "--target",
137
158
  "--quality-mode",
159
+ "--fanout-target",
160
+ "--max-depth",
161
+ "--soft-dag-parents",
138
162
  "--no-prompt",
139
163
  "--accept-dirty",
140
164
  "--accept-foreign-target",
@@ -143,17 +167,42 @@ const SUBCOMMANDS = {
143
167
  },
144
168
  extend: {
145
169
  positionals: ["wiki"],
146
- flags: ["--quality-mode", "--no-prompt", "--json"],
170
+ flags: [
171
+ "--quality-mode",
172
+ "--no-prompt",
173
+ "--json",
174
+ ],
147
175
  },
148
176
  validate: { positionals: ["wiki"], flags: ["--json"] },
149
177
  rebuild: {
150
178
  positionals: ["wiki"],
151
- flags: ["--quality-mode", "--review", "--no-prompt", "--json"],
179
+ flags: [
180
+ "--quality-mode",
181
+ "--fanout-target",
182
+ "--max-depth",
183
+ "--soft-dag-parents",
184
+ "--review",
185
+ "--no-prompt",
186
+ "--json",
187
+ ],
152
188
  },
153
189
  fix: { positionals: ["wiki"], flags: ["--json"] },
154
190
  join: {
191
+ // Variadic positionals — the CLI accepts
192
+ // `join <wiki-a> <wiki-b> [<wiki-c>...]`. `positionals` lists
193
+ // the minimum shape; `min_positionals` / `variadic` describe
194
+ // the full contract so consumers generating invocations or
195
+ // validating argument counts don't assume exactly two sources.
155
196
  positionals: ["wiki-a", "wiki-b"],
156
- flags: ["--target", "--canonical", "--json"],
197
+ min_positionals: 2,
198
+ variadic: true,
199
+ flags: [
200
+ "--target",
201
+ "--canonical",
202
+ "--id-collision",
203
+ "--quality-mode",
204
+ "--json",
205
+ ],
157
206
  },
158
207
  rollback: { positionals: ["wiki"], flags: ["--to", "--json"] },
159
208
  init: {