@ctxr/skill-llm-wiki 1.0.2 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +118 -0
- package/README.md +2 -2
- package/guide/cli.md +3 -2
- package/guide/substrate/operators.md +1 -1
- package/guide/substrate/tiered-ai.md +6 -5
- package/guide/ux/user-intent.md +1 -1
- package/package.json +4 -2
- package/scripts/cli.mjs +92 -2
- package/scripts/lib/balance.mjs +579 -0
- package/scripts/lib/cluster-detect.mjs +482 -4
- package/scripts/lib/contract.mjs +31 -3
- package/scripts/lib/decision-log.mjs +121 -15
- package/scripts/lib/heal.mjs +5 -0
- package/scripts/lib/intent.mjs +370 -4
- package/scripts/lib/join-constants.mjs +22 -0
- package/scripts/lib/join.mjs +917 -0
- package/scripts/lib/nest-applier.mjs +395 -32
- package/scripts/lib/operators.mjs +472 -38
- package/scripts/lib/orchestrator.mjs +419 -12
- package/scripts/lib/root-containment.mjs +351 -0
- package/scripts/lib/similarity-cache.mjs +115 -20
- package/scripts/lib/similarity.mjs +11 -0
- package/scripts/lib/soft-dag.mjs +726 -0
- package/scripts/lib/tiered.mjs +42 -18
- package/scripts/lib/validate.mjs +22 -0
|
@@ -3,10 +3,18 @@
|
|
|
3
3
|
// Given a set of leaves at a single depth (one directory's worth
|
|
4
4
|
// of children), compute an affinity matrix using several signals,
|
|
5
5
|
// find candidate clusters as connected components under a
|
|
6
|
-
// threshold, and propose NEST applications.
|
|
7
|
-
//
|
|
8
|
-
//
|
|
9
|
-
//
|
|
6
|
+
// threshold, and propose NEST applications.
|
|
7
|
+
//
|
|
8
|
+
// Cluster naming depends on the active quality mode:
|
|
9
|
+
// - tiered-fast / claude-first: proposals are named by asking
|
|
10
|
+
// Tier 2 (the `cluster_name` request kind), because the point
|
|
11
|
+
// of Tier 2 is to let the sub-agent exercise judgment at
|
|
12
|
+
// naming time.
|
|
13
|
+
// - deterministic: naming is derived locally from member
|
|
14
|
+
// frontmatters via `generateDeterministicSlug` +
|
|
15
|
+
// `deterministicPurpose`, bypassing the `cluster_name` request
|
|
16
|
+
// entirely so the mode's "no LLM in the loop" contract holds
|
|
17
|
+
// end-to-end. See these helpers' doc comments for the algorithm.
|
|
10
18
|
//
|
|
11
19
|
// Signals used for the affinity matrix:
|
|
12
20
|
//
|
|
@@ -68,6 +76,7 @@ import {
|
|
|
68
76
|
} from "./embeddings.mjs";
|
|
69
77
|
import {
|
|
70
78
|
buildComparisonModel,
|
|
79
|
+
computeIdf,
|
|
71
80
|
cosine,
|
|
72
81
|
entryText,
|
|
73
82
|
tfidfVector,
|
|
@@ -137,6 +146,45 @@ export const MAX_CLUSTER_SIZE = 8;
|
|
|
137
146
|
// usually a noise floor hit and is structurally useless.
|
|
138
147
|
export const GIANT_BLOB_FRACTION = 0.75;
|
|
139
148
|
|
|
149
|
+
// ── Coarse-partition pre-pass for flat large-diverse directories ──
|
|
150
|
+
//
|
|
151
|
+
// The HAC path above (`findComponents` + `partitionShapeScore`) is
|
|
152
|
+
// tuned for FINE-GRAINED sub-clustering inside already-bounded
|
|
153
|
+
// directories: it maximises the count of 3-8-size components at
|
|
154
|
+
// some candidate threshold. On a flat 600-leaf root that's the
|
|
155
|
+
// wrong optimisation — the best partition at any threshold is
|
|
156
|
+
// dominated by one giant component plus many singletons, and the
|
|
157
|
+
// handful of 3-8-size clusters that do emerge score poorly.
|
|
158
|
+
// Practical symptom: a 596-leaf hand-authored corpus observed in
|
|
159
|
+
// the field produced zero NEST proposals during convergence under
|
|
160
|
+
// `--quality-mode deterministic`, which left the balance phase to
|
|
161
|
+
// carve categories linearly and hit its 20-iter cap far short of
|
|
162
|
+
// convergence.
|
|
163
|
+
//
|
|
164
|
+
// The coarse-partition pre-pass uses deterministic K-means (farthest-
|
|
165
|
+
// first init + mean-member-similarity assignment) to force K top-
|
|
166
|
+
// level clusters when the directory's leaf count exceeds
|
|
167
|
+
// `COARSE_PARTITION_THRESHOLD`. K is chosen as
|
|
168
|
+
// `ceil(N / COARSE_TARGET_CLUSTER_SIZE)` so the average cluster
|
|
169
|
+
// lands around the `COARSE_TARGET_CLUSTER_SIZE` mark. Clusters
|
|
170
|
+
// smaller than `MIN_CLUSTER_SIZE` or larger than
|
|
171
|
+
// `MAX_COARSE_CLUSTER_SIZE` are rejected post-hoc — small ones
|
|
172
|
+
// aren't worth nesting (the `MIN_CLUSTER_SIZE` floor) and giant
|
|
173
|
+
// ones are usually noise-floor hits that would themselves need
|
|
174
|
+
// sub-clustering (the `MAX_COARSE_CLUSTER_SIZE` ceiling, 30, is
|
|
175
|
+
// ~4× the target so only egregiously-concentrated clusters get
|
|
176
|
+
// pruned — the rest pass through and balance enforcement can
|
|
177
|
+
// refine them in a second pass if `--fanout-target` is tight).
|
|
178
|
+
//
|
|
179
|
+
// Determinism: all ordering uses lex-first tie-breaking (first
|
|
180
|
+
// seed is always index 0, subsequent seeds via farthest-first,
|
|
181
|
+
// members iterate in leaf-array order). Two runs on the same
|
|
182
|
+
// corpus produce byte-identical cluster membership.
|
|
183
|
+
export const COARSE_PARTITION_THRESHOLD = 50;
|
|
184
|
+
export const COARSE_TARGET_CLUSTER_SIZE = 8;
|
|
185
|
+
export const MAX_COARSE_CLUSTER_SIZE = 30;
|
|
186
|
+
export const COARSE_KMEANS_MAX_ITERS = 20;
|
|
187
|
+
|
|
140
188
|
// Read the first ~1 KB of a leaf's body for the Tier 1 signal.
|
|
141
189
|
// We skip the frontmatter (between the first two `---` lines)
|
|
142
190
|
// and take a prefix of the remaining bytes. Short-body leaves
|
|
@@ -453,6 +501,161 @@ export function buildProposeStructureRequest(relativeDir, leaves) {
|
|
|
453
501
|
});
|
|
454
502
|
}
|
|
455
503
|
|
|
504
|
+
// Coarse-partition K-means for flat large-diverse directories.
|
|
505
|
+
// Called from `detectClusters` when `leaves.length` exceeds
|
|
506
|
+
// `COARSE_PARTITION_THRESHOLD`. The HAC path used for ≤-threshold
|
|
507
|
+
// directories can't produce usable 3-8-sized clusters on a flat
|
|
508
|
+
// 600-leaf root (see the constant block at the top of the file);
|
|
509
|
+
// this function forces K top-level clusters via deterministic
|
|
510
|
+
// K-means with farthest-first seed init.
|
|
511
|
+
//
|
|
512
|
+
// Algorithm:
|
|
513
|
+
//
|
|
514
|
+
// 1. Compute the same NxN affinity matrix `detectClusters` uses
|
|
515
|
+
// (Tier 0 + Tier 1 blend via `computeAffinityMatrix`). Reused
|
|
516
|
+
// downstream — we do NOT recompute it in the HAC path when
|
|
517
|
+
// we dispatch here.
|
|
518
|
+
//
|
|
519
|
+
// 2. Pick K = min(ceil(N / COARSE_TARGET_CLUSTER_SIZE), floor(N / MIN_CLUSTER_SIZE)) seeds via
|
|
520
|
+
// farthest-first selection. First seed is leaves[0] (lex-first
|
|
521
|
+
// by the caller's ordering). Each subsequent seed maximises
|
|
522
|
+
// its minimum similarity-distance (1 - max(sim-to-existing))
|
|
523
|
+
// so seeds spread across the similarity space.
|
|
524
|
+
//
|
|
525
|
+
// 3. Iterate assignment: each leaf → cluster whose current
|
|
526
|
+
// members have the highest MEAN similarity to it. Using mean
|
|
527
|
+
// member similarity rather than vector-centroid distance lets
|
|
528
|
+
// us work with the existing `matrix` directly — no need to
|
|
529
|
+
// expose or recompute per-leaf vectors. Stops when assignments
|
|
530
|
+
// stop changing or the iteration cap fires.
|
|
531
|
+
//
|
|
532
|
+
// 4. Build proposals via `buildNestProposal`. Clusters smaller
|
|
533
|
+
// than `MIN_CLUSTER_SIZE` or larger than
|
|
534
|
+
// `MAX_COARSE_CLUSTER_SIZE` are rejected (small: not worth
|
|
535
|
+
// nesting; giant: noise-floor concentration, leave to a
|
|
536
|
+
// second pass or to balance enforcement).
|
|
537
|
+
//
|
|
538
|
+
// Returns an array of NEST proposals in
|
|
539
|
+
// `(average_affinity desc, member-path asc)` order. Returns `[]`
|
|
540
|
+
// if no cluster passed filters — the caller decides whether to
|
|
541
|
+
// fall back to HAC or escalate.
|
|
542
|
+
// Coarse K-means partitioner over a pairwise affinity matrix.
//
// Parameters:
//   wikiRoot - forwarded to `computeAffinityMatrix` when no matrix is
//              supplied; not otherwise read here.
//   leaves   - array of leaf records; index i of `leaves` corresponds
//              to row/column i of the matrix.
//   opts     - forwarded to `computeAffinityMatrix`. If
//              `opts.precomputedMatrix` is non-nullish it is used as
//              the NxN matrix and nothing is recomputed.
//
// Returns (via promise) an array of proposals built by
// `buildNestProposal`, each stamped with `threshold = null` and
// `source = "math-coarse"`, ordered by `average_affinity` descending
// with ties broken by the first member's `path` in plain code-unit
// (`<` / `>`) order. Returns `[]` when there are fewer than
// `MIN_CLUSTER_SIZE` leaves, when fewer than two clusters would be
// formed, or when no cluster survives the size filters.
export async function detectCoarseClusters(wikiRoot, leaves, opts = {}) {
  if (leaves.length < MIN_CLUSTER_SIZE) return [];
  const matrix =
    opts.precomputedMatrix ??
    (await computeAffinityMatrix(wikiRoot, leaves, opts));
  const N = leaves.length;
  // K targets an average cluster of COARSE_TARGET_CLUSTER_SIZE leaves.
  // The floor(N / MIN_CLUSTER_SIZE) cap keeps the average cluster at
  // or above MIN_CLUSTER_SIZE even if the target size is tuned very low.
  const K = Math.min(
    Math.ceil(N / COARSE_TARGET_CLUSTER_SIZE),
    Math.floor(N / MIN_CLUSTER_SIZE),
  );
  if (K < 2) return []; // nothing meaningful to partition into

  // Step 1: deterministic farthest-first seeds. The first seed is
  // index 0; each later seed is the non-seed leaf with the largest
  // (1 - max similarity to any existing seed). The strict `>` below
  // makes the lowest index win ties.
  //
  // `maxSimToSeed[i]` is a running maximum: every round folds in only
  // the seed added in the previous round. A leaf that is still a
  // non-seed has been visited in every earlier round, so its entry
  // always covers all current seeds. This selects exactly the seeds a
  // full rescan would, in O(N·K) rather than O(N·K²).
  const seeds = [0];
  const isSeed = new Array(N).fill(false);
  isSeed[0] = true;
  const maxSimToSeed = new Array(N).fill(-Infinity);
  let newestSeed = 0;
  while (seeds.length < K) {
    let bestIdx = -1;
    let bestMinDist = -1;
    for (let i = 0; i < N; i++) {
      if (isSeed[i]) continue;
      const simToNewest = matrix[i][newestSeed];
      if (simToNewest > maxSimToSeed[i]) maxSimToSeed[i] = simToNewest;
      const minDistToSeed = 1 - maxSimToSeed[i];
      if (minDistToSeed > bestMinDist) {
        bestMinDist = minDistToSeed;
        bestIdx = i;
      }
    }
    if (bestIdx === -1) break; // no candidate left (or none beat the floor)
    seeds.push(bestIdx);
    isSeed[bestIdx] = true;
    newestSeed = bestIdx;
  }

  // Step 2: initial assignment = the seed with the highest similarity
  // (lowest seed position wins ties).
  const assignments = new Array(N);
  for (let i = 0; i < N; i++) {
    let bestK = 0;
    let bestSim = -Infinity;
    for (let k = 0; k < seeds.length; k++) {
      const sim = matrix[i][seeds[k]];
      if (sim > bestSim) {
        bestSim = sim;
        bestK = k;
      }
    }
    assignments[i] = bestK;
  }

  // Step 3: iterate. Membership is snapshotted at the start of each
  // pass, then every leaf moves to the cluster whose snapshotted
  // members have the highest mean similarity to it. Stops when a pass
  // changes nothing or after COARSE_KMEANS_MAX_ITERS passes.
  // NOTE(review): the mean for a leaf's own cluster includes
  // matrix[i][i]; whether that biases leaves toward staying put
  // depends on what the matrix stores on its diagonal — confirm.
  for (let iter = 0; iter < COARSE_KMEANS_MAX_ITERS; iter++) {
    const members = Array.from({ length: seeds.length }, () => []);
    for (let i = 0; i < N; i++) members[assignments[i]].push(i);
    let changed = false;
    for (let i = 0; i < N; i++) {
      let bestK = assignments[i];
      let bestMean = -Infinity;
      for (let k = 0; k < seeds.length; k++) {
        const mem = members[k];
        if (mem.length === 0) continue; // emptied clusters cannot attract leaves
        let sum = 0;
        for (const m of mem) sum += matrix[i][m];
        const mean = sum / mem.length;
        if (mean > bestMean) {
          bestMean = mean;
          bestK = k;
        }
      }
      if (bestK !== assignments[i]) {
        assignments[i] = bestK;
        changed = true;
      }
    }
    if (!changed) break;
  }

  // Step 4: group indices per cluster once (ascending index order),
  // then build a proposal for every cluster that passes the filters.
  const clusters = Array.from({ length: seeds.length }, () => []);
  for (let i = 0; i < N; i++) clusters[assignments[i]].push(i);

  const proposals = [];
  for (const componentIndices of clusters) {
    if (componentIndices.length < MIN_CLUSTER_SIZE) continue;
    if (componentIndices.length > MAX_COARSE_CLUSTER_SIZE) continue;
    if (componentIndices.length === N) continue; // single-cluster-everything
    const componentLeaves = componentIndices.map((i) => leaves[i]);
    const proposal = buildNestProposal(componentLeaves, matrix, componentIndices);
    proposal.threshold = null; // n/a for K-means; left null to signal coarse-mode
    proposal.source = "math-coarse";
    proposals.push(proposal);
  }

  // Deterministic sort: highest-affinity clusters first. Ties are
  // broken by the first member's path using plain code-unit
  // comparison. `localeCompare` is avoided on purpose: its result
  // depends on the host's default locale, so the order could differ
  // between machines.
  proposals.sort((a, b) => {
    if (b.average_affinity !== a.average_affinity) {
      return b.average_affinity - a.average_affinity;
    }
    const aKey = a.leaves?.[0]?.path ?? "";
    const bKey = b.leaves?.[0]?.path ?? "";
    return aKey < bKey ? -1 : aKey > bKey ? 1 : 0;
  });
  return proposals;
}
|
|
658
|
+
|
|
456
659
|
// Detect all NEST proposals for a single parent directory's
|
|
457
660
|
// leaves. Tries each candidate threshold (aggressive range), picks
|
|
458
661
|
// the best by shape score, and emits a proposal for each
|
|
@@ -465,9 +668,32 @@ export function buildProposeStructureRequest(relativeDir, leaves) {
|
|
|
465
668
|
// marker and returns `[]` instead — used by tests and the
|
|
466
669
|
// cluster_name unit tests that don't want the marker in their
|
|
467
670
|
// output.
|
|
671
|
+
//
|
|
672
|
+
// Dispatch: for directories above `COARSE_PARTITION_THRESHOLD`
|
|
673
|
+
// leaves, skip the HAC path entirely and run the coarse K-means
|
|
674
|
+
// partitioner. The HAC path's shape-score optimiser is tuned for
|
|
675
|
+
// fine-grained sub-clustering (3-8-size components), which can't
|
|
676
|
+
// structure a flat large-diverse root — see the constant block.
|
|
677
|
+
// Coarse clusters are returned in the same shape the HAC path would
|
|
678
|
+
// emit, so downstream (operators.mjs::tryClusterNestIteration,
|
|
679
|
+
// balance.mjs::runBalance) is untouched.
|
|
468
680
|
export async function detectClusters(wikiRoot, leaves, opts = {}) {
|
|
469
681
|
const { returnEmptyMarker = true } = opts;
|
|
470
682
|
if (leaves.length < MIN_CLUSTER_SIZE) return [];
|
|
683
|
+
|
|
684
|
+
// Coarse-partition dispatch for flat large-diverse roots. This
|
|
685
|
+
// path doesn't honour `returnEmptyMarker` (no empty-partition
|
|
686
|
+
// marker is emitted) because Tier 2's propose_structure is the
|
|
687
|
+
// wrong tool for these inputs anyway — the LLM would be asked
|
|
688
|
+
// to partition 500+ leaves in one shot, which is both a huge
|
|
689
|
+
// token cost and typically produces worse structure than the
|
|
690
|
+
// deterministic K-means. If coarse produces zero valid clusters,
|
|
691
|
+
// return empty; the caller (balance / operators) handles zero-
|
|
692
|
+
// proposal days gracefully.
|
|
693
|
+
if (leaves.length > COARSE_PARTITION_THRESHOLD) {
|
|
694
|
+
return detectCoarseClusters(wikiRoot, leaves, opts);
|
|
695
|
+
}
|
|
696
|
+
|
|
471
697
|
const matrix = await computeAffinityMatrix(wikiRoot, leaves, opts);
|
|
472
698
|
let bestPartition = null;
|
|
473
699
|
let bestScore = -1;
|
|
@@ -514,3 +740,255 @@ export async function detectClusters(wikiRoot, leaves, opts = {}) {
|
|
|
514
740
|
proposals.sort((a, b) => b.average_affinity - a.average_affinity);
|
|
515
741
|
return proposals;
|
|
516
742
|
}
|
|
743
|
+
|
|
744
|
+
// Deterministic slug generator for the `deterministic` quality mode.
|
|
745
|
+
// Given a cluster's member leaves and optional corpus context (for
|
|
746
|
+
// IDF), returns a reproducible kebab-case slug derived from the
|
|
747
|
+
// members' frontmatter terms alone — no LLM, no network, no
|
|
748
|
+
// randomness. Repeated invocations on the same inputs always return
|
|
749
|
+
// the same slug; shuffling the member order never changes the output.
|
|
750
|
+
//
|
|
751
|
+
// Algorithm:
|
|
752
|
+
//
|
|
753
|
+
// 1. Build a TF-IDF vector over each member's `entryText` (focus +
|
|
754
|
+
// covers + tags + domains) using the supplied corpus context
|
|
755
|
+
// for IDF weighting. Without context, members form their own
|
|
756
|
+
// micro-corpus — less semantically interesting but still
|
|
757
|
+
// deterministic.
|
|
758
|
+
// 2. Sum the per-member vectors (weights stay dominated by terms
|
|
759
|
+
// that are rare in the corpus but common inside the cluster —
|
|
760
|
+
// exactly the "distinguishing" terms we want in the slug).
|
|
761
|
+
// 3. Rank terms by (weight desc, term asc). The lex tie-break is
|
|
762
|
+
// the ONLY source of determinism when two terms share a weight.
|
|
763
|
+
// 4. Walk the ranked list, taking the first 1–2 terms that are
|
|
764
|
+
// valid slug components (lowercase, ≥ 2 chars, start with a
|
|
765
|
+
// letter, pass the `SLUG_RE` check when joined).
|
|
766
|
+
// 5. If still no valid slug (terse frontmatters, every top term
|
|
767
|
+
// numeric/short), fall back to a 7-hex-char content hash of
|
|
768
|
+
// the sorted member ids — deterministic in its inputs, but NOT
|
|
769
|
+
// globally unique. Seven hex characters is ~28 bits of entropy
|
|
770
|
+
// from a truncated FNV-1a-32 output, so hash collisions are
|
|
771
|
+
// mathematically possible (~0.2% collision probability at 1000 distinct
|
|
772
|
+
// clusters per the birthday bound). That's fine at this layer:
|
|
773
|
+
// the caller passes every slug — hash-derived or term-derived —
|
|
774
|
+
// through `resolveNestSlug` next, which auto-suffixes any
|
|
775
|
+
// collision with an existing id / alias / directory basename
|
|
776
|
+
// into the `-group`/`-group-N` deterministic sequence. The hash
|
|
777
|
+
// fallback just needs to be reproducible from the same inputs,
|
|
778
|
+
// not collision-free across the whole corpus.
|
|
779
|
+
//
|
|
780
|
+
// The caller (operators.mjs::tryClusterNestIteration) passes the
|
|
781
|
+
// result through `resolveNestSlug` so collisions with existing ids
|
|
782
|
+
// auto-suffix deterministically.
|
|
783
|
+
//
|
|
784
|
+
// `opts.precomputedIdf` lets the caller share an IDF map across
|
|
785
|
+
// sibling clusters in the same directory — cuts the per-candidate
|
|
786
|
+
// cost from `O(|corpus|)` tokenization + IDF to `O(|cluster|)`
|
|
787
|
+
// tokenization alone. Semantically identical to a fresh derivation
|
|
788
|
+
// from the passed `corpusContext`; pass whichever you already have.
|
|
789
|
+
export function generateDeterministicSlug(
  componentLeaves,
  corpusContext,
  opts = {},
) {
  // Plain code-unit ordering; used for members, ranked terms and ids.
  const ascending = (x, y) => (x < y ? -1 : x > y ? 1 : 0);
  // Stable identity of a normalised member: id first, path otherwise.
  const stableKeyOf = (member) => member?.data?.id ?? member?.path ?? "";

  // Accept both `{ path, data }` wrappers and bare frontmatter objects
  // by lifting each member into a uniform `{ data, path }` record.
  const normalisedMembers = componentLeaves.map((leaf) => {
    const data = leaf?.data ?? leaf;
    return { data, path: leaf?.path };
  });

  // Tokenise members in a caller-independent order so the weight
  // summation below always adds terms in the same sequence.
  const stableMembers = normalisedMembers
    .slice()
    .sort((left, right) => ascending(stableKeyOf(left), stableKeyOf(right)));
  const tokenLists = stableMembers.map((member) =>
    tokenize(entryText(member.data)),
  );

  // IDF source, in priority order: a caller-supplied map, then the
  // corpus context, then the cluster's own token lists.
  let idfMap = opts.precomputedIdf;
  if (idfMap === undefined || idfMap === null) {
    const hasCorpus = Boolean(corpusContext) && corpusContext.length > 0;
    idfMap = hasCorpus
      ? computeIdf(
          corpusContext.map((entry) => tokenize(entryText(entry.data ?? entry))),
        )
      : computeIdf(tokenLists);
  }

  // Accumulate every member's tf-idf weights into one cluster vector.
  const clusterWeights = new Map();
  for (const tokens of tokenLists) {
    for (const [term, weight] of tfidfVector(tokens, idfMap)) {
      clusterWeights.set(term, (clusterWeights.get(term) ?? 0) + weight);
    }
  }

  // Heaviest terms first; equal weights fall back to term order.
  const ranked = [...clusterWeights.entries()].sort(
    ([termA, weightA], [termB, weightB]) =>
      weightB !== weightA ? weightB - weightA : ascending(termA, termB),
  );

  const SLUG_RE = /^[a-z][a-z0-9-]{0,63}$/;
  const VALID_TOKEN = /^[a-z][a-z0-9]*$/;
  // Only the first MAX_TOKENS_TO_CONSIDER usable terms take part in
  // the pair search, which bounds it at C(16, 2) = 120 candidates.
  const MAX_TOKENS_TO_CONSIDER = 16;
  const takeable = ranked
    .map(([term]) => term)
    .filter((term) => VALID_TOKEN.test(term))
    .slice(0, MAX_TOKENS_TO_CONSIDER);

  // Priority 1: a two-term slug. Pairs are tried with the higher-ranked
  // term first (outer index), then the partner in rank order (inner
  // index); the first pair that satisfies SLUG_RE is returned.
  for (let first = 0; first < takeable.length; first += 1) {
    for (let second = first + 1; second < takeable.length; second += 1) {
      const pairSlug = `${takeable[first]}-${takeable[second]}`;
      if (SLUG_RE.test(pairSlug)) return pairSlug;
    }
  }

  // Priority 2: the best-ranked single term that satisfies SLUG_RE.
  const singleSlug = takeable.find((term) => SLUG_RE.test(term));
  if (singleSlug !== undefined) return singleSlug;

  // Last resort: hash the sorted member keys (id, else path) and keep
  // the first seven hex digits.
  const sortedIds = normalisedMembers
    .map((member) => stableKeyOf(member))
    .filter(Boolean)
    .sort();
  const digest = hashString(sortedIds.join("|")).slice(0, 7);
  return `cluster-${digest}`;
}
|
|
894
|
+
|
|
895
|
+
// Build an IDF map over a sibling leaf set once, for reuse across
|
|
896
|
+
// multiple `generateDeterministicSlug` calls on clusters within the
|
|
897
|
+
// same parent directory. Every candidate cluster under a given parent
|
|
898
|
+
// shares the same corpus context, so computing IDF once per directory
|
|
899
|
+
// — rather than once per candidate — is strictly better for any
|
|
900
|
+
// directory with ≥ 2 candidate clusters. Drop the return value into
|
|
901
|
+
// `generateDeterministicSlug(.., .., { precomputedIdf: idfMap })`.
|
|
902
|
+
export function buildSiblingIdfContext(siblings) {
  // Each sibling may be a `{ data }` wrapper or a bare frontmatter
  // object; unwrap it, then tokenise its entry text.
  const tokensFor = (leaf) => {
    const frontmatter = leaf?.data ?? leaf;
    return tokenize(entryText(frontmatter));
  };
  // One token list per sibling feeds a single IDF computation.
  return computeIdf(siblings.map((leaf) => tokensFor(leaf)));
}
|
|
908
|
+
|
|
909
|
+
// Maximum number of cover phrases to splice into a multi-member
|
|
910
|
+
// cluster's synthesised focus. Four is a soft cap that keeps the
|
|
911
|
+
// resulting string short enough to read at a glance while still
|
|
912
|
+
// telling the orchestrator the cluster is multi-topic.
|
|
913
|
+
const PURPOSE_MAX_COVERS = 4;
|
|
914
|
+
|
|
915
|
+
// Deterministic purpose for the NEST stub's `focus:` field.
|
|
916
|
+
//
|
|
917
|
+
// Tiered policy:
|
|
918
|
+
// - Single-member cluster: return the member's own `focus` directly
|
|
919
|
+
// — concise + accurate, e.g. for an X.11 root-containment outlier
|
|
920
|
+
// whose folder will hold exactly one leaf.
|
|
921
|
+
// - Multi-member cluster: aggregate the top-N cover phrases across
|
|
922
|
+
// members ranked by frequency desc, then lex asc, joined with
|
|
923
|
+
// "; ". This is the corrected behaviour: previously the
|
|
924
|
+
// algorithm returned just the lex-first highest-count cover,
|
|
925
|
+
// which on coarse k-means clusters of diverse content (where no
|
|
926
|
+
// cover appears in multiple members) collapsed to "the
|
|
927
|
+
// alphabetically-first cover of any member" — producing
|
|
928
|
+
// misleading single-leaf focus strings on multi-topic clusters
|
|
929
|
+
// (see X.11 wiki output where an 8-leaf ops/observability
|
|
930
|
+
// cluster's focus read "Action items without owner or deadline",
|
|
931
|
+
// which was just one detail of one member).
|
|
932
|
+
// - Multi-member cluster with no covers anywhere: fall back to the
|
|
933
|
+
// focus of the member whose id sorts first. Still deterministic,
|
|
934
|
+
// still driven by member content alone.
|
|
935
|
+
//
|
|
936
|
+
// Accepts either `{ path, data }` leaf wrappers or plain frontmatter
|
|
937
|
+
// objects. Input is normalised via `leaf?.data ?? leaf` at the top so
|
|
938
|
+
// this helper matches `generateDeterministicSlug` + `buildSiblingIdfContext`'s
|
|
939
|
+
// API shape — callers can pass whichever form they already have
|
|
940
|
+
// without getting silent empty results for the plain-object path.
|
|
941
|
+
// Builds the focus string for a cluster from its members' frontmatter.
//
// `componentLeaves` may hold `{ path, data }` wrappers or bare
// frontmatter objects. Always returns a string:
//   - one member: that member's `focus` if it is a string, else "";
//   - several members with covers: up to PURPOSE_MAX_COVERS distinct
//     trimmed cover phrases, ranked by how many members list them
//     (desc) then by phrase (asc), joined with "; ";
//   - several members, no covers: the string `focus` of the member
//     whose key sorts first, where the key is `id` or, when the id is
//     missing, the wrapper's `path` — the same key
//     `generateDeterministicSlug` sorts by. Members with neither are
//     ignored; "" is returned when none remain.
export function deterministicPurpose(componentLeaves) {
  // Keep the wrapper path next to the unwrapped frontmatter so the
  // final fallback can still order members that carry no id.
  const members = componentLeaves.map((leaf) => ({
    data: leaf?.data ?? leaf,
    path: leaf?.path,
  }));
  if (members.length === 1) {
    const only = members[0].data;
    return typeof only?.focus === "string" ? only.focus : "";
  }
  // Count, per cover phrase, the number of members that list it; a
  // phrase repeated inside one member counts once.
  const counts = new Map();
  for (const { data } of members) {
    const covers = Array.isArray(data?.covers) ? data.covers : [];
    const seenInLeaf = new Set();
    for (const cover of covers) {
      const key = typeof cover === "string" ? cover.trim() : "";
      if (!key || seenInLeaf.has(key)) continue;
      seenInLeaf.add(key);
      counts.set(key, (counts.get(key) ?? 0) + 1);
    }
  }
  if (counts.size > 0) {
    const ranked = Array.from(counts.entries()).sort((a, b) => {
      if (b[1] !== a[1]) return b[1] - a[1];
      return a[0] < b[0] ? -1 : a[0] > b[0] ? 1 : 0;
    });
    const top = ranked.slice(0, PURPOSE_MAX_COVERS).map(([cover]) => cover);
    return top.length === 1 ? top[0] : top.join("; ");
  }
  const sorted = members
    .map(({ data, path }) => ({
      // id first, wrapper path second: without the path fallback a
      // cluster of id-less wrappers would always yield "" here.
      key: data?.id ?? path ?? "",
      // Only string focus values are accepted, so the function never
      // returns a non-string.
      focus: typeof data?.focus === "string" ? data.focus : "",
    }))
    .filter((x) => x.key)
    .sort((a, b) => (a.key < b.key ? -1 : a.key > b.key ? 1 : 0));
  return sorted[0]?.focus ?? "";
}
|
|
981
|
+
|
|
982
|
+
// Local helper — a simple, stable non-crypto hash. `createHash` would
|
|
983
|
+
// be fine but adds a node:crypto import and is overkill for a 7-char
|
|
984
|
+
// slug suffix. FNV-1a 32-bit is widely used, stable across Node
|
|
985
|
+
// versions, and deterministic on the same string input.
|
|
986
|
+
// FNV-1a (32-bit) over the UTF-16 code units of `str`, returned as an
// eight-character, zero-padded lowercase hex string.
function hashString(str) {
  const FNV_OFFSET_BASIS = 0x811c9dc5;
  const FNV_PRIME = 0x01000193;
  let state = FNV_OFFSET_BASIS;
  for (let pos = 0; pos < str.length; pos += 1) {
    // XOR the code unit in, multiply by the prime in 32-bit integer
    // arithmetic, then reinterpret the result as unsigned.
    state = Math.imul(state ^ str.charCodeAt(pos), FNV_PRIME) >>> 0;
  }
  return state.toString(16).padStart(8, "0");
}
|
|
994
|
+
|
package/scripts/lib/contract.mjs
CHANGED
|
@@ -135,6 +135,9 @@ const SUBCOMMANDS = {
|
|
|
135
135
|
"--layout-mode",
|
|
136
136
|
"--target",
|
|
137
137
|
"--quality-mode",
|
|
138
|
+
"--fanout-target",
|
|
139
|
+
"--max-depth",
|
|
140
|
+
"--soft-dag-parents",
|
|
138
141
|
"--no-prompt",
|
|
139
142
|
"--accept-dirty",
|
|
140
143
|
"--accept-foreign-target",
|
|
@@ -143,17 +146,42 @@ const SUBCOMMANDS = {
|
|
|
143
146
|
},
|
|
144
147
|
extend: {
|
|
145
148
|
positionals: ["wiki"],
|
|
146
|
-
flags: [
|
|
149
|
+
flags: [
|
|
150
|
+
"--quality-mode",
|
|
151
|
+
"--no-prompt",
|
|
152
|
+
"--json",
|
|
153
|
+
],
|
|
147
154
|
},
|
|
148
155
|
validate: { positionals: ["wiki"], flags: ["--json"] },
|
|
149
156
|
rebuild: {
|
|
150
157
|
positionals: ["wiki"],
|
|
151
|
-
flags: [
|
|
158
|
+
flags: [
|
|
159
|
+
"--quality-mode",
|
|
160
|
+
"--fanout-target",
|
|
161
|
+
"--max-depth",
|
|
162
|
+
"--soft-dag-parents",
|
|
163
|
+
"--review",
|
|
164
|
+
"--no-prompt",
|
|
165
|
+
"--json",
|
|
166
|
+
],
|
|
152
167
|
},
|
|
153
168
|
fix: { positionals: ["wiki"], flags: ["--json"] },
|
|
154
169
|
join: {
|
|
170
|
+
// Variadic positionals — the CLI accepts
|
|
171
|
+
// `join <wiki-a> <wiki-b> [<wiki-c>...]`. `positionals` lists
|
|
172
|
+
// the minimum shape; `min_positionals` / `variadic` describe
|
|
173
|
+
// the full contract so consumers generating invocations or
|
|
174
|
+
// validating argument counts don't assume exactly two sources.
|
|
155
175
|
positionals: ["wiki-a", "wiki-b"],
|
|
156
|
-
|
|
176
|
+
min_positionals: 2,
|
|
177
|
+
variadic: true,
|
|
178
|
+
flags: [
|
|
179
|
+
"--target",
|
|
180
|
+
"--canonical",
|
|
181
|
+
"--id-collision",
|
|
182
|
+
"--quality-mode",
|
|
183
|
+
"--json",
|
|
184
|
+
],
|
|
157
185
|
},
|
|
158
186
|
rollback: { positionals: ["wiki"], flags: ["--to", "--json"] },
|
|
159
187
|
init: {
|