selftune 0.2.18 → 0.2.20
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +9 -4
- package/apps/local-dashboard/dist/assets/index-D8O-RG1I.js +60 -0
- package/apps/local-dashboard/dist/assets/index-_EcLywDg.css +1 -0
- package/apps/local-dashboard/dist/assets/vendor-table-BIiI3YhS.js +1 -0
- package/apps/local-dashboard/dist/assets/vendor-ui-CGEmUayx.js +12 -0
- package/apps/local-dashboard/dist/index.html +5 -5
- package/cli/selftune/alpha-upload/stage-canonical.ts +7 -6
- package/cli/selftune/constants.ts +10 -0
- package/cli/selftune/contribute/contribute.ts +30 -2
- package/cli/selftune/contribution-config.ts +249 -0
- package/cli/selftune/contribution-relay.ts +177 -0
- package/cli/selftune/contribution-signals.ts +219 -0
- package/cli/selftune/contribution-staging.ts +147 -0
- package/cli/selftune/contributions.ts +532 -0
- package/cli/selftune/creator-contributions.ts +333 -0
- package/cli/selftune/dashboard-contract.ts +209 -1
- package/cli/selftune/dashboard-server.ts +45 -11
- package/cli/selftune/eval/family-overlap.ts +714 -0
- package/cli/selftune/eval/hooks-to-evals.ts +182 -28
- package/cli/selftune/eval/synthetic-evals.ts +298 -11
- package/cli/selftune/evolution/evidence.ts +5 -0
- package/cli/selftune/evolution/evolve-body.ts +62 -2
- package/cli/selftune/evolution/evolve.ts +58 -1
- package/cli/selftune/evolution/validate-body.ts +10 -0
- package/cli/selftune/evolution/validate-host-replay.ts +236 -0
- package/cli/selftune/evolution/validate-proposal.ts +10 -0
- package/cli/selftune/evolution/validate-routing.ts +112 -5
- package/cli/selftune/export.ts +2 -2
- package/cli/selftune/index.ts +41 -5
- package/cli/selftune/ingestors/codex-rollout.ts +31 -35
- package/cli/selftune/ingestors/codex-wrapper.ts +32 -24
- package/cli/selftune/localdb/db.ts +2 -2
- package/cli/selftune/localdb/direct-write.ts +8 -3
- package/cli/selftune/localdb/materialize.ts +7 -2
- package/cli/selftune/localdb/queries.ts +712 -31
- package/cli/selftune/localdb/schema.ts +30 -1
- package/cli/selftune/recover.ts +153 -0
- package/cli/selftune/repair/skill-usage.ts +363 -4
- package/cli/selftune/routes/actions.ts +35 -1
- package/cli/selftune/routes/analytics.ts +14 -0
- package/cli/selftune/routes/index.ts +1 -0
- package/cli/selftune/routes/overview.ts +112 -4
- package/cli/selftune/routes/skill-report.ts +575 -11
- package/cli/selftune/status.ts +81 -2
- package/cli/selftune/sync.ts +56 -2
- package/cli/selftune/trust-model.ts +66 -0
- package/cli/selftune/types.ts +103 -0
- package/cli/selftune/utils/skill-detection.ts +43 -0
- package/cli/selftune/utils/text-similarity.ts +73 -0
- package/cli/selftune/watchlist.ts +65 -0
- package/package.json +1 -1
- package/packages/ui/src/components/ActivityTimeline.tsx +165 -150
- package/packages/ui/src/components/EvidenceViewer.tsx +419 -145
- package/packages/ui/src/components/EvolutionTimeline.tsx +81 -29
- package/packages/ui/src/components/OrchestrateRunsPanel.tsx +33 -16
- package/packages/ui/src/components/RecentActivityFeed.tsx +72 -41
- package/packages/ui/src/components/section-cards.tsx +12 -9
- package/packages/ui/src/primitives/card.tsx +1 -1
- package/packages/ui/src/types.ts +4 -0
- package/skill/SKILL.md +11 -1
- package/skill/Workflows/AlphaUpload.md +4 -0
- package/skill/Workflows/Composability.md +78 -0
- package/skill/Workflows/Contribute.md +6 -3
- package/skill/Workflows/Contributions.md +97 -0
- package/skill/Workflows/CreatorContributions.md +74 -0
- package/skill/Workflows/Dashboard.md +31 -0
- package/skill/Workflows/Evals.md +57 -8
- package/skill/Workflows/Evolve.md +23 -0
- package/skill/Workflows/Ingest.md +7 -0
- package/skill/Workflows/Initialize.md +20 -1
- package/skill/Workflows/Recover.md +84 -0
- package/skill/Workflows/RepairSkillUsage.md +12 -4
- package/skill/Workflows/Sync.md +18 -12
- package/apps/local-dashboard/dist/assets/index-BMIS6uUh.css +0 -2
- package/apps/local-dashboard/dist/assets/index-DOu3iLD9.js +0 -16
- package/apps/local-dashboard/dist/assets/vendor-table-pHbDxq36.js +0 -8
- package/apps/local-dashboard/dist/assets/vendor-ui-DIwlrGlb.js +0 -12
|
@@ -31,12 +31,13 @@ import { callViaSubagent } from "../utils/llm-call.js";
|
|
|
31
31
|
import { appendAuditEntry } from "./audit.js";
|
|
32
32
|
import { checkConstitutionSizeOnly } from "./constitutional.js";
|
|
33
33
|
import { parseSkillSections, replaceBody, replaceSection } from "./deploy-proposal.js";
|
|
34
|
-
import { appendEvidenceEntry } from "./evidence.js";
|
|
34
|
+
import { appendEvidenceEntry, buildValidationEvidenceRef } from "./evidence.js";
|
|
35
35
|
import { extractFailurePatterns } from "./extract-patterns.js";
|
|
36
36
|
import { type ExecutionContext, generateBodyProposal } from "./propose-body.js";
|
|
37
37
|
import { generateRoutingProposal } from "./propose-routing.js";
|
|
38
38
|
import { refineBodyProposal } from "./refine-body.js";
|
|
39
39
|
import { validateBodyProposal } from "./validate-body.js";
|
|
40
|
+
import { buildRoutingReplayFixture } from "./validate-host-replay.js";
|
|
40
41
|
import { validateRoutingProposal } from "./validate-routing.js";
|
|
41
42
|
|
|
42
43
|
// ---------------------------------------------------------------------------
|
|
@@ -106,6 +107,10 @@ function createAuditEntry(
|
|
|
106
107
|
action: EvolutionAuditEntry["action"],
|
|
107
108
|
details: string,
|
|
108
109
|
skillName?: string,
|
|
110
|
+
provenance?: Pick<
|
|
111
|
+
EvolutionAuditEntry,
|
|
112
|
+
"validation_mode" | "validation_agent" | "validation_fixture_id" | "validation_evidence_ref"
|
|
113
|
+
>,
|
|
109
114
|
): EvolutionAuditEntry {
|
|
110
115
|
return {
|
|
111
116
|
timestamp: new Date().toISOString(),
|
|
@@ -113,6 +118,14 @@ function createAuditEntry(
|
|
|
113
118
|
skill_name: skillName,
|
|
114
119
|
action,
|
|
115
120
|
details,
|
|
121
|
+
...(provenance?.validation_mode ? { validation_mode: provenance.validation_mode } : {}),
|
|
122
|
+
...(provenance?.validation_agent ? { validation_agent: provenance.validation_agent } : {}),
|
|
123
|
+
...(provenance?.validation_fixture_id
|
|
124
|
+
? { validation_fixture_id: provenance.validation_fixture_id }
|
|
125
|
+
: {}),
|
|
126
|
+
...(provenance?.validation_evidence_ref
|
|
127
|
+
? { validation_evidence_ref: provenance.validation_evidence_ref }
|
|
128
|
+
: {}),
|
|
116
129
|
};
|
|
117
130
|
}
|
|
118
131
|
|
|
@@ -181,8 +194,12 @@ export async function evolveBody(
|
|
|
181
194
|
proposalId: string,
|
|
182
195
|
action: EvolutionAuditEntry["action"],
|
|
183
196
|
details: string,
|
|
197
|
+
provenance?: Pick<
|
|
198
|
+
EvolutionAuditEntry,
|
|
199
|
+
"validation_mode" | "validation_agent" | "validation_fixture_id" | "validation_evidence_ref"
|
|
200
|
+
>,
|
|
184
201
|
): void {
|
|
185
|
-
const entry = createAuditEntry(proposalId, action, details, skillName);
|
|
202
|
+
const entry = createAuditEntry(proposalId, action, details, skillName, provenance);
|
|
186
203
|
auditEntries.push(entry);
|
|
187
204
|
try {
|
|
188
205
|
_appendAuditEntry(entry);
|
|
@@ -443,11 +460,17 @@ export async function evolveBody(
|
|
|
443
460
|
const validationModelFlag = options.validationModel ?? studentModel;
|
|
444
461
|
let validation: BodyValidationResult;
|
|
445
462
|
if (target === "routing") {
|
|
463
|
+
const replayFixture = buildRoutingReplayFixture({
|
|
464
|
+
skillName,
|
|
465
|
+
skillPath,
|
|
466
|
+
platform: studentAgent === "codex" ? "codex" : "claude_code",
|
|
467
|
+
});
|
|
446
468
|
validation = await _validateRoutingProposal(
|
|
447
469
|
proposal,
|
|
448
470
|
evalSet,
|
|
449
471
|
studentAgent,
|
|
450
472
|
validationModelFlag,
|
|
473
|
+
{ replayFixture },
|
|
451
474
|
);
|
|
452
475
|
} else {
|
|
453
476
|
validation = await _validateBodyProposal(
|
|
@@ -458,11 +481,18 @@ export async function evolveBody(
|
|
|
458
481
|
);
|
|
459
482
|
}
|
|
460
483
|
lastValidation = validation;
|
|
484
|
+
const validatedEvidenceRef = buildValidationEvidenceRef(proposal.proposal_id, "validated");
|
|
461
485
|
|
|
462
486
|
recordAudit(
|
|
463
487
|
proposal.proposal_id,
|
|
464
488
|
"validated",
|
|
465
489
|
`Validation: ${validation.gates_passed}/${validation.gates_total} gates passed`,
|
|
490
|
+
{
|
|
491
|
+
validation_mode: validation.validation_mode,
|
|
492
|
+
validation_agent: validation.validation_agent,
|
|
493
|
+
validation_fixture_id: validation.validation_fixture_id,
|
|
494
|
+
validation_evidence_ref: validatedEvidenceRef,
|
|
495
|
+
},
|
|
466
496
|
);
|
|
467
497
|
recordEvidence({
|
|
468
498
|
timestamp: new Date().toISOString(),
|
|
@@ -480,6 +510,12 @@ export async function evolveBody(
|
|
|
480
510
|
gates_total: validation.gates_total,
|
|
481
511
|
gate_results: validation.gate_results,
|
|
482
512
|
regressions: validation.regressions,
|
|
513
|
+
before_pass_rate: validation.before_pass_rate,
|
|
514
|
+
after_pass_rate: validation.after_pass_rate,
|
|
515
|
+
validation_mode: validation.validation_mode,
|
|
516
|
+
validation_agent: validation.validation_agent,
|
|
517
|
+
validation_fixture_id: validation.validation_fixture_id,
|
|
518
|
+
validation_evidence_ref: validatedEvidenceRef,
|
|
483
519
|
},
|
|
484
520
|
});
|
|
485
521
|
|
|
@@ -491,6 +527,12 @@ export async function evolveBody(
|
|
|
491
527
|
proposal.proposal_id,
|
|
492
528
|
"rejected",
|
|
493
529
|
`Validation failed: ${validation.gates_passed}/${validation.gates_total} gates`,
|
|
530
|
+
{
|
|
531
|
+
validation_mode: validation.validation_mode,
|
|
532
|
+
validation_agent: validation.validation_agent,
|
|
533
|
+
validation_fixture_id: validation.validation_fixture_id,
|
|
534
|
+
validation_evidence_ref: buildValidationEvidenceRef(proposal.proposal_id, "rejected"),
|
|
535
|
+
},
|
|
494
536
|
);
|
|
495
537
|
recordEvidence({
|
|
496
538
|
timestamp: new Date().toISOString(),
|
|
@@ -508,6 +550,12 @@ export async function evolveBody(
|
|
|
508
550
|
gates_total: validation.gates_total,
|
|
509
551
|
gate_results: validation.gate_results,
|
|
510
552
|
regressions: validation.regressions,
|
|
553
|
+
before_pass_rate: validation.before_pass_rate,
|
|
554
|
+
after_pass_rate: validation.after_pass_rate,
|
|
555
|
+
validation_mode: validation.validation_mode,
|
|
556
|
+
validation_agent: validation.validation_agent,
|
|
557
|
+
validation_fixture_id: validation.validation_fixture_id,
|
|
558
|
+
validation_evidence_ref: buildValidationEvidenceRef(proposal.proposal_id, "rejected"),
|
|
511
559
|
},
|
|
512
560
|
});
|
|
513
561
|
|
|
@@ -607,6 +655,12 @@ export async function evolveBody(
|
|
|
607
655
|
lastProposal.proposal_id,
|
|
608
656
|
"deployed",
|
|
609
657
|
`Deployed ${target} proposal for ${skillName}`,
|
|
658
|
+
{
|
|
659
|
+
validation_mode: lastValidation.validation_mode,
|
|
660
|
+
validation_agent: lastValidation.validation_agent,
|
|
661
|
+
validation_fixture_id: lastValidation.validation_fixture_id,
|
|
662
|
+
validation_evidence_ref: buildValidationEvidenceRef(lastProposal.proposal_id, "deployed"),
|
|
663
|
+
},
|
|
610
664
|
);
|
|
611
665
|
recordEvidence({
|
|
612
666
|
timestamp: new Date().toISOString(),
|
|
@@ -624,6 +678,12 @@ export async function evolveBody(
|
|
|
624
678
|
gates_total: lastValidation.gates_total,
|
|
625
679
|
gate_results: lastValidation.gate_results,
|
|
626
680
|
regressions: lastValidation.regressions,
|
|
681
|
+
before_pass_rate: lastValidation.before_pass_rate,
|
|
682
|
+
after_pass_rate: lastValidation.after_pass_rate,
|
|
683
|
+
validation_mode: lastValidation.validation_mode,
|
|
684
|
+
validation_agent: lastValidation.validation_agent,
|
|
685
|
+
validation_fixture_id: lastValidation.validation_fixture_id,
|
|
686
|
+
validation_evidence_ref: buildValidationEvidenceRef(lastProposal.proposal_id, "deployed"),
|
|
627
687
|
},
|
|
628
688
|
});
|
|
629
689
|
|
|
@@ -43,7 +43,7 @@ import { createEvolveTUI } from "../utils/tui.js";
|
|
|
43
43
|
import { appendAuditEntry } from "./audit.js";
|
|
44
44
|
import { checkConstitution } from "./constitutional.js";
|
|
45
45
|
import { scoreDescription } from "./description-quality.js";
|
|
46
|
-
import { appendEvidenceEntry } from "./evidence.js";
|
|
46
|
+
import { appendEvidenceEntry, buildValidationEvidenceRef } from "./evidence.js";
|
|
47
47
|
import { extractFailurePatterns } from "./extract-patterns.js";
|
|
48
48
|
import {
|
|
49
49
|
computeInvocationScores,
|
|
@@ -139,6 +139,10 @@ function createAuditEntry(
|
|
|
139
139
|
evalSnapshot?: EvalPassRate,
|
|
140
140
|
skillName?: string,
|
|
141
141
|
iterationsUsed?: number,
|
|
142
|
+
provenance?: Pick<
|
|
143
|
+
EvolutionAuditEntry,
|
|
144
|
+
"validation_mode" | "validation_agent" | "validation_fixture_id" | "validation_evidence_ref"
|
|
145
|
+
>,
|
|
142
146
|
): EvolutionAuditEntry {
|
|
143
147
|
return {
|
|
144
148
|
timestamp: new Date().toISOString(),
|
|
@@ -148,6 +152,14 @@ function createAuditEntry(
|
|
|
148
152
|
...(skillName ? { skill_name: skillName } : {}),
|
|
149
153
|
...(evalSnapshot ? { eval_snapshot: evalSnapshot } : {}),
|
|
150
154
|
...(iterationsUsed != null ? { iterations_used: iterationsUsed } : {}),
|
|
155
|
+
...(provenance?.validation_mode ? { validation_mode: provenance.validation_mode } : {}),
|
|
156
|
+
...(provenance?.validation_agent ? { validation_agent: provenance.validation_agent } : {}),
|
|
157
|
+
...(provenance?.validation_fixture_id
|
|
158
|
+
? { validation_fixture_id: provenance.validation_fixture_id }
|
|
159
|
+
: {}),
|
|
160
|
+
...(provenance?.validation_evidence_ref
|
|
161
|
+
? { validation_evidence_ref: provenance.validation_evidence_ref }
|
|
162
|
+
: {}),
|
|
151
163
|
};
|
|
152
164
|
}
|
|
153
165
|
|
|
@@ -289,6 +301,10 @@ export async function evolve(
|
|
|
289
301
|
details: string,
|
|
290
302
|
evalSnapshot?: EvalPassRate,
|
|
291
303
|
iterationsUsed?: number,
|
|
304
|
+
provenance?: Pick<
|
|
305
|
+
EvolutionAuditEntry,
|
|
306
|
+
"validation_mode" | "validation_agent" | "validation_fixture_id" | "validation_evidence_ref"
|
|
307
|
+
>,
|
|
292
308
|
): void {
|
|
293
309
|
const entry = createAuditEntry(
|
|
294
310
|
proposalId,
|
|
@@ -297,6 +313,7 @@ export async function evolve(
|
|
|
297
313
|
evalSnapshot,
|
|
298
314
|
skillName,
|
|
299
315
|
iterationsUsed,
|
|
316
|
+
provenance,
|
|
300
317
|
);
|
|
301
318
|
auditEntries.push(entry);
|
|
302
319
|
try {
|
|
@@ -637,10 +654,18 @@ export async function evolve(
|
|
|
637
654
|
options.validationModel,
|
|
638
655
|
);
|
|
639
656
|
llmCallCount += countValidationLlmCalls(evalSet.length);
|
|
657
|
+
const evidenceRef = buildValidationEvidenceRef(proposal.proposal_id, "validated");
|
|
640
658
|
recordAudit(
|
|
641
659
|
proposal.proposal_id,
|
|
642
660
|
"validated",
|
|
643
661
|
`Pareto validation: improved=${validation.improved}`,
|
|
662
|
+
undefined,
|
|
663
|
+
undefined,
|
|
664
|
+
{
|
|
665
|
+
validation_mode: validation.validation_mode,
|
|
666
|
+
validation_agent: validation.validation_agent,
|
|
667
|
+
validation_evidence_ref: evidenceRef,
|
|
668
|
+
},
|
|
644
669
|
);
|
|
645
670
|
recordEvidence({
|
|
646
671
|
timestamp: new Date().toISOString(),
|
|
@@ -660,6 +685,9 @@ export async function evolve(
|
|
|
660
685
|
regressions: validation.regressions,
|
|
661
686
|
new_passes: validation.new_passes,
|
|
662
687
|
per_entry_results: validation.per_entry_results,
|
|
688
|
+
validation_mode: validation.validation_mode,
|
|
689
|
+
validation_agent: validation.validation_agent,
|
|
690
|
+
validation_evidence_ref: evidenceRef,
|
|
663
691
|
},
|
|
664
692
|
});
|
|
665
693
|
|
|
@@ -866,11 +894,18 @@ export async function evolve(
|
|
|
866
894
|
failed: evalSet.length - Math.round(validation.after_pass_rate * evalSet.length),
|
|
867
895
|
pass_rate: validation.after_pass_rate,
|
|
868
896
|
};
|
|
897
|
+
const validatedEvidenceRef = buildValidationEvidenceRef(proposal.proposal_id, "validated");
|
|
869
898
|
recordAudit(
|
|
870
899
|
proposal.proposal_id,
|
|
871
900
|
"validated",
|
|
872
901
|
`Validation complete: improved=${validation.improved}`,
|
|
873
902
|
evalSnapshot,
|
|
903
|
+
undefined,
|
|
904
|
+
{
|
|
905
|
+
validation_mode: validation.validation_mode,
|
|
906
|
+
validation_agent: validation.validation_agent,
|
|
907
|
+
validation_evidence_ref: validatedEvidenceRef,
|
|
908
|
+
},
|
|
874
909
|
);
|
|
875
910
|
recordEvidence({
|
|
876
911
|
timestamp: new Date().toISOString(),
|
|
@@ -890,6 +925,9 @@ export async function evolve(
|
|
|
890
925
|
regressions: validation.regressions,
|
|
891
926
|
new_passes: validation.new_passes,
|
|
892
927
|
per_entry_results: validation.per_entry_results,
|
|
928
|
+
validation_mode: validation.validation_mode,
|
|
929
|
+
validation_agent: validation.validation_agent,
|
|
930
|
+
validation_evidence_ref: validatedEvidenceRef,
|
|
893
931
|
},
|
|
894
932
|
});
|
|
895
933
|
|
|
@@ -906,10 +944,18 @@ export async function evolve(
|
|
|
906
944
|
|
|
907
945
|
if (!validation.improved) {
|
|
908
946
|
feedbackReason = `Validation failed: net_change=${validation.net_change.toFixed(3)}, improved=false`;
|
|
947
|
+
const rejectedEvidenceRef = buildValidationEvidenceRef(proposal.proposal_id, "rejected");
|
|
909
948
|
recordAudit(
|
|
910
949
|
proposal.proposal_id,
|
|
911
950
|
"rejected",
|
|
912
951
|
`Validation failed: net_change=${validation.net_change.toFixed(3)} (stopping: ${stopping.reason})`,
|
|
952
|
+
undefined,
|
|
953
|
+
undefined,
|
|
954
|
+
{
|
|
955
|
+
validation_mode: validation.validation_mode,
|
|
956
|
+
validation_agent: validation.validation_agent,
|
|
957
|
+
validation_evidence_ref: rejectedEvidenceRef,
|
|
958
|
+
},
|
|
913
959
|
);
|
|
914
960
|
recordEvidence({
|
|
915
961
|
timestamp: new Date().toISOString(),
|
|
@@ -929,6 +975,9 @@ export async function evolve(
|
|
|
929
975
|
regressions: validation.regressions,
|
|
930
976
|
new_passes: validation.new_passes,
|
|
931
977
|
per_entry_results: validation.per_entry_results,
|
|
978
|
+
validation_mode: validation.validation_mode,
|
|
979
|
+
validation_agent: validation.validation_agent,
|
|
980
|
+
validation_evidence_ref: rejectedEvidenceRef,
|
|
932
981
|
},
|
|
933
982
|
});
|
|
934
983
|
|
|
@@ -1138,6 +1187,11 @@ export async function evolve(
|
|
|
1138
1187
|
pass_rate: lastValidation.after_pass_rate,
|
|
1139
1188
|
},
|
|
1140
1189
|
iterationsCompleted,
|
|
1190
|
+
{
|
|
1191
|
+
validation_mode: lastValidation.validation_mode,
|
|
1192
|
+
validation_agent: lastValidation.validation_agent,
|
|
1193
|
+
validation_evidence_ref: buildValidationEvidenceRef(lastProposal.proposal_id, "deployed"),
|
|
1194
|
+
},
|
|
1141
1195
|
);
|
|
1142
1196
|
recordEvidence({
|
|
1143
1197
|
timestamp: new Date().toISOString(),
|
|
@@ -1157,6 +1211,9 @@ export async function evolve(
|
|
|
1157
1211
|
regressions: lastValidation.regressions,
|
|
1158
1212
|
new_passes: lastValidation.new_passes,
|
|
1159
1213
|
per_entry_results: lastValidation.per_entry_results,
|
|
1214
|
+
validation_mode: lastValidation.validation_mode,
|
|
1215
|
+
validation_agent: lastValidation.validation_agent,
|
|
1216
|
+
validation_evidence_ref: buildValidationEvidenceRef(lastProposal.proposal_id, "deployed"),
|
|
1160
1217
|
},
|
|
1161
1218
|
});
|
|
1162
1219
|
}
|
|
@@ -209,6 +209,8 @@ export async function validateBodyProposal(
|
|
|
209
209
|
gate_results: gateResults,
|
|
210
210
|
improved: false,
|
|
211
211
|
regressions: [],
|
|
212
|
+
validation_mode: "structural_guard",
|
|
213
|
+
validation_agent: agent,
|
|
212
214
|
};
|
|
213
215
|
}
|
|
214
216
|
|
|
@@ -250,5 +252,13 @@ export async function validateBodyProposal(
|
|
|
250
252
|
gate_results: gateResults,
|
|
251
253
|
improved: gatesPassed === 3,
|
|
252
254
|
regressions: accuracy.regressions,
|
|
255
|
+
validation_mode: "llm_judge",
|
|
256
|
+
validation_agent: agent,
|
|
257
|
+
...(evalSet.length > 0
|
|
258
|
+
? {
|
|
259
|
+
before_pass_rate: accuracy.before_pass_rate,
|
|
260
|
+
after_pass_rate: accuracy.after_pass_rate,
|
|
261
|
+
}
|
|
262
|
+
: {}),
|
|
253
263
|
};
|
|
254
264
|
}
|
|
@@ -0,0 +1,236 @@
|
|
|
1
|
+
import { existsSync, readFileSync, readdirSync, realpathSync, statSync } from "node:fs";
|
|
2
|
+
import { basename, dirname, join } from "node:path";
|
|
3
|
+
|
|
4
|
+
import type { EvalEntry, RoutingReplayEntryResult, RoutingReplayFixture } from "../types.js";
|
|
5
|
+
import { parseFrontmatter } from "../utils/frontmatter.js";
|
|
6
|
+
import { containsWholeSkillMention } from "../utils/skill-discovery.js";
|
|
7
|
+
import { findGitRepositoryRoot } from "../utils/skill-discovery.js";
|
|
8
|
+
import {
|
|
9
|
+
extractWhenToUseLines,
|
|
10
|
+
jaccardSimilarity,
|
|
11
|
+
tokenizeText,
|
|
12
|
+
} from "../utils/text-similarity.js";
|
|
13
|
+
|
|
14
|
+
/**
 * Text surface of a single skill as used by the host replay scorer.
 *
 * Built from a SKILL.md file by `loadReplaySkillSurface`.
 */
interface ReplaySkillSurface {
  /** Frontmatter `name`, or the skill directory name when the frontmatter name is blank. */
  skillName: string;
  /** Tokens produced by `tokenizeText` from the frontmatter description. */
  descriptionTokens: Set<string>;
  /** Tokens produced by `tokenizeText` from the lines returned by `extractWhenToUseLines`. */
  whenToUseTokens: Set<string>;
}
|
|
19
|
+
|
|
20
|
+
/**
 * Score floor for host replay: a target score below this value is reported as
 * "not triggered", whether it came from routing trigger phrases or from
 * skill-surface overlap.
 *
 * The value is tuned to filter out weak false positives while keeping recall
 * for short routing phrases and sparse skill surfaces.
 */
const HOST_REPLAY_MATCH_THRESHOLD = 0.18;
|
|
26
|
+
|
|
27
|
+
/**
 * Canonicalizes a filesystem path with `realpathSync`.
 *
 * Best-effort: when `realpathSync` throws, the caller-supplied path is
 * returned unchanged instead of propagating the error.
 *
 * @param path Path to canonicalize.
 * @returns The resolved real path, or `path` itself when resolution fails.
 */
function resolveReplayPath(path: string): string {
  let resolved = path;
  try {
    resolved = realpathSync(path);
  } catch {
    // Keep the input as-is when it cannot be resolved.
  }
  return resolved;
}
|
|
34
|
+
|
|
35
|
+
/**
 * Lists the SKILL.md files of sibling skills that sit next to the target
 * skill inside the same parent ("registry") directory.
 *
 * @param targetSkillPath Path to the target skill's SKILL.md file.
 * @returns Resolved SKILL.md paths of every sibling directory that contains
 *   one, ordered with `localeCompare`. Empty when the registry directory
 *   cannot be read, so the fixture degrades to target-only.
 */
function listCompetingSkillPaths(targetSkillPath: string): string[] {
  const skillDir = dirname(resolveReplayPath(targetSkillPath));
  const registryDir = dirname(skillDir);
  const ownDirName = basename(skillDir);

  let entries: string[];
  try {
    entries = readdirSync(registryDir);
  } catch {
    // Ignore unreadable registries and treat the fixture as target-only.
    return [];
  }

  const siblings: string[] = [];
  for (const name of entries) {
    if (name === ownDirName) continue;

    const siblingDir = join(registryDir, name);
    let isDirectory = false;
    try {
      isDirectory = statSync(siblingDir).isDirectory();
    } catch {
      // Entries that cannot be stat'ed are skipped.
      isDirectory = false;
    }
    if (!isDirectory) continue;

    const siblingSkillFile = join(siblingDir, "SKILL.md");
    if (existsSync(siblingSkillFile)) {
      siblings.push(resolveReplayPath(siblingSkillFile));
    }
  }

  siblings.sort((left, right) => left.localeCompare(right));
  return siblings;
}
|
|
62
|
+
|
|
63
|
+
/**
 * Assembles the replay fixture describing a target skill and the sibling
 * skills it competes with for routing.
 *
 * @param options.skillName Name recorded as the fixture's target skill.
 * @param options.skillPath Path to the target SKILL.md; canonicalized before use.
 * @param options.platform Host platform; falls back to `"claude_code"`.
 * @param options.fixtureId Explicit fixture id; falls back to
 *   `auto-<platform>-<skillName>`.
 * @param options.workspaceRoot Explicit workspace root; otherwise looked up
 *   with `findGitRepositoryRoot` starting from the directory two levels above
 *   the resolved SKILL.md.
 * @returns The fixture; `workspace_root` is only present when a truthy root
 *   was supplied or found.
 */
export function buildRoutingReplayFixture(options: {
  skillName: string;
  skillPath: string;
  platform?: RoutingReplayFixture["platform"];
  fixtureId?: string;
  workspaceRoot?: string;
}): RoutingReplayFixture {
  const { skillName, fixtureId } = options;

  const resolvedSkillPath = resolveReplayPath(options.skillPath);
  const registryDir = dirname(dirname(resolvedSkillPath));
  const root = options.workspaceRoot ?? findGitRepositoryRoot(registryDir);
  const platform = options.platform ?? "claude_code";
  const resolvedFixtureId = fixtureId ?? `auto-${platform}-${skillName}`;

  return {
    fixture_id: resolvedFixtureId,
    platform,
    target_skill_name: skillName,
    target_skill_path: resolvedSkillPath,
    competing_skill_paths: listCompetingSkillPaths(resolvedSkillPath),
    ...(root ? { workspace_root: root } : {}),
  };
}
|
|
84
|
+
|
|
85
|
+
/**
 * Reads a SKILL.md file and turns it into the token sets used for replay
 * scoring.
 *
 * Best-effort: any failure while reading or parsing yields a surface named
 * after the skill's directory with empty token sets.
 *
 * @param skillPath Path to the SKILL.md file.
 * @returns The skill's name plus its description and when-to-use tokens.
 */
function loadReplaySkillSurface(skillPath: string): ReplaySkillSurface {
  const fallbackName = basename(dirname(skillPath)) || "unknown-skill";

  try {
    const frontmatter = parseFrontmatter(readFileSync(skillPath, "utf8"));
    // A blank frontmatter name falls back to the directory name.
    const skillName = frontmatter.name.trim() || fallbackName;
    const descriptionTokens = tokenizeText(frontmatter.description);
    const whenToUseText = extractWhenToUseLines(frontmatter.body).join(" ");
    const whenToUseTokens = tokenizeText(whenToUseText);
    return { skillName, descriptionTokens, whenToUseTokens };
  } catch {
    return {
      skillName: fallbackName,
      descriptionTokens: new Set<string>(),
      whenToUseTokens: new Set<string>(),
    };
  }
}
|
|
103
|
+
|
|
104
|
+
/**
 * Extracts candidate trigger phrases from a markdown routing table.
 *
 * The first two non-blank lines are skipped as table preamble (normally the
 * header row and the delimiter row). From every remaining `| ... |` row the
 * first cell is split on commas, slashes and the word "or"; each piece is
 * trimmed, stripped of one leading/trailing quote or backtick, and kept when
 * it is at least three characters long.
 *
 * Rows whose first cell is a markdown delimiter (`---`, `:---:`, ...) are
 * ignored. When a heading or intro line precedes the table, the delimiter row
 * is no longer one of the two skipped lines and would otherwise be reported
 * as a phrase such as "---".
 *
 * @param routing Markdown text of the routing section.
 * @returns Trigger phrases in table order; empty when fewer than three
 *   non-blank lines are present.
 */
function extractRoutingTriggerPhrases(routing: string): string[] {
  const lines = routing
    .trim()
    .split("\n")
    .map((line) => line.trim())
    .filter(Boolean);
  if (lines.length < 3) return [];

  // Matches a table delimiter cell, including alignment colons.
  const delimiterCell = /^:?-+:?$/;

  const phrases: string[] = [];
  for (const row of lines.slice(2)) {
    if (!row.startsWith("|") || !row.endsWith("|")) continue;
    // Index 0 is the empty string before the leading pipe; index 1 is the first cell.
    const triggerCell = row.split("|")[1]?.trim();
    if (!triggerCell) continue;
    if (delimiterCell.test(triggerCell)) continue;
    for (const part of triggerCell.split(/,|\/| or /i)) {
      const phrase = part.trim().replace(/^["'`]|["'`]$/g, "");
      if (phrase.length >= 3) phrases.push(phrase);
    }
  }
  return phrases;
}
|
|
125
|
+
|
|
126
|
+
/**
 * Scores a query against a list of routing trigger phrases.
 *
 * A phrase that appears verbatim (case-insensitively) inside the query scores
 * 1; any other phrase scores the `jaccardSimilarity` between the query tokens
 * and the phrase tokens. The best score over all phrases is returned.
 *
 * @param query User query text.
 * @param triggerPhrases Phrases extracted from the routing table.
 * @returns The highest per-phrase score, or 0 when there are no phrases.
 */
function scoreQueryAgainstTriggerPhrases(query: string, triggerPhrases: string[]): number {
  const loweredQuery = query.toLowerCase();
  const queryTokens = tokenizeText(query);

  return triggerPhrases.reduce((highest, phrase) => {
    const phraseScore = loweredQuery.includes(phrase.toLowerCase())
      ? 1
      : jaccardSimilarity(queryTokens, tokenizeText(phrase));
    return Math.max(highest, phraseScore);
  }, 0);
}
|
|
140
|
+
|
|
141
|
+
/**
 * Scores a query against a skill surface.
 *
 * @param query User query text.
 * @param surface Token sets of the skill being scored.
 * @returns The larger of the `jaccardSimilarity` against the description
 *   tokens and against the when-to-use tokens.
 */
function scoreQueryAgainstSkillSurface(query: string, surface: ReplaySkillSurface): number {
  const queryTokens = tokenizeText(query);
  const descriptionScore = jaccardSimilarity(queryTokens, surface.descriptionTokens);
  const whenToUseScore = jaccardSimilarity(queryTokens, surface.whenToUseTokens);
  return Math.max(descriptionScore, whenToUseScore);
}
|
|
148
|
+
|
|
149
|
+
/**
 * Decides whether a query would route to the target skill in the replay
 * fixture, and records why.
 *
 * Decision order:
 * 1. An explicit mention of the target skill name triggers.
 * 2. An explicit mention of any competing skill name does not trigger.
 * 3. Otherwise the target score is the better of the routing-phrase score and
 *    the target surface score; below `HOST_REPLAY_MATCH_THRESHOLD` it does
 *    not trigger.
 * 4. A competing surface that scores at least as high as the target wins
 *    (ties go to the competitor).
 * 5. Anything left triggers, with evidence naming whichever signal was
 *    stronger.
 *
 * @param query User query text; trimmed before use.
 * @param routing Markdown routing table of the proposal under test.
 * @param targetSurface Surface of the skill being validated.
 * @param competingSurfaces Surfaces of sibling skills.
 * @returns Whether the target triggered, plus a human-readable reason.
 */
function evaluateReplayTrigger(
  query: string,
  routing: string,
  targetSurface: ReplaySkillSurface,
  competingSurfaces: ReplaySkillSurface[],
): { triggered: boolean; evidence: string } {
  const trimmedQuery = query.trim();

  if (containsWholeSkillMention(trimmedQuery, targetSurface.skillName)) {
    return {
      triggered: true,
      evidence: `explicit target mention: ${targetSurface.skillName}`,
    };
  }

  const namedCompetitor = competingSurfaces.find((surface) =>
    containsWholeSkillMention(trimmedQuery, surface.skillName),
  );
  if (namedCompetitor) {
    return {
      triggered: false,
      evidence: `explicit competing skill mention: ${namedCompetitor.skillName}`,
    };
  }

  const triggerScore = scoreQueryAgainstTriggerPhrases(
    trimmedQuery,
    extractRoutingTriggerPhrases(routing),
  );
  const surfaceScore = scoreQueryAgainstSkillSurface(trimmedQuery, targetSurface);
  const targetScore = Math.max(triggerScore, surfaceScore);

  // Highest-scoring competitor first; undefined when there are no competitors.
  const rankedCompetitors = competingSurfaces
    .map((surface) => ({
      skillName: surface.skillName,
      score: scoreQueryAgainstSkillSurface(trimmedQuery, surface),
    }))
    .sort((a, b) => b.score - a.score);
  const topCompetitor = rankedCompetitors[0];

  if (targetScore < HOST_REPLAY_MATCH_THRESHOLD) {
    return {
      triggered: false,
      evidence: "target routing and skill surface did not clear replay threshold",
    };
  }

  if (topCompetitor && topCompetitor.score >= targetScore) {
    return {
      triggered: false,
      evidence: `competing skill surface scored higher: ${topCompetitor.skillName}`,
    };
  }

  let evidence = "query aligned with target skill surface in replay fixture";
  if (triggerScore >= surfaceScore) {
    evidence =
      triggerScore === 1
        ? "query matched a routing trigger phrase exactly"
        : "query aligned with routing trigger language";
  }
  return { triggered: true, evidence };
}
|
|
212
|
+
|
|
213
|
+
/**
 * Replays every eval entry against a routing table inside the given fixture.
 *
 * The target and competing skill surfaces are loaded once, then each entry's
 * query is evaluated with `evaluateReplayTrigger`.
 *
 * @param options.routing Markdown routing table under validation.
 * @param options.evalSet Eval entries providing `query` and `should_trigger`.
 * @param options.fixture Fixture naming the target and competing SKILL.md paths.
 * @returns One result per eval entry, in input order; `passed` is true when
 *   the replay outcome equals the entry's `should_trigger`.
 */
export function runHostReplayFixture(options: {
  routing: string;
  evalSet: EvalEntry[];
  fixture: RoutingReplayFixture;
}): RoutingReplayEntryResult[] {
  const { routing, evalSet, fixture } = options;
  const targetSurface = loadReplaySkillSurface(fixture.target_skill_path);
  const competingSurfaces = fixture.competing_skill_paths.map((skillPath) =>
    loadReplaySkillSurface(skillPath),
  );

  return evalSet.map((entry) => {
    const { triggered, evidence } = evaluateReplayTrigger(
      entry.query,
      routing,
      targetSurface,
      competingSurfaces,
    );
    return {
      query: entry.query,
      should_trigger: entry.should_trigger,
      triggered,
      passed: triggered === entry.should_trigger,
      evidence,
    };
  });
}
|
|
@@ -40,6 +40,8 @@ export interface ValidationResult {
|
|
|
40
40
|
net_change: number; // after - before pass rate
|
|
41
41
|
by_invocation_type?: InvocationTypeScores;
|
|
42
42
|
per_entry_results?: Array<{ entry: EvalEntry; before_pass: boolean; after_pass: boolean }>;
|
|
43
|
+
validation_mode?: "llm_judge";
|
|
44
|
+
validation_agent?: string;
|
|
43
45
|
}
|
|
44
46
|
|
|
45
47
|
// ---------------------------------------------------------------------------
|
|
@@ -63,6 +65,8 @@ export async function validateProposalSequential(
|
|
|
63
65
|
regressions: [],
|
|
64
66
|
new_passes: [],
|
|
65
67
|
net_change: 0,
|
|
68
|
+
validation_mode: "llm_judge",
|
|
69
|
+
validation_agent: agent,
|
|
66
70
|
};
|
|
67
71
|
}
|
|
68
72
|
|
|
@@ -174,6 +178,8 @@ export async function validateProposalSequential(
|
|
|
174
178
|
net_change: netChange,
|
|
175
179
|
by_invocation_type: invocationScores,
|
|
176
180
|
per_entry_results: perEntryResults,
|
|
181
|
+
validation_mode: "llm_judge",
|
|
182
|
+
validation_agent: agent,
|
|
177
183
|
};
|
|
178
184
|
}
|
|
179
185
|
|
|
@@ -220,6 +226,8 @@ export async function validateProposalBatched(
|
|
|
220
226
|
regressions: [],
|
|
221
227
|
new_passes: [],
|
|
222
228
|
net_change: 0,
|
|
229
|
+
validation_mode: "llm_judge",
|
|
230
|
+
validation_agent: agent,
|
|
223
231
|
};
|
|
224
232
|
}
|
|
225
233
|
|
|
@@ -342,6 +350,8 @@ export async function validateProposalBatched(
|
|
|
342
350
|
net_change: netChange,
|
|
343
351
|
by_invocation_type: invocationScores,
|
|
344
352
|
per_entry_results: perEntryResults,
|
|
353
|
+
validation_mode: "llm_judge",
|
|
354
|
+
validation_agent: agent,
|
|
345
355
|
};
|
|
346
356
|
}
|
|
347
357
|
|