selftune 0.2.31 → 0.2.32
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +83 -56
- package/apps/local-dashboard/dist/assets/index-B-ut4w0B.js +15 -0
- package/apps/local-dashboard/dist/assets/index-BFGfCVrL.css +1 -0
- package/apps/local-dashboard/dist/assets/vendor-ui-DfowE3Hu.js +1 -0
- package/apps/local-dashboard/dist/index.html +3 -3
- package/cli/selftune/command-surface.ts +613 -2
- package/cli/selftune/create/baseline.ts +429 -0
- package/cli/selftune/create/check.ts +35 -0
- package/cli/selftune/create/init.ts +115 -0
- package/cli/selftune/create/package-candidate-state.ts +771 -0
- package/cli/selftune/create/package-evaluator.ts +710 -0
- package/cli/selftune/create/package-fingerprint.ts +142 -0
- package/cli/selftune/create/package-search.ts +377 -0
- package/cli/selftune/create/publish.ts +431 -0
- package/cli/selftune/create/readiness.ts +495 -0
- package/cli/selftune/create/replay.ts +330 -0
- package/cli/selftune/create/report.ts +74 -0
- package/cli/selftune/create/scaffold.ts +121 -0
- package/cli/selftune/create/skills-ref-adapter.ts +177 -0
- package/cli/selftune/create/status.ts +33 -0
- package/cli/selftune/create/templates.ts +249 -0
- package/cli/selftune/cron/setup.ts +1 -1
- package/cli/selftune/dashboard-action-events.ts +4 -1
- package/cli/selftune/dashboard-action-result.ts +789 -24
- package/cli/selftune/dashboard-action-stream.ts +80 -0
- package/cli/selftune/dashboard-contract.ts +146 -3
- package/cli/selftune/dashboard-server.ts +5 -4
- package/cli/selftune/eval/hooks-to-evals.ts +58 -35
- package/cli/selftune/eval/synthetic-evals.ts +145 -17
- package/cli/selftune/evolution/bounded-mutations.ts +1045 -0
- package/cli/selftune/evolution/evolve-body.ts +9 -36
- package/cli/selftune/evolution/evolve.ts +8 -72
- package/cli/selftune/evolution/stopping-criteria.ts +5 -13
- package/cli/selftune/evolution/unblock-suggestions.ts +0 -16
- package/cli/selftune/evolution/validate-host-replay.ts +115 -15
- package/cli/selftune/improve.ts +206 -0
- package/cli/selftune/index.ts +123 -6
- package/cli/selftune/init.ts +1 -1
- package/cli/selftune/localdb/queries/dashboard.ts +30 -0
- package/cli/selftune/localdb/schema.ts +52 -0
- package/cli/selftune/monitoring/watch.ts +257 -23
- package/cli/selftune/orchestrate/execute.ts +300 -1
- package/cli/selftune/orchestrate/finalize.ts +14 -0
- package/cli/selftune/orchestrate/plan.ts +22 -5
- package/cli/selftune/orchestrate/prepare.ts +59 -4
- package/cli/selftune/orchestrate/report.ts +1 -1
- package/cli/selftune/orchestrate.ts +34 -1
- package/cli/selftune/publish.ts +35 -0
- package/cli/selftune/routes/actions.ts +81 -15
- package/cli/selftune/routes/overview.ts +1 -1
- package/cli/selftune/routes/skill-report.ts +147 -2
- package/cli/selftune/run.ts +18 -0
- package/cli/selftune/schedule.ts +3 -3
- package/cli/selftune/search-run.ts +703 -0
- package/cli/selftune/status.ts +35 -11
- package/cli/selftune/testing-readiness.ts +431 -40
- package/cli/selftune/types.ts +316 -0
- package/cli/selftune/utils/eval-readiness.ts +1 -0
- package/cli/selftune/utils/json-output.ts +11 -0
- package/cli/selftune/utils/lifecycle-surface.ts +48 -0
- package/cli/selftune/utils/query-filter.ts +82 -1
- package/cli/selftune/utils/tui.ts +85 -2
- package/cli/selftune/verify.ts +205 -0
- package/cli/selftune/workflows/proposals.ts +1 -1
- package/cli/selftune/workflows/skill-scaffold.ts +141 -63
- package/cli/selftune/workflows/workflows.ts +4 -4
- package/package.json +1 -1
- package/skill/SKILL.md +148 -85
- package/skill/references/cli-quick-reference.md +16 -1
- package/skill/references/creator-playbook.md +31 -10
- package/skill/workflows/Baseline.md +8 -9
- package/skill/workflows/Contributions.md +4 -4
- package/skill/workflows/Create.md +173 -0
- package/skill/workflows/CreateTestDeploy.md +34 -30
- package/skill/workflows/Cron.md +2 -2
- package/skill/workflows/Dashboard.md +3 -3
- package/skill/workflows/Evals.md +13 -7
- package/skill/workflows/Evolve.md +75 -32
- package/skill/workflows/EvolveBody.md +22 -15
- package/skill/workflows/Hook.md +1 -1
- package/skill/workflows/Improve.md +168 -0
- package/skill/workflows/Initialize.md +3 -3
- package/skill/workflows/Orchestrate.md +49 -12
- package/skill/workflows/Publish.md +100 -0
- package/skill/workflows/Run.md +72 -0
- package/skill/workflows/Schedule.md +2 -2
- package/skill/workflows/SearchRun.md +89 -0
- package/skill/workflows/SignalsDashboard.md +2 -2
- package/skill/workflows/UnitTest.md +13 -4
- package/skill/workflows/Verify.md +136 -0
- package/skill/workflows/Watch.md +114 -47
- package/skill/workflows/Workflows.md +13 -8
- package/apps/local-dashboard/dist/assets/index-B7v_o1WC.js +0 -15
- package/apps/local-dashboard/dist/assets/index-CrO77SVi.css +0 -1
- package/apps/local-dashboard/dist/assets/vendor-ui-B0H8s1mP.js +0 -1
|
@@ -433,37 +433,6 @@ export async function evolveBody(
|
|
|
433
433
|
}
|
|
434
434
|
}
|
|
435
435
|
|
|
436
|
-
// Check confidence threshold
|
|
437
|
-
if (proposal.confidence < confidenceThreshold) {
|
|
438
|
-
recordAudit(
|
|
439
|
-
proposal.proposal_id,
|
|
440
|
-
"rejected",
|
|
441
|
-
`Confidence ${proposal.confidence} below threshold ${confidenceThreshold}`,
|
|
442
|
-
);
|
|
443
|
-
recordEvidence({
|
|
444
|
-
timestamp: new Date().toISOString(),
|
|
445
|
-
proposal_id: proposal.proposal_id,
|
|
446
|
-
skill_name: skillName,
|
|
447
|
-
skill_path: skillPath,
|
|
448
|
-
target,
|
|
449
|
-
stage: "rejected",
|
|
450
|
-
rationale: proposal.rationale,
|
|
451
|
-
confidence: proposal.confidence,
|
|
452
|
-
details: `Confidence ${proposal.confidence} below threshold ${confidenceThreshold}`,
|
|
453
|
-
});
|
|
454
|
-
|
|
455
|
-
if (iteration === maxIterations - 1) {
|
|
456
|
-
return {
|
|
457
|
-
proposal: lastProposal,
|
|
458
|
-
validation: null,
|
|
459
|
-
deployed: false,
|
|
460
|
-
auditEntries,
|
|
461
|
-
reason: `Confidence ${proposal.confidence} below threshold ${confidenceThreshold}`,
|
|
462
|
-
};
|
|
463
|
-
}
|
|
464
|
-
continue;
|
|
465
|
-
}
|
|
466
|
-
|
|
467
436
|
// Validate (validationModel overrides studentModel for validation calls)
|
|
468
437
|
const validationModelFlag = options.validationModel ?? studentModel;
|
|
469
438
|
let validation: BodyValidationResult;
|
|
@@ -544,6 +513,10 @@ export async function evolveBody(
|
|
|
544
513
|
}
|
|
545
514
|
lastValidation = validation;
|
|
546
515
|
const validatedEvidenceRef = buildValidationEvidenceRef(proposal.proposal_id, "validated");
|
|
516
|
+
const confidenceReviewNote =
|
|
517
|
+
proposal.confidence < confidenceThreshold
|
|
518
|
+
? ` (confidence ${proposal.confidence.toFixed(2)} below review threshold ${confidenceThreshold})`
|
|
519
|
+
: "";
|
|
547
520
|
|
|
548
521
|
recordAudit(
|
|
549
522
|
proposal.proposal_id,
|
|
@@ -552,7 +525,7 @@ export async function evolveBody(
|
|
|
552
525
|
validation.validation_fallback_reason
|
|
553
526
|
? ` (replay fallback: ${validation.validation_fallback_reason})`
|
|
554
527
|
: ""
|
|
555
|
-
}`,
|
|
528
|
+
}${confidenceReviewNote}`,
|
|
556
529
|
{
|
|
557
530
|
validation_mode: validation.validation_mode,
|
|
558
531
|
validation_agent: validation.validation_agent,
|
|
@@ -573,7 +546,7 @@ export async function evolveBody(
|
|
|
573
546
|
validation.validation_fallback_reason
|
|
574
547
|
? ` (replay fallback: ${validation.validation_fallback_reason})`
|
|
575
548
|
: ""
|
|
576
|
-
}`,
|
|
549
|
+
}${confidenceReviewNote}`,
|
|
577
550
|
validation: {
|
|
578
551
|
improved: validation.improved,
|
|
579
552
|
gates_passed: validation.gates_passed,
|
|
@@ -641,7 +614,7 @@ export async function evolveBody(
|
|
|
641
614
|
validation.validation_fallback_reason
|
|
642
615
|
? ` (replay fallback: ${validation.validation_fallback_reason})`
|
|
643
616
|
: ""
|
|
644
|
-
}`,
|
|
617
|
+
}${confidenceReviewNote}`,
|
|
645
618
|
{
|
|
646
619
|
validation_mode: validation.validation_mode,
|
|
647
620
|
validation_agent: validation.validation_agent,
|
|
@@ -662,7 +635,7 @@ export async function evolveBody(
|
|
|
662
635
|
validation.validation_fallback_reason
|
|
663
636
|
? ` (replay fallback: ${validation.validation_fallback_reason})`
|
|
664
637
|
: ""
|
|
665
|
-
}`,
|
|
638
|
+
}${confidenceReviewNote}`,
|
|
666
639
|
validation: {
|
|
667
640
|
improved: validation.improved,
|
|
668
641
|
gates_passed: validation.gates_passed,
|
|
@@ -886,7 +859,7 @@ Options:
|
|
|
886
859
|
--eval-set Path to eval set JSON
|
|
887
860
|
--dry-run Validate without deploying
|
|
888
861
|
--max-iterations Max refinement iterations (default: 3)
|
|
889
|
-
--confidence
|
|
862
|
+
--confidence Low-confidence review threshold 0.0-1.0 (default: 0.6)
|
|
890
863
|
--task-description Optional task description context
|
|
891
864
|
--few-shot Comma-separated paths to example skill files
|
|
892
865
|
--validation-model Model for trigger-check validation calls (overrides --student-model for validation)
|
|
@@ -79,7 +79,7 @@ export interface EvolveOptions {
|
|
|
79
79
|
evalSetPath?: string;
|
|
80
80
|
agent: string;
|
|
81
81
|
dryRun: boolean;
|
|
82
|
-
confidenceThreshold: number; // default 0.6
|
|
82
|
+
confidenceThreshold: number; // warning/review threshold, default 0.6
|
|
83
83
|
maxIterations: number; // default 3
|
|
84
84
|
gradingResults?: GradingResult[];
|
|
85
85
|
paretoEnabled?: boolean;
|
|
@@ -713,23 +713,9 @@ export async function evolve(
|
|
|
713
713
|
);
|
|
714
714
|
llmCallCount += candidateCount;
|
|
715
715
|
|
|
716
|
-
// Filter by confidence threshold
|
|
717
|
-
const viableCandidates = candidates.filter((c) => c.confidence >= confidenceThreshold);
|
|
718
|
-
|
|
719
|
-
if (viableCandidates.length === 0) {
|
|
720
|
-
finishTui();
|
|
721
|
-
return withStats({
|
|
722
|
-
proposal: candidates[0] ?? null,
|
|
723
|
-
validation: null,
|
|
724
|
-
deployed: false,
|
|
725
|
-
auditEntries,
|
|
726
|
-
reason: `No candidates met confidence threshold ${confidenceThreshold}`,
|
|
727
|
-
});
|
|
728
|
-
}
|
|
729
|
-
|
|
730
716
|
// Validate each candidate
|
|
731
717
|
const paretoCandidates: ParetoCandidate[] = [];
|
|
732
|
-
for (const proposal of
|
|
718
|
+
for (const proposal of candidates) {
|
|
733
719
|
recordAudit(
|
|
734
720
|
proposal.proposal_id,
|
|
735
721
|
"created",
|
|
@@ -855,7 +841,7 @@ export async function evolve(
|
|
|
855
841
|
if (paretoCandidates.length === 0) {
|
|
856
842
|
finishTui();
|
|
857
843
|
return withStats({
|
|
858
|
-
proposal:
|
|
844
|
+
proposal: candidates[0] ?? null,
|
|
859
845
|
validation: null,
|
|
860
846
|
deployed: false,
|
|
861
847
|
auditEntries,
|
|
@@ -932,15 +918,12 @@ export async function evolve(
|
|
|
932
918
|
// Re-evaluate stopping after a constitutional rejection by treating the
|
|
933
919
|
// last entry in previousPassRates as the currentPassRate (or 0 on the
|
|
934
920
|
// first iteration) and slicing it out of history before calling
|
|
935
|
-
// evaluateStoppingCriteria() with the current iteration/maxIterations
|
|
936
|
-
// confidenceThreshold, and proposal.confidence.
|
|
921
|
+
// evaluateStoppingCriteria() with the current iteration/maxIterations.
|
|
937
922
|
const constitutionStop = evaluateStoppingCriteria(
|
|
938
923
|
previousPassRates.at(-1) ?? 0,
|
|
939
924
|
previousPassRates.slice(0, -1),
|
|
940
925
|
iteration + 1,
|
|
941
926
|
maxIterations,
|
|
942
|
-
confidenceThreshold,
|
|
943
|
-
proposal.confidence,
|
|
944
927
|
);
|
|
945
928
|
recordAudit(
|
|
946
929
|
proposal.proposal_id,
|
|
@@ -971,52 +954,7 @@ export async function evolve(
|
|
|
971
954
|
continue;
|
|
972
955
|
}
|
|
973
956
|
|
|
974
|
-
// Step 9:
|
|
975
|
-
{
|
|
976
|
-
const preValidationStop = evaluateStoppingCriteria(
|
|
977
|
-
previousPassRates.at(-1) ?? 0,
|
|
978
|
-
previousPassRates.slice(0, -1),
|
|
979
|
-
iteration + 1,
|
|
980
|
-
maxIterations,
|
|
981
|
-
confidenceThreshold,
|
|
982
|
-
proposal.confidence,
|
|
983
|
-
);
|
|
984
|
-
if (proposal.confidence < confidenceThreshold) {
|
|
985
|
-
feedbackReason = `Confidence ${proposal.confidence} below threshold ${confidenceThreshold}`;
|
|
986
|
-
recordAudit(
|
|
987
|
-
proposal.proposal_id,
|
|
988
|
-
"rejected",
|
|
989
|
-
`${feedbackReason} (stopping: ${preValidationStop.reason})`,
|
|
990
|
-
);
|
|
991
|
-
recordEvidence({
|
|
992
|
-
timestamp: new Date().toISOString(),
|
|
993
|
-
proposal_id: proposal.proposal_id,
|
|
994
|
-
skill_name: skillName,
|
|
995
|
-
skill_path: skillPath,
|
|
996
|
-
target: "description",
|
|
997
|
-
stage: "rejected",
|
|
998
|
-
rationale: proposal.rationale,
|
|
999
|
-
confidence: proposal.confidence,
|
|
1000
|
-
details: `${feedbackReason} (stopping: ${preValidationStop.reason})`,
|
|
1001
|
-
});
|
|
1002
|
-
|
|
1003
|
-
// Use stopping criteria to decide whether to return or retry
|
|
1004
|
-
if (preValidationStop.shouldStop) {
|
|
1005
|
-
finishTui();
|
|
1006
|
-
return withStats({
|
|
1007
|
-
proposal: lastProposal,
|
|
1008
|
-
validation: null,
|
|
1009
|
-
deployed: false,
|
|
1010
|
-
auditEntries,
|
|
1011
|
-
reason: `${feedbackReason} (${preValidationStop.reason})`,
|
|
1012
|
-
});
|
|
1013
|
-
}
|
|
1014
|
-
|
|
1015
|
-
continue;
|
|
1016
|
-
}
|
|
1017
|
-
}
|
|
1018
|
-
|
|
1019
|
-
// Step 10: Validate against eval set
|
|
957
|
+
// Step 9: Validate against eval set
|
|
1020
958
|
const batchCount = Math.ceil(evalSet.length / TRIGGER_CHECK_BATCH_SIZE);
|
|
1021
959
|
tui.step(
|
|
1022
960
|
`Validating ${evalSet.length} entries (mode=${effectiveValidationMode}, ${batchCount} batches, ${VALIDATION_RUNS}x majority-vote)...`,
|
|
@@ -1038,7 +976,7 @@ export async function evolve(
|
|
|
1038
976
|
`Validation: ${(validation.before_pass_rate * 100).toFixed(1)}% \u2192 ${(validation.after_pass_rate * 100).toFixed(1)}% (improved: ${validation.improved})`,
|
|
1039
977
|
);
|
|
1040
978
|
|
|
1041
|
-
// Step
|
|
979
|
+
// Step 10: Audit "validated"
|
|
1042
980
|
const evalSnapshot: EvalPassRate = {
|
|
1043
981
|
total: evalSet.length,
|
|
1044
982
|
passed: Math.round(validation.after_pass_rate * evalSet.length),
|
|
@@ -1094,14 +1032,12 @@ export async function evolve(
|
|
|
1094
1032
|
},
|
|
1095
1033
|
});
|
|
1096
1034
|
|
|
1097
|
-
// Step
|
|
1035
|
+
// Step 11: Evaluate stopping criteria after validation
|
|
1098
1036
|
const stopping = evaluateStoppingCriteria(
|
|
1099
1037
|
validation.after_pass_rate,
|
|
1100
1038
|
previousPassRates,
|
|
1101
1039
|
iteration + 1,
|
|
1102
1040
|
maxIterations,
|
|
1103
|
-
confidenceThreshold,
|
|
1104
|
-
proposal.confidence,
|
|
1105
1041
|
);
|
|
1106
1042
|
previousPassRates.push(validation.after_pass_rate);
|
|
1107
1043
|
|
|
@@ -1710,7 +1646,7 @@ export async function cliMain(): Promise<void> {
|
|
|
1710
1646
|
result.proposal.confidence < Number.parseFloat(values.confidence ?? "0.6")
|
|
1711
1647
|
) {
|
|
1712
1648
|
console.error(
|
|
1713
|
-
` Confidence ${result.proposal.confidence.toFixed(2)} below threshold ${values.confidence ?? "0.6"}`,
|
|
1649
|
+
` Confidence ${result.proposal.confidence.toFixed(2)} below review threshold ${values.confidence ?? "0.6"} (validated anyway)`,
|
|
1714
1650
|
);
|
|
1715
1651
|
}
|
|
1716
1652
|
// Targeted suggestions based on specific failure reason
|
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
* stopping-criteria.ts
|
|
3
3
|
*
|
|
4
4
|
* Evaluates whether the evolution loop should stop based on convergence,
|
|
5
|
-
* iteration limits,
|
|
5
|
+
* iteration limits, and plateau detection.
|
|
6
6
|
* Pure function module with no external dependencies.
|
|
7
7
|
*/
|
|
8
8
|
|
|
@@ -25,17 +25,14 @@ export interface StoppingDecision {
|
|
|
25
25
|
* Checks conditions in priority order:
|
|
26
26
|
* 1. Converged (pass rate >= 95%)
|
|
27
27
|
* 2. Max iterations reached
|
|
28
|
-
* 3.
|
|
29
|
-
* 4.
|
|
30
|
-
* 5. Continue (none of the above)
|
|
28
|
+
* 3. Plateau (< 1% variation over last 3 iterations)
|
|
29
|
+
* 4. Continue (none of the above)
|
|
31
30
|
*/
|
|
32
31
|
export function evaluateStoppingCriteria(
|
|
33
32
|
currentPassRate: number,
|
|
34
33
|
previousPassRates: number[],
|
|
35
34
|
iterationCount: number,
|
|
36
35
|
maxIterations: number,
|
|
37
|
-
confidenceThreshold: number,
|
|
38
|
-
proposalConfidence: number,
|
|
39
36
|
): StoppingDecision {
|
|
40
37
|
// 1. Converged
|
|
41
38
|
if (currentPassRate >= 0.95) {
|
|
@@ -47,12 +44,7 @@ export function evaluateStoppingCriteria(
|
|
|
47
44
|
return { shouldStop: true, reason: "Max iterations reached" };
|
|
48
45
|
}
|
|
49
46
|
|
|
50
|
-
// 3.
|
|
51
|
-
if (proposalConfidence < confidenceThreshold) {
|
|
52
|
-
return { shouldStop: true, reason: "Confidence below threshold" };
|
|
53
|
-
}
|
|
54
|
-
|
|
55
|
-
// 4. Plateau detection: need at least 2 previous rates to form 3 data points
|
|
47
|
+
// 3. Plateau detection: need at least 2 previous rates to form 3 data points
|
|
56
48
|
if (previousPassRates.length >= 2) {
|
|
57
49
|
const last2Previous = previousPassRates.slice(-2);
|
|
58
50
|
const window = [...last2Previous, currentPassRate];
|
|
@@ -64,6 +56,6 @@ export function evaluateStoppingCriteria(
|
|
|
64
56
|
}
|
|
65
57
|
}
|
|
66
58
|
|
|
67
|
-
//
|
|
59
|
+
// 4. Continue
|
|
68
60
|
return { shouldStop: false, reason: "Continuing: improvement possible" };
|
|
69
61
|
}
|
|
@@ -83,22 +83,6 @@ export function buildUnblockSuggestions(result: EvolveResult, skillName: string)
|
|
|
83
83
|
return suggestions;
|
|
84
84
|
}
|
|
85
85
|
|
|
86
|
-
// --- Confidence failures (specific before general) ---
|
|
87
|
-
if (reason.includes("No candidates met confidence")) {
|
|
88
|
-
suggestions.push(`Lower the threshold: selftune evolve --skill ${skillName} --confidence 0.4`);
|
|
89
|
-
suggestions.push(
|
|
90
|
-
`Or increase candidates: selftune evolve --skill ${skillName} --pareto --candidates 5`,
|
|
91
|
-
);
|
|
92
|
-
appendQualityHints(suggestions, descText, skillName);
|
|
93
|
-
return suggestions;
|
|
94
|
-
}
|
|
95
|
-
if (reason.toLowerCase().includes("confidence") && reason.includes("threshold")) {
|
|
96
|
-
suggestions.push(`Lower the threshold: selftune evolve --skill ${skillName} --confidence 0.4`);
|
|
97
|
-
suggestions.push("Or add more eval entries so the LLM has more context for proposals");
|
|
98
|
-
appendQualityHints(suggestions, descText, skillName);
|
|
99
|
-
return suggestions;
|
|
100
|
-
}
|
|
101
|
-
|
|
102
86
|
// --- Validation failures (proposals regressed) ---
|
|
103
87
|
if (reason.includes("Validation failed after")) {
|
|
104
88
|
suggestions.push(
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import {
|
|
2
2
|
existsSync,
|
|
3
|
+
copyFileSync,
|
|
3
4
|
mkdirSync,
|
|
4
5
|
mkdtempSync,
|
|
5
6
|
readFileSync,
|
|
@@ -16,7 +17,13 @@ import {
|
|
|
16
17
|
emitDashboardActionMetrics,
|
|
17
18
|
emitDashboardActionProgress,
|
|
18
19
|
} from "../dashboard-action-events.js";
|
|
19
|
-
import type {
|
|
20
|
+
import type {
|
|
21
|
+
EvalEntry,
|
|
22
|
+
ReplayStagingMode,
|
|
23
|
+
RuntimeReplayEntryMetrics,
|
|
24
|
+
RoutingReplayEntryResult,
|
|
25
|
+
RoutingReplayFixture,
|
|
26
|
+
} from "../types.js";
|
|
20
27
|
import type { DashboardActionMetrics } from "../dashboard-contract.js";
|
|
21
28
|
import { parseFrontmatter } from "../utils/frontmatter.js";
|
|
22
29
|
import {
|
|
@@ -45,6 +52,7 @@ interface ReplayWorkspace {
|
|
|
45
52
|
skillRegistryDir: string;
|
|
46
53
|
targetSkillPath: string;
|
|
47
54
|
competingSkillPaths: string[];
|
|
55
|
+
allowedReadRoots: string[];
|
|
48
56
|
}
|
|
49
57
|
|
|
50
58
|
export type RuntimeReplayContentTarget = "routing" | "description" | "body";
|
|
@@ -65,6 +73,7 @@ export interface RuntimeReplayObservation {
|
|
|
65
73
|
rawOutput: string;
|
|
66
74
|
sessionId?: string;
|
|
67
75
|
runtimeError?: string;
|
|
76
|
+
metrics?: DashboardActionMetrics;
|
|
68
77
|
}
|
|
69
78
|
|
|
70
79
|
export type RuntimeReplayInvoker = (
|
|
@@ -162,6 +171,7 @@ export function buildRoutingReplayFixture(options: {
|
|
|
162
171
|
platform?: RoutingReplayFixture["platform"];
|
|
163
172
|
fixtureId?: string;
|
|
164
173
|
workspaceRoot?: string;
|
|
174
|
+
stagingMode?: ReplayStagingMode;
|
|
165
175
|
}): RoutingReplayFixture {
|
|
166
176
|
const targetSkillPath = resolveReplayPath(options.skillPath);
|
|
167
177
|
const workspaceRoot =
|
|
@@ -175,6 +185,7 @@ export function buildRoutingReplayFixture(options: {
|
|
|
175
185
|
target_skill_path: targetSkillPath,
|
|
176
186
|
competing_skill_paths: listCompetingSkillPaths(targetSkillPath),
|
|
177
187
|
...(workspaceRoot ? { workspace_root: workspaceRoot } : {}),
|
|
188
|
+
...(options.stagingMode ? { skill_staging_mode: options.stagingMode } : {}),
|
|
178
189
|
};
|
|
179
190
|
}
|
|
180
191
|
|
|
@@ -193,14 +204,32 @@ function buildRuntimeReplayTargetContent(
|
|
|
193
204
|
return replaceSection(currentContent, "Workflow Routing", content.trim());
|
|
194
205
|
}
|
|
195
206
|
|
|
207
|
+
function copyDirectoryRecursive(sourceDir: string, destinationDir: string): void {
|
|
208
|
+
mkdirSync(destinationDir, { recursive: true });
|
|
209
|
+
for (const entry of readdirSync(sourceDir, { withFileTypes: true })) {
|
|
210
|
+
const sourcePath = join(sourceDir, entry.name);
|
|
211
|
+
const destinationPath = join(destinationDir, entry.name);
|
|
212
|
+
if (entry.isDirectory()) {
|
|
213
|
+
copyDirectoryRecursive(sourcePath, destinationPath);
|
|
214
|
+
continue;
|
|
215
|
+
}
|
|
216
|
+
copyFileSync(sourcePath, destinationPath);
|
|
217
|
+
}
|
|
218
|
+
}
|
|
219
|
+
|
|
196
220
|
function stageReplaySkill(
|
|
197
221
|
registryDir: string,
|
|
198
222
|
sourceSkillPath: string,
|
|
223
|
+
stagingMode: ReplayStagingMode,
|
|
199
224
|
overrideContent?: string,
|
|
200
225
|
): string {
|
|
201
226
|
const skillDirName = basename(dirname(sourceSkillPath)) || "unknown-skill";
|
|
202
227
|
const destinationDir = join(registryDir, skillDirName);
|
|
203
|
-
|
|
228
|
+
if (stagingMode === "package") {
|
|
229
|
+
copyDirectoryRecursive(dirname(sourceSkillPath), destinationDir);
|
|
230
|
+
} else {
|
|
231
|
+
mkdirSync(destinationDir, { recursive: true });
|
|
232
|
+
}
|
|
204
233
|
const destinationPath = join(destinationDir, "SKILL.md");
|
|
205
234
|
const content = overrideContent ?? readFileSync(sourceSkillPath, "utf8");
|
|
206
235
|
writeFileSync(destinationPath, content, "utf8");
|
|
@@ -211,27 +240,43 @@ function buildRuntimeReplayWorkspace(
|
|
|
211
240
|
fixture: RoutingReplayFixture,
|
|
212
241
|
content: string,
|
|
213
242
|
contentTarget: RuntimeReplayContentTarget,
|
|
243
|
+
includeTargetSkill: boolean = true,
|
|
214
244
|
): ReplayWorkspace {
|
|
215
245
|
const rootDir = mkdtempSync(join(tmpdir(), "selftune-runtime-replay-"));
|
|
216
246
|
try {
|
|
217
247
|
const registryDir = join(rootDir, getRuntimeReplayRegistryRelativeDir(fixture.platform));
|
|
218
248
|
mkdirSync(join(rootDir, ".git"), { recursive: true });
|
|
219
249
|
mkdirSync(registryDir, { recursive: true });
|
|
220
|
-
|
|
221
|
-
const
|
|
250
|
+
const stagingMode = fixture.skill_staging_mode ?? "routing";
|
|
251
|
+
const allowedReadRoots: string[] = [];
|
|
252
|
+
const targetSkillDir = join(
|
|
222
253
|
registryDir,
|
|
223
|
-
fixture.target_skill_path,
|
|
224
|
-
buildRuntimeReplayTargetContent(fixture.target_skill_path, content, contentTarget),
|
|
254
|
+
basename(dirname(fixture.target_skill_path)) || "unknown-skill",
|
|
225
255
|
);
|
|
256
|
+
|
|
257
|
+
const targetSkillPath = join(targetSkillDir, "SKILL.md");
|
|
258
|
+
if (includeTargetSkill) {
|
|
259
|
+
const stagedTargetSkillPath = stageReplaySkill(
|
|
260
|
+
registryDir,
|
|
261
|
+
fixture.target_skill_path,
|
|
262
|
+
stagingMode,
|
|
263
|
+
buildRuntimeReplayTargetContent(fixture.target_skill_path, content, contentTarget),
|
|
264
|
+
);
|
|
265
|
+
allowedReadRoots.push(dirname(stagedTargetSkillPath));
|
|
266
|
+
}
|
|
226
267
|
const competingSkillPaths = fixture.competing_skill_paths.map((skillPath) =>
|
|
227
|
-
stageReplaySkill(registryDir, skillPath),
|
|
268
|
+
stageReplaySkill(registryDir, skillPath, stagingMode),
|
|
228
269
|
);
|
|
270
|
+
for (const skillPath of competingSkillPaths) {
|
|
271
|
+
allowedReadRoots.push(dirname(skillPath));
|
|
272
|
+
}
|
|
229
273
|
|
|
230
274
|
return {
|
|
231
275
|
rootDir,
|
|
232
276
|
skillRegistryDir: registryDir,
|
|
233
277
|
targetSkillPath,
|
|
234
278
|
competingSkillPaths,
|
|
279
|
+
allowedReadRoots,
|
|
235
280
|
};
|
|
236
281
|
} catch (error) {
|
|
237
282
|
rmSync(rootDir, { recursive: true, force: true });
|
|
@@ -433,6 +478,42 @@ export function extractClaudeRuntimeReplayMetrics(line: string): DashboardAction
|
|
|
433
478
|
return null;
|
|
434
479
|
}
|
|
435
480
|
|
|
481
|
+
function mergeRuntimeReplayDashboardMetrics(
|
|
482
|
+
previous: DashboardActionMetrics | null,
|
|
483
|
+
next: DashboardActionMetrics,
|
|
484
|
+
): DashboardActionMetrics {
|
|
485
|
+
if (!previous) return next;
|
|
486
|
+
|
|
487
|
+
return {
|
|
488
|
+
platform: next.platform ?? previous.platform,
|
|
489
|
+
model: next.model ?? previous.model,
|
|
490
|
+
session_id: next.session_id ?? previous.session_id,
|
|
491
|
+
input_tokens: next.input_tokens ?? previous.input_tokens,
|
|
492
|
+
output_tokens: next.output_tokens ?? previous.output_tokens,
|
|
493
|
+
cache_creation_input_tokens:
|
|
494
|
+
next.cache_creation_input_tokens ?? previous.cache_creation_input_tokens,
|
|
495
|
+
cache_read_input_tokens: next.cache_read_input_tokens ?? previous.cache_read_input_tokens,
|
|
496
|
+
total_cost_usd: next.total_cost_usd ?? previous.total_cost_usd,
|
|
497
|
+
duration_ms: next.duration_ms ?? previous.duration_ms,
|
|
498
|
+
num_turns: next.num_turns ?? previous.num_turns,
|
|
499
|
+
};
|
|
500
|
+
}
|
|
501
|
+
|
|
502
|
+
function buildRuntimeReplayEntryMetrics(
|
|
503
|
+
metrics: DashboardActionMetrics | undefined,
|
|
504
|
+
elapsedMs: number,
|
|
505
|
+
): RuntimeReplayEntryMetrics {
|
|
506
|
+
return {
|
|
507
|
+
input_tokens: metrics?.input_tokens ?? null,
|
|
508
|
+
output_tokens: metrics?.output_tokens ?? null,
|
|
509
|
+
cache_creation_input_tokens: metrics?.cache_creation_input_tokens ?? null,
|
|
510
|
+
cache_read_input_tokens: metrics?.cache_read_input_tokens ?? null,
|
|
511
|
+
total_cost_usd: metrics?.total_cost_usd ?? null,
|
|
512
|
+
duration_ms: metrics?.duration_ms ?? elapsedMs,
|
|
513
|
+
num_turns: metrics?.num_turns ?? null,
|
|
514
|
+
};
|
|
515
|
+
}
|
|
516
|
+
|
|
436
517
|
async function readStreamText(
|
|
437
518
|
stream: ReadableStream<Uint8Array> | null | undefined,
|
|
438
519
|
onLine?: (line: string) => void,
|
|
@@ -725,10 +806,14 @@ async function invokeClaudeRuntimeReplay(
|
|
|
725
806
|
});
|
|
726
807
|
const timeout = setTimeout(() => proc.kill(), CLAUDE_RUNTIME_REPLAY_TIMEOUT_MS);
|
|
727
808
|
|
|
809
|
+
let latestMetrics: DashboardActionMetrics | null = null;
|
|
728
810
|
const [stdoutText, stderrText, exitCode] = await Promise.all([
|
|
729
811
|
readStreamText(proc.stdout, (line) => {
|
|
730
812
|
const metrics = extractClaudeRuntimeReplayMetrics(line);
|
|
731
|
-
if (metrics)
|
|
813
|
+
if (metrics) {
|
|
814
|
+
latestMetrics = mergeRuntimeReplayDashboardMetrics(latestMetrics, metrics);
|
|
815
|
+
emitDashboardActionMetrics(latestMetrics);
|
|
816
|
+
}
|
|
732
817
|
}),
|
|
733
818
|
new Response(proc.stderr).text(),
|
|
734
819
|
proc.exited,
|
|
@@ -746,6 +831,7 @@ async function invokeClaudeRuntimeReplay(
|
|
|
746
831
|
|
|
747
832
|
return {
|
|
748
833
|
...observation,
|
|
834
|
+
...(latestMetrics ? { metrics: latestMetrics } : {}),
|
|
749
835
|
...(combinedError ? { runtimeError: combinedError } : {}),
|
|
750
836
|
};
|
|
751
837
|
}
|
|
@@ -850,10 +936,9 @@ function evaluateRuntimeReplayObservation(
|
|
|
850
936
|
const normalizedReadPaths = new Set(
|
|
851
937
|
observation.readSkillPaths.map((path) => resolveObservedReplayPath(path, workspace.rootDir)),
|
|
852
938
|
);
|
|
853
|
-
const
|
|
854
|
-
|
|
855
|
-
|
|
856
|
-
]);
|
|
939
|
+
const allowedReadRoots = workspace.allowedReadRoots.map(resolveReplayPath);
|
|
940
|
+
const isAllowedReadPath = (path: string): boolean =>
|
|
941
|
+
allowedReadRoots.some((root) => path === root || path.startsWith(`${root}/`));
|
|
857
942
|
const targetSkillName = fixture.target_skill_name.trim();
|
|
858
943
|
const targetTriggered = observation.triggeredSkillNames.includes(targetSkillName);
|
|
859
944
|
const competingTriggered = observation.triggeredSkillNames.find((skillName) =>
|
|
@@ -864,10 +949,16 @@ function evaluateRuntimeReplayObservation(
|
|
|
864
949
|
const unrelatedTriggered = observation.triggeredSkillNames.find(
|
|
865
950
|
(skillName) => skillName.trim() !== targetSkillName && skillName.trim() !== competingTriggered,
|
|
866
951
|
);
|
|
867
|
-
const unrelatedReadPaths = [...normalizedReadPaths].filter((path) => !
|
|
868
|
-
const
|
|
952
|
+
const unrelatedReadPaths = [...normalizedReadPaths].filter((path) => !isAllowedReadPath(path));
|
|
953
|
+
const targetReadRoot = resolveReplayPath(dirname(workspace.targetSkillPath));
|
|
954
|
+
const targetRead = [...normalizedReadPaths].some(
|
|
955
|
+
(path) => path === targetReadRoot || path.startsWith(`${targetReadRoot}/`),
|
|
956
|
+
);
|
|
869
957
|
const competingRead = workspace.competingSkillPaths.find((skillPath) =>
|
|
870
|
-
normalizedReadPaths.
|
|
958
|
+
[...normalizedReadPaths].some((path) => {
|
|
959
|
+
const root = resolveReplayPath(dirname(skillPath));
|
|
960
|
+
return path === root || path.startsWith(`${root}/`);
|
|
961
|
+
}),
|
|
871
962
|
);
|
|
872
963
|
const sessionPrefix = observation.sessionId
|
|
873
964
|
? `runtime replay session ${observation.sessionId}`
|
|
@@ -1126,6 +1217,7 @@ export function buildRuntimeReplayValidationOptions(options: {
|
|
|
1126
1217
|
skillPath: string;
|
|
1127
1218
|
agent: string | null | undefined;
|
|
1128
1219
|
contentTarget?: RuntimeReplayContentTarget;
|
|
1220
|
+
stagingMode?: ReplayStagingMode;
|
|
1129
1221
|
}): ReplayValidationOptions | undefined {
|
|
1130
1222
|
const platform = resolveRuntimeReplayPlatform(options.agent);
|
|
1131
1223
|
if (!platform) return undefined;
|
|
@@ -1135,6 +1227,7 @@ export function buildRuntimeReplayValidationOptions(options: {
|
|
|
1135
1227
|
skillName: options.skillName,
|
|
1136
1228
|
skillPath: options.skillPath,
|
|
1137
1229
|
platform,
|
|
1230
|
+
stagingMode: options.stagingMode,
|
|
1138
1231
|
});
|
|
1139
1232
|
|
|
1140
1233
|
return {
|
|
@@ -1157,6 +1250,7 @@ export async function runHostRuntimeReplayFixture(options: {
|
|
|
1157
1250
|
evalSet: EvalEntry[];
|
|
1158
1251
|
fixture: RoutingReplayFixture;
|
|
1159
1252
|
contentTarget?: RuntimeReplayContentTarget;
|
|
1253
|
+
includeTargetSkill?: boolean;
|
|
1160
1254
|
runtimeInvoker?: RuntimeReplayInvoker;
|
|
1161
1255
|
}): Promise<RoutingReplayEntryResult[]> {
|
|
1162
1256
|
const invokeRuntime =
|
|
@@ -1168,6 +1262,7 @@ export async function runHostRuntimeReplayFixture(options: {
|
|
|
1168
1262
|
options.fixture,
|
|
1169
1263
|
options.routing,
|
|
1170
1264
|
options.contentTarget ?? "routing",
|
|
1265
|
+
options.includeTargetSkill ?? true,
|
|
1171
1266
|
);
|
|
1172
1267
|
const results: RoutingReplayEntryResult[] = [];
|
|
1173
1268
|
const total = options.evalSet.length;
|
|
@@ -1175,6 +1270,7 @@ export async function runHostRuntimeReplayFixture(options: {
|
|
|
1175
1270
|
for (const [index, entry] of options.evalSet.entries()) {
|
|
1176
1271
|
const current = index + 1;
|
|
1177
1272
|
const querySnippet = truncateReplayText(entry.query, 120);
|
|
1273
|
+
const startedAt = Date.now();
|
|
1178
1274
|
|
|
1179
1275
|
emitDashboardActionProgress({
|
|
1180
1276
|
current,
|
|
@@ -1201,6 +1297,10 @@ export async function runHostRuntimeReplayFixture(options: {
|
|
|
1201
1297
|
observation,
|
|
1202
1298
|
workspace,
|
|
1203
1299
|
);
|
|
1300
|
+
result.runtime_metrics = buildRuntimeReplayEntryMetrics(
|
|
1301
|
+
observation.metrics,
|
|
1302
|
+
Date.now() - startedAt,
|
|
1303
|
+
);
|
|
1204
1304
|
results.push(result);
|
|
1205
1305
|
|
|
1206
1306
|
emitDashboardActionProgress({
|