selftune 0.2.6 → 0.2.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -0
- package/apps/local-dashboard/dist/assets/index-Bk9vSHHd.js +15 -0
- package/apps/local-dashboard/dist/assets/index-CRtLkBTi.css +1 -0
- package/apps/local-dashboard/dist/assets/vendor-react-BQH_6WrG.js +60 -0
- package/apps/local-dashboard/dist/assets/{vendor-table-B7VF2Ipl.js → vendor-table-dK1QMLq9.js} +1 -1
- package/apps/local-dashboard/dist/assets/{vendor-ui-r2k_Ku_V.js → vendor-ui-CO2mrx6e.js} +60 -65
- package/apps/local-dashboard/dist/index.html +5 -5
- package/cli/selftune/activation-rules.ts +30 -9
- package/cli/selftune/agent-guidance.ts +96 -0
- package/cli/selftune/alpha-identity.ts +157 -0
- package/cli/selftune/alpha-upload/build-payloads.ts +151 -0
- package/cli/selftune/alpha-upload/client.ts +113 -0
- package/cli/selftune/alpha-upload/flush.ts +191 -0
- package/cli/selftune/alpha-upload/index.ts +194 -0
- package/cli/selftune/alpha-upload/queue.ts +252 -0
- package/cli/selftune/alpha-upload/stage-canonical.ts +242 -0
- package/cli/selftune/alpha-upload-contract.ts +52 -0
- package/cli/selftune/auth/device-code.ts +110 -0
- package/cli/selftune/auto-update.ts +130 -0
- package/cli/selftune/badge/badge.ts +19 -9
- package/cli/selftune/canonical-export.ts +16 -3
- package/cli/selftune/constants.ts +28 -8
- package/cli/selftune/contribute/bundle.ts +32 -5
- package/cli/selftune/dashboard-contract.ts +32 -1
- package/cli/selftune/dashboard-server.ts +256 -692
- package/cli/selftune/dashboard.ts +1 -1
- package/cli/selftune/eval/baseline.ts +11 -7
- package/cli/selftune/eval/hooks-to-evals.ts +27 -9
- package/cli/selftune/eval/synthetic-evals.ts +54 -1
- package/cli/selftune/evolution/audit.ts +24 -19
- package/cli/selftune/evolution/constitutional.ts +176 -0
- package/cli/selftune/evolution/evidence.ts +18 -13
- package/cli/selftune/evolution/evolve-body.ts +104 -7
- package/cli/selftune/evolution/evolve.ts +195 -22
- package/cli/selftune/evolution/propose-body.ts +18 -1
- package/cli/selftune/evolution/propose-description.ts +27 -2
- package/cli/selftune/evolution/rollback.ts +11 -15
- package/cli/selftune/export.ts +84 -0
- package/cli/selftune/grading/auto-grade.ts +13 -4
- package/cli/selftune/grading/grade-session.ts +16 -6
- package/cli/selftune/hooks/evolution-guard.ts +26 -9
- package/cli/selftune/hooks/prompt-log.ts +23 -9
- package/cli/selftune/hooks/session-stop.ts +78 -15
- package/cli/selftune/hooks/skill-eval.ts +189 -10
- package/cli/selftune/index.ts +274 -2
- package/cli/selftune/ingestors/claude-replay.ts +48 -21
- package/cli/selftune/init.ts +249 -47
- package/cli/selftune/last.ts +7 -7
- package/cli/selftune/localdb/db.ts +90 -10
- package/cli/selftune/localdb/direct-write.ts +531 -0
- package/cli/selftune/localdb/materialize.ts +296 -42
- package/cli/selftune/localdb/queries.ts +325 -32
- package/cli/selftune/localdb/schema.ts +109 -0
- package/cli/selftune/monitoring/watch.ts +26 -8
- package/cli/selftune/normalization.ts +85 -15
- package/cli/selftune/observability.ts +248 -2
- package/cli/selftune/orchestrate.ts +165 -20
- package/cli/selftune/quickstart.ts +34 -10
- package/cli/selftune/repair/skill-usage.ts +12 -2
- package/cli/selftune/routes/actions.ts +77 -0
- package/cli/selftune/routes/badge.ts +66 -0
- package/cli/selftune/routes/doctor.ts +12 -0
- package/cli/selftune/routes/index.ts +14 -0
- package/cli/selftune/routes/orchestrate-runs.ts +13 -0
- package/cli/selftune/routes/overview.ts +14 -0
- package/cli/selftune/routes/report.ts +293 -0
- package/cli/selftune/routes/skill-report.ts +230 -0
- package/cli/selftune/status.ts +203 -7
- package/cli/selftune/sync.ts +13 -1
- package/cli/selftune/types.ts +50 -0
- package/cli/selftune/utils/jsonl.ts +58 -1
- package/cli/selftune/utils/selftune-meta.ts +38 -0
- package/cli/selftune/utils/skill-log.ts +30 -4
- package/cli/selftune/utils/transcript.ts +15 -0
- package/cli/selftune/workflows/workflows.ts +7 -6
- package/package.json +10 -6
- package/packages/telemetry-contract/fixtures/complete-push.ts +184 -0
- package/packages/telemetry-contract/fixtures/evidence-only-push.ts +58 -0
- package/packages/telemetry-contract/fixtures/golden.json +1 -0
- package/packages/telemetry-contract/fixtures/index.ts +4 -0
- package/packages/telemetry-contract/fixtures/partial-push-no-sessions.ts +40 -0
- package/packages/telemetry-contract/fixtures/partial-push-unresolved-parents.ts +79 -0
- package/packages/telemetry-contract/package.json +6 -1
- package/packages/telemetry-contract/src/index.ts +1 -0
- package/packages/telemetry-contract/src/schemas.ts +215 -0
- package/packages/telemetry-contract/src/types.ts +3 -1
- package/packages/telemetry-contract/src/validators.ts +3 -1
- package/packages/telemetry-contract/tests/compatibility.test.ts +144 -0
- package/packages/ui/package.json +4 -0
- package/packages/ui/src/components/ActivityTimeline.tsx +61 -29
- package/packages/ui/src/components/section-cards.tsx +31 -14
- package/packages/ui/src/types.ts +1 -0
- package/skill/SKILL.md +214 -174
- package/skill/Workflows/AlphaUpload.md +45 -0
- package/skill/Workflows/Baseline.md +18 -12
- package/skill/Workflows/Composability.md +3 -3
- package/skill/Workflows/Dashboard.md +44 -91
- package/skill/Workflows/Doctor.md +93 -66
- package/skill/Workflows/Evals.md +49 -40
- package/skill/Workflows/Evolve.md +76 -28
- package/skill/Workflows/EvolveBody.md +37 -38
- package/skill/Workflows/Initialize.md +172 -26
- package/skill/Workflows/Orchestrate.md +11 -2
- package/skill/Workflows/Sync.md +23 -0
- package/skill/Workflows/Watch.md +2 -5
- package/skill/agents/diagnosis-analyst.md +163 -0
- package/skill/agents/evolution-reviewer.md +149 -0
- package/skill/agents/integration-guide.md +154 -0
- package/skill/agents/pattern-analyst.md +149 -0
- package/skill/assets/multi-skill-settings.json +1 -1
- package/skill/assets/single-skill-settings.json +1 -1
- package/skill/references/interactive-config.md +39 -0
- package/skill/references/invocation-taxonomy.md +34 -0
- package/skill/references/logs.md +9 -1
- package/skill/references/setup-patterns.md +3 -3
- package/skill/settings_snippet.json +1 -1
- package/apps/local-dashboard/dist/assets/index-C75H1Q3n.css +0 -1
- package/apps/local-dashboard/dist/assets/index-axE4kz3Q.js +0 -15
- package/apps/local-dashboard/dist/assets/vendor-react-U7zYD9Rg.js +0 -60
|
@@ -9,11 +9,17 @@
|
|
|
9
9
|
import { copyFileSync, existsSync, readFileSync, writeFileSync } from "node:fs";
|
|
10
10
|
import { parseArgs } from "node:util";
|
|
11
11
|
|
|
12
|
-
import { QUERY_LOG, SKILL_LOG
|
|
12
|
+
import { QUERY_LOG, SKILL_LOG } from "../constants.js";
|
|
13
13
|
import type { BaselineMeasurement } from "../eval/baseline.js";
|
|
14
14
|
import { measureBaseline } from "../eval/baseline.js";
|
|
15
15
|
import { buildEvalSet } from "../eval/hooks-to-evals.js";
|
|
16
16
|
import { readGradingResultsForSkill } from "../grading/results.js";
|
|
17
|
+
import { getDb } from "../localdb/db.js";
|
|
18
|
+
import {
|
|
19
|
+
queryQueryLog,
|
|
20
|
+
querySessionTelemetry,
|
|
21
|
+
querySkillUsageRecords,
|
|
22
|
+
} from "../localdb/queries.js";
|
|
17
23
|
import { updateContextAfterEvolve } from "../memory/writer.js";
|
|
18
24
|
import type { SyncResult } from "../sync.js";
|
|
19
25
|
import type {
|
|
@@ -31,10 +37,10 @@ import type {
|
|
|
31
37
|
SkillUsageRecord,
|
|
32
38
|
} from "../types.js";
|
|
33
39
|
import { parseFrontmatter, replaceFrontmatterDescription } from "../utils/frontmatter.js";
|
|
34
|
-
|
|
35
|
-
import { readEffectiveSkillUsageRecords } from "../utils/skill-log.js";
|
|
40
|
+
|
|
36
41
|
import { createEvolveTUI } from "../utils/tui.js";
|
|
37
42
|
import { appendAuditEntry } from "./audit.js";
|
|
43
|
+
import { checkConstitution } from "./constitutional.js";
|
|
38
44
|
import { appendEvidenceEntry } from "./evidence.js";
|
|
39
45
|
import { extractFailurePatterns } from "./extract-patterns.js";
|
|
40
46
|
import {
|
|
@@ -124,6 +130,7 @@ function createAuditEntry(
|
|
|
124
130
|
details: string,
|
|
125
131
|
evalSnapshot?: EvalPassRate,
|
|
126
132
|
skillName?: string,
|
|
133
|
+
iterationsUsed?: number,
|
|
127
134
|
): EvolutionAuditEntry {
|
|
128
135
|
return {
|
|
129
136
|
timestamp: new Date().toISOString(),
|
|
@@ -132,6 +139,7 @@ function createAuditEntry(
|
|
|
132
139
|
details,
|
|
133
140
|
...(skillName ? { skill_name: skillName } : {}),
|
|
134
141
|
...(evalSnapshot ? { eval_snapshot: evalSnapshot } : {}),
|
|
142
|
+
...(iterationsUsed != null ? { iterations_used: iterationsUsed } : {}),
|
|
135
143
|
};
|
|
136
144
|
}
|
|
137
145
|
|
|
@@ -190,7 +198,12 @@ export async function evolve(
|
|
|
190
198
|
const _buildEvalSet = _deps.buildEvalSet ?? buildEvalSet;
|
|
191
199
|
const _updateContextAfterEvolve = _deps.updateContextAfterEvolve ?? updateContextAfterEvolve;
|
|
192
200
|
const _measureBaseline = _deps.measureBaseline ?? measureBaseline;
|
|
193
|
-
const _readSkillUsageLog =
|
|
201
|
+
const _readSkillUsageLog =
|
|
202
|
+
_deps.readSkillUsageLog ??
|
|
203
|
+
(() => {
|
|
204
|
+
const db = getDb();
|
|
205
|
+
return querySkillUsageRecords(db) as SkillUsageRecord[];
|
|
206
|
+
});
|
|
194
207
|
|
|
195
208
|
const auditEntries: EvolutionAuditEntry[] = [];
|
|
196
209
|
let syncResult: SyncResult | undefined;
|
|
@@ -200,8 +213,16 @@ export async function evolve(
|
|
|
200
213
|
action: EvolutionAuditEntry["action"],
|
|
201
214
|
details: string,
|
|
202
215
|
evalSnapshot?: EvalPassRate,
|
|
216
|
+
iterationsUsed?: number,
|
|
203
217
|
): void {
|
|
204
|
-
const entry = createAuditEntry(
|
|
218
|
+
const entry = createAuditEntry(
|
|
219
|
+
proposalId,
|
|
220
|
+
action,
|
|
221
|
+
details,
|
|
222
|
+
evalSnapshot,
|
|
223
|
+
skillName,
|
|
224
|
+
iterationsUsed,
|
|
225
|
+
);
|
|
205
226
|
auditEntries.push(entry);
|
|
206
227
|
try {
|
|
207
228
|
_appendAuditEntry(entry);
|
|
@@ -316,7 +337,8 @@ export async function evolve(
|
|
|
316
337
|
}
|
|
317
338
|
} else {
|
|
318
339
|
// Build from logs
|
|
319
|
-
const
|
|
340
|
+
const dbForQuery = getDb();
|
|
341
|
+
const queryRecords = queryQueryLog(dbForQuery) as QueryLogRecord[];
|
|
320
342
|
evalSet = _buildEvalSet(skillUsage, queryRecords, skillName);
|
|
321
343
|
}
|
|
322
344
|
|
|
@@ -342,6 +364,33 @@ export async function evolve(
|
|
|
342
364
|
`Extracted ${failurePatterns.length} failure pattern(s) (${totalMissed} missed queries)`,
|
|
343
365
|
);
|
|
344
366
|
|
|
367
|
+
// Compute aggregate grading metrics for proposal context
|
|
368
|
+
const aggregateMetrics = options.gradingResults?.length
|
|
369
|
+
? (() => {
|
|
370
|
+
const scores = options.gradingResults.map(
|
|
371
|
+
(r) => r.summary.mean_score ?? r.summary.pass_rate,
|
|
372
|
+
);
|
|
373
|
+
const meanScore = scores.reduce((a, b) => a + b, 0) / scores.length;
|
|
374
|
+
const scoreStdDev = Math.sqrt(
|
|
375
|
+
scores.reduce((sum, s) => sum + (s - meanScore) ** 2, 0) / scores.length,
|
|
376
|
+
);
|
|
377
|
+
const failedRate =
|
|
378
|
+
options.gradingResults.filter((r) => r.summary.failed > 0).length /
|
|
379
|
+
options.gradingResults.length;
|
|
380
|
+
const errors = options.gradingResults.map(
|
|
381
|
+
(r) => r.execution_metrics?.errors_encountered ?? 0,
|
|
382
|
+
);
|
|
383
|
+
const meanErrors = errors.reduce((a, b) => a + b, 0) / errors.length;
|
|
384
|
+
return {
|
|
385
|
+
mean_score: meanScore,
|
|
386
|
+
score_std_dev: scoreStdDev,
|
|
387
|
+
failed_session_rate: failedRate,
|
|
388
|
+
mean_errors: meanErrors,
|
|
389
|
+
total_graded: options.gradingResults.length,
|
|
390
|
+
};
|
|
391
|
+
})()
|
|
392
|
+
: undefined;
|
|
393
|
+
|
|
345
394
|
// -----------------------------------------------------------------------
|
|
346
395
|
// Step 5: Cold-start bootstrap or early exit if no patterns
|
|
347
396
|
// -----------------------------------------------------------------------
|
|
@@ -394,7 +443,12 @@ export async function evolve(
|
|
|
394
443
|
const tokenEfficiencyEnabled = options.tokenEfficiencyEnabled ?? false;
|
|
395
444
|
const telemetryRecords =
|
|
396
445
|
options.telemetryRecords ??
|
|
397
|
-
(tokenEfficiencyEnabled
|
|
446
|
+
(tokenEfficiencyEnabled
|
|
447
|
+
? (() => {
|
|
448
|
+
const dbTel = getDb();
|
|
449
|
+
return querySessionTelemetry(dbTel) as SessionTelemetryRecord[];
|
|
450
|
+
})()
|
|
451
|
+
: undefined);
|
|
398
452
|
|
|
399
453
|
// Compute token efficiency score if enabled and telemetry is available
|
|
400
454
|
let tokenEffScore: number | undefined;
|
|
@@ -407,6 +461,8 @@ export async function evolve(
|
|
|
407
461
|
);
|
|
408
462
|
}
|
|
409
463
|
|
|
464
|
+
let iterationsCompleted = 0;
|
|
465
|
+
|
|
410
466
|
if (paretoEnabled && candidateCount > 1) {
|
|
411
467
|
// Generate N candidates in parallel
|
|
412
468
|
const candidates = await generateMultipleProposals(
|
|
@@ -418,6 +474,7 @@ export async function evolve(
|
|
|
418
474
|
agent,
|
|
419
475
|
candidateCount,
|
|
420
476
|
options.proposalModel,
|
|
477
|
+
aggregateMetrics,
|
|
421
478
|
);
|
|
422
479
|
|
|
423
480
|
// Filter by confidence threshold
|
|
@@ -457,6 +514,32 @@ export async function evolve(
|
|
|
457
514
|
eval_set: evalSet,
|
|
458
515
|
});
|
|
459
516
|
|
|
517
|
+
// Constitutional check before validation (same gate as retry flow)
|
|
518
|
+
const constitution = checkConstitution(
|
|
519
|
+
proposal.proposed_description,
|
|
520
|
+
currentDescription,
|
|
521
|
+
skillName,
|
|
522
|
+
);
|
|
523
|
+
if (!constitution.passed) {
|
|
524
|
+
const reason = `Constitutional: ${constitution.violations.join("; ")}`;
|
|
525
|
+
recordAudit(proposal.proposal_id, "rejected", reason);
|
|
526
|
+
recordEvidence({
|
|
527
|
+
timestamp: new Date().toISOString(),
|
|
528
|
+
proposal_id: proposal.proposal_id,
|
|
529
|
+
skill_name: skillName,
|
|
530
|
+
skill_path: skillPath,
|
|
531
|
+
target: "description",
|
|
532
|
+
stage: "rejected",
|
|
533
|
+
rationale: proposal.rationale,
|
|
534
|
+
confidence: proposal.confidence,
|
|
535
|
+
details: reason,
|
|
536
|
+
original_text: proposal.original_description,
|
|
537
|
+
proposed_text: proposal.proposed_description,
|
|
538
|
+
eval_set: evalSet,
|
|
539
|
+
});
|
|
540
|
+
continue;
|
|
541
|
+
}
|
|
542
|
+
|
|
460
543
|
const validation = await _validateProposal(
|
|
461
544
|
proposal,
|
|
462
545
|
evalSet,
|
|
@@ -521,6 +604,7 @@ export async function evolve(
|
|
|
521
604
|
|
|
522
605
|
lastProposal = best.proposal;
|
|
523
606
|
lastValidation = best.validation;
|
|
607
|
+
iterationsCompleted = 1; // Pareto selection is a single-pass
|
|
524
608
|
|
|
525
609
|
// Skip the standard retry loop — we already have our result
|
|
526
610
|
} else {
|
|
@@ -528,6 +612,7 @@ export async function evolve(
|
|
|
528
612
|
let feedbackReason = "";
|
|
529
613
|
|
|
530
614
|
for (let iteration = 0; iteration < maxIterations; iteration++) {
|
|
615
|
+
iterationsCompleted = iteration + 1;
|
|
531
616
|
// Step 7: Generate proposal
|
|
532
617
|
const effectiveMissedQueries = feedbackReason
|
|
533
618
|
? [...missedQueries, `[Previous attempt failed: ${feedbackReason}]`]
|
|
@@ -542,6 +627,7 @@ export async function evolve(
|
|
|
542
627
|
skillPath,
|
|
543
628
|
agent,
|
|
544
629
|
options.proposalModel,
|
|
630
|
+
aggregateMetrics,
|
|
545
631
|
);
|
|
546
632
|
llmCallCount++;
|
|
547
633
|
|
|
@@ -569,6 +655,39 @@ export async function evolve(
|
|
|
569
655
|
eval_set: evalSet,
|
|
570
656
|
});
|
|
571
657
|
|
|
658
|
+
// Step 8b: Constitutional check (deterministic, pre-validation)
|
|
659
|
+
const constitution = checkConstitution(
|
|
660
|
+
proposal.proposed_description,
|
|
661
|
+
currentDescription,
|
|
662
|
+
skillName,
|
|
663
|
+
);
|
|
664
|
+
if (!constitution.passed) {
|
|
665
|
+
feedbackReason = `Constitutional: ${constitution.violations.join("; ")}`;
|
|
666
|
+
recordAudit(proposal.proposal_id, "rejected", feedbackReason);
|
|
667
|
+
recordEvidence({
|
|
668
|
+
timestamp: new Date().toISOString(),
|
|
669
|
+
proposal_id: proposal.proposal_id,
|
|
670
|
+
skill_name: skillName,
|
|
671
|
+
skill_path: skillPath,
|
|
672
|
+
target: "description",
|
|
673
|
+
stage: "rejected",
|
|
674
|
+
rationale: proposal.rationale,
|
|
675
|
+
confidence: proposal.confidence,
|
|
676
|
+
details: feedbackReason,
|
|
677
|
+
});
|
|
678
|
+
if (iteration === maxIterations - 1) {
|
|
679
|
+
finishTui();
|
|
680
|
+
return withStats({
|
|
681
|
+
proposal: lastProposal,
|
|
682
|
+
validation: null,
|
|
683
|
+
deployed: false,
|
|
684
|
+
auditEntries,
|
|
685
|
+
reason: feedbackReason,
|
|
686
|
+
});
|
|
687
|
+
}
|
|
688
|
+
continue;
|
|
689
|
+
}
|
|
690
|
+
|
|
572
691
|
// Step 9: Check confidence threshold
|
|
573
692
|
if (proposal.confidence < confidenceThreshold) {
|
|
574
693
|
feedbackReason = `Confidence ${proposal.confidence} below threshold ${confidenceThreshold}`;
|
|
@@ -742,6 +861,26 @@ export async function evolve(
|
|
|
742
861
|
);
|
|
743
862
|
|
|
744
863
|
if (!baselineResult.adds_value) {
|
|
864
|
+
recordAudit(
|
|
865
|
+
lastProposal.proposal_id,
|
|
866
|
+
"rejected",
|
|
867
|
+
`Baseline gate failed: lift=${baselineResult.lift.toFixed(3)} below 0.05 threshold`,
|
|
868
|
+
);
|
|
869
|
+
recordEvidence({
|
|
870
|
+
timestamp: new Date().toISOString(),
|
|
871
|
+
proposal_id: lastProposal.proposal_id,
|
|
872
|
+
skill_name: skillName,
|
|
873
|
+
skill_path: skillPath,
|
|
874
|
+
target: "description",
|
|
875
|
+
stage: "rejected",
|
|
876
|
+
rationale: lastProposal.rationale,
|
|
877
|
+
confidence: lastProposal.confidence,
|
|
878
|
+
details: `Baseline gate failed: lift=${baselineResult.lift.toFixed(3)} below 0.05 threshold`,
|
|
879
|
+
validation: {
|
|
880
|
+
improved: false,
|
|
881
|
+
net_change: baselineResult.lift,
|
|
882
|
+
},
|
|
883
|
+
});
|
|
745
884
|
finishTui();
|
|
746
885
|
return withStats({
|
|
747
886
|
proposal: lastProposal,
|
|
@@ -761,17 +900,37 @@ export async function evolve(
|
|
|
761
900
|
if (options.gateModel && lastProposal && lastValidation?.improved) {
|
|
762
901
|
tui.step(`Gate validation (${options.gateModel})...`);
|
|
763
902
|
gateValidation = await _gateValidateProposal(lastProposal, evalSet, agent, options.gateModel);
|
|
903
|
+
llmCallCount++;
|
|
764
904
|
tui.done(
|
|
765
905
|
`Gate (${options.gateModel}): improved=${gateValidation.improved}, net_change=${gateValidation.net_change.toFixed(3)}`,
|
|
766
906
|
);
|
|
767
907
|
|
|
768
|
-
recordAudit(
|
|
769
|
-
lastProposal.proposal_id,
|
|
770
|
-
"validated",
|
|
771
|
-
`Gate validation (${options.gateModel}): improved=${gateValidation.improved}, net_change=${gateValidation.net_change.toFixed(3)}`,
|
|
772
|
-
);
|
|
773
|
-
|
|
774
908
|
if (!gateValidation.improved) {
|
|
909
|
+
recordAudit(
|
|
910
|
+
lastProposal.proposal_id,
|
|
911
|
+
"rejected",
|
|
912
|
+
`Gate validation failed (${options.gateModel}): net_change=${gateValidation.net_change.toFixed(3)}`,
|
|
913
|
+
);
|
|
914
|
+
recordEvidence({
|
|
915
|
+
timestamp: new Date().toISOString(),
|
|
916
|
+
proposal_id: lastProposal.proposal_id,
|
|
917
|
+
skill_name: skillName,
|
|
918
|
+
skill_path: skillPath,
|
|
919
|
+
target: "description",
|
|
920
|
+
stage: "rejected",
|
|
921
|
+
rationale: lastProposal.rationale,
|
|
922
|
+
confidence: lastProposal.confidence,
|
|
923
|
+
details: `Gate validation failed (${options.gateModel}): net_change=${gateValidation.net_change.toFixed(3)}`,
|
|
924
|
+
validation: {
|
|
925
|
+
improved: gateValidation.improved,
|
|
926
|
+
before_pass_rate: gateValidation.before_pass_rate,
|
|
927
|
+
after_pass_rate: gateValidation.after_pass_rate,
|
|
928
|
+
net_change: gateValidation.net_change,
|
|
929
|
+
regressions: gateValidation.regressions,
|
|
930
|
+
new_passes: gateValidation.new_passes,
|
|
931
|
+
per_entry_results: gateValidation.per_entry_results,
|
|
932
|
+
},
|
|
933
|
+
});
|
|
775
934
|
finishTui();
|
|
776
935
|
return withStats({
|
|
777
936
|
proposal: lastProposal,
|
|
@@ -783,6 +942,12 @@ export async function evolve(
|
|
|
783
942
|
...(baselineResult ? { baselineResult } : {}),
|
|
784
943
|
});
|
|
785
944
|
}
|
|
945
|
+
|
|
946
|
+
recordAudit(
|
|
947
|
+
lastProposal.proposal_id,
|
|
948
|
+
"validated",
|
|
949
|
+
`Gate validation (${options.gateModel}): improved=${gateValidation.improved}, net_change=${gateValidation.net_change.toFixed(3)}`,
|
|
950
|
+
);
|
|
786
951
|
}
|
|
787
952
|
|
|
788
953
|
// -----------------------------------------------------------------------
|
|
@@ -810,12 +975,18 @@ export async function evolve(
|
|
|
810
975
|
console.error("------------------------------\n");
|
|
811
976
|
}
|
|
812
977
|
|
|
813
|
-
recordAudit(
|
|
814
|
-
|
|
815
|
-
|
|
816
|
-
|
|
817
|
-
|
|
818
|
-
|
|
978
|
+
recordAudit(
|
|
979
|
+
lastProposal.proposal_id,
|
|
980
|
+
"deployed",
|
|
981
|
+
`Deployed proposal for ${skillName}`,
|
|
982
|
+
{
|
|
983
|
+
total: evalSet.length,
|
|
984
|
+
passed: Math.round(lastValidation.after_pass_rate * evalSet.length),
|
|
985
|
+
failed: evalSet.length - Math.round(lastValidation.after_pass_rate * evalSet.length),
|
|
986
|
+
pass_rate: lastValidation.after_pass_rate,
|
|
987
|
+
},
|
|
988
|
+
iterationsCompleted,
|
|
989
|
+
);
|
|
819
990
|
recordEvidence({
|
|
820
991
|
timestamp: new Date().toISOString(),
|
|
821
992
|
proposal_id: lastProposal.proposal_id,
|
|
@@ -1001,7 +1172,8 @@ Options:
|
|
|
1001
1172
|
|
|
1002
1173
|
// If no eval-set provided, check that log files exist for auto-generation
|
|
1003
1174
|
if (!evalSetPath && !(values["sync-first"] ?? false)) {
|
|
1004
|
-
const
|
|
1175
|
+
const dbCheck = getDb();
|
|
1176
|
+
const hasSkillLog = querySkillUsageRecords(dbCheck).length > 0;
|
|
1005
1177
|
const hasQueryLog = existsSync(QUERY_LOG);
|
|
1006
1178
|
if (!hasSkillLog && !hasQueryLog) {
|
|
1007
1179
|
console.error("[ERROR] No eval set provided and no telemetry logs found.");
|
|
@@ -1016,7 +1188,8 @@ Options:
|
|
|
1016
1188
|
const tokenEfficiencyEnabled = values["token-efficiency"] ?? false;
|
|
1017
1189
|
let telemetryRecords: SessionTelemetryRecord[] | undefined;
|
|
1018
1190
|
if (tokenEfficiencyEnabled && !(values["sync-first"] ?? false)) {
|
|
1019
|
-
|
|
1191
|
+
const dbTel2 = getDb();
|
|
1192
|
+
telemetryRecords = querySessionTelemetry(dbTel2) as SessionTelemetryRecord[];
|
|
1020
1193
|
}
|
|
1021
1194
|
const gradingResults = readGradingResultsForSkill(values.skill);
|
|
1022
1195
|
|
|
@@ -1117,7 +1290,7 @@ if (import.meta.main) {
|
|
|
1117
1290
|
console.error(
|
|
1118
1291
|
"\nTroubleshooting:\n" +
|
|
1119
1292
|
" - Verify --skill-path points to a valid SKILL.md file\n" +
|
|
1120
|
-
" - Ensure eval data exists (run `selftune
|
|
1293
|
+
" - Ensure eval data exists (run `selftune eval generate` first) or pass --eval-set\n" +
|
|
1121
1294
|
" - Check that ANTHROPIC_API_KEY is set if using Claude\n" +
|
|
1122
1295
|
" - Re-run with --verbose for full diagnostic output",
|
|
1123
1296
|
);
|
|
@@ -37,6 +37,15 @@ Do NOT include any text outside the JSON object.`;
|
|
|
37
37
|
// Prompt builder
|
|
38
38
|
// ---------------------------------------------------------------------------
|
|
39
39
|
|
|
40
|
+
/** Execution telemetry context for body evolution proposals. */
|
|
41
|
+
export interface ExecutionContext {
|
|
42
|
+
avgToolCalls: number;
|
|
43
|
+
avgErrors: number;
|
|
44
|
+
avgTurns: number;
|
|
45
|
+
commonTools: string[];
|
|
46
|
+
failureTools: string[];
|
|
47
|
+
}
|
|
48
|
+
|
|
40
49
|
/** Build the user prompt for full body generation. */
|
|
41
50
|
export function buildBodyGenerationPrompt(
|
|
42
51
|
currentContent: string,
|
|
@@ -44,6 +53,7 @@ export function buildBodyGenerationPrompt(
|
|
|
44
53
|
missedQueries: string[],
|
|
45
54
|
skillName: string,
|
|
46
55
|
fewShotExamples?: string[],
|
|
56
|
+
executionContext?: ExecutionContext,
|
|
47
57
|
): string {
|
|
48
58
|
const patternLines = failurePatterns.map((p) => {
|
|
49
59
|
const queries = p.missed_queries.map((q) => ` - "${q}"`).join("\n");
|
|
@@ -66,6 +76,11 @@ export function buildBodyGenerationPrompt(
|
|
|
66
76
|
const feedbackSection =
|
|
67
77
|
feedbackLines.length > 0 ? `\n\nStructured Failure Analysis:\n${feedbackLines.join("\n")}` : "";
|
|
68
78
|
|
|
79
|
+
// Build execution telemetry section if provided
|
|
80
|
+
const executionSection = executionContext
|
|
81
|
+
? `\n\nExecution Profile (from recent sessions using this skill):\n Average tool calls per session: ${executionContext.avgToolCalls.toFixed(1)}\n Average errors per session: ${executionContext.avgErrors.toFixed(1)}\n Average assistant turns: ${executionContext.avgTurns.toFixed(1)}\n Most-used tools in successful sessions: ${executionContext.commonTools.join(", ") || "none"}\n Tools correlated with failures: ${executionContext.failureTools.join(", ") || "none"}`
|
|
82
|
+
: "";
|
|
83
|
+
|
|
69
84
|
// Build few-shot examples section if provided
|
|
70
85
|
const fewShotSection =
|
|
71
86
|
fewShotExamples && fewShotExamples.length > 0
|
|
@@ -81,7 +96,7 @@ Failure Patterns:
|
|
|
81
96
|
${patternLines.join("\n\n")}
|
|
82
97
|
|
|
83
98
|
All Missed Queries:
|
|
84
|
-
${missedLines}${feedbackSection}${fewShotSection}
|
|
99
|
+
${missedLines}${feedbackSection}${executionSection}${fewShotSection}
|
|
85
100
|
|
|
86
101
|
Generate an improved full body for the "${skillName}" skill that would correctly handle the missed queries listed above. The body should include everything below the # Title line: description, ## Workflow Routing table, and any other sections. Output ONLY a JSON object with "proposed_body", "rationale", and "confidence" fields.`;
|
|
87
102
|
}
|
|
@@ -144,6 +159,7 @@ export async function generateBodyProposal(
|
|
|
144
159
|
agent: string,
|
|
145
160
|
modelFlag?: string,
|
|
146
161
|
fewShotExamples?: string[],
|
|
162
|
+
executionContext?: ExecutionContext,
|
|
147
163
|
): Promise<BodyEvolutionProposal> {
|
|
148
164
|
const prompt = buildBodyGenerationPrompt(
|
|
149
165
|
currentContent,
|
|
@@ -151,6 +167,7 @@ export async function generateBodyProposal(
|
|
|
151
167
|
missedQueries,
|
|
152
168
|
skillName,
|
|
153
169
|
fewShotExamples,
|
|
170
|
+
executionContext,
|
|
154
171
|
);
|
|
155
172
|
const rawResponse = await callLlm(BODY_GENERATOR_SYSTEM, prompt, agent, modelFlag);
|
|
156
173
|
const { proposed_body, rationale, confidence } = parseBodyProposalResponse(rawResponse);
|
|
@@ -36,12 +36,22 @@ Do NOT include any text outside the JSON object.`;
|
|
|
36
36
|
// Prompt builder
|
|
37
37
|
// ---------------------------------------------------------------------------
|
|
38
38
|
|
|
39
|
+
/** Aggregate session quality metrics passed into proposal prompts. */
|
|
40
|
+
export interface AggregateMetrics {
|
|
41
|
+
mean_score: number;
|
|
42
|
+
score_std_dev: number;
|
|
43
|
+
failed_session_rate: number;
|
|
44
|
+
mean_errors: number;
|
|
45
|
+
total_graded: number;
|
|
46
|
+
}
|
|
47
|
+
|
|
39
48
|
/** Build the user prompt for the LLM with context about failures. */
|
|
40
49
|
export function buildProposalPrompt(
|
|
41
50
|
currentDescription: string,
|
|
42
51
|
failurePatterns: FailurePattern[],
|
|
43
52
|
missedQueries: string[],
|
|
44
53
|
skillName: string,
|
|
54
|
+
aggregateMetrics?: AggregateMetrics,
|
|
45
55
|
): string {
|
|
46
56
|
const patternLines = failurePatterns.map((p) => {
|
|
47
57
|
const queries = p.missed_queries.map((q) => ` - "${q}"`).join("\n");
|
|
@@ -67,6 +77,10 @@ export function buildProposalPrompt(
|
|
|
67
77
|
const feedbackSection =
|
|
68
78
|
feedbackLines.length > 0 ? `\n\nStructured Failure Analysis:\n${feedbackLines.join("\n")}` : "";
|
|
69
79
|
|
|
80
|
+
const metricsSection = aggregateMetrics
|
|
81
|
+
? `\n\nSession Quality Context:\n Mean grading score: ${aggregateMetrics.mean_score.toFixed(2)}/1.0 (σ=${aggregateMetrics.score_std_dev.toFixed(2)})\n Failed session rate: ${(aggregateMetrics.failed_session_rate * 100).toFixed(0)}%\n Mean execution errors per session: ${aggregateMetrics.mean_errors.toFixed(1)}\n Sessions graded: ${aggregateMetrics.total_graded}`
|
|
82
|
+
: "";
|
|
83
|
+
|
|
70
84
|
return `Skill Name: ${skillName}
|
|
71
85
|
|
|
72
86
|
Current Description:
|
|
@@ -76,7 +90,7 @@ Failure Patterns:
|
|
|
76
90
|
${patternLines.join("\n\n")}
|
|
77
91
|
|
|
78
92
|
All Missed Queries:
|
|
79
|
-
${missedLines}${feedbackSection}
|
|
93
|
+
${missedLines}${feedbackSection}${metricsSection}
|
|
80
94
|
|
|
81
95
|
Propose an improved description for the "${skillName}" skill that would correctly route the missed queries listed above. Output ONLY a JSON object with "proposed_description", "rationale", and "confidence" fields.`;
|
|
82
96
|
}
|
|
@@ -142,6 +156,7 @@ export async function generateMultipleProposals(
|
|
|
142
156
|
agent: string,
|
|
143
157
|
count = 3,
|
|
144
158
|
modelFlag?: string,
|
|
159
|
+
aggregateMetrics?: AggregateMetrics,
|
|
145
160
|
): Promise<EvolutionProposal[]> {
|
|
146
161
|
const variations = buildPromptVariations(
|
|
147
162
|
currentDescription,
|
|
@@ -149,6 +164,7 @@ export async function generateMultipleProposals(
|
|
|
149
164
|
missedQueries,
|
|
150
165
|
skillName,
|
|
151
166
|
count,
|
|
167
|
+
aggregateMetrics,
|
|
152
168
|
);
|
|
153
169
|
|
|
154
170
|
const proposals = await Promise.all(
|
|
@@ -187,6 +203,7 @@ export function buildPromptVariations(
|
|
|
187
203
|
missedQueries: string[],
|
|
188
204
|
skillName: string,
|
|
189
205
|
count: number,
|
|
206
|
+
aggregateMetrics?: AggregateMetrics,
|
|
190
207
|
): string[] {
|
|
191
208
|
const biases: string[] = [
|
|
192
209
|
"Focus especially on improving explicit invocation (direct mentions of the skill).",
|
|
@@ -199,6 +216,7 @@ export function buildPromptVariations(
|
|
|
199
216
|
failurePatterns,
|
|
200
217
|
missedQueries,
|
|
201
218
|
skillName,
|
|
219
|
+
aggregateMetrics,
|
|
202
220
|
);
|
|
203
221
|
const variations: string[] = [];
|
|
204
222
|
|
|
@@ -219,8 +237,15 @@ export async function generateProposal(
|
|
|
219
237
|
skillPath: string,
|
|
220
238
|
agent: string,
|
|
221
239
|
modelFlag?: string,
|
|
240
|
+
aggregateMetrics?: AggregateMetrics,
|
|
222
241
|
): Promise<EvolutionProposal> {
|
|
223
|
-
const prompt = buildProposalPrompt(
|
|
242
|
+
const prompt = buildProposalPrompt(
|
|
243
|
+
currentDescription,
|
|
244
|
+
failurePatterns,
|
|
245
|
+
missedQueries,
|
|
246
|
+
skillName,
|
|
247
|
+
aggregateMetrics,
|
|
248
|
+
);
|
|
224
249
|
const rawResponse = await callLlm(PROPOSER_SYSTEM, prompt, agent, modelFlag);
|
|
225
250
|
const { proposed_description, rationale, confidence } = parseProposalResponse(rawResponse);
|
|
226
251
|
|
|
@@ -24,7 +24,7 @@ export interface RollbackOptions {
|
|
|
24
24
|
skillName: string;
|
|
25
25
|
skillPath: string;
|
|
26
26
|
proposalId?: string; // rollback specific proposal, or last deployed
|
|
27
|
-
logPath?: string; //
|
|
27
|
+
logPath?: string; // deprecated — ignored, kept for backward compat
|
|
28
28
|
}
|
|
29
29
|
|
|
30
30
|
export interface RollbackResult {
|
|
@@ -71,8 +71,8 @@ function findLatestBackup(skillPath: string): string | null {
|
|
|
71
71
|
* Find the "created" audit entry for a given proposal ID and extract
|
|
72
72
|
* the original_description from its details field.
|
|
73
73
|
*/
|
|
74
|
-
function findOriginalFromAudit(proposalId: string, logPath?: string): string | null {
|
|
75
|
-
const entries = readAuditTrail(
|
|
74
|
+
function findOriginalFromAudit(proposalId: string): string | null {
|
|
75
|
+
const entries = readAuditTrail();
|
|
76
76
|
const createdEntry = entries.find((e) => e.proposal_id === proposalId && e.action === "created");
|
|
77
77
|
if (!createdEntry) return null;
|
|
78
78
|
|
|
@@ -90,12 +90,8 @@ function findOriginalFromAudit(proposalId: string, logPath?: string): string | n
|
|
|
90
90
|
/**
|
|
91
91
|
* Find the deployed audit entry for a specific proposal ID.
|
|
92
92
|
*/
|
|
93
|
-
function findDeployedEntry(
|
|
94
|
-
|
|
95
|
-
skillName: string,
|
|
96
|
-
logPath?: string,
|
|
97
|
-
): EvolutionAuditEntry | null {
|
|
98
|
-
const entries = readAuditTrail(skillName, logPath);
|
|
93
|
+
function findDeployedEntry(proposalId: string, skillName: string): EvolutionAuditEntry | null {
|
|
94
|
+
const entries = readAuditTrail(skillName);
|
|
99
95
|
return entries.find((e) => e.proposal_id === proposalId && e.action === "deployed") ?? null;
|
|
100
96
|
}
|
|
101
97
|
|
|
@@ -104,7 +100,7 @@ function findDeployedEntry(
|
|
|
104
100
|
// ---------------------------------------------------------------------------
|
|
105
101
|
|
|
106
102
|
export async function rollback(options: RollbackOptions): Promise<RollbackResult> {
|
|
107
|
-
const { skillName, skillPath, proposalId
|
|
103
|
+
const { skillName, skillPath, proposalId } = options;
|
|
108
104
|
|
|
109
105
|
const noRollback = (reason: string): RollbackResult => ({
|
|
110
106
|
rolledBack: false,
|
|
@@ -123,14 +119,14 @@ export async function rollback(options: RollbackOptions): Promise<RollbackResult
|
|
|
123
119
|
|
|
124
120
|
if (proposalId) {
|
|
125
121
|
// Verify the specific proposal exists in audit trail
|
|
126
|
-
const entry = findDeployedEntry(proposalId, skillName
|
|
122
|
+
const entry = findDeployedEntry(proposalId, skillName);
|
|
127
123
|
if (!entry) {
|
|
128
124
|
return noRollback(`Proposal ${proposalId} not found as deployed entry in audit trail`);
|
|
129
125
|
}
|
|
130
126
|
targetProposalId = proposalId;
|
|
131
127
|
} else {
|
|
132
128
|
// Use the most recent deployed proposal
|
|
133
|
-
const lastDeployed = getLastDeployedProposal(skillName
|
|
129
|
+
const lastDeployed = getLastDeployedProposal(skillName);
|
|
134
130
|
if (!lastDeployed) {
|
|
135
131
|
return noRollback(`No deployed proposal found for skill "${skillName}"`);
|
|
136
132
|
}
|
|
@@ -152,7 +148,7 @@ export async function rollback(options: RollbackOptions): Promise<RollbackResult
|
|
|
152
148
|
action: "rolled_back",
|
|
153
149
|
details: `Rolled back ${skillName} from backup file`,
|
|
154
150
|
};
|
|
155
|
-
appendAuditEntry(auditEntry
|
|
151
|
+
appendAuditEntry(auditEntry);
|
|
156
152
|
|
|
157
153
|
const backupResult: RollbackResult = {
|
|
158
154
|
rolledBack: true,
|
|
@@ -170,7 +166,7 @@ export async function rollback(options: RollbackOptions): Promise<RollbackResult
|
|
|
170
166
|
}
|
|
171
167
|
|
|
172
168
|
// Strategy 2: Restore from audit trail's created entry (description only)
|
|
173
|
-
const originalFromAudit = findOriginalFromAudit(targetProposalId
|
|
169
|
+
const originalFromAudit = findOriginalFromAudit(targetProposalId);
|
|
174
170
|
if (originalFromAudit) {
|
|
175
171
|
// Replace only the description section in SKILL.md, preserving structure
|
|
176
172
|
const currentContent = readFileSync(skillPath, "utf-8");
|
|
@@ -184,7 +180,7 @@ export async function rollback(options: RollbackOptions): Promise<RollbackResult
|
|
|
184
180
|
action: "rolled_back",
|
|
185
181
|
details: `Rolled back ${skillName} from audit trail`,
|
|
186
182
|
};
|
|
187
|
-
appendAuditEntry(auditEntry
|
|
183
|
+
appendAuditEntry(auditEntry);
|
|
188
184
|
|
|
189
185
|
const auditResult: RollbackResult = {
|
|
190
186
|
rolledBack: true,
|