selftune 0.2.6 → 0.2.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (119)
  1. package/README.md +1 -0
  2. package/apps/local-dashboard/dist/assets/index-Bk9vSHHd.js +15 -0
  3. package/apps/local-dashboard/dist/assets/index-CRtLkBTi.css +1 -0
  4. package/apps/local-dashboard/dist/assets/vendor-react-BQH_6WrG.js +60 -0
  5. package/apps/local-dashboard/dist/assets/{vendor-table-B7VF2Ipl.js → vendor-table-dK1QMLq9.js} +1 -1
  6. package/apps/local-dashboard/dist/assets/{vendor-ui-r2k_Ku_V.js → vendor-ui-CO2mrx6e.js} +60 -65
  7. package/apps/local-dashboard/dist/index.html +5 -5
  8. package/cli/selftune/activation-rules.ts +30 -9
  9. package/cli/selftune/agent-guidance.ts +96 -0
  10. package/cli/selftune/alpha-identity.ts +157 -0
  11. package/cli/selftune/alpha-upload/build-payloads.ts +151 -0
  12. package/cli/selftune/alpha-upload/client.ts +113 -0
  13. package/cli/selftune/alpha-upload/flush.ts +191 -0
  14. package/cli/selftune/alpha-upload/index.ts +194 -0
  15. package/cli/selftune/alpha-upload/queue.ts +252 -0
  16. package/cli/selftune/alpha-upload/stage-canonical.ts +242 -0
  17. package/cli/selftune/alpha-upload-contract.ts +52 -0
  18. package/cli/selftune/auth/device-code.ts +110 -0
  19. package/cli/selftune/auto-update.ts +130 -0
  20. package/cli/selftune/badge/badge.ts +19 -9
  21. package/cli/selftune/canonical-export.ts +16 -3
  22. package/cli/selftune/constants.ts +28 -8
  23. package/cli/selftune/contribute/bundle.ts +32 -5
  24. package/cli/selftune/dashboard-contract.ts +32 -1
  25. package/cli/selftune/dashboard-server.ts +256 -692
  26. package/cli/selftune/dashboard.ts +1 -1
  27. package/cli/selftune/eval/baseline.ts +11 -7
  28. package/cli/selftune/eval/hooks-to-evals.ts +27 -9
  29. package/cli/selftune/eval/synthetic-evals.ts +54 -1
  30. package/cli/selftune/evolution/audit.ts +24 -19
  31. package/cli/selftune/evolution/constitutional.ts +176 -0
  32. package/cli/selftune/evolution/evidence.ts +18 -13
  33. package/cli/selftune/evolution/evolve-body.ts +104 -7
  34. package/cli/selftune/evolution/evolve.ts +195 -22
  35. package/cli/selftune/evolution/propose-body.ts +18 -1
  36. package/cli/selftune/evolution/propose-description.ts +27 -2
  37. package/cli/selftune/evolution/rollback.ts +11 -15
  38. package/cli/selftune/export.ts +84 -0
  39. package/cli/selftune/grading/auto-grade.ts +13 -4
  40. package/cli/selftune/grading/grade-session.ts +16 -6
  41. package/cli/selftune/hooks/evolution-guard.ts +26 -9
  42. package/cli/selftune/hooks/prompt-log.ts +23 -9
  43. package/cli/selftune/hooks/session-stop.ts +78 -15
  44. package/cli/selftune/hooks/skill-eval.ts +189 -10
  45. package/cli/selftune/index.ts +274 -2
  46. package/cli/selftune/ingestors/claude-replay.ts +48 -21
  47. package/cli/selftune/init.ts +249 -47
  48. package/cli/selftune/last.ts +7 -7
  49. package/cli/selftune/localdb/db.ts +90 -10
  50. package/cli/selftune/localdb/direct-write.ts +531 -0
  51. package/cli/selftune/localdb/materialize.ts +296 -42
  52. package/cli/selftune/localdb/queries.ts +325 -32
  53. package/cli/selftune/localdb/schema.ts +109 -0
  54. package/cli/selftune/monitoring/watch.ts +26 -8
  55. package/cli/selftune/normalization.ts +85 -15
  56. package/cli/selftune/observability.ts +248 -2
  57. package/cli/selftune/orchestrate.ts +165 -20
  58. package/cli/selftune/quickstart.ts +34 -10
  59. package/cli/selftune/repair/skill-usage.ts +12 -2
  60. package/cli/selftune/routes/actions.ts +77 -0
  61. package/cli/selftune/routes/badge.ts +66 -0
  62. package/cli/selftune/routes/doctor.ts +12 -0
  63. package/cli/selftune/routes/index.ts +14 -0
  64. package/cli/selftune/routes/orchestrate-runs.ts +13 -0
  65. package/cli/selftune/routes/overview.ts +14 -0
  66. package/cli/selftune/routes/report.ts +293 -0
  67. package/cli/selftune/routes/skill-report.ts +230 -0
  68. package/cli/selftune/status.ts +203 -7
  69. package/cli/selftune/sync.ts +13 -1
  70. package/cli/selftune/types.ts +50 -0
  71. package/cli/selftune/utils/jsonl.ts +58 -1
  72. package/cli/selftune/utils/selftune-meta.ts +38 -0
  73. package/cli/selftune/utils/skill-log.ts +30 -4
  74. package/cli/selftune/utils/transcript.ts +15 -0
  75. package/cli/selftune/workflows/workflows.ts +7 -6
  76. package/package.json +10 -6
  77. package/packages/telemetry-contract/fixtures/complete-push.ts +184 -0
  78. package/packages/telemetry-contract/fixtures/evidence-only-push.ts +58 -0
  79. package/packages/telemetry-contract/fixtures/golden.json +1 -0
  80. package/packages/telemetry-contract/fixtures/index.ts +4 -0
  81. package/packages/telemetry-contract/fixtures/partial-push-no-sessions.ts +40 -0
  82. package/packages/telemetry-contract/fixtures/partial-push-unresolved-parents.ts +79 -0
  83. package/packages/telemetry-contract/package.json +6 -1
  84. package/packages/telemetry-contract/src/index.ts +1 -0
  85. package/packages/telemetry-contract/src/schemas.ts +215 -0
  86. package/packages/telemetry-contract/src/types.ts +3 -1
  87. package/packages/telemetry-contract/src/validators.ts +3 -1
  88. package/packages/telemetry-contract/tests/compatibility.test.ts +144 -0
  89. package/packages/ui/package.json +4 -0
  90. package/packages/ui/src/components/ActivityTimeline.tsx +61 -29
  91. package/packages/ui/src/components/section-cards.tsx +31 -14
  92. package/packages/ui/src/types.ts +1 -0
  93. package/skill/SKILL.md +214 -174
  94. package/skill/Workflows/AlphaUpload.md +45 -0
  95. package/skill/Workflows/Baseline.md +18 -12
  96. package/skill/Workflows/Composability.md +3 -3
  97. package/skill/Workflows/Dashboard.md +44 -91
  98. package/skill/Workflows/Doctor.md +93 -66
  99. package/skill/Workflows/Evals.md +49 -40
  100. package/skill/Workflows/Evolve.md +76 -28
  101. package/skill/Workflows/EvolveBody.md +37 -38
  102. package/skill/Workflows/Initialize.md +172 -26
  103. package/skill/Workflows/Orchestrate.md +11 -2
  104. package/skill/Workflows/Sync.md +23 -0
  105. package/skill/Workflows/Watch.md +2 -5
  106. package/skill/agents/diagnosis-analyst.md +163 -0
  107. package/skill/agents/evolution-reviewer.md +149 -0
  108. package/skill/agents/integration-guide.md +154 -0
  109. package/skill/agents/pattern-analyst.md +149 -0
  110. package/skill/assets/multi-skill-settings.json +1 -1
  111. package/skill/assets/single-skill-settings.json +1 -1
  112. package/skill/references/interactive-config.md +39 -0
  113. package/skill/references/invocation-taxonomy.md +34 -0
  114. package/skill/references/logs.md +9 -1
  115. package/skill/references/setup-patterns.md +3 -3
  116. package/skill/settings_snippet.json +1 -1
  117. package/apps/local-dashboard/dist/assets/index-C75H1Q3n.css +0 -1
  118. package/apps/local-dashboard/dist/assets/index-axE4kz3Q.js +0 -15
  119. package/apps/local-dashboard/dist/assets/vendor-react-U7zYD9Rg.js +0 -60
@@ -9,11 +9,17 @@
9
9
  import { copyFileSync, existsSync, readFileSync, writeFileSync } from "node:fs";
10
10
  import { parseArgs } from "node:util";
11
11
 
12
- import { QUERY_LOG, SKILL_LOG, TELEMETRY_LOG } from "../constants.js";
12
+ import { QUERY_LOG, SKILL_LOG } from "../constants.js";
13
13
  import type { BaselineMeasurement } from "../eval/baseline.js";
14
14
  import { measureBaseline } from "../eval/baseline.js";
15
15
  import { buildEvalSet } from "../eval/hooks-to-evals.js";
16
16
  import { readGradingResultsForSkill } from "../grading/results.js";
17
+ import { getDb } from "../localdb/db.js";
18
+ import {
19
+ queryQueryLog,
20
+ querySessionTelemetry,
21
+ querySkillUsageRecords,
22
+ } from "../localdb/queries.js";
17
23
  import { updateContextAfterEvolve } from "../memory/writer.js";
18
24
  import type { SyncResult } from "../sync.js";
19
25
  import type {
@@ -31,10 +37,10 @@ import type {
31
37
  SkillUsageRecord,
32
38
  } from "../types.js";
33
39
  import { parseFrontmatter, replaceFrontmatterDescription } from "../utils/frontmatter.js";
34
- import { readJsonl } from "../utils/jsonl.js";
35
- import { readEffectiveSkillUsageRecords } from "../utils/skill-log.js";
40
+
36
41
  import { createEvolveTUI } from "../utils/tui.js";
37
42
  import { appendAuditEntry } from "./audit.js";
43
+ import { checkConstitution } from "./constitutional.js";
38
44
  import { appendEvidenceEntry } from "./evidence.js";
39
45
  import { extractFailurePatterns } from "./extract-patterns.js";
40
46
  import {
@@ -124,6 +130,7 @@ function createAuditEntry(
124
130
  details: string,
125
131
  evalSnapshot?: EvalPassRate,
126
132
  skillName?: string,
133
+ iterationsUsed?: number,
127
134
  ): EvolutionAuditEntry {
128
135
  return {
129
136
  timestamp: new Date().toISOString(),
@@ -132,6 +139,7 @@ function createAuditEntry(
132
139
  details,
133
140
  ...(skillName ? { skill_name: skillName } : {}),
134
141
  ...(evalSnapshot ? { eval_snapshot: evalSnapshot } : {}),
142
+ ...(iterationsUsed != null ? { iterations_used: iterationsUsed } : {}),
135
143
  };
136
144
  }
137
145
 
@@ -190,7 +198,12 @@ export async function evolve(
190
198
  const _buildEvalSet = _deps.buildEvalSet ?? buildEvalSet;
191
199
  const _updateContextAfterEvolve = _deps.updateContextAfterEvolve ?? updateContextAfterEvolve;
192
200
  const _measureBaseline = _deps.measureBaseline ?? measureBaseline;
193
- const _readSkillUsageLog = _deps.readSkillUsageLog ?? (() => readEffectiveSkillUsageRecords());
201
+ const _readSkillUsageLog =
202
+ _deps.readSkillUsageLog ??
203
+ (() => {
204
+ const db = getDb();
205
+ return querySkillUsageRecords(db) as SkillUsageRecord[];
206
+ });
194
207
 
195
208
  const auditEntries: EvolutionAuditEntry[] = [];
196
209
  let syncResult: SyncResult | undefined;
@@ -200,8 +213,16 @@ export async function evolve(
200
213
  action: EvolutionAuditEntry["action"],
201
214
  details: string,
202
215
  evalSnapshot?: EvalPassRate,
216
+ iterationsUsed?: number,
203
217
  ): void {
204
- const entry = createAuditEntry(proposalId, action, details, evalSnapshot, skillName);
218
+ const entry = createAuditEntry(
219
+ proposalId,
220
+ action,
221
+ details,
222
+ evalSnapshot,
223
+ skillName,
224
+ iterationsUsed,
225
+ );
205
226
  auditEntries.push(entry);
206
227
  try {
207
228
  _appendAuditEntry(entry);
@@ -316,7 +337,8 @@ export async function evolve(
316
337
  }
317
338
  } else {
318
339
  // Build from logs
319
- const queryRecords = readJsonl<QueryLogRecord>(QUERY_LOG);
340
+ const dbForQuery = getDb();
341
+ const queryRecords = queryQueryLog(dbForQuery) as QueryLogRecord[];
320
342
  evalSet = _buildEvalSet(skillUsage, queryRecords, skillName);
321
343
  }
322
344
 
@@ -342,6 +364,33 @@ export async function evolve(
342
364
  `Extracted ${failurePatterns.length} failure pattern(s) (${totalMissed} missed queries)`,
343
365
  );
344
366
 
367
+ // Compute aggregate grading metrics for proposal context
368
+ const aggregateMetrics = options.gradingResults?.length
369
+ ? (() => {
370
+ const scores = options.gradingResults.map(
371
+ (r) => r.summary.mean_score ?? r.summary.pass_rate,
372
+ );
373
+ const meanScore = scores.reduce((a, b) => a + b, 0) / scores.length;
374
+ const scoreStdDev = Math.sqrt(
375
+ scores.reduce((sum, s) => sum + (s - meanScore) ** 2, 0) / scores.length,
376
+ );
377
+ const failedRate =
378
+ options.gradingResults.filter((r) => r.summary.failed > 0).length /
379
+ options.gradingResults.length;
380
+ const errors = options.gradingResults.map(
381
+ (r) => r.execution_metrics?.errors_encountered ?? 0,
382
+ );
383
+ const meanErrors = errors.reduce((a, b) => a + b, 0) / errors.length;
384
+ return {
385
+ mean_score: meanScore,
386
+ score_std_dev: scoreStdDev,
387
+ failed_session_rate: failedRate,
388
+ mean_errors: meanErrors,
389
+ total_graded: options.gradingResults.length,
390
+ };
391
+ })()
392
+ : undefined;
393
+
345
394
  // -----------------------------------------------------------------------
346
395
  // Step 5: Cold-start bootstrap or early exit if no patterns
347
396
  // -----------------------------------------------------------------------
@@ -394,7 +443,12 @@ export async function evolve(
394
443
  const tokenEfficiencyEnabled = options.tokenEfficiencyEnabled ?? false;
395
444
  const telemetryRecords =
396
445
  options.telemetryRecords ??
397
- (tokenEfficiencyEnabled ? readJsonl<SessionTelemetryRecord>(TELEMETRY_LOG) : undefined);
446
+ (tokenEfficiencyEnabled
447
+ ? (() => {
448
+ const dbTel = getDb();
449
+ return querySessionTelemetry(dbTel) as SessionTelemetryRecord[];
450
+ })()
451
+ : undefined);
398
452
 
399
453
  // Compute token efficiency score if enabled and telemetry is available
400
454
  let tokenEffScore: number | undefined;
@@ -407,6 +461,8 @@ export async function evolve(
407
461
  );
408
462
  }
409
463
 
464
+ let iterationsCompleted = 0;
465
+
410
466
  if (paretoEnabled && candidateCount > 1) {
411
467
  // Generate N candidates in parallel
412
468
  const candidates = await generateMultipleProposals(
@@ -418,6 +474,7 @@ export async function evolve(
418
474
  agent,
419
475
  candidateCount,
420
476
  options.proposalModel,
477
+ aggregateMetrics,
421
478
  );
422
479
 
423
480
  // Filter by confidence threshold
@@ -457,6 +514,32 @@ export async function evolve(
457
514
  eval_set: evalSet,
458
515
  });
459
516
 
517
+ // Constitutional check before validation (same gate as retry flow)
518
+ const constitution = checkConstitution(
519
+ proposal.proposed_description,
520
+ currentDescription,
521
+ skillName,
522
+ );
523
+ if (!constitution.passed) {
524
+ const reason = `Constitutional: ${constitution.violations.join("; ")}`;
525
+ recordAudit(proposal.proposal_id, "rejected", reason);
526
+ recordEvidence({
527
+ timestamp: new Date().toISOString(),
528
+ proposal_id: proposal.proposal_id,
529
+ skill_name: skillName,
530
+ skill_path: skillPath,
531
+ target: "description",
532
+ stage: "rejected",
533
+ rationale: proposal.rationale,
534
+ confidence: proposal.confidence,
535
+ details: reason,
536
+ original_text: proposal.original_description,
537
+ proposed_text: proposal.proposed_description,
538
+ eval_set: evalSet,
539
+ });
540
+ continue;
541
+ }
542
+
460
543
  const validation = await _validateProposal(
461
544
  proposal,
462
545
  evalSet,
@@ -521,6 +604,7 @@ export async function evolve(
521
604
 
522
605
  lastProposal = best.proposal;
523
606
  lastValidation = best.validation;
607
+ iterationsCompleted = 1; // Pareto selection is a single-pass
524
608
 
525
609
  // Skip the standard retry loop — we already have our result
526
610
  } else {
@@ -528,6 +612,7 @@ export async function evolve(
528
612
  let feedbackReason = "";
529
613
 
530
614
  for (let iteration = 0; iteration < maxIterations; iteration++) {
615
+ iterationsCompleted = iteration + 1;
531
616
  // Step 7: Generate proposal
532
617
  const effectiveMissedQueries = feedbackReason
533
618
  ? [...missedQueries, `[Previous attempt failed: ${feedbackReason}]`]
@@ -542,6 +627,7 @@ export async function evolve(
542
627
  skillPath,
543
628
  agent,
544
629
  options.proposalModel,
630
+ aggregateMetrics,
545
631
  );
546
632
  llmCallCount++;
547
633
 
@@ -569,6 +655,39 @@ export async function evolve(
569
655
  eval_set: evalSet,
570
656
  });
571
657
 
658
+ // Step 8b: Constitutional check (deterministic, pre-validation)
659
+ const constitution = checkConstitution(
660
+ proposal.proposed_description,
661
+ currentDescription,
662
+ skillName,
663
+ );
664
+ if (!constitution.passed) {
665
+ feedbackReason = `Constitutional: ${constitution.violations.join("; ")}`;
666
+ recordAudit(proposal.proposal_id, "rejected", feedbackReason);
667
+ recordEvidence({
668
+ timestamp: new Date().toISOString(),
669
+ proposal_id: proposal.proposal_id,
670
+ skill_name: skillName,
671
+ skill_path: skillPath,
672
+ target: "description",
673
+ stage: "rejected",
674
+ rationale: proposal.rationale,
675
+ confidence: proposal.confidence,
676
+ details: feedbackReason,
677
+ });
678
+ if (iteration === maxIterations - 1) {
679
+ finishTui();
680
+ return withStats({
681
+ proposal: lastProposal,
682
+ validation: null,
683
+ deployed: false,
684
+ auditEntries,
685
+ reason: feedbackReason,
686
+ });
687
+ }
688
+ continue;
689
+ }
690
+
572
691
  // Step 9: Check confidence threshold
573
692
  if (proposal.confidence < confidenceThreshold) {
574
693
  feedbackReason = `Confidence ${proposal.confidence} below threshold ${confidenceThreshold}`;
@@ -742,6 +861,26 @@ export async function evolve(
742
861
  );
743
862
 
744
863
  if (!baselineResult.adds_value) {
864
+ recordAudit(
865
+ lastProposal.proposal_id,
866
+ "rejected",
867
+ `Baseline gate failed: lift=${baselineResult.lift.toFixed(3)} below 0.05 threshold`,
868
+ );
869
+ recordEvidence({
870
+ timestamp: new Date().toISOString(),
871
+ proposal_id: lastProposal.proposal_id,
872
+ skill_name: skillName,
873
+ skill_path: skillPath,
874
+ target: "description",
875
+ stage: "rejected",
876
+ rationale: lastProposal.rationale,
877
+ confidence: lastProposal.confidence,
878
+ details: `Baseline gate failed: lift=${baselineResult.lift.toFixed(3)} below 0.05 threshold`,
879
+ validation: {
880
+ improved: false,
881
+ net_change: baselineResult.lift,
882
+ },
883
+ });
745
884
  finishTui();
746
885
  return withStats({
747
886
  proposal: lastProposal,
@@ -761,17 +900,37 @@ export async function evolve(
761
900
  if (options.gateModel && lastProposal && lastValidation?.improved) {
762
901
  tui.step(`Gate validation (${options.gateModel})...`);
763
902
  gateValidation = await _gateValidateProposal(lastProposal, evalSet, agent, options.gateModel);
903
+ llmCallCount++;
764
904
  tui.done(
765
905
  `Gate (${options.gateModel}): improved=${gateValidation.improved}, net_change=${gateValidation.net_change.toFixed(3)}`,
766
906
  );
767
907
 
768
- recordAudit(
769
- lastProposal.proposal_id,
770
- "validated",
771
- `Gate validation (${options.gateModel}): improved=${gateValidation.improved}, net_change=${gateValidation.net_change.toFixed(3)}`,
772
- );
773
-
774
908
  if (!gateValidation.improved) {
909
+ recordAudit(
910
+ lastProposal.proposal_id,
911
+ "rejected",
912
+ `Gate validation failed (${options.gateModel}): net_change=${gateValidation.net_change.toFixed(3)}`,
913
+ );
914
+ recordEvidence({
915
+ timestamp: new Date().toISOString(),
916
+ proposal_id: lastProposal.proposal_id,
917
+ skill_name: skillName,
918
+ skill_path: skillPath,
919
+ target: "description",
920
+ stage: "rejected",
921
+ rationale: lastProposal.rationale,
922
+ confidence: lastProposal.confidence,
923
+ details: `Gate validation failed (${options.gateModel}): net_change=${gateValidation.net_change.toFixed(3)}`,
924
+ validation: {
925
+ improved: gateValidation.improved,
926
+ before_pass_rate: gateValidation.before_pass_rate,
927
+ after_pass_rate: gateValidation.after_pass_rate,
928
+ net_change: gateValidation.net_change,
929
+ regressions: gateValidation.regressions,
930
+ new_passes: gateValidation.new_passes,
931
+ per_entry_results: gateValidation.per_entry_results,
932
+ },
933
+ });
775
934
  finishTui();
776
935
  return withStats({
777
936
  proposal: lastProposal,
@@ -783,6 +942,12 @@ export async function evolve(
783
942
  ...(baselineResult ? { baselineResult } : {}),
784
943
  });
785
944
  }
945
+
946
+ recordAudit(
947
+ lastProposal.proposal_id,
948
+ "validated",
949
+ `Gate validation (${options.gateModel}): improved=${gateValidation.improved}, net_change=${gateValidation.net_change.toFixed(3)}`,
950
+ );
786
951
  }
787
952
 
788
953
  // -----------------------------------------------------------------------
@@ -810,12 +975,18 @@ export async function evolve(
810
975
  console.error("------------------------------\n");
811
976
  }
812
977
 
813
- recordAudit(lastProposal.proposal_id, "deployed", `Deployed proposal for ${skillName}`, {
814
- total: evalSet.length,
815
- passed: Math.round(lastValidation.after_pass_rate * evalSet.length),
816
- failed: evalSet.length - Math.round(lastValidation.after_pass_rate * evalSet.length),
817
- pass_rate: lastValidation.after_pass_rate,
818
- });
978
+ recordAudit(
979
+ lastProposal.proposal_id,
980
+ "deployed",
981
+ `Deployed proposal for ${skillName}`,
982
+ {
983
+ total: evalSet.length,
984
+ passed: Math.round(lastValidation.after_pass_rate * evalSet.length),
985
+ failed: evalSet.length - Math.round(lastValidation.after_pass_rate * evalSet.length),
986
+ pass_rate: lastValidation.after_pass_rate,
987
+ },
988
+ iterationsCompleted,
989
+ );
819
990
  recordEvidence({
820
991
  timestamp: new Date().toISOString(),
821
992
  proposal_id: lastProposal.proposal_id,
@@ -1001,7 +1172,8 @@ Options:
1001
1172
 
1002
1173
  // If no eval-set provided, check that log files exist for auto-generation
1003
1174
  if (!evalSetPath && !(values["sync-first"] ?? false)) {
1004
- const hasSkillLog = readEffectiveSkillUsageRecords().length > 0;
1175
+ const dbCheck = getDb();
1176
+ const hasSkillLog = querySkillUsageRecords(dbCheck).length > 0;
1005
1177
  const hasQueryLog = existsSync(QUERY_LOG);
1006
1178
  if (!hasSkillLog && !hasQueryLog) {
1007
1179
  console.error("[ERROR] No eval set provided and no telemetry logs found.");
@@ -1016,7 +1188,8 @@ Options:
1016
1188
  const tokenEfficiencyEnabled = values["token-efficiency"] ?? false;
1017
1189
  let telemetryRecords: SessionTelemetryRecord[] | undefined;
1018
1190
  if (tokenEfficiencyEnabled && !(values["sync-first"] ?? false)) {
1019
- telemetryRecords = readJsonl<SessionTelemetryRecord>(TELEMETRY_LOG);
1191
+ const dbTel2 = getDb();
1192
+ telemetryRecords = querySessionTelemetry(dbTel2) as SessionTelemetryRecord[];
1020
1193
  }
1021
1194
  const gradingResults = readGradingResultsForSkill(values.skill);
1022
1195
 
@@ -1117,7 +1290,7 @@ if (import.meta.main) {
1117
1290
  console.error(
1118
1291
  "\nTroubleshooting:\n" +
1119
1292
  " - Verify --skill-path points to a valid SKILL.md file\n" +
1120
- " - Ensure eval data exists (run `selftune evals` first) or pass --eval-set\n" +
1293
+ " - Ensure eval data exists (run `selftune eval generate` first) or pass --eval-set\n" +
1121
1294
  " - Check that ANTHROPIC_API_KEY is set if using Claude\n" +
1122
1295
  " - Re-run with --verbose for full diagnostic output",
1123
1296
  );
@@ -37,6 +37,15 @@ Do NOT include any text outside the JSON object.`;
37
37
  // Prompt builder
38
38
  // ---------------------------------------------------------------------------
39
39
 
40
+ /** Execution telemetry context for body evolution proposals. */
41
+ export interface ExecutionContext {
42
+ avgToolCalls: number;
43
+ avgErrors: number;
44
+ avgTurns: number;
45
+ commonTools: string[];
46
+ failureTools: string[];
47
+ }
48
+
40
49
  /** Build the user prompt for full body generation. */
41
50
  export function buildBodyGenerationPrompt(
42
51
  currentContent: string,
@@ -44,6 +53,7 @@ export function buildBodyGenerationPrompt(
44
53
  missedQueries: string[],
45
54
  skillName: string,
46
55
  fewShotExamples?: string[],
56
+ executionContext?: ExecutionContext,
47
57
  ): string {
48
58
  const patternLines = failurePatterns.map((p) => {
49
59
  const queries = p.missed_queries.map((q) => ` - "${q}"`).join("\n");
@@ -66,6 +76,11 @@ export function buildBodyGenerationPrompt(
66
76
  const feedbackSection =
67
77
  feedbackLines.length > 0 ? `\n\nStructured Failure Analysis:\n${feedbackLines.join("\n")}` : "";
68
78
 
79
+ // Build execution telemetry section if provided
80
+ const executionSection = executionContext
81
+ ? `\n\nExecution Profile (from recent sessions using this skill):\n Average tool calls per session: ${executionContext.avgToolCalls.toFixed(1)}\n Average errors per session: ${executionContext.avgErrors.toFixed(1)}\n Average assistant turns: ${executionContext.avgTurns.toFixed(1)}\n Most-used tools in successful sessions: ${executionContext.commonTools.join(", ") || "none"}\n Tools correlated with failures: ${executionContext.failureTools.join(", ") || "none"}`
82
+ : "";
83
+
69
84
  // Build few-shot examples section if provided
70
85
  const fewShotSection =
71
86
  fewShotExamples && fewShotExamples.length > 0
@@ -81,7 +96,7 @@ Failure Patterns:
81
96
  ${patternLines.join("\n\n")}
82
97
 
83
98
  All Missed Queries:
84
- ${missedLines}${feedbackSection}${fewShotSection}
99
+ ${missedLines}${feedbackSection}${executionSection}${fewShotSection}
85
100
 
86
101
  Generate an improved full body for the "${skillName}" skill that would correctly handle the missed queries listed above. The body should include everything below the # Title line: description, ## Workflow Routing table, and any other sections. Output ONLY a JSON object with "proposed_body", "rationale", and "confidence" fields.`;
87
102
  }
@@ -144,6 +159,7 @@ export async function generateBodyProposal(
144
159
  agent: string,
145
160
  modelFlag?: string,
146
161
  fewShotExamples?: string[],
162
+ executionContext?: ExecutionContext,
147
163
  ): Promise<BodyEvolutionProposal> {
148
164
  const prompt = buildBodyGenerationPrompt(
149
165
  currentContent,
@@ -151,6 +167,7 @@ export async function generateBodyProposal(
151
167
  missedQueries,
152
168
  skillName,
153
169
  fewShotExamples,
170
+ executionContext,
154
171
  );
155
172
  const rawResponse = await callLlm(BODY_GENERATOR_SYSTEM, prompt, agent, modelFlag);
156
173
  const { proposed_body, rationale, confidence } = parseBodyProposalResponse(rawResponse);
@@ -36,12 +36,22 @@ Do NOT include any text outside the JSON object.`;
36
36
  // Prompt builder
37
37
  // ---------------------------------------------------------------------------
38
38
 
39
+ /** Aggregate session quality metrics passed into proposal prompts. */
40
+ export interface AggregateMetrics {
41
+ mean_score: number;
42
+ score_std_dev: number;
43
+ failed_session_rate: number;
44
+ mean_errors: number;
45
+ total_graded: number;
46
+ }
47
+
39
48
  /** Build the user prompt for the LLM with context about failures. */
40
49
  export function buildProposalPrompt(
41
50
  currentDescription: string,
42
51
  failurePatterns: FailurePattern[],
43
52
  missedQueries: string[],
44
53
  skillName: string,
54
+ aggregateMetrics?: AggregateMetrics,
45
55
  ): string {
46
56
  const patternLines = failurePatterns.map((p) => {
47
57
  const queries = p.missed_queries.map((q) => ` - "${q}"`).join("\n");
@@ -67,6 +77,10 @@ export function buildProposalPrompt(
67
77
  const feedbackSection =
68
78
  feedbackLines.length > 0 ? `\n\nStructured Failure Analysis:\n${feedbackLines.join("\n")}` : "";
69
79
 
80
+ const metricsSection = aggregateMetrics
81
+ ? `\n\nSession Quality Context:\n Mean grading score: ${aggregateMetrics.mean_score.toFixed(2)}/1.0 (σ=${aggregateMetrics.score_std_dev.toFixed(2)})\n Failed session rate: ${(aggregateMetrics.failed_session_rate * 100).toFixed(0)}%\n Mean execution errors per session: ${aggregateMetrics.mean_errors.toFixed(1)}\n Sessions graded: ${aggregateMetrics.total_graded}`
82
+ : "";
83
+
70
84
  return `Skill Name: ${skillName}
71
85
 
72
86
  Current Description:
@@ -76,7 +90,7 @@ Failure Patterns:
76
90
  ${patternLines.join("\n\n")}
77
91
 
78
92
  All Missed Queries:
79
- ${missedLines}${feedbackSection}
93
+ ${missedLines}${feedbackSection}${metricsSection}
80
94
 
81
95
  Propose an improved description for the "${skillName}" skill that would correctly route the missed queries listed above. Output ONLY a JSON object with "proposed_description", "rationale", and "confidence" fields.`;
82
96
  }
@@ -142,6 +156,7 @@ export async function generateMultipleProposals(
142
156
  agent: string,
143
157
  count = 3,
144
158
  modelFlag?: string,
159
+ aggregateMetrics?: AggregateMetrics,
145
160
  ): Promise<EvolutionProposal[]> {
146
161
  const variations = buildPromptVariations(
147
162
  currentDescription,
@@ -149,6 +164,7 @@ export async function generateMultipleProposals(
149
164
  missedQueries,
150
165
  skillName,
151
166
  count,
167
+ aggregateMetrics,
152
168
  );
153
169
 
154
170
  const proposals = await Promise.all(
@@ -187,6 +203,7 @@ export function buildPromptVariations(
187
203
  missedQueries: string[],
188
204
  skillName: string,
189
205
  count: number,
206
+ aggregateMetrics?: AggregateMetrics,
190
207
  ): string[] {
191
208
  const biases: string[] = [
192
209
  "Focus especially on improving explicit invocation (direct mentions of the skill).",
@@ -199,6 +216,7 @@ export function buildPromptVariations(
199
216
  failurePatterns,
200
217
  missedQueries,
201
218
  skillName,
219
+ aggregateMetrics,
202
220
  );
203
221
  const variations: string[] = [];
204
222
 
@@ -219,8 +237,15 @@ export async function generateProposal(
219
237
  skillPath: string,
220
238
  agent: string,
221
239
  modelFlag?: string,
240
+ aggregateMetrics?: AggregateMetrics,
222
241
  ): Promise<EvolutionProposal> {
223
- const prompt = buildProposalPrompt(currentDescription, failurePatterns, missedQueries, skillName);
242
+ const prompt = buildProposalPrompt(
243
+ currentDescription,
244
+ failurePatterns,
245
+ missedQueries,
246
+ skillName,
247
+ aggregateMetrics,
248
+ );
224
249
  const rawResponse = await callLlm(PROPOSER_SYSTEM, prompt, agent, modelFlag);
225
250
  const { proposed_description, rationale, confidence } = parseProposalResponse(rawResponse);
226
251
 
@@ -24,7 +24,7 @@ export interface RollbackOptions {
24
24
  skillName: string;
25
25
  skillPath: string;
26
26
  proposalId?: string; // rollback specific proposal, or last deployed
27
- logPath?: string; // optional override for audit log path (testing)
27
+ logPath?: string; // deprecated ignored, kept for backward compat
28
28
  }
29
29
 
30
30
  export interface RollbackResult {
@@ -71,8 +71,8 @@ function findLatestBackup(skillPath: string): string | null {
71
71
  * Find the "created" audit entry for a given proposal ID and extract
72
72
  * the original_description from its details field.
73
73
  */
74
- function findOriginalFromAudit(proposalId: string, logPath?: string): string | null {
75
- const entries = readAuditTrail(undefined, logPath);
74
+ function findOriginalFromAudit(proposalId: string): string | null {
75
+ const entries = readAuditTrail();
76
76
  const createdEntry = entries.find((e) => e.proposal_id === proposalId && e.action === "created");
77
77
  if (!createdEntry) return null;
78
78
 
@@ -90,12 +90,8 @@ function findOriginalFromAudit(proposalId: string, logPath?: string): string | n
90
90
  /**
91
91
  * Find the deployed audit entry for a specific proposal ID.
92
92
  */
93
- function findDeployedEntry(
94
- proposalId: string,
95
- skillName: string,
96
- logPath?: string,
97
- ): EvolutionAuditEntry | null {
98
- const entries = readAuditTrail(skillName, logPath);
93
+ function findDeployedEntry(proposalId: string, skillName: string): EvolutionAuditEntry | null {
94
+ const entries = readAuditTrail(skillName);
99
95
  return entries.find((e) => e.proposal_id === proposalId && e.action === "deployed") ?? null;
100
96
  }
101
97
 
@@ -104,7 +100,7 @@ function findDeployedEntry(
104
100
  // ---------------------------------------------------------------------------
105
101
 
106
102
  export async function rollback(options: RollbackOptions): Promise<RollbackResult> {
107
- const { skillName, skillPath, proposalId, logPath } = options;
103
+ const { skillName, skillPath, proposalId } = options;
108
104
 
109
105
  const noRollback = (reason: string): RollbackResult => ({
110
106
  rolledBack: false,
@@ -123,14 +119,14 @@ export async function rollback(options: RollbackOptions): Promise<RollbackResult
123
119
 
124
120
  if (proposalId) {
125
121
  // Verify the specific proposal exists in audit trail
126
- const entry = findDeployedEntry(proposalId, skillName, logPath);
122
+ const entry = findDeployedEntry(proposalId, skillName);
127
123
  if (!entry) {
128
124
  return noRollback(`Proposal ${proposalId} not found as deployed entry in audit trail`);
129
125
  }
130
126
  targetProposalId = proposalId;
131
127
  } else {
132
128
  // Use the most recent deployed proposal
133
- const lastDeployed = getLastDeployedProposal(skillName, logPath);
129
+ const lastDeployed = getLastDeployedProposal(skillName);
134
130
  if (!lastDeployed) {
135
131
  return noRollback(`No deployed proposal found for skill "${skillName}"`);
136
132
  }
@@ -152,7 +148,7 @@ export async function rollback(options: RollbackOptions): Promise<RollbackResult
152
148
  action: "rolled_back",
153
149
  details: `Rolled back ${skillName} from backup file`,
154
150
  };
155
- appendAuditEntry(auditEntry, logPath);
151
+ appendAuditEntry(auditEntry);
156
152
 
157
153
  const backupResult: RollbackResult = {
158
154
  rolledBack: true,
@@ -170,7 +166,7 @@ export async function rollback(options: RollbackOptions): Promise<RollbackResult
170
166
  }
171
167
 
172
168
  // Strategy 2: Restore from audit trail's created entry (description only)
173
- const originalFromAudit = findOriginalFromAudit(targetProposalId, logPath);
169
+ const originalFromAudit = findOriginalFromAudit(targetProposalId);
174
170
  if (originalFromAudit) {
175
171
  // Replace only the description section in SKILL.md, preserving structure
176
172
  const currentContent = readFileSync(skillPath, "utf-8");
@@ -184,7 +180,7 @@ export async function rollback(options: RollbackOptions): Promise<RollbackResult
184
180
  action: "rolled_back",
185
181
  details: `Rolled back ${skillName} from audit trail`,
186
182
  };
187
- appendAuditEntry(auditEntry, logPath);
183
+ appendAuditEntry(auditEntry);
188
184
 
189
185
  const auditResult: RollbackResult = {
190
186
  rolledBack: true,