selftune 0.2.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122)
  1. package/.claude/agents/diagnosis-analyst.md +20 -10
  2. package/.claude/agents/evolution-reviewer.md +14 -1
  3. package/.claude/agents/integration-guide.md +18 -6
  4. package/.claude/agents/pattern-analyst.md +18 -5
  5. package/CHANGELOG.md +12 -4
  6. package/README.md +43 -35
  7. package/apps/local-dashboard/dist/assets/geist-cyrillic-wght-normal-CHSlOQsW.woff2 +0 -0
  8. package/apps/local-dashboard/dist/assets/geist-latin-ext-wght-normal-DMtmJ5ZE.woff2 +0 -0
  9. package/apps/local-dashboard/dist/assets/geist-latin-wght-normal-Dm3htQBi.woff2 +0 -0
  10. package/apps/local-dashboard/dist/assets/index-C4EOTFZ2.js +15 -0
  11. package/apps/local-dashboard/dist/assets/index-bl-Webyd.css +1 -0
  12. package/apps/local-dashboard/dist/assets/vendor-react-U7zYD9Rg.js +60 -0
  13. package/apps/local-dashboard/dist/assets/vendor-table-B7VF2Ipl.js +26 -0
  14. package/apps/local-dashboard/dist/assets/vendor-ui-D7_zX_qy.js +346 -0
  15. package/apps/local-dashboard/dist/favicon.png +0 -0
  16. package/apps/local-dashboard/dist/index.html +17 -0
  17. package/apps/local-dashboard/dist/logo.png +0 -0
  18. package/apps/local-dashboard/dist/logo.svg +9 -0
  19. package/cli/selftune/badge/badge-data.ts +1 -1
  20. package/cli/selftune/badge/badge.ts +4 -8
  21. package/cli/selftune/canonical-export.ts +183 -0
  22. package/cli/selftune/constants.ts +28 -0
  23. package/cli/selftune/contribute/contribute.ts +1 -1
  24. package/cli/selftune/cron/setup.ts +17 -17
  25. package/cli/selftune/dashboard-contract.ts +202 -0
  26. package/cli/selftune/dashboard-server.ts +653 -186
  27. package/cli/selftune/dashboard.ts +41 -176
  28. package/cli/selftune/eval/baseline.ts +5 -4
  29. package/cli/selftune/eval/composability-v2.ts +273 -0
  30. package/cli/selftune/eval/hooks-to-evals.ts +34 -15
  31. package/cli/selftune/eval/unit-test-cli.ts +1 -1
  32. package/cli/selftune/evolution/evidence.ts +26 -0
  33. package/cli/selftune/evolution/evolve-body.ts +105 -11
  34. package/cli/selftune/evolution/evolve.ts +371 -25
  35. package/cli/selftune/evolution/extract-patterns.ts +87 -29
  36. package/cli/selftune/evolution/rollback.ts +2 -2
  37. package/cli/selftune/grading/auto-grade.ts +200 -0
  38. package/cli/selftune/grading/grade-session.ts +448 -97
  39. package/cli/selftune/grading/results.ts +42 -0
  40. package/cli/selftune/hooks/prompt-log.ts +172 -2
  41. package/cli/selftune/hooks/session-stop.ts +123 -3
  42. package/cli/selftune/hooks/skill-eval.ts +119 -3
  43. package/cli/selftune/index.ts +395 -116
  44. package/cli/selftune/ingestors/claude-replay.ts +140 -114
  45. package/cli/selftune/ingestors/codex-rollout.ts +345 -46
  46. package/cli/selftune/ingestors/codex-wrapper.ts +207 -39
  47. package/cli/selftune/ingestors/openclaw-ingest.ts +141 -8
  48. package/cli/selftune/ingestors/opencode-ingest.ts +193 -17
  49. package/cli/selftune/init.ts +227 -14
  50. package/cli/selftune/last.ts +14 -5
  51. package/cli/selftune/localdb/db.ts +63 -0
  52. package/cli/selftune/localdb/materialize.ts +428 -0
  53. package/cli/selftune/localdb/queries.ts +376 -0
  54. package/cli/selftune/localdb/schema.ts +204 -0
  55. package/cli/selftune/monitoring/watch.ts +66 -15
  56. package/cli/selftune/normalization.ts +682 -0
  57. package/cli/selftune/observability.ts +19 -44
  58. package/cli/selftune/orchestrate.ts +1073 -0
  59. package/cli/selftune/quickstart.ts +203 -0
  60. package/cli/selftune/repair/skill-usage.ts +576 -0
  61. package/cli/selftune/schedule.ts +561 -0
  62. package/cli/selftune/status.ts +48 -26
  63. package/cli/selftune/sync.ts +627 -0
  64. package/cli/selftune/types.ts +148 -0
  65. package/cli/selftune/utils/canonical-log.ts +45 -0
  66. package/cli/selftune/utils/hooks.ts +41 -0
  67. package/cli/selftune/utils/html.ts +27 -0
  68. package/cli/selftune/utils/llm-call.ts +78 -20
  69. package/cli/selftune/utils/math.ts +10 -0
  70. package/cli/selftune/utils/query-filter.ts +139 -0
  71. package/cli/selftune/utils/skill-discovery.ts +340 -0
  72. package/cli/selftune/utils/skill-log.ts +68 -0
  73. package/cli/selftune/utils/skill-usage-confidence.ts +18 -0
  74. package/cli/selftune/utils/transcript.ts +272 -26
  75. package/cli/selftune/workflows/discover.ts +254 -0
  76. package/cli/selftune/workflows/skill-md-writer.ts +288 -0
  77. package/cli/selftune/workflows/workflows.ts +188 -0
  78. package/package.json +21 -8
  79. package/packages/telemetry-contract/README.md +11 -0
  80. package/packages/telemetry-contract/fixtures/golden.json +87 -0
  81. package/packages/telemetry-contract/fixtures/golden.test.ts +42 -0
  82. package/packages/telemetry-contract/index.ts +1 -0
  83. package/packages/telemetry-contract/package.json +19 -0
  84. package/packages/telemetry-contract/src/index.ts +2 -0
  85. package/packages/telemetry-contract/src/types.ts +163 -0
  86. package/packages/telemetry-contract/src/validators.ts +109 -0
  87. package/skill/SKILL.md +84 -53
  88. package/skill/Workflows/AutoActivation.md +17 -16
  89. package/skill/Workflows/Badge.md +6 -0
  90. package/skill/Workflows/Baseline.md +46 -23
  91. package/skill/Workflows/Composability.md +12 -5
  92. package/skill/Workflows/Contribute.md +17 -14
  93. package/skill/Workflows/Cron.md +56 -79
  94. package/skill/Workflows/Dashboard.md +45 -34
  95. package/skill/Workflows/Doctor.md +30 -17
  96. package/skill/Workflows/Evals.md +64 -40
  97. package/skill/Workflows/EvolutionMemory.md +2 -0
  98. package/skill/Workflows/Evolve.md +102 -47
  99. package/skill/Workflows/EvolveBody.md +6 -6
  100. package/skill/Workflows/Grade.md +36 -31
  101. package/skill/Workflows/ImportSkillsBench.md +11 -5
  102. package/skill/Workflows/Ingest.md +43 -36
  103. package/skill/Workflows/Initialize.md +44 -30
  104. package/skill/Workflows/Orchestrate.md +139 -0
  105. package/skill/Workflows/Replay.md +39 -18
  106. package/skill/Workflows/Rollback.md +3 -3
  107. package/skill/Workflows/Schedule.md +61 -0
  108. package/skill/Workflows/Sync.md +88 -0
  109. package/skill/Workflows/UnitTest.md +34 -22
  110. package/skill/Workflows/Watch.md +14 -4
  111. package/skill/Workflows/Workflows.md +129 -0
  112. package/skill/assets/activation-rules-default.json +26 -0
  113. package/skill/assets/multi-skill-settings.json +63 -0
  114. package/skill/assets/single-skill-settings.json +57 -0
  115. package/skill/references/invocation-taxonomy.md +2 -2
  116. package/skill/references/logs.md +164 -2
  117. package/skill/references/setup-patterns.md +65 -0
  118. package/skill/references/version-history.md +40 -0
  119. package/skill/settings_snippet.json +1 -1
  120. package/templates/multi-skill-settings.json +7 -7
  121. package/templates/single-skill-settings.json +6 -6
  122. package/dashboard/index.html +0 -1680
@@ -27,7 +27,13 @@ import type {
27
27
  } from "../types.js";
28
28
  import { readJsonl } from "../utils/jsonl.js";
29
29
  import { detectAgent } from "../utils/llm-call.js";
30
+ import {
31
+ filterActionableQueryRecords,
32
+ filterActionableSkillUsageRecords,
33
+ } from "../utils/query-filter.js";
30
34
  import { seededShuffle } from "../utils/seeded-random.js";
35
+ import { readEffectiveSkillUsageRecords } from "../utils/skill-log.js";
36
+ import { isHighConfidencePositiveSkillRecord } from "../utils/skill-usage-confidence.js";
31
37
  import { generateSyntheticEvals } from "./synthetic-evals.js";
32
38
 
33
39
  // ---------------------------------------------------------------------------
@@ -116,14 +122,16 @@ export function buildEvalSet(
116
122
  seed = 42,
117
123
  annotateTaxonomy = true,
118
124
  ): EvalEntry[] {
125
+ const actionableSkillRecords = filterActionableSkillUsageRecords(skillRecords);
126
+ const actionableQueryRecords = filterActionableQueryRecords(queryRecords);
119
127
  const effectiveMaxPerSide = Number.isNaN(maxPerSide) || maxPerSide <= 0 ? 50 : maxPerSide;
120
128
  const effectiveSeed = Number.isNaN(seed) ? 42 : seed;
121
129
 
122
130
  // Build set of positive query texts (for exclusion from negatives)
123
131
  const positiveQueries = new Set<string>();
124
- for (const r of skillRecords) {
132
+ for (const r of actionableSkillRecords) {
125
133
  if (!r || typeof r.skill_name !== "string" || typeof r.query !== "string") continue;
126
- if (r.skill_name === skillName) {
134
+ if (isHighConfidencePositiveSkillRecord(r, skillName)) {
127
135
  const q = (r.query ?? "").trim();
128
136
  if (q && q !== "(query not found)") {
129
137
  positiveQueries.add(q);
@@ -134,9 +142,9 @@ export function buildEvalSet(
134
142
  // Build deduplicated positives with taxonomy classification
135
143
  const seen = new Set<string>();
136
144
  const positives: EvalEntry[] = [];
137
- for (const r of skillRecords) {
145
+ for (const r of actionableSkillRecords) {
138
146
  if (!r || typeof r.skill_name !== "string" || typeof r.query !== "string") continue;
139
- if (r.skill_name !== skillName) continue;
147
+ if (!isHighConfidencePositiveSkillRecord(r, skillName)) continue;
140
148
  const q = (r.query ?? "").trim();
141
149
  if (!q || q === "(query not found)" || seen.has(q)) continue;
142
150
  seen.add(q);
@@ -153,7 +161,7 @@ export function buildEvalSet(
153
161
  if (includeNegatives) {
154
162
  const negCandidates: string[] = [];
155
163
  const negSeen = new Set<string>();
156
- for (const r of queryRecords) {
164
+ for (const r of actionableQueryRecords) {
157
165
  if (!r || typeof r.query !== "string") continue;
158
166
  const q = (r.query ?? "").trim();
159
167
  if (!q || positiveQueries.has(q) || negSeen.has(q)) continue;
@@ -198,13 +206,17 @@ export function listSkills(
198
206
  queryRecords: QueryLogRecord[],
199
207
  telemetryRecords: SessionTelemetryRecord[],
200
208
  ): void {
209
+ const actionableSkillRecords = filterActionableSkillUsageRecords(skillRecords);
210
+ const actionableQueryRecords = filterActionableQueryRecords(queryRecords);
201
211
  const counts = new Map<string, number>();
202
- for (const r of skillRecords) {
212
+ for (const r of actionableSkillRecords) {
203
213
  const name = r.skill_name ?? "unknown";
204
214
  counts.set(name, (counts.get(name) ?? 0) + 1);
205
215
  }
206
216
 
207
- console.log(`Skill triggers in skill_usage_log (${skillRecords.length} total records):`);
217
+ console.log(
218
+ `Skill triggers in skill_usage_log (${actionableSkillRecords.length} actionable records):`,
219
+ );
208
220
  if (counts.size > 0) {
209
221
  const sorted = [...counts.entries()].sort((a, b) => b[1] - a[1]);
210
222
  for (const [name, count] of sorted) {
@@ -214,8 +226,8 @@ export function listSkills(
214
226
  console.log(" (none yet -- trigger some skills in Claude Code to populate)");
215
227
  }
216
228
 
217
- console.log(`\nAll queries in all_queries_log: ${queryRecords.length}`);
218
- if (queryRecords.length === 0) {
229
+ console.log(`\nActionable queries in all_queries_log: ${actionableQueryRecords.length}`);
230
+ if (actionableQueryRecords.length === 0) {
219
231
  console.log(" (none yet -- make sure prompt_log_hook is installed)");
220
232
  }
221
233
 
@@ -303,14 +315,16 @@ export function printEvalStats(
303
315
  ): void {
304
316
  const pos = evalSet.filter((e) => e.should_trigger);
305
317
  const neg = evalSet.filter((e) => !e.should_trigger);
306
- const totalTriggers = skillRecords.filter((r) => r.skill_name === skillName).length;
318
+ const actionableSkillRecords = filterActionableSkillUsageRecords(skillRecords);
319
+ const actionableQueryRecords = filterActionableQueryRecords(queryRecords);
320
+ const totalTriggers = actionableSkillRecords.filter((r) => r.skill_name === skillName).length;
307
321
 
308
322
  console.log(`Wrote ${evalSet.length} eval entries to ${outputPath}`);
309
323
  console.log(
310
324
  ` Positives (should_trigger=true) : ${pos.length} (from ${totalTriggers} logged triggers)`,
311
325
  );
312
326
  console.log(
313
- ` Negatives (should_trigger=false): ${neg.length} (from ${queryRecords.length} total logged queries)`,
327
+ ` Negatives (should_trigger=false): ${neg.length} (from ${actionableQueryRecords.length} actionable logged queries)`,
314
328
  );
315
329
 
316
330
  if (annotateTaxonomy && pos.length > 0) {
@@ -336,7 +350,7 @@ export function printEvalStats(
336
350
  console.log();
337
351
  if (pos.length === 0) {
338
352
  console.log(`[WARN] No positives for skill '${skillName}'.`);
339
- const names = [...new Set(skillRecords.map((r) => r.skill_name))].sort();
353
+ const names = [...new Set(actionableSkillRecords.map((r) => r.skill_name))].sort();
340
354
  if (names.length > 0) {
341
355
  console.log(` Known skills: ${names.join(", ")}`);
342
356
  }
@@ -366,6 +380,7 @@ export async function cliMain(): Promise<void> {
366
380
  options: {
367
381
  skill: { type: "string" },
368
382
  output: { type: "string" },
383
+ out: { type: "string" },
369
384
  max: { type: "string", default: "50" },
370
385
  seed: { type: "string", default: "42" },
371
386
  "list-skills": { type: "boolean", default: false },
@@ -409,7 +424,7 @@ export async function cliMain(): Promise<void> {
409
424
  modelFlag: values.model,
410
425
  });
411
426
 
412
- const outputPath = values.output ?? `${values.skill}_trigger_eval.json`;
427
+ const outputPath = values.output ?? values.out ?? `${values.skill}_trigger_eval.json`;
413
428
  writeFileSync(outputPath, JSON.stringify(evalSet, null, 2), "utf-8");
414
429
 
415
430
  const pos = evalSet.filter((e) => e.should_trigger);
@@ -440,7 +455,11 @@ export async function cliMain(): Promise<void> {
440
455
  }
441
456
 
442
457
  // --- Log-based mode (original behavior) ---
443
- const skillRecords = readJsonl<SkillUsageRecord>(values["skill-log"] ?? SKILL_LOG);
458
+ const skillLogPath = values["skill-log"] ?? SKILL_LOG;
459
+ const skillRecords =
460
+ skillLogPath === SKILL_LOG
461
+ ? readEffectiveSkillUsageRecords()
462
+ : readJsonl<SkillUsageRecord>(skillLogPath);
444
463
  const queryRecords = readJsonl<QueryLogRecord>(values["query-log"] ?? QUERY_LOG);
445
464
  const telemetryRecords = readJsonl<SessionTelemetryRecord>(
446
465
  values["telemetry-log"] ?? TELEMETRY_LOG,
@@ -475,7 +494,7 @@ export async function cliMain(): Promise<void> {
475
494
  annotateTaxonomy,
476
495
  );
477
496
 
478
- const outputPath = values.output ?? `${values.skill}_trigger_eval.json`;
497
+ const outputPath = values.output ?? values.out ?? `${values.skill}_trigger_eval.json`;
479
498
  writeFileSync(outputPath, JSON.stringify(evalSet, null, 2), "utf-8");
480
499
  printEvalStats(evalSet, values.skill, outputPath, skillRecords, queryRecords, annotateTaxonomy);
481
500
  }
@@ -2,7 +2,7 @@
2
2
  * CLI entrypoint for skill unit tests.
3
3
  *
4
4
  * Usage:
5
- * selftune unit-test --skill <name> --tests <path> [--run-agent] [--generate]
5
+ * selftune eval unit-test --skill <name> --tests <path> [--run-agent] [--generate]
6
6
  *
7
7
  * --skill <name> Skill name (required)
8
8
  * --tests <path> Path to unit test JSON file (default: ~/.selftune/unit-tests/<skill>.json)
@@ -0,0 +1,26 @@
1
+ /**
2
+ * Evolution evidence trail: append and read proposal/eval artifacts that power
3
+ * explainable dashboard drill-downs.
4
+ */
5
+
6
+ import { EVOLUTION_EVIDENCE_LOG } from "../constants.js";
7
+ import type { EvolutionEvidenceEntry } from "../types.js";
8
+ import { appendJsonl, readJsonl } from "../utils/jsonl.js";
9
+
10
+ /** Append a structured evidence artifact to the evolution evidence log. */
11
+ export function appendEvidenceEntry(
12
+ entry: EvolutionEvidenceEntry,
13
+ logPath: string = EVOLUTION_EVIDENCE_LOG,
14
+ ): void {
15
+ appendJsonl(logPath, entry);
16
+ }
17
+
18
+ /** Read all evidence entries, optionally filtered by exact skill name. */
19
+ export function readEvidenceTrail(
20
+ skillName?: string,
21
+ logPath: string = EVOLUTION_EVIDENCE_LOG,
22
+ ): EvolutionEvidenceEntry[] {
23
+ const entries = readJsonl<EvolutionEvidenceEntry>(logPath);
24
+ if (!skillName) return entries;
25
+ return entries.filter((entry) => entry.skill_name === skillName);
26
+ }
@@ -9,13 +9,15 @@
9
9
  import { existsSync, readFileSync } from "node:fs";
10
10
  import { parseArgs } from "node:util";
11
11
 
12
- import { QUERY_LOG, SKILL_LOG } from "../constants.js";
12
+ import { QUERY_LOG } from "../constants.js";
13
13
  import { buildEvalSet } from "../eval/hooks-to-evals.js";
14
+ import { readGradingResultsForSkill } from "../grading/results.js";
14
15
  import type {
15
16
  BodyEvolutionProposal,
16
17
  BodyValidationResult,
17
18
  EvalEntry,
18
19
  EvolutionAuditEntry,
20
+ EvolutionEvidenceEntry,
19
21
  EvolutionTarget,
20
22
  FailurePattern,
21
23
  GradingResult,
@@ -23,8 +25,10 @@ import type {
23
25
  SkillUsageRecord,
24
26
  } from "../types.js";
25
27
  import { readJsonl } from "../utils/jsonl.js";
28
+ import { readEffectiveSkillUsageRecords } from "../utils/skill-log.js";
26
29
  import { appendAuditEntry } from "./audit.js";
27
30
  import { parseSkillSections, replaceBody, replaceSection } from "./deploy-proposal.js";
31
+ import { appendEvidenceEntry } from "./evidence.js";
28
32
  import { extractFailurePatterns } from "./extract-patterns.js";
29
33
  import { generateBodyProposal } from "./propose-body.js";
30
34
  import { generateRoutingProposal } from "./propose-routing.js";
@@ -79,7 +83,9 @@ export interface EvolveBodyDeps {
79
83
  validateRoutingProposal?: typeof import("./validate-routing.js").validateRoutingProposal;
80
84
  refineBodyProposal?: typeof import("./refine-body.js").refineBodyProposal;
81
85
  appendAuditEntry?: typeof import("./audit.js").appendAuditEntry;
86
+ appendEvidenceEntry?: typeof import("./evidence.js").appendEvidenceEntry;
82
87
  buildEvalSet?: typeof import("../eval/hooks-to-evals.js").buildEvalSet;
88
+ readEffectiveSkillUsageRecords?: typeof import("../utils/skill-log.js").readEffectiveSkillUsageRecords;
83
89
  readFileSync?: typeof readFileSync;
84
90
  writeFileSync?: (path: string, data: string, encoding: string) => void;
85
91
  }
@@ -134,7 +140,10 @@ export async function evolveBody(
134
140
  const _validateRoutingProposal = _deps.validateRoutingProposal ?? validateRoutingProposal;
135
141
  const _refineBodyProposal = _deps.refineBodyProposal ?? refineBodyProposal;
136
142
  const _appendAuditEntry = _deps.appendAuditEntry ?? appendAuditEntry;
143
+ const _appendEvidenceEntry = _deps.appendEvidenceEntry ?? appendEvidenceEntry;
137
144
  const _buildEvalSet = _deps.buildEvalSet ?? buildEvalSet;
145
+ const _readEffectiveSkillUsageRecords =
146
+ _deps.readEffectiveSkillUsageRecords ?? readEffectiveSkillUsageRecords;
138
147
  const _readFileSync = _deps.readFileSync ?? readFileSync;
139
148
  const _writeFileSync = _deps.writeFileSync ?? (await import("node:fs")).writeFileSync;
140
149
 
@@ -154,6 +163,14 @@ export async function evolveBody(
154
163
  }
155
164
  }
156
165
 
166
+ function recordEvidence(entry: EvolutionEvidenceEntry): void {
167
+ try {
168
+ _appendEvidenceEntry(entry);
169
+ } catch {
170
+ // Fail-open
171
+ }
172
+ }
173
+
157
174
  try {
158
175
  // Step 1: Read current SKILL.md
159
176
  if (!existsSync(skillPath)) {
@@ -168,6 +185,8 @@ export async function evolveBody(
168
185
 
169
186
  const currentContent = _readFileSync(skillPath, "utf-8");
170
187
  const parsed = parseSkillSections(currentContent);
188
+ const createdAuditDetails = (): string => `original_description:${currentContent}`;
189
+ const skillUsage = _readEffectiveSkillUsageRecords();
171
190
 
172
191
  // Step 2: Load eval set
173
192
  let evalSet: EvalEntry[];
@@ -179,13 +198,11 @@ export async function evolveBody(
179
198
  }
180
199
  evalSet = parsed as EvalEntry[];
181
200
  } else {
182
- const skillRecords = readJsonl<SkillUsageRecord>(SKILL_LOG);
183
201
  const queryRecords = readJsonl<QueryLogRecord>(QUERY_LOG);
184
- evalSet = _buildEvalSet(skillRecords, queryRecords, skillName);
202
+ evalSet = _buildEvalSet(skillUsage, queryRecords, skillName);
185
203
  }
186
204
 
187
205
  // Step 3: Load skill usage and extract failure patterns
188
- const skillUsage = readJsonl<SkillUsageRecord>(SKILL_LOG);
189
206
  const failurePatterns = _extractFailurePatterns(
190
207
  evalSet,
191
208
  skillUsage,
@@ -252,11 +269,21 @@ export async function evolveBody(
252
269
 
253
270
  lastProposal = proposal;
254
271
 
255
- recordAudit(
256
- proposal.proposal_id,
257
- "created",
258
- `${target} proposal created for ${skillName} (iteration ${iteration + 1})`,
259
- );
272
+ recordAudit(proposal.proposal_id, "created", createdAuditDetails());
273
+ recordEvidence({
274
+ timestamp: new Date().toISOString(),
275
+ proposal_id: proposal.proposal_id,
276
+ skill_name: skillName,
277
+ skill_path: skillPath,
278
+ target,
279
+ stage: "created",
280
+ rationale: proposal.rationale,
281
+ confidence: proposal.confidence,
282
+ details: `${target} proposal created for ${skillName} (iteration ${iteration + 1})`,
283
+ original_text: proposal.original_body,
284
+ proposed_text: proposal.proposed_body,
285
+ eval_set: evalSet,
286
+ });
260
287
 
261
288
  // Check confidence threshold
262
289
  if (proposal.confidence < confidenceThreshold) {
@@ -265,6 +292,17 @@ export async function evolveBody(
265
292
  "rejected",
266
293
  `Confidence ${proposal.confidence} below threshold ${confidenceThreshold}`,
267
294
  );
295
+ recordEvidence({
296
+ timestamp: new Date().toISOString(),
297
+ proposal_id: proposal.proposal_id,
298
+ skill_name: skillName,
299
+ skill_path: skillPath,
300
+ target,
301
+ stage: "rejected",
302
+ rationale: proposal.rationale,
303
+ confidence: proposal.confidence,
304
+ details: `Confidence ${proposal.confidence} below threshold ${confidenceThreshold}`,
305
+ });
268
306
 
269
307
  if (iteration === maxIterations - 1) {
270
308
  return {
@@ -303,6 +341,24 @@ export async function evolveBody(
303
341
  "validated",
304
342
  `Validation: ${validation.gates_passed}/${validation.gates_total} gates passed`,
305
343
  );
344
+ recordEvidence({
345
+ timestamp: new Date().toISOString(),
346
+ proposal_id: proposal.proposal_id,
347
+ skill_name: skillName,
348
+ skill_path: skillPath,
349
+ target,
350
+ stage: "validated",
351
+ rationale: proposal.rationale,
352
+ confidence: proposal.confidence,
353
+ details: `Validation: ${validation.gates_passed}/${validation.gates_total} gates passed`,
354
+ validation: {
355
+ improved: validation.improved,
356
+ gates_passed: validation.gates_passed,
357
+ gates_total: validation.gates_total,
358
+ gate_results: validation.gate_results,
359
+ regressions: validation.regressions,
360
+ },
361
+ });
306
362
 
307
363
  if (validation.improved) {
308
364
  break;
@@ -313,6 +369,24 @@ export async function evolveBody(
313
369
  "rejected",
314
370
  `Validation failed: ${validation.gates_passed}/${validation.gates_total} gates`,
315
371
  );
372
+ recordEvidence({
373
+ timestamp: new Date().toISOString(),
374
+ proposal_id: proposal.proposal_id,
375
+ skill_name: skillName,
376
+ skill_path: skillPath,
377
+ target,
378
+ stage: "rejected",
379
+ rationale: proposal.rationale,
380
+ confidence: proposal.confidence,
381
+ details: `Validation failed: ${validation.gates_passed}/${validation.gates_total} gates`,
382
+ validation: {
383
+ improved: validation.improved,
384
+ gates_passed: validation.gates_passed,
385
+ gates_total: validation.gates_total,
386
+ gate_results: validation.gate_results,
387
+ regressions: validation.regressions,
388
+ },
389
+ });
316
390
 
317
391
  if (iteration === maxIterations - 1) {
318
392
  return {
@@ -355,6 +429,24 @@ export async function evolveBody(
355
429
  "deployed",
356
430
  `Deployed ${target} proposal for ${skillName}`,
357
431
  );
432
+ recordEvidence({
433
+ timestamp: new Date().toISOString(),
434
+ proposal_id: lastProposal.proposal_id,
435
+ skill_name: skillName,
436
+ skill_path: skillPath,
437
+ target,
438
+ stage: "deployed",
439
+ rationale: lastProposal.rationale,
440
+ confidence: lastProposal.confidence,
441
+ details: `Deployed ${target} proposal for ${skillName}`,
442
+ validation: {
443
+ improved: lastValidation.improved,
444
+ gates_passed: lastValidation.gates_passed,
445
+ gates_total: lastValidation.gates_total,
446
+ gate_results: lastValidation.gate_results,
447
+ regressions: lastValidation.regressions,
448
+ },
449
+ });
358
450
 
359
451
  return {
360
452
  proposal: lastProposal,
@@ -411,10 +503,10 @@ export async function cliMain(): Promise<void> {
411
503
  });
412
504
 
413
505
  if (values.help) {
414
- console.log(`selftune evolve-body — Evolve a skill body or routing table
506
+ console.log(`selftune evolve body — Evolve a skill body or routing table
415
507
 
416
508
  Usage:
417
- selftune evolve-body --skill <name> --skill-path <path> [options]
509
+ selftune evolve body --skill <name> --skill-path <path> [options]
418
510
 
419
511
  Options:
420
512
  --skill Skill name (required)
@@ -462,6 +554,7 @@ Options:
462
554
  const paths = values["few-shot"].split(",").map((p) => p.trim());
463
555
  fewShotExamples = paths.filter((p) => existsSync(p)).map((p) => readFileSync(p, "utf-8"));
464
556
  }
557
+ const gradingResults = readGradingResultsForSkill(values.skill);
465
558
 
466
559
  const result = await evolveBody({
467
560
  skillName: values.skill,
@@ -477,6 +570,7 @@ Options:
477
570
  confidenceThreshold: Number.parseFloat(values.confidence ?? "0.6"),
478
571
  taskDescription: values["task-description"],
479
572
  fewShotExamples,
573
+ gradingResults,
480
574
  validationModel: values["validation-model"],
481
575
  });
482
576