selftune 0.2.0 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122)
  1. package/.claude/agents/diagnosis-analyst.md +20 -10
  2. package/.claude/agents/evolution-reviewer.md +14 -1
  3. package/.claude/agents/integration-guide.md +18 -6
  4. package/.claude/agents/pattern-analyst.md +18 -5
  5. package/CHANGELOG.md +12 -4
  6. package/README.md +43 -35
  7. package/apps/local-dashboard/dist/assets/geist-cyrillic-wght-normal-CHSlOQsW.woff2 +0 -0
  8. package/apps/local-dashboard/dist/assets/geist-latin-ext-wght-normal-DMtmJ5ZE.woff2 +0 -0
  9. package/apps/local-dashboard/dist/assets/geist-latin-wght-normal-Dm3htQBi.woff2 +0 -0
  10. package/apps/local-dashboard/dist/assets/index-C4EOTFZ2.js +15 -0
  11. package/apps/local-dashboard/dist/assets/index-bl-Webyd.css +1 -0
  12. package/apps/local-dashboard/dist/assets/vendor-react-U7zYD9Rg.js +60 -0
  13. package/apps/local-dashboard/dist/assets/vendor-table-B7VF2Ipl.js +26 -0
  14. package/apps/local-dashboard/dist/assets/vendor-ui-D7_zX_qy.js +346 -0
  15. package/apps/local-dashboard/dist/favicon.png +0 -0
  16. package/apps/local-dashboard/dist/index.html +17 -0
  17. package/apps/local-dashboard/dist/logo.png +0 -0
  18. package/apps/local-dashboard/dist/logo.svg +9 -0
  19. package/cli/selftune/badge/badge-data.ts +1 -1
  20. package/cli/selftune/badge/badge.ts +4 -8
  21. package/cli/selftune/canonical-export.ts +183 -0
  22. package/cli/selftune/constants.ts +28 -0
  23. package/cli/selftune/contribute/contribute.ts +1 -1
  24. package/cli/selftune/cron/setup.ts +17 -17
  25. package/cli/selftune/dashboard-contract.ts +202 -0
  26. package/cli/selftune/dashboard-server.ts +653 -186
  27. package/cli/selftune/dashboard.ts +41 -176
  28. package/cli/selftune/eval/baseline.ts +5 -4
  29. package/cli/selftune/eval/composability-v2.ts +273 -0
  30. package/cli/selftune/eval/hooks-to-evals.ts +34 -15
  31. package/cli/selftune/eval/unit-test-cli.ts +1 -1
  32. package/cli/selftune/evolution/evidence.ts +26 -0
  33. package/cli/selftune/evolution/evolve-body.ts +105 -11
  34. package/cli/selftune/evolution/evolve.ts +371 -25
  35. package/cli/selftune/evolution/extract-patterns.ts +87 -29
  36. package/cli/selftune/evolution/rollback.ts +2 -2
  37. package/cli/selftune/grading/auto-grade.ts +200 -0
  38. package/cli/selftune/grading/grade-session.ts +448 -97
  39. package/cli/selftune/grading/results.ts +42 -0
  40. package/cli/selftune/hooks/prompt-log.ts +172 -2
  41. package/cli/selftune/hooks/session-stop.ts +123 -3
  42. package/cli/selftune/hooks/skill-eval.ts +119 -3
  43. package/cli/selftune/index.ts +395 -116
  44. package/cli/selftune/ingestors/claude-replay.ts +140 -114
  45. package/cli/selftune/ingestors/codex-rollout.ts +345 -46
  46. package/cli/selftune/ingestors/codex-wrapper.ts +207 -39
  47. package/cli/selftune/ingestors/openclaw-ingest.ts +141 -8
  48. package/cli/selftune/ingestors/opencode-ingest.ts +193 -17
  49. package/cli/selftune/init.ts +227 -14
  50. package/cli/selftune/last.ts +14 -5
  51. package/cli/selftune/localdb/db.ts +63 -0
  52. package/cli/selftune/localdb/materialize.ts +428 -0
  53. package/cli/selftune/localdb/queries.ts +376 -0
  54. package/cli/selftune/localdb/schema.ts +204 -0
  55. package/cli/selftune/monitoring/watch.ts +66 -15
  56. package/cli/selftune/normalization.ts +682 -0
  57. package/cli/selftune/observability.ts +19 -44
  58. package/cli/selftune/orchestrate.ts +1073 -0
  59. package/cli/selftune/quickstart.ts +203 -0
  60. package/cli/selftune/repair/skill-usage.ts +576 -0
  61. package/cli/selftune/schedule.ts +561 -0
  62. package/cli/selftune/status.ts +48 -26
  63. package/cli/selftune/sync.ts +627 -0
  64. package/cli/selftune/types.ts +148 -0
  65. package/cli/selftune/utils/canonical-log.ts +45 -0
  66. package/cli/selftune/utils/hooks.ts +41 -0
  67. package/cli/selftune/utils/html.ts +27 -0
  68. package/cli/selftune/utils/llm-call.ts +78 -20
  69. package/cli/selftune/utils/math.ts +10 -0
  70. package/cli/selftune/utils/query-filter.ts +139 -0
  71. package/cli/selftune/utils/skill-discovery.ts +340 -0
  72. package/cli/selftune/utils/skill-log.ts +68 -0
  73. package/cli/selftune/utils/skill-usage-confidence.ts +18 -0
  74. package/cli/selftune/utils/transcript.ts +272 -26
  75. package/cli/selftune/workflows/discover.ts +254 -0
  76. package/cli/selftune/workflows/skill-md-writer.ts +288 -0
  77. package/cli/selftune/workflows/workflows.ts +188 -0
  78. package/package.json +21 -8
  79. package/packages/telemetry-contract/README.md +11 -0
  80. package/packages/telemetry-contract/fixtures/golden.json +87 -0
  81. package/packages/telemetry-contract/fixtures/golden.test.ts +42 -0
  82. package/packages/telemetry-contract/index.ts +1 -0
  83. package/packages/telemetry-contract/package.json +19 -0
  84. package/packages/telemetry-contract/src/index.ts +2 -0
  85. package/packages/telemetry-contract/src/types.ts +163 -0
  86. package/packages/telemetry-contract/src/validators.ts +109 -0
  87. package/skill/SKILL.md +84 -53
  88. package/skill/Workflows/AutoActivation.md +17 -16
  89. package/skill/Workflows/Badge.md +6 -0
  90. package/skill/Workflows/Baseline.md +46 -23
  91. package/skill/Workflows/Composability.md +12 -5
  92. package/skill/Workflows/Contribute.md +17 -14
  93. package/skill/Workflows/Cron.md +56 -79
  94. package/skill/Workflows/Dashboard.md +45 -34
  95. package/skill/Workflows/Doctor.md +30 -17
  96. package/skill/Workflows/Evals.md +64 -40
  97. package/skill/Workflows/EvolutionMemory.md +2 -0
  98. package/skill/Workflows/Evolve.md +102 -47
  99. package/skill/Workflows/EvolveBody.md +6 -6
  100. package/skill/Workflows/Grade.md +36 -31
  101. package/skill/Workflows/ImportSkillsBench.md +11 -5
  102. package/skill/Workflows/Ingest.md +43 -36
  103. package/skill/Workflows/Initialize.md +44 -30
  104. package/skill/Workflows/Orchestrate.md +139 -0
  105. package/skill/Workflows/Replay.md +39 -18
  106. package/skill/Workflows/Rollback.md +3 -3
  107. package/skill/Workflows/Schedule.md +61 -0
  108. package/skill/Workflows/Sync.md +88 -0
  109. package/skill/Workflows/UnitTest.md +34 -22
  110. package/skill/Workflows/Watch.md +14 -4
  111. package/skill/Workflows/Workflows.md +129 -0
  112. package/skill/assets/activation-rules-default.json +26 -0
  113. package/skill/assets/multi-skill-settings.json +63 -0
  114. package/skill/assets/single-skill-settings.json +57 -0
  115. package/skill/references/invocation-taxonomy.md +2 -2
  116. package/skill/references/logs.md +164 -2
  117. package/skill/references/setup-patterns.md +65 -0
  118. package/skill/references/version-history.md +40 -0
  119. package/skill/settings_snippet.json +1 -1
  120. package/templates/multi-skill-settings.json +7 -7
  121. package/templates/single-skill-settings.json +6 -6
  122. package/dashboard/index.html +0 -1680
@@ -13,11 +13,14 @@ import { QUERY_LOG, SKILL_LOG, TELEMETRY_LOG } from "../constants.js";
13
13
  import type { BaselineMeasurement } from "../eval/baseline.js";
14
14
  import { measureBaseline } from "../eval/baseline.js";
15
15
  import { buildEvalSet } from "../eval/hooks-to-evals.js";
16
+ import { readGradingResultsForSkill } from "../grading/results.js";
16
17
  import { updateContextAfterEvolve } from "../memory/writer.js";
18
+ import type { SyncResult } from "../sync.js";
17
19
  import type {
18
20
  EvalEntry,
19
21
  EvalPassRate,
20
22
  EvolutionAuditEntry,
23
+ EvolutionEvidenceEntry,
21
24
  EvolutionProposal,
22
25
  EvolveResultSummary,
23
26
  FailurePattern,
@@ -29,8 +32,10 @@ import type {
29
32
  } from "../types.js";
30
33
  import { parseFrontmatter, replaceFrontmatterDescription } from "../utils/frontmatter.js";
31
34
  import { readJsonl } from "../utils/jsonl.js";
35
+ import { readEffectiveSkillUsageRecords } from "../utils/skill-log.js";
32
36
  import { createEvolveTUI } from "../utils/tui.js";
33
37
  import { appendAuditEntry } from "./audit.js";
38
+ import { appendEvidenceEntry } from "./evidence.js";
34
39
  import { extractFailurePatterns } from "./extract-patterns.js";
35
40
  import {
36
41
  computeInvocationScores,
@@ -68,6 +73,8 @@ export interface EvolveOptions {
68
73
  cheapLoop?: boolean;
69
74
  gateModel?: string;
70
75
  proposalModel?: string;
76
+ syncFirst?: boolean;
77
+ syncForce?: boolean;
71
78
  }
72
79
 
73
80
  export interface EvolveResult {
@@ -81,6 +88,7 @@ export interface EvolveResult {
81
88
  elapsedMs: number;
82
89
  baselineResult?: BaselineMeasurement;
83
90
  gateValidation?: ValidationResult;
91
+ sync_result?: SyncResult;
84
92
  }
85
93
 
86
94
  /**
@@ -98,9 +106,12 @@ export interface EvolveDeps {
98
106
  validateProposal?: typeof import("./validate-proposal.js").validateProposal;
99
107
  gateValidateProposal?: typeof import("./validate-proposal.js").validateProposal;
100
108
  appendAuditEntry?: typeof import("./audit.js").appendAuditEntry;
109
+ appendEvidenceEntry?: typeof import("./evidence.js").appendEvidenceEntry;
101
110
  buildEvalSet?: typeof import("../eval/hooks-to-evals.js").buildEvalSet;
102
111
  updateContextAfterEvolve?: typeof import("../memory/writer.js").updateContextAfterEvolve;
103
112
  measureBaseline?: typeof import("../eval/baseline.js").measureBaseline;
113
+ readSkillUsageLog?: () => SkillUsageRecord[];
114
+ syncSources?: typeof import("../sync.js").syncSources;
104
115
  }
105
116
 
106
117
  // ---------------------------------------------------------------------------
@@ -124,6 +135,33 @@ function createAuditEntry(
124
135
  };
125
136
  }
126
137
 
138
+ // ---------------------------------------------------------------------------
139
+ // Diff helper
140
+ // ---------------------------------------------------------------------------
141
+
142
+ /**
143
+ * Produce a simple colored diff between two text strings.
144
+ * Red (removed) / Green (added) lines, skipping unchanged lines.
145
+ */
146
+ function formatSimpleDiff(oldText: string, newText: string): string {
147
+ const oldLines = oldText.split("\n");
148
+ const newLines = newText.split("\n");
149
+ const output: string[] = [];
150
+ const maxLen = Math.max(oldLines.length, newLines.length);
151
+ for (let i = 0; i < maxLen; i++) {
152
+ const oldLine = oldLines[i];
153
+ const newLine = newLines[i];
154
+ if (oldLine === newLine) continue;
155
+ if (oldLine !== undefined) {
156
+ output.push(`\x1b[31m- ${oldLine}\x1b[0m`);
157
+ }
158
+ if (newLine !== undefined) {
159
+ output.push(`\x1b[32m+ ${newLine}\x1b[0m`);
160
+ }
161
+ }
162
+ return output.join("\n");
163
+ }
164
+
127
165
  // ---------------------------------------------------------------------------
128
166
  // Main orchestrator
129
167
  // ---------------------------------------------------------------------------
@@ -148,11 +186,14 @@ export async function evolve(
148
186
  const _validateProposal = _deps.validateProposal ?? validateProposal;
149
187
  const _gateValidateProposal = _deps.gateValidateProposal ?? validateProposal;
150
188
  const _appendAuditEntry = _deps.appendAuditEntry ?? appendAuditEntry;
189
+ const _appendEvidenceEntry = _deps.appendEvidenceEntry ?? appendEvidenceEntry;
151
190
  const _buildEvalSet = _deps.buildEvalSet ?? buildEvalSet;
152
191
  const _updateContextAfterEvolve = _deps.updateContextAfterEvolve ?? updateContextAfterEvolve;
153
192
  const _measureBaseline = _deps.measureBaseline ?? measureBaseline;
193
+ const _readSkillUsageLog = _deps.readSkillUsageLog ?? (() => readEffectiveSkillUsageRecords());
154
194
 
155
195
  const auditEntries: EvolutionAuditEntry[] = [];
196
+ let syncResult: SyncResult | undefined;
156
197
 
157
198
  function recordAudit(
158
199
  proposalId: string,
@@ -169,6 +210,14 @@ export async function evolve(
169
210
  }
170
211
  }
171
212
 
213
+ function recordEvidence(entry: EvolutionEvidenceEntry): void {
214
+ try {
215
+ _appendEvidenceEntry(entry);
216
+ } catch {
217
+ // Fail-open: evidence should not block the pipeline
218
+ }
219
+ }
220
+
172
221
  const pipelineStart = Date.now();
173
222
  let llmCallCount = 0;
174
223
  const tui = createEvolveTUI({ skillName, model: options.proposalModel ?? "(default)" });
@@ -182,6 +231,7 @@ export async function evolve(
182
231
  ...r,
183
232
  llmCallCount,
184
233
  elapsedMs: Date.now() - pipelineStart,
234
+ ...(syncResult ? { sync_result: syncResult } : {}),
185
235
  });
186
236
 
187
237
  // Hoisted so catch block can preserve partial results on error
@@ -209,21 +259,65 @@ export async function evolve(
209
259
  const currentDescription = frontmatter.description || rawContent;
210
260
  const skillVersion = frontmatter.version || undefined;
211
261
  const versionTag = skillVersion ? `, v${skillVersion}` : "";
262
+ const createdAuditDetails = (message: string) =>
263
+ `original_description:${rawContent}\n${message}`;
212
264
  tui.done(`Loaded SKILL.md (desc: ${currentDescription.length} chars${versionTag})`);
213
265
 
266
+ if (options.syncFirst) {
267
+ tui.step(`Syncing source-truth telemetry${options.syncForce ? " (force)" : ""}...`);
268
+ const { createDefaultSyncOptions, syncSources: realSyncSources } = await import("../sync.js");
269
+ const syncRunner = _deps.syncSources ?? realSyncSources;
270
+ syncResult = syncRunner(
271
+ createDefaultSyncOptions({
272
+ force: options.syncForce ?? false,
273
+ }),
274
+ );
275
+ const sourceSynced = Object.values(syncResult.sources).reduce(
276
+ (sum, source) => sum + source.synced,
277
+ 0,
278
+ );
279
+ tui.done(
280
+ `Source sync complete (${sourceSynced} source sessions, ${syncResult.repair.repaired_records} repaired records)`,
281
+ );
282
+ }
283
+
214
284
  // -----------------------------------------------------------------------
215
285
  // Step 2: Load eval set
216
286
  // -----------------------------------------------------------------------
287
+ const skillUsage = _readSkillUsageLog();
217
288
  let evalSet: EvalEntry[];
218
289
 
219
290
  if (evalSetPath && existsSync(evalSetPath)) {
220
- const raw = readFileSync(evalSetPath, "utf-8");
221
- evalSet = JSON.parse(raw) as EvalEntry[];
291
+ try {
292
+ const raw = readFileSync(evalSetPath, "utf-8");
293
+ evalSet = JSON.parse(raw) as EvalEntry[];
294
+ } catch (parseErr) {
295
+ const msg = parseErr instanceof Error ? parseErr.message : String(parseErr);
296
+ tui.fail(`Failed to load eval set from ${evalSetPath}: ${msg}`);
297
+ finishTui();
298
+ return withStats({
299
+ proposal: null,
300
+ validation: null,
301
+ deployed: false,
302
+ auditEntries,
303
+ reason: `Failed to load eval set: ${msg}`,
304
+ });
305
+ }
306
+ if (!Array.isArray(evalSet)) {
307
+ tui.fail(`Eval set at ${evalSetPath} is not an array`);
308
+ finishTui();
309
+ return withStats({
310
+ proposal: null,
311
+ validation: null,
312
+ deployed: false,
313
+ auditEntries,
314
+ reason: `Eval set at ${evalSetPath} is not a JSON array`,
315
+ });
316
+ }
222
317
  } else {
223
318
  // Build from logs
224
- const skillRecords = readJsonl<SkillUsageRecord>(SKILL_LOG);
225
319
  const queryRecords = readJsonl<QueryLogRecord>(QUERY_LOG);
226
- evalSet = _buildEvalSet(skillRecords, queryRecords, skillName);
320
+ evalSet = _buildEvalSet(skillUsage, queryRecords, skillName);
227
321
  }
228
322
 
229
323
  const posCount = evalSet.filter((e) => e.should_trigger).length;
@@ -233,8 +327,6 @@ export async function evolve(
233
327
  // -----------------------------------------------------------------------
234
328
  // Step 3: Load skill usage records
235
329
  // -----------------------------------------------------------------------
236
- const skillUsage = readJsonl<SkillUsageRecord>(SKILL_LOG);
237
-
238
330
  // -----------------------------------------------------------------------
239
331
  // Step 4: Extract failure patterns
240
332
  // -----------------------------------------------------------------------
@@ -251,17 +343,38 @@ export async function evolve(
251
343
  );
252
344
 
253
345
  // -----------------------------------------------------------------------
254
- // Step 5: Early exit if no patterns
346
+ // Step 5: Cold-start bootstrap or early exit if no patterns
255
347
  // -----------------------------------------------------------------------
256
348
  if (failurePatterns.length === 0) {
257
- finishTui();
258
- return withStats({
259
- proposal: null,
260
- validation: null,
261
- deployed: false,
262
- auditEntries,
263
- reason: "No failure patterns found",
264
- });
349
+ // Cold-start: if the eval set has positive entries that the skill should
350
+ // match but there are zero skill usage records, treat the positive eval
351
+ // entries themselves as "missed queries" — they ARE the failure signal.
352
+ const positiveEvals = evalSet.filter((e) => e.should_trigger);
353
+ const hasSkillUsageHistory = skillUsage.some((record) => record.skill_name === skillName);
354
+ if (positiveEvals.length > 0 && !hasSkillUsageHistory) {
355
+ const coldStartPattern: FailurePattern = {
356
+ pattern_id: `fp-${skillName}-coldstart`,
357
+ skill_name: skillName,
358
+ invocation_type: "implicit",
359
+ missed_queries: positiveEvals.map((e) => e.query),
360
+ frequency: positiveEvals.length,
361
+ sample_sessions: [],
362
+ extracted_at: new Date().toISOString(),
363
+ };
364
+ failurePatterns.push(coldStartPattern);
365
+ tui.done(
366
+ `Cold-start bootstrap: ${positiveEvals.length} positive eval entries used as missed queries`,
367
+ );
368
+ } else {
369
+ finishTui();
370
+ return withStats({
371
+ proposal: null,
372
+ validation: null,
373
+ deployed: false,
374
+ auditEntries,
375
+ reason: "No failure patterns found",
376
+ });
377
+ }
265
378
  }
266
379
 
267
380
  // -----------------------------------------------------------------------
@@ -279,11 +392,14 @@ export async function evolve(
279
392
  const paretoEnabled = options.paretoEnabled ?? false;
280
393
  const candidateCount = options.candidateCount ?? 3;
281
394
  const tokenEfficiencyEnabled = options.tokenEfficiencyEnabled ?? false;
395
+ const telemetryRecords =
396
+ options.telemetryRecords ??
397
+ (tokenEfficiencyEnabled ? readJsonl<SessionTelemetryRecord>(TELEMETRY_LOG) : undefined);
282
398
 
283
399
  // Compute token efficiency score if enabled and telemetry is available
284
400
  let tokenEffScore: number | undefined;
285
- if (tokenEfficiencyEnabled && options.telemetryRecords && options.telemetryRecords.length > 0) {
286
- tokenEffScore = computeTokenEfficiencyScore(skillName, options.telemetryRecords);
401
+ if (tokenEfficiencyEnabled && telemetryRecords && telemetryRecords.length > 0) {
402
+ tokenEffScore = computeTokenEfficiencyScore(skillName, telemetryRecords);
287
403
  recordAudit(
288
404
  "system",
289
405
  "created",
@@ -321,7 +437,25 @@ export async function evolve(
321
437
  // Validate each candidate
322
438
  const paretoCandidates: ParetoCandidate[] = [];
323
439
  for (const proposal of viableCandidates) {
324
- recordAudit(proposal.proposal_id, "created", `Pareto candidate for ${skillName}`);
440
+ recordAudit(
441
+ proposal.proposal_id,
442
+ "created",
443
+ createdAuditDetails(`Pareto candidate for ${skillName}`),
444
+ );
445
+ recordEvidence({
446
+ timestamp: new Date().toISOString(),
447
+ proposal_id: proposal.proposal_id,
448
+ skill_name: skillName,
449
+ skill_path: skillPath,
450
+ target: "description",
451
+ stage: "created",
452
+ rationale: proposal.rationale,
453
+ confidence: proposal.confidence,
454
+ details: `Pareto candidate for ${skillName}`,
455
+ original_text: proposal.original_description,
456
+ proposed_text: proposal.proposed_description,
457
+ eval_set: evalSet,
458
+ });
325
459
 
326
460
  const validation = await _validateProposal(
327
461
  proposal,
@@ -334,6 +468,26 @@ export async function evolve(
334
468
  "validated",
335
469
  `Pareto validation: improved=${validation.improved}`,
336
470
  );
471
+ recordEvidence({
472
+ timestamp: new Date().toISOString(),
473
+ proposal_id: proposal.proposal_id,
474
+ skill_name: skillName,
475
+ skill_path: skillPath,
476
+ target: "description",
477
+ stage: "validated",
478
+ rationale: proposal.rationale,
479
+ confidence: proposal.confidence,
480
+ details: `Pareto validation: improved=${validation.improved}`,
481
+ validation: {
482
+ improved: validation.improved,
483
+ before_pass_rate: validation.before_pass_rate,
484
+ after_pass_rate: validation.after_pass_rate,
485
+ net_change: validation.net_change,
486
+ regressions: validation.regressions,
487
+ new_passes: validation.new_passes,
488
+ per_entry_results: validation.per_entry_results,
489
+ },
490
+ });
337
491
 
338
492
  if (validation.improved && validation.per_entry_results) {
339
493
  const invocationScores = computeInvocationScores(validation.per_entry_results);
@@ -398,8 +552,22 @@ export async function evolve(
398
552
  recordAudit(
399
553
  proposal.proposal_id,
400
554
  "created",
401
- `Proposal created for ${skillName} (iteration ${iteration + 1})`,
555
+ createdAuditDetails(`Proposal created for ${skillName} (iteration ${iteration + 1})`),
402
556
  );
557
+ recordEvidence({
558
+ timestamp: new Date().toISOString(),
559
+ proposal_id: proposal.proposal_id,
560
+ skill_name: skillName,
561
+ skill_path: skillPath,
562
+ target: "description",
563
+ stage: "created",
564
+ rationale: proposal.rationale,
565
+ confidence: proposal.confidence,
566
+ details: `Proposal created for ${skillName} (iteration ${iteration + 1})`,
567
+ original_text: proposal.original_description,
568
+ proposed_text: proposal.proposed_description,
569
+ eval_set: evalSet,
570
+ });
403
571
 
404
572
  // Step 9: Check confidence threshold
405
573
  if (proposal.confidence < confidenceThreshold) {
@@ -409,6 +577,17 @@ export async function evolve(
409
577
  "rejected",
410
578
  `Confidence ${proposal.confidence} below threshold ${confidenceThreshold}`,
411
579
  );
580
+ recordEvidence({
581
+ timestamp: new Date().toISOString(),
582
+ proposal_id: proposal.proposal_id,
583
+ skill_name: skillName,
584
+ skill_path: skillPath,
585
+ target: "description",
586
+ stage: "rejected",
587
+ rationale: proposal.rationale,
588
+ confidence: proposal.confidence,
589
+ details: `Confidence ${proposal.confidence} below threshold ${confidenceThreshold}`,
590
+ });
412
591
 
413
592
  // If this is the last iteration, return early with rejection
414
593
  if (iteration === maxIterations - 1) {
@@ -455,6 +634,26 @@ export async function evolve(
455
634
  `Validation complete: improved=${validation.improved}`,
456
635
  evalSnapshot,
457
636
  );
637
+ recordEvidence({
638
+ timestamp: new Date().toISOString(),
639
+ proposal_id: proposal.proposal_id,
640
+ skill_name: skillName,
641
+ skill_path: skillPath,
642
+ target: "description",
643
+ stage: "validated",
644
+ rationale: proposal.rationale,
645
+ confidence: proposal.confidence,
646
+ details: `Validation complete: improved=${validation.improved}`,
647
+ validation: {
648
+ improved: validation.improved,
649
+ before_pass_rate: validation.before_pass_rate,
650
+ after_pass_rate: validation.after_pass_rate,
651
+ net_change: validation.net_change,
652
+ regressions: validation.regressions,
653
+ new_passes: validation.new_passes,
654
+ per_entry_results: validation.per_entry_results,
655
+ },
656
+ });
458
657
 
459
658
  // Step 12: Check validation result
460
659
  if (!validation.improved) {
@@ -464,6 +663,26 @@ export async function evolve(
464
663
  "rejected",
465
664
  `Validation failed: net_change=${validation.net_change.toFixed(3)}`,
466
665
  );
666
+ recordEvidence({
667
+ timestamp: new Date().toISOString(),
668
+ proposal_id: proposal.proposal_id,
669
+ skill_name: skillName,
670
+ skill_path: skillPath,
671
+ target: "description",
672
+ stage: "rejected",
673
+ rationale: proposal.rationale,
674
+ confidence: proposal.confidence,
675
+ details: `Validation failed: net_change=${validation.net_change.toFixed(3)}`,
676
+ validation: {
677
+ improved: validation.improved,
678
+ before_pass_rate: validation.before_pass_rate,
679
+ after_pass_rate: validation.after_pass_rate,
680
+ net_change: validation.net_change,
681
+ regressions: validation.regressions,
682
+ new_passes: validation.new_passes,
683
+ per_entry_results: validation.per_entry_results,
684
+ },
685
+ });
467
686
 
468
687
  // If this is the last iteration, return with rejection
469
688
  if (iteration === maxIterations - 1) {
@@ -583,12 +802,40 @@ export async function evolve(
583
802
  writeFileSync(skillPath, updatedContent, "utf-8");
584
803
  tui.done(`Deployed updated description to ${skillPath}`);
585
804
 
805
+ // Show what changed in the skill file
806
+ const diffOutput = formatSimpleDiff(rawContent, updatedContent);
807
+ if (diffOutput) {
808
+ console.error("\n--- Skill description diff ---");
809
+ console.error(diffOutput);
810
+ console.error("------------------------------\n");
811
+ }
812
+
586
813
  recordAudit(lastProposal.proposal_id, "deployed", `Deployed proposal for ${skillName}`, {
587
814
  total: evalSet.length,
588
815
  passed: Math.round(lastValidation.after_pass_rate * evalSet.length),
589
816
  failed: evalSet.length - Math.round(lastValidation.after_pass_rate * evalSet.length),
590
817
  pass_rate: lastValidation.after_pass_rate,
591
818
  });
819
+ recordEvidence({
820
+ timestamp: new Date().toISOString(),
821
+ proposal_id: lastProposal.proposal_id,
822
+ skill_name: skillName,
823
+ skill_path: skillPath,
824
+ target: "description",
825
+ stage: "deployed",
826
+ rationale: lastProposal.rationale,
827
+ confidence: lastProposal.confidence,
828
+ details: `Deployed proposal for ${skillName}`,
829
+ validation: {
830
+ improved: lastValidation.improved,
831
+ before_pass_rate: lastValidation.before_pass_rate,
832
+ after_pass_rate: lastValidation.after_pass_rate,
833
+ net_change: lastValidation.net_change,
834
+ regressions: lastValidation.regressions,
835
+ new_passes: lastValidation.new_passes,
836
+ per_entry_results: lastValidation.per_entry_results,
837
+ },
838
+ });
592
839
  }
593
840
 
594
841
  // -----------------------------------------------------------------------
@@ -654,9 +901,12 @@ export async function cliMain(): Promise<void> {
654
901
  "token-efficiency": { type: "boolean", default: false },
655
902
  "with-baseline": { type: "boolean", default: false },
656
903
  "validation-model": { type: "string", default: "haiku" },
657
- "cheap-loop": { type: "boolean", default: false },
904
+ "cheap-loop": { type: "boolean", default: true },
905
+ "full-model": { type: "boolean", default: false },
658
906
  "gate-model": { type: "string" },
659
907
  "proposal-model": { type: "string" },
908
+ "sync-first": { type: "boolean", default: false },
909
+ "sync-force": { type: "boolean", default: false },
660
910
  verbose: { type: "boolean", default: false },
661
911
  help: { type: "boolean", default: false },
662
912
  },
@@ -682,9 +932,12 @@ Options:
682
932
  --token-efficiency Enable 5D Pareto with token efficiency scoring
683
933
  --with-baseline Gate deployment on baseline lift > 0.05
684
934
  --validation-model Model for trigger-check validation calls (default: haiku)
685
- --cheap-loop Use cheap models for loop, expensive model for final gate
686
- --gate-model Model for final gate validation (default: sonnet when --cheap-loop)
935
+ --cheap-loop Use cheap models for loop, expensive for gate (default: on)
936
+ --full-model Use same model for all stages (disables cheap-loop)
937
+ --gate-model Model for final gate validation (default: sonnet)
687
938
  --proposal-model Model for proposal generation LLM calls
939
+ --sync-first Refresh source-truth telemetry before building evals/failure patterns
940
+ --sync-force Force a full rescan during --sync-first
688
941
  --verbose Output full EvolveResult JSON (default: compact summary)
689
942
  --help Show this help message`);
690
943
  process.exit(0);
@@ -694,6 +947,10 @@ Options:
694
947
  console.error("[ERROR] --skill and --skill-path are required");
695
948
  process.exit(1);
696
949
  }
950
+ if ((values["sync-force"] ?? false) && !(values["sync-first"] ?? false)) {
951
+ console.error("[ERROR] --sync-force requires --sync-first");
952
+ process.exit(1);
953
+ }
697
954
 
698
955
  const { detectAgent } = await import("../utils/llm-call.js");
699
956
  const requestedAgent = values.agent;
@@ -721,11 +978,60 @@ Options:
721
978
  process.exit(1);
722
979
  }
723
980
 
981
+ // -------------------------------------------------------------------------
982
+ // Pre-flight validation: catch common misconfigurations before evolve()
983
+ // -------------------------------------------------------------------------
984
+ const skillPath = values["skill-path"];
985
+ if (!skillPath) {
986
+ console.error("[ERROR] --skill-path is required.");
987
+ process.exit(1);
988
+ }
989
+ if (!existsSync(skillPath)) {
990
+ console.error(`[ERROR] SKILL.md not found at: ${skillPath}`);
991
+ console.error(" Verify the --skill-path argument points to an existing SKILL.md file.");
992
+ process.exit(1);
993
+ }
994
+
995
+ const evalSetPath = values["eval-set"];
996
+ if (evalSetPath && !existsSync(evalSetPath)) {
997
+ console.error(`[ERROR] Eval set file not found at: ${evalSetPath}`);
998
+ console.error(" Verify the --eval-set argument points to an existing JSON file.");
999
+ process.exit(1);
1000
+ }
1001
+
1002
+ // If no eval-set provided, check that log files exist for auto-generation
1003
+ if (!evalSetPath && !(values["sync-first"] ?? false)) {
1004
+ const hasSkillLog = readEffectiveSkillUsageRecords().length > 0;
1005
+ const hasQueryLog = existsSync(QUERY_LOG);
1006
+ if (!hasSkillLog && !hasQueryLog) {
1007
+ console.error("[ERROR] No eval set provided and no telemetry logs found.");
1008
+ console.error(
1009
+ " Either pass --eval-set <path> or generate logs first by using selftune-enabled skills.",
1010
+ );
1011
+ console.error(` Expected logs at: ${SKILL_LOG} and ${QUERY_LOG}`);
1012
+ process.exit(1);
1013
+ }
1014
+ }
1015
+
724
1016
  const tokenEfficiencyEnabled = values["token-efficiency"] ?? false;
725
1017
  let telemetryRecords: SessionTelemetryRecord[] | undefined;
726
- if (tokenEfficiencyEnabled) {
1018
+ if (tokenEfficiencyEnabled && !(values["sync-first"] ?? false)) {
727
1019
  telemetryRecords = readJsonl<SessionTelemetryRecord>(TELEMETRY_LOG);
728
1020
  }
1021
+ const gradingResults = readGradingResultsForSkill(values.skill);
1022
+
1023
+ if (values.verbose) {
1024
+ console.error("[verbose] Pre-flight checks passed");
1025
+ console.error(`[verbose] Skill: ${values.skill}`);
1026
+ console.error(`[verbose] Skill path: ${skillPath}`);
1027
+ console.error(`[verbose] Agent: ${agent}`);
1028
+ console.error(`[verbose] Eval set: ${evalSetPath ?? "(auto-generated from logs)"}`);
1029
+ console.error(`[verbose] Loaded grading results: ${gradingResults.length}`);
1030
+ console.error(`[verbose] Cheap loop: ${values["cheap-loop"] ?? false}`);
1031
+ console.error(`[verbose] Dry run: ${values["dry-run"] ?? false}`);
1032
+ console.error(`[verbose] Sync first: ${values["sync-first"] ?? false}`);
1033
+ console.error(`[verbose] Sync force: ${values["sync-force"] ?? false}`);
1034
+ }
729
1035
 
730
1036
  const result = await evolve({
731
1037
  skillName: values.skill,
@@ -741,9 +1047,12 @@ Options:
741
1047
  telemetryRecords,
742
1048
  withBaseline: values["with-baseline"] ?? false,
743
1049
  validationModel: values["validation-model"],
744
- cheapLoop: values["cheap-loop"] ?? false,
1050
+ cheapLoop: (values["cheap-loop"] ?? true) && !(values["full-model"] ?? false),
745
1051
  gateModel: values["gate-model"],
746
1052
  proposalModel: values["proposal-model"],
1053
+ gradingResults,
1054
+ syncFirst: values["sync-first"] ?? false,
1055
+ syncForce: values["sync-force"] ?? false,
747
1056
  });
748
1057
 
749
1058
  if (values.verbose) {
@@ -769,12 +1078,49 @@ Options:
769
1078
  };
770
1079
  console.log(JSON.stringify(summary, null, 2));
771
1080
  }
1081
+
1082
+ // Print human-readable status to stderr so users always see outcome
1083
+ if (!result.deployed) {
1084
+ console.error(`\n[NOT DEPLOYED] ${result.reason}`);
1085
+ if (result.validation && !result.validation.improved) {
1086
+ console.error(
1087
+ ` Pass rate: ${(result.validation.before_pass_rate * 100).toFixed(1)}% -> ${(result.validation.after_pass_rate * 100).toFixed(1)}% (net: ${result.validation.net_change >= 0 ? "+" : ""}${(result.validation.net_change * 100).toFixed(1)}%)`,
1088
+ );
1089
+ if (result.validation.regressions.length > 0) {
1090
+ console.error(` Regressions: ${result.validation.regressions.length} entries`);
1091
+ }
1092
+ }
1093
+ if (
1094
+ result.proposal &&
1095
+ result.proposal.confidence < Number.parseFloat(values.confidence ?? "0.6")
1096
+ ) {
1097
+ console.error(
1098
+ ` Confidence ${result.proposal.confidence.toFixed(2)} below threshold ${values.confidence ?? "0.6"}`,
1099
+ );
1100
+ }
1101
+ console.error(" Re-run with --verbose for full diagnostic output.");
1102
+ } else {
1103
+ console.error(`\n[DEPLOYED] ${result.reason}`);
1104
+ }
1105
+
772
1106
  process.exit(result.deployed ? 0 : 1);
773
1107
  }
774
1108
 
775
1109
  if (import.meta.main) {
776
1110
  cliMain().catch((err) => {
777
- console.error(`[FATAL] ${err}`);
1111
+ const message = err instanceof Error ? err.message : String(err);
1112
+ const stack = err instanceof Error ? err.stack : undefined;
1113
+ console.error(`[FATAL] ${message}`);
1114
+ if (stack && process.env.SELFTUNE_VERBOSE === "1") {
1115
+ console.error(stack);
1116
+ }
1117
+ console.error(
1118
+ "\nTroubleshooting:\n" +
1119
+ " - Verify --skill-path points to a valid SKILL.md file\n" +
1120
+ " - Ensure eval data exists (run `selftune evals` first) or pass --eval-set\n" +
1121
+ " - Check that ANTHROPIC_API_KEY is set if using Claude\n" +
1122
+ " - Re-run with --verbose for full diagnostic output",
1123
+ );
778
1124
  process.exit(1);
779
1125
  });
780
1126
  }