selftune 0.2.13 → 0.2.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60) hide show
  1. package/apps/local-dashboard/dist/assets/index-BMIS6uUh.css +2 -0
  2. package/apps/local-dashboard/dist/assets/index-DOu3iLD9.js +16 -0
  3. package/apps/local-dashboard/dist/assets/vendor-ui-DIwlrGlb.js +12 -0
  4. package/apps/local-dashboard/dist/index.html +3 -3
  5. package/cli/selftune/activation-rules.ts +24 -48
  6. package/cli/selftune/analytics.ts +13 -11
  7. package/cli/selftune/badge/badge.ts +13 -9
  8. package/cli/selftune/canonical-export.ts +6 -6
  9. package/cli/selftune/constants.ts +7 -0
  10. package/cli/selftune/contribute/bundle.ts +9 -44
  11. package/cli/selftune/contribute/contribute.ts +2 -1
  12. package/cli/selftune/cron/setup.ts +3 -1
  13. package/cli/selftune/dashboard-contract.ts +22 -0
  14. package/cli/selftune/dashboard.ts +10 -5
  15. package/cli/selftune/eval/baseline.ts +20 -30
  16. package/cli/selftune/eval/hooks-to-evals.ts +27 -34
  17. package/cli/selftune/eval/import-skillsbench.ts +21 -8
  18. package/cli/selftune/eval/unit-test-cli.ts +22 -11
  19. package/cli/selftune/evolution/description-quality.ts +224 -0
  20. package/cli/selftune/evolution/evolve-body.ts +17 -10
  21. package/cli/selftune/evolution/evolve.ts +70 -57
  22. package/cli/selftune/evolution/rollback.ts +7 -6
  23. package/cli/selftune/grading/auto-grade.ts +27 -35
  24. package/cli/selftune/grading/grade-session.ts +24 -30
  25. package/cli/selftune/hooks/auto-activate.ts +12 -3
  26. package/cli/selftune/hooks/evolution-guard.ts +14 -24
  27. package/cli/selftune/hooks/prompt-log.ts +7 -9
  28. package/cli/selftune/hooks/session-stop.ts +0 -8
  29. package/cli/selftune/index.ts +66 -69
  30. package/cli/selftune/ingestors/claude-replay.ts +29 -14
  31. package/cli/selftune/ingestors/codex-rollout.ts +15 -5
  32. package/cli/selftune/ingestors/codex-wrapper.ts +15 -13
  33. package/cli/selftune/ingestors/openclaw-ingest.ts +24 -5
  34. package/cli/selftune/ingestors/opencode-ingest.ts +9 -4
  35. package/cli/selftune/init.ts +14 -9
  36. package/cli/selftune/localdb/queries.ts +57 -0
  37. package/cli/selftune/monitoring/watch.ts +39 -38
  38. package/cli/selftune/normalization.ts +2 -23
  39. package/cli/selftune/orchestrate.ts +224 -24
  40. package/cli/selftune/routes/skill-report.ts +17 -0
  41. package/cli/selftune/schedule.ts +74 -14
  42. package/cli/selftune/sync.ts +7 -3
  43. package/cli/selftune/types.ts +44 -10
  44. package/cli/selftune/utils/cli-error.ts +102 -0
  45. package/cli/selftune/utils/jsonl.ts +2 -0
  46. package/cli/selftune/workflows/workflows.ts +23 -17
  47. package/package.json +3 -1
  48. package/packages/ui/src/components/RecentActivityFeed.tsx +86 -0
  49. package/packages/ui/src/components/index.ts +1 -0
  50. package/packages/ui/src/components/section-cards.tsx +13 -0
  51. package/skill/SKILL.md +1 -1
  52. package/skill/Workflows/Evolve.md +4 -0
  53. package/skill/Workflows/Initialize.md +8 -8
  54. package/skill/Workflows/Orchestrate.md +11 -7
  55. package/skill/Workflows/Schedule.md +11 -0
  56. package/skill/references/logs.md +22 -21
  57. package/skill/settings_snippet.json +29 -6
  58. package/apps/local-dashboard/dist/assets/index-4_dAY17K.js +0 -16
  59. package/apps/local-dashboard/dist/assets/index-BxV5WZHc.css +0 -2
  60. package/apps/local-dashboard/dist/assets/vendor-ui-7xD7fNEU.js +0 -12
@@ -11,6 +11,7 @@ import type {
11
11
  OrchestrateRunReport,
12
12
  OverviewPayload,
13
13
  PendingProposal,
14
+ RecentActivityItem,
14
15
  SkillReportPayload,
15
16
  SkillSummary,
16
17
  } from "../dashboard-contract.js";
@@ -126,6 +127,10 @@ export function getOverviewPayload(db: Database): OverviewPayload {
126
127
  // Pending proposals: created/validated but no terminal action (deduped in SQL)
127
128
  const pending_proposals = getPendingProposals(db);
128
129
 
130
+ // Active sessions and recent activity
131
+ const active_sessions = getActiveSessionCount(db);
132
+ const recent_activity = getRecentActivity(db);
133
+
129
134
  return {
130
135
  telemetry,
131
136
  skills,
@@ -133,6 +138,8 @@ export function getOverviewPayload(db: Database): OverviewPayload {
133
138
  counts,
134
139
  unmatched_queries: unmatchedRows,
135
140
  pending_proposals,
141
+ active_sessions,
142
+ recent_activity,
136
143
  };
137
144
  }
138
145
 
@@ -361,6 +368,56 @@ export function getOrchestrateRuns(db: Database, limit = 20): OrchestrateRunRepo
361
368
  }));
362
369
  }
363
370
 
371
+ /**
372
+ * Count sessions that have queries recorded but no session_telemetry yet
373
+ * (i.e., the session is still in progress).
374
+ */
375
+ export function getActiveSessionCount(db: Database): number {
376
+ const row = db
377
+ .query(
378
+ `SELECT COUNT(DISTINCT q.session_id) as count
379
+ FROM queries q
380
+ WHERE NOT EXISTS (
381
+ SELECT 1 FROM session_telemetry st WHERE st.session_id = q.session_id
382
+ )`,
383
+ )
384
+ .get() as { count: number };
385
+ return row.count;
386
+ }
387
+
388
+ /**
389
+ * Get the most recent skill invocations with a flag indicating whether the
390
+ * session is still in progress (no session_telemetry row yet).
391
+ */
392
+ export function getRecentActivity(db: Database, limit = 20): RecentActivityItem[] {
393
+ const rows = db
394
+ .query(
395
+ `SELECT si.occurred_at, si.session_id, si.skill_name, si.query, si.triggered,
396
+ CASE WHEN st.session_id IS NULL THEN 1 ELSE 0 END as is_live
397
+ FROM skill_invocations si
398
+ LEFT JOIN session_telemetry st ON si.session_id = st.session_id
399
+ ORDER BY si.occurred_at DESC
400
+ LIMIT ?`,
401
+ )
402
+ .all(limit) as Array<{
403
+ occurred_at: string;
404
+ session_id: string;
405
+ skill_name: string;
406
+ query: string;
407
+ triggered: number;
408
+ is_live: number;
409
+ }>;
410
+
411
+ return rows.map((row) => ({
412
+ timestamp: row.occurred_at,
413
+ session_id: row.session_id,
414
+ skill_name: row.skill_name,
415
+ query: row.query ?? "",
416
+ triggered: row.triggered === 1,
417
+ is_live: row.is_live === 1,
418
+ }));
419
+ }
420
+
364
421
  // -- Generic read queries (Phase 3: replace readJsonl calls) ------------------
365
422
 
366
423
  /**
@@ -26,7 +26,7 @@ import type {
26
26
  SessionTelemetryRecord,
27
27
  SkillUsageRecord,
28
28
  } from "../types.js";
29
- import { readJsonl } from "../utils/jsonl.js";
29
+ import { CLIError, handleCLIError } from "../utils/cli-error.js";
30
30
  import {
31
31
  filterActionableQueryRecords,
32
32
  filterActionableSkillUsageRecords,
@@ -212,27 +212,13 @@ export async function watch(options: WatchOptions): Promise<WatchResult> {
212
212
  );
213
213
  }
214
214
 
215
- // 1. Read log files from SQLite (fall back to JSONL for custom paths)
216
- let telemetry: SessionTelemetryRecord[];
217
- let skillRecords: SkillUsageRecord[];
218
- let queryRecords: QueryLogRecord[];
219
- if (
220
- _telemetryLogPath === TELEMETRY_LOG &&
221
- _skillLogPath === SKILL_LOG &&
222
- _queryLogPath === QUERY_LOG
223
- ) {
224
- const db = getDb();
225
- telemetry = querySessionTelemetry(db) as SessionTelemetryRecord[];
226
- // SQLite queries return DESC order; computeMonitoringSnapshot expects chronological (ASC)
227
- telemetry.sort((a, b) => a.timestamp.localeCompare(b.timestamp));
228
- skillRecords = querySkillUsageRecords(db) as SkillUsageRecord[];
229
- queryRecords = queryQueryLog(db) as QueryLogRecord[];
230
- } else {
231
- // Intentional JSONL fallback: custom log path overrides bypass SQLite reads
232
- telemetry = readJsonl<SessionTelemetryRecord>(_telemetryLogPath);
233
- skillRecords = readJsonl<SkillUsageRecord>(_skillLogPath);
234
- queryRecords = readJsonl<QueryLogRecord>(_queryLogPath);
235
- }
215
+ // 1. Read log files from SQLite
216
+ const db = getDb();
217
+ const telemetry = querySessionTelemetry(db) as SessionTelemetryRecord[];
218
+ // SQLite queries return DESC order; computeMonitoringSnapshot expects chronological (ASC)
219
+ telemetry.sort((a, b) => a.timestamp.localeCompare(b.timestamp));
220
+ const skillRecords = querySkillUsageRecords(db) as SkillUsageRecord[];
221
+ const queryRecords = queryQueryLog(db) as QueryLogRecord[];
236
222
 
237
223
  // 2. Determine baseline pass rate from last deployed audit entry
238
224
  const lastDeployed = getLastDeployedProposal(skillName, _auditLogPath);
@@ -369,34 +355,52 @@ Options:
369
355
  }
370
356
 
371
357
  if (!values.skill || !values["skill-path"]) {
372
- console.error("[ERROR] --skill and --skill-path are required");
373
- process.exit(1);
358
+ throw new CLIError(
359
+ "--skill and --skill-path are required.",
360
+ "MISSING_FLAG",
361
+ "Usage: selftune watch --skill <name> --skill-path <path>",
362
+ );
374
363
  }
375
364
  if ((values["sync-force"] ?? false) && !(values["sync-first"] ?? false)) {
376
- console.error("[ERROR] --sync-force requires --sync-first");
377
- process.exit(1);
365
+ throw new CLIError(
366
+ "--sync-force requires --sync-first.",
367
+ "INVALID_FLAG",
368
+ "Add --sync-first when using --sync-force.",
369
+ );
378
370
  }
379
371
 
380
372
  const rawWindow = values.window ?? "20";
381
373
  if (!/^\d+$/.test(rawWindow)) {
382
- console.error("[ERROR] --window must be a positive integer >= 1");
383
- process.exit(1);
374
+ throw new CLIError(
375
+ "--window must be a positive integer >= 1.",
376
+ "INVALID_FLAG",
377
+ "selftune watch --window 20",
378
+ );
384
379
  }
385
380
  const windowSessions = Number.parseInt(rawWindow, 10);
386
381
  if (windowSessions < 1) {
387
- console.error("[ERROR] --window must be a positive integer >= 1");
388
- process.exit(1);
382
+ throw new CLIError(
383
+ "--window must be a positive integer >= 1.",
384
+ "INVALID_FLAG",
385
+ "selftune watch --window 20",
386
+ );
389
387
  }
390
388
 
391
389
  const rawThreshold = values.threshold ?? "0.1";
392
390
  if (!/^\d+(\.\d+)?$/.test(rawThreshold)) {
393
- console.error("[ERROR] --threshold must be a finite number between 0 and 1");
394
- process.exit(1);
391
+ throw new CLIError(
392
+ "--threshold must be a finite number between 0 and 1.",
393
+ "INVALID_FLAG",
394
+ "selftune watch --threshold 0.1",
395
+ );
395
396
  }
396
397
  const regressionThreshold = Number.parseFloat(rawThreshold);
397
398
  if (regressionThreshold < 0 || regressionThreshold > 1) {
398
- console.error("[ERROR] --threshold must be a finite number between 0 and 1");
399
- process.exit(1);
399
+ throw new CLIError(
400
+ "--threshold must be a finite number between 0 and 1.",
401
+ "INVALID_FLAG",
402
+ "selftune watch --threshold 0.1",
403
+ );
400
404
  }
401
405
 
402
406
  const result = await watch({
@@ -414,8 +418,5 @@ Options:
414
418
  }
415
419
 
416
420
  if (import.meta.main) {
417
- cliMain().catch((err) => {
418
- console.error(`[FATAL] ${err}`);
419
- process.exit(1);
420
- });
421
+ cliMain().catch(handleCLIError);
421
422
  }
@@ -14,7 +14,6 @@
14
14
 
15
15
  import { createHash } from "node:crypto";
16
16
  import {
17
- appendFileSync,
18
17
  existsSync,
19
18
  mkdirSync,
20
19
  readFileSync,
@@ -388,32 +387,12 @@ export function getLatestPromptIdentity(
388
387
  };
389
388
  }
390
389
 
391
- export function appendCanonicalRecord(record: CanonicalRecord, logPath?: string): void {
390
+ export function appendCanonicalRecord(record: CanonicalRecord, _logPath?: string): void {
392
391
  writeCanonicalToDb(record);
393
- // JSONL append — best-effort backup for prompt state recovery
394
- try {
395
- const path = logPath ?? CANONICAL_LOG;
396
- const dir = dirname(path);
397
- if (!existsSync(dir)) mkdirSync(dir, { recursive: true });
398
- appendFileSync(path, `${JSON.stringify(record)}\n`, "utf-8");
399
- } catch {
400
- /* best-effort only */
401
- }
402
392
  }
403
393
 
404
- export function appendCanonicalRecords(records: CanonicalRecord[], logPath?: string): void {
394
+ export function appendCanonicalRecords(records: CanonicalRecord[], _logPath?: string): void {
405
395
  writeCanonicalBatchToDb(records);
406
- // JSONL append — best-effort backup for prompt state recovery
407
- try {
408
- const path = logPath ?? CANONICAL_LOG;
409
- const dir = dirname(path);
410
- if (!existsSync(dir)) mkdirSync(dir, { recursive: true });
411
- for (const record of records) {
412
- appendFileSync(path, `${JSON.stringify(record)}\n`, "utf-8");
413
- }
414
- } catch {
415
- /* best-effort only */
416
- }
417
396
  }
418
397
 
419
398
  // ---------------------------------------------------------------------------
@@ -9,9 +9,9 @@
9
9
  * explicit dry-run and review-required modes for human-in-the-loop operation.
10
10
  */
11
11
 
12
- import { existsSync, readFileSync, unlinkSync, writeFileSync } from "node:fs";
12
+ import { existsSync, mkdirSync, readFileSync, unlinkSync, writeFileSync } from "node:fs";
13
13
  import { homedir } from "node:os";
14
- import { join } from "node:path";
14
+ import { dirname, join } from "node:path";
15
15
  import { parseArgs } from "node:util";
16
16
 
17
17
  import { readAlphaIdentity } from "./alpha-identity.js";
@@ -19,9 +19,19 @@ import type { UploadCycleSummary } from "./alpha-upload/index.js";
19
19
  import { ORCHESTRATE_LOCK, SELFTUNE_CONFIG_PATH } from "./constants.js";
20
20
  import type { OrchestrateRunReport, OrchestrateRunSkillAction } from "./dashboard-contract.js";
21
21
  import type { EvolveResult } from "./evolution/evolve.js";
22
+ import {
23
+ buildDefaultGradingOutputPath,
24
+ deriveExpectationsFromSkill,
25
+ gradeSession,
26
+ resolveLatestSessionForSkill,
27
+ } from "./grading/grade-session.js";
22
28
  import { readGradingResultsForSkill } from "./grading/results.js";
23
29
  import { getDb } from "./localdb/db.js";
24
- import { updateSignalConsumed, writeOrchestrateRunToDb } from "./localdb/direct-write.js";
30
+ import {
31
+ updateSignalConsumed,
32
+ writeGradingResultToDb,
33
+ writeOrchestrateRunToDb,
34
+ } from "./localdb/direct-write.js";
25
35
  import {
26
36
  queryEvolutionAudit,
27
37
  queryImprovementSignals,
@@ -43,6 +53,7 @@ import type {
43
53
  SessionTelemetryRecord,
44
54
  SkillUsageRecord,
45
55
  } from "./types.js";
56
+ import { CLIError, handleCLIError } from "./utils/cli-error.js";
46
57
  import { detectAgent } from "./utils/llm-call.js";
47
58
  import { getSelftuneVersion, readConfiguredAgentType } from "./utils/selftune-meta.js";
48
59
  import {
@@ -50,6 +61,7 @@ import {
50
61
  findRepositoryClaudeSkillDirs,
51
62
  findRepositorySkillDirs,
52
63
  } from "./utils/skill-discovery.js";
64
+ import { readExcerpt } from "./utils/transcript.js";
53
65
 
54
66
  // ---------------------------------------------------------------------------
55
67
  // Lockfile management
@@ -156,6 +168,8 @@ export interface OrchestrateOptions {
156
168
  recentWindowHours: number;
157
169
  /** Force sync to rescan all sources. */
158
170
  syncForce: boolean;
171
+ /** Max ungraded skills to auto-grade per run (default: 5). Set 0 to disable. */
172
+ maxAutoGrade: number;
159
173
  }
160
174
 
161
175
  export interface SkillAction {
@@ -178,6 +192,7 @@ export interface OrchestrateResult {
178
192
  deployed: number;
179
193
  watched: number;
180
194
  skipped: number;
195
+ autoGraded: number;
181
196
  dryRun: boolean;
182
197
  approvalMode: "auto" | "review";
183
198
  elapsedMs: number;
@@ -335,6 +350,7 @@ export function formatOrchestrateReport(result: OrchestrateResult): string {
335
350
 
336
351
  // Final summary
337
352
  lines.push("Summary");
353
+ lines.push(` Auto-graded: ${result.summary.autoGraded}`);
338
354
  lines.push(` Evaluated: ${result.summary.evaluated} skills`);
339
355
  lines.push(` Deployed: ${result.summary.deployed}`);
340
356
  lines.push(` Watched: ${result.summary.watched}`);
@@ -620,6 +636,111 @@ function findRecentlyDeployedSkills(
620
636
  return names;
621
637
  }
622
638
 
639
+ // ---------------------------------------------------------------------------
640
+ // Auto-grade ungraded skills
641
+ // ---------------------------------------------------------------------------
642
+
643
+ /**
644
+ * Auto-grade the top ungraded skills that have some session data.
645
+ * Fail-open: individual grading errors are logged but never propagated.
646
+ *
647
+ * @returns Number of skills successfully graded.
648
+ */
649
+ export async function autoGradeTopUngraded(
650
+ skills: SkillStatus[],
651
+ maxAutoGrade: number,
652
+ agent: string,
653
+ deps: {
654
+ readTelemetry: () => SessionTelemetryRecord[];
655
+ readSkillRecords: () => SkillUsageRecord[];
656
+ },
657
+ ): Promise<number> {
658
+ // Filter: UNGRADED skills with some data (skill_checks > 0)
659
+ const ungradedWithData = skills
660
+ .filter((s) => s.status === "UNGRADED" && (s.snapshot?.skill_checks ?? 0) > 0)
661
+ .sort((a, b) => (b.snapshot?.skill_checks ?? 0) - (a.snapshot?.skill_checks ?? 0))
662
+ .slice(0, maxAutoGrade);
663
+
664
+ if (ungradedWithData.length === 0) return 0;
665
+
666
+ let graded = 0;
667
+
668
+ for (const skill of ungradedWithData) {
669
+ try {
670
+ const telemetry = deps.readTelemetry();
671
+ const skillUsage = deps.readSkillRecords();
672
+
673
+ // Resolve the latest session for this skill
674
+ const resolved = resolveLatestSessionForSkill(telemetry, skillUsage, skill.name);
675
+ if (!resolved) {
676
+ console.error(` [auto-grade] ${skill.name}: no session found, skipping`);
677
+ continue;
678
+ }
679
+
680
+ // Derive expectations from SKILL.md
681
+ const derived = deriveExpectationsFromSkill(skill.name);
682
+ let transcriptExcerpt = "(no transcript)";
683
+ if (resolved.transcriptPath) {
684
+ try {
685
+ transcriptExcerpt = readExcerpt(resolved.transcriptPath);
686
+ } catch {
687
+ transcriptExcerpt = "(no transcript)";
688
+ }
689
+ }
690
+
691
+ console.error(` [auto-grade] Grading "${skill.name}" (session ${resolved.sessionId})...`);
692
+
693
+ const result = await gradeSession({
694
+ expectations: derived.expectations,
695
+ telemetry: resolved.telemetry,
696
+ sessionId: resolved.sessionId,
697
+ skillName: skill.name,
698
+ transcriptExcerpt,
699
+ transcriptPath: resolved.transcriptPath,
700
+ agent,
701
+ });
702
+
703
+ // Persist to SQLite — only count as graded if DB write succeeds
704
+ let persisted = false;
705
+ try {
706
+ persisted = writeGradingResultToDb(result);
707
+ } catch {
708
+ persisted = false;
709
+ }
710
+ if (!persisted) {
711
+ console.error(` [auto-grade] ${skill.name}: graded but failed to persist result`);
712
+ continue;
713
+ }
714
+
715
+ // Persist to file (fail-open, supplementary)
716
+ try {
717
+ const basePath = buildDefaultGradingOutputPath(resolved.sessionId);
718
+ const safeName = skill.name.replace(/[^a-zA-Z0-9_-]/g, "_");
719
+ const outputPath = basePath.replace(/\.json$/, `_${safeName}.json`);
720
+ const outputDir = dirname(outputPath);
721
+ mkdirSync(outputDir, { recursive: true });
722
+ writeFileSync(outputPath, JSON.stringify(result, null, 2), "utf-8");
723
+ } catch {
724
+ // fail-open: DB is authoritative, file is supplementary
725
+ }
726
+
727
+ const passRate = result.summary.pass_rate;
728
+ console.error(
729
+ ` [auto-grade] ${skill.name}: ${result.summary.passed}/${result.summary.total} passed (${Math.round(passRate * 100)}%)`,
730
+ );
731
+ graded++;
732
+ } catch (err) {
733
+ const msg = err instanceof Error ? err.message : String(err);
734
+ console.error(
735
+ ` [auto-grade] ${skill.name}: error — ${msg}. Retry with: selftune grade ${skill.name}`,
736
+ );
737
+ // fail-open: continue to next skill
738
+ }
739
+ }
740
+
741
+ return graded;
742
+ }
743
+
623
744
  // ---------------------------------------------------------------------------
624
745
  // Main orchestrator
625
746
  // ---------------------------------------------------------------------------
@@ -665,6 +786,7 @@ export async function orchestrate(
665
786
  deployed: 0,
666
787
  watched: 0,
667
788
  skipped: 0,
789
+ autoGraded: 0,
668
790
  dryRun: options.dryRun,
669
791
  approvalMode: options.approvalMode,
670
792
  elapsedMs: 0,
@@ -732,7 +854,7 @@ export async function orchestrate(
732
854
  const auditEntries = _readAuditEntries();
733
855
  const doctorResult = await _doctor();
734
856
 
735
- const statusResult = _computeStatus(
857
+ let statusResult = _computeStatus(
736
858
  telemetry,
737
859
  skillRecords,
738
860
  queryRecords,
@@ -743,6 +865,61 @@ export async function orchestrate(
743
865
  `[orchestrate] Status: ${statusResult.skills.length} skills, system=${statusResult.system.healthy ? "healthy" : "unhealthy"}`,
744
866
  );
745
867
 
868
+ // -------------------------------------------------------------------------
869
+ // Step 2a: Auto-grade ungraded skills with sufficient data
870
+ // -------------------------------------------------------------------------
871
+ let autoGradedCount = 0;
872
+ const scopedSkills = options.skillFilter
873
+ ? statusResult.skills.filter((s) => s.name === options.skillFilter)
874
+ : statusResult.skills;
875
+ const ungradedWithData = scopedSkills.filter(
876
+ (s) => s.status === "UNGRADED" && (s.snapshot?.skill_checks ?? 0) > 0,
877
+ );
878
+
879
+ if (!options.dryRun && options.maxAutoGrade > 0 && ungradedWithData.length > 0) {
880
+ const gradeAgent = _detectAgent();
881
+ if (gradeAgent) {
882
+ console.error(
883
+ `[orchestrate] Auto-grading ${Math.min(ungradedWithData.length, options.maxAutoGrade)} ungraded skill(s)...`,
884
+ );
885
+ autoGradedCount = await autoGradeTopUngraded(
886
+ scopedSkills,
887
+ options.maxAutoGrade,
888
+ gradeAgent,
889
+ { readTelemetry: _readTelemetry, readSkillRecords: _readSkillRecords },
890
+ );
891
+
892
+ if (autoGradedCount > 0) {
893
+ // Recompute status so candidate selection sees updated grades
894
+ console.error(
895
+ `[orchestrate] Recomputing status after grading ${autoGradedCount} skill(s)...`,
896
+ );
897
+ try {
898
+ const freshTelemetry = _readTelemetry();
899
+ const freshSkillRecords = _readSkillRecords();
900
+ const freshQueryRecords = _readQueryRecords();
901
+ const freshAudit = _readAuditEntries();
902
+ const freshDoctor = doctorResult; // reuse — environment unchanged during grading
903
+ statusResult = _computeStatus(
904
+ freshTelemetry,
905
+ freshSkillRecords,
906
+ freshQueryRecords,
907
+ freshAudit,
908
+ freshDoctor,
909
+ );
910
+ } catch (recomputeErr) {
911
+ console.error(
912
+ `[orchestrate] Warning: failed to recompute status after grading — using pre-grade status. ${recomputeErr instanceof Error ? recomputeErr.message : String(recomputeErr)}`,
913
+ );
914
+ }
915
+ }
916
+ } else {
917
+ console.error(
918
+ "[orchestrate] No agent CLI found — skipping auto-grade. To disable, rerun with: selftune orchestrate --max-auto-grade 0",
919
+ );
920
+ }
921
+ }
922
+
746
923
  // -------------------------------------------------------------------------
747
924
  // Step 2b: Read pending improvement signals
748
925
  // -------------------------------------------------------------------------
@@ -919,6 +1096,7 @@ export async function orchestrate(
919
1096
  deployed: candidates.filter((c) => c.evolveResult?.deployed).length,
920
1097
  watched: candidates.filter((c) => c.action === "watch").length,
921
1098
  skipped: candidates.filter((c) => c.action === "skip").length,
1099
+ autoGraded: autoGradedCount,
922
1100
  };
923
1101
 
924
1102
  const result: OrchestrateResult = {
@@ -956,6 +1134,7 @@ export async function orchestrate(
956
1134
  deployed: finalTotals.deployed,
957
1135
  watched: finalTotals.watched,
958
1136
  skipped: finalTotals.skipped,
1137
+ auto_graded: finalTotals.autoGraded,
959
1138
  skill_actions: candidates.map(
960
1139
  (c): OrchestrateRunSkillAction => ({
961
1140
  skill: c.skill,
@@ -1023,6 +1202,7 @@ export async function cliMain(): Promise<void> {
1023
1202
  "max-skills": { type: "string", default: "5" },
1024
1203
  "recent-window": { type: "string", default: "48" },
1025
1204
  "sync-force": { type: "boolean", default: false },
1205
+ "max-auto-grade": { type: "string", default: "5" },
1026
1206
  loop: { type: "boolean", default: false },
1027
1207
  "loop-interval": { type: "string", default: "3600" },
1028
1208
  help: { type: "boolean", short: "h", default: false },
@@ -1033,7 +1213,7 @@ export async function cliMain(): Promise<void> {
1033
1213
  if (values.help) {
1034
1214
  console.log(`selftune orchestrate — Autonomous core loop
1035
1215
 
1036
- Runs the full improvement cycle: sync → status → evolve → watch.
1216
+ Runs the full improvement cycle: sync → status → auto-grade → evolve → watch.
1037
1217
 
1038
1218
  Usage:
1039
1219
  selftune orchestrate [options]
@@ -1046,6 +1226,7 @@ Options:
1046
1226
  --max-skills <n> Cap skills processed per run (default: 5)
1047
1227
  --recent-window <hrs> Hours to look back for watch targets (default: 48)
1048
1228
  --sync-force Force full rescan during sync
1229
+ --max-auto-grade <n> Max ungraded skills to auto-grade per run (default: 5, 0 to disable)
1049
1230
  --loop Run in continuous loop mode (never stops)
1050
1231
  --loop-interval <s> Seconds between iterations (default: 3600, min: 60)
1051
1232
  -h, --help Show this help message
@@ -1067,23 +1248,45 @@ Examples:
1067
1248
  process.exit(0);
1068
1249
  }
1069
1250
 
1070
- const maxSkills = Number.parseInt(values["max-skills"] ?? "5", 10);
1071
- if (Number.isNaN(maxSkills) || maxSkills < 1) {
1072
- console.error("[ERROR] --max-skills must be a positive integer");
1073
- process.exit(1);
1251
+ const maxSkillsRaw = values["max-skills"] ?? "5";
1252
+ if (!/^\d+$/.test(maxSkillsRaw) || Number(maxSkillsRaw) < 1) {
1253
+ throw new CLIError(
1254
+ "--max-skills must be a positive integer",
1255
+ "INVALID_FLAG",
1256
+ "selftune orchestrate --max-skills 5",
1257
+ );
1074
1258
  }
1075
-
1076
- const recentWindow = Number.parseInt(values["recent-window"] ?? "48", 10);
1077
- if (Number.isNaN(recentWindow) || recentWindow < 1) {
1078
- console.error("[ERROR] --recent-window must be a positive integer");
1079
- process.exit(1);
1259
+ const maxSkills = Number(maxSkillsRaw);
1260
+
1261
+ const recentWindowRaw = values["recent-window"] ?? "48";
1262
+ if (!/^\d+$/.test(recentWindowRaw) || Number(recentWindowRaw) < 1) {
1263
+ throw new CLIError(
1264
+ "--recent-window must be a positive integer",
1265
+ "INVALID_FLAG",
1266
+ "selftune orchestrate --recent-window 48",
1267
+ );
1080
1268
  }
1081
-
1082
- const loopInterval = Number.parseInt(values["loop-interval"] ?? "3600", 10);
1083
- if (values.loop && (Number.isNaN(loopInterval) || loopInterval < 60)) {
1084
- console.error("[ERROR] --loop-interval must be an integer >= 60 (seconds)");
1085
- process.exit(1);
1269
+ const recentWindow = Number(recentWindowRaw);
1270
+
1271
+ const maxAutoGradeRaw = values["max-auto-grade"] ?? "5";
1272
+ if (!/^\d+$/.test(maxAutoGradeRaw)) {
1273
+ throw new CLIError(
1274
+ "--max-auto-grade must be a non-negative integer",
1275
+ "INVALID_FLAG",
1276
+ "selftune orchestrate --max-auto-grade 5",
1277
+ );
1278
+ }
1279
+ const maxAutoGrade = Number(maxAutoGradeRaw);
1280
+
1281
+ const loopIntervalRaw = values["loop-interval"] ?? "3600";
1282
+ if (!/^\d+$/.test(loopIntervalRaw) || (values.loop && Number(loopIntervalRaw) < 60)) {
1283
+ throw new CLIError(
1284
+ "--loop-interval must be an integer >= 60 (seconds)",
1285
+ "INVALID_FLAG",
1286
+ "selftune orchestrate --loop --loop-interval 3600",
1287
+ );
1086
1288
  }
1289
+ const loopInterval = Number(loopIntervalRaw);
1087
1290
 
1088
1291
  const autoApprove = values["auto-approve"] ?? false;
1089
1292
  if (autoApprove) {
@@ -1132,6 +1335,7 @@ Examples:
1132
1335
  maxSkills,
1133
1336
  recentWindowHours: recentWindow,
1134
1337
  syncForce: values["sync-force"] ?? false,
1338
+ maxAutoGrade,
1135
1339
  });
1136
1340
 
1137
1341
  // JSON output: include per-skill decisions for machine consumption
@@ -1188,9 +1392,5 @@ Examples:
1188
1392
  }
1189
1393
 
1190
1394
  if (import.meta.main) {
1191
- cliMain().catch((err) => {
1192
- const message = err instanceof Error ? err.message : String(err);
1193
- console.error(`[FATAL] ${message}`);
1194
- process.exit(1);
1195
- });
1395
+ cliMain().catch(handleCLIError);
1196
1396
  }
@@ -8,6 +8,7 @@
8
8
 
9
9
  import type { Database } from "bun:sqlite";
10
10
 
11
+ import { scoreDescription } from "../evolution/description-quality.js";
11
12
  import { getPendingProposals, getSkillReportPayload, safeParseJson } from "../localdb/queries.js";
12
13
 
13
14
  export function handleSkillReport(db: Database, skillName: string): Response {
@@ -203,6 +204,21 @@ export function handleSkillReport(db: Database, skillName: string): Response {
203
204
  completion_status: string | null;
204
205
  }>;
205
206
 
207
+ // 8. Description quality score — computed from latest evolution evidence
208
+ const latestEvidence = db
209
+ .query(
210
+ `SELECT proposed_text, original_text FROM evolution_evidence
211
+ WHERE skill_name = ? AND (proposed_text IS NOT NULL OR original_text IS NOT NULL)
212
+ ORDER BY timestamp DESC LIMIT 1`,
213
+ )
214
+ .get(skillName) as { proposed_text: string | null; original_text: string | null } | null;
215
+
216
+ // Use the most recent description: deployed proposed_text, or fallback to original_text
217
+ const currentDescriptionText = latestEvidence?.proposed_text ?? latestEvidence?.original_text;
218
+ const descriptionQuality = currentDescriptionText
219
+ ? scoreDescription(currentDescriptionText, skillName)
220
+ : null;
221
+
206
222
  return Response.json({
207
223
  ...report,
208
224
  evolution: evolutionWithSnapshot,
@@ -227,5 +243,6 @@ export function handleSkillReport(db: Database, skillName: string): Response {
227
243
  is_actionable: p.is_actionable === 1,
228
244
  })),
229
245
  session_metadata: sessionMeta,
246
+ description_quality: descriptionQuality,
230
247
  });
231
248
  }