selftune 0.2.13 → 0.2.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/apps/local-dashboard/dist/assets/index-BMIS6uUh.css +2 -0
- package/apps/local-dashboard/dist/assets/index-DOu3iLD9.js +16 -0
- package/apps/local-dashboard/dist/assets/vendor-ui-DIwlrGlb.js +12 -0
- package/apps/local-dashboard/dist/index.html +3 -3
- package/cli/selftune/activation-rules.ts +24 -48
- package/cli/selftune/analytics.ts +13 -11
- package/cli/selftune/badge/badge.ts +13 -9
- package/cli/selftune/canonical-export.ts +6 -6
- package/cli/selftune/constants.ts +7 -0
- package/cli/selftune/contribute/bundle.ts +9 -44
- package/cli/selftune/contribute/contribute.ts +2 -1
- package/cli/selftune/cron/setup.ts +3 -1
- package/cli/selftune/dashboard-contract.ts +22 -0
- package/cli/selftune/dashboard.ts +10 -5
- package/cli/selftune/eval/baseline.ts +20 -30
- package/cli/selftune/eval/hooks-to-evals.ts +27 -34
- package/cli/selftune/eval/import-skillsbench.ts +21 -8
- package/cli/selftune/eval/unit-test-cli.ts +22 -11
- package/cli/selftune/evolution/description-quality.ts +224 -0
- package/cli/selftune/evolution/evolve-body.ts +17 -10
- package/cli/selftune/evolution/evolve.ts +70 -57
- package/cli/selftune/evolution/rollback.ts +7 -6
- package/cli/selftune/grading/auto-grade.ts +27 -35
- package/cli/selftune/grading/grade-session.ts +24 -30
- package/cli/selftune/hooks/auto-activate.ts +12 -3
- package/cli/selftune/hooks/evolution-guard.ts +14 -24
- package/cli/selftune/hooks/prompt-log.ts +7 -9
- package/cli/selftune/hooks/session-stop.ts +0 -8
- package/cli/selftune/index.ts +66 -69
- package/cli/selftune/ingestors/claude-replay.ts +29 -14
- package/cli/selftune/ingestors/codex-rollout.ts +15 -5
- package/cli/selftune/ingestors/codex-wrapper.ts +15 -13
- package/cli/selftune/ingestors/openclaw-ingest.ts +24 -5
- package/cli/selftune/ingestors/opencode-ingest.ts +9 -4
- package/cli/selftune/init.ts +14 -9
- package/cli/selftune/localdb/queries.ts +57 -0
- package/cli/selftune/monitoring/watch.ts +39 -38
- package/cli/selftune/normalization.ts +2 -23
- package/cli/selftune/orchestrate.ts +224 -24
- package/cli/selftune/routes/skill-report.ts +17 -0
- package/cli/selftune/schedule.ts +74 -14
- package/cli/selftune/sync.ts +7 -3
- package/cli/selftune/types.ts +44 -10
- package/cli/selftune/utils/cli-error.ts +102 -0
- package/cli/selftune/utils/jsonl.ts +2 -0
- package/cli/selftune/workflows/workflows.ts +23 -17
- package/package.json +3 -1
- package/packages/ui/src/components/RecentActivityFeed.tsx +86 -0
- package/packages/ui/src/components/index.ts +1 -0
- package/packages/ui/src/components/section-cards.tsx +13 -0
- package/skill/SKILL.md +1 -1
- package/skill/Workflows/Evolve.md +4 -0
- package/skill/Workflows/Initialize.md +8 -8
- package/skill/Workflows/Orchestrate.md +11 -7
- package/skill/Workflows/Schedule.md +11 -0
- package/skill/references/logs.md +22 -21
- package/skill/settings_snippet.json +29 -6
- package/apps/local-dashboard/dist/assets/index-4_dAY17K.js +0 -16
- package/apps/local-dashboard/dist/assets/index-BxV5WZHc.css +0 -2
- package/apps/local-dashboard/dist/assets/vendor-ui-7xD7fNEU.js +0 -12
|
@@ -11,6 +11,7 @@ import type {
|
|
|
11
11
|
OrchestrateRunReport,
|
|
12
12
|
OverviewPayload,
|
|
13
13
|
PendingProposal,
|
|
14
|
+
RecentActivityItem,
|
|
14
15
|
SkillReportPayload,
|
|
15
16
|
SkillSummary,
|
|
16
17
|
} from "../dashboard-contract.js";
|
|
@@ -126,6 +127,10 @@ export function getOverviewPayload(db: Database): OverviewPayload {
|
|
|
126
127
|
// Pending proposals: created/validated but no terminal action (deduped in SQL)
|
|
127
128
|
const pending_proposals = getPendingProposals(db);
|
|
128
129
|
|
|
130
|
+
// Active sessions and recent activity
|
|
131
|
+
const active_sessions = getActiveSessionCount(db);
|
|
132
|
+
const recent_activity = getRecentActivity(db);
|
|
133
|
+
|
|
129
134
|
return {
|
|
130
135
|
telemetry,
|
|
131
136
|
skills,
|
|
@@ -133,6 +138,8 @@ export function getOverviewPayload(db: Database): OverviewPayload {
|
|
|
133
138
|
counts,
|
|
134
139
|
unmatched_queries: unmatchedRows,
|
|
135
140
|
pending_proposals,
|
|
141
|
+
active_sessions,
|
|
142
|
+
recent_activity,
|
|
136
143
|
};
|
|
137
144
|
}
|
|
138
145
|
|
|
@@ -361,6 +368,56 @@ export function getOrchestrateRuns(db: Database, limit = 20): OrchestrateRunRepo
|
|
|
361
368
|
}));
|
|
362
369
|
}
|
|
363
370
|
|
|
371
|
+
/**
|
|
372
|
+
* Count sessions that have queries recorded but no session_telemetry yet
|
|
373
|
+
* (i.e., the session is still in progress).
|
|
374
|
+
*/
|
|
375
|
+
export function getActiveSessionCount(db: Database): number {
|
|
376
|
+
const row = db
|
|
377
|
+
.query(
|
|
378
|
+
`SELECT COUNT(DISTINCT q.session_id) as count
|
|
379
|
+
FROM queries q
|
|
380
|
+
WHERE NOT EXISTS (
|
|
381
|
+
SELECT 1 FROM session_telemetry st WHERE st.session_id = q.session_id
|
|
382
|
+
)`,
|
|
383
|
+
)
|
|
384
|
+
.get() as { count: number };
|
|
385
|
+
return row.count;
|
|
386
|
+
}
|
|
387
|
+
|
|
388
|
+
/**
|
|
389
|
+
* Get the most recent skill invocations with a flag indicating whether the
|
|
390
|
+
* session is still in progress (no session_telemetry row yet).
|
|
391
|
+
*/
|
|
392
|
+
export function getRecentActivity(db: Database, limit = 20): RecentActivityItem[] {
|
|
393
|
+
const rows = db
|
|
394
|
+
.query(
|
|
395
|
+
`SELECT si.occurred_at, si.session_id, si.skill_name, si.query, si.triggered,
|
|
396
|
+
CASE WHEN st.session_id IS NULL THEN 1 ELSE 0 END as is_live
|
|
397
|
+
FROM skill_invocations si
|
|
398
|
+
LEFT JOIN session_telemetry st ON si.session_id = st.session_id
|
|
399
|
+
ORDER BY si.occurred_at DESC
|
|
400
|
+
LIMIT ?`,
|
|
401
|
+
)
|
|
402
|
+
.all(limit) as Array<{
|
|
403
|
+
occurred_at: string;
|
|
404
|
+
session_id: string;
|
|
405
|
+
skill_name: string;
|
|
406
|
+
query: string;
|
|
407
|
+
triggered: number;
|
|
408
|
+
is_live: number;
|
|
409
|
+
}>;
|
|
410
|
+
|
|
411
|
+
return rows.map((row) => ({
|
|
412
|
+
timestamp: row.occurred_at,
|
|
413
|
+
session_id: row.session_id,
|
|
414
|
+
skill_name: row.skill_name,
|
|
415
|
+
query: row.query ?? "",
|
|
416
|
+
triggered: row.triggered === 1,
|
|
417
|
+
is_live: row.is_live === 1,
|
|
418
|
+
}));
|
|
419
|
+
}
|
|
420
|
+
|
|
364
421
|
// -- Generic read queries (Phase 3: replace readJsonl calls) ------------------
|
|
365
422
|
|
|
366
423
|
/**
|
|
@@ -26,7 +26,7 @@ import type {
|
|
|
26
26
|
SessionTelemetryRecord,
|
|
27
27
|
SkillUsageRecord,
|
|
28
28
|
} from "../types.js";
|
|
29
|
-
import {
|
|
29
|
+
import { CLIError, handleCLIError } from "../utils/cli-error.js";
|
|
30
30
|
import {
|
|
31
31
|
filterActionableQueryRecords,
|
|
32
32
|
filterActionableSkillUsageRecords,
|
|
@@ -212,27 +212,13 @@ export async function watch(options: WatchOptions): Promise<WatchResult> {
|
|
|
212
212
|
);
|
|
213
213
|
}
|
|
214
214
|
|
|
215
|
-
// 1. Read log files from SQLite
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
_queryLogPath === QUERY_LOG
|
|
223
|
-
) {
|
|
224
|
-
const db = getDb();
|
|
225
|
-
telemetry = querySessionTelemetry(db) as SessionTelemetryRecord[];
|
|
226
|
-
// SQLite queries return DESC order; computeMonitoringSnapshot expects chronological (ASC)
|
|
227
|
-
telemetry.sort((a, b) => a.timestamp.localeCompare(b.timestamp));
|
|
228
|
-
skillRecords = querySkillUsageRecords(db) as SkillUsageRecord[];
|
|
229
|
-
queryRecords = queryQueryLog(db) as QueryLogRecord[];
|
|
230
|
-
} else {
|
|
231
|
-
// Intentional JSONL fallback: custom log path overrides bypass SQLite reads
|
|
232
|
-
telemetry = readJsonl<SessionTelemetryRecord>(_telemetryLogPath);
|
|
233
|
-
skillRecords = readJsonl<SkillUsageRecord>(_skillLogPath);
|
|
234
|
-
queryRecords = readJsonl<QueryLogRecord>(_queryLogPath);
|
|
235
|
-
}
|
|
215
|
+
// 1. Read log files from SQLite
|
|
216
|
+
const db = getDb();
|
|
217
|
+
const telemetry = querySessionTelemetry(db) as SessionTelemetryRecord[];
|
|
218
|
+
// SQLite queries return DESC order; computeMonitoringSnapshot expects chronological (ASC)
|
|
219
|
+
telemetry.sort((a, b) => a.timestamp.localeCompare(b.timestamp));
|
|
220
|
+
const skillRecords = querySkillUsageRecords(db) as SkillUsageRecord[];
|
|
221
|
+
const queryRecords = queryQueryLog(db) as QueryLogRecord[];
|
|
236
222
|
|
|
237
223
|
// 2. Determine baseline pass rate from last deployed audit entry
|
|
238
224
|
const lastDeployed = getLastDeployedProposal(skillName, _auditLogPath);
|
|
@@ -369,34 +355,52 @@ Options:
|
|
|
369
355
|
}
|
|
370
356
|
|
|
371
357
|
if (!values.skill || !values["skill-path"]) {
|
|
372
|
-
|
|
373
|
-
|
|
358
|
+
throw new CLIError(
|
|
359
|
+
"--skill and --skill-path are required.",
|
|
360
|
+
"MISSING_FLAG",
|
|
361
|
+
"Usage: selftune watch --skill <name> --skill-path <path>",
|
|
362
|
+
);
|
|
374
363
|
}
|
|
375
364
|
if ((values["sync-force"] ?? false) && !(values["sync-first"] ?? false)) {
|
|
376
|
-
|
|
377
|
-
|
|
365
|
+
throw new CLIError(
|
|
366
|
+
"--sync-force requires --sync-first.",
|
|
367
|
+
"INVALID_FLAG",
|
|
368
|
+
"Add --sync-first when using --sync-force.",
|
|
369
|
+
);
|
|
378
370
|
}
|
|
379
371
|
|
|
380
372
|
const rawWindow = values.window ?? "20";
|
|
381
373
|
if (!/^\d+$/.test(rawWindow)) {
|
|
382
|
-
|
|
383
|
-
|
|
374
|
+
throw new CLIError(
|
|
375
|
+
"--window must be a positive integer >= 1.",
|
|
376
|
+
"INVALID_FLAG",
|
|
377
|
+
"selftune watch --window 20",
|
|
378
|
+
);
|
|
384
379
|
}
|
|
385
380
|
const windowSessions = Number.parseInt(rawWindow, 10);
|
|
386
381
|
if (windowSessions < 1) {
|
|
387
|
-
|
|
388
|
-
|
|
382
|
+
throw new CLIError(
|
|
383
|
+
"--window must be a positive integer >= 1.",
|
|
384
|
+
"INVALID_FLAG",
|
|
385
|
+
"selftune watch --window 20",
|
|
386
|
+
);
|
|
389
387
|
}
|
|
390
388
|
|
|
391
389
|
const rawThreshold = values.threshold ?? "0.1";
|
|
392
390
|
if (!/^\d+(\.\d+)?$/.test(rawThreshold)) {
|
|
393
|
-
|
|
394
|
-
|
|
391
|
+
throw new CLIError(
|
|
392
|
+
"--threshold must be a finite number between 0 and 1.",
|
|
393
|
+
"INVALID_FLAG",
|
|
394
|
+
"selftune watch --threshold 0.1",
|
|
395
|
+
);
|
|
395
396
|
}
|
|
396
397
|
const regressionThreshold = Number.parseFloat(rawThreshold);
|
|
397
398
|
if (regressionThreshold < 0 || regressionThreshold > 1) {
|
|
398
|
-
|
|
399
|
-
|
|
399
|
+
throw new CLIError(
|
|
400
|
+
"--threshold must be a finite number between 0 and 1.",
|
|
401
|
+
"INVALID_FLAG",
|
|
402
|
+
"selftune watch --threshold 0.1",
|
|
403
|
+
);
|
|
400
404
|
}
|
|
401
405
|
|
|
402
406
|
const result = await watch({
|
|
@@ -414,8 +418,5 @@ Options:
|
|
|
414
418
|
}
|
|
415
419
|
|
|
416
420
|
if (import.meta.main) {
|
|
417
|
-
cliMain().catch(
|
|
418
|
-
console.error(`[FATAL] ${err}`);
|
|
419
|
-
process.exit(1);
|
|
420
|
-
});
|
|
421
|
+
cliMain().catch(handleCLIError);
|
|
421
422
|
}
|
|
@@ -14,7 +14,6 @@
|
|
|
14
14
|
|
|
15
15
|
import { createHash } from "node:crypto";
|
|
16
16
|
import {
|
|
17
|
-
appendFileSync,
|
|
18
17
|
existsSync,
|
|
19
18
|
mkdirSync,
|
|
20
19
|
readFileSync,
|
|
@@ -388,32 +387,12 @@ export function getLatestPromptIdentity(
|
|
|
388
387
|
};
|
|
389
388
|
}
|
|
390
389
|
|
|
391
|
-
export function appendCanonicalRecord(record: CanonicalRecord,
|
|
390
|
+
export function appendCanonicalRecord(record: CanonicalRecord, _logPath?: string): void {
|
|
392
391
|
writeCanonicalToDb(record);
|
|
393
|
-
// JSONL append — best-effort backup for prompt state recovery
|
|
394
|
-
try {
|
|
395
|
-
const path = logPath ?? CANONICAL_LOG;
|
|
396
|
-
const dir = dirname(path);
|
|
397
|
-
if (!existsSync(dir)) mkdirSync(dir, { recursive: true });
|
|
398
|
-
appendFileSync(path, `${JSON.stringify(record)}\n`, "utf-8");
|
|
399
|
-
} catch {
|
|
400
|
-
/* best-effort only */
|
|
401
|
-
}
|
|
402
392
|
}
|
|
403
393
|
|
|
404
|
-
export function appendCanonicalRecords(records: CanonicalRecord[],
|
|
394
|
+
export function appendCanonicalRecords(records: CanonicalRecord[], _logPath?: string): void {
|
|
405
395
|
writeCanonicalBatchToDb(records);
|
|
406
|
-
// JSONL append — best-effort backup for prompt state recovery
|
|
407
|
-
try {
|
|
408
|
-
const path = logPath ?? CANONICAL_LOG;
|
|
409
|
-
const dir = dirname(path);
|
|
410
|
-
if (!existsSync(dir)) mkdirSync(dir, { recursive: true });
|
|
411
|
-
for (const record of records) {
|
|
412
|
-
appendFileSync(path, `${JSON.stringify(record)}\n`, "utf-8");
|
|
413
|
-
}
|
|
414
|
-
} catch {
|
|
415
|
-
/* best-effort only */
|
|
416
|
-
}
|
|
417
396
|
}
|
|
418
397
|
|
|
419
398
|
// ---------------------------------------------------------------------------
|
|
@@ -9,9 +9,9 @@
|
|
|
9
9
|
* explicit dry-run and review-required modes for human-in-the-loop operation.
|
|
10
10
|
*/
|
|
11
11
|
|
|
12
|
-
import { existsSync, readFileSync, unlinkSync, writeFileSync } from "node:fs";
|
|
12
|
+
import { existsSync, mkdirSync, readFileSync, unlinkSync, writeFileSync } from "node:fs";
|
|
13
13
|
import { homedir } from "node:os";
|
|
14
|
-
import { join } from "node:path";
|
|
14
|
+
import { dirname, join } from "node:path";
|
|
15
15
|
import { parseArgs } from "node:util";
|
|
16
16
|
|
|
17
17
|
import { readAlphaIdentity } from "./alpha-identity.js";
|
|
@@ -19,9 +19,19 @@ import type { UploadCycleSummary } from "./alpha-upload/index.js";
|
|
|
19
19
|
import { ORCHESTRATE_LOCK, SELFTUNE_CONFIG_PATH } from "./constants.js";
|
|
20
20
|
import type { OrchestrateRunReport, OrchestrateRunSkillAction } from "./dashboard-contract.js";
|
|
21
21
|
import type { EvolveResult } from "./evolution/evolve.js";
|
|
22
|
+
import {
|
|
23
|
+
buildDefaultGradingOutputPath,
|
|
24
|
+
deriveExpectationsFromSkill,
|
|
25
|
+
gradeSession,
|
|
26
|
+
resolveLatestSessionForSkill,
|
|
27
|
+
} from "./grading/grade-session.js";
|
|
22
28
|
import { readGradingResultsForSkill } from "./grading/results.js";
|
|
23
29
|
import { getDb } from "./localdb/db.js";
|
|
24
|
-
import {
|
|
30
|
+
import {
|
|
31
|
+
updateSignalConsumed,
|
|
32
|
+
writeGradingResultToDb,
|
|
33
|
+
writeOrchestrateRunToDb,
|
|
34
|
+
} from "./localdb/direct-write.js";
|
|
25
35
|
import {
|
|
26
36
|
queryEvolutionAudit,
|
|
27
37
|
queryImprovementSignals,
|
|
@@ -43,6 +53,7 @@ import type {
|
|
|
43
53
|
SessionTelemetryRecord,
|
|
44
54
|
SkillUsageRecord,
|
|
45
55
|
} from "./types.js";
|
|
56
|
+
import { CLIError, handleCLIError } from "./utils/cli-error.js";
|
|
46
57
|
import { detectAgent } from "./utils/llm-call.js";
|
|
47
58
|
import { getSelftuneVersion, readConfiguredAgentType } from "./utils/selftune-meta.js";
|
|
48
59
|
import {
|
|
@@ -50,6 +61,7 @@ import {
|
|
|
50
61
|
findRepositoryClaudeSkillDirs,
|
|
51
62
|
findRepositorySkillDirs,
|
|
52
63
|
} from "./utils/skill-discovery.js";
|
|
64
|
+
import { readExcerpt } from "./utils/transcript.js";
|
|
53
65
|
|
|
54
66
|
// ---------------------------------------------------------------------------
|
|
55
67
|
// Lockfile management
|
|
@@ -156,6 +168,8 @@ export interface OrchestrateOptions {
|
|
|
156
168
|
recentWindowHours: number;
|
|
157
169
|
/** Force sync to rescan all sources. */
|
|
158
170
|
syncForce: boolean;
|
|
171
|
+
/** Max ungraded skills to auto-grade per run (default: 5). Set 0 to disable. */
|
|
172
|
+
maxAutoGrade: number;
|
|
159
173
|
}
|
|
160
174
|
|
|
161
175
|
export interface SkillAction {
|
|
@@ -178,6 +192,7 @@ export interface OrchestrateResult {
|
|
|
178
192
|
deployed: number;
|
|
179
193
|
watched: number;
|
|
180
194
|
skipped: number;
|
|
195
|
+
autoGraded: number;
|
|
181
196
|
dryRun: boolean;
|
|
182
197
|
approvalMode: "auto" | "review";
|
|
183
198
|
elapsedMs: number;
|
|
@@ -335,6 +350,7 @@ export function formatOrchestrateReport(result: OrchestrateResult): string {
|
|
|
335
350
|
|
|
336
351
|
// Final summary
|
|
337
352
|
lines.push("Summary");
|
|
353
|
+
lines.push(` Auto-graded: ${result.summary.autoGraded}`);
|
|
338
354
|
lines.push(` Evaluated: ${result.summary.evaluated} skills`);
|
|
339
355
|
lines.push(` Deployed: ${result.summary.deployed}`);
|
|
340
356
|
lines.push(` Watched: ${result.summary.watched}`);
|
|
@@ -620,6 +636,111 @@ function findRecentlyDeployedSkills(
|
|
|
620
636
|
return names;
|
|
621
637
|
}
|
|
622
638
|
|
|
639
|
+
// ---------------------------------------------------------------------------
|
|
640
|
+
// Auto-grade ungraded skills
|
|
641
|
+
// ---------------------------------------------------------------------------
|
|
642
|
+
|
|
643
|
+
/**
|
|
644
|
+
* Auto-grade the top ungraded skills that have some session data.
|
|
645
|
+
* Fail-open: individual grading errors are logged but never propagated.
|
|
646
|
+
*
|
|
647
|
+
* @returns Number of skills successfully graded.
|
|
648
|
+
*/
|
|
649
|
+
export async function autoGradeTopUngraded(
|
|
650
|
+
skills: SkillStatus[],
|
|
651
|
+
maxAutoGrade: number,
|
|
652
|
+
agent: string,
|
|
653
|
+
deps: {
|
|
654
|
+
readTelemetry: () => SessionTelemetryRecord[];
|
|
655
|
+
readSkillRecords: () => SkillUsageRecord[];
|
|
656
|
+
},
|
|
657
|
+
): Promise<number> {
|
|
658
|
+
// Filter: UNGRADED skills with some data (skill_checks > 0)
|
|
659
|
+
const ungradedWithData = skills
|
|
660
|
+
.filter((s) => s.status === "UNGRADED" && (s.snapshot?.skill_checks ?? 0) > 0)
|
|
661
|
+
.sort((a, b) => (b.snapshot?.skill_checks ?? 0) - (a.snapshot?.skill_checks ?? 0))
|
|
662
|
+
.slice(0, maxAutoGrade);
|
|
663
|
+
|
|
664
|
+
if (ungradedWithData.length === 0) return 0;
|
|
665
|
+
|
|
666
|
+
let graded = 0;
|
|
667
|
+
|
|
668
|
+
for (const skill of ungradedWithData) {
|
|
669
|
+
try {
|
|
670
|
+
const telemetry = deps.readTelemetry();
|
|
671
|
+
const skillUsage = deps.readSkillRecords();
|
|
672
|
+
|
|
673
|
+
// Resolve the latest session for this skill
|
|
674
|
+
const resolved = resolveLatestSessionForSkill(telemetry, skillUsage, skill.name);
|
|
675
|
+
if (!resolved) {
|
|
676
|
+
console.error(` [auto-grade] ${skill.name}: no session found, skipping`);
|
|
677
|
+
continue;
|
|
678
|
+
}
|
|
679
|
+
|
|
680
|
+
// Derive expectations from SKILL.md
|
|
681
|
+
const derived = deriveExpectationsFromSkill(skill.name);
|
|
682
|
+
let transcriptExcerpt = "(no transcript)";
|
|
683
|
+
if (resolved.transcriptPath) {
|
|
684
|
+
try {
|
|
685
|
+
transcriptExcerpt = readExcerpt(resolved.transcriptPath);
|
|
686
|
+
} catch {
|
|
687
|
+
transcriptExcerpt = "(no transcript)";
|
|
688
|
+
}
|
|
689
|
+
}
|
|
690
|
+
|
|
691
|
+
console.error(` [auto-grade] Grading "${skill.name}" (session ${resolved.sessionId})...`);
|
|
692
|
+
|
|
693
|
+
const result = await gradeSession({
|
|
694
|
+
expectations: derived.expectations,
|
|
695
|
+
telemetry: resolved.telemetry,
|
|
696
|
+
sessionId: resolved.sessionId,
|
|
697
|
+
skillName: skill.name,
|
|
698
|
+
transcriptExcerpt,
|
|
699
|
+
transcriptPath: resolved.transcriptPath,
|
|
700
|
+
agent,
|
|
701
|
+
});
|
|
702
|
+
|
|
703
|
+
// Persist to SQLite — only count as graded if DB write succeeds
|
|
704
|
+
let persisted = false;
|
|
705
|
+
try {
|
|
706
|
+
persisted = writeGradingResultToDb(result);
|
|
707
|
+
} catch {
|
|
708
|
+
persisted = false;
|
|
709
|
+
}
|
|
710
|
+
if (!persisted) {
|
|
711
|
+
console.error(` [auto-grade] ${skill.name}: graded but failed to persist result`);
|
|
712
|
+
continue;
|
|
713
|
+
}
|
|
714
|
+
|
|
715
|
+
// Persist to file (fail-open, supplementary)
|
|
716
|
+
try {
|
|
717
|
+
const basePath = buildDefaultGradingOutputPath(resolved.sessionId);
|
|
718
|
+
const safeName = skill.name.replace(/[^a-zA-Z0-9_-]/g, "_");
|
|
719
|
+
const outputPath = basePath.replace(/\.json$/, `_${safeName}.json`);
|
|
720
|
+
const outputDir = dirname(outputPath);
|
|
721
|
+
mkdirSync(outputDir, { recursive: true });
|
|
722
|
+
writeFileSync(outputPath, JSON.stringify(result, null, 2), "utf-8");
|
|
723
|
+
} catch {
|
|
724
|
+
// fail-open: DB is authoritative, file is supplementary
|
|
725
|
+
}
|
|
726
|
+
|
|
727
|
+
const passRate = result.summary.pass_rate;
|
|
728
|
+
console.error(
|
|
729
|
+
` [auto-grade] ${skill.name}: ${result.summary.passed}/${result.summary.total} passed (${Math.round(passRate * 100)}%)`,
|
|
730
|
+
);
|
|
731
|
+
graded++;
|
|
732
|
+
} catch (err) {
|
|
733
|
+
const msg = err instanceof Error ? err.message : String(err);
|
|
734
|
+
console.error(
|
|
735
|
+
` [auto-grade] ${skill.name}: error — ${msg}. Retry with: selftune grade ${skill.name}`,
|
|
736
|
+
);
|
|
737
|
+
// fail-open: continue to next skill
|
|
738
|
+
}
|
|
739
|
+
}
|
|
740
|
+
|
|
741
|
+
return graded;
|
|
742
|
+
}
|
|
743
|
+
|
|
623
744
|
// ---------------------------------------------------------------------------
|
|
624
745
|
// Main orchestrator
|
|
625
746
|
// ---------------------------------------------------------------------------
|
|
@@ -665,6 +786,7 @@ export async function orchestrate(
|
|
|
665
786
|
deployed: 0,
|
|
666
787
|
watched: 0,
|
|
667
788
|
skipped: 0,
|
|
789
|
+
autoGraded: 0,
|
|
668
790
|
dryRun: options.dryRun,
|
|
669
791
|
approvalMode: options.approvalMode,
|
|
670
792
|
elapsedMs: 0,
|
|
@@ -732,7 +854,7 @@ export async function orchestrate(
|
|
|
732
854
|
const auditEntries = _readAuditEntries();
|
|
733
855
|
const doctorResult = await _doctor();
|
|
734
856
|
|
|
735
|
-
|
|
857
|
+
let statusResult = _computeStatus(
|
|
736
858
|
telemetry,
|
|
737
859
|
skillRecords,
|
|
738
860
|
queryRecords,
|
|
@@ -743,6 +865,61 @@ export async function orchestrate(
|
|
|
743
865
|
`[orchestrate] Status: ${statusResult.skills.length} skills, system=${statusResult.system.healthy ? "healthy" : "unhealthy"}`,
|
|
744
866
|
);
|
|
745
867
|
|
|
868
|
+
// -------------------------------------------------------------------------
|
|
869
|
+
// Step 2a: Auto-grade ungraded skills with sufficient data
|
|
870
|
+
// -------------------------------------------------------------------------
|
|
871
|
+
let autoGradedCount = 0;
|
|
872
|
+
const scopedSkills = options.skillFilter
|
|
873
|
+
? statusResult.skills.filter((s) => s.name === options.skillFilter)
|
|
874
|
+
: statusResult.skills;
|
|
875
|
+
const ungradedWithData = scopedSkills.filter(
|
|
876
|
+
(s) => s.status === "UNGRADED" && (s.snapshot?.skill_checks ?? 0) > 0,
|
|
877
|
+
);
|
|
878
|
+
|
|
879
|
+
if (!options.dryRun && options.maxAutoGrade > 0 && ungradedWithData.length > 0) {
|
|
880
|
+
const gradeAgent = _detectAgent();
|
|
881
|
+
if (gradeAgent) {
|
|
882
|
+
console.error(
|
|
883
|
+
`[orchestrate] Auto-grading ${Math.min(ungradedWithData.length, options.maxAutoGrade)} ungraded skill(s)...`,
|
|
884
|
+
);
|
|
885
|
+
autoGradedCount = await autoGradeTopUngraded(
|
|
886
|
+
scopedSkills,
|
|
887
|
+
options.maxAutoGrade,
|
|
888
|
+
gradeAgent,
|
|
889
|
+
{ readTelemetry: _readTelemetry, readSkillRecords: _readSkillRecords },
|
|
890
|
+
);
|
|
891
|
+
|
|
892
|
+
if (autoGradedCount > 0) {
|
|
893
|
+
// Recompute status so candidate selection sees updated grades
|
|
894
|
+
console.error(
|
|
895
|
+
`[orchestrate] Recomputing status after grading ${autoGradedCount} skill(s)...`,
|
|
896
|
+
);
|
|
897
|
+
try {
|
|
898
|
+
const freshTelemetry = _readTelemetry();
|
|
899
|
+
const freshSkillRecords = _readSkillRecords();
|
|
900
|
+
const freshQueryRecords = _readQueryRecords();
|
|
901
|
+
const freshAudit = _readAuditEntries();
|
|
902
|
+
const freshDoctor = doctorResult; // reuse — environment unchanged during grading
|
|
903
|
+
statusResult = _computeStatus(
|
|
904
|
+
freshTelemetry,
|
|
905
|
+
freshSkillRecords,
|
|
906
|
+
freshQueryRecords,
|
|
907
|
+
freshAudit,
|
|
908
|
+
freshDoctor,
|
|
909
|
+
);
|
|
910
|
+
} catch (recomputeErr) {
|
|
911
|
+
console.error(
|
|
912
|
+
`[orchestrate] Warning: failed to recompute status after grading — using pre-grade status. ${recomputeErr instanceof Error ? recomputeErr.message : String(recomputeErr)}`,
|
|
913
|
+
);
|
|
914
|
+
}
|
|
915
|
+
}
|
|
916
|
+
} else {
|
|
917
|
+
console.error(
|
|
918
|
+
"[orchestrate] No agent CLI found — skipping auto-grade. To disable, rerun with: selftune orchestrate --max-auto-grade 0",
|
|
919
|
+
);
|
|
920
|
+
}
|
|
921
|
+
}
|
|
922
|
+
|
|
746
923
|
// -------------------------------------------------------------------------
|
|
747
924
|
// Step 2b: Read pending improvement signals
|
|
748
925
|
// -------------------------------------------------------------------------
|
|
@@ -919,6 +1096,7 @@ export async function orchestrate(
|
|
|
919
1096
|
deployed: candidates.filter((c) => c.evolveResult?.deployed).length,
|
|
920
1097
|
watched: candidates.filter((c) => c.action === "watch").length,
|
|
921
1098
|
skipped: candidates.filter((c) => c.action === "skip").length,
|
|
1099
|
+
autoGraded: autoGradedCount,
|
|
922
1100
|
};
|
|
923
1101
|
|
|
924
1102
|
const result: OrchestrateResult = {
|
|
@@ -956,6 +1134,7 @@ export async function orchestrate(
|
|
|
956
1134
|
deployed: finalTotals.deployed,
|
|
957
1135
|
watched: finalTotals.watched,
|
|
958
1136
|
skipped: finalTotals.skipped,
|
|
1137
|
+
auto_graded: finalTotals.autoGraded,
|
|
959
1138
|
skill_actions: candidates.map(
|
|
960
1139
|
(c): OrchestrateRunSkillAction => ({
|
|
961
1140
|
skill: c.skill,
|
|
@@ -1023,6 +1202,7 @@ export async function cliMain(): Promise<void> {
|
|
|
1023
1202
|
"max-skills": { type: "string", default: "5" },
|
|
1024
1203
|
"recent-window": { type: "string", default: "48" },
|
|
1025
1204
|
"sync-force": { type: "boolean", default: false },
|
|
1205
|
+
"max-auto-grade": { type: "string", default: "5" },
|
|
1026
1206
|
loop: { type: "boolean", default: false },
|
|
1027
1207
|
"loop-interval": { type: "string", default: "3600" },
|
|
1028
1208
|
help: { type: "boolean", short: "h", default: false },
|
|
@@ -1033,7 +1213,7 @@ export async function cliMain(): Promise<void> {
|
|
|
1033
1213
|
if (values.help) {
|
|
1034
1214
|
console.log(`selftune orchestrate — Autonomous core loop
|
|
1035
1215
|
|
|
1036
|
-
Runs the full improvement cycle: sync → status → evolve → watch.
|
|
1216
|
+
Runs the full improvement cycle: sync → status → auto-grade → evolve → watch.
|
|
1037
1217
|
|
|
1038
1218
|
Usage:
|
|
1039
1219
|
selftune orchestrate [options]
|
|
@@ -1046,6 +1226,7 @@ Options:
|
|
|
1046
1226
|
--max-skills <n> Cap skills processed per run (default: 5)
|
|
1047
1227
|
--recent-window <hrs> Hours to look back for watch targets (default: 48)
|
|
1048
1228
|
--sync-force Force full rescan during sync
|
|
1229
|
+
--max-auto-grade <n> Max ungraded skills to auto-grade per run (default: 5, 0 to disable)
|
|
1049
1230
|
--loop Run in continuous loop mode (never stops)
|
|
1050
1231
|
--loop-interval <s> Seconds between iterations (default: 3600, min: 60)
|
|
1051
1232
|
-h, --help Show this help message
|
|
@@ -1067,23 +1248,45 @@ Examples:
|
|
|
1067
1248
|
process.exit(0);
|
|
1068
1249
|
}
|
|
1069
1250
|
|
|
1070
|
-
const
|
|
1071
|
-
if (
|
|
1072
|
-
|
|
1073
|
-
|
|
1251
|
+
const maxSkillsRaw = values["max-skills"] ?? "5";
|
|
1252
|
+
if (!/^\d+$/.test(maxSkillsRaw) || Number(maxSkillsRaw) < 1) {
|
|
1253
|
+
throw new CLIError(
|
|
1254
|
+
"--max-skills must be a positive integer",
|
|
1255
|
+
"INVALID_FLAG",
|
|
1256
|
+
"selftune orchestrate --max-skills 5",
|
|
1257
|
+
);
|
|
1074
1258
|
}
|
|
1075
|
-
|
|
1076
|
-
|
|
1077
|
-
|
|
1078
|
-
|
|
1079
|
-
|
|
1259
|
+
const maxSkills = Number(maxSkillsRaw);
|
|
1260
|
+
|
|
1261
|
+
const recentWindowRaw = values["recent-window"] ?? "48";
|
|
1262
|
+
if (!/^\d+$/.test(recentWindowRaw) || Number(recentWindowRaw) < 1) {
|
|
1263
|
+
throw new CLIError(
|
|
1264
|
+
"--recent-window must be a positive integer",
|
|
1265
|
+
"INVALID_FLAG",
|
|
1266
|
+
"selftune orchestrate --recent-window 48",
|
|
1267
|
+
);
|
|
1080
1268
|
}
|
|
1081
|
-
|
|
1082
|
-
|
|
1083
|
-
|
|
1084
|
-
|
|
1085
|
-
|
|
1269
|
+
const recentWindow = Number(recentWindowRaw);
|
|
1270
|
+
|
|
1271
|
+
const maxAutoGradeRaw = values["max-auto-grade"] ?? "5";
|
|
1272
|
+
if (!/^\d+$/.test(maxAutoGradeRaw)) {
|
|
1273
|
+
throw new CLIError(
|
|
1274
|
+
"--max-auto-grade must be a non-negative integer",
|
|
1275
|
+
"INVALID_FLAG",
|
|
1276
|
+
"selftune orchestrate --max-auto-grade 5",
|
|
1277
|
+
);
|
|
1278
|
+
}
|
|
1279
|
+
const maxAutoGrade = Number(maxAutoGradeRaw);
|
|
1280
|
+
|
|
1281
|
+
const loopIntervalRaw = values["loop-interval"] ?? "3600";
|
|
1282
|
+
if (!/^\d+$/.test(loopIntervalRaw) || (values.loop && Number(loopIntervalRaw) < 60)) {
|
|
1283
|
+
throw new CLIError(
|
|
1284
|
+
"--loop-interval must be an integer >= 60 (seconds)",
|
|
1285
|
+
"INVALID_FLAG",
|
|
1286
|
+
"selftune orchestrate --loop --loop-interval 3600",
|
|
1287
|
+
);
|
|
1086
1288
|
}
|
|
1289
|
+
const loopInterval = Number(loopIntervalRaw);
|
|
1087
1290
|
|
|
1088
1291
|
const autoApprove = values["auto-approve"] ?? false;
|
|
1089
1292
|
if (autoApprove) {
|
|
@@ -1132,6 +1335,7 @@ Examples:
|
|
|
1132
1335
|
maxSkills,
|
|
1133
1336
|
recentWindowHours: recentWindow,
|
|
1134
1337
|
syncForce: values["sync-force"] ?? false,
|
|
1338
|
+
maxAutoGrade,
|
|
1135
1339
|
});
|
|
1136
1340
|
|
|
1137
1341
|
// JSON output: include per-skill decisions for machine consumption
|
|
@@ -1188,9 +1392,5 @@ Examples:
|
|
|
1188
1392
|
}
|
|
1189
1393
|
|
|
1190
1394
|
if (import.meta.main) {
|
|
1191
|
-
cliMain().catch(
|
|
1192
|
-
const message = err instanceof Error ? err.message : String(err);
|
|
1193
|
-
console.error(`[FATAL] ${message}`);
|
|
1194
|
-
process.exit(1);
|
|
1195
|
-
});
|
|
1395
|
+
cliMain().catch(handleCLIError);
|
|
1196
1396
|
}
|
|
@@ -8,6 +8,7 @@
|
|
|
8
8
|
|
|
9
9
|
import type { Database } from "bun:sqlite";
|
|
10
10
|
|
|
11
|
+
import { scoreDescription } from "../evolution/description-quality.js";
|
|
11
12
|
import { getPendingProposals, getSkillReportPayload, safeParseJson } from "../localdb/queries.js";
|
|
12
13
|
|
|
13
14
|
export function handleSkillReport(db: Database, skillName: string): Response {
|
|
@@ -203,6 +204,21 @@ export function handleSkillReport(db: Database, skillName: string): Response {
|
|
|
203
204
|
completion_status: string | null;
|
|
204
205
|
}>;
|
|
205
206
|
|
|
207
|
+
// 8. Description quality score — computed from latest evolution evidence
|
|
208
|
+
const latestEvidence = db
|
|
209
|
+
.query(
|
|
210
|
+
`SELECT proposed_text, original_text FROM evolution_evidence
|
|
211
|
+
WHERE skill_name = ? AND (proposed_text IS NOT NULL OR original_text IS NOT NULL)
|
|
212
|
+
ORDER BY timestamp DESC LIMIT 1`,
|
|
213
|
+
)
|
|
214
|
+
.get(skillName) as { proposed_text: string | null; original_text: string | null } | null;
|
|
215
|
+
|
|
216
|
+
// Use the most recent description: deployed proposed_text, or fallback to original_text
|
|
217
|
+
const currentDescriptionText = latestEvidence?.proposed_text ?? latestEvidence?.original_text;
|
|
218
|
+
const descriptionQuality = currentDescriptionText
|
|
219
|
+
? scoreDescription(currentDescriptionText, skillName)
|
|
220
|
+
: null;
|
|
221
|
+
|
|
206
222
|
return Response.json({
|
|
207
223
|
...report,
|
|
208
224
|
evolution: evolutionWithSnapshot,
|
|
@@ -227,5 +243,6 @@ export function handleSkillReport(db: Database, skillName: string): Response {
|
|
|
227
243
|
is_actionable: p.is_actionable === 1,
|
|
228
244
|
})),
|
|
229
245
|
session_metadata: sessionMeta,
|
|
246
|
+
description_quality: descriptionQuality,
|
|
230
247
|
});
|
|
231
248
|
}
|