selftune 0.2.0 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/agents/diagnosis-analyst.md +20 -10
- package/.claude/agents/evolution-reviewer.md +14 -1
- package/.claude/agents/integration-guide.md +18 -6
- package/.claude/agents/pattern-analyst.md +18 -5
- package/CHANGELOG.md +12 -4
- package/README.md +43 -35
- package/apps/local-dashboard/dist/assets/geist-cyrillic-wght-normal-CHSlOQsW.woff2 +0 -0
- package/apps/local-dashboard/dist/assets/geist-latin-ext-wght-normal-DMtmJ5ZE.woff2 +0 -0
- package/apps/local-dashboard/dist/assets/geist-latin-wght-normal-Dm3htQBi.woff2 +0 -0
- package/apps/local-dashboard/dist/assets/index-C4EOTFZ2.js +15 -0
- package/apps/local-dashboard/dist/assets/index-bl-Webyd.css +1 -0
- package/apps/local-dashboard/dist/assets/vendor-react-U7zYD9Rg.js +60 -0
- package/apps/local-dashboard/dist/assets/vendor-table-B7VF2Ipl.js +26 -0
- package/apps/local-dashboard/dist/assets/vendor-ui-D7_zX_qy.js +346 -0
- package/apps/local-dashboard/dist/favicon.png +0 -0
- package/apps/local-dashboard/dist/index.html +17 -0
- package/apps/local-dashboard/dist/logo.png +0 -0
- package/apps/local-dashboard/dist/logo.svg +9 -0
- package/cli/selftune/badge/badge-data.ts +1 -1
- package/cli/selftune/badge/badge.ts +4 -8
- package/cli/selftune/canonical-export.ts +183 -0
- package/cli/selftune/constants.ts +28 -0
- package/cli/selftune/contribute/contribute.ts +1 -1
- package/cli/selftune/cron/setup.ts +17 -17
- package/cli/selftune/dashboard-contract.ts +202 -0
- package/cli/selftune/dashboard-server.ts +653 -186
- package/cli/selftune/dashboard.ts +41 -176
- package/cli/selftune/eval/baseline.ts +5 -4
- package/cli/selftune/eval/composability-v2.ts +273 -0
- package/cli/selftune/eval/hooks-to-evals.ts +34 -15
- package/cli/selftune/eval/unit-test-cli.ts +1 -1
- package/cli/selftune/evolution/evidence.ts +26 -0
- package/cli/selftune/evolution/evolve-body.ts +105 -11
- package/cli/selftune/evolution/evolve.ts +371 -25
- package/cli/selftune/evolution/extract-patterns.ts +87 -29
- package/cli/selftune/evolution/rollback.ts +2 -2
- package/cli/selftune/grading/auto-grade.ts +200 -0
- package/cli/selftune/grading/grade-session.ts +448 -97
- package/cli/selftune/grading/results.ts +42 -0
- package/cli/selftune/hooks/prompt-log.ts +172 -2
- package/cli/selftune/hooks/session-stop.ts +123 -3
- package/cli/selftune/hooks/skill-eval.ts +119 -3
- package/cli/selftune/index.ts +395 -116
- package/cli/selftune/ingestors/claude-replay.ts +140 -114
- package/cli/selftune/ingestors/codex-rollout.ts +345 -46
- package/cli/selftune/ingestors/codex-wrapper.ts +207 -39
- package/cli/selftune/ingestors/openclaw-ingest.ts +141 -8
- package/cli/selftune/ingestors/opencode-ingest.ts +193 -17
- package/cli/selftune/init.ts +227 -14
- package/cli/selftune/last.ts +14 -5
- package/cli/selftune/localdb/db.ts +63 -0
- package/cli/selftune/localdb/materialize.ts +428 -0
- package/cli/selftune/localdb/queries.ts +376 -0
- package/cli/selftune/localdb/schema.ts +204 -0
- package/cli/selftune/monitoring/watch.ts +66 -15
- package/cli/selftune/normalization.ts +682 -0
- package/cli/selftune/observability.ts +19 -44
- package/cli/selftune/orchestrate.ts +1073 -0
- package/cli/selftune/quickstart.ts +203 -0
- package/cli/selftune/repair/skill-usage.ts +576 -0
- package/cli/selftune/schedule.ts +561 -0
- package/cli/selftune/status.ts +48 -26
- package/cli/selftune/sync.ts +627 -0
- package/cli/selftune/types.ts +148 -0
- package/cli/selftune/utils/canonical-log.ts +45 -0
- package/cli/selftune/utils/hooks.ts +41 -0
- package/cli/selftune/utils/html.ts +27 -0
- package/cli/selftune/utils/llm-call.ts +78 -20
- package/cli/selftune/utils/math.ts +10 -0
- package/cli/selftune/utils/query-filter.ts +139 -0
- package/cli/selftune/utils/skill-discovery.ts +340 -0
- package/cli/selftune/utils/skill-log.ts +68 -0
- package/cli/selftune/utils/skill-usage-confidence.ts +18 -0
- package/cli/selftune/utils/transcript.ts +272 -26
- package/cli/selftune/workflows/discover.ts +254 -0
- package/cli/selftune/workflows/skill-md-writer.ts +288 -0
- package/cli/selftune/workflows/workflows.ts +188 -0
- package/package.json +21 -8
- package/packages/telemetry-contract/README.md +11 -0
- package/packages/telemetry-contract/fixtures/golden.json +87 -0
- package/packages/telemetry-contract/fixtures/golden.test.ts +42 -0
- package/packages/telemetry-contract/index.ts +1 -0
- package/packages/telemetry-contract/package.json +19 -0
- package/packages/telemetry-contract/src/index.ts +2 -0
- package/packages/telemetry-contract/src/types.ts +163 -0
- package/packages/telemetry-contract/src/validators.ts +109 -0
- package/skill/SKILL.md +84 -53
- package/skill/Workflows/AutoActivation.md +17 -16
- package/skill/Workflows/Badge.md +6 -0
- package/skill/Workflows/Baseline.md +46 -23
- package/skill/Workflows/Composability.md +12 -5
- package/skill/Workflows/Contribute.md +17 -14
- package/skill/Workflows/Cron.md +56 -79
- package/skill/Workflows/Dashboard.md +45 -34
- package/skill/Workflows/Doctor.md +30 -17
- package/skill/Workflows/Evals.md +64 -40
- package/skill/Workflows/EvolutionMemory.md +2 -0
- package/skill/Workflows/Evolve.md +102 -47
- package/skill/Workflows/EvolveBody.md +6 -6
- package/skill/Workflows/Grade.md +36 -31
- package/skill/Workflows/ImportSkillsBench.md +11 -5
- package/skill/Workflows/Ingest.md +43 -36
- package/skill/Workflows/Initialize.md +44 -30
- package/skill/Workflows/Orchestrate.md +139 -0
- package/skill/Workflows/Replay.md +39 -18
- package/skill/Workflows/Rollback.md +3 -3
- package/skill/Workflows/Schedule.md +61 -0
- package/skill/Workflows/Sync.md +88 -0
- package/skill/Workflows/UnitTest.md +34 -22
- package/skill/Workflows/Watch.md +14 -4
- package/skill/Workflows/Workflows.md +129 -0
- package/skill/assets/activation-rules-default.json +26 -0
- package/skill/assets/multi-skill-settings.json +63 -0
- package/skill/assets/single-skill-settings.json +57 -0
- package/skill/references/invocation-taxonomy.md +2 -2
- package/skill/references/logs.md +164 -2
- package/skill/references/setup-patterns.md +65 -0
- package/skill/references/version-history.md +40 -0
- package/skill/settings_snippet.json +1 -1
- package/templates/multi-skill-settings.json +7 -7
- package/templates/single-skill-settings.json +6 -6
- package/dashboard/index.html +0 -1680
|
@@ -5,20 +5,26 @@
|
|
|
5
5
|
* Rubric-based grader for Claude Code skill sessions.
|
|
6
6
|
* Migrated from grade_session.py.
|
|
7
7
|
*
|
|
8
|
-
* Grades via installed agent CLI
|
|
8
|
+
* Grades via an installed agent CLI selected from AGENT_CANDIDATES.
|
|
9
9
|
*/
|
|
10
10
|
|
|
11
|
-
import { mkdirSync, readFileSync, writeFileSync } from "node:fs";
|
|
12
|
-
import { dirname } from "node:path";
|
|
11
|
+
import { existsSync, mkdirSync, readFileSync, writeFileSync } from "node:fs";
|
|
12
|
+
import { basename, dirname, join } from "node:path";
|
|
13
13
|
import { parseArgs } from "node:util";
|
|
14
14
|
|
|
15
|
-
import {
|
|
15
|
+
import {
|
|
16
|
+
AGENT_CANDIDATES,
|
|
17
|
+
CLAUDE_CODE_PROJECTS_DIR,
|
|
18
|
+
SELFTUNE_CONFIG_DIR,
|
|
19
|
+
TELEMETRY_LOG,
|
|
20
|
+
} from "../constants.js";
|
|
16
21
|
import type {
|
|
17
22
|
ExecutionMetrics,
|
|
18
23
|
GraderOutput,
|
|
19
24
|
GradingExpectation,
|
|
20
25
|
GradingResult,
|
|
21
26
|
SessionTelemetryRecord,
|
|
27
|
+
SkillUsageRecord,
|
|
22
28
|
} from "../types.js";
|
|
23
29
|
import { readJsonl } from "../utils/jsonl.js";
|
|
24
30
|
import {
|
|
@@ -26,7 +32,12 @@ import {
|
|
|
26
32
|
stripMarkdownFences as _stripMarkdownFences,
|
|
27
33
|
callViaAgent,
|
|
28
34
|
} from "../utils/llm-call.js";
|
|
29
|
-
import {
|
|
35
|
+
import { readEffectiveSkillUsageRecords } from "../utils/skill-log.js";
|
|
36
|
+
import {
|
|
37
|
+
buildTelemetryFromTranscript,
|
|
38
|
+
findTranscriptPathForSession,
|
|
39
|
+
readExcerpt,
|
|
40
|
+
} from "../utils/transcript.js";
|
|
30
41
|
import { type PreGateContext, runPreGates } from "./pre-gates.js";
|
|
31
42
|
|
|
32
43
|
// Re-export for backward compatibility
|
|
@@ -99,12 +110,148 @@ export function latestSessionForSkill(
|
|
|
99
110
|
telemetry: SessionTelemetryRecord[],
|
|
100
111
|
skillName: string,
|
|
101
112
|
): SessionTelemetryRecord | null {
|
|
113
|
+
// First pass: prefer sessions with actual Skill tool invocations (skills_invoked)
|
|
114
|
+
for (let i = telemetry.length - 1; i >= 0; i--) {
|
|
115
|
+
if (telemetry[i].skills_invoked?.includes(skillName)) return telemetry[i];
|
|
116
|
+
}
|
|
117
|
+
// Fallback: sessions where SKILL.md was read (skills_triggered)
|
|
102
118
|
for (let i = telemetry.length - 1; i >= 0; i--) {
|
|
103
119
|
if (telemetry[i].skills_triggered?.includes(skillName)) return telemetry[i];
|
|
104
120
|
}
|
|
105
121
|
return null;
|
|
106
122
|
}
|
|
107
123
|
|
|
124
|
+
export function latestSkillUsageForSkill(
|
|
125
|
+
skillUsage: SkillUsageRecord[],
|
|
126
|
+
skillName: string,
|
|
127
|
+
): SkillUsageRecord | null {
|
|
128
|
+
for (let i = skillUsage.length - 1; i >= 0; i--) {
|
|
129
|
+
const record = skillUsage[i];
|
|
130
|
+
if (record.skill_name === skillName && record.triggered) return record;
|
|
131
|
+
}
|
|
132
|
+
return null;
|
|
133
|
+
}
|
|
134
|
+
|
|
135
|
+
export interface ResolvedSessionContext {
|
|
136
|
+
telemetry: SessionTelemetryRecord;
|
|
137
|
+
sessionId: string;
|
|
138
|
+
transcriptPath: string;
|
|
139
|
+
source: "telemetry" | "transcript_fallback" | "skill_usage_fallback";
|
|
140
|
+
}
|
|
141
|
+
|
|
142
|
+
function buildSkillUsageFallbackTelemetry(record: SkillUsageRecord): SessionTelemetryRecord {
|
|
143
|
+
return {
|
|
144
|
+
timestamp: record.timestamp,
|
|
145
|
+
session_id: record.session_id,
|
|
146
|
+
cwd: "",
|
|
147
|
+
transcript_path: "",
|
|
148
|
+
tool_calls: {},
|
|
149
|
+
total_tool_calls: 0,
|
|
150
|
+
bash_commands: [],
|
|
151
|
+
skills_triggered: [record.skill_name],
|
|
152
|
+
skills_invoked: [record.skill_name],
|
|
153
|
+
assistant_turns: 0,
|
|
154
|
+
errors_encountered: 0,
|
|
155
|
+
transcript_chars: 0,
|
|
156
|
+
last_user_query: record.query,
|
|
157
|
+
source: record.source ?? "skill_usage_fallback",
|
|
158
|
+
};
|
|
159
|
+
}
|
|
160
|
+
|
|
161
|
+
export function resolveSessionById(
|
|
162
|
+
telemetry: SessionTelemetryRecord[],
|
|
163
|
+
sessionId: string,
|
|
164
|
+
projectsDir: string = CLAUDE_CODE_PROJECTS_DIR,
|
|
165
|
+
): ResolvedSessionContext | null {
|
|
166
|
+
const direct = findSession(telemetry, sessionId);
|
|
167
|
+
if (direct) {
|
|
168
|
+
return {
|
|
169
|
+
telemetry: direct,
|
|
170
|
+
sessionId: direct.session_id,
|
|
171
|
+
transcriptPath: direct.transcript_path ?? "",
|
|
172
|
+
source: "telemetry",
|
|
173
|
+
};
|
|
174
|
+
}
|
|
175
|
+
|
|
176
|
+
const transcriptPath = findTranscriptPathForSession(sessionId, projectsDir);
|
|
177
|
+
if (!transcriptPath) return null;
|
|
178
|
+
|
|
179
|
+
const rebuilt = buildTelemetryFromTranscript(sessionId, transcriptPath);
|
|
180
|
+
if (!rebuilt) return null;
|
|
181
|
+
|
|
182
|
+
return {
|
|
183
|
+
telemetry: rebuilt,
|
|
184
|
+
sessionId,
|
|
185
|
+
transcriptPath,
|
|
186
|
+
source: "transcript_fallback",
|
|
187
|
+
};
|
|
188
|
+
}
|
|
189
|
+
|
|
190
|
+
export function resolveLatestSessionForSkill(
|
|
191
|
+
telemetry: SessionTelemetryRecord[],
|
|
192
|
+
skillUsage: SkillUsageRecord[],
|
|
193
|
+
skillName: string,
|
|
194
|
+
projectsDir: string = CLAUDE_CODE_PROJECTS_DIR,
|
|
195
|
+
): ResolvedSessionContext | null {
|
|
196
|
+
const direct = latestSessionForSkill(telemetry, skillName);
|
|
197
|
+
if (direct) {
|
|
198
|
+
return {
|
|
199
|
+
telemetry: direct,
|
|
200
|
+
sessionId: direct.session_id,
|
|
201
|
+
transcriptPath: direct.transcript_path ?? "",
|
|
202
|
+
source: "telemetry",
|
|
203
|
+
};
|
|
204
|
+
}
|
|
205
|
+
|
|
206
|
+
const usage = latestSkillUsageForSkill(skillUsage, skillName);
|
|
207
|
+
if (!usage) return null;
|
|
208
|
+
|
|
209
|
+
const transcriptPath = findTranscriptPathForSession(usage.session_id, projectsDir);
|
|
210
|
+
if (!transcriptPath) {
|
|
211
|
+
const fallback = buildSkillUsageFallbackTelemetry(usage);
|
|
212
|
+
return {
|
|
213
|
+
telemetry: fallback,
|
|
214
|
+
sessionId: fallback.session_id,
|
|
215
|
+
transcriptPath: fallback.transcript_path,
|
|
216
|
+
source: "skill_usage_fallback",
|
|
217
|
+
};
|
|
218
|
+
}
|
|
219
|
+
|
|
220
|
+
const rebuilt = buildTelemetryFromTranscript(usage.session_id, transcriptPath);
|
|
221
|
+
if (!rebuilt) {
|
|
222
|
+
const fallback = buildSkillUsageFallbackTelemetry(usage);
|
|
223
|
+
fallback.transcript_path = transcriptPath;
|
|
224
|
+
return {
|
|
225
|
+
telemetry: fallback,
|
|
226
|
+
sessionId: fallback.session_id,
|
|
227
|
+
transcriptPath,
|
|
228
|
+
source: "skill_usage_fallback",
|
|
229
|
+
};
|
|
230
|
+
}
|
|
231
|
+
|
|
232
|
+
if (!rebuilt.skills_triggered.includes(skillName)) {
|
|
233
|
+
rebuilt.skills_triggered = [...rebuilt.skills_triggered, skillName];
|
|
234
|
+
}
|
|
235
|
+
if (rebuilt.skills_invoked && !rebuilt.skills_invoked.includes(skillName)) {
|
|
236
|
+
rebuilt.skills_invoked = [...rebuilt.skills_invoked, skillName];
|
|
237
|
+
}
|
|
238
|
+
if (!rebuilt.last_user_query) {
|
|
239
|
+
rebuilt.last_user_query = usage.query;
|
|
240
|
+
}
|
|
241
|
+
|
|
242
|
+
return {
|
|
243
|
+
telemetry: rebuilt,
|
|
244
|
+
sessionId: rebuilt.session_id,
|
|
245
|
+
transcriptPath,
|
|
246
|
+
source: "transcript_fallback",
|
|
247
|
+
};
|
|
248
|
+
}
|
|
249
|
+
|
|
250
|
+
export function buildDefaultGradingOutputPath(sessionId: string): string {
|
|
251
|
+
const safeSessionId = sessionId.replace(/[^a-zA-Z0-9_-]/g, "_");
|
|
252
|
+
return join(SELFTUNE_CONFIG_DIR, "grading", `result-${safeSessionId}.json`);
|
|
253
|
+
}
|
|
254
|
+
|
|
108
255
|
export function loadExpectationsFromEvalsJson(evalsJsonPath: string, evalId: number): string[] {
|
|
109
256
|
let data: unknown;
|
|
110
257
|
try {
|
|
@@ -157,6 +304,107 @@ export function loadExpectationsFromEvalsJson(evalsJsonPath: string, evalId: num
|
|
|
157
304
|
throw new Error(`Eval ID ${evalId} not found in ${evalsJsonPath}`);
|
|
158
305
|
}
|
|
159
306
|
|
|
307
|
+
// ---------------------------------------------------------------------------
|
|
308
|
+
// Auto-derive expectations from SKILL.md
|
|
309
|
+
// ---------------------------------------------------------------------------
|
|
310
|
+
|
|
311
|
+
export interface DerivedExpectations {
|
|
312
|
+
expectations: string[];
|
|
313
|
+
derived: boolean;
|
|
314
|
+
source: string;
|
|
315
|
+
}
|
|
316
|
+
|
|
317
|
+
const GENERIC_EXPECTATIONS: string[] = [
|
|
318
|
+
"The skill was triggered during the session",
|
|
319
|
+
"The task was completed successfully without critical errors",
|
|
320
|
+
"No unhandled errors were encountered",
|
|
321
|
+
];
|
|
322
|
+
|
|
323
|
+
/**
|
|
324
|
+
* Derive grading expectations from a skill's SKILL.md file.
|
|
325
|
+
*
|
|
326
|
+
* Resolution order for SKILL.md path:
|
|
327
|
+
* 1. Explicit `skillPath` argument
|
|
328
|
+
* 2. Lookup from skill_usage_log.jsonl records
|
|
329
|
+
* 3. Falls back to generic expectations if not found
|
|
330
|
+
*/
|
|
331
|
+
export function deriveExpectationsFromSkill(
|
|
332
|
+
skillName: string,
|
|
333
|
+
skillPath?: string,
|
|
334
|
+
): DerivedExpectations {
|
|
335
|
+
// Resolve the SKILL.md path
|
|
336
|
+
let resolvedPath = skillPath;
|
|
337
|
+
|
|
338
|
+
if (!resolvedPath) {
|
|
339
|
+
// Try to find from skill_usage_log
|
|
340
|
+
try {
|
|
341
|
+
const usageRecords = readEffectiveSkillUsageRecords();
|
|
342
|
+
for (let i = usageRecords.length - 1; i >= 0; i--) {
|
|
343
|
+
if (usageRecords[i].skill_name === skillName && usageRecords[i].skill_path) {
|
|
344
|
+
resolvedPath = usageRecords[i].skill_path;
|
|
345
|
+
break;
|
|
346
|
+
}
|
|
347
|
+
}
|
|
348
|
+
} catch {
|
|
349
|
+
// skill_usage_log not available
|
|
350
|
+
}
|
|
351
|
+
}
|
|
352
|
+
|
|
353
|
+
if (!resolvedPath || !existsSync(resolvedPath)) {
|
|
354
|
+
return {
|
|
355
|
+
expectations: GENERIC_EXPECTATIONS,
|
|
356
|
+
derived: false,
|
|
357
|
+
source: resolvedPath ? `SKILL.md not found at ${resolvedPath}` : "no SKILL.md path found",
|
|
358
|
+
};
|
|
359
|
+
}
|
|
360
|
+
|
|
361
|
+
// Read and parse SKILL.md
|
|
362
|
+
let content: string;
|
|
363
|
+
try {
|
|
364
|
+
content = readFileSync(resolvedPath, "utf-8");
|
|
365
|
+
} catch {
|
|
366
|
+
return {
|
|
367
|
+
expectations: GENERIC_EXPECTATIONS,
|
|
368
|
+
derived: false,
|
|
369
|
+
source: `failed to read ${resolvedPath}`,
|
|
370
|
+
};
|
|
371
|
+
}
|
|
372
|
+
|
|
373
|
+
const expectations: string[] = [`The "${skillName}" skill was triggered during the session`];
|
|
374
|
+
|
|
375
|
+
// Extract description from first paragraph after title
|
|
376
|
+
const descMatch = content.match(/^#\s+.+\n+([^\n#][^\n]*)/m);
|
|
377
|
+
if (descMatch) {
|
|
378
|
+
const desc = descMatch[1].trim();
|
|
379
|
+
if (desc.length > 10) {
|
|
380
|
+
expectations.push(`The skill fulfilled its purpose: ${desc.slice(0, 120)}`);
|
|
381
|
+
}
|
|
382
|
+
}
|
|
383
|
+
|
|
384
|
+
// Extract "When to Use" section content
|
|
385
|
+
const whenMatch = content.match(/##\s*When\s+to\s+Use\b[^\n]*\n([\s\S]*?)(?=\n##\s|\n---|$)/i);
|
|
386
|
+
if (whenMatch) {
|
|
387
|
+
const lines = whenMatch[1]
|
|
388
|
+
.split("\n")
|
|
389
|
+
.map((l) => l.replace(/^[-*]\s*/, "").trim())
|
|
390
|
+
.filter((l) => l.length > 5);
|
|
391
|
+
if (lines.length > 0) {
|
|
392
|
+
expectations.push(`The session context matched a "When to Use" trigger for ${skillName}`);
|
|
393
|
+
}
|
|
394
|
+
}
|
|
395
|
+
|
|
396
|
+
// Add standard quality expectations
|
|
397
|
+
expectations.push("The task was completed successfully without critical errors");
|
|
398
|
+
expectations.push("No unhandled errors were encountered");
|
|
399
|
+
|
|
400
|
+
// Cap at 5 expectations
|
|
401
|
+
return {
|
|
402
|
+
expectations: expectations.slice(0, 5),
|
|
403
|
+
derived: true,
|
|
404
|
+
source: resolvedPath,
|
|
405
|
+
};
|
|
406
|
+
}
|
|
407
|
+
|
|
160
408
|
// ---------------------------------------------------------------------------
|
|
161
409
|
// Execution metrics
|
|
162
410
|
// ---------------------------------------------------------------------------
|
|
@@ -271,24 +519,26 @@ export async function gradeViaAgent(prompt: string, agent: string): Promise<Grad
|
|
|
271
519
|
}
|
|
272
520
|
|
|
273
521
|
// ---------------------------------------------------------------------------
|
|
274
|
-
//
|
|
522
|
+
// Shared grading flow
|
|
275
523
|
// ---------------------------------------------------------------------------
|
|
276
524
|
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
telemetry: SessionTelemetryRecord,
|
|
280
|
-
sessionId: string,
|
|
281
|
-
skillName: string,
|
|
282
|
-
transcriptPath: string,
|
|
283
|
-
): GradingResult {
|
|
284
|
-
// Default missing scores on expectations
|
|
285
|
-
const expectations = (graderOutput?.expectations ?? []).map((e) => ({
|
|
525
|
+
function normalizeExpectations(expectations: GradingExpectation[]): GradingExpectation[] {
|
|
526
|
+
return expectations.map((e) => ({
|
|
286
527
|
...e,
|
|
287
528
|
score: e.score ?? (e.passed ? 1.0 : 0.0),
|
|
288
529
|
source: e.source ?? ("llm" as const),
|
|
289
530
|
}));
|
|
531
|
+
}
|
|
290
532
|
|
|
291
|
-
|
|
533
|
+
function assembleResultFromExpectations(
|
|
534
|
+
expectations: GradingExpectation[],
|
|
535
|
+
telemetry: SessionTelemetryRecord,
|
|
536
|
+
sessionId: string,
|
|
537
|
+
skillName: string,
|
|
538
|
+
transcriptPath: string,
|
|
539
|
+
): GradingResult {
|
|
540
|
+
const passedCount = expectations.filter((e) => e.passed).length;
|
|
541
|
+
const totalCount = expectations.length;
|
|
292
542
|
const graduated = buildGraduatedSummary(expectations);
|
|
293
543
|
|
|
294
544
|
return {
|
|
@@ -298,11 +548,116 @@ export function assembleResult(
|
|
|
298
548
|
graded_at: new Date().toISOString(),
|
|
299
549
|
expectations,
|
|
300
550
|
summary: {
|
|
301
|
-
|
|
551
|
+
passed: passedCount,
|
|
552
|
+
failed: totalCount - passedCount,
|
|
553
|
+
total: totalCount,
|
|
554
|
+
pass_rate: totalCount > 0 ? passedCount / totalCount : 0,
|
|
302
555
|
mean_score: graduated.mean_score,
|
|
303
556
|
score_std_dev: graduated.score_std_dev,
|
|
304
557
|
},
|
|
305
558
|
execution_metrics: buildExecutionMetrics(telemetry ?? ({} as SessionTelemetryRecord)),
|
|
559
|
+
claims: [],
|
|
560
|
+
eval_feedback: { suggestions: [], overall: "" },
|
|
561
|
+
};
|
|
562
|
+
}
|
|
563
|
+
|
|
564
|
+
export interface GradeSessionParams {
|
|
565
|
+
expectations: string[];
|
|
566
|
+
telemetry: SessionTelemetryRecord;
|
|
567
|
+
sessionId: string;
|
|
568
|
+
skillName: string;
|
|
569
|
+
transcriptExcerpt: string;
|
|
570
|
+
transcriptPath: string;
|
|
571
|
+
agent: string;
|
|
572
|
+
gradeViaAgentFn?: (prompt: string, agent: string) => Promise<GraderOutput>;
|
|
573
|
+
}
|
|
574
|
+
|
|
575
|
+
export async function gradeSession({
|
|
576
|
+
expectations,
|
|
577
|
+
telemetry,
|
|
578
|
+
sessionId,
|
|
579
|
+
skillName,
|
|
580
|
+
transcriptExcerpt,
|
|
581
|
+
transcriptPath,
|
|
582
|
+
agent,
|
|
583
|
+
gradeViaAgentFn = gradeViaAgent,
|
|
584
|
+
}: GradeSessionParams): Promise<GradingResult> {
|
|
585
|
+
const preGateCtx: PreGateContext = {
|
|
586
|
+
telemetry,
|
|
587
|
+
skillName,
|
|
588
|
+
transcriptExcerpt,
|
|
589
|
+
};
|
|
590
|
+
const preGateResult = runPreGates(expectations, preGateCtx);
|
|
591
|
+
|
|
592
|
+
let allExpectations: GradingExpectation[];
|
|
593
|
+
|
|
594
|
+
if (preGateResult.remaining.length === 0) {
|
|
595
|
+
console.error(
|
|
596
|
+
`[INFO] All ${expectations.length} expectations resolved by pre-gates, skipping LLM`,
|
|
597
|
+
);
|
|
598
|
+
allExpectations = preGateResult.resolved;
|
|
599
|
+
} else {
|
|
600
|
+
console.error(
|
|
601
|
+
`[INFO] Pre-gates resolved ${preGateResult.resolved.length}/${expectations.length} expectations`,
|
|
602
|
+
);
|
|
603
|
+
const prompt = buildGradingPrompt(
|
|
604
|
+
preGateResult.remaining,
|
|
605
|
+
telemetry,
|
|
606
|
+
transcriptExcerpt,
|
|
607
|
+
skillName,
|
|
608
|
+
);
|
|
609
|
+
console.error(
|
|
610
|
+
`Grading ${preGateResult.remaining.length} expectations for skill '${skillName}'...`,
|
|
611
|
+
);
|
|
612
|
+
|
|
613
|
+
let graderOutput: GraderOutput;
|
|
614
|
+
try {
|
|
615
|
+
graderOutput = await gradeViaAgentFn(prompt, agent);
|
|
616
|
+
} catch (err) {
|
|
617
|
+
throw new Error(`Grading failed: ${err instanceof Error ? err.message : String(err)}`, {
|
|
618
|
+
cause: err,
|
|
619
|
+
});
|
|
620
|
+
}
|
|
621
|
+
|
|
622
|
+
const llmExpectations = normalizeExpectations(graderOutput.expectations ?? []);
|
|
623
|
+
if (llmExpectations.length !== preGateResult.remaining.length) {
|
|
624
|
+
throw new Error(
|
|
625
|
+
`Grader returned ${llmExpectations.length} expectations for ${preGateResult.remaining.length} unresolved expectations`,
|
|
626
|
+
);
|
|
627
|
+
}
|
|
628
|
+
|
|
629
|
+
allExpectations = [...preGateResult.resolved, ...llmExpectations];
|
|
630
|
+
}
|
|
631
|
+
|
|
632
|
+
return assembleResultFromExpectations(
|
|
633
|
+
allExpectations,
|
|
634
|
+
telemetry,
|
|
635
|
+
sessionId,
|
|
636
|
+
skillName,
|
|
637
|
+
transcriptPath,
|
|
638
|
+
);
|
|
639
|
+
}
|
|
640
|
+
|
|
641
|
+
// ---------------------------------------------------------------------------
|
|
642
|
+
// Result assembly
|
|
643
|
+
// ---------------------------------------------------------------------------
|
|
644
|
+
|
|
645
|
+
export function assembleResult(
|
|
646
|
+
graderOutput: GraderOutput,
|
|
647
|
+
telemetry: SessionTelemetryRecord,
|
|
648
|
+
sessionId: string,
|
|
649
|
+
skillName: string,
|
|
650
|
+
transcriptPath: string,
|
|
651
|
+
): GradingResult {
|
|
652
|
+
const result = assembleResultFromExpectations(
|
|
653
|
+
normalizeExpectations(graderOutput?.expectations ?? []),
|
|
654
|
+
telemetry,
|
|
655
|
+
sessionId,
|
|
656
|
+
skillName,
|
|
657
|
+
transcriptPath,
|
|
658
|
+
);
|
|
659
|
+
return {
|
|
660
|
+
...result,
|
|
306
661
|
claims: graderOutput?.claims ?? [],
|
|
307
662
|
eval_feedback: graderOutput?.eval_feedback ?? { suggestions: [], overall: "" },
|
|
308
663
|
failure_feedback: graderOutput?.failure_feedback,
|
|
@@ -348,19 +703,43 @@ export async function cliMain(): Promise<void> {
|
|
|
348
703
|
const { values } = parseArgs({
|
|
349
704
|
options: {
|
|
350
705
|
skill: { type: "string" },
|
|
706
|
+
"skill-path": { type: "string" },
|
|
351
707
|
expectations: { type: "string", multiple: true },
|
|
352
708
|
"evals-json": { type: "string" },
|
|
353
709
|
"eval-id": { type: "string" },
|
|
354
710
|
"session-id": { type: "string" },
|
|
355
711
|
transcript: { type: "string" },
|
|
356
712
|
"telemetry-log": { type: "string", default: TELEMETRY_LOG },
|
|
357
|
-
output: { type: "string"
|
|
713
|
+
output: { type: "string" },
|
|
358
714
|
agent: { type: "string" },
|
|
359
715
|
"show-transcript": { type: "boolean", default: false },
|
|
716
|
+
help: { type: "boolean", short: "h", default: false },
|
|
360
717
|
},
|
|
361
718
|
strict: true,
|
|
362
719
|
});
|
|
363
720
|
|
|
721
|
+
if (values.help) {
|
|
722
|
+
console.log(`selftune grade — Grade a skill session
|
|
723
|
+
|
|
724
|
+
Usage:
|
|
725
|
+
selftune grade --skill <name> [options]
|
|
726
|
+
|
|
727
|
+
Options:
|
|
728
|
+
--skill Skill name (required)
|
|
729
|
+
--skill-path Path to SKILL.md (for auto-deriving expectations)
|
|
730
|
+
--expectations Expectation strings (repeatable)
|
|
731
|
+
--evals-json Path to evals JSON file
|
|
732
|
+
--eval-id Eval ID within evals JSON
|
|
733
|
+
--session-id Grade a specific session by ID
|
|
734
|
+
--transcript Path to transcript file
|
|
735
|
+
--telemetry-log Path to telemetry log (default: ~/.claude/session_telemetry_log.jsonl)
|
|
736
|
+
--output Output path for grading JSON (default: ~/.selftune/grading/result-<session>.json)
|
|
737
|
+
--agent Agent CLI to use (${AGENT_CANDIDATES.join(", ")})
|
|
738
|
+
--show-transcript Print transcript excerpt before grading
|
|
739
|
+
-h, --help Show this help message`);
|
|
740
|
+
process.exit(0);
|
|
741
|
+
}
|
|
742
|
+
|
|
364
743
|
const skill = values.skill;
|
|
365
744
|
if (!skill) {
|
|
366
745
|
console.error("[ERROR] --skill is required");
|
|
@@ -369,7 +748,7 @@ export async function cliMain(): Promise<void> {
|
|
|
369
748
|
|
|
370
749
|
// --- Determine agent ---
|
|
371
750
|
let agent: string | null = null;
|
|
372
|
-
const validAgents = [
|
|
751
|
+
const validAgents = [...AGENT_CANDIDATES];
|
|
373
752
|
if (values.agent) {
|
|
374
753
|
if (!validAgents.includes(values.agent)) {
|
|
375
754
|
console.error(
|
|
@@ -384,8 +763,8 @@ export async function cliMain(): Promise<void> {
|
|
|
384
763
|
|
|
385
764
|
if (!agent) {
|
|
386
765
|
console.error(
|
|
387
|
-
|
|
388
|
-
"Install
|
|
766
|
+
`[ERROR] No supported agent CLI (${AGENT_CANDIDATES.join("/")}) found in PATH.\n` +
|
|
767
|
+
"Install one of the supported agent CLIs.",
|
|
389
768
|
);
|
|
390
769
|
process.exit(1);
|
|
391
770
|
}
|
|
@@ -404,8 +783,18 @@ export async function cliMain(): Promise<void> {
|
|
|
404
783
|
} else if (values.expectations?.length) {
|
|
405
784
|
expectations = values.expectations;
|
|
406
785
|
} else {
|
|
407
|
-
|
|
408
|
-
|
|
786
|
+
// Auto-derive expectations from SKILL.md
|
|
787
|
+
const derived = deriveExpectationsFromSkill(skill, values["skill-path"]);
|
|
788
|
+
expectations = derived.expectations;
|
|
789
|
+
if (derived.derived) {
|
|
790
|
+
console.error(
|
|
791
|
+
`[INFO] Auto-derived ${derived.expectations.length} expectations from ${derived.source}`,
|
|
792
|
+
);
|
|
793
|
+
} else {
|
|
794
|
+
console.error(
|
|
795
|
+
`[WARN] No --expectations or --evals-json provided. Using generic expectations (${derived.source})`,
|
|
796
|
+
);
|
|
797
|
+
}
|
|
409
798
|
}
|
|
410
799
|
|
|
411
800
|
// --- Resolve session ---
|
|
@@ -415,9 +804,15 @@ export async function cliMain(): Promise<void> {
|
|
|
415
804
|
|
|
416
805
|
const telemetryLog = values["telemetry-log"] ?? TELEMETRY_LOG;
|
|
417
806
|
const telRecords = readJsonl<SessionTelemetryRecord>(telemetryLog);
|
|
807
|
+
const skillUsageRecords = readEffectiveSkillUsageRecords();
|
|
418
808
|
|
|
419
809
|
if (values.transcript) {
|
|
420
810
|
transcriptPath = values.transcript;
|
|
811
|
+
telemetry =
|
|
812
|
+
buildTelemetryFromTranscript(
|
|
813
|
+
values["session-id"] ?? basename(transcriptPath, ".jsonl"),
|
|
814
|
+
transcriptPath,
|
|
815
|
+
) ?? ({} as SessionTelemetryRecord);
|
|
421
816
|
for (let i = telRecords.length - 1; i >= 0; i--) {
|
|
422
817
|
if (telRecords[i].transcript_path === transcriptPath) {
|
|
423
818
|
telemetry = telRecords[i];
|
|
@@ -425,18 +820,25 @@ export async function cliMain(): Promise<void> {
|
|
|
425
820
|
break;
|
|
426
821
|
}
|
|
427
822
|
}
|
|
823
|
+
if (telemetry.session_id) sessionId = telemetry.session_id;
|
|
428
824
|
} else if (values["session-id"]) {
|
|
429
825
|
sessionId = values["session-id"];
|
|
430
|
-
|
|
431
|
-
|
|
826
|
+
const resolved = resolveSessionById(telRecords, sessionId);
|
|
827
|
+
telemetry = resolved?.telemetry ?? ({} as SessionTelemetryRecord);
|
|
828
|
+
transcriptPath = resolved?.transcriptPath ?? "";
|
|
432
829
|
} else {
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
830
|
+
const resolved = resolveLatestSessionForSkill(telRecords, skillUsageRecords, skill);
|
|
831
|
+
telemetry = resolved?.telemetry ?? ({} as SessionTelemetryRecord);
|
|
832
|
+
if (resolved) {
|
|
833
|
+
sessionId = resolved.sessionId;
|
|
834
|
+
transcriptPath = resolved.transcriptPath;
|
|
835
|
+
const note =
|
|
836
|
+
resolved.source === "telemetry" ? "" : ` (${resolved.source.replaceAll("_", " ")})`;
|
|
837
|
+
console.error(`[INFO] Grading most recent '${skill}' session: ${sessionId}${note}`);
|
|
438
838
|
} else {
|
|
439
|
-
console.error(
|
|
839
|
+
console.error(
|
|
840
|
+
`[WARN] No session found for skill '${skill}' in telemetry or recovered usage data.`,
|
|
841
|
+
);
|
|
440
842
|
}
|
|
441
843
|
}
|
|
442
844
|
|
|
@@ -448,74 +850,23 @@ export async function cliMain(): Promise<void> {
|
|
|
448
850
|
console.log("==========================\n");
|
|
449
851
|
}
|
|
450
852
|
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
console.error(
|
|
464
|
-
|
|
465
|
-
);
|
|
466
|
-
allExpectations = preGateResult.resolved;
|
|
467
|
-
} else {
|
|
468
|
-
// Build prompt and grade remaining via LLM
|
|
469
|
-
console.error(
|
|
470
|
-
`[INFO] Pre-gates resolved ${preGateResult.resolved.length}/${expectations.length} expectations`,
|
|
471
|
-
);
|
|
472
|
-
const prompt = buildGradingPrompt(preGateResult.remaining, telemetry, transcriptExcerpt, skill);
|
|
473
|
-
console.error(`Grading ${preGateResult.remaining.length} expectations for skill '${skill}'...`);
|
|
474
|
-
|
|
475
|
-
let graderOutput: GraderOutput;
|
|
476
|
-
try {
|
|
477
|
-
graderOutput = await gradeViaAgent(prompt, agent);
|
|
478
|
-
} catch (e) {
|
|
479
|
-
console.error(`[ERROR] Grading failed: ${e}`);
|
|
480
|
-
process.exit(1);
|
|
481
|
-
}
|
|
482
|
-
|
|
483
|
-
// Default scores on LLM results
|
|
484
|
-
const llmExpectations = (graderOutput.expectations ?? []).map((e) => ({
|
|
485
|
-
...e,
|
|
486
|
-
score: e.score ?? (e.passed ? 1.0 : 0.0),
|
|
487
|
-
source: e.source ?? ("llm" as const),
|
|
488
|
-
}));
|
|
489
|
-
|
|
490
|
-
// Merge pre-gate + LLM results
|
|
491
|
-
allExpectations = [...preGateResult.resolved, ...llmExpectations];
|
|
853
|
+
let result: GradingResult;
|
|
854
|
+
try {
|
|
855
|
+
result = await gradeSession({
|
|
856
|
+
expectations,
|
|
857
|
+
telemetry,
|
|
858
|
+
sessionId,
|
|
859
|
+
skillName: skill,
|
|
860
|
+
transcriptExcerpt,
|
|
861
|
+
transcriptPath,
|
|
862
|
+
agent,
|
|
863
|
+
});
|
|
864
|
+
} catch (err) {
|
|
865
|
+
console.error(`[ERROR] ${err instanceof Error ? err.message : String(err)}`);
|
|
866
|
+
process.exit(1);
|
|
492
867
|
}
|
|
493
868
|
|
|
494
|
-
|
|
495
|
-
const graduated = buildGraduatedSummary(allExpectations);
|
|
496
|
-
const passedCount = allExpectations.filter((e) => e.passed).length;
|
|
497
|
-
const totalCount = allExpectations.length;
|
|
498
|
-
|
|
499
|
-
const result: GradingResult = {
|
|
500
|
-
session_id: sessionId,
|
|
501
|
-
skill_name: skill,
|
|
502
|
-
transcript_path: transcriptPath,
|
|
503
|
-
graded_at: new Date().toISOString(),
|
|
504
|
-
expectations: allExpectations,
|
|
505
|
-
summary: {
|
|
506
|
-
passed: passedCount,
|
|
507
|
-
failed: totalCount - passedCount,
|
|
508
|
-
total: totalCount,
|
|
509
|
-
pass_rate: totalCount > 0 ? passedCount / totalCount : 0,
|
|
510
|
-
mean_score: graduated.mean_score,
|
|
511
|
-
score_std_dev: graduated.score_std_dev,
|
|
512
|
-
},
|
|
513
|
-
execution_metrics: buildExecutionMetrics(telemetry),
|
|
514
|
-
claims: [],
|
|
515
|
-
eval_feedback: { suggestions: [], overall: "" },
|
|
516
|
-
};
|
|
517
|
-
|
|
518
|
-
const outputPath = values.output ?? "grading.json";
|
|
869
|
+
const outputPath = values.output ?? buildDefaultGradingOutputPath(sessionId);
|
|
519
870
|
const outputDir = dirname(outputPath);
|
|
520
871
|
if (outputDir !== ".") {
|
|
521
872
|
mkdirSync(outputDir, { recursive: true });
|