selftune 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +23 -0
- package/README.md +259 -0
- package/bin/selftune.cjs +29 -0
- package/cli/selftune/constants.ts +71 -0
- package/cli/selftune/eval/hooks-to-evals.ts +422 -0
- package/cli/selftune/evolution/audit.ts +44 -0
- package/cli/selftune/evolution/deploy-proposal.ts +244 -0
- package/cli/selftune/evolution/evolve.ts +406 -0
- package/cli/selftune/evolution/extract-patterns.ts +145 -0
- package/cli/selftune/evolution/propose-description.ts +146 -0
- package/cli/selftune/evolution/rollback.ts +242 -0
- package/cli/selftune/evolution/stopping-criteria.ts +69 -0
- package/cli/selftune/evolution/validate-proposal.ts +137 -0
- package/cli/selftune/grading/grade-session.ts +459 -0
- package/cli/selftune/hooks/prompt-log.ts +52 -0
- package/cli/selftune/hooks/session-stop.ts +54 -0
- package/cli/selftune/hooks/skill-eval.ts +73 -0
- package/cli/selftune/index.ts +104 -0
- package/cli/selftune/ingestors/codex-rollout.ts +416 -0
- package/cli/selftune/ingestors/codex-wrapper.ts +332 -0
- package/cli/selftune/ingestors/opencode-ingest.ts +565 -0
- package/cli/selftune/init.ts +297 -0
- package/cli/selftune/monitoring/watch.ts +328 -0
- package/cli/selftune/observability.ts +255 -0
- package/cli/selftune/types.ts +255 -0
- package/cli/selftune/utils/jsonl.ts +75 -0
- package/cli/selftune/utils/llm-call.ts +192 -0
- package/cli/selftune/utils/logging.ts +40 -0
- package/cli/selftune/utils/schema-validator.ts +47 -0
- package/cli/selftune/utils/seeded-random.ts +31 -0
- package/cli/selftune/utils/transcript.ts +260 -0
- package/package.json +29 -0
- package/skill/SKILL.md +120 -0
- package/skill/Workflows/Doctor.md +145 -0
- package/skill/Workflows/Evals.md +193 -0
- package/skill/Workflows/Evolve.md +159 -0
- package/skill/Workflows/Grade.md +157 -0
- package/skill/Workflows/Ingest.md +159 -0
- package/skill/Workflows/Initialize.md +125 -0
- package/skill/Workflows/Rollback.md +131 -0
- package/skill/Workflows/Watch.md +128 -0
- package/skill/references/grading-methodology.md +176 -0
- package/skill/references/invocation-taxonomy.md +144 -0
- package/skill/references/logs.md +168 -0
- package/skill/settings_snippet.json +41 -0
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* validate-proposal.ts
|
|
3
|
+
*
|
|
4
|
+
* Validates an evolution proposal by running trigger checks against an eval set.
|
|
5
|
+
* Compares trigger accuracy between the original and proposed skill descriptions
|
|
6
|
+
* to determine whether the proposal is an improvement.
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
import type { EvalEntry, EvolutionProposal } from "../types.js";
|
|
10
|
+
import { callLlm } from "../utils/llm-call.js";
|
|
11
|
+
|
|
12
|
+
// ---------------------------------------------------------------------------
|
|
13
|
+
// Types
|
|
14
|
+
// ---------------------------------------------------------------------------
|
|
15
|
+
|
|
16
|
+
/**
 * Outcome of replaying the trigger eval set against both the original and
 * the proposed skill descriptions.
 */
export interface ValidationResult {
  proposal_id: string;
  // Fraction of eval entries that passed with the original description.
  before_pass_rate: number;
  // Fraction of eval entries that passed with the proposed description.
  after_pass_rate: number;
  // True when the proposal clears all improvement criteria (see validateProposal).
  improved: boolean;
  regressions: EvalEntry[]; // passed before, fail after
  new_passes: EvalEntry[]; // failed before, pass after
  net_change: number; // after - before pass rate
}
|
|
25
|
+
|
|
26
|
+
// ---------------------------------------------------------------------------
|
|
27
|
+
// Prompt building
|
|
28
|
+
// ---------------------------------------------------------------------------
|
|
29
|
+
|
|
30
|
+
/** Build the trigger check prompt for the LLM. */
|
|
31
|
+
export function buildTriggerCheckPrompt(description: string, query: string): string {
|
|
32
|
+
return [
|
|
33
|
+
"Given this skill description, would the following user query trigger this skill?",
|
|
34
|
+
"Respond YES or NO only.",
|
|
35
|
+
"",
|
|
36
|
+
"Skill description:",
|
|
37
|
+
description,
|
|
38
|
+
"",
|
|
39
|
+
"User query:",
|
|
40
|
+
query,
|
|
41
|
+
].join("\n");
|
|
42
|
+
}
|
|
43
|
+
|
|
44
|
+
// ---------------------------------------------------------------------------
|
|
45
|
+
// Response parsing
|
|
46
|
+
// ---------------------------------------------------------------------------
|
|
47
|
+
|
|
48
|
+
/** Parse YES/NO from LLM response. */
|
|
49
|
+
export function parseTriggerResponse(response: string): boolean {
|
|
50
|
+
const normalized = response.trim().toUpperCase();
|
|
51
|
+
if (normalized.startsWith("YES")) return true;
|
|
52
|
+
if (normalized.startsWith("NO")) return false;
|
|
53
|
+
return false; // conservative default
|
|
54
|
+
}
|
|
55
|
+
|
|
56
|
+
// ---------------------------------------------------------------------------
|
|
57
|
+
// Proposal validation
|
|
58
|
+
// ---------------------------------------------------------------------------
|
|
59
|
+
|
|
60
|
+
/** Validate a proposal by running trigger checks against the eval set. */
|
|
61
|
+
export async function validateProposal(
|
|
62
|
+
proposal: EvolutionProposal,
|
|
63
|
+
evalSet: EvalEntry[],
|
|
64
|
+
mode: "agent" | "api",
|
|
65
|
+
agent?: string,
|
|
66
|
+
): Promise<ValidationResult> {
|
|
67
|
+
if (evalSet.length === 0) {
|
|
68
|
+
return {
|
|
69
|
+
proposal_id: proposal.proposal_id,
|
|
70
|
+
before_pass_rate: 0,
|
|
71
|
+
after_pass_rate: 0,
|
|
72
|
+
improved: false,
|
|
73
|
+
regressions: [],
|
|
74
|
+
new_passes: [],
|
|
75
|
+
net_change: 0,
|
|
76
|
+
};
|
|
77
|
+
}
|
|
78
|
+
|
|
79
|
+
const systemPrompt = "You are an evaluation assistant. Answer only YES or NO.";
|
|
80
|
+
const regressions: EvalEntry[] = [];
|
|
81
|
+
const newPasses: EvalEntry[] = [];
|
|
82
|
+
let beforePassed = 0;
|
|
83
|
+
let afterPassed = 0;
|
|
84
|
+
|
|
85
|
+
for (const entry of evalSet) {
|
|
86
|
+
// Check with original description
|
|
87
|
+
const beforePrompt = buildTriggerCheckPrompt(proposal.original_description, entry.query);
|
|
88
|
+
const beforeRaw = await callLlm(systemPrompt, beforePrompt, mode, agent);
|
|
89
|
+
const beforeTriggered = parseTriggerResponse(beforeRaw);
|
|
90
|
+
const beforePass =
|
|
91
|
+
(entry.should_trigger && beforeTriggered) || (!entry.should_trigger && !beforeTriggered);
|
|
92
|
+
|
|
93
|
+
// Check with proposed description
|
|
94
|
+
const afterPrompt = buildTriggerCheckPrompt(proposal.proposed_description, entry.query);
|
|
95
|
+
const afterRaw = await callLlm(systemPrompt, afterPrompt, mode, agent);
|
|
96
|
+
const afterTriggered = parseTriggerResponse(afterRaw);
|
|
97
|
+
const afterPass =
|
|
98
|
+
(entry.should_trigger && afterTriggered) || (!entry.should_trigger && !afterTriggered);
|
|
99
|
+
|
|
100
|
+
if (beforePass) beforePassed++;
|
|
101
|
+
if (afterPass) afterPassed++;
|
|
102
|
+
|
|
103
|
+
// Regression: passed before, fails after
|
|
104
|
+
if (beforePass && !afterPass) {
|
|
105
|
+
regressions.push(entry);
|
|
106
|
+
}
|
|
107
|
+
|
|
108
|
+
// New pass: failed before, passes after
|
|
109
|
+
if (!beforePass && afterPass) {
|
|
110
|
+
newPasses.push(entry);
|
|
111
|
+
}
|
|
112
|
+
}
|
|
113
|
+
|
|
114
|
+
const total = evalSet.length;
|
|
115
|
+
const beforePassRate = beforePassed / total;
|
|
116
|
+
const afterPassRate = afterPassed / total;
|
|
117
|
+
const netChange = afterPassRate - beforePassRate;
|
|
118
|
+
|
|
119
|
+
// A proposal is improved when ALL of:
|
|
120
|
+
// - after_pass_rate > before_pass_rate
|
|
121
|
+
// - regressions count < 5% of total eval entries
|
|
122
|
+
// - Either net improvement >= 0.10 OR new_passes.length >= 2
|
|
123
|
+
const improved =
|
|
124
|
+
afterPassRate > beforePassRate &&
|
|
125
|
+
regressions.length < total * 0.05 &&
|
|
126
|
+
(netChange >= 0.1 || newPasses.length >= 2);
|
|
127
|
+
|
|
128
|
+
return {
|
|
129
|
+
proposal_id: proposal.proposal_id,
|
|
130
|
+
before_pass_rate: beforePassRate,
|
|
131
|
+
after_pass_rate: afterPassRate,
|
|
132
|
+
improved,
|
|
133
|
+
regressions,
|
|
134
|
+
new_passes: newPasses,
|
|
135
|
+
net_change: netChange,
|
|
136
|
+
};
|
|
137
|
+
}
|
|
@@ -0,0 +1,459 @@
|
|
|
1
|
+
#!/usr/bin/env bun
|
|
2
|
+
/**
|
|
3
|
+
* grade-session.ts
|
|
4
|
+
*
|
|
5
|
+
* Rubric-based grader for Claude Code skill sessions.
|
|
6
|
+
* Migrated from grade_session.py.
|
|
7
|
+
*
|
|
8
|
+
* Two modes:
|
|
9
|
+
* 1. --use-agent (default when no ANTHROPIC_API_KEY) — invokes installed agent CLI
|
|
10
|
+
* 2. --use-api (default when ANTHROPIC_API_KEY set) — calls Anthropic API directly
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
import { mkdirSync, readFileSync, writeFileSync } from "node:fs";
|
|
14
|
+
import { dirname } from "node:path";
|
|
15
|
+
import { parseArgs } from "node:util";
|
|
16
|
+
|
|
17
|
+
import { TELEMETRY_LOG } from "../constants.js";
|
|
18
|
+
import type {
|
|
19
|
+
ExecutionMetrics,
|
|
20
|
+
GraderOutput,
|
|
21
|
+
GradingResult,
|
|
22
|
+
SessionTelemetryRecord,
|
|
23
|
+
} from "../types.js";
|
|
24
|
+
import { readJsonl } from "../utils/jsonl.js";
|
|
25
|
+
import {
|
|
26
|
+
detectAgent as _detectAgent,
|
|
27
|
+
stripMarkdownFences as _stripMarkdownFences,
|
|
28
|
+
callViaAgent,
|
|
29
|
+
callViaApi,
|
|
30
|
+
} from "../utils/llm-call.js";
|
|
31
|
+
import { readExcerpt } from "../utils/transcript.js";
|
|
32
|
+
|
|
33
|
+
// Re-export for backward compatibility
|
|
34
|
+
export { detectAgent, stripMarkdownFences } from "../utils/llm-call.js";
|
|
35
|
+
|
|
36
|
+
// ---------------------------------------------------------------------------
|
|
37
|
+
// Constants
|
|
38
|
+
// ---------------------------------------------------------------------------
|
|
39
|
+
|
|
40
|
+
// Hard cap on transcript characters embedded in the grading prompt.
export const MAX_TRANSCRIPT_LENGTH = 50000;
|
|
41
|
+
|
|
42
|
+
// ---------------------------------------------------------------------------
|
|
43
|
+
// Grader system prompt
|
|
44
|
+
// ---------------------------------------------------------------------------
|
|
45
|
+
|
|
46
|
+
// System prompt for the grading LLM. Defines the exact JSON schema the
// grader must emit; parsed downstream by gradeViaAgent/gradeViaApi, so the
// schema text here is load-bearing — do not reword casually.
export const GRADER_SYSTEM = `You are a rigorous skill session evaluator. You receive:
1. Expectations to grade (things that should be true)
2. Process telemetry: tool calls, bash commands, skills triggered, errors
3. A transcript excerpt showing what happened

Grade each expectation and output ONLY valid JSON matching this schema:
{
"expectations": [
{"text": "...", "passed": true/false, "evidence": "specific quote or metric"}
],
"summary": {"passed": N, "failed": N, "total": N, "pass_rate": 0.0},
"claims": [
{"claim": "...", "type": "factual|process|quality", "verified": true/false, "evidence": "..."}
],
"eval_feedback": {
"suggestions": [{"assertion": "...", "reason": "..."}],
"overall": "one sentence"
}
}

Rules:
- PASS only when there is clear, specific evidence — not assumptions
- FAIL when evidence is absent or contradictory
- Cite exact quotes or specific metric values
- Extract 2-4 implicit claims from the transcript and verify them
- Suggest eval improvements only for clear gaps`;
|
|
72
|
+
|
|
73
|
+
// ---------------------------------------------------------------------------
|
|
74
|
+
// Data lookup helpers
|
|
75
|
+
// ---------------------------------------------------------------------------
|
|
76
|
+
|
|
77
|
+
export function findSession(
|
|
78
|
+
records: SessionTelemetryRecord[],
|
|
79
|
+
sessionId: string,
|
|
80
|
+
): SessionTelemetryRecord | null {
|
|
81
|
+
for (let i = records.length - 1; i >= 0; i--) {
|
|
82
|
+
if (records[i].session_id === sessionId) return records[i];
|
|
83
|
+
}
|
|
84
|
+
return null;
|
|
85
|
+
}
|
|
86
|
+
|
|
87
|
+
export function latestSessionForSkill(
|
|
88
|
+
telemetry: SessionTelemetryRecord[],
|
|
89
|
+
skillName: string,
|
|
90
|
+
): SessionTelemetryRecord | null {
|
|
91
|
+
for (let i = telemetry.length - 1; i >= 0; i--) {
|
|
92
|
+
if (telemetry[i].skills_triggered?.includes(skillName)) return telemetry[i];
|
|
93
|
+
}
|
|
94
|
+
return null;
|
|
95
|
+
}
|
|
96
|
+
|
|
97
|
+
export function loadExpectationsFromEvalsJson(evalsJsonPath: string, evalId: number): string[] {
|
|
98
|
+
let data: unknown;
|
|
99
|
+
try {
|
|
100
|
+
data = JSON.parse(readFileSync(evalsJsonPath, "utf-8"));
|
|
101
|
+
} catch (err) {
|
|
102
|
+
throw new Error(
|
|
103
|
+
`Failed to read or parse evals JSON at ${evalsJsonPath}: ${err instanceof Error ? err.message : String(err)}`,
|
|
104
|
+
);
|
|
105
|
+
}
|
|
106
|
+
|
|
107
|
+
if (typeof data !== "object" || data === null || Array.isArray(data)) {
|
|
108
|
+
throw new Error(
|
|
109
|
+
`Invalid evals JSON at ${evalsJsonPath}: expected a top-level object, got ${Array.isArray(data) ? "array" : typeof data}`,
|
|
110
|
+
);
|
|
111
|
+
}
|
|
112
|
+
|
|
113
|
+
const record = data as Record<string, unknown>;
|
|
114
|
+
if (!Array.isArray(record.evals)) {
|
|
115
|
+
throw new Error(
|
|
116
|
+
`Invalid evals JSON at ${evalsJsonPath}: expected "evals" to be an array, got ${typeof record.evals}`,
|
|
117
|
+
);
|
|
118
|
+
}
|
|
119
|
+
|
|
120
|
+
for (const ev of record.evals) {
|
|
121
|
+
if (typeof ev !== "object" || ev === null || Array.isArray(ev)) {
|
|
122
|
+
throw new Error(
|
|
123
|
+
`Invalid eval entry in ${evalsJsonPath}: expected an object, got ${Array.isArray(ev) ? "array" : typeof ev}`,
|
|
124
|
+
);
|
|
125
|
+
}
|
|
126
|
+
const entry = ev as Record<string, unknown>;
|
|
127
|
+
if (entry.id === evalId) {
|
|
128
|
+
if (entry.expectations === undefined || entry.expectations === null) {
|
|
129
|
+
return [];
|
|
130
|
+
}
|
|
131
|
+
if (!Array.isArray(entry.expectations)) {
|
|
132
|
+
throw new Error(
|
|
133
|
+
`Invalid eval entry (id=${evalId}) in ${evalsJsonPath}: expected "expectations" to be an array, got ${typeof entry.expectations}`,
|
|
134
|
+
);
|
|
135
|
+
}
|
|
136
|
+
for (let i = 0; i < entry.expectations.length; i++) {
|
|
137
|
+
if (typeof entry.expectations[i] !== "string") {
|
|
138
|
+
throw new Error(
|
|
139
|
+
`Invalid eval entry (id=${evalId}) in ${evalsJsonPath}: expectations[${i}] must be a string, got ${typeof entry.expectations[i]}`,
|
|
140
|
+
);
|
|
141
|
+
}
|
|
142
|
+
}
|
|
143
|
+
return entry.expectations as string[];
|
|
144
|
+
}
|
|
145
|
+
}
|
|
146
|
+
throw new Error(`Eval ID ${evalId} not found in ${evalsJsonPath}`);
|
|
147
|
+
}
|
|
148
|
+
|
|
149
|
+
// ---------------------------------------------------------------------------
|
|
150
|
+
// Execution metrics
|
|
151
|
+
// ---------------------------------------------------------------------------
|
|
152
|
+
|
|
153
|
+
export function buildExecutionMetrics(telemetry: SessionTelemetryRecord): ExecutionMetrics {
|
|
154
|
+
return {
|
|
155
|
+
tool_calls: telemetry.tool_calls ?? {},
|
|
156
|
+
total_tool_calls: telemetry.total_tool_calls ?? 0,
|
|
157
|
+
total_steps: telemetry.assistant_turns ?? 0,
|
|
158
|
+
bash_commands_run: (telemetry.bash_commands ?? []).length,
|
|
159
|
+
errors_encountered: telemetry.errors_encountered ?? 0,
|
|
160
|
+
skills_triggered: telemetry.skills_triggered ?? [],
|
|
161
|
+
transcript_chars: telemetry.transcript_chars ?? 0,
|
|
162
|
+
};
|
|
163
|
+
}
|
|
164
|
+
|
|
165
|
+
// ---------------------------------------------------------------------------
|
|
166
|
+
// Prompt building
|
|
167
|
+
// ---------------------------------------------------------------------------
|
|
168
|
+
|
|
169
|
+
/**
 * Build the user-side grading prompt: telemetry summary, transcript
 * excerpt, and numbered expectations — paired with GRADER_SYSTEM when sent
 * to the grading LLM.
 */
export function buildGradingPrompt(
  expectations: string[],
  telemetry: SessionTelemetryRecord,
  transcriptExcerpt: string,
  skillName: string,
): string {
  const toolSummary = JSON.stringify(telemetry.tool_calls ?? {}, null, 2);
  const commands = telemetry.bash_commands ?? [];
  // Cap at 20 commands, 120 chars each, to keep the prompt bounded.
  const cmdSummary =
    commands
      .slice(0, 20)
      .map((c) => ` $ ${c.slice(0, 120)}`)
      .join("\n") || " (none)";

  const expectationsList = expectations.map((e, i) => `${i + 1}. ${e}`).join("\n");

  // Truncate long transcripts so the grader prompt stays within limits.
  const excerpt =
    transcriptExcerpt.length > MAX_TRANSCRIPT_LENGTH
      ? transcriptExcerpt.slice(0, MAX_TRANSCRIPT_LENGTH)
      : transcriptExcerpt;

  return `Skill: ${skillName}

=== PROCESS TELEMETRY ===
Skills triggered: ${JSON.stringify(telemetry.skills_triggered ?? [])}
Assistant turns: ${telemetry.assistant_turns ?? "?"}
Errors: ${telemetry.errors_encountered ?? "?"}
Total tool calls: ${telemetry.total_tool_calls ?? "?"}

Tool breakdown:
${toolSummary}

Bash commands:
${cmdSummary}

=== TRANSCRIPT EXCERPT ===
${excerpt}

=== EXPECTATIONS ===
${expectationsList}

Grade each expectation. Output JSON only.`;
}
|
|
212
|
+
|
|
213
|
+
// ---------------------------------------------------------------------------
|
|
214
|
+
// Grading via agent subprocess
|
|
215
|
+
// ---------------------------------------------------------------------------
|
|
216
|
+
|
|
217
|
+
export async function gradeViaAgent(prompt: string, agent: string): Promise<GraderOutput> {
|
|
218
|
+
const raw = await callViaAgent(GRADER_SYSTEM, prompt, agent);
|
|
219
|
+
try {
|
|
220
|
+
return JSON.parse(_stripMarkdownFences(raw)) as GraderOutput;
|
|
221
|
+
} catch (err) {
|
|
222
|
+
throw new Error(
|
|
223
|
+
`gradeViaAgent: failed to parse LLM output as JSON. Raw (truncated): ${raw.slice(0, 200)}`,
|
|
224
|
+
{ cause: err },
|
|
225
|
+
);
|
|
226
|
+
}
|
|
227
|
+
}
|
|
228
|
+
|
|
229
|
+
// ---------------------------------------------------------------------------
|
|
230
|
+
// Grading via direct Anthropic API
|
|
231
|
+
// ---------------------------------------------------------------------------
|
|
232
|
+
|
|
233
|
+
export async function gradeViaApi(prompt: string): Promise<GraderOutput> {
|
|
234
|
+
const raw = await callViaApi(GRADER_SYSTEM, prompt);
|
|
235
|
+
try {
|
|
236
|
+
return JSON.parse(_stripMarkdownFences(raw)) as GraderOutput;
|
|
237
|
+
} catch (err) {
|
|
238
|
+
throw new Error(
|
|
239
|
+
`gradeViaApi: failed to parse LLM output as JSON. Raw (truncated): ${raw.slice(0, 200)}`,
|
|
240
|
+
{ cause: err },
|
|
241
|
+
);
|
|
242
|
+
}
|
|
243
|
+
}
|
|
244
|
+
|
|
245
|
+
// ---------------------------------------------------------------------------
|
|
246
|
+
// Result assembly
|
|
247
|
+
// ---------------------------------------------------------------------------
|
|
248
|
+
|
|
249
|
+
export function assembleResult(
|
|
250
|
+
graderOutput: GraderOutput,
|
|
251
|
+
telemetry: SessionTelemetryRecord,
|
|
252
|
+
sessionId: string,
|
|
253
|
+
skillName: string,
|
|
254
|
+
transcriptPath: string,
|
|
255
|
+
): GradingResult {
|
|
256
|
+
return {
|
|
257
|
+
session_id: sessionId ?? "unknown",
|
|
258
|
+
skill_name: skillName ?? "unknown",
|
|
259
|
+
transcript_path: transcriptPath ?? "",
|
|
260
|
+
graded_at: new Date().toISOString(),
|
|
261
|
+
expectations: graderOutput?.expectations ?? [],
|
|
262
|
+
summary: graderOutput?.summary ?? { passed: 0, failed: 0, total: 0, pass_rate: 0 },
|
|
263
|
+
execution_metrics: buildExecutionMetrics(telemetry ?? ({} as SessionTelemetryRecord)),
|
|
264
|
+
claims: graderOutput?.claims ?? [],
|
|
265
|
+
eval_feedback: graderOutput?.eval_feedback ?? { suggestions: [], overall: "" },
|
|
266
|
+
};
|
|
267
|
+
}
|
|
268
|
+
|
|
269
|
+
// ---------------------------------------------------------------------------
|
|
270
|
+
// Summary printer
|
|
271
|
+
// ---------------------------------------------------------------------------
|
|
272
|
+
|
|
273
|
+
function printSummary(result: GradingResult): void {
|
|
274
|
+
const { summary } = result;
|
|
275
|
+
const rate = summary.pass_rate ?? 0;
|
|
276
|
+
console.log(`\nResults: ${summary.passed}/${summary.total} passed (${Math.round(rate * 100)}%)`);
|
|
277
|
+
for (const exp of result.expectations ?? []) {
|
|
278
|
+
const icon = exp.passed ? "\u2713" : "\u2717";
|
|
279
|
+
console.log(` ${icon} ${String(exp.text ?? "").slice(0, 70)}`);
|
|
280
|
+
if (!exp.passed) {
|
|
281
|
+
console.log(` -> ${String(exp.evidence ?? "").slice(0, 100)}`);
|
|
282
|
+
}
|
|
283
|
+
}
|
|
284
|
+
|
|
285
|
+
const feedback = result.eval_feedback;
|
|
286
|
+
if (feedback.suggestions?.length) {
|
|
287
|
+
console.log(`\nEval feedback: ${feedback.overall}`);
|
|
288
|
+
for (const s of feedback.suggestions) {
|
|
289
|
+
console.log(` * ${String(s.reason ?? "").slice(0, 100)}`);
|
|
290
|
+
}
|
|
291
|
+
}
|
|
292
|
+
}
|
|
293
|
+
|
|
294
|
+
// ---------------------------------------------------------------------------
|
|
295
|
+
// CLI entry point
|
|
296
|
+
// ---------------------------------------------------------------------------
|
|
297
|
+
|
|
298
|
+
/**
 * CLI entry point for grading a skill session.
 *
 * Resolves grading mode (agent CLI vs direct API), loads expectations
 * (either inline via --expectations or from an evals JSON file), locates
 * the session's telemetry and transcript, runs the grader, and writes the
 * assembled GradingResult JSON to --output.
 *
 * Exits with code 1 on any unrecoverable argument or grading error.
 */
export async function cliMain(): Promise<void> {
  const { values } = parseArgs({
    options: {
      skill: { type: "string" },
      expectations: { type: "string", multiple: true },
      "evals-json": { type: "string" },
      "eval-id": { type: "string" },
      "session-id": { type: "string" },
      transcript: { type: "string" },
      "telemetry-log": { type: "string", default: TELEMETRY_LOG },
      output: { type: "string", default: "grading.json" },
      "use-agent": { type: "boolean", default: false },
      "use-api": { type: "boolean", default: false },
      agent: { type: "string" },
      "show-transcript": { type: "boolean", default: false },
    },
    strict: true,
  });

  const skill = values.skill;
  if (!skill) {
    console.error("[ERROR] --skill is required");
    process.exit(1);
  }

  // --- Determine mode ---
  // Explicit flags win; otherwise prefer an installed agent CLI, then fall
  // back to the API when ANTHROPIC_API_KEY is set.
  const hasApiKey = Boolean(process.env.ANTHROPIC_API_KEY);
  let mode: "agent" | "api";
  let agent: string | null = null;

  if (values["use-api"]) {
    mode = "api";
  } else if (values["use-agent"]) {
    mode = "agent";
  } else {
    const availableAgent = _detectAgent();
    if (availableAgent) {
      mode = "agent";
    } else if (hasApiKey) {
      mode = "api";
    } else {
      console.error(
        "[ERROR] No agent CLI (claude/codex/opencode) found in PATH " +
          "and ANTHROPIC_API_KEY not set.\n" +
          "Install Claude Code, Codex, or OpenCode, or set ANTHROPIC_API_KEY.",
      );
      process.exit(1);
    }
  }

  if (mode === "agent") {
    // Honor --agent only when it names a known CLI; otherwise auto-detect.
    const validAgents = ["claude", "codex", "opencode"];
    if (values.agent && validAgents.includes(values.agent)) {
      agent = values.agent;
    } else {
      agent = _detectAgent();
    }
    if (!agent) {
      console.error(
        "[ERROR] --use-agent specified but no agent found in PATH.\n" +
          "Install claude, codex, or opencode, or use --use-api instead.",
      );
      process.exit(1);
    }
    console.error(`[INFO] Grading via agent: ${agent}`);
  } else {
    console.error("[INFO] Grading via direct Anthropic API");
  }

  // --- Resolve expectations ---
  let expectations: string[] = [];
  if (values["evals-json"] && values["eval-id"] != null) {
    const evalIdNum = Number(values["eval-id"]);
    if (!Number.isFinite(evalIdNum) || !Number.isInteger(evalIdNum)) {
      console.error(`[ERROR] --eval-id must be a finite integer, got: ${values["eval-id"]}`);
      process.exit(1);
    }
    expectations = loadExpectationsFromEvalsJson(values["evals-json"], evalIdNum);
  } else if (values.expectations?.length) {
    expectations = values.expectations;
  } else {
    console.error("[ERROR] Provide --expectations or --evals-json + --eval-id");
    process.exit(1);
  }

  // --- Resolve session ---
  // Priority: explicit --transcript, then --session-id, then the most
  // recent telemetry record that triggered this skill.
  let telemetry = {} as SessionTelemetryRecord;
  let transcriptPath = "";
  let sessionId = "unknown";

  const telemetryLog = values["telemetry-log"] ?? TELEMETRY_LOG;
  const telRecords = readJsonl<SessionTelemetryRecord>(telemetryLog);

  if (values.transcript) {
    transcriptPath = values.transcript;
    // Walk backwards so the newest telemetry record for this transcript wins.
    for (let i = telRecords.length - 1; i >= 0; i--) {
      if (telRecords[i].transcript_path === transcriptPath) {
        telemetry = telRecords[i];
        sessionId = telRecords[i].session_id ?? "unknown";
        break;
      }
    }
  } else if (values["session-id"]) {
    sessionId = values["session-id"];
    telemetry = findSession(telRecords, sessionId) ?? ({} as SessionTelemetryRecord);
    transcriptPath = telemetry.transcript_path ?? "";
  } else {
    telemetry = latestSessionForSkill(telRecords, skill) ?? ({} as SessionTelemetryRecord);
    if (telemetry.session_id) {
      sessionId = telemetry.session_id;
      transcriptPath = telemetry.transcript_path ?? "";
      console.error(`[INFO] Grading most recent '${skill}' session: ${sessionId}`);
    } else {
      console.error(`[WARN] No telemetry for skill '${skill}'. Is session_stop_hook installed?`);
    }
  }

  const transcriptExcerpt = transcriptPath ? readExcerpt(transcriptPath) : "(no transcript)";

  if (values["show-transcript"]) {
    console.log("=== TRANSCRIPT EXCERPT ===");
    console.log(transcriptExcerpt);
    console.log("==========================\n");
  }

  // --- Build prompt and grade ---
  const prompt = buildGradingPrompt(expectations, telemetry, transcriptExcerpt, skill);

  console.error(`Grading ${expectations.length} expectations for skill '${skill}'...`);

  let graderOutput: GraderOutput;
  try {
    if (mode === "agent") {
      graderOutput = await gradeViaAgent(prompt, agent as string);
    } else {
      graderOutput = await gradeViaApi(prompt);
    }
  } catch (e) {
    console.error(`[ERROR] Grading failed: ${e}`);
    process.exit(1);
  }

  const result = assembleResult(graderOutput, telemetry, sessionId, skill, transcriptPath);

  // Create the output directory if --output points somewhere nested.
  const outputPath = values.output ?? "grading.json";
  const outputDir = dirname(outputPath);
  if (outputDir !== ".") {
    mkdirSync(outputDir, { recursive: true });
  }
  writeFileSync(outputPath, JSON.stringify(result, null, 2), "utf-8");

  printSummary(result);
  console.log(`\nWrote ${outputPath}`);
}
|
|
452
|
+
|
|
453
|
+
// Guard: only run when invoked directly (not when imported as a library).
if (import.meta.main) {
  cliMain().catch((err) => {
    // Any unhandled rejection from cliMain is fatal for the CLI.
    console.error(`[FATAL] ${err}`);
    process.exit(1);
  });
}
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
#!/usr/bin/env bun
|
|
2
|
+
/**
|
|
3
|
+
* Claude Code UserPromptSubmit hook: prompt-log.ts
|
|
4
|
+
*
|
|
5
|
+
* Fires on every user message before Claude processes it.
|
|
6
|
+
* Logs the query to ~/.claude/all_queries_log.jsonl so that
|
|
7
|
+
* hooks-to-evals can identify prompts that did NOT trigger
|
|
8
|
+
* a skill — the raw material for false-negative eval entries.
|
|
9
|
+
*/
|
|
10
|
+
|
|
11
|
+
import { QUERY_LOG, SKIP_PREFIXES } from "../constants.js";
|
|
12
|
+
import type { PromptSubmitPayload, QueryLogRecord } from "../types.js";
|
|
13
|
+
import { appendJsonl } from "../utils/jsonl.js";
|
|
14
|
+
|
|
15
|
+
/**
|
|
16
|
+
* Core processing logic, exported for testability.
|
|
17
|
+
* Returns the record that was appended, or null if skipped.
|
|
18
|
+
*/
|
|
19
|
+
export function processPrompt(
|
|
20
|
+
payload: PromptSubmitPayload,
|
|
21
|
+
logPath: string = QUERY_LOG,
|
|
22
|
+
): QueryLogRecord | null {
|
|
23
|
+
const query = (payload.user_prompt ?? "").trim();
|
|
24
|
+
|
|
25
|
+
if (!query) return null;
|
|
26
|
+
|
|
27
|
+
// Skip automated/tool messages
|
|
28
|
+
if (SKIP_PREFIXES.some((p) => query.startsWith(p))) return null;
|
|
29
|
+
|
|
30
|
+
// Skip very short noise (single chars, punctuation)
|
|
31
|
+
if (query.length < 4) return null;
|
|
32
|
+
|
|
33
|
+
const record: QueryLogRecord = {
|
|
34
|
+
timestamp: new Date().toISOString(),
|
|
35
|
+
session_id: payload.session_id ?? "unknown",
|
|
36
|
+
query,
|
|
37
|
+
};
|
|
38
|
+
|
|
39
|
+
appendJsonl(logPath, record);
|
|
40
|
+
return record;
|
|
41
|
+
}
|
|
42
|
+
|
|
43
|
+
// --- stdin main (only when executed directly, not when imported) ---
|
|
44
|
+
if (import.meta.main) {
|
|
45
|
+
try {
|
|
46
|
+
const payload: PromptSubmitPayload = JSON.parse(await Bun.stdin.text());
|
|
47
|
+
processPrompt(payload);
|
|
48
|
+
} catch {
|
|
49
|
+
// silent — hooks must never block Claude
|
|
50
|
+
}
|
|
51
|
+
process.exit(0);
|
|
52
|
+
}
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
#!/usr/bin/env bun
|
|
2
|
+
/**
|
|
3
|
+
* Claude Code Stop hook: session-stop.ts
|
|
4
|
+
*
|
|
5
|
+
* Fires when a Claude Code session ends. Reads the session's transcript JSONL
|
|
6
|
+
* and extracts process-level telemetry (tool calls, errors, skills triggered, etc).
|
|
7
|
+
* Appends one record per session to ~/.claude/session_telemetry_log.jsonl.
|
|
8
|
+
*/
|
|
9
|
+
|
|
10
|
+
import { TELEMETRY_LOG } from "../constants.js";
|
|
11
|
+
import type { SessionTelemetryRecord, StopPayload } from "../types.js";
|
|
12
|
+
import { appendJsonl } from "../utils/jsonl.js";
|
|
13
|
+
import { parseTranscript } from "../utils/transcript.js";
|
|
14
|
+
|
|
15
|
+
/**
|
|
16
|
+
* Core processing logic, exported for testability.
|
|
17
|
+
* Returns the record that was appended.
|
|
18
|
+
*/
|
|
19
|
+
export function processSessionStop(
|
|
20
|
+
payload: StopPayload,
|
|
21
|
+
logPath: string = TELEMETRY_LOG,
|
|
22
|
+
): SessionTelemetryRecord {
|
|
23
|
+
const sessionId = typeof payload.session_id === "string" ? payload.session_id : "unknown";
|
|
24
|
+
const transcriptPath = typeof payload.transcript_path === "string" ? payload.transcript_path : "";
|
|
25
|
+
const cwd = typeof payload.cwd === "string" ? payload.cwd : "";
|
|
26
|
+
|
|
27
|
+
const metrics = parseTranscript(transcriptPath);
|
|
28
|
+
|
|
29
|
+
const record: SessionTelemetryRecord = {
|
|
30
|
+
timestamp: new Date().toISOString(),
|
|
31
|
+
session_id: sessionId,
|
|
32
|
+
cwd,
|
|
33
|
+
transcript_path: transcriptPath,
|
|
34
|
+
source: "claude_code",
|
|
35
|
+
...metrics,
|
|
36
|
+
};
|
|
37
|
+
|
|
38
|
+
appendJsonl(logPath, record);
|
|
39
|
+
return record;
|
|
40
|
+
}
|
|
41
|
+
|
|
42
|
+
// --- stdin main (only when executed directly, not when imported) ---
if (import.meta.main) {
  try {
    const payload: StopPayload = JSON.parse(await Bun.stdin.text());
    processSessionStop(payload);
  } catch (err) {
    // silent — hooks must never block Claude
    // Failures are only surfaced in debug/dev runs.
    if (process.env.DEBUG || process.env.NODE_ENV === "development") {
      console.error("session-stop hook failed:", err);
    }
  }
  // Always exit 0 so the hook never aborts the Claude session.
  process.exit(0);
}
|