selftune 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. package/CHANGELOG.md +23 -0
  2. package/README.md +259 -0
  3. package/bin/selftune.cjs +29 -0
  4. package/cli/selftune/constants.ts +71 -0
  5. package/cli/selftune/eval/hooks-to-evals.ts +422 -0
  6. package/cli/selftune/evolution/audit.ts +44 -0
  7. package/cli/selftune/evolution/deploy-proposal.ts +244 -0
  8. package/cli/selftune/evolution/evolve.ts +406 -0
  9. package/cli/selftune/evolution/extract-patterns.ts +145 -0
  10. package/cli/selftune/evolution/propose-description.ts +146 -0
  11. package/cli/selftune/evolution/rollback.ts +242 -0
  12. package/cli/selftune/evolution/stopping-criteria.ts +69 -0
  13. package/cli/selftune/evolution/validate-proposal.ts +137 -0
  14. package/cli/selftune/grading/grade-session.ts +459 -0
  15. package/cli/selftune/hooks/prompt-log.ts +52 -0
  16. package/cli/selftune/hooks/session-stop.ts +54 -0
  17. package/cli/selftune/hooks/skill-eval.ts +73 -0
  18. package/cli/selftune/index.ts +104 -0
  19. package/cli/selftune/ingestors/codex-rollout.ts +416 -0
  20. package/cli/selftune/ingestors/codex-wrapper.ts +332 -0
  21. package/cli/selftune/ingestors/opencode-ingest.ts +565 -0
  22. package/cli/selftune/init.ts +297 -0
  23. package/cli/selftune/monitoring/watch.ts +328 -0
  24. package/cli/selftune/observability.ts +255 -0
  25. package/cli/selftune/types.ts +255 -0
  26. package/cli/selftune/utils/jsonl.ts +75 -0
  27. package/cli/selftune/utils/llm-call.ts +192 -0
  28. package/cli/selftune/utils/logging.ts +40 -0
  29. package/cli/selftune/utils/schema-validator.ts +47 -0
  30. package/cli/selftune/utils/seeded-random.ts +31 -0
  31. package/cli/selftune/utils/transcript.ts +260 -0
  32. package/package.json +29 -0
  33. package/skill/SKILL.md +120 -0
  34. package/skill/Workflows/Doctor.md +145 -0
  35. package/skill/Workflows/Evals.md +193 -0
  36. package/skill/Workflows/Evolve.md +159 -0
  37. package/skill/Workflows/Grade.md +157 -0
  38. package/skill/Workflows/Ingest.md +159 -0
  39. package/skill/Workflows/Initialize.md +125 -0
  40. package/skill/Workflows/Rollback.md +131 -0
  41. package/skill/Workflows/Watch.md +128 -0
  42. package/skill/references/grading-methodology.md +176 -0
  43. package/skill/references/invocation-taxonomy.md +144 -0
  44. package/skill/references/logs.md +168 -0
  45. package/skill/settings_snippet.json +41 -0
@@ -0,0 +1,137 @@
1
+ /**
2
+ * validate-proposal.ts
3
+ *
4
+ * Validates an evolution proposal by running trigger checks against an eval set.
5
+ * Compares trigger accuracy between the original and proposed skill descriptions
6
+ * to determine whether the proposal is an improvement.
7
+ */
8
+
9
+ import type { EvalEntry, EvolutionProposal } from "../types.js";
10
+ import { callLlm } from "../utils/llm-call.js";
11
+
12
+ // ---------------------------------------------------------------------------
13
+ // Types
14
+ // ---------------------------------------------------------------------------
15
+
16
/**
 * Outcome of comparing trigger accuracy between the original and the
 * proposed skill description across a single eval set.
 */
export interface ValidationResult {
  /** ID of the proposal that was validated. */
  proposal_id: string;
  /** Fraction of eval entries passing with the original description (0..1). */
  before_pass_rate: number;
  /** Fraction of eval entries passing with the proposed description (0..1). */
  after_pass_rate: number;
  /** True when the proposal meets the improvement criteria (see validateProposal). */
  improved: boolean;
  regressions: EvalEntry[]; // passed before, fail after
  new_passes: EvalEntry[]; // failed before, pass after
  net_change: number; // after - before pass rate
}
25
+
26
+ // ---------------------------------------------------------------------------
27
+ // Prompt building
28
+ // ---------------------------------------------------------------------------
29
+
30
+ /** Build the trigger check prompt for the LLM. */
31
+ export function buildTriggerCheckPrompt(description: string, query: string): string {
32
+ return [
33
+ "Given this skill description, would the following user query trigger this skill?",
34
+ "Respond YES or NO only.",
35
+ "",
36
+ "Skill description:",
37
+ description,
38
+ "",
39
+ "User query:",
40
+ query,
41
+ ].join("\n");
42
+ }
43
+
44
+ // ---------------------------------------------------------------------------
45
+ // Response parsing
46
+ // ---------------------------------------------------------------------------
47
+
48
+ /** Parse YES/NO from LLM response. */
49
+ export function parseTriggerResponse(response: string): boolean {
50
+ const normalized = response.trim().toUpperCase();
51
+ if (normalized.startsWith("YES")) return true;
52
+ if (normalized.startsWith("NO")) return false;
53
+ return false; // conservative default
54
+ }
55
+
56
+ // ---------------------------------------------------------------------------
57
+ // Proposal validation
58
+ // ---------------------------------------------------------------------------
59
+
60
+ /** Validate a proposal by running trigger checks against the eval set. */
61
+ export async function validateProposal(
62
+ proposal: EvolutionProposal,
63
+ evalSet: EvalEntry[],
64
+ mode: "agent" | "api",
65
+ agent?: string,
66
+ ): Promise<ValidationResult> {
67
+ if (evalSet.length === 0) {
68
+ return {
69
+ proposal_id: proposal.proposal_id,
70
+ before_pass_rate: 0,
71
+ after_pass_rate: 0,
72
+ improved: false,
73
+ regressions: [],
74
+ new_passes: [],
75
+ net_change: 0,
76
+ };
77
+ }
78
+
79
+ const systemPrompt = "You are an evaluation assistant. Answer only YES or NO.";
80
+ const regressions: EvalEntry[] = [];
81
+ const newPasses: EvalEntry[] = [];
82
+ let beforePassed = 0;
83
+ let afterPassed = 0;
84
+
85
+ for (const entry of evalSet) {
86
+ // Check with original description
87
+ const beforePrompt = buildTriggerCheckPrompt(proposal.original_description, entry.query);
88
+ const beforeRaw = await callLlm(systemPrompt, beforePrompt, mode, agent);
89
+ const beforeTriggered = parseTriggerResponse(beforeRaw);
90
+ const beforePass =
91
+ (entry.should_trigger && beforeTriggered) || (!entry.should_trigger && !beforeTriggered);
92
+
93
+ // Check with proposed description
94
+ const afterPrompt = buildTriggerCheckPrompt(proposal.proposed_description, entry.query);
95
+ const afterRaw = await callLlm(systemPrompt, afterPrompt, mode, agent);
96
+ const afterTriggered = parseTriggerResponse(afterRaw);
97
+ const afterPass =
98
+ (entry.should_trigger && afterTriggered) || (!entry.should_trigger && !afterTriggered);
99
+
100
+ if (beforePass) beforePassed++;
101
+ if (afterPass) afterPassed++;
102
+
103
+ // Regression: passed before, fails after
104
+ if (beforePass && !afterPass) {
105
+ regressions.push(entry);
106
+ }
107
+
108
+ // New pass: failed before, passes after
109
+ if (!beforePass && afterPass) {
110
+ newPasses.push(entry);
111
+ }
112
+ }
113
+
114
+ const total = evalSet.length;
115
+ const beforePassRate = beforePassed / total;
116
+ const afterPassRate = afterPassed / total;
117
+ const netChange = afterPassRate - beforePassRate;
118
+
119
+ // A proposal is improved when ALL of:
120
+ // - after_pass_rate > before_pass_rate
121
+ // - regressions count < 5% of total eval entries
122
+ // - Either net improvement >= 0.10 OR new_passes.length >= 2
123
+ const improved =
124
+ afterPassRate > beforePassRate &&
125
+ regressions.length < total * 0.05 &&
126
+ (netChange >= 0.1 || newPasses.length >= 2);
127
+
128
+ return {
129
+ proposal_id: proposal.proposal_id,
130
+ before_pass_rate: beforePassRate,
131
+ after_pass_rate: afterPassRate,
132
+ improved,
133
+ regressions,
134
+ new_passes: newPasses,
135
+ net_change: netChange,
136
+ };
137
+ }
@@ -0,0 +1,459 @@
1
+ #!/usr/bin/env bun
2
+ /**
3
+ * grade-session.ts
4
+ *
5
+ * Rubric-based grader for Claude Code skill sessions.
6
+ * Migrated from grade_session.py.
7
+ *
8
+ * Two modes:
9
+ * 1. --use-agent (default when no ANTHROPIC_API_KEY) — invokes installed agent CLI
10
+ * 2. --use-api (default when ANTHROPIC_API_KEY set) — calls Anthropic API directly
11
+ */
12
+
13
+ import { mkdirSync, readFileSync, writeFileSync } from "node:fs";
14
+ import { dirname } from "node:path";
15
+ import { parseArgs } from "node:util";
16
+
17
+ import { TELEMETRY_LOG } from "../constants.js";
18
+ import type {
19
+ ExecutionMetrics,
20
+ GraderOutput,
21
+ GradingResult,
22
+ SessionTelemetryRecord,
23
+ } from "../types.js";
24
+ import { readJsonl } from "../utils/jsonl.js";
25
+ import {
26
+ detectAgent as _detectAgent,
27
+ stripMarkdownFences as _stripMarkdownFences,
28
+ callViaAgent,
29
+ callViaApi,
30
+ } from "../utils/llm-call.js";
31
+ import { readExcerpt } from "../utils/transcript.js";
32
+
33
+ // Re-export for backward compatibility
34
+ export { detectAgent, stripMarkdownFences } from "../utils/llm-call.js";
35
+
36
// ---------------------------------------------------------------------------
// Constants
// ---------------------------------------------------------------------------

/** Maximum number of transcript characters included in the grading prompt. */
export const MAX_TRANSCRIPT_LENGTH = 50000;

// ---------------------------------------------------------------------------
// Grader system prompt
// ---------------------------------------------------------------------------

/**
 * System prompt for the grading LLM: grade each expectation against process
 * telemetry and a transcript excerpt, and emit ONLY JSON matching the
 * GraderOutput shape (expectations, summary, claims, eval_feedback).
 */
export const GRADER_SYSTEM = `You are a rigorous skill session evaluator. You receive:
1. Expectations to grade (things that should be true)
2. Process telemetry: tool calls, bash commands, skills triggered, errors
3. A transcript excerpt showing what happened

Grade each expectation and output ONLY valid JSON matching this schema:
{
  "expectations": [
    {"text": "...", "passed": true/false, "evidence": "specific quote or metric"}
  ],
  "summary": {"passed": N, "failed": N, "total": N, "pass_rate": 0.0},
  "claims": [
    {"claim": "...", "type": "factual|process|quality", "verified": true/false, "evidence": "..."}
  ],
  "eval_feedback": {
    "suggestions": [{"assertion": "...", "reason": "..."}],
    "overall": "one sentence"
  }
}

Rules:
- PASS only when there is clear, specific evidence — not assumptions
- FAIL when evidence is absent or contradictory
- Cite exact quotes or specific metric values
- Extract 2-4 implicit claims from the transcript and verify them
- Suggest eval improvements only for clear gaps`;
72
+
73
+ // ---------------------------------------------------------------------------
74
+ // Data lookup helpers
75
+ // ---------------------------------------------------------------------------
76
+
77
+ export function findSession(
78
+ records: SessionTelemetryRecord[],
79
+ sessionId: string,
80
+ ): SessionTelemetryRecord | null {
81
+ for (let i = records.length - 1; i >= 0; i--) {
82
+ if (records[i].session_id === sessionId) return records[i];
83
+ }
84
+ return null;
85
+ }
86
+
87
+ export function latestSessionForSkill(
88
+ telemetry: SessionTelemetryRecord[],
89
+ skillName: string,
90
+ ): SessionTelemetryRecord | null {
91
+ for (let i = telemetry.length - 1; i >= 0; i--) {
92
+ if (telemetry[i].skills_triggered?.includes(skillName)) return telemetry[i];
93
+ }
94
+ return null;
95
+ }
96
+
97
/**
 * Load the expectations array for one eval (matched by `id`) from an
 * evals JSON file of shape `{ "evals": [{ "id": …, "expectations": […] }] }`.
 *
 * @param evalsJsonPath Path to the evals JSON file.
 * @param evalId        Numeric id of the eval entry to load.
 * @returns The entry's expectations (empty array when the field is absent).
 * @throws Error when the file is unreadable/unparseable, the document does
 *         not match the expected shape, or no entry has the given id.
 */
export function loadExpectationsFromEvalsJson(evalsJsonPath: string, evalId: number): string[] {
  let data: unknown;
  try {
    data = JSON.parse(readFileSync(evalsJsonPath, "utf-8"));
  } catch (err) {
    // Wrap both I/O and JSON syntax errors with the file path for context.
    throw new Error(
      `Failed to read or parse evals JSON at ${evalsJsonPath}: ${err instanceof Error ? err.message : String(err)}`,
    );
  }

  // The document root must be a plain object (not array/null/primitive).
  if (typeof data !== "object" || data === null || Array.isArray(data)) {
    throw new Error(
      `Invalid evals JSON at ${evalsJsonPath}: expected a top-level object, got ${Array.isArray(data) ? "array" : typeof data}`,
    );
  }

  const record = data as Record<string, unknown>;
  if (!Array.isArray(record.evals)) {
    throw new Error(
      `Invalid evals JSON at ${evalsJsonPath}: expected "evals" to be an array, got ${typeof record.evals}`,
    );
  }

  for (const ev of record.evals) {
    // Every entry must itself be a plain object.
    if (typeof ev !== "object" || ev === null || Array.isArray(ev)) {
      throw new Error(
        `Invalid eval entry in ${evalsJsonPath}: expected an object, got ${Array.isArray(ev) ? "array" : typeof ev}`,
      );
    }
    const entry = ev as Record<string, unknown>;
    // Strict equality: the entry's id must be the same number as evalId.
    if (entry.id === evalId) {
      // Missing/null expectations is legal and means "no expectations".
      if (entry.expectations === undefined || entry.expectations === null) {
        return [];
      }
      if (!Array.isArray(entry.expectations)) {
        throw new Error(
          `Invalid eval entry (id=${evalId}) in ${evalsJsonPath}: expected "expectations" to be an array, got ${typeof entry.expectations}`,
        );
      }
      // Each expectation must be a string; report the first offender by index.
      for (let i = 0; i < entry.expectations.length; i++) {
        if (typeof entry.expectations[i] !== "string") {
          throw new Error(
            `Invalid eval entry (id=${evalId}) in ${evalsJsonPath}: expectations[${i}] must be a string, got ${typeof entry.expectations[i]}`,
          );
        }
      }
      return entry.expectations as string[];
    }
  }
  throw new Error(`Eval ID ${evalId} not found in ${evalsJsonPath}`);
}
148
+
149
+ // ---------------------------------------------------------------------------
150
+ // Execution metrics
151
+ // ---------------------------------------------------------------------------
152
+
153
+ export function buildExecutionMetrics(telemetry: SessionTelemetryRecord): ExecutionMetrics {
154
+ return {
155
+ tool_calls: telemetry.tool_calls ?? {},
156
+ total_tool_calls: telemetry.total_tool_calls ?? 0,
157
+ total_steps: telemetry.assistant_turns ?? 0,
158
+ bash_commands_run: (telemetry.bash_commands ?? []).length,
159
+ errors_encountered: telemetry.errors_encountered ?? 0,
160
+ skills_triggered: telemetry.skills_triggered ?? [],
161
+ transcript_chars: telemetry.transcript_chars ?? 0,
162
+ };
163
+ }
164
+
165
+ // ---------------------------------------------------------------------------
166
+ // Prompt building
167
+ // ---------------------------------------------------------------------------
168
+
169
+ export function buildGradingPrompt(
170
+ expectations: string[],
171
+ telemetry: SessionTelemetryRecord,
172
+ transcriptExcerpt: string,
173
+ skillName: string,
174
+ ): string {
175
+ const toolSummary = JSON.stringify(telemetry.tool_calls ?? {}, null, 2);
176
+ const commands = telemetry.bash_commands ?? [];
177
+ const cmdSummary =
178
+ commands
179
+ .slice(0, 20)
180
+ .map((c) => ` $ ${c.slice(0, 120)}`)
181
+ .join("\n") || " (none)";
182
+
183
+ const expectationsList = expectations.map((e, i) => `${i + 1}. ${e}`).join("\n");
184
+
185
+ const excerpt =
186
+ transcriptExcerpt.length > MAX_TRANSCRIPT_LENGTH
187
+ ? transcriptExcerpt.slice(0, MAX_TRANSCRIPT_LENGTH)
188
+ : transcriptExcerpt;
189
+
190
+ return `Skill: ${skillName}
191
+
192
+ === PROCESS TELEMETRY ===
193
+ Skills triggered: ${JSON.stringify(telemetry.skills_triggered ?? [])}
194
+ Assistant turns: ${telemetry.assistant_turns ?? "?"}
195
+ Errors: ${telemetry.errors_encountered ?? "?"}
196
+ Total tool calls: ${telemetry.total_tool_calls ?? "?"}
197
+
198
+ Tool breakdown:
199
+ ${toolSummary}
200
+
201
+ Bash commands:
202
+ ${cmdSummary}
203
+
204
+ === TRANSCRIPT EXCERPT ===
205
+ ${excerpt}
206
+
207
+ === EXPECTATIONS ===
208
+ ${expectationsList}
209
+
210
+ Grade each expectation. Output JSON only.`;
211
+ }
212
+
213
+ // ---------------------------------------------------------------------------
214
+ // Grading via agent subprocess
215
+ // ---------------------------------------------------------------------------
216
+
217
+ export async function gradeViaAgent(prompt: string, agent: string): Promise<GraderOutput> {
218
+ const raw = await callViaAgent(GRADER_SYSTEM, prompt, agent);
219
+ try {
220
+ return JSON.parse(_stripMarkdownFences(raw)) as GraderOutput;
221
+ } catch (err) {
222
+ throw new Error(
223
+ `gradeViaAgent: failed to parse LLM output as JSON. Raw (truncated): ${raw.slice(0, 200)}`,
224
+ { cause: err },
225
+ );
226
+ }
227
+ }
228
+
229
+ // ---------------------------------------------------------------------------
230
+ // Grading via direct Anthropic API
231
+ // ---------------------------------------------------------------------------
232
+
233
+ export async function gradeViaApi(prompt: string): Promise<GraderOutput> {
234
+ const raw = await callViaApi(GRADER_SYSTEM, prompt);
235
+ try {
236
+ return JSON.parse(_stripMarkdownFences(raw)) as GraderOutput;
237
+ } catch (err) {
238
+ throw new Error(
239
+ `gradeViaApi: failed to parse LLM output as JSON. Raw (truncated): ${raw.slice(0, 200)}`,
240
+ { cause: err },
241
+ );
242
+ }
243
+ }
244
+
245
+ // ---------------------------------------------------------------------------
246
+ // Result assembly
247
+ // ---------------------------------------------------------------------------
248
+
249
+ export function assembleResult(
250
+ graderOutput: GraderOutput,
251
+ telemetry: SessionTelemetryRecord,
252
+ sessionId: string,
253
+ skillName: string,
254
+ transcriptPath: string,
255
+ ): GradingResult {
256
+ return {
257
+ session_id: sessionId ?? "unknown",
258
+ skill_name: skillName ?? "unknown",
259
+ transcript_path: transcriptPath ?? "",
260
+ graded_at: new Date().toISOString(),
261
+ expectations: graderOutput?.expectations ?? [],
262
+ summary: graderOutput?.summary ?? { passed: 0, failed: 0, total: 0, pass_rate: 0 },
263
+ execution_metrics: buildExecutionMetrics(telemetry ?? ({} as SessionTelemetryRecord)),
264
+ claims: graderOutput?.claims ?? [],
265
+ eval_feedback: graderOutput?.eval_feedback ?? { suggestions: [], overall: "" },
266
+ };
267
+ }
268
+
269
+ // ---------------------------------------------------------------------------
270
+ // Summary printer
271
+ // ---------------------------------------------------------------------------
272
+
273
+ function printSummary(result: GradingResult): void {
274
+ const { summary } = result;
275
+ const rate = summary.pass_rate ?? 0;
276
+ console.log(`\nResults: ${summary.passed}/${summary.total} passed (${Math.round(rate * 100)}%)`);
277
+ for (const exp of result.expectations ?? []) {
278
+ const icon = exp.passed ? "\u2713" : "\u2717";
279
+ console.log(` ${icon} ${String(exp.text ?? "").slice(0, 70)}`);
280
+ if (!exp.passed) {
281
+ console.log(` -> ${String(exp.evidence ?? "").slice(0, 100)}`);
282
+ }
283
+ }
284
+
285
+ const feedback = result.eval_feedback;
286
+ if (feedback.suggestions?.length) {
287
+ console.log(`\nEval feedback: ${feedback.overall}`);
288
+ for (const s of feedback.suggestions) {
289
+ console.log(` * ${String(s.reason ?? "").slice(0, 100)}`);
290
+ }
291
+ }
292
+ }
293
+
294
+ // ---------------------------------------------------------------------------
295
+ // CLI entry point
296
+ // ---------------------------------------------------------------------------
297
+
298
/**
 * CLI entry point: parse flags, pick a grading mode, resolve expectations
 * and session telemetry, run the grader, and write the JSON result to disk.
 * Exits the process with code 1 on any unrecoverable configuration error.
 */
export async function cliMain(): Promise<void> {
  const { values } = parseArgs({
    options: {
      skill: { type: "string" },
      expectations: { type: "string", multiple: true },
      "evals-json": { type: "string" },
      "eval-id": { type: "string" },
      "session-id": { type: "string" },
      transcript: { type: "string" },
      "telemetry-log": { type: "string", default: TELEMETRY_LOG },
      output: { type: "string", default: "grading.json" },
      "use-agent": { type: "boolean", default: false },
      "use-api": { type: "boolean", default: false },
      agent: { type: "string" },
      "show-transcript": { type: "boolean", default: false },
    },
    strict: true,
  });

  const skill = values.skill;
  if (!skill) {
    console.error("[ERROR] --skill is required");
    process.exit(1);
  }

  // --- Determine mode ---
  // Precedence: explicit --use-api / --use-agent flags, then any installed
  // agent CLI, then the direct API when ANTHROPIC_API_KEY is set.
  const hasApiKey = Boolean(process.env.ANTHROPIC_API_KEY);
  let mode: "agent" | "api";
  let agent: string | null = null;

  if (values["use-api"]) {
    mode = "api";
  } else if (values["use-agent"]) {
    mode = "agent";
  } else {
    const availableAgent = _detectAgent();
    if (availableAgent) {
      mode = "agent";
    } else if (hasApiKey) {
      mode = "api";
    } else {
      console.error(
        "[ERROR] No agent CLI (claude/codex/opencode) found in PATH " +
          "and ANTHROPIC_API_KEY not set.\n" +
          "Install Claude Code, Codex, or OpenCode, or set ANTHROPIC_API_KEY.",
      );
      process.exit(1);
    }
  }

  if (mode === "agent") {
    // NOTE(review): an unrecognized --agent value is silently ignored here
    // and replaced by auto-detection — consider warning the user instead.
    const validAgents = ["claude", "codex", "opencode"];
    if (values.agent && validAgents.includes(values.agent)) {
      agent = values.agent;
    } else {
      agent = _detectAgent();
    }
    if (!agent) {
      console.error(
        "[ERROR] --use-agent specified but no agent found in PATH.\n" +
          "Install claude, codex, or opencode, or use --use-api instead.",
      );
      process.exit(1);
    }
    console.error(`[INFO] Grading via agent: ${agent}`);
  } else {
    console.error("[INFO] Grading via direct Anthropic API");
  }

  // --- Resolve expectations ---
  // Either an evals JSON file plus eval id, or one/more --expectations flags.
  let expectations: string[] = [];
  if (values["evals-json"] && values["eval-id"] != null) {
    const evalIdNum = Number(values["eval-id"]);
    if (!Number.isFinite(evalIdNum) || !Number.isInteger(evalIdNum)) {
      console.error(`[ERROR] --eval-id must be a finite integer, got: ${values["eval-id"]}`);
      process.exit(1);
    }
    expectations = loadExpectationsFromEvalsJson(values["evals-json"], evalIdNum);
  } else if (values.expectations?.length) {
    expectations = values.expectations;
  } else {
    console.error("[ERROR] Provide --expectations or --evals-json + --eval-id");
    process.exit(1);
  }

  // --- Resolve session ---
  // Lookup precedence: --transcript path match, then --session-id, then the
  // most recent session that triggered this skill.
  let telemetry = {} as SessionTelemetryRecord;
  let transcriptPath = "";
  let sessionId = "unknown";

  const telemetryLog = values["telemetry-log"] ?? TELEMETRY_LOG;
  const telRecords = readJsonl<SessionTelemetryRecord>(telemetryLog);

  if (values.transcript) {
    transcriptPath = values.transcript;
    // Scan backwards so the newest matching record wins.
    for (let i = telRecords.length - 1; i >= 0; i--) {
      if (telRecords[i].transcript_path === transcriptPath) {
        telemetry = telRecords[i];
        sessionId = telRecords[i].session_id ?? "unknown";
        break;
      }
    }
  } else if (values["session-id"]) {
    sessionId = values["session-id"];
    telemetry = findSession(telRecords, sessionId) ?? ({} as SessionTelemetryRecord);
    transcriptPath = telemetry.transcript_path ?? "";
  } else {
    telemetry = latestSessionForSkill(telRecords, skill) ?? ({} as SessionTelemetryRecord);
    if (telemetry.session_id) {
      sessionId = telemetry.session_id;
      transcriptPath = telemetry.transcript_path ?? "";
      console.error(`[INFO] Grading most recent '${skill}' session: ${sessionId}`);
    } else {
      // Grading proceeds with empty telemetry; the grader only sees "?" fields.
      console.error(`[WARN] No telemetry for skill '${skill}'. Is session_stop_hook installed?`);
    }
  }

  const transcriptExcerpt = transcriptPath ? readExcerpt(transcriptPath) : "(no transcript)";

  if (values["show-transcript"]) {
    console.log("=== TRANSCRIPT EXCERPT ===");
    console.log(transcriptExcerpt);
    console.log("==========================\n");
  }

  // --- Build prompt and grade ---
  const prompt = buildGradingPrompt(expectations, telemetry, transcriptExcerpt, skill);

  console.error(`Grading ${expectations.length} expectations for skill '${skill}'...`);

  let graderOutput: GraderOutput;
  try {
    if (mode === "agent") {
      graderOutput = await gradeViaAgent(prompt, agent as string);
    } else {
      graderOutput = await gradeViaApi(prompt);
    }
  } catch (e) {
    console.error(`[ERROR] Grading failed: ${e}`);
    process.exit(1);
  }

  const result = assembleResult(graderOutput, telemetry, sessionId, skill, transcriptPath);

  // Ensure the output directory exists before writing the result file.
  const outputPath = values.output ?? "grading.json";
  const outputDir = dirname(outputPath);
  if (outputDir !== ".") {
    mkdirSync(outputDir, { recursive: true });
  }
  writeFileSync(outputPath, JSON.stringify(result, null, 2), "utf-8");

  printSummary(result);
  console.log(`\nWrote ${outputPath}`);
}
452
+
453
// Guard: only run when invoked directly
// (import.meta.main is true when this file is the process entry point.)
if (import.meta.main) {
  cliMain().catch((err) => {
    // Surface unexpected failures and exit nonzero so callers can detect them.
    console.error(`[FATAL] ${err}`);
    process.exit(1);
  });
}
@@ -0,0 +1,52 @@
1
+ #!/usr/bin/env bun
2
+ /**
3
+ * Claude Code UserPromptSubmit hook: prompt-log.ts
4
+ *
5
+ * Fires on every user message before Claude processes it.
6
+ * Logs the query to ~/.claude/all_queries_log.jsonl so that
7
+ * hooks-to-evals can identify prompts that did NOT trigger
8
+ * a skill — the raw material for false-negative eval entries.
9
+ */
10
+
11
+ import { QUERY_LOG, SKIP_PREFIXES } from "../constants.js";
12
+ import type { PromptSubmitPayload, QueryLogRecord } from "../types.js";
13
+ import { appendJsonl } from "../utils/jsonl.js";
14
+
15
+ /**
16
+ * Core processing logic, exported for testability.
17
+ * Returns the record that was appended, or null if skipped.
18
+ */
19
+ export function processPrompt(
20
+ payload: PromptSubmitPayload,
21
+ logPath: string = QUERY_LOG,
22
+ ): QueryLogRecord | null {
23
+ const query = (payload.user_prompt ?? "").trim();
24
+
25
+ if (!query) return null;
26
+
27
+ // Skip automated/tool messages
28
+ if (SKIP_PREFIXES.some((p) => query.startsWith(p))) return null;
29
+
30
+ // Skip very short noise (single chars, punctuation)
31
+ if (query.length < 4) return null;
32
+
33
+ const record: QueryLogRecord = {
34
+ timestamp: new Date().toISOString(),
35
+ session_id: payload.session_id ?? "unknown",
36
+ query,
37
+ };
38
+
39
+ appendJsonl(logPath, record);
40
+ return record;
41
+ }
42
+
43
+ // --- stdin main (only when executed directly, not when imported) ---
44
+ if (import.meta.main) {
45
+ try {
46
+ const payload: PromptSubmitPayload = JSON.parse(await Bun.stdin.text());
47
+ processPrompt(payload);
48
+ } catch {
49
+ // silent — hooks must never block Claude
50
+ }
51
+ process.exit(0);
52
+ }
@@ -0,0 +1,54 @@
1
+ #!/usr/bin/env bun
2
+ /**
3
+ * Claude Code Stop hook: session-stop.ts
4
+ *
5
+ * Fires when a Claude Code session ends. Reads the session's transcript JSONL
6
+ * and extracts process-level telemetry (tool calls, errors, skills triggered, etc).
7
+ * Appends one record per session to ~/.claude/session_telemetry_log.jsonl.
8
+ */
9
+
10
+ import { TELEMETRY_LOG } from "../constants.js";
11
+ import type { SessionTelemetryRecord, StopPayload } from "../types.js";
12
+ import { appendJsonl } from "../utils/jsonl.js";
13
+ import { parseTranscript } from "../utils/transcript.js";
14
+
15
+ /**
16
+ * Core processing logic, exported for testability.
17
+ * Returns the record that was appended.
18
+ */
19
+ export function processSessionStop(
20
+ payload: StopPayload,
21
+ logPath: string = TELEMETRY_LOG,
22
+ ): SessionTelemetryRecord {
23
+ const sessionId = typeof payload.session_id === "string" ? payload.session_id : "unknown";
24
+ const transcriptPath = typeof payload.transcript_path === "string" ? payload.transcript_path : "";
25
+ const cwd = typeof payload.cwd === "string" ? payload.cwd : "";
26
+
27
+ const metrics = parseTranscript(transcriptPath);
28
+
29
+ const record: SessionTelemetryRecord = {
30
+ timestamp: new Date().toISOString(),
31
+ session_id: sessionId,
32
+ cwd,
33
+ transcript_path: transcriptPath,
34
+ source: "claude_code",
35
+ ...metrics,
36
+ };
37
+
38
+ appendJsonl(logPath, record);
39
+ return record;
40
+ }
41
+
42
// --- stdin main (only when executed directly, not when imported) ---
if (import.meta.main) {
  try {
    // The Stop hook payload arrives as JSON on stdin.
    const payload: StopPayload = JSON.parse(await Bun.stdin.text());
    processSessionStop(payload);
  } catch (err) {
    // silent — hooks must never block Claude
    if (process.env.DEBUG || process.env.NODE_ENV === "development") {
      console.error("session-stop hook failed:", err);
    }
  }
  // Always exit 0 so the hook can never fail the session.
  process.exit(0);
}