agent-gauntlet 0.9.0 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. package/package.json +4 -4
  2. package/src/built-in-reviews/code-quality.md +25 -0
  3. package/src/built-in-reviews/index.ts +28 -0
  4. package/src/cli-adapters/claude.ts +229 -29
  5. package/src/cli-adapters/codex.ts +189 -22
  6. package/src/cli-adapters/cursor.ts +8 -0
  7. package/src/cli-adapters/gemini.ts +414 -59
  8. package/src/cli-adapters/github-copilot.ts +8 -0
  9. package/src/cli-adapters/index.ts +14 -0
  10. package/src/cli-adapters/thinking-budget.ts +23 -0
  11. package/src/commands/check.ts +18 -19
  12. package/src/commands/clean.ts +5 -4
  13. package/src/commands/detect.ts +104 -29
  14. package/src/commands/init.ts +1314 -322
  15. package/src/commands/review.ts +18 -19
  16. package/src/commands/shared.ts +60 -39
  17. package/src/commands/stop-hook.ts +7 -4
  18. package/src/commands/wait-ci.ts +2 -2
  19. package/src/config/loader.ts +28 -12
  20. package/src/config/schema.ts +28 -7
  21. package/src/config/types.ts +2 -0
  22. package/src/config/validator.ts +8 -8
  23. package/src/core/change-detector.ts +38 -32
  24. package/src/core/run-executor.ts +50 -46
  25. package/src/core/runner.ts +50 -24
  26. package/src/gates/check.ts +3 -14
  27. package/src/gates/resolve-check-command.ts +21 -0
  28. package/src/gates/result.ts +1 -0
  29. package/src/gates/review.ts +44 -5
  30. package/src/hooks/stop-hook-handler.ts +56 -20
  31. package/src/output/app-logger.ts +1 -1
  32. package/src/output/console.ts +3 -3
  33. package/src/output/sinks/file-sink.ts +2 -2
  34. package/src/scripts/status.ts +433 -0
  35. package/src/utils/debug-log.ts +55 -3
  36. package/src/utils/diff-parser.ts +63 -48
  37. package/src/utils/execution-state.ts +22 -0
  38. package/src/templates/fix_pr.template.md +0 -12
  39. package/src/templates/push_pr.template.md +0 -9
  40. package/src/templates/run_gauntlet.template.md +0 -41
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "agent-gauntlet",
3
- "version": "0.9.0",
3
+ "version": "0.10.0",
4
4
  "description": "A CLI tool for testing AI coding agents",
5
5
  "license": "Apache-2.0",
6
6
  "author": "Paul Caplan",
@@ -35,10 +35,10 @@
35
35
  "build": "bun build --compile --minify --sourcemap ./src/index.ts --outfile bin/agent-gauntlet",
36
36
  "test": "bun test",
37
37
  "lint": "biome check src",
38
+ "typecheck": "tsc --noEmit && tsc --noEmit -p test/tsconfig.json",
38
39
  "changeset": "changeset",
39
40
  "version": "changeset version",
40
- "release": "npm publish",
41
- "gen-changeset": "bun scripts/gen-changeset.ts"
41
+ "release": "changeset publish"
42
42
  },
43
43
  "devDependencies": {
44
44
  "@biomejs/biome": "^2.3.11",
@@ -57,4 +57,4 @@
57
57
  "yaml": "^2.8.2",
58
58
  "zod": "^4.3.5"
59
59
  }
60
- }
60
+ }
package/src/built-in-reviews/code-quality.md ADDED
@@ -0,0 +1,25 @@
1
+ # Code Quality Review
2
+
3
+ You are a senior software engineer performing a code review. Your primary goal is to identify **real problems** that could cause bugs, security vulnerabilities, or performance issues in production. Do not report style, formatting, naming conventions, or maintainability suggestions unless you see something egregious.
4
+
5
+ ## Focus Areas (in priority order)
6
+
7
+ 1. **Bugs** — Logic errors, null/undefined issues, race conditions, unhandled edge cases, resource leaks
8
+ 2. **Security** — Injection vulnerabilities, auth/authz flaws, sensitive data exposure, input validation gaps
9
+ 3. **Performance** — Algorithmic complexity issues, N+1 queries, blocking operations, memory problems
10
+ 4. **Maintainability** — Unclear code, missing error handling, duplication
11
+
12
+ ## Do NOT Report
13
+
14
+ - Style, formatting, or naming preferences
15
+ - Missing documentation, comments, or type annotations
16
+ - Suggestions for "better" abstractions or patterns that aren't broken
17
+ - Hypothetical issues that require unlikely preconditions
18
+ - Issues in code that wasn't changed in this diff
19
+
20
+ ## Guidelines
21
+
22
+ - **Threshold**: only report issues you would block a PR over
23
+ - Explain **why** each issue is a problem with a concrete failure scenario
24
+ - Provide a **concrete fix** with corrected code
25
+ - If the status quo works correctly, it's not a violation
package/src/built-in-reviews/index.ts ADDED
@@ -0,0 +1,28 @@
1
+ // @ts-expect-error Bun text import
2
+ import codeQualityContent from "./code-quality.md" with { type: "text" };
3
+
4
+ const BUILT_IN_PREFIX = "built-in:";
5
+
6
+ const builtInSources: Record<string, string> = {
7
+ "code-quality": codeQualityContent,
8
+ };
9
+
10
+ /**
11
+ * Check if a review name uses the built-in prefix.
12
+ */
13
+ export function isBuiltInReview(name: string): boolean {
14
+ return name.startsWith(BUILT_IN_PREFIX);
15
+ }
16
+
17
+ /**
18
+ * Load a built-in review prompt by name. Returns the raw markdown content.
19
+ */
20
+ export function loadBuiltInReview(name: string): string {
21
+ const source = builtInSources[name];
22
+
23
+ if (!source) {
24
+ throw new Error(`Unknown built-in review: "${name}"`);
25
+ }
26
+
27
+ return source;
28
+ }
package/src/cli-adapters/claude.ts CHANGED
@@ -4,11 +4,194 @@ import os from "node:os";
4
4
  import path from "node:path";
5
5
  import { promisify } from "node:util";
6
6
  import { GAUNTLET_STOP_HOOK_ACTIVE_ENV } from "../commands/stop-hook.js";
7
+ import { getDebugLogger } from "../utils/debug-log.js";
7
8
  import { type CLIAdapter, runStreamingCommand } from "./index.js";
9
+ import { CLAUDE_THINKING_TOKENS } from "./thinking-budget.js";
8
10
 
9
11
  const execAsync = promisify(exec);
10
12
  const MAX_BUFFER_BYTES = 10 * 1024 * 1024;
11
13
 
14
+ // Matches OTel console exporter metric blocks dumped to stdout at process exit.
15
+ // Requires `descriptor`, `dataPointType`, and `dataPoints` fields which are
16
+ // unique to OTel SDK output and won't appear in normal code review content.
17
+ // Optionally matches [otel] prefix that some exporters add.
18
+ const OTEL_METRIC_BLOCK_RE =
19
+ /(?:\[otel\]\s*)?\{\s*\n\s*descriptor:\s*\{[\s\S]*?dataPointType:\s*\d+[\s\S]*?dataPoints:\s*\[[\s\S]*?\]\s*,?\s*\n\}/g;
20
+
21
+ interface OtelUsage {
22
+ cost?: number;
23
+ input?: number;
24
+ output?: number;
25
+ cacheRead?: number;
26
+ cacheCreation?: number;
27
+ toolCalls?: number;
28
+ toolContentBytes?: number;
29
+ apiRequests?: number;
30
+ }
31
+
32
+ const TOKEN_TYPES = ["input", "output", "cacheRead", "cacheCreation"] as const;
33
+
34
+ function parseCostBlock(block: string): number | undefined {
35
+ const match = block.match(/value:\s*([\d.]+)/);
36
+ return match ? Number.parseFloat(match[1]!) : undefined;
37
+ }
38
+
39
+ function parseTokenBlock(block: string): Partial<OtelUsage> {
40
+ const result: Partial<OtelUsage> = {};
41
+ const re = /type:\s*"(\w+)"[\s\S]*?value:\s*(\d+)(?:,|\s*\})/g;
42
+ for (const match of block.matchAll(re)) {
43
+ const type = match[1]! as (typeof TOKEN_TYPES)[number];
44
+ if (TOKEN_TYPES.includes(type)) {
45
+ result[type] = Number.parseInt(match[2]!, 10);
46
+ }
47
+ }
48
+ return result;
49
+ }
50
+
51
+ function parseOtelMetrics(blocks: string[]): OtelUsage {
52
+ const usage: OtelUsage = {};
53
+ for (const block of blocks) {
54
+ const nameMatch = block.match(/name:\s*"([^"]+)"/);
55
+ if (!nameMatch) continue;
56
+
57
+ if (nameMatch[1] === "claude_code.cost.usage") {
58
+ usage.cost = parseCostBlock(block);
59
+ } else if (nameMatch[1] === "claude_code.token.usage") {
60
+ Object.assign(usage, parseTokenBlock(block));
61
+ }
62
+ }
63
+ return usage;
64
+ }
65
+
66
+ // Matches OTel console log exporter event records emitted by Claude Code.
67
+ // The Node.js SDK console exporter uses util.inspect() format with unquoted keys
68
+ // and single-quoted strings. Blocks start with `resource:` and contain a `body:`
69
+ // field with the event name (e.g. 'claude_code.tool_result').
70
+ const OTEL_LOG_BLOCK_RE =
71
+ /\{\s*\n\s*resource:\s*\{[\s\S]*?body:\s*'claude_code\.\w+'[\s\S]*?\n\}/g;
72
+
73
+ /** Pre-compiled regexes for extracting single-quoted attribute values from OTel log blocks. */
74
+ const OTEL_ATTR_RE = {
75
+ body: /body:\s*'([^']*)'/,
76
+ tool_result_size_bytes: /tool_result_size_bytes:\s*'([^']*)'/,
77
+ input_tokens: /input_tokens:\s*'([^']*)'/,
78
+ output_tokens: /output_tokens:\s*'([^']*)'/,
79
+ cache_read_tokens: /cache_read_tokens:\s*'([^']*)'/,
80
+ cache_creation_tokens: /cache_creation_tokens:\s*'([^']*)'/,
81
+ cost_usd: /cost_usd:\s*'([^']*)'/,
82
+ } as const;
83
+
84
+ /** Maps OTel api_request attribute regexes to OtelUsage fields. */
85
+ const API_REQUEST_FIELDS: Array<[RegExp, keyof OtelUsage]> = [
86
+ [OTEL_ATTR_RE.input_tokens, "input"],
87
+ [OTEL_ATTR_RE.output_tokens, "output"],
88
+ [OTEL_ATTR_RE.cache_read_tokens, "cacheRead"],
89
+ [OTEL_ATTR_RE.cache_creation_tokens, "cacheCreation"],
90
+ [OTEL_ATTR_RE.cost_usd, "cost"],
91
+ ];
92
+
93
+ /** Accumulate a tool_result log block into usage. */
94
+ function accumulateToolResult(block: string, usage: OtelUsage): void {
95
+ usage.toolCalls = (usage.toolCalls || 0) + 1;
96
+ const bytes = block.match(OTEL_ATTR_RE.tool_result_size_bytes)?.[1];
97
+ if (bytes !== undefined) {
98
+ usage.toolContentBytes = (usage.toolContentBytes || 0) + Number(bytes);
99
+ }
100
+ }
101
+
102
+ /** Accumulate an api_request log block into usage. */
103
+ function accumulateApiRequest(block: string, usage: OtelUsage): void {
104
+ usage.apiRequests = (usage.apiRequests || 0) + 1;
105
+ for (const [re, field] of API_REQUEST_FIELDS) {
106
+ const val = block.match(re)?.[1];
107
+ if (val !== undefined) {
108
+ usage[field] = (usage[field] || 0) + Number(val);
109
+ }
110
+ }
111
+ }
112
+
113
+ /** Accumulate tool_result and api_request event data from OTel log blocks. */
114
+ function parseOtelLogEvents(raw: string, usage: OtelUsage): void {
115
+ const blocks = raw.match(OTEL_LOG_BLOCK_RE);
116
+ if (!blocks) return;
117
+ for (const block of blocks) {
118
+ const body = block.match(OTEL_ATTR_RE.body)?.[1];
119
+ if (body === "claude_code.tool_result") {
120
+ accumulateToolResult(block, usage);
121
+ } else if (body === "claude_code.api_request") {
122
+ accumulateApiRequest(block, usage);
123
+ }
124
+ }
125
+ }
126
+
127
+ const OTEL_SUMMARY_FIELDS: Array<[keyof OtelUsage, string]> = [
128
+ ["input", "in"],
129
+ ["output", "out"],
130
+ ["cacheRead", "cacheRead"],
131
+ ["cacheCreation", "cacheWrite"],
132
+ ["toolCalls", "tool_calls"],
133
+ ["toolContentBytes", "tool_content_bytes"],
134
+ ["apiRequests", "api_requests"],
135
+ ];
136
+
137
+ function formatOtelSummary(usage: OtelUsage): string | null {
138
+ if (usage.cost === undefined && usage.input === undefined) return null;
139
+
140
+ const parts: string[] = [];
141
+ if (usage.cost !== undefined) parts.push(`cost=$${usage.cost.toFixed(4)}`);
142
+ for (const [key, label] of OTEL_SUMMARY_FIELDS) {
143
+ if (usage[key] !== undefined) parts.push(`${label}=${usage[key]}`);
144
+ }
145
+
146
+ return `[otel] ${parts.join(" ")}`;
147
+ }
148
+
149
+ function extractOtelMetrics(
150
+ raw: string,
151
+ onLog?: (msg: string) => void,
152
+ ): string {
153
+ const metricBlocks = raw.match(OTEL_METRIC_BLOCK_RE);
154
+ const usage = metricBlocks ? parseOtelMetrics(metricBlocks) : {};
155
+
156
+ // Also parse log events for tool call and API request counts
157
+ parseOtelLogEvents(raw, usage);
158
+
159
+ const summary = formatOtelSummary(usage);
160
+ if (summary) {
161
+ onLog?.(`\n${summary}\n`);
162
+ process.stderr.write(`${summary}\n`);
163
+ getDebugLogger()?.logTelemetry({ adapter: "claude", summary });
164
+ }
165
+
166
+ return raw
167
+ .replace(OTEL_METRIC_BLOCK_RE, "")
168
+ .replace(OTEL_LOG_BLOCK_RE, "")
169
+ .trimEnd();
170
+ }
171
+
172
+ /** Build OTel environment overrides for console export. */
173
+ function buildOtelEnv(): Record<string, string> {
174
+ const env: Record<string, string> = {};
175
+ if (!process.env.CLAUDE_CODE_ENABLE_TELEMETRY) {
176
+ env.CLAUDE_CODE_ENABLE_TELEMETRY = "1";
177
+ }
178
+ if (!process.env.OTEL_METRICS_EXPORTER) {
179
+ env.OTEL_METRICS_EXPORTER = "console";
180
+ }
181
+ if (!process.env.OTEL_LOGS_EXPORTER) {
182
+ env.OTEL_LOGS_EXPORTER = "console";
183
+ }
184
+ return env;
185
+ }
186
+
187
+ /** Strip OTel metric and log blocks from raw output. */
188
+ function stripOtelBlocks(raw: string): string {
189
+ return raw
190
+ .replace(OTEL_METRIC_BLOCK_RE, "")
191
+ .replace(OTEL_LOG_BLOCK_RE, "")
192
+ .trimEnd();
193
+ }
194
+
12
195
  export class ClaudeAdapter implements CLIAdapter {
13
196
  name = "claude";
14
197
 
@@ -43,21 +226,26 @@ export class ClaudeAdapter implements CLIAdapter {
43
226
  }
44
227
 
45
228
  getUserCommandDir(): string | null {
46
- // Claude supports user-level commands at ~/.claude/commands
47
229
  return path.join(os.homedir(), ".claude", "commands");
48
230
  }
49
231
 
232
+ getProjectSkillDir(): string | null {
233
+ return ".claude/skills";
234
+ }
235
+
236
+ getUserSkillDir(): string | null {
237
+ return path.join(os.homedir(), ".claude", "skills");
238
+ }
239
+
50
240
  getCommandExtension(): string {
51
241
  return ".md";
52
242
  }
53
243
 
54
244
  canUseSymlink(): boolean {
55
- // Claude uses the same Markdown format as our canonical file
56
245
  return true;
57
246
  }
58
247
 
59
248
  transformCommand(markdownContent: string): string {
60
- // Claude uses the same Markdown format, no transformation needed
61
249
  return markdownContent;
62
250
  }
63
251
 
@@ -67,59 +255,71 @@ export class ClaudeAdapter implements CLIAdapter {
67
255
  model?: string;
68
256
  timeoutMs?: number;
69
257
  onOutput?: (chunk: string) => void;
258
+ allowToolUse?: boolean;
259
+ thinkingBudget?: string;
70
260
  }): Promise<string> {
71
261
  const fullContent = `${opts.prompt}\n\n--- DIFF ---\n${opts.diff}`;
72
262
 
73
263
  const tmpDir = os.tmpdir();
74
- // Include process.pid for uniqueness across concurrent processes
75
264
  const tmpFile = path.join(
76
265
  tmpDir,
77
266
  `gauntlet-claude-${process.pid}-${Date.now()}.txt`,
78
267
  );
79
268
  await fs.writeFile(tmpFile, fullContent);
80
269
 
81
- // Recommended invocation per spec:
82
- // -p: non-interactive print mode
83
- // --allowedTools: explicitly restricts to read-only tools
84
- // --max-turns: caps agentic turns
85
- const args = [
86
- "-p",
87
- "--allowedTools",
88
- "Read,Glob,Grep",
89
- "--max-turns",
90
- "10",
91
- ];
270
+ const args = ["-p"];
271
+ if (opts.allowToolUse === false) {
272
+ args.push("--tools", "");
273
+ } else {
274
+ args.push("--allowedTools", "Read,Glob,Grep");
275
+ }
276
+ args.push("--max-turns", "10");
277
+
278
+ const otelEnv = buildOtelEnv();
279
+ const thinkingEnv: Record<string, string> = {};
280
+ if (opts.thinkingBudget && opts.thinkingBudget in CLAUDE_THINKING_TOKENS) {
281
+ thinkingEnv.MAX_THINKING_TOKENS = String(
282
+ CLAUDE_THINKING_TOKENS[opts.thinkingBudget],
283
+ );
284
+ }
92
285
 
93
286
  const cleanup = () => fs.unlink(tmpFile).catch(() => {});
287
+ const execEnv = {
288
+ ...process.env,
289
+ [GAUNTLET_STOP_HOOK_ACTIVE_ENV]: "1",
290
+ ...otelEnv,
291
+ ...thinkingEnv,
292
+ };
94
293
 
95
- // If onOutput callback is provided, use spawn for real-time streaming
96
294
  if (opts.onOutput) {
97
- return runStreamingCommand({
295
+ const outputBuffer: string[] = [];
296
+ const raw = await runStreamingCommand({
98
297
  command: "claude",
99
298
  args,
100
299
  tmpFile,
101
300
  timeoutMs: opts.timeoutMs,
102
- onOutput: opts.onOutput,
103
- cleanup,
104
- env: {
105
- ...process.env,
106
- [GAUNTLET_STOP_HOOK_ACTIVE_ENV]: "1",
301
+ onOutput: (chunk: string) => {
302
+ outputBuffer.push(chunk);
107
303
  },
304
+ cleanup,
305
+ env: execEnv,
108
306
  });
307
+ const cleanedOutput = extractOtelMetrics(
308
+ outputBuffer.join(""),
309
+ opts.onOutput,
310
+ );
311
+ opts.onOutput(cleanedOutput);
312
+ return stripOtelBlocks(raw);
109
313
  }
110
314
 
111
- // Otherwise use exec for buffered output
112
315
  try {
113
- const cmd = `cat "${tmpFile}" | claude -p --allowedTools "Read,Glob,Grep" --max-turns 10`;
316
+ const cmd = `cat "${tmpFile}" | claude ${args.map((a) => (a === "" ? '""' : a)).join(" ")}`;
114
317
  const { stdout } = await execAsync(cmd, {
115
318
  timeout: opts.timeoutMs,
116
319
  maxBuffer: MAX_BUFFER_BYTES,
117
- env: {
118
- ...process.env,
119
- [GAUNTLET_STOP_HOOK_ACTIVE_ENV]: "1",
120
- },
320
+ env: execEnv,
121
321
  });
122
- return stdout;
322
+ return extractOtelMetrics(stdout);
123
323
  } finally {
124
324
  await cleanup();
125
325
  }
package/src/cli-adapters/codex.ts CHANGED
@@ -3,11 +3,157 @@ import fs from "node:fs/promises";
3
3
  import os from "node:os";
4
4
  import path from "node:path";
5
5
  import { promisify } from "node:util";
6
+ import { getDebugLogger } from "../utils/debug-log.js";
6
7
  import { type CLIAdapter, runStreamingCommand } from "./index.js";
8
+ import { CODEX_REASONING_EFFORT } from "./thinking-budget.js";
7
9
 
8
10
  const execAsync = promisify(exec);
9
11
  const MAX_BUFFER_BYTES = 10 * 1024 * 1024;
10
12
 
13
+ interface CodexUsage {
14
+ inputTokens?: number;
15
+ cachedInputTokens?: number;
16
+ outputTokens?: number;
17
+ toolCalls?: number;
18
+ apiRequests?: number;
19
+ }
20
+
21
+ /** Parse a single JSONL line into a typed event, or undefined on failure. */
22
+ function parseJsonlLine(
23
+ line: string,
24
+ ): { type: string; [key: string]: unknown } | undefined {
25
+ try {
26
+ const obj = JSON.parse(line);
27
+ if (obj && typeof obj.type === "string") return obj;
28
+ } catch {
29
+ /* skip malformed lines */
30
+ }
31
+ return undefined;
32
+ }
33
+
34
+ /** Maps Codex turn usage JSON fields to CodexUsage fields. */
35
+ const TURN_USAGE_MAP: Array<[string, keyof CodexUsage]> = [
36
+ ["input_tokens", "inputTokens"],
37
+ ["cached_input_tokens", "cachedInputTokens"],
38
+ ["output_tokens", "outputTokens"],
39
+ ];
40
+
41
+ /** Accumulate a turn.completed event's usage into totals. */
42
+ function accumulateTurnUsage(
43
+ event: { type: string; [key: string]: unknown },
44
+ usage: CodexUsage,
45
+ ): void {
46
+ const u = event.usage as Record<string, number | undefined> | undefined;
47
+ if (!u) return;
48
+ usage.apiRequests = (usage.apiRequests || 0) + 1;
49
+ for (const [jsonKey, usageKey] of TURN_USAGE_MAP) {
50
+ if (u[jsonKey] !== undefined) {
51
+ usage[usageKey] = (usage[usageKey] || 0) + u[jsonKey]!;
52
+ }
53
+ }
54
+ }
55
+
56
+ /** Check if an item.completed event represents a tool call (command, file, mcp). */
57
+ function isToolCallItem(event: {
58
+ type: string;
59
+ [key: string]: unknown;
60
+ }): boolean {
61
+ const item = event.item as { type?: string } | undefined;
62
+ if (!item?.type) return false;
63
+ return (
64
+ item.type === "command_execution" ||
65
+ item.type === "file_change" ||
66
+ item.type === "mcp_tool_call"
67
+ );
68
+ }
69
+
70
+ /** Extract the final agent message text from a completed item. */
71
+ function extractAgentMessage(event: {
72
+ type: string;
73
+ [key: string]: unknown;
74
+ }): string | undefined {
75
+ const item = event.item as { type?: string; text?: string } | undefined;
76
+ if (item?.type === "agent_message" && typeof item.text === "string") {
77
+ return item.text;
78
+ }
79
+ return undefined;
80
+ }
81
+
82
+ const SUMMARY_FIELDS: Array<[keyof CodexUsage, string]> = [
83
+ ["inputTokens", "in"],
84
+ ["cachedInputTokens", "cache"],
85
+ ["outputTokens", "out"],
86
+ ["toolCalls", "tool_calls"],
87
+ ["apiRequests", "api_requests"],
88
+ ];
89
+
90
+ function formatCodexSummary(usage: CodexUsage): string | null {
91
+ const parts = SUMMARY_FIELDS.filter(([key]) => usage[key] !== undefined).map(
92
+ ([key, label]) => `${label}=${usage[key]}`,
93
+ );
94
+ return parts.length > 0 ? `[codex-telemetry] ${parts.join(" ")}` : null;
95
+ }
96
+
97
+ /** Process a single item.completed event, updating usage and returning any agent message. */
98
+ function processItemCompleted(
99
+ event: { type: string; [key: string]: unknown },
100
+ usage: CodexUsage,
101
+ ): string | undefined {
102
+ if (isToolCallItem(event)) {
103
+ usage.toolCalls = (usage.toolCalls || 0) + 1;
104
+ }
105
+ return extractAgentMessage(event);
106
+ }
107
+
108
+ /** Route a parsed JSONL event to the appropriate handler, returning any agent message. */
109
+ function processCodexEvent(
110
+ event: { type: string; [key: string]: unknown },
111
+ usage: CodexUsage,
112
+ ): string | undefined {
113
+ if (event.type === "turn.completed") {
114
+ accumulateTurnUsage(event, usage);
115
+ return undefined;
116
+ }
117
+ if (event.type === "item.completed") {
118
+ return processItemCompleted(event, usage);
119
+ }
120
+ return undefined;
121
+ }
122
+
123
+ /** Emit a telemetry summary to logs and debug log. */
124
+ function emitCodexSummary(
125
+ usage: CodexUsage,
126
+ onLog?: (msg: string) => void,
127
+ ): void {
128
+ const summary = formatCodexSummary(usage);
129
+ if (!summary) return;
130
+ onLog?.(`\n${summary}\n`);
131
+ process.stderr.write(`${summary}\n`);
132
+ getDebugLogger()?.logTelemetry({ adapter: "codex", summary });
133
+ }
134
+
135
+ /**
136
+ * Parse JSONL output from `codex exec --json`, extracting the final agent
137
+ * message, token usage, and tool call counts.
138
+ */
139
+ function parseCodexJsonl(
140
+ raw: string,
141
+ onLog?: (msg: string) => void,
142
+ ): { text: string; usage: CodexUsage } {
143
+ const usage: CodexUsage = {};
144
+ let lastAgentMessage = "";
145
+
146
+ for (const line of raw.split("\n")) {
147
+ const event = parseJsonlLine(line.trim());
148
+ if (!event) continue;
149
+ const msg = processCodexEvent(event, usage);
150
+ if (msg !== undefined) lastAgentMessage = msg;
151
+ }
152
+
153
+ emitCodexSummary(usage, onLog);
154
+ return { text: lastAgentMessage, usage };
155
+ }
156
+
11
157
  export class CodexAdapter implements CLIAdapter {
12
158
  name = "codex";
13
159
 
@@ -48,6 +194,14 @@ export class CodexAdapter implements CLIAdapter {
48
194
  return path.join(os.homedir(), ".codex", "prompts");
49
195
  }
50
196
 
197
+ getProjectSkillDir(): string | null {
198
+ return null;
199
+ }
200
+
201
+ getUserSkillDir(): string | null {
202
+ return null;
203
+ }
204
+
51
205
  getCommandExtension(): string {
52
206
  return ".md";
53
207
  }
@@ -62,12 +216,35 @@ export class CodexAdapter implements CLIAdapter {
62
216
  return markdownContent;
63
217
  }
64
218
 
219
+ private buildArgs(allowToolUse?: boolean, thinkingBudget?: string): string[] {
220
+ const args = [
221
+ "exec",
222
+ "--cd",
223
+ process.cwd(),
224
+ "--sandbox",
225
+ "read-only",
226
+ "-c",
227
+ 'ask_for_approval="never"',
228
+ ];
229
+ if (allowToolUse === false) {
230
+ args.push("--disable", "shell_tool");
231
+ }
232
+ if (thinkingBudget && thinkingBudget in CODEX_REASONING_EFFORT) {
233
+ const effort = CODEX_REASONING_EFFORT[thinkingBudget];
234
+ args.push("-c", `model_reasoning_effort="${effort}"`);
235
+ }
236
+ args.push("--json", "-");
237
+ return args;
238
+ }
239
+
65
240
  async execute(opts: {
66
241
  prompt: string;
67
242
  diff: string;
68
243
  model?: string;
69
244
  timeoutMs?: number;
70
245
  onOutput?: (chunk: string) => void;
246
+ allowToolUse?: boolean;
247
+ thinkingBudget?: string;
71
248
  }): Promise<string> {
72
249
  const fullContent = `${opts.prompt}\n\n--- DIFF ---\n${opts.diff}`;
73
250
 
@@ -75,47 +252,37 @@ export class CodexAdapter implements CLIAdapter {
75
252
  const tmpFile = path.join(tmpDir, `gauntlet-codex-${Date.now()}.txt`);
76
253
  await fs.writeFile(tmpFile, fullContent);
77
254
 
78
- // Get absolute path to repo root (CWD)
79
- const repoRoot = process.cwd();
80
-
81
- // Recommended invocation per spec:
82
- // --cd: sets working directory to repo root
83
- // --sandbox read-only: prevents file modifications
84
- // -c ask_for_approval="never": prevents blocking on prompts
85
- // -: reads prompt from stdin
86
- const args = [
87
- "exec",
88
- "--cd",
89
- repoRoot,
90
- "--sandbox",
91
- "read-only",
92
- "-c",
93
- 'ask_for_approval="never"',
94
- "-",
95
- ];
255
+ const args = this.buildArgs(opts.allowToolUse, opts.thinkingBudget);
96
256
 
97
257
  const cleanup = () => fs.unlink(tmpFile).catch(() => {});
98
258
 
99
259
  // If onOutput callback is provided, use spawn for real-time streaming
100
260
  if (opts.onOutput) {
101
- return runStreamingCommand({
261
+ const raw = await runStreamingCommand({
102
262
  command: "codex",
103
263
  args,
104
264
  tmpFile,
105
265
  timeoutMs: opts.timeoutMs,
106
- onOutput: opts.onOutput,
266
+ onOutput: (chunk: string) => {
267
+ opts.onOutput?.(chunk);
268
+ },
107
269
  cleanup,
108
270
  });
271
+
272
+ const { text } = parseCodexJsonl(raw, opts.onOutput);
273
+ return text || raw.trimEnd();
109
274
  }
110
275
 
111
276
  // Otherwise use exec for buffered output
112
277
  try {
113
- const cmd = `cat "${tmpFile}" | codex exec --cd "${repoRoot}" --sandbox read-only -c 'ask_for_approval="never"' -`;
278
+ const quoteArg = (a: string) => `"${a.replace(/(["\\$`])/g, "\\$1")}"`;
279
+ const cmd = `cat "${tmpFile}" | codex ${args.map(quoteArg).join(" ")}`;
114
280
  const { stdout } = await execAsync(cmd, {
115
281
  timeout: opts.timeoutMs,
116
282
  maxBuffer: MAX_BUFFER_BYTES,
117
283
  });
118
- return stdout;
284
+ const { text } = parseCodexJsonl(stdout);
285
+ return text || stdout.trimEnd();
119
286
  } finally {
120
287
  await cleanup();
121
288
  }
package/src/cli-adapters/cursor.ts CHANGED
@@ -48,6 +48,14 @@ export class CursorAdapter implements CLIAdapter {
48
48
  return null;
49
49
  }
50
50
 
51
+ getProjectSkillDir(): string | null {
52
+ return null;
53
+ }
54
+
55
+ getUserSkillDir(): string | null {
56
+ return null;
57
+ }
58
+
51
59
  getCommandExtension(): string {
52
60
  return ".md";
53
61
  }