agent-gauntlet 0.9.0 → 0.10.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. package/README.md +25 -23
  2. package/package.json +14 -4
  3. package/src/built-in-reviews/code-quality.md +25 -0
  4. package/src/built-in-reviews/index.ts +28 -0
  5. package/src/cli-adapters/claude.ts +231 -29
  6. package/src/cli-adapters/codex.ts +189 -22
  7. package/src/cli-adapters/cursor.ts +8 -0
  8. package/src/cli-adapters/gemini.ts +414 -59
  9. package/src/cli-adapters/github-copilot.ts +8 -0
  10. package/src/cli-adapters/index.ts +14 -0
  11. package/src/cli-adapters/thinking-budget.ts +23 -0
  12. package/src/commands/check.ts +18 -19
  13. package/src/commands/clean.ts +5 -4
  14. package/src/commands/detect.ts +104 -29
  15. package/src/commands/init.ts +1338 -351
  16. package/src/commands/review.ts +18 -19
  17. package/src/commands/shared.ts +60 -39
  18. package/src/commands/stop-hook.ts +10 -15
  19. package/src/commands/wait-ci.ts +3 -2
  20. package/src/config/loader.ts +28 -12
  21. package/src/config/schema.ts +28 -7
  22. package/src/config/types.ts +2 -0
  23. package/src/config/validator.ts +9 -8
  24. package/src/core/change-detector.ts +38 -32
  25. package/src/core/run-executor.ts +51 -46
  26. package/src/core/runner.ts +50 -24
  27. package/src/gates/check.ts +3 -14
  28. package/src/gates/resolve-check-command.ts +21 -0
  29. package/src/gates/result.ts +1 -0
  30. package/src/gates/review.ts +44 -5
  31. package/src/hooks/adapters/claude-stop-hook.ts +1 -7
  32. package/src/hooks/adapters/cursor-stop-hook.ts +1 -7
  33. package/src/hooks/adapters/types.ts +3 -11
  34. package/src/hooks/stop-hook-handler.ts +252 -422
  35. package/src/hooks/stop-hook-state.ts +112 -0
  36. package/src/output/app-logger.ts +1 -1
  37. package/src/output/console.ts +6 -3
  38. package/src/output/sinks/file-sink.ts +2 -2
  39. package/src/scripts/status.ts +433 -0
  40. package/src/types/gauntlet-status.ts +2 -1
  41. package/src/utils/debug-log.ts +55 -3
  42. package/src/utils/diff-parser.ts +63 -48
  43. package/src/utils/execution-state.ts +22 -0
  44. package/src/templates/fix_pr.template.md +0 -12
  45. package/src/templates/push_pr.template.md +0 -9
  46. package/src/templates/run_gauntlet.template.md +0 -41
package/README.md CHANGED
@@ -11,30 +11,17 @@
11
11
 
12
12
  Agent Gauntlet is a configurable “feedback loop” runner for AI-assisted development workflows.
13
13
 
14
- You configure which paths in your repo should trigger which validations — shell commands like tests and linters, plus AI-powered code reviews. When files change, Gauntlet automatically runs the relevant validations and reports results.
14
+ You configure which paths in your repo should trigger which validations — shell commands like tests and linters, plus AI-powered local code reviews. When files change, Gauntlet automatically runs the relevant validations and reports results.
15
15
 
16
16
  For AI reviews, it uses the CLI tool of your choice: Gemini, Codex, Claude Code, GitHub Copilot, or Cursor.
17
17
 
18
18
  ## Features
19
19
 
20
20
  - **Agent validation loop**: Keep your coding agent on track with automated feedback loops. Detect problems — deterministically and/or non-deterministically — and let your agent fix and Gauntlet verify.
21
- - **Multi-agent collaboration**: Enable one AI agent to automatically request code reviews from another. For example, if Claude made changes, Gauntlet can request a review from Codex or Gemini — spreading token usage across your subscriptions instead of burning through one.
21
+ - **Local cross-agent code reviews**: Enable one AI agent to automatically request code reviews from another. For example, if Claude made changes, Gauntlet can request a review from Codex — spreading token usage across your subscriptions instead of burning through one.
22
+ - Multiple AI review adapters have been evaluated for quality and efficiency. Claude and Codex deliver optimal review quality with superior token efficiency. For detailed metrics, see [Eval Results](docs/eval-results.md).
22
23
  - **Leverage existing subscriptions**: Agent Gauntlet is *free* and tool-agnostic, leveraging the AI CLI tools you already have installed.
23
- - **Easy CI setup**: Define your checks once, run them locally and in GitHub.
24
-
25
- ### vs AI Code Review Tools
26
-
27
- Unlike traditional code review tools designed for PR workflows, Agent Gauntlet provides real-time feedback loops for autonomous coding agents.
28
-
29
- | Use Case | Recommended |
30
- | :--- | :--- |
31
- | Autonomous agent development | **Agent Gauntlet** |
32
- | Traditional PR review with human reviewers | Other tools |
33
- | IDE-integrated review while coding | Other tools |
34
- | Enterprise with strict compliance requirements | Other tools |
35
- | Budget-conscious teams with existing AI CLI tools | **Agent Gauntlet** |
36
-
37
- [Full comparison →](docs/feature_comparison.md)
24
+ - **Easy CI setup**: Define your CI gates once, run them locally and in GitHub.
38
25
 
39
26
  ## Common Workflows
40
27
 
@@ -42,15 +29,30 @@ Agent Gauntlet supports three workflows, ranging from simple CLI execution to fu
42
29
 
43
30
  - **CLI Mode** — Run checks via command line; ideal for CI pipelines and scripts.
44
31
  - **Assistant Mode** — AI assistant runs validation loop, fixing issues iteratively.
45
- - **Agentic Mode** — Autonomous agent validates and fixes in real-time via stop hook.
32
+ - **Agentic Mode** — Autonomous agent validates and fixes in real-time via stop hook (experimental).
46
33
 
47
34
  ![Agent Gauntlet Workflows](docs/images/workflows.png)
48
35
 
49
- ## Quick Start
36
+ ### Example Workflow
37
+
38
+ 1. Claude implements a feature
39
+ 2. Agent Gauntlet reports quality issues detected by static code analysis and the Codex reviewer agent
40
+ 3. Claude fixes issues
41
+ 4. Agent Gauntlet verifies
42
+
43
+ ### Comparison vs Other Tools
50
44
 
51
- 1. **Install**: `bun add -g agent-gauntlet`
52
- 2. **Initialize**: `agent-gauntlet init`
53
- 3. **Run**: `agent-gauntlet run`
45
+ ### AI Code Review Tools
46
+
47
+ Agent Gauntlet is not a replacement for tools that provide AI pull request code reviews. It provides real-time feedback loops for autonomous coding agents, combining deterministic static checks (build, lint, test) with multi-agent AI reviews in a single pipeline. This enables agents to iterate and self-correct until all checks and reviews pass, without human intervention.
48
+
49
+ [Full comparison →](docs/feature_comparison.md)
50
+
51
+ ### Spec-Driven Workflow Tools
52
+
53
+ It is recommended to use Agent Gauntlet in conjunction with other spec-driven development tools. We believe it is the ideal implementation of the validation step in any Spec → Implement → Validate workflow.
54
+
55
+ ## Quick Start
54
56
 
55
57
  For basic usage and configuration guide, see the [Quick Start Guide](docs/quick-start.md).
56
58
 
@@ -59,7 +61,7 @@ For basic usage and configuration guide, see the [Quick Start Guide](docs/quick-
59
61
  - [Quick Start Guide](docs/quick-start.md) — installation, basic usage, and config layout
60
62
  - [User Guide](docs/user-guide.md) — full usage details
61
63
  - [Configuration Reference](docs/config-reference.md) — all configuration fields + defaults
62
- - [Stop Hook Guide](docs/stop-hook-guide.md) — integrate with Claude Code's stop hook
64
+ - [Stop Hook Guide](docs/stop-hook-guide.md) — integrate with Claude Code's stop hook (experimental)
63
65
  - [CLI Invocation Details](docs/cli-invocation-details.md) — how we securely invoke AI CLIs
64
66
  - [Feature Comparison](docs/feature_comparison.md) — how Agent Gauntlet compares to other tools
65
67
  - [Development Guide](docs/development.md) — how to build and develop this project
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "agent-gauntlet",
3
- "version": "0.9.0",
3
+ "version": "0.10.1",
4
4
  "description": "A CLI tool for testing AI coding agents",
5
5
  "license": "Apache-2.0",
6
6
  "author": "Paul Caplan",
@@ -34,11 +34,21 @@
34
34
  "scripts": {
35
35
  "build": "bun build --compile --minify --sourcemap ./src/index.ts --outfile bin/agent-gauntlet",
36
36
  "test": "bun test",
37
+ "test:e2e": "bun test/integration/stop-hook-e2e.ts",
37
38
  "lint": "biome check src",
39
+ "typecheck": "tsc --noEmit && tsc --noEmit -p test/tsconfig.json",
38
40
  "changeset": "changeset",
39
41
  "version": "changeset version",
40
- "release": "npm publish",
41
- "gen-changeset": "bun scripts/gen-changeset.ts"
42
+ "release": "changeset publish",
43
+ "run": "bun src/index.ts run",
44
+ "check": "bun src/index.ts check",
45
+ "clean": "bun src/index.ts clean",
46
+ "review": "bun src/index.ts review",
47
+ "detect": "bun src/index.ts detect",
48
+ "list": "bun src/index.ts list",
49
+ "health": "bun src/index.ts health",
50
+ "validate": "bun src/index.ts validate",
51
+ "wait-ci": "bun src/index.ts wait-ci"
42
52
  },
43
53
  "devDependencies": {
44
54
  "@biomejs/biome": "^2.3.11",
@@ -57,4 +67,4 @@
57
67
  "yaml": "^2.8.2",
58
68
  "zod": "^4.3.5"
59
69
  }
60
- }
70
+ }
@@ -0,0 +1,25 @@
1
+ # Code Quality Review
2
+
3
+ You are a senior software engineer performing a code review. Your primary goal is to identify **real problems** that could cause bugs, security vulnerabilities, or performance issues in production. Do not report style, formatting, naming conventions, or maintainability suggestions unless you see something egregious.
4
+
5
+ ## Focus Areas (in priority order)
6
+
7
+ 1. **Bugs** — Logic errors, null/undefined issues, race conditions, unhandled edge cases, resource leaks
8
+ 2. **Security** — Injection vulnerabilities, auth/authz flaws, sensitive data exposure, input validation gaps
9
+ 3. **Performance** — Algorithmic complexity issues, N+1 queries, blocking operations, memory problems
10
+ 4. **Maintainability** — Unclear code, missing error handling, duplication
11
+
12
+ ## Do NOT Report
13
+
14
+ - Style, formatting, or naming preferences
15
+ - Missing documentation, comments, or type annotations
16
+ - Suggestions for "better" abstractions or patterns that aren't broken
17
+ - Hypothetical issues that require unlikely preconditions
18
+ - Issues in code that wasn't changed in this diff
19
+
20
+ ## Guidelines
21
+
22
+ - **Threshold**: only report issues you would block a PR over
23
+ - Explain **why** each issue is a problem with a concrete failure scenario
24
+ - Provide a **concrete fix** with corrected code
25
+ - If the status quo works correctly, it's not a violation
@@ -0,0 +1,28 @@
1
+ // @ts-expect-error Bun text import
2
+ import codeQualityContent from "./code-quality.md" with { type: "text" };
3
+
4
+ const BUILT_IN_PREFIX = "built-in:";
5
+
6
+ const builtInSources: Record<string, string> = {
7
+ "code-quality": codeQualityContent,
8
+ };
9
+
10
+ /**
11
+ * Check if a review name uses the built-in prefix.
12
+ */
13
+ export function isBuiltInReview(name: string): boolean {
14
+ return name.startsWith(BUILT_IN_PREFIX);
15
+ }
16
+
17
+ /**
18
+ * Load a built-in review prompt by name. Returns the raw markdown content.
19
+ */
20
+ export function loadBuiltInReview(name: string): string {
21
+ const source = builtInSources[name];
22
+
23
+ if (!source) {
24
+ throw new Error(`Unknown built-in review: "${name}"`);
25
+ }
26
+
27
+ return source;
28
+ }
@@ -4,11 +4,196 @@ import os from "node:os";
4
4
  import path from "node:path";
5
5
  import { promisify } from "node:util";
6
6
  import { GAUNTLET_STOP_HOOK_ACTIVE_ENV } from "../commands/stop-hook.js";
7
+ import { getDebugLogger } from "../utils/debug-log.js";
7
8
  import { type CLIAdapter, runStreamingCommand } from "./index.js";
9
+ import { CLAUDE_THINKING_TOKENS } from "./thinking-budget.js";
8
10
 
9
11
  const execAsync = promisify(exec);
10
12
  const MAX_BUFFER_BYTES = 10 * 1024 * 1024;
11
13
 
14
+ // Matches OTel console exporter metric blocks dumped to stdout at process exit.
15
+ // Requires `descriptor`, `dataPointType`, and `dataPoints` fields which are
16
+ // unique to OTel SDK output and won't appear in normal code review content.
17
+ // Optionally matches [otel] prefix that some exporters add.
18
+ const OTEL_METRIC_BLOCK_RE =
19
+ /(?:\[otel\]\s*)?\{\s*\n\s*descriptor:\s*\{[\s\S]*?dataPointType:\s*\d+[\s\S]*?dataPoints:\s*\[[\s\S]*?\]\s*,?\s*\n\}/g;
20
+
21
+ interface OtelUsage {
22
+ cost?: number;
23
+ input?: number;
24
+ output?: number;
25
+ cacheRead?: number;
26
+ cacheCreation?: number;
27
+ toolCalls?: number;
28
+ toolContentBytes?: number;
29
+ apiRequests?: number;
30
+ }
31
+
32
+ const TOKEN_TYPES = ["input", "output", "cacheRead", "cacheCreation"] as const;
33
+
34
+ function parseCostBlock(block: string): number | undefined {
35
+ const match = block.match(/value:\s*([\d.]+)/);
36
+ return match?.[1] ? Number.parseFloat(match[1]) : undefined;
37
+ }
38
+
39
+ function parseTokenBlock(block: string): Partial<OtelUsage> {
40
+ const result: Partial<OtelUsage> = {};
41
+ const re = /type:\s*"(\w+)"[\s\S]*?value:\s*(\d+)(?:,|\s*\})/g;
42
+ for (const match of block.matchAll(re)) {
43
+ const type = match[1] as (typeof TOKEN_TYPES)[number] | undefined;
44
+ const value = match[2];
45
+ if (!type || !value) continue;
46
+ if (TOKEN_TYPES.includes(type)) {
47
+ result[type] = Number.parseInt(value, 10);
48
+ }
49
+ }
50
+ return result;
51
+ }
52
+
53
+ function parseOtelMetrics(blocks: string[]): OtelUsage {
54
+ const usage: OtelUsage = {};
55
+ for (const block of blocks) {
56
+ const nameMatch = block.match(/name:\s*"([^"]+)"/);
57
+ if (!nameMatch) continue;
58
+
59
+ if (nameMatch[1] === "claude_code.cost.usage") {
60
+ usage.cost = parseCostBlock(block);
61
+ } else if (nameMatch[1] === "claude_code.token.usage") {
62
+ Object.assign(usage, parseTokenBlock(block));
63
+ }
64
+ }
65
+ return usage;
66
+ }
67
+
68
+ // Matches OTel console log exporter event records emitted by Claude Code.
69
+ // The Node.js SDK console exporter uses util.inspect() format with unquoted keys
70
+ // and single-quoted strings. Blocks start with `resource:` and contain a `body:`
71
+ // field with the event name (e.g. 'claude_code.tool_result').
72
+ const OTEL_LOG_BLOCK_RE =
73
+ /\{\s*\n\s*resource:\s*\{[\s\S]*?body:\s*'claude_code\.\w+'[\s\S]*?\n\}/g;
74
+
75
+ /** Pre-compiled regexes for extracting single-quoted attribute values from OTel log blocks. */
76
+ const OTEL_ATTR_RE = {
77
+ body: /body:\s*'([^']*)'/,
78
+ tool_result_size_bytes: /tool_result_size_bytes:\s*'([^']*)'/,
79
+ input_tokens: /input_tokens:\s*'([^']*)'/,
80
+ output_tokens: /output_tokens:\s*'([^']*)'/,
81
+ cache_read_tokens: /cache_read_tokens:\s*'([^']*)'/,
82
+ cache_creation_tokens: /cache_creation_tokens:\s*'([^']*)'/,
83
+ cost_usd: /cost_usd:\s*'([^']*)'/,
84
+ } as const;
85
+
86
+ /** Maps OTel api_request attribute regexes to OtelUsage fields. */
87
+ const API_REQUEST_FIELDS: Array<[RegExp, keyof OtelUsage]> = [
88
+ [OTEL_ATTR_RE.input_tokens, "input"],
89
+ [OTEL_ATTR_RE.output_tokens, "output"],
90
+ [OTEL_ATTR_RE.cache_read_tokens, "cacheRead"],
91
+ [OTEL_ATTR_RE.cache_creation_tokens, "cacheCreation"],
92
+ [OTEL_ATTR_RE.cost_usd, "cost"],
93
+ ];
94
+
95
+ /** Accumulate a tool_result log block into usage. */
96
+ function accumulateToolResult(block: string, usage: OtelUsage): void {
97
+ usage.toolCalls = (usage.toolCalls || 0) + 1;
98
+ const bytes = block.match(OTEL_ATTR_RE.tool_result_size_bytes)?.[1];
99
+ if (bytes !== undefined) {
100
+ usage.toolContentBytes = (usage.toolContentBytes || 0) + Number(bytes);
101
+ }
102
+ }
103
+
104
+ /** Accumulate an api_request log block into usage. */
105
+ function accumulateApiRequest(block: string, usage: OtelUsage): void {
106
+ usage.apiRequests = (usage.apiRequests || 0) + 1;
107
+ for (const [re, field] of API_REQUEST_FIELDS) {
108
+ const val = block.match(re)?.[1];
109
+ if (val !== undefined) {
110
+ usage[field] = (usage[field] || 0) + Number(val);
111
+ }
112
+ }
113
+ }
114
+
115
+ /** Accumulate tool_result and api_request event data from OTel log blocks. */
116
+ function parseOtelLogEvents(raw: string, usage: OtelUsage): void {
117
+ const blocks = raw.match(OTEL_LOG_BLOCK_RE);
118
+ if (!blocks) return;
119
+ for (const block of blocks) {
120
+ const body = block.match(OTEL_ATTR_RE.body)?.[1];
121
+ if (body === "claude_code.tool_result") {
122
+ accumulateToolResult(block, usage);
123
+ } else if (body === "claude_code.api_request") {
124
+ accumulateApiRequest(block, usage);
125
+ }
126
+ }
127
+ }
128
+
129
+ const OTEL_SUMMARY_FIELDS: Array<[keyof OtelUsage, string]> = [
130
+ ["input", "in"],
131
+ ["output", "out"],
132
+ ["cacheRead", "cacheRead"],
133
+ ["cacheCreation", "cacheWrite"],
134
+ ["toolCalls", "tool_calls"],
135
+ ["toolContentBytes", "tool_content_bytes"],
136
+ ["apiRequests", "api_requests"],
137
+ ];
138
+
139
+ function formatOtelSummary(usage: OtelUsage): string | null {
140
+ if (usage.cost === undefined && usage.input === undefined) return null;
141
+
142
+ const parts: string[] = [];
143
+ if (usage.cost !== undefined) parts.push(`cost=$${usage.cost.toFixed(4)}`);
144
+ for (const [key, label] of OTEL_SUMMARY_FIELDS) {
145
+ if (usage[key] !== undefined) parts.push(`${label}=${usage[key]}`);
146
+ }
147
+
148
+ return `[otel] ${parts.join(" ")}`;
149
+ }
150
+
151
+ function extractOtelMetrics(
152
+ raw: string,
153
+ onLog?: (msg: string) => void,
154
+ ): string {
155
+ const metricBlocks = raw.match(OTEL_METRIC_BLOCK_RE);
156
+ const usage = metricBlocks ? parseOtelMetrics(metricBlocks) : {};
157
+
158
+ // Also parse log events for tool call and API request counts
159
+ parseOtelLogEvents(raw, usage);
160
+
161
+ const summary = formatOtelSummary(usage);
162
+ if (summary) {
163
+ onLog?.(`\n${summary}\n`);
164
+ process.stderr.write(`${summary}\n`);
165
+ getDebugLogger()?.logTelemetry({ adapter: "claude", summary });
166
+ }
167
+
168
+ return raw
169
+ .replace(OTEL_METRIC_BLOCK_RE, "")
170
+ .replace(OTEL_LOG_BLOCK_RE, "")
171
+ .trimEnd();
172
+ }
173
+
174
+ /** Build OTel environment overrides for console export. */
175
+ function buildOtelEnv(): Record<string, string> {
176
+ const env: Record<string, string> = {};
177
+ if (!process.env.CLAUDE_CODE_ENABLE_TELEMETRY) {
178
+ env.CLAUDE_CODE_ENABLE_TELEMETRY = "1";
179
+ }
180
+ if (!process.env.OTEL_METRICS_EXPORTER) {
181
+ env.OTEL_METRICS_EXPORTER = "console";
182
+ }
183
+ if (!process.env.OTEL_LOGS_EXPORTER) {
184
+ env.OTEL_LOGS_EXPORTER = "console";
185
+ }
186
+ return env;
187
+ }
188
+
189
+ /** Strip OTel metric and log blocks from raw output. */
190
+ function stripOtelBlocks(raw: string): string {
191
+ return raw
192
+ .replace(OTEL_METRIC_BLOCK_RE, "")
193
+ .replace(OTEL_LOG_BLOCK_RE, "")
194
+ .trimEnd();
195
+ }
196
+
12
197
  export class ClaudeAdapter implements CLIAdapter {
13
198
  name = "claude";
14
199
 
@@ -43,21 +228,26 @@ export class ClaudeAdapter implements CLIAdapter {
43
228
  }
44
229
 
45
230
  getUserCommandDir(): string | null {
46
- // Claude supports user-level commands at ~/.claude/commands
47
231
  return path.join(os.homedir(), ".claude", "commands");
48
232
  }
49
233
 
234
+ getProjectSkillDir(): string | null {
235
+ return ".claude/skills";
236
+ }
237
+
238
+ getUserSkillDir(): string | null {
239
+ return path.join(os.homedir(), ".claude", "skills");
240
+ }
241
+
50
242
  getCommandExtension(): string {
51
243
  return ".md";
52
244
  }
53
245
 
54
246
  canUseSymlink(): boolean {
55
- // Claude uses the same Markdown format as our canonical file
56
247
  return true;
57
248
  }
58
249
 
59
250
  transformCommand(markdownContent: string): string {
60
- // Claude uses the same Markdown format, no transformation needed
61
251
  return markdownContent;
62
252
  }
63
253
 
@@ -67,59 +257,71 @@ export class ClaudeAdapter implements CLIAdapter {
67
257
  model?: string;
68
258
  timeoutMs?: number;
69
259
  onOutput?: (chunk: string) => void;
260
+ allowToolUse?: boolean;
261
+ thinkingBudget?: string;
70
262
  }): Promise<string> {
71
263
  const fullContent = `${opts.prompt}\n\n--- DIFF ---\n${opts.diff}`;
72
264
 
73
265
  const tmpDir = os.tmpdir();
74
- // Include process.pid for uniqueness across concurrent processes
75
266
  const tmpFile = path.join(
76
267
  tmpDir,
77
268
  `gauntlet-claude-${process.pid}-${Date.now()}.txt`,
78
269
  );
79
270
  await fs.writeFile(tmpFile, fullContent);
80
271
 
81
- // Recommended invocation per spec:
82
- // -p: non-interactive print mode
83
- // --allowedTools: explicitly restricts to read-only tools
84
- // --max-turns: caps agentic turns
85
- const args = [
86
- "-p",
87
- "--allowedTools",
88
- "Read,Glob,Grep",
89
- "--max-turns",
90
- "10",
91
- ];
272
+ const args = ["-p"];
273
+ if (opts.allowToolUse === false) {
274
+ args.push("--tools", "");
275
+ } else {
276
+ args.push("--allowedTools", "Read,Glob,Grep");
277
+ }
278
+ args.push("--max-turns", "10");
279
+
280
+ const otelEnv = buildOtelEnv();
281
+ const thinkingEnv: Record<string, string> = {};
282
+ if (opts.thinkingBudget && opts.thinkingBudget in CLAUDE_THINKING_TOKENS) {
283
+ thinkingEnv.MAX_THINKING_TOKENS = String(
284
+ CLAUDE_THINKING_TOKENS[opts.thinkingBudget],
285
+ );
286
+ }
92
287
 
93
288
  const cleanup = () => fs.unlink(tmpFile).catch(() => {});
289
+ const execEnv = {
290
+ ...process.env,
291
+ [GAUNTLET_STOP_HOOK_ACTIVE_ENV]: "1",
292
+ ...otelEnv,
293
+ ...thinkingEnv,
294
+ };
94
295
 
95
- // If onOutput callback is provided, use spawn for real-time streaming
96
296
  if (opts.onOutput) {
97
- return runStreamingCommand({
297
+ const outputBuffer: string[] = [];
298
+ const raw = await runStreamingCommand({
98
299
  command: "claude",
99
300
  args,
100
301
  tmpFile,
101
302
  timeoutMs: opts.timeoutMs,
102
- onOutput: opts.onOutput,
103
- cleanup,
104
- env: {
105
- ...process.env,
106
- [GAUNTLET_STOP_HOOK_ACTIVE_ENV]: "1",
303
+ onOutput: (chunk: string) => {
304
+ outputBuffer.push(chunk);
107
305
  },
306
+ cleanup,
307
+ env: execEnv,
108
308
  });
309
+ const cleanedOutput = extractOtelMetrics(
310
+ outputBuffer.join(""),
311
+ opts.onOutput,
312
+ );
313
+ opts.onOutput(cleanedOutput);
314
+ return stripOtelBlocks(raw);
109
315
  }
110
316
 
111
- // Otherwise use exec for buffered output
112
317
  try {
113
- const cmd = `cat "${tmpFile}" | claude -p --allowedTools "Read,Glob,Grep" --max-turns 10`;
318
+ const cmd = `cat "${tmpFile}" | claude ${args.map((a) => (a === "" ? '""' : a)).join(" ")}`;
114
319
  const { stdout } = await execAsync(cmd, {
115
320
  timeout: opts.timeoutMs,
116
321
  maxBuffer: MAX_BUFFER_BYTES,
117
- env: {
118
- ...process.env,
119
- [GAUNTLET_STOP_HOOK_ACTIVE_ENV]: "1",
120
- },
322
+ env: execEnv,
121
323
  });
122
- return stdout;
324
+ return extractOtelMetrics(stdout);
123
325
  } finally {
124
326
  await cleanup();
125
327
  }