agent-gauntlet 0.8.0 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "agent-gauntlet",
3
- "version": "0.8.0",
3
+ "version": "0.9.0",
4
4
  "description": "A CLI tool for testing AI coding agents",
5
5
  "license": "Apache-2.0",
6
6
  "author": "Paul Caplan",
@@ -1,10 +1,10 @@
1
- import { exec, spawn } from "node:child_process";
1
+ import { exec } from "node:child_process";
2
2
  import fs from "node:fs/promises";
3
3
  import os from "node:os";
4
4
  import path from "node:path";
5
5
  import { promisify } from "node:util";
6
6
  import { GAUNTLET_STOP_HOOK_ACTIVE_ENV } from "../commands/stop-hook.js";
7
- import type { CLIAdapter } from "./index.js";
7
+ import { type CLIAdapter, runStreamingCommand } from "./index.js";
8
8
 
9
9
  const execAsync = promisify(exec);
10
10
  const MAX_BUFFER_BYTES = 10 * 1024 * 1024;
@@ -94,65 +94,17 @@ export class ClaudeAdapter implements CLIAdapter {
94
94
 
95
95
  // If onOutput callback is provided, use spawn for real-time streaming
96
96
  if (opts.onOutput) {
97
- return new Promise((resolve, reject) => {
98
- const chunks: string[] = [];
99
- const inputStream = fs.open(tmpFile, "r").then((handle) => {
100
- const stream = handle.createReadStream();
101
- return { stream, handle };
102
- });
103
-
104
- inputStream
105
- .then(({ stream, handle }) => {
106
- const child = spawn("claude", args, {
107
- stdio: ["pipe", "pipe", "pipe"],
108
- env: {
109
- ...process.env,
110
- [GAUNTLET_STOP_HOOK_ACTIVE_ENV]: "1",
111
- },
112
- });
113
-
114
- stream.pipe(child.stdin);
115
-
116
- let timeoutId: ReturnType<typeof setTimeout> | undefined;
117
- if (opts.timeoutMs) {
118
- timeoutId = setTimeout(() => {
119
- child.kill("SIGTERM");
120
- reject(new Error("Command timed out"));
121
- }, opts.timeoutMs);
122
- }
123
-
124
- child.stdout.on("data", (data: Buffer) => {
125
- const chunk = data.toString();
126
- chunks.push(chunk);
127
- opts.onOutput?.(chunk);
128
- });
129
-
130
- child.stderr.on("data", (data: Buffer) => {
131
- // Only log stderr, don't include in return value
132
- opts.onOutput?.(data.toString());
133
- });
134
-
135
- child.on("close", (code) => {
136
- if (timeoutId) clearTimeout(timeoutId);
137
- handle.close().catch(() => {});
138
- cleanup().then(() => {
139
- if (code === 0 || code === null) {
140
- resolve(chunks.join(""));
141
- } else {
142
- reject(new Error(`Process exited with code ${code}`));
143
- }
144
- });
145
- });
146
-
147
- child.on("error", (err) => {
148
- if (timeoutId) clearTimeout(timeoutId);
149
- handle.close().catch(() => {});
150
- cleanup().then(() => reject(err));
151
- });
152
- })
153
- .catch((err) => {
154
- cleanup().then(() => reject(err));
155
- });
97
+ return runStreamingCommand({
98
+ command: "claude",
99
+ args,
100
+ tmpFile,
101
+ timeoutMs: opts.timeoutMs,
102
+ onOutput: opts.onOutput,
103
+ cleanup,
104
+ env: {
105
+ ...process.env,
106
+ [GAUNTLET_STOP_HOOK_ACTIVE_ENV]: "1",
107
+ },
156
108
  });
157
109
  }
158
110
 
@@ -1,9 +1,9 @@
1
- import { exec, spawn } from "node:child_process";
1
+ import { exec } from "node:child_process";
2
2
  import fs from "node:fs/promises";
3
3
  import os from "node:os";
4
4
  import path from "node:path";
5
5
  import { promisify } from "node:util";
6
- import type { CLIAdapter } from "./index.js";
6
+ import { type CLIAdapter, runStreamingCommand } from "./index.js";
7
7
 
8
8
  const execAsync = promisify(exec);
9
9
  const MAX_BUFFER_BYTES = 10 * 1024 * 1024;
@@ -98,61 +98,13 @@ export class CodexAdapter implements CLIAdapter {
98
98
 
99
99
  // If onOutput callback is provided, use spawn for real-time streaming
100
100
  if (opts.onOutput) {
101
- return new Promise((resolve, reject) => {
102
- const chunks: string[] = [];
103
- const inputStream = fs.open(tmpFile, "r").then((handle) => {
104
- const stream = handle.createReadStream();
105
- return { stream, handle };
106
- });
107
-
108
- inputStream
109
- .then(({ stream, handle }) => {
110
- const child = spawn("codex", args, {
111
- stdio: ["pipe", "pipe", "pipe"],
112
- });
113
-
114
- stream.pipe(child.stdin);
115
-
116
- let timeoutId: ReturnType<typeof setTimeout> | undefined;
117
- if (opts.timeoutMs) {
118
- timeoutId = setTimeout(() => {
119
- child.kill("SIGTERM");
120
- reject(new Error("Command timed out"));
121
- }, opts.timeoutMs);
122
- }
123
-
124
- child.stdout.on("data", (data: Buffer) => {
125
- const chunk = data.toString();
126
- chunks.push(chunk);
127
- opts.onOutput?.(chunk);
128
- });
129
-
130
- child.stderr.on("data", (data: Buffer) => {
131
- // Only log stderr, don't include in return value
132
- opts.onOutput?.(data.toString());
133
- });
134
-
135
- child.on("close", (code) => {
136
- if (timeoutId) clearTimeout(timeoutId);
137
- handle.close().catch(() => {});
138
- cleanup().then(() => {
139
- if (code === 0 || code === null) {
140
- resolve(chunks.join(""));
141
- } else {
142
- reject(new Error(`Process exited with code ${code}`));
143
- }
144
- });
145
- });
146
-
147
- child.on("error", (err) => {
148
- if (timeoutId) clearTimeout(timeoutId);
149
- handle.close().catch(() => {});
150
- cleanup().then(() => reject(err));
151
- });
152
- })
153
- .catch((err) => {
154
- cleanup().then(() => reject(err));
155
- });
101
+ return runStreamingCommand({
102
+ command: "codex",
103
+ args,
104
+ tmpFile,
105
+ timeoutMs: opts.timeoutMs,
106
+ onOutput: opts.onOutput,
107
+ cleanup,
156
108
  });
157
109
  }
158
110
 
@@ -1,9 +1,9 @@
1
- import { exec, spawn } from "node:child_process";
1
+ import { exec } from "node:child_process";
2
2
  import fs from "node:fs/promises";
3
3
  import os from "node:os";
4
4
  import path from "node:path";
5
5
  import { promisify } from "node:util";
6
- import type { CLIAdapter } from "./index.js";
6
+ import { type CLIAdapter, runStreamingCommand } from "./index.js";
7
7
 
8
8
  const execAsync = promisify(exec);
9
9
  const MAX_BUFFER_BYTES = 10 * 1024 * 1024;
@@ -90,61 +90,13 @@ export class CursorAdapter implements CLIAdapter {
90
90
 
91
91
  // If onOutput callback is provided, use spawn for real-time streaming
92
92
  if (opts.onOutput) {
93
- return new Promise((resolve, reject) => {
94
- const chunks: string[] = [];
95
- const inputStream = fs.open(tmpFile, "r").then((handle) => {
96
- const stream = handle.createReadStream();
97
- return { stream, handle };
98
- });
99
-
100
- inputStream
101
- .then(({ stream, handle }) => {
102
- const child = spawn("agent", [], {
103
- stdio: ["pipe", "pipe", "pipe"],
104
- });
105
-
106
- stream.pipe(child.stdin);
107
-
108
- let timeoutId: ReturnType<typeof setTimeout> | undefined;
109
- if (opts.timeoutMs) {
110
- timeoutId = setTimeout(() => {
111
- child.kill("SIGTERM");
112
- reject(new Error("Command timed out"));
113
- }, opts.timeoutMs);
114
- }
115
-
116
- child.stdout.on("data", (data: Buffer) => {
117
- const chunk = data.toString();
118
- chunks.push(chunk);
119
- opts.onOutput?.(chunk);
120
- });
121
-
122
- child.stderr.on("data", (data: Buffer) => {
123
- // Only log stderr, don't include in return value
124
- opts.onOutput?.(data.toString());
125
- });
126
-
127
- child.on("close", (code) => {
128
- if (timeoutId) clearTimeout(timeoutId);
129
- handle.close().catch(() => {});
130
- cleanup().then(() => {
131
- if (code === 0 || code === null) {
132
- resolve(chunks.join(""));
133
- } else {
134
- reject(new Error(`Process exited with code ${code}`));
135
- }
136
- });
137
- });
138
-
139
- child.on("error", (err) => {
140
- if (timeoutId) clearTimeout(timeoutId);
141
- handle.close().catch(() => {});
142
- cleanup().then(() => reject(err));
143
- });
144
- })
145
- .catch((err) => {
146
- cleanup().then(() => reject(err));
147
- });
93
+ return runStreamingCommand({
94
+ command: "agent",
95
+ args: [],
96
+ tmpFile,
97
+ timeoutMs: opts.timeoutMs,
98
+ onOutput: opts.onOutput,
99
+ cleanup,
148
100
  });
149
101
  }
150
102
 
@@ -1,9 +1,9 @@
1
- import { exec, spawn } from "node:child_process";
1
+ import { exec } from "node:child_process";
2
2
  import fs from "node:fs/promises";
3
3
  import os from "node:os";
4
4
  import path from "node:path";
5
5
  import { promisify } from "node:util";
6
- import type { CLIAdapter } from "./index.js";
6
+ import { type CLIAdapter, runStreamingCommand } from "./index.js";
7
7
 
8
8
  const execAsync = promisify(exec);
9
9
  const MAX_BUFFER_BYTES = 10 * 1024 * 1024;
@@ -130,61 +130,13 @@ ${escapedBody}
130
130
 
131
131
  // If onOutput callback is provided, use spawn for real-time streaming
132
132
  if (opts.onOutput) {
133
- return new Promise((resolve, reject) => {
134
- const chunks: string[] = [];
135
- const inputStream = fs.open(tmpFile, "r").then((handle) => {
136
- const stream = handle.createReadStream();
137
- return { stream, handle };
138
- });
139
-
140
- inputStream
141
- .then(({ stream, handle }) => {
142
- const child = spawn("gemini", args, {
143
- stdio: ["pipe", "pipe", "pipe"],
144
- });
145
-
146
- stream.pipe(child.stdin);
147
-
148
- let timeoutId: ReturnType<typeof setTimeout> | undefined;
149
- if (opts.timeoutMs) {
150
- timeoutId = setTimeout(() => {
151
- child.kill("SIGTERM");
152
- reject(new Error("Command timed out"));
153
- }, opts.timeoutMs);
154
- }
155
-
156
- child.stdout.on("data", (data: Buffer) => {
157
- const chunk = data.toString();
158
- chunks.push(chunk);
159
- opts.onOutput?.(chunk);
160
- });
161
-
162
- child.stderr.on("data", (data: Buffer) => {
163
- // Only log stderr, don't include in return value
164
- opts.onOutput?.(data.toString());
165
- });
166
-
167
- child.on("close", (code) => {
168
- if (timeoutId) clearTimeout(timeoutId);
169
- handle.close().catch(() => {});
170
- cleanup().then(() => {
171
- if (code === 0 || code === null) {
172
- resolve(chunks.join(""));
173
- } else {
174
- reject(new Error(`Process exited with code ${code}`));
175
- }
176
- });
177
- });
178
-
179
- child.on("error", (err) => {
180
- if (timeoutId) clearTimeout(timeoutId);
181
- handle.close().catch(() => {});
182
- cleanup().then(() => reject(err));
183
- });
184
- })
185
- .catch((err) => {
186
- cleanup().then(() => reject(err));
187
- });
133
+ return runStreamingCommand({
134
+ command: "gemini",
135
+ args,
136
+ tmpFile,
137
+ timeoutMs: opts.timeoutMs,
138
+ onOutput: opts.onOutput,
139
+ cleanup,
188
140
  });
189
141
  }
190
142
 
@@ -1,9 +1,9 @@
1
- import { exec, spawn } from "node:child_process";
1
+ import { exec } from "node:child_process";
2
2
  import fs from "node:fs/promises";
3
3
  import os from "node:os";
4
4
  import path from "node:path";
5
5
  import { promisify } from "node:util";
6
- import type { CLIAdapter } from "./index.js";
6
+ import { type CLIAdapter, runStreamingCommand } from "./index.js";
7
7
 
8
8
  const execAsync = promisify(exec);
9
9
  const MAX_BUFFER_BYTES = 10 * 1024 * 1024;
@@ -103,61 +103,13 @@ export class GitHubCopilotAdapter implements CLIAdapter {
103
103
 
104
104
  // If onOutput callback is provided, use spawn for real-time streaming
105
105
  if (opts.onOutput) {
106
- return new Promise((resolve, reject) => {
107
- const chunks: string[] = [];
108
- const inputStream = fs.open(tmpFile, "r").then((handle) => {
109
- const stream = handle.createReadStream();
110
- return { stream, handle };
111
- });
112
-
113
- inputStream
114
- .then(({ stream, handle }) => {
115
- const child = spawn("copilot", args, {
116
- stdio: ["pipe", "pipe", "pipe"],
117
- });
118
-
119
- stream.pipe(child.stdin);
120
-
121
- let timeoutId: ReturnType<typeof setTimeout> | undefined;
122
- if (opts.timeoutMs) {
123
- timeoutId = setTimeout(() => {
124
- child.kill("SIGTERM");
125
- reject(new Error("Command timed out"));
126
- }, opts.timeoutMs);
127
- }
128
-
129
- child.stdout.on("data", (data: Buffer) => {
130
- const chunk = data.toString();
131
- chunks.push(chunk);
132
- opts.onOutput?.(chunk);
133
- });
134
-
135
- child.stderr.on("data", (data: Buffer) => {
136
- // Only log stderr, don't include in return value
137
- opts.onOutput?.(data.toString());
138
- });
139
-
140
- child.on("close", (code) => {
141
- if (timeoutId) clearTimeout(timeoutId);
142
- handle.close().catch(() => {});
143
- cleanup().then(() => {
144
- if (code === 0 || code === null) {
145
- resolve(chunks.join(""));
146
- } else {
147
- reject(new Error(`Process exited with code ${code}`));
148
- }
149
- });
150
- });
151
-
152
- child.on("error", (err) => {
153
- if (timeoutId) clearTimeout(timeoutId);
154
- handle.close().catch(() => {});
155
- cleanup().then(() => reject(err));
156
- });
157
- })
158
- .catch((err) => {
159
- cleanup().then(() => reject(err));
160
- });
106
+ return runStreamingCommand({
107
+ command: "copilot",
108
+ args,
109
+ tmpFile,
110
+ timeoutMs: opts.timeoutMs,
111
+ onOutput: opts.onOutput,
112
+ cleanup,
161
113
  });
162
114
  }
163
115
 
@@ -1,9 +1,137 @@
1
+ import { type ChildProcess, spawn } from "node:child_process";
2
+ import type { FileHandle } from "node:fs/promises";
3
+ import fs from "node:fs/promises";
4
+
1
5
  export interface CLIAdapterHealth {
2
6
  available: boolean;
3
7
  status: "healthy" | "missing" | "unhealthy";
4
8
  message?: string;
5
9
  }
6
10
 
11
/**
 * Collects stderr from a child process and returns a getter for the accumulated output.
 * Also forwards each decoded chunk to the optional `onOutput` callback.
 *
 * Chunks are decoded with a streaming UTF-8 decoder, so a multi-byte character
 * that is split across two `data` events is reassembled instead of being
 * turned into replacement characters.
 *
 * @param child - Process whose `stderr` stream (if any) is observed.
 * @param onOutput - Optional callback invoked with each non-empty decoded chunk.
 * @returns A getter for everything collected so far. Calling it flushes any
 *   incomplete trailing byte sequence, so it is meant to be called once the
 *   stream has finished (e.g. after the process has closed).
 */
export function collectStderr(
  child: ChildProcess,
  onOutput?: (text: string) => void,
): () => string {
  const chunks: string[] = [];
  // ignoreBOM keeps a leading U+FEFF in the output, matching Buffer#toString().
  const decoder = new TextDecoder("utf-8", { ignoreBOM: true });

  child.stderr?.on("data", (data: Buffer | string) => {
    // The stream yields strings if an encoding was set on it; pass those through.
    const text =
      typeof data === "string" ? data : decoder.decode(data, { stream: true });
    // A chunk holding only the first bytes of a character decodes to "".
    if (text === "") return;
    chunks.push(text);
    onOutput?.(text);
  });

  return () => {
    // Flush bytes still buffered in the decoder (an incomplete trailing sequence).
    const tail = decoder.decode();
    if (tail !== "") chunks.push(tail);
    return chunks.join("");
  };
}
27
+
28
/**
 * Creates the Error reported when a process exits with a failing status.
 *
 * The message starts with `Process exited with code <code>` and is followed,
 * each on its own line, by the captured stdout and then stderr when they are
 * non-empty. Both streams are kept so that usage-limit messages survive no
 * matter which stream the CLI printed them to.
 *
 * @param code - Exit code reported for the process.
 * @param getStderr - Getter for the captured stderr text.
 * @param getStdout - Optional getter for the captured stdout text.
 * @returns An Error carrying the exit code and any captured output.
 */
export function processExitError(
  code: number | null,
  getStderr: () => string,
  getStdout?: () => string,
): Error {
  const stderrText = getStderr();
  const stdoutText = getStdout ? getStdout() : "";

  let message = `Process exited with code ${code}`;
  for (const section of [stdoutText, stderrText]) {
    if (section) {
      message += `\n${section}`;
    }
  }
  return new Error(message);
}
45
+
46
/**
 * Spawns `opts.command` with `opts.args`, feeds it the contents of
 * `opts.tmpFile` on stdin, and streams its stdout/stderr to `opts.onOutput`
 * as the data arrives.
 *
 * @param opts.command - Executable to spawn.
 * @param opts.args - Arguments passed to the executable.
 * @param opts.tmpFile - Path of the file piped to the child's stdin.
 * @param opts.timeoutMs - When set (non-zero), the child is sent SIGTERM after
 *   this many milliseconds and the promise rejects with "Command timed out".
 * @param opts.onOutput - Receives every stdout and stderr chunk.
 * @param opts.cleanup - Invoked once the process has closed or failed to start.
 * @param opts.env - Environment for the child; passed straight to `spawn`.
 * @returns The concatenated stdout once the process closes with exit code 0
 *   (or a null exit code).
 * @throws Rejects with a process-exit error for a non-zero exit code, with the
 *   spawn/stream error if the child cannot be started or its input cannot be
 *   read, or with a timeout error.
 */
export async function runStreamingCommand(opts: {
  command: string;
  args: string[];
  tmpFile: string;
  timeoutMs?: number;
  onOutput?: (chunk: string) => void;
  cleanup: () => Promise<void>;
  env?: NodeJS.ProcessEnv;
}): Promise<string> {
  return new Promise((resolve, reject) => {
    const chunks: string[] = [];

    // Reject with the original error whether or not cleanup itself succeeds;
    // otherwise a failing cleanup would leave this promise pending forever.
    const rejectAfterCleanup = (err: unknown): void => {
      const fail = (): void => reject(err);
      opts.cleanup().then(fail, fail);
    };

    fs.open(opts.tmpFile, "r")
      .then((handle) => {
        let timeoutId: ReturnType<typeof setTimeout> | undefined;
        try {
          const child = spawn(opts.command, opts.args, {
            stdio: ["pipe", "pipe", "pipe"],
            env: opts.env,
          });

          // Decode at the stream level so multi-byte UTF-8 characters split
          // across chunk boundaries are not corrupted.
          child.stdout.setEncoding("utf8");
          child.stderr.setEncoding("utf8");

          // A child that exits before draining stdin raises EPIPE on the pipe.
          // Without a listener that would be an uncaught exception; the exit
          // code reported on "close" already describes the failure.
          child.stdin.on("error", () => {});

          const input = handle.createReadStream();
          input.on("error", (err) => {
            // The child would otherwise wait on stdin indefinitely.
            child.kill("SIGTERM");
            reject(err);
          });
          input.pipe(child.stdin);

          if (opts.timeoutMs) {
            timeoutId = setTimeout(() => {
              child.kill("SIGTERM");
              reject(new Error("Command timed out"));
            }, opts.timeoutMs);
          }

          child.stdout.on("data", (chunk: string) => {
            chunks.push(chunk);
            opts.onOutput?.(chunk);
          });

          const getStderr = collectStderr(child, opts.onOutput);

          child.on("close", (code) => {
            // Propagate a failure during finalization instead of dropping it,
            // so the caller is never left with a promise that cannot settle.
            finalizeProcessClose({
              code,
              timeoutId,
              handle,
              cleanup: opts.cleanup,
              chunks,
              getStderr,
              resolve,
              reject,
            }).catch(reject);
          });

          child.on("error", (err) => {
            if (timeoutId) clearTimeout(timeoutId);
            handle.close().catch(() => {});
            rejectAfterCleanup(err);
          });
        } catch (err) {
          // spawn (or listener setup) threw synchronously: release the input
          // file and any pending timer before reporting the failure.
          if (timeoutId) clearTimeout(timeoutId);
          handle.close().catch(() => {});
          throw err;
        }
      })
      .catch(rejectAfterCleanup);
  });
}
111
+
112
/**
 * Finishes a streamed command after its process has closed: clears the
 * timeout, closes the input file handle, runs `cleanup`, and then settles the
 * caller's promise through `resolve` / `reject`.
 *
 * The promise is always settled, even when `cleanup` fails:
 * - non-zero exit code: rejects with the process-exit error (stdout + stderr);
 * - exit code 0 or null, but `cleanup` failed: rejects with the cleanup error;
 * - otherwise: resolves with the concatenated stdout chunks.
 *
 * @param opts.code - Exit code from the process "close" event.
 * @param opts.timeoutId - Pending timeout to cancel, if any.
 * @param opts.handle - File handle of the stdin input file; close errors are ignored.
 * @param opts.cleanup - Caller-supplied cleanup routine.
 * @param opts.chunks - Collected stdout chunks.
 * @param opts.getStderr - Getter for the collected stderr text.
 * @param opts.resolve - Resolver of the caller's promise.
 * @param opts.reject - Rejecter of the caller's promise.
 */
export async function finalizeProcessClose(opts: {
  code: number | null;
  timeoutId?: ReturnType<typeof setTimeout>;
  handle: FileHandle;
  cleanup: () => Promise<void>;
  chunks: string[];
  getStderr: () => string;
  resolve: (value: string) => void;
  reject: (error: Error) => void;
}): Promise<void> {
  if (opts.timeoutId) clearTimeout(opts.timeoutId);
  await opts.handle.close().catch(() => {});

  // Capture a cleanup failure rather than letting it escape: if it propagated
  // from here, neither resolve nor reject would run and the caller would hang.
  let cleanupError: Error | undefined;
  try {
    await opts.cleanup();
  } catch (err: unknown) {
    cleanupError = err instanceof Error ? err : new Error(String(err));
  }

  if (opts.code !== 0 && opts.code !== null) {
    // The process failure is the more informative error, so it takes precedence.
    opts.reject(
      processExitError(opts.code, opts.getStderr, () => opts.chunks.join("")),
    );
    return;
  }

  if (cleanupError) {
    opts.reject(cleanupError);
    return;
  }

  opts.resolve(opts.chunks.join(""));
}
134
+
7
135
  export function isUsageLimit(output: string): boolean {
8
136
  const lower = output.toLowerCase();
9
137
  return (
@@ -7,12 +7,13 @@ import {
7
7
  initDebugLogger,
8
8
  mergeDebugLogConfig,
9
9
  } from "../utils/debug-log.js";
10
+ import { deleteExecutionState } from "../utils/execution-state.js";
10
11
  import { acquireLock, cleanLogs, releaseLock } from "./shared.js";
11
12
 
12
13
  export function registerCleanCommand(program: Command): void {
13
14
  program
14
15
  .command("clean")
15
- .description("Archive logs (move current logs into previous/)")
16
+ .description("Archive logs and reset execution state")
16
17
  .action(async () => {
17
18
  let config: Awaited<ReturnType<typeof loadConfig>> | undefined;
18
19
  let lockAcquired = false;
@@ -27,14 +28,17 @@ export function registerCleanCommand(program: Command): void {
27
28
  );
28
29
  initDebugLogger(config.project.log_dir, debugLogConfig);
29
30
 
30
- // Log the command invocation
31
+ // Acquire lock BEFORE logging - prevents clean from running during active gauntlet run
32
+ await acquireLock(config.project.log_dir);
33
+ lockAcquired = true;
34
+
35
+ // Log the command invocation (only after lock acquired)
31
36
  const debugLogger = getDebugLogger();
32
37
  await debugLogger?.logCommand("clean", []);
33
38
  await debugLogger?.logClean("manual", "user_request");
34
39
 
35
- await acquireLock(config.project.log_dir);
36
- lockAcquired = true;
37
40
  await cleanLogs(config.project.log_dir);
41
+ await deleteExecutionState(config.project.log_dir);
38
42
  await releaseLock(config.project.log_dir);
39
43
  console.log(chalk.green("Logs archived successfully."));
40
44
  } catch (error: unknown) {
@@ -10,3 +10,4 @@ export { registerReviewCommand } from "./review.js";
10
10
  export { registerRunCommand } from "./run.js";
11
11
  export { registerStopHookCommand } from "./stop-hook.js";
12
12
  export { registerValidateCommand } from "./validate.js";
13
+ export { registerWaitCICommand } from "./wait-ci.js";