agent-gauntlet 0.1.10 → 0.1.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/package.json +4 -2
- package/src/cli-adapters/claude.ts +139 -108
- package/src/cli-adapters/codex.ts +141 -117
- package/src/cli-adapters/cursor.ts +152 -0
- package/src/cli-adapters/gemini.ts +171 -139
- package/src/cli-adapters/github-copilot.ts +153 -0
- package/src/cli-adapters/index.ts +77 -48
- package/src/commands/check.test.ts +24 -20
- package/src/commands/check.ts +65 -59
- package/src/commands/detect.test.ts +38 -32
- package/src/commands/detect.ts +74 -61
- package/src/commands/health.test.ts +67 -53
- package/src/commands/health.ts +167 -145
- package/src/commands/help.test.ts +37 -37
- package/src/commands/help.ts +30 -22
- package/src/commands/index.ts +9 -9
- package/src/commands/init.test.ts +118 -107
- package/src/commands/init.ts +514 -417
- package/src/commands/list.test.ts +87 -70
- package/src/commands/list.ts +28 -24
- package/src/commands/rerun.ts +142 -119
- package/src/commands/review.test.ts +26 -20
- package/src/commands/review.ts +65 -59
- package/src/commands/run.test.ts +22 -20
- package/src/commands/run.ts +64 -58
- package/src/commands/shared.ts +44 -35
- package/src/config/loader.test.ts +112 -90
- package/src/config/loader.ts +132 -123
- package/src/config/schema.ts +49 -47
- package/src/config/types.ts +15 -13
- package/src/config/validator.ts +521 -454
- package/src/core/change-detector.ts +122 -104
- package/src/core/entry-point.test.ts +60 -62
- package/src/core/entry-point.ts +76 -67
- package/src/core/job.ts +69 -59
- package/src/core/runner.ts +261 -230
- package/src/gates/check.ts +78 -69
- package/src/gates/result.ts +7 -7
- package/src/gates/review.test.ts +174 -138
- package/src/gates/review.ts +716 -561
- package/src/index.ts +16 -15
- package/src/output/console.ts +253 -214
- package/src/output/logger.ts +64 -52
- package/src/templates/run_gauntlet.template.md +18 -0
- package/src/utils/diff-parser.ts +64 -62
- package/src/utils/log-parser.ts +227 -206
- package/src/utils/sanitizer.ts +1 -1
package/README.md
CHANGED
|
@@ -20,7 +20,7 @@ Agent Gauntlet is designed to be "tool-agnostic" by leveraging the AI CLI tools
|
|
|
20
20
|
|
|
21
21
|
- **Bun** (Required runtime, v1.0.0+)
|
|
22
22
|
- **git** (change detection and diffs)
|
|
23
|
-
- For review gates: one or more supported AI CLIs installed (`gemini`, `codex`, `claude`). For the full list of tools and how they are used, see [CLI Invocation Details](docs/cli-invocation-details.md)
|
|
23
|
+
- For review gates: one or more supported AI CLIs installed (`gemini`, `codex`, `claude`, `github-copilot`, `cursor`). For the full list of tools and how they are used, see [CLI Invocation Details](docs/cli-invocation-details.md)
|
|
24
24
|
|
|
25
25
|
### Installation
|
|
26
26
|
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "agent-gauntlet",
|
|
3
|
-
"version": "0.1.10",
|
|
3
|
+
"version": "0.1.11",
|
|
4
4
|
"description": "A CLI tool for testing AI coding agents",
|
|
5
5
|
"license": "Apache-2.0",
|
|
6
6
|
"author": "Paul Caplan",
|
|
@@ -33,9 +33,11 @@
|
|
|
33
33
|
},
|
|
34
34
|
"scripts": {
|
|
35
35
|
"build": "bun build --compile --minify --sourcemap ./src/index.ts --outfile bin/agent-gauntlet",
|
|
36
|
-
"test": "bun test"
|
|
36
|
+
"test": "bun test",
|
|
37
|
+
"lint": "biome check src"
|
|
37
38
|
},
|
|
38
39
|
"devDependencies": {
|
|
40
|
+
"@biomejs/biome": "^2.3.11",
|
|
39
41
|
"@types/bun": "latest"
|
|
40
42
|
},
|
|
41
43
|
"peerDependencies": {
|
|
package/src/cli-adapters/claude.ts
CHANGED
|
@@ -1,114 +1,145 @@
|
|
|
1
|
-
import { exec } from
|
|
2
|
-
import
|
|
3
|
-
import
|
|
4
|
-
import
|
|
5
|
-
import
|
|
6
|
-
import
|
|
1
|
+
import { exec } from "node:child_process";
|
|
2
|
+
import fs from "node:fs/promises";
|
|
3
|
+
import os from "node:os";
|
|
4
|
+
import path from "node:path";
|
|
5
|
+
import { promisify } from "node:util";
|
|
6
|
+
import { type CLIAdapter, isUsageLimit } from "./index.js";
|
|
7
7
|
|
|
8
8
|
// Promise-returning wrapper around child_process.exec; resolves with { stdout, stderr }.
const execAsync = promisify(exec);
|
|
9
9
|
// 10 MiB; passed as exec's `maxBuffer` when capturing the CLI's output in execute().
const MAX_BUFFER_BYTES = 10 * 1024 * 1024;
|
|
10
10
|
|
|
11
11
|
/**
 * CLI adapter that drives the `claude` command-line tool.
 *
 * Every command is run through a shell via `execAsync` and relies on POSIX
 * utilities (`which`, `echo`, `cat`) and pipes.
 */
export class ClaudeAdapter implements CLIAdapter {
  name = "claude";

  /**
   * Quotes a value so it can be interpolated into a POSIX shell command line
   * as a single literal word.
   *
   * The value is wrapped in single quotes and every embedded single quote is
   * rewritten as `'\''`, so characters such as `$`, backticks, `"` and spaces
   * are not interpreted by the shell.
   *
   * @param value - Raw text (typically a filesystem path).
   * @returns The single-quoted form of `value`.
   */
  private static shellQuote(value: string): string {
    return `'${value.replace(/'/g, "'\\''")}'`;
  }

  /**
   * Reports whether a `claude` executable can be located.
   *
   * @returns `true` when `which claude` succeeds, `false` when it fails for
   * any reason.
   */
  async isAvailable(): Promise<boolean> {
    try {
      await execAsync("which claude");
      return true;
    } catch {
      return false;
    }
  }

  /**
   * Checks that the CLI is installed and, optionally, that it can answer a
   * prompt without hitting a usage limit.
   *
   * @param options - When `checkUsageLimit` is truthy, a short "hello" prompt
   * is sent to the CLI (10 second timeout) and its output is inspected with
   * `isUsageLimit`.
   * @returns `missing` when the command is not found; `unhealthy` when the
   * probe reports a usage limit or fails for another reason; `healthy`
   * otherwise (including when no probe was requested).
   */
  async checkHealth(options?: { checkUsageLimit?: boolean }): Promise<{
    available: boolean;
    status: "healthy" | "missing" | "unhealthy";
    message?: string;
  }> {
    if (!(await this.isAvailable())) {
      return {
        available: false,
        status: "missing",
        message: "Command not found",
      };
    }

    // Without an explicit request, being installed is enough to be "healthy".
    if (!options?.checkUsageLimit) {
      return { available: true, status: "healthy", message: "Ready" };
    }

    try {
      // Try a lightweight command to check if we're rate limited.
      // A simple "hello" prompt avoids "No messages returned" errors from empty input.
      const { stdout, stderr } = await execAsync(
        'echo "hello" | claude -p --max-turns 1',
        { timeout: 10000 },
      );

      const combined = (stdout || "") + (stderr || "");
      if (isUsageLimit(combined)) {
        return {
          available: true,
          status: "unhealthy",
          message: "Usage limit exceeded",
        };
      }

      return { available: true, status: "healthy", message: "Ready" };
    } catch (error: unknown) {
      // exec rejections carry the captured streams alongside the message.
      const execError = error as {
        stderr?: string;
        stdout?: string;
        message?: string;
      };
      const combined = (execError.stderr || "") + (execError.stdout || "");

      // The limit notice may also arrive together with a non-zero exit code.
      if (isUsageLimit(combined)) {
        return {
          available: true,
          status: "unhealthy",
          message: "Usage limit exceeded",
        };
      }

      // Since we sent a valid prompt ("hello"), any other error implies the tool is broken.
      // Report the first output line if there is one, else the error message.
      const cleanError =
        combined.split("\n")[0]?.trim() ||
        execError.message ||
        "Command failed";
      return {
        available: true,
        status: "unhealthy",
        message: `Error: ${cleanError}`,
      };
    }
  }

  /** Returns the project-relative directory for Claude command files. */
  getProjectCommandDir(): string | null {
    return ".claude/commands";
  }

  /** Returns the user-level command directory, `~/.claude/commands`. */
  getUserCommandDir(): string | null {
    // Claude supports user-level commands at ~/.claude/commands
    return path.join(os.homedir(), ".claude", "commands");
  }

  /** Returns the file extension used for command files. */
  getCommandExtension(): string {
    return ".md";
  }

  /** Returns `true`: the canonical command file can be linked as-is. */
  canUseSymlink(): boolean {
    // Claude uses the same Markdown format as our canonical file
    return true;
  }

  /**
   * Converts canonical Markdown command content into Claude's format.
   *
   * @param markdownContent - Canonical command file content.
   * @returns The same content, unchanged.
   */
  transformCommand(markdownContent: string): string {
    // Claude uses the same Markdown format, no transformation needed
    return markdownContent;
  }

  /**
   * Sends the prompt followed by the diff to `claude` on stdin and returns
   * what the CLI printed to stdout.
   *
   * The payload is staged in a file inside a freshly created private
   * temporary directory, so concurrent calls (even within the same process
   * and millisecond) never share a file. The directory is removed afterwards
   * whether or not the command succeeded.
   *
   * @param opts.prompt - Instruction text placed before the diff.
   * @param opts.diff - Diff text appended after a `--- DIFF ---` separator.
   * @param opts.model - Not read by this adapter.
   * @param opts.timeoutMs - Forwarded to exec as its `timeout` option.
   * @returns The CLI's stdout.
   * @throws Rejects if the payload cannot be staged or the command fails.
   */
  async execute(opts: {
    prompt: string;
    diff: string;
    model?: string;
    timeoutMs?: number;
  }): Promise<string> {
    const fullContent = `${opts.prompt}\n\n--- DIFF ---\n${opts.diff}`;

    // mkdtemp appends a unique random suffix to the prefix.
    const workDir = await fs.mkdtemp(
      path.join(os.tmpdir(), "gauntlet-claude-"),
    );
    const tmpFile = path.join(workDir, "prompt.txt");

    try {
      await fs.writeFile(tmpFile, fullContent);

      // Recommended invocation per spec:
      // -p: non-interactive print mode
      // --allowedTools: explicitly restricts to read-only tools
      // --max-turns: caps agentic turns
      const cmd = `cat ${ClaudeAdapter.shellQuote(tmpFile)} | claude -p --allowedTools "Read,Glob,Grep" --max-turns 10`;
      const { stdout } = await execAsync(cmd, {
        timeout: opts.timeoutMs,
        maxBuffer: MAX_BUFFER_BYTES,
      });
      return stdout;
    } finally {
      // Best-effort cleanup; a failure here must not mask the real outcome.
      await fs.rm(workDir, { recursive: true, force: true }).catch(() => {});
    }
  }
}
|
|
package/src/cli-adapters/codex.ts
CHANGED
|
@@ -1,123 +1,147 @@
|
|
|
1
|
-
import { exec } from
|
|
2
|
-
import
|
|
3
|
-
import
|
|
4
|
-
import
|
|
5
|
-
import
|
|
6
|
-
import
|
|
1
|
+
import { exec } from "node:child_process";
|
|
2
|
+
import fs from "node:fs/promises";
|
|
3
|
+
import os from "node:os";
|
|
4
|
+
import path from "node:path";
|
|
5
|
+
import { promisify } from "node:util";
|
|
6
|
+
import { type CLIAdapter, isUsageLimit } from "./index.js";
|
|
7
7
|
|
|
8
8
|
// Promise-returning wrapper around child_process.exec; resolves with { stdout, stderr }.
const execAsync = promisify(exec);
|
|
9
9
|
// 10 MiB; passed as exec's `maxBuffer` when capturing the CLI's output in execute().
const MAX_BUFFER_BYTES = 10 * 1024 * 1024;
|
|
10
10
|
|
|
11
11
|
/**
 * CLI adapter that drives the `codex` command-line tool.
 *
 * Every command is run through a shell via `execAsync` and relies on POSIX
 * utilities (`which`, `echo`, `cat`) and pipes.
 */
export class CodexAdapter implements CLIAdapter {
  name = "codex";

  /**
   * Quotes a value so it can be interpolated into a POSIX shell command line
   * as a single literal word.
   *
   * The value is wrapped in single quotes and every embedded single quote is
   * rewritten as `'\''`, so characters such as `$`, backticks, `"` and spaces
   * are not interpreted by the shell.
   *
   * @param value - Raw text (typically a filesystem path).
   * @returns The single-quoted form of `value`.
   */
  private static shellQuote(value: string): string {
    return `'${value.replace(/'/g, "'\\''")}'`;
  }

  /**
   * Reports whether a `codex` executable can be located.
   *
   * @returns `true` when `which codex` succeeds, `false` when it fails for
   * any reason.
   */
  async isAvailable(): Promise<boolean> {
    try {
      await execAsync("which codex");
      return true;
    } catch {
      return false;
    }
  }

  /**
   * Checks that the CLI is installed and, optionally, that it can answer a
   * prompt without hitting a usage limit.
   *
   * @param options - When `checkUsageLimit` is truthy, a short "hello" prompt
   * is sent to the CLI (10 second timeout) and its output is inspected with
   * `isUsageLimit`.
   * @returns `missing` when the command is not found; `unhealthy` when the
   * probe reports a usage limit or fails for another reason; `healthy`
   * otherwise (including when no probe was requested).
   */
  async checkHealth(options?: { checkUsageLimit?: boolean }): Promise<{
    available: boolean;
    status: "healthy" | "missing" | "unhealthy";
    message?: string;
  }> {
    if (!(await this.isAvailable())) {
      return {
        available: false,
        status: "missing",
        message: "Command not found",
      };
    }

    // Without an explicit request, being installed is enough to be "healthy".
    if (!options?.checkUsageLimit) {
      return { available: true, status: "healthy", message: "Installed" };
    }

    try {
      // The working directory is quoted so that characters such as `$`,
      // backticks or `"` in its path cannot alter the command.
      const repoRoot = CodexAdapter.shellQuote(process.cwd());
      // Try a lightweight command to check if we're rate limited
      const cmd = `echo "hello" | codex exec --cd ${repoRoot} --sandbox read-only -c 'ask_for_approval="never"' -`;
      const { stdout, stderr } = await execAsync(cmd, { timeout: 10000 });

      const combined = (stdout || "") + (stderr || "");
      if (isUsageLimit(combined)) {
        return {
          available: true,
          status: "unhealthy",
          message: "Usage limit exceeded",
        };
      }

      return { available: true, status: "healthy", message: "Installed" };
    } catch (error: unknown) {
      // exec rejections carry the captured streams alongside the message.
      const execError = error as {
        stderr?: string;
        stdout?: string;
        message?: string;
      };
      const combined = (execError.stderr || "") + (execError.stdout || "");

      // The limit notice may also arrive together with a non-zero exit code.
      if (isUsageLimit(combined)) {
        return {
          available: true,
          status: "unhealthy",
          message: "Usage limit exceeded",
        };
      }

      // Report the first output line if there is one, else the error message.
      const cleanError =
        combined.split("\n")[0]?.trim() ||
        execError.message ||
        "Command failed";
      return {
        available: true,
        status: "unhealthy",
        message: `Error: ${cleanError}`,
      };
    }
  }

  /** Returns `null`: no project-scoped command directory is used for Codex. */
  getProjectCommandDir(): string | null {
    // Codex only supports user-level prompts at ~/.codex/prompts/
    // No project-scoped commands available
    return null;
  }

  /** Returns the user-level prompt directory, `~/.codex/prompts`. */
  getUserCommandDir(): string | null {
    // Codex uses user-level prompts at ~/.codex/prompts/
    return path.join(os.homedir(), ".codex", "prompts");
  }

  /** Returns the file extension used for command files. */
  getCommandExtension(): string {
    return ".md";
  }

  /** Returns `true`: the canonical command file can be linked as-is. */
  canUseSymlink(): boolean {
    // Codex uses the same Markdown format as our canonical file
    return true;
  }

  /**
   * Converts canonical Markdown command content into Codex's format.
   *
   * @param markdownContent - Canonical command file content.
   * @returns The same content, unchanged.
   */
  transformCommand(markdownContent: string): string {
    // Codex uses the same Markdown format as Claude, no transformation needed
    return markdownContent;
  }

  /**
   * Sends the prompt followed by the diff to `codex exec` on stdin and returns
   * what the CLI printed to stdout.
   *
   * The payload is staged in a file inside a freshly created private
   * temporary directory, so concurrent calls (from this or another process)
   * never share a file. The directory is removed afterwards whether or not
   * the command succeeded.
   *
   * @param opts.prompt - Instruction text placed before the diff.
   * @param opts.diff - Diff text appended after a `--- DIFF ---` separator.
   * @param opts.model - Not read by this adapter.
   * @param opts.timeoutMs - Forwarded to exec as its `timeout` option.
   * @returns The CLI's stdout.
   * @throws Rejects if the payload cannot be staged or the command fails.
   */
  async execute(opts: {
    prompt: string;
    diff: string;
    model?: string;
    timeoutMs?: number;
  }): Promise<string> {
    const fullContent = `${opts.prompt}\n\n--- DIFF ---\n${opts.diff}`;

    // mkdtemp appends a unique random suffix to the prefix.
    const workDir = await fs.mkdtemp(path.join(os.tmpdir(), "gauntlet-codex-"));
    const tmpFile = path.join(workDir, "prompt.txt");

    try {
      await fs.writeFile(tmpFile, fullContent);

      // Absolute path to repo root (CWD), quoted for the shell.
      const repoRoot = CodexAdapter.shellQuote(process.cwd());

      // Recommended invocation per spec:
      // --cd: sets working directory to repo root
      // --sandbox read-only: prevents file modifications
      // -c ask_for_approval="never": prevents blocking on prompts
      // -: reads prompt from stdin
      const cmd = `cat ${CodexAdapter.shellQuote(tmpFile)} | codex exec --cd ${repoRoot} --sandbox read-only -c 'ask_for_approval="never"' -`;
      const { stdout } = await execAsync(cmd, {
        timeout: opts.timeoutMs,
        maxBuffer: MAX_BUFFER_BYTES,
      });
      return stdout;
    } finally {
      // Best-effort cleanup; a failure here must not mask the real outcome.
      await fs.rm(workDir, { recursive: true, force: true }).catch(() => {});
    }
  }
}
|