agent-gauntlet 0.1.10 → 0.1.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. package/README.md +1 -1
  2. package/package.json +4 -2
  3. package/src/cli-adapters/claude.ts +139 -108
  4. package/src/cli-adapters/codex.ts +141 -117
  5. package/src/cli-adapters/cursor.ts +152 -0
  6. package/src/cli-adapters/gemini.ts +171 -139
  7. package/src/cli-adapters/github-copilot.ts +153 -0
  8. package/src/cli-adapters/index.ts +77 -48
  9. package/src/commands/check.test.ts +24 -20
  10. package/src/commands/check.ts +65 -59
  11. package/src/commands/detect.test.ts +38 -32
  12. package/src/commands/detect.ts +74 -61
  13. package/src/commands/health.test.ts +67 -53
  14. package/src/commands/health.ts +167 -145
  15. package/src/commands/help.test.ts +37 -37
  16. package/src/commands/help.ts +30 -22
  17. package/src/commands/index.ts +9 -9
  18. package/src/commands/init.test.ts +118 -107
  19. package/src/commands/init.ts +514 -417
  20. package/src/commands/list.test.ts +87 -70
  21. package/src/commands/list.ts +28 -24
  22. package/src/commands/rerun.ts +142 -119
  23. package/src/commands/review.test.ts +26 -20
  24. package/src/commands/review.ts +65 -59
  25. package/src/commands/run.test.ts +22 -20
  26. package/src/commands/run.ts +64 -58
  27. package/src/commands/shared.ts +44 -35
  28. package/src/config/loader.test.ts +112 -90
  29. package/src/config/loader.ts +132 -123
  30. package/src/config/schema.ts +49 -47
  31. package/src/config/types.ts +15 -13
  32. package/src/config/validator.ts +521 -454
  33. package/src/core/change-detector.ts +122 -104
  34. package/src/core/entry-point.test.ts +60 -62
  35. package/src/core/entry-point.ts +76 -67
  36. package/src/core/job.ts +69 -59
  37. package/src/core/runner.ts +261 -230
  38. package/src/gates/check.ts +78 -69
  39. package/src/gates/result.ts +7 -7
  40. package/src/gates/review.test.ts +174 -138
  41. package/src/gates/review.ts +716 -561
  42. package/src/index.ts +16 -15
  43. package/src/output/console.ts +253 -214
  44. package/src/output/logger.ts +64 -52
  45. package/src/templates/run_gauntlet.template.md +18 -0
  46. package/src/utils/diff-parser.ts +64 -62
  47. package/src/utils/log-parser.ts +227 -206
  48. package/src/utils/sanitizer.ts +1 -1
package/README.md CHANGED
@@ -20,7 +20,7 @@ Agent Gauntlet is designed to be "tool-agnostic" by leveraging the AI CLI tools
20
20
 
21
21
  - **Bun** (Required runtime, v1.0.0+)
22
22
  - **git** (change detection and diffs)
23
- - For review gates: one or more supported AI CLIs installed (`gemini`, `codex`, `claude`). For the full list of tools and how they are used, see [CLI Invocation Details](docs/cli-invocation-details.md)
23
+ - For review gates: one or more supported AI CLIs installed (`gemini`, `codex`, `claude`, `github-copilot`, `cursor`). For the full list of tools and how they are used, see [CLI Invocation Details](docs/cli-invocation-details.md)
24
24
 
25
25
  ### Installation
26
26
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "agent-gauntlet",
3
- "version": "0.1.10",
3
+ "version": "0.1.11",
4
4
  "description": "A CLI tool for testing AI coding agents",
5
5
  "license": "Apache-2.0",
6
6
  "author": "Paul Caplan",
@@ -33,9 +33,11 @@
33
33
  },
34
34
  "scripts": {
35
35
  "build": "bun build --compile --minify --sourcemap ./src/index.ts --outfile bin/agent-gauntlet",
36
- "test": "bun test"
36
+ "test": "bun test",
37
+ "lint": "biome check src"
37
38
  },
38
39
  "devDependencies": {
40
+ "@biomejs/biome": "^2.3.11",
39
41
  "@types/bun": "latest"
40
42
  },
41
43
  "peerDependencies": {
package/src/cli-adapters/claude.ts CHANGED
@@ -1,114 +1,145 @@
1
- import { exec } from 'node:child_process';
2
- import { promisify } from 'node:util';
3
- import { type CLIAdapter, isUsageLimit } from './index.js';
4
- import fs from 'node:fs/promises';
5
- import path from 'node:path';
6
- import os from 'node:os';
1
+ import { exec } from "node:child_process";
2
+ import fs from "node:fs/promises";
3
+ import os from "node:os";
4
+ import path from "node:path";
5
+ import { promisify } from "node:util";
6
+ import { type CLIAdapter, isUsageLimit } from "./index.js";
7
7
 
8
8
  const execAsync = promisify(exec);
9
9
  const MAX_BUFFER_BYTES = 10 * 1024 * 1024;
10
10
 
11
11
  export class ClaudeAdapter implements CLIAdapter {
12
- name = 'claude';
13
-
14
- async isAvailable(): Promise<boolean> {
15
- try {
16
- await execAsync('which claude');
17
- return true;
18
- } catch {
19
- return false;
20
- }
21
- }
22
-
23
- async checkHealth(options?: { checkUsageLimit?: boolean }): Promise<{ available: boolean; status: 'healthy' | 'missing' | 'unhealthy'; message?: string }> {
24
- const available = await this.isAvailable();
25
- if (!available) {
26
- return { available: false, status: 'missing', message: 'Command not found' };
27
- }
28
-
29
- if (options?.checkUsageLimit) {
30
- try {
31
- // Try a lightweight command to check if we're rate limited
32
- // We use a simple "hello" prompt to avoid "No messages returned" errors from empty input
33
- const { stdout, stderr } = await execAsync('echo "hello" | claude -p --max-turns 1', { timeout: 10000 });
34
-
35
- const combined = (stdout || '') + (stderr || '');
36
- if (isUsageLimit(combined)) {
37
- return {
38
- available: true,
39
- status: 'unhealthy',
40
- message: 'Usage limit exceeded'
41
- };
42
- }
43
-
44
- return { available: true, status: 'healthy', message: 'Ready' };
45
- } catch (error: any) {
46
- const stderr = error.stderr || '';
47
- const stdout = error.stdout || '';
48
- const combined = (stderr + stdout);
49
-
50
- if (isUsageLimit(combined)) {
51
- return {
52
- available: true,
53
- status: 'unhealthy',
54
- message: 'Usage limit exceeded'
55
- };
56
- }
57
-
58
- // Since we sent a valid prompt ("hello"), any other error implies the tool is broken
59
- // Extract a brief error message if possible
60
- const cleanError = combined.split('\n')[0]?.trim() || error.message || 'Command failed';
61
- return {
62
- available: true,
63
- status: 'unhealthy',
64
- message: `Error: ${cleanError}`
65
- };
66
- }
67
- }
68
-
69
- return { available: true, status: 'healthy', message: 'Ready' };
70
- }
71
-
72
- getProjectCommandDir(): string | null {
73
- return '.claude/commands';
74
- }
75
-
76
- getUserCommandDir(): string | null {
77
- // Claude supports user-level commands at ~/.claude/commands
78
- return path.join(os.homedir(), '.claude', 'commands');
79
- }
80
-
81
- getCommandExtension(): string {
82
- return '.md';
83
- }
84
-
85
- canUseSymlink(): boolean {
86
- // Claude uses the same Markdown format as our canonical file
87
- return true;
88
- }
89
-
90
- transformCommand(markdownContent: string): string {
91
- // Claude uses the same Markdown format, no transformation needed
92
- return markdownContent;
93
- }
94
-
95
- async execute(opts: { prompt: string; diff: string; model?: string; timeoutMs?: number }): Promise<string> {
96
- const fullContent = opts.prompt + "\n\n--- DIFF ---\n" + opts.diff;
97
-
98
- const tmpDir = os.tmpdir();
99
- const tmpFile = path.join(tmpDir, `gauntlet-claude-${Date.now()}.txt`);
100
- await fs.writeFile(tmpFile, fullContent);
101
-
102
- try {
103
- // Recommended invocation per spec:
104
- // -p: non-interactive print mode
105
- // --allowedTools: explicitly restricts to read-only tools
106
- // --max-turns: caps agentic turns
107
- const cmd = `cat "${tmpFile}" | claude -p --allowedTools "Read,Glob,Grep" --max-turns 10`;
108
- const { stdout } = await execAsync(cmd, { timeout: opts.timeoutMs, maxBuffer: MAX_BUFFER_BYTES });
109
- return stdout;
110
- } finally {
111
- await fs.unlink(tmpFile).catch(() => {});
112
- }
113
- }
12
+ name = "claude";
13
+
14
+ async isAvailable(): Promise<boolean> {
15
+ try {
16
+ await execAsync("which claude");
17
+ return true;
18
+ } catch {
19
+ return false;
20
+ }
21
+ }
22
+
23
+ async checkHealth(options?: { checkUsageLimit?: boolean }): Promise<{
24
+ available: boolean;
25
+ status: "healthy" | "missing" | "unhealthy";
26
+ message?: string;
27
+ }> {
28
+ const available = await this.isAvailable();
29
+ if (!available) {
30
+ return {
31
+ available: false,
32
+ status: "missing",
33
+ message: "Command not found",
34
+ };
35
+ }
36
+
37
+ if (options?.checkUsageLimit) {
38
+ try {
39
+ // Try a lightweight command to check if we're rate limited
40
+ // We use a simple "hello" prompt to avoid "No messages returned" errors from empty input
41
+ const { stdout, stderr } = await execAsync(
42
+ 'echo "hello" | claude -p --max-turns 1',
43
+ { timeout: 10000 },
44
+ );
45
+
46
+ const combined = (stdout || "") + (stderr || "");
47
+ if (isUsageLimit(combined)) {
48
+ return {
49
+ available: true,
50
+ status: "unhealthy",
51
+ message: "Usage limit exceeded",
52
+ };
53
+ }
54
+
55
+ return { available: true, status: "healthy", message: "Ready" };
56
+ } catch (error: unknown) {
57
+ const execError = error as {
58
+ stderr?: string;
59
+ stdout?: string;
60
+ message?: string;
61
+ };
62
+ const stderr = execError.stderr || "";
63
+ const stdout = execError.stdout || "";
64
+ const combined = stderr + stdout;
65
+
66
+ if (isUsageLimit(combined)) {
67
+ return {
68
+ available: true,
69
+ status: "unhealthy",
70
+ message: "Usage limit exceeded",
71
+ };
72
+ }
73
+
74
+ // Since we sent a valid prompt ("hello"), any other error implies the tool is broken
75
+ // Extract a brief error message if possible
76
+ const cleanError =
77
+ combined.split("\n")[0]?.trim() ||
78
+ execError.message ||
79
+ "Command failed";
80
+ return {
81
+ available: true,
82
+ status: "unhealthy",
83
+ message: `Error: ${cleanError}`,
84
+ };
85
+ }
86
+ }
87
+
88
+ return { available: true, status: "healthy", message: "Ready" };
89
+ }
90
+
91
+ getProjectCommandDir(): string | null {
92
+ return ".claude/commands";
93
+ }
94
+
95
+ getUserCommandDir(): string | null {
96
+ // Claude supports user-level commands at ~/.claude/commands
97
+ return path.join(os.homedir(), ".claude", "commands");
98
+ }
99
+
100
+ getCommandExtension(): string {
101
+ return ".md";
102
+ }
103
+
104
+ canUseSymlink(): boolean {
105
+ // Claude uses the same Markdown format as our canonical file
106
+ return true;
107
+ }
108
+
109
+ transformCommand(markdownContent: string): string {
110
+ // Claude uses the same Markdown format, no transformation needed
111
+ return markdownContent;
112
+ }
113
+
114
+ async execute(opts: {
115
+ prompt: string;
116
+ diff: string;
117
+ model?: string;
118
+ timeoutMs?: number;
119
+ }): Promise<string> {
120
+ const fullContent = `${opts.prompt}\n\n--- DIFF ---\n${opts.diff}`;
121
+
122
+ const tmpDir = os.tmpdir();
123
+ // Include process.pid for uniqueness across concurrent processes
124
+ const tmpFile = path.join(
125
+ tmpDir,
126
+ `gauntlet-claude-${process.pid}-${Date.now()}.txt`,
127
+ );
128
+ await fs.writeFile(tmpFile, fullContent);
129
+
130
+ try {
131
+ // Recommended invocation per spec:
132
+ // -p: non-interactive print mode
133
+ // --allowedTools: explicitly restricts to read-only tools
134
+ // --max-turns: caps agentic turns
135
+ const cmd = `cat "${tmpFile}" | claude -p --allowedTools "Read,Glob,Grep" --max-turns 10`;
136
+ const { stdout } = await execAsync(cmd, {
137
+ timeout: opts.timeoutMs,
138
+ maxBuffer: MAX_BUFFER_BYTES,
139
+ });
140
+ return stdout;
141
+ } finally {
142
+ await fs.unlink(tmpFile).catch(() => {});
143
+ }
144
+ }
114
145
  }
package/src/cli-adapters/codex.ts CHANGED
@@ -1,123 +1,147 @@
1
- import { exec } from 'node:child_process';
2
- import { promisify } from 'node:util';
3
- import { type CLIAdapter, isUsageLimit } from './index.js';
4
- import fs from 'node:fs/promises';
5
- import path from 'node:path';
6
- import os from 'node:os';
1
+ import { exec } from "node:child_process";
2
+ import fs from "node:fs/promises";
3
+ import os from "node:os";
4
+ import path from "node:path";
5
+ import { promisify } from "node:util";
6
+ import { type CLIAdapter, isUsageLimit } from "./index.js";
7
7
 
8
8
  const execAsync = promisify(exec);
9
9
  const MAX_BUFFER_BYTES = 10 * 1024 * 1024;
10
10
 
11
11
  export class CodexAdapter implements CLIAdapter {
12
- name = 'codex';
13
-
14
- async isAvailable(): Promise<boolean> {
15
- try {
16
- await execAsync('which codex');
17
- return true;
18
- } catch {
19
- return false;
20
- }
21
- }
22
-
23
- async checkHealth(options?: { checkUsageLimit?: boolean }): Promise<{ available: boolean; status: 'healthy' | 'missing' | 'unhealthy'; message?: string }> {
24
- const available = await this.isAvailable();
25
- if (!available) {
26
- return { available: false, status: 'missing', message: 'Command not found' };
27
- }
28
-
29
- if (options?.checkUsageLimit) {
30
- try {
31
- const repoRoot = process.cwd();
32
- // Try a lightweight command to check if we're rate limited
33
- const cmd = `echo "hello" | codex exec --cd "${repoRoot}" --sandbox read-only -c 'ask_for_approval="never"' -`;
34
- const { stdout, stderr } = await execAsync(cmd, { timeout: 10000 });
35
-
36
- const combined = (stdout || '') + (stderr || '');
37
- if (isUsageLimit(combined)) {
38
- return {
39
- available: true,
40
- status: 'unhealthy',
41
- message: 'Usage limit exceeded'
42
- };
43
- }
44
-
45
- return { available: true, status: 'healthy', message: 'Installed' };
46
- } catch (error: any) {
47
- const stderr = error.stderr || '';
48
- const stdout = error.stdout || '';
49
- const combined = (stderr + stdout);
50
-
51
- if (isUsageLimit(combined)) {
52
- return {
53
- available: true,
54
- status: 'unhealthy',
55
- message: 'Usage limit exceeded'
56
- };
57
- }
58
-
59
- const cleanError = combined.split('\n')[0]?.trim() || error.message || 'Command failed';
60
- return {
61
- available: true,
62
- status: 'unhealthy',
63
- message: `Error: ${cleanError}`
64
- };
65
- }
66
- }
67
-
68
- return {
69
- available,
70
- status: available ? 'healthy' : 'missing',
71
- message: available ? 'Installed' : 'Command not found'
72
- };
73
- }
74
-
75
- getProjectCommandDir(): string | null {
76
- // Codex only supports user-level prompts at ~/.codex/prompts/
77
- // No project-scoped commands available
78
- return null;
79
- }
80
-
81
- getUserCommandDir(): string | null {
82
- // Codex uses user-level prompts at ~/.codex/prompts/
83
- return path.join(os.homedir(), '.codex', 'prompts');
84
- }
85
-
86
- getCommandExtension(): string {
87
- return '.md';
88
- }
89
-
90
- canUseSymlink(): boolean {
91
- // Codex uses the same Markdown format as our canonical file
92
- return true;
93
- }
94
-
95
- transformCommand(markdownContent: string): string {
96
- // Codex uses the same Markdown format as Claude, no transformation needed
97
- return markdownContent;
98
- }
99
-
100
- async execute(opts: { prompt: string; diff: string; model?: string; timeoutMs?: number }): Promise<string> {
101
- const fullContent = opts.prompt + "\n\n--- DIFF ---\n" + opts.diff;
102
-
103
- const tmpDir = os.tmpdir();
104
- const tmpFile = path.join(tmpDir, `gauntlet-codex-${Date.now()}.txt`);
105
- await fs.writeFile(tmpFile, fullContent);
106
-
107
- // Get absolute path to repo root (CWD)
108
- const repoRoot = process.cwd();
109
-
110
- try {
111
- // Recommended invocation per spec:
112
- // --cd: sets working directory to repo root
113
- // --sandbox read-only: prevents file modifications
114
- // -c ask_for_approval="never": prevents blocking on prompts
115
- // -: reads prompt from stdin
116
- const cmd = `cat "${tmpFile}" | codex exec --cd "${repoRoot}" --sandbox read-only -c 'ask_for_approval="never"' -`;
117
- const { stdout } = await execAsync(cmd, { timeout: opts.timeoutMs, maxBuffer: MAX_BUFFER_BYTES });
118
- return stdout;
119
- } finally {
120
- await fs.unlink(tmpFile).catch(() => {});
121
- }
122
- }
12
+ name = "codex";
13
+
14
+ async isAvailable(): Promise<boolean> {
15
+ try {
16
+ await execAsync("which codex");
17
+ return true;
18
+ } catch {
19
+ return false;
20
+ }
21
+ }
22
+
23
+ async checkHealth(options?: { checkUsageLimit?: boolean }): Promise<{
24
+ available: boolean;
25
+ status: "healthy" | "missing" | "unhealthy";
26
+ message?: string;
27
+ }> {
28
+ const available = await this.isAvailable();
29
+ if (!available) {
30
+ return {
31
+ available: false,
32
+ status: "missing",
33
+ message: "Command not found",
34
+ };
35
+ }
36
+
37
+ if (options?.checkUsageLimit) {
38
+ try {
39
+ const repoRoot = process.cwd();
40
+ // Try a lightweight command to check if we're rate limited
41
+ const cmd = `echo "hello" | codex exec --cd "${repoRoot}" --sandbox read-only -c 'ask_for_approval="never"' -`;
42
+ const { stdout, stderr } = await execAsync(cmd, { timeout: 10000 });
43
+
44
+ const combined = (stdout || "") + (stderr || "");
45
+ if (isUsageLimit(combined)) {
46
+ return {
47
+ available: true,
48
+ status: "unhealthy",
49
+ message: "Usage limit exceeded",
50
+ };
51
+ }
52
+
53
+ return { available: true, status: "healthy", message: "Installed" };
54
+ } catch (error: unknown) {
55
+ const execError = error as {
56
+ stderr?: string;
57
+ stdout?: string;
58
+ message?: string;
59
+ };
60
+ const stderr = execError.stderr || "";
61
+ const stdout = execError.stdout || "";
62
+ const combined = stderr + stdout;
63
+
64
+ if (isUsageLimit(combined)) {
65
+ return {
66
+ available: true,
67
+ status: "unhealthy",
68
+ message: "Usage limit exceeded",
69
+ };
70
+ }
71
+
72
+ const cleanError =
73
+ combined.split("\n")[0]?.trim() ||
74
+ execError.message ||
75
+ "Command failed";
76
+ return {
77
+ available: true,
78
+ status: "unhealthy",
79
+ message: `Error: ${cleanError}`,
80
+ };
81
+ }
82
+ }
83
+
84
+ return {
85
+ available,
86
+ status: available ? "healthy" : "missing",
87
+ message: available ? "Installed" : "Command not found",
88
+ };
89
+ }
90
+
91
+ getProjectCommandDir(): string | null {
92
+ // Codex only supports user-level prompts at ~/.codex/prompts/
93
+ // No project-scoped commands available
94
+ return null;
95
+ }
96
+
97
+ getUserCommandDir(): string | null {
98
+ // Codex uses user-level prompts at ~/.codex/prompts/
99
+ return path.join(os.homedir(), ".codex", "prompts");
100
+ }
101
+
102
+ getCommandExtension(): string {
103
+ return ".md";
104
+ }
105
+
106
+ canUseSymlink(): boolean {
107
+ // Codex uses the same Markdown format as our canonical file
108
+ return true;
109
+ }
110
+
111
+ transformCommand(markdownContent: string): string {
112
+ // Codex uses the same Markdown format as Claude, no transformation needed
113
+ return markdownContent;
114
+ }
115
+
116
+ async execute(opts: {
117
+ prompt: string;
118
+ diff: string;
119
+ model?: string;
120
+ timeoutMs?: number;
121
+ }): Promise<string> {
122
+ const fullContent = `${opts.prompt}\n\n--- DIFF ---\n${opts.diff}`;
123
+
124
+ const tmpDir = os.tmpdir();
125
+ const tmpFile = path.join(tmpDir, `gauntlet-codex-${Date.now()}.txt`);
126
+ await fs.writeFile(tmpFile, fullContent);
127
+
128
+ // Get absolute path to repo root (CWD)
129
+ const repoRoot = process.cwd();
130
+
131
+ try {
132
+ // Recommended invocation per spec:
133
+ // --cd: sets working directory to repo root
134
+ // --sandbox read-only: prevents file modifications
135
+ // -c ask_for_approval="never": prevents blocking on prompts
136
+ // -: reads prompt from stdin
137
+ const cmd = `cat "${tmpFile}" | codex exec --cd "${repoRoot}" --sandbox read-only -c 'ask_for_approval="never"' -`;
138
+ const { stdout } = await execAsync(cmd, {
139
+ timeout: opts.timeoutMs,
140
+ maxBuffer: MAX_BUFFER_BYTES,
141
+ });
142
+ return stdout;
143
+ } finally {
144
+ await fs.unlink(tmpFile).catch(() => {});
145
+ }
146
+ }
123
147
  }