agent-gauntlet 0.1.10 → 0.1.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +55 -87
- package/package.json +4 -2
- package/src/bun-plugins.d.ts +4 -0
- package/src/cli-adapters/claude.ts +139 -108
- package/src/cli-adapters/codex.ts +141 -117
- package/src/cli-adapters/cursor.ts +152 -0
- package/src/cli-adapters/gemini.ts +171 -139
- package/src/cli-adapters/github-copilot.ts +153 -0
- package/src/cli-adapters/index.ts +77 -48
- package/src/commands/check.test.ts +24 -20
- package/src/commands/check.ts +86 -59
- package/src/commands/ci/index.ts +15 -0
- package/src/commands/ci/init.ts +96 -0
- package/src/commands/ci/list-jobs.ts +78 -0
- package/src/commands/detect.test.ts +38 -32
- package/src/commands/detect.ts +89 -61
- package/src/commands/health.test.ts +67 -53
- package/src/commands/health.ts +167 -145
- package/src/commands/help.test.ts +37 -37
- package/src/commands/help.ts +31 -22
- package/src/commands/index.ts +10 -9
- package/src/commands/init.test.ts +120 -107
- package/src/commands/init.ts +514 -417
- package/src/commands/list.test.ts +87 -70
- package/src/commands/list.ts +28 -24
- package/src/commands/rerun.ts +157 -119
- package/src/commands/review.test.ts +26 -20
- package/src/commands/review.ts +86 -59
- package/src/commands/run.test.ts +22 -20
- package/src/commands/run.ts +85 -58
- package/src/commands/shared.ts +44 -35
- package/src/config/ci-loader.ts +33 -0
- package/src/config/ci-schema.ts +52 -0
- package/src/config/loader.test.ts +112 -90
- package/src/config/loader.ts +132 -123
- package/src/config/schema.ts +48 -47
- package/src/config/types.ts +28 -13
- package/src/config/validator.ts +521 -454
- package/src/core/change-detector.ts +122 -104
- package/src/core/entry-point.test.ts +60 -62
- package/src/core/entry-point.ts +120 -74
- package/src/core/job.ts +69 -59
- package/src/core/runner.ts +264 -230
- package/src/gates/check.ts +78 -69
- package/src/gates/result.ts +7 -7
- package/src/gates/review.test.ts +277 -138
- package/src/gates/review.ts +724 -561
- package/src/index.ts +18 -15
- package/src/output/console.ts +253 -214
- package/src/output/logger.ts +66 -52
- package/src/templates/run_gauntlet.template.md +18 -0
- package/src/templates/workflow.yml +77 -0
- package/src/utils/diff-parser.ts +64 -62
- package/src/utils/log-parser.ts +227 -206
- package/src/utils/sanitizer.ts +1 -1
|
@@ -1,123 +1,147 @@
|
|
|
1
|
-
import { exec } from
|
|
2
|
-
import
|
|
3
|
-
import
|
|
4
|
-
import
|
|
5
|
-
import
|
|
6
|
-
import
|
|
1
|
+
import { exec } from "node:child_process";
|
|
2
|
+
import fs from "node:fs/promises";
|
|
3
|
+
import os from "node:os";
|
|
4
|
+
import path from "node:path";
|
|
5
|
+
import { promisify } from "node:util";
|
|
6
|
+
import { type CLIAdapter, isUsageLimit } from "./index.js";
|
|
7
7
|
|
|
8
8
|
// Promise-returning wrapper around child_process.exec; callers in this file
// destructure `{ stdout, stderr }` from the resolved value.
const execAsync = promisify(exec);
|
|
9
9
|
// 10 MiB: passed as `maxBuffer` when running the Codex CLI in `execute`.
const MAX_BUFFER_BYTES = 10 * 1024 * 1024;
|
|
10
10
|
|
|
11
11
|
/**
 * CLI adapter for the `codex` command-line tool.
 *
 * Covers three concerns: detecting whether the binary is installed, probing
 * it for usage-limit errors, and running a review by piping a prompt plus
 * diff into `codex exec` on stdin.
 */
export class CodexAdapter implements CLIAdapter {
  name = "codex";

  /**
   * Quotes a value for interpolation into a POSIX shell command line.
   *
   * Single quotes disable every shell expansion; an embedded single quote is
   * emitted as `'\''` (close quote, escaped quote, reopen quote).
   */
  private static shellQuote(value: string): string {
    return `'${value.replace(/'/g, `'\\''`)}'`;
  }

  /**
   * Reports whether a `codex` executable can be found on the PATH.
   *
   * @returns `true` when `which codex` succeeds, `false` when it fails.
   */
  async isAvailable(): Promise<boolean> {
    try {
      await execAsync("which codex");
      return true;
    } catch {
      return false;
    }
  }

  /**
   * Reports the health of the Codex CLI.
   *
   * @param options.checkUsageLimit When truthy, a small prompt is sent to the
   *   CLI (10 second timeout) and its output is scanned with `isUsageLimit`.
   * @returns `missing` when the binary is not on the PATH; `unhealthy` when
   *   the probe hits a usage limit or fails for any other reason; otherwise
   *   `healthy`.
   */
  async checkHealth(options?: { checkUsageLimit?: boolean }): Promise<{
    available: boolean;
    status: "healthy" | "missing" | "unhealthy";
    message?: string;
  }> {
    if (!(await this.isAvailable())) {
      return {
        available: false,
        status: "missing",
        message: "Command not found",
      };
    }

    // Without the opt-in probe, being installed is all we can attest to.
    if (!options?.checkUsageLimit) {
      return { available: true, status: "healthy", message: "Installed" };
    }

    try {
      // Quote the working directory so characters such as `$`, backticks or
      // quotes in the path are never interpreted by the shell.
      const repoRoot = CodexAdapter.shellQuote(process.cwd());
      // Try a lightweight command to check if we're rate limited
      const cmd = `echo "hello" | codex exec --cd ${repoRoot} --sandbox read-only -c 'ask_for_approval="never"' -`;
      const { stdout, stderr } = await execAsync(cmd, { timeout: 10000 });

      const combined = (stdout || "") + (stderr || "");
      if (isUsageLimit(combined)) {
        return {
          available: true,
          status: "unhealthy",
          message: "Usage limit exceeded",
        };
      }

      return { available: true, status: "healthy", message: "Installed" };
    } catch (error: unknown) {
      // exec rejections carry the captured streams alongside the message.
      const execError = error as {
        stderr?: string;
        stdout?: string;
        message?: string;
      };
      const combined = (execError.stderr || "") + (execError.stdout || "");

      if (isUsageLimit(combined)) {
        return {
          available: true,
          status: "unhealthy",
          message: "Usage limit exceeded",
        };
      }

      // Report only the first output line to keep the status message short.
      const cleanError =
        combined.split("\n")[0]?.trim() ||
        execError.message ||
        "Command failed";
      return {
        available: true,
        status: "unhealthy",
        message: `Error: ${cleanError}`,
      };
    }
  }

  /**
   * @returns Always `null`: Codex only supports user-level prompts at
   *   ~/.codex/prompts/, so there is no project-scoped command directory.
   */
  getProjectCommandDir(): string | null {
    return null;
  }

  /** @returns The user-level prompt directory, `~/.codex/prompts`. */
  getUserCommandDir(): string | null {
    return path.join(os.homedir(), ".codex", "prompts");
  }

  /** @returns The file extension used for command files (`.md`). */
  getCommandExtension(): string {
    return ".md";
  }

  /**
   * @returns `true`: Codex uses the same Markdown format as the canonical
   *   command file, so that file can be linked instead of copied.
   */
  canUseSymlink(): boolean {
    return true;
  }

  /**
   * Returns the command content unchanged; Codex uses the same Markdown
   * format as Claude, so no transformation is needed.
   */
  transformCommand(markdownContent: string): string {
    return markdownContent;
  }

  /**
   * Runs a review by piping the prompt and diff to `codex exec` on stdin.
   *
   * @param opts.prompt Review instructions placed ahead of the diff.
   * @param opts.diff Diff text appended after a `--- DIFF ---` separator.
   * @param opts.model Accepted for interface compatibility; not forwarded to
   *   the CLI by this adapter.
   * @param opts.timeoutMs Passed to exec as its `timeout` option.
   * @returns The captured stdout of the Codex process.
   * @throws Rejects when the temporary file cannot be created or when the
   *   command fails, times out, or exceeds `MAX_BUFFER_BYTES` of output.
   */
  async execute(opts: {
    prompt: string;
    diff: string;
    model?: string;
    timeoutMs?: number;
  }): Promise<string> {
    const fullContent = `${opts.prompt}\n\n--- DIFF ---\n${opts.diff}`;

    // A fresh mkdtemp directory gives every invocation its own uniquely named
    // location, so concurrent reviews (even within the same millisecond) can
    // no longer overwrite each other's prompt file, and the file name is not
    // predictable inside the shared temp directory.
    const tmpDir = await fs.mkdtemp(path.join(os.tmpdir(), "gauntlet-codex-"));
    const tmpFile = path.join(tmpDir, "prompt.txt");

    try {
      // Owner-only mode: the diff may contain unreleased source code.
      await fs.writeFile(tmpFile, fullContent, { mode: 0o600 });

      // Recommended invocation per spec:
      // --cd: sets working directory to repo root (the current directory)
      // --sandbox read-only: prevents file modifications
      // -c ask_for_approval="never": prevents blocking on prompts
      // -: reads prompt from stdin
      const quotedFile = CodexAdapter.shellQuote(tmpFile);
      const quotedRoot = CodexAdapter.shellQuote(process.cwd());
      const cmd = `cat ${quotedFile} | codex exec --cd ${quotedRoot} --sandbox read-only -c 'ask_for_approval="never"' -`;
      const { stdout } = await execAsync(cmd, {
        timeout: opts.timeoutMs,
        maxBuffer: MAX_BUFFER_BYTES,
      });
      return stdout;
    } finally {
      // Best-effort cleanup; a failure here must not mask the real outcome.
      await fs.rm(tmpDir, { recursive: true, force: true }).catch(() => {});
    }
  }
}
|
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
import { exec } from "node:child_process";
|
|
2
|
+
import fs from "node:fs/promises";
|
|
3
|
+
import os from "node:os";
|
|
4
|
+
import path from "node:path";
|
|
5
|
+
import { promisify } from "node:util";
|
|
6
|
+
import { type CLIAdapter, isUsageLimit } from "./index.js";
|
|
7
|
+
|
|
8
|
+
// Promise-returning wrapper around child_process.exec; callers in this file
// destructure `{ stdout, stderr }` from the resolved value.
const execAsync = promisify(exec);
|
|
9
|
+
// 10 MiB: passed as `maxBuffer` when running the Cursor CLI in `execute`.
const MAX_BUFFER_BYTES = 10 * 1024 * 1024;
|
|
10
|
+
|
|
11
|
+
/**
 * CLI adapter for Cursor's command-line tool.
 *
 * Note: Cursor's CLI binary is named "agent", not "cursor". The adapter
 * detects the binary, optionally probes it for usage-limit errors, and runs
 * a review by piping a prompt plus diff into `agent` on stdin.
 */
export class CursorAdapter implements CLIAdapter {
  name = "cursor";

  /**
   * Quotes a value for interpolation into a POSIX shell command line.
   *
   * Single quotes disable every shell expansion; an embedded single quote is
   * emitted as `'\''` (close quote, escaped quote, reopen quote).
   */
  private static shellQuote(value: string): string {
    return `'${value.replace(/'/g, `'\\''`)}'`;
  }

  /**
   * Reports whether an `agent` executable can be found on the PATH.
   *
   * @returns `true` when `which agent` succeeds, `false` when it fails.
   */
  async isAvailable(): Promise<boolean> {
    try {
      await execAsync("which agent");
      return true;
    } catch {
      return false;
    }
  }

  /**
   * Reports the health of the Cursor CLI.
   *
   * @param options.checkUsageLimit When truthy, a small prompt is sent to the
   *   CLI (10 second timeout) and its output is scanned with `isUsageLimit`.
   * @returns `missing` when the binary is not on the PATH; `unhealthy` when
   *   the probe hits a usage limit or fails for any other reason; otherwise
   *   `healthy`.
   */
  async checkHealth(options?: { checkUsageLimit?: boolean }): Promise<{
    available: boolean;
    status: "healthy" | "missing" | "unhealthy";
    message?: string;
  }> {
    if (!(await this.isAvailable())) {
      return {
        available: false,
        status: "missing",
        message: "Command not found",
      };
    }

    // Without the opt-in probe, being installed is all we can attest to.
    if (!options?.checkUsageLimit) {
      return { available: true, status: "healthy", message: "Ready" };
    }

    try {
      // Try a lightweight command to check if we're rate limited
      const { stdout, stderr } = await execAsync('echo "hello" | agent', {
        timeout: 10000,
      });

      const combined = (stdout || "") + (stderr || "");
      if (isUsageLimit(combined)) {
        return {
          available: true,
          status: "unhealthy",
          message: "Usage limit exceeded",
        };
      }

      return { available: true, status: "healthy", message: "Ready" };
    } catch (error: unknown) {
      // exec rejections carry the captured streams alongside the message.
      const execError = error as {
        stderr?: string;
        stdout?: string;
        message?: string;
      };
      const combined = (execError.stderr || "") + (execError.stdout || "");

      if (isUsageLimit(combined)) {
        return {
          available: true,
          status: "unhealthy",
          message: "Usage limit exceeded",
        };
      }

      // Since we sent a valid prompt ("hello"), any other error implies the
      // tool is broken. Report only the first output line to keep it short.
      const cleanError =
        combined.split("\n")[0]?.trim() ||
        execError.message ||
        "Command failed";
      return {
        available: true,
        status: "unhealthy",
        message: `Error: ${cleanError}`,
      };
    }
  }

  /** @returns Always `null`: Cursor does not support custom commands. */
  getProjectCommandDir(): string | null {
    return null;
  }

  /** @returns Always `null`: Cursor does not support custom commands. */
  getUserCommandDir(): string | null {
    return null;
  }

  /** @returns The file extension used for command files (`.md`). */
  getCommandExtension(): string {
    return ".md";
  }

  /** @returns `false`: not applicable, there is no command directory support. */
  canUseSymlink(): boolean {
    return false;
  }

  /**
   * Returns the command content unchanged; not applicable for Cursor because
   * there is no command directory support.
   */
  transformCommand(markdownContent: string): string {
    return markdownContent;
  }

  /**
   * Runs a review by piping the prompt and diff to the `agent` CLI on stdin.
   *
   * @param opts.prompt Review instructions placed ahead of the diff.
   * @param opts.diff Diff text appended after a `--- DIFF ---` separator.
   * @param opts.model Accepted for interface compatibility; not forwarded to
   *   the CLI by this adapter.
   * @param opts.timeoutMs Passed to exec as its `timeout` option.
   * @returns The captured stdout of the agent process.
   * @throws Rejects when the temporary file cannot be created or when the
   *   command fails, times out, or exceeds `MAX_BUFFER_BYTES` of output.
   */
  async execute(opts: {
    prompt: string;
    diff: string;
    model?: string;
    timeoutMs?: number;
  }): Promise<string> {
    const fullContent = `${opts.prompt}\n\n--- DIFF ---\n${opts.diff}`;

    // A fresh mkdtemp directory gives every invocation its own uniquely named
    // location. pid + Date.now() alone still collides when one process starts
    // several reviews within the same millisecond, and a predictable name in
    // the shared temp directory can be pre-created by another local user.
    const tmpDir = await fs.mkdtemp(path.join(os.tmpdir(), "gauntlet-cursor-"));
    const tmpFile = path.join(tmpDir, "prompt.txt");

    try {
      // Owner-only mode: the diff may contain unreleased source code.
      await fs.writeFile(tmpFile, fullContent, { mode: 0o600 });

      // Cursor agent command reads from stdin.
      // Note: As of the current version, the Cursor 'agent' CLI does not expose
      // flags for restricting tools or enforcing read-only mode (unlike claude's
      // --allowedTools or codex's --sandbox read-only). The agent is assumed to
      // be repo-scoped and safe for code review use. If Cursor adds such flags
      // in the future, they should be added here for defense-in-depth.
      //
      // Shell command construction: exec() with shell piping is used because
      // the agent requires stdin input. The temp path derives from os.tmpdir(),
      // which follows the environment, so it is single-quoted rather than
      // trusted; that also covers paths containing spaces.
      const cmd = `cat ${CursorAdapter.shellQuote(tmpFile)} | agent`;
      const { stdout } = await execAsync(cmd, {
        timeout: opts.timeoutMs,
        maxBuffer: MAX_BUFFER_BYTES,
      });
      return stdout;
    } finally {
      // Best-effort cleanup; a failure here must not mask the real outcome.
      await fs.rm(tmpDir, { recursive: true, force: true }).catch(() => {});
    }
  }
}
|