agent-gauntlet 0.1.10 → 0.1.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. package/README.md +55 -87
  2. package/package.json +4 -2
  3. package/src/bun-plugins.d.ts +4 -0
  4. package/src/cli-adapters/claude.ts +139 -108
  5. package/src/cli-adapters/codex.ts +141 -117
  6. package/src/cli-adapters/cursor.ts +152 -0
  7. package/src/cli-adapters/gemini.ts +171 -139
  8. package/src/cli-adapters/github-copilot.ts +153 -0
  9. package/src/cli-adapters/index.ts +77 -48
  10. package/src/commands/check.test.ts +24 -20
  11. package/src/commands/check.ts +86 -59
  12. package/src/commands/ci/index.ts +15 -0
  13. package/src/commands/ci/init.ts +96 -0
  14. package/src/commands/ci/list-jobs.ts +78 -0
  15. package/src/commands/detect.test.ts +38 -32
  16. package/src/commands/detect.ts +89 -61
  17. package/src/commands/health.test.ts +67 -53
  18. package/src/commands/health.ts +167 -145
  19. package/src/commands/help.test.ts +37 -37
  20. package/src/commands/help.ts +31 -22
  21. package/src/commands/index.ts +10 -9
  22. package/src/commands/init.test.ts +120 -107
  23. package/src/commands/init.ts +514 -417
  24. package/src/commands/list.test.ts +87 -70
  25. package/src/commands/list.ts +28 -24
  26. package/src/commands/rerun.ts +157 -119
  27. package/src/commands/review.test.ts +26 -20
  28. package/src/commands/review.ts +86 -59
  29. package/src/commands/run.test.ts +22 -20
  30. package/src/commands/run.ts +85 -58
  31. package/src/commands/shared.ts +44 -35
  32. package/src/config/ci-loader.ts +33 -0
  33. package/src/config/ci-schema.ts +52 -0
  34. package/src/config/loader.test.ts +112 -90
  35. package/src/config/loader.ts +132 -123
  36. package/src/config/schema.ts +48 -47
  37. package/src/config/types.ts +28 -13
  38. package/src/config/validator.ts +521 -454
  39. package/src/core/change-detector.ts +122 -104
  40. package/src/core/entry-point.test.ts +60 -62
  41. package/src/core/entry-point.ts +120 -74
  42. package/src/core/job.ts +69 -59
  43. package/src/core/runner.ts +264 -230
  44. package/src/gates/check.ts +78 -69
  45. package/src/gates/result.ts +7 -7
  46. package/src/gates/review.test.ts +277 -138
  47. package/src/gates/review.ts +724 -561
  48. package/src/index.ts +18 -15
  49. package/src/output/console.ts +253 -214
  50. package/src/output/logger.ts +66 -52
  51. package/src/templates/run_gauntlet.template.md +18 -0
  52. package/src/templates/workflow.yml +77 -0
  53. package/src/utils/diff-parser.ts +64 -62
  54. package/src/utils/log-parser.ts +227 -206
  55. package/src/utils/sanitizer.ts +1 -1
package/src/cli-adapters/codex.ts
@@ -1,123 +1,147 @@
1
- import { exec } from 'node:child_process';
2
- import { promisify } from 'node:util';
3
- import { type CLIAdapter, isUsageLimit } from './index.js';
4
- import fs from 'node:fs/promises';
5
- import path from 'node:path';
6
- import os from 'node:os';
1
+ import { exec } from "node:child_process";
2
+ import fs from "node:fs/promises";
3
+ import os from "node:os";
4
+ import path from "node:path";
5
+ import { promisify } from "node:util";
6
+ import { type CLIAdapter, isUsageLimit } from "./index.js";
7
7
 
8
8
  const execAsync = promisify(exec);
9
9
  const MAX_BUFFER_BYTES = 10 * 1024 * 1024;
10
10
 
11
11
  export class CodexAdapter implements CLIAdapter {
12
- name = 'codex';
13
-
14
- async isAvailable(): Promise<boolean> {
15
- try {
16
- await execAsync('which codex');
17
- return true;
18
- } catch {
19
- return false;
20
- }
21
- }
22
-
23
- async checkHealth(options?: { checkUsageLimit?: boolean }): Promise<{ available: boolean; status: 'healthy' | 'missing' | 'unhealthy'; message?: string }> {
24
- const available = await this.isAvailable();
25
- if (!available) {
26
- return { available: false, status: 'missing', message: 'Command not found' };
27
- }
28
-
29
- if (options?.checkUsageLimit) {
30
- try {
31
- const repoRoot = process.cwd();
32
- // Try a lightweight command to check if we're rate limited
33
- const cmd = `echo "hello" | codex exec --cd "${repoRoot}" --sandbox read-only -c 'ask_for_approval="never"' -`;
34
- const { stdout, stderr } = await execAsync(cmd, { timeout: 10000 });
35
-
36
- const combined = (stdout || '') + (stderr || '');
37
- if (isUsageLimit(combined)) {
38
- return {
39
- available: true,
40
- status: 'unhealthy',
41
- message: 'Usage limit exceeded'
42
- };
43
- }
44
-
45
- return { available: true, status: 'healthy', message: 'Installed' };
46
- } catch (error: any) {
47
- const stderr = error.stderr || '';
48
- const stdout = error.stdout || '';
49
- const combined = (stderr + stdout);
50
-
51
- if (isUsageLimit(combined)) {
52
- return {
53
- available: true,
54
- status: 'unhealthy',
55
- message: 'Usage limit exceeded'
56
- };
57
- }
58
-
59
- const cleanError = combined.split('\n')[0]?.trim() || error.message || 'Command failed';
60
- return {
61
- available: true,
62
- status: 'unhealthy',
63
- message: `Error: ${cleanError}`
64
- };
65
- }
66
- }
67
-
68
- return {
69
- available,
70
- status: available ? 'healthy' : 'missing',
71
- message: available ? 'Installed' : 'Command not found'
72
- };
73
- }
74
-
75
- getProjectCommandDir(): string | null {
76
- // Codex only supports user-level prompts at ~/.codex/prompts/
77
- // No project-scoped commands available
78
- return null;
79
- }
80
-
81
- getUserCommandDir(): string | null {
82
- // Codex uses user-level prompts at ~/.codex/prompts/
83
- return path.join(os.homedir(), '.codex', 'prompts');
84
- }
85
-
86
- getCommandExtension(): string {
87
- return '.md';
88
- }
89
-
90
- canUseSymlink(): boolean {
91
- // Codex uses the same Markdown format as our canonical file
92
- return true;
93
- }
94
-
95
- transformCommand(markdownContent: string): string {
96
- // Codex uses the same Markdown format as Claude, no transformation needed
97
- return markdownContent;
98
- }
99
-
100
- async execute(opts: { prompt: string; diff: string; model?: string; timeoutMs?: number }): Promise<string> {
101
- const fullContent = opts.prompt + "\n\n--- DIFF ---\n" + opts.diff;
102
-
103
- const tmpDir = os.tmpdir();
104
- const tmpFile = path.join(tmpDir, `gauntlet-codex-${Date.now()}.txt`);
105
- await fs.writeFile(tmpFile, fullContent);
106
-
107
- // Get absolute path to repo root (CWD)
108
- const repoRoot = process.cwd();
109
-
110
- try {
111
- // Recommended invocation per spec:
112
- // --cd: sets working directory to repo root
113
- // --sandbox read-only: prevents file modifications
114
- // -c ask_for_approval="never": prevents blocking on prompts
115
- // -: reads prompt from stdin
116
- const cmd = `cat "${tmpFile}" | codex exec --cd "${repoRoot}" --sandbox read-only -c 'ask_for_approval="never"' -`;
117
- const { stdout } = await execAsync(cmd, { timeout: opts.timeoutMs, maxBuffer: MAX_BUFFER_BYTES });
118
- return stdout;
119
- } finally {
120
- await fs.unlink(tmpFile).catch(() => {});
121
- }
122
- }
12
+ name = "codex";
13
+
14
+ async isAvailable(): Promise<boolean> {
15
+ try {
16
+ await execAsync("which codex");
17
+ return true;
18
+ } catch {
19
+ return false;
20
+ }
21
+ }
22
+
23
+ async checkHealth(options?: { checkUsageLimit?: boolean }): Promise<{
24
+ available: boolean;
25
+ status: "healthy" | "missing" | "unhealthy";
26
+ message?: string;
27
+ }> {
28
+ const available = await this.isAvailable();
29
+ if (!available) {
30
+ return {
31
+ available: false,
32
+ status: "missing",
33
+ message: "Command not found",
34
+ };
35
+ }
36
+
37
+ if (options?.checkUsageLimit) {
38
+ try {
39
+ const repoRoot = process.cwd();
40
+ // Try a lightweight command to check if we're rate limited
41
+ const cmd = `echo "hello" | codex exec --cd "${repoRoot}" --sandbox read-only -c 'ask_for_approval="never"' -`;
42
+ const { stdout, stderr } = await execAsync(cmd, { timeout: 10000 });
43
+
44
+ const combined = (stdout || "") + (stderr || "");
45
+ if (isUsageLimit(combined)) {
46
+ return {
47
+ available: true,
48
+ status: "unhealthy",
49
+ message: "Usage limit exceeded",
50
+ };
51
+ }
52
+
53
+ return { available: true, status: "healthy", message: "Installed" };
54
+ } catch (error: unknown) {
55
+ const execError = error as {
56
+ stderr?: string;
57
+ stdout?: string;
58
+ message?: string;
59
+ };
60
+ const stderr = execError.stderr || "";
61
+ const stdout = execError.stdout || "";
62
+ const combined = stderr + stdout;
63
+
64
+ if (isUsageLimit(combined)) {
65
+ return {
66
+ available: true,
67
+ status: "unhealthy",
68
+ message: "Usage limit exceeded",
69
+ };
70
+ }
71
+
72
+ const cleanError =
73
+ combined.split("\n")[0]?.trim() ||
74
+ execError.message ||
75
+ "Command failed";
76
+ return {
77
+ available: true,
78
+ status: "unhealthy",
79
+ message: `Error: ${cleanError}`,
80
+ };
81
+ }
82
+ }
83
+
84
+ return {
85
+ available,
86
+ status: available ? "healthy" : "missing",
87
+ message: available ? "Installed" : "Command not found",
88
+ };
89
+ }
90
+
91
+ getProjectCommandDir(): string | null {
92
+ // Codex only supports user-level prompts at ~/.codex/prompts/
93
+ // No project-scoped commands available
94
+ return null;
95
+ }
96
+
97
+ getUserCommandDir(): string | null {
98
+ // Codex uses user-level prompts at ~/.codex/prompts/
99
+ return path.join(os.homedir(), ".codex", "prompts");
100
+ }
101
+
102
+ getCommandExtension(): string {
103
+ return ".md";
104
+ }
105
+
106
+ canUseSymlink(): boolean {
107
+ // Codex uses the same Markdown format as our canonical file
108
+ return true;
109
+ }
110
+
111
+ transformCommand(markdownContent: string): string {
112
+ // Codex uses the same Markdown format as Claude, no transformation needed
113
+ return markdownContent;
114
+ }
115
+
116
+ async execute(opts: {
117
+ prompt: string;
118
+ diff: string;
119
+ model?: string;
120
+ timeoutMs?: number;
121
+ }): Promise<string> {
122
+ const fullContent = `${opts.prompt}\n\n--- DIFF ---\n${opts.diff}`;
123
+
124
+ const tmpDir = os.tmpdir();
125
+ const tmpFile = path.join(tmpDir, `gauntlet-codex-${Date.now()}.txt`);
126
+ await fs.writeFile(tmpFile, fullContent);
127
+
128
+ // Get absolute path to repo root (CWD)
129
+ const repoRoot = process.cwd();
130
+
131
+ try {
132
+ // Recommended invocation per spec:
133
+ // --cd: sets working directory to repo root
134
+ // --sandbox read-only: prevents file modifications
135
+ // -c ask_for_approval="never": prevents blocking on prompts
136
+ // -: reads prompt from stdin
137
+ const cmd = `cat "${tmpFile}" | codex exec --cd "${repoRoot}" --sandbox read-only -c 'ask_for_approval="never"' -`;
138
+ const { stdout } = await execAsync(cmd, {
139
+ timeout: opts.timeoutMs,
140
+ maxBuffer: MAX_BUFFER_BYTES,
141
+ });
142
+ return stdout;
143
+ } finally {
144
+ await fs.unlink(tmpFile).catch(() => {});
145
+ }
146
+ }
123
147
  }
package/src/cli-adapters/cursor.ts (new file)
@@ -0,0 +1,152 @@
1
+ import { exec } from "node:child_process";
2
+ import fs from "node:fs/promises";
3
+ import os from "node:os";
4
+ import path from "node:path";
5
+ import { promisify } from "node:util";
6
+ import { type CLIAdapter, isUsageLimit } from "./index.js";
7
+
8
+ const execAsync = promisify(exec);
9
+ const MAX_BUFFER_BYTES = 10 * 1024 * 1024;
10
+
11
+ export class CursorAdapter implements CLIAdapter {
12
+ name = "cursor";
13
+
14
+ async isAvailable(): Promise<boolean> {
15
+ try {
16
+ // Note: Cursor's CLI binary is named "agent", not "cursor"
17
+ await execAsync("which agent");
18
+ return true;
19
+ } catch {
20
+ return false;
21
+ }
22
+ }
23
+
24
+ async checkHealth(options?: { checkUsageLimit?: boolean }): Promise<{
25
+ available: boolean;
26
+ status: "healthy" | "missing" | "unhealthy";
27
+ message?: string;
28
+ }> {
29
+ const available = await this.isAvailable();
30
+ if (!available) {
31
+ return {
32
+ available: false,
33
+ status: "missing",
34
+ message: "Command not found",
35
+ };
36
+ }
37
+
38
+ if (options?.checkUsageLimit) {
39
+ try {
40
+ // Try a lightweight command to check if we're rate limited
41
+ const { stdout, stderr } = await execAsync('echo "hello" | agent', {
42
+ timeout: 10000,
43
+ });
44
+
45
+ const combined = (stdout || "") + (stderr || "");
46
+ if (isUsageLimit(combined)) {
47
+ return {
48
+ available: true,
49
+ status: "unhealthy",
50
+ message: "Usage limit exceeded",
51
+ };
52
+ }
53
+
54
+ return { available: true, status: "healthy", message: "Ready" };
55
+ } catch (error: unknown) {
56
+ const execError = error as {
57
+ stderr?: string;
58
+ stdout?: string;
59
+ message?: string;
60
+ };
61
+ const stderr = execError.stderr || "";
62
+ const stdout = execError.stdout || "";
63
+ const combined = stderr + stdout;
64
+
65
+ if (isUsageLimit(combined)) {
66
+ return {
67
+ available: true,
68
+ status: "unhealthy",
69
+ message: "Usage limit exceeded",
70
+ };
71
+ }
72
+
73
+ // Since we sent a valid prompt ("hello"), any other error implies the tool is broken
74
+ const cleanError =
75
+ combined.split("\n")[0]?.trim() ||
76
+ execError.message ||
77
+ "Command failed";
78
+ return {
79
+ available: true,
80
+ status: "unhealthy",
81
+ message: `Error: ${cleanError}`,
82
+ };
83
+ }
84
+ }
85
+
86
+ return { available: true, status: "healthy", message: "Ready" };
87
+ }
88
+
89
+ getProjectCommandDir(): string | null {
90
+ // Cursor does not support custom commands
91
+ return null;
92
+ }
93
+
94
+ getUserCommandDir(): string | null {
95
+ // Cursor does not support custom commands
96
+ return null;
97
+ }
98
+
99
+ getCommandExtension(): string {
100
+ return ".md";
101
+ }
102
+
103
+ canUseSymlink(): boolean {
104
+ // Not applicable - no command directory support
105
+ return false;
106
+ }
107
+
108
+ transformCommand(markdownContent: string): string {
109
+ // Not applicable - no command directory support
110
+ return markdownContent;
111
+ }
112
+
113
+ async execute(opts: {
114
+ prompt: string;
115
+ diff: string;
116
+ model?: string;
117
+ timeoutMs?: number;
118
+ }): Promise<string> {
119
+ const fullContent = `${opts.prompt}\n\n--- DIFF ---\n${opts.diff}`;
120
+
121
+ const tmpDir = os.tmpdir();
122
+ // Include process.pid for uniqueness across concurrent processes
123
+ const tmpFile = path.join(
124
+ tmpDir,
125
+ `gauntlet-cursor-${process.pid}-${Date.now()}.txt`,
126
+ );
127
+ await fs.writeFile(tmpFile, fullContent);
128
+
129
+ try {
130
+ // Cursor agent command reads from stdin
131
+ // Note: As of the current version, the Cursor 'agent' CLI does not expose
132
+ // flags for restricting tools or enforcing read-only mode (unlike claude's --allowedTools
133
+ // or codex's --sandbox read-only). The agent is assumed to be repo-scoped and
134
+ // safe for code review use. If Cursor adds such flags in the future, they should
135
+ // be added here for defense-in-depth.
136
+ //
137
+ // Shell command construction: We use exec() with shell piping
138
+ // because the agent requires stdin input. The tmpFile path is system-controlled
139
+ // (os.tmpdir() + Date.now() + process.pid), not user-supplied, eliminating injection risk.
140
+ // Double quotes handle paths with spaces.
141
+ const cmd = `cat "${tmpFile}" | agent`;
142
+ const { stdout } = await execAsync(cmd, {
143
+ timeout: opts.timeoutMs,
144
+ maxBuffer: MAX_BUFFER_BYTES,
145
+ });
146
+ return stdout;
147
+ } finally {
148
+ // Cleanup errors are intentionally ignored - the tmp file will be cleaned up by OS
149
+ await fs.unlink(tmpFile).catch(() => {});
150
+ }
151
+ }
152
+ }