agent-gauntlet 0.1.10 → 0.1.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/package.json +4 -2
- package/src/cli-adapters/claude.ts +139 -108
- package/src/cli-adapters/codex.ts +141 -117
- package/src/cli-adapters/cursor.ts +152 -0
- package/src/cli-adapters/gemini.ts +171 -139
- package/src/cli-adapters/github-copilot.ts +153 -0
- package/src/cli-adapters/index.ts +77 -48
- package/src/commands/check.test.ts +24 -20
- package/src/commands/check.ts +65 -59
- package/src/commands/detect.test.ts +38 -32
- package/src/commands/detect.ts +74 -61
- package/src/commands/health.test.ts +67 -53
- package/src/commands/health.ts +167 -145
- package/src/commands/help.test.ts +37 -37
- package/src/commands/help.ts +30 -22
- package/src/commands/index.ts +9 -9
- package/src/commands/init.test.ts +118 -107
- package/src/commands/init.ts +514 -417
- package/src/commands/list.test.ts +87 -70
- package/src/commands/list.ts +28 -24
- package/src/commands/rerun.ts +142 -119
- package/src/commands/review.test.ts +26 -20
- package/src/commands/review.ts +65 -59
- package/src/commands/run.test.ts +22 -20
- package/src/commands/run.ts +64 -58
- package/src/commands/shared.ts +44 -35
- package/src/config/loader.test.ts +112 -90
- package/src/config/loader.ts +132 -123
- package/src/config/schema.ts +49 -47
- package/src/config/types.ts +15 -13
- package/src/config/validator.ts +521 -454
- package/src/core/change-detector.ts +122 -104
- package/src/core/entry-point.test.ts +60 -62
- package/src/core/entry-point.ts +76 -67
- package/src/core/job.ts +69 -59
- package/src/core/runner.ts +261 -230
- package/src/gates/check.ts +78 -69
- package/src/gates/result.ts +7 -7
- package/src/gates/review.test.ts +174 -138
- package/src/gates/review.ts +716 -561
- package/src/index.ts +16 -15
- package/src/output/console.ts +253 -214
- package/src/output/logger.ts +64 -52
- package/src/templates/run_gauntlet.template.md +18 -0
- package/src/utils/diff-parser.ts +64 -62
- package/src/utils/log-parser.ts +227 -206
- package/src/utils/sanitizer.ts +1 -1
package/src/gates/check.ts
CHANGED
|
@@ -1,82 +1,91 @@
|
|
|
1
|
-
import { exec } from
|
|
2
|
-
import { promisify } from
|
|
3
|
-
import { CheckGateConfig } from
|
|
4
|
-
import { GateResult } from
|
|
1
|
+
import { exec } from "node:child_process";
|
|
2
|
+
import { promisify } from "node:util";
|
|
3
|
+
import type { CheckGateConfig } from "../config/types.js";
|
|
4
|
+
import type { GateResult } from "./result.js";
|
|
5
5
|
|
|
6
6
|
const execAsync = promisify(exec);
|
|
7
7
|
const MAX_BUFFER_BYTES = 10 * 1024 * 1024;
|
|
8
8
|
|
|
9
9
|
export class CheckGateExecutor {
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
try {
|
|
19
|
-
await logger(`[${new Date().toISOString()}] Starting check: ${config.name}\n`);
|
|
20
|
-
await logger(`Executing command: ${config.command}\n`);
|
|
21
|
-
await logger(`Working directory: ${workingDirectory}\n\n`);
|
|
10
|
+
async execute(
|
|
11
|
+
jobId: string,
|
|
12
|
+
config: CheckGateConfig,
|
|
13
|
+
workingDirectory: string,
|
|
14
|
+
logger: (output: string) => Promise<void>,
|
|
15
|
+
): Promise<GateResult> {
|
|
16
|
+
const startTime = Date.now();
|
|
22
17
|
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
18
|
+
try {
|
|
19
|
+
await logger(
|
|
20
|
+
`[${new Date().toISOString()}] Starting check: ${config.name}\n`,
|
|
21
|
+
);
|
|
22
|
+
await logger(`Executing command: ${config.command}\n`);
|
|
23
|
+
await logger(`Working directory: ${workingDirectory}\n\n`);
|
|
28
24
|
|
|
29
|
-
|
|
30
|
-
|
|
25
|
+
const { stdout, stderr } = await execAsync(config.command, {
|
|
26
|
+
cwd: workingDirectory,
|
|
27
|
+
timeout: config.timeout ? config.timeout * 1000 : undefined,
|
|
28
|
+
maxBuffer: MAX_BUFFER_BYTES,
|
|
29
|
+
});
|
|
31
30
|
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
status: 'pass',
|
|
35
|
-
duration: Date.now() - startTime,
|
|
36
|
-
message: 'Command exited with code 0'
|
|
37
|
-
};
|
|
31
|
+
if (stdout) await logger(stdout);
|
|
32
|
+
if (stderr) await logger(`\nSTDERR:\n${stderr}`);
|
|
38
33
|
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
await logger(`\nCommand failed: ${error.message}`);
|
|
34
|
+
const result: GateResult = {
|
|
35
|
+
jobId,
|
|
36
|
+
status: "pass",
|
|
37
|
+
duration: Date.now() - startTime,
|
|
38
|
+
message: "Command exited with code 0",
|
|
39
|
+
};
|
|
46
40
|
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
41
|
+
await logger(`Result: ${result.status} - ${result.message}\n`);
|
|
42
|
+
return result;
|
|
43
|
+
} catch (error: unknown) {
|
|
44
|
+
const err = error as {
|
|
45
|
+
stdout?: string;
|
|
46
|
+
stderr?: string;
|
|
47
|
+
message?: string;
|
|
48
|
+
signal?: string;
|
|
49
|
+
code?: number;
|
|
50
|
+
};
|
|
51
|
+
if (err.stdout) await logger(err.stdout);
|
|
52
|
+
if (err.stderr) await logger(`\nSTDERR:\n${err.stderr}`);
|
|
58
53
|
|
|
59
|
-
|
|
60
|
-
if (typeof error.code === 'number') {
|
|
61
|
-
const result: GateResult = {
|
|
62
|
-
jobId,
|
|
63
|
-
status: 'fail',
|
|
64
|
-
duration: Date.now() - startTime,
|
|
65
|
-
message: `Exited with code ${error.code}`
|
|
66
|
-
};
|
|
67
|
-
await logger(`Result: ${result.status} - ${result.message}\n`);
|
|
68
|
-
return result;
|
|
69
|
-
}
|
|
54
|
+
await logger(`\nCommand failed: ${err.message}`);
|
|
70
55
|
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
56
|
+
// If it's a timeout
|
|
57
|
+
if (err.signal === "SIGTERM" && config.timeout) {
|
|
58
|
+
const result: GateResult = {
|
|
59
|
+
jobId,
|
|
60
|
+
status: "fail",
|
|
61
|
+
duration: Date.now() - startTime,
|
|
62
|
+
message: `Timed out after ${config.timeout}s`,
|
|
63
|
+
};
|
|
64
|
+
await logger(`Result: ${result.status} - ${result.message}\n`);
|
|
65
|
+
return result;
|
|
66
|
+
}
|
|
67
|
+
|
|
68
|
+
// If it's a non-zero exit code
|
|
69
|
+
if (typeof err.code === "number") {
|
|
70
|
+
const result: GateResult = {
|
|
71
|
+
jobId,
|
|
72
|
+
status: "fail",
|
|
73
|
+
duration: Date.now() - startTime,
|
|
74
|
+
message: `Exited with code ${err.code}`,
|
|
75
|
+
};
|
|
76
|
+
await logger(`Result: ${result.status} - ${result.message}\n`);
|
|
77
|
+
return result;
|
|
78
|
+
}
|
|
79
|
+
|
|
80
|
+
// Other errors
|
|
81
|
+
const result: GateResult = {
|
|
82
|
+
jobId,
|
|
83
|
+
status: "error",
|
|
84
|
+
duration: Date.now() - startTime,
|
|
85
|
+
message: err.message || "Unknown error",
|
|
86
|
+
};
|
|
87
|
+
await logger(`Result: ${result.status} - ${result.message}\n`);
|
|
88
|
+
return result;
|
|
89
|
+
}
|
|
90
|
+
}
|
|
82
91
|
}
|
package/src/gates/result.ts
CHANGED
|
@@ -1,10 +1,10 @@
|
|
|
1
|
-
export type GateStatus =
|
|
1
|
+
export type GateStatus = "pass" | "fail" | "error";
|
|
2
2
|
|
|
3
3
|
export interface GateResult {
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
4
|
+
jobId: string;
|
|
5
|
+
status: GateStatus;
|
|
6
|
+
duration: number; // ms
|
|
7
|
+
message?: string; // summary message
|
|
8
|
+
logPath?: string; // path to full log
|
|
9
|
+
logPaths?: string[]; // paths to multiple logs (e.g. per-agent logs)
|
|
10
10
|
}
|
package/src/gates/review.test.ts
CHANGED
|
@@ -1,152 +1,188 @@
|
|
|
1
|
-
import {
|
|
2
|
-
import fs from
|
|
3
|
-
import path from
|
|
4
|
-
import {
|
|
5
|
-
import {
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
1
|
+
import { afterEach, beforeEach, describe, expect, it, mock } from "bun:test";
|
|
2
|
+
import fs from "node:fs/promises";
|
|
3
|
+
import path from "node:path";
|
|
4
|
+
import type { CLIAdapter } from "../cli-adapters/index.js";
|
|
5
|
+
import type {
|
|
6
|
+
ReviewGateConfig,
|
|
7
|
+
ReviewPromptFrontmatter,
|
|
8
|
+
} from "../config/types.js";
|
|
9
|
+
import { Logger } from "../output/logger.js";
|
|
10
|
+
import { ReviewGateExecutor } from "./review.js";
|
|
11
|
+
|
|
12
|
+
const TEST_DIR = path.join(process.cwd(), `test-review-logs-${Date.now()}`);
|
|
13
|
+
const LOG_DIR = path.join(TEST_DIR, "logs");
|
|
14
|
+
|
|
15
|
+
describe("ReviewGateExecutor Logging", () => {
|
|
16
|
+
let logger: Logger;
|
|
17
|
+
let executor: ReviewGateExecutor;
|
|
18
|
+
|
|
19
|
+
beforeEach(async () => {
|
|
20
|
+
await fs.mkdir(TEST_DIR, { recursive: true });
|
|
21
|
+
await fs.mkdir(LOG_DIR, { recursive: true });
|
|
22
|
+
logger = new Logger(LOG_DIR);
|
|
23
|
+
executor = new ReviewGateExecutor();
|
|
24
|
+
|
|
25
|
+
// Mock getAdapter
|
|
26
|
+
mock.module("../cli-adapters/index.js", () => ({
|
|
27
|
+
getAdapter: (name: string) =>
|
|
28
|
+
({
|
|
29
|
+
name,
|
|
30
|
+
isAvailable: async () => true,
|
|
31
|
+
checkHealth: async () => ({ status: "healthy" }),
|
|
32
|
+
// execute returns the raw string output from the LLM, which is then parsed by the executor.
|
|
33
|
+
// The real adapter returns a string. In this test, we return a JSON string to simulate
|
|
34
|
+
// the LLM returning structured data. This IS intentional and matches the expected contract
|
|
35
|
+
// where execute() -> Promise<string>.
|
|
36
|
+
execute: async () => {
|
|
37
|
+
await new Promise((r) => setTimeout(r, 1)); // Simulate async work
|
|
38
|
+
return JSON.stringify({ status: "pass", message: "OK" });
|
|
39
|
+
},
|
|
40
|
+
getProjectCommandDir: () => null,
|
|
41
|
+
getUserCommandDir: () => null,
|
|
42
|
+
getCommandExtension: () => "md",
|
|
43
|
+
canUseSymlink: () => false,
|
|
44
|
+
transformCommand: (c: string) => c,
|
|
45
|
+
}) as unknown as CLIAdapter,
|
|
46
|
+
}));
|
|
47
|
+
|
|
48
|
+
// Mock git commands via util.promisify(exec)
|
|
49
|
+
mock.module("node:util", () => ({
|
|
50
|
+
promisify: (fn: (...args: unknown[]) => unknown) => {
|
|
51
|
+
// Only mock exec, let others pass (though in this test env we likely only use exec)
|
|
52
|
+
if (fn.name === "exec") {
|
|
53
|
+
return async (cmd: string) => {
|
|
54
|
+
if (/^git diff/.test(cmd)) return "diff content";
|
|
55
|
+
if (/^git ls-files/.test(cmd)) return "file.ts";
|
|
56
|
+
return { stdout: "", stderr: "" };
|
|
57
|
+
};
|
|
58
|
+
}
|
|
59
|
+
// Fallback for other functions if needed
|
|
60
|
+
return async () => {};
|
|
61
|
+
},
|
|
62
|
+
}));
|
|
63
|
+
});
|
|
64
|
+
|
|
65
|
+
afterEach(async () => {
|
|
66
|
+
await fs.rm(TEST_DIR, { recursive: true, force: true });
|
|
67
|
+
mock.restore();
|
|
68
|
+
});
|
|
69
|
+
|
|
70
|
+
it("should only create adapter-specific logs and no generic log", async () => {
|
|
71
|
+
const jobId = "review:src:code-quality";
|
|
72
|
+
const config: ReviewGateConfig & ReviewPromptFrontmatter = {
|
|
73
|
+
name: "code-quality",
|
|
74
|
+
cli_preference: ["codex", "claude"],
|
|
75
|
+
num_reviews: 2,
|
|
76
|
+
};
|
|
77
|
+
|
|
78
|
+
const loggerFactory = logger.createLoggerFactory(jobId);
|
|
79
|
+
|
|
80
|
+
// We need to mock getDiff since it uses execAsync which we mocked
|
|
81
|
+
// Actually ReviewGateExecutor is a class, we can mock its private method if needed
|
|
82
|
+
// or just let it run if the mock promisify works.
|
|
83
|
+
|
|
84
|
+
const result = await executor.execute(
|
|
85
|
+
jobId,
|
|
86
|
+
config,
|
|
87
|
+
"src/",
|
|
88
|
+
loggerFactory,
|
|
89
|
+
"main",
|
|
90
|
+
);
|
|
91
|
+
|
|
92
|
+
expect(result.status).toBe("pass");
|
|
93
|
+
expect(result.logPaths).toBeDefined();
|
|
94
|
+
expect(result.logPaths).toHaveLength(2);
|
|
95
|
+
expect(result.logPaths?.[0]).toContain("review_src_code-quality_codex.log");
|
|
96
|
+
expect(result.logPaths?.[1]).toContain(
|
|
97
|
+
"review_src_code-quality_claude.log",
|
|
98
|
+
);
|
|
99
|
+
|
|
100
|
+
const files = await fs.readdir(LOG_DIR);
|
|
101
|
+
expect(files).toContain("review_src_code-quality_codex.log");
|
|
102
|
+
expect(files).toContain("review_src_code-quality_claude.log");
|
|
103
|
+
expect(files).not.toContain("review_src_code-quality.log");
|
|
104
|
+
|
|
105
|
+
// Verify multiplexed content
|
|
106
|
+
const codexLog = await fs.readFile(
|
|
107
|
+
path.join(LOG_DIR, "review_src_code-quality_codex.log"),
|
|
108
|
+
"utf-8",
|
|
109
|
+
);
|
|
110
|
+
expect(codexLog).toContain("Starting review: code-quality");
|
|
111
|
+
expect(codexLog).toContain("Review result (codex): pass");
|
|
112
|
+
|
|
113
|
+
const claudeLog = await fs.readFile(
|
|
114
|
+
path.join(LOG_DIR, "review_src_code-quality_claude.log"),
|
|
115
|
+
"utf-8",
|
|
116
|
+
);
|
|
117
|
+
expect(claudeLog).toContain("Starting review: code-quality");
|
|
118
|
+
expect(claudeLog).toContain("Review result (claude): pass");
|
|
119
|
+
});
|
|
120
|
+
|
|
121
|
+
it("should be handled correctly by ConsoleReporter", async () => {
|
|
122
|
+
const jobId = "review:src:code-quality";
|
|
123
|
+
const codexPath = path.join(LOG_DIR, "review_src_code-quality_codex.log");
|
|
124
|
+
const claudePath = path.join(LOG_DIR, "review_src_code-quality_claude.log");
|
|
125
|
+
|
|
126
|
+
await fs.writeFile(
|
|
127
|
+
codexPath,
|
|
128
|
+
`
|
|
115
129
|
[2026-01-14T10:00:00.000Z] Starting review: code-quality
|
|
116
130
|
--- Parsed Result (codex) ---
|
|
117
131
|
Status: FAIL
|
|
118
132
|
Violations:
|
|
119
133
|
1. src/index.ts:10 - Security risk
|
|
120
134
|
Fix: Use a safer method
|
|
121
|
-
|
|
135
|
+
`,
|
|
136
|
+
);
|
|
122
137
|
|
|
123
|
-
|
|
138
|
+
await fs.writeFile(
|
|
139
|
+
claudePath,
|
|
140
|
+
`
|
|
124
141
|
[2026-01-14T10:00:00.000Z] Starting review: code-quality
|
|
125
142
|
--- Parsed Result (claude) ---
|
|
126
143
|
Status: FAIL
|
|
127
144
|
Violations:
|
|
128
145
|
1. src/main.ts:20 - Style issue
|
|
129
146
|
Fix: Rename variable
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
147
|
+
`,
|
|
148
|
+
);
|
|
149
|
+
|
|
150
|
+
const result = {
|
|
151
|
+
jobId,
|
|
152
|
+
status: "fail" as const,
|
|
153
|
+
duration: 1000,
|
|
154
|
+
message: "Found violations",
|
|
155
|
+
logPaths: [codexPath, claudePath],
|
|
156
|
+
};
|
|
157
|
+
|
|
158
|
+
const { ConsoleReporter } = await import("../output/console.js");
|
|
159
|
+
const reporter = new ConsoleReporter();
|
|
160
|
+
|
|
161
|
+
// We can access extractFailureDetails directly as it is public
|
|
162
|
+
const details = await reporter.extractFailureDetails(result);
|
|
163
|
+
|
|
164
|
+
// Check for presence of key information rather than exact counts
|
|
165
|
+
expect(
|
|
166
|
+
details.some(
|
|
167
|
+
(d: string) =>
|
|
168
|
+
d.includes("src/index.ts") &&
|
|
169
|
+
d.includes("10") &&
|
|
170
|
+
d.includes("Security risk"),
|
|
171
|
+
),
|
|
172
|
+
).toBe(true);
|
|
173
|
+
expect(details.some((d: string) => d.includes("Use a safer method"))).toBe(
|
|
174
|
+
true,
|
|
175
|
+
);
|
|
176
|
+
expect(
|
|
177
|
+
details.some(
|
|
178
|
+
(d: string) =>
|
|
179
|
+
d.includes("src/main.ts") &&
|
|
180
|
+
d.includes("20") &&
|
|
181
|
+
d.includes("Style issue"),
|
|
182
|
+
),
|
|
183
|
+
).toBe(true);
|
|
184
|
+
expect(details.some((d: string) => d.includes("Rename variable"))).toBe(
|
|
185
|
+
true,
|
|
186
|
+
);
|
|
187
|
+
});
|
|
152
188
|
});
|