agent-gauntlet 0.2.0 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/commands/init.ts +18 -2
- package/src/config/loader.ts +9 -6
- package/src/config/schema.ts +0 -1
- package/src/config/types.ts +1 -0
- package/src/gates/result.ts +6 -0
- package/src/gates/review.ts +29 -12
- package/src/output/console.ts +50 -51
- package/src/templates/run_gauntlet.template.md +15 -2
package/package.json
CHANGED
package/src/commands/init.ts
CHANGED
|
@@ -12,13 +12,29 @@ const GAUNTLET_COMMAND_CONTENT = `---
|
|
|
12
12
|
description: Run the full verification gauntlet
|
|
13
13
|
allowed-tools: Bash
|
|
14
14
|
---
|
|
15
|
+
<!--
|
|
16
|
+
REVIEW TRUST LEVEL
|
|
17
|
+
Controls how aggressively the agent acts on AI reviewer feedback.
|
|
18
|
+
Change the trust_level value below to one of: high, medium, low
|
|
19
|
+
|
|
20
|
+
- high: Fix all issues unless you strongly disagree or have low confidence the human wants the change.
|
|
21
|
+
- medium: Fix issues you reasonably agree with or believe the human wants fixed. (DEFAULT)
|
|
22
|
+
- low: Fix only issues you strongly agree with or are confident the human wants fixed.
|
|
23
|
+
-->
|
|
24
|
+
<!-- trust_level: medium -->
|
|
25
|
+
|
|
15
26
|
# /gauntlet
|
|
16
27
|
Execute the autonomous verification suite.
|
|
17
28
|
|
|
29
|
+
**Review trust level: medium** — Fix issues you reasonably agree with or believe the human wants fixed. Skip issues that are purely stylistic, subjective, or that you believe the human would not want changed. When you skip an issue, briefly state what was skipped and why.
|
|
30
|
+
|
|
18
31
|
1. Run \`agent-gauntlet run\`.
|
|
19
|
-
2. If it fails
|
|
32
|
+
2. If it fails:
|
|
33
|
+
- Check the console output for "Fix instructions: available" messages.
|
|
34
|
+
- Read the log files in \`gauntlet_logs/\` to understand exactly what went wrong.
|
|
35
|
+
- If fix instructions are available, they will be in the log file under a "--- Fix Instructions ---" section—carefully read and apply them FIRST before attempting other fixes.
|
|
20
36
|
3. Fix any code or logic errors found by the tools or AI reviewers, prioritizing higher-priority violations (critical > high > medium > low).
|
|
21
|
-
4.
|
|
37
|
+
4. Apply the trust level above when deciding whether to act on AI reviewer feedback. If you skip an issue due to the trust threshold, report it with a brief explanation (e.g., "Skipped: [issue summary] — reason: [stylistic/subjective/disagree]").
|
|
22
38
|
5. Do NOT commit your changes yet—keep them uncommitted so the rerun command can review them.
|
|
23
39
|
6. Run \`agent-gauntlet rerun\` to verify your fixes. The rerun command reviews only uncommitted changes and uses previous failures as context.
|
|
24
40
|
7. Repeat steps 2-6 until one of the following termination conditions is met:
|
package/src/config/loader.ts
CHANGED
|
@@ -44,16 +44,19 @@ export async function loadConfig(
|
|
|
44
44
|
const filePath = path.join(checksPath, file);
|
|
45
45
|
const content = await fs.readFile(filePath, "utf-8");
|
|
46
46
|
const raw = YAML.parse(content);
|
|
47
|
-
|
|
47
|
+
const name = path.basename(file, path.extname(file));
|
|
48
48
|
const parsed: CheckGateConfig = checkGateSchema.parse(raw);
|
|
49
49
|
|
|
50
50
|
// Load fix instructions if specified
|
|
51
|
-
const loadedCheck: LoadedCheckGateConfig = {
|
|
51
|
+
const loadedCheck: LoadedCheckGateConfig = {
|
|
52
|
+
...parsed,
|
|
53
|
+
name,
|
|
54
|
+
};
|
|
52
55
|
if (parsed.fix_instructions) {
|
|
53
56
|
// Security: Reject absolute paths to prevent reading arbitrary files
|
|
54
57
|
if (path.isAbsolute(parsed.fix_instructions)) {
|
|
55
58
|
throw new Error(
|
|
56
|
-
`Fix instructions path must be relative to .gauntlet/ directory, got absolute path: ${parsed.fix_instructions} (referenced by check "${
|
|
59
|
+
`Fix instructions path must be relative to .gauntlet/ directory, got absolute path: ${parsed.fix_instructions} (referenced by check "${name}")`,
|
|
57
60
|
);
|
|
58
61
|
}
|
|
59
62
|
|
|
@@ -75,12 +78,12 @@ export async function loadConfig(
|
|
|
75
78
|
relativePath === ""
|
|
76
79
|
) {
|
|
77
80
|
throw new Error(
|
|
78
|
-
`Fix instructions path must stay within .gauntlet/ directory and point to a file: ${parsed.fix_instructions} resolves to ${fixInstructionsPath} (referenced by check "${
|
|
81
|
+
`Fix instructions path must stay within .gauntlet/ directory and point to a file: ${parsed.fix_instructions} resolves to ${fixInstructionsPath} (referenced by check "${name}")`,
|
|
79
82
|
);
|
|
80
83
|
}
|
|
81
84
|
if (!(await fileExists(fixInstructionsPath))) {
|
|
82
85
|
throw new Error(
|
|
83
|
-
`Fix instructions file not found: ${fixInstructionsPath} (referenced by check "${
|
|
86
|
+
`Fix instructions file not found: ${fixInstructionsPath} (referenced by check "${name}")`,
|
|
84
87
|
);
|
|
85
88
|
}
|
|
86
89
|
loadedCheck.fixInstructionsContent = await fs.readFile(
|
|
@@ -89,7 +92,7 @@ export async function loadConfig(
|
|
|
89
92
|
);
|
|
90
93
|
}
|
|
91
94
|
|
|
92
|
-
checks[
|
|
95
|
+
checks[name] = loadedCheck;
|
|
93
96
|
}
|
|
94
97
|
}
|
|
95
98
|
}
|
package/src/config/schema.ts
CHANGED
package/src/config/types.ts
CHANGED
package/src/gates/result.ts
CHANGED
|
@@ -8,4 +8,10 @@ export interface GateResult {
|
|
|
8
8
|
logPath?: string; // path to full log
|
|
9
9
|
logPaths?: string[]; // paths to multiple logs (e.g. per-agent logs)
|
|
10
10
|
fixInstructions?: string; // Markdown content for fixing failures
|
|
11
|
+
subResults?: Array<{
|
|
12
|
+
nameSuffix: string;
|
|
13
|
+
status: GateStatus;
|
|
14
|
+
message: string;
|
|
15
|
+
logPath?: string;
|
|
16
|
+
}>;
|
|
11
17
|
}
|
package/src/gates/review.ts
CHANGED
|
@@ -299,20 +299,41 @@ export class ReviewGateExecutor {
|
|
|
299
299
|
};
|
|
300
300
|
}
|
|
301
301
|
|
|
302
|
-
const failed = outputs.
|
|
303
|
-
const
|
|
302
|
+
const failed = outputs.filter((result) => result.status === "fail");
|
|
303
|
+
const errored = outputs.filter((result) => result.status === "error");
|
|
304
|
+
// If not failed or errored, it must be passed
|
|
305
|
+
// const passed = outputs.filter((result) => result.status === "pass");
|
|
304
306
|
|
|
305
307
|
let status: "pass" | "fail" | "error" = "pass";
|
|
306
308
|
let message = "Passed";
|
|
307
309
|
|
|
308
|
-
|
|
310
|
+
// Determine overall status
|
|
311
|
+
if (errored.length > 0) {
|
|
309
312
|
status = "error";
|
|
310
|
-
message = `Error
|
|
311
|
-
} else if (failed) {
|
|
313
|
+
message = `Error in ${errored.length} adapter(s)`;
|
|
314
|
+
} else if (failed.length > 0) {
|
|
312
315
|
status = "fail";
|
|
313
|
-
message = `Failed
|
|
316
|
+
message = `Failed by ${failed.length} adapter(s)`;
|
|
314
317
|
}
|
|
315
318
|
|
|
319
|
+
// Build detailed subResults
|
|
320
|
+
const subResults = outputs.map((out) => {
|
|
321
|
+
// Find specific log path for this adapter
|
|
322
|
+
// logPaths contains strings like ".../review_src_lint_codex.log"
|
|
323
|
+
// We expect the log path to contain the adapter name
|
|
324
|
+
// This is a heuristic, but likely sufficient given our naming convention
|
|
325
|
+
const specificLog = logPaths.find((p) =>
|
|
326
|
+
p.includes(`_${out.adapter}.log`),
|
|
327
|
+
);
|
|
328
|
+
|
|
329
|
+
return {
|
|
330
|
+
nameSuffix: `(${out.adapter})`,
|
|
331
|
+
status: out.status,
|
|
332
|
+
message: out.message,
|
|
333
|
+
logPath: specificLog,
|
|
334
|
+
};
|
|
335
|
+
});
|
|
336
|
+
|
|
316
337
|
await mainLogger(`Result: ${status} - ${message}\n`);
|
|
317
338
|
|
|
318
339
|
return {
|
|
@@ -321,6 +342,7 @@ export class ReviewGateExecutor {
|
|
|
321
342
|
duration: Date.now() - startTime,
|
|
322
343
|
message,
|
|
323
344
|
logPaths,
|
|
345
|
+
subResults,
|
|
324
346
|
};
|
|
325
347
|
} catch (error: unknown) {
|
|
326
348
|
const err = error as { message?: string };
|
|
@@ -439,7 +461,6 @@ export class ReviewGateExecutor {
|
|
|
439
461
|
|
|
440
462
|
const resultMsg = `Review result (${adapter.name}): ${evaluation.status} - ${evaluation.message}`;
|
|
441
463
|
await adapterLogger(`${resultMsg}\n`);
|
|
442
|
-
await mainLogger(`${resultMsg}\n`);
|
|
443
464
|
|
|
444
465
|
return { adapter: adapter.name, evaluation };
|
|
445
466
|
} catch (error: unknown) {
|
|
@@ -745,11 +766,7 @@ export class ReviewGateExecutor {
|
|
|
745
766
|
: "some";
|
|
746
767
|
|
|
747
768
|
// Construct a summary message
|
|
748
|
-
|
|
749
|
-
if (Array.isArray(json.violations) && json.violations.length > 0) {
|
|
750
|
-
const first = json.violations[0];
|
|
751
|
-
msg += `. Example: ${first.issue} in ${first.file}`;
|
|
752
|
-
}
|
|
769
|
+
const msg = `Found ${violationCount} violations`;
|
|
753
770
|
|
|
754
771
|
return { status: "fail", message: msg, json, filteredCount };
|
|
755
772
|
}
|
package/src/output/console.ts
CHANGED
|
@@ -12,14 +12,57 @@ export class ConsoleReporter {
|
|
|
12
12
|
const duration = `${(result.duration / 1000).toFixed(2)}s`;
|
|
13
13
|
const message = result.message ?? "";
|
|
14
14
|
|
|
15
|
-
if (result.
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
15
|
+
if (result.subResults && result.subResults.length > 0) {
|
|
16
|
+
// Print split results
|
|
17
|
+
for (const sub of result.subResults) {
|
|
18
|
+
const statusColor =
|
|
19
|
+
sub.status === "pass"
|
|
20
|
+
? chalk.green
|
|
21
|
+
: sub.status === "fail"
|
|
22
|
+
? chalk.red
|
|
23
|
+
: chalk.magenta;
|
|
24
|
+
const label =
|
|
25
|
+
sub.status === "pass"
|
|
26
|
+
? "PASS"
|
|
27
|
+
: sub.status === "fail"
|
|
28
|
+
? "FAIL"
|
|
29
|
+
: "ERROR";
|
|
30
|
+
|
|
31
|
+
let logInfo = "";
|
|
32
|
+
if (sub.status !== "pass" && sub.logPath) {
|
|
33
|
+
logInfo = `\n Log: ${sub.logPath}`;
|
|
34
|
+
}
|
|
35
|
+
|
|
36
|
+
console.log(
|
|
37
|
+
statusColor(
|
|
38
|
+
`[${label}] ${job.id} ${chalk.dim(sub.nameSuffix)} (${duration}) - ${sub.message}${logInfo}`,
|
|
39
|
+
),
|
|
40
|
+
);
|
|
41
|
+
}
|
|
19
42
|
} else {
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
)
|
|
43
|
+
// Standard single result
|
|
44
|
+
let logInfo = "";
|
|
45
|
+
if (result.status !== "pass") {
|
|
46
|
+
// Try to find a relevant log path
|
|
47
|
+
const logPath = result.logPath || result.logPaths?.[0];
|
|
48
|
+
if (logPath) {
|
|
49
|
+
logInfo = `\n Log: ${logPath}`;
|
|
50
|
+
}
|
|
51
|
+
}
|
|
52
|
+
|
|
53
|
+
if (result.status === "pass") {
|
|
54
|
+
console.log(chalk.green(`[PASS] ${job.id} (${duration})`));
|
|
55
|
+
} else if (result.status === "fail") {
|
|
56
|
+
console.log(
|
|
57
|
+
chalk.red(`[FAIL] ${job.id} (${duration}) - ${message}${logInfo}`),
|
|
58
|
+
);
|
|
59
|
+
} else {
|
|
60
|
+
console.log(
|
|
61
|
+
chalk.magenta(
|
|
62
|
+
`[ERROR] ${job.id} (${duration}) - ${message}${logInfo}`,
|
|
63
|
+
),
|
|
64
|
+
);
|
|
65
|
+
}
|
|
23
66
|
}
|
|
24
67
|
}
|
|
25
68
|
|
|
@@ -35,15 +78,6 @@ export class ConsoleReporter {
|
|
|
35
78
|
if (failed.length > 0) console.log(chalk.red(`Failed: ${failed.length}`));
|
|
36
79
|
if (errored.length > 0)
|
|
37
80
|
console.log(chalk.magenta(`Errored: ${errored.length}`));
|
|
38
|
-
|
|
39
|
-
if (failed.length > 0 || errored.length > 0) {
|
|
40
|
-
console.log(`\n${chalk.bold("=== Failure Details ===\n")}`);
|
|
41
|
-
|
|
42
|
-
for (const result of [...failed, ...errored]) {
|
|
43
|
-
const details = await this.extractFailureDetails(result);
|
|
44
|
-
this.printFailureDetails(result, details);
|
|
45
|
-
}
|
|
46
|
-
}
|
|
47
81
|
}
|
|
48
82
|
|
|
49
83
|
/** @internal Public for testing */
|
|
@@ -226,39 +260,4 @@ export class ConsoleReporter {
|
|
|
226
260
|
|
|
227
261
|
return details;
|
|
228
262
|
}
|
|
229
|
-
|
|
230
|
-
private printFailureDetails(result: GateResult, details: string[]) {
|
|
231
|
-
const statusColor = result.status === "error" ? chalk.magenta : chalk.red;
|
|
232
|
-
const statusLabel = result.status === "error" ? "ERROR" : "FAIL";
|
|
233
|
-
|
|
234
|
-
console.log(statusColor(`[${statusLabel}] ${result.jobId}`));
|
|
235
|
-
if (result.message) {
|
|
236
|
-
console.log(chalk.dim(` Summary: ${result.message}`));
|
|
237
|
-
}
|
|
238
|
-
|
|
239
|
-
if (details.length > 0) {
|
|
240
|
-
console.log(chalk.dim(" Details:"));
|
|
241
|
-
details.forEach((detail) => {
|
|
242
|
-
console.log(detail);
|
|
243
|
-
});
|
|
244
|
-
}
|
|
245
|
-
|
|
246
|
-
if (result.logPaths && result.logPaths.length > 0) {
|
|
247
|
-
result.logPaths.forEach((p) => {
|
|
248
|
-
console.log(chalk.dim(` Log: ${p}`));
|
|
249
|
-
});
|
|
250
|
-
} else if (result.logPath) {
|
|
251
|
-
console.log(chalk.dim(` Log: ${result.logPath}`));
|
|
252
|
-
}
|
|
253
|
-
|
|
254
|
-
if (result.fixInstructions) {
|
|
255
|
-
console.log(
|
|
256
|
-
chalk.cyan(
|
|
257
|
-
` Fix instructions: available (${result.fixInstructions.split("\n").length} lines)`,
|
|
258
|
-
),
|
|
259
|
-
);
|
|
260
|
-
}
|
|
261
|
-
|
|
262
|
-
console.log(""); // Empty line between failures
|
|
263
|
-
}
|
|
264
263
|
}
|
package/src/templates/run_gauntlet.template.md
CHANGED
|
@@ -2,20 +2,33 @@
|
|
|
2
2
|
description: Run the full verification gauntlet
|
|
3
3
|
allowed-tools: Bash
|
|
4
4
|
---
|
|
5
|
+
<!--
|
|
6
|
+
REVIEW TRUST LEVEL
|
|
7
|
+
Controls how aggressively the agent acts on AI reviewer feedback.
|
|
8
|
+
Change the trust_level value below to one of: high, medium, low
|
|
9
|
+
|
|
10
|
+
- high: Fix all issues unless you strongly disagree or have low confidence the human wants the change.
|
|
11
|
+
- medium: Fix issues you reasonably agree with or believe the human wants fixed. (DEFAULT)
|
|
12
|
+
- low: Fix only issues you strongly agree with or are confident the human wants fixed.
|
|
13
|
+
-->
|
|
14
|
+
<!-- trust_level: medium -->
|
|
15
|
+
|
|
5
16
|
# /gauntlet
|
|
6
17
|
Execute the autonomous verification suite.
|
|
7
18
|
|
|
19
|
+
**Review trust level: medium** — Fix issues you reasonably agree with or believe the human wants fixed. Skip issues that are purely stylistic, subjective, or that you believe the human would not want changed. When you skip an issue, briefly state what was skipped and why.
|
|
20
|
+
|
|
8
21
|
1. Run `agent-gauntlet run`.
|
|
9
22
|
2. If it fails:
|
|
10
23
|
- Check the console output for "Fix instructions: available" messages.
|
|
11
24
|
- Read the log files in `gauntlet_logs/` to understand exactly what went wrong.
|
|
12
25
|
- If fix instructions are available, they will be in the log file under a "--- Fix Instructions ---" section—carefully read and apply them FIRST before attempting other fixes.
|
|
13
26
|
3. Fix any code or logic errors found by the tools or AI reviewers, prioritizing higher-priority violations (critical > high > medium > low).
|
|
14
|
-
4.
|
|
27
|
+
4. Apply the trust level above when deciding whether to act on AI reviewer feedback. If you skip an issue due to the trust threshold, report it with a brief explanation (e.g., "Skipped: [issue summary] — reason: [stylistic/subjective/disagree]").
|
|
15
28
|
5. Do NOT commit your changes yet—keep them uncommitted so the rerun command can review them.
|
|
16
29
|
6. Run `agent-gauntlet rerun` to verify your fixes. The rerun command reviews only uncommitted changes and uses previous failures as context.
|
|
17
30
|
7. Repeat steps 2-6 until one of the following termination conditions is met:
|
|
18
31
|
- All gates pass
|
|
19
|
-
- You
|
|
32
|
+
- You are skipping remaining issues
|
|
20
33
|
- Still failing after 3 rerun attempts
|
|
21
34
|
8. Once all gates pass, do NOT commit or push your changes—await the human's review and explicit instruction to commit.
|