npm - agent-gauntlet - Versions diffs - 0.2.0 → 0.2.2 - Mend

agent-gauntlet 0.2.0 → 0.2.2

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between these package versions as they appear in their respective public registries.

Files changed (9)

package/package.json +1 -1
package/src/commands/init.ts +18 -2
package/src/config/loader.ts +9 -6
package/src/config/schema.ts +0 -1
package/src/config/types.ts +1 -0
package/src/gates/result.ts +6 -0
package/src/gates/review.ts +29 -12
package/src/output/console.ts +50 -51
package/src/templates/run_gauntlet.template.md +15 -2

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "agent-gauntlet",
-  "version": "0.2.0",
+  "version": "0.2.2",
   "description": "A CLI tool for testing AI coding agents",
   "license": "Apache-2.0",
   "author": "Paul Caplan",

package/src/commands/init.ts CHANGED Viewed

@@ -12,13 +12,29 @@ const GAUNTLET_COMMAND_CONTENT = `---
 description: Run the full verification gauntlet
 allowed-tools: Bash
 ---
+<!--
+  REVIEW TRUST LEVEL
+  Controls how aggressively the agent acts on AI reviewer feedback.
+  Change the trust_level value below to one of: high, medium, low
+  - high:   Fix all issues unless you strongly disagree or have low confidence the human wants the change.
+  - medium: Fix issues you reasonably agree with or believe the human wants fixed. (DEFAULT)
+  - low:    Fix only issues you strongly agree with or are confident the human wants fixed.
+-->
+<!-- trust_level: medium -->
 # /gauntlet
 Execute the autonomous verification suite.
+**Review trust level: medium** — Fix issues you reasonably agree with or believe the human wants fixed. Skip issues that are purely stylistic, subjective, or that you believe the human would not want changed. When you skip an issue, briefly state what was skipped and why.
 1. Run \`agent-gauntlet run\`.
-2. If it fails, read the log files in \`gauntlet_logs/\` to understand exactly what went wrong.
+2. If it fails:
+   - Check the console output for "Fix instructions: available" messages.
+   - Read the log files in \`gauntlet_logs/\` to understand exactly what went wrong.
+   - If fix instructions are available, they will be in the log file under a "--- Fix Instructions ---" section—carefully read and apply them FIRST before attempting other fixes.
 3. Fix any code or logic errors found by the tools or AI reviewers, prioritizing higher-priority violations (critical > high > medium > low).
-4. If you disagree with AI reviewer feedback, briefly explain your reasoning in the code comments rather than ignoring it silently.
+4. Apply the trust level above when deciding whether to act on AI reviewer feedback. If you skip an issue due to the trust threshold, report it with a brief explanation (e.g., "Skipped: [issue summary] — reason: [stylistic/subjective/disagree]").
 5. Do NOT commit your changes yet—keep them uncommitted so the rerun command can review them.
 6. Run \`agent-gauntlet rerun\` to verify your fixes. The rerun command reviews only uncommitted changes and uses previous failures as context.
 7. Repeat steps 2-6 until one of the following termination conditions is met:

package/src/config/loader.ts CHANGED Viewed

@@ -44,16 +44,19 @@ export async function loadConfig(
 				const filePath = path.join(checksPath, file);
 				const content = await fs.readFile(filePath, "utf-8");
 				const raw = YAML.parse(content);
-				// Ensure name matches filename if not provided or just use filename as key
+				const name = path.basename(file, path.extname(file));
 				const parsed: CheckGateConfig = checkGateSchema.parse(raw);
 				// Load fix instructions if specified
-				const loadedCheck: LoadedCheckGateConfig = { ...parsed };
+				const loadedCheck: LoadedCheckGateConfig = {
+					...parsed,
+					name,
+				};
 				if (parsed.fix_instructions) {
 					// Security: Reject absolute paths to prevent reading arbitrary files
 					if (path.isAbsolute(parsed.fix_instructions)) {
 						throw new Error(
-							`Fix instructions path must be relative to .gauntlet/ directory, got absolute path: ${parsed.fix_instructions} (referenced by check "${parsed.name}")`,
+							`Fix instructions path must be relative to .gauntlet/ directory, got absolute path: ${parsed.fix_instructions} (referenced by check "${name}")`,
 						);
 					}
@@ -75,12 +78,12 @@ export async function loadConfig(
 						relativePath === ""
 					) {
 						throw new Error(
-							`Fix instructions path must stay within .gauntlet/ directory and point to a file: ${parsed.fix_instructions} resolves to ${fixInstructionsPath} (referenced by check "${parsed.name}")`,
+							`Fix instructions path must stay within .gauntlet/ directory and point to a file: ${parsed.fix_instructions} resolves to ${fixInstructionsPath} (referenced by check "${name}")`,
 						);
 					}
 					if (!(await fileExists(fixInstructionsPath))) {
 						throw new Error(
-							`Fix instructions file not found: ${fixInstructionsPath} (referenced by check "${parsed.name}")`,
+							`Fix instructions file not found: ${fixInstructionsPath} (referenced by check "${name}")`,
 						);
 					}
 					loadedCheck.fixInstructionsContent = await fs.readFile(
@@ -89,7 +92,7 @@ export async function loadConfig(
 					);
 				}
-				checks[parsed.name] = loadedCheck;
+				checks[name] = loadedCheck;
 			}
 		}
 	}

package/src/config/schema.ts CHANGED Viewed

@@ -7,7 +7,6 @@ export const cliConfigSchema = z.object({
 export const checkGateSchema = z
 	.object({
-		name: z.string().min(1),
 		command: z.string().min(1),
 		working_directory: z.string().optional(),
 		parallel: z.boolean().default(false),

package/src/config/types.ts CHANGED Viewed

@@ -32,6 +32,7 @@ export type ServiceConfig = z.infer<typeof serviceConfigSchema>;
 // Extended check config with loaded content
 export interface LoadedCheckGateConfig extends CheckGateConfig {
+	name: string;
 	fixInstructionsContent?: string;
 }

package/src/gates/result.ts CHANGED Viewed

@@ -8,4 +8,10 @@ export interface GateResult {
 	logPath?: string; // path to full log
 	logPaths?: string[]; // paths to multiple logs (e.g. per-agent logs)
 	fixInstructions?: string; // Markdown content for fixing failures
+	subResults?: Array<{
+		nameSuffix: string;
+		status: GateStatus;
+		message: string;
+		logPath?: string;
+	}>;
 }

package/src/gates/review.ts CHANGED Viewed

@@ -299,20 +299,41 @@ export class ReviewGateExecutor {
 				};
 			}
-			const failed = outputs.find((result) => result.status === "fail");
-			const error = outputs.find((result) => result.status === "error");
+			const failed = outputs.filter((result) => result.status === "fail");
+			const errored = outputs.filter((result) => result.status === "error");
+			// If not failed or errored, it must be passed
+			// const passed = outputs.filter((result) => result.status === "pass");
 			let status: "pass" | "fail" | "error" = "pass";
 			let message = "Passed";
-			if (error) {
+			// Determine overall status
+			if (errored.length > 0) {
 				status = "error";
-				message = `Error (${error.adapter}): ${error.message}`;
-			} else if (failed) {
+				message = `Error in ${errored.length} adapter(s)`;
+			} else if (failed.length > 0) {
 				status = "fail";
-				message = `Failed (${failed.adapter}): ${failed.message}`;
+				message = `Failed by ${failed.length} adapter(s)`;
 			}
+			// Build detailed subResults
+			const subResults = outputs.map((out) => {
+				// Find specific log path for this adapter
+				// logPaths contains strings like ".../review_src_lint_codex.log"
+				// We expect the log path to contain the adapter name
+				// This is a heuristic, but likely sufficient given our naming convention
+				const specificLog = logPaths.find((p) =>
+					p.includes(`_${out.adapter}.log`),
+				);
+				return {
+					nameSuffix: `(${out.adapter})`,
+					status: out.status,
+					message: out.message,
+					logPath: specificLog,
+				};
+			});
 			await mainLogger(`Result: ${status} - ${message}\n`);
 			return {
@@ -321,6 +342,7 @@ export class ReviewGateExecutor {
 				duration: Date.now() - startTime,
 				message,
 				logPaths,
+				subResults,
 			};
 		} catch (error: unknown) {
 			const err = error as { message?: string };
@@ -439,7 +461,6 @@ export class ReviewGateExecutor {
 			const resultMsg = `Review result (${adapter.name}): ${evaluation.status} - ${evaluation.message}`;
 			await adapterLogger(`${resultMsg}\n`);
-			await mainLogger(`${resultMsg}\n`);
 			return { adapter: adapter.name, evaluation };
 		} catch (error: unknown) {
@@ -745,11 +766,7 @@ export class ReviewGateExecutor {
 			: "some";
 		// Construct a summary message
-		let msg = `Found ${violationCount} violations`;
-		if (Array.isArray(json.violations) && json.violations.length > 0) {
-			const first = json.violations[0];
-			msg += `. Example: ${first.issue} in ${first.file}`;
-		}
+		const msg = `Found ${violationCount} violations`;
 		return { status: "fail", message: msg, json, filteredCount };
 	}

package/src/output/console.ts CHANGED Viewed

@@ -12,14 +12,57 @@ export class ConsoleReporter {
 		const duration = `${(result.duration / 1000).toFixed(2)}s`;
 		const message = result.message ?? "";
-		if (result.status === "pass") {
-			console.log(chalk.green(`[PASS]  ${job.id} (${duration})`));
-		} else if (result.status === "fail") {
-			console.log(chalk.red(`[FAIL]  ${job.id} (${duration}) - ${message}`));
+		if (result.subResults && result.subResults.length > 0) {
+			// Print split results
+			for (const sub of result.subResults) {
+				const statusColor =
+					sub.status === "pass"
+						? chalk.green
+						: sub.status === "fail"
+							? chalk.red
+							: chalk.magenta;
+				const label =
+					sub.status === "pass"
+						? "PASS"
+						: sub.status === "fail"
+							? "FAIL"
+							: "ERROR";
+				let logInfo = "";
+				if (sub.status !== "pass" && sub.logPath) {
+					logInfo = `\n      Log: ${sub.logPath}`;
+				}
+				console.log(
+					statusColor(
+						`[${label}]  ${job.id} ${chalk.dim(sub.nameSuffix)} (${duration}) - ${sub.message}${logInfo}`,
+					),
+				);
+			}
 		} else {
-			console.log(
-				chalk.magenta(`[ERROR] ${job.id} (${duration}) - ${message}`),
-			);
+			// Standard single result
+			let logInfo = "";
+			if (result.status !== "pass") {
+				// Try to find a relevant log path
+				const logPath = result.logPath || result.logPaths?.[0];
+				if (logPath) {
+					logInfo = `\n      Log: ${logPath}`;
+				}
+			}
+			if (result.status === "pass") {
+				console.log(chalk.green(`[PASS]  ${job.id} (${duration})`));
+			} else if (result.status === "fail") {
+				console.log(
+					chalk.red(`[FAIL]  ${job.id} (${duration}) - ${message}${logInfo}`),
+				);
+			} else {
+				console.log(
+					chalk.magenta(
+						`[ERROR] ${job.id} (${duration}) - ${message}${logInfo}`,
+					),
+				);
+			}
 		}
 	}
@@ -35,15 +78,6 @@ export class ConsoleReporter {
 		if (failed.length > 0) console.log(chalk.red(`Failed: ${failed.length}`));
 		if (errored.length > 0)
 			console.log(chalk.magenta(`Errored: ${errored.length}`));
-		if (failed.length > 0 || errored.length > 0) {
-			console.log(`\n${chalk.bold("=== Failure Details ===\n")}`);
-			for (const result of [...failed, ...errored]) {
-				const details = await this.extractFailureDetails(result);
-				this.printFailureDetails(result, details);
-			}
-		}
 	}
 	/** @internal Public for testing */
@@ -226,39 +260,4 @@ export class ConsoleReporter {
 		return details;
 	}
-	private printFailureDetails(result: GateResult, details: string[]) {
-		const statusColor = result.status === "error" ? chalk.magenta : chalk.red;
-		const statusLabel = result.status === "error" ? "ERROR" : "FAIL";
-		console.log(statusColor(`[${statusLabel}] ${result.jobId}`));
-		if (result.message) {
-			console.log(chalk.dim(`  Summary: ${result.message}`));
-		}
-		if (details.length > 0) {
-			console.log(chalk.dim("  Details:"));
-			details.forEach((detail) => {
-				console.log(detail);
-			});
-		}
-		if (result.logPaths && result.logPaths.length > 0) {
-			result.logPaths.forEach((p) => {
-				console.log(chalk.dim(`  Log: ${p}`));
-			});
-		} else if (result.logPath) {
-			console.log(chalk.dim(`  Log: ${result.logPath}`));
-		}
-		if (result.fixInstructions) {
-			console.log(
-				chalk.cyan(
-					`  Fix instructions: available (${result.fixInstructions.split("\n").length} lines)`,
-				),
-			);
-		}
-		console.log(""); // Empty line between failures
-	}
 }

package/src/templates/run_gauntlet.template.md CHANGED Viewed

@@ -2,20 +2,33 @@
 description: Run the full verification gauntlet
 allowed-tools: Bash
 ---
+<!--
+  REVIEW TRUST LEVEL
+  Controls how aggressively the agent acts on AI reviewer feedback.
+  Change the trust_level value below to one of: high, medium, low
+  - high:   Fix all issues unless you strongly disagree or have low confidence the human wants the change.
+  - medium: Fix issues you reasonably agree with or believe the human wants fixed. (DEFAULT)
+  - low:    Fix only issues you strongly agree with or are confident the human wants fixed.
+-->
+<!-- trust_level: medium -->
 # /gauntlet
 Execute the autonomous verification suite.
+**Review trust level: medium** — Fix issues you reasonably agree with or believe the human wants fixed. Skip issues that are purely stylistic, subjective, or that you believe the human would not want changed. When you skip an issue, briefly state what was skipped and why.
 1. Run `agent-gauntlet run`.
 2. If it fails:
    - Check the console output for "Fix instructions: available" messages.
    - Read the log files in `gauntlet_logs/` to understand exactly what went wrong.
    - If fix instructions are available, they will be in the log file under a "--- Fix Instructions ---" section—carefully read and apply them FIRST before attempting other fixes.
 3. Fix any code or logic errors found by the tools or AI reviewers, prioritizing higher-priority violations (critical > high > medium > low).
-4. If you disagree with AI reviewer feedback, briefly explain your reasoning in the code comments rather than ignoring it silently.
+4. Apply the trust level above when deciding whether to act on AI reviewer feedback. If you skip an issue due to the trust threshold, report it with a brief explanation (e.g., "Skipped: [issue summary] — reason: [stylistic/subjective/disagree]").
 5. Do NOT commit your changes yet—keep them uncommitted so the rerun command can review them.
 6. Run `agent-gauntlet rerun` to verify your fixes. The rerun command reviews only uncommitted changes and uses previous failures as context.
 7. Repeat steps 2-6 until one of the following termination conditions is met:
    - All gates pass
-   - You disagree with remaining failures (ask the human how to proceed)
+   - You are skipping remaining issues
    - Still failing after 3 rerun attempts
 8. Once all gates pass, do NOT commit or push your changes—await the human's review and explicit instruction to commit.