npm - agent-gauntlet - Versions diffs - 0.2.2 → 0.4.0 - Mend

agent-gauntlet 0.2.2 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (40)

package/README.md +3 -3
package/package.json +1 -1
package/src/cli-adapters/claude.ts +13 -1
package/src/cli-adapters/gemini.ts +17 -2
package/src/commands/check.ts +108 -12
package/src/commands/ci/list-jobs.ts +3 -2
package/src/commands/clean.ts +29 -0
package/src/commands/help.ts +1 -1
package/src/commands/index.ts +2 -1
package/src/commands/init.ts +4 -4
package/src/commands/review.ts +108 -12
package/src/commands/run.ts +109 -12
package/src/commands/shared.ts +56 -10
package/src/commands/validate.ts +20 -0
package/src/config/schema.ts +5 -0
package/src/config/validator.ts +6 -13
package/src/core/change-detector.ts +1 -0
package/src/core/entry-point.ts +48 -7
package/src/core/runner.ts +90 -56
package/src/gates/result.ts +32 -0
package/src/gates/review.ts +428 -162
package/src/index.ts +4 -2
package/src/output/console-log.ts +146 -0
package/src/output/console.ts +103 -9
package/src/output/logger.ts +52 -8
package/src/templates/run_gauntlet.template.md +20 -13
package/src/utils/log-parser.ts +498 -162
package/src/utils/session-ref.ts +82 -0
package/src/commands/check.test.ts +0 -29
package/src/commands/detect.test.ts +0 -43
package/src/commands/health.test.ts +0 -93
package/src/commands/help.test.ts +0 -44
package/src/commands/init.test.ts +0 -130
package/src/commands/list.test.ts +0 -121
package/src/commands/rerun.ts +0 -160
package/src/commands/review.test.ts +0 -31
package/src/commands/run.test.ts +0 -27
package/src/config/loader.test.ts +0 -151
package/src/core/entry-point.test.ts +0 -61
package/src/gates/review.test.ts +0 -291

package/README.md CHANGED Viewed

@@ -51,9 +51,9 @@ The use cases below illustrate when each of these patterns may be used.
 2. Run `/gauntlet` from chat
 3. Gauntlet detects changed files and runs configured checks (linter, tests, type checking, etc.)
 4. Simultaneously, Gauntlet invokes AI CLIs for code review
-5. Assistant reviews results, fixes identified issues, and runs `agent-gauntlet rerun`
-6. Gauntlet verifies fixes and checks for new issues
-7. Process repeats automatically (up to 3 reruns) until all gates pass
+5. Assistant reviews results, fixes identified issues, and runs `agent-gauntlet run` again
+6. Gauntlet detects existing logs, switches to verification mode, and checks fixes
+7. Process repeats automatically (up to 3 iterations) until all gates pass
 ### 3. Agentic Implementation

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "agent-gauntlet",
-  "version": "0.2.2",
+  "version": "0.4.0",
   "description": "A CLI tool for testing AI coding agents",
   "license": "Apache-2.0",
   "author": "Paul Caplan",

package/src/cli-adapters/claude.ts CHANGED Viewed

@@ -40,7 +40,7 @@ export class ClaudeAdapter implements CLIAdapter {
 				// We use a simple "hello" prompt to avoid "No messages returned" errors from empty input
 				const { stdout, stderr } = await execAsync(
 					'echo "hello" | claude -p --max-turns 1',
-					{ timeout: 10000 },
+					{ timeout: 30000 },
 				);
 				const combined = (stdout || "") + (stderr || "");
@@ -58,7 +58,19 @@ export class ClaudeAdapter implements CLIAdapter {
 					stderr?: string;
 					stdout?: string;
 					message?: string;
+					code?: number | string;
+					signal?: string;
 				};
+				// Check for timeout
+				if (execError.signal === "SIGTERM" && execError.code === null) {
+					return {
+						available: true,
+						status: "unhealthy",
+						message: "Error: Health check timed out",
+					};
+				}
 				const stderr = execError.stderr || "";
 				const stdout = execError.stdout || "";
 				const combined = stderr + stdout;

package/src/cli-adapters/gemini.ts CHANGED Viewed

@@ -38,7 +38,7 @@ export class GeminiAdapter implements CLIAdapter {
 			try {
 				const { stdout, stderr } = await execAsync(
 					'echo "hello" | gemini --sandbox --output-format text',
-					{ timeout: 10000 },
+					{ timeout: 30000 },
 				);
 				const combined = (stdout || "") + (stderr || "");
@@ -56,7 +56,19 @@ export class GeminiAdapter implements CLIAdapter {
 					stderr?: string;
 					stdout?: string;
 					message?: string;
+					code?: number | string;
+					signal?: string;
 				};
+				// Check for timeout
+				if (execError.signal === "SIGTERM" && execError.code === null) {
+					return {
+						available: true,
+						status: "unhealthy",
+						message: "Error: Health check timed out",
+					};
+				}
 				const stderr = execError.stderr || "";
 				const stdout = execError.stdout || "";
 				const combined = stderr + stdout;
@@ -159,7 +171,10 @@ ${escapedBody}
 		// Write to a temporary file to avoid shell escaping issues
 		const tmpDir = os.tmpdir();
-		const tmpFile = path.join(tmpDir, `gauntlet-gemini-${Date.now()}.txt`);
+		const tmpFile = path.join(
+			tmpDir,
+			`gauntlet-gemini-${process.pid}-${Date.now()}.txt`,
+		);
 		await fs.writeFile(tmpFile, fullContent);
 		try {

package/src/commands/check.ts CHANGED Viewed

@@ -6,8 +6,19 @@ import { EntryPointExpander } from "../core/entry-point.js";
 import { JobGenerator } from "../core/job.js";
 import { Runner } from "../core/runner.js";
 import { ConsoleReporter } from "../output/console.js";
+import { startConsoleLog } from "../output/console-log.js";
 import { Logger } from "../output/logger.js";
-import { rotateLogs } from "./shared.js";
+import {
+	findPreviousFailures,
+	type PreviousViolation,
+} from "../utils/log-parser.js";
+import { readSessionRef, writeSessionRef } from "../utils/session-ref.js";
+import {
+	acquireLock,
+	cleanLogs,
+	hasExistingLogs,
+	releaseLock,
+} from "./shared.js";
 export function registerCheckCommand(program: Command): void {
 	program
@@ -24,14 +35,16 @@ export function registerCheckCommand(program: Command): void {
 			"Use diff for current uncommitted changes (staged and unstaged)",
 		)
 		.action(async (options) => {
+			let config: Awaited<ReturnType<typeof loadConfig>> | undefined;
+			let lockAcquired = false;
+			let restoreConsole: (() => void) | undefined;
 			try {
-				const config = await loadConfig();
-				// Rotate logs before starting
-				await rotateLogs(config.project.log_dir);
+				config = await loadConfig();
+				restoreConsole = await startConsoleLog(config.project.log_dir);
+				await acquireLock(config.project.log_dir);
+				lockAcquired = true;
 				// Determine effective base branch
-				// Priority: CLI override > CI env var > config
 				const effectiveBaseBranch =
 					options.baseBranch ||
 					(process.env.GITHUB_BASE_REF &&
@@ -40,10 +53,76 @@ export function registerCheckCommand(program: Command): void {
 						: null) ||
 					config.project.base_branch;
-				const changeDetector = new ChangeDetector(effectiveBaseBranch, {
-					commit: options.commit,
-					uncommitted: options.uncommitted,
-				});
+				// Detect rerun mode: if logs exist and not targeting a specific commit, enter verification mode
+				const logsExist = await hasExistingLogs(config.project.log_dir);
+				const isRerun = logsExist && !options.commit;
+				let failuresMap:
+					| Map<string, Map<string, PreviousViolation[]>>
+					| undefined;
+				let changeOptions:
+					| { commit?: string; uncommitted?: boolean; fixBase?: string }
+					| undefined;
+				if (isRerun) {
+					console.log(
+						chalk.dim(
+							"Existing logs detected — running in verification mode...",
+						),
+					);
+					const previousFailures = await findPreviousFailures(
+						config.project.log_dir,
+						options.gate,
+					);
+					failuresMap = new Map();
+					for (const gateFailure of previousFailures) {
+						const adapterMap = new Map<string, PreviousViolation[]>();
+						for (const af of gateFailure.adapterFailures) {
+							const key = af.reviewIndex
+								? String(af.reviewIndex)
+								: af.adapterName;
+							adapterMap.set(key, af.violations);
+						}
+						failuresMap.set(gateFailure.jobId, adapterMap);
+					}
+					if (previousFailures.length > 0) {
+						const totalViolations = previousFailures.reduce(
+							(sum, gf) =>
+								sum +
+								gf.adapterFailures.reduce(
+									(s, af) => s + af.violations.length,
+									0,
+								),
+							0,
+						);
+						console.log(
+							chalk.yellow(
+								`Found ${previousFailures.length} gate(s) with ${totalViolations} previous violation(s)`,
+							),
+						);
+					}
+					changeOptions = { uncommitted: true };
+					const fixBase = await readSessionRef(config.project.log_dir);
+					if (fixBase) {
+						changeOptions.fixBase = fixBase;
+					}
+				} else if (options.commit || options.uncommitted) {
+					changeOptions = {
+						commit: options.commit,
+						uncommitted: options.uncommitted,
+					};
+				}
+				const changeDetector = new ChangeDetector(
+					effectiveBaseBranch,
+					changeOptions || {
+						commit: options.commit,
+						uncommitted: options.uncommitted,
+					},
+				);
 				const expander = new EntryPointExpander();
 				const jobGen = new JobGenerator(config);
@@ -52,6 +131,8 @@ export function registerCheckCommand(program: Command): void {
 				if (changes.length === 0) {
 					console.log(chalk.green("No changes detected."));
+					await releaseLock(config.project.log_dir);
+					restoreConsole?.();
 					process.exit(0);
 				}
@@ -72,6 +153,8 @@ export function registerCheckCommand(program: Command): void {
 				if (jobs.length === 0) {
 					console.log(chalk.yellow("No applicable checks for these changes."));
+					await releaseLock(config.project.log_dir);
+					restoreConsole?.();
 					process.exit(0);
 				}
@@ -83,16 +166,29 @@ export function registerCheckCommand(program: Command): void {
 					config,
 					logger,
 					reporter,
-					undefined,
-					undefined,
+					failuresMap,
+					changeOptions,
 					effectiveBaseBranch,
 				);
 				const success = await runner.run(jobs);
+				if (success) {
+					await cleanLogs(config.project.log_dir);
+				} else {
+					await writeSessionRef(config.project.log_dir);
+				}
+				await releaseLock(config.project.log_dir);
+				restoreConsole?.();
 				process.exit(success ? 0 : 1);
 			} catch (error: unknown) {
+				if (config && lockAcquired) {
+					await releaseLock(config.project.log_dir);
+				}
 				const err = error as { message?: string };
 				console.error(chalk.red("Error:"), err.message);
+				restoreConsole?.();
 				process.exit(1);
 			}
 		});

package/src/commands/ci/list-jobs.ts CHANGED Viewed

@@ -34,8 +34,9 @@ export async function listJobs(): Promise<void> {
 						}
 						const workingDirectory = checkDef.working_directory || ep.path;
-						// Include entry point in key to ensure each entry point/check pair is distinct
-						const jobKey = `${ep.path}:${check.name}:${workingDirectory}`;
+						// Dedupe by check name + working directory only - if two entry points
+						// both trigger e.g. "test" with working_directory: ".", run it once
+						const jobKey = `${check.name}:${workingDirectory}`;
 						// Skip if we've already created a job for this exact entry point/check combination
 						if (seenJobs.has(jobKey)) {

package/src/commands/clean.ts ADDED Viewed

@@ -0,0 +1,29 @@
+import chalk from "chalk";
+import type { Command } from "commander";
+import { loadConfig } from "../config/loader.js";
+import { acquireLock, cleanLogs, releaseLock } from "./shared.js";
+export function registerCleanCommand(program: Command): void {
+	program
+		.command("clean")
+		.description("Archive logs (move current logs into previous/)")
+		.action(async () => {
+			let config: Awaited<ReturnType<typeof loadConfig>> | undefined;
+			let lockAcquired = false;
+			try {
+				config = await loadConfig();
+				await acquireLock(config.project.log_dir);
+				lockAcquired = true;
+				await cleanLogs(config.project.log_dir);
+				await releaseLock(config.project.log_dir);
+				console.log(chalk.green("Logs archived successfully."));
+			} catch (error: unknown) {
+				if (config && lockAcquired) {
+					await releaseLock(config.project.log_dir);
+				}
+				const err = error as { message?: string };
+				console.error(chalk.red("Error:"), err.message);
+				process.exit(1);
+			}
+		});
+}

package/src/commands/help.ts CHANGED Viewed

@@ -15,9 +15,9 @@ export function registerHelpCommand(program: Command): void {
 			);
 			console.log(chalk.bold("Commands:\n"));
 			console.log("  run      Run gates for detected changes");
-			console.log("  rerun    Rerun gates with previous failure context");
 			console.log("  check    Run only applicable checks");
 			console.log("  review   Run only applicable reviews");
+			console.log("  clean    Archive logs (move current logs into previous/)");
 			console.log(
 				"  detect   Show what gates would run (without executing them)",
 			);

package/src/commands/index.ts CHANGED Viewed

@@ -1,10 +1,11 @@
 export { registerCheckCommand } from "./check.js";
 export { registerCICommand } from "./ci/index.js";
+export { registerCleanCommand } from "./clean.js";
 export { registerDetectCommand } from "./detect.js";
 export { registerHealthCommand } from "./health.js";
 export { registerHelpCommand } from "./help.js";
 export { registerInitCommand } from "./init.js";
 export { registerListCommand } from "./list.js";
-export { registerRerunCommand } from "./rerun.js";
 export { registerReviewCommand } from "./review.js";
 export { registerRunCommand } from "./run.js";
+export { registerValidateCommand } from "./validate.js";

package/src/commands/init.ts CHANGED Viewed

@@ -35,12 +35,12 @@ Execute the autonomous verification suite.
    - If fix instructions are available, they will be in the log file under a "--- Fix Instructions ---" section—carefully read and apply them FIRST before attempting other fixes.
 3. Fix any code or logic errors found by the tools or AI reviewers, prioritizing higher-priority violations (critical > high > medium > low).
 4. Apply the trust level above when deciding whether to act on AI reviewer feedback. If you skip an issue due to the trust threshold, report it with a brief explanation (e.g., "Skipped: [issue summary] — reason: [stylistic/subjective/disagree]").
-5. Do NOT commit your changes yet—keep them uncommitted so the rerun command can review them.
-6. Run \`agent-gauntlet rerun\` to verify your fixes. The rerun command reviews only uncommitted changes and uses previous failures as context.
+5. Do NOT commit your changes yet—keep them uncommitted so the next run can verify them.
+6. Run \`agent-gauntlet run\` again to verify your fixes. It will detect existing logs and automatically switch to verification mode (uncommitted changes + previous failure context).
 7. Repeat steps 2-6 until one of the following termination conditions is met:
-   - All gates pass
+   - All gates pass (logs are automatically archived)
    - You disagree with remaining failures (ask the human how to proceed)
-   - Still failing after 3 rerun attempts
+   - Still failing after 3 attempts
 8. Once all gates pass, do NOT commit or push your changes—await the human's review and explicit instruction to commit.
 `;

package/src/commands/review.ts CHANGED Viewed

@@ -6,8 +6,19 @@ import { EntryPointExpander } from "../core/entry-point.js";
 import { JobGenerator } from "../core/job.js";
 import { Runner } from "../core/runner.js";
 import { ConsoleReporter } from "../output/console.js";
+import { startConsoleLog } from "../output/console-log.js";
 import { Logger } from "../output/logger.js";
-import { rotateLogs } from "./shared.js";
+import {
+	findPreviousFailures,
+	type PreviousViolation,
+} from "../utils/log-parser.js";
+import { readSessionRef, writeSessionRef } from "../utils/session-ref.js";
+import {
+	acquireLock,
+	cleanLogs,
+	hasExistingLogs,
+	releaseLock,
+} from "./shared.js";
 export function registerReviewCommand(program: Command): void {
 	program
@@ -24,14 +35,16 @@ export function registerReviewCommand(program: Command): void {
 			"Use diff for current uncommitted changes (staged and unstaged)",
 		)
 		.action(async (options) => {
+			let config: Awaited<ReturnType<typeof loadConfig>> | undefined;
+			let lockAcquired = false;
+			let restoreConsole: (() => void) | undefined;
 			try {
-				const config = await loadConfig();
-				// Rotate logs before starting
-				await rotateLogs(config.project.log_dir);
+				config = await loadConfig();
+				restoreConsole = await startConsoleLog(config.project.log_dir);
+				await acquireLock(config.project.log_dir);
+				lockAcquired = true;
 				// Determine effective base branch
-				// Priority: CLI override > CI env var > config
 				const effectiveBaseBranch =
 					options.baseBranch ||
 					(process.env.GITHUB_BASE_REF &&
@@ -40,10 +53,76 @@ export function registerReviewCommand(program: Command): void {
 						: null) ||
 					config.project.base_branch;
-				const changeDetector = new ChangeDetector(effectiveBaseBranch, {
-					commit: options.commit,
-					uncommitted: options.uncommitted,
-				});
+				// Detect rerun mode: if logs exist and not targeting a specific commit, enter verification mode
+				const logsExist = await hasExistingLogs(config.project.log_dir);
+				const isRerun = logsExist && !options.commit;
+				let failuresMap:
+					| Map<string, Map<string, PreviousViolation[]>>
+					| undefined;
+				let changeOptions:
+					| { commit?: string; uncommitted?: boolean; fixBase?: string }
+					| undefined;
+				if (isRerun) {
+					console.log(
+						chalk.dim(
+							"Existing logs detected — running in verification mode...",
+						),
+					);
+					const previousFailures = await findPreviousFailures(
+						config.project.log_dir,
+						options.gate,
+					);
+					failuresMap = new Map();
+					for (const gateFailure of previousFailures) {
+						const adapterMap = new Map<string, PreviousViolation[]>();
+						for (const af of gateFailure.adapterFailures) {
+							const key = af.reviewIndex
+								? String(af.reviewIndex)
+								: af.adapterName;
+							adapterMap.set(key, af.violations);
+						}
+						failuresMap.set(gateFailure.jobId, adapterMap);
+					}
+					if (previousFailures.length > 0) {
+						const totalViolations = previousFailures.reduce(
+							(sum, gf) =>
+								sum +
+								gf.adapterFailures.reduce(
+									(s, af) => s + af.violations.length,
+									0,
+								),
+							0,
+						);
+						console.log(
+							chalk.yellow(
+								`Found ${previousFailures.length} gate(s) with ${totalViolations} previous violation(s)`,
+							),
+						);
+					}
+					changeOptions = { uncommitted: true };
+					const fixBase = await readSessionRef(config.project.log_dir);
+					if (fixBase) {
+						changeOptions.fixBase = fixBase;
+					}
+				} else if (options.commit || options.uncommitted) {
+					changeOptions = {
+						commit: options.commit,
+						uncommitted: options.uncommitted,
+					};
+				}
+				const changeDetector = new ChangeDetector(
+					effectiveBaseBranch,
+					changeOptions || {
+						commit: options.commit,
+						uncommitted: options.uncommitted,
+					},
+				);
 				const expander = new EntryPointExpander();
 				const jobGen = new JobGenerator(config);
@@ -52,6 +131,8 @@ export function registerReviewCommand(program: Command): void {
 				if (changes.length === 0) {
 					console.log(chalk.green("No changes detected."));
+					await releaseLock(config.project.log_dir);
+					restoreConsole?.();
 					process.exit(0);
 				}
@@ -72,6 +153,8 @@ export function registerReviewCommand(program: Command): void {
 				if (jobs.length === 0) {
 					console.log(chalk.yellow("No applicable reviews for these changes."));
+					await releaseLock(config.project.log_dir);
+					restoreConsole?.();
 					process.exit(0);
 				}
@@ -83,16 +166,29 @@ export function registerReviewCommand(program: Command): void {
 					config,
 					logger,
 					reporter,
-					undefined,
-					undefined,
+					failuresMap,
+					changeOptions,
 					effectiveBaseBranch,
 				);
 				const success = await runner.run(jobs);
+				if (success) {
+					await cleanLogs(config.project.log_dir);
+				} else {
+					await writeSessionRef(config.project.log_dir);
+				}
+				await releaseLock(config.project.log_dir);
+				restoreConsole?.();
 				process.exit(success ? 0 : 1);
 			} catch (error: unknown) {
+				if (config && lockAcquired) {
+					await releaseLock(config.project.log_dir);
+				}
 				const err = error as { message?: string };
 				console.error(chalk.red("Error:"), err.message);
+				restoreConsole?.();
 				process.exit(1);
 			}
 		});