agent-gauntlet 0.2.2 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. package/README.md +3 -3
  2. package/package.json +1 -1
  3. package/src/cli-adapters/claude.ts +13 -1
  4. package/src/cli-adapters/gemini.ts +17 -2
  5. package/src/commands/check.ts +108 -12
  6. package/src/commands/ci/list-jobs.ts +3 -2
  7. package/src/commands/clean.ts +29 -0
  8. package/src/commands/help.ts +1 -1
  9. package/src/commands/index.ts +2 -1
  10. package/src/commands/init.ts +4 -4
  11. package/src/commands/review.ts +108 -12
  12. package/src/commands/run.ts +109 -12
  13. package/src/commands/shared.ts +56 -10
  14. package/src/commands/validate.ts +20 -0
  15. package/src/config/schema.ts +5 -0
  16. package/src/config/validator.ts +6 -13
  17. package/src/core/change-detector.ts +1 -0
  18. package/src/core/entry-point.ts +48 -7
  19. package/src/core/runner.ts +90 -56
  20. package/src/gates/result.ts +32 -0
  21. package/src/gates/review.ts +428 -162
  22. package/src/index.ts +4 -2
  23. package/src/output/console-log.ts +146 -0
  24. package/src/output/console.ts +103 -9
  25. package/src/output/logger.ts +52 -8
  26. package/src/templates/run_gauntlet.template.md +20 -13
  27. package/src/utils/log-parser.ts +498 -162
  28. package/src/utils/session-ref.ts +82 -0
  29. package/src/commands/check.test.ts +0 -29
  30. package/src/commands/detect.test.ts +0 -43
  31. package/src/commands/health.test.ts +0 -93
  32. package/src/commands/help.test.ts +0 -44
  33. package/src/commands/init.test.ts +0 -130
  34. package/src/commands/list.test.ts +0 -121
  35. package/src/commands/rerun.ts +0 -160
  36. package/src/commands/review.test.ts +0 -31
  37. package/src/commands/run.test.ts +0 -27
  38. package/src/config/loader.test.ts +0 -151
  39. package/src/core/entry-point.test.ts +0 -61
  40. package/src/gates/review.test.ts +0 -291
package/README.md CHANGED
@@ -51,9 +51,9 @@ The use cases below illustrate when each of these patterns may be used.
51
51
  2. Run `/gauntlet` from chat
52
52
  3. Gauntlet detects changed files and runs configured checks (linter, tests, type checking, etc.)
53
53
  4. Simultaneously, Gauntlet invokes AI CLIs for code review
54
- 5. Assistant reviews results, fixes identified issues, and runs `agent-gauntlet rerun`
55
- 6. Gauntlet verifies fixes and checks for new issues
56
- 7. Process repeats automatically (up to 3 reruns) until all gates pass
54
+ 5. Assistant reviews results, fixes identified issues, and runs `agent-gauntlet run` again
55
+ 6. Gauntlet detects existing logs, switches to verification mode, and checks fixes
56
+ 7. Process repeats automatically (up to 3 iterations) until all gates pass
57
57
 
58
58
  ### 3. Agentic Implementation
59
59
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "agent-gauntlet",
3
- "version": "0.2.2",
3
+ "version": "0.4.0",
4
4
  "description": "A CLI tool for testing AI coding agents",
5
5
  "license": "Apache-2.0",
6
6
  "author": "Paul Caplan",
package/src/cli-adapters/claude.ts CHANGED
@@ -40,7 +40,7 @@ export class ClaudeAdapter implements CLIAdapter {
40
40
  // We use a simple "hello" prompt to avoid "No messages returned" errors from empty input
41
41
  const { stdout, stderr } = await execAsync(
42
42
  'echo "hello" | claude -p --max-turns 1',
43
- { timeout: 10000 },
43
+ { timeout: 30000 },
44
44
  );
45
45
 
46
46
  const combined = (stdout || "") + (stderr || "");
@@ -58,7 +58,19 @@ export class ClaudeAdapter implements CLIAdapter {
58
58
  stderr?: string;
59
59
  stdout?: string;
60
60
  message?: string;
61
+ code?: number | string;
62
+ signal?: string;
61
63
  };
64
+
65
+ // Check for timeout
66
+ if (execError.signal === "SIGTERM" && execError.code === null) {
67
+ return {
68
+ available: true,
69
+ status: "unhealthy",
70
+ message: "Error: Health check timed out",
71
+ };
72
+ }
73
+
62
74
  const stderr = execError.stderr || "";
63
75
  const stdout = execError.stdout || "";
64
76
  const combined = stderr + stdout;
package/src/cli-adapters/gemini.ts CHANGED
@@ -38,7 +38,7 @@ export class GeminiAdapter implements CLIAdapter {
38
38
  try {
39
39
  const { stdout, stderr } = await execAsync(
40
40
  'echo "hello" | gemini --sandbox --output-format text',
41
- { timeout: 10000 },
41
+ { timeout: 30000 },
42
42
  );
43
43
 
44
44
  const combined = (stdout || "") + (stderr || "");
@@ -56,7 +56,19 @@ export class GeminiAdapter implements CLIAdapter {
56
56
  stderr?: string;
57
57
  stdout?: string;
58
58
  message?: string;
59
+ code?: number | string;
60
+ signal?: string;
59
61
  };
62
+
63
+ // Check for timeout
64
+ if (execError.signal === "SIGTERM" && execError.code === null) {
65
+ return {
66
+ available: true,
67
+ status: "unhealthy",
68
+ message: "Error: Health check timed out",
69
+ };
70
+ }
71
+
60
72
  const stderr = execError.stderr || "";
61
73
  const stdout = execError.stdout || "";
62
74
  const combined = stderr + stdout;
@@ -159,7 +171,10 @@ ${escapedBody}
159
171
 
160
172
  // Write to a temporary file to avoid shell escaping issues
161
173
  const tmpDir = os.tmpdir();
162
- const tmpFile = path.join(tmpDir, `gauntlet-gemini-${Date.now()}.txt`);
174
+ const tmpFile = path.join(
175
+ tmpDir,
176
+ `gauntlet-gemini-${process.pid}-${Date.now()}.txt`,
177
+ );
163
178
  await fs.writeFile(tmpFile, fullContent);
164
179
 
165
180
  try {
package/src/commands/check.ts CHANGED
@@ -6,8 +6,19 @@ import { EntryPointExpander } from "../core/entry-point.js";
6
6
  import { JobGenerator } from "../core/job.js";
7
7
  import { Runner } from "../core/runner.js";
8
8
  import { ConsoleReporter } from "../output/console.js";
9
+ import { startConsoleLog } from "../output/console-log.js";
9
10
  import { Logger } from "../output/logger.js";
10
- import { rotateLogs } from "./shared.js";
11
+ import {
12
+ findPreviousFailures,
13
+ type PreviousViolation,
14
+ } from "../utils/log-parser.js";
15
+ import { readSessionRef, writeSessionRef } from "../utils/session-ref.js";
16
+ import {
17
+ acquireLock,
18
+ cleanLogs,
19
+ hasExistingLogs,
20
+ releaseLock,
21
+ } from "./shared.js";
11
22
 
12
23
  export function registerCheckCommand(program: Command): void {
13
24
  program
@@ -24,14 +35,16 @@ export function registerCheckCommand(program: Command): void {
24
35
  "Use diff for current uncommitted changes (staged and unstaged)",
25
36
  )
26
37
  .action(async (options) => {
38
+ let config: Awaited<ReturnType<typeof loadConfig>> | undefined;
39
+ let lockAcquired = false;
40
+ let restoreConsole: (() => void) | undefined;
27
41
  try {
28
- const config = await loadConfig();
29
-
30
- // Rotate logs before starting
31
- await rotateLogs(config.project.log_dir);
42
+ config = await loadConfig();
43
+ restoreConsole = await startConsoleLog(config.project.log_dir);
44
+ await acquireLock(config.project.log_dir);
45
+ lockAcquired = true;
32
46
 
33
47
  // Determine effective base branch
34
- // Priority: CLI override > CI env var > config
35
48
  const effectiveBaseBranch =
36
49
  options.baseBranch ||
37
50
  (process.env.GITHUB_BASE_REF &&
@@ -40,10 +53,76 @@ export function registerCheckCommand(program: Command): void {
40
53
  : null) ||
41
54
  config.project.base_branch;
42
55
 
43
- const changeDetector = new ChangeDetector(effectiveBaseBranch, {
44
- commit: options.commit,
45
- uncommitted: options.uncommitted,
46
- });
56
+ // Detect rerun mode: if logs exist and not targeting a specific commit, enter verification mode
57
+ const logsExist = await hasExistingLogs(config.project.log_dir);
58
+ const isRerun = logsExist && !options.commit;
59
+
60
+ let failuresMap:
61
+ | Map<string, Map<string, PreviousViolation[]>>
62
+ | undefined;
63
+ let changeOptions:
64
+ | { commit?: string; uncommitted?: boolean; fixBase?: string }
65
+ | undefined;
66
+
67
+ if (isRerun) {
68
+ console.log(
69
+ chalk.dim(
70
+ "Existing logs detected — running in verification mode...",
71
+ ),
72
+ );
73
+ const previousFailures = await findPreviousFailures(
74
+ config.project.log_dir,
75
+ options.gate,
76
+ );
77
+
78
+ failuresMap = new Map();
79
+ for (const gateFailure of previousFailures) {
80
+ const adapterMap = new Map<string, PreviousViolation[]>();
81
+ for (const af of gateFailure.adapterFailures) {
82
+ const key = af.reviewIndex
83
+ ? String(af.reviewIndex)
84
+ : af.adapterName;
85
+ adapterMap.set(key, af.violations);
86
+ }
87
+ failuresMap.set(gateFailure.jobId, adapterMap);
88
+ }
89
+
90
+ if (previousFailures.length > 0) {
91
+ const totalViolations = previousFailures.reduce(
92
+ (sum, gf) =>
93
+ sum +
94
+ gf.adapterFailures.reduce(
95
+ (s, af) => s + af.violations.length,
96
+ 0,
97
+ ),
98
+ 0,
99
+ );
100
+ console.log(
101
+ chalk.yellow(
102
+ `Found ${previousFailures.length} gate(s) with ${totalViolations} previous violation(s)`,
103
+ ),
104
+ );
105
+ }
106
+
107
+ changeOptions = { uncommitted: true };
108
+ const fixBase = await readSessionRef(config.project.log_dir);
109
+ if (fixBase) {
110
+ changeOptions.fixBase = fixBase;
111
+ }
112
+ } else if (options.commit || options.uncommitted) {
113
+ changeOptions = {
114
+ commit: options.commit,
115
+ uncommitted: options.uncommitted,
116
+ };
117
+ }
118
+
119
+ const changeDetector = new ChangeDetector(
120
+ effectiveBaseBranch,
121
+ changeOptions || {
122
+ commit: options.commit,
123
+ uncommitted: options.uncommitted,
124
+ },
125
+ );
47
126
  const expander = new EntryPointExpander();
48
127
  const jobGen = new JobGenerator(config);
49
128
 
@@ -52,6 +131,8 @@ export function registerCheckCommand(program: Command): void {
52
131
 
53
132
  if (changes.length === 0) {
54
133
  console.log(chalk.green("No changes detected."));
134
+ await releaseLock(config.project.log_dir);
135
+ restoreConsole?.();
55
136
  process.exit(0);
56
137
  }
57
138
 
@@ -72,6 +153,8 @@ export function registerCheckCommand(program: Command): void {
72
153
 
73
154
  if (jobs.length === 0) {
74
155
  console.log(chalk.yellow("No applicable checks for these changes."));
156
+ await releaseLock(config.project.log_dir);
157
+ restoreConsole?.();
75
158
  process.exit(0);
76
159
  }
77
160
 
@@ -83,16 +166,29 @@ export function registerCheckCommand(program: Command): void {
83
166
  config,
84
167
  logger,
85
168
  reporter,
86
- undefined,
87
- undefined,
169
+ failuresMap,
170
+ changeOptions,
88
171
  effectiveBaseBranch,
89
172
  );
90
173
 
91
174
  const success = await runner.run(jobs);
175
+
176
+ if (success) {
177
+ await cleanLogs(config.project.log_dir);
178
+ } else {
179
+ await writeSessionRef(config.project.log_dir);
180
+ }
181
+
182
+ await releaseLock(config.project.log_dir);
183
+ restoreConsole?.();
92
184
  process.exit(success ? 0 : 1);
93
185
  } catch (error: unknown) {
186
+ if (config && lockAcquired) {
187
+ await releaseLock(config.project.log_dir);
188
+ }
94
189
  const err = error as { message?: string };
95
190
  console.error(chalk.red("Error:"), err.message);
191
+ restoreConsole?.();
96
192
  process.exit(1);
97
193
  }
98
194
  });
package/src/commands/ci/list-jobs.ts CHANGED
@@ -34,8 +34,9 @@ export async function listJobs(): Promise<void> {
34
34
  }
35
35
 
36
36
  const workingDirectory = checkDef.working_directory || ep.path;
37
- // Include entry point in key to ensure each entry point/check pair is distinct
38
- const jobKey = `${ep.path}:${check.name}:${workingDirectory}`;
37
+ // Dedupe by check name + working directory only - if two entry points
38
+ // both trigger e.g. "test" with working_directory: ".", run it once
39
+ const jobKey = `${check.name}:${workingDirectory}`;
39
40
 
40
41
  // Skip if we've already created a job for this exact entry point/check combination
41
42
  if (seenJobs.has(jobKey)) {
package/src/commands/clean.ts ADDED
@@ -0,0 +1,29 @@
1
+ import chalk from "chalk";
2
+ import type { Command } from "commander";
3
+ import { loadConfig } from "../config/loader.js";
4
+ import { acquireLock, cleanLogs, releaseLock } from "./shared.js";
5
+
6
+ export function registerCleanCommand(program: Command): void {
7
+ program
8
+ .command("clean")
9
+ .description("Archive logs (move current logs into previous/)")
10
+ .action(async () => {
11
+ let config: Awaited<ReturnType<typeof loadConfig>> | undefined;
12
+ let lockAcquired = false;
13
+ try {
14
+ config = await loadConfig();
15
+ await acquireLock(config.project.log_dir);
16
+ lockAcquired = true;
17
+ await cleanLogs(config.project.log_dir);
18
+ await releaseLock(config.project.log_dir);
19
+ console.log(chalk.green("Logs archived successfully."));
20
+ } catch (error: unknown) {
21
+ if (config && lockAcquired) {
22
+ await releaseLock(config.project.log_dir);
23
+ }
24
+ const err = error as { message?: string };
25
+ console.error(chalk.red("Error:"), err.message);
26
+ process.exit(1);
27
+ }
28
+ });
29
+ }
package/src/commands/help.ts CHANGED
@@ -15,9 +15,9 @@ export function registerHelpCommand(program: Command): void {
15
15
  );
16
16
  console.log(chalk.bold("Commands:\n"));
17
17
  console.log(" run Run gates for detected changes");
18
- console.log(" rerun Rerun gates with previous failure context");
19
18
  console.log(" check Run only applicable checks");
20
19
  console.log(" review Run only applicable reviews");
20
+ console.log(" clean Archive logs (move current logs into previous/)");
21
21
  console.log(
22
22
  " detect Show what gates would run (without executing them)",
23
23
  );
package/src/commands/index.ts CHANGED
@@ -1,10 +1,11 @@
1
1
  export { registerCheckCommand } from "./check.js";
2
2
  export { registerCICommand } from "./ci/index.js";
3
+ export { registerCleanCommand } from "./clean.js";
3
4
  export { registerDetectCommand } from "./detect.js";
4
5
  export { registerHealthCommand } from "./health.js";
5
6
  export { registerHelpCommand } from "./help.js";
6
7
  export { registerInitCommand } from "./init.js";
7
8
  export { registerListCommand } from "./list.js";
8
- export { registerRerunCommand } from "./rerun.js";
9
9
  export { registerReviewCommand } from "./review.js";
10
10
  export { registerRunCommand } from "./run.js";
11
+ export { registerValidateCommand } from "./validate.js";
package/src/commands/init.ts CHANGED
@@ -35,12 +35,12 @@ Execute the autonomous verification suite.
35
35
  - If fix instructions are available, they will be in the log file under a "--- Fix Instructions ---" section—carefully read and apply them FIRST before attempting other fixes.
36
36
  3. Fix any code or logic errors found by the tools or AI reviewers, prioritizing higher-priority violations (critical > high > medium > low).
37
37
  4. Apply the trust level above when deciding whether to act on AI reviewer feedback. If you skip an issue due to the trust threshold, report it with a brief explanation (e.g., "Skipped: [issue summary] — reason: [stylistic/subjective/disagree]").
38
- 5. Do NOT commit your changes yet—keep them uncommitted so the rerun command can review them.
39
- 6. Run \`agent-gauntlet rerun\` to verify your fixes. The rerun command reviews only uncommitted changes and uses previous failures as context.
38
+ 5. Do NOT commit your changes yet—keep them uncommitted so the next run can verify them.
39
+ 6. Run \`agent-gauntlet run\` again to verify your fixes. It will detect existing logs and automatically switch to verification mode (uncommitted changes + previous failure context).
40
40
  7. Repeat steps 2-6 until one of the following termination conditions is met:
41
- - All gates pass
41
+ - All gates pass (logs are automatically archived)
42
42
  - You disagree with remaining failures (ask the human how to proceed)
43
- - Still failing after 3 rerun attempts
43
+ - Still failing after 3 attempts
44
44
  8. Once all gates pass, do NOT commit or push your changes—await the human's review and explicit instruction to commit.
45
45
  `;
46
46
 
package/src/commands/review.ts CHANGED
@@ -6,8 +6,19 @@ import { EntryPointExpander } from "../core/entry-point.js";
6
6
  import { JobGenerator } from "../core/job.js";
7
7
  import { Runner } from "../core/runner.js";
8
8
  import { ConsoleReporter } from "../output/console.js";
9
+ import { startConsoleLog } from "../output/console-log.js";
9
10
  import { Logger } from "../output/logger.js";
10
- import { rotateLogs } from "./shared.js";
11
+ import {
12
+ findPreviousFailures,
13
+ type PreviousViolation,
14
+ } from "../utils/log-parser.js";
15
+ import { readSessionRef, writeSessionRef } from "../utils/session-ref.js";
16
+ import {
17
+ acquireLock,
18
+ cleanLogs,
19
+ hasExistingLogs,
20
+ releaseLock,
21
+ } from "./shared.js";
11
22
 
12
23
  export function registerReviewCommand(program: Command): void {
13
24
  program
@@ -24,14 +35,16 @@ export function registerReviewCommand(program: Command): void {
24
35
  "Use diff for current uncommitted changes (staged and unstaged)",
25
36
  )
26
37
  .action(async (options) => {
38
+ let config: Awaited<ReturnType<typeof loadConfig>> | undefined;
39
+ let lockAcquired = false;
40
+ let restoreConsole: (() => void) | undefined;
27
41
  try {
28
- const config = await loadConfig();
29
-
30
- // Rotate logs before starting
31
- await rotateLogs(config.project.log_dir);
42
+ config = await loadConfig();
43
+ restoreConsole = await startConsoleLog(config.project.log_dir);
44
+ await acquireLock(config.project.log_dir);
45
+ lockAcquired = true;
32
46
 
33
47
  // Determine effective base branch
34
- // Priority: CLI override > CI env var > config
35
48
  const effectiveBaseBranch =
36
49
  options.baseBranch ||
37
50
  (process.env.GITHUB_BASE_REF &&
@@ -40,10 +53,76 @@ export function registerReviewCommand(program: Command): void {
40
53
  : null) ||
41
54
  config.project.base_branch;
42
55
 
43
- const changeDetector = new ChangeDetector(effectiveBaseBranch, {
44
- commit: options.commit,
45
- uncommitted: options.uncommitted,
46
- });
56
+ // Detect rerun mode: if logs exist and not targeting a specific commit, enter verification mode
57
+ const logsExist = await hasExistingLogs(config.project.log_dir);
58
+ const isRerun = logsExist && !options.commit;
59
+
60
+ let failuresMap:
61
+ | Map<string, Map<string, PreviousViolation[]>>
62
+ | undefined;
63
+ let changeOptions:
64
+ | { commit?: string; uncommitted?: boolean; fixBase?: string }
65
+ | undefined;
66
+
67
+ if (isRerun) {
68
+ console.log(
69
+ chalk.dim(
70
+ "Existing logs detected — running in verification mode...",
71
+ ),
72
+ );
73
+ const previousFailures = await findPreviousFailures(
74
+ config.project.log_dir,
75
+ options.gate,
76
+ );
77
+
78
+ failuresMap = new Map();
79
+ for (const gateFailure of previousFailures) {
80
+ const adapterMap = new Map<string, PreviousViolation[]>();
81
+ for (const af of gateFailure.adapterFailures) {
82
+ const key = af.reviewIndex
83
+ ? String(af.reviewIndex)
84
+ : af.adapterName;
85
+ adapterMap.set(key, af.violations);
86
+ }
87
+ failuresMap.set(gateFailure.jobId, adapterMap);
88
+ }
89
+
90
+ if (previousFailures.length > 0) {
91
+ const totalViolations = previousFailures.reduce(
92
+ (sum, gf) =>
93
+ sum +
94
+ gf.adapterFailures.reduce(
95
+ (s, af) => s + af.violations.length,
96
+ 0,
97
+ ),
98
+ 0,
99
+ );
100
+ console.log(
101
+ chalk.yellow(
102
+ `Found ${previousFailures.length} gate(s) with ${totalViolations} previous violation(s)`,
103
+ ),
104
+ );
105
+ }
106
+
107
+ changeOptions = { uncommitted: true };
108
+ const fixBase = await readSessionRef(config.project.log_dir);
109
+ if (fixBase) {
110
+ changeOptions.fixBase = fixBase;
111
+ }
112
+ } else if (options.commit || options.uncommitted) {
113
+ changeOptions = {
114
+ commit: options.commit,
115
+ uncommitted: options.uncommitted,
116
+ };
117
+ }
118
+
119
+ const changeDetector = new ChangeDetector(
120
+ effectiveBaseBranch,
121
+ changeOptions || {
122
+ commit: options.commit,
123
+ uncommitted: options.uncommitted,
124
+ },
125
+ );
47
126
  const expander = new EntryPointExpander();
48
127
  const jobGen = new JobGenerator(config);
49
128
 
@@ -52,6 +131,8 @@ export function registerReviewCommand(program: Command): void {
52
131
 
53
132
  if (changes.length === 0) {
54
133
  console.log(chalk.green("No changes detected."));
134
+ await releaseLock(config.project.log_dir);
135
+ restoreConsole?.();
55
136
  process.exit(0);
56
137
  }
57
138
 
@@ -72,6 +153,8 @@ export function registerReviewCommand(program: Command): void {
72
153
 
73
154
  if (jobs.length === 0) {
74
155
  console.log(chalk.yellow("No applicable reviews for these changes."));
156
+ await releaseLock(config.project.log_dir);
157
+ restoreConsole?.();
75
158
  process.exit(0);
76
159
  }
77
160
 
@@ -83,16 +166,29 @@ export function registerReviewCommand(program: Command): void {
83
166
  config,
84
167
  logger,
85
168
  reporter,
86
- undefined,
87
- undefined,
169
+ failuresMap,
170
+ changeOptions,
88
171
  effectiveBaseBranch,
89
172
  );
90
173
 
91
174
  const success = await runner.run(jobs);
175
+
176
+ if (success) {
177
+ await cleanLogs(config.project.log_dir);
178
+ } else {
179
+ await writeSessionRef(config.project.log_dir);
180
+ }
181
+
182
+ await releaseLock(config.project.log_dir);
183
+ restoreConsole?.();
92
184
  process.exit(success ? 0 : 1);
93
185
  } catch (error: unknown) {
186
+ if (config && lockAcquired) {
187
+ await releaseLock(config.project.log_dir);
188
+ }
94
189
  const err = error as { message?: string };
95
190
  console.error(chalk.red("Error:"), err.message);
191
+ restoreConsole?.();
96
192
  process.exit(1);
97
193
  }
98
194
  });