agent-gauntlet 0.2.0 → 0.2.2

This diff shows the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "agent-gauntlet",
3
- "version": "0.2.0",
3
+ "version": "0.2.2",
4
4
  "description": "A CLI tool for testing AI coding agents",
5
5
  "license": "Apache-2.0",
6
6
  "author": "Paul Caplan",
@@ -12,13 +12,29 @@ const GAUNTLET_COMMAND_CONTENT = `---
12
12
  description: Run the full verification gauntlet
13
13
  allowed-tools: Bash
14
14
  ---
15
+ <!--
16
+ REVIEW TRUST LEVEL
17
+ Controls how aggressively the agent acts on AI reviewer feedback.
18
+ Change the trust_level value below to one of: high, medium, low
19
+
20
+ - high: Fix all issues unless you strongly disagree or have low confidence the human wants the change.
21
+ - medium: Fix issues you reasonably agree with or believe the human wants fixed. (DEFAULT)
22
+ - low: Fix only issues you strongly agree with or are confident the human wants fixed.
23
+ -->
24
+ <!-- trust_level: medium -->
25
+
15
26
  # /gauntlet
16
27
  Execute the autonomous verification suite.
17
28
 
29
+ **Review trust level: medium** — Fix issues you reasonably agree with or believe the human wants fixed. Skip issues that are purely stylistic, subjective, or that you believe the human would not want changed. When you skip an issue, briefly state what was skipped and why.
30
+
18
31
  1. Run \`agent-gauntlet run\`.
19
- 2. If it fails, read the log files in \`gauntlet_logs/\` to understand exactly what went wrong.
32
+ 2. If it fails:
33
+ - Check the console output for "Fix instructions: available" messages.
34
+ - Read the log files in \`gauntlet_logs/\` to understand exactly what went wrong.
35
+ - If fix instructions are available, they will be in the log file under a "--- Fix Instructions ---" section—carefully read and apply them FIRST before attempting other fixes.
20
36
  3. Fix any code or logic errors found by the tools or AI reviewers, prioritizing higher-priority violations (critical > high > medium > low).
21
- 4. If you disagree with AI reviewer feedback, briefly explain your reasoning in the code comments rather than ignoring it silently.
37
+ 4. Apply the trust level above when deciding whether to act on AI reviewer feedback. If you skip an issue due to the trust threshold, report it with a brief explanation (e.g., "Skipped: [issue summary] — reason: [stylistic/subjective/disagree]").
22
38
  5. Do NOT commit your changes yet—keep them uncommitted so the rerun command can review them.
23
39
  6. Run \`agent-gauntlet rerun\` to verify your fixes. The rerun command reviews only uncommitted changes and uses previous failures as context.
24
40
  7. Repeat steps 2-6 until one of the following termination conditions is met:
@@ -44,16 +44,19 @@ export async function loadConfig(
44
44
  const filePath = path.join(checksPath, file);
45
45
  const content = await fs.readFile(filePath, "utf-8");
46
46
  const raw = YAML.parse(content);
47
- // Ensure name matches filename if not provided or just use filename as key
47
+ const name = path.basename(file, path.extname(file));
48
48
  const parsed: CheckGateConfig = checkGateSchema.parse(raw);
49
49
 
50
50
  // Load fix instructions if specified
51
- const loadedCheck: LoadedCheckGateConfig = { ...parsed };
51
+ const loadedCheck: LoadedCheckGateConfig = {
52
+ ...parsed,
53
+ name,
54
+ };
52
55
  if (parsed.fix_instructions) {
53
56
  // Security: Reject absolute paths to prevent reading arbitrary files
54
57
  if (path.isAbsolute(parsed.fix_instructions)) {
55
58
  throw new Error(
56
- `Fix instructions path must be relative to .gauntlet/ directory, got absolute path: ${parsed.fix_instructions} (referenced by check "${parsed.name}")`,
59
+ `Fix instructions path must be relative to .gauntlet/ directory, got absolute path: ${parsed.fix_instructions} (referenced by check "${name}")`,
57
60
  );
58
61
  }
59
62
 
@@ -75,12 +78,12 @@ export async function loadConfig(
75
78
  relativePath === ""
76
79
  ) {
77
80
  throw new Error(
78
- `Fix instructions path must stay within .gauntlet/ directory and point to a file: ${parsed.fix_instructions} resolves to ${fixInstructionsPath} (referenced by check "${parsed.name}")`,
81
+ `Fix instructions path must stay within .gauntlet/ directory and point to a file: ${parsed.fix_instructions} resolves to ${fixInstructionsPath} (referenced by check "${name}")`,
79
82
  );
80
83
  }
81
84
  if (!(await fileExists(fixInstructionsPath))) {
82
85
  throw new Error(
83
- `Fix instructions file not found: ${fixInstructionsPath} (referenced by check "${parsed.name}")`,
86
+ `Fix instructions file not found: ${fixInstructionsPath} (referenced by check "${name}")`,
84
87
  );
85
88
  }
86
89
  loadedCheck.fixInstructionsContent = await fs.readFile(
@@ -89,7 +92,7 @@ export async function loadConfig(
89
92
  );
90
93
  }
91
94
 
92
- checks[parsed.name] = loadedCheck;
95
+ checks[name] = loadedCheck;
93
96
  }
94
97
  }
95
98
  }
@@ -7,7 +7,6 @@ export const cliConfigSchema = z.object({
7
7
 
8
8
  export const checkGateSchema = z
9
9
  .object({
10
- name: z.string().min(1),
11
10
  command: z.string().min(1),
12
11
  working_directory: z.string().optional(),
13
12
  parallel: z.boolean().default(false),
@@ -32,6 +32,7 @@ export type ServiceConfig = z.infer<typeof serviceConfigSchema>;
32
32
 
33
33
  // Extended check config with loaded content
34
34
  export interface LoadedCheckGateConfig extends CheckGateConfig {
35
+ name: string;
35
36
  fixInstructionsContent?: string;
36
37
  }
37
38
 
@@ -8,4 +8,10 @@ export interface GateResult {
8
8
  logPath?: string; // path to full log
9
9
  logPaths?: string[]; // paths to multiple logs (e.g. per-agent logs)
10
10
  fixInstructions?: string; // Markdown content for fixing failures
11
+ subResults?: Array<{
12
+ nameSuffix: string;
13
+ status: GateStatus;
14
+ message: string;
15
+ logPath?: string;
16
+ }>;
11
17
  }
@@ -299,20 +299,41 @@ export class ReviewGateExecutor {
299
299
  };
300
300
  }
301
301
 
302
- const failed = outputs.find((result) => result.status === "fail");
303
- const error = outputs.find((result) => result.status === "error");
302
+ const failed = outputs.filter((result) => result.status === "fail");
303
+ const errored = outputs.filter((result) => result.status === "error");
304
+ // If not failed or errored, it must be passed
305
+ // const passed = outputs.filter((result) => result.status === "pass");
304
306
 
305
307
  let status: "pass" | "fail" | "error" = "pass";
306
308
  let message = "Passed";
307
309
 
308
- if (error) {
310
+ // Determine overall status
311
+ if (errored.length > 0) {
309
312
  status = "error";
310
- message = `Error (${error.adapter}): ${error.message}`;
311
- } else if (failed) {
313
+ message = `Error in ${errored.length} adapter(s)`;
314
+ } else if (failed.length > 0) {
312
315
  status = "fail";
313
- message = `Failed (${failed.adapter}): ${failed.message}`;
316
+ message = `Failed by ${failed.length} adapter(s)`;
314
317
  }
315
318
 
319
+ // Build detailed subResults
320
+ const subResults = outputs.map((out) => {
321
+ // Find specific log path for this adapter
322
+ // logPaths contains strings like ".../review_src_lint_codex.log"
323
+ // We expect the log path to contain the adapter name
324
+ // This is a heuristic, but likely sufficient given our naming convention
325
+ const specificLog = logPaths.find((p) =>
326
+ p.includes(`_${out.adapter}.log`),
327
+ );
328
+
329
+ return {
330
+ nameSuffix: `(${out.adapter})`,
331
+ status: out.status,
332
+ message: out.message,
333
+ logPath: specificLog,
334
+ };
335
+ });
336
+
316
337
  await mainLogger(`Result: ${status} - ${message}\n`);
317
338
 
318
339
  return {
@@ -321,6 +342,7 @@ export class ReviewGateExecutor {
321
342
  duration: Date.now() - startTime,
322
343
  message,
323
344
  logPaths,
345
+ subResults,
324
346
  };
325
347
  } catch (error: unknown) {
326
348
  const err = error as { message?: string };
@@ -439,7 +461,6 @@ export class ReviewGateExecutor {
439
461
 
440
462
  const resultMsg = `Review result (${adapter.name}): ${evaluation.status} - ${evaluation.message}`;
441
463
  await adapterLogger(`${resultMsg}\n`);
442
- await mainLogger(`${resultMsg}\n`);
443
464
 
444
465
  return { adapter: adapter.name, evaluation };
445
466
  } catch (error: unknown) {
@@ -745,11 +766,7 @@ export class ReviewGateExecutor {
745
766
  : "some";
746
767
 
747
768
  // Construct a summary message
748
- let msg = `Found ${violationCount} violations`;
749
- if (Array.isArray(json.violations) && json.violations.length > 0) {
750
- const first = json.violations[0];
751
- msg += `. Example: ${first.issue} in ${first.file}`;
752
- }
769
+ const msg = `Found ${violationCount} violations`;
753
770
 
754
771
  return { status: "fail", message: msg, json, filteredCount };
755
772
  }
@@ -12,14 +12,57 @@ export class ConsoleReporter {
12
12
  const duration = `${(result.duration / 1000).toFixed(2)}s`;
13
13
  const message = result.message ?? "";
14
14
 
15
- if (result.status === "pass") {
16
- console.log(chalk.green(`[PASS] ${job.id} (${duration})`));
17
- } else if (result.status === "fail") {
18
- console.log(chalk.red(`[FAIL] ${job.id} (${duration}) - ${message}`));
15
+ if (result.subResults && result.subResults.length > 0) {
16
+ // Print split results
17
+ for (const sub of result.subResults) {
18
+ const statusColor =
19
+ sub.status === "pass"
20
+ ? chalk.green
21
+ : sub.status === "fail"
22
+ ? chalk.red
23
+ : chalk.magenta;
24
+ const label =
25
+ sub.status === "pass"
26
+ ? "PASS"
27
+ : sub.status === "fail"
28
+ ? "FAIL"
29
+ : "ERROR";
30
+
31
+ let logInfo = "";
32
+ if (sub.status !== "pass" && sub.logPath) {
33
+ logInfo = `\n Log: ${sub.logPath}`;
34
+ }
35
+
36
+ console.log(
37
+ statusColor(
38
+ `[${label}] ${job.id} ${chalk.dim(sub.nameSuffix)} (${duration}) - ${sub.message}${logInfo}`,
39
+ ),
40
+ );
41
+ }
19
42
  } else {
20
- console.log(
21
- chalk.magenta(`[ERROR] ${job.id} (${duration}) - ${message}`),
22
- );
43
+ // Standard single result
44
+ let logInfo = "";
45
+ if (result.status !== "pass") {
46
+ // Try to find a relevant log path
47
+ const logPath = result.logPath || result.logPaths?.[0];
48
+ if (logPath) {
49
+ logInfo = `\n Log: ${logPath}`;
50
+ }
51
+ }
52
+
53
+ if (result.status === "pass") {
54
+ console.log(chalk.green(`[PASS] ${job.id} (${duration})`));
55
+ } else if (result.status === "fail") {
56
+ console.log(
57
+ chalk.red(`[FAIL] ${job.id} (${duration}) - ${message}${logInfo}`),
58
+ );
59
+ } else {
60
+ console.log(
61
+ chalk.magenta(
62
+ `[ERROR] ${job.id} (${duration}) - ${message}${logInfo}`,
63
+ ),
64
+ );
65
+ }
23
66
  }
24
67
  }
25
68
 
@@ -35,15 +78,6 @@ export class ConsoleReporter {
35
78
  if (failed.length > 0) console.log(chalk.red(`Failed: ${failed.length}`));
36
79
  if (errored.length > 0)
37
80
  console.log(chalk.magenta(`Errored: ${errored.length}`));
38
-
39
- if (failed.length > 0 || errored.length > 0) {
40
- console.log(`\n${chalk.bold("=== Failure Details ===\n")}`);
41
-
42
- for (const result of [...failed, ...errored]) {
43
- const details = await this.extractFailureDetails(result);
44
- this.printFailureDetails(result, details);
45
- }
46
- }
47
81
  }
48
82
 
49
83
  /** @internal Public for testing */
@@ -226,39 +260,4 @@ export class ConsoleReporter {
226
260
 
227
261
  return details;
228
262
  }
229
-
230
- private printFailureDetails(result: GateResult, details: string[]) {
231
- const statusColor = result.status === "error" ? chalk.magenta : chalk.red;
232
- const statusLabel = result.status === "error" ? "ERROR" : "FAIL";
233
-
234
- console.log(statusColor(`[${statusLabel}] ${result.jobId}`));
235
- if (result.message) {
236
- console.log(chalk.dim(` Summary: ${result.message}`));
237
- }
238
-
239
- if (details.length > 0) {
240
- console.log(chalk.dim(" Details:"));
241
- details.forEach((detail) => {
242
- console.log(detail);
243
- });
244
- }
245
-
246
- if (result.logPaths && result.logPaths.length > 0) {
247
- result.logPaths.forEach((p) => {
248
- console.log(chalk.dim(` Log: ${p}`));
249
- });
250
- } else if (result.logPath) {
251
- console.log(chalk.dim(` Log: ${result.logPath}`));
252
- }
253
-
254
- if (result.fixInstructions) {
255
- console.log(
256
- chalk.cyan(
257
- ` Fix instructions: available (${result.fixInstructions.split("\n").length} lines)`,
258
- ),
259
- );
260
- }
261
-
262
- console.log(""); // Empty line between failures
263
- }
264
263
  }
@@ -2,20 +2,33 @@
2
2
  description: Run the full verification gauntlet
3
3
  allowed-tools: Bash
4
4
  ---
5
+ <!--
6
+ REVIEW TRUST LEVEL
7
+ Controls how aggressively the agent acts on AI reviewer feedback.
8
+ Change the trust_level value below to one of: high, medium, low
9
+
10
+ - high: Fix all issues unless you strongly disagree or have low confidence the human wants the change.
11
+ - medium: Fix issues you reasonably agree with or believe the human wants fixed. (DEFAULT)
12
+ - low: Fix only issues you strongly agree with or are confident the human wants fixed.
13
+ -->
14
+ <!-- trust_level: medium -->
15
+
5
16
  # /gauntlet
6
17
  Execute the autonomous verification suite.
7
18
 
19
+ **Review trust level: medium** — Fix issues you reasonably agree with or believe the human wants fixed. Skip issues that are purely stylistic, subjective, or that you believe the human would not want changed. When you skip an issue, briefly state what was skipped and why.
20
+
8
21
  1. Run `agent-gauntlet run`.
9
22
  2. If it fails:
10
23
  - Check the console output for "Fix instructions: available" messages.
11
24
  - Read the log files in `gauntlet_logs/` to understand exactly what went wrong.
12
25
  - If fix instructions are available, they will be in the log file under a "--- Fix Instructions ---" section—carefully read and apply them FIRST before attempting other fixes.
13
26
  3. Fix any code or logic errors found by the tools or AI reviewers, prioritizing higher-priority violations (critical > high > medium > low).
14
- 4. If you disagree with AI reviewer feedback, briefly explain your reasoning in the code comments rather than ignoring it silently.
27
+ 4. Apply the trust level above when deciding whether to act on AI reviewer feedback. If you skip an issue due to the trust threshold, report it with a brief explanation (e.g., "Skipped: [issue summary] — reason: [stylistic/subjective/disagree]").
15
28
  5. Do NOT commit your changes yet—keep them uncommitted so the rerun command can review them.
16
29
  6. Run `agent-gauntlet rerun` to verify your fixes. The rerun command reviews only uncommitted changes and uses previous failures as context.
17
30
  7. Repeat steps 2-6 until one of the following termination conditions is met:
18
31
  - All gates pass
19
- - You disagree with remaining failures (ask the human how to proceed)
32
+ - Every remaining issue is one you are deliberately skipping under the trust level (report each skipped issue as described in step 4)
20
33
  - Still failing after 3 rerun attempts
21
34
  8. Once all gates pass, do NOT commit or push your changes—await the human's review and explicit instruction to commit.