agent-gauntlet 0.1.12 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "agent-gauntlet",
3
- "version": "0.1.12",
3
+ "version": "0.2.0",
4
4
  "description": "A CLI tool for testing AI coding agents",
5
5
  "license": "Apache-2.0",
6
6
  "author": "Paul Caplan",
@@ -72,7 +72,7 @@ checks:
72
72
  .join("\n");
73
73
 
74
74
  templateContent = templateContent.replace(
75
- "# Services will be injected here by agent-gauntlet",
75
+ " # Services will be injected here by agent-gauntlet",
76
76
  indentedServices,
77
77
  );
78
78
  } else {
@@ -13,6 +13,7 @@ export async function listJobs(): Promise<void> {
13
13
  );
14
14
 
15
15
  const matrixJobs = [];
16
+ const seenJobs = new Set<string>();
16
17
 
17
18
  const globalSetup = formatSetup(ciConfig.setup || undefined);
18
19
 
@@ -32,13 +33,23 @@ export async function listJobs(): Promise<void> {
32
33
  continue;
33
34
  }
34
35
 
36
+ const workingDirectory = checkDef.working_directory || ep.path;
37
+ // Include entry point in key to ensure each entry point/check pair is distinct
38
+ const jobKey = `${ep.path}:${check.name}:${workingDirectory}`;
39
+
40
+ // Skip if we've already created a job for this exact entry point/check combination
41
+ if (seenJobs.has(jobKey)) {
42
+ continue;
43
+ }
44
+ seenJobs.add(jobKey);
45
+
35
46
  const id = `${check.name}-${ep.path.replace(/\//g, "-")}`;
36
47
 
37
48
  matrixJobs.push({
38
49
  id,
39
50
  name: check.name,
40
51
  entry_point: ep.path,
41
- working_directory: checkDef.working_directory || ep.path,
52
+ working_directory: workingDirectory,
42
53
  command: checkDef.command,
43
54
  runtimes: check.requires_runtimes || [],
44
55
  services: check.requires_services || [],
@@ -33,7 +33,7 @@ describe("Health Command", () => {
33
33
  path.join(GAUNTLET_DIR, "config.yml"),
34
34
  `
35
35
  base_branch: origin/main
36
- log_dir: .gauntlet_logs
36
+ log_dir: gauntlet_logs
37
37
  cli:
38
38
  default_preference:
39
39
  - gemini
@@ -16,7 +16,7 @@ allowed-tools: Bash
16
16
  Execute the autonomous verification suite.
17
17
 
18
18
  1. Run \`agent-gauntlet run\`.
19
- 2. If it fails, read the log files in \`.gauntlet_logs/\` to understand exactly what went wrong.
19
+ 2. If it fails, read the log files in \`gauntlet_logs/\` to understand exactly what went wrong.
20
20
  3. Fix any code or logic errors found by the tools or AI reviewers, prioritizing higher-priority violations (critical > high > medium > low).
21
21
  4. If you disagree with AI reviewer feedback, briefly explain your reasoning in the code comments rather than ignoring it silently.
22
22
  5. Do NOT commit your changes yet—keep them uncommitted so the rerun command can review them.
@@ -336,7 +336,7 @@ function generateConfigYml(config: InitConfig): string {
336
336
  - code-quality`;
337
337
 
338
338
  return `base_branch: origin/main
339
- log_dir: .gauntlet_logs
339
+ log_dir: gauntlet_logs
340
340
 
341
341
  # Run gates in parallel when possible (default: true)
342
342
  # allow_parallel: true
@@ -37,7 +37,7 @@ describe("List Command", () => {
37
37
  path.join(GAUNTLET_DIR, "config.yml"),
38
38
  `
39
39
  base_branch: origin/main
40
- log_dir: .gauntlet_logs
40
+ log_dir: gauntlet_logs
41
41
  cli:
42
42
  default_preference:
43
43
  - gemini
@@ -20,7 +20,7 @@ export async function rotateLogs(logDir: string): Promise<void> {
20
20
  return;
21
21
  }
22
22
 
23
- // 2. Clear .gauntlet_logs/previous if it exists
23
+ // 2. Clear gauntlet_logs/previous if it exists
24
24
  if (await exists(previousDir)) {
25
25
  const previousFiles = await fs.readdir(previousDir);
26
26
  await Promise.all(
@@ -32,7 +32,7 @@ export async function rotateLogs(logDir: string): Promise<void> {
32
32
  await fs.mkdir(previousDir, { recursive: true });
33
33
  }
34
34
 
35
- // 3. Move all existing files in .gauntlet_logs/ to .gauntlet_logs/previous
35
+ // 3. Move all existing files in gauntlet_logs/ to gauntlet_logs/previous
36
36
  const files = await fs.readdir(logDir);
37
37
  await Promise.all(
38
38
  files
@@ -1,34 +1,10 @@
1
1
  import { z } from "zod";
2
2
 
3
- export const runtimeConfigSchema = z.record(
4
- z.string(),
5
- z
6
- .object({
7
- version: z.string().min(1),
8
- bundler_cache: z.boolean().optional(),
9
- })
10
- .passthrough(),
11
- );
3
+ // Runtime and service schemas use z.any() to allow flexibility for different CI providers
4
+ // Each provider (GitHub Actions, GitLab CI, etc.) has its own configuration structure
5
+ export const runtimeConfigSchema = z.record(z.string(), z.any());
12
6
 
13
- export const serviceConfigSchema = z.record(
14
- z.string(),
15
- z
16
- .object({
17
- image: z.string().min(1),
18
- env: z.record(z.string()).optional(),
19
- ports: z.array(z.string()).optional(),
20
- options: z.string().optional(),
21
- health_check: z
22
- .object({
23
- cmd: z.string().optional(),
24
- interval: z.string().optional(),
25
- timeout: z.string().optional(),
26
- retries: z.number().optional(),
27
- })
28
- .optional(),
29
- })
30
- .passthrough(),
31
- );
7
+ export const serviceConfigSchema = z.record(z.string(), z.any());
32
8
 
33
9
  export const ciSetupStepSchema = z.object({
34
10
  name: z.string().min(1),
@@ -7,7 +7,11 @@ import {
7
7
  gauntletConfigSchema,
8
8
  reviewPromptFrontmatterSchema,
9
9
  } from "./schema.js";
10
- import type { CheckGateConfig, LoadedConfig } from "./types.js";
10
+ import type {
11
+ CheckGateConfig,
12
+ LoadedCheckGateConfig,
13
+ LoadedConfig,
14
+ } from "./types.js";
11
15
 
12
16
  const GAUNTLET_DIR = ".gauntlet";
13
17
  const CONFIG_FILE = "config.yml";
@@ -31,7 +35,7 @@ export async function loadConfig(
31
35
 
32
36
  // 2. Load checks
33
37
  const checksPath = path.join(gauntletPath, CHECKS_DIR);
34
- const checks: Record<string, CheckGateConfig> = {};
38
+ const checks: Record<string, LoadedCheckGateConfig> = {};
35
39
 
36
40
  if (await dirExists(checksPath)) {
37
41
  const checkFiles = await fs.readdir(checksPath);
@@ -41,8 +45,51 @@ export async function loadConfig(
41
45
  const content = await fs.readFile(filePath, "utf-8");
42
46
  const raw = YAML.parse(content);
43
47
  // Ensure name matches filename if not provided or just use filename as key
44
- const parsed = checkGateSchema.parse(raw);
45
- checks[parsed.name] = parsed;
48
+ const parsed: CheckGateConfig = checkGateSchema.parse(raw);
49
+
50
+ // Load fix instructions if specified
51
+ const loadedCheck: LoadedCheckGateConfig = { ...parsed };
52
+ if (parsed.fix_instructions) {
53
+ // Security: Reject absolute paths to prevent reading arbitrary files
54
+ if (path.isAbsolute(parsed.fix_instructions)) {
55
+ throw new Error(
56
+ `Fix instructions path must be relative to .gauntlet/ directory, got absolute path: ${parsed.fix_instructions} (referenced by check "${parsed.name}")`,
57
+ );
58
+ }
59
+
60
+ // Security: Resolve and validate the path stays within .gauntlet/
61
+ const fixInstructionsPath = path.resolve(
62
+ gauntletPath,
63
+ parsed.fix_instructions,
64
+ );
65
+ const normalizedGauntletPath = path.resolve(gauntletPath);
66
+ const relativePath = path.relative(
67
+ normalizedGauntletPath,
68
+ fixInstructionsPath,
69
+ );
70
+ // Ensure path doesn't escape .gauntlet/ (no .. segments or absolute paths)
71
+ if (
72
+ relativePath.startsWith("..") ||
73
+ path.isAbsolute(relativePath) ||
74
+ relativePath === "." ||
75
+ relativePath === ""
76
+ ) {
77
+ throw new Error(
78
+ `Fix instructions path must stay within .gauntlet/ directory and point to a file: ${parsed.fix_instructions} resolves to ${fixInstructionsPath} (referenced by check "${parsed.name}")`,
79
+ );
80
+ }
81
+ if (!(await fileExists(fixInstructionsPath))) {
82
+ throw new Error(
83
+ `Fix instructions file not found: ${fixInstructionsPath} (referenced by check "${parsed.name}")`,
84
+ );
85
+ }
86
+ loadedCheck.fixInstructionsContent = await fs.readFile(
87
+ fixInstructionsPath,
88
+ "utf-8",
89
+ );
90
+ }
91
+
92
+ checks[parsed.name] = loadedCheck;
46
93
  }
47
94
  }
48
95
  }
@@ -14,6 +14,7 @@ export const checkGateSchema = z
14
14
  run_locally: z.boolean().default(true),
15
15
  timeout: z.number().optional(),
16
16
  fail_fast: z.boolean().optional(),
17
+ fix_instructions: z.string().optional(), // Path relative to .gauntlet/
17
18
  })
18
19
  .refine(
19
20
  (data) => {
@@ -57,7 +58,7 @@ export const entryPointSchema = z.object({
57
58
 
58
59
  export const gauntletConfigSchema = z.object({
59
60
  base_branch: z.string().min(1).default("origin/main"),
60
- log_dir: z.string().min(1).default(".gauntlet_logs"),
61
+ log_dir: z.string().min(1).default("gauntlet_logs"),
61
62
  allow_parallel: z.boolean().default(true),
62
63
  cli: cliConfigSchema,
63
64
  entry_points: z.array(entryPointSchema).min(1),
@@ -30,9 +30,14 @@ export type CISetupStep = z.infer<typeof ciSetupStepSchema>;
30
30
  export type RuntimeConfig = z.infer<typeof runtimeConfigSchema>;
31
31
  export type ServiceConfig = z.infer<typeof serviceConfigSchema>;
32
32
 
33
+ // Extended check config with loaded content
34
+ export interface LoadedCheckGateConfig extends CheckGateConfig {
35
+ fixInstructionsContent?: string;
36
+ }
37
+
33
38
  // Combined type for the fully loaded configuration
34
39
  export interface LoadedConfig {
35
40
  project: GauntletConfig;
36
- checks: Record<string, CheckGateConfig>;
41
+ checks: Record<string, LoadedCheckGateConfig>;
37
42
  reviews: Record<string, ReviewGateConfig & ReviewPromptFrontmatter>; // Merged with frontmatter
38
43
  }
package/src/core/job.ts CHANGED
@@ -22,6 +22,7 @@ export class JobGenerator {
22
22
 
23
23
  generateJobs(expandedEntryPoints: ExpandedEntryPoint[]): Job[] {
24
24
  const jobs: Job[] = [];
25
+ const seenJobs = new Set<string>();
25
26
  const isCI =
26
27
  process.env.CI === "true" || process.env.GITHUB_ACTIONS === "true";
27
28
 
@@ -41,13 +42,23 @@ export class JobGenerator {
41
42
  if (isCI && !checkConfig.run_in_ci) continue;
42
43
  if (!isCI && !checkConfig.run_locally) continue;
43
44
 
45
+ const workingDirectory = checkConfig.working_directory || ep.path;
46
+ // Include entry point in key to ensure each entry point/check pair is distinct
47
+ const jobKey = `check:${ep.path}:${checkName}:${workingDirectory}`;
48
+
49
+ // Skip if we've already created a job for this exact entry point/check combination
50
+ if (seenJobs.has(jobKey)) {
51
+ continue;
52
+ }
53
+ seenJobs.add(jobKey);
54
+
44
55
  jobs.push({
45
56
  id: `check:${ep.path}:${checkName}`,
46
57
  type: "check",
47
58
  name: checkName,
48
59
  entryPoint: ep.path,
49
60
  gateConfig: checkConfig,
50
- workingDirectory: checkConfig.working_directory || ep.path,
61
+ workingDirectory: workingDirectory,
51
62
  });
52
63
  }
53
64
  }
@@ -5,6 +5,7 @@ import path from "node:path";
5
5
  import { promisify } from "node:util";
6
6
  import { getAdapter } from "../cli-adapters/index.js";
7
7
  import type {
8
+ LoadedCheckGateConfig,
8
9
  LoadedConfig,
9
10
  ReviewGateConfig,
10
11
  ReviewPromptFrontmatter,
@@ -78,11 +79,14 @@ export class Runner {
78
79
  if (job.type === "check") {
79
80
  const logPath = this.logger.getLogPath(job.id);
80
81
  const jobLogger = await this.logger.createJobLogger(job.id);
82
+ const effectiveBaseBranch =
83
+ this.baseBranchOverride || this.config.project.base_branch;
81
84
  result = await this.checkExecutor.execute(
82
85
  job.id,
83
- job.gateConfig as CheckGateConfig,
86
+ job.gateConfig as LoadedCheckGateConfig,
84
87
  job.workingDirectory,
85
88
  jobLogger,
89
+ effectiveBaseBranch,
86
90
  );
87
91
  result.logPath = logPath;
88
92
  } else {
@@ -129,7 +133,7 @@ export class Runner {
129
133
  if (this.shouldStop) break;
130
134
  if (job.type === "check") {
131
135
  const commandName = this.getCommandName(
132
- (job.gateConfig as CheckGateConfig).command,
136
+ (job.gateConfig as LoadedCheckGateConfig).command,
133
137
  );
134
138
  if (!commandName) {
135
139
  preflightResults.push(
@@ -1,6 +1,6 @@
1
1
  import { exec } from "node:child_process";
2
2
  import { promisify } from "node:util";
3
- import type { CheckGateConfig } from "../config/types.js";
3
+ import type { LoadedCheckGateConfig } from "../config/types.js";
4
4
  import type { GateResult } from "./result.js";
5
5
 
6
6
  const execAsync = promisify(exec);
@@ -9,20 +9,24 @@ const MAX_BUFFER_BYTES = 10 * 1024 * 1024;
9
9
  export class CheckGateExecutor {
10
10
  async execute(
11
11
  jobId: string,
12
- config: CheckGateConfig,
12
+ config: LoadedCheckGateConfig,
13
13
  workingDirectory: string,
14
14
  logger: (output: string) => Promise<void>,
15
+ baseBranch?: string,
15
16
  ): Promise<GateResult> {
16
17
  const startTime = Date.now();
17
18
 
19
+ // Substitute variables in command
20
+ const command = this.substituteVariables(config.command, { baseBranch });
21
+
18
22
  try {
19
23
  await logger(
20
24
  `[${new Date().toISOString()}] Starting check: ${config.name}\n`,
21
25
  );
22
- await logger(`Executing command: ${config.command}\n`);
26
+ await logger(`Executing command: ${command}\n`);
23
27
  await logger(`Working directory: ${workingDirectory}\n\n`);
24
28
 
25
- const { stdout, stderr } = await execAsync(config.command, {
29
+ const { stdout, stderr } = await execAsync(command, {
26
30
  cwd: workingDirectory,
27
31
  timeout: config.timeout ? config.timeout * 1000 : undefined,
28
32
  maxBuffer: MAX_BUFFER_BYTES,
@@ -60,8 +64,14 @@ export class CheckGateExecutor {
60
64
  status: "fail",
61
65
  duration: Date.now() - startTime,
62
66
  message: `Timed out after ${config.timeout}s`,
67
+ fixInstructions: config.fixInstructionsContent,
63
68
  };
64
69
  await logger(`Result: ${result.status} - ${result.message}\n`);
70
+ if (config.fixInstructionsContent) {
71
+ await logger(
72
+ `\n--- Fix Instructions ---\n${config.fixInstructionsContent}\n`,
73
+ );
74
+ }
65
75
  return result;
66
76
  }
67
77
 
@@ -72,8 +82,14 @@ export class CheckGateExecutor {
72
82
  status: "fail",
73
83
  duration: Date.now() - startTime,
74
84
  message: `Exited with code ${err.code}`,
85
+ fixInstructions: config.fixInstructionsContent,
75
86
  };
76
87
  await logger(`Result: ${result.status} - ${result.message}\n`);
88
+ if (config.fixInstructionsContent) {
89
+ await logger(
90
+ `\n--- Fix Instructions ---\n${config.fixInstructionsContent}\n`,
91
+ );
92
+ }
77
93
  return result;
78
94
  }
79
95
 
@@ -83,9 +99,26 @@ export class CheckGateExecutor {
83
99
  status: "error",
84
100
  duration: Date.now() - startTime,
85
101
  message: err.message || "Unknown error",
102
+ fixInstructions: config.fixInstructionsContent,
86
103
  };
87
104
  await logger(`Result: ${result.status} - ${result.message}\n`);
105
+ if (config.fixInstructionsContent) {
106
+ await logger(
107
+ `\n--- Fix Instructions ---\n${config.fixInstructionsContent}\n`,
108
+ );
109
+ }
88
110
  return result;
89
111
  }
90
112
  }
113
+
114
+ private substituteVariables(
115
+ command: string,
116
+ variables: { baseBranch?: string },
117
+ ): string {
118
+ let result = command;
119
+ if (variables.baseBranch) {
120
+ result = result.replace(/\$\{BASE_BRANCH\}/g, variables.baseBranch);
121
+ }
122
+ return result;
123
+ }
91
124
  }
@@ -7,4 +7,5 @@ export interface GateResult {
7
7
  message?: string; // summary message
8
8
  logPath?: string; // path to full log
9
9
  logPaths?: string[]; // paths to multiple logs (e.g. per-agent logs)
10
+ fixInstructions?: string; // Markdown content for fixing failures
10
11
  }
@@ -251,6 +251,14 @@ export class ConsoleReporter {
251
251
  console.log(chalk.dim(` Log: ${result.logPath}`));
252
252
  }
253
253
 
254
+ if (result.fixInstructions) {
255
+ console.log(
256
+ chalk.cyan(
257
+ ` Fix instructions: available (${result.fixInstructions.split("\n").length} lines)`,
258
+ ),
259
+ );
260
+ }
261
+
254
262
  console.log(""); // Empty line between failures
255
263
  }
256
264
  }
@@ -6,7 +6,10 @@ allowed-tools: Bash
6
6
  Execute the autonomous verification suite.
7
7
 
8
8
  1. Run `agent-gauntlet run`.
9
- 2. If it fails, read the log files in `.gauntlet_logs/` to understand exactly what went wrong.
9
+ 2. If it fails:
10
+ - Check the console output for "Fix instructions: available" messages.
11
+ - Read the log files in `gauntlet_logs/` to understand exactly what went wrong.
12
+ - If fix instructions are available, they will be in the log file under a "--- Fix Instructions ---" section—carefully read and apply them FIRST before attempting other fixes.
10
13
  3. Fix any code or logic errors found by the tools or AI reviewers, prioritizing higher-priority violations (critical > high > medium > low).
11
14
  4. If you disagree with AI reviewer feedback, briefly explain your reasoning in the code comments rather than ignoring it silently.
12
15
  5. Do NOT commit your changes yet—keep them uncommitted so the rerun command can review them.
@@ -19,12 +19,14 @@ jobs:
19
19
  - name: Install agent-gauntlet
20
20
  run: |
21
21
  curl -fsSL https://bun.sh/install | bash
22
- ~/.bun/bin/bun add -g agent-gauntlet
22
+ echo "$HOME/.bun/bin" >> $GITHUB_PATH
23
+ export PATH="$HOME/.bun/bin:$PATH"
24
+ bun add -g pacaplan/agent-gauntlet#development
23
25
 
24
26
  - name: Discover gauntlet jobs
25
27
  id: discover
26
28
  run: |
27
- output=$(~/.bun/bin/agent-gauntlet ci list-jobs)
29
+ output=$(agent-gauntlet ci list-jobs)
28
30
  echo "matrix=$(echo "$output" | jq -c '.matrix')" >> $GITHUB_OUTPUT
29
31
  echo "runtimes=$(echo "$output" | jq -c '.runtimes')" >> $GITHUB_OUTPUT
30
32