agent-gauntlet 0.1.7 → 0.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "agent-gauntlet",
3
- "version": "0.1.7",
3
+ "version": "0.1.9",
4
4
  "description": "A CLI tool for testing AI coding agents",
5
5
  "license": "Apache-2.0",
6
6
  "author": "Paul Caplan",
@@ -7,6 +7,7 @@ import { JobGenerator } from '../core/job.js';
7
7
  import { Runner } from '../core/runner.js';
8
8
  import { Logger } from '../output/logger.js';
9
9
  import { ConsoleReporter } from '../output/console.js';
10
+ import { rotateLogs } from './shared.js';
10
11
 
11
12
  export function registerCheckCommand(program: Command): void {
12
13
  program
@@ -18,6 +19,10 @@ export function registerCheckCommand(program: Command): void {
18
19
  .action(async (options) => {
19
20
  try {
20
21
  const config = await loadConfig();
22
+
23
+ // Rotate logs before starting
24
+ await rotateLogs(config.project.log_dir);
25
+
21
26
  const changeDetector = new ChangeDetector(config.project.base_branch, {
22
27
  commit: options.commit,
23
28
  uncommitted: options.uncommitted
@@ -11,6 +11,7 @@ export function registerHelpCommand(program: Command): void {
11
11
  console.log('of your repo that changed, based on a configurable set of entry points.\n');
12
12
  console.log(chalk.bold('Commands:\n'));
13
13
  console.log(' run Run gates for detected changes');
14
+ console.log(' rerun Rerun gates with previous failure context');
14
15
  console.log(' check Run only applicable checks');
15
16
  console.log(' review Run only applicable reviews');
16
17
  console.log(' detect Show what gates would run (without executing them)');
@@ -1,11 +1,41 @@
1
- import { describe, it, expect, beforeEach, afterEach, beforeAll, afterAll } from 'bun:test';
1
+ import { describe, it, expect, beforeEach, afterEach, beforeAll, afterAll, mock } from 'bun:test';
2
2
  import { Command } from 'commander';
3
- import { registerInitCommand } from './init.js';
4
3
  import fs from 'node:fs/promises';
5
4
  import path from 'node:path';
6
5
 
7
6
  const TEST_DIR = path.join(process.cwd(), 'test-init-' + Date.now());
8
7
 
8
+ // Mock adapters
9
+ const mockAdapters = [
10
+ {
11
+ name: 'mock-cli-1',
12
+ isAvailable: async () => true,
13
+ getProjectCommandDir: () => '.mock1',
14
+ getUserCommandDir: () => null,
15
+ getCommandExtension: () => '.sh',
16
+ canUseSymlink: () => false,
17
+ transformCommand: (content: string) => content,
18
+ },
19
+ {
20
+ name: 'mock-cli-2',
21
+ isAvailable: async () => false, // Not available
22
+ getProjectCommandDir: () => '.mock2',
23
+ getUserCommandDir: () => null,
24
+ getCommandExtension: () => '.sh',
25
+ canUseSymlink: () => false,
26
+ transformCommand: (content: string) => content,
27
+ }
28
+ ];
29
+
30
+ mock.module('../cli-adapters/index.js', () => ({
31
+ getAllAdapters: () => mockAdapters,
32
+ getProjectCommandAdapters: () => mockAdapters,
33
+ getUserCommandAdapters: () => [],
34
+ }));
35
+
36
+ // Import after mocking
37
+ const { registerInitCommand } = await import('./init.js');
38
+
9
39
  describe('Init Command', () => {
10
40
  let program: Command;
11
41
  const originalConsoleLog = console.log;
@@ -45,50 +75,33 @@ describe('Init Command', () => {
45
75
  });
46
76
 
47
77
  it('should create .gauntlet directory structure with --yes flag', async () => {
48
- const initCmd = program.commands.find(cmd => cmd.name() === 'init');
78
+ // We expect it to use the available mock-cli-1
79
+ await program.parseAsync(['node', 'test', 'init', '--yes']);
80
+
81
+ // Check that files were created
82
+ const gauntletDir = path.join(TEST_DIR, '.gauntlet');
83
+ const configFile = path.join(gauntletDir, 'config.yml');
84
+ const reviewsDir = path.join(gauntletDir, 'reviews');
85
+ const checksDir = path.join(gauntletDir, 'checks');
86
+ const runGauntletFile = path.join(gauntletDir, 'run_gauntlet.md');
49
87
 
50
- // Use a timeout to prevent hanging if prompts occur
51
- let timeoutId: ReturnType<typeof setTimeout> | undefined;
52
- const testPromise = initCmd?.parseAsync(['init', '--yes']);
53
- const timeoutPromise = new Promise((_, reject) => {
54
- timeoutId = setTimeout(() => reject(new Error('Test timed out - init command may be prompting')), 3000);
55
- });
88
+ expect(await fs.stat(gauntletDir)).toBeDefined();
89
+ expect(await fs.stat(configFile)).toBeDefined();
90
+ expect(await fs.stat(reviewsDir)).toBeDefined();
91
+ expect(await fs.stat(checksDir)).toBeDefined();
92
+ expect(await fs.stat(runGauntletFile)).toBeDefined();
56
93
 
57
- try {
58
- await Promise.race([testPromise, timeoutPromise]);
59
-
60
- // Check that files were created
61
- const gauntletDir = path.join(TEST_DIR, '.gauntlet');
62
- const configFile = path.join(gauntletDir, 'config.yml');
63
- const reviewsDir = path.join(gauntletDir, 'reviews');
64
- const checksDir = path.join(gauntletDir, 'checks');
65
- const runGauntletFile = path.join(gauntletDir, 'run_gauntlet.md');
66
-
67
- expect(await fs.stat(gauntletDir)).toBeDefined();
68
- expect(await fs.stat(configFile)).toBeDefined();
69
- expect(await fs.stat(reviewsDir)).toBeDefined();
70
- expect(await fs.stat(checksDir)).toBeDefined();
71
- expect(await fs.stat(runGauntletFile)).toBeDefined();
72
-
73
- // Verify config content
74
- const configContent = await fs.readFile(configFile, 'utf-8');
75
- expect(configContent).toContain('base_branch');
76
- expect(configContent).toContain('log_dir');
77
-
78
- // Verify review file content
79
- const reviewFile = path.join(reviewsDir, 'code-quality.md');
80
- const reviewContent = await fs.readFile(reviewFile, 'utf-8');
81
- expect(reviewContent).toContain('cli_preference');
82
- } catch (error: any) {
83
- // If it times out, skip this test for now - the command installation part may need more complex mocking
84
- if (error.message.includes('timed out')) {
85
- console.log('Skipping test due to interactive prompt - command installation requires manual testing');
86
- return;
87
- }
88
- throw error;
89
- } finally {
90
- if (timeoutId) clearTimeout(timeoutId);
91
- }
94
+ // Verify config content
95
+ const configContent = await fs.readFile(configFile, 'utf-8');
96
+ expect(configContent).toContain('base_branch');
97
+ expect(configContent).toContain('log_dir');
98
+ expect(configContent).toContain('mock-cli-1'); // Should be present
99
+ expect(configContent).not.toContain('mock-cli-2'); // Should not be present (unavailable)
100
+
101
+ // Verify review file content
102
+ const reviewFile = path.join(reviewsDir, 'code-quality.md');
103
+ const reviewContent = await fs.readFile(reviewFile, 'utf-8');
104
+ expect(reviewContent).toContain('mock-cli-1');
92
105
  });
93
106
 
94
107
  it('should not create directory if .gauntlet already exists', async () => {
@@ -96,8 +109,7 @@ describe('Init Command', () => {
96
109
  const gauntletDir = path.join(TEST_DIR, '.gauntlet');
97
110
  await fs.mkdir(gauntletDir, { recursive: true });
98
111
 
99
- const initCmd = program.commands.find(cmd => cmd.name() === 'init');
100
- await initCmd?.parseAsync(['init', '--yes']);
112
+ await program.parseAsync(['node', 'test', 'init', '--yes']);
101
113
 
102
114
  const output = logs.join('\n');
103
115
  expect(output).toContain('.gauntlet directory already exists');
@@ -4,7 +4,9 @@ import fs from 'node:fs/promises';
4
4
  import path from 'node:path';
5
5
  import readline from 'node:readline';
6
6
  import { exists } from './shared.js';
7
- import { getAllAdapters, getProjectCommandAdapters, getUserCommandAdapters } from '../cli-adapters/index.js';
7
+ import { getAllAdapters, getProjectCommandAdapters, getUserCommandAdapters, type CLIAdapter } from '../cli-adapters/index.js';
8
+
9
+ const MAX_PROMPT_ATTEMPTS = 10;
8
10
 
9
11
  const GAUNTLET_COMMAND_CONTENT = `---
10
12
  description: Run the full verification gauntlet
@@ -31,11 +33,18 @@ interface InitOptions {
31
33
  yes?: boolean;
32
34
  }
33
35
 
36
+ interface InitConfig {
37
+ sourceDir: string;
38
+ lintCmd: string | null; // null means not selected, empty string means selected but blank (TODO)
39
+ testCmd: string | null; // null means not selected, empty string means selected but blank (TODO)
40
+ selectedAdapters: CLIAdapter[];
41
+ }
42
+
34
43
  export function registerInitCommand(program: Command): void {
35
44
  program
36
45
  .command('init')
37
46
  .description('Initialize .gauntlet configuration')
38
- .option('-y, --yes', 'Skip prompts and use defaults (project-level commands for all agents)')
47
+ .option('-y, --yes', 'Skip prompts and use defaults (all available CLIs, source: ., no extra checks)')
39
48
  .action(async (options: InitOptions) => {
40
49
  const projectRoot = process.cwd();
41
50
  const targetDir = path.join(projectRoot, '.gauntlet');
@@ -45,39 +54,90 @@ export function registerInitCommand(program: Command): void {
45
54
  return;
46
55
  }
47
56
 
57
+ // 1. CLI Detection
58
+ console.log('Detecting available CLI agents...');
59
+ const availableAdapters = await detectAvailableCLIs();
60
+
61
+ if (availableAdapters.length === 0) {
62
+ console.log();
63
+ console.log(chalk.red('Error: No CLI agents found. Install at least one:'));
64
+ console.log(' - Claude: https://docs.anthropic.com/en/docs/claude-code');
65
+ console.log(' - Gemini: https://github.com/google-gemini/gemini-cli');
66
+ console.log(' - Codex: https://github.com/openai/codex');
67
+ console.log();
68
+ return;
69
+ }
70
+
71
+ let config: InitConfig;
72
+
73
+ if (options.yes) {
74
+ config = {
75
+ sourceDir: '.',
76
+ lintCmd: null,
77
+ testCmd: null,
78
+ selectedAdapters: availableAdapters,
79
+ };
80
+ } else {
81
+ config = await promptForConfig(availableAdapters);
82
+ }
83
+
48
84
  // Create base config structure
49
85
  await fs.mkdir(targetDir);
50
86
  await fs.mkdir(path.join(targetDir, 'checks'));
51
87
  await fs.mkdir(path.join(targetDir, 'reviews'));
52
88
 
53
- // Write sample config
54
- const sampleConfig = `base_branch: origin/main
55
- log_dir: .gauntlet_logs
56
- cli:
57
- default_preference:
58
- - gemini
59
- - codex
60
- - claude
61
- check_usage_limit: false
62
- entry_points:
63
- - path: "."
64
- reviews:
65
- - code-quality
66
- `;
67
- await fs.writeFile(path.join(targetDir, 'config.yml'), sampleConfig);
89
+ // 4. Commented Config Templates
90
+ // Generate config.yml
91
+ const configContent = generateConfigYml(config);
92
+ await fs.writeFile(path.join(targetDir, 'config.yml'), configContent);
68
93
  console.log(chalk.green('Created .gauntlet/config.yml'));
69
94
 
70
- // Write sample review
71
- const sampleReview = `---
72
- cli_preference:
73
- - gemini
74
- - codex
95
+ // Generate check files if selected
96
+ if (config.lintCmd !== null) {
97
+ const lintContent = `name: lint
98
+ command: ${config.lintCmd || '# command: TODO - add your lint command (e.g., npm run lint)'}
99
+ # parallel: false
100
+ # run_in_ci: true
101
+ # run_locally: true
102
+ # timeout: 300
103
+ `;
104
+ await fs.writeFile(path.join(targetDir, 'checks', 'lint.yml'), lintContent);
105
+ console.log(chalk.green('Created .gauntlet/checks/lint.yml'));
106
+ }
107
+
108
+ if (config.testCmd !== null) {
109
+ const testContent = `name: unit-tests
110
+ command: ${config.testCmd || '# command: TODO - add your test command (e.g., npm test)'}
111
+ # parallel: false
112
+ # run_in_ci: true
113
+ # run_locally: true
114
+ # timeout: 300
115
+ `;
116
+ await fs.writeFile(path.join(targetDir, 'checks', 'unit-tests.yml'), testContent);
117
+ console.log(chalk.green('Created .gauntlet/checks/unit-tests.yml'));
118
+ }
119
+
120
+ // 5. Improved Default Code Review Prompt
121
+ const reviewContent = `---
122
+ num_reviews: 1
123
+ # parallel: true
124
+ # timeout: 300
125
+ # cli_preference:
126
+ # - ${config.selectedAdapters[0]?.name || 'claude'}
75
127
  ---
76
128
 
77
129
  # Code Review
78
- Review this code.
130
+
131
+ Review the diff for quality issues:
132
+
133
+ - **Bugs**: Logic errors, null handling, edge cases, race conditions
134
+ - **Security**: Input validation, secrets exposure, injection risks
135
+ - **Maintainability**: Unclear code, missing error handling, duplication
136
+ - **Performance**: Unnecessary work, N+1 queries, missing optimizations
137
+
138
+ For each issue: cite file:line, explain the problem, suggest a fix.
79
139
  `;
80
- await fs.writeFile(path.join(targetDir, 'reviews', 'code-quality.md'), sampleReview);
140
+ await fs.writeFile(path.join(targetDir, 'reviews', 'code-quality.md'), reviewContent);
81
141
  console.log(chalk.green('Created .gauntlet/reviews/code-quality.md'));
82
142
 
83
143
  // Write the canonical gauntlet command file
@@ -87,52 +147,181 @@ Review this code.
87
147
 
88
148
  // Handle command installation
89
149
  if (options.yes) {
90
- // Default: install at project level for all agents
91
- const adapters = getProjectCommandAdapters();
92
- await installCommands('project', adapters.map(a => a.name), projectRoot, canonicalCommandPath);
150
+ // Default: install at project level for all selected agents (if they support it)
151
+ const adaptersToInstall = config.selectedAdapters.filter(a => a.getProjectCommandDir() !== null);
152
+ if (adaptersToInstall.length > 0) {
153
+ await installCommands('project', adaptersToInstall.map(a => a.name), projectRoot, canonicalCommandPath);
154
+ }
93
155
  } else {
94
- // Interactive prompts
95
- await promptAndInstallCommands(projectRoot, canonicalCommandPath);
156
+ // Interactive prompts - passing available adapters to avoid re-checking or offering unavailable ones
157
+ await promptAndInstallCommands(projectRoot, canonicalCommandPath, availableAdapters);
96
158
  }
97
159
  });
98
160
  }
99
161
 
100
- async function promptAndInstallCommands(projectRoot: string, canonicalCommandPath: string): Promise<void> {
101
- // Read all lines from stdin first if not a TTY (piped input)
102
- const isTTY = process.stdin.isTTY;
103
- let inputLines: string[] = [];
104
- let lineIndex = 0;
105
-
106
- if (!isTTY) {
107
- // Read all input at once for piped input
108
- const chunks: Buffer[] = [];
109
- for await (const chunk of process.stdin) {
110
- chunks.push(chunk);
162
+ async function detectAvailableCLIs(): Promise<CLIAdapter[]> {
163
+ const allAdapters = getAllAdapters();
164
+ const available: CLIAdapter[] = [];
165
+
166
+ for (const adapter of allAdapters) {
167
+ const isAvailable = await adapter.isAvailable();
168
+ if (isAvailable) {
169
+ console.log(chalk.green(` ✓ ${adapter.name}`));
170
+ available.push(adapter);
171
+ } else {
172
+ console.log(chalk.dim(` ✗ ${adapter.name} (not installed)`));
111
173
  }
112
- const input = Buffer.concat(chunks).toString('utf-8');
113
- inputLines = input.split('\n').map(l => l.trim());
114
174
  }
175
+ return available;
176
+ }
115
177
 
116
- const rl = isTTY ? readline.createInterface({
178
+ async function promptForConfig(availableAdapters: CLIAdapter[]): Promise<InitConfig> {
179
+ const rl = readline.createInterface({
117
180
  input: process.stdin,
118
181
  output: process.stdout
119
- }) : null;
120
-
121
- const question = async (prompt: string): Promise<string> => {
122
- if (isTTY && rl) {
123
- return new Promise((resolve) => {
124
- rl.question(prompt, (answer) => {
125
- resolve(answer?.trim() ?? '');
126
- });
182
+ });
183
+
184
+ const question = (prompt: string): Promise<string> => {
185
+ return new Promise((resolve) => {
186
+ rl.question(prompt, (answer) => {
187
+ resolve(answer?.trim() ?? '');
127
188
  });
128
- } else {
129
- // Non-interactive: read from pre-buffered lines
130
- process.stdout.write(prompt);
131
- const answer = inputLines[lineIndex] ?? '';
132
- lineIndex++;
133
- console.log(answer); // Echo the answer
134
- return answer;
189
+ });
190
+ };
191
+
192
+ try {
193
+ // CLI Selection
194
+ console.log();
195
+ console.log('Which CLIs would you like to use?');
196
+ availableAdapters.forEach((adapter, i) => {
197
+ console.log(` ${i + 1}) ${adapter.name}`);
198
+ });
199
+ console.log(` ${availableAdapters.length + 1}) All`);
200
+
201
+ let selectedAdapters: CLIAdapter[] = [];
202
+ let attempts = 0;
203
+ while (true) {
204
+ attempts++;
205
+ if (attempts > MAX_PROMPT_ATTEMPTS) throw new Error('Too many invalid attempts');
206
+ const answer = await question(`(comma-separated, e.g., 1,2): `);
207
+ const selections = answer.split(',').map(s => s.trim()).filter(s => s);
208
+
209
+ if (selections.length === 0) {
210
+ // Default to all if empty? Or force selection? Plan says "Which CLIs...".
211
+ // Let's assume user must pick or we default to all if they just hit enter?
212
+ // Actually, usually enter means default. Let's make All the default if just Enter.
213
+ selectedAdapters = availableAdapters;
214
+ break;
215
+ }
216
+
217
+ let valid = true;
218
+ const chosen: CLIAdapter[] = [];
219
+
220
+ for (const sel of selections) {
221
+ const num = parseInt(sel, 10);
222
+ if (isNaN(num) || num < 1 || num > availableAdapters.length + 1) {
223
+ console.log(chalk.yellow(`Invalid selection: ${sel}`));
224
+ valid = false;
225
+ break;
226
+ }
227
+ if (num === availableAdapters.length + 1) {
228
+ chosen.push(...availableAdapters);
229
+ } else {
230
+ chosen.push(availableAdapters[num - 1]);
231
+ }
232
+ }
233
+
234
+ if (valid) {
235
+ selectedAdapters = [...new Set(chosen)];
236
+ break;
237
+ }
238
+ }
239
+
240
+ // Source Directory
241
+ console.log();
242
+ const sourceDirInput = await question('Enter your source directory (e.g., src, lib, .) [default: .]: ');
243
+ const sourceDir = sourceDirInput || '.';
244
+
245
+ // Lint Check
246
+ console.log();
247
+ const addLint = await question('Would you like to add a linting check? [y/N]: ');
248
+ let lintCmd: string | null = null;
249
+ if (addLint.toLowerCase().startsWith('y')) {
250
+ lintCmd = await question('Enter lint command (blank to fill later): ');
251
+ }
252
+
253
+ // Unit Test Check
254
+ console.log();
255
+ const addTest = await question('Would you like to add a unit test check? [y/N]: ');
256
+ let testCmd: string | null = null;
257
+ if (addTest.toLowerCase().startsWith('y')) {
258
+ testCmd = await question('Enter test command (blank to fill later): ');
135
259
  }
260
+
261
+ rl.close();
262
+ return {
263
+ sourceDir,
264
+ lintCmd,
265
+ testCmd,
266
+ selectedAdapters
267
+ };
268
+
269
+ } catch (error) {
270
+ rl.close();
271
+ throw error;
272
+ }
273
+ }
274
+
275
/**
 * Renders the contents of `.gauntlet/config.yml` for a newly initialised project.
 *
 * The generated document contains, in order:
 *  - `base_branch` and `log_dir` with fixed defaults,
 *  - `cli.default_preference`, listing the selected adapters in selection order,
 *  - when a lint and/or unit-test check was selected (`lintCmd` / `testCmd` is
 *    not `null`), an entry point for `config.sourceDir` that runs `lint` and/or
 *    `unit-tests`,
 *  - always, a root (`"."`) entry point that runs the `code-quality` review.
 *
 * NOTE(review): the nesting depth below (two spaces per level) assumes the
 * conventional layout `cli -> default_preference -> list` and
 * `entry_points -> list of { path, checks | reviews }` — confirm against the
 * config loader's schema.
 *
 * @param config - Answers collected by `init` (or the `--yes` defaults).
 * @returns The YAML text, terminated by a single newline.
 */
function generateConfigYml(config: InitConfig): string {
  const cliLines = config.selectedAdapters.map(adapter => `    - ${adapter.name}`);

  // A check is "selected" when its command is non-null; an empty string means
  // "selected, command to be filled in later" and still needs the entry.
  const checkNames: string[] = [];
  if (config.lintCmd !== null) checkNames.push('lint');
  if (config.testCmd !== null) checkNames.push('unit-tests');

  const entryPointLines: string[] = [];
  if (checkNames.length > 0) {
    entryPointLines.push(
      // sourceDir is free-form user input: JSON.stringify yields a correctly
      // escaped double-quoted scalar, so quotes/backslashes cannot break the file.
      `  - path: ${JSON.stringify(config.sourceDir)}`,
      '    checks:',
      ...checkNames.map(name => `      - ${name}`),
    );
  }

  // The root entry point for reviews is always present.
  entryPointLines.push('  - path: "."', '    reviews:', '      - code-quality');

  return [
    'base_branch: origin/main',
    'log_dir: .gauntlet_logs',
    '',
    '# Run gates in parallel when possible (default: true)',
    '# allow_parallel: true',
    '',
    'cli:',
    '  default_preference:',
    ...cliLines,
    '  # Check CLI usage quota before running (if unavailable, uses next in list)',
    '  # check_usage_limit: false',
    '',
    'entry_points:',
    ...entryPointLines,
    '',
  ].join('\n');
}
309
+
310
+ async function promptAndInstallCommands(projectRoot: string, canonicalCommandPath: string, availableAdapters: CLIAdapter[]): Promise<void> {
311
+ // Only proceed if we have available adapters
312
+ if (availableAdapters.length === 0) return;
313
+
314
+ const rl = readline.createInterface({
315
+ input: process.stdin,
316
+ output: process.stdout
317
+ });
318
+
319
+ const question = (prompt: string): Promise<string> => {
320
+ return new Promise((resolve) => {
321
+ rl.question(prompt, (answer) => {
322
+ resolve(answer?.trim() ?? '');
323
+ });
324
+ });
136
325
  };
137
326
 
138
327
  try {
@@ -150,14 +339,12 @@ async function promptAndInstallCommands(projectRoot: string, canonicalCommandPat
150
339
 
151
340
  let installLevel: InstallLevel = 'none';
152
341
  let answer = await question('Select option [1-3]: ');
153
-
154
- // Handle EOF or empty input for non-TTY
155
- if (!isTTY && answer === '' && lineIndex > inputLines.length) {
156
- console.log(chalk.dim('\nNo input received, skipping command installation.'));
157
- return;
158
- }
342
+ let installLevelAttempts = 0;
159
343
 
160
344
  while (true) {
345
+ installLevelAttempts++;
346
+ if (installLevelAttempts > MAX_PROMPT_ATTEMPTS) throw new Error('Too many invalid attempts');
347
+
161
348
  if (answer === '1') {
162
349
  installLevel = 'none';
163
350
  break;
@@ -169,47 +356,48 @@ async function promptAndInstallCommands(projectRoot: string, canonicalCommandPat
169
356
  break;
170
357
  } else {
171
358
  console.log(chalk.yellow('Please enter 1, 2, or 3'));
172
- if (!isTTY && lineIndex >= inputLines.length) {
173
- console.log(chalk.dim('\nNo more input, skipping command installation.'));
174
- return;
175
- }
176
359
  answer = await question('Select option [1-3]: ');
177
360
  }
178
361
  }
179
362
 
180
363
  if (installLevel === 'none') {
181
364
  console.log(chalk.dim('\nSkipping command installation.'));
182
- rl?.close();
365
+ rl.close();
183
366
  return;
184
367
  }
185
368
 
186
- // Question 2: Which agents
187
- const allAdapters = getAllAdapters();
188
- const availableAdapters = installLevel === 'project'
189
- ? allAdapters.filter(a => a.getProjectCommandDir() !== null)
190
- : allAdapters.filter(a => a.getUserCommandDir() !== null);
369
+ // Filter available adapters based on install level support
370
+ const installableAdapters = installLevel === 'project'
371
+ ? availableAdapters.filter(a => a.getProjectCommandDir() !== null)
372
+ : availableAdapters.filter(a => a.getUserCommandDir() !== null);
373
+
374
+ if (installableAdapters.length === 0) {
375
+ console.log(chalk.yellow(`No available agents support ${installLevel}-level commands.`));
376
+ rl.close();
377
+ return;
378
+ }
191
379
 
192
380
  console.log();
193
381
  console.log('Which CLI agents would you like to install the command for?');
194
- availableAdapters.forEach((adapter, i) => {
382
+ installableAdapters.forEach((adapter, i) => {
195
383
  console.log(` ${i + 1}) ${adapter.name}`);
196
384
  });
197
- console.log(` ${availableAdapters.length + 1}) All of the above`);
385
+ console.log(` ${installableAdapters.length + 1}) All of the above`);
198
386
  console.log();
199
387
 
200
388
  let selectedAgents: string[] = [];
201
- answer = await question(`Select options (comma-separated, e.g., 1,2 or ${availableAdapters.length + 1} for all): `);
389
+ answer = await question(`Select options (comma-separated, e.g., 1,2 or ${installableAdapters.length + 1} for all): `);
390
+ let agentSelectionAttempts = 0;
202
391
 
203
392
  while (true) {
393
+ agentSelectionAttempts++;
394
+ if (agentSelectionAttempts > MAX_PROMPT_ATTEMPTS) throw new Error('Too many invalid attempts');
395
+
204
396
  const selections = answer.split(',').map(s => s.trim()).filter(s => s);
205
397
 
206
398
  if (selections.length === 0) {
207
- if (!isTTY && lineIndex >= inputLines.length) {
208
- console.log(chalk.dim('\nNo more input, skipping command installation.'));
209
- return;
210
- }
211
399
  console.log(chalk.yellow('Please select at least one option'));
212
- answer = await question(`Select options (comma-separated, e.g., 1,2 or ${availableAdapters.length + 1} for all): `);
400
+ answer = await question(`Select options (comma-separated, e.g., 1,2 or ${installableAdapters.length + 1} for all): `);
213
401
  continue;
214
402
  }
215
403
 
@@ -218,16 +406,15 @@ async function promptAndInstallCommands(projectRoot: string, canonicalCommandPat
218
406
 
219
407
  for (const sel of selections) {
220
408
  const num = parseInt(sel, 10);
221
- if (isNaN(num) || num < 1 || num > availableAdapters.length + 1) {
409
+ if (isNaN(num) || num < 1 || num > installableAdapters.length + 1) {
222
410
  console.log(chalk.yellow(`Invalid selection: ${sel}`));
223
411
  valid = false;
224
412
  break;
225
413
  }
226
- if (num === availableAdapters.length + 1) {
227
- // All agents
228
- agents.push(...availableAdapters.map(a => a.name));
414
+ if (num === installableAdapters.length + 1) {
415
+ agents.push(...installableAdapters.map(a => a.name));
229
416
  } else {
230
- agents.push(availableAdapters[num - 1].name);
417
+ agents.push(installableAdapters[num - 1].name);
231
418
  }
232
419
  }
233
420
 
@@ -235,21 +422,16 @@ async function promptAndInstallCommands(projectRoot: string, canonicalCommandPat
235
422
  selectedAgents = [...new Set(agents)]; // Dedupe
236
423
  break;
237
424
  }
238
-
239
- if (!isTTY && lineIndex >= inputLines.length) {
240
- console.log(chalk.dim('\nNo more input, skipping command installation.'));
241
- return;
242
- }
243
- answer = await question(`Select options (comma-separated, e.g., 1,2 or ${availableAdapters.length + 1} for all): `);
425
+ answer = await question(`Select options (comma-separated, e.g., 1,2 or ${installableAdapters.length + 1} for all): `);
244
426
  }
245
427
 
246
- rl?.close();
428
+ rl.close();
247
429
 
248
430
  // Install commands
249
431
  await installCommands(installLevel, selectedAgents, projectRoot, canonicalCommandPath);
250
432
 
251
433
  } catch (error: any) {
252
- rl?.close();
434
+ rl.close();
253
435
  throw error;
254
436
  }
255
437
  }
@@ -286,11 +468,7 @@ async function installCommands(
286
468
  }
287
469
 
288
470
  if (!commandDir) {
289
- if (level === 'project') {
290
- console.log(chalk.yellow(` ${adapter.name}: No project-level command support, skipping`));
291
- } else {
292
- console.log(chalk.yellow(` ${adapter.name}: No user-level command support, skipping`));
293
- }
471
+ // This shouldn't happen if we filtered correctly, but good safety check
294
472
  continue;
295
473
  }
296
474
 
@@ -7,7 +7,8 @@ import { JobGenerator } from '../core/job.js';
7
7
  import { Runner } from '../core/runner.js';
8
8
  import { Logger } from '../output/logger.js';
9
9
  import { ConsoleReporter } from '../output/console.js';
10
- import { findPreviousFailures, GateFailures, PreviousViolation } from '../utils/log-parser.js';
10
+ import { findPreviousFailures, type GateFailures, type PreviousViolation } from '../utils/log-parser.js';
11
+ import { rotateLogs } from './shared.js';
11
12
 
12
13
  export function registerRerunCommand(program: Command): void {
13
14
  program
@@ -51,6 +52,9 @@ export function registerRerunCommand(program: Command): void {
51
52
  console.log(chalk.dim('No previous failures found. Running as normal...'));
52
53
  }
53
54
 
55
+ // Rotate logs before starting the new run
56
+ await rotateLogs(config.project.log_dir);
57
+
54
58
  // Detect changes (default to uncommitted unless --commit is specified)
55
59
  // Note: Rerun defaults to uncommitted changes for faster iteration loops,
56
60
  // unlike 'run' which defaults to base_branch comparison.
@@ -7,6 +7,7 @@ import { JobGenerator } from '../core/job.js';
7
7
  import { Runner } from '../core/runner.js';
8
8
  import { Logger } from '../output/logger.js';
9
9
  import { ConsoleReporter } from '../output/console.js';
10
+ import { rotateLogs } from './shared.js';
10
11
 
11
12
  export function registerReviewCommand(program: Command): void {
12
13
  program
@@ -18,6 +19,10 @@ export function registerReviewCommand(program: Command): void {
18
19
  .action(async (options) => {
19
20
  try {
20
21
  const config = await loadConfig();
22
+
23
+ // Rotate logs before starting
24
+ await rotateLogs(config.project.log_dir);
25
+
21
26
  const changeDetector = new ChangeDetector(config.project.base_branch, {
22
27
  commit: options.commit,
23
28
  uncommitted: options.uncommitted
@@ -7,6 +7,7 @@ import { JobGenerator } from '../core/job.js';
7
7
  import { Runner } from '../core/runner.js';
8
8
  import { Logger } from '../output/logger.js';
9
9
  import { ConsoleReporter } from '../output/console.js';
10
+ import { rotateLogs } from './shared.js';
10
11
 
11
12
  export function registerRunCommand(program: Command): void {
12
13
  program
@@ -18,6 +19,10 @@ export function registerRunCommand(program: Command): void {
18
19
  .action(async (options) => {
19
20
  try {
20
21
  const config = await loadConfig();
22
+
23
+ // Rotate logs before starting
24
+ await rotateLogs(config.project.log_dir);
25
+
21
26
  const changeDetector = new ChangeDetector(config.project.base_branch, {
22
27
  commit: options.commit,
23
28
  uncommitted: options.uncommitted
@@ -1,4 +1,5 @@
1
1
  import fs from 'node:fs/promises';
2
+ import path from 'node:path';
2
3
 
3
4
  export async function exists(path: string): Promise<boolean> {
4
5
  try {
@@ -8,3 +9,36 @@ export async function exists(path: string): Promise<boolean> {
8
9
  return false;
9
10
  }
10
11
  }
12
+
13
/**
 * Rotates the log directory so a new run starts with an empty `logDir` while
 * the previous run's output remains available under `logDir/previous`.
 *
 * Steps:
 *  1. List `logDir`; if it does not exist there is nothing to rotate.
 *  2. Reset `logDir/previous` to an empty directory (anything already there,
 *     including a stray non-directory entry of that name, is discarded).
 *  3. Move every other entry of `logDir` into `logDir/previous`.
 *
 * Rotation is best-effort: any failure is reported through `console.warn` and
 * never thrown, so a rotation problem cannot abort the run that follows.
 *
 * @param logDir - Directory holding the current run's logs.
 */
export async function rotateLogs(logDir: string): Promise<void> {
  const previousDir = path.join(logDir, 'previous');

  try {
    // Listing the directory doubles as the existence check, which avoids a
    // separate stat-then-act race.
    let entries: string[];
    try {
      entries = await fs.readdir(logDir);
    } catch (error) {
      if (error instanceof Error && 'code' in error && error.code === 'ENOENT') {
        return; // No log directory yet: nothing to rotate.
      }
      throw error;
    }

    // `force` makes this a no-op when `previous` is absent.
    await fs.rm(previousDir, { recursive: true, force: true });
    await fs.mkdir(previousDir, { recursive: true });

    const toMove = entries.filter(entry => entry !== 'previous');
    // allSettled lets every move be attempted and counted, instead of
    // surfacing only the first rejection.
    const outcomes = await Promise.allSettled(
      toMove.map(entry => fs.rename(path.join(logDir, entry), path.join(previousDir, entry)))
    );

    const failures = outcomes.filter(
      (outcome): outcome is PromiseRejectedResult => outcome.status === 'rejected'
    );
    if (failures.length > 0) {
      const firstReason: unknown = failures[0].reason;
      const detail = firstReason instanceof Error ? firstReason.message : String(firstReason);
      throw new Error(`${failures.length} of ${toMove.length} entries could not be moved (${detail})`);
    }
  } catch (error) {
    // Log warning but don't crash the run as log rotation failure isn't critical
    console.warn(`Failed to rotate logs in ${logDir}:`, error instanceof Error ? error.message : error);
  }
}
@@ -3,7 +3,9 @@ import { promisify } from 'node:util';
3
3
  import { ReviewGateConfig, ReviewPromptFrontmatter } from '../config/types.js';
4
4
  import { GateResult } from './result.js';
5
5
  import { CLIAdapter, getAdapter } from '../cli-adapters/index.js';
6
- import { PreviousViolation } from '../utils/log-parser.js';
6
+ import { Logger } from '../output/logger.js';
7
+ import { parseDiff, isValidViolationLocation, type DiffFileRange } from '../utils/diff-parser.js';
8
+ import { type PreviousViolation } from '../utils/log-parser.js';
7
9
 
8
10
  const execAsync = promisify(exec);
9
11
 
@@ -13,9 +15,17 @@ const JSON_SYSTEM_INSTRUCTION = `
13
15
  You are in a read-only mode. You may read files in the repository to gather context.
14
16
  Do NOT attempt to modify files or run shell commands that change system state.
15
17
  Do NOT access files outside the repository root.
18
+ Do NOT access the .git/ directory or read git history/commit information.
16
19
  Use your available file-reading and search tools to find information.
17
20
  If the diff is insufficient or ambiguous, use your tools to read the full file content or related files.
18
21
 
22
+ CRITICAL SCOPE RESTRICTIONS:
23
+ - ONLY review the code changes shown in the diff below
24
+ - DO NOT review commit history or existing code outside the diff
25
+ - All violations MUST reference file paths and line numbers that appear IN THE DIFF
26
+ - The "file" field must match a file from the diff
27
+ - The "line" field must be within a changed region (lines starting with + in the diff)
28
+
19
29
  IMPORTANT: You must output ONLY a valid JSON object. Do not output any markdown text, explanations, or code blocks outside of the JSON.
20
30
  Each violation MUST include a "priority" field with one of: "critical", "high", "medium", "low".
21
31
 
@@ -246,7 +256,11 @@ export class ReviewGateExecutor {
246
256
 
247
257
  await adapterLogger(`\n--- Review Output (${adapter.name}) ---\n${output}\n`);
248
258
 
249
- const evaluation = this.evaluateOutput(output);
259
+ const evaluation = this.evaluateOutput(output, diff);
260
+
261
+ if (evaluation.filteredCount && evaluation.filteredCount > 0) {
262
+ await adapterLogger(`Note: ${evaluation.filteredCount} out-of-scope violations filtered\n`);
263
+ }
250
264
 
251
265
  // Log formatted summary
252
266
  if (evaluation.json) {
@@ -408,14 +422,21 @@ export class ReviewGateExecutor {
408
422
  return lines.join('\n');
409
423
  }
410
424
 
411
- public evaluateOutput(output: string): { status: 'pass' | 'fail' | 'error'; message: string; json?: any } {
425
+ public evaluateOutput(output: string, diff?: string): {
426
+ status: 'pass' | 'fail' | 'error';
427
+ message: string;
428
+ json?: any;
429
+ filteredCount?: number;
430
+ } {
431
+ const diffRanges = diff ? parseDiff(diff) : undefined;
432
+
412
433
  try {
413
434
  // 1. Try to extract from markdown code block first (most reliable)
414
435
  const jsonBlockMatch = output.match(/```json\s*([\s\S]*?)\s*```/);
415
436
  if (jsonBlockMatch) {
416
437
  try {
417
438
  const json = JSON.parse(jsonBlockMatch[1]);
418
- return this.validateAndReturn(json);
439
+ return this.validateAndReturn(json, diffRanges);
419
440
  } catch {
420
441
  // If code block parse fails, fall back to other methods
421
442
  }
@@ -433,7 +454,7 @@ export class ReviewGateExecutor {
433
454
  const json = JSON.parse(candidate);
434
455
  // If we successfully parsed an object with 'status', it's likely our result
435
456
  if (json.status) {
436
- return this.validateAndReturn(json);
457
+ return this.validateAndReturn(json, diffRanges);
437
458
  }
438
459
  } catch {
439
460
  // Not valid JSON, keep searching backwards
@@ -448,7 +469,7 @@ export class ReviewGateExecutor {
448
469
  try {
449
470
  const candidate = output.substring(firstStart, end + 1);
450
471
  const json = JSON.parse(candidate);
451
- return this.validateAndReturn(json);
472
+ return this.validateAndReturn(json, diffRanges);
452
473
  } catch {
453
474
  // Ignore
454
475
  }
@@ -461,7 +482,10 @@ export class ReviewGateExecutor {
461
482
  }
462
483
  }
463
484
 
464
- private validateAndReturn(json: any): { status: 'pass' | 'fail' | 'error'; message: string; json?: any } {
485
+ private validateAndReturn(
486
+ json: any,
487
+ diffRanges?: Map<string, DiffFileRange>
488
+ ): { status: 'pass' | 'fail' | 'error'; message: string; json?: any; filteredCount?: number } {
465
489
  // Validate Schema
466
490
  if (!json.status || (json.status !== 'pass' && json.status !== 'fail')) {
467
491
  return { status: 'error', message: 'Invalid JSON: missing or invalid "status" field', json };
@@ -472,6 +496,33 @@ export class ReviewGateExecutor {
472
496
  }
473
497
 
474
498
  // json.status === 'fail'
499
+ let filteredCount = 0;
500
+
501
+ if (Array.isArray(json.violations) && diffRanges?.size) {
502
+ const originalCount = json.violations.length;
503
+
504
+ json.violations = json.violations.filter((v: any) => {
505
+ const isValid = isValidViolationLocation(v.file, v.line, diffRanges);
506
+ if (!isValid) {
507
+ // Can't easily access logger here, but could return warning info
508
+ // console.warn(`[WARNING] Filtered violation: ${v.file}:${v.line ?? '?'} (not in diff)`);
509
+ }
510
+ return isValid;
511
+ });
512
+
513
+ filteredCount = originalCount - json.violations.length;
514
+
515
+ // If all filtered out, change to pass
516
+ if (json.violations.length === 0) {
517
+ return {
518
+ status: 'pass',
519
+ message: `Passed (${filteredCount} out-of-scope violations filtered)`,
520
+ json: { status: 'pass' },
521
+ filteredCount
522
+ };
523
+ }
524
+ }
525
+
475
526
  const violationCount = Array.isArray(json.violations) ? json.violations.length : 'some';
476
527
 
477
528
  // Construct a summary message
@@ -481,7 +532,7 @@ export class ReviewGateExecutor {
481
532
  msg += `. Example: ${first.issue} in ${first.file}`;
482
533
  }
483
534
 
484
- return { status: 'fail', message: msg, json };
535
+ return { status: 'fail', message: msg, json, filteredCount };
485
536
  }
486
537
 
487
538
  private parseLines(stdout: string): string[] {
@@ -0,0 +1,86 @@
1
/** Set of new-file line numbers that a diff adds or modifies for one file. */
export type DiffFileRange = Set<number>;

/** Prefix that starts every per-file section of a `git diff`. */
const DIFF_HEADER_PREFIX = 'diff --git ';

/** Hunk header: `@@ -old[,count] +new[,count] @@`; captures the new-file start line. */
const HUNK_HEADER = /^@@ -\d+(?:,\d+)? \+(\d+)(?:,\d+)? @@/;

/** `a/<path> b/<path>` where both sides are the same path (may contain spaces). */
const SAME_PATH_PREFIXED = /^a\/(.+) b\/\1$/;

/** `<path> <path>` as produced when the a/ and b/ prefixes are disabled. */
const SAME_PATH_UNPREFIXED = /^(.+) \1$/;

/**
 * Extracts the target (new-side) path from the part of a `diff --git` header
 * that follows the `diff --git ` prefix.
 *
 * Returns null when no usable path can be found.
 */
function extractTargetPath(paths: string): string | null {
  // Most common case: the file keeps its name, so both halves are identical.
  // Matching them against each other keeps paths with spaces intact.
  const prefixed = SAME_PATH_PREFIXED.exec(paths)?.[1];
  if (prefixed !== undefined) return prefixed;

  // Rename/copy: the two paths differ, so take everything after the last " b/".
  const targetStart = paths.lastIndexOf(' b/');
  if (targetStart !== -1) return paths.slice(targetStart + 3);

  const unprefixed = SAME_PATH_UNPREFIXED.exec(paths)?.[1];
  if (unprefixed !== undefined) return unprefixed;

  // Last resort: second space-separated token, with any b/ prefix stripped.
  const token = paths.split(' ')[1];
  if (!token) return null;
  return token.startsWith('b/') ? token.substring(2) : token;
}

/**
 * Parses a unified git diff into a map of file path -> set of line numbers.
 *
 * Only added lines (diff lines starting with `+` inside a hunk) are recorded,
 * numbered by their position in the new version of the file. Files under
 * `.git/` are skipped. A file that appears in the diff without added lines
 * (e.g. a pure deletion) is present in the map with an empty set.
 *
 * @param diff - Text in `git diff` format (sections introduced by `diff --git`).
 * @returns Map keyed by the target path of each file section.
 */
export function parseDiff(diff: string): Map<string, DiffFileRange> {
  const fileRanges = new Map<string, DiffFileRange>();

  let currentRanges: DiffFileRange | null = null;
  let currentLineNumber = 0;
  // True only between a valid hunk header and the next file header. File
  // metadata (`index`, `---`, `+++`, mode lines) always precedes the first
  // hunk, so it is skipped by position rather than by inspecting its text.
  // This keeps added lines whose content begins with `++` (rendered `+++...`)
  // from being mistaken for a header.
  let inHunk = false;

  for (const line of diff.split('\n')) {
    if (line.startsWith(DIFF_HEADER_PREFIX)) {
      // Always reset, so a malformed header cannot leak hunks into the
      // previously active file.
      inHunk = false;
      currentRanges = null;

      const targetPath = extractTargetPath(line.slice(DIFF_HEADER_PREFIX.length));
      if (targetPath !== null && !targetPath.startsWith('.git/')) {
        // Reuse an existing set if the same file shows up in several sections.
        currentRanges = fileRanges.get(targetPath) ?? new Set<number>();
        fileRanges.set(targetPath, currentRanges);
      }
      continue;
    }

    // Ignored file (e.g. .git/) or text before the first file header.
    if (!currentRanges) continue;

    if (line.startsWith('@@')) {
      const newStart = HUNK_HEADER.exec(line)?.[1];
      // Without a parseable start line the numbering would be meaningless,
      // so the hunk body is skipped instead of counted from a stale value.
      inHunk = newStart !== undefined;
      if (newStart !== undefined) {
        currentLineNumber = parseInt(newStart, 10);
      }
      continue;
    }

    if (!inHunk) continue;

    if (line.startsWith('+')) {
      // Added line: record it and advance the new-file counter.
      currentRanges.add(currentLineNumber);
      currentLineNumber++;
    } else if (line.startsWith(' ')) {
      // Context line: exists in the new file but is unchanged.
      currentLineNumber++;
    }
    // Removed lines (-) and "\ No newline at end of file" markers do not
    // exist in the new file, so they do not advance the counter.
  }

  return fileRanges;
}
64
+
65
/**
 * Checks whether a reported violation points at a line that the diff added.
 *
 * The file and line usually come from model-generated JSON, so the values are
 * normalized before lookup: a line given as a string of digits is accepted,
 * and a leading `./` on the file path is ignored.
 *
 * @param file - Path reported for the violation.
 * @param line - Line number reported for the violation, if any.
 * @param diffRanges - Map of file path -> added line numbers; when undefined
 *   (no diff scoping in effect) every location is considered valid.
 * @returns true if the location is inside the diff, or if no ranges were given.
 */
export function isValidViolationLocation(
  file: string,
  line: number | string | null | undefined,
  diffRanges: ReadonlyMap<string, ReadonlySet<number>> | undefined
): boolean {
  // If no diff ranges provided (e.g. full file review), assume valid
  if (!diffRanges) return true;

  // Runtime guard: callers may pass untyped JSON values.
  if (typeof file !== 'string') return false;

  // Accept "12" as 12; anything else non-numeric (including null/undefined)
  // cannot be matched against the diff and is rejected.
  let lineNumber: number | undefined;
  if (typeof line === 'number') {
    lineNumber = line;
  } else if (typeof line === 'string' && /^\d+$/.test(line.trim())) {
    lineNumber = Number(line.trim());
  }
  if (lineNumber === undefined || !Number.isInteger(lineNumber)) return false;

  // Try the path as given first, then without any leading "./" segments.
  const validLines = diffRanges.get(file) ?? diffRanges.get(file.replace(/^(?:\.\/)+/, ''));

  // File not in diff -> invalid.
  return validLines?.has(lineNumber) ?? false;
}