agent-gauntlet 0.2.2 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. package/README.md +3 -3
  2. package/package.json +1 -1
  3. package/src/cli-adapters/claude.ts +13 -1
  4. package/src/cli-adapters/gemini.ts +17 -2
  5. package/src/commands/check.ts +98 -12
  6. package/src/commands/ci/list-jobs.ts +3 -2
  7. package/src/commands/clean.ts +29 -0
  8. package/src/commands/help.ts +1 -1
  9. package/src/commands/index.ts +1 -1
  10. package/src/commands/init.ts +4 -4
  11. package/src/commands/review.ts +98 -12
  12. package/src/commands/run.ts +98 -12
  13. package/src/commands/shared.ts +56 -10
  14. package/src/config/schema.ts +4 -0
  15. package/src/config/validator.ts +6 -13
  16. package/src/core/change-detector.ts +1 -0
  17. package/src/core/entry-point.ts +48 -7
  18. package/src/core/runner.ts +57 -47
  19. package/src/gates/result.ts +32 -0
  20. package/src/gates/review.ts +323 -51
  21. package/src/index.ts +2 -2
  22. package/src/output/console.ts +96 -9
  23. package/src/output/logger.ts +40 -7
  24. package/src/templates/run_gauntlet.template.md +20 -13
  25. package/src/utils/log-parser.ts +409 -165
  26. package/src/utils/session-ref.ts +82 -0
  27. package/src/commands/check.test.ts +0 -29
  28. package/src/commands/detect.test.ts +0 -43
  29. package/src/commands/health.test.ts +0 -93
  30. package/src/commands/help.test.ts +0 -44
  31. package/src/commands/init.test.ts +0 -130
  32. package/src/commands/list.test.ts +0 -121
  33. package/src/commands/rerun.ts +0 -160
  34. package/src/commands/review.test.ts +0 -31
  35. package/src/commands/run.test.ts +0 -27
  36. package/src/config/loader.test.ts +0 -151
  37. package/src/core/entry-point.test.ts +0 -61
  38. package/src/gates/review.test.ts +0 -291
@@ -2,6 +2,7 @@ import fs from "node:fs/promises";
2
2
  import chalk from "chalk";
3
3
  import type { Job } from "../core/job.js";
4
4
  import type { GateResult } from "../gates/result.js";
5
+ import { reconstructHistory } from "../utils/log-parser.js";
5
6
 
6
7
  export class ConsoleReporter {
7
8
  onJobStart(job: Job) {
@@ -10,10 +11,12 @@ export class ConsoleReporter {
10
11
 
11
12
  onJobComplete(job: Job, result: GateResult) {
12
13
  const duration = `${(result.duration / 1000).toFixed(2)}s`;
14
+
13
15
  const message = result.message ?? "";
14
16
 
15
17
  if (result.subResults && result.subResults.length > 0) {
16
18
  // Print split results
19
+
17
20
  for (const sub of result.subResults) {
18
21
  const statusColor =
19
22
  sub.status === "pass"
@@ -21,6 +24,7 @@ export class ConsoleReporter {
21
24
  : sub.status === "fail"
22
25
  ? chalk.red
23
26
  : chalk.magenta;
27
+
24
28
  const label =
25
29
  sub.status === "pass"
26
30
  ? "PASS"
@@ -29,8 +33,15 @@ export class ConsoleReporter {
29
33
  : "ERROR";
30
34
 
31
35
  let logInfo = "";
36
+
32
37
  if (sub.status !== "pass" && sub.logPath) {
33
- logInfo = `\n Log: ${sub.logPath}`;
38
+ // Prefer JSON if it exists for reviews
39
+
40
+ const displayLog = sub.logPath;
41
+
42
+ const logPrefix = displayLog.endsWith(".json") ? "Review:" : "Log:";
43
+
44
+ logInfo = `\n ${logPrefix} ${displayLog}`;
34
45
  }
35
46
 
36
47
  console.log(
@@ -66,18 +77,94 @@ export class ConsoleReporter {
66
77
  }
67
78
  }
68
79
 
69
- async printSummary(results: GateResult[]) {
70
- console.log(`\n${chalk.bold("--- Gauntlet Summary ---")}`);
80
+ async printSummary(results: GateResult[], logDir?: string) {
81
+ console.log(
82
+ `\n${chalk.bold("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━")}`,
83
+ );
84
+ console.log(chalk.bold("RESULTS SUMMARY"));
85
+ console.log(chalk.bold("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"));
86
+
87
+ if (logDir) {
88
+ try {
89
+ const history = await reconstructHistory(logDir);
90
+ for (const iter of history) {
91
+ if (iter.fixed.length === 0 && iter.skipped.length === 0) continue;
92
+
93
+ console.log(`\nIteration ${iter.iteration}:`);
94
+ for (const f of iter.fixed) {
95
+ const label = f.adapter ? `${f.jobId} (${f.adapter})` : f.jobId;
96
+ console.log(chalk.green(` ✓ Fixed: ${label} - ${f.details}`));
97
+ }
98
+ for (const s of iter.skipped) {
99
+ const label = s.adapter ? `${s.jobId} (${s.adapter})` : s.jobId;
100
+ console.log(
101
+ chalk.yellow(
102
+ ` ⊘ Skipped: ${label} - ${s.file}:${s.line} ${s.issue}`,
103
+ ),
104
+ );
105
+ if (s.result) {
106
+ console.log(chalk.dim(` Reason: ${s.result}`));
107
+ }
108
+ }
109
+ }
110
+
111
+ const totalFixed = history.reduce(
112
+ (sum, iter) => sum + iter.fixed.length,
113
+ 0,
114
+ );
115
+ const totalSkipped = history.reduce(
116
+ (sum, iter) => sum + iter.skipped.length,
117
+ 0,
118
+ );
119
+
120
+ let totalFailed = 0;
121
+ for (const res of results) {
122
+ if (res.subResults && res.subResults.length > 0) {
123
+ for (const sub of res.subResults) {
124
+ if (sub.status === "fail" || sub.status === "error") {
125
+ totalFailed += sub.errorCount ?? 1;
126
+ }
127
+ }
128
+ } else if (res.status === "fail" || res.status === "error") {
129
+ totalFailed += res.errorCount ?? 1;
130
+ }
131
+ }
132
+
133
+ console.log(
134
+ `\n${chalk.bold("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━")}`,
135
+ );
136
+ const iterationsText =
137
+ history.length > 1 ? ` after ${history.length} iterations` : "";
138
+ console.log(
139
+ `Total: ${totalFixed} fixed, ${totalSkipped} skipped, ${totalFailed} failed${iterationsText}`,
140
+ );
141
+ } catch (err) {
142
+ console.warn(
143
+ chalk.yellow(`Warning: Failed to reconstruct history: ${err}`),
144
+ );
145
+ }
146
+ }
71
147
 
72
- const passed = results.filter((r) => r.status === "pass");
73
148
  const failed = results.filter((r) => r.status === "fail");
74
149
  const errored = results.filter((r) => r.status === "error");
150
+ const anySkipped = results.some((r) => r.skipped && r.skipped.length > 0);
151
+
152
+ let overallStatus = "Passed";
153
+ let statusColor = chalk.green;
154
+
155
+ if (errored.length > 0) {
156
+ overallStatus = "Error";
157
+ statusColor = chalk.magenta;
158
+ } else if (failed.length > 0) {
159
+ overallStatus = "Failed";
160
+ statusColor = chalk.red;
161
+ } else if (anySkipped) {
162
+ overallStatus = "Passed with warnings";
163
+ statusColor = chalk.yellow;
164
+ }
75
165
 
76
- console.log(`Total: ${results.length}`);
77
- console.log(chalk.green(`Passed: ${passed.length}`));
78
- if (failed.length > 0) console.log(chalk.red(`Failed: ${failed.length}`));
79
- if (errored.length > 0)
80
- console.log(chalk.magenta(`Errored: ${errored.length}`));
166
+ console.log(statusColor(`Status: ${overallStatus}`));
167
+ console.log(chalk.bold("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n"));
81
168
  }
82
169
 
83
170
  /** @internal Public for testing */
@@ -6,8 +6,38 @@ function formatTimestamp(): string {
6
6
  return new Date().toISOString();
7
7
  }
8
8
 
9
+ /**
10
+ * Compute the next run number for a given log file prefix.
11
+ * Scans existing files in logDir and returns max+1 (or 1 if none exist).
12
+ */
13
+ async function nextRunNumber(logDir: string, prefix: string): Promise<number> {
14
+ try {
15
+ const files = await fs.readdir(logDir);
16
+ let max = 0;
17
+ const expectedStart = `${prefix}.`;
18
+ const expectedEnd = ".log";
19
+ for (const file of files) {
20
+ if (!file.startsWith(expectedStart) || !file.endsWith(expectedEnd)) {
21
+ continue;
22
+ }
23
+ const middle = file.slice(
24
+ expectedStart.length,
25
+ file.length - expectedEnd.length,
26
+ );
27
+ if (/^\d+$/.test(middle)) {
28
+ const n = parseInt(middle, 10);
29
+ if (n > max) max = n;
30
+ }
31
+ }
32
+ return max + 1;
33
+ } catch {
34
+ return 1;
35
+ }
36
+ }
37
+
9
38
  export class Logger {
10
39
  private initializedFiles: Set<string> = new Set();
40
+ private runNumberCache: Map<string, number> = new Map();
11
41
 
12
42
  constructor(private logDir: string) {}
13
43
 
@@ -19,19 +49,22 @@ export class Logger {
19
49
  // No-op - using append mode
20
50
  }
21
51
 
22
- getLogPath(jobId: string, adapterName?: string): string {
52
+ async getLogPath(jobId: string, adapterName?: string): Promise<string> {
23
53
  const safeName = sanitizeJobId(jobId);
24
- if (adapterName) {
25
- return path.join(this.logDir, `${safeName}_${adapterName}.log`);
54
+ const prefix = adapterName ? `${safeName}_${adapterName}` : safeName;
55
+
56
+ if (!this.runNumberCache.has(prefix)) {
57
+ const num = await nextRunNumber(this.logDir, prefix);
58
+ this.runNumberCache.set(prefix, num);
26
59
  }
27
- return path.join(this.logDir, `${safeName}.log`);
60
+ const runNum = this.runNumberCache.get(prefix) ?? 1;
61
+ return path.join(this.logDir, `${prefix}.${runNum}.log`);
28
62
  }
29
63
 
30
64
  private async initFile(logPath: string): Promise<void> {
31
65
  if (this.initializedFiles.has(logPath)) {
32
66
  return;
33
67
  }
34
- // Add to set BEFORE writing to make this more atomic
35
68
  this.initializedFiles.add(logPath);
36
69
  await fs.writeFile(logPath, "");
37
70
  }
@@ -39,7 +72,7 @@ export class Logger {
39
72
  async createJobLogger(
40
73
  jobId: string,
41
74
  ): Promise<(text: string) => Promise<void>> {
42
- const logPath = this.getLogPath(jobId);
75
+ const logPath = await this.getLogPath(jobId);
43
76
  await this.initFile(logPath);
44
77
 
45
78
  return async (text: string) => {
@@ -61,7 +94,7 @@ export class Logger {
61
94
  adapterName?: string,
62
95
  ) => Promise<{ logger: (text: string) => Promise<void>; logPath: string }> {
63
96
  return async (adapterName?: string) => {
64
- const logPath = this.getLogPath(jobId, adapterName);
97
+ const logPath = await this.getLogPath(jobId, adapterName);
65
98
  await this.initFile(logPath);
66
99
 
67
100
  const logger = async (text: string) => {
@@ -18,17 +18,24 @@ Execute the autonomous verification suite.
18
18
 
19
19
  **Review trust level: medium** — Fix issues you reasonably agree with or believe the human wants fixed. Skip issues that are purely stylistic, subjective, or that you believe the human would not want changed. When you skip an issue, briefly state what was skipped and why.
20
20
 
21
- 1. Run `agent-gauntlet run`.
21
+ 0. Run `agent-gauntlet clean` to archive any previous log files
22
+ 1. Run `agent-gauntlet run`
22
23
  2. If it fails:
23
- - Check the console output for "Fix instructions: available" messages.
24
- - Read the log files in `gauntlet_logs/` to understand exactly what went wrong.
25
- - If fix instructions are available, they will be in the log file under a "--- Fix Instructions ---" section—carefully read and apply them FIRST before attempting other fixes.
26
- 3. Fix any code or logic errors found by the tools or AI reviewers, prioritizing higher-priority violations (critical > high > medium > low).
27
- 4. Apply the trust level above when deciding whether to act on AI reviewer feedback. If you skip an issue due to the trust threshold, report it with a brief explanation (e.g., "Skipped: [issue summary] — reason: [stylistic/subjective/disagree]").
28
- 5. Do NOT commit your changes yet—keep them uncommitted so the rerun command can review them.
29
- 6. Run `agent-gauntlet rerun` to verify your fixes. The rerun command reviews only uncommitted changes and uses previous failures as context.
30
- 7. Repeat steps 2-6 until one of the following termination conditions is met:
31
- - All gates pass
32
- - You are skipping remaining issues
33
- - Still failing after 3 rerun attempts
34
- 8. Once all gates pass, do NOT commit or push your changes—await the human's review and explicit instruction to commit.
24
+ - Identify the failed gates from the console output.
25
+ - For CHECK failures: Read the `.log` file path provided in the output.
26
+ - For REVIEW failures: Read the `.json` file path provided in the "Review: <path>" output.
27
+ 3. Address the violations:
28
+ - For REVIEW violations: You MUST update the `"status"` and `"result"` fields in the provided `.json` file for EACH violation.
29
+ - Set `"status": "fixed"` and add a brief description to `"result"` for issues you fix.
30
+ - Set `"status": "skipped"` and add a brief reason to `"result"` for issues you skip (based on the trust level).
31
+ - Do NOT modify any other attributes (file, line, issue, priority) in the JSON file.
32
+ - Apply the trust level above when deciding whether to act on AI reviewer feedback.
33
+ 4. Run `agent-gauntlet run` again to verify your fixes. It will detect existing logs and automatically switch to verification mode.
34
+ 5. Repeat steps 2-4 until one of the following termination conditions is met:
35
+ - "Status: Passed" appears in the output (logs are automatically archived)
36
+ - "Status: Passed with warnings" appears in the output (remaining issues were skipped)
37
+ - Still failing after 3 attempts -> Run `agent-gauntlet clean` to archive logs and reset state.
38
+ 6. Provide a summary of the session:
39
+ - Issues Fixed: (list key fixes)
40
+ - Issues Skipped: (list skipped items and reasons)
41
+ - Outstanding Failures: (if any, explain why they couldn't be resolved)