agent-gauntlet 0.2.2 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +3 -3
- package/package.json +1 -1
- package/src/cli-adapters/claude.ts +13 -1
- package/src/cli-adapters/gemini.ts +17 -2
- package/src/commands/check.ts +98 -12
- package/src/commands/ci/list-jobs.ts +3 -2
- package/src/commands/clean.ts +29 -0
- package/src/commands/help.ts +1 -1
- package/src/commands/index.ts +1 -1
- package/src/commands/init.ts +4 -4
- package/src/commands/review.ts +98 -12
- package/src/commands/run.ts +98 -12
- package/src/commands/shared.ts +56 -10
- package/src/config/schema.ts +4 -0
- package/src/config/validator.ts +6 -13
- package/src/core/change-detector.ts +1 -0
- package/src/core/entry-point.ts +48 -7
- package/src/core/runner.ts +57 -47
- package/src/gates/result.ts +32 -0
- package/src/gates/review.ts +323 -51
- package/src/index.ts +2 -2
- package/src/output/console.ts +96 -9
- package/src/output/logger.ts +40 -7
- package/src/templates/run_gauntlet.template.md +20 -13
- package/src/utils/log-parser.ts +409 -165
- package/src/utils/session-ref.ts +82 -0
- package/src/commands/check.test.ts +0 -29
- package/src/commands/detect.test.ts +0 -43
- package/src/commands/health.test.ts +0 -93
- package/src/commands/help.test.ts +0 -44
- package/src/commands/init.test.ts +0 -130
- package/src/commands/list.test.ts +0 -121
- package/src/commands/rerun.ts +0 -160
- package/src/commands/review.test.ts +0 -31
- package/src/commands/run.test.ts +0 -27
- package/src/config/loader.test.ts +0 -151
- package/src/core/entry-point.test.ts +0 -61
- package/src/gates/review.test.ts +0 -291
package/src/output/console.ts
CHANGED
|
@@ -2,6 +2,7 @@ import fs from "node:fs/promises";
|
|
|
2
2
|
import chalk from "chalk";
|
|
3
3
|
import type { Job } from "../core/job.js";
|
|
4
4
|
import type { GateResult } from "../gates/result.js";
|
|
5
|
+
import { reconstructHistory } from "../utils/log-parser.js";
|
|
5
6
|
|
|
6
7
|
export class ConsoleReporter {
|
|
7
8
|
onJobStart(job: Job) {
|
|
@@ -10,10 +11,12 @@ export class ConsoleReporter {
|
|
|
10
11
|
|
|
11
12
|
onJobComplete(job: Job, result: GateResult) {
|
|
12
13
|
const duration = `${(result.duration / 1000).toFixed(2)}s`;
|
|
14
|
+
|
|
13
15
|
const message = result.message ?? "";
|
|
14
16
|
|
|
15
17
|
if (result.subResults && result.subResults.length > 0) {
|
|
16
18
|
// Print split results
|
|
19
|
+
|
|
17
20
|
for (const sub of result.subResults) {
|
|
18
21
|
const statusColor =
|
|
19
22
|
sub.status === "pass"
|
|
@@ -21,6 +24,7 @@ export class ConsoleReporter {
|
|
|
21
24
|
: sub.status === "fail"
|
|
22
25
|
? chalk.red
|
|
23
26
|
: chalk.magenta;
|
|
27
|
+
|
|
24
28
|
const label =
|
|
25
29
|
sub.status === "pass"
|
|
26
30
|
? "PASS"
|
|
@@ -29,8 +33,15 @@ export class ConsoleReporter {
|
|
|
29
33
|
: "ERROR";
|
|
30
34
|
|
|
31
35
|
let logInfo = "";
|
|
36
|
+
|
|
32
37
|
if (sub.status !== "pass" && sub.logPath) {
|
|
33
|
-
|
|
38
|
+
// Prefer JSON if it exists for reviews
|
|
39
|
+
|
|
40
|
+
const displayLog = sub.logPath;
|
|
41
|
+
|
|
42
|
+
const logPrefix = displayLog.endsWith(".json") ? "Review:" : "Log:";
|
|
43
|
+
|
|
44
|
+
logInfo = `\n ${logPrefix} ${displayLog}`;
|
|
34
45
|
}
|
|
35
46
|
|
|
36
47
|
console.log(
|
|
@@ -66,18 +77,94 @@ export class ConsoleReporter {
|
|
|
66
77
|
}
|
|
67
78
|
}
|
|
68
79
|
|
|
69
|
-
async printSummary(results: GateResult[]) {
|
|
70
|
-
console.log(
|
|
80
|
+
async printSummary(results: GateResult[], logDir?: string) {
|
|
81
|
+
console.log(
|
|
82
|
+
`\n${chalk.bold("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━")}`,
|
|
83
|
+
);
|
|
84
|
+
console.log(chalk.bold("RESULTS SUMMARY"));
|
|
85
|
+
console.log(chalk.bold("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"));
|
|
86
|
+
|
|
87
|
+
if (logDir) {
|
|
88
|
+
try {
|
|
89
|
+
const history = await reconstructHistory(logDir);
|
|
90
|
+
for (const iter of history) {
|
|
91
|
+
if (iter.fixed.length === 0 && iter.skipped.length === 0) continue;
|
|
92
|
+
|
|
93
|
+
console.log(`\nIteration ${iter.iteration}:`);
|
|
94
|
+
for (const f of iter.fixed) {
|
|
95
|
+
const label = f.adapter ? `${f.jobId} (${f.adapter})` : f.jobId;
|
|
96
|
+
console.log(chalk.green(` ✓ Fixed: ${label} - ${f.details}`));
|
|
97
|
+
}
|
|
98
|
+
for (const s of iter.skipped) {
|
|
99
|
+
const label = s.adapter ? `${s.jobId} (${s.adapter})` : s.jobId;
|
|
100
|
+
console.log(
|
|
101
|
+
chalk.yellow(
|
|
102
|
+
` ⊘ Skipped: ${label} - ${s.file}:${s.line} ${s.issue}`,
|
|
103
|
+
),
|
|
104
|
+
);
|
|
105
|
+
if (s.result) {
|
|
106
|
+
console.log(chalk.dim(` Reason: ${s.result}`));
|
|
107
|
+
}
|
|
108
|
+
}
|
|
109
|
+
}
|
|
110
|
+
|
|
111
|
+
const totalFixed = history.reduce(
|
|
112
|
+
(sum, iter) => sum + iter.fixed.length,
|
|
113
|
+
0,
|
|
114
|
+
);
|
|
115
|
+
const totalSkipped = history.reduce(
|
|
116
|
+
(sum, iter) => sum + iter.skipped.length,
|
|
117
|
+
0,
|
|
118
|
+
);
|
|
119
|
+
|
|
120
|
+
let totalFailed = 0;
|
|
121
|
+
for (const res of results) {
|
|
122
|
+
if (res.subResults && res.subResults.length > 0) {
|
|
123
|
+
for (const sub of res.subResults) {
|
|
124
|
+
if (sub.status === "fail" || sub.status === "error") {
|
|
125
|
+
totalFailed += sub.errorCount ?? 1;
|
|
126
|
+
}
|
|
127
|
+
}
|
|
128
|
+
} else if (res.status === "fail" || res.status === "error") {
|
|
129
|
+
totalFailed += res.errorCount ?? 1;
|
|
130
|
+
}
|
|
131
|
+
}
|
|
132
|
+
|
|
133
|
+
console.log(
|
|
134
|
+
`\n${chalk.bold("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━")}`,
|
|
135
|
+
);
|
|
136
|
+
const iterationsText =
|
|
137
|
+
history.length > 1 ? ` after ${history.length} iterations` : "";
|
|
138
|
+
console.log(
|
|
139
|
+
`Total: ${totalFixed} fixed, ${totalSkipped} skipped, ${totalFailed} failed${iterationsText}`,
|
|
140
|
+
);
|
|
141
|
+
} catch (err) {
|
|
142
|
+
console.warn(
|
|
143
|
+
chalk.yellow(`Warning: Failed to reconstruct history: ${err}`),
|
|
144
|
+
);
|
|
145
|
+
}
|
|
146
|
+
}
|
|
71
147
|
|
|
72
|
-
const passed = results.filter((r) => r.status === "pass");
|
|
73
148
|
const failed = results.filter((r) => r.status === "fail");
|
|
74
149
|
const errored = results.filter((r) => r.status === "error");
|
|
150
|
+
const anySkipped = results.some((r) => r.skipped && r.skipped.length > 0);
|
|
151
|
+
|
|
152
|
+
let overallStatus = "Passed";
|
|
153
|
+
let statusColor = chalk.green;
|
|
154
|
+
|
|
155
|
+
if (errored.length > 0) {
|
|
156
|
+
overallStatus = "Error";
|
|
157
|
+
statusColor = chalk.magenta;
|
|
158
|
+
} else if (failed.length > 0) {
|
|
159
|
+
overallStatus = "Failed";
|
|
160
|
+
statusColor = chalk.red;
|
|
161
|
+
} else if (anySkipped) {
|
|
162
|
+
overallStatus = "Passed with warnings";
|
|
163
|
+
statusColor = chalk.yellow;
|
|
164
|
+
}
|
|
75
165
|
|
|
76
|
-
console.log(`
|
|
77
|
-
console.log(chalk.
|
|
78
|
-
if (failed.length > 0) console.log(chalk.red(`Failed: ${failed.length}`));
|
|
79
|
-
if (errored.length > 0)
|
|
80
|
-
console.log(chalk.magenta(`Errored: ${errored.length}`));
|
|
166
|
+
console.log(statusColor(`Status: ${overallStatus}`));
|
|
167
|
+
console.log(chalk.bold("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n"));
|
|
81
168
|
}
|
|
82
169
|
|
|
83
170
|
/** @internal Public for testing */
|
package/src/output/logger.ts
CHANGED
|
@@ -6,8 +6,38 @@ function formatTimestamp(): string {
|
|
|
6
6
|
return new Date().toISOString();
|
|
7
7
|
}
|
|
8
8
|
|
|
9
|
+
/**
|
|
10
|
+
* Compute the next run number for a given log file prefix.
|
|
11
|
+
* Scans existing files in logDir and returns max+1 (or 1 if none exist).
|
|
12
|
+
*/
|
|
13
|
+
async function nextRunNumber(logDir: string, prefix: string): Promise<number> {
|
|
14
|
+
try {
|
|
15
|
+
const files = await fs.readdir(logDir);
|
|
16
|
+
let max = 0;
|
|
17
|
+
const expectedStart = `${prefix}.`;
|
|
18
|
+
const expectedEnd = ".log";
|
|
19
|
+
for (const file of files) {
|
|
20
|
+
if (!file.startsWith(expectedStart) || !file.endsWith(expectedEnd)) {
|
|
21
|
+
continue;
|
|
22
|
+
}
|
|
23
|
+
const middle = file.slice(
|
|
24
|
+
expectedStart.length,
|
|
25
|
+
file.length - expectedEnd.length,
|
|
26
|
+
);
|
|
27
|
+
if (/^\d+$/.test(middle)) {
|
|
28
|
+
const n = parseInt(middle, 10);
|
|
29
|
+
if (n > max) max = n;
|
|
30
|
+
}
|
|
31
|
+
}
|
|
32
|
+
return max + 1;
|
|
33
|
+
} catch {
|
|
34
|
+
return 1;
|
|
35
|
+
}
|
|
36
|
+
}
|
|
37
|
+
|
|
9
38
|
export class Logger {
|
|
10
39
|
private initializedFiles: Set<string> = new Set();
|
|
40
|
+
private runNumberCache: Map<string, number> = new Map();
|
|
11
41
|
|
|
12
42
|
constructor(private logDir: string) {}
|
|
13
43
|
|
|
@@ -19,19 +49,22 @@ export class Logger {
|
|
|
19
49
|
// No-op - using append mode
|
|
20
50
|
}
|
|
21
51
|
|
|
22
|
-
getLogPath(jobId: string, adapterName?: string): string {
|
|
52
|
+
async getLogPath(jobId: string, adapterName?: string): Promise<string> {
|
|
23
53
|
const safeName = sanitizeJobId(jobId);
|
|
24
|
-
|
|
25
|
-
|
|
54
|
+
const prefix = adapterName ? `${safeName}_${adapterName}` : safeName;
|
|
55
|
+
|
|
56
|
+
if (!this.runNumberCache.has(prefix)) {
|
|
57
|
+
const num = await nextRunNumber(this.logDir, prefix);
|
|
58
|
+
this.runNumberCache.set(prefix, num);
|
|
26
59
|
}
|
|
27
|
-
|
|
60
|
+
const runNum = this.runNumberCache.get(prefix) ?? 1;
|
|
61
|
+
return path.join(this.logDir, `${prefix}.${runNum}.log`);
|
|
28
62
|
}
|
|
29
63
|
|
|
30
64
|
private async initFile(logPath: string): Promise<void> {
|
|
31
65
|
if (this.initializedFiles.has(logPath)) {
|
|
32
66
|
return;
|
|
33
67
|
}
|
|
34
|
-
// Add to set BEFORE writing to make this more atomic
|
|
35
68
|
this.initializedFiles.add(logPath);
|
|
36
69
|
await fs.writeFile(logPath, "");
|
|
37
70
|
}
|
|
@@ -39,7 +72,7 @@ export class Logger {
|
|
|
39
72
|
async createJobLogger(
|
|
40
73
|
jobId: string,
|
|
41
74
|
): Promise<(text: string) => Promise<void>> {
|
|
42
|
-
const logPath = this.getLogPath(jobId);
|
|
75
|
+
const logPath = await this.getLogPath(jobId);
|
|
43
76
|
await this.initFile(logPath);
|
|
44
77
|
|
|
45
78
|
return async (text: string) => {
|
|
@@ -61,7 +94,7 @@ export class Logger {
|
|
|
61
94
|
adapterName?: string,
|
|
62
95
|
) => Promise<{ logger: (text: string) => Promise<void>; logPath: string }> {
|
|
63
96
|
return async (adapterName?: string) => {
|
|
64
|
-
const logPath = this.getLogPath(jobId, adapterName);
|
|
97
|
+
const logPath = await this.getLogPath(jobId, adapterName);
|
|
65
98
|
await this.initFile(logPath);
|
|
66
99
|
|
|
67
100
|
const logger = async (text: string) => {
|
package/src/templates/run_gauntlet.template.md
CHANGED
|
@@ -18,17 +18,24 @@ Execute the autonomous verification suite.
|
|
|
18
18
|
|
|
19
19
|
**Review trust level: medium** — Fix issues you reasonably agree with or believe the human wants fixed. Skip issues that are purely stylistic, subjective, or that you believe the human would not want changed. When you skip an issue, briefly state what was skipped and why.
|
|
20
20
|
|
|
21
|
-
|
|
21
|
+
0. Run `agent-gauntlet clean` to archive any previous log files
|
|
22
|
+
1. Run `agent-gauntlet run`
|
|
22
23
|
2. If it fails:
|
|
23
|
-
-
|
|
24
|
-
- Read the log
|
|
25
|
-
-
|
|
26
|
-
3.
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
24
|
+
- Identify the failed gates from the console output.
|
|
25
|
+
- For CHECK failures: Read the `.log` file path provided in the output.
|
|
26
|
+
- For REVIEW failures: Read the `.json` file path provided in the "Review: <path>" output.
|
|
27
|
+
3. Address the violations:
|
|
28
|
+
- For REVIEW violations: You MUST update the `"status"` and `"result"` fields in the provided `.json` file for EACH violation.
|
|
29
|
+
- Set `"status": "fixed"` and add a brief description to `"result"` for issues you fix.
|
|
30
|
+
- Set `"status": "skipped"` and add a brief reason to `"result"` for issues you skip (based on the trust level).
|
|
31
|
+
- Do NOT modify any other attributes (file, line, issue, priority) in the JSON file.
|
|
32
|
+
- Apply the trust level above when deciding whether to act on AI reviewer feedback.
|
|
33
|
+
4. Run `agent-gauntlet run` again to verify your fixes. It will detect existing logs and automatically switch to verification mode.
|
|
34
|
+
5. Repeat steps 2-4 until one of the following termination conditions is met:
|
|
35
|
+
- "Status: Passed" appears in the output (logs are automatically archived)
|
|
36
|
+
- "Status: Passed with warnings" appears in the output (remaining issues were skipped)
|
|
37
|
+
- Still failing after 3 attempts -> Run `agent-gauntlet clean` to archive logs and reset state.
|
|
38
|
+
6. Provide a summary of the session:
|
|
39
|
+
- Issues Fixed: (list key fixes)
|
|
40
|
+
- Issues Skipped: (list skipped items and reasons)
|
|
41
|
+
- Outstanding Failures: (if any, explain why they couldn't be resolved)
|