agent-gauntlet 0.1.10 → 0.1.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +55 -87
- package/package.json +4 -2
- package/src/bun-plugins.d.ts +4 -0
- package/src/cli-adapters/claude.ts +139 -108
- package/src/cli-adapters/codex.ts +141 -117
- package/src/cli-adapters/cursor.ts +152 -0
- package/src/cli-adapters/gemini.ts +171 -139
- package/src/cli-adapters/github-copilot.ts +153 -0
- package/src/cli-adapters/index.ts +77 -48
- package/src/commands/check.test.ts +24 -20
- package/src/commands/check.ts +86 -59
- package/src/commands/ci/index.ts +15 -0
- package/src/commands/ci/init.ts +96 -0
- package/src/commands/ci/list-jobs.ts +78 -0
- package/src/commands/detect.test.ts +38 -32
- package/src/commands/detect.ts +89 -61
- package/src/commands/health.test.ts +67 -53
- package/src/commands/health.ts +167 -145
- package/src/commands/help.test.ts +37 -37
- package/src/commands/help.ts +31 -22
- package/src/commands/index.ts +10 -9
- package/src/commands/init.test.ts +120 -107
- package/src/commands/init.ts +514 -417
- package/src/commands/list.test.ts +87 -70
- package/src/commands/list.ts +28 -24
- package/src/commands/rerun.ts +157 -119
- package/src/commands/review.test.ts +26 -20
- package/src/commands/review.ts +86 -59
- package/src/commands/run.test.ts +22 -20
- package/src/commands/run.ts +85 -58
- package/src/commands/shared.ts +44 -35
- package/src/config/ci-loader.ts +33 -0
- package/src/config/ci-schema.ts +52 -0
- package/src/config/loader.test.ts +112 -90
- package/src/config/loader.ts +132 -123
- package/src/config/schema.ts +48 -47
- package/src/config/types.ts +28 -13
- package/src/config/validator.ts +521 -454
- package/src/core/change-detector.ts +122 -104
- package/src/core/entry-point.test.ts +60 -62
- package/src/core/entry-point.ts +120 -74
- package/src/core/job.ts +69 -59
- package/src/core/runner.ts +264 -230
- package/src/gates/check.ts +78 -69
- package/src/gates/result.ts +7 -7
- package/src/gates/review.test.ts +277 -138
- package/src/gates/review.ts +724 -561
- package/src/index.ts +18 -15
- package/src/output/console.ts +253 -214
- package/src/output/logger.ts +66 -52
- package/src/templates/run_gauntlet.template.md +18 -0
- package/src/templates/workflow.yml +77 -0
- package/src/utils/diff-parser.ts +64 -62
- package/src/utils/log-parser.ts +227 -206
- package/src/utils/sanitizer.ts +1 -1
package/README.md
CHANGED
|
@@ -1,122 +1,90 @@
|
|
|
1
1
|
# Agent Gauntlet
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
> Don't just review the agent's code — put it through the gauntlet.
|
|
4
4
|
|
|
5
|
-
|
|
6
|
-
- **Entry points** (paths in your repo)
|
|
7
|
-
- **Check gates** (shell commands: tests, linters, typecheck, etc.)
|
|
8
|
-
- **Review gates** (AI CLI tools run on diffs, with regex-based pass/fail)
|
|
5
|
+
Agent Gauntlet is a configurable “feedback loop” runner for AI-assisted development workflows.
|
|
9
6
|
|
|
10
|
-
|
|
7
|
+
You configure which paths in your repo should trigger which validations — shell commands like tests and linters, plus AI-powered code reviews. When files change, Gauntlet automatically runs the relevant validations and reports results.
|
|
11
8
|
|
|
12
|
-
|
|
9
|
+
For AI reviews, it uses the CLI tool of your choice: Gemini, Codex, Claude Code, GitHub Copilot, or Cursor.
|
|
13
10
|
|
|
14
|
-
|
|
15
|
-
- **Leverage existing subscriptions**: Use the tools you are already paying for.
|
|
16
|
-
- **Dynamic Context**: Agents are invoked in a non-interactive, read-only mode where they can use their own file-reading and search tools to pull additional context from your repository as needed.
|
|
17
|
-
- **Security**: By using standard CLI tools with strict flags (like `--sandbox` or `--allowed-tools`), Agent Gauntlet ensures that agents can read your code to review it without being able to modify your files or escape the repository scope.
|
|
11
|
+
## Features
|
|
18
12
|
|
|
19
|
-
|
|
13
|
+
- **Agent validation loop**: Keep your coding agent on track with automated feedback loops. Detect problems — deterministically and/or non-deterministically — and let your agent fix and Gauntlet verify.
|
|
14
|
+
- **Multi-agent collaboration**: Enable one AI agent to automatically request code reviews from another. For example, if Claude made changes, Gauntlet can request a review from Codex or Gemini — spreading token usage across your subscriptions instead of burning through one.
|
|
15
|
+
- **Leverage existing subscriptions**: Agent Gauntlet is *free* and tool-agnostic, leveraging the AI CLI tools you already have installed.
|
|
16
|
+
- **Easy CI setup**: Define your checks once, run them locally and in GitHub.
|
|
20
17
|
|
|
21
|
-
|
|
22
|
-
- **git** (change detection and diffs)
|
|
23
|
-
- For review gates: one or more supported AI CLIs installed (`gemini`, `codex`, `claude`). For the full list of tools and how they are used, see [CLI Invocation Details](docs/cli-invocation-details.md)
|
|
18
|
+
## Usage Patterns
|
|
24
19
|
|
|
25
|
-
|
|
20
|
+
Agent Gauntlet supports three primary usage patterns, each suited for different development workflows:
|
|
21
|
+
1. Run CLI: `agent-gauntlet run`
|
|
22
|
+
2. Run agent command: `/gauntlet`
|
|
23
|
+
3. Automatically run after agent completes task
|
|
26
24
|
|
|
27
|
-
|
|
25
|
+
The use cases below illustrate when each of these patterns may be used.
|
|
28
26
|
|
|
29
|
-
|
|
30
|
-
```bash
|
|
31
|
-
bun add -g agent-gauntlet
|
|
32
|
-
```
|
|
27
|
+
### 1. Planning Mode
|
|
33
28
|
|
|
34
|
-
**
|
|
35
|
-
```bash
|
|
36
|
-
npm install -g agent-gauntlet
|
|
37
|
-
```
|
|
29
|
+
**Use case:** Generate and review high-level implementation plans before coding.
|
|
38
30
|
|
|
39
|
-
|
|
31
|
+
**Problem Gauntlet solves:** Catch architectural issues and requirement misunderstandings before coding to avoid costly rework.
|
|
40
32
|
|
|
41
|
-
|
|
33
|
+
**Workflow:**
|
|
42
34
|
|
|
43
|
-
|
|
44
|
-
agent-gauntlet
|
|
45
|
-
|
|
35
|
+
1. Create a plan document in your project directory
|
|
36
|
+
2. Run `agent-gauntlet run` from the terminal
|
|
37
|
+
3. Gauntlet detects the new or modified plan and invokes configured AI CLIs to review it
|
|
38
|
+
4. *(Optional)* Ask your assistant to refine the plan based on review feedback
|
|
46
39
|
|
|
47
|
-
|
|
40
|
+
**Note:** Review configuration and prompts are fully customizable. Example prompt: *"Review this plan for completeness and potential issues."*
|
|
48
41
|
|
|
49
|
-
|
|
50
|
-
agent-gauntlet
|
|
51
|
-
```
|
|
42
|
+
### 2. AI-Assisted Development
|
|
52
43
|
|
|
53
|
-
|
|
44
|
+
**Use case:** Pair with an AI coding assistant to implement features with continuous quality checks.
|
|
54
45
|
|
|
55
|
-
-
|
|
46
|
+
**Problem Gauntlet solves:** Catch AI-introduced bugs and quality issues through automated checks and multi-LLM review.
|
|
56
47
|
|
|
57
|
-
|
|
58
|
-
bun install
|
|
59
|
-
```
|
|
48
|
+
**Workflow:**
|
|
60
49
|
|
|
61
|
-
|
|
50
|
+
1. Collaborate with your assistant to implement code changes
|
|
51
|
+
2. Run `/gauntlet` from chat
|
|
52
|
+
3. Gauntlet detects changed files and runs configured checks (linter, tests, type checking, etc.)
|
|
53
|
+
4. Simultaneously, Gauntlet invokes AI CLIs for code review
|
|
54
|
+
5. Assistant reviews results, fixes identified issues, and runs `agent-gauntlet rerun`
|
|
55
|
+
6. Gauntlet verifies fixes and checks for new issues
|
|
56
|
+
7. Process repeats automatically (up to 3 reruns) until all gates pass
|
|
62
57
|
|
|
63
|
-
|
|
64
|
-
bun run build
|
|
65
|
-
```
|
|
58
|
+
### 3. Agentic Implementation
|
|
66
59
|
|
|
67
|
-
|
|
60
|
+
**Use case:** Delegate well-defined tasks to a coding agent for autonomous implementation.
|
|
68
61
|
|
|
69
|
-
-
|
|
62
|
+
**Problem Gauntlet solves:** Enable autonomous agent development with built-in quality gates, eliminating the validation gap when humans aren't in the loop.
|
|
70
63
|
|
|
71
|
-
|
|
72
|
-
agent-gauntlet run
|
|
73
|
-
```
|
|
64
|
+
**Workflow:**
|
|
74
65
|
|
|
75
|
-
|
|
66
|
+
1. Configure your agent to automatically run `/gauntlet` after completing implementation:
|
|
67
|
+
- **Rules files:** Add to `.cursorrules`, `AGENT.md`, or similar
|
|
68
|
+
- **Custom commands:** Create a `/my-dev-workflow` that includes gauntlet
|
|
69
|
+
- **Git hooks:** Use pre-commit hooks to trigger gauntlet
|
|
70
|
+
- **Agent hooks:** Leverage platform features (e.g., Claude's Stop event)
|
|
71
|
+
2. Assign the task to your agent and step away
|
|
72
|
+
3. When you return: the task is complete, reviewed by a different LLM, all issues fixed, and CI checks passing
|
|
76
73
|
|
|
77
|
-
|
|
78
|
-
agent-gauntlet run --gate lint
|
|
79
|
-
```
|
|
74
|
+
**Benefit:** Fully autonomous quality assurance without manual intervention.
|
|
80
75
|
|
|
81
|
-
|
|
76
|
+
## Quick Start
|
|
82
77
|
|
|
83
|
-
|
|
84
|
-
agent-gauntlet
|
|
85
|
-
|
|
78
|
+
1. **Install**: `bun add -g agent-gauntlet`
|
|
79
|
+
2. **Initialize**: `agent-gauntlet init`
|
|
80
|
+
3. **Run**: `agent-gauntlet run`
|
|
86
81
|
|
|
87
|
-
|
|
82
|
+
For basic usage and a configuration guide, see the [Quick Start Guide](docs/quick-start.md).
|
|
88
83
|
|
|
89
|
-
|
|
90
|
-
agent-gauntlet health
|
|
91
|
-
```
|
|
92
|
-
|
|
93
|
-
### Agent loop rules
|
|
94
|
-
|
|
95
|
-
The `.gauntlet/run_gauntlet.md` file defines how AI agents should interact with the gauntlet. By default, agents will terminate after 4 runs (1 initial + 3 fix attempts). You can increase this limit by manually editing the termination conditions in that file.
|
|
96
|
-
|
|
97
|
-
### Configuration layout
|
|
98
|
-
|
|
99
|
-
Agent Gauntlet loads configuration from your repository:
|
|
100
|
-
|
|
101
|
-
```text
|
|
102
|
-
.gauntlet/
|
|
103
|
-
config.yml
|
|
104
|
-
checks/
|
|
105
|
-
*.yml
|
|
106
|
-
reviews/
|
|
107
|
-
*.md
|
|
108
|
-
```
|
|
109
|
-
|
|
110
|
-
- **Project config**: `.gauntlet/config.yml`
|
|
111
|
-
- **Check definitions**: `.gauntlet/checks/*.yml`
|
|
112
|
-
- **Review definitions**: `.gauntlet/reviews/*.md` (filename is the review gate name)
|
|
113
|
-
|
|
114
|
-
### Logs
|
|
115
|
-
|
|
116
|
-
Each job writes a log file under `log_dir` (default: `.gauntlet_logs/`). Filenames are derived from the job id (sanitized).
|
|
117
|
-
|
|
118
|
-
### Documentation
|
|
84
|
+
## Documentation
|
|
119
85
|
|
|
86
|
+
- [Quick Start Guide](docs/quick-start.md) — installation, basic usage, and config layout
|
|
120
87
|
- [User Guide](docs/user-guide.md) — full usage details
|
|
121
88
|
- [Configuration Reference](docs/config-reference.md) — all configuration fields + defaults
|
|
122
89
|
- [CLI Invocation Details](docs/cli-invocation-details.md) — how we securely invoke AI CLIs
|
|
90
|
+
- [Development Guide](docs/development.md) — how to build and develop this project
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "agent-gauntlet",
|
|
3
|
-
"version": "0.1.10",
|
|
3
|
+
"version": "0.1.12",
|
|
4
4
|
"description": "A CLI tool for testing AI coding agents",
|
|
5
5
|
"license": "Apache-2.0",
|
|
6
6
|
"author": "Paul Caplan",
|
|
@@ -33,9 +33,11 @@
|
|
|
33
33
|
},
|
|
34
34
|
"scripts": {
|
|
35
35
|
"build": "bun build --compile --minify --sourcemap ./src/index.ts --outfile bin/agent-gauntlet",
|
|
36
|
-
"test": "bun test"
|
|
36
|
+
"test": "bun test",
|
|
37
|
+
"lint": "biome check src"
|
|
37
38
|
},
|
|
38
39
|
"devDependencies": {
|
|
40
|
+
"@biomejs/biome": "^2.3.11",
|
|
39
41
|
"@types/bun": "latest"
|
|
40
42
|
},
|
|
41
43
|
"peerDependencies": {
|
|
@@ -1,114 +1,145 @@
|
|
|
1
|
-
import { exec } from
|
|
2
|
-
import
|
|
3
|
-
import
|
|
4
|
-
import
|
|
5
|
-
import
|
|
6
|
-
import
|
|
1
|
+
import { exec } from "node:child_process";
|
|
2
|
+
import fs from "node:fs/promises";
|
|
3
|
+
import os from "node:os";
|
|
4
|
+
import path from "node:path";
|
|
5
|
+
import { promisify } from "node:util";
|
|
6
|
+
import { type CLIAdapter, isUsageLimit } from "./index.js";
|
|
7
7
|
|
|
8
8
|
const execAsync = promisify(exec);
|
|
9
9
|
const MAX_BUFFER_BYTES = 10 * 1024 * 1024;
|
|
10
10
|
|
|
11
11
|
export class ClaudeAdapter implements CLIAdapter {
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
12
|
+
name = "claude";
|
|
13
|
+
|
|
14
|
+
async isAvailable(): Promise<boolean> {
|
|
15
|
+
try {
|
|
16
|
+
await execAsync("which claude");
|
|
17
|
+
return true;
|
|
18
|
+
} catch {
|
|
19
|
+
return false;
|
|
20
|
+
}
|
|
21
|
+
}
|
|
22
|
+
|
|
23
|
+
async checkHealth(options?: { checkUsageLimit?: boolean }): Promise<{
|
|
24
|
+
available: boolean;
|
|
25
|
+
status: "healthy" | "missing" | "unhealthy";
|
|
26
|
+
message?: string;
|
|
27
|
+
}> {
|
|
28
|
+
const available = await this.isAvailable();
|
|
29
|
+
if (!available) {
|
|
30
|
+
return {
|
|
31
|
+
available: false,
|
|
32
|
+
status: "missing",
|
|
33
|
+
message: "Command not found",
|
|
34
|
+
};
|
|
35
|
+
}
|
|
36
|
+
|
|
37
|
+
if (options?.checkUsageLimit) {
|
|
38
|
+
try {
|
|
39
|
+
// Try a lightweight command to check if we're rate limited
|
|
40
|
+
// We use a simple "hello" prompt to avoid "No messages returned" errors from empty input
|
|
41
|
+
const { stdout, stderr } = await execAsync(
|
|
42
|
+
'echo "hello" | claude -p --max-turns 1',
|
|
43
|
+
{ timeout: 10000 },
|
|
44
|
+
);
|
|
45
|
+
|
|
46
|
+
const combined = (stdout || "") + (stderr || "");
|
|
47
|
+
if (isUsageLimit(combined)) {
|
|
48
|
+
return {
|
|
49
|
+
available: true,
|
|
50
|
+
status: "unhealthy",
|
|
51
|
+
message: "Usage limit exceeded",
|
|
52
|
+
};
|
|
53
|
+
}
|
|
54
|
+
|
|
55
|
+
return { available: true, status: "healthy", message: "Ready" };
|
|
56
|
+
} catch (error: unknown) {
|
|
57
|
+
const execError = error as {
|
|
58
|
+
stderr?: string;
|
|
59
|
+
stdout?: string;
|
|
60
|
+
message?: string;
|
|
61
|
+
};
|
|
62
|
+
const stderr = execError.stderr || "";
|
|
63
|
+
const stdout = execError.stdout || "";
|
|
64
|
+
const combined = stderr + stdout;
|
|
65
|
+
|
|
66
|
+
if (isUsageLimit(combined)) {
|
|
67
|
+
return {
|
|
68
|
+
available: true,
|
|
69
|
+
status: "unhealthy",
|
|
70
|
+
message: "Usage limit exceeded",
|
|
71
|
+
};
|
|
72
|
+
}
|
|
73
|
+
|
|
74
|
+
// Since we sent a valid prompt ("hello"), any other error implies the tool is broken
|
|
75
|
+
// Extract a brief error message if possible
|
|
76
|
+
const cleanError =
|
|
77
|
+
combined.split("\n")[0]?.trim() ||
|
|
78
|
+
execError.message ||
|
|
79
|
+
"Command failed";
|
|
80
|
+
return {
|
|
81
|
+
available: true,
|
|
82
|
+
status: "unhealthy",
|
|
83
|
+
message: `Error: ${cleanError}`,
|
|
84
|
+
};
|
|
85
|
+
}
|
|
86
|
+
}
|
|
87
|
+
|
|
88
|
+
return { available: true, status: "healthy", message: "Ready" };
|
|
89
|
+
}
|
|
90
|
+
|
|
91
|
+
getProjectCommandDir(): string | null {
|
|
92
|
+
return ".claude/commands";
|
|
93
|
+
}
|
|
94
|
+
|
|
95
|
+
getUserCommandDir(): string | null {
|
|
96
|
+
// Claude supports user-level commands at ~/.claude/commands
|
|
97
|
+
return path.join(os.homedir(), ".claude", "commands");
|
|
98
|
+
}
|
|
99
|
+
|
|
100
|
+
getCommandExtension(): string {
|
|
101
|
+
return ".md";
|
|
102
|
+
}
|
|
103
|
+
|
|
104
|
+
canUseSymlink(): boolean {
|
|
105
|
+
// Claude uses the same Markdown format as our canonical file
|
|
106
|
+
return true;
|
|
107
|
+
}
|
|
108
|
+
|
|
109
|
+
transformCommand(markdownContent: string): string {
|
|
110
|
+
// Claude uses the same Markdown format, no transformation needed
|
|
111
|
+
return markdownContent;
|
|
112
|
+
}
|
|
113
|
+
|
|
114
|
+
async execute(opts: {
|
|
115
|
+
prompt: string;
|
|
116
|
+
diff: string;
|
|
117
|
+
model?: string;
|
|
118
|
+
timeoutMs?: number;
|
|
119
|
+
}): Promise<string> {
|
|
120
|
+
const fullContent = `${opts.prompt}\n\n--- DIFF ---\n${opts.diff}`;
|
|
121
|
+
|
|
122
|
+
const tmpDir = os.tmpdir();
|
|
123
|
+
// Include process.pid for uniqueness across concurrent processes
|
|
124
|
+
const tmpFile = path.join(
|
|
125
|
+
tmpDir,
|
|
126
|
+
`gauntlet-claude-${process.pid}-${Date.now()}.txt`,
|
|
127
|
+
);
|
|
128
|
+
await fs.writeFile(tmpFile, fullContent);
|
|
129
|
+
|
|
130
|
+
try {
|
|
131
|
+
// Recommended invocation per spec:
|
|
132
|
+
// -p: non-interactive print mode
|
|
133
|
+
// --allowedTools: explicitly restricts to read-only tools
|
|
134
|
+
// --max-turns: caps agentic turns
|
|
135
|
+
const cmd = `cat "${tmpFile}" | claude -p --allowedTools "Read,Glob,Grep" --max-turns 10`;
|
|
136
|
+
const { stdout } = await execAsync(cmd, {
|
|
137
|
+
timeout: opts.timeoutMs,
|
|
138
|
+
maxBuffer: MAX_BUFFER_BYTES,
|
|
139
|
+
});
|
|
140
|
+
return stdout;
|
|
141
|
+
} finally {
|
|
142
|
+
await fs.unlink(tmpFile).catch(() => {});
|
|
143
|
+
}
|
|
144
|
+
}
|
|
114
145
|
}
|