@cliwatch/cli-bench 0.5.0 → 0.5.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +33 -0
- package/README.md +6 -7
- package/dist/index.js +1 -1
- package/package.json +11 -2
package/CHANGELOG.md
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
## 0.5.0 (2026-02-11)
|
|
4
|
+
|
|
5
|
+
### Features
|
|
6
|
+
|
|
7
|
+
- **`system_prompt` config field** — append custom instructions to the default agent system prompt via cli-bench.yaml
|
|
8
|
+
|
|
9
|
+
## 0.4.0 (2026-02-08)
|
|
10
|
+
|
|
11
|
+
### Features
|
|
12
|
+
|
|
13
|
+
- **Repeat support** — run tasks N times with `repeat` (per-task or global) for statistical confidence
|
|
14
|
+
- **Threshold checks** — fail CI when pass rates drop below configured thresholds
|
|
15
|
+
- **Conversation traces** — full tool call/result traces uploaded for debugging
|
|
16
|
+
- **Task suite hashing** — detect when task definitions change between runs
|
|
17
|
+
|
|
18
|
+
## 0.3.0 (2026-02-05)
|
|
19
|
+
|
|
20
|
+
- Config file mode (`cli-bench.yaml`)
|
|
21
|
+
- File references (`file://`) with glob support
|
|
22
|
+
- CI metadata collection (git sha, branch, PR number)
|
|
23
|
+
- Upload to CLIWatch dashboard
|
|
24
|
+
|
|
25
|
+
## 0.2.0 (2026-02-02)
|
|
26
|
+
|
|
27
|
+
- Help modes: injected, discoverable, none
|
|
28
|
+
- Assertion-based evaluation (10 assertion types)
|
|
29
|
+
- Concurrent task execution
|
|
30
|
+
|
|
31
|
+
## 0.1.0 (2026-01-28)
|
|
32
|
+
|
|
33
|
+
Initial release.
|
package/README.md
CHANGED
|
@@ -79,7 +79,9 @@ Each referenced file is a plain array of tasks:
|
|
|
79
79
|
| `concurrency` | No | Max concurrent API calls (default: 3) |
|
|
80
80
|
| `workdir` | No | Working directory (default: temp dir per task) |
|
|
81
81
|
| `upload` | No | auto, always, never (default: auto) |
|
|
82
|
-
| `
|
|
82
|
+
| `repeat` | No | Run all tasks N times (default: 1, range: 1-100) |
|
|
83
|
+
| `system_prompt` | No | Custom prompt appended to the default agent system message |
|
|
84
|
+
| `thresholds` | No | Pass rate thresholds (see [docs](https://docs.cliwatch.com/configuration/thresholds)) |
|
|
83
85
|
| `tasks` | Yes | Array of tasks or `file://` references |
|
|
84
86
|
|
|
85
87
|
## Assertion types
|
|
@@ -109,7 +111,7 @@ steps:
|
|
|
109
111
|
- run: npm install -g my-cli
|
|
110
112
|
- run: npx @cliwatch/cli-bench
|
|
111
113
|
env:
|
|
112
|
-
|
|
114
|
+
AI_GATEWAY_API_KEY: ${{ secrets.AI_GATEWAY_API_KEY }}
|
|
113
115
|
CLIWATCH_API_KEY: ${{ secrets.CLIWATCH_API_KEY }} # optional, uploads to dashboard
|
|
114
116
|
```
|
|
115
117
|
|
|
@@ -119,11 +121,8 @@ No Docker required. Commands run directly on the CI runner.
|
|
|
119
121
|
|
|
120
122
|
| Variable | Description |
|
|
121
123
|
|----------|-------------|
|
|
122
|
-
| `
|
|
123
|
-
| `
|
|
124
|
-
| `GOOGLE_GENERATIVE_AI_API_KEY` | Google AI API key (for `google/*` models) |
|
|
125
|
-
| `CLIWATCH_API_KEY` | API key for uploading results to cliwatch.dev |
|
|
126
|
-
| `CLIWATCH_BACKEND_URL` | Override backend URL (default: `https://api.cliwatch.dev`) |
|
|
124
|
+
| `AI_GATEWAY_API_KEY` | Vercel AI Gateway key — provides access to all models |
|
|
125
|
+
| `CLIWATCH_API_KEY` | API key from [app.cliwatch.com](https://app.cliwatch.com) for uploading results |
|
|
127
126
|
|
|
128
127
|
## Uploading results
|
|
129
128
|
|
package/dist/index.js
CHANGED
package/package.json
CHANGED
|
@@ -1,13 +1,22 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@cliwatch/cli-bench",
|
|
3
|
-
"version": "0.5.0",
|
|
3
|
+
"version": "0.5.2",
|
|
4
|
+
"description": "LLM CLI agent testing framework — benchmark how well AI models use your CLI tool",
|
|
5
|
+
"keywords": ["cli", "benchmark", "llm", "testing", "ai-agent", "cliwatch", "evaluation"],
|
|
6
|
+
"license": "MIT",
|
|
7
|
+
"homepage": "https://docs.cliwatch.com",
|
|
4
8
|
"type": "module",
|
|
9
|
+
"engines": {
|
|
10
|
+
"node": ">=18"
|
|
11
|
+
},
|
|
5
12
|
"bin": {
|
|
6
13
|
"cli-bench": "./dist/index.js"
|
|
7
14
|
},
|
|
8
15
|
"files": [
|
|
9
16
|
"dist",
|
|
10
|
-
"task_suites"
|
|
17
|
+
"task_suites",
|
|
18
|
+
"LICENSE",
|
|
19
|
+
"CHANGELOG.md"
|
|
11
20
|
],
|
|
12
21
|
"exports": {
|
|
13
22
|
".": {
|