@cliwatch/cli-bench 0.5.0 → 0.5.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md ADDED
@@ -0,0 +1,33 @@
1
+ # Changelog
2
+
3
+ ## 0.5.0 (2026-02-11)
4
+
5
+ ### Features
6
+
7
+ - **`system_prompt` config field** — append custom instructions to the default agent system prompt via cli-bench.yaml
8
+
9
+ ## 0.4.0 (2026-02-08)
10
+
11
+ ### Features
12
+
13
+ - **Repeat support** — run tasks N times with `repeat` (per-task or global) for statistical confidence
14
+ - **Threshold checks** — fail CI when pass rates drop below configured thresholds
15
+ - **Conversation traces** — full tool call/result traces uploaded for debugging
16
+ - **Task suite hashing** — detect when task definitions change between runs
17
+
18
+ ## 0.3.0 (2026-02-05)
19
+
20
+ - Config file mode (`cli-bench.yaml`)
21
+ - File references (`file://`) with glob support
22
+ - CI metadata collection (git sha, branch, PR number)
23
+ - Upload to CLIWatch dashboard
24
+
25
+ ## 0.2.0 (2026-02-02)
26
+
27
+ - Help modes: injected, discoverable, none
28
+ - Assertion-based evaluation (10 assertion types)
29
+ - Concurrent task execution
30
+
31
+ ## 0.1.0 (2026-01-28)
32
+
33
+ Initial release.
package/README.md CHANGED
@@ -79,7 +79,9 @@ Each referenced file is a plain array of tasks:
79
79
  | `concurrency` | No | Max concurrent API calls (default: 3) |
80
80
  | `workdir` | No | Working directory (default: temp dir per task) |
81
81
  | `upload` | No | auto, always, never (default: auto) |
82
- | `backend_url` | No | Dashboard URL (default: https://api.cliwatch.dev) |
82
+ | `repeat` | No | Run all tasks N times (default: 1, range: 1-100) |
83
+ | `system_prompt` | No | Custom prompt appended to the default agent system message |
84
+ | `thresholds` | No | Pass rate thresholds (see [docs](https://docs.cliwatch.com/configuration/thresholds)) |
83
85
  | `tasks` | Yes | Array of tasks or `file://` references |
84
86
 
85
87
  ## Assertion types
@@ -109,7 +111,7 @@ steps:
109
111
  - run: npm install -g my-cli
110
112
  - run: npx @cliwatch/cli-bench
111
113
  env:
112
- ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
114
+ AI_GATEWAY_API_KEY: ${{ secrets.AI_GATEWAY_API_KEY }}
113
115
  CLIWATCH_API_KEY: ${{ secrets.CLIWATCH_API_KEY }} # optional, uploads to dashboard
114
116
  ```
115
117
 
@@ -119,11 +121,8 @@ No Docker required. Commands run directly on the CI runner.
119
121
 
120
122
  | Variable | Description |
121
123
  |----------|-------------|
122
- | `ANTHROPIC_API_KEY` | Anthropic API key (for `anthropic/*` models) |
123
- | `OPENAI_API_KEY` | OpenAI API key (for `openai/*` models) |
124
- | `GOOGLE_GENERATIVE_AI_API_KEY` | Google AI API key (for `google/*` models) |
125
- | `CLIWATCH_API_KEY` | API key for uploading results to cliwatch.dev |
126
- | `CLIWATCH_BACKEND_URL` | Override backend URL (default: `https://api.cliwatch.dev`) |
124
+ | `AI_GATEWAY_API_KEY` | Vercel AI Gateway key — provides access to all models |
125
+ | `CLIWATCH_API_KEY` | API key from [app.cliwatch.com](https://app.cliwatch.com) for uploading results |
127
126
 
128
127
  ## Uploading results
129
128
 
package/dist/index.js CHANGED
@@ -32,7 +32,7 @@ async function main() {
32
32
  }
33
33
  return;
34
34
  }
35
- console.log('@cliwatch/cli-bench v0.5.0');
35
+ console.log('@cliwatch/cli-bench v0.5.1');
36
36
  // Try to find a config file
37
37
  const configPath = await resolveConfigFile(config.configFile);
38
38
  let reports;
package/package.json CHANGED
@@ -1,13 +1,22 @@
1
1
  {
2
2
  "name": "@cliwatch/cli-bench",
3
- "version": "0.5.0",
3
+ "version": "0.5.2",
4
+ "description": "LLM CLI agent testing framework — benchmark how well AI models use your CLI tool",
5
+ "keywords": ["cli", "benchmark", "llm", "testing", "ai-agent", "cliwatch", "evaluation"],
6
+ "license": "MIT",
7
+ "homepage": "https://docs.cliwatch.com",
4
8
  "type": "module",
9
+ "engines": {
10
+ "node": ">=18"
11
+ },
5
12
  "bin": {
6
13
  "cli-bench": "./dist/index.js"
7
14
  },
8
15
  "files": [
9
16
  "dist",
10
- "task_suites"
17
+ "task_suites",
18
+ "LICENSE",
19
+ "CHANGELOG.md"
11
20
  ],
12
21
  "exports": {
13
22
  ".": {