@cliwatch/cli-bench 0.5.1 → 0.5.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +33 -0
- package/package.json +11 -2
package/CHANGELOG.md
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
## 0.5.0 (2026-02-11)
|
|
4
|
+
|
|
5
|
+
### Features
|
|
6
|
+
|
|
7
|
+
- **`system_prompt` config field** — append custom instructions to the default agent system prompt via cli-bench.yaml
|
|
8
|
+
|
|
9
|
+
## 0.4.0 (2026-02-08)
|
|
10
|
+
|
|
11
|
+
### Features
|
|
12
|
+
|
|
13
|
+
- **Repeat support** — run tasks N times with `repeat` (per-task or global) for statistical confidence
|
|
14
|
+
- **Threshold checks** — fail CI when pass rates drop below configured thresholds
|
|
15
|
+
- **Conversation traces** — full tool call/result traces uploaded for debugging
|
|
16
|
+
- **Task suite hashing** — detect when task definitions change between runs
|
|
17
|
+
|
|
18
|
+
## 0.3.0 (2026-02-05)
|
|
19
|
+
|
|
20
|
+
- Config file mode (`cli-bench.yaml`)
|
|
21
|
+
- File references (`file://`) with glob support
|
|
22
|
+
- CI metadata collection (git sha, branch, PR number)
|
|
23
|
+
- Upload to CLIWatch dashboard
|
|
24
|
+
|
|
25
|
+
## 0.2.0 (2026-02-02)
|
|
26
|
+
|
|
27
|
+
- Help modes: injected, discoverable, none
|
|
28
|
+
- Assertion-based evaluation (10 assertion types)
|
|
29
|
+
- Concurrent task execution
|
|
30
|
+
|
|
31
|
+
## 0.1.0 (2026-01-28)
|
|
32
|
+
|
|
33
|
+
Initial release.
|
package/package.json
CHANGED
|
@@ -1,13 +1,22 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@cliwatch/cli-bench",
|
|
3
|
-
"version": "0.5.1",
|
|
3
|
+
"version": "0.5.2",
|
|
4
|
+
"description": "LLM CLI agent testing framework — benchmark how well AI models use your CLI tool",
|
|
5
|
+
"keywords": ["cli", "benchmark", "llm", "testing", "ai-agent", "cliwatch", "evaluation"],
|
|
6
|
+
"license": "MIT",
|
|
7
|
+
"homepage": "https://docs.cliwatch.com",
|
|
4
8
|
"type": "module",
|
|
9
|
+
"engines": {
|
|
10
|
+
"node": ">=18"
|
|
11
|
+
},
|
|
5
12
|
"bin": {
|
|
6
13
|
"cli-bench": "./dist/index.js"
|
|
7
14
|
},
|
|
8
15
|
"files": [
|
|
9
16
|
"dist",
|
|
10
|
-
"task_suites"
|
|
17
|
+
"task_suites",
|
|
18
|
+
"LICENSE",
|
|
19
|
+
"CHANGELOG.md"
|
|
11
20
|
],
|
|
12
21
|
"exports": {
|
|
13
22
|
".": {
|