caliper-eval 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- caliper_eval-0.1.0/.gitignore +12 -0
- caliper_eval-0.1.0/PKG-INFO +588 -0
- caliper_eval-0.1.0/README.md +552 -0
- caliper_eval-0.1.0/assets/caliper-banner.png +0 -0
- caliper_eval-0.1.0/assets/caliper-demo.gif +0 -0
- caliper_eval-0.1.0/assets/demo.tape +45 -0
- caliper_eval-0.1.0/assets/logo.png +0 -0
- caliper_eval-0.1.0/caliper/__init__.py +0 -0
- caliper_eval-0.1.0/caliper/commands/__init__.py +0 -0
- caliper_eval-0.1.0/caliper/commands/install_skill.py +56 -0
- caliper_eval-0.1.0/caliper/commands/list_cmd.py +94 -0
- caliper_eval-0.1.0/caliper/commands/new.py +16 -0
- caliper_eval-0.1.0/caliper/commands/report.py +56 -0
- caliper_eval-0.1.0/caliper/commands/run.py +110 -0
- caliper_eval-0.1.0/caliper/commands/validate.py +66 -0
- caliper_eval-0.1.0/caliper/harness/__init__.py +31 -0
- caliper_eval-0.1.0/caliper/harness/base.py +50 -0
- caliper_eval-0.1.0/caliper/harness/claude_api.py +74 -0
- caliper_eval-0.1.0/caliper/harness/claude_code.py +372 -0
- caliper_eval-0.1.0/caliper/harness/codex.py +278 -0
- caliper_eval-0.1.0/caliper/harness/openai_api.py +61 -0
- caliper_eval-0.1.0/caliper/judge/__init__.py +42 -0
- caliper_eval-0.1.0/caliper/judge/autorater.py +129 -0
- caliper_eval-0.1.0/caliper/judge/base.py +28 -0
- caliper_eval-0.1.0/caliper/judge/claude_code_judge.py +92 -0
- caliper_eval-0.1.0/caliper/judge/codex_judge.py +180 -0
- caliper_eval-0.1.0/caliper/judge/openai_api_judge.py +89 -0
- caliper_eval-0.1.0/caliper/judge/script_assert.py +210 -0
- caliper_eval-0.1.0/caliper/main.py +30 -0
- caliper_eval-0.1.0/caliper/reporter.py +223 -0
- caliper_eval-0.1.0/caliper/resources/__init__.py +1 -0
- caliper_eval-0.1.0/caliper/resources/evaluate_skill/SKILL.md +252 -0
- caliper_eval-0.1.0/caliper/resources/evaluate_skill/__init__.py +1 -0
- caliper_eval-0.1.0/caliper/runner.py +308 -0
- caliper_eval-0.1.0/caliper/schema/__init__.py +0 -0
- caliper_eval-0.1.0/caliper/schema/results.py +75 -0
- caliper_eval-0.1.0/caliper/schema/spec.py +87 -0
- caliper_eval-0.1.0/caliper/scoring.py +33 -0
- caliper_eval-0.1.0/caliper/wizard.py +159 -0
- caliper_eval-0.1.0/pyproject.toml +51 -0
- caliper_eval-0.1.0/skills/evaluate-skill/SKILL.md +252 -0
- caliper_eval-0.1.0/skills/evaluate-skill/evaluate-skill.eval.yaml +163 -0
- caliper_eval-0.1.0/skills/evaluate-skill/references/evals/claude-code-smoke/SKILL.md +14 -0
- caliper_eval-0.1.0/skills/evaluate-skill/references/evals/claude-code-smoke/claude-code-smoke.eval.yaml +27 -0
- caliper_eval-0.1.0/skills/evaluate-skill/references/evals/commit-simple/SKILL.md +29 -0
- caliper_eval-0.1.0/skills/evaluate-skill/references/evals/commit-simple/commit-simple.eval.yaml +84 -0
- caliper_eval-0.1.0/skills/evaluate-skill/references/evals/screenshot/SKILL.md +267 -0
- caliper_eval-0.1.0/skills/evaluate-skill/references/evals/screenshot/screenshot.eval.yaml +31 -0
- caliper_eval-0.1.0/skills/evaluate-skill/references/evals/summarize/SKILL.md +87 -0
- caliper_eval-0.1.0/skills/evaluate-skill/references/evals/summarize/summarize.eval.yaml +50 -0
- caliper_eval-0.1.0/skills/evaluate-skill/references/evals/tdd/SKILL.md +371 -0
- caliper_eval-0.1.0/skills/evaluate-skill/references/evals/tdd/tdd.eval.yaml +88 -0
- caliper_eval-0.1.0/skills/evaluate-skill/references/examples/simple.eval.yaml +29 -0
- caliper_eval-0.1.0/tests/test_claude_harness.py +107 -0
- caliper_eval-0.1.0/tests/test_claude_judge.py +45 -0
- caliper_eval-0.1.0/tests/test_codex_harness.py +206 -0
- caliper_eval-0.1.0/tests/test_codex_judge.py +108 -0
- caliper_eval-0.1.0/tests/test_install_skill.py +78 -0
- caliper_eval-0.1.0/tests/test_reporter.py +10 -0
- caliper_eval-0.1.0/tests/test_runner.py +77 -0
|
@@ -0,0 +1,588 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: caliper-eval
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: CLI for evaluating Claude Code skills and AI agents
|
|
5
|
+
Project-URL: Homepage, https://github.com/edonadei/caliper
|
|
6
|
+
Project-URL: Repository, https://github.com/edonadei/caliper
|
|
7
|
+
Project-URL: Issues, https://github.com/edonadei/caliper/issues
|
|
8
|
+
Author: Emrick Donadei
|
|
9
|
+
License-Expression: MIT
|
|
10
|
+
Keywords: agents,ai,claude,codex,evals,skills
|
|
11
|
+
Classifier: Development Status :: 3 - Alpha
|
|
12
|
+
Classifier: Environment :: Console
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Operating System :: OS Independent
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
21
|
+
Classifier: Topic :: Software Development :: Testing
|
|
22
|
+
Requires-Python: >=3.10
|
|
23
|
+
Requires-Dist: anthropic>=0.28
|
|
24
|
+
Requires-Dist: pydantic>=2
|
|
25
|
+
Requires-Dist: pyyaml>=6
|
|
26
|
+
Requires-Dist: rich>=13
|
|
27
|
+
Requires-Dist: typer>=0.15
|
|
28
|
+
Provides-Extra: codex
|
|
29
|
+
Requires-Dist: openai>=1.0; extra == 'codex'
|
|
30
|
+
Provides-Extra: dev
|
|
31
|
+
Requires-Dist: pytest; extra == 'dev'
|
|
32
|
+
Requires-Dist: ruff; extra == 'dev'
|
|
33
|
+
Provides-Extra: openai
|
|
34
|
+
Requires-Dist: openai>=1.0; extra == 'openai'
|
|
35
|
+
Description-Content-Type: text/markdown
|
|
36
|
+
|
|
37
|
+
# Caliper - Agent Skill Evaluation Harness
|
|
38
|
+
|
|
39
|
+
<p align="center">
|
|
40
|
+
<img src="https://raw.githubusercontent.com/edonadei/caliper/main/assets/caliper-banner.png" alt="Caliper banner">
|
|
41
|
+
</p>
|
|
42
|
+
|
|
43
|
+
Evaluate AI agent skills with repeatable tasks, automated judging, and pass@k
|
|
44
|
+
scoring.
|
|
45
|
+
|
|
46
|
+
Caliper is a local-first evaluation harness for Claude Code skills, Codex
|
|
47
|
+
skills, and API-backed agents. It runs a skill against one or more task specs,
|
|
48
|
+
records every attempt, judges the result with an LLM and/or deterministic Python
|
|
49
|
+
assertions, and saves reproducible result files you can inspect later.
|
|
50
|
+
|
|
51
|
+
Use Caliper when you want to answer practical questions like:
|
|
52
|
+
|
|
53
|
+
- Did this skill actually get better after my prompt edit?
|
|
54
|
+
- Does it still pass the workflows it passed last week?
|
|
55
|
+
- Does Codex or Claude Code run this skill more reliably for my use case?
|
|
56
|
+
- Is the skill doing the work, or would the baseline agent pass without it?
|
|
57
|
+
- Can contributors change a skill without relying on subjective manual testing?
|
|
58
|
+
|
|
59
|
+
Caliper is especially useful for agent skills because skills are hard to review
|
|
60
|
+
with ordinary unit tests. A good skill is part prompt, part workflow, part tool
|
|
61
|
+
contract. Caliper turns that behavior into versioned eval specs, repeatable
|
|
62
|
+
runs, pass/fail judgments, and saved transcripts.
|
|
63
|
+
|
|
64
|
+

|
|
65
|
+
|
|
66
|
+
## Highlights
|
|
67
|
+
|
|
68
|
+
- **Skill-first evaluation** for Claude Code, Codex, Anthropic API, and OpenAI
|
|
69
|
+
API backends.
|
|
70
|
+
- **Independent agent and judge backends**, so you can test a Codex skill with a
|
|
71
|
+
Claude judge, a Claude Code skill with a Codex judge, or keep everything on one
|
|
72
|
+
provider.
|
|
73
|
+
- **Natural-language and deterministic checks** through `expect:` and `assert:`.
|
|
74
|
+
- **pass@k scoring** for measuring reliability across repeated attempts.
|
|
75
|
+
- **Baseline runs** to show whether the skill improves over an unassisted agent.
|
|
76
|
+
- **Attempt isolation** with fresh temporary homes and no session history.
|
|
77
|
+
- **Reproducible result files** that snapshot the skill content, referenced local
|
|
78
|
+
files, and git SHA when available.
|
|
79
|
+
- **Agent-installable evaluator skill** so Claude Code or Codex can help create,
|
|
80
|
+
validate, run, and interpret evals.
|
|
81
|
+
|
|
82
|
+
## When To Use It
|
|
83
|
+
|
|
84
|
+
Caliper works well for:
|
|
85
|
+
|
|
86
|
+
- evaluating Claude Code slash-command skills
|
|
87
|
+
- evaluating Codex skills
|
|
88
|
+
- comparing agent backends on the same task suite
|
|
89
|
+
- regression-testing prompt and workflow changes
|
|
90
|
+
- checking coding, review, refactor, summarization, screenshot, and file-writing
|
|
91
|
+
behaviors
|
|
92
|
+
- mixing LLM judgment with exact checks for files, JSON, command output, images,
|
|
93
|
+
and repository state
|
|
94
|
+
|
|
95
|
+
It is not a replacement for normal unit tests. Use unit tests for deterministic
|
|
96
|
+
library behavior. Use Caliper for agent behavior where the output depends on a
|
|
97
|
+
model following instructions, using tools, and completing a workflow.
|
|
98
|
+
|
|
99
|
+
## Install
|
|
100
|
+
|
|
101
|
+
The fastest way to install Caliper from GitHub is:
|
|
102
|
+
|
|
103
|
+
```bash
|
|
104
|
+
pipx install git+https://github.com/edonadei/caliper.git
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
To install the released package from PyPI instead:
|
|
108
|
+
|
|
109
|
+
```bash
|
|
110
|
+
pipx install caliper-eval
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
Both install methods expose the `caliper` command:
|
|
114
|
+
|
|
115
|
+
```bash
|
|
116
|
+
caliper --help
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
For local development from the repository root:
|
|
120
|
+
|
|
121
|
+
```bash
|
|
122
|
+
pip install -e .
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
To also install the development tools (`pytest`, `ruff`) and optional OpenAI API support:
|
|
126
|
+
|
|
127
|
+
```bash
|
|
128
|
+
pip install -e ".[dev,openai]"
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
Caliper requires Python 3.10 or newer.
|
|
132
|
+
|
|
133
|
+
## Backend Setup
|
|
134
|
+
|
|
135
|
+
Caliper can run the agent under test and the judge through different backends.
|
|
136
|
+
|
|
137
|
+
| Role | Claude Code CLI | Codex CLI | API backends |
|
|
138
|
+
|---|---|---|---|
|
|
139
|
+
| Agent under test | `skill.backend: claude-code` | `skill.backend: codex` | `skill.backend: claude-api` or `openai-api` |
|
|
140
|
+
| LLM judge | `judge.backend: claude-code` | `judge.backend: codex` | `judge.backend: claude-api` or `openai-api` |
|
|
141
|
+
| Auth/billing | Claude Code subscription/auth | Codex CLI subscription/auth | Provider API key/billing |
|
|
142
|
+
| Transcript | Claude `stream-json` tool-call transcript | Final Codex text output | Final API response text |
|
|
143
|
+
|
|
144
|
+
### Claude Code
|
|
145
|
+
|
|
146
|
+
Install and authenticate the `claude` CLI. `backend: claude-code` uses your
|
|
147
|
+
normal Claude Code CLI auth.
|
|
148
|
+
|
|
149
|
+
If you explicitly use `backend: claude-api`, set:
|
|
150
|
+
|
|
151
|
+
```bash
|
|
152
|
+
export ANTHROPIC_API_KEY=...
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
### Codex
|
|
156
|
+
|
|
157
|
+
Install and authenticate the Codex CLI:
|
|
158
|
+
|
|
159
|
+
```bash
|
|
160
|
+
npm install -g @openai/codex
|
|
161
|
+
codex login
|
|
162
|
+
codex --version
|
|
163
|
+
```
|
|
164
|
+
|
|
165
|
+
`backend: codex` calls Codex with `codex exec`. It does not fall back to the
|
|
166
|
+
OpenAI API. If the CLI is unavailable or cannot authenticate, Caliper reports a
|
|
167
|
+
backend configuration error.
|
|
168
|
+
|
|
169
|
+
When the Codex desktop app is installed, Caliper prefers the app-bundled Codex
|
|
170
|
+
CLI over an older `codex` found on `PATH`. Set `CODEX_CLI_PATH` to force a
|
|
171
|
+
specific CLI binary.
|
|
172
|
+
|
|
173
|
+
If you explicitly use `backend: openai-api`, set:
|
|
174
|
+
|
|
175
|
+
```bash
|
|
176
|
+
export OPENAI_API_KEY=...
|
|
177
|
+
```
|
|
178
|
+
|
|
179
|
+
## Quick Start
|
|
180
|
+
|
|
181
|
+
Create an eval spec:
|
|
182
|
+
|
|
183
|
+
```yaml
|
|
184
|
+
# my-skill.eval.yaml
|
|
185
|
+
skill:
|
|
186
|
+
path: ./SKILL.md
|
|
187
|
+
backend: codex
|
|
188
|
+
|
|
189
|
+
judge:
|
|
190
|
+
backend: codex
|
|
191
|
+
|
|
192
|
+
tasks:
|
|
193
|
+
- name: Produces the expected answer
|
|
194
|
+
prompt: "Use this skill to answer: what is 2 + 2?"
|
|
195
|
+
expect: "The assistant answers 4."
|
|
196
|
+
```
|
|
197
|
+
|
|
198
|
+
Run it:
|
|
199
|
+
|
|
200
|
+
```bash
|
|
201
|
+
caliper run my-skill.eval.yaml --k 3 --baseline
|
|
202
|
+
```
|
|
203
|
+
|
|
204
|
+
Example output:
|
|
205
|
+
|
|
206
|
+
```text
|
|
207
|
+
CALIPER - my-skill - k=3 - codex
|
|
208
|
+
|
|
209
|
+
ID Task k (3) pass@k
|
|
210
|
+
task-001 Produces the expected answer 2/3 96.3% PARTIAL
|
|
211
|
+
|
|
212
|
+
With skill 96.3% ###################-
|
|
213
|
+
No skill 70.4% ##############------
|
|
214
|
+
Delta +25.9% up
|
|
215
|
+
Results saved to .caliper/results/my-skill/2026-05-22T14-23-01Z.json
|
|
216
|
+
```
|
|
217
|
+
|
|
218
|
+
Browse results:
|
|
219
|
+
|
|
220
|
+
```bash
|
|
221
|
+
caliper list
|
|
222
|
+
caliper report my-skill
|
|
223
|
+
```
|
|
224
|
+
|
|
225
|
+
Validate a spec before running:
|
|
226
|
+
|
|
227
|
+
```bash
|
|
228
|
+
caliper validate my-skill.eval.yaml
|
|
229
|
+
```
|
|
230
|
+
|
|
231
|
+
## Recommended Workflow
|
|
232
|
+
|
|
233
|
+
1. Create a small eval spec for one behavior you care about.
|
|
234
|
+
2. Run it with `--k 1` while iterating on the spec.
|
|
235
|
+
3. Add deterministic `assert:` checks for facts an LLM judge should not guess.
|
|
236
|
+
4. Run with `--k 3` or higher once the task is stable.
|
|
237
|
+
5. Use `--baseline` to measure whether the skill helps over the raw agent.
|
|
238
|
+
6. Commit the spec beside the skill so future contributors can run the same
|
|
239
|
+
evaluation before changing behavior.
|
|
240
|
+
|
|
241
|
+
```bash
|
|
242
|
+
caliper run path/to/skill.eval.yaml --k 3 --baseline --verbose
|
|
243
|
+
```
|
|
244
|
+
|
|
245
|
+
## Install The Evaluator Skill
|
|
246
|
+
|
|
247
|
+
The repository includes an `evaluate-skill` agent skill. Installing it lets
|
|
248
|
+
Claude Code or Codex help you create eval specs, validate them, run Caliper, and
|
|
249
|
+
summarize results from inside your normal agent workflow.
|
|
250
|
+
|
|
251
|
+
If you installed the CLI, use the bundled installer:
|
|
252
|
+
|
|
253
|
+
```bash
|
|
254
|
+
caliper install-skill codex
|
|
255
|
+
caliper install-skill claude-code
|
|
256
|
+
```
|
|
257
|
+
|
|
258
|
+
Preview the destination without writing files:
|
|
259
|
+
|
|
260
|
+
```bash
|
|
261
|
+
caliper install-skill codex --dry-run
|
|
262
|
+
```
|
|
263
|
+
|
|
264
|
+
Use `--force` to overwrite an existing installed copy.
|
|
265
|
+
|
|
266
|
+
### Claude Code
|
|
267
|
+
|
|
268
|
+
Without the CLI installer, copy the skill into Claude Code commands:
|
|
269
|
+
|
|
270
|
+
```bash
|
|
271
|
+
mkdir -p ~/.claude/commands
|
|
272
|
+
curl -fsSL https://raw.githubusercontent.com/edonadei/caliper/main/skills/evaluate-skill/SKILL.md \
|
|
273
|
+
-o ~/.claude/commands/evaluate-skill.md
|
|
274
|
+
```
|
|
275
|
+
|
|
276
|
+
Then use it in Claude Code:
|
|
277
|
+
|
|
278
|
+
```text
|
|
279
|
+
/evaluate-skill validate my-skill.eval.yaml
|
|
280
|
+
/evaluate-skill run my-skill.eval.yaml --k 3
|
|
281
|
+
```
|
|
282
|
+
|
|
283
|
+
### Codex
|
|
284
|
+
|
|
285
|
+
Without the CLI installer, install the skill in Codex:
|
|
286
|
+
|
|
287
|
+
```bash
|
|
288
|
+
mkdir -p ~/.codex/skills/evaluate-skill
|
|
289
|
+
curl -fsSL https://raw.githubusercontent.com/edonadei/caliper/main/skills/evaluate-skill/SKILL.md \
|
|
290
|
+
-o ~/.codex/skills/evaluate-skill/SKILL.md
|
|
291
|
+
```
|
|
292
|
+
|
|
293
|
+
Make sure `caliper` is on `PATH` for Codex sessions. If you installed Caliper in
|
|
294
|
+
editable mode, the generated console script is usually enough.
|
|
295
|
+
|
|
296
|
+
Then ask Codex:
|
|
297
|
+
|
|
298
|
+
```text
|
|
299
|
+
Use the evaluate-skill skill to validate my-skill.eval.yaml.
|
|
300
|
+
Use the evaluate-skill skill to run my-skill.eval.yaml with k=3 and summarize the result.
|
|
301
|
+
```
|
|
302
|
+
|
|
303
|
+
## Examples
|
|
304
|
+
|
|
305
|
+
### Codex Agent, Codex Judge
|
|
306
|
+
|
|
307
|
+
```yaml
|
|
308
|
+
skill:
|
|
309
|
+
path: ./SKILL.md
|
|
310
|
+
backend: codex
|
|
311
|
+
|
|
312
|
+
judge:
|
|
313
|
+
backend: codex
|
|
314
|
+
|
|
315
|
+
tasks:
|
|
316
|
+
- name: Validates a spec
|
|
317
|
+
prompt: "Use caliper to validate ./example.eval.yaml and summarize the result."
|
|
318
|
+
expect: "The assistant runs caliper validate and reports whether the spec is valid."
|
|
319
|
+
```
|
|
320
|
+
|
|
321
|
+
```bash
|
|
322
|
+
caliper run my-codex-skill.eval.yaml --k 1 --verbose
|
|
323
|
+
```
|
|
324
|
+
|
|
325
|
+
### Claude Code Agent, Claude Judge
|
|
326
|
+
|
|
327
|
+
```yaml
|
|
328
|
+
skill:
|
|
329
|
+
path: ~/.claude/commands/review.md
|
|
330
|
+
backend: claude-code
|
|
331
|
+
model: claude-sonnet-4-6
|
|
332
|
+
|
|
333
|
+
judge:
|
|
334
|
+
backend: claude-code
|
|
335
|
+
model: claude-haiku-4-5-20251001
|
|
336
|
+
|
|
337
|
+
tasks:
|
|
338
|
+
- name: Finds a null dereference
|
|
339
|
+
prompt: "/review the staged changes in /tmp/eval-repo"
|
|
340
|
+
expect: "The review identifies a possible null pointer dereference."
|
|
341
|
+
```
|
|
342
|
+
|
|
343
|
+
### Mix Backends
|
|
344
|
+
|
|
345
|
+
The agent backend and judge backend are independent:
|
|
346
|
+
|
|
347
|
+
```yaml
|
|
348
|
+
skill:
|
|
349
|
+
path: ./SKILL.md
|
|
350
|
+
backend: codex
|
|
351
|
+
|
|
352
|
+
judge:
|
|
353
|
+
backend: claude-code
|
|
354
|
+
```
|
|
355
|
+
|
|
356
|
+
Or opt into API billing explicitly:
|
|
357
|
+
|
|
358
|
+
```yaml
|
|
359
|
+
skill:
|
|
360
|
+
path: ./SKILL.md
|
|
361
|
+
backend: openai-api
|
|
362
|
+
model: gpt-4o-mini
|
|
363
|
+
|
|
364
|
+
judge:
|
|
365
|
+
backend: openai-api
|
|
366
|
+
model: gpt-4o-mini
|
|
367
|
+
```
|
|
368
|
+
|
|
369
|
+
### Deterministic Assertions
|
|
370
|
+
|
|
371
|
+
Use `assert:` when success can be verified with Python. This is usually better
|
|
372
|
+
than asking an LLM to judge files, JSON, command output, or screenshots.
|
|
373
|
+
|
|
374
|
+
```yaml
|
|
375
|
+
tasks:
|
|
376
|
+
- name: Writes an output file
|
|
377
|
+
cleanup: rm -f /tmp/out.txt
|
|
378
|
+
prompt: "Write hello world to /tmp/out.txt"
|
|
379
|
+
expect: "A file is written at /tmp/out.txt."
|
|
380
|
+
assert: |
|
|
381
|
+
from pathlib import Path
|
|
382
|
+
|
|
383
|
+
path = Path("/tmp/out.txt")
|
|
384
|
+
assert path.exists(), "Output file was not created"
|
|
385
|
+
assert path.read_text().strip() == "hello world"
|
|
386
|
+
```
|
|
387
|
+
|
|
388
|
+
When both `expect` and `assert` are present, both must pass.
|
|
389
|
+
|
|
390
|
+
### Screenshot Skill Eval
|
|
391
|
+
|
|
392
|
+
The repo includes a Codex-backed screenshot eval:
|
|
393
|
+
|
|
394
|
+
```bash
|
|
395
|
+
caliper validate skills/evaluate-skill/references/evals/screenshot/screenshot.eval.yaml
|
|
396
|
+
caliper run skills/evaluate-skill/references/evals/screenshot/screenshot.eval.yaml --k 1 --judge script --verbose
|
|
397
|
+
```
|
|
398
|
+
|
|
399
|
+
On macOS, the process running the eval must have Screen Recording permission. If
|
|
400
|
+
running `screencapture -x /tmp/test.png` directly fails, this eval will fail until that
|
|
401
|
+
permission is granted.
|
|
402
|
+
|
|
403
|
+
## Spec Format
|
|
404
|
+
|
|
405
|
+
```yaml
|
|
406
|
+
skill:
|
|
407
|
+
path: ./SKILL.md # optional path to the skill file
|
|
408
|
+
backend: codex # claude-code | codex | claude-api | openai-api
|
|
409
|
+
model: <model-name> # optional backend-specific model override
|
|
410
|
+
|
|
411
|
+
judge:
|
|
412
|
+
backend: codex # claude-code | codex | claude-api | openai-api
|
|
413
|
+
model: <model-name> # optional backend-specific model override
|
|
414
|
+
|
|
415
|
+
sandbox:
|
|
416
|
+
extra_path:
|
|
417
|
+
- ./bin # optional paths prepended to PATH
|
|
418
|
+
forbidden_files:
|
|
419
|
+
- ".*\\.eval\\.yaml$" # agent cannot read the spec file
|
|
420
|
+
- "./.caliper/.*" # agent cannot read saved results
|
|
421
|
+
|
|
422
|
+
tasks:
|
|
423
|
+
- name: Short task name
|
|
424
|
+
setup: <shell command> # optional, runs before each attempt
|
|
425
|
+
cleanup: <shell command> # optional, always runs after each attempt
|
|
426
|
+
prompt: <prompt sent to the agent>
|
|
427
|
+
expect: <natural-language success condition>
|
|
428
|
+
assert: |
|
|
429
|
+
# optional inline Python assertion
|
|
430
|
+
assert True
|
|
431
|
+
|
|
432
|
+
- name: Task with external assertion script
|
|
433
|
+
prompt: "Generate a report"
|
|
434
|
+
assert: ./assertions/check_report.py
|
|
435
|
+
```
|
|
436
|
+
|
|
437
|
+
Each task must define at least one of `expect` or `assert`. Task ids are assigned
|
|
438
|
+
automatically as `task-001`, `task-002`, and so on.
|
|
439
|
+
|
|
440
|
+
## Commands
|
|
441
|
+
|
|
442
|
+
| Command | Description |
|
|
443
|
+
|---|---|
|
|
444
|
+
| `caliper run <spec>` | Run an evaluation spec |
|
|
445
|
+
| `caliper new [name]` | Create a new evaluation spec with the wizard |
|
|
446
|
+
| `caliper validate <spec>` | Validate a spec file |
|
|
447
|
+
| `caliper list [spec]` | List specs and saved runs |
|
|
448
|
+
| `caliper report <spec-or-result>` | Re-render saved results |
|
|
449
|
+
|
|
450
|
+
### `caliper run` Flags
|
|
451
|
+
|
|
452
|
+
| Flag | Default | Description |
|
|
453
|
+
|---|---|---|
|
|
454
|
+
| `--k INT` | `3` | Attempts per task |
|
|
455
|
+
| `--baseline` | off | Also run each task without the skill |
|
|
456
|
+
| `--judge autorater` | `autorater` | LLM judge gives a direct pass/fail |
|
|
457
|
+
| `--judge script` | | Run static assertions and, if `expect` exists, an LLM judge |
|
|
458
|
+
| `--judge autorater-sdk` | | Legacy alias for Anthropic SDK judging; prefer `judge.backend: claude-api` |
|
|
459
|
+
| `--workers INT` | `4` | Parallel task workers |
|
|
460
|
+
| `--timeout INT` | `120` | Seconds per attempt |
|
|
461
|
+
| `--model MODEL` | | Override `skill.model` for the agent under test |
|
|
462
|
+
| `--verbose` | off | Show per-attempt judge reasoning |
|
|
463
|
+
| `--output PATH` | | Also save results JSON to a specific path |
|
|
464
|
+
|
|
465
|
+
## Judging
|
|
466
|
+
|
|
467
|
+
### Autorater
|
|
468
|
+
|
|
469
|
+
`--judge autorater` asks the configured judge backend to decide whether the
|
|
470
|
+
transcript satisfies `expect`.
|
|
471
|
+
|
|
472
|
+
```yaml
|
|
473
|
+
judge:
|
|
474
|
+
backend: codex
|
|
475
|
+
```
|
|
476
|
+
|
|
477
|
+
### Script Judge
|
|
478
|
+
|
|
479
|
+
`--judge script` always runs static `assert:` checks when present.
|
|
480
|
+
|
|
481
|
+
If the task also has `expect`, the script judge asks the configured judge backend for an
|
|
482
|
+
LLM verdict. With `judge.backend: codex`, that LLM check is performed by Codex
|
|
483
|
+
CLI. With `judge.backend: claude-code`, it is performed by Claude Code CLI. Use
|
|
484
|
+
`claude-api` or `openai-api` only when API billing is intended.
|
|
485
|
+
|
|
486
|
+
### Static Assertions
|
|
487
|
+
|
|
488
|
+
Static assertions run locally with Python. They are ideal for verifying:
|
|
489
|
+
|
|
490
|
+
- files exist
|
|
491
|
+
- exact file contents
|
|
492
|
+
- JSON/schema validity
|
|
493
|
+
- command output
|
|
494
|
+
- images or screenshots
|
|
495
|
+
- repository state
|
|
496
|
+
|
|
497
|
+
## Isolation And Reproducibility
|
|
498
|
+
|
|
499
|
+
Each attempt runs with a fresh temporary `HOME` directory. For Claude Code,
|
|
500
|
+
Caliper installs a temporary slash-command skill in that isolated home. For
|
|
501
|
+
Codex, Caliper injects the skill body directly into the prompt passed to
|
|
502
|
+
`codex exec`.
|
|
503
|
+
|
|
504
|
+
Results are saved next to the spec:
|
|
505
|
+
|
|
506
|
+
```text
|
|
507
|
+
.caliper/results/<spec-name>/<timestamp>.json
|
|
508
|
+
```
|
|
509
|
+
|
|
510
|
+
Each result includes a skill snapshot: the skill file content, referenced local
|
|
511
|
+
files, and git SHA when available.
|
|
512
|
+
|
|
513
|
+
## Scoring
|
|
514
|
+
|
|
515
|
+
For each task, where `successes` is the number of passing attempts out of `k`:
|
|
516
|
+
|
|
517
|
+
```text
|
|
518
|
+
pass@k = 1 - (1 - successes / k)^k
|
|
519
|
+
```
|
|
520
|
+
|
|
521
|
+
The aggregate score is the average task pass@k. With `--baseline`, Caliper also
|
|
522
|
+
runs the same tasks without the skill and reports the delta.
|
|
523
|
+
|
|
524
|
+
## Project Layout
|
|
525
|
+
|
|
526
|
+
```text
|
|
527
|
+
caliper/
|
|
528
|
+
commands/ Typer command implementations
|
|
529
|
+
harness/ Claude, Codex, and API execution backends
|
|
530
|
+
judge/ LLM and script judging implementations
|
|
531
|
+
schema/ Eval spec and result models
|
|
532
|
+
runner.py Evaluation orchestration
|
|
533
|
+
skills/
|
|
534
|
+
evaluate-skill/ Agent skill for running Caliper from Claude Code or Codex
|
|
535
|
+
tests/ Pytest coverage for harnesses, judges, and runner behavior
|
|
536
|
+
```
|
|
537
|
+
|
|
538
|
+
## Contributing
|
|
539
|
+
|
|
540
|
+
Contributions are welcome when they keep Caliper focused on repeatable,
|
|
541
|
+
maintainable skill evaluation.
|
|
542
|
+
|
|
543
|
+
Good first contribution areas:
|
|
544
|
+
|
|
545
|
+
- add example evals for real skills
|
|
546
|
+
- improve backend error messages
|
|
547
|
+
- add deterministic assertion helpers
|
|
548
|
+
- expand tests for harness and judge behavior
|
|
549
|
+
- improve result reporting and summaries
|
|
550
|
+
- document common setup problems for Claude Code and Codex
|
|
551
|
+
|
|
552
|
+
Before opening a pull request:
|
|
553
|
+
|
|
554
|
+
```bash
|
|
555
|
+
pip install -e ".[dev,openai]"
|
|
556
|
+
pytest
|
|
557
|
+
ruff check .
|
|
558
|
+
caliper validate skills/evaluate-skill/evaluate-skill.eval.yaml
|
|
559
|
+
```
|
|
560
|
+
|
|
561
|
+
When changing behavior, include either a test or an eval fixture that demonstrates
|
|
562
|
+
the expected outcome. Keep backend-specific behavior isolated to the relevant
|
|
563
|
+
module under `caliper/harness/` or `caliper/judge/` when possible.
|
|
564
|
+
|
|
565
|
+
## Troubleshooting
|
|
566
|
+
|
|
567
|
+
### `codex judge failed: model ... is not supported`
|
|
568
|
+
|
|
569
|
+
The model name in `skill.model` or `judge.model` is not available to your Codex
|
|
570
|
+
account. Use a model that `codex exec --model <name>` supports.
|
|
571
|
+
|
|
572
|
+
### `codex CLI not found`
|
|
573
|
+
|
|
574
|
+
Install the Codex CLI and ensure it is on `PATH`:
|
|
575
|
+
|
|
576
|
+
```bash
|
|
577
|
+
npm install -g @openai/codex
|
|
578
|
+
```
|
|
579
|
+
|
|
580
|
+
### `claude` command not found
|
|
581
|
+
|
|
582
|
+
Install and authenticate Claude Code, or switch the relevant backend to `codex`,
|
|
583
|
+
`claude-api`, or `openai-api`.
|
|
584
|
+
|
|
585
|
+
### A task passes only because of `assert:`
|
|
586
|
+
|
|
587
|
+
When a task has only `assert:`, no LLM judge is required. Add `expect:` if you
|
|
588
|
+
also want an LLM to judge the transcript.
|