caliper-eval 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60) hide show
  1. caliper_eval-0.1.0/.gitignore +12 -0
  2. caliper_eval-0.1.0/PKG-INFO +588 -0
  3. caliper_eval-0.1.0/README.md +552 -0
  4. caliper_eval-0.1.0/assets/caliper-banner.png +0 -0
  5. caliper_eval-0.1.0/assets/caliper-demo.gif +0 -0
  6. caliper_eval-0.1.0/assets/demo.tape +45 -0
  7. caliper_eval-0.1.0/assets/logo.png +0 -0
  8. caliper_eval-0.1.0/caliper/__init__.py +0 -0
  9. caliper_eval-0.1.0/caliper/commands/__init__.py +0 -0
  10. caliper_eval-0.1.0/caliper/commands/install_skill.py +56 -0
  11. caliper_eval-0.1.0/caliper/commands/list_cmd.py +94 -0
  12. caliper_eval-0.1.0/caliper/commands/new.py +16 -0
  13. caliper_eval-0.1.0/caliper/commands/report.py +56 -0
  14. caliper_eval-0.1.0/caliper/commands/run.py +110 -0
  15. caliper_eval-0.1.0/caliper/commands/validate.py +66 -0
  16. caliper_eval-0.1.0/caliper/harness/__init__.py +31 -0
  17. caliper_eval-0.1.0/caliper/harness/base.py +50 -0
  18. caliper_eval-0.1.0/caliper/harness/claude_api.py +74 -0
  19. caliper_eval-0.1.0/caliper/harness/claude_code.py +372 -0
  20. caliper_eval-0.1.0/caliper/harness/codex.py +278 -0
  21. caliper_eval-0.1.0/caliper/harness/openai_api.py +61 -0
  22. caliper_eval-0.1.0/caliper/judge/__init__.py +42 -0
  23. caliper_eval-0.1.0/caliper/judge/autorater.py +129 -0
  24. caliper_eval-0.1.0/caliper/judge/base.py +28 -0
  25. caliper_eval-0.1.0/caliper/judge/claude_code_judge.py +92 -0
  26. caliper_eval-0.1.0/caliper/judge/codex_judge.py +180 -0
  27. caliper_eval-0.1.0/caliper/judge/openai_api_judge.py +89 -0
  28. caliper_eval-0.1.0/caliper/judge/script_assert.py +210 -0
  29. caliper_eval-0.1.0/caliper/main.py +30 -0
  30. caliper_eval-0.1.0/caliper/reporter.py +223 -0
  31. caliper_eval-0.1.0/caliper/resources/__init__.py +1 -0
  32. caliper_eval-0.1.0/caliper/resources/evaluate_skill/SKILL.md +252 -0
  33. caliper_eval-0.1.0/caliper/resources/evaluate_skill/__init__.py +1 -0
  34. caliper_eval-0.1.0/caliper/runner.py +308 -0
  35. caliper_eval-0.1.0/caliper/schema/__init__.py +0 -0
  36. caliper_eval-0.1.0/caliper/schema/results.py +75 -0
  37. caliper_eval-0.1.0/caliper/schema/spec.py +87 -0
  38. caliper_eval-0.1.0/caliper/scoring.py +33 -0
  39. caliper_eval-0.1.0/caliper/wizard.py +159 -0
  40. caliper_eval-0.1.0/pyproject.toml +51 -0
  41. caliper_eval-0.1.0/skills/evaluate-skill/SKILL.md +252 -0
  42. caliper_eval-0.1.0/skills/evaluate-skill/evaluate-skill.eval.yaml +163 -0
  43. caliper_eval-0.1.0/skills/evaluate-skill/references/evals/claude-code-smoke/SKILL.md +14 -0
  44. caliper_eval-0.1.0/skills/evaluate-skill/references/evals/claude-code-smoke/claude-code-smoke.eval.yaml +27 -0
  45. caliper_eval-0.1.0/skills/evaluate-skill/references/evals/commit-simple/SKILL.md +29 -0
  46. caliper_eval-0.1.0/skills/evaluate-skill/references/evals/commit-simple/commit-simple.eval.yaml +84 -0
  47. caliper_eval-0.1.0/skills/evaluate-skill/references/evals/screenshot/SKILL.md +267 -0
  48. caliper_eval-0.1.0/skills/evaluate-skill/references/evals/screenshot/screenshot.eval.yaml +31 -0
  49. caliper_eval-0.1.0/skills/evaluate-skill/references/evals/summarize/SKILL.md +87 -0
  50. caliper_eval-0.1.0/skills/evaluate-skill/references/evals/summarize/summarize.eval.yaml +50 -0
  51. caliper_eval-0.1.0/skills/evaluate-skill/references/evals/tdd/SKILL.md +371 -0
  52. caliper_eval-0.1.0/skills/evaluate-skill/references/evals/tdd/tdd.eval.yaml +88 -0
  53. caliper_eval-0.1.0/skills/evaluate-skill/references/examples/simple.eval.yaml +29 -0
  54. caliper_eval-0.1.0/tests/test_claude_harness.py +107 -0
  55. caliper_eval-0.1.0/tests/test_claude_judge.py +45 -0
  56. caliper_eval-0.1.0/tests/test_codex_harness.py +206 -0
  57. caliper_eval-0.1.0/tests/test_codex_judge.py +108 -0
  58. caliper_eval-0.1.0/tests/test_install_skill.py +78 -0
  59. caliper_eval-0.1.0/tests/test_reporter.py +10 -0
  60. caliper_eval-0.1.0/tests/test_runner.py +77 -0
@@ -0,0 +1,12 @@
1
+ __pycache__/
2
+ *.py[cod]
3
+ .DS_Store
4
+ *.egg-info/
5
+ dist/
6
+ build/
7
+ .venv/
8
+ venv/
9
+ .caliper/
10
+ *.json
11
+ !examples/*.json
12
+ .env
@@ -0,0 +1,588 @@
1
+ Metadata-Version: 2.4
2
+ Name: caliper-eval
3
+ Version: 0.1.0
4
+ Summary: CLI for evaluating Claude Code skills and AI agents
5
+ Project-URL: Homepage, https://github.com/edonadei/caliper
6
+ Project-URL: Repository, https://github.com/edonadei/caliper
7
+ Project-URL: Issues, https://github.com/edonadei/caliper/issues
8
+ Author: Emrick Donadei
9
+ License-Expression: MIT
10
+ Keywords: agents,ai,claude,codex,evals,skills
11
+ Classifier: Development Status :: 3 - Alpha
12
+ Classifier: Environment :: Console
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Operating System :: OS Independent
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.10
18
+ Classifier: Programming Language :: Python :: 3.11
19
+ Classifier: Programming Language :: Python :: 3.12
20
+ Classifier: Programming Language :: Python :: 3.13
21
+ Classifier: Topic :: Software Development :: Testing
22
+ Requires-Python: >=3.10
23
+ Requires-Dist: anthropic>=0.28
24
+ Requires-Dist: pydantic>=2
25
+ Requires-Dist: pyyaml>=6
26
+ Requires-Dist: rich>=13
27
+ Requires-Dist: typer>=0.15
28
+ Provides-Extra: codex
29
+ Requires-Dist: openai>=1.0; extra == 'codex'
30
+ Provides-Extra: dev
31
+ Requires-Dist: pytest; extra == 'dev'
32
+ Requires-Dist: ruff; extra == 'dev'
33
+ Provides-Extra: openai
34
+ Requires-Dist: openai>=1.0; extra == 'openai'
35
+ Description-Content-Type: text/markdown
36
+
37
+ # Caliper - Agent Skill Evaluation Harness
38
+
39
+ <p align="center">
40
+ <img src="assets/caliper-banner.png" alt="Caliper banner">
41
+ </p>
42
+
43
+ Evaluate AI agent skills with repeatable tasks, automated judging, and pass@k
44
+ scoring.
45
+
46
+ Caliper is a local-first evaluation harness for Claude Code skills, Codex
47
+ skills, and API-backed agents. It runs a skill against one or more task specs,
48
+ records every attempt, judges the result with an LLM and/or deterministic Python
49
+ assertions, and saves reproducible result files you can inspect later.
50
+
51
+ Use Caliper when you want to answer practical questions like:
52
+
53
+ - Did this skill actually get better after my prompt edit?
54
+ - Does it still pass the workflows it passed last week?
55
+ - Does Codex or Claude Code run this skill more reliably for my use case?
56
+ - Is the skill doing the work, or would the baseline agent pass without it?
57
+ - Can contributors change a skill without relying on subjective manual testing?
58
+
59
+ Caliper is especially useful for agent skills because skills are hard to review
60
+ with ordinary unit tests. A good skill is part prompt, part workflow, part tool
61
+ contract. Caliper turns that behavior into versioned eval specs, repeatable
62
+ runs, pass/fail judgments, and saved transcripts.
63
+
64
+ ![Caliper terminal demo](assets/caliper-demo.gif)
65
+
66
+ ## Highlights
67
+
68
+ - **Skill-first evaluation** for Claude Code, Codex, Anthropic API, and OpenAI
69
+ API backends.
70
+ - **Independent agent and judge backends**, so you can test a Codex skill with a
71
+ Claude judge, a Claude Code skill with a Codex judge, or keep everything on one
72
+ provider.
73
+ - **Natural-language and deterministic checks** through `expect:` and `assert:`.
74
+ - **pass@k scoring** for measuring reliability across repeated attempts.
75
+ - **Baseline runs** to show whether the skill improves over an unassisted agent.
76
+ - **Attempt isolation** with fresh temporary homes and no session history.
77
+ - **Reproducible result files** that snapshot the skill content, referenced local
78
+ files, and git SHA when available.
79
+ - **Agent-installable evaluator skill** so Claude Code or Codex can help create,
80
+ validate, run, and interpret evals.
81
+
82
+ ## When To Use It
83
+
84
+ Caliper works well for:
85
+
86
+ - evaluating Claude Code slash-command skills
87
+ - evaluating Codex skills
88
+ - comparing agent backends on the same task suite
89
+ - regression-testing prompt and workflow changes
90
+ - checking coding, review, refactor, summarization, screenshot, and file-writing
91
+ behaviors
92
+ - mixing LLM judgment with exact checks for files, JSON, command output, images,
93
+ and repository state
94
+
95
+ It is not a replacement for normal unit tests. Use unit tests for deterministic
96
+ library behavior. Use Caliper for agent behavior where the output depends on a
97
+ model following instructions, using tools, and completing a workflow.
98
+
99
+ ## Install
100
+
101
+ The fastest way to install Caliper from GitHub is:
102
+
103
+ ```bash
104
+ pipx install git+https://github.com/edonadei/caliper.git
105
+ ```
106
+
107
+ To install the released package from PyPI instead:
108
+
109
+ ```bash
110
+ pipx install caliper-eval
111
+ ```
112
+
113
+ Both install methods expose the `caliper` command:
114
+
115
+ ```bash
116
+ caliper --help
117
+ ```
118
+
119
+ For local development from the repository root:
120
+
121
+ ```bash
122
+ pip install -e .
123
+ ```
124
+
125
+ For local development with the dev tools (pytest, ruff) and optional OpenAI API support:
126
+
127
+ ```bash
128
+ pip install -e ".[dev,openai]"
129
+ ```
130
+
131
+ Caliper requires Python 3.10 or newer.
132
+
133
+ ## Backend Setup
134
+
135
+ Caliper can run the agent under test and the judge through different backends.
136
+
137
+ | Role | Claude Code CLI | Codex CLI | API backends |
138
+ |---|---|---|---|
139
+ | Agent under test | `skill.backend: claude-code` | `skill.backend: codex` | `skill.backend: claude-api` or `openai-api` |
140
+ | LLM judge | `judge.backend: claude-code` | `judge.backend: codex` | `judge.backend: claude-api` or `openai-api` |
141
+ | Auth/billing | Claude Code subscription/auth | Codex CLI subscription/auth | Provider API key/billing |
142
+ | Transcript | Claude `stream-json` tool-call transcript | Final Codex text output | Final API response text |
143
+
144
+ ### Claude Code
145
+
146
+ Install and authenticate the `claude` CLI. `backend: claude-code` uses your
147
+ normal Claude Code CLI auth.
148
+
149
+ If you explicitly use `backend: claude-api`, set:
150
+
151
+ ```bash
152
+ export ANTHROPIC_API_KEY=...
153
+ ```
154
+
155
+ ### Codex
156
+
157
+ Install and authenticate the Codex CLI:
158
+
159
+ ```bash
160
+ npm install -g @openai/codex
161
+ codex login
162
+ codex --version
163
+ ```
164
+
165
+ `backend: codex` calls Codex with `codex exec`. It does not fall back to the
166
+ OpenAI API. If the CLI is unavailable or cannot authenticate, Caliper reports a
167
+ backend configuration error.
168
+
169
+ When the Codex desktop app is installed, Caliper prefers the app-bundled Codex
170
+ CLI over an older `codex` found on `PATH`. Set `CODEX_CLI_PATH` to force a
171
+ specific CLI binary.
172
+
173
+ If you explicitly use `backend: openai-api`, set:
174
+
175
+ ```bash
176
+ export OPENAI_API_KEY=...
177
+ ```
178
+
179
+ ## Quick Start
180
+
181
+ Create an eval spec:
182
+
183
+ ```yaml
184
+ # my-skill.eval.yaml
185
+ skill:
186
+ path: ./SKILL.md
187
+ backend: codex
188
+
189
+ judge:
190
+ backend: codex
191
+
192
+ tasks:
193
+ - name: Produces the expected answer
194
+ prompt: "Use this skill to answer: what is 2 + 2?"
195
+ expect: "The assistant answers 4."
196
+ ```
197
+
198
+ Run it:
199
+
200
+ ```bash
201
+ caliper run my-skill.eval.yaml --k 3 --baseline
202
+ ```
203
+
204
+ Example output:
205
+
206
+ ```text
207
+ CALIPER - my-skill - k=3 - codex
208
+
209
+ ID Task k (3) pass@k
210
+ task-001 Produces the expected answer 2/3 96.3% PARTIAL
211
+
212
+ With skill 96.3% ###################-
213
+ No skill 70.4% ##############------
214
+ Delta +25.9% up
215
+ Results saved to .caliper/results/my-skill/2026-05-22T14-23-01Z.json
216
+ ```
217
+
218
+ Browse results:
219
+
220
+ ```bash
221
+ caliper list
222
+ caliper report my-skill
223
+ ```
224
+
225
+ Validate a spec before running:
226
+
227
+ ```bash
228
+ caliper validate my-skill.eval.yaml
229
+ ```
230
+
231
+ ## Recommended Workflow
232
+
233
+ 1. Create a small eval spec for one behavior you care about.
234
+ 2. Run it with `--k 1` while iterating on the spec.
235
+ 3. Add deterministic `assert:` checks for facts an LLM judge should not guess.
236
+ 4. Run with `--k 3` or higher once the task is stable.
237
+ 5. Use `--baseline` to measure whether the skill helps over the raw agent.
238
+ 6. Commit the spec beside the skill so future contributors can run the same
239
+ evaluation before changing behavior.
240
+
241
+ ```bash
242
+ caliper run path/to/skill.eval.yaml --k 3 --baseline --verbose
243
+ ```
244
+
245
+ ## Install The Evaluator Skill
246
+
247
+ The repository includes an `evaluate-skill` agent skill. Installing it lets
248
+ Claude Code or Codex help you create eval specs, validate them, run Caliper, and
249
+ summarize results from inside your normal agent workflow.
250
+
251
+ If you installed the CLI, use the bundled installer:
252
+
253
+ ```bash
254
+ caliper install-skill codex
255
+ caliper install-skill claude-code
256
+ ```
257
+
258
+ Preview the destination without writing files:
259
+
260
+ ```bash
261
+ caliper install-skill codex --dry-run
262
+ ```
263
+
264
+ Use `--force` to overwrite an existing installed copy.
265
+
266
+ ### Claude Code
267
+
268
+ Without the CLI installer, copy the skill into Claude Code commands:
269
+
270
+ ```bash
271
+ mkdir -p ~/.claude/commands
272
+ curl -fsSL https://raw.githubusercontent.com/edonadei/caliper/main/skills/evaluate-skill/SKILL.md \
273
+ -o ~/.claude/commands/evaluate-skill.md
274
+ ```
275
+
276
+ Then use it in Claude Code:
277
+
278
+ ```text
279
+ /evaluate-skill validate my-skill.eval.yaml
280
+ /evaluate-skill run my-skill.eval.yaml --k 3
281
+ ```
282
+
283
+ ### Codex
284
+
285
+ Without the CLI installer, install the skill in Codex:
286
+
287
+ ```bash
288
+ mkdir -p ~/.codex/skills/evaluate-skill
289
+ curl -fsSL https://raw.githubusercontent.com/edonadei/caliper/main/skills/evaluate-skill/SKILL.md \
290
+ -o ~/.codex/skills/evaluate-skill/SKILL.md
291
+ ```
292
+
293
+ Make sure `caliper` is on `PATH` for Codex sessions. If you installed Caliper in
294
+ editable mode, the generated console script is usually enough.
295
+
296
+ Then ask Codex:
297
+
298
+ ```text
299
+ Use the evaluate-skill skill to validate my-skill.eval.yaml.
300
+ Use the evaluate-skill skill to run my-skill.eval.yaml with k=3 and summarize the result.
301
+ ```
302
+
303
+ ## Examples
304
+
305
+ ### Codex Agent, Codex Judge
306
+
307
+ ```yaml
308
+ skill:
309
+ path: ./SKILL.md
310
+ backend: codex
311
+
312
+ judge:
313
+ backend: codex
314
+
315
+ tasks:
316
+ - name: Validates a spec
317
+ prompt: "Use caliper to validate ./example.eval.yaml and summarize the result."
318
+ expect: "The assistant runs caliper validate and reports whether the spec is valid."
319
+ ```
320
+
321
+ ```bash
322
+ caliper run my-codex-skill.eval.yaml --k 1 --verbose
323
+ ```
324
+
325
+ ### Claude Code Agent, Claude Judge
326
+
327
+ ```yaml
328
+ skill:
329
+ path: ~/.claude/commands/review.md
330
+ backend: claude-code
331
+ model: claude-sonnet-4-6
332
+
333
+ judge:
334
+ backend: claude-code
335
+ model: claude-haiku-4-5-20251001
336
+
337
+ tasks:
338
+ - name: Finds a null dereference
339
+ prompt: "/review the staged changes in /tmp/eval-repo"
340
+ expect: "The review identifies a possible null pointer dereference."
341
+ ```
342
+
343
+ ### Mix Backends
344
+
345
+ The agent backend and judge backend are independent:
346
+
347
+ ```yaml
348
+ skill:
349
+ path: ./SKILL.md
350
+ backend: codex
351
+
352
+ judge:
353
+ backend: claude-code
354
+ ```
355
+
356
+ Or opt into API billing explicitly:
357
+
358
+ ```yaml
359
+ skill:
360
+ path: ./SKILL.md
361
+ backend: openai-api
362
+ model: gpt-4o-mini
363
+
364
+ judge:
365
+ backend: openai-api
366
+ model: gpt-4o-mini
367
+ ```
368
+
369
+ ### Deterministic Assertions
370
+
371
+ Use `assert:` when success can be verified with Python. This is usually better
372
+ than asking an LLM to judge files, JSON, command output, or screenshots.
373
+
374
+ ```yaml
375
+ tasks:
376
+ - name: Writes an output file
377
+ cleanup: rm -f /tmp/out.txt
378
+ prompt: "Write hello world to /tmp/out.txt"
379
+ expect: "A file is written at /tmp/out.txt."
380
+ assert: |
381
+ from pathlib import Path
382
+
383
+ path = Path("/tmp/out.txt")
384
+ assert path.exists(), "Output file was not created"
385
+ assert path.read_text().strip() == "hello world"
386
+ ```
387
+
388
+ When both `expect` and `assert` are present, both must pass.
389
+
390
+ ### Screenshot Skill Eval
391
+
392
+ The repo includes a Codex-backed screenshot eval:
393
+
394
+ ```bash
395
+ caliper validate skills/evaluate-skill/references/evals/screenshot/screenshot.eval.yaml
396
+ caliper run skills/evaluate-skill/references/evals/screenshot/screenshot.eval.yaml --k 1 --judge script --verbose
397
+ ```
398
+
399
+ On macOS, the process running the eval must have Screen Recording permission. If
400
+ running `screencapture -x /tmp/test.png` directly fails, this eval will fail until that
401
+ permission is granted.
402
+
403
+ ## Spec Format
404
+
405
+ ```yaml
406
+ skill:
407
+ path: ./SKILL.md # optional path to the skill file
408
+ backend: codex # claude-code | codex | claude-api | openai-api
409
+ model: <model-name> # optional backend-specific model override
410
+
411
+ judge:
412
+ backend: codex # claude-code | codex | claude-api | openai-api
413
+ model: <model-name> # optional backend-specific model override
414
+
415
+ sandbox:
416
+ extra_path:
417
+ - ./bin # optional paths prepended to PATH
418
+ forbidden_files:
419
+ - ".*\\.eval\\.yaml$" # agent cannot read the spec file
420
+ - "./.caliper/.*" # agent cannot read saved results
421
+
422
+ tasks:
423
+ - name: Short task name
424
+ setup: <shell command> # optional, runs before each attempt
425
+ cleanup: <shell command> # optional, always runs after each attempt
426
+ prompt: <prompt sent to the agent>
427
+ expect: <natural-language success condition>
428
+ assert: |
429
+ # optional inline Python assertion
430
+ assert True
431
+
432
+ - name: Task with external assertion script
433
+ prompt: "Generate a report"
434
+ assert: ./assertions/check_report.py
435
+ ```
436
+
437
+ Each task must define at least one of `expect` or `assert`. Task ids are assigned
438
+ automatically as `task-001`, `task-002`, and so on.
439
+
440
+ ## Commands
441
+
442
+ | Command | Description |
443
+ |---|---|
444
+ | `caliper run <spec>` | Run an evaluation spec |
445
+ | `caliper new [name]` | Create a new evaluation spec with the wizard |
446
+ | `caliper validate <spec>` | Validate a spec file |
447
+ | `caliper list [spec]` | List specs and saved runs |
448
+ | `caliper report <spec-or-result>` | Re-render saved results |
449
+
450
+ ### `caliper run` Flags
451
+
452
+ | Flag | Default | Description |
453
+ |---|---|---|
454
+ | `--k INT` | `3` | Attempts per task |
455
+ | `--baseline` | off | Also run each task without the skill |
456
+ | `--judge autorater` | `autorater` | LLM judge gives a direct pass/fail |
457
+ | `--judge script` | | Run static assertions and, if `expect` exists, an LLM judge |
458
+ | `--judge autorater-sdk` | | Legacy alias for Anthropic SDK judging; prefer `judge.backend: claude-api` |
459
+ | `--workers INT` | `4` | Parallel task workers |
460
+ | `--timeout INT` | `120` | Seconds per attempt |
461
+ | `--model MODEL` | | Override `skill.model` for the agent under test |
462
+ | `--verbose` | off | Show per-attempt judge reasoning |
463
+ | `--output PATH` | | Also save results JSON to a specific path |
464
+
465
+ ## Judging
466
+
467
+ ### Autorater
468
+
469
+ `--judge autorater` asks the configured judge backend to decide whether the
470
+ transcript satisfies `expect`.
471
+
472
+ ```yaml
473
+ judge:
474
+ backend: codex
475
+ ```
476
+
477
+ ### Script Judge
478
+
479
+ `--judge script` runs a task's static `assert:` checks whenever they are defined.
480
+
481
+ If the task also has `expect`, it also asks the configured judge backend for an
482
+ LLM verdict. With `judge.backend: codex`, that LLM check is performed by Codex
483
+ CLI. With `judge.backend: claude-code`, it is performed by Claude Code CLI. Use
484
+ `claude-api` or `openai-api` only when API billing is intended.
485
+
486
+ ### Static Assertions
487
+
488
+ Static assertions run locally with Python. They are ideal for verifying:
489
+
490
+ - files exist
491
+ - exact file contents
492
+ - JSON/schema validity
493
+ - command output
494
+ - images or screenshots
495
+ - repository state
496
+
497
+ ## Isolation And Reproducibility
498
+
499
+ Each attempt runs with a fresh temporary `HOME` directory. For Claude Code,
500
+ Caliper installs a temporary slash-command skill in that isolated home. For
501
+ Codex, Caliper injects the skill body directly into the prompt passed to
502
+ `codex exec`.
503
+
504
+ Results are saved next to the spec:
505
+
506
+ ```text
507
+ .caliper/results/<spec-name>/<timestamp>.json
508
+ ```
509
+
510
+ Each result includes a skill snapshot: the skill file content, referenced local
511
+ files, and git SHA when available.
512
+
513
+ ## Scoring
514
+
515
+ For each task:
516
+
517
+ ```text
518
+ pass@k = 1 - (1 - successes / k)^k
519
+ ```
520
+
521
+ The aggregate score is the mean of the per-task pass@k values. With `--baseline`, Caliper also
522
+ runs the same tasks without the skill and reports the delta.
523
+
524
+ ## Project Layout
525
+
526
+ ```text
527
+ caliper/
528
+ commands/ Typer command implementations
529
+ harness/ Claude, Codex, and API execution backends
530
+ judge/ LLM and script judging implementations
531
+ schema/ Eval spec and result models
532
+ runner.py Evaluation orchestration
533
+ skills/
534
+ evaluate-skill/ Agent skill for running Caliper from Claude Code or Codex
535
+ tests/ Pytest coverage for harnesses, judges, and runner behavior
536
+ ```
537
+
538
+ ## Contributing
539
+
540
+ Contributions are welcome when they keep Caliper focused on repeatable,
541
+ maintainable skill evaluation.
542
+
543
+ Good first contribution areas:
544
+
545
+ - add example evals for real skills
546
+ - improve backend error messages
547
+ - add deterministic assertion helpers
548
+ - expand tests for harness and judge behavior
549
+ - improve result reporting and summaries
550
+ - document common setup problems for Claude Code and Codex
551
+
552
+ Before opening a pull request:
553
+
554
+ ```bash
555
+ pip install -e ".[dev,openai]"
556
+ pytest
557
+ ruff check .
558
+ caliper validate skills/evaluate-skill/evaluate-skill.eval.yaml
559
+ ```
560
+
561
+ When changing behavior, include either a test or an eval fixture that demonstrates
562
+ the expected outcome. Keep backend-specific behavior isolated to the relevant
563
+ module under `caliper/harness/` or `caliper/judge/` when possible.
564
+
565
+ ## Troubleshooting
566
+
567
+ ### `codex judge failed: model ... is not supported`
568
+
569
+ The model name in `skill.model` or `judge.model` is not available to your Codex
570
+ account. Use a model that `codex exec --model <name>` supports.
571
+
572
+ ### `codex CLI not found`
573
+
574
+ Install the Codex CLI and ensure it is on `PATH` (or set `CODEX_CLI_PATH` to the binary):
575
+
576
+ ```bash
577
+ npm install -g @openai/codex
578
+ ```
579
+
580
+ ### `claude` command not found
581
+
582
+ Install and authenticate Claude Code, or switch the relevant backend to `codex`,
583
+ `claude-api`, or `openai-api`.
584
+
585
+ ### A task passes only because of `assert:`
586
+
587
+ When a task has only `assert:`, no LLM judge is required. Add `expect:` if you
588
+ also want an LLM to judge the transcript.