pi-evalset-lab 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +11 -0
- package/README.md +59 -1
- package/examples/evalset-compare-sample-embedded.html +142 -0
- package/examples/evalset-compare-sample.png +0 -0
- package/package.json +38 -6
- package/scripts/export-evalset-report-html.mjs +356 -0
- package/.copier-answers.yml +0 -5
- package/.githooks/pre-commit +0 -12
- package/.github/CODEOWNERS +0 -12
- package/.github/ISSUE_TEMPLATE/bug-report.yml +0 -63
- package/.github/ISSUE_TEMPLATE/config.yml +0 -5
- package/.github/ISSUE_TEMPLATE/docs.yml +0 -39
- package/.github/ISSUE_TEMPLATE/feature-request.yml +0 -41
- package/.github/VOUCHED.td +0 -8
- package/.github/dependabot.yml +0 -13
- package/.github/pull_request_template.md +0 -34
- package/.github/workflows/ci.yml +0 -37
- package/.github/workflows/publish.yml +0 -60
- package/.github/workflows/release-please.yml +0 -25
- package/.github/workflows/vouch-check-pr.yml +0 -29
- package/.github/workflows/vouch-manage.yml +0 -34
- package/.pi/extensions/startup-intake-router.ts +0 -151
- package/.pi/prompts/init-project-docs.md +0 -32
- package/.release-please-config.json +0 -11
- package/.release-please-manifest.json +0 -3
- package/AGENTS.md +0 -39
- package/CODE_OF_CONDUCT.md +0 -50
- package/CONTRIBUTING.md +0 -28
- package/NEXT_SESSION_PROMPT.md +0 -14
- package/SECURITY.md +0 -34
- package/SUPPORT.md +0 -37
- package/docs/dev/CONTRIBUTING.md +0 -37
- package/docs/dev/EXTENSION_SOP.md +0 -43
- package/docs/dev/next_steps.md +0 -17
- package/docs/dev/plans/001-initial-plan.md +0 -24
- package/docs/dev/status.md +0 -21
- package/docs/org/operating_model.md +0 -39
- package/docs/org/project-docs-intake.questions.json +0 -60
- package/docs/project/foundation.md +0 -28
- package/docs/project/incentives.md +0 -17
- package/docs/project/resources.md +0 -26
- package/docs/project/skills.md +0 -17
- package/docs/project/strategic_goals.md +0 -18
- package/docs/project/tactical_goals.md +0 -39
- package/docs/project/vision.md +0 -21
- package/external/.gitkeep +0 -0
- package/ontology/.gitkeep +0 -0
- package/policy/security-policy.json +0 -10
- package/prek.toml +0 -15
- package/scripts/docs-list.sh +0 -50
- package/scripts/init-project-docs.sh +0 -56
- package/scripts/install-hooks.sh +0 -13
- package/scripts/sync-to-live.sh +0 -91
- package/scripts/validate-structure.sh +0 -325
- package/src/.gitkeep +0 -0
- package/tests/.gitkeep +0 -0
package/CHANGELOG.md
CHANGED
|
@@ -13,6 +13,13 @@ system4d:
|
|
|
13
13
|
|
|
14
14
|
All notable changes to this project should be documented here.
|
|
15
15
|
|
|
16
|
+
## [0.2.0](https://github.com/tryingET/pi-evalset-lab/compare/v0.1.0...v0.2.0) (2026-02-17)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
### Features
|
|
20
|
+
|
|
21
|
+
* **evalset:** add JSON to HTML report export helper ([802e7d2](https://github.com/tryingET/pi-evalset-lab/commit/802e7d2a13a44205e519e0f0c778788fd093340f))
|
|
22
|
+
|
|
16
23
|
## [Unreleased]
|
|
17
24
|
|
|
18
25
|
### Added
|
|
@@ -25,16 +32,20 @@ All notable changes to this project should be documented here.
|
|
|
25
32
|
- `fixed-task-set.json`
|
|
26
33
|
- `fixed-task-set-v2.json`
|
|
27
34
|
- `fixed-task-set-v3.json`
|
|
35
|
+
- `evalset-compare-sample-embedded.html`
|
|
36
|
+
- `evalset-compare-sample.png`
|
|
28
37
|
- `system-baseline.txt`
|
|
29
38
|
- `system-candidate.txt`
|
|
30
39
|
- Added report output support to `.evalset/reports/*.json` with per-case and aggregate metrics.
|
|
31
40
|
- Added run identity metadata to reports (`runId`, `datasetHash`, `casesHash`, `variantHash`).
|
|
32
41
|
- Reduced session message payload size by storing only lightweight report metadata instead of full report bodies.
|
|
42
|
+
- Added `scripts/export-evalset-report-html.mjs` and `npm run evalset:export-html` for repeatable JSON -> static HTML report exports.
|
|
33
43
|
|
|
34
44
|
### Changed
|
|
35
45
|
|
|
36
46
|
- Clarified `/evalset` invocation docs: use `pi -p` (or `pi -e ... -p`) for non-interactive runs; `/evalset` is not a standalone shell binary.
|
|
37
47
|
- Added the same non-interactive invocation note to `/evalset help` output.
|
|
48
|
+
- Declared publish-time runtime artifacts with `package.json` `files` whitelist and documented peer/runtime dependency behavior in README.
|
|
38
49
|
|
|
39
50
|
## [0.1.0] - 2026-02-08
|
|
40
51
|
|
package/README.md
CHANGED
|
@@ -11,7 +11,9 @@ system4d:
|
|
|
11
11
|
|
|
12
12
|
# pi-evalset-lab
|
|
13
13
|
|
|
14
|
-
|
|
14
|
+
Extension package for fixed-task-set eval workflows in pi (`/evalset run|compare`) with reproducible JSON reports.
|
|
15
|
+
|
|
16
|
+
Primary category fit: **Model & Prompt Management**, **Review & Quality Loops**, **UX & Observability**, **Safety & Governance**.
|
|
15
17
|
|
|
16
18
|
## Quickstart
|
|
17
19
|
|
|
@@ -33,6 +35,38 @@ Production-ready starter scaffold for a pi extension package.
|
|
|
33
35
|
pi install /absolute/path/to/pi-evalset-lab
|
|
34
36
|
```
|
|
35
37
|
|
|
38
|
+
## Runtime dependencies and packaged files
|
|
39
|
+
|
|
40
|
+
This extension depends on pi host APIs and declares them as `peerDependencies`:
|
|
41
|
+
|
|
42
|
+
- `@mariozechner/pi-coding-agent`
|
|
43
|
+
- `@mariozechner/pi-ai`
|
|
44
|
+
|
|
45
|
+
In normal usage, pi provides these at runtime when loading the package.
|
|
46
|
+
|
|
47
|
+
The npm package also uses a `files` whitelist so required runtime artifacts are explicitly included:
|
|
48
|
+
|
|
49
|
+
- `extensions/evalset.ts`
|
|
50
|
+
- `prompts/`
|
|
51
|
+
- `examples/` (sample datasets + sample report UI)
|
|
52
|
+
|
|
53
|
+
## Category taxonomy (reference)
|
|
54
|
+
|
|
55
|
+
Keyword slugs used for extension categorization:
|
|
56
|
+
|
|
57
|
+
- `ux-observability` (UX & Observability)
|
|
58
|
+
- `safety-governance` (Safety & Governance)
|
|
59
|
+
- `context-codebase-mapping` (Context & Codebase Mapping)
|
|
60
|
+
- `web-docs-retrieval` (Web & Docs Retrieval)
|
|
61
|
+
- `background-processes` (Background / Long-running Processes)
|
|
62
|
+
- `review-quality-loops` (Review & Quality Loops)
|
|
63
|
+
- `planning-orchestration` (Planning & Orchestration)
|
|
64
|
+
- `subagents-parallelization` (Subagents / Parallelization)
|
|
65
|
+
- `model-prompt-management` (Model & Prompt Management)
|
|
66
|
+
- `interactive-clis-editors` (Interactive CLIs / Editors)
|
|
67
|
+
- `skills-rules-packs` (Skills & Rules Packs)
|
|
68
|
+
- `paste-code-extraction` (Paste / Code Extraction)
|
|
69
|
+
|
|
36
70
|
## evalset command (MVP)
|
|
37
71
|
|
|
38
72
|
This extension adds `/evalset` for fixed-task-set evaluation runs.
|
|
@@ -66,6 +100,9 @@ pi -e ./extensions/evalset.ts -p "/evalset compare examples/fixed-task-set.json
|
|
|
66
100
|
pi -p "/evalset compare examples/fixed-task-set.json examples/system-baseline.txt examples/system-candidate.txt"
|
|
67
101
|
```
|
|
68
102
|
|
|
103
|
+
Interactive sessions use pi UI hooks (`ctx.ui`) for status/notify updates.
|
|
104
|
+
In non-interactive `-p` mode, those UI calls are safely skipped (`ctx.hasUI === false`).
|
|
105
|
+
|
|
69
106
|
### Example workflow (inside pi)
|
|
70
107
|
|
|
71
108
|
```bash
|
|
@@ -79,6 +116,15 @@ pi -p "/evalset compare examples/fixed-task-set.json examples/system-baseline.tx
|
|
|
79
116
|
- `examples/fixed-task-set-v2.json` — larger first pass set
|
|
80
117
|
- `examples/fixed-task-set-v3.json` — less brittle checks (recommended)
|
|
81
118
|
|
|
119
|
+
### Sample visual output (in repo)
|
|
120
|
+
|
|
121
|
+
- `examples/evalset-compare-sample-embedded.html` — self-contained report UI with embedded compare JSON
|
|
122
|
+
- `examples/evalset-compare-sample.png` — screenshot preview of that HTML report
|
|
123
|
+
|
|
124
|
+
Preview:
|
|
125
|
+
|
|
126
|
+
![Evalset compare sample report](examples/evalset-compare-sample.png)
|
|
127
|
+
|
|
82
128
|
The command writes JSON reports to:
|
|
83
129
|
- explicit `--out <path>` when provided
|
|
84
130
|
- otherwise `.evalset/reports/*.json` under your current project directory
|
|
@@ -91,6 +137,18 @@ Each report includes run identity metadata:
|
|
|
91
137
|
|
|
92
138
|
Session messages only keep lightweight report metadata (`reportPath`, ids, summary metrics), not full report bodies.
|
|
93
139
|
|
|
140
|
+
### Export report JSON to static HTML
|
|
141
|
+
|
|
142
|
+
Use the helper script to create a shareable standalone HTML file from any evalset JSON report:
|
|
143
|
+
|
|
144
|
+
```bash
|
|
145
|
+
npm run evalset:export-html -- --in .evalset/reports/compare-your-dataset-YYYYMMDDTHHMMSS.json
|
|
146
|
+
# optional:
|
|
147
|
+
npm run evalset:export-html -- --in .evalset/reports/run-your-dataset-YYYYMMDDTHHMMSS.json --out .evalset/reports/run-your-dataset.html --title "Evalset run report"
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
Script: [scripts/export-evalset-report-html.mjs](scripts/export-evalset-report-html.mjs)
|
|
151
|
+
|
|
94
152
|
## Optional core hooks (future, not required for this MVP)
|
|
95
153
|
|
|
96
154
|
This extension works today without core changes. If we decide to harden further, optional core support could include:
|
|
package/examples/evalset-compare-sample-embedded.html
ADDED
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
<!doctype html>
|
|
2
|
+
<html lang="en">
|
|
3
|
+
<head>
|
|
4
|
+
<meta charset="utf-8" />
|
|
5
|
+
<meta name="viewport" content="width=device-width, initial-scale=1" />
|
|
6
|
+
<title>Evalset compare sample (embedded)</title>
|
|
7
|
+
<style>
|
|
8
|
+
:root {
|
|
9
|
+
--bg: #0b1020;
|
|
10
|
+
--panel: #111a33;
|
|
11
|
+
--line: #24335f;
|
|
12
|
+
--txt: #e8eeff;
|
|
13
|
+
--muted: #9fb0d8;
|
|
14
|
+
--ok: #2ecc71;
|
|
15
|
+
--bad: #ff6b6b;
|
|
16
|
+
--same: #7f8ea3;
|
|
17
|
+
--imp: #49dcb1;
|
|
18
|
+
--reg: #ff8f70;
|
|
19
|
+
}
|
|
20
|
+
* { box-sizing: border-box; }
|
|
21
|
+
body { margin: 0; padding: 24px; font-family: ui-sans-serif, system-ui, -apple-system, Segoe UI, Roboto, Ubuntu; background: var(--bg); color: var(--txt); }
|
|
22
|
+
.wrap { max-width: 1220px; margin: 0 auto; }
|
|
23
|
+
.panel { background: var(--panel); border: 1px solid var(--line); border-radius: 12px; padding: 14px; margin-bottom: 14px; }
|
|
24
|
+
.muted { color: var(--muted); font-size: 0.92rem; }
|
|
25
|
+
.grid { display: grid; grid-template-columns: repeat(auto-fit, minmax(180px, 1fr)); gap: 10px; }
|
|
26
|
+
.card { background: #0e1630; border: 1px solid #2a3a66; border-radius: 10px; padding: 10px; }
|
|
27
|
+
.label { color: var(--muted); text-transform: uppercase; letter-spacing: .04em; font-size: .75rem; }
|
|
28
|
+
.val { margin-top: 4px; font-size: 1.06rem; font-weight: 600; }
|
|
29
|
+
table { width: 100%; border-collapse: collapse; font-size: .91rem; }
|
|
30
|
+
th, td { text-align: left; padding: 10px 8px; border-bottom: 1px solid var(--line); vertical-align: top; }
|
|
31
|
+
th { color: var(--muted); }
|
|
32
|
+
.pill { display: inline-block; border-radius: 999px; padding: 2px 10px; font-size: .74rem; font-weight: 700; border: 1px solid transparent; }
|
|
33
|
+
.ok { background: rgba(46, 204, 113, .14); color: #9df7c5; border-color: rgba(46, 204, 113, .45); }
|
|
34
|
+
.bad { background: rgba(255, 107, 107, .14); color: #ffc2c2; border-color: rgba(255, 107, 107, .45); }
|
|
35
|
+
.same { background: rgba(127, 142, 163, .14); color: #d4dcf2; border-color: rgba(127, 142, 163, .45); }
|
|
36
|
+
.imp { background: rgba(73, 220, 177, .14); color: #abf6df; border-color: rgba(73, 220, 177, .45); }
|
|
37
|
+
.reg { background: rgba(255, 143, 112, .14); color: #ffd0c2; border-color: rgba(255, 143, 112, .45); }
|
|
38
|
+
.meta { color: var(--muted); font-size: .8rem; margin-top: 6px; }
|
|
39
|
+
pre { white-space: pre-wrap; background: #0d152b; border: 1px solid #2a3a66; border-radius: 8px; padding: 8px; max-height: 220px; overflow: auto; }
|
|
40
|
+
details summary { cursor: pointer; color: #9dc2ff; }
|
|
41
|
+
.split { display: grid; grid-template-columns: 1fr 1fr; gap: 10px; }
|
|
42
|
+
@media (max-width: 980px) { .split { grid-template-columns: 1fr; } }
|
|
43
|
+
</style>
|
|
44
|
+
</head>
|
|
45
|
+
<body>
|
|
46
|
+
<div class="wrap">
|
|
47
|
+
<div class="panel">
|
|
48
|
+
<h1 style="margin:0 0 8px; font-size:1.32rem;">Evalset compare sample (embedded data)</h1>
|
|
49
|
+
<div class="muted" id="meta"></div>
|
|
50
|
+
</div>
|
|
51
|
+
|
|
52
|
+
<div class="panel grid" id="summary"></div>
|
|
53
|
+
|
|
54
|
+
<div class="panel">
|
|
55
|
+
<h2 style="margin:0 0 10px; font-size:1.08rem;">Case-by-case</h2>
|
|
56
|
+
<table>
|
|
57
|
+
<thead>
|
|
58
|
+
<tr>
|
|
59
|
+
<th>#</th>
|
|
60
|
+
<th>Case</th>
|
|
61
|
+
<th>Baseline</th>
|
|
62
|
+
<th>Candidate</th>
|
|
63
|
+
<th>Outcome</th>
|
|
64
|
+
<th>Details</th>
|
|
65
|
+
</tr>
|
|
66
|
+
</thead>
|
|
67
|
+
<tbody id="rows"></tbody>
|
|
68
|
+
</table>
|
|
69
|
+
</div>
|
|
70
|
+
</div>
|
|
71
|
+
|
|
72
|
+
<script id="report-data" type="application/json">{"kind":"evalset-compare","createdAt":"2026-02-16T14:41:23.888Z","run":{"runId":"471393b2-0613-41d1-8bd3-e4cc4306d65a","startedAt":"2026-02-16T14:39:40.498Z","finishedAt":"2026-02-16T14:41:23.888Z","modelKey":"openai-codex/gpt-5.3-codex","temperature":null,"datasetHash":"2851c4b6d9e9fd4961be234d2b4602431a46f4adb305eb20b0dd3120e8df64dc","casesHash":"49ca1c743474d713ee966235b03e224e861726e8b56c3aacb625227af7161aca","baselineRunId":"c3845934-4277-4915-a8d2-356e691349d7","candidateRunId":"f68bccd1-4bff-4145-9f09-d3eca49aecd8","baselineVariantHash":"df1af004a99a2cadc51b1d28e37e5d6fdceb0ae765519a36964e6fc9f78a2c56","candidateVariantHash":"7e4c4da97cf0d1ea77c0f613c44752a9d2d464cd0c1f26e7f86c69c546569a9d"},"dataset":{"name":"maintainer-clarity-v3","path":"/home/lightningralf/programming/pi-extensions/pi-evalset-lab/examples/fixed-task-set-v3.json"},"model":{"provider":"openai-codex","id":"gpt-5.3-codex","api":"openai-codex-responses"},"baseline":{"kind":"evalset-run","createdAt":"2026-02-16T14:40:35.056Z","run":{"runId":"c3845934-4277-4915-a8d2-356e691349d7","startedAt":"2026-02-16T14:39:40.498Z","finishedAt":"2026-02-16T14:40:35.056Z","modelKey":"openai-codex/gpt-5.3-codex","temperature":null,"datasetHash":"2851c4b6d9e9fd4961be234d2b4602431a46f4adb305eb20b0dd3120e8df64dc","casesHash":"49ca1c743474d713ee966235b03e224e861726e8b56c3aacb625227af7161aca","variantHash":"df1af004a99a2cadc51b1d28e37e5d6fdceb0ae765519a36964e6fc9f78a2c56"},"dataset":{"name":"maintainer-clarity-v3","path":"/home/lightningralf/programming/pi-extensions/pi-evalset-lab/examples/fixed-task-set-v3.json"},"model":{"provider":"openai-codex","id":"gpt-5.3-codex","api":"openai-codex-responses"},"variant":{"name":"baseline","systemPrompt":"Answer in concise, plain English. Prefer direct wording over jargon.\n\nYou are a concise technical assistant. 
Keep answers short and avoid jargon.","source":"dataset.systemPrompt + file:/home/lightningralf/programming/pi-extensions/pi-evalset-lab/examples/system-baseline.txt"},"totals":{"cases":24,"scoredCases":24,"passedCases":16,"failedCases":8,"passRate":0.6666666666666666,"totalLatencyMs":54558,"avgLatencyMs":2273.25,"usage":{"input":1237,"output":1644,"cacheRead":0,"cacheWrite":0,"totalTokens":2881,"cost":{"input":0.0021647499999999996,"output":0.023016000000000005,"cacheRead":0,"cacheWrite":0,"total":0.025180750000000002}}},"cases":[{"id":"fixed-task-set-definition","input":"In plain English, what does a fixed task set mean in eval workflows?","scored":true,"pass":true,"checks":[{"check":"expectContains","pass":true,"details":"contains \"same\""},{"check":"expectContains","pass":true,"details":"contains \"tasks\""}],"failedChecks":[],"outputPreview":"A **fixed task set** means the eval always uses the **same predefined list of tasks/questions** every time it runs.\n\nIn plain English:\n- The test content is locked in advance.\n- You don’t add, remove, or randomly change tasks between runs.\n- This makes results easier to compare o...","latencyMs":2405,"stopReason":"stop","usage":{"input":52,"output":74,"cacheRead":0,"cacheWrite":0,"totalTokens":126,"cost":{"input":0.000091,"output":0.001036,"cacheRead":0,"cacheWrite":0,"total":0.001127}}},{"id":"fixed-task-set-benefit","input":"Why does a fixed task set improve comparison quality across runs?","scored":true,"pass":true,"checks":[{"check":"expectContains","pass":true,"details":"contains \"compar\""}],"failedChecks":[],"outputPreview":"A fixed task set makes runs **comparable** because each run faces the **same work**.\n\nWhy it helps:\n\n- **Removes task variation**: Differences in results are less likely due to easier/harder tasks.\n- **Reduces noise**: Scores vary less from random task mix.\n- **Improves 
fairness*...","latencyMs":3621,"stopReason":"stop","usage":{"input":49,"output":138,"cacheRead":0,"cacheWrite":0,"totalTokens":187,"cost":{"input":0.00008575,"output":0.001932,"cacheRead":0,"cacheWrite":0,"total":0.00201775}}},{"id":"reproducibility-two-factors","input":"Name two things you should lock or record for reproducible eval runs.","scored":true,"pass":true,"checks":[{"check":"expectContains","pass":true,"details":"contains \"model\""},{"check":"expectContains","pass":true,"details":"contains \"dataset\""}],"failedChecks":[],"outputPreview":"- **Lock the model/version** you evaluate (exact checkpoint, weights, and decoding settings).\n- **Record the data snapshot** used (dataset version, split, and any preprocessing).","latencyMs":1771,"stopReason":"stop","usage":{"input":51,"output":41,"cacheRead":0,"cacheWrite":0,"totalTokens":92,"cost":{"input":0.00008925,"output":0.000574,"cacheRead":0,"cacheWrite":0,"total":0.00066325}}},{"id":"pass-rate-calculation","input":"If 18 of 24 scored cases pass, what pass rate should be reported?","scored":true,"pass":true,"checks":[{"check":"expectContains","pass":true,"details":"contains \"75\""}],"failedChecks":[],"outputPreview":"Report **75% pass rate** (18 ÷ 24 = 0.75).","latencyMs":1903,"stopReason":"stop","usage":{"input":54,"output":23,"cacheRead":0,"cacheWrite":0,"totalTokens":77,"cost":{"input":0.00009449999999999999,"output":0.000322,"cacheRead":0,"cacheWrite":0,"total":0.0004165}}},{"id":"scored-case-definition","input":"What makes a case scored in this evalset format?","scored":true,"pass":true,"checks":[{"check":"expectRegex","pass":true,"details":"matches /([Ee]xpect|[Cc]heck|[Cc]riteria|[Rr]ule)/m"}],"failedChecks":[],"outputPreview":"A case in this evalset format is usually “scored” by comparing the model’s output to an expected result and assigning points based on match quality.\n\nCommon parts are:\n\n- **Input**: the prompt/task.\n- **Expected output**: reference answer or criteria.\n- **Model output**: 
what the...","latencyMs":3705,"stopReason":"stop","usage":{"input":48,"output":155,"cacheRead":0,"cacheWrite":0,"totalTokens":203,"cost":{"input":0.000084,"output":0.00217,"cacheRead":0,"cacheWrite":0,"total":0.002254}}},{"id":"variant-hash-purpose","input":"What does variantHash help you verify?","scored":true,"pass":true,"checks":[{"check":"expectContains","pass":true,"details":"contains \"variant\""},{"check":"expectContains","pass":true,"details":"contains \"hash\""}],"failedChecks":[],"outputPreview":"`variantHash` helps verify that a specific product/config variant is exactly the one expected.\n\nIn short, it checks **integrity and identity** of the variant:\n- The variant data hasn’t been changed.\n- The selected options match the original variant.\n- You’re referencing the corre...","latencyMs":1986,"stopReason":"stop","usage":{"input":45,"output":66,"cacheRead":0,"cacheWrite":0,"totalTokens":111,"cost":{"input":0.00007875,"output":0.000924,"cacheRead":0,"cacheWrite":0,"total":0.00100275}}},{"id":"dataset-hash-purpose","input":"Why is datasetHash useful when comparing two reports?","scored":true,"pass":true,"checks":[{"check":"expectContains","pass":true,"details":"contains \"dataset\""},{"check":"expectContains","pass":true,"details":"contains \"hash\""}],"failedChecks":[],"outputPreview":"`datasetHash` is useful because it tells you whether two reports were built from the **exact same input data**.\n\nWhen comparing reports, this helps you quickly decide:\n\n- **Same hash** → differences are likely from logic, formatting, or config changes.\n- **Different hash** → diff...","latencyMs":2061,"stopReason":"stop","usage":{"input":47,"output":87,"cacheRead":0,"cacheWrite":0,"totalTokens":134,"cost":{"input":0.00008225,"output":0.001218,"cacheRead":0,"cacheWrite":0,"total":0.00130025}}},{"id":"delta-interpretation-speed-cost","input":"Candidate delta avg latency is -800ms and delta total cost is +0.0003. 
Summarize the tradeoff.","scored":true,"pass":true,"checks":[{"check":"expectContains","pass":true,"details":"contains \"faster\""}],"failedChecks":[],"outputPreview":"The candidate is **much faster** (about **0.8 seconds lower latency**) but **slightly more expensive** (about **$0.0003 more total cost**).\n\nSo the tradeoff is: **pay a tiny bit more for a noticeable speedup**.","latencyMs":1520,"stopReason":"stop","usage":{"input":63,"output":60,"cacheRead":0,"cacheWrite":0,"totalTokens":123,"cost":{"input":0.00011025,"output":0.00084,"cacheRead":0,"cacheWrite":0,"total":0.00095025}}},{"id":"command-non-interactive-pattern","input":"Show the correct non-interactive pattern to run compare from shell.","scored":true,"pass":false,"checks":[{"check":"expectContains","pass":false,"details":"contains \"pi\""},{"check":"expectContains","pass":false,"details":"contains \"-p\""},{"check":"expectContains","pass":false,"details":"contains \"evalset\""}],"failedChecks":["contains \"pi\"","contains \"-p\"","contains \"evalset\""],"outputPreview":"Use ImageMagick’s **compare** in non-interactive mode by giving it files and an output image.\n\n```sh\ncompare input1.png input2.png diff.png\n```\n\nCommon shell-safe pattern (capture metric, suppress image output if not needed):\n\n```sh\nmetric=$(compare -metric RMSE input1.png input2...","latencyMs":4125,"stopReason":"stop","usage":{"input":50,"output":158,"cacheRead":0,"cacheWrite":0,"totalTokens":208,"cost":{"input":0.0000875,"output":0.002212,"cacheRead":0,"cacheWrite":0,"total":0.0022995}}},{"id":"slash-command-shell","input":"Can you run /evalset directly in bash without pi? 
Answer yes or no and one reason.","scored":true,"pass":false,"checks":[{"check":"expectContains","pass":true,"details":"contains \"no\""},{"check":"expectContains","pass":false,"details":"contains \"slash\""}],"failedChecks":["contains \"slash\""],"outputPreview":"No — `/evalset` is a command for the `pi` tool, not a standard bash command.","latencyMs":1092,"stopReason":"stop","usage":{"input":57,"output":26,"cacheRead":0,"cacheWrite":0,"totalTokens":83,"cost":{"input":0.00009975,"output":0.000364,"cacheRead":0,"cacheWrite":0,"total":0.00046375}}},{"id":"report-default-location","input":"If --out is omitted, where are evalset reports written by default?","scored":true,"pass":true,"checks":[{"check":"expectContains","pass":true,"details":"contains \"evalset\""},{"check":"expectContains","pass":true,"details":"contains \"report\""}],"failedChecks":[],"outputPreview":"By default, evalset reports are written to the current working directory (the folder you run the command from).","latencyMs":1133,"stopReason":"stop","usage":{"input":52,"output":26,"cacheRead":0,"cacheWrite":0,"totalTokens":78,"cost":{"input":0.000091,"output":0.000364,"cacheRead":0,"cacheWrite":0,"total":0.000455}}},{"id":"model-prerequisite","input":"What should you do before /evalset run if no active model is selected?","scored":true,"pass":true,"checks":[{"check":"expectContains","pass":true,"details":"contains \"/model\""}],"failedChecks":[],"outputPreview":"Select a model first.\n\nBefore running `/evalset run`, make sure an active model is set (for example with `/model \u003cname>` or by choosing one in your UI). 
Without an active model, the run won’t start.","latencyMs":2342,"stopReason":"stop","usage":{"input":53,"output":51,"cacheRead":0,"cacheWrite":0,"totalTokens":104,"cost":{"input":0.00009275,"output":0.000714,"cacheRead":0,"cacheWrite":0,"total":0.00080675}}},{"id":"max-cases-behavior","input":"What does --max-cases 5 do during run/compare?","scored":true,"pass":true,"checks":[{"check":"expectRegex","pass":true,"details":"matches /([Ff]irst|[Ll]imit|5.*case|case.*5)/m"}],"failedChecks":[],"outputPreview":"`--max-cases 5` limits the compare step to at most **5 test cases**.\n\nSo during `run/compare`, it won’t process all available cases—just the first five (based on whatever order the tool uses). This is useful for quick checks.","latencyMs":1796,"stopReason":"stop","usage":{"input":51,"output":60,"cacheRead":0,"cacheWrite":0,"totalTokens":111,"cost":{"input":0.00008925,"output":0.00084,"cacheRead":0,"cacheWrite":0,"total":0.0009292500000000001}}},{"id":"temperature-range","input":"What is the accepted numeric range for --temperature in this extension?","scored":true,"pass":true,"checks":[{"check":"expectContains","pass":true,"details":"contains \"0\""},{"check":"expectContains","pass":true,"details":"contains \"2\""}],"failedChecks":[],"outputPreview":"Most OpenAI-compatible extensions accept `--temperature` from **0.0 to 2.0**.\n\n- **0.0** = most deterministic\n- **~0.7** = balanced/default in many setups\n- **2.0** = most random\n\nIf this specific extension wraps OpenAI APIs directly, that’s the standard range.","latencyMs":5754,"stopReason":"stop","usage":{"input":50,"output":76,"cacheRead":0,"cacheWrite":0,"totalTokens":126,"cost":{"input":0.0000875,"output":0.001064,"cacheRead":0,"cacheWrite":0,"total":0.0011515}}},{"id":"system-merge-behavior","input":"If dataset.systemPrompt exists and --system-file is provided, are prompts merged or replaced?","scored":true,"pass":false,"checks":[{"check":"expectContains","pass":false,"details":"contains 
\"merge\""}],"failedChecks":["contains \"merge\""],"outputPreview":"Replaced.\n\nIf `--system-file` is provided, it overrides `dataset.systemPrompt` rather than merging with it.","latencyMs":1117,"stopReason":"stop","usage":{"input":55,"output":29,"cacheRead":0,"cacheWrite":0,"totalTokens":84,"cost":{"input":0.00009625,"output":0.000406,"cacheRead":0,"cacheWrite":0,"total":0.00050225}}},{"id":"mutually-exclusive-system-options","input":"Can --system-file and --system-text be used together?","scored":true,"pass":true,"checks":[{"check":"expectRegex","pass":true,"details":"matches /([Nn]o|[Ee]ither|[Oo]ne|not together|[Bb]oth)/m"}],"failedChecks":[],"outputPreview":"Usually, no.\n\nIn most CLIs, `--system-file` and `--system-text` are alternative ways to set the same thing (the system prompt), so using both is either:\n\n- rejected as conflicting, or\n- one overrides the other (often “last one wins”).\n\nCheck your tool’s `--help` for exact behavio...","latencyMs":3438,"stopReason":"stop","usage":{"input":49,"output":83,"cacheRead":0,"cacheWrite":0,"totalTokens":132,"cost":{"input":0.00008575,"output":0.001162,"cacheRead":0,"cacheWrite":0,"total":0.0012477500000000002}}},{"id":"run-identity-fields","input":"Name any three run identity fields in a run report.","scored":true,"pass":false,"checks":[{"check":"expectContains","pass":true,"details":"contains \"run\""},{"check":"expectContains","pass":false,"details":"contains \"dataset\""},{"check":"expectContains","pass":false,"details":"contains \"case\""}],"failedChecks":["contains \"dataset\"","contains \"case\""],"outputPreview":"Three common **run identity** fields in a run report are:\n\n1. **Run ID** \n2. **Run Name** \n3. 
**Start Time**","latencyMs":1055,"stopReason":"stop","usage":{"input":48,"output":37,"cacheRead":0,"cacheWrite":0,"totalTokens":85,"cost":{"input":0.000084,"output":0.000518,"cacheRead":0,"cacheWrite":0,"total":0.000602}}},{"id":"compare-identity-fields","input":"Name the two run-id fields that link baseline and candidate inside compare.run.","scored":true,"pass":true,"checks":[{"check":"expectContains","pass":true,"details":"contains \"baseline\""},{"check":"expectContains","pass":true,"details":"contains \"candidate\""},{"check":"expectContains","pass":true,"details":"contains \"run\""}],"failedChecks":[],"outputPreview":"Inside `compare.run`, the two run-id fields are:\n\n- `baseline_run_id`\n- `candidate_run_id`","latencyMs":1555,"stopReason":"stop","usage":{"input":52,"output":28,"cacheRead":0,"cacheWrite":0,"totalTokens":80,"cost":{"input":0.000091,"output":0.000392,"cacheRead":0,"cacheWrite":0,"total":0.000483}}},{"id":"delta-passrate-zero","input":"If delta pass rate is 0, what does that imply?","scored":true,"pass":true,"checks":[{"check":"expectRegex","pass":true,"details":"matches /([Nn]o change|same pass rate|[Uu]nchanged)/m"}],"failedChecks":[],"outputPreview":"A delta pass rate of **0** means there was **no change** in pass rate between the two points you’re comparing.\n\n- If you expected improvement, none happened.\n- If you expected decline, none happened.\n- Pass rate stayed the same.","latencyMs":1769,"stopReason":"stop","usage":{"input":50,"output":55,"cacheRead":0,"cacheWrite":0,"totalTokens":105,"cost":{"input":0.0000875,"output":0.00077,"cacheRead":0,"cacheWrite":0,"total":0.0008575}}},{"id":"keyword-check-limitation","input":"Why can simple keyword checks be misleading for quality?","scored":true,"pass":false,"checks":[{"check":"expectRegex","pass":false,"details":"matches /([Kk]eyword).*(mislead|false|[Ee]rror|[Bb]rittle)|([Mm]islead|false|[Ee]rror|[Bb]rittle).*[Kk]eyword/m"}],"failedChecks":["matches 
/([Kk]eyword).*(mislead|false|[Ee]rror|[Bb]rittle)|([Mm]islead|false|[Ee]rror|[Bb]rittle).*[Kk]eyword/m"],"outputPreview":"Simple keyword checks can look good while missing real quality.\n\nWhy they mislead:\n- **Can be gamed:** Text can stuff keywords without saying anything useful.\n- **Misses meaning:** Good answers may use different words and still be correct.\n- **No context check:** Keywords don’t s...","latencyMs":2729,"stopReason":"stop","usage":{"input":47,"output":123,"cacheRead":0,"cacheWrite":0,"totalTokens":170,"cost":{"input":0.00008225,"output":0.001722,"cacheRead":0,"cacheWrite":0,"total":0.00180425}}},{"id":"improve-weak-evalset","input":"Give two concrete ways to improve a weak 3-case evalset.","scored":true,"pass":false,"checks":[{"check":"expectContains","pass":false,"details":"contains \"more\""},{"check":"expectContains","pass":true,"details":"contains \"cases\""}],"failedChecks":["contains \"more\""],"outputPreview":"1) **Add targeted edge cases**\n- Find where the model currently fails (or is likely to fail), then add tests for those exact patterns.\n- Example: if your 3 cases are all straightforward Q&A, add one ambiguous prompt and one prompt with conflicting instructions.\n\n2) **Increase div...","latencyMs":2722,"stopReason":"stop","usage":{"input":51,"output":133,"cacheRead":0,"cacheWrite":0,"totalTokens":184,"cost":{"input":0.00008925,"output":0.001862,"cacheRead":0,"cacheWrite":0,"total":0.0019512499999999999}}},{"id":"no-overclaim-rollout","input":"Should this be pitched as a huge replacement right away? 
Answer yes or no and one short reason.","scored":true,"pass":false,"checks":[{"check":"expectContains","pass":true,"details":"contains \"no\""},{"check":"expectContains","pass":false,"details":"contains \"phase\""}],"failedChecks":["contains \"phase\""],"outputPreview":"No — big replacements are risky; start small to prove value first.","latencyMs":1205,"stopReason":"stop","usage":{"input":57,"output":18,"cacheRead":0,"cacheWrite":0,"totalTokens":75,"cost":{"input":0.00009975,"output":0.000252,"cacheRead":0,"cacheWrite":0,"total":0.00035175}}},{"id":"stakeholder-brief","input":"Write a one-line stakeholder brief that includes pilot scope and measurement.","scored":true,"pass":true,"checks":[{"check":"expectContains","pass":true,"details":"contains \"pilot\""},{"check":"expectContains","pass":true,"details":"contains \"measure\""}],"failedChecks":[],"outputPreview":"Pilot scope: run the new process with two teams for six weeks; measurement: compare speed, error rate, and user satisfaction against current baseline and report weekly.","latencyMs":1829,"stopReason":"stop","usage":{"input":50,"output":36,"cacheRead":0,"cacheWrite":0,"totalTokens":86,"cost":{"input":0.0000875,"output":0.000504,"cacheRead":0,"cacheWrite":0,"total":0.0005915}}},{"id":"tie-communication","input":"Baseline and candidate both scored 33.3% pass rate. 
How should that be communicated?","scored":true,"pass":false,"checks":[{"check":"expectRegex","pass":false,"details":"matches /([Ss]ame|[Nn]o difference|[Uu]nchanged)/m"}],"failedChecks":["matches /([Ss]ame|[Nn]o difference|[Uu]nchanged)/m"],"outputPreview":"Say it plainly:\n\n**“The candidate matched the baseline: both achieved a 33.3% pass rate (no change).”**\n\nIf you want slightly more context:\n\n**“The candidate did not improve over baseline; both had a 33.3% pass rate.”**","latencyMs":1925,"stopReason":"stop","usage":{"input":56,"output":61,"cacheRead":0,"cacheWrite":0,"totalTokens":117,"cost":{"input":0.000098,"output":0.0008539999999999999,"cacheRead":0,"cacheWrite":0,"total":0.0009519999999999999}}}]},"candidate":{"kind":"evalset-run","createdAt":"2026-02-16T14:41:23.888Z","run":{"runId":"f68bccd1-4bff-4145-9f09-d3eca49aecd8","startedAt":"2026-02-16T14:40:35.057Z","finishedAt":"2026-02-16T14:41:23.888Z","modelKey":"openai-codex/gpt-5.3-codex","temperature":null,"datasetHash":"2851c4b6d9e9fd4961be234d2b4602431a46f4adb305eb20b0dd3120e8df64dc","casesHash":"49ca1c743474d713ee966235b03e224e861726e8b56c3aacb625227af7161aca","variantHash":"7e4c4da97cf0d1ea77c0f613c44752a9d2d464cd0c1f26e7f86c69c546569a9d"},"dataset":{"name":"maintainer-clarity-v3","path":"/home/lightningralf/programming/pi-extensions/pi-evalset-lab/examples/fixed-task-set-v3.json"},"model":{"provider":"openai-codex","id":"gpt-5.3-codex","api":"openai-codex-responses"},"variant":{"name":"candidate","systemPrompt":"Answer in concise, plain English. 
Prefer direct wording over jargon.\n\nYou are a concise technical assistant.\n\nPriorities:\n1) Explain terms in plain language first.\n2) Keep each answer to 2-4 sentences.\n3) Prefer concrete examples over abstractions.","source":"dataset.systemPrompt + file:/home/lightningralf/programming/pi-extensions/pi-evalset-lab/examples/system-candidate.txt"},"totals":{"cases":24,"scoredCases":24,"passedCases":18,"failedCases":6,"passRate":0.75,"totalLatencyMs":48831,"avgLatencyMs":2034.625,"usage":{"input":1861,"output":1708,"cacheRead":0,"cacheWrite":0,"totalTokens":3569,"cost":{"input":0.0032567499999999992,"output":0.023912000000000003,"cacheRead":0,"cacheWrite":0,"total":0.02716875}}},"cases":[{"id":"fixed-task-set-definition","input":"In plain English, what does a fixed task set mean in eval workflows?","scored":true,"pass":true,"checks":[{"check":"expectContains","pass":true,"details":"contains \"same\""},{"check":"expectContains","pass":true,"details":"contains \"tasks\""}],"failedChecks":[],"outputPreview":"A **fixed task set** means you always test on the same predefined list of tasks every time you run an evaluation. \nThis makes results comparable across model versions, because the test didn’t change. \nExample: if your eval always includes the same 500 QA prompts, score changes ...","latencyMs":1892,"stopReason":"stop","usage":{"input":78,"output":74,"cacheRead":0,"cacheWrite":0,"totalTokens":152,"cost":{"input":0.0001365,"output":0.001036,"cacheRead":0,"cacheWrite":0,"total":0.0011725}}},{"id":"fixed-task-set-benefit","input":"Why does a fixed task set improve comparison quality across runs?","scored":true,"pass":true,"checks":[{"check":"expectContains","pass":true,"details":"contains \"compar\""}],"failedChecks":[],"outputPreview":"A fixed task set means every run is tested on the exact same jobs, so differences in results come from the system changes, not from easier or harder tasks. This reduces random noise and makes comparisons fair and repeatable. 
For example, if Run A and Run B both solve the same 100...","latencyMs":1950,"stopReason":"stop","usage":{"input":75,"output":85,"cacheRead":0,"cacheWrite":0,"totalTokens":160,"cost":{"input":0.00013125,"output":0.00119,"cacheRead":0,"cacheWrite":0,"total":0.0013212500000000002}}},{"id":"reproducibility-two-factors","input":"Name two things you should lock or record for reproducible eval runs.","scored":true,"pass":true,"checks":[{"check":"expectContains","pass":true,"details":"contains \"model\""},{"check":"expectContains","pass":true,"details":"contains \"dataset\""}],"failedChecks":[],"outputPreview":"Lock the **dataset version** and the **model version/checkpoint**. \nAlso record the **random seed** and the **exact eval settings** (like temperature, max tokens, and prompt template) so reruns use the same conditions.","latencyMs":1651,"stopReason":"stop","usage":{"input":77,"output":53,"cacheRead":0,"cacheWrite":0,"totalTokens":130,"cost":{"input":0.00013475,"output":0.000742,"cacheRead":0,"cacheWrite":0,"total":0.00087675}}},{"id":"pass-rate-calculation","input":"If 18 of 24 scored cases pass, what pass rate should be reported?","scored":true,"pass":true,"checks":[{"check":"expectContains","pass":true,"details":"contains \"75\""}],"failedChecks":[],"outputPreview":"Report a **75% pass rate**. 
\nCalculation: \\(18 \\div 24 = 0.75\\), then \\(0.75 \\times 100 = 75\\%\\).","latencyMs":1101,"stopReason":"stop","usage":{"input":80,"output":44,"cacheRead":0,"cacheWrite":0,"totalTokens":124,"cost":{"input":0.00014,"output":0.000616,"cacheRead":0,"cacheWrite":0,"total":0.0007559999999999999}}},{"id":"scored-case-definition","input":"What makes a case scored in this evalset format?","scored":true,"pass":true,"checks":[{"check":"expectRegex","pass":true,"details":"matches /([Ee]xpect|[Cc]heck|[Cc]riteria|[Rr]ule)/m"}],"failedChecks":[],"outputPreview":"A case in this evalset format is usually one test example with clear fields like an input, the expected output (or rubric), and sometimes metadata (ID, tags, difficulty). \nIt’s “scored” by comparing the model’s actual output to that expected output using a rule, such as exact ma...","latencyMs":2797,"stopReason":"stop","usage":{"input":74,"output":121,"cacheRead":0,"cacheWrite":0,"totalTokens":195,"cost":{"input":0.0001295,"output":0.001694,"cacheRead":0,"cacheWrite":0,"total":0.0018235}}},{"id":"variant-hash-purpose","input":"What does variantHash help you verify?","scored":true,"pass":true,"checks":[{"check":"expectContains","pass":true,"details":"contains \"variant\""},{"check":"expectContains","pass":true,"details":"contains \"hash\""}],"failedChecks":[],"outputPreview":"`variantHash` helps you check that a specific product variant’s data hasn’t changed. \nIt acts like a fingerprint of key fields (like size, color, price, or SKU), so if any of those values change, the hash changes too. 
\nFor example, when updating a cart item, you can compare has...","latencyMs":2180,"stopReason":"stop","usage":{"input":71,"output":83,"cacheRead":0,"cacheWrite":0,"totalTokens":154,"cost":{"input":0.00012425,"output":0.001162,"cacheRead":0,"cacheWrite":0,"total":0.00128625}}},{"id":"dataset-hash-purpose","input":"Why is datasetHash useful when comparing two reports?","scored":true,"pass":true,"checks":[{"check":"expectContains","pass":true,"details":"contains \"dataset\""},{"check":"expectContains","pass":true,"details":"contains \"hash\""}],"failedChecks":[],"outputPreview":"`datasetHash` is a fingerprint of the underlying data used to build a report. If two reports have the same hash, they were generated from the same dataset, so differences are likely from filters, settings, or calculations—not source data changes. If the hashes differ, the data it...","latencyMs":2543,"stopReason":"stop","usage":{"input":73,"output":93,"cacheRead":0,"cacheWrite":0,"totalTokens":166,"cost":{"input":0.00012775,"output":0.001302,"cacheRead":0,"cacheWrite":0,"total":0.00142975}}},{"id":"delta-interpretation-speed-cost","input":"Candidate delta avg latency is -800ms and delta total cost is +0.0003. Summarize the tradeoff.","scored":true,"pass":true,"checks":[{"check":"expectContains","pass":true,"details":"contains \"faster\""}],"failedChecks":[],"outputPreview":"This candidate is much faster: average latency improves by **800 ms** (about **0.8 seconds** per request). 
\nThe tradeoff is a tiny cost increase of **0.0003** total, so you’re paying slightly more for a noticeable speed gain.","latencyMs":1126,"stopReason":"stop","usage":{"input":89,"output":59,"cacheRead":0,"cacheWrite":0,"totalTokens":148,"cost":{"input":0.00015575,"output":0.000826,"cacheRead":0,"cacheWrite":0,"total":0.00098175}}},{"id":"command-non-interactive-pattern","input":"Show the correct non-interactive pattern to run compare from shell.","scored":true,"pass":false,"checks":[{"check":"expectContains","pass":false,"details":"contains \"pi\""},{"check":"expectContains","pass":false,"details":"contains \"-p\""},{"check":"expectContains","pass":false,"details":"contains \"evalset\""}],"failedChecks":["contains \"pi\"","contains \"-p\"","contains \"evalset\""],"outputPreview":"Use **ImageMagick’s `compare`** with all file paths explicitly, and write output to a file (not a display window). \nExample:\n\n```sh\ncompare -metric AE input.png reference.png diff.png 2>metric.txt\n```\n\nThis runs fully non-interactive: `diff.png` gets the visual diff, and the num...","latencyMs":3520,"stopReason":"stop","usage":{"input":76,"output":88,"cacheRead":0,"cacheWrite":0,"totalTokens":164,"cost":{"input":0.000133,"output":0.001232,"cacheRead":0,"cacheWrite":0,"total":0.0013650000000000001}}},{"id":"slash-command-shell","input":"Can you run /evalset directly in bash without pi? 
Answer yes or no and one reason.","scored":true,"pass":true,"checks":[{"check":"expectContains","pass":true,"details":"contains \"no\""},{"check":"expectContains","pass":true,"details":"contains \"slash\""}],"failedChecks":[],"outputPreview":"No — `/evalset` is a slash-command meant for the chat tool, not a normal Bash executable, so Bash won’t run it directly.","latencyMs":1107,"stopReason":"stop","usage":{"input":83,"output":34,"cacheRead":0,"cacheWrite":0,"totalTokens":117,"cost":{"input":0.00014525,"output":0.00047599999999999997,"cacheRead":0,"cacheWrite":0,"total":0.00062125}}},{"id":"report-default-location","input":"If --out is omitted, where are evalset reports written by default?","scored":true,"pass":true,"checks":[{"check":"expectContains","pass":true,"details":"contains \"evalset\""},{"check":"expectContains","pass":true,"details":"contains \"report\""}],"failedChecks":[],"outputPreview":"If `--out` is not provided, evalset reports are written to the current working directory (the folder you ran the command from). \nMost tools use a default filename there, often based on the evalset or run name. \nExample: if you run the command in `/home/user/project`, the report...","latencyMs":1964,"stopReason":"stop","usage":{"input":78,"output":74,"cacheRead":0,"cacheWrite":0,"totalTokens":152,"cost":{"input":0.0001365,"output":0.001036,"cacheRead":0,"cacheWrite":0,"total":0.0011725}}},{"id":"model-prerequisite","input":"What should you do before /evalset run if no active model is selected?","scored":true,"pass":false,"checks":[{"check":"expectContains","pass":false,"details":"contains \"/model\""}],"failedChecks":["contains \"/model\""],"outputPreview":"Before running `/evalset run`, select an active model first. \nFor example, set one with the model selection command in your tool (like choosing `gpt-4.1`), then run the eval. 
\nIf no model is active, the run usually fails or uses an unintended default.","latencyMs":1319,"stopReason":"stop","usage":{"input":79,"output":65,"cacheRead":0,"cacheWrite":0,"totalTokens":144,"cost":{"input":0.00013825,"output":0.00091,"cacheRead":0,"cacheWrite":0,"total":0.00104825}}},{"id":"max-cases-behavior","input":"What does --max-cases 5 do during run/compare?","scored":true,"pass":true,"checks":[{"check":"expectRegex","pass":true,"details":"matches /([Ff]irst|[Ll]imit|5.*case|case.*5)/m"}],"failedChecks":[],"outputPreview":"`--max-cases 5` limits the compare run to at most **5 test cases**. \nIt’s useful for quick checks, like sampling a few cases instead of running everything. \nExample: if a dataset has 200 cases, `run/compare --max-cases 5` will only execute and compare 5 of them.","latencyMs":1705,"stopReason":"stop","usage":{"input":77,"output":75,"cacheRead":0,"cacheWrite":0,"totalTokens":152,"cost":{"input":0.00013475,"output":0.00105,"cacheRead":0,"cacheWrite":0,"total":0.00118475}}},{"id":"temperature-range","input":"What is the accepted numeric range for --temperature in this extension?","scored":true,"pass":true,"checks":[{"check":"expectContains","pass":true,"details":"contains \"0\""},{"check":"expectContains","pass":true,"details":"contains \"2\""}],"failedChecks":[],"outputPreview":"In this extension, `--temperature` is accepted from **0.0 to 2.0**. 
\nUse lower values (like `0.2`) for more predictable output, and higher values (like `1.2`) for more varied/creative output.","latencyMs":1546,"stopReason":"stop","usage":{"input":76,"output":59,"cacheRead":0,"cacheWrite":0,"totalTokens":135,"cost":{"input":0.000133,"output":0.000826,"cacheRead":0,"cacheWrite":0,"total":0.000959}}},{"id":"system-merge-behavior","input":"If dataset.systemPrompt exists and --system-file is provided, are prompts merged or replaced?","scored":true,"pass":true,"checks":[{"check":"expectContains","pass":true,"details":"contains \"merge\""}],"failedChecks":[],"outputPreview":"They are **replaced**, not merged. \nIf `--system-file` is provided, its content becomes the system prompt used at runtime, overriding `dataset.systemPrompt`. \nExample: if the dataset says “be terse” but the file says “be detailed,” the file’s instruction is what gets used.","latencyMs":1834,"stopReason":"stop","usage":{"input":81,"output":67,"cacheRead":0,"cacheWrite":0,"totalTokens":148,"cost":{"input":0.00014175,"output":0.000938,"cacheRead":0,"cacheWrite":0,"total":0.00107975}}},{"id":"mutually-exclusive-system-options","input":"Can --system-file and --system-text be used together?","scored":true,"pass":true,"checks":[{"check":"expectRegex","pass":true,"details":"matches /([Nn]o|[Ee]ither|[Oo]ne|not together|[Bb]oth)/m"}],"failedChecks":[],"outputPreview":"Usually, no—you should treat them as alternatives. \n`--system-file` loads the system prompt from a file, while `--system-text` sets it directly in the command, and most tools expect only one source. 
\nIf both are provided, many CLIs either error out or let one override the other...","latencyMs":2298,"stopReason":"stop","usage":{"input":75,"output":81,"cacheRead":0,"cacheWrite":0,"totalTokens":156,"cost":{"input":0.00013125,"output":0.001134,"cacheRead":0,"cacheWrite":0,"total":0.00126525}}},{"id":"run-identity-fields","input":"Name any three run identity fields in a run report.","scored":true,"pass":false,"checks":[{"check":"expectContains","pass":true,"details":"contains \"run\""},{"check":"expectContains","pass":false,"details":"contains \"dataset\""},{"check":"expectContains","pass":false,"details":"contains \"case\""}],"failedChecks":["contains \"dataset\"","contains \"case\""],"outputPreview":"Three common **run identity** fields are:\n\n1. **Run ID** – a unique number or code for that specific run. \n2. **Run Name** – a human-readable label (for example, “Nightly Regression”). \n3. **Start Time** – when the run began, which helps distinguish runs with similar names.","latencyMs":1593,"stopReason":"stop","usage":{"input":74,"output":72,"cacheRead":0,"cacheWrite":0,"totalTokens":146,"cost":{"input":0.0001295,"output":0.001008,"cacheRead":0,"cacheWrite":0,"total":0.0011375}}},{"id":"compare-identity-fields","input":"Name the two run-id fields that link baseline and candidate inside compare.run.","scored":true,"pass":true,"checks":[{"check":"expectContains","pass":true,"details":"contains \"baseline\""},{"check":"expectContains","pass":true,"details":"contains \"candidate\""},{"check":"expectContains","pass":true,"details":"contains \"run\""}],"failedChecks":[],"outputPreview":"Inside `compare.run`, the two run ID fields are **`baseline_run_id`** and **`candidate_run_id`**. 
\nThey point to the specific baseline run and candidate run being compared.","latencyMs":1627,"stopReason":"stop","usage":{"input":78,"output":45,"cacheRead":0,"cacheWrite":0,"totalTokens":123,"cost":{"input":0.0001365,"output":0.00063,"cacheRead":0,"cacheWrite":0,"total":0.0007665}}},{"id":"delta-passrate-zero","input":"If delta pass rate is 0, what does that imply?","scored":true,"pass":true,"checks":[{"check":"expectRegex","pass":true,"details":"matches /([Nn]o change|same pass rate|[Uu]nchanged)/m"}],"failedChecks":[],"outputPreview":"A delta pass rate of 0 means there was no change in pass rate between the two groups or time periods you compared. \nFor example, if both before and after pass rates are 78%, the delta is 0. \nIt implies performance stayed flat—neither improved nor declined.","latencyMs":2743,"stopReason":"stop","usage":{"input":76,"output":63,"cacheRead":0,"cacheWrite":0,"totalTokens":139,"cost":{"input":0.000133,"output":0.000882,"cacheRead":0,"cacheWrite":0,"total":0.001015}}},{"id":"keyword-check-limitation","input":"Why can simple keyword checks be misleading for quality?","scored":true,"pass":false,"checks":[{"check":"expectRegex","pass":false,"details":"matches /([Kk]eyword).*(mislead|false|[Ee]rror|[Bb]rittle)|([Mm]islead|false|[Ee]rror|[Bb]rittle).*[Kk]eyword/m"}],"failedChecks":["matches /([Kk]eyword).*(mislead|false|[Ee]rror|[Bb]rittle)|([Mm]islead|false|[Ee]rror|[Bb]rittle).*[Kk]eyword/m"],"outputPreview":"Simple keyword checks only verify that certain words appear, not that the content is correct, useful, or coherent. A bad answer can “game” the check by stuffing keywords without actually solving the problem. 
For example, a support reply might include words like “refund,” “policy,...","latencyMs":2609,"stopReason":"stop","usage":{"input":73,"output":84,"cacheRead":0,"cacheWrite":0,"totalTokens":157,"cost":{"input":0.00012775,"output":0.001176,"cacheRead":0,"cacheWrite":0,"total":0.00130375}}},{"id":"improve-weak-evalset","input":"Give two concrete ways to improve a weak 3-case evalset.","scored":true,"pass":false,"checks":[{"check":"expectContains","pass":false,"details":"contains \"more\""},{"check":"expectContains","pass":true,"details":"contains \"cases\""}],"failedChecks":["contains \"more\""],"outputPreview":"1) **Add realistic hard negatives.** If your eval set only has obvious “wrong” answers, include tricky lookalikes (for example, same topic but subtly incorrect facts) so the model must truly understand, not just keyword-match. \n2) **Expand case diversity with clear labels.** Tur...","latencyMs":3274,"stopReason":"stop","usage":{"input":77,"output":116,"cacheRead":0,"cacheWrite":0,"totalTokens":193,"cost":{"input":0.00013475,"output":0.001624,"cacheRead":0,"cacheWrite":0,"total":0.00175875}}},{"id":"no-overclaim-rollout","input":"Should this be pitched as a huge replacement right away? 
Answer yes or no and one short reason.","scored":true,"pass":false,"checks":[{"check":"expectContains","pass":true,"details":"contains \"no\""},{"check":"expectContains","pass":false,"details":"contains \"phase\""}],"failedChecks":["contains \"phase\""],"outputPreview":"No — it’s usually better to position it as a low-risk add-on first, so people can see value before committing to a full replacement.","latencyMs":1175,"stopReason":"stop","usage":{"input":83,"output":33,"cacheRead":0,"cacheWrite":0,"totalTokens":116,"cost":{"input":0.00014525,"output":0.000462,"cacheRead":0,"cacheWrite":0,"total":0.00060725}}},{"id":"stakeholder-brief","input":"Write a one-line stakeholder brief that includes pilot scope and measurement.","scored":true,"pass":true,"checks":[{"check":"expectContains","pass":true,"details":"contains \"pilot\""},{"check":"expectContains","pass":true,"details":"contains \"measure\""}],"failedChecks":[],"outputPreview":"Pilot scope: run a 6‑week trial with 50 users in two teams using the new workflow end‑to‑end, and measure success by adoption rate, task completion time, error rate, and weekly satisfaction score against baseline.","latencyMs":1598,"stopReason":"stop","usage":{"input":76,"output":52,"cacheRead":0,"cacheWrite":0,"totalTokens":128,"cost":{"input":0.000133,"output":0.000728,"cacheRead":0,"cacheWrite":0,"total":0.000861}}},{"id":"tie-communication","input":"Baseline and candidate both scored 33.3% pass rate. 
How should that be communicated?","scored":true,"pass":true,"checks":[{"check":"expectRegex","pass":true,"details":"matches /([Ss]ame|[Nn]o difference|[Uu]nchanged)/m"}],"failedChecks":[],"outputPreview":"Say it directly: **“The candidate matched the baseline at 33.3% pass rate, so there was no measured improvement.”** \nIf helpful, add context like sample size: **“Both were 33.3% (e.g., 10/30), indicating parity rather than gain.”** \nAvoid wording like “performed better” or “reg...","latencyMs":3679,"stopReason":"stop","usage":{"input":82,"output":88,"cacheRead":0,"cacheWrite":0,"totalTokens":170,"cost":{"input":0.0001435,"output":0.001232,"cacheRead":0,"cacheWrite":0,"total":0.0013755}}}]},"delta":{"passRate":0.08333333333333337,"avgLatencyMs":-238.625,"totalCost":0.0019879999999999967}}</script>
|
|
73
|
+
<script>
|
|
74
|
+
const report = JSON.parse(document.getElementById('report-data').textContent);
|
|
75
|
+
|
|
76
|
+
const pct = (v) => v == null ? 'n/a' : (v * 100).toFixed(1) + '%';
|
|
77
|
+
const money = (v) => '$' + Number(v ?? 0).toFixed(6);
|
|
78
|
+
const ms = (v) => Number(v ?? 0).toFixed(0) + ' ms';
|
|
79
|
+
const esc = (s) => String(s ?? '').replaceAll('&','&amp;').replaceAll('<','&lt;').replaceAll('>','&gt;');
|
|
80
|
+
const passPill = (pass) => pass ? '<span class="pill ok">PASS</span>' : '<span class="pill bad">FAIL</span>';
|
|
81
|
+
|
|
82
|
+
const meta = document.getElementById('meta');
|
|
83
|
+
meta.innerHTML = [
|
|
84
|
+
report.dataset?.name,
|
|
85
|
+
'run ' + report.run?.runId,
|
|
86
|
+
'model ' + report.model?.provider + '/' + report.model?.id,
|
|
87
|
+
'sample source: .evalset/reports/compare-maintainer-clarity-v3-20260216T154123.json'
|
|
88
|
+
].map(esc).join(' · ');
|
|
89
|
+
|
|
90
|
+
const summaryItems = [
|
|
91
|
+
['Baseline pass', pct(report.baseline?.totals?.passRate) + ' (' + report.baseline?.totals?.passedCases + '/' + report.baseline?.totals?.scoredCases + ')'],
|
|
92
|
+
['Candidate pass', pct(report.candidate?.totals?.passRate) + ' (' + report.candidate?.totals?.passedCases + '/' + report.candidate?.totals?.scoredCases + ')'],
|
|
93
|
+
['Δ pass rate', pct(report.delta?.passRate)],
|
|
94
|
+
['Δ avg latency', ms(report.delta?.avgLatencyMs)],
|
|
95
|
+
['Δ total cost', money(report.delta?.totalCost)],
|
|
96
|
+
['Dataset hash', report.run?.datasetHash?.slice(0, 12) + '…']
|
|
97
|
+
];
|
|
98
|
+
|
|
99
|
+
document.getElementById('summary').innerHTML = summaryItems
|
|
100
|
+
.map(([label, val]) => '<div class="card"><div class="label">' + esc(label) + '</div><div class="val">' + esc(val) + '</div></div>')
|
|
101
|
+
.join('');
|
|
102
|
+
|
|
103
|
+
const candidateById = new Map((report.candidate?.cases || []).map((c) => [c.id, c]));
|
|
104
|
+
let improved = 0;
|
|
105
|
+
let regressed = 0;
|
|
106
|
+
|
|
107
|
+
const rows = (report.baseline?.cases || []).map((b, idx) => {
|
|
108
|
+
const c = candidateById.get(b.id) || {};
|
|
109
|
+
let outcome = 'same';
|
|
110
|
+
if (!b.pass && c.pass) { outcome = 'improved'; improved += 1; }
|
|
111
|
+
else if (b.pass && !c.pass) { outcome = 'regressed'; regressed += 1; }
|
|
112
|
+
|
|
113
|
+
const outcomePill = outcome === 'improved'
|
|
114
|
+
? '<span class="pill imp">Improved</span>'
|
|
115
|
+
: outcome === 'regressed'
|
|
116
|
+
? '<span class="pill reg">Regressed</span>'
|
|
117
|
+
: '<span class="pill same">No change</span>';
|
|
118
|
+
|
|
119
|
+
const bChecks = (b.checks || []).map((x) => (x.pass ? '✅ ' : '❌ ') + x.details).join('
|
|
120
|
+
') || 'None';
|
|
121
|
+
const cChecks = (c.checks || []).map((x) => (x.pass ? '✅ ' : '❌ ') + x.details).join('
|
|
122
|
+
') || 'None';
|
|
123
|
+
|
|
124
|
+
return '<tr>' +
|
|
125
|
+
'<td>' + (idx + 1) + '</td>' +
|
|
126
|
+
'<td><code>' + esc(b.id) + '</code></td>' +
|
|
127
|
+
'<td>' + passPill(!!b.pass) + '<div class="meta">' + ms(b.latencyMs) + '</div></td>' +
|
|
128
|
+
'<td>' + passPill(!!c.pass) + '<div class="meta">' + ms(c.latencyMs) + '</div></td>' +
|
|
129
|
+
'<td>' + outcomePill + '</td>' +
|
|
130
|
+
'<td><details><summary>checks + output</summary>' +
|
|
131
|
+
'<div class="split">' +
|
|
132
|
+
'<div><h4 style="margin:8px 0 6px;">Baseline checks</h4><pre>' + esc(bChecks) + '</pre><h4 style="margin:8px 0 6px;">Baseline output</h4><pre>' + esc(b.outputPreview || '') + '</pre></div>' +
|
|
133
|
+
'<div><h4 style="margin:8px 0 6px;">Candidate checks</h4><pre>' + esc(cChecks) + '</pre><h4 style="margin:8px 0 6px;">Candidate output</h4><pre>' + esc(c.outputPreview || '') + '</pre></div>' +
|
|
134
|
+
'</div>' +
|
|
135
|
+
'</details></td>' +
|
|
136
|
+
'</tr>';
|
|
137
|
+
}).join('');
|
|
138
|
+
|
|
139
|
+
document.getElementById('rows').innerHTML = rows;
|
|
140
|
+
</script>
|
|
141
|
+
</body>
|
|
142
|
+
</html>
|
|
Binary file
|
package/package.json
CHANGED
|
@@ -1,10 +1,29 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "pi-evalset-lab",
|
|
3
|
-
"version": "0.1.0",
|
|
4
|
-
"description": "pi extension
|
|
3
|
+
"version": "0.2.0",
|
|
4
|
+
"description": "pi extension for fixed-task-set eval runs and prompt/system comparisons",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"license": "MIT",
|
|
7
|
-
"keywords": [
|
|
7
|
+
"keywords": [
|
|
8
|
+
"pi-package",
|
|
9
|
+
"pi-extension",
|
|
10
|
+
"evalset",
|
|
11
|
+
"llm-eval",
|
|
12
|
+
"prompt-evaluation",
|
|
13
|
+
"fixed-task-set",
|
|
14
|
+
"ux-observability",
|
|
15
|
+
"safety-governance",
|
|
16
|
+
"context-codebase-mapping",
|
|
17
|
+
"web-docs-retrieval",
|
|
18
|
+
"background-processes",
|
|
19
|
+
"review-quality-loops",
|
|
20
|
+
"planning-orchestration",
|
|
21
|
+
"subagents-parallelization",
|
|
22
|
+
"model-prompt-management",
|
|
23
|
+
"interactive-clis-editors",
|
|
24
|
+
"skills-rules-packs",
|
|
25
|
+
"paste-code-extraction"
|
|
26
|
+
],
|
|
8
27
|
"repository": {
|
|
9
28
|
"type": "git",
|
|
10
29
|
"url": "git+https://github.com/tryingET/pi-evalset-lab.git"
|
|
@@ -13,16 +32,29 @@
|
|
|
13
32
|
"url": "https://github.com/tryingET/pi-evalset-lab/issues"
|
|
14
33
|
},
|
|
15
34
|
"homepage": "https://github.com/tryingET/pi-evalset-lab#readme",
|
|
35
|
+
"files": [
|
|
36
|
+
"extensions/evalset.ts",
|
|
37
|
+
"prompts",
|
|
38
|
+
"examples",
|
|
39
|
+
"scripts/export-evalset-report-html.mjs",
|
|
40
|
+
"README.md",
|
|
41
|
+
"CHANGELOG.md"
|
|
42
|
+
],
|
|
16
43
|
"scripts": {
|
|
17
44
|
"check": "bash ./scripts/validate-structure.sh",
|
|
18
45
|
"test": "bash ./scripts/validate-structure.sh",
|
|
19
46
|
"docs:list": "bash ./scripts/docs-list.sh",
|
|
20
47
|
"docs:list:workspace": "bash ./scripts/docs-list.sh --workspace --discover",
|
|
21
|
-
"docs:list:json": "bash ./scripts/docs-list.sh --json"
|
|
48
|
+
"docs:list:json": "bash ./scripts/docs-list.sh --json",
|
|
49
|
+
"evalset:export-html": "node ./scripts/export-evalset-report-html.mjs"
|
|
22
50
|
},
|
|
23
51
|
"pi": {
|
|
24
|
-
"extensions": [
|
|
25
|
-
|
|
52
|
+
"extensions": [
|
|
53
|
+
"./extensions/evalset.ts"
|
|
54
|
+
],
|
|
55
|
+
"prompts": [
|
|
56
|
+
"./prompts"
|
|
57
|
+
]
|
|
26
58
|
},
|
|
27
59
|
"peerDependencies": {
|
|
28
60
|
"@mariozechner/pi-coding-agent": "*",
|