prism-evals 0.9.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- prism_evals-0.9.0/.github/workflows/publish-python.yml +80 -0
- prism_evals-0.9.0/.gitignore +24 -0
- prism_evals-0.9.0/AGENTS.md +276 -0
- prism_evals-0.9.0/CHANGELOG.md +159 -0
- prism_evals-0.9.0/MIGRATION.md +86 -0
- prism_evals-0.9.0/PKG-INFO +722 -0
- prism_evals-0.9.0/README.md +691 -0
- prism_evals-0.9.0/examples/01_csv_qa.py +40 -0
- prism_evals-0.9.0/examples/02_json_import.py +54 -0
- prism_evals-0.9.0/examples/03_image_generation.py +48 -0
- prism_evals-0.9.0/examples/04_config_options.py +84 -0
- prism_evals-0.9.0/examples/05_multistep_agent.py +129 -0
- prism_evals-0.9.0/examples/datasets/config_options.csv +3 -0
- prism_evals-0.9.0/examples/datasets/image_prompts.csv +2 -0
- prism_evals-0.9.0/examples/datasets/json_cases/daypack.json +19 -0
- prism_evals-0.9.0/examples/datasets/json_cases/rain_shell.json +19 -0
- prism_evals-0.9.0/examples/datasets/qa.csv +3 -0
- prism_evals-0.9.0/examples/datasets/support_tickets.csv +5 -0
- prism_evals-0.9.0/examples/prompts/config_system.md +1 -0
- prism_evals-0.9.0/pyproject.toml +77 -0
- prism_evals-0.9.0/src/prism_evals/__init__.py +70 -0
- prism_evals-0.9.0/src/prism_evals/__main__.py +7 -0
- prism_evals-0.9.0/src/prism_evals/_utils.py +112 -0
- prism_evals-0.9.0/src/prism_evals/artifacts.py +115 -0
- prism_evals-0.9.0/src/prism_evals/builtins.py +555 -0
- prism_evals-0.9.0/src/prism_evals/cli.py +390 -0
- prism_evals-0.9.0/src/prism_evals/console.py +308 -0
- prism_evals-0.9.0/src/prism_evals/datasets.py +247 -0
- prism_evals-0.9.0/src/prism_evals/errors.py +19 -0
- prism_evals-0.9.0/src/prism_evals/evaluation.py +96 -0
- prism_evals-0.9.0/src/prism_evals/experiment.py +207 -0
- prism_evals-0.9.0/src/prism_evals/models.py +260 -0
- prism_evals-0.9.0/src/prism_evals/openai.py +1027 -0
- prism_evals-0.9.0/src/prism_evals/runner.py +297 -0
- prism_evals-0.9.0/src/prism_evals/scaffold.py +49 -0
- prism_evals-0.9.0/src/prism_evals/storage.py +435 -0
- prism_evals-0.9.0/src/prism_evals/templates/AGENTS.md +220 -0
- prism_evals-0.9.0/tests/conftest.py +72 -0
- prism_evals-0.9.0/tests/test_artifacts.py +106 -0
- prism_evals-0.9.0/tests/test_builtins.py +235 -0
- prism_evals-0.9.0/tests/test_cli_viewer.py +360 -0
- prism_evals-0.9.0/tests/test_console.py +90 -0
- prism_evals-0.9.0/tests/test_datasets.py +77 -0
- prism_evals-0.9.0/tests/test_experiment_api.py +107 -0
- prism_evals-0.9.0/tests/test_openai_wrapper.py +519 -0
- prism_evals-0.9.0/tests/test_runner.py +594 -0
- prism_evals-0.9.0/tests/test_scaffold.py +88 -0
- prism_evals-0.9.0/viewer/AGENTS.md +49 -0
- prism_evals-0.9.0/viewer/app/api/compare/route.ts +32 -0
- prism_evals-0.9.0/viewer/app/api/runs/[runKey]/records/route.ts +23 -0
- prism_evals-0.9.0/viewer/app/api/runs/[runKey]/route.ts +22 -0
- prism_evals-0.9.0/viewer/app/api/runs/route.ts +17 -0
- prism_evals-0.9.0/viewer/app/artifacts/[runKey]/[artifactName]/route.ts +44 -0
- prism_evals-0.9.0/viewer/app/compare/page.tsx +5 -0
- prism_evals-0.9.0/viewer/app/globals.css +43 -0
- prism_evals-0.9.0/viewer/app/layout.tsx +62 -0
- prism_evals-0.9.0/viewer/app/media/[runKey]/[...mediaPath]/route.ts +46 -0
- prism_evals-0.9.0/viewer/app/page.tsx +5 -0
- prism_evals-0.9.0/viewer/app/runs/[runKey]/page.tsx +10 -0
- prism_evals-0.9.0/viewer/components/ComparePage.tsx +443 -0
- prism_evals-0.9.0/viewer/components/RunDetailPage.tsx +1141 -0
- prism_evals-0.9.0/viewer/components/RunsPage.tsx +1324 -0
- prism_evals-0.9.0/viewer/components/format.ts +58 -0
- prism_evals-0.9.0/viewer/components/ui.tsx +221 -0
- prism_evals-0.9.0/viewer/lib/evals.ts +485 -0
- prism_evals-0.9.0/viewer/lib/media.ts +11 -0
- prism_evals-0.9.0/viewer/lib/preferences.ts +425 -0
- prism_evals-0.9.0/viewer/lib/server/runs.ts +166 -0
- prism_evals-0.9.0/viewer/lib/server/viewer.ts +39 -0
- prism_evals-0.9.0/viewer/lib/types.ts +279 -0
- prism_evals-0.9.0/viewer/next-env.d.ts +6 -0
- prism_evals-0.9.0/viewer/next.config.ts +7 -0
- prism_evals-0.9.0/viewer/package-lock.json +4116 -0
- prism_evals-0.9.0/viewer/package.json +29 -0
- prism_evals-0.9.0/viewer/postcss.config.mjs +8 -0
- prism_evals-0.9.0/viewer/tailwind.config.ts +24 -0
- prism_evals-0.9.0/viewer/test/evals.test.ts +230 -0
- prism_evals-0.9.0/viewer/test/preferences.test.ts +103 -0
- prism_evals-0.9.0/viewer/test/viewer.test.ts +34 -0
- prism_evals-0.9.0/viewer/tsconfig.json +24 -0
- prism_evals-0.9.0/viewer/vitest.config.ts +8 -0
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
name: Publish Python Package
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
tags:
|
|
6
|
+
- "v*"
|
|
7
|
+
workflow_dispatch:
|
|
8
|
+
|
|
9
|
+
permissions:
|
|
10
|
+
contents: read
|
|
11
|
+
|
|
12
|
+
jobs:
|
|
13
|
+
test:
|
|
14
|
+
name: Test
|
|
15
|
+
runs-on: ubuntu-latest
|
|
16
|
+
steps:
|
|
17
|
+
- name: Check out repository
|
|
18
|
+
uses: actions/checkout@v4
|
|
19
|
+
|
|
20
|
+
- name: Set up Python
|
|
21
|
+
uses: actions/setup-python@v5
|
|
22
|
+
with:
|
|
23
|
+
python-version: "3.12"
|
|
24
|
+
cache: pip
|
|
25
|
+
|
|
26
|
+
- name: Install package
|
|
27
|
+
run: python -m pip install --upgrade pip && python -m pip install -e ".[dev]"
|
|
28
|
+
|
|
29
|
+
- name: Run tests
|
|
30
|
+
run: python -m pytest
|
|
31
|
+
|
|
32
|
+
build:
|
|
33
|
+
name: Build
|
|
34
|
+
runs-on: ubuntu-latest
|
|
35
|
+
needs: test
|
|
36
|
+
steps:
|
|
37
|
+
- name: Check out repository
|
|
38
|
+
uses: actions/checkout@v4
|
|
39
|
+
|
|
40
|
+
- name: Set up Python
|
|
41
|
+
uses: actions/setup-python@v5
|
|
42
|
+
with:
|
|
43
|
+
python-version: "3.12"
|
|
44
|
+
cache: pip
|
|
45
|
+
|
|
46
|
+
- name: Install build tools
|
|
47
|
+
run: python -m pip install --upgrade pip build twine
|
|
48
|
+
|
|
49
|
+
- name: Build distributions
|
|
50
|
+
run: python -m build
|
|
51
|
+
|
|
52
|
+
- name: Check distributions
|
|
53
|
+
run: python -m twine check dist/*
|
|
54
|
+
|
|
55
|
+
- name: Upload distributions
|
|
56
|
+
uses: actions/upload-artifact@v4
|
|
57
|
+
with:
|
|
58
|
+
name: python-package-distributions
|
|
59
|
+
path: dist/
|
|
60
|
+
|
|
61
|
+
publish:
|
|
62
|
+
name: Publish to PyPI
|
|
63
|
+
runs-on: ubuntu-latest
|
|
64
|
+
needs: build
|
|
65
|
+
if: startsWith(github.ref, 'refs/tags/v')
|
|
66
|
+
environment:
|
|
67
|
+
name: pypi
|
|
68
|
+
url: https://pypi.org/p/prism-evals
|
|
69
|
+
permissions:
|
|
70
|
+
contents: read
|
|
71
|
+
id-token: write
|
|
72
|
+
steps:
|
|
73
|
+
- name: Download distributions
|
|
74
|
+
uses: actions/download-artifact@v4
|
|
75
|
+
with:
|
|
76
|
+
name: python-package-distributions
|
|
77
|
+
path: dist/
|
|
78
|
+
|
|
79
|
+
- name: Publish distributions to PyPI
|
|
80
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
__pycache__/
|
|
2
|
+
*.py[cod]
|
|
3
|
+
*.egg-info/
|
|
4
|
+
|
|
5
|
+
.pytest_cache/
|
|
6
|
+
.ruff_cache/
|
|
7
|
+
.mypy_cache/
|
|
8
|
+
.coverage
|
|
9
|
+
htmlcov/
|
|
10
|
+
|
|
11
|
+
.env
|
|
12
|
+
.venv/
|
|
13
|
+
venv/
|
|
14
|
+
|
|
15
|
+
build/
|
|
16
|
+
dist/
|
|
17
|
+
|
|
18
|
+
/runs/
|
|
19
|
+
/examples/runs/
|
|
20
|
+
|
|
21
|
+
viewer/.next/
|
|
22
|
+
viewer/node_modules/
|
|
23
|
+
viewer/coverage/
|
|
24
|
+
viewer/tsconfig.tsbuildinfo
|
|
@@ -0,0 +1,276 @@
|
|
|
1
|
+
# AGENTS.md
|
|
2
|
+
|
|
3
|
+
This repo provides Prism Evals, a local Python framework for executable OpenAI
|
|
4
|
+
API experiments. The package distribution is `prism-evals`, and the Python
|
|
5
|
+
import is `prism_evals`.
|
|
6
|
+
|
|
7
|
+
Use this file as the quick orientation for LLM agents working in this package
|
|
8
|
+
repo, and as a template for consuming repos that install Prism Evals.
|
|
9
|
+
|
|
10
|
+
To seed Prism Evals instructions into a consuming repo, install the package and
|
|
11
|
+
run this from the consuming repo root:
|
|
12
|
+
|
|
13
|
+
```bash
|
|
14
|
+
python -m prism_evals init
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
The installed console scripts also work:
|
|
18
|
+
|
|
19
|
+
```bash
|
|
20
|
+
prism init
|
|
21
|
+
prism-evals init
|
|
22
|
+
pe init
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
## What An Eval Experiment Is
|
|
26
|
+
|
|
27
|
+
An experiment is a normal Python file that configures:
|
|
28
|
+
|
|
29
|
+
- A CSV dataset, JSONL file, or folder of JSON/YAML scenario files.
|
|
30
|
+
- One or more `ModelConfig` entries or named model variants.
|
|
31
|
+
- A workflow callable assigned to `exp.workflow`.
|
|
32
|
+
- Item-level evals registered with `exp.eval(...)`.
|
|
33
|
+
- Optional step-level evals inside `ctx.step(...)`.
|
|
34
|
+
|
|
35
|
+
Run an experiment with the Prism CLI:
|
|
36
|
+
|
|
37
|
+
```bash
|
|
38
|
+
prism run path/to/experiment.py
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
The minimal shape is:
|
|
42
|
+
|
|
43
|
+
```python
|
|
44
|
+
from openai import AsyncOpenAI
|
|
45
|
+
|
|
46
|
+
from prism_evals import Experiment, ModelConfig, TaskOutput
|
|
47
|
+
|
|
48
|
+
exp = Experiment(name="my_eval", dataset="datasets/my_eval.csv", output_dir="runs")
|
|
49
|
+
exp.model(ModelConfig(key="gpt5_low", model="gpt-5", params={"reasoning": {"effort": "low"}}))
|
|
50
|
+
client = AsyncOpenAI()
|
|
51
|
+
|
|
52
|
+
async def workflow(item, model, ctx):
|
|
53
|
+
response = await client.responses.create(
|
|
54
|
+
model=model.model,
|
|
55
|
+
**model.params,
|
|
56
|
+
input=item["prompt"],
|
|
57
|
+
)
|
|
58
|
+
return TaskOutput(text=response.output_text)
|
|
59
|
+
|
|
60
|
+
exp.workflow = workflow
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
Direct `python path/to/experiment.py` execution is still supported if the file
|
|
64
|
+
includes an explicit `exp.run()` block.
|
|
65
|
+
|
|
66
|
+
## Where To Make Changes
|
|
67
|
+
|
|
68
|
+
When changing eval behavior in a consuming repo, edit the experiment Python files
|
|
69
|
+
first. Those files are the source of truth for datasets, model choices,
|
|
70
|
+
workflow logic, scoring rules, concurrency, retries, and output settings.
|
|
71
|
+
|
|
72
|
+
Common change points:
|
|
73
|
+
|
|
74
|
+
- Change prompts, tool calls, multi-step logic, or response parsing in the
|
|
75
|
+
workflow assigned to `exp.workflow`.
|
|
76
|
+
- Change model coverage by editing `exp.model(...)`, `exp.models(...)`, or
|
|
77
|
+
`exp.variant(...)` for multi-agent role/model configurations.
|
|
78
|
+
- Change pass/fail or scoring logic by editing `exp.eval(...)` or the `evals=`
|
|
79
|
+
list passed to `ctx.step(...)`.
|
|
80
|
+
- Change data coverage by editing the CSV/JSONL file or scenario folder
|
|
81
|
+
referenced by `Experiment(dataset=...)`.
|
|
82
|
+
- Change output placement by editing `Experiment(output_dir=...)`.
|
|
83
|
+
- Change resume behavior with `resume=True` or `resume=False`.
|
|
84
|
+
- Change repeated sampling with `repetitions=...`.
|
|
85
|
+
- Change parallelism with `concurrency=...`.
|
|
86
|
+
|
|
87
|
+
If prompts, rubrics, schemas, or other files are part of the experiment, keep
|
|
88
|
+
them near the experiment file and pass them through `artifacts=[...]` so each run
|
|
89
|
+
copies them into the run directory.
|
|
90
|
+
|
|
91
|
+
## Paths Are Relative To The Experiment File
|
|
92
|
+
|
|
93
|
+
Relative `dataset`, `output_dir`, and `artifacts` paths are resolved relative to
|
|
94
|
+
the Python file that creates `Experiment(...)`, not necessarily the process
|
|
95
|
+
working directory.
|
|
96
|
+
|
|
97
|
+
For example, in `experiments/qa.py`:
|
|
98
|
+
|
|
99
|
+
```python
|
|
100
|
+
Experiment(
|
|
101
|
+
name="qa",
|
|
102
|
+
dataset="datasets/qa.csv",
|
|
103
|
+
output_dir="runs",
|
|
104
|
+
artifacts=["prompts/system.md"],
|
|
105
|
+
)
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
This reads `experiments/datasets/qa.csv`, writes under `experiments/runs/`, and
|
|
109
|
+
copies `experiments/prompts/system.md` into the run artifacts.
|
|
110
|
+
|
|
111
|
+
## Result Storage
|
|
112
|
+
|
|
113
|
+
Each `Experiment` instance chooses one run directory. With default settings, the
|
|
114
|
+
timestamp is created when `Experiment(...)` is constructed, and the run directory is:
|
|
115
|
+
|
|
116
|
+
```text
|
|
117
|
+
<output_dir>/<YYYYMMDD-HHMMSS>_<experiment_name>/
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
If `timestamp_output_dir=False`, the run directory is:
|
|
121
|
+
|
|
122
|
+
```text
|
|
123
|
+
<output_dir>/<experiment_name>/
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
Run directories contain:
|
|
127
|
+
|
|
128
|
+
- `manifest.json`: run metadata, dataset hash, experiment file hash, settings,
|
|
129
|
+
model configs, git commit, copied artifact metadata, and output path.
|
|
130
|
+
- `results.jsonl`: append-only item-run records. This is the most complete
|
|
131
|
+
machine-readable output.
|
|
132
|
+
- `results.csv`: one row per item/model/repetition with flattened item fields,
|
|
133
|
+
final output text, usage, errors, item-level scores, and step score columns.
|
|
134
|
+
- `scores.csv`: one row per score, including both item-level and step-level
|
|
135
|
+
scores.
|
|
136
|
+
- `steps.csv`: one row per recorded workflow step, including step output text,
|
|
137
|
+
usage, errors, media columns, and step scores.
|
|
138
|
+
- `turns.csv`: one row per recorded conversation turn.
|
|
139
|
+
- `tool_calls.csv`: one row per recorded tool call.
|
|
140
|
+
- `artifacts/`: optional copied prompt/rubric/schema files listed in
|
|
141
|
+
`artifacts=[...]`.
|
|
142
|
+
- `media/`: generated outputs saved with `ctx.media`.
|
|
143
|
+
|
|
144
|
+
`runs/` and `examples/runs/` are ignored by git in this repo. Treat run outputs
|
|
145
|
+
as generated artifacts unless the consuming repo explicitly chooses to version
|
|
146
|
+
selected reports.
|
|
147
|
+
|
|
148
|
+
## How To Read Results
|
|
149
|
+
|
|
150
|
+
Start with `manifest.json` to confirm the experiment file, dataset, model
|
|
151
|
+
configs, and settings that produced the run.
|
|
152
|
+
|
|
153
|
+
Use `results.csv` for quick spreadsheet-style inspection. Useful columns include:
|
|
154
|
+
|
|
155
|
+
- `item_id`, `item_index`, `model_key`, `repetition`, and `status`.
|
|
156
|
+
- `output_text` for the final workflow output.
|
|
157
|
+
- `media_count`, `media_paths_json`, and `primary_media_path` for generated
|
|
158
|
+
output files.
|
|
159
|
+
- `score:<eval_key>` and `score_error:<eval_key>` for item-level evals.
|
|
160
|
+
- `step:<step_key>.score:<eval_key>` for step evals flattened into the item row.
|
|
161
|
+
- `input_tokens`, `output_tokens`, `reasoning_tokens`, `total_tokens`, and
|
|
162
|
+
`latency_s` for usage and timing.
|
|
163
|
+
- `error_type` and `error_message` for failed item runs.
|
|
164
|
+
|
|
165
|
+
Use `scores.csv` when comparing eval metrics across models, repetitions, or
|
|
166
|
+
steps. Filter by:
|
|
167
|
+
|
|
168
|
+
- `scope=item_run` for final-output evals.
|
|
169
|
+
- `scope=step` and `step_key=<name>` for step evals.
|
|
170
|
+
- `score_key=<eval>` for a specific metric.
|
|
171
|
+
|
|
172
|
+
Use `steps.csv` when debugging multi-step workflows. It shows each step's
|
|
173
|
+
status, output text, media paths, token usage, latency, response ID, and
|
|
174
|
+
`scores_json`.
|
|
175
|
+
|
|
176
|
+
Use `results.jsonl` when full record structure matters. It preserves nested
|
|
177
|
+
records for items, final `TaskOutput` values, evals, optional legacy generation
|
|
178
|
+
records, steps, errors, usage, and media metadata.
|
|
179
|
+
|
|
180
|
+
## Resume Behavior
|
|
181
|
+
|
|
182
|
+
`resume=True` skips item/model/repetition records that already have
|
|
183
|
+
`status == "success"` in the current run directory's `results.jsonl`.
|
|
184
|
+
|
|
185
|
+
The item-run identity is based on experiment name, dataset hash, per-item hash,
|
|
186
|
+
item id/index, model or variant key, and repetition. Changing dataset contents or
|
|
187
|
+
a scenario file creates different item-run IDs.
|
|
188
|
+
|
|
189
|
+
For a clean rerun, launch a fresh process with `timestamp_output_dir=True`,
|
|
190
|
+
change the output directory or experiment name, or delete the old generated run
|
|
191
|
+
directory.
|
|
192
|
+
|
|
193
|
+
## Datasets
|
|
194
|
+
|
|
195
|
+
Datasets can be CSV files, JSONL files, structured JSON/YAML scenario files, or
|
|
196
|
+
folders containing one JSON/YAML scenario file per item.
|
|
197
|
+
|
|
198
|
+
- Every item is passed to the workflow as `item`; CSV values remain strings,
|
|
199
|
+
while JSON/YAML items preserve nested lists and objects.
|
|
200
|
+
- If the CSV has an `id` column, that value is used as `item_id`.
|
|
201
|
+
- If `id` is missing or blank, the zero-based row index is used as `item_id`.
|
|
202
|
+
- Empty CSV values are normalized to empty strings.
|
|
203
|
+
- If `dataset` is a directory, Prism recursively reads `*.json`, `*.yaml`, and
|
|
204
|
+
`*.yml` files in stable path order. Files beginning with `_` are ignored.
|
|
205
|
+
- Scenario turns support shorthand keys such as `user`, `assistant_seed`,
|
|
206
|
+
`assistant_expect`, and `action`.
|
|
207
|
+
|
|
208
|
+
Keep dataset columns explicit and stable. Evals often use selectors such as
|
|
209
|
+
`item("expected")`, so renaming columns can silently change scoring behavior.
|
|
210
|
+
|
|
211
|
+
CSV rows can point to scenario files with `scenario_path`, or store compact turn lists
|
|
212
|
+
in `turns_json`.
|
|
213
|
+
|
|
214
|
+
## Workflows And Steps
|
|
215
|
+
|
|
216
|
+
A workflow receives `(item, model, ctx)` and may be sync or async. It must
|
|
217
|
+
return `TaskOutput`.
|
|
218
|
+
|
|
219
|
+
Import and use the OpenAI SDK directly inside experiment files. Prism owns eval
|
|
220
|
+
orchestration and output storage, not provider SDK calls.
|
|
221
|
+
|
|
222
|
+
Use `ctx.step("step_key", callable_or_value, evals=[...])` for multi-step
|
|
223
|
+
workflows. Step callables must return `TaskOutput`. Step outputs and step evals
|
|
224
|
+
are written to both `results.jsonl` and the flattened CSV files.
|
|
225
|
+
|
|
226
|
+
Use `ctx.conversation(...)`, `ctx.user(...)`, `ctx.assistant_seed(...)`,
|
|
227
|
+
`ctx.action_seed(...)`, and `ctx.turn(...)` for multi-turn scenarios. Seeded
|
|
228
|
+
turns record context only; generated turns also write a normal step with key
|
|
229
|
+
`turn:<turn_id>`.
|
|
230
|
+
|
|
231
|
+
Use `ctx.record_tool_call(...)` to capture app/tool invocations that matter for
|
|
232
|
+
scoring. Built-ins such as `ToolCalled`, `ToolNotCalled`, and `ToolArgsEqual`
|
|
233
|
+
can score those calls.
|
|
234
|
+
|
|
235
|
+
Return `TaskOutput(text=..., value=..., media=[...])` for display text,
|
|
236
|
+
structured data, and generated media. Built-in selectors such as `text()`,
|
|
237
|
+
`out("path")`, `step("step_key.path")`, and `step_text("step_key")` depend on
|
|
238
|
+
that structure. Use `ctx.media.from_base64(...)`, `ctx.media.from_bytes(...)`,
|
|
239
|
+
or `ctx.media.from_path(...)` to save generated outputs into `media/`.
|
|
240
|
+
|
|
241
|
+
Use `ctx.realtime.run_text(...)` or `ctx.realtime.run_audio(...)` for Realtime
|
|
242
|
+
API evals. Realtime helpers still return data through `TaskOutput`, and audio
|
|
243
|
+
outputs should be attached as run-local media.
|
|
244
|
+
|
|
245
|
+
## Viewer UI Preferences
|
|
246
|
+
|
|
247
|
+
When editing the local Next.js viewer, optimize for dense eval inspection rather
|
|
248
|
+
than marketing-style presentation:
|
|
249
|
+
|
|
250
|
+
- Use the full viewport width for run tables, compare views, and charts.
|
|
251
|
+
- Keep visible copy short and product-facing; avoid implementation details in
|
|
252
|
+
the UI.
|
|
253
|
+
- Put related controls in the same toolbar row when space allows, with secondary
|
|
254
|
+
controls aligned to the right.
|
|
255
|
+
- Prefer familiar icon-only controls for common actions such as close and remove.
|
|
256
|
+
Use a small, light `×` with an accessible label rather than bordered text
|
|
257
|
+
buttons.
|
|
258
|
+
- Keep chart previews compact. Avoid legends or section labels in small cards
|
|
259
|
+
when the surrounding context already names the metric.
|
|
260
|
+
- For run trend charts, show runs chronologically left to right so the newest
|
|
261
|
+
run is on the right.
|
|
262
|
+
- Preserve user filters across summaries and charts; model filters should affect
|
|
263
|
+
both the table and chart previews.
|
|
264
|
+
|
|
265
|
+
## Testing Changes
|
|
266
|
+
|
|
267
|
+
For this package repo:
|
|
268
|
+
|
|
269
|
+
```bash
|
|
270
|
+
python -m pip install -e ".[dev]"
|
|
271
|
+
python -m pytest
|
|
272
|
+
```
|
|
273
|
+
|
|
274
|
+
For a consuming repo, run the smallest relevant experiment first, then inspect
|
|
275
|
+
the generated run directory before broadening concurrency, model coverage, or
|
|
276
|
+
dataset size.
|
|
@@ -0,0 +1,159 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to this project will be documented in this file.
|
|
4
|
+
|
|
5
|
+
This project follows Semantic Versioning. While the project is pre-1.0, minor
|
|
6
|
+
versions may include API changes as the experiment framework settles.
|
|
7
|
+
|
|
8
|
+
## [0.9.0] - 2026-05-21
|
|
9
|
+
|
|
10
|
+
### Added
|
|
11
|
+
|
|
12
|
+
- Added folder-backed JSON/YAML scenario datasets, JSONL datasets, CSV
|
|
13
|
+
`scenario_path`, and CSV `turns_json` expansion for multi-turn eval inputs.
|
|
14
|
+
- Added named model variants for multi-agent workflows, with `ctx.model(role)`
|
|
15
|
+
access to role-specific `ModelConfig` entries.
|
|
16
|
+
- Added conversation turn recording helpers, seeded user/assistant/action turns,
|
|
17
|
+
tool-call recording, `turns.csv`, `tool_calls.csv`, and tool-call built-in
|
|
18
|
+
evaluators.
|
|
19
|
+
|
|
20
|
+
## [0.8.0] - 2026-05-11
|
|
21
|
+
|
|
22
|
+
### Added
|
|
23
|
+
|
|
24
|
+
- Added first-class Realtime workflow support through `ctx.realtime`, including
|
|
25
|
+
text and audio helpers for `gpt-realtime-2`.
|
|
26
|
+
- Parsed Realtime tool calls into `RealtimeRunResult.tool_calls` and
|
|
27
|
+
`TaskOutput.value["tool_calls"]` for scoring.
|
|
28
|
+
- Added Realtime text and voice-agent smoke examples, plus viewer playback for
|
|
29
|
+
audio media.
|
|
30
|
+
|
|
31
|
+
## [0.7.0] - 2026-05-04
|
|
32
|
+
|
|
33
|
+
### Changed
|
|
34
|
+
|
|
35
|
+
- `TaskOutput` is now the required workflow and step output contract.
|
|
36
|
+
- Workflows should import provider SDKs directly instead of relying on Prism to
|
|
37
|
+
proxy OpenAI calls.
|
|
38
|
+
|
|
39
|
+
### Added
|
|
40
|
+
|
|
41
|
+
- Added `TaskOutput.media`, `MediaArtifact`, run-local `media/` storage,
|
|
42
|
+
compact media columns in CSV outputs, and viewer media previews.
|
|
43
|
+
- Added `ctx.media.from_base64(...)`, `ctx.media.from_bytes(...)`, and
|
|
44
|
+
`ctx.media.from_path(...)` helpers for generated outputs.
|
|
45
|
+
|
|
46
|
+
## [0.6.8] - 2026-04-16
|
|
47
|
+
|
|
48
|
+
### Added
|
|
49
|
+
|
|
50
|
+
- `prism run <experiment_file>` discovers and runs module-level `Experiment`
|
|
51
|
+
instances, so eval files no longer need an explicit `exp.run()` block.
|
|
52
|
+
|
|
53
|
+
## [0.6.7] - 2026-04-16
|
|
54
|
+
|
|
55
|
+
### Added
|
|
56
|
+
|
|
57
|
+
- Inline `data:` URLs in captured raw payloads are now compacted by default with
|
|
58
|
+
`redact_raw_data_urls=True`.
|
|
59
|
+
|
|
60
|
+
## [0.6.0] - 2026-04-16
|
|
61
|
+
|
|
62
|
+
### Changed
|
|
63
|
+
|
|
64
|
+
- Rebranded the package to Prism Evals.
|
|
65
|
+
- Renamed the Python distribution to `prism-evals` and the import package to
|
|
66
|
+
`prism_evals`.
|
|
67
|
+
- Added `prism`, `prism-evals`, and `pe` console scripts.
|
|
68
|
+
- Updated the local viewer, scaffolded instructions, docs, examples, and tests
|
|
69
|
+
to use Prism Evals naming.
|
|
70
|
+
|
|
71
|
+
## [0.5.1] - 2026-04-15
|
|
72
|
+
|
|
73
|
+
### Fixed
|
|
74
|
+
|
|
75
|
+
- Bundle the local viewer with Python wheels so installed packages can launch `prism view`.
|
|
76
|
+
- Install viewer npm dependencies automatically on first launch when they are missing.
|
|
77
|
+
|
|
78
|
+
## [0.5.0] - 2026-04-15
|
|
79
|
+
|
|
80
|
+
### Added
|
|
81
|
+
|
|
82
|
+
- `prism view <runs_dir>` opens a read-only local Next.js viewer for parent directories of eval runs.
|
|
83
|
+
- Viewer pages for all runs, run detail, score matrices, artifact downloads, and lane comparisons across `run + model_key` pairs.
|
|
84
|
+
|
|
85
|
+
## [0.4.5] - 2026-04-15
|
|
86
|
+
|
|
87
|
+
### Added
|
|
88
|
+
|
|
89
|
+
- `Experiment(..., artifacts=[...])` can copy prompt files, configs, and other user files into the run output folder.
|
|
90
|
+
|
|
91
|
+
## [0.4.4] - 2026-04-15
|
|
92
|
+
|
|
93
|
+
### Added
|
|
94
|
+
|
|
95
|
+
- Console output now includes a model-column score pivot table by eval key.
|
|
96
|
+
|
|
97
|
+
## [0.4.3] - 2026-04-15
|
|
98
|
+
|
|
99
|
+
### Added
|
|
100
|
+
|
|
101
|
+
- Console token usage now shows average per item run and totals for input, cached, output, reasoning, and total tokens.
|
|
102
|
+
|
|
103
|
+
## [0.4.2] - 2026-04-15
|
|
104
|
+
|
|
105
|
+
### Fixed
|
|
106
|
+
|
|
107
|
+
- Declared the `src/prism_evals` package for Hatchling wheel builds so `prism-evals` installs correctly from Git tags.
|
|
108
|
+
|
|
109
|
+
## [0.4.1] - 2026-04-15
|
|
110
|
+
|
|
111
|
+
### Added
|
|
112
|
+
|
|
113
|
+
- Timestamp-prefixed output directories by default, with `timestamp_output_dir=False` to keep stable experiment folders.
|
|
114
|
+
|
|
115
|
+
## [0.4.0] - 2026-04-15
|
|
116
|
+
|
|
117
|
+
### Added
|
|
118
|
+
|
|
119
|
+
- Multi-step workflows via `ctx.step(...)`, with step-owned outputs, evals, generations, usage, latency, and errors.
|
|
120
|
+
- Step selectors: `step(...)` and `step_text(...)`.
|
|
121
|
+
- `steps.csv` artifact and scoped score rows in `scores.csv`.
|
|
122
|
+
|
|
123
|
+
### Changed
|
|
124
|
+
|
|
125
|
+
- Renamed public terminology from row/execution/task to item/item-run/workflow.
|
|
126
|
+
- Replaced `exp.task = callable` with `exp.workflow = callable`.
|
|
127
|
+
- Replaced the dataset selector `row(...)` with `item(...)`.
|
|
128
|
+
- Renamed `ExecutionRecord` to `ItemRunRecord` and public result fields to item/item-run names.
|
|
129
|
+
|
|
130
|
+
## [0.3.0] - 2026-04-15
|
|
131
|
+
|
|
132
|
+
### Changed
|
|
133
|
+
|
|
134
|
+
- Replaced task decorators with direct task assignment via `exp.task = callable`.
|
|
135
|
+
- Removed eval decorator registration in favor of `exp.eval("key", evaluator)`.
|
|
136
|
+
- Task callables may now be sync or async, including callable task objects.
|
|
137
|
+
|
|
138
|
+
## [0.2.0] - 2026-04-15
|
|
139
|
+
|
|
140
|
+
### Added
|
|
141
|
+
|
|
142
|
+
- Unified `exp.eval("key", evaluator)` registration for custom functions and built-in evaluators.
|
|
143
|
+
- Built-in deterministic evaluators for equality, approximate equality, containment, regex, non-empty values, length bounds, and JSON path checks.
|
|
144
|
+
- Selector helpers: `row(...)`, `out(...)`, and `text(...)`.
|
|
145
|
+
|
|
146
|
+
## [0.1.0] - 2026-04-15
|
|
147
|
+
|
|
148
|
+
### Added
|
|
149
|
+
|
|
150
|
+
- Initial Prism Evals Python package with decorator-style experiments.
|
|
151
|
+
- `Experiment`, `ModelConfig`, `TaskOutput`, and `EvalResult` public APIs.
|
|
152
|
+
- OpenAI Responses API wrapper for automatic raw request/response capture.
|
|
153
|
+
- Token usage capture for input, cached, output, reasoning, and total tokens.
|
|
154
|
+
- Per-call latency and per-execution duration tracking.
|
|
155
|
+
- CSV dataset loading with row/model/repetition execution matrix.
|
|
156
|
+
- Bounded async concurrency, retries, resume support, and fail-fast option.
|
|
157
|
+
- Local artifacts: `manifest.json`, `results.jsonl`, `results.csv`, and `scores.csv`.
|
|
158
|
+
- Rich terminal progress and summary tables.
|
|
159
|
+
- Example QA experiment and pytest coverage.
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
# Migration Guide
|
|
2
|
+
|
|
3
|
+
## 0.7.0: `TaskOutput` Is Required
|
|
4
|
+
|
|
5
|
+
Prism workflows and step callables must now return `TaskOutput`. Strings and
|
|
6
|
+
dicts are no longer normalized into `TaskOutput` for workflow or step outputs.
|
|
7
|
+
|
|
8
|
+
Prism also no longer owns OpenAI SDK calls for normal workflows. Import and use
|
|
9
|
+
the OpenAI SDK directly in your experiment file:
|
|
10
|
+
|
|
11
|
+
```python
|
|
12
|
+
from openai import AsyncOpenAI
|
|
13
|
+
|
|
14
|
+
client = AsyncOpenAI()
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
Use that client for Responses API, Image API, or any other OpenAI call, then
|
|
18
|
+
return a `TaskOutput` with the data Prism should evaluate and store.
|
|
19
|
+
|
|
20
|
+
Text output:
|
|
21
|
+
|
|
22
|
+
```python
|
|
23
|
+
from openai import AsyncOpenAI
|
|
24
|
+
from prism_evals import TaskOutput
|
|
25
|
+
|
|
26
|
+
client = AsyncOpenAI()
|
|
27
|
+
|
|
28
|
+
response = await client.responses.create(...)
|
|
29
|
+
|
|
30
|
+
# Old
|
|
31
|
+
return response.output_text
|
|
32
|
+
|
|
33
|
+
# New
|
|
34
|
+
return TaskOutput(text=response.output_text)
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
Structured output:
|
|
38
|
+
|
|
39
|
+
```python
|
|
40
|
+
# Old
|
|
41
|
+
return {"answer": parsed}
|
|
42
|
+
|
|
43
|
+
# New
|
|
44
|
+
return TaskOutput(text=json.dumps(parsed), value=parsed)
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
Step output:
|
|
48
|
+
|
|
49
|
+
```python
|
|
50
|
+
# Old
|
|
51
|
+
draft = await ctx.step("draft", lambda: "hello")
|
|
52
|
+
|
|
53
|
+
# New
|
|
54
|
+
draft = await ctx.step("draft", lambda: TaskOutput(text="hello"))
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
Generated media:
|
|
58
|
+
|
|
59
|
+
```python
|
|
60
|
+
from openai import AsyncOpenAI
|
|
61
|
+
from prism_evals import TaskOutput
|
|
62
|
+
|
|
63
|
+
client = AsyncOpenAI()
|
|
64
|
+
|
|
65
|
+
async def workflow(item, model, ctx):
|
|
66
|
+
response = await client.images.generate(
|
|
67
|
+
model=model.model,
|
|
68
|
+
prompt=item["prompt"],
|
|
69
|
+
response_format="b64_json",
|
|
70
|
+
**model.params,
|
|
71
|
+
)
|
|
72
|
+
image = ctx.media.from_base64(response.data[0].b64_json, format="png")
|
|
73
|
+
return TaskOutput(text="Generated image", media=[image])
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
Generated files now live in `media/`. The `artifacts/` directory remains for
|
|
77
|
+
files copied from the experiment source tree with `Experiment(...,
|
|
78
|
+
artifacts=[...])`.
|
|
79
|
+
|
|
80
|
+
Custom JSONL consumers should read generated media from `output.media` or
|
|
81
|
+
`steps[].output.media`. Prism no longer expects raw provider responses to be the
|
|
82
|
+
source of truth for generated images.
|
|
83
|
+
|
|
84
|
+
Direct SDK calls are not automatically recorded as Prism generation records, and
|
|
85
|
+
raw OpenAI request/response payloads are not captured by default. Store any
|
|
86
|
+
important call details in `TaskOutput.metadata` when your eval needs them.
|