prism-evals 0.9.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (81)
  1. prism_evals-0.9.0/.github/workflows/publish-python.yml +80 -0
  2. prism_evals-0.9.0/.gitignore +24 -0
  3. prism_evals-0.9.0/AGENTS.md +276 -0
  4. prism_evals-0.9.0/CHANGELOG.md +159 -0
  5. prism_evals-0.9.0/MIGRATION.md +86 -0
  6. prism_evals-0.9.0/PKG-INFO +722 -0
  7. prism_evals-0.9.0/README.md +691 -0
  8. prism_evals-0.9.0/examples/01_csv_qa.py +40 -0
  9. prism_evals-0.9.0/examples/02_json_import.py +54 -0
  10. prism_evals-0.9.0/examples/03_image_generation.py +48 -0
  11. prism_evals-0.9.0/examples/04_config_options.py +84 -0
  12. prism_evals-0.9.0/examples/05_multistep_agent.py +129 -0
  13. prism_evals-0.9.0/examples/datasets/config_options.csv +3 -0
  14. prism_evals-0.9.0/examples/datasets/image_prompts.csv +2 -0
  15. prism_evals-0.9.0/examples/datasets/json_cases/daypack.json +19 -0
  16. prism_evals-0.9.0/examples/datasets/json_cases/rain_shell.json +19 -0
  17. prism_evals-0.9.0/examples/datasets/qa.csv +3 -0
  18. prism_evals-0.9.0/examples/datasets/support_tickets.csv +5 -0
  19. prism_evals-0.9.0/examples/prompts/config_system.md +1 -0
  20. prism_evals-0.9.0/pyproject.toml +77 -0
  21. prism_evals-0.9.0/src/prism_evals/__init__.py +70 -0
  22. prism_evals-0.9.0/src/prism_evals/__main__.py +7 -0
  23. prism_evals-0.9.0/src/prism_evals/_utils.py +112 -0
  24. prism_evals-0.9.0/src/prism_evals/artifacts.py +115 -0
  25. prism_evals-0.9.0/src/prism_evals/builtins.py +555 -0
  26. prism_evals-0.9.0/src/prism_evals/cli.py +390 -0
  27. prism_evals-0.9.0/src/prism_evals/console.py +308 -0
  28. prism_evals-0.9.0/src/prism_evals/datasets.py +247 -0
  29. prism_evals-0.9.0/src/prism_evals/errors.py +19 -0
  30. prism_evals-0.9.0/src/prism_evals/evaluation.py +96 -0
  31. prism_evals-0.9.0/src/prism_evals/experiment.py +207 -0
  32. prism_evals-0.9.0/src/prism_evals/models.py +260 -0
  33. prism_evals-0.9.0/src/prism_evals/openai.py +1027 -0
  34. prism_evals-0.9.0/src/prism_evals/runner.py +297 -0
  35. prism_evals-0.9.0/src/prism_evals/scaffold.py +49 -0
  36. prism_evals-0.9.0/src/prism_evals/storage.py +435 -0
  37. prism_evals-0.9.0/src/prism_evals/templates/AGENTS.md +220 -0
  38. prism_evals-0.9.0/tests/conftest.py +72 -0
  39. prism_evals-0.9.0/tests/test_artifacts.py +106 -0
  40. prism_evals-0.9.0/tests/test_builtins.py +235 -0
  41. prism_evals-0.9.0/tests/test_cli_viewer.py +360 -0
  42. prism_evals-0.9.0/tests/test_console.py +90 -0
  43. prism_evals-0.9.0/tests/test_datasets.py +77 -0
  44. prism_evals-0.9.0/tests/test_experiment_api.py +107 -0
  45. prism_evals-0.9.0/tests/test_openai_wrapper.py +519 -0
  46. prism_evals-0.9.0/tests/test_runner.py +594 -0
  47. prism_evals-0.9.0/tests/test_scaffold.py +88 -0
  48. prism_evals-0.9.0/viewer/AGENTS.md +49 -0
  49. prism_evals-0.9.0/viewer/app/api/compare/route.ts +32 -0
  50. prism_evals-0.9.0/viewer/app/api/runs/[runKey]/records/route.ts +23 -0
  51. prism_evals-0.9.0/viewer/app/api/runs/[runKey]/route.ts +22 -0
  52. prism_evals-0.9.0/viewer/app/api/runs/route.ts +17 -0
  53. prism_evals-0.9.0/viewer/app/artifacts/[runKey]/[artifactName]/route.ts +44 -0
  54. prism_evals-0.9.0/viewer/app/compare/page.tsx +5 -0
  55. prism_evals-0.9.0/viewer/app/globals.css +43 -0
  56. prism_evals-0.9.0/viewer/app/layout.tsx +62 -0
  57. prism_evals-0.9.0/viewer/app/media/[runKey]/[...mediaPath]/route.ts +46 -0
  58. prism_evals-0.9.0/viewer/app/page.tsx +5 -0
  59. prism_evals-0.9.0/viewer/app/runs/[runKey]/page.tsx +10 -0
  60. prism_evals-0.9.0/viewer/components/ComparePage.tsx +443 -0
  61. prism_evals-0.9.0/viewer/components/RunDetailPage.tsx +1141 -0
  62. prism_evals-0.9.0/viewer/components/RunsPage.tsx +1324 -0
  63. prism_evals-0.9.0/viewer/components/format.ts +58 -0
  64. prism_evals-0.9.0/viewer/components/ui.tsx +221 -0
  65. prism_evals-0.9.0/viewer/lib/evals.ts +485 -0
  66. prism_evals-0.9.0/viewer/lib/media.ts +11 -0
  67. prism_evals-0.9.0/viewer/lib/preferences.ts +425 -0
  68. prism_evals-0.9.0/viewer/lib/server/runs.ts +166 -0
  69. prism_evals-0.9.0/viewer/lib/server/viewer.ts +39 -0
  70. prism_evals-0.9.0/viewer/lib/types.ts +279 -0
  71. prism_evals-0.9.0/viewer/next-env.d.ts +6 -0
  72. prism_evals-0.9.0/viewer/next.config.ts +7 -0
  73. prism_evals-0.9.0/viewer/package-lock.json +4116 -0
  74. prism_evals-0.9.0/viewer/package.json +29 -0
  75. prism_evals-0.9.0/viewer/postcss.config.mjs +8 -0
  76. prism_evals-0.9.0/viewer/tailwind.config.ts +24 -0
  77. prism_evals-0.9.0/viewer/test/evals.test.ts +230 -0
  78. prism_evals-0.9.0/viewer/test/preferences.test.ts +103 -0
  79. prism_evals-0.9.0/viewer/test/viewer.test.ts +34 -0
  80. prism_evals-0.9.0/viewer/tsconfig.json +24 -0
  81. prism_evals-0.9.0/viewer/vitest.config.ts +8 -0
@@ -0,0 +1,80 @@
1
+ name: Publish Python Package
2
+
3
+ on:
4
+ push:
5
+ tags:
6
+ - "v*"
7
+ workflow_dispatch:
8
+
9
+ permissions:
10
+ contents: read
11
+
12
+ jobs:
13
+ test:
14
+ name: Test
15
+ runs-on: ubuntu-latest
16
+ steps:
17
+ - name: Check out repository
18
+ uses: actions/checkout@v4
19
+
20
+ - name: Set up Python
21
+ uses: actions/setup-python@v5
22
+ with:
23
+ python-version: "3.12"
24
+ cache: pip
25
+
26
+ - name: Install package
27
+ run: python -m pip install --upgrade pip && python -m pip install -e ".[dev]"
28
+
29
+ - name: Run tests
30
+ run: python -m pytest
31
+
32
+ build:
33
+ name: Build
34
+ runs-on: ubuntu-latest
35
+ needs: test
36
+ steps:
37
+ - name: Check out repository
38
+ uses: actions/checkout@v4
39
+
40
+ - name: Set up Python
41
+ uses: actions/setup-python@v5
42
+ with:
43
+ python-version: "3.12"
44
+ cache: pip
45
+
46
+ - name: Install build tools
47
+ run: python -m pip install --upgrade pip build twine
48
+
49
+ - name: Build distributions
50
+ run: python -m build
51
+
52
+ - name: Check distributions
53
+ run: python -m twine check dist/*
54
+
55
+ - name: Upload distributions
56
+ uses: actions/upload-artifact@v4
57
+ with:
58
+ name: python-package-distributions
59
+ path: dist/
60
+
61
+ publish:
62
+ name: Publish to PyPI
63
+ runs-on: ubuntu-latest
64
+ needs: build
65
+ if: startsWith(github.ref, 'refs/tags/v')
66
+ environment:
67
+ name: pypi
68
+ url: https://pypi.org/p/prism-evals
69
+ permissions:
70
+ contents: read
71
+ id-token: write
72
+ steps:
73
+ - name: Download distributions
74
+ uses: actions/download-artifact@v4
75
+ with:
76
+ name: python-package-distributions
77
+ path: dist/
78
+
79
+ - name: Publish distributions to PyPI
80
+ uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,24 @@
1
+ __pycache__/
2
+ *.py[cod]
3
+ *.egg-info/
4
+
5
+ .pytest_cache/
6
+ .ruff_cache/
7
+ .mypy_cache/
8
+ .coverage
9
+ htmlcov/
10
+
11
+ .env
12
+ .venv/
13
+ venv/
14
+
15
+ build/
16
+ dist/
17
+
18
+ /runs/
19
+ /examples/runs/
20
+
21
+ viewer/.next/
22
+ viewer/node_modules/
23
+ viewer/coverage/
24
+ viewer/tsconfig.tsbuildinfo
@@ -0,0 +1,276 @@
1
+ # AGENTS.md
2
+
3
+ This repo provides Prism Evals, a local Python framework for executable OpenAI
4
+ API experiments. The package distribution is `prism-evals`, and the Python
5
+ import is `prism_evals`.
6
+
7
+ Use this file as the quick orientation for LLM agents working in this package
8
+ repo, and as a template for consuming repos that install Prism Evals.
9
+
10
+ To seed Prism Evals instructions into a consuming repo, install the package and
11
+ run this from the consuming repo root:
12
+
13
+ ```bash
14
+ python -m prism_evals init
15
+ ```
16
+
17
+ The installed console scripts also work:
18
+
19
+ ```bash
20
+ prism init
21
+ prism-evals init
22
+ pe init
23
+ ```
24
+
25
+ ## What An Eval Experiment Is
26
+
27
+ An experiment is a normal Python file that configures:
28
+
29
+ - A CSV dataset, JSONL file, or folder of JSON/YAML scenario files.
30
+ - One or more `ModelConfig` entries or named model variants.
31
+ - A workflow callable assigned to `exp.workflow`.
32
+ - Item-level evals registered with `exp.eval(...)`.
33
+ - Optional step-level evals inside `ctx.step(...)`.
34
+
35
+ Run an experiment with the Prism CLI:
36
+
37
+ ```bash
38
+ prism run path/to/experiment.py
39
+ ```
40
+
41
+ The minimal shape is:
42
+
43
+ ```python
44
+ from openai import AsyncOpenAI
45
+
46
+ from prism_evals import Experiment, ModelConfig, TaskOutput
47
+
48
+ exp = Experiment(name="my_eval", dataset="datasets/my_eval.csv", output_dir="runs")
49
+ exp.model(ModelConfig(key="gpt5_low", model="gpt-5", params={"reasoning": {"effort": "low"}}))
50
+ client = AsyncOpenAI()
51
+
52
+ async def workflow(item, model, ctx):
53
+ response = await client.responses.create(
54
+ model=model.model,
55
+ **model.params,
56
+ input=item["prompt"],
57
+ )
58
+ return TaskOutput(text=response.output_text)
59
+
60
+ exp.workflow = workflow
61
+ ```
62
+
63
+ Direct `python path/to/experiment.py` execution is still supported if the file
64
+ includes an explicit `exp.run()` block.
65
+
66
+ ## Where To Make Changes
67
+
68
+ When changing eval behavior in a consuming repo, edit the experiment Python files
69
+ first. Those files are the source of truth for datasets, model choices,
70
+ workflow logic, scoring rules, concurrency, retries, and output settings.
71
+
72
+ Common change points:
73
+
74
+ - Change prompts, tool calls, multi-step logic, or response parsing in the
75
+ workflow assigned to `exp.workflow`.
76
+ - Change model coverage by editing `exp.model(...)`, `exp.models(...)`, or
77
+ `exp.variant(...)` for multi-agent role/model configurations.
78
+ - Change pass/fail or scoring logic by editing `exp.eval(...)` or the `evals=`
79
+ list passed to `ctx.step(...)`.
80
+ - Change data coverage by editing the CSV/JSONL file or scenario folder
81
+ referenced by `Experiment(dataset=...)`.
82
+ - Change output placement by editing `Experiment(output_dir=...)`.
83
+ - Change resume behavior with `resume=True` or `resume=False`.
84
+ - Change repeated sampling with `repetitions=...`.
85
+ - Change parallelism with `concurrency=...`.
86
+
87
+ If prompts, rubrics, schemas, or other files are part of the experiment, keep
88
+ them near the experiment file and pass them through `artifacts=[...]` so each run
89
+ copies them into the run directory.
90
+
91
+ ## Paths Are Relative To The Experiment File
92
+
93
+ Relative `dataset`, `output_dir`, and `artifacts` paths are resolved relative to
94
+ the Python file that creates `Experiment(...)`, not necessarily the process
95
+ working directory.
96
+
97
+ For example, in `experiments/qa.py`:
98
+
99
+ ```python
100
+ Experiment(
101
+ name="qa",
102
+ dataset="datasets/qa.csv",
103
+ output_dir="runs",
104
+ artifacts=["prompts/system.md"],
105
+ )
106
+ ```
107
+
108
+ This reads `experiments/datasets/qa.csv`, writes under `experiments/runs/`, and
109
+ copies `experiments/prompts/system.md` into the run artifacts.
110
+
111
+ ## Result Storage
112
+
113
+ Each `Experiment` instance chooses one run directory. With default settings, the
114
+ directory name is prefixed with a timestamp taken when `Experiment(...)` is constructed:
115
+
116
+ ```text
117
+ <output_dir>/<YYYYMMDD-HHMMSS>_<experiment_name>/
118
+ ```
119
+
120
+ If `timestamp_output_dir=False`, the run directory is:
121
+
122
+ ```text
123
+ <output_dir>/<experiment_name>/
124
+ ```
125
+
126
+ Run directories contain:
127
+
128
+ - `manifest.json`: run metadata, dataset hash, experiment file hash, settings,
129
+ model configs, git commit, copied artifact metadata, and output path.
130
+ - `results.jsonl`: append-only item-run records. This is the most complete
131
+ machine-readable output.
132
+ - `results.csv`: one row per item/model/repetition with flattened item fields,
133
+ final output text, usage, errors, item-level scores, and step score columns.
134
+ - `scores.csv`: one row per score, including both item-level and step-level
135
+ scores.
136
+ - `steps.csv`: one row per recorded workflow step, including step output text,
137
+ usage, errors, media columns, and step scores.
138
+ - `turns.csv`: one row per recorded conversation turn.
139
+ - `tool_calls.csv`: one row per recorded tool call.
140
+ - `artifacts/`: optional copied prompt/rubric/schema files listed in
141
+ `artifacts=[...]`.
142
+ - `media/`: generated outputs saved with `ctx.media`.
143
+
144
+ `runs/` and `examples/runs/` are ignored by git in this repo. Treat run outputs
145
+ as generated artifacts unless the consuming repo explicitly chooses to version
146
+ selected reports.
147
+
148
+ ## How To Read Results
149
+
150
+ Start with `manifest.json` to confirm the experiment file, dataset, model
151
+ configs, and settings that produced the run.
152
+
153
+ Use `results.csv` for quick spreadsheet-style inspection. Useful columns include:
154
+
155
+ - `item_id`, `item_index`, `model_key`, `repetition`, and `status`.
156
+ - `output_text` for the final workflow output.
157
+ - `media_count`, `media_paths_json`, and `primary_media_path` for generated
158
+ output files.
159
+ - `score:<eval_key>` and `score_error:<eval_key>` for item-level evals.
160
+ - `step:<step_key>.score:<eval_key>` for step evals flattened into the item row.
161
+ - `input_tokens`, `output_tokens`, `reasoning_tokens`, `total_tokens`, and
162
+ `latency_s` for usage and timing.
163
+ - `error_type` and `error_message` for failed item runs.
164
+
165
+ Use `scores.csv` when comparing eval metrics across models, repetitions, or
166
+ steps. Filter by:
167
+
168
+ - `scope=item_run` for final-output evals.
169
+ - `scope=step` and `step_key=<name>` for step evals.
170
+ - `score_key=<eval>` for a specific metric.
171
+
172
+ Use `steps.csv` when debugging multi-step workflows. It shows each step's
173
+ status, output text, media paths, token usage, latency, response ID, and
174
+ `scores_json`.
175
+
176
+ Use `results.jsonl` when full record structure matters. It preserves nested
177
+ records for items, final `TaskOutput` values, evals, optional legacy generation
178
+ records, steps, errors, usage, and media metadata.
179
+
180
+ ## Resume Behavior
181
+
182
+ `resume=True` skips item/model/repetition records that already have
183
+ `status == "success"` in the current run directory's `results.jsonl`.
184
+
185
+ The item-run identity is based on experiment name, dataset hash, per-item hash,
186
+ item id/index, model or variant key, and repetition. Changing dataset contents or
187
+ a scenario file creates different item-run IDs.
188
+
189
+ For a clean rerun, launch a fresh process with `timestamp_output_dir=True`,
190
+ change the output directory or experiment name, or delete the old generated run
191
+ directory.
192
+
193
+ ## Datasets
194
+
195
+ Datasets can be CSV files, JSONL files, structured JSON/YAML scenario files, or
196
+ folders containing one JSON/YAML scenario file per item.
197
+
198
+ - Every item is passed to the workflow as `item`; CSV values remain strings,
199
+ while JSON/YAML items preserve nested lists and objects.
200
+ - If the CSV has an `id` column, that value is used as `item_id`.
201
+ - If `id` is missing or blank, the zero-based row index is used as `item_id`.
202
+ - Empty CSV values are normalized to empty strings.
203
+ - If `dataset` is a directory, Prism recursively reads `*.json`, `*.yaml`, and
204
+ `*.yml` files in stable path order. Files beginning with `_` are ignored.
205
+ - Scenario turns support shorthand keys such as `user`, `assistant_seed`,
206
+ `assistant_expect`, and `action`.
207
+
208
+ Keep dataset columns explicit and stable. Evals often use selectors such as
209
+ `item("expected")`, so renaming columns can silently change scoring behavior.
210
+
211
+ CSV rows can point to scenario files with `scenario_path`, or store compact
212
+ turn lists in `turns_json`.
213
+
214
+ ## Workflows And Steps
215
+
216
+ A workflow receives `(item, model, ctx)` and may be sync or async. It must
217
+ return `TaskOutput`.
218
+
219
+ Import and use the OpenAI SDK directly inside experiment files. Prism owns eval
220
+ orchestration and output storage, not provider SDK calls.
221
+
222
+ Use `ctx.step("step_key", callable_or_value, evals=[...])` for multi-step
223
+ workflows. Step callables must return `TaskOutput`. Step outputs and step evals
224
+ are written to both `results.jsonl` and the flattened CSV files.
225
+
226
+ Use `ctx.conversation(...)`, `ctx.user(...)`, `ctx.assistant_seed(...)`,
227
+ `ctx.action_seed(...)`, and `ctx.turn(...)` for multi-turn scenarios. Seeded
228
+ turns record context only; generated turns also write a normal step with key
229
+ `turn:<turn_id>`.
230
+
231
+ Use `ctx.record_tool_call(...)` to capture app/tool invocations that matter for
232
+ scoring. Built-ins such as `ToolCalled`, `ToolNotCalled`, and `ToolArgsEqual`
233
+ can score those calls.
234
+
235
+ Return `TaskOutput(text=..., value=..., media=[...])` for display text,
236
+ structured data, and generated media. Built-in selectors such as `text()`,
237
+ `out("path")`, `step("step_key.path")`, and `step_text("step_key")` depend on
238
+ that structure. Use `ctx.media.from_base64(...)`, `ctx.media.from_bytes(...)`,
239
+ or `ctx.media.from_path(...)` to save generated outputs into `media/`.
240
+
241
+ Use `ctx.realtime.run_text(...)` or `ctx.realtime.run_audio(...)` for Realtime
242
+ API evals. Realtime helpers still return data through `TaskOutput`, and audio
243
+ outputs should be attached as run-local media.
244
+
245
+ ## Viewer UI Preferences
246
+
247
+ When editing the local Next.js viewer, optimize for dense eval inspection rather
248
+ than marketing-style presentation:
249
+
250
+ - Use the full viewport width for run tables, compare views, and charts.
251
+ - Keep visible copy short and product-facing; avoid implementation details in
252
+ the UI.
253
+ - Put related controls in the same toolbar row when space allows, with secondary
254
+ controls aligned to the right.
255
+ - Prefer familiar icon-only controls for common actions such as close and remove.
256
+ Use a small, light `×` with an accessible label rather than bordered text
257
+ buttons.
258
+ - Keep chart previews compact. Avoid legends or section labels in small cards
259
+ when the surrounding context already names the metric.
260
+ - For run trend charts, show runs chronologically left to right so the newest
261
+ run is on the right.
262
+ - Preserve user filters across summaries and charts; model filters should affect
263
+ both the table and chart previews.
264
+
265
+ ## Testing Changes
266
+
267
+ For this package repo:
268
+
269
+ ```bash
270
+ python -m pip install -e ".[dev]"
271
+ python -m pytest
272
+ ```
273
+
274
+ For a consuming repo, run the smallest relevant experiment first, then inspect
275
+ the generated run directory before broadening concurrency, model coverage, or
276
+ dataset size.
@@ -0,0 +1,159 @@
1
+ # Changelog
2
+
3
+ All notable changes to this project will be documented in this file.
4
+
5
+ This project follows Semantic Versioning. While the project is pre-1.0, minor
6
+ versions may include API changes as the experiment framework settles.
7
+
8
+ ## [0.9.0] - 2026-05-21
9
+
10
+ ### Added
11
+
12
+ - Added folder-backed JSON/YAML scenario datasets, JSONL datasets, CSV
13
+ `scenario_path`, and CSV `turns_json` expansion for multi-turn eval inputs.
14
+ - Added named model variants for multi-agent workflows, with `ctx.model(role)`
15
+ access to role-specific `ModelConfig` entries.
16
+ - Added conversation turn recording helpers, seeded user/assistant/action turns,
17
+ tool-call recording, `turns.csv`, `tool_calls.csv`, and tool-call built-in
18
+ evaluators.
19
+
20
+ ## [0.8.0] - 2026-05-11
21
+
22
+ ### Added
23
+
24
+ - Added first-class Realtime workflow support through `ctx.realtime`, including
25
+ text and audio helpers for `gpt-realtime-2`.
26
+ - Parsed Realtime tool calls into `RealtimeRunResult.tool_calls` and
27
+ `TaskOutput.value["tool_calls"]` for scoring.
28
+ - Added Realtime text and voice-agent smoke examples, plus viewer playback for
29
+ audio media.
30
+
31
+ ## [0.7.0] - 2026-05-04
32
+
33
+ ### Changed
34
+
35
+ - `TaskOutput` is now the required workflow and step output contract.
36
+ - Workflows should import provider SDKs directly instead of relying on Prism to
37
+ proxy OpenAI calls.
38
+
39
+ ### Added
40
+
41
+ - Added `TaskOutput.media`, `MediaArtifact`, run-local `media/` storage,
42
+ compact media columns in CSV outputs, and viewer media previews.
43
+ - Added `ctx.media.from_base64(...)`, `ctx.media.from_bytes(...)`, and
44
+ `ctx.media.from_path(...)` helpers for generated outputs.
45
+
46
+ ## [0.6.8] - 2026-04-16
47
+
48
+ ### Added
49
+
50
+ - `prism run <experiment_file>` discovers and runs module-level `Experiment`
51
+ instances, so eval files no longer need an explicit `exp.run()` block.
52
+
53
+ ## [0.6.7] - 2026-04-16
54
+
55
+ ### Added
56
+
57
+ - Inline `data:` URLs in captured raw payloads are now compacted by default
58
+ (`redact_raw_data_urls=True`).
59
+
60
+ ## [0.6.0] - 2026-04-16
61
+
62
+ ### Changed
63
+
64
+ - Rebranded the package to Prism Evals.
65
+ - Renamed the Python distribution to `prism-evals` and the import package to
66
+ `prism_evals`.
67
+ - Added `prism`, `prism-evals`, and `pe` console scripts.
68
+ - Updated the local viewer, scaffolded instructions, docs, examples, and tests
69
+ to use Prism Evals naming.
70
+
71
+ ## [0.5.1] - 2026-04-15
72
+
73
+ ### Fixed
74
+
75
+ - Bundle the local viewer with Python wheels so installed packages can launch `prism view`.
76
+ - Install viewer npm dependencies automatically on first launch when they are missing.
77
+
78
+ ## [0.5.0] - 2026-04-15
79
+
80
+ ### Added
81
+
82
+ - `prism view <runs_dir>` opens a read-only local Next.js viewer for parent directories of eval runs.
83
+ - Viewer pages for all runs, run detail, score matrices, artifact downloads, and lane comparisons across `run + model_key` pairs.
84
+
85
+ ## [0.4.5] - 2026-04-15
86
+
87
+ ### Added
88
+
89
+ - `Experiment(..., artifacts=[...])` can copy prompt files, configs, and other user files into the run output folder.
90
+
91
+ ## [0.4.4] - 2026-04-15
92
+
93
+ ### Added
94
+
95
+ - Console output now includes a model-column score pivot table by eval key.
96
+
97
+ ## [0.4.3] - 2026-04-15
98
+
99
+ ### Added
100
+
101
+ - Console token usage now shows average per item run and totals for input, cached, output, reasoning, and total tokens.
102
+
103
+ ## [0.4.2] - 2026-04-15
104
+
105
+ ### Fixed
106
+
107
+ - Declared the `src/prism_evals` package for Hatchling wheel builds so `prism-evals` installs correctly from Git tags.
108
+
109
+ ## [0.4.1] - 2026-04-15
110
+
111
+ ### Added
112
+
113
+ - Timestamp-prefixed output directories by default, with `timestamp_output_dir=False` to keep stable experiment folders.
114
+
115
+ ## [0.4.0] - 2026-04-15
116
+
117
+ ### Added
118
+
119
+ - Multi-step workflows via `ctx.step(...)`, with step-owned outputs, evals, generations, usage, latency, and errors.
120
+ - Step selectors: `step(...)` and `step_text(...)`.
121
+ - `steps.csv` artifact and scoped score rows in `scores.csv`.
122
+
123
+ ### Changed
124
+
125
+ - Renamed public terminology from row/execution/task to item/item-run/workflow.
126
+ - Replaced `exp.task = callable` with `exp.workflow = callable`.
127
+ - Replaced the dataset selector `row(...)` with `item(...)`.
128
+ - Renamed `ExecutionRecord` to `ItemRunRecord` and public result fields to item/item-run names.
129
+
130
+ ## [0.3.0] - 2026-04-15
131
+
132
+ ### Changed
133
+
134
+ - Replaced task decorators with direct task assignment via `exp.task = callable`.
135
+ - Removed eval decorator registration in favor of `exp.eval("key", evaluator)`.
136
+ - Task callables may now be sync or async, including callable task objects.
137
+
138
+ ## [0.2.0] - 2026-04-15
139
+
140
+ ### Added
141
+
142
+ - Unified `exp.eval("key", evaluator)` registration for custom functions and built-in evaluators.
143
+ - Built-in deterministic evaluators for equality, approximate equality, containment, regex, non-empty values, length bounds, and JSON path checks.
144
+ - Selector helpers: `row(...)`, `out(...)`, and `text(...)`.
145
+
146
+ ## [0.1.0] - 2026-04-15
147
+
148
+ ### Added
149
+
150
+ - Initial Prism Evals Python package with decorator-style experiments.
151
+ - `Experiment`, `ModelConfig`, `TaskOutput`, and `EvalResult` public APIs.
152
+ - OpenAI Responses API wrapper for automatic raw request/response capture.
153
+ - Token usage capture for input, cached, output, reasoning, and total tokens.
154
+ - Per-call latency and per-execution duration tracking.
155
+ - CSV dataset loading with row/model/repetition execution matrix.
156
+ - Bounded async concurrency, retries, resume support, and fail-fast option.
157
+ - Local artifacts: `manifest.json`, `results.jsonl`, `results.csv`, and `scores.csv`.
158
+ - Rich terminal progress and summary tables.
159
+ - Example QA experiment and pytest coverage.
@@ -0,0 +1,86 @@
1
+ # Migration Guide
2
+
3
+ ## 0.7.0: `TaskOutput` Is Required
4
+
5
+ Prism workflows and step callables must now return `TaskOutput`. Bare strings and
6
+ dicts are no longer normalized into `TaskOutput` values.
7
+
8
+ Prism also no longer owns OpenAI SDK calls for normal workflows. Import and use
9
+ the OpenAI SDK directly in your experiment file:
10
+
11
+ ```python
12
+ from openai import AsyncOpenAI
13
+
14
+ client = AsyncOpenAI()
15
+ ```
16
+
17
+ Use that client for Responses API, Image API, or any other OpenAI call, then
18
+ return a `TaskOutput` with the data Prism should evaluate and store.
19
+
20
+ Text output:
21
+
22
+ ```python
23
+ from openai import AsyncOpenAI
24
+ from prism_evals import TaskOutput
25
+
26
+ client = AsyncOpenAI()
27
+
28
+ response = await client.responses.create(...)
29
+
30
+ # Old
31
+ return response.output_text
32
+
33
+ # New
34
+ return TaskOutput(text=response.output_text)
35
+ ```
36
+
37
+ Structured output:
38
+
39
+ ```python
40
+ # Old
41
+ return {"answer": parsed}
42
+
43
+ # New
44
+ return TaskOutput(text=json.dumps(parsed), value=parsed)
45
+ ```
46
+
47
+ Step output:
48
+
49
+ ```python
50
+ # Old
51
+ draft = await ctx.step("draft", lambda: "hello")
52
+
53
+ # New
54
+ draft = await ctx.step("draft", lambda: TaskOutput(text="hello"))
55
+ ```
56
+
57
+ Generated media:
58
+
59
+ ```python
60
+ from openai import AsyncOpenAI
61
+ from prism_evals import TaskOutput
62
+
63
+ client = AsyncOpenAI()
64
+
65
+ async def workflow(item, model, ctx):
66
+ response = await client.images.generate(
67
+ model=model.model,
68
+ prompt=item["prompt"],
69
+ response_format="b64_json",
70
+ **model.params,
71
+ )
72
+ image = ctx.media.from_base64(response.data[0].b64_json, format="png")
73
+ return TaskOutput(text="Generated image", media=[image])
74
+ ```
75
+
76
+ Generated files now live in `media/`. The `artifacts/` directory remains for
77
+ files copied from the experiment source tree with `Experiment(...,
78
+ artifacts=[...])`.
79
+
80
+ Custom JSONL consumers should read generated media from `output.media` or
81
+ `steps[].output.media`. Prism no longer expects raw provider responses to be the
82
+ source of truth for generated images.
83
+
84
+ Direct SDK calls are not automatically recorded as Prism generation records, and
85
+ raw OpenAI request/response payloads are not captured by default. Store any
86
+ important call details in `TaskOutput.metadata` when your eval needs them.