assessment-bench 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- assessment_bench-0.1.0/.gitignore +10 -0
- assessment_bench-0.1.0/LICENSE +21 -0
- assessment_bench-0.1.0/PKG-INFO +136 -0
- assessment_bench-0.1.0/README.md +89 -0
- assessment_bench-0.1.0/pyproject.toml +59 -0
- assessment_bench-0.1.0/src/assessment_bench/__init__.py +45 -0
- assessment_bench-0.1.0/src/assessment_bench/arms.py +122 -0
- assessment_bench-0.1.0/src/assessment_bench/cli.py +70 -0
- assessment_bench-0.1.0/src/assessment_bench/data/example-experiment.yaml +27 -0
- assessment_bench-0.1.0/src/assessment_bench/exceptions.py +5 -0
- assessment_bench-0.1.0/src/assessment_bench/experiment.py +141 -0
- assessment_bench-0.1.0/src/assessment_bench/models.py +136 -0
- assessment_bench-0.1.0/src/assessment_bench/providers.py +114 -0
- assessment_bench-0.1.0/src/assessment_bench/report.py +58 -0
- assessment_bench-0.1.0/src/assessment_bench/stats.py +79 -0
- assessment_bench-0.1.0/tests/test_arms.py +36 -0
- assessment_bench-0.1.0/tests/test_models.py +44 -0
- assessment_bench-0.1.0/tests/test_stats.py +44 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Michael Borck
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: assessment-bench
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Benchmark assessment approaches: pure-LLM marking vs the family's signal-based observations, with repeated runs and agreement statistics.
|
|
5
|
+
Project-URL: Homepage, https://github.com/michael-borck/assessment-bench
|
|
6
|
+
Author: Michael Borck
|
|
7
|
+
License: MIT License
|
|
8
|
+
|
|
9
|
+
Copyright (c) 2026 Michael Borck
|
|
10
|
+
|
|
11
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
12
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
13
|
+
in the Software without restriction, including without limitation the rights
|
|
14
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
15
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
16
|
+
furnished to do so, subject to the following conditions:
|
|
17
|
+
|
|
18
|
+
The above copyright notice and this permission notice shall be included in all
|
|
19
|
+
copies or substantial portions of the Software.
|
|
20
|
+
|
|
21
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
22
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
23
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
24
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
25
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
26
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
27
|
+
SOFTWARE.
|
|
28
|
+
License-File: LICENSE
|
|
29
|
+
Classifier: Development Status :: 3 - Alpha
|
|
30
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
31
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
32
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
33
|
+
Requires-Python: >=3.11
|
|
34
|
+
Requires-Dist: assessment-lens>=0.2.0
|
|
35
|
+
Requires-Dist: pydantic>=2.5.0
|
|
36
|
+
Requires-Dist: pyyaml>=6.0.0
|
|
37
|
+
Requires-Dist: rich>=13.7.0
|
|
38
|
+
Provides-Extra: analysers
|
|
39
|
+
Requires-Dist: assessment-lens[analysers]>=0.2.0; extra == 'analysers'
|
|
40
|
+
Provides-Extra: dev
|
|
41
|
+
Requires-Dist: pytest-cov>=4.0.0; extra == 'dev'
|
|
42
|
+
Requires-Dist: pytest>=8.0.0; extra == 'dev'
|
|
43
|
+
Provides-Extra: llm
|
|
44
|
+
Requires-Dist: anthropic>=0.40.0; extra == 'llm'
|
|
45
|
+
Requires-Dist: openai>=1.12.0; extra == 'llm'
|
|
46
|
+
Description-Content-Type: text/markdown
|
|
47
|
+
|
|
48
|
+
# assessment-bench
|
|
49
|
+
|
|
50
|
+
Part of the [lens family](https://github.com/michael-borck/lens-analysers).
|
|
51
|
+
|
|
52
|
+
[![Python 3.11+](https://img.shields.io/badge/python-3.11+-blue.svg)](https://www.python.org/downloads/)
|
|
53
|
+
[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
|
|
54
|
+
|
|
55
|
+
**Benchmark assessment approaches.** Run one cohort through competing
|
|
56
|
+
assessment arms — pure-LLM marking (the baseline) and the family's
|
|
57
|
+
signal-based observations (`assessment-lens`) — with repeated runs,
|
|
58
|
+
consistency statistics, and agreement against human marks.
|
|
59
|
+
**The bench measures; it never marks.**
|
|
60
|
+
|
|
61
|
+
> `assessment-bench` is a *bench* (a measurement product), not an `-analyser`
|
|
62
|
+
> and not a marking tool. It exists to answer research questions like: *how
|
|
63
|
+
> consistent is LLM marking across repeated runs and providers?* and *which
|
|
64
|
+
> deterministic signals actually track human judgement?*
|
|
65
|
+
|
|
66
|
+
## What it does
|
|
67
|
+
|
|
68
|
+
```
|
|
69
|
+
experiment.yaml (rubric + cohort + arms)
|
|
70
|
+
├─ llm arm(s) : submission + rubric → provider → score × repetitions
|
|
71
|
+
├─ signals arm : assessment-lens → evidence values (deterministic, once)
|
|
72
|
+
└─ human marks : optional ground-truth CSV
|
|
73
|
+
↓
|
|
74
|
+
result.json + runs.csv + signals.csv + agreement.csv
|
|
75
|
+
• per-submission consistency: mean / median / std-dev / CV / reliability
|
|
76
|
+
• agreement: Pearson & Spearman of every arm mean and every numeric signal
|
|
77
|
+
against the human marks
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
## Install
|
|
81
|
+
|
|
82
|
+
```bash
|
|
83
|
+
# from source (family layout)
|
|
84
|
+
uv venv && source .venv/bin/activate
|
|
85
|
+
uv pip install -e ".[dev]"
|
|
86
|
+
|
|
87
|
+
# the signals arm needs the analyser stack (bundle-analyser CLI on PATH):
|
|
88
|
+
uv pip install -e ".[analysers]"
|
|
89
|
+
|
|
90
|
+
# LLM arms (Anthropic, OpenAI, Ollama, OpenRouter):
|
|
91
|
+
uv pip install -e ".[llm]" # + export ANTHROPIC_API_KEY / OPENAI_API_KEY / ...
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
## Quick start
|
|
95
|
+
|
|
96
|
+
```bash
|
|
97
|
+
assessment-bench init experiment.yaml # commented example config
|
|
98
|
+
# edit: point at your rubric.yaml + submissions/, choose arms
|
|
99
|
+
assessment-bench run experiment.yaml -o out/
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
LLM arms specify provider **and** model per arm — comparing
|
|
103
|
+
`claude-haiku-4-5` vs `gpt-4o-mini` vs a local `llama3.1` via Ollama is just
|
|
104
|
+
three arms in one config.
|
|
105
|
+
|
|
106
|
+
## Relationship to the family
|
|
107
|
+
|
|
108
|
+
- **Analysers** generate deterministic signals (assessment-agnostic).
|
|
109
|
+
- **assessment-lens** maps signals to a rubric as observations — never scores.
|
|
110
|
+
- **assessment-bench** measures both approaches against human judgement. The
|
|
111
|
+
LLM arm produces scores *because that is the approach under test*; the bench
|
|
112
|
+
treats them as data points, not grades for students.
|
|
113
|
+
|
|
114
|
+
## Status
|
|
115
|
+
|
|
116
|
+
**v0.1 scaffold.** Working today:
|
|
117
|
+
|
|
118
|
+
- ✅ Experiment config (YAML) → cohort discovery → arms → structured results
|
|
119
|
+
- ✅ LLM arm: multi-provider (anthropic / openai / ollama / openrouter), repeated
|
|
120
|
+
runs, strict `SCORE: x/y` extraction with scaled fallback
|
|
121
|
+
- ✅ Signals arm: one `assessment-lens` pass; raw evidence values consumed
|
|
122
|
+
(not the presence-based coverage)
|
|
123
|
+
- ✅ Consistency stats (ported from the original Rust prototype) + Pearson/Spearman
|
|
124
|
+
agreement vs human marks
|
|
125
|
+
- 📋 Hybrid arm (LLM marking with analyser signals in context) — next
|
|
126
|
+
- 📋 HTTP service + desktop shell for non-technical researchers — planned
|
|
127
|
+
|
|
128
|
+
## Development
|
|
129
|
+
|
|
130
|
+
```bash
|
|
131
|
+
pytest -v
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
## License
|
|
135
|
+
|
|
136
|
+
MIT — see [LICENSE](LICENSE).
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
# assessment-bench
|
|
2
|
+
|
|
3
|
+
Part of the [lens family](https://github.com/michael-borck/lens-analysers).
|
|
4
|
+
|
|
5
|
+
[![Python 3.11+](https://img.shields.io/badge/python-3.11+-blue.svg)](https://www.python.org/downloads/)
|
|
6
|
+
[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
|
|
7
|
+
|
|
8
|
+
**Benchmark assessment approaches.** Run one cohort through competing
|
|
9
|
+
assessment arms — pure-LLM marking (the baseline) and the family's
|
|
10
|
+
signal-based observations (`assessment-lens`) — with repeated runs,
|
|
11
|
+
consistency statistics, and agreement against human marks.
|
|
12
|
+
**The bench measures; it never marks.**
|
|
13
|
+
|
|
14
|
+
> `assessment-bench` is a *bench* (a measurement product), not an `-analyser`
|
|
15
|
+
> and not a marking tool. It exists to answer research questions like: *how
|
|
16
|
+
> consistent is LLM marking across repeated runs and providers?* and *which
|
|
17
|
+
> deterministic signals actually track human judgement?*
|
|
18
|
+
|
|
19
|
+
## What it does
|
|
20
|
+
|
|
21
|
+
```
|
|
22
|
+
experiment.yaml (rubric + cohort + arms)
|
|
23
|
+
├─ llm arm(s) : submission + rubric → provider → score × repetitions
|
|
24
|
+
├─ signals arm : assessment-lens → evidence values (deterministic, once)
|
|
25
|
+
└─ human marks : optional ground-truth CSV
|
|
26
|
+
↓
|
|
27
|
+
result.json + runs.csv + signals.csv + agreement.csv
|
|
28
|
+
• per-submission consistency: mean / median / std-dev / CV / reliability
|
|
29
|
+
• agreement: Pearson & Spearman of every arm mean and every numeric signal
|
|
30
|
+
against the human marks
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
## Install
|
|
34
|
+
|
|
35
|
+
```bash
|
|
36
|
+
# from source (family layout)
|
|
37
|
+
uv venv && source .venv/bin/activate
|
|
38
|
+
uv pip install -e ".[dev]"
|
|
39
|
+
|
|
40
|
+
# the signals arm needs the analyser stack (bundle-analyser CLI on PATH):
|
|
41
|
+
uv pip install -e ".[analysers]"
|
|
42
|
+
|
|
43
|
+
# LLM arms (Anthropic, OpenAI, Ollama, OpenRouter):
|
|
44
|
+
uv pip install -e ".[llm]" # + export ANTHROPIC_API_KEY / OPENAI_API_KEY / ...
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
## Quick start
|
|
48
|
+
|
|
49
|
+
```bash
|
|
50
|
+
assessment-bench init experiment.yaml # commented example config
|
|
51
|
+
# edit: point at your rubric.yaml + submissions/, choose arms
|
|
52
|
+
assessment-bench run experiment.yaml -o out/
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
LLM arms specify provider **and** model per arm — comparing
|
|
56
|
+
`claude-haiku-4-5` vs `gpt-4o-mini` vs a local `llama3.1` via Ollama is just
|
|
57
|
+
three arms in one config.
|
|
58
|
+
|
|
59
|
+
## Relationship to the family
|
|
60
|
+
|
|
61
|
+
- **Analysers** generate deterministic signals (assessment-agnostic).
|
|
62
|
+
- **assessment-lens** maps signals to a rubric as observations — never scores.
|
|
63
|
+
- **assessment-bench** measures both approaches against human judgement. The
|
|
64
|
+
LLM arm produces scores *because that is the approach under test*; the bench
|
|
65
|
+
treats them as data points, not grades for students.
|
|
66
|
+
|
|
67
|
+
## Status
|
|
68
|
+
|
|
69
|
+
**v0.1 scaffold.** Working today:
|
|
70
|
+
|
|
71
|
+
- ✅ Experiment config (YAML) → cohort discovery → arms → structured results
|
|
72
|
+
- ✅ LLM arm: multi-provider (anthropic / openai / ollama / openrouter), repeated
|
|
73
|
+
runs, strict `SCORE: x/y` extraction with scaled fallback
|
|
74
|
+
- ✅ Signals arm: one `assessment-lens` pass; raw evidence values consumed
|
|
75
|
+
(not the presence-based coverage)
|
|
76
|
+
- ✅ Consistency stats (ported from the original Rust prototype) + Pearson/Spearman
|
|
77
|
+
agreement vs human marks
|
|
78
|
+
- 📋 Hybrid arm (LLM marking with analyser signals in context) — next
|
|
79
|
+
- 📋 HTTP service + desktop shell for non-technical researchers — planned
|
|
80
|
+
|
|
81
|
+
## Development
|
|
82
|
+
|
|
83
|
+
```bash
|
|
84
|
+
pytest -v
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
## License
|
|
88
|
+
|
|
89
|
+
MIT — see [LICENSE](LICENSE).
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "assessment-bench"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Benchmark assessment approaches: pure-LLM marking vs the family's signal-based observations, with repeated runs and agreement statistics."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = { file = "LICENSE" }
|
|
11
|
+
requires-python = ">=3.11"
|
|
12
|
+
authors = [{ name = "Michael Borck" }]
|
|
13
|
+
classifiers = [
|
|
14
|
+
"Development Status :: 3 - Alpha",
|
|
15
|
+
"License :: OSI Approved :: MIT License",
|
|
16
|
+
"Programming Language :: Python :: 3.11",
|
|
17
|
+
"Programming Language :: Python :: 3.12",
|
|
18
|
+
]
|
|
19
|
+
dependencies = [
|
|
20
|
+
"assessment-lens>=0.2.0",
|
|
21
|
+
"pydantic>=2.5.0",
|
|
22
|
+
"pyyaml>=6.0.0",
|
|
23
|
+
"rich>=13.7.0",
|
|
24
|
+
]
|
|
25
|
+
|
|
26
|
+
[project.optional-dependencies]
|
|
27
|
+
# The pure-LLM marking arm. anthropic for Anthropic; openai also covers
|
|
28
|
+
# Ollama / OpenRouter / any OpenAI-compatible endpoint via base_url.
|
|
29
|
+
llm = [
|
|
30
|
+
"anthropic>=0.40.0",
|
|
31
|
+
"openai>=1.12.0",
|
|
32
|
+
]
|
|
33
|
+
# Pull the analyser stack into the same env so the signals arm runs for real
|
|
34
|
+
# (assessment-lens shells out to the bundle-analyser CLI).
|
|
35
|
+
analysers = [
|
|
36
|
+
"assessment-lens[analysers]>=0.2.0",
|
|
37
|
+
]
|
|
38
|
+
dev = [
|
|
39
|
+
"pytest>=8.0.0",
|
|
40
|
+
"pytest-cov>=4.0.0",
|
|
41
|
+
]
|
|
42
|
+
|
|
43
|
+
[project.scripts]
|
|
44
|
+
assessment-bench = "assessment_bench.cli:main"
|
|
45
|
+
|
|
46
|
+
# Local dev: resolve family members from sibling checkouts. uv strips this from
|
|
47
|
+
# the published wheel, which keeps the plain PyPI pins.
|
|
48
|
+
[tool.uv.sources]
|
|
49
|
+
assessment-lens = { path = "../assessment-lens", editable = true }
|
|
50
|
+
|
|
51
|
+
[project.urls]
|
|
52
|
+
Homepage = "https://github.com/michael-borck/assessment-bench"
|
|
53
|
+
|
|
54
|
+
[tool.hatch.build.targets.wheel]
|
|
55
|
+
packages = ["src/assessment_bench"]
|
|
56
|
+
|
|
57
|
+
[tool.pytest.ini_options]
|
|
58
|
+
testpaths = ["tests"]
|
|
59
|
+
pythonpath = ["src"]
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
"""assessment-bench — benchmark assessment approaches for the lens family.
|
|
2
|
+
|
|
3
|
+
Runs one cohort through competing assessment arms (pure-LLM marking as the
|
|
4
|
+
baseline; assessment-lens signal observations as the approach under study),
|
|
5
|
+
with repeated runs, consistency statistics, and agreement against human marks.
|
|
6
|
+
**The bench measures; it never marks.**
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from .exceptions import AssessmentBenchError
|
|
10
|
+
from .experiment import load_config, run_experiment
|
|
11
|
+
from .models import (
|
|
12
|
+
Agreement,
|
|
13
|
+
ArmKind,
|
|
14
|
+
ArmOutcome,
|
|
15
|
+
ArmSpec,
|
|
16
|
+
ExperimentConfig,
|
|
17
|
+
ExperimentResult,
|
|
18
|
+
GradeRun,
|
|
19
|
+
ProviderName,
|
|
20
|
+
ProviderSpec,
|
|
21
|
+
RunStats,
|
|
22
|
+
SignalReading,
|
|
23
|
+
)
|
|
24
|
+
from .report import write_results
|
|
25
|
+
|
|
26
|
+
__version__ = "0.1.0"
|
|
27
|
+
|
|
28
|
+
__all__ = [
|
|
29
|
+
"Agreement",
|
|
30
|
+
"ArmKind",
|
|
31
|
+
"ArmOutcome",
|
|
32
|
+
"ArmSpec",
|
|
33
|
+
"AssessmentBenchError",
|
|
34
|
+
"ExperimentConfig",
|
|
35
|
+
"ExperimentResult",
|
|
36
|
+
"GradeRun",
|
|
37
|
+
"ProviderName",
|
|
38
|
+
"ProviderSpec",
|
|
39
|
+
"RunStats",
|
|
40
|
+
"SignalReading",
|
|
41
|
+
"__version__",
|
|
42
|
+
"load_config",
|
|
43
|
+
"run_experiment",
|
|
44
|
+
"write_results",
|
|
45
|
+
]
|
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
"""The assessment arms under test.
|
|
2
|
+
|
|
3
|
+
LLM arm — the approach the family deliberately moved away from (an LLM reading
|
|
4
|
+
a submission and emitting a mark), kept here as the benchmark baseline. Prompt
|
|
5
|
+
shape ports the original Rust prototype's Tier-1 design; the score comes from a
|
|
6
|
+
strict trailing ``SCORE: x/y`` line with a permissive regex fallback.
|
|
7
|
+
|
|
8
|
+
Signals arm — assessment-lens observations. Deterministic, so it runs once per
|
|
9
|
+
cohort regardless of repetitions; the bench consumes raw evidence values (not
|
|
10
|
+
the presence-based coverage column) and correlates each numeric signal with the
|
|
11
|
+
human marks.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import re
|
|
17
|
+
from pathlib import Path
|
|
18
|
+
|
|
19
|
+
from assessment_lens.assess import assess
|
|
20
|
+
from assessment_lens.rubric import load_rubric
|
|
21
|
+
|
|
22
|
+
from . import providers
|
|
23
|
+
from .models import ArmSpec, GradeRun, SignalReading
|
|
24
|
+
|
|
25
|
+
# Submission text for the LLM arm. Plain-text formats are read directly;
|
|
26
|
+
# .pdf/.docx go through the family's canonical extractor when installed.
|
|
27
|
+
_PLAIN_TEXT_SUFFIXES = {".md", ".txt", ".py", ".js", ".ts", ".r", ".sql", ".csv"}
|
|
28
|
+
_EXTRACTOR_SUFFIXES = {".pdf", ".docx", ".pptx"}
|
|
29
|
+
|
|
30
|
+
_GRADE_SYSTEM = (
|
|
31
|
+
"You are an experienced university marker. Grade the submission against the "
|
|
32
|
+
"rubric. Be consistent and justify briefly. End your response with exactly "
|
|
33
|
+
"one line in the form 'SCORE: <number>/<max>' and nothing after it."
|
|
34
|
+
)
|
|
35
|
+
|
|
36
|
+
_SCORE_RE = re.compile(r"SCORE:\s*(\d+(?:\.\d+)?)\s*/\s*(\d+(?:\.\d+)?)", re.IGNORECASE)
|
|
37
|
+
_FALLBACK_RE = re.compile(r"(\d+(?:\.\d+)?)\s*(?:/|out of)\s*(\d+(?:\.\d+)?)")
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def read_submission_text(folder: Path) -> str:
    """Concatenate the readable artefacts in one submission folder.

    Every file under ``folder`` (recursively, in sorted path order) is
    considered. Each included file contributes one part headed by
    ``--- <file name> ---``; parts are joined with a blank line.

    * Suffixes in ``_PLAIN_TEXT_SUFFIXES`` are read directly.
    * Suffixes in ``_EXTRACTOR_SUFFIXES`` go through
      ``document_analyser.extract_text``; if that import fails, a
      "skipped" note is emitted in place of the content.
    * Any other file is ignored.

    Args:
        folder: The submission folder to read.

    Returns:
        The concatenated text, or ``""`` when nothing readable was found.
    """
    parts: list[str] = []
    for path in sorted(folder.rglob("*")):
        if not path.is_file():
            continue
        suffix = path.suffix.lower()
        if suffix in _PLAIN_TEXT_SUFFIXES:
            # Pin the encoding: without it the text is decoded with the host
            # locale's default, so the same submission could produce a
            # different prompt on different machines. Undecodable bytes are
            # still replaced rather than raised.
            text = path.read_text(encoding="utf-8", errors="replace")
            parts.append(f"--- {path.name} ---\n{text}")
        elif suffix in _EXTRACTOR_SUFFIXES:
            try:
                # Imported lazily so the extractor stays an optional dependency.
                from document_analyser import extract_text

                parts.append(f"--- {path.name} ---\n{extract_text(path)}")
            except ImportError:
                parts.append(f"--- {path.name} --- (skipped: install the [analysers] extra to extract {suffix})")
    return "\n\n".join(parts)
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def extract_score(response: str, max_score: float) -> tuple[float | None, float]:
    """Pull ``(score, max_score)`` out of an LLM response.

    The strict ``SCORE: x/y`` pattern is tried first; only when it matches
    nowhere is the permissive ``x/y`` / ``x out of y`` pattern used. The
    last match in the response wins. When the denominator differs from
    ``max_score`` the score is rescaled proportionally onto ``max_score``.

    Args:
        response: Raw text returned by the provider.
        max_score: The scale the caller wants the score expressed on.

    Returns:
        ``(score, max_score)``. ``score`` is ``None`` when no score pattern
        is found or when the matched denominator is zero.
    """
    matches = _SCORE_RE.findall(response) or _FALLBACK_RE.findall(response)
    if not matches:
        return None, max_score
    raw, denom = (float(v) for v in matches[-1])
    if denom == 0:
        # "x/0" cannot be rescaled; passing x through unchanged would present
        # it as a mark out of max_score, so report no usable score instead.
        return None, max_score
    if denom != max_score:
        raw = raw / denom * max_score
    return raw, max_score
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def grade_prompt(rubric_text: str, submission_text: str, max_score: float) -> str:
    """Build the user prompt for one grading call.

    The prompt has three sections separated by blank lines: the rubric, the
    submission, and the grading instruction. ``max_score`` is rendered with
    the ``g`` format, so ``100.0`` appears as ``100``.
    """
    out_of = f"{max_score:g}"
    sections = [
        f"RUBRIC:\n{rubric_text}",
        f"SUBMISSION:\n{submission_text}",
        f"Grade the submission against the rubric out of {out_of}. "
        f"Give 2-3 sentences of rationale, then the final 'SCORE: x/{out_of}' line.",
    ]
    return "\n\n".join(sections)
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def run_llm_arm(
    arm: ArmSpec,
    submission_id: str,
    submission_folder: Path,
    rubric_text: str,
    max_score: float,
) -> list[GradeRun]:
    """Run every repetition of one LLM arm against one submission.

    The submission text and prompt are built once and reused for each
    repetition. Anything raised while calling the provider or parsing its
    reply is stored on that run's ``error`` field instead of propagating,
    so one bad call does not abort the cohort.

    Returns:
        One ``GradeRun`` per repetition, in run-index order.
    """
    assert arm.provider is not None  # validated by ArmSpec
    prompt = grade_prompt(rubric_text, read_submission_text(submission_folder), max_score)

    def attempt(index: int) -> GradeRun:
        """Execute a single repetition and return its record."""
        record = GradeRun(submission_id=submission_id, arm_id=arm.id, run_index=index, max_score=max_score)
        try:
            reply = providers.complete(prompt, system=_GRADE_SYSTEM, spec=arm.provider)
            record.raw_response = reply
            record.score, _ = extract_score(reply, max_score)
            # The rationale is the reply with any strict SCORE line removed.
            record.rationale = _SCORE_RE.sub("", reply).strip()
            if record.score is None:
                record.error = "no SCORE line found in response"
        except Exception as exc:  # one bad call must not kill a cohort run
            record.error = str(exc)
        return record

    return [attempt(i) for i in range(arm.repetitions)]
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def run_signals_arm(arm: ArmSpec, rubric_path: Path, submissions_dir: Path) -> list[SignalReading]:
    """Run assessment-lens once over the cohort and flatten its evidence.

    The rubric is loaded from ``rubric_path`` and assessed against
    ``submissions_dir``. Each evidence item of each observation of each
    submission becomes one ``SignalReading``, in traversal order. ``arm``
    is accepted for interface symmetry and is not read here.
    """
    assessed = assess(load_rubric(rubric_path), submissions_dir)
    return [
        SignalReading(
            submission_id=sub.submission_id,
            criterion_id=obs.criterion_id,
            signal=item.signal,
            value=item.value,
        )
        for sub in assessed.submissions
        for obs in sub.observations
        for item in obs.evidence
    ]
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
"""assessment-bench CLI.
|
|
2
|
+
|
|
3
|
+
assessment-bench run experiment.yaml -o out/
|
|
4
|
+
assessment-bench init my-experiment.yaml
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import argparse
|
|
10
|
+
import shutil
|
|
11
|
+
import sys
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
|
|
14
|
+
from rich.console import Console
|
|
15
|
+
|
|
16
|
+
from .exceptions import AssessmentBenchError
|
|
17
|
+
from .experiment import load_config, run_experiment
|
|
18
|
+
from .report import write_results
|
|
19
|
+
|
|
20
|
+
console = Console()
|
|
21
|
+
|
|
22
|
+
_EXAMPLE = Path(__file__).parent / "data" / "example-experiment.yaml"
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def main(argv: list[str] | None = None) -> int:
    """Entry point for the ``assessment-bench`` command line.

    Sub-commands:
        ``run <config> [-o OUT]``: load the experiment, run it, write the
        result files and print the agreement table when there is one.
        ``init [path]``: copy the bundled example config to ``path``,
        refusing to overwrite an existing file.

    Args:
        argv: Argument list to parse; ``None`` lets argparse use ``sys.argv``.

    Returns:
        ``0`` on success; ``1`` when ``init`` would overwrite a file or an
        ``AssessmentBenchError`` is raised.
    """
    parser = argparse.ArgumentParser(
        prog="assessment-bench",
        description="Benchmark assessment approaches over one cohort: pure-LLM marking vs signal-based observation.",
    )
    sub = parser.add_subparsers(dest="command", required=True)

    run_p = sub.add_parser("run", help="Run an experiment config over its cohort.")
    run_p.add_argument("config", type=Path, help="Experiment YAML.")
    run_p.add_argument("-o", "--out", type=Path, default=Path("bench-out"), help="Output folder.")

    init_p = sub.add_parser("init", help="Write a commented example experiment config.")
    init_p.add_argument("path", type=Path, nargs="?", default=Path("experiment.yaml"))

    args = parser.parse_args(argv)

    try:
        if args.command == "init":
            if args.path.exists():
                console.print(f"[red]refusing to overwrite {args.path}[/red]")
                return 1
            shutil.copy(_EXAMPLE, args.path)
            console.print(f"✓ wrote {args.path} — edit it, then: assessment-bench run {args.path}")
            return 0

        config = load_config(args.config)
        console.print(
            f"[bold]{config.name}[/bold] — {len(config.arms)} arms, max score {config.max_score:g}"
        )
        result = run_experiment(config, progress=lambda msg: console.print(f" {msg}"))
        written = write_results(result, args.out)
        console.print(f"✓ {len(result.submissions)} submissions → " + ", ".join(str(p) for p in written))
        if result.agreements:
            console.print("[bold]Agreement with human marks:[/bold]")
            # Highest Pearson r first; undefined correlations sort last via
            # the -2 sentinel (below any real r). The None test is explicit
            # because a truthiness test would also catch r == 0.0 and rank a
            # genuine zero correlation as if it were undefined.
            ranked = sorted(
                result.agreements,
                key=lambda a: -(a.pearson if a.pearson is not None else -2),
            )
            for a in ranked:
                if a.pearson is not None and a.spearman is not None:
                    console.print(f" {a.measure}: r={a.pearson:.3f} rho={a.spearman:.3f} (n={a.n})")
                else:
                    console.print(f" {a.measure}: undefined (n={a.n})")
        return 0
    except AssessmentBenchError as exc:
        console.print(f"[red]error:[/red] {exc}")
        return 1
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
if __name__ == "__main__":
|
|
70
|
+
sys.exit(main())
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
# assessment-bench experiment config.
|
|
2
|
+
# Paths are relative to this file. One subfolder under `submissions` = one submission.
|
|
3
|
+
name: "My experiment"
|
|
4
|
+
rubric: rubric.yaml # assessment-lens structured rubric (criteria + pinned signals)
|
|
5
|
+
submissions: submissions/
|
|
6
|
+
max_score: 100
|
|
7
|
+
# Optional ground truth — enables agreement statistics (Pearson/Spearman):
|
|
8
|
+
# human_marks: marks.csv # CSV with header: submission_id,mark
|
|
9
|
+
arms:
|
|
10
|
+
# The baseline under test: an LLM reads the submission + rubric and emits a mark.
|
|
11
|
+
- id: llm-haiku
|
|
12
|
+
kind: llm
|
|
13
|
+
repetitions: 3 # repeated runs -> consistency stats (mean/CV/reliability)
|
|
14
|
+
provider:
|
|
15
|
+
provider: anthropic # anthropic | openai | ollama | openrouter
|
|
16
|
+
model: claude-haiku-4-5
|
|
17
|
+
temperature: 0.1
|
|
18
|
+
# Local model via Ollama (any OpenAI-compatible endpoint works via base_url):
|
|
19
|
+
# - id: llm-local
|
|
20
|
+
# kind: llm
|
|
21
|
+
# repetitions: 3
|
|
22
|
+
# provider:
|
|
23
|
+
# provider: ollama
|
|
24
|
+
# model: llama3.1
|
|
25
|
+
# Signal-based observation via assessment-lens (deterministic; runs once):
|
|
26
|
+
- id: signals
|
|
27
|
+
kind: signals
|
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
"""Experiment runner — the bench's orchestration spine.
|
|
2
|
+
|
|
3
|
+
One experiment = one rubric + one cohort + N arms. Each LLM arm runs per
|
|
4
|
+
submission x repetitions; the signals arm runs once per cohort (deterministic).
|
|
5
|
+
Afterwards, every arm's mean score and every numeric signal is correlated
|
|
6
|
+
against the human marks (when provided). The bench measures; it never marks.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import csv
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
|
|
14
|
+
import yaml
|
|
15
|
+
|
|
16
|
+
from . import arms, stats
|
|
17
|
+
from .exceptions import AssessmentBenchError
|
|
18
|
+
from .models import (
|
|
19
|
+
Agreement,
|
|
20
|
+
ArmKind,
|
|
21
|
+
ArmOutcome,
|
|
22
|
+
ExperimentConfig,
|
|
23
|
+
ExperimentResult,
|
|
24
|
+
)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def load_config(path: Path) -> ExperimentConfig:
    """Load and validate an experiment YAML file.

    ``rubric``, ``submissions`` and (when set) ``human_marks`` are resolved
    against the folder containing the config file, so the returned config
    carries absolute paths.

    Args:
        path: Location of the experiment YAML.

    Returns:
        The validated ``ExperimentConfig`` with its paths resolved.
    """
    config_path = Path(path)
    # Read as UTF-8 explicitly rather than with the host locale's default
    # encoding, so non-ASCII text in the config decodes the same everywhere.
    raw = yaml.safe_load(config_path.read_text(encoding="utf-8"))
    config = ExperimentConfig.model_validate(raw)
    base = config_path.resolve().parent
    config.rubric = (base / config.rubric).resolve()
    config.submissions = (base / config.submissions).resolve()
    if config.human_marks is not None:
        config.human_marks = (base / config.human_marks).resolve()
    return config
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def discover_submissions(submissions_dir: Path) -> list[Path]:
    """List the submission folders of a cohort, sorted by path.

    Each immediate subfolder of ``submissions_dir`` whose name does not
    start with a dot counts as one submission; plain files are ignored.

    Raises:
        AssessmentBenchError: ``submissions_dir`` is not a directory, or it
            contains no qualifying subfolder.
    """
    if not submissions_dir.is_dir():
        raise AssessmentBenchError(f"submissions folder not found: {submissions_dir}")

    folders = [
        entry
        for entry in submissions_dir.iterdir()
        if entry.is_dir() and not entry.name.startswith(".")
    ]
    if not folders:
        raise AssessmentBenchError(f"no submission subfolders in {submissions_dir}")

    folders.sort()
    return folders
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def load_human_marks(path: Path) -> dict[str, float]:
    """Read human marks from a CSV with a header row ``submission_id,mark``.

    The file is decoded as UTF-8, and a leading byte-order mark (as written
    by spreadsheet exports) is tolerated. Rows whose ``mark`` cell is blank
    or missing are skipped, so submissions that have not been marked yet
    do not abort the load.

    Args:
        path: Location of the marks CSV.

    Returns:
        Mapping of stripped submission id to its mark as a float.

    Raises:
        KeyError: The header lacks a ``submission_id`` or ``mark`` column.
        ValueError: A non-blank ``mark`` cell is not a number.
    """
    marks: dict[str, float] = {}
    # utf-8-sig strips a BOM if present; otherwise it behaves like utf-8.
    # Without it the first header becomes "\ufeffsubmission_id" and every
    # row lookup below fails.
    with open(path, newline="", encoding="utf-8-sig") as f:
        for row in csv.DictReader(f):
            raw_mark = row["mark"]
            # DictReader fills the missing cells of a short row with None.
            if raw_mark is None or not raw_mark.strip():
                continue
            marks[row["submission_id"].strip()] = float(raw_mark)
    return marks
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def _agreements(
    result: ExperimentResult, marks: dict[str, float]
) -> list[Agreement]:
    """Correlate every arm mean and every numeric signal with the human marks.

    Two families of measures are collected from ``result.outcomes``:
    per-arm mean scores (outcomes that carry ``stats``) keyed by arm id,
    and signal readings whose value is a bool, int or float, keyed by the
    signal name. Each measure is paired with ``marks`` on submission id;
    measures with fewer than two paired submissions are left out.
    """
    arm_means: dict[str, dict[str, float]] = {}
    signal_values: dict[str, dict[str, float]] = {}
    for outcome in result.outcomes:
        if outcome.stats is not None:
            arm_means.setdefault(outcome.arm_id, {})[outcome.submission_id] = outcome.stats.mean
        for reading in outcome.signals:
            # bool is an int subclass, so one check covers flags and numbers;
            # every other value type is not correlated.
            if isinstance(reading.value, (bool, int, float)):
                signal_values.setdefault(reading.signal, {})[reading.submission_id] = float(reading.value)

    found: list[Agreement] = []
    # On a name clash the signal entry replaces the arm entry.
    for measure, values in (arm_means | signal_values).items():
        shared = [sid for sid in values if sid in marks]
        if len(shared) < 2:
            continue
        xs = [values[sid] for sid in shared]
        ys = [marks[sid] for sid in shared]
        found.append(
            Agreement(
                measure=measure,
                n=len(shared),
                pearson=stats.pearson(xs, ys),
                spearman=stats.spearman(xs, ys),
            )
        )
    return found
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def run_experiment(config: ExperimentConfig, *, progress=None) -> ExperimentResult:
    """Run every arm over the cohort and assemble the structured result.

    ``progress`` is an optional callable(str) for CLI/UI status lines.

    Human marks, when configured, are parsed *before* any arm runs. A missing
    or malformed marks file therefore fails immediately rather than after the
    whole cohort has been sent through every arm.

    For a signals arm, one outcome per submission is recorded holding the
    readings that belong to it. For any other arm, each submission is graded
    via ``arms.run_llm_arm`` and the runs that produced a score feed
    ``stats.run_stats``; runs without a score are kept in ``runs`` but
    excluded from the statistics.
    """
    say = progress or (lambda _msg: None)

    # Fail fast on bad ground-truth input; it is cheap to validate up front.
    marks = (
        load_human_marks(config.human_marks)
        if config.human_marks is not None
        else None
    )

    submissions = discover_submissions(config.submissions)
    # Explicit encoding: do not let the platform locale decide how the rubric decodes.
    rubric_text = config.rubric.read_text(encoding="utf-8")
    result = ExperimentResult(
        name=config.name,
        max_score=config.max_score,
        submissions=[s.name for s in submissions],
    )

    for arm in config.arms:
        if arm.kind is ArmKind.SIGNALS:
            say(f"arm {arm.id}: assessment-lens over {len(submissions)} submissions")
            readings = arms.run_signals_arm(arm, config.rubric, config.submissions)
            # Group once instead of re-scanning every reading per submission.
            by_submission: dict[str, list] = {}
            for reading in readings:
                by_submission.setdefault(reading.submission_id, []).append(reading)
            for folder in submissions:
                result.outcomes.append(
                    ArmOutcome(
                        submission_id=folder.name,
                        arm_id=arm.id,
                        signals=by_submission.get(folder.name, []),
                    )
                )
        else:
            for folder in submissions:
                say(f"arm {arm.id}: {folder.name} x{arm.repetitions}")
                runs = arms.run_llm_arm(arm, folder.name, folder, rubric_text, config.max_score)
                scores = [r.score for r in runs if r.score is not None]
                result.outcomes.append(
                    ArmOutcome(
                        submission_id=folder.name,
                        arm_id=arm.id,
                        runs=runs,
                        stats=stats.run_stats(scores),
                    )
                )

    if marks is not None:
        result.agreements = _agreements(result, marks)

    return result
|
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
"""Core data models for assessment-bench.
|
|
2
|
+
|
|
3
|
+
The bench is the family's *measurement* layer: it runs the same cohort through
|
|
4
|
+
competing assessment arms and reports consistency and agreement. The design
|
|
5
|
+
rule that shapes these models: **the bench measures; it never marks.** An LLM
|
|
6
|
+
arm produces scores because that is the approach under test — the bench treats
|
|
7
|
+
those scores as data points, not as grades for students. Human marks, when
|
|
8
|
+
provided, are the ground truth everything is compared against.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
from enum import Enum
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
|
|
16
|
+
from pydantic import BaseModel, Field
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
# --- Experiment side (input) -------------------------------------------------
|
|
20
|
+
class ProviderName(str, Enum):
    # Closed set of LLM back-ends an arm can name. Mixing in ``str`` makes each
    # member compare equal to its lowercase value, so plain strings from a
    # config file map directly onto members.
    ANTHROPIC = "anthropic"
    OPENAI = "openai"
    OLLAMA = "ollama"
    OPENROUTER = "openrouter"
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class ProviderSpec(BaseModel):
    """Which LLM serves an arm. base_url covers Ollama / any OpenAI-compatible host."""

    # Back-end family; decides which SDK path ``providers.complete`` takes.
    provider: ProviderName
    # Model identifier forwarded unchanged to the SDK call.
    model: str
    # Endpoint override. Only the OpenAI-compatible path reads it, falling back
    # to the provider's entry in ``providers.DEFAULT_BASE_URLS`` when unset.
    base_url: str | None = None
    # Sampling temperature forwarded to the completion request.
    temperature: float = 0.1
    # Response token cap forwarded to the completion request.
    max_tokens: int = 1500
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class ArmKind(str, Enum):
    # The two families of assessment approach the bench can run; the experiment
    # runner branches on this to choose how an arm is executed.
    LLM = "llm"  # pure-LLM marking: submission + rubric -> score
    SIGNALS = "signals"  # assessment-lens observations: deterministic evidence values
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class ArmSpec(BaseModel):
    """One assessment approach under test."""

    # Identifier for the arm; it labels outcomes, CSV rows and agreement measures.
    id: str
    kind: ArmKind
    # Bounded to 1..50 by the field constraint.
    # NOTE(review): presumably the number of grading calls per submission for
    # an LLM arm — confirm against arms.run_llm_arm.
    repetitions: int = Field(default=1, ge=1, le=50)
    provider: ProviderSpec | None = None  # required for kind=llm

    def model_post_init(self, __context: object) -> None:
        # Cross-field rule that a per-field constraint cannot express: an LLM
        # arm is unusable without a provider; a signals arm needs none.
        provider_missing = self.provider is None
        if provider_missing and self.kind is ArmKind.LLM:
            raise ValueError(f"arm '{self.id}': kind=llm requires a provider")
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
class ExperimentConfig(BaseModel):
    """One experiment: a rubric, a cohort, the arms to compare.

    Paths are resolved relative to the config file's directory by ``load_config``.
    """

    # Experiment label, copied onto the result.
    name: str
    # Rubric file: its text goes to LLM arms, its path to the signals arm.
    rubric: Path
    # Location handed to ``discover_submissions`` to enumerate the cohort.
    submissions: Path
    # Score ceiling passed to LLM arms and recorded on the result.
    max_score: float = 100.0
    # When set, agreement with these marks is computed after the arms have run.
    human_marks: Path | None = Field(
        default=None,
        description="Optional CSV (submission_id,mark) of human ground-truth marks.",
    )
    # At least one arm is required.
    arms: list[ArmSpec] = Field(min_length=1)
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
# --- Result side (output) ----------------------------------------------------
|
|
73
|
+
class GradeRun(BaseModel):
    """One LLM grading call. score=None means extraction failed (kept, not hidden)."""

    submission_id: str
    arm_id: str
    # Written to the "run" column of runs.csv.
    # NOTE(review): presumably the position of this call within the arm's
    # repetitions — confirm against arms.run_llm_arm.
    run_index: int
    # None-scored runs stay in the record but are left out of the run statistics.
    score: float | None = None
    max_score: float
    # rationale and raw_response appear only in result.json; runs.csv omits them.
    rationale: str = ""
    raw_response: str = ""
    # Written to the "error" column of runs.csv; empty by default.
    error: str = ""
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
class RunStats(BaseModel):
    """Consistency statistics over one arm's repeated runs for one submission."""

    # Number of scores the statistics were computed from.
    n: int
    mean: float
    median: float
    # ``stats.run_stats`` reports 0.0 here when there is a single score.
    std_dev: float = Field(description="Sample standard deviation (n-1).")
    # std_dev / mean; ``stats.run_stats`` reports 0.0 when the mean is zero.
    coefficient_of_variation: float
    min: float
    max: float
    reliability: float = Field(
        description="1 - CV, floored at 0. A rough 'how repeatable was this arm' index."
    )
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
class SignalReading(BaseModel):
    """One deterministic evidence value from the signals arm."""

    submission_id: str
    criterion_id: str
    # Signal name; agreement measures for signals are keyed by this string.
    signal: str
    # Untyped on purpose: only bool/int/float values are correlated with human
    # marks, while every value is JSON-encoded into signals.csv.
    value: object | None = None
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
class ArmOutcome(BaseModel):
    """Everything one arm produced for one submission."""

    submission_id: str
    arm_id: str
    # Filled by LLM arms: every grading call, including those without a score.
    runs: list[GradeRun] = Field(default_factory=list)
    # Filled by LLM arms; stays None when no run produced a score.
    stats: RunStats | None = None
    # Filled by the signals arm: the readings belonging to this submission.
    signals: list[SignalReading] = Field(default_factory=list)
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
class Agreement(BaseModel):
    """Correlation between one measure and the human marks."""

    measure: str = Field(description="An arm id (mean score) or a dotted signal path.")
    # Number of submissions that had both a value for the measure and a human mark.
    n: int
    # None means the coefficient is undefined (for example zero variance on
    # one side); it is never substituted with 0.
    pearson: float | None = None
    spearman: float | None = None
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
class ExperimentResult(BaseModel):
    """The source-of-truth structured result for one experiment run."""

    name: str
    max_score: float
    # Names of the discovered submission folders.
    submissions: list[str] = Field(default_factory=list)
    # One entry per (arm, submission), appended in the order the arms were run.
    outcomes: list[ArmOutcome] = Field(default_factory=list)
    # Populated only when the experiment was configured with human marks.
    agreements: list[Agreement] = Field(default_factory=list)
|
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
"""Multi-provider LLM completion for the pure-LLM marking arm.
|
|
2
|
+
|
|
3
|
+
Provider registry adapted from image-analyser's caption providers: Anthropic via
|
|
4
|
+
its own SDK; OpenAI, OpenRouter, and Ollama through the openai SDK (the latter
|
|
5
|
+
two are OpenAI-compatible endpoints reached via base_url). Key resolution
|
|
6
|
+
follows the family pattern — env var first, then a minimal .env fallback.
|
|
7
|
+
|
|
8
|
+
Everything here is opt-in and degradable: callers catch ``LLMUnavailable`` and
|
|
9
|
+
the experiment records the failure instead of dying mid-cohort.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import os
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
|
|
17
|
+
from .exceptions import AssessmentBenchError
|
|
18
|
+
from .models import ProviderName, ProviderSpec
|
|
19
|
+
|
|
20
|
+
# Environment variable holding each provider's API key. ``None`` marks a
# provider that needs no key; ``get_api_key`` then returns a placeholder.
PROVIDER_KEYS = {
    ProviderName.ANTHROPIC: "ANTHROPIC_API_KEY",
    ProviderName.OPENAI: "OPENAI_API_KEY",
    ProviderName.OPENROUTER: "OPENROUTER_API_KEY",
    ProviderName.OLLAMA: None,  # local; no key
}

# Endpoint used when a ProviderSpec sets no base_url. Providers without an
# entry are passed base_url=None.
DEFAULT_BASE_URLS = {
    ProviderName.OPENROUTER: "https://openrouter.ai/api/v1",
    ProviderName.OLLAMA: "http://localhost:11434/v1",
}
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class LLMUnavailable(AssessmentBenchError):
    """An LLM completion cannot be attempted.

    Raised when the provider SDK cannot be imported (the ``[llm]`` extra is
    not installed) or when no API key is configured for the provider.
    """
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def _load_env_file() -> None:
    """Minimal .env loader (cwd upward) — no python-dotenv dependency.

    Walks from the current working directory towards the filesystem root and
    loads the first ``.env`` *file* it finds; directories further up are not
    consulted after that. Each ``KEY=value`` line is applied with
    ``os.environ.setdefault``, so variables already present in the environment
    always win over the file.

    Parsing rules:

    * blank lines, ``#`` comment lines and lines without ``=`` are ignored;
    * a leading ``export `` before the key is accepted and dropped;
    * whitespace around key and value is stripped, then surrounding single or
      double quote characters are stripped from the value;
    * lines with an empty key are skipped.

    The loader is best effort: an unreadable or undecodable file, or an entry
    the operating system rejects, is ignored rather than raised.
    """
    for parent in [Path.cwd(), *Path.cwd().parents]:
        env_file = parent / ".env"
        # is_file(), not exists(): a directory named ".env" must not end the search.
        if not env_file.is_file():
            continue
        try:
            text = env_file.read_text(encoding="utf-8")
        except (OSError, UnicodeDecodeError):
            # Nearest .env wins even when it cannot be read.
            return
        for line in text.splitlines():
            line = line.strip()
            if not line or line.startswith("#") or "=" not in line:
                continue
            key, value = line.split("=", 1)
            key = key.strip()
            if key.startswith("export "):
                key = key[len("export "):].strip()
            if not key:
                # An empty name is rejected by the OS; skip it so the
                # remaining entries are still loaded.
                continue
            try:
                os.environ.setdefault(key, value.strip().strip("\"'"))
            except (OSError, ValueError):
                # One bad entry must not discard the rest of the file.
                continue
        return
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def get_api_key(provider: ProviderName) -> str | None:
    """Return the API key to use for ``provider``, or a falsy value if none is set.

    A provider with no key variable in ``PROVIDER_KEYS`` gets the placeholder
    ``"unused"``. Otherwise the environment is consulted first; only when the
    variable is unset or empty is the ``.env`` fallback loaded and the
    environment read a second time.
    """
    variable = PROVIDER_KEYS.get(provider)
    if variable is None:
        return "unused"  # Ollama: openai SDK requires a non-empty key
    found = os.getenv(variable)
    if not found:
        _load_env_file()
        found = os.getenv(variable)
    return found
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def complete(prompt: str, *, system: str, spec: ProviderSpec) -> str:
    """Run one completion against the provider configured in ``spec``.

    Anthropic is served through its own SDK path; every other provider goes
    through the OpenAI-compatible path.

    Raises:
        LLMUnavailable: no API key could be resolved for the provider.
    """
    api_key = get_api_key(spec.provider)
    if api_key:
        if spec.provider is ProviderName.ANTHROPIC:
            backend = _complete_anthropic
        else:
            backend = _complete_openai_compatible
        return backend(prompt, system=system, spec=spec, api_key=api_key)

    raise LLMUnavailable(
        f"No API key for {spec.provider.value} — set {PROVIDER_KEYS[spec.provider]} (env or .env)."
    )
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def _complete_anthropic(prompt: str, *, system: str, spec: ProviderSpec, api_key: str) -> str:
    """Send one system + user exchange through the Anthropic SDK.

    The SDK is imported lazily so the package works without the ``[llm]``
    extra. The reply is the concatenated text of the response's ``text``
    content blocks, with surrounding whitespace stripped.

    Raises:
        LLMUnavailable: the ``anthropic`` package is not installed.
    """
    try:
        import anthropic
    except ImportError as exc:
        raise LLMUnavailable(
            "LLM arms need the [llm] extra: pip install 'assessment-bench[llm]'"
        ) from exc

    request = {
        "model": spec.model,
        "max_tokens": spec.max_tokens,
        "temperature": spec.temperature,
        "system": system,
        "messages": [{"role": "user", "content": prompt}],
    }
    response = anthropic.Anthropic(api_key=api_key).messages.create(**request)
    # Only "text" blocks carry prose; other block types are skipped.
    pieces = [block.text for block in response.content if block.type == "text"]
    return "".join(pieces).strip()
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def _complete_openai_compatible(
    prompt: str, *, system: str, spec: ProviderSpec, api_key: str
) -> str:
    """Send one system + user exchange through the openai SDK.

    The endpoint is ``spec.base_url`` when given, otherwise the provider's
    entry in ``DEFAULT_BASE_URLS`` (``None`` when it has none). The reply is
    the first choice's message content, stripped; missing content yields an
    empty string.

    Raises:
        LLMUnavailable: the ``openai`` package is not installed.
    """
    try:
        import openai
    except ImportError as exc:
        raise LLMUnavailable(
            "LLM arms need the [llm] extra: pip install 'assessment-bench[llm]'"
        ) from exc

    endpoint = spec.base_url or DEFAULT_BASE_URLS.get(spec.provider)
    client = openai.OpenAI(api_key=api_key, base_url=endpoint)
    conversation = [
        {"role": "system", "content": system},
        {"role": "user", "content": prompt},
    ]
    response = client.chat.completions.create(
        model=spec.model,
        max_tokens=spec.max_tokens,
        temperature=spec.temperature,
        messages=conversation,
    )
    reply = response.choices[0].message.content
    return (reply or "").strip()
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
"""Result writers: one JSON source of truth + flat CSVs for spreadsheet people."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import csv
|
|
6
|
+
import json
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
from .models import ExperimentResult
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def _write_csv(path: Path, header: list[str], rows) -> None:
    """Write ``header`` followed by ``rows`` to ``path`` as a UTF-8 CSV file."""
    with open(path, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(header)
        writer.writerows(rows)


def write_results(result: ExperimentResult, out_dir: Path) -> list[Path]:
    """Write the result files into ``out_dir`` and return the written paths.

    ``out_dir`` is created if needed. Always written, in this order:

    * ``result.json`` — the full result as indented JSON;
    * ``runs.csv`` — one row per grading run;
    * ``signals.csv`` — one row per signal reading, the value JSON-encoded
      (values JSON cannot represent fall back to ``str``).

    ``agreement.csv`` is written only when the result has agreements. An
    ``agreement.csv`` left behind by an earlier run is not removed.

    Every file is written as UTF-8 so non-ASCII submission ids or error text
    do not depend on the platform's default encoding.
    """
    out_dir.mkdir(parents=True, exist_ok=True)

    json_path = out_dir / "result.json"
    json_path.write_text(
        json.dumps(result.model_dump(mode="json"), indent=2), encoding="utf-8"
    )

    runs_path = out_dir / "runs.csv"
    _write_csv(
        runs_path,
        ["submission", "arm", "run", "score", "max_score", "error"],
        (
            [run.submission_id, run.arm_id, run.run_index, run.score, run.max_score, run.error]
            for outcome in result.outcomes
            for run in outcome.runs
        ),
    )

    signals_path = out_dir / "signals.csv"
    _write_csv(
        signals_path,
        ["submission", "arm", "criterion", "signal", "value"],
        (
            [
                reading.submission_id,
                outcome.arm_id,
                reading.criterion_id,
                reading.signal,
                json.dumps(reading.value, default=str),
            ]
            for outcome in result.outcomes
            for reading in outcome.signals
        ),
    )

    written: list[Path] = [json_path, runs_path, signals_path]

    if result.agreements:
        agreement_path = out_dir / "agreement.csv"
        _write_csv(
            agreement_path,
            ["measure", "n", "pearson", "spearman"],
            ([a.measure, a.n, a.pearson, a.spearman] for a in result.agreements),
        )
        written.append(agreement_path)

    return written
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
"""Consistency and agreement statistics.
|
|
2
|
+
|
|
3
|
+
The run-level statistics (mean/median/sample std-dev/CV/reliability) are a port
|
|
4
|
+
of the original AssessmentBench Rust aggregation engine — the best-validated
|
|
5
|
+
concept in that prototype. Agreement (Pearson/Spearman against human marks) is
|
|
6
|
+
new here: it is the bench's core research output. Pure stdlib, no numpy.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from .models import RunStats
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def run_stats(scores: list[float]) -> RunStats | None:
    """Summarise one arm's repeated scores for one submission.

    Returns ``None`` for an empty list. The standard deviation is the sample
    form (divisor ``n - 1``) and is 0.0 for a single score. The coefficient of
    variation is ``std_dev / mean``, taken as 0.0 when the mean is zero, and
    reliability is ``1 - CV`` floored at 0.
    """
    if not scores:
        return None

    count = len(scores)
    average = sum(scores) / count

    ranked = sorted(scores)
    mid = count // 2
    if count % 2:
        middle = ranked[mid]
    else:
        # Even count: the median is the mean of the two central values.
        middle = (ranked[mid - 1] + ranked[mid]) / 2.0

    spread = 0.0
    if count > 1:
        squared_error = sum((s - average) ** 2 for s in scores)
        spread = (squared_error / (count - 1)) ** 0.5

    variation = spread / average if average else 0.0

    return RunStats(
        n=count,
        mean=average,
        median=middle,
        std_dev=spread,
        coefficient_of_variation=variation,
        min=ranked[0],
        max=ranked[-1],
        reliability=max(0.0, 1.0 - variation),
    )
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def pearson(xs: list[float], ys: list[float]) -> float | None:
    """Pearson r. None when undefined (n<2 or zero variance) — never faked as 0.

    ``None`` is also returned when the two sequences differ in length.

    The result is clamped to ``[-1.0, 1.0]``: the coefficient cannot leave
    that interval mathematically, but floating-point rounding of the sums and
    square roots can push a perfect correlation fractionally past it. A NaN
    result (from NaN input) is passed through unchanged.
    """
    n = len(xs)
    if n != len(ys) or n < 2:
        return None
    mx = sum(xs) / n
    my = sum(ys) / n
    sxx = sum((x - mx) ** 2 for x in xs)
    syy = sum((y - my) ** 2 for y in ys)
    if sxx == 0 or syy == 0:
        return None
    sxy = sum((x - mx) * (y - my) for x, y in zip(xs, ys))
    r = sxy / (sxx**0.5 * syy**0.5)
    # Explicit comparisons rather than min()/max(), so NaN is not turned
    # into a bound.
    if r > 1.0:
        return 1.0
    if r < -1.0:
        return -1.0
    return r
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def _ranks(values: list[float]) -> list[float]:
    """Return 1-based ranks for ``values``; tied values share their mean rank.

    The result is aligned with the input: element ``i`` is the rank of
    ``values[i]``.
    """
    order = sorted(range(len(values)), key=values.__getitem__)
    total = len(order)
    ranks = [0.0] * total

    start = 0
    while start < total:
        anchor = values[order[start]]
        # Extend the run over every following entry equal to the run's first value.
        stop = start + 1
        while stop < total and values[order[stop]] == anchor:
            stop += 1
        # Sorted positions start..stop-1 (0-based) share the mean 1-based rank.
        shared = (start + stop - 1) / 2.0 + 1.0
        for original_index in order[start:stop]:
            ranks[original_index] = shared
        start = stop
    return ranks
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def spearman(xs: list[float], ys: list[float]) -> float | None:
    """Spearman rho: the Pearson coefficient of the average ranks.

    Returns ``None`` when the inputs differ in length, hold fewer than two
    points, or when the Pearson coefficient of the ranks is itself undefined.
    """
    comparable = len(xs) == len(ys) and len(xs) >= 2
    return pearson(_ranks(xs), _ranks(ys)) if comparable else None
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
"""Score extraction — the riskiest parsing in the bench (the Rust prototype never tested its)."""
|
|
2
|
+
|
|
3
|
+
import pytest
|
|
4
|
+
|
|
5
|
+
from assessment_bench.arms import extract_score, grade_prompt
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def test_strict_score_line():
    """A closing ``SCORE: x/max`` line is read as the score."""
    response = "Good work overall.\nSCORE: 78/100"
    score, _ = extract_score(response, 100.0)
    assert score == pytest.approx(78.0)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def test_last_score_line_wins():
    """When several SCORE markers appear, the final one is the verdict."""
    response = "If perfect this would be SCORE: 100/100, but...\nSCORE: 62.5/100"
    score, _ = extract_score(response, 100.0)
    assert score == pytest.approx(62.5)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def test_scaled_to_max_score():
    """A score on a different denominator is rescaled to max_score."""
    response = "SCORE: 7/10"
    score, _ = extract_score(response, 100.0)
    assert score == pytest.approx(70.0)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def test_fallback_out_of_phrasing():
    """Without a SCORE line, an "N out of M" phrase is used and rescaled."""
    response = "I would award 41 out of 50 for this."
    score, _ = extract_score(response, 100.0)
    assert score == pytest.approx(82.0)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def test_no_score_returns_none():
    """Prose with no score in it yields None rather than a guessed number."""
    response = "This is thoughtful work with clear structure."
    score, _ = extract_score(response, 100.0)
    assert score is None
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def test_grade_prompt_carries_parts():
    """The prompt embeds the rubric, the submission and the score format."""
    prompt = grade_prompt("RUB", "SUB", 50.0)
    for fragment in ("RUB", "SUB", "SCORE: x/50"):
        assert fragment in prompt
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
"""Config models: the experiment YAML is the bench's central contract."""
|
|
2
|
+
|
|
3
|
+
import pytest
|
|
4
|
+
from pydantic import ValidationError
|
|
5
|
+
|
|
6
|
+
from assessment_bench.models import ArmKind, ArmSpec, ExperimentConfig
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def test_llm_arm_requires_provider():
    """An LLM arm declared without a provider is rejected at construction."""
    accepted_errors = (ValidationError, ValueError)
    with pytest.raises(accepted_errors):
        ArmSpec(id="bad", kind=ArmKind.LLM)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def test_signals_arm_needs_no_provider():
    """A signals arm builds without a provider and defaults to one repetition."""
    spec = ArmSpec(id="signals", kind=ArmKind.SIGNALS)
    assert spec.repetitions == 1
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def test_experiment_config_parses():
    """A minimal two-arm mapping validates and picks up the documented defaults."""
    llm_arm = {
        "id": "llm",
        "kind": "llm",
        "repetitions": 3,
        "provider": {"provider": "ollama", "model": "llama3.1"},
    }
    signals_arm = {"id": "signals", "kind": "signals"}
    raw = {
        "name": "t",
        "rubric": "rubric.yaml",
        "submissions": "subs/",
        "arms": [llm_arm, signals_arm],
    }

    config = ExperimentConfig.model_validate(raw)

    assert config.max_score == 100.0
    assert config.arms[0].provider.base_url is None
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def test_experiment_config_requires_an_arm():
    """An experiment with an empty arm list fails validation."""
    raw = {"name": "t", "rubric": "r.yaml", "submissions": "s/", "arms": []}
    with pytest.raises(ValidationError):
        ExperimentConfig.model_validate(raw)
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
"""Stats: the ported Rust aggregation math + the new agreement correlations."""
|
|
2
|
+
|
|
3
|
+
import pytest
|
|
4
|
+
|
|
5
|
+
from assessment_bench.stats import pearson, run_stats, spearman
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def test_run_stats_basic():
    """Three evenly spaced scores give the expected summary figures."""
    summary = run_stats([80.0, 85.0, 90.0])
    expected_cv = 5.0 / 85.0

    assert summary.n == 3
    assert summary.mean == pytest.approx(85.0)
    assert summary.median == pytest.approx(85.0)
    assert summary.std_dev == pytest.approx(5.0)  # sample std-dev (n-1)
    assert summary.coefficient_of_variation == pytest.approx(expected_cv)
    assert summary.min == 80.0
    assert summary.max == 90.0
    assert summary.reliability == pytest.approx(1.0 - expected_cv)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def test_run_stats_even_median_and_single():
    """Covers the even-count median, a lone score, and the empty input."""
    even = run_stats([1.0, 2.0, 3.0, 4.0])
    assert even.median == pytest.approx(2.5)

    lone = run_stats([70.0])
    assert lone.std_dev == 0.0
    assert lone.reliability == 1.0

    assert run_stats([]) is None
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def test_pearson_perfect_and_inverse():
    """Perfect linear relations give r = +1 and r = -1."""
    base = [1, 2, 3]
    assert pearson(base, [10, 20, 30]) == pytest.approx(1.0)
    assert pearson(base, [30, 20, 10]) == pytest.approx(-1.0)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def test_pearson_undefined_not_faked():
    """An undefined correlation is reported as None, not as a number."""
    constant = [1, 1, 1]
    assert pearson(constant, [1, 2, 3]) is None  # zero variance
    assert pearson([1], [1]) is None  # n < 2
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def test_spearman_monotonic_nonlinear():
    """A monotonic but nonlinear relation still has rho = 1."""
    # Each y is ten times the previous one: strictly increasing, far from linear.
    inputs = [1.0, 2.0, 3.0, 4.0]
    outputs = [1.0, 10.0, 100.0, 1000.0]
    assert spearman(inputs, outputs) == pytest.approx(1.0)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def test_spearman_ties_average_ranks():
    """Identical sequences containing a tie still correlate perfectly."""
    tied = [1.0, 1.0, 2.0]
    assert spearman(tied, [1.0, 1.0, 2.0]) == pytest.approx(1.0)
|