assessment-bench 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,10 @@
1
+ __pycache__/
2
+ *.py[cod]
3
+ .venv/
4
+ dist/
5
+ build/
6
+ *.egg-info/
7
+ .pytest_cache/
8
+ .coverage
9
+ bench-out/
10
+ .env
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Michael Borck
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,136 @@
1
+ Metadata-Version: 2.4
2
+ Name: assessment-bench
3
+ Version: 0.1.0
4
+ Summary: Benchmark assessment approaches: pure-LLM marking vs the family's signal-based observations, with repeated runs and agreement statistics.
5
+ Project-URL: Homepage, https://github.com/michael-borck/assessment-bench
6
+ Author: Michael Borck
7
+ License: MIT License
8
+
9
+ Copyright (c) 2026 Michael Borck
10
+
11
+ Permission is hereby granted, free of charge, to any person obtaining a copy
12
+ of this software and associated documentation files (the "Software"), to deal
13
+ in the Software without restriction, including without limitation the rights
14
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
15
+ copies of the Software, and to permit persons to whom the Software is
16
+ furnished to do so, subject to the following conditions:
17
+
18
+ The above copyright notice and this permission notice shall be included in all
19
+ copies or substantial portions of the Software.
20
+
21
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
22
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
24
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
25
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
26
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
27
+ SOFTWARE.
28
+ License-File: LICENSE
29
+ Classifier: Development Status :: 3 - Alpha
30
+ Classifier: License :: OSI Approved :: MIT License
31
+ Classifier: Programming Language :: Python :: 3.11
32
+ Classifier: Programming Language :: Python :: 3.12
33
+ Requires-Python: >=3.11
34
+ Requires-Dist: assessment-lens>=0.2.0
35
+ Requires-Dist: pydantic>=2.5.0
36
+ Requires-Dist: pyyaml>=6.0.0
37
+ Requires-Dist: rich>=13.7.0
38
+ Provides-Extra: analysers
39
+ Requires-Dist: assessment-lens[analysers]>=0.2.0; extra == 'analysers'
40
+ Provides-Extra: dev
41
+ Requires-Dist: pytest-cov>=4.0.0; extra == 'dev'
42
+ Requires-Dist: pytest>=8.0.0; extra == 'dev'
43
+ Provides-Extra: llm
44
+ Requires-Dist: anthropic>=0.40.0; extra == 'llm'
45
+ Requires-Dist: openai>=1.12.0; extra == 'llm'
46
+ Description-Content-Type: text/markdown
47
+
48
+ # assessment-bench
49
+
50
+ Part of the [lens family](https://github.com/michael-borck/lens-analysers).
51
+
52
+ [![Python 3.11+](https://img.shields.io/badge/python-3.11+-blue.svg)](https://www.python.org/downloads/)
53
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
54
+
55
+ **Benchmark assessment approaches.** Run one cohort through competing
56
+ assessment arms — pure-LLM marking (the baseline) and the family's
57
+ signal-based observations (`assessment-lens`) — with repeated runs,
58
+ consistency statistics, and agreement against human marks.
59
+ **The bench measures; it never marks.**
60
+
61
+ > `assessment-bench` is a *bench* (a measurement product), not an `-analyser`
62
+ > and not a marking tool. It exists to answer research questions like: *how
63
+ > consistent is LLM marking across repeated runs and providers?* and *which
64
+ > deterministic signals actually track human judgement?*
65
+
66
+ ## What it does
67
+
68
+ ```
69
+ experiment.yaml (rubric + cohort + arms)
70
+ ├─ llm arm(s) : submission + rubric → provider → score × repetitions
71
+ ├─ signals arm : assessment-lens → evidence values (deterministic, once)
72
+ └─ human marks : optional ground-truth CSV
73
+
74
+ result.json + runs.csv + signals.csv + agreement.csv
75
+ • per-submission consistency: mean / median / std-dev / CV / reliability
76
+ • agreement: Pearson & Spearman of every arm mean and every numeric signal
77
+ against the human marks
78
+ ```
79
+
80
+ ## Install
81
+
82
+ ```bash
83
+ # from source (family layout)
84
+ uv venv && source .venv/bin/activate
85
+ uv pip install -e ".[dev]"
86
+
87
+ # the signals arm needs the analyser stack (bundle-analyser CLI on PATH):
88
+ uv pip install -e ".[analysers]"
89
+
90
+ # LLM arms (Anthropic, OpenAI, Ollama, OpenRouter):
91
+ uv pip install -e ".[llm]" # + export ANTHROPIC_API_KEY / OPENAI_API_KEY / ...
92
+ ```
93
+
94
+ ## Quick start
95
+
96
+ ```bash
97
+ assessment-bench init experiment.yaml # commented example config
98
+ # edit: point at your rubric.yaml + submissions/, choose arms
99
+ assessment-bench run experiment.yaml -o out/
100
+ ```
101
+
102
+ LLM arms specify provider **and** model per arm — comparing
103
+ `claude-haiku-4-5` vs `gpt-4o-mini` vs a local `llama3.1` via Ollama is just
104
+ three arms in one config.
105
+
106
+ ## Relationship to the family
107
+
108
+ - **Analysers** generate deterministic signals (assessment-agnostic).
109
+ - **assessment-lens** maps signals to a rubric as observations — never scores.
110
+ - **assessment-bench** measures both approaches against human judgement. The
111
+ LLM arm produces scores *because that is the approach under test*; the bench
112
+ treats them as data points, not grades for students.
113
+
114
+ ## Status
115
+
116
+ **v0.1 scaffold.** Working today:
117
+
118
+ - ✅ Experiment config (YAML) → cohort discovery → arms → structured results
119
+ - ✅ LLM arm: multi-provider (anthropic / openai / ollama / openrouter), repeated
120
+ runs, strict `SCORE: x/y` extraction with scaled fallback
121
+ - ✅ Signals arm: one `assessment-lens` pass; raw evidence values consumed
122
+ (not the presence-based coverage)
123
+ - ✅ Consistency stats (ported from the original Rust prototype) + Pearson/Spearman
124
+ agreement vs human marks
125
+ - 📋 Hybrid arm (LLM marking with analyser signals in context) — next
126
+ - 📋 HTTP service + desktop shell for non-technical researchers — planned
127
+
128
+ ## Development
129
+
130
+ ```bash
131
+ pytest -v
132
+ ```
133
+
134
+ ## License
135
+
136
+ MIT — see [LICENSE](LICENSE).
@@ -0,0 +1,89 @@
1
+ # assessment-bench
2
+
3
+ Part of the [lens family](https://github.com/michael-borck/lens-analysers).
4
+
5
+ [![Python 3.11+](https://img.shields.io/badge/python-3.11+-blue.svg)](https://www.python.org/downloads/)
6
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
7
+
8
+ **Benchmark assessment approaches.** Run one cohort through competing
9
+ assessment arms — pure-LLM marking (the baseline) and the family's
10
+ signal-based observations (`assessment-lens`) — with repeated runs,
11
+ consistency statistics, and agreement against human marks.
12
+ **The bench measures; it never marks.**
13
+
14
+ > `assessment-bench` is a *bench* (a measurement product), not an `-analyser`
15
+ > and not a marking tool. It exists to answer research questions like: *how
16
+ > consistent is LLM marking across repeated runs and providers?* and *which
17
+ > deterministic signals actually track human judgement?*
18
+
19
+ ## What it does
20
+
21
+ ```
22
+ experiment.yaml (rubric + cohort + arms)
23
+ ├─ llm arm(s) : submission + rubric → provider → score × repetitions
24
+ ├─ signals arm : assessment-lens → evidence values (deterministic, once)
25
+ └─ human marks : optional ground-truth CSV
26
+
27
+ result.json + runs.csv + signals.csv + agreement.csv
28
+ • per-submission consistency: mean / median / std-dev / CV / reliability
29
+ • agreement: Pearson & Spearman of every arm mean and every numeric signal
30
+ against the human marks
31
+ ```
32
+
33
+ ## Install
34
+
35
+ ```bash
36
+ # from source (family layout)
37
+ uv venv && source .venv/bin/activate
38
+ uv pip install -e ".[dev]"
39
+
40
+ # the signals arm needs the analyser stack (bundle-analyser CLI on PATH):
41
+ uv pip install -e ".[analysers]"
42
+
43
+ # LLM arms (Anthropic, OpenAI, Ollama, OpenRouter):
44
+ uv pip install -e ".[llm]" # + export ANTHROPIC_API_KEY / OPENAI_API_KEY / ...
45
+ ```
46
+
47
+ ## Quick start
48
+
49
+ ```bash
50
+ assessment-bench init experiment.yaml # commented example config
51
+ # edit: point at your rubric.yaml + submissions/, choose arms
52
+ assessment-bench run experiment.yaml -o out/
53
+ ```
54
+
55
+ LLM arms specify provider **and** model per arm — comparing
56
+ `claude-haiku-4-5` vs `gpt-4o-mini` vs a local `llama3.1` via Ollama is just
57
+ three arms in one config.
58
+
59
+ ## Relationship to the family
60
+
61
+ - **Analysers** generate deterministic signals (assessment-agnostic).
62
+ - **assessment-lens** maps signals to a rubric as observations — never scores.
63
+ - **assessment-bench** measures both approaches against human judgement. The
64
+ LLM arm produces scores *because that is the approach under test*; the bench
65
+ treats them as data points, not grades for students.
66
+
67
+ ## Status
68
+
69
+ **v0.1 scaffold.** Working today:
70
+
71
+ - ✅ Experiment config (YAML) → cohort discovery → arms → structured results
72
+ - ✅ LLM arm: multi-provider (anthropic / openai / ollama / openrouter), repeated
73
+ runs, strict `SCORE: x/y` extraction with scaled fallback
74
+ - ✅ Signals arm: one `assessment-lens` pass; raw evidence values consumed
75
+ (not the presence-based coverage)
76
+ - ✅ Consistency stats (ported from the original Rust prototype) + Pearson/Spearman
77
+ agreement vs human marks
78
+ - 📋 Hybrid arm (LLM marking with analyser signals in context) — next
79
+ - 📋 HTTP service + desktop shell for non-technical researchers — planned
80
+
81
+ ## Development
82
+
83
+ ```bash
84
+ pytest -v
85
+ ```
86
+
87
+ ## License
88
+
89
+ MIT — see [LICENSE](LICENSE).
@@ -0,0 +1,59 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "assessment-bench"
7
+ version = "0.1.0"
8
+ description = "Benchmark assessment approaches: pure-LLM marking vs the family's signal-based observations, with repeated runs and agreement statistics."
9
+ readme = "README.md"
10
+ license = { file = "LICENSE" }
11
+ requires-python = ">=3.11"
12
+ authors = [{ name = "Michael Borck" }]
13
+ classifiers = [
14
+ "Development Status :: 3 - Alpha",
15
+ "License :: OSI Approved :: MIT License",
16
+ "Programming Language :: Python :: 3.11",
17
+ "Programming Language :: Python :: 3.12",
18
+ ]
19
+ dependencies = [
20
+ "assessment-lens>=0.2.0",
21
+ "pydantic>=2.5.0",
22
+ "pyyaml>=6.0.0",
23
+ "rich>=13.7.0",
24
+ ]
25
+
26
+ [project.optional-dependencies]
27
+ # The pure-LLM marking arm. anthropic for Anthropic; openai also covers
28
+ # Ollama / OpenRouter / any OpenAI-compatible endpoint via base_url.
29
+ llm = [
30
+ "anthropic>=0.40.0",
31
+ "openai>=1.12.0",
32
+ ]
33
+ # Pull the analyser stack into the same env so the signals arm runs for real
34
+ # (assessment-lens shells out to the bundle-analyser CLI).
35
+ analysers = [
36
+ "assessment-lens[analysers]>=0.2.0",
37
+ ]
38
+ dev = [
39
+ "pytest>=8.0.0",
40
+ "pytest-cov>=4.0.0",
41
+ ]
42
+
43
+ [project.scripts]
44
+ assessment-bench = "assessment_bench.cli:main"
45
+
46
+ # Local dev: resolve family members from sibling checkouts. uv strips this from
47
+ # the published wheel, which keeps the plain PyPI pins.
48
+ [tool.uv.sources]
49
+ assessment-lens = { path = "../assessment-lens", editable = true }
50
+
51
+ [project.urls]
52
+ Homepage = "https://github.com/michael-borck/assessment-bench"
53
+
54
+ [tool.hatch.build.targets.wheel]
55
+ packages = ["src/assessment_bench"]
56
+
57
+ [tool.pytest.ini_options]
58
+ testpaths = ["tests"]
59
+ pythonpath = ["src"]
@@ -0,0 +1,45 @@
1
+ """assessment-bench — benchmark assessment approaches for the lens family.
2
+
3
+ Runs one cohort through competing assessment arms (pure-LLM marking as the
4
+ baseline; assessment-lens signal observations as the approach under study),
5
+ with repeated runs, consistency statistics, and agreement against human marks.
6
+ **The bench measures; it never marks.**
7
+ """
8
+
9
+ from .exceptions import AssessmentBenchError
10
+ from .experiment import load_config, run_experiment
11
+ from .models import (
12
+ Agreement,
13
+ ArmKind,
14
+ ArmOutcome,
15
+ ArmSpec,
16
+ ExperimentConfig,
17
+ ExperimentResult,
18
+ GradeRun,
19
+ ProviderName,
20
+ ProviderSpec,
21
+ RunStats,
22
+ SignalReading,
23
+ )
24
+ from .report import write_results
25
+
26
+ __version__ = "0.1.0"
27
+
28
+ __all__ = [
29
+ "Agreement",
30
+ "ArmKind",
31
+ "ArmOutcome",
32
+ "ArmSpec",
33
+ "AssessmentBenchError",
34
+ "ExperimentConfig",
35
+ "ExperimentResult",
36
+ "GradeRun",
37
+ "ProviderName",
38
+ "ProviderSpec",
39
+ "RunStats",
40
+ "SignalReading",
41
+ "__version__",
42
+ "load_config",
43
+ "run_experiment",
44
+ "write_results",
45
+ ]
@@ -0,0 +1,122 @@
1
+ """The assessment arms under test.
2
+
3
+ LLM arm — the approach the family deliberately moved away from (an LLM reading
4
+ a submission and emitting a mark), kept here as the benchmark baseline. Prompt
5
+ shape ports the original Rust prototype's Tier-1 design; the score comes from a
6
+ strict trailing ``SCORE: x/y`` line with a permissive regex fallback.
7
+
8
+ Signals arm — assessment-lens observations. Deterministic, so it runs once per
9
+ cohort regardless of repetitions; the bench consumes raw evidence values (not
10
+ the presence-based coverage column) and correlates each numeric signal with the
11
+ human marks.
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import re
17
+ from pathlib import Path
18
+
19
+ from assessment_lens.assess import assess
20
+ from assessment_lens.rubric import load_rubric
21
+
22
+ from . import providers
23
+ from .models import ArmSpec, GradeRun, SignalReading
24
+
25
+ # Submission text for the LLM arm. Plain-text formats are read directly;
26
+ # .pdf/.docx/.pptx go through the family's canonical extractor when installed.
27
+ _PLAIN_TEXT_SUFFIXES = {".md", ".txt", ".py", ".js", ".ts", ".r", ".sql", ".csv"}
28
+ _EXTRACTOR_SUFFIXES = {".pdf", ".docx", ".pptx"}
29
+
30
+ _GRADE_SYSTEM = (
31
+ "You are an experienced university marker. Grade the submission against the "
32
+ "rubric. Be consistent and justify briefly. End your response with exactly "
33
+ "one line in the form 'SCORE: <number>/<max>' and nothing after it."
34
+ )
35
+
36
+ _SCORE_RE = re.compile(r"SCORE:\s*(\d+(?:\.\d+)?)\s*/\s*(\d+(?:\.\d+)?)", re.IGNORECASE)
37
+ _FALLBACK_RE = re.compile(r"(\d+(?:\.\d+)?)\s*(?:/|out of)\s*(\d+(?:\.\d+)?)")
38
+
39
+
40
def read_submission_text(folder: Path) -> str:
    """Concatenate the readable artefacts in one submission folder.

    Files are visited recursively in sorted path order. Every readable file
    contributes a ``--- <file name> ---`` header followed by its text, and the
    sections are joined with a blank line. Files whose suffix is in neither
    suffix set are ignored.

    Args:
        folder: Root folder of one submission.

    Returns:
        The combined text; an empty string when no file qualified.
    """
    parts: list[str] = []
    for path in sorted(folder.rglob("*")):
        if not path.is_file():
            continue
        suffix = path.suffix.lower()
        if suffix in _PLAIN_TEXT_SUFFIXES:
            # Decode explicitly as UTF-8 so the prompt text does not depend on
            # the machine's locale encoding; undecodable bytes are replaced
            # rather than raising.
            text = path.read_text(encoding="utf-8", errors="replace")
            parts.append(f"--- {path.name} ---\n{text}")
        elif suffix in _EXTRACTOR_SUFFIXES:
            try:
                # Optional dependency: imported lazily so the bench still works
                # without the extractor installed.
                from document_analyser import extract_text

                parts.append(f"--- {path.name} ---\n{extract_text(path)}")
            except ImportError:
                parts.append(f"--- {path.name} --- (skipped: install the [analysers] extra to extract {suffix})")
    return "\n\n".join(parts)
57
+
58
+
59
def extract_score(response: str, max_score: float) -> tuple[float | None, float]:
    """Return ``(score, max_score)`` parsed from an LLM response.

    The strict ``SCORE: x/y`` pattern is tried first; only when it finds
    nothing is the permissive ``x/y`` / ``x out of y`` pattern used. The last
    match in the response wins. When the response used a non-zero denominator
    other than ``max_score``, the score is rescaled onto ``max_score``.

    Returns:
        ``(None, max_score)`` when no score-like text is present, otherwise the
        (possibly rescaled) score paired with ``max_score``.
    """
    found = _SCORE_RE.findall(response)
    if not found:
        found = _FALLBACK_RE.findall(response)
    if not found:
        return None, max_score

    numerator_text, denominator_text = found[-1]
    score = float(numerator_text)
    denominator = float(denominator_text)
    # A zero denominator cannot be rescaled; the raw number is kept as-is.
    if denominator != 0 and denominator != max_score:
        score = score / denominator * max_score
    return score, max_score
68
+
69
+
70
def grade_prompt(rubric_text: str, submission_text: str, max_score: float) -> str:
    """Build the user prompt for one grading call.

    The prompt has three sections separated by blank lines: the rubric, the
    submission, and the grading instruction that names the maximum score
    (formatted with ``:g``, so ``100.0`` renders as ``100``).
    """
    limit = f"{max_score:g}"
    instruction = (
        f"Grade the submission against the rubric out of {limit}. "
        f"Give 2-3 sentences of rationale, then the final 'SCORE: x/{limit}' line."
    )
    sections = [
        f"RUBRIC:\n{rubric_text}",
        f"SUBMISSION:\n{submission_text}",
        instruction,
    ]
    return "\n\n".join(sections)
77
+
78
+
79
def run_llm_arm(
    arm: ArmSpec,
    submission_id: str,
    submission_folder: Path,
    rubric_text: str,
    max_score: float,
) -> list[GradeRun]:
    """Run all repetitions of one LLM arm for one submission.

    Failures are recorded on the returned runs, not raised. That covers a
    failing provider call, a response with no extractable score, and a
    submission that cannot be read at all.

    Args:
        arm: The LLM arm to run; ``arm.provider`` must be set.
        submission_id: Identifier stored on every returned run.
        submission_folder: Folder whose readable files form the submission text.
        rubric_text: Rubric text embedded verbatim in the prompt.
        max_score: Maximum score the model is asked to mark out of.

    Returns:
        One ``GradeRun`` per repetition, in run-index order.
    """
    assert arm.provider is not None  # validated by ArmSpec

    def new_run(index: int) -> GradeRun:
        return GradeRun(submission_id=submission_id, arm_id=arm.id, run_index=index, max_score=max_score)

    try:
        text = read_submission_text(submission_folder)
    except Exception as exc:  # an unreadable submission must not kill a cohort run
        # No prompt can be built, so every repetition is recorded as failed
        # with the same cause; the result keeps its usual shape.
        failed: list[GradeRun] = []
        for i in range(arm.repetitions):
            run = new_run(i)
            run.error = f"could not read submission: {exc}"
            failed.append(run)
        return failed

    prompt = grade_prompt(rubric_text, text, max_score)
    runs: list[GradeRun] = []
    for i in range(arm.repetitions):
        run = new_run(i)
        try:
            response = providers.complete(prompt, system=_GRADE_SYSTEM, spec=arm.provider)
            run.raw_response = response
            run.score, _ = extract_score(response, max_score)
            # The rationale is the response with any strict SCORE line removed.
            run.rationale = _SCORE_RE.sub("", response).strip()
            if run.score is None:
                run.error = "no SCORE line found in response"
        except Exception as exc:  # one bad call must not kill a cohort run
            run.error = str(exc)
        runs.append(run)
    return runs
104
+
105
+
106
def run_signals_arm(arm: ArmSpec, rubric_path: Path, submissions_dir: Path) -> list[SignalReading]:
    """Run one assessment-lens pass over the cohort and flatten its evidence.

    The rubric is loaded from ``rubric_path`` and assessed against
    ``submissions_dir`` once. Every evidence item of every observation of every
    submission becomes one ``SignalReading``, in the order assessment-lens
    returned them. ``arm`` is accepted for signature symmetry with the LLM arm
    and is not consulted here.
    """
    assessment = assess(load_rubric(rubric_path), submissions_dir)
    return [
        SignalReading(
            submission_id=submission.submission_id,
            criterion_id=observation.criterion_id,
            signal=evidence.signal,
            value=evidence.value,
        )
        for submission in assessment.submissions
        for observation in submission.observations
        for evidence in observation.evidence
    ]
@@ -0,0 +1,70 @@
1
+ """assessment-bench CLI.
2
+
3
+ assessment-bench run experiment.yaml -o out/
4
+ assessment-bench init my-experiment.yaml
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import argparse
10
+ import shutil
11
+ import sys
12
+ from pathlib import Path
13
+
14
+ from rich.console import Console
15
+
16
+ from .exceptions import AssessmentBenchError
17
+ from .experiment import load_config, run_experiment
18
+ from .report import write_results
19
+
20
+ console = Console()
21
+
22
+ _EXAMPLE = Path(__file__).parent / "data" / "example-experiment.yaml"
23
+
24
+
25
def main(argv: list[str] | None = None) -> int:
    """Entry point for the ``assessment-bench`` command.

    Sub-commands:
        ``init [path]``: copy the bundled example config to ``path`` and refuse
        to overwrite an existing file.
        ``run config [-o out]``: load the config, run the experiment, write the
        result files and print the agreement table when there is one.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read ``sys.argv``.

    Returns:
        Process exit code: 0 on success, 1 on a refused overwrite or an
        ``AssessmentBenchError``.
    """
    parser = argparse.ArgumentParser(
        prog="assessment-bench",
        description="Benchmark assessment approaches over one cohort: pure-LLM marking vs signal-based observation.",
    )
    sub = parser.add_subparsers(dest="command", required=True)

    run_p = sub.add_parser("run", help="Run an experiment config over its cohort.")
    run_p.add_argument("config", type=Path, help="Experiment YAML.")
    run_p.add_argument("-o", "--out", type=Path, default=Path("bench-out"), help="Output folder.")

    init_p = sub.add_parser("init", help="Write a commented example experiment config.")
    init_p.add_argument("path", type=Path, nargs="?", default=Path("experiment.yaml"))

    args = parser.parse_args(argv)

    try:
        if args.command == "init":
            if args.path.exists():
                console.print(f"[red]refusing to overwrite {args.path}[/red]")
                return 1
            shutil.copy(_EXAMPLE, args.path)
            console.print(f"✓ wrote {args.path} — edit it, then: assessment-bench run {args.path}")
            return 0

        config = load_config(args.config)
        console.print(
            f"[bold]{config.name}[/bold] — {len(config.arms)} arms, max score {config.max_score:g}"
        )
        result = run_experiment(config, progress=lambda msg: console.print(f" {msg}"))
        written = write_results(result, args.out)
        console.print(f"✓ {len(result.submissions)} submissions → " + ", ".join(str(p) for p in written))
        if result.agreements:
            console.print("[bold]Agreement with human marks:[/bold]")

            def strongest_first(a) -> float:
                # Undefined correlations sort last (-2 is below any valid r).
                # Test against None explicitly: a real r of 0.0 is falsy and
                # must not be mistaken for "undefined".
                return -(a.pearson if a.pearson is not None else -2)

            for a in sorted(result.agreements, key=strongest_first):
                if a.pearson is not None and a.spearman is not None:
                    console.print(f" {a.measure}: r={a.pearson:.3f} rho={a.spearman:.3f} (n={a.n})")
                else:
                    console.print(f" {a.measure}: undefined (n={a.n})")
        return 0
    except AssessmentBenchError as exc:
        console.print(f"[red]error:[/red] {exc}")
        return 1
67
+
68
+
69
+ if __name__ == "__main__":
70
+ sys.exit(main())
@@ -0,0 +1,27 @@
1
+ # assessment-bench experiment config.
2
+ # Paths are relative to this file. One subfolder under `submissions` = one submission.
3
+ name: "My experiment"
4
+ rubric: rubric.yaml # assessment-lens structured rubric (criteria + pinned signals)
5
+ submissions: submissions/
6
+ max_score: 100
7
+ # Optional ground truth — enables agreement statistics (Pearson/Spearman):
8
+ # human_marks: marks.csv # CSV with header: submission_id,mark
9
+ arms:
10
+ # The baseline under test: an LLM reads the submission + rubric and emits a mark.
11
+ - id: llm-haiku
12
+ kind: llm
13
+ repetitions: 3 # repeated runs -> consistency stats (mean/CV/reliability)
14
+ provider:
15
+ provider: anthropic # anthropic | openai | ollama | openrouter
16
+ model: claude-haiku-4-5
17
+ temperature: 0.1
18
+ # Local model via Ollama (any OpenAI-compatible endpoint works via base_url):
19
+ # - id: llm-local
20
+ # kind: llm
21
+ # repetitions: 3
22
+ # provider:
23
+ # provider: ollama
24
+ # model: llama3.1
25
+ # Signal-based observation via assessment-lens (deterministic; runs once):
26
+ - id: signals
27
+ kind: signals
@@ -0,0 +1,5 @@
1
+ """assessment-bench exception hierarchy."""
2
+
3
+
4
class AssessmentBenchError(Exception):
    """Root of the bench's own exception hierarchy.

    Every error the bench raises deliberately derives from this class, so a
    caller can catch this one type to handle all of them.
    """
@@ -0,0 +1,141 @@
1
+ """Experiment runner — the bench's orchestration spine.
2
+
3
+ One experiment = one rubric + one cohort + N arms. Each LLM arm runs per
4
+ submission x repetitions; the signals arm runs once per cohort (deterministic).
5
+ Afterwards, every arm's mean score and every numeric signal is correlated
6
+ against the human marks (when provided). The bench measures; it never marks.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import csv
12
+ from pathlib import Path
13
+
14
+ import yaml
15
+
16
+ from . import arms, stats
17
+ from .exceptions import AssessmentBenchError
18
+ from .models import (
19
+ Agreement,
20
+ ArmKind,
21
+ ArmOutcome,
22
+ ExperimentConfig,
23
+ ExperimentResult,
24
+ )
25
+
26
+
27
def load_config(path: Path) -> ExperimentConfig:
    """Load an experiment YAML; relative paths resolve against the config's folder.

    Args:
        path: Location of the experiment YAML file.

    Returns:
        The validated config with ``rubric``, ``submissions`` and (when set)
        ``human_marks`` rewritten as absolute, resolved paths.
    """
    config_path = Path(path)
    # Pass raw bytes so PyYAML determines the encoding itself (BOM check,
    # UTF-8 otherwise) instead of decoding with the platform locale.
    raw = yaml.safe_load(config_path.read_bytes())
    config = ExperimentConfig.model_validate(raw)
    base = config_path.resolve().parent
    config.rubric = (base / config.rubric).resolve()
    config.submissions = (base / config.submissions).resolve()
    if config.human_marks is not None:
        config.human_marks = (base / config.human_marks).resolve()
    return config
37
+
38
+
39
def discover_submissions(submissions_dir: Path) -> list[Path]:
    """List the cohort: every visible subfolder is one submission.

    Plain files and entries whose name starts with ``.`` are ignored. The
    result is sorted by path.

    Raises:
        AssessmentBenchError: ``submissions_dir`` is not a directory, or it
            holds no submission subfolders.
    """
    if not submissions_dir.is_dir():
        raise AssessmentBenchError(f"submissions folder not found: {submissions_dir}")

    candidates: list[Path] = []
    for entry in submissions_dir.iterdir():
        if entry.name.startswith(".") or not entry.is_dir():
            continue
        candidates.append(entry)
    candidates.sort()

    if not candidates:
        raise AssessmentBenchError(f"no submission subfolders in {submissions_dir}")
    return candidates
47
+
48
+
49
def load_human_marks(path: Path) -> dict[str, float]:
    """Read human ground-truth marks from a CSV with header ``submission_id,mark``.

    Args:
        path: CSV file to read.

    Returns:
        Mapping of submission id (surrounding whitespace stripped) to mark. A
        later row for the same id replaces an earlier one.

    Raises:
        KeyError: A required column is missing from the header.
        ValueError: A mark is not a number.
    """
    marks: dict[str, float] = {}
    # "utf-8-sig" reads plain UTF-8 and also drops a leading byte-order mark.
    # Spreadsheet exports often add one, and it would otherwise be glued onto
    # the first header name, making the "submission_id" lookup fail.
    with open(path, newline="", encoding="utf-8-sig") as f:
        for row in csv.DictReader(f):
            marks[row["submission_id"].strip()] = float(row["mark"])
    return marks
56
+
57
+
58
def _agreements(
    result: ExperimentResult, marks: dict[str, float]
) -> list[Agreement]:
    """Correlate every arm mean and every numeric signal with the human marks.

    A measure is either an arm id (its per-submission mean score) or a signal
    name (its per-submission value). Only submissions that also have a human
    mark are paired, and a measure with fewer than two pairs is skipped. A
    signal that shares its name with an arm id replaces that arm's entry.
    """
    arm_means: dict[str, dict[str, float]] = {}
    signal_values: dict[str, dict[str, float]] = {}
    for outcome in result.outcomes:
        # Outcomes with stats contribute their mean under the arm id.
        if outcome.stats is not None:
            arm_means.setdefault(outcome.arm_id, {})[outcome.submission_id] = outcome.stats.mean
        # Signal readings contribute numeric values only. ``bool`` is a
        # subclass of ``int``, so True/False are included as 1.0/0.0.
        for reading in outcome.signals:
            observed = reading.value
            if not isinstance(observed, (int, float)):
                continue
            signal_values.setdefault(reading.signal, {})[reading.submission_id] = float(observed)

    measures: dict[str, dict[str, float]] = dict(arm_means)
    measures.update(signal_values)

    found: list[Agreement] = []
    for measure, per_submission in measures.items():
        matched = [sid for sid in per_submission if sid in marks]
        if len(matched) < 2:
            continue
        xs = [per_submission[sid] for sid in matched]
        ys = [marks[sid] for sid in matched]
        found.append(
            Agreement(
                measure=measure,
                n=len(matched),
                pearson=stats.pearson(xs, ys),
                spearman=stats.spearman(xs, ys),
            )
        )
    return found
95
+
96
+
97
def run_experiment(config: ExperimentConfig, *, progress=None) -> ExperimentResult:
    """Run every configured arm over the cohort and collect one structured result.

    A signals arm makes a single pass over the whole cohort and yields one
    outcome per submission holding that submission's readings. Any other arm
    is run per submission, and its scored runs are summarised into statistics.
    Agreement with human marks is computed only when the config names a marks
    file.

    ``progress`` is an optional callable(str) for CLI/UI status lines.
    """
    report = progress if progress else (lambda _msg: None)
    cohort = discover_submissions(config.submissions)
    rubric_text = config.rubric.read_text()
    result = ExperimentResult(
        name=config.name,
        max_score=config.max_score,
        submissions=[folder.name for folder in cohort],
    )

    for arm in config.arms:
        if arm.kind is not ArmKind.SIGNALS:
            for folder in cohort:
                report(f"arm {arm.id}: {folder.name} x{arm.repetitions}")
                runs = arms.run_llm_arm(arm, folder.name, folder, rubric_text, config.max_score)
                # Runs without an extracted score are kept but excluded from stats.
                usable = [run.score for run in runs if run.score is not None]
                outcome = ArmOutcome(
                    submission_id=folder.name,
                    arm_id=arm.id,
                    runs=runs,
                    stats=stats.run_stats(usable),
                )
                result.outcomes.append(outcome)
            continue

        report(f"arm {arm.id}: assessment-lens over {len(cohort)} submissions")
        readings = arms.run_signals_arm(arm, config.rubric, config.submissions)
        # Bucket the flat readings by submission, preserving their order.
        per_submission: dict[str, list] = {}
        for reading in readings:
            per_submission.setdefault(reading.submission_id, []).append(reading)
        for folder in cohort:
            outcome = ArmOutcome(
                submission_id=folder.name,
                arm_id=arm.id,
                signals=per_submission.get(folder.name, []),
            )
            result.outcomes.append(outcome)

    if config.human_marks is not None:
        result.agreements = _agreements(result, load_human_marks(config.human_marks))

    return result
@@ -0,0 +1,136 @@
1
+ """Core data models for assessment-bench.
2
+
3
+ The bench is the family's *measurement* layer: it runs the same cohort through
4
+ competing assessment arms and reports consistency and agreement. The design
5
+ rule that shapes these models: **the bench measures; it never marks.** An LLM
6
+ arm produces scores because that is the approach under test — the bench treats
7
+ those scores as data points, not as grades for students. Human marks, when
8
+ provided, are the ground truth everything is compared against.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ from enum import Enum
14
+ from pathlib import Path
15
+
16
+ from pydantic import BaseModel, Field
17
+
18
+
19
+ # --- Experiment side (input) -------------------------------------------------
20
class ProviderName(str, Enum):
    """The LLM providers an arm may name.

    Mixing in ``str`` makes each member compare equal to its plain string
    value (e.g. ``"anthropic"``).
    """

    ANTHROPIC = "anthropic"
    OPENAI = "openai"
    OLLAMA = "ollama"
    OPENROUTER = "openrouter"
25
+
26
+
27
class ProviderSpec(BaseModel):
    """Which LLM serves an arm. base_url covers Ollama / any OpenAI-compatible host."""

    # Provider family; must be one of the ProviderName values.
    provider: ProviderName
    # Model identifier, passed as a free-form string (not validated here).
    model: str
    # Optional endpoint override; None leaves the choice to the provider layer.
    base_url: str | None = None
    # Sampling temperature; defaults to 0.1.
    temperature: float = 0.1
    # Token limit; presumably the response cap sent to the provider (TODO
    # confirm in the provider layer).
    max_tokens: int = 1500
35
+
36
+
37
class ArmKind(str, Enum):
    """The kinds of assessment arm the bench can run."""

    LLM = "llm"  # pure-LLM marking: submission + rubric -> score
    SIGNALS = "signals"  # assessment-lens observations: deterministic evidence values
40
+
41
+
42
class ArmSpec(BaseModel):
    """One assessment approach under test."""

    # Identifier of the arm; stored on every run and outcome it produces.
    id: str
    kind: ArmKind
    # Number of repeated runs; validated to lie between 1 and 50 inclusive.
    repetitions: int = Field(default=1, ge=1, le=50)
    provider: ProviderSpec | None = None  # required for kind=llm

    def model_post_init(self, __context: object) -> None:
        """Reject an LLM arm that has no provider.

        Raises:
            ValueError: ``kind`` is ``ArmKind.LLM`` and ``provider`` is None.
        """
        if self.kind is ArmKind.LLM and self.provider is None:
            raise ValueError(f"arm '{self.id}': kind=llm requires a provider")
53
+
54
+
55
class ExperimentConfig(BaseModel):
    """One experiment: a rubric, a cohort, the arms to compare.

    Paths are resolved relative to the config file's directory by ``load_config``.
    """

    # Display name of the experiment.
    name: str
    # Path to the rubric file.
    rubric: Path
    # Folder holding the cohort.
    submissions: Path
    # Maximum score; defaults to 100.
    max_score: float = 100.0
    # None disables the agreement statistics.
    human_marks: Path | None = Field(
        default=None,
        description="Optional CSV (submission_id,mark) of human ground-truth marks.",
    )
    # At least one arm is required.
    arms: list[ArmSpec] = Field(min_length=1)
70
+
71
+
72
+ # --- Result side (output) ----------------------------------------------------
73
class GradeRun(BaseModel):
    """One LLM grading call. score=None means extraction failed (kept, not hidden)."""

    submission_id: str
    arm_id: str
    # Position of this call within the arm's repetitions.
    run_index: int
    # Extracted score; None when no score could be extracted.
    score: float | None = None
    # The maximum the score is expressed against.
    max_score: float
    # Explanatory text accompanying the score; empty by default.
    rationale: str = ""
    # The provider's response text; empty by default.
    raw_response: str = ""
    # Failure description; empty by default.
    error: str = ""
84
+
85
+
86
class RunStats(BaseModel):
    """Consistency statistics over one arm's repeated runs for one submission."""

    # Number of scores summarised.
    n: int
    mean: float
    median: float
    std_dev: float = Field(description="Sample standard deviation (n-1).")
    # Presumably std_dev / mean; the computation is not visible here (TODO confirm).
    coefficient_of_variation: float
    # Smallest and largest score among the summarised runs.
    min: float
    max: float
    reliability: float = Field(
        description="1 - CV, floored at 0. A rough 'how repeatable was this arm' index."
    )
99
+
100
+
101
class SignalReading(BaseModel):
    """One deterministic evidence value from the signals arm."""

    submission_id: str
    # Rubric criterion the evidence was observed under.
    criterion_id: str
    # Name of the signal.
    signal: str
    # Raw evidence value of any type; may be None.
    value: object | None = None
108
+
109
+
110
class ArmOutcome(BaseModel):
    """Everything one arm produced for one submission."""

    submission_id: str
    arm_id: str
    # Individual grading calls; empty by default.
    runs: list[GradeRun] = Field(default_factory=list)
    # Summary statistics over the runs; None by default.
    stats: RunStats | None = None
    # Evidence readings for this submission; empty by default.
    signals: list[SignalReading] = Field(default_factory=list)
118
+
119
+
120
class Agreement(BaseModel):
    """Correlation between one measure and the human marks."""

    measure: str = Field(description="An arm id (mean score) or a dotted signal path.")
    # Number of (measure, human mark) pairs the correlations were computed over.
    n: int
    # Correlation coefficients; None when the coefficient is unavailable.
    pearson: float | None = None
    spearman: float | None = None
127
+
128
+
129
class ExperimentResult(BaseModel):
    """The source-of-truth structured result for one experiment run."""

    # Experiment name.
    name: str
    # Maximum score.
    max_score: float
    # Submission ids in the cohort.
    submissions: list[str] = Field(default_factory=list)
    # One entry per (arm, submission) pair.
    outcomes: list[ArmOutcome] = Field(default_factory=list)
    # Correlations with human marks; empty by default.
    agreements: list[Agreement] = Field(default_factory=list)
@@ -0,0 +1,114 @@
1
+ """Multi-provider LLM completion for the pure-LLM marking arm.
2
+
3
+ Provider registry adapted from image-analyser's caption providers: Anthropic via
4
+ its own SDK; OpenAI, OpenRouter, and Ollama through the openai SDK (the latter
5
+ two are OpenAI-compatible endpoints reached via base_url). Key resolution
6
+ follows the family pattern — env var first, then a minimal .env fallback.
7
+
8
+ Everything here is opt-in and degradable: callers catch ``LLMUnavailable`` and
9
+ the experiment records the failure instead of dying mid-cohort.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import os
15
+ from pathlib import Path
16
+
17
+ from .exceptions import AssessmentBenchError
18
+ from .models import ProviderName, ProviderSpec
19
+
20
# Name of the environment variable that holds each provider's API key.
# None marks a provider that needs no key.
PROVIDER_KEYS = {
    ProviderName.ANTHROPIC: "ANTHROPIC_API_KEY",
    ProviderName.OPENAI: "OPENAI_API_KEY",
    ProviderName.OPENROUTER: "OPENROUTER_API_KEY",
    ProviderName.OLLAMA: None,  # local; no key
}

# Fallback endpoint used when a provider spec carries no base_url of its own.
# Providers missing from this mapping resolve to None.
DEFAULT_BASE_URLS = {
    ProviderName.OPENROUTER: "https://openrouter.ai/api/v1",
    ProviderName.OLLAMA: "http://localhost:11434/v1",
}
31
+
32
+
33
class LLMUnavailable(AssessmentBenchError):
    """The [llm] extra is not installed or no API key is configured.

    Raised by ``complete`` when no API key can be resolved for the provider,
    and by the provider-specific helpers when their SDK cannot be imported.
    """
35
+
36
+
37
def _load_env_file() -> None:
    """Minimal .env loader (cwd upward) — no python-dotenv dependency.

    Walks from the current working directory towards the filesystem root and
    loads the first regular file named ``.env`` that it finds, then stops.
    Blank lines, ``#`` comment lines and lines without ``=`` are ignored. An
    optional leading ``export `` on the key is dropped and surrounding quotes
    are stripped from the value. Variables already present in the environment
    are never overwritten.

    The loader is best-effort: an unreadable or undecodable file, or a single
    entry the OS refuses to store, is skipped silently instead of raising.
    """
    cwd = Path.cwd()
    for parent in [cwd, *cwd.parents]:
        env_file = parent / ".env"
        # is_file(): a *directory* named ".env" must not end the upward search.
        if not env_file.is_file():
            continue
        try:
            # Explicit encoding so the result does not depend on the locale;
            # "utf-8-sig" also drops a BOM that would corrupt the first key.
            text = env_file.read_text(encoding="utf-8-sig")
        except (OSError, UnicodeDecodeError):
            return
        for raw_line in text.splitlines():
            line = raw_line.strip()
            if not line or line.startswith("#") or "=" not in line:
                continue
            key, value = line.split("=", 1)
            key = key.strip()
            if key.startswith("export "):
                # Accept shell-style "export KEY=value" lines.
                key = key[len("export "):].strip()
            if not key:
                continue
            try:
                os.environ.setdefault(key, value.strip().strip("\"'"))
            except (OSError, ValueError):
                # An entry the OS rejects must not abort the remaining lines.
                continue
        return
51
+
52
+
53
def get_api_key(provider: ProviderName) -> str | None:
    """Resolve the API key for ``provider``.

    The process environment is consulted first; only when the variable is
    unset or empty is the ``.env`` fallback loaded and the lookup repeated.
    Providers without a key variable get the placeholder ``"unused"``.
    """
    variable = PROVIDER_KEYS.get(provider)
    if variable is None:
        # Ollama: openai SDK requires a non-empty key
        return "unused"
    found = os.getenv(variable)
    if not found:
        _load_env_file()
        found = os.getenv(variable)
    return found
61
+
62
+
63
def complete(prompt: str, *, system: str, spec: ProviderSpec) -> str:
    """Run a single marking-style completion with the arm's configured provider.

    Anthropic is served by its own SDK helper; every other provider goes
    through the OpenAI-compatible helper.

    Raises:
        LLMUnavailable: when no API key can be resolved for the provider.
    """
    api_key = get_api_key(spec.provider)
    if api_key:
        if spec.provider is ProviderName.ANTHROPIC:
            backend = _complete_anthropic
        else:
            backend = _complete_openai_compatible
        return backend(prompt, system=system, spec=spec, api_key=api_key)
    raise LLMUnavailable(
        f"No API key for {spec.provider.value} — set {PROVIDER_KEYS[spec.provider]} (env or .env)."
    )
74
+
75
+
76
def _complete_anthropic(prompt: str, *, system: str, spec: ProviderSpec, api_key: str) -> str:
    """Send one user message through the Anthropic SDK and return its text.

    The SDK is imported lazily so the package works without the [llm] extra.
    Only ``text`` content blocks contribute to the result; they are
    concatenated and the outcome is stripped of surrounding whitespace.

    Raises:
        LLMUnavailable: when the ``anthropic`` package cannot be imported.
    """
    try:
        import anthropic
    except ImportError as exc:
        raise LLMUnavailable(
            "LLM arms need the [llm] extra: pip install 'assessment-bench[llm]'"
        ) from exc
    request = {
        "model": spec.model,
        "max_tokens": spec.max_tokens,
        "temperature": spec.temperature,
        "system": system,
        "messages": [{"role": "user", "content": prompt}],
    }
    reply = anthropic.Anthropic(api_key=api_key).messages.create(**request)
    pieces = [part.text for part in reply.content if part.type == "text"]
    return "".join(pieces).strip()
92
+
93
+
94
def _complete_openai_compatible(
    prompt: str, *, system: str, spec: ProviderSpec, api_key: str
) -> str:
    """Send a system + user exchange through the openai SDK and return the reply.

    The endpoint is the spec's own ``base_url`` when set, otherwise the
    provider's entry in ``DEFAULT_BASE_URLS`` (None when there is none).
    A missing message content is treated as the empty string.

    Raises:
        LLMUnavailable: when the ``openai`` package cannot be imported.
    """
    try:
        import openai
    except ImportError as exc:
        raise LLMUnavailable(
            "LLM arms need the [llm] extra: pip install 'assessment-bench[llm]'"
        ) from exc
    endpoint = spec.base_url
    if not endpoint:
        endpoint = DEFAULT_BASE_URLS.get(spec.provider)
    conversation = [
        {"role": "system", "content": system},
        {"role": "user", "content": prompt},
    ]
    reply = openai.OpenAI(api_key=api_key, base_url=endpoint).chat.completions.create(
        model=spec.model,
        max_tokens=spec.max_tokens,
        temperature=spec.temperature,
        messages=conversation,
    )
    content = reply.choices[0].message.content
    return (content or "").strip()
@@ -0,0 +1,58 @@
1
+ """Result writers: one JSON source of truth + flat CSVs for spreadsheet people."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import csv
6
+ import json
7
+ from pathlib import Path
8
+
9
+ from .models import ExperimentResult
10
+
11
+
12
def _write_csv(path: Path, header: list[str], rows: list[list[object]]) -> Path:
    """Write ``header`` followed by ``rows`` to ``path`` as UTF-8 CSV; return ``path``."""
    # newline="" leaves line endings to the csv module. The explicit encoding
    # keeps non-ASCII text (e.g. provider error messages) from failing or
    # varying with the platform's locale.
    with open(path, "w", newline="", encoding="utf-8") as handle:
        writer = csv.writer(handle)
        writer.writerow(header)
        writer.writerows(rows)
    return path


def write_results(result: ExperimentResult, out_dir: Path) -> list[Path]:
    """Write result.json, runs.csv, signals.csv, agreement.csv. Returns written paths.

    ``out_dir`` is created (with parents) when missing. ``agreement.csv`` is
    only produced when the result carries at least one agreement, so the
    returned list holds three or four paths, in the order they were written.
    All files are written as UTF-8.

    Args:
        result: The experiment result to serialise.
        out_dir: Directory that receives the output files.

    Returns:
        The paths written, in write order.
    """
    out_dir.mkdir(parents=True, exist_ok=True)
    written: list[Path] = []

    # The JSON dump is the source of truth; the CSVs are flat views of it.
    json_path = out_dir / "result.json"
    json_path.write_text(
        json.dumps(result.model_dump(mode="json"), indent=2), encoding="utf-8"
    )
    written.append(json_path)

    run_rows = [
        [run.submission_id, run.arm_id, run.run_index, run.score, run.max_score, run.error]
        for outcome in result.outcomes
        for run in outcome.runs
    ]
    written.append(
        _write_csv(
            out_dir / "runs.csv",
            ["submission", "arm", "run", "score", "max_score", "error"],
            run_rows,
        )
    )

    signal_rows = [
        [
            reading.submission_id,
            outcome.arm_id,
            reading.criterion_id,
            reading.signal,
            # Signal values are arbitrary objects; JSON-encode them so that
            # structured values survive in a single CSV cell.
            json.dumps(reading.value, default=str),
        ]
        for outcome in result.outcomes
        for reading in outcome.signals
    ]
    written.append(
        _write_csv(
            out_dir / "signals.csv",
            ["submission", "arm", "criterion", "signal", "value"],
            signal_rows,
        )
    )

    if result.agreements:
        agreement_rows = [[a.measure, a.n, a.pearson, a.spearman] for a in result.agreements]
        written.append(
            _write_csv(
                out_dir / "agreement.csv",
                ["measure", "n", "pearson", "spearman"],
                agreement_rows,
            )
        )

    return written
@@ -0,0 +1,79 @@
1
+ """Consistency and agreement statistics.
2
+
3
+ The run-level statistics (mean/median/sample std-dev/CV/reliability) are a port
4
+ of the original AssessmentBench Rust aggregation engine — the best-validated
5
+ concept in that prototype. Agreement (Pearson/Spearman against human marks) is
6
+ new here: it is the bench's core research output. Pure stdlib, no numpy.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from .models import RunStats
12
+
13
+
14
def run_stats(scores: list[float]) -> RunStats | None:
    """Summarise the repeated scores of one arm; None for an empty list.

    The standard deviation is the sample form (divisor n-1) and is 0.0 for a
    single score. The coefficient of variation is 0.0 when the mean is zero,
    and reliability is ``1 - CV`` floored at 0.
    """
    if not scores:
        return None

    ascending = sorted(scores)
    count = len(ascending)
    average = sum(scores) / count

    middle = count // 2
    if count % 2 == 1:
        median = ascending[middle]
    else:
        median = (ascending[middle - 1] + ascending[middle]) / 2.0

    spread = 0.0
    if count > 1:
        squared_deviations = [(value - average) ** 2 for value in scores]
        spread = (sum(squared_deviations) / (count - 1)) ** 0.5

    variation = spread / average if average else 0.0
    return RunStats(
        n=count,
        mean=average,
        median=median,
        std_dev=spread,
        coefficient_of_variation=variation,
        min=ascending[0],
        max=ascending[-1],
        reliability=max(0.0, 1.0 - variation),
    )
42
+
43
+
44
def pearson(xs: list[float], ys: list[float]) -> float | None:
    """Pearson r. None when undefined (n<2 or zero variance) — never faked as 0.

    Args:
        xs: First series.
        ys: Second series, paired element-wise with ``xs``.

    Returns:
        The correlation coefficient clamped to [-1.0, 1.0], or None when the
        series differ in length, hold fewer than two pairs, or either series
        has no variance.
    """
    n = len(xs)
    if n != len(ys) or n < 2:
        return None
    # Detect a constant series exactly. The float mean of identical values
    # (e.g. [0.1, 0.1, 0.1]) can round away from the value itself, which
    # leaves a tiny non-zero sum of squares that an `== 0` test would miss.
    if min(xs) == max(xs) or min(ys) == max(ys):
        return None
    mx = sum(xs) / n
    my = sum(ys) / n
    sxx = sum((x - mx) ** 2 for x in xs)
    syy = sum((y - my) ** 2 for y in ys)
    # Still needed: squared deviations of very close values can underflow to 0.
    if sxx == 0 or syy == 0:
        return None
    sxy = sum((x - mx) * (y - my) for x, y in zip(xs, ys))
    r = sxy / (sxx**0.5 * syy**0.5)
    # Rounding can push a perfect correlation marginally past +/-1.
    return max(-1.0, min(1.0, r))
57
+
58
+
59
def _ranks(values: list[float]) -> list[float]:
    """Return 1-based average ranks; tied values share the mean of their positions."""
    order = sorted(range(len(values)), key=values.__getitem__)
    total = len(order)
    result = [0.0] * total
    start = 0
    while start < total:
        # Extend [start, stop) over the whole run of values equal to the first.
        tied_value = values[order[start]]
        stop = start + 1
        while stop < total and values[order[stop]] == tied_value:
            stop += 1
        # Mean of the 1-based positions start+1 .. stop.
        shared_rank = (start + stop - 1) / 2.0 + 1.0
        for original_index in order[start:stop]:
            result[original_index] = shared_rank
        start = stop
    return result
73
+
74
+
75
def spearman(xs: list[float], ys: list[float]) -> float | None:
    """Spearman rho, computed as Pearson r over average ranks.

    Returns None when the series differ in length, hold fewer than two
    pairs, or the rank correlation itself is undefined.
    """
    size = len(xs)
    if size < 2 or size != len(ys):
        return None
    x_ranks = _ranks(xs)
    y_ranks = _ranks(ys)
    return pearson(x_ranks, y_ranks)
@@ -0,0 +1,36 @@
1
+ """Score extraction — the riskiest parsing in the bench (the Rust prototype never tested its)."""
2
+
3
+ import pytest
4
+
5
+ from assessment_bench.arms import extract_score, grade_prompt
6
+
7
+
8
def test_strict_score_line():
    """A trailing ``SCORE: 78/100`` line yields 78 on a 100-point scale."""
    score, _ = extract_score("Good work overall.\nSCORE: 78/100", 100.0)
    assert score == pytest.approx(78.0)
11
+
12
+
13
def test_last_score_line_wins():
    """When several SCORE markers appear, the last one is the one extracted."""
    text = "If perfect this would be SCORE: 100/100, but...\nSCORE: 62.5/100"
    score, _ = extract_score(text, 100.0)
    assert score == pytest.approx(62.5)
17
+
18
+
19
def test_scaled_to_max_score():
    """A score given out of 10 is rescaled to the requested maximum of 100."""
    score, _ = extract_score("SCORE: 7/10", 100.0)
    assert score == pytest.approx(70.0)
22
+
23
+
24
def test_fallback_out_of_phrasing():
    """Without a SCORE line, "41 out of 50" is still found and scaled to 82/100."""
    score, _ = extract_score("I would award 41 out of 50 for this.", 100.0)
    assert score == pytest.approx(82.0)
27
+
28
+
29
def test_no_score_returns_none():
    """Text that contains no score at all extracts as None."""
    score, _ = extract_score("This is thoughtful work with clear structure.", 100.0)
    assert score is None
32
+
33
+
34
def test_grade_prompt_carries_parts():
    """The grading prompt embeds the rubric, the submission and the SCORE format."""
    p = grade_prompt("RUB", "SUB", 50.0)
    assert "RUB" in p and "SUB" in p and "SCORE: x/50" in p
@@ -0,0 +1,44 @@
1
+ """Config models: the experiment YAML is the bench's central contract."""
2
+
3
+ import pytest
4
+ from pydantic import ValidationError
5
+
6
+ from assessment_bench.models import ArmKind, ArmSpec, ExperimentConfig
7
+
8
+
9
def test_llm_arm_requires_provider():
    """Constructing an LLM arm without a provider is rejected."""
    with pytest.raises((ValidationError, ValueError)):
        ArmSpec(id="bad", kind=ArmKind.LLM)
12
+
13
+
14
def test_signals_arm_needs_no_provider():
    """A signals arm builds without a provider and defaults to one repetition."""
    arm = ArmSpec(id="signals", kind=ArmKind.SIGNALS)
    assert arm.repetitions == 1
17
+
18
+
19
def test_experiment_config_parses():
    """A minimal two-arm mapping validates and picks up the documented defaults."""
    config = ExperimentConfig.model_validate(
        {
            "name": "t",
            "rubric": "rubric.yaml",
            "submissions": "subs/",
            "arms": [
                {
                    "id": "llm",
                    "kind": "llm",
                    "repetitions": 3,
                    "provider": {"provider": "ollama", "model": "llama3.1"},
                },
                {"id": "signals", "kind": "signals"},
            ],
        }
    )
    # Neither max_score nor base_url was supplied, so both are defaults.
    assert config.max_score == 100.0
    assert config.arms[0].provider.base_url is None
38
+
39
+
40
def test_experiment_config_requires_an_arm():
    """An experiment with an empty ``arms`` list fails validation."""
    with pytest.raises(ValidationError):
        ExperimentConfig.model_validate(
            {"name": "t", "rubric": "r.yaml", "submissions": "s/", "arms": []}
        )
@@ -0,0 +1,44 @@
1
+ """Stats: the ported Rust aggregation math + the new agreement correlations."""
2
+
3
+ import pytest
4
+
5
+ from assessment_bench.stats import pearson, run_stats, spearman
6
+
7
+
8
def test_run_stats_basic():
    """Every RunStats field is checked for the three scores 80, 85, 90."""
    s = run_stats([80.0, 85.0, 90.0])
    assert s.n == 3
    assert s.mean == pytest.approx(85.0)
    assert s.median == pytest.approx(85.0)
    assert s.std_dev == pytest.approx(5.0)  # sample std-dev (n-1)
    assert s.coefficient_of_variation == pytest.approx(5.0 / 85.0)
    assert s.min == 80.0 and s.max == 90.0
    assert s.reliability == pytest.approx(1.0 - 5.0 / 85.0)
17
+
18
+
19
def test_run_stats_even_median_and_single():
    """Covers the even-length median, the single-score case and empty input."""
    assert run_stats([1.0, 2.0, 3.0, 4.0]).median == pytest.approx(2.5)
    single = run_stats([70.0])
    # One score has no spread, so it counts as perfectly repeatable.
    assert single.std_dev == 0.0 and single.reliability == 1.0
    assert run_stats([]) is None
24
+
25
+
26
def test_pearson_perfect_and_inverse():
    """Perfectly linear series give r = 1; the reversed series gives r = -1."""
    assert pearson([1, 2, 3], [10, 20, 30]) == pytest.approx(1.0)
    assert pearson([1, 2, 3], [30, 20, 10]) == pytest.approx(-1.0)
29
+
30
+
31
def test_pearson_undefined_not_faked():
    """An undefined correlation is reported as None rather than as a number."""
    assert pearson([1, 1, 1], [1, 2, 3]) is None  # zero variance
    assert pearson([1], [1]) is None  # n < 2
34
+
35
+
36
def test_spearman_monotonic_nonlinear():
    """Spearman rho is 1 for a strictly increasing but nonlinear relationship."""
    # Monotonic but nonlinear: rho is 1 even where r is not.
    xs = [1.0, 2.0, 3.0, 4.0]
    ys = [1.0, 10.0, 100.0, 1000.0]
    assert spearman(xs, ys) == pytest.approx(1.0)
41
+
42
+
43
def test_spearman_ties_average_ranks():
    """Identical series containing a tie still correlate perfectly (rho = 1)."""
    assert spearman([1.0, 1.0, 2.0], [1.0, 1.0, 2.0]) == pytest.approx(1.0)