itemeval 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- itemeval/__init__.py +60 -0
- itemeval/_builtin/config.yaml +48 -0
- itemeval/_builtin/prompts/solver/minimal.md +4 -0
- itemeval/_builtin/prompts/solver/standard.md +6 -0
- itemeval/_builtin/rubrics/standard.md +17 -0
- itemeval/_config.py +252 -0
- itemeval/_errors.py +25 -0
- itemeval/_item.py +29 -0
- itemeval/_manifest.py +196 -0
- itemeval/_mockmodels.py +68 -0
- itemeval/_prepare.py +85 -0
- itemeval/_status.py +165 -0
- itemeval/_templates.py +161 -0
- itemeval/_util.py +46 -0
- itemeval/adapters/__init__.py +1 -0
- itemeval/adapters/_base.py +98 -0
- itemeval/adapters/_hf.py +85 -0
- itemeval/budget/__init__.py +1 -0
- itemeval/budget/_estimator.py +195 -0
- itemeval/budget/_gate.py +69 -0
- itemeval/budget/_policies.py +40 -0
- itemeval/budget/_pricing.py +131 -0
- itemeval/budget/pricing_seed.json +13 -0
- itemeval/cli.py +407 -0
- itemeval/design/__init__.py +1 -0
- itemeval/design/_grid.py +178 -0
- itemeval/design/_ids.py +25 -0
- itemeval/generate/__init__.py +1 -0
- itemeval/generate/_params.py +50 -0
- itemeval/generate/_run.py +397 -0
- itemeval/generate/_task.py +61 -0
- itemeval/grade/__init__.py +1 -0
- itemeval/grade/_judge.py +96 -0
- itemeval/grade/_parse.py +94 -0
- itemeval/grade/_run.py +333 -0
- itemeval/grade/_verifiable.py +84 -0
- itemeval/py.typed +0 -0
- itemeval/store/__init__.py +1 -0
- itemeval/store/_base.py +59 -0
- itemeval/store/_export.py +183 -0
- itemeval/store/_gradings.py +87 -0
- itemeval/store/_items.py +47 -0
- itemeval/store/_layout.py +47 -0
- itemeval/store/_ledger.py +37 -0
- itemeval/store/_logs.py +38 -0
- itemeval/store/_solutions.py +93 -0
- itemeval-0.1.0.dist-info/METADATA +279 -0
- itemeval-0.1.0.dist-info/RECORD +51 -0
- itemeval-0.1.0.dist-info/WHEEL +4 -0
- itemeval-0.1.0.dist-info/entry_points.txt +2 -0
- itemeval-0.1.0.dist-info/licenses/LICENSE +21 -0
itemeval/__init__.py
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
"""itemeval: item-level LLM evaluation over any API, with built-in budget control.
|
|
2
|
+
|
|
3
|
+
Two equivalent ways to drive a study:
|
|
4
|
+
|
|
5
|
+
CLI itemeval estimate|generate|grade|export|status CONFIG
|
|
6
|
+
Python cfg = load_config("configs/my_study.yaml")
|
|
7
|
+
prep = prepare_study(cfg)
|
|
8
|
+
estimate_study(prep) # projected $ per stage
|
|
9
|
+
run_generate(prep) # stage 1 -> solutions store
|
|
10
|
+
run_grade(prep) # stage 2 -> gradings store
|
|
11
|
+
export_study(cfg) # long-format parquet + CSV + ledger
|
|
12
|
+
build_status(cfg, prep) # grid completion report
|
|
13
|
+
|
|
14
|
+
The Python pipeline functions do NOT apply the budget confirmation gate (a
|
|
15
|
+
CLI feature) — compare `estimate_study(...)` totals against your own
|
|
16
|
+
threshold before paid runs.
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
from importlib import import_module
|
|
20
|
+
from importlib.metadata import version
|
|
21
|
+
|
|
22
|
+
from itemeval._config import ExperimentConfig, load_config
|
|
23
|
+
from itemeval._item import Item
|
|
24
|
+
|
|
25
|
+
__version__ = version("itemeval")
|
|
26
|
+
|
|
27
|
+
# Pipeline functions resolve lazily (PEP 562) so `import itemeval` stays
|
|
28
|
+
# light: eager imports here would pull inspect_ai/pandas into every CLI start.
|
|
29
|
+
_LAZY = {
|
|
30
|
+
"prepare_study": ("itemeval._prepare", "prepare_study"),
|
|
31
|
+
"estimate_study": ("itemeval.budget._estimator", "estimate_study"),
|
|
32
|
+
"run_generate": ("itemeval.generate._run", "run_generate"),
|
|
33
|
+
"run_grade": ("itemeval.grade._run", "run_grade"),
|
|
34
|
+
"export_study": ("itemeval.store._export", "export_study"),
|
|
35
|
+
"build_status": ("itemeval._status", "build_status"),
|
|
36
|
+
}
|
|
37
|
+
|
|
38
|
+
__all__ = [
|
|
39
|
+
"ExperimentConfig",
|
|
40
|
+
"Item",
|
|
41
|
+
"__version__",
|
|
42
|
+
"build_status",
|
|
43
|
+
"estimate_study",
|
|
44
|
+
"export_study",
|
|
45
|
+
"load_config",
|
|
46
|
+
"prepare_study",
|
|
47
|
+
"run_generate",
|
|
48
|
+
"run_grade",
|
|
49
|
+
]
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def __getattr__(name: str):
    """Resolve a lazily exported pipeline function on attribute access (PEP 562).

    Names registered in ``_LAZY`` map to a ``(module, attribute)`` pair; the
    providing module is imported only when the name is actually requested.

    Raises:
        AttributeError: if *name* is not a lazily exported name.
    """
    target = _LAZY.get(name)
    if target is None:
        raise AttributeError(f"module 'itemeval' has no attribute {name!r}")
    module_name, attr = target
    provider = import_module(module_name)
    return getattr(provider, attr)
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def __dir__() -> "list[str]":
    """Return the names declared in ``__all__``, sorted."""
    names = list(__all__)
    names.sort()
    return names
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
# itemeval study config — generated by `itemeval init`.
|
|
2
|
+
# Runs free out of the box on the mock provider; see the numbered steps to adapt it.
|
|
3
|
+
study: my_study
|
|
4
|
+
|
|
5
|
+
# Inputs (prompts/rubrics) resolve relative to THIS config file's directory.
|
|
6
|
+
# Outputs (the studies/ folder) are written under your current working directory.
|
|
7
|
+
# Both anchors are overridable — see docs/wiki/Configuration.md.
|
|
8
|
+
|
|
9
|
+
benchmark:
|
|
10
|
+
adapter: hf
|
|
11
|
+
datasets:
|
|
12
|
+
# step 1: point this at your HuggingFace dataset and pin a revision.
|
|
13
|
+
- id: MathArena/usamo_2025
|
|
14
|
+
revision: 0a2c60f2249e07b8ee76c942bca4f5f87aa959df
|
|
15
|
+
split: train
|
|
16
|
+
mapping:
|
|
17
|
+
# step 1: adjust these to your dataset's column names.
|
|
18
|
+
id: problem_idx
|
|
19
|
+
input: problem
|
|
20
|
+
target: sample_solution
|
|
21
|
+
grading_scheme: grading_scheme
|
|
22
|
+
metadata: [points]
|
|
23
|
+
|
|
24
|
+
solvers:
|
|
25
|
+
# step 4: replace mockllm/* with real inspect model ids (openai/..., anthropic/..., openrouter/...).
|
|
26
|
+
models: [mockllm/solver-a, mockllm/solver-b, mockllm/solver-c]
|
|
27
|
+
temperature: 0.7
|
|
28
|
+
max_tokens: 1024
|
|
29
|
+
|
|
30
|
+
facets:
|
|
31
|
+
# `builtin:` templates ship with itemeval. Run `itemeval init --with-templates`
|
|
32
|
+
# to copy them locally as editable starters, or write your own under prompts/solver/.
|
|
33
|
+
prompt: [builtin:minimal, builtin:standard]
|
|
34
|
+
grader: [mock_judge]
|
|
35
|
+
rubric: [builtin:standard]
|
|
36
|
+
replications: 2
|
|
37
|
+
|
|
38
|
+
graders:
|
|
39
|
+
mock_judge:
|
|
40
|
+
model: mockllm/judge # step 4: swap in a real judge model id
|
|
41
|
+
max_tokens: 512
|
|
42
|
+
|
|
43
|
+
crossing: full
|
|
44
|
+
|
|
45
|
+
budget:
|
|
46
|
+
policy: dev # dev: first 2 items only; switch to full-interactive / full-batch when ready
|
|
47
|
+
confirm_above_usd: 5
|
|
48
|
+
batch: auto
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
You are grading a candidate solution against a reference solution and a grading
|
|
2
|
+
scheme.
|
|
3
|
+
|
|
4
|
+
Problem:
|
|
5
|
+
{input}
|
|
6
|
+
|
|
7
|
+
Grading scheme:
|
|
8
|
+
{grading_scheme}
|
|
9
|
+
|
|
10
|
+
Reference solution:
|
|
11
|
+
{target}
|
|
12
|
+
|
|
13
|
+
Candidate solution:
|
|
14
|
+
{solution}
|
|
15
|
+
|
|
16
|
+
Evaluate the candidate solution against the grading scheme. Award a numeric
|
|
17
|
+
score according to the scheme.
|
itemeval/_config.py
ADDED
|
@@ -0,0 +1,252 @@
|
|
|
1
|
+
"""Experiment config schema and YAML loader.
|
|
2
|
+
|
|
3
|
+
YAML *shape* validation happens at load; *reference* resolution (template
|
|
4
|
+
files exist, grader names defined) is deferred to prepare/grid-expansion so
|
|
5
|
+
the README sketch validates as-is.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Any, Literal
|
|
10
|
+
|
|
11
|
+
import yaml
|
|
12
|
+
from pydantic import (
|
|
13
|
+
BaseModel,
|
|
14
|
+
ConfigDict,
|
|
15
|
+
Field,
|
|
16
|
+
PrivateAttr,
|
|
17
|
+
ValidationError,
|
|
18
|
+
field_validator,
|
|
19
|
+
model_validator,
|
|
20
|
+
)
|
|
21
|
+
|
|
22
|
+
from itemeval._errors import ConfigError
|
|
23
|
+
from itemeval._util import sha256_hex
|
|
24
|
+
|
|
25
|
+
NAME_RE = r"^[A-Za-z0-9][A-Za-z0-9._-]*$"
|
|
26
|
+
STUDY_RE = r"^[a-z0-9][a-z0-9_-]{0,63}$"
|
|
27
|
+
|
|
28
|
+
ReasoningEffort = Literal["none", "minimal", "low", "medium", "high", "xhigh", "max"]
|
|
29
|
+
|
|
30
|
+
# What to do with a *completed* generation that produced no gradable text
|
|
31
|
+
# (empty/blank `solution`, no API error — e.g. a reasoning model whose token
|
|
32
|
+
# budget was spent entirely on hidden reasoning). Distinct from API errors
|
|
33
|
+
# (always re-attempted) and parse failures (always final).
|
|
34
|
+
# skip — exclude from grading, but report the count + stop reasons (default)
|
|
35
|
+
# rerun — also treat as not-done in generate, so a subsequent `generate`
|
|
36
|
+
# re-attempts them (raise max_tokens / lower reasoning effort first;
|
|
37
|
+
# an identical request will hit the response cache and stay empty)
|
|
38
|
+
# grade — send to the judge as-is (an empty answer, typically scored low)
|
|
39
|
+
EmptySolutionPolicy = Literal["skip", "rerun", "grade"]
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class DatasetSpec(BaseModel):
    # One entry of `benchmark.datasets`. Unknown keys are rejected (extra="forbid").
    model_config = ConfigDict(extra="forbid")

    # Dataset identifier (required).
    id: str
    revision: str | None = None  # branch/tag/SHA; None -> lock file / resolve at first run
    split: str = "train"
    name: str | None = None  # HF config name
    # Optional row cap; must be >= 1 when given.
    limit: int | None = Field(default=None, ge=1)  # first N rows, no shuffle
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
class MappingSpec(BaseModel):
    # Maps dataset record columns onto item fields. Only `input` is required;
    # unknown keys are rejected (extra="forbid").
    model_config = ConfigDict(extra="forbid")

    input: str
    target: str | None = None
    id: str | None = None  # record column -> Item.id (else row index)
    grading_scheme: str | None = None
    # Additional column names; defaults to an empty list.
    metadata: list[str] = Field(default_factory=list)
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
class BenchmarkConfig(BaseModel):
    # The `benchmark:` section: which adapter loads the data, which datasets to
    # load, and how their columns map to item fields. Unknown keys are rejected.
    model_config = ConfigDict(extra="forbid")

    # "hf" is the only adapter value this schema accepts.
    adapter: Literal["hf"]
    # At least one dataset must be listed.
    datasets: list[DatasetSpec] = Field(min_length=1)
    mapping: MappingSpec
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
class SolversConfig(BaseModel):
    # The `solvers:` section: the solver model ids plus sampling settings.
    # Unknown keys are rejected.
    model_config = ConfigDict(extra="forbid")

    # At least one model id; duplicates are rejected by the validator below.
    models: list[str] = Field(min_length=1)
    temperature: float | None = Field(default=None, ge=0.0, le=2.0)
    max_tokens: int | None = Field(default=None, ge=1)
    top_p: float | None = Field(default=None, gt=0.0, le=1.0)
    seed: int | None = None  # recorded; only some providers honor it
    on_empty: EmptySolutionPolicy = "skip"  # handling of empty (no-error) solutions

    @field_validator("models")
    @classmethod
    def _unique_models(cls, v: list[str]) -> list[str]:
        """Reject a `models` list that names the same model id more than once."""
        seen = set()
        for model_id in v:
            if model_id in seen:
                raise ValueError("solvers.models must be unique")
            seen.add(model_id)
        return v
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
class ModelConfigFacet(BaseModel):
    """One model-config grid cell (sampling overrides + thinking/reasoning toggle)."""

    # Unknown keys are rejected.
    model_config = ConfigDict(extra="forbid")

    # Cell name; must match NAME_RE (alphanumeric start, then alphanumerics, '.', '_' or '-').
    name: str = Field(pattern=NAME_RE)
    # Optional sampling settings, each validated against its range when given.
    temperature: float | None = Field(default=None, ge=0.0, le=2.0)
    max_tokens: int | None = Field(default=None, ge=1)
    top_p: float | None = Field(default=None, gt=0.0, le=1.0)
    reasoning_effort: ReasoningEffort | None = None  # OpenAI-style
    reasoning_tokens: int | None = Field(default=None, ge=1)  # Anthropic extended thinking
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
class FacetsConfig(BaseModel):
    # The `facets:` section: the lists of prompts, graders, rubrics and model
    # configs, plus the replication count.
    #
    # `model_config` is a reserved attribute on pydantic models, so the facet
    # list lives in `model_config_facet` and is exposed to YAML through the
    # alias "model_config".
    model_config = ConfigDict(extra="forbid", populate_by_name=True)

    prompt: list[str] = Field(default_factory=lambda: ["builtin:standard"], min_length=1)
    grader: list[str] = Field(default_factory=list)
    rubric: list[str] = Field(default_factory=lambda: ["builtin:standard"], min_length=1)
    scorer: Literal["exact_match", "multiple_choice", "numeric"] | None = None
    replications: int = Field(default=1, ge=1)
    model_config_facet: list[ModelConfigFacet] = Field(
        default_factory=lambda: [ModelConfigFacet(name="default")],
        alias="model_config",
        min_length=1,
    )

    @model_validator(mode="after")
    def _check(self) -> "FacetsConfig":
        """Cross-field checks: a grading route exists and no facet list repeats an entry."""
        has_grading_route = bool(self.grader) or self.scorer is not None
        if not has_grading_route:
            raise ValueError("facets must declare at least one of grader / scorer")

        distinct_names = {cell.name for cell in self.model_config_facet}
        if len(distinct_names) != len(self.model_config_facet):
            raise ValueError("facets.model_config names must be unique")

        for facet in ("prompt", "grader", "rubric"):
            entries = getattr(self, facet)
            if len(entries) != len(set(entries)):
                raise ValueError(f"facets.{facet} entries must be unique")
        return self
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
class GraderSpec(BaseModel):
    """Judge model spec. Temperature is pinned to 0.0 in v0.1 (ROADMAP M3)."""

    # Unknown keys are rejected.
    model_config = ConfigDict(extra="forbid")

    # Judge model id (required).
    model: str
    # Defaults to 2048; must be >= 1 when given.
    max_tokens: int | None = Field(default=2048, ge=1)
    reasoning_effort: ReasoningEffort | None = None
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
class BudgetConfig(BaseModel):
    # The `budget:` section. Every field has a default, so the section may be
    # omitted entirely; unknown keys are rejected.
    model_config = ConfigDict(extra="forbid")

    policy: Literal["dev", "full-interactive", "full-batch"] = "dev"
    # Non-negative USD amount; defaults to 5.0.
    confirm_above_usd: float = Field(default=5.0, ge=0.0)
    # Accepts a bool, an int, or the literal "auto" (the default).
    batch: bool | int | Literal["auto"] = "auto"
    max_usd: float | None = Field(default=None, gt=0.0)  # hard cap, never overridable
    dev_items: int = Field(default=2, ge=1)  # dev preset: first N items
    dev_replications: int | None = Field(default=None, ge=1)  # None = keep config reps
    pricing_path: str | None = None
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
class ExperimentConfig(BaseModel):
    # Root of the study config: the schema the whole YAML file validates against.
    # Unknown top-level keys are rejected.
    model_config = ConfigDict(extra="forbid")

    study: str = Field(pattern=STUDY_RE)
    # outputs: resolved relative to work_dir (CWD)
    output_dir: str = "studies"
    # solver templates: <prompts_dir>/solver/<name>.md (relative to config_dir)
    prompts_dir: str = "prompts"
    # rubric templates: <rubrics_dir>/<name>.md (relative to config_dir)
    rubrics_dir: str = "rubrics"
    # inspect local response cache, both stages
    cache: bool = True
    benchmark: BenchmarkConfig
    solvers: SolversConfig
    facets: FacetsConfig
    graders: dict[str, GraderSpec] = Field(default_factory=dict)
    crossing: Literal["full"] = "full"
    budget: BudgetConfig = Field(default_factory=BudgetConfig)

    # Two resolution anchors (see docs/wiki/Configuration.md):
    #   config_dir — directory of the loaded YAML; anchors INPUTS (prompts/rubrics/pricing).
    #                None for in-memory configs, whose inputs then anchor to work_dir.
    #   work_dir   — defaults to CWD; anchors OUTPUTS (the study directory). Never the package.
    _config_dir: Path | None = PrivateAttr(default=None)
    _work_dir: Path = PrivateAttr(default_factory=Path.cwd)
    _config_path: Path | None = PrivateAttr(default=None)
    _config_sha256: str | None = PrivateAttr(default=None)

    @property
    def config_dir(self) -> Path | None:
        """Directory of the loaded YAML file, or None for an in-memory config."""
        return self._config_dir

    @property
    def work_dir(self) -> Path:
        """Directory that relative output paths are resolved against."""
        return self._work_dir

    @property
    def config_path(self) -> Path | None:
        """Path of the loaded YAML file, or None for an in-memory config."""
        return self._config_path

    @property
    def config_sha256(self) -> str | None:
        """Stored SHA-256 hex digest of the config file, or None if never set."""
        return self._config_sha256

    @property
    def _input_base(self) -> Path:
        """Anchor for input dirs: config_dir when known, otherwise work_dir."""
        if self._config_dir is None:
            return self._work_dir
        return self._config_dir

    def resolve_input_dir(self, rel: str) -> Path:
        """Resolve an input dir (prompts/rubrics/pricing) under config_dir.

        `~` is expanded first; an absolute path is used as given, a relative
        one is joined onto the input anchor. The result is always resolved.
        """
        candidate = Path(rel).expanduser()
        if not candidate.is_absolute():
            candidate = self._input_base / candidate
        return candidate.resolve()

    @property
    def study_dir(self) -> Path:
        """Output directory of this study: `<output_dir>/<study>`, resolved.

        A relative `output_dir` is anchored to work_dir; an absolute one is
        used as given.
        """
        root = Path(self.output_dir).expanduser()
        if not root.is_absolute():
            root = self._work_dir / root
        return (root / self.study).resolve()

    def grader_spec(self, name: str) -> GraderSpec:
        """Resolve a facets.grader entry to its judge spec.

        A name defined under `graders:` wins; otherwise a name containing "/"
        is treated as a bare model id and wrapped in a default spec.

        Raises:
            ConfigError: if the name is neither defined nor a model id.
        """
        declared = self.graders.get(name)
        if declared is not None:
            return declared
        if "/" not in name:
            raise ConfigError(
                f"grader '{name}' is not defined under graders: and is not a model id"
            )
        # bare model id used directly as a grader
        return GraderSpec(model=name)
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
def load_config(path: "str | Path", *, work_dir: "str | Path | None" = None) -> ExperimentConfig:
    """Load and validate an experiment config YAML file.

    Inputs (prompts/rubrics) anchor to the config file's directory; outputs (the
    study dir) anchor to `work_dir`, defaulting to the current working directory.

    Args:
        path: Location of the YAML file; `~` is expanded and the path resolved.
        work_dir: Optional output anchor; `~` is expanded and the path resolved.

    Returns:
        The validated config, with its config path, config directory, work
        directory and the SHA-256 of the raw file bytes recorded on it.

    Raises:
        ConfigError: if the file is missing or unreadable, is not valid YAML,
            does not have a mapping at its root, or fails schema validation.
    """
    p = Path(path).expanduser().resolve()
    if not p.is_file():
        raise ConfigError(f"config file not found: {p}")
    try:
        raw = p.read_bytes()
    except OSError as e:
        # The file can still be unreadable (permissions) or vanish after the
        # is_file() check; report that as a ConfigError like every other failure.
        raise ConfigError(f"cannot read config file {p}: {e}") from e
    try:
        data = yaml.safe_load(raw)
    except yaml.YAMLError as e:
        raise ConfigError(f"invalid YAML in {p}: {e}") from e
    if not isinstance(data, dict):
        raise ConfigError(f"config root must be a YAML mapping: {p}")
    try:
        cfg = ExperimentConfig.model_validate(data)
    except ValidationError as e:
        raise ConfigError(f"invalid config {p}:\n{e}") from e
    cfg._config_dir = p.parent
    cfg._work_dir = Path(work_dir).expanduser().resolve() if work_dir is not None else Path.cwd()
    cfg._config_path = p
    # Hash the exact bytes that were parsed.
    cfg._config_sha256 = sha256_hex(raw)
    return cfg
|
|
248
|
+
|
|
249
|
+
|
|
250
|
+
def config_to_jsonable(cfg: ExperimentConfig) -> dict[str, Any]:
    """Dump the config to a JSON-ready dict keyed by YAML names (for manifests).

    Dumping by alias emits aliased fields under their YAML key, e.g.
    `model_config` instead of `model_config_facet`.
    """
    dumped = cfg.model_dump(by_alias=True, mode="json")
    return dumped
|
itemeval/_errors.py
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
"""Exception hierarchy for itemeval."""
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class ItemevalError(Exception):
    """Root of the itemeval exception hierarchy."""


class ConfigError(ItemevalError):
    """A config failed YAML shape/validation or holds a bad reference."""


class AdapterError(ItemevalError):
    """A dataset could not be loaded or its fields could not be mapped."""


class TemplateError(ItemevalError):
    """A template file is missing or lacks a required placeholder."""


class StoreError(ItemevalError):
    """A Parquet schema or IO problem."""


class BudgetError(ItemevalError):
    """A pricing refresh or estimator failure."""
|
itemeval/_item.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
"""Canonical benchmark item model."""
|
|
2
|
+
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
from pydantic import BaseModel, ConfigDict, Field, field_validator
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class Item(BaseModel):
    """Canonical benchmark item: what adapters produce and both stages consume."""

    # Instances are immutable and reject unknown fields.
    model_config = ConfigDict(frozen=True, extra="forbid")

    id: str
    input: str
    target: str = ""
    grading_scheme: str | None = None
    metadata: dict[str, Any] = Field(default_factory=dict)

    @field_validator("id", mode="before")
    @classmethod
    def _coerce_id(cls, v: Any) -> str:
        """Coerce a non-string id (e.g. an integer) to `str`.

        A missing id is rejected rather than stringified: `str(None)` would
        produce the literal "None", and every row with a null id would then
        silently share that one id.
        """
        if v is None:
            raise ValueError("Item.id must not be None")
        return str(v)

    @field_validator("input")
    @classmethod
    def _non_empty_input(cls, v: str) -> str:
        """Reject an input that is empty or whitespace-only."""
        if not v or not v.strip():
            raise ValueError("Item.input must be non-empty")
        return v
|
itemeval/_manifest.py
ADDED
|
@@ -0,0 +1,196 @@
|
|
|
1
|
+
"""Run manifests: full reproducibility record, one JSON per generate/grade run."""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import platform
|
|
5
|
+
from importlib.metadata import PackageNotFoundError, version
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import TYPE_CHECKING, Any, Literal
|
|
8
|
+
|
|
9
|
+
from pydantic import BaseModel, ConfigDict
|
|
10
|
+
|
|
11
|
+
from itemeval._config import config_to_jsonable
|
|
12
|
+
from itemeval._util import atomic_write_bytes, canonical_json, sha256_hex, utc_now_iso
|
|
13
|
+
|
|
14
|
+
if TYPE_CHECKING:
|
|
15
|
+
from itemeval._prepare import PreparedStudy
|
|
16
|
+
from itemeval.store._layout import StudyPaths
|
|
17
|
+
|
|
18
|
+
MANIFEST_VERSION = 1
|
|
19
|
+
_TRACKED_PACKAGES = ("inspect-ai", "pandas", "pyarrow", "pydantic", "pyyaml", "datasets")
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class DatasetManifest(BaseModel):
    # Per-dataset record stored in a run manifest. Unknown keys are rejected.
    model_config = ConfigDict(extra="forbid")

    id: str
    adapter: str
    split: str
    name: str | None = None
    # The revision as requested vs. the one actually used.
    revision_requested: str | None
    revision_resolved: str
    n_items: int
    items_hash: str  # 12 hex over (id, input-hash) pairs in loaded order
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class TemplateManifest(BaseModel):
    # Record of one solver or rubric template used by a run. Unknown keys are rejected.
    model_config = ConfigDict(extra="forbid")

    name: str  # the reference as written, e.g. "standard" or "builtin:standard"
    source: str  # "local" | "builtin"
    path: str  # local path (relative to config_dir where possible), or "builtin:<subdir>/<name>.md"
    sha256: str
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
class ConditionManifest(BaseModel):
    # One grid condition as stored in a manifest: its id, slug and payload.
    # Unknown keys are rejected.
    model_config = ConfigDict(extra="forbid")

    id: str
    slug: str
    payload: dict
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
class Manifest(BaseModel):
    # Reproducibility record for a single generate or grade run; serialized to
    # one JSON file per run. Unknown keys are rejected.
    model_config = ConfigDict(extra="forbid")

    manifest_version: int = MANIFEST_VERSION
    run_id: str
    stage: Literal["generate", "grade"]
    study: str
    created_at: str
    itemeval_version: str
    python_version: str
    # Package name -> version string.
    packages: dict[str, str]
    config_path: str
    config_sha256: str
    config: dict
    datasets: list[DatasetManifest]
    solver_templates: list[TemplateManifest]
    rubric_templates: list[TemplateManifest]
    models: list[str]
    graders: dict[str, dict]
    sampling_requested: dict
    sampling_effective: dict[str, Any] | None = None  # backfilled post-run, per condition
    # backfilled post-run, per condition: {provider, base_url, served_model} — the
    # endpoint/account/version that actually answered (which dashboard billed it).
    endpoints_effective: dict[str, Any] | None = None
    seed: int | None
    policy: str
    # Replication count as configured vs. as actually planned for the run.
    replications_requested: int
    replications_effective: int
    items_limit: int | None
    batch: bool | int | None
    grid_generate: list[ConditionManifest]
    grid_grade: list[ConditionManifest]
    conditions_run: list[str]
    estimate_usd: float | None
    cache: bool
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def _pkg_version(name: str) -> str:
|
|
90
|
+
try:
|
|
91
|
+
return version(name)
|
|
92
|
+
except PackageNotFoundError:
|
|
93
|
+
return "unknown"
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def _items_hash(items) -> str:
    """Fingerprint a sequence of items as 12 hex characters.

    Each item contributes its id and the first 12 hex characters of the SHA-256
    of its UTF-8 input, in iteration order; the canonical JSON of those pairs
    is hashed again and truncated to 12 characters.
    """
    pairs = []
    for item in items:
        input_digest = sha256_hex(item.input.encode("utf-8"))
        pairs.append([item.id, input_digest[:12]])
    serialized = canonical_json(pairs)
    return sha256_hex(serialized.encode("utf-8"))[:12]
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def _rel_path(path: str, base: Path) -> str:
|
|
102
|
+
try:
|
|
103
|
+
return str(Path(path).relative_to(base))
|
|
104
|
+
except ValueError:
|
|
105
|
+
return path
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def _template_manifest(t, base: Path) -> "TemplateManifest":
    """Build the manifest record for one resolved template.

    Built-in templates keep their package-relative id (machine-independent);
    local templates are recorded relative to *base* where possible.
    """
    if t.source == "builtin":
        recorded_path = t.path
    else:
        recorded_path = _rel_path(t.path, base)
    return TemplateManifest(
        name=t.name,
        source=t.source,
        path=recorded_path,
        sha256=t.sha256,
    )
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def build_manifest(
    prep: "PreparedStudy",
    stage: str,
    run_id: str,
    conditions_run: "list[str]",
    estimate_usd: "float | None",
) -> Manifest:
    """Assemble the reproducibility manifest for one generate/grade run.

    Args:
        prep: Prepared study providing the config, datasets, templates, plan and grid.
        stage: Stage name; the manifest accepts "generate" or "grade".
        run_id: Identifier of this run.
        conditions_run: Ids of the conditions executed in this run.
        estimate_usd: Projected cost recorded with the run, if any.

    Returns:
        The populated manifest. The `sampling_effective` and
        `endpoints_effective` fields are left at their defaults.
    """
    cfg = prep.config
    plan = prep.plan
    grid = prep.grid
    # Anchor used when recording local template paths.
    base = cfg.config_dir or cfg.work_dir

    # Only graders referenced by the facets are recorded.
    used_graders = {}
    for grader_name in cfg.facets.grader:
        used_graders[grader_name] = cfg.grader_spec(grader_name).model_dump(mode="json")

    # Solver settings without the model list, which is stored under `models`.
    sampling = cfg.solvers.model_dump(mode="json")
    sampling.pop("models", None)

    if cfg.config_path:
        config_path = str(cfg.config_path)
    else:
        config_path = "(in-memory)"

    packages = {}
    for pkg in _TRACKED_PACKAGES:
        packages[pkg] = _pkg_version(pkg)

    datasets = []
    for ds in prep.datasets:
        datasets.append(
            DatasetManifest(
                id=ds.dataset_id,
                adapter=ds.adapter,
                split=ds.split,
                name=ds.name,
                revision_requested=ds.revision_requested,
                revision_resolved=ds.revision,
                n_items=len(ds.items),
                items_hash=_items_hash(ds.items),
            )
        )

    solver_templates = [_template_manifest(t, base) for t in prep.solver_templates.values()]
    rubric_templates = [_template_manifest(t, base) for t in prep.rubric_templates.values()]

    grid_generate = [
        ConditionManifest(id=cond.id, slug=cond.slug, payload=cond.payload)
        for cond in grid.generate
    ]
    grid_grade = [
        ConditionManifest(id=cond.id, slug=cond.slug, payload=cond.payload)
        for cond in grid.grade
    ]

    return Manifest(
        run_id=run_id,
        stage=stage,  # type: ignore[arg-type]
        study=cfg.study,
        created_at=utc_now_iso(),
        itemeval_version=_pkg_version("itemeval"),
        python_version=platform.python_version(),
        packages=packages,
        config_path=config_path,
        config_sha256=cfg.config_sha256 or "",
        config=config_to_jsonable(cfg),
        datasets=datasets,
        solver_templates=solver_templates,
        rubric_templates=rubric_templates,
        models=list(cfg.solvers.models),
        graders=used_graders,
        sampling_requested=sampling,
        seed=cfg.solvers.seed,
        policy=plan.policy,
        replications_requested=cfg.facets.replications,
        replications_effective=plan.replications,
        items_limit=plan.items_limit,
        batch=plan.batch,
        grid_generate=grid_generate,
        grid_grade=grid_grade,
        conditions_run=conditions_run,
        estimate_usd=estimate_usd,
        cache=cfg.cache,
    )
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
def write_manifest(manifest: Manifest, paths: "StudyPaths") -> Path:
    """Write *manifest* to `<manifests_dir>/<run_id>.json` and return that path.

    The file holds indented, non-ASCII-preserving JSON encoded as UTF-8 with a
    trailing newline.
    """
    target = paths.manifests_dir / f"{manifest.run_id}.json"
    document = manifest.model_dump(mode="json")
    text = json.dumps(document, indent=2, ensure_ascii=False) + "\n"
    atomic_write_bytes(target, text.encode("utf-8"))
    return target
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
def finalize_manifest(
    manifest_path: Path,
    sampling_effective: "dict[str, Any] | None" = None,
    endpoints_effective: "dict[str, Any] | None" = None,
) -> None:
    """Backfill per-condition effective values after the run completes.

    Args:
        manifest_path: Manifest JSON file to update in place.
        sampling_effective: Effective sampling params per condition (generate),
            or None to leave that key untouched.
        endpoints_effective: Resolved endpoint per condition, or None to leave
            that key untouched.

    When both values are None there is nothing to backfill, so the manifest is
    neither read nor rewritten.
    """
    updates = {
        "sampling_effective": sampling_effective,
        "endpoints_effective": endpoints_effective,
    }
    updates = {key: value for key, value in updates.items() if value is not None}
    if not updates:
        # Nothing to record: skip the read/re-serialise/rewrite round trip.
        return

    data = json.loads(manifest_path.read_text(encoding="utf-8"))
    data.update(updates)
    payload = json.dumps(data, indent=2, ensure_ascii=False)
    atomic_write_bytes(manifest_path, (payload + "\n").encode("utf-8"))
|