itemeval 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. itemeval/__init__.py +60 -0
  2. itemeval/_builtin/config.yaml +48 -0
  3. itemeval/_builtin/prompts/solver/minimal.md +4 -0
  4. itemeval/_builtin/prompts/solver/standard.md +6 -0
  5. itemeval/_builtin/rubrics/standard.md +17 -0
  6. itemeval/_config.py +252 -0
  7. itemeval/_errors.py +25 -0
  8. itemeval/_item.py +29 -0
  9. itemeval/_manifest.py +196 -0
  10. itemeval/_mockmodels.py +68 -0
  11. itemeval/_prepare.py +85 -0
  12. itemeval/_status.py +165 -0
  13. itemeval/_templates.py +161 -0
  14. itemeval/_util.py +46 -0
  15. itemeval/adapters/__init__.py +1 -0
  16. itemeval/adapters/_base.py +98 -0
  17. itemeval/adapters/_hf.py +85 -0
  18. itemeval/budget/__init__.py +1 -0
  19. itemeval/budget/_estimator.py +195 -0
  20. itemeval/budget/_gate.py +69 -0
  21. itemeval/budget/_policies.py +40 -0
  22. itemeval/budget/_pricing.py +131 -0
  23. itemeval/budget/pricing_seed.json +13 -0
  24. itemeval/cli.py +407 -0
  25. itemeval/design/__init__.py +1 -0
  26. itemeval/design/_grid.py +178 -0
  27. itemeval/design/_ids.py +25 -0
  28. itemeval/generate/__init__.py +1 -0
  29. itemeval/generate/_params.py +50 -0
  30. itemeval/generate/_run.py +397 -0
  31. itemeval/generate/_task.py +61 -0
  32. itemeval/grade/__init__.py +1 -0
  33. itemeval/grade/_judge.py +96 -0
  34. itemeval/grade/_parse.py +94 -0
  35. itemeval/grade/_run.py +333 -0
  36. itemeval/grade/_verifiable.py +84 -0
  37. itemeval/py.typed +0 -0
  38. itemeval/store/__init__.py +1 -0
  39. itemeval/store/_base.py +59 -0
  40. itemeval/store/_export.py +183 -0
  41. itemeval/store/_gradings.py +87 -0
  42. itemeval/store/_items.py +47 -0
  43. itemeval/store/_layout.py +47 -0
  44. itemeval/store/_ledger.py +37 -0
  45. itemeval/store/_logs.py +38 -0
  46. itemeval/store/_solutions.py +93 -0
  47. itemeval-0.1.0.dist-info/METADATA +279 -0
  48. itemeval-0.1.0.dist-info/RECORD +51 -0
  49. itemeval-0.1.0.dist-info/WHEEL +4 -0
  50. itemeval-0.1.0.dist-info/entry_points.txt +2 -0
  51. itemeval-0.1.0.dist-info/licenses/LICENSE +21 -0
itemeval/__init__.py ADDED
@@ -0,0 +1,60 @@
1
+ """itemeval: item-level LLM evaluation over any API, with built-in budget control.
2
+
3
+ Two equivalent ways to drive a study:
4
+
5
+ CLI itemeval estimate|generate|grade|export|status CONFIG
6
+ Python cfg = load_config("configs/my_study.yaml")
7
+ prep = prepare_study(cfg)
8
+ estimate_study(prep) # projected $ per stage
9
+ run_generate(prep) # stage 1 -> solutions store
10
+ run_grade(prep) # stage 2 -> gradings store
11
+ export_study(cfg) # long-format parquet + CSV + ledger
12
+ build_status(cfg, prep) # grid completion report
13
+
14
+ The Python pipeline functions do NOT apply the budget confirmation gate (a
15
+ CLI feature) — compare `estimate_study(...)` totals against your own
16
+ threshold before paid runs.
17
+ """
18
+
19
+ from importlib import import_module
20
+ from importlib.metadata import version
21
+
22
+ from itemeval._config import ExperimentConfig, load_config
23
+ from itemeval._item import Item
24
+
25
# Resolve the installed distribution's version. When the package metadata is
# absent (e.g. a source checkout that was never installed) fall back to
# "unknown" instead of letting `import itemeval` fail with
# PackageNotFoundError.
from importlib.metadata import PackageNotFoundError

try:
    __version__ = version("itemeval")
except PackageNotFoundError:
    __version__ = "unknown"
26
+
27
+ # Pipeline functions resolve lazily (PEP 562) so `import itemeval` stays
28
+ # light: eager imports here would pull inspect_ai/pandas into every CLI start.
29
+ _LAZY = {
30
+ "prepare_study": ("itemeval._prepare", "prepare_study"),
31
+ "estimate_study": ("itemeval.budget._estimator", "estimate_study"),
32
+ "run_generate": ("itemeval.generate._run", "run_generate"),
33
+ "run_grade": ("itemeval.grade._run", "run_grade"),
34
+ "export_study": ("itemeval.store._export", "export_study"),
35
+ "build_status": ("itemeval._status", "build_status"),
36
+ }
37
+
38
+ __all__ = [
39
+ "ExperimentConfig",
40
+ "Item",
41
+ "__version__",
42
+ "build_status",
43
+ "estimate_study",
44
+ "export_study",
45
+ "load_config",
46
+ "prepare_study",
47
+ "run_generate",
48
+ "run_grade",
49
+ ]
50
+
51
+
52
def __getattr__(name: str):
    """Resolve a lazily exported pipeline function on attribute access (PEP 562).

    Names registered in ``_LAZY`` map to ``(module, attribute)`` pairs; the
    module is imported on demand and the attribute returned. Any other name
    raises ``AttributeError``.
    """
    target = _LAZY.get(name)
    if target is None:
        raise AttributeError(f"module 'itemeval' has no attribute {name!r}")
    module_name, attr = target
    module = import_module(module_name)
    return getattr(module, attr)
57
+
58
+
59
def __dir__() -> "list[str]":
    """Return the names in ``__all__`` in sorted order for ``dir(itemeval)``."""
    names = list(__all__)
    names.sort()
    return names
@@ -0,0 +1,48 @@
1
+ # itemeval study config — generated by `itemeval init`.
2
+ # Runs at no cost out of the box on the mock provider; follow the numbered "step N" comments below to adapt it.
3
+ study: my_study
4
+
5
+ # Inputs (prompts/rubrics) resolve relative to THIS config file's directory.
6
+ # Outputs (the studies/ folder) are written under your current working directory.
7
+ # Both anchors are overridable — see docs/wiki/Configuration.md.
8
+
9
+ benchmark:
10
+ adapter: hf
11
+ datasets:
12
+ # step 1: point this at your HuggingFace dataset and pin a revision.
13
+ - id: MathArena/usamo_2025
14
+ revision: 0a2c60f2249e07b8ee76c942bca4f5f87aa959df
15
+ split: train
16
+ mapping:
17
+ # step 1: adjust these to your dataset's column names.
18
+ id: problem_idx
19
+ input: problem
20
+ target: sample_solution
21
+ grading_scheme: grading_scheme
22
+ metadata: [points]
23
+
24
+ solvers:
25
+ # step 4: replace mockllm/* with real inspect model ids (openai/..., anthropic/..., openrouter/...).
26
+ models: [mockllm/solver-a, mockllm/solver-b, mockllm/solver-c]
27
+ temperature: 0.7
28
+ max_tokens: 1024
29
+
30
+ facets:
31
+ # `builtin:` templates ship with itemeval. Run `itemeval init --with-templates`
32
+ # to copy them locally as editable starters, or write your own under prompts/solver/.
33
+ prompt: [builtin:minimal, builtin:standard]
34
+ grader: [mock_judge]
35
+ rubric: [builtin:standard]
36
+ replications: 2
37
+
38
+ graders:
39
+ mock_judge:
40
+ model: mockllm/judge # step 4: swap in a real judge model id
41
+ max_tokens: 512
42
+
43
+ crossing: full
44
+
45
+ budget:
46
+ policy: dev # dev: first 2 items only; switch to full-interactive / full-batch when ready
47
+ confirm_above_usd: 5
48
+ batch: auto
@@ -0,0 +1,4 @@
1
+ Solve the following problem. Show your reasoning, then state your final answer
2
+ on a line starting with "ANSWER:".
3
+
4
+ {input}
@@ -0,0 +1,6 @@
1
+ You are a careful, expert problem solver. Read the problem below and work
2
+ through it with a complete, rigorous argument. Show all of your reasoning. End
3
+ with a line starting with "ANSWER:" giving your final answer.
4
+
5
+ Problem:
6
+ {input}
@@ -0,0 +1,17 @@
1
+ You are grading a candidate solution against a reference solution and a grading
2
+ scheme.
3
+
4
+ Problem:
5
+ {input}
6
+
7
+ Grading scheme:
8
+ {grading_scheme}
9
+
10
+ Reference solution:
11
+ {target}
12
+
13
+ Candidate solution:
14
+ {solution}
15
+
16
+ Evaluate the candidate solution against the grading scheme. Award a numeric
17
+ score according to the scheme.
itemeval/_config.py ADDED
@@ -0,0 +1,252 @@
1
+ """Experiment config schema and YAML loader.
2
+
3
+ YAML *shape* validation happens at load; *reference* resolution (template
4
+ files exist, grader names defined) is deferred to prepare/grid-expansion so
5
+ the README sketch validates as-is.
6
+ """
7
+
8
+ from pathlib import Path
9
+ from typing import Any, Literal
10
+
11
+ import yaml
12
+ from pydantic import (
13
+ BaseModel,
14
+ ConfigDict,
15
+ Field,
16
+ PrivateAttr,
17
+ ValidationError,
18
+ field_validator,
19
+ model_validator,
20
+ )
21
+
22
+ from itemeval._errors import ConfigError
23
+ from itemeval._util import sha256_hex
24
+
25
+ NAME_RE = r"^[A-Za-z0-9][A-Za-z0-9._-]*$"
26
+ STUDY_RE = r"^[a-z0-9][a-z0-9_-]{0,63}$"
27
+
28
+ ReasoningEffort = Literal["none", "minimal", "low", "medium", "high", "xhigh", "max"]
29
+
30
+ # What to do with a *completed* generation that produced no gradable text
31
+ # (empty/blank `solution`, no API error — e.g. a reasoning model whose token
32
+ # budget was spent entirely on hidden reasoning). Distinct from API errors
33
+ # (always re-attempted) and parse failures (always final).
34
+ # skip — exclude from grading, but report the count + stop reasons (default)
35
+ # rerun — also treat as not-done in generate, so a subsequent `generate`
36
+ # re-attempts them (raise max_tokens / lower reasoning effort first;
37
+ # an identical request will hit the response cache and stay empty)
38
+ # grade — send to the judge as-is (an empty answer, typically scored low)
39
+ EmptySolutionPolicy = Literal["skip", "rerun", "grade"]
40
+
41
+
42
class DatasetSpec(BaseModel):
    """One dataset entry of a benchmark; unknown keys are rejected."""

    model_config = ConfigDict(extra="forbid")

    id: str
    # branch/tag/SHA; None -> lock file / resolve at first run
    revision: str | None = None
    split: str = "train"
    # HF config name
    name: str | None = None
    # first N rows, no shuffle
    limit: int | None = Field(None, ge=1)
50
+
51
+
52
class MappingSpec(BaseModel):
    """Mapping from dataset record columns to item fields; unknown keys are rejected."""

    model_config = ConfigDict(extra="forbid")

    input: str
    target: str | None = None
    # record column -> Item.id (else row index)
    id: str | None = None
    grading_scheme: str | None = None
    metadata: list[str] = Field(default_factory=list)
60
+
61
+
62
class BenchmarkConfig(BaseModel):
    """The ``benchmark:`` section: adapter, at least one dataset, and the field mapping."""

    model_config = ConfigDict(extra="forbid")

    # "hf" is the only adapter value accepted by this schema.
    adapter: Literal["hf"]
    datasets: list[DatasetSpec] = Field(min_length=1)
    mapping: MappingSpec
68
+
69
+
70
class SolversConfig(BaseModel):
    """The ``solvers:`` section: solver model ids plus shared sampling parameters."""

    model_config = ConfigDict(extra="forbid")

    models: list[str] = Field(min_length=1)
    temperature: float | None = Field(None, ge=0.0, le=2.0)
    max_tokens: int | None = Field(None, ge=1)
    top_p: float | None = Field(None, gt=0.0, le=1.0)
    # recorded; only some providers honor it
    seed: int | None = None
    # handling of empty (no-error) solutions
    on_empty: EmptySolutionPolicy = "skip"

    @field_validator("models")
    @classmethod
    def _unique_models(cls, v: list[str]) -> list[str]:
        """Reject a ``models`` list that contains the same id more than once."""
        seen: set[str] = set()
        for model_id in v:
            if model_id in seen:
                raise ValueError("solvers.models must be unique")
            seen.add(model_id)
        return v
86
+
87
+
88
class ModelConfigFacet(BaseModel):
    """One model-config grid cell (sampling overrides + thinking/reasoning toggle)."""

    model_config = ConfigDict(extra="forbid")

    # Must match NAME_RE.
    name: str = Field(pattern=NAME_RE)
    temperature: float | None = Field(None, ge=0.0, le=2.0)
    max_tokens: int | None = Field(None, ge=1)
    top_p: float | None = Field(None, gt=0.0, le=1.0)
    # OpenAI-style
    reasoning_effort: ReasoningEffort | None = None
    # Anthropic extended thinking
    reasoning_tokens: int | None = Field(None, ge=1)
99
+
100
+
101
class FacetsConfig(BaseModel):
    """The ``facets:`` section: the lists that are crossed into the study grid."""

    # `model_config` is reserved on pydantic models; the facet list is stored as
    # `model_config_facet` with alias "model_config" (the YAML key).
    model_config = ConfigDict(extra="forbid", populate_by_name=True)

    prompt: list[str] = Field(default_factory=lambda: ["builtin:standard"], min_length=1)
    grader: list[str] = Field(default_factory=list)
    rubric: list[str] = Field(default_factory=lambda: ["builtin:standard"], min_length=1)
    scorer: Literal["exact_match", "multiple_choice", "numeric"] | None = None
    replications: int = Field(1, ge=1)
    model_config_facet: list[ModelConfigFacet] = Field(
        default_factory=lambda: [ModelConfigFacet(name="default")],
        alias="model_config",
        min_length=1,
    )

    @model_validator(mode="after")
    def _check(self) -> "FacetsConfig":
        """Require a grader or scorer, and reject duplicate facet entries."""
        if self.scorer is None and not self.grader:
            raise ValueError("facets must declare at least one of grader / scorer")

        facet_names = [facet.name for facet in self.model_config_facet]
        if len(facet_names) > len(set(facet_names)):
            raise ValueError("facets.model_config names must be unique")

        for attr in ("prompt", "grader", "rubric"):
            entries = getattr(self, attr)
            if len(entries) > len(set(entries)):
                raise ValueError(f"facets.{attr} entries must be unique")
        return self
129
+
130
+
131
class GraderSpec(BaseModel):
    """Judge model spec. Temperature is pinned to 0.0 in v0.1 (ROADMAP M3)."""

    model_config = ConfigDict(extra="forbid")

    model: str
    # Defaults to 2048; an explicit None is also accepted by the type.
    max_tokens: int | None = Field(2048, ge=1)
    reasoning_effort: ReasoningEffort | None = None
139
+
140
+
141
class BudgetConfig(BaseModel):
    """The ``budget:`` section: run policy, confirmation threshold and caps."""

    model_config = ConfigDict(extra="forbid")

    policy: Literal["dev", "full-interactive", "full-batch"] = "dev"
    confirm_above_usd: float = Field(5.0, ge=0.0)
    batch: bool | int | Literal["auto"] = "auto"
    # hard cap, never overridable
    max_usd: float | None = Field(None, gt=0.0)
    # dev preset: first N items
    dev_items: int = Field(2, ge=1)
    # None = keep config reps
    dev_replications: int | None = Field(None, ge=1)
    pricing_path: str | None = None
151
+
152
+
153
class ExperimentConfig(BaseModel):
    """Root schema of a study config, plus the path anchors used to resolve it."""

    model_config = ConfigDict(extra="forbid")

    study: str = Field(pattern=STUDY_RE)
    # outputs: resolved relative to work_dir (CWD)
    output_dir: str = "studies"
    # solver templates: <prompts_dir>/solver/<name>.md (relative to config_dir)
    prompts_dir: str = "prompts"
    # rubric templates: <rubrics_dir>/<name>.md (relative to config_dir)
    rubrics_dir: str = "rubrics"
    # inspect local response cache, both stages
    cache: bool = True
    benchmark: BenchmarkConfig
    solvers: SolversConfig
    facets: FacetsConfig
    graders: dict[str, GraderSpec] = Field(default_factory=dict)
    crossing: Literal["full"] = "full"
    budget: BudgetConfig = Field(default_factory=BudgetConfig)

    # Two resolution anchors (see docs/wiki/Configuration.md):
    #   config_dir — the loaded YAML's directory; anchors INPUTS (prompts/rubrics/pricing).
    #                None for in-memory configs, which then anchor inputs to work_dir.
    #   work_dir   — defaults to CWD; anchors OUTPUTS (the study directory). Never the package.
    _config_dir: Path | None = PrivateAttr(default=None)
    _work_dir: Path = PrivateAttr(default_factory=Path.cwd)
    _config_path: Path | None = PrivateAttr(default=None)
    _config_sha256: str | None = PrivateAttr(default=None)

    @property
    def config_dir(self) -> Path | None:
        """Directory of the loaded YAML file, or None for an in-memory config."""
        return self._config_dir

    @property
    def work_dir(self) -> Path:
        """Directory that anchors outputs."""
        return self._work_dir

    @property
    def config_path(self) -> Path | None:
        """Path of the loaded YAML file, or None for an in-memory config."""
        return self._config_path

    @property
    def config_sha256(self) -> str | None:
        """Digest stored for the loaded config file, or None if never set."""
        return self._config_sha256

    @property
    def _input_base(self) -> Path:
        """Anchor for input dirs: the config's directory, or work_dir for in-memory configs."""
        if self._config_dir is None:
            return self._work_dir
        return self._config_dir

    def resolve_input_dir(self, rel: str) -> Path:
        """Resolve an input dir (prompts/rubrics/pricing) under config_dir; absolute paths pass through."""
        candidate = Path(rel).expanduser()
        if not candidate.is_absolute():
            candidate = self._input_base / candidate
        return candidate.resolve()

    @property
    def study_dir(self) -> Path:
        """Output study directory, anchored to work_dir (CWD); absolute output_dir passes through."""
        root = Path(self.output_dir).expanduser()
        if not root.is_absolute():
            root = self._work_dir / root
        return (root / self.study).resolve()

    def grader_spec(self, name: str) -> GraderSpec:
        """Resolve a facets.grader entry. Raises ConfigError if unresolvable."""
        declared = self.graders.get(name)
        if declared is not None:
            return declared
        # bare model id used directly as a grader
        if "/" in name:
            return GraderSpec(model=name)
        raise ConfigError(f"grader '{name}' is not defined under graders: and is not a model id")
221
+
222
+
223
def load_config(path: "str | Path", *, work_dir: "str | Path | None" = None) -> ExperimentConfig:
    """Load and validate an experiment config YAML file.

    Inputs (prompts/rubrics) anchor to the config file's directory; outputs (the
    study dir) anchor to `work_dir`, defaulting to the current working directory.

    Raises ConfigError when the file is missing, is not valid YAML, does not
    have a mapping at its root, or fails schema validation.
    """
    config_file = Path(path).expanduser().resolve()
    if not config_file.is_file():
        raise ConfigError(f"config file not found: {config_file}")

    # Keep the raw bytes: they are parsed here and hashed below.
    raw_bytes = config_file.read_bytes()
    try:
        parsed = yaml.safe_load(raw_bytes)
    except yaml.YAMLError as e:
        raise ConfigError(f"invalid YAML in {config_file}: {e}") from e
    if not isinstance(parsed, dict):
        raise ConfigError(f"config root must be a YAML mapping: {config_file}")

    try:
        cfg = ExperimentConfig.model_validate(parsed)
    except ValidationError as e:
        raise ConfigError(f"invalid config {config_file}:\n{e}") from e

    cfg._config_dir = config_file.parent
    if work_dir is None:
        cfg._work_dir = Path.cwd()
    else:
        cfg._work_dir = Path(work_dir).expanduser().resolve()
    cfg._config_path = config_file
    cfg._config_sha256 = sha256_hex(raw_bytes)
    return cfg
248
+
249
+
250
def config_to_jsonable(cfg: ExperimentConfig) -> dict[str, Any]:
    """Config as a JSON-ready dict using YAML key names (for manifests)."""
    # by_alias=True emits aliased fields under their alias (the YAML key).
    dumped = cfg.model_dump(mode="json", by_alias=True)
    return dumped
itemeval/_errors.py ADDED
@@ -0,0 +1,25 @@
1
+ """Exception hierarchy for itemeval."""
2
+
3
+
4
class ItemevalError(Exception):
    """Root of the itemeval exception hierarchy; the other itemeval errors derive from it."""
6
+
7
+
8
class ConfigError(ItemevalError):
    """Raised for YAML shape/validation failures and for bad references inside a config."""
10
+
11
+
12
class AdapterError(ItemevalError):
    """Raised when a dataset cannot be loaded or its fields cannot be mapped."""
14
+
15
+
16
class TemplateError(ItemevalError):
    """Raised for a missing template file or a missing required placeholder."""
18
+
19
+
20
class StoreError(ItemevalError):
    """Raised for Parquet schema or IO problems."""
22
+
23
+
24
class BudgetError(ItemevalError):
    """Raised for pricing refresh or estimator failures."""
itemeval/_item.py ADDED
@@ -0,0 +1,29 @@
1
+ """Canonical benchmark item model."""
2
+
3
+ from typing import Any
4
+
5
+ from pydantic import BaseModel, ConfigDict, Field, field_validator
6
+
7
+
8
class Item(BaseModel):
    """Canonical benchmark item: what adapters produce and both stages consume.

    Instances are frozen and reject unknown fields. ``id`` is coerced to ``str``
    before validation, so non-string ids (e.g. integers) are accepted. A missing
    id (``None``) is rejected instead of silently becoming the string ``"None"``,
    which would give every such item the same id.
    """

    model_config = ConfigDict(frozen=True, extra="forbid")

    id: str
    input: str
    target: str = ""
    grading_scheme: str | None = None
    metadata: dict[str, Any] = Field(default_factory=dict)

    @field_validator("id", mode="before")
    @classmethod
    def _coerce_id(cls, v: Any) -> str:
        """Stringify the raw id; raise ValueError when it is None."""
        if v is None:
            raise ValueError("Item.id must not be None")
        return str(v)

    @field_validator("input")
    @classmethod
    def _non_empty_input(cls, v: str) -> str:
        """Reject an empty or whitespace-only input; return it unchanged otherwise."""
        if not v or not v.strip():
            raise ValueError("Item.input must be non-empty")
        return v
itemeval/_manifest.py ADDED
@@ -0,0 +1,196 @@
1
+ """Run manifests: full reproducibility record, one JSON per generate/grade run."""
2
+
3
+ import json
4
+ import platform
5
+ from importlib.metadata import PackageNotFoundError, version
6
+ from pathlib import Path
7
+ from typing import TYPE_CHECKING, Any, Literal
8
+
9
+ from pydantic import BaseModel, ConfigDict
10
+
11
+ from itemeval._config import config_to_jsonable
12
+ from itemeval._util import atomic_write_bytes, canonical_json, sha256_hex, utc_now_iso
13
+
14
+ if TYPE_CHECKING:
15
+ from itemeval._prepare import PreparedStudy
16
+ from itemeval.store._layout import StudyPaths
17
+
18
+ MANIFEST_VERSION = 1
19
+ _TRACKED_PACKAGES = ("inspect-ai", "pandas", "pyarrow", "pydantic", "pyyaml", "datasets")
20
+
21
+
22
class DatasetManifest(BaseModel):
    """Manifest record of one loaded dataset; unknown keys are rejected."""

    model_config = ConfigDict(extra="forbid")

    id: str
    adapter: str
    split: str
    name: str | None = None
    # Required key, but its value may be None.
    revision_requested: str | None
    revision_resolved: str
    n_items: int
    # 12 hex over (id, input-hash) pairs in loaded order
    items_hash: str
33
+
34
+
35
class TemplateManifest(BaseModel):
    """Manifest record of one solver or rubric template; unknown keys are rejected."""

    model_config = ConfigDict(extra="forbid")

    # the reference as written, e.g. "standard" or "builtin:standard"
    name: str
    # "local" | "builtin"
    source: str
    # local path (relative to config_dir where possible), or "builtin:<subdir>/<name>.md"
    path: str
    sha256: str
42
+
43
+
44
class ConditionManifest(BaseModel):
    """Manifest record of one grid condition: its id, slug and payload dict."""

    model_config = ConfigDict(extra="forbid")

    id: str
    slug: str
    payload: dict
50
+
51
+
52
class Manifest(BaseModel):
    """Reproducibility record written once per generate/grade run; unknown keys are rejected."""

    model_config = ConfigDict(extra="forbid")

    # --- run identity and environment ---
    manifest_version: int = MANIFEST_VERSION
    run_id: str
    stage: Literal["generate", "grade"]
    study: str
    created_at: str
    itemeval_version: str
    python_version: str
    packages: dict[str, str]

    # --- config snapshot ---
    config_path: str
    config_sha256: str
    config: dict

    # --- inputs ---
    datasets: list[DatasetManifest]
    solver_templates: list[TemplateManifest]
    rubric_templates: list[TemplateManifest]
    models: list[str]
    graders: dict[str, dict]

    # --- sampling ---
    sampling_requested: dict
    # backfilled post-run, per condition
    sampling_effective: dict[str, Any] | None = None
    # backfilled post-run, per condition: {provider, base_url, served_model} — the
    # endpoint/account/version that actually answered (which dashboard billed it).
    endpoints_effective: dict[str, Any] | None = None
    seed: int | None

    # --- plan and grid ---
    policy: str
    replications_requested: int
    replications_effective: int
    items_limit: int | None
    batch: bool | int | None
    grid_generate: list[ConditionManifest]
    grid_grade: list[ConditionManifest]
    conditions_run: list[str]
    estimate_usd: float | None
    cache: bool
87
+
88
+
89
def _pkg_version(name: str) -> str:
    """Return the installed version of distribution *name*, or "unknown" if it is not installed."""
    try:
        found = version(name)
    except PackageNotFoundError:
        found = "unknown"
    return found
94
+
95
+
96
def _items_hash(items) -> str:
    """Return a 12-character digest over the (id, input-hash) pairs of *items*, in order."""
    pairs = []
    for item in items:
        # Each input is hashed on its own and truncated to 12 characters.
        input_digest = sha256_hex(item.input.encode("utf-8"))
        pairs.append([item.id, input_digest[:12]])
    serialized = canonical_json(pairs)
    return sha256_hex(serialized.encode("utf-8"))[:12]
99
+
100
+
101
def _rel_path(path: str, base: Path) -> str:
    """Return *path* relative to *base*, or *path* unchanged when it is not under *base*."""
    candidate = Path(path)
    try:
        relative = candidate.relative_to(base)
    except ValueError:
        # Not located under base: record the path as given.
        return path
    return str(relative)
106
+
107
+
108
def _template_manifest(t, base: Path) -> "TemplateManifest":
    """Build the manifest record for template *t*, with local paths made relative to *base*."""
    if t.source == "builtin":
        # built-in templates keep their package-relative id (machine-independent)
        recorded_path = t.path
    else:
        # local templates are recorded relative to config_dir where possible
        recorded_path = _rel_path(t.path, base)
    return TemplateManifest(
        name=t.name,
        source=t.source,
        path=recorded_path,
        sha256=t.sha256,
    )
113
+
114
+
115
def build_manifest(
    prep: "PreparedStudy",
    stage: str,
    run_id: str,
    conditions_run: "list[str]",
    estimate_usd: "float | None",
) -> Manifest:
    """Assemble the Manifest for one run from a prepared study.

    Only the graders listed under ``facets.grader`` are recorded, each
    resolved through ``cfg.grader_spec``. The requested sampling block is the
    solvers section with its ``models`` key removed.
    """
    cfg = prep.config
    base = cfg.config_dir or cfg.work_dir

    used_graders = {}
    for grader_name in cfg.facets.grader:
        used_graders[grader_name] = cfg.grader_spec(grader_name).model_dump(mode="json")

    sampling = cfg.solvers.model_dump(mode="json")
    sampling.pop("models", None)

    created_at = utc_now_iso()
    itemeval_version = _pkg_version("itemeval")
    python_version = platform.python_version()
    packages = {pkg: _pkg_version(pkg) for pkg in _TRACKED_PACKAGES}

    if cfg.config_path:
        config_path = str(cfg.config_path)
    else:
        config_path = "(in-memory)"
    config_snapshot = config_to_jsonable(cfg)

    dataset_records = []
    for ds in prep.datasets:
        dataset_records.append(
            DatasetManifest(
                id=ds.dataset_id,
                adapter=ds.adapter,
                split=ds.split,
                name=ds.name,
                revision_requested=ds.revision_requested,
                revision_resolved=ds.revision,
                n_items=len(ds.items),
                items_hash=_items_hash(ds.items),
            )
        )

    solver_records = [_template_manifest(tpl, base) for tpl in prep.solver_templates.values()]
    rubric_records = [_template_manifest(tpl, base) for tpl in prep.rubric_templates.values()]

    generate_conditions = [
        ConditionManifest(id=cond.id, slug=cond.slug, payload=cond.payload)
        for cond in prep.grid.generate
    ]
    grade_conditions = [
        ConditionManifest(id=cond.id, slug=cond.slug, payload=cond.payload)
        for cond in prep.grid.grade
    ]

    return Manifest(
        run_id=run_id,
        stage=stage,  # type: ignore[arg-type]
        study=cfg.study,
        created_at=created_at,
        itemeval_version=itemeval_version,
        python_version=python_version,
        packages=packages,
        config_path=config_path,
        config_sha256=cfg.config_sha256 or "",
        config=config_snapshot,
        datasets=dataset_records,
        solver_templates=solver_records,
        rubric_templates=rubric_records,
        models=list(cfg.solvers.models),
        graders=used_graders,
        sampling_requested=sampling,
        seed=cfg.solvers.seed,
        policy=prep.plan.policy,
        replications_requested=cfg.facets.replications,
        replications_effective=prep.plan.replications,
        items_limit=prep.plan.items_limit,
        batch=prep.plan.batch,
        grid_generate=generate_conditions,
        grid_grade=grade_conditions,
        conditions_run=conditions_run,
        estimate_usd=estimate_usd,
        cache=cfg.cache,
    )
174
+
175
+
176
def write_manifest(manifest: Manifest, paths: "StudyPaths") -> Path:
    """Write *manifest* as indented JSON to ``<manifests_dir>/<run_id>.json`` and return that path."""
    target = paths.manifests_dir / f"{manifest.run_id}.json"
    document = manifest.model_dump(mode="json")
    text = json.dumps(document, indent=2, ensure_ascii=False)
    # The file ends with a single trailing newline.
    atomic_write_bytes(target, f"{text}\n".encode("utf-8"))
    return target
181
+
182
+
183
def finalize_manifest(
    manifest_path: Path,
    sampling_effective: "dict[str, Any] | None" = None,
    endpoints_effective: "dict[str, Any] | None" = None,
) -> None:
    """Backfill per-condition effective values after the run completes:
    sampling params (generate) and/or the resolved endpoint per condition.

    An argument left as None leaves the corresponding key in the file untouched.
    """
    data = json.loads(manifest_path.read_text(encoding="utf-8"))
    backfill = {
        "sampling_effective": sampling_effective,
        "endpoints_effective": endpoints_effective,
    }
    for key, value in backfill.items():
        if value is not None:
            data[key] = value
    text = json.dumps(data, indent=2, ensure_ascii=False)
    atomic_write_bytes(manifest_path, f"{text}\n".encode("utf-8"))