PyPI - verifiers - Versions diffs - 0.1.15.dev6__tar.gz → 0.1.15.dev7__tar.gz - Mend

verifiers 0.1.15.dev6.tar.gz → 0.1.15.dev7.tar.gz

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.

Files changed (311) hide show

{verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: verifiers
-Version: 0.1.15.dev6
+Version: 0.1.15.dev7
 Summary: Verifiers: Environments for LLM Reinforcement Learning
 Project-URL: Homepage, https://github.com/primeintellect-ai/verifiers
 Project-URL: Documentation, https://github.com/primeintellect-ai/verifiers

{verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_eval_cli.py RENAMED Viewed

@@ -13,6 +13,7 @@ import verifiers.scripts.eval as vf_eval
 import verifiers.utils.eval_utils
 from verifiers.types import GenerateOutputs
 from verifiers.utils.eval_utils import load_toml_config
+from verifiers.utils.path_utils import get_eval_results_path
 from verifiers.utils.save_utils import states_to_outputs
@@ -706,6 +707,34 @@ def test_load_toml_config_multi_env():
         assert result[1]["env_id"] == "env2"
+def test_load_toml_config_duplicate_envs_accept_names():
+    """Duplicate env ids can be labeled and configured independently."""
+    with tempfile.NamedTemporaryFile(suffix=".toml", delete=False, mode="w") as f:
+        f.write(
+            '[[eval]]\nid = "env1"\nname = "env1-short"\n'
+            "[eval.args]\n"
+            'split = "short"\n\n'
+            '[[eval]]\nid = "env1"\nname = "env1-long"\n'
+            "[eval.args]\n"
+            'split = "long"\n'
+        )
+        f.flush()
+        result = load_toml_config(Path(f.name))
+    assert len(result) == 2
+    assert [config["env_id"] for config in result] == ["env1", "env1"]
+    assert [config["name"] for config in result] == ["env1-short", "env1-long"]
+    assert [config["env_args"]["split"] for config in result] == ["short", "long"]
+def test_load_toml_config_rejects_global_name():
+    with tempfile.NamedTemporaryFile(suffix=".toml", delete=False, mode="w") as f:
+        f.write('name = "shared-name"\n\n[[eval]]\nid = "env1"\n')
+        f.flush()
+        with pytest.raises(ValueError, match="Invalid global field"):
+            load_toml_config(Path(f.name))
 def test_load_toml_config_with_env_args():
     """Multiple sections with env_args field loads correctly."""
     with tempfile.NamedTemporaryFile(suffix=".toml", delete=False, mode="w") as f:
@@ -815,6 +844,28 @@ def test_cli_multi_env_via_toml_config(monkeypatch, run_cli):
     assert configs[1].env_id == "env2"
+def test_cli_duplicate_env_names_disambiguate_result_paths(monkeypatch, run_cli):
+    with tempfile.NamedTemporaryFile(suffix=".toml", delete=False, mode="w") as f:
+        f.write(
+            '[[eval]]\nid = "env1"\nname = "env1-short"\n'
+            "[eval.args]\n"
+            'split = "short"\n\n'
+            '[[eval]]\nid = "env1"\nname = "env1-long"\n'
+            "[eval.args]\n"
+            'split = "long"\n'
+        )
+        f.flush()
+        captured = run_cli(monkeypatch, {"env_id_or_config": f.name})
+    configs = captured["configs"]
+    assert len(configs) == 2
+    assert [config.env_id for config in configs] == ["env1", "env1"]
+    assert [config.name for config in configs] == ["env1-short", "env1-long"]
+    assert [config.env_args["split"] for config in configs] == ["short", "long"]
+    assert get_eval_results_path(configs[0]).parent.name.startswith("env1-short--")
+    assert get_eval_results_path(configs[1]).parent.name.startswith("env1-long--")
 def test_cli_toml_ignores_cli_args(monkeypatch, run_cli):
     """TOML config ignores CLI args, uses defaults for unspecified values."""
     with tempfile.NamedTemporaryFile(suffix=".toml", delete=False, mode="w") as f:

{verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_eval_display.py RENAMED Viewed

@@ -11,9 +11,11 @@ def make_config(
     independent_scoring: bool = False,
     endpoint_id: str | None = None,
     client_config: ClientConfig | None = None,
+    name: str | None = None,
 ) -> EvalConfig:
     return EvalConfig(
         env_id="dummy-env",
+        name=name,
         env_args={},
         env_dir_path="./environments",
         endpoint_id=endpoint_id,
@@ -82,6 +84,20 @@ def test_format_client_target_uses_single_resolved_base_url() -> None:
     assert EvalDisplay._format_client_target(config) == "http://localhost:8001/v1"
+def test_display_uses_eval_name_for_duplicate_env_labels() -> None:
+    display = EvalDisplay(
+        [
+            make_config(max_concurrent=1, name="dummy-env-short"),
+            make_config(max_concurrent=1, name="dummy-env-long"),
+        ]
+    )
+    rendered = render_plain(display._make_compact_env_row(0))
+    assert "dummy-env-short" in rendered
+    assert "dummy-env-long" not in rendered
 def render_plain(renderable) -> str:
     console = Console(width=100, record=True)
     console.print(renderable)

{verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_eval_utils.py RENAMED Viewed

@@ -87,6 +87,22 @@ def test_print_results_single_rollout(capsys, make_metadata, make_state, make_in
     assert "r1: [0.1, 0.2, 0.3]" in captured.out
+def test_print_results_includes_eval_name(capsys, make_metadata, make_output):
+    from verifiers.utils.eval_utils import print_results
+    metadata = make_metadata(env_id="env1")
+    metadata["name"] = "env1-short"
+    results = GenerateOutputs(
+        outputs=[make_output(example_id=0, reward=1.0)],
+        metadata=metadata,
+    )
+    print_results(results)
+    captured = capsys.readouterr()
+    assert "Environment: env1-short (env1)" in captured.out
 def test_print_results_three_rollouts(capsys, make_metadata, make_state, make_input):
     """Test print_results with three rollouts per example."""
     from verifiers.utils.eval_utils import print_results

{verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_path_utils.py RENAMED Viewed

@@ -3,6 +3,7 @@ from pathlib import Path
 from verifiers.utils.path_utils import (
     find_latest_incomplete_eval_results_path,
+    get_eval_runs_dir,
     is_valid_eval_results_path,
 )
@@ -69,6 +70,19 @@ def test_find_latest_incomplete_eval_results_path_returns_none_when_no_match(
     assert result is None
+def test_get_eval_runs_dir_uses_name_as_result_label(tmp_path: Path):
+    runs_dir = get_eval_runs_dir(
+        env_id="dummy-env",
+        name="dummy-env-short",
+        model="openai/gpt-4.1-mini",
+        output_dir=str(tmp_path / "outputs"),
+    )
+    assert runs_dir == (
+        tmp_path / "outputs" / "evals" / "dummy-env-short--openai--gpt-4.1-mini"
+    )
 def test_is_valid_eval_results_path_requires_files(tmp_path: Path):
     run_dir = tmp_path / "run"
     run_dir.mkdir()

{verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/__init__.py RENAMED Viewed

@@ -1,4 +1,4 @@
-__version__ = "0.1.15.dev6"
+__version__ = "0.1.15.dev7"
 import importlib
 import os

{verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/scripts/eval.py RENAMED Viewed

@@ -536,6 +536,9 @@ def main(argv: list[str] | None = None):
     def build_eval_config(raw: dict) -> EvalConfig:
         """Build EvalConfig from a raw config dict."""
         env_id = raw["env_id"]
+        name = raw.get("name")
+        if name is not None and (not isinstance(name, str) or not name):
+            raise ValueError("'name' must be a non-empty string when provided.")
         # Resolve num_examples and rollouts_per_example with env defaults
         env_defaults = get_env_eval_defaults(env_id)
@@ -775,6 +778,7 @@ def main(argv: list[str] | None = None):
                 rollouts_per_example=rollouts_per_example,
                 env_dir_path=raw.get("env_dir_path", DEFAULT_ENV_DIR_PATH),
                 output_dir=raw.get("output_dir"),
+                name=name,
             )
             if auto_resume_path is not None:
                 resume_path = auto_resume_path
@@ -794,6 +798,7 @@ def main(argv: list[str] | None = None):
         return EvalConfig(
             env_id=env_id,
+            name=name,
             env_args=raw.get("env_args", {}),
             env_dir_path=raw.get("env_dir_path", DEFAULT_ENV_DIR_PATH),
             output_dir=raw.get("output_dir"),

{verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/types.py RENAMED Viewed

@@ -937,6 +937,7 @@ class GenerateMetadata(TypedDict):
     """Pydantic model for generation metadata."""
     env_id: str
+    name: NotRequired[str]
     env_args: dict
     model: str
     base_url: str
@@ -1109,6 +1110,7 @@ class EvalConfig(BaseModel):
     # environment
     env_id: str
+    name: str | None = None
     env_args: dict
     env_dir_path: str
     # evaluation

{verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/utils/eval_display.py RENAMED Viewed

@@ -36,6 +36,17 @@ from verifiers.utils.message_utils import format_messages
 from verifiers.utils.pricing_utils import format_cost_usd
+def _eval_label(config: EvalConfig) -> str:
+    return config.name or config.env_id
+def _eval_title(config: EvalConfig) -> str:
+    label = _eval_label(config)
+    if config.name and config.name != config.env_id:
+        return f"{label} ({config.env_id})"
+    return label
 @dataclass
 class EnvEvalState:
     """Dynamic eval state for a single env."""
@@ -572,7 +583,7 @@ class EvalDisplay(BaseDisplay):
         # build title with env name (and index if multi-env)
         title = Text()
-        title.append(config.env_id, style="bold cyan")
+        title.append(_eval_title(config), style="bold cyan")
         if len(self.configs) > 1:
             title.append(f" (env {env_idx + 1}/{len(self.configs)})", style="dim")
@@ -740,9 +751,10 @@ class EvalDisplay(BaseDisplay):
         prefix = "\u25b6 " if selected else "  "
         line = Text()
+        label = _eval_label(config)
         if env_state.status == "completed":
             line.append(f"{prefix}\u2713 ", style="bold green")
-            line.append(config.env_id, style="green")
+            line.append(label, style="green")
             line.append("  reward ", style="dim")
             line.append(format_numeric(env_state.reward), style="bold")
             color = self._get_error_rate_color(env_state.error_rate)
@@ -754,7 +766,7 @@ class EvalDisplay(BaseDisplay):
             line.append(f"  {time_str}", style="dim")
         elif env_state.status == "failed":
             line.append(f"{prefix}\u2717 ", style="bold red")
-            line.append(config.env_id, style="red")
+            line.append(label, style="red")
             if env_state.error:
                 line.append("  ", style="dim")
                 line.append(env_state.error[:80], style="red")
@@ -770,7 +782,7 @@ class EvalDisplay(BaseDisplay):
             )
             total_str = "..." if env_state.total <= 0 else str(env_state.total)
             line.append(f"{prefix}\u25cf ", style="bold yellow")
-            line.append(config.env_id, style="yellow")
+            line.append(label, style="yellow")
             line.append(f"  {pct:.0f}%", style="bold")
             line.append(f" ({env_state.progress}/{total_str})", style="dim")
             line.append("  reward ", style="dim")
@@ -784,7 +796,7 @@ class EvalDisplay(BaseDisplay):
             line.append(f"  {time_str}", style="dim")
         else:
             line.append(f"{prefix}\u25cb ", style="dim")
-            line.append(config.env_id, style="dim")
+            line.append(label, style="dim")
             line.append("  pending", style="dim")
         return line
@@ -958,7 +970,7 @@ class EvalDisplay(BaseDisplay):
             self.console.print(
                 Panel(
                     self._make_env_detail(config, env_state, results),
-                    title=f"[bold blue]{config.env_id}[/bold blue]",
+                    title=f"[bold blue]{_eval_title(config)}[/bold blue]",
                     border_style="dim",
                 )
             )
@@ -980,12 +992,12 @@ class EvalDisplay(BaseDisplay):
             env_state = self.state.envs[idx]
             if env_state.error:
                 self.console.print()
-                self.console.print(f"[red]error in {config.env_id}:[/red]")
+                self.console.print(f"[red]error in {_eval_label(config)}:[/red]")
                 self.console.print(f"  {env_state.error}")
         # Summary table with main metrics (printed last)
         table = Table(title="Evaluation Summary")
-        table.add_column("env_id", style="cyan")
+        table.add_column("eval", style="cyan")
         table.add_column("status", justify="center")
         table.add_column("examples", justify="center")
         table.add_column("rollouts", justify="center")
@@ -1060,7 +1072,7 @@ class EvalDisplay(BaseDisplay):
             mins, secs = divmod(int(elapsed), 60)
             time_str = f"{mins}m {secs:02d}s" if mins > 0 else f"{secs}s"
-            row = [config.env_id, status, examples_str, rollouts_str, reward]
+            row = [_eval_label(config), status, examples_str, rollouts_str, reward]
             if show_usage:
                 row.extend([input_tokens or "-", output_tokens or "-"])
             if show_cost:
@@ -1079,6 +1091,10 @@ class EvalDisplay(BaseDisplay):
         text = Text()
         text.append("model: ", style="dim")
         text.append(config.model, style="bold")
+        if config.name:
+            text.append("\n")
+            text.append("env: ", style="dim")
+            text.append(config.env_id, style="bold")
         text.append("\n")
         text.append("endpoint: ", style="dim")
         text.append(self._format_client_target(config))

{verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/utils/eval_utils.py RENAMED Viewed

@@ -109,25 +109,35 @@ def _attach_metadata_cost(
     return cost
-def _with_metadata_cost(
+def _attach_metadata_name(metadata: GenerateMetadata, name: str | None) -> bool:
+    if name is None:
+        return False
+    metadata["name"] = name
+    return True
+def _with_eval_metadata(
     on_progress: ProgressCallback | list[ProgressCallback] | None,
     model_pricing: ModelPricing | None,
+    name: str | None,
 ) -> ProgressCallback | list[ProgressCallback] | None:
-    if model_pricing is None:
+    if model_pricing is None and name is None:
         return on_progress
-    def attach_cost(
+    def attach_metadata(
         all_outputs: list[RolloutOutput],
         new_outputs: list[RolloutOutput],
         metadata: GenerateMetadata,
     ) -> None:
+        _attach_metadata_name(metadata, name)
         _attach_metadata_cost(metadata, model_pricing, all_outputs)
     if on_progress is None:
-        return [attach_cost]
+        return [attach_metadata]
     if isinstance(on_progress, list):
-        callbacks: list[ProgressCallback] = [attach_cost]
+        callbacks: list[ProgressCallback] = [attach_metadata]
         callbacks.extend(cast(list[ProgressCallback], on_progress))
         return callbacks
@@ -136,7 +146,7 @@ def _with_metadata_cost(
         new_outputs: list[RolloutOutput],
         metadata: GenerateMetadata,
     ) -> None:
-        attach_cost(all_outputs, new_outputs, metadata)
+        attach_metadata(all_outputs, new_outputs, metadata)
         on_progress(all_outputs, new_outputs, metadata)
     return wrapped_progress
@@ -526,6 +536,7 @@ def load_toml_config(
     valid_fields = {
         # environment
         "env_id",
+        "name",
         "args",
         "env_args",
         "taskset",
@@ -573,11 +584,12 @@ def load_toml_config(
     # validate global fields
     if global_defaults:
-        invalid_global = set(global_defaults.keys()) - valid_fields
+        global_valid_fields = valid_fields - {"name"}
+        invalid_global = set(global_defaults.keys()) - global_valid_fields
         if invalid_global:
             raise ValueError(
                 f"Invalid global field(s) {invalid_global}. "
-                f"Valid fields are: {sorted(valid_fields)}"
+                f"Valid fields are: {sorted(global_valid_fields)}"
             )
     # merge global defaults with per-eval configs
@@ -856,7 +868,10 @@ def print_usage(results: GenerateOutputs):
 def print_results(results: GenerateOutputs, num_samples: int = 1):
     assert results["metadata"] is not None
     print("--- Evaluation ---")
-    print(f"Environment: {results['metadata']['env_id']}")
+    env_id = results["metadata"]["env_id"]
+    name = results["metadata"].get("name")
+    env_label = f"{name} ({env_id})" if name and name != env_id else env_id
+    print(f"Environment: {env_label}")
     print(f"Model: {results['metadata']['model']}")
     print(f"Provider: {results['metadata']['base_url']}")
     print(f"Examples: {results['metadata']['num_examples']}")
@@ -932,7 +947,7 @@ async def run_evaluation(
     results_path = config.resume_path or get_eval_results_path(config)
     model_pricing = await _resolve_model_pricing(config)
-    on_progress = _with_metadata_cost(on_progress, model_pricing)
+    on_progress = _with_eval_metadata(on_progress, model_pricing, config.name)
     try:
         if not config.disable_env_server:
@@ -1022,12 +1037,11 @@ async def run_evaluation(
         if not config.disable_env_server:
             await vf_env.stop_server()
-    if (
-        _attach_metadata_cost(outputs["metadata"], model_pricing, outputs["outputs"])
-        is not None
-    ):
-        if config.save_results:
-            await asyncio.to_thread(save_metadata, outputs["metadata"], results_path)
+    metadata_changed = _attach_metadata_name(outputs["metadata"], config.name)
+    if _attach_metadata_cost(outputs["metadata"], model_pricing, outputs["outputs"]):
+        metadata_changed = True
+    if metadata_changed and config.save_results:
+        await asyncio.to_thread(save_metadata, outputs["metadata"], results_path)
     return outputs

{verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/utils/path_utils.py RENAMED Viewed

@@ -51,7 +51,7 @@ def get_eval_results_path(config: EvalConfig) -> Path:
     base_path = _get_outputs_base_path(
         config.env_id, config.env_dir_path, config.output_dir
     )
-    return get_results_path(config.env_id, config.model, base_path)
+    return get_results_path(config.name or config.env_id, config.model, base_path)
 def get_eval_runs_dir(
@@ -59,10 +59,11 @@ def get_eval_runs_dir(
     model: str,
     env_dir_path: str = "./environments",
     output_dir: str | None = None,
+    name: str | None = None,
 ) -> Path:
     """Return directory containing all eval run directories for env/model."""
     base_path = _get_outputs_base_path(env_id, env_dir_path, output_dir)
-    env_model_str = f"{env_id}--{model.replace('/', '--')}"
+    env_model_str = f"{name or env_id}--{model.replace('/', '--')}"
     return base_path / "evals" / env_model_str
@@ -108,10 +109,15 @@ def find_latest_incomplete_eval_results_path(
     rollouts_per_example: int,
     env_dir_path: str = "./environments",
     output_dir: str | None = None,
+    name: str | None = None,
 ) -> Path | None:
     """Find the newest resumable, incomplete eval run for the provided config."""
     runs_dir = get_eval_runs_dir(
-        env_id=env_id, model=model, env_dir_path=env_dir_path, output_dir=output_dir
+        env_id=env_id,
+        model=model,
+        env_dir_path=env_dir_path,
+        output_dir=output_dir,
+        name=name,
     )
     if not runs_dir.exists():
         return None