PyPI - benchmax - Versions diffs - 0.1.2.dev25__tar.gz → 0.1.2.dev27__tar.gz - Mend

benchmax 0.1.2.dev25.tar.gz → 0.1.2.dev27.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (165)

{benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: benchmax
-Version: 0.1.2.dev25
+Version: 0.1.2.dev27
 Summary: Framework-Agnostic RL Environments for LLM Fine-Tuning
 Author: castie@castform.com
 Classifier: Programming Language :: Python :: 3

{benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "benchmax"
-version = "0.1.2.dev25"
+version = "0.1.2.dev27"
 description = "Framework-Agnostic RL Environments for LLM Fine-Tuning"
 readme = "README.md"
 authors = [{ name = "castie@castform.com" }]

{benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/config.py RENAMED Viewed

@@ -38,8 +38,3 @@ def web_app_url() -> str:
 def llm_url() -> str:
     """OpenAI-compatible LLM endpoint hosted by the platform."""
     return os.environ.get("CASTFORM_LLM_URL") or f"https://llm.{base_domain()}/v1"
-def rollout_url() -> str:
-    """Rollout / inference server."""
-    return os.environ.get("CASTFORM_ROLLOUT_URL") or f"https://autobots.{base_domain()}"

{benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/envs/base_env.py RENAMED Viewed

@@ -88,20 +88,37 @@ class BaseEnv(ABC):
         )
     @classmethod
-    def playground_preprocess(cls, prompt: str, **kwargs: Any) -> Example:
-        """Wrap a one-shot playground prompt into an :class:`Example`.
+    def playground_preprocess(
+        cls,
+        prompt: str | None = None,
+        messages: Messages | None = None,
+        **kwargs: Any,
+    ) -> Example:
+        """Wrap a playground input into an :class:`Example`.
+        Accepts either ``prompt`` (single user string — the typical one-shot
+        chat case) or ``messages`` (a full chat list, used when replaying a
+        multi-turn eval prompt). Exactly one must be provided.
         Classmethod (like :meth:`dataset_preprocess`), reading the static
-        ``cls.system_prompt`` class attribute — so a one-shot playground
-        prompt is preprocessed without constructing an env instance, and the
-        system prompt matches what training uses. Prepends ``cls.system_prompt``
-        via :func:`make_example` with ``task=None`` — the rollout worker skips
+        ``cls.system_prompt`` class attribute — so a playground input is
+        preprocessed without constructing an env instance, and the system
+        prompt matches what training uses. ``cls.system_prompt`` is prepended
+        unless the caller already supplied a system message (a replayed eval
+        prompt typically does). ``task=None`` — the rollout worker skips
         reward computation for playground examples.
         """
+        if messages is None:
+            if not prompt:
+                raise ValueError(
+                    "playground_preprocess requires either 'prompt' or 'messages'"
+                )
+            messages = [{"role": "user", "content": prompt}]
+        has_system = any(m.get("role") == "system" for m in messages)
         return make_example(
-            prompt_messages=[{"role": "user", "content": prompt}],
+            prompt_messages=messages,
             task=None,
-            system_prompt=cls.system_prompt,
+            system_prompt=None if has_system else cls.system_prompt,
         )
     @classmethod

{benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/platform/client.py RENAMED Viewed

@@ -29,7 +29,9 @@ from .exceptions import (
 )
 if TYPE_CHECKING:
-    pass
+    from types import ModuleType
+    from benchmax.envs.base_env import BaseEnv
 @dataclass(frozen=True)
@@ -279,7 +281,9 @@ class StorageClient:
         # Stream from disk instead of read_bytes() to keep memory bounded for
         # multi-GB datasets. httpx infers Content-Length from the file size.
         url_response = self._get_upload_url(
-            path, mime_type, expires_in_minutes=expires_in_minutes,
+            path,
+            mime_type,
+            expires_in_minutes=expires_in_minutes,
         )
         with file_path.open("rb") as fh:
             self._put_to_signed_url(url_response["uploadUrl"], fh, mime_type)
@@ -437,7 +441,11 @@ class TrainerClient:
         specs = self.list_launch_args()
         print(_hdr("Launch args accepted by POST /train/runs/launch"))
         for spec in specs:
-            req = _RED + "required" + _RESET if spec.required else _CYAN + "optional" + _RESET
+            req = (
+                _RED + "required" + _RESET
+                if spec.required
+                else _CYAN + "optional" + _RESET
+            )
             header = f"  {_BOLD}{spec.name}{_RESET} ({spec.type}, {req})"
             bits: list[str] = []
             if spec.default is not None:
@@ -652,7 +660,9 @@ def _print_event(
                                 tool_text,
                             )
                         else:
-                            preview = textwrap.shorten(tool_text, width=120, placeholder="…")
+                            preview = textwrap.shorten(
+                                tool_text, width=120, placeholder="…"
+                            )
                             print(
                                 f"{prefix} → message [{role}/tool_result] "
                                 f"(chars={len(tool_text)}): {preview}"
@@ -690,7 +700,13 @@ def _print_event(
 class RolloutClient:
-    """Thin synchronous client for the /rollout/stream endpoint.
+    """Thin synchronous client for the rollout-stream endpoint.
+    Rollouts are reached through platform-service. platform-service is the API-key
+    gate: it validates the ``sk_`` key and mints a short-lived act_as JWT that
+    rollout-service accepts (rollout-service's own auth only takes
+    auth-service-minted JWTs — never a raw platform key). The proxy is mounted at
+    ``/v1/rollout/stream``.
     Supports two ways to provide the environment:
@@ -700,8 +716,11 @@ class RolloutClient:
        raw file contents; they will be base64-encoded and sent inline.
     Args:
-        api_key:    Bearer token for the rollout server.
-        server_url: Base URL of the rollout server.
+        api_key:    Platform API key (``sk_``); forwarded as the Bearer token
+                    platform-service validates.
+        server_url: Base URL of platform-service. Defaults to
+                    ``config.platform_url()``; the ``/v1/rollout/stream`` path is
+                    appended per request.
         timeout:    Per-request timeout in seconds (default 300 — rollouts can be slow).
     """
@@ -716,7 +735,9 @@ class RolloutClient:
         self._api_key = api_key
         # Resolve at construction time, not import time, so env-var changes
         # take effect (mirrors StorageClient/TrainerClient default_factory pattern).
-        self._server_url = (server_url or config.rollout_url()).rstrip("/")
+        # Target platform-service (the API-key gate), not the rollout-service
+        # host directly — see the class docstring for why.
+        self._server_url = (server_url or config.platform_url()).rstrip("/")
         self._timeout = timeout
     @staticmethod
@@ -734,7 +755,9 @@ class RolloutClient:
         has_bytes = env_cls_bytes is not None and env_metadata_bytes is not None
         if has_paths and has_bytes:
-            raise ValueError("Provide either blob paths or raw bytes for the env, not both.")
+            raise ValueError(
+                "Provide either blob paths or raw bytes for the env, not both."
+            )
         if not has_paths and not has_bytes:
             raise ValueError(
                 "Provide either (env_cls_path, env_metadata_path) or "
@@ -844,7 +867,9 @@ class RolloutClient:
             },
         }
-        url = f"{self._server_url}/rollout/stream"
+        # platform-service mounts the proxy at /v1/rollout/stream; it validates
+        # the platform key and forwards to rollout-service with an act_as JWT.
+        url = f"{self._server_url}/v1/rollout/stream"
         headers = {"Authorization": f"Bearer {self._api_key}"}
         with httpx.stream(
@@ -858,7 +883,10 @@ class RolloutClient:
                 body = response.read().decode()
                 # Typed errors so callers can distinguish retryable from
                 # caller-fix from auth-fix without parsing exception messages.
-                if response.status_code == 401:
+                # 403 too: rollouts route through platform-service's optionalAuth
+                # gate, which rejects a missing/invalid/expired key as 403
+                # ("sign in to run rollouts") rather than 401 — same fix (the key).
+                if response.status_code in (401, 403):
                     raise AuthenticationError(body[:300], response.status_code)
                 if response.status_code == 404:
                     raise RolloutNotFound(body[:300], response.status_code)
@@ -922,6 +950,10 @@ class RolloutClient:
         env_metadata_path: str | None = None,
         n: int = 2,
         *,
+        env_class: type[BaseEnv] | None = None,
+        constructor_args: dict[str, Any] | None = None,
+        pip_dependencies: list[str] | None = None,
+        local_modules: list[ModuleType] | None = None,
         env_cls_bytes: bytes | None = None,
         env_metadata_bytes: bytes | None = None,
         llm_model: str = _VALIDATION_MODEL,
@@ -930,14 +962,22 @@ class RolloutClient:
     ) -> ValidationResult:
         """Run rollouts on the first *n* examples and report pass/fail.
-        The environment can be specified via **blob paths** or **raw bytes**
-        (mutually exclusive — see class docstring).
+        The environment can be specified three ways (mutually exclusive): an
+        **env class** (bundled to bytes here, so validation needs no prior
+        upload — preferred for a pre-launch smoke test), **blob paths** to an
+        already-uploaded env, or **raw bytes** (see class docstring).
         Args:
             examples:           Full dataset (list of raw dicts).
             env_cls_path:       Blob path to the uploaded env .pkl file.
             env_metadata_path:  Blob path to the uploaded env-meta .json file.
             n:                  Number of examples to validate (default 2).
+            env_class:          BaseEnv subclass to bundle and validate without
+                                uploading. Mutually exclusive with paths/bytes.
+            constructor_args:   kwargs baked into the env bundle (env_class only).
+            pip_dependencies:   Pip deps recorded in the bundle (env_class only).
+            local_modules:      Modules to pickle by-value (env_class only; for
+                                envs that import from local .py files).
             env_cls_bytes:      Raw bytes of the pickled env class (will be base64-encoded).
             env_metadata_bytes: Raw bytes of the env metadata JSON (will be base64-encoded).
             verbose:            Print colored progress to stdout (default True for
@@ -949,12 +989,39 @@ class RolloutClient:
             "did everything pass" check, with per-example detail in
             ``result.examples`` for richer reporting.
         """
+        # An env class is bundled to bytes here so validation can run a smoke
+        # test BEFORE uploading anything (the launch flow uploads only after
+        # validation passes). Mutually exclusive with explicit paths/bytes.
+        if env_class is not None:
+            if any(
+                (env_cls_path, env_metadata_path, env_cls_bytes, env_metadata_bytes)
+            ):
+                raise ValueError(
+                    "Provide env_class OR explicit env paths/bytes, not both."
+                )
+            from benchmax.bundle import dump_bundle
+            bundle = dump_bundle(
+                env_class,
+                constructor_args=constructor_args,
+                pip_dependencies=pip_dependencies,
+                local_modules=local_modules,
+            )
+            env_cls_bytes = bundle.pickled
+            env_metadata_bytes = bundle.metadata.to_json_bytes()
         # Validate env args early so we fail before running any rollouts.
-        self._build_env(env_cls_path, env_metadata_path, env_cls_bytes, env_metadata_bytes)
+        self._build_env(
+            env_cls_path, env_metadata_path, env_cls_bytes, env_metadata_bytes
+        )
         sample = examples[:n]
         if verbose:
-            print(_hdr(f"── Remote validation: {len(sample)} example(s) on {llm_model} ──"))
+            print(
+                _hdr(
+                    f"── Remote validation: {len(sample)} example(s) on {llm_model} ──"
+                )
+            )
         per_example: list[ExampleValidation] = []
         for i, example in enumerate(sample):
@@ -972,10 +1039,15 @@ class RolloutClient:
                     max_turns=max_turns,
                 )
                 ok = bool(final.get("success"))
-                per_example.append(ExampleValidation(
-                    index=i, ok=ok,
-                    error=None if ok else (final.get("error") or "rollout reported success=False"),
-                ))
+                per_example.append(
+                    ExampleValidation(
+                        index=i,
+                        ok=ok,
+                        error=None
+                        if ok
+                        else (final.get("error") or "rollout reported success=False"),
+                    )
+                )
             except (RolloutError, RuntimeError) as exc:
                 if verbose:
                     print(_err(f"  Example {i} failed: {exc}"))
@@ -987,6 +1059,10 @@ class RolloutClient:
             if result.ok:
                 print(_ok("Remote validation passed"))
             else:
-                print(_err("Remote validation failed — check output above before launching a full job"))
+                print(
+                    _err(
+                        "Remote validation failed — check output above before launching a full job"
+                    )
+                )
         return result

{benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/turbopuffer/namespace.py RENAMED Viewed

@@ -218,6 +218,29 @@ class TpufNamespace:
         return len(all_chunks)
+    # ------------------------------------------------------------------
+    # Namespace metadata
+    # ------------------------------------------------------------------
+    def get_approx_row_count(self) -> int | None:
+        """Return the approximate row count from namespace metadata.
+        Uses the tpuf metadata endpoint which returns ``approx_row_count``.
+        Unlike ``get_max_id()``, this reflects actual rows (accounting for
+        deletions) rather than the highest assigned ID.
+        """
+        try:
+            meta = self._ns.metadata()
+            count = getattr(meta, "approx_row_count", None)
+            if isinstance(count, int):
+                return count
+            # Fallback: some SDK versions return a dict
+            if isinstance(meta, dict):
+                return meta.get("approx_row_count")
+            return None
+        except Exception:
+            return None
     # ------------------------------------------------------------------
     # ID pagination
     # ------------------------------------------------------------------
@@ -237,6 +260,35 @@ class TpufNamespace:
             return None
         return rows[0].id
+    def scan_all_rows(self, limit: int | None = None, page_size: int = 10_000) -> list[Any]:
+        """Sequentially scan all rows with attributes via cursor pagination.
+        Much faster than random-ID sampling for large fetches — single pass,
+        no retries, no ID collisions. Returns up to ``limit`` rows (all if
+        None).
+        """
+        all_rows: list[Any] = []
+        last_id = 0
+        while True:
+            result = self._ns.query(
+                rank_by=["id", "asc"],
+                filters=["id", "Gt", last_id],
+                top_k=page_size,
+                include_attributes=True,
+            )
+            rows = result.rows
+            if not rows:
+                break
+            all_rows.extend(rows)
+            last_id = rows[-1].id
+            if limit is not None and len(all_rows) >= limit:
+                return all_rows[:limit]
+            if len(rows) < page_size:
+                break
+        return all_rows
     def paginate_all_ids(self, page_size: int = 1000) -> list[int]:
         """Return all row IDs in the namespace via cursor pagination."""
         all_ids: list[int] = []

{benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/turbopuffer/source.py RENAMED Viewed

@@ -198,9 +198,37 @@ class TpufChunkSource:
     # ------------------------------------------------------------------
     def get_chunk_count(self) -> int:
-        """Return the total number of chunks in the namespace."""
+        """Return the total number of chunks in the namespace.
+        Prefers ``approx_row_count`` from the metadata endpoint (reflects
+        actual rows after deletions). Falls back to ``get_max_id()`` which
+        can over-count in sparse namespaces.
+        """
+        approx = self._client.get_approx_row_count()
+        if approx is not None:
+            return approx
         return self._client.get_max_id() or 0
+    def scan_chunks(self, limit: int | None = None, min_chars: int = 0) -> list[Chunk]:
+        """Sequentially scan chunks via cursor pagination.
+        Much faster than ``sample_chunks`` for large fetches (single pass, no
+        retries). Returns chunks in ID order, not random. Use this when you
+        need most or all of the namespace (e.g. materialization).
+        """
+        # Over-fetch to account for min_chars filtering
+        fetch_limit = None if limit is None else int(limit * (3 if min_chars > 0 else 1.1))
+        rows = self._client.scan_all_rows(limit=fetch_limit)
+        collected: list[Chunk] = []
+        for row in rows:
+            chunk = self._client.row_to_chunk(row)
+            if min_chars > 0 and len(chunk.content) < min_chars:
+                continue
+            collected.append(chunk)
+            if limit is not None and len(collected) >= limit:
+                break
+        return collected
     def sample_chunks(self, n: int, min_chars: int = 0) -> list[Chunk]:
         """Return n randomly sampled chunks, optionally filtered by minimum length.
@@ -357,8 +385,11 @@ class TpufChunkSource:
             return []
         # Skip expensive full-namespace pagination for large namespaces.
-        # Use actual row count (not max_id) to handle sparse ID spaces where
-        # max_id >> row_count due to deletions or non-sequential assignment.
+        # Use approx_row_count (actual rows) rather than paginating all IDs
+        # just to count them — that's O(N) API calls for large namespaces.
+        chunk_count = self.get_chunk_count()
+        if chunk_count > 50_000:
+            return []
         all_ids = self._client.paginate_all_ids()
         if len(all_ids) > 50_000:
             return []

{benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/pipeline.py RENAMED Viewed

@@ -1332,30 +1332,42 @@ class Pipeline:
         # resolve any chunk by hash.
         max_materialize = 50_000
         if getattr(source, "collection", None) is None and chunk_count > 0:
-            if chunk_count <= max_materialize:
-                from benchmax.rag.chunkers.models import ChunkCollection  # noqa: PLC0415
+            from benchmax.rag.chunkers.models import ChunkCollection  # noqa: PLC0415
+            materialize_count = min(chunk_count, max_materialize)
+            if chunk_count > max_materialize:
+                logger.warning(
+                    "Corpus has %d chunks (limit %d). Materialising a capped "
+                    "sample so entity extraction and the chunk graph still work.",
+                    chunk_count,
+                    max_materialize,
+                )
+            else:
                 logger.info(
                     "Materialising %d chunks from API backend into memory...",
                     chunk_count,
                 )
-                all_chunks = source.sample_chunks(
-                    chunk_count,
+            # Use sequential scan when available — cursor pagination avoids
+            # the ID-collision overhead of random sampling at high fill rates.
+            # ~1.9x faster for 50k chunks from a 65k namespace.
+            if hasattr(source, "scan_chunks"):
+                all_chunks = source.scan_chunks(
+                    limit=materialize_count,
                     min_chars=cfg.corpus.min_chunk_chars,
                 )
-                if all_chunks:
-                    source.collection = ChunkCollection(chunks=all_chunks)  # type: ignore[attr-defined]
-                    logger.info(
-                        "Cached %d/%d chunks on source.collection",
-                        len(all_chunks),
-                        chunk_count,
-                    )
             else:
-                logger.warning(
-                    "Corpus too large to materialise (%d chunks > %d cap); "
-                    "entity-chunk graph will use profile sample only.",
+                all_chunks = source.sample_chunks(
+                    materialize_count,
+                    min_chars=cfg.corpus.min_chunk_chars,
+                )
+            if all_chunks:
+                source.collection = ChunkCollection(chunks=all_chunks)  # type: ignore[attr-defined]
+                logger.info(
+                    "Cached %d/%d chunks on source.collection",
+                    len(all_chunks),
                     chunk_count,
-                    max_materialize,
                 )
         profile_sample = diverse_profile_sample(

{benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rubrics/rubric.py RENAMED Viewed

@@ -1,9 +1,12 @@
+import logging
 import os
 from dataclasses import dataclass
 from typing import Any, Dict, List, Literal, Optional
 from openai import AsyncOpenAI
+logger = logging.getLogger(__name__)
 from benchmax.platform.credentials import platform_bearer
 from ._utils import _extract_json
@@ -76,6 +79,7 @@ async def evaluate_single_rubric(
     ground_truth: Optional[str] = None,
     api_key: str = "",
     timeout: Optional[float] = None,
+    enable_logging: bool = True,
 ) -> Dict[str, Any]:
     """
     Evaluate a single response against a single rubric.
@@ -146,11 +150,26 @@ async def evaluate_single_rubric(
             return {"score": 0, "reasoning": "Empty response", "llm_output": ""}
         result = _extract_json(content)
-        return {
+        out = {
             "score": result.get("score", 0),
             "reasoning": result.get("reasoning", ""),
             "llm_output": content,
         }
+        if enable_logging:
+            logger.info(
+                "\n┌─ rubric: %s ─────────────────────\n"
+                "│ ground_truth : %s\n"
+                "│ score        : %s\n"
+                "│ reasoning    : %s\n"
+                "│ llm_output   :\n%s\n"
+                "└──────────────────────────────────────────────────",
+                rubric.title,
+                (ground_truth or "").strip() or "(none)",
+                out["score"],
+                out["reasoning"],
+                content,
+            )
+        return out
     except Exception as e:
         print(f"Error evaluating rubric '{rubric.title}': {e}\njudge output:\n{content}")
@@ -166,6 +185,7 @@ async def evaluate_rubric_ranking(
     api_key: str = "",
     timeout: Optional[float] = None,
     ground_truth: Optional[str] = None,
+    enable_logging: bool = True,
 ) -> Dict[str, Any]:
     """
     Rank N responses against a single rubric in one judge call and convert the
@@ -276,12 +296,34 @@ async def evaluate_rubric_ranking(
             for j, p in pos_of.items():
                 scores[nonempty[j][0]] = 1.0 - p / max_pos if max_pos > 0 else 1.0
-        return {
+        out = {
             "scores": scores,
             "ranking": ranking,
             "reasoning": result.get("reasoning", ""),
             "llm_output": content,
         }
+        if enable_logging:
+            scores_fmt = "  ".join(f"[{i}]={s:.3f}" for i, s in enumerate(scores))
+            ranking_fmt = " > ".join(
+                f"[{', '.join(str(j) for j in tier)}]" if isinstance(tier, list) else str(tier)
+                for tier in ranking
+            )
+            logger.info(
+                "\n┌─ ranked rubric: %s ────────────────────\n"
+                "│ ground_truth : %s\n"
+                "│ ranking      : %s\n"
+                "│ scores       : %s\n"
+                "│ reasoning    : %s\n"
+                "│ llm_output   :\n%s\n"
+                "└──────────────────────────────────────────────────",
+                rubric.title,
+                (ground_truth or "").strip() or "(none)",
+                ranking_fmt or "(empty)",
+                scores_fmt,
+                out["reasoning"],
+                content,
+            )
+        return out
     except Exception as e:
         print(f"Error ranking rubric '{rubric.title}': {e}\njudge output:\n{content}")
         return {"scores": scores, "ranking": [], "reasoning": f"Error: {e}", "llm_output": content}

{benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: benchmax
-Version: 0.1.2.dev25
+Version: 0.1.2.dev27
 Summary: Framework-Agnostic RL Environments for LLM Fine-Tuning
 Author: castie@castform.com
 Classifier: Programming Language :: Python :: 3