PyPI - benchmax - Versions diffs - 0.1.2.dev30__tar.gz → 0.1.2.dev31__tar.gz - Mend

benchmax 0.1.2.dev30__tar.gz → 0.1.2.dev31__tar.gz

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (173) hide show

{benchmax-0.1.2.dev30/src/benchmax.egg-info → benchmax-0.1.2.dev31}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: benchmax
-Version: 0.1.2.dev30
+Version: 0.1.2.dev31
 Summary: Framework-Agnostic RL Environments for LLM Fine-Tuning
 Author: castie@castform.com
 Classifier: Programming Language :: Python :: 3

{benchmax-0.1.2.dev30 → benchmax-0.1.2.dev31}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "benchmax"
-version = "0.1.2.dev30"
+version = "0.1.2.dev31"
 description = "Framework-Agnostic RL Environments for LLM Fine-Tuning"
 readme = "README.md"
 authors = [{ name = "castie@castform.com" }]

{benchmax-0.1.2.dev30 → benchmax-0.1.2.dev31}/src/benchmax/bundle.py RENAMED Viewed

@@ -1,5 +1,6 @@
 from __future__ import annotations
+import importlib
 import inspect
 import io
 import json
@@ -76,6 +77,7 @@ def dump_bundle(
     pip_dependencies: list[str] | None = None,
     local_modules: list[ModuleType] | None = None,
     env_class_source: str | None = None,
+    auto_local_modules: bool = True,
 ) -> Bundle:
     """Pickle ``(env_class, constructor_args)`` and stamp metadata.
@@ -90,6 +92,10 @@ def dump_bundle(
             recover it — e.g. a class produced by ``exec()`` into an in-memory
             namespace, which has no source file on disk. When ``None``
             (default), source is introspected from ``env_class``.
+        auto_local_modules: When True (default), any local module the pickle
+            references but that wasn't passed in ``local_modules`` is imported
+            and pickled by value automatically (a warning names them). When
+            False, such a reference raises ``BundlingError`` instead.
     Raises:
         BundlingError: bad env_class, cloudpickle failure, or pickle references
@@ -124,6 +130,46 @@ def dump_bundle(
                 except Exception:
                     pass
+    if auto_local_modules and _unregistered_local_refs(pickled):
+        # Import each referenced local module and re-dump with it pickled by
+        # value. Loop because a by-value module can surface further local refs;
+        # registrations accumulate (and are torn down once at the end) so an
+        # earlier module stays by-value while we resolve the ones it pulled in.
+        seen: set[str] = {m.__name__ for m in local_modules}
+        registered: list[ModuleType] = []
+        with _BUNDLE_LOCK:
+            try:
+                for _ in range(10):
+                    pending = [
+                        m for m in _unregistered_local_refs(pickled) if m not in seen
+                    ]
+                    if not pending:
+                        break
+                    new_mods: list[ModuleType] = []
+                    for name in pending:
+                        seen.add(name)  # unimportable names fall through to the guard
+                        try:
+                            new_mods.append(importlib.import_module(name))
+                        except Exception:
+                            pass
+                    if not new_mods:
+                        break
+                    logger.warning(
+                        "[bundle] %s: auto-bundling local module(s): %s ",
+                        env_class.__name__,
+                        ", ".join(sorted(m.__name__ for m in new_mods)),
+                    )
+                    for mod in new_mods:
+                        cloudpickle.register_pickle_by_value(mod)
+                        registered.append(mod)
+                    pickled = cloudpickle.dumps((env_class, constructor_args))
+            finally:
+                for mod in registered:
+                    try:
+                        cloudpickle.unregister_pickle_by_value(mod)
+                    except Exception:
+                        pass
     risky = _unregistered_local_refs(pickled)
     if risky:
         msg = (
@@ -259,6 +305,15 @@ def _referenced_modules(pickled: bytes) -> set[str]:
     # Hooks find_class so we see every (module, name) the unpickler would import —
     # i.e. exactly what'd raise ModuleNotFoundError on a fresh interpreter. The stub
     # lets unpickling proceed past missing classes so we collect every ref.
+    #
+    # find_class alone has a blind spot: a bare ``import foo`` that leaves a
+    # module *object* in the env's globals is pickled as
+    # ``cloudpickle.subimport("foo")`` — the module name is a REDUCE argument,
+    # not a find_class path, so we'd only see ``cloudpickle.cloudpickle`` (which
+    # looks installed) and miss ``foo``. We shim subimport to record its arg and
+    # return a stub instead of importing, so a missing module is captured rather
+    # than aborting the whole load early. (``dynamic_subimport`` is by-value /
+    # self-contained — leave it to the real find_class so we don't flag it.)
     refs: set[str] = set()
     class _Stub:
@@ -271,9 +326,28 @@ def _referenced_modules(pickled: bytes) -> set[str]:
         def __reduce__(self) -> tuple:
             return (type(self), ())
+    def _recording_subimport(name: str, *a: Any, **kw: Any) -> ModuleType:
+        refs.add(name)
+        return ModuleType(str(name))
+    def _noop_setstate(obj: Any, *a: Any, **kw: Any) -> Any:
+        # cloudpickle's _make_skeleton_class resolves the class_tracker_id back
+        # to the *live* class (it was tracked when env_class was dumped), so the
+        # real ``_class_setstate``/``_function_setstate`` would setattr the
+        # reconstructed (stub-globals) members onto the live class/function —
+        # mutating the caller's class mid-bundle and poisoning any later dump.
+        # We only need the refs from ``state``, which are already recorded while
+        # it's unpickled; the setter itself is a no-op here.
+        return obj
     class _Recorder(pickle.Unpickler):
         def find_class(self, module: str, name: str) -> Any:
             refs.add(module)
+            if module.startswith("cloudpickle"):
+                if name == "subimport":
+                    return _recording_subimport
+                if name in ("_class_setstate", "_function_setstate"):
+                    return _noop_setstate
             try:
                 return super().find_class(module, name)
             except Exception:

{benchmax-0.1.2.dev30 → benchmax-0.1.2.dev31}/src/benchmax/envs/telestich/example.py RENAMED Viewed

@@ -12,10 +12,12 @@ Run it from the benchmax project root (the ``telestich`` extra pulls in the
 env's word-list / rhyme dependencies):
     cd core/benchmax
-    CASTFORM_API_KEY=sk_... \
-        uv run --extra telestich python -m benchmax.envs.telestich.example
+    uv run --extra telestich python -m benchmax.envs.telestich.example
-(``CASTFORM_LLM_API_KEY`` is optional — it defaults to ``CASTFORM_API_KEY``.)
+Auth is the device-auth session (``ensure_session()`` opens a browser login if
+``~/.castform`` has no valid session) — no API key needed. ``CASTFORM_API_KEY``
+/ ``CASTFORM_LLM_API_KEY`` are only consulted by the offline dataset-generation
+helpers, not the launch path.
 This launches a real training run on the full committed seed dataset
 (~90/10 train/eval split).
@@ -63,6 +65,8 @@ CONCURRENCY = 15
 # pool) server-side. Supported: "Qwen/Qwen3.5-4B" (gpu4) or "Qwen/Qwen3.5-35B-A3B"
 # (gpu8). Override via TELESTICH_MODEL.
 MODEL = os.environ.get("TELESTICH_MODEL", "Qwen/Qwen3.5-4B")
+# Run name — defaults to a unique telestich-full-<uuid>. Override via TELESTICH_RUN_NAME.
+RUN_NAME = os.environ.get("TELESTICH_RUN_NAME", "")
 # (model, weight). Weights reflect observed reliability on our checks:
 # - Both grok models leak banned example words and rubber-stamp the CoT self-check.
@@ -558,12 +562,15 @@ def get_dataset():
 if __name__ == "__main__":
     import uuid
+    from benchmax.platform import ensure_session
     from benchmax.platform.client import TrainerClient
     from benchmax.platform.training_run import upload_training_run
     from benchmax.platform.validation import validate_env
-    if not API_KEY:
-        raise SystemExit("Set CASTFORM_API_KEY before running this example.")
+    # Device-auth session bootstrap: browser login if no credential resolves.
+    # After this the platform bearer comes from ~/.castform — no API key needed,
+    # so we pass api_key="" to the platform calls below (resolves via the seam).
+    ensure_session()
     print(f"Platform URL: {BASE_URL}")
     print(f"LLM URL:      {LLM_BASE_URL}\n")
@@ -603,7 +610,7 @@ if __name__ == "__main__":
         eval_dataset=eval_data[:2],
         local_modules=local_modules,
         pip_dependencies=pip_dependencies,
-        api_key=API_KEY,
+        api_key="",  # session bearer via ensure_session()
         base_url=BASE_URL,
         llm_base_url=LLM_BASE_URL,
         llm_api_key="",
@@ -614,14 +621,14 @@ if __name__ == "__main__":
         )
     # 3. Bundle the env class and upload everything to platform storage.
-    run_name = f"telestich-full-{uuid.uuid4().hex[:8]}"
+    run_name = RUN_NAME or f"telestich-full-{uuid.uuid4().hex[:8]}"
     print(f"\nUploading bundle + datasets as {run_name!r} ...")
     uploaded = upload_training_run(
         env_class=TelestichEnv,
         train_dataset=train_data,
         eval_dataset=eval_data,
         run_name=run_name,
-        api_key=API_KEY,
+        api_key="",  # session bearer via ensure_session()
         base_url=BASE_URL,
         local_modules=local_modules,
         constructor_args=constructor_args,
@@ -638,7 +645,7 @@ if __name__ == "__main__":
     # 4. Launch the training run. training_run_type="simple" + the `model` arg select
     #    the trainer YAML/pool server-side (Qwen3.5-4B→gpu4, Qwen3.5-35B-A3B→gpu8).
     print(f"\nLaunching training run (model={MODEL}) ...")
-    with TrainerClient(api_key=API_KEY, base_url=BASE_URL) as trainer:
+    with TrainerClient(api_key="", base_url=BASE_URL) as trainer:
         run_id = trainer.launch_training_run(
             training_run_type="simple",
             env_cls_path=uploaded.env_cls_path,
@@ -647,10 +654,10 @@ if __name__ == "__main__":
             eval_dataset_path=uploaded.eval_dataset_path,
             name=run_name,
             # num_epochs: passes over the train set (platform default is 5).
-            # max_response_len 3000: a brief reason + 1-2 tool rounds + poem fits well
+            # max_rollout_len 3000: a brief reason + 1-2 tool rounds + poem fits well
             # under this; lowered from 4000 to cut off in-head enumeration rambles
             # sooner (they truncate to a 0-reward anyway).
-            launcher_args={"model": MODEL, "max_response_len": 3000, "num_epochs": 10},
+            launcher_args={"model": MODEL, "max_rollout_len": 3000, "num_epochs": 10},
         )
     print(f"\n✓ Launched run_id={run_id}")

{benchmax-0.1.2.dev30 → benchmax-0.1.2.dev31}/src/benchmax/platform/client.py RENAMED Viewed

@@ -7,6 +7,7 @@ import hashlib
 import json
 import logging
 import textwrap
+import warnings
 from collections.abc import Iterator
 from dataclasses import dataclass, field
 from pathlib import Path
@@ -404,7 +405,7 @@ class TrainerClient:
             eval_dataset_path: Path to the evaluation dataset
             name: Optional name for the training run
             launcher_args: Extra launcher args forwarded to the server
-                (e.g. {"max_response_len": 4000}). The 4 required paths
+                (e.g. {"max_rollout_len": 4000}). The 4 required paths
                 above always take precedence.
         Returns:
@@ -431,8 +432,11 @@ class TrainerClient:
         )
         self._handle_response_errors(response)
         body = response.json()
+        # Surface soft-cap / OOM-risk warnings via the warnings module (shown by
+        # default in notebooks/REPL) — a bare logger.warning is swallowed unless
+        # the caller configured logging.
         for warning in body.get("warnings", []) or []:
-            logger.warning("launch warning: %s", warning)
+            warnings.warn(f"launch warning: {warning}", stacklevel=2)
         return body["runId"]
     def list_launch_args(self) -> list[LaunchArgSpec]:

{benchmax-0.1.2.dev30 → benchmax-0.1.2.dev31}/src/benchmax/platform/validation.py RENAMED Viewed

@@ -7,6 +7,7 @@ the env class contract matches what the trainer expects.
 from __future__ import annotations
 import asyncio
+import importlib
 import json
 import math
 import tempfile
@@ -578,6 +579,41 @@ def _run_local_checks(
             from benchmax.bundle import unregistered_local_refs
             risky = unregistered_local_refs(cloudpickle.dumps(env_class))
+            # Mirror dump_bundle's auto_local_modules: import + pickle-by-value
+            # any local refs the user didn't list, so validation reflects what
+            # the bundle will actually contain. Only genuinely unimportable refs
+            # (which the trainer also couldn't load) remain to be flagged.
+            auto: list[ModuleType] = []
+            if risky:
+                seen: set[str] = set()
+                try:
+                    for _ in range(10):
+                        pending = [
+                            m
+                            for m in unregistered_local_refs(cloudpickle.dumps(env_class))
+                            if m not in seen
+                        ]
+                        if not pending:
+                            break
+                        new_mods: list[ModuleType] = []
+                        for name in pending:
+                            seen.add(name)
+                            try:
+                                new_mods.append(importlib.import_module(name))
+                            except Exception:
+                                pass
+                        if not new_mods:
+                            break
+                        for mod in new_mods:
+                            cloudpickle.register_pickle_by_value(mod)
+                            auto.append(mod)
+                    risky = unregistered_local_refs(cloudpickle.dumps(env_class))
+                finally:
+                    for mod in auto:
+                        try:
+                            cloudpickle.unregister_pickle_by_value(mod)
+                        except Exception:
+                            pass
             if risky:
                 print(
                     f"  \u2717 {env_class.__name__}: missing "
@@ -589,7 +625,13 @@ def _run_local_checks(
                 )
                 failed += 1
             else:
-                print("  \u2713 no unregistered local-module references")
+                if auto:
+                    names = ", ".join(sorted(m.__name__ for m in auto))
+                    print(
+                        f"  \u2713 auto-bundled local module(s): {names} "
+                    )
+                else:
+                    print("  \u2713 no unregistered local-module references")
                 passed += 1
         except Exception as exc:
             print(f"  \u2717 local-modules check failed: {type(exc).__name__}: {exc}")

{benchmax-0.1.2.dev30 → benchmax-0.1.2.dev31}/src/benchmax/rag/corpus/pinecone/index_client.py RENAMED Viewed

@@ -60,9 +60,17 @@ class PineconeIndexClient:
         embed_model: Pinecone hosted embedding model name.  Ignored when
             ``embed_fn`` is provided.  Defaults to
             ``"multilingual-e5-large"``.
-        field_mapping: Maps *Pinecone metadata field names* → *internal
-            field names*.  Useful for "bring your own index" scenarios where
-            the user's metadata schema differs from the default.
+        field_mapping: Low-level escape hatch — maps *Pinecone metadata
+            field names* → *internal field names* for schemas that also
+            relocate structural fields (``file_path``, ``chunk_index``,
+            headers).  For the common "my text is under a different key"
+            case, prefer ``content_field``.
+        content_field: Pinecone metadata key holding the chunk text, for
+            "bring your own index" schemas that don't use ``content`` (e.g.
+            ``"summary"`` or ``"passage"``).  The canonical way to point at
+            your text column.  Empty / None means the default ``content``
+            key.  Raises if ``field_mapping`` already maps a *different*
+            key to ``content``.
     """
     def __init__(
@@ -75,15 +83,35 @@ class PineconeIndexClient:
         embed_fn: Callable[[list[str]], list[list[float]]] | None = None,
         embed_model: str = "multilingual-e5-large",
         field_mapping: dict[str, str] | None = None,
+        content_field: str | None = None,
     ) -> None:
         # Store config for lazy init / pickle safety.
         self._api_key = api_key
         self._index_name = index_name
         self._index_host = index_host
-        self._namespace = namespace
+        # Platform codegen may pass None for an unset namespace; Pinecone's
+        # default namespace is "".
+        self._namespace = namespace or ""
         self._embed_model = embed_model
         self.embed_fn = embed_fn or self._build_pinecone_embed_fn()
-        self._field_mapping = field_mapping or dict(DEFAULT_FIELD_MAPPING)
+        mapping = dict(field_mapping) if field_mapping else dict(DEFAULT_FIELD_MAPPING)
+        if content_field and content_field != "content":
+            conflicting = [
+                k
+                for k, v in mapping.items()
+                if v == "content" and k not in ("content", content_field)
+            ]
+            if field_mapping and conflicting:
+                raise ValueError(
+                    f"content_field={content_field!r} conflicts with field_mapping "
+                    f"entries {conflicting} that already map to 'content'. "
+                    "Specify the text column one way or the other."
+                )
+            # Drop the default content→content entry so the reverse mapping
+            # resolves "content" to the custom key unambiguously.
+            mapping.pop("content", None)
+            mapping[content_field] = "content"
+        self._field_mapping = mapping
         # Reverse mapping: internal name → pinecone metadata key
         self._reverse_mapping = {v: k for k, v in self._field_mapping.items()}
         self._index: Any | None = None
@@ -91,6 +119,8 @@ class PineconeIndexClient:
         self._known_ids: list[str] | None = None
         # Cached vector dimension (detected on first embed or describe_index).
         self._vector_dim: int | None = None
+        # Cached index vector type ("dense" | "sparse"), probed lazily.
+        self._vector_type: str | None = None
     def _build_pinecone_embed_fn(self) -> Callable[[list[str]], list[list[float]]]:
         """Build an embed_fn using Pinecone's hosted Inference API.
@@ -157,6 +187,35 @@ class PineconeIndexClient:
                 self._index = pc.Index(self._index_name)
         return self._index
+    def vector_type(self) -> str:
+        """Return the index vector type, ``"dense"`` or ``"sparse"``.
+        Probes the index via ``describe_index_stats`` on first call and
+        caches the result.
+        """
+        if self._vector_type is None:
+            index = self._get_index()
+            stats = index.describe_index_stats()
+            self._vector_type = getattr(stats, "vector_type", None) or "dense"
+        return self._vector_type
+    def namespace_vector_count(self) -> int:
+        """Return the vector count for this client's namespace.
+        Scoped to the namespace, NOT the index-wide total — an index-wide
+        count would disagree with what list/fetch/query in this namespace
+        can actually see.  The SDK keys the default namespace as
+        ``"__default__"`` (the REST API uses ``""``).
+        """
+        stats = self._get_index().describe_index_stats()
+        namespaces = getattr(stats, "namespaces", None) or {}
+        ns_stats = namespaces.get(self._namespace or "__default__")
+        if ns_stats is None and not self._namespace:
+            ns_stats = namespaces.get("")
+        if ns_stats is None:
+            return 0
+        return int(getattr(ns_stats, "vector_count", 0) or 0)
     def zero_vector(self) -> list[float]:
         """Return a zero-vector with the correct dimension for this index.
@@ -168,6 +227,12 @@ class PineconeIndexClient:
             index = self._get_index()
             stats = index.describe_index_stats()
             self._vector_dim = stats.dimension
+        if self._vector_dim is None:
+            # Sparse indexes have no fixed dimension.
+            raise ValueError(
+                f"Pinecone index '{self._index_name}' has no dimension — it is "
+                "a sparse index, which has no dense zero-vector."
+            )
         return [0.0] * self._vector_dim
     # ------------------------------------------------------------------
@@ -305,6 +370,14 @@ class PineconeIndexClient:
         include_metadata: bool = True,
     ) -> Any:
         """Run a vector query against the index."""
+        if self.vector_type() == "sparse":
+            # A dense query vector against a sparse index is rejected by
+            # Pinecone with an opaque error; fail with an actionable one.
+            raise ValueError(
+                f"Pinecone index '{self._index_name}' is a sparse index — "
+                "search against sparse indexes is not supported yet. "
+                "Use a dense index."
+            )
         index = self._get_index()
         kwargs: dict[str, Any] = {
             "vector": vector,

{benchmax-0.1.2.dev30 → benchmax-0.1.2.dev31}/src/benchmax/rag/corpus/pinecone/search.py RENAMED Viewed

@@ -36,6 +36,8 @@ class PineconeSearch:
         embed_model: Pinecone hosted embedding model name. Ignored
             when ``embed_fn`` is provided.
         field_mapping: Maps Pinecone metadata keys to internal names.
+        content_field: Pinecone metadata key holding the chunk text — sugar
+            over ``field_mapping`` for BYO indexes that don't use ``content``.
         token_provider: Optional override — a callable resolving the key per
             call, or a literal key (string sugar). Defaults to reading
             ``PINECONE_API_KEY``.
@@ -50,6 +52,7 @@ class PineconeSearch:
         embed_fn: Callable[[list[str]], list[list[float]]] | None = None,
         embed_model: str = "multilingual-e5-large",
         field_mapping: dict[str, str] | None = None,
+        content_field: str | None = None,
         token_provider: str | TokenProvider | None = None,
     ) -> None:
         self._index_name = index_name
@@ -58,6 +61,7 @@ class PineconeSearch:
         self._embed_fn = embed_fn
         self._embed_model = embed_model
         self._field_mapping = field_mapping
+        self._content_field = content_field
         self._token_provider = as_token_provider(
             token_provider, env_token("PINECONE_API_KEY")
         )
@@ -75,6 +79,7 @@ class PineconeSearch:
                 embed_fn=self._embed_fn,
                 embed_model=self._embed_model,
                 field_mapping=self._field_mapping,
+                content_field=self._content_field,
             )
         return self._client

{benchmax-0.1.2.dev30 → benchmax-0.1.2.dev31}/src/benchmax/rag/corpus/pinecone/source.py RENAMED Viewed

@@ -26,6 +26,9 @@ from .index_client import PineconeIndexClient
 logger = logging.getLogger(__name__)
+#: Max IDs per vectors/fetch call — Pinecone caps fetch batches at 100.
+_FETCH_BATCH_SIZE = 100
 def _raw_to_chunk(raw: dict[str, Any]) -> Chunk:
     """Convert a raw dict from PineconeIndexClient to a Chunk."""
@@ -64,8 +67,13 @@ class PineconeChunkSource:
         embed_model: Pinecone hosted embedding model name.  Ignored when
             ``embed_fn`` is provided.  Defaults to
             ``"multilingual-e5-large"``.
-        field_mapping: Maps Pinecone metadata field names to internal names.
-            Useful for "bring your own index" scenarios.
+        field_mapping: Low-level escape hatch — maps Pinecone metadata field
+            names to internal names when structural fields (``file_path``,
+            ``chunk_index``, headers) are also relocated.  For the common
+            case, prefer ``content_field``.
+        content_field: Pinecone metadata key holding the chunk text — the
+            canonical way to point at your text column for pre-existing
+            indexes that don't use ``content``.
     Example:
         >>> # Using Pinecone's built-in embeddings (simplest)
@@ -82,12 +90,12 @@ class PineconeChunkSource:
         ...     embed_fn=my_embed_fn,
         ... )
-        >>> # Pre-existing index with custom field names
+        >>> # Pre-existing index whose text lives under another key
         >>> source = PineconeChunkSource(
         ...     api_key="pcsk_...",
         ...     index_name="product-catalog",
         ...     embed_model="llama-text-embed-v2",
-        ...     field_mapping={"description": "content", "path": "file_path"},
+        ...     content_field="description",
         ... )
     """
@@ -101,6 +109,7 @@ class PineconeChunkSource:
         embed_fn: Callable[[list[str]], list[list[float]]] | None = None,
         embed_model: str = "multilingual-e5-large",
         field_mapping: dict[str, str] | None = None,
+        content_field: str | None = None,
     ) -> None:
         self._client = PineconeIndexClient(
             api_key=api_key,
@@ -110,6 +119,7 @@ class PineconeChunkSource:
             embed_fn=embed_fn,
             embed_model=embed_model,
             field_mapping=field_mapping,
+            content_field=content_field,
         )
         self._files = FileAwareness(self._client)
@@ -237,40 +247,56 @@ class PineconeChunkSource:
     # ------------------------------------------------------------------
     def get_chunk_count(self) -> int:
-        """Return the total number of vectors in the index."""
-        index = self._client._get_index()
-        stats = index.describe_index_stats()
-        return int(stats.total_vector_count or 0)
+        """Return the number of vectors in the configured namespace.
+        Scoped to the namespace this source reads from — an index-wide
+        total would disagree with what sampling/search can actually see.
+        """
+        return self._client.namespace_vector_count()
     def sample_chunks(self, n: int, min_chars: int = 0) -> list[Chunk]:
         """Return n randomly sampled chunks, optionally filtered by
         minimum length.
-        Uses a random vector query to get pseudo-random results
-        efficiently in a single API call.
+        Samples uniformly from the paginated ID listing and hydrates the
+        sample via fetch — no query vector involved, so the draw is
+        genuinely uniform (not nearest-to-a-random-point) and works for
+        dense and sparse indexes alike.
         """
-        # Generate a random vector for pseudo-random sampling
-        dim = len(self._client.zero_vector())
-        rand_vec = [random.gauss(0, 1) for _ in range(dim)]
-        # Fetch more than needed to allow for min_chars filtering
-        fetch_k = min(n * 3, 10000) if min_chars > 0 else min(n, 10000)
-        result = self._client.query(
-            vector=rand_vec,
-            top_k=fetch_k,
-            include_metadata=True,
-        )
-        matches = result.matches or []
-        if not matches:
+        # Oversample when a length filter will discard part of the draw
+        fetch_n = min(n * 3, 10000) if min_chars > 0 else min(n, 10000)
+        ids = self._client.sample_ids(fetch_n)
+        if not ids:
             return []
-        chunks = [_raw_to_chunk(self._client.match_to_raw(m)) for m in matches]
+        raws: list[dict[str, Any]] = []
+        for batch_start in range(0, len(ids), _FETCH_BATCH_SIZE):
+            raws.extend(
+                self._client.fetch_by_ids_raw(
+                    ids[batch_start : batch_start + _FETCH_BATCH_SIZE]
+                )
+            )
+        chunks = [_raw_to_chunk(r) for r in raws]
+        # Every fetched record decoding to empty content means the text key
+        # is wrong (BYO index whose schema doesn't use the configured field),
+        # not that the corpus is empty. Without this, the pipeline dies later
+        # with an unactionable "No eligible chunks were found".
+        if chunks and all(not c.content for c in chunks):
+            content_key = self._client._pc_field("content")
+            seen_keys = sorted(
+                {k for r in raws for k in r.get("metadata", {}) if not k.startswith("_")}
+            )
+            raise ValueError(
+                f"No text found under metadata field '{content_key}' in any "
+                f"sampled record. This index's metadata fields are: "
+                f"{seen_keys}. Set content_field to the one holding the "
+                f"chunk text."
+            )
         if min_chars > 0:
             chunks = [c for c in chunks if len(c.content) >= min_chars]
-        # Shuffle to avoid bias from similarity ordering
         random.shuffle(chunks)
         return chunks[:n]

{benchmax-0.1.2.dev30 → benchmax-0.1.2.dev31}/src/benchmax/rag/corpus/turbopuffer/namespace.py RENAMED Viewed

@@ -19,6 +19,27 @@ from benchmax.rag.corpus.search_schema.search_types import (
 )
+def resolve_content_attr(
+    content_attr: list[str] | None, content_field: str | None
+) -> list[str] | None:
+    """Resolve the ``content_field`` sugar against an explicit ``content_attr``.
+    ``content_field`` is the canonical single-column param; ``content_attr``
+    is the low-level multi-field escape hatch.  Specifying the text column
+    both ways with different values raises instead of silently picking a
+    winner.
+    """
+    if not content_field:
+        return content_attr
+    if content_attr is not None and content_attr != [content_field]:
+        raise ValueError(
+            f"content_field={content_field!r} conflicts with "
+            f"content_attr={content_attr!r}. Specify the text column one way "
+            "or the other."
+        )
+    return [content_field]
 class TpufNamespace:
     """Thin wrapper around a Turbopuffer namespace.

benchmax 0.1.2.dev30__tar.gz → 0.1.2.dev31__tar.gz

benchmax 0.1.2.dev30__tar.gz → 0.1.2.dev31__tar.gz