PyPI - benchmax - Versions diffs - 0.1.2.dev30__tar.gz → 0.1.2.dev33__tar.gz - Mend

benchmax 0.1.2.dev30__tar.gz → 0.1.2.dev33__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (173) hide show

{benchmax-0.1.2.dev30/src/benchmax.egg-info → benchmax-0.1.2.dev33}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: benchmax
-Version: 0.1.2.dev30
+Version: 0.1.2.dev33
 Summary: Framework-Agnostic RL Environments for LLM Fine-Tuning
 Author: castie@castform.com
 Classifier: Programming Language :: Python :: 3

{benchmax-0.1.2.dev30 → benchmax-0.1.2.dev33}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "benchmax"
-version = "0.1.2.dev30"
+version = "0.1.2.dev33"
 description = "Framework-Agnostic RL Environments for LLM Fine-Tuning"
 readme = "README.md"
 authors = [{ name = "castie@castform.com" }]

{benchmax-0.1.2.dev30 → benchmax-0.1.2.dev33}/src/benchmax/bundle.py RENAMED Viewed

@@ -1,5 +1,6 @@
 from __future__ import annotations
+import importlib
 import inspect
 import io
 import json
@@ -76,6 +77,7 @@ def dump_bundle(
     pip_dependencies: list[str] | None = None,
     local_modules: list[ModuleType] | None = None,
     env_class_source: str | None = None,
+    auto_local_modules: bool = True,
 ) -> Bundle:
     """Pickle ``(env_class, constructor_args)`` and stamp metadata.
@@ -90,6 +92,10 @@ def dump_bundle(
             recover it — e.g. a class produced by ``exec()`` into an in-memory
             namespace, which has no source file on disk. When ``None``
             (default), source is introspected from ``env_class``.
+        auto_local_modules: When True (default), any local module the pickle
+            references but that wasn't passed in ``local_modules`` is imported
+            and pickled by value automatically (a warning names them). When
+            False, such a reference raises ``BundlingError`` instead.
     Raises:
         BundlingError: bad env_class, cloudpickle failure, or pickle references
@@ -124,6 +130,46 @@ def dump_bundle(
                 except Exception:
                     pass
+    if auto_local_modules and _unregistered_local_refs(pickled):
+        # Import each referenced local module and re-dump with it pickled by
+        # value. Loop because a by-value module can surface further local refs;
+        # registrations accumulate (and are torn down once at the end) so an
+        # earlier module stays by-value while we resolve the ones it pulled in.
+        seen: set[str] = {m.__name__ for m in local_modules}
+        registered: list[ModuleType] = []
+        with _BUNDLE_LOCK:
+            try:
+                for _ in range(10):
+                    pending = [
+                        m for m in _unregistered_local_refs(pickled) if m not in seen
+                    ]
+                    if not pending:
+                        break
+                    new_mods: list[ModuleType] = []
+                    for name in pending:
+                        seen.add(name)  # unimportable names fall through to the guard
+                        try:
+                            new_mods.append(importlib.import_module(name))
+                        except Exception:
+                            pass
+                    if not new_mods:
+                        break
+                    logger.warning(
+                        "[bundle] %s: auto-bundling local module(s): %s ",
+                        env_class.__name__,
+                        ", ".join(sorted(m.__name__ for m in new_mods)),
+                    )
+                    for mod in new_mods:
+                        cloudpickle.register_pickle_by_value(mod)
+                        registered.append(mod)
+                    pickled = cloudpickle.dumps((env_class, constructor_args))
+            finally:
+                for mod in registered:
+                    try:
+                        cloudpickle.unregister_pickle_by_value(mod)
+                    except Exception:
+                        pass
     risky = _unregistered_local_refs(pickled)
     if risky:
         msg = (
@@ -259,6 +305,15 @@ def _referenced_modules(pickled: bytes) -> set[str]:
     # Hooks find_class so we see every (module, name) the unpickler would import —
     # i.e. exactly what'd raise ModuleNotFoundError on a fresh interpreter. The stub
     # lets unpickling proceed past missing classes so we collect every ref.
+    #
+    # find_class alone has a blind spot: a bare ``import foo`` that leaves a
+    # module *object* in the env's globals is pickled as
+    # ``cloudpickle.subimport("foo")`` — the module name is a REDUCE argument,
+    # not a find_class path, so we'd only see ``cloudpickle.cloudpickle`` (which
+    # looks installed) and miss ``foo``. We shim subimport to record its arg and
+    # return a stub instead of importing, so a missing module is captured rather
+    # than aborting the whole load early. (``dynamic_subimport`` is by-value /
+    # self-contained — leave it to the real find_class so we don't flag it.)
     refs: set[str] = set()
     class _Stub:
@@ -271,9 +326,28 @@ def _referenced_modules(pickled: bytes) -> set[str]:
         def __reduce__(self) -> tuple:
             return (type(self), ())
+    def _recording_subimport(name: str, *a: Any, **kw: Any) -> ModuleType:
+        refs.add(name)
+        return ModuleType(str(name))
+    def _noop_setstate(obj: Any, *a: Any, **kw: Any) -> Any:
+        # cloudpickle's _make_skeleton_class resolves the class_tracker_id back
+        # to the *live* class (it was tracked when env_class was dumped), so the
+        # real ``_class_setstate``/``_function_setstate`` would setattr the
+        # reconstructed (stub-globals) members onto the live class/function —
+        # mutating the caller's class mid-bundle and poisoning any later dump.
+        # We only need the refs from ``state``, which are already recorded while
+        # it's unpickled; the setter itself is a no-op here.
+        return obj
     class _Recorder(pickle.Unpickler):
         def find_class(self, module: str, name: str) -> Any:
             refs.add(module)
+            if module.startswith("cloudpickle"):
+                if name == "subimport":
+                    return _recording_subimport
+                if name in ("_class_setstate", "_function_setstate"):
+                    return _noop_setstate
             try:
                 return super().find_class(module, name)
             except Exception:

{benchmax-0.1.2.dev30 → benchmax-0.1.2.dev33}/src/benchmax/envs/postgres_search/search_env.py RENAMED Viewed

@@ -285,14 +285,8 @@ tags. Cite your sources inline using [Source: <source_id>] next to each claim.
             if not text.strip():
                 return zeros
-            # No final <answer> block → no answer to score. Return all-zero
-            # rewards so conciseness / citations / efficiency can't accrue
-            # from reasoning or tool-call text alone.
-            answer = extract_answer_block(text)
-            if not answer:
-                return zeros
             t = task or {}
+            answer = extract_answer_block(text)
             prompt = str(t.get("question") or t.get("prompt") or "")
             gt_str = str(t.get("ground_truth") or "")
             reference_chunks = t.get("reference_chunks", [])

{benchmax-0.1.2.dev30 → benchmax-0.1.2.dev33}/src/benchmax/envs/reward_helpers.py RENAMED Viewed

@@ -82,16 +82,9 @@ def extract_completion_text(completion: str | list[dict[str, Any]]) -> str:
 def extract_answer_block(text: str) -> str:
-    """Extract content from ``<answer>`` tags.
-    Returns the (stripped) tag contents when an ``<answer>…</answer>`` block
-    is present, otherwise ``""``. A missing answer block is treated as "no
-    final answer" rather than silently falling back to the full completion —
-    consumers can gate rewards on a non-empty result. ``<answer></answer>``
-    likewise yields ``""``.
-    """
+    """Extract content from <answer> tags, or return full text."""
     match = _ANSWER_TAG_RE.search(text or "")
-    return match.group(1).strip() if match else ""
+    return (match.group(1) if match else text).strip()
 def clip01(value: Any) -> float:
@@ -169,10 +162,8 @@ def citation_score(
                 ref_ids.add(norm_sid)
             break
-    if not cited:
+    if not cited or not ref_ids:
         return {"precision": 0.0, "recall": 0.0}
-    if not ref_ids:
-        return {"precision": 1.0, "recall": 0.0}
     precision = len(cited & ref_ids) / len(cited)
     recall = len(cited & ref_ids) / len(ref_ids)

{benchmax-0.1.2.dev30 → benchmax-0.1.2.dev33}/src/benchmax/envs/telestich/example.py RENAMED Viewed

@@ -12,10 +12,12 @@ Run it from the benchmax project root (the ``telestich`` extra pulls in the
 env's word-list / rhyme dependencies):
     cd core/benchmax
-    CASTFORM_API_KEY=sk_... \
-        uv run --extra telestich python -m benchmax.envs.telestich.example
+    uv run --extra telestich python -m benchmax.envs.telestich.example
-(``CASTFORM_LLM_API_KEY`` is optional — it defaults to ``CASTFORM_API_KEY``.)
+Auth is the device-auth session (``ensure_session()`` opens a browser login if
+``~/.castform`` has no valid session) — no API key needed. ``CASTFORM_API_KEY``
+/ ``CASTFORM_LLM_API_KEY`` are only consulted by the offline dataset-generation
+helpers, not the launch path.
 This launches a real training run on the full committed seed dataset
 (~90/10 train/eval split).
@@ -63,6 +65,8 @@ CONCURRENCY = 15
 # pool) server-side. Supported: "Qwen/Qwen3.5-4B" (gpu4) or "Qwen/Qwen3.5-35B-A3B"
 # (gpu8). Override via TELESTICH_MODEL.
 MODEL = os.environ.get("TELESTICH_MODEL", "Qwen/Qwen3.5-4B")
+# Run name — defaults to a unique telestich-full-<uuid>. Override via TELESTICH_RUN_NAME.
+RUN_NAME = os.environ.get("TELESTICH_RUN_NAME", "")
 # (model, weight). Weights reflect observed reliability on our checks:
 # - Both grok models leak banned example words and rubber-stamp the CoT self-check.
@@ -558,12 +562,15 @@ def get_dataset():
 if __name__ == "__main__":
     import uuid
+    from benchmax.platform import ensure_session
     from benchmax.platform.client import TrainerClient
     from benchmax.platform.training_run import upload_training_run
     from benchmax.platform.validation import validate_env
-    if not API_KEY:
-        raise SystemExit("Set CASTFORM_API_KEY before running this example.")
+    # Device-auth session bootstrap: browser login if no credential resolves.
+    # After this the platform bearer comes from ~/.castform — no API key needed,
+    # so we pass api_key="" to the platform calls below (resolves via the seam).
+    ensure_session()
     print(f"Platform URL: {BASE_URL}")
     print(f"LLM URL:      {LLM_BASE_URL}\n")
@@ -603,7 +610,7 @@ if __name__ == "__main__":
         eval_dataset=eval_data[:2],
         local_modules=local_modules,
         pip_dependencies=pip_dependencies,
-        api_key=API_KEY,
+        api_key="",  # session bearer via ensure_session()
         base_url=BASE_URL,
         llm_base_url=LLM_BASE_URL,
         llm_api_key="",
@@ -614,14 +621,14 @@ if __name__ == "__main__":
         )
     # 3. Bundle the env class and upload everything to platform storage.
-    run_name = f"telestich-full-{uuid.uuid4().hex[:8]}"
+    run_name = RUN_NAME or f"telestich-full-{uuid.uuid4().hex[:8]}"
     print(f"\nUploading bundle + datasets as {run_name!r} ...")
     uploaded = upload_training_run(
         env_class=TelestichEnv,
         train_dataset=train_data,
         eval_dataset=eval_data,
         run_name=run_name,
-        api_key=API_KEY,
+        api_key="",  # session bearer via ensure_session()
         base_url=BASE_URL,
         local_modules=local_modules,
         constructor_args=constructor_args,
@@ -638,7 +645,7 @@ if __name__ == "__main__":
     # 4. Launch the training run. training_run_type="simple" + the `model` arg select
     #    the trainer YAML/pool server-side (Qwen3.5-4B→gpu4, Qwen3.5-35B-A3B→gpu8).
     print(f"\nLaunching training run (model={MODEL}) ...")
-    with TrainerClient(api_key=API_KEY, base_url=BASE_URL) as trainer:
+    with TrainerClient(api_key="", base_url=BASE_URL) as trainer:
         run_id = trainer.launch_training_run(
             training_run_type="simple",
             env_cls_path=uploaded.env_cls_path,
@@ -647,10 +654,10 @@ if __name__ == "__main__":
             eval_dataset_path=uploaded.eval_dataset_path,
             name=run_name,
             # num_epochs: passes over the train set (platform default is 5).
-            # max_response_len 3000: a brief reason + 1-2 tool rounds + poem fits well
+            # max_rollout_len 3000: a brief reason + 1-2 tool rounds + poem fits well
             # under this; lowered from 4000 to cut off in-head enumeration rambles
             # sooner (they truncate to a 0-reward anyway).
-            launcher_args={"model": MODEL, "max_response_len": 3000, "num_epochs": 10},
+            launcher_args={"model": MODEL, "max_rollout_len": 3000, "num_epochs": 10},
         )
     print(f"\n✓ Launched run_id={run_id}")

{benchmax-0.1.2.dev30 → benchmax-0.1.2.dev33}/src/benchmax/platform/client.py RENAMED Viewed

@@ -7,6 +7,7 @@ import hashlib
 import json
 import logging
 import textwrap
+import warnings
 from collections.abc import Iterator
 from dataclasses import dataclass, field
 from pathlib import Path
@@ -404,7 +405,7 @@ class TrainerClient:
             eval_dataset_path: Path to the evaluation dataset
             name: Optional name for the training run
             launcher_args: Extra launcher args forwarded to the server
-                (e.g. {"max_response_len": 4000}). The 4 required paths
+                (e.g. {"max_rollout_len": 4000}). The 4 required paths
                 above always take precedence.
         Returns:
@@ -431,8 +432,11 @@ class TrainerClient:
         )
         self._handle_response_errors(response)
         body = response.json()
+        # Surface soft-cap / OOM-risk warnings via the warnings module (shown by
+        # default in notebooks/REPL) — a bare logger.warning is swallowed unless
+        # the caller configured logging.
         for warning in body.get("warnings", []) or []:
-            logger.warning("launch warning: %s", warning)
+            warnings.warn(f"launch warning: {warning}", stacklevel=2)
         return body["runId"]
     def list_launch_args(self) -> list[LaunchArgSpec]:

{benchmax-0.1.2.dev30 → benchmax-0.1.2.dev33}/src/benchmax/platform/validation.py RENAMED Viewed

@@ -7,6 +7,7 @@ the env class contract matches what the trainer expects.
 from __future__ import annotations
 import asyncio
+import importlib
 import json
 import math
 import tempfile
@@ -578,6 +579,41 @@ def _run_local_checks(
             from benchmax.bundle import unregistered_local_refs
             risky = unregistered_local_refs(cloudpickle.dumps(env_class))
+            # Mirror dump_bundle's auto_local_modules: import + pickle-by-value
+            # any local refs the user didn't list, so validation reflects what
+            # the bundle will actually contain. Only genuinely unimportable refs
+            # (which the trainer also couldn't load) remain to be flagged.
+            auto: list[ModuleType] = []
+            if risky:
+                seen: set[str] = set()
+                try:
+                    for _ in range(10):
+                        pending = [
+                            m
+                            for m in unregistered_local_refs(cloudpickle.dumps(env_class))
+                            if m not in seen
+                        ]
+                        if not pending:
+                            break
+                        new_mods: list[ModuleType] = []
+                        for name in pending:
+                            seen.add(name)
+                            try:
+                                new_mods.append(importlib.import_module(name))
+                            except Exception:
+                                pass
+                        if not new_mods:
+                            break
+                        for mod in new_mods:
+                            cloudpickle.register_pickle_by_value(mod)
+                            auto.append(mod)
+                    risky = unregistered_local_refs(cloudpickle.dumps(env_class))
+                finally:
+                    for mod in auto:
+                        try:
+                            cloudpickle.unregister_pickle_by_value(mod)
+                        except Exception:
+                            pass
             if risky:
                 print(
                     f"  \u2717 {env_class.__name__}: missing "
@@ -589,7 +625,13 @@ def _run_local_checks(
                 )
                 failed += 1
             else:
-                print("  \u2713 no unregistered local-module references")
+                if auto:
+                    names = ", ".join(sorted(m.__name__ for m in auto))
+                    print(
+                        f"  \u2713 auto-bundled local module(s): {names} "
+                    )
+                else:
+                    print("  \u2713 no unregistered local-module references")
                 passed += 1
         except Exception as exc:
             print(f"  \u2717 local-modules check failed: {type(exc).__name__}: {exc}")

{benchmax-0.1.2.dev30 → benchmax-0.1.2.dev33}/src/benchmax/rag/corpus/chroma/client.py RENAMED Viewed

@@ -16,6 +16,13 @@ from typing import Any
 # Sparse-key name used when setting up BM25 schema
 BM25_KEY = "bm25_embedding"
+# Embedding functions that run server-side on Chroma Cloud (embed.trychroma.com)
+# — querying a collection that uses one never downloads a model. Everything else
+# (default all-MiniLM, sentence-transformers / HF / Ollama / ONNX locals,
+# third-party API EFs, or no EF) is treated as unsafe. Add hosted names here as
+# they are verified server-side.
+_SERVER_SIDE_EF_NAMES = frozenset({"chroma-cloud-qwen"})
 def has_search_api() -> bool:
     """Return True when the chromadb package exposes the Search API."""
@@ -176,6 +183,29 @@ class ChromaClient:
         return self._collection
+    def dense_embed_is_safe(self) -> bool:
+        """True when a dense (vector) query embeds WITHOUT downloading a model.
+        Safe only when we can produce vectors without a client-side model
+        download: either a caller-supplied ``embed_fn``, or a Chroma-hosted
+        server-side embedding function (embeds at embed.trychroma.com). Every
+        other embedder — chromadb's default all-MiniLM, sentence-transformers /
+        HuggingFace / Ollama / ONNX locals, third-party API EFs we lack keys
+        for, or no EF at all — is treated as UNSAFE, so callers refuse the dense
+        path rather than trigger a model download. Conservative by design: an
+        unknown embedder is unsafe.
+        """
+        if self.embed_fn is not None:
+            return True
+        col = self._collection
+        if col is None:
+            return False
+        try:
+            ef = (col._model.configuration_json or {}).get("embedding_function") or {}
+        except Exception:
+            return False
+        return ef.get("name") in _SERVER_SIDE_EF_NAMES
     @staticmethod
     def _repair_cloud_embedding_function(collection: Any) -> None:
         """Attach a working EF when chromadb can't rebuild a Cloud hosted one.

{benchmax-0.1.2.dev30 → benchmax-0.1.2.dev33}/src/benchmax/rag/corpus/chroma/search.py RENAMED Viewed

@@ -10,6 +10,9 @@ from collections.abc import Callable
 from typing import Any
 from benchmax.platform.credentials import TokenProvider, as_token_provider, env_token
+from benchmax.rag.corpus.search_schema.search_exceptions import (
+    LocalEmbeddingDownloadDisallowedError,
+)
 class ChromaSearch:
@@ -113,19 +116,33 @@ class ChromaSearch:
     ) -> list[dict[str, Any]]:
         """Search and return structured results."""
         client = self._get_client()
-        if mode == "auto":
-            modes = client.modes
+        # Initialize the collection first so capabilities reflect the real index
+        # (BM25 downgrade) and the embedder config is readable below.
+        client.get_collection()
+        modes = client.modes
+        has_lexical = "lexical" in modes
+        # Never download a client-side embedding model at inference/rollout time.
+        # When a dense embed isn't safe — no embed_fn and no Chroma-hosted
+        # server-side embedding function — use the BM25 lexical index if the
+        # collection has one, otherwise refuse rather than fetch all-MiniLM.
+        if not client.dense_embed_is_safe():
+            if not has_lexical:
+                raise LocalEmbeddingDownloadDisallowedError(
+                    "chroma", self._collection_name
+                )
+            mode = "lexical"
+        elif mode == "auto":
             if "hybrid" in modes:
                 mode = "hybrid"
-            elif "lexical" in modes:
+            elif has_lexical:
                 mode = "lexical"
             else:
                 mode = "vector"
-        elif mode not in client.modes:
+        elif mode not in modes:
             raise ValueError(
                 f"ChromaSearch does not support mode '{mode}'. "
-                f"Available modes: {sorted(client.modes)}"
+                f"Available modes: {sorted(modes)}"
             )
         if client.search_api and mode in ("lexical", "hybrid"):

{benchmax-0.1.2.dev30 → benchmax-0.1.2.dev33}/src/benchmax/rag/corpus/chroma/source.py RENAMED Viewed

@@ -17,6 +17,7 @@ from tqdm.auto import tqdm
 from benchmax.rag.chunkers.models import Chunk, ChunkCollection
 from benchmax.rag.corpus.search_schema.search_exceptions import (
     InvalidSearchSpecError,
+    LocalEmbeddingDownloadDisallowedError,
     UnsupportedSearchModeError,
 )
 from benchmax.rag.corpus.search_schema.search_types import (
@@ -642,23 +643,30 @@ class ChromaChunkSource:
         # lack a BM25 index, in which case modes was downgraded to vector-only.
         modes = self._current_modes()
-        # Pick mode. "hybrid"/None use the best available strategy and KEEP
-        # lexical enabled as a fallback: hybrid = dense + sparse, and when we
-        # can't produce dense query vectors (no embed_fn, the usual remote case)
-        # the per-query loop below degrades to the sparse/lexical leg — which
-        # needs no embedding. Only an explicit "vector" disables lexical; that's
-        # the dense-only recovery path a caller uses after a lexical/hybrid
-        # failure. (Disabling lexical for "hybrid" silently forced vector search,
-        # which made remote collections dense-embed every query — slow, and on a
-        # default-EF collection it pulls the all-MiniLM model.)
-        if mode == "vector":
-            use_hybrid = use_lexical = False
+        has_lexical = "lexical" in modes
+        has_hybrid = "hybrid" in modes
+        # Hard rule: never let chromadb embed a query with a client-side model
+        # (it downloads all-MiniLM and crawls in constrained executors). When a
+        # dense embed isn't safe — no embed_fn and no Chroma-hosted server-side
+        # embedding function — use the BM25 lexical index if the collection has
+        # one, otherwise refuse. This covers every requested mode, including the
+        # linker's "inference" preference for vector.
+        if not self._chroma.dense_embed_is_safe():
+            if not has_lexical:
+                raise LocalEmbeddingDownloadDisallowedError(
+                    "chroma", self._chroma.collection_name
+                )
+            use_hybrid = False
+            use_lexical = True
         elif mode == "lexical":
             use_hybrid = False
-            use_lexical = "lexical" in modes
+            use_lexical = has_lexical
+        elif mode == "vector":
+            use_hybrid = use_lexical = False
         else:  # "hybrid", None, or unrecognized -> best available
-            use_hybrid = "hybrid" in modes
-            use_lexical = "lexical" in modes
+            use_hybrid = has_hybrid
+            use_lexical = has_lexical
         # Batch-embed all queries when embed_fn available and vectors needed
         vectors: list[list[float]] | None = None

{benchmax-0.1.2.dev30 → benchmax-0.1.2.dev33}/src/benchmax/rag/corpus/pinecone/index_client.py RENAMED Viewed

@@ -60,9 +60,17 @@ class PineconeIndexClient:
         embed_model: Pinecone hosted embedding model name.  Ignored when
             ``embed_fn`` is provided.  Defaults to
             ``"multilingual-e5-large"``.
-        field_mapping: Maps *Pinecone metadata field names* → *internal
-            field names*.  Useful for "bring your own index" scenarios where
-            the user's metadata schema differs from the default.
+        field_mapping: Low-level escape hatch — maps *Pinecone metadata
+            field names* → *internal field names* for schemas that also
+            relocate structural fields (``file_path``, ``chunk_index``,
+            headers).  For the common "my text is under a different key"
+            case, prefer ``content_field``.
+        content_field: Pinecone metadata key holding the chunk text, for
+            "bring your own index" schemas that don't use ``content`` (e.g.
+            ``"summary"`` or ``"passage"``).  The canonical way to point at
+            your text column.  Empty / None means the default ``content``
+            key.  Raises if ``field_mapping`` already maps a *different*
+            key to ``content``.
     """
     def __init__(
@@ -75,15 +83,35 @@ class PineconeIndexClient:
         embed_fn: Callable[[list[str]], list[list[float]]] | None = None,
         embed_model: str = "multilingual-e5-large",
         field_mapping: dict[str, str] | None = None,
+        content_field: str | None = None,
     ) -> None:
         # Store config for lazy init / pickle safety.
         self._api_key = api_key
         self._index_name = index_name
         self._index_host = index_host
-        self._namespace = namespace
+        # Platform codegen may pass None for an unset namespace; Pinecone's
+        # default namespace is "".
+        self._namespace = namespace or ""
         self._embed_model = embed_model
         self.embed_fn = embed_fn or self._build_pinecone_embed_fn()
-        self._field_mapping = field_mapping or dict(DEFAULT_FIELD_MAPPING)
+        mapping = dict(field_mapping) if field_mapping else dict(DEFAULT_FIELD_MAPPING)
+        if content_field and content_field != "content":
+            conflicting = [
+                k
+                for k, v in mapping.items()
+                if v == "content" and k not in ("content", content_field)
+            ]
+            if field_mapping and conflicting:
+                raise ValueError(
+                    f"content_field={content_field!r} conflicts with field_mapping "
+                    f"entries {conflicting} that already map to 'content'. "
+                    "Specify the text column one way or the other."
+                )
+            # Drop the default content→content entry so the reverse mapping
+            # resolves "content" to the custom key unambiguously.
+            mapping.pop("content", None)
+            mapping[content_field] = "content"
+        self._field_mapping = mapping
         # Reverse mapping: internal name → pinecone metadata key
         self._reverse_mapping = {v: k for k, v in self._field_mapping.items()}
         self._index: Any | None = None
@@ -91,6 +119,8 @@ class PineconeIndexClient:
         self._known_ids: list[str] | None = None
         # Cached vector dimension (detected on first embed or describe_index).
         self._vector_dim: int | None = None
+        # Cached index vector type ("dense" | "sparse"), probed lazily.
+        self._vector_type: str | None = None
     def _build_pinecone_embed_fn(self) -> Callable[[list[str]], list[list[float]]]:
         """Build an embed_fn using Pinecone's hosted Inference API.
@@ -157,6 +187,35 @@ class PineconeIndexClient:
                 self._index = pc.Index(self._index_name)
         return self._index
+    def vector_type(self) -> str:
+        """Return the index vector type, ``"dense"`` or ``"sparse"``.
+        Probes the index via ``describe_index_stats`` on first call and
+        caches the result.
+        """
+        if self._vector_type is None:
+            index = self._get_index()
+            stats = index.describe_index_stats()
+            self._vector_type = getattr(stats, "vector_type", None) or "dense"
+        return self._vector_type
+    def namespace_vector_count(self) -> int:
+        """Return the vector count for this client's namespace.
+        Scoped to the namespace, NOT the index-wide total — an index-wide
+        count would disagree with what list/fetch/query in this namespace
+        can actually see.  The SDK keys the default namespace as
+        ``"__default__"`` (the REST API uses ``""``).
+        """
+        stats = self._get_index().describe_index_stats()
+        namespaces = getattr(stats, "namespaces", None) or {}
+        ns_stats = namespaces.get(self._namespace or "__default__")
+        if ns_stats is None and not self._namespace:
+            ns_stats = namespaces.get("")
+        if ns_stats is None:
+            return 0
+        return int(getattr(ns_stats, "vector_count", 0) or 0)
     def zero_vector(self) -> list[float]:
         """Return a zero-vector with the correct dimension for this index.
@@ -168,6 +227,12 @@ class PineconeIndexClient:
             index = self._get_index()
             stats = index.describe_index_stats()
             self._vector_dim = stats.dimension
+        if self._vector_dim is None:
+            # Sparse indexes have no fixed dimension.
+            raise ValueError(
+                f"Pinecone index '{self._index_name}' has no dimension — it is "
+                "a sparse index, which has no dense zero-vector."
+            )
         return [0.0] * self._vector_dim
     # ------------------------------------------------------------------
@@ -305,6 +370,14 @@ class PineconeIndexClient:
         include_metadata: bool = True,
     ) -> Any:
         """Run a vector query against the index."""
+        if self.vector_type() == "sparse":
+            # A dense query vector against a sparse index is rejected by
+            # Pinecone with an opaque error; fail with an actionable one.
+            raise ValueError(
+                f"Pinecone index '{self._index_name}' is a sparse index — "
+                "search against sparse indexes is not supported yet. "
+                "Use a dense index."
+            )
         index = self._get_index()
         kwargs: dict[str, Any] = {
             "vector": vector,

benchmax 0.1.2.dev30__tar.gz → 0.1.2.dev33__tar.gz

benchmax 0.1.2.dev30__tar.gz → 0.1.2.dev33__tar.gz