benchmax 0.1.2.dev31__py3-none-any.whl → 0.1.2.dev33__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -285,14 +285,8 @@ tags. Cite your sources inline using [Source: <source_id>] next to each claim.
285
285
  if not text.strip():
286
286
  return zeros
287
287
 
288
- # No final <answer> block → no answer to score. Return all-zero
289
- # rewards so conciseness / citations / efficiency can't accrue
290
- # from reasoning or tool-call text alone.
291
- answer = extract_answer_block(text)
292
- if not answer:
293
- return zeros
294
-
295
288
  t = task or {}
289
+ answer = extract_answer_block(text)
296
290
  prompt = str(t.get("question") or t.get("prompt") or "")
297
291
  gt_str = str(t.get("ground_truth") or "")
298
292
  reference_chunks = t.get("reference_chunks", [])
@@ -82,16 +82,9 @@ def extract_completion_text(completion: str | list[dict[str, Any]]) -> str:
82
82
 
83
83
 
84
84
  def extract_answer_block(text: str) -> str:
85
- """Extract content from ``<answer>`` tags.
86
-
87
- Returns the (stripped) tag contents when an ``<answer>…</answer>`` block
88
- is present, otherwise ``""``. A missing answer block is treated as "no
89
- final answer" rather than silently falling back to the full completion —
90
- consumers can gate rewards on a non-empty result. ``<answer></answer>``
91
- likewise yields ``""``.
92
- """
85
+ """Extract content from <answer> tags, or return full text."""
93
86
  match = _ANSWER_TAG_RE.search(text or "")
94
- return match.group(1).strip() if match else ""
87
+ return (match.group(1) if match else text).strip()
95
88
 
96
89
 
97
90
  def clip01(value: Any) -> float:
@@ -169,10 +162,8 @@ def citation_score(
169
162
  ref_ids.add(norm_sid)
170
163
  break
171
164
 
172
- if not cited:
165
+ if not cited or not ref_ids:
173
166
  return {"precision": 0.0, "recall": 0.0}
174
- if not ref_ids:
175
- return {"precision": 1.0, "recall": 0.0}
176
167
 
177
168
  precision = len(cited & ref_ids) / len(cited)
178
169
  recall = len(cited & ref_ids) / len(ref_ids)
@@ -16,6 +16,13 @@ from typing import Any
16
16
  # Sparse-key name used when setting up BM25 schema
17
17
  BM25_KEY = "bm25_embedding"
18
18
 
19
+ # Embedding functions that run server-side on Chroma Cloud (embed.trychroma.com)
20
+ # — querying a collection that uses one never downloads a model. Everything else
21
+ # (default all-MiniLM, sentence-transformers / HF / Ollama / ONNX locals,
22
+ # third-party API EFs, or no EF) is treated as unsafe. Add hosted names here as
23
+ # they are verified server-side.
24
+ _SERVER_SIDE_EF_NAMES = frozenset({"chroma-cloud-qwen"})
25
+
19
26
 
20
27
  def has_search_api() -> bool:
21
28
  """Return True when the chromadb package exposes the Search API."""
@@ -176,6 +183,29 @@ class ChromaClient:
176
183
 
177
184
  return self._collection
178
185
 
186
+ def dense_embed_is_safe(self) -> bool:
187
+ """True when a dense (vector) query embeds WITHOUT downloading a model.
188
+
189
+ Safe only when we can produce vectors without a client-side model
190
+ download: either a caller-supplied ``embed_fn``, or a Chroma-hosted
191
+ server-side embedding function (embeds at embed.trychroma.com). Every
192
+ other embedder — chromadb's default all-MiniLM, sentence-transformers /
193
+ HuggingFace / Ollama / ONNX locals, third-party API EFs we lack keys
194
+ for, or no EF at all — is treated as UNSAFE, so callers refuse the dense
195
+ path rather than trigger a model download. Conservative by design: an
196
+ unknown embedder is unsafe.
197
+ """
198
+ if self.embed_fn is not None:
199
+ return True
200
+ col = self._collection
201
+ if col is None:
202
+ return False
203
+ try:
204
+ ef = (col._model.configuration_json or {}).get("embedding_function") or {}
205
+ except Exception:
206
+ return False
207
+ return ef.get("name") in _SERVER_SIDE_EF_NAMES
208
+
179
209
  @staticmethod
180
210
  def _repair_cloud_embedding_function(collection: Any) -> None:
181
211
  """Attach a working EF when chromadb can't rebuild a Cloud hosted one.
@@ -10,6 +10,9 @@ from collections.abc import Callable
10
10
  from typing import Any
11
11
 
12
12
  from benchmax.platform.credentials import TokenProvider, as_token_provider, env_token
13
+ from benchmax.rag.corpus.search_schema.search_exceptions import (
14
+ LocalEmbeddingDownloadDisallowedError,
15
+ )
13
16
 
14
17
 
15
18
  class ChromaSearch:
@@ -113,19 +116,33 @@ class ChromaSearch:
113
116
  ) -> list[dict[str, Any]]:
114
117
  """Search and return structured results."""
115
118
  client = self._get_client()
116
-
117
- if mode == "auto":
118
- modes = client.modes
119
+ # Initialize the collection first so capabilities reflect the real index
120
+ # (BM25 downgrade) and the embedder config is readable below.
121
+ client.get_collection()
122
+ modes = client.modes
123
+ has_lexical = "lexical" in modes
124
+
125
+ # Never download a client-side embedding model at inference/rollout time.
126
+ # When a dense embed isn't safe — no embed_fn and no Chroma-hosted
127
+ # server-side embedding function — use the BM25 lexical index if the
128
+ # collection has one, otherwise refuse rather than fetch all-MiniLM.
129
+ if not client.dense_embed_is_safe():
130
+ if not has_lexical:
131
+ raise LocalEmbeddingDownloadDisallowedError(
132
+ "chroma", self._collection_name
133
+ )
134
+ mode = "lexical"
135
+ elif mode == "auto":
119
136
  if "hybrid" in modes:
120
137
  mode = "hybrid"
121
- elif "lexical" in modes:
138
+ elif has_lexical:
122
139
  mode = "lexical"
123
140
  else:
124
141
  mode = "vector"
125
- elif mode not in client.modes:
142
+ elif mode not in modes:
126
143
  raise ValueError(
127
144
  f"ChromaSearch does not support mode '{mode}'. "
128
- f"Available modes: {sorted(client.modes)}"
145
+ f"Available modes: {sorted(modes)}"
129
146
  )
130
147
 
131
148
  if client.search_api and mode in ("lexical", "hybrid"):
@@ -17,6 +17,7 @@ from tqdm.auto import tqdm
17
17
  from benchmax.rag.chunkers.models import Chunk, ChunkCollection
18
18
  from benchmax.rag.corpus.search_schema.search_exceptions import (
19
19
  InvalidSearchSpecError,
20
+ LocalEmbeddingDownloadDisallowedError,
20
21
  UnsupportedSearchModeError,
21
22
  )
22
23
  from benchmax.rag.corpus.search_schema.search_types import (
@@ -642,23 +643,30 @@ class ChromaChunkSource:
642
643
  # lack a BM25 index, in which case modes was downgraded to vector-only.
643
644
  modes = self._current_modes()
644
645
 
645
- # Pick mode. "hybrid"/None use the best available strategy and KEEP
646
- # lexical enabled as a fallback: hybrid = dense + sparse, and when we
647
- # can't produce dense query vectors (no embed_fn, the usual remote case)
648
- # the per-query loop below degrades to the sparse/lexical leg which
649
- # needs no embedding. Only an explicit "vector" disables lexical; that's
650
- # the dense-only recovery path a caller uses after a lexical/hybrid
651
- # failure. (Disabling lexical for "hybrid" silently forced vector search,
652
- # which made remote collections dense-embed every query — slow, and on a
653
- # default-EF collection it pulls the all-MiniLM model.)
654
- if mode == "vector":
655
- use_hybrid = use_lexical = False
646
+ has_lexical = "lexical" in modes
647
+ has_hybrid = "hybrid" in modes
648
+
649
+ # Hard rule: never let chromadb embed a query with a client-side model
650
+ # (it downloads all-MiniLM and crawls in constrained executors). When a
651
+ # dense embed isn't safe — no embed_fn and no Chroma-hosted server-side
652
+ # embedding function — use the BM25 lexical index if the collection has
653
+ # one, otherwise refuse. This covers every requested mode, including the
654
+ # linker's "inference" preference for vector.
655
+ if not self._chroma.dense_embed_is_safe():
656
+ if not has_lexical:
657
+ raise LocalEmbeddingDownloadDisallowedError(
658
+ "chroma", self._chroma.collection_name
659
+ )
660
+ use_hybrid = False
661
+ use_lexical = True
656
662
  elif mode == "lexical":
657
663
  use_hybrid = False
658
- use_lexical = "lexical" in modes
664
+ use_lexical = has_lexical
665
+ elif mode == "vector":
666
+ use_hybrid = use_lexical = False
659
667
  else: # "hybrid", None, or unrecognized -> best available
660
- use_hybrid = "hybrid" in modes
661
- use_lexical = "lexical" in modes
668
+ use_hybrid = has_hybrid
669
+ use_lexical = has_lexical
662
670
 
663
671
  # Batch-embed all queries when embed_fn available and vectors needed
664
672
  vectors: list[list[float]] | None = None
@@ -43,3 +43,21 @@ class UnsupportedSearchModeError(ValueError):
43
43
  f"[{backend}] unsupported search mode '{mode}'. "
44
44
  f"Supported modes: {sorted(supported_modes)}"
45
45
  )
46
+
47
+
48
+ class LocalEmbeddingDownloadDisallowedError(RuntimeError):
49
+ """Raised when serving a search would download a client-side embedding model.
50
+
51
+ The collection has no server-side (hosted) embedding function and no BM25
52
+ index, and the caller supplied no ``embed_fn`` — so embedding a text query
53
+ would make chromadb download and run a local model (e.g. all-MiniLM). We
54
+ refuse rather than trigger that download.
55
+ """
56
+
57
+ def __init__(self, backend: str, collection: str):
58
+ super().__init__(
59
+ f"[{backend}] collection {collection!r} has no server-side embedding "
60
+ "function and no BM25 index, so search would download a local "
61
+ "embedding model. Re-ingest the corpus with a hosted embedder "
62
+ "(chroma-cloud-qwen) or a BM25 index, or supply an embed_fn."
63
+ )
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: benchmax
3
- Version: 0.1.2.dev31
3
+ Version: 0.1.2.dev33
4
4
  Summary: Framework-Agnostic RL Environments for LLM Fine-Tuning
5
5
  Author: castie@castform.com
6
6
  Classifier: Programming Language :: Python :: 3
@@ -5,7 +5,7 @@ benchmax/envs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
5
  benchmax/envs/base_env.py,sha256=FoUgWsNGeNpTHeOop0bklRjLfHA90Yi7MW8zCaNh_V0,8976
6
6
  benchmax/envs/example_id.py,sha256=WU967Pt2kFvn-W4k5BC6BvKyrTEqioLr7IyWZ3RjGgU,5685
7
7
  benchmax/envs/logging.py,sha256=QnXADCp0vWoV_-MK91yX5OFu6GwgIE98dvhaQTPawqQ,5053
8
- benchmax/envs/reward_helpers.py,sha256=lKbyTvJYU2JoiFItFkUPX6aWwp6JmmgXC76FBaf2rBQ,7740
8
+ benchmax/envs/reward_helpers.py,sha256=-pDqYBazvum8cc8KX7Q_Z0C-Daf3_4TVZuWt-ywhqyY,7364
9
9
  benchmax/envs/types.py,sha256=sGKKibQJZQj9RYkFpB3vaUY75tdoHet8yUmdzpZ0SVk,4389
10
10
  benchmax/envs/crm/crm_env.py,sha256=ltUtpA45YB_A_hYEpjFTp0nZKwkUvvLSLOAVkaUNz9E,4707
11
11
  benchmax/envs/crm/workdir/reward_fn.py,sha256=RY_iy347j79xX4gyCGI7WS0qPmut8Th2rqOiErVbDro,5439
@@ -32,7 +32,7 @@ benchmax/envs/mcp/provisioners/skypilot_provisioner.py,sha256=ACHnzNZE7GfL1WIWf7
32
32
  benchmax/envs/mcp/provisioners/utils.py,sha256=ORWJKtPzeS-IdD35p8aZyLMG2RxiB9BAFmU-0pVqiWw,3467
33
33
  benchmax/envs/postgres_search/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
34
34
  benchmax/envs/postgres_search/linker_env.py,sha256=B3cn0TpiqgrYL5NvOQYW3Yxy5DdxPw1kmIgqDs-8Buo,8535
35
- benchmax/envs/postgres_search/search_env.py,sha256=ldfSNCQonbQZimO7rCO1Jc6im7ff2d0-TLvwryXUOXQ,20181
35
+ benchmax/envs/postgres_search/search_env.py,sha256=IWpqbFr4hjaN_DzdeRchvwvf9qVj5Ut5D-eOsGPyWKQ,19917
36
36
  benchmax/envs/telestich/example.py,sha256=cqHIBjD8g7H4-nmspWSKRB2rxeKPOIwkLn136Y04KfQ,28680
37
37
  benchmax/envs/telestich/telestich_env.py,sha256=6p6GeyV-9ZIXrAX8zssMFjJgevkV5PfDLMZlslqO8js,61966
38
38
  benchmax/envs/wikipedia/utils.py,sha256=YDlxpMfwiVpfMpiZet4kWoeKqNbgTBxeWVEYg5QY3Qs,2879
@@ -64,11 +64,11 @@ benchmax/rag/corpus/__init__.py,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKV
64
64
  benchmax/rag/corpus/search_client.py,sha256=171IqQriU6kuQqvSCDgNwOT8SR5pxUPMfCifarrgrFg,1859
65
65
  benchmax/rag/corpus/source.py,sha256=dnmReLC8mccHDkg8ZytfXa4AFXrRMCg9v8E2UuVxt8E,4183
66
66
  benchmax/rag/corpus/chroma/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
67
- benchmax/rag/corpus/chroma/client.py,sha256=TGf_YEgVBH8p-PLF7QcuSIHkkifrChXPKJB2ENR9OiM,19361
67
+ benchmax/rag/corpus/chroma/client.py,sha256=cYZZKQG09u_VfyjsP6UdCBh-RRNGKa9XisBN4OEejQk,20839
68
68
  benchmax/rag/corpus/chroma/files.py,sha256=hSP-J2osPNBAvMZHOWipMVXaWN4tila_tsQaTEPNzgc,5567
69
69
  benchmax/rag/corpus/chroma/filter_mapper.py,sha256=Y1FzDwDDg15LZ0-Uh1jzOVcSORiVUy5f1qiaVky3pJI,5074
70
- benchmax/rag/corpus/chroma/search.py,sha256=4kU1WMwsWQrN03ctVIPdXZoHyZa3jso2fKzcsc5uYr0,6824
71
- benchmax/rag/corpus/chroma/source.py,sha256=ZOLj_VfixBcB_VE8YLF6X3sKY6XHYupprHNHVxy1hH8,30295
70
+ benchmax/rag/corpus/chroma/search.py,sha256=iO8fBPk50vG3NmkCmAJ2tKnjP_wKnymV3fbfLjkIAJ8,7688
71
+ benchmax/rag/corpus/chroma/source.py,sha256=0azMLUvZS9g4jvxv_KxsPa3-ArQW5WHCq77CQh-qmqY,30440
72
72
  benchmax/rag/corpus/pinecone/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
73
73
  benchmax/rag/corpus/pinecone/files.py,sha256=lhas7-mQ622Ku36QvOavXguBweJyYl78wXIeb_LNqig,5728
74
74
  benchmax/rag/corpus/pinecone/filter_mapper.py,sha256=exJ3G34QKeQo1rQ8Pu-iGL0XDXVxCW5dc3q0QoYfCo0,6454
@@ -85,7 +85,7 @@ benchmax/rag/corpus/postgres/source.py,sha256=6ptGHatOscYih42MZ9Wt8MQOrcIEQiJ1X5
85
85
  benchmax/rag/corpus/search_schema/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
86
86
  benchmax/rag/corpus/search_schema/builders.py,sha256=qAMiEOGOLR7xSXWFf12KqzYlrwBZchU_78vkRcOKa8k,1764
87
87
  benchmax/rag/corpus/search_schema/dsl_parser.py,sha256=vMijm_nRKztIrsVQP-0OySuCKnrBsbUzet_pwwlU1T8,1586
88
- benchmax/rag/corpus/search_schema/search_exceptions.py,sha256=vxhJQa7UFHduXDt225onA_R_UWcvGlj3NiS5TFR3M7w,1578
88
+ benchmax/rag/corpus/search_schema/search_exceptions.py,sha256=1ccbLnDAuSMxUnjtyBt-5iXwoKjI3xaZvk9xplCyNFw,2413
89
89
  benchmax/rag/corpus/search_schema/search_types.py,sha256=UTkteugSx5OigDRZ8Xqe6itxLUXj2sVeIVxtYbnXGSg,5831
90
90
  benchmax/rag/corpus/turbopuffer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
91
91
  benchmax/rag/corpus/turbopuffer/files.py,sha256=DP80-3NmdyOD34fyQxlzovpLRs_UU1ezQ7PItpY2Nlk,5807
@@ -160,9 +160,9 @@ benchmax/traces/braintrust/adapter.py,sha256=KTeN9qKLwZJJ8TY-KtSudd4J3_nySz1bRts
160
160
  benchmax/traces/braintrust/message_extraction.py,sha256=seh3eM_qd9FUPmGOEMChUq_UAMtaIQHYSYDttMgY1go,8409
161
161
  benchmax/utils/__init__.py,sha256=FWJVm6jt0m57HS-84bgrb2M-c_EFhf60rWayioUGges,402
162
162
  benchmax/utils/checkpoint.py,sha256=htIw9iYjUUHpJqLLZ0y6K4_UYYAkZIx3vdQVY7juKDw,3148
163
- benchmax-0.1.2.dev31.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
164
- benchmax-0.1.2.dev31.dist-info/METADATA,sha256=A4ysjDpcSRx1jMeFb5xw0r9CinUiGPoz4Vm4x9FXqUQ,2775
165
- benchmax-0.1.2.dev31.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
166
- benchmax-0.1.2.dev31.dist-info/entry_points.txt,sha256=qtjqAQsHIwRIaLzwAhGTiRvI91CynwcUO5G95uQuDR4,47
167
- benchmax-0.1.2.dev31.dist-info/top_level.txt,sha256=ryj4zoahvAKL3BnxOpfJNfyIzhvlED9KJ3Q3k4bb9jc,9
168
- benchmax-0.1.2.dev31.dist-info/RECORD,,
163
+ benchmax-0.1.2.dev33.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
164
+ benchmax-0.1.2.dev33.dist-info/METADATA,sha256=X5P1IBK9INVKaO8xzBqoW8CQYQ2VIVD9IkaQV4tVjFQ,2775
165
+ benchmax-0.1.2.dev33.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
166
+ benchmax-0.1.2.dev33.dist-info/entry_points.txt,sha256=qtjqAQsHIwRIaLzwAhGTiRvI91CynwcUO5G95uQuDR4,47
167
+ benchmax-0.1.2.dev33.dist-info/top_level.txt,sha256=ryj4zoahvAKL3BnxOpfJNfyIzhvlED9KJ3Q3k4bb9jc,9
168
+ benchmax-0.1.2.dev33.dist-info/RECORD,,