renderers 0.1.8.dev2__tar.gz → 0.1.8.dev4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/PKG-INFO +1 -1
- {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/renderers/__init__.py +2 -0
- {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/renderers/_version.py +2 -2
- {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/renderers/base.py +176 -0
- {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/renderers/client.py +99 -53
- {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/renderers/deepseek_v3.py +28 -12
- {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/renderers/default.py +6 -1
- {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/renderers/glm45.py +28 -12
- {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/renderers/glm5.py +28 -12
- {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/renderers/gpt_oss.py +23 -4
- {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/renderers/kimi_k2.py +28 -11
- {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/renderers/kimi_k25.py +37 -20
- {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/renderers/laguna_xs2.py +36 -19
- {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/renderers/minimax_m2.py +28 -12
- {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/renderers/nemotron3.py +28 -12
- {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/renderers/qwen3.py +28 -12
- {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/renderers/qwen35.py +37 -22
- {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/renderers/qwen3_vl.py +26 -12
- {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/tests/test_client.py +156 -11
- {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/tests/test_multimodal.py +37 -0
- renderers-0.1.8.dev4/tests/test_tokens_per_message.py +325 -0
- {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/.github/workflows/publish.yml +0 -0
- {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/.github/workflows/style.yml +0 -0
- {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/.github/workflows/test.yml +0 -0
- {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/.gitignore +0 -0
- {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/.pre-commit-config.yaml +0 -0
- {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/LICENSE +0 -0
- {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/README.md +0 -0
- {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/examples/README.md +0 -0
- {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/examples/sglang/multiturn_generate_sglang.py +0 -0
- {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/examples/sglang/online_multiturn_sglang.py +0 -0
- {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/examples/tinker/multiturn_generate_tinker.py +0 -0
- {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/examples/transformers/multiturn_generate_transformers.py +0 -0
- {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/examples/vllm/multiturn_generate_vllm.py +0 -0
- {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/pyproject.toml +0 -0
- {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/renderers/parsers.py +0 -0
- {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/renderers/parsing.py +0 -0
- {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/renderers/qwen36.py +0 -0
- {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/tests/conftest.py +0 -0
- {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/tests/test_bridge.py +0 -0
- {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/tests/test_build_helpers.py +0 -0
- {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/tests/test_gpt_oss_harmony_parity.py +0 -0
- {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/tests/test_incremental.py +0 -0
- {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/tests/test_load_tokenizer.py +0 -0
- {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/tests/test_load_tokenizer_fastokens.py +0 -0
- {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/tests/test_message_indices.py +0 -0
- {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/tests/test_parse_response.py +0 -0
- {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/tests/test_parse_response_robustness.py +0 -0
- {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/tests/test_parsers.py +0 -0
- {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/tests/test_preserve_thinking.py +0 -0
- {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/tests/test_qwen35_size_coverage.py +0 -0
- {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/tests/test_render_ids.py +0 -0
- {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/tests/test_roundtrip.py +0 -0
- {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/tests/test_sampled_mask.py +0 -0
- {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/tests/test_tool_arg_type_preservation.py +0 -0
- {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/uv.lock +0 -0
|
@@ -36,6 +36,7 @@ from renderers.base import (
|
|
|
36
36
|
reject_assistant_in_extension,
|
|
37
37
|
trim_to_turn_close,
|
|
38
38
|
)
|
|
39
|
+
from renderers.client import OverlongPromptError
|
|
39
40
|
from renderers.deepseek_v3 import DeepSeekV3Renderer
|
|
40
41
|
from renderers.default import DefaultRenderer
|
|
41
42
|
from renderers.glm5 import GLM5Renderer
|
|
@@ -69,6 +70,7 @@ __all__ = [
|
|
|
69
70
|
"MultiModalData",
|
|
70
71
|
"MultimodalRenderer",
|
|
71
72
|
"Nemotron3Renderer",
|
|
73
|
+
"OverlongPromptError",
|
|
72
74
|
"ParsedResponse",
|
|
73
75
|
"ParsedToolCall",
|
|
74
76
|
"PlaceholderRange",
|
|
@@ -18,7 +18,7 @@ version_tuple: tuple[int | str, ...]
|
|
|
18
18
|
commit_id: str | None
|
|
19
19
|
__commit_id__: str | None
|
|
20
20
|
|
|
21
|
-
__version__ = version = '0.1.8.
|
|
22
|
-
__version_tuple__ = version_tuple = (0, 1, 8, '
|
|
21
|
+
__version__ = version = '0.1.8.dev4'
|
|
22
|
+
__version_tuple__ = version_tuple = (0, 1, 8, 'dev4')
|
|
23
23
|
|
|
24
24
|
__commit_id__ = commit_id = None
|
|
@@ -177,8 +177,162 @@ class RenderedTokens:
|
|
|
177
177
|
token_ids: list[int] = field(default_factory=list)
|
|
178
178
|
message_indices: list[int] = field(default_factory=list)
|
|
179
179
|
sampled_mask: list[bool] = field(default_factory=list)
|
|
180
|
+
message_roles: list[str] = field(default_factory=list)
|
|
180
181
|
multi_modal_data: "MultiModalData | None" = None
|
|
181
182
|
|
|
183
|
+
def tokens_per_message(
    self, n_messages: int | None = None, *, sampled_only: bool = False
) -> list[int]:
    """Return how many rendered tokens are attributed to each message.

    Entry ``i`` of the result is the number of positions ``k`` with
    ``message_indices[k] == i``. Positions whose index falls outside
    ``[0, n_messages)`` — including the ``-1`` marker — are never counted.

    Args:
        n_messages: Number of leading messages to report on. ``None``
            means ``len(self.message_roles)``. Larger values are clamped
            to ``len(self.message_roles)``; smaller values simply drop
            the tail of the result.
        sampled_only: When true, a position is counted only if its
            ``sampled_mask`` entry is truthy. If ``sampled_mask`` does
            not have the same length as ``token_ids`` (for example
            because it was never populated), every count is zero.

    Returns:
        A list of per-message token counts of length ``n_messages``
        (after clamping).
    """
    known = len(self.message_roles)
    limit = known if n_messages is None else min(n_messages, known)
    counts = [0] * limit

    # An unpopulated / mismatched mask cannot be trusted: report zeros.
    if sampled_only and len(self.sampled_mask) != len(self.token_ids):
        return counts

    if sampled_only:
        attributed = (
            idx
            for idx, keep in zip(self.message_indices, self.sampled_mask)
            if keep
        )
    else:
        attributed = iter(self.message_indices)

    for idx in attributed:
        if 0 <= idx < limit:
            counts[idx] += 1
    return counts
|
|
240
|
+
|
|
241
|
+
def message_token_spans(self) -> list[tuple[int, int] | None]:
    """Return a half-open ``(start, end)`` token span for every message.

    Entry ``i`` covers the positions in ``message_indices`` that carry
    the value ``i``: ``start`` is the first such position and ``end`` is
    one past the last. If the positions are not contiguous the span is
    the outer envelope, so it may include tokens belonging elsewhere.
    A message with no attributed position yields ``None``; positions
    marked ``-1`` belong to no message and are ignored.

    The number of entries is ``len(self.message_roles)`` when that list
    is non-empty. Otherwise it is inferred as the largest message index
    plus one, which under-counts if trailing messages have no tokens.

    Returns:
        One ``(start, end)`` tuple or ``None`` per message, in message
        order.
    """
    if self.message_roles:
        count = len(self.message_roles)
    else:
        count = max(self.message_indices, default=-1) + 1

    # message index -> (first position, one past the last position)
    bounds: dict[int, tuple[int, int]] = {}
    for pos, idx in enumerate(self.message_indices):
        if not 0 <= idx < count:
            continue
        start = bounds[idx][0] if idx in bounds else pos
        bounds[idx] = (start, pos + 1)

    return [bounds.get(i) for i in range(count)]
|
|
290
|
+
|
|
291
|
+
def role_token_spans(self) -> dict[str, list[tuple[int, int]]]:
    """Group the spans from :meth:`message_token_spans` by message role.

    Each role in :attr:`message_roles` maps to the ``(start, end)`` spans
    of its messages, in message order. A message without a span adds
    nothing to the list, but its role still gets a key, so a role whose
    messages all lack tokens maps to an empty list. The result is empty
    when :attr:`message_roles` is empty.

    Returns:
        Mapping from role name to that role's list of token spans.
    """
    grouped: dict[str, list[tuple[int, int]]] = {}
    for role, span in zip(self.message_roles, self.message_token_spans()):
        # Register the role first so it is present even without spans.
        bucket = grouped.setdefault(role, [])
        if span is not None:
            bucket.append(span)
    return grouped
|
|
312
|
+
|
|
313
|
+
def tokens_by_role(self, *, sampled_only: bool = False) -> dict[str, int]:
    """Total the counts from :meth:`tokens_per_message` for each role.

    Every role in :attr:`message_roles` gets a key, even when its total
    is ``0``. A role that never occurs in :attr:`message_roles` is
    absent, so use ``.get(role, 0)`` for roles that may be missing. The
    result is empty when :attr:`message_roles` is empty.

    Args:
        sampled_only: Forwarded unchanged to :meth:`tokens_per_message`.

    Returns:
        Mapping from role name to the summed token count of its messages.
    """
    per_message = self.tokens_per_message(sampled_only=sampled_only)
    totals: dict[str, int] = {}
    for role, count in zip(self.message_roles, per_message):
        totals.setdefault(role, 0)
        totals[role] += count
    return totals
|
|
335
|
+
|
|
182
336
|
|
|
183
337
|
class ToolCallParseStatus(str, enum.Enum):
|
|
184
338
|
"""Per-attempt outcome of parsing a single ``<tool_call>`` block.
|
|
@@ -358,6 +512,25 @@ class Renderer(Protocol):
|
|
|
358
512
|
list so far with ``add_generation_prompt=True`` — except prev
|
|
359
513
|
sampled tokens are kept verbatim rather than re-rendered).
|
|
360
514
|
|
|
515
|
+
Attribution on the returned ``RenderedTokens``:
|
|
516
|
+
|
|
517
|
+
- ``message_indices`` is ``-1`` over the entire prior portion
|
|
518
|
+
(length ``len(previous_ids)`` after :func:`trim_to_turn_close`)
|
|
519
|
+
because the bridge gets the prior as raw token lists with no
|
|
520
|
+
attribution. Over the bridge-added portion, indices are
|
|
521
|
+
relative to ``new_messages``: a token rendered as part of
|
|
522
|
+
``new_messages[i]`` carries ``i``, and inter-turn separators /
|
|
523
|
+
the trailing generation prompt carry ``-1``. So
|
|
524
|
+
``bridge.tokens_per_message(len(new_messages))`` gives the
|
|
525
|
+
per-new-message token count for length-penalty bookkeeping.
|
|
526
|
+
- ``sampled_mask`` is uniformly ``False`` across the entire
|
|
527
|
+
returned sequence. The bridge output is consumed as the next
|
|
528
|
+
turn's prompt; nothing it emits was model-sampled, and the
|
|
529
|
+
bridge has no way to recover which prior tokens were. If the
|
|
530
|
+
caller needs that distinction for the prior portion, they
|
|
531
|
+
have it directly: every token in ``prev_completion_ids`` was
|
|
532
|
+
sampled; every token in ``prev_prompt_ids`` was not.
|
|
533
|
+
|
|
361
534
|
Text-only renderers return :class:`RenderedTokens` with
|
|
362
535
|
``multi_modal_data=None``. Multimodal renderers (see
|
|
363
536
|
:class:`MultimodalRenderer`) populate ``multi_modal_data`` so
|
|
@@ -593,6 +766,8 @@ MODEL_RENDERER_MAP: dict[str, str] = {
|
|
|
593
766
|
"Qwen/Qwen3-14B": "qwen3",
|
|
594
767
|
"Qwen/Qwen3-32B": "qwen3",
|
|
595
768
|
"Qwen/Qwen3-30B-A3B": "qwen3",
|
|
769
|
+
"Qwen/Qwen3-30B-A3B-Instruct-2507": "qwen3",
|
|
770
|
+
"Qwen/Qwen3-30B-A3B-Thinking-2507": "qwen3",
|
|
596
771
|
"Qwen/Qwen3-235B-A22B": "qwen3",
|
|
597
772
|
# Qwen3.5. All seven sizes share the same renderer. The 4B / 9B /
|
|
598
773
|
# 35B-A3B / 122B-A10B / 397B-A17B chat template defaults
|
|
@@ -619,6 +794,7 @@ MODEL_RENDERER_MAP: dict[str, str] = {
|
|
|
619
794
|
"Qwen/Qwen3-VL-30B-A3B-Instruct": "qwen3-vl",
|
|
620
795
|
# GLM-5 family (GLM-4.7 reuses the GLM-5 template).
|
|
621
796
|
"zai-org/GLM-5": "glm-5",
|
|
797
|
+
"zai-org/GLM-5-FP8": "glm-5",
|
|
622
798
|
"zai-org/GLM-4.7-Flash": "glm-5",
|
|
623
799
|
"zai-org/GLM-5.1": "glm-5.1",
|
|
624
800
|
# GLM-4.5.
|
|
@@ -14,10 +14,11 @@ from __future__ import annotations
|
|
|
14
14
|
import asyncio
|
|
15
15
|
import base64
|
|
16
16
|
import logging
|
|
17
|
+
from collections.abc import Mapping
|
|
17
18
|
from typing import Any, cast
|
|
18
19
|
|
|
19
20
|
import numpy as np
|
|
20
|
-
from openai import AsyncOpenAI
|
|
21
|
+
from openai import AsyncOpenAI
|
|
21
22
|
|
|
22
23
|
from renderers.base import (
|
|
23
24
|
Message,
|
|
@@ -31,6 +32,79 @@ from renderers.base import (
|
|
|
31
32
|
_request_logger = logging.getLogger("renderers.client")
|
|
32
33
|
|
|
33
34
|
|
|
35
|
+
class OverlongPromptError(Exception):
    """Signal that a rendered prompt is longer than the allowed cap.

    Carries both numbers so callers can apply their own policy (skip,
    truncate, count) without parsing the message text. It shares a name
    with ``verifiers.errors.OverlongPromptError`` but is a separate class
    in a separate package hierarchy.

    Attributes:
        prompt_len: Length of the offending prompt.
        max_prompt_len: The cap that ``prompt_len`` exceeded.
    """

    def __init__(self, *, prompt_len: int, max_prompt_len: int) -> None:
        message = (
            f"Prompt length ({prompt_len}) exceeds maximum "
            f"context length ({max_prompt_len})."
        )
        super().__init__(message)
        self.prompt_len = prompt_len
        self.max_prompt_len = max_prompt_len
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
# Per-process cache of resolved engine context-length caps, keyed by
|
|
60
|
+
# ``(base_url, model)``. ``None`` is the "we asked the engine and it didn't
|
|
61
|
+
# tell us" sentinel — distinct from "key missing" (haven't asked yet). The
|
|
62
|
+
# lock serializes the first lookup per key; cache hits avoid the lock.
|
|
63
|
+
_max_prompt_len_cache: dict[tuple[str, str], int | None] = {}
|
|
64
|
+
_max_prompt_len_lock = asyncio.Lock()
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
async def _resolve_max_prompt_len(client: AsyncOpenAI, model: str) -> int | None:
|
|
68
|
+
"""Discover ``max_model_len`` from the engine via ``GET /v1/models``.
|
|
69
|
+
|
|
70
|
+
OpenAI-API-compatible engines expose model metadata at this endpoint;
|
|
71
|
+
vLLM extends its ``ModelCard`` with a ``max_model_len`` field. Engines
|
|
72
|
+
that don't (SGLang as of this writing, third-party gateways, etc.) get
|
|
73
|
+
a cached ``None`` and the pre-flight overflow check silently disables —
|
|
74
|
+
callers fall back to whatever reactive handling they have for engine
|
|
75
|
+
4xx, which the verifiers ``@handle_openai_overlong_prompt`` decorator
|
|
76
|
+
already supplies for the prime-rl path.
|
|
77
|
+
|
|
78
|
+
Any exception during lookup (network error, non-JSON body, attribute
|
|
79
|
+
miss on a mock client in tests) is treated as "unknown cap": cached
|
|
80
|
+
``None`` so we don't retry on every call.
|
|
81
|
+
"""
|
|
82
|
+
key = (str(getattr(client, "base_url", "")), model)
|
|
83
|
+
if key in _max_prompt_len_cache:
|
|
84
|
+
return _max_prompt_len_cache[key]
|
|
85
|
+
async with _max_prompt_len_lock:
|
|
86
|
+
if key in _max_prompt_len_cache:
|
|
87
|
+
return _max_prompt_len_cache[key]
|
|
88
|
+
try:
|
|
89
|
+
payload = await client.get("/models", cast_to=cast(Any, dict[str, Any]))
|
|
90
|
+
except Exception as exc:
|
|
91
|
+
_request_logger.debug("max_prompt_len lookup failed: %s", exc)
|
|
92
|
+
_max_prompt_len_cache[key] = None
|
|
93
|
+
return None
|
|
94
|
+
value: int | None = None
|
|
95
|
+
for card in payload.get("data") or []:
|
|
96
|
+
if not isinstance(card, Mapping):
|
|
97
|
+
continue
|
|
98
|
+
if card.get("id") != model:
|
|
99
|
+
continue
|
|
100
|
+
raw = card.get("max_model_len")
|
|
101
|
+
if isinstance(raw, int) and raw > 0:
|
|
102
|
+
value = raw
|
|
103
|
+
break
|
|
104
|
+
_max_prompt_len_cache[key] = value
|
|
105
|
+
return value
|
|
106
|
+
|
|
107
|
+
|
|
34
108
|
async def _maybe_offload(renderer: Renderer | RendererPool, fn):
|
|
35
109
|
"""Run sync renderer work on a thread iff ``renderer`` is a pool.
|
|
36
110
|
|
|
@@ -58,6 +132,7 @@ async def generate(
|
|
|
58
132
|
cache_salt: str | None = None,
|
|
59
133
|
priority: int | None = None,
|
|
60
134
|
extra_headers: dict[str, str] | None = None,
|
|
135
|
+
max_prompt_len: int | None = None,
|
|
61
136
|
) -> dict[str, Any]:
|
|
62
137
|
"""Tokenize messages, call vLLM /inference/v1/generate, parse the response.
|
|
63
138
|
|
|
@@ -74,6 +149,16 @@ async def generate(
|
|
|
74
149
|
mm_placeholders, kwargs_data) before POSTing. The serializer imports
|
|
75
150
|
``vllm.*`` lazily so text-only consumers never pay for the import.
|
|
76
151
|
|
|
152
|
+
``max_prompt_len`` controls the pre-flight overflow check. When the
|
|
153
|
+
rendered prompt is strictly longer than the cap, the request is never
|
|
154
|
+
sent and ``OverlongPromptError`` is raised. If ``max_prompt_len`` is
|
|
155
|
+
``None`` (the default), the cap is auto-discovered once per
|
|
156
|
+
``(base_url, model)`` via ``GET /v1/models`` (vLLM's
|
|
157
|
+
``ModelCard.max_model_len`` extension); engines that don't expose it
|
|
158
|
+
cache a ``None`` cap and the pre-flight silently disables. Engine 4xx
|
|
159
|
+
that still slip through propagate raw — converting them into a domain
|
|
160
|
+
error is the calling client's job (its error shape is engine-specific).
|
|
161
|
+
|
|
77
162
|
Returns a dict with: request_id, prompt_ids, completion_ids,
|
|
78
163
|
completion_logprobs, content, reasoning_content, tool_calls,
|
|
79
164
|
finish_reason, routed_experts.
|
|
@@ -96,6 +181,13 @@ async def generate(
|
|
|
96
181
|
|
|
97
182
|
prompt_ids, stop_token_ids, mm_data = await _maybe_offload(renderer, _prepare)
|
|
98
183
|
|
|
184
|
+
if max_prompt_len is None:
|
|
185
|
+
max_prompt_len = await _resolve_max_prompt_len(client, model)
|
|
186
|
+
if max_prompt_len is not None and len(prompt_ids) > max_prompt_len:
|
|
187
|
+
raise OverlongPromptError(
|
|
188
|
+
prompt_len=len(prompt_ids), max_prompt_len=max_prompt_len
|
|
189
|
+
)
|
|
190
|
+
|
|
99
191
|
sp: dict[str, Any] = dict(sampling_params or {})
|
|
100
192
|
sp["stop_token_ids"] = stop_token_ids
|
|
101
193
|
sp["logprobs"] = 1
|
|
@@ -135,16 +227,7 @@ async def generate(
|
|
|
135
227
|
}
|
|
136
228
|
if extra_headers:
|
|
137
229
|
post_kwargs["options"] = cast(Any, {"headers": extra_headers})
|
|
138
|
-
|
|
139
|
-
data = await client.post(endpoint, **post_kwargs)
|
|
140
|
-
except BadRequestError as exc:
|
|
141
|
-
_log_overlong_prompt_diagnostic(
|
|
142
|
-
prompt_ids=prompt_ids,
|
|
143
|
-
messages=messages,
|
|
144
|
-
max_tokens=sp.get("max_tokens"),
|
|
145
|
-
exc=exc,
|
|
146
|
-
)
|
|
147
|
-
raise
|
|
230
|
+
data = await client.post(endpoint, **post_kwargs)
|
|
148
231
|
|
|
149
232
|
choice = (data.get("choices") or [{}])[0]
|
|
150
233
|
completion_ids = choice.get("token_ids") or []
|
|
@@ -225,6 +308,7 @@ def _build_mm_features(
|
|
|
225
308
|
to change. Don't pre-build the abstraction with one engine in tree.
|
|
226
309
|
"""
|
|
227
310
|
from renderers.qwen3_vl import Qwen3VLRenderer
|
|
311
|
+
from renderers.qwen35 import Qwen35Renderer
|
|
228
312
|
|
|
229
313
|
# Type dispatch only needs the renderer class. Pools expose
|
|
230
314
|
# ``renderer_cls`` as a snapshot attribute, so we don't have to check
|
|
@@ -233,7 +317,10 @@ def _build_mm_features(
|
|
|
233
317
|
renderer.renderer_cls if isinstance(renderer, RendererPool) else type(renderer)
|
|
234
318
|
)
|
|
235
319
|
|
|
236
|
-
|
|
320
|
+
# Qwen3-VL and Qwen3.5 both ship ``pixel_values`` + ``image_grid_thw``
|
|
321
|
+
# via the shared Qwen2-VL field factory. ``spatial_merge_size=2`` is
|
|
322
|
+
# the family default and matches every Qwen-VL processor in tree.
|
|
323
|
+
if issubclass(renderer_cls, (Qwen3VLRenderer, Qwen35Renderer)):
|
|
237
324
|
return _build_qwen_vl_features(mm_data, spatial_merge_size=2)
|
|
238
325
|
|
|
239
326
|
raise NotImplementedError(
|
|
@@ -305,44 +392,3 @@ def _build_qwen_vl_features(
|
|
|
305
392
|
out["kwargs_data"] = None
|
|
306
393
|
|
|
307
394
|
return out
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
def _log_overlong_prompt_diagnostic(
|
|
311
|
-
*,
|
|
312
|
-
prompt_ids: list[int],
|
|
313
|
-
messages: list[Message],
|
|
314
|
-
max_tokens: int | None,
|
|
315
|
-
exc: BadRequestError,
|
|
316
|
-
) -> None:
|
|
317
|
-
"""Log a structured snapshot when vLLM rejects with 4xx — usually overlong.
|
|
318
|
-
|
|
319
|
-
Captures total prompt length, per-message role + character count, and
|
|
320
|
-
the first chunk of the response body.
|
|
321
|
-
"""
|
|
322
|
-
body_text = ""
|
|
323
|
-
response = getattr(exc, "response", None)
|
|
324
|
-
if response is not None:
|
|
325
|
-
body_text = (response.text or "")[:500].replace("\n", " ")
|
|
326
|
-
msg_summary = []
|
|
327
|
-
for i, m in enumerate(messages):
|
|
328
|
-
role = m.get("role", "?")
|
|
329
|
-
content = m.get("content")
|
|
330
|
-
if isinstance(content, str):
|
|
331
|
-
content_len = len(content)
|
|
332
|
-
elif isinstance(content, list):
|
|
333
|
-
content_len = sum(
|
|
334
|
-
len(p.get("text", "")) if isinstance(p, dict) else 0 for p in content
|
|
335
|
-
)
|
|
336
|
-
else:
|
|
337
|
-
content_len = 0
|
|
338
|
-
tool_calls = m.get("tool_calls")
|
|
339
|
-
tc_count = len(tool_calls) if tool_calls else 0
|
|
340
|
-
msg_summary.append(f"[{i}]{role}(c={content_len},tc={tc_count})")
|
|
341
|
-
_request_logger.warning(
|
|
342
|
-
"vllm 4xx prompt_len=%d messages=%d max_tokens=%s per_msg=%s response_body=%s",
|
|
343
|
-
len(prompt_ids),
|
|
344
|
-
len(messages),
|
|
345
|
-
max_tokens,
|
|
346
|
-
" ".join(msg_summary),
|
|
347
|
-
body_text,
|
|
348
|
-
)
|
|
@@ -210,7 +210,10 @@ class DeepSeekV3Renderer:
|
|
|
210
210
|
emit_text("<think>\n", -1, is_sampled=False)
|
|
211
211
|
|
|
212
212
|
return RenderedTokens(
|
|
213
|
-
token_ids=tokens,
|
|
213
|
+
token_ids=tokens,
|
|
214
|
+
message_indices=indices,
|
|
215
|
+
sampled_mask=sampled,
|
|
216
|
+
message_roles=[m.get("role") or "" for m in messages],
|
|
214
217
|
)
|
|
215
218
|
|
|
216
219
|
def render_ids(
|
|
@@ -271,22 +274,29 @@ class DeepSeekV3Renderer:
|
|
|
271
274
|
return None
|
|
272
275
|
|
|
273
276
|
ext: list[int] = []
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
#
|
|
278
|
-
#
|
|
279
|
-
#
|
|
280
|
-
#
|
|
277
|
+
ext_indices: list[int] = []
|
|
278
|
+
ext_sampled: list[bool] = []
|
|
279
|
+
|
|
280
|
+
# Bridge populates ``message_indices`` (relative to ``new_messages``)
|
|
281
|
+
# and ``sampled_mask`` (uniformly ``False`` — every token the
|
|
282
|
+
# bridge emits is template scaffolding for the next prompt, not
|
|
283
|
+
# something the model sampled). Downstream consumers can run
|
|
284
|
+
# :meth:`RenderedTokens.tokens_per_message` on the bridge output
|
|
285
|
+
# to get per-new-message token counts without re-rendering.
|
|
281
286
|
def emit_special(
|
|
282
|
-
token_id: int,
|
|
287
|
+
token_id: int, msg_idx: int = -1, *, is_sampled: bool = False
|
|
283
288
|
) -> None:
|
|
284
289
|
ext.append(token_id)
|
|
290
|
+
ext_indices.append(msg_idx)
|
|
291
|
+
ext_sampled.append(is_sampled)
|
|
285
292
|
|
|
286
293
|
def emit_text(
|
|
287
|
-
text: str,
|
|
294
|
+
text: str, msg_idx: int = -1, *, is_sampled: bool = False
|
|
288
295
|
) -> None:
|
|
289
|
-
|
|
296
|
+
ids = self._encode(text)
|
|
297
|
+
ext.extend(ids)
|
|
298
|
+
ext_indices.extend([msg_idx] * len(ids))
|
|
299
|
+
ext_sampled.extend([is_sampled] * len(ids))
|
|
290
300
|
|
|
291
301
|
for i, msg in enumerate(new_messages):
|
|
292
302
|
role = msg.get("role")
|
|
@@ -329,7 +339,13 @@ class DeepSeekV3Renderer:
|
|
|
329
339
|
if self._enable_thinking:
|
|
330
340
|
emit_text("<think>\n", -1)
|
|
331
341
|
|
|
332
|
-
|
|
342
|
+
total_len = len(previous_ids) + len(ext)
|
|
343
|
+
return RenderedTokens(
|
|
344
|
+
token_ids=previous_ids + ext,
|
|
345
|
+
message_indices=[-1] * len(previous_ids) + ext_indices,
|
|
346
|
+
sampled_mask=[False] * total_len,
|
|
347
|
+
message_roles=[m.get("role") or "" for m in new_messages],
|
|
348
|
+
)
|
|
333
349
|
|
|
334
350
|
# ------------------------------------------------------------------
|
|
335
351
|
# Assistant rendering
|
|
@@ -143,7 +143,12 @@ class DefaultRenderer:
|
|
|
143
143
|
token_ids = full_ids
|
|
144
144
|
message_indices.extend([-1] * len(gen_tokens))
|
|
145
145
|
|
|
146
|
-
|
|
146
|
+
message_roles = [m.get("role") or "" for m in messages]
|
|
147
|
+
return RenderedTokens(
|
|
148
|
+
token_ids=token_ids,
|
|
149
|
+
message_indices=message_indices,
|
|
150
|
+
message_roles=message_roles,
|
|
151
|
+
)
|
|
147
152
|
|
|
148
153
|
def _apply(self, messages, *, tools=None, add_generation_prompt=False) -> list[int]:
|
|
149
154
|
kwargs = dict(self._chat_template_kwargs)
|
|
@@ -203,7 +203,10 @@ class GLM45Renderer:
|
|
|
203
203
|
emit_special(self._think_end, -1, is_sampled=False)
|
|
204
204
|
|
|
205
205
|
return RenderedTokens(
|
|
206
|
-
token_ids=tokens,
|
|
206
|
+
token_ids=tokens,
|
|
207
|
+
message_indices=indices,
|
|
208
|
+
sampled_mask=sampled,
|
|
209
|
+
message_roles=[m.get("role") or "" for m in messages],
|
|
207
210
|
)
|
|
208
211
|
|
|
209
212
|
def render_ids(
|
|
@@ -271,22 +274,29 @@ class GLM45Renderer:
|
|
|
271
274
|
last_prev = previous_ids[-1]
|
|
272
275
|
|
|
273
276
|
ext: list[int] = []
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
#
|
|
278
|
-
#
|
|
279
|
-
#
|
|
280
|
-
#
|
|
277
|
+
ext_indices: list[int] = []
|
|
278
|
+
ext_sampled: list[bool] = []
|
|
279
|
+
|
|
280
|
+
# Bridge populates ``message_indices`` (relative to ``new_messages``)
|
|
281
|
+
# and ``sampled_mask`` (uniformly ``False`` — every token the
|
|
282
|
+
# bridge emits is template scaffolding for the next prompt, not
|
|
283
|
+
# something the model sampled). Downstream consumers can run
|
|
284
|
+
# :meth:`RenderedTokens.tokens_per_message` on the bridge output
|
|
285
|
+
# to get per-new-message token counts without re-rendering.
|
|
281
286
|
def emit_special(
|
|
282
|
-
token_id: int,
|
|
287
|
+
token_id: int, msg_idx: int = -1, *, is_sampled: bool = False
|
|
283
288
|
) -> None:
|
|
284
289
|
ext.append(token_id)
|
|
290
|
+
ext_indices.append(msg_idx)
|
|
291
|
+
ext_sampled.append(is_sampled)
|
|
285
292
|
|
|
286
293
|
def emit_text(
|
|
287
|
-
text: str,
|
|
294
|
+
text: str, msg_idx: int = -1, *, is_sampled: bool = False
|
|
288
295
|
) -> None:
|
|
289
|
-
|
|
296
|
+
ids = self._encode(text)
|
|
297
|
+
ext.extend(ids)
|
|
298
|
+
ext_indices.extend([msg_idx] * len(ids))
|
|
299
|
+
ext_sampled.extend([is_sampled] * len(ids))
|
|
290
300
|
|
|
291
301
|
for i, msg in enumerate(new_messages):
|
|
292
302
|
role = msg.get("role")
|
|
@@ -318,7 +328,13 @@ class GLM45Renderer:
|
|
|
318
328
|
emit_special(self._think, -1)
|
|
319
329
|
emit_special(self._think_end, -1)
|
|
320
330
|
|
|
321
|
-
|
|
331
|
+
total_len = len(previous_ids) + len(ext)
|
|
332
|
+
return RenderedTokens(
|
|
333
|
+
token_ids=previous_ids + ext,
|
|
334
|
+
message_indices=[-1] * len(previous_ids) + ext_indices,
|
|
335
|
+
sampled_mask=[False] * total_len,
|
|
336
|
+
message_roles=[m.get("role") or "" for m in new_messages],
|
|
337
|
+
)
|
|
322
338
|
|
|
323
339
|
def _render_assistant(
|
|
324
340
|
self,
|
|
@@ -220,7 +220,10 @@ class GLM5Renderer:
|
|
|
220
220
|
emit_special(self._think_end, -1, is_sampled=False)
|
|
221
221
|
|
|
222
222
|
return RenderedTokens(
|
|
223
|
-
token_ids=tokens,
|
|
223
|
+
token_ids=tokens,
|
|
224
|
+
message_indices=indices,
|
|
225
|
+
sampled_mask=sampled,
|
|
226
|
+
message_roles=[m.get("role") or "" for m in messages],
|
|
224
227
|
)
|
|
225
228
|
|
|
226
229
|
def render_ids(
|
|
@@ -292,22 +295,29 @@ class GLM5Renderer:
|
|
|
292
295
|
last_prev = previous_ids[-1]
|
|
293
296
|
|
|
294
297
|
ext: list[int] = []
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
#
|
|
299
|
-
#
|
|
300
|
-
#
|
|
301
|
-
#
|
|
298
|
+
ext_indices: list[int] = []
|
|
299
|
+
ext_sampled: list[bool] = []
|
|
300
|
+
|
|
301
|
+
# Bridge populates ``message_indices`` (relative to ``new_messages``)
|
|
302
|
+
# and ``sampled_mask`` (uniformly ``False`` — every token the
|
|
303
|
+
# bridge emits is template scaffolding for the next prompt, not
|
|
304
|
+
# something the model sampled). Downstream consumers can run
|
|
305
|
+
# :meth:`RenderedTokens.tokens_per_message` on the bridge output
|
|
306
|
+
# to get per-new-message token counts without re-rendering.
|
|
302
307
|
def emit_special(
|
|
303
|
-
token_id: int,
|
|
308
|
+
token_id: int, msg_idx: int = -1, *, is_sampled: bool = False
|
|
304
309
|
) -> None:
|
|
305
310
|
ext.append(token_id)
|
|
311
|
+
ext_indices.append(msg_idx)
|
|
312
|
+
ext_sampled.append(is_sampled)
|
|
306
313
|
|
|
307
314
|
def emit_text(
|
|
308
|
-
text: str,
|
|
315
|
+
text: str, msg_idx: int = -1, *, is_sampled: bool = False
|
|
309
316
|
) -> None:
|
|
310
|
-
|
|
317
|
+
ids = self._encode(text)
|
|
318
|
+
ext.extend(ids)
|
|
319
|
+
ext_indices.extend([msg_idx] * len(ids))
|
|
320
|
+
ext_sampled.extend([is_sampled] * len(ids))
|
|
311
321
|
|
|
312
322
|
for i, msg in enumerate(new_messages):
|
|
313
323
|
role = msg.get("role")
|
|
@@ -340,7 +350,13 @@ class GLM5Renderer:
|
|
|
340
350
|
else:
|
|
341
351
|
emit_special(self._think_end, -1)
|
|
342
352
|
|
|
343
|
-
|
|
353
|
+
total_len = len(previous_ids) + len(ext)
|
|
354
|
+
return RenderedTokens(
|
|
355
|
+
token_ids=previous_ids + ext,
|
|
356
|
+
message_indices=[-1] * len(previous_ids) + ext_indices,
|
|
357
|
+
sampled_mask=[False] * total_len,
|
|
358
|
+
message_roles=[m.get("role") or "" for m in new_messages],
|
|
359
|
+
)
|
|
344
360
|
|
|
345
361
|
def _render_assistant(
|
|
346
362
|
self,
|