renderers 0.1.8.dev2__tar.gz → 0.1.8.dev4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/PKG-INFO +1 -1
  2. {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/renderers/__init__.py +2 -0
  3. {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/renderers/_version.py +2 -2
  4. {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/renderers/base.py +176 -0
  5. {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/renderers/client.py +99 -53
  6. {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/renderers/deepseek_v3.py +28 -12
  7. {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/renderers/default.py +6 -1
  8. {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/renderers/glm45.py +28 -12
  9. {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/renderers/glm5.py +28 -12
  10. {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/renderers/gpt_oss.py +23 -4
  11. {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/renderers/kimi_k2.py +28 -11
  12. {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/renderers/kimi_k25.py +37 -20
  13. {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/renderers/laguna_xs2.py +36 -19
  14. {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/renderers/minimax_m2.py +28 -12
  15. {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/renderers/nemotron3.py +28 -12
  16. {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/renderers/qwen3.py +28 -12
  17. {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/renderers/qwen35.py +37 -22
  18. {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/renderers/qwen3_vl.py +26 -12
  19. {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/tests/test_client.py +156 -11
  20. {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/tests/test_multimodal.py +37 -0
  21. renderers-0.1.8.dev4/tests/test_tokens_per_message.py +325 -0
  22. {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/.github/workflows/publish.yml +0 -0
  23. {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/.github/workflows/style.yml +0 -0
  24. {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/.github/workflows/test.yml +0 -0
  25. {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/.gitignore +0 -0
  26. {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/.pre-commit-config.yaml +0 -0
  27. {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/LICENSE +0 -0
  28. {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/README.md +0 -0
  29. {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/examples/README.md +0 -0
  30. {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/examples/sglang/multiturn_generate_sglang.py +0 -0
  31. {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/examples/sglang/online_multiturn_sglang.py +0 -0
  32. {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/examples/tinker/multiturn_generate_tinker.py +0 -0
  33. {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/examples/transformers/multiturn_generate_transformers.py +0 -0
  34. {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/examples/vllm/multiturn_generate_vllm.py +0 -0
  35. {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/pyproject.toml +0 -0
  36. {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/renderers/parsers.py +0 -0
  37. {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/renderers/parsing.py +0 -0
  38. {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/renderers/qwen36.py +0 -0
  39. {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/tests/conftest.py +0 -0
  40. {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/tests/test_bridge.py +0 -0
  41. {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/tests/test_build_helpers.py +0 -0
  42. {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/tests/test_gpt_oss_harmony_parity.py +0 -0
  43. {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/tests/test_incremental.py +0 -0
  44. {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/tests/test_load_tokenizer.py +0 -0
  45. {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/tests/test_load_tokenizer_fastokens.py +0 -0
  46. {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/tests/test_message_indices.py +0 -0
  47. {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/tests/test_parse_response.py +0 -0
  48. {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/tests/test_parse_response_robustness.py +0 -0
  49. {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/tests/test_parsers.py +0 -0
  50. {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/tests/test_preserve_thinking.py +0 -0
  51. {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/tests/test_qwen35_size_coverage.py +0 -0
  52. {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/tests/test_render_ids.py +0 -0
  53. {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/tests/test_roundtrip.py +0 -0
  54. {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/tests/test_sampled_mask.py +0 -0
  55. {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/tests/test_tool_arg_type_preservation.py +0 -0
  56. {renderers-0.1.8.dev2 → renderers-0.1.8.dev4}/uv.lock +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: renderers
3
- Version: 0.1.8.dev2
3
+ Version: 0.1.8.dev4
4
4
  Summary: Chat template renderers — deterministic message-to-token conversion for LLM training
5
5
  License-Expression: Apache-2.0
6
6
  License-File: LICENSE
@@ -36,6 +36,7 @@ from renderers.base import (
36
36
  reject_assistant_in_extension,
37
37
  trim_to_turn_close,
38
38
  )
39
+ from renderers.client import OverlongPromptError
39
40
  from renderers.deepseek_v3 import DeepSeekV3Renderer
40
41
  from renderers.default import DefaultRenderer
41
42
  from renderers.glm5 import GLM5Renderer
@@ -69,6 +70,7 @@ __all__ = [
69
70
  "MultiModalData",
70
71
  "MultimodalRenderer",
71
72
  "Nemotron3Renderer",
73
+ "OverlongPromptError",
72
74
  "ParsedResponse",
73
75
  "ParsedToolCall",
74
76
  "PlaceholderRange",
@@ -18,7 +18,7 @@ version_tuple: tuple[int | str, ...]
18
18
  commit_id: str | None
19
19
  __commit_id__: str | None
20
20
 
21
- __version__ = version = '0.1.8.dev2'
22
- __version_tuple__ = version_tuple = (0, 1, 8, 'dev2')
21
+ __version__ = version = '0.1.8.dev4'
22
+ __version_tuple__ = version_tuple = (0, 1, 8, 'dev4')
23
23
 
24
24
  __commit_id__ = commit_id = None
@@ -177,8 +177,162 @@ class RenderedTokens:
177
177
  token_ids: list[int] = field(default_factory=list)
178
178
  message_indices: list[int] = field(default_factory=list)
179
179
  sampled_mask: list[bool] = field(default_factory=list)
180
+ message_roles: list[str] = field(default_factory=list)
180
181
  multi_modal_data: "MultiModalData | None" = None
181
182
 
183
+ def tokens_per_message(
184
+ self, n_messages: int | None = None, *, sampled_only: bool = False
185
+ ) -> list[int]:
186
+ """Count rendered tokens attributed to each caller-relative message.
187
+
188
+ ``out[i]`` is the number of tokens with ``message_indices[k] == i``,
189
+ i.e. tokens the renderer attributed to ``messages[i]``. This
190
+ includes template scaffolding the renderer wraps around the
191
+ message — the ``<|im_start|>role\\n`` opener, the closing
192
+ ``<|im_end|>\\n``, etc. — because those are the renderer's own
193
+ attribution decision and are preserved verbatim here. Tokens with
194
+ ``message_indices[k] == -1`` (scaffolding outside any single
195
+ message, e.g. the trailing generation prompt) are not counted.
196
+
197
+ With ``sampled_only=True``, counts only tokens the model would
198
+ have emitted at inference (``sampled_mask[k] is True``). For
199
+ example, length-penalty signals in RL: the template wraps each
200
+ assistant turn in scaffolding tokens (e.g. ``<|im_start|>assistant\\n``,
201
+ ``<|im_end|>\\n``) that are constant-size and not chosen by the
202
+ model, so they shouldn't enter the penalty. For roles the model
203
+ never samples (``user``, ``tool``, ``system``), the
204
+ ``sampled_only`` count is zero by construction. Renderers that
205
+ don't populate ``sampled_mask`` (``DefaultRenderer`` — the Jinja
206
+ template is opaque) return all zeros under ``sampled_only=True``.
207
+
208
+ ``n_messages`` defaults to ``len(self.message_roles)``, which
209
+ every Renderer populates with the caller-relative message list
210
+ (caller's ``messages`` for ``render()``; ``new_messages`` for
211
+ ``bridge_to_next_turn()``). Pass it explicitly only to truncate
212
+ — indices outside ``[0, n_messages)`` are ignored, so passing a
213
+ smaller value won't raise; it just drops the tail. Values larger
214
+ than ``len(self.message_roles)`` are clamped, so the returned
215
+ list never claims more messages than the renderer attributed.
216
+
217
+ Works on results from both :meth:`Renderer.render` and
218
+ :meth:`Renderer.bridge_to_next_turn`. For a bridge result the
219
+ indices are relative to the new messages the bridge added, not
220
+ the full conversation history; the prior portion is uniformly
221
+ ``-1`` (and ``sampled_mask`` uniformly ``False``), so it
222
+ contributes nothing to either count.
223
+ """
224
+ if n_messages is None:
225
+ n_messages = len(self.message_roles)
226
+ else:
227
+ n_messages = min(n_messages, len(self.message_roles))
228
+ out = [0] * n_messages
229
+ if sampled_only:
230
+ if len(self.sampled_mask) != len(self.token_ids):
231
+ return out
232
+ for idx, sampled in zip(self.message_indices, self.sampled_mask):
233
+ if sampled and 0 <= idx < n_messages:
234
+ out[idx] += 1
235
+ else:
236
+ for idx in self.message_indices:
237
+ if 0 <= idx < n_messages:
238
+ out[idx] += 1
239
+ return out
240
+
241
+ def message_token_spans(self) -> list[tuple[int, int] | None]:
242
+ """Per-message ``(start, end)`` slices into :attr:`token_ids`.
243
+
244
+ ``out[i]`` is the half-open span ``[start, end)`` such that
245
+ ``token_ids[start:end]`` are the tokens attributed to
246
+ ``messages[i]`` (or ``new_messages[i]`` for a bridge result).
247
+ Messages that contributed no tokens get ``None``. Renderer
248
+ scaffolding outside any message (``message_indices[k] == -1``)
249
+ is not represented.
250
+
251
+ Hand-coded renderers emit each message's tokens contiguously,
252
+ so the span is well-defined. The implementation tolerates
253
+ non-contiguous attribution by returning the outer span
254
+ ``(first_k, last_k + 1)``; if you suspect interleaving, slice
255
+ ``message_indices`` yourself to verify.
256
+
257
+ Returns ``len(self.message_roles)`` entries when ``message_roles``
258
+ is populated. Otherwise infers the count from
259
+ ``max(message_indices) + 1`` — useful for manually-constructed
260
+ ``RenderedTokens`` in tests but only correct when the last
261
+ message contributed at least one token.
262
+
263
+ Cheap to call: single pass over ``message_indices``. Re-call
264
+ rather than caching the result if you mutate the dataclass.
265
+ """
266
+ if self.message_roles:
267
+ n_messages = len(self.message_roles)
268
+ else:
269
+ max_idx = -1
270
+ for idx in self.message_indices:
271
+ if idx > max_idx:
272
+ max_idx = idx
273
+ n_messages = max_idx + 1
274
+
275
+ firsts: list[int] = [-1] * n_messages
276
+ lasts: list[int] = [-1] * n_messages
277
+ for k, idx in enumerate(self.message_indices):
278
+ if 0 <= idx < n_messages:
279
+ if firsts[idx] == -1:
280
+ firsts[idx] = k
281
+ lasts[idx] = k
282
+
283
+ out: list[tuple[int, int] | None] = []
284
+ for i in range(n_messages):
285
+ if firsts[i] == -1:
286
+ out.append(None)
287
+ else:
288
+ out.append((firsts[i], lasts[i] + 1))
289
+ return out
290
+
291
+ def role_token_spans(self) -> dict[str, list[tuple[int, int]]]:
292
+ """:meth:`message_token_spans` regrouped by ``message_roles``.
293
+
294
+ Maps each role appearing in :attr:`message_roles` to a list of
295
+ ``(start, end)`` spans — one per occurrence of that role, in
296
+ message order. Messages with no contributed tokens add no span (their role key is still present).
297
+ Returns an empty dict if :attr:`message_roles` is empty.
298
+
299
+ Intended for per-role statistics that operate on per-token
300
+ signals — e.g. ``logprobs[start:end]`` for each assistant span
301
+ to compute per-turn perplexity, or
302
+ ``attention[start:end]`` for tool-response attention analysis.
303
+ """
304
+ spans = self.message_token_spans()
305
+ out: dict[str, list[tuple[int, int]]] = {}
306
+ for role, span in zip(self.message_roles, spans):
307
+ if span is None:
308
+ out.setdefault(role, [])
309
+ continue
310
+ out.setdefault(role, []).append(span)
311
+ return out
312
+
313
+ def tokens_by_role(self, *, sampled_only: bool = False) -> dict[str, int]:
314
+ """Sum :meth:`tokens_per_message` grouped by ``message_roles``.
315
+
316
+ Convenience for length-penalty bookkeeping in RL trainers:
317
+ ``rendered.tokens_by_role(sampled_only=True)["assistant"]`` is
318
+ the count of tokens the model actually emitted across all
319
+ assistant turns — template scaffolding excluded.
320
+ ``rendered.tokens_by_role()["tool"]`` is the raw count of
321
+ tool-response tokens (``sampled_only`` is zero for ``tool`` by
322
+ construction since the model never samples those).
323
+
324
+ Roles present in :attr:`message_roles` always appear in the
325
+ returned dict, even with post-filter count ``0``. A role that
326
+ never occurs in the conversation is absent from the dict, so use
327
+ ``.get(role, 0)`` for such roles. Returns an empty dict if
328
+ :attr:`message_roles` is empty.
329
+ """
330
+ counts = self.tokens_per_message(sampled_only=sampled_only)
331
+ out: dict[str, int] = {}
332
+ for role, n in zip(self.message_roles, counts):
333
+ out[role] = out.get(role, 0) + n
334
+ return out
335
+
182
336
 
183
337
  class ToolCallParseStatus(str, enum.Enum):
184
338
  """Per-attempt outcome of parsing a single ``<tool_call>`` block.
@@ -358,6 +512,25 @@ class Renderer(Protocol):
358
512
  list so far with ``add_generation_prompt=True`` — except prev
359
513
  sampled tokens are kept verbatim rather than re-rendered).
360
514
 
515
+ Attribution on the returned ``RenderedTokens``:
516
+
517
+ - ``message_indices`` is ``-1`` over the entire prior portion
518
+ (length ``len(previous_ids)`` after :func:`trim_to_turn_close`)
519
+ because the bridge gets the prior as raw token lists with no
520
+ attribution. Over the bridge-added portion, indices are
521
+ relative to ``new_messages``: a token rendered as part of
522
+ ``new_messages[i]`` carries ``i``, and inter-turn separators /
523
+ the trailing generation prompt carry ``-1``. So
524
+ ``bridge.tokens_per_message(len(new_messages))`` gives the
525
+ per-new-message token count for length-penalty bookkeeping.
526
+ - ``sampled_mask`` is uniformly ``False`` across the entire
527
+ returned sequence. The bridge output is consumed as the next
528
+ turn's prompt; nothing it emits was model-sampled, and the
529
+ bridge has no way to recover which prior tokens were. If the
530
+ caller needs that distinction for the prior portion, they
531
+ have it directly: every token in ``prev_completion_ids`` was
532
+ sampled; every token in ``prev_prompt_ids`` was not.
533
+
361
534
  Text-only renderers return :class:`RenderedTokens` with
362
535
  ``multi_modal_data=None``. Multimodal renderers (see
363
536
  :class:`MultimodalRenderer`) populate ``multi_modal_data`` so
@@ -593,6 +766,8 @@ MODEL_RENDERER_MAP: dict[str, str] = {
593
766
  "Qwen/Qwen3-14B": "qwen3",
594
767
  "Qwen/Qwen3-32B": "qwen3",
595
768
  "Qwen/Qwen3-30B-A3B": "qwen3",
769
+ "Qwen/Qwen3-30B-A3B-Instruct-2507": "qwen3",
770
+ "Qwen/Qwen3-30B-A3B-Thinking-2507": "qwen3",
596
771
  "Qwen/Qwen3-235B-A22B": "qwen3",
597
772
  # Qwen3.5. All seven sizes share the same renderer. The 4B / 9B /
598
773
  # 35B-A3B / 122B-A10B / 397B-A17B chat template defaults
@@ -619,6 +794,7 @@ MODEL_RENDERER_MAP: dict[str, str] = {
619
794
  "Qwen/Qwen3-VL-30B-A3B-Instruct": "qwen3-vl",
620
795
  # GLM-5 family (GLM-4.7 reuses the GLM-5 template).
621
796
  "zai-org/GLM-5": "glm-5",
797
+ "zai-org/GLM-5-FP8": "glm-5",
622
798
  "zai-org/GLM-4.7-Flash": "glm-5",
623
799
  "zai-org/GLM-5.1": "glm-5.1",
624
800
  # GLM-4.5.
@@ -14,10 +14,11 @@ from __future__ import annotations
14
14
  import asyncio
15
15
  import base64
16
16
  import logging
17
+ from collections.abc import Mapping
17
18
  from typing import Any, cast
18
19
 
19
20
  import numpy as np
20
- from openai import AsyncOpenAI, BadRequestError
21
+ from openai import AsyncOpenAI
21
22
 
22
23
  from renderers.base import (
23
24
  Message,
@@ -31,6 +32,79 @@ from renderers.base import (
31
32
  _request_logger = logging.getLogger("renderers.client")
32
33
 
33
34
 
35
+ class OverlongPromptError(Exception):
36
+ """The rendered prompt exceeds the engine's context window.
37
+
38
+ Raised by :func:`generate` when the rendered token sequence is strictly
39
+ longer than the resolved cap — either an explicit ``max_prompt_len`` the
40
+ caller passed in, or the engine's ``max_model_len`` discovered via
41
+ ``GET /v1/models``. Caught client-side before the engine ever sees the
42
+ request, so callers route the failure to a deterministic policy (skip /
43
+ truncate / count) instead of round-tripping through an engine 4xx.
44
+
45
+ Named after the corresponding ``verifiers.errors.OverlongPromptError``;
46
+ the two are distinct classes (different package hierarchies) but the
47
+ concept is the same and downstream clients translate one to the other.
48
+ """
49
+
50
+ def __init__(self, *, prompt_len: int, max_prompt_len: int) -> None:
51
+ self.prompt_len = prompt_len
52
+ self.max_prompt_len = max_prompt_len
53
+ super().__init__(
54
+ f"Prompt length ({prompt_len}) exceeds maximum "
55
+ f"context length ({max_prompt_len})."
56
+ )
57
+
58
+
59
+ # Per-process cache of resolved engine context-length caps, keyed by
60
+ # ``(base_url, model)``. ``None`` is the "we asked the engine and it didn't
61
+ # tell us" sentinel — distinct from "key missing" (haven't asked yet). The
62
+ # lock serializes the first lookup per key; cache hits avoid the lock.
63
+ _max_prompt_len_cache: dict[tuple[str, str], int | None] = {}
64
+ _max_prompt_len_lock = asyncio.Lock()
65
+
66
+
67
+ async def _resolve_max_prompt_len(client: AsyncOpenAI, model: str) -> int | None:
68
+ """Discover ``max_model_len`` from the engine via ``GET /v1/models``.
69
+
70
+ OpenAI-API-compatible engines expose model metadata at this endpoint;
71
+ vLLM extends its ``ModelCard`` with a ``max_model_len`` field. Engines
72
+ that don't (SGLang as of this writing, third-party gateways, etc.) get
73
+ a cached ``None`` and the pre-flight overflow check silently disables —
74
+ callers fall back to whatever reactive handling they have for engine
75
+ 4xx, which the verifiers ``@handle_openai_overlong_prompt`` decorator
76
+ already supplies for the prime-rl path.
77
+
78
+ Any exception during lookup (network error, non-JSON body, attribute
79
+ miss on a mock client in tests) is treated as "unknown cap": cached
80
+ ``None`` so we don't retry on every call.
81
+ """
82
+ key = (str(getattr(client, "base_url", "")), model)
83
+ if key in _max_prompt_len_cache:
84
+ return _max_prompt_len_cache[key]
85
+ async with _max_prompt_len_lock:
86
+ if key in _max_prompt_len_cache:
87
+ return _max_prompt_len_cache[key]
88
+ try:
89
+ payload = await client.get("/models", cast_to=cast(Any, dict[str, Any]))
90
+ except Exception as exc:
91
+ _request_logger.debug("max_prompt_len lookup failed: %s", exc)
92
+ _max_prompt_len_cache[key] = None
93
+ return None
94
+ value: int | None = None
95
+ for card in payload.get("data") or []:
96
+ if not isinstance(card, Mapping):
97
+ continue
98
+ if card.get("id") != model:
99
+ continue
100
+ raw = card.get("max_model_len")
101
+ if isinstance(raw, int) and raw > 0:
102
+ value = raw
103
+ break
104
+ _max_prompt_len_cache[key] = value
105
+ return value
106
+
107
+
34
108
  async def _maybe_offload(renderer: Renderer | RendererPool, fn):
35
109
  """Run sync renderer work on a thread iff ``renderer`` is a pool.
36
110
 
@@ -58,6 +132,7 @@ async def generate(
58
132
  cache_salt: str | None = None,
59
133
  priority: int | None = None,
60
134
  extra_headers: dict[str, str] | None = None,
135
+ max_prompt_len: int | None = None,
61
136
  ) -> dict[str, Any]:
62
137
  """Tokenize messages, call vLLM /inference/v1/generate, parse the response.
63
138
 
@@ -74,6 +149,16 @@ async def generate(
74
149
  mm_placeholders, kwargs_data) before POSTing. The serializer imports
75
150
  ``vllm.*`` lazily so text-only consumers never pay for the import.
76
151
 
152
+ ``max_prompt_len`` controls the pre-flight overflow check. When the
153
+ rendered prompt is strictly longer than the cap, the request is never
154
+ sent and ``OverlongPromptError`` is raised. If ``max_prompt_len`` is
155
+ ``None`` (the default), the cap is auto-discovered once per
156
+ ``(base_url, model)`` via ``GET /v1/models`` (vLLM's
157
+ ``ModelCard.max_model_len`` extension); engines that don't expose it
158
+ cache a ``None`` cap and the pre-flight silently disables. Engine 4xx
159
+ that still slip through propagate raw — converting them into a domain
160
+ error is the calling client's job (its error shape is engine-specific).
161
+
77
162
  Returns a dict with: request_id, prompt_ids, completion_ids,
78
163
  completion_logprobs, content, reasoning_content, tool_calls,
79
164
  finish_reason, routed_experts.
@@ -96,6 +181,13 @@ async def generate(
96
181
 
97
182
  prompt_ids, stop_token_ids, mm_data = await _maybe_offload(renderer, _prepare)
98
183
 
184
+ if max_prompt_len is None:
185
+ max_prompt_len = await _resolve_max_prompt_len(client, model)
186
+ if max_prompt_len is not None and len(prompt_ids) > max_prompt_len:
187
+ raise OverlongPromptError(
188
+ prompt_len=len(prompt_ids), max_prompt_len=max_prompt_len
189
+ )
190
+
99
191
  sp: dict[str, Any] = dict(sampling_params or {})
100
192
  sp["stop_token_ids"] = stop_token_ids
101
193
  sp["logprobs"] = 1
@@ -135,16 +227,7 @@ async def generate(
135
227
  }
136
228
  if extra_headers:
137
229
  post_kwargs["options"] = cast(Any, {"headers": extra_headers})
138
- try:
139
- data = await client.post(endpoint, **post_kwargs)
140
- except BadRequestError as exc:
141
- _log_overlong_prompt_diagnostic(
142
- prompt_ids=prompt_ids,
143
- messages=messages,
144
- max_tokens=sp.get("max_tokens"),
145
- exc=exc,
146
- )
147
- raise
230
+ data = await client.post(endpoint, **post_kwargs)
148
231
 
149
232
  choice = (data.get("choices") or [{}])[0]
150
233
  completion_ids = choice.get("token_ids") or []
@@ -225,6 +308,7 @@ def _build_mm_features(
225
308
  to change. Don't pre-build the abstraction with one engine in tree.
226
309
  """
227
310
  from renderers.qwen3_vl import Qwen3VLRenderer
311
+ from renderers.qwen35 import Qwen35Renderer
228
312
 
229
313
  # Type dispatch only needs the renderer class. Pools expose
230
314
  # ``renderer_cls`` as a snapshot attribute, so we don't have to check
@@ -233,7 +317,10 @@ def _build_mm_features(
233
317
  renderer.renderer_cls if isinstance(renderer, RendererPool) else type(renderer)
234
318
  )
235
319
 
236
- if issubclass(renderer_cls, Qwen3VLRenderer):
320
+ # Qwen3-VL and Qwen3.5 both ship ``pixel_values`` + ``image_grid_thw``
321
+ # via the shared Qwen2-VL field factory. ``spatial_merge_size=2`` is
322
+ # the family default and matches every Qwen-VL processor in tree.
323
+ if issubclass(renderer_cls, (Qwen3VLRenderer, Qwen35Renderer)):
237
324
  return _build_qwen_vl_features(mm_data, spatial_merge_size=2)
238
325
 
239
326
  raise NotImplementedError(
@@ -305,44 +392,3 @@ def _build_qwen_vl_features(
305
392
  out["kwargs_data"] = None
306
393
 
307
394
  return out
308
-
309
-
310
- def _log_overlong_prompt_diagnostic(
311
- *,
312
- prompt_ids: list[int],
313
- messages: list[Message],
314
- max_tokens: int | None,
315
- exc: BadRequestError,
316
- ) -> None:
317
- """Log a structured snapshot when vLLM rejects with 4xx — usually overlong.
318
-
319
- Captures total prompt length, per-message role + character count, and
320
- the first chunk of the response body.
321
- """
322
- body_text = ""
323
- response = getattr(exc, "response", None)
324
- if response is not None:
325
- body_text = (response.text or "")[:500].replace("\n", " ")
326
- msg_summary = []
327
- for i, m in enumerate(messages):
328
- role = m.get("role", "?")
329
- content = m.get("content")
330
- if isinstance(content, str):
331
- content_len = len(content)
332
- elif isinstance(content, list):
333
- content_len = sum(
334
- len(p.get("text", "")) if isinstance(p, dict) else 0 for p in content
335
- )
336
- else:
337
- content_len = 0
338
- tool_calls = m.get("tool_calls")
339
- tc_count = len(tool_calls) if tool_calls else 0
340
- msg_summary.append(f"[{i}]{role}(c={content_len},tc={tc_count})")
341
- _request_logger.warning(
342
- "vllm 4xx prompt_len=%d messages=%d max_tokens=%s per_msg=%s response_body=%s",
343
- len(prompt_ids),
344
- len(messages),
345
- max_tokens,
346
- " ".join(msg_summary),
347
- body_text,
348
- )
@@ -210,7 +210,10 @@ class DeepSeekV3Renderer:
210
210
  emit_text("<think>\n", -1, is_sampled=False)
211
211
 
212
212
  return RenderedTokens(
213
- token_ids=tokens, message_indices=indices, sampled_mask=sampled
213
+ token_ids=tokens,
214
+ message_indices=indices,
215
+ sampled_mask=sampled,
216
+ message_roles=[m.get("role") or "" for m in messages],
214
217
  )
215
218
 
216
219
  def render_ids(
@@ -271,22 +274,29 @@ class DeepSeekV3Renderer:
271
274
  return None
272
275
 
273
276
  ext: list[int] = []
274
-
275
- # Bridge output is consumed as the next turn's prompt — the
276
- # caller blanket-masks it via ``prompt_mask=[False]*N``, so we
277
- # don't track sampled_mask here. Local helpers accept the kwarg
278
- # for signature compatibility with ``_render_tool`` and ignore
279
- # it; the returned ``RenderedTokens`` leaves ``sampled_mask``
280
- # empty.
277
+ ext_indices: list[int] = []
278
+ ext_sampled: list[bool] = []
279
+
280
+ # Bridge populates ``message_indices`` (relative to ``new_messages``)
281
+ # and ``sampled_mask`` (uniformly ``False`` — every token the
282
+ # bridge emits is template scaffolding for the next prompt, not
283
+ # something the model sampled). Downstream consumers can run
284
+ # :meth:`RenderedTokens.tokens_per_message` on the bridge output
285
+ # to get per-new-message token counts without re-rendering.
281
286
  def emit_special(
282
- token_id: int, _msg_idx: int = -1, *, is_sampled: bool = False
287
+ token_id: int, msg_idx: int = -1, *, is_sampled: bool = False
283
288
  ) -> None:
284
289
  ext.append(token_id)
290
+ ext_indices.append(msg_idx)
291
+ ext_sampled.append(is_sampled)
285
292
 
286
293
  def emit_text(
287
- text: str, _msg_idx: int = -1, *, is_sampled: bool = False
294
+ text: str, msg_idx: int = -1, *, is_sampled: bool = False
288
295
  ) -> None:
289
- ext.extend(self._encode(text))
296
+ ids = self._encode(text)
297
+ ext.extend(ids)
298
+ ext_indices.extend([msg_idx] * len(ids))
299
+ ext_sampled.extend([is_sampled] * len(ids))
290
300
 
291
301
  for i, msg in enumerate(new_messages):
292
302
  role = msg.get("role")
@@ -329,7 +339,13 @@ class DeepSeekV3Renderer:
329
339
  if self._enable_thinking:
330
340
  emit_text("<think>\n", -1)
331
341
 
332
- return RenderedTokens(token_ids=previous_ids + ext)
342
+ total_len = len(previous_ids) + len(ext)
343
+ return RenderedTokens(
344
+ token_ids=previous_ids + ext,
345
+ message_indices=[-1] * len(previous_ids) + ext_indices,
346
+ sampled_mask=[False] * total_len,
347
+ message_roles=[m.get("role") or "" for m in new_messages],
348
+ )
333
349
 
334
350
  # ------------------------------------------------------------------
335
351
  # Assistant rendering
@@ -143,7 +143,12 @@ class DefaultRenderer:
143
143
  token_ids = full_ids
144
144
  message_indices.extend([-1] * len(gen_tokens))
145
145
 
146
- return RenderedTokens(token_ids=token_ids, message_indices=message_indices)
146
+ message_roles = [m.get("role") or "" for m in messages]
147
+ return RenderedTokens(
148
+ token_ids=token_ids,
149
+ message_indices=message_indices,
150
+ message_roles=message_roles,
151
+ )
147
152
 
148
153
  def _apply(self, messages, *, tools=None, add_generation_prompt=False) -> list[int]:
149
154
  kwargs = dict(self._chat_template_kwargs)
@@ -203,7 +203,10 @@ class GLM45Renderer:
203
203
  emit_special(self._think_end, -1, is_sampled=False)
204
204
 
205
205
  return RenderedTokens(
206
- token_ids=tokens, message_indices=indices, sampled_mask=sampled
206
+ token_ids=tokens,
207
+ message_indices=indices,
208
+ sampled_mask=sampled,
209
+ message_roles=[m.get("role") or "" for m in messages],
207
210
  )
208
211
 
209
212
  def render_ids(
@@ -271,22 +274,29 @@ class GLM45Renderer:
271
274
  last_prev = previous_ids[-1]
272
275
 
273
276
  ext: list[int] = []
274
-
275
- # Bridge output is consumed as the next turn's prompt — the
276
- # caller blanket-masks it via ``prompt_mask=[False]*N``, so we
277
- # don't track sampled_mask here. Local helpers accept the kwarg
278
- # for signature compatibility with ``_render_tool`` and ignore
279
- # it; the returned ``RenderedTokens`` leaves ``sampled_mask``
280
- # empty.
277
+ ext_indices: list[int] = []
278
+ ext_sampled: list[bool] = []
279
+
280
+ # Bridge populates ``message_indices`` (relative to ``new_messages``)
281
+ # and ``sampled_mask`` (uniformly ``False`` — every token the
282
+ # bridge emits is template scaffolding for the next prompt, not
283
+ # something the model sampled). Downstream consumers can run
284
+ # :meth:`RenderedTokens.tokens_per_message` on the bridge output
285
+ # to get per-new-message token counts without re-rendering.
281
286
  def emit_special(
282
- token_id: int, _msg_idx: int = -1, *, is_sampled: bool = False
287
+ token_id: int, msg_idx: int = -1, *, is_sampled: bool = False
283
288
  ) -> None:
284
289
  ext.append(token_id)
290
+ ext_indices.append(msg_idx)
291
+ ext_sampled.append(is_sampled)
285
292
 
286
293
  def emit_text(
287
- text: str, _msg_idx: int = -1, *, is_sampled: bool = False
294
+ text: str, msg_idx: int = -1, *, is_sampled: bool = False
288
295
  ) -> None:
289
- ext.extend(self._encode(text))
296
+ ids = self._encode(text)
297
+ ext.extend(ids)
298
+ ext_indices.extend([msg_idx] * len(ids))
299
+ ext_sampled.extend([is_sampled] * len(ids))
290
300
 
291
301
  for i, msg in enumerate(new_messages):
292
302
  role = msg.get("role")
@@ -318,7 +328,13 @@ class GLM45Renderer:
318
328
  emit_special(self._think, -1)
319
329
  emit_special(self._think_end, -1)
320
330
 
321
- return RenderedTokens(token_ids=previous_ids + ext)
331
+ total_len = len(previous_ids) + len(ext)
332
+ return RenderedTokens(
333
+ token_ids=previous_ids + ext,
334
+ message_indices=[-1] * len(previous_ids) + ext_indices,
335
+ sampled_mask=[False] * total_len,
336
+ message_roles=[m.get("role") or "" for m in new_messages],
337
+ )
322
338
 
323
339
  def _render_assistant(
324
340
  self,
@@ -220,7 +220,10 @@ class GLM5Renderer:
220
220
  emit_special(self._think_end, -1, is_sampled=False)
221
221
 
222
222
  return RenderedTokens(
223
- token_ids=tokens, message_indices=indices, sampled_mask=sampled
223
+ token_ids=tokens,
224
+ message_indices=indices,
225
+ sampled_mask=sampled,
226
+ message_roles=[m.get("role") or "" for m in messages],
224
227
  )
225
228
 
226
229
  def render_ids(
@@ -292,22 +295,29 @@ class GLM5Renderer:
292
295
  last_prev = previous_ids[-1]
293
296
 
294
297
  ext: list[int] = []
295
-
296
- # Bridge output is consumed as the next turn's prompt — the
297
- # caller blanket-masks it via ``prompt_mask=[False]*N``, so we
298
- # don't track sampled_mask here. Local helpers accept the kwarg
299
- # for signature compatibility with ``_render_assistant`` /
300
- # ``_render_tool`` and ignore it; the returned ``RenderedTokens``
301
- # leaves ``sampled_mask`` empty.
298
+ ext_indices: list[int] = []
299
+ ext_sampled: list[bool] = []
300
+
301
+ # Bridge populates ``message_indices`` (relative to ``new_messages``)
302
+ # and ``sampled_mask`` (uniformly ``False`` — every token the
303
+ # bridge emits is template scaffolding for the next prompt, not
304
+ # something the model sampled). Downstream consumers can run
305
+ # :meth:`RenderedTokens.tokens_per_message` on the bridge output
306
+ # to get per-new-message token counts without re-rendering.
302
307
  def emit_special(
303
- token_id: int, _msg_idx: int = -1, *, is_sampled: bool = False
308
+ token_id: int, msg_idx: int = -1, *, is_sampled: bool = False
304
309
  ) -> None:
305
310
  ext.append(token_id)
311
+ ext_indices.append(msg_idx)
312
+ ext_sampled.append(is_sampled)
306
313
 
307
314
  def emit_text(
308
- text: str, _msg_idx: int = -1, *, is_sampled: bool = False
315
+ text: str, msg_idx: int = -1, *, is_sampled: bool = False
309
316
  ) -> None:
310
- ext.extend(self._encode(text))
317
+ ids = self._encode(text)
318
+ ext.extend(ids)
319
+ ext_indices.extend([msg_idx] * len(ids))
320
+ ext_sampled.extend([is_sampled] * len(ids))
311
321
 
312
322
  for i, msg in enumerate(new_messages):
313
323
  role = msg.get("role")
@@ -340,7 +350,13 @@ class GLM5Renderer:
340
350
  else:
341
351
  emit_special(self._think_end, -1)
342
352
 
343
- return RenderedTokens(token_ids=previous_ids + ext)
353
+ total_len = len(previous_ids) + len(ext)
354
+ return RenderedTokens(
355
+ token_ids=previous_ids + ext,
356
+ message_indices=[-1] * len(previous_ids) + ext_indices,
357
+ sampled_mask=[False] * total_len,
358
+ message_roles=[m.get("role") or "" for m in new_messages],
359
+ )
344
360
 
345
361
  def _render_assistant(
346
362
  self,