renderers 0.1.8.dev41__tar.gz → 0.1.8.dev43__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/PKG-INFO +1 -1
- {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/renderers/__init__.py +4 -0
- {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/renderers/_version.py +2 -2
- {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/renderers/base.py +8 -1
- {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/renderers/configs.py +21 -12
- renderers-0.1.8.dev43/renderers/deepseek_r1.py +58 -0
- {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/renderers/deepseek_v3.py +40 -33
- {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/renderers/kimi_k25.py +15 -3
- {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/renderers/parsing.py +67 -3
- {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/renderers/qwen3.py +2 -0
- {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/renderers/qwen3_vl.py +2 -0
- {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/tests/conftest.py +10 -0
- renderers-0.1.8.dev43/tests/test_deepseek_r1.py +152 -0
- {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/tests/test_load_tokenizer_fastokens.py +2 -0
- renderers-0.1.8.dev43/tests/test_parse_response.py +276 -0
- {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/tests/test_renderer_config_parity.py +1 -0
- renderers-0.1.8.dev41/tests/test_parse_response.py +0 -137
- {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/.github/workflows/publish-dev.yml +0 -0
- {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/.github/workflows/publish.yml +0 -0
- {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/.github/workflows/style.yml +0 -0
- {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/.github/workflows/test.yml +0 -0
- {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/.gitignore +0 -0
- {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/.pre-commit-config.yaml +0 -0
- {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/LICENSE +0 -0
- {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/README.md +0 -0
- {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/docs/renderer-config.md +0 -0
- {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/examples/README.md +0 -0
- {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/examples/sglang/multiturn_generate_sglang.py +0 -0
- {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/examples/sglang/online_multiturn_sglang.py +0 -0
- {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/examples/tinker/multiturn_generate_tinker.py +0 -0
- {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/examples/transformers/multiturn_generate_transformers.py +0 -0
- {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/examples/vllm/multiturn_generate_vllm.py +0 -0
- {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/pyproject.toml +0 -0
- {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/renderers/client.py +0 -0
- {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/renderers/default.py +0 -0
- {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/renderers/glm45.py +0 -0
- {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/renderers/glm5.py +0 -0
- {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/renderers/gpt_oss.py +0 -0
- {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/renderers/kimi_k2.py +0 -0
- {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/renderers/laguna_xs2.py +0 -0
- {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/renderers/llama_3.py +0 -0
- {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/renderers/minimax_m2.py +0 -0
- {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/renderers/nemotron3.py +0 -0
- {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/renderers/parsers.py +0 -0
- {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/renderers/qwen35.py +0 -0
- {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/renderers/qwen36.py +0 -0
- {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/tests/test_bridge.py +0 -0
- {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/tests/test_build_helpers.py +0 -0
- {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/tests/test_client.py +0 -0
- {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/tests/test_gpt_oss_harmony_parity.py +0 -0
- {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/tests/test_incremental.py +0 -0
- {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/tests/test_is_content.py +0 -0
- {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/tests/test_kimi_k25_tool_schema.py +0 -0
- {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/tests/test_llama_3.py +0 -0
- {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/tests/test_load_tokenizer.py +0 -0
- {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/tests/test_message_indices.py +0 -0
- {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/tests/test_message_tool_names.py +0 -0
- {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/tests/test_multimodal.py +0 -0
- {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/tests/test_nemotron3_ultra.py +0 -0
- {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/tests/test_parse_response_robustness.py +0 -0
- {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/tests/test_parsers.py +0 -0
- {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/tests/test_preserve_thinking.py +0 -0
- {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/tests/test_qwen35_size_coverage.py +0 -0
- {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/tests/test_render_ids.py +0 -0
- {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/tests/test_renderer_config.py +0 -0
- {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/tests/test_roundtrip.py +0 -0
- {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/tests/test_sampled_mask.py +0 -0
- {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/tests/test_tokens_per_message.py +0 -0
- {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/tests/test_tool_arg_type_preservation.py +0 -0
- {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/uv.lock +0 -0
|
@@ -44,6 +44,7 @@ from renderers.configs import (
|
|
|
44
44
|
BaseRendererConfig,
|
|
45
45
|
config_from_name,
|
|
46
46
|
DefaultRendererConfig,
|
|
47
|
+
DeepSeekR1RendererConfig,
|
|
47
48
|
DeepSeekV3RendererConfig,
|
|
48
49
|
GLM45RendererConfig,
|
|
49
50
|
GLM51RendererConfig,
|
|
@@ -74,6 +75,7 @@ from renderers.configs import (
|
|
|
74
75
|
# imports — ``renderers.base._populate_registry`` lazy-imports the
|
|
75
76
|
# concrete classes itself when a renderer is instantiated.
|
|
76
77
|
_LAZY_RENDERERS: dict[str, str] = {
|
|
78
|
+
"DeepSeekR1Renderer": "renderers.deepseek_r1",
|
|
77
79
|
"DeepSeekV3Renderer": "renderers.deepseek_v3",
|
|
78
80
|
"DefaultRenderer": "renderers.default",
|
|
79
81
|
"GLM45Renderer": "renderers.glm45",
|
|
@@ -113,6 +115,8 @@ __all__ = [
|
|
|
113
115
|
"BaseRendererConfig",
|
|
114
116
|
"Content",
|
|
115
117
|
"ContentPart",
|
|
118
|
+
"DeepSeekR1Renderer",
|
|
119
|
+
"DeepSeekR1RendererConfig",
|
|
116
120
|
"DeepSeekV3Renderer",
|
|
117
121
|
"DeepSeekV3RendererConfig",
|
|
118
122
|
"DefaultRenderer",
|
|
@@ -18,7 +18,7 @@ version_tuple: tuple[int | str, ...]
|
|
|
18
18
|
commit_id: str | None
|
|
19
19
|
__commit_id__: str | None
|
|
20
20
|
|
|
21
|
-
__version__ = version = '0.1.8.dev41'
|
|
22
|
-
__version_tuple__ = version_tuple = (0, 1, 8, 'dev41')
|
|
21
|
+
__version__ = version = '0.1.8.dev43'
|
|
22
|
+
__version_tuple__ = version_tuple = (0, 1, 8, 'dev43')
|
|
23
23
|
|
|
24
24
|
__commit_id__ = commit_id = None
|
|
@@ -1030,9 +1030,12 @@ MODEL_RENDERER_MAP: dict[str, str] = {
|
|
|
1030
1030
|
# MiniMax.
|
|
1031
1031
|
"MiniMaxAI/MiniMax-M2": "minimax-m2",
|
|
1032
1032
|
"MiniMaxAI/MiniMax-M2.5": "minimax-m2",
|
|
1033
|
-
# DeepSeek V3.
|
|
1033
|
+
# DeepSeek V3 (non-reasoning).
|
|
1034
1034
|
"deepseek-ai/DeepSeek-V3": "deepseek-v3",
|
|
1035
1035
|
"deepseek-ai/DeepSeek-V3-Base": "deepseek-v3",
|
|
1036
|
+
# DeepSeek R1 (reasoning).
|
|
1037
|
+
"deepseek-ai/DeepSeek-R1": "deepseek-r1",
|
|
1038
|
+
"deepseek-ai/DeepSeek-R1-0528": "deepseek-r1",
|
|
1036
1039
|
# Kimi K2 (K2.5 and K2.6 share the K2.5 template, distinct from K2).
|
|
1037
1040
|
"moonshotai/Kimi-K2-Instruct": "kimi-k2",
|
|
1038
1041
|
"moonshotai/Kimi-K2.5": "kimi-k2.5",
|
|
@@ -1161,6 +1164,8 @@ FASTOKENS_INCOMPATIBLE: frozenset[str] = frozenset(
|
|
|
1161
1164
|
# doesn't yet implement.
|
|
1162
1165
|
"deepseek-ai/DeepSeek-V3",
|
|
1163
1166
|
"deepseek-ai/DeepSeek-V3-Base",
|
|
1167
|
+
"deepseek-ai/DeepSeek-R1",
|
|
1168
|
+
"deepseek-ai/DeepSeek-R1-0528",
|
|
1164
1169
|
}
|
|
1165
1170
|
)
|
|
1166
1171
|
|
|
@@ -1334,6 +1339,7 @@ def load_tokenizer(
|
|
|
1334
1339
|
def _populate_registry():
|
|
1335
1340
|
if RENDERER_REGISTRY:
|
|
1336
1341
|
return
|
|
1342
|
+
from renderers.deepseek_r1 import DeepSeekR1Renderer
|
|
1337
1343
|
from renderers.deepseek_v3 import DeepSeekV3Renderer
|
|
1338
1344
|
from renderers.default import DefaultRenderer
|
|
1339
1345
|
from renderers.glm5 import GLM5Renderer, GLM51Renderer
|
|
@@ -1362,6 +1368,7 @@ def _populate_registry():
|
|
|
1362
1368
|
"glm-4.5": GLM45Renderer,
|
|
1363
1369
|
"minimax-m2": MiniMaxM2Renderer,
|
|
1364
1370
|
"deepseek-v3": DeepSeekV3Renderer,
|
|
1371
|
+
"deepseek-r1": DeepSeekR1Renderer,
|
|
1365
1372
|
"kimi-k2": KimiK2Renderer,
|
|
1366
1373
|
"kimi-k2.5": KimiK25Renderer,
|
|
1367
1374
|
"laguna-xs.2": LagunaXS2Renderer,
|
|
@@ -400,24 +400,30 @@ class Nemotron3RendererConfig(BaseRendererConfig):
|
|
|
400
400
|
|
|
401
401
|
|
|
402
402
|
class DeepSeekV3RendererConfig(BaseRendererConfig):
|
|
403
|
-
"""DeepSeek
|
|
403
|
+
"""DeepSeek-V3 renderer config (non-reasoning).
|
|
404
404
|
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
to control the ``<think>`` prefill at the generation prompt (R1
|
|
409
|
-
distill convention).
|
|
405
|
+
DeepSeek-V3 has no thinking concept: the generation prompt is a bare
|
|
406
|
+
``<|Assistant|>`` and assistant content is emitted verbatim. For the
|
|
407
|
+
reasoning variant use :class:`DeepSeekR1RendererConfig`.
|
|
410
408
|
"""
|
|
411
409
|
|
|
412
410
|
name: Literal["deepseek-v3"] = "deepseek-v3"
|
|
413
411
|
|
|
414
|
-
enable_thinking: bool = True
|
|
415
|
-
"""Renderer convention for the R1-distill family: when ``True``,
|
|
416
|
-
prefill ``<think>`` at the generation prompt. The DeepSeek-V3 Jinja
|
|
417
|
-
template ignores this kwarg upstream; it's not a chat-template
|
|
418
|
-
kwarg in the strict sense."""
|
|
419
412
|
|
|
420
|
-
|
|
413
|
+
class DeepSeekR1RendererConfig(BaseRendererConfig):
|
|
414
|
+
"""DeepSeek-R1 renderer config (reasoning).
|
|
415
|
+
|
|
416
|
+
R1 always reasons — its chat template unconditionally prefills
|
|
417
|
+
``<think>\\n`` at the generation prompt and strips ``</think>`` from
|
|
418
|
+
historical assistant turns. There is therefore no ``enable_thinking``
|
|
419
|
+
knob (thinking is not optional), and ``preserve_*`` flags are no-ops
|
|
420
|
+
(history reasoning is always dropped); both stored for protocol
|
|
421
|
+
uniformity. Applies to full ``deepseek-ai/DeepSeek-R1`` / ``-R1-0528``
|
|
422
|
+
— NOT the R1-Distill-Qwen/Llama models, which use those base
|
|
423
|
+
tokenizers and route to the Qwen3 / Llama-3 renderers.
|
|
424
|
+
"""
|
|
425
|
+
|
|
426
|
+
name: Literal["deepseek-r1"] = "deepseek-r1"
|
|
421
427
|
|
|
422
428
|
|
|
423
429
|
RendererConfig = Annotated[
|
|
@@ -439,6 +445,7 @@ RendererConfig = Annotated[
|
|
|
439
445
|
MiniMaxM2RendererConfig,
|
|
440
446
|
Nemotron3RendererConfig,
|
|
441
447
|
DeepSeekV3RendererConfig,
|
|
448
|
+
DeepSeekR1RendererConfig,
|
|
442
449
|
],
|
|
443
450
|
Field(discriminator="name"),
|
|
444
451
|
]
|
|
@@ -474,6 +481,7 @@ _CONFIG_BY_NAME: dict[str, type[BaseRendererConfig]] = {
|
|
|
474
481
|
"minimax-m2": MiniMaxM2RendererConfig,
|
|
475
482
|
"nemotron-3": Nemotron3RendererConfig,
|
|
476
483
|
"deepseek-v3": DeepSeekV3RendererConfig,
|
|
484
|
+
"deepseek-r1": DeepSeekR1RendererConfig,
|
|
477
485
|
}
|
|
478
486
|
|
|
479
487
|
|
|
@@ -505,6 +513,7 @@ __all__ = [
|
|
|
505
513
|
"AutoRendererConfig",
|
|
506
514
|
"BaseRendererConfig",
|
|
507
515
|
"DefaultRendererConfig",
|
|
516
|
+
"DeepSeekR1RendererConfig",
|
|
508
517
|
"DeepSeekV3RendererConfig",
|
|
509
518
|
"GLM45RendererConfig",
|
|
510
519
|
"GLM51RendererConfig",
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
"""DeepSeek-R1 Renderer — the reasoning variant of the DeepSeek format.
|
|
2
|
+
|
|
3
|
+
R1 shares DeepSeek-V3's special tokens, message structure, and tool-call
|
|
4
|
+
wire format, so it subclasses :class:`renderers.deepseek_v3.DeepSeekV3Renderer`
|
|
5
|
+
and overrides only the two places its chat template diverges:
|
|
6
|
+
|
|
7
|
+
1. Generation prompt — R1 unconditionally prefills ``<think>\\n``
|
|
8
|
+
(``<|Assistant|><think>\\n``) to trigger reasoning, where V3 emits a bare
|
|
9
|
+
``<|Assistant|>``. Handled by ``_GEN_THINK_PREFILL``.
|
|
10
|
+
2. Historical assistant turns — R1 strips the reasoning trace, keeping only
|
|
11
|
+
the text after ``</think>`` (``content.split('</think>')[-1]``), where V3
|
|
12
|
+
emits content verbatim. Handled by ``_prepare_assistant_content``.
|
|
13
|
+
|
|
14
|
+
Everything else — system handling, tool-call / tool-output rendering,
|
|
15
|
+
special-token resolution, and ``parse_response`` (``parse_deepseek_v3``,
|
|
16
|
+
shared) — is inherited unchanged.
|
|
17
|
+
|
|
18
|
+
Scope: full ``deepseek-ai/DeepSeek-R1`` and ``-R1-0528``. The R1-Distill
|
|
19
|
+
models (``DeepSeek-R1-Distill-Qwen/Llama``) use their base models'
|
|
20
|
+
tokenizers and route to the Qwen3 / Llama-3 renderers, not this one.
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
from __future__ import annotations
|
|
24
|
+
|
|
25
|
+
from renderers.base import Message
|
|
26
|
+
from renderers.configs import DeepSeekR1RendererConfig
|
|
27
|
+
from renderers.deepseek_v3 import DeepSeekV3Renderer
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class DeepSeekR1Renderer(DeepSeekV3Renderer):
|
|
31
|
+
"""Deterministic message → token renderer for DeepSeek-R1 models."""
|
|
32
|
+
|
|
33
|
+
_config_cls: type = DeepSeekR1RendererConfig
|
|
34
|
+
_GEN_THINK_PREFILL: str = "<think>\n"
|
|
35
|
+
|
|
36
|
+
def _prepare_assistant_content(self, msg: Message) -> str:
|
|
37
|
+
"""Assistant content with the reasoning trace stripped, mirroring the
|
|
38
|
+
R1 template's ``content.split('</think>')[-1]`` on historical turns.
|
|
39
|
+
|
|
40
|
+
Structured ``thinking``/``text`` parts are reconstructed inline first
|
|
41
|
+
so the same ``</think>`` split applies. The separate
|
|
42
|
+
``reasoning_content`` field is ignored — the R1 chat template never
|
|
43
|
+
reads it, and history reasoning is dropped regardless.
|
|
44
|
+
"""
|
|
45
|
+
content = msg.get("content") or ""
|
|
46
|
+
if isinstance(content, list):
|
|
47
|
+
parts: list[str] = []
|
|
48
|
+
for p in content:
|
|
49
|
+
if not isinstance(p, dict):
|
|
50
|
+
continue
|
|
51
|
+
if p.get("type") == "thinking":
|
|
52
|
+
parts.append(f"<think>{p.get('thinking', '')}</think>")
|
|
53
|
+
elif p.get("type") == "text":
|
|
54
|
+
parts.append(p.get("text", ""))
|
|
55
|
+
content = "".join(parts)
|
|
56
|
+
if "</think>" in content:
|
|
57
|
+
content = content.split("</think>")[-1]
|
|
58
|
+
return content
|
|
@@ -41,25 +41,30 @@ def _ds_token(name: str) -> str:
|
|
|
41
41
|
|
|
42
42
|
|
|
43
43
|
class DeepSeekV3Renderer:
|
|
44
|
-
"""Deterministic message → token renderer for DeepSeek
|
|
45
|
-
|
|
46
|
-
DeepSeek-V3
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
(
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
no-ops here too; stored for protocol uniformity.
|
|
44
|
+
"""Deterministic message → token renderer for DeepSeek-V3 models.
|
|
45
|
+
|
|
46
|
+
DeepSeek-V3 is non-reasoning: its chat template has no ``<think>``
|
|
47
|
+
concept — the generation prompt is a bare ``<|Assistant|>`` and past
|
|
48
|
+
assistant content is emitted verbatim. The reasoning variant
|
|
49
|
+
(``<think>``-prefilled prompt, history reasoning stripped) lives in
|
|
50
|
+
:class:`renderers.deepseek_r1.DeepSeekR1Renderer`, which subclasses
|
|
51
|
+
this one. ``preserve_*`` flags are no-ops here (no reasoning channel),
|
|
52
|
+
stored for protocol uniformity.
|
|
54
53
|
"""
|
|
55
54
|
|
|
55
|
+
#: Default typed config; the R1 subclass overrides this.
|
|
56
|
+
_config_cls: type = DeepSeekV3RendererConfig
|
|
57
|
+
#: Generation-prompt reasoning prefill. Empty for V3 (bare
|
|
58
|
+
#: ``<|Assistant|>``); the R1 subclass overrides to ``"<think>\n"``.
|
|
59
|
+
_GEN_THINK_PREFILL: str = ""
|
|
60
|
+
|
|
56
61
|
def __init__(
|
|
57
62
|
self,
|
|
58
63
|
tokenizer: PreTrainedTokenizer,
|
|
59
64
|
config: DeepSeekV3RendererConfig | None = None,
|
|
60
65
|
):
|
|
61
66
|
self._tokenizer = tokenizer
|
|
62
|
-
self.config = config or DeepSeekV3RendererConfig()
|
|
67
|
+
self.config = config or type(self)._config_cls()
|
|
63
68
|
|
|
64
69
|
# ── BOS / EOS ────────────────────────────────────────────────
|
|
65
70
|
self._bos = self._get_special_token(f"begin{_US}of{_US}sentence")
|
|
@@ -239,8 +244,10 @@ class DeepSeekV3Renderer:
|
|
|
239
244
|
emit_special(
|
|
240
245
|
self._assistant_token, -1, is_sampled=False, is_content=False
|
|
241
246
|
)
|
|
242
|
-
if self.
|
|
243
|
-
emit_text(
|
|
247
|
+
if self._GEN_THINK_PREFILL:
|
|
248
|
+
emit_text(
|
|
249
|
+
self._GEN_THINK_PREFILL, -1, is_sampled=False, is_content=False
|
|
250
|
+
)
|
|
244
251
|
|
|
245
252
|
return RenderedTokens(
|
|
246
253
|
token_ids=tokens,
|
|
@@ -382,8 +389,8 @@ class DeepSeekV3Renderer:
|
|
|
382
389
|
last_role = new_messages[-1].get("role") if new_messages else None
|
|
383
390
|
if last_role != "tool":
|
|
384
391
|
emit_special(self._assistant_token, -1)
|
|
385
|
-
if self.
|
|
386
|
-
emit_text(
|
|
392
|
+
if self._GEN_THINK_PREFILL:
|
|
393
|
+
emit_text(self._GEN_THINK_PREFILL, -1)
|
|
387
394
|
|
|
388
395
|
total_len = len(previous_ids) + len(ext)
|
|
389
396
|
return RenderedTokens(
|
|
@@ -399,6 +406,23 @@ class DeepSeekV3Renderer:
|
|
|
399
406
|
# Assistant rendering
|
|
400
407
|
# ------------------------------------------------------------------
|
|
401
408
|
|
|
409
|
+
def _prepare_assistant_content(self, msg: Message) -> str:
|
|
410
|
+
"""Assistant content as the V3 template would emit it: verbatim.
|
|
411
|
+
|
|
412
|
+
V3 is non-reasoning — its template emits ``message['content']`` as-is
|
|
413
|
+
and never reads ``reasoning_content``. A structured content list is
|
|
414
|
+
flattened to its ``text`` parts. The R1 subclass overrides this to
|
|
415
|
+
strip ``</think>`` from history.
|
|
416
|
+
"""
|
|
417
|
+
content = msg.get("content") or ""
|
|
418
|
+
if isinstance(content, list):
|
|
419
|
+
content = "".join(
|
|
420
|
+
p.get("text", "")
|
|
421
|
+
for p in content
|
|
422
|
+
if isinstance(p, dict) and p.get("type") == "text"
|
|
423
|
+
)
|
|
424
|
+
return content
|
|
425
|
+
|
|
402
426
|
def _render_assistant(
|
|
403
427
|
self,
|
|
404
428
|
msg: Message,
|
|
@@ -414,24 +438,7 @@ class DeepSeekV3Renderer:
|
|
|
414
438
|
# without a new <|Assistant|> token in that case.
|
|
415
439
|
prev_is_tool = msg_idx > 0 and messages[msg_idx - 1]["role"] == "tool"
|
|
416
440
|
|
|
417
|
-
content =
|
|
418
|
-
# Support structured content (ThinkingPart / TextPart list).
|
|
419
|
-
if isinstance(content, list):
|
|
420
|
-
parts_text: list[str] = []
|
|
421
|
-
for p in content:
|
|
422
|
-
if not isinstance(p, dict):
|
|
423
|
-
continue
|
|
424
|
-
if p.get("type") == "thinking":
|
|
425
|
-
thinking = p.get("thinking", "")
|
|
426
|
-
parts_text.append(f"<think>{thinking}</think>")
|
|
427
|
-
elif p.get("type") == "text":
|
|
428
|
-
parts_text.append(p.get("text", ""))
|
|
429
|
-
content = "".join(parts_text)
|
|
430
|
-
# Also accept reasoning_content stored separately (OpenAI-style).
|
|
431
|
-
elif isinstance(msg.get("reasoning_content"), str) and msg["reasoning_content"]:
|
|
432
|
-
reasoning = msg["reasoning_content"]
|
|
433
|
-
content = f"<think>{reasoning}</think>{content}"
|
|
434
|
-
|
|
441
|
+
content = self._prepare_assistant_content(msg)
|
|
435
442
|
tool_calls = msg.get("tool_calls") or []
|
|
436
443
|
|
|
437
444
|
# ``<|Assistant|>`` is template-injected scaffolding — at
|
|
@@ -42,7 +42,7 @@ from renderers.base import (
|
|
|
42
42
|
trim_to_turn_close,
|
|
43
43
|
)
|
|
44
44
|
from renderers.configs import KimiK25RendererConfig
|
|
45
|
-
from renderers.parsing import parse_kimi_k2_section
|
|
45
|
+
from renderers.parsing import _reasoning_end_token_index, parse_kimi_k2_section
|
|
46
46
|
from renderers.qwen3_vl import (
|
|
47
47
|
_image_hash,
|
|
48
48
|
_is_image_part,
|
|
@@ -452,6 +452,13 @@ def _parse_kimi_k2_response(
|
|
|
452
452
|
ids = ids[:i]
|
|
453
453
|
break
|
|
454
454
|
|
|
455
|
+
# Reasoning first: a tool-call section the model drafts *inside* its
|
|
456
|
+
# <think> trace must not be parsed as a real call (regression #78 — cf.
|
|
457
|
+
# parse_qwen3). K2.5 renders </think> as text, so locate the boundary by
|
|
458
|
+
# decoding; the section scan then starts past it. content_ids still begins
|
|
459
|
+
# at 0, so the </think> text-split below recovers reasoning unchanged.
|
|
460
|
+
reasoning_end = _reasoning_end_token_index(tokenizer, ids)
|
|
461
|
+
|
|
455
462
|
# Token-ID path — produces spans. Only run if every relevant special
|
|
456
463
|
# token resolved at init (i.e. is in the tokenizer's vocab).
|
|
457
464
|
tool_calls: list[ParsedToolCall] = []
|
|
@@ -471,6 +478,7 @@ def _parse_kimi_k2_response(
|
|
|
471
478
|
tool_call_begin_id=tool_call_begin_id,
|
|
472
479
|
tool_call_argument_begin_id=tool_call_argument_begin_id,
|
|
473
480
|
tool_call_end_id=tool_call_end_id,
|
|
481
|
+
scan_start=reasoning_end,
|
|
474
482
|
)
|
|
475
483
|
text = (
|
|
476
484
|
tokenizer.decode(content_ids, skip_special_tokens=False)
|
|
@@ -481,9 +489,13 @@ def _parse_kimi_k2_response(
|
|
|
481
489
|
text = tokenizer.decode(ids, skip_special_tokens=False) if ids else ""
|
|
482
490
|
|
|
483
491
|
# Fallback path: model emitted literal-text section delimiters (singular
|
|
484
|
-
# variant) rather than special tokens. Spans unavailable here.
|
|
492
|
+
# variant) rather than special tokens. Spans unavailable here. Start the
|
|
493
|
+
# search past the first </think> so a literal section drafted inside the
|
|
494
|
+
# reasoning trace isn't matched as a real call (regression #78).
|
|
485
495
|
if not tool_calls:
|
|
486
|
-
|
|
496
|
+
think_close = text.find("</think>")
|
|
497
|
+
search_from = think_close + len("</think>") if think_close != -1 else 0
|
|
498
|
+
tc_match = _TOOL_CALLS_SECTION_RE.search(text, search_from)
|
|
487
499
|
if tc_match:
|
|
488
500
|
text = text[: tc_match.start()]
|
|
489
501
|
tool_section = (
|
|
@@ -133,6 +133,39 @@ def _decode(tokenizer, ids: list[int]) -> str:
|
|
|
133
133
|
return tokenizer.decode(ids, skip_special_tokens=False)
|
|
134
134
|
|
|
135
135
|
|
|
136
|
+
def _reasoning_end_token_index(
|
|
137
|
+
tokenizer, ids: list[int], marker: str = "</think>"
|
|
138
|
+
) -> int:
|
|
139
|
+
"""Token index immediately past the first ``</think>`` in ``ids``.
|
|
140
|
+
|
|
141
|
+
Returns 0 when ``ids`` has no closed reasoning region — callers treat
|
|
142
|
+
that as "scan from the start" (preserves pre-existing behavior for
|
|
143
|
+
non-thinking / truncated-reasoning completions).
|
|
144
|
+
|
|
145
|
+
Used by parsers whose ``</think>`` is *not* a single special token
|
|
146
|
+
(DeepSeek-V3, Kimi-K2.5) — where it tokenizes to several pieces and is
|
|
147
|
+
context-sensitive (the closing ``>`` merges differently depending on the
|
|
148
|
+
next char), so a token-id or fixed-subsequence search isn't reliable. We
|
|
149
|
+
instead locate the boundary in decoded text via binary search over prefix
|
|
150
|
+
decodes, which holds as long as ``decode(ids[:k])`` is prefix-stable in
|
|
151
|
+
``k`` (true for the byte-level BPE tokenizers here; ``</think>`` is clean
|
|
152
|
+
ASCII that won't straddle a byte boundary). Single-token ``</think>``
|
|
153
|
+
parsers (Qwen3) anchor on the token id directly and don't need this.
|
|
154
|
+
"""
|
|
155
|
+
if not ids or marker not in _decode(tokenizer, ids):
|
|
156
|
+
return 0
|
|
157
|
+
# Smallest prefix length (in tokens) whose decode already contains the
|
|
158
|
+
# full marker — i.e. the index just past where </think> completes.
|
|
159
|
+
lo, hi = 1, len(ids)
|
|
160
|
+
while lo < hi:
|
|
161
|
+
mid = (lo + hi) // 2
|
|
162
|
+
if marker in _decode(tokenizer, ids[:mid]):
|
|
163
|
+
hi = mid
|
|
164
|
+
else:
|
|
165
|
+
lo = mid + 1
|
|
166
|
+
return lo
|
|
167
|
+
|
|
168
|
+
|
|
136
169
|
# ── Qwen3: <tool_call> JSON </tool_call> ────────────────────────────
|
|
137
170
|
|
|
138
171
|
|
|
@@ -143,11 +176,26 @@ def parse_qwen3(
|
|
|
143
176
|
stop_ids: set[int],
|
|
144
177
|
tool_call_id: int,
|
|
145
178
|
tool_call_end_id: int,
|
|
179
|
+
reasoning_end_id: int | None = None,
|
|
146
180
|
) -> ParsedResponse:
|
|
147
181
|
"""Parse Qwen3 completion tokens. Hermes-style JSON tool calls."""
|
|
148
182
|
ids = _strip_stop_tokens(token_ids, stop_ids)
|
|
149
183
|
|
|
150
|
-
|
|
184
|
+
# Reasoning is resolved before tool calls. Thinking models (e.g.
|
|
185
|
+
# Qwen3-*-Thinking) routinely draft ``<tool_call>`` blocks *inside* their
|
|
186
|
+
# ``<think>...</think>`` trace while planning; those are reasoning, not
|
|
187
|
+
# real invocations. Anchoring the tool-call scan after the ``</think>``
|
|
188
|
+
# boundary keeps in-think drafts out of ``tool_calls`` (otherwise they
|
|
189
|
+
# surface as phantom/duplicate calls) and out of the reasoning/content
|
|
190
|
+
# split. Mirrors vLLM's DelegatingParser, which runs the reasoning parser
|
|
191
|
+
# first and tool-parses only the post-``</think>`` content.
|
|
192
|
+
# ``reasoning_end_id`` is the ``</think>`` token id; when it's absent
|
|
193
|
+
# (``None``) or the model never closed its reasoning, the scan falls back
|
|
194
|
+
# to the whole stream (prior behavior).
|
|
195
|
+
reasoning_end = _find(ids, reasoning_end_id) if reasoning_end_id is not None else -1
|
|
196
|
+
scan_start = reasoning_end + 1 if reasoning_end != -1 else 0
|
|
197
|
+
|
|
198
|
+
tc_start = _find(ids, tool_call_id, scan_start)
|
|
151
199
|
tool_calls: list[ParsedToolCall] = []
|
|
152
200
|
if tc_start != -1:
|
|
153
201
|
content_ids = ids[:tc_start]
|
|
@@ -685,7 +733,15 @@ def parse_deepseek_v3(
|
|
|
685
733
|
"""
|
|
686
734
|
ids = _strip_stop_tokens(token_ids, stop_ids)
|
|
687
735
|
|
|
688
|
-
|
|
736
|
+
# Reasoning first: skip past </think> before looking for the tool-call
|
|
737
|
+
# section, so a section the model drafts *inside* its <think> trace isn't
|
|
738
|
+
# parsed as a real call (regression #78 — cf. parse_qwen3). content_ids
|
|
739
|
+
# still starts at 0, so the </think> text-split below recovers reasoning.
|
|
740
|
+
# DeepSeek-V3 renders </think> as multi-token text, hence the decode-based
|
|
741
|
+
# boundary finder rather than a token-id anchor.
|
|
742
|
+
reasoning_end = _reasoning_end_token_index(tokenizer, ids)
|
|
743
|
+
|
|
744
|
+
tc_section_start = _find(ids, tool_calls_begin_id, reasoning_end)
|
|
689
745
|
tool_calls: list[ParsedToolCall] = []
|
|
690
746
|
if tc_section_start != -1:
|
|
691
747
|
content_ids = ids[:tc_section_start]
|
|
@@ -962,6 +1018,7 @@ def parse_kimi_k2_section(
|
|
|
962
1018
|
tool_call_begin_id: int,
|
|
963
1019
|
tool_call_argument_begin_id: int,
|
|
964
1020
|
tool_call_end_id: int,
|
|
1021
|
+
scan_start: int = 0,
|
|
965
1022
|
) -> tuple[list[int], list[ParsedToolCall]]:
|
|
966
1023
|
"""Split ``ids`` into ``(content_before_section, tool_calls)`` by finding
|
|
967
1024
|
the Kimi-style tool-call section delimiters.
|
|
@@ -973,8 +1030,15 @@ def parse_kimi_k2_section(
|
|
|
973
1030
|
of the section and a list of ``ParsedToolCall`` covering every attempted
|
|
974
1031
|
block inside it; an unclosed section is still walked to whatever the model
|
|
975
1032
|
emitted before EOS. Returns ``(ids, [])`` when no section is present.
|
|
1033
|
+
|
|
1034
|
+
``scan_start`` restricts the section search to ``ids[scan_start:]`` while
|
|
1035
|
+
keeping ``content_ids = ids[:section_start]`` and all token spans relative
|
|
1036
|
+
to the full ``ids``. Callers pass the post-``</think>`` index so a section
|
|
1037
|
+
the model drafts inside its reasoning trace isn't parsed as a real call;
|
|
1038
|
+
because ``content_ids`` still starts at 0, downstream text-based reasoning
|
|
1039
|
+
extraction is unaffected (regression #78).
|
|
976
1040
|
"""
|
|
977
|
-
section_start = _find_any(ids, tool_calls_section_begin_ids)
|
|
1041
|
+
section_start = _find_any(ids, tool_calls_section_begin_ids, scan_start)
|
|
978
1042
|
if section_start == -1:
|
|
979
1043
|
return list(ids), []
|
|
980
1044
|
content_ids = ids[:section_start]
|
|
@@ -62,6 +62,7 @@ class Qwen3Renderer:
|
|
|
62
62
|
self._tool_call_end = self._token_id("</tool_call>")
|
|
63
63
|
self._tool_response = self._token_id("<tool_response>")
|
|
64
64
|
self._tool_response_end = self._token_id("</tool_response>")
|
|
65
|
+
self._think_end = self._token_id("</think>")
|
|
65
66
|
|
|
66
67
|
def _token_id(self, token: str) -> int:
|
|
67
68
|
tid = self._tokenizer.convert_tokens_to_ids(token)
|
|
@@ -276,6 +277,7 @@ class Qwen3Renderer:
|
|
|
276
277
|
stop_ids={self._im_end, self._endoftext},
|
|
277
278
|
tool_call_id=self._tool_call,
|
|
278
279
|
tool_call_end_id=self._tool_call_end,
|
|
280
|
+
reasoning_end_id=self._think_end,
|
|
279
281
|
)
|
|
280
282
|
|
|
281
283
|
def get_stop_token_ids(self) -> list[int]:
|
|
@@ -325,6 +325,7 @@ class Qwen3VLRenderer:
|
|
|
325
325
|
self._tool_call_end = self._token_id("</tool_call>")
|
|
326
326
|
self._tool_response = self._token_id("<tool_response>")
|
|
327
327
|
self._tool_response_end = self._token_id("</tool_response>")
|
|
328
|
+
self._think_end = self._token_id("</think>")
|
|
328
329
|
self._vision_start = self._token_id("<|vision_start|>")
|
|
329
330
|
self._vision_end = self._token_id("<|vision_end|>")
|
|
330
331
|
self._image_pad = self._token_id("<|image_pad|>")
|
|
@@ -634,6 +635,7 @@ class Qwen3VLRenderer:
|
|
|
634
635
|
stop_ids={self._im_end, self._endoftext},
|
|
635
636
|
tool_call_id=self._tool_call,
|
|
636
637
|
tool_call_end_id=self._tool_call_end,
|
|
638
|
+
reasoning_end_id=self._think_end,
|
|
637
639
|
)
|
|
638
640
|
|
|
639
641
|
def get_stop_token_ids(self) -> list[int]:
|
|
@@ -36,6 +36,16 @@ RENDERER_MODELS = [
|
|
|
36
36
|
# Ultra resolves the Ultra template variant via name (auto → ultra=True).
|
|
37
37
|
("nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-BF16", "auto"),
|
|
38
38
|
("poolside/Laguna-XS.2", "auto"),
|
|
39
|
+
# DeepSeek-V3/R1 are intentionally NOT in this shared barrage: their
|
|
40
|
+
# chat templates can't render the barrage's tool-call fixtures (the
|
|
41
|
+
# templates require ``tool['type']`` and a string-serialized
|
|
42
|
+
# ``arguments``, and V3 only renders tool_calls when content is None —
|
|
43
|
+
# so ``apply_chat_template`` raises or drops the calls on the shared
|
|
44
|
+
# shapes), and the is_content body-recovery checks hit a Metaspace
|
|
45
|
+
# subset-decode artifact. The renderer is correct in all these cases;
|
|
46
|
+
# there's just no byte-output to parity-check against. Split-specific
|
|
47
|
+
# parity (V3 bare prompt vs R1 <think>+history-strip) is covered in
|
|
48
|
+
# tests/test_deepseek_r1.py.
|
|
39
49
|
# Llama-3 loads via the unrestricted unsloth mirror (byte-identical
|
|
40
50
|
# chat template) so CI needs no Meta-gated HF token. Pinned to the
|
|
41
51
|
# explicit "llama-3" config because the mirror name isn't in
|