renderers 0.1.8.dev41__tar.gz → 0.1.8.dev43__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70) hide show
  1. {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/PKG-INFO +1 -1
  2. {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/renderers/__init__.py +4 -0
  3. {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/renderers/_version.py +2 -2
  4. {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/renderers/base.py +8 -1
  5. {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/renderers/configs.py +21 -12
  6. renderers-0.1.8.dev43/renderers/deepseek_r1.py +58 -0
  7. {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/renderers/deepseek_v3.py +40 -33
  8. {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/renderers/kimi_k25.py +15 -3
  9. {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/renderers/parsing.py +67 -3
  10. {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/renderers/qwen3.py +2 -0
  11. {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/renderers/qwen3_vl.py +2 -0
  12. {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/tests/conftest.py +10 -0
  13. renderers-0.1.8.dev43/tests/test_deepseek_r1.py +152 -0
  14. {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/tests/test_load_tokenizer_fastokens.py +2 -0
  15. renderers-0.1.8.dev43/tests/test_parse_response.py +276 -0
  16. {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/tests/test_renderer_config_parity.py +1 -0
  17. renderers-0.1.8.dev41/tests/test_parse_response.py +0 -137
  18. {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/.github/workflows/publish-dev.yml +0 -0
  19. {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/.github/workflows/publish.yml +0 -0
  20. {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/.github/workflows/style.yml +0 -0
  21. {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/.github/workflows/test.yml +0 -0
  22. {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/.gitignore +0 -0
  23. {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/.pre-commit-config.yaml +0 -0
  24. {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/LICENSE +0 -0
  25. {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/README.md +0 -0
  26. {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/docs/renderer-config.md +0 -0
  27. {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/examples/README.md +0 -0
  28. {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/examples/sglang/multiturn_generate_sglang.py +0 -0
  29. {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/examples/sglang/online_multiturn_sglang.py +0 -0
  30. {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/examples/tinker/multiturn_generate_tinker.py +0 -0
  31. {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/examples/transformers/multiturn_generate_transformers.py +0 -0
  32. {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/examples/vllm/multiturn_generate_vllm.py +0 -0
  33. {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/pyproject.toml +0 -0
  34. {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/renderers/client.py +0 -0
  35. {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/renderers/default.py +0 -0
  36. {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/renderers/glm45.py +0 -0
  37. {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/renderers/glm5.py +0 -0
  38. {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/renderers/gpt_oss.py +0 -0
  39. {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/renderers/kimi_k2.py +0 -0
  40. {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/renderers/laguna_xs2.py +0 -0
  41. {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/renderers/llama_3.py +0 -0
  42. {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/renderers/minimax_m2.py +0 -0
  43. {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/renderers/nemotron3.py +0 -0
  44. {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/renderers/parsers.py +0 -0
  45. {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/renderers/qwen35.py +0 -0
  46. {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/renderers/qwen36.py +0 -0
  47. {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/tests/test_bridge.py +0 -0
  48. {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/tests/test_build_helpers.py +0 -0
  49. {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/tests/test_client.py +0 -0
  50. {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/tests/test_gpt_oss_harmony_parity.py +0 -0
  51. {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/tests/test_incremental.py +0 -0
  52. {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/tests/test_is_content.py +0 -0
  53. {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/tests/test_kimi_k25_tool_schema.py +0 -0
  54. {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/tests/test_llama_3.py +0 -0
  55. {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/tests/test_load_tokenizer.py +0 -0
  56. {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/tests/test_message_indices.py +0 -0
  57. {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/tests/test_message_tool_names.py +0 -0
  58. {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/tests/test_multimodal.py +0 -0
  59. {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/tests/test_nemotron3_ultra.py +0 -0
  60. {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/tests/test_parse_response_robustness.py +0 -0
  61. {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/tests/test_parsers.py +0 -0
  62. {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/tests/test_preserve_thinking.py +0 -0
  63. {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/tests/test_qwen35_size_coverage.py +0 -0
  64. {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/tests/test_render_ids.py +0 -0
  65. {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/tests/test_renderer_config.py +0 -0
  66. {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/tests/test_roundtrip.py +0 -0
  67. {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/tests/test_sampled_mask.py +0 -0
  68. {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/tests/test_tokens_per_message.py +0 -0
  69. {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/tests/test_tool_arg_type_preservation.py +0 -0
  70. {renderers-0.1.8.dev41 → renderers-0.1.8.dev43}/uv.lock +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: renderers
3
- Version: 0.1.8.dev41
3
+ Version: 0.1.8.dev43
4
4
  Summary: Chat template renderers — deterministic message-to-token conversion for LLM training
5
5
  License-Expression: Apache-2.0
6
6
  License-File: LICENSE
@@ -44,6 +44,7 @@ from renderers.configs import (
44
44
  BaseRendererConfig,
45
45
  config_from_name,
46
46
  DefaultRendererConfig,
47
+ DeepSeekR1RendererConfig,
47
48
  DeepSeekV3RendererConfig,
48
49
  GLM45RendererConfig,
49
50
  GLM51RendererConfig,
@@ -74,6 +75,7 @@ from renderers.configs import (
74
75
  # imports — ``renderers.base._populate_registry`` lazy-imports the
75
76
  # concrete classes itself when a renderer is instantiated.
76
77
  _LAZY_RENDERERS: dict[str, str] = {
78
+ "DeepSeekR1Renderer": "renderers.deepseek_r1",
77
79
  "DeepSeekV3Renderer": "renderers.deepseek_v3",
78
80
  "DefaultRenderer": "renderers.default",
79
81
  "GLM45Renderer": "renderers.glm45",
@@ -113,6 +115,8 @@ __all__ = [
113
115
  "BaseRendererConfig",
114
116
  "Content",
115
117
  "ContentPart",
118
+ "DeepSeekR1Renderer",
119
+ "DeepSeekR1RendererConfig",
116
120
  "DeepSeekV3Renderer",
117
121
  "DeepSeekV3RendererConfig",
118
122
  "DefaultRenderer",
@@ -18,7 +18,7 @@ version_tuple: tuple[int | str, ...]
18
18
  commit_id: str | None
19
19
  __commit_id__: str | None
20
20
 
21
- __version__ = version = '0.1.8.dev41'
22
- __version_tuple__ = version_tuple = (0, 1, 8, 'dev41')
21
+ __version__ = version = '0.1.8.dev43'
22
+ __version_tuple__ = version_tuple = (0, 1, 8, 'dev43')
23
23
 
24
24
  __commit_id__ = commit_id = None
@@ -1030,9 +1030,12 @@ MODEL_RENDERER_MAP: dict[str, str] = {
1030
1030
  # MiniMax.
1031
1031
  "MiniMaxAI/MiniMax-M2": "minimax-m2",
1032
1032
  "MiniMaxAI/MiniMax-M2.5": "minimax-m2",
1033
- # DeepSeek V3.
1033
+ # DeepSeek V3 (non-reasoning).
1034
1034
  "deepseek-ai/DeepSeek-V3": "deepseek-v3",
1035
1035
  "deepseek-ai/DeepSeek-V3-Base": "deepseek-v3",
1036
+ # DeepSeek R1 (reasoning).
1037
+ "deepseek-ai/DeepSeek-R1": "deepseek-r1",
1038
+ "deepseek-ai/DeepSeek-R1-0528": "deepseek-r1",
1036
1039
  # Kimi K2 (K2.5 and K2.6 share the K2.5 template, distinct from K2).
1037
1040
  "moonshotai/Kimi-K2-Instruct": "kimi-k2",
1038
1041
  "moonshotai/Kimi-K2.5": "kimi-k2.5",
@@ -1161,6 +1164,8 @@ FASTOKENS_INCOMPATIBLE: frozenset[str] = frozenset(
1161
1164
  # doesn't yet implement.
1162
1165
  "deepseek-ai/DeepSeek-V3",
1163
1166
  "deepseek-ai/DeepSeek-V3-Base",
1167
+ "deepseek-ai/DeepSeek-R1",
1168
+ "deepseek-ai/DeepSeek-R1-0528",
1164
1169
  }
1165
1170
  )
1166
1171
 
@@ -1334,6 +1339,7 @@ def load_tokenizer(
1334
1339
  def _populate_registry():
1335
1340
  if RENDERER_REGISTRY:
1336
1341
  return
1342
+ from renderers.deepseek_r1 import DeepSeekR1Renderer
1337
1343
  from renderers.deepseek_v3 import DeepSeekV3Renderer
1338
1344
  from renderers.default import DefaultRenderer
1339
1345
  from renderers.glm5 import GLM5Renderer, GLM51Renderer
@@ -1362,6 +1368,7 @@ def _populate_registry():
1362
1368
  "glm-4.5": GLM45Renderer,
1363
1369
  "minimax-m2": MiniMaxM2Renderer,
1364
1370
  "deepseek-v3": DeepSeekV3Renderer,
1371
+ "deepseek-r1": DeepSeekR1Renderer,
1365
1372
  "kimi-k2": KimiK2Renderer,
1366
1373
  "kimi-k2.5": KimiK25Renderer,
1367
1374
  "laguna-xs.2": LagunaXS2Renderer,
@@ -400,24 +400,30 @@ class Nemotron3RendererConfig(BaseRendererConfig):
400
400
 
401
401
 
402
402
  class DeepSeekV3RendererConfig(BaseRendererConfig):
403
- """DeepSeek V3 renderer config.
403
+ """DeepSeek-V3 renderer config (non-reasoning).
404
404
 
405
- ``enable_thinking`` is renderer-internal here — DeepSeek-V3's chat
406
- template does not reference any thinking variable, so passing it to
407
- ``apply_chat_template`` upstream is a no-op. The renderer uses it
408
- to control the ``<think>`` prefill at the generation prompt (R1
409
- distill convention).
405
+ DeepSeek-V3 has no thinking concept: the generation prompt is a bare
406
+ ``<|Assistant|>`` and assistant content is emitted verbatim. For the
407
+ reasoning variant use :class:`DeepSeekR1RendererConfig`.
410
408
  """
411
409
 
412
410
  name: Literal["deepseek-v3"] = "deepseek-v3"
413
411
 
414
- enable_thinking: bool = True
415
- """Renderer convention for the R1-distill family: when ``True``,
416
- prefill ``<think>`` at the generation prompt. The DeepSeek-V3 Jinja
417
- template ignores this kwarg upstream; it's not a chat-template
418
- kwarg in the strict sense."""
419
412
 
420
- _internal_fields = frozenset({"enable_thinking"})
413
+ class DeepSeekR1RendererConfig(BaseRendererConfig):
414
+ """DeepSeek-R1 renderer config (reasoning).
415
+
416
+ R1 always reasons — its chat template unconditionally prefills
417
+ ``<think>\\n`` at the generation prompt and strips ``</think>`` from
418
+ historical assistant turns. There is therefore no ``enable_thinking``
419
+ knob (thinking is not optional), and ``preserve_*`` flags are no-ops
420
+ (history reasoning is always dropped); both stored for protocol
421
+ uniformity. Applies to full ``deepseek-ai/DeepSeek-R1`` / ``-R1-0528``
422
+ — NOT the R1-Distill-Qwen/Llama models, which use those base
423
+ tokenizers and route to the Qwen3 / Llama-3 renderers.
424
+ """
425
+
426
+ name: Literal["deepseek-r1"] = "deepseek-r1"
421
427
 
422
428
 
423
429
  RendererConfig = Annotated[
@@ -439,6 +445,7 @@ RendererConfig = Annotated[
439
445
  MiniMaxM2RendererConfig,
440
446
  Nemotron3RendererConfig,
441
447
  DeepSeekV3RendererConfig,
448
+ DeepSeekR1RendererConfig,
442
449
  ],
443
450
  Field(discriminator="name"),
444
451
  ]
@@ -474,6 +481,7 @@ _CONFIG_BY_NAME: dict[str, type[BaseRendererConfig]] = {
474
481
  "minimax-m2": MiniMaxM2RendererConfig,
475
482
  "nemotron-3": Nemotron3RendererConfig,
476
483
  "deepseek-v3": DeepSeekV3RendererConfig,
484
+ "deepseek-r1": DeepSeekR1RendererConfig,
477
485
  }
478
486
 
479
487
 
@@ -505,6 +513,7 @@ __all__ = [
505
513
  "AutoRendererConfig",
506
514
  "BaseRendererConfig",
507
515
  "DefaultRendererConfig",
516
+ "DeepSeekR1RendererConfig",
508
517
  "DeepSeekV3RendererConfig",
509
518
  "GLM45RendererConfig",
510
519
  "GLM51RendererConfig",
@@ -0,0 +1,58 @@
1
+ """DeepSeek-R1 Renderer — the reasoning variant of the DeepSeek format.
2
+
3
+ R1 shares DeepSeek-V3's special tokens, message structure, and tool-call
4
+ wire format, so it subclasses :class:`renderers.deepseek_v3.DeepSeekV3Renderer`
5
+ and overrides only the two places its chat template diverges:
6
+
7
+ 1. Generation prompt — R1 unconditionally prefills ``<think>\\n``
8
+ (``<|Assistant|><think>\\n``) to trigger reasoning, where V3 emits a bare
9
+ ``<|Assistant|>``. Handled by ``_GEN_THINK_PREFILL``.
10
+ 2. Historical assistant turns — R1 strips the reasoning trace, keeping only
11
+ the text after ``</think>`` (``content.split('</think>')[-1]``), where V3
12
+ emits content verbatim. Handled by ``_prepare_assistant_content``.
13
+
14
+ Everything else — system handling, tool-call / tool-output rendering,
15
+ special-token resolution, and ``parse_response`` (``parse_deepseek_v3``,
16
+ shared) — is inherited unchanged.
17
+
18
+ Scope: full ``deepseek-ai/DeepSeek-R1`` and ``-R1-0528``. The R1-Distill
19
+ models (``DeepSeek-R1-Distill-Qwen/Llama``) use their base models'
20
+ tokenizers and route to the Qwen3 / Llama-3 renderers, not this one.
21
+ """
22
+
23
+ from __future__ import annotations
24
+
25
+ from renderers.base import Message
26
+ from renderers.configs import DeepSeekR1RendererConfig
27
+ from renderers.deepseek_v3 import DeepSeekV3Renderer
28
+
29
+
30
+ class DeepSeekR1Renderer(DeepSeekV3Renderer):
31
+ """Deterministic message → token renderer for DeepSeek-R1 models."""
32
+
33
+ _config_cls: type = DeepSeekR1RendererConfig
34
+ _GEN_THINK_PREFILL: str = "<think>\n"
35
+
36
+ def _prepare_assistant_content(self, msg: Message) -> str:
37
+ """Assistant content with the reasoning trace stripped, mirroring the
38
+ R1 template's ``content.split('</think>')[-1]`` on historical turns.
39
+
40
+ Structured ``thinking``/``text`` parts are reconstructed inline first
41
+ so the same ``</think>`` split applies. The separate
42
+ ``reasoning_content`` field is ignored — the R1 chat template never
43
+ reads it, and history reasoning is dropped regardless.
44
+ """
45
+ content = msg.get("content") or ""
46
+ if isinstance(content, list):
47
+ parts: list[str] = []
48
+ for p in content:
49
+ if not isinstance(p, dict):
50
+ continue
51
+ if p.get("type") == "thinking":
52
+ parts.append(f"<think>{p.get('thinking', '')}</think>")
53
+ elif p.get("type") == "text":
54
+ parts.append(p.get("text", ""))
55
+ content = "".join(parts)
56
+ if "</think>" in content:
57
+ content = content.split("</think>")[-1]
58
+ return content
@@ -41,25 +41,30 @@ def _ds_token(name: str) -> str:
41
41
 
42
42
 
43
43
  class DeepSeekV3Renderer:
44
- """Deterministic message → token renderer for DeepSeek V3 models.
45
-
46
- DeepSeek-V3's chat template does not consult any thinking-related
47
- variable; the ``enable_thinking`` field on the typed config controls
48
- the renderer's ``<think>\\n`` prefill at the generation prompt
49
- (R1-distill convention) and is intentionally not forwarded to
50
- ``apply_chat_template`` upstream — that would be a no-op. The
51
- template also always emits ``<think>{reasoning}</think>`` when
52
- ``reasoning_content`` is provided, so ``preserve_*`` flags are
53
- no-ops here too; stored for protocol uniformity.
44
+ """Deterministic message → token renderer for DeepSeek-V3 models.
45
+
46
+ DeepSeek-V3 is non-reasoning: its chat template has no ``<think>``
47
+ concept — the generation prompt is a bare ``<|Assistant|>`` and past
48
+ assistant content is emitted verbatim. The reasoning variant
49
+ (``<think>``-prefilled prompt, history reasoning stripped) lives in
50
+ :class:`renderers.deepseek_r1.DeepSeekR1Renderer`, which subclasses
51
+ this one. ``preserve_*`` flags are no-ops here (no reasoning channel),
52
+ stored for protocol uniformity.
54
53
  """
55
54
 
55
+ #: Default typed config; the R1 subclass overrides this.
56
+ _config_cls: type = DeepSeekV3RendererConfig
57
+ #: Generation-prompt reasoning prefill. Empty for V3 (bare
58
+ #: ``<|Assistant|>``); the R1 subclass overrides to ``"<think>\n"``.
59
+ _GEN_THINK_PREFILL: str = ""
60
+
56
61
  def __init__(
57
62
  self,
58
63
  tokenizer: PreTrainedTokenizer,
59
64
  config: DeepSeekV3RendererConfig | None = None,
60
65
  ):
61
66
  self._tokenizer = tokenizer
62
- self.config = config or DeepSeekV3RendererConfig()
67
+ self.config = config or type(self)._config_cls()
63
68
 
64
69
  # ── BOS / EOS ────────────────────────────────────────────────
65
70
  self._bos = self._get_special_token(f"begin{_US}of{_US}sentence")
@@ -239,8 +244,10 @@ class DeepSeekV3Renderer:
239
244
  emit_special(
240
245
  self._assistant_token, -1, is_sampled=False, is_content=False
241
246
  )
242
- if self.config.enable_thinking:
243
- emit_text("<think>\n", -1, is_sampled=False, is_content=False)
247
+ if self._GEN_THINK_PREFILL:
248
+ emit_text(
249
+ self._GEN_THINK_PREFILL, -1, is_sampled=False, is_content=False
250
+ )
244
251
 
245
252
  return RenderedTokens(
246
253
  token_ids=tokens,
@@ -382,8 +389,8 @@ class DeepSeekV3Renderer:
382
389
  last_role = new_messages[-1].get("role") if new_messages else None
383
390
  if last_role != "tool":
384
391
  emit_special(self._assistant_token, -1)
385
- if self.config.enable_thinking:
386
- emit_text("<think>\n", -1)
392
+ if self._GEN_THINK_PREFILL:
393
+ emit_text(self._GEN_THINK_PREFILL, -1)
387
394
 
388
395
  total_len = len(previous_ids) + len(ext)
389
396
  return RenderedTokens(
@@ -399,6 +406,23 @@ class DeepSeekV3Renderer:
399
406
  # Assistant rendering
400
407
  # ------------------------------------------------------------------
401
408
 
409
+ def _prepare_assistant_content(self, msg: Message) -> str:
410
+ """Assistant content as the V3 template would emit it: verbatim.
411
+
412
+ V3 is non-reasoning — its template emits ``message['content']`` as-is
413
+ and never reads ``reasoning_content``. A structured content list is
414
+ flattened to its ``text`` parts. The R1 subclass overrides this to
415
+ strip ``</think>`` from history.
416
+ """
417
+ content = msg.get("content") or ""
418
+ if isinstance(content, list):
419
+ content = "".join(
420
+ p.get("text", "")
421
+ for p in content
422
+ if isinstance(p, dict) and p.get("type") == "text"
423
+ )
424
+ return content
425
+
402
426
  def _render_assistant(
403
427
  self,
404
428
  msg: Message,
@@ -414,24 +438,7 @@ class DeepSeekV3Renderer:
414
438
  # without a new <|Assistant|> token in that case.
415
439
  prev_is_tool = msg_idx > 0 and messages[msg_idx - 1]["role"] == "tool"
416
440
 
417
- content = msg.get("content") or ""
418
- # Support structured content (ThinkingPart / TextPart list).
419
- if isinstance(content, list):
420
- parts_text: list[str] = []
421
- for p in content:
422
- if not isinstance(p, dict):
423
- continue
424
- if p.get("type") == "thinking":
425
- thinking = p.get("thinking", "")
426
- parts_text.append(f"<think>{thinking}</think>")
427
- elif p.get("type") == "text":
428
- parts_text.append(p.get("text", ""))
429
- content = "".join(parts_text)
430
- # Also accept reasoning_content stored separately (OpenAI-style).
431
- elif isinstance(msg.get("reasoning_content"), str) and msg["reasoning_content"]:
432
- reasoning = msg["reasoning_content"]
433
- content = f"<think>{reasoning}</think>{content}"
434
-
441
+ content = self._prepare_assistant_content(msg)
435
442
  tool_calls = msg.get("tool_calls") or []
436
443
 
437
444
  # ``<|Assistant|>`` is template-injected scaffolding — at
@@ -42,7 +42,7 @@ from renderers.base import (
42
42
  trim_to_turn_close,
43
43
  )
44
44
  from renderers.configs import KimiK25RendererConfig
45
- from renderers.parsing import parse_kimi_k2_section
45
+ from renderers.parsing import _reasoning_end_token_index, parse_kimi_k2_section
46
46
  from renderers.qwen3_vl import (
47
47
  _image_hash,
48
48
  _is_image_part,
@@ -452,6 +452,13 @@ def _parse_kimi_k2_response(
452
452
  ids = ids[:i]
453
453
  break
454
454
 
455
+ # Reasoning first: a tool-call section the model drafts *inside* its
456
+ # <think> trace must not be parsed as a real call (regression #78 — cf.
457
+ # parse_qwen3). K2.5 renders </think> as text, so locate the boundary by
458
+ # decoding; the section scan then starts past it. content_ids still begins
459
+ # at 0, so the </think> text-split below recovers reasoning unchanged.
460
+ reasoning_end = _reasoning_end_token_index(tokenizer, ids)
461
+
455
462
  # Token-ID path — produces spans. Only run if every relevant special
456
463
  # token resolved at init (i.e. is in the tokenizer's vocab).
457
464
  tool_calls: list[ParsedToolCall] = []
@@ -471,6 +478,7 @@ def _parse_kimi_k2_response(
471
478
  tool_call_begin_id=tool_call_begin_id,
472
479
  tool_call_argument_begin_id=tool_call_argument_begin_id,
473
480
  tool_call_end_id=tool_call_end_id,
481
+ scan_start=reasoning_end,
474
482
  )
475
483
  text = (
476
484
  tokenizer.decode(content_ids, skip_special_tokens=False)
@@ -481,9 +489,13 @@ def _parse_kimi_k2_response(
481
489
  text = tokenizer.decode(ids, skip_special_tokens=False) if ids else ""
482
490
 
483
491
  # Fallback path: model emitted literal-text section delimiters (singular
484
- # variant) rather than special tokens. Spans unavailable here.
492
+ # variant) rather than special tokens. Spans unavailable here. Start the
493
+ # search past the first </think> so a literal section drafted inside the
494
+ # reasoning trace isn't matched as a real call (regression #78).
485
495
  if not tool_calls:
486
- tc_match = _TOOL_CALLS_SECTION_RE.search(text)
496
+ think_close = text.find("</think>")
497
+ search_from = think_close + len("</think>") if think_close != -1 else 0
498
+ tc_match = _TOOL_CALLS_SECTION_RE.search(text, search_from)
487
499
  if tc_match:
488
500
  text = text[: tc_match.start()]
489
501
  tool_section = (
@@ -133,6 +133,39 @@ def _decode(tokenizer, ids: list[int]) -> str:
133
133
  return tokenizer.decode(ids, skip_special_tokens=False)
134
134
 
135
135
 
136
+ def _reasoning_end_token_index(
137
+ tokenizer, ids: list[int], marker: str = "</think>"
138
+ ) -> int:
139
+ """Token index immediately past the first ``</think>`` in ``ids``.
140
+
141
+ Returns 0 when ``ids`` has no closed reasoning region — callers treat
142
+ that as "scan from the start" (preserves pre-existing behavior for
143
+ non-thinking / truncated-reasoning completions).
144
+
145
+ Used by parsers whose ``</think>`` is *not* a single special token
146
+ (DeepSeek-V3, Kimi-K2.5) — where it tokenizes to several pieces and is
147
+ context-sensitive (the closing ``>`` merges differently depending on the
148
+ next char), so a token-id or fixed-subsequence search isn't reliable. We
149
+ instead locate the boundary in decoded text via binary search over prefix
150
+ decodes, which holds as long as ``decode(ids[:k])`` is prefix-stable in
151
+ ``k`` (true for the byte-level BPE tokenizers here; ``</think>`` is clean
152
+ ASCII that won't straddle a byte boundary). Single-token ``</think>``
153
+ parsers (Qwen3) anchor on the token id directly and don't need this.
154
+ """
155
+ if not ids or marker not in _decode(tokenizer, ids):
156
+ return 0
157
+ # Smallest prefix length (in tokens) whose decode already contains the
158
+ # full marker — i.e. the index just past where </think> completes.
159
+ lo, hi = 1, len(ids)
160
+ while lo < hi:
161
+ mid = (lo + hi) // 2
162
+ if marker in _decode(tokenizer, ids[:mid]):
163
+ hi = mid
164
+ else:
165
+ lo = mid + 1
166
+ return lo
167
+
168
+
136
169
  # ── Qwen3: <tool_call> JSON </tool_call> ────────────────────────────
137
170
 
138
171
 
@@ -143,11 +176,26 @@ def parse_qwen3(
143
176
  stop_ids: set[int],
144
177
  tool_call_id: int,
145
178
  tool_call_end_id: int,
179
+ reasoning_end_id: int | None = None,
146
180
  ) -> ParsedResponse:
147
181
  """Parse Qwen3 completion tokens. Hermes-style JSON tool calls."""
148
182
  ids = _strip_stop_tokens(token_ids, stop_ids)
149
183
 
150
- tc_start = _find(ids, tool_call_id)
184
+ # Reasoning is resolved before tool calls. Thinking models (e.g.
185
+ # Qwen3-*-Thinking) routinely draft ``<tool_call>`` blocks *inside* their
186
+ # ``<think>...</think>`` trace while planning; those are reasoning, not
187
+ # real invocations. Anchoring the tool-call scan after the ``</think>``
188
+ # boundary keeps in-think drafts out of ``tool_calls`` (otherwise they
189
+ # surface as phantom/duplicate calls) and out of the reasoning/content
190
+ # split. Mirrors vLLM's DelegatingParser, which runs the reasoning parser
191
+ # first and tool-parses only the post-``</think>`` content.
192
+ # ``reasoning_end_id`` is the ``</think>`` token id; when it's absent
193
+ # (``None``) or the model never closed its reasoning, the scan falls back
194
+ # to the whole stream (prior behavior).
195
+ reasoning_end = _find(ids, reasoning_end_id) if reasoning_end_id is not None else -1
196
+ scan_start = reasoning_end + 1 if reasoning_end != -1 else 0
197
+
198
+ tc_start = _find(ids, tool_call_id, scan_start)
151
199
  tool_calls: list[ParsedToolCall] = []
152
200
  if tc_start != -1:
153
201
  content_ids = ids[:tc_start]
@@ -685,7 +733,15 @@ def parse_deepseek_v3(
685
733
  """
686
734
  ids = _strip_stop_tokens(token_ids, stop_ids)
687
735
 
688
- tc_section_start = _find(ids, tool_calls_begin_id)
736
+ # Reasoning first: skip past </think> before looking for the tool-call
737
+ # section, so a section the model drafts *inside* its <think> trace isn't
738
+ # parsed as a real call (regression #78 — cf. parse_qwen3). content_ids
739
+ # still starts at 0, so the </think> text-split below recovers reasoning.
740
+ # DeepSeek-V3 renders </think> as multi-token text, hence the decode-based
741
+ # boundary finder rather than a token-id anchor.
742
+ reasoning_end = _reasoning_end_token_index(tokenizer, ids)
743
+
744
+ tc_section_start = _find(ids, tool_calls_begin_id, reasoning_end)
689
745
  tool_calls: list[ParsedToolCall] = []
690
746
  if tc_section_start != -1:
691
747
  content_ids = ids[:tc_section_start]
@@ -962,6 +1018,7 @@ def parse_kimi_k2_section(
962
1018
  tool_call_begin_id: int,
963
1019
  tool_call_argument_begin_id: int,
964
1020
  tool_call_end_id: int,
1021
+ scan_start: int = 0,
965
1022
  ) -> tuple[list[int], list[ParsedToolCall]]:
966
1023
  """Split ``ids`` into ``(content_before_section, tool_calls)`` by finding
967
1024
  the Kimi-style tool-call section delimiters.
@@ -973,8 +1030,15 @@ def parse_kimi_k2_section(
973
1030
  of the section and a list of ``ParsedToolCall`` covering every attempted
974
1031
  block inside it; an unclosed section is still walked to whatever the model
975
1032
  emitted before EOS. Returns ``(ids, [])`` when no section is present.
1033
+
1034
+ ``scan_start`` restricts the section search to ``ids[scan_start:]`` while
1035
+ keeping ``content_ids = ids[:section_start]`` and all token spans relative
1036
+ to the full ``ids``. Callers pass the post-``</think>`` index so a section
1037
+ the model drafts inside its reasoning trace isn't parsed as a real call;
1038
+ because ``content_ids`` still starts at 0, downstream text-based reasoning
1039
+ extraction is unaffected (regression #78).
976
1040
  """
977
- section_start = _find_any(ids, tool_calls_section_begin_ids)
1041
+ section_start = _find_any(ids, tool_calls_section_begin_ids, scan_start)
978
1042
  if section_start == -1:
979
1043
  return list(ids), []
980
1044
  content_ids = ids[:section_start]
@@ -62,6 +62,7 @@ class Qwen3Renderer:
62
62
  self._tool_call_end = self._token_id("</tool_call>")
63
63
  self._tool_response = self._token_id("<tool_response>")
64
64
  self._tool_response_end = self._token_id("</tool_response>")
65
+ self._think_end = self._token_id("</think>")
65
66
 
66
67
  def _token_id(self, token: str) -> int:
67
68
  tid = self._tokenizer.convert_tokens_to_ids(token)
@@ -276,6 +277,7 @@ class Qwen3Renderer:
276
277
  stop_ids={self._im_end, self._endoftext},
277
278
  tool_call_id=self._tool_call,
278
279
  tool_call_end_id=self._tool_call_end,
280
+ reasoning_end_id=self._think_end,
279
281
  )
280
282
 
281
283
  def get_stop_token_ids(self) -> list[int]:
@@ -325,6 +325,7 @@ class Qwen3VLRenderer:
325
325
  self._tool_call_end = self._token_id("</tool_call>")
326
326
  self._tool_response = self._token_id("<tool_response>")
327
327
  self._tool_response_end = self._token_id("</tool_response>")
328
+ self._think_end = self._token_id("</think>")
328
329
  self._vision_start = self._token_id("<|vision_start|>")
329
330
  self._vision_end = self._token_id("<|vision_end|>")
330
331
  self._image_pad = self._token_id("<|image_pad|>")
@@ -634,6 +635,7 @@ class Qwen3VLRenderer:
634
635
  stop_ids={self._im_end, self._endoftext},
635
636
  tool_call_id=self._tool_call,
636
637
  tool_call_end_id=self._tool_call_end,
638
+ reasoning_end_id=self._think_end,
637
639
  )
638
640
 
639
641
  def get_stop_token_ids(self) -> list[int]:
@@ -36,6 +36,16 @@ RENDERER_MODELS = [
36
36
  # Ultra resolves the Ultra template variant via name (auto → ultra=True).
37
37
  ("nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-BF16", "auto"),
38
38
  ("poolside/Laguna-XS.2", "auto"),
39
+ # DeepSeek-V3/R1 are intentionally NOT in this shared barrage: their
40
+ # chat templates can't render the barrage's tool-call fixtures (the
41
+ # templates require ``tool['type']`` and a string-serialized
42
+ # ``arguments``, and V3 only renders tool_calls when content is None —
43
+ # so ``apply_chat_template`` raises or drops the calls on the shared
44
+ # shapes), and the is_content body-recovery checks hit a Metaspace
45
+ # subset-decode artifact. The renderer is correct in all these cases;
46
+ # there's just no byte-output to parity-check against. Split-specific
47
+ # parity (V3 bare prompt vs R1 <think>+history-strip) is covered in
48
+ # tests/test_deepseek_r1.py.
39
49
  # Llama-3 loads via the unrestricted unsloth mirror (byte-identical
40
50
  # chat template) so CI needs no Meta-gated HF token. Pinned to the
41
51
  # explicit "llama-3" config because the mirror name isn't in