renderers 0.1.8.dev1__tar.gz → 0.1.8.dev2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/.github/workflows/publish.yml +32 -8
  2. {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/PKG-INFO +2 -1
  3. {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/pyproject.toml +6 -0
  4. {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/renderers/_version.py +2 -2
  5. {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/renderers/base.py +126 -23
  6. {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/renderers/deepseek_v3.py +60 -32
  7. {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/renderers/glm45.py +81 -42
  8. {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/renderers/glm5.py +82 -39
  9. {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/renderers/gpt_oss.py +54 -12
  10. {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/renderers/kimi_k2.py +89 -63
  11. {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/renderers/kimi_k25.py +92 -50
  12. {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/renderers/laguna_xs2.py +64 -31
  13. {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/renderers/minimax_m2.py +98 -42
  14. {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/renderers/nemotron3.py +81 -55
  15. {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/renderers/qwen3.py +82 -49
  16. {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/renderers/qwen35.py +98 -60
  17. {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/renderers/qwen3_vl.py +114 -64
  18. renderers-0.1.8.dev2/tests/test_load_tokenizer_fastokens.py +172 -0
  19. renderers-0.1.8.dev2/tests/test_sampled_mask.py +119 -0
  20. {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/uv.lock +4 -5
  21. {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/.github/workflows/style.yml +0 -0
  22. {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/.github/workflows/test.yml +0 -0
  23. {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/.gitignore +0 -0
  24. {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/.pre-commit-config.yaml +0 -0
  25. {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/LICENSE +0 -0
  26. {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/README.md +0 -0
  27. {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/examples/README.md +0 -0
  28. {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/examples/sglang/multiturn_generate_sglang.py +0 -0
  29. {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/examples/sglang/online_multiturn_sglang.py +0 -0
  30. {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/examples/tinker/multiturn_generate_tinker.py +0 -0
  31. {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/examples/transformers/multiturn_generate_transformers.py +0 -0
  32. {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/examples/vllm/multiturn_generate_vllm.py +0 -0
  33. {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/renderers/__init__.py +0 -0
  34. {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/renderers/client.py +0 -0
  35. {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/renderers/default.py +0 -0
  36. {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/renderers/parsers.py +0 -0
  37. {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/renderers/parsing.py +0 -0
  38. {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/renderers/qwen36.py +0 -0
  39. {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/tests/conftest.py +0 -0
  40. {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/tests/test_bridge.py +0 -0
  41. {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/tests/test_build_helpers.py +0 -0
  42. {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/tests/test_client.py +0 -0
  43. {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/tests/test_gpt_oss_harmony_parity.py +0 -0
  44. {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/tests/test_incremental.py +0 -0
  45. {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/tests/test_load_tokenizer.py +0 -0
  46. {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/tests/test_message_indices.py +0 -0
  47. {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/tests/test_multimodal.py +0 -0
  48. {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/tests/test_parse_response.py +0 -0
  49. {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/tests/test_parse_response_robustness.py +0 -0
  50. {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/tests/test_parsers.py +0 -0
  51. {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/tests/test_preserve_thinking.py +0 -0
  52. {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/tests/test_qwen35_size_coverage.py +0 -0
  53. {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/tests/test_render_ids.py +0 -0
  54. {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/tests/test_roundtrip.py +0 -0
  55. {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/tests/test_tool_arg_type_preservation.py +0 -0
@@ -12,8 +12,16 @@ on:
12
12
  - "renderers-v*"
13
13
 
14
14
  jobs:
15
- publish:
15
+ # Build (no OIDC) → publish (OIDC only). The build job runs uv build with
16
+ # contents: read only so a poisoned build-time dep cannot mint the OIDC
17
+ # token. The publish job has id-token: write and the pypi-prod environment
18
+ # but no source checkout — it only downloads the prebuilt artifact and runs
19
+ # the SHA-pinned pypa publish action.
20
+ build:
21
+ if: github.event_name == 'workflow_dispatch' || startsWith(github.ref, 'refs/tags/renderers-v')
16
22
  runs-on: ubuntu-latest
23
+ permissions:
24
+ contents: read
17
25
  steps:
18
26
  - name: Checkout tagged release (dispatch)
19
27
  if: github.event_name == 'workflow_dispatch'
@@ -28,8 +36,7 @@ jobs:
28
36
  with:
29
37
  fetch-depth: 0
30
38
 
31
- - name: Resolve release tag
32
- id: release
39
+ - name: Validate release tag
33
40
  env:
34
41
  EVENT_NAME: ${{ github.event_name }}
35
42
  PUSHED_REF: ${{ github.ref_name }}
@@ -53,14 +60,31 @@ jobs:
53
60
  ;;
54
61
  esac
55
62
 
56
- echo "tag=$TAG" >> "$GITHUB_OUTPUT"
57
-
58
63
  - uses: astral-sh/setup-uv@v7
59
64
 
60
65
  - name: Build renderers
61
66
  run: uv build
62
67
 
68
+ - name: Upload dist artifacts
69
+ uses: actions/upload-artifact@v4
70
+ with:
71
+ name: dist
72
+ path: dist/
73
+ if-no-files-found: error
74
+ retention-days: 7
75
+
76
+ publish:
77
+ needs: build
78
+ runs-on: ubuntu-latest
79
+ environment: pypi-prod
80
+ permissions:
81
+ id-token: write
82
+ steps:
83
+ - name: Download dist artifacts
84
+ uses: actions/download-artifact@v4
85
+ with:
86
+ name: dist
87
+ path: dist/
88
+
63
89
  - name: Publish to PyPI
64
- env:
65
- PYPI_RENDERERS_TOKEN: ${{ secrets.PYPI_RENDERERS_TOKEN }}
66
- run: uv publish --token "$PYPI_RENDERERS_TOKEN" dist/*
90
+ uses: pypa/gh-action-pypi-publish@cef221092ed1bacb1cc03d23a2d87d1d172e277b # v1.14.0
@@ -1,10 +1,11 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: renderers
3
- Version: 0.1.8.dev1
3
+ Version: 0.1.8.dev2
4
4
  Summary: Chat template renderers — deterministic message-to-token conversion for LLM training
5
5
  License-Expression: Apache-2.0
6
6
  License-File: LICENSE
7
7
  Requires-Python: <3.14,>=3.10
8
+ Requires-Dist: fastokens>=0.1.1
8
9
  Requires-Dist: jinja2
9
10
  Requires-Dist: numpy
10
11
  Requires-Dist: openai-harmony>=0.0.8
@@ -24,6 +24,12 @@ dependencies = [
24
24
  # OpenAI's reference implementation keeps us byte-identical with vLLM
25
25
  # (which also uses it) and saves us mirroring a 330-line Jinja template.
26
26
  "openai-harmony>=0.0.8",
27
+ # Crusoe's Rust BPE tokenizer; ~10x faster encode vs HF's tokenizers.
28
+ # ``load_tokenizer`` patches it in by default for every supported model
29
+ # except a small denylist (DeepSeek-V3 family, MiniMax-M2 family). The
30
+ # patch is bracketed around ``from_pretrained``, so subsequent
31
+ # ``AutoTokenizer`` calls outside the renderers package stay vanilla.
32
+ "fastokens>=0.1.1",
27
33
  ]
28
34
 
29
35
  [tool.hatch.version]
@@ -18,7 +18,7 @@ version_tuple: tuple[int | str, ...]
18
18
  commit_id: str | None
19
19
  __commit_id__: str | None
20
20
 
21
- __version__ = version = '0.1.8.dev1'
22
- __version_tuple__ = version_tuple = (0, 1, 8, 'dev1')
21
+ __version__ = version = '0.1.8.dev2'
22
+ __version_tuple__ = version_tuple = (0, 1, 8, 'dev2')
23
23
 
24
24
  __commit_id__ = commit_id = None
@@ -148,8 +148,26 @@ class RenderedTokens:
148
148
  """Result of rendering messages to tokens.
149
149
 
150
150
  Each token carries an index into the original message list so callers can
151
- build per-token loss masks without re-rendering. Tokens from structural
152
- scaffolding (generation prompt, im_start/im_end wrapping) carry index -1.
151
+ build per-token loss masks without re-rendering. Tokens from structural
152
+ scaffolding the renderer adds outside any single message (e.g. the
153
+ trailing generation prompt) carry index ``-1``.
154
+
155
+ ``sampled_mask`` is a separate per-token signal: ``True`` if the model
156
+ would have produced this token at inference time (i.e. it appears in
157
+ the sampled completion), ``False`` if it is template-injected
158
+ scaffolding the model never emits (``<|im_start|>role\\n`` openers,
159
+ inter-turn ``\\n`` separators, system / user / tool content from
160
+ conversation history, etc.). This is distinct from
161
+ ``message_indices``: a token can belong to an assistant message
162
+ (``message_indices[k] >= 0``) and still be scaffolding the template
163
+ adds around the model's actual completion. SFT loss masks should AND
164
+ both: train on tokens whose role is trainable AND that the model
165
+ would actually sample.
166
+
167
+ Empty ``sampled_mask`` (``[]``) means the renderer doesn't provide
168
+ this signal — consumers should fall back to attribution-only
169
+ masking. ``DefaultRenderer`` leaves it empty because the Jinja
170
+ template is opaque; hand-coded renderers populate it.
153
171
 
154
172
  ``multi_modal_data`` is populated by multimodal renderers (e.g.
155
173
  ``Qwen3VLRenderer``) when image / video content parts are present;
@@ -158,6 +176,7 @@ class RenderedTokens:
158
176
 
159
177
  token_ids: list[int] = field(default_factory=list)
160
178
  message_indices: list[int] = field(default_factory=list)
179
+ sampled_mask: list[bool] = field(default_factory=list)
161
180
  multi_modal_data: "MultiModalData | None" = None
162
181
 
163
182
 
@@ -713,37 +732,108 @@ TRUSTED_REVISIONS: dict[str, str] = {
713
732
  }
714
733
 
715
734
 
716
- def load_tokenizer(model_name_or_path: str):
717
- """Load a tokenizer with the renderers-package security policy.
735
+ # Models for which ``fastokens`` is known to diverge from vanilla
736
+ # ``transformers.AutoTokenizer`` and therefore must NOT be patched.
737
+ # Empirical audit ran each entry of ``MODEL_RENDERER_MAP`` through both
738
+ # backends; 31/35 passed byte-identical. The four below either fail to
739
+ # load under fastokens (DeepSeek-V3 family — Metaspace pretokenizer not
740
+ # yet implemented) or are kept defensively pending an upstream fastokens
741
+ # fix (MiniMax-M2 family — see per-entry comments).
742
+ FASTOKENS_INCOMPATIBLE: frozenset[str] = frozenset(
743
+ {
744
+ # fastokens 0.1.1: ``ValueError: pre-tokenizer error: unsupported
745
+ # pre-tokenizer type: Metaspace`` — DeepSeek's tokenizer uses
746
+ # SentencePiece-style Metaspace pretokenization which fastokens
747
+ # doesn't yet implement.
748
+ "deepseek-ai/DeepSeek-V3",
749
+ "deepseek-ai/DeepSeek-V3-Base",
750
+ # MiniMax: kept defensive pending upstream fastokens fix
751
+ # https://github.com/crusoecloud/fastokens/pull/32 — that PR
752
+ # removes a stray attribute leaked by ``unpatch_transformers``
753
+ # which steers MiniMax (declared ``tokenizer_class =
754
+ # 'GPT2Tokenizer'`` → slow→fast conversion path) down a different
755
+ # load path on subsequent vanilla loads. Once the upstream fix
756
+ # is released, these two entries can be dropped after re-audit.
757
+ "MiniMaxAI/MiniMax-M2",
758
+ "MiniMaxAI/MiniMax-M2.5",
759
+ }
760
+ )
761
+
762
+
763
+ def _patched_load(model_name_or_path: str, **kwargs):
764
+ """Run ``AutoTokenizer.from_pretrained`` with fastokens patched in
765
+ process-locally — patch around the load, unpatch right after.
766
+
767
+ fastokens captures the loaded backend on a per-tokenizer basis, so
768
+ after we unpatch the returned tokenizer object continues to use
769
+ fastokens for ``encode``/``decode`` while subsequent
770
+ ``AutoTokenizer.from_pretrained`` calls (outside our control) go
771
+ back to vanilla. This keeps the global side effect minimal.
772
+ """
773
+ import fastokens
774
+ from transformers import AutoTokenizer
775
+
776
+ fastokens.patch_transformers()
777
+ try:
778
+ return AutoTokenizer.from_pretrained(model_name_or_path, **kwargs)
779
+ finally:
780
+ fastokens.unpatch_transformers()
718
781
 
719
- Default: ``trust_remote_code=False`` — the safe choice for every
720
- model in ``MODEL_RENDERER_MAP`` *except* the Kimi-K2 family.
721
782
 
722
- Models listed in ``TRUSTED_REVISIONS`` load with
723
- ``trust_remote_code=True`` AND ``revision=<pinned sha>`` — required
724
- because their tokenizer config has an ``auto_map.AutoTokenizer``
725
- entry pointing at a repo-supplied Python class
726
- (``tokenization_kimi.TikTokenTokenizer``). Pinning the revision
727
- means transformers executes only the reviewed commit's code, not
728
- whatever ``HEAD`` points at when the call fires.
783
+ def load_tokenizer(
784
+ model_name_or_path: str,
785
+ *,
786
+ use_fastokens: bool = True,
787
+ ):
788
+ """Load a tokenizer with the renderers-package security + perf policy.
789
+
790
+ **Security** — default ``trust_remote_code=False``. Models listed in
791
+ ``TRUSTED_REVISIONS`` (Moonshot Kimi-K2 family) load with
792
+ ``trust_remote_code=True`` AND a pinned ``revision=<sha>`` so
793
+ transformers only executes the reviewed commit's tokenizer Python.
794
+
795
+ **Performance** — ``use_fastokens=True`` (default) routes the load
796
+ through ``fastokens.patch_transformers()`` so the resulting tokenizer
797
+ encodes ~10x faster than vanilla ``tokenizers``. The patch is
798
+ bracketed: it's applied before ``from_pretrained`` and removed
799
+ immediately after, so global ``AutoTokenizer.from_pretrained`` calls
800
+ elsewhere in the user's process are not affected.
801
+
802
+ Models in ``FASTOKENS_INCOMPATIBLE`` (DeepSeek-V3 family, MiniMax-M2
803
+ family) skip the patch — fastokens 0.1.1 either fails to load them
804
+ or produces token-divergent output. Pass ``use_fastokens=False`` to
805
+ force the vanilla backend for any other model.
729
806
 
730
807
  Unknown / fine-tuned model paths fall through to
731
- ``trust_remote_code=False``. Callers who legitimately need to load
732
- a custom-code tokenizer outside this allow-list should call
733
- ``AutoTokenizer.from_pretrained`` themselves and pass the result to
734
- ``create_renderer`` (which doesn't load tokenizers — only
735
- ``create_renderer_pool`` does).
808
+ ``trust_remote_code=False`` and the patched-load fast path. If
809
+ fastokens raises during the patched load (e.g. an unknown
810
+ pre-tokenizer type), we automatically retry with the vanilla
811
+ backend and emit an INFO log.
736
812
  """
737
813
  from transformers import AutoTokenizer
738
814
 
815
+ kwargs: dict[str, Any] = {}
739
816
  revision = TRUSTED_REVISIONS.get(model_name_or_path)
740
817
  if revision is not None:
741
- return AutoTokenizer.from_pretrained(
818
+ kwargs = {"trust_remote_code": True, "revision": revision}
819
+ else:
820
+ kwargs = {"trust_remote_code": False}
821
+
822
+ if not use_fastokens or model_name_or_path in FASTOKENS_INCOMPATIBLE:
823
+ return AutoTokenizer.from_pretrained(model_name_or_path, **kwargs)
824
+
825
+ try:
826
+ return _patched_load(model_name_or_path, **kwargs)
827
+ except Exception as exc:
828
+ logger.info(
829
+ "fastokens could not load %r (%s: %s); falling back to vanilla "
830
+ "AutoTokenizer. Add this model to FASTOKENS_INCOMPATIBLE in "
831
+ "renderers.base to suppress the retry.",
742
832
  model_name_or_path,
743
- trust_remote_code=True,
744
- revision=revision,
833
+ type(exc).__name__,
834
+ str(exc)[:160],
745
835
  )
746
- return AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=False)
836
+ return AutoTokenizer.from_pretrained(model_name_or_path, **kwargs)
747
837
 
748
838
 
749
839
  def _populate_registry():
@@ -947,12 +1037,25 @@ def build_training_sample(
947
1037
 
948
1038
  Single render() call + message_indices → per-token mask.
949
1039
  Replaces build_incremental_token_mask (O(N) renders → O(1)).
1040
+
1041
+ When the renderer populates ``rendered.sampled_mask``, the loss mask
1042
+ is the AND of role-based attribution and the sampled signal: only
1043
+ tokens the model would have produced at inference are trainable.
1044
+ This keeps SFT byte-aligned with the RL trajectory mask (where the
1045
+ prompt / completion split achieves the same effect structurally).
1046
+ Renderers that don't populate ``sampled_mask`` (empty list) fall
1047
+ back to attribution-only masking — every token attributed to a
1048
+ trainable role is trained on, including template-injected
1049
+ ``<|im_start|>role\\n`` openers.
950
1050
  """
951
1051
  rendered = renderer.render(messages, tools=tools)
1052
+ has_sampled_info = len(rendered.sampled_mask) == len(rendered.token_ids)
952
1053
  loss_mask: list[bool] = []
953
- for msg_idx in rendered.message_indices:
1054
+ for k, msg_idx in enumerate(rendered.message_indices):
954
1055
  if msg_idx < 0:
955
1056
  loss_mask.append(False)
1057
+ elif has_sampled_info and not rendered.sampled_mask[k]:
1058
+ loss_mask.append(False)
956
1059
  else:
957
1060
  loss_mask.append(role_to_mask(messages[msg_idx]))
958
1061
  return rendered.token_ids, loss_mask
@@ -113,20 +113,23 @@ class DeepSeekV3Renderer:
113
113
 
114
114
  tokens: list[int] = []
115
115
  indices: list[int] = []
116
+ sampled: list[bool] = []
116
117
 
117
- def emit_ids(ids: list[int], msg_idx: int) -> None:
118
+ def emit_ids(ids: list[int], msg_idx: int, *, is_sampled: bool) -> None:
118
119
  tokens.extend(ids)
119
120
  indices.extend([msg_idx] * len(ids))
121
+ sampled.extend([is_sampled] * len(ids))
120
122
 
121
- def emit_special(token_id: int, msg_idx: int) -> None:
123
+ def emit_special(token_id: int, msg_idx: int, *, is_sampled: bool) -> None:
122
124
  tokens.append(token_id)
123
125
  indices.append(msg_idx)
126
+ sampled.append(is_sampled)
124
127
 
125
- def emit_text(text: str, msg_idx: int) -> None:
126
- emit_ids(self._encode(text), msg_idx)
128
+ def emit_text(text: str, msg_idx: int, *, is_sampled: bool) -> None:
129
+ emit_ids(self._encode(text), msg_idx, is_sampled=is_sampled)
127
130
 
128
131
  # ── 1. BOS token ─────────────────────────────────────────────
129
- emit_special(self._bos, -1)
132
+ emit_special(self._bos, -1, is_sampled=False)
130
133
 
131
134
  # ── 2. Collect system messages at the start ───────────────────
132
135
  # All leading system messages are concatenated with "\n\n" and emitted
@@ -148,7 +151,7 @@ class DeepSeekV3Renderer:
148
151
 
149
152
  if sys_parts:
150
153
  # Attribute the concatenated system text to the first system message (index 0).
151
- emit_text("\n\n".join(sys_parts), 0)
154
+ emit_text("\n\n".join(sys_parts), 0, is_sampled=False)
152
155
 
153
156
  # ── 3. Render non-system messages ─────────────────────────────
154
157
  num_messages = len(messages)
@@ -163,8 +166,8 @@ class DeepSeekV3Renderer:
163
166
  content = "".join(
164
167
  p.get("text", "") for p in content if isinstance(p, dict)
165
168
  )
166
- emit_special(self._user_token, i)
167
- emit_text(str(content), i)
169
+ emit_special(self._user_token, i, is_sampled=False)
170
+ emit_text(str(content), i, is_sampled=False)
168
171
 
169
172
  elif role == "user":
170
173
  content = msg.get("content") or ""
@@ -177,8 +180,8 @@ class DeepSeekV3Renderer:
177
180
  else ""
178
181
  for p in content
179
182
  )
180
- emit_special(self._user_token, i)
181
- emit_text(str(content), i)
183
+ emit_special(self._user_token, i, is_sampled=False)
184
+ emit_text(str(content), i, is_sampled=False)
182
185
 
183
186
  elif role == "assistant":
184
187
  self._render_assistant(
@@ -202,11 +205,13 @@ class DeepSeekV3Renderer:
202
205
  # Don't add <|Assistant|> after tool outputs — content flows directly.
203
206
  last_role = messages[-1]["role"] if messages else None
204
207
  if last_role != "tool":
205
- emit_special(self._assistant_token, -1)
208
+ emit_special(self._assistant_token, -1, is_sampled=False)
206
209
  if self._enable_thinking:
207
- emit_text("<think>\n", -1)
210
+ emit_text("<think>\n", -1, is_sampled=False)
208
211
 
209
- return RenderedTokens(token_ids=tokens, message_indices=indices)
212
+ return RenderedTokens(
213
+ token_ids=tokens, message_indices=indices, sampled_mask=sampled
214
+ )
210
215
 
211
216
  def render_ids(
212
217
  self,
@@ -267,10 +272,20 @@ class DeepSeekV3Renderer:
267
272
 
268
273
  ext: list[int] = []
269
274
 
270
- def emit_special(token_id: int, _msg_idx: int = -1) -> None:
275
+ # Bridge output is consumed as the next turn's prompt — the
276
+ # caller blanket-masks it via ``prompt_mask=[False]*N``, so we
277
+ # don't track sampled_mask here. Local helpers accept the kwarg
278
+ # for signature compatibility with ``_render_tool`` and ignore
279
+ # it; the returned ``RenderedTokens`` leaves ``sampled_mask``
280
+ # empty.
281
+ def emit_special(
282
+ token_id: int, _msg_idx: int = -1, *, is_sampled: bool = False
283
+ ) -> None:
271
284
  ext.append(token_id)
272
285
 
273
- def emit_text(text: str, _msg_idx: int = -1) -> None:
286
+ def emit_text(
287
+ text: str, _msg_idx: int = -1, *, is_sampled: bool = False
288
+ ) -> None:
274
289
  ext.extend(self._encode(text))
275
290
 
276
291
  for i, msg in enumerate(new_messages):
@@ -354,17 +369,24 @@ class DeepSeekV3Renderer:
354
369
 
355
370
  tool_calls = msg.get("tool_calls") or []
356
371
 
372
+ # ``<|Assistant|>`` is template-injected scaffolding — at
373
+ # inference the chat template emits it as the generation prompt
374
+ # and the model never samples it. Marking it ``is_sampled=False``
375
+ # keeps the SFT loss mask aligned with what the model would
376
+ # actually have produced. When the previous message is a tool
377
+ # response, the template skips this token entirely (content
378
+ # flows directly out of ``<|tool▁outputs▁end|>``).
357
379
  if not prev_is_tool:
358
- emit_special(self._assistant_token, msg_idx)
380
+ emit_special(self._assistant_token, msg_idx, is_sampled=False)
359
381
 
360
382
  if not tool_calls:
361
- emit_text(content, msg_idx)
383
+ emit_text(content, msg_idx, is_sampled=True)
362
384
  else:
363
385
  # Emit any pre-tool-call content first.
364
- emit_text(content, msg_idx)
386
+ emit_text(content, msg_idx, is_sampled=True)
365
387
 
366
388
  # Tool call section.
367
- emit_special(self._tool_calls_begin, msg_idx)
389
+ emit_special(self._tool_calls_begin, msg_idx, is_sampled=True)
368
390
  for tc in tool_calls:
369
391
  func = tc.get("function") or tc
370
392
  name = func.get("name", "")
@@ -376,14 +398,17 @@ class DeepSeekV3Renderer:
376
398
  )
377
399
  # Format: <|tool▁call▁begin|>function<|tool▁sep|>{name}\n```json\n{args}\n```<|tool▁call▁end|>
378
400
  # tool_sep is a special token; type ("function") and name+args are plain text.
379
- emit_special(self._tool_call_begin, msg_idx)
380
- emit_text("function", msg_idx)
381
- emit_special(self._tool_sep, msg_idx)
382
- emit_text(f"{name}\n```json\n{args_str}\n```", msg_idx)
383
- emit_special(self._tool_call_end, msg_idx)
384
- emit_special(self._tool_calls_end, msg_idx)
385
-
386
- emit_special(self._eos, msg_idx)
401
+ emit_special(self._tool_call_begin, msg_idx, is_sampled=True)
402
+ emit_text("function", msg_idx, is_sampled=True)
403
+ emit_special(self._tool_sep, msg_idx, is_sampled=True)
404
+ emit_text(f"{name}\n```json\n{args_str}\n```", msg_idx, is_sampled=True)
405
+ emit_special(self._tool_call_end, msg_idx, is_sampled=True)
406
+ emit_special(self._tool_calls_end, msg_idx, is_sampled=True)
407
+
408
+ # ``<|end▁of▁sentence|>`` is the model's stop signal — it
409
+ # samples this to end its turn, so it is part of the sampled
410
+ # stream.
411
+ emit_special(self._eos, msg_idx, is_sampled=True)
387
412
 
388
413
  # ------------------------------------------------------------------
389
414
  # Tool (tool-response) rendering
@@ -397,6 +422,9 @@ class DeepSeekV3Renderer:
397
422
  emit_special,
398
423
  emit_text,
399
424
  ) -> None:
425
+ # Tool messages are conversation history injected by the runtime
426
+ # between assistant turns — the model never samples any of these
427
+ # tokens, so every emission is is_sampled=False.
400
428
  prev_is_tool = msg_idx > 0 and messages[msg_idx - 1]["role"] == "tool"
401
429
  next_is_tool = (
402
430
  msg_idx + 1 < len(messages) and messages[msg_idx + 1]["role"] == "tool"
@@ -407,11 +435,11 @@ class DeepSeekV3Renderer:
407
435
  content = "".join(p.get("text", "") for p in content if isinstance(p, dict))
408
436
 
409
437
  if not prev_is_tool:
410
- emit_special(self._tool_outputs_begin, msg_idx)
438
+ emit_special(self._tool_outputs_begin, msg_idx, is_sampled=False)
411
439
 
412
- emit_special(self._tool_output_begin, msg_idx)
413
- emit_text(str(content), msg_idx)
414
- emit_special(self._tool_output_end, msg_idx)
440
+ emit_special(self._tool_output_begin, msg_idx, is_sampled=False)
441
+ emit_text(str(content), msg_idx, is_sampled=False)
442
+ emit_special(self._tool_output_end, msg_idx, is_sampled=False)
415
443
 
416
444
  if not next_is_tool:
417
- emit_special(self._tool_outputs_end, msg_idx)
445
+ emit_special(self._tool_outputs_end, msg_idx, is_sampled=False)