renderers 0.1.8.dev1__tar.gz → 0.1.8.dev2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/.github/workflows/publish.yml +32 -8
- {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/PKG-INFO +2 -1
- {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/pyproject.toml +6 -0
- {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/renderers/_version.py +2 -2
- {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/renderers/base.py +126 -23
- {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/renderers/deepseek_v3.py +60 -32
- {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/renderers/glm45.py +81 -42
- {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/renderers/glm5.py +82 -39
- {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/renderers/gpt_oss.py +54 -12
- {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/renderers/kimi_k2.py +89 -63
- {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/renderers/kimi_k25.py +92 -50
- {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/renderers/laguna_xs2.py +64 -31
- {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/renderers/minimax_m2.py +98 -42
- {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/renderers/nemotron3.py +81 -55
- {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/renderers/qwen3.py +82 -49
- {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/renderers/qwen35.py +98 -60
- {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/renderers/qwen3_vl.py +114 -64
- renderers-0.1.8.dev2/tests/test_load_tokenizer_fastokens.py +172 -0
- renderers-0.1.8.dev2/tests/test_sampled_mask.py +119 -0
- {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/uv.lock +4 -5
- {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/.github/workflows/style.yml +0 -0
- {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/.github/workflows/test.yml +0 -0
- {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/.gitignore +0 -0
- {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/.pre-commit-config.yaml +0 -0
- {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/LICENSE +0 -0
- {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/README.md +0 -0
- {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/examples/README.md +0 -0
- {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/examples/sglang/multiturn_generate_sglang.py +0 -0
- {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/examples/sglang/online_multiturn_sglang.py +0 -0
- {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/examples/tinker/multiturn_generate_tinker.py +0 -0
- {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/examples/transformers/multiturn_generate_transformers.py +0 -0
- {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/examples/vllm/multiturn_generate_vllm.py +0 -0
- {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/renderers/__init__.py +0 -0
- {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/renderers/client.py +0 -0
- {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/renderers/default.py +0 -0
- {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/renderers/parsers.py +0 -0
- {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/renderers/parsing.py +0 -0
- {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/renderers/qwen36.py +0 -0
- {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/tests/conftest.py +0 -0
- {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/tests/test_bridge.py +0 -0
- {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/tests/test_build_helpers.py +0 -0
- {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/tests/test_client.py +0 -0
- {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/tests/test_gpt_oss_harmony_parity.py +0 -0
- {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/tests/test_incremental.py +0 -0
- {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/tests/test_load_tokenizer.py +0 -0
- {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/tests/test_message_indices.py +0 -0
- {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/tests/test_multimodal.py +0 -0
- {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/tests/test_parse_response.py +0 -0
- {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/tests/test_parse_response_robustness.py +0 -0
- {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/tests/test_parsers.py +0 -0
- {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/tests/test_preserve_thinking.py +0 -0
- {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/tests/test_qwen35_size_coverage.py +0 -0
- {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/tests/test_render_ids.py +0 -0
- {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/tests/test_roundtrip.py +0 -0
- {renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/tests/test_tool_arg_type_preservation.py +0 -0
|
@@ -12,8 +12,16 @@ on:
|
|
|
12
12
|
- "renderers-v*"
|
|
13
13
|
|
|
14
14
|
jobs:
|
|
15
|
-
publish
|
|
15
|
+
# Build (no OIDC) → publish (OIDC only). The build job runs uv build with
|
|
16
|
+
# contents: read only so a poisoned build-time dep cannot mint the OIDC
|
|
17
|
+
# token. The publish job has id-token: write and the pypi-prod environment
|
|
18
|
+
# but no source checkout — it only downloads the prebuilt artifact and runs
|
|
19
|
+
# the SHA-pinned pypa publish action.
|
|
20
|
+
build:
|
|
21
|
+
if: github.event_name == 'workflow_dispatch' || startsWith(github.ref, 'refs/tags/renderers-v')
|
|
16
22
|
runs-on: ubuntu-latest
|
|
23
|
+
permissions:
|
|
24
|
+
contents: read
|
|
17
25
|
steps:
|
|
18
26
|
- name: Checkout tagged release (dispatch)
|
|
19
27
|
if: github.event_name == 'workflow_dispatch'
|
|
@@ -28,8 +36,7 @@ jobs:
|
|
|
28
36
|
with:
|
|
29
37
|
fetch-depth: 0
|
|
30
38
|
|
|
31
|
-
- name:
|
|
32
|
-
id: release
|
|
39
|
+
- name: Validate release tag
|
|
33
40
|
env:
|
|
34
41
|
EVENT_NAME: ${{ github.event_name }}
|
|
35
42
|
PUSHED_REF: ${{ github.ref_name }}
|
|
@@ -53,14 +60,31 @@ jobs:
|
|
|
53
60
|
;;
|
|
54
61
|
esac
|
|
55
62
|
|
|
56
|
-
echo "tag=$TAG" >> "$GITHUB_OUTPUT"
|
|
57
|
-
|
|
58
63
|
- uses: astral-sh/setup-uv@v7
|
|
59
64
|
|
|
60
65
|
- name: Build renderers
|
|
61
66
|
run: uv build
|
|
62
67
|
|
|
68
|
+
- name: Upload dist artifacts
|
|
69
|
+
uses: actions/upload-artifact@v4
|
|
70
|
+
with:
|
|
71
|
+
name: dist
|
|
72
|
+
path: dist/
|
|
73
|
+
if-no-files-found: error
|
|
74
|
+
retention-days: 7
|
|
75
|
+
|
|
76
|
+
publish:
|
|
77
|
+
needs: build
|
|
78
|
+
runs-on: ubuntu-latest
|
|
79
|
+
environment: pypi-prod
|
|
80
|
+
permissions:
|
|
81
|
+
id-token: write
|
|
82
|
+
steps:
|
|
83
|
+
- name: Download dist artifacts
|
|
84
|
+
uses: actions/download-artifact@v4
|
|
85
|
+
with:
|
|
86
|
+
name: dist
|
|
87
|
+
path: dist/
|
|
88
|
+
|
|
63
89
|
- name: Publish to PyPI
|
|
64
|
-
|
|
65
|
-
PYPI_RENDERERS_TOKEN: ${{ secrets.PYPI_RENDERERS_TOKEN }}
|
|
66
|
-
run: uv publish --token "$PYPI_RENDERERS_TOKEN" dist/*
|
|
90
|
+
uses: pypa/gh-action-pypi-publish@cef221092ed1bacb1cc03d23a2d87d1d172e277b # v1.14.0
|
|
@@ -1,10 +1,11 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: renderers
|
|
3
|
-
Version: 0.1.8.
|
|
3
|
+
Version: 0.1.8.dev2
|
|
4
4
|
Summary: Chat template renderers — deterministic message-to-token conversion for LLM training
|
|
5
5
|
License-Expression: Apache-2.0
|
|
6
6
|
License-File: LICENSE
|
|
7
7
|
Requires-Python: <3.14,>=3.10
|
|
8
|
+
Requires-Dist: fastokens>=0.1.1
|
|
8
9
|
Requires-Dist: jinja2
|
|
9
10
|
Requires-Dist: numpy
|
|
10
11
|
Requires-Dist: openai-harmony>=0.0.8
|
|
@@ -24,6 +24,12 @@ dependencies = [
|
|
|
24
24
|
# OpenAI's reference implementation keeps us byte-identical with vLLM
|
|
25
25
|
# (which also uses it) and saves us mirroring a 330-line Jinja template.
|
|
26
26
|
"openai-harmony>=0.0.8",
|
|
27
|
+
# Crusoe's Rust BPE tokenizer; ~10x faster encode vs HF's tokenizers.
|
|
28
|
+
# ``load_tokenizer`` patches it in by default for every supported model
|
|
29
|
+
# except a small denylist (DeepSeek-V3 family, MiniMax-M2 family). The
|
|
30
|
+
# patch is bracketed around ``from_pretrained``, so subsequent
|
|
31
|
+
# ``AutoTokenizer`` calls outside the renderers package stay vanilla.
|
|
32
|
+
"fastokens>=0.1.1",
|
|
27
33
|
]
|
|
28
34
|
|
|
29
35
|
[tool.hatch.version]
|
|
@@ -18,7 +18,7 @@ version_tuple: tuple[int | str, ...]
|
|
|
18
18
|
commit_id: str | None
|
|
19
19
|
__commit_id__: str | None
|
|
20
20
|
|
|
21
|
-
__version__ = version = '0.1.8.
|
|
22
|
-
__version_tuple__ = version_tuple = (0, 1, 8, '
|
|
21
|
+
__version__ = version = '0.1.8.dev2'
|
|
22
|
+
__version_tuple__ = version_tuple = (0, 1, 8, 'dev2')
|
|
23
23
|
|
|
24
24
|
__commit_id__ = commit_id = None
|
|
@@ -148,8 +148,26 @@ class RenderedTokens:
|
|
|
148
148
|
"""Result of rendering messages to tokens.
|
|
149
149
|
|
|
150
150
|
Each token carries an index into the original message list so callers can
|
|
151
|
-
build per-token loss masks without re-rendering.
|
|
152
|
-
scaffolding
|
|
151
|
+
build per-token loss masks without re-rendering. Tokens from structural
|
|
152
|
+
scaffolding the renderer adds outside any single message (e.g. the
|
|
153
|
+
trailing generation prompt) carry index ``-1``.
|
|
154
|
+
|
|
155
|
+
``sampled_mask`` is a separate per-token signal: ``True`` if the model
|
|
156
|
+
would have produced this token at inference time (i.e. it appears in
|
|
157
|
+
the sampled completion), ``False`` if it is template-injected
|
|
158
|
+
scaffolding the model never emits (``<|im_start|>role\\n`` openers,
|
|
159
|
+
inter-turn ``\\n`` separators, system / user / tool content from
|
|
160
|
+
conversation history, etc.). This is distinct from
|
|
161
|
+
``message_indices``: a token can belong to an assistant message
|
|
162
|
+
(``message_indices[k] >= 0``) and still be scaffolding the template
|
|
163
|
+
adds around the model's actual completion. SFT loss masks should AND
|
|
164
|
+
both: train on tokens whose role is trainable AND that the model
|
|
165
|
+
would actually sample.
|
|
166
|
+
|
|
167
|
+
Empty ``sampled_mask`` (``[]``) means the renderer doesn't provide
|
|
168
|
+
this signal — consumers should fall back to attribution-only
|
|
169
|
+
masking. ``DefaultRenderer`` leaves it empty because the Jinja
|
|
170
|
+
template is opaque; hand-coded renderers populate it.
|
|
153
171
|
|
|
154
172
|
``multi_modal_data`` is populated by multimodal renderers (e.g.
|
|
155
173
|
``Qwen3VLRenderer``) when image / video content parts are present;
|
|
@@ -158,6 +176,7 @@ class RenderedTokens:
|
|
|
158
176
|
|
|
159
177
|
token_ids: list[int] = field(default_factory=list)
|
|
160
178
|
message_indices: list[int] = field(default_factory=list)
|
|
179
|
+
sampled_mask: list[bool] = field(default_factory=list)
|
|
161
180
|
multi_modal_data: "MultiModalData | None" = None
|
|
162
181
|
|
|
163
182
|
|
|
@@ -713,37 +732,108 @@ TRUSTED_REVISIONS: dict[str, str] = {
|
|
|
713
732
|
}
|
|
714
733
|
|
|
715
734
|
|
|
716
|
-
|
|
717
|
-
|
|
735
|
+
# Models for which ``fastokens`` is known to diverge from vanilla
|
|
736
|
+
# ``transformers.AutoTokenizer`` and therefore must NOT be patched.
|
|
737
|
+
# Empirical audit ran each entry of ``MODEL_RENDERER_MAP`` through both
|
|
738
|
+
# backends; 31/35 passed byte-identical. The four below either fail to
|
|
739
|
+
# load under fastokens (DeepSeek-V3 family — Metaspace pretokenizer not
|
|
740
|
+
# yet implemented) or are kept defensively pending an upstream fastokens
|
|
741
|
+
# fix (MiniMax-M2 family — see per-entry comments).
|
|
742
|
+
FASTOKENS_INCOMPATIBLE: frozenset[str] = frozenset(
|
|
743
|
+
{
|
|
744
|
+
# fastokens 0.1.1: ``ValueError: pre-tokenizer error: unsupported
|
|
745
|
+
# pre-tokenizer type: Metaspace`` — DeepSeek's tokenizer uses
|
|
746
|
+
# SentencePiece-style Metaspace pretokenization which fastokens
|
|
747
|
+
# doesn't yet implement.
|
|
748
|
+
"deepseek-ai/DeepSeek-V3",
|
|
749
|
+
"deepseek-ai/DeepSeek-V3-Base",
|
|
750
|
+
# MiniMax: kept defensive pending upstream fastokens fix
|
|
751
|
+
# https://github.com/crusoecloud/fastokens/pull/32 — that PR
|
|
752
|
+
# removes a stray attribute leaked by ``unpatch_transformers``
|
|
753
|
+
# which steers MiniMax (declared ``tokenizer_class =
|
|
754
|
+
# 'GPT2Tokenizer'`` → slow→fast conversion path) down a different
|
|
755
|
+
# load path on subsequent vanilla loads. Once the upstream fix
|
|
756
|
+
# is released, these two entries can be dropped after re-audit.
|
|
757
|
+
"MiniMaxAI/MiniMax-M2",
|
|
758
|
+
"MiniMaxAI/MiniMax-M2.5",
|
|
759
|
+
}
|
|
760
|
+
)
|
|
761
|
+
|
|
762
|
+
|
|
763
|
+
def _patched_load(model_name_or_path: str, **kwargs):
|
|
764
|
+
"""Run ``AutoTokenizer.from_pretrained`` with fastokens patched in
|
|
765
|
+
process-locally — patch around the load, unpatch right after.
|
|
766
|
+
|
|
767
|
+
fastokens captures the loaded backend on a per-tokenizer basis, so
|
|
768
|
+
after we unpatch the returned tokenizer object continues to use
|
|
769
|
+
fastokens for ``encode``/``decode`` while subsequent
|
|
770
|
+
``AutoTokenizer.from_pretrained`` calls (outside our control) go
|
|
771
|
+
back to vanilla. This keeps the global side effect minimal.
|
|
772
|
+
"""
|
|
773
|
+
import fastokens
|
|
774
|
+
from transformers import AutoTokenizer
|
|
775
|
+
|
|
776
|
+
fastokens.patch_transformers()
|
|
777
|
+
try:
|
|
778
|
+
return AutoTokenizer.from_pretrained(model_name_or_path, **kwargs)
|
|
779
|
+
finally:
|
|
780
|
+
fastokens.unpatch_transformers()
|
|
718
781
|
|
|
719
|
-
Default: ``trust_remote_code=False`` — the safe choice for every
|
|
720
|
-
model in ``MODEL_RENDERER_MAP`` *except* the Kimi-K2 family.
|
|
721
782
|
|
|
722
|
-
|
|
723
|
-
|
|
724
|
-
|
|
725
|
-
|
|
726
|
-
|
|
727
|
-
|
|
728
|
-
|
|
783
|
+
def load_tokenizer(
|
|
784
|
+
model_name_or_path: str,
|
|
785
|
+
*,
|
|
786
|
+
use_fastokens: bool = True,
|
|
787
|
+
):
|
|
788
|
+
"""Load a tokenizer with the renderers-package security + perf policy.
|
|
789
|
+
|
|
790
|
+
**Security** — default ``trust_remote_code=False``. Models listed in
|
|
791
|
+
``TRUSTED_REVISIONS`` (Moonshot Kimi-K2 family) load with
|
|
792
|
+
``trust_remote_code=True`` AND a pinned ``revision=<sha>`` so
|
|
793
|
+
transformers only executes the reviewed commit's tokenizer Python.
|
|
794
|
+
|
|
795
|
+
**Performance** — ``use_fastokens=True`` (default) routes the load
|
|
796
|
+
through ``fastokens.patch_transformers()`` so the resulting tokenizer
|
|
797
|
+
encodes ~10x faster than vanilla ``tokenizers``. The patch is
|
|
798
|
+
bracketed: it's applied before ``from_pretrained`` and removed
|
|
799
|
+
immediately after, so global ``AutoTokenizer.from_pretrained`` calls
|
|
800
|
+
elsewhere in the user's process are not affected.
|
|
801
|
+
|
|
802
|
+
Models in ``FASTOKENS_INCOMPATIBLE`` (DeepSeek-V3 family, MiniMax-M2
|
|
803
|
+
family) skip the patch — fastokens 0.1.1 either fails to load them
|
|
804
|
+
or produces token-divergent output. Pass ``use_fastokens=False`` to
|
|
805
|
+
force the vanilla backend for any other model.
|
|
729
806
|
|
|
730
807
|
Unknown / fine-tuned model paths fall through to
|
|
731
|
-
``trust_remote_code=False
|
|
732
|
-
|
|
733
|
-
|
|
734
|
-
|
|
735
|
-
``create_renderer_pool`` does).
|
|
808
|
+
``trust_remote_code=False`` and the patched-load fast path. If
|
|
809
|
+
fastokens raises during the patched load (e.g. an unknown
|
|
810
|
+
pre-tokenizer type), we automatically retry with the vanilla
|
|
811
|
+
backend and emit an INFO log.
|
|
736
812
|
"""
|
|
737
813
|
from transformers import AutoTokenizer
|
|
738
814
|
|
|
815
|
+
kwargs: dict[str, Any] = {}
|
|
739
816
|
revision = TRUSTED_REVISIONS.get(model_name_or_path)
|
|
740
817
|
if revision is not None:
|
|
741
|
-
|
|
818
|
+
kwargs = {"trust_remote_code": True, "revision": revision}
|
|
819
|
+
else:
|
|
820
|
+
kwargs = {"trust_remote_code": False}
|
|
821
|
+
|
|
822
|
+
if not use_fastokens or model_name_or_path in FASTOKENS_INCOMPATIBLE:
|
|
823
|
+
return AutoTokenizer.from_pretrained(model_name_or_path, **kwargs)
|
|
824
|
+
|
|
825
|
+
try:
|
|
826
|
+
return _patched_load(model_name_or_path, **kwargs)
|
|
827
|
+
except Exception as exc:
|
|
828
|
+
logger.info(
|
|
829
|
+
"fastokens could not load %r (%s: %s); falling back to vanilla "
|
|
830
|
+
"AutoTokenizer. Add this model to FASTOKENS_INCOMPATIBLE in "
|
|
831
|
+
"renderers.base to suppress the retry.",
|
|
742
832
|
model_name_or_path,
|
|
743
|
-
|
|
744
|
-
|
|
833
|
+
type(exc).__name__,
|
|
834
|
+
str(exc)[:160],
|
|
745
835
|
)
|
|
746
|
-
|
|
836
|
+
return AutoTokenizer.from_pretrained(model_name_or_path, **kwargs)
|
|
747
837
|
|
|
748
838
|
|
|
749
839
|
def _populate_registry():
|
|
@@ -947,12 +1037,25 @@ def build_training_sample(
|
|
|
947
1037
|
|
|
948
1038
|
Single render() call + message_indices → per-token mask.
|
|
949
1039
|
Replaces build_incremental_token_mask (O(N) renders → O(1)).
|
|
1040
|
+
|
|
1041
|
+
When the renderer populates ``rendered.sampled_mask``, the loss mask
|
|
1042
|
+
is the AND of role-based attribution and the sampled signal: only
|
|
1043
|
+
tokens the model would have produced at inference are trainable.
|
|
1044
|
+
This keeps SFT byte-aligned with the RL trajectory mask (where the
|
|
1045
|
+
prompt / completion split achieves the same effect structurally).
|
|
1046
|
+
Renderers that don't populate ``sampled_mask`` (empty list) fall
|
|
1047
|
+
back to attribution-only masking — every token attributed to a
|
|
1048
|
+
trainable role is trained on, including template-injected
|
|
1049
|
+
``<|im_start|>role\\n`` openers.
|
|
950
1050
|
"""
|
|
951
1051
|
rendered = renderer.render(messages, tools=tools)
|
|
1052
|
+
has_sampled_info = len(rendered.sampled_mask) == len(rendered.token_ids)
|
|
952
1053
|
loss_mask: list[bool] = []
|
|
953
|
-
for msg_idx in rendered.message_indices:
|
|
1054
|
+
for k, msg_idx in enumerate(rendered.message_indices):
|
|
954
1055
|
if msg_idx < 0:
|
|
955
1056
|
loss_mask.append(False)
|
|
1057
|
+
elif has_sampled_info and not rendered.sampled_mask[k]:
|
|
1058
|
+
loss_mask.append(False)
|
|
956
1059
|
else:
|
|
957
1060
|
loss_mask.append(role_to_mask(messages[msg_idx]))
|
|
958
1061
|
return rendered.token_ids, loss_mask
|
|
@@ -113,20 +113,23 @@ class DeepSeekV3Renderer:
|
|
|
113
113
|
|
|
114
114
|
tokens: list[int] = []
|
|
115
115
|
indices: list[int] = []
|
|
116
|
+
sampled: list[bool] = []
|
|
116
117
|
|
|
117
|
-
def emit_ids(ids: list[int], msg_idx: int) -> None:
|
|
118
|
+
def emit_ids(ids: list[int], msg_idx: int, *, is_sampled: bool) -> None:
|
|
118
119
|
tokens.extend(ids)
|
|
119
120
|
indices.extend([msg_idx] * len(ids))
|
|
121
|
+
sampled.extend([is_sampled] * len(ids))
|
|
120
122
|
|
|
121
|
-
def emit_special(token_id: int, msg_idx: int) -> None:
|
|
123
|
+
def emit_special(token_id: int, msg_idx: int, *, is_sampled: bool) -> None:
|
|
122
124
|
tokens.append(token_id)
|
|
123
125
|
indices.append(msg_idx)
|
|
126
|
+
sampled.append(is_sampled)
|
|
124
127
|
|
|
125
|
-
def emit_text(text: str, msg_idx: int) -> None:
|
|
126
|
-
emit_ids(self._encode(text), msg_idx)
|
|
128
|
+
def emit_text(text: str, msg_idx: int, *, is_sampled: bool) -> None:
|
|
129
|
+
emit_ids(self._encode(text), msg_idx, is_sampled=is_sampled)
|
|
127
130
|
|
|
128
131
|
# ── 1. BOS token ─────────────────────────────────────────────
|
|
129
|
-
emit_special(self._bos, -1)
|
|
132
|
+
emit_special(self._bos, -1, is_sampled=False)
|
|
130
133
|
|
|
131
134
|
# ── 2. Collect system messages at the start ───────────────────
|
|
132
135
|
# All leading system messages are concatenated with "\n\n" and emitted
|
|
@@ -148,7 +151,7 @@ class DeepSeekV3Renderer:
|
|
|
148
151
|
|
|
149
152
|
if sys_parts:
|
|
150
153
|
# Attribute the concatenated system text to the first system message (index 0).
|
|
151
|
-
emit_text("\n\n".join(sys_parts), 0)
|
|
154
|
+
emit_text("\n\n".join(sys_parts), 0, is_sampled=False)
|
|
152
155
|
|
|
153
156
|
# ── 3. Render non-system messages ─────────────────────────────
|
|
154
157
|
num_messages = len(messages)
|
|
@@ -163,8 +166,8 @@ class DeepSeekV3Renderer:
|
|
|
163
166
|
content = "".join(
|
|
164
167
|
p.get("text", "") for p in content if isinstance(p, dict)
|
|
165
168
|
)
|
|
166
|
-
emit_special(self._user_token, i)
|
|
167
|
-
emit_text(str(content), i)
|
|
169
|
+
emit_special(self._user_token, i, is_sampled=False)
|
|
170
|
+
emit_text(str(content), i, is_sampled=False)
|
|
168
171
|
|
|
169
172
|
elif role == "user":
|
|
170
173
|
content = msg.get("content") or ""
|
|
@@ -177,8 +180,8 @@ class DeepSeekV3Renderer:
|
|
|
177
180
|
else ""
|
|
178
181
|
for p in content
|
|
179
182
|
)
|
|
180
|
-
emit_special(self._user_token, i)
|
|
181
|
-
emit_text(str(content), i)
|
|
183
|
+
emit_special(self._user_token, i, is_sampled=False)
|
|
184
|
+
emit_text(str(content), i, is_sampled=False)
|
|
182
185
|
|
|
183
186
|
elif role == "assistant":
|
|
184
187
|
self._render_assistant(
|
|
@@ -202,11 +205,13 @@ class DeepSeekV3Renderer:
|
|
|
202
205
|
# Don't add <|Assistant|> after tool outputs — content flows directly.
|
|
203
206
|
last_role = messages[-1]["role"] if messages else None
|
|
204
207
|
if last_role != "tool":
|
|
205
|
-
emit_special(self._assistant_token, -1)
|
|
208
|
+
emit_special(self._assistant_token, -1, is_sampled=False)
|
|
206
209
|
if self._enable_thinking:
|
|
207
|
-
emit_text("<think>\n", -1)
|
|
210
|
+
emit_text("<think>\n", -1, is_sampled=False)
|
|
208
211
|
|
|
209
|
-
return RenderedTokens(
|
|
212
|
+
return RenderedTokens(
|
|
213
|
+
token_ids=tokens, message_indices=indices, sampled_mask=sampled
|
|
214
|
+
)
|
|
210
215
|
|
|
211
216
|
def render_ids(
|
|
212
217
|
self,
|
|
@@ -267,10 +272,20 @@ class DeepSeekV3Renderer:
|
|
|
267
272
|
|
|
268
273
|
ext: list[int] = []
|
|
269
274
|
|
|
270
|
-
|
|
275
|
+
# Bridge output is consumed as the next turn's prompt — the
|
|
276
|
+
# caller blanket-masks it via ``prompt_mask=[False]*N``, so we
|
|
277
|
+
# don't track sampled_mask here. Local helpers accept the kwarg
|
|
278
|
+
# for signature compatibility with ``_render_tool`` and ignore
|
|
279
|
+
# it; the returned ``RenderedTokens`` leaves ``sampled_mask``
|
|
280
|
+
# empty.
|
|
281
|
+
def emit_special(
|
|
282
|
+
token_id: int, _msg_idx: int = -1, *, is_sampled: bool = False
|
|
283
|
+
) -> None:
|
|
271
284
|
ext.append(token_id)
|
|
272
285
|
|
|
273
|
-
def emit_text(
|
|
286
|
+
def emit_text(
|
|
287
|
+
text: str, _msg_idx: int = -1, *, is_sampled: bool = False
|
|
288
|
+
) -> None:
|
|
274
289
|
ext.extend(self._encode(text))
|
|
275
290
|
|
|
276
291
|
for i, msg in enumerate(new_messages):
|
|
@@ -354,17 +369,24 @@ class DeepSeekV3Renderer:
|
|
|
354
369
|
|
|
355
370
|
tool_calls = msg.get("tool_calls") or []
|
|
356
371
|
|
|
372
|
+
# ``<|Assistant|>`` is template-injected scaffolding — at
|
|
373
|
+
# inference the chat template emits it as the generation prompt
|
|
374
|
+
# and the model never samples it. Marking it ``is_sampled=False``
|
|
375
|
+
# keeps the SFT loss mask aligned with what the model would
|
|
376
|
+
# actually have produced. When the previous message is a tool
|
|
377
|
+
# response, the template skips this token entirely (content
|
|
378
|
+
# flows directly out of ``<|tool▁outputs▁end|>``).
|
|
357
379
|
if not prev_is_tool:
|
|
358
|
-
emit_special(self._assistant_token, msg_idx)
|
|
380
|
+
emit_special(self._assistant_token, msg_idx, is_sampled=False)
|
|
359
381
|
|
|
360
382
|
if not tool_calls:
|
|
361
|
-
emit_text(content, msg_idx)
|
|
383
|
+
emit_text(content, msg_idx, is_sampled=True)
|
|
362
384
|
else:
|
|
363
385
|
# Emit any pre-tool-call content first.
|
|
364
|
-
emit_text(content, msg_idx)
|
|
386
|
+
emit_text(content, msg_idx, is_sampled=True)
|
|
365
387
|
|
|
366
388
|
# Tool call section.
|
|
367
|
-
emit_special(self._tool_calls_begin, msg_idx)
|
|
389
|
+
emit_special(self._tool_calls_begin, msg_idx, is_sampled=True)
|
|
368
390
|
for tc in tool_calls:
|
|
369
391
|
func = tc.get("function") or tc
|
|
370
392
|
name = func.get("name", "")
|
|
@@ -376,14 +398,17 @@ class DeepSeekV3Renderer:
|
|
|
376
398
|
)
|
|
377
399
|
# Format: <|tool▁call▁begin|>function<|tool▁sep|>{name}\n```json\n{args}\n```<|tool▁call▁end|>
|
|
378
400
|
# tool_sep is a special token; type ("function") and name+args are plain text.
|
|
379
|
-
emit_special(self._tool_call_begin, msg_idx)
|
|
380
|
-
emit_text("function", msg_idx)
|
|
381
|
-
emit_special(self._tool_sep, msg_idx)
|
|
382
|
-
emit_text(f"{name}\n```json\n{args_str}\n```", msg_idx)
|
|
383
|
-
emit_special(self._tool_call_end, msg_idx)
|
|
384
|
-
emit_special(self._tool_calls_end, msg_idx)
|
|
385
|
-
|
|
386
|
-
|
|
401
|
+
emit_special(self._tool_call_begin, msg_idx, is_sampled=True)
|
|
402
|
+
emit_text("function", msg_idx, is_sampled=True)
|
|
403
|
+
emit_special(self._tool_sep, msg_idx, is_sampled=True)
|
|
404
|
+
emit_text(f"{name}\n```json\n{args_str}\n```", msg_idx, is_sampled=True)
|
|
405
|
+
emit_special(self._tool_call_end, msg_idx, is_sampled=True)
|
|
406
|
+
emit_special(self._tool_calls_end, msg_idx, is_sampled=True)
|
|
407
|
+
|
|
408
|
+
# ``<|end▁of▁sentence|>`` is the model's stop signal — it
|
|
409
|
+
# samples this to end its turn, so it is part of the sampled
|
|
410
|
+
# stream.
|
|
411
|
+
emit_special(self._eos, msg_idx, is_sampled=True)
|
|
387
412
|
|
|
388
413
|
# ------------------------------------------------------------------
|
|
389
414
|
# Tool (tool-response) rendering
|
|
@@ -397,6 +422,9 @@ class DeepSeekV3Renderer:
|
|
|
397
422
|
emit_special,
|
|
398
423
|
emit_text,
|
|
399
424
|
) -> None:
|
|
425
|
+
# Tool messages are conversation history injected by the runtime
|
|
426
|
+
# between assistant turns — the model never samples any of these
|
|
427
|
+
# tokens, so every emission is is_sampled=False.
|
|
400
428
|
prev_is_tool = msg_idx > 0 and messages[msg_idx - 1]["role"] == "tool"
|
|
401
429
|
next_is_tool = (
|
|
402
430
|
msg_idx + 1 < len(messages) and messages[msg_idx + 1]["role"] == "tool"
|
|
@@ -407,11 +435,11 @@ class DeepSeekV3Renderer:
|
|
|
407
435
|
content = "".join(p.get("text", "") for p in content if isinstance(p, dict))
|
|
408
436
|
|
|
409
437
|
if not prev_is_tool:
|
|
410
|
-
emit_special(self._tool_outputs_begin, msg_idx)
|
|
438
|
+
emit_special(self._tool_outputs_begin, msg_idx, is_sampled=False)
|
|
411
439
|
|
|
412
|
-
emit_special(self._tool_output_begin, msg_idx)
|
|
413
|
-
emit_text(str(content), msg_idx)
|
|
414
|
-
emit_special(self._tool_output_end, msg_idx)
|
|
440
|
+
emit_special(self._tool_output_begin, msg_idx, is_sampled=False)
|
|
441
|
+
emit_text(str(content), msg_idx, is_sampled=False)
|
|
442
|
+
emit_special(self._tool_output_end, msg_idx, is_sampled=False)
|
|
415
443
|
|
|
416
444
|
if not next_is_tool:
|
|
417
|
-
emit_special(self._tool_outputs_end, msg_idx)
|
|
445
|
+
emit_special(self._tool_outputs_end, msg_idx, is_sampled=False)
|