renderers 0.1.8.dev4__tar.gz → 0.1.8.dev26__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- renderers-0.1.8.dev26/.github/workflows/publish-dev.yml +104 -0
- {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/PKG-INFO +2 -2
- {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/pyproject.toml +10 -4
- {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/renderers/__init__.py +2 -0
- {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/renderers/_version.py +2 -2
- {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/renderers/base.py +353 -22
- {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/renderers/client.py +68 -16
- {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/renderers/deepseek_v3.py +108 -38
- {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/renderers/glm45.py +167 -53
- {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/renderers/glm5.py +140 -47
- {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/renderers/gpt_oss.py +181 -13
- {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/renderers/kimi_k2.py +167 -71
- {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/renderers/kimi_k25.py +188 -65
- {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/renderers/laguna_xs2.py +132 -43
- {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/renderers/minimax_m2.py +228 -59
- {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/renderers/nemotron3.py +172 -64
- {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/renderers/qwen3.py +176 -65
- {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/renderers/qwen35.py +240 -99
- {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/renderers/qwen3_vl.py +179 -94
- {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/tests/test_client.py +71 -5
- renderers-0.1.8.dev26/tests/test_is_content.py +389 -0
- renderers-0.1.8.dev26/tests/test_kimi_k25_tool_schema.py +53 -0
- {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/tests/test_load_tokenizer_fastokens.py +44 -5
- {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/uv.lock +37 -3
- {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/.github/workflows/publish.yml +0 -0
- {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/.github/workflows/style.yml +0 -0
- {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/.github/workflows/test.yml +0 -0
- {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/.gitignore +0 -0
- {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/.pre-commit-config.yaml +0 -0
- {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/LICENSE +0 -0
- {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/README.md +0 -0
- {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/examples/README.md +0 -0
- {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/examples/sglang/multiturn_generate_sglang.py +0 -0
- {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/examples/sglang/online_multiturn_sglang.py +0 -0
- {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/examples/tinker/multiturn_generate_tinker.py +0 -0
- {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/examples/transformers/multiturn_generate_transformers.py +0 -0
- {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/examples/vllm/multiturn_generate_vllm.py +0 -0
- {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/renderers/default.py +0 -0
- {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/renderers/parsers.py +0 -0
- {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/renderers/parsing.py +0 -0
- {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/renderers/qwen36.py +0 -0
- {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/tests/conftest.py +0 -0
- {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/tests/test_bridge.py +0 -0
- {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/tests/test_build_helpers.py +0 -0
- {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/tests/test_gpt_oss_harmony_parity.py +0 -0
- {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/tests/test_incremental.py +0 -0
- {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/tests/test_load_tokenizer.py +0 -0
- {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/tests/test_message_indices.py +0 -0
- {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/tests/test_multimodal.py +0 -0
- {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/tests/test_parse_response.py +0 -0
- {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/tests/test_parse_response_robustness.py +0 -0
- {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/tests/test_parsers.py +0 -0
- {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/tests/test_preserve_thinking.py +0 -0
- {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/tests/test_qwen35_size_coverage.py +0 -0
- {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/tests/test_render_ids.py +0 -0
- {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/tests/test_roundtrip.py +0 -0
- {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/tests/test_sampled_mask.py +0 -0
- {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/tests/test_tokens_per_message.py +0 -0
- {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/tests/test_tool_arg_type_preservation.py +0 -0
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
name: Publish Dev
|
|
2
|
+
|
|
3
|
+
# Tag every commit on main as ``renderers-v<next>.dev<N>`` and publish the
|
|
4
|
+
# wheel to PyPI as a pre-release. ``<next>`` is the latest release tag with
|
|
5
|
+
# its patch bumped; ``<N>`` is the number of commits since that release so
|
|
6
|
+
# each main commit maps to a unique PEP 440 dev version.
|
|
7
|
+
#
|
|
8
|
+
# Building from the freshly-created tag means hatch-vcs resolves the version
|
|
9
|
+
# cleanly (no ``+gHASH`` local segment), which PyPI requires.
|
|
10
|
+
|
|
11
|
+
on:
|
|
12
|
+
push:
|
|
13
|
+
branches: [main]
|
|
14
|
+
|
|
15
|
+
concurrency:
|
|
16
|
+
group: publish-dev-${{ github.ref }}
|
|
17
|
+
cancel-in-progress: false
|
|
18
|
+
|
|
19
|
+
jobs:
|
|
20
|
+
tag:
|
|
21
|
+
runs-on: ubuntu-latest
|
|
22
|
+
permissions:
|
|
23
|
+
contents: write
|
|
24
|
+
outputs:
|
|
25
|
+
tag: ${{ steps.compute.outputs.tag }}
|
|
26
|
+
steps:
|
|
27
|
+
- uses: actions/checkout@v4
|
|
28
|
+
with:
|
|
29
|
+
fetch-depth: 0
|
|
30
|
+
|
|
31
|
+
- name: Compute next dev tag
  id: compute
  run: |
    set -euo pipefail
    # Newest stable release tag: list every ``renderers-v*`` tag in
    # descending version order, drop pre-releases (dev / rc / aN / bN)
    # and keep the first survivor.
    #
    # The trailing ``|| true`` is deliberate. With ``pipefail`` set,
    # ``grep`` exits 1 when nothing survives the filter, and the failed
    # assignment would abort the script under ``set -e`` *before* the
    # explicit empty check below could print its diagnostic. Letting
    # the pipeline succeed keeps that check (and its message) reachable.
    LATEST_RELEASE=$(git tag --list 'renderers-v*' --sort=-v:refname \
      | grep -Ev '(dev|rc|a[0-9]|b[0-9])' \
      | head -1 || true)
    if [ -z "$LATEST_RELEASE" ]; then
      echo "No release tag matching 'renderers-v<MAJOR.MINOR.PATCH>' found" >&2
      exit 1
    fi
    # Strip the tag prefix and split MAJOR.MINOR.PATCH.
    BASE=${LATEST_RELEASE#renderers-v}
    MAJOR=$(echo "$BASE" | cut -d. -f1)
    MINOR=$(echo "$BASE" | cut -d. -f2)
    PATCH=$(echo "$BASE" | cut -d. -f3)
    # The dev series targets the *next* patch release.
    NEXT="${MAJOR}.${MINOR}.$((PATCH + 1))"
    # Commits since the release give a unique, monotonically growing N.
    N=$(git rev-list --count "${LATEST_RELEASE}..HEAD")
    TAG="renderers-v${NEXT}.dev${N}"
    echo "tag=${TAG}" >> "$GITHUB_OUTPUT"
    echo "Computed tag: ${TAG} (base=${LATEST_RELEASE}, commits=${N})"
|
|
51
|
+
|
|
52
|
+
- name: Create and push tag
|
|
53
|
+
env:
|
|
54
|
+
TAG: ${{ steps.compute.outputs.tag }}
|
|
55
|
+
run: |
|
|
56
|
+
set -euo pipefail
|
|
57
|
+
if git ls-remote --exit-code --tags origin "refs/tags/${TAG}" >/dev/null 2>&1; then
|
|
58
|
+
echo "Tag ${TAG} already exists on origin — nothing to do" >&2
|
|
59
|
+
exit 0
|
|
60
|
+
fi
|
|
61
|
+
git config user.name 'github-actions[bot]'
|
|
62
|
+
git config user.email '41898282+github-actions[bot]@users.noreply.github.com'
|
|
63
|
+
git tag -a "$TAG" -m "Automated dev release ${TAG}"
|
|
64
|
+
git push origin "$TAG"
|
|
65
|
+
|
|
66
|
+
build:
|
|
67
|
+
needs: tag
|
|
68
|
+
runs-on: ubuntu-latest
|
|
69
|
+
permissions:
|
|
70
|
+
contents: read
|
|
71
|
+
steps:
|
|
72
|
+
- uses: actions/checkout@v4
|
|
73
|
+
with:
|
|
74
|
+
fetch-depth: 0
|
|
75
|
+
ref: refs/tags/${{ needs.tag.outputs.tag }}
|
|
76
|
+
|
|
77
|
+
- uses: astral-sh/setup-uv@v7
|
|
78
|
+
|
|
79
|
+
- name: Build renderers
|
|
80
|
+
run: uv build
|
|
81
|
+
|
|
82
|
+
- name: Upload dist artifacts
|
|
83
|
+
uses: actions/upload-artifact@v4
|
|
84
|
+
with:
|
|
85
|
+
name: dist-dev
|
|
86
|
+
path: dist/
|
|
87
|
+
if-no-files-found: error
|
|
88
|
+
retention-days: 7
|
|
89
|
+
|
|
90
|
+
publish:
|
|
91
|
+
needs: build
|
|
92
|
+
runs-on: ubuntu-latest
|
|
93
|
+
environment: pypi-prod
|
|
94
|
+
permissions:
|
|
95
|
+
id-token: write
|
|
96
|
+
steps:
|
|
97
|
+
- name: Download dist artifacts
|
|
98
|
+
uses: actions/download-artifact@v4
|
|
99
|
+
with:
|
|
100
|
+
name: dist-dev
|
|
101
|
+
path: dist/
|
|
102
|
+
|
|
103
|
+
- name: Publish to PyPI
|
|
104
|
+
uses: pypa/gh-action-pypi-publish@cef221092ed1bacb1cc03d23a2d87d1d172e277b # v1.14.0
|
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: renderers
|
|
3
|
-
Version: 0.1.8.dev4
|
|
3
|
+
Version: 0.1.8.dev26
|
|
4
4
|
Summary: Chat template renderers — deterministic message-to-token conversion for LLM training
|
|
5
5
|
License-Expression: Apache-2.0
|
|
6
6
|
License-File: LICENSE
|
|
7
7
|
Requires-Python: <3.14,>=3.10
|
|
8
|
-
Requires-Dist: fastokens>=0.
|
|
8
|
+
Requires-Dist: fastokens>=0.2.0
|
|
9
9
|
Requires-Dist: jinja2
|
|
10
10
|
Requires-Dist: numpy
|
|
11
11
|
Requires-Dist: openai-harmony>=0.0.8
|
|
@@ -26,10 +26,10 @@ dependencies = [
|
|
|
26
26
|
"openai-harmony>=0.0.8",
|
|
27
27
|
# Crusoe's Rust BPE tokenizer; ~10x faster encode vs HF's tokenizers.
|
|
28
28
|
# ``load_tokenizer`` patches it in by default for every supported model
|
|
29
|
-
# except a small denylist (DeepSeek-V3 family
|
|
30
|
-
#
|
|
31
|
-
#
|
|
32
|
-
"fastokens>=0.
|
|
29
|
+
# except a small denylist (DeepSeek-V3 family). The patch is bracketed
|
|
30
|
+
# around ``from_pretrained``, so subsequent ``AutoTokenizer`` calls
|
|
31
|
+
# outside the renderers package stay vanilla.
|
|
32
|
+
"fastokens>=0.2.0",
|
|
33
33
|
]
|
|
34
34
|
|
|
35
35
|
[tool.hatch.version]
|
|
@@ -68,6 +68,12 @@ dev = [
|
|
|
68
68
|
|
|
69
69
|
[tool.uv]
|
|
70
70
|
exclude-newer = "7 days"
|
|
71
|
+
# fastokens 0.2.0 was published on 2026-05-17 and contains the
|
|
72
|
+
# ``unpatch_transformers`` fix (crusoecloud/fastokens#32) needed for
|
|
73
|
+
# MiniMax-M2's slow→fast tokenizer conversion path. Exempting it from
|
|
74
|
+
# the project-wide 7-day cutoff lets the lockfile pick it up immediately
|
|
75
|
+
# while the rest of the dependency graph stays gated.
|
|
76
|
+
exclude-newer-package = { fastokens = false }
|
|
71
77
|
|
|
72
78
|
[tool.ty.environment]
|
|
73
79
|
python-version = "3.13"
|
|
@@ -28,6 +28,7 @@ from renderers.base import (
|
|
|
28
28
|
ToolCallParseStatus,
|
|
29
29
|
ToolSpec,
|
|
30
30
|
VideoPart,
|
|
31
|
+
attribute_text_segments,
|
|
31
32
|
build_training_sample,
|
|
32
33
|
build_trajectory_step,
|
|
33
34
|
create_renderer,
|
|
@@ -90,6 +91,7 @@ __all__ = [
|
|
|
90
91
|
"ToolSpec",
|
|
91
92
|
"VideoPart",
|
|
92
93
|
"__version__",
|
|
94
|
+
"attribute_text_segments",
|
|
93
95
|
"build_training_sample",
|
|
94
96
|
"build_trajectory_step",
|
|
95
97
|
"create_renderer",
|
|
@@ -18,7 +18,7 @@ version_tuple: tuple[int | str, ...]
|
|
|
18
18
|
commit_id: str | None
|
|
19
19
|
__commit_id__: str | None
|
|
20
20
|
|
|
21
|
-
__version__ = version = '0.1.8.dev4'
|
|
22
|
-
__version_tuple__ = version_tuple = (0, 1, 8, 'dev4')
|
|
21
|
+
__version__ = version = '0.1.8.dev26'
|
|
22
|
+
__version_tuple__ = version_tuple = (0, 1, 8, 'dev26')
|
|
23
23
|
|
|
24
24
|
__commit_id__ = commit_id = None
|
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
+
import contextlib
|
|
3
4
|
import enum
|
|
5
|
+
import io
|
|
4
6
|
import logging
|
|
5
7
|
import queue
|
|
6
8
|
import threading
|
|
@@ -169,6 +171,32 @@ class RenderedTokens:
|
|
|
169
171
|
masking. ``DefaultRenderer`` leaves it empty because the Jinja
|
|
170
172
|
template is opaque; hand-coded renderers populate it.
|
|
171
173
|
|
|
174
|
+
``is_content`` is a per-token signal generalizing the "scaffold vs
|
|
175
|
+
body" distinction across all roles: ``True`` iff the token was
|
|
176
|
+
produced from message-body bytes (caller-provided ``content`` /
|
|
177
|
+
``tool_calls`` / ``reasoning_content``, or the model's sampled
|
|
178
|
+
emission for the assistant role), ``False`` iff it is template
|
|
179
|
+
scaffolding the renderer added around message bodies — role-tag
|
|
180
|
+
openers, closers when not model-sampled, inter-turn separators,
|
|
181
|
+
tool-response wraps, the tools-header block, the generation prompt.
|
|
182
|
+
Generalises ``sampled_mask``: where ``sampled_mask`` answers "would
|
|
183
|
+
the model emit this?" (useful for assistant tokens; uniformly
|
|
184
|
+
``False`` elsewhere), ``is_content`` answers "is this from caller
|
|
185
|
+
or model data?" (meaningful on every role). By construction
|
|
186
|
+
``is_content[k] == sampled_mask[k]`` over every token attributed to
|
|
187
|
+
an assistant message; on other roles ``is_content`` carries new
|
|
188
|
+
information that ``sampled_mask`` does not.
|
|
189
|
+
|
|
190
|
+
The use case: SFT on tool response bodies while applying RL only to
|
|
191
|
+
assistant tokens. The trainer wants the model to anticipate tool
|
|
192
|
+
outputs but never to emit ``<|tool_response>`` itself (that would
|
|
193
|
+
interrupt the rollout), so the SFT loss mask is
|
|
194
|
+
``message_role == "tool" AND is_content``.
|
|
195
|
+
|
|
196
|
+
Empty ``is_content`` (``[]``) — like ``sampled_mask`` — means the
|
|
197
|
+
renderer doesn't provide the signal. ``DefaultRenderer`` leaves it
|
|
198
|
+
empty for the same reason.
|
|
199
|
+
|
|
172
200
|
``multi_modal_data`` is populated by multimodal renderers (e.g.
|
|
173
201
|
``Qwen3VLRenderer``) when image / video content parts are present;
|
|
174
202
|
text-only renderers leave it as ``None``.
|
|
@@ -177,6 +205,7 @@ class RenderedTokens:
|
|
|
177
205
|
token_ids: list[int] = field(default_factory=list)
|
|
178
206
|
message_indices: list[int] = field(default_factory=list)
|
|
179
207
|
sampled_mask: list[bool] = field(default_factory=list)
|
|
208
|
+
is_content: list[bool] = field(default_factory=list)
|
|
180
209
|
message_roles: list[str] = field(default_factory=list)
|
|
181
210
|
multi_modal_data: "MultiModalData | None" = None
|
|
182
211
|
|
|
@@ -333,6 +362,94 @@ class RenderedTokens:
|
|
|
333
362
|
out[role] = out.get(role, 0) + n
|
|
334
363
|
return out
|
|
335
364
|
|
|
365
|
+
def content_token_spans_by_role(self) -> dict[str, list[tuple[int, int]]]:
    """Return per-role runs of contiguous body tokens (``is_content=True``).

    Maps each role appearing in :attr:`message_roles` to a list of
    half-open ``[start, end)`` slices into :attr:`token_ids`. Every
    token inside a slice satisfies ``is_content=True`` AND is attributed
    (via :attr:`message_indices`) to one single message of that role.
    Runs never cross message boundaries: when two messages' bodies abut
    on the token axis they still yield separate runs. A role whose
    messages contribute no body tokens maps to an empty list.

    Attribution is decided per token, using the same predicate as
    :meth:`content_mask_for_roles`, so the two helpers always agree:
    tokens whose message index is negative or out of range for
    :attr:`message_roles` never appear in any run.

    Returns an empty dict when :attr:`is_content` or
    :attr:`message_roles` is empty (the renderer did not populate the
    signal), or when the per-token lists are not the same length as
    :attr:`token_ids`.

    Example — build an SFT mask over tool-response bodies::

        spans = rendered.content_token_spans_by_role()
        tool_sft_mask = [False] * len(rendered.token_ids)
        for s, e in spans.get("tool", []):
            for k in range(s, e):
                tool_sft_mask[k] = True
    """
    out: dict[str, list[tuple[int, int]]] = {}
    if not self.is_content or not self.message_roles:
        return out
    n = len(self.token_ids)
    if len(self.is_content) != n or len(self.message_indices) != n:
        return out

    n_messages = len(self.message_roles)
    # One pass over the token axis. ``run_owner`` is the message index that
    # owns the currently open run, or -1 while no run is open (scaffold
    # token, unattributed token, or out-of-range index).
    runs_by_message: dict[int, list[tuple[int, int]]] = {}
    run_start: int | None = None
    run_owner = -1
    for k, (msg_idx, is_body) in enumerate(
        zip(self.message_indices, self.is_content)
    ):
        owner = msg_idx if (is_body and 0 <= msg_idx < n_messages) else -1
        if owner == run_owner:
            continue
        # Ownership changed: close the open run (if any) at this token.
        if run_start is not None:
            runs_by_message.setdefault(run_owner, []).append((run_start, k))
        run_start = k if owner >= 0 else None
        run_owner = owner
    if run_start is not None:
        runs_by_message.setdefault(run_owner, []).append((run_start, n))

    # Emit in message order so every role's list is ordered by message,
    # and so every role in ``message_roles`` gets a key even when empty.
    for msg_idx, role in enumerate(self.message_roles):
        out.setdefault(role, []).extend(runs_by_message.get(msg_idx, ()))
    return out
|
|
418
|
+
|
|
419
|
+
def content_mask_for_roles(self, roles: "set[str] | frozenset[str]") -> list[bool]:
    """Build a per-token body mask restricted to the given roles.

    Entry ``k`` of the result is ``True`` iff token ``k`` is flagged in
    :attr:`is_content` and is attributed, through
    :attr:`message_indices`, to a message whose role is in ``roles``.
    Tokens with a negative message index, or an index past the end of
    :attr:`message_roles`, are always ``False``.

    The result always has the same length as :attr:`token_ids`. When
    :attr:`is_content` or :attr:`message_roles` is empty, or when the
    per-token lists are not aligned with :attr:`token_ids`, the result
    is all ``False`` — so callers can AND it with their own masks
    without checking lengths first.

    Example: ``rendered.content_mask_for_roles({"tool"})`` selects
    tool-message body tokens only, leaving the scaffolding around them
    unselected.
    """
    total = len(self.token_ids)
    # Signal not populated by the renderer -> nothing is selectable.
    if not (self.is_content and self.message_roles):
        return [False] * total
    # Misaligned per-token lists cannot be indexed safely.
    if not (len(self.is_content) == total and len(self.message_indices) == total):
        return [False] * total

    known_messages = len(self.message_roles)
    return [
        bool(
            0 <= owner < known_messages
            and self.message_roles[owner] in roles
            and body
        )
        for owner, body in zip(self.message_indices, self.is_content)
    ]
|
|
452
|
+
|
|
336
453
|
|
|
337
454
|
class ToolCallParseStatus(str, enum.Enum):
|
|
338
455
|
"""Per-attempt outcome of parsing a single ``<tool_call>`` block.
|
|
@@ -530,6 +647,15 @@ class Renderer(Protocol):
|
|
|
530
647
|
caller needs that distinction for the prior portion, they
|
|
531
648
|
have it directly: every token in ``prev_completion_ids`` was
|
|
532
649
|
sampled; every token in ``prev_prompt_ids`` was not.
|
|
650
|
+
- ``is_content`` mirrors ``sampled_mask``'s scheme for the
|
|
651
|
+
prior portion (uniformly ``False`` — body-vs-wrap
|
|
652
|
+
attribution can't be recovered from raw token ids), and on
|
|
653
|
+
the bridge-added portion the renderer populates it the same
|
|
654
|
+
way as in :meth:`render`: ``True`` over the body bytes of
|
|
655
|
+
each new message, ``False`` over the surrounding scaffold.
|
|
656
|
+
Consumers walk the trajectory and read each step's own
|
|
657
|
+
``is_content`` for full-conversation body masks; the bridge
|
|
658
|
+
output covers only the *new* tokens this turn adds.
|
|
533
659
|
|
|
534
660
|
Text-only renderers return :class:`RenderedTokens` with
|
|
535
661
|
``multi_modal_data=None``. Multimodal renderers (see
|
|
@@ -911,31 +1037,24 @@ TRUSTED_REVISIONS: dict[str, str] = {
|
|
|
911
1037
|
# Models for which ``fastokens`` is known to diverge from vanilla
|
|
912
1038
|
# ``transformers.AutoTokenizer`` and therefore must NOT be patched.
|
|
913
1039
|
# Empirical audit ran each entry of ``MODEL_RENDERER_MAP`` through both
|
|
914
|
-
# backends
|
|
915
|
-
#
|
|
916
|
-
# yet implemented) or are kept defensively pending an upstream fastokens
|
|
917
|
-
# fix (MiniMax-M2 family — see per-entry comments).
|
|
1040
|
+
# backends. The entries below fail to load under fastokens (DeepSeek-V3
|
|
1041
|
+
# family — Metaspace pretokenizer not yet implemented).
|
|
918
1042
|
FASTOKENS_INCOMPATIBLE: frozenset[str] = frozenset(
|
|
919
1043
|
{
|
|
920
|
-
# fastokens
|
|
1044
|
+
# fastokens: ``ValueError: pre-tokenizer error: unsupported
|
|
921
1045
|
# pre-tokenizer type: Metaspace`` — DeepSeek's tokenizer uses
|
|
922
1046
|
# SentencePiece-style Metaspace pretokenization which fastokens
|
|
923
1047
|
# doesn't yet implement.
|
|
924
1048
|
"deepseek-ai/DeepSeek-V3",
|
|
925
1049
|
"deepseek-ai/DeepSeek-V3-Base",
|
|
926
|
-
# MiniMax: kept defensive pending upstream fastokens fix
|
|
927
|
-
# https://github.com/crusoecloud/fastokens/pull/32 — that PR
|
|
928
|
-
# removes a stray attribute leaked by ``unpatch_transformers``
|
|
929
|
-
# which steers MiniMax (declared ``tokenizer_class =
|
|
930
|
-
# 'GPT2Tokenizer'`` → slow→fast conversion path) down a different
|
|
931
|
-
# load path on subsequent vanilla loads. Once the upstream fix
|
|
932
|
-
# is released, these two entries can be dropped after re-audit.
|
|
933
|
-
"MiniMaxAI/MiniMax-M2",
|
|
934
|
-
"MiniMaxAI/MiniMax-M2.5",
|
|
935
1050
|
}
|
|
936
1051
|
)
|
|
937
1052
|
|
|
938
1053
|
|
|
1054
|
+
_FASTOKENS_PATCH_LOCK = threading.Lock()
|
|
1055
|
+
_FASTOKENS_ANNOUNCED = False
|
|
1056
|
+
|
|
1057
|
+
|
|
939
1058
|
def _patched_load(model_name_or_path: str, **kwargs):
|
|
940
1059
|
"""Run ``AutoTokenizer.from_pretrained`` with fastokens patched in
|
|
941
1060
|
process-locally — patch around the load, unpatch right after.
|
|
@@ -945,15 +1064,39 @@ def _patched_load(model_name_or_path: str, **kwargs):
|
|
|
945
1064
|
fastokens for ``encode``/``decode`` while subsequent
|
|
946
1065
|
``AutoTokenizer.from_pretrained`` calls (outside our control) go
|
|
947
1066
|
back to vanilla. This keeps the global side effect minimal.
|
|
1067
|
+
|
|
1068
|
+
fastokens itself prints ``[fastokens] patch_transformers: ...`` to
|
|
1069
|
+
stdout on every patch/unpatch call. Building a pool of size N would
|
|
1070
|
+
therefore emit ~N lines (more under thread contention, where some
|
|
1071
|
+
threads see ``already patched``). We swallow those prints under a
|
|
1072
|
+
lock — ``contextlib.redirect_stdout`` swaps ``sys.stdout``
|
|
1073
|
+
process-wide, so the lock keeps unrelated stdout writes from other
|
|
1074
|
+
threads from disappearing into our buffer. The patch/unpatch calls
|
|
1075
|
+
are cheap; only the brief patch+unpatch is serialized, the actual
|
|
1076
|
+
``from_pretrained`` still runs concurrently across pool slots. A
|
|
1077
|
+
single ``logger.info`` is emitted on the first patch so the fast
|
|
1078
|
+
path is still discoverable in logs.
|
|
948
1079
|
"""
|
|
949
1080
|
import fastokens
|
|
950
1081
|
from transformers import AutoTokenizer
|
|
951
1082
|
|
|
952
|
-
|
|
1083
|
+
global _FASTOKENS_ANNOUNCED
|
|
1084
|
+
|
|
1085
|
+
with _FASTOKENS_PATCH_LOCK:
|
|
1086
|
+
with contextlib.redirect_stdout(io.StringIO()):
|
|
1087
|
+
fastokens.patch_transformers()
|
|
1088
|
+
if not _FASTOKENS_ANNOUNCED:
|
|
1089
|
+
logger.info(
|
|
1090
|
+
"fastokens enabled — tokenizers load through the Rust BPE "
|
|
1091
|
+
"fast path (~10x encode speedup)."
|
|
1092
|
+
)
|
|
1093
|
+
_FASTOKENS_ANNOUNCED = True
|
|
953
1094
|
try:
|
|
954
1095
|
return AutoTokenizer.from_pretrained(model_name_or_path, **kwargs)
|
|
955
1096
|
finally:
|
|
956
|
-
|
|
1097
|
+
with _FASTOKENS_PATCH_LOCK:
|
|
1098
|
+
with contextlib.redirect_stdout(io.StringIO()):
|
|
1099
|
+
fastokens.unpatch_transformers()
|
|
957
1100
|
|
|
958
1101
|
|
|
959
1102
|
def load_tokenizer(
|
|
@@ -975,10 +1118,10 @@ def load_tokenizer(
|
|
|
975
1118
|
immediately after, so global ``AutoTokenizer.from_pretrained`` calls
|
|
976
1119
|
elsewhere in the user's process are not affected.
|
|
977
1120
|
|
|
978
|
-
Models in ``FASTOKENS_INCOMPATIBLE`` (DeepSeek-V3 family
|
|
979
|
-
|
|
980
|
-
|
|
981
|
-
|
|
1121
|
+
Models in ``FASTOKENS_INCOMPATIBLE`` (DeepSeek-V3 family) skip the
|
|
1122
|
+
patch — fastokens currently fails to load them. Pass
|
|
1123
|
+
``use_fastokens=False`` to force the vanilla backend for any other
|
|
1124
|
+
model.
|
|
982
1125
|
|
|
983
1126
|
Unknown / fine-tuned model paths fall through to
|
|
984
1127
|
``trust_remote_code=False`` and the patched-load fast path. If
|
|
@@ -1208,6 +1351,7 @@ def build_training_sample(
|
|
|
1208
1351
|
*,
|
|
1209
1352
|
role_to_mask: Callable[[Message], bool],
|
|
1210
1353
|
tools: list[ToolSpec] | None = None,
|
|
1354
|
+
content_sft_roles: "set[str] | frozenset[str] | None" = None,
|
|
1211
1355
|
) -> tuple[list[int], list[bool]]:
|
|
1212
1356
|
"""Build (token_ids, loss_mask) for supervised training.
|
|
1213
1357
|
|
|
@@ -1223,17 +1367,53 @@ def build_training_sample(
|
|
|
1223
1367
|
back to attribution-only masking — every token attributed to a
|
|
1224
1368
|
trainable role is trained on, including template-injected
|
|
1225
1369
|
``<|im_start|>role\\n`` openers.
|
|
1370
|
+
|
|
1371
|
+
``content_sft_roles`` opts in additional roles for "body-only"
|
|
1372
|
+
supervision: for every message whose role is in this set, tokens
|
|
1373
|
+
with ``is_content=True`` are marked trainable even though the
|
|
1374
|
+
``sampled_mask`` gate excludes them (the model never samples
|
|
1375
|
+
tool / user / system tokens). Template scaffolding around those
|
|
1376
|
+
messages — ``<|im_start|>role\\n`` openers, ``<|im_end|>``
|
|
1377
|
+
closers, ``<|tool_response>`` wraps, inter-turn ``\\n`` — stays
|
|
1378
|
+
masked out, so the model learns to anticipate the body text
|
|
1379
|
+
without producing the surrounding special tokens (which would
|
|
1380
|
+
interrupt a real rollout). The canonical use case is RL on
|
|
1381
|
+
assistant tokens (``role_to_mask=lambda m: m["role"] ==
|
|
1382
|
+
"assistant"``) plus SFT on tool response bodies
|
|
1383
|
+
(``content_sft_roles={"tool"}``).
|
|
1384
|
+
|
|
1385
|
+
Requires the renderer to populate ``is_content`` for the body-only
|
|
1386
|
+
path to fire. Renderers that leave it empty (``DefaultRenderer``,
|
|
1387
|
+
or hand-coded renderers that haven't been wired up yet) ignore
|
|
1388
|
+
``content_sft_roles`` silently — falling back to the original
|
|
1389
|
+
``role_to_mask`` + ``sampled_mask`` behaviour.
|
|
1226
1390
|
"""
|
|
1227
1391
|
rendered = renderer.render(messages, tools=tools)
|
|
1228
1392
|
has_sampled_info = len(rendered.sampled_mask) == len(rendered.token_ids)
|
|
1393
|
+
has_content_info = len(rendered.is_content) == len(rendered.token_ids)
|
|
1394
|
+
body_roles: "frozenset[str]"
|
|
1395
|
+
if content_sft_roles and has_content_info:
|
|
1396
|
+
body_roles = frozenset(content_sft_roles)
|
|
1397
|
+
else:
|
|
1398
|
+
body_roles = frozenset()
|
|
1399
|
+
|
|
1229
1400
|
loss_mask: list[bool] = []
|
|
1230
1401
|
for k, msg_idx in enumerate(rendered.message_indices):
|
|
1231
1402
|
if msg_idx < 0:
|
|
1232
1403
|
loss_mask.append(False)
|
|
1233
|
-
|
|
1404
|
+
continue
|
|
1405
|
+
msg = messages[msg_idx]
|
|
1406
|
+
# Body-only path for opt-in roles. Fires only on tokens whose
|
|
1407
|
+
# is_content bit is set; never adds the scaffolding around the
|
|
1408
|
+
# message, so the model isn't supervised on emitting the role
|
|
1409
|
+
# tags / wraps that would derail a rollout.
|
|
1410
|
+
if body_roles and msg.get("role") in body_roles:
|
|
1411
|
+
loss_mask.append(rendered.is_content[k])
|
|
1412
|
+
continue
|
|
1413
|
+
if has_sampled_info and not rendered.sampled_mask[k]:
|
|
1234
1414
|
loss_mask.append(False)
|
|
1235
1415
|
else:
|
|
1236
|
-
loss_mask.append(role_to_mask(
|
|
1416
|
+
loss_mask.append(role_to_mask(msg))
|
|
1237
1417
|
return rendered.token_ids, loss_mask
|
|
1238
1418
|
|
|
1239
1419
|
|
|
@@ -1280,6 +1460,157 @@ def trim_to_turn_close(
|
|
|
1280
1460
|
return previous_ids
|
|
1281
1461
|
|
|
1282
1462
|
|
|
1463
|
+
# Per-model offset-aware tokenizer cache. ``attribute_text_segments``
|
|
1464
|
+
# uses the fast HuggingFace tokenizer's ``offset_mapping`` to attribute
|
|
1465
|
+
# each token to its source text segment under one BPE pass. Fastokens
|
|
1466
|
+
# (the Rust BPE we patch in by default for ~10x faster encode) does not
|
|
1467
|
+
# track character offsets — the patched tokenizer's
|
|
1468
|
+
# ``return_offsets_mapping=True`` raises ``NotImplementedError``. So we
|
|
1469
|
+
# keep a parallel vanilla tokenizer per model purely for offset queries.
|
|
1470
|
+
# Memory cost is one extra tokenizer per *unique* model name across all
|
|
1471
|
+
# pools / renderers (the cache is process-global), independent of pool
|
|
1472
|
+
# size.
|
|
1473
|
+
_offset_tokenizers: dict[str, Any] = {}
|
|
1474
|
+
_offset_tokenizers_lock = threading.Lock()
|
|
1475
|
+
|
|
1476
|
+
|
|
1477
|
+
def _get_offset_tokenizer(tokenizer):
|
|
1478
|
+
"""Return a tokenizer that supports ``return_offsets_mapping=True``.
|
|
1479
|
+
|
|
1480
|
+
If ``tokenizer`` itself supports offsets, returns it unchanged.
|
|
1481
|
+
Otherwise loads a vanilla (non-fastokens) tokenizer from
|
|
1482
|
+
``tokenizer.name_or_path`` and caches it. Raises if the tokenizer
|
|
1483
|
+
has no usable ``name_or_path`` — hand-coded renderers always pass
|
|
1484
|
+
a tokenizer loaded via ``load_tokenizer`` which does set it.
|
|
1485
|
+
"""
|
|
1486
|
+
# Cheap probe: does this tokenizer already provide offsets?
|
|
1487
|
+
try:
|
|
1488
|
+
tokenizer("a", add_special_tokens=False, return_offsets_mapping=True)
|
|
1489
|
+
return tokenizer
|
|
1490
|
+
except (NotImplementedError, ValueError, TypeError):
|
|
1491
|
+
pass
|
|
1492
|
+
|
|
1493
|
+
name_or_path = getattr(tokenizer, "name_or_path", "")
|
|
1494
|
+
if not name_or_path:
|
|
1495
|
+
raise RuntimeError(
|
|
1496
|
+
"Cannot construct an offset-aware tokenizer: the supplied "
|
|
1497
|
+
"tokenizer has no ``name_or_path`` to fall back on. Pass a "
|
|
1498
|
+
"tokenizer loaded via ``renderers.base.load_tokenizer``."
|
|
1499
|
+
)
|
|
1500
|
+
|
|
1501
|
+
with _offset_tokenizers_lock:
|
|
1502
|
+
cached = _offset_tokenizers.get(name_or_path)
|
|
1503
|
+
if cached is not None:
|
|
1504
|
+
return cached
|
|
1505
|
+
from transformers import AutoTokenizer
|
|
1506
|
+
|
|
1507
|
+
kwargs: dict[str, Any] = {}
|
|
1508
|
+
revision = TRUSTED_REVISIONS.get(name_or_path)
|
|
1509
|
+
if revision is not None:
|
|
1510
|
+
kwargs = {"trust_remote_code": True, "revision": revision}
|
|
1511
|
+
else:
|
|
1512
|
+
kwargs = {"trust_remote_code": False}
|
|
1513
|
+
# Explicitly vanilla — we want HF's Rust tokenizer with offset
|
|
1514
|
+
# tracking, not the fastokens shim. ``load_tokenizer`` would
|
|
1515
|
+
# patch fastokens in by default; calling
|
|
1516
|
+
# ``AutoTokenizer.from_pretrained`` directly here keeps the
|
|
1517
|
+
# fastokens patch out of this code path entirely.
|
|
1518
|
+
offset_tok = AutoTokenizer.from_pretrained(name_or_path, **kwargs)
|
|
1519
|
+
if not getattr(offset_tok, "is_fast", False):
|
|
1520
|
+
raise RuntimeError(
|
|
1521
|
+
f"Vanilla tokenizer for {name_or_path!r} is not a fast "
|
|
1522
|
+
"tokenizer; offset_mapping is unavailable. Hand-coded "
|
|
1523
|
+
"renderers require a fast tokenizer for body/scaffold "
|
|
1524
|
+
"attribution."
|
|
1525
|
+
)
|
|
1526
|
+
_offset_tokenizers[name_or_path] = offset_tok
|
|
1527
|
+
return offset_tok
|
|
1528
|
+
|
|
1529
|
+
|
|
1530
|
+
def attribute_text_segments(
    tokenizer,
    segments: "list[tuple[str, bool]]",
) -> "list[tuple[int, bool]]":
    """Tokenize concatenated segments in a single pass and return
    ``(token_id, is_content)`` pairs.

    ``segments`` is a list of ``(text, is_content)`` chunks the renderer
    wants to emit contiguously — for example ``[("user\\n", False),
    (content, True)]`` for a user message. The texts are joined *before*
    encoding so that BPE merges across the wrap/body boundary are
    preserved; each resulting token is then attributed back to a source
    segment via the tokenizer's ``offset_mapping``.

    Attribution rules:

    * A token belongs to the segment whose half-open character span
      ``[seg_start, seg_end)`` contains the token's start offset
      (``offset_mapping[k][0]``). A token starting exactly on a segment
      boundary therefore belongs to the segment that *starts* there.
    * Empty segments contribute no characters and can never own a token.
    * Zero-length tokens are attributed by their start offset like any
      other token.
    * A token whose start offset lies outside every span (not expected
      with ``add_special_tokens=False``; kept as a defensive fallback)
      takes the bit of the last *non-empty* segment.

    The tokenizer actually used for encoding is whatever
    :func:`_get_offset_tokenizer` returns for ``tokenizer``; that helper
    falls back to loading and caching an offset-capable tokenizer when
    the supplied one cannot produce offsets.

    Args:
        tokenizer: Tokenizer handed to :func:`_get_offset_tokenizer`.
        segments: Ordered ``(text, is_content)`` chunks to encode as one
            contiguous string.

    Returns:
        One ``(token_id, is_content)`` pair per produced token, in token
        order. Empty ``segments`` or an empty joined text yields ``[]``
        without touching the tokenizer.

    Raises:
        RuntimeError: Propagated from :func:`_get_offset_tokenizer` when
            no offset-capable fast tokenizer can be obtained.
    """
    if not segments:
        return []
    full_text = "".join(text for text, _ in segments)
    if not full_text:
        return []

    offset_tokenizer = _get_offset_tokenizer(tokenizer)
    encoding = offset_tokenizer(
        full_text,
        add_special_tokens=False,
        return_offsets_mapping=True,
    )
    token_ids = list(encoding["input_ids"])
    offsets = list(encoding["offset_mapping"])

    # Half-open char spans [seg_start, seg_end) of the *non-empty*
    # segments only. Empty segments hold no characters, so dropping them
    # changes nothing for the containment test below and guarantees the
    # fallback bit never comes from a segment that emitted nothing.
    spans: list[tuple[int, int, bool]] = []
    pos = 0
    for text, is_content in segments:
        seg_end = pos + len(text)
        if seg_end > pos:
            spans.append((pos, seg_end, is_content))
        pos = seg_end

    # ``full_text`` is non-empty here, so at least one span exists.
    fallback_is_content = spans[-1][2]

    out: list[tuple[int, bool]] = []
    for tok_id, (start, _end) in zip(token_ids, offsets):
        # Spans are contiguous and ordered; a linear scan is fine because
        # renderers typically pass only 2-3 segments.
        is_content = fallback_is_content
        for seg_start, seg_end, seg_is_content in spans:
            if seg_start <= start < seg_end:
                is_content = seg_is_content
                break
        out.append((tok_id, is_content))
    return out
|
|
1612
|
+
|
|
1613
|
+
|
|
1283
1614
|
def reject_assistant_in_extension(new_messages: list[Message]) -> bool:
|
|
1284
1615
|
"""Return True if any message in ``new_messages`` is an assistant turn.
|
|
1285
1616
|
|