renderers 0.1.8.dev4__tar.gz → 0.1.8.dev26__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. renderers-0.1.8.dev26/.github/workflows/publish-dev.yml +104 -0
  2. {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/PKG-INFO +2 -2
  3. {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/pyproject.toml +10 -4
  4. {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/renderers/__init__.py +2 -0
  5. {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/renderers/_version.py +2 -2
  6. {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/renderers/base.py +353 -22
  7. {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/renderers/client.py +68 -16
  8. {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/renderers/deepseek_v3.py +108 -38
  9. {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/renderers/glm45.py +167 -53
  10. {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/renderers/glm5.py +140 -47
  11. {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/renderers/gpt_oss.py +181 -13
  12. {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/renderers/kimi_k2.py +167 -71
  13. {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/renderers/kimi_k25.py +188 -65
  14. {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/renderers/laguna_xs2.py +132 -43
  15. {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/renderers/minimax_m2.py +228 -59
  16. {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/renderers/nemotron3.py +172 -64
  17. {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/renderers/qwen3.py +176 -65
  18. {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/renderers/qwen35.py +240 -99
  19. {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/renderers/qwen3_vl.py +179 -94
  20. {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/tests/test_client.py +71 -5
  21. renderers-0.1.8.dev26/tests/test_is_content.py +389 -0
  22. renderers-0.1.8.dev26/tests/test_kimi_k25_tool_schema.py +53 -0
  23. {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/tests/test_load_tokenizer_fastokens.py +44 -5
  24. {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/uv.lock +37 -3
  25. {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/.github/workflows/publish.yml +0 -0
  26. {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/.github/workflows/style.yml +0 -0
  27. {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/.github/workflows/test.yml +0 -0
  28. {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/.gitignore +0 -0
  29. {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/.pre-commit-config.yaml +0 -0
  30. {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/LICENSE +0 -0
  31. {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/README.md +0 -0
  32. {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/examples/README.md +0 -0
  33. {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/examples/sglang/multiturn_generate_sglang.py +0 -0
  34. {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/examples/sglang/online_multiturn_sglang.py +0 -0
  35. {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/examples/tinker/multiturn_generate_tinker.py +0 -0
  36. {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/examples/transformers/multiturn_generate_transformers.py +0 -0
  37. {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/examples/vllm/multiturn_generate_vllm.py +0 -0
  38. {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/renderers/default.py +0 -0
  39. {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/renderers/parsers.py +0 -0
  40. {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/renderers/parsing.py +0 -0
  41. {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/renderers/qwen36.py +0 -0
  42. {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/tests/conftest.py +0 -0
  43. {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/tests/test_bridge.py +0 -0
  44. {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/tests/test_build_helpers.py +0 -0
  45. {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/tests/test_gpt_oss_harmony_parity.py +0 -0
  46. {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/tests/test_incremental.py +0 -0
  47. {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/tests/test_load_tokenizer.py +0 -0
  48. {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/tests/test_message_indices.py +0 -0
  49. {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/tests/test_multimodal.py +0 -0
  50. {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/tests/test_parse_response.py +0 -0
  51. {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/tests/test_parse_response_robustness.py +0 -0
  52. {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/tests/test_parsers.py +0 -0
  53. {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/tests/test_preserve_thinking.py +0 -0
  54. {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/tests/test_qwen35_size_coverage.py +0 -0
  55. {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/tests/test_render_ids.py +0 -0
  56. {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/tests/test_roundtrip.py +0 -0
  57. {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/tests/test_sampled_mask.py +0 -0
  58. {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/tests/test_tokens_per_message.py +0 -0
  59. {renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/tests/test_tool_arg_type_preservation.py +0 -0
@@ -0,0 +1,104 @@
1
+ name: Publish Dev
2
+
3
+ # Tag every commit on main as ``renderers-v<next>.dev<N>`` and publish the
4
+ # wheel to PyPI as a pre-release. ``<next>`` is the latest release tag with
5
+ # its patch bumped; ``<N>`` is the number of commits since that release so
6
+ # each main commit maps to a unique PEP 440 dev version.
7
+ #
8
+ # Building from the freshly-created tag means hatch-vcs resolves the version
9
+ # cleanly (no ``+gHASH`` local segment), which PyPI requires.
10
+
11
+ on:
12
+ push:
13
+ branches: [main]
14
+
15
+ concurrency:
16
+ group: publish-dev-${{ github.ref }}
17
+ cancel-in-progress: false
18
+
19
+ jobs:
20
+ tag:
21
+ runs-on: ubuntu-latest
22
+ permissions:
23
+ contents: write
24
+ outputs:
25
+ tag: ${{ steps.compute.outputs.tag }}
26
+ steps:
27
+ - uses: actions/checkout@v4
28
+ with:
29
+ fetch-depth: 0
30
+
31
+ - name: Compute next dev tag
32
+ id: compute
33
+ run: |
34
+ set -euo pipefail
35
+ LATEST_RELEASE=$(git tag --list 'renderers-v*' --sort=-v:refname \
36
+ | grep -Ev '(dev|rc|a[0-9]|b[0-9])' \
37
+ | head -1)
38
+ if [ -z "$LATEST_RELEASE" ]; then
39
+ echo "No release tag matching 'renderers-v<MAJOR.MINOR.PATCH>' found" >&2
40
+ exit 1
41
+ fi
42
+ BASE=${LATEST_RELEASE#renderers-v}
43
+ MAJOR=$(echo "$BASE" | cut -d. -f1)
44
+ MINOR=$(echo "$BASE" | cut -d. -f2)
45
+ PATCH=$(echo "$BASE" | cut -d. -f3)
46
+ NEXT="${MAJOR}.${MINOR}.$((PATCH + 1))"
47
+ N=$(git rev-list --count "${LATEST_RELEASE}..HEAD")
48
+ TAG="renderers-v${NEXT}.dev${N}"
49
+ echo "tag=${TAG}" >> "$GITHUB_OUTPUT"
50
+ echo "Computed tag: ${TAG} (base=${LATEST_RELEASE}, commits=${N})"
51
+
52
+ - name: Create and push tag
53
+ env:
54
+ TAG: ${{ steps.compute.outputs.tag }}
55
+ run: |
56
+ set -euo pipefail
57
+ if git ls-remote --exit-code --tags origin "refs/tags/${TAG}" >/dev/null 2>&1; then
58
+ echo "Tag ${TAG} already exists on origin — nothing to do" >&2
59
+ exit 0
60
+ fi
61
+ git config user.name 'github-actions[bot]'
62
+ git config user.email '41898282+github-actions[bot]@users.noreply.github.com'
63
+ git tag -a "$TAG" -m "Automated dev release ${TAG}"
64
+ git push origin "$TAG"
65
+
66
+ build:
67
+ needs: tag
68
+ runs-on: ubuntu-latest
69
+ permissions:
70
+ contents: read
71
+ steps:
72
+ - uses: actions/checkout@v4
73
+ with:
74
+ fetch-depth: 0
75
+ ref: refs/tags/${{ needs.tag.outputs.tag }}
76
+
77
+ - uses: astral-sh/setup-uv@v7
78
+
79
+ - name: Build renderers
80
+ run: uv build
81
+
82
+ - name: Upload dist artifacts
83
+ uses: actions/upload-artifact@v4
84
+ with:
85
+ name: dist-dev
86
+ path: dist/
87
+ if-no-files-found: error
88
+ retention-days: 7
89
+
90
+ publish:
91
+ needs: build
92
+ runs-on: ubuntu-latest
93
+ environment: pypi-prod
94
+ permissions:
95
+ id-token: write
96
+ steps:
97
+ - name: Download dist artifacts
98
+ uses: actions/download-artifact@v4
99
+ with:
100
+ name: dist-dev
101
+ path: dist/
102
+
103
+ - name: Publish to PyPI
104
+ uses: pypa/gh-action-pypi-publish@cef221092ed1bacb1cc03d23a2d87d1d172e277b # v1.14.0
@@ -1,11 +1,11 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: renderers
3
- Version: 0.1.8.dev4
3
+ Version: 0.1.8.dev26
4
4
  Summary: Chat template renderers — deterministic message-to-token conversion for LLM training
5
5
  License-Expression: Apache-2.0
6
6
  License-File: LICENSE
7
7
  Requires-Python: <3.14,>=3.10
8
- Requires-Dist: fastokens>=0.1.1
8
+ Requires-Dist: fastokens>=0.2.0
9
9
  Requires-Dist: jinja2
10
10
  Requires-Dist: numpy
11
11
  Requires-Dist: openai-harmony>=0.0.8
@@ -26,10 +26,10 @@ dependencies = [
26
26
  "openai-harmony>=0.0.8",
27
27
  # Crusoe's Rust BPE tokenizer; ~10x faster encode vs HF's tokenizers.
28
28
  # ``load_tokenizer`` patches it in by default for every supported model
29
- # except a small denylist (DeepSeek-V3 family, MiniMax-M2 family). The
30
- # patch is bracketed around ``from_pretrained``, so subsequent
31
- # ``AutoTokenizer`` calls outside the renderers package stay vanilla.
32
- "fastokens>=0.1.1",
29
+ # except a small denylist (DeepSeek-V3 family). The patch is bracketed
30
+ # around ``from_pretrained``, so subsequent ``AutoTokenizer`` calls
31
+ # outside the renderers package stay vanilla.
32
+ "fastokens>=0.2.0",
33
33
  ]
34
34
 
35
35
  [tool.hatch.version]
@@ -68,6 +68,12 @@ dev = [
68
68
 
69
69
  [tool.uv]
70
70
  exclude-newer = "7 days"
71
+ # fastokens 0.2.0 was published on 2026-05-17 and contains the
72
+ # ``unpatch_transformers`` fix (crusoecloud/fastokens#32) needed for
73
+ # MiniMax-M2's slow→fast tokenizer conversion path. Exempting it from
74
+ # the project-wide 7-day cutoff lets the lockfile pick it up immediately
75
+ # while the rest of the dependency graph stays gated.
76
+ exclude-newer-package = { fastokens = false }
71
77
 
72
78
  [tool.ty.environment]
73
79
  python-version = "3.13"
@@ -28,6 +28,7 @@ from renderers.base import (
28
28
  ToolCallParseStatus,
29
29
  ToolSpec,
30
30
  VideoPart,
31
+ attribute_text_segments,
31
32
  build_training_sample,
32
33
  build_trajectory_step,
33
34
  create_renderer,
@@ -90,6 +91,7 @@ __all__ = [
90
91
  "ToolSpec",
91
92
  "VideoPart",
92
93
  "__version__",
94
+ "attribute_text_segments",
93
95
  "build_training_sample",
94
96
  "build_trajectory_step",
95
97
  "create_renderer",
@@ -18,7 +18,7 @@ version_tuple: tuple[int | str, ...]
18
18
  commit_id: str | None
19
19
  __commit_id__: str | None
20
20
 
21
- __version__ = version = '0.1.8.dev4'
22
- __version_tuple__ = version_tuple = (0, 1, 8, 'dev4')
21
+ __version__ = version = '0.1.8.dev26'
22
+ __version_tuple__ = version_tuple = (0, 1, 8, 'dev26')
23
23
 
24
24
  __commit_id__ = commit_id = None
@@ -1,6 +1,8 @@
1
1
  from __future__ import annotations
2
2
 
3
+ import contextlib
3
4
  import enum
5
+ import io
4
6
  import logging
5
7
  import queue
6
8
  import threading
@@ -169,6 +171,32 @@ class RenderedTokens:
169
171
  masking. ``DefaultRenderer`` leaves it empty because the Jinja
170
172
  template is opaque; hand-coded renderers populate it.
171
173
 
174
+ ``is_content`` is a per-token signal generalizing the "scaffold vs
175
+ body" distinction across all roles: ``True`` iff the token was
176
+ produced from message-body bytes (caller-provided ``content`` /
177
+ ``tool_calls`` / ``reasoning_content``, or the model's sampled
178
+ emission for the assistant role), ``False`` iff it is template
179
+ scaffolding the renderer added around message bodies — role-tag
180
+ openers, closers when not model-sampled, inter-turn separators,
181
+ tool-response wraps, the tools-header block, the generation prompt.
182
+ Generalises ``sampled_mask``: where ``sampled_mask`` answers "would
183
+ the model emit this?" (useful for assistant tokens; uniformly
184
+ ``False`` elsewhere), ``is_content`` answers "is this from caller
185
+ or model data?" (meaningful on every role). By construction
186
+ ``is_content[k] == sampled_mask[k]`` over every token attributed to
187
+ an assistant message; on other roles ``is_content`` carries new
188
+ information that ``sampled_mask`` does not.
189
+
190
+ The use case: SFT on tool response bodies while applying RL only to
191
+ assistant tokens. The trainer wants the model to anticipate tool
192
+ outputs but never to emit ``<|tool_response>`` itself (that would
193
+ interrupt the rollout), so the SFT loss mask is
194
+ ``message_role == "tool" AND is_content``.
195
+
196
+ Empty ``is_content`` (``[]``) — like ``sampled_mask`` — means the
197
+ renderer doesn't provide the signal. ``DefaultRenderer`` leaves it
198
+ empty for the same reason.
199
+
172
200
  ``multi_modal_data`` is populated by multimodal renderers (e.g.
173
201
  ``Qwen3VLRenderer``) when image / video content parts are present;
174
202
  text-only renderers leave it as ``None``.
@@ -177,6 +205,7 @@ class RenderedTokens:
177
205
  token_ids: list[int] = field(default_factory=list)
178
206
  message_indices: list[int] = field(default_factory=list)
179
207
  sampled_mask: list[bool] = field(default_factory=list)
208
+ is_content: list[bool] = field(default_factory=list)
180
209
  message_roles: list[str] = field(default_factory=list)
181
210
  multi_modal_data: "MultiModalData | None" = None
182
211
 
@@ -333,6 +362,94 @@ class RenderedTokens:
333
362
  out[role] = out.get(role, 0) + n
334
363
  return out
335
364
 
365
+ def content_token_spans_by_role(self) -> dict[str, list[tuple[int, int]]]:
366
+ """Per-role spans of contiguous body-only tokens (``is_content=True``).
367
+
368
+ Maps each role appearing in :attr:`message_roles` to a list of
369
+ half-open ``[start, end)`` slices into :attr:`token_ids` over
370
+ which every token satisfies ``is_content=True`` AND belongs to
371
+ a message of that role. Spans never cross message boundaries:
372
+ a tool message contributes its own runs; an immediately
373
+ adjacent assistant message contributes separate runs even when
374
+ the bodies abut on the token axis.
375
+
376
+ Returns an empty dict when :attr:`is_content` or
377
+ :attr:`message_roles` is empty (renderer didn't populate the
378
+ signal — e.g. ``DefaultRenderer``).
379
+
380
+ Intended for selective loss masking: SFT on tool response
381
+ bodies while RL acts only on assistant turns is the canonical
382
+ case::
383
+
384
+ spans = rendered.content_token_spans_by_role()
385
+ tool_sft_mask = [False] * len(rendered.token_ids)
386
+ for s, e in spans.get("tool", []):
387
+ for k in range(s, e):
388
+ tool_sft_mask[k] = True
389
+
390
+ See also :meth:`content_mask_for_roles` for the same
391
+ computation returned as a per-token bool list.
392
+ """
393
+ out: dict[str, list[tuple[int, int]]] = {}
394
+ if not self.is_content or not self.message_roles:
395
+ return out
396
+ n = len(self.token_ids)
397
+ if len(self.is_content) != n or len(self.message_indices) != n:
398
+ return out
399
+
400
+ msg_spans = self.message_token_spans()
401
+ for role, span in zip(self.message_roles, msg_spans):
402
+ bucket = out.setdefault(role, [])
403
+ if span is None:
404
+ continue
405
+ start, end = span
406
+ run_start: int | None = None
407
+ for k in range(start, end):
408
+ if self.is_content[k]:
409
+ if run_start is None:
410
+ run_start = k
411
+ else:
412
+ if run_start is not None:
413
+ bucket.append((run_start, k))
414
+ run_start = None
415
+ if run_start is not None:
416
+ bucket.append((run_start, end))
417
+ return out
418
+
419
+ def content_mask_for_roles(self, roles: "set[str] | frozenset[str]") -> list[bool]:
420
+ """Per-token bool list: ``True`` iff the token is body of a
421
+ message whose role is in ``roles``.
422
+
423
+ Length matches :attr:`token_ids`. Returns an all-``False``
424
+ list of that length when :attr:`is_content` or
425
+ :attr:`message_roles` is empty — consumers can AND this with
426
+ their own attribution masks without length checks.
427
+
428
+ ``role_to_mask`` style helpers in :func:`build_training_sample`
429
+ cover the trainable-role question; this one covers the
430
+ complementary "body-only" question. The two compose: SFT mask
431
+ on tool body is
432
+ ``rendered.content_mask_for_roles({"tool"})``; RL mask on
433
+ assistant tokens stays
434
+ ``[s and (mi >= 0 and rendered.message_roles[mi] == "assistant")
435
+ for s, mi in zip(rendered.sampled_mask, rendered.message_indices)]``.
436
+ """
437
+ n = len(self.token_ids)
438
+ mask = [False] * n
439
+ if not self.is_content or not self.message_roles:
440
+ return mask
441
+ if len(self.is_content) != n or len(self.message_indices) != n:
442
+ return mask
443
+
444
+ for k, msg_idx in enumerate(self.message_indices):
445
+ if msg_idx < 0:
446
+ continue
447
+ if msg_idx >= len(self.message_roles):
448
+ continue
449
+ if self.message_roles[msg_idx] in roles and self.is_content[k]:
450
+ mask[k] = True
451
+ return mask
452
+
336
453
 
337
454
  class ToolCallParseStatus(str, enum.Enum):
338
455
  """Per-attempt outcome of parsing a single ``<tool_call>`` block.
@@ -530,6 +647,15 @@ class Renderer(Protocol):
530
647
  caller needs that distinction for the prior portion, they
531
648
  have it directly: every token in ``prev_completion_ids`` was
532
649
  sampled; every token in ``prev_prompt_ids`` was not.
650
+ - ``is_content`` mirrors ``sampled_mask``'s scheme for the
651
+ prior portion (uniformly ``False`` — body-vs-wrap
652
+ attribution can't be recovered from raw token ids), and on
653
+ the bridge-added portion the renderer populates it the same
654
+ way as in :meth:`render`: ``True`` over the body bytes of
655
+ each new message, ``False`` over the surrounding scaffold.
656
+ Consumers walk the trajectory and read each step's own
657
+ ``is_content`` for full-conversation body masks; the bridge
658
+ output covers only the *new* tokens this turn adds.
533
659
 
534
660
  Text-only renderers return :class:`RenderedTokens` with
535
661
  ``multi_modal_data=None``. Multimodal renderers (see
@@ -911,31 +1037,24 @@ TRUSTED_REVISIONS: dict[str, str] = {
911
1037
  # Models for which ``fastokens`` is known to diverge from vanilla
912
1038
  # ``transformers.AutoTokenizer`` and therefore must NOT be patched.
913
1039
  # Empirical audit ran each entry of ``MODEL_RENDERER_MAP`` through both
914
- # backends; 31/35 passed byte-identical. The four below either fail to
915
- # load under fastokens (DeepSeek-V3 family — Metaspace pretokenizer not
916
- # yet implemented) or are kept defensively pending an upstream fastokens
917
- # fix (MiniMax-M2 family — see per-entry comments).
1040
+ # backends. The entries below fail to load under fastokens (DeepSeek-V3
1041
+ # family — Metaspace pretokenizer not yet implemented).
918
1042
  FASTOKENS_INCOMPATIBLE: frozenset[str] = frozenset(
919
1043
  {
920
- # fastokens 0.1.1: ``ValueError: pre-tokenizer error: unsupported
1044
+ # fastokens: ``ValueError: pre-tokenizer error: unsupported
921
1045
  # pre-tokenizer type: Metaspace`` — DeepSeek's tokenizer uses
922
1046
  # SentencePiece-style Metaspace pretokenization which fastokens
923
1047
  # doesn't yet implement.
924
1048
  "deepseek-ai/DeepSeek-V3",
925
1049
  "deepseek-ai/DeepSeek-V3-Base",
926
- # MiniMax: kept defensive pending upstream fastokens fix
927
- # https://github.com/crusoecloud/fastokens/pull/32 — that PR
928
- # removes a stray attribute leaked by ``unpatch_transformers``
929
- # which steers MiniMax (declared ``tokenizer_class =
930
- # 'GPT2Tokenizer'`` → slow→fast conversion path) down a different
931
- # load path on subsequent vanilla loads. Once the upstream fix
932
- # is released, these two entries can be dropped after re-audit.
933
- "MiniMaxAI/MiniMax-M2",
934
- "MiniMaxAI/MiniMax-M2.5",
935
1050
  }
936
1051
  )
937
1052
 
938
1053
 
1054
+ _FASTOKENS_PATCH_LOCK = threading.Lock()
1055
+ _FASTOKENS_ANNOUNCED = False
1056
+
1057
+
939
1058
  def _patched_load(model_name_or_path: str, **kwargs):
940
1059
  """Run ``AutoTokenizer.from_pretrained`` with fastokens patched in
941
1060
  process-locally — patch around the load, unpatch right after.
@@ -945,15 +1064,39 @@ def _patched_load(model_name_or_path: str, **kwargs):
945
1064
  fastokens for ``encode``/``decode`` while subsequent
946
1065
  ``AutoTokenizer.from_pretrained`` calls (outside our control) go
947
1066
  back to vanilla. This keeps the global side effect minimal.
1067
+
1068
+ fastokens itself prints ``[fastokens] patch_transformers: ...`` to
1069
+ stdout on every patch/unpatch call. Building a pool of size N would
1070
+ therefore emit ~N lines (more under thread contention, where some
1071
+ threads see ``already patched``). We swallow those prints under a
1072
+ lock — ``contextlib.redirect_stdout`` swaps ``sys.stdout``
1073
+ process-wide, so the lock stops overlapping redirects from leaving
1074
+ ``sys.stdout`` stuck on a stale buffer. The patch/unpatch calls
1075
+ are cheap; only the brief patch+unpatch is serialized, the actual
1076
+ ``from_pretrained`` still runs concurrently across pool slots. A
1077
+ single ``logger.info`` is emitted on the first patch so the fast
1078
+ path is still discoverable in logs.
948
1079
  """
949
1080
  import fastokens
950
1081
  from transformers import AutoTokenizer
951
1082
 
952
- fastokens.patch_transformers()
1083
+ global _FASTOKENS_ANNOUNCED
1084
+
1085
+ with _FASTOKENS_PATCH_LOCK:
1086
+ with contextlib.redirect_stdout(io.StringIO()):
1087
+ fastokens.patch_transformers()
1088
+ if not _FASTOKENS_ANNOUNCED:
1089
+ logger.info(
1090
+ "fastokens enabled — tokenizers load through the Rust BPE "
1091
+ "fast path (~10x encode speedup)."
1092
+ )
1093
+ _FASTOKENS_ANNOUNCED = True
953
1094
  try:
954
1095
  return AutoTokenizer.from_pretrained(model_name_or_path, **kwargs)
955
1096
  finally:
956
- fastokens.unpatch_transformers()
1097
+ with _FASTOKENS_PATCH_LOCK:
1098
+ with contextlib.redirect_stdout(io.StringIO()):
1099
+ fastokens.unpatch_transformers()
957
1100
 
958
1101
 
959
1102
  def load_tokenizer(
@@ -975,10 +1118,10 @@ def load_tokenizer(
975
1118
  immediately after, so global ``AutoTokenizer.from_pretrained`` calls
976
1119
  elsewhere in the user's process are not affected.
977
1120
 
978
- Models in ``FASTOKENS_INCOMPATIBLE`` (DeepSeek-V3 family, MiniMax-M2
979
- family) skip the patch — fastokens 0.1.1 either fails to load them
980
- or produces token-divergent output. Pass ``use_fastokens=False`` to
981
- force the vanilla backend for any other model.
1121
+ Models in ``FASTOKENS_INCOMPATIBLE`` (DeepSeek-V3 family) skip the
1122
+ patch — fastokens currently fails to load them. Pass
1123
+ ``use_fastokens=False`` to force the vanilla backend for any other
1124
+ model.
982
1125
 
983
1126
  Unknown / fine-tuned model paths fall through to
984
1127
  ``trust_remote_code=False`` and the patched-load fast path. If
@@ -1208,6 +1351,7 @@ def build_training_sample(
1208
1351
  *,
1209
1352
  role_to_mask: Callable[[Message], bool],
1210
1353
  tools: list[ToolSpec] | None = None,
1354
+ content_sft_roles: "set[str] | frozenset[str] | None" = None,
1211
1355
  ) -> tuple[list[int], list[bool]]:
1212
1356
  """Build (token_ids, loss_mask) for supervised training.
1213
1357
 
@@ -1223,17 +1367,53 @@ def build_training_sample(
1223
1367
  back to attribution-only masking — every token attributed to a
1224
1368
  trainable role is trained on, including template-injected
1225
1369
  ``<|im_start|>role\\n`` openers.
1370
+
1371
+ ``content_sft_roles`` opts in additional roles for "body-only"
1372
+ supervision: for every message whose role is in this set, tokens
1373
+ with ``is_content=True`` are marked trainable even though the
1374
+ ``sampled_mask`` gate excludes them (the model never samples
1375
+ tool / user / system tokens). Template scaffolding around those
1376
+ messages — ``<|im_start|>role\\n`` openers, ``<|im_end|>``
1377
+ closers, ``<|tool_response>`` wraps, inter-turn ``\\n`` — stays
1378
+ masked out, so the model learns to anticipate the body text
1379
+ without producing the surrounding special tokens (which would
1380
+ interrupt a real rollout). The canonical use case is RL on
1381
+ assistant tokens (``role_to_mask=lambda m: m["role"] ==
1382
+ "assistant"``) plus SFT on tool response bodies
1383
+ (``content_sft_roles={"tool"}``).
1384
+
1385
+ Requires the renderer to populate ``is_content`` for the body-only
1386
+ path to fire. Renderers that leave it empty (``DefaultRenderer``,
1387
+ or hand-coded renderers that haven't been wired up yet) ignore
1388
+ ``content_sft_roles`` silently — falling back to the original
1389
+ ``role_to_mask`` + ``sampled_mask`` behaviour.
1226
1390
  """
1227
1391
  rendered = renderer.render(messages, tools=tools)
1228
1392
  has_sampled_info = len(rendered.sampled_mask) == len(rendered.token_ids)
1393
+ has_content_info = len(rendered.is_content) == len(rendered.token_ids)
1394
+ body_roles: "frozenset[str]"
1395
+ if content_sft_roles and has_content_info:
1396
+ body_roles = frozenset(content_sft_roles)
1397
+ else:
1398
+ body_roles = frozenset()
1399
+
1229
1400
  loss_mask: list[bool] = []
1230
1401
  for k, msg_idx in enumerate(rendered.message_indices):
1231
1402
  if msg_idx < 0:
1232
1403
  loss_mask.append(False)
1233
- elif has_sampled_info and not rendered.sampled_mask[k]:
1404
+ continue
1405
+ msg = messages[msg_idx]
1406
+ # Body-only path for opt-in roles. Fires only on tokens whose
1407
+ # is_content bit is set; never adds the scaffolding around the
1408
+ # message, so the model isn't supervised on emitting the role
1409
+ # tags / wraps that would derail a rollout.
1410
+ if body_roles and msg.get("role") in body_roles:
1411
+ loss_mask.append(rendered.is_content[k])
1412
+ continue
1413
+ if has_sampled_info and not rendered.sampled_mask[k]:
1234
1414
  loss_mask.append(False)
1235
1415
  else:
1236
- loss_mask.append(role_to_mask(messages[msg_idx]))
1416
+ loss_mask.append(role_to_mask(msg))
1237
1417
  return rendered.token_ids, loss_mask
1238
1418
 
1239
1419
 
@@ -1280,6 +1460,157 @@ def trim_to_turn_close(
1280
1460
  return previous_ids
1281
1461
 
1282
1462
 
1463
+ # Per-model offset-aware tokenizer cache. ``attribute_text_segments``
1464
+ # uses the fast HuggingFace tokenizer's ``offset_mapping`` to attribute
1465
+ # each token to its source text segment under one BPE pass. Fastokens
1466
+ # (the Rust BPE we patch in by default for ~10x faster encode) does not
1467
+ # track character offsets — the patched tokenizer's
1468
+ # ``return_offsets_mapping=True`` raises ``NotImplementedError``. So we
1469
+ # keep a parallel vanilla tokenizer per model purely for offset queries.
1470
+ # Memory cost is one extra tokenizer per *unique* model name across all
1471
+ # pools / renderers (the cache is process-global), independent of pool
1472
+ # size.
1473
+ _offset_tokenizers: dict[str, Any] = {}
1474
+ _offset_tokenizers_lock = threading.Lock()
1475
+
1476
+
1477
+ def _get_offset_tokenizer(tokenizer):
1478
+ """Return a tokenizer that supports ``return_offsets_mapping=True``.
1479
+
1480
+ If ``tokenizer`` itself supports offsets, returns it unchanged.
1481
+ Otherwise loads a vanilla (non-fastokens) tokenizer from
1482
+ ``tokenizer.name_or_path`` and caches it. Raises if the tokenizer
1483
+ has no usable ``name_or_path`` — hand-coded renderers always pass
1484
+ a tokenizer loaded via ``load_tokenizer`` which does set it.
1485
+ """
1486
+ # Cheap probe: does this tokenizer already provide offsets?
1487
+ try:
1488
+ tokenizer("a", add_special_tokens=False, return_offsets_mapping=True)
1489
+ return tokenizer
1490
+ except (NotImplementedError, ValueError, TypeError):
1491
+ pass
1492
+
1493
+ name_or_path = getattr(tokenizer, "name_or_path", "")
1494
+ if not name_or_path:
1495
+ raise RuntimeError(
1496
+ "Cannot construct an offset-aware tokenizer: the supplied "
1497
+ "tokenizer has no ``name_or_path`` to fall back on. Pass a "
1498
+ "tokenizer loaded via ``renderers.base.load_tokenizer``."
1499
+ )
1500
+
1501
+ with _offset_tokenizers_lock:
1502
+ cached = _offset_tokenizers.get(name_or_path)
1503
+ if cached is not None:
1504
+ return cached
1505
+ from transformers import AutoTokenizer
1506
+
1507
+ kwargs: dict[str, Any] = {}
1508
+ revision = TRUSTED_REVISIONS.get(name_or_path)
1509
+ if revision is not None:
1510
+ kwargs = {"trust_remote_code": True, "revision": revision}
1511
+ else:
1512
+ kwargs = {"trust_remote_code": False}
1513
+ # Explicitly vanilla — we want HF's Rust tokenizer with offset
1514
+ # tracking, not the fastokens shim. ``load_tokenizer`` would
1515
+ # patch fastokens in by default; calling
1516
+ # ``AutoTokenizer.from_pretrained`` directly here keeps the
1517
+ # fastokens patch out of this code path entirely.
1518
+ offset_tok = AutoTokenizer.from_pretrained(name_or_path, **kwargs)
1519
+ if not getattr(offset_tok, "is_fast", False):
1520
+ raise RuntimeError(
1521
+ f"Vanilla tokenizer for {name_or_path!r} is not a fast "
1522
+ "tokenizer; offset_mapping is unavailable. Hand-coded "
1523
+ "renderers require a fast tokenizer for body/scaffold "
1524
+ "attribution."
1525
+ )
1526
+ _offset_tokenizers[name_or_path] = offset_tok
1527
+ return offset_tok
1528
+
1529
+
1530
+ def attribute_text_segments(
1531
+ tokenizer,
1532
+ segments: "list[tuple[str, bool]]",
1533
+ ) -> "list[tuple[int, bool]]":
1534
+ """Tokenize concatenated segments as a single BPE pass and return
1535
+ ``(token_id, is_content)`` pairs.
1536
+
1537
+ ``segments`` is a list of ``(text, is_content)`` chunks the renderer
1538
+ wants to emit contiguously — for example ``[("user\\n", False),
1539
+ (content, True)]`` for a user message. Concatenation is done before
1540
+ encoding to preserve BPE merges across the wrap/body boundary; the
1541
+ resulting tokens are then attributed back to their source segment
1542
+ via the fast tokenizer's ``offset_mapping``.
1543
+
1544
+ A token is attributed to the segment containing its first source
1545
+ character (``offset_mapping[k][0]``). Tokens whose first character
1546
+ falls exactly on a segment boundary are attributed to the segment
1547
+ that *starts* at that offset (the "later" segment). Zero-length
1548
+ tokens (rare; usually pre-tokenizer artefacts) follow the same rule;
1549
+ a start at or past the end of the text takes the last segment's bit.
1550
+
1551
+ Requires a HuggingFace fast tokenizer with offset tracking. The
1552
+ ``fastokens`` patch ``load_tokenizer`` applies by default does
1553
+ **not** track offsets — when that's the case we transparently load
1554
+ a vanilla offset-capable tokenizer for the same model and cache it
1555
+ (see :func:`_get_offset_tokenizer`). Hand-coded renderers are only
1556
+ registered for model families that ship a fast tokenizer, so a
1557
+ silent slow-tokenizer fallback isn't supported — BPE drift at the
1558
+ wrap/body boundary would defeat the whole point.
1559
+
1560
+ Empty input or empty joined text returns an empty list.
1561
+ """
1562
+ if not segments:
1563
+ return []
1564
+ full_text = "".join(text for text, _ in segments)
1565
+ if not full_text:
1566
+ return []
1567
+
1568
+ offset_tokenizer = _get_offset_tokenizer(tokenizer)
1569
+ encoding = offset_tokenizer(
1570
+ full_text,
1571
+ add_special_tokens=False,
1572
+ return_offsets_mapping=True,
1573
+ )
1574
+ token_ids = list(encoding["input_ids"])
1575
+ offsets = list(encoding["offset_mapping"])
1576
+
1577
+ # Build segment char-span lookup. Track the half-open span
1578
+ # [seg_start, seg_end) of each segment and its is_content bit.
1579
+ spans: list[tuple[int, int, bool]] = []
1580
+ pos = 0
1581
+ for text, is_content in segments:
1582
+ spans.append((pos, pos + len(text), is_content))
1583
+ pos += len(text)
1584
+ total_len = pos
1585
+
1586
+ out: list[tuple[int, bool]] = []
1587
+ last_is_content = spans[-1][2] if spans else False
1588
+ for tok_id, (start, _end) in zip(token_ids, offsets):
1589
+ if start >= total_len:
1590
+ # Token's character offset is past every segment (shouldn't
1591
+ # normally happen for add_special_tokens=False, but defensive
1592
+ # against tokenizer-specific edge cases).
1593
+ out.append((tok_id, last_is_content))
1594
+ continue
1595
+ # Find the segment that contains `start`. Segments are
1596
+ # contiguous and ordered, so a linear scan is fine — the inner
1597
+ # loop runs at most len(segments) times per token and segments
1598
+ # is typically 2-3 in practice.
1599
+ is_content = last_is_content
1600
+ for seg_start, seg_end, seg_is_content in spans:
1601
+ if seg_start <= start < seg_end:
1602
+ is_content = seg_is_content
1603
+ break
1604
+ else:
1605
+ # start == total_len handled above; the remaining case is
1606
+ # an empty segment in the middle. Empty segments emit no
1607
+ # characters, so no token can land in them; fall through to
1608
+ # the final segment's bit (``last_is_content``).
1609
+ pass
1610
+ out.append((tok_id, is_content))
1611
+ return out
1612
+
1613
+
1283
1614
  def reject_assistant_in_extension(new_messages: list[Message]) -> bool:
1284
1615
  """Return True if any message in ``new_messages`` is an assistant turn.
1285
1616