PyPI - renderers - Versions diffs - 0.1.8.dev4__tar.gz → 0.1.8.dev26__tar.gz - Mend

renderers 0.1.8.dev4__tar.gz → 0.1.8.dev26__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (59) hide show

renderers-0.1.8.dev26/.github/workflows/publish-dev.yml ADDED Viewed

@@ -0,0 +1,104 @@
+name: Publish Dev
+# Tag every commit on main as ``renderers-v<next>.dev<N>`` and publish the
+# wheel to PyPI as a pre-release. ``<next>`` is the latest release tag with
+# its patch bumped; ``<N>`` is the number of commits since that release so
+# each main commit maps to a unique PEP 440 dev version.
+#
+# Building from the freshly-created tag means hatch-vcs resolves the version
+# cleanly (no ``+gHASH`` local segment), which PyPI requires.
+on:
+  push:
+    branches: [main]
+# Runs for the same ref share one concurrency group; a run that is already
+# in progress is never cancelled by a newer push.
+concurrency:
+  group: publish-dev-${{ github.ref }}
+  cancel-in-progress: false
+jobs:
+  # Job 1: derive the dev tag name from git history and push it to origin.
+  tag:
+    runs-on: ubuntu-latest
+    permissions:
+      # Needed to push the new tag.
+      contents: write
+    outputs:
+      tag: ${{ steps.compute.outputs.tag }}
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          # Full history and tags are required for ``git tag --list`` and
+          # ``git rev-list --count`` below.
+          fetch-depth: 0
+      - name: Compute next dev tag
+        id: compute
+        run: |
+          set -euo pipefail
+          # ``grep -v`` exits 1 when no line survives the filter. Under
+          # ``pipefail`` that status would abort the step right here, before
+          # the explicit check below can print its diagnostic, so the grep
+          # status is neutralised and the empty result is handled explicitly.
+          LATEST_RELEASE=$(git tag --list 'renderers-v*' --sort=-v:refname \
+            | { grep -Ev '(dev|rc|a[0-9]|b[0-9])' || true; } \
+            | head -n 1)
+          if [ -z "$LATEST_RELEASE" ]; then
+            echo "No release tag matching 'renderers-v<MAJOR.MINOR.PATCH>' found" >&2
+            exit 1
+          fi
+          BASE=${LATEST_RELEASE#renderers-v}
+          MAJOR=$(echo "$BASE" | cut -d. -f1)
+          MINOR=$(echo "$BASE" | cut -d. -f2)
+          PATCH=$(echo "$BASE" | cut -d. -f3)
+          NEXT="${MAJOR}.${MINOR}.$((PATCH + 1))"
+          # Number of commits reachable from HEAD but not from the release.
+          N=$(git rev-list --count "${LATEST_RELEASE}..HEAD")
+          TAG="renderers-v${NEXT}.dev${N}"
+          echo "tag=${TAG}" >> "$GITHUB_OUTPUT"
+          echo "Computed tag: ${TAG} (base=${LATEST_RELEASE}, commits=${N})"
+      - name: Create and push tag
+        env:
+          TAG: ${{ steps.compute.outputs.tag }}
+        run: |
+          set -euo pipefail
+          # Exiting 0 here keeps the job green, so the build and publish
+          # jobs still run against the already-existing tag (e.g. on a
+          # workflow re-run).
+          if git ls-remote --exit-code --tags origin "refs/tags/${TAG}" >/dev/null 2>&1; then
+            echo "Tag ${TAG} already exists on origin — nothing to do" >&2
+            exit 0
+          fi
+          git config user.name 'github-actions[bot]'
+          git config user.email '41898282+github-actions[bot]@users.noreply.github.com'
+          git tag -a "$TAG" -m "Automated dev release ${TAG}"
+          git push origin "$TAG"
+  # Job 2: build the distribution from the tag computed above.
+  build:
+    needs: tag
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+          # Check out the tag itself (not the branch head) so the build sees
+          # exactly the tagged commit.
+          ref: refs/tags/${{ needs.tag.outputs.tag }}
+      - uses: astral-sh/setup-uv@v7
+      - name: Build renderers
+        run: uv build
+      - name: Upload dist artifacts
+        uses: actions/upload-artifact@v4
+        with:
+          name: dist-dev
+          path: dist/
+          # Fail the job rather than publishing nothing.
+          if-no-files-found: error
+          retention-days: 7
+  # Job 3: upload the built artifacts to PyPI.
+  publish:
+    needs: build
+    runs-on: ubuntu-latest
+    environment: pypi-prod
+    permissions:
+      # No API token is passed to the publish action, so this presumably
+      # relies on PyPI trusted publishing via the OIDC token — confirm
+      # against the PyPI project settings.
+      id-token: write
+    steps:
+      - name: Download dist artifacts
+        uses: actions/download-artifact@v4
+        with:
+          name: dist-dev
+          path: dist/
+      - name: Publish to PyPI
+        uses: pypa/gh-action-pypi-publish@cef221092ed1bacb1cc03d23a2d87d1d172e277b # v1.14.0

{renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/PKG-INFO RENAMED Viewed

@@ -1,11 +1,11 @@
 Metadata-Version: 2.4
 Name: renderers
-Version: 0.1.8.dev4
+Version: 0.1.8.dev26
 Summary: Chat template renderers — deterministic message-to-token conversion for LLM training
 License-Expression: Apache-2.0
 License-File: LICENSE
 Requires-Python: <3.14,>=3.10
-Requires-Dist: fastokens>=0.1.1
+Requires-Dist: fastokens>=0.2.0
 Requires-Dist: jinja2
 Requires-Dist: numpy
 Requires-Dist: openai-harmony>=0.0.8

{renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/pyproject.toml RENAMED Viewed

@@ -26,10 +26,10 @@ dependencies = [
     "openai-harmony>=0.0.8",
     # Crusoe's Rust BPE tokenizer; ~10x faster encode vs HF's tokenizers.
     # ``load_tokenizer`` patches it in by default for every supported model
-    # except a small denylist (DeepSeek-V3 family, MiniMax-M2 family). The
-    # patch is bracketed around ``from_pretrained``, so subsequent
-    # ``AutoTokenizer`` calls outside the renderers package stay vanilla.
-    "fastokens>=0.1.1",
+    # except a small denylist (DeepSeek-V3 family). The patch is bracketed
+    # around ``from_pretrained``, so subsequent ``AutoTokenizer`` calls
+    # outside the renderers package stay vanilla.
+    "fastokens>=0.2.0",
 ]
 [tool.hatch.version]
@@ -68,6 +68,12 @@ dev = [
 [tool.uv]
 exclude-newer = "7 days"
+# fastokens 0.2.0 was published on 2026-05-17 and contains the
+# ``unpatch_transformers`` fix (crusoecloud/fastokens#32) needed for
+# MiniMax-M2's slow→fast tokenizer conversion path. Exempting it from
+# the project-wide 7-day cutoff lets the lockfile pick it up immediately
+# while the rest of the dependency graph stays gated.
+exclude-newer-package = { fastokens = false }
 [tool.ty.environment]
 python-version = "3.13"

{renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/renderers/__init__.py RENAMED Viewed

@@ -28,6 +28,7 @@ from renderers.base import (
     ToolCallParseStatus,
     ToolSpec,
     VideoPart,
+    attribute_text_segments,
     build_training_sample,
     build_trajectory_step,
     create_renderer,
@@ -90,6 +91,7 @@ __all__ = [
     "ToolSpec",
     "VideoPart",
     "__version__",
+    "attribute_text_segments",
     "build_training_sample",
     "build_trajectory_step",
     "create_renderer",

{renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/renderers/_version.py RENAMED Viewed

@@ -18,7 +18,7 @@ version_tuple: tuple[int | str, ...]
 commit_id: str | None
 __commit_id__: str | None
-__version__ = version = '0.1.8.dev4'
-__version_tuple__ = version_tuple = (0, 1, 8, 'dev4')
+__version__ = version = '0.1.8.dev26'
+__version_tuple__ = version_tuple = (0, 1, 8, 'dev26')
 __commit_id__ = commit_id = None

{renderers-0.1.8.dev4 → renderers-0.1.8.dev26}/renderers/base.py RENAMED Viewed

@@ -1,6 +1,8 @@
 from __future__ import annotations
+import contextlib
 import enum
+import io
 import logging
 import queue
 import threading
@@ -169,6 +171,32 @@ class RenderedTokens:
     masking. ``DefaultRenderer`` leaves it empty because the Jinja
     template is opaque; hand-coded renderers populate it.
+    ``is_content`` is a per-token signal generalizing the "scaffold vs
+    body" distinction across all roles: ``True`` iff the token was
+    produced from message-body bytes (caller-provided ``content`` /
+    ``tool_calls`` / ``reasoning_content``, or the model's sampled
+    emission for the assistant role), ``False`` iff it is template
+    scaffolding the renderer added around message bodies — role-tag
+    openers, closers when not model-sampled, inter-turn separators,
+    tool-response wraps, the tools-header block, the generation prompt.
+    Generalises ``sampled_mask``: where ``sampled_mask`` answers "would
+    the model emit this?" (useful for assistant tokens; uniformly
+    ``False`` elsewhere), ``is_content`` answers "is this from caller
+    or model data?" (meaningful on every role). By construction
+    ``is_content[k] == sampled_mask[k]`` over every token attributed to
+    an assistant message; on other roles ``is_content`` carries new
+    information that ``sampled_mask`` does not.
+    The use case: SFT on tool response bodies while applying RL only to
+    assistant tokens. The trainer wants the model to anticipate tool
+    outputs but never to emit ``<|tool_response>`` itself (that would
+    interrupt the rollout), so the SFT loss mask is
+    ``message_role == "tool" AND is_content``.
+    Empty ``is_content`` (``[]``) — like ``sampled_mask`` — means the
+    renderer doesn't provide the signal. ``DefaultRenderer`` leaves it
+    empty for the same reason.
     ``multi_modal_data`` is populated by multimodal renderers (e.g.
     ``Qwen3VLRenderer``) when image / video content parts are present;
     text-only renderers leave it as ``None``.
@@ -177,6 +205,7 @@ class RenderedTokens:
     token_ids: list[int] = field(default_factory=list)
     message_indices: list[int] = field(default_factory=list)
     sampled_mask: list[bool] = field(default_factory=list)
+    is_content: list[bool] = field(default_factory=list)
     message_roles: list[str] = field(default_factory=list)
     multi_modal_data: "MultiModalData | None" = None
@@ -333,6 +362,94 @@ class RenderedTokens:
             out[role] = out.get(role, 0) + n
         return out
+    def content_token_spans_by_role(self) -> dict[str, list[tuple[int, int]]]:
+        """Per-role spans of contiguous body-only tokens (``is_content=True``).
+        Maps each role appearing in :attr:`message_roles` to a list of
+        half-open ``[start, end)`` slices into :attr:`token_ids` over
+        which every token satisfies ``is_content=True`` AND belongs to
+        a message of that role. Spans never cross message boundaries:
+        a tool message contributes its own runs; an immediately
+        adjacent assistant message contributes separate runs even when
+        the bodies abut on the token axis.
+        Returns an empty dict when :attr:`is_content` or
+        :attr:`message_roles` is empty (renderer didn't populate the
+        signal — e.g. ``DefaultRenderer``).
+        Intended for selective loss masking: SFT on tool response
+        bodies while RL acts only on assistant turns is the canonical
+        case::
+            spans = rendered.content_token_spans_by_role()
+            tool_sft_mask = [False] * len(rendered.token_ids)
+            for s, e in spans.get("tool", []):
+                for k in range(s, e):
+                    tool_sft_mask[k] = True
+        See also :meth:`content_mask_for_roles` for the same
+        computation returned as a per-token bool list.
+        """
+        out: dict[str, list[tuple[int, int]]] = {}
+        if not self.is_content or not self.message_roles:
+            return out
+        n = len(self.token_ids)
+        if len(self.is_content) != n or len(self.message_indices) != n:
+            return out
+        msg_spans = self.message_token_spans()
+        for role, span in zip(self.message_roles, msg_spans):
+            bucket = out.setdefault(role, [])
+            if span is None:
+                continue
+            start, end = span
+            run_start: int | None = None
+            for k in range(start, end):
+                if self.is_content[k]:
+                    if run_start is None:
+                        run_start = k
+                else:
+                    if run_start is not None:
+                        bucket.append((run_start, k))
+                        run_start = None
+            if run_start is not None:
+                bucket.append((run_start, end))
+        return out
+    def content_mask_for_roles(self, roles: "set[str] | frozenset[str]") -> list[bool]:
+        """Per-token bool list: ``True`` iff the token is body of a
+        message whose role is in ``roles``.
+        Length matches :attr:`token_ids`. Returns an all-``False``
+        list of that length when :attr:`is_content` or
+        :attr:`message_roles` is empty — consumers can AND this with
+        their own attribution masks without length checks.
+        ``role_to_mask`` style helpers in :func:`build_training_sample`
+        cover the trainable-role question; this one covers the
+        complementary "body-only" question. The two compose: SFT mask
+        on tool body is
+        ``rendered.content_mask_for_roles({"tool"})``; RL mask on
+        assistant tokens stays
+        ``[s and (mi >= 0 and rendered.message_roles[mi] == "assistant")
+        for s, mi in zip(rendered.sampled_mask, rendered.message_indices)]``.
+        """
+        n = len(self.token_ids)
+        mask = [False] * n
+        if not self.is_content or not self.message_roles:
+            return mask
+        if len(self.is_content) != n or len(self.message_indices) != n:
+            return mask
+        for k, msg_idx in enumerate(self.message_indices):
+            if msg_idx < 0:
+                continue
+            if msg_idx >= len(self.message_roles):
+                continue
+            if self.message_roles[msg_idx] in roles and self.is_content[k]:
+                mask[k] = True
+        return mask
 class ToolCallParseStatus(str, enum.Enum):
     """Per-attempt outcome of parsing a single ``<tool_call>`` block.
@@ -530,6 +647,15 @@ class Renderer(Protocol):
           caller needs that distinction for the prior portion, they
           have it directly: every token in ``prev_completion_ids`` was
           sampled; every token in ``prev_prompt_ids`` was not.
+        - ``is_content`` mirrors ``sampled_mask``'s scheme for the
+          prior portion (uniformly ``False`` — body-vs-wrap
+          attribution can't be recovered from raw token ids), and on
+          the bridge-added portion the renderer populates it the same
+          way as in :meth:`render`: ``True`` over the body bytes of
+          each new message, ``False`` over the surrounding scaffold.
+          Consumers walk the trajectory and read each step's own
+          ``is_content`` for full-conversation body masks; the bridge
+          output covers only the *new* tokens this turn adds.
         Text-only renderers return :class:`RenderedTokens` with
         ``multi_modal_data=None``. Multimodal renderers (see
@@ -911,31 +1037,24 @@ TRUSTED_REVISIONS: dict[str, str] = {
 # Models for which ``fastokens`` is known to diverge from vanilla
 # ``transformers.AutoTokenizer`` and therefore must NOT be patched.
 # Empirical audit ran each entry of ``MODEL_RENDERER_MAP`` through both
-# backends; 31/35 passed byte-identical. The four below either fail to
-# load under fastokens (DeepSeek-V3 family — Metaspace pretokenizer not
-# yet implemented) or are kept defensively pending an upstream fastokens
-# fix (MiniMax-M2 family — see per-entry comments).
+# backends. The entries below fail to load under fastokens (DeepSeek-V3
+# family — Metaspace pretokenizer not yet implemented).
 FASTOKENS_INCOMPATIBLE: frozenset[str] = frozenset(
     {
-        # fastokens 0.1.1: ``ValueError: pre-tokenizer error: unsupported
+        # fastokens: ``ValueError: pre-tokenizer error: unsupported
         # pre-tokenizer type: Metaspace`` — DeepSeek's tokenizer uses
         # SentencePiece-style Metaspace pretokenization which fastokens
         # doesn't yet implement.
         "deepseek-ai/DeepSeek-V3",
         "deepseek-ai/DeepSeek-V3-Base",
-        # MiniMax: kept defensive pending upstream fastokens fix
-        # https://github.com/crusoecloud/fastokens/pull/32 — that PR
-        # removes a stray attribute leaked by ``unpatch_transformers``
-        # which steers MiniMax (declared ``tokenizer_class =
-        # 'GPT2Tokenizer'`` → slow→fast conversion path) down a different
-        # load path on subsequent vanilla loads. Once the upstream fix
-        # is released, these two entries can be dropped after re-audit.
-        "MiniMaxAI/MiniMax-M2",
-        "MiniMaxAI/MiniMax-M2.5",
     }
 )
+_FASTOKENS_PATCH_LOCK = threading.Lock()
+_FASTOKENS_ANNOUNCED = False
 def _patched_load(model_name_or_path: str, **kwargs):
     """Run ``AutoTokenizer.from_pretrained`` with fastokens patched in
     process-locally — patch around the load, unpatch right after.
@@ -945,15 +1064,39 @@ def _patched_load(model_name_or_path: str, **kwargs):
     fastokens for ``encode``/``decode`` while subsequent
     ``AutoTokenizer.from_pretrained`` calls (outside our control) go
     back to vanilla. This keeps the global side effect minimal.
+    fastokens itself prints ``[fastokens] patch_transformers: ...`` to
+    stdout on every patch/unpatch call. Building a pool of size N would
+    therefore emit ~N lines (more under thread contention, where some
+    threads see ``already patched``). We swallow those prints under a
+    lock — ``contextlib.redirect_stdout`` swaps ``sys.stdout``
+    process-wide, so the lock keeps unrelated stdout writes from other
+    threads from disappearing into our buffer. The patch/unpatch calls
+    are cheap; only the brief patch+unpatch is serialized, the actual
+    ``from_pretrained`` still runs concurrently across pool slots. A
+    single ``logger.info`` is emitted on the first patch so the fast
+    path is still discoverable in logs.
     """
     import fastokens
     from transformers import AutoTokenizer
-    fastokens.patch_transformers()
+    global _FASTOKENS_ANNOUNCED
+    with _FASTOKENS_PATCH_LOCK:
+        with contextlib.redirect_stdout(io.StringIO()):
+            fastokens.patch_transformers()
+        if not _FASTOKENS_ANNOUNCED:
+            logger.info(
+                "fastokens enabled — tokenizers load through the Rust BPE "
+                "fast path (~10x encode speedup)."
+            )
+            _FASTOKENS_ANNOUNCED = True
     try:
         return AutoTokenizer.from_pretrained(model_name_or_path, **kwargs)
     finally:
-        fastokens.unpatch_transformers()
+        with _FASTOKENS_PATCH_LOCK:
+            with contextlib.redirect_stdout(io.StringIO()):
+                fastokens.unpatch_transformers()
 def load_tokenizer(
@@ -975,10 +1118,10 @@ def load_tokenizer(
     immediately after, so global ``AutoTokenizer.from_pretrained`` calls
     elsewhere in the user's process are not affected.
-    Models in ``FASTOKENS_INCOMPATIBLE`` (DeepSeek-V3 family, MiniMax-M2
-    family) skip the patch — fastokens 0.1.1 either fails to load them
-    or produces token-divergent output. Pass ``use_fastokens=False`` to
-    force the vanilla backend for any other model.
+    Models in ``FASTOKENS_INCOMPATIBLE`` (DeepSeek-V3 family) skip the
+    patch — fastokens currently fails to load them. Pass
+    ``use_fastokens=False`` to force the vanilla backend for any other
+    model.
     Unknown / fine-tuned model paths fall through to
     ``trust_remote_code=False`` and the patched-load fast path. If
@@ -1208,6 +1351,7 @@ def build_training_sample(
     *,
     role_to_mask: Callable[[Message], bool],
     tools: list[ToolSpec] | None = None,
+    content_sft_roles: "set[str] | frozenset[str] | None" = None,
 ) -> tuple[list[int], list[bool]]:
     """Build (token_ids, loss_mask) for supervised training.
@@ -1223,17 +1367,53 @@ def build_training_sample(
     back to attribution-only masking — every token attributed to a
     trainable role is trained on, including template-injected
     ``<|im_start|>role\\n`` openers.
+    ``content_sft_roles`` opts in additional roles for "body-only"
+    supervision: for every message whose role is in this set, tokens
+    with ``is_content=True`` are marked trainable even though the
+    ``sampled_mask`` gate excludes them (the model never samples
+    tool / user / system tokens). Template scaffolding around those
+    messages — ``<|im_start|>role\\n`` openers, ``<|im_end|>``
+    closers, ``<|tool_response>`` wraps, inter-turn ``\\n`` — stays
+    masked out, so the model learns to anticipate the body text
+    without producing the surrounding special tokens (which would
+    interrupt a real rollout). The canonical use case is RL on
+    assistant tokens (``role_to_mask=lambda m: m["role"] ==
+    "assistant"``) plus SFT on tool response bodies
+    (``content_sft_roles={"tool"}``).
+    Requires the renderer to populate ``is_content`` for the body-only
+    path to fire. Renderers that leave it empty (``DefaultRenderer``,
+    or hand-coded renderers that haven't been wired up yet) ignore
+    ``content_sft_roles`` silently — falling back to the original
+    ``role_to_mask`` + ``sampled_mask`` behaviour.
     """
     rendered = renderer.render(messages, tools=tools)
     has_sampled_info = len(rendered.sampled_mask) == len(rendered.token_ids)
+    has_content_info = len(rendered.is_content) == len(rendered.token_ids)
+    body_roles: "frozenset[str]"
+    if content_sft_roles and has_content_info:
+        body_roles = frozenset(content_sft_roles)
+    else:
+        body_roles = frozenset()
     loss_mask: list[bool] = []
     for k, msg_idx in enumerate(rendered.message_indices):
         if msg_idx < 0:
             loss_mask.append(False)
-        elif has_sampled_info and not rendered.sampled_mask[k]:
+            continue
+        msg = messages[msg_idx]
+        # Body-only path for opt-in roles. Fires only on tokens whose
+        # is_content bit is set; never adds the scaffolding around the
+        # message, so the model isn't supervised on emitting the role
+        # tags / wraps that would derail a rollout.
+        if body_roles and msg.get("role") in body_roles:
+            loss_mask.append(rendered.is_content[k])
+            continue
+        if has_sampled_info and not rendered.sampled_mask[k]:
             loss_mask.append(False)
         else:
-            loss_mask.append(role_to_mask(messages[msg_idx]))
+            loss_mask.append(role_to_mask(msg))
     return rendered.token_ids, loss_mask
@@ -1280,6 +1460,157 @@ def trim_to_turn_close(
     return previous_ids
+# Per-model offset-aware tokenizer cache. ``attribute_text_segments``
+# uses the fast HuggingFace tokenizer's ``offset_mapping`` to attribute
+# each token to its source text segment under one BPE pass. Fastokens
+# (the Rust BPE we patch in by default for ~10x faster encode) does not
+# track character offsets — the patched tokenizer's
+# ``return_offsets_mapping=True`` raises ``NotImplementedError``. So we
+# keep a parallel vanilla tokenizer per model purely for offset queries.
+# Memory cost is one extra tokenizer per *unique* model name across all
+# pools / renderers (the cache is process-global), independent of pool
+# size.
+_offset_tokenizers: dict[str, Any] = {}
+_offset_tokenizers_lock = threading.Lock()
+def _get_offset_tokenizer(tokenizer):
+    """Return a tokenizer that supports ``return_offsets_mapping=True``.
+    If ``tokenizer`` itself supports offsets, returns it unchanged.
+    Otherwise loads a vanilla (non-fastokens) tokenizer from
+    ``tokenizer.name_or_path`` and caches it. Raises if the tokenizer
+    has no usable ``name_or_path`` — hand-coded renderers always pass
+    a tokenizer loaded via ``load_tokenizer`` which does set it.
+    """
+    # Cheap probe: does this tokenizer already provide offsets?
+    try:
+        tokenizer("a", add_special_tokens=False, return_offsets_mapping=True)
+        return tokenizer
+    except (NotImplementedError, ValueError, TypeError):
+        pass
+    name_or_path = getattr(tokenizer, "name_or_path", "")
+    if not name_or_path:
+        raise RuntimeError(
+            "Cannot construct an offset-aware tokenizer: the supplied "
+            "tokenizer has no ``name_or_path`` to fall back on. Pass a "
+            "tokenizer loaded via ``renderers.base.load_tokenizer``."
+        )
+    with _offset_tokenizers_lock:
+        cached = _offset_tokenizers.get(name_or_path)
+        if cached is not None:
+            return cached
+        from transformers import AutoTokenizer
+        kwargs: dict[str, Any] = {}
+        revision = TRUSTED_REVISIONS.get(name_or_path)
+        if revision is not None:
+            kwargs = {"trust_remote_code": True, "revision": revision}
+        else:
+            kwargs = {"trust_remote_code": False}
+        # Explicitly vanilla — we want HF's Rust tokenizer with offset
+        # tracking, not the fastokens shim. ``load_tokenizer`` would
+        # patch fastokens in by default; calling
+        # ``AutoTokenizer.from_pretrained`` directly here keeps the
+        # fastokens patch out of this code path entirely.
+        offset_tok = AutoTokenizer.from_pretrained(name_or_path, **kwargs)
+        if not getattr(offset_tok, "is_fast", False):
+            raise RuntimeError(
+                f"Vanilla tokenizer for {name_or_path!r} is not a fast "
+                "tokenizer; offset_mapping is unavailable. Hand-coded "
+                "renderers require a fast tokenizer for body/scaffold "
+                "attribution."
+            )
+        _offset_tokenizers[name_or_path] = offset_tok
+        return offset_tok
+def attribute_text_segments(
+    tokenizer,
+    segments: "list[tuple[str, bool]]",
+) -> "list[tuple[int, bool]]":
+    """Tokenize concatenated segments as a single BPE pass and return
+    ``(token_id, is_content)`` pairs.
+    ``segments`` is a list of ``(text, is_content)`` chunks the renderer
+    wants to emit contiguously — for example ``[("user\\n", False),
+    (content, True)]`` for a user message. Concatenation is done before
+    encoding to preserve BPE merges across the wrap/body boundary; the
+    resulting tokens are then attributed back to their source segment
+    via the fast tokenizer's ``offset_mapping``.
+    A token is attributed to the segment containing its first source
+    character (``offset_mapping[k][0]``). Tokens whose first character
+    falls exactly on a segment boundary are attributed to the segment
+    that *starts* at that offset (the "later" segment). Zero-length
+    tokens (rare; usually pre-tokenizer artefacts) are attributed to
+    the most recently entered segment.
+    Requires a HuggingFace fast tokenizer with offset tracking. The
+    ``fastokens`` patch ``load_tokenizer`` applies by default does
+    **not** track offsets — when that's the case we transparently load
+    a vanilla offset-capable tokenizer for the same model and cache it
+    (see :func:`_get_offset_tokenizer`). Hand-coded renderers are only
+    registered for model families that ship a fast tokenizer, so a
+    silent slow-tokenizer fallback isn't supported — BPE drift at the
+    wrap/body boundary would defeat the whole point.
+    Empty input or empty joined text returns an empty list.
+    """
+    if not segments:
+        return []
+    full_text = "".join(text for text, _ in segments)
+    if not full_text:
+        return []
+    offset_tokenizer = _get_offset_tokenizer(tokenizer)
+    encoding = offset_tokenizer(
+        full_text,
+        add_special_tokens=False,
+        return_offsets_mapping=True,
+    )
+    token_ids = list(encoding["input_ids"])
+    offsets = list(encoding["offset_mapping"])
+    # Build segment char-span lookup. Track the half-open span
+    # [seg_start, seg_end) of each segment and its is_content bit.
+    spans: list[tuple[int, int, bool]] = []
+    pos = 0
+    for text, is_content in segments:
+        spans.append((pos, pos + len(text), is_content))
+        pos += len(text)
+    total_len = pos
+    out: list[tuple[int, bool]] = []
+    last_is_content = spans[-1][2] if spans else False
+    for tok_id, (start, _end) in zip(token_ids, offsets):
+        if start >= total_len:
+            # Token's character offset is past every segment (shouldn't
+            # normally happen for add_special_tokens=False, but defensive
+            # against tokenizer-specific edge cases).
+            out.append((tok_id, last_is_content))
+            continue
+        # Find the segment that contains `start`. Segments are
+        # contiguous and ordered, so a linear scan is fine — the inner
+        # loop runs at most len(segments) times per token and segments
+        # is typically 2-3 in practice.
+        is_content = last_is_content
+        for seg_start, seg_end, seg_is_content in spans:
+            if seg_start <= start < seg_end:
+                is_content = seg_is_content
+                break
+        else:
+            # start == total_len handled above; the remaining case is
+            # an empty segment in the middle. Empty segments emit no
+            # characters, so no token can land in them; fall through to
+            # the last non-empty segment's bit.
+            pass
+        out.append((tok_id, is_content))
+    return out
 def reject_assistant_in_extension(new_messages: list[Message]) -> bool:
     """Return True if any message in ``new_messages`` is an assistant turn.

renderers 0.1.8.dev4__tar.gz → 0.1.8.dev26__tar.gz

renderers 0.1.8.dev4__tar.gz → 0.1.8.dev26__tar.gz