PyPI - renderers - Versions diffs - 0.1.8.dev1__tar.gz → 0.1.8.dev2__tar.gz - Mend

renderers 0.1.8.dev1__tar.gz → 0.1.8.dev2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (55) hide show

{renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/.github/workflows/publish.yml RENAMED Viewed

@@ -12,8 +12,16 @@ on:
       - "renderers-v*"
 jobs:
-  publish:
+  # Build (no OIDC) → publish (OIDC only). The build job runs uv build with
+  # contents: read only so a poisoned build-time dep cannot mint the OIDC
+  # token. The publish job has id-token: write and the pypi-prod environment
+  # but no source checkout — it only downloads the prebuilt artifact and runs
+  # the SHA-pinned pypa publish action.
+  build:
+    if: github.event_name == 'workflow_dispatch' || startsWith(github.ref, 'refs/tags/renderers-v')
     runs-on: ubuntu-latest
+    permissions:
+      contents: read
     steps:
       - name: Checkout tagged release (dispatch)
         if: github.event_name == 'workflow_dispatch'
@@ -28,8 +36,7 @@ jobs:
         with:
           fetch-depth: 0
-      - name: Resolve release tag
-        id: release
+      - name: Validate release tag
         env:
           EVENT_NAME: ${{ github.event_name }}
           PUSHED_REF: ${{ github.ref_name }}
@@ -53,14 +60,31 @@ jobs:
               ;;
           esac
-          echo "tag=$TAG" >> "$GITHUB_OUTPUT"
       - uses: astral-sh/setup-uv@v7
       - name: Build renderers
         run: uv build
+      - name: Upload dist artifacts
+        uses: actions/upload-artifact@v4
+        with:
+          name: dist
+          path: dist/
+          if-no-files-found: error
+          retention-days: 7
+  publish:
+    needs: build
+    runs-on: ubuntu-latest
+    environment: pypi-prod
+    permissions:
+      id-token: write
+    steps:
+      - name: Download dist artifacts
+        uses: actions/download-artifact@v4
+        with:
+          name: dist
+          path: dist/
       - name: Publish to PyPI
-        env:
-          PYPI_RENDERERS_TOKEN: ${{ secrets.PYPI_RENDERERS_TOKEN }}
-        run: uv publish --token "$PYPI_RENDERERS_TOKEN" dist/*
+        uses: pypa/gh-action-pypi-publish@cef221092ed1bacb1cc03d23a2d87d1d172e277b # v1.14.0

{renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/PKG-INFO RENAMED Viewed

@@ -1,10 +1,11 @@
 Metadata-Version: 2.4
 Name: renderers
-Version: 0.1.8.dev1
+Version: 0.1.8.dev2
 Summary: Chat template renderers — deterministic message-to-token conversion for LLM training
 License-Expression: Apache-2.0
 License-File: LICENSE
 Requires-Python: <3.14,>=3.10
+Requires-Dist: fastokens>=0.1.1
 Requires-Dist: jinja2
 Requires-Dist: numpy
 Requires-Dist: openai-harmony>=0.0.8

{renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/pyproject.toml RENAMED Viewed

@@ -24,6 +24,12 @@ dependencies = [
     # OpenAI's reference implementation keeps us byte-identical with vLLM
     # (which also uses it) and saves us mirroring a 330-line Jinja template.
     "openai-harmony>=0.0.8",
+    # Crusoe's Rust BPE tokenizer; ~10x faster encode vs HF's tokenizers.
+    # ``load_tokenizer`` patches it in by default for every supported model
+    # except a small denylist (DeepSeek-V3 family, MiniMax-M2 family). The
+    # patch is bracketed around ``from_pretrained``, so subsequent
+    # ``AutoTokenizer`` calls outside the renderers package stay vanilla.
+    "fastokens>=0.1.1",
 ]
 [tool.hatch.version]

{renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/renderers/_version.py RENAMED Viewed

@@ -18,7 +18,7 @@ version_tuple: tuple[int | str, ...]
 commit_id: str | None
 __commit_id__: str | None
-__version__ = version = '0.1.8.dev1'
-__version_tuple__ = version_tuple = (0, 1, 8, 'dev1')
+__version__ = version = '0.1.8.dev2'
+__version_tuple__ = version_tuple = (0, 1, 8, 'dev2')
 __commit_id__ = commit_id = None

{renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/renderers/base.py RENAMED Viewed

@@ -148,8 +148,26 @@ class RenderedTokens:
     """Result of rendering messages to tokens.
     Each token carries an index into the original message list so callers can
-    build per-token loss masks without re-rendering.  Tokens from structural
-    scaffolding (generation prompt, im_start/im_end wrapping) carry index -1.
+    build per-token loss masks without re-rendering. Tokens from structural
+    scaffolding the renderer adds outside any single message (e.g. the
+    trailing generation prompt) carry index ``-1``.
+    ``sampled_mask`` is a separate per-token signal: ``True`` if the model
+    would have produced this token at inference time (i.e. it appears in
+    the sampled completion), ``False`` if it is template-injected
+    scaffolding the model never emits (``<|im_start|>role\\n`` openers,
+    inter-turn ``\\n`` separators, system / user / tool content from
+    conversation history, etc.). This is distinct from
+    ``message_indices``: a token can belong to an assistant message
+    (``message_indices[k] >= 0``) and still be scaffolding the template
+    adds around the model's actual completion. SFT loss masks should AND
+    both: train on tokens whose role is trainable AND that the model
+    would actually sample.
+    Empty ``sampled_mask`` (``[]``) means the renderer doesn't provide
+    this signal — consumers should fall back to attribution-only
+    masking. ``DefaultRenderer`` leaves it empty because the Jinja
+    template is opaque; hand-coded renderers populate it.
     ``multi_modal_data`` is populated by multimodal renderers (e.g.
     ``Qwen3VLRenderer``) when image / video content parts are present;
@@ -158,6 +176,7 @@ class RenderedTokens:
     token_ids: list[int] = field(default_factory=list)
     message_indices: list[int] = field(default_factory=list)
+    sampled_mask: list[bool] = field(default_factory=list)
     multi_modal_data: "MultiModalData | None" = None
@@ -713,37 +732,108 @@ TRUSTED_REVISIONS: dict[str, str] = {
 }
-def load_tokenizer(model_name_or_path: str):
-    """Load a tokenizer with the renderers-package security policy.
+# Models for which ``fastokens`` is known to diverge from vanilla
+# ``transformers.AutoTokenizer`` and therefore must NOT be patched.
+# Empirical audit ran each entry of ``MODEL_RENDERER_MAP`` through both
+# backends; 31/35 passed byte-identical. The four below either fail to
+# load under fastokens (DeepSeek-V3 family — Metaspace pretokenizer not
+# yet implemented) or are kept defensively pending an upstream fastokens
+# fix (MiniMax-M2 family — see per-entry comments).
+FASTOKENS_INCOMPATIBLE: frozenset[str] = frozenset(
+    {
+        # fastokens 0.1.1: ``ValueError: pre-tokenizer error: unsupported
+        # pre-tokenizer type: Metaspace`` — DeepSeek's tokenizer uses
+        # SentencePiece-style Metaspace pretokenization which fastokens
+        # doesn't yet implement.
+        "deepseek-ai/DeepSeek-V3",
+        "deepseek-ai/DeepSeek-V3-Base",
+        # MiniMax: kept defensive pending upstream fastokens fix
+        # https://github.com/crusoecloud/fastokens/pull/32 — that PR
+        # removes a stray attribute leaked by ``unpatch_transformers``
+        # which steers MiniMax (declared ``tokenizer_class =
+        # 'GPT2Tokenizer'`` → slow→fast conversion path) down a different
+        # load path on subsequent vanilla loads. Once the upstream fix
+        # is released, these two entries can be dropped after re-audit.
+        "MiniMaxAI/MiniMax-M2",
+        "MiniMaxAI/MiniMax-M2.5",
+    }
+)
+def _patched_load(model_name_or_path: str, **kwargs):
+    """Run ``AutoTokenizer.from_pretrained`` with fastokens patched in
+    process-locally — patch around the load, unpatch right after.
+    fastokens captures the loaded backend on a per-tokenizer basis, so
+    after we unpatch the returned tokenizer object continues to use
+    fastokens for ``encode``/``decode`` while subsequent
+    ``AutoTokenizer.from_pretrained`` calls (outside our control) go
+    back to vanilla. This keeps the global side effect minimal.
+    """
+    import fastokens
+    from transformers import AutoTokenizer
+    fastokens.patch_transformers()
+    try:
+        return AutoTokenizer.from_pretrained(model_name_or_path, **kwargs)
+    finally:
+        fastokens.unpatch_transformers()
-    Default: ``trust_remote_code=False`` — the safe choice for every
-    model in ``MODEL_RENDERER_MAP`` *except* the Kimi-K2 family.
-    Models listed in ``TRUSTED_REVISIONS`` load with
-    ``trust_remote_code=True`` AND ``revision=<pinned sha>`` — required
-    because their tokenizer config has an ``auto_map.AutoTokenizer``
-    entry pointing at a repo-supplied Python class
-    (``tokenization_kimi.TikTokenTokenizer``). Pinning the revision
-    means transformers executes only the reviewed commit's code, not
-    whatever ``HEAD`` points at when the call fires.
+def load_tokenizer(
+    model_name_or_path: str,
+    *,
+    use_fastokens: bool = True,
+):
+    """Load a tokenizer with the renderers-package security + perf policy.
+    **Security** — default ``trust_remote_code=False``. Models listed in
+    ``TRUSTED_REVISIONS`` (Moonshot Kimi-K2 family) load with
+    ``trust_remote_code=True`` AND a pinned ``revision=<sha>`` so
+    transformers only executes the reviewed commit's tokenizer Python.
+    **Performance** — ``use_fastokens=True`` (default) routes the load
+    through ``fastokens.patch_transformers()`` so the resulting tokenizer
+    encodes ~10x faster than vanilla ``tokenizers``. The patch is
+    bracketed: it's applied before ``from_pretrained`` and removed
+    immediately after, so global ``AutoTokenizer.from_pretrained`` calls
+    elsewhere in the user's process are not affected.
+    Models in ``FASTOKENS_INCOMPATIBLE`` (DeepSeek-V3 family, MiniMax-M2
+    family) skip the patch — fastokens 0.1.1 either fails to load them
+    or produces token-divergent output. Pass ``use_fastokens=False`` to
+    force the vanilla backend for any other model.
     Unknown / fine-tuned model paths fall through to
-    ``trust_remote_code=False``. Callers who legitimately need to load
-    a custom-code tokenizer outside this allow-list should call
-    ``AutoTokenizer.from_pretrained`` themselves and pass the result to
-    ``create_renderer`` (which doesn't load tokenizers — only
-    ``create_renderer_pool`` does).
+    ``trust_remote_code=False`` and the patched-load fast path. If
+    fastokens raises during the patched load (e.g. an unknown
+    pre-tokenizer type), we automatically retry with the vanilla
+    backend and emit an INFO log.
     """
     from transformers import AutoTokenizer
+    kwargs: dict[str, Any] = {}
     revision = TRUSTED_REVISIONS.get(model_name_or_path)
     if revision is not None:
-        return AutoTokenizer.from_pretrained(
+        kwargs = {"trust_remote_code": True, "revision": revision}
+    else:
+        kwargs = {"trust_remote_code": False}
+    if not use_fastokens or model_name_or_path in FASTOKENS_INCOMPATIBLE:
+        return AutoTokenizer.from_pretrained(model_name_or_path, **kwargs)
+    try:
+        return _patched_load(model_name_or_path, **kwargs)
+    except Exception as exc:
+        logger.info(
+            "fastokens could not load %r (%s: %s); falling back to vanilla "
+            "AutoTokenizer. Add this model to FASTOKENS_INCOMPATIBLE in "
+            "renderers.base to suppress the retry.",
             model_name_or_path,
-            trust_remote_code=True,
-            revision=revision,
+            type(exc).__name__,
+            str(exc)[:160],
         )
-    return AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=False)
+        return AutoTokenizer.from_pretrained(model_name_or_path, **kwargs)
 def _populate_registry():
@@ -947,12 +1037,25 @@ def build_training_sample(
     Single render() call + message_indices → per-token mask.
     Replaces build_incremental_token_mask (O(N) renders → O(1)).
+    When the renderer populates ``rendered.sampled_mask``, the loss mask
+    is the AND of role-based attribution and the sampled signal: only
+    tokens the model would have produced at inference are trainable.
+    This keeps SFT byte-aligned with the RL trajectory mask (where the
+    prompt / completion split achieves the same effect structurally).
+    Renderers that don't populate ``sampled_mask`` (empty list) fall
+    back to attribution-only masking — every token attributed to a
+    trainable role is trained on, including template-injected
+    ``<|im_start|>role\\n`` openers.
     """
     rendered = renderer.render(messages, tools=tools)
+    has_sampled_info = len(rendered.sampled_mask) == len(rendered.token_ids)
     loss_mask: list[bool] = []
-    for msg_idx in rendered.message_indices:
+    for k, msg_idx in enumerate(rendered.message_indices):
         if msg_idx < 0:
             loss_mask.append(False)
+        elif has_sampled_info and not rendered.sampled_mask[k]:
+            loss_mask.append(False)
         else:
             loss_mask.append(role_to_mask(messages[msg_idx]))
     return rendered.token_ids, loss_mask

{renderers-0.1.8.dev1 → renderers-0.1.8.dev2}/renderers/deepseek_v3.py RENAMED Viewed

@@ -113,20 +113,23 @@ class DeepSeekV3Renderer:
         tokens: list[int] = []
         indices: list[int] = []
+        sampled: list[bool] = []
-        def emit_ids(ids: list[int], msg_idx: int) -> None:
+        def emit_ids(ids: list[int], msg_idx: int, *, is_sampled: bool) -> None:
             tokens.extend(ids)
             indices.extend([msg_idx] * len(ids))
+            sampled.extend([is_sampled] * len(ids))
-        def emit_special(token_id: int, msg_idx: int) -> None:
+        def emit_special(token_id: int, msg_idx: int, *, is_sampled: bool) -> None:
             tokens.append(token_id)
             indices.append(msg_idx)
+            sampled.append(is_sampled)
-        def emit_text(text: str, msg_idx: int) -> None:
-            emit_ids(self._encode(text), msg_idx)
+        def emit_text(text: str, msg_idx: int, *, is_sampled: bool) -> None:
+            emit_ids(self._encode(text), msg_idx, is_sampled=is_sampled)
         # ── 1. BOS token ─────────────────────────────────────────────
-        emit_special(self._bos, -1)
+        emit_special(self._bos, -1, is_sampled=False)
         # ── 2. Collect system messages at the start ───────────────────
         # All leading system messages are concatenated with "\n\n" and emitted
@@ -148,7 +151,7 @@ class DeepSeekV3Renderer:
         if sys_parts:
             # Attribute the concatenated system text to the first system message (index 0).
-            emit_text("\n\n".join(sys_parts), 0)
+            emit_text("\n\n".join(sys_parts), 0, is_sampled=False)
         # ── 3. Render non-system messages ─────────────────────────────
         num_messages = len(messages)
@@ -163,8 +166,8 @@ class DeepSeekV3Renderer:
                     content = "".join(
                         p.get("text", "") for p in content if isinstance(p, dict)
                     )
-                emit_special(self._user_token, i)
-                emit_text(str(content), i)
+                emit_special(self._user_token, i, is_sampled=False)
+                emit_text(str(content), i, is_sampled=False)
             elif role == "user":
                 content = msg.get("content") or ""
@@ -177,8 +180,8 @@ class DeepSeekV3Renderer:
                         else ""
                         for p in content
                     )
-                emit_special(self._user_token, i)
-                emit_text(str(content), i)
+                emit_special(self._user_token, i, is_sampled=False)
+                emit_text(str(content), i, is_sampled=False)
             elif role == "assistant":
                 self._render_assistant(
@@ -202,11 +205,13 @@ class DeepSeekV3Renderer:
             # Don't add <｜Assistant｜> after tool outputs — content flows directly.
             last_role = messages[-1]["role"] if messages else None
             if last_role != "tool":
-                emit_special(self._assistant_token, -1)
+                emit_special(self._assistant_token, -1, is_sampled=False)
             if self._enable_thinking:
-                emit_text("<think>\n", -1)
+                emit_text("<think>\n", -1, is_sampled=False)
-        return RenderedTokens(token_ids=tokens, message_indices=indices)
+        return RenderedTokens(
+            token_ids=tokens, message_indices=indices, sampled_mask=sampled
+        )
     def render_ids(
         self,
@@ -267,10 +272,20 @@ class DeepSeekV3Renderer:
         ext: list[int] = []
-        def emit_special(token_id: int, _msg_idx: int = -1) -> None:
+        # Bridge output is consumed as the next turn's prompt — the
+        # caller blanket-masks it via ``prompt_mask=[False]*N``, so we
+        # don't track sampled_mask here. Local helpers accept the kwarg
+        # for signature compatibility with ``_render_tool`` and ignore
+        # it; the returned ``RenderedTokens`` leaves ``sampled_mask``
+        # empty.
+        def emit_special(
+            token_id: int, _msg_idx: int = -1, *, is_sampled: bool = False
+        ) -> None:
             ext.append(token_id)
-        def emit_text(text: str, _msg_idx: int = -1) -> None:
+        def emit_text(
+            text: str, _msg_idx: int = -1, *, is_sampled: bool = False
+        ) -> None:
             ext.extend(self._encode(text))
         for i, msg in enumerate(new_messages):
@@ -354,17 +369,24 @@ class DeepSeekV3Renderer:
         tool_calls = msg.get("tool_calls") or []
+        # ``<｜Assistant｜>`` is template-injected scaffolding — at
+        # inference the chat template emits it as the generation prompt
+        # and the model never samples it. Marking it ``is_sampled=False``
+        # keeps the SFT loss mask aligned with what the model would
+        # actually have produced. When the previous message is a tool
+        # response, the template skips this token entirely (content
+        # flows directly out of ``<｜tool▁outputs▁end｜>``).
         if not prev_is_tool:
-            emit_special(self._assistant_token, msg_idx)
+            emit_special(self._assistant_token, msg_idx, is_sampled=False)
         if not tool_calls:
-            emit_text(content, msg_idx)
+            emit_text(content, msg_idx, is_sampled=True)
         else:
             # Emit any pre-tool-call content first.
-            emit_text(content, msg_idx)
+            emit_text(content, msg_idx, is_sampled=True)
             # Tool call section.
-            emit_special(self._tool_calls_begin, msg_idx)
+            emit_special(self._tool_calls_begin, msg_idx, is_sampled=True)
             for tc in tool_calls:
                 func = tc.get("function") or tc
                 name = func.get("name", "")
@@ -376,14 +398,17 @@ class DeepSeekV3Renderer:
                 )
                 # Format: <｜tool▁call▁begin｜>function<｜tool▁sep｜>{name}\n```json\n{args}\n```<｜tool▁call▁end｜>
                 # tool_sep is a special token; type ("function") and name+args are plain text.
-                emit_special(self._tool_call_begin, msg_idx)
-                emit_text("function", msg_idx)
-                emit_special(self._tool_sep, msg_idx)
-                emit_text(f"{name}\n```json\n{args_str}\n```", msg_idx)
-                emit_special(self._tool_call_end, msg_idx)
-            emit_special(self._tool_calls_end, msg_idx)
-        emit_special(self._eos, msg_idx)
+                emit_special(self._tool_call_begin, msg_idx, is_sampled=True)
+                emit_text("function", msg_idx, is_sampled=True)
+                emit_special(self._tool_sep, msg_idx, is_sampled=True)
+                emit_text(f"{name}\n```json\n{args_str}\n```", msg_idx, is_sampled=True)
+                emit_special(self._tool_call_end, msg_idx, is_sampled=True)
+            emit_special(self._tool_calls_end, msg_idx, is_sampled=True)
+        # ``<｜end▁of▁sentence｜>`` is the model's stop signal — it
+        # samples this to end its turn, so it is part of the sampled
+        # stream.
+        emit_special(self._eos, msg_idx, is_sampled=True)
     # ------------------------------------------------------------------
     # Tool (tool-response) rendering
@@ -397,6 +422,9 @@ class DeepSeekV3Renderer:
         emit_special,
         emit_text,
     ) -> None:
+        # Tool messages are conversation history injected by the runtime
+        # between assistant turns — the model never samples any of these
+        # tokens, so every emission is is_sampled=False.
         prev_is_tool = msg_idx > 0 and messages[msg_idx - 1]["role"] == "tool"
         next_is_tool = (
             msg_idx + 1 < len(messages) and messages[msg_idx + 1]["role"] == "tool"
@@ -407,11 +435,11 @@ class DeepSeekV3Renderer:
             content = "".join(p.get("text", "") for p in content if isinstance(p, dict))
         if not prev_is_tool:
-            emit_special(self._tool_outputs_begin, msg_idx)
+            emit_special(self._tool_outputs_begin, msg_idx, is_sampled=False)
-        emit_special(self._tool_output_begin, msg_idx)
-        emit_text(str(content), msg_idx)
-        emit_special(self._tool_output_end, msg_idx)
+        emit_special(self._tool_output_begin, msg_idx, is_sampled=False)
+        emit_text(str(content), msg_idx, is_sampled=False)
+        emit_special(self._tool_output_end, msg_idx, is_sampled=False)
         if not next_is_tool:
-            emit_special(self._tool_outputs_end, msg_idx)
+            emit_special(self._tool_outputs_end, msg_idx, is_sampled=False)

renderers 0.1.8.dev1__tar.gz → 0.1.8.dev2__tar.gz

renderers 0.1.8.dev1__tar.gz → 0.1.8.dev2__tar.gz