renderers 0.1.6__tar.gz → 0.1.8.dev0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {renderers-0.1.6 → renderers-0.1.8.dev0}/.github/workflows/publish.yml +4 -15
- {renderers-0.1.6 → renderers-0.1.8.dev0}/.gitignore +2 -0
- renderers-0.1.8.dev0/PKG-INFO +156 -0
- renderers-0.1.8.dev0/README.md +143 -0
- renderers-0.1.8.dev0/examples/README.md +72 -0
- renderers-0.1.8.dev0/examples/sglang/multiturn_generate_sglang.py +187 -0
- renderers-0.1.8.dev0/examples/tinker/multiturn_generate_tinker.py +179 -0
- renderers-0.1.8.dev0/examples/transformers/multiturn_generate_transformers.py +196 -0
- renderers-0.1.8.dev0/examples/vllm/multiturn_generate_vllm.py +185 -0
- {renderers-0.1.6 → renderers-0.1.8.dev0}/pyproject.toml +28 -2
- {renderers-0.1.6 → renderers-0.1.8.dev0}/renderers/__init__.py +23 -0
- renderers-0.1.8.dev0/renderers/_version.py +24 -0
- renderers-0.1.8.dev0/renderers/base.py +1021 -0
- renderers-0.1.8.dev0/renderers/client.py +335 -0
- {renderers-0.1.6 → renderers-0.1.8.dev0}/renderers/deepseek_v3.py +2 -2
- {renderers-0.1.6 → renderers-0.1.8.dev0}/renderers/default.py +1 -1
- {renderers-0.1.6 → renderers-0.1.8.dev0}/renderers/glm45.py +2 -2
- {renderers-0.1.6 → renderers-0.1.8.dev0}/renderers/glm5.py +2 -2
- {renderers-0.1.6 → renderers-0.1.8.dev0}/renderers/gpt_oss.py +2 -2
- {renderers-0.1.6 → renderers-0.1.8.dev0}/renderers/kimi_k2.py +2 -2
- {renderers-0.1.6 → renderers-0.1.8.dev0}/renderers/kimi_k25.py +246 -10
- {renderers-0.1.6 → renderers-0.1.8.dev0}/renderers/minimax_m2.py +2 -2
- {renderers-0.1.6 → renderers-0.1.8.dev0}/renderers/nemotron3.py +2 -2
- {renderers-0.1.6 → renderers-0.1.8.dev0}/renderers/parsers.py +1 -1
- {renderers-0.1.6 → renderers-0.1.8.dev0}/renderers/qwen3.py +2 -2
- {renderers-0.1.6 → renderers-0.1.8.dev0}/renderers/qwen35.py +316 -20
- renderers-0.1.8.dev0/renderers/qwen3_vl.py +726 -0
- {renderers-0.1.6 → renderers-0.1.8.dev0}/tests/conftest.py +3 -3
- {renderers-0.1.6 → renderers-0.1.8.dev0}/tests/test_bridge.py +10 -9
- {renderers-0.1.6 → renderers-0.1.8.dev0}/tests/test_client.py +96 -3
- renderers-0.1.8.dev0/tests/test_load_tokenizer.py +118 -0
- {renderers-0.1.6 → renderers-0.1.8.dev0}/tests/test_message_indices.py +2 -5
- renderers-0.1.8.dev0/tests/test_multimodal.py +528 -0
- {renderers-0.1.6 → renderers-0.1.8.dev0}/tests/test_parse_response.py +2 -4
- {renderers-0.1.6 → renderers-0.1.8.dev0}/tests/test_parsers.py +9 -9
- {renderers-0.1.6 → renderers-0.1.8.dev0}/tests/test_preserve_thinking.py +6 -10
- renderers-0.1.8.dev0/tests/test_qwen35_size_coverage.py +166 -0
- {renderers-0.1.6 → renderers-0.1.8.dev0}/tests/test_render_ids.py +4 -10
- {renderers-0.1.6 → renderers-0.1.8.dev0}/tests/test_roundtrip.py +3 -4
- {renderers-0.1.6 → renderers-0.1.8.dev0}/uv.lock +449 -2
- renderers-0.1.6/PKG-INFO +0 -273
- renderers-0.1.6/README.md +0 -260
- renderers-0.1.6/renderers/base.py +0 -624
- renderers-0.1.6/renderers/client.py +0 -205
- renderers-0.1.6/renderers/qwen3_vl.py +0 -341
- {renderers-0.1.6 → renderers-0.1.8.dev0}/.github/workflows/style.yml +0 -0
- {renderers-0.1.6 → renderers-0.1.8.dev0}/.github/workflows/test.yml +0 -0
- {renderers-0.1.6 → renderers-0.1.8.dev0}/.pre-commit-config.yaml +0 -0
- {renderers-0.1.6 → renderers-0.1.8.dev0}/renderers/parsing.py +0 -0
- {renderers-0.1.6 → renderers-0.1.8.dev0}/renderers/qwen36.py +0 -0
- {renderers-0.1.6 → renderers-0.1.8.dev0}/tests/test_build_helpers.py +0 -0
- {renderers-0.1.6 → renderers-0.1.8.dev0}/tests/test_gpt_oss_harmony_parity.py +0 -0
- {renderers-0.1.6 → renderers-0.1.8.dev0}/tests/test_incremental.py +0 -0
- {renderers-0.1.6 → renderers-0.1.8.dev0}/tests/test_parse_response_robustness.py +0 -0
|
@@ -41,6 +41,10 @@ jobs:
|
|
|
41
41
|
TAG="$PUSHED_REF"
|
|
42
42
|
fi
|
|
43
43
|
|
|
44
|
+
# The package version is derived from this tag by hatch-vcs
|
|
45
|
+
# at build time (see [tool.hatch.version] in pyproject.toml).
|
|
46
|
+
# We only need to validate the tag shape — there's no
|
|
47
|
+
# ``project.version`` field to cross-check anymore.
|
|
44
48
|
case "$TAG" in
|
|
45
49
|
renderers-v*) ;;
|
|
46
50
|
*)
|
|
@@ -49,21 +53,6 @@ jobs:
|
|
|
49
53
|
;;
|
|
50
54
|
esac
|
|
51
55
|
|
|
52
|
-
VERSION="${TAG#renderers-v}"
|
|
53
|
-
FILE_VERSION=$(python - <<'PY'
|
|
54
|
-
import tomllib
|
|
55
|
-
from pathlib import Path
|
|
56
|
-
with Path('pyproject.toml').open('rb') as f:
|
|
57
|
-
data = tomllib.load(f)
|
|
58
|
-
print(data['project']['version'])
|
|
59
|
-
PY
|
|
60
|
-
)
|
|
61
|
-
|
|
62
|
-
if [ "$FILE_VERSION" != "$VERSION" ]; then
|
|
63
|
-
echo "Version mismatch: tag requests '$VERSION' but pyproject.toml defines '$FILE_VERSION'" >&2
|
|
64
|
-
exit 1
|
|
65
|
-
fi
|
|
66
|
-
|
|
67
56
|
echo "tag=$TAG" >> "$GITHUB_OUTPUT"
|
|
68
57
|
|
|
69
58
|
- uses: astral-sh/setup-uv@v7
|
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: renderers
|
|
3
|
+
Version: 0.1.8.dev0
|
|
4
|
+
Summary: Chat template renderers — deterministic message-to-token conversion for LLM training
|
|
5
|
+
Requires-Python: <3.14,>=3.10
|
|
6
|
+
Requires-Dist: jinja2
|
|
7
|
+
Requires-Dist: numpy
|
|
8
|
+
Requires-Dist: openai-harmony>=0.0.8
|
|
9
|
+
Requires-Dist: openai>=1.108.1
|
|
10
|
+
Requires-Dist: tiktoken
|
|
11
|
+
Requires-Dist: transformers>=4.50.0
|
|
12
|
+
Description-Content-Type: text/markdown
|
|
13
|
+
|
|
14
|
+
# renderers
|
|
15
|
+
|
|
16
|
+
Programmable chat templates for LLM training and inference. A renderer turns a model's chat template into a Python object that can render messages → token ids, parse completion ids → structured assistant messages, and extend a multi-turn rollout without re-rendering model-sampled history.
|
|
17
|
+
|
|
18
|
+
Standalone on PyPI, and portable across training and inference stacks (transformers, vLLM, SGLang, Tinker). Initially developed for RL training with [verifiers](https://github.com/PrimeIntellect-ai/verifiers) and `prime-rl` at Prime Intellect.
|
|
19
|
+
|
|
20
|
+
## Install
|
|
21
|
+
|
|
22
|
+
```bash
|
|
23
|
+
uv add renderers
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
## At a glance
|
|
27
|
+
|
|
28
|
+
```python
|
|
29
|
+
from transformers import AutoTokenizer
|
|
30
|
+
from renderers import create_renderer
|
|
31
|
+
|
|
32
|
+
tok = AutoTokenizer.from_pretrained("Qwen/Qwen3-8B")
|
|
33
|
+
r = create_renderer(tok, renderer="auto") # → Qwen3Renderer
|
|
34
|
+
|
|
35
|
+
prompt_ids = r.render_ids(
|
|
36
|
+
[{"role": "user", "content": "hi"}],
|
|
37
|
+
add_generation_prompt=True,
|
|
38
|
+
)
|
|
39
|
+
# Feed prompt_ids to a Token-In, Token-Out endpoint.
|
|
40
|
+
# It returns completion_ids sampled by the model.
|
|
41
|
+
|
|
42
|
+
parsed = r.parse_response(completion_ids)
|
|
43
|
+
# ParsedResponse(content=..., reasoning_content=..., tool_calls=...)
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
For the next turn, extend the previous sampled stream instead of re-rendering history:
|
|
47
|
+
|
|
48
|
+
```python
|
|
49
|
+
next_prompt_ids = r.bridge_to_next_turn(
|
|
50
|
+
previous_prompt_ids=prompt_ids,
|
|
51
|
+
previous_completion_ids=completion_ids,
|
|
52
|
+
new_messages=[{"role": "tool", "content": "..."}],
|
|
53
|
+
)
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
Hand-coded renderers ship for `qwen3`, `qwen3-vl`, `qwen3.5`, `qwen3.6`, `glm-5`, `glm-5.1`, `glm-4.5`, `minimax-m2`, `deepseek-v3`, `kimi-k2`, `kimi-k2.5`, `nemotron-3`, `gpt-oss`. Anything else falls back to `DefaultRenderer`, a generic `apply_chat_template` wrapper.
|
|
57
|
+
|
|
58
|
+
## API
|
|
59
|
+
|
|
60
|
+
```python
|
|
61
|
+
class Renderer(Protocol):
|
|
62
|
+
def render(messages, *, tools=None, add_generation_prompt=False) -> RenderedTokens: ...
|
|
63
|
+
def render_ids(messages, *, tools=None, add_generation_prompt=False) -> list[int]: ...
|
|
64
|
+
def parse_response(token_ids) -> ParsedResponse: ...
|
|
65
|
+
def get_stop_token_ids() -> list[int]: ...
|
|
66
|
+
def bridge_to_next_turn(prev_prompt_ids, prev_completion_ids, new_messages, *, tools=None) -> list[int] | None: ...
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
- `RenderedTokens` carries `token_ids` **and** `message_indices` — one entry per token attributing each to its source message (`-1` for structural scaffolding). Lets `build_training_sample` build a per-token loss mask in one render.
|
|
70
|
+
- `ParsedResponse` is `(content, reasoning_content, tool_calls)`. `parse_response` builds it by scanning token ids for special-token boundaries (e.g. id `151657` for `<tool_call>` on Qwen3) — a literal `"<tool_call>"` in user content tokenizes to ordinary text ids and never matches.
|
|
71
|
+
- Round-trip: rendering `[user, assistant(content, reasoning, tool_calls)]`, slicing the assistant completion, and feeding it through `parse_response` returns an equivalent structured message. Tested per-renderer in `tests/test_roundtrip.py`.
|
|
72
|
+
|
|
73
|
+
### `bridge_to_next_turn` (the core contract)
|
|
74
|
+
|
|
75
|
+
Given `(prev_prompt_ids, prev_completion_ids)` and new environment messages, return ids for the next turn's prompt such that the result starts with `prev_prompt_ids + prev_completion_ids` byte-for-byte and continues with the new messages plus the next assistant opener. If that cannot be proven safe, return `None` and the caller falls back to a full render.
|
|
76
|
+
|
|
77
|
+
Each hand-coded bridge:
|
|
78
|
+
1. Anchors at the previous turn's canonical close token. On clean stops it's already in `prev_completion_ids`. On truncation, the renderer synthesizes the close as non-loss prompt context.
|
|
79
|
+
2. Refuses assistant content in `new_messages` — re-rendering sampled tokens would replace them with canonical template bytes.
|
|
80
|
+
3. Renders only the new messages in the framing the model family expects.
|
|
81
|
+
|
|
82
|
+
`DefaultRenderer.bridge_to_next_turn` returns `None` unconditionally — the template's close is unknown, so the contract can't be proven.
|
|
83
|
+
|
|
84
|
+
### Picking a renderer
|
|
85
|
+
|
|
86
|
+
```python
|
|
87
|
+
r = create_renderer(tok, renderer="auto")
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
Auto-detect matches `tokenizer.name_or_path` against `MODEL_RENDERER_MAP` by **exact match**. Prefix matching is intentionally off — same architecture can ship different chat templates (base vs instruct, fine-tune renames). Fine-tunes must pass `renderer=<name>` explicitly; unknown names fall back to `DefaultRenderer`.
|
|
91
|
+
|
|
92
|
+
### Pools
|
|
93
|
+
|
|
94
|
+
```python
|
|
95
|
+
from renderers import create_renderer_pool
|
|
96
|
+
|
|
97
|
+
pool = create_renderer_pool("Qwen/Qwen3-8B", renderer="auto", size=16)
|
|
98
|
+
with pool.checkout() as r:
|
|
99
|
+
ids = r.render_ids(messages)
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
Each slot owns its own tokenizer copy. Construction fans out across a thread pool so a 32-slot pool doesn't serially eat ~10–15s of `from_pretrained` calls at startup.
|
|
103
|
+
|
|
104
|
+
## Why use a renderer
|
|
105
|
+
|
|
106
|
+
For RL the trainer must see the exact token ids the sampler saw. The standard alternative — let the inference engine apply the chat template, parse tool calls, parse reasoning, and re-render full history every turn — silently breaks token identity. These are the failure modes a renderer's `bridge_to_next_turn` sidesteps by never re-rendering prior turns:
|
|
107
|
+
|
|
108
|
+
- **Boolean round-trip.** Engine emits `false`; client parses to Python `bool(False)`; `apply_chat_template` re-renders via `str(False)` → `"False"`. Capital F. Reproducible on Qwen3.5-35B-A3B + mini-swe-agent-plus at ~50% break rate per rollout.
|
|
109
|
+
- **BPE retokenization drift.** The same substring tokenizes differently depending on neighbouring bytes. `json` + `p` + `enderer` (3 tokens) vs `jsonp` + `enderer` (2 tokens) when whitespace shifts by one character. Every subsequent token is shifted from there on.
|
|
110
|
+
- **Tool-call XML drift.** The engine emits a no-arg call with a stylistic empty `</parameter>`; the Jinja re-render of the reconstructed dict drops it. Extension property broken at every such call.
|
|
111
|
+
- **Thinking stripped from non-latest assistants.** Some templates strip `<think>…</think>` blocks from prior assistant turns when re-rendering. The recorded stream has the thinking; the next prompt does not.
|
|
112
|
+
- **Max-seq-len truncation zeroing the anchor.** Client-side `max_seq_len` enforcement zeros `completion_ids` when `prompt_len > max_seq_len`. The bridge anchor is empty, falling back to full re-render — triggering every mode above.
|
|
113
|
+
- **Scaffold-level history rewriting.** Some agent scaffolds (e.g. opencode's `experimental_repairToolCall`) rewrite tool calls before sending them back as history. The next turn's prompt contains a tool call the model never emitted. *A renderer cannot fix this — the drift happens before rendering.*
|
|
114
|
+
|
|
115
|
+
Empirical delta on Qwen3.5-35B-A3B + mini-swe-agent-plus, step 0:
|
|
116
|
+
|
|
117
|
+
| client path | breaks | training samples from 64 rollouts |
|
|
118
|
+
| -------------------------------------- | ------ | --------------------------------- |
|
|
119
|
+
| `apply_chat_template` (full re-render) | 32 | 77 |
|
|
120
|
+
| renderers `bridge_to_next_turn` | 0 | 64 |
|
|
121
|
+
|
|
122
|
+
Each break fragments a rollout into multiple training samples — every fragment re-encodes its prefix, inflating compute roughly linearly with the number of breaks.
|
|
123
|
+
|
|
124
|
+
## Compaction overrides
|
|
125
|
+
|
|
126
|
+
`create_renderer` and `create_renderer_pool` accept two constructor-only flags:
|
|
127
|
+
|
|
128
|
+
```python
|
|
129
|
+
preserve_all_thinking: bool = False
|
|
130
|
+
preserve_thinking_between_tool_calls: bool = False
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
Defaults preserve byte-identity with the model's chat template. Flipping a flag at construction restores `reasoning_content` the template would otherwise drop:
|
|
134
|
+
|
|
135
|
+
- `preserve_all_thinking=True` — every past assistant's reasoning is kept.
|
|
136
|
+
- `preserve_thinking_between_tool_calls=True` — reasoning is kept on assistants in the in-flight tool cycle (no-op for current renderers; reserved for future templates that drop it).
|
|
137
|
+
|
|
138
|
+
The canonical use case is **compaction**. Injecting a `user` turn like *"summarize the work so far"* puts every prior assistant in a "past cycle", so template-default rules drop their `reasoning_content` before the summarizer sees it. Build the renderer with `preserve_all_thinking=True` to keep reasoning visible end-to-end on those flows. Both flags only ever *add* tokens vs the template default.
|
|
139
|
+
|
|
140
|
+
## `DefaultRenderer`
|
|
141
|
+
|
|
142
|
+
Fallback for unsupported models. Wraps `apply_chat_template` and accepts `tool_parser` / `reasoning_parser` kwargs (vLLM convention). `bridge_to_next_turn` returns `None` because the template's close is unknown, so multi-turn rollouts fall back to full re-render. Implementing a hand-coded renderer is a few hundred lines of Python (`render_ids` + `parse_response` + `bridge_to_next_turn`) and is the only path that closes the failure modes above by construction.
|
|
143
|
+
|
|
144
|
+
## Roadmap
|
|
145
|
+
|
|
146
|
+
- **VLM support.** `ContentPart` is text-only today; `Qwen3VLRenderer` ships only because Qwen3-VL's text-only chat template differs from Qwen3's. Plan: add `ImagePart` / `VideoPart`, multimodal bridges, validate against a Qwen3-VL RL run.
|
|
147
|
+
- **Patched chat templates.** Some shipped templates re-tokenize history, normalize JSON, or auto-strip thinking — each breaks the extension property. Plan: a `use_patched` opt-in per renderer that renders the same surface form while avoiding known-bad patterns.
|
|
148
|
+
|
|
149
|
+
## Testing
|
|
150
|
+
|
|
151
|
+
```bash
|
|
152
|
+
uv sync --group dev
|
|
153
|
+
uv run pytest
|
|
154
|
+
```
|
|
155
|
+
|
|
156
|
+
Round-trip parity (render → parse → original) and token-level parity against `apply_chat_template` are tested per renderer. End-to-end validation runs against Reverse-Text, Wordle, OpenCode-Math, and RLM-SWE environments.
|
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
# renderers
|
|
2
|
+
|
|
3
|
+
Programmable chat templates for LLM training and inference. A renderer turns a model's chat template into a Python object that can render messages → token ids, parse completion ids → structured assistant messages, and extend a multi-turn rollout without re-rendering model-sampled history.
|
|
4
|
+
|
|
5
|
+
Standalone on PyPI, and portable across training and inference stacks (transformers, vLLM, SGLang, Tinker). Initially developed for RL training with [verifiers](https://github.com/PrimeIntellect-ai/verifiers) and `prime-rl` at Prime Intellect.
|
|
6
|
+
|
|
7
|
+
## Install
|
|
8
|
+
|
|
9
|
+
```bash
|
|
10
|
+
uv add renderers
|
|
11
|
+
```
|
|
12
|
+
|
|
13
|
+
## At a glance
|
|
14
|
+
|
|
15
|
+
```python
|
|
16
|
+
from transformers import AutoTokenizer
|
|
17
|
+
from renderers import create_renderer
|
|
18
|
+
|
|
19
|
+
tok = AutoTokenizer.from_pretrained("Qwen/Qwen3-8B")
|
|
20
|
+
r = create_renderer(tok, renderer="auto") # → Qwen3Renderer
|
|
21
|
+
|
|
22
|
+
prompt_ids = r.render_ids(
|
|
23
|
+
[{"role": "user", "content": "hi"}],
|
|
24
|
+
add_generation_prompt=True,
|
|
25
|
+
)
|
|
26
|
+
# Feed prompt_ids to a Token-In, Token-Out endpoint.
|
|
27
|
+
# It returns completion_ids sampled by the model.
|
|
28
|
+
|
|
29
|
+
parsed = r.parse_response(completion_ids)
|
|
30
|
+
# ParsedResponse(content=..., reasoning_content=..., tool_calls=...)
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
For the next turn, extend the previous sampled stream instead of re-rendering history:
|
|
34
|
+
|
|
35
|
+
```python
|
|
36
|
+
next_prompt_ids = r.bridge_to_next_turn(
|
|
37
|
+
previous_prompt_ids=prompt_ids,
|
|
38
|
+
previous_completion_ids=completion_ids,
|
|
39
|
+
new_messages=[{"role": "tool", "content": "..."}],
|
|
40
|
+
)
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
Hand-coded renderers ship for `qwen3`, `qwen3-vl`, `qwen3.5`, `qwen3.6`, `glm-5`, `glm-5.1`, `glm-4.5`, `minimax-m2`, `deepseek-v3`, `kimi-k2`, `kimi-k2.5`, `nemotron-3`, `gpt-oss`. Anything else falls back to `DefaultRenderer`, a generic `apply_chat_template` wrapper.
|
|
44
|
+
|
|
45
|
+
## API
|
|
46
|
+
|
|
47
|
+
```python
|
|
48
|
+
class Renderer(Protocol):
|
|
49
|
+
def render(messages, *, tools=None, add_generation_prompt=False) -> RenderedTokens: ...
|
|
50
|
+
def render_ids(messages, *, tools=None, add_generation_prompt=False) -> list[int]: ...
|
|
51
|
+
def parse_response(token_ids) -> ParsedResponse: ...
|
|
52
|
+
def get_stop_token_ids() -> list[int]: ...
|
|
53
|
+
def bridge_to_next_turn(prev_prompt_ids, prev_completion_ids, new_messages, *, tools=None) -> list[int] | None: ...
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
- `RenderedTokens` carries `token_ids` **and** `message_indices` — one entry per token attributing each to its source message (`-1` for structural scaffolding). Lets `build_training_sample` build a per-token loss mask in one render.
|
|
57
|
+
- `ParsedResponse` is `(content, reasoning_content, tool_calls)`. `parse_response` builds it by scanning token ids for special-token boundaries (e.g. id `151657` for `<tool_call>` on Qwen3) — a literal `"<tool_call>"` in user content tokenizes to ordinary text ids and never matches.
|
|
58
|
+
- Round-trip: rendering `[user, assistant(content, reasoning, tool_calls)]`, slicing the assistant completion, and feeding it through `parse_response` returns an equivalent structured message. Tested per-renderer in `tests/test_roundtrip.py`.
|
|
59
|
+
|
|
60
|
+
### `bridge_to_next_turn` (the core contract)
|
|
61
|
+
|
|
62
|
+
Given `(prev_prompt_ids, prev_completion_ids)` and new environment messages, return ids for the next turn's prompt such that the result starts with `prev_prompt_ids + prev_completion_ids` byte-for-byte and continues with the new messages plus the next assistant opener. If that cannot be proven safe, return `None` and the caller falls back to a full render.
|
|
63
|
+
|
|
64
|
+
Each hand-coded bridge:
|
|
65
|
+
1. Anchors at the previous turn's canonical close token. On clean stops it's already in `prev_completion_ids`. On truncation, the renderer synthesizes the close as non-loss prompt context.
|
|
66
|
+
2. Refuses assistant content in `new_messages` — re-rendering sampled tokens would replace them with canonical template bytes.
|
|
67
|
+
3. Renders only the new messages in the framing the model family expects.
|
|
68
|
+
|
|
69
|
+
`DefaultRenderer.bridge_to_next_turn` returns `None` unconditionally — the template's close is unknown, so the contract can't be proven.
|
|
70
|
+
|
|
71
|
+
### Picking a renderer
|
|
72
|
+
|
|
73
|
+
```python
|
|
74
|
+
r = create_renderer(tok, renderer="auto")
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
Auto-detect matches `tokenizer.name_or_path` against `MODEL_RENDERER_MAP` by **exact match**. Prefix matching is intentionally off — same architecture can ship different chat templates (base vs instruct, fine-tune renames). Fine-tunes must pass `renderer=<name>` explicitly; unknown names fall back to `DefaultRenderer`.
|
|
78
|
+
|
|
79
|
+
### Pools
|
|
80
|
+
|
|
81
|
+
```python
|
|
82
|
+
from renderers import create_renderer_pool
|
|
83
|
+
|
|
84
|
+
pool = create_renderer_pool("Qwen/Qwen3-8B", renderer="auto", size=16)
|
|
85
|
+
with pool.checkout() as r:
|
|
86
|
+
ids = r.render_ids(messages)
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
Each slot owns its own tokenizer copy. Construction fans out across a thread pool so a 32-slot pool doesn't serially eat ~10–15s of `from_pretrained` calls at startup.
|
|
90
|
+
|
|
91
|
+
## Why use a renderer
|
|
92
|
+
|
|
93
|
+
For RL the trainer must see the exact token ids the sampler saw. The standard alternative — let the inference engine apply the chat template, parse tool calls, parse reasoning, and re-render full history every turn — silently breaks token identity. These are the failure modes a renderer's `bridge_to_next_turn` sidesteps by never re-rendering prior turns:
|
|
94
|
+
|
|
95
|
+
- **Boolean round-trip.** Engine emits `false`; client parses to Python `bool(False)`; `apply_chat_template` re-renders via `str(False)` → `"False"`. Capital F. Reproducible on Qwen3.5-35B-A3B + mini-swe-agent-plus at ~50% break rate per rollout.
|
|
96
|
+
- **BPE retokenization drift.** The same substring tokenizes differently depending on neighbouring bytes. `json` + `p` + `enderer` (3 tokens) vs `jsonp` + `enderer` (2 tokens) when whitespace shifts by one character. Every subsequent token is shifted from there on.
|
|
97
|
+
- **Tool-call XML drift.** The engine emits a no-arg call with a stylistic empty `</parameter>`; the Jinja re-render of the reconstructed dict drops it. Extension property broken at every such call.
|
|
98
|
+
- **Thinking stripped from non-latest assistants.** Some templates strip `<think>…</think>` blocks from prior assistant turns when re-rendering. The recorded stream has the thinking; the next prompt does not.
|
|
99
|
+
- **Max-seq-len truncation zeroing the anchor.** Client-side `max_seq_len` enforcement zeros `completion_ids` when `prompt_len > max_seq_len`. The bridge anchor is empty, falling back to full re-render — triggering every mode above.
|
|
100
|
+
- **Scaffold-level history rewriting.** Some agent scaffolds (e.g. opencode's `experimental_repairToolCall`) rewrite tool calls before sending them back as history. The next turn's prompt contains a tool call the model never emitted. *A renderer cannot fix this — the drift happens before rendering.*
|
|
101
|
+
|
|
102
|
+
Empirical delta on Qwen3.5-35B-A3B + mini-swe-agent-plus, step 0:
|
|
103
|
+
|
|
104
|
+
| client path | breaks | training samples from 64 rollouts |
|
|
105
|
+
| -------------------------------------- | ------ | --------------------------------- |
|
|
106
|
+
| `apply_chat_template` (full re-render) | 32 | 77 |
|
|
107
|
+
| renderers `bridge_to_next_turn` | 0 | 64 |
|
|
108
|
+
|
|
109
|
+
Each break fragments a rollout into multiple training samples — every fragment re-encodes its prefix, inflating compute roughly linearly with the number of breaks.
|
|
110
|
+
|
|
111
|
+
## Compaction overrides
|
|
112
|
+
|
|
113
|
+
`create_renderer` and `create_renderer_pool` accept two constructor-only flags:
|
|
114
|
+
|
|
115
|
+
```python
|
|
116
|
+
preserve_all_thinking: bool = False
|
|
117
|
+
preserve_thinking_between_tool_calls: bool = False
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
Defaults preserve byte-identity with the model's chat template. Flipping a flag at construction restores `reasoning_content` the template would otherwise drop:
|
|
121
|
+
|
|
122
|
+
- `preserve_all_thinking=True` — every past assistant's reasoning is kept.
|
|
123
|
+
- `preserve_thinking_between_tool_calls=True` — reasoning is kept on assistants in the in-flight tool cycle (no-op for current renderers; reserved for future templates that drop it).
|
|
124
|
+
|
|
125
|
+
The canonical use case is **compaction**. Injecting a `user` turn like *"summarize the work so far"* puts every prior assistant in a "past cycle", so template-default rules drop their `reasoning_content` before the summarizer sees it. Build the renderer with `preserve_all_thinking=True` to keep reasoning visible end-to-end on those flows. Both flags only ever *add* tokens vs the template default.
|
|
126
|
+
|
|
127
|
+
## `DefaultRenderer`
|
|
128
|
+
|
|
129
|
+
Fallback for unsupported models. Wraps `apply_chat_template` and accepts `tool_parser` / `reasoning_parser` kwargs (vLLM convention). `bridge_to_next_turn` returns `None` because the template's close is unknown, so multi-turn rollouts fall back to full re-render. Implementing a hand-coded renderer is a few hundred lines of Python (`render_ids` + `parse_response` + `bridge_to_next_turn`) and is the only path that closes the failure modes above by construction.
|
|
130
|
+
|
|
131
|
+
## Roadmap
|
|
132
|
+
|
|
133
|
+
- **VLM support.** `ContentPart` is text-only today; `Qwen3VLRenderer` ships only because Qwen3-VL's text-only chat template differs from Qwen3's. Plan: add `ImagePart` / `VideoPart`, multimodal bridges, validate against a Qwen3-VL RL run.
|
|
134
|
+
- **Patched chat templates.** Some shipped templates re-tokenize history, normalize JSON, or auto-strip thinking — each breaks the extension property. Plan: a `use_patched` opt-in per renderer that renders the same surface form while avoiding known-bad patterns.
|
|
135
|
+
|
|
136
|
+
## Testing
|
|
137
|
+
|
|
138
|
+
```bash
|
|
139
|
+
uv sync --group dev
|
|
140
|
+
uv run pytest
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
Round-trip parity (render → parse → original) and token-level parity against `apply_chat_template` are tested per renderer. End-to-end validation runs against Reverse-Text, Wordle, OpenCode-Math, and RLM-SWE environments.
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
# Offline Renderer Inference Examples
|
|
2
|
+
|
|
3
|
+
Each recipe keeps chat templating in `renderers` and sends token IDs to the
|
|
4
|
+
backend:
|
|
5
|
+
|
|
6
|
+
1. Load a Hugging Face tokenizer.
|
|
7
|
+
2. Build a model-specific `Renderer`.
|
|
8
|
+
3. Render chat messages to prompt token IDs locally.
|
|
9
|
+
4. Pass token IDs directly to an offline inference engine.
|
|
10
|
+
5. Parse completion token IDs with the same renderer.
|
|
11
|
+
6. Bridge the next turn without re-rendering prior assistant output.
|
|
12
|
+
|
|
13
|
+
The scripts use PEP 723 `uv` headers, so backend dependencies stay local to the
|
|
14
|
+
recipe and do not touch the repo `uv.lock`.
|
|
15
|
+
|
|
16
|
+
## vLLM Multi-Turn Recipe
|
|
17
|
+
|
|
18
|
+
```bash
|
|
19
|
+
CUDA_VISIBLE_DEVICES=0 uv run --script examples/vllm/multiturn_generate_vllm.py
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
The vLLM script targets `vllm>=0.20` and uses `prompt_token_ids`, so vLLM
|
|
23
|
+
does not apply a chat template.
|
|
24
|
+
|
|
25
|
+
## SGLang Multi-Turn Recipe
|
|
26
|
+
|
|
27
|
+
```bash
|
|
28
|
+
CUDA_VISIBLE_DEVICES=1 uv run --script examples/sglang/multiturn_generate_sglang.py
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
The SGLang script uses `input_ids`, so SGLang does not apply a chat template.
|
|
32
|
+
It leaves `openai-harmony` at SGLang's pinned version for dependency resolution.
|
|
33
|
+
|
|
34
|
+
## Transformers Multi-Turn Recipe
|
|
35
|
+
|
|
36
|
+
```bash
|
|
37
|
+
CUDA_VISIBLE_DEVICES=0 uv run --script examples/transformers/multiturn_generate_transformers.py
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
The Transformers script calls `generate()` with `input_ids`, so Transformers
|
|
41
|
+
does not apply a chat template.
|
|
42
|
+
|
|
43
|
+
## Tinker Multi-Turn Recipe
|
|
44
|
+
|
|
45
|
+
```bash
|
|
46
|
+
TINKER_API_KEY=... uv run --script examples/tinker/multiturn_generate_tinker.py
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
The Tinker script sends renderer-produced token IDs as `ModelInput` to the
|
|
50
|
+
remote sampling API, so Tinker does not apply a chat template.
|
|
51
|
+
|
|
52
|
+
## Two-GPU Validation
|
|
53
|
+
|
|
54
|
+
Run the recipes in parallel, one backend per GPU:
|
|
55
|
+
|
|
56
|
+
```bash
|
|
57
|
+
CUDA_VISIBLE_DEVICES=0 uv run --script examples/vllm/multiturn_generate_vllm.py \
|
|
58
|
+
--max-new-tokens 512 &
|
|
59
|
+
|
|
60
|
+
CUDA_VISIBLE_DEVICES=1 uv run --script examples/sglang/multiturn_generate_sglang.py \
|
|
61
|
+
--max-new-tokens 512 &
|
|
62
|
+
|
|
63
|
+
wait
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
Each script runs `Qwen/Qwen3.5-4B` with `enable_thinking=True` and `False`, then
|
|
67
|
+
`openai/gpt-oss-20b`.
|
|
68
|
+
|
|
69
|
+
## Multimodal Note
|
|
70
|
+
|
|
71
|
+
Renderers are text-only today. For image/video demos, use the backend's message
|
|
72
|
+
or prompt path until renderers grow multimodal placeholder support.
|
|
@@ -0,0 +1,187 @@
|
|
|
1
|
+
#!/usr/bin/env -S uv run --script
|
|
2
|
+
# /// script
|
|
3
|
+
# requires-python = ">=3.10,<3.14"
|
|
4
|
+
# dependencies = [
|
|
5
|
+
# "renderers>=0.1.6",
|
|
6
|
+
# "sglang==0.5.10.post1",
|
|
7
|
+
# "flash-attn-4>=4.0.0b4",
|
|
8
|
+
# "transformers>=5.3.0",
|
|
9
|
+
# "openai-harmony==0.0.4",
|
|
10
|
+
# "openai>=1.108.1",
|
|
11
|
+
# "tiktoken",
|
|
12
|
+
# "jinja2",
|
|
13
|
+
# "numpy",
|
|
14
|
+
# ]
|
|
15
|
+
# ///
|
|
16
|
+
"""SGLang offline generation from renderer-owned prompt token IDs."""
|
|
17
|
+
|
|
18
|
+
from __future__ import annotations
|
|
19
|
+
|
|
20
|
+
import argparse
|
|
21
|
+
import json
|
|
22
|
+
import os
|
|
23
|
+
|
|
24
|
+
import sglang as sgl
|
|
25
|
+
from renderers.gpt_oss import GptOssRenderer
|
|
26
|
+
from renderers.qwen35 import Qwen35Renderer
|
|
27
|
+
from transformers import AutoTokenizer
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
# Models the demo iterates over, in this order.
MODELS = [
    "Qwen/Qwen3.5-4B",
    "openai/gpt-oss-20b",
]

# ``enable_thinking`` values tried for each Qwen3.5 model.
QWEN_THINKING_MODES = [True, False]

# A single function-tool schema: ``multiply`` takes two required integers.
TOOLS = [
    {
        "type": "function",
        "function": {
            "name": "multiply",
            "description": "Multiply two integers.",
            "parameters": {
                "type": "object",
                "properties": {"a": {"type": "integer"}, "b": {"type": "integer"}},
                "required": ["a", "b"],
            },
        },
    },
]
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def make_renderer(model: str, enable_thinking: bool | None):
    """Load the tokenizer for ``model`` and wrap it in the matching renderer.

    Args:
        model: Model identifier passed to ``AutoTokenizer.from_pretrained``.
        enable_thinking: Forwarded to ``Qwen35Renderer`` for Qwen3.5 models;
            unused for ``openai/gpt-oss-20b``.

    Returns:
        A ``Qwen35Renderer`` for names starting with ``Qwen/Qwen3.5-``, or a
        ``GptOssRenderer`` for ``openai/gpt-oss-20b``.

    Raises:
        ValueError: If ``model`` is neither of the supported demo models.
    """
    # The tokenizer is loaded before the model name is validated, so an
    # unsupported name still goes through ``from_pretrained`` first.
    tok = AutoTokenizer.from_pretrained(model, trust_remote_code=False)

    if model == "openai/gpt-oss-20b":
        return GptOssRenderer(tok)
    if not model.startswith("Qwen/Qwen3.5-"):
        raise ValueError(f"unsupported demo model: {model}")
    return Qwen35Renderer(tok, enable_thinking=enable_thinking)
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def print_parsed(label: str, turn: str, parsed) -> None:
    """Print the non-empty parts of a parsed model response.

    A ``[label] turn`` header is always printed (preceded by a blank line).
    After it, each of reasoning, tool calls and content is printed only when
    the corresponding attribute is truthy.

    Args:
        label: Identifier of the run, shown in square brackets.
        turn: Turn description, shown after the label.
        parsed: Object exposing ``reasoning_content``, ``tool_calls`` and
            ``content`` attributes.
    """
    print(f"\n[{label}] {turn}")

    reasoning = parsed.reasoning_content
    if reasoning:
        # Reasoning can be long; only the first 240 characters are shown.
        print(f"reasoning: {reasoning[:240]}")

    calls = parsed.tool_calls
    if calls:
        print(f"tool_calls: {json.dumps(calls, ensure_ascii=False)}")

    body = parsed.content
    if body:
        print(f"content: {body}")
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def completion_ids(output: dict, prompt_ids: list[int]) -> list[int]:
    """Extract the completion token IDs from a generation output dict.

    The IDs are read from ``output["output_ids"]``, falling back to
    ``output["token_ids"]`` when the former is missing or empty. If the
    returned IDs start with ``prompt_ids``, that prefix is removed.

    Args:
        output: Generation result mapping.
        prompt_ids: Token IDs that were sent as the prompt.

    Returns:
        The token IDs with any leading copy of ``prompt_ids`` stripped.

    Raises:
        RuntimeError: If neither key yields any token IDs.
    """
    raw = output.get("output_ids") or output.get("token_ids") or []
    tokens = list(raw)
    if len(tokens) == 0:
        raise RuntimeError("SGLang did not return completion token IDs")

    prefix_len = len(prompt_ids)
    if tokens[:prefix_len] == prompt_ids:
        # The output echoes the prompt; keep only what follows it.
        return tokens[prefix_len:]
    return tokens
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def _multiply_tool_messages(tool_calls) -> list[dict]:
    """Run each parsed ``multiply`` call locally and return tool-role messages."""
    tool_messages = []
    for idx, tool_call in enumerate(tool_calls):
        # Accept both {"function": {...}} wrappers and flat call dicts.
        fn = tool_call.get("function") or tool_call
        tool_args = fn.get("arguments") or {}
        if isinstance(tool_args, str):
            # Arguments may arrive as a JSON-encoded string.
            tool_args = json.loads(tool_args)
        tool_messages.append(
            {
                "role": "tool",
                "tool_call_id": tool_call.get("id", f"call_{idx}"),
                "name": fn.get("name", "multiply"),
                "content": json.dumps(
                    {"result": int(tool_args["a"]) * int(tool_args["b"])}
                ),
            }
        )
    return tool_messages


def main() -> None:
    """Run a two-turn tool-calling demo on every target with an SGLang engine.

    For each model in ``MODELS`` (Qwen3.5 models once per entry of
    ``QWEN_THINKING_MODES``) this:

    1. renders the conversation to token IDs with the renderer and generates
       from those IDs;
    2. parses the completion, answers any tool calls locally (or asks for a
       one-sentence answer when there are none);
    3. bridges to the next turn on top of ``prompt_ids + completion1`` and
       generates again.

    The engine is shut down in a ``finally`` block so it is released even
    when a turn fails.

    Command-line options: ``--max-new-tokens`` (default 512) and
    ``--context-length`` (default 4096).

    Raises:
        RuntimeError: If ``bridge_to_next_turn`` returns ``None`` (or if
            ``completion_ids`` finds no token IDs).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--max-new-tokens", type=int, default=512)
    parser.add_argument("--context-length", type=int, default=4096)
    args = parser.parse_args()

    print(f"CUDA_VISIBLE_DEVICES={os.environ.get('CUDA_VISIBLE_DEVICES', '<unset>')}")

    # Qwen3.5 models expand to one target per thinking mode; others run once
    # with enable_thinking=None.
    targets = [
        (model, enable_thinking)
        for model in MODELS
        for enable_thinking in (
            QWEN_THINKING_MODES if model.startswith("Qwen/Qwen3.5-") else [None]
        )
    ]

    for model, enable_thinking in targets:
        label = (
            model
            if enable_thinking is None
            else f"{model} enable_thinking={enable_thinking}"
        )
        print(f"\n=== {label} ===")

        renderer = make_renderer(model, enable_thinking)

        engine_kwargs = {
            "model_path": model,
            "trust_remote_code": False,
            "context_length": args.context_length,
            "attention_backend": "triton",
        }
        if model == "openai/gpt-oss-20b":
            engine_kwargs["moe_runner_backend"] = "triton"
        engine = sgl.Engine(**engine_kwargs)

        try:
            sampling = {
                "temperature": 0.0,
                "max_new_tokens": args.max_new_tokens,
                "stop_token_ids": renderer.get_stop_token_ids(),
                "skip_special_tokens": False,
                "no_stop_trim": True,
            }

            messages = [
                {"role": "system", "content": "You are a concise tool-using assistant."},
                {
                    "role": "user",
                    "content": "Use the multiply tool for 17 * 23, then summarize.",
                },
            ]

            # Turn 1: render locally and pass token IDs to SGLang. SGLang never
            # sees messages and never applies a chat template.
            prompt_ids = renderer.render_ids(
                messages, tools=TOOLS, add_generation_prompt=True
            )
            output1 = engine.generate(input_ids=prompt_ids, sampling_params=sampling)
            completion1 = completion_ids(output1, prompt_ids)
            parsed1 = renderer.parse_response(completion1)
            print_parsed(label, "turn 1", parsed1)

            # Record the assistant turn in the message history.
            assistant = {"role": "assistant", "content": parsed1.content}
            if parsed1.reasoning_content:
                assistant["reasoning_content"] = parsed1.reasoning_content
            if parsed1.tool_calls:
                assistant["tool_calls"] = parsed1.tool_calls
            messages.append(assistant)

            if parsed1.tool_calls:
                new_messages = _multiply_tool_messages(parsed1.tool_calls)
            else:
                new_messages = [
                    {"role": "user", "content": "Give the final answer in one sentence."}
                ]

            # Turn 2: bridge extends prompt_ids + completion1 exactly.
            bridged_ids = renderer.bridge_to_next_turn(
                prompt_ids, completion1, new_messages, tools=TOOLS
            )
            if bridged_ids is None:
                raise RuntimeError("bridge_to_next_turn returned None")
            assert bridged_ids[: len(prompt_ids) + len(completion1)] == (
                prompt_ids + completion1
            )

            output2 = engine.generate(input_ids=bridged_ids, sampling_params=sampling)
            completion2 = completion_ids(output2, bridged_ids)
            print_parsed(label, "turn 2", renderer.parse_response(completion2))
        finally:
            # Always release the engine, even if a turn above raised.
            engine.shutdown()


if __name__ == "__main__":
    main()
|