renderers 0.1.6__tar.gz → 0.1.8.dev0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. {renderers-0.1.6 → renderers-0.1.8.dev0}/.github/workflows/publish.yml +4 -15
  2. {renderers-0.1.6 → renderers-0.1.8.dev0}/.gitignore +2 -0
  3. renderers-0.1.8.dev0/PKG-INFO +156 -0
  4. renderers-0.1.8.dev0/README.md +143 -0
  5. renderers-0.1.8.dev0/examples/README.md +72 -0
  6. renderers-0.1.8.dev0/examples/sglang/multiturn_generate_sglang.py +187 -0
  7. renderers-0.1.8.dev0/examples/tinker/multiturn_generate_tinker.py +179 -0
  8. renderers-0.1.8.dev0/examples/transformers/multiturn_generate_transformers.py +196 -0
  9. renderers-0.1.8.dev0/examples/vllm/multiturn_generate_vllm.py +185 -0
  10. {renderers-0.1.6 → renderers-0.1.8.dev0}/pyproject.toml +28 -2
  11. {renderers-0.1.6 → renderers-0.1.8.dev0}/renderers/__init__.py +23 -0
  12. renderers-0.1.8.dev0/renderers/_version.py +24 -0
  13. renderers-0.1.8.dev0/renderers/base.py +1021 -0
  14. renderers-0.1.8.dev0/renderers/client.py +335 -0
  15. {renderers-0.1.6 → renderers-0.1.8.dev0}/renderers/deepseek_v3.py +2 -2
  16. {renderers-0.1.6 → renderers-0.1.8.dev0}/renderers/default.py +1 -1
  17. {renderers-0.1.6 → renderers-0.1.8.dev0}/renderers/glm45.py +2 -2
  18. {renderers-0.1.6 → renderers-0.1.8.dev0}/renderers/glm5.py +2 -2
  19. {renderers-0.1.6 → renderers-0.1.8.dev0}/renderers/gpt_oss.py +2 -2
  20. {renderers-0.1.6 → renderers-0.1.8.dev0}/renderers/kimi_k2.py +2 -2
  21. {renderers-0.1.6 → renderers-0.1.8.dev0}/renderers/kimi_k25.py +246 -10
  22. {renderers-0.1.6 → renderers-0.1.8.dev0}/renderers/minimax_m2.py +2 -2
  23. {renderers-0.1.6 → renderers-0.1.8.dev0}/renderers/nemotron3.py +2 -2
  24. {renderers-0.1.6 → renderers-0.1.8.dev0}/renderers/parsers.py +1 -1
  25. {renderers-0.1.6 → renderers-0.1.8.dev0}/renderers/qwen3.py +2 -2
  26. {renderers-0.1.6 → renderers-0.1.8.dev0}/renderers/qwen35.py +316 -20
  27. renderers-0.1.8.dev0/renderers/qwen3_vl.py +726 -0
  28. {renderers-0.1.6 → renderers-0.1.8.dev0}/tests/conftest.py +3 -3
  29. {renderers-0.1.6 → renderers-0.1.8.dev0}/tests/test_bridge.py +10 -9
  30. {renderers-0.1.6 → renderers-0.1.8.dev0}/tests/test_client.py +96 -3
  31. renderers-0.1.8.dev0/tests/test_load_tokenizer.py +118 -0
  32. {renderers-0.1.6 → renderers-0.1.8.dev0}/tests/test_message_indices.py +2 -5
  33. renderers-0.1.8.dev0/tests/test_multimodal.py +528 -0
  34. {renderers-0.1.6 → renderers-0.1.8.dev0}/tests/test_parse_response.py +2 -4
  35. {renderers-0.1.6 → renderers-0.1.8.dev0}/tests/test_parsers.py +9 -9
  36. {renderers-0.1.6 → renderers-0.1.8.dev0}/tests/test_preserve_thinking.py +6 -10
  37. renderers-0.1.8.dev0/tests/test_qwen35_size_coverage.py +166 -0
  38. {renderers-0.1.6 → renderers-0.1.8.dev0}/tests/test_render_ids.py +4 -10
  39. {renderers-0.1.6 → renderers-0.1.8.dev0}/tests/test_roundtrip.py +3 -4
  40. {renderers-0.1.6 → renderers-0.1.8.dev0}/uv.lock +449 -2
  41. renderers-0.1.6/PKG-INFO +0 -273
  42. renderers-0.1.6/README.md +0 -260
  43. renderers-0.1.6/renderers/base.py +0 -624
  44. renderers-0.1.6/renderers/client.py +0 -205
  45. renderers-0.1.6/renderers/qwen3_vl.py +0 -341
  46. {renderers-0.1.6 → renderers-0.1.8.dev0}/.github/workflows/style.yml +0 -0
  47. {renderers-0.1.6 → renderers-0.1.8.dev0}/.github/workflows/test.yml +0 -0
  48. {renderers-0.1.6 → renderers-0.1.8.dev0}/.pre-commit-config.yaml +0 -0
  49. {renderers-0.1.6 → renderers-0.1.8.dev0}/renderers/parsing.py +0 -0
  50. {renderers-0.1.6 → renderers-0.1.8.dev0}/renderers/qwen36.py +0 -0
  51. {renderers-0.1.6 → renderers-0.1.8.dev0}/tests/test_build_helpers.py +0 -0
  52. {renderers-0.1.6 → renderers-0.1.8.dev0}/tests/test_gpt_oss_harmony_parity.py +0 -0
  53. {renderers-0.1.6 → renderers-0.1.8.dev0}/tests/test_incremental.py +0 -0
  54. {renderers-0.1.6 → renderers-0.1.8.dev0}/tests/test_parse_response_robustness.py +0 -0
@@ -41,6 +41,10 @@ jobs:
41
41
  TAG="$PUSHED_REF"
42
42
  fi
43
43
 
44
+ # The package version is derived from this tag by hatch-vcs
45
+ # at build time (see [tool.hatch.version] in pyproject.toml).
46
+ # We only need to validate the tag shape — there's no
47
+ # ``project.version`` field to cross-check anymore.
44
48
  case "$TAG" in
45
49
  renderers-v*) ;;
46
50
  *)
@@ -49,21 +53,6 @@ jobs:
49
53
  ;;
50
54
  esac
51
55
 
52
- VERSION="${TAG#renderers-v}"
53
- FILE_VERSION=$(python - <<'PY'
54
- import tomllib
55
- from pathlib import Path
56
- with Path('pyproject.toml').open('rb') as f:
57
- data = tomllib.load(f)
58
- print(data['project']['version'])
59
- PY
60
- )
61
-
62
- if [ "$FILE_VERSION" != "$VERSION" ]; then
63
- echo "Version mismatch: tag requests '$VERSION' but pyproject.toml defines '$FILE_VERSION'" >&2
64
- exit 1
65
- fi
66
-
67
56
  echo "tag=$TAG" >> "$GITHUB_OUTPUT"
68
57
 
69
58
  - uses: astral-sh/setup-uv@v7
@@ -14,6 +14,8 @@ __pycache__/
14
14
  *.pyc
15
15
  *.pyo
16
16
  *.pyd
17
+ # generated by hatch-vcs at build time (see [tool.hatch.build.hooks.vcs])
18
+ renderers/_version.py
17
19
 
18
20
  # tooling caches
19
21
  .pytest_cache/
@@ -0,0 +1,156 @@
1
+ Metadata-Version: 2.4
2
+ Name: renderers
3
+ Version: 0.1.8.dev0
4
+ Summary: Chat template renderers — deterministic message-to-token conversion for LLM training
5
+ Requires-Python: <3.14,>=3.10
6
+ Requires-Dist: jinja2
7
+ Requires-Dist: numpy
8
+ Requires-Dist: openai-harmony>=0.0.8
9
+ Requires-Dist: openai>=1.108.1
10
+ Requires-Dist: tiktoken
11
+ Requires-Dist: transformers>=4.50.0
12
+ Description-Content-Type: text/markdown
13
+
14
+ # renderers
15
+
16
+ Programmable chat templates for LLM training and inference. A renderer turns a model's chat template into a Python object that can render messages → token ids, parse completion ids → structured assistant messages, and extend a multi-turn rollout without re-rendering model-sampled history.
17
+
18
+ Standalone on PyPI, and portable across training and inference stacks (transformers, vLLM, SGLang, Tinker). Initially developed for RL training with [verifiers](https://github.com/PrimeIntellect-ai/verifiers) and `prime-rl` at Prime Intellect.
19
+
20
+ ## Install
21
+
22
+ ```bash
23
+ uv add renderers
24
+ ```
25
+
26
+ ## At a glance
27
+
28
+ ```python
29
+ from transformers import AutoTokenizer
30
+ from renderers import create_renderer
31
+
32
+ tok = AutoTokenizer.from_pretrained("Qwen/Qwen3-8B")
33
+ r = create_renderer(tok, renderer="auto") # → Qwen3Renderer
34
+
35
+ prompt_ids = r.render_ids(
36
+ [{"role": "user", "content": "hi"}],
37
+ add_generation_prompt=True,
38
+ )
39
+ # Feed prompt_ids to a Token-In, Token-Out endpoint.
40
+ # It returns completion_ids sampled by the model.
41
+
42
+ parsed = r.parse_response(completion_ids)
43
+ # ParsedResponse(content=..., reasoning_content=..., tool_calls=...)
44
+ ```
45
+
46
+ For the next turn, extend the previous sampled stream instead of re-rendering history:
47
+
48
+ ```python
49
+ next_prompt_ids = r.bridge_to_next_turn(
50
+ previous_prompt_ids=prompt_ids,
51
+ previous_completion_ids=completion_ids,
52
+ new_messages=[{"role": "tool", "content": "..."}],
53
+ )
54
+ ```
55
+
56
+ Hand-coded renderers ship for `qwen3`, `qwen3-vl`, `qwen3.5`, `qwen3.6`, `glm-5`, `glm-5.1`, `glm-4.5`, `minimax-m2`, `deepseek-v3`, `kimi-k2`, `kimi-k2.5`, `nemotron-3`, `gpt-oss`. Anything else falls back to `DefaultRenderer`, a generic `apply_chat_template` wrapper.
57
+
58
+ ## API
59
+
60
+ ```python
61
+ class Renderer(Protocol):
62
+ def render(messages, *, tools=None, add_generation_prompt=False) -> RenderedTokens: ...
63
+ def render_ids(messages, *, tools=None, add_generation_prompt=False) -> list[int]: ...
64
+ def parse_response(token_ids) -> ParsedResponse: ...
65
+ def get_stop_token_ids() -> list[int]: ...
66
+ def bridge_to_next_turn(prev_prompt_ids, prev_completion_ids, new_messages, *, tools=None) -> list[int] | None: ...
67
+ ```
68
+
69
+ - `RenderedTokens` carries `token_ids` **and** `message_indices` — one entry per token attributing each to its source message (`-1` for structural scaffolding). Lets `build_training_sample` build a per-token loss mask in one render.
70
+ - `ParsedResponse` is `(content, reasoning_content, tool_calls)`. It scans token ids for special-token boundaries (e.g. id `151657` for `<tool_call>` on Qwen3) — a literal `"<tool_call>"` in user content tokenizes to ordinary text ids and never matches.
71
+ - Round-trip: rendering `[user, assistant(content, reasoning, tool_calls)]`, slicing the assistant completion, and feeding it through `parse_response` returns an equivalent structured message. Tested per-renderer in `tests/test_roundtrip.py`.
72
+
73
+ ### `bridge_to_next_turn` (the core contract)
74
+
75
+ Given `(prev_prompt_ids, prev_completion_ids)` and new environment messages, return ids for the next turn's prompt such that the result starts with `prev_prompt_ids + prev_completion_ids` byte-for-byte and continues with the new messages plus the next assistant opener. If that cannot be proven safe, return `None` and the caller falls back to a full render.
76
+
77
+ Each hand-coded bridge:
78
+ 1. Anchors at the previous turn's canonical close token. On clean stops it's already in `prev_completion_ids`. On truncation, the renderer synthesizes the close as non-loss prompt context.
79
+ 2. Refuses assistant content in `new_messages` — re-rendering sampled tokens would replace them with canonical template bytes.
80
+ 3. Renders only the new messages in the framing the model family expects.
81
+
82
+ `DefaultRenderer.bridge_to_next_turn` returns `None` unconditionally — the template's close is unknown, so the contract can't be proven.
83
+
84
+ ### Picking a renderer
85
+
86
+ ```python
87
+ r = create_renderer(tok, renderer="auto")
88
+ ```
89
+
90
+ Auto-detect matches `tokenizer.name_or_path` against `MODEL_RENDERER_MAP` by **exact match**. Prefix matching is intentionally off — same architecture can ship different chat templates (base vs instruct, fine-tune renames). Fine-tunes must pass `renderer=<name>` explicitly; unknown names fall back to `DefaultRenderer`.
91
+
92
+ ### Pools
93
+
94
+ ```python
95
+ from renderers import create_renderer_pool
96
+
97
+ pool = create_renderer_pool("Qwen/Qwen3-8B", renderer="auto", size=16)
98
+ with pool.checkout() as r:
99
+ ids = r.render_ids(messages)
100
+ ```
101
+
102
+ Each slot owns its own tokenizer copy. Construction fans out across a thread pool so a 32-slot pool doesn't serially eat ~10–15s of `from_pretrained` calls at startup.
103
+
104
+ ## Why use a renderer
105
+
106
+ For RL the trainer must see the exact token ids the sampler saw. The standard alternative — let the inference engine apply the chat template, parse tool calls, parse reasoning, and re-render full history every turn — silently breaks token identity. These are the failure modes a renderer's `bridge_to_next_turn` sidesteps by never re-rendering prior turns:
107
+
108
+ - **Boolean round-trip.** Engine emits `false`; client parses to Python `bool(False)`; `apply_chat_template` re-renders via `str(False)` → `"False"`. Capital F. Reproducible on Qwen3.5-35B-A3B + mini-swe-agent-plus at ~50% break rate per rollout.
109
+ - **BPE retokenization drift.** The same substring tokenizes differently depending on neighbouring bytes. `json` + `p` + `enderer` (3 tokens) vs `jsonp` + `enderer` (2 tokens) when whitespace shifts by one character. Every subsequent token is shifted from there on.
110
+ - **Tool-call XML drift.** The engine emits a no-arg call with a stylistic empty `</parameter>`; the Jinja re-render of the reconstructed dict drops it. Extension property broken at every such call.
111
+ - **Thinking stripped from non-latest assistants.** Some templates strip `<think>…</think>` blocks from prior assistant turns when re-rendering. The recorded stream has the thinking; the next prompt does not.
112
+ - **Max-seq-len truncation zeroing the anchor.** Client-side `max_seq_len` enforcement zeros `completion_ids` when `prompt_len > max_seq_len`. The bridge anchor is empty, falling back to full re-render — triggering every mode above.
113
+ - **Scaffold-level history rewriting.** Some agent scaffolds (e.g. opencode's `experimental_repairToolCall`) rewrite tool calls before sending them back as history. The next turn's prompt contains a tool call the model never emitted. *A renderer cannot fix this — the drift happens before rendering.*
114
+
115
+ Empirical delta on Qwen3.5-35B-A3B + mini-swe-agent-plus, step 0:
116
+
117
+ | client path | breaks | training samples from 64 rollouts |
118
+ | -------------------------------------- | ------ | --------------------------------- |
119
+ | `apply_chat_template` (full re-render) | 32 | 77 |
120
+ | renderers `bridge_to_next_turn` | 0 | 64 |
121
+
122
+ Each break fragments a rollout into multiple training samples — every fragment re-encodes its prefix, inflating compute roughly linearly with the number of breaks.
123
+
124
+ ## Compaction overrides
125
+
126
+ `create_renderer` and `create_renderer_pool` accept two constructor-only flags:
127
+
128
+ ```python
129
+ preserve_all_thinking: bool = False
130
+ preserve_thinking_between_tool_calls: bool = False
131
+ ```
132
+
133
+ Defaults preserve byte-identity with the model's chat template. Flipping a flag at construction restores `reasoning_content` the template would otherwise drop:
134
+
135
+ - `preserve_all_thinking=True` — every past assistant's reasoning is kept.
136
+ - `preserve_thinking_between_tool_calls=True` — reasoning is kept on assistants in the in-flight tool cycle (no-op for current renderers; reserved for future templates that drop it).
137
+
138
+ The canonical use case is **compaction**. Injecting a `user` turn like *"summarize the work so far"* puts every prior assistant in a "past cycle", so template-default rules drop their `reasoning_content` before the summarizer sees it. Build the renderer with `preserve_all_thinking=True` to keep reasoning visible end-to-end on those flows. Both flags only ever *add* tokens vs the template default.
139
+
140
+ ## `DefaultRenderer`
141
+
142
+ Fallback for unsupported models. Wraps `apply_chat_template` and accepts `tool_parser` / `reasoning_parser` kwargs (vLLM convention). `bridge_to_next_turn` returns `None` because the template's close is unknown, so multi-turn rollouts fall back to full re-render. Implementing a hand-coded renderer is a few hundred lines of Python (`render_ids` + `parse_response` + `bridge_to_next_turn`) and is the only path that closes the failure modes above by construction.
143
+
144
+ ## Roadmap
145
+
146
+ - **VLM support.** `ContentPart` is text-only today; `Qwen3VLRenderer` ships only because Qwen3-VL's text-only chat template differs from Qwen3's. Plan: add `ImagePart` / `VideoPart`, multimodal bridges, validate against a Qwen3-VL RL run.
147
+ - **Patched chat templates.** Some shipped templates re-tokenize history, normalize JSON, or auto-strip thinking — each breaks the extension property. Plan: a `use_patched` opt-in per renderer that renders the same surface form while avoiding known-bad patterns.
148
+
149
+ ## Testing
150
+
151
+ ```bash
152
+ uv sync --group dev
153
+ uv run pytest
154
+ ```
155
+
156
+ Round-trip parity (render → parse → original) and token-level parity against `apply_chat_template` are tested per renderer. End-to-end validation runs against Reverse-Text, Wordle, OpenCode-Math, and RLM-SWE environments.
@@ -0,0 +1,143 @@
1
+ # renderers
2
+
3
+ Programmable chat templates for LLM training and inference. A renderer turns a model's chat template into a Python object that can render messages → token ids, parse completion ids → structured assistant messages, and extend a multi-turn rollout without re-rendering model-sampled history.
4
+
5
+ Standalone on PyPI, and portable across training and inference stacks (transformers, vLLM, SGLang, Tinker). Initially developed for RL training with [verifiers](https://github.com/PrimeIntellect-ai/verifiers) and `prime-rl` at Prime Intellect.
6
+
7
+ ## Install
8
+
9
+ ```bash
10
+ uv add renderers
11
+ ```
12
+
13
+ ## At a glance
14
+
15
+ ```python
16
+ from transformers import AutoTokenizer
17
+ from renderers import create_renderer
18
+
19
+ tok = AutoTokenizer.from_pretrained("Qwen/Qwen3-8B")
20
+ r = create_renderer(tok, renderer="auto") # → Qwen3Renderer
21
+
22
+ prompt_ids = r.render_ids(
23
+ [{"role": "user", "content": "hi"}],
24
+ add_generation_prompt=True,
25
+ )
26
+ # Feed prompt_ids to a Token-In, Token-Out endpoint.
27
+ # It returns completion_ids sampled by the model.
28
+
29
+ parsed = r.parse_response(completion_ids)
30
+ # ParsedResponse(content=..., reasoning_content=..., tool_calls=...)
31
+ ```
32
+
33
+ For the next turn, extend the previous sampled stream instead of re-rendering history:
34
+
35
+ ```python
36
+ next_prompt_ids = r.bridge_to_next_turn(
37
+ previous_prompt_ids=prompt_ids,
38
+ previous_completion_ids=completion_ids,
39
+ new_messages=[{"role": "tool", "content": "..."}],
40
+ )
41
+ ```
42
+
43
+ Hand-coded renderers ship for `qwen3`, `qwen3-vl`, `qwen3.5`, `qwen3.6`, `glm-5`, `glm-5.1`, `glm-4.5`, `minimax-m2`, `deepseek-v3`, `kimi-k2`, `kimi-k2.5`, `nemotron-3`, `gpt-oss`. Anything else falls back to `DefaultRenderer`, a generic `apply_chat_template` wrapper.
44
+
45
+ ## API
46
+
47
+ ```python
48
+ class Renderer(Protocol):
49
+ def render(messages, *, tools=None, add_generation_prompt=False) -> RenderedTokens: ...
50
+ def render_ids(messages, *, tools=None, add_generation_prompt=False) -> list[int]: ...
51
+ def parse_response(token_ids) -> ParsedResponse: ...
52
+ def get_stop_token_ids() -> list[int]: ...
53
+ def bridge_to_next_turn(prev_prompt_ids, prev_completion_ids, new_messages, *, tools=None) -> list[int] | None: ...
54
+ ```
55
+
56
+ - `RenderedTokens` carries `token_ids` **and** `message_indices` — one entry per token attributing each to its source message (`-1` for structural scaffolding). Lets `build_training_sample` build a per-token loss mask in one render.
57
+ - `ParsedResponse` is `(content, reasoning_content, tool_calls)`. It scans token ids for special-token boundaries (e.g. id `151657` for `<tool_call>` on Qwen3) — a literal `"<tool_call>"` in user content tokenizes to ordinary text ids and never matches.
58
+ - Round-trip: rendering `[user, assistant(content, reasoning, tool_calls)]`, slicing the assistant completion, and feeding it through `parse_response` returns an equivalent structured message. Tested per-renderer in `tests/test_roundtrip.py`.
59
+
60
+ ### `bridge_to_next_turn` (the core contract)
61
+
62
+ Given `(prev_prompt_ids, prev_completion_ids)` and new environment messages, return ids for the next turn's prompt such that the result starts with `prev_prompt_ids + prev_completion_ids` byte-for-byte and continues with the new messages plus the next assistant opener. If that cannot be proven safe, return `None` and the caller falls back to a full render.
63
+
64
+ Each hand-coded bridge:
65
+ 1. Anchors at the previous turn's canonical close token. On clean stops it's already in `prev_completion_ids`. On truncation, the renderer synthesizes the close as non-loss prompt context.
66
+ 2. Refuses assistant content in `new_messages` — re-rendering sampled tokens would replace them with canonical template bytes.
67
+ 3. Renders only the new messages in the framing the model family expects.
68
+
69
+ `DefaultRenderer.bridge_to_next_turn` returns `None` unconditionally — the template's close is unknown, so the contract can't be proven.
70
+
71
+ ### Picking a renderer
72
+
73
+ ```python
74
+ r = create_renderer(tok, renderer="auto")
75
+ ```
76
+
77
+ Auto-detect matches `tokenizer.name_or_path` against `MODEL_RENDERER_MAP` by **exact match**. Prefix matching is intentionally off — same architecture can ship different chat templates (base vs instruct, fine-tune renames). Fine-tunes must pass `renderer=<name>` explicitly; unknown names fall back to `DefaultRenderer`.
78
+
79
+ ### Pools
80
+
81
+ ```python
82
+ from renderers import create_renderer_pool
83
+
84
+ pool = create_renderer_pool("Qwen/Qwen3-8B", renderer="auto", size=16)
85
+ with pool.checkout() as r:
86
+ ids = r.render_ids(messages)
87
+ ```
88
+
89
+ Each slot owns its own tokenizer copy. Construction fans out across a thread pool so a 32-slot pool doesn't serially eat ~10–15s of `from_pretrained` calls at startup.
90
+
91
+ ## Why use a renderer
92
+
93
+ For RL the trainer must see the exact token ids the sampler saw. The standard alternative — let the inference engine apply the chat template, parse tool calls, parse reasoning, and re-render full history every turn — silently breaks token identity. These are the failure modes a renderer's `bridge_to_next_turn` sidesteps by never re-rendering prior turns:
94
+
95
+ - **Boolean round-trip.** Engine emits `false`; client parses to Python `bool(False)`; `apply_chat_template` re-renders via `str(False)` → `"False"`. Capital F. Reproducible on Qwen3.5-35B-A3B + mini-swe-agent-plus at ~50% break rate per rollout.
96
+ - **BPE retokenization drift.** The same substring tokenizes differently depending on neighbouring bytes. `json` + `p` + `enderer` (3 tokens) vs `jsonp` + `enderer` (2 tokens) when whitespace shifts by one character. Every subsequent token is shifted from there on.
97
+ - **Tool-call XML drift.** The engine emits a no-arg call with a stylistic empty `</parameter>`; the Jinja re-render of the reconstructed dict drops it. Extension property broken at every such call.
98
+ - **Thinking stripped from non-latest assistants.** Some templates strip `<think>…</think>` blocks from prior assistant turns when re-rendering. The recorded stream has the thinking; the next prompt does not.
99
+ - **Max-seq-len truncation zeroing the anchor.** Client-side `max_seq_len` enforcement zeros `completion_ids` when `prompt_len > max_seq_len`. The bridge anchor is empty, falling back to full re-render — triggering every mode above.
100
+ - **Scaffold-level history rewriting.** Some agent scaffolds (e.g. opencode's `experimental_repairToolCall`) rewrite tool calls before sending them back as history. The next turn's prompt contains a tool call the model never emitted. *A renderer cannot fix this — the drift happens before rendering.*
101
+
102
+ Empirical delta on Qwen3.5-35B-A3B + mini-swe-agent-plus, step 0:
103
+
104
+ | client path | breaks | training samples from 64 rollouts |
105
+ | -------------------------------------- | ------ | --------------------------------- |
106
+ | `apply_chat_template` (full re-render) | 32 | 77 |
107
+ | renderers `bridge_to_next_turn` | 0 | 64 |
108
+
109
+ Each break fragments a rollout into multiple training samples — every fragment re-encodes its prefix, inflating compute roughly linearly with the number of breaks.
110
+
111
+ ## Compaction overrides
112
+
113
+ `create_renderer` and `create_renderer_pool` accept two constructor-only flags:
114
+
115
+ ```python
116
+ preserve_all_thinking: bool = False
117
+ preserve_thinking_between_tool_calls: bool = False
118
+ ```
119
+
120
+ Defaults preserve byte-identity with the model's chat template. Flipping a flag at construction restores `reasoning_content` the template would otherwise drop:
121
+
122
+ - `preserve_all_thinking=True` — every past assistant's reasoning is kept.
123
+ - `preserve_thinking_between_tool_calls=True` — reasoning is kept on assistants in the in-flight tool cycle (no-op for current renderers; reserved for future templates that drop it).
124
+
125
+ The canonical use case is **compaction**. Injecting a `user` turn like *"summarize the work so far"* puts every prior assistant in a "past cycle", so template-default rules drop their `reasoning_content` before the summarizer sees it. Build the renderer with `preserve_all_thinking=True` to keep reasoning visible end-to-end on those flows. Both flags only ever *add* tokens vs the template default.
126
+
127
+ ## `DefaultRenderer`
128
+
129
+ Fallback for unsupported models. Wraps `apply_chat_template` and accepts `tool_parser` / `reasoning_parser` kwargs (vLLM convention). `bridge_to_next_turn` returns `None` because the template's close is unknown, so multi-turn rollouts fall back to full re-render. Implementing a hand-coded renderer is a few hundred lines of Python (`render_ids` + `parse_response` + `bridge_to_next_turn`) and is the only path that closes the failure modes above by construction.
130
+
131
+ ## Roadmap
132
+
133
+ - **VLM support.** `ContentPart` is text-only today; `Qwen3VLRenderer` ships only because Qwen3-VL's text-only chat template differs from Qwen3's. Plan: add `ImagePart` / `VideoPart`, multimodal bridges, validate against a Qwen3-VL RL run.
134
+ - **Patched chat templates.** Some shipped templates re-tokenize history, normalize JSON, or auto-strip thinking — each breaks the extension property. Plan: a `use_patched` opt-in per renderer that renders the same surface form while avoiding known-bad patterns.
135
+
136
+ ## Testing
137
+
138
+ ```bash
139
+ uv sync --group dev
140
+ uv run pytest
141
+ ```
142
+
143
+ Round-trip parity (render → parse → original) and token-level parity against `apply_chat_template` are tested per renderer. End-to-end validation runs against Reverse-Text, Wordle, OpenCode-Math, and RLM-SWE environments.
@@ -0,0 +1,72 @@
1
+ # Offline Renderer Inference Examples
2
+
3
+ Each recipe keeps chat templating in `renderers` and sends token IDs to the
4
+ backend:
5
+
6
+ 1. Load a Hugging Face tokenizer.
7
+ 2. Build a model-specific `Renderer`.
8
+ 3. Render chat messages to prompt token IDs locally.
9
+ 4. Pass token IDs directly to an offline inference engine.
10
+ 5. Parse completion token IDs with the same renderer.
11
+ 6. Bridge the next turn without re-rendering prior assistant output.
12
+
13
+ The scripts use PEP 723 `uv` headers, so backend dependencies stay local to the
14
+ recipe and do not touch the repo `uv.lock`.
15
+
16
+ ## vLLM Multi-Turn Recipe
17
+
18
+ ```bash
19
+ CUDA_VISIBLE_DEVICES=0 uv run --script examples/vllm/multiturn_generate_vllm.py
20
+ ```
21
+
22
+ The vLLM script targets `vllm>=0.20` and uses `prompt_token_ids`, so vLLM
23
+ does not apply a chat template.
24
+
25
+ ## SGLang Multi-Turn Recipe
26
+
27
+ ```bash
28
+ CUDA_VISIBLE_DEVICES=1 uv run --script examples/sglang/multiturn_generate_sglang.py
29
+ ```
30
+
31
+ The SGLang script uses `input_ids`, so SGLang does not apply a chat template.
32
+ The script pins `openai-harmony` to the version SGLang itself requires so that dependency resolution succeeds.
33
+
34
+ ## Transformers Multi-Turn Recipe
35
+
36
+ ```bash
37
+ CUDA_VISIBLE_DEVICES=0 uv run --script examples/transformers/multiturn_generate_transformers.py
38
+ ```
39
+
40
+ The Transformers script calls `generate()` with `input_ids`, so Transformers
41
+ does not apply a chat template.
42
+
43
+ ## Tinker Multi-Turn Recipe
44
+
45
+ ```bash
46
+ TINKER_API_KEY=... uv run --script examples/tinker/multiturn_generate_tinker.py
47
+ ```
48
+
49
+ The Tinker script sends renderer-produced token IDs as `ModelInput` to the
50
+ remote sampling API, so Tinker does not apply a chat template.
51
+
52
+ ## Two-GPU Validation
53
+
54
+ Run the recipes in parallel, one backend per GPU:
55
+
56
+ ```bash
57
+ CUDA_VISIBLE_DEVICES=0 uv run --script examples/vllm/multiturn_generate_vllm.py \
58
+ --max-new-tokens 512 &
59
+
60
+ CUDA_VISIBLE_DEVICES=1 uv run --script examples/sglang/multiturn_generate_sglang.py \
61
+ --max-new-tokens 512 &
62
+
63
+ wait
64
+ ```
65
+
66
+ Each script runs `Qwen/Qwen3.5-4B` with `enable_thinking=True` and `False`, then
67
+ `openai/gpt-oss-20b`.
68
+
69
+ ## Multimodal Note
70
+
71
+ Renderers are text-only today. For image/video demos, use the backend's message
72
+ or prompt path until renderers grow multimodal placeholder support.
@@ -0,0 +1,187 @@
1
+ #!/usr/bin/env -S uv run --script
2
+ # /// script
3
+ # requires-python = ">=3.10,<3.14"
4
+ # dependencies = [
5
+ # "renderers>=0.1.6",
6
+ # "sglang==0.5.10.post1",
7
+ # "flash-attn-4>=4.0.0b4",
8
+ # "transformers>=5.3.0",
9
+ # "openai-harmony==0.0.4",
10
+ # "openai>=1.108.1",
11
+ # "tiktoken",
12
+ # "jinja2",
13
+ # "numpy",
14
+ # ]
15
+ # ///
16
+ """SGLang offline generation from renderer-owned prompt token IDs."""
17
+
18
+ from __future__ import annotations
19
+
20
+ import argparse
21
+ import json
22
+ import os
23
+
24
+ import sglang as sgl
25
+ from renderers.gpt_oss import GptOssRenderer
26
+ from renderers.qwen35 import Qwen35Renderer
27
+ from transformers import AutoTokenizer
28
+
29
+
30
+ MODELS = ["Qwen/Qwen3.5-4B", "openai/gpt-oss-20b"]
31
+ QWEN_THINKING_MODES = [True, False]
32
+
33
+ TOOLS = [
34
+ {
35
+ "type": "function",
36
+ "function": {
37
+ "name": "multiply",
38
+ "description": "Multiply two integers.",
39
+ "parameters": {
40
+ "type": "object",
41
+ "properties": {
42
+ "a": {"type": "integer"},
43
+ "b": {"type": "integer"},
44
+ },
45
+ "required": ["a", "b"],
46
+ },
47
+ },
48
+ }
49
+ ]
50
+
51
+
52
def make_renderer(model: str, enable_thinking: bool | None):
    """Build the hand-coded renderer that matches a demo model.

    The tokenizer for ``model`` is loaded first (with remote code disabled),
    then wrapped in the renderer class for that model family.

    Args:
        model: Hugging Face model id. Ids starting with ``"Qwen/Qwen3.5-"``
            and the exact id ``"openai/gpt-oss-20b"`` are supported.
        enable_thinking: Forwarded to ``Qwen35Renderer``; not used for
            gpt-oss.

    Returns:
        A ``Qwen35Renderer`` or ``GptOssRenderer`` wrapping the tokenizer.

    Raises:
        ValueError: If ``model`` is not one of the supported demo models.
    """
    tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=False)

    is_qwen35 = model.startswith("Qwen/Qwen3.5-")
    is_gpt_oss = model == "openai/gpt-oss-20b"

    # Reject unknown models up front; the tokenizer has already been loaded
    # at this point, exactly as in the straight-line version of this check.
    if not (is_qwen35 or is_gpt_oss):
        raise ValueError(f"unsupported demo model: {model}")

    if is_qwen35:
        return Qwen35Renderer(tokenizer, enable_thinking=enable_thinking)
    return GptOssRenderer(tokenizer)
59
+
60
+
61
def print_parsed(label: str, turn: str, parsed) -> None:
    """Print the populated fields of a parsed response under a header line.

    Args:
        label: Run label shown in square brackets (model name, optionally
            with the thinking mode).
        turn: Human-readable turn name, e.g. ``"turn 1"``.
        parsed: Object exposing ``reasoning_content``, ``tool_calls`` and
            ``content`` attributes; falsy fields are skipped.
    """
    print(f"\n[{label}] {turn}")

    reasoning = parsed.reasoning_content
    if reasoning:
        # Reasoning can be long; only the first 240 characters are shown.
        print(f"reasoning: {reasoning[:240]}")

    calls = parsed.tool_calls
    if calls:
        print(f"tool_calls: {json.dumps(calls, ensure_ascii=False)}")

    text = parsed.content
    if text:
        print(f"content: {text}")
69
+
70
+
71
def completion_ids(output: dict, prompt_ids: list[int]) -> list[int]:
    """Extract the completion token ids from an engine output dict.

    The ids are read from ``output["output_ids"]``, falling back to
    ``output["token_ids"]`` when the former is missing or empty. If the
    returned sequence begins with ``prompt_ids``, that prefix is removed so
    only the newly sampled tokens remain.

    Args:
        output: Result dict returned by the engine's ``generate`` call.
        prompt_ids: Token ids that were sent as the prompt.

    Returns:
        The completion token ids as a new list.

    Raises:
        RuntimeError: If neither key yields any token ids.
    """
    raw = output.get("output_ids") or output.get("token_ids") or []
    tokens = list(raw)
    if len(tokens) == 0:
        raise RuntimeError("SGLang did not return completion token IDs")

    prefix_len = len(prompt_ids)
    if tokens[:prefix_len] != prompt_ids:
        # Output does not echo the prompt: it is already completion-only.
        return tokens
    return tokens[prefix_len:]
76
+
77
+
78
def main() -> None:
    """Run a two-turn tool-calling demo against SGLang for each demo model.

    For every entry in ``MODELS`` (Qwen3.5 models are run once per value in
    ``QWEN_THINKING_MODES``) this:

    1. builds the renderer and an offline ``sgl.Engine``;
    2. renders the first prompt locally and generates from token ids;
    3. parses the completion and, if the model called the ``multiply`` tool,
       answers each call with a ``tool`` message (otherwise sends a follow-up
       ``user`` message);
    4. bridges to the second turn without re-rendering turn 1 and generates
       again.

    Command-line flags: ``--max-new-tokens`` (default 512) and
    ``--context-length`` (default 4096).

    Raises:
        RuntimeError: If ``bridge_to_next_turn`` returns ``None`` or returns
            ids that do not start with ``prompt_ids + completion1``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--max-new-tokens", type=int, default=512)
    parser.add_argument("--context-length", type=int, default=4096)
    args = parser.parse_args()

    print(f"CUDA_VISIBLE_DEVICES={os.environ.get('CUDA_VISIBLE_DEVICES', '<unset>')}")

    # Expand MODELS into (model, enable_thinking) run targets. Only the
    # Qwen3.5 family is run in both thinking modes; others get ``None``.
    targets = []
    for model in MODELS:
        if model.startswith("Qwen/Qwen3.5-"):
            for enable_thinking in QWEN_THINKING_MODES:
                targets.append((model, enable_thinking))
        else:
            targets.append((model, None))

    for model, enable_thinking in targets:
        label = (
            model
            if enable_thinking is None
            else f"{model} enable_thinking={enable_thinking}"
        )
        print(f"\n=== {label} ===")

        renderer = make_renderer(model, enable_thinking)

        engine_kwargs = {
            "model_path": model,
            "trust_remote_code": False,
            "context_length": args.context_length,
            "attention_backend": "triton",
        }
        if model == "openai/gpt-oss-20b":
            engine_kwargs["moe_runner_backend"] = "triton"
        engine = sgl.Engine(**engine_kwargs)

        # Always shut the engine down, even when a turn fails, so a failure
        # on one target does not leave the engine running.
        try:
            sampling = {
                "temperature": 0.0,
                "max_new_tokens": args.max_new_tokens,
                "stop_token_ids": renderer.get_stop_token_ids(),
                "skip_special_tokens": False,
                "no_stop_trim": True,
            }

            messages = [
                {
                    "role": "system",
                    "content": "You are a concise tool-using assistant.",
                },
                {
                    "role": "user",
                    "content": "Use the multiply tool for 17 * 23, then summarize.",
                },
            ]

            # Turn 1: render locally and pass token IDs to SGLang. SGLang
            # never sees messages and never applies a chat template.
            prompt_ids = renderer.render_ids(
                messages, tools=TOOLS, add_generation_prompt=True
            )
            output1 = engine.generate(input_ids=prompt_ids, sampling_params=sampling)
            completion1 = completion_ids(output1, prompt_ids)
            parsed1 = renderer.parse_response(completion1)
            print_parsed(label, "turn 1", parsed1)

            # Keep the message history in sync with what the model produced.
            assistant = {"role": "assistant", "content": parsed1.content}
            if parsed1.reasoning_content:
                assistant["reasoning_content"] = parsed1.reasoning_content
            if parsed1.tool_calls:
                assistant["tool_calls"] = parsed1.tool_calls
            messages.append(assistant)

            if parsed1.tool_calls:
                # Answer every tool call with the product of its arguments.
                new_messages = []
                for idx, tool_call in enumerate(parsed1.tool_calls):
                    # Accept both the nested {"function": {...}} shape and a
                    # flat dict carrying name/arguments directly.
                    fn = tool_call.get("function") or tool_call
                    tool_args = fn.get("arguments") or {}
                    if isinstance(tool_args, str):
                        tool_args = json.loads(tool_args)
                    new_messages.append(
                        {
                            "role": "tool",
                            "tool_call_id": tool_call.get("id", f"call_{idx}"),
                            "name": fn.get("name", "multiply"),
                            "content": json.dumps(
                                {"result": int(tool_args["a"]) * int(tool_args["b"])}
                            ),
                        }
                    )
            else:
                new_messages = [
                    {
                        "role": "user",
                        "content": "Give the final answer in one sentence.",
                    }
                ]

            # Turn 2: bridge extends prompt_ids + completion1 exactly.
            bridged_ids = renderer.bridge_to_next_turn(
                prompt_ids, completion1, new_messages, tools=TOOLS
            )
            if bridged_ids is None:
                raise RuntimeError("bridge_to_next_turn returned None")

            # Explicit check rather than ``assert`` so the invariant is still
            # enforced when Python runs with -O.
            expected_prefix = prompt_ids + completion1
            if bridged_ids[: len(expected_prefix)] != expected_prefix:
                raise RuntimeError(
                    "bridge_to_next_turn did not extend prompt_ids + completion1"
                )

            output2 = engine.generate(input_ids=bridged_ids, sampling_params=sampling)
            completion2 = completion_ids(output2, bridged_ids)
            print_parsed(label, "turn 2", renderer.parse_response(completion2))
        finally:
            engine.shutdown()
184
+
185
+
186
+ if __name__ == "__main__":
187
+ main()