sglang 0.5.0rc0__py3-none-any.whl → 0.5.0rc2__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- sglang/__init__.py +8 -3
- sglang/bench_one_batch.py +6 -1
- sglang/lang/chat_template.py +18 -0
- sglang/srt/bench_utils.py +137 -0
- sglang/srt/configs/model_config.py +8 -7
- sglang/srt/disaggregation/decode.py +8 -4
- sglang/srt/disaggregation/mooncake/conn.py +43 -25
- sglang/srt/disaggregation/mooncake/transfer_engine.py +29 -0
- sglang/srt/distributed/parallel_state.py +4 -2
- sglang/srt/entrypoints/context.py +3 -20
- sglang/srt/entrypoints/engine.py +13 -8
- sglang/srt/entrypoints/harmony_utils.py +2 -0
- sglang/srt/entrypoints/http_server.py +68 -5
- sglang/srt/entrypoints/openai/protocol.py +2 -9
- sglang/srt/entrypoints/openai/serving_chat.py +60 -265
- sglang/srt/entrypoints/openai/serving_completions.py +1 -0
- sglang/srt/entrypoints/openai/tool_server.py +4 -3
- sglang/srt/function_call/ebnf_composer.py +1 -0
- sglang/srt/function_call/function_call_parser.py +2 -0
- sglang/srt/function_call/glm4_moe_detector.py +1 -1
- sglang/srt/function_call/gpt_oss_detector.py +331 -0
- sglang/srt/function_call/kimik2_detector.py +3 -3
- sglang/srt/function_call/qwen3_coder_detector.py +219 -9
- sglang/srt/jinja_template_utils.py +6 -0
- sglang/srt/layers/attention/aiter_backend.py +370 -107
- sglang/srt/layers/attention/ascend_backend.py +3 -0
- sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
- sglang/srt/layers/attention/flashattention_backend.py +18 -0
- sglang/srt/layers/attention/flashinfer_backend.py +55 -13
- sglang/srt/layers/attention/flashinfer_mla_backend.py +1 -0
- sglang/srt/layers/attention/hybrid_attn_backend.py +1 -1
- sglang/srt/layers/attention/triton_backend.py +24 -27
- sglang/srt/layers/attention/trtllm_mha_backend.py +8 -6
- sglang/srt/layers/attention/trtllm_mla_backend.py +129 -25
- sglang/srt/layers/attention/vision.py +9 -1
- sglang/srt/layers/attention/wave_backend.py +627 -0
- sglang/srt/layers/attention/wave_ops/decode_attention.py +186 -0
- sglang/srt/layers/attention/wave_ops/extend_attention.py +149 -0
- sglang/srt/layers/attention/wave_ops/prefill_attention.py +79 -0
- sglang/srt/layers/communicator.py +11 -13
- sglang/srt/layers/dp_attention.py +118 -27
- sglang/srt/layers/flashinfer_comm_fusion.py +4 -4
- sglang/srt/layers/linear.py +1 -0
- sglang/srt/layers/logits_processor.py +12 -18
- sglang/srt/layers/moe/cutlass_moe.py +11 -16
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +4 -5
- sglang/srt/layers/moe/ep_moe/kernels.py +43 -0
- sglang/srt/layers/moe/ep_moe/layer.py +60 -2
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=129,N=352,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=161,N=192,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_0/E=16,N=1024,device_name=NVIDIA_B200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=384,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/layer.py +7 -9
- sglang/srt/layers/moe/token_dispatcher/deepep.py +61 -24
- sglang/srt/layers/moe/topk.py +4 -1
- sglang/srt/layers/multimodal.py +156 -40
- sglang/srt/layers/quantization/__init__.py +10 -35
- sglang/srt/layers/quantization/awq.py +15 -16
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +0 -1
- sglang/srt/layers/quantization/fp8_kernel.py +277 -0
- sglang/srt/layers/quantization/fp8_utils.py +22 -10
- sglang/srt/layers/quantization/gptq.py +12 -17
- sglang/srt/layers/quantization/marlin_utils.py +15 -5
- sglang/srt/layers/quantization/modelopt_quant.py +58 -41
- sglang/srt/layers/quantization/mxfp4.py +20 -3
- sglang/srt/layers/quantization/utils.py +52 -2
- sglang/srt/layers/quantization/w4afp8.py +20 -11
- sglang/srt/layers/quantization/w8a8_int8.py +48 -34
- sglang/srt/layers/rotary_embedding.py +281 -2
- sglang/srt/layers/sampler.py +5 -2
- sglang/srt/lora/backend/base_backend.py +3 -23
- sglang/srt/lora/layers.py +66 -116
- sglang/srt/lora/lora.py +17 -62
- sglang/srt/lora/lora_manager.py +12 -48
- sglang/srt/lora/lora_registry.py +20 -9
- sglang/srt/lora/mem_pool.py +20 -63
- sglang/srt/lora/triton_ops/qkv_lora_b.py +1 -1
- sglang/srt/lora/utils.py +25 -58
- sglang/srt/managers/cache_controller.py +24 -29
- sglang/srt/managers/detokenizer_manager.py +1 -1
- sglang/srt/managers/io_struct.py +20 -6
- sglang/srt/managers/mm_utils.py +1 -2
- sglang/srt/managers/multimodal_processor.py +1 -1
- sglang/srt/managers/schedule_batch.py +43 -49
- sglang/srt/managers/schedule_policy.py +6 -6
- sglang/srt/managers/scheduler.py +18 -11
- sglang/srt/managers/scheduler_profiler_mixin.py +28 -8
- sglang/srt/managers/tokenizer_manager.py +53 -44
- sglang/srt/mem_cache/allocator.py +39 -214
- sglang/srt/mem_cache/allocator_ascend.py +158 -0
- sglang/srt/mem_cache/chunk_cache.py +1 -1
- sglang/srt/mem_cache/hicache_storage.py +1 -1
- sglang/srt/mem_cache/hiradix_cache.py +34 -24
- sglang/srt/mem_cache/lora_radix_cache.py +421 -0
- sglang/srt/mem_cache/memory_pool_host.py +33 -35
- sglang/srt/mem_cache/radix_cache.py +2 -5
- sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +443 -0
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +139 -67
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +6 -9
- sglang/srt/model_executor/cuda_graph_runner.py +29 -23
- sglang/srt/model_executor/forward_batch_info.py +33 -14
- sglang/srt/model_executor/model_runner.py +179 -81
- sglang/srt/model_loader/loader.py +18 -6
- sglang/srt/models/deepseek_nextn.py +2 -1
- sglang/srt/models/deepseek_v2.py +79 -38
- sglang/srt/models/gemma2.py +0 -34
- sglang/srt/models/gemma3n_mm.py +8 -9
- sglang/srt/models/glm4.py +6 -0
- sglang/srt/models/glm4_moe.py +11 -11
- sglang/srt/models/glm4_moe_nextn.py +2 -1
- sglang/srt/models/glm4v.py +589 -0
- sglang/srt/models/glm4v_moe.py +400 -0
- sglang/srt/models/gpt_oss.py +142 -20
- sglang/srt/models/granite.py +0 -25
- sglang/srt/models/llama.py +10 -27
- sglang/srt/models/llama4.py +19 -6
- sglang/srt/models/qwen2.py +2 -2
- sglang/srt/models/qwen2_5_vl.py +7 -3
- sglang/srt/models/qwen2_audio.py +10 -9
- sglang/srt/models/qwen2_moe.py +20 -5
- sglang/srt/models/qwen3.py +0 -24
- sglang/srt/models/qwen3_classification.py +78 -0
- sglang/srt/models/qwen3_moe.py +18 -5
- sglang/srt/models/registry.py +1 -1
- sglang/srt/models/step3_vl.py +6 -2
- sglang/srt/models/torch_native_llama.py +0 -24
- sglang/srt/multimodal/processors/base_processor.py +23 -13
- sglang/srt/multimodal/processors/glm4v.py +132 -0
- sglang/srt/multimodal/processors/qwen_audio.py +4 -2
- sglang/srt/operations.py +17 -2
- sglang/srt/reasoning_parser.py +316 -0
- sglang/srt/sampling/sampling_batch_info.py +7 -4
- sglang/srt/server_args.py +142 -140
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +7 -21
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +7 -21
- sglang/srt/speculative/eagle_worker.py +16 -0
- sglang/srt/two_batch_overlap.py +16 -12
- sglang/srt/utils.py +3 -3
- sglang/srt/weight_sync/tensor_bucket.py +106 -0
- sglang/test/attention/test_trtllm_mla_backend.py +186 -36
- sglang/test/doc_patch.py +59 -0
- sglang/test/few_shot_gsm8k.py +1 -1
- sglang/test/few_shot_gsm8k_engine.py +1 -1
- sglang/test/run_eval.py +4 -1
- sglang/test/simple_eval_common.py +6 -0
- sglang/test/simple_eval_gpqa.py +2 -0
- sglang/test/test_fp4_moe.py +118 -36
- sglang/test/test_marlin_moe.py +1 -1
- sglang/test/test_marlin_utils.py +1 -1
- sglang/utils.py +1 -1
- sglang/version.py +1 -1
- {sglang-0.5.0rc0.dist-info → sglang-0.5.0rc2.dist-info}/METADATA +27 -31
- {sglang-0.5.0rc0.dist-info → sglang-0.5.0rc2.dist-info}/RECORD +166 -142
- sglang/lang/backend/__init__.py +0 -0
- sglang/srt/function_call/harmony_tool_parser.py +0 -130
- sglang/srt/layers/quantization/scalar_type.py +0 -352
- sglang/srt/lora/backend/flashinfer_backend.py +0 -131
- /sglang/{api.py → lang/api.py} +0 -0
- {sglang-0.5.0rc0.dist-info → sglang-0.5.0rc2.dist-info}/WHEEL +0 -0
- {sglang-0.5.0rc0.dist-info → sglang-0.5.0rc2.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.0rc0.dist-info → sglang-0.5.0rc2.dist-info}/top_level.txt +0 -0
sglang/srt/multimodal/processors/glm4v.py
ADDED
@@ -0,0 +1,132 @@
+import re
+from typing import List, Union
+
+from decord import VideoReader
+from transformers.video_utils import VideoMetadata
+
+from sglang.srt.layers.rotary_embedding import MRotaryEmbedding
+from sglang.srt.models.glm4v import Glm4vForConditionalGeneration
+from sglang.srt.models.glm4v_moe import Glm4vMoeForConditionalGeneration
+from sglang.srt.multimodal.processors.base_processor import (
+    BaseMultimodalProcessor as SGLangBaseProcessor,
+)
+from sglang.srt.multimodal.processors.base_processor import (
+    BaseMultiModalProcessorOutput,
+    MultimodalSpecialTokens,
+)
+
+
+class Glm4vImageProcessor(SGLangBaseProcessor):
+    models = [Glm4vForConditionalGeneration, Glm4vMoeForConditionalGeneration]
+
+    def __init__(self, hf_config, server_args, _processor, *args, **kwargs):
+        super().__init__(hf_config, server_args, _processor, *args, **kwargs)
+
+        # GLM-4.1V and GLM-4.5V specific tokens
+        self.IMAGE_TOKEN = "<|image|>"
+        self.VIDEO_TOKEN = "<|video|>"
+        self.IMAGE_START_TOKEN = "<|begin_of_image|>"
+        self.IMAGE_END_TOKEN = "<|end_of_image|>"
+        self.VIDEO_START_TOKEN = "<|begin_of_video|>"
+        self.VIDEO_END_TOKEN = "<|end_of_video|>"
+
+        # Token IDs
+        self.IM_TOKEN_ID = hf_config.image_token_id
+        self.VIDEO_TOKEN_ID = hf_config.video_token_id
+        self.IMAGE_START_TOKEN_ID = hf_config.image_start_token_id
+        self.IMAGE_END_TOKEN_ID = hf_config.image_end_token_id
+        self.VIDEO_START_TOKEN_ID = hf_config.video_start_token_id
+        self.VIDEO_END_TOKEN_ID = hf_config.video_end_token_id
+
+        # Vision config
+        self.IMAGE_FACTOR = 28
+        self.MIN_PIXELS = 112 * 112
+        self.MAX_PIXELS = 30000 * 28 * 28 * 2
+
+        self.mm_tokens = MultimodalSpecialTokens(
+            image_token=self.IMAGE_TOKEN,
+            image_token_id=self.IM_TOKEN_ID,
+            video_token=self.VIDEO_TOKEN,
+            # Note: For GLM4v videos, it uses the video token before tokenization but uses image token after tokenization
+            video_token_id=self.IM_TOKEN_ID,
+        ).build(_processor)
+
+    # adapted from https://github.com/huggingface/transformers/blob/369c99d0cea403b77bd0aef818527106453fd9fc/src/transformers/video_utils.py#L312
+    async def preprocess_video(self, vr: VideoReader):
+        """
+        Preprocess video using VideoReader from Decord backend.
+
+        Args:
+            vr (VideoReader): VideoReader object from decord
+
+        Returns:
+            tuple: A tuple containing processed frames and metadata
+        """
+        video_fps = vr.get_avg_fps()
+        total_num_frames = len(vr)
+        duration = total_num_frames / video_fps if video_fps else 0
+
+        metadata = VideoMetadata(
+            total_num_frames=int(total_num_frames),
+            fps=float(video_fps),
+            duration=float(duration),
+            video_backend="decord",
+        )
+
+        # Extract all frames
+        indices = list(range(total_num_frames))
+        frames = vr.get_batch(indices).asnumpy()
+        metadata.frames_indices = indices
+
+        return frames, metadata
+
+    async def process_mm_data_async(
+        self,
+        image_data: List[Union[str, bytes]],
+        input_text,
+        request_obj,
+        *args,
+        **kwargs,
+    ):
+        base_output = self.load_mm_data(
+            prompt=input_text,
+            image_data=image_data,
+            video_data=request_obj.video_data,
+            multimodal_tokens=self.mm_tokens,
+        )
+
+        video_metadata = None
+
+        if base_output.videos:
+            videos_processed = [
+                await self.preprocess_video(video) for video in base_output.videos
+            ]
+            base_output.videos, video_metadata = map(list, zip(*videos_processed))
+            # transformer requires the video inputs to be under this format
+            base_output.videos = [base_output.videos]
+            video_metadata = [video_metadata]
+
+        mm_items, input_ids, ret = self.process_and_combine_mm_data(
+            base_output, self.mm_tokens, video_metadata=video_metadata
+        )
+
+        input_ids = input_ids.flatten()
+        mrope_positions, mrope_position_delta = MRotaryEmbedding.get_rope_index_glm4v(
+            input_ids=input_ids.unsqueeze(0),
+            hf_config=self.hf_config,
+            image_grid_thw=getattr(ret, "image_grid_thw", None),
+            video_grid_thw=getattr(ret, "video_grid_thw", None),
+            attention_mask=getattr(ret, "attention_mask", None),
+        )
+        mrope_positions = mrope_positions.squeeze(1)
+
+        mm_inputs = {
+            "input_ids": input_ids.tolist(),
+            "mm_items": mm_items,
+            "im_token_id": self.mm_tokens.image_token_id,
+            "video_token_id": self.mm_tokens.video_token_id,
+            "mrope_positions": mrope_positions,
+            "mrope_position_delta": mrope_position_delta,
+        }
+
+        return mm_inputs
sglang/srt/multimodal/processors/qwen_audio.py
CHANGED
@@ -1,6 +1,6 @@
 import re
 
-from sglang.srt.managers.schedule_batch import Modality
+from sglang.srt.managers.schedule_batch import Modality
 from sglang.srt.models.qwen2_audio import Qwen2AudioForConditionalGeneration
 from sglang.srt.multimodal.processors.base_processor import (
     BaseMultimodalProcessor,
@@ -29,6 +29,8 @@ class Qwen2AudioMultimodalProcessor(BaseMultimodalProcessor):
             audio_token_id=self.audio_token_id,
         ).build(_processor)
 
+        self.ATTR_NAME_TO_MODALITY.update({"feature_attention_mask": Modality.AUDIO})
+
     async def process_mm_data_async(
         self,
         audio_data,
@@ -54,7 +56,7 @@ class Qwen2AudioMultimodalProcessor(BaseMultimodalProcessor):
         input_lengths = (input_lengths - 1) // 2 + 1
         output_lengths = (input_lengths - 2) // 2 + 1
 
-        mm_items[0].
+        mm_items[0].audio_feature_lens = output_lengths
 
         return {
             "mm_items": mm_items,
sglang/srt/operations.py
CHANGED
@@ -1,10 +1,17 @@
+from __future__ import annotations
+
 import os
 from contextlib import contextmanager
 from dataclasses import dataclass
-from typing import Any, Callable, Dict, Generator, List, Sequence, Union
+from typing import TYPE_CHECKING, Any, Callable, Dict, Generator, List, Sequence, Union
 
 import torch
 
+from sglang.srt.layers.dp_attention import set_dp_buffer_len
+
+if TYPE_CHECKING:
+    from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+
 _ENABLE_PROFILE = bool(int(os.environ.get("SGLANG_OPERATIONS_ENABLE_PROFILE", "0")))
 
 if _ENABLE_PROFILE:
@@ -66,18 +73,26 @@ Stage = List[ExecutionOperation]
 
 
 class _StageExecutor:
-    def __init__(self, debug_name: str, stages: List[Stage], inputs):
+    def __init__(self, debug_name: str, stages: List[Stage], inputs: dict):
         self._debug_name = debug_name
         self._stages = stages
         self._index = 0
         self._stage_state = _StateDict()
         self._stage_output = inputs
 
+        # handling DP attention
+        forward_batch: ForwardBatch = inputs["forward_batch"]
+        self._global_dp_buffer_len = forward_batch.global_dp_buffer_len
+        self._local_dp_buffer_len = forward_batch.input_ids.shape[0]
+
     def next(self):
         assert not self.done
 
         stage = self._stages[self._index]
 
+        if self._global_dp_buffer_len is not None:
+            set_dp_buffer_len(self._global_dp_buffer_len, self._local_dp_buffer_len)
+
         with _annotate_region(debug_name=f"{self._debug_name}{self._index}"):
             for op in stage:
                 with _annotate_region(debug_name=op.debug_name):
sglang/srt/reasoning_parser.py
CHANGED
@@ -1,3 +1,4 @@
+import re
 from typing import Dict, Optional, Tuple, Type
 
 
@@ -185,6 +186,320 @@ class KimiDetector(BaseReasoningFormatDetector):
         )
 
 
+class GptOssDetector(BaseReasoningFormatDetector):
+    """
+    Detector for T4-style reasoning format.
+
+    Assumes reasoning format with two channels:
+    <|channel|>analysis<|message|>...reasoning content...<|end|>
+    <|start|>assistant<|channel|>final<|message|>...final answer...<|return|>
+
+    Returns content from 'analysis' channel as reasoning_text
+    and content from 'final' channel as normal_text.
+
+    Args:
+        stream_reasoning (bool): If False, accumulates reasoning content until complete.
+            If True, streams reasoning content as it arrives.
+    """
+
+    def __init__(self, stream_reasoning: bool = True, force_reasoning: bool = True):
+        # TypeScript uses channel tokens instead of simple start/end tokens
+        super().__init__(
+            "<|channel|>analysis<|message|>",
+            "<|end|>",
+            force_reasoning=True,
+            stream_reasoning=stream_reasoning,
+        )
+        self.final_channel_start = "<|start|>assistant<|channel|>final<|message|>"
+        self.final_channel_end = "<|return|>"
+        self._in_final_channel = False
+        self._analysis_complete = False
+        self._in_reasoning = True
+
+    def detect_and_parse(self, text: str) -> StreamingParseResult:
+        """
+        One-time parsing: Detects and parses both analysis and final channels.
+        Tool call channels are preserved in normal_text for downstream processing.
+
+        HACK: Also handles simplified format where text starts with "analysis" and transitions
+        to "assistantfinal" without full channel markers.
+        """
+        # HACK: Handle simplified format (analysis...assistantfinal) without channel markers
+        if (
+            text.startswith("analysis")
+            and "assistantfinal" in text
+            and "<|channel|>" not in text
+        ):
+            # Split on "assistantfinal"
+            parts = text.split("assistantfinal", 1)
+            self._in_reasoning = False
+            if len(parts) == 2:
+                reasoning_text = parts[0][
+                    len("analysis") :
+                ].strip()  # Remove "analysis" prefix
+                normal_text = parts[1].strip()
+                return StreamingParseResult(
+                    normal_text=normal_text, reasoning_text=reasoning_text
+                )
+
+        reasoning_parts = []
+        normal_parts = []
+        current_pos = 0
+
+        # Process text sequentially to preserve tool calls between analysis sections
+        while current_pos < len(text):
+            # Look for next analysis channel
+            analysis_start_idx = text.find(self.think_start_token, current_pos)
+
+            if analysis_start_idx == -1:
+                # No more analysis channels, rest goes to remaining
+                break
+
+            # Preserve any content before this analysis channel (could include tool calls)
+            if analysis_start_idx > current_pos:
+                between_content = text[current_pos:analysis_start_idx]
+                # This content will be added to normal_parts later
+                normal_parts.append(between_content)
+
+            # Extract analysis content
+            analysis_content_start = analysis_start_idx + len(self.think_start_token)
+            analysis_end_idx = text.find(self.think_end_token, analysis_content_start)
+
+            if analysis_end_idx != -1:
+                reasoning_parts.append(
+                    text[analysis_content_start:analysis_end_idx].strip()
+                )
+                current_pos = analysis_end_idx + len(self.think_end_token)
+            else:
+                # Analysis not complete
+                reasoning_parts.append(text[analysis_content_start:].strip())
+                reasoning_text = "".join(reasoning_parts)
+                return StreamingParseResult(reasoning_text=reasoning_text)
+
+        # Add any remaining text after all analysis sections
+        if current_pos < len(text):
+            remaining = text[current_pos:]
+            normal_parts.append(remaining)
+
+        # Process non-analysis content for commentary sections
+        full_normal_text = "".join(normal_parts)
+
+        # Extract reasoning from non-tool-call commentary sections
+        # Tool calls have "to=" in their header, regular commentary does not
+        commentary_pattern = re.compile(
+            r"<\|start\|>assistant<\|channel\|>commentary<\|message\|>(.*?)(?:<\|end\|>|<\|call\|>)",
+            re.DOTALL,
+        )
+
+        cleaned_text = full_normal_text
+        for match in reversed(list(commentary_pattern.finditer(full_normal_text))):
+            # Check if this commentary is a tool call by looking at the text before <|message|>
+            match_start = match.start()
+            # Find where "<|channel|>commentary" starts within the matched pattern
+            # The pattern starts with "<|start|>assistant<|channel|>commentary"
+            # So we look for the text between "commentary" and "<|message|>" in the match
+            match_text = full_normal_text[match_start : match.end()]
+            commentary_idx = match_text.find("<|channel|>commentary")
+            if commentary_idx != -1:
+                message_idx = match_text.find("<|message|>", commentary_idx)
+                if message_idx != -1:
+                    between_text = match_text[commentary_idx:message_idx]
+                    # If no "to=" found, this is regular commentary (reasoning content)
+                    if " to=" not in between_text:
+                        content = match.group(1).strip()
+                        reasoning_parts.append(content)
+                        # Remove this commentary section from normal text
+                        cleaned_text = (
+                            cleaned_text[: match.start()] + cleaned_text[match.end() :]
+                        )
+
+        full_normal_text = cleaned_text
+
+        # Combine all reasoning parts
+        reasoning_text = "".join(reasoning_parts)
+
+        # Process full_normal_text for final output
+        normal_text = ""
+        if self.final_channel_start in full_normal_text:
+            final_start = full_normal_text.find(self.final_channel_start)
+            final_content_start = final_start + len(self.final_channel_start)
+            final_end = full_normal_text.find(
+                self.final_channel_end, final_content_start
+            )
+
+            if final_end != -1:
+                # Extract content before final channel (includes tool calls)
+                before_final = full_normal_text[:final_start].strip()
+                # Extract ONLY the final channel content (not the channel markers)
+                final_text = full_normal_text[final_content_start:final_end].strip()
+                # Extract content after final channel
+                after_final = full_normal_text[
+                    final_end + len(self.final_channel_end) :
+                ].strip()
+
+                # For tool calls + final answer: concatenate tool calls with final text
+                parts = []
+                if before_final:
+                    parts.append(before_final)
+                if final_text:
+                    parts.append(final_text)
+                if after_final:
+                    parts.append(after_final)
+                normal_text = " ".join(parts)
+            else:
+                # Final channel not complete - extract what we have
+                # Look for just <|channel|>final<|message|> without <|return|>
+                alt_final_start = full_normal_text.find("<|channel|>final<|message|>")
+                if alt_final_start != -1:
+                    before_alt_final = full_normal_text[:alt_final_start].strip()
+                    alt_final_content = full_normal_text[
+                        alt_final_start + len("<|channel|>final<|message|>") :
+                    ].strip()
+
+                    parts = []
+                    if before_alt_final:
+                        parts.append(before_alt_final)
+                    if alt_final_content:
+                        parts.append(alt_final_content)
+                    normal_text = " ".join(parts)
+                else:
+                    normal_text = full_normal_text.strip()
+        else:
+            # No final channel, treat all as normal text (includes tool calls)
+            normal_text = full_normal_text.strip()
+
+        return StreamingParseResult(
+            normal_text=normal_text, reasoning_text=reasoning_text
+        )
+
+    def parse_streaming_increment(self, new_text: str) -> StreamingParseResult:
+        """
+        Streaming incremental parsing for GPT-OSS format.
+
+        This is a simplified streaming implementation that accumulates content
+        and delegates to the non-streaming parser for complex multi-channel parsing.
+        TODO: Implement proper incremental parsing for better streaming performance.
+        """
+        self._buffer += new_text
+
+        if not self._in_reasoning:
+            return StreamingParseResult(normal_text=new_text)
+
+        # Check if we have complete sections to process
+        # For GPT-OSS, we need to wait for complete channel sections
+        # HACK: For now, use simplified approach - wait for key markers before processing
+        key_markers = ["<|end|>", "<|call|>", "<|return|>", "assistantfinal"]
+        has_complete_section = any(marker in self._buffer for marker in key_markers)
+
+        if not has_complete_section:
+            # Still accumulating, don't process yet
+            return StreamingParseResult()
+
+        # Handle simplified format (analysis...assistantfinal) with true incremental streaming
+        if (
+            "<|channel|>" not in self._buffer
+        ):  # Simplified format without channel markers
+            if self._buffer.startswith("analysis"):
+                # Check if we have the transition to assistantfinal
+                if "assistantfinal" in self._buffer:
+                    self._in_reasoning = False
+                    # Complete reasoning section - extract and stream it
+                    parts = self._buffer.split("assistantfinal", 1)
+                    reasoning_text = parts[0][len("analysis") :].strip()
+                    final_content = parts[1].strip()
+
+                    # Clear buffer and return both reasoning and final content
+                    self._buffer = ""
+                    return StreamingParseResult(
+                        reasoning_text=reasoning_text if self.stream_reasoning else "",
+                        normal_text=final_content,
+                    )
+                elif self.stream_reasoning:
+                    # Stream reasoning content incrementally as it arrives
+                    current_reasoning = self._buffer[len("analysis") :].strip()
+                    self._buffer = ""
+                    return StreamingParseResult(reasoning_text=current_reasoning)
+                else:
+                    # Wait for assistantfinal
+                    return StreamingParseResult()
+            elif self._buffer.startswith("assistantfinal"):
+                # Direct final content without analysis
+                final_content = self._buffer[len("assistantfinal") :].strip()
+                self._buffer = ""
+                return StreamingParseResult(normal_text=final_content)
+
+        # For full channel format, process sections as they complete
+        result = StreamingParseResult()
+
+        # Process complete analysis sections
+        while (
+            self.think_start_token in self._buffer
+            and self.think_end_token in self._buffer
+        ):
+            start_idx = self._buffer.find(self.think_start_token)
+            start_pos = start_idx + len(self.think_start_token)
+            end_pos = self._buffer.find(self.think_end_token, start_pos)
+
+            if end_pos != -1:
+                reasoning_content = self._buffer[start_pos:end_pos].strip()
+                if self.stream_reasoning and reasoning_content:
+                    result.reasoning_text += reasoning_content
+
+                # Remove processed analysis section
+                self._buffer = (
+                    self._buffer[:start_idx]
+                    + self._buffer[end_pos + len(self.think_end_token) :]
+                )
+            else:
+                break
+
+        # Process complete commentary sections
+        commentary_pattern = re.compile(
+            r"<\|start\|>assistant<\|channel\|>commentary<\|message\|>(.*?)(?:<\|end\|>|<\|call\|>)",
+            re.DOTALL,
+        )
+
+        for match in reversed(list(commentary_pattern.finditer(self._buffer))):
+            # Check if this is a tool call
+            start_pos = match.start()
+            commentary_content = match.group(1).strip()
+            if self.stream_reasoning and commentary_content:
+                result.reasoning_text += commentary_content
+
+            # Remove this commentary section
+            self._buffer = self._buffer[: match.start()] + self._buffer[match.end() :]
+        # Clean up any standalone <|start|>assistant
+        self._buffer = re.sub(
+            r"<\|start\|>assistant(?=<\|start\|>assistant)", "", self._buffer
+        )
+
+        # Handle final channel completion
+        if self.final_channel_start in self._buffer:
+            final_start = self._buffer.find(self.final_channel_start)
+            final_content_start = final_start + len(self.final_channel_start)
+
+            # Check if final channel is complete
+            final_end = self._buffer.find(self.final_channel_end, final_content_start)
+            if final_end != -1:
+                # Complete final channel - process everything
+                final_result = self.detect_and_parse(self._buffer)
+                self._buffer = ""
+                return StreamingParseResult(
+                    normal_text=final_result.normal_text,
+                    reasoning_text=result.reasoning_text + final_result.reasoning_text,
+                )
+            else:
+                # Extract content before final channel (e.g. tool calls)
+                before_final = self._buffer[:final_start]
+                if before_final:
+                    # Output tool calls for processing
+                    result.normal_text += before_final
+                    # Keep the final channel part in buffer
+                    self._buffer = self._buffer[final_start:]
+
+        return result
+
+
 class ReasoningParser:
     """
     Parser that handles both streaming and non-streaming scenarios for extracting
@@ -203,6 +518,7 @@ class ReasoningParser:
         "glm45": Qwen3Detector,
         "kimi": KimiDetector,
         "step3": DeepSeekR1Detector,
+        "gpt-oss": GptOssDetector,
     }
 
     def __init__(
sglang/srt/sampling/sampling_batch_info.py
CHANGED
@@ -68,6 +68,8 @@ class SamplingBatchInfo:
 
     @classmethod
     def from_schedule_batch(cls, batch: ScheduleBatch, vocab_size: int):
+        from sglang.srt.managers.schedule_batch import global_server_args_dict
+
         reqs = batch.reqs
         device = batch.device
         temperatures = (
@@ -97,10 +99,11 @@ class SamplingBatchInfo:
                 logit_bias[i, int(key)] = value
 
         # Check if any request has custom logit processor
-        has_custom_logit_processor =
-
-
-
+        has_custom_logit_processor = global_server_args_dict[
+            "enable_custom_logit_processor"
+        ] and any(  # check the flag first.
+            r.custom_logit_processor for r in reqs
+        )  # then check the requests.
 
         if has_custom_logit_processor:
            # Merge the same type of custom logit processors together