sglang 0.5.0rc0__py3-none-any.whl → 0.5.0rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (170)
  1. sglang/__init__.py +8 -3
  2. sglang/bench_one_batch.py +6 -1
  3. sglang/lang/chat_template.py +18 -0
  4. sglang/srt/bench_utils.py +137 -0
  5. sglang/srt/configs/model_config.py +8 -7
  6. sglang/srt/disaggregation/decode.py +8 -4
  7. sglang/srt/disaggregation/mooncake/conn.py +43 -25
  8. sglang/srt/disaggregation/mooncake/transfer_engine.py +29 -0
  9. sglang/srt/distributed/parallel_state.py +4 -2
  10. sglang/srt/entrypoints/context.py +3 -20
  11. sglang/srt/entrypoints/engine.py +13 -8
  12. sglang/srt/entrypoints/harmony_utils.py +2 -0
  13. sglang/srt/entrypoints/http_server.py +68 -5
  14. sglang/srt/entrypoints/openai/protocol.py +2 -9
  15. sglang/srt/entrypoints/openai/serving_chat.py +60 -265
  16. sglang/srt/entrypoints/openai/serving_completions.py +1 -0
  17. sglang/srt/entrypoints/openai/tool_server.py +4 -3
  18. sglang/srt/function_call/ebnf_composer.py +1 -0
  19. sglang/srt/function_call/function_call_parser.py +2 -0
  20. sglang/srt/function_call/glm4_moe_detector.py +1 -1
  21. sglang/srt/function_call/gpt_oss_detector.py +331 -0
  22. sglang/srt/function_call/kimik2_detector.py +3 -3
  23. sglang/srt/function_call/qwen3_coder_detector.py +219 -9
  24. sglang/srt/jinja_template_utils.py +6 -0
  25. sglang/srt/layers/attention/aiter_backend.py +370 -107
  26. sglang/srt/layers/attention/ascend_backend.py +3 -0
  27. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
  28. sglang/srt/layers/attention/flashattention_backend.py +18 -0
  29. sglang/srt/layers/attention/flashinfer_backend.py +55 -13
  30. sglang/srt/layers/attention/flashinfer_mla_backend.py +1 -0
  31. sglang/srt/layers/attention/hybrid_attn_backend.py +1 -1
  32. sglang/srt/layers/attention/triton_backend.py +24 -27
  33. sglang/srt/layers/attention/trtllm_mha_backend.py +8 -6
  34. sglang/srt/layers/attention/trtllm_mla_backend.py +129 -25
  35. sglang/srt/layers/attention/vision.py +9 -1
  36. sglang/srt/layers/attention/wave_backend.py +627 -0
  37. sglang/srt/layers/attention/wave_ops/decode_attention.py +186 -0
  38. sglang/srt/layers/attention/wave_ops/extend_attention.py +149 -0
  39. sglang/srt/layers/attention/wave_ops/prefill_attention.py +79 -0
  40. sglang/srt/layers/communicator.py +11 -13
  41. sglang/srt/layers/dp_attention.py +118 -27
  42. sglang/srt/layers/flashinfer_comm_fusion.py +4 -4
  43. sglang/srt/layers/linear.py +1 -0
  44. sglang/srt/layers/logits_processor.py +12 -18
  45. sglang/srt/layers/moe/cutlass_moe.py +11 -16
  46. sglang/srt/layers/moe/cutlass_w4a8_moe.py +4 -5
  47. sglang/srt/layers/moe/ep_moe/kernels.py +43 -0
  48. sglang/srt/layers/moe/ep_moe/layer.py +60 -2
  49. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=129,N=352,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  50. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=161,N=192,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  51. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_0/E=16,N=1024,device_name=NVIDIA_B200.json +146 -0
  52. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  53. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20.json +146 -0
  54. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  55. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  56. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  57. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  58. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  59. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  60. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  61. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=384,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  62. sglang/srt/layers/moe/fused_moe_triton/layer.py +7 -9
  63. sglang/srt/layers/moe/token_dispatcher/deepep.py +61 -24
  64. sglang/srt/layers/moe/topk.py +4 -1
  65. sglang/srt/layers/multimodal.py +156 -40
  66. sglang/srt/layers/quantization/__init__.py +10 -35
  67. sglang/srt/layers/quantization/awq.py +15 -16
  68. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +0 -1
  69. sglang/srt/layers/quantization/fp8_kernel.py +277 -0
  70. sglang/srt/layers/quantization/fp8_utils.py +22 -10
  71. sglang/srt/layers/quantization/gptq.py +12 -17
  72. sglang/srt/layers/quantization/marlin_utils.py +15 -5
  73. sglang/srt/layers/quantization/modelopt_quant.py +58 -41
  74. sglang/srt/layers/quantization/mxfp4.py +20 -3
  75. sglang/srt/layers/quantization/utils.py +52 -2
  76. sglang/srt/layers/quantization/w4afp8.py +20 -11
  77. sglang/srt/layers/quantization/w8a8_int8.py +48 -34
  78. sglang/srt/layers/rotary_embedding.py +281 -2
  79. sglang/srt/layers/sampler.py +5 -2
  80. sglang/srt/lora/backend/base_backend.py +3 -23
  81. sglang/srt/lora/layers.py +66 -116
  82. sglang/srt/lora/lora.py +17 -62
  83. sglang/srt/lora/lora_manager.py +12 -48
  84. sglang/srt/lora/lora_registry.py +20 -9
  85. sglang/srt/lora/mem_pool.py +20 -63
  86. sglang/srt/lora/triton_ops/qkv_lora_b.py +1 -1
  87. sglang/srt/lora/utils.py +25 -58
  88. sglang/srt/managers/cache_controller.py +24 -29
  89. sglang/srt/managers/detokenizer_manager.py +1 -1
  90. sglang/srt/managers/io_struct.py +20 -6
  91. sglang/srt/managers/mm_utils.py +1 -2
  92. sglang/srt/managers/multimodal_processor.py +1 -1
  93. sglang/srt/managers/schedule_batch.py +43 -49
  94. sglang/srt/managers/schedule_policy.py +6 -6
  95. sglang/srt/managers/scheduler.py +18 -11
  96. sglang/srt/managers/scheduler_profiler_mixin.py +28 -8
  97. sglang/srt/managers/tokenizer_manager.py +53 -44
  98. sglang/srt/mem_cache/allocator.py +39 -214
  99. sglang/srt/mem_cache/allocator_ascend.py +158 -0
  100. sglang/srt/mem_cache/chunk_cache.py +1 -1
  101. sglang/srt/mem_cache/hicache_storage.py +1 -1
  102. sglang/srt/mem_cache/hiradix_cache.py +34 -24
  103. sglang/srt/mem_cache/lora_radix_cache.py +421 -0
  104. sglang/srt/mem_cache/memory_pool_host.py +33 -35
  105. sglang/srt/mem_cache/radix_cache.py +2 -5
  106. sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +443 -0
  107. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +139 -67
  108. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +6 -9
  109. sglang/srt/model_executor/cuda_graph_runner.py +29 -23
  110. sglang/srt/model_executor/forward_batch_info.py +33 -14
  111. sglang/srt/model_executor/model_runner.py +179 -81
  112. sglang/srt/model_loader/loader.py +18 -6
  113. sglang/srt/models/deepseek_nextn.py +2 -1
  114. sglang/srt/models/deepseek_v2.py +79 -38
  115. sglang/srt/models/gemma2.py +0 -34
  116. sglang/srt/models/gemma3n_mm.py +8 -9
  117. sglang/srt/models/glm4.py +6 -0
  118. sglang/srt/models/glm4_moe.py +11 -11
  119. sglang/srt/models/glm4_moe_nextn.py +2 -1
  120. sglang/srt/models/glm4v.py +589 -0
  121. sglang/srt/models/glm4v_moe.py +400 -0
  122. sglang/srt/models/gpt_oss.py +142 -20
  123. sglang/srt/models/granite.py +0 -25
  124. sglang/srt/models/llama.py +10 -27
  125. sglang/srt/models/llama4.py +19 -6
  126. sglang/srt/models/qwen2.py +2 -2
  127. sglang/srt/models/qwen2_5_vl.py +7 -3
  128. sglang/srt/models/qwen2_audio.py +10 -9
  129. sglang/srt/models/qwen2_moe.py +20 -5
  130. sglang/srt/models/qwen3.py +0 -24
  131. sglang/srt/models/qwen3_classification.py +78 -0
  132. sglang/srt/models/qwen3_moe.py +18 -5
  133. sglang/srt/models/registry.py +1 -1
  134. sglang/srt/models/step3_vl.py +6 -2
  135. sglang/srt/models/torch_native_llama.py +0 -24
  136. sglang/srt/multimodal/processors/base_processor.py +23 -13
  137. sglang/srt/multimodal/processors/glm4v.py +132 -0
  138. sglang/srt/multimodal/processors/qwen_audio.py +4 -2
  139. sglang/srt/operations.py +17 -2
  140. sglang/srt/reasoning_parser.py +316 -0
  141. sglang/srt/sampling/sampling_batch_info.py +7 -4
  142. sglang/srt/server_args.py +142 -140
  143. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +7 -21
  144. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +7 -21
  145. sglang/srt/speculative/eagle_worker.py +16 -0
  146. sglang/srt/two_batch_overlap.py +16 -12
  147. sglang/srt/utils.py +3 -3
  148. sglang/srt/weight_sync/tensor_bucket.py +106 -0
  149. sglang/test/attention/test_trtllm_mla_backend.py +186 -36
  150. sglang/test/doc_patch.py +59 -0
  151. sglang/test/few_shot_gsm8k.py +1 -1
  152. sglang/test/few_shot_gsm8k_engine.py +1 -1
  153. sglang/test/run_eval.py +4 -1
  154. sglang/test/simple_eval_common.py +6 -0
  155. sglang/test/simple_eval_gpqa.py +2 -0
  156. sglang/test/test_fp4_moe.py +118 -36
  157. sglang/test/test_marlin_moe.py +1 -1
  158. sglang/test/test_marlin_utils.py +1 -1
  159. sglang/utils.py +1 -1
  160. sglang/version.py +1 -1
  161. {sglang-0.5.0rc0.dist-info → sglang-0.5.0rc2.dist-info}/METADATA +27 -31
  162. {sglang-0.5.0rc0.dist-info → sglang-0.5.0rc2.dist-info}/RECORD +166 -142
  163. sglang/lang/backend/__init__.py +0 -0
  164. sglang/srt/function_call/harmony_tool_parser.py +0 -130
  165. sglang/srt/layers/quantization/scalar_type.py +0 -352
  166. sglang/srt/lora/backend/flashinfer_backend.py +0 -131
  167. /sglang/{api.py → lang/api.py} +0 -0
  168. {sglang-0.5.0rc0.dist-info → sglang-0.5.0rc2.dist-info}/WHEEL +0 -0
  169. {sglang-0.5.0rc0.dist-info → sglang-0.5.0rc2.dist-info}/licenses/LICENSE +0 -0
  170. {sglang-0.5.0rc0.dist-info → sglang-0.5.0rc2.dist-info}/top_level.txt +0 -0
sglang/srt/multimodal/processors/glm4v.py ADDED
@@ -0,0 +1,132 @@
+ import re
+ from typing import List, Union
+
+ from decord import VideoReader
+ from transformers.video_utils import VideoMetadata
+
+ from sglang.srt.layers.rotary_embedding import MRotaryEmbedding
+ from sglang.srt.models.glm4v import Glm4vForConditionalGeneration
+ from sglang.srt.models.glm4v_moe import Glm4vMoeForConditionalGeneration
+ from sglang.srt.multimodal.processors.base_processor import (
+     BaseMultimodalProcessor as SGLangBaseProcessor,
+ )
+ from sglang.srt.multimodal.processors.base_processor import (
+     BaseMultiModalProcessorOutput,
+     MultimodalSpecialTokens,
+ )
+
+
+ class Glm4vImageProcessor(SGLangBaseProcessor):
+     models = [Glm4vForConditionalGeneration, Glm4vMoeForConditionalGeneration]
+
+     def __init__(self, hf_config, server_args, _processor, *args, **kwargs):
+         super().__init__(hf_config, server_args, _processor, *args, **kwargs)
+
+         # GLM-4.1V and GLM-4.5V specific tokens
+         self.IMAGE_TOKEN = "<|image|>"
+         self.VIDEO_TOKEN = "<|video|>"
+         self.IMAGE_START_TOKEN = "<|begin_of_image|>"
+         self.IMAGE_END_TOKEN = "<|end_of_image|>"
+         self.VIDEO_START_TOKEN = "<|begin_of_video|>"
+         self.VIDEO_END_TOKEN = "<|end_of_video|>"
+
+         # Token IDs
+         self.IM_TOKEN_ID = hf_config.image_token_id
+         self.VIDEO_TOKEN_ID = hf_config.video_token_id
+         self.IMAGE_START_TOKEN_ID = hf_config.image_start_token_id
+         self.IMAGE_END_TOKEN_ID = hf_config.image_end_token_id
+         self.VIDEO_START_TOKEN_ID = hf_config.video_start_token_id
+         self.VIDEO_END_TOKEN_ID = hf_config.video_end_token_id
+
+         # Vision config
+         self.IMAGE_FACTOR = 28
+         self.MIN_PIXELS = 112 * 112
+         self.MAX_PIXELS = 30000 * 28 * 28 * 2
+
+         self.mm_tokens = MultimodalSpecialTokens(
+             image_token=self.IMAGE_TOKEN,
+             image_token_id=self.IM_TOKEN_ID,
+             video_token=self.VIDEO_TOKEN,
+             # Note: For GLM4v videos, it uses the video token before tokenization but uses image token after tokenization
+             video_token_id=self.IM_TOKEN_ID,
+         ).build(_processor)
+
+     # adapted from https://github.com/huggingface/transformers/blob/369c99d0cea403b77bd0aef818527106453fd9fc/src/transformers/video_utils.py#L312
+     async def preprocess_video(self, vr: VideoReader):
+         """
+         Preprocess video using VideoReader from Decord backend.
+
+         Args:
+             vr (VideoReader): VideoReader object from decord
+
+         Returns:
+             tuple: A tuple containing processed frames and metadata
+         """
+         video_fps = vr.get_avg_fps()
+         total_num_frames = len(vr)
+         duration = total_num_frames / video_fps if video_fps else 0
+
+         metadata = VideoMetadata(
+             total_num_frames=int(total_num_frames),
+             fps=float(video_fps),
+             duration=float(duration),
+             video_backend="decord",
+         )
+
+         # Extract all frames
+         indices = list(range(total_num_frames))
+         frames = vr.get_batch(indices).asnumpy()
+         metadata.frames_indices = indices
+
+         return frames, metadata
+
+     async def process_mm_data_async(
+         self,
+         image_data: List[Union[str, bytes]],
+         input_text,
+         request_obj,
+         *args,
+         **kwargs,
+     ):
+         base_output = self.load_mm_data(
+             prompt=input_text,
+             image_data=image_data,
+             video_data=request_obj.video_data,
+             multimodal_tokens=self.mm_tokens,
+         )
+
+         video_metadata = None
+
+         if base_output.videos:
+             videos_processed = [
+                 await self.preprocess_video(video) for video in base_output.videos
+             ]
+             base_output.videos, video_metadata = map(list, zip(*videos_processed))
+             # transformer requires the video inputs to be under this format
+             base_output.videos = [base_output.videos]
+             video_metadata = [video_metadata]
+
+         mm_items, input_ids, ret = self.process_and_combine_mm_data(
+             base_output, self.mm_tokens, video_metadata=video_metadata
+         )
+
+         input_ids = input_ids.flatten()
+         mrope_positions, mrope_position_delta = MRotaryEmbedding.get_rope_index_glm4v(
+             input_ids=input_ids.unsqueeze(0),
+             hf_config=self.hf_config,
+             image_grid_thw=getattr(ret, "image_grid_thw", None),
+             video_grid_thw=getattr(ret, "video_grid_thw", None),
+             attention_mask=getattr(ret, "attention_mask", None),
+         )
+         mrope_positions = mrope_positions.squeeze(1)
+
+         mm_inputs = {
+             "input_ids": input_ids.tolist(),
+             "mm_items": mm_items,
+             "im_token_id": self.mm_tokens.image_token_id,
+             "video_token_id": self.mm_tokens.video_token_id,
+             "mrope_positions": mrope_positions,
+             "mrope_position_delta": mrope_position_delta,
+         }
+
+         return mm_inputs
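
For orientation, the vision constants above bound each image between MIN_PIXELS (112 * 112 = 12,544) and MAX_PIXELS (30000 * 28 * 28 * 2 = 47,040,000) pixels, presumably with dimensions aligned to the 28-pixel patch factor. A minimal sketch of how such a budget is typically enforced; the fit_to_budget helper and its rounding behavior are illustrative assumptions, not part of this diff:

# Illustrative only: clamp an image's area into the processor's pixel budget
# while keeping both sides aligned to the 28-pixel patch factor defined above.
import math

IMAGE_FACTOR = 28
MIN_PIXELS = 112 * 112            # 12_544
MAX_PIXELS = 30000 * 28 * 28 * 2  # 47_040_000


def fit_to_budget(height: int, width: int) -> tuple:
    # Scale so the area lands in [MIN_PIXELS, MAX_PIXELS] (assumed behavior),
    # then round each side to a multiple of IMAGE_FACTOR.
    area = height * width
    scale = 1.0
    if area > MAX_PIXELS:
        scale = math.sqrt(MAX_PIXELS / area)
    elif area < MIN_PIXELS:
        scale = math.sqrt(MIN_PIXELS / area)
    new_h = max(IMAGE_FACTOR, round(height * scale / IMAGE_FACTOR) * IMAGE_FACTOR)
    new_w = max(IMAGE_FACTOR, round(width * scale / IMAGE_FACTOR) * IMAGE_FACTOR)
    return new_h, new_w


print(fit_to_budget(9000, 9000))  # scales down toward the 47M-pixel cap
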
sglang/srt/multimodal/processors/qwen_audio.py CHANGED
@@ -1,6 +1,6 @@
  import re
 
- from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
+ from sglang.srt.managers.schedule_batch import Modality
  from sglang.srt.models.qwen2_audio import Qwen2AudioForConditionalGeneration
  from sglang.srt.multimodal.processors.base_processor import (
      BaseMultimodalProcessor,
@@ -29,6 +29,8 @@ class Qwen2AudioMultimodalProcessor(BaseMultimodalProcessor):
              audio_token_id=self.audio_token_id,
          ).build(_processor)
 
+         self.ATTR_NAME_TO_MODALITY.update({"feature_attention_mask": Modality.AUDIO})
+
      async def process_mm_data_async(
          self,
          audio_data,
@@ -54,7 +56,7 @@ class Qwen2AudioMultimodalProcessor(BaseMultimodalProcessor):
          input_lengths = (input_lengths - 1) // 2 + 1
          output_lengths = (input_lengths - 2) // 2 + 1
 
-         mm_items[0].model_specific_data["audio_feature_lens"] = output_lengths
+         mm_items[0].audio_feature_lens = output_lengths
 
          return {
              "mm_items": mm_items,
sglang/srt/operations.py CHANGED
@@ -1,10 +1,17 @@
+ from __future__ import annotations
+
  import os
  from contextlib import contextmanager
  from dataclasses import dataclass
- from typing import Any, Callable, Dict, Generator, List, Sequence, Union
+ from typing import TYPE_CHECKING, Any, Callable, Dict, Generator, List, Sequence, Union
 
  import torch
 
+ from sglang.srt.layers.dp_attention import set_dp_buffer_len
+
+ if TYPE_CHECKING:
+     from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+
  _ENABLE_PROFILE = bool(int(os.environ.get("SGLANG_OPERATIONS_ENABLE_PROFILE", "0")))
 
  if _ENABLE_PROFILE:
@@ -66,18 +73,26 @@ Stage = List[ExecutionOperation]
 
 
  class _StageExecutor:
-     def __init__(self, debug_name: str, stages: List[Stage], inputs):
+     def __init__(self, debug_name: str, stages: List[Stage], inputs: dict):
          self._debug_name = debug_name
          self._stages = stages
          self._index = 0
          self._stage_state = _StateDict()
          self._stage_output = inputs
 
+         # handling DP attention
+         forward_batch: ForwardBatch = inputs["forward_batch"]
+         self._global_dp_buffer_len = forward_batch.global_dp_buffer_len
+         self._local_dp_buffer_len = forward_batch.input_ids.shape[0]
+
      def next(self):
          assert not self.done
 
          stage = self._stages[self._index]
 
+         if self._global_dp_buffer_len is not None:
+             set_dp_buffer_len(self._global_dp_buffer_len, self._local_dp_buffer_len)
+
          with _annotate_region(debug_name=f"{self._debug_name}{self._index}"):
              for op in stage:
                  with _annotate_region(debug_name=op.debug_name):
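
The new fields stash the batch's DP-attention buffer lengths so they can be re-installed before every stage. A minimal sketch, not sglang code and with simplified names, of why a per-stage set call matters when two stage executors take turns, e.g. under two-batch overlap: any process-global DP state set by one batch must be restored before the other batch's next stage runs.

# Two executors alternating stages; each re-installs its own buffer length.
_dp_buffer_len = None

def set_dp_buffer_len(global_len, local_len):
    global _dp_buffer_len
    _dp_buffer_len = (global_len, local_len)

class StageExecutor:
    def __init__(self, name, global_len, local_len):
        self.name, self.global_len, self.local_len = name, global_len, local_len

    def next(self):
        # Restore this batch's DP buffer length before running its stage,
        # because the other executor may have overwritten the global state.
        set_dp_buffer_len(self.global_len, self.local_len)
        print(self.name, _dp_buffer_len)

a = StageExecutor("batch_a", 64, 16)
b = StageExecutor("batch_b", 96, 24)
for _ in range(2):
    a.next()
    b.next()
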
sglang/srt/reasoning_parser.py CHANGED
@@ -1,3 +1,4 @@
+ import re
  from typing import Dict, Optional, Tuple, Type
 
 
@@ -185,6 +186,320 @@ class KimiDetector(BaseReasoningFormatDetector):
          )
 
 
+ class GptOssDetector(BaseReasoningFormatDetector):
+     """
+     Detector for T4-style reasoning format.
+
+     Assumes reasoning format with two channels:
+     <|channel|>analysis<|message|>...reasoning content...<|end|>
+     <|start|>assistant<|channel|>final<|message|>...final answer...<|return|>
+
+     Returns content from 'analysis' channel as reasoning_text
+     and content from 'final' channel as normal_text.
+
+     Args:
+         stream_reasoning (bool): If False, accumulates reasoning content until complete.
+             If True, streams reasoning content as it arrives.
+     """
+
+     def __init__(self, stream_reasoning: bool = True, force_reasoning: bool = True):
+         # TypeScript uses channel tokens instead of simple start/end tokens
+         super().__init__(
+             "<|channel|>analysis<|message|>",
+             "<|end|>",
+             force_reasoning=True,
+             stream_reasoning=stream_reasoning,
+         )
+         self.final_channel_start = "<|start|>assistant<|channel|>final<|message|>"
+         self.final_channel_end = "<|return|>"
+         self._in_final_channel = False
+         self._analysis_complete = False
+         self._in_reasoning = True
+
+     def detect_and_parse(self, text: str) -> StreamingParseResult:
+         """
+         One-time parsing: Detects and parses both analysis and final channels.
+         Tool call channels are preserved in normal_text for downstream processing.
+
+         HACK: Also handles simplified format where text starts with "analysis" and transitions
+         to "assistantfinal" without full channel markers.
+         """
+         # HACK: Handle simplified format (analysis...assistantfinal) without channel markers
+         if (
+             text.startswith("analysis")
+             and "assistantfinal" in text
+             and "<|channel|>" not in text
+         ):
+             # Split on "assistantfinal"
+             parts = text.split("assistantfinal", 1)
+             self._in_reasoning = False
+             if len(parts) == 2:
+                 reasoning_text = parts[0][
+                     len("analysis") :
+                 ].strip()  # Remove "analysis" prefix
+                 normal_text = parts[1].strip()
+                 return StreamingParseResult(
+                     normal_text=normal_text, reasoning_text=reasoning_text
+                 )
+
+         reasoning_parts = []
+         normal_parts = []
+         current_pos = 0
+
+         # Process text sequentially to preserve tool calls between analysis sections
+         while current_pos < len(text):
+             # Look for next analysis channel
+             analysis_start_idx = text.find(self.think_start_token, current_pos)
+
+             if analysis_start_idx == -1:
+                 # No more analysis channels, rest goes to remaining
+                 break
+
+             # Preserve any content before this analysis channel (could include tool calls)
+             if analysis_start_idx > current_pos:
+                 between_content = text[current_pos:analysis_start_idx]
+                 # This content will be added to normal_parts later
+                 normal_parts.append(between_content)
+
+             # Extract analysis content
+             analysis_content_start = analysis_start_idx + len(self.think_start_token)
+             analysis_end_idx = text.find(self.think_end_token, analysis_content_start)
+
+             if analysis_end_idx != -1:
+                 reasoning_parts.append(
+                     text[analysis_content_start:analysis_end_idx].strip()
+                 )
+                 current_pos = analysis_end_idx + len(self.think_end_token)
+             else:
+                 # Analysis not complete
+                 reasoning_parts.append(text[analysis_content_start:].strip())
+                 reasoning_text = "".join(reasoning_parts)
+                 return StreamingParseResult(reasoning_text=reasoning_text)
+
+         # Add any remaining text after all analysis sections
+         if current_pos < len(text):
+             remaining = text[current_pos:]
+             normal_parts.append(remaining)
+
+         # Process non-analysis content for commentary sections
+         full_normal_text = "".join(normal_parts)
+
+         # Extract reasoning from non-tool-call commentary sections
+         # Tool calls have "to=" in their header, regular commentary does not
+         commentary_pattern = re.compile(
+             r"<\|start\|>assistant<\|channel\|>commentary<\|message\|>(.*?)(?:<\|end\|>|<\|call\|>)",
+             re.DOTALL,
+         )
+
+         cleaned_text = full_normal_text
+         for match in reversed(list(commentary_pattern.finditer(full_normal_text))):
+             # Check if this commentary is a tool call by looking at the text before <|message|>
+             match_start = match.start()
+             # Find where "<|channel|>commentary" starts within the matched pattern
+             # The pattern starts with "<|start|>assistant<|channel|>commentary"
+             # So we look for the text between "commentary" and "<|message|>" in the match
+             match_text = full_normal_text[match_start : match.end()]
+             commentary_idx = match_text.find("<|channel|>commentary")
+             if commentary_idx != -1:
+                 message_idx = match_text.find("<|message|>", commentary_idx)
+                 if message_idx != -1:
+                     between_text = match_text[commentary_idx:message_idx]
+                     # If no "to=" found, this is regular commentary (reasoning content)
+                     if " to=" not in between_text:
+                         content = match.group(1).strip()
+                         reasoning_parts.append(content)
+                         # Remove this commentary section from normal text
+                         cleaned_text = (
+                             cleaned_text[: match.start()] + cleaned_text[match.end() :]
+                         )
+
+         full_normal_text = cleaned_text
+
+         # Combine all reasoning parts
+         reasoning_text = "".join(reasoning_parts)
+
+         # Process full_normal_text for final output
+         normal_text = ""
+         if self.final_channel_start in full_normal_text:
+             final_start = full_normal_text.find(self.final_channel_start)
+             final_content_start = final_start + len(self.final_channel_start)
+             final_end = full_normal_text.find(
+                 self.final_channel_end, final_content_start
+             )
+
+             if final_end != -1:
+                 # Extract content before final channel (includes tool calls)
+                 before_final = full_normal_text[:final_start].strip()
+                 # Extract ONLY the final channel content (not the channel markers)
+                 final_text = full_normal_text[final_content_start:final_end].strip()
+                 # Extract content after final channel
+                 after_final = full_normal_text[
+                     final_end + len(self.final_channel_end) :
+                 ].strip()
+
+                 # For tool calls + final answer: concatenate tool calls with final text
+                 parts = []
+                 if before_final:
+                     parts.append(before_final)
+                 if final_text:
+                     parts.append(final_text)
+                 if after_final:
+                     parts.append(after_final)
+                 normal_text = " ".join(parts)
+             else:
+                 # Final channel not complete - extract what we have
+                 # Look for just <|channel|>final<|message|> without <|return|>
+                 alt_final_start = full_normal_text.find("<|channel|>final<|message|>")
+                 if alt_final_start != -1:
+                     before_alt_final = full_normal_text[:alt_final_start].strip()
+                     alt_final_content = full_normal_text[
+                         alt_final_start + len("<|channel|>final<|message|>") :
+                     ].strip()
+
+                     parts = []
+                     if before_alt_final:
+                         parts.append(before_alt_final)
+                     if alt_final_content:
+                         parts.append(alt_final_content)
+                     normal_text = " ".join(parts)
+                 else:
+                     normal_text = full_normal_text.strip()
+         else:
+             # No final channel, treat all as normal text (includes tool calls)
+             normal_text = full_normal_text.strip()
+
+         return StreamingParseResult(
+             normal_text=normal_text, reasoning_text=reasoning_text
+         )
+
+     def parse_streaming_increment(self, new_text: str) -> StreamingParseResult:
+         """
+         Streaming incremental parsing for GPT-OSS format.
+
+         This is a simplified streaming implementation that accumulates content
+         and delegates to the non-streaming parser for complex multi-channel parsing.
+         TODO: Implement proper incremental parsing for better streaming performance.
+         """
+         self._buffer += new_text
+
+         if not self._in_reasoning:
+             return StreamingParseResult(normal_text=new_text)
+
+         # Check if we have complete sections to process
+         # For GPT-OSS, we need to wait for complete channel sections
+         # HACK: For now, use simplified approach - wait for key markers before processing
+         key_markers = ["<|end|>", "<|call|>", "<|return|>", "assistantfinal"]
+         has_complete_section = any(marker in self._buffer for marker in key_markers)
+
+         if not has_complete_section:
+             # Still accumulating, don't process yet
+             return StreamingParseResult()
+
+         # Handle simplified format (analysis...assistantfinal) with true incremental streaming
+         if (
+             "<|channel|>" not in self._buffer
+         ):  # Simplified format without channel markers
+             if self._buffer.startswith("analysis"):
+                 # Check if we have the transition to assistantfinal
+                 if "assistantfinal" in self._buffer:
+                     self._in_reasoning = False
+                     # Complete reasoning section - extract and stream it
+                     parts = self._buffer.split("assistantfinal", 1)
+                     reasoning_text = parts[0][len("analysis") :].strip()
+                     final_content = parts[1].strip()
+
+                     # Clear buffer and return both reasoning and final content
+                     self._buffer = ""
+                     return StreamingParseResult(
+                         reasoning_text=reasoning_text if self.stream_reasoning else "",
+                         normal_text=final_content,
+                     )
+                 elif self.stream_reasoning:
+                     # Stream reasoning content incrementally as it arrives
+                     current_reasoning = self._buffer[len("analysis") :].strip()
+                     self._buffer = ""
+                     return StreamingParseResult(reasoning_text=current_reasoning)
+                 else:
+                     # Wait for assistantfinal
+                     return StreamingParseResult()
+             elif self._buffer.startswith("assistantfinal"):
+                 # Direct final content without analysis
+                 final_content = self._buffer[len("assistantfinal") :].strip()
+                 self._buffer = ""
+                 return StreamingParseResult(normal_text=final_content)
+
+         # For full channel format, process sections as they complete
+         result = StreamingParseResult()
+
+         # Process complete analysis sections
+         while (
+             self.think_start_token in self._buffer
+             and self.think_end_token in self._buffer
+         ):
+             start_idx = self._buffer.find(self.think_start_token)
+             start_pos = start_idx + len(self.think_start_token)
+             end_pos = self._buffer.find(self.think_end_token, start_pos)
+
+             if end_pos != -1:
+                 reasoning_content = self._buffer[start_pos:end_pos].strip()
+                 if self.stream_reasoning and reasoning_content:
+                     result.reasoning_text += reasoning_content
+
+                 # Remove processed analysis section
+                 self._buffer = (
+                     self._buffer[:start_idx]
+                     + self._buffer[end_pos + len(self.think_end_token) :]
+                 )
+             else:
+                 break
+
+         # Process complete commentary sections
+         commentary_pattern = re.compile(
+             r"<\|start\|>assistant<\|channel\|>commentary<\|message\|>(.*?)(?:<\|end\|>|<\|call\|>)",
+             re.DOTALL,
+         )
+
+         for match in reversed(list(commentary_pattern.finditer(self._buffer))):
+             # Check if this is a tool call
+             start_pos = match.start()
+             commentary_content = match.group(1).strip()
+             if self.stream_reasoning and commentary_content:
+                 result.reasoning_text += commentary_content
+
+             # Remove this commentary section
+             self._buffer = self._buffer[: match.start()] + self._buffer[match.end() :]
+         # Clean up any standalone <|start|>assistant
+         self._buffer = re.sub(
+             r"<\|start\|>assistant(?=<\|start\|>assistant)", "", self._buffer
+         )
+
+         # Handle final channel completion
+         if self.final_channel_start in self._buffer:
+             final_start = self._buffer.find(self.final_channel_start)
+             final_content_start = final_start + len(self.final_channel_start)
+
+             # Check if final channel is complete
+             final_end = self._buffer.find(self.final_channel_end, final_content_start)
+             if final_end != -1:
+                 # Complete final channel - process everything
+                 final_result = self.detect_and_parse(self._buffer)
+                 self._buffer = ""
+                 return StreamingParseResult(
+                     normal_text=final_result.normal_text,
+                     reasoning_text=result.reasoning_text + final_result.reasoning_text,
+                 )
+             else:
+                 # Extract content before final channel (e.g. tool calls)
+                 before_final = self._buffer[:final_start]
+                 if before_final:
+                     # Output tool calls for processing
+                     result.normal_text += before_final
+                 # Keep the final channel part in buffer
+                 self._buffer = self._buffer[final_start:]
+
+         return result
+
+
  class ReasoningParser:
      """
      Parser that handles both streaming and non-streaming scenarios for extracting
@@ -203,6 +518,7 @@ class ReasoningParser:
          "glm45": Qwen3Detector,
          "kimi": KimiDetector,
          "step3": DeepSeekR1Detector,
+         "gpt-oss": GptOssDetector,
      }
 
      def __init__(
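
A usage sketch of the new detector on a complete two-channel completion, using the class, method, and field names defined in the diff above (the import path follows the file shown here and may differ between releases):

from sglang.srt.reasoning_parser import GptOssDetector

# Split a GPT-OSS style completion into reasoning and final answer.
detector = GptOssDetector(stream_reasoning=False)
result = detector.detect_and_parse(
    "<|channel|>analysis<|message|>Need to add 2 and 2.<|end|>"
    "<|start|>assistant<|channel|>final<|message|>4<|return|>"
)
print(result.reasoning_text)  # "Need to add 2 and 2."
print(result.normal_text)     # "4"

Because the detector is registered under "gpt-oss" in ReasoningParser.DetectorMap, it can also be selected through the generic ReasoningParser entry point.
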
sglang/srt/sampling/sampling_batch_info.py CHANGED
@@ -68,6 +68,8 @@ class SamplingBatchInfo:
 
      @classmethod
      def from_schedule_batch(cls, batch: ScheduleBatch, vocab_size: int):
+         from sglang.srt.managers.schedule_batch import global_server_args_dict
+
          reqs = batch.reqs
          device = batch.device
          temperatures = (
@@ -97,10 +99,11 @@ class SamplingBatchInfo:
                          logit_bias[i, int(key)] = value
 
          # Check if any request has custom logit processor
-         has_custom_logit_processor = (
-             batch.enable_custom_logit_processor  # check the flag first.
-             and any(r.custom_logit_processor for r in reqs)  # then check the requests.
-         )
+         has_custom_logit_processor = global_server_args_dict[
+             "enable_custom_logit_processor"
+         ] and any(  # check the flag first.
+             r.custom_logit_processor for r in reqs
+         )  # then check the requests.
 
          if has_custom_logit_processor:
              # Merge the same type of custom logit processors together