sglang 0.5.1.post2__py3-none-any.whl → 0.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (256)
  1. sglang/bench_one_batch.py +3 -0
  2. sglang/bench_one_batch_server.py +89 -54
  3. sglang/bench_serving.py +437 -40
  4. sglang/lang/interpreter.py +1 -1
  5. sglang/profiler.py +0 -1
  6. sglang/srt/configs/__init__.py +4 -0
  7. sglang/srt/configs/internvl.py +6 -0
  8. sglang/srt/configs/longcat_flash.py +104 -0
  9. sglang/srt/configs/model_config.py +37 -7
  10. sglang/srt/configs/qwen3_next.py +326 -0
  11. sglang/srt/connector/__init__.py +1 -1
  12. sglang/srt/connector/base_connector.py +1 -2
  13. sglang/srt/connector/redis.py +2 -2
  14. sglang/srt/connector/serde/__init__.py +1 -1
  15. sglang/srt/connector/serde/safe_serde.py +4 -3
  16. sglang/srt/custom_op.py +11 -1
  17. sglang/srt/debug_utils/dump_comparator.py +81 -44
  18. sglang/srt/debug_utils/dump_loader.py +97 -0
  19. sglang/srt/debug_utils/dumper.py +11 -3
  20. sglang/srt/debug_utils/text_comparator.py +73 -11
  21. sglang/srt/disaggregation/ascend/conn.py +75 -0
  22. sglang/srt/disaggregation/base/conn.py +1 -1
  23. sglang/srt/disaggregation/common/conn.py +15 -12
  24. sglang/srt/disaggregation/decode.py +6 -4
  25. sglang/srt/disaggregation/fake/conn.py +1 -1
  26. sglang/srt/disaggregation/mini_lb.py +6 -420
  27. sglang/srt/disaggregation/mooncake/conn.py +18 -10
  28. sglang/srt/disaggregation/nixl/conn.py +180 -16
  29. sglang/srt/disaggregation/prefill.py +6 -4
  30. sglang/srt/disaggregation/utils.py +5 -50
  31. sglang/srt/distributed/parallel_state.py +94 -58
  32. sglang/srt/entrypoints/engine.py +34 -14
  33. sglang/srt/entrypoints/http_server.py +172 -47
  34. sglang/srt/entrypoints/openai/protocol.py +90 -27
  35. sglang/srt/entrypoints/openai/serving_base.py +6 -2
  36. sglang/srt/entrypoints/openai/serving_chat.py +82 -26
  37. sglang/srt/entrypoints/openai/serving_completions.py +25 -4
  38. sglang/srt/entrypoints/openai/serving_embedding.py +8 -4
  39. sglang/srt/entrypoints/openai/serving_responses.py +7 -4
  40. sglang/srt/eplb/eplb_manager.py +28 -4
  41. sglang/srt/eplb/expert_distribution.py +55 -15
  42. sglang/srt/eplb/expert_location.py +8 -3
  43. sglang/srt/eplb/expert_location_updater.py +1 -1
  44. sglang/srt/function_call/deepseekv31_detector.py +222 -0
  45. sglang/srt/function_call/ebnf_composer.py +11 -9
  46. sglang/srt/function_call/function_call_parser.py +2 -0
  47. sglang/srt/function_call/glm4_moe_detector.py +1 -1
  48. sglang/srt/function_call/gpt_oss_detector.py +144 -256
  49. sglang/srt/function_call/qwen3_coder_detector.py +1 -1
  50. sglang/srt/hf_transformers_utils.py +28 -7
  51. sglang/srt/layers/activation.py +44 -9
  52. sglang/srt/layers/attention/aiter_backend.py +93 -68
  53. sglang/srt/layers/attention/ascend_backend.py +381 -136
  54. sglang/srt/layers/attention/fla/chunk.py +242 -0
  55. sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
  56. sglang/srt/layers/attention/fla/chunk_o.py +178 -0
  57. sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
  58. sglang/srt/layers/attention/fla/cumsum.py +300 -0
  59. sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
  60. sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
  61. sglang/srt/layers/attention/fla/index.py +37 -0
  62. sglang/srt/layers/attention/fla/l2norm.py +150 -0
  63. sglang/srt/layers/attention/fla/layernorm_gated.py +326 -0
  64. sglang/srt/layers/attention/fla/op.py +66 -0
  65. sglang/srt/layers/attention/fla/solve_tril.py +465 -0
  66. sglang/srt/layers/attention/fla/utils.py +331 -0
  67. sglang/srt/layers/attention/fla/wy_fast.py +158 -0
  68. sglang/srt/layers/attention/flashattention_backend.py +241 -7
  69. sglang/srt/layers/attention/flashinfer_backend.py +11 -6
  70. sglang/srt/layers/attention/flashinfer_mla_backend.py +21 -14
  71. sglang/srt/layers/attention/hybrid_attn_backend.py +47 -8
  72. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +584 -0
  73. sglang/srt/layers/attention/intel_amx_backend.py +3 -0
  74. sglang/srt/layers/attention/mamba/causal_conv1d.py +128 -0
  75. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +1052 -0
  76. sglang/srt/layers/attention/mamba/mamba.py +64 -0
  77. sglang/srt/layers/attention/torch_native_backend.py +12 -6
  78. sglang/srt/layers/attention/trtllm_mla_backend.py +126 -36
  79. sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
  80. sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
  81. sglang/srt/layers/communicator.py +45 -8
  82. sglang/srt/layers/layernorm.py +54 -12
  83. sglang/srt/layers/logits_processor.py +10 -3
  84. sglang/srt/layers/moe/__init__.py +2 -1
  85. sglang/srt/layers/moe/cutlass_moe.py +0 -8
  86. sglang/srt/layers/moe/cutlass_w4a8_moe.py +4 -12
  87. sglang/srt/layers/moe/ep_moe/kernels.py +74 -0
  88. sglang/srt/layers/moe/ep_moe/layer.py +111 -56
  89. sglang/srt/layers/moe/fused_moe_native.py +5 -3
  90. sglang/srt/layers/moe/fused_moe_triton/__init__.py +5 -3
  91. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  92. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=352,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +146 -0
  93. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/{E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +29 -29
  94. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=64,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  95. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  96. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
  97. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
  98. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  99. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +9 -1049
  100. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +212 -0
  101. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +799 -0
  102. sglang/srt/layers/moe/fused_moe_triton/layer.py +56 -45
  103. sglang/srt/layers/moe/fused_moe_triton/moe_align_block_size.py +87 -0
  104. sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
  105. sglang/srt/layers/moe/moe_runner/base.py +274 -1
  106. sglang/srt/layers/moe/moe_runner/runner.py +80 -0
  107. sglang/srt/layers/moe/moe_runner/triton.py +448 -0
  108. sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
  109. sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
  110. sglang/srt/layers/moe/token_dispatcher/deepep.py +41 -38
  111. sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
  112. sglang/srt/layers/moe/topk.py +43 -12
  113. sglang/srt/layers/moe/utils.py +6 -5
  114. sglang/srt/layers/quantization/awq.py +19 -7
  115. sglang/srt/layers/quantization/base_config.py +11 -6
  116. sglang/srt/layers/quantization/blockwise_int8.py +38 -27
  117. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
  118. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
  119. sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py +141 -235
  120. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +5 -10
  121. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +31 -22
  122. sglang/srt/layers/quantization/fp8.py +78 -48
  123. sglang/srt/layers/quantization/fp8_kernel.py +2 -2
  124. sglang/srt/layers/quantization/fp8_utils.py +45 -31
  125. sglang/srt/layers/quantization/gptq.py +25 -17
  126. sglang/srt/layers/quantization/modelopt_quant.py +107 -40
  127. sglang/srt/layers/quantization/moe_wna16.py +21 -18
  128. sglang/srt/layers/quantization/mxfp4.py +93 -68
  129. sglang/srt/layers/quantization/mxfp4_tensor.py +3 -1
  130. sglang/srt/layers/quantization/quark/quark_moe.py +32 -27
  131. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +49 -30
  132. sglang/srt/layers/quantization/quark/utils.py +97 -0
  133. sglang/srt/layers/quantization/rocm_mxfp4_utils.py +13 -0
  134. sglang/srt/layers/quantization/unquant.py +135 -47
  135. sglang/srt/layers/quantization/utils.py +13 -0
  136. sglang/srt/layers/quantization/w4afp8.py +60 -42
  137. sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
  138. sglang/srt/layers/quantization/w8a8_int8.py +83 -41
  139. sglang/srt/layers/rocm_linear_utils.py +44 -0
  140. sglang/srt/layers/rotary_embedding.py +28 -19
  141. sglang/srt/layers/sampler.py +29 -5
  142. sglang/srt/layers/utils.py +0 -14
  143. sglang/srt/lora/backend/base_backend.py +50 -8
  144. sglang/srt/lora/backend/triton_backend.py +90 -2
  145. sglang/srt/lora/layers.py +32 -0
  146. sglang/srt/lora/lora.py +4 -1
  147. sglang/srt/lora/lora_manager.py +35 -112
  148. sglang/srt/lora/mem_pool.py +24 -10
  149. sglang/srt/lora/utils.py +18 -9
  150. sglang/srt/managers/cache_controller.py +396 -365
  151. sglang/srt/managers/data_parallel_controller.py +30 -15
  152. sglang/srt/managers/detokenizer_manager.py +18 -2
  153. sglang/srt/managers/disagg_service.py +46 -0
  154. sglang/srt/managers/io_struct.py +190 -11
  155. sglang/srt/managers/mm_utils.py +6 -1
  156. sglang/srt/managers/multi_tokenizer_mixin.py +579 -0
  157. sglang/srt/managers/schedule_batch.py +27 -44
  158. sglang/srt/managers/schedule_policy.py +4 -3
  159. sglang/srt/managers/scheduler.py +148 -122
  160. sglang/srt/managers/scheduler_metrics_mixin.py +114 -8
  161. sglang/srt/managers/scheduler_output_processor_mixin.py +29 -19
  162. sglang/srt/managers/scheduler_profiler_mixin.py +1 -1
  163. sglang/srt/managers/scheduler_update_weights_mixin.py +8 -1
  164. sglang/srt/managers/template_manager.py +3 -3
  165. sglang/srt/managers/tokenizer_communicator_mixin.py +491 -0
  166. sglang/srt/managers/tokenizer_manager.py +77 -480
  167. sglang/srt/managers/tp_worker.py +16 -4
  168. sglang/srt/managers/tp_worker_overlap_thread.py +8 -10
  169. sglang/srt/mem_cache/allocator.py +1 -1
  170. sglang/srt/mem_cache/chunk_cache.py +1 -1
  171. sglang/srt/mem_cache/hicache_storage.py +53 -40
  172. sglang/srt/mem_cache/hiradix_cache.py +196 -104
  173. sglang/srt/mem_cache/lora_radix_cache.py +1 -1
  174. sglang/srt/mem_cache/memory_pool.py +395 -53
  175. sglang/srt/mem_cache/memory_pool_host.py +27 -19
  176. sglang/srt/mem_cache/radix_cache.py +6 -6
  177. sglang/srt/mem_cache/radix_cache_cpp.py +1 -1
  178. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
  179. sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
  180. sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +61 -34
  181. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +152 -23
  182. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +280 -0
  183. sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
  184. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +154 -95
  185. sglang/srt/mem_cache/storage/mooncake_store/test_mooncake_store.py +161 -0
  186. sglang/srt/mem_cache/swa_radix_cache.py +1 -3
  187. sglang/srt/metrics/collector.py +484 -63
  188. sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
  189. sglang/srt/metrics/utils.py +48 -0
  190. sglang/srt/model_executor/cpu_graph_runner.py +640 -0
  191. sglang/srt/model_executor/cuda_graph_runner.py +13 -5
  192. sglang/srt/model_executor/forward_batch_info.py +72 -18
  193. sglang/srt/model_executor/model_runner.py +190 -32
  194. sglang/srt/model_loader/__init__.py +9 -3
  195. sglang/srt/model_loader/loader.py +33 -28
  196. sglang/srt/model_loader/utils.py +12 -0
  197. sglang/srt/model_loader/weight_utils.py +2 -1
  198. sglang/srt/models/deepseek_v2.py +323 -53
  199. sglang/srt/models/gemma3n_mm.py +1 -1
  200. sglang/srt/models/glm4_moe.py +10 -1
  201. sglang/srt/models/glm4v.py +4 -2
  202. sglang/srt/models/gpt_oss.py +7 -19
  203. sglang/srt/models/internvl.py +28 -0
  204. sglang/srt/models/llama4.py +9 -0
  205. sglang/srt/models/llama_eagle3.py +17 -0
  206. sglang/srt/models/longcat_flash.py +1026 -0
  207. sglang/srt/models/longcat_flash_nextn.py +699 -0
  208. sglang/srt/models/minicpmv.py +165 -3
  209. sglang/srt/models/mllama4.py +25 -0
  210. sglang/srt/models/opt.py +637 -0
  211. sglang/srt/models/qwen2.py +33 -3
  212. sglang/srt/models/qwen2_5_vl.py +91 -42
  213. sglang/srt/models/qwen2_moe.py +79 -14
  214. sglang/srt/models/qwen3.py +8 -2
  215. sglang/srt/models/qwen3_moe.py +39 -8
  216. sglang/srt/models/qwen3_next.py +1039 -0
  217. sglang/srt/models/qwen3_next_mtp.py +109 -0
  218. sglang/srt/models/torch_native_llama.py +1 -1
  219. sglang/srt/models/transformers.py +1 -1
  220. sglang/srt/multimodal/processors/base_processor.py +4 -2
  221. sglang/srt/multimodal/processors/glm4v.py +9 -9
  222. sglang/srt/multimodal/processors/internvl.py +141 -129
  223. sglang/srt/{conversation.py → parser/conversation.py} +38 -5
  224. sglang/srt/parser/harmony_parser.py +588 -0
  225. sglang/srt/parser/reasoning_parser.py +309 -0
  226. sglang/srt/sampling/penaltylib/orchestrator.py +14 -2
  227. sglang/srt/sampling/sampling_batch_info.py +18 -15
  228. sglang/srt/server_args.py +307 -80
  229. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +5 -0
  230. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +10 -1
  231. sglang/srt/speculative/eagle_worker.py +216 -120
  232. sglang/srt/speculative/spec_info.py +5 -0
  233. sglang/srt/speculative/standalone_worker.py +109 -0
  234. sglang/srt/tokenizer/tiktoken_tokenizer.py +6 -1
  235. sglang/srt/utils.py +96 -7
  236. sglang/srt/weight_sync/utils.py +1 -1
  237. sglang/test/attention/test_trtllm_mla_backend.py +181 -8
  238. sglang/test/few_shot_gsm8k.py +1 -0
  239. sglang/test/runners.py +4 -0
  240. sglang/test/test_cutlass_moe.py +24 -6
  241. sglang/test/test_cutlass_w4a8_moe.py +24 -9
  242. sglang/test/test_disaggregation_utils.py +66 -0
  243. sglang/test/test_utils.py +25 -1
  244. sglang/utils.py +5 -0
  245. sglang/version.py +1 -1
  246. {sglang-0.5.1.post2.dist-info → sglang-0.5.2.dist-info}/METADATA +13 -10
  247. {sglang-0.5.1.post2.dist-info → sglang-0.5.2.dist-info}/RECORD +253 -201
  248. sglang/srt/disaggregation/launch_lb.py +0 -131
  249. sglang/srt/mem_cache/storage/mooncake_store/unit_test.py +0 -40
  250. sglang/srt/reasoning_parser.py +0 -553
  251. /sglang/srt/{model_parallel.py → layers/model_parallel.py} +0 -0
  252. /sglang/srt/{code_completion_parser.py → parser/code_completion_parser.py} +0 -0
  253. /sglang/srt/{jinja_template_utils.py → parser/jinja_template_utils.py} +0 -0
  254. {sglang-0.5.1.post2.dist-info → sglang-0.5.2.dist-info}/WHEEL +0 -0
  255. {sglang-0.5.1.post2.dist-info → sglang-0.5.2.dist-info}/licenses/LICENSE +0 -0
  256. {sglang-0.5.1.post2.dist-info → sglang-0.5.2.dist-info}/top_level.txt +0 -0
sglang/srt/parser/reasoning_parser.py (new file)
@@ -0,0 +1,309 @@
+ import re
+ from typing import Dict, Optional, Tuple, Type
+
+ from sglang.srt.parser.harmony_parser import HarmonyParser
+
+
+ class StreamingParseResult:
+     """Result of streaming incremental parsing."""
+
+     def __init__(
+         self,
+         normal_text: Optional[str] = None,
+         reasoning_text: Optional[str] = None,
+     ):
+         self.normal_text = normal_text or ""
+         self.reasoning_text = reasoning_text or ""
+
+
+ class BaseReasoningFormatDetector:
+     """Base class providing two sets of interfaces: one-time and streaming incremental."""
+
+     def __init__(
+         self,
+         think_start_token: str,
+         think_end_token: str,
+         force_reasoning: bool = False,
+         stream_reasoning: bool = True,
+     ):
+         self.think_start_token = think_start_token
+         self.think_end_token = think_end_token
+         self._in_reasoning = force_reasoning
+         self.stream_reasoning = stream_reasoning
+
+         self._buffer = ""
+         self.stripped_think_start = False
+
+     def detect_and_parse(self, text: str) -> StreamingParseResult:
+         """
+         One-time parsing: Detects and parses reasoning sections in the provided text.
+         Returns both reasoning content and normal text separately.
+         """
+         in_reasoning = self._in_reasoning or self.think_start_token in text
+
+         if not in_reasoning:
+             return StreamingParseResult(normal_text=text)
+
+         # The text is considered to be in a reasoning block.
+         processed_text = text.replace(self.think_start_token, "").strip()
+
+         if self.think_end_token not in processed_text:
+             # Assume reasoning was truncated before `</think>` token
+             return StreamingParseResult(reasoning_text=processed_text)
+
+         # Extract reasoning content
+         splits = processed_text.split(self.think_end_token, maxsplit=1)
+         reasoning_text = splits[0]
+         normal_text = splits[1].strip()
+
+         return StreamingParseResult(
+             normal_text=normal_text, reasoning_text=reasoning_text
+         )
+
+     def parse_streaming_increment(self, new_text: str) -> StreamingParseResult:
+         """
+         Streaming incremental parsing for reasoning content.
+         Handles partial reasoning tags and content.
+
+         If stream_reasoning is False:
+             Accumulates reasoning content until the end tag is found.
+         If stream_reasoning is True:
+             Streams reasoning content as it arrives.
+         """
+         self._buffer += new_text
+         current_text = self._buffer
+
+         # If the current text is a prefix of the think token, keep buffering
+         if any(
+             token.startswith(current_text) and token != current_text
+             for token in [self.think_start_token, self.think_end_token]
+         ):
+             return StreamingParseResult()
+
+         # Strip `<think>` token if present
+         if not self.stripped_think_start and self.think_start_token in current_text:
+             current_text = current_text.replace(self.think_start_token, "")
+             self.stripped_think_start = True
+             self._in_reasoning = True
+
+         # Handle end of reasoning block
+         if self._in_reasoning and self.think_end_token in current_text:
+             end_idx = current_text.find(self.think_end_token)
+
+             reasoning_text = current_text[:end_idx]
+
+             self._buffer = ""
+             self._in_reasoning = False
+             normal_text = current_text[end_idx + len(self.think_end_token) :]
+
+             return StreamingParseResult(
+                 normal_text=normal_text, reasoning_text=reasoning_text.rstrip()
+             )
+
+         # Continue with reasoning content
+         if self._in_reasoning:
+             if self.stream_reasoning:
+                 # Stream the content immediately
+                 self._buffer = ""
+                 return StreamingParseResult(reasoning_text=current_text)
+             else:
+                 return StreamingParseResult()
+
+         # If we're not in a reasoning block, return as normal text
+         if not self._in_reasoning:
+             self._buffer = ""
+             return StreamingParseResult(normal_text=current_text)
+
+         return StreamingParseResult()
+
+
+ class DeepSeekR1Detector(BaseReasoningFormatDetector):
+     """
+     Detector for DeepSeek-R1 model.
+     Assumes reasoning format:
+         (<think>)*(.*)</think>
+     Returns all the text before the </think> tag as `reasoning_text`
+     and the rest of the text as `normal_text`.
+
+     Supported models:
+     - DeepSeek-R1: Always generates thinking content without <think> start tag
+     - DeepSeek-R1-0528: Generates thinking content with <think> start tag
+
+     Format patterns:
+     - DeepSeek-R1: "I need to think about this...</think>The answer is 42."
+     - DeepSeek-R1-0528: "<think>I need to think about this...</think>The answer is 42."
+
+     Args:
+         stream_reasoning (bool): If False, accumulates reasoning content until the end tag.
+             If True, streams reasoning content as it arrives.
+     """
+
+     def __init__(self, stream_reasoning: bool = True, force_reasoning: bool = True):
+         # DeepSeek-R1 is assumed to be reasoning until `</think>` token
+         super().__init__(
+             "<think>",
+             "</think>",
+             force_reasoning=True,
+             stream_reasoning=stream_reasoning,
+         )
+         # https://github.com/sgl-project/sglang/pull/3202#discussion_r1950153599
+
+
+ class Qwen3Detector(BaseReasoningFormatDetector):
+     """
+     Detector for Qwen3 models (e.g., Qwen/Qwen3-235B-A22B).
+     Assumes reasoning format:
+         (<think>)*(.*)</think>
+
+     Qwen3 models released before 07/2025 support switching between thinking mode
+     and normal mode via the `enable_thinking` parameter in the request.
+     - enable_thinking=True: "<think>reasoning content</think>The answer is 42."
+     - enable_thinking=False: "The answer is 42." (no thinking tokens)
+
+     Args:
+         stream_reasoning (bool): If False, accumulates reasoning content until the end tag.
+             If True, streams reasoning content as it arrives.
+     """
+
+     def __init__(self, stream_reasoning: bool = True, force_reasoning: bool = False):
+         super().__init__(
+             "<think>",
+             "</think>",
+             force_reasoning=force_reasoning,
+             stream_reasoning=stream_reasoning,
+         )
+
+
+ class KimiDetector(BaseReasoningFormatDetector):
+     """
+     Detector for Kimi Thinking model.
+     Assumes reasoning format:
+         ◁think▷*(.*)◁/think▷
+     Returns all the text before the ◁/think▷ tag as `reasoning_text`
+     and the rest of the text as `normal_text`.
+     """
+
+     def __init__(self, stream_reasoning: bool = True, force_reasoning: bool = False):
+         super().__init__(
+             "◁think▷",
+             "◁/think▷",
+             force_reasoning=False,
+             stream_reasoning=stream_reasoning,
+         )
+
+
+ class GptOssDetector(BaseReasoningFormatDetector):
+     """
+     Detector for T4-style reasoning format (GPT-OSS), using the HarmonyParser.
+     """
+
+     def __init__(self, stream_reasoning: bool = True, force_reasoning: bool = True):
+         super().__init__(
+             "<|channel|>analysis<|message|>",
+             "<|end|>",
+             force_reasoning=force_reasoning,
+             stream_reasoning=stream_reasoning,
+         )
+         self.parser = HarmonyParser()
+
+     def detect_and_parse(self, text: str) -> StreamingParseResult:
+         events = self.parser.parse(text)
+         # Flush the buffer for one-shot parsing
+         events += self.parser.parse("")
+
+         reasoning_text = "".join(
+             [e.content for e in events if e.event_type == "reasoning"]
+         )
+         normal_parts = []
+         for e in events:
+             if e.event_type == "normal":
+                 normal_parts.append(e.content)
+             elif e.event_type == "tool_call":
+                 # Use raw_text to preserve structural markers for function call detector
+                 normal_parts.append(e.raw_text if e.raw_text else e.content)
+         normal_text = "".join(normal_parts)
+         # Tool call events preserve raw text with structural markers
+
+         return StreamingParseResult(
+             normal_text=normal_text,
+             reasoning_text=reasoning_text,
+         )
+
+     def parse_streaming_increment(self, new_text: str) -> StreamingParseResult:
+         events = self.parser.parse(new_text)
+
+         reasoning_text = "".join(
+             [e.content for e in events if e.event_type == "reasoning"]
+         )
+         normal_parts = []
+         for e in events:
+             if e.event_type == "normal":
+                 normal_parts.append(e.content)
+             elif e.event_type == "tool_call":
+                 # Use raw_text to preserve structural markers for function call detector
+                 normal_parts.append(e.raw_text if e.raw_text else e.content)
+         normal_text = "".join(normal_parts)
+
+         return StreamingParseResult(
+             normal_text=normal_text,
+             reasoning_text=reasoning_text,
+         )
+
+
+ class ReasoningParser:
+     """
+     Parser that handles both streaming and non-streaming scenarios for extracting
+     reasoning content from model outputs.
+
+     Args:
+         model_type (str): Type of model to parse reasoning from
+         stream_reasoning (bool): If False, accumulates reasoning content until complete.
+             If True, streams reasoning content as it arrives.
+     """
+
+     DetectorMap: Dict[str, Type[BaseReasoningFormatDetector]] = {
+         "deepseek-r1": DeepSeekR1Detector,
+         "deepseek-v3": Qwen3Detector,
+         "glm45": Qwen3Detector,
+         "gpt-oss": GptOssDetector,
+         "kimi": KimiDetector,
+         "qwen3": Qwen3Detector,
+         "qwen3-thinking": Qwen3Detector,
+         "step3": DeepSeekR1Detector,
+     }
+
+     def __init__(
+         self,
+         model_type: Optional[str] = None,
+         stream_reasoning: bool = True,
+         force_reasoning: Optional[bool] = None,
+     ):
+         if not model_type:
+             raise ValueError("Model type must be specified")
+
+         detector_class = self.DetectorMap.get(model_type.lower())
+         if not detector_class:
+             raise ValueError(f"Unsupported model type: {model_type}")
+
+         # Special cases where we override force_reasoning
+         if model_type.lower() in {"qwen3-thinking", "gpt-oss"}:
+             force_reasoning = True
+
+         # Only pass force_reasoning if explicitly set; let detectors use their defaults
+         kwargs = {"stream_reasoning": stream_reasoning}
+         if force_reasoning is not None:
+             kwargs["force_reasoning"] = force_reasoning
+
+         self.detector = detector_class(**kwargs)
+
+     def parse_non_stream(self, full_text: str) -> Tuple[Optional[str], Optional[str]]:
+         """Non-streaming call: one-time parsing"""
+         ret = self.detector.detect_and_parse(full_text)
+         return ret.reasoning_text, ret.normal_text
+
+     def parse_stream_chunk(
+         self, chunk_text: str
+     ) -> Tuple[Optional[str], Optional[str]]:
+         """Streaming call: incremental parsing"""
+         ret = self.detector.parse_streaming_increment(chunk_text)
+         return ret.reasoning_text, ret.normal_text
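
For orientation, here is a minimal usage sketch of the relocated parser in both one-shot and streaming mode. The output strings and chunk boundaries are invented for illustration; only the ReasoningParser API shown in the hunk above is assumed.

    from sglang.srt.parser.reasoning_parser import ReasoningParser

    # One-shot: split a complete DeepSeek-R1-0528-style output into both channels.
    parser = ReasoningParser(model_type="deepseek-r1")
    reasoning, normal = parser.parse_non_stream(
        "<think>I need to think about this...</think>The answer is 42."
    )
    # reasoning == "I need to think about this..."; normal == "The answer is 42."

    # Streaming: feed decoded chunks as they arrive; the detector buffers partial tags.
    stream_parser = ReasoningParser(model_type="deepseek-r1")
    for chunk in ["<think>step one", " and step two</think>", "The answer is 42."]:
        reasoning_delta, normal_delta = stream_parser.parse_stream_chunk(chunk)
        # chunk 1 -> reasoning "step one"; chunk 2 -> reasoning " and step two";
        # chunk 3 -> normal "The answer is 42."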
sglang/srt/sampling/penaltylib/orchestrator.py
@@ -1,7 +1,8 @@
  from __future__ import annotations

  import abc
- from typing import TYPE_CHECKING, Set, Type
+ import weakref
+ from typing import TYPE_CHECKING, Optional, Set, Type

  import torch

@@ -17,7 +18,7 @@ class BatchedPenalizerOrchestrator:
          penalizers: Set[Type["_BatchedPenalizer"]],
      ):
          self.vocab_size = vocab_size
-         self.batch = batch
+         self._batch_ref = weakref.ref(batch)
          self.device = batch.device
          self.penalizers = {Penalizer: Penalizer(self) for Penalizer in penalizers}

@@ -27,6 +28,17 @@ class BatchedPenalizerOrchestrator:
              is_required |= pen_is_required
          self.is_required = is_required

+     @property
+     def batch(self) -> ScheduleBatch | None:
+         return self._batch_ref()
+
+     @batch.setter
+     def batch(self, value: Optional[ScheduleBatch]):
+         if value is None:
+             self._batch_ref = lambda: None
+         else:
+             self._batch_ref = weakref.ref(value)
+
      def reqs(self):
          return self.batch.reqs
sglang/srt/sampling/sampling_batch_info.py
@@ -67,28 +67,31 @@ class SamplingBatchInfo:
      logit_bias: Optional[torch.Tensor] = None

      @classmethod
-     def from_schedule_batch(cls, batch: ScheduleBatch, vocab_size: int):
+     def _get_global_server_args_dict(cls):
          from sglang.srt.managers.schedule_batch import global_server_args_dict

+         return global_server_args_dict
+
+     @classmethod
+     def from_schedule_batch(cls, batch: ScheduleBatch, vocab_size: int):
+         global_server_args_dict = cls._get_global_server_args_dict()
+
          reqs = batch.reqs
          device = batch.device
-         temperatures = (
-             torch.tensor(
-                 [r.sampling_params.temperature for r in reqs],
-                 dtype=torch.float,
-             )
-             .view(-1, 1)
-             .to(device, non_blocking=True)
-         )
+         temperatures = torch.tensor(
+             [r.sampling_params.temperature for r in reqs],
+             dtype=torch.float,
+             device=device,
+         ).view(-1, 1)
          top_ps = torch.tensor(
-             [r.sampling_params.top_p for r in reqs], dtype=torch.float
-         ).to(device, non_blocking=True)
+             [r.sampling_params.top_p for r in reqs], dtype=torch.float, device=device
+         )
          top_ks = torch.tensor(
-             [r.sampling_params.top_k for r in reqs], dtype=torch.int32
-         ).to(device, non_blocking=True)
+             [r.sampling_params.top_k for r in reqs], dtype=torch.int32, device=device
+         )
          min_ps = torch.tensor(
-             [r.sampling_params.min_p for r in reqs], dtype=torch.float
-         ).to(device, non_blocking=True)
+             [r.sampling_params.min_p for r in reqs], dtype=torch.float, device=device
+         )

          logit_bias = None
          if any(r.sampling_params.logit_bias is not None for r in reqs):
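
The SamplingBatchInfo hunk replaces "build on host, then .to(device, non_blocking=True)" with direct construction via the device= argument, dropping the intermediate host tensor and the separate transfer call. A minimal sketch of the two forms; the values and device choice are made up, and both yield identical tensors:

    import torch

    values = [0.7, 1.0, 0.2]  # e.g. per-request temperatures (made-up numbers)
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Before: allocate on host, then issue a separate host-to-device copy.
    t_old = (
        torch.tensor(values, dtype=torch.float)
        .to(device, non_blocking=True)
        .view(-1, 1)
    )

    # After: construct directly on the target device in a single call.
    t_new = torch.tensor(values, dtype=torch.float, device=device).view(-1, 1)

    assert torch.equal(t_old.cpu(), t_new.cpu())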