sglang 0.4.3.post2__py3-none-any.whl → 0.4.3.post4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (205)
  1. sglang/api.py +1 -1
  2. sglang/bench_offline_throughput.py +19 -0
  3. sglang/bench_one_batch.py +2 -2
  4. sglang/bench_serving.py +123 -79
  5. sglang/global_config.py +8 -3
  6. sglang/lang/backend/runtime_endpoint.py +1 -1
  7. sglang/lang/ir.py +1 -1
  8. sglang/srt/_custom_ops.py +83 -91
  9. sglang/srt/configs/load_config.py +4 -1
  10. sglang/srt/configs/model_config.py +48 -2
  11. sglang/srt/configs/qwen2_5_vl_config.py +5 -2
  12. sglang/srt/constrained/base_grammar_backend.py +117 -15
  13. sglang/srt/constrained/llguidance_backend.py +151 -0
  14. sglang/srt/constrained/outlines_backend.py +24 -33
  15. sglang/srt/constrained/xgrammar_backend.py +69 -38
  16. sglang/srt/distributed/device_communicators/custom_all_reduce.py +225 -80
  17. sglang/srt/distributed/parallel_state.py +48 -3
  18. sglang/srt/entrypoints/engine.py +67 -9
  19. sglang/srt/entrypoints/http_server.py +190 -41
  20. sglang/srt/entrypoints/verl_engine.py +147 -0
  21. sglang/srt/function_call_parser.py +0 -1
  22. sglang/srt/layers/activation.py +11 -0
  23. sglang/srt/layers/attention/{__init__.py → base_attn_backend.py} +14 -6
  24. sglang/srt/layers/attention/double_sparsity_backend.py +1 -1
  25. sglang/srt/layers/attention/flashinfer_backend.py +302 -414
  26. sglang/srt/layers/attention/flashinfer_mla_backend.py +582 -0
  27. sglang/srt/layers/attention/torch_native_backend.py +1 -1
  28. sglang/srt/layers/attention/triton_backend.py +13 -8
  29. sglang/srt/layers/attention/triton_ops/decode_attention.py +3 -0
  30. sglang/srt/layers/attention/triton_ops/extend_attention.py +20 -4
  31. sglang/srt/layers/attention/triton_ops/rocm_mla_decode_rope.py +439 -0
  32. sglang/srt/layers/attention/utils.py +39 -0
  33. sglang/srt/layers/attention/vision.py +60 -63
  34. sglang/srt/layers/dp_attention.py +142 -1
  35. sglang/srt/layers/layernorm.py +1 -1
  36. sglang/srt/layers/linear.py +3 -1
  37. sglang/srt/layers/logits_processor.py +281 -45
  38. sglang/srt/layers/moe/ep_moe/kernels.py +126 -8
  39. sglang/srt/layers/moe/ep_moe/layer.py +140 -28
  40. sglang/srt/layers/moe/fused_moe_native.py +2 -0
  41. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  42. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +50 -50
  43. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +18 -18
  44. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +18 -18
  45. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Radeon_Graphics.json +18 -18
  46. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +18 -18
  47. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +18 -18
  48. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Radeon_Graphics.json +18 -18
  49. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +18 -18
  50. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +18 -18
  51. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Radeon_Graphics.json +18 -18
  52. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +16 -16
  53. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +16 -16
  54. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +16 -16
  55. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +18 -18
  56. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +18 -18
  57. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Radeon_Graphics.json +18 -18
  58. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +15 -15
  59. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +15 -15
  60. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +15 -15
  61. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +88 -20
  62. sglang/srt/layers/moe/fused_moe_triton/layer.py +34 -13
  63. sglang/srt/layers/moe/topk.py +13 -4
  64. sglang/srt/layers/quantization/__init__.py +111 -7
  65. sglang/srt/layers/quantization/blockwise_int8.py +409 -0
  66. sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  67. sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  68. sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  69. sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  70. sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  71. sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  72. sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  73. sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  74. sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  75. sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  76. sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  77. sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  78. sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  79. sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  80. sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  81. sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  82. sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  83. sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  84. sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  85. sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  86. sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  87. sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  88. sglang/srt/layers/quantization/fp8.py +69 -28
  89. sglang/srt/layers/quantization/fp8_utils.py +17 -1
  90. sglang/srt/layers/quantization/gptq.py +416 -0
  91. sglang/srt/layers/quantization/int8_kernel.py +327 -0
  92. sglang/srt/layers/quantization/int8_utils.py +73 -0
  93. sglang/srt/layers/quantization/modelopt_quant.py +18 -1
  94. sglang/srt/layers/radix_attention.py +1 -0
  95. sglang/srt/layers/rotary_embedding.py +0 -1
  96. sglang/srt/layers/sampler.py +76 -31
  97. sglang/srt/layers/vocab_parallel_embedding.py +14 -13
  98. sglang/srt/lora/lora.py +17 -1
  99. sglang/srt/lora/lora_config.py +5 -0
  100. sglang/srt/lora/lora_manager.py +1 -3
  101. sglang/srt/managers/cache_controller.py +193 -62
  102. sglang/srt/managers/configure_logging.py +2 -1
  103. sglang/srt/managers/data_parallel_controller.py +6 -2
  104. sglang/srt/managers/detokenizer_manager.py +124 -102
  105. sglang/srt/managers/image_processor.py +2 -1
  106. sglang/srt/managers/io_struct.py +144 -6
  107. sglang/srt/managers/schedule_batch.py +237 -197
  108. sglang/srt/managers/schedule_policy.py +29 -29
  109. sglang/srt/managers/scheduler.py +773 -334
  110. sglang/srt/managers/session_controller.py +6 -2
  111. sglang/srt/managers/tokenizer_manager.py +225 -68
  112. sglang/srt/managers/tp_worker.py +15 -4
  113. sglang/srt/managers/tp_worker_overlap_thread.py +3 -4
  114. sglang/srt/mem_cache/chunk_cache.py +18 -11
  115. sglang/srt/mem_cache/hiradix_cache.py +394 -0
  116. sglang/srt/mem_cache/memory_pool.py +68 -37
  117. sglang/srt/mem_cache/radix_cache.py +58 -47
  118. sglang/srt/metrics/collector.py +102 -36
  119. sglang/srt/model_executor/cuda_graph_runner.py +56 -31
  120. sglang/srt/model_executor/forward_batch_info.py +49 -16
  121. sglang/srt/model_executor/model_runner.py +280 -81
  122. sglang/srt/model_loader/loader.py +3 -3
  123. sglang/srt/model_loader/weight_utils.py +36 -14
  124. sglang/srt/models/baichuan.py +31 -6
  125. sglang/srt/models/chatglm.py +39 -7
  126. sglang/srt/models/commandr.py +29 -5
  127. sglang/srt/models/dbrx.py +31 -5
  128. sglang/srt/models/deepseek.py +43 -6
  129. sglang/srt/models/deepseek_nextn.py +32 -19
  130. sglang/srt/models/deepseek_v2.py +265 -32
  131. sglang/srt/models/exaone.py +19 -9
  132. sglang/srt/models/gemma.py +22 -8
  133. sglang/srt/models/gemma2.py +25 -12
  134. sglang/srt/models/gemma2_reward.py +5 -1
  135. sglang/srt/models/gpt2.py +28 -13
  136. sglang/srt/models/gpt_bigcode.py +27 -5
  137. sglang/srt/models/granite.py +21 -9
  138. sglang/srt/models/grok.py +21 -4
  139. sglang/srt/models/internlm2.py +36 -6
  140. sglang/srt/models/internlm2_reward.py +5 -1
  141. sglang/srt/models/llama.py +26 -9
  142. sglang/srt/models/llama_classification.py +5 -1
  143. sglang/srt/models/llama_eagle.py +17 -4
  144. sglang/srt/models/llama_embedding.py +5 -1
  145. sglang/srt/models/llama_reward.py +7 -2
  146. sglang/srt/models/llava.py +19 -3
  147. sglang/srt/models/llavavid.py +10 -1
  148. sglang/srt/models/minicpm.py +26 -2
  149. sglang/srt/models/minicpm3.py +39 -3
  150. sglang/srt/models/minicpmv.py +45 -14
  151. sglang/srt/models/mixtral.py +20 -9
  152. sglang/srt/models/mixtral_quant.py +50 -8
  153. sglang/srt/models/mllama.py +57 -11
  154. sglang/srt/models/olmo.py +34 -6
  155. sglang/srt/models/olmo2.py +34 -13
  156. sglang/srt/models/olmoe.py +26 -4
  157. sglang/srt/models/phi3_small.py +29 -10
  158. sglang/srt/models/qwen.py +26 -3
  159. sglang/srt/models/qwen2.py +26 -4
  160. sglang/srt/models/qwen2_5_vl.py +46 -8
  161. sglang/srt/models/qwen2_eagle.py +17 -5
  162. sglang/srt/models/qwen2_moe.py +44 -6
  163. sglang/srt/models/qwen2_rm.py +78 -0
  164. sglang/srt/models/qwen2_vl.py +39 -8
  165. sglang/srt/models/stablelm.py +32 -5
  166. sglang/srt/models/torch_native_llama.py +5 -2
  167. sglang/srt/models/xverse.py +21 -9
  168. sglang/srt/models/xverse_moe.py +45 -7
  169. sglang/srt/models/yivl.py +2 -1
  170. sglang/srt/openai_api/adapter.py +109 -24
  171. sglang/srt/openai_api/protocol.py +17 -1
  172. sglang/srt/reasoning_parser.py +154 -0
  173. sglang/srt/sampling/penaltylib/__init__.py +4 -6
  174. sglang/srt/sampling/penaltylib/frequency_penalty.py +66 -0
  175. sglang/srt/sampling/penaltylib/{penalizers/min_new_tokens.py → min_new_tokens.py} +15 -23
  176. sglang/srt/sampling/penaltylib/orchestrator.py +39 -188
  177. sglang/srt/sampling/penaltylib/presence_penalty.py +66 -0
  178. sglang/srt/sampling/sampling_batch_info.py +79 -157
  179. sglang/srt/sampling/sampling_params.py +16 -13
  180. sglang/srt/server_args.py +135 -60
  181. sglang/srt/speculative/build_eagle_tree.py +8 -9
  182. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +1 -12
  183. sglang/srt/speculative/eagle_utils.py +92 -57
  184. sglang/srt/speculative/eagle_worker.py +238 -111
  185. sglang/srt/speculative/spec_info.py +1 -13
  186. sglang/srt/utils.py +43 -17
  187. sglang/srt/warmup.py +47 -0
  188. sglang/test/few_shot_gsm8k.py +4 -1
  189. sglang/test/runners.py +389 -126
  190. sglang/test/send_one.py +88 -0
  191. sglang/test/test_block_fp8_ep.py +361 -0
  192. sglang/test/test_programs.py +1 -1
  193. sglang/test/test_utils.py +138 -84
  194. sglang/utils.py +50 -60
  195. sglang/version.py +1 -1
  196. {sglang-0.4.3.post2.dist-info → sglang-0.4.3.post4.dist-info}/METADATA +22 -15
  197. {sglang-0.4.3.post2.dist-info → sglang-0.4.3.post4.dist-info}/RECORD +200 -166
  198. {sglang-0.4.3.post2.dist-info → sglang-0.4.3.post4.dist-info}/WHEEL +1 -1
  199. sglang/bench_latency.py +0 -1
  200. sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +0 -75
  201. sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +0 -74
  202. sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +0 -85
  203. sglang/test/srt/sampling/penaltylib/utils.py +0 -344
  204. {sglang-0.4.3.post2.dist-info → sglang-0.4.3.post4.dist-info}/LICENSE +0 -0
  205. {sglang-0.4.3.post2.dist-info → sglang-0.4.3.post4.dist-info}/top_level.txt +0 -0
sglang/srt/models/xverse_moe.py CHANGED
@@ -43,6 +43,7 @@ from sglang.srt.layers.vocab_parallel_embedding import (
  )
  from sglang.srt.model_executor.forward_batch_info import ForwardBatch
  from sglang.srt.model_loader.weight_utils import default_weight_loader
+ from sglang.srt.utils import add_prefix

 
  class XverseMLP(nn.Module):
@@ -54,10 +55,15 @@ class XverseMLP(nn.Module):
  hidden_act: str,
  quant_config: Optional[QuantizationConfig] = None,
  reduce_results: bool = True,
+ prefix: str = "",
  ) -> None:
  super().__init__()
  self.gate_up_proj = MergedColumnParallelLinear(
- hidden_size, [intermediate_size] * 2, bias=False, quant_config=quant_config
+ hidden_size,
+ [intermediate_size] * 2,
+ bias=False,
+ quant_config=quant_config,
+ prefix=add_prefix("gate_up_proj", prefix),
  )
  self.down_proj = RowParallelLinear(
  intermediate_size,
@@ -65,6 +71,7 @@ class XverseMLP(nn.Module):
  bias=False,
  quant_config=quant_config,
  reduce_results=reduce_results,
+ prefix=add_prefix("down_proj", prefix),
  )
  if hidden_act != "silu":
  raise ValueError(
@@ -86,6 +93,7 @@ class XverseMoE(nn.Module):
  self,
  config: PretrainedConfig,
  quant_config: Optional[QuantizationConfig] = None,
+ prefix: str = "",
  ):
  super().__init__()
  self.config = config
@@ -107,14 +115,19 @@ class XverseMoE(nn.Module):
  hidden_act=config.hidden_act,
  quant_config=quant_config,
  reduce_results=False,
+ prefix=add_prefix(f"experts.{i}", prefix),
  )
- for _ in range(self.n_routed_experts)
+ for i in range(self.n_routed_experts)
  ]
  )
  self.pack_params()

  self.router = ReplicatedLinear(
- config.hidden_size, self.n_routed_experts, bias=False, quant_config=None
+ config.hidden_size,
+ self.n_routed_experts,
+ bias=False,
+ quant_config=None,
+ prefix=add_prefix("router", prefix),
  )

  if config.num_shared_experts is not None:
@@ -125,6 +138,7 @@ class XverseMoE(nn.Module):
  hidden_act=config.hidden_act,
  quant_config=quant_config,
  reduce_results=False,
+ prefix=add_prefix("shared_experts", prefix),
  )

  def pack_params(self):
@@ -182,6 +196,7 @@ class XverseAttention(nn.Module):
  rope_scaling: Optional[Dict[str, Any]] = None,
  max_position_embeddings: int = 8192,
  quant_config: Optional[QuantizationConfig] = None,
+ prefix: str = "",
  ) -> None:
  super().__init__()
  self.hidden_size = hidden_size
@@ -213,6 +228,7 @@ class XverseAttention(nn.Module):
  self.total_num_kv_heads,
  bias=False,
  quant_config=quant_config,
+ prefix=add_prefix("qkv_proj", prefix),
  )

  self.o_proj = RowParallelLinear(
@@ -220,6 +236,7 @@ class XverseAttention(nn.Module):
  hidden_size,
  bias=False,
  quant_config=quant_config,
+ prefix=add_prefix("o_proj", prefix),
  )

  self.rotary_emb = get_rope(
@@ -235,6 +252,7 @@ class XverseAttention(nn.Module):
  self.scaling,
  num_kv_heads=self.num_kv_heads,
  layer_id=layer_id,
+ prefix=add_prefix("attn", prefix),
  )

  def forward(
@@ -258,6 +276,7 @@ class XverseDecoderLayer(nn.Module):
  config: PretrainedConfig,
  layer_id: int,
  quant_config: Optional[QuantizationConfig] = None,
+ prefix: str = "",
  ) -> None:
  super().__init__()
  self.hidden_size = config.hidden_size
@@ -276,15 +295,21 @@ class XverseDecoderLayer(nn.Module):
  rope_scaling=rope_scaling,
  max_position_embeddings=max_position_embeddings,
  quant_config=quant_config,
+ prefix=add_prefix("self_attn", prefix),
  )
  if config.num_experts is not None:
- self.mlp = XverseMoE(config=config, quant_config=quant_config)
+ self.mlp = XverseMoE(
+ config=config,
+ quant_config=quant_config,
+ prefix=add_prefix("mlp", prefix),
+ )
  else:
  self.mlp = XverseMLP(
  hidden_size=config.hidden_size,
  intermediate_size=config.intermediate_size,
  hidden_act=config.hidden_act,
  quant_config=quant_config,
+ prefix=add_prefix("mlp", prefix),
  )
  self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
  self.post_attention_layernorm = RMSNorm(
@@ -324,6 +349,7 @@ class XverseModel(nn.Module):
  self,
  config: PretrainedConfig,
  quant_config: Optional[QuantizationConfig] = None,
+ prefix: str = "",
  ) -> None:
  super().__init__()
  self.padding_idx = config.pad_token_id
@@ -332,10 +358,16 @@ class XverseModel(nn.Module):
  self.embed_tokens = VocabParallelEmbedding(
  config.vocab_size,
  config.hidden_size,
+ prefix=add_prefix("embed_tokens", prefix),
  )
  self.layers = nn.ModuleList(
  [
- XverseDecoderLayer(config, layer_id, quant_config=quant_config)
+ XverseDecoderLayer(
+ config,
+ layer_id,
+ quant_config=quant_config,
+ prefix=add_prefix(f"layers.{layer_id}", prefix),
+ )
  for layer_id in range(config.num_hidden_layers)
  ]
  )
@@ -364,13 +396,19 @@ class XverseMoeForCausalLM(nn.Module):
  self,
  config: PretrainedConfig,
  quant_config: Optional[QuantizationConfig] = None,
+ prefix: str = "",
  ) -> None:
  super().__init__()
  self.config = config
  self.quant_config = quant_config
- self.model = XverseModel(config, quant_config)
+ self.model = XverseModel(
+ config, quant_config, prefix=add_prefix("model", prefix)
+ )
  self.lm_head = ParallelLMHead(
- config.vocab_size, config.hidden_size, quant_config=quant_config
+ config.vocab_size,
+ config.hidden_size,
+ quant_config=quant_config,
+ prefix=add_prefix("lm_head", prefix),
  )
  self.logits_processor = LogitsProcessor(config)

sglang/srt/models/yivl.py CHANGED
@@ -29,8 +29,9 @@ class YiVLForCausalLM(LlavaLlamaForCausalLM):
  self,
  config: LlavaConfig,
  quant_config: Optional[QuantizationConfig] = None,
+ prefix: str = "",
  ) -> None:
- super().__init__(config, quant_config)
+ super().__init__(config, quant_config, prefix=prefix)

  self.multi_modal_projector = YiVLMultiModalProjector(self.config)
  self.vision_tower_subfolder = self.config.mm_vision_tower.replace(
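Both model files above thread a prefix argument through every submodule so that dotted weight names can be composed per layer. A minimal sketch of how such a helper behaves, assuming add_prefix (from sglang/srt/utils) simply joins names with a dot; the actual implementation may differ:

# Hedged sketch: the real add_prefix lives in sglang/srt/utils and may differ.
def add_prefix(name: str, prefix: str) -> str:
    # Join a submodule name onto an existing dotted prefix; skip the dot when
    # the prefix is empty (top-level modules).
    return name if not prefix else f"{prefix}.{name}"

# With the constructors above, a projection layer ends up with a name like
# "model.layers.0.self_attn.qkv_proj":
print(add_prefix("qkv_proj", add_prefix("self_attn", add_prefix("layers.0", "model"))))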
sglang/srt/openai_api/adapter.py CHANGED
@@ -26,8 +26,6 @@ from fastapi import HTTPException, Request, UploadFile
  from fastapi.responses import ORJSONResponse, StreamingResponse
  from pydantic import ValidationError

- from sglang.lang.chat_template import get_chat_template_by_model_path
-
  try:
  from outlines.fsm.json_schema import convert_json_schema_to_str
  except ImportError:
@@ -74,6 +72,7 @@ from sglang.srt.openai_api.protocol import (
  TopLogprob,
  UsageInfo,
  )
+ from sglang.srt.reasoning_parser import ReasoningParser
  from sglang.utils import get_exception_traceback

  logger = logging.getLogger(__name__)
@@ -165,24 +164,19 @@ def load_chat_template_for_openai_api(tokenizer_manager, chat_template_arg, mode
  else:
  chat_template_name = chat_template_arg

- # check chat-template
- chat_template = get_chat_template_by_model_path(model_path)
- if chat_template is not None:
- official_chat_template = chat_template.name
- used_chat_template = chat_template_name
- if official_chat_template != used_chat_template:
- logger.warning(
- f"Using a chat_template: '{used_chat_template}', "
- f"which is different from official chat template: '{official_chat_template}', "
- f"This discrepancy may lead to performance degradation."
- )
+ # Check chat-template
+ # TODO:
+ # 1. Do not import any code from sglang.lang
+ # 2. For VLM, when chat_template_arg is None, set it automatically by guessing from model_path.


- async def v1_files_create(file: UploadFile, purpose: str, file_storage_pth: str = None):
+ async def v1_files_create(
+ file: UploadFile, purpose: str, file_storage_path: str = None
+ ):
  try:
  global storage_dir
- if file_storage_pth:
- storage_dir = file_storage_pth
+ if file_storage_path:
+ storage_dir = file_storage_path
  # Read the file content
  file_content = await file.read()

@@ -941,7 +935,13 @@ def v1_chat_generate_request(
  )

  if assistant_prefix:
- prompt_ids += tokenizer_manager.tokenizer.encode(assistant_prefix)
+ encoded = tokenizer_manager.tokenizer.encode(assistant_prefix)
+ if (
+ encoded
+ and encoded[0] == tokenizer_manager.tokenizer.bos_token_id
+ ):
+ encoded = encoded[1:]
+ prompt_ids += encoded
  stop = request.stop
  image_data = None
  modalities = []
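The assistant_prefix change above avoids a duplicated BOS token: many tokenizers prepend BOS on every encode() call, which would otherwise land in the middle of the prompt. A rough illustration of the behavior being guarded against; the model name is an arbitrary assumption for the example:

# Illustration only; the tokenizer/model choice is an assumption.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
encoded = tokenizer.encode("Sure, here is the answer:")
# For Llama-style tokenizers, encoded[0] == tokenizer.bos_token_id, so
# appending `encoded` verbatim would inject a second BOS mid-prompt.
if encoded and encoded[0] == tokenizer.bos_token_id:
    encoded = encoded[1:]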
@@ -988,10 +988,17 @@ def v1_chat_generate_request(
  "ignore_eos": request.ignore_eos,
  "skip_special_tokens": request.skip_special_tokens,
  }
+
  if request.response_format and request.response_format.type == "json_schema":
  sampling_params["json_schema"] = convert_json_schema_to_str(
  request.response_format.json_schema.schema_
  )
+ elif (
+ request.response_format and request.response_format.type == "structural_tag"
+ ):
+ sampling_params["structural_tag"] = convert_json_schema_to_str(
+ request.response_format.model_dump(by_alias=True)
+ )
  sampling_params_list.append(sampling_params)

  image_data_list.append(image_data)
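With the mapping above (and the StructuralTagResponseFormat model added in protocol.py further down), a chat completion request can carry a structural_tag response format roughly like the following; the tag strings and schema are invented for illustration:

# Illustrative request body for the new structural_tag response_format;
# the begin/end tags, schema, and trigger below are made-up examples.
payload = {
    "model": "default",
    "messages": [{"role": "user", "content": "What's the weather in Paris?"}],
    "response_format": {
        "type": "structural_tag",
        "structures": [
            {
                "begin": "<function=get_weather>",
                "schema": {"type": "object", "properties": {"city": {"type": "string"}}},
                "end": "</function>",
            }
        ],
        "triggers": ["<function="],
    },
}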
@@ -1032,7 +1039,12 @@ def v1_chat_generate_request(


  def v1_chat_generate_response(
- request, ret, to_file=False, cache_report=False, tool_call_parser=None
+ request,
+ ret,
+ to_file=False,
+ cache_report=False,
+ tool_call_parser=None,
+ reasoning_parser=None,
  ):
  choices = []

@@ -1086,9 +1098,26 @@ def v1_chat_generate_response(
  if isinstance(request, list):
  tool_choice = request[idx].tool_choice
  tools = request[idx].tools
+ separate_reasoning = request[idx].separate_reasoning
  else:
  tool_choice = request.tool_choice
  tools = request.tools
+ separate_reasoning = request.separate_reasoning
+
+ if reasoning_parser and separate_reasoning:
+ try:
+ parser = ReasoningParser(
+ model_type=reasoning_parser, stream_reasoning=False
+ )
+ reasoning_text, text = parser.parse_non_stream(text)
+ except Exception as e:
+ logger.error(f"Exception: {e}")
+ return create_error_response(
+ HTTPStatus.BAD_REQUEST,
+ "Failed to parse reasoning related info to json format!",
+ )
+ else:
+ reasoning_text = None

  if tool_choice != "none" and any([i in text for i in TOOLS_TAG_LIST]):
  if finish_reason == "stop":
@@ -1118,8 +1147,9 @@ def v1_chat_generate_response(
  "index": 0,
  "message": {
  "role": "assistant",
- "content": ret_item["text"] if tool_calls is None else None,
+ "content": text if tool_calls is None else None,
  "tool_calls": tool_calls,
+ "reasoning_content": reasoning_text,
  },
  "logprobs": choice_logprobs,
  "finish_reason": (finish_reason["type"] if finish_reason else ""),
@@ -1134,8 +1164,9 @@ def v1_chat_generate_response(
  index=idx,
  message=ChatMessage(
  role="assistant",
- content=ret_item["text"] if tool_calls is None else None,
+ content=text if tool_calls is None else None,
  tool_calls=tool_calls,
+ reasoning_content=reasoning_text,
  ),
  logprobs=choice_logprobs,
  finish_reason=(finish_reason["type"] if finish_reason else ""),
@@ -1202,6 +1233,7 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):

  if adapted_request.stream:
  parser_dict = {}
+ reasoning_parser_dict = {}

  async def generate_stream_resp():
  is_firsts = {}
@@ -1268,15 +1300,27 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
  choice_logprobs = None

  finish_reason = content["meta_info"]["finish_reason"]
+ finish_reason_type = (
+ finish_reason["type"] if finish_reason else None
+ )

  if is_first:
  # First chunk with role
  is_first = False
+ if (
+ tokenizer_manager.server_args.reasoning_parser
+ and request.separate_reasoning
+ ):
+ delta = DeltaMessage(role="assistant", reasoning_content="")
+ else:
+ delta = DeltaMessage(role="assistant", content="")
  choice_data = ChatCompletionResponseStreamChoice(
  index=index,
- delta=DeltaMessage(role="assistant", content=""),
+ delta=delta,
  finish_reason=(
- finish_reason["type"] if finish_reason else ""
+ None
+ if finish_reason_type and len(finish_reason_type) == 0
+ else finish_reason_type
  ),
  matched_stop=(
  finish_reason["matched"]
@@ -1296,6 +1340,41 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
  delta = text[len(stream_buffer) :]
  new_stream_buffer = stream_buffer + delta

+ if (
+ tokenizer_manager.server_args.reasoning_parser
+ and request.separate_reasoning
+ ):
+ if index not in reasoning_parser_dict:
+ reasoning_parser_dict[index] = ReasoningParser(
+ tokenizer_manager.server_args.reasoning_parser,
+ request.stream_reasoning,
+ )
+ reasoning_parser = reasoning_parser_dict[index]
+ reasoning_text, delta = reasoning_parser.parse_stream_chunk(
+ delta
+ )
+ if reasoning_text:
+ choice_data = ChatCompletionResponseStreamChoice(
+ index=index,
+ delta=DeltaMessage(reasoning_content=reasoning_text),
+ finish_reason=(
+ None
+ if finish_reason_type
+ and len(finish_reason_type) == 0
+ else finish_reason_type
+ ),
+ )
+ chunk = ChatCompletionStreamResponse(
+ id=content["meta_info"]["id"],
+ choices=[choice_data],
+ model=request.model,
+ )
+ yield f"data: {chunk.model_dump_json()}\n\n"
+ if (delta and len(delta) == 0) or not delta:
+ stream_buffers[index] = new_stream_buffer
+ is_firsts[index] = is_first
+ continue
+
  if request.tool_choice != "none" and request.tools:
  if index not in parser_dict:
  parser_dict[index] = FunctionCallParser(
@@ -1313,7 +1392,10 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
  index=index,
  delta=DeltaMessage(content=normal_text),
  finish_reason=(
- finish_reason["type"] if finish_reason else ""
+ None
+ if finish_reason_type
+ and len(finish_reason_type) == 0
+ else finish_reason_type
  ),
  )
  chunk = ChatCompletionStreamResponse(
@@ -1382,7 +1464,9 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
  index=index,
  delta=DeltaMessage(content=delta),
  finish_reason=(
- finish_reason["type"] if finish_reason else ""
+ None
+ if finish_reason_type and len(finish_reason_type) == 0
+ else finish_reason_type
  ),
  matched_stop=(
  finish_reason["matched"]
@@ -1450,6 +1534,7 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
  ret,
  cache_report=tokenizer_manager.server_args.enable_cache_report,
  tool_call_parser=tokenizer_manager.server_args.tool_call_parser,
+ reasoning_parser=tokenizer_manager.server_args.reasoning_parser,
  )

  return response
sglang/srt/openai_api/protocol.py CHANGED
@@ -258,6 +258,18 @@ class ResponseFormat(BaseModel):
  json_schema: Optional[JsonSchemaResponseFormat] = None


+ class StructuresResponseFormat(BaseModel):
+ begin: str
+ schema_: Optional[Dict[str, object]] = Field(alias="schema", default=None)
+ end: str
+
+
+ class StructuralTagResponseFormat(BaseModel):
+ type: Literal["structural_tag"]
+ structures: List[StructuresResponseFormat]
+ triggers: List[str]
+
+
  class Function(BaseModel):
  """Function descriptions."""

@@ -298,7 +310,7 @@ class ChatCompletionRequest(BaseModel):
  max_tokens: Optional[int] = None
  n: int = 1
  presence_penalty: float = 0.0
- response_format: Optional[ResponseFormat] = None
+ response_format: Union[ResponseFormat, StructuralTagResponseFormat] = None
  seed: Optional[int] = None
  stop: Optional[Union[str, List[str]]] = None
  stream: bool = False
@@ -324,6 +336,8 @@ class ChatCompletionRequest(BaseModel):
  skip_special_tokens: bool = True
  lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None
  session_params: Optional[Dict] = None
+ separate_reasoning: bool = True
+ stream_reasoning: bool = True


  class FunctionResponse(BaseModel):
@@ -344,6 +358,7 @@ class ToolCall(BaseModel):
  class ChatMessage(BaseModel):
  role: Optional[str] = None
  content: Optional[str] = None
+ reasoning_content: Optional[str] = None
  tool_calls: Optional[List[ToolCall]] = Field(default=None, examples=[None])


@@ -367,6 +382,7 @@ class ChatCompletionResponse(BaseModel):
  class DeltaMessage(BaseModel):
  role: Optional[str] = None
  content: Optional[str] = None
+ reasoning_content: Optional[str] = None
  tool_calls: Optional[List[ToolCall]] = Field(default=None, examples=[None])


sglang/srt/reasoning_parser.py ADDED
@@ -0,0 +1,154 @@
+ import re
+ from typing import Dict, Tuple
+
+
+ class StreamingParseResult:
+ """Result of streaming incremental parsing."""
+
+ def __init__(self, normal_text: str = "", reasoning_text: str = ""):
+ self.normal_text = normal_text
+ self.reasoning_text = reasoning_text
+
+
+ class BaseReasoningFormatDetector:
+ """Base class providing two sets of interfaces: one-time and streaming incremental."""
+
+ def __init__(
+ self,
+ think_start_token: str,
+ think_end_token: str,
+ force_reasoning: bool = False,
+ stream_reasoning: bool = True,
+ ):
+ self.think_start_token = think_start_token
+ self.think_end_token = think_end_token
+ self._in_reasoning = force_reasoning
+ self.stream_reasoning = stream_reasoning
+
+ self._buffer = ""
+ self.stripped_think_start = False
+
+ def detect_and_parse(self, text: str) -> StreamingParseResult:
+ """
+ One-time parsing: Detects and parses reasoning sections in the provided text.
+ Returns both reasoning content and normal text separately.
+ """
+ text = text.replace(self.think_start_token, "").strip()
+ if self.think_end_token not in text:
+ # Assume reasoning was truncated before `</think>` token
+ return StreamingParseResult(reasoning_text=text)
+
+ # Extract reasoning content
+ splits = text.split(self.think_end_token, maxsplit=1)
+ reasoning_text = splits[0]
+ text = splits[1].strip()
+
+ return StreamingParseResult(normal_text=text, reasoning_text=reasoning_text)
+
+ def parse_streaming_increment(self, new_text: str) -> StreamingParseResult:
+ """
+ Streaming incremental parsing for reasoning content.
+ Handles partial reasoning tags and content.
+
+ If stream_reasoning is False:
+ Accumulates reasoning content until the end tag is found
+ If stream_reasoning is True:
+ Streams reasoning content as it arrives
+ """
+ self._buffer += new_text
+ current_text = self._buffer
+
+ # Strip `<think>` token if present
+ if not self.stripped_think_start and self.think_start_token in current_text:
+ current_text = current_text.replace(self.think_start_token, "")
+ self.stripped_think_start = True
+
+ # Handle end of reasoning block
+ if self._in_reasoning and self.think_end_token in current_text:
+ end_idx = current_text.find(self.think_end_token)
+
+ reasoning_text = current_text[:end_idx]
+
+ self._buffer = ""
+ self._in_reasoning = False
+ normal_text = current_text[end_idx + len(self.think_end_token) :]
+
+ return StreamingParseResult(
+ normal_text=normal_text, reasoning_text=reasoning_text.rstrip()
+ )
+
+ # Continue with reasoning content
+ if self._in_reasoning:
+ if self.stream_reasoning:
+ # Stream the content immediately
+ self._buffer = ""
+ return StreamingParseResult(reasoning_text=current_text)
+ else:
+ return StreamingParseResult()
+
+ # If we're not in a reasoning block return as normal text
+ if not self._in_reasoning:
+ self._buffer = ""
+ return StreamingParseResult(normal_text=new_text)
+
+ return StreamingParseResult()
+
+
+ class DeepSeekR1Detector(BaseReasoningFormatDetector):
+ """
+ Detector for DeepSeek-R1 model.
+ Assumes reasoning format:
+ (<think>)*(.*)</think>
+ Returns all the text before the </think> tag as `reasoning_text`
+ and the rest of the text as `normal_text`.
+
+ Args:
+ stream_reasoning (bool): If False, accumulates reasoning content until the end tag.
+ If True, streams reasoning content as it arrives.
+ """
+
+ def __init__(self, stream_reasoning: bool = True):
+ # DeepSeek-R1 is assumed to be reasoning until `</think>` token
+ super().__init__(
+ "<think>",
+ "</think>",
+ force_reasoning=True,
+ stream_reasoning=stream_reasoning,
+ )
+ # https://github.com/sgl-project/sglang/pull/3202#discussion_r1950153599
+
+
+ class ReasoningParser:
+ """
+ Parser that handles both streaming and non-streaming scenarios for extracting
+ reasoning content from model outputs.
+
+ Args:
+ model_type (str): Type of model to parse reasoning from
+ stream_reasoning (bool): If False, accumulates reasoning content until complete.
+ If True, streams reasoning content as it arrives.
+ """
+
+ DetectorMap: Dict[str, BaseReasoningFormatDetector] = {
+ "deepseek-r1": DeepSeekR1Detector
+ }
+
+ def __init__(self, model_type: str = None, stream_reasoning: bool = True):
+ if not model_type:
+ raise ValueError("Model type must be specified")
+
+ detector_class = self.DetectorMap.get(model_type.lower())
+ if not detector_class:
+ raise ValueError(f"Unsupported model type: {model_type}")
+
+ self.detector = detector_class(stream_reasoning=stream_reasoning)
+
+ def parse_non_stream(self, full_text: str) -> Tuple[str, str]:
+ """Non-streaming call: one-time parsing"""
+ ret = self.detector.detect_and_parse(full_text)
+ return ret.reasoning_text, ret.normal_text
+
+ def parse_stream_chunk(self, chunk_text: str) -> Tuple[str, str]:
+ """Streaming call: incremental parsing"""
+ ret = self.detector.parse_streaming_increment(chunk_text)
+ return ret.reasoning_text, ret.normal_text
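The new parser can also be exercised on its own, outside the OpenAI adapter; a small sketch of both entry points (the sample strings are invented):

# Usage sketch for the new ReasoningParser; input strings are invented.
from sglang.srt.reasoning_parser import ReasoningParser

# Non-streaming: split a full completion into reasoning and answer text.
parser = ReasoningParser(model_type="deepseek-r1", stream_reasoning=False)
reasoning, answer = parser.parse_non_stream(
    "<think>2 + 2 is 4.</think>The answer is 4."
)
# reasoning == "2 + 2 is 4."    answer == "The answer is 4."

# Streaming: feed chunks as they arrive; with stream_reasoning=True the
# reasoning text is emitted incrementally, before </think> is seen.
stream_parser = ReasoningParser(model_type="deepseek-r1", stream_reasoning=True)
for chunk in ["<think>Adding the", " numbers.</think>", "The answer is 4."]:
    reasoning_chunk, normal_chunk = stream_parser.parse_stream_chunk(chunk)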
sglang/srt/sampling/penaltylib/__init__.py CHANGED
@@ -1,13 +1,11 @@
- from .orchestrator import BatchedPenalizerOrchestrator
- from .penalizers.frequency_penalty import BatchedFrequencyPenalizer
- from .penalizers.min_new_tokens import BatchedMinNewTokensPenalizer
- from .penalizers.presence_penalty import BatchedPresencePenalizer
- from .penalizers.repetition_penalty import BatchedRepetitionPenalizer
+ from sglang.srt.sampling.penaltylib.frequency_penalty import BatchedFrequencyPenalizer
+ from sglang.srt.sampling.penaltylib.min_new_tokens import BatchedMinNewTokensPenalizer
+ from sglang.srt.sampling.penaltylib.orchestrator import BatchedPenalizerOrchestrator
+ from sglang.srt.sampling.penaltylib.presence_penalty import BatchedPresencePenalizer

  __all__ = [
  "BatchedFrequencyPenalizer",
  "BatchedMinNewTokensPenalizer",
  "BatchedPresencePenalizer",
- "BatchedRepetitionPenalizer",
  "BatchedPenalizerOrchestrator",
  ]
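Downstream imports through the package root are unaffected by the flattening of the penalizers subpackage; only the repetition penalizer is no longer exported. A quick sanity check:

# The package-level names still resolve after the reorganization; note that
# BatchedRepetitionPenalizer is removed in this release.
from sglang.srt.sampling.penaltylib import (
    BatchedFrequencyPenalizer,
    BatchedMinNewTokensPenalizer,
    BatchedPenalizerOrchestrator,
    BatchedPresencePenalizer,
)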