sglang 0.4.5.post3__py3-none-any.whl → 0.4.6.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (97)
  1. sglang/bench_one_batch.py +19 -3
  2. sglang/bench_serving.py +8 -9
  3. sglang/compile_deep_gemm.py +45 -4
  4. sglang/srt/code_completion_parser.py +1 -1
  5. sglang/srt/configs/deepseekvl2.py +1 -1
  6. sglang/srt/configs/model_config.py +9 -3
  7. sglang/srt/constrained/llguidance_backend.py +78 -61
  8. sglang/srt/conversation.py +34 -1
  9. sglang/srt/disaggregation/decode.py +67 -13
  10. sglang/srt/disaggregation/fake/__init__.py +1 -0
  11. sglang/srt/disaggregation/fake/conn.py +88 -0
  12. sglang/srt/disaggregation/mini_lb.py +45 -8
  13. sglang/srt/disaggregation/mooncake/conn.py +198 -31
  14. sglang/srt/disaggregation/prefill.py +36 -12
  15. sglang/srt/disaggregation/utils.py +16 -2
  16. sglang/srt/entrypoints/engine.py +9 -0
  17. sglang/srt/entrypoints/http_server.py +35 -4
  18. sglang/srt/function_call_parser.py +77 -5
  19. sglang/srt/layers/attention/base_attn_backend.py +3 -0
  20. sglang/srt/layers/attention/cutlass_mla_backend.py +278 -0
  21. sglang/srt/layers/attention/flashattention_backend.py +28 -10
  22. sglang/srt/layers/attention/flashmla_backend.py +8 -11
  23. sglang/srt/layers/attention/utils.py +1 -1
  24. sglang/srt/layers/attention/vision.py +2 -0
  25. sglang/srt/layers/layernorm.py +38 -16
  26. sglang/srt/layers/logits_processor.py +2 -2
  27. sglang/srt/layers/moe/fused_moe_native.py +2 -4
  28. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  29. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=192,device_name=NVIDIA_H20.json +146 -0
  30. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=192,device_name=NVIDIA_H200.json +146 -0
  31. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  32. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=384,device_name=NVIDIA_H20.json +146 -0
  33. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  34. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=384,device_name=NVIDIA_H200.json +146 -0
  35. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  36. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_H20.json +146 -0
  37. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  38. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_H200.json +146 -0
  39. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=96,device_name=NVIDIA_H20.json +146 -0
  40. sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +41 -41
  41. sglang/srt/layers/moe/fused_moe_triton/configs/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  42. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +20 -17
  43. sglang/srt/layers/moe/fused_moe_triton/layer.py +15 -17
  44. sglang/srt/layers/pooler.py +6 -0
  45. sglang/srt/layers/quantization/awq.py +5 -1
  46. sglang/srt/layers/quantization/deep_gemm.py +17 -10
  47. sglang/srt/layers/quantization/fp8.py +20 -22
  48. sglang/srt/layers/quantization/fp8_utils.py +2 -2
  49. sglang/srt/layers/quantization/int8_kernel.py +32 -1
  50. sglang/srt/layers/radix_attention.py +13 -3
  51. sglang/srt/layers/rotary_embedding.py +170 -126
  52. sglang/srt/managers/data_parallel_controller.py +10 -3
  53. sglang/srt/managers/io_struct.py +7 -0
  54. sglang/srt/managers/mm_utils.py +85 -28
  55. sglang/srt/managers/multimodal_processors/base_processor.py +14 -1
  56. sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +9 -2
  57. sglang/srt/managers/multimodal_processors/gemma3.py +2 -5
  58. sglang/srt/managers/multimodal_processors/janus_pro.py +2 -2
  59. sglang/srt/managers/multimodal_processors/minicpm.py +4 -3
  60. sglang/srt/managers/multimodal_processors/qwen_vl.py +38 -13
  61. sglang/srt/managers/schedule_batch.py +38 -12
  62. sglang/srt/managers/scheduler.py +41 -28
  63. sglang/srt/managers/scheduler_output_processor_mixin.py +25 -9
  64. sglang/srt/managers/tokenizer_manager.py +5 -1
  65. sglang/srt/managers/tp_worker.py +3 -3
  66. sglang/srt/managers/tp_worker_overlap_thread.py +9 -4
  67. sglang/srt/mem_cache/memory_pool.py +87 -0
  68. sglang/srt/model_executor/cuda_graph_runner.py +4 -3
  69. sglang/srt/model_executor/forward_batch_info.py +51 -95
  70. sglang/srt/model_executor/model_runner.py +19 -25
  71. sglang/srt/models/deepseek.py +12 -2
  72. sglang/srt/models/deepseek_nextn.py +101 -6
  73. sglang/srt/models/deepseek_v2.py +144 -70
  74. sglang/srt/models/deepseek_vl2.py +9 -4
  75. sglang/srt/models/gemma3_causal.py +1 -1
  76. sglang/srt/models/llama4.py +0 -1
  77. sglang/srt/models/minicpmo.py +5 -1
  78. sglang/srt/models/mllama4.py +2 -2
  79. sglang/srt/models/qwen2_5_vl.py +3 -6
  80. sglang/srt/models/qwen2_vl.py +3 -7
  81. sglang/srt/models/roberta.py +178 -0
  82. sglang/srt/openai_api/adapter.py +50 -11
  83. sglang/srt/openai_api/protocol.py +2 -0
  84. sglang/srt/reasoning_parser.py +25 -1
  85. sglang/srt/server_args.py +31 -24
  86. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +3 -3
  87. sglang/srt/torch_memory_saver_adapter.py +10 -1
  88. sglang/srt/utils.py +5 -1
  89. sglang/test/runners.py +6 -13
  90. sglang/test/send_one.py +84 -28
  91. sglang/test/test_utils.py +74 -18
  92. sglang/version.py +1 -1
  93. {sglang-0.4.5.post3.dist-info → sglang-0.4.6.post1.dist-info}/METADATA +5 -6
  94. {sglang-0.4.5.post3.dist-info → sglang-0.4.6.post1.dist-info}/RECORD +97 -80
  95. {sglang-0.4.5.post3.dist-info → sglang-0.4.6.post1.dist-info}/WHEEL +1 -1
  96. {sglang-0.4.5.post3.dist-info → sglang-0.4.6.post1.dist-info}/licenses/LICENSE +0 -0
  97. {sglang-0.4.5.post3.dist-info → sglang-0.4.6.post1.dist-info}/top_level.txt +0 -0
sglang/srt/models/roberta.py ADDED
@@ -0,0 +1,178 @@
+# SPDX-License-Identifier: Apache-2.0
+
+import itertools
+from typing import Iterable, Optional, Tuple
+
+import torch
+from torch import nn
+
+from sglang.srt.layers.pooler import Pooler, PoolingType
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.vocab_parallel_embedding import VocabParallelEmbedding
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.models.bert import BertEncoder
+
+RobertaConfig = None
+
+
+class RobertaEmbedding(nn.Module):
+
+    def __init__(self, config: RobertaConfig):
+        super().__init__()
+        self.size = config.hidden_size
+        self.word_embeddings = VocabParallelEmbedding(
+            config.vocab_size, config.hidden_size
+        )
+        self.padding_idx = config.pad_token_id
+        self.position_embeddings = nn.Embedding(
+            config.max_position_embeddings,
+            config.hidden_size,
+            padding_idx=self.padding_idx,
+        )
+
+        self.token_type_embeddings = nn.Embedding(
+            config.type_vocab_size, config.hidden_size
+        )
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+
+        self.position_ids = nn.Parameter(
+            torch.empty((1, config.max_position_embeddings)),
+        )
+
+        self.position_embedding_type = config.position_embedding_type
+        if self.position_embedding_type != "absolute":
+            raise ValueError(
+                "Only 'absolute' position_embedding_type is supported"
+            )
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        seq_lens: torch.Tensor,
+        position_ids: torch.Tensor,
+        inputs_embeds=None,
+        token_type_ids: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        input_shape = input_ids.size()
+        inputs_embeds = self.word_embeddings(input_ids)
+
+        # Adapted from vllm: https://github.com/vllm-project/vllm/commit/4a18fd14ba4a349291c798a16bf62fa8a9af0b6b/vllm/model_executor/models/roberta.py
+
+        pos_list = []
+        token_list = []
+        offset = 0
+        for seq_len in seq_lens:
+            pos_list.append(position_ids[offset : offset + seq_len])
+            token_list.append(input_ids[offset : offset + seq_len])
+            offset += seq_len
+
+        new_pos_list = []
+        for positions, tokens in zip(pos_list, token_list):
+            # Verify the assumption that incoming positions are
+            # always a sequence from 0 to N.
+            expected_pos = torch.arange(
+                positions.size()[0], dtype=torch.long, device=inputs_embeds.device
+            )
+            assert torch.equal(positions, expected_pos)
+            new_pos_list.append(
+                create_position_ids_from_input_ids(tokens, self.padding_idx)
+            )
+        position_ids = torch.cat(new_pos_list)
+
+        # Position embeddings.
+        position_embeddings = self.position_embeddings(position_ids)
+        if token_type_ids is None:
+            token_type_ids = torch.zeros(
+                input_shape, dtype=torch.long, device=inputs_embeds.device
+            )
+
+        token_type_embeddings = self.token_type_embeddings(token_type_ids)
+        embeddings = inputs_embeds + token_type_embeddings + position_embeddings
+        embeddings = self.LayerNorm(embeddings)
+        return embeddings
+
+
+class XLMRobertaModel(nn.Module):
+    def __init__(
+        self,
+        *,
+        config: RobertaConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+
+        self.config = config
+        self.embeddings = RobertaEmbedding(config)
+        self.encoder = BertEncoder(config=config, quant_config=quant_config, prefix="")
+        self.pooler = Pooler(pooling_type=PoolingType.CLS, normalize=True)
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        get_embedding: bool = False,
+    ) -> torch.Tensor:
+        assert get_embedding == True
+
+        hidden_states = self.embeddings(
+            input_ids=input_ids,
+            position_ids=positions,
+            seq_lens=forward_batch.seq_lens,
+        )
+
+        hidden_states = self.encoder(hidden_states, forward_batch=forward_batch)
+        pooler_out = self.pooler(hidden_states, forward_batch)
+        return pooler_out
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "query", "q"),
+            ("qkv_proj", "key", "k"),
+            ("qkv_proj", "value", "v"),
+        ]
+
+        params_dict = dict(self.named_parameters())
+        for name, loaded_weight in weights:
+            name = name.replace("self", "self_attn")
+            if "pooler" in name:
+                continue
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+
+
+# Adapted from transformers
+def create_position_ids_from_input_ids(
+    input_ids, padding_idx, past_key_values_length=0
+):
+    mask = input_ids.ne(padding_idx).int()
+    incremental_indices = (
+        torch.cumsum(mask, dim=0).type_as(mask) + past_key_values_length
+    ) * mask
+    return incremental_indices.long() + padding_idx
+
+
+EntryClass = [XLMRobertaModel]
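
The padding-aware position ids computed by `create_position_ids_from_input_ids` above can be checked in isolation. A minimal sketch with a made-up token sequence and RoBERTa's usual `padding_idx=1` (both values are illustrative, not taken from the diff):

```python
import torch

# Hypothetical 1-D token sequence with two trailing pad tokens (padding_idx=1).
input_ids = torch.tensor([0, 31414, 232, 2, 1, 1])
padding_idx = 1

mask = input_ids.ne(padding_idx).int()                        # [1, 1, 1, 1, 0, 0]
incremental = torch.cumsum(mask, dim=0).type_as(mask) * mask  # [1, 2, 3, 4, 0, 0]
position_ids = incremental.long() + padding_idx               # [2, 3, 4, 5, 1, 1]

# Real tokens get positions starting at padding_idx + 1; pad positions stay at
# padding_idx, matching what RobertaEmbedding.forward feeds to position_embeddings.
print(position_ids.tolist())
```

Positions therefore never collide with the embedding row reserved for padding.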
sglang/srt/openai_api/adapter.py CHANGED
@@ -715,7 +715,10 @@ def v1_generate_response(
 
 
 async def v1_completions(tokenizer_manager, raw_request: Request):
-    request_json = await raw_request.json()
+    try:
+        request_json = await raw_request.json()
+    except Exception as e:
+        return create_error_response("Invalid request body, error: ", str(e))
     all_requests = [CompletionRequest(**request_json)]
     created = int(time.time())
     adapted_request, request = v1_generate_request(all_requests)
@@ -909,6 +912,7 @@ def v1_chat_generate_request(
 
     # NOTE: with openai API, the prompt's logprobs are always not computed
 
+    is_multimodal = tokenizer_manager.model_config.is_multimodal
     for request in all_requests:
         # Prep the data needed for the underlying GenerateReqInput:
         # - prompt: The full prompt string.
@@ -918,6 +922,7 @@ def v1_chat_generate_request(
         #   None skips any image processing in GenerateReqInput.
         strict_tag = None
         prompt = ""
+        prompt_ids = []
         if not isinstance(request.messages, str):
             # Apply chat template and its stop strings.
             tools = None
@@ -964,10 +969,10 @@
                        ),
                    }
                )
-                # TODO fix the compatible issues with xgrammar
-                strict_tag = None
 
             for message in request.messages:
+                if message.content is None:
+                    message.content = ""
                 if isinstance(message.content, str):
                     openai_compatible_messages.append(
                         {"role": message.role, "content": message.content}
@@ -998,6 +1003,11 @@
                     tokenize=True,
                     add_generation_prompt=True,
                     tools=tools,
+                    **(
+                        request.chat_template_kwargs
+                        if request.chat_template_kwargs
+                        else {}
+                    ),
                 )
             except:
                 # This except branch will be triggered when the chosen model
@@ -1009,6 +1019,11 @@
                     tokenize=True,
                     add_generation_prompt=True,
                     tools=tools,
+                    **(
+                        request.chat_template_kwargs
+                        if request.chat_template_kwargs
+                        else {}
+                    ),
                 )
 
             if assistant_prefix:
@@ -1019,7 +1034,7 @@
                 ):
                     encoded = encoded[1:]
                 prompt_ids += encoded
-            if tokenizer_manager.model_config.is_multimodal:
+            if is_multimodal:
                 prompt = tokenizer_manager.tokenizer.decode(prompt_ids)
             stop = request.stop
             image_data = None
@@ -1064,8 +1079,9 @@
                 stop.append(request.stop)
             else:
                 stop.extend(request.stop)
-            prompt_ids = tokenizer_manager.tokenizer.encode(prompt)
 
+            if not is_multimodal:
+                prompt_ids = tokenizer_manager.tokenizer.encode(prompt)
         else:
             # Use the raw prompt and stop strings if the messages is already a string.
             prompt_ids = request.messages
@@ -1135,7 +1151,7 @@
         audio_data_list.append(audio_data)
         modalities_list.append(modalities)
     if len(all_requests) == 1:
-        if tokenizer_manager.model_config.is_multimodal:
+        if is_multimodal:
             # processor will need text input
             prompt_kwargs = {"text": prompts[0]}
         else:
@@ -1175,6 +1191,7 @@
         modalities=modalities_list,
         lora_path=lora_paths,
         bootstrap_host=all_requests[0].bootstrap_host,
+        bootstrap_port=all_requests[0].bootstrap_port,
        bootstrap_room=all_requests[0].bootstrap_room,
    )
 
@@ -1241,16 +1258,34 @@ def v1_chat_generate_response(
         tool_calls = None
         text = ret_item["text"]
 
+        enable_thinking = True
         if isinstance(request, list):
             tool_choice = request[idx].tool_choice
             tools = request[idx].tools
             separate_reasoning = request[idx].separate_reasoning
+
+            if (
+                request[idx].chat_template_kwargs
+                and request[idx].chat_template_kwargs.get("enable_thinking") is not None
+            ):
+                enable_thinking = request[idx].chat_template_kwargs.get(
+                    "enable_thinking", True
+                )
         else:
             tool_choice = request.tool_choice
             tools = request.tools
             separate_reasoning = request.separate_reasoning
 
-        if reasoning_parser and separate_reasoning:
+            if (
+                request.chat_template_kwargs
+                and request.chat_template_kwargs.get("enable_thinking") is not None
+            ):
+                enable_thinking = request.chat_template_kwargs.get(
+                    "enable_thinking", True
+                )
+
+        reasoning_text = None
+        if reasoning_parser and separate_reasoning and enable_thinking:
             try:
                 parser = ReasoningParser(
                     model_type=reasoning_parser, stream_reasoning=False
@@ -1262,8 +1297,6 @@ def v1_chat_generate_response(
                     HTTPStatus.BAD_REQUEST,
                     "Failed to parse reasoning related info to json format!",
                 )
-        else:
-            reasoning_text = None
 
         if tool_choice != "none" and tools:
             parser = FunctionCallParser(tools, tool_call_parser)
@@ -1378,7 +1411,10 @@
 async def v1_chat_completions(
     tokenizer_manager, raw_request: Request, cache_report=False
 ):
-    request_json = await raw_request.json()
+    try:
+        request_json = await raw_request.json()
+    except Exception as e:
+        return create_error_response("Invalid request body, error: ", str(e))
     all_requests = [ChatCompletionRequest(**request_json)]
     created = int(time.time())
     adapted_request, request = v1_chat_generate_request(all_requests, tokenizer_manager)
@@ -1799,7 +1835,10 @@ def v1_embedding_response(ret, model_path, to_file=False):
 
 
 async def v1_embeddings(tokenizer_manager, raw_request: Request):
-    request_json = await raw_request.json()
+    try:
+        request_json = await raw_request.json()
+    except Exception as e:
+        return create_error_response("Invalid request body, error: ", str(e))
    all_requests = [EmbeddingRequest(**request_json)]
    adapted_request, request = v1_embedding_request(all_requests, tokenizer_manager)
 
sglang/srt/openai_api/protocol.py CHANGED
@@ -361,9 +361,11 @@ class ChatCompletionRequest(BaseModel):
     session_params: Optional[Dict] = None
     separate_reasoning: bool = True
     stream_reasoning: bool = True
+    chat_template_kwargs: Optional[Dict] = None
 
     # For PD disaggregation
     bootstrap_host: Optional[str] = None
+    bootstrap_port: Optional[int] = None
     bootstrap_room: Optional[int] = None
 
 
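
The new `chat_template_kwargs` field is forwarded into `apply_chat_template` and also drives the `enable_thinking` handling in `v1_chat_generate_response` (see the adapter hunks above). A minimal sketch of exercising it through the OpenAI-compatible endpoint; the URL and model name are placeholders:

```python
import openai

# Placeholder endpoint and model; any OpenAI-compatible client works the same way.
client = openai.Client(base_url="http://127.0.0.1:30000/v1", api_key="EMPTY")

resp = client.chat.completions.create(
    model="Qwen/Qwen3-8B",
    messages=[{"role": "user", "content": "What is 3 * 7?"}],
    extra_body={
        # Passed through to the chat template; enable_thinking=False also
        # skips reasoning separation in v1_chat_generate_response.
        "chat_template_kwargs": {"enable_thinking": False},
        "separate_reasoning": True,
    },
)
print(resp.choices[0].message.content)
```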
sglang/srt/reasoning_parser.py CHANGED
@@ -117,6 +117,29 @@ class DeepSeekR1Detector(BaseReasoningFormatDetector):
     # https://github.com/sgl-project/sglang/pull/3202#discussion_r1950153599
 
 
+class Qwen3Detector(BaseReasoningFormatDetector):
+    """
+    Detector for Qwen3 model.
+    Assumes reasoning format:
+      (<think>)*(.*)</think>
+    Returns all the text before the </think> tag as `reasoning_text`
+    and the rest of the text as `normal_text`.
+
+    Args:
+        stream_reasoning (bool): If False, accumulates reasoning content until the end tag.
+            If True, streams reasoning content as it arrives.
+    """
+
+    def __init__(self, stream_reasoning: bool = True):
+        # Qwen3 is assumed to be reasoning until the `</think>` token
+        super().__init__(
+            "<think>",
+            "</think>",
+            force_reasoning=True,
+            stream_reasoning=stream_reasoning,
+        )
+
+
 class ReasoningParser:
     """
     Parser that handles both streaming and non-streaming scenarios for extracting
@@ -129,7 +152,8 @@ class ReasoningParser:
     """
 
     DetectorMap: Dict[str, BaseReasoningFormatDetector] = {
-        "deepseek-r1": DeepSeekR1Detector
+        "deepseek-r1": DeepSeekR1Detector,
+        "qwen3": Qwen3Detector,
     }
 
     def __init__(self, model_type: str = None, stream_reasoning: bool = True):
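
With the `qwen3` entry registered in `DetectorMap`, the parser can now be selected by name. A minimal non-streaming sketch; it assumes the `parse_non_stream` helper used for the existing DeepSeek-R1 path, and the sample text is invented:

```python
from sglang.srt.reasoning_parser import ReasoningParser

# "qwen3" now resolves to Qwen3Detector via DetectorMap.
parser = ReasoningParser(model_type="qwen3", stream_reasoning=False)

# Invented model output: reasoning first, then the final answer after </think>.
# force_reasoning=True means the text is treated as reasoning even without an
# opening <think> tag.
text = "Let me check: 3 * 7 = 21.</think>The answer is 21."
reasoning_text, normal_text = parser.parse_non_stream(text)

print("reasoning:", reasoning_text)  # everything before </think>
print("answer:", normal_text)        # everything after </think>
```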
sglang/srt/server_args.py CHANGED
@@ -153,7 +153,7 @@ class ServerArgs:
     enable_nccl_nvls: bool = False
     disable_outlines_disk_cache: bool = False
     disable_custom_all_reduce: bool = False
-    enable_llama4_multimodal: Optional[bool] = None
+    enable_multimodal: Optional[bool] = None
     disable_overlap_schedule: bool = False
     enable_mixed_chunk: bool = False
     enable_dp_attention: bool = False
@@ -201,7 +201,7 @@ class ServerArgs:
         # Expert parallelism
         if self.enable_ep_moe:
             self.ep_size = self.tp_size
-            logger.info(
+            logger.warning(
                 f"EP MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
             )
 
@@ -243,19 +243,25 @@ class ServerArgs:
             self.chunked_prefill_size = 2048
         else:
             self.chunked_prefill_size = 8192
-
         assert self.chunked_prefill_size % self.page_size == 0
 
         assert self.moe_dense_tp_size in {
             1,
             None,
-        }, f"moe_dense_tp_size only support 1 and None currently"
+        }, "moe_dense_tp_size only support 1 and None currently"
 
         if self.attention_backend == "flashmla":
             logger.warning(
                 "FlashMLA only supports a page_size of 64, change page_size to 64."
             )
             self.page_size = 64
+
+        if self.attention_backend == "cutlass_mla":
+            logger.warning(
+                "Cutlass MLA only supports a page_size of 128, change page_size to 128."
+            )
+            self.page_size = 128
+
         # Set cuda graph max batch size
         if self.cuda_graph_max_bs is None:
             # Based on detailed statistics, when serving TP1/TP2 models on lower-end GPUs with HBM<25G, you can either disable cuda graph or set `cuda_graph_max_bs` to a very small value to reduce the memory overhead of creating cuda graphs, with almost no impact on performance. However, when serving models with TP4 or TP8, we need to enable cuda graph to maintain high performance. In this case, we can set `cuda_graph_max_bs` to 80 (half of the default value 160) to reduce the memory overhead of creating cuda graphs. Looking at the logs from TP4 serving of qwen2-72b, a value of 80 is sufficient and can reduce the memory overhead of creating cuda graphs on lower-end GPUs compared to the original 160, avoiding OOM issues.
@@ -270,6 +276,7 @@ class ServerArgs:
             self.attention_backend = "torch_native"
             self.sampling_backend = "pytorch"
 
+        # Set kernel backends
         if self.sampling_backend is None:
             self.sampling_backend = (
                 "flashinfer" if is_flashinfer_available() else "pytorch"
@@ -285,8 +292,6 @@ class ServerArgs:
         if self.grammar_backend is None:
             self.grammar_backend = "xgrammar"
 
-        self.enable_multimodal: Optional[bool] = self.enable_llama4_multimodal
-
         # Data parallelism attention
         if self.enable_dp_attention:
             self.schedule_conservativeness = self.schedule_conservativeness * 0.3
@@ -299,8 +304,8 @@ class ServerArgs:
                 f"DP attention is enabled. The chunked prefill size is adjusted to {self.chunked_prefill_size} to avoid MoE kernel issues. "
             )
 
-        self.enable_sp_layernorm = False
         # DeepEP MoE
+        self.enable_sp_layernorm = False
         if self.enable_deepep_moe:
             if self.deepep_mode == "auto":
                 assert (
@@ -310,7 +315,7 @@ class ServerArgs:
             self.enable_sp_layernorm = (
                 self.dp_size < self.tp_size if self.enable_dp_attention else True
             )
-            logger.info(
+            logger.warning(
                 f"DeepEP MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
             )
 
@@ -319,14 +324,11 @@ class ServerArgs:
             # NEXTN shares the same implementation of EAGLE
             self.speculative_algorithm = "EAGLE"
 
-        if (
-            self.speculative_algorithm == "EAGLE"
-            or self.speculative_algorithm == "EAGLE3"
-        ):
+        if self.speculative_algorithm in ("EAGLE", "EAGLE3"):
             if self.max_running_requests is None:
                 self.max_running_requests = 48
             self.disable_overlap_schedule = True
-            logger.info(
+            logger.warning(
                 "Overlap scheduler is disabled because of using "
                 "eagle speculative decoding."
             )
@@ -345,7 +347,7 @@ class ServerArgs:
 
             if self.page_size > 1 and self.speculative_eagle_topk > 1:
                 self.speculative_eagle_topk = 1
-                logger.info(
+                logger.warning(
                     "speculative_eagle_topk is adjusted to 1 when page_size > 1"
                 )
 
@@ -353,7 +355,7 @@ class ServerArgs:
                 self.speculative_eagle_topk == 1
                 and self.speculative_num_draft_tokens != self.speculative_num_steps + 1
             ):
-                logger.info(
+                logger.warning(
                     "speculative_num_draft_tokens is adjusted to speculative_num_steps + 1 when speculative_eagle_topk == 1"
                 )
                 self.speculative_num_draft_tokens = self.speculative_num_steps + 1
@@ -424,7 +426,7 @@ class ServerArgs:
         parser.add_argument(
             "--skip-tokenizer-init",
             action="store_true",
-            help="If set, skip init tokenizer and pass input_ids in generate request",
+            help="If set, skip init tokenizer and pass input_ids in generate request.",
         )
         parser.add_argument(
             "--enable-tokenizer-batch-encode",
@@ -563,6 +565,7 @@ class ServerArgs:
             "name, a tag name, or a commit id. If unspecified, will use "
             "the default version.",
         )
+
         # Memory and scheduling
         parser.add_argument(
             "--mem-fraction-static",
@@ -827,7 +830,14 @@ class ServerArgs:
         parser.add_argument(
             "--attention-backend",
             type=str,
-            choices=["flashinfer", "triton", "torch_native", "fa3", "flashmla"],
+            choices=[
+                "flashinfer",
+                "triton",
+                "torch_native",
+                "fa3",
+                "flashmla",
+                "cutlass_mla",
+            ],
             default=ServerArgs.attention_backend,
             help="Choose the kernels for attention layers.",
         )
@@ -979,10 +989,10 @@ class ServerArgs:
             help="Disable the custom all-reduce kernel and fall back to NCCL.",
         )
         parser.add_argument(
-            "--enable-llama4-multimodal",
-            default=ServerArgs.enable_llama4_multimodal,
+            "--enable-multimodal",
+            default=ServerArgs.enable_multimodal,
             action="store_true",
-            help="Enable the multimodal functionality for Llama-4.",
+            help="Enable the multimodal functionality for the served model. If the model being served is not multimodal, nothing will happen.",
         )
         parser.add_argument(
             "--disable-overlap-schedule",
@@ -1364,10 +1374,7 @@ def auto_choose_speculative_params(self: ServerArgs):
 
     You can tune them on your own models and prompts with scripts/playground/bench_speculative.py
     """
-    if self.decrypted_config_file:
-        config_path = self.decrypted_config_file
-    else:
-        config_path = os.path.join(self.model_path, "config.json")
+    config_path = os.path.join(self.model_path, "config.json")
     if not os.path.exists(config_path):
         raise ValueError(f"{config_path} is not found.")
 
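
The renamed `--enable-multimodal` flag and the new `cutlass_mla` attention backend are plain server arguments, so they can be passed on the command line or through the offline engine. A minimal sketch; the model path is a placeholder, and `cutlass_mla` only applies to MLA models on GPUs supported by the CUTLASS kernel:

```python
import sglang as sgl

# CLI equivalent (flags added/renamed in this release):
#   python -m sglang.launch_server --model-path <your MLA model> \
#       --attention-backend cutlass_mla --enable-multimodal
# Note: ServerArgs.__post_init__ forces page_size to 128 when cutlass_mla is selected.
llm = sgl.Engine(
    model_path="deepseek-ai/DeepSeek-V2-Lite",  # placeholder MLA model
    attention_backend="cutlass_mla",
)
print(llm.generate("The capital of France is", {"max_new_tokens": 8}))
llm.shutdown()
```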
sglang/srt/speculative/eagle_draft_cuda_graph_runner.py CHANGED
@@ -85,9 +85,9 @@ class EAGLEDraftCudaGraphRunner:
                 f"Capture cuda graph failed: {e}\n"
                 "Possible solutions:\n"
                 "1. set --mem-fraction-static to a smaller value (e.g., 0.8 or 0.7)\n"
-                "2. disable torch compile by not using --enable-torch-compile\n"
-                "3. specify --dtype to the same dtype (e.g. bfloat16)\n"
-                "4. disable cuda graph by --disable-cuda-graph\n"
+                "2. set --cuda-graph-max-bs to a smaller value (e.g., 16)\n"
+                "3. disable torch compile by not using --enable-torch-compile\n"
+                "4. disable cuda graph by --disable-cuda-graph. (Not recommended. Huge perf loss)\n"
                 "Open an issue on GitHub https://github.com/sgl-project/sglang/issues/new/choose \n"
             )
 
sglang/srt/torch_memory_saver_adapter.py CHANGED
@@ -6,7 +6,9 @@ try:
     import torch_memory_saver
 
     _primary_memory_saver = torch_memory_saver.TorchMemorySaver()
-except ImportError:
+    import_error = None
+except ImportError as e:
+    import_error = e
     pass
 
 logger = logging.getLogger(__name__)
@@ -15,6 +17,13 @@ logger = logging.getLogger(__name__)
 class TorchMemorySaverAdapter(ABC):
     @staticmethod
     def create(enable: bool):
+        if enable and import_error is not None:
+            logger.warning(
+                "enable_memory_saver is enabled, but "
+                "torch-memory-saver is not installed. Please install it "
+                "via `pip3 install torch-memory-saver`. "
+            )
+            raise import_error
         return (
             _TorchMemorySaverAdapterReal() if enable else _TorchMemorySaverAdapterNoop()
         )
sglang/srt/utils.py CHANGED
@@ -1944,7 +1944,7 @@ def get_local_ip_by_remote() -> str:
         s.connect(("2001:4860:4860::8888", 80))  # Doesn't need to be reachable
         return s.getsockname()[0]
     except Exception:
-        raise ValueError(f"Can not get local ip")
+        raise ValueError("Can not get local ip")
 
 
 def is_page_size_one(server_args):
@@ -1970,7 +1970,11 @@ def is_fa3_default_architecture(hf_config):
         "Llama4ForConditionalGeneration",
         "LlamaForCausalLM",
         "MistralForCausalLM",
+        "MixtralForCausalLM",
         "Gemma2ForCausalLM",
+        "Gemma3ForConditionalGeneration",
+        "Qwen3ForCausalLM",
+        "Qwen3MoeForCausalLM",
     }
     return architectures[0] in default_archs
 
sglang/test/runners.py CHANGED
@@ -190,25 +190,18 @@ class HFRunner:
             if attention_mask is not None:
                 attention_mask = attention_mask.to(inputs_embeds.device)
 
-            outputs = self.model.model(
-                input_ids=None,
+            outputs = self.model(
+                input_ids=input_ids,
                 position_ids=position_ids,
                 attention_mask=attention_mask,
                 past_key_values=past_key_values,
+                output_hidden_states=True,
+                return_dict=True,
                 inputs_embeds=inputs_embeds,
+                image_grid_thw=image_grid_thw,
             )
 
-            pooling_mask = attention_mask if pooling_mask is None else pooling_mask
-            left_padding = pooling_mask[:, -1].sum() == pooling_mask.shape[0]  # TODO
-            if left_padding:
-                embeddings = outputs.last_hidden_state[:, -1]
-            else:
-                sequence_lengths = pooling_mask.sum(dim=1) - 1
-                batch_size = outputs.last_hidden_state.shape[0]
-                embeddings = outputs.last_hidden_state[
-                    torch.arange(batch_size, device=outputs.last_hidden_state.device),
-                    sequence_lengths,
-                ]
+            embeddings = outputs.hidden_states[-1][:, -1]
             embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
             return embeddings.contiguous()
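
The replaced pooling logic now takes the last layer's hidden state at the final position and L2-normalizes it. A standalone sketch of that pooling step, with random tensors standing in for `outputs.hidden_states`:

```python
import torch
import torch.nn.functional as F

batch, seq_len, hidden = 2, 5, 8
# Stand-in for outputs.hidden_states when the model is called with
# output_hidden_states=True: a list of [batch, seq_len, hidden] tensors.
hidden_states = [torch.randn(batch, seq_len, hidden) for _ in range(4)]

embeddings = hidden_states[-1][:, -1]             # last layer, last token -> [batch, hidden]
embeddings = F.normalize(embeddings, p=2, dim=1)  # unit-norm embeddings, as in HFRunner
print(embeddings.shape, embeddings.norm(dim=1))   # torch.Size([2, 8]), ~1.0 per row
```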