sglang 0.4.5.post3__py3-none-any.whl → 0.4.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70)
  1. sglang/bench_one_batch.py +19 -3
  2. sglang/bench_serving.py +8 -9
  3. sglang/compile_deep_gemm.py +45 -4
  4. sglang/srt/code_completion_parser.py +1 -1
  5. sglang/srt/configs/deepseekvl2.py +1 -1
  6. sglang/srt/configs/model_config.py +9 -3
  7. sglang/srt/constrained/llguidance_backend.py +78 -61
  8. sglang/srt/conversation.py +34 -1
  9. sglang/srt/disaggregation/decode.py +59 -11
  10. sglang/srt/disaggregation/mini_lb.py +45 -8
  11. sglang/srt/disaggregation/mooncake/conn.py +198 -31
  12. sglang/srt/disaggregation/prefill.py +24 -9
  13. sglang/srt/entrypoints/http_server.py +8 -2
  14. sglang/srt/function_call_parser.py +77 -5
  15. sglang/srt/layers/attention/base_attn_backend.py +3 -0
  16. sglang/srt/layers/attention/flashattention_backend.py +28 -10
  17. sglang/srt/layers/attention/flashmla_backend.py +8 -11
  18. sglang/srt/layers/attention/vision.py +2 -0
  19. sglang/srt/layers/layernorm.py +38 -16
  20. sglang/srt/layers/logits_processor.py +2 -2
  21. sglang/srt/layers/moe/fused_moe_native.py +2 -4
  22. sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +41 -41
  23. sglang/srt/layers/moe/fused_moe_triton/configs/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  24. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +18 -15
  25. sglang/srt/layers/pooler.py +6 -0
  26. sglang/srt/layers/quantization/awq.py +5 -1
  27. sglang/srt/layers/quantization/deep_gemm.py +17 -10
  28. sglang/srt/layers/quantization/int8_kernel.py +32 -1
  29. sglang/srt/layers/radix_attention.py +13 -3
  30. sglang/srt/layers/rotary_embedding.py +170 -126
  31. sglang/srt/managers/data_parallel_controller.py +10 -3
  32. sglang/srt/managers/io_struct.py +7 -0
  33. sglang/srt/managers/mm_utils.py +85 -28
  34. sglang/srt/managers/multimodal_processors/base_processor.py +14 -1
  35. sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +9 -2
  36. sglang/srt/managers/multimodal_processors/gemma3.py +2 -5
  37. sglang/srt/managers/multimodal_processors/janus_pro.py +2 -2
  38. sglang/srt/managers/multimodal_processors/minicpm.py +4 -3
  39. sglang/srt/managers/multimodal_processors/qwen_vl.py +38 -13
  40. sglang/srt/managers/schedule_batch.py +29 -12
  41. sglang/srt/managers/scheduler.py +31 -20
  42. sglang/srt/managers/tokenizer_manager.py +5 -1
  43. sglang/srt/mem_cache/memory_pool.py +87 -0
  44. sglang/srt/model_executor/cuda_graph_runner.py +4 -3
  45. sglang/srt/model_executor/forward_batch_info.py +51 -95
  46. sglang/srt/model_executor/model_runner.py +11 -24
  47. sglang/srt/models/deepseek.py +12 -2
  48. sglang/srt/models/deepseek_nextn.py +101 -6
  49. sglang/srt/models/deepseek_v2.py +144 -70
  50. sglang/srt/models/deepseek_vl2.py +9 -4
  51. sglang/srt/models/gemma3_causal.py +1 -1
  52. sglang/srt/models/llama4.py +0 -1
  53. sglang/srt/models/minicpmo.py +5 -1
  54. sglang/srt/models/mllama4.py +2 -2
  55. sglang/srt/models/qwen2_5_vl.py +3 -6
  56. sglang/srt/models/qwen2_vl.py +3 -7
  57. sglang/srt/models/roberta.py +178 -0
  58. sglang/srt/openai_api/adapter.py +18 -8
  59. sglang/srt/server_args.py +15 -22
  60. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +3 -3
  61. sglang/srt/torch_memory_saver_adapter.py +10 -1
  62. sglang/srt/utils.py +2 -1
  63. sglang/test/runners.py +6 -13
  64. sglang/test/test_utils.py +36 -18
  65. sglang/version.py +1 -1
  66. {sglang-0.4.5.post3.dist-info → sglang-0.4.6.dist-info}/METADATA +4 -5
  67. {sglang-0.4.5.post3.dist-info → sglang-0.4.6.dist-info}/RECORD +70 -68
  68. {sglang-0.4.5.post3.dist-info → sglang-0.4.6.dist-info}/WHEEL +1 -1
  69. {sglang-0.4.5.post3.dist-info → sglang-0.4.6.dist-info}/licenses/LICENSE +0 -0
  70. {sglang-0.4.5.post3.dist-info → sglang-0.4.6.dist-info}/top_level.txt +0 -0
sglang/srt/models/roberta.py ADDED
@@ -0,0 +1,178 @@
+ # SPDX-License-Identifier: Apache-2.0
+
+ import itertools
+ from typing import Iterable, Optional, Tuple
+
+ import torch
+ from torch import nn
+
+ from sglang.srt.layers.pooler import Pooler, PoolingType
+ from sglang.srt.layers.quantization.base_config import QuantizationConfig
+ from sglang.srt.layers.vocab_parallel_embedding import VocabParallelEmbedding
+ from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+ from sglang.srt.model_loader.weight_utils import default_weight_loader
+ from sglang.srt.models.bert import BertEncoder
+
+ RobertaConfig = None
+
+
+ class RobertaEmbedding(nn.Module):
+
+     def __init__(self, config: RobertaConfig):
+         super().__init__()
+         self.size = config.hidden_size
+         self.word_embeddings = VocabParallelEmbedding(
+             config.vocab_size, config.hidden_size
+         )
+         self.padding_idx = config.pad_token_id
+         self.position_embeddings = nn.Embedding(
+             config.max_position_embeddings,
+             config.hidden_size,
+             padding_idx=self.padding_idx,
+         )
+
+         self.token_type_embeddings = nn.Embedding(
+             config.type_vocab_size, config.hidden_size
+         )
+         self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+
+         self.position_ids = nn.Parameter(
+             torch.empty((1, config.max_position_embeddings)),
+         )
+
+         self.position_embedding_type = config.position_embedding_type
+         if self.position_embedding_type != "absolute":
+             raise ValueError(
+                 "Only 'absolute' position_embedding_type" + " is supported"
+             )
+
+     def forward(
+         self,
+         input_ids: torch.Tensor,
+         seq_lens: torch.Tensor,
+         position_ids: torch.Tensor,
+         inputs_embeds=None,
+         token_type_ids: Optional[torch.Tensor] = None,
+     ) -> torch.Tensor:
+         input_shape = input_ids.size()
+         inputs_embeds = self.word_embeddings(input_ids)
+
+         # adapted from vllm: https://github.com/vllm-project/vllm/commit/4a18fd14ba4a349291c798a16bf62fa8a9af0b6b/vllm/model_executor/models/roberta.py
+
+         pos_list = []
+         token_list = []
+         offset = 0
+         for seq_len in seq_lens:
+             pos_list.append(position_ids[offset : offset + seq_len])
+             token_list.append(input_ids[offset : offset + seq_len])
+             offset += seq_len
+
+         new_pos_list = []
+         for positions, tokens in zip(pos_list, token_list):
+             # Verify assumption that incoming positions are
+             # always a sequence from 0 to N.
+             expected_pos = torch.arange(
+                 positions.size()[0], dtype=torch.long, device=inputs_embeds.device
+             )
+             assert torch.equal(positions, expected_pos)
+             new_pos_list.append(
+                 create_position_ids_from_input_ids(tokens, self.padding_idx)
+             )
+         position_ids = torch.cat(new_pos_list)
+
+         # Position embeddings.
+         position_embeddings = self.position_embeddings(position_ids)
+         if token_type_ids is None:
+             token_type_ids = torch.zeros(
+                 input_shape, dtype=torch.long, device=inputs_embeds.device
+             )
+
+         token_type_embeddings = self.token_type_embeddings(token_type_ids)
+         embeddings = inputs_embeds + token_type_embeddings + position_embeddings
+         embeddings = self.LayerNorm(embeddings)
+         return embeddings
+
+
+ class XLMRobertaModel(nn.Module):
+     def __init__(
+         self,
+         *,
+         config: RobertaConfig,
+         quant_config: Optional[QuantizationConfig] = None,
+         prefix: str = "",
+     ):
+         super().__init__()
+
+         self.config = config
+         self.embeddings = RobertaEmbedding(config)
+         self.encoder = BertEncoder(config=config, quant_config=quant_config, prefix="")
+         self.pooler = Pooler(pooling_type=PoolingType.CLS, normalize=True)
+
+     @torch.no_grad()
+     def forward(
+         self,
+         input_ids: torch.Tensor,
+         positions: torch.Tensor,
+         forward_batch: ForwardBatch,
+         input_embeds: torch.Tensor = None,
+         get_embedding: bool = False,
+     ) -> torch.Tensor:
+         assert get_embedding == True
+         # Your tokenized IDs
+
+         hidden_states = self.embeddings(
+             input_ids=input_ids,
+             position_ids=positions,
+             seq_lens=forward_batch.seq_lens,
+         )
+
+         hidden_states = self.encoder(hidden_states, forward_batch=forward_batch)
+         pooler_out = self.pooler(hidden_states, forward_batch)
+         return pooler_out
+
+     def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+         stacked_params_mapping = [
+             # (param_name, shard_name, shard_id)
+             ("qkv_proj", "query", "q"),
+             ("qkv_proj", "key", "k"),
+             ("qkv_proj", "value", "v"),
+         ]
+
+         params_dict = dict(self.named_parameters())
+         for name, loaded_weight in weights:
+             name = name.replace("self", "self_attn")
+             if "pooler" in name:
+                 continue
+             for param_name, weight_name, shard_id in stacked_params_mapping:
+
+                 if weight_name not in name:
+                     continue
+                 name = name.replace(weight_name, param_name)
+                 # Skip loading extra bias for GPTQ models.
+                 if name.endswith(".bias") and name not in params_dict:
+                     continue
+                 param = params_dict[name]
+                 weight_loader = param.weight_loader
+                 weight_loader(param, loaded_weight, shard_id)
+                 break
+             else:
+                 # Skip loading extra bias for GPTQ models.
+                 if name.endswith(".bias") and name not in params_dict:
+                     continue
+                 param = params_dict[name]
+                 weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                 weight_loader(param, loaded_weight)
+
+
+ # Adapted from transformers
+ def create_position_ids_from_input_ids(
+     input_ids, padding_idx, past_key_values_length=0
+ ):
+     mask = input_ids.ne(padding_idx).int()
+     incremental_indices = (
+         torch.cumsum(mask, dim=0).type_as(mask) + past_key_values_length
+     ) * mask
+     return incremental_indices.long() + padding_idx
+
+
+ EntryClass = [XLMRobertaModel]
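The new sglang/srt/models/roberta.py registers XLMRobertaModel as an embedding model: it derives RoBERTa-style position ids from the token ids (offset by the padding index) instead of trusting the incoming 0..N positions, runs the shared BertEncoder, and pools with a CLS pooler. A standalone sketch of that position-id convention follows; it reimplements the helper for illustration, and the sample token ids and padding_idx=1 are assumptions, not values taken from the package.

import torch

def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
    # Non-padding tokens get positions padding_idx + 1, padding_idx + 2, ...;
    # padding tokens keep position padding_idx (the RoBERTa convention).
    mask = input_ids.ne(padding_idx).int()
    incremental_indices = (
        torch.cumsum(mask, dim=0).type_as(mask) + past_key_values_length
    ) * mask
    return incremental_indices.long() + padding_idx

tokens = torch.tensor([0, 250, 311, 2, 1, 1])  # two trailing pads; pad id 1 assumed
print(create_position_ids_from_input_ids(tokens, padding_idx=1))
# tensor([2, 3, 4, 5, 1, 1])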
sglang/srt/openai_api/adapter.py CHANGED
@@ -715,7 +715,10 @@ def v1_generate_response(
 
 
  async def v1_completions(tokenizer_manager, raw_request: Request):
-     request_json = await raw_request.json()
+     try:
+         request_json = await raw_request.json()
+     except Exception as e:
+         return create_error_response("Invalid request body, error: ", str(e))
      all_requests = [CompletionRequest(**request_json)]
      created = int(time.time())
      adapted_request, request = v1_generate_request(all_requests)
@@ -909,6 +912,7 @@ def v1_chat_generate_request(
 
      # NOTE: with openai API, the prompt's logprobs are always not computed
 
+     is_multimodal = tokenizer_manager.model_config.is_multimodal
      for request in all_requests:
          # Prep the data needed for the underlying GenerateReqInput:
          # - prompt: The full prompt string.
@@ -918,6 +922,7 @@ def v1_chat_generate_request(
          # None skips any image processing in GenerateReqInput.
          strict_tag = None
          prompt = ""
+         prompt_ids = []
          if not isinstance(request.messages, str):
              # Apply chat template and its stop strings.
              tools = None
@@ -964,8 +969,6 @@ def v1_chat_generate_request(
                          ),
                      }
                  )
-                 # TODO fix the compatible issues with xgrammar
-                 strict_tag = None
 
              for message in request.messages:
                  if isinstance(message.content, str):
@@ -1019,7 +1022,7 @@ def v1_chat_generate_request(
                      ):
                          encoded = encoded[1:]
                      prompt_ids += encoded
-                 if tokenizer_manager.model_config.is_multimodal:
+                 if is_multimodal:
                      prompt = tokenizer_manager.tokenizer.decode(prompt_ids)
                  stop = request.stop
                  image_data = None
@@ -1064,8 +1067,9 @@ def v1_chat_generate_request(
                      stop.append(request.stop)
                  else:
                      stop.extend(request.stop)
-             prompt_ids = tokenizer_manager.tokenizer.encode(prompt)
 
+             if not is_multimodal:
+                 prompt_ids = tokenizer_manager.tokenizer.encode(prompt)
          else:
              # Use the raw prompt and stop strings if the messages is already a string.
              prompt_ids = request.messages
@@ -1135,7 +1139,7 @@ def v1_chat_generate_request(
          audio_data_list.append(audio_data)
          modalities_list.append(modalities)
      if len(all_requests) == 1:
-         if tokenizer_manager.model_config.is_multimodal:
+         if is_multimodal:
              # processor will need text input
              prompt_kwargs = {"text": prompts[0]}
          else:
@@ -1378,7 +1382,10 @@ def v1_chat_generate_response(
  async def v1_chat_completions(
      tokenizer_manager, raw_request: Request, cache_report=False
  ):
-     request_json = await raw_request.json()
+     try:
+         request_json = await raw_request.json()
+     except Exception as e:
+         return create_error_response("Invalid request body, error: ", str(e))
      all_requests = [ChatCompletionRequest(**request_json)]
      created = int(time.time())
      adapted_request, request = v1_chat_generate_request(all_requests, tokenizer_manager)
@@ -1799,7 +1806,10 @@ def v1_embedding_response(ret, model_path, to_file=False):
 
 
  async def v1_embeddings(tokenizer_manager, raw_request: Request):
-     request_json = await raw_request.json()
+     try:
+         request_json = await raw_request.json()
+     except Exception as e:
+         return create_error_response("Invalid request body, error: ", str(e))
      all_requests = [EmbeddingRequest(**request_json)]
      adapted_request, request = v1_embedding_request(all_requests, tokenizer_manager)
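The three endpoint changes above wrap raw_request.json() in a try/except, so a malformed request body now comes back as an "Invalid request body" error response instead of an unhandled server error. A minimal client-side sketch of the new behavior follows; the localhost URL and port are assumptions, not values taken from this diff.

import requests

# Deliberately malformed JSON body; the server should answer with an
# "Invalid request body, error: ..." payload rather than failing the request internally.
resp = requests.post(
    "http://localhost:30000/v1/chat/completions",   # assumed local sglang server
    data="{not valid json",
    headers={"Content-Type": "application/json"},
)
print(resp.status_code, resp.text)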
sglang/srt/server_args.py CHANGED
@@ -153,7 +153,7 @@ class ServerArgs:
      enable_nccl_nvls: bool = False
      disable_outlines_disk_cache: bool = False
      disable_custom_all_reduce: bool = False
-     enable_llama4_multimodal: Optional[bool] = None
+     enable_multimodal: Optional[bool] = None
      disable_overlap_schedule: bool = False
      enable_mixed_chunk: bool = False
      enable_dp_attention: bool = False
@@ -201,7 +201,7 @@
          # Expert parallelism
          if self.enable_ep_moe:
              self.ep_size = self.tp_size
-             logger.info(
+             logger.warning(
                  f"EP MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
              )
 
@@ -243,19 +243,19 @@
              self.chunked_prefill_size = 2048
          else:
              self.chunked_prefill_size = 8192
-
          assert self.chunked_prefill_size % self.page_size == 0
 
          assert self.moe_dense_tp_size in {
              1,
              None,
-         }, f"moe_dense_tp_size only support 1 and None currently"
+         }, "moe_dense_tp_size only support 1 and None currently"
 
          if self.attention_backend == "flashmla":
              logger.warning(
                  "FlashMLA only supports a page_size of 64, change page_size to 64."
              )
              self.page_size = 64
+
          # Set cuda graph max batch size
          if self.cuda_graph_max_bs is None:
              # Based on detailed statistics, when serving TP1/TP2 models on lower-end GPUs with HBM<25G, you can either disable cuda graph or set `cuda_graph_max_bs` to a very small value to reduce the memory overhead of creating cuda graphs, with almost no impact on performance. However, when serving models with TP4 or TP8, we need to enable cuda graph to maintain high performance. In this case, we can set `cuda_graph_max_bs` to 80 (half of the default value 160) to reduce the memory overhead of creating cuda graphs. Looking at the logs from TP4 serving of qwen2-72b, a value of 80 is sufficient and can reduce the memory overhead of creating cuda graphs on lower-end GPUs compared to the original 160, avoiding OOM issues.
@@ -270,6 +270,7 @@
              self.attention_backend = "torch_native"
              self.sampling_backend = "pytorch"
 
+         # Set kernel backends
          if self.sampling_backend is None:
              self.sampling_backend = (
                  "flashinfer" if is_flashinfer_available() else "pytorch"
@@ -285,8 +286,6 @@
          if self.grammar_backend is None:
              self.grammar_backend = "xgrammar"
 
-         self.enable_multimodal: Optional[bool] = self.enable_llama4_multimodal
-
          # Data parallelism attention
          if self.enable_dp_attention:
              self.schedule_conservativeness = self.schedule_conservativeness * 0.3
@@ -299,8 +298,8 @@
                  f"DP attention is enabled. The chunked prefill size is adjusted to {self.chunked_prefill_size} to avoid MoE kernel issues. "
              )
 
-         self.enable_sp_layernorm = False
          # DeepEP MoE
+         self.enable_sp_layernorm = False
          if self.enable_deepep_moe:
              if self.deepep_mode == "auto":
                  assert (
@@ -310,7 +309,7 @@
              self.enable_sp_layernorm = (
                  self.dp_size < self.tp_size if self.enable_dp_attention else True
              )
-             logger.info(
+             logger.warning(
                  f"DeepEP MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
              )
 
@@ -319,14 +318,11 @@
              # NEXTN shares the same implementation of EAGLE
              self.speculative_algorithm = "EAGLE"
 
-         if (
-             self.speculative_algorithm == "EAGLE"
-             or self.speculative_algorithm == "EAGLE3"
-         ):
+         if self.speculative_algorithm in ("EAGLE", "EAGLE3"):
              if self.max_running_requests is None:
                  self.max_running_requests = 48
              self.disable_overlap_schedule = True
-             logger.info(
+             logger.warning(
                  "Overlap scheduler is disabled because of using "
                  "eagle speculative decoding."
              )
@@ -345,7 +341,7 @@
 
              if self.page_size > 1 and self.speculative_eagle_topk > 1:
                  self.speculative_eagle_topk = 1
-                 logger.info(
+                 logger.warning(
                      "speculative_eagle_topk is adjusted to 1 when page_size > 1"
                  )
 
@@ -353,7 +349,7 @@
                  self.speculative_eagle_topk == 1
                  and self.speculative_num_draft_tokens != self.speculative_num_steps + 1
              ):
-                 logger.info(
+                 logger.warning(
                      "speculative_num_draft_tokens is adjusted to speculative_num_steps + 1 when speculative_eagle_topk == 1"
                  )
                  self.speculative_num_draft_tokens = self.speculative_num_steps + 1
@@ -979,10 +975,10 @@
              help="Disable the custom all-reduce kernel and fall back to NCCL.",
          )
          parser.add_argument(
-             "--enable-llama4-multimodal",
-             default=ServerArgs.enable_llama4_multimodal,
+             "--enable-multimodal",
+             default=ServerArgs.enable_multimodal,
              action="store_true",
-             help="Enable the multimodal functionality for Llama-4.",
+             help="Enable the multimodal functionality for the served model. If the model being served is not multimodal, nothing will happen",
          )
          parser.add_argument(
              "--disable-overlap-schedule",
@@ -1364,10 +1360,7 @@ def auto_choose_speculative_params(self: ServerArgs):
 
      You can tune them on your own models and prompts with scripts/playground/bench_speculative.py
      """
-     if self.decrypted_config_file:
-         config_path = self.decrypted_config_file
-     else:
-         config_path = os.path.join(self.model_path, "config.json")
+     config_path = os.path.join(self.model_path, "config.json")
      if not os.path.exists(config_path):
          raise ValueError(f"{config_path} is not found.")
sglang/srt/speculative/eagle_draft_cuda_graph_runner.py CHANGED
@@ -85,9 +85,9 @@ class EAGLEDraftCudaGraphRunner:
                  f"Capture cuda graph failed: {e}\n"
                  "Possible solutions:\n"
                  "1. set --mem-fraction-static to a smaller value (e.g., 0.8 or 0.7)\n"
-                 "2. disable torch compile by not using --enable-torch-compile\n"
-                 "3. specify --dtype to the same dtype (e.g. bfloat16)\n"
-                 "4. disable cuda graph by --disable-cuda-graph\n"
+                 "2. set --cuda-graph-max-bs to a smaller value (e.g., 16)\n"
+                 "3. disable torch compile by not using --enable-torch-compile\n"
+                 "4. disable cuda graph by --disable-cuda-graph. (Not recommonded. Huge perf loss)\n"
                  "Open an issue on GitHub https://github.com/sgl-project/sglang/issues/new/choose \n"
              )
 
sglang/srt/torch_memory_saver_adapter.py CHANGED
@@ -6,7 +6,9 @@ try:
      import torch_memory_saver
 
      _primary_memory_saver = torch_memory_saver.TorchMemorySaver()
- except ImportError:
+     import_error = None
+ except ImportError as e:
+     import_error = e
      pass
 
  logger = logging.getLogger(__name__)
@@ -15,6 +17,13 @@ logger = logging.getLogger(__name__)
  class TorchMemorySaverAdapter(ABC):
      @staticmethod
      def create(enable: bool):
+         if enable and import_error is not None:
+             logger.warning(
+                 "enable_memory_saver is enabled, but "
+                 "torch-memory-saver is not installed. Please install it "
+                 "via `pip3 install torch-memory-saver`. "
+             )
+             raise import_error
          return (
              _TorchMemorySaverAdapterReal() if enable else _TorchMemorySaverAdapterNoop()
          )
sglang/srt/utils.py CHANGED
@@ -1944,7 +1944,7 @@ def get_local_ip_by_remote() -> str:
          s.connect(("2001:4860:4860::8888", 80))  # Doesn't need to be reachable
          return s.getsockname()[0]
      except Exception:
-         raise ValueError(f"Can not get local ip")
+         raise ValueError("Can not get local ip")
 
 
  def is_page_size_one(server_args):
@@ -1971,6 +1971,7 @@ def is_fa3_default_architecture(hf_config):
          "LlamaForCausalLM",
          "MistralForCausalLM",
          "Gemma2ForCausalLM",
+         "Gemma3ForConditionalGeneration",
      }
      return architectures[0] in default_archs
 
sglang/test/runners.py CHANGED
@@ -190,25 +190,18 @@ class HFRunner:
          if attention_mask is not None:
              attention_mask = attention_mask.to(inputs_embeds.device)
 
-         outputs = self.model.model(
-             input_ids=None,
+         outputs = self.model(
+             input_ids=input_ids,
              position_ids=position_ids,
              attention_mask=attention_mask,
              past_key_values=past_key_values,
+             output_hidden_states=True,
+             return_dict=True,
              inputs_embeds=inputs_embeds,
+             image_grid_thw=image_grid_thw,
          )
 
-         pooling_mask = attention_mask if pooling_mask is None else pooling_mask
-         left_padding = pooling_mask[:, -1].sum() == pooling_mask.shape[0]  # TODO
-         if left_padding:
-             embeddings = outputs.last_hidden_state[:, -1]
-         else:
-             sequence_lengths = pooling_mask.sum(dim=1) - 1
-             batch_size = outputs.last_hidden_state.shape[0]
-             embeddings = outputs.last_hidden_state[
-                 torch.arange(batch_size, device=outputs.last_hidden_state.device),
-                 sequence_lengths,
-             ]
+         embeddings = outputs.hidden_states[-1][:, -1]
          embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
          return embeddings.contiguous()
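The HFRunner change above switches the reference embedding path to call the full model with output_hidden_states=True and pool the last token of the final hidden layer before L2-normalizing. A shape-only sketch of that pooling, with illustrative dimensions rather than real model outputs:

import torch

last_hidden = torch.randn(2, 7, 16)    # stand-in for outputs.hidden_states[-1]: (batch, seq, hidden)
embeddings = last_hidden[:, -1]        # last-token pooling, as in the new code
embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
print(embeddings.shape)                # torch.Size([2, 16])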
sglang/test/test_utils.py CHANGED
@@ -8,7 +8,6 @@ import random
  import subprocess
  import threading
  import time
- import traceback
  import unittest
  from concurrent.futures import ThreadPoolExecutor
  from dataclasses import dataclass
@@ -34,27 +33,44 @@ from sglang.srt.utils import (
  from sglang.test.run_eval import run_eval
  from sglang.utils import get_exception_traceback
 
- DEFAULT_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"
- DEFAULT_FP8_MODEL_NAME_FOR_ACCURACY_TEST = "neuralmagic/Meta-Llama-3-8B-Instruct-FP8"
- DEFAULT_FP8_MODEL_NAME_FOR_DYNAMIC_QUANT_ACCURACY_TEST = (
-     "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8-dynamic"
- )
- DEFAULT_FP8_MODEL_NAME_FOR_MODELOPT_QUANT_ACCURACY_TEST = (
-     "nvidia/Llama-3.1-8B-Instruct-FP8"
- )
-
+ # General test models
  DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.1-8B-Instruct"
  DEFAULT_SMALL_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.2-1B-Instruct"
  DEFAULT_MOE_MODEL_NAME_FOR_TEST = "mistralai/Mixtral-8x7B-Instruct-v0.1"
  DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST = "Qwen/Qwen1.5-MoE-A2.7B"
- DEFAULT_SMALL_EMBEDDING_MODEL_NAME_FOR_TEST = "Alibaba-NLP/gte-Qwen2-1.5B-instruct"
+
+ # MLA test models
  DEFAULT_MLA_MODEL_NAME_FOR_TEST = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"
  DEFAULT_MLA_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8"
+ DEFAULT_MODEL_NAME_FOR_TEST_MLA = "lmsys/sglang-ci-dsv3-test"
+ DEFAULT_MODEL_NAME_FOR_TEST_MLA_NEXTN = "lmsys/sglang-ci-dsv3-test-NextN"
+
+ # FP8 models
+ DEFAULT_MODEL_NAME_FOR_TEST_FP8 = "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"
+ DEFAULT_MODEL_NAME_FOR_ACCURACY_TEST_FP8 = "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"
+ DEFAULT_MODEL_NAME_FOR_DYNAMIC_QUANT_ACCURACY_TEST_FP8 = (
+     "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8-dynamic"
+ )
+ DEFAULT_MODEL_NAME_FOR_MODELOPT_QUANT_ACCURACY_TEST_FP8 = (
+     "nvidia/Llama-3.1-8B-Instruct-FP8"
+ )
+
+ # EAGLE
+ DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST = "meta-llama/Llama-2-7b-chat-hf"
+ DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST = "lmsys/sglang-EAGLE-llama2-chat-7B"
+ DEFAULT_MODEL_NAME_FOR_TEST_EAGLE3 = "jamesliu1/sglang-EAGLE3-Llama-3.1-Instruct-8B"
+
+ # Other use cases
+ DEFAULT_MODEL_NAME_FOR_TEST_LOCAL_ATTENTION = (
+     "meta-llama/Llama-4-Scout-17B-16E-Instruct"
+ )
+ DEFAULT_SMALL_EMBEDDING_MODEL_NAME_FOR_TEST = "Alibaba-NLP/gte-Qwen2-1.5B-instruct"
  DEFAULT_REASONING_MODEL_NAME_FOR_TEST = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
  DEFAULT_AWQ_MOE_MODEL_NAME_FOR_TEST = (
      "hugging-quants/Mixtral-8x7B-Instruct-v0.1-AWQ-INT4"
  )
- DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH = 1000
+
+ # Nightly tests
  DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1 = "meta-llama/Llama-3.1-8B-Instruct,mistralai/Mistral-7B-Instruct-v0.3,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct,google/gemma-2-27b-it"
  DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2 = "meta-llama/Llama-3.1-70B-Instruct,mistralai/Mixtral-8x7B-Instruct-v0.1,Qwen/Qwen2-57B-A14B-Instruct"
  DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1 = "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8,neuralmagic/Mistral-7B-Instruct-v0.3-FP8,neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8,neuralmagic/gemma-2-2b-it-FP8"
@@ -63,12 +79,11 @@ DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_QUANT_TP1 = "hugging-quants/Meta-Llama-3.1-8
  DEFAULT_SMALL_MODEL_NAME_FOR_TEST_QWEN = "Qwen/Qwen2.5-1.5B-Instruct"
  DEFAULT_SMALL_VLM_MODEL_NAME = "Qwen/Qwen2-VL-2B"
 
- DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST = "meta-llama/Llama-2-7b-chat-hf"
- DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST = "lmsys/sglang-EAGLE-llama2-chat-7B"
-
  DEFAULT_IMAGE_URL = "https://github.com/sgl-project/sglang/blob/main/test/lang/example_image.png?raw=true"
  DEFAULT_VIDEO_URL = "https://raw.githubusercontent.com/EvolvingLMMs-Lab/sglang/dev/onevision_local/assets/jobs.mp4"
 
+ DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH = 1000
+
 
  def is_in_ci():
      """Return whether it is in CI runner."""
@@ -494,7 +509,7 @@ def run_unittest_files(files: List[TestFile], timeout_per_file: float):
      tic = time.time()
      success = True
 
-     for file in files:
+     for i, file in enumerate(files):
          filename, estimated_time = file.name, file.estimated_time
          process = None
 
@@ -502,7 +517,10 @@
              nonlocal process
 
              filename = os.path.join(os.getcwd(), filename)
-             print(f".\n.\nBegin:\npython3 {filename}\n.\n.\n", flush=True)
+             print(
+                 f".\n.\nBegin ({i}/{len(files) - 1}):\npython3 {filename}\n.\n.\n",
+                 flush=True,
+             )
              tic = time.time()
 
              process = subprocess.Popen(
@@ -512,7 +530,7 @@
              elapsed = time.time() - tic
 
              print(
-                 f".\n.\nEnd:\n{filename=}, {elapsed=:.0f}, {estimated_time=}\n.\n.\n",
+                 f".\n.\nEnd ({i}/{len(files) - 1}):\n{filename=}, {elapsed=:.0f}, {estimated_time=}\n.\n.\n",
                  flush=True,
              )
              return process.returncode
sglang/version.py CHANGED
@@ -1 +1 @@
- __version__ = "0.4.5.post3"
+ __version__ = "0.4.6"
{sglang-0.4.5.post3.dist-info → sglang-0.4.6.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: sglang
- Version: 0.4.5.post3
+ Version: 0.4.6
  Summary: SGLang is yet another fast serving framework for large language models and vision language models.
  License: Apache License
  Version 2.0, January 2004
@@ -225,7 +225,7 @@ Requires-Dist: fastapi; extra == "runtime-common"
  Requires-Dist: hf_transfer; extra == "runtime-common"
  Requires-Dist: huggingface_hub; extra == "runtime-common"
  Requires-Dist: interegular; extra == "runtime-common"
- Requires-Dist: llguidance>=0.6.15; extra == "runtime-common"
+ Requires-Dist: llguidance<0.8.0,>=0.7.11; extra == "runtime-common"
  Requires-Dist: modelscope; extra == "runtime-common"
  Requires-Dist: ninja; extra == "runtime-common"
  Requires-Dist: orjson; extra == "runtime-common"
@@ -242,7 +242,6 @@ Requires-Dist: torchao>=0.7.0; extra == "runtime-common"
  Requires-Dist: transformers==4.51.1; extra == "runtime-common"
  Requires-Dist: uvicorn; extra == "runtime-common"
  Requires-Dist: uvloop; extra == "runtime-common"
- Requires-Dist: compressed-tensors; extra == "runtime-common"
  Requires-Dist: xgrammar==0.1.17; extra == "runtime-common"
  Provides-Extra: srt
  Requires-Dist: sglang[runtime_common]; extra == "srt"
@@ -409,5 +408,5 @@ It is supported by the following institutions: AMD, Atlas Cloud, Baseten, Cursor
 
  For enterprises interested in adopting or deploying SGLang at scale, including technical consulting, sponsorship opportunities, or partnership inquiries, please contact us at contact@sglang.ai.
 
- ## Acknowledgment and Citation
- We learned the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql). Please cite the paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.
+ ## Acknowledgment
+ We learned the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
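The metadata now pins llguidance to <0.8.0,>=0.7.11 (previously >=0.6.15) and drops the compressed-tensors requirement. A quick way to sanity-check a candidate version against the new pin, using the packaging library; the versions tested below are arbitrary examples, not values from the diff.

from packaging.specifiers import SpecifierSet
from packaging.version import Version

spec = SpecifierSet("<0.8.0,>=0.7.11")
print(Version("0.7.11") in spec)  # True
print(Version("0.6.15") in spec)  # False: no longer satisfies the 0.4.6 pin
print(Version("0.8.0") in spec)   # False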