sglang 0.4.6.post2__py3-none-any.whl → 0.4.6.post3__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
Files changed (90)
  1. sglang/bench_one_batch.py +1 -11
  2. sglang/bench_serving.py +149 -1
  3. sglang/lang/chat_template.py +44 -0
  4. sglang/srt/configs/deepseekvl2.py +3 -0
  5. sglang/srt/configs/device_config.py +1 -1
  6. sglang/srt/configs/internvl.py +696 -0
  7. sglang/srt/configs/janus_pro.py +3 -0
  8. sglang/srt/configs/model_config.py +17 -0
  9. sglang/srt/constrained/xgrammar_backend.py +11 -19
  10. sglang/srt/conversation.py +30 -3
  11. sglang/srt/disaggregation/decode.py +4 -1
  12. sglang/srt/disaggregation/mini_lb.py +74 -23
  13. sglang/srt/disaggregation/mooncake/conn.py +9 -18
  14. sglang/srt/disaggregation/nixl/conn.py +241 -71
  15. sglang/srt/disaggregation/utils.py +44 -1
  16. sglang/srt/distributed/device_communicators/custom_all_reduce.py +1 -8
  17. sglang/srt/distributed/device_communicators/npu_communicator.py +39 -0
  18. sglang/srt/distributed/device_communicators/pynccl.py +2 -1
  19. sglang/srt/distributed/device_communicators/shm_broadcast.py +2 -1
  20. sglang/srt/distributed/parallel_state.py +22 -1
  21. sglang/srt/entrypoints/engine.py +14 -2
  22. sglang/srt/entrypoints/http_server.py +28 -1
  23. sglang/srt/entrypoints/verl_engine.py +3 -2
  24. sglang/srt/hf_transformers_utils.py +20 -1
  25. sglang/srt/layers/attention/flashattention_backend.py +146 -50
  26. sglang/srt/layers/attention/flashinfer_backend.py +23 -13
  27. sglang/srt/layers/attention/flashinfer_mla_backend.py +62 -15
  28. sglang/srt/layers/attention/merge_state.py +46 -0
  29. sglang/srt/layers/attention/triton_ops/merge_state.py +96 -0
  30. sglang/srt/layers/attention/vision.py +290 -163
  31. sglang/srt/layers/moe/ep_moe/kernels.py +342 -7
  32. sglang/srt/layers/moe/ep_moe/layer.py +120 -1
  33. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +97 -54
  34. sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  35. sglang/srt/layers/moe/fused_moe_triton/configs/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  36. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +4 -1
  37. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +2 -4
  38. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +2 -1
  39. sglang/srt/layers/quantization/deep_gemm.py +5 -0
  40. sglang/srt/layers/quantization/fp8.py +108 -95
  41. sglang/srt/layers/quantization/fp8_kernel.py +79 -60
  42. sglang/srt/layers/quantization/fp8_utils.py +71 -23
  43. sglang/srt/layers/quantization/kv_cache.py +3 -10
  44. sglang/srt/layers/quantization/utils.py +0 -5
  45. sglang/srt/layers/quantization/w8a8_fp8.py +8 -10
  46. sglang/srt/lora/lora_manager.py +10 -13
  47. sglang/srt/managers/cache_controller.py +115 -119
  48. sglang/srt/managers/io_struct.py +10 -0
  49. sglang/srt/managers/multimodal_processors/base_processor.py +5 -0
  50. sglang/srt/managers/multimodal_processors/internvl.py +232 -0
  51. sglang/srt/managers/schedule_batch.py +19 -1
  52. sglang/srt/managers/schedule_policy.py +11 -5
  53. sglang/srt/managers/scheduler.py +28 -13
  54. sglang/srt/managers/tokenizer_manager.py +24 -13
  55. sglang/srt/managers/tp_worker.py +9 -12
  56. sglang/srt/mem_cache/chunk_cache.py +2 -0
  57. sglang/srt/mem_cache/memory_pool.py +2 -2
  58. sglang/srt/model_executor/model_runner.py +44 -33
  59. sglang/srt/model_loader/loader.py +18 -11
  60. sglang/srt/models/clip.py +4 -4
  61. sglang/srt/models/deepseek_janus_pro.py +1 -1
  62. sglang/srt/models/deepseek_nextn.py +1 -20
  63. sglang/srt/models/deepseek_v2.py +55 -20
  64. sglang/srt/models/gemma3_mm.py +1 -1
  65. sglang/srt/models/internlm2.py +3 -0
  66. sglang/srt/models/internvl.py +670 -0
  67. sglang/srt/models/llama.py +1 -1
  68. sglang/srt/models/llama4.py +53 -7
  69. sglang/srt/models/minicpmv.py +1 -1
  70. sglang/srt/models/mllama.py +1 -1
  71. sglang/srt/models/phi3_small.py +16 -2
  72. sglang/srt/models/qwen2_5_vl.py +8 -4
  73. sglang/srt/models/qwen2_vl.py +4 -4
  74. sglang/srt/models/xiaomi_mimo.py +171 -0
  75. sglang/srt/openai_api/adapter.py +24 -40
  76. sglang/srt/openai_api/protocol.py +28 -16
  77. sglang/srt/reasoning_parser.py +2 -2
  78. sglang/srt/sampling/sampling_batch_info.py +54 -2
  79. sglang/srt/sampling/sampling_params.py +2 -0
  80. sglang/srt/server_args.py +30 -6
  81. sglang/srt/utils.py +35 -1
  82. sglang/test/test_block_fp8.py +2 -2
  83. sglang/test/test_deepep_utils.py +219 -0
  84. sglang/test/test_utils.py +3 -1
  85. sglang/version.py +1 -1
  86. {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post3.dist-info}/METADATA +14 -6
  87. {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post3.dist-info}/RECORD +90 -80
  88. {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post3.dist-info}/WHEEL +1 -1
  89. {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post3.dist-info}/licenses/LICENSE +0 -0
  90. {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post3.dist-info}/top_level.txt +0 -0
sglang/srt/sampling/sampling_batch_info.py CHANGED
@@ -30,8 +30,13 @@ class SamplingBatchInfo:
     # Whether any request needs min_p sampling
     need_min_p_sampling: bool
 
+    # Use thinking_budget to truncate thinking
+    num_thinking_tokens: Optional[torch.Tensor] = None
+    think_end_ids: Optional[torch.Tensor] = None
+    thinking_budgets: Optional[torch.Tensor] = None
+
     # Masking tensors for grammar-guided structured outputs
-    vocab_size: int
+    vocab_size: int = 0
     grammars: Optional[List] = None
     vocab_mask: Optional[torch.Tensor] = None
     apply_mask_func: Optional[Callable[[torch.Tensor, torch.Tensor], None]] = None
@@ -76,7 +81,22 @@ class SamplingBatchInfo:
         min_ps = torch.tensor(
             [r.sampling_params.min_p for r in reqs], dtype=torch.float
         ).to(device, non_blocking=True)
-
+        if any(hasattr(r.tokenizer, "think_end_id") for r in reqs):
+            think_end_ids = torch.tensor(
+                [getattr(r.tokenizer, "think_end_id", -1) for r in reqs],
+                dtype=torch.int64,
+            ).to(device, non_blocking=True)
+            num_thinking_tokens = torch.tensor([0 for _ in reqs], dtype=torch.int64).to(
+                device, non_blocking=True
+            )
+            thinking_budgets = torch.tensor(
+                [r.sampling_params.thinking_budget or -1 for r in reqs],
+                dtype=torch.int64,
+            ).to(device, non_blocking=True)
+        else:
+            think_end_ids = None
+            num_thinking_tokens = None
+            thinking_budgets = None
         # Check if any request has custom logit processor
         has_custom_logit_processor = (
             batch.enable_custom_logit_processor  # check the flag first.
@@ -132,6 +152,9 @@ class SamplingBatchInfo:
             top_ps=top_ps,
             top_ks=top_ks,
             min_ps=min_ps,
+            think_end_ids=think_end_ids,
+            num_thinking_tokens=num_thinking_tokens,
+            thinking_budgets=thinking_budgets,
             is_all_greedy=all(r.sampling_params.top_k <= 1 for r in reqs),
             need_min_p_sampling=any(r.sampling_params.min_p > 0 for r in reqs),
             vocab_size=vocab_size,
@@ -146,6 +169,35 @@ class SamplingBatchInfo:
     def __len__(self):
         return len(self.temperatures)
 
+    def apply_thinking_budgets(self, next_token_logits: torch.Tensor):
+        has_budget = self.thinking_budgets > 0
+        if not has_budget.any():
+            return
+        torch.where(
+            has_budget,
+            self.num_thinking_tokens + 1,
+            self.num_thinking_tokens,
+            out=self.num_thinking_tokens,
+        )
+        should_stop = has_budget & (
+            self.num_thinking_tokens - 1 > self.thinking_budgets
+        )
+        next_token_logits.masked_fill_(should_stop.unsqueeze(0), float("-inf"))
+        batch_indices = torch.nonzero(should_stop, as_tuple=True)[0]
+        if len(batch_indices) > 0:
+            end_token_indices = self.think_end_ids[batch_indices]
+            next_token_logits[batch_indices, end_token_indices] = 0.0
+
+    def update_thinking_budgets(self, next_token_ids: torch.Tensor):
+        if not torch.any(self.thinking_budgets > 0):
+            return
+        torch.where(
+            next_token_ids == self.think_end_ids,
+            torch.tensor(-1, device=self.thinking_budgets.device),
+            self.thinking_budgets,
+            out=self.thinking_budgets,
+        )
+
     def update_regex_vocab_mask(self):
         if not self.grammars:
             self.vocab_mask = None
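
Note: the two methods added above are meant to bracket each decode step — `apply_thinking_budgets` runs on the logits before sampling (forcing the think-end token once a budget is exhausted), and `update_thinking_budgets` runs on the sampled ids afterwards (retiring budgets once the think-end token appears). A minimal sketch of that call order, where `model_forward` and `sample` are hypothetical stand-ins for the engine's real forward and sampling functions:

# Hedged sketch of the intended call order; `model_forward` and `sample`
# are placeholders, not sglang APIs. `sampling_info` is a SamplingBatchInfo.
def decode_step(batch, sampling_info):
    next_token_logits = model_forward(batch)
    if sampling_info.thinking_budgets is not None:
        # Before sampling: force the think-end token for over-budget requests
        sampling_info.apply_thinking_budgets(next_token_logits)
    next_token_ids = sample(next_token_logits, sampling_info)
    if sampling_info.thinking_budgets is not None:
        # After sampling: budgets are set to -1 once think-end has been emitted
        sampling_info.update_thinking_budgets(next_token_ids)
    return next_token_ids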
sglang/srt/sampling/sampling_params.py CHANGED
@@ -30,6 +30,7 @@ class SamplingParams:
     def __init__(
         self,
        max_new_tokens: int = 128,
+        thinking_budget: Optional[int] = None,
         stop: Optional[Union[str, List[str]]] = None,
         stop_token_ids: Optional[List[int]] = None,
         temperature: float = 1.0,
@@ -57,6 +58,7 @@ class SamplingParams:
             self.stop_token_ids = set(stop_token_ids)
         else:
             self.stop_token_ids = None
+        self.thinking_budget = thinking_budget
         self.temperature = temperature
         self.top_p = top_p
         self.top_k = top_k
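
Note: with these two changes, `thinking_budget` rides along with the other sampling parameters. A hedged usage sketch against sglang's native `/generate` HTTP endpoint (the prompt and port are placeholders; only the parameter name comes from the diff):

# Hedged sketch: capping the thinking phase of a reasoning model.
import requests

resp = requests.post(
    "http://localhost:30000/generate",  # default sglang server port
    json={
        "text": "Solve: what is 17 * 23?",
        "sampling_params": {
            "max_new_tokens": 512,
            "thinking_budget": 128,  # force think-end after ~128 thinking tokens
        },
    },
)
print(resp.json())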
sglang/srt/server_args.py CHANGED
@@ -187,6 +187,7 @@ class ServerArgs:
     n_share_experts_fusion: int = 0
     disable_chunked_prefix_cache: bool = False
     disable_fast_image_processor: bool = False
+    mm_attention_backend: Optional[str] = None
 
     # Debug tensor dumps
     debug_tensor_dump_output_folder: Optional[str] = None
@@ -198,6 +199,7 @@ class ServerArgs:
     disaggregation_bootstrap_port: int = 8998
     disaggregation_transfer_backend: str = "mooncake"
     disaggregation_ib_device: Optional[str] = None
+    pdlb_url: Optional[str] = None
 
     def __post_init__(self):
         # Expert parallelism
@@ -322,6 +324,9 @@ class ServerArgs:
             assert (
                 not self.enable_dp_attention
             ), "DeepEP MoE `auto` mode is not supported with DP Attention."
+            if self.deepep_mode == "normal":
+                logger.warning("Cuda graph is disabled because deepep_mode=`normal`")
+                self.disable_cuda_graph = True
             self.ep_size = self.tp_size
             self.enable_sp_layernorm = (
                 self.dp_size < self.tp_size if self.enable_dp_attention else True
@@ -347,10 +352,13 @@ class ServerArgs:
         model_arch = get_model_arch(self)
 
         # Auto set draft_model_path DeepSeek-V3/R1
-        if self.speculative_draft_model_path is None and model_arch in [
-            "DeepseekV3ForCausalLM"
-        ]:
-            self.speculative_draft_model_path = self.model_path
+        if model_arch == "DeepseekV3ForCausalLM":
+            if self.speculative_draft_model_path is None:
+                self.speculative_draft_model_path = self.model_path
+            else:
+                logger.warning(
+                    "DeepSeek MTP does not require setting speculative_draft_model_path."
+                )
 
         # Auto choose parameters
         if self.speculative_num_steps is None:
@@ -551,7 +559,7 @@ class ServerArgs:
             "--device",
             type=str,
             default=ServerArgs.device,
-            help="The device to use ('cuda', 'xpu', 'hpu', 'cpu'). Defaults to auto-detection if not specified.",
+            help="The device to use ('cuda', 'xpu', 'hpu', 'npu', 'cpu'). Defaults to auto-detection if not specified.",
         )
         parser.add_argument(
             "--served-model-name",
@@ -1247,7 +1255,23 @@ class ServerArgs:
             "--disaggregation-ib-device",
             type=str,
             default=ServerArgs.disaggregation_ib_device,
-            help="The ib device for disaggregation transfer. Default is None, it will be detected automatically if using the mooncake backend.",
+            help="The InfiniBand devices for disaggregation transfer, accepts single device (e.g., --disaggregation-ib-device mlx5_0) "
+            "or multiple comma-separated devices (e.g., --disaggregation-ib-device mlx5_0,mlx5_1). "
+            "Default is None, which triggers automatic device detection when mooncake backend is enabled.",
+        )
+        parser.add_argument(
+            "--pdlb-url",
+            type=str,
+            default=None,
+            help="The URL of the PD disaggregation load balancer. If set, the prefill/decode server will register with the load balancer.",
+        )
+
+        parser.add_argument(
+            "--mm-attention-backend",
+            type=str,
+            choices=["sdpa", "fa3", "triton_attn"],
+            default=ServerArgs.mm_attention_backend,
+            help="Set multimodal attention backend.",
         )
 
     @classmethod
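
Note: all three flags above are wired through `ServerArgs`, so they can be passed straight to the server entrypoint. A hedged launch sketch (`python -m sglang.launch_server` is sglang's standard entrypoint; the model path, NIC names, and load-balancer URL are placeholders):

# Hedged sketch: launching a worker with the new flags from this release.
import subprocess

subprocess.run([
    "python", "-m", "sglang.launch_server",
    "--model-path", "deepseek-ai/DeepSeek-V3",      # placeholder model
    "--mm-attention-backend", "fa3",                # sdpa | fa3 | triton_attn
    "--disaggregation-ib-device", "mlx5_0,mlx5_1",  # comma-separated IB devices
    "--pdlb-url", "http://lb.example.com:8000",     # register with PD load balancer
])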
sglang/srt/utils.py CHANGED
@@ -145,6 +145,10 @@ def is_xpu() -> bool:
     return hasattr(torch, "xpu") and torch.xpu.is_available()
 
 
+def is_npu() -> bool:
+    return hasattr(torch, "npu") and torch.npu.is_available()
+
+
 def is_flashinfer_available():
     """
     Check whether flashinfer is available.
@@ -328,6 +332,16 @@ def get_available_gpu_memory(device, gpu_id, distributed=False, empty_cache=True
     elif device == "cpu":
         # TODO: rename the variables in the current function to be not GPU specific
         free_gpu_memory = psutil.virtual_memory().available
+    elif device == "npu":
+        num_gpus = torch.npu.device_count()
+        assert gpu_id < num_gpus
+
+        if torch.npu.current_device() != gpu_id:
+            print(
+                f"WARNING: current device is not {gpu_id}, but {torch.npu.current_device()}, ",
+                "which may cause useless memory allocation for torch NPU context.",
+            )
+        free_gpu_memory, total_gpu_memory = torch.npu.mem_get_info()
 
     if distributed:
         tensor = torch.tensor(free_gpu_memory, dtype=torch.float32).to(
@@ -897,7 +911,10 @@ def broadcast_pyobj(
     src: int = 0,
     force_cpu_device: bool = True,
 ):
-    """Broadcast inputs from rank=0 to all other ranks with torch.dist backend."""
+    """Broadcast inputs from src rank to all other ranks with torch.dist backend.
+    The `rank` here refer to the source rank on global process group (regardless
+    of dist_group argument).
+    """
     device = torch.device(
         "cuda" if torch.cuda.is_available() and not force_cpu_device else "cpu"
     )
@@ -1345,6 +1362,9 @@ def get_device_name(device_id: int = 0) -> str:
     if hasattr(torch, "hpu") and torch.hpu.is_available():
         return torch.hpu.get_device_name(device_id)
 
+    if hasattr(torch, "npu") and torch.npu.is_available():
+        return torch.npu.get_device_name(device_id)
+
 
 @lru_cache(maxsize=1)
 def is_habana_available() -> bool:
@@ -1441,6 +1461,13 @@ def get_compiler_backend() -> str:
     if hasattr(torch, "hpu") and torch.hpu.is_available():
         return "hpu_backend"
 
+    if hasattr(torch, "npu") and torch.npu.is_available():
+        import torchair
+
+        config = torchair.CompilerConfig()
+        npu_backend = torchair.get_npu_backend(compiler_config=config)
+        return npu_backend
+
     return "inductor"
 
 
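Note: on NPU this function now returns a callable torchair backend object rather than a backend name string; `torch.compile` accepts either form for its `backend=` argument. A hedged consumption sketch (`fused_op` is an illustrative function, not sglang code):

# Hedged sketch: both the strings ("inductor", "hpu_backend") and the
# torchair backend object returned above are valid `backend=` arguments.
import torch
from sglang.srt.utils import get_compiler_backend

def fused_op(x: torch.Tensor) -> torch.Tensor:
    return torch.nn.functional.gelu(x) * 2.0

compiled_op = torch.compile(fused_op, backend=get_compiler_backend())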
@@ -2069,3 +2096,10 @@ class BumpAllocator:
         output = self._buffer[self._pointer : self._pointer + size]
         self._pointer += size
         return output
+
+
+def log_info_on_rank0(logger, msg):
+    from sglang.srt.distributed import get_tensor_model_parallel_rank
+
+    if get_tensor_model_parallel_rank() == 0:
+        logger.info(msg)
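
Note: `log_info_on_rank0` deduplicates logging under tensor parallelism, so a message is emitted once instead of once per TP rank. A hedged usage sketch (it assumes the TP process groups are already initialized; the message text is illustrative):

# Hedged sketch: with tp_size=8, this logs once rather than eight times.
import logging
from sglang.srt.utils import log_info_on_rank0

logger = logging.getLogger(__name__)
log_info_on_rank0(logger, "Weights loaded, starting scheduler.")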
sglang/test/test_block_fp8.py CHANGED
@@ -7,9 +7,9 @@ import torch
 from sglang.srt.layers.activation import SiluAndMul
 from sglang.srt.layers.moe.fused_moe_triton.fused_moe import fused_moe
 from sglang.srt.layers.quantization.fp8_kernel import (
-    per_tensor_quant_mla_deep_gemm_masked_fp8,
     per_tensor_quant_mla_fp8,
     per_token_group_quant_fp8,
+    per_token_group_quant_mla_deep_gemm_masked_fp8,
     static_quant_fp8,
     w8a8_block_fp8_matmul,
 )
@@ -236,7 +236,7 @@ class TestPerTokenGroupQuantMlaDeepGemmMaskedFP8(CustomTestCase):
 
         with torch.inference_mode():
             ref_out, ref_scale = native_per_token_group_quant_fp8(x, group_size, 1e-12)
-            out, scale, _, _, _ = per_tensor_quant_mla_deep_gemm_masked_fp8(
+            out, scale, _, _, _ = per_token_group_quant_mla_deep_gemm_masked_fp8(
                 x, group_size
            )
             out = out[:, :num_tokens, :]
sglang/test/test_deepep_utils.py ADDED
@@ -0,0 +1,219 @@
+# Copy from deepseek-ai/DeepEP/tests/test_utils.py
+
+import os
+import sys
+from typing import Optional
+
+import numpy as np
+import torch
+import torch.distributed as dist
+
+
+def init_dist(local_rank: int, num_local_ranks: int):
+    # NOTES: you may rewrite this function with your own cluster settings
+    ip = os.getenv("MASTER_ADDR", "127.0.0.1")
+    port = int(os.getenv("MASTER_PORT", "8361"))
+    num_nodes = int(os.getenv("WORLD_SIZE", 1))
+    node_rank = int(os.getenv("RANK", 0))
+    assert (num_local_ranks < 8 and num_nodes == 1) or num_local_ranks == 8
+
+    dist.init_process_group(
+        backend="nccl",
+        init_method=f"tcp://{ip}:{port}",
+        world_size=num_nodes * num_local_ranks,
+        rank=node_rank * num_local_ranks + local_rank,
+    )
+    torch.set_default_dtype(torch.bfloat16)
+    torch.set_default_device("cuda")
+    torch.cuda.set_device(local_rank)
+
+    return (
+        dist.get_rank(),
+        dist.get_world_size(),
+        dist.new_group(list(range(num_local_ranks * num_nodes))),
+    )
+
+
+def calc_diff(x: torch.Tensor, y: torch.Tensor):
+    x, y = x.double() + 1, y.double() + 1
+    denominator = (x * x + y * y).sum()
+    sim = 2 * (x * y).sum() / denominator
+    return (1 - sim).item()
+
+
+def per_token_cast_to_fp8(x: torch.Tensor):
+    assert x.dim() == 2 and x.size(1) % 128 == 0
+    m, n = x.shape
+    x_view = x.view(m, -1, 128)
+    x_amax = x_view.abs().float().amax(dim=2).view(m, -1).clamp(1e-4)
+    return (x_view * (448.0 / x_amax.unsqueeze(2))).to(torch.float8_e4m3fn).view(
+        m, n
+    ), (x_amax / 448.0).view(m, -1)
+
+
+def per_token_cast_back(x_fp8: torch.Tensor, x_scales: torch.Tensor):
+    x_fp32 = x_fp8.to(torch.float32).view(x_fp8.size(0), -1, 128)
+    x_scales = x_scales.view(x_fp8.size(0), -1, 1)
+    return (x_fp32 * x_scales).view(x_fp8.shape).to(torch.bfloat16)
+
+
+def inplace_unique(x: torch.Tensor, num_slots: int):
+    assert x.dim() == 2
+    mask = x < 0
+    x_padded = x.masked_fill(mask, num_slots)
+    bin_count = torch.zeros((x.size(0), num_slots + 1), dtype=x.dtype, device=x.device)
+    bin_count.scatter_add_(1, x_padded, torch.ones_like(x_padded))
+    bin_count = bin_count[:, :num_slots]
+    sorted_bin_count, sorted_bin_idx = torch.sort(bin_count, dim=-1, descending=True)
+    sorted_bin_idx.masked_fill_(sorted_bin_count == 0, -1)
+    sorted_bin_idx = torch.sort(sorted_bin_idx, descending=True, dim=-1).values
+    x[:, :].fill_(-1)
+    valid_len = min(num_slots, x.size(1))
+    x[:, :valid_len] = sorted_bin_idx[:, :valid_len]
+
+
+def create_grouped_scores(
+    scores: torch.Tensor, group_idx: torch.Tensor, num_groups: int
+):
+    num_tokens, num_experts = scores.shape
+    scores = scores.view(num_tokens, num_groups, -1)
+    mask = torch.zeros((num_tokens, num_groups), dtype=torch.bool, device=scores.device)
+    mask = mask.scatter_(1, group_idx, True).unsqueeze(-1).expand_as(scores)
+    return (scores * mask).view(num_tokens, num_experts)
+
+
+def bench(fn, num_warmups: int = 20, num_tests: int = 30, post_fn=None):
+    # Flush L2 cache with 256 MB data
+    torch.cuda.synchronize()
+    cache = torch.empty(int(256e6 // 4), dtype=torch.int, device="cuda")
+
+    # Warmup
+    for _ in range(num_warmups):
+        fn()
+
+    # Flush L2
+    cache.zero_()
+
+    # Testing
+    start_events = [torch.cuda.Event(enable_timing=True) for _ in range(num_tests)]
+    end_events = [torch.cuda.Event(enable_timing=True) for _ in range(num_tests)]
+    for i in range(num_tests):
+        # Record
+        start_events[i].record()
+        fn()
+        end_events[i].record()
+        if post_fn is not None:
+            post_fn()
+    torch.cuda.synchronize()
+
+    times = np.array(
+        [s.elapsed_time(e) / 1e3 for s, e in zip(start_events, end_events)]
+    )[1:]
+    return np.average(times), np.min(times), np.max(times)
+
+
+class empty_suppress:
+    def __enter__(self):
+        return self
+
+    def __exit__(self, *_):
+        pass
+
+
+class suppress_stdout_stderr:
+    def __enter__(self):
+        self.outnull_file = open(os.devnull, "w")
+        self.errnull_file = open(os.devnull, "w")
+
+        self.old_stdout_fileno_undup = sys.stdout.fileno()
+        self.old_stderr_fileno_undup = sys.stderr.fileno()
+
+        self.old_stdout_fileno = os.dup(sys.stdout.fileno())
+        self.old_stderr_fileno = os.dup(sys.stderr.fileno())
+
+        self.old_stdout = sys.stdout
+        self.old_stderr = sys.stderr
+
+        os.dup2(self.outnull_file.fileno(), self.old_stdout_fileno_undup)
+        os.dup2(self.errnull_file.fileno(), self.old_stderr_fileno_undup)
+
+        sys.stdout = self.outnull_file
+        sys.stderr = self.errnull_file
+        return self
+
+    def __exit__(self, *_):
+        sys.stdout = self.old_stdout
+        sys.stderr = self.old_stderr
+
+        os.dup2(self.old_stdout_fileno, self.old_stdout_fileno_undup)
+        os.dup2(self.old_stderr_fileno, self.old_stderr_fileno_undup)
+
+        os.close(self.old_stdout_fileno)
+        os.close(self.old_stderr_fileno)
+
+        self.outnull_file.close()
+        self.errnull_file.close()
+
+
+def bench_kineto(
+    fn,
+    kernel_names,
+    num_tests: int = 30,
+    suppress_kineto_output: bool = False,
+    trace_path: Optional[str] = None,
+    barrier_comm_profiling: bool = False,
+):
+    # Profile
+    suppress = suppress_stdout_stderr if suppress_kineto_output else empty_suppress
+    with suppress():
+        schedule = torch.profiler.schedule(wait=0, warmup=1, active=1, repeat=1)
+        with torch.profiler.profile(
+            activities=[torch.profiler.ProfilerActivity.CUDA], schedule=schedule
+        ) as prof:
+            for i in range(2):
+                # NOTES: use a large kernel and a barrier to eliminate the unbalanced CPU launch overhead
+                if barrier_comm_profiling:
+                    lhs = torch.randn((8192, 8192), dtype=torch.float, device="cuda")
+                    rhs = torch.randn((8192, 8192), dtype=torch.float, device="cuda")
+                    lhs @ rhs
+                    dist.all_reduce(torch.ones(1, dtype=torch.float, device="cuda"))
+                for _ in range(num_tests):
+                    fn()
+                prof.step()
+
+    # Parse the profiling table
+    assert isinstance(kernel_names, str) or isinstance(kernel_names, tuple)
+    is_tupled = isinstance(kernel_names, tuple)
+    prof_lines = (
+        prof.key_averages()
+        .table(sort_by="cuda_time_total", max_name_column_width=100)
+        .split("\n")
+    )
+    kernel_names = (kernel_names,) if isinstance(kernel_names, str) else kernel_names
+    assert all([isinstance(name, str) for name in kernel_names])
+    for name in kernel_names:
+        assert (
+            sum([name in line for line in prof_lines]) == 1
+        ), f"Errors of the kernel {name} in the profiling table"
+
+    # Save chrome traces
+    if trace_path is not None:
+        prof.export_chrome_trace(trace_path)
+
+    # Return average kernel times
+    units = {"ms": 1e3, "us": 1e6}
+    kernel_times = []
+    for name in kernel_names:
+        for line in prof_lines:
+            if name in line:
+                time_str = line.split()[-2]
+                for unit, scale in units.items():
+                    if unit in time_str:
+                        kernel_times.append(float(time_str.replace(unit, "")) / scale)
+                        break
+                break
+    return tuple(kernel_times) if is_tupled else kernel_times[0]
+
+
+def hash_tensor(t: torch.Tensor):
+    return t.view(torch.int64).sum().item()
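
Note: these helpers are vendored verbatim from DeepEP's test suite for the new DeepEP integration tests. A hedged sketch of how `bench` and `calc_diff` are typically used together (requires a CUDA device; the matmul workload is illustrative):

# Hedged sketch: timing a kernel with `bench` and checking numerical drift
# with `calc_diff` (a cosine-style relative error; ~0 means a close match).
import torch
from sglang.test.test_deepep_utils import bench, calc_diff

a = torch.randn(4096, 4096, dtype=torch.bfloat16, device="cuda")
b = torch.randn(4096, 4096, dtype=torch.bfloat16, device="cuda")

avg_s, min_s, max_s = bench(lambda: a @ b)  # times are in seconds
print(f"matmul: avg {avg_s * 1e3:.3f} ms")

ref = (a.float() @ b.float()).bfloat16()
print("diff vs fp32 reference:", calc_diff(a @ b, ref))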
sglang/test/test_utils.py CHANGED
@@ -66,6 +66,7 @@ DEFAULT_MODEL_NAME_FOR_TEST_LOCAL_ATTENTION = (
 )
 DEFAULT_SMALL_EMBEDDING_MODEL_NAME_FOR_TEST = "Alibaba-NLP/gte-Qwen2-1.5B-instruct"
 DEFAULT_REASONING_MODEL_NAME_FOR_TEST = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
+DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST = "deepseek-ai/DeepSeek-V3-0324"
 DEFAULT_AWQ_MOE_MODEL_NAME_FOR_TEST = (
     "hugging-quants/Mixtral-8x7B-Instruct-v0.1-AWQ-INT4"
 )
@@ -78,7 +79,8 @@ DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1 = "neuralmagic/Meta-Llama-3.1-8B-Ins
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2 = "neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8,neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8,neuralmagic/Qwen2-72B-Instruct-FP8,neuralmagic/Qwen2-57B-A14B-Instruct-FP8,neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8"
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_QUANT_TP1 = "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4,hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4,hugging-quants/Mixtral-8x7B-Instruct-v0.1-AWQ-INT4"
 DEFAULT_SMALL_MODEL_NAME_FOR_TEST_QWEN = "Qwen/Qwen2.5-1.5B-Instruct"
-DEFAULT_SMALL_VLM_MODEL_NAME = "Qwen/Qwen2-VL-2B"
+DEFAULT_SMALL_VLM_MODEL_NAME_FOR_TEST = "Qwen/Qwen2.5-VL-3B-Instruct"
+DEFAULT_VLM_CHAT_TEMPLATE_FOR_TEST = "qwen2-vl"
 
 DEFAULT_IMAGE_URL = "https://github.com/sgl-project/sglang/blob/main/test/lang/example_image.png?raw=true"
 DEFAULT_VIDEO_URL = "https://raw.githubusercontent.com/EvolvingLMMs-Lab/sglang/dev/onevision_local/assets/jobs.mp4"
sglang/version.py CHANGED
@@ -1 +1 @@
-__version__ = "0.4.6.post2"
+__version__ = "0.4.6.post3"
{sglang-0.4.6.post2.dist-info → sglang-0.4.6.post3.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sglang
-Version: 0.4.6.post2
+Version: 0.4.6.post3
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                         Version 2.0, January 2004
@@ -230,6 +230,7 @@ Requires-Dist: modelscope; extra == "runtime-common"
 Requires-Dist: ninja; extra == "runtime-common"
 Requires-Dist: orjson; extra == "runtime-common"
 Requires-Dist: packaging; extra == "runtime-common"
+Requires-Dist: partial_json_parser; extra == "runtime-common"
 Requires-Dist: pillow; extra == "runtime-common"
 Requires-Dist: prometheus-client>=0.20.0; extra == "runtime-common"
 Requires-Dist: psutil; extra == "runtime-common"
@@ -242,7 +243,7 @@ Requires-Dist: torchao>=0.9.0; extra == "runtime-common"
 Requires-Dist: transformers==4.51.1; extra == "runtime-common"
 Requires-Dist: uvicorn; extra == "runtime-common"
 Requires-Dist: uvloop; extra == "runtime-common"
-Requires-Dist: xgrammar==0.1.17; extra == "runtime-common"
+Requires-Dist: xgrammar==0.1.19; extra == "runtime-common"
 Requires-Dist: blobfile==3.0.0; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
@@ -252,7 +253,6 @@ Requires-Dist: torch==2.6.0; extra == "srt"
 Requires-Dist: torchvision==0.21.0; extra == "srt"
 Requires-Dist: cuda-python; extra == "srt"
 Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "srt"
-Requires-Dist: partial_json_parser; extra == "srt"
 Requires-Dist: einops; extra == "srt"
 Provides-Extra: blackwell
 Requires-Dist: sglang[runtime_common]; extra == "blackwell"
@@ -261,7 +261,6 @@ Requires-Dist: torch; extra == "blackwell"
 Requires-Dist: torchvision; extra == "blackwell"
 Requires-Dist: cuda-python; extra == "blackwell"
 Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "blackwell"
-Requires-Dist: partial_json_parser; extra == "blackwell"
 Requires-Dist: einops; extra == "blackwell"
 Provides-Extra: srt-hip
 Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
@@ -278,6 +277,9 @@ Provides-Extra: srt-cpu
 Requires-Dist: sglang[runtime_common]; extra == "srt-cpu"
 Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "srt-cpu"
 Requires-Dist: torch; extra == "srt-cpu"
+Provides-Extra: srt-npu
+Requires-Dist: sglang[runtime_common]; extra == "srt-npu"
+Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "srt-npu"
 Provides-Extra: openai
 Requires-Dist: openai>=1.0; extra == "openai"
 Requires-Dist: tiktoken; extra == "openai"
@@ -319,6 +321,11 @@ Requires-Dist: sglang[srt_cpu]; extra == "all-cpu"
 Requires-Dist: sglang[openai]; extra == "all-cpu"
 Requires-Dist: sglang[anthropic]; extra == "all-cpu"
 Requires-Dist: sglang[litellm]; extra == "all-cpu"
+Provides-Extra: all-npu
+Requires-Dist: sglang[srt_npu]; extra == "all-npu"
+Requires-Dist: sglang[openai]; extra == "all-npu"
+Requires-Dist: sglang[anthropic]; extra == "all-npu"
+Requires-Dist: sglang[litellm]; extra == "all-npu"
 Provides-Extra: dev
 Requires-Dist: sglang[all]; extra == "dev"
 Requires-Dist: sglang[test]; extra == "dev"
@@ -358,6 +365,7 @@ Dynamic: license-file
 | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |
 
 ## News
+- [2025/05] 🔥 Deploying DeepSeek with PD Disaggregation and Large-scale Expert Parallelism on 96 H100 GPUs ([blog](https://lmsys.org/blog/2025-05-05-large-scale-ep/)).
 - [2025/03] Supercharge DeepSeek-R1 Inference on AMD Instinct MI300X ([AMD blog](https://rocm.blogs.amd.com/artificial-intelligence/DeepSeekR1-Part2/README.html))
 - [2025/03] SGLang Joins PyTorch Ecosystem: Efficient LLM Serving Engine ([PyTorch blog](https://pytorch.org/blog/sglang-joins-pytorch/))
 - [2025/02] Unlock DeepSeek-R1 Inference Performance on AMD Instinct™ MI300X GPU ([AMD blog](https://rocm.blogs.amd.com/artificial-intelligence/DeepSeekR1_Perf/README.html))
@@ -383,7 +391,7 @@ The core features include:
 
 - **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, zero-overhead CPU scheduler, continuous batching, token attention (paged attention), speculative decoding, tensor parallelism, chunked prefill, structured outputs, quantization (FP8/INT4/AWQ/GPTQ), and multi-lora batching.
 - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
-- **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, QWen, DeepSeek, LLaVA, etc.), embedding models (e5-mistral, gte, mcdse) and reward models (Skywork), with easy extensibility for integrating new models.
+- **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, Qwen, DeepSeek, LLaVA, etc.), embedding models (e5-mistral, gte, mcdse) and reward models (Skywork), with easy extensibility for integrating new models.
 - **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
 
 ## Getting Started
@@ -401,7 +409,7 @@ Learn more in the release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s
 
 ## Adoption and Sponsorship
 The project has been deployed to large-scale production, generating trillions of tokens every day.
-It is supported by the following institutions: AMD, Atlas Cloud, Baseten, Cursor, DataCrunch, Etched, Hyperbolic, Iflytek, Jam & Tea Studios, LinkedIn, LMSYS, Meituan, Nebius, Novita AI, NVIDIA, Oracle, RunPod, Stanford, UC Berkeley, UCLA, xAI, and 01.AI.
+It is supported by the following institutions: AMD, Atlas Cloud, Baseten, Cursor, DataCrunch, Etched, Google Cloud, Hyperbolic, Iflytek, Jam & Tea Studios, LinkedIn, LMSYS, Meituan, Nebius, Novita AI, NVIDIA, Oracle, RunPod, Stanford, UC Berkeley, UCLA, xAI, and 01.AI.
 
 <img src="https://raw.githubusercontent.com/sgl-project/sgl-learning-materials/main/slides/adoption.png" alt="logo" width="800" margin="10px"></img>