sglang 0.4.6.post1__py3-none-any.whl → 0.4.6.post3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (119)
  1. sglang/bench_one_batch.py +3 -11
  2. sglang/bench_serving.py +149 -1
  3. sglang/check_env.py +3 -3
  4. sglang/lang/chat_template.py +44 -0
  5. sglang/srt/configs/__init__.py +4 -0
  6. sglang/srt/configs/deepseekvl2.py +3 -0
  7. sglang/srt/configs/device_config.py +1 -1
  8. sglang/srt/configs/internvl.py +696 -0
  9. sglang/srt/configs/janus_pro.py +3 -0
  10. sglang/srt/configs/kimi_vl.py +38 -0
  11. sglang/srt/configs/kimi_vl_moonvit.py +32 -0
  12. sglang/srt/configs/model_config.py +32 -0
  13. sglang/srt/constrained/xgrammar_backend.py +11 -19
  14. sglang/srt/conversation.py +151 -3
  15. sglang/srt/disaggregation/decode.py +4 -1
  16. sglang/srt/disaggregation/mini_lb.py +74 -23
  17. sglang/srt/disaggregation/mooncake/conn.py +9 -18
  18. sglang/srt/disaggregation/nixl/conn.py +241 -71
  19. sglang/srt/disaggregation/utils.py +44 -1
  20. sglang/srt/distributed/device_communicators/custom_all_reduce.py +1 -8
  21. sglang/srt/distributed/device_communicators/npu_communicator.py +39 -0
  22. sglang/srt/distributed/device_communicators/pynccl.py +2 -1
  23. sglang/srt/distributed/device_communicators/shm_broadcast.py +2 -1
  24. sglang/srt/distributed/parallel_state.py +22 -1
  25. sglang/srt/entrypoints/engine.py +58 -24
  26. sglang/srt/entrypoints/http_server.py +28 -1
  27. sglang/srt/entrypoints/verl_engine.py +3 -2
  28. sglang/srt/function_call_parser.py +97 -0
  29. sglang/srt/hf_transformers_utils.py +22 -1
  30. sglang/srt/layers/attention/cutlass_mla_backend.py +1 -1
  31. sglang/srt/layers/attention/flashattention_backend.py +146 -50
  32. sglang/srt/layers/attention/flashinfer_backend.py +129 -94
  33. sglang/srt/layers/attention/flashinfer_mla_backend.py +88 -30
  34. sglang/srt/layers/attention/flashmla_backend.py +3 -0
  35. sglang/srt/layers/attention/merge_state.py +46 -0
  36. sglang/srt/layers/attention/triton_ops/merge_state.py +96 -0
  37. sglang/srt/layers/attention/vision.py +290 -163
  38. sglang/srt/layers/dp_attention.py +5 -2
  39. sglang/srt/layers/moe/ep_moe/kernels.py +342 -7
  40. sglang/srt/layers/moe/ep_moe/layer.py +120 -1
  41. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +98 -57
  42. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  43. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  44. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  45. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  46. sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  47. sglang/srt/layers/moe/fused_moe_triton/configs/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  48. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +10 -5
  49. sglang/srt/layers/quantization/__init__.py +2 -2
  50. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +2 -4
  51. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +2 -1
  52. sglang/srt/layers/quantization/deep_gemm.py +6 -1
  53. sglang/srt/layers/quantization/fp8.py +108 -95
  54. sglang/srt/layers/quantization/fp8_kernel.py +79 -60
  55. sglang/srt/layers/quantization/fp8_utils.py +71 -23
  56. sglang/srt/layers/quantization/kv_cache.py +3 -10
  57. sglang/srt/layers/quantization/utils.py +0 -5
  58. sglang/srt/layers/quantization/w8a8_fp8.py +8 -10
  59. sglang/srt/layers/utils.py +35 -0
  60. sglang/srt/lora/layers.py +35 -9
  61. sglang/srt/lora/lora_manager.py +81 -35
  62. sglang/srt/managers/cache_controller.py +115 -119
  63. sglang/srt/managers/data_parallel_controller.py +52 -34
  64. sglang/srt/managers/io_struct.py +10 -0
  65. sglang/srt/managers/multimodal_processors/base_processor.py +5 -0
  66. sglang/srt/managers/multimodal_processors/internvl.py +232 -0
  67. sglang/srt/managers/multimodal_processors/kimi_vl.py +73 -0
  68. sglang/srt/managers/schedule_batch.py +44 -16
  69. sglang/srt/managers/schedule_policy.py +11 -5
  70. sglang/srt/managers/scheduler.py +291 -72
  71. sglang/srt/managers/scheduler_output_processor_mixin.py +1 -1
  72. sglang/srt/managers/tokenizer_manager.py +24 -13
  73. sglang/srt/managers/tp_worker.py +60 -28
  74. sglang/srt/managers/tp_worker_overlap_thread.py +9 -3
  75. sglang/srt/mem_cache/chunk_cache.py +2 -0
  76. sglang/srt/mem_cache/memory_pool.py +70 -36
  77. sglang/srt/model_executor/cuda_graph_runner.py +82 -19
  78. sglang/srt/model_executor/forward_batch_info.py +31 -1
  79. sglang/srt/model_executor/model_runner.py +159 -90
  80. sglang/srt/model_loader/loader.py +18 -11
  81. sglang/srt/models/clip.py +4 -4
  82. sglang/srt/models/deepseek_janus_pro.py +1 -1
  83. sglang/srt/models/deepseek_nextn.py +2 -277
  84. sglang/srt/models/deepseek_v2.py +132 -37
  85. sglang/srt/models/gemma3_mm.py +1 -1
  86. sglang/srt/models/internlm2.py +3 -0
  87. sglang/srt/models/internvl.py +670 -0
  88. sglang/srt/models/kimi_vl.py +308 -0
  89. sglang/srt/models/kimi_vl_moonvit.py +639 -0
  90. sglang/srt/models/llama.py +93 -31
  91. sglang/srt/models/llama4.py +54 -7
  92. sglang/srt/models/llama_eagle.py +4 -1
  93. sglang/srt/models/llama_eagle3.py +4 -1
  94. sglang/srt/models/minicpmv.py +1 -1
  95. sglang/srt/models/mllama.py +1 -1
  96. sglang/srt/models/phi3_small.py +16 -2
  97. sglang/srt/models/qwen2_5_vl.py +8 -4
  98. sglang/srt/models/qwen2_moe.py +8 -3
  99. sglang/srt/models/qwen2_vl.py +4 -16
  100. sglang/srt/models/qwen3_moe.py +8 -3
  101. sglang/srt/models/xiaomi_mimo.py +171 -0
  102. sglang/srt/openai_api/adapter.py +58 -62
  103. sglang/srt/openai_api/protocol.py +38 -16
  104. sglang/srt/reasoning_parser.py +2 -2
  105. sglang/srt/sampling/sampling_batch_info.py +54 -2
  106. sglang/srt/sampling/sampling_params.py +2 -0
  107. sglang/srt/server_args.py +93 -24
  108. sglang/srt/speculative/eagle_worker.py +3 -2
  109. sglang/srt/utils.py +123 -10
  110. sglang/test/runners.py +4 -0
  111. sglang/test/test_block_fp8.py +2 -2
  112. sglang/test/test_deepep_utils.py +219 -0
  113. sglang/test/test_utils.py +32 -1
  114. sglang/version.py +1 -1
  115. {sglang-0.4.6.post1.dist-info → sglang-0.4.6.post3.dist-info}/METADATA +18 -9
  116. {sglang-0.4.6.post1.dist-info → sglang-0.4.6.post3.dist-info}/RECORD +119 -99
  117. {sglang-0.4.6.post1.dist-info → sglang-0.4.6.post3.dist-info}/WHEEL +1 -1
  118. {sglang-0.4.6.post1.dist-info → sglang-0.4.6.post3.dist-info}/licenses/LICENSE +0 -0
  119. {sglang-0.4.6.post1.dist-info → sglang-0.4.6.post3.dist-info}/top_level.txt +0 -0
sglang/srt/server_args.py CHANGED
@@ -22,7 +22,7 @@ import random
 import tempfile
 from typing import List, Literal, Optional
 
-from sglang.srt.hf_transformers_utils import check_gguf_file
+from sglang.srt.hf_transformers_utils import check_gguf_file, get_config
 from sglang.srt.reasoning_parser import ReasoningParser
 from sglang.srt.utils import (
     configure_ipv6,
@@ -78,6 +78,8 @@ class ServerArgs:
 
     # Other runtime options
     tp_size: int = 1
+    pp_size: int = 1
+    max_micro_batch_size: Optional[int] = None
     stream_interval: int = 1
     stream_output: bool = False
     random_seed: Optional[int] = None
@@ -185,6 +187,7 @@ class ServerArgs:
     n_share_experts_fusion: int = 0
     disable_chunked_prefix_cache: bool = False
     disable_fast_image_processor: bool = False
+    mm_attention_backend: Optional[str] = None
 
     # Debug tensor dumps
     debug_tensor_dump_output_folder: Optional[str] = None
@@ -196,6 +199,7 @@ class ServerArgs:
     disaggregation_bootstrap_port: int = 8998
     disaggregation_transfer_backend: str = "mooncake"
     disaggregation_ib_device: Optional[str] = None
+    pdlb_url: Optional[str] = None
 
     def __post_init__(self):
         # Expert parallelism
@@ -222,25 +226,34 @@ class ServerArgs:
 
         # Set mem fraction static, which depends on the tensor parallelism size
         if self.mem_fraction_static is None:
+            parallel_size = self.tp_size * self.pp_size
             if gpu_mem <= 81920:
-                if self.tp_size >= 16:
+                if parallel_size >= 16:
                     self.mem_fraction_static = 0.79
-                elif self.tp_size >= 8:
+                elif parallel_size >= 8:
                     self.mem_fraction_static = 0.81
-                elif self.tp_size >= 4:
+                elif parallel_size >= 4:
                     self.mem_fraction_static = 0.85
-                elif self.tp_size >= 2:
+                elif parallel_size >= 2:
                     self.mem_fraction_static = 0.87
                 else:
                     self.mem_fraction_static = 0.88
             else:
-                # FIXME: more fine grained auto-selection polices
-                self.mem_fraction_static = (gpu_mem - 1024 * 13) / gpu_mem
+                self.mem_fraction_static = 0.88
+                if gpu_mem > 96 * 1024:
+                    mem_fraction = self.mem_fraction_static
+                    self.mem_fraction_static = min(
+                        mem_fraction + 48 * 1024 * (1 - mem_fraction) / gpu_mem,
+                        (gpu_mem - 1024 * 18)
+                        / gpu_mem,  # 15 GB + additional 3GB for cuda graph
+                    )
 
         # Set chunked prefill size, which depends on the gpu memory capacity
         if self.chunked_prefill_size is None:
            if gpu_mem is not None and gpu_mem < 25_000:
                self.chunked_prefill_size = 2048
+            elif self.disaggregation_mode != "null":
+                self.chunked_prefill_size = 16384
            else:
                self.chunked_prefill_size = 8192
        assert self.chunked_prefill_size % self.page_size == 0
@@ -311,6 +324,9 @@ class ServerArgs:
                 assert (
                     not self.enable_dp_attention
                 ), "DeepEP MoE `auto` mode is not supported with DP Attention."
+            if self.deepep_mode == "normal":
+                logger.warning("Cuda graph is disabled because deepep_mode=`normal`")
+                self.disable_cuda_graph = True
             self.ep_size = self.tp_size
             self.enable_sp_layernorm = (
                 self.dp_size < self.tp_size if self.enable_dp_attention else True
@@ -333,6 +349,17 @@ class ServerArgs:
                 "eagle speculative decoding."
             )
 
+            model_arch = get_model_arch(self)
+
+            # Auto set draft_model_path DeepSeek-V3/R1
+            if model_arch == "DeepseekV3ForCausalLM":
+                if self.speculative_draft_model_path is None:
+                    self.speculative_draft_model_path = self.model_path
+                else:
+                    logger.warning(
+                        "DeepSeek MTP does not require setting speculative_draft_model_path."
+                    )
+
             # Auto choose parameters
             if self.speculative_num_steps is None:
                 assert (
@@ -343,7 +370,7 @@ class ServerArgs:
                     self.speculative_num_steps,
                     self.speculative_eagle_topk,
                     self.speculative_num_draft_tokens,
-                ) = auto_choose_speculative_params(self)
+                ) = auto_choose_speculative_params(model_arch)
 
             if self.page_size > 1 and self.speculative_eagle_topk > 1:
                 self.speculative_eagle_topk = 1
@@ -532,7 +559,7 @@ class ServerArgs:
             "--device",
             type=str,
             default=ServerArgs.device,
-            help="The device to use ('cuda', 'xpu', 'hpu', 'cpu'). Defaults to auto-detection if not specified.",
+            help="The device to use ('cuda', 'xpu', 'hpu', 'npu', 'cpu'). Defaults to auto-detection if not specified.",
         )
         parser.add_argument(
             "--served-model-name",
@@ -632,6 +659,19 @@ class ServerArgs:
             default=ServerArgs.tp_size,
             help="The tensor parallelism size.",
         )
+        parser.add_argument(
+            "--pipeline-parallel-size",
+            "--pp-size",
+            type=int,
+            default=ServerArgs.pp_size,
+            help="The pipeline parallelism size.",
+        )
+        parser.add_argument(
+            "--max-micro-batch-size",
+            type=int,
+            default=ServerArgs.max_micro_batch_size,
+            help="The maximum micro batch size in pipeline parallelism.",
+        )
         parser.add_argument(
             "--stream-interval",
             type=int,
@@ -1096,9 +1136,9 @@ class ServerArgs:
         parser.add_argument(
             "--tool-call-parser",
             type=str,
-            choices=["qwen25", "mistral", "llama3", "deepseekv3"],
+            choices=["qwen25", "mistral", "llama3", "deepseekv3", "pythonic"],
             default=ServerArgs.tool_call_parser,
-            help="Specify the parser for handling tool-call interactions. Options include: 'qwen25', 'mistral', and 'llama3'.",
+            help="Specify the parser for handling tool-call interactions. Options include: 'qwen25', 'mistral', 'llama3', 'deepseekv3', and 'pythonic'.",
         )
         parser.add_argument(
             "--enable-hierarchical-cache",
@@ -1215,12 +1255,29 @@ class ServerArgs:
             "--disaggregation-ib-device",
             type=str,
             default=ServerArgs.disaggregation_ib_device,
-            help="The ib device for disaggregation transfer. Default is None, it will be detected automatically if using the mooncake backend.",
+            help="The InfiniBand devices for disaggregation transfer, accepts single device (e.g., --disaggregation-ib-device mlx5_0) "
+            "or multiple comma-separated devices (e.g., --disaggregation-ib-device mlx5_0,mlx5_1). "
+            "Default is None, which triggers automatic device detection when mooncake backend is enabled.",
+        )
+        parser.add_argument(
+            "--pdlb-url",
+            type=str,
+            default=None,
+            help="The URL of the PD disaggregation load balancer. If set, the prefill/decode server will register with the load balancer.",
+        )
+
+        parser.add_argument(
+            "--mm-attention-backend",
+            type=str,
+            choices=["sdpa", "fa3", "triton_attn"],
+            default=ServerArgs.mm_attention_backend,
+            help="Set multimodal attention backend.",
         )
 
     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
         args.tp_size = args.tensor_parallel_size
+        args.pp_size = args.pipeline_parallel_size
         args.dp_size = args.data_parallel_size
         args.ep_size = args.expert_parallel_size
         attrs = [attr.name for attr in dataclasses.fields(cls)]
@@ -1234,15 +1291,25 @@ class ServerArgs:
 
     def check_server_args(self):
         assert (
-            self.tp_size % self.nnodes == 0
-        ), "tp_size must be divisible by number of nodes"
+            self.tp_size * self.pp_size
+        ) % self.nnodes == 0, "tp_size must be divisible by number of nodes"
+
+        # FIXME pp constraints
+        if self.pp_size > 1:
+            logger.warning(f"Turn off overlap scheule for pipeline parallelism.")
+            self.disable_overlap_schedule = True
+            assert (
+                self.disable_overlap_schedule
+                and self.speculative_algorithm is None
+                and not self.enable_mixed_chunk
+            ), "Pipeline parallelism is not compatible with overlap schedule, speculative decoding, mixed chunked prefill."
+
         assert not (
             self.dp_size > 1 and self.nnodes != 1 and not self.enable_dp_attention
         ), "multi-node data parallel is not supported unless dp attention!"
         assert (
             self.max_loras_per_batch > 0
             # FIXME
-            and (self.lora_paths is None or self.disable_cuda_graph)
             and (self.lora_paths is None or self.disable_radix_cache)
         ), "compatibility of lora and cuda graph and radix attention is in progress"
         assert self.base_gpu_id >= 0, "base_gpu_id must be non-negative"
@@ -1368,20 +1435,22 @@ class DeprecatedAction(argparse.Action):
         raise ValueError(self.help)
 
 
-def auto_choose_speculative_params(self: ServerArgs):
+def get_model_arch(args: ServerArgs):
+    hf_config = get_config(
+        args.model_path,
+        trust_remote_code=args.trust_remote_code,
+        revision=args.revision,
+        model_override_args=json.loads(args.json_model_override_args),
+    )
+    return hf_config.architectures[0]
+
+
+def auto_choose_speculative_params(arch: str):
     """
     Automatically choose the parameters for speculative decoding.
 
     You can tune them on your own models and prompts with scripts/playground/bench_speculative.py
     """
-    config_path = os.path.join(self.model_path, "config.json")
-    if not os.path.exists(config_path):
-        raise ValueError(f"{config_path} is not found.")
-
-    config = json.load(open(config_path))
-
-    arch = config.get("architectures", ["Unknown"])[0]
-
     if arch in ["LlamaForCausalLM"]:
         # The default value for llama
         return (5, 4, 8)
sglang/srt/speculative/eagle_worker.py CHANGED
@@ -106,11 +106,12 @@ class EAGLEWorker(TpModelWorker):
         # Init draft worker
         with empty_context():
             super().__init__(
+                server_args=server_args,
                 gpu_id=gpu_id,
                 tp_rank=tp_rank,
-                server_args=server_args,
-                nccl_port=nccl_port,
+                pp_rank=0,  # FIXME
                 dp_rank=dp_rank,
+                nccl_port=nccl_port,
                 is_draft_worker=True,
                 req_to_token_pool=self.req_to_token_pool,
                 token_to_kv_pool_allocator=self.token_to_kv_pool_allocator,
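
The ServerArgs changes above add pipeline parallelism (pp_size, max_micro_batch_size), a multimodal attention backend selector (mm_attention_backend), and a PD load-balancer registration URL (pdlb_url), and the memory-fraction heuristic now scales with tp_size * pp_size. The following is a minimal sketch (not taken from the package) of how these new fields might be exercised programmatically; the model path is a placeholder and a GPU environment is assumed for __post_init__:

from sglang.srt.server_args import ServerArgs

# Hypothetical configuration using only fields added in this diff.
# With pp_size > 1, check_server_args() warns, turns off the overlap
# schedule, and rejects speculative decoding / mixed chunked prefill.
args = ServerArgs(
    model_path="meta-llama/Llama-3.1-8B-Instruct",  # placeholder model
    tp_size=2,
    pp_size=2,                    # new: pipeline parallelism size
    max_micro_batch_size=4,       # new: micro-batch cap for pipeline parallelism
    mm_attention_backend="fa3",   # new: one of "sdpa", "fa3", "triton_attn"
    pdlb_url=None,                # new: PD disaggregation load balancer URL
)
args.check_server_args()
print(args.mem_fraction_static)   # auto-selected from tp_size * pp_size and GPU memory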
sglang/srt/utils.py CHANGED
@@ -12,6 +12,7 @@
 # limitations under the License.
 # ==============================================================================
 """Common utilities."""
+
 import base64
 import builtins
 import ctypes
@@ -144,6 +145,10 @@ def is_xpu() -> bool:
     return hasattr(torch, "xpu") and torch.xpu.is_available()
 
 
+def is_npu() -> bool:
+    return hasattr(torch, "npu") and torch.npu.is_available()
+
+
 def is_flashinfer_available():
     """
     Check whether flashinfer is available.
@@ -327,6 +332,16 @@ def get_available_gpu_memory(device, gpu_id, distributed=False, empty_cache=True
     elif device == "cpu":
         # TODO: rename the variables in the current function to be not GPU specific
         free_gpu_memory = psutil.virtual_memory().available
+    elif device == "npu":
+        num_gpus = torch.npu.device_count()
+        assert gpu_id < num_gpus
+
+        if torch.npu.current_device() != gpu_id:
+            print(
+                f"WARNING: current device is not {gpu_id}, but {torch.npu.current_device()}, ",
+                "which may cause useless memory allocation for torch NPU context.",
+            )
+        free_gpu_memory, total_gpu_memory = torch.npu.mem_get_info()
 
     if distributed:
         tensor = torch.tensor(free_gpu_memory, dtype=torch.float32).to(
@@ -414,16 +429,40 @@ class LayerFn(Protocol):
 def make_layers(
     num_hidden_layers: int,
     layer_fn: LayerFn,
+    pp_rank: Optional[int] = None,
+    pp_size: Optional[int] = None,
     prefix: str = "",
+    return_tuple: bool = False,
 ) -> Tuple[int, int, torch.nn.ModuleList]:
     """Make a list of layers with the given layer function"""
+    # circula imports
+    from sglang.srt.distributed import get_pp_indices
+    from sglang.srt.layers.utils import PPMissingLayer
+
+    assert not pp_size or num_hidden_layers >= pp_size
+    start_layer, end_layer = (
+        get_pp_indices(
+            num_hidden_layers,
+            pp_rank,
+            pp_size,
+        )
+        if pp_rank is not None and pp_size is not None
+        else (0, num_hidden_layers)
+    )
     modules = torch.nn.ModuleList(
-        [
+        [PPMissingLayer(return_tuple=return_tuple) for _ in range(start_layer)]
+        + [
             maybe_offload_to_cpu(layer_fn(idx=idx, prefix=add_prefix(idx, prefix)))
-            for idx in range(num_hidden_layers)
+            for idx in range(start_layer, end_layer)
+        ]
+        + [
+            PPMissingLayer(return_tuple=return_tuple)
+            for _ in range(end_layer, num_hidden_layers)
         ]
     )
-    return modules
+    if pp_rank is None or pp_size is None:
+        return modules
+    return modules, start_layer, end_layer
 
 
 def set_random_seed(seed: int) -> None:
@@ -872,12 +911,15 @@ def broadcast_pyobj(
     src: int = 0,
     force_cpu_device: bool = True,
 ):
-    """Broadcast inputs from rank=0 to all other ranks with torch.dist backend."""
+    """Broadcast inputs from src rank to all other ranks with torch.dist backend.
+    The `rank` here refer to the source rank on global process group (regardless
+    of dist_group argument).
+    """
     device = torch.device(
         "cuda" if torch.cuda.is_available() and not force_cpu_device else "cpu"
     )
 
-    if rank == 0:
+    if rank == src:
         if len(data) == 0:
             tensor_size = torch.tensor([0], dtype=torch.long, device=device)
             dist.broadcast(tensor_size, src=src, group=dist_group)
@@ -909,6 +951,50 @@
     return data
 
 
+def point_to_point_pyobj(
+    data: List[Any],
+    rank: int,
+    group: Optional[torch.distributed.ProcessGroup] = None,
+    src: int = 0,
+    dst: int = 1,
+):
+    """Send data from src to dst in group."""
+
+    if rank == src:
+        if len(data) == 0:
+            tensor_size = torch.tensor([0], dtype=torch.long)
+            dist.send(tensor_size, dst=dst, group=group)
+        else:
+            serialized_data = pickle.dumps(data)
+            size = len(serialized_data)
+            tensor_data = torch.ByteTensor(
+                np.frombuffer(serialized_data, dtype=np.uint8)
+            )
+            tensor_size = torch.tensor([size], dtype=torch.long)
+
+            dist.send(tensor_size, dst=dst, group=group)
+            dist.send(tensor_data, dst=dst, group=group)
+        return data
+
+    elif rank == dst:
+        tensor_size = torch.tensor([0], dtype=torch.long)
+        dist.recv(tensor_size, src=src, group=group)
+        size = tensor_size.item()
+
+        if size == 0:
+            return []
+
+        tensor_data = torch.empty(size, dtype=torch.uint8)
+        dist.recv(tensor_data, src=src, group=group)
+
+        serialized_data = bytes(tensor_data.cpu().numpy())
+        data = pickle.loads(serialized_data)
+        return data
+
+    # Other ranks in pp_group do nothing
+    return []
+
+
 step_counter = 0
 
 
@@ -1276,6 +1362,9 @@ def get_device_name(device_id: int = 0) -> str:
     if hasattr(torch, "hpu") and torch.hpu.is_available():
         return torch.hpu.get_device_name(device_id)
 
+    if hasattr(torch, "npu") and torch.npu.is_available():
+        return torch.npu.get_device_name(device_id)
+
 
 @lru_cache(maxsize=1)
 def is_habana_available() -> bool:
@@ -1372,6 +1461,13 @@ def get_compiler_backend() -> str:
     if hasattr(torch, "hpu") and torch.hpu.is_available():
         return "hpu_backend"
 
+    if hasattr(torch, "npu") and torch.npu.is_available():
+        import torchair
+
+        config = torchair.CompilerConfig()
+        npu_backend = torchair.get_npu_backend(compiler_config=config)
+        return npu_backend
+
     return "inductor"
 
 
@@ -1732,6 +1828,13 @@ def configure_ipv6(dist_init_addr):
     return port, host
 
 
+def rank0_log(msg: str):
+    from sglang.srt.distributed import get_tensor_model_parallel_rank
+
+    if get_tensor_model_parallel_rank() == 0:
+        logger.info(msg)
+
+
 def rank0_print(msg: str):
     from sglang.srt.distributed import get_tensor_model_parallel_rank
 
@@ -1905,13 +2008,16 @@ def fast_topk(values, topk, dim):
     return torch.topk(values, topk, dim=dim)
 
 
-def is_hopper_with_cuda_12_3():
+def _check(cc_major):
     if not is_cuda():
         return False
-    is_hopper = torch.cuda.get_device_capability()[0] == 9
-    cuda_version = torch.version.cuda.split(".")
-    is_cuda_compatible = int(cuda_version[0]) == 12 and int(cuda_version[1]) >= 3
-    return is_hopper and is_cuda_compatible
+    return torch.cuda.get_device_capability()[0] == cc_major and tuple(
+        map(int, torch.version.cuda.split(".")[:2])
+    ) >= (12, 3)
+
+
+is_ampere_with_cuda_12_3 = lambda: _check(8)
+is_hopper_with_cuda_12_3 = lambda: _check(9)
 
 
 def get_free_port():
@@ -1990,3 +2096,10 @@ class BumpAllocator:
         output = self._buffer[self._pointer : self._pointer + size]
         self._pointer += size
         return output
+
+
+def log_info_on_rank0(logger, msg):
+    from sglang.srt.distributed import get_tensor_model_parallel_rank
+
+    if get_tensor_model_parallel_rank() == 0:
+        logger.info(msg)
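
The utils.py additions above wire in NPU support and two pipeline-parallelism helpers: make_layers can now build only the layer range owned by a pipeline stage (padding the rest with PPMissingLayer), and point_to_point_pyobj pickles arbitrary Python objects and ships them between adjacent ranks with torch.distributed send/recv. A minimal usage sketch under assumed setup (a gloo process group already initialized across two pipeline stages; the payload is illustrative):

import torch.distributed as dist

from sglang.srt.utils import point_to_point_pyobj

# Assumes dist.init_process_group("gloo", ...) has already been called and
# ranks 0 and 1 are adjacent pipeline stages; the default group is used.
rank = dist.get_rank()

if rank == 0:
    # Stage 0 serializes and sends its micro-batch metadata downstream.
    point_to_point_pyobj([{"microbatch_id": 0, "num_tokens": 128}], rank, src=0, dst=1)
elif rank == 1:
    # Stage 1 passes an empty payload and receives the unpickled objects.
    metadata = point_to_point_pyobj([], rank, src=0, dst=1)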
sglang/test/runners.py CHANGED
@@ -423,6 +423,10 @@ class HFRunner:
                 )
                 del input_logits
 
+            if lora_paths is not None and lora_paths[i] is not None:
+                # Unload the LoRA adapter if it is used
+                model.unload()
+
         return ModelOutput(
             output_strs=output_strs,
             top_input_logprobs=top_input_logprobs,
sglang/test/test_block_fp8.py CHANGED
@@ -7,9 +7,9 @@ import torch
 from sglang.srt.layers.activation import SiluAndMul
 from sglang.srt.layers.moe.fused_moe_triton.fused_moe import fused_moe
 from sglang.srt.layers.quantization.fp8_kernel import (
-    per_tensor_quant_mla_deep_gemm_masked_fp8,
     per_tensor_quant_mla_fp8,
     per_token_group_quant_fp8,
+    per_token_group_quant_mla_deep_gemm_masked_fp8,
     static_quant_fp8,
     w8a8_block_fp8_matmul,
 )
@@ -236,7 +236,7 @@ class TestPerTokenGroupQuantMlaDeepGemmMaskedFP8(CustomTestCase):
 
         with torch.inference_mode():
             ref_out, ref_scale = native_per_token_group_quant_fp8(x, group_size, 1e-12)
-            out, scale, _, _, _ = per_tensor_quant_mla_deep_gemm_masked_fp8(
+            out, scale, _, _, _ = per_token_group_quant_mla_deep_gemm_masked_fp8(
                 x, group_size
             )
             out = out[:, :num_tokens, :]
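
The final hunks track a rename in sglang/srt/layers/quantization/fp8_kernel.py: per_tensor_quant_mla_deep_gemm_masked_fp8 is now per_token_group_quant_mla_deep_gemm_masked_fp8, and the block-FP8 test imports and calls the new name. Downstream code pinned to the old symbol could bridge both versions with a small import shim; the try/except below is an illustrative sketch, not part of the package:

# Works against both 0.4.6.post1 (old name) and 0.4.6.post3 (new name).
try:
    from sglang.srt.layers.quantization.fp8_kernel import (
        per_token_group_quant_mla_deep_gemm_masked_fp8 as quant_mla_deep_gemm_masked_fp8,
    )
except ImportError:
    from sglang.srt.layers.quantization.fp8_kernel import (
        per_tensor_quant_mla_deep_gemm_masked_fp8 as quant_mla_deep_gemm_masked_fp8,
    )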