sglang 0.4.6.post1__py3-none-any.whl → 0.4.6.post3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +3 -11
- sglang/bench_serving.py +149 -1
- sglang/check_env.py +3 -3
- sglang/lang/chat_template.py +44 -0
- sglang/srt/configs/__init__.py +4 -0
- sglang/srt/configs/deepseekvl2.py +3 -0
- sglang/srt/configs/device_config.py +1 -1
- sglang/srt/configs/internvl.py +696 -0
- sglang/srt/configs/janus_pro.py +3 -0
- sglang/srt/configs/kimi_vl.py +38 -0
- sglang/srt/configs/kimi_vl_moonvit.py +32 -0
- sglang/srt/configs/model_config.py +32 -0
- sglang/srt/constrained/xgrammar_backend.py +11 -19
- sglang/srt/conversation.py +151 -3
- sglang/srt/disaggregation/decode.py +4 -1
- sglang/srt/disaggregation/mini_lb.py +74 -23
- sglang/srt/disaggregation/mooncake/conn.py +9 -18
- sglang/srt/disaggregation/nixl/conn.py +241 -71
- sglang/srt/disaggregation/utils.py +44 -1
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +1 -8
- sglang/srt/distributed/device_communicators/npu_communicator.py +39 -0
- sglang/srt/distributed/device_communicators/pynccl.py +2 -1
- sglang/srt/distributed/device_communicators/shm_broadcast.py +2 -1
- sglang/srt/distributed/parallel_state.py +22 -1
- sglang/srt/entrypoints/engine.py +58 -24
- sglang/srt/entrypoints/http_server.py +28 -1
- sglang/srt/entrypoints/verl_engine.py +3 -2
- sglang/srt/function_call_parser.py +97 -0
- sglang/srt/hf_transformers_utils.py +22 -1
- sglang/srt/layers/attention/cutlass_mla_backend.py +1 -1
- sglang/srt/layers/attention/flashattention_backend.py +146 -50
- sglang/srt/layers/attention/flashinfer_backend.py +129 -94
- sglang/srt/layers/attention/flashinfer_mla_backend.py +88 -30
- sglang/srt/layers/attention/flashmla_backend.py +3 -0
- sglang/srt/layers/attention/merge_state.py +46 -0
- sglang/srt/layers/attention/triton_ops/merge_state.py +96 -0
- sglang/srt/layers/attention/vision.py +290 -163
- sglang/srt/layers/dp_attention.py +5 -2
- sglang/srt/layers/moe/ep_moe/kernels.py +342 -7
- sglang/srt/layers/moe/ep_moe/layer.py +120 -1
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +98 -57
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +10 -5
- sglang/srt/layers/quantization/__init__.py +2 -2
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +2 -4
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +2 -1
- sglang/srt/layers/quantization/deep_gemm.py +6 -1
- sglang/srt/layers/quantization/fp8.py +108 -95
- sglang/srt/layers/quantization/fp8_kernel.py +79 -60
- sglang/srt/layers/quantization/fp8_utils.py +71 -23
- sglang/srt/layers/quantization/kv_cache.py +3 -10
- sglang/srt/layers/quantization/utils.py +0 -5
- sglang/srt/layers/quantization/w8a8_fp8.py +8 -10
- sglang/srt/layers/utils.py +35 -0
- sglang/srt/lora/layers.py +35 -9
- sglang/srt/lora/lora_manager.py +81 -35
- sglang/srt/managers/cache_controller.py +115 -119
- sglang/srt/managers/data_parallel_controller.py +52 -34
- sglang/srt/managers/io_struct.py +10 -0
- sglang/srt/managers/multimodal_processors/base_processor.py +5 -0
- sglang/srt/managers/multimodal_processors/internvl.py +232 -0
- sglang/srt/managers/multimodal_processors/kimi_vl.py +73 -0
- sglang/srt/managers/schedule_batch.py +44 -16
- sglang/srt/managers/schedule_policy.py +11 -5
- sglang/srt/managers/scheduler.py +291 -72
- sglang/srt/managers/scheduler_output_processor_mixin.py +1 -1
- sglang/srt/managers/tokenizer_manager.py +24 -13
- sglang/srt/managers/tp_worker.py +60 -28
- sglang/srt/managers/tp_worker_overlap_thread.py +9 -3
- sglang/srt/mem_cache/chunk_cache.py +2 -0
- sglang/srt/mem_cache/memory_pool.py +70 -36
- sglang/srt/model_executor/cuda_graph_runner.py +82 -19
- sglang/srt/model_executor/forward_batch_info.py +31 -1
- sglang/srt/model_executor/model_runner.py +159 -90
- sglang/srt/model_loader/loader.py +18 -11
- sglang/srt/models/clip.py +4 -4
- sglang/srt/models/deepseek_janus_pro.py +1 -1
- sglang/srt/models/deepseek_nextn.py +2 -277
- sglang/srt/models/deepseek_v2.py +132 -37
- sglang/srt/models/gemma3_mm.py +1 -1
- sglang/srt/models/internlm2.py +3 -0
- sglang/srt/models/internvl.py +670 -0
- sglang/srt/models/kimi_vl.py +308 -0
- sglang/srt/models/kimi_vl_moonvit.py +639 -0
- sglang/srt/models/llama.py +93 -31
- sglang/srt/models/llama4.py +54 -7
- sglang/srt/models/llama_eagle.py +4 -1
- sglang/srt/models/llama_eagle3.py +4 -1
- sglang/srt/models/minicpmv.py +1 -1
- sglang/srt/models/mllama.py +1 -1
- sglang/srt/models/phi3_small.py +16 -2
- sglang/srt/models/qwen2_5_vl.py +8 -4
- sglang/srt/models/qwen2_moe.py +8 -3
- sglang/srt/models/qwen2_vl.py +4 -16
- sglang/srt/models/qwen3_moe.py +8 -3
- sglang/srt/models/xiaomi_mimo.py +171 -0
- sglang/srt/openai_api/adapter.py +58 -62
- sglang/srt/openai_api/protocol.py +38 -16
- sglang/srt/reasoning_parser.py +2 -2
- sglang/srt/sampling/sampling_batch_info.py +54 -2
- sglang/srt/sampling/sampling_params.py +2 -0
- sglang/srt/server_args.py +93 -24
- sglang/srt/speculative/eagle_worker.py +3 -2
- sglang/srt/utils.py +123 -10
- sglang/test/runners.py +4 -0
- sglang/test/test_block_fp8.py +2 -2
- sglang/test/test_deepep_utils.py +219 -0
- sglang/test/test_utils.py +32 -1
- sglang/version.py +1 -1
- {sglang-0.4.6.post1.dist-info → sglang-0.4.6.post3.dist-info}/METADATA +18 -9
- {sglang-0.4.6.post1.dist-info → sglang-0.4.6.post3.dist-info}/RECORD +119 -99
- {sglang-0.4.6.post1.dist-info → sglang-0.4.6.post3.dist-info}/WHEEL +1 -1
- {sglang-0.4.6.post1.dist-info → sglang-0.4.6.post3.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.post1.dist-info → sglang-0.4.6.post3.dist-info}/top_level.txt +0 -0
sglang/srt/server_args.py
CHANGED
@@ -22,7 +22,7 @@ import random
 import tempfile
 from typing import List, Literal, Optional

-from sglang.srt.hf_transformers_utils import check_gguf_file
+from sglang.srt.hf_transformers_utils import check_gguf_file, get_config
 from sglang.srt.reasoning_parser import ReasoningParser
 from sglang.srt.utils import (
     configure_ipv6,
@@ -78,6 +78,8 @@ class ServerArgs:

     # Other runtime options
     tp_size: int = 1
+    pp_size: int = 1
+    max_micro_batch_size: Optional[int] = None
     stream_interval: int = 1
     stream_output: bool = False
     random_seed: Optional[int] = None
@@ -185,6 +187,7 @@ class ServerArgs:
     n_share_experts_fusion: int = 0
     disable_chunked_prefix_cache: bool = False
     disable_fast_image_processor: bool = False
+    mm_attention_backend: Optional[str] = None

     # Debug tensor dumps
     debug_tensor_dump_output_folder: Optional[str] = None
@@ -196,6 +199,7 @@ class ServerArgs:
     disaggregation_bootstrap_port: int = 8998
     disaggregation_transfer_backend: str = "mooncake"
     disaggregation_ib_device: Optional[str] = None
+    pdlb_url: Optional[str] = None

     def __post_init__(self):
         # Expert parallelism
@@ -222,25 +226,34 @@ class ServerArgs:

         # Set mem fraction static, which depends on the tensor parallelism size
         if self.mem_fraction_static is None:
+            parallel_size = self.tp_size * self.pp_size
             if gpu_mem <= 81920:
-                if
+                if parallel_size >= 16:
                     self.mem_fraction_static = 0.79
-                elif
+                elif parallel_size >= 8:
                     self.mem_fraction_static = 0.81
-                elif
+                elif parallel_size >= 4:
                     self.mem_fraction_static = 0.85
-                elif
+                elif parallel_size >= 2:
                     self.mem_fraction_static = 0.87
                 else:
                     self.mem_fraction_static = 0.88
             else:
-
-
+                self.mem_fraction_static = 0.88
+                if gpu_mem > 96 * 1024:
+                    mem_fraction = self.mem_fraction_static
+                    self.mem_fraction_static = min(
+                        mem_fraction + 48 * 1024 * (1 - mem_fraction) / gpu_mem,
+                        (gpu_mem - 1024 * 18)
+                        / gpu_mem,  # 15 GB + additional 3GB for cuda graph
+                    )

         # Set chunked prefill size, which depends on the gpu memory capacity
         if self.chunked_prefill_size is None:
             if gpu_mem is not None and gpu_mem < 25_000:
                 self.chunked_prefill_size = 2048
+            elif self.disaggregation_mode != "null":
+                self.chunked_prefill_size = 16384
             else:
                 self.chunked_prefill_size = 8192
         assert self.chunked_prefill_size % self.page_size == 0
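The rewritten branch above folds `pp_size` into the parallel-size buckets and, for GPUs above 96 GB, caps the fraction so that roughly 18 GB of headroom stays free (the 15 GB plus 3 GB for CUDA graphs noted in the inline comment). A minimal sketch of that heuristic; the standalone function is illustrative only, not sglang API, and `gpu_mem` is in MiB:

```python
# Illustrative sketch of the mem_fraction_static heuristic shown in the hunk above.
def pick_mem_fraction(gpu_mem: float, tp_size: int, pp_size: int = 1) -> float:
    parallel_size = tp_size * pp_size
    if gpu_mem <= 81920:  # 80 GB-class GPUs and smaller
        if parallel_size >= 16:
            return 0.79
        elif parallel_size >= 8:
            return 0.81
        elif parallel_size >= 4:
            return 0.85
        elif parallel_size >= 2:
            return 0.87
        return 0.88
    mem_fraction = 0.88
    if gpu_mem > 96 * 1024:
        # Relax the fraction, but always leave ~15 GB + 3 GB (CUDA graphs) unreserved.
        mem_fraction = min(
            mem_fraction + 48 * 1024 * (1 - mem_fraction) / gpu_mem,
            (gpu_mem - 1024 * 18) / gpu_mem,
        )
    return mem_fraction


# Example: a 141 GB (H200-class) card at tp_size=8 lands near 0.872,
# i.e. about 18 GB is kept unreserved.
print(round(pick_mem_fraction(141 * 1024, tp_size=8), 3))
```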
@@ -311,6 +324,9 @@ class ServerArgs:
                 assert (
                     not self.enable_dp_attention
                 ), "DeepEP MoE `auto` mode is not supported with DP Attention."
+            if self.deepep_mode == "normal":
+                logger.warning("Cuda graph is disabled because deepep_mode=`normal`")
+                self.disable_cuda_graph = True
             self.ep_size = self.tp_size
             self.enable_sp_layernorm = (
                 self.dp_size < self.tp_size if self.enable_dp_attention else True
@@ -333,6 +349,17 @@ class ServerArgs:
                 "eagle speculative decoding."
             )

+            model_arch = get_model_arch(self)
+
+            # Auto set draft_model_path DeepSeek-V3/R1
+            if model_arch == "DeepseekV3ForCausalLM":
+                if self.speculative_draft_model_path is None:
+                    self.speculative_draft_model_path = self.model_path
+                else:
+                    logger.warning(
+                        "DeepSeek MTP does not require setting speculative_draft_model_path."
+                    )
+
             # Auto choose parameters
             if self.speculative_num_steps is None:
                 assert (
@@ -343,7 +370,7 @@ class ServerArgs:
                     self.speculative_num_steps,
                     self.speculative_eagle_topk,
                     self.speculative_num_draft_tokens,
-                ) = auto_choose_speculative_params(
+                ) = auto_choose_speculative_params(model_arch)

             if self.page_size > 1 and self.speculative_eagle_topk > 1:
                 self.speculative_eagle_topk = 1
@@ -532,7 +559,7 @@ class ServerArgs:
             "--device",
             type=str,
             default=ServerArgs.device,
-            help="The device to use ('cuda', 'xpu', 'hpu', 'cpu'). Defaults to auto-detection if not specified.",
+            help="The device to use ('cuda', 'xpu', 'hpu', 'npu', 'cpu'). Defaults to auto-detection if not specified.",
         )
         parser.add_argument(
             "--served-model-name",
@@ -632,6 +659,19 @@ class ServerArgs:
             default=ServerArgs.tp_size,
             help="The tensor parallelism size.",
         )
+        parser.add_argument(
+            "--pipeline-parallel-size",
+            "--pp-size",
+            type=int,
+            default=ServerArgs.pp_size,
+            help="The pipeline parallelism size.",
+        )
+        parser.add_argument(
+            "--max-micro-batch-size",
+            type=int,
+            default=ServerArgs.max_micro_batch_size,
+            help="The maximum micro batch size in pipeline parallelism.",
+        )
         parser.add_argument(
             "--stream-interval",
             type=int,
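The new `--pipeline-parallel-size` flag also accepts `--pp-size` as an alias. With argparse, two option strings on a single `add_argument` call share one destination attribute named after the first long option, which is why `from_cli_args` later copies `args.pipeline_parallel_size` into `pp_size`. A small standalone sketch of that behavior (a hypothetical parser, not the sglang CLI):

```python
# Standalone argparse sketch showing why both flags write to one destination.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--pipeline-parallel-size",
    "--pp-size",  # alias: both option strings share the same destination
    type=int,
    default=1,
    help="The pipeline parallelism size.",
)

args = parser.parse_args(["--pp-size", "2"])
# The destination name is derived from the first long option string.
print(args.pipeline_parallel_size)  # -> 2
```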
@@ -1096,9 +1136,9 @@ class ServerArgs:
         parser.add_argument(
             "--tool-call-parser",
             type=str,
-            choices=["qwen25", "mistral", "llama3", "deepseekv3"],
+            choices=["qwen25", "mistral", "llama3", "deepseekv3", "pythonic"],
             default=ServerArgs.tool_call_parser,
-            help="Specify the parser for handling tool-call interactions. Options include: 'qwen25', 'mistral', and '
+            help="Specify the parser for handling tool-call interactions. Options include: 'qwen25', 'mistral', 'llama3', 'deepseekv3', and 'pythonic'.",
         )
         parser.add_argument(
             "--enable-hierarchical-cache",
@@ -1215,12 +1255,29 @@ class ServerArgs:
             "--disaggregation-ib-device",
             type=str,
             default=ServerArgs.disaggregation_ib_device,
-            help="The
+            help="The InfiniBand devices for disaggregation transfer, accepts single device (e.g., --disaggregation-ib-device mlx5_0) "
+            "or multiple comma-separated devices (e.g., --disaggregation-ib-device mlx5_0,mlx5_1). "
+            "Default is None, which triggers automatic device detection when mooncake backend is enabled.",
+        )
+        parser.add_argument(
+            "--pdlb-url",
+            type=str,
+            default=None,
+            help="The URL of the PD disaggregation load balancer. If set, the prefill/decode server will register with the load balancer.",
+        )
+
+        parser.add_argument(
+            "--mm-attention-backend",
+            type=str,
+            choices=["sdpa", "fa3", "triton_attn"],
+            default=ServerArgs.mm_attention_backend,
+            help="Set multimodal attention backend.",
         )

     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
         args.tp_size = args.tensor_parallel_size
+        args.pp_size = args.pipeline_parallel_size
         args.dp_size = args.data_parallel_size
         args.ep_size = args.expert_parallel_size
         attrs = [attr.name for attr in dataclasses.fields(cls)]
@@ -1234,15 +1291,25 @@ class ServerArgs:

     def check_server_args(self):
         assert (
-            self.tp_size
-        ), "tp_size must be divisible by number of nodes"
+            self.tp_size * self.pp_size
+        ) % self.nnodes == 0, "tp_size must be divisible by number of nodes"
+
+        # FIXME pp constraints
+        if self.pp_size > 1:
+            logger.warning(f"Turn off overlap scheule for pipeline parallelism.")
+            self.disable_overlap_schedule = True
+        assert (
+            self.disable_overlap_schedule
+            and self.speculative_algorithm is None
+            and not self.enable_mixed_chunk
+        ), "Pipeline parallelism is not compatible with overlap schedule, speculative decoding, mixed chunked prefill."
+
         assert not (
             self.dp_size > 1 and self.nnodes != 1 and not self.enable_dp_attention
         ), "multi-node data parallel is not supported unless dp attention!"
         assert (
             self.max_loras_per_batch > 0
             # FIXME
-            and (self.lora_paths is None or self.disable_cuda_graph)
             and (self.lora_paths is None or self.disable_radix_cache)
         ), "compatibility of lora and cuda graph and radix attention is in progress"
         assert self.base_gpu_id >= 0, "base_gpu_id must be non-negative"
@@ -1368,20 +1435,22 @@ class DeprecatedAction(argparse.Action):
         raise ValueError(self.help)


-def
+def get_model_arch(args: ServerArgs):
+    hf_config = get_config(
+        args.model_path,
+        trust_remote_code=args.trust_remote_code,
+        revision=args.revision,
+        model_override_args=json.loads(args.json_model_override_args),
+    )
+    return hf_config.architectures[0]
+
+
+def auto_choose_speculative_params(arch: str):
     """
     Automatically choose the parameters for speculative decoding.

    You can tune them on your own models and prompts with scripts/playground/bench_speculative.py
    """
-    config_path = os.path.join(self.model_path, "config.json")
-    if not os.path.exists(config_path):
-        raise ValueError(f"{config_path} is not found.")
-
-    config = json.load(open(config_path))
-
-    arch = config.get("architectures", ["Unknown"])[0]
-
     if arch in ["LlamaForCausalLM"]:
         # The default value for llama
         return (5, 4, 8)
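With this change, the speculative-decoding defaults are keyed off the Hugging Face architecture string instead of reading `config.json` by hand. A hedged sketch of how the two helpers compose; sglang routes the lookup through its own `get_config`, so `AutoConfig` and the model id below are stand-ins kept only to make the sketch self-contained:

```python
# Illustrative composition of get_model_arch + auto_choose_speculative_params.
from transformers import AutoConfig


def model_arch_of(model_path: str, trust_remote_code: bool = False) -> str:
    cfg = AutoConfig.from_pretrained(model_path, trust_remote_code=trust_remote_code)
    return cfg.architectures[0]


arch = model_arch_of("TinyLlama/TinyLlama-1.1B-Chat-v1.0")  # -> "LlamaForCausalLM"
if arch in ["LlamaForCausalLM"]:
    steps, topk, draft_tokens = (5, 4, 8)  # the Llama defaults from the hunk above
```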
sglang/srt/speculative/eagle_worker.py
CHANGED
@@ -106,11 +106,12 @@ class EAGLEWorker(TpModelWorker):
         # Init draft worker
         with empty_context():
             super().__init__(
+                server_args=server_args,
                 gpu_id=gpu_id,
                 tp_rank=tp_rank,
-
-                nccl_port=nccl_port,
+                pp_rank=0,  # FIXME
                 dp_rank=dp_rank,
+                nccl_port=nccl_port,
                 is_draft_worker=True,
                 req_to_token_pool=self.req_to_token_pool,
                 token_to_kv_pool_allocator=self.token_to_kv_pool_allocator,
sglang/srt/utils.py
CHANGED
@@ -12,6 +12,7 @@
 # limitations under the License.
 # ==============================================================================
 """Common utilities."""
+
 import base64
 import builtins
 import ctypes
@@ -144,6 +145,10 @@ def is_xpu() -> bool:
     return hasattr(torch, "xpu") and torch.xpu.is_available()


+def is_npu() -> bool:
+    return hasattr(torch, "npu") and torch.npu.is_available()
+
+
 def is_flashinfer_available():
     """
     Check whether flashinfer is available.
@@ -327,6 +332,16 @@ def get_available_gpu_memory(device, gpu_id, distributed=False, empty_cache=True
     elif device == "cpu":
         # TODO: rename the variables in the current function to be not GPU specific
         free_gpu_memory = psutil.virtual_memory().available
+    elif device == "npu":
+        num_gpus = torch.npu.device_count()
+        assert gpu_id < num_gpus
+
+        if torch.npu.current_device() != gpu_id:
+            print(
+                f"WARNING: current device is not {gpu_id}, but {torch.npu.current_device()}, ",
+                "which may cause useless memory allocation for torch NPU context.",
+            )
+        free_gpu_memory, total_gpu_memory = torch.npu.mem_get_info()

     if distributed:
         tensor = torch.tensor(free_gpu_memory, dtype=torch.float32).to(
@@ -414,16 +429,40 @@ class LayerFn(Protocol):
 def make_layers(
     num_hidden_layers: int,
     layer_fn: LayerFn,
+    pp_rank: Optional[int] = None,
+    pp_size: Optional[int] = None,
     prefix: str = "",
+    return_tuple: bool = False,
 ) -> Tuple[int, int, torch.nn.ModuleList]:
     """Make a list of layers with the given layer function"""
+    # circula imports
+    from sglang.srt.distributed import get_pp_indices
+    from sglang.srt.layers.utils import PPMissingLayer
+
+    assert not pp_size or num_hidden_layers >= pp_size
+    start_layer, end_layer = (
+        get_pp_indices(
+            num_hidden_layers,
+            pp_rank,
+            pp_size,
+        )
+        if pp_rank is not None and pp_size is not None
+        else (0, num_hidden_layers)
+    )
     modules = torch.nn.ModuleList(
-        [
+        [PPMissingLayer(return_tuple=return_tuple) for _ in range(start_layer)]
+        + [
             maybe_offload_to_cpu(layer_fn(idx=idx, prefix=add_prefix(idx, prefix)))
-            for idx in range(
+            for idx in range(start_layer, end_layer)
+        ]
+        + [
+            PPMissingLayer(return_tuple=return_tuple)
+            for _ in range(end_layer, num_hidden_layers)
         ]
     )
-
+    if pp_rank is None or pp_size is None:
+        return modules
+    return modules, start_layer, end_layer


 def set_random_seed(seed: int) -> None:
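The reworked `make_layers` only materializes the contiguous slice of layers owned by the current pipeline stage and fills the rest of the `ModuleList` with `PPMissingLayer` placeholders, so layer indices and checkpoint names stay stable across stages. A minimal sketch of the partitioning idea; the even-split helper and the `_MissingLayer` stand-in are assumptions used for illustration, and sglang's `get_pp_indices` may split differently:

```python
# Sketch of per-stage layer partitioning for pipeline parallelism.
import torch
from torch import nn


def even_pp_indices(num_layers: int, pp_rank: int, pp_size: int) -> tuple[int, int]:
    per_stage = num_layers // pp_size
    start = pp_rank * per_stage
    end = num_layers if pp_rank == pp_size - 1 else start + per_stage
    return start, end


class _MissingLayer(nn.Identity):
    """Placeholder standing in for PPMissingLayer: keeps layer indexing intact."""


def make_stage_layers(num_layers: int, pp_rank: int, pp_size: int) -> nn.ModuleList:
    start, end = even_pp_indices(num_layers, pp_rank, pp_size)
    return nn.ModuleList(
        [_MissingLayer() for _ in range(start)]
        + [nn.Linear(16, 16) for _ in range(start, end)]  # stand-in for real blocks
        + [_MissingLayer() for _ in range(end, num_layers)]
    )


layers = make_stage_layers(num_layers=10, pp_rank=1, pp_size=4)
print([type(m).__name__ for m in layers])
# rank 1 of 4 owns indices 2 and 3; the other slots are placeholders
```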
@@ -872,12 +911,15 @@ def broadcast_pyobj(
     src: int = 0,
     force_cpu_device: bool = True,
 ):
-    """Broadcast inputs from rank
+    """Broadcast inputs from src rank to all other ranks with torch.dist backend.
+    The `rank` here refer to the source rank on global process group (regardless
+    of dist_group argument).
+    """
     device = torch.device(
         "cuda" if torch.cuda.is_available() and not force_cpu_device else "cpu"
     )

-    if rank ==
+    if rank == src:
         if len(data) == 0:
             tensor_size = torch.tensor([0], dtype=torch.long, device=device)
             dist.broadcast(tensor_size, src=src, group=dist_group)
@@ -909,6 +951,50 @@ def broadcast_pyobj(
         return data


+def point_to_point_pyobj(
+    data: List[Any],
+    rank: int,
+    group: Optional[torch.distributed.ProcessGroup] = None,
+    src: int = 0,
+    dst: int = 1,
+):
+    """Send data from src to dst in group."""
+
+    if rank == src:
+        if len(data) == 0:
+            tensor_size = torch.tensor([0], dtype=torch.long)
+            dist.send(tensor_size, dst=dst, group=group)
+        else:
+            serialized_data = pickle.dumps(data)
+            size = len(serialized_data)
+            tensor_data = torch.ByteTensor(
+                np.frombuffer(serialized_data, dtype=np.uint8)
+            )
+            tensor_size = torch.tensor([size], dtype=torch.long)
+
+            dist.send(tensor_size, dst=dst, group=group)
+            dist.send(tensor_data, dst=dst, group=group)
+        return data
+
+    elif rank == dst:
+        tensor_size = torch.tensor([0], dtype=torch.long)
+        dist.recv(tensor_size, src=src, group=group)
+        size = tensor_size.item()
+
+        if size == 0:
+            return []
+
+        tensor_data = torch.empty(size, dtype=torch.uint8)
+        dist.recv(tensor_data, src=src, group=group)
+
+        serialized_data = bytes(tensor_data.cpu().numpy())
+        data = pickle.loads(serialized_data)
+        return data
+
+    # Other ranks in pp_group do nothing
+    return []
+
+
 step_counter = 0

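`point_to_point_pyobj` is the pipeline-parallel counterpart of `broadcast_pyobj`: it pickles arbitrary Python objects and sends the byte length first, then the payload, between exactly two ranks. A hedged two-process usage sketch of the same length-prefixed protocol; the launch command, payload, and helper name are placeholders, and the gloo backend is used so it runs on CPU:

```python
# Run with: torchrun --nproc_per_node=2 this_script.py
import pickle

import numpy as np
import torch
import torch.distributed as dist


def send_pyobj(data, rank, src=0, dst=1, group=None):
    # Length-prefixed pickle transfer, mirroring point_to_point_pyobj above.
    if rank == src:
        payload = pickle.dumps(data)
        dist.send(torch.tensor([len(payload)], dtype=torch.long), dst=dst, group=group)
        buf = torch.from_numpy(np.frombuffer(payload, dtype=np.uint8).copy())
        dist.send(buf, dst=dst, group=group)
        return data
    if rank == dst:
        size = torch.zeros(1, dtype=torch.long)
        dist.recv(size, src=src, group=group)
        buf = torch.empty(int(size.item()), dtype=torch.uint8)
        dist.recv(buf, src=src, group=group)
        return pickle.loads(bytes(buf.numpy()))
    return []  # other ranks do nothing


if __name__ == "__main__":
    dist.init_process_group(backend="gloo")  # CPU-friendly backend for the sketch
    rank = dist.get_rank()
    received = send_pyobj({"batch_id": 7, "tokens": [1, 2, 3]}, rank)
    if rank == 1:
        print(received)
```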
@@ -1276,6 +1362,9 @@ def get_device_name(device_id: int = 0) -> str:
     if hasattr(torch, "hpu") and torch.hpu.is_available():
         return torch.hpu.get_device_name(device_id)

+    if hasattr(torch, "npu") and torch.npu.is_available():
+        return torch.npu.get_device_name(device_id)
+

 @lru_cache(maxsize=1)
 def is_habana_available() -> bool:
@@ -1372,6 +1461,13 @@ def get_compiler_backend() -> str:
     if hasattr(torch, "hpu") and torch.hpu.is_available():
         return "hpu_backend"

+    if hasattr(torch, "npu") and torch.npu.is_available():
+        import torchair
+
+        config = torchair.CompilerConfig()
+        npu_backend = torchair.get_npu_backend(compiler_config=config)
+        return npu_backend
+
     return "inductor"


@@ -1732,6 +1828,13 @@ def configure_ipv6(dist_init_addr):
     return port, host


+def rank0_log(msg: str):
+    from sglang.srt.distributed import get_tensor_model_parallel_rank
+
+    if get_tensor_model_parallel_rank() == 0:
+        logger.info(msg)
+
+
 def rank0_print(msg: str):
     from sglang.srt.distributed import get_tensor_model_parallel_rank

@@ -1905,13 +2008,16 @@ def fast_topk(values, topk, dim):
     return torch.topk(values, topk, dim=dim)


-def
+def _check(cc_major):
     if not is_cuda():
         return False
-
-
-
-
+    return torch.cuda.get_device_capability()[0] == cc_major and tuple(
+        map(int, torch.version.cuda.split(".")[:2])
+    ) >= (12, 3)
+
+
+is_ampere_with_cuda_12_3 = lambda: _check(8)
+is_hopper_with_cuda_12_3 = lambda: _check(9)


 def get_free_port():
@@ -1990,3 +2096,10 @@ class BumpAllocator:
         output = self._buffer[self._pointer : self._pointer + size]
         self._pointer += size
         return output
+
+
+def log_info_on_rank0(logger, msg):
+    from sglang.srt.distributed import get_tensor_model_parallel_rank
+
+    if get_tensor_model_parallel_rank() == 0:
+        logger.info(msg)
sglang/test/runners.py
CHANGED
@@ -423,6 +423,10 @@ class HFRunner:
             )
             del input_logits

+            if lora_paths is not None and lora_paths[i] is not None:
+                # Unload the LoRA adapter if it is used
+                model.unload()
+
         return ModelOutput(
             output_strs=output_strs,
             top_input_logprobs=top_input_logprobs,
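The `HFRunner` change unloads the LoRA adapter after a prompt that used one, so the next iteration starts from clean base weights. With the PEFT library this pattern looks roughly like the sketch below; the model id and adapter path are placeholders, and `unload()` is assumed to be PEFT's adapter-removal call that returns the base model:

```python
# Hedged sketch of the load/unload-per-adapter pattern, using PEFT directly.
from peft import PeftModel
from transformers import AutoModelForCausalLM

base = AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")

# Attach an adapter for one evaluation pass (path is a placeholder).
model = PeftModel.from_pretrained(base, "path/to/lora_adapter")
# ... run generation / logprob collection with the adapter active ...

# Drop the adapter so the next pass sees the plain base model.
model = model.unload()
```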
sglang/test/test_block_fp8.py
CHANGED
@@ -7,9 +7,9 @@ import torch
 from sglang.srt.layers.activation import SiluAndMul
 from sglang.srt.layers.moe.fused_moe_triton.fused_moe import fused_moe
 from sglang.srt.layers.quantization.fp8_kernel import (
-    per_tensor_quant_mla_deep_gemm_masked_fp8,
     per_tensor_quant_mla_fp8,
     per_token_group_quant_fp8,
+    per_token_group_quant_mla_deep_gemm_masked_fp8,
     static_quant_fp8,
     w8a8_block_fp8_matmul,
 )
@@ -236,7 +236,7 @@ class TestPerTokenGroupQuantMlaDeepGemmMaskedFP8(CustomTestCase):

         with torch.inference_mode():
             ref_out, ref_scale = native_per_token_group_quant_fp8(x, group_size, 1e-12)
-            out, scale, _, _, _ =
+            out, scale, _, _, _ = per_token_group_quant_mla_deep_gemm_masked_fp8(
                 x, group_size
             )
             out = out[:, :num_tokens, :]