sglang 0.4.10__py3-none-any.whl → 0.4.10.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_offline_throughput.py +20 -0
- sglang/srt/configs/model_config.py +1 -0
- sglang/srt/disaggregation/launch_lb.py +5 -20
- sglang/srt/disaggregation/mooncake/conn.py +33 -15
- sglang/srt/layers/attention/trtllm_mla_backend.py +372 -0
- sglang/srt/layers/attention/utils.py +6 -1
- sglang/srt/layers/moe/ep_moe/layer.py +19 -34
- sglang/srt/layers/moe/fused_moe_triton/layer.py +56 -2
- sglang/srt/layers/quantization/fp8.py +52 -0
- sglang/srt/layers/quantization/w8a8_int8.py +4 -1
- sglang/srt/managers/cache_controller.py +35 -35
- sglang/srt/managers/scheduler.py +1 -0
- sglang/srt/mem_cache/hicache_storage.py +15 -6
- sglang/srt/mem_cache/hiradix_cache.py +21 -4
- sglang/srt/mem_cache/memory_pool.py +15 -118
- sglang/srt/mem_cache/memory_pool_host.py +350 -33
- sglang/srt/mem_cache/nixl/hicache_nixl.py +163 -0
- sglang/srt/mem_cache/nixl/nixl_utils.py +238 -0
- sglang/srt/mem_cache/nixl/test_hicache_nixl_storage.py +216 -0
- sglang/srt/mem_cache/storage/hf3fs/client_hf3fs.py +8 -2
- sglang/srt/model_executor/cuda_graph_runner.py +25 -1
- sglang/srt/model_executor/model_runner.py +8 -1
- sglang/srt/model_loader/weight_utils.py +2 -0
- sglang/srt/models/deepseek_v2.py +5 -6
- sglang/srt/models/glm4_moe.py +3 -3
- sglang/srt/models/step3_vl.py +0 -3
- sglang/srt/server_args.py +40 -6
- sglang/srt/utils.py +1 -0
- sglang/test/attention/test_trtllm_mla_backend.py +945 -0
- sglang/version.py +1 -1
- {sglang-0.4.10.dist-info → sglang-0.4.10.post1.dist-info}/METADATA +1 -1
- {sglang-0.4.10.dist-info → sglang-0.4.10.post1.dist-info}/RECORD +35 -30
- {sglang-0.4.10.dist-info → sglang-0.4.10.post1.dist-info}/WHEEL +0 -0
- {sglang-0.4.10.dist-info → sglang-0.4.10.post1.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.10.dist-info → sglang-0.4.10.post1.dist-info}/top_level.txt +0 -0
sglang/srt/server_args.py
CHANGED
@@ -24,6 +24,7 @@ import tempfile
|
|
24
24
|
from typing import List, Literal, Optional, Union
|
25
25
|
|
26
26
|
from sglang.srt.hf_transformers_utils import check_gguf_file, get_config
|
27
|
+
from sglang.srt.layers.utils import is_sm100_supported
|
27
28
|
from sglang.srt.lora.lora_registry import LoRARef
|
28
29
|
from sglang.srt.reasoning_parser import ReasoningParser
|
29
30
|
from sglang.srt.utils import (
|
@@ -197,7 +198,8 @@ class ServerArgs:
|
|
197
198
|
hicache_ratio: float = 2.0
|
198
199
|
hicache_size: int = 0
|
199
200
|
hicache_write_policy: str = "write_through_selective"
|
200
|
-
hicache_io_backend: str = ""
|
201
|
+
hicache_io_backend: str = "kernel"
|
202
|
+
hicache_mem_layout: str = "layer_first"
|
201
203
|
hicache_storage_backend: Optional[str] = None
|
202
204
|
|
203
205
|
# Double Sparsity
|
@@ -215,6 +217,7 @@ class ServerArgs:
|
|
215
217
|
disable_cuda_graph: bool = False
|
216
218
|
disable_cuda_graph_padding: bool = False
|
217
219
|
enable_profile_cuda_graph: bool = False
|
220
|
+
enable_cudagraph_gc: bool = False
|
218
221
|
enable_nccl_nvls: bool = False
|
219
222
|
enable_tokenizer_batch_encode: bool = False
|
220
223
|
disable_outlines_disk_cache: bool = False
|
@@ -401,6 +404,22 @@ class ServerArgs:
|
|
401
404
|
)
|
402
405
|
self.page_size = 128
|
403
406
|
|
407
|
+
if self.attention_backend == "trtllm_mla":
|
408
|
+
if not is_sm100_supported():
|
409
|
+
raise ValueError(
|
410
|
+
"TRTLLM MLA backend is only supported on Blackwell GPUs (SM100). Please use a different backend."
|
411
|
+
)
|
412
|
+
|
413
|
+
if self.page_size not in [32, 64]:
|
414
|
+
logger.warning(
|
415
|
+
f"TensorRT-LLM MLA only supports page_size of 32 or 64, changing page_size from {self.page_size} to 64."
|
416
|
+
)
|
417
|
+
self.page_size = 64
|
418
|
+
if self.speculative_algorithm is not None:
|
419
|
+
raise ValueError(
|
420
|
+
"trtllm_mla backend does not support speculative decoding yet."
|
421
|
+
)
|
422
|
+
|
404
423
|
# Set page size
|
405
424
|
if self.page_size is None:
|
406
425
|
self.page_size = 1
|
@@ -436,10 +455,11 @@ class ServerArgs:
|
|
436
455
|
self.quantization == "modelopt_fp4"
|
437
456
|
), "modelopt_fp4 quantization is required for Flashinfer MOE"
|
438
457
|
os.environ["TRTLLM_ENABLE_PDL"] = "1"
|
439
|
-
|
440
|
-
|
441
|
-
|
442
|
-
|
458
|
+
if self.enable_ep_moe:
|
459
|
+
self.ep_size = self.tp_size
|
460
|
+
logger.warning(
|
461
|
+
f"Flashinfer cutlass MoE and EP MoE are enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
|
462
|
+
)
|
443
463
|
|
444
464
|
# DeepEP MoE
|
445
465
|
if self.enable_deepep_moe:
|
@@ -1219,6 +1239,7 @@ class ServerArgs:
|
|
1219
1239
|
"torch_native",
|
1220
1240
|
"ascend",
|
1221
1241
|
"triton",
|
1242
|
+
"trtllm_mla",
|
1222
1243
|
],
|
1223
1244
|
default=ServerArgs.attention_backend,
|
1224
1245
|
help="Choose the kernels for attention layers.",
|
@@ -1467,10 +1488,18 @@ class ServerArgs:
|
|
1467
1488
|
default=ServerArgs.hicache_io_backend,
|
1468
1489
|
help="The IO backend for KV cache transfer between CPU and GPU",
|
1469
1490
|
)
|
1491
|
+
parser.add_argument(
|
1492
|
+
"--hicache-mem-layout",
|
1493
|
+
type=str,
|
1494
|
+
choices=["layer_first", "page_first"],
|
1495
|
+
default=ServerArgs.hicache_mem_layout,
|
1496
|
+
help="The layout of host memory pool for hierarchical cache.",
|
1497
|
+
)
|
1498
|
+
|
1470
1499
|
parser.add_argument(
|
1471
1500
|
"--hicache-storage-backend",
|
1472
1501
|
type=str,
|
1473
|
-
choices=["file", "mooncake", "hf3fs"],
|
1502
|
+
choices=["file", "mooncake", "hf3fs", "nixl"],
|
1474
1503
|
default=ServerArgs.hicache_storage_backend,
|
1475
1504
|
help="The storage backend for hierarchical KV cache.",
|
1476
1505
|
)
|
@@ -1545,6 +1574,11 @@ class ServerArgs:
|
|
1545
1574
|
action="store_true",
|
1546
1575
|
help="Enable profiling of cuda graph capture.",
|
1547
1576
|
)
|
1577
|
+
parser.add_argument(
|
1578
|
+
"--enable-cudagraph-gc",
|
1579
|
+
action="store_true",
|
1580
|
+
help="Enable garbage collection during CUDA graph capture. If disabled (default), GC is frozen during capture to speed up the process.",
|
1581
|
+
)
|
1548
1582
|
parser.add_argument(
|
1549
1583
|
"--enable-nccl-nvls",
|
1550
1584
|
action="store_true",
|
sglang/srt/utils.py
CHANGED