sglang 0.4.10__py3-none-any.whl → 0.4.10.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. sglang/bench_offline_throughput.py +20 -0
  2. sglang/srt/configs/model_config.py +1 -0
  3. sglang/srt/disaggregation/launch_lb.py +5 -20
  4. sglang/srt/disaggregation/mooncake/conn.py +33 -15
  5. sglang/srt/layers/attention/trtllm_mla_backend.py +372 -0
  6. sglang/srt/layers/attention/utils.py +6 -1
  7. sglang/srt/layers/moe/ep_moe/layer.py +19 -34
  8. sglang/srt/layers/moe/fused_moe_triton/layer.py +56 -2
  9. sglang/srt/layers/quantization/fp8.py +52 -0
  10. sglang/srt/layers/quantization/w8a8_int8.py +4 -1
  11. sglang/srt/managers/cache_controller.py +35 -35
  12. sglang/srt/managers/scheduler.py +1 -0
  13. sglang/srt/mem_cache/hicache_storage.py +15 -6
  14. sglang/srt/mem_cache/hiradix_cache.py +21 -4
  15. sglang/srt/mem_cache/memory_pool.py +15 -118
  16. sglang/srt/mem_cache/memory_pool_host.py +350 -33
  17. sglang/srt/mem_cache/nixl/hicache_nixl.py +163 -0
  18. sglang/srt/mem_cache/nixl/nixl_utils.py +238 -0
  19. sglang/srt/mem_cache/nixl/test_hicache_nixl_storage.py +216 -0
  20. sglang/srt/mem_cache/storage/hf3fs/client_hf3fs.py +8 -2
  21. sglang/srt/model_executor/cuda_graph_runner.py +25 -1
  22. sglang/srt/model_executor/model_runner.py +8 -1
  23. sglang/srt/model_loader/weight_utils.py +2 -0
  24. sglang/srt/models/deepseek_v2.py +5 -6
  25. sglang/srt/models/glm4_moe.py +3 -3
  26. sglang/srt/models/step3_vl.py +0 -3
  27. sglang/srt/server_args.py +40 -6
  28. sglang/srt/utils.py +1 -0
  29. sglang/test/attention/test_trtllm_mla_backend.py +945 -0
  30. sglang/version.py +1 -1
  31. {sglang-0.4.10.dist-info → sglang-0.4.10.post1.dist-info}/METADATA +1 -1
  32. {sglang-0.4.10.dist-info → sglang-0.4.10.post1.dist-info}/RECORD +35 -30
  33. {sglang-0.4.10.dist-info → sglang-0.4.10.post1.dist-info}/WHEEL +0 -0
  34. {sglang-0.4.10.dist-info → sglang-0.4.10.post1.dist-info}/licenses/LICENSE +0 -0
  35. {sglang-0.4.10.dist-info → sglang-0.4.10.post1.dist-info}/top_level.txt +0 -0
sglang/srt/server_args.py CHANGED
@@ -24,6 +24,7 @@ import tempfile
24
24
  from typing import List, Literal, Optional, Union
25
25
 
26
26
  from sglang.srt.hf_transformers_utils import check_gguf_file, get_config
27
+ from sglang.srt.layers.utils import is_sm100_supported
27
28
  from sglang.srt.lora.lora_registry import LoRARef
28
29
  from sglang.srt.reasoning_parser import ReasoningParser
29
30
  from sglang.srt.utils import (
@@ -197,7 +198,8 @@ class ServerArgs:
197
198
  hicache_ratio: float = 2.0
198
199
  hicache_size: int = 0
199
200
  hicache_write_policy: str = "write_through_selective"
200
- hicache_io_backend: str = ""
201
+ hicache_io_backend: str = "kernel"
202
+ hicache_mem_layout: str = "layer_first"
201
203
  hicache_storage_backend: Optional[str] = None
202
204
 
203
205
  # Double Sparsity
@@ -215,6 +217,7 @@ class ServerArgs:
215
217
  disable_cuda_graph: bool = False
216
218
  disable_cuda_graph_padding: bool = False
217
219
  enable_profile_cuda_graph: bool = False
220
+ enable_cudagraph_gc: bool = False
218
221
  enable_nccl_nvls: bool = False
219
222
  enable_tokenizer_batch_encode: bool = False
220
223
  disable_outlines_disk_cache: bool = False
@@ -401,6 +404,22 @@ class ServerArgs:
401
404
  )
402
405
  self.page_size = 128
403
406
 
407
+ if self.attention_backend == "trtllm_mla":
408
+ if not is_sm100_supported():
409
+ raise ValueError(
410
+ "TRTLLM MLA backend is only supported on Blackwell GPUs (SM100). Please use a different backend."
411
+ )
412
+
413
+ if self.page_size not in [32, 64]:
414
+ logger.warning(
415
+ f"TensorRT-LLM MLA only supports page_size of 32 or 64, changing page_size from {self.page_size} to 64."
416
+ )
417
+ self.page_size = 64
418
+ if self.speculative_algorithm is not None:
419
+ raise ValueError(
420
+ "trtllm_mla backend does not support speculative decoding yet."
421
+ )
422
+
404
423
  # Set page size
405
424
  if self.page_size is None:
406
425
  self.page_size = 1
@@ -436,10 +455,11 @@ class ServerArgs:
436
455
  self.quantization == "modelopt_fp4"
437
456
  ), "modelopt_fp4 quantization is required for Flashinfer MOE"
438
457
  os.environ["TRTLLM_ENABLE_PDL"] = "1"
439
-
440
- if self.enable_flashinfer_trtllm_moe:
441
- assert self.enable_ep_moe, "EP MoE is required for Flashinfer TRTLLM MOE"
442
- logger.warning(f"Flashinfer TRTLLM MoE is enabled.")
458
+ if self.enable_ep_moe:
459
+ self.ep_size = self.tp_size
460
+ logger.warning(
461
+ f"Flashinfer cutlass MoE and EP MoE are enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
462
+ )
443
463
 
444
464
  # DeepEP MoE
445
465
  if self.enable_deepep_moe:
@@ -1219,6 +1239,7 @@ class ServerArgs:
1219
1239
  "torch_native",
1220
1240
  "ascend",
1221
1241
  "triton",
1242
+ "trtllm_mla",
1222
1243
  ],
1223
1244
  default=ServerArgs.attention_backend,
1224
1245
  help="Choose the kernels for attention layers.",
@@ -1467,10 +1488,18 @@ class ServerArgs:
1467
1488
  default=ServerArgs.hicache_io_backend,
1468
1489
  help="The IO backend for KV cache transfer between CPU and GPU",
1469
1490
  )
1491
+ parser.add_argument(
1492
+ "--hicache-mem-layout",
1493
+ type=str,
1494
+ choices=["layer_first", "page_first"],
1495
+ default=ServerArgs.hicache_mem_layout,
1496
+ help="The layout of host memory pool for hierarchical cache.",
1497
+ )
1498
+
1470
1499
  parser.add_argument(
1471
1500
  "--hicache-storage-backend",
1472
1501
  type=str,
1473
- choices=["file", "mooncake", "hf3fs"],
1502
+ choices=["file", "mooncake", "hf3fs", "nixl"],
1474
1503
  default=ServerArgs.hicache_storage_backend,
1475
1504
  help="The storage backend for hierarchical KV cache.",
1476
1505
  )
@@ -1545,6 +1574,11 @@ class ServerArgs:
1545
1574
  action="store_true",
1546
1575
  help="Enable profiling of cuda graph capture.",
1547
1576
  )
1577
+ parser.add_argument(
1578
+ "--enable-cudagraph-gc",
1579
+ action="store_true",
1580
+ help="Enable garbage collection during CUDA graph capture. If disabled (default), GC is frozen during capture to speed up the process.",
1581
+ )
1548
1582
  parser.add_argument(
1549
1583
  "--enable-nccl-nvls",
1550
1584
  action="store_true",
sglang/srt/utils.py CHANGED
@@ -2344,6 +2344,7 @@ def is_fa3_default_architecture(hf_config):
2344
2344
  "Qwen3ForCausalLM",
2345
2345
  "Qwen3MoeForCausalLM",
2346
2346
  "Glm4MoeForCausalLM",
2347
+ "Step3VLForConditionalGeneration",
2347
2348
  }
2348
2349
  return architectures[0] in default_archs
2349
2350