sglang 0.4.9.post6__py3-none-any.whl → 0.4.10.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (83)
  1. sglang/bench_offline_throughput.py +20 -0
  2. sglang/bench_one_batch.py +3 -0
  3. sglang/srt/configs/__init__.py +8 -0
  4. sglang/srt/configs/model_config.py +4 -0
  5. sglang/srt/configs/step3_vl.py +172 -0
  6. sglang/srt/conversation.py +23 -0
  7. sglang/srt/disaggregation/decode.py +2 -8
  8. sglang/srt/disaggregation/launch_lb.py +5 -20
  9. sglang/srt/disaggregation/mooncake/conn.py +33 -15
  10. sglang/srt/disaggregation/prefill.py +2 -6
  11. sglang/srt/distributed/parallel_state.py +86 -1
  12. sglang/srt/entrypoints/engine.py +14 -18
  13. sglang/srt/entrypoints/http_server.py +10 -2
  14. sglang/srt/entrypoints/openai/serving_chat.py +2 -21
  15. sglang/srt/eplb/expert_distribution.py +5 -0
  16. sglang/srt/eplb/expert_location.py +17 -6
  17. sglang/srt/eplb/expert_location_dispatch.py +1 -0
  18. sglang/srt/eplb/expert_location_updater.py +2 -0
  19. sglang/srt/function_call/function_call_parser.py +2 -0
  20. sglang/srt/function_call/step3_detector.py +436 -0
  21. sglang/srt/hf_transformers_utils.py +2 -0
  22. sglang/srt/jinja_template_utils.py +4 -1
  23. sglang/srt/layers/attention/trtllm_mla_backend.py +372 -0
  24. sglang/srt/layers/attention/utils.py +6 -1
  25. sglang/srt/layers/moe/cutlass_moe.py +2 -1
  26. sglang/srt/layers/moe/ep_moe/layer.py +39 -674
  27. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +26 -13
  28. sglang/srt/layers/moe/fused_moe_triton/layer.py +152 -39
  29. sglang/srt/layers/quantization/fp8.py +52 -18
  30. sglang/srt/layers/quantization/unquant.py +0 -8
  31. sglang/srt/layers/quantization/w4afp8.py +1 -0
  32. sglang/srt/layers/quantization/w8a8_int8.py +4 -1
  33. sglang/srt/managers/cache_controller.py +165 -67
  34. sglang/srt/managers/data_parallel_controller.py +2 -0
  35. sglang/srt/managers/io_struct.py +0 -2
  36. sglang/srt/managers/scheduler.py +90 -671
  37. sglang/srt/managers/scheduler_metrics_mixin.py +229 -0
  38. sglang/srt/managers/scheduler_profiler_mixin.py +279 -0
  39. sglang/srt/managers/scheduler_update_weights_mixin.py +142 -0
  40. sglang/srt/managers/template_manager.py +62 -19
  41. sglang/srt/managers/tokenizer_manager.py +123 -74
  42. sglang/srt/managers/tp_worker.py +4 -0
  43. sglang/srt/managers/tp_worker_overlap_thread.py +2 -1
  44. sglang/srt/mem_cache/hicache_storage.py +60 -17
  45. sglang/srt/mem_cache/hiradix_cache.py +36 -8
  46. sglang/srt/mem_cache/memory_pool.py +15 -118
  47. sglang/srt/mem_cache/memory_pool_host.py +418 -29
  48. sglang/srt/mem_cache/mooncake_store/mooncake_store.py +264 -0
  49. sglang/srt/mem_cache/mooncake_store/unit_test.py +40 -0
  50. sglang/srt/mem_cache/nixl/hicache_nixl.py +163 -0
  51. sglang/srt/mem_cache/nixl/nixl_utils.py +238 -0
  52. sglang/srt/mem_cache/nixl/test_hicache_nixl_storage.py +216 -0
  53. sglang/srt/mem_cache/storage/hf3fs/client_hf3fs.py +183 -0
  54. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +278 -0
  55. sglang/srt/mem_cache/storage/hf3fs/test_hf3fs_utils.py +43 -0
  56. sglang/srt/model_executor/cuda_graph_runner.py +25 -1
  57. sglang/srt/model_executor/model_runner.py +13 -1
  58. sglang/srt/model_loader/weight_utils.py +2 -0
  59. sglang/srt/models/arcee.py +532 -0
  60. sglang/srt/models/deepseek_v2.py +7 -6
  61. sglang/srt/models/glm4_moe.py +6 -4
  62. sglang/srt/models/granitemoe.py +3 -0
  63. sglang/srt/models/grok.py +3 -0
  64. sglang/srt/models/hunyuan.py +1 -0
  65. sglang/srt/models/llama4.py +3 -0
  66. sglang/srt/models/mixtral.py +3 -0
  67. sglang/srt/models/olmoe.py +3 -0
  68. sglang/srt/models/phimoe.py +1 -0
  69. sglang/srt/models/step3_vl.py +991 -0
  70. sglang/srt/multimodal/processors/base_processor.py +15 -16
  71. sglang/srt/multimodal/processors/step3_vl.py +515 -0
  72. sglang/srt/reasoning_parser.py +2 -1
  73. sglang/srt/server_args.py +49 -18
  74. sglang/srt/speculative/eagle_worker.py +2 -0
  75. sglang/srt/utils.py +1 -0
  76. sglang/test/attention/test_trtllm_mla_backend.py +945 -0
  77. sglang/utils.py +0 -11
  78. sglang/version.py +1 -1
  79. {sglang-0.4.9.post6.dist-info → sglang-0.4.10.post1.dist-info}/METADATA +3 -4
  80. {sglang-0.4.9.post6.dist-info → sglang-0.4.10.post1.dist-info}/RECORD +83 -65
  81. {sglang-0.4.9.post6.dist-info → sglang-0.4.10.post1.dist-info}/WHEEL +0 -0
  82. {sglang-0.4.9.post6.dist-info → sglang-0.4.10.post1.dist-info}/licenses/LICENSE +0 -0
  83. {sglang-0.4.9.post6.dist-info → sglang-0.4.10.post1.dist-info}/top_level.txt +0 -0
sglang/srt/server_args.py CHANGED
@@ -24,6 +24,7 @@ import tempfile
 from typing import List, Literal, Optional, Union
 
 from sglang.srt.hf_transformers_utils import check_gguf_file, get_config
+from sglang.srt.layers.utils import is_sm100_supported
 from sglang.srt.lora.lora_registry import LoRARef
 from sglang.srt.reasoning_parser import ReasoningParser
 from sglang.srt.utils import (
@@ -197,7 +198,8 @@ class ServerArgs:
     hicache_ratio: float = 2.0
     hicache_size: int = 0
     hicache_write_policy: str = "write_through_selective"
-    hicache_io_backend: str = ""
+    hicache_io_backend: str = "kernel"
+    hicache_mem_layout: str = "layer_first"
     hicache_storage_backend: Optional[str] = None
 
     # Double Sparsity
@@ -215,6 +217,7 @@ class ServerArgs:
     disable_cuda_graph: bool = False
     disable_cuda_graph_padding: bool = False
     enable_profile_cuda_graph: bool = False
+    enable_cudagraph_gc: bool = False
     enable_nccl_nvls: bool = False
     enable_tokenizer_batch_encode: bool = False
     disable_outlines_disk_cache: bool = False
@@ -270,14 +273,6 @@ class ServerArgs:
     sm_group_num: int = 3
 
     def __post_init__(self):
-        # Expert parallelism
-        # We put it here first due to some internal ckpt conversation issues.
-        if self.enable_ep_moe:
-            self.ep_size = self.tp_size
-            logger.warning(
-                f"EP MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
-            )
-
         # Set missing default values
         if self.tokenizer_path is None:
             self.tokenizer_path = self.model_path
@@ -409,6 +404,22 @@ class ServerArgs:
             )
             self.page_size = 128
 
+        if self.attention_backend == "trtllm_mla":
+            if not is_sm100_supported():
+                raise ValueError(
+                    "TRTLLM MLA backend is only supported on Blackwell GPUs (SM100). Please use a different backend."
+                )
+
+            if self.page_size not in [32, 64]:
+                logger.warning(
+                    f"TensorRT-LLM MLA only supports page_size of 32 or 64, changing page_size from {self.page_size} to 64."
+                )
+                self.page_size = 64
+            if self.speculative_algorithm is not None:
+                raise ValueError(
+                    "trtllm_mla backend does not support speculative decoding yet."
+                )
+
         # Set page size
         if self.page_size is None:
             self.page_size = 1
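
Taken together, the new checks mean a trtllm_mla launch only succeeds on SM100 (Blackwell) hardware, coerces the page size to 64 when it is not 32 or 64, and rejects speculative decoding. A minimal offline sketch, assuming sgl.Engine forwards these ServerArgs fields as keyword arguments (the model path is a placeholder, not a tested recommendation):

    import sglang as sgl

    # Sketch under assumptions: the host GPU is SM100-class and the
    # placeholder model uses MLA-style attention.
    llm = sgl.Engine(
        model_path="deepseek-ai/DeepSeek-V2-Lite",  # placeholder MLA model
        attention_backend="trtllm_mla",             # new backend choice in 0.4.10
        page_size=64,                               # only 32 or 64 are accepted
    )
    print(llm.generate("The capital of France is", {"max_new_tokens": 8}))
    llm.shutdown()
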
@@ -444,10 +455,11 @@ class ServerArgs:
                 self.quantization == "modelopt_fp4"
             ), "modelopt_fp4 quantization is required for Flashinfer MOE"
             os.environ["TRTLLM_ENABLE_PDL"] = "1"
-
-        if self.enable_flashinfer_trtllm_moe:
-            assert self.enable_ep_moe, "EP MoE is required for Flashinfer TRTLLM MOE"
-            logger.warning(f"Flashinfer TRTLLM MoE is enabled.")
+            if self.enable_ep_moe:
+                self.ep_size = self.tp_size
+                logger.warning(
+                    f"Flashinfer cutlass MoE and EP MoE are enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
+                )
 
         # DeepEP MoE
         if self.enable_deepep_moe:
@@ -1117,9 +1129,10 @@ class ServerArgs:
                 "kimi_k2",
                 "qwen3_coder",
                 "glm45",
+                "step3",
             ],
             default=ServerArgs.tool_call_parser,
-            help="Specify the parser for handling tool-call interactions. Options include: 'qwen25', 'mistral', 'llama3', 'deepseekv3', 'pythonic', 'kimi_k2', and 'qwen3_coder'.",
+            help="Specify the parser for handling tool-call interactions. Options include: 'qwen25', 'mistral', 'llama3', 'deepseekv3', 'pythonic', 'kimi_k2', 'qwen3_coder', 'glm45', and 'step3'.",
         )
 
         # Data parallelism
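
With the step3 parser registered, a Step3 deployment can be exercised through the OpenAI-compatible endpoint like any other tool-calling model. A hedged client-side sketch, assuming a server launched with --tool-call-parser step3 on the default port; the tool definition is purely illustrative:

    from openai import OpenAI

    # Assumes: python -m sglang.launch_server --model-path <step3 checkpoint> \
    #              --tool-call-parser step3 --port 30000
    client = OpenAI(base_url="http://localhost:30000/v1", api_key="EMPTY")

    tools = [{
        "type": "function",
        "function": {
            "name": "get_weather",  # illustrative tool, not part of sglang
            "parameters": {
                "type": "object",
                "properties": {"city": {"type": "string"}},
                "required": ["city"],
            },
        },
    }]

    resp = client.chat.completions.create(
        model="default",
        messages=[{"role": "user", "content": "What's the weather in Paris?"}],
        tools=tools,
    )
    print(resp.choices[0].message.tool_calls)
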
@@ -1226,6 +1239,7 @@ class ServerArgs:
                 "torch_native",
                 "ascend",
                 "triton",
+                "trtllm_mla",
             ],
             default=ServerArgs.attention_backend,
             help="Choose the kernels for attention layers.",
@@ -1334,6 +1348,7 @@ class ServerArgs:
         parser.add_argument(
             "--expert-parallel-size",
             "--ep-size",
+            "--ep",
             type=int,
             default=ServerArgs.ep_size,
             help="The expert parallelism size.",
@@ -1473,10 +1488,18 @@ class ServerArgs:
             default=ServerArgs.hicache_io_backend,
             help="The IO backend for KV cache transfer between CPU and GPU",
         )
+        parser.add_argument(
+            "--hicache-mem-layout",
+            type=str,
+            choices=["layer_first", "page_first"],
+            default=ServerArgs.hicache_mem_layout,
+            help="The layout of host memory pool for hierarchical cache.",
+        )
+
         parser.add_argument(
             "--hicache-storage-backend",
             type=str,
-            choices=["file"], # todo, mooncake
+            choices=["file", "mooncake", "hf3fs", "nixl"],
             default=ServerArgs.hicache_storage_backend,
             help="The storage backend for hierarchical KV cache.",
         )
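
The hierarchical-cache surface now spans host memory layout, IO backend, and storage backend. A hedged sketch of how the new knobs might be combined in an offline engine, assuming Engine kwargs mirror the CLI flags; whether the mooncake/hf3fs/nixl backends are usable depends on optional dependencies in your install:

    import sglang as sgl

    # Sketch only: flag names come straight from this diff; the model
    # path is a placeholder.
    llm = sgl.Engine(
        model_path="Qwen/Qwen2.5-7B-Instruct",  # placeholder model
        enable_hierarchical_cache=True,
        hicache_mem_layout="page_first",        # new flag; default stays "layer_first"
        hicache_io_backend="kernel",            # new default value
        hicache_storage_backend="file",         # "mooncake" / "hf3fs" / "nixl" also listed
    )
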
@@ -1551,6 +1574,11 @@ class ServerArgs:
             action="store_true",
             help="Enable profiling of cuda graph capture.",
         )
+        parser.add_argument(
+            "--enable-cudagraph-gc",
+            action="store_true",
+            help="Enable garbage collection during CUDA graph capture. If disabled (default), GC is frozen during capture to speed up the process.",
+        )
         parser.add_argument(
             "--enable-nccl-nvls",
             action="store_true",
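
The help text describes the default behavior: with the flag off, the Python garbage collector is frozen for the duration of graph capture. A minimal sketch of that policy using the standard gc module (the real logic lives in cuda_graph_runner.py, also touched in this release; the function name here is hypothetical):

    import gc

    def capture_with_gc_policy(capture_fn, enable_cudagraph_gc: bool = False):
        """Run capture_fn under the GC policy the new flag selects (sketch)."""
        if enable_cudagraph_gc:
            return capture_fn()  # leave the collector running as usual
        gc.collect()             # clean up garbage once up front
        gc.freeze()              # move survivors to the permanent generation
        try:
            return capture_fn()  # capture without collector pauses
        finally:
            gc.unfreeze()
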
@@ -2071,6 +2099,9 @@ class PortArgs:
 
         dist_init_host, dist_init_port = dist_init_addr
         port_base = int(dist_init_port) + 1
+        detokenizer_port = port_base + 1
+        rpc_port = port_base + 2
+        metrics_ipc_name = port_base + 3
         if dp_rank is None:
             # TokenizerManager to DataParallelController
             scheduler_input_port = port_base + 4
@@ -2080,10 +2111,10 @@ class PortArgs:
         return PortArgs(
             tokenizer_ipc_name=f"tcp://{dist_init_host}:{port_base}",
             scheduler_input_ipc_name=f"tcp://{dist_init_host}:{scheduler_input_port}",
-            detokenizer_ipc_name=f"tcp://{dist_init_host}:{port_base + 1}",
+            detokenizer_ipc_name=f"tcp://{dist_init_host}:{detokenizer_port}",
             nccl_port=nccl_port,
-            rpc_ipc_name=f"tcp://{dist_init_host}:{port_base + 2}",
-            metrics_ipc_name=f"tcp://{dist_init_host}:{port_base + 3}",
+            rpc_ipc_name=f"tcp://{dist_init_host}:{rpc_port}",
+            metrics_ipc_name=f"tcp://{dist_init_host}:{metrics_ipc_name}",
         )
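This refactor only names what the old arithmetic computed inline (note that the new metrics_ipc_name local actually holds a port number, despite its name). A worked example of the resulting port layout, assuming dist_init_addr resolves to host 10.0.0.1 and port 5000 with dp_rank unset:

    port_base = 5000 + 1                  # 5001 -> tokenizer_ipc_name
    detokenizer_port = port_base + 1      # 5002 -> detokenizer_ipc_name
    rpc_port = port_base + 2              # 5003 -> rpc_ipc_name
    metrics_ipc_name = port_base + 3      # 5004 -> metrics_ipc_name
    scheduler_input_port = port_base + 4  # 5005 -> scheduler_input_ipc_name
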
sglang/srt/speculative/eagle_worker.py CHANGED
@@ -73,6 +73,7 @@ class EAGLEWorker(TpModelWorker):
         gpu_id: int,
         tp_rank: int,
         dp_rank: Optional[int],
+        moe_ep_rank: int,
         nccl_port: int,
         target_worker: TpModelWorker,
     ):
@@ -127,6 +128,7 @@ class EAGLEWorker(TpModelWorker):
             tp_rank=tp_rank,
             pp_rank=0, # FIXME
             dp_rank=dp_rank,
+            moe_ep_rank=moe_ep_rank,
             nccl_port=nccl_port,
             is_draft_worker=True,
             req_to_token_pool=self.req_to_token_pool,
sglang/srt/utils.py CHANGED
@@ -2344,6 +2344,7 @@ def is_fa3_default_architecture(hf_config):
         "Qwen3ForCausalLM",
         "Qwen3MoeForCausalLM",
         "Glm4MoeForCausalLM",
+        "Step3VLForConditionalGeneration",
     }
     return architectures[0] in default_archs