sglang 0.4.9.post6__py3-none-any.whl → 0.4.10.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_offline_throughput.py +20 -0
- sglang/bench_one_batch.py +3 -0
- sglang/srt/configs/__init__.py +8 -0
- sglang/srt/configs/model_config.py +4 -0
- sglang/srt/configs/step3_vl.py +172 -0
- sglang/srt/conversation.py +23 -0
- sglang/srt/disaggregation/decode.py +2 -8
- sglang/srt/disaggregation/launch_lb.py +5 -20
- sglang/srt/disaggregation/mooncake/conn.py +33 -15
- sglang/srt/disaggregation/prefill.py +2 -6
- sglang/srt/distributed/parallel_state.py +86 -1
- sglang/srt/entrypoints/engine.py +14 -18
- sglang/srt/entrypoints/http_server.py +10 -2
- sglang/srt/entrypoints/openai/serving_chat.py +2 -21
- sglang/srt/eplb/expert_distribution.py +5 -0
- sglang/srt/eplb/expert_location.py +17 -6
- sglang/srt/eplb/expert_location_dispatch.py +1 -0
- sglang/srt/eplb/expert_location_updater.py +2 -0
- sglang/srt/function_call/function_call_parser.py +2 -0
- sglang/srt/function_call/step3_detector.py +436 -0
- sglang/srt/hf_transformers_utils.py +2 -0
- sglang/srt/jinja_template_utils.py +4 -1
- sglang/srt/layers/attention/trtllm_mla_backend.py +372 -0
- sglang/srt/layers/attention/utils.py +6 -1
- sglang/srt/layers/moe/cutlass_moe.py +2 -1
- sglang/srt/layers/moe/ep_moe/layer.py +39 -674
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +26 -13
- sglang/srt/layers/moe/fused_moe_triton/layer.py +152 -39
- sglang/srt/layers/quantization/fp8.py +52 -18
- sglang/srt/layers/quantization/unquant.py +0 -8
- sglang/srt/layers/quantization/w4afp8.py +1 -0
- sglang/srt/layers/quantization/w8a8_int8.py +4 -1
- sglang/srt/managers/cache_controller.py +165 -67
- sglang/srt/managers/data_parallel_controller.py +2 -0
- sglang/srt/managers/io_struct.py +0 -2
- sglang/srt/managers/scheduler.py +90 -671
- sglang/srt/managers/scheduler_metrics_mixin.py +229 -0
- sglang/srt/managers/scheduler_profiler_mixin.py +279 -0
- sglang/srt/managers/scheduler_update_weights_mixin.py +142 -0
- sglang/srt/managers/template_manager.py +62 -19
- sglang/srt/managers/tokenizer_manager.py +123 -74
- sglang/srt/managers/tp_worker.py +4 -0
- sglang/srt/managers/tp_worker_overlap_thread.py +2 -1
- sglang/srt/mem_cache/hicache_storage.py +60 -17
- sglang/srt/mem_cache/hiradix_cache.py +36 -8
- sglang/srt/mem_cache/memory_pool.py +15 -118
- sglang/srt/mem_cache/memory_pool_host.py +418 -29
- sglang/srt/mem_cache/mooncake_store/mooncake_store.py +264 -0
- sglang/srt/mem_cache/mooncake_store/unit_test.py +40 -0
- sglang/srt/mem_cache/nixl/hicache_nixl.py +163 -0
- sglang/srt/mem_cache/nixl/nixl_utils.py +238 -0
- sglang/srt/mem_cache/nixl/test_hicache_nixl_storage.py +216 -0
- sglang/srt/mem_cache/storage/hf3fs/client_hf3fs.py +183 -0
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +278 -0
- sglang/srt/mem_cache/storage/hf3fs/test_hf3fs_utils.py +43 -0
- sglang/srt/model_executor/cuda_graph_runner.py +25 -1
- sglang/srt/model_executor/model_runner.py +13 -1
- sglang/srt/model_loader/weight_utils.py +2 -0
- sglang/srt/models/arcee.py +532 -0
- sglang/srt/models/deepseek_v2.py +7 -6
- sglang/srt/models/glm4_moe.py +6 -4
- sglang/srt/models/granitemoe.py +3 -0
- sglang/srt/models/grok.py +3 -0
- sglang/srt/models/hunyuan.py +1 -0
- sglang/srt/models/llama4.py +3 -0
- sglang/srt/models/mixtral.py +3 -0
- sglang/srt/models/olmoe.py +3 -0
- sglang/srt/models/phimoe.py +1 -0
- sglang/srt/models/step3_vl.py +991 -0
- sglang/srt/multimodal/processors/base_processor.py +15 -16
- sglang/srt/multimodal/processors/step3_vl.py +515 -0
- sglang/srt/reasoning_parser.py +2 -1
- sglang/srt/server_args.py +49 -18
- sglang/srt/speculative/eagle_worker.py +2 -0
- sglang/srt/utils.py +1 -0
- sglang/test/attention/test_trtllm_mla_backend.py +945 -0
- sglang/utils.py +0 -11
- sglang/version.py +1 -1
- {sglang-0.4.9.post6.dist-info → sglang-0.4.10.post1.dist-info}/METADATA +3 -4
- {sglang-0.4.9.post6.dist-info → sglang-0.4.10.post1.dist-info}/RECORD +83 -65
- {sglang-0.4.9.post6.dist-info → sglang-0.4.10.post1.dist-info}/WHEEL +0 -0
- {sglang-0.4.9.post6.dist-info → sglang-0.4.10.post1.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.9.post6.dist-info → sglang-0.4.10.post1.dist-info}/top_level.txt +0 -0
sglang/srt/server_args.py
CHANGED
@@ -24,6 +24,7 @@ import tempfile
|
|
24
24
|
from typing import List, Literal, Optional, Union
|
25
25
|
|
26
26
|
from sglang.srt.hf_transformers_utils import check_gguf_file, get_config
|
27
|
+
from sglang.srt.layers.utils import is_sm100_supported
|
27
28
|
from sglang.srt.lora.lora_registry import LoRARef
|
28
29
|
from sglang.srt.reasoning_parser import ReasoningParser
|
29
30
|
from sglang.srt.utils import (
|
@@ -197,7 +198,8 @@ class ServerArgs:
|
|
197
198
|
hicache_ratio: float = 2.0
|
198
199
|
hicache_size: int = 0
|
199
200
|
hicache_write_policy: str = "write_through_selective"
|
200
|
-
hicache_io_backend: str = ""
|
201
|
+
hicache_io_backend: str = "kernel"
|
202
|
+
hicache_mem_layout: str = "layer_first"
|
201
203
|
hicache_storage_backend: Optional[str] = None
|
202
204
|
|
203
205
|
# Double Sparsity
|
@@ -215,6 +217,7 @@ class ServerArgs:
|
|
215
217
|
disable_cuda_graph: bool = False
|
216
218
|
disable_cuda_graph_padding: bool = False
|
217
219
|
enable_profile_cuda_graph: bool = False
|
220
|
+
enable_cudagraph_gc: bool = False
|
218
221
|
enable_nccl_nvls: bool = False
|
219
222
|
enable_tokenizer_batch_encode: bool = False
|
220
223
|
disable_outlines_disk_cache: bool = False
|
@@ -270,14 +273,6 @@ class ServerArgs:
|
|
270
273
|
sm_group_num: int = 3
|
271
274
|
|
272
275
|
def __post_init__(self):
|
273
|
-
# Expert parallelism
|
274
|
-
# We put it here first due to some internal ckpt conversation issues.
|
275
|
-
if self.enable_ep_moe:
|
276
|
-
self.ep_size = self.tp_size
|
277
|
-
logger.warning(
|
278
|
-
f"EP MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
|
279
|
-
)
|
280
|
-
|
281
276
|
# Set missing default values
|
282
277
|
if self.tokenizer_path is None:
|
283
278
|
self.tokenizer_path = self.model_path
|
@@ -409,6 +404,22 @@ class ServerArgs:
|
|
409
404
|
)
|
410
405
|
self.page_size = 128
|
411
406
|
|
407
|
+
if self.attention_backend == "trtllm_mla":
|
408
|
+
if not is_sm100_supported():
|
409
|
+
raise ValueError(
|
410
|
+
"TRTLLM MLA backend is only supported on Blackwell GPUs (SM100). Please use a different backend."
|
411
|
+
)
|
412
|
+
|
413
|
+
if self.page_size not in [32, 64]:
|
414
|
+
logger.warning(
|
415
|
+
f"TensorRT-LLM MLA only supports page_size of 32 or 64, changing page_size from {self.page_size} to 64."
|
416
|
+
)
|
417
|
+
self.page_size = 64
|
418
|
+
if self.speculative_algorithm is not None:
|
419
|
+
raise ValueError(
|
420
|
+
"trtllm_mla backend does not support speculative decoding yet."
|
421
|
+
)
|
422
|
+
|
412
423
|
# Set page size
|
413
424
|
if self.page_size is None:
|
414
425
|
self.page_size = 1
|
@@ -444,10 +455,11 @@ class ServerArgs:
|
|
444
455
|
self.quantization == "modelopt_fp4"
|
445
456
|
), "modelopt_fp4 quantization is required for Flashinfer MOE"
|
446
457
|
os.environ["TRTLLM_ENABLE_PDL"] = "1"
|
447
|
-
|
448
|
-
|
449
|
-
|
450
|
-
|
458
|
+
if self.enable_ep_moe:
|
459
|
+
self.ep_size = self.tp_size
|
460
|
+
logger.warning(
|
461
|
+
f"Flashinfer cutlass MoE and EP MoE are enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
|
462
|
+
)
|
451
463
|
|
452
464
|
# DeepEP MoE
|
453
465
|
if self.enable_deepep_moe:
|
@@ -1117,9 +1129,10 @@ class ServerArgs:
|
|
1117
1129
|
"kimi_k2",
|
1118
1130
|
"qwen3_coder",
|
1119
1131
|
"glm45",
|
1132
|
+
"step3",
|
1120
1133
|
],
|
1121
1134
|
default=ServerArgs.tool_call_parser,
|
1122
|
-
help="Specify the parser for handling tool-call interactions. Options include: 'qwen25', 'mistral', 'llama3', 'deepseekv3', 'pythonic', 'kimi_k2', and 'qwen3_coder'.",
|
1135
|
+
help="Specify the parser for handling tool-call interactions. Options include: 'qwen25', 'mistral', 'llama3', 'deepseekv3', 'pythonic', 'kimi_k2', 'qwen3_coder', 'glm45', and 'step3'.",
|
1123
1136
|
)
|
1124
1137
|
|
1125
1138
|
# Data parallelism
|
@@ -1226,6 +1239,7 @@ class ServerArgs:
|
|
1226
1239
|
"torch_native",
|
1227
1240
|
"ascend",
|
1228
1241
|
"triton",
|
1242
|
+
"trtllm_mla",
|
1229
1243
|
],
|
1230
1244
|
default=ServerArgs.attention_backend,
|
1231
1245
|
help="Choose the kernels for attention layers.",
|
@@ -1334,6 +1348,7 @@ class ServerArgs:
|
|
1334
1348
|
parser.add_argument(
|
1335
1349
|
"--expert-parallel-size",
|
1336
1350
|
"--ep-size",
|
1351
|
+
"--ep",
|
1337
1352
|
type=int,
|
1338
1353
|
default=ServerArgs.ep_size,
|
1339
1354
|
help="The expert parallelism size.",
|
@@ -1473,10 +1488,18 @@ class ServerArgs:
|
|
1473
1488
|
default=ServerArgs.hicache_io_backend,
|
1474
1489
|
help="The IO backend for KV cache transfer between CPU and GPU",
|
1475
1490
|
)
|
1491
|
+
parser.add_argument(
|
1492
|
+
"--hicache-mem-layout",
|
1493
|
+
type=str,
|
1494
|
+
choices=["layer_first", "page_first"],
|
1495
|
+
default=ServerArgs.hicache_mem_layout,
|
1496
|
+
help="The layout of host memory pool for hierarchical cache.",
|
1497
|
+
)
|
1498
|
+
|
1476
1499
|
parser.add_argument(
|
1477
1500
|
"--hicache-storage-backend",
|
1478
1501
|
type=str,
|
1479
|
-
choices=["file"],
|
1502
|
+
choices=["file", "mooncake", "hf3fs", "nixl"],
|
1480
1503
|
default=ServerArgs.hicache_storage_backend,
|
1481
1504
|
help="The storage backend for hierarchical KV cache.",
|
1482
1505
|
)
|
@@ -1551,6 +1574,11 @@ class ServerArgs:
|
|
1551
1574
|
action="store_true",
|
1552
1575
|
help="Enable profiling of cuda graph capture.",
|
1553
1576
|
)
|
1577
|
+
parser.add_argument(
|
1578
|
+
"--enable-cudagraph-gc",
|
1579
|
+
action="store_true",
|
1580
|
+
help="Enable garbage collection during CUDA graph capture. If disabled (default), GC is frozen during capture to speed up the process.",
|
1581
|
+
)
|
1554
1582
|
parser.add_argument(
|
1555
1583
|
"--enable-nccl-nvls",
|
1556
1584
|
action="store_true",
|
@@ -2071,6 +2099,9 @@ class PortArgs:
|
|
2071
2099
|
|
2072
2100
|
dist_init_host, dist_init_port = dist_init_addr
|
2073
2101
|
port_base = int(dist_init_port) + 1
|
2102
|
+
detokenizer_port = port_base + 1
|
2103
|
+
rpc_port = port_base + 2
|
2104
|
+
metrics_ipc_name = port_base + 3
|
2074
2105
|
if dp_rank is None:
|
2075
2106
|
# TokenizerManager to DataParallelController
|
2076
2107
|
scheduler_input_port = port_base + 4
|
@@ -2080,10 +2111,10 @@ class PortArgs:
|
|
2080
2111
|
return PortArgs(
|
2081
2112
|
tokenizer_ipc_name=f"tcp://{dist_init_host}:{port_base}",
|
2082
2113
|
scheduler_input_ipc_name=f"tcp://{dist_init_host}:{scheduler_input_port}",
|
2083
|
-
detokenizer_ipc_name=f"tcp://{dist_init_host}:{port_base + 1}",
|
2114
|
+
detokenizer_ipc_name=f"tcp://{dist_init_host}:{detokenizer_port}",
|
2084
2115
|
nccl_port=nccl_port,
|
2085
|
-
rpc_ipc_name=f"tcp://{dist_init_host}:{port_base + 2}",
|
2086
|
-
metrics_ipc_name=f"tcp://{dist_init_host}:{port_base + 3}",
|
2116
|
+
rpc_ipc_name=f"tcp://{dist_init_host}:{rpc_port}",
|
2117
|
+
metrics_ipc_name=f"tcp://{dist_init_host}:{metrics_ipc_name}",
|
2087
2118
|
)
|
2088
2119
|
|
2089
2120
|
|
sglang/srt/speculative/eagle_worker.py
CHANGED
@@ -73,6 +73,7 @@ class EAGLEWorker(TpModelWorker):
|
|
73
73
|
gpu_id: int,
|
74
74
|
tp_rank: int,
|
75
75
|
dp_rank: Optional[int],
|
76
|
+
moe_ep_rank: int,
|
76
77
|
nccl_port: int,
|
77
78
|
target_worker: TpModelWorker,
|
78
79
|
):
|
@@ -127,6 +128,7 @@ class EAGLEWorker(TpModelWorker):
|
|
127
128
|
tp_rank=tp_rank,
|
128
129
|
pp_rank=0, # FIXME
|
129
130
|
dp_rank=dp_rank,
|
131
|
+
moe_ep_rank=moe_ep_rank,
|
130
132
|
nccl_port=nccl_port,
|
131
133
|
is_draft_worker=True,
|
132
134
|
req_to_token_pool=self.req_to_token_pool,
|
sglang/srt/utils.py
CHANGED