sglang 0.5.1.post2__py3-none-any.whl → 0.5.2rc0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +3 -0
- sglang/bench_one_batch_server.py +79 -53
- sglang/bench_serving.py +186 -14
- sglang/profiler.py +0 -1
- sglang/srt/configs/__init__.py +2 -0
- sglang/srt/configs/longcat_flash.py +104 -0
- sglang/srt/configs/model_config.py +12 -0
- sglang/srt/connector/__init__.py +1 -1
- sglang/srt/connector/base_connector.py +1 -2
- sglang/srt/connector/redis.py +2 -2
- sglang/srt/connector/serde/__init__.py +1 -1
- sglang/srt/connector/serde/safe_serde.py +4 -3
- sglang/srt/conversation.py +38 -5
- sglang/srt/disaggregation/ascend/conn.py +75 -0
- sglang/srt/disaggregation/launch_lb.py +0 -13
- sglang/srt/disaggregation/mini_lb.py +33 -8
- sglang/srt/disaggregation/prefill.py +1 -1
- sglang/srt/distributed/parallel_state.py +24 -14
- sglang/srt/entrypoints/engine.py +19 -12
- sglang/srt/entrypoints/http_server.py +174 -34
- sglang/srt/entrypoints/openai/protocol.py +87 -24
- sglang/srt/entrypoints/openai/serving_chat.py +50 -9
- sglang/srt/entrypoints/openai/serving_completions.py +15 -0
- sglang/srt/eplb/eplb_manager.py +26 -2
- sglang/srt/eplb/expert_distribution.py +29 -2
- sglang/srt/function_call/deepseekv31_detector.py +222 -0
- sglang/srt/function_call/function_call_parser.py +2 -0
- sglang/srt/function_call/gpt_oss_detector.py +144 -256
- sglang/srt/harmony_parser.py +588 -0
- sglang/srt/hf_transformers_utils.py +26 -7
- sglang/srt/layers/activation.py +12 -0
- sglang/srt/layers/attention/ascend_backend.py +374 -136
- sglang/srt/layers/attention/flashattention_backend.py +241 -7
- sglang/srt/layers/attention/flashinfer_backend.py +5 -2
- sglang/srt/layers/attention/flashinfer_mla_backend.py +5 -2
- sglang/srt/layers/attention/hybrid_attn_backend.py +53 -21
- sglang/srt/layers/attention/trtllm_mla_backend.py +25 -10
- sglang/srt/layers/communicator.py +1 -2
- sglang/srt/layers/layernorm.py +28 -3
- sglang/srt/layers/linear.py +3 -2
- sglang/srt/layers/logits_processor.py +1 -1
- sglang/srt/layers/moe/cutlass_moe.py +0 -8
- sglang/srt/layers/moe/ep_moe/kernels.py +74 -0
- sglang/srt/layers/moe/ep_moe/layer.py +13 -13
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=352,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=64,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- sglang/srt/layers/moe/topk.py +35 -12
- sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py +133 -235
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +5 -10
- sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +5 -23
- sglang/srt/layers/quantization/fp8.py +2 -1
- sglang/srt/layers/quantization/fp8_kernel.py +2 -2
- sglang/srt/layers/quantization/fp8_utils.py +2 -2
- sglang/srt/layers/quantization/modelopt_quant.py +7 -0
- sglang/srt/layers/quantization/mxfp4.py +25 -27
- sglang/srt/layers/quantization/mxfp4_tensor.py +3 -1
- sglang/srt/layers/quantization/utils.py +13 -0
- sglang/srt/layers/quantization/w8a8_int8.py +7 -3
- sglang/srt/layers/rotary_embedding.py +28 -1
- sglang/srt/layers/sampler.py +29 -5
- sglang/srt/layers/utils.py +0 -14
- sglang/srt/managers/cache_controller.py +237 -204
- sglang/srt/managers/detokenizer_manager.py +48 -2
- sglang/srt/managers/io_struct.py +57 -0
- sglang/srt/managers/mm_utils.py +5 -1
- sglang/srt/managers/multi_tokenizer_mixin.py +591 -0
- sglang/srt/managers/scheduler.py +94 -9
- sglang/srt/managers/scheduler_output_processor_mixin.py +20 -18
- sglang/srt/managers/scheduler_update_weights_mixin.py +8 -1
- sglang/srt/managers/tokenizer_manager.py +122 -42
- sglang/srt/mem_cache/chunk_cache.py +1 -1
- sglang/srt/mem_cache/hicache_storage.py +51 -23
- sglang/srt/mem_cache/hiradix_cache.py +87 -71
- sglang/srt/mem_cache/lora_radix_cache.py +1 -1
- sglang/srt/mem_cache/memory_pool.py +77 -14
- sglang/srt/mem_cache/memory_pool_host.py +4 -5
- sglang/srt/mem_cache/radix_cache.py +6 -4
- sglang/srt/mem_cache/radix_cache_cpp.py +1 -1
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +38 -20
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +87 -82
- sglang/srt/mem_cache/swa_radix_cache.py +1 -1
- sglang/srt/model_executor/model_runner.py +6 -5
- sglang/srt/model_loader/loader.py +15 -24
- sglang/srt/model_loader/utils.py +12 -0
- sglang/srt/models/deepseek_v2.py +38 -13
- sglang/srt/models/gpt_oss.py +2 -15
- sglang/srt/models/llama_eagle3.py +4 -0
- sglang/srt/models/longcat_flash.py +1015 -0
- sglang/srt/models/longcat_flash_nextn.py +691 -0
- sglang/srt/models/qwen2.py +26 -3
- sglang/srt/models/qwen2_5_vl.py +66 -41
- sglang/srt/models/qwen2_moe.py +22 -2
- sglang/srt/models/transformers.py +1 -1
- sglang/srt/multimodal/processors/base_processor.py +4 -2
- sglang/srt/reasoning_parser.py +56 -300
- sglang/srt/sampling/penaltylib/orchestrator.py +14 -2
- sglang/srt/server_args.py +122 -56
- sglang/srt/speculative/eagle_worker.py +28 -8
- sglang/srt/tokenizer/tiktoken_tokenizer.py +6 -1
- sglang/srt/utils.py +73 -5
- sglang/test/attention/test_trtllm_mla_backend.py +12 -3
- sglang/version.py +1 -1
- {sglang-0.5.1.post2.dist-info → sglang-0.5.2rc0.dist-info}/METADATA +7 -6
- {sglang-0.5.1.post2.dist-info → sglang-0.5.2rc0.dist-info}/RECORD +107 -99
- {sglang-0.5.1.post2.dist-info → sglang-0.5.2rc0.dist-info}/WHEEL +0 -0
- {sglang-0.5.1.post2.dist-info → sglang-0.5.2rc0.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.1.post2.dist-info → sglang-0.5.2rc0.dist-info}/top_level.txt +0 -0
sglang/srt/server_args.py
CHANGED
@@ -25,7 +25,6 @@ from typing import List, Literal, Optional, Union
 from sglang.srt.function_call.function_call_parser import FunctionCallParser
 from sglang.srt.hf_transformers_utils import check_gguf_file, get_config
-from sglang.srt.layers.utils import is_sm90_supported, is_sm100_supported
 from sglang.srt.lora.lora_registry import LoRARef
 from sglang.srt.reasoning_parser import ReasoningParser
 from sglang.srt.utils import (
@@ -39,6 +38,8 @@ from sglang.srt.utils import (
     is_hip,
     is_port_available,
     is_remote_url,
+    is_sm90_supported,
+    is_sm100_supported,
     is_triton_kernels_available,
     is_valid_ipv6_address,
     nullable_str,
@@ -47,12 +48,87 @@ from sglang.srt.utils import (
 logger = logging.getLogger(__name__)


+# Define constants
+LOAD_FORMAT_CHOICES = [
+    "auto",
+    "pt",
+    "safetensors",
+    "npcache",
+    "dummy",
+    "sharded_state",
+    "gguf",
+    "bitsandbytes",
+    "layered",
+    "remote",
+]
+
+QUANTIZATION_CHOICES = [
+    "awq",
+    "fp8",
+    "gptq",
+    "marlin",
+    "gptq_marlin",
+    "awq_marlin",
+    "bitsandbytes",
+    "gguf",
+    "modelopt",
+    "modelopt_fp4",
+    "petit_nvfp4",
+    "w8a8_int8",
+    "w8a8_fp8",
+    "moe_wna16",
+    "qoq",
+    "w4afp8",
+    "mxfp4",
+]
+
+ATTENTION_BACKEND_CHOICES = [
+    # Common
+    "triton",
+    "torch_native",
+    # NVIDIA specific
+    "cutlass_mla",
+    "fa3",
+    "flashinfer",
+    "flashmla",
+    "trtllm_mla",
+    "trtllm_mha",
+    "dual_chunk_flash_attn",
+    # AMD specific
+    "aiter",
+    "wave",
+    # Other platforms
+    "intel_amx",
+    "ascend",
+]
+
+DISAGG_TRANSFER_BACKEND_CHOICES = ["mooncake", "nixl", "ascend", "fake"]
+
+
+# Allow external code to add more choices
+def add_load_format_choices(choices):
+    LOAD_FORMAT_CHOICES.extend(choices)
+
+
+def add_quantization_method_choices(choices):
+    QUANTIZATION_CHOICES.extend(choices)
+
+
+def add_attention_backend_choices(choices):
+    ATTENTION_BACKEND_CHOICES.extend(choices)
+
+
+def add_disagg_transfer_backend_choices(choices):
+    DISAGG_TRANSFER_BACKEND_CHOICES.extend(choices)
+
+
 @dataclasses.dataclass
 class ServerArgs:
     # Model and tokenizer
     model_path: str
     tokenizer_path: Optional[str] = None
     tokenizer_mode: str = "auto"
+    tokenizer_worker_num: int = 1
     skip_tokenizer_init: bool = False
     load_format: str = "auto"
     model_loader_extra_config: str = "{}"
@@ -199,6 +275,7 @@ class ServerArgs:
     eplb_algorithm: str = "auto"
     eplb_rebalance_num_iterations: int = 1000
     eplb_rebalance_layers_per_chunk: Optional[int] = None
+    eplb_min_rebalancing_utilization_threshold: float = 1.0
     expert_distribution_recorder_mode: Optional[
         Literal["stat", "stat_approx", "per_pass", "per_token"]
     ] = None
@@ -211,11 +288,12 @@ class ServerArgs:
     enable_hierarchical_cache: bool = False
     hicache_ratio: float = 2.0
     hicache_size: int = 0
-    hicache_write_policy: str = "
+    hicache_write_policy: str = "write_through"
     hicache_io_backend: str = "kernel"
     hicache_mem_layout: str = "layer_first"
     hicache_storage_backend: Optional[str] = None
     hicache_storage_prefetch_policy: str = "best_effort"
+    hicache_storage_backend_extra_config: Optional[str] = None

     # Double Sparsity
     enable_double_sparsity: bool = False
@@ -671,6 +749,15 @@ class ServerArgs:
         )
         self.speculative_num_draft_tokens = self.speculative_num_steps + 1

+        if (
+            self.speculative_eagle_topk > 1
+            and self.page_size > 1
+            and self.attention_backend != "flashinfer"
+        ):
+            raise ValueError(
+                "speculative_eagle_topk > 1 with page_size > 1 is unstable and produces incorrect results for paged attention backends. This combination is only supported for the 'flashinfer' backend."
+            )
+
         # The token generated from the verify step is counted.
         # If sepculative_num_steps >= speculative_num_draft_tokens, the additional tokens will definitely be discarded.
         # assert self.speculative_num_steps < self.speculative_num_draft_tokens
@@ -741,6 +828,12 @@ class ServerArgs:
             default=ServerArgs.tokenizer_path,
             help="The path of the tokenizer.",
         )
+        parser.add_argument(
+            "--tokenizer-worker-num",
+            type=int,
+            default=ServerArgs.tokenizer_worker_num,
+            help="The worker num of the tokenizer manager.",
+        )
         parser.add_argument(
             "--tokenizer-mode",
             type=str,
@@ -759,18 +852,7 @@ class ServerArgs:
             "--load-format",
             type=str,
             default=ServerArgs.load_format,
-            choices=[
-                "auto",
-                "pt",
-                "safetensors",
-                "npcache",
-                "dummy",
-                "sharded_state",
-                "gguf",
-                "bitsandbytes",
-                "layered",
-                "remote",
-            ],
+            choices=LOAD_FORMAT_CHOICES,
             help="The format of the model weights to load. "
             '"auto" will try to load the weights in the safetensors format '
             "and fall back to the pytorch bin format if safetensors format "
@@ -889,25 +971,7 @@ class ServerArgs:
             "--quantization",
             type=str,
             default=ServerArgs.quantization,
-            choices=[
-                "awq",
-                "fp8",
-                "gptq",
-                "marlin",
-                "gptq_marlin",
-                "awq_marlin",
-                "bitsandbytes",
-                "gguf",
-                "modelopt",
-                "modelopt_fp4",
-                "petit_nvfp4",
-                "w8a8_int8",
-                "w8a8_fp8",
-                "moe_wna16",
-                "qoq",
-                "w4afp8",
-                "mxfp4",
-            ],
+            choices=QUANTIZATION_CHOICES,
             help="The quantization method.",
         )
         parser.add_argument(
@@ -1357,43 +1421,24 @@ class ServerArgs:
         )

         # Kernel backend
-        ATTN_BACKENDS = [
-            # Common
-            "triton",
-            "torch_native",
-            # NVIDIA specific
-            "cutlass_mla",
-            "fa3",
-            "flashinfer",
-            "flashmla",
-            "trtllm_mla",
-            "trtllm_mha",
-            "dual_chunk_flash_attn",
-            # AMD specific
-            "aiter",
-            "wave",
-            # Other platforms
-            "intel_amx",
-            "ascend",
-        ]
         parser.add_argument(
             "--attention-backend",
             type=str,
-            choices=ATTN_BACKENDS,
+            choices=ATTENTION_BACKEND_CHOICES,
             default=ServerArgs.attention_backend,
             help="Choose the kernels for attention layers.",
         )
         parser.add_argument(
             "--prefill-attention-backend",
             type=str,
-            choices=ATTN_BACKENDS,
+            choices=ATTENTION_BACKEND_CHOICES,
             default=ServerArgs.prefill_attention_backend,
             help="Choose the kernels for prefill attention layers (have priority over --attention-backend).",
         )
         parser.add_argument(
             "--decode-attention-backend",
             type=str,
-            choices=ATTN_BACKENDS,
+            choices=ATTENTION_BACKEND_CHOICES,
             default=ServerArgs.decode_attention_backend,
             help="Choose the kernels for decode attention layers (have priority over --attention-backend).",
         )
@@ -1558,6 +1603,12 @@ class ServerArgs:
             default=ServerArgs.eplb_rebalance_layers_per_chunk,
             help="Number of layers to rebalance per forward pass.",
         )
+        parser.add_argument(
+            "--eplb-min-rebalancing-utilization-threshold",
+            type=float,
+            default=ServerArgs.eplb_min_rebalancing_utilization_threshold,
+            help="Minimum threshold for GPU average utilization to trigger EPLB rebalancing. Must be in the range [0.0, 1.0].",
+        )
         parser.add_argument(
             "--expert-distribution-recorder-mode",
             type=str,
@@ -1641,6 +1692,12 @@ class ServerArgs:
             default=ServerArgs.hicache_storage_prefetch_policy,
             help="Control when prefetching from the storage backend should stop.",
         )
+        parser.add_argument(
+            "--hicache-storage-backend-extra-config",
+            type=str,
+            default=ServerArgs.hicache_storage_backend_extra_config,
+            help="A dictionary in JSON string format containing extra configuration for the storage backend.",
+        )

         # Double Sparsity
         parser.add_argument(
@@ -1951,7 +2008,7 @@ class ServerArgs:
             "--disaggregation-transfer-backend",
             type=str,
             default=ServerArgs.disaggregation_transfer_backend,
-            choices=
+            choices=DISAGG_TRANSFER_BACKEND_CHOICES,
             help="The backend for disaggregation transfer. Default is mooncake.",
         )
         parser.add_argument(
@@ -2126,6 +2183,9 @@ class ServerArgs:
             self.chunked_prefill_size % self.page_size == 0
         ), "chunked_prefill_size must be divisible by page_size"

+        # Check multi tokenizer
+        assert self.tokenizer_worker_num > 0, "Tokenizer worker num must >= 1"
+
     def check_lora_server_args(self):
         assert self.max_loras_per_batch > 0, "max_loras_per_batch must be positive"

@@ -2271,6 +2331,7 @@ class ServerArgs:
         if is_mxfp4_quant_format:
             # use bf16 for mxfp4 triton kernels
             self.dtype = "bfloat16"
+
         elif "Llama4" in model_arch:
             assert self.attention_backend in {
                 "fa3",
@@ -2368,6 +2429,9 @@ class PortArgs:
     # The ipc filename for Scheduler to send metrics
     metrics_ipc_name: str

+    # The ipc filename for Tokenizer and worker tokenizer
+    tokenizer_worker_ipc_name: Optional[str]
+
     @staticmethod
     def init_new(server_args, dp_rank: Optional[int] = None) -> "PortArgs":
         if server_args.nccl_port is None:
@@ -2391,6 +2455,7 @@ class PortArgs:
             nccl_port=nccl_port,
             rpc_ipc_name=f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}",
             metrics_ipc_name=f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}",
+            tokenizer_worker_ipc_name=None,
         )
     else:
         # DP attention. Use TCP + port to handle both single-node and multi-node.
@@ -2424,6 +2489,7 @@ class PortArgs:
             nccl_port=nccl_port,
             rpc_ipc_name=f"tcp://{dist_init_host}:{rpc_port}",
             metrics_ipc_name=f"tcp://{dist_init_host}:{metrics_ipc_name}",
+            tokenizer_worker_ipc_name=None,
         )
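The refactor above moves the argparse choice lists into module-level constants (LOAD_FORMAT_CHOICES, QUANTIZATION_CHOICES, ATTENTION_BACKEND_CHOICES, DISAGG_TRANSFER_BACKEND_CHOICES) and adds add_*_choices() hooks so external code can extend them before the CLI is parsed. A minimal sketch of how an out-of-tree integration might use these hooks; the backend and quantization names below are hypothetical placeholders, not part of this release:

    from sglang.srt import server_args

    # Register extra choices before the server arguments are parsed,
    # so argparse validation accepts them.
    server_args.add_attention_backend_choices(["my_plugin_backend"])
    server_args.add_quantization_method_choices(["my_plugin_quant"])

    # The hooks only extend argparse validation; wiring up the actual kernel or
    # quantization implementation remains the plugin's responsibility.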
sglang/srt/speculative/eagle_worker.py
CHANGED
@@ -46,6 +46,7 @@ from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
 from sglang.srt.utils import (
     empty_context,
     get_available_gpu_memory,
+    get_bool_env_var,
     is_cuda,
     next_power_of_2,
 )
@@ -54,6 +55,7 @@ if is_cuda():
     from sgl_kernel import segment_packbits

 logger = logging.getLogger(__name__)
+RETURN_ORIGINAL_LOGPROB = get_bool_env_var("RETURN_ORIGINAL_LOGPROB")


 @contextmanager
@@ -137,8 +139,15 @@ class EAGLEWorker(TpModelWorker):
         embed, head = self.target_worker.model_runner.model.get_embed_and_head()

         if self.speculative_algorithm.is_eagle3():
-            # EAGLE3 models don't share lm_head
-            self.draft_model_runner.model.set_embed(embed)
+            # most cases EAGLE3 models don't share lm_head
+            # but some models (e.g. nvidia/gpt-oss-120b-Eagle3) shares
+            if (
+                hasattr(self.draft_model_runner.model, "load_lm_head_from_target")
+                and self.draft_model_runner.model.load_lm_head_from_target
+            ):
+                self.draft_model_runner.model.set_embed_and_head(embed, head)
+            else:
+                self.draft_model_runner.model.set_embed(embed)

         # grab hot token ids
         if self.draft_model_runner.model.hot_token_id is not None:
@@ -781,15 +790,20 @@ class EAGLEWorker(TpModelWorker):
         token_ids_logprobs = batch.token_ids_logprobs
         accepted_indices = res.accepted_indices
         assert len(accepted_indices) == len(logits_output.next_token_logits)
+
         temperatures = batch.sampling_info.temperatures
         num_draft_tokens = batch.spec_info.draft_token_num
         # acceptance indices are the indices in a "flattened" batch.
         # dividing it to num_draft_tokens will yield the actual batch index.
         temperatures = temperatures[accepted_indices // num_draft_tokens]
+        if RETURN_ORIGINAL_LOGPROB:
+            logprobs = torch.nn.functional.log_softmax(
+                logits_output.next_token_logits, dim=-1
+            )
+        else:
+            logprobs = torch.nn.functional.log_softmax(
+                logits_output.next_token_logits / temperatures, dim=-1
+            )
         batch_next_token_ids = res.verified_id
         num_tokens_per_req = [accept + 1 for accept in res.accept_length_per_req_cpu]

@@ -806,13 +820,19 @@ class EAGLEWorker(TpModelWorker):
             (
                 logits_output.next_token_top_logprobs_val,
                 logits_output.next_token_top_logprobs_idx,
-            ) = get_top_logprobs(
+            ) = get_top_logprobs(
+                logprobs,
+                top_logprobs_nums_repeat_interleaved,
+            )

         if any(x is not None for x in token_ids_logprobs):
             (
                 logits_output.next_token_token_ids_logprobs_val,
                 logits_output.next_token_token_ids_logprobs_idx,
-            ) = get_token_ids_logprobs(
+            ) = get_token_ids_logprobs(
+                logprobs,
+                token_ids_logprobs_repeat_interleaved,
+            )

         logits_output.next_token_logprobs = logprobs[
             torch.arange(len(batch_next_token_ids), device=batch.sampling_info.device),
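In the hunks above, logprob computation is now gated by a RETURN_ORIGINAL_LOGPROB environment variable: when it is set, log-probabilities are computed over the raw logits instead of the temperature-scaled logits that were actually sampled from. A standalone sketch of the two paths; the tensors and the env-var parsing here are illustrative and do not reproduce the exact get_bool_env_var semantics:

    import os
    import torch

    logits = torch.tensor([[2.0, 1.0, 0.5]])   # made-up next-token logits
    temperature = torch.tensor([[0.7]])        # per-request sampling temperature

    if os.environ.get("RETURN_ORIGINAL_LOGPROB", "false").lower() in ("1", "true"):
        # Report logprobs of the original (unscaled) distribution.
        logprobs = torch.log_softmax(logits, dim=-1)
    else:
        # Default behavior: logprobs of the temperature-scaled distribution.
        logprobs = torch.log_softmax(logits / temperature, dim=-1)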
sglang/srt/tokenizer/tiktoken_tokenizer.py
CHANGED
@@ -121,7 +121,12 @@ class TiktokenTokenizer:
         return self.tokenizer.decode_batch(batch)

     def apply_chat_template(
-        self,
+        self,
+        messages,
+        tokenize,
+        add_generation_prompt,
+        tools=None,
+        reasoning_effort=None,
     ):
         ret = self.chat_template_jinja.render(
             messages=messages, add_generation_prompt=add_generation_prompt
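The apply_chat_template signature now spells out messages, tokenize, add_generation_prompt, tools, and reasoning_effort explicitly. An illustrative call against an existing TiktokenTokenizer instance; the message content and argument values are assumptions, not taken from this diff:

    messages = [{"role": "user", "content": "Hello"}]

    # tokenize is required; tools and reasoning_effort are optional keyword arguments.
    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        tools=None,
        reasoning_effort=None,
    )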
sglang/srt/utils.py
CHANGED
@@ -172,6 +172,20 @@ def is_blackwell():
     return torch.cuda.get_device_capability()[0] == 10


+@lru_cache(maxsize=1)
+def is_sm100_supported(device=None) -> bool:
+    return (torch.cuda.get_device_capability(device)[0] == 10) and (
+        torch.version.cuda >= "12.8"
+    )
+
+
+@lru_cache(maxsize=1)
+def is_sm90_supported(device=None) -> bool:
+    return (torch.cuda.get_device_capability(device)[0] == 9) and (
+        torch.version.cuda >= "12.3"
+    )
+
+
 _warned_bool_env_var_keys = set()

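is_sm90_supported and is_sm100_supported combine the device compute capability (SM90 is Hopper, SM100 is Blackwell) with a minimum CUDA toolkit version (12.3 and 12.8 respectively) and cache the result per process. A hedged sketch of the gating pattern such helpers typically enable; the backend names below are placeholders, not actual sglang identifiers:

    import torch
    from sglang.srt.utils import is_sm90_supported, is_sm100_supported

    def pick_gemm_backend() -> str:
        # Placeholder backend names, for illustration only.
        if torch.cuda.is_available() and is_sm100_supported():
            return "blackwell_fp8_gemm"
        if torch.cuda.is_available() and is_sm90_supported():
            return "hopper_fp8_gemm"
        return "triton_fallback"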
@@ -1665,9 +1679,29 @@ def direct_register_custom_op(
     IMPORTANT: the lifetime of the operator is tied to the lifetime of the
     library object. If you want to bind the operator to a different library,
     make sure the library object is alive when the operator is used.
+
+    Note: This function will silently skip registration if the operator
+    with the same name is already registered to avoid RuntimeError in
+    multi-engine scenarios (e.g., VERL framework).
     """
     import torch.library

+    my_lib = target_lib or sglang_lib
+
+    # Check if operator is already registered to avoid duplicate registration
+    # This is important for scenarios where multiple SGLang engines run in the same process
+    try:
+        # Try to access the operator to see if it's already registered
+        lib_name = my_lib.m.name if hasattr(my_lib.m, "name") else "sglang"
+        if hasattr(torch.ops, lib_name) and hasattr(
+            getattr(torch.ops, lib_name), op_name
+        ):
+            # Operator already exists, skip registration
+            return
+    except (AttributeError, RuntimeError):
+        # Operator doesn't exist, proceed with registration
+        pass
+
     if hasattr(torch.library, "infer_schema"):
         schema_str = torch.library.infer_schema(op_func, mutates_args=mutates_args)
     else:
@@ -1676,11 +1710,22 @@ def direct_register_custom_op(

         schema_str = torch._custom_op.impl.infer_schema(op_func, mutates_args)

+    try:
+        my_lib.define(op_name + schema_str)
+        my_lib.impl(op_name, op_func, "CUDA")
+        if fake_impl is not None:
+            my_lib._register_fake(op_name, fake_impl)
+    except RuntimeError as error:
+        if "Tried to register an operator" in str(e) and "multiple times" in str(e):
+            # Silently ignore duplicate registration errors
+            # This can happen in multi-engine scenarios
+            pass
+        else:
+            # Re-raise other RuntimeErrors
+            raise error
+    except AttributeError as error:
+        # Always re-raise AttributeError as it indicates missing dependencies
+        raise error


 def set_gpu_proc_affinity(
@@ -1919,6 +1964,15 @@ def get_ip() -> str:
     except Exception:
         pass

+    # try using hostname
+    hostname = socket.gethostname()
+    try:
+        ip_addr = socket.gethostbyname(hostname)
+        warnings.warn("using local ip address: {}".format(ip_addr))
+        return ip_addr
+    except Exception:
+        pass
+
     warnings.warn(
         "Failed to get the IP address, using 0.0.0.0 by default."
         "The value can be set by the environment variable"
@@ -2733,6 +2787,20 @@ def lru_cache_frozenset(maxsize=128):
     return decorator


+def get_worker_ids_from_req_rids(rids):
+    if isinstance(rids, list):
+        worker_ids = [int(rid.split("_")[0]) for rid in rids]
+    elif isinstance(rids, str):
+        worker_ids = [int(rids.split("_")[0])]
+    else:
+        worker_ids = []
+    return worker_ids
+
+
+def get_origin_rid(rid):
+    return rid.split("_", 1)[1] if "_" in rid else rid
+
+
 def apply_module_patch(target_module, target_function, wrappers):
     original_module, original_function = parse_module_path(
         target_module, target_function, False
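The two new helpers assume the multi-tokenizer convention of prefixing each request id with the tokenizer worker index and an underscore. A quick usage sketch with made-up request ids:

    from sglang.srt.utils import get_origin_rid, get_worker_ids_from_req_rids

    rids = ["0_req-abc", "2_req-def"]
    print(get_worker_ids_from_req_rids(rids))  # [0, 2]
    print(get_origin_rid("2_req-def"))         # "req-def"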
sglang/test/attention/test_trtllm_mla_backend.py
CHANGED
@@ -208,6 +208,15 @@ class MockModelRunner:
         self.kv_cache_dtype = config["kv_cache_dtype"]
         self.page_size = config["page_size"]

+        # Server args stub - needed by attention backends
+        self.server_args = type(
+            "ServerArgs",
+            (),
+            {
+                "enable_dp_attention": False,  # Default value for testing
+            },
+        )
+
         # Model-config stub with MLA attributes
         self.model_config = type(
             "ModelConfig",
@@ -833,7 +842,7 @@ class TestTRTLLMMLA(CustomTestCase):

         # Test workspace properties
         self.assertEqual(metadata.workspace.device.type, "cuda")
-        self.assertEqual(metadata.workspace.dtype, torch.
+        self.assertEqual(metadata.workspace.dtype, torch.uint8)
         self.assertGreater(
             metadata.workspace.numel(), 0, "Workspace should have non-zero size"
         )
@@ -993,8 +1002,8 @@ class TestTRTLLMMLA(CustomTestCase):
         )

         # Verify CUDA graph buffers are allocated
-        self.assertIsNotNone(backend.
-        self.assertIsNotNone(backend.
+        self.assertIsNotNone(backend.decode_cuda_graph_kv_indices)
+        self.assertIsNotNone(backend.decode_cuda_graph_workspace)

         # Test capture metadata
         seq_lens = torch.full(
sglang/version.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.5.1.post2"
+__version__ = "0.5.2rc0"
{sglang-0.5.1.post2.dist-info → sglang-0.5.2rc0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sglang
-Version: 0.5.1.post2
+Version: 0.5.2rc0
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                         Version 2.0, January 2004
@@ -251,18 +251,18 @@ Requires-Dist: scipy; extra == "runtime-common"
 Requires-Dist: timm==1.0.16; extra == "runtime-common"
 Requires-Dist: tiktoken; extra == "runtime-common"
 Requires-Dist: torchao==0.9.0; extra == "runtime-common"
-Requires-Dist: transformers==4.
+Requires-Dist: transformers==4.56.0; extra == "runtime-common"
 Requires-Dist: uvicorn; extra == "runtime-common"
 Requires-Dist: uvloop; extra == "runtime-common"
 Requires-Dist: xgrammar==0.1.23; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
-Requires-Dist: sgl-kernel==0.3.
+Requires-Dist: sgl-kernel==0.3.7.post1; extra == "srt"
 Requires-Dist: torch==2.8.0; extra == "srt"
 Requires-Dist: torchaudio==2.8.0; extra == "srt"
 Requires-Dist: torchvision; extra == "srt"
 Requires-Dist: cuda-python; extra == "srt"
-Requires-Dist: flashinfer_python==0.
+Requires-Dist: flashinfer_python==0.3.0; extra == "srt"
 Provides-Extra: blackwell
 Requires-Dist: sglang[runtime_common]; extra == "blackwell"
 Requires-Dist: sgl-kernel; extra == "blackwell"
@@ -270,7 +270,7 @@ Requires-Dist: torch==2.8.0; extra == "blackwell"
 Requires-Dist: torchaudio==2.8.0; extra == "blackwell"
 Requires-Dist: torchvision; extra == "blackwell"
 Requires-Dist: cuda-python; extra == "blackwell"
-Requires-Dist: flashinfer_python==0.
+Requires-Dist: flashinfer_python==0.3.0; extra == "blackwell"
 Provides-Extra: srt-hip
 Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
 Requires-Dist: torch; extra == "srt-hip"
@@ -304,6 +304,7 @@ Requires-Dist: pandas; extra == "test"
 Requires-Dist: peft; extra == "test"
 Requires-Dist: sentence_transformers; extra == "test"
 Requires-Dist: pytest; extra == "test"
+Requires-Dist: tabulate; extra == "test"
 Provides-Extra: all
 Requires-Dist: sglang[srt]; extra == "all"
 Requires-Dist: sglang[openai]; extra == "all"
@@ -374,7 +375,7 @@ Dynamic: license-file
 | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |

 ## News
-- [2025/08] 🔔 SGLang x AMD SF Meetup on 8/22: Hands-on GPU workshop, tech talks by AMD/xAI/SGLang, and networking ([Roadmap](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_meetup_sglang_roadmap.pdf), [Large-scale EP](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_meetup_sglang_ep.pdf)).
+- [2025/08] 🔔 SGLang x AMD SF Meetup on 8/22: Hands-on GPU workshop, tech talks by AMD/xAI/SGLang, and networking ([Roadmap](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_meetup_sglang_roadmap.pdf), [Large-scale EP](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_meetup_sglang_ep.pdf), [Highlights](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_meetup_highlights.pdf), [AITER/MoRI](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_meetup_aiter_mori.pdf), [Wave](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_meetup_wave.pdf)).
 - [2025/08] 🔥 SGLang provides day-0 support for OpenAI gpt-oss model ([instructions](https://github.com/sgl-project/sglang/issues/8833))
 - [2025/06] 🔥 SGLang, the high-performance serving infrastructure powering trillions of tokens daily, has been awarded the third batch of the Open Source AI Grant by a16z ([a16z blog](https://a16z.com/advancing-open-source-ai-through-benchmarks-and-bold-experimentation/)).
 - [2025/06] 🔥 Deploying DeepSeek on GB200 NVL72 with PD and Large Scale EP (Part I): 2.7x Higher Decoding Throughput ([blog](https://lmsys.org/blog/2025-06-16-gb200-part-1/)).