sglang 0.4.5.post1__py3-none-any.whl → 0.4.5.post3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +2 -4
- sglang/bench_one_batch.py +2 -2
- sglang/bench_serving.py +3 -6
- sglang/compile_deep_gemm.py +136 -0
- sglang/lang/backend/anthropic.py +0 -4
- sglang/lang/backend/base_backend.py +1 -1
- sglang/lang/backend/openai.py +6 -2
- sglang/lang/backend/runtime_endpoint.py +5 -1
- sglang/lang/backend/vertexai.py +0 -1
- sglang/lang/compiler.py +1 -7
- sglang/lang/tracer.py +3 -7
- sglang/srt/_custom_ops.py +0 -2
- sglang/srt/configs/model_config.py +4 -1
- sglang/srt/constrained/outlines_jump_forward.py +14 -1
- sglang/srt/constrained/triton_ops/bitmask_ops.py +141 -0
- sglang/srt/constrained/xgrammar_backend.py +27 -4
- sglang/srt/custom_op.py +0 -62
- sglang/srt/disaggregation/decode.py +105 -6
- sglang/srt/disaggregation/mini_lb.py +74 -9
- sglang/srt/disaggregation/mooncake/conn.py +33 -63
- sglang/srt/disaggregation/mooncake/transfer_engine.py +30 -61
- sglang/srt/disaggregation/nixl/__init__.py +1 -0
- sglang/srt/disaggregation/nixl/conn.py +622 -0
- sglang/srt/disaggregation/prefill.py +137 -17
- sglang/srt/disaggregation/utils.py +32 -0
- sglang/srt/entrypoints/engine.py +4 -0
- sglang/srt/entrypoints/http_server.py +3 -7
- sglang/srt/entrypoints/verl_engine.py +7 -5
- sglang/srt/function_call_parser.py +60 -0
- sglang/srt/layers/activation.py +6 -8
- sglang/srt/layers/attention/flashattention_backend.py +883 -209
- sglang/srt/layers/attention/flashinfer_backend.py +5 -2
- sglang/srt/layers/attention/torch_native_backend.py +6 -1
- sglang/srt/layers/attention/triton_backend.py +6 -0
- sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +5 -5
- sglang/srt/layers/attention/triton_ops/extend_attention.py +18 -7
- sglang/srt/layers/attention/triton_ops/prefill_attention.py +7 -3
- sglang/srt/layers/dp_attention.py +1 -1
- sglang/srt/layers/layernorm.py +20 -5
- sglang/srt/layers/linear.py +17 -3
- sglang/srt/layers/moe/ep_moe/layer.py +17 -29
- sglang/srt/layers/moe/fused_moe_native.py +4 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +14 -19
- sglang/srt/layers/moe/fused_moe_triton/layer.py +7 -0
- sglang/srt/layers/moe/topk.py +27 -30
- sglang/srt/layers/parameter.py +0 -2
- sglang/srt/layers/quantization/__init__.py +1 -0
- sglang/srt/layers/quantization/blockwise_int8.py +2 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +9 -2
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +16 -44
- sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +153 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +4 -7
- sglang/srt/layers/quantization/deep_gemm.py +378 -0
- sglang/srt/layers/quantization/fp8.py +115 -132
- sglang/srt/layers/quantization/fp8_kernel.py +213 -88
- sglang/srt/layers/quantization/fp8_utils.py +189 -264
- sglang/srt/layers/quantization/gptq.py +13 -7
- sglang/srt/layers/quantization/modelopt_quant.py +2 -2
- sglang/srt/layers/quantization/moe_wna16.py +2 -0
- sglang/srt/layers/quantization/utils.py +5 -11
- sglang/srt/layers/quantization/w8a8_fp8.py +2 -0
- sglang/srt/layers/quantization/w8a8_int8.py +7 -7
- sglang/srt/layers/radix_attention.py +15 -0
- sglang/srt/layers/rotary_embedding.py +9 -8
- sglang/srt/layers/sampler.py +7 -12
- sglang/srt/lora/backend/base_backend.py +18 -2
- sglang/srt/lora/backend/flashinfer_backend.py +1 -1
- sglang/srt/lora/backend/triton_backend.py +1 -1
- sglang/srt/lora/layers.py +1 -1
- sglang/srt/lora/lora.py +1 -1
- sglang/srt/lora/lora_manager.py +1 -1
- sglang/srt/managers/data_parallel_controller.py +7 -1
- sglang/srt/managers/detokenizer_manager.py +0 -1
- sglang/srt/managers/io_struct.py +15 -3
- sglang/srt/managers/mm_utils.py +4 -3
- sglang/srt/managers/multimodal_processor.py +0 -2
- sglang/srt/managers/multimodal_processors/base_processor.py +3 -2
- sglang/srt/managers/schedule_batch.py +15 -4
- sglang/srt/managers/scheduler.py +28 -77
- sglang/srt/managers/tokenizer_manager.py +116 -29
- sglang/srt/managers/tp_worker.py +1 -0
- sglang/srt/mem_cache/hiradix_cache.py +41 -29
- sglang/srt/mem_cache/memory_pool.py +38 -15
- sglang/srt/model_executor/cuda_graph_runner.py +15 -10
- sglang/srt/model_executor/model_runner.py +39 -31
- sglang/srt/models/bert.py +398 -0
- sglang/srt/models/deepseek.py +1 -1
- sglang/srt/models/deepseek_nextn.py +74 -70
- sglang/srt/models/deepseek_v2.py +292 -348
- sglang/srt/models/llama.py +5 -5
- sglang/srt/models/minicpm3.py +31 -203
- sglang/srt/models/minicpmo.py +17 -6
- sglang/srt/models/qwen2.py +4 -1
- sglang/srt/models/qwen2_moe.py +14 -13
- sglang/srt/models/qwen3.py +335 -0
- sglang/srt/models/qwen3_moe.py +423 -0
- sglang/srt/openai_api/adapter.py +71 -4
- sglang/srt/openai_api/protocol.py +6 -1
- sglang/srt/reasoning_parser.py +0 -1
- sglang/srt/sampling/sampling_batch_info.py +2 -3
- sglang/srt/server_args.py +86 -72
- sglang/srt/speculative/build_eagle_tree.py +2 -2
- sglang/srt/speculative/eagle_utils.py +2 -2
- sglang/srt/speculative/eagle_worker.py +6 -14
- sglang/srt/utils.py +62 -6
- sglang/test/runners.py +5 -1
- sglang/test/test_block_fp8.py +167 -0
- sglang/test/test_custom_ops.py +1 -1
- sglang/test/test_utils.py +3 -1
- sglang/version.py +1 -1
- {sglang-0.4.5.post1.dist-info → sglang-0.4.5.post3.dist-info}/METADATA +5 -5
- {sglang-0.4.5.post1.dist-info → sglang-0.4.5.post3.dist-info}/RECORD +116 -110
- {sglang-0.4.5.post1.dist-info → sglang-0.4.5.post3.dist-info}/WHEEL +1 -1
- sglang/lang/__init__.py +0 -0
- sglang/srt/lora/backend/__init__.py +0 -25
- sglang/srt/server.py +0 -18
- {sglang-0.4.5.post1.dist-info → sglang-0.4.5.post3.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.5.post1.dist-info → sglang-0.4.5.post3.dist-info}/top_level.txt +0 -0
sglang/srt/server_args.py
CHANGED
@@ -26,11 +26,8 @@ from sglang.srt.hf_transformers_utils import check_gguf_file
 from sglang.srt.reasoning_parser import ReasoningParser
 from sglang.srt.utils import (
     configure_ipv6,
-    get_amdgpu_memory_capacity,
     get_device,
-    get_hpu_memory_capacity,
-    get_nvgpu_memory_capacity,
-    is_cuda,
+    get_device_memory_capacity,
     is_flashinfer_available,
     is_hip,
     is_port_available,
@@ -49,6 +46,7 @@ class ServerArgs:
     tokenizer_path: Optional[str] = None
     tokenizer_mode: str = "auto"
     skip_tokenizer_init: bool = False
+    enable_tokenizer_batch_encode: bool = False
     load_format: str = "auto"
     trust_remote_code: bool = False
     dtype: str = "auto"
@@ -155,7 +153,6 @@ class ServerArgs:
     enable_nccl_nvls: bool = False
     disable_outlines_disk_cache: bool = False
     disable_custom_all_reduce: bool = False
-    disable_mla: bool = False
     enable_llama4_multimodal: Optional[bool] = None
     disable_overlap_schedule: bool = False
     enable_mixed_chunk: bool = False
@@ -180,13 +177,14 @@ class ServerArgs:
     tool_call_parser: Optional[str] = None
     enable_hierarchical_cache: bool = False
     hicache_ratio: float = 2.0
-    enable_flashinfer_mla: bool = False
-    enable_flashmla: bool = False
+    hicache_size: int = 0
+    hicache_write_policy: str = "write_through_selective"
     flashinfer_mla_disable_ragged: bool = False
     warmups: Optional[str] = None
+    moe_dense_tp_size: Optional[int] = None
     n_share_experts_fusion: int = 0
-    disable_shared_experts_fusion: bool = False
     disable_chunked_prefix_cache: bool = False
+    disable_fast_image_processor: bool = False
 
     # Debug tensor dumps
     debug_tensor_dump_output_folder: Optional[str] = None
@@ -197,9 +195,7 @@ class ServerArgs:
     disaggregation_mode: str = "null"
     disaggregation_bootstrap_port: int = 8998
     disaggregation_transfer_backend: str = "mooncake"
-
-    # multimodal
-    disable_fast_image_processor: bool = False
+    disaggregation_ib_device: Optional[str] = None
 
     def __post_init__(self):
         # Expert parallelism
@@ -222,31 +218,24 @@ class ServerArgs:
         if self.random_seed is None:
             self.random_seed = random.randint(0, 1 << 30)
 
-        if is_cuda():
-            gpu_mem = get_nvgpu_memory_capacity()
-        elif is_hip():
-            gpu_mem = get_amdgpu_memory_capacity()
-        elif self.device == "hpu":
-            gpu_mem = get_hpu_memory_capacity()
-        else:
-            # GPU memory is not known yet or no GPU is available.
-            gpu_mem = None
-
-        if is_hip():
-            self.disable_shared_experts_fusion = True
+        gpu_mem = get_device_memory_capacity(self.device)
 
         # Set mem fraction static, which depends on the tensor parallelism size
         if self.mem_fraction_static is None:
-            if self.tp_size >= 16:
-                self.mem_fraction_static = 0.79
-            elif self.tp_size >= 8:
-                self.mem_fraction_static = 0.81
-            elif self.tp_size >= 4:
-                self.mem_fraction_static = 0.85
-            elif self.tp_size >= 2:
-                self.mem_fraction_static = 0.87
+            if gpu_mem <= 81920:
+                if self.tp_size >= 16:
+                    self.mem_fraction_static = 0.79
+                elif self.tp_size >= 8:
+                    self.mem_fraction_static = 0.81
+                elif self.tp_size >= 4:
+                    self.mem_fraction_static = 0.85
+                elif self.tp_size >= 2:
+                    self.mem_fraction_static = 0.87
+                else:
+                    self.mem_fraction_static = 0.88
             else:
-                self.mem_fraction_static = 0.88
+                # FIXME: more fine grained auto-selection polices
+                self.mem_fraction_static = (gpu_mem - 1024 * 13) / gpu_mem
 
         # Set chunked prefill size, which depends on the gpu memory capacity
         if self.chunked_prefill_size is None:
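A worked example of the new fallback branch, assuming the capacity helpers report MiB (the unit nvidia-smi uses), so 81920 is an 80 GB threshold and 1024 * 13 reserves about 13 GiB of headroom. The 96 GB device below is hypothetical:

gpu_mem = 98304  # hypothetical 96 GB device, above the 81920 MiB cutoff
mem_fraction_static = (gpu_mem - 1024 * 13) / gpu_mem
print(round(mem_fraction_static, 4))  # 0.8646, i.e. roughly 13 GiB left unmanaged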
@@ -257,7 +246,12 @@ class ServerArgs:
 
         assert self.chunked_prefill_size % self.page_size == 0
 
-        if self.enable_flashmla is True:
+        assert self.moe_dense_tp_size in {
+            1,
+            None,
+        }, f"moe_dense_tp_size only support 1 and None currently"
+
+        if self.attention_backend == "flashmla":
             logger.warning(
                 "FlashMLA only supports a page_size of 64, change page_size to 64."
             )
@@ -270,8 +264,6 @@ class ServerArgs:
                 self.cuda_graph_max_bs = 8
             else:
                 self.cuda_graph_max_bs = 80
-        else:
-            self.cuda_graph_max_bs = 160
 
         # Set kernel backends for hpu device
         if self.device == "hpu":
@@ -293,13 +285,6 @@ class ServerArgs:
         if self.grammar_backend is None:
             self.grammar_backend = "xgrammar"
 
-        # Expert parallelism
-        if self.enable_ep_moe:
-            self.ep_size = self.tp_size
-            logger.info(
-                f"EP MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
-            )
-
         self.enable_multimodal: Optional[bool] = self.enable_llama4_multimodal
 
         # Data parallelism attention
@@ -360,7 +345,18 @@ class ServerArgs:
 
         if self.page_size > 1 and self.speculative_eagle_topk > 1:
             self.speculative_eagle_topk = 1
-            logger.info("…")
+            logger.info(
+                "speculative_eagle_topk is adjusted to 1 when page_size > 1"
+            )
+
+        if (
+            self.speculative_eagle_topk == 1
+            and self.speculative_num_draft_tokens != self.speculative_num_steps + 1
+        ):
+            logger.info(
+                "speculative_num_draft_tokens is adjusted to speculative_num_steps + 1 when speculative_eagle_topk == 1"
+            )
+            self.speculative_num_draft_tokens = self.speculative_num_steps + 1
 
         # The token generated from the verify step is counted.
         # If sepculative_num_steps >= speculative_num_draft_tokens, the additional tokens will definitely be discarded.
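The second adjustment encodes a chain-drafting invariant: with one branch per step (topk == 1), N draft steps yield N draft tokens plus the bonus token from the verify pass. A minimal sketch with hypothetical values:

speculative_num_steps = 5
speculative_eagle_topk = 1
# enforced above: N drafted tokens + 1 bonus token from verification
speculative_num_draft_tokens = speculative_num_steps + 1  # 6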
@@ -382,18 +378,18 @@ class ServerArgs:
         # PD disaggregation
         if self.disaggregation_mode == "prefill":
             self.disable_cuda_graph = True
-            logger.warning("…")
-            self.disable_overlap_schedule = True
-            logger.warning("Overlap scheduler is disabled for prefill server")
+            logger.warning("Cuda graph is disabled for prefill server")
         elif self.disaggregation_mode == "decode":
             self.disable_radix_cache = True
-            logger.warning("…")
-            self.disable_overlap_schedule = True
-            logger.warning("Overlap scheduler is disabled for decode server")
+            logger.warning("KV cache is forced as chunk cache for decode server")
 
         os.environ["SGLANG_ENABLE_TORCH_COMPILE"] = (
             "1" if self.enable_torch_compile else "0"
         )
+        # Set env var before grammar backends init
+        os.environ["SGLANG_DISABLE_OUTLINES_DISK_CACHE"] = (
+            "1" if self.disable_outlines_disk_cache else "0"
+        )
 
     @staticmethod
     def add_cli_args(parser: argparse.ArgumentParser):
@@ -430,6 +426,11 @@ class ServerArgs:
             action="store_true",
             help="If set, skip init tokenizer and pass input_ids in generate request",
         )
+        parser.add_argument(
+            "--enable-tokenizer-batch-encode",
+            action="store_true",
+            help="Enable batch tokenization for improved performance when processing multiple text inputs. Do not use with image inputs, pre-tokenized input_ids, or input_embeds.",
+        )
         parser.add_argument(
             "--load-format",
             type=str,
@@ -826,7 +827,7 @@ class ServerArgs:
         parser.add_argument(
             "--attention-backend",
             type=str,
-            choices=["flashinfer", "triton", "torch_native", "fa3"],
+            choices=["flashinfer", "triton", "torch_native", "fa3", "flashmla"],
             default=ServerArgs.attention_backend,
             help="Choose the kernels for attention layers.",
         )
@@ -846,13 +847,13 @@ class ServerArgs:
         )
         parser.add_argument(
             "--enable-flashinfer-mla",
-            action="store_true",
-            help="…",
+            action=DeprecatedAction,
+            help="--enable-flashinfer-mla is deprecated. Please use '--attention-backend flashinfer' instead.",
         )
         parser.add_argument(
             "--enable-flashmla",
-            action="store_true",
-            help="…",
+            action=DeprecatedAction,
+            help="--enable-flashmla is deprecated. Please use '--attention-backend flashmla' instead.",
         )
         parser.add_argument(
             "--flashinfer-mla-disable-ragged",
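DeprecatedAction is defined elsewhere in sglang and is not part of this hunk; the following is only an illustrative sketch of the argparse pattern the two arguments above rely on (a zero-arg action that fails fast with the migration hint), not sglang's actual class:

import argparse

class DeprecatedAction(argparse.Action):  # illustrative stand-in
    def __init__(self, option_strings, dest, nargs=0, **kwargs):
        super().__init__(option_strings, dest, nargs=nargs, **kwargs)

    def __call__(self, parser, namespace, values, option_string=None):
        # Reject the flag outright, surfacing the deprecation message from help=...
        raise ValueError(self.help)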
@@ -977,11 +978,6 @@ class ServerArgs:
             action="store_true",
             help="Disable the custom all-reduce kernel and fall back to NCCL.",
         )
-        parser.add_argument(
-            "--disable-mla",
-            action="store_true",
-            help="Disable Multi-head Latent Attention (MLA) for DeepSeek V2/V3/R1 series models.",
-        )
         parser.add_argument(
             "--enable-llama4-multimodal",
             default=ServerArgs.enable_llama4_multimodal,
@@ -1090,7 +1086,7 @@ class ServerArgs:
         parser.add_argument(
             "--tool-call-parser",
             type=str,
-            choices=["qwen25", "mistral", "llama3"],
+            choices=["qwen25", "mistral", "llama3", "deepseekv3"],
            default=ServerArgs.tool_call_parser,
             help="Specify the parser for handling tool-call interactions. Options include: 'qwen25', 'mistral', and 'llama3'.",
         )
@@ -1102,15 +1098,33 @@ class ServerArgs:
         parser.add_argument(
             "--hicache-ratio",
             type=float,
-            required=False,
             default=ServerArgs.hicache_ratio,
             help="The ratio of the size of host KV cache memory pool to the size of device pool.",
         )
+        parser.add_argument(
+            "--hicache-size",
+            type=int,
+            default=ServerArgs.hicache_size,
+            help="The size of host KV cache memory pool in gigabytes, which will override the hicache_ratio if set.",
+        )
+        parser.add_argument(
+            "--hicache-write-policy",
+            type=str,
+            choices=["write_back", "write_through", "write_through_selective"],
+            default=ServerArgs.hicache_write_policy,
+            help="The write policy of hierarchical cache.",
+        )
         parser.add_argument(
             "--enable-deepep-moe",
             action="store_true",
             help="Enabling DeepEP MoE implementation for EP MoE.",
         )
+        parser.add_argument(
+            "--moe-dense-tp-size",
+            type=int,
+            default=ServerArgs.moe_dense_tp_size,
+            help="TP size for MoE dense MLP layers. This flag is useful when, with large TP size, there are errors caused by weights in MLP layers having dimension smaller than the min dimension GEMM supports.",
+        )
         parser.add_argument(
             "--deepep-mode",
             type=str,
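How the two new hierarchical-cache sizing knobs interact, per the help texts above (a sketch with hypothetical numbers; the real pool accounting lives in hiradix_cache.py and memory_pool.py, both also changed in this release):

hicache_ratio, hicache_size = 2.0, 0  # defaults
device_pool_gb = 40  # hypothetical device KV pool size
# a nonzero --hicache-size (GB) overrides --hicache-ratio
host_pool_gb = hicache_size if hicache_size > 0 else hicache_ratio * device_pool_gb  # 80.0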
@@ -1123,18 +1137,18 @@ class ServerArgs:
             "--n-share-experts-fusion",
             type=int,
             default=0,
-            help="The number of shared_experts need to be …"
-            "…",
+            help="The number of shared_experts need to be replicated to fuse with normal experts in deepseek v3/r1, "
+            "set it to tp_size can get best optimized performace.",
         )
         parser.add_argument(
-            "--disable-shared-experts-fusion",
+            "--disable-chunked-prefix-cache",
             action="store_true",
-            help="Disable …",
+            help="Disable chunked prefix cache feature for deepseek, which should save overhead for short sequences.",
         )
         parser.add_argument(
-            "--disable-chunked-prefix-cache",
+            "--disable-fast-image-processor",
             action="store_true",
-            help="Disable chunked prefix cache feature for deepseek, which should save overhead for short sequences.",
+            help="Adopt base image processor instead of fast image processor.",
         )
 
         # Server warmups
@@ -1184,14 +1198,14 @@ class ServerArgs:
             "--disaggregation-transfer-backend",
             type=str,
             default=ServerArgs.disaggregation_transfer_backend,
+            choices=["mooncake", "nixl"],
             help="The backend for disaggregation transfer. Default is mooncake.",
         )
-
-        # Multimodal
         parser.add_argument(
-            "--disable-fast-image-processor",
-            action="store_true",
-            help="Adopt base image processor instead of fast image processor.",
+            "--disaggregation-ib-device",
+            type=str,
+            default=ServerArgs.disaggregation_ib_device,
+            help="The ib device for disaggregation transfer. Default is None, it will be detected automatically if using the mooncake backend.",
         )
 
     @classmethod
sglang/srt/speculative/build_eagle_tree.py
CHANGED
@@ -4,9 +4,9 @@ from typing import List
 
 import torch
 
-from sglang.srt.utils import is_cuda_available, is_hip
+from sglang.srt.utils import is_cuda, is_hip
 
-if is_cuda_available() or is_hip():
+if is_cuda() or is_hip():
     from sgl_kernel import (
         build_tree_kernel_efficient as sgl_build_tree_kernel_efficient,
     )
sglang/srt/speculative/eagle_utils.py
CHANGED
@@ -19,9 +19,9 @@ from sglang.srt.managers.schedule_batch import (
 from sglang.srt.mem_cache.memory_pool import TokenToKVPoolAllocator
 from sglang.srt.model_executor.forward_batch_info import CaptureHiddenMode
 from sglang.srt.speculative.build_eagle_tree import build_tree_kernel_efficient
-from sglang.srt.utils import fast_topk, is_cuda_available, is_hip, next_power_of_2
+from sglang.srt.utils import fast_topk, is_cuda, is_hip, next_power_of_2
 
-if is_cuda_available():
+if is_cuda():
     from sgl_kernel import (
         top_k_renorm_prob,
         top_p_renorm_prob,
sglang/srt/speculative/eagle_worker.py
CHANGED
@@ -34,14 +34,9 @@ from sglang.srt.speculative.eagle_utils import (
     select_top_k_tokens,
 )
 from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
-from sglang.srt.utils import (
-    empty_context,
-    fast_topk,
-    get_available_gpu_memory,
-    is_cuda_available,
-)
+from sglang.srt.utils import empty_context, fast_topk, get_available_gpu_memory, is_cuda
 
-if is_cuda_available():
+if is_cuda():
     from sgl_kernel import segment_packbits
 
 logger = logging.getLogger(__name__)
@@ -271,14 +266,11 @@ class EAGLEWorker(TpModelWorker):
             )
         elif batch.forward_mode.is_idle():
             model_worker_batch = batch.get_model_worker_batch()
-            logits_output, next_token_ids = …
-                …
-                ForwardBatch.init_new(
-                    model_worker_batch, self.target_worker.model_runner
-                )
-            )
+            logits_output, next_token_ids = self.target_worker.forward_batch_generation(
+                model_worker_batch
             )
-
+
+            return logits_output, next_token_ids, model_worker_batch.bid, 0
         else:
             logits_output, next_token_ids, bid = self.forward_target_extend(batch)
             with self.draft_tp_context(self.draft_model_runner.tp_group):
sglang/srt/utils.py
CHANGED
@@ -55,7 +55,6 @@ import torch.distributed
 import torch.distributed as dist
 import triton
 import zmq
-from decord import VideoReader, cpu
 from fastapi.responses import ORJSONResponse
 from packaging import version as pkg_version
 from PIL import Image
@@ -79,10 +78,34 @@ time_infos = {}
 
 HIP_FP8_E4M3_FNUZ_MAX = 224.0
 
+_warned_bool_env_var_keys = set()
+
 
 def get_bool_env_var(name: str, default: str = "false") -> bool:
     value = os.getenv(name, default)
-    return value.lower() in ("true", "1")
+    value = value.lower()
+
+    truthy_values = ("true", "1")
+    falsy_values = ("false", "0")
+
+    if (value not in truthy_values) and (value not in falsy_values):
+        if value not in _warned_bool_env_var_keys:
+            logger.warning(
+                f"get_bool_env_var({name}) see non-understandable value={value} and treat as false"
+            )
+        _warned_bool_env_var_keys.add(value)
+
+    return value in truthy_values
+
+
+def get_int_env_var(name: str, default: int = 0) -> int:
+    value = os.getenv(name)
+    if value is None or not value.strip():
+        return default
+    try:
+        return int(value)
+    except ValueError:
+        return default
 
 
 # https://pytorch.org/docs/stable/notes/hip.html#checking-for-hip
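Behavior sketch for the two helpers (the environment variable names here are made up):

import os

os.environ["SGLANG_DEMO_FLAG"] = "TRUE"
assert get_bool_env_var("SGLANG_DEMO_FLAG") is True   # "true"/"1", case-insensitive

os.environ["SGLANG_DEMO_FLAG"] = "yes"
assert get_bool_env_var("SGLANG_DEMO_FLAG") is False  # unrecognized: warn once, treat as false

os.environ["SGLANG_DEMO_INT"] = "not-a-number"
assert get_int_env_var("SGLANG_DEMO_INT", 8) == 8     # unparsable or empty: fall back to default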
@@ -131,10 +154,6 @@ def is_flashinfer_available():
     return importlib.util.find_spec("flashinfer") is not None and is_cuda()
 
 
-def is_cuda_available():
-    return is_cuda()
-
-
 _ENABLE_TORCH_INFERENCE_MODE = get_bool_env_var(
     "SGLANG_ENABLE_TORCH_INFERENCE_MODE", "false"
 )
@@ -545,6 +564,9 @@ def load_audio(audio_file: str, sr: int = 16000, mono: bool = True) -> np.ndarray:
 
 
 def encode_video(video_path, frame_count_limit=None):
+    # Lazy import because decord is not available on some arm platforms.
+    from decord import VideoReader, cpu
+
     if not os.path.exists(video_path):
         logger.error(f"Video {video_path} does not exist")
         return []
@@ -772,6 +794,8 @@ def add_api_key_middleware(app, api_key: str):
         return await call_next(request)
     if request.url.path.startswith("/health"):
         return await call_next(request)
+    if request.url.path.startswith("/metrics"):
+        return await call_next(request)
     if request.headers.get("Authorization") != "Bearer " + api_key:
         return ORJSONResponse(content={"error": "Unauthorized"}, status_code=401)
     return await call_next(request)
@@ -928,6 +952,8 @@ def get_zmq_socket(
     buf_size = -1
 
     socket = context.socket(socket_type)
+    if endpoint.find("[") != -1:
+        socket.setsockopt(zmq.IPV6, 1)
 
     def set_send_opt():
         socket.setsockopt(zmq.SNDHWM, 0)
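The check keys off the bracket syntax that IPv6 literals require in ZMQ endpoints. A standalone sketch of the same logic (the endpoint and port are hypothetical):

import zmq

ctx = zmq.Context.instance()
sock = ctx.socket(zmq.PULL)
endpoint = "tcp://[::1]:17000"  # bracketed IPv6 literal
if endpoint.find("[") != -1:  # same detection as get_zmq_socket above
    sock.setsockopt(zmq.IPV6, 1)  # pyzmq option to enable IPv6 on the socket
sock.bind(endpoint)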
@@ -1144,6 +1170,20 @@ def get_hpu_memory_capacity():
     )
 
 
+def get_device_memory_capacity(device: str = None):
+    if is_cuda():
+        gpu_mem = get_nvgpu_memory_capacity()
+    elif is_hip():
+        gpu_mem = get_amdgpu_memory_capacity()
+    elif device == "hpu":
+        gpu_mem = get_hpu_memory_capacity()
+    else:
+        # GPU memory is not known yet or no GPU is available.
+        gpu_mem = None
+
+    return gpu_mem
+
+
 # Copy from pytorch and OpenRLHF to allow creating multiple main groups.
 # https://github.com/pytorch/pytorch/blob/main/torch/distributed/distributed_c10d.py
 # https://github.com/OpenRLHF/OpenRLHF/blob/main/openrlhf/utils/distributed_util.py
@@ -1911,6 +1951,8 @@ def is_page_size_one(server_args):
     return server_args.page_size == 1
 
 
+# TODO(hebiao064): Accelerate FA3 Spec Decode with topk > 1.
+# TODO(hebiao064): Improve the acc rate for FA3 Spec Decode with topk == 1 and page_size > 1.
 def is_no_spec_infer_or_topk_one(server_args):
     return server_args.speculative_eagle_topk is None or (
         server_args.speculative_eagle_topk is not None
@@ -1928,5 +1970,19 @@ def is_fa3_default_architecture(hf_config):
         "Llama4ForConditionalGeneration",
         "LlamaForCausalLM",
         "MistralForCausalLM",
+        "Gemma2ForCausalLM",
     }
     return architectures[0] in default_archs
+
+
+# Can be more general if it is used in multiple places (keep it simple and thus not general now)
+class BumpAllocator:
+    def __init__(self, buffer_size: int, dtype, device):
+        self._buffer = torch.zeros((buffer_size,), dtype=dtype, device=device)
+        self._pointer = 0
+
+    def allocate(self, size: int):
+        assert self._pointer + size <= len(self._buffer)
+        output = self._buffer[self._pointer : self._pointer + size]
+        self._pointer += size
+        return output
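Usage sketch for the BumpAllocator defined above (CPU tensors chosen only to keep the example self-contained):

import torch

alloc = BumpAllocator(buffer_size=1024, dtype=torch.int64, device="cpu")
a = alloc.allocate(10)  # zero-initialized view over buffer[0:10]
b = alloc.allocate(20)  # pointer bumps: buffer[10:30]
assert a.numel() == 10 and b.numel() == 20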
sglang/test/runners.py
CHANGED
@@ -26,8 +26,8 @@ from transformers import (
     AutoProcessor,
 )
 
+from sglang.srt.entrypoints.engine import Engine
 from sglang.srt.hf_transformers_utils import get_tokenizer
-from sglang.srt.server import Engine
 from sglang.srt.utils import load_image
 from sglang.test.test_utils import DEFAULT_PORT_FOR_SRT_TEST_RUNNER, calculate_rouge_l
@@ -51,6 +51,8 @@ NUM_TOP_LOGPROBS = 5
 def get_dtype_str(torch_dtype):
     if torch_dtype is torch.float16:
         return "float16"
+    if torch_dtype is torch.float32:
+        return "float32"
     else:
         raise NotImplementedError()
 
@@ -447,6 +449,7 @@ class SRTRunner:
         port: int = DEFAULT_PORT_FOR_SRT_TEST_RUNNER,
         lora_paths: List[str] = None,
         max_loras_per_batch: int = 4,
+        attention_backend: Optional[str] = None,
         lora_backend: str = "triton",
         disable_cuda_graph: bool = False,
         disable_radix_cache: bool = False,
@@ -487,6 +490,7 @@ class SRTRunner:
             lora_paths=lora_paths,
             max_loras_per_batch=max_loras_per_batch,
             lora_backend=lora_backend,
+            attention_backend=attention_backend,
             disable_cuda_graph=disable_cuda_graph,
             disable_radix_cache=disable_radix_cache,
             chunked_prefill_size=chunked_prefill_size,