sglang 0.4.6__py3-none-any.whl → 0.4.6.post2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +2 -0
- sglang/check_env.py +3 -3
- sglang/srt/configs/__init__.py +4 -0
- sglang/srt/configs/kimi_vl.py +38 -0
- sglang/srt/configs/kimi_vl_moonvit.py +32 -0
- sglang/srt/configs/model_config.py +15 -0
- sglang/srt/conversation.py +122 -1
- sglang/srt/disaggregation/decode.py +8 -2
- sglang/srt/disaggregation/fake/__init__.py +1 -0
- sglang/srt/disaggregation/fake/conn.py +88 -0
- sglang/srt/disaggregation/prefill.py +12 -3
- sglang/srt/disaggregation/utils.py +16 -2
- sglang/srt/entrypoints/engine.py +52 -21
- sglang/srt/entrypoints/http_server.py +27 -2
- sglang/srt/function_call_parser.py +97 -0
- sglang/srt/hf_transformers_utils.py +2 -0
- sglang/srt/layers/attention/cutlass_mla_backend.py +278 -0
- sglang/srt/layers/attention/flashinfer_backend.py +107 -82
- sglang/srt/layers/attention/flashinfer_mla_backend.py +27 -16
- sglang/srt/layers/attention/flashmla_backend.py +3 -0
- sglang/srt/layers/attention/utils.py +1 -1
- sglang/srt/layers/dp_attention.py +5 -2
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +1 -3
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=192,device_name=NVIDIA_H20.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=192,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=384,device_name=NVIDIA_H20.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=384,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_H20.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=96,device_name=NVIDIA_H20.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +10 -8
- sglang/srt/layers/moe/fused_moe_triton/layer.py +15 -17
- sglang/srt/layers/quantization/__init__.py +2 -2
- sglang/srt/layers/quantization/deep_gemm.py +1 -1
- sglang/srt/layers/quantization/fp8.py +20 -22
- sglang/srt/layers/quantization/fp8_utils.py +2 -2
- sglang/srt/layers/utils.py +35 -0
- sglang/srt/lora/layers.py +35 -9
- sglang/srt/lora/lora_manager.py +84 -35
- sglang/srt/managers/data_parallel_controller.py +52 -34
- sglang/srt/managers/multimodal_processors/kimi_vl.py +73 -0
- sglang/srt/managers/schedule_batch.py +34 -15
- sglang/srt/managers/scheduler.py +273 -67
- sglang/srt/managers/scheduler_output_processor_mixin.py +26 -10
- sglang/srt/managers/tp_worker.py +52 -17
- sglang/srt/managers/tp_worker_overlap_thread.py +18 -7
- sglang/srt/mem_cache/memory_pool.py +70 -36
- sglang/srt/model_executor/cuda_graph_runner.py +82 -19
- sglang/srt/model_executor/forward_batch_info.py +31 -1
- sglang/srt/model_executor/model_runner.py +123 -58
- sglang/srt/models/deepseek_nextn.py +1 -257
- sglang/srt/models/deepseek_v2.py +78 -18
- sglang/srt/models/kimi_vl.py +308 -0
- sglang/srt/models/kimi_vl_moonvit.py +639 -0
- sglang/srt/models/llama.py +92 -30
- sglang/srt/models/llama4.py +2 -1
- sglang/srt/models/llama_eagle.py +4 -1
- sglang/srt/models/llama_eagle3.py +4 -1
- sglang/srt/models/qwen2_moe.py +8 -3
- sglang/srt/models/qwen2_vl.py +0 -12
- sglang/srt/models/qwen3_moe.py +8 -3
- sglang/srt/openai_api/adapter.py +49 -8
- sglang/srt/openai_api/protocol.py +13 -1
- sglang/srt/reasoning_parser.py +25 -1
- sglang/srt/server_args.py +83 -24
- sglang/srt/speculative/eagle_worker.py +3 -2
- sglang/srt/utils.py +91 -9
- sglang/test/runners.py +4 -0
- sglang/test/send_one.py +84 -28
- sglang/test/test_utils.py +67 -0
- sglang/version.py +1 -1
- {sglang-0.4.6.dist-info → sglang-0.4.6.post2.dist-info}/METADATA +5 -4
- {sglang-0.4.6.dist-info → sglang-0.4.6.post2.dist-info}/RECORD +85 -60
- {sglang-0.4.6.dist-info → sglang-0.4.6.post2.dist-info}/WHEEL +1 -1
- {sglang-0.4.6.dist-info → sglang-0.4.6.post2.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.dist-info → sglang-0.4.6.post2.dist-info}/top_level.txt +0 -0
sglang/srt/server_args.py
CHANGED
@@ -22,7 +22,7 @@ import random
 import tempfile
 from typing import List, Literal, Optional
 
-from sglang.srt.hf_transformers_utils import check_gguf_file
+from sglang.srt.hf_transformers_utils import check_gguf_file, get_config
 from sglang.srt.reasoning_parser import ReasoningParser
 from sglang.srt.utils import (
     configure_ipv6,
@@ -78,6 +78,8 @@ class ServerArgs:
 
     # Other runtime options
     tp_size: int = 1
+    pp_size: int = 1
+    max_micro_batch_size: Optional[int] = None
    stream_interval: int = 1
     stream_output: bool = False
     random_seed: Optional[int] = None
@@ -222,25 +224,34 @@ class ServerArgs:
 
         # Set mem fraction static, which depends on the tensor parallelism size
         if self.mem_fraction_static is None:
+            parallel_size = self.tp_size * self.pp_size
             if gpu_mem <= 81920:
-                if self.tp_size >= 16:
+                if parallel_size >= 16:
                     self.mem_fraction_static = 0.79
-                elif self.tp_size >= 8:
+                elif parallel_size >= 8:
                     self.mem_fraction_static = 0.81
-                elif self.tp_size >= 4:
+                elif parallel_size >= 4:
                     self.mem_fraction_static = 0.85
-                elif self.tp_size >= 2:
+                elif parallel_size >= 2:
                     self.mem_fraction_static = 0.87
                 else:
                     self.mem_fraction_static = 0.88
             else:
-                …
+                self.mem_fraction_static = 0.88
+                if gpu_mem > 96 * 1024:
+                    mem_fraction = self.mem_fraction_static
+                    self.mem_fraction_static = min(
+                        mem_fraction + 48 * 1024 * (1 - mem_fraction) / gpu_mem,
+                        (gpu_mem - 1024 * 18)
+                        / gpu_mem,  # 15 GB + additional 3GB for cuda graph
+                    )
 
         # Set chunked prefill size, which depends on the gpu memory capacity
         if self.chunked_prefill_size is None:
             if gpu_mem is not None and gpu_mem < 25_000:
                 self.chunked_prefill_size = 2048
+            elif self.disaggregation_mode != "null":
+                self.chunked_prefill_size = 16384
             else:
                 self.chunked_prefill_size = 8192
         assert self.chunked_prefill_size % self.page_size == 0
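The large-GPU branch added above is easier to follow as plain arithmetic. Below is a minimal standalone sketch of the same computation; the function name and the illustrative GPU sizes are mine, not part of the package, and gpu_mem is in MiB as in server_args.py.

def auto_mem_fraction_static(gpu_mem: float, tp_size: int = 1, pp_size: int = 1) -> float:
    parallel_size = tp_size * pp_size
    if gpu_mem <= 81920:  # <= 80 GiB: pick by parallel size
        if parallel_size >= 16:
            return 0.79
        elif parallel_size >= 8:
            return 0.81
        elif parallel_size >= 4:
            return 0.85
        elif parallel_size >= 2:
            return 0.87
        return 0.88
    mem_fraction = 0.88
    if gpu_mem > 96 * 1024:  # very large GPUs: scale up, but keep ~18 GiB free
        mem_fraction = min(
            mem_fraction + 48 * 1024 * (1 - mem_fraction) / gpu_mem,
            (gpu_mem - 1024 * 18) / gpu_mem,
        )
    return mem_fraction


if __name__ == "__main__":
    for gib in (24, 80, 96, 141, 192):
        print(gib, "GiB ->", round(auto_mem_fraction_static(gib * 1024), 4))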
@@ -256,6 +267,12 @@ class ServerArgs:
             )
             self.page_size = 64
 
+        if self.attention_backend == "cutlass_mla":
+            logger.warning(
+                "Cutlass MLA only supports a page_size of 128, change page_size to 128."
+            )
+            self.page_size = 128
+
         # Set cuda graph max batch size
         if self.cuda_graph_max_bs is None:
             # Based on detailed statistics, when serving TP1/TP2 models on lower-end GPUs with HBM<25G, you can either disable cuda graph or set `cuda_graph_max_bs` to a very small value to reduce the memory overhead of creating cuda graphs, with almost no impact on performance. However, when serving models with TP4 or TP8, we need to enable cuda graph to maintain high performance. In this case, we can set `cuda_graph_max_bs` to 80 (half of the default value 160) to reduce the memory overhead of creating cuda graphs. Looking at the logs from TP4 serving of qwen2-72b, a value of 80 is sufficient and can reduce the memory overhead of creating cuda graphs on lower-end GPUs compared to the original 160, avoiding OOM issues.
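The comment above amounts to a tuning recipe for low-HBM GPUs. A hedged example of applying it when launching a TP4 server; the --cuda-graph-max-bs and --disable-cuda-graph flags exist in server_args.py, while the model path is only illustrative.

import subprocess

cmd = [
    "python3", "-m", "sglang.launch_server",
    "--model-path", "Qwen/Qwen2-72B-Instruct",  # illustrative model
    "--tp-size", "4",
    "--cuda-graph-max-bs", "80",  # half of the 160 default, per the comment above
    # On TP1/TP2 low-HBM GPUs, "--disable-cuda-graph" is the alternative.
]
subprocess.Popen(cmd)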
@@ -327,6 +344,14 @@ class ServerArgs:
                 "eagle speculative decoding."
             )
 
+            model_arch = get_model_arch(self)
+
+            # Auto set draft_model_path DeepSeek-V3/R1
+            if self.speculative_draft_model_path is None and model_arch in [
+                "DeepseekV3ForCausalLM"
+            ]:
+                self.speculative_draft_model_path = self.model_path
+
             # Auto choose parameters
             if self.speculative_num_steps is None:
                 assert (
@@ -337,7 +362,7 @@ class ServerArgs:
                     self.speculative_num_steps,
                     self.speculative_eagle_topk,
                     self.speculative_num_draft_tokens,
-                ) = auto_choose_speculative_params(self)
+                ) = auto_choose_speculative_params(model_arch)
 
             if self.page_size > 1 and self.speculative_eagle_topk > 1:
                 self.speculative_eagle_topk = 1
@@ -420,7 +445,7 @@ class ServerArgs:
         parser.add_argument(
             "--skip-tokenizer-init",
             action="store_true",
-            help="If set, skip init tokenizer and pass input_ids in generate request",
+            help="If set, skip init tokenizer and pass input_ids in generate request.",
         )
         parser.add_argument(
             "--enable-tokenizer-batch-encode",
@@ -559,6 +584,7 @@ class ServerArgs:
             "name, a tag name, or a commit id. If unspecified, will use "
             "the default version.",
         )
+
         # Memory and scheduling
         parser.add_argument(
             "--mem-fraction-static",
@@ -625,6 +651,19 @@ class ServerArgs:
             default=ServerArgs.tp_size,
             help="The tensor parallelism size.",
         )
+        parser.add_argument(
+            "--pipeline-parallel-size",
+            "--pp-size",
+            type=int,
+            default=ServerArgs.pp_size,
+            help="The pipeline parallelism size.",
+        )
+        parser.add_argument(
+            "--max-micro-batch-size",
+            type=int,
+            default=ServerArgs.max_micro_batch_size,
+            help="The maximum micro batch size in pipeline parallelism.",
+        )
         parser.add_argument(
             "--stream-interval",
             type=int,
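For context on how the two new options land on ServerArgs: --pp-size is an alias of --pipeline-parallel-size, so argparse stores the value under pipeline_parallel_size, which from_cli_args later copies onto pp_size (see the hunk at line 1253 below). A minimal, illustrative sketch of that pattern, not sglang code:

import argparse
import dataclasses
from typing import Optional


@dataclasses.dataclass
class MiniArgs:
    pp_size: int = 1
    max_micro_batch_size: Optional[int] = None


parser = argparse.ArgumentParser()
parser.add_argument(
    "--pipeline-parallel-size", "--pp-size", type=int, default=MiniArgs.pp_size
)
parser.add_argument(
    "--max-micro-batch-size", type=int, default=MiniArgs.max_micro_batch_size
)

ns = parser.parse_args(["--pp-size", "2", "--max-micro-batch-size", "4"])
args = MiniArgs(
    pp_size=ns.pipeline_parallel_size,
    max_micro_batch_size=ns.max_micro_batch_size,
)
print(args)  # MiniArgs(pp_size=2, max_micro_batch_size=4)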
@@ -823,7 +862,14 @@ class ServerArgs:
         parser.add_argument(
             "--attention-backend",
             type=str,
-            choices=["flashinfer", "triton", "torch_native", "fa3", "flashmla"],
+            choices=[
+                "flashinfer",
+                "triton",
+                "torch_native",
+                "fa3",
+                "flashmla",
+                "cutlass_mla",
+            ],
             default=ServerArgs.attention_backend,
             help="Choose the kernels for attention layers.",
         )
@@ -1082,9 +1128,9 @@ class ServerArgs:
         parser.add_argument(
             "--tool-call-parser",
             type=str,
-            choices=["qwen25", "mistral", "llama3", "deepseekv3"],
+            choices=["qwen25", "mistral", "llama3", "deepseekv3", "pythonic"],
             default=ServerArgs.tool_call_parser,
-            help="Specify the parser for handling tool-call interactions. Options include: 'qwen25', 'mistral', and '…
+            help="Specify the parser for handling tool-call interactions. Options include: 'qwen25', 'mistral', 'llama3', 'deepseekv3', and 'pythonic'.",
         )
         parser.add_argument(
             "--enable-hierarchical-cache",
@@ -1207,6 +1253,7 @@ class ServerArgs:
     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
         args.tp_size = args.tensor_parallel_size
+        args.pp_size = args.pipeline_parallel_size
         args.dp_size = args.data_parallel_size
         args.ep_size = args.expert_parallel_size
         attrs = [attr.name for attr in dataclasses.fields(cls)]
@@ -1220,15 +1267,25 @@ class ServerArgs:
 
     def check_server_args(self):
         assert (
-            self.tp_size % self.nnodes == 0
-        ), "tp_size must be divisible by number of nodes"
+            self.tp_size * self.pp_size
+        ) % self.nnodes == 0, "tp_size must be divisible by number of nodes"
+
+        # FIXME pp constraints
+        if self.pp_size > 1:
+            logger.warning(f"Turn off overlap scheule for pipeline parallelism.")
+            self.disable_overlap_schedule = True
+            assert (
+                self.disable_overlap_schedule
+                and self.speculative_algorithm is None
+                and not self.enable_mixed_chunk
+            ), "Pipeline parallelism is not compatible with overlap schedule, speculative decoding, mixed chunked prefill."
+
         assert not (
             self.dp_size > 1 and self.nnodes != 1 and not self.enable_dp_attention
         ), "multi-node data parallel is not supported unless dp attention!"
         assert (
             self.max_loras_per_batch > 0
             # FIXME
-            and (self.lora_paths is None or self.disable_cuda_graph)
             and (self.lora_paths is None or self.disable_radix_cache)
         ), "compatibility of lora and cuda graph and radix attention is in progress"
         assert self.base_gpu_id >= 0, "base_gpu_id must be non-negative"
@@ -1354,20 +1411,22 @@ class DeprecatedAction(argparse.Action):
         raise ValueError(self.help)
 
 
-def auto_choose_speculative_params(self: ServerArgs):
+def get_model_arch(args: ServerArgs):
+    hf_config = get_config(
+        args.model_path,
+        trust_remote_code=args.trust_remote_code,
+        revision=args.revision,
+        model_override_args=json.loads(args.json_model_override_args),
+    )
+    return hf_config.architectures[0]
+
+
+def auto_choose_speculative_params(arch: str):
     """
     Automatically choose the parameters for speculative decoding.
 
     You can tune them on your own models and prompts with scripts/playground/bench_speculative.py
     """
-    config_path = os.path.join(self.model_path, "config.json")
-    if not os.path.exists(config_path):
-        raise ValueError(f"{config_path} is not found.")
-
-    config = json.load(open(config_path))
-
-    arch = config.get("architectures", ["Unknown"])[0]
-
     if arch in ["LlamaForCausalLM"]:
         # The default value for llama
         return (5, 4, 8)
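get_model_arch above resolves the architecture through the Hugging Face config (via sglang's get_config wrapper) instead of reading config.json from disk as the removed code did. A rough standalone equivalent using transformers directly; the model id is illustrative.

from transformers import AutoConfig

cfg = AutoConfig.from_pretrained("deepseek-ai/DeepSeek-V3", trust_remote_code=True)
print(cfg.architectures[0])  # e.g. "DeepseekV3ForCausalLM"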
sglang/srt/speculative/eagle_worker.py
CHANGED
@@ -106,11 +106,12 @@ class EAGLEWorker(TpModelWorker):
         # Init draft worker
         with empty_context():
             super().__init__(
+                server_args=server_args,
                 gpu_id=gpu_id,
                 tp_rank=tp_rank,
-                server_args=server_args,
-                nccl_port=nccl_port,
+                pp_rank=0,  # FIXME
                 dp_rank=dp_rank,
+                nccl_port=nccl_port,
                 is_draft_worker=True,
                 req_to_token_pool=self.req_to_token_pool,
                 token_to_kv_pool_allocator=self.token_to_kv_pool_allocator,
sglang/srt/utils.py
CHANGED
@@ -12,6 +12,7 @@
 # limitations under the License.
 # ==============================================================================
 """Common utilities."""
+
 import base64
 import builtins
 import ctypes
@@ -414,16 +415,40 @@ class LayerFn(Protocol):
 def make_layers(
     num_hidden_layers: int,
     layer_fn: LayerFn,
+    pp_rank: Optional[int] = None,
+    pp_size: Optional[int] = None,
     prefix: str = "",
+    return_tuple: bool = False,
 ) -> Tuple[int, int, torch.nn.ModuleList]:
     """Make a list of layers with the given layer function"""
+    # circula imports
+    from sglang.srt.distributed import get_pp_indices
+    from sglang.srt.layers.utils import PPMissingLayer
+
+    assert not pp_size or num_hidden_layers >= pp_size
+    start_layer, end_layer = (
+        get_pp_indices(
+            num_hidden_layers,
+            pp_rank,
+            pp_size,
+        )
+        if pp_rank is not None and pp_size is not None
+        else (0, num_hidden_layers)
+    )
     modules = torch.nn.ModuleList(
-        [
+        [PPMissingLayer(return_tuple=return_tuple) for _ in range(start_layer)]
+        + [
             maybe_offload_to_cpu(layer_fn(idx=idx, prefix=add_prefix(idx, prefix)))
-            for idx in range(num_hidden_layers)
+            for idx in range(start_layer, end_layer)
+        ]
+        + [
+            PPMissingLayer(return_tuple=return_tuple)
+            for _ in range(end_layer, num_hidden_layers)
         ]
     )
-    return modules
+    if pp_rank is None or pp_size is None:
+        return modules
+    return modules, start_layer, end_layer
 
 
 def set_random_seed(seed: int) -> None:
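make_layers now builds real modules only for the layers owned by the current pipeline rank and fills the rest with PPMissingLayer placeholders. The split policy itself lives in sglang.srt.distributed.get_pp_indices and is not shown in this diff; the sketch below assumes a simple near-even split purely for illustration.

def even_pp_indices(num_layers: int, pp_rank: int, pp_size: int):
    # Near-even split: the first (num_layers % pp_size) ranks get one extra layer.
    per_rank = num_layers // pp_size
    remainder = num_layers % pp_size
    start = pp_rank * per_rank + min(pp_rank, remainder)
    end = start + per_rank + (1 if pp_rank < remainder else 0)
    return start, end


num_layers, pp_size = 10, 4
for pp_rank in range(pp_size):
    start, end = even_pp_indices(num_layers, pp_rank, pp_size)
    layout = (
        ["missing"] * start + ["real"] * (end - start) + ["missing"] * (num_layers - end)
    )
    print(pp_rank, (start, end), layout)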
@@ -877,7 +902,7 @@ def broadcast_pyobj(
         "cuda" if torch.cuda.is_available() and not force_cpu_device else "cpu"
     )
 
-    if rank == 0:
+    if rank == src:
         if len(data) == 0:
             tensor_size = torch.tensor([0], dtype=torch.long, device=device)
             dist.broadcast(tensor_size, src=src, group=dist_group)
@@ -909,6 +934,50 @@ def broadcast_pyobj(
     return data
 
 
+def point_to_point_pyobj(
+    data: List[Any],
+    rank: int,
+    group: Optional[torch.distributed.ProcessGroup] = None,
+    src: int = 0,
+    dst: int = 1,
+):
+    """Send data from src to dst in group."""
+
+    if rank == src:
+        if len(data) == 0:
+            tensor_size = torch.tensor([0], dtype=torch.long)
+            dist.send(tensor_size, dst=dst, group=group)
+        else:
+            serialized_data = pickle.dumps(data)
+            size = len(serialized_data)
+            tensor_data = torch.ByteTensor(
+                np.frombuffer(serialized_data, dtype=np.uint8)
+            )
+            tensor_size = torch.tensor([size], dtype=torch.long)
+
+            dist.send(tensor_size, dst=dst, group=group)
+            dist.send(tensor_data, dst=dst, group=group)
+        return data
+
+    elif rank == dst:
+        tensor_size = torch.tensor([0], dtype=torch.long)
+        dist.recv(tensor_size, src=src, group=group)
+        size = tensor_size.item()
+
+        if size == 0:
+            return []
+
+        tensor_data = torch.empty(size, dtype=torch.uint8)
+        dist.recv(tensor_data, src=src, group=group)
+
+        serialized_data = bytes(tensor_data.cpu().numpy())
+        data = pickle.loads(serialized_data)
+        return data
+
+    # Other ranks in pp_group do nothing
+    return []
+
+
 step_counter = 0
 
 
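A hedged usage sketch for point_to_point_pyobj: two local processes on a gloo group, rank 0 sends an arbitrary picklable object, rank 1 receives it. The process-group bootstrap (address, port, backend) is illustrative.

import os

import torch.distributed as dist
import torch.multiprocessing as mp

from sglang.srt.utils import point_to_point_pyobj


def worker(rank: int, world_size: int):
    os.environ["MASTER_ADDR"] = "127.0.0.1"
    os.environ["MASTER_PORT"] = "29531"  # illustrative port
    dist.init_process_group("gloo", rank=rank, world_size=world_size)

    payload = [{"req_id": 7, "tokens": [1, 2, 3]}] if rank == 0 else []
    received = point_to_point_pyobj(payload, rank, group=None, src=0, dst=1)
    if rank == 1:
        print("rank 1 got:", received)

    dist.destroy_process_group()


if __name__ == "__main__":
    mp.spawn(worker, args=(2,), nprocs=2)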
@@ -1732,6 +1801,13 @@ def configure_ipv6(dist_init_addr):
     return port, host
 
 
+def rank0_log(msg: str):
+    from sglang.srt.distributed import get_tensor_model_parallel_rank
+
+    if get_tensor_model_parallel_rank() == 0:
+        logger.info(msg)
+
+
 def rank0_print(msg: str):
     from sglang.srt.distributed import get_tensor_model_parallel_rank
 
@@ -1905,13 +1981,16 @@ def fast_topk(values, topk, dim):
     return torch.topk(values, topk, dim=dim)
 
 
-def is_hopper_with_cuda_12_3():
+def _check(cc_major):
     if not is_cuda():
         return False
-    …
+    return torch.cuda.get_device_capability()[0] == cc_major and tuple(
+        map(int, torch.version.cuda.split(".")[:2])
+    ) >= (12, 3)
+
+
+is_ampere_with_cuda_12_3 = lambda: _check(8)
+is_hopper_with_cuda_12_3 = lambda: _check(9)
 
 
 def get_free_port():
@@ -1970,8 +2049,11 @@ def is_fa3_default_architecture(hf_config):
         "Llama4ForConditionalGeneration",
         "LlamaForCausalLM",
         "MistralForCausalLM",
+        "MixtralForCausalLM",
         "Gemma2ForCausalLM",
         "Gemma3ForConditionalGeneration",
+        "Qwen3ForCausalLM",
+        "Qwen3MoeForCausalLM",
     }
     return architectures[0] in default_archs
 
sglang/test/runners.py
CHANGED
@@ -423,6 +423,10 @@ class HFRunner:
                 )
                 del input_logits
 
+            if lora_paths is not None and lora_paths[i] is not None:
+                # Unload the LoRA adapter if it is used
+                model.unload()
+
         return ModelOutput(
             output_strs=output_strs,
             top_input_logprobs=top_input_logprobs,
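The new model.unload() call assumes the runner wrapped the base model with a LoRA adapter, in which case peft's PeftModel.unload() removes the adapter layers and returns the plain base model. A hedged, standalone illustration of that pattern; the model and adapter ids are placeholders.

from peft import PeftModel
from transformers import AutoModelForCausalLM

base = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
model = PeftModel.from_pretrained(base, "some-org/some-lora-adapter")  # placeholder adapter id

# ... run forward passes with the adapter applied ...

model = model.unload()  # drop the adapter layers, back to the plain base model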
sglang/test/send_one.py
CHANGED
@@ -6,11 +6,56 @@ python3 -m sglang.test.send_one
 """
 
 import argparse
+import dataclasses
 import json
 
 import requests
 
 
+@dataclasses.dataclass
+class BenchArgs:
+    host: str = "localhost"
+    port: int = 30000
+    batch_size: int = 1
+    temperature: float = 0.0
+    max_new_tokens: int = 512
+    frequency_penalty: float = 0.0
+    presence_penalty: float = 0.0
+    json: bool = False
+    return_logprob: bool = False
+    prompt: str = (
+        "Human: Give me a fully functional FastAPI server. Show the python code.\n\nAssistant:"
+    )
+    image: bool = False
+    stream: bool = False
+
+    @staticmethod
+    def add_cli_args(parser: argparse.ArgumentParser):
+        parser.add_argument("--host", type=str, default=BenchArgs.host)
+        parser.add_argument("--port", type=int, default=BenchArgs.port)
+        parser.add_argument("--batch-size", type=int, default=BenchArgs.batch_size)
+        parser.add_argument("--temperature", type=float, default=BenchArgs.temperature)
+        parser.add_argument(
+            "--max-new-tokens", type=int, default=BenchArgs.max_new_tokens
+        )
+        parser.add_argument(
+            "--frequency-penalty", type=float, default=BenchArgs.frequency_penalty
+        )
+        parser.add_argument(
+            "--presence-penalty", type=float, default=BenchArgs.presence_penalty
+        )
+        parser.add_argument("--json", action="store_true")
+        parser.add_argument("--return-logprob", action="store_true")
+        parser.add_argument("--prompt", type=str, default=BenchArgs.prompt)
+        parser.add_argument("--image", action="store_true")
+        parser.add_argument("--stream", action="store_true")
+
+    @classmethod
+    def from_cli_args(cls, args: argparse.Namespace):
+        attrs = [attr.name for attr in dataclasses.fields(cls)]
+        return cls(**{attr: getattr(args, attr) for attr in attrs})
+
+
 def send_one_prompt(args):
     if args.image:
         args.prompt = (
@@ -20,20 +65,42 @@ def send_one_prompt(args):
     else:
         image_data = None
 
-    …
+    prompt = args.prompt
+
+    if args.json:
+        prompt = (
+            "Human: What is the capital of France and how is that city like. "
+            "Give me 3 trivial information about that city. "
+            "Write in a format of json.\nAssistant:"
+        )
+        json_schema = "$$ANY$$"
+        json_schema = (
+            '{"type": "object", "properties": {"population": {"type": "integer"}}}'
+        )
+    else:
+        json_schema = None
+
+    if args.batch_size > 1:
+        prompt = [prompt] * args.batch_size
+
+    json_data = {
+        "text": prompt,
+        "image_data": image_data,
+        "sampling_params": {
+            "temperature": args.temperature,
+            "max_new_tokens": args.max_new_tokens,
+            "frequency_penalty": args.frequency_penalty,
+            "presence_penalty": args.presence_penalty,
+            "json_schema": json_schema,
+            "stop": ["Question", "Assistant:", "<|separator|>", "<|eos|>"],
         },
+        "return_logprob": args.return_logprob,
+        "stream": args.stream,
+    }
+
+    response = requests.post(
+        f"http://{args.host}:{args.port}/generate",
+        json=json_data,
         stream=args.stream,
     )
 
@@ -47,6 +114,9 @@ def send_one_prompt(args):
     else:
         ret = response.json()
 
+    if args.batch_size > 1:
+        ret = ret[0]
+
     latency = ret["meta_info"]["e2e_latency"]
 
     if "spec_verify_ct" in ret["meta_info"]:
@@ -68,21 +138,7 @@ def send_one_prompt(args):
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    …
-    parser.add_argument("--max-new-tokens", type=int, default=512)
-    parser.add_argument("--frequency-penalty", type=float, default=0.0)
-    parser.add_argument("--presence-penalty", type=float, default=0.0)
-    parser.add_argument("--return-logprob", action="store_true")
-    parser.add_argument(
-        "--prompt",
-        type=str,
-        default="Human: Give me a fully functional FastAPI server. Show the python code.\n\nAssistant:",
-    )
-    parser.add_argument(
-        "--image",
-        action="store_true",
-    )
-    parser.add_argument("--stream", action="store_true")
+    BenchArgs.add_cli_args(parser)
     args = parser.parse_args()
 
     send_one_prompt(args)
sglang/test/test_utils.py
CHANGED
@@ -69,6 +69,7 @@ DEFAULT_REASONING_MODEL_NAME_FOR_TEST = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
 DEFAULT_AWQ_MOE_MODEL_NAME_FOR_TEST = (
     "hugging-quants/Mixtral-8x7B-Instruct-v0.1-AWQ-INT4"
 )
+DEFAULT_ENABLE_THINKING_MODEL_NAME_FOR_TEST = "Qwen/Qwen3-30B-A3B"
 
 # Nightly tests
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1 = "meta-llama/Llama-3.1-8B-Instruct,mistralai/Mistral-7B-Instruct-v0.3,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct,google/gemma-2-27b-it"
@@ -732,6 +733,72 @@ def run_bench_one_batch(model, other_args):
     return output_throughput
 
 
+def run_bench_offline_throughput(model, other_args):
+    command = [
+        "python3",
+        "-m",
+        "sglang.bench_offline_throughput",
+        "--num-prompts",
+        "1",
+        "--dataset-name",
+        "random",
+        "--random-input-len",
+        "256",
+        "--random-output-len",
+        "256",
+        "--model-path",
+        model,
+        *[str(x) for x in other_args],
+    ]
+
+    print(f"{command=}")
+    process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+
+    try:
+        stdout, stderr = process.communicate()
+        output = stdout.decode()
+        error = stderr.decode()
+        print(f"Output: {output}", flush=True)
+        print(f"Error: {error}", flush=True)
+
+        output_throughput = -1
+        for line in output.split("\n"):
+            if "Last generation throughput (tok/s):" in line:
+                output_throughput = float(line.split(":")[-1])
+    finally:
+        kill_process_tree(process.pid)
+
+    return output_throughput
+
+
+def run_bench_one_batch_server(
+    model,
+    base_url,
+    server_args,
+    bench_args,
+    other_server_args,
+    simulate_spec_acc_lens=None,
+):
+    from sglang.bench_one_batch_server import run_benchmark
+
+    if simulate_spec_acc_lens is not None:
+        env = {**os.environ, "SIMULATE_ACC_LEN": str(simulate_spec_acc_lens)}
+    else:
+        env = None
+
+    process = popen_launch_server(
+        model,
+        base_url,
+        timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+        other_args=other_server_args,
+        env=env,
+    )
+    try:
+        run_benchmark(server_args=server_args, bench_args=bench_args)
+    finally:
+        kill_process_tree(process.pid)
+
+
 def lcs(X, Y):
     m = len(X)
     n = len(Y)
sglang/version.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.4.6"
+__version__ = "0.4.6.post2"
{sglang-0.4.6.dist-info → sglang-0.4.6.post2.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sglang
-Version: 0.4.6
+Version: 0.4.6.post2
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
         Version 2.0, January 2004
@@ -238,15 +238,16 @@ Requires-Dist: pynvml; extra == "runtime-common"
 Requires-Dist: python-multipart; extra == "runtime-common"
 Requires-Dist: pyzmq>=25.1.2; extra == "runtime-common"
 Requires-Dist: soundfile==0.13.1; extra == "runtime-common"
-Requires-Dist: torchao>=0.…
+Requires-Dist: torchao>=0.9.0; extra == "runtime-common"
 Requires-Dist: transformers==4.51.1; extra == "runtime-common"
 Requires-Dist: uvicorn; extra == "runtime-common"
 Requires-Dist: uvloop; extra == "runtime-common"
 Requires-Dist: xgrammar==0.1.17; extra == "runtime-common"
+Requires-Dist: blobfile==3.0.0; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
-Requires-Dist: sgl-kernel==0.…
-Requires-Dist: flashinfer_python==0.2.…
+Requires-Dist: sgl-kernel==0.1.1; extra == "srt"
+Requires-Dist: flashinfer_python==0.2.5; extra == "srt"
 Requires-Dist: torch==2.6.0; extra == "srt"
 Requires-Dist: torchvision==0.21.0; extra == "srt"
 Requires-Dist: cuda-python; extra == "srt"