sglang 0.4.6__py3-none-any.whl → 0.4.6.post2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +2 -0
- sglang/check_env.py +3 -3
- sglang/srt/configs/__init__.py +4 -0
- sglang/srt/configs/kimi_vl.py +38 -0
- sglang/srt/configs/kimi_vl_moonvit.py +32 -0
- sglang/srt/configs/model_config.py +15 -0
- sglang/srt/conversation.py +122 -1
- sglang/srt/disaggregation/decode.py +8 -2
- sglang/srt/disaggregation/fake/__init__.py +1 -0
- sglang/srt/disaggregation/fake/conn.py +88 -0
- sglang/srt/disaggregation/prefill.py +12 -3
- sglang/srt/disaggregation/utils.py +16 -2
- sglang/srt/entrypoints/engine.py +52 -21
- sglang/srt/entrypoints/http_server.py +27 -2
- sglang/srt/function_call_parser.py +97 -0
- sglang/srt/hf_transformers_utils.py +2 -0
- sglang/srt/layers/attention/cutlass_mla_backend.py +278 -0
- sglang/srt/layers/attention/flashinfer_backend.py +107 -82
- sglang/srt/layers/attention/flashinfer_mla_backend.py +27 -16
- sglang/srt/layers/attention/flashmla_backend.py +3 -0
- sglang/srt/layers/attention/utils.py +1 -1
- sglang/srt/layers/dp_attention.py +5 -2
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +1 -3
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=192,device_name=NVIDIA_H20.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=192,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=384,device_name=NVIDIA_H20.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=384,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_H20.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=96,device_name=NVIDIA_H20.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +10 -8
- sglang/srt/layers/moe/fused_moe_triton/layer.py +15 -17
- sglang/srt/layers/quantization/__init__.py +2 -2
- sglang/srt/layers/quantization/deep_gemm.py +1 -1
- sglang/srt/layers/quantization/fp8.py +20 -22
- sglang/srt/layers/quantization/fp8_utils.py +2 -2
- sglang/srt/layers/utils.py +35 -0
- sglang/srt/lora/layers.py +35 -9
- sglang/srt/lora/lora_manager.py +84 -35
- sglang/srt/managers/data_parallel_controller.py +52 -34
- sglang/srt/managers/multimodal_processors/kimi_vl.py +73 -0
- sglang/srt/managers/schedule_batch.py +34 -15
- sglang/srt/managers/scheduler.py +273 -67
- sglang/srt/managers/scheduler_output_processor_mixin.py +26 -10
- sglang/srt/managers/tp_worker.py +52 -17
- sglang/srt/managers/tp_worker_overlap_thread.py +18 -7
- sglang/srt/mem_cache/memory_pool.py +70 -36
- sglang/srt/model_executor/cuda_graph_runner.py +82 -19
- sglang/srt/model_executor/forward_batch_info.py +31 -1
- sglang/srt/model_executor/model_runner.py +123 -58
- sglang/srt/models/deepseek_nextn.py +1 -257
- sglang/srt/models/deepseek_v2.py +78 -18
- sglang/srt/models/kimi_vl.py +308 -0
- sglang/srt/models/kimi_vl_moonvit.py +639 -0
- sglang/srt/models/llama.py +92 -30
- sglang/srt/models/llama4.py +2 -1
- sglang/srt/models/llama_eagle.py +4 -1
- sglang/srt/models/llama_eagle3.py +4 -1
- sglang/srt/models/qwen2_moe.py +8 -3
- sglang/srt/models/qwen2_vl.py +0 -12
- sglang/srt/models/qwen3_moe.py +8 -3
- sglang/srt/openai_api/adapter.py +49 -8
- sglang/srt/openai_api/protocol.py +13 -1
- sglang/srt/reasoning_parser.py +25 -1
- sglang/srt/server_args.py +83 -24
- sglang/srt/speculative/eagle_worker.py +3 -2
- sglang/srt/utils.py +91 -9
- sglang/test/runners.py +4 -0
- sglang/test/send_one.py +84 -28
- sglang/test/test_utils.py +67 -0
- sglang/version.py +1 -1
- {sglang-0.4.6.dist-info → sglang-0.4.6.post2.dist-info}/METADATA +5 -4
- {sglang-0.4.6.dist-info → sglang-0.4.6.post2.dist-info}/RECORD +85 -60
- {sglang-0.4.6.dist-info → sglang-0.4.6.post2.dist-info}/WHEEL +1 -1
- {sglang-0.4.6.dist-info → sglang-0.4.6.post2.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.dist-info → sglang-0.4.6.post2.dist-info}/top_level.txt +0 -0
@@ -35,6 +35,7 @@ ScheduleBatch -> ModelWorkerBatch -> ForwardBatch
|
|
35
35
|
import copy
|
36
36
|
import dataclasses
|
37
37
|
import logging
|
38
|
+
import threading
|
38
39
|
from typing import TYPE_CHECKING, List, Optional, Set, Tuple, Union
|
39
40
|
|
40
41
|
import numpy as np
|
@@ -65,23 +66,24 @@ INIT_INCREMENTAL_DETOKENIZATION_OFFSET = 5
|
|
65
66
|
# Put some global args for easy access
|
66
67
|
global_server_args_dict = {
|
67
68
|
"attention_backend": ServerArgs.attention_backend,
|
68
|
-
"sampling_backend": ServerArgs.sampling_backend,
|
69
|
-
"triton_attention_reduce_in_fp32": ServerArgs.triton_attention_reduce_in_fp32,
|
70
|
-
"torchao_config": ServerArgs.torchao_config,
|
71
|
-
"enable_nan_detection": ServerArgs.enable_nan_detection,
|
72
|
-
"enable_dp_attention": ServerArgs.enable_dp_attention,
|
73
|
-
"enable_ep_moe": ServerArgs.enable_ep_moe,
|
74
|
-
"enable_deepep_moe": ServerArgs.enable_deepep_moe,
|
69
|
+
"chunked_prefill_size": ServerArgs.chunked_prefill_size,
|
75
70
|
"deepep_mode": ServerArgs.deepep_mode,
|
76
71
|
"device": ServerArgs.device,
|
77
|
-
"speculative_accept_threshold_single": ServerArgs.speculative_accept_threshold_single,
|
78
|
-
"speculative_accept_threshold_acc": ServerArgs.speculative_accept_threshold_acc,
|
72
|
+
"disable_chunked_prefix_cache": ServerArgs.disable_chunked_prefix_cache,
|
79
73
|
"disable_radix_cache": ServerArgs.disable_radix_cache,
|
74
|
+
"enable_deepep_moe": ServerArgs.enable_deepep_moe,
|
75
|
+
"enable_dp_attention": ServerArgs.enable_dp_attention,
|
76
|
+
"enable_ep_moe": ServerArgs.enable_ep_moe,
|
77
|
+
"enable_nan_detection": ServerArgs.enable_nan_detection,
|
80
78
|
"flashinfer_mla_disable_ragged": ServerArgs.flashinfer_mla_disable_ragged,
|
79
|
+
"max_micro_batch_size": ServerArgs.max_micro_batch_size,
|
81
80
|
"moe_dense_tp_size": ServerArgs.moe_dense_tp_size,
|
82
|
-
"chunked_prefill_size": ServerArgs.chunked_prefill_size,
|
83
81
|
"n_share_experts_fusion": ServerArgs.n_share_experts_fusion,
|
84
|
-
"disable_chunked_prefix_cache": ServerArgs.disable_chunked_prefix_cache,
|
82
|
+
"sampling_backend": ServerArgs.sampling_backend,
|
83
|
+
"speculative_accept_threshold_acc": ServerArgs.speculative_accept_threshold_acc,
|
84
|
+
"speculative_accept_threshold_single": ServerArgs.speculative_accept_threshold_single,
|
85
|
+
"torchao_config": ServerArgs.torchao_config,
|
86
|
+
"triton_attention_reduce_in_fp32": ServerArgs.triton_attention_reduce_in_fp32,
|
85
87
|
}
|
86
88
|
|
87
89
|
logger = logging.getLogger(__name__)
|
@@ -724,6 +726,12 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
|
|
724
726
|
# This is an optimization to reduce the overhead of the prefill check.
|
725
727
|
batch_is_full: bool = False
|
726
728
|
|
729
|
+
# Events
|
730
|
+
launch_done: Optional[threading.Event] = None
|
731
|
+
|
732
|
+
# For chunked prefill in PP
|
733
|
+
chunked_req: Optional[Req] = None
|
734
|
+
|
727
735
|
# Sampling info
|
728
736
|
sampling_info: SamplingBatchInfo = None
|
729
737
|
next_batch_sampling_info: SamplingBatchInfo = None
|
@@ -757,7 +765,7 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
|
|
757
765
|
# For extend and mixed chunked prefill
|
758
766
|
prefix_lens: List[int] = None
|
759
767
|
extend_lens: List[int] = None
|
760
|
-
extend_num_tokens: int = None
|
768
|
+
extend_num_tokens: Optional[int] = None
|
761
769
|
decoding_reqs: List[Req] = None
|
762
770
|
extend_logprob_start_lens: List[int] = None
|
763
771
|
# It is an empty list if logprob is not required.
|
@@ -799,6 +807,7 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
|
|
799
807
|
enable_overlap: bool,
|
800
808
|
spec_algorithm: SpeculativeAlgorithm,
|
801
809
|
enable_custom_logit_processor: bool,
|
810
|
+
chunked_req: Optional[Req] = None,
|
802
811
|
):
|
803
812
|
return_logprob = any(req.return_logprob for req in reqs)
|
804
813
|
|
@@ -816,6 +825,7 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
|
|
816
825
|
spec_algorithm=spec_algorithm,
|
817
826
|
enable_custom_logit_processor=enable_custom_logit_processor,
|
818
827
|
return_hidden_states=any(req.return_hidden_states for req in reqs),
|
828
|
+
chunked_req=chunked_req,
|
819
829
|
)
|
820
830
|
|
821
831
|
def batch_size(self):
|
@@ -1232,7 +1242,7 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
|
|
1232
1242
|
|
1233
1243
|
def retract_decode(self, server_args: ServerArgs):
|
1234
1244
|
"""Retract the decoding requests when there is not enough memory."""
|
1235
|
-
sorted_indices = [i for i in range(len(self.reqs))]
|
1245
|
+
sorted_indices = list(range(len(self.reqs)))
|
1236
1246
|
|
1237
1247
|
# TODO(lsyin): improve retraction policy for radix cache
|
1238
1248
|
# For spec decoding, filter_batch API can only filter
|
@@ -1409,15 +1419,19 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
|
|
1409
1419
|
|
1410
1420
|
def filter_batch(
|
1411
1421
|
self,
|
1412
|
-
chunked_req_to_exclude: Optional[Req] = None,
|
1422
|
+
chunked_req_to_exclude: Optional[Union[Req, List[Req]]] = None,
|
1413
1423
|
keep_indices: Optional[List[int]] = None,
|
1414
1424
|
):
|
1415
1425
|
if keep_indices is None:
|
1426
|
+
if isinstance(chunked_req_to_exclude, Req):
|
1427
|
+
chunked_req_to_exclude = [chunked_req_to_exclude]
|
1428
|
+
elif chunked_req_to_exclude is None:
|
1429
|
+
chunked_req_to_exclude = []
|
1416
1430
|
keep_indices = [
|
1417
1431
|
i
|
1418
1432
|
for i in range(len(self.reqs))
|
1419
1433
|
if not self.reqs[i].finished()
|
1420
|
-
and self.reqs[i] is not chunked_req_to_exclude
|
1434
|
+
and not self.reqs[i] in chunked_req_to_exclude
|
1421
1435
|
]
|
1422
1436
|
|
1423
1437
|
if keep_indices is None or len(keep_indices) == 0:
|
@@ -1511,6 +1525,7 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
|
|
1511
1525
|
)
|
1512
1526
|
or global_server_args_dict["attention_backend"] == "flashmla"
|
1513
1527
|
or global_server_args_dict["attention_backend"] == "fa3"
|
1528
|
+
or global_server_args_dict["attention_backend"] == "cutlass_mla"
|
1514
1529
|
):
|
1515
1530
|
seq_lens_cpu = self.seq_lens.cpu()
|
1516
1531
|
else:
|
@@ -1565,6 +1580,7 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
|
|
1565
1580
|
)
|
1566
1581
|
),
|
1567
1582
|
extend_input_logprob_token_ids=self.extend_input_logprob_token_ids,
|
1583
|
+
launch_done=self.launch_done,
|
1568
1584
|
)
|
1569
1585
|
|
1570
1586
|
def copy(self):
|
@@ -1647,6 +1663,9 @@ class ModelWorkerBatch:
|
|
1647
1663
|
# If set, the output of the batch contains the hidden states of the run.
|
1648
1664
|
capture_hidden_mode: CaptureHiddenMode = None
|
1649
1665
|
|
1666
|
+
# Overlap event
|
1667
|
+
launch_done: Optional[threading.Event] = None
|
1668
|
+
|
1650
1669
|
|
1651
1670
|
@triton.jit
|
1652
1671
|
def write_req_to_token_pool_triton(
|