sglang 0.4.4.post4__py3-none-any.whl → 0.4.5.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +21 -0
- sglang/bench_serving.py +10 -4
- sglang/lang/chat_template.py +24 -0
- sglang/srt/configs/model_config.py +40 -4
- sglang/srt/constrained/base_grammar_backend.py +26 -5
- sglang/srt/constrained/llguidance_backend.py +1 -0
- sglang/srt/constrained/outlines_backend.py +1 -0
- sglang/srt/constrained/reasoner_grammar_backend.py +101 -0
- sglang/srt/constrained/xgrammar_backend.py +1 -0
- sglang/srt/conversation.py +29 -4
- sglang/srt/disaggregation/base/__init__.py +8 -0
- sglang/srt/disaggregation/base/conn.py +113 -0
- sglang/srt/disaggregation/decode.py +18 -5
- sglang/srt/disaggregation/mini_lb.py +53 -122
- sglang/srt/disaggregation/mooncake/__init__.py +6 -0
- sglang/srt/disaggregation/mooncake/conn.py +615 -0
- sglang/srt/disaggregation/mooncake/transfer_engine.py +108 -0
- sglang/srt/disaggregation/prefill.py +43 -19
- sglang/srt/disaggregation/utils.py +31 -0
- sglang/srt/entrypoints/EngineBase.py +53 -0
- sglang/srt/entrypoints/engine.py +36 -8
- sglang/srt/entrypoints/http_server.py +37 -8
- sglang/srt/entrypoints/http_server_engine.py +142 -0
- sglang/srt/entrypoints/verl_engine.py +37 -10
- sglang/srt/hf_transformers_utils.py +4 -0
- sglang/srt/layers/attention/flashattention_backend.py +609 -202
- sglang/srt/layers/attention/flashinfer_backend.py +13 -7
- sglang/srt/layers/attention/vision.py +1 -1
- sglang/srt/layers/dp_attention.py +2 -4
- sglang/srt/layers/elementwise.py +15 -2
- sglang/srt/layers/linear.py +1 -0
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +145 -118
- sglang/srt/layers/moe/fused_moe_native.py +5 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1024,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/{E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +34 -34
- sglang/srt/layers/moe/fused_moe_triton/configs/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +51 -24
- sglang/srt/layers/moe/fused_moe_triton/layer.py +7 -0
- sglang/srt/layers/moe/router.py +7 -1
- sglang/srt/layers/moe/topk.py +37 -16
- sglang/srt/layers/quantization/__init__.py +13 -5
- sglang/srt/layers/quantization/blockwise_int8.py +2 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +4 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +68 -45
- sglang/srt/layers/quantization/fp8.py +28 -14
- sglang/srt/layers/quantization/fp8_kernel.py +130 -4
- sglang/srt/layers/quantization/fp8_utils.py +34 -6
- sglang/srt/layers/quantization/kv_cache.py +43 -52
- sglang/srt/layers/quantization/modelopt_quant.py +271 -4
- sglang/srt/layers/quantization/moe_wna16.py +2 -0
- sglang/srt/layers/quantization/w8a8_fp8.py +154 -4
- sglang/srt/layers/quantization/w8a8_int8.py +3 -0
- sglang/srt/layers/radix_attention.py +14 -0
- sglang/srt/layers/rotary_embedding.py +75 -1
- sglang/srt/managers/io_struct.py +254 -97
- sglang/srt/managers/mm_utils.py +3 -2
- sglang/srt/managers/multimodal_processors/base_processor.py +114 -77
- sglang/srt/managers/multimodal_processors/janus_pro.py +3 -1
- sglang/srt/managers/multimodal_processors/mllama4.py +146 -0
- sglang/srt/managers/schedule_batch.py +62 -21
- sglang/srt/managers/scheduler.py +71 -14
- sglang/srt/managers/tokenizer_manager.py +17 -3
- sglang/srt/managers/tp_worker.py +1 -0
- sglang/srt/mem_cache/memory_pool.py +14 -1
- sglang/srt/metrics/collector.py +9 -0
- sglang/srt/model_executor/cuda_graph_runner.py +7 -4
- sglang/srt/model_executor/forward_batch_info.py +234 -15
- sglang/srt/model_executor/model_runner.py +49 -9
- sglang/srt/model_loader/loader.py +31 -4
- sglang/srt/model_loader/weight_utils.py +4 -2
- sglang/srt/models/baichuan.py +2 -0
- sglang/srt/models/chatglm.py +1 -0
- sglang/srt/models/commandr.py +1 -0
- sglang/srt/models/dbrx.py +1 -0
- sglang/srt/models/deepseek.py +1 -0
- sglang/srt/models/deepseek_v2.py +248 -61
- sglang/srt/models/exaone.py +1 -0
- sglang/srt/models/gemma.py +1 -0
- sglang/srt/models/gemma2.py +1 -0
- sglang/srt/models/gemma3_causal.py +1 -0
- sglang/srt/models/gpt2.py +1 -0
- sglang/srt/models/gpt_bigcode.py +1 -0
- sglang/srt/models/granite.py +1 -0
- sglang/srt/models/grok.py +1 -0
- sglang/srt/models/internlm2.py +1 -0
- sglang/srt/models/llama.py +13 -4
- sglang/srt/models/llama4.py +487 -0
- sglang/srt/models/minicpm.py +1 -0
- sglang/srt/models/minicpm3.py +2 -0
- sglang/srt/models/mixtral.py +1 -0
- sglang/srt/models/mixtral_quant.py +1 -0
- sglang/srt/models/mllama.py +51 -8
- sglang/srt/models/mllama4.py +227 -0
- sglang/srt/models/olmo.py +1 -0
- sglang/srt/models/olmo2.py +1 -0
- sglang/srt/models/olmoe.py +1 -0
- sglang/srt/models/phi3_small.py +1 -0
- sglang/srt/models/qwen.py +1 -0
- sglang/srt/models/qwen2.py +1 -0
- sglang/srt/models/qwen2_5_vl.py +35 -70
- sglang/srt/models/qwen2_moe.py +1 -0
- sglang/srt/models/qwen2_vl.py +27 -25
- sglang/srt/models/stablelm.py +1 -0
- sglang/srt/models/xverse.py +1 -0
- sglang/srt/models/xverse_moe.py +1 -0
- sglang/srt/openai_api/adapter.py +4 -1
- sglang/srt/patch_torch.py +11 -0
- sglang/srt/server_args.py +34 -0
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +4 -4
- sglang/srt/speculative/eagle_utils.py +1 -11
- sglang/srt/speculative/eagle_worker.py +6 -2
- sglang/srt/utils.py +120 -9
- sglang/test/attention/test_flashattn_backend.py +259 -221
- sglang/test/attention/test_flashattn_mla_backend.py +285 -0
- sglang/test/attention/test_prefix_chunk_info.py +224 -0
- sglang/test/test_block_fp8.py +57 -0
- sglang/test/test_utils.py +19 -8
- sglang/version.py +1 -1
- {sglang-0.4.4.post4.dist-info → sglang-0.4.5.post1.dist-info}/METADATA +14 -4
- {sglang-0.4.4.post4.dist-info → sglang-0.4.5.post1.dist-info}/RECORD +133 -109
- sglang/srt/disaggregation/conn.py +0 -81
- {sglang-0.4.4.post4.dist-info → sglang-0.4.5.post1.dist-info}/WHEEL +0 -0
- {sglang-0.4.4.post4.dist-info → sglang-0.4.5.post1.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.4.post4.dist-info → sglang-0.4.5.post1.dist-info}/top_level.txt +0 -0
sglang/srt/managers/scheduler.py
CHANGED
@@ -49,6 +49,7 @@ from sglang.srt.disaggregation.prefill import (
 from sglang.srt.disaggregation.utils import (
     DisaggregationMode,
     ReqToMetadataIdxAllocator,
+    TransferBackend,
 )
 from sglang.srt.hf_transformers_utils import get_processor, get_tokenizer
 from sglang.srt.layers.dp_attention import compute_dp_attention_world_info
@@ -113,6 +114,7 @@ from sglang.srt.mem_cache.hiradix_cache import HiRadixCache
 from sglang.srt.mem_cache.radix_cache import RadixCache
 from sglang.srt.metrics.collector import SchedulerMetricsCollector, SchedulerStats
 from sglang.srt.model_executor.forward_batch_info import ForwardMode
+from sglang.srt.reasoning_parser import ReasoningParser
 from sglang.srt.server_args import PortArgs, ServerArgs
 from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
 from sglang.srt.torch_memory_saver_adapter import TorchMemorySaverAdapter
@@ -232,6 +234,15 @@ class Scheduler(
         # Init tokenizer
         self.init_tokenizer()

+        # Set reasoning_parser and think_end_id if --reasoning_parser is enabled
+        if self.server_args.reasoning_parser and self.tokenizer:
+            reasoning_parser = ReasoningParser(
+                model_type=self.server_args.reasoning_parser, stream_reasoning=False
+            )
+            self.tokenizer.think_end_id = self.tokenizer.encode(
+                reasoning_parser.detector.think_end_token, add_special_tokens=False
+            )[0]
+
         # Check whether overlap can be enabled
         if not self.is_generation:
             self.enable_overlap = False
@@ -427,6 +438,7 @@ class Scheduler(
             context_length=server_args.context_length,
             model_override_args=server_args.json_model_override_args,
             is_embedding=server_args.is_embedding,
+            enable_multimodal=server_args.enable_multimodal,
             dtype=server_args.dtype,
             quantization=server_args.quantization,
         )
@@ -441,6 +453,7 @@ class Scheduler(
                 tokenizer_mode=server_args.tokenizer_mode,
                 trust_remote_code=server_args.trust_remote_code,
                 revision=server_args.revision,
+                use_fast=not server_args.disable_fast_image_processor,
             )
             self.tokenizer = self.processor.tokenizer
         else:
@@ -518,6 +531,10 @@ class Scheduler(
         )

     def init_disaggregation(self):
+        self.transfer_backend = TransferBackend(
+            self.server_args.disaggregation_transfer_backend
+        )
+
         if (
             self.disaggregation_mode == DisaggregationMode.DECODE
         ):  # *2 for the headroom.
@@ -555,6 +572,7 @@ class Scheduler(
                 tp_rank=self.tp_rank,
                 tp_size=self.tp_size,
                 bootstrap_port=self.server_args.disaggregation_bootstrap_port,
+                transfer_backend=self.transfer_backend,
             )
         elif self.disaggregation_mode == DisaggregationMode.PREFILL:
             # *2 for the headroom.
@@ -580,9 +598,11 @@ class Scheduler(
                 tp_size=self.tp_size,
                 bootstrap_port=self.server_args.disaggregation_bootstrap_port,
                 gloo_group=self.tp_worker.get_attention_tp_cpu_group(),
+                transfer_backend=self.transfer_backend,
+                scheduler=self,
             )
             # The prefill requests that are in the middle of kv sending
-            self.
+            self.disagg_prefill_inflight_queue: List[Req] = []

     @DynamicGradMode()
     def event_loop_normal(self):
@@ -662,10 +682,10 @@ class Scheduler(
                 result = self.run_batch(batch)
                 self.process_batch_result_disagg_prefill(batch, result)

-            if len(self.
-                self.
+            if len(self.disagg_prefill_inflight_queue) > 0:
+                self.process_disagg_prefill_inflight_queue()

-            if batch is None and len(self.
+            if batch is None and len(self.disagg_prefill_inflight_queue) == 0:
                 self.check_memory()
                 self.new_token_ratio = self.init_new_token_ratio

@@ -826,6 +846,8 @@ class Scheduler(
                 custom_logit_processor=custom_logit_processor,
                 return_hidden_states=recv_req.return_hidden_states,
                 eos_token_ids=self.model_config.hf_eos_token_id,
+                bootstrap_host=recv_req.bootstrap_host,
+                bootstrap_room=recv_req.bootstrap_room,
             )
             req.tokenizer = self.tokenizer

@@ -937,12 +959,11 @@ class Scheduler(
             self._add_request_to_queue(req)

     def _add_request_to_queue(self, req: Req):
+        req.queue_time_start = time.time()
         if self.disaggregation_mode == DisaggregationMode.PREFILL:
             self.disagg_prefill_pending_queue.add(req)
-
         elif self.disaggregation_mode == DisaggregationMode.DECODE:
             self.disagg_decode_prealloc_queue.add(req)
-
         else:
             self.waiting_queue.append(req)

@@ -985,6 +1006,7 @@ class Scheduler(
             req.finished_reason = FINISH_ABORT(
                 error_msg, HTTPStatus.BAD_REQUEST, "BadRequestError"
             )
+            req.queue_time_start = time.time()
             self.waiting_queue.append(req)
             return

@@ -1021,9 +1043,10 @@ class Scheduler(
                 self._largest_prefill_len, adder.log_input_tokens
             )

+        num_new_seq = len(can_run_list)
         f = (
             f"Prefill batch. "
-            f"#new-seq: {
+            f"#new-seq: {num_new_seq}, "
             f"#new-token: {adder.log_input_tokens}, "
             f"#cached-token: {adder.log_hit_tokens}, "
             f"token usage: {num_used / self.max_total_num_tokens:.2f}, "
@@ -1041,6 +1064,12 @@ class Scheduler(
             self.stats.token_usage = round(num_used / self.max_total_num_tokens, 2)
             self.stats.num_queue_reqs = len(self.waiting_queue)
             self.stats.cache_hit_rate = cache_hit_rate
+
+            total_queue_latency = 0
+            for req in can_run_list:
+                total_queue_latency += req.queue_time_end - req.queue_time_start
+            self.stats.avg_request_queue_latency = total_queue_latency / num_new_seq
+
             self.metrics_collector.log_stats(self.stats)

     def log_decode_stats(self):
@@ -1277,6 +1306,12 @@ class Scheduler(
         can_run_list: List[Req] = adder.can_run_list
         if len(can_run_list) == 0:
             return None
+
+        if self.enable_metrics:
+            # only record queue time when enable_metrics is True to avoid overhead
+            for req in can_run_list:
+                req.queue_time_end = time.time()
+
         self.waiting_queue = [
             x for x in self.waiting_queue if x not in set(can_run_list)
         ]
@@ -1456,14 +1491,36 @@ class Scheduler(
             self.send_to_tokenizer.send_pyobj(HealthCheckOutput())

     def prepare_dp_attn_batch(self, local_batch: ScheduleBatch):
+        return self.prepare_dp_attn_batch_raw(
+            local_batch,
+            dp_size=self.server_args.dp_size,
+            attn_tp_size=self.attn_tp_size,
+            tp_cpu_group=self.tp_cpu_group,
+            get_idle_batch=self.get_idle_batch,
+            disable_cuda_graph=self.server_args.disable_cuda_graph,
+            spec_algorithm=self.spec_algorithm,
+            speculative_num_draft_tokens=self.server_args.speculative_num_draft_tokens,
+        )
+
+    @staticmethod
+    def prepare_dp_attn_batch_raw(
+        local_batch: ScheduleBatch,
+        dp_size,
+        attn_tp_size: int,
+        tp_cpu_group,
+        get_idle_batch,
+        disable_cuda_graph: bool,
+        spec_algorithm,
+        speculative_num_draft_tokens,
+    ):
         # Check if other DP workers have running batches
         if local_batch is None:
             num_tokens = 0
             global_num_tokens_for_logprob = 0
         elif local_batch.forward_mode.is_decode():
             num_tokens = local_batch.batch_size()
-            if not
-                num_tokens = num_tokens *
+            if not spec_algorithm.is_none() and spec_algorithm.is_eagle():
+                num_tokens = num_tokens * speculative_num_draft_tokens
             global_num_tokens_for_logprob = num_tokens
         else:
             num_tokens = local_batch.extend_num_tokens
@@ -1482,7 +1539,7 @@ class Scheduler(
         else:
             can_cuda_graph = 0

-        if not
+        if not spec_algorithm.is_none():
             # TODO(sang): Support cuda graph when idle batch is there.
             if local_batch is None or local_batch.forward_mode.is_idle():
                 can_cuda_graph = 0
@@ -1500,13 +1557,13 @@ class Scheduler(
             dtype=torch.int64,
         )
         global_info = torch.empty(
-            (
+            (dp_size, attn_tp_size, 4),
             dtype=torch.int64,
         )
         torch.distributed.all_gather_into_tensor(
             global_info.flatten(),
             local_info,
-            group=
+            group=tp_cpu_group,
         )
         global_num_tokens = global_info[:, 0, 0].tolist()
         can_cuda_graph = min(global_info[:, 0, 1].tolist())
@@ -1514,14 +1571,14 @@ class Scheduler(
         is_extend_in_batch = global_info[:, 0, 3].tolist()

         if local_batch is None and max(global_num_tokens) > 0:
-            local_batch =
+            local_batch = get_idle_batch()

         if local_batch is not None:
             local_batch.global_num_tokens = global_num_tokens
             local_batch.global_num_tokens_for_logprob = global_num_tokens_for_logprob

             # Check forward mode for cuda graph
-            if not
+            if not disable_cuda_graph:
                 local_batch.can_run_dp_cuda_graph = can_cuda_graph

         return local_batch, any(is_extend_in_batch)
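The queue-latency additions above stamp each request with queue_time_start when it enters a waiting queue and queue_time_end when it is pulled into a prefill batch, then average the difference over the scheduled requests. A minimal standalone sketch of that bookkeeping (the Req class and helper functions below are illustrative stand-ins, not sglang's actual classes):

    import time
    from dataclasses import dataclass
    from typing import List

    @dataclass
    class Req:
        rid: str
        queue_time_start: float = 0.0
        queue_time_end: float = 0.0

    def add_request_to_queue(req: Req, waiting_queue: List[Req]) -> None:
        req.queue_time_start = time.time()  # stamped on enqueue
        waiting_queue.append(req)

    def schedule_batch(can_run_list: List[Req]) -> float:
        now = time.time()
        for req in can_run_list:
            req.queue_time_end = now  # stamped when the request leaves the queue
        total = sum(r.queue_time_end - r.queue_time_start for r in can_run_list)
        return total / len(can_run_list)  # reported as avg_request_queue_latency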
sglang/srt/managers/tokenizer_manager.py
CHANGED
@@ -48,8 +48,12 @@ from fastapi import BackgroundTasks

 from sglang.srt.aio_rwlock import RWLock
 from sglang.srt.configs.model_config import ModelConfig
-from sglang.srt.disaggregation.
-
+from sglang.srt.disaggregation.utils import (
+    DisaggregationMode,
+    KVClassType,
+    TransferBackend,
+    get_kv_class,
+)
 from sglang.srt.hf_transformers_utils import get_processor, get_tokenizer
 from sglang.srt.managers.io_struct import (
     AbortReq,
@@ -163,6 +167,7 @@ class TokenizerManager:
             context_length=server_args.context_length,
             model_override_args=server_args.json_model_override_args,
             is_embedding=server_args.is_embedding,
+            enable_multimodal=server_args.enable_multimodal,
             dtype=server_args.dtype,
             quantization=server_args.quantization,
         )
@@ -179,6 +184,7 @@ class TokenizerManager:
                 tokenizer_mode=server_args.tokenizer_mode,
                 trust_remote_code=server_args.trust_remote_code,
                 revision=server_args.revision,
+                use_fast=not server_args.disable_fast_image_processor,
             )

         # We want to parallelize the image pre-processing so we create an executor for it
@@ -327,10 +333,16 @@ class TokenizerManager:
         self.disaggregation_mode = DisaggregationMode(
             self.server_args.disaggregation_mode
         )
+        self.transfer_backend = TransferBackend(
+            self.server_args.disaggregation_transfer_backend
+        )
         # for disaggregtion, start kv boostrap server on prefill
         if self.disaggregation_mode == DisaggregationMode.PREFILL:
             # only start bootstrap server on prefill tm
-
+            kv_bootstrap_server_class = get_kv_class(
+                self.transfer_backend, KVClassType.BOOTSTRAP_SERVER
+            )
+            self.bootstrap_server = kv_bootstrap_server_class(
                 self.server_args.disaggregation_bootstrap_port
             )

@@ -452,6 +464,8 @@ class TokenizerManager:
             top_logprobs_num,
             token_ids_logprob,
             obj.stream,
+            bootstrap_host=obj.bootstrap_host,
+            bootstrap_room=obj.bootstrap_room,
             lora_path=obj.lora_path,
             input_embeds=input_embeds,
             session_params=session_params,
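The bootstrap-server setup above resolves a concrete class through get_kv_class(transfer_backend, KVClassType.BOOTSTRAP_SERVER) before instantiating it with the bootstrap port. A rough sketch of the enum-keyed lookup such a helper implies; the registry and the Mooncake class below are hypothetical placeholders, not the actual sglang implementation:

    from enum import Enum

    class TransferBackend(Enum):
        MOONCAKE = "mooncake"
        FAKE = "fake"

    class KVClassType(Enum):
        BOOTSTRAP_SERVER = "bootstrap_server"

    class MooncakeKVBootstrapServer:  # hypothetical placeholder implementation
        def __init__(self, port: int):
            self.port = port

    _KV_CLASS_REGISTRY = {
        (TransferBackend.MOONCAKE, KVClassType.BOOTSTRAP_SERVER): MooncakeKVBootstrapServer,
    }

    def get_kv_class(backend: TransferBackend, class_type: KVClassType):
        # map (transfer backend, role) to the concrete class
        return _KV_CLASS_REGISTRY[(backend, class_type)]

    server_cls = get_kv_class(TransferBackend.MOONCAKE, KVClassType.BOOTSTRAP_SERVER)
    bootstrap_server = server_cls(8998)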
sglang/srt/managers/tp_worker.py
CHANGED
@@ -68,6 +68,7 @@ class TpModelWorker:
             context_length=server_args.context_length,
             model_override_args=server_args.json_model_override_args,
             is_embedding=server_args.is_embedding,
+            enable_multimodal=server_args.enable_multimodal,
             dtype=server_args.dtype,
             quantization=server_args.quantization,
         )
sglang/srt/mem_cache/memory_pool.py
CHANGED
@@ -442,6 +442,14 @@ class MLATokenToKVPool(KVCache):

         self.layer_transfer_counter = None

+    # for disagg
+    def get_contiguous_buf_infos(self):
+        # MLA has only one kv_buffer, so only the information of this buffer needs to be returned.
+        kv_data_ptrs = [self.kv_buffer[i].data_ptr() for i in range(self.layer_num)]
+        kv_data_lens = [self.kv_buffer[i].nbytes for i in range(self.layer_num)]
+        kv_item_lens = [self.kv_buffer[i][0].nbytes for i in range(self.layer_num)]
+        return kv_data_ptrs, kv_data_lens, kv_item_lens
+
     def get_key_buffer(self, layer_id: int):
         if self.layer_transfer_counter is not None:
             self.layer_transfer_counter.wait_until(layer_id)
@@ -866,7 +874,12 @@ class MLATokenToKVPoolHost(HostKVCache):
         self.qk_rope_head_dim = self.device_pool.qk_rope_head_dim
         self.layer_num = self.device_pool.layer_num

-        return (
+        return (
+            (self.kv_lora_rank + self.qk_rope_head_dim)
+            * 1
+            * self.dtype.itemsize
+            * self.layer_num
+        )

     def init_kv_buffer(self):
         return torch.empty(
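get_contiguous_buf_infos above only enumerates the per-layer KV buffers and reports each buffer's device pointer, total byte length, and per-token item length, which is the information a KV transfer engine needs to register and slice the memory. A self-contained illustration with plain torch tensors (the shapes are made up):

    import torch

    layer_num, num_tokens, head_dim = 4, 16, 576  # made-up sizes
    kv_buffer = [torch.zeros(num_tokens, 1, head_dim) for _ in range(layer_num)]

    kv_data_ptrs = [kv_buffer[i].data_ptr() for i in range(layer_num)]  # start address of each layer's buffer
    kv_data_lens = [kv_buffer[i].nbytes for i in range(layer_num)]      # total bytes per layer
    kv_item_lens = [kv_buffer[i][0].nbytes for i in range(layer_num)]   # bytes per token slot

    print(kv_data_lens[0], kv_item_lens[0])  # 36864 and 2304 for float32 with these shapes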
sglang/srt/metrics/collector.py
CHANGED
@@ -27,6 +27,7 @@ class SchedulerStats:
     num_queue_reqs: int = 0
     cache_hit_rate: float = 0.0
     spec_accept_length: float = 0.0
+    avg_request_queue_latency: float = 0.0


 class SchedulerMetricsCollector:
@@ -87,6 +88,13 @@ class SchedulerMetricsCollector:
             multiprocess_mode="mostrecent",
         )

+        self.avg_request_queue_latency = Gauge(
+            name="sglang:avg_request_queue_latency",
+            documentation="The average request queue latency for the last batch of requests in seconds.",
+            labelnames=labels.keys(),
+            multiprocess_mode="mostrecent",
+        )
+
     def _log_gauge(self, gauge, data: Union[int, float]) -> None:
         # Convenience function for logging to gauge.
         gauge.labels(**self.labels).set(data)
@@ -99,6 +107,7 @@ class SchedulerMetricsCollector:
         self._log_gauge(self.num_queue_reqs, stats.num_queue_reqs)
         self._log_gauge(self.cache_hit_rate, stats.cache_hit_rate)
         self._log_gauge(self.spec_accept_length, stats.spec_accept_length)
+        self._log_gauge(self.avg_request_queue_latency, stats.avg_request_queue_latency)
         self.last_log_time = time.time()
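The new gauge follows the same prometheus_client pattern as the existing scheduler metrics: declare a labeled Gauge once, then set it on every log pass. A minimal sketch, assuming a recent prometheus_client release that accepts the "mostrecent" multiprocess mode (the label values here are arbitrary):

    from prometheus_client import Gauge

    labels = {"model_name": "example-model"}

    avg_request_queue_latency = Gauge(
        name="sglang:avg_request_queue_latency",
        documentation="The average request queue latency for the last batch of requests in seconds.",
        labelnames=labels.keys(),
        multiprocess_mode="mostrecent",
    )

    # equivalent of SchedulerMetricsCollector._log_gauge(...)
    avg_request_queue_latency.labels(**labels).set(0.0123)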
sglang/srt/model_executor/cuda_graph_runner.py
CHANGED
@@ -34,6 +34,7 @@ from sglang.srt.model_executor.forward_batch_info import (
     ForwardBatch,
     ForwardMode,
 )
+from sglang.srt.patch_torch import monkey_patch_torch_compile
 from sglang.srt.utils import get_available_gpu_memory, is_hip

 _is_hip = is_hip()
@@ -108,6 +109,8 @@ def set_torch_compile_config():
     if hasattr(torch._dynamo.config, "cache_size_limit"):
         torch._dynamo.config.cache_size_limit = 1024

+    monkey_patch_torch_compile()
+


 def get_batch_sizes_to_capture(model_runner: ModelRunner):
     server_args = model_runner.server_args
@@ -116,7 +119,7 @@ def get_batch_sizes_to_capture(model_runner: ModelRunner):
     if capture_bs is None:
         if server_args.speculative_algorithm is None:
             if server_args.disable_cuda_graph_padding:
-                capture_bs = list(range(1, 33)) + range(40, 161, 16)
+                capture_bs = list(range(1, 33)) + list(range(40, 161, 16))
             else:
                 capture_bs = [1, 2, 4, 8] + list(range(16, 161, 8))
     else:
@@ -269,10 +272,10 @@ class CudaGraphRunner:
             raise Exception(
                 f"Capture cuda graph failed: {e}\n"
                 "Possible solutions:\n"
-                "1.
-                "2. set --
+                "1. set --mem-fraction-static to a smaller value (e.g., 0.8 or 0.7)\n"
+                "2. set --cuda-graph-max-bs to a smaller value (e.g., 32)\n"
                 "3. disable torch compile by not using --enable-torch-compile\n"
-                "4.
+                "4. disable cuda graph by --disable-cuda-graph\n"
                 "Open an issue on GitHub https://github.com/sgl-project/sglang/issues/new/choose \n"
             )
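The capture_bs change above is a straightforward bug fix: a list cannot be concatenated with a range object, so the second range has to be wrapped in list() as well. For example:

    # list(range(1, 33)) + range(40, 161, 16)
    # -> TypeError: can only concatenate list (not "range") to list

    capture_bs = list(range(1, 33)) + list(range(40, 161, 16))
    print(capture_bs[-3:])  # [120, 136, 152]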