sglang 0.4.4.post4__py3-none-any.whl → 0.4.5.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134)
  1. sglang/bench_one_batch.py +21 -0
  2. sglang/bench_serving.py +10 -4
  3. sglang/lang/chat_template.py +24 -0
  4. sglang/srt/configs/model_config.py +40 -4
  5. sglang/srt/constrained/base_grammar_backend.py +26 -5
  6. sglang/srt/constrained/llguidance_backend.py +1 -0
  7. sglang/srt/constrained/outlines_backend.py +1 -0
  8. sglang/srt/constrained/reasoner_grammar_backend.py +101 -0
  9. sglang/srt/constrained/xgrammar_backend.py +1 -0
  10. sglang/srt/conversation.py +29 -4
  11. sglang/srt/disaggregation/base/__init__.py +8 -0
  12. sglang/srt/disaggregation/base/conn.py +113 -0
  13. sglang/srt/disaggregation/decode.py +18 -5
  14. sglang/srt/disaggregation/mini_lb.py +53 -122
  15. sglang/srt/disaggregation/mooncake/__init__.py +6 -0
  16. sglang/srt/disaggregation/mooncake/conn.py +615 -0
  17. sglang/srt/disaggregation/mooncake/transfer_engine.py +108 -0
  18. sglang/srt/disaggregation/prefill.py +43 -19
  19. sglang/srt/disaggregation/utils.py +31 -0
  20. sglang/srt/entrypoints/EngineBase.py +53 -0
  21. sglang/srt/entrypoints/engine.py +36 -8
  22. sglang/srt/entrypoints/http_server.py +37 -8
  23. sglang/srt/entrypoints/http_server_engine.py +142 -0
  24. sglang/srt/entrypoints/verl_engine.py +37 -10
  25. sglang/srt/hf_transformers_utils.py +4 -0
  26. sglang/srt/layers/attention/flashattention_backend.py +609 -202
  27. sglang/srt/layers/attention/flashinfer_backend.py +13 -7
  28. sglang/srt/layers/attention/vision.py +1 -1
  29. sglang/srt/layers/dp_attention.py +2 -4
  30. sglang/srt/layers/elementwise.py +15 -2
  31. sglang/srt/layers/linear.py +1 -0
  32. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +145 -118
  33. sglang/srt/layers/moe/fused_moe_native.py +5 -0
  34. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  35. sglang/srt/layers/moe/fused_moe_triton/configs/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  36. sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  37. sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1024,device_name=NVIDIA_H200.json +146 -0
  38. sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  39. sglang/srt/layers/moe/fused_moe_triton/configs/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  40. sglang/srt/layers/moe/fused_moe_triton/configs/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  41. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  42. sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  43. sglang/srt/layers/moe/fused_moe_triton/configs/{E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json } +34 -34
  44. sglang/srt/layers/moe/fused_moe_triton/configs/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  45. sglang/srt/layers/moe/fused_moe_triton/configs/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  46. sglang/srt/layers/moe/fused_moe_triton/configs/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  47. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +51 -24
  48. sglang/srt/layers/moe/fused_moe_triton/layer.py +7 -0
  49. sglang/srt/layers/moe/router.py +7 -1
  50. sglang/srt/layers/moe/topk.py +37 -16
  51. sglang/srt/layers/quantization/__init__.py +13 -5
  52. sglang/srt/layers/quantization/blockwise_int8.py +2 -0
  53. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +4 -0
  54. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +68 -45
  55. sglang/srt/layers/quantization/fp8.py +28 -14
  56. sglang/srt/layers/quantization/fp8_kernel.py +130 -4
  57. sglang/srt/layers/quantization/fp8_utils.py +34 -6
  58. sglang/srt/layers/quantization/kv_cache.py +43 -52
  59. sglang/srt/layers/quantization/modelopt_quant.py +271 -4
  60. sglang/srt/layers/quantization/moe_wna16.py +2 -0
  61. sglang/srt/layers/quantization/w8a8_fp8.py +154 -4
  62. sglang/srt/layers/quantization/w8a8_int8.py +3 -0
  63. sglang/srt/layers/radix_attention.py +14 -0
  64. sglang/srt/layers/rotary_embedding.py +75 -1
  65. sglang/srt/managers/io_struct.py +254 -97
  66. sglang/srt/managers/mm_utils.py +3 -2
  67. sglang/srt/managers/multimodal_processors/base_processor.py +114 -77
  68. sglang/srt/managers/multimodal_processors/janus_pro.py +3 -1
  69. sglang/srt/managers/multimodal_processors/mllama4.py +146 -0
  70. sglang/srt/managers/schedule_batch.py +62 -21
  71. sglang/srt/managers/scheduler.py +71 -14
  72. sglang/srt/managers/tokenizer_manager.py +17 -3
  73. sglang/srt/managers/tp_worker.py +1 -0
  74. sglang/srt/mem_cache/memory_pool.py +14 -1
  75. sglang/srt/metrics/collector.py +9 -0
  76. sglang/srt/model_executor/cuda_graph_runner.py +7 -4
  77. sglang/srt/model_executor/forward_batch_info.py +234 -15
  78. sglang/srt/model_executor/model_runner.py +49 -9
  79. sglang/srt/model_loader/loader.py +31 -4
  80. sglang/srt/model_loader/weight_utils.py +4 -2
  81. sglang/srt/models/baichuan.py +2 -0
  82. sglang/srt/models/chatglm.py +1 -0
  83. sglang/srt/models/commandr.py +1 -0
  84. sglang/srt/models/dbrx.py +1 -0
  85. sglang/srt/models/deepseek.py +1 -0
  86. sglang/srt/models/deepseek_v2.py +248 -61
  87. sglang/srt/models/exaone.py +1 -0
  88. sglang/srt/models/gemma.py +1 -0
  89. sglang/srt/models/gemma2.py +1 -0
  90. sglang/srt/models/gemma3_causal.py +1 -0
  91. sglang/srt/models/gpt2.py +1 -0
  92. sglang/srt/models/gpt_bigcode.py +1 -0
  93. sglang/srt/models/granite.py +1 -0
  94. sglang/srt/models/grok.py +1 -0
  95. sglang/srt/models/internlm2.py +1 -0
  96. sglang/srt/models/llama.py +13 -4
  97. sglang/srt/models/llama4.py +487 -0
  98. sglang/srt/models/minicpm.py +1 -0
  99. sglang/srt/models/minicpm3.py +2 -0
  100. sglang/srt/models/mixtral.py +1 -0
  101. sglang/srt/models/mixtral_quant.py +1 -0
  102. sglang/srt/models/mllama.py +51 -8
  103. sglang/srt/models/mllama4.py +227 -0
  104. sglang/srt/models/olmo.py +1 -0
  105. sglang/srt/models/olmo2.py +1 -0
  106. sglang/srt/models/olmoe.py +1 -0
  107. sglang/srt/models/phi3_small.py +1 -0
  108. sglang/srt/models/qwen.py +1 -0
  109. sglang/srt/models/qwen2.py +1 -0
  110. sglang/srt/models/qwen2_5_vl.py +35 -70
  111. sglang/srt/models/qwen2_moe.py +1 -0
  112. sglang/srt/models/qwen2_vl.py +27 -25
  113. sglang/srt/models/stablelm.py +1 -0
  114. sglang/srt/models/xverse.py +1 -0
  115. sglang/srt/models/xverse_moe.py +1 -0
  116. sglang/srt/openai_api/adapter.py +4 -1
  117. sglang/srt/patch_torch.py +11 -0
  118. sglang/srt/server_args.py +34 -0
  119. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +4 -4
  120. sglang/srt/speculative/eagle_utils.py +1 -11
  121. sglang/srt/speculative/eagle_worker.py +6 -2
  122. sglang/srt/utils.py +120 -9
  123. sglang/test/attention/test_flashattn_backend.py +259 -221
  124. sglang/test/attention/test_flashattn_mla_backend.py +285 -0
  125. sglang/test/attention/test_prefix_chunk_info.py +224 -0
  126. sglang/test/test_block_fp8.py +57 -0
  127. sglang/test/test_utils.py +19 -8
  128. sglang/version.py +1 -1
  129. {sglang-0.4.4.post4.dist-info → sglang-0.4.5.post1.dist-info}/METADATA +14 -4
  130. {sglang-0.4.4.post4.dist-info → sglang-0.4.5.post1.dist-info}/RECORD +133 -109
  131. sglang/srt/disaggregation/conn.py +0 -81
  132. {sglang-0.4.4.post4.dist-info → sglang-0.4.5.post1.dist-info}/WHEEL +0 -0
  133. {sglang-0.4.4.post4.dist-info → sglang-0.4.5.post1.dist-info}/licenses/LICENSE +0 -0
  134. {sglang-0.4.4.post4.dist-info → sglang-0.4.5.post1.dist-info}/top_level.txt +0 -0
@@ -49,6 +49,7 @@ from sglang.srt.disaggregation.prefill import (
 from sglang.srt.disaggregation.utils import (
     DisaggregationMode,
     ReqToMetadataIdxAllocator,
+    TransferBackend,
 )
 from sglang.srt.hf_transformers_utils import get_processor, get_tokenizer
 from sglang.srt.layers.dp_attention import compute_dp_attention_world_info
@@ -113,6 +114,7 @@ from sglang.srt.mem_cache.hiradix_cache import HiRadixCache
 from sglang.srt.mem_cache.radix_cache import RadixCache
 from sglang.srt.metrics.collector import SchedulerMetricsCollector, SchedulerStats
 from sglang.srt.model_executor.forward_batch_info import ForwardMode
+from sglang.srt.reasoning_parser import ReasoningParser
 from sglang.srt.server_args import PortArgs, ServerArgs
 from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
 from sglang.srt.torch_memory_saver_adapter import TorchMemorySaverAdapter
@@ -232,6 +234,15 @@ class Scheduler(
         # Init tokenizer
         self.init_tokenizer()

+        # Set reasoning_parser and think_end_id if --reasoning_parser is enabled
+        if self.server_args.reasoning_parser and self.tokenizer:
+            reasoning_parser = ReasoningParser(
+                model_type=self.server_args.reasoning_parser, stream_reasoning=False
+            )
+            self.tokenizer.think_end_id = self.tokenizer.encode(
+                reasoning_parser.detector.think_end_token, add_special_tokens=False
+            )[0]
+
         # Check whether overlap can be enabled
         if not self.is_generation:
             self.enable_overlap = False
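
Note: the new block caches the token id of the reasoning parser's end-of-think marker on the tokenizer, so the scheduler can tell where reasoning output stops. A minimal sketch of the same lookup, assuming a Hugging Face tokenizer; the model name and the "</think>" string are example values, while the real marker comes from ReasoningParser(...).detector.think_end_token:

# Sketch only: model name and marker are illustrative assumptions.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B")
think_end_token = "</think>"  # in sglang this is supplied by the reasoning parser's detector
ids = tokenizer.encode(think_end_token, add_special_tokens=False)
tokenizer.think_end_id = ids[0]  # store the first id of the encoded marker, as in the hunk above
print(tokenizer.think_end_id)
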
@@ -427,6 +438,7 @@ class Scheduler(
             context_length=server_args.context_length,
             model_override_args=server_args.json_model_override_args,
             is_embedding=server_args.is_embedding,
+            enable_multimodal=server_args.enable_multimodal,
             dtype=server_args.dtype,
             quantization=server_args.quantization,
         )
@@ -441,6 +453,7 @@ class Scheduler(
                 tokenizer_mode=server_args.tokenizer_mode,
                 trust_remote_code=server_args.trust_remote_code,
                 revision=server_args.revision,
+                use_fast=not server_args.disable_fast_image_processor,
             )
             self.tokenizer = self.processor.tokenizer
         else:
@@ -518,6 +531,10 @@ class Scheduler(
         )

     def init_disaggregation(self):
+        self.transfer_backend = TransferBackend(
+            self.server_args.disaggregation_transfer_backend
+        )
+
         if (
             self.disaggregation_mode == DisaggregationMode.DECODE
         ):  # *2 for the headroom.
@@ -555,6 +572,7 @@ class Scheduler(
                 tp_rank=self.tp_rank,
                 tp_size=self.tp_size,
                 bootstrap_port=self.server_args.disaggregation_bootstrap_port,
+                transfer_backend=self.transfer_backend,
             )
         elif self.disaggregation_mode == DisaggregationMode.PREFILL:
             # *2 for the headroom.
@@ -580,9 +598,11 @@ class Scheduler(
                 tp_size=self.tp_size,
                 bootstrap_port=self.server_args.disaggregation_bootstrap_port,
                 gloo_group=self.tp_worker.get_attention_tp_cpu_group(),
+                transfer_backend=self.transfer_backend,
+                scheduler=self,
             )
             # The prefill requests that are in the middle of kv sending
-            self.disagg_prefill_infight_queue: List[Req] = []
+            self.disagg_prefill_inflight_queue: List[Req] = []

     @DynamicGradMode()
     def event_loop_normal(self):
@@ -662,10 +682,10 @@ class Scheduler(
                 result = self.run_batch(batch)
                 self.process_batch_result_disagg_prefill(batch, result)

-            if len(self.disagg_prefill_infight_queue) > 0:
-                self.process_disagg_prefill_infight_queue()
+            if len(self.disagg_prefill_inflight_queue) > 0:
+                self.process_disagg_prefill_inflight_queue()

-            if batch is None and len(self.disagg_prefill_infight_queue) == 0:
+            if batch is None and len(self.disagg_prefill_inflight_queue) == 0:
                 self.check_memory()
                 self.new_token_ratio = self.init_new_token_ratio

@@ -826,6 +846,8 @@ class Scheduler(
             custom_logit_processor=custom_logit_processor,
             return_hidden_states=recv_req.return_hidden_states,
             eos_token_ids=self.model_config.hf_eos_token_id,
+            bootstrap_host=recv_req.bootstrap_host,
+            bootstrap_room=recv_req.bootstrap_room,
         )
         req.tokenizer = self.tokenizer

@@ -937,12 +959,11 @@ class Scheduler(
         self._add_request_to_queue(req)

     def _add_request_to_queue(self, req: Req):
+        req.queue_time_start = time.time()
         if self.disaggregation_mode == DisaggregationMode.PREFILL:
             self.disagg_prefill_pending_queue.add(req)
-
         elif self.disaggregation_mode == DisaggregationMode.DECODE:
             self.disagg_decode_prealloc_queue.add(req)
-
         else:
             self.waiting_queue.append(req)

@@ -985,6 +1006,7 @@ class Scheduler(
             req.finished_reason = FINISH_ABORT(
                 error_msg, HTTPStatus.BAD_REQUEST, "BadRequestError"
             )
+            req.queue_time_start = time.time()
             self.waiting_queue.append(req)
             return

@@ -1021,9 +1043,10 @@ class Scheduler(
             self._largest_prefill_len, adder.log_input_tokens
         )

+        num_new_seq = len(can_run_list)
         f = (
             f"Prefill batch. "
-            f"#new-seq: {len(can_run_list)}, "
+            f"#new-seq: {num_new_seq}, "
             f"#new-token: {adder.log_input_tokens}, "
             f"#cached-token: {adder.log_hit_tokens}, "
             f"token usage: {num_used / self.max_total_num_tokens:.2f}, "
@@ -1041,6 +1064,12 @@ class Scheduler(
             self.stats.token_usage = round(num_used / self.max_total_num_tokens, 2)
             self.stats.num_queue_reqs = len(self.waiting_queue)
             self.stats.cache_hit_rate = cache_hit_rate
+
+            total_queue_latency = 0
+            for req in can_run_list:
+                total_queue_latency += req.queue_time_end - req.queue_time_start
+            self.stats.avg_request_queue_latency = total_queue_latency / num_new_seq
+
             self.metrics_collector.log_stats(self.stats)

     def log_decode_stats(self):
@@ -1277,6 +1306,12 @@ class Scheduler(
         can_run_list: List[Req] = adder.can_run_list
         if len(can_run_list) == 0:
             return None
+
+        if self.enable_metrics:
+            # only record queue time when enable_metrics is True to avoid overhead
+            for req in can_run_list:
+                req.queue_time_end = time.time()
+
         self.waiting_queue = [
             x for x in self.waiting_queue if x not in set(can_run_list)
         ]
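
Note: together with the log_prefill_stats change above, these hunks implement a per-batch queueing metric: queue_time_start is stamped when a request is enqueued, queue_time_end when it is pulled into a prefill batch (only when metrics are enabled), and the batch average is exported as avg_request_queue_latency. A self-contained sketch of that bookkeeping; the Req stand-in below is hypothetical, not sglang's class:

import time
from dataclasses import dataclass
from typing import List

@dataclass
class Req:  # minimal stand-in for sglang's Req, for illustration only
    rid: int
    queue_time_start: float = 0.0
    queue_time_end: float = 0.0

waiting_queue: List[Req] = []

def add_request_to_queue(req: Req) -> None:
    req.queue_time_start = time.time()  # stamped on enqueue, as in _add_request_to_queue
    waiting_queue.append(req)

def schedule_prefill_batch(can_run_list: List[Req]) -> float:
    now = time.time()
    for req in can_run_list:
        req.queue_time_end = now  # stamped when the request joins a prefill batch
    total = sum(r.queue_time_end - r.queue_time_start for r in can_run_list)
    return total / len(can_run_list)  # avg_request_queue_latency for this batch

add_request_to_queue(Req(rid=1))
add_request_to_queue(Req(rid=2))
time.sleep(0.01)
print(schedule_prefill_batch(waiting_queue))
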
@@ -1456,14 +1491,36 @@ class Scheduler(
         self.send_to_tokenizer.send_pyobj(HealthCheckOutput())

     def prepare_dp_attn_batch(self, local_batch: ScheduleBatch):
+        return self.prepare_dp_attn_batch_raw(
+            local_batch,
+            dp_size=self.server_args.dp_size,
+            attn_tp_size=self.attn_tp_size,
+            tp_cpu_group=self.tp_cpu_group,
+            get_idle_batch=self.get_idle_batch,
+            disable_cuda_graph=self.server_args.disable_cuda_graph,
+            spec_algorithm=self.spec_algorithm,
+            speculative_num_draft_tokens=self.server_args.speculative_num_draft_tokens,
+        )
+
+    @staticmethod
+    def prepare_dp_attn_batch_raw(
+        local_batch: ScheduleBatch,
+        dp_size,
+        attn_tp_size: int,
+        tp_cpu_group,
+        get_idle_batch,
+        disable_cuda_graph: bool,
+        spec_algorithm,
+        speculative_num_draft_tokens,
+    ):
         # Check if other DP workers have running batches
         if local_batch is None:
             num_tokens = 0
             global_num_tokens_for_logprob = 0
         elif local_batch.forward_mode.is_decode():
             num_tokens = local_batch.batch_size()
-            if not self.spec_algorithm.is_none() and self.spec_algorithm.is_eagle():
-                num_tokens = num_tokens * self.server_args.speculative_num_draft_tokens
+            if not spec_algorithm.is_none() and spec_algorithm.is_eagle():
+                num_tokens = num_tokens * speculative_num_draft_tokens
             global_num_tokens_for_logprob = num_tokens
         else:
             num_tokens = local_batch.extend_num_tokens
@@ -1482,7 +1539,7 @@ class Scheduler(
         else:
             can_cuda_graph = 0

-        if not self.spec_algorithm.is_none():
+        if not spec_algorithm.is_none():
             # TODO(sang): Support cuda graph when idle batch is there.
             if local_batch is None or local_batch.forward_mode.is_idle():
                 can_cuda_graph = 0
@@ -1500,13 +1557,13 @@ class Scheduler(
             dtype=torch.int64,
         )
         global_info = torch.empty(
-            (self.server_args.dp_size, self.attn_tp_size, 4),
+            (dp_size, attn_tp_size, 4),
             dtype=torch.int64,
         )
         torch.distributed.all_gather_into_tensor(
             global_info.flatten(),
             local_info,
-            group=self.tp_cpu_group,
+            group=tp_cpu_group,
         )
         global_num_tokens = global_info[:, 0, 0].tolist()
         can_cuda_graph = min(global_info[:, 0, 1].tolist())
@@ -1514,14 +1571,14 @@ class Scheduler(
         is_extend_in_batch = global_info[:, 0, 3].tolist()

         if local_batch is None and max(global_num_tokens) > 0:
-            local_batch = self.get_idle_batch()
+            local_batch = get_idle_batch()

         if local_batch is not None:
             local_batch.global_num_tokens = global_num_tokens
             local_batch.global_num_tokens_for_logprob = global_num_tokens_for_logprob

             # Check forward mode for cuda graph
-            if not self.server_args.disable_cuda_graph:
+            if not disable_cuda_graph:
                 local_batch.can_run_dp_cuda_graph = can_cuda_graph

         return local_batch, any(is_extend_in_batch)
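
Note: the refactor above moves the DP-attention coordination into a static prepare_dp_attn_batch_raw so it no longer reads state from self. The core of that coordination is an all_gather over a CPU group of a small int64 table with one (num_tokens, can_cuda_graph, num_tokens_for_logprob, is_extend_in_batch) row per rank. A single-process sketch of that pattern, assuming a gloo backend supports this collective (as the CPU-group call above suggests) and that the chosen local port is free:

# Illustrative single-rank run of the gather pattern; values are made up.
import os
import torch
import torch.distributed as dist

os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "29555")
dist.init_process_group(backend="gloo", rank=0, world_size=1)

dp_size, attn_tp_size = 1, 1
# One row per rank: (num_tokens, can_cuda_graph, num_tokens_for_logprob, is_extend_in_batch).
local_info = torch.tensor([64, 1, 64, 0], dtype=torch.int64)
global_info = torch.empty((dp_size, attn_tp_size, 4), dtype=torch.int64)
dist.all_gather_into_tensor(global_info.flatten(), local_info)

global_num_tokens = global_info[:, 0, 0].tolist()    # per-DP-rank token counts
can_cuda_graph = min(global_info[:, 0, 1].tolist())  # CUDA graph only if every rank can use it
print(global_num_tokens, can_cuda_graph)
dist.destroy_process_group()
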
@@ -48,8 +48,12 @@ from fastapi import BackgroundTasks

 from sglang.srt.aio_rwlock import RWLock
 from sglang.srt.configs.model_config import ModelConfig
-from sglang.srt.disaggregation.conn import KVBootstrapServer
-from sglang.srt.disaggregation.utils import DisaggregationMode
+from sglang.srt.disaggregation.utils import (
+    DisaggregationMode,
+    KVClassType,
+    TransferBackend,
+    get_kv_class,
+)
 from sglang.srt.hf_transformers_utils import get_processor, get_tokenizer
 from sglang.srt.managers.io_struct import (
     AbortReq,
@@ -163,6 +167,7 @@ class TokenizerManager:
             context_length=server_args.context_length,
             model_override_args=server_args.json_model_override_args,
             is_embedding=server_args.is_embedding,
+            enable_multimodal=server_args.enable_multimodal,
             dtype=server_args.dtype,
             quantization=server_args.quantization,
         )
@@ -179,6 +184,7 @@ class TokenizerManager:
                 tokenizer_mode=server_args.tokenizer_mode,
                 trust_remote_code=server_args.trust_remote_code,
                 revision=server_args.revision,
+                use_fast=not server_args.disable_fast_image_processor,
             )

             # We want to parallelize the image pre-processing so we create an executor for it
@@ -327,10 +333,16 @@ class TokenizerManager:
         self.disaggregation_mode = DisaggregationMode(
             self.server_args.disaggregation_mode
         )
+        self.transfer_backend = TransferBackend(
+            self.server_args.disaggregation_transfer_backend
+        )
         # for disaggregtion, start kv boostrap server on prefill
         if self.disaggregation_mode == DisaggregationMode.PREFILL:
             # only start bootstrap server on prefill tm
-            self.bootstrap_server = KVBootstrapServer(
+            kv_bootstrap_server_class = get_kv_class(
+                self.transfer_backend, KVClassType.BOOTSTRAP_SERVER
+            )
+            self.bootstrap_server = kv_bootstrap_server_class(
                 self.server_args.disaggregation_bootstrap_port
             )

@@ -452,6 +464,8 @@ class TokenizerManager:
             top_logprobs_num,
             token_ids_logprob,
             obj.stream,
+            bootstrap_host=obj.bootstrap_host,
+            bootstrap_room=obj.bootstrap_room,
             lora_path=obj.lora_path,
             input_embeds=input_embeds,
             session_params=session_params,
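
Note: the TokenizerManager changes above stop hard-coding KVBootstrapServer. The bootstrap server class is now looked up from the configured transfer backend via get_kv_class, and bootstrap_host/bootstrap_room are threaded through each tokenized request. A hedged sketch of what such a backend-to-class factory can look like; the enums, table, and MooncakeKVBootstrapServer placeholder below are illustrative, not sglang's actual implementation:

from enum import Enum

class TransferBackend(Enum):      # illustrative mirror of the enum used above
    MOONCAKE = "mooncake"
    FAKE = "fake"

class KVClassType(Enum):          # illustrative
    SENDER = "sender"
    RECEIVER = "receiver"
    BOOTSTRAP_SERVER = "bootstrap_server"

class MooncakeKVBootstrapServer:  # placeholder for a real backend class
    def __init__(self, port: int):
        self.port = port

_KV_CLASSES = {
    (TransferBackend.MOONCAKE, KVClassType.BOOTSTRAP_SERVER): MooncakeKVBootstrapServer,
}

def get_kv_class(backend: TransferBackend, class_type: KVClassType):
    # Resolve the concrete class for the configured backend; fail loudly if unsupported.
    try:
        return _KV_CLASSES[(backend, class_type)]
    except KeyError:
        raise ValueError(f"No {class_type.value} class registered for backend {backend.value}")

server_cls = get_kv_class(TransferBackend.MOONCAKE, KVClassType.BOOTSTRAP_SERVER)
bootstrap_server = server_cls(8998)  # mirrors passing disaggregation_bootstrap_port above
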
@@ -68,6 +68,7 @@ class TpModelWorker:
             context_length=server_args.context_length,
             model_override_args=server_args.json_model_override_args,
             is_embedding=server_args.is_embedding,
+            enable_multimodal=server_args.enable_multimodal,
             dtype=server_args.dtype,
             quantization=server_args.quantization,
         )
@@ -442,6 +442,14 @@ class MLATokenToKVPool(KVCache):

         self.layer_transfer_counter = None

+    # for disagg
+    def get_contiguous_buf_infos(self):
+        # MLA has only one kv_buffer, so only the information of this buffer needs to be returned.
+        kv_data_ptrs = [self.kv_buffer[i].data_ptr() for i in range(self.layer_num)]
+        kv_data_lens = [self.kv_buffer[i].nbytes for i in range(self.layer_num)]
+        kv_item_lens = [self.kv_buffer[i][0].nbytes for i in range(self.layer_num)]
+        return kv_data_ptrs, kv_data_lens, kv_item_lens
+
     def get_key_buffer(self, layer_id: int):
         if self.layer_transfer_counter is not None:
             self.layer_transfer_counter.wait_until(layer_id)
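
Note: get_contiguous_buf_infos hands a KV-transfer engine one (base pointer, total bytes, bytes per token slot) triple per layer, which is enough to address any token's slice of the MLA pool directly. A small CPU-tensor sketch of the same bookkeeping; the buffer sizes below are made-up example values:

import torch

# Example sizes only: 4 layers, 8 token slots, 576-wide MLA entries in fp16.
layer_num, max_tokens, kv_dim = 4, 8, 576
kv_buffer = [
    torch.zeros(max_tokens, 1, kv_dim, dtype=torch.float16) for _ in range(layer_num)
]

# The same triples the pool returns above.
kv_data_ptrs = [kv_buffer[i].data_ptr() for i in range(layer_num)]  # base address per layer
kv_data_lens = [kv_buffer[i].nbytes for i in range(layer_num)]      # total bytes per layer
kv_item_lens = [kv_buffer[i][0].nbytes for i in range(layer_num)]   # bytes per token slot

# A sender can locate token slot t in layer i at kv_data_ptrs[i] + t * kv_item_lens[i]
# and copy kv_item_lens[i] bytes.
print(kv_data_lens[0], kv_item_lens[0])  # 9216 and 1152 with the example sizes above
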
@@ -866,7 +874,12 @@ class MLATokenToKVPoolHost(HostKVCache):
         self.qk_rope_head_dim = self.device_pool.qk_rope_head_dim
         self.layer_num = self.device_pool.layer_num

-        return (self.kv_lora_rank + self.qk_rope_head_dim) * 1 * self.dtype.itemsize
+        return (
+            (self.kv_lora_rank + self.qk_rope_head_dim)
+            * 1
+            * self.dtype.itemsize
+            * self.layer_num
+        )

     def init_kv_buffer(self):
         return torch.empty(
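
Note: the change above fixes the host-side per-token size to cover all layers rather than a single one. With example MLA dimensions (kv_lora_rank = 512, qk_rope_head_dim = 64, 61 layers, a 2-byte dtype; illustrative numbers, not read from any config), the difference is:

# Example values only; the real ones come from the device pool / model config.
kv_lora_rank, qk_rope_head_dim, itemsize, layer_num = 512, 64, 2, 61
old_size = (kv_lora_rank + qk_rope_head_dim) * 1 * itemsize              # 1152 bytes: one layer only
new_size = (kv_lora_rank + qk_rope_head_dim) * 1 * itemsize * layer_num  # 70272 bytes: all layers per token
print(old_size, new_size)
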
@@ -27,6 +27,7 @@ class SchedulerStats:
     num_queue_reqs: int = 0
     cache_hit_rate: float = 0.0
     spec_accept_length: float = 0.0
+    avg_request_queue_latency: float = 0.0


 class SchedulerMetricsCollector:
@@ -87,6 +88,13 @@ class SchedulerMetricsCollector:
             multiprocess_mode="mostrecent",
         )

+        self.avg_request_queue_latency = Gauge(
+            name="sglang:avg_request_queue_latency",
+            documentation="The average request queue latency for the last batch of requests in seconds.",
+            labelnames=labels.keys(),
+            multiprocess_mode="mostrecent",
+        )
+
     def _log_gauge(self, gauge, data: Union[int, float]) -> None:
         # Convenience function for logging to gauge.
         gauge.labels(**self.labels).set(data)
@@ -99,6 +107,7 @@ class SchedulerMetricsCollector:
         self._log_gauge(self.num_queue_reqs, stats.num_queue_reqs)
         self._log_gauge(self.cache_hit_rate, stats.cache_hit_rate)
         self._log_gauge(self.spec_accept_length, stats.spec_accept_length)
+        self._log_gauge(self.avg_request_queue_latency, stats.avg_request_queue_latency)
         self.last_log_time = time.time()


@@ -34,6 +34,7 @@ from sglang.srt.model_executor.forward_batch_info import (
     ForwardBatch,
     ForwardMode,
 )
+from sglang.srt.patch_torch import monkey_patch_torch_compile
 from sglang.srt.utils import get_available_gpu_memory, is_hip

 _is_hip = is_hip()
@@ -108,6 +109,8 @@ def set_torch_compile_config():
     if hasattr(torch._dynamo.config, "cache_size_limit"):
         torch._dynamo.config.cache_size_limit = 1024

+    monkey_patch_torch_compile()
+

 def get_batch_sizes_to_capture(model_runner: ModelRunner):
     server_args = model_runner.server_args
@@ -116,7 +119,7 @@ def get_batch_sizes_to_capture(model_runner: ModelRunner):
     if capture_bs is None:
         if server_args.speculative_algorithm is None:
             if server_args.disable_cuda_graph_padding:
-                capture_bs = list(range(1, 33)) + range(40, 161, 16)
+                capture_bs = list(range(1, 33)) + list(range(40, 161, 16))
             else:
                 capture_bs = [1, 2, 4, 8] + list(range(16, 161, 8))
         else:
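
Note: this one-character fix addresses a Python 3 crash on the --disable-cuda-graph-padding path: range() returns a lazy range object, and concatenating it to a list with + raises TypeError, so both operands must be materialized as lists.

# Reproduces the bug fixed above and the corrected behavior.
try:
    capture_bs = list(range(1, 33)) + range(40, 161, 16)   # old code: TypeError in Python 3
except TypeError as e:
    print(e)  # can only concatenate list (not "range") to list
capture_bs = list(range(1, 33)) + list(range(40, 161, 16))  # fixed
print(capture_bs[:4], capture_bs[-3:])
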
@@ -269,10 +272,10 @@ class CudaGraphRunner:
             raise Exception(
                 f"Capture cuda graph failed: {e}\n"
                 "Possible solutions:\n"
-                "1. disable cuda graph by --disable-cuda-graph\n"
-                "2. set --mem-fraction-static to a smaller value (e.g., 0.8 or 0.7)\n"
+                "1. set --mem-fraction-static to a smaller value (e.g., 0.8 or 0.7)\n"
+                "2. set --cuda-graph-max-bs to a smaller value (e.g., 32)\n"
                 "3. disable torch compile by not using --enable-torch-compile\n"
-                "4. set --cuda-graph-max-bs to a smaller value (e.g., 32)\n"
+                "4. disable cuda graph by --disable-cuda-graph\n"
                 "Open an issue on GitHub https://github.com/sgl-project/sglang/issues/new/choose \n"
             )