sglang 0.4.6__py3-none-any.whl → 0.4.6.post2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85)
  1. sglang/bench_one_batch.py +2 -0
  2. sglang/check_env.py +3 -3
  3. sglang/srt/configs/__init__.py +4 -0
  4. sglang/srt/configs/kimi_vl.py +38 -0
  5. sglang/srt/configs/kimi_vl_moonvit.py +32 -0
  6. sglang/srt/configs/model_config.py +15 -0
  7. sglang/srt/conversation.py +122 -1
  8. sglang/srt/disaggregation/decode.py +8 -2
  9. sglang/srt/disaggregation/fake/__init__.py +1 -0
  10. sglang/srt/disaggregation/fake/conn.py +88 -0
  11. sglang/srt/disaggregation/prefill.py +12 -3
  12. sglang/srt/disaggregation/utils.py +16 -2
  13. sglang/srt/entrypoints/engine.py +52 -21
  14. sglang/srt/entrypoints/http_server.py +27 -2
  15. sglang/srt/function_call_parser.py +97 -0
  16. sglang/srt/hf_transformers_utils.py +2 -0
  17. sglang/srt/layers/attention/cutlass_mla_backend.py +278 -0
  18. sglang/srt/layers/attention/flashinfer_backend.py +107 -82
  19. sglang/srt/layers/attention/flashinfer_mla_backend.py +27 -16
  20. sglang/srt/layers/attention/flashmla_backend.py +3 -0
  21. sglang/srt/layers/attention/utils.py +1 -1
  22. sglang/srt/layers/dp_attention.py +5 -2
  23. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +1 -3
  24. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  25. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  26. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=192,device_name=NVIDIA_H20.json +146 -0
  27. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=192,device_name=NVIDIA_H200.json +146 -0
  28. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  29. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  30. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=384,device_name=NVIDIA_H20.json +146 -0
  31. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  32. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=384,device_name=NVIDIA_H200.json +146 -0
  33. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  34. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  35. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  36. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_H20.json +146 -0
  37. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  38. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_H200.json +146 -0
  39. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=96,device_name=NVIDIA_H20.json +146 -0
  40. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +10 -8
  41. sglang/srt/layers/moe/fused_moe_triton/layer.py +15 -17
  42. sglang/srt/layers/quantization/__init__.py +2 -2
  43. sglang/srt/layers/quantization/deep_gemm.py +1 -1
  44. sglang/srt/layers/quantization/fp8.py +20 -22
  45. sglang/srt/layers/quantization/fp8_utils.py +2 -2
  46. sglang/srt/layers/utils.py +35 -0
  47. sglang/srt/lora/layers.py +35 -9
  48. sglang/srt/lora/lora_manager.py +84 -35
  49. sglang/srt/managers/data_parallel_controller.py +52 -34
  50. sglang/srt/managers/multimodal_processors/kimi_vl.py +73 -0
  51. sglang/srt/managers/schedule_batch.py +34 -15
  52. sglang/srt/managers/scheduler.py +273 -67
  53. sglang/srt/managers/scheduler_output_processor_mixin.py +26 -10
  54. sglang/srt/managers/tp_worker.py +52 -17
  55. sglang/srt/managers/tp_worker_overlap_thread.py +18 -7
  56. sglang/srt/mem_cache/memory_pool.py +70 -36
  57. sglang/srt/model_executor/cuda_graph_runner.py +82 -19
  58. sglang/srt/model_executor/forward_batch_info.py +31 -1
  59. sglang/srt/model_executor/model_runner.py +123 -58
  60. sglang/srt/models/deepseek_nextn.py +1 -257
  61. sglang/srt/models/deepseek_v2.py +78 -18
  62. sglang/srt/models/kimi_vl.py +308 -0
  63. sglang/srt/models/kimi_vl_moonvit.py +639 -0
  64. sglang/srt/models/llama.py +92 -30
  65. sglang/srt/models/llama4.py +2 -1
  66. sglang/srt/models/llama_eagle.py +4 -1
  67. sglang/srt/models/llama_eagle3.py +4 -1
  68. sglang/srt/models/qwen2_moe.py +8 -3
  69. sglang/srt/models/qwen2_vl.py +0 -12
  70. sglang/srt/models/qwen3_moe.py +8 -3
  71. sglang/srt/openai_api/adapter.py +49 -8
  72. sglang/srt/openai_api/protocol.py +13 -1
  73. sglang/srt/reasoning_parser.py +25 -1
  74. sglang/srt/server_args.py +83 -24
  75. sglang/srt/speculative/eagle_worker.py +3 -2
  76. sglang/srt/utils.py +91 -9
  77. sglang/test/runners.py +4 -0
  78. sglang/test/send_one.py +84 -28
  79. sglang/test/test_utils.py +67 -0
  80. sglang/version.py +1 -1
  81. {sglang-0.4.6.dist-info → sglang-0.4.6.post2.dist-info}/METADATA +5 -4
  82. {sglang-0.4.6.dist-info → sglang-0.4.6.post2.dist-info}/RECORD +85 -60
  83. {sglang-0.4.6.dist-info → sglang-0.4.6.post2.dist-info}/WHEEL +1 -1
  84. {sglang-0.4.6.dist-info → sglang-0.4.6.post2.dist-info}/licenses/LICENSE +0 -0
  85. {sglang-0.4.6.dist-info → sglang-0.4.6.post2.dist-info}/top_level.txt +0 -0
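Judging from the file list, the headline changes in 0.4.6.post2 are Kimi-VL support (new configs, a multimodal processor, and two model files), a new CUTLASS MLA attention backend, fake disaggregation connectors (presumably for testing), a batch of fused-MoE Triton tuning configs for E=128 expert layouts on A800/H100/H20/H200, and a substantial scheduler rework (sglang/srt/managers/scheduler.py, +273 -67). The hunks below show the changes to sglang/srt/managers/schedule_batch.py.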
@@ -35,6 +35,7 @@ ScheduleBatch -> ModelWorkerBatch -> ForwardBatch
 import copy
 import dataclasses
 import logging
+import threading
 from typing import TYPE_CHECKING, List, Optional, Set, Tuple, Union

 import numpy as np
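The new threading import backs the launch_done event that the hunks below add to ScheduleBatch and ModelWorkerBatch. As a reminder of the signalling pattern a threading.Event provides, here is a minimal standalone sketch (gpu_worker and the surrounding flow are invented for illustration; this is not sglang's scheduler code):

    import threading

    launch_done = threading.Event()

    def gpu_worker():
        # ... launch the forward pass here ...
        launch_done.set()  # signal that the launch has completed

    threading.Thread(target=gpu_worker).start()
    launch_done.wait()  # the consumer blocks here until the worker signals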
@@ -65,23 +66,24 @@ INIT_INCREMENTAL_DETOKENIZATION_OFFSET = 5
 # Put some global args for easy access
 global_server_args_dict = {
     "attention_backend": ServerArgs.attention_backend,
-    "sampling_backend": ServerArgs.sampling_backend,
-    "triton_attention_reduce_in_fp32": ServerArgs.triton_attention_reduce_in_fp32,
-    "torchao_config": ServerArgs.torchao_config,
-    "enable_nan_detection": ServerArgs.enable_nan_detection,
-    "enable_dp_attention": ServerArgs.enable_dp_attention,
-    "enable_ep_moe": ServerArgs.enable_ep_moe,
-    "enable_deepep_moe": ServerArgs.enable_deepep_moe,
+    "chunked_prefill_size": ServerArgs.chunked_prefill_size,
     "deepep_mode": ServerArgs.deepep_mode,
     "device": ServerArgs.device,
-    "speculative_accept_threshold_single": ServerArgs.speculative_accept_threshold_single,
-    "speculative_accept_threshold_acc": ServerArgs.speculative_accept_threshold_acc,
+    "disable_chunked_prefix_cache": ServerArgs.disable_chunked_prefix_cache,
     "disable_radix_cache": ServerArgs.disable_radix_cache,
+    "enable_deepep_moe": ServerArgs.enable_deepep_moe,
+    "enable_dp_attention": ServerArgs.enable_dp_attention,
+    "enable_ep_moe": ServerArgs.enable_ep_moe,
+    "enable_nan_detection": ServerArgs.enable_nan_detection,
     "flashinfer_mla_disable_ragged": ServerArgs.flashinfer_mla_disable_ragged,
+    "max_micro_batch_size": ServerArgs.max_micro_batch_size,
     "moe_dense_tp_size": ServerArgs.moe_dense_tp_size,
-    "chunked_prefill_size": ServerArgs.chunked_prefill_size,
     "n_share_experts_fusion": ServerArgs.n_share_experts_fusion,
-    "disable_chunked_prefix_cache": ServerArgs.disable_chunked_prefix_cache,
+    "sampling_backend": ServerArgs.sampling_backend,
+    "speculative_accept_threshold_acc": ServerArgs.speculative_accept_threshold_acc,
+    "speculative_accept_threshold_single": ServerArgs.speculative_accept_threshold_single,
+    "torchao_config": ServerArgs.torchao_config,
+    "triton_attention_reduce_in_fp32": ServerArgs.triton_attention_reduce_in_fp32,
 }

 logger = logging.getLogger(__name__)
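The net effect of this hunk is smaller than it looks: the dictionary is re-sorted alphabetically by key, and the only genuinely new entry is max_micro_batch_size; every other removed line reappears as an added line at its sorted position.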
@@ -724,6 +726,12 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
     # This is an optimization to reduce the overhead of the prefill check.
     batch_is_full: bool = False

+    # Events
+    launch_done: Optional[threading.Event] = None
+
+    # For chunked prefill in PP
+    chunked_req: Optional[Req] = None
+
     # Sampling info
     sampling_info: SamplingBatchInfo = None
     next_batch_sampling_info: SamplingBatchInfo = None
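Per the comments, launch_done is an event the overlap scheduler can use to signal that a batch's forward pass has been launched, and chunked_req carries the in-flight chunked-prefill request for pipeline parallelism (PP).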
@@ -757,7 +765,7 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
     # For extend and mixed chunekd prefill
     prefix_lens: List[int] = None
     extend_lens: List[int] = None
-    extend_num_tokens: int = None
+    extend_num_tokens: Optional[int] = None
     decoding_reqs: List[Req] = None
     extend_logprob_start_lens: List[int] = None
     # It comes empty list if logprob is not required.
@@ -799,6 +807,7 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
         enable_overlap: bool,
         spec_algorithm: SpeculativeAlgorithm,
         enable_custom_logit_processor: bool,
+        chunked_req: Optional[Req] = None,
     ):
         return_logprob = any(req.return_logprob for req in reqs)

@@ -816,6 +825,7 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
             spec_algorithm=spec_algorithm,
             enable_custom_logit_processor=enable_custom_logit_processor,
             return_hidden_states=any(req.return_hidden_states for req in reqs),
+            chunked_req=chunked_req,
         )

     def batch_size(self):
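Because chunked_req defaults to None in both the field declaration and the init_new signature, existing call sites keep working unchanged.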
@@ -1232,7 +1242,7 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):

     def retract_decode(self, server_args: ServerArgs):
         """Retract the decoding requests when there is not enough memory."""
-        sorted_indices = [i for i in range(len(self.reqs))]
+        sorted_indices = list(range(len(self.reqs)))

         # TODO(lsyin): improve retraction policy for radix cache
         # For spec decoding, filter_batch API can only filter
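list(range(len(self.reqs))) is the idiomatic spelling of the identity comprehension [i for i in range(...)]; behavior is unchanged.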
@@ -1409,15 +1419,19 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):

     def filter_batch(
         self,
-        chunked_req_to_exclude: Optional[Req] = None,
+        chunked_req_to_exclude: Optional[Union[Req, List[Req]]] = None,
         keep_indices: Optional[List[int]] = None,
     ):
         if keep_indices is None:
+            if isinstance(chunked_req_to_exclude, Req):
+                chunked_req_to_exclude = [chunked_req_to_exclude]
+            elif chunked_req_to_exclude is None:
+                chunked_req_to_exclude = []
             keep_indices = [
                 i
                 for i in range(len(self.reqs))
                 if not self.reqs[i].finished()
-                and self.reqs[i] is not chunked_req_to_exclude
+                and not self.reqs[i] in chunked_req_to_exclude
             ]

         if keep_indices is None or len(keep_indices) == 0:
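filter_batch now accepts either a single Req or a list of them and normalizes to a list up front; since Req uses default object equality, the membership test "not self.reqs[i] in chunked_req_to_exclude" applies the old identity check across every excluded request. The normalization pattern as a standalone sketch (Req here is a stand-in class and normalize_excludes an invented name):

    from typing import List, Optional, Union

    class Req:  # stand-in for sglang's Req
        pass

    def normalize_excludes(x: Optional[Union[Req, List[Req]]]) -> List[Req]:
        if isinstance(x, Req):
            return [x]  # a single request becomes a one-element list
        return x or []  # None becomes an empty list; a list passes through

    r = Req()
    assert normalize_excludes(None) == []
    assert normalize_excludes(r) == [r]
    assert normalize_excludes([r]) == [r]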
@@ -1511,6 +1525,7 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
             )
             or global_server_args_dict["attention_backend"] == "flashmla"
             or global_server_args_dict["attention_backend"] == "fa3"
+            or global_server_args_dict["attention_backend"] == "cutlass_mla"
         ):
             seq_lens_cpu = self.seq_lens.cpu()
         else:
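cutlass_mla joins flashmla and fa3 among the backends that need seq_lens materialized on the CPU, matching the new sglang/srt/layers/attention/cutlass_mla_backend.py in the file list above.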
@@ -1565,6 +1580,7 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
                 )
             ),
             extend_input_logprob_token_ids=self.extend_input_logprob_token_ids,
+            launch_done=self.launch_done,
         )

     def copy(self):
@@ -1647,6 +1663,9 @@ class ModelWorkerBatch:
     # If set, the output of the batch contains the hidden states of the run.
     capture_hidden_mode: CaptureHiddenMode = None

+    # Overlap event
+    launch_done: Optional[threading.Event] = None
+

 @triton.jit
 def write_req_to_token_pool_triton(
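Together with the launch_done=self.launch_done pass-through in the previous hunk, the event travels with the batch from ScheduleBatch into ModelWorkerBatch, giving the overlap worker thread a handle to signal on.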