sglang 0.5.4__py3-none-any.whl → 0.5.4.post2__py3-none-any.whl

This diff compares publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
Files changed (195)
  1. sglang/bench_one_batch.py +149 -34
  2. sglang/bench_serving.py +73 -14
  3. sglang/compile_deep_gemm.py +13 -7
  4. sglang/launch_server.py +2 -0
  5. sglang/srt/batch_invariant_ops/__init__.py +2 -0
  6. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +221 -4
  7. sglang/srt/checkpoint_engine/__init__.py +9 -0
  8. sglang/srt/checkpoint_engine/update.py +317 -0
  9. sglang/srt/compilation/backend.py +1 -1
  10. sglang/srt/configs/__init__.py +2 -0
  11. sglang/srt/configs/deepseek_ocr.py +542 -10
  12. sglang/srt/configs/deepseekvl2.py +95 -194
  13. sglang/srt/configs/kimi_linear.py +160 -0
  14. sglang/srt/configs/mamba_utils.py +66 -0
  15. sglang/srt/configs/model_config.py +30 -7
  16. sglang/srt/constants.py +7 -0
  17. sglang/srt/debug_utils/tensor_dump_forward_hook.py +149 -0
  18. sglang/srt/disaggregation/decode.py +34 -6
  19. sglang/srt/disaggregation/nixl/conn.py +2 -2
  20. sglang/srt/disaggregation/prefill.py +25 -3
  21. sglang/srt/distributed/device_communicators/custom_all_reduce.py +3 -1
  22. sglang/srt/distributed/parallel_state.py +9 -12
  23. sglang/srt/entrypoints/engine.py +31 -20
  24. sglang/srt/entrypoints/grpc_server.py +0 -1
  25. sglang/srt/entrypoints/http_server.py +94 -94
  26. sglang/srt/entrypoints/openai/protocol.py +7 -1
  27. sglang/srt/entrypoints/openai/serving_chat.py +42 -0
  28. sglang/srt/entrypoints/openai/serving_completions.py +10 -0
  29. sglang/srt/entrypoints/openai/serving_embedding.py +1 -0
  30. sglang/srt/environ.py +23 -2
  31. sglang/srt/eplb/expert_distribution.py +64 -1
  32. sglang/srt/eplb/expert_location.py +106 -36
  33. sglang/srt/function_call/function_call_parser.py +2 -0
  34. sglang/srt/function_call/minimax_m2.py +367 -0
  35. sglang/srt/grpc/compile_proto.py +3 -0
  36. sglang/srt/layers/activation.py +6 -0
  37. sglang/srt/layers/attention/ascend_backend.py +233 -5
  38. sglang/srt/layers/attention/attention_registry.py +3 -0
  39. sglang/srt/layers/attention/fla/chunk_delta_h.py +61 -32
  40. sglang/srt/layers/attention/fla/fused_recurrent.py +17 -4
  41. sglang/srt/layers/attention/fla/kda.py +1359 -0
  42. sglang/srt/layers/attention/fla/layernorm_gated.py +7 -1
  43. sglang/srt/layers/attention/flashattention_backend.py +19 -8
  44. sglang/srt/layers/attention/flashinfer_backend.py +10 -1
  45. sglang/srt/layers/attention/flashinfer_mla_backend.py +21 -11
  46. sglang/srt/layers/attention/flashmla_backend.py +1 -1
  47. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +223 -0
  48. sglang/srt/layers/attention/mamba/mamba.py +20 -11
  49. sglang/srt/layers/attention/nsa/dequant_k_cache.py +138 -6
  50. sglang/srt/layers/attention/nsa/nsa_indexer.py +45 -22
  51. sglang/srt/layers/attention/nsa/quant_k_cache.py +44 -12
  52. sglang/srt/layers/attention/nsa/transform_index.py +1 -1
  53. sglang/srt/layers/attention/nsa_backend.py +157 -23
  54. sglang/srt/layers/attention/triton_backend.py +4 -1
  55. sglang/srt/layers/attention/trtllm_mha_backend.py +10 -4
  56. sglang/srt/layers/attention/trtllm_mla_backend.py +11 -15
  57. sglang/srt/layers/attention/utils.py +78 -0
  58. sglang/srt/layers/communicator.py +24 -1
  59. sglang/srt/layers/deep_gemm_wrapper/compile_utils.py +1 -1
  60. sglang/srt/layers/layernorm.py +35 -6
  61. sglang/srt/layers/logits_processor.py +9 -20
  62. sglang/srt/layers/moe/cutlass_w4a8_moe.py +138 -0
  63. sglang/srt/layers/moe/ep_moe/kernels.py +194 -0
  64. sglang/srt/layers/moe/ep_moe/layer.py +78 -289
  65. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  66. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128]_down.json +164 -0
  67. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +68 -22
  68. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +43 -3
  69. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +106 -26
  70. sglang/srt/layers/moe/fused_moe_triton/layer.py +3 -3
  71. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +7 -4
  72. sglang/srt/layers/moe/moe_runner/deep_gemm.py +340 -55
  73. sglang/srt/layers/moe/moe_runner/runner.py +3 -0
  74. sglang/srt/layers/moe/moe_runner/triton_kernels.py +194 -0
  75. sglang/srt/layers/moe/token_dispatcher/__init__.py +4 -4
  76. sglang/srt/layers/moe/token_dispatcher/base.py +11 -5
  77. sglang/srt/layers/moe/token_dispatcher/deepep.py +25 -18
  78. sglang/srt/layers/moe/token_dispatcher/standard.py +1 -1
  79. sglang/srt/layers/moe/topk.py +35 -10
  80. sglang/srt/layers/moe/utils.py +3 -4
  81. sglang/srt/layers/pooler.py +21 -2
  82. sglang/srt/layers/quantization/__init__.py +13 -84
  83. sglang/srt/layers/quantization/auto_round.py +394 -0
  84. sglang/srt/layers/quantization/awq.py +0 -3
  85. sglang/srt/layers/quantization/base_config.py +7 -0
  86. sglang/srt/layers/quantization/fp8.py +68 -63
  87. sglang/srt/layers/quantization/fp8_kernel.py +1 -1
  88. sglang/srt/layers/quantization/fp8_utils.py +2 -2
  89. sglang/srt/layers/quantization/gguf.py +566 -0
  90. sglang/srt/layers/quantization/modelopt_quant.py +168 -11
  91. sglang/srt/layers/quantization/mxfp4.py +30 -38
  92. sglang/srt/layers/quantization/unquant.py +23 -45
  93. sglang/srt/layers/quantization/w4afp8.py +38 -2
  94. sglang/srt/layers/radix_attention.py +5 -2
  95. sglang/srt/layers/rotary_embedding.py +130 -46
  96. sglang/srt/layers/sampler.py +12 -1
  97. sglang/srt/lora/lora_registry.py +9 -0
  98. sglang/srt/managers/async_mm_data_processor.py +122 -0
  99. sglang/srt/managers/data_parallel_controller.py +30 -3
  100. sglang/srt/managers/detokenizer_manager.py +3 -0
  101. sglang/srt/managers/io_struct.py +29 -4
  102. sglang/srt/managers/multi_tokenizer_mixin.py +22 -1
  103. sglang/srt/managers/schedule_batch.py +74 -15
  104. sglang/srt/managers/scheduler.py +185 -144
  105. sglang/srt/managers/scheduler_metrics_mixin.py +22 -14
  106. sglang/srt/managers/scheduler_output_processor_mixin.py +40 -3
  107. sglang/srt/managers/scheduler_pp_mixin.py +7 -2
  108. sglang/srt/managers/scheduler_profiler_mixin.py +3 -4
  109. sglang/srt/managers/scheduler_runtime_checker_mixin.py +45 -0
  110. sglang/srt/managers/scheduler_update_weights_mixin.py +18 -3
  111. sglang/srt/managers/session_controller.py +6 -5
  112. sglang/srt/managers/tokenizer_manager.py +165 -78
  113. sglang/srt/managers/tp_worker.py +24 -1
  114. sglang/srt/mem_cache/base_prefix_cache.py +23 -4
  115. sglang/srt/mem_cache/common.py +1 -0
  116. sglang/srt/mem_cache/hicache_storage.py +7 -1
  117. sglang/srt/mem_cache/memory_pool.py +253 -57
  118. sglang/srt/mem_cache/memory_pool_host.py +12 -5
  119. sglang/srt/mem_cache/radix_cache.py +4 -0
  120. sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +3 -2
  121. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +1 -1
  122. sglang/srt/metrics/collector.py +46 -3
  123. sglang/srt/model_executor/cuda_graph_runner.py +15 -3
  124. sglang/srt/model_executor/forward_batch_info.py +55 -14
  125. sglang/srt/model_executor/model_runner.py +77 -170
  126. sglang/srt/model_executor/npu_graph_runner.py +7 -3
  127. sglang/srt/model_executor/piecewise_cuda_graph_runner.py +22 -12
  128. sglang/srt/model_loader/weight_utils.py +1 -1
  129. sglang/srt/models/bailing_moe.py +9 -2
  130. sglang/srt/models/deepseek_nextn.py +11 -2
  131. sglang/srt/models/deepseek_v2.py +296 -78
  132. sglang/srt/models/glm4.py +391 -77
  133. sglang/srt/models/glm4_moe.py +322 -354
  134. sglang/srt/models/glm4_moe_nextn.py +4 -14
  135. sglang/srt/models/glm4v.py +196 -55
  136. sglang/srt/models/glm4v_moe.py +29 -197
  137. sglang/srt/models/gpt_oss.py +1 -10
  138. sglang/srt/models/kimi_linear.py +678 -0
  139. sglang/srt/models/llama4.py +1 -1
  140. sglang/srt/models/llama_eagle3.py +11 -1
  141. sglang/srt/models/longcat_flash.py +2 -2
  142. sglang/srt/models/minimax_m2.py +922 -0
  143. sglang/srt/models/nvila.py +355 -0
  144. sglang/srt/models/nvila_lite.py +184 -0
  145. sglang/srt/models/qwen2.py +23 -2
  146. sglang/srt/models/qwen2_moe.py +30 -15
  147. sglang/srt/models/qwen3.py +35 -5
  148. sglang/srt/models/qwen3_moe.py +18 -12
  149. sglang/srt/models/qwen3_next.py +7 -0
  150. sglang/srt/multimodal/customized_mm_processor_utils.py +35 -0
  151. sglang/srt/multimodal/processors/base_processor.py +1 -0
  152. sglang/srt/multimodal/processors/glm4v.py +1 -1
  153. sglang/srt/multimodal/processors/{vila.py → nvila.py} +32 -24
  154. sglang/srt/multimodal/processors/points_v15_chat.py +2 -2
  155. sglang/srt/multiplex/multiplexing_mixin.py +209 -0
  156. sglang/srt/multiplex/pdmux_context.py +164 -0
  157. sglang/srt/parser/conversation.py +7 -1
  158. sglang/srt/parser/reasoning_parser.py +28 -1
  159. sglang/srt/sampling/custom_logit_processor.py +67 -1
  160. sglang/srt/sampling/penaltylib/frequency_penalty.py +6 -8
  161. sglang/srt/sampling/penaltylib/min_new_tokens.py +7 -8
  162. sglang/srt/sampling/penaltylib/orchestrator.py +43 -3
  163. sglang/srt/sampling/penaltylib/presence_penalty.py +6 -8
  164. sglang/srt/server_args.py +459 -199
  165. sglang/srt/single_batch_overlap.py +2 -4
  166. sglang/srt/speculative/draft_utils.py +16 -0
  167. sglang/srt/speculative/eagle_info.py +42 -36
  168. sglang/srt/speculative/eagle_info_v2.py +68 -25
  169. sglang/srt/speculative/eagle_utils.py +261 -16
  170. sglang/srt/speculative/eagle_worker.py +11 -3
  171. sglang/srt/speculative/eagle_worker_v2.py +15 -9
  172. sglang/srt/speculative/spec_info.py +305 -31
  173. sglang/srt/speculative/spec_utils.py +44 -8
  174. sglang/srt/tracing/trace.py +121 -12
  175. sglang/srt/utils/common.py +142 -74
  176. sglang/srt/utils/hf_transformers_utils.py +38 -12
  177. sglang/srt/utils/torch_memory_saver_adapter.py +20 -0
  178. sglang/test/kits/radix_cache_server_kit.py +50 -0
  179. sglang/test/runners.py +31 -7
  180. sglang/test/simple_eval_common.py +5 -3
  181. sglang/test/simple_eval_humaneval.py +1 -0
  182. sglang/test/simple_eval_math.py +1 -0
  183. sglang/test/simple_eval_mmlu.py +1 -0
  184. sglang/test/simple_eval_mmmu_vlm.py +1 -0
  185. sglang/test/test_deterministic.py +235 -12
  186. sglang/test/test_deterministic_utils.py +2 -1
  187. sglang/test/test_utils.py +7 -1
  188. sglang/version.py +1 -1
  189. {sglang-0.5.4.dist-info → sglang-0.5.4.post2.dist-info}/METADATA +15 -28
  190. {sglang-0.5.4.dist-info → sglang-0.5.4.post2.dist-info}/RECORD +194 -175
  191. sglang/srt/models/vila.py +0 -306
  192. /sglang/test/{kit_matched_stop.py → kits/matched_stop_kit.py} +0 -0
  193. {sglang-0.5.4.dist-info → sglang-0.5.4.post2.dist-info}/WHEEL +0 -0
  194. {sglang-0.5.4.dist-info → sglang-0.5.4.post2.dist-info}/licenses/LICENSE +0 -0
  195. {sglang-0.5.4.dist-info → sglang-0.5.4.post2.dist-info}/top_level.txt +0 -0
sglang/srt/managers/scheduler_output_processor_mixin.py

@@ -14,7 +14,13 @@ from sglang.srt.managers.io_struct import (
     BatchEmbeddingOutput,
     BatchTokenIDOutput,
 )
-from sglang.srt.managers.schedule_batch import BaseFinishReason, Req, ScheduleBatch
+from sglang.srt.managers.schedule_batch import (
+    BaseFinishReason,
+    Req,
+    RequestStage,
+    ScheduleBatch,
+)
+from sglang.srt.tracing.trace import trace_slice
 from sglang.srt.utils.common import ceil_div
 
 if TYPE_CHECKING:
@@ -160,6 +166,14 @@ class SchedulerOutputProcessorMixin:
                         )
                         self.abort_request(AbortReq(rid=req.rid))
                     req.grammar.finished = req.finished()
+
+                trace_slice(
+                    RequestStage.PREFILL_FORWARD,
+                    req.rid,
+                    auto_next_anon=not req.finished(),
+                    thread_finish_flag=req.finished(),
+                )
+
             else:
                 # being chunked reqs' prefill is not finished
                 req.is_chunked -= 1
@@ -188,6 +202,12 @@ class SchedulerOutputProcessorMixin:
                         )
                         logprob_pt += num_input_logprobs
 
+                    trace_slice(
+                        RequestStage.PREFILL_CHUNKED_FORWARD,
+                        req.rid,
+                        auto_next_anon=True,
+                    )
+
         else:  # embedding or reward model
             is_sparse = envs.SGLANG_EMBEDDINGS_SPARSE_HEAD.is_set()
 
@@ -203,7 +223,10 @@ class SchedulerOutputProcessorMixin:
                             i
                         ].item()
             else:
-                embeddings = embeddings.tolist()
+                if isinstance(embeddings, torch.Tensor):
+                    embeddings = embeddings.tolist()
+                else:
+                    embeddings = [tensor.tolist() for tensor in embeddings]
 
             # Check finish conditions
             for i, req in enumerate(batch.reqs):
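
Note: the embedding change above makes the output path accept either a single stacked tensor or a list of per-request tensors (whose lengths may differ, e.g. with sparse embedding heads). A minimal standalone sketch of the same normalization, using plain PyTorch (the function name is illustrative, not sglang's):

    import torch

    def to_embedding_lists(embeddings):
        """Normalize model output to one plain Python list per request."""
        if isinstance(embeddings, torch.Tensor):
            # One stacked 2-D tensor: tolist() yields one nested list per row.
            return embeddings.tolist()
        # A list of 1-D tensors (possibly ragged): convert each individually.
        return [tensor.tolist() for tensor in embeddings]

    print(to_embedding_lists(torch.zeros(2, 4)))                 # stacked batch
    print(to_embedding_lists([torch.zeros(3), torch.zeros(5)]))  # ragged batch
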
@@ -224,6 +247,13 @@ class SchedulerOutputProcessorMixin:
                 # being chunked reqs' prefill is not finished
                 req.is_chunked -= 1
 
+            trace_slice(
+                RequestStage.PREFILL_FORWARD,
+                req.rid,
+                auto_next_anon=not req.finished(),
+                thread_finish_flag=req.finished(),
+            )
+
         self.stream_output(batch.reqs, batch.return_logprob, skip_stream_req)
 
     def _resolve_spec_overlap_token_ids(
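
Note: the trace_slice calls added in the hunks above emit one named tracing slice per request each time it passes a prefill stage; judging from the arguments, auto_next_anon chains an anonymous follow-up slice for unfinished requests and thread_finish_flag closes the request's trace. As a hedged, self-contained illustration of the slice-per-stage idea (a toy recorder, not sglang's tracing API):

    import time
    from contextlib import contextmanager

    TRACE_LOG = []  # (rid, stage, duration_s) records

    @contextmanager
    def stage_slice(stage: str, rid: str):
        start = time.perf_counter()
        try:
            yield
        finally:
            # One record per (request, stage) pass, mirroring the idea of
            # emitting a slice whenever a request crosses a pipeline stage.
            TRACE_LOG.append((rid, stage, time.perf_counter() - start))

    with stage_slice("PREFILL_FORWARD", "req-1"):
        time.sleep(0.01)  # stand-in for the forward pass
    print(TRACE_LOG)
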
@@ -727,6 +757,7 @@ class SchedulerOutputProcessorMixin:
             cached_tokens = []
             spec_verify_ct = []
             spec_accepted_tokens = []
+            retraction_counts = []
             output_hidden_states = None
 
             if return_logprob:
@@ -758,7 +789,7 @@ class SchedulerOutputProcessorMixin:
                     continue
 
                 # Multimodal partial stream chunks break the detokenizer, so drop aborted requests here.
-                if self.model_config.is_multimodal_gen and req.to_abort:
+                if self.model_config.is_multimodal_gen and req.to_finish:
                     continue
 
                 if req.finished():
@@ -828,6 +859,8 @@ class SchedulerOutputProcessorMixin:
                 completion_tokens.append(len(output_ids_))
                 cached_tokens.append(req.cached_tokens)
 
+                retraction_counts.append(req.retraction_count)
+
                 if not self.spec_algorithm.is_none():
                     spec_verify_ct.append(req.spec_verify_ct)
                     spec_accepted_tokens.append(req.spec_accepted_tokens)
@@ -950,6 +983,7 @@ class SchedulerOutputProcessorMixin:
                     http_worker_ipcs=http_worker_ipcs,
                     placeholder_tokens_idx=None,
                     placeholder_tokens_val=None,
+                    retraction_counts=retraction_counts,
                 )
             )
 
@@ -961,6 +995,7 @@ class SchedulerOutputProcessorMixin:
             embeddings = []
             prompt_tokens = []
             cached_tokens = []
+            retraction_counts = []
             for req in reqs:
                 if req.finished():
                     rids.append(req.rid)
@@ -969,6 +1004,7 @@ class SchedulerOutputProcessorMixin:
                     embeddings.append(req.embedding)
                     prompt_tokens.append(len(req.origin_input_ids))
                     cached_tokens.append(req.cached_tokens)
+                    retraction_counts.append(req.retraction_count)
             self.send_to_detokenizer.send_output(
                 BatchEmbeddingOutput(
                     finished_reasons,
@@ -979,5 +1015,6 @@ class SchedulerOutputProcessorMixin:
                     http_worker_ipcs=http_worker_ipcs,
                     placeholder_tokens_idx=None,
                     placeholder_tokens_val=None,
+                    retraction_counts=retraction_counts,
                 )
             )
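
Note: the retraction_counts hunks follow a standard plumbing pattern: a new per-request counter (how many times a request was retracted, presumably after preemption) is accumulated alongside the existing parallel lists and attached to both BatchTokenIDOutput and BatchEmbeddingOutput. A generic sketch of the pattern (names are illustrative, not sglang's):

    from dataclasses import dataclass, field
    from typing import List

    @dataclass
    class BatchOutputSketch:
        # Parallel per-request lists; index i describes request i.
        rids: List[str] = field(default_factory=list)
        cached_tokens: List[int] = field(default_factory=list)
        retraction_counts: List[int] = field(default_factory=list)  # new field

    def collect(finished_reqs) -> BatchOutputSketch:
        out = BatchOutputSketch()
        for req in finished_reqs:
            # Append to every list in the same iteration so the lists stay
            # equal-length and consumers can safely zip() them.
            out.rids.append(req.rid)
            out.cached_tokens.append(req.cached_tokens)
            out.retraction_counts.append(req.retraction_count)
        return out
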
sglang/srt/managers/scheduler_pp_mixin.py

@@ -4,7 +4,7 @@ from sglang.srt.layers.logits_processor import LogitsProcessorOutput
 from sglang.srt.managers.schedule_batch import ScheduleBatch
 from sglang.srt.managers.utils import GenerationBatchResult
 from sglang.srt.model_executor.forward_batch_info import PPProxyTensors
-from sglang.srt.utils import DynamicGradMode, point_to_point_pyobj
+from sglang.srt.utils import DynamicGradMode, point_to_point_pyobj, require_mlp_sync
 
 
 class SchedulerPPMixin:
@@ -236,7 +236,12 @@ class SchedulerPPMixin:
                     tmbs[mb_id] = transferred_rids
 
                 self.process_prefill_chunk()
-                mbs[mb_id] = self.get_new_batch_prefill()
+
+                batch = self.get_new_batch_prefill()
+                if require_mlp_sync(self.server_args):
+                    batch = self.prepare_mlp_sync_batch(batch)
+                mbs[mb_id] = batch
+
                 self.running_mbs[mb_id] = self.running_batch
 
             self.cur_batch = mbs[mb_id]
sglang/srt/managers/scheduler_profiler_mixin.py

@@ -28,7 +28,7 @@ logger = logging.getLogger(__name__)
 class SchedulerProfilerMixin:
     def init_profiler(self):
         self.torch_profiler = None
-        self.torch_profiler_output_dir: Optional[str] = None
+        self.torch_profiler_output_dir: Optional[Path] = None
         self.profiler_activities: Optional[List[str]] = None
         self.profile_id: Optional[str] = None
         self.profiler_start_forward_ct: Optional[int] = None
@@ -69,7 +69,7 @@ class SchedulerProfilerMixin:
         if activities is None:
             activities = ["CPU", "GPU"]
 
-        self.torch_profiler_output_dir = output_dir
+        self.torch_profiler_output_dir = Path(output_dir).expanduser()
         self.torch_profiler_with_stack = with_stack
         self.torch_profiler_record_shapes = record_shapes
         self.profiler_activities = activities
@@ -213,8 +213,7 @@ class SchedulerProfilerMixin:
                 message="Profiling is not in progress. Call /start_profile first.",
             )
 
-        if not Path(self.torch_profiler_output_dir).exists():
-            Path(self.torch_profiler_output_dir).mkdir(parents=True, exist_ok=True)
+        self.torch_profiler_output_dir.mkdir(parents=True, exist_ok=True)
 
         stage_suffix = f"-{stage.name}" if stage else ""
         logger.info("Stop profiling" + stage_suffix + "...")
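
Note: the profiler now normalizes the output directory to a pathlib.Path once (expanding "~") and relies on Path.mkdir(parents=True, exist_ok=True) being idempotent, which makes the old exists() pre-check redundant. This is plain standard-library behavior:

    from pathlib import Path

    output_dir = Path("~/sglang_profiles").expanduser()  # "~" -> the user's home
    # parents=True creates missing ancestors; exist_ok=True makes repeated
    # calls no-ops, so no exists() check is needed first.
    output_dir.mkdir(parents=True, exist_ok=True)
    output_dir.mkdir(parents=True, exist_ok=True)  # safe to call again
    print(output_dir / "profile.trace.json.gz")    # illustrative file name
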
sglang/srt/managers/scheduler_runtime_checker_mixin.py

@@ -1,5 +1,8 @@
 from __future__ import annotations
 
+import logging
+import signal
+import sys
 import time
 from typing import TYPE_CHECKING
 
@@ -7,10 +10,13 @@ from sglang.srt.disaggregation.utils import DisaggregationMode
 from sglang.srt.managers.schedule_batch import ScheduleBatch
 from sglang.srt.mem_cache.mamba_radix_cache import MambaRadixCache
 from sglang.srt.mem_cache.swa_radix_cache import SWARadixCache
+from sglang.srt.utils.common import disable_request_logging, pyspy_dump_schedulers
 
 if TYPE_CHECKING:
     from sglang.srt.managers.scheduler import Scheduler
 
+logger = logging.getLogger(__name__)
+
 
 class SchedulerRuntimeCheckerMixin:
 
@@ -215,3 +221,42 @@ class SchedulerRuntimeCheckerMixin:
         self.check_tree_cache()
         self.new_token_ratio = self.init_new_token_ratio
         self.maybe_sleep_on_idle()
+
+    def watchdog_thread(self: Scheduler):
+        """A watch dog thread that will try to kill the server itself if one forward batch takes too long."""
+        self.watchdog_last_forward_ct = 0
+        self.watchdog_last_time = time.perf_counter()
+
+        while True:
+            current = time.perf_counter()
+            if self.cur_batch is not None:
+                if self.watchdog_last_forward_ct == self.forward_ct:
+                    if current > self.watchdog_last_time + self.watchdog_timeout:
+                        break
+                else:
+                    self.watchdog_last_forward_ct = self.forward_ct
+                    self.watchdog_last_time = current
+            time.sleep(self.watchdog_timeout // 2)
+
+        if not disable_request_logging():
+            # Print batch size and memory pool info to check whether there are de-sync issues.
+            if self.is_hybrid:
+                _, info_msg = self._check_hybrid_memory()
+            elif self.is_hybrid_gdn and isinstance(self.tree_cache, MambaRadixCache):
+                _, info_msg = self._check_mamba_memory()
+            else:
+                _, info_msg = self._check_radix_cache_memory()
+            logger.error(
+                f"{self.cur_batch.batch_size()=}\n"
+                f"{self.cur_batch.reqs=}\n"
+                f"{info_msg}"
+            )
+
+        pyspy_dump_schedulers()
+        logger.error(f"Watchdog timeout ({self.watchdog_timeout=})")
+        print(file=sys.stderr, flush=True)
+        print(file=sys.stdout, flush=True)
+
+        # Wait for some time so that the parent process can print the error.
+        time.sleep(5)
+        self.parent_process.send_signal(signal.SIGQUIT)
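
Note: the watchdog polls a forward counter; if a batch is in flight and the counter has not advanced within watchdog_timeout, it dumps diagnostics (py-spy stack dumps, batch and memory-pool state) and sends SIGQUIT to the parent process. A minimal self-contained sketch of the same heartbeat pattern, standard library only (class and field names are illustrative):

    import threading
    import time

    class WatchdogSketch:
        """Fire if `forward_ct` stalls while work is pending."""

        def __init__(self, timeout: float):
            self.timeout = timeout
            self.forward_ct = 0   # incremented by the worker on each step
            self.busy = False     # True while a batch is in flight

        def run(self):
            last_ct, last_time = 0, time.perf_counter()
            while True:
                now = time.perf_counter()
                if self.busy:
                    if last_ct == self.forward_ct:
                        if now > last_time + self.timeout:
                            break  # stalled: no progress within the timeout
                    else:
                        last_ct, last_time = self.forward_ct, now
                time.sleep(self.timeout / 2)
            # The real code dumps state and signals the parent process here.
            print("watchdog timeout: worker made no progress")

    dog = WatchdogSketch(timeout=0.1)
    dog.busy = True  # a "batch" is in flight but forward_ct never advances
    threading.Thread(target=dog.run, daemon=True).start()
    time.sleep(0.4)  # the watchdog fires during this window
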
sglang/srt/managers/scheduler_update_weights_mixin.py

@@ -5,7 +5,12 @@ from typing import TYPE_CHECKING, Tuple
 
 import torch
 
-from sglang.srt.constants import GPU_MEMORY_TYPE_KV_CACHE, GPU_MEMORY_TYPE_WEIGHTS
+from sglang.srt.constants import (
+    GPU_MEMORY_ALL_TYPES,
+    GPU_MEMORY_TYPE_CUDA_GRAPH,
+    GPU_MEMORY_TYPE_KV_CACHE,
+    GPU_MEMORY_TYPE_WEIGHTS,
+)
 from sglang.srt.managers.io_struct import (
     DestroyWeightsUpdateGroupReqInput,
     DestroyWeightsUpdateGroupReqOutput,
@@ -101,10 +106,14 @@ class SchedulerUpdateWeightsMixin:
     def release_memory_occupation(
         self: Scheduler, recv_req: ReleaseMemoryOccupationReqInput
     ):
+        assert (
+            self._is_no_request()
+        ), "release_memory_occupation should be called only when no ongoing request."
+
         tags = recv_req.tags
 
         if tags is None or len(tags) == 0:
-            tags = [GPU_MEMORY_TYPE_WEIGHTS, GPU_MEMORY_TYPE_KV_CACHE]
+            tags = GPU_MEMORY_ALL_TYPES
 
         for tag in tags:
             self.offload_tags.add(tag)
@@ -120,6 +129,9 @@ class SchedulerUpdateWeightsMixin:
             torch.distributed.barrier(self.tp_cpu_group)
             self.memory_saver_adapter.pause(GPU_MEMORY_TYPE_WEIGHTS)
 
+        if GPU_MEMORY_TYPE_CUDA_GRAPH in tags:
+            self.memory_saver_adapter.pause(GPU_MEMORY_TYPE_CUDA_GRAPH)
+
         return ReleaseMemoryOccupationReqOutput()
 
     def resume_memory_occupation(
@@ -128,11 +140,14 @@ class SchedulerUpdateWeightsMixin:
         tags = recv_req.tags
 
         if tags is None or len(tags) == 0:
-            tags = [GPU_MEMORY_TYPE_WEIGHTS, GPU_MEMORY_TYPE_KV_CACHE]
+            tags = GPU_MEMORY_ALL_TYPES
 
         for tag in tags:
             self.offload_tags.remove(tag)
 
+        if GPU_MEMORY_TYPE_CUDA_GRAPH in tags:
+            self.memory_saver_adapter.resume(GPU_MEMORY_TYPE_CUDA_GRAPH)
+
         if GPU_MEMORY_TYPE_WEIGHTS in tags:
             self.memory_saver_adapter.resume(GPU_MEMORY_TYPE_WEIGHTS)
             torch.distributed.barrier(self.tp_cpu_group)
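
Note: release/resume now default to GPU_MEMORY_ALL_TYPES when no tags are given and gain a CUDA-graph memory type, so callers (e.g. RL weight-sync loops) can stage offload and reload per pool. A hedged usage sketch of the offline engine API; the tags keyword is inferred from the request structs in this diff and the model path is a placeholder, so verify both against your installed version:

    import sglang as sgl
    from sglang.srt.constants import (
        GPU_MEMORY_TYPE_KV_CACHE,
        GPU_MEMORY_TYPE_WEIGHTS,
    )

    engine = sgl.Engine(model_path="Qwen/Qwen2.5-0.5B-Instruct")

    # Release selected pools; with tags omitted, 0.5.4.post2 now releases
    # GPU_MEMORY_ALL_TYPES (weights, KV cache, and CUDA graphs).
    engine.release_memory_occupation(
        tags=[GPU_MEMORY_TYPE_WEIGHTS, GPU_MEMORY_TYPE_KV_CACHE]
    )

    # ... update weights externally, then bring the pools back ...

    engine.resume_memory_occupation(
        tags=[GPU_MEMORY_TYPE_WEIGHTS, GPU_MEMORY_TYPE_KV_CACHE]
    )
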
sglang/srt/managers/session_controller.py

@@ -15,11 +15,11 @@ import uuid
 from typing import Dict, Optional
 
 from sglang.srt.managers.io_struct import TokenizedGenerateReqInput
-from sglang.srt.managers.schedule_batch import Req
+from sglang.srt.managers.schedule_batch import FINISH_ABORT, Req
 
 
 class SessionReqNode:
-    def __init__(self, req, parent=None, childs=None):
+    def __init__(self, req: Req, parent=None, childs=None):
         self.req = req
         self.parent = parent
         if parent is not None:
@@ -36,12 +36,12 @@ class SessionReqNode:
             req_node.clear(req_dict)
 
         if self.req.finished_reason is None:
-            self.req.to_abort = True
+            self.req.to_finish = FINISH_ABORT()
         del req_dict[self.req.rid]
 
     def abort(self):
         if self.req.finished_reason is None:
-            self.req.to_abort = True
+            self.req.to_finish = FINISH_ABORT()
 
     def __str__(self):
         return self._str_helper(self.req.rid)
@@ -137,13 +137,14 @@ class Session:
             origin_input_ids=input_ids,
             origin_input_ids_unpadded=input_ids_unpadded,
             sampling_params=req.sampling_params,
-            lora_path=req.lora_path,
+            lora_id=req.lora_id,
             session_id=self.session_id,
             custom_logit_processor=req.custom_logit_processor,
             stream=req.stream,
             return_logprob=req.return_logprob,
             top_logprobs_num=req.top_logprobs_num,
             token_ids_logprob=req.token_ids_logprob,
+            vocab_size=tokenizer.vocab_size,
         )
         if last_req is not None:
             new_req.multimodal_inputs = last_req.multimodal_inputs
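
Note: across these hunks the boolean req.to_abort flag is replaced by req.to_finish, which carries a finish-reason object (FINISH_ABORT()); an object records why a request ended, not merely that it should end. A generic sketch of this flag-to-reason refactor (class names are illustrative, not sglang's):

    from typing import Optional

    class FinishReasonSketch:
        """Stand-in base class for a finish-reason hierarchy."""

    class FinishAbortSketch(FinishReasonSketch):
        def __init__(self, message: str = "aborted"):
            self.message = message

    class RequestSketch:
        def __init__(self) -> None:
            # Before: self.to_abort: bool = False  (says only *that* it ended).
            # After: an optional reason object also says *why*.
            self.to_finish: Optional[FinishReasonSketch] = None

    req = RequestSketch()
    req.to_finish = FinishAbortSketch("session cleared")
    if isinstance(req.to_finish, FinishAbortSketch):
        print(req.to_finish.message)
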