sglang 0.5.3rc0__py3-none-any.whl → 0.5.3rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (282)
  1. sglang/bench_one_batch.py +7 -9
  2. sglang/bench_one_batch_server.py +321 -31
  3. sglang/bench_serving.py +10 -3
  4. sglang/global_config.py +2 -2
  5. sglang/lang/backend/runtime_endpoint.py +1 -1
  6. sglang/launch_server.py +14 -0
  7. sglang/profiler.py +2 -2
  8. sglang/srt/batch_invariant_ops/__init__.py +27 -0
  9. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +549 -0
  10. sglang/srt/configs/__init__.py +4 -0
  11. sglang/srt/configs/dots_ocr.py +64 -0
  12. sglang/srt/configs/falcon_h1.py +360 -0
  13. sglang/srt/configs/load_config.py +8 -0
  14. sglang/srt/configs/model_config.py +160 -105
  15. sglang/srt/configs/qwen3_vl.py +586 -0
  16. sglang/srt/constrained/base_grammar_backend.py +1 -0
  17. sglang/srt/constrained/outlines_jump_forward.py +1 -1
  18. sglang/srt/constrained/xgrammar_backend.py +6 -4
  19. sglang/srt/debug_utils/dumper.py +10 -3
  20. sglang/srt/disaggregation/ascend/conn.py +2 -2
  21. sglang/srt/disaggregation/ascend/transfer_engine.py +47 -9
  22. sglang/srt/disaggregation/common/conn.py +266 -98
  23. sglang/srt/disaggregation/decode.py +50 -9
  24. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
  25. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +25 -16
  26. sglang/srt/disaggregation/mooncake/conn.py +51 -541
  27. sglang/srt/disaggregation/nixl/conn.py +148 -39
  28. sglang/srt/disaggregation/prefill.py +31 -14
  29. sglang/srt/disaggregation/utils.py +36 -5
  30. sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
  31. sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
  32. sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
  33. sglang/srt/distributed/parallel_state.py +135 -80
  34. sglang/srt/entrypoints/engine.py +23 -3
  35. sglang/srt/entrypoints/grpc_request_manager.py +330 -55
  36. sglang/srt/entrypoints/grpc_server.py +232 -102
  37. sglang/srt/entrypoints/http_server.py +49 -9
  38. sglang/srt/entrypoints/openai/protocol.py +110 -5
  39. sglang/srt/entrypoints/openai/serving_base.py +25 -6
  40. sglang/srt/entrypoints/openai/serving_chat.py +178 -49
  41. sglang/srt/entrypoints/openai/serving_completions.py +5 -3
  42. sglang/srt/entrypoints/openai/serving_embedding.py +1 -0
  43. sglang/srt/entrypoints/openai/serving_responses.py +42 -0
  44. sglang/srt/environ.py +285 -0
  45. sglang/srt/eplb/expert_location.py +30 -5
  46. sglang/srt/function_call/function_call_parser.py +3 -2
  47. sglang/srt/function_call/glm4_moe_detector.py +3 -3
  48. sglang/srt/function_call/gpt_oss_detector.py +23 -0
  49. sglang/srt/function_call/json_array_parser.py +63 -0
  50. sglang/srt/function_call/kimik2_detector.py +17 -4
  51. sglang/srt/function_call/utils.py +96 -5
  52. sglang/srt/grpc/compile_proto.py +245 -0
  53. sglang/srt/grpc/sglang_scheduler_pb2.py +73 -68
  54. sglang/srt/grpc/sglang_scheduler_pb2.pyi +60 -53
  55. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +3 -0
  56. sglang/srt/layers/activation.py +7 -6
  57. sglang/srt/layers/attention/aiter_backend.py +14 -15
  58. sglang/srt/layers/attention/ascend_backend.py +108 -9
  59. sglang/srt/layers/attention/attention_registry.py +206 -0
  60. sglang/srt/layers/attention/base_attn_backend.py +12 -3
  61. sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
  62. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
  63. sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +2 -2
  64. sglang/srt/layers/attention/fla/fused_recurrent.py +4 -4
  65. sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +2 -2
  66. sglang/srt/layers/attention/flashattention_backend.py +41 -8
  67. sglang/srt/layers/attention/flashinfer_backend.py +112 -194
  68. sglang/srt/layers/attention/flashinfer_mla_backend.py +11 -15
  69. sglang/srt/layers/attention/flashmla_backend.py +7 -5
  70. sglang/srt/layers/attention/hybrid_attn_backend.py +11 -3
  71. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +72 -72
  72. sglang/srt/layers/attention/mamba/causal_conv1d.py +1 -0
  73. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +15 -98
  74. sglang/srt/layers/attention/mamba/mamba.py +566 -1
  75. sglang/srt/layers/attention/mamba/mamba_utils.py +81 -0
  76. sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
  77. sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
  78. sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
  79. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +264 -0
  80. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +622 -0
  81. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +757 -0
  82. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +262 -0
  83. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +275 -0
  84. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
  85. sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
  86. sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
  87. sglang/srt/layers/attention/nsa/nsa_indexer.py +761 -0
  88. sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
  89. sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
  90. sglang/srt/layers/attention/nsa/transform_index.py +144 -0
  91. sglang/srt/layers/attention/nsa/utils.py +24 -0
  92. sglang/srt/layers/attention/nsa_backend.py +887 -0
  93. sglang/srt/layers/attention/tbo_backend.py +6 -6
  94. sglang/srt/layers/attention/torch_flex_backend.py +325 -0
  95. sglang/srt/layers/attention/triton_backend.py +42 -9
  96. sglang/srt/layers/attention/trtllm_mha_backend.py +5 -7
  97. sglang/srt/layers/attention/trtllm_mla_backend.py +178 -34
  98. sglang/srt/layers/attention/vision.py +58 -0
  99. sglang/srt/layers/attention/wave_backend.py +4 -4
  100. sglang/srt/layers/communicator.py +8 -0
  101. sglang/srt/layers/dp_attention.py +11 -1
  102. sglang/srt/layers/elementwise.py +3 -1
  103. sglang/srt/layers/layernorm.py +2 -0
  104. sglang/srt/layers/linear.py +21 -4
  105. sglang/srt/layers/logits_processor.py +15 -2
  106. sglang/srt/layers/moe/ep_moe/kernels.py +1 -1
  107. sglang/srt/layers/moe/ep_moe/layer.py +147 -74
  108. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +52 -25
  109. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  110. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  111. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
  112. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +6 -2
  113. sglang/srt/layers/moe/fused_moe_triton/layer.py +11 -12
  114. sglang/srt/layers/moe/token_dispatcher/deepep.py +77 -19
  115. sglang/srt/layers/moe/utils.py +10 -0
  116. sglang/srt/layers/parameter.py +23 -6
  117. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
  118. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
  119. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
  120. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -10
  121. sglang/srt/layers/quantization/fp8.py +2 -2
  122. sglang/srt/layers/quantization/fp8_utils.py +1 -1
  123. sglang/srt/layers/quantization/modelopt_quant.py +44 -9
  124. sglang/srt/layers/quantization/mxfp4.py +12 -4
  125. sglang/srt/layers/quantization/quark/quark_moe.py +16 -3
  126. sglang/srt/layers/quantization/w4afp8.py +0 -4
  127. sglang/srt/layers/quantization/w8a8_int8.py +15 -3
  128. sglang/srt/layers/rotary_embedding.py +78 -31
  129. sglang/srt/layers/sampler.py +52 -4
  130. sglang/srt/layers/utils.py +23 -0
  131. sglang/srt/lora/backend/base_backend.py +3 -3
  132. sglang/srt/lora/backend/chunked_backend.py +348 -0
  133. sglang/srt/lora/backend/triton_backend.py +10 -4
  134. sglang/srt/lora/lora.py +7 -5
  135. sglang/srt/lora/lora_manager.py +17 -6
  136. sglang/srt/lora/mem_pool.py +1 -1
  137. sglang/srt/lora/triton_ops/__init__.py +4 -0
  138. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
  139. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +174 -0
  140. sglang/srt/lora/utils.py +7 -5
  141. sglang/srt/managers/cache_controller.py +42 -142
  142. sglang/srt/managers/data_parallel_controller.py +11 -46
  143. sglang/srt/managers/detokenizer_manager.py +11 -11
  144. sglang/srt/managers/io_struct.py +162 -118
  145. sglang/srt/managers/mm_utils.py +43 -6
  146. sglang/srt/managers/multi_tokenizer_mixin.py +17 -17
  147. sglang/srt/managers/multimodal_processor.py +1 -2
  148. sglang/srt/managers/overlap_utils.py +53 -0
  149. sglang/srt/managers/schedule_batch.py +167 -86
  150. sglang/srt/managers/schedule_policy.py +143 -16
  151. sglang/srt/managers/scheduler.py +359 -214
  152. sglang/srt/managers/scheduler_input_blocker.py +1 -1
  153. sglang/srt/managers/scheduler_metrics_mixin.py +98 -126
  154. sglang/srt/managers/scheduler_output_processor_mixin.py +21 -12
  155. sglang/srt/managers/scheduler_profiler_mixin.py +5 -5
  156. sglang/srt/managers/scheduler_update_weights_mixin.py +7 -0
  157. sglang/srt/managers/tokenizer_communicator_mixin.py +111 -5
  158. sglang/srt/managers/tokenizer_manager.py +84 -136
  159. sglang/srt/managers/tp_worker.py +39 -29
  160. sglang/srt/managers/tp_worker_overlap_thread.py +33 -41
  161. sglang/srt/managers/utils.py +1 -45
  162. sglang/srt/mem_cache/allocator.py +14 -20
  163. sglang/srt/mem_cache/allocator_ascend.py +41 -27
  164. sglang/srt/mem_cache/base_prefix_cache.py +1 -1
  165. sglang/srt/mem_cache/chunk_cache.py +8 -1
  166. sglang/srt/mem_cache/evict_policy.py +23 -0
  167. sglang/srt/mem_cache/hicache_storage.py +40 -1
  168. sglang/srt/mem_cache/hiradix_cache.py +119 -32
  169. sglang/srt/mem_cache/memory_pool.py +188 -10
  170. sglang/srt/mem_cache/memory_pool_host.py +134 -182
  171. sglang/srt/mem_cache/radix_cache.py +222 -71
  172. sglang/srt/mem_cache/radix_cache_cpp.py +11 -8
  173. sglang/srt/mem_cache/storage/__init__.py +10 -0
  174. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +151 -0
  175. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +109 -0
  176. sglang/srt/mem_cache/storage/backend_factory.py +223 -0
  177. sglang/srt/mem_cache/storage/eic/eic_storage.py +778 -0
  178. sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
  179. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +173 -58
  180. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +10 -6
  181. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +117 -10
  182. sglang/srt/mem_cache/swa_radix_cache.py +25 -34
  183. sglang/srt/metrics/collector.py +82 -120
  184. sglang/srt/metrics/func_timer.py +2 -7
  185. sglang/srt/metrics/utils.py +8 -1
  186. sglang/srt/model_executor/cpu_graph_runner.py +2 -2
  187. sglang/srt/model_executor/cuda_graph_runner.py +39 -32
  188. sglang/srt/model_executor/forward_batch_info.py +23 -38
  189. sglang/srt/model_executor/model_runner.py +131 -183
  190. sglang/srt/model_executor/npu_graph_runner.py +12 -5
  191. sglang/srt/model_loader/loader.py +14 -10
  192. sglang/srt/model_loader/weight_utils.py +156 -2
  193. sglang/srt/models/bailing_moe.py +27 -4
  194. sglang/srt/models/deepseek_nextn.py +6 -1
  195. sglang/srt/models/deepseek_v2.py +536 -153
  196. sglang/srt/models/dots_ocr.py +173 -0
  197. sglang/srt/models/falcon_h1.py +576 -0
  198. sglang/srt/models/gemma3_causal.py +0 -2
  199. sglang/srt/models/gemma3_mm.py +1 -1
  200. sglang/srt/models/gemma3n_mm.py +1 -1
  201. sglang/srt/models/glm4_moe.py +3 -3
  202. sglang/srt/models/glm4_moe_nextn.py +2 -2
  203. sglang/srt/models/glm4v.py +1 -1
  204. sglang/srt/models/glm4v_moe.py +1 -1
  205. sglang/srt/models/gpt_oss.py +7 -30
  206. sglang/srt/models/kimi_vl_moonvit.py +2 -2
  207. sglang/srt/models/llama.py +4 -0
  208. sglang/srt/models/longcat_flash.py +1 -1
  209. sglang/srt/models/longcat_flash_nextn.py +1 -1
  210. sglang/srt/models/mllama4.py +15 -4
  211. sglang/srt/models/qwen2.py +0 -7
  212. sglang/srt/models/qwen2_5_vl.py +2 -2
  213. sglang/srt/models/qwen2_audio.py +1 -1
  214. sglang/srt/models/qwen2_moe.py +64 -1
  215. sglang/srt/models/qwen2_vl.py +1 -1
  216. sglang/srt/models/qwen3.py +18 -3
  217. sglang/srt/models/qwen3_moe.py +31 -3
  218. sglang/srt/models/qwen3_next.py +36 -9
  219. sglang/srt/models/qwen3_vl.py +787 -0
  220. sglang/srt/models/qwen3_vl_moe.py +471 -0
  221. sglang/srt/models/registry.py +15 -3
  222. sglang/srt/models/sarashina2_vision.py +269 -0
  223. sglang/srt/models/solar.py +505 -0
  224. sglang/srt/models/starcoder2.py +357 -0
  225. sglang/srt/models/torch_native_llama.py +9 -2
  226. sglang/srt/models/utils.py +51 -0
  227. sglang/srt/multimodal/processors/base_processor.py +15 -7
  228. sglang/srt/multimodal/processors/dots_vlm.py +2 -3
  229. sglang/srt/multimodal/processors/internvl.py +20 -8
  230. sglang/srt/multimodal/processors/qwen_vl.py +8 -1
  231. sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
  232. sglang/srt/parser/jinja_template_utils.py +6 -0
  233. sglang/srt/sampling/sampling_batch_info.py +20 -2
  234. sglang/srt/sampling/sampling_params.py +7 -0
  235. sglang/srt/server_args.py +753 -295
  236. sglang/srt/server_args_config_parser.py +146 -0
  237. sglang/srt/single_batch_overlap.py +151 -0
  238. sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
  239. sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
  240. sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
  241. sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
  242. sglang/srt/speculative/cpp_ngram/param.h +125 -0
  243. sglang/srt/speculative/cpp_ngram/queue.h +71 -0
  244. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +2 -1
  245. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +3 -1
  246. sglang/srt/speculative/{eagle_utils.py → eagle_info.py} +207 -755
  247. sglang/srt/speculative/eagle_worker.py +57 -25
  248. sglang/srt/speculative/ngram_utils.py +428 -0
  249. sglang/srt/speculative/ngram_worker.py +245 -0
  250. sglang/srt/speculative/spec_info.py +47 -0
  251. sglang/srt/speculative/spec_utils.py +606 -0
  252. sglang/srt/torch_memory_saver_adapter.py +5 -7
  253. sglang/srt/tracing/trace.py +32 -6
  254. sglang/srt/two_batch_overlap.py +8 -5
  255. sglang/srt/utils/__init__.py +2 -0
  256. sglang/srt/{utils.py → utils/common.py} +399 -74
  257. sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +49 -5
  258. sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
  259. sglang/srt/utils/rpd_utils.py +452 -0
  260. sglang/srt/utils/slow_rank_detector.py +71 -0
  261. sglang/srt/warmup.py +8 -4
  262. sglang/srt/weight_sync/utils.py +1 -1
  263. sglang/test/get_logits_ut.py +57 -0
  264. sglang/test/run_eval.py +79 -11
  265. sglang/test/runners.py +1 -1
  266. sglang/test/simple_eval_common.py +5 -2
  267. sglang/test/simple_eval_mmmu_vlm.py +441 -0
  268. sglang/test/test_block_fp8.py +2 -2
  269. sglang/test/test_deterministic.py +297 -0
  270. sglang/test/test_disaggregation_utils.py +12 -1
  271. sglang/test/test_programs.py +1 -1
  272. sglang/test/test_utils.py +355 -4
  273. sglang/utils.py +10 -1
  274. sglang/version.py +1 -1
  275. {sglang-0.5.3rc0.dist-info → sglang-0.5.3rc2.dist-info}/METADATA +34 -25
  276. {sglang-0.5.3rc0.dist-info → sglang-0.5.3rc2.dist-info}/RECORD +281 -210
  277. sglang/srt/mem_cache/lora_radix_cache.py +0 -421
  278. /sglang/srt/{remote_instance_weight_loader_utils.py → model_loader/remote_instance_weight_loader_utils.py} +0 -0
  279. /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
  280. {sglang-0.5.3rc0.dist-info → sglang-0.5.3rc2.dist-info}/WHEEL +0 -0
  281. {sglang-0.5.3rc0.dist-info → sglang-0.5.3rc2.dist-info}/licenses/LICENSE +0 -0
  282. {sglang-0.5.3rc0.dist-info → sglang-0.5.3rc2.dist-info}/top_level.txt +0 -0
@@ -5,6 +5,7 @@ import copy
5
5
  import logging
6
6
  import os
7
7
  import time
8
+ import uuid
8
9
  from collections import deque
9
10
  from typing import (
10
11
  TYPE_CHECKING,
@@ -24,8 +25,12 @@ import zmq
24
25
  from sglang.srt.managers.io_struct import (
25
26
  ClearHiCacheReqInput,
26
27
  ClearHiCacheReqOutput,
28
+ CloseSessionReqInput,
29
+ DestroyWeightsUpdateGroupReqInput,
30
+ DestroyWeightsUpdateGroupReqOutput,
27
31
  ExpertDistributionReq,
28
32
  ExpertDistributionReqOutput,
33
+ ExpertDistributionReqType,
29
34
  FlushCacheReqInput,
30
35
  FlushCacheReqOutput,
31
36
  GetInternalStateReq,
@@ -40,8 +45,9 @@ from sglang.srt.managers.io_struct import (
40
45
  InitWeightsUpdateGroupReqOutput,
41
46
  LoadLoRAAdapterReqInput,
42
47
  LoadLoRAAdapterReqOutput,
43
- LoRAUpdateResult,
48
+ LoRAUpdateOutput,
44
49
  MultiTokenizerWrapper,
50
+ OpenSessionReqInput,
45
51
  ProfileReq,
46
52
  ProfileReqOutput,
47
53
  ProfileReqType,
@@ -149,6 +155,9 @@ class TokenizerCommunicatorMixin:
149
155
  self.init_weights_update_group_communicator = _Communicator(
150
156
  self.send_to_scheduler, server_args.dp_size
151
157
  )
158
+ self.destroy_weights_update_group_communicator = _Communicator(
159
+ self.send_to_scheduler, server_args.dp_size
160
+ )
152
161
  self.update_weights_from_distributed_communicator = _Communicator(
153
162
  self.send_to_scheduler, server_args.dp_size
154
163
  )
@@ -207,6 +216,10 @@ class TokenizerCommunicatorMixin:
207
216
  InitWeightsUpdateGroupReqOutput,
208
217
  self.init_weights_update_group_communicator.handle_recv,
209
218
  ),
219
+ (
220
+ DestroyWeightsUpdateGroupReqOutput,
221
+ self.destroy_weights_update_group_communicator.handle_recv,
222
+ ),
210
223
  (
211
224
  UpdateWeightsFromDistributedReqOutput,
212
225
  self.update_weights_from_distributed_communicator.handle_recv,
@@ -264,7 +277,7 @@ class TokenizerCommunicatorMixin:
264
277
  self.expert_distribution_communicator.handle_recv,
265
278
  ),
266
279
  (
267
- LoRAUpdateResult,
280
+ LoRAUpdateOutput,
268
281
  self.update_lora_adapter_communicator.handle_recv,
269
282
  ),
270
283
  (
@@ -323,15 +336,18 @@ class TokenizerCommunicatorMixin:
323
336
 
324
337
  async def start_expert_distribution_record(self: TokenizerManager):
325
338
  self.auto_create_handle_loop()
326
- await self.expert_distribution_communicator(ExpertDistributionReq.START_RECORD)
339
+ req = ExpertDistributionReq(action=ExpertDistributionReqType.START_RECORD)
340
+ await self.expert_distribution_communicator(req)
327
341
 
328
342
  async def stop_expert_distribution_record(self: TokenizerManager):
329
343
  self.auto_create_handle_loop()
330
- await self.expert_distribution_communicator(ExpertDistributionReq.STOP_RECORD)
344
+ req = ExpertDistributionReq(action=ExpertDistributionReqType.STOP_RECORD)
345
+ await self.expert_distribution_communicator(req)
331
346
 
332
347
  async def dump_expert_distribution_record(self: TokenizerManager):
333
348
  self.auto_create_handle_loop()
334
- await self.expert_distribution_communicator(ExpertDistributionReq.DUMP_RECORD)
349
+ req = ExpertDistributionReq(action=ExpertDistributionReqType.DUMP_RECORD)
350
+ await self.expert_distribution_communicator(req)
335
351
 
336
352
  async def init_weights_update_group(
337
353
  self: TokenizerManager,
@@ -345,6 +361,18 @@ class TokenizerCommunicatorMixin:
345
361
  result = (await self.init_weights_update_group_communicator(obj))[0]
346
362
  return result.success, result.message
347
363
 
364
+ async def destroy_weights_update_group(
365
+ self,
366
+ obj: DestroyWeightsUpdateGroupReqInput,
367
+ request: Optional[fastapi.Request] = None,
368
+ ) -> Tuple[bool, str]:
369
+ self.auto_create_handle_loop()
370
+ assert (
371
+ self.server_args.dp_size == 1
372
+ ), "dp_size must be 1 for destroy parameter update group"
373
+ result = (await self.destroy_weights_update_group_communicator(obj))[0]
374
+ return result.success, result.message
375
+
348
376
  async def update_weights_from_distributed(
349
377
  self: TokenizerManager,
350
378
  obj: UpdateWeightsFromDistributedReqInput,
@@ -567,3 +595,81 @@ class TokenizerCommunicatorMixin:
567
595
  async def get_load(self: TokenizerManager) -> List[GetLoadReqOutput]:
568
596
  req = GetLoadReqInput()
569
597
  return await self.get_load_communicator(req)
598
+
599
+ async def open_session(
600
+ self, obj: OpenSessionReqInput, request: Optional[fastapi.Request] = None
601
+ ):
602
+ self.auto_create_handle_loop()
603
+
604
+ if obj.session_id is None:
605
+ obj.session_id = uuid.uuid4().hex
606
+ elif obj.session_id in self.session_futures:
607
+ return None
608
+
609
+ if self.server_args.tokenizer_worker_num > 1:
610
+ obj = MultiTokenizerWrapper(self.worker_id, obj)
611
+ self.send_to_scheduler.send_pyobj(obj)
612
+
613
+ self.session_futures[obj.session_id] = asyncio.Future()
614
+ session_id = await self.session_futures[obj.session_id]
615
+ del self.session_futures[obj.session_id]
616
+ return session_id
617
+
618
+ async def close_session(
619
+ self, obj: CloseSessionReqInput, request: Optional[fastapi.Request] = None
620
+ ):
621
+ await self.send_to_scheduler.send_pyobj(obj)
622
+
623
+ def get_log_request_metadata(self):
624
+ max_length = None
625
+ skip_names = None
626
+ out_skip_names = None
627
+ if self.log_requests:
628
+ if self.log_requests_level == 0:
629
+ max_length = 1 << 30
630
+ skip_names = set(
631
+ [
632
+ "text",
633
+ "input_ids",
634
+ "input_embeds",
635
+ "image_data",
636
+ "audio_data",
637
+ "lora_path",
638
+ "sampling_params",
639
+ ]
640
+ )
641
+ out_skip_names = set(
642
+ [
643
+ "text",
644
+ "output_ids",
645
+ "embedding",
646
+ ]
647
+ )
648
+ elif self.log_requests_level == 1:
649
+ max_length = 1 << 30
650
+ skip_names = set(
651
+ [
652
+ "text",
653
+ "input_ids",
654
+ "input_embeds",
655
+ "image_data",
656
+ "audio_data",
657
+ "lora_path",
658
+ ]
659
+ )
660
+ out_skip_names = set(
661
+ [
662
+ "text",
663
+ "output_ids",
664
+ "embedding",
665
+ ]
666
+ )
667
+ elif self.log_requests_level == 2:
668
+ max_length = 2048
669
+ elif self.log_requests_level == 3:
670
+ max_length = 1 << 30
671
+ else:
672
+ raise ValueError(
673
+ f"Invalid --log-requests-level: {self.log_requests_level=}"
674
+ )
675
+ return max_length, skip_names, out_skip_names
@@ -43,23 +43,17 @@ from fastapi import BackgroundTasks
43
43
  from sglang.srt.aio_rwlock import RWLock
44
44
  from sglang.srt.configs.model_config import ModelConfig
45
45
  from sglang.srt.disaggregation.utils import DisaggregationMode
46
- from sglang.srt.hf_transformers_utils import (
47
- get_processor,
48
- get_tokenizer,
49
- get_tokenizer_from_processor,
50
- )
51
- from sglang.srt.lora.lora_registry import LoRARef, LoRARegistry
46
+ from sglang.srt.lora.lora_registry import LoRARegistry
52
47
  from sglang.srt.managers.async_dynamic_batch_tokenizer import AsyncDynamicbatchTokenizer
53
48
  from sglang.srt.managers.disagg_service import start_disagg_service
54
49
  from sglang.srt.managers.io_struct import (
55
50
  AbortReq,
56
- BatchEmbeddingOut,
57
- BatchMultimodalOut,
58
- BatchStrOut,
59
- BatchTokenIDOut,
51
+ BatchEmbeddingOutput,
52
+ BatchMultimodalOutput,
53
+ BatchStrOutput,
54
+ BatchTokenIDOutput,
60
55
  BatchTokenizedEmbeddingReqInput,
61
56
  BatchTokenizedGenerateReqInput,
62
- CloseSessionReqInput,
63
57
  ConfigureLoggingReq,
64
58
  EmbeddingReqInput,
65
59
  FreezeGCReq,
@@ -67,7 +61,6 @@ from sglang.srt.managers.io_struct import (
67
61
  GetLoadReqInput,
68
62
  HealthCheckOutput,
69
63
  MultiTokenizerWrapper,
70
- OpenSessionReqInput,
71
64
  OpenSessionReqOutput,
72
65
  SessionParams,
73
66
  TokenizedEmbeddingReqInput,
@@ -84,6 +77,7 @@ from sglang.srt.managers.tokenizer_communicator_mixin import TokenizerCommunicat
84
77
  from sglang.srt.metrics.collector import TokenizerMetricsCollector
85
78
  from sglang.srt.sampling.sampling_params import SamplingParams
86
79
  from sglang.srt.server_args import PortArgs, ServerArgs
80
+ from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
87
81
  from sglang.srt.tracing.trace import (
88
82
  trace_get_proc_propagate_context,
89
83
  trace_req_finish,
@@ -100,6 +94,11 @@ from sglang.srt.utils import (
100
94
  get_zmq_socket,
101
95
  kill_process_tree,
102
96
  )
97
+ from sglang.srt.utils.hf_transformers_utils import (
98
+ get_processor,
99
+ get_tokenizer,
100
+ get_tokenizer_from_processor,
101
+ )
103
102
  from sglang.utils import TypeBasedDispatcher, get_exception_traceback
104
103
 
105
104
  asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
@@ -163,6 +162,7 @@ class TokenizerManager(TokenizerCommunicatorMixin):
163
162
  else None
164
163
  )
165
164
  self.crash_dump_folder = server_args.crash_dump_folder
165
+ self.enable_trace = server_args.enable_trace
166
166
 
167
167
  # Read model args
168
168
  self.model_path = server_args.model_path
@@ -174,8 +174,17 @@ class TokenizerManager(TokenizerCommunicatorMixin):
174
174
  self.image_token_id = self.model_config.image_token_id
175
175
  self.max_req_input_len = None # Will be set later in engine.py
176
176
 
177
+ speculative_algorithm = SpeculativeAlgorithm.from_string(
178
+ server_args.speculative_algorithm
179
+ )
180
+ self.reserve_input_token_num = (
181
+ 0
182
+ if speculative_algorithm.is_none()
183
+ else server_args.speculative_num_draft_tokens
184
+ )
185
+
177
186
  if self.model_config.is_multimodal:
178
- import_processors()
187
+ import_processors("sglang.srt.multimodal.processors")
179
188
  try:
180
189
  _processor = get_processor(
181
190
  server_args.tokenizer_path,
@@ -310,8 +319,8 @@ class TokenizerManager(TokenizerCommunicatorMixin):
310
319
  "model_name": self.server_args.served_model_name,
311
320
  # TODO: Add lora name/path in the future,
312
321
  }
313
- if server_args.tokenizer_metrics_allowed_customer_labels:
314
- for label in server_args.tokenizer_metrics_allowed_customer_labels:
322
+ if server_args.tokenizer_metrics_allowed_custom_labels:
323
+ for label in server_args.tokenizer_metrics_allowed_custom_labels:
315
324
  labels[label] = ""
316
325
  self.metrics_collector = TokenizerMetricsCollector(
317
326
  server_args=server_args,
@@ -330,10 +339,10 @@ class TokenizerManager(TokenizerCommunicatorMixin):
330
339
  [
331
340
  (
332
341
  (
333
- BatchStrOut,
334
- BatchEmbeddingOut,
335
- BatchTokenIDOut,
336
- BatchMultimodalOut,
342
+ BatchStrOutput,
343
+ BatchEmbeddingOutput,
344
+ BatchTokenIDOutput,
345
+ BatchMultimodalOutput,
337
346
  ),
338
347
  self._handle_batch_output,
339
348
  ),
@@ -371,23 +380,8 @@ class TokenizerManager(TokenizerCommunicatorMixin):
371
380
  # If it's a single value, add worker_id prefix
372
381
  obj.rid = f"{self.worker_id}_{obj.rid}"
373
382
 
374
- if obj.is_single:
375
- bootstrap_room = (
376
- obj.bootstrap_room if hasattr(obj, "bootstrap_room") else None
377
- )
378
- trace_req_start(obj.rid, bootstrap_room, ts=int(created_time * 1e9))
379
- trace_slice_start("", obj.rid, ts=int(created_time * 1e9), anonymous=True)
380
- else:
381
- for i in range(len(obj.rid)):
382
- bootstrap_room = (
383
- obj.bootstrap_room[i]
384
- if hasattr(obj, "bootstrap_room") and obj.bootstrap_room
385
- else None
386
- )
387
- trace_req_start(obj.rid[i], bootstrap_room, ts=int(created_time * 1e9))
388
- trace_slice_start(
389
- "", obj.rid[i], ts=int(created_time * 1e9), anonymous=True
390
- )
383
+ if self.enable_trace:
384
+ self._trace_request_start(obj, created_time)
391
385
 
392
386
  if self.log_requests:
393
387
  max_length, skip_names, _ = self.log_request_metadata
@@ -618,6 +612,7 @@ class TokenizerManager(TokenizerCommunicatorMixin):
618
612
  _max_req_len = self.context_len
619
613
 
620
614
  input_token_num = len(input_ids) if input_ids is not None else 0
615
+ input_token_num += self.reserve_input_token_num
621
616
  if input_token_num >= self.context_len:
622
617
  if self.server_args.allow_auto_truncate:
623
618
  logger.warning(
@@ -719,7 +714,6 @@ class TokenizerManager(TokenizerCommunicatorMixin):
719
714
  )
720
715
 
721
716
  tokenized_obj = TokenizedGenerateReqInput(
722
- obj.rid,
723
717
  input_text,
724
718
  input_ids,
725
719
  mm_inputs,
@@ -729,6 +723,7 @@ class TokenizerManager(TokenizerCommunicatorMixin):
729
723
  obj.top_logprobs_num,
730
724
  obj.token_ids_logprob,
731
725
  obj.stream,
726
+ rid=obj.rid,
732
727
  bootstrap_host=obj.bootstrap_host,
733
728
  bootstrap_port=obj.bootstrap_port,
734
729
  bootstrap_room=obj.bootstrap_room,
@@ -738,15 +733,18 @@ class TokenizerManager(TokenizerCommunicatorMixin):
738
733
  custom_logit_processor=obj.custom_logit_processor,
739
734
  return_hidden_states=obj.return_hidden_states,
740
735
  data_parallel_rank=obj.data_parallel_rank,
736
+ priority=obj.priority,
737
+ extra_key=obj.extra_key,
741
738
  )
742
739
  elif isinstance(obj, EmbeddingReqInput):
743
740
  tokenized_obj = TokenizedEmbeddingReqInput(
744
- obj.rid,
745
741
  input_text,
746
742
  input_ids,
747
743
  mm_inputs,
748
744
  token_type_ids,
749
745
  sampling_params,
746
+ rid=obj.rid,
747
+ priority=obj.priority,
750
748
  )
751
749
 
752
750
  return tokenized_obj
@@ -1038,10 +1036,13 @@ class TokenizerManager(TokenizerCommunicatorMixin):
1038
1036
  def abort_request(self, rid: str = "", abort_all: bool = False):
1039
1037
  if not abort_all and rid not in self.rid_to_state:
1040
1038
  return
1041
- req = AbortReq(rid, abort_all)
1039
+ req = AbortReq(rid=rid, abort_all=abort_all)
1042
1040
  self.send_to_scheduler.send_pyobj(req)
1043
1041
  if self.enable_metrics:
1044
- self.metrics_collector.observe_one_aborted_request()
1042
+ # TODO: also use custom_labels from the request
1043
+ self.metrics_collector.observe_one_aborted_request(
1044
+ self.metrics_collector.labels
1045
+ )
1045
1046
 
1046
1047
  async def pause_generation(self):
1047
1048
  async with self.is_pause_cond:
@@ -1103,84 +1104,6 @@ class TokenizerManager(TokenizerCommunicatorMixin):
1103
1104
  all_paused_requests = [r.num_paused_requests for r in result]
1104
1105
  return all_success, all_message, all_paused_requests
1105
1106
 
1106
- async def open_session(
1107
- self, obj: OpenSessionReqInput, request: Optional[fastapi.Request] = None
1108
- ):
1109
- self.auto_create_handle_loop()
1110
-
1111
- if obj.session_id is None:
1112
- obj.session_id = uuid.uuid4().hex
1113
- elif obj.session_id in self.session_futures:
1114
- return None
1115
-
1116
- if self.server_args.tokenizer_worker_num > 1:
1117
- obj = MultiTokenizerWrapper(self.worker_id, obj)
1118
- self.send_to_scheduler.send_pyobj(obj)
1119
-
1120
- self.session_futures[obj.session_id] = asyncio.Future()
1121
- session_id = await self.session_futures[obj.session_id]
1122
- del self.session_futures[obj.session_id]
1123
- return session_id
1124
-
1125
- async def close_session(
1126
- self, obj: CloseSessionReqInput, request: Optional[fastapi.Request] = None
1127
- ):
1128
- await self.send_to_scheduler.send_pyobj(obj)
1129
-
1130
- def get_log_request_metadata(self):
1131
- max_length = None
1132
- skip_names = None
1133
- out_skip_names = None
1134
- if self.log_requests:
1135
- if self.log_requests_level == 0:
1136
- max_length = 1 << 30
1137
- skip_names = set(
1138
- [
1139
- "text",
1140
- "input_ids",
1141
- "input_embeds",
1142
- "image_data",
1143
- "audio_data",
1144
- "lora_path",
1145
- "sampling_params",
1146
- ]
1147
- )
1148
- out_skip_names = set(
1149
- [
1150
- "text",
1151
- "output_ids",
1152
- "embedding",
1153
- ]
1154
- )
1155
- elif self.log_requests_level == 1:
1156
- max_length = 1 << 30
1157
- skip_names = set(
1158
- [
1159
- "text",
1160
- "input_ids",
1161
- "input_embeds",
1162
- "image_data",
1163
- "audio_data",
1164
- "lora_path",
1165
- ]
1166
- )
1167
- out_skip_names = set(
1168
- [
1169
- "text",
1170
- "output_ids",
1171
- "embedding",
1172
- ]
1173
- )
1174
- elif self.log_requests_level == 2:
1175
- max_length = 2048
1176
- elif self.log_requests_level == 3:
1177
- max_length = 1 << 30
1178
- else:
1179
- raise ValueError(
1180
- f"Invalid --log-requests-level: {self.log_requests_level=}"
1181
- )
1182
- return max_length, skip_names, out_skip_names
1183
-
1184
1107
  def configure_logging(self, obj: ConfigureLoggingReq):
1185
1108
  if obj.log_requests is not None:
1186
1109
  self.log_requests = obj.log_requests
@@ -1339,12 +1262,12 @@ class TokenizerManager(TokenizerCommunicatorMixin):
1339
1262
  # Drain requests
1340
1263
  while True:
1341
1264
  remain_num_req = len(self.rid_to_state)
1265
+ remaining_rids = list(self.rid_to_state.keys())
1342
1266
 
1343
1267
  if self.server_status == ServerStatus.UnHealthy:
1344
1268
  # if health check failed, we should exit immediately
1345
1269
  logger.error(
1346
- "Signal SIGTERM received while health check failed. Exiting... remaining number of requests: %d",
1347
- remain_num_req,
1270
+ "Signal SIGTERM received while health check failed. Force exiting."
1348
1271
  )
1349
1272
  self.dump_requests_before_crash()
1350
1273
  break
@@ -1352,13 +1275,12 @@ class TokenizerManager(TokenizerCommunicatorMixin):
1352
1275
  elif get_bool_env_var("SGL_FORCE_SHUTDOWN"):
1353
1276
  # if force shutdown flag set, exit immediately
1354
1277
  logger.error(
1355
- "Signal SIGTERM received while force shutdown flag set. Force exiting... remaining number of requests: %d",
1356
- remain_num_req,
1278
+ "Signal SIGTERM received while force shutdown flag set. Force exiting."
1357
1279
  )
1358
1280
  break
1359
1281
 
1360
1282
  logger.info(
1361
- f"Gracefully exiting... remaining number of requests {remain_num_req}"
1283
+ f"Gracefully exiting... Remaining number of requests {remain_num_req}. Remaining requests {remaining_rids=}."
1362
1284
  )
1363
1285
  if remain_num_req > 0:
1364
1286
  await asyncio.sleep(5)
@@ -1379,7 +1301,10 @@ class TokenizerManager(TokenizerCommunicatorMixin):
1379
1301
  def _handle_batch_output(
1380
1302
  self,
1381
1303
  recv_obj: Union[
1382
- BatchStrOut, BatchEmbeddingOut, BatchMultimodalOut, BatchTokenIDOut
1304
+ BatchStrOutput,
1305
+ BatchEmbeddingOutput,
1306
+ BatchMultimodalOutput,
1307
+ BatchTokenIDOutput,
1383
1308
  ],
1384
1309
  ):
1385
1310
  for i, rid in enumerate(recv_obj.rids):
@@ -1413,7 +1338,7 @@ class TokenizerManager(TokenizerCommunicatorMixin):
1413
1338
  i,
1414
1339
  )
1415
1340
 
1416
- if not isinstance(recv_obj, BatchEmbeddingOut):
1341
+ if not isinstance(recv_obj, BatchEmbeddingOutput):
1417
1342
  meta_info.update(
1418
1343
  {
1419
1344
  "completion_tokens": recv_obj.completion_tokens[i],
@@ -1424,7 +1349,7 @@ class TokenizerManager(TokenizerCommunicatorMixin):
1424
1349
  if getattr(recv_obj, "output_hidden_states", None):
1425
1350
  meta_info["hidden_states"] = recv_obj.output_hidden_states[i]
1426
1351
 
1427
- if isinstance(recv_obj, BatchStrOut):
1352
+ if isinstance(recv_obj, BatchStrOutput):
1428
1353
  state.text += recv_obj.output_strs[i]
1429
1354
  if state.obj.stream:
1430
1355
  state.output_ids.extend(recv_obj.output_ids[i])
@@ -1439,7 +1364,7 @@ class TokenizerManager(TokenizerCommunicatorMixin):
1439
1364
  "output_ids": output_token_ids,
1440
1365
  "meta_info": meta_info,
1441
1366
  }
1442
- elif isinstance(recv_obj, BatchTokenIDOut):
1367
+ elif isinstance(recv_obj, BatchTokenIDOutput):
1443
1368
  if self.server_args.stream_output and state.obj.stream:
1444
1369
  state.output_ids.extend(recv_obj.output_ids[i])
1445
1370
  output_token_ids = state.output_ids[state.last_output_offset :]
@@ -1452,10 +1377,10 @@ class TokenizerManager(TokenizerCommunicatorMixin):
1452
1377
  "output_ids": output_token_ids,
1453
1378
  "meta_info": meta_info,
1454
1379
  }
1455
- elif isinstance(recv_obj, BatchMultimodalOut):
1380
+ elif isinstance(recv_obj, BatchMultimodalOutput):
1456
1381
  raise NotImplementedError("BatchMultimodalOut not implemented")
1457
1382
  else:
1458
- assert isinstance(recv_obj, BatchEmbeddingOut)
1383
+ assert isinstance(recv_obj, BatchEmbeddingOutput)
1459
1384
  out_dict = {
1460
1385
  "embedding": recv_obj.embeddings[i],
1461
1386
  "meta_info": meta_info,
@@ -1494,7 +1419,7 @@ class TokenizerManager(TokenizerCommunicatorMixin):
1494
1419
  top_logprobs_num: int,
1495
1420
  token_ids_logprob: List[int],
1496
1421
  return_text_in_logprobs: bool,
1497
- recv_obj: BatchStrOut,
1422
+ recv_obj: BatchStrOutput,
1498
1423
  recv_obj_index: int,
1499
1424
  ):
1500
1425
  if recv_obj.input_token_logprobs_val is None:
@@ -1612,17 +1537,17 @@ class TokenizerManager(TokenizerCommunicatorMixin):
1612
1537
  ret.append(None)
1613
1538
  return ret
1614
1539
 
1615
- def collect_metrics(self, state: ReqState, recv_obj: BatchStrOut, i: int):
1540
+ def collect_metrics(self, state: ReqState, recv_obj: BatchStrOutput, i: int):
1616
1541
  completion_tokens = (
1617
1542
  recv_obj.completion_tokens[i]
1618
1543
  if getattr(recv_obj, "completion_tokens", None)
1619
1544
  else 0
1620
1545
  )
1621
1546
 
1622
- customer_labels = getattr(state.obj, "customer_labels", None)
1547
+ custom_labels = getattr(state.obj, "custom_labels", None)
1623
1548
  labels = (
1624
- {**self.metrics_collector.labels, **customer_labels}
1625
- if customer_labels
1549
+ {**self.metrics_collector.labels, **custom_labels}
1550
+ if custom_labels
1626
1551
  else self.metrics_collector.labels
1627
1552
  )
1628
1553
  if (
@@ -1708,7 +1633,7 @@ class TokenizerManager(TokenizerCommunicatorMixin):
1708
1633
 
1709
1634
  asyncio.create_task(asyncio.to_thread(background_task))
1710
1635
 
1711
- def _handle_abort_req(self, recv_obj):
1636
+ def _handle_abort_req(self, recv_obj: AbortReq):
1712
1637
  if is_health_check_generate_req(recv_obj):
1713
1638
  return
1714
1639
  state = self.rid_to_state[recv_obj.rid]
@@ -1874,6 +1799,29 @@ class TokenizerManager(TokenizerCommunicatorMixin):
1874
1799
  load_udpate_req = WatchLoadUpdateReq(loads=loads)
1875
1800
  self.send_to_scheduler.send_pyobj(load_udpate_req)
1876
1801
 
1802
+ def _trace_request_start(
1803
+ self,
1804
+ obj: Union[GenerateReqInput, EmbeddingReqInput],
1805
+ created_time: Optional[float] = None,
1806
+ ):
1807
+ if obj.is_single:
1808
+ bootstrap_room = (
1809
+ obj.bootstrap_room if hasattr(obj, "bootstrap_room") else None
1810
+ )
1811
+ trace_req_start(obj.rid, bootstrap_room, ts=int(created_time * 1e9))
1812
+ trace_slice_start("", obj.rid, ts=int(created_time * 1e9), anonymous=True)
1813
+ else:
1814
+ for i in range(len(obj.rid)):
1815
+ bootstrap_room = (
1816
+ obj.bootstrap_room[i]
1817
+ if hasattr(obj, "bootstrap_room") and obj.bootstrap_room
1818
+ else None
1819
+ )
1820
+ trace_req_start(obj.rid[i], bootstrap_room, ts=int(created_time * 1e9))
1821
+ trace_slice_start(
1822
+ "", obj.rid[i], ts=int(created_time * 1e9), anonymous=True
1823
+ )
1824
+
1877
1825
 
1878
1826
  class ServerStatus(Enum):
1879
1827
  Up = "Up"
@@ -1919,7 +1867,7 @@ class SignalHandler:
1919
1867
 
1920
1868
  def running_phase_sigquit_handler(self, signum=None, frame=None):
1921
1869
  logger.error(
1922
- "Received sigquit from a child process. It usually means the child failed."
1870
+ f"SIGQUIT received. {signum=}, {frame=}. It usually means one child failed."
1923
1871
  )
1924
1872
  self.tokenizer_manager.dump_requests_before_crash()
1925
1873
  kill_process_tree(os.getpid())