sglang 0.5.2rc1__py3-none-any.whl → 0.5.3rc0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (265)
  1. sglang/bench_one_batch_server.py +10 -1
  2. sglang/bench_serving.py +257 -29
  3. sglang/lang/interpreter.py +1 -1
  4. sglang/srt/configs/__init__.py +4 -0
  5. sglang/srt/configs/device_config.py +3 -1
  6. sglang/srt/configs/dots_vlm.py +139 -0
  7. sglang/srt/configs/internvl.py +6 -0
  8. sglang/srt/configs/load_config.py +1 -0
  9. sglang/srt/configs/model_config.py +50 -6
  10. sglang/srt/configs/qwen3_next.py +326 -0
  11. sglang/srt/connector/__init__.py +8 -1
  12. sglang/srt/connector/remote_instance.py +82 -0
  13. sglang/srt/constrained/base_grammar_backend.py +48 -12
  14. sglang/srt/constrained/llguidance_backend.py +0 -1
  15. sglang/srt/constrained/outlines_backend.py +0 -1
  16. sglang/srt/constrained/xgrammar_backend.py +28 -9
  17. sglang/srt/custom_op.py +11 -1
  18. sglang/srt/debug_utils/dump_comparator.py +81 -44
  19. sglang/srt/debug_utils/dump_loader.py +97 -0
  20. sglang/srt/debug_utils/dumper.py +11 -3
  21. sglang/srt/debug_utils/text_comparator.py +73 -11
  22. sglang/srt/disaggregation/base/conn.py +1 -1
  23. sglang/srt/disaggregation/common/conn.py +15 -12
  24. sglang/srt/disaggregation/decode.py +21 -10
  25. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +4 -1
  26. sglang/srt/disaggregation/fake/conn.py +1 -1
  27. sglang/srt/disaggregation/mini_lb.py +6 -445
  28. sglang/srt/disaggregation/mooncake/conn.py +18 -10
  29. sglang/srt/disaggregation/nixl/conn.py +180 -16
  30. sglang/srt/disaggregation/prefill.py +5 -3
  31. sglang/srt/disaggregation/utils.py +5 -50
  32. sglang/srt/distributed/parallel_state.py +67 -43
  33. sglang/srt/entrypoints/engine.py +38 -17
  34. sglang/srt/entrypoints/grpc_request_manager.py +580 -0
  35. sglang/srt/entrypoints/grpc_server.py +680 -0
  36. sglang/srt/entrypoints/http_server.py +88 -53
  37. sglang/srt/entrypoints/openai/protocol.py +7 -4
  38. sglang/srt/entrypoints/openai/serving_base.py +46 -3
  39. sglang/srt/entrypoints/openai/serving_chat.py +39 -19
  40. sglang/srt/entrypoints/openai/serving_completions.py +15 -4
  41. sglang/srt/entrypoints/openai/serving_embedding.py +9 -4
  42. sglang/srt/entrypoints/openai/serving_rerank.py +3 -1
  43. sglang/srt/entrypoints/openai/serving_responses.py +7 -4
  44. sglang/srt/entrypoints/openai/serving_score.py +1 -0
  45. sglang/srt/eplb/eplb_manager.py +2 -2
  46. sglang/srt/eplb/expert_distribution.py +26 -13
  47. sglang/srt/eplb/expert_location.py +8 -3
  48. sglang/srt/eplb/expert_location_updater.py +1 -1
  49. sglang/srt/function_call/base_format_detector.py +3 -6
  50. sglang/srt/function_call/ebnf_composer.py +11 -9
  51. sglang/srt/function_call/function_call_parser.py +6 -0
  52. sglang/srt/function_call/glm4_moe_detector.py +1 -1
  53. sglang/srt/function_call/gpt_oss_detector.py +1 -1
  54. sglang/srt/function_call/qwen3_coder_detector.py +1 -1
  55. sglang/srt/grpc/__init__.py +1 -0
  56. sglang/srt/grpc/sglang_scheduler_pb2.py +106 -0
  57. sglang/srt/grpc/sglang_scheduler_pb2.pyi +427 -0
  58. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +236 -0
  59. sglang/srt/hf_transformers_utils.py +4 -0
  60. sglang/srt/layers/activation.py +142 -9
  61. sglang/srt/layers/attention/aiter_backend.py +93 -68
  62. sglang/srt/layers/attention/ascend_backend.py +11 -4
  63. sglang/srt/layers/attention/fla/chunk.py +242 -0
  64. sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
  65. sglang/srt/layers/attention/fla/chunk_o.py +178 -0
  66. sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
  67. sglang/srt/layers/attention/fla/cumsum.py +300 -0
  68. sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
  69. sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
  70. sglang/srt/layers/attention/fla/index.py +37 -0
  71. sglang/srt/layers/attention/fla/l2norm.py +150 -0
  72. sglang/srt/layers/attention/fla/layernorm_gated.py +326 -0
  73. sglang/srt/layers/attention/fla/op.py +66 -0
  74. sglang/srt/layers/attention/fla/solve_tril.py +465 -0
  75. sglang/srt/layers/attention/fla/utils.py +331 -0
  76. sglang/srt/layers/attention/fla/wy_fast.py +158 -0
  77. sglang/srt/layers/attention/flashinfer_backend.py +6 -4
  78. sglang/srt/layers/attention/flashinfer_mla_backend.py +16 -12
  79. sglang/srt/layers/attention/hybrid_attn_backend.py +57 -50
  80. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +602 -0
  81. sglang/srt/layers/attention/intel_amx_backend.py +3 -0
  82. sglang/srt/layers/attention/mamba/causal_conv1d.py +128 -0
  83. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +1052 -0
  84. sglang/srt/layers/attention/mamba/mamba.py +64 -0
  85. sglang/srt/layers/attention/torch_native_backend.py +12 -6
  86. sglang/srt/layers/attention/triton_backend.py +18 -1
  87. sglang/srt/layers/attention/trtllm_mla_backend.py +124 -31
  88. sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
  89. sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
  90. sglang/srt/layers/communicator.py +45 -7
  91. sglang/srt/layers/dp_attention.py +30 -1
  92. sglang/srt/layers/layernorm.py +32 -15
  93. sglang/srt/layers/linear.py +34 -3
  94. sglang/srt/layers/logits_processor.py +29 -10
  95. sglang/srt/layers/moe/__init__.py +2 -1
  96. sglang/srt/layers/moe/cutlass_w4a8_moe.py +3 -3
  97. sglang/srt/layers/moe/ep_moe/kernels.py +1 -1
  98. sglang/srt/layers/moe/ep_moe/layer.py +182 -62
  99. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +156 -0
  100. sglang/srt/layers/moe/fused_moe_native.py +5 -3
  101. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  102. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +146 -0
  103. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +146 -0
  104. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/{E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json } +29 -29
  105. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  106. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
  107. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
  108. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
  109. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
  110. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  111. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
  112. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
  113. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +1 -1
  114. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +23 -20
  115. sglang/srt/layers/moe/fused_moe_triton/layer.py +61 -59
  116. sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
  117. sglang/srt/layers/moe/moe_runner/base.py +274 -1
  118. sglang/srt/layers/moe/moe_runner/runner.py +80 -0
  119. sglang/srt/layers/moe/moe_runner/triton.py +448 -0
  120. sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
  121. sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
  122. sglang/srt/layers/moe/token_dispatcher/deepep.py +43 -39
  123. sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
  124. sglang/srt/layers/moe/topk.py +30 -9
  125. sglang/srt/layers/moe/utils.py +12 -7
  126. sglang/srt/layers/quantization/awq.py +19 -7
  127. sglang/srt/layers/quantization/base_config.py +11 -6
  128. sglang/srt/layers/quantization/blockwise_int8.py +38 -27
  129. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
  130. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
  131. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
  132. sglang/srt/layers/quantization/fp8.py +76 -47
  133. sglang/srt/layers/quantization/fp8_utils.py +50 -31
  134. sglang/srt/layers/quantization/gptq.py +25 -17
  135. sglang/srt/layers/quantization/modelopt_quant.py +182 -49
  136. sglang/srt/layers/quantization/moe_wna16.py +21 -18
  137. sglang/srt/layers/quantization/mxfp4.py +68 -41
  138. sglang/srt/layers/quantization/quark/quark_moe.py +32 -27
  139. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +49 -30
  140. sglang/srt/layers/quantization/quark/utils.py +97 -0
  141. sglang/srt/layers/quantization/rocm_mxfp4_utils.py +13 -0
  142. sglang/srt/layers/quantization/unquant.py +135 -47
  143. sglang/srt/layers/quantization/w4afp8.py +30 -17
  144. sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
  145. sglang/srt/layers/quantization/w8a8_int8.py +76 -38
  146. sglang/srt/layers/rocm_linear_utils.py +44 -0
  147. sglang/srt/layers/rotary_embedding.py +0 -18
  148. sglang/srt/layers/sampler.py +162 -18
  149. sglang/srt/lora/backend/base_backend.py +50 -8
  150. sglang/srt/lora/backend/triton_backend.py +90 -2
  151. sglang/srt/lora/layers.py +32 -0
  152. sglang/srt/lora/lora.py +4 -1
  153. sglang/srt/lora/lora_manager.py +35 -112
  154. sglang/srt/lora/mem_pool.py +24 -10
  155. sglang/srt/lora/utils.py +18 -9
  156. sglang/srt/managers/async_dynamic_batch_tokenizer.py +170 -0
  157. sglang/srt/managers/cache_controller.py +200 -199
  158. sglang/srt/managers/data_parallel_controller.py +105 -35
  159. sglang/srt/managers/detokenizer_manager.py +8 -4
  160. sglang/srt/managers/disagg_service.py +46 -0
  161. sglang/srt/managers/io_struct.py +199 -12
  162. sglang/srt/managers/mm_utils.py +1 -0
  163. sglang/srt/managers/multi_tokenizer_mixin.py +351 -397
  164. sglang/srt/managers/schedule_batch.py +77 -56
  165. sglang/srt/managers/schedule_policy.py +4 -3
  166. sglang/srt/managers/scheduler.py +191 -139
  167. sglang/srt/managers/scheduler_metrics_mixin.py +116 -9
  168. sglang/srt/managers/scheduler_output_processor_mixin.py +55 -11
  169. sglang/srt/managers/scheduler_profiler_mixin.py +1 -1
  170. sglang/srt/managers/template_manager.py +3 -3
  171. sglang/srt/managers/tokenizer_communicator_mixin.py +569 -0
  172. sglang/srt/managers/tokenizer_manager.py +260 -519
  173. sglang/srt/managers/tp_worker.py +53 -4
  174. sglang/srt/managers/tp_worker_overlap_thread.py +42 -19
  175. sglang/srt/mem_cache/allocator.py +1 -1
  176. sglang/srt/mem_cache/hicache_storage.py +18 -33
  177. sglang/srt/mem_cache/hiradix_cache.py +108 -48
  178. sglang/srt/mem_cache/memory_pool.py +347 -48
  179. sglang/srt/mem_cache/memory_pool_host.py +121 -57
  180. sglang/srt/mem_cache/radix_cache.py +0 -2
  181. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
  182. sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
  183. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +95 -5
  184. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +280 -0
  185. sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
  186. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +81 -20
  187. sglang/srt/mem_cache/storage/mooncake_store/test_mooncake_store.py +161 -0
  188. sglang/srt/mem_cache/swa_radix_cache.py +0 -2
  189. sglang/srt/metrics/collector.py +502 -77
  190. sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
  191. sglang/srt/metrics/utils.py +48 -0
  192. sglang/srt/model_executor/cpu_graph_runner.py +640 -0
  193. sglang/srt/model_executor/cuda_graph_runner.py +13 -5
  194. sglang/srt/model_executor/forward_batch_info.py +75 -19
  195. sglang/srt/model_executor/model_runner.py +357 -30
  196. sglang/srt/model_loader/__init__.py +9 -3
  197. sglang/srt/model_loader/loader.py +128 -4
  198. sglang/srt/model_loader/weight_utils.py +2 -1
  199. sglang/srt/models/apertus.py +686 -0
  200. sglang/srt/models/bailing_moe.py +798 -218
  201. sglang/srt/models/bailing_moe_nextn.py +168 -0
  202. sglang/srt/models/deepseek_v2.py +346 -48
  203. sglang/srt/models/dots_vlm.py +174 -0
  204. sglang/srt/models/dots_vlm_vit.py +337 -0
  205. sglang/srt/models/ernie4.py +1 -1
  206. sglang/srt/models/gemma3n_mm.py +1 -1
  207. sglang/srt/models/glm4_moe.py +11 -2
  208. sglang/srt/models/glm4v.py +4 -2
  209. sglang/srt/models/glm4v_moe.py +3 -0
  210. sglang/srt/models/gpt_oss.py +1 -1
  211. sglang/srt/models/internvl.py +28 -0
  212. sglang/srt/models/llama4.py +9 -0
  213. sglang/srt/models/llama_eagle3.py +13 -0
  214. sglang/srt/models/longcat_flash.py +2 -2
  215. sglang/srt/models/minicpmv.py +165 -3
  216. sglang/srt/models/mllama4.py +25 -0
  217. sglang/srt/models/opt.py +637 -0
  218. sglang/srt/models/qwen2.py +7 -0
  219. sglang/srt/models/qwen2_5_vl.py +27 -3
  220. sglang/srt/models/qwen2_moe.py +60 -13
  221. sglang/srt/models/qwen3.py +8 -2
  222. sglang/srt/models/qwen3_moe.py +40 -9
  223. sglang/srt/models/qwen3_next.py +1042 -0
  224. sglang/srt/models/qwen3_next_mtp.py +112 -0
  225. sglang/srt/models/step3_vl.py +1 -1
  226. sglang/srt/models/torch_native_llama.py +1 -1
  227. sglang/srt/multimodal/processors/dots_vlm.py +99 -0
  228. sglang/srt/multimodal/processors/glm4v.py +9 -9
  229. sglang/srt/multimodal/processors/internvl.py +141 -129
  230. sglang/srt/multimodal/processors/qwen_vl.py +15 -5
  231. sglang/srt/offloader.py +27 -3
  232. sglang/srt/{reasoning_parser.py → parser/reasoning_parser.py} +1 -1
  233. sglang/srt/remote_instance_weight_loader_utils.py +69 -0
  234. sglang/srt/sampling/sampling_batch_info.py +18 -15
  235. sglang/srt/server_args.py +355 -37
  236. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +5 -0
  237. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +10 -1
  238. sglang/srt/speculative/eagle_utils.py +0 -2
  239. sglang/srt/speculative/eagle_worker.py +197 -112
  240. sglang/srt/speculative/spec_info.py +5 -0
  241. sglang/srt/speculative/standalone_worker.py +109 -0
  242. sglang/srt/tracing/trace.py +552 -0
  243. sglang/srt/utils.py +46 -3
  244. sglang/srt/weight_sync/utils.py +1 -1
  245. sglang/test/attention/test_trtllm_mla_backend.py +169 -5
  246. sglang/test/few_shot_gsm8k.py +1 -0
  247. sglang/test/runners.py +4 -0
  248. sglang/test/test_cutlass_moe.py +24 -6
  249. sglang/test/test_disaggregation_utils.py +66 -0
  250. sglang/test/test_fp4_moe.py +370 -1
  251. sglang/test/test_utils.py +28 -1
  252. sglang/utils.py +12 -0
  253. sglang/version.py +1 -1
  254. {sglang-0.5.2rc1.dist-info → sglang-0.5.3rc0.dist-info}/METADATA +59 -123
  255. {sglang-0.5.2rc1.dist-info → sglang-0.5.3rc0.dist-info}/RECORD +263 -200
  256. sglang/srt/disaggregation/launch_lb.py +0 -118
  257. sglang/srt/mem_cache/storage/mooncake_store/unit_test.py +0 -40
  258. /sglang/srt/{model_parallel.py → layers/model_parallel.py} +0 -0
  259. /sglang/srt/{code_completion_parser.py → parser/code_completion_parser.py} +0 -0
  260. /sglang/srt/{conversation.py → parser/conversation.py} +0 -0
  261. /sglang/srt/{harmony_parser.py → parser/harmony_parser.py} +0 -0
  262. /sglang/srt/{jinja_template_utils.py → parser/jinja_template_utils.py} +0 -0
  263. {sglang-0.5.2rc1.dist-info → sglang-0.5.3rc0.dist-info}/WHEEL +0 -0
  264. {sglang-0.5.2rc1.dist-info → sglang-0.5.3rc0.dist-info}/licenses/LICENSE +0 -0
  265. {sglang-0.5.2rc1.dist-info → sglang-0.5.3rc0.dist-info}/top_level.txt +0 -0
@@ -1,15 +1,24 @@
1
+ from __future__ import annotations
2
+
1
3
  import logging
2
4
  import time
3
5
  from collections import defaultdict
4
- from typing import List, Optional
6
+ from typing import TYPE_CHECKING, Dict, List, Optional, Union
7
+
8
+ import torch
5
9
 
6
10
  from sglang.srt.disaggregation.kv_events import EventPublisherFactory, KVEventBatch
7
11
  from sglang.srt.disaggregation.utils import DisaggregationMode
12
+ from sglang.srt.managers.io_struct import TokenizedGenerateReqInput
8
13
  from sglang.srt.managers.schedule_policy import PrefillAdder
9
14
  from sglang.srt.managers.scheduler import Req, ScheduleBatch
15
+ from sglang.srt.managers.utils import DPBalanceMeta
10
16
  from sglang.srt.metrics.collector import SchedulerMetricsCollector, SchedulerStats
11
17
  from sglang.srt.utils import get_bool_env_var
12
18
 
19
+ if TYPE_CHECKING:
20
+ from sglang.srt.managers.scheduler import Scheduler
21
+
13
22
  logger = logging.getLogger(__name__)
14
23
 
15
24
  RECORD_STEP_TIME = get_bool_env_var("SGLANG_RECORD_STEP_TIME")
@@ -28,7 +37,9 @@ class KvMetrics:
28
37
 
29
38
 
30
39
  class SchedulerMetricsMixin:
31
- def init_metrics(self, tp_rank: int, pp_rank: int, dp_rank: Optional[int]):
40
+ def init_metrics(
41
+ self: Scheduler, tp_rank: int, pp_rank: int, dp_rank: Optional[int]
42
+ ):
32
43
  self.last_gen_throughput: float = 0.0
33
44
  self.last_input_throughput: float = 0.0
34
45
  self.step_time_dict = defaultdict(list) # Dict[batch size -> step time]
@@ -50,14 +61,24 @@ class SchedulerMetricsMixin:
50
61
  labels["dp_rank"] = dp_rank
51
62
  self.metrics_collector = SchedulerMetricsCollector(labels=labels)
52
63
 
53
- def init_kv_events(self, kv_events_config: Optional[str]):
64
+ def init_dp_balance(self: Scheduler, dp_balance_meta: Optional[DPBalanceMeta]):
65
+ self.balance_meta = dp_balance_meta
66
+ if (
67
+ self.server_args.enable_dp_attention
68
+ and self.server_args.load_balance_method == "minimum_tokens"
69
+ ):
70
+ assert dp_balance_meta is not None
71
+
72
+ self.recv_dp_balance_id_this_term = []
73
+
74
+ def init_kv_events(self: Scheduler, kv_events_config: Optional[str]):
54
75
  if self.enable_kv_cache_events:
55
76
  self.kv_event_publisher = EventPublisherFactory.create(
56
77
  kv_events_config, self.attn_dp_rank
57
78
  )
58
79
 
59
80
  def log_prefill_stats(
60
- self,
81
+ self: Scheduler,
61
82
  adder: PrefillAdder,
62
83
  can_run_list: List[Req],
63
84
  running_bs: int,
@@ -138,7 +159,7 @@ class SchedulerMetricsMixin:
138
159
  self._publish_kv_events()
139
160
 
140
161
  def log_decode_stats(
141
- self, can_run_cuda_graph: bool, running_batch: ScheduleBatch = None
162
+ self: Scheduler, can_run_cuda_graph: bool, running_batch: ScheduleBatch = None
142
163
  ):
143
164
  batch = running_batch or self.running_batch
144
165
 
@@ -193,7 +214,7 @@ class SchedulerMetricsMixin:
193
214
  msg += f"#retracted-req: {len(self.disagg_decode_prealloc_queue.retracted_queue)}, "
194
215
 
195
216
  msg += (
196
- f"cuda graph: {can_run_cuda_graph}, "
217
+ f"{'cpu graph' if self.device == 'cpu' else 'cuda graph'}: {can_run_cuda_graph}, "
197
218
  f"gen throughput (token/s): {self.last_gen_throughput:.2f}, "
198
219
  f"#queue-req: {len(self.waiting_queue)}, "
199
220
  )
@@ -209,7 +230,7 @@ class SchedulerMetricsMixin:
209
230
  self.stats.num_grammar_queue_reqs = len(self.grammar_queue)
210
231
  self.stats.spec_accept_length = spec_accept_length
211
232
  self.stats.total_retracted_reqs = self.total_retracted_reqs
212
- self.metrics_collector.log_stats(self.stats)
233
+ self.stats.avg_request_queue_latency = 0.0
213
234
  if self.disaggregation_mode == DisaggregationMode.DECODE:
214
235
  self.stats.num_decode_prealloc_queue_reqs = len(
215
236
  self.disagg_decode_prealloc_queue.queue
@@ -217,10 +238,11 @@ class SchedulerMetricsMixin:
217
238
  self.stats.num_decode_transfer_queue_reqs = len(
218
239
  self.disagg_decode_transfer_queue.queue
219
240
  )
241
+ self.metrics_collector.log_stats(self.stats)
220
242
  self._emit_kv_metrics()
221
243
  self._publish_kv_events()
222
244
 
223
- def _emit_kv_metrics(self):
245
+ def _emit_kv_metrics(self: Scheduler):
224
246
  kv_metrics = KvMetrics()
225
247
  kv_metrics.request_active_slots = self.stats.num_running_reqs
226
248
  kv_metrics.request_total_slots = self.max_running_requests
@@ -236,9 +258,94 @@ class SchedulerMetricsMixin:
236
258
  if not self.send_metrics_from_scheduler.closed:
237
259
  self.send_metrics_from_scheduler.send_pyobj(kv_metrics)
238
260
 
239
- def _publish_kv_events(self):
261
+ def _publish_kv_events(self: Scheduler):
240
262
  if self.enable_kv_cache_events:
241
263
  events = self.tree_cache.take_events()
242
264
  if events:
243
265
  batch = KVEventBatch(ts=time.time(), events=events)
244
266
  self.kv_event_publisher.publish(batch)
267
+
268
+ def maybe_update_dp_balance_data(
269
+ self: Scheduler, recv_req: TokenizedGenerateReqInput
270
+ ):
271
+ if (
272
+ self.server_args.enable_dp_attention
273
+ and self.server_args.load_balance_method == "minimum_tokens"
274
+ ):
275
+ self.recv_dp_balance_id_this_term.append(recv_req.dp_balance_id)
276
+
277
+ def maybe_handle_dp_balance_data(self: Scheduler):
278
+ if (
279
+ self.server_args.load_balance_method == "minimum_tokens"
280
+ and self.forward_ct % 40 == 0
281
+ ):
282
+ holding_tokens = self.get_load().num_tokens
283
+
284
+ new_recv_dp_balance_id_list, holding_token_list = (
285
+ self.gather_dp_balance_info(holding_tokens)
286
+ )
287
+
288
+ self.recv_dp_balance_id_this_term.clear()
289
+ if self.tp_rank == 0: # only first worker write info
290
+ self.write_shared_dp_balance_info(
291
+ new_recv_dp_balance_id_list, holding_token_list
292
+ )
293
+
294
+ def gather_dp_balance_info(
295
+ self: Scheduler, holding_tokens_list
296
+ ) -> Union[None, List[List[int]]]:
297
+ """gather recv_dp_balance_id_this_term and holding tokens per worker for dp balance"""
298
+ recv_list = self.recv_dp_balance_id_this_term
299
+ assert len(recv_list) <= 511, (
300
+ "The number of requests received this round is too large. "
301
+ "Please increase gather_tensor_size and onfly_info_size."
302
+ )
303
+ # The maximum size of the tensor used for gathering data from all workers.
304
+ gather_tensor_size = 512
305
+
306
+ # recv_tensor: | holding_tokens | len(recv_dp_balance_id) | recv_dp_balance_ids
307
+ recv_tensor = torch.zeros(gather_tensor_size, dtype=torch.int32)
308
+ recv_tensor[0] = holding_tokens_list
309
+ recv_tensor[1] = len(recv_list) # The first element is the length of the list.
310
+ recv_tensor[2 : len(recv_list) + 2] = torch.tensor(recv_list, dtype=torch.int32)
311
+
312
+ if self.tp_rank == 0:
313
+ gathered_list = [
314
+ torch.zeros(gather_tensor_size, dtype=torch.int32)
315
+ for _ in range(self.balance_meta.num_workers)
316
+ ]
317
+ else:
318
+ gathered_list = None
319
+
320
+ torch.distributed.gather(recv_tensor, gathered_list, group=self.tp_cpu_group)
321
+
322
+ gathered_id_list_per_worker = None
323
+ if self.tp_rank == 0:
324
+ gathered_id_list_per_worker = []
325
+ holding_tokens_list = []
326
+ for tensor in gathered_list:
327
+ holding_tokens_list.append(tensor[0].item())
328
+ list_length = tensor[1].item()
329
+ gathered_id_list_per_worker.append(tensor[2 : list_length + 2].tolist())
330
+
331
+ return gathered_id_list_per_worker, holding_tokens_list
332
+
333
+ def write_shared_dp_balance_info(self: Scheduler, new_recv_rid_lists, local_tokens):
334
+ meta = self.balance_meta
335
+
336
+ with meta.mutex:
337
+ onfly_list: List[Dict[int, int]] = meta.get_shared_onfly()
338
+ assert len(new_recv_rid_lists) == len(onfly_list), "num_worker not equal"
339
+ # 1.Check if the rid received by each worker this round is present in onfly.
340
+ # If it is, remove the corresponding onfly item.
341
+ worker_id = 0
342
+ for new_recv_rids, on_fly_reqs in zip(new_recv_rid_lists, onfly_list):
343
+ for new_recv_rid in new_recv_rids:
344
+ assert (
345
+ new_recv_rid in on_fly_reqs
346
+ ), f"{new_recv_rid=} not in {worker_id=} {on_fly_reqs=}, data consistency is wrong"
347
+ del on_fly_reqs[new_recv_rid]
348
+ worker_id += 1
349
+ # 2. Atomically write local_tokens and onfly into shm under the mutex
350
+ meta.set_shared_onfly_info(onfly_list)
351
+ meta.set_shared_local_tokens(local_tokens)
@@ -5,6 +5,8 @@ import threading
5
5
  import time
6
6
  from typing import TYPE_CHECKING, List, Optional, Tuple, Union
7
7
 
8
+ import torch
9
+
8
10
  from sglang.srt.disaggregation.utils import DisaggregationMode
9
11
  from sglang.srt.layers.logits_processor import LogitsProcessorOutput
10
12
  from sglang.srt.managers.io_struct import AbortReq, BatchEmbeddingOut, BatchTokenIDOut
@@ -71,6 +73,7 @@ class SchedulerOutputProcessorMixin:
71
73
 
72
74
  # Check finish conditions
73
75
  logprob_pt = 0
76
+
74
77
  for i, (req, next_token_id) in enumerate(zip(batch.reqs, next_token_ids)):
75
78
  if req.is_retracted:
76
79
  continue
@@ -99,6 +102,7 @@ class SchedulerOutputProcessorMixin:
99
102
  extend_logprob_start_len = extend_logprob_start_len_per_req[i]
100
103
  extend_input_len = extend_input_len_per_req[i]
101
104
  num_input_logprobs = extend_input_len - extend_logprob_start_len
105
+
102
106
  if req.return_logprob:
103
107
  self.add_logprob_return_values(
104
108
  i,
@@ -441,27 +445,59 @@ class SchedulerOutputProcessorMixin:
441
445
  output: LogitsProcessorOutput,
442
446
  ):
443
447
  """Attach logprobs to the return values."""
444
- req.output_token_logprobs_val.append(output.next_token_logprobs[i])
445
- req.output_token_logprobs_idx.append(next_token_ids[i])
446
-
447
- self.add_input_logprob_return_values(
448
- i, req, output, pt, num_input_logprobs, last_prefill_chunk=True
449
- )
448
+ if output.next_token_logprobs is not None:
449
+ req.output_token_logprobs_val.append(output.next_token_logprobs[i])
450
+ req.output_token_logprobs_idx.append(next_token_ids[i])
451
+
452
+ # Only add input logprobs if there are input tokens to process
453
+ # Note: For prefill-only requests with default logprob_start_len, this will be 0,
454
+ # meaning we only compute output logprobs (which is the intended behavior)
455
+ if num_input_logprobs > 0:
456
+ self.add_input_logprob_return_values(
457
+ i, req, output, pt, num_input_logprobs, last_prefill_chunk=True
458
+ )
459
+ else:
460
+ self._initialize_empty_logprob_containers(req)
450
461
 
451
462
  if req.top_logprobs_num > 0:
452
463
  req.output_top_logprobs_val.append(output.next_token_top_logprobs_val[i])
453
464
  req.output_top_logprobs_idx.append(output.next_token_top_logprobs_idx[i])
454
465
 
455
- if req.token_ids_logprob is not None:
456
- req.output_token_ids_logprobs_val.append(
457
- output.next_token_token_ids_logprobs_val[i]
458
- )
466
+ if (
467
+ req.token_ids_logprob is not None
468
+ and output.next_token_token_ids_logprobs_val is not None
469
+ ):
470
+ # Convert GPU tensor to list if needed
471
+ logprobs_val = output.next_token_token_ids_logprobs_val[i]
472
+ if isinstance(logprobs_val, torch.Tensor):
473
+ logprobs_val = logprobs_val.tolist()
474
+ req.output_token_ids_logprobs_val.append(logprobs_val)
459
475
  req.output_token_ids_logprobs_idx.append(
460
476
  output.next_token_token_ids_logprobs_idx[i]
461
477
  )
462
478
 
463
479
  return num_input_logprobs
464
480
 
481
+ def _initialize_empty_logprob_containers(self, req: Req) -> None:
482
+ """
483
+ Initialize logprob fields to empty lists if unset.
484
+
485
+ This is needed for prefill-only requests where the normal initialization
486
+ flow might be bypassed, but downstream code expects these fields to be lists.
487
+ """
488
+ if req.input_token_logprobs_val is None:
489
+ req.input_token_logprobs_val = []
490
+ if req.input_token_logprobs_idx is None:
491
+ req.input_token_logprobs_idx = []
492
+ if req.input_top_logprobs_val is None:
493
+ req.input_top_logprobs_val = []
494
+ if req.input_top_logprobs_idx is None:
495
+ req.input_top_logprobs_idx = []
496
+ if req.input_token_ids_logprobs_val is None:
497
+ req.input_token_ids_logprobs_val = []
498
+ if req.input_token_ids_logprobs_idx is None:
499
+ req.input_token_ids_logprobs_idx = []
500
+
465
501
  def stream_output(
466
502
  self: Scheduler,
467
503
  reqs: List[Req],
@@ -700,6 +736,8 @@ class SchedulerOutputProcessorMixin:
700
736
  output_token_ids_logprobs_val,
701
737
  output_token_ids_logprobs_idx,
702
738
  output_hidden_states,
739
+ placeholder_tokens_idx=None,
740
+ placeholder_tokens_val=None,
703
741
  )
704
742
  )
705
743
 
@@ -719,6 +757,12 @@ class SchedulerOutputProcessorMixin:
719
757
  cached_tokens.append(req.cached_tokens)
720
758
  self.send_to_detokenizer.send_pyobj(
721
759
  BatchEmbeddingOut(
722
- rids, finished_reasons, embeddings, prompt_tokens, cached_tokens
760
+ rids,
761
+ finished_reasons,
762
+ embeddings,
763
+ prompt_tokens,
764
+ cached_tokens,
765
+ placeholder_tokens_idx=None,
766
+ placeholder_tokens_val=None,
723
767
  )
724
768
  )
@@ -26,7 +26,7 @@ logger = logging.getLogger(__name__)
26
26
 
27
27
  class SchedulerProfilerMixin:
28
28
 
29
- def init_profier(self):
29
+ def init_profiler(self):
30
30
  self.torch_profiler = None
31
31
  self.torch_profiler_output_dir: Optional[str] = None
32
32
  self.profiler_activities: Optional[List[str]] = None
@@ -24,20 +24,20 @@ import os
24
24
  import re
25
25
  from typing import Optional
26
26
 
27
- from sglang.srt.code_completion_parser import (
27
+ from sglang.srt.parser.code_completion_parser import (
28
28
  CompletionTemplate,
29
29
  FimPosition,
30
30
  completion_template_exists,
31
31
  register_completion_template,
32
32
  )
33
- from sglang.srt.conversation import (
33
+ from sglang.srt.parser.conversation import (
34
34
  Conversation,
35
35
  SeparatorStyle,
36
36
  chat_template_exists,
37
37
  get_conv_template_by_model_path,
38
38
  register_conv_template,
39
39
  )
40
- from sglang.srt.jinja_template_utils import detect_jinja_template_content_format
40
+ from sglang.srt.parser.jinja_template_utils import detect_jinja_template_content_format
41
41
 
42
42
  logger = logging.getLogger(__name__)
43
43