sglang 0.5.2rc2__py3-none-any.whl → 0.5.3.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (396)
  1. sglang/bench_one_batch.py +7 -11
  2. sglang/bench_one_batch_server.py +330 -31
  3. sglang/bench_serving.py +474 -142
  4. sglang/compile_deep_gemm.py +3 -0
  5. sglang/global_config.py +2 -2
  6. sglang/lang/backend/runtime_endpoint.py +1 -1
  7. sglang/profiler.py +2 -2
  8. sglang/srt/batch_invariant_ops/__init__.py +27 -0
  9. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +549 -0
  10. sglang/srt/configs/__init__.py +10 -0
  11. sglang/srt/configs/device_config.py +3 -1
  12. sglang/srt/configs/dots_ocr.py +64 -0
  13. sglang/srt/configs/dots_vlm.py +139 -0
  14. sglang/srt/configs/falcon_h1.py +314 -0
  15. sglang/srt/configs/load_config.py +9 -0
  16. sglang/srt/configs/mamba_utils.py +117 -0
  17. sglang/srt/configs/model_config.py +228 -92
  18. sglang/srt/configs/nemotron_h.py +286 -0
  19. sglang/srt/configs/qwen3_next.py +294 -0
  20. sglang/srt/configs/qwen3_vl.py +586 -0
  21. sglang/srt/connector/__init__.py +8 -1
  22. sglang/srt/connector/remote_instance.py +82 -0
  23. sglang/srt/constrained/base_grammar_backend.py +49 -12
  24. sglang/srt/constrained/llguidance_backend.py +0 -1
  25. sglang/srt/constrained/outlines_backend.py +0 -1
  26. sglang/srt/constrained/outlines_jump_forward.py +1 -1
  27. sglang/srt/constrained/xgrammar_backend.py +30 -9
  28. sglang/srt/custom_op.py +11 -1
  29. sglang/srt/debug_utils/dump_comparator.py +81 -44
  30. sglang/srt/debug_utils/dump_loader.py +97 -0
  31. sglang/srt/debug_utils/dumper.py +21 -6
  32. sglang/srt/debug_utils/text_comparator.py +73 -11
  33. sglang/srt/disaggregation/ascend/conn.py +2 -2
  34. sglang/srt/disaggregation/ascend/transfer_engine.py +47 -9
  35. sglang/srt/disaggregation/base/conn.py +1 -1
  36. sglang/srt/disaggregation/common/conn.py +279 -108
  37. sglang/srt/disaggregation/decode.py +78 -37
  38. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
  39. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +29 -17
  40. sglang/srt/disaggregation/fake/conn.py +1 -1
  41. sglang/srt/disaggregation/mini_lb.py +6 -445
  42. sglang/srt/disaggregation/mooncake/conn.py +55 -537
  43. sglang/srt/disaggregation/nixl/conn.py +373 -68
  44. sglang/srt/disaggregation/prefill.py +53 -49
  45. sglang/srt/disaggregation/utils.py +40 -54
  46. sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
  47. sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
  48. sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
  49. sglang/srt/distributed/parallel_state.py +156 -80
  50. sglang/srt/entrypoints/engine.py +59 -18
  51. sglang/srt/entrypoints/grpc_request_manager.py +842 -0
  52. sglang/srt/entrypoints/grpc_server.py +950 -0
  53. sglang/srt/entrypoints/http_server.py +179 -60
  54. sglang/srt/entrypoints/openai/protocol.py +265 -29
  55. sglang/srt/entrypoints/openai/serving_base.py +65 -3
  56. sglang/srt/entrypoints/openai/serving_chat.py +213 -122
  57. sglang/srt/entrypoints/openai/serving_completions.py +14 -3
  58. sglang/srt/entrypoints/openai/serving_embedding.py +9 -3
  59. sglang/srt/entrypoints/openai/serving_rerank.py +3 -1
  60. sglang/srt/entrypoints/openai/serving_responses.py +48 -3
  61. sglang/srt/entrypoints/openai/serving_score.py +1 -0
  62. sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
  63. sglang/srt/environ.py +289 -0
  64. sglang/srt/eplb/eplb_manager.py +2 -2
  65. sglang/srt/eplb/expert_distribution.py +26 -13
  66. sglang/srt/eplb/expert_location.py +38 -8
  67. sglang/srt/eplb/expert_location_updater.py +1 -1
  68. sglang/srt/function_call/base_format_detector.py +3 -6
  69. sglang/srt/function_call/ebnf_composer.py +11 -9
  70. sglang/srt/function_call/function_call_parser.py +17 -8
  71. sglang/srt/function_call/glm4_moe_detector.py +4 -4
  72. sglang/srt/function_call/gpt_oss_detector.py +23 -0
  73. sglang/srt/function_call/json_array_parser.py +63 -0
  74. sglang/srt/function_call/kimik2_detector.py +17 -4
  75. sglang/srt/function_call/qwen3_coder_detector.py +1 -1
  76. sglang/srt/function_call/utils.py +96 -5
  77. sglang/srt/grpc/__init__.py +1 -0
  78. sglang/srt/grpc/compile_proto.py +245 -0
  79. sglang/srt/grpc/sglang_scheduler_pb2.py +119 -0
  80. sglang/srt/grpc/sglang_scheduler_pb2.pyi +492 -0
  81. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +327 -0
  82. sglang/srt/layers/activation.py +143 -9
  83. sglang/srt/layers/attention/aiter_backend.py +14 -15
  84. sglang/srt/layers/attention/ascend_backend.py +115 -9
  85. sglang/srt/layers/attention/attention_registry.py +215 -0
  86. sglang/srt/layers/attention/base_attn_backend.py +12 -3
  87. sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
  88. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
  89. sglang/srt/layers/attention/fla/chunk.py +242 -0
  90. sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
  91. sglang/srt/layers/attention/fla/chunk_o.py +178 -0
  92. sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
  93. sglang/srt/layers/attention/fla/cumsum.py +300 -0
  94. sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
  95. sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
  96. sglang/srt/layers/attention/fla/index.py +37 -0
  97. sglang/srt/layers/attention/fla/l2norm.py +150 -0
  98. sglang/srt/layers/attention/fla/layernorm_gated.py +343 -0
  99. sglang/srt/layers/attention/fla/op.py +66 -0
  100. sglang/srt/layers/attention/fla/solve_tril.py +465 -0
  101. sglang/srt/layers/attention/fla/utils.py +331 -0
  102. sglang/srt/layers/attention/fla/wy_fast.py +158 -0
  103. sglang/srt/layers/attention/flashattention_backend.py +40 -8
  104. sglang/srt/layers/attention/flashinfer_backend.py +341 -204
  105. sglang/srt/layers/attention/flashinfer_mla_backend.py +28 -28
  106. sglang/srt/layers/attention/flashmla_backend.py +7 -5
  107. sglang/srt/layers/attention/hybrid_attn_backend.py +68 -53
  108. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +708 -0
  109. sglang/srt/layers/attention/intel_amx_backend.py +3 -0
  110. sglang/srt/layers/attention/mamba/causal_conv1d.py +129 -0
  111. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +974 -0
  112. sglang/srt/layers/attention/mamba/mamba.py +577 -0
  113. sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
  114. sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
  115. sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
  116. sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
  117. sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
  118. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +214 -0
  119. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +562 -0
  120. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +646 -0
  121. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +262 -0
  122. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +264 -0
  123. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
  124. sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
  125. sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
  126. sglang/srt/layers/attention/nsa/nsa_indexer.py +761 -0
  127. sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
  128. sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
  129. sglang/srt/layers/attention/nsa/transform_index.py +144 -0
  130. sglang/srt/layers/attention/nsa/utils.py +24 -0
  131. sglang/srt/layers/attention/nsa_backend.py +887 -0
  132. sglang/srt/layers/attention/tbo_backend.py +6 -6
  133. sglang/srt/layers/attention/torch_flex_backend.py +325 -0
  134. sglang/srt/layers/attention/torch_native_backend.py +12 -6
  135. sglang/srt/layers/attention/triton_backend.py +57 -7
  136. sglang/srt/layers/attention/trtllm_mha_backend.py +5 -7
  137. sglang/srt/layers/attention/trtllm_mla_backend.py +276 -39
  138. sglang/srt/layers/attention/vision.py +58 -0
  139. sglang/srt/layers/attention/wave_backend.py +4 -4
  140. sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
  141. sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
  142. sglang/srt/layers/communicator.py +8 -0
  143. sglang/srt/layers/dp_attention.py +41 -2
  144. sglang/srt/layers/elementwise.py +3 -1
  145. sglang/srt/layers/layernorm.py +34 -15
  146. sglang/srt/layers/linear.py +55 -7
  147. sglang/srt/layers/logits_processor.py +180 -18
  148. sglang/srt/layers/modelopt_utils.py +11 -0
  149. sglang/srt/layers/moe/__init__.py +2 -1
  150. sglang/srt/layers/moe/cutlass_w4a8_moe.py +21 -24
  151. sglang/srt/layers/moe/ep_moe/kernels.py +33 -454
  152. sglang/srt/layers/moe/ep_moe/layer.py +248 -333
  153. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +183 -0
  154. sglang/srt/layers/moe/fused_moe_native.py +5 -3
  155. sglang/srt/layers/moe/fused_moe_triton/configs/{triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json } +35 -35
  156. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +146 -0
  157. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  158. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +146 -0
  159. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  160. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
  161. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
  162. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  163. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
  164. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
  165. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
  166. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  167. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
  168. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
  169. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +7 -3
  170. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +23 -20
  171. sglang/srt/layers/moe/fused_moe_triton/layer.py +68 -72
  172. sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
  173. sglang/srt/layers/moe/moe_runner/base.py +274 -1
  174. sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
  175. sglang/srt/layers/moe/moe_runner/runner.py +83 -0
  176. sglang/srt/layers/moe/moe_runner/triton.py +448 -0
  177. sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
  178. sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
  179. sglang/srt/layers/moe/token_dispatcher/deepep.py +118 -56
  180. sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
  181. sglang/srt/layers/moe/topk.py +30 -9
  182. sglang/srt/layers/moe/utils.py +29 -7
  183. sglang/srt/layers/parameter.py +23 -6
  184. sglang/srt/layers/quantization/__init__.py +1 -1
  185. sglang/srt/layers/quantization/awq.py +19 -7
  186. sglang/srt/layers/quantization/base_config.py +11 -6
  187. sglang/srt/layers/quantization/blockwise_int8.py +38 -27
  188. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
  189. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
  190. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
  191. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
  192. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
  193. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -10
  194. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
  195. sglang/srt/layers/quantization/fp8.py +155 -60
  196. sglang/srt/layers/quantization/fp8_utils.py +51 -32
  197. sglang/srt/layers/quantization/gptq.py +25 -17
  198. sglang/srt/layers/quantization/modelopt_quant.py +191 -56
  199. sglang/srt/layers/quantization/moe_wna16.py +21 -18
  200. sglang/srt/layers/quantization/mxfp4.py +74 -42
  201. sglang/srt/layers/quantization/quark/quark.py +3 -1
  202. sglang/srt/layers/quantization/quark/quark_moe.py +48 -30
  203. sglang/srt/layers/quantization/unquant.py +135 -47
  204. sglang/srt/layers/quantization/w4afp8.py +28 -33
  205. sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
  206. sglang/srt/layers/quantization/w8a8_int8.py +91 -41
  207. sglang/srt/layers/rotary_embedding.py +78 -31
  208. sglang/srt/layers/sampler.py +213 -21
  209. sglang/srt/layers/utils.py +23 -0
  210. sglang/srt/lora/backend/base_backend.py +50 -8
  211. sglang/srt/lora/backend/chunked_backend.py +348 -0
  212. sglang/srt/lora/backend/triton_backend.py +99 -5
  213. sglang/srt/lora/layers.py +32 -0
  214. sglang/srt/lora/lora.py +8 -3
  215. sglang/srt/lora/lora_manager.py +44 -118
  216. sglang/srt/lora/mem_pool.py +25 -11
  217. sglang/srt/lora/triton_ops/__init__.py +4 -0
  218. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
  219. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +174 -0
  220. sglang/srt/lora/utils.py +22 -11
  221. sglang/srt/managers/async_dynamic_batch_tokenizer.py +170 -0
  222. sglang/srt/managers/cache_controller.py +199 -301
  223. sglang/srt/managers/data_parallel_controller.py +115 -80
  224. sglang/srt/managers/detokenizer_manager.py +19 -15
  225. sglang/srt/managers/disagg_service.py +46 -0
  226. sglang/srt/managers/io_struct.py +340 -109
  227. sglang/srt/managers/mm_utils.py +44 -6
  228. sglang/srt/managers/multi_tokenizer_mixin.py +357 -407
  229. sglang/srt/managers/multimodal_processor.py +1 -2
  230. sglang/srt/managers/overlap_utils.py +55 -0
  231. sglang/srt/managers/schedule_batch.py +343 -212
  232. sglang/srt/managers/schedule_policy.py +145 -18
  233. sglang/srt/managers/scheduler.py +653 -273
  234. sglang/srt/managers/scheduler_input_blocker.py +1 -1
  235. sglang/srt/managers/scheduler_metrics_mixin.py +99 -126
  236. sglang/srt/managers/scheduler_output_processor_mixin.py +255 -108
  237. sglang/srt/managers/scheduler_profiler_mixin.py +6 -6
  238. sglang/srt/managers/scheduler_update_weights_mixin.py +7 -0
  239. sglang/srt/managers/tokenizer_communicator_mixin.py +675 -0
  240. sglang/srt/managers/tokenizer_manager.py +579 -674
  241. sglang/srt/managers/tp_worker.py +96 -26
  242. sglang/srt/managers/utils.py +1 -45
  243. sglang/srt/mem_cache/allocator.py +21 -22
  244. sglang/srt/mem_cache/allocator_ascend.py +41 -27
  245. sglang/srt/mem_cache/base_prefix_cache.py +1 -1
  246. sglang/srt/mem_cache/chunk_cache.py +9 -2
  247. sglang/srt/mem_cache/evict_policy.py +23 -0
  248. sglang/srt/mem_cache/hicache_storage.py +43 -24
  249. sglang/srt/mem_cache/hiradix_cache.py +222 -75
  250. sglang/srt/mem_cache/memory_pool.py +651 -80
  251. sglang/srt/mem_cache/memory_pool_host.py +239 -228
  252. sglang/srt/mem_cache/radix_cache.py +227 -73
  253. sglang/srt/mem_cache/radix_cache_cpp.py +11 -8
  254. sglang/srt/mem_cache/storage/__init__.py +10 -0
  255. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +151 -0
  256. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +109 -0
  257. sglang/srt/mem_cache/storage/backend_factory.py +223 -0
  258. sglang/srt/mem_cache/storage/eic/eic_storage.py +778 -0
  259. sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
  260. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
  261. sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
  262. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +259 -62
  263. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +284 -0
  264. sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
  265. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +166 -17
  266. sglang/srt/mem_cache/swa_radix_cache.py +93 -48
  267. sglang/srt/metrics/collector.py +511 -132
  268. sglang/srt/metrics/func_timer.py +2 -7
  269. sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
  270. sglang/srt/metrics/utils.py +8 -1
  271. sglang/srt/model_executor/cpu_graph_runner.py +640 -0
  272. sglang/srt/model_executor/cuda_graph_runner.py +52 -37
  273. sglang/srt/model_executor/forward_batch_info.py +74 -46
  274. sglang/srt/model_executor/model_runner.py +455 -176
  275. sglang/srt/model_executor/npu_graph_runner.py +12 -5
  276. sglang/srt/model_loader/__init__.py +10 -4
  277. sglang/srt/model_loader/loader.py +319 -10
  278. sglang/srt/model_loader/remote_instance_weight_loader_utils.py +69 -0
  279. sglang/srt/model_loader/weight_utils.py +161 -3
  280. sglang/srt/models/apertus.py +686 -0
  281. sglang/srt/models/bailing_moe.py +820 -217
  282. sglang/srt/models/bailing_moe_nextn.py +168 -0
  283. sglang/srt/models/deepseek_nextn.py +6 -1
  284. sglang/srt/models/deepseek_v2.py +607 -130
  285. sglang/srt/models/dots_ocr.py +173 -0
  286. sglang/srt/models/dots_vlm.py +174 -0
  287. sglang/srt/models/dots_vlm_vit.py +337 -0
  288. sglang/srt/models/ernie4.py +1 -1
  289. sglang/srt/models/falcon_h1.py +578 -0
  290. sglang/srt/models/gemma3_causal.py +0 -2
  291. sglang/srt/models/gemma3_mm.py +17 -1
  292. sglang/srt/models/gemma3n_mm.py +2 -2
  293. sglang/srt/models/glm4_moe.py +4 -4
  294. sglang/srt/models/glm4_moe_nextn.py +2 -2
  295. sglang/srt/models/glm4v.py +5 -3
  296. sglang/srt/models/glm4v_moe.py +4 -1
  297. sglang/srt/models/gpt_oss.py +8 -31
  298. sglang/srt/models/grok.py +5 -13
  299. sglang/srt/models/kimi_vl_moonvit.py +2 -2
  300. sglang/srt/models/llama.py +4 -0
  301. sglang/srt/models/llama4.py +9 -0
  302. sglang/srt/models/llama_eagle3.py +13 -0
  303. sglang/srt/models/longcat_flash.py +3 -3
  304. sglang/srt/models/longcat_flash_nextn.py +1 -1
  305. sglang/srt/models/mixtral.py +1 -3
  306. sglang/srt/models/mllama4.py +50 -4
  307. sglang/srt/models/nemotron_h.py +514 -0
  308. sglang/srt/models/opt.py +637 -0
  309. sglang/srt/models/qwen2_5_vl.py +29 -5
  310. sglang/srt/models/qwen2_audio.py +1 -1
  311. sglang/srt/models/qwen2_moe.py +120 -13
  312. sglang/srt/models/qwen2_vl.py +1 -1
  313. sglang/srt/models/qwen3.py +18 -3
  314. sglang/srt/models/qwen3_moe.py +32 -4
  315. sglang/srt/models/qwen3_next.py +1069 -0
  316. sglang/srt/models/qwen3_next_mtp.py +112 -0
  317. sglang/srt/models/qwen3_vl.py +787 -0
  318. sglang/srt/models/qwen3_vl_moe.py +471 -0
  319. sglang/srt/models/registry.py +15 -3
  320. sglang/srt/models/sarashina2_vision.py +269 -0
  321. sglang/srt/models/solar.py +505 -0
  322. sglang/srt/models/starcoder2.py +357 -0
  323. sglang/srt/models/step3_vl.py +1 -1
  324. sglang/srt/models/torch_native_llama.py +9 -2
  325. sglang/srt/models/utils.py +55 -0
  326. sglang/srt/multimodal/processors/base_processor.py +15 -7
  327. sglang/srt/multimodal/processors/dots_vlm.py +98 -0
  328. sglang/srt/multimodal/processors/glm4v.py +9 -9
  329. sglang/srt/multimodal/processors/internvl.py +153 -129
  330. sglang/srt/multimodal/processors/qwen_vl.py +23 -6
  331. sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
  332. sglang/srt/offloader.py +27 -3
  333. sglang/srt/parser/jinja_template_utils.py +6 -0
  334. sglang/srt/sampling/sampling_batch_info.py +49 -26
  335. sglang/srt/sampling/sampling_params.py +7 -0
  336. sglang/srt/server_args.py +1051 -285
  337. sglang/srt/server_args_config_parser.py +146 -0
  338. sglang/srt/single_batch_overlap.py +151 -0
  339. sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
  340. sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
  341. sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
  342. sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
  343. sglang/srt/speculative/cpp_ngram/param.h +125 -0
  344. sglang/srt/speculative/cpp_ngram/queue.h +71 -0
  345. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +7 -1
  346. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +13 -2
  347. sglang/srt/speculative/{eagle_utils.py → eagle_info.py} +207 -757
  348. sglang/srt/speculative/eagle_worker.py +98 -29
  349. sglang/srt/speculative/ngram_info.py +428 -0
  350. sglang/srt/speculative/ngram_worker.py +246 -0
  351. sglang/srt/speculative/spec_info.py +52 -0
  352. sglang/srt/speculative/spec_utils.py +605 -0
  353. sglang/srt/speculative/standalone_worker.py +109 -0
  354. sglang/srt/torch_memory_saver_adapter.py +5 -7
  355. sglang/srt/tracing/trace.py +578 -0
  356. sglang/srt/two_batch_overlap.py +9 -5
  357. sglang/srt/utils/__init__.py +2 -0
  358. sglang/srt/{utils.py → utils/common.py} +451 -77
  359. sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +55 -5
  360. sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
  361. sglang/srt/utils/rpd_utils.py +452 -0
  362. sglang/srt/utils/slow_rank_detector.py +71 -0
  363. sglang/srt/warmup.py +8 -4
  364. sglang/srt/weight_sync/utils.py +2 -2
  365. sglang/test/attention/test_trtllm_mla_backend.py +169 -5
  366. sglang/test/get_logits_ut.py +57 -0
  367. sglang/test/longbench_v2/__init__.py +1 -0
  368. sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
  369. sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
  370. sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
  371. sglang/test/run_eval.py +119 -11
  372. sglang/test/runners.py +5 -1
  373. sglang/test/simple_eval_common.py +5 -2
  374. sglang/test/simple_eval_longbench_v2.py +332 -0
  375. sglang/test/simple_eval_mmmu_vlm.py +441 -0
  376. sglang/test/test_block_fp8.py +2 -2
  377. sglang/test/test_cutlass_moe.py +24 -6
  378. sglang/test/test_cutlass_w4a8_moe.py +9 -19
  379. sglang/test/test_deterministic.py +313 -0
  380. sglang/test/test_deterministic_utils.py +81 -0
  381. sglang/test/test_disaggregation_utils.py +140 -0
  382. sglang/test/test_fp4_moe.py +370 -1
  383. sglang/test/test_programs.py +1 -1
  384. sglang/test/test_utils.py +407 -8
  385. sglang/utils.py +21 -1
  386. sglang/version.py +1 -1
  387. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/METADATA +69 -124
  388. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/RECORD +392 -251
  389. sglang/srt/disaggregation/launch_lb.py +0 -118
  390. sglang/srt/managers/tp_worker_overlap_thread.py +0 -296
  391. sglang/srt/mem_cache/lora_radix_cache.py +0 -421
  392. sglang/test/test_block_fp8_ep.py +0 -358
  393. sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
  394. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/WHEEL +0 -0
  395. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/licenses/LICENSE +0 -0
  396. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/top_level.txt +0 -0
@@ -30,7 +30,7 @@ from sglang.srt.layers.attention.flashinfer_backend import (
30
30
  from sglang.srt.layers.dp_attention import get_attention_tp_size
31
31
  from sglang.srt.managers.schedule_batch import global_server_args_dict
32
32
  from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
33
- from sglang.srt.speculative.eagle_utils import EagleDraftInput, EagleVerifyInput
33
+ from sglang.srt.speculative.spec_info import SpecInput
34
34
  from sglang.srt.utils import (
35
35
  is_flashinfer_available,
36
36
  is_sm100_supported,
@@ -40,7 +40,7 @@ from sglang.srt.utils import (
40
40
  if TYPE_CHECKING:
41
41
  from sglang.srt.layers.radix_attention import RadixAttention
42
42
  from sglang.srt.model_executor.model_runner import ModelRunner
43
- from sglang.srt.speculative.spec_info import SpecInfo
43
+ from sglang.srt.speculative.spec_info import SpecInput
44
44
 
45
45
  if is_flashinfer_available():
46
46
  from flashinfer import (
@@ -96,6 +96,7 @@ class FlashInferMhaChunkKVRunner:
96
96
  def update_wrapper(
97
97
  self,
98
98
  forward_batch: ForwardBatch,
99
+ disable_flashinfer_ragged: bool = False,
99
100
  ):
100
101
  assert forward_batch.num_prefix_chunks is not None
101
102
  num_prefix_chunks = forward_batch.num_prefix_chunks
@@ -128,16 +129,17 @@ class FlashInferMhaChunkKVRunner:
128
129
  causal=False,
129
130
  )
130
131
  # ragged prefill
131
- self.ragged_wrapper.begin_forward(
132
- qo_indptr=qo_indptr,
133
- kv_indptr=qo_indptr,
134
- num_qo_heads=self.num_local_heads,
135
- num_kv_heads=self.num_local_heads,
136
- head_dim_qk=self.qk_nope_head_dim + self.qk_rope_head_dim,
137
- head_dim_vo=self.v_head_dim,
138
- q_data_type=self.q_data_type,
139
- causal=True,
140
- )
132
+ if not disable_flashinfer_ragged:
133
+ self.ragged_wrapper.begin_forward(
134
+ qo_indptr=qo_indptr,
135
+ kv_indptr=qo_indptr,
136
+ num_qo_heads=self.num_local_heads,
137
+ num_kv_heads=self.num_local_heads,
138
+ head_dim_qk=self.qk_nope_head_dim + self.qk_rope_head_dim,
139
+ head_dim_vo=self.v_head_dim,
140
+ q_data_type=self.q_data_type,
141
+ causal=True,
142
+ )
141
143
 
142
144
  def forward(
143
145
  self,
@@ -359,7 +361,7 @@ class FlashInferMLAAttnBackend(AttentionBackend):
359
361
  seq_lens: torch.Tensor,
360
362
  encoder_lens: Optional[torch.Tensor],
361
363
  forward_mode: ForwardMode,
362
- spec_info: Optional[SpecInfo],
364
+ spec_info: Optional[SpecInput],
363
365
  ):
364
366
  if forward_mode.is_decode_or_idle():
365
367
  decode_wrapper = BatchMLAPagedAttentionWrapper(
@@ -439,7 +441,7 @@ class FlashInferMLAAttnBackend(AttentionBackend):
439
441
  seq_lens_sum: int,
440
442
  encoder_lens: Optional[torch.Tensor],
441
443
  forward_mode: ForwardMode,
442
- spec_info: Optional[SpecInfo],
444
+ spec_info: Optional[SpecInput],
443
445
  seq_lens_cpu: Optional[torch.Tensor],
444
446
  ):
445
447
  if forward_mode.is_decode_or_idle():
@@ -491,9 +493,11 @@ class FlashInferMLAAttnBackend(AttentionBackend):
491
493
  def get_cuda_graph_seq_len_fill_value(self):
492
494
  return 1
493
495
 
494
- def init_mha_chunk_metadata(self, forward_batch: ForwardBatch):
496
+ def init_mha_chunk_metadata(
497
+ self, forward_batch: ForwardBatch, disable_flashinfer_ragged: bool = False
498
+ ):
495
499
  """Init the metadata for a forward pass."""
496
- self.mha_chunk_kv_cache.update_wrapper(forward_batch)
500
+ self.mha_chunk_kv_cache.update_wrapper(forward_batch, disable_flashinfer_ragged)
497
501
 
498
502
  def forward_extend(
499
503
  self,
@@ -659,7 +663,7 @@ class FlashInferMLAIndicesUpdaterDecode:
659
663
  seq_lens_sum: int,
660
664
  decode_wrapper: BatchMLAPagedAttentionWrapper,
661
665
  init_metadata_replay: bool = False,
662
- spec_info: Optional[Union[EagleDraftInput, EagleVerifyInput]] = None,
666
+ spec_info: Optional[SpecInput] = None,
663
667
  **fast_decode_kwargs,
664
668
  ):
665
669
  decode_wrapper = decode_wrapper or self.decode_wrapper
@@ -684,7 +688,7 @@ class FlashInferMLAIndicesUpdaterDecode:
684
688
  q_indptr: torch.Tensor,
685
689
  kv_indptr: torch.Tensor,
686
690
  init_metadata_replay: bool = False,
687
- spec_info: Optional[Union[EagleDraftInput, EagleVerifyInput]] = None,
691
+ spec_info: Optional[SpecInput] = None,
688
692
  **fast_decode_kwargs,
689
693
  ):
690
694
  bs = len(req_pool_indices)
@@ -772,7 +776,7 @@ class FlashInferMLAIndicesUpdaterPrefill:
772
776
  prefix_lens: torch.Tensor,
773
777
  prefill_wrapper_paged: BatchMLAPagedAttentionWrapper,
774
778
  use_ragged: bool,
775
- spec_info: Optional[Union[EagleDraftInput, EagleVerifyInput]] = None,
779
+ spec_info: Optional[SpecInput] = None,
776
780
  ):
777
781
  if use_ragged:
778
782
  paged_kernel_lens = prefix_lens
@@ -807,7 +811,7 @@ class FlashInferMLAIndicesUpdaterPrefill:
807
811
  kv_indptr: torch.Tensor,
808
812
  qo_indptr: torch.Tensor,
809
813
  use_ragged: bool,
810
- spec_info: Optional[Union[EagleDraftInput, EagleVerifyInput]] = None,
814
+ spec_info: Optional[SpecInput] = None,
811
815
  ):
812
816
  bs = len(seq_lens)
813
817
  sm_scale = self.scaling
@@ -834,9 +838,7 @@ class FlashInferMLAIndicesUpdaterPrefill:
834
838
  qo_indptr = qo_indptr[: bs + 1]
835
839
  custom_mask = None
836
840
  else:
837
- assert isinstance(spec_info, EagleDraftInput) or isinstance(
838
- spec_info, EagleVerifyInput
839
- )
841
+ assert isinstance(spec_info, SpecInput)
840
842
  # TODO: Support topk > 1 with custom mask
841
843
  kv_indices, kv_indptr, qo_indptr, custom_mask = (
842
844
  spec_info.generate_attn_arg_prefill(
@@ -890,7 +892,7 @@ class FlashInferMLAMultiStepDraftBackend:
890
892
  topk: int,
891
893
  speculative_num_steps: int,
892
894
  ):
893
- from sglang.srt.speculative.eagle_utils import generate_draft_decode_kv_indices
895
+ from sglang.srt.speculative.spec_utils import generate_draft_decode_kv_indices
894
896
 
895
897
  if topk > 1:
896
898
  raise ValueError(
@@ -959,7 +961,7 @@ class FlashInferMLAMultiStepDraftBackend:
959
961
  )
960
962
 
961
963
  assert forward_batch.spec_info is not None
962
- assert isinstance(forward_batch.spec_info, EagleDraftInput)
964
+ assert forward_batch.spec_info.is_draft_input()
963
965
 
964
966
  for i in range(self.speculative_num_steps - 1):
965
967
  forward_batch.spec_info.kv_indptr = self.kv_indptr[i, : bs + 1]
@@ -979,8 +981,6 @@ class FlashInferMLAMultiStepDraftBackend:
979
981
  )
980
982
 
981
983
  def call_fn(i, forward_batch):
982
- assert forward_batch.spec_info is not None
983
- assert isinstance(forward_batch.spec_info, EagleDraftInput)
984
984
  forward_batch.spec_info.kv_indptr = (
985
985
  forward_batch.spec_info.kv_indptr.clone()
986
986
  )
@@ -1060,7 +1060,7 @@ def fast_mla_decode_plan(
1060
1060
 
1061
1061
  try:
1062
1062
  # Standard version with just the required arguments (no use_profiler)
1063
- self._cached_module.plan.default(
1063
+ self._cached_module.plan(
1064
1064
  self._float_workspace_buffer,
1065
1065
  self._int_workspace_buffer,
1066
1066
  self._pin_memory_int_workspace_buffer,
@@ -19,7 +19,7 @@ from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMo
19
19
  if TYPE_CHECKING:
20
20
  from sglang.srt.layers.radix_attention import RadixAttention
21
21
  from sglang.srt.model_executor.model_runner import ModelRunner
22
- from sglang.srt.speculative.spec_info import SpecInfo
22
+ from sglang.srt.speculative.spec_info import SpecInput
23
23
 
24
24
 
25
25
  # FlashMLA only supports pagesize=64
@@ -187,7 +187,7 @@ class FlashMLABackend(FlashInferMLAAttnBackend):
187
187
  seq_lens: torch.Tensor,
188
188
  encoder_lens: Optional[torch.Tensor],
189
189
  forward_mode: ForwardMode,
190
- spec_info: Optional[SpecInfo],
190
+ spec_info: Optional[SpecInput],
191
191
  ):
192
192
  if forward_mode.is_decode_or_idle():
193
193
  max_seqlen_pad = triton.cdiv(seq_lens.max().item(), PAGE_SIZE)
@@ -201,9 +201,10 @@ class FlashMLABackend(FlashInferMLAAttnBackend):
201
201
  self.req_to_token.stride(0),
202
202
  self.cuda_graph_kv_indices.stride(0),
203
203
  )
204
+ num_q_heads = self.num_q_heads * (self.num_draft_tokens or 1)
204
205
  mla_metadata, num_splits = get_mla_metadata(
205
206
  seq_lens.to(torch.int32),
206
- self.num_q_heads,
207
+ num_q_heads,
207
208
  1,
208
209
  )
209
210
  self.cuda_graph_mla_metadata.copy_(mla_metadata)
@@ -257,7 +258,7 @@ class FlashMLABackend(FlashInferMLAAttnBackend):
257
258
  seq_lens_sum: int,
258
259
  encoder_lens: Optional[torch.Tensor],
259
260
  forward_mode: ForwardMode,
260
- spec_info: Optional[SpecInfo],
261
+ spec_info: Optional[SpecInput],
261
262
  seq_lens_cpu: Optional[torch.Tensor],
262
263
  ):
263
264
 
@@ -275,9 +276,10 @@ class FlashMLABackend(FlashInferMLAAttnBackend):
275
276
  self.req_to_token.stride(0),
276
277
  self.cuda_graph_kv_indices.stride(0),
277
278
  )
279
+ num_q_heads = self.num_q_heads * (self.num_draft_tokens or 1)
278
280
  mla_metadata, num_splits = get_mla_metadata(
279
281
  seq_lens.to(torch.int32),
280
- self.num_q_heads,
282
+ num_q_heads,
281
283
  1,
282
284
  )
283
285
  self.cuda_graph_mla_metadata.copy_(mla_metadata)
@@ -3,10 +3,11 @@ from typing import Optional, Union
3
3
  import torch
4
4
 
5
5
  from sglang.srt.layers.attention.base_attn_backend import AttentionBackend
6
+ from sglang.srt.layers.attention.nsa.nsa_indexer import BaseIndexerMetadata
6
7
  from sglang.srt.layers.radix_attention import RadixAttention
7
8
  from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
8
9
  from sglang.srt.model_executor.model_runner import ModelRunner
9
- from sglang.srt.speculative.eagle_utils import EagleDraftInput, EagleVerifyInput
10
+ from sglang.srt.speculative.spec_info import SpecInput
10
11
 
11
12
 
12
13
  class HybridAttnBackend(AttentionBackend):
@@ -21,18 +22,46 @@ class HybridAttnBackend(AttentionBackend):
21
22
  self.model_runner = model_runner
22
23
  self.prefill_backend = prefill_backend
23
24
  self.decode_backend = decode_backend
25
+ self.data_type = model_runner.kv_cache_dtype
24
26
 
25
- def init_forward_metadata(self, forward_batch: ForwardBatch):
26
- if forward_batch.forward_mode.is_decode_or_idle():
27
- self.decode_backend.init_forward_metadata(forward_batch)
27
+ def _select_backend(self, forward_mode: ForwardMode) -> AttentionBackend:
28
+ """
29
+ Select the appropriate attention backend based on the forward mode.
30
+
31
+ Args:
32
+ forward_mode: The current forward mode indicating the operation type
33
+
34
+ Returns:
35
+ The selected attention backend (prefill or decode)
36
+
37
+ Note:
38
+ - decode_or_idle: Always uses decode backend
39
+ - target_verify or draft_extend: Uses decode backend if speculative_attention_mode is "decode", otherwise prefill backend
40
+ - prefill: Always uses prefill backend
41
+ """
42
+ if forward_mode.is_decode_or_idle():
43
+ return self.decode_backend
44
+ elif forward_mode.is_target_verify() or forward_mode.is_draft_extend():
45
+ return (
46
+ self.decode_backend
47
+ if self.model_runner.server_args.speculative_attention_mode == "decode"
48
+ else self.prefill_backend
49
+ )
28
50
  else:
29
- self.prefill_backend.init_forward_metadata(forward_batch)
51
+ return self.prefill_backend
52
+
53
+ def init_forward_metadata(self, forward_batch: ForwardBatch):
54
+ backend = self._select_backend(forward_batch.forward_mode)
55
+ backend.init_forward_metadata(forward_batch)
30
56
 
31
57
  def init_cuda_graph_state(self, max_bs: int, max_num_tokens: int):
32
58
  self.decode_backend.init_cuda_graph_state(max_bs, max_num_tokens)
33
- if self.model_runner.server_args.speculative_algorithm is not None:
34
- # When speculative decoding is enabled, we also need to initialize the
35
- # prefill backend's cuda graph state to support target_verify.
59
+ if (
60
+ self.model_runner.server_args.speculative_algorithm is not None
61
+ and self.model_runner.server_args.speculative_attention_mode == "prefill"
62
+ ):
63
+ # When speculative decoding is enabled, we need to initialize the backend
64
+ # that will be used for target_verify.
36
65
  self.prefill_backend.init_cuda_graph_state(max_bs, max_num_tokens)
37
66
 
38
67
  def init_forward_metadata_capture_cuda_graph(
@@ -43,28 +72,18 @@ class HybridAttnBackend(AttentionBackend):
43
72
  seq_lens: torch.Tensor,
44
73
  encoder_lens: Optional[torch.Tensor],
45
74
  forward_mode: ForwardMode,
46
- spec_info: Optional[Union[EagleDraftInput, EagleVerifyInput]],
75
+ spec_info: Optional[SpecInput],
47
76
  ):
48
- if forward_mode.is_decode_or_idle():
49
- self.decode_backend.init_forward_metadata_capture_cuda_graph(
50
- bs,
51
- num_tokens,
52
- req_pool_indices,
53
- seq_lens,
54
- encoder_lens,
55
- forward_mode,
56
- spec_info,
57
- )
58
- else:
59
- self.prefill_backend.init_forward_metadata_capture_cuda_graph(
60
- bs,
61
- num_tokens,
62
- req_pool_indices,
63
- seq_lens,
64
- encoder_lens,
65
- forward_mode,
66
- spec_info,
67
- )
77
+ backend = self._select_backend(forward_mode)
78
+ backend.init_forward_metadata_capture_cuda_graph(
79
+ bs,
80
+ num_tokens,
81
+ req_pool_indices,
82
+ seq_lens,
83
+ encoder_lens,
84
+ forward_mode,
85
+ spec_info,
86
+ )
68
87
 
69
88
  def init_forward_metadata_replay_cuda_graph(
70
89
  self,
@@ -74,31 +93,20 @@ class HybridAttnBackend(AttentionBackend):
74
93
  seq_lens_sum: int,
75
94
  encoder_lens: Optional[torch.Tensor],
76
95
  forward_mode: ForwardMode,
77
- spec_info: Optional[Union[EagleDraftInput, EagleVerifyInput]],
96
+ spec_info: Optional[SpecInput],
78
97
  seq_lens_cpu: Optional[torch.Tensor],
79
98
  ):
80
- if forward_mode.is_decode_or_idle():
81
- self.decode_backend.init_forward_metadata_replay_cuda_graph(
82
- bs,
83
- req_pool_indices,
84
- seq_lens,
85
- seq_lens_sum,
86
- encoder_lens,
87
- forward_mode,
88
- spec_info,
89
- seq_lens_cpu,
90
- )
91
- else:
92
- self.prefill_backend.init_forward_metadata_replay_cuda_graph(
93
- bs,
94
- req_pool_indices,
95
- seq_lens,
96
- seq_lens_sum,
97
- encoder_lens,
98
- forward_mode,
99
- spec_info,
100
- seq_lens_cpu,
101
- )
99
+ backend = self._select_backend(forward_mode)
100
+ backend.init_forward_metadata_replay_cuda_graph(
101
+ bs,
102
+ req_pool_indices,
103
+ seq_lens,
104
+ seq_lens_sum,
105
+ encoder_lens,
106
+ forward_mode,
107
+ spec_info,
108
+ seq_lens_cpu,
109
+ )
102
110
 
103
111
  def get_cuda_graph_seq_len_fill_value(self):
104
112
  return self.decode_backend.get_cuda_graph_seq_len_fill_value()
@@ -127,6 +135,13 @@ class HybridAttnBackend(AttentionBackend):
127
135
  save_kv_cache: bool = True,
128
136
  **kwargs,
129
137
  ):
130
- return self.prefill_backend.forward_extend(
138
+ backend = self._select_backend(forward_batch.forward_mode)
139
+ return backend.forward_extend(
131
140
  q, k, v, layer, forward_batch, save_kv_cache, **kwargs
132
141
  )
142
+
143
+ def get_indexer_metadata(
144
+ self, layer_id: int, forward_batch: ForwardBatch
145
+ ) -> Optional[BaseIndexerMetadata]:
146
+ backend = self._select_backend(forward_batch.forward_mode)
147
+ return backend.get_indexer_metadata(layer_id, forward_batch)