sglang 0.5.3rc2__py3-none-any.whl → 0.5.4.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (419)
  1. sglang/bench_one_batch.py +47 -28
  2. sglang/bench_one_batch_server.py +41 -25
  3. sglang/bench_serving.py +378 -160
  4. sglang/check_env.py +1 -1
  5. sglang/compile_deep_gemm.py +6 -2
  6. sglang/global_config.py +1 -25
  7. sglang/lang/api.py +6 -0
  8. sglang/lang/interpreter.py +1 -0
  9. sglang/lang/ir.py +13 -0
  10. sglang/launch_server.py +10 -15
  11. sglang/profiler.py +18 -1
  12. sglang/srt/_custom_ops.py +1 -1
  13. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +105 -10
  14. sglang/srt/checkpoint_engine/checkpoint_engine_worker.py +142 -0
  15. sglang/srt/compilation/backend.py +437 -0
  16. sglang/srt/compilation/compilation_config.py +20 -0
  17. sglang/srt/compilation/compilation_counter.py +47 -0
  18. sglang/srt/compilation/compile.py +210 -0
  19. sglang/srt/compilation/compiler_interface.py +503 -0
  20. sglang/srt/compilation/cuda_piecewise_backend.py +228 -0
  21. sglang/srt/compilation/fix_functionalization.py +134 -0
  22. sglang/srt/compilation/fx_utils.py +83 -0
  23. sglang/srt/compilation/inductor_pass.py +140 -0
  24. sglang/srt/compilation/pass_manager.py +66 -0
  25. sglang/srt/compilation/piecewise_context_manager.py +40 -0
  26. sglang/srt/compilation/weak_ref_tensor_jit.py +16 -0
  27. sglang/srt/configs/__init__.py +4 -0
  28. sglang/srt/configs/deepseek_ocr.py +262 -0
  29. sglang/srt/configs/deepseekvl2.py +194 -96
  30. sglang/srt/configs/dots_vlm.py +2 -7
  31. sglang/srt/configs/falcon_h1.py +13 -64
  32. sglang/srt/configs/load_config.py +25 -2
  33. sglang/srt/configs/mamba_utils.py +117 -0
  34. sglang/srt/configs/model_config.py +136 -25
  35. sglang/srt/configs/modelopt_config.py +30 -0
  36. sglang/srt/configs/nemotron_h.py +286 -0
  37. sglang/srt/configs/olmo3.py +105 -0
  38. sglang/srt/configs/points_v15_chat.py +29 -0
  39. sglang/srt/configs/qwen3_next.py +11 -47
  40. sglang/srt/configs/qwen3_omni.py +613 -0
  41. sglang/srt/configs/qwen3_vl.py +0 -10
  42. sglang/srt/connector/remote_instance.py +1 -1
  43. sglang/srt/constrained/base_grammar_backend.py +5 -1
  44. sglang/srt/constrained/llguidance_backend.py +5 -0
  45. sglang/srt/constrained/outlines_backend.py +1 -1
  46. sglang/srt/constrained/reasoner_grammar_backend.py +9 -6
  47. sglang/srt/constrained/utils.py +12 -0
  48. sglang/srt/constrained/xgrammar_backend.py +20 -11
  49. sglang/srt/disaggregation/ascend/transfer_engine.py +1 -1
  50. sglang/srt/disaggregation/base/conn.py +17 -4
  51. sglang/srt/disaggregation/common/conn.py +4 -2
  52. sglang/srt/disaggregation/decode.py +123 -31
  53. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +1 -1
  54. sglang/srt/disaggregation/fake/conn.py +11 -3
  55. sglang/srt/disaggregation/mooncake/conn.py +157 -19
  56. sglang/srt/disaggregation/nixl/conn.py +69 -24
  57. sglang/srt/disaggregation/prefill.py +96 -270
  58. sglang/srt/distributed/device_communicators/all_reduce_utils.py +4 -4
  59. sglang/srt/distributed/device_communicators/custom_all_reduce.py +6 -6
  60. sglang/srt/distributed/device_communicators/pymscclpp.py +2 -2
  61. sglang/srt/distributed/device_communicators/pynccl.py +24 -12
  62. sglang/srt/distributed/device_communicators/pynccl_allocator.py +2 -2
  63. sglang/srt/distributed/device_communicators/symm_mem.py +1 -1
  64. sglang/srt/distributed/naive_distributed.py +5 -4
  65. sglang/srt/distributed/parallel_state.py +63 -19
  66. sglang/srt/elastic_ep/elastic_ep.py +74 -0
  67. sglang/srt/entrypoints/context.py +3 -2
  68. sglang/srt/entrypoints/engine.py +83 -80
  69. sglang/srt/entrypoints/grpc_server.py +430 -234
  70. sglang/srt/entrypoints/harmony_utils.py +2 -2
  71. sglang/srt/entrypoints/http_server.py +195 -102
  72. sglang/srt/entrypoints/http_server_engine.py +1 -7
  73. sglang/srt/entrypoints/openai/protocol.py +225 -37
  74. sglang/srt/entrypoints/openai/serving_base.py +49 -2
  75. sglang/srt/entrypoints/openai/serving_chat.py +29 -74
  76. sglang/srt/entrypoints/openai/serving_classify.py +204 -0
  77. sglang/srt/entrypoints/openai/serving_completions.py +15 -1
  78. sglang/srt/entrypoints/openai/serving_responses.py +5 -2
  79. sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
  80. sglang/srt/environ.py +58 -6
  81. sglang/srt/eplb/eplb_algorithms/__init__.py +18 -1
  82. sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -2
  83. sglang/srt/eplb/eplb_algorithms/elasticity_aware.py +87 -0
  84. sglang/srt/eplb/expert_distribution.py +33 -4
  85. sglang/srt/eplb/expert_location_dispatch.py +2 -2
  86. sglang/srt/eplb/expert_location_updater.py +2 -2
  87. sglang/srt/function_call/base_format_detector.py +17 -18
  88. sglang/srt/function_call/function_call_parser.py +20 -14
  89. sglang/srt/function_call/glm4_moe_detector.py +1 -5
  90. sglang/srt/function_call/gpt_oss_detector.py +1 -1
  91. sglang/srt/function_call/json_array_parser.py +0 -2
  92. sglang/srt/function_call/minimax_m2.py +367 -0
  93. sglang/srt/function_call/utils.py +2 -2
  94. sglang/srt/grpc/compile_proto.py +3 -3
  95. sglang/srt/{entrypoints → grpc}/grpc_request_manager.py +112 -52
  96. sglang/srt/grpc/health_servicer.py +189 -0
  97. sglang/srt/grpc/scheduler_launcher.py +181 -0
  98. sglang/srt/grpc/sglang_scheduler_pb2.py +78 -70
  99. sglang/srt/grpc/sglang_scheduler_pb2.pyi +66 -10
  100. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +89 -1
  101. sglang/srt/layers/activation.py +10 -1
  102. sglang/srt/layers/attention/aiter_backend.py +3 -3
  103. sglang/srt/layers/attention/ascend_backend.py +17 -1
  104. sglang/srt/layers/attention/attention_registry.py +43 -23
  105. sglang/srt/layers/attention/base_attn_backend.py +20 -1
  106. sglang/srt/layers/attention/double_sparsity_backend.py +2 -2
  107. sglang/srt/layers/attention/fla/chunk.py +0 -1
  108. sglang/srt/layers/attention/fla/chunk_o.py +1 -1
  109. sglang/srt/layers/attention/fla/index.py +0 -2
  110. sglang/srt/layers/attention/fla/layernorm_gated.py +50 -32
  111. sglang/srt/layers/attention/fla/utils.py +0 -3
  112. sglang/srt/layers/attention/fla/wy_fast.py +0 -2
  113. sglang/srt/layers/attention/flashattention_backend.py +24 -10
  114. sglang/srt/layers/attention/flashinfer_backend.py +258 -22
  115. sglang/srt/layers/attention/flashinfer_mla_backend.py +38 -28
  116. sglang/srt/layers/attention/flashmla_backend.py +2 -2
  117. sglang/srt/layers/attention/hybrid_attn_backend.py +1 -1
  118. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +165 -62
  119. sglang/srt/layers/attention/intel_amx_backend.py +1 -1
  120. sglang/srt/layers/attention/mamba/causal_conv1d.py +1 -1
  121. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +9 -5
  122. sglang/srt/layers/attention/mamba/mamba.py +189 -241
  123. sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
  124. sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
  125. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +0 -50
  126. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +0 -60
  127. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +0 -111
  128. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +0 -1
  129. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +0 -11
  130. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +1 -1
  131. sglang/srt/layers/attention/nsa/nsa_indexer.py +40 -83
  132. sglang/srt/layers/attention/nsa/triton_kernel.py +136 -0
  133. sglang/srt/layers/attention/nsa/utils.py +0 -1
  134. sglang/srt/layers/attention/nsa_backend.py +404 -90
  135. sglang/srt/layers/attention/triton_backend.py +208 -34
  136. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +2 -2
  137. sglang/srt/layers/attention/triton_ops/extend_attention.py +539 -44
  138. sglang/srt/layers/attention/trtllm_mha_backend.py +2 -2
  139. sglang/srt/layers/attention/trtllm_mla_backend.py +362 -43
  140. sglang/srt/layers/attention/utils.py +89 -7
  141. sglang/srt/layers/attention/vision.py +3 -3
  142. sglang/srt/layers/attention/xpu_backend.py +1028 -0
  143. sglang/srt/layers/communicator.py +12 -7
  144. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/compile_utils.py +5 -9
  145. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/configurer.py +4 -3
  146. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/entrypoint.py +3 -3
  147. sglang/srt/layers/dp_attention.py +17 -0
  148. sglang/srt/layers/layernorm.py +64 -19
  149. sglang/srt/layers/linear.py +9 -1
  150. sglang/srt/layers/logits_processor.py +152 -17
  151. sglang/srt/layers/modelopt_utils.py +11 -0
  152. sglang/srt/layers/moe/cutlass_moe.py +0 -2
  153. sglang/srt/layers/moe/cutlass_w4a8_moe.py +351 -21
  154. sglang/srt/layers/moe/ep_moe/kernels.py +229 -457
  155. sglang/srt/layers/moe/ep_moe/layer.py +154 -625
  156. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +1 -1
  157. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  158. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json +146 -0
  159. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +11 -3
  160. sglang/srt/layers/moe/fused_moe_triton/layer.py +79 -73
  161. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +25 -46
  162. sglang/srt/layers/moe/moe_runner/deep_gemm.py +569 -0
  163. sglang/srt/layers/moe/moe_runner/runner.py +6 -0
  164. sglang/srt/layers/moe/moe_runner/triton.py +3 -1
  165. sglang/srt/layers/moe/moe_runner/triton_kernels.py +194 -0
  166. sglang/srt/layers/moe/rocm_moe_utils.py +0 -1
  167. sglang/srt/layers/moe/router.py +51 -15
  168. sglang/srt/layers/moe/token_dispatcher/__init__.py +14 -4
  169. sglang/srt/layers/moe/token_dispatcher/base.py +12 -6
  170. sglang/srt/layers/moe/token_dispatcher/deepep.py +127 -110
  171. sglang/srt/layers/moe/token_dispatcher/mooncake.py +386 -0
  172. sglang/srt/layers/moe/token_dispatcher/standard.py +46 -0
  173. sglang/srt/layers/moe/topk.py +7 -6
  174. sglang/srt/layers/moe/utils.py +20 -5
  175. sglang/srt/layers/quantization/__init__.py +5 -58
  176. sglang/srt/layers/quantization/awq.py +183 -9
  177. sglang/srt/layers/quantization/awq_triton.py +29 -0
  178. sglang/srt/layers/quantization/base_config.py +27 -1
  179. sglang/srt/layers/quantization/compressed_tensors/__init__.py +7 -0
  180. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +20 -49
  181. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +421 -70
  182. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +3 -0
  183. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +4 -22
  184. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +339 -0
  185. sglang/srt/layers/quantization/fp8.py +152 -81
  186. sglang/srt/layers/quantization/fp8_kernel.py +55 -10
  187. sglang/srt/layers/quantization/fp8_utils.py +42 -14
  188. sglang/srt/layers/quantization/fpgemm_fp8.py +2 -3
  189. sglang/srt/layers/quantization/gguf.py +566 -0
  190. sglang/srt/layers/quantization/gptq.py +0 -1
  191. sglang/srt/layers/quantization/int8_kernel.py +18 -2
  192. sglang/srt/layers/quantization/marlin_utils.py +12 -0
  193. sglang/srt/layers/quantization/modelopt_quant.py +125 -100
  194. sglang/srt/layers/quantization/mxfp4.py +35 -68
  195. sglang/srt/layers/quantization/petit.py +1 -1
  196. sglang/srt/layers/quantization/quark/quark.py +3 -1
  197. sglang/srt/layers/quantization/quark/quark_moe.py +3 -3
  198. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +0 -7
  199. sglang/srt/layers/quantization/unquant.py +23 -48
  200. sglang/srt/layers/quantization/utils.py +0 -1
  201. sglang/srt/layers/quantization/w4afp8.py +87 -20
  202. sglang/srt/layers/quantization/w8a8_int8.py +30 -24
  203. sglang/srt/layers/radix_attention.py +62 -9
  204. sglang/srt/layers/rotary_embedding.py +686 -17
  205. sglang/srt/layers/sampler.py +47 -16
  206. sglang/srt/layers/sparse_pooler.py +98 -0
  207. sglang/srt/layers/utils.py +0 -1
  208. sglang/srt/layers/vocab_parallel_embedding.py +4 -1
  209. sglang/srt/lora/backend/triton_backend.py +0 -1
  210. sglang/srt/lora/eviction_policy.py +139 -0
  211. sglang/srt/lora/lora_manager.py +24 -9
  212. sglang/srt/lora/lora_registry.py +1 -1
  213. sglang/srt/lora/mem_pool.py +40 -16
  214. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +1 -1
  215. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +4 -2
  216. sglang/srt/managers/cache_controller.py +48 -17
  217. sglang/srt/managers/data_parallel_controller.py +146 -42
  218. sglang/srt/managers/detokenizer_manager.py +40 -13
  219. sglang/srt/managers/io_struct.py +69 -16
  220. sglang/srt/managers/mm_utils.py +20 -18
  221. sglang/srt/managers/multi_tokenizer_mixin.py +83 -82
  222. sglang/srt/managers/overlap_utils.py +96 -19
  223. sglang/srt/managers/schedule_batch.py +241 -511
  224. sglang/srt/managers/schedule_policy.py +15 -2
  225. sglang/srt/managers/scheduler.py +420 -514
  226. sglang/srt/managers/scheduler_metrics_mixin.py +73 -18
  227. sglang/srt/managers/scheduler_output_processor_mixin.py +317 -111
  228. sglang/srt/managers/scheduler_pp_mixin.py +341 -0
  229. sglang/srt/managers/scheduler_profiler_mixin.py +60 -14
  230. sglang/srt/managers/scheduler_runtime_checker_mixin.py +217 -0
  231. sglang/srt/managers/scheduler_update_weights_mixin.py +33 -14
  232. sglang/srt/managers/tokenizer_communicator_mixin.py +71 -55
  233. sglang/srt/managers/tokenizer_manager.py +375 -95
  234. sglang/srt/managers/tp_worker.py +212 -161
  235. sglang/srt/managers/utils.py +78 -2
  236. sglang/srt/mem_cache/allocator.py +7 -2
  237. sglang/srt/mem_cache/allocator_ascend.py +2 -2
  238. sglang/srt/mem_cache/base_prefix_cache.py +2 -2
  239. sglang/srt/mem_cache/chunk_cache.py +13 -2
  240. sglang/srt/mem_cache/common.py +480 -0
  241. sglang/srt/mem_cache/evict_policy.py +16 -1
  242. sglang/srt/mem_cache/hicache_storage.py +11 -2
  243. sglang/srt/mem_cache/hiradix_cache.py +16 -3
  244. sglang/srt/mem_cache/mamba_radix_cache.py +993 -0
  245. sglang/srt/mem_cache/memory_pool.py +517 -219
  246. sglang/srt/mem_cache/memory_pool_host.py +0 -1
  247. sglang/srt/mem_cache/multimodal_cache.py +0 -1
  248. sglang/srt/mem_cache/radix_cache.py +53 -19
  249. sglang/srt/mem_cache/radix_cache_cpp.py +19 -14
  250. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +8 -2
  251. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +1 -13
  252. sglang/srt/mem_cache/storage/backend_factory.py +2 -2
  253. sglang/srt/mem_cache/storage/eic/eic_storage.py +5 -6
  254. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +0 -1
  255. sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +3 -2
  256. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +9 -3
  257. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +5 -3
  258. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +101 -17
  259. sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +38 -9
  260. sglang/srt/mem_cache/storage/nixl/nixl_utils.py +1 -1
  261. sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +17 -2
  262. sglang/srt/mem_cache/swa_radix_cache.py +92 -26
  263. sglang/srt/metrics/collector.py +31 -0
  264. sglang/srt/metrics/func_timer.py +1 -1
  265. sglang/srt/model_executor/cuda_graph_runner.py +43 -5
  266. sglang/srt/model_executor/forward_batch_info.py +71 -25
  267. sglang/srt/model_executor/model_runner.py +362 -270
  268. sglang/srt/model_executor/npu_graph_runner.py +2 -3
  269. sglang/srt/model_executor/piecewise_cuda_graph_runner.py +549 -0
  270. sglang/srt/model_loader/__init__.py +1 -1
  271. sglang/srt/model_loader/loader.py +424 -27
  272. sglang/srt/model_loader/utils.py +0 -1
  273. sglang/srt/model_loader/weight_utils.py +47 -28
  274. sglang/srt/models/apertus.py +2 -3
  275. sglang/srt/models/arcee.py +2 -2
  276. sglang/srt/models/bailing_moe.py +13 -52
  277. sglang/srt/models/bailing_moe_nextn.py +3 -4
  278. sglang/srt/models/bert.py +1 -1
  279. sglang/srt/models/deepseek_nextn.py +19 -3
  280. sglang/srt/models/deepseek_ocr.py +1516 -0
  281. sglang/srt/models/deepseek_v2.py +418 -140
  282. sglang/srt/models/dots_ocr.py +0 -2
  283. sglang/srt/models/dots_vlm.py +0 -1
  284. sglang/srt/models/dots_vlm_vit.py +1 -1
  285. sglang/srt/models/falcon_h1.py +13 -19
  286. sglang/srt/models/gemma3_mm.py +16 -0
  287. sglang/srt/models/gemma3n_mm.py +1 -2
  288. sglang/srt/models/glm4_moe.py +327 -382
  289. sglang/srt/models/glm4_moe_nextn.py +6 -16
  290. sglang/srt/models/glm4v.py +2 -1
  291. sglang/srt/models/glm4v_moe.py +32 -199
  292. sglang/srt/models/gpt_oss.py +5 -5
  293. sglang/srt/models/grok.py +10 -23
  294. sglang/srt/models/hunyuan.py +2 -7
  295. sglang/srt/models/interns1.py +0 -1
  296. sglang/srt/models/kimi_vl.py +1 -7
  297. sglang/srt/models/kimi_vl_moonvit.py +3 -1
  298. sglang/srt/models/llama.py +2 -2
  299. sglang/srt/models/llama_eagle3.py +1 -1
  300. sglang/srt/models/longcat_flash.py +5 -22
  301. sglang/srt/models/longcat_flash_nextn.py +3 -14
  302. sglang/srt/models/mimo.py +2 -13
  303. sglang/srt/models/mimo_mtp.py +1 -2
  304. sglang/srt/models/minicpmo.py +7 -5
  305. sglang/srt/models/minimax_m2.py +922 -0
  306. sglang/srt/models/mixtral.py +1 -4
  307. sglang/srt/models/mllama.py +1 -1
  308. sglang/srt/models/mllama4.py +13 -3
  309. sglang/srt/models/nemotron_h.py +511 -0
  310. sglang/srt/models/nvila.py +355 -0
  311. sglang/srt/models/nvila_lite.py +184 -0
  312. sglang/srt/models/olmo2.py +31 -4
  313. sglang/srt/models/opt.py +5 -5
  314. sglang/srt/models/phi.py +1 -1
  315. sglang/srt/models/phi4mm.py +1 -1
  316. sglang/srt/models/phimoe.py +0 -1
  317. sglang/srt/models/pixtral.py +0 -3
  318. sglang/srt/models/points_v15_chat.py +186 -0
  319. sglang/srt/models/qwen.py +0 -1
  320. sglang/srt/models/qwen2.py +22 -1
  321. sglang/srt/models/qwen2_5_vl.py +3 -3
  322. sglang/srt/models/qwen2_audio.py +2 -15
  323. sglang/srt/models/qwen2_moe.py +15 -12
  324. sglang/srt/models/qwen2_vl.py +5 -2
  325. sglang/srt/models/qwen3.py +34 -4
  326. sglang/srt/models/qwen3_moe.py +19 -37
  327. sglang/srt/models/qwen3_next.py +7 -12
  328. sglang/srt/models/qwen3_next_mtp.py +3 -4
  329. sglang/srt/models/qwen3_omni_moe.py +661 -0
  330. sglang/srt/models/qwen3_vl.py +37 -33
  331. sglang/srt/models/qwen3_vl_moe.py +57 -185
  332. sglang/srt/models/roberta.py +55 -3
  333. sglang/srt/models/sarashina2_vision.py +0 -1
  334. sglang/srt/models/step3_vl.py +3 -5
  335. sglang/srt/models/utils.py +11 -1
  336. sglang/srt/multimodal/processors/base_processor.py +7 -2
  337. sglang/srt/multimodal/processors/deepseek_ocr.py +37 -0
  338. sglang/srt/multimodal/processors/deepseek_vl_v2.py +0 -3
  339. sglang/srt/multimodal/processors/dots_vlm.py +0 -1
  340. sglang/srt/multimodal/processors/glm4v.py +2 -6
  341. sglang/srt/multimodal/processors/internvl.py +0 -2
  342. sglang/srt/multimodal/processors/janus_pro.py +0 -1
  343. sglang/srt/multimodal/processors/mllama4.py +0 -8
  344. sglang/srt/multimodal/processors/{vila.py → nvila.py} +32 -24
  345. sglang/srt/multimodal/processors/phi4mm.py +0 -1
  346. sglang/srt/multimodal/processors/points_v15_chat.py +52 -0
  347. sglang/srt/multimodal/processors/qwen_vl.py +75 -16
  348. sglang/srt/multimodal/processors/step3_vl.py +1 -1
  349. sglang/srt/parser/conversation.py +41 -0
  350. sglang/srt/parser/reasoning_parser.py +28 -2
  351. sglang/srt/sampling/custom_logit_processor.py +77 -2
  352. sglang/srt/sampling/sampling_batch_info.py +17 -22
  353. sglang/srt/sampling/sampling_params.py +70 -2
  354. sglang/srt/server_args.py +846 -163
  355. sglang/srt/server_args_config_parser.py +1 -1
  356. sglang/srt/single_batch_overlap.py +36 -31
  357. sglang/srt/speculative/base_spec_worker.py +34 -0
  358. sglang/srt/speculative/draft_utils.py +226 -0
  359. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +24 -7
  360. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +23 -2
  361. sglang/srt/speculative/eagle_info.py +57 -18
  362. sglang/srt/speculative/eagle_info_v2.py +458 -0
  363. sglang/srt/speculative/eagle_utils.py +138 -0
  364. sglang/srt/speculative/eagle_worker.py +83 -280
  365. sglang/srt/speculative/eagle_worker_v2.py +702 -0
  366. sglang/srt/speculative/{ngram_utils.py → ngram_info.py} +14 -9
  367. sglang/srt/speculative/ngram_worker.py +12 -11
  368. sglang/srt/speculative/spec_info.py +2 -0
  369. sglang/srt/speculative/spec_utils.py +38 -3
  370. sglang/srt/speculative/standalone_worker.py +4 -14
  371. sglang/srt/tokenizer/tiktoken_tokenizer.py +2 -2
  372. sglang/srt/two_batch_overlap.py +28 -14
  373. sglang/srt/utils/__init__.py +1 -1
  374. sglang/srt/{bench_utils.py → utils/bench_utils.py} +4 -2
  375. sglang/srt/utils/common.py +272 -82
  376. sglang/srt/utils/hf_transformers_utils.py +44 -17
  377. sglang/srt/{host_shared_memory.py → utils/host_shared_memory.py} +0 -1
  378. sglang/srt/{offloader.py → utils/offloader.py} +4 -4
  379. sglang/srt/utils/profile_merger.py +199 -0
  380. sglang/test/attention/test_flashattn_backend.py +1 -1
  381. sglang/test/attention/test_flashattn_mla_backend.py +0 -1
  382. sglang/test/attention/test_prefix_chunk_info.py +0 -2
  383. sglang/test/attention/test_trtllm_mla_backend.py +221 -53
  384. sglang/test/few_shot_gsm8k_engine.py +2 -4
  385. sglang/test/kit_matched_stop.py +157 -0
  386. sglang/test/longbench_v2/__init__.py +1 -0
  387. sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
  388. sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
  389. sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
  390. sglang/test/run_eval.py +41 -0
  391. sglang/test/runners.py +2 -0
  392. sglang/test/send_one.py +42 -7
  393. sglang/test/simple_eval_common.py +3 -0
  394. sglang/test/simple_eval_gpqa.py +0 -1
  395. sglang/test/simple_eval_humaneval.py +0 -3
  396. sglang/test/simple_eval_longbench_v2.py +344 -0
  397. sglang/test/test_block_fp8.py +1 -2
  398. sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -1
  399. sglang/test/test_cutlass_moe.py +1 -2
  400. sglang/test/test_cutlass_w4a8_moe.py +10 -20
  401. sglang/test/test_deterministic.py +463 -107
  402. sglang/test/test_deterministic_utils.py +74 -0
  403. sglang/test/test_disaggregation_utils.py +81 -0
  404. sglang/test/test_marlin_moe.py +0 -1
  405. sglang/test/test_utils.py +85 -20
  406. sglang/version.py +1 -1
  407. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/METADATA +48 -35
  408. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/RECORD +414 -350
  409. sglang/srt/layers/attention/mamba/mamba_utils.py +0 -81
  410. sglang/srt/managers/tp_worker_overlap_thread.py +0 -311
  411. sglang/srt/models/vila.py +0 -306
  412. sglang/srt/speculative/build_eagle_tree.py +0 -427
  413. sglang/test/test_block_fp8_ep.py +0 -358
  414. /sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/__init__.py +0 -0
  415. /sglang/srt/{aio_rwlock.py → utils/aio_rwlock.py} +0 -0
  416. /sglang/srt/{torch_memory_saver_adapter.py → utils/torch_memory_saver_adapter.py} +0 -0
  417. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/WHEEL +0 -0
  418. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/licenses/LICENSE +0 -0
  419. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/top_level.txt +0 -0
@@ -7,6 +7,8 @@ from typing import Optional, Tuple
7
7
  import torch
8
8
  import triton
9
9
 
10
+ from sglang.srt.server_args import get_global_server_args
11
+
10
12
  logger = logging.getLogger(__name__)
11
13
 
12
14
  from dataclasses import dataclass
@@ -16,10 +18,11 @@ import torch.nn.functional as F
16
18
  from sglang.srt.layers.attention.utils import create_flashinfer_kv_indices_triton
17
19
  from sglang.srt.layers.logits_processor import LogitsProcessorOutput
18
20
  from sglang.srt.layers.sampler import apply_custom_logit_processor
19
- from sglang.srt.managers.schedule_batch import (
20
- ScheduleBatch,
21
+ from sglang.srt.managers.schedule_batch import ScheduleBatch
22
+ from sglang.srt.mem_cache.common import (
23
+ alloc_paged_token_slots_extend,
24
+ alloc_token_slots,
21
25
  get_last_loc,
22
- global_server_args_dict,
23
26
  )
24
27
  from sglang.srt.sampling.sampling_batch_info import SamplingBatchInfo
25
28
  from sglang.srt.speculative.spec_info import SpecInput, SpecInputType
@@ -74,7 +77,10 @@ class NgramVerifyInput(SpecInput):
74
77
  batch.input_ids = self.draft_token
75
78
 
76
79
  if page_size == 1:
77
- batch.out_cache_loc = batch.alloc_token_slots(len(batch.input_ids))
80
+ batch.out_cache_loc = alloc_token_slots(
81
+ batch.tree_cache,
82
+ len(batch.input_ids),
83
+ )
78
84
  end_offset = batch.seq_lens + self.draft_token_num
79
85
  else:
80
86
  # TODO(lsyin): add prefix lens cpu here to support page size > 1
@@ -87,7 +93,8 @@ class NgramVerifyInput(SpecInput):
87
93
  batch.req_pool_indices,
88
94
  prefix_lens,
89
95
  )
90
- batch.out_cache_loc = batch.alloc_paged_token_slots_extend(
96
+ batch.out_cache_loc = alloc_paged_token_slots_extend(
97
+ batch.tree_cache,
91
98
  prefix_lens,
92
99
  prefix_lens_cpu,
93
100
  end_offset,
@@ -345,10 +352,8 @@ class NgramVerifyInput(SpecInput):
345
352
  uniform_samples_for_final_sampling=coins_for_final_sampling,
346
353
  target_probs=target_probs,
347
354
  draft_probs=draft_probs,
348
- threshold_single=global_server_args_dict[
349
- "speculative_accept_threshold_single"
350
- ],
351
- threshold_acc=global_server_args_dict["speculative_accept_threshold_acc"],
355
+ threshold_single=get_global_server_args().speculative_accept_threshold_single,
356
+ threshold_acc=get_global_server_args().speculative_accept_threshold_acc,
352
357
  deterministic=True,
353
358
  )
354
359
 
@@ -6,11 +6,12 @@ import torch
6
6
  from sgl_kernel.speculative import reconstruct_indices_from_tree_mask
7
7
 
8
8
  from sglang.srt.managers.schedule_batch import ScheduleBatch
9
+ from sglang.srt.managers.scheduler import GenerationBatchResult
9
10
  from sglang.srt.managers.tp_worker import TpModelWorker
10
- from sglang.srt.model_executor.forward_batch_info import ForwardBatchOutput, ForwardMode
11
+ from sglang.srt.model_executor.forward_batch_info import ForwardMode
11
12
  from sglang.srt.server_args import ServerArgs
12
13
  from sglang.srt.speculative.cpp_ngram.ngram_cache import NgramCache
13
- from sglang.srt.speculative.ngram_utils import NgramVerifyInput
14
+ from sglang.srt.speculative.ngram_info import NgramVerifyInput
14
15
  from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
15
16
 
16
17
  logger = logging.getLogger(__name__)
@@ -207,18 +208,18 @@ class NGRAMWorker:
207
208
  batch_tokens.append(put_ids)
208
209
  self.ngram_cache.batch_put(batch_tokens)
209
210
 
210
- def forward_batch_generation(self, batch: ScheduleBatch) -> ForwardBatchOutput:
211
+ def forward_batch_generation(self, batch: ScheduleBatch) -> GenerationBatchResult:
211
212
  self._prepare_for_speculative_decoding(batch)
212
213
  model_worker_batch = batch.get_model_worker_batch()
213
214
  num_accepted_tokens = 0
214
215
 
215
216
  if model_worker_batch.forward_mode.is_target_verify():
216
- forward_batch_output = self.target_worker.forward_batch_generation(
217
+ batch_result = self.target_worker.forward_batch_generation(
217
218
  model_worker_batch, is_verify=True
218
219
  )
219
220
  logits_output, can_run_cuda_graph = (
220
- forward_batch_output.logits_output,
221
- forward_batch_output.can_run_cuda_graph,
221
+ batch_result.logits_output,
222
+ batch_result.can_run_cuda_graph,
222
223
  )
223
224
  verify_input = model_worker_batch.spec_info
224
225
  logits_output, next_token_ids, num_accepted_tokens = verify_input.verify(
@@ -228,16 +229,16 @@ class NGRAMWorker:
228
229
  batch.forward_mode = ForwardMode.DECODE
229
230
 
230
231
  else:
231
- forward_batch_output = self.target_worker.forward_batch_generation(
232
+ batch_result = self.target_worker.forward_batch_generation(
232
233
  model_worker_batch
233
234
  )
234
235
  logits_output, next_token_ids, can_run_cuda_graph = (
235
- forward_batch_output.logits_output,
236
- forward_batch_output.next_token_ids,
237
- forward_batch_output.can_run_cuda_graph,
236
+ batch_result.logits_output,
237
+ batch_result.next_token_ids,
238
+ batch_result.can_run_cuda_graph,
238
239
  )
239
240
 
240
- return ForwardBatchOutput(
241
+ return GenerationBatchResult(
241
242
  logits_output=logits_output,
242
243
  next_token_ids=next_token_ids,
243
244
  num_accepted_tokens=num_accepted_tokens,
@@ -1,5 +1,6 @@
1
1
  from abc import ABC, abstractmethod
2
2
  from enum import IntEnum, auto
3
+ from functools import lru_cache
3
4
  from typing import List, Tuple
4
5
 
5
6
  from sglang.srt.managers.schedule_batch import ModelWorkerBatch
@@ -27,6 +28,7 @@ class SpeculativeAlgorithm(IntEnum):
27
28
  def is_ngram(self):
28
29
  return self == SpeculativeAlgorithm.NGRAM
29
30
 
31
+ @lru_cache(maxsize=None)
30
32
  @staticmethod
31
33
  def from_string(name: str):
32
34
  name_map = {
@@ -3,24 +3,33 @@ from __future__ import annotations
3
3
  import logging
4
4
  import os
5
5
  import time
6
+ from contextlib import contextmanager
6
7
  from typing import TYPE_CHECKING, List
7
8
 
8
9
  import torch
9
10
  import triton
10
11
  import triton.language as tl
12
+ from huggingface_hub import snapshot_download
11
13
 
12
14
  from sglang.srt.constrained.base_grammar_backend import BaseGrammarObject
15
+ from sglang.srt.distributed.parallel_state import (
16
+ GroupCoordinator,
17
+ patch_tensor_parallel_group,
18
+ )
13
19
  from sglang.srt.environ import envs
20
+ from sglang.srt.layers.logits_processor import LogitsProcessorOutput
14
21
  from sglang.srt.managers.schedule_batch import Req
15
22
  from sglang.srt.utils import is_cuda, is_hip
16
23
 
24
+ if TYPE_CHECKING:
25
+ from sglang.srt.speculative.eagle_info import EagleVerifyInput
26
+
27
+
17
28
  if is_cuda():
18
29
  from sgl_kernel import fast_topk
19
30
  elif is_hip():
20
31
  from sgl_kernel import fast_topk
21
32
 
22
- if TYPE_CHECKING:
23
- from sglang.srt.speculative.eagle_info import EagleVerifyInput
24
33
 
25
34
  logger = logging.getLogger(__name__)
26
35
 
@@ -436,7 +445,7 @@ def select_top_k_tokens(
436
445
  return input_ids, hidden_states, scores, tree_info
437
446
 
438
447
 
439
- def _generate_simulated_accept_index(
448
+ def generate_simulated_accept_index(
440
449
  accept_index,
441
450
  predict,
442
451
  accept_length,
@@ -604,3 +613,29 @@ def generate_token_bitmask(
604
613
 
605
614
  verify_input.grammar = grammar
606
615
  return allocate_token_bitmask
616
+
617
+
618
+ def load_token_map(token_map_path: str) -> List[int]:
619
+ if not os.path.exists(token_map_path):
620
+ cache_dir = snapshot_download(
621
+ os.path.dirname(token_map_path),
622
+ ignore_patterns=["*.bin", "*.safetensors"],
623
+ )
624
+ token_map_path = os.path.join(cache_dir, os.path.basename(token_map_path))
625
+ hot_token_id = torch.load(token_map_path, weights_only=True)
626
+ return torch.tensor(hot_token_id, dtype=torch.int64)
627
+
628
+
629
+ @contextmanager
630
+ def draft_tp_context(tp_group: GroupCoordinator):
631
+ # Draft model doesn't use dp and has its own tp group.
632
+ # We disable mscclpp now because it doesn't support 2 comm groups.
633
+ with patch_tensor_parallel_group(tp_group):
634
+ yield
635
+
636
+
637
+ def detect_nan(logits_output: LogitsProcessorOutput):
638
+ logits = logits_output.next_token_logits
639
+ if torch.any(torch.isnan(logits)):
640
+ logger.error("Detected errors during sampling! NaN in the logits.")
641
+ raise ValueError("Detected errors during sampling! NaN in the logits.")
@@ -1,29 +1,20 @@
1
1
  import logging
2
- from contextlib import contextmanager
3
2
  from typing import Optional
4
3
 
5
4
  import torch
6
5
 
7
- from sglang.srt.distributed import GroupCoordinator, patch_tensor_parallel_group
8
6
  from sglang.srt.managers.tp_worker import TpModelWorker
9
7
  from sglang.srt.server_args import ServerArgs
10
- from sglang.srt.speculative.eagle_worker import EAGLEWorker, load_token_map
8
+ from sglang.srt.speculative.eagle_worker import EAGLEWorker
11
9
  from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
10
+ from sglang.srt.speculative.spec_utils import draft_tp_context, load_token_map
12
11
  from sglang.srt.utils import empty_context, get_bool_env_var, is_cuda
13
12
 
14
13
  if is_cuda():
15
- from sgl_kernel import segment_packbits
14
+ from sgl_kernel import segment_packbits # noqa: F401
16
15
 
17
16
  logger = logging.getLogger(__name__)
18
- RETURN_ORIGINAL_LOGPROB = get_bool_env_var("RETURN_ORIGINAL_LOGPROB")
19
-
20
-
21
- @contextmanager
22
- def draft_tp_context(tp_group: GroupCoordinator):
23
- # Draft model doesn't use dp and has its own tp group.
24
- # We disable mscclpp now because it doesn't support 2 comm groups.
25
- with patch_tensor_parallel_group(tp_group):
26
- yield
17
+ SGLANG_RETURN_ORIGINAL_LOGPROB = get_bool_env_var("SGLANG_RETURN_ORIGINAL_LOGPROB")
27
18
 
28
19
 
29
20
  class StandaloneWorker(EAGLEWorker):
@@ -51,7 +42,6 @@ class StandaloneWorker(EAGLEWorker):
51
42
  self.speculative_algorithm = SpeculativeAlgorithm.from_string(
52
43
  server_args.speculative_algorithm
53
44
  )
54
- self.padded_static_len = -1
55
45
 
56
46
  # Override the context length of the draft model to be the same as the target model.
57
47
  server_args.context_length = target_worker.model_runner.model_config.context_len
@@ -133,9 +133,9 @@ class TiktokenTokenizer:
133
133
  )
134
134
  return self.encode(ret) if tokenize else ret
135
135
 
136
- def __call__(self, text, **kwargs):
136
+ def __call__(self, text: List[str], **kwargs):
137
137
  return {
138
- "input_ids": self.encode(text),
138
+ "input_ids": [self.encode(x) for x in text],
139
139
  }
140
140
 
141
141
  def init_xgrammar(self):
@@ -4,10 +4,11 @@ import copy
4
4
  import dataclasses
5
5
  import logging
6
6
  from dataclasses import replace
7
- from typing import TYPE_CHECKING, Dict, List, Optional, Sequence, Union
7
+ from typing import TYPE_CHECKING, Dict, List, Optional, Sequence
8
8
 
9
9
  import torch
10
10
 
11
+ from sglang.srt.layers import deep_gemm_wrapper
11
12
  from sglang.srt.layers.attention.base_attn_backend import AttentionBackend
12
13
  from sglang.srt.layers.communicator import (
13
14
  CommunicateContext,
@@ -20,9 +21,11 @@ from sglang.srt.layers.moe import (
20
21
  get_tbo_token_distribution_threshold,
21
22
  is_tbo_enabled,
22
23
  )
23
- from sglang.srt.layers.moe.token_dispatcher import DeepEPDispatcher
24
- from sglang.srt.layers.quantization import deep_gemm_wrapper
25
- from sglang.srt.managers.schedule_batch import ScheduleBatch, global_server_args_dict
24
+ from sglang.srt.layers.moe.token_dispatcher import (
25
+ DeepEPDispatcher,
26
+ MooncakeEPDispatcher,
27
+ )
28
+ from sglang.srt.managers.schedule_batch import ScheduleBatch
26
29
  from sglang.srt.model_executor.forward_batch_info import (
27
30
  ForwardBatch,
28
31
  ForwardMode,
@@ -30,12 +33,13 @@ from sglang.srt.model_executor.forward_batch_info import (
30
33
  )
31
34
  from sglang.srt.operations import execute_operations, execute_overlapped_operations
32
35
  from sglang.srt.operations_strategy import OperationsStrategy
33
- from sglang.srt.speculative.eagle_info import EagleDraftInput, EagleVerifyInput
36
+ from sglang.srt.server_args import get_global_server_args
34
37
  from sglang.srt.speculative.spec_info import SpecInput
35
38
  from sglang.srt.utils import BumpAllocator, empty_context, get_bool_env_var, is_hip
36
39
 
37
40
  if TYPE_CHECKING:
38
41
  from sglang.srt.layers.moe.token_dispatcher import DispatchOutput
42
+ from sglang.srt.speculative.eagle_info import EagleVerifyInput
39
43
 
40
44
  _is_hip = is_hip()
41
45
 
@@ -153,7 +157,7 @@ def _update_device_and_sum_field_from_cpu_field(
153
157
  cpu_value
154
158
  if isinstance(cpu_value, torch.Tensor)
155
159
  else torch.tensor(cpu_value, dtype=old_device_value.dtype)
156
- ).to(device=global_server_args_dict["device"], non_blocking=True)
160
+ ).to(device=get_global_server_args().device, non_blocking=True)
157
161
  setattr(batch, device_field, new_device_value)
158
162
 
159
163
  if sum_field is not None:
@@ -362,7 +366,7 @@ class TboDPAttentionPreparer:
362
366
  ):
363
367
 
364
368
  deepep_mode = get_deepep_mode()
365
- enable_deepep_moe = get_moe_a2a_backend().is_deepep()
369
+ enable_a2a_moe = not get_moe_a2a_backend().is_none()
366
370
  enable_two_batch_overlap = is_tbo_enabled()
367
371
 
368
372
  self.enable_two_batch_overlap = enable_two_batch_overlap
@@ -391,7 +395,7 @@ class TboDPAttentionPreparer:
391
395
  local_batch.forward_mode.is_extend()
392
396
  and not local_batch.forward_mode.is_target_verify()
393
397
  )
394
- and enable_deepep_moe
398
+ and enable_a2a_moe
395
399
  and (resolved_deepep_mode.is_low_latency())
396
400
  )
397
401
  else:
@@ -582,7 +586,7 @@ class TboForwardBatchPreparer:
582
586
  sum_field=None,
583
587
  )
584
588
  _, child_b.extend_start_loc = compute_position(
585
- global_server_args_dict["attention_backend"],
589
+ get_global_server_args().attention_backend,
586
590
  child_b.extend_prefix_lens,
587
591
  child_b.extend_seq_lens,
588
592
  child_b.extend_num_tokens,
@@ -667,6 +671,7 @@ class TboForwardBatchPreparer:
667
671
  "can_run_dp_cuda_graph",
668
672
  "dp_padding_mode",
669
673
  "global_forward_mode",
674
+ "is_prefill_only",
670
675
  "spec_algorithm",
671
676
  "capture_hidden_mode",
672
677
  "padded_static_len",
@@ -686,7 +691,7 @@ class TboForwardBatchPreparer:
686
691
 
687
692
  # TODO improve, e.g. unify w/ `init_raw`
688
693
  if (
689
- global_server_args_dict["moe_dense_tp_size"] == 1
694
+ get_global_server_args().moe_dense_tp_size == 1
690
695
  and batch.global_dp_buffer_len is not None
691
696
  ):
692
697
  sum_len = end_token_index - start_token_index
@@ -754,7 +759,7 @@ class TboForwardBatchPreparer:
754
759
  value_a = min(tbo_split_token_index, num_token_non_padded)
755
760
  value_b = max(0, num_token_non_padded - tbo_split_token_index)
756
761
  return torch.tensor([value_a, value_b], dtype=torch.int32).to(
757
- device=global_server_args_dict["device"], non_blocking=True
762
+ device=get_global_server_args().device, non_blocking=True
758
763
  )
759
764
 
760
765
  @classmethod
@@ -966,9 +971,14 @@ def _model_forward_tbo_merge_outputs(output_a, output_b):
966
971
  class MaybeTboDeepEPDispatcher:
967
972
  def __init__(self, **kwargs):
968
973
  num_inner_dispatchers = 2 if is_tbo_enabled() else 1
969
- self._inners = [
970
- DeepEPDispatcher(**kwargs) for _ in range(num_inner_dispatchers)
971
- ]
974
+ if get_moe_a2a_backend().is_deepep():
975
+ self._inners = [
976
+ DeepEPDispatcher(**kwargs) for _ in range(num_inner_dispatchers)
977
+ ]
978
+ elif get_moe_a2a_backend().is_mooncake():
979
+ self._inners = [
980
+ MooncakeEPDispatcher(**kwargs) for _ in range(num_inner_dispatchers)
981
+ ]
972
982
 
973
983
  def _execute(self, name, tbo_subbatch_index: Optional[int] = None, **kwargs):
974
984
  return getattr(self._inners[tbo_subbatch_index or 0], name)(**kwargs)
@@ -990,3 +1000,7 @@ class MaybeTboDeepEPDispatcher:
990
1000
 
991
1001
  def combine_b(self, **kwargs):
992
1002
  return self._execute("combine_b", **kwargs)
1003
+
1004
+ def set_quant_config(self, quant_config: dict):
1005
+ for inner in self._inners:
1006
+ inner.set_quant_config(quant_config)
@@ -1,2 +1,2 @@
1
1
  # Temporarily do this to avoid changing all imports in the repo
2
- from .common import *
2
+ from sglang.srt.utils.common import *
@@ -1,4 +1,5 @@
1
1
  import os
2
+ import re
2
3
  import sys
3
4
  from contextlib import nullcontext
4
5
 
@@ -108,7 +109,8 @@ def bench_kineto(
108
109
  if not with_multiple_kernels:
109
110
  for name in kernel_names:
110
111
  assert (
111
- sum([name in line for line in prof_lines]) == 1
112
+ sum([int(re.search(name, line) is not None) for line in prof_lines])
113
+ == 1
112
114
  ), f"Errors of the kernel {name} in the profiling table (table: {prof_lines})"
113
115
 
114
116
  # Save chrome traces
@@ -122,7 +124,7 @@ def bench_kineto(
122
124
  total_time = 0
123
125
  total_num = 0
124
126
  for line in prof_lines:
125
- if name in line:
127
+ if re.search(name, line) is not None:
126
128
  time_str = line.split()[-2]
127
129
  num_str = line.split()[-1]
128
130
  for unit, scale in units.items():