sglang 0.5.3rc2__py3-none-any.whl → 0.5.4.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (419)
  1. sglang/bench_one_batch.py +47 -28
  2. sglang/bench_one_batch_server.py +41 -25
  3. sglang/bench_serving.py +378 -160
  4. sglang/check_env.py +1 -1
  5. sglang/compile_deep_gemm.py +6 -2
  6. sglang/global_config.py +1 -25
  7. sglang/lang/api.py +6 -0
  8. sglang/lang/interpreter.py +1 -0
  9. sglang/lang/ir.py +13 -0
  10. sglang/launch_server.py +10 -15
  11. sglang/profiler.py +18 -1
  12. sglang/srt/_custom_ops.py +1 -1
  13. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +105 -10
  14. sglang/srt/checkpoint_engine/checkpoint_engine_worker.py +142 -0
  15. sglang/srt/compilation/backend.py +437 -0
  16. sglang/srt/compilation/compilation_config.py +20 -0
  17. sglang/srt/compilation/compilation_counter.py +47 -0
  18. sglang/srt/compilation/compile.py +210 -0
  19. sglang/srt/compilation/compiler_interface.py +503 -0
  20. sglang/srt/compilation/cuda_piecewise_backend.py +228 -0
  21. sglang/srt/compilation/fix_functionalization.py +134 -0
  22. sglang/srt/compilation/fx_utils.py +83 -0
  23. sglang/srt/compilation/inductor_pass.py +140 -0
  24. sglang/srt/compilation/pass_manager.py +66 -0
  25. sglang/srt/compilation/piecewise_context_manager.py +40 -0
  26. sglang/srt/compilation/weak_ref_tensor_jit.py +16 -0
  27. sglang/srt/configs/__init__.py +4 -0
  28. sglang/srt/configs/deepseek_ocr.py +262 -0
  29. sglang/srt/configs/deepseekvl2.py +194 -96
  30. sglang/srt/configs/dots_vlm.py +2 -7
  31. sglang/srt/configs/falcon_h1.py +13 -64
  32. sglang/srt/configs/load_config.py +25 -2
  33. sglang/srt/configs/mamba_utils.py +117 -0
  34. sglang/srt/configs/model_config.py +136 -25
  35. sglang/srt/configs/modelopt_config.py +30 -0
  36. sglang/srt/configs/nemotron_h.py +286 -0
  37. sglang/srt/configs/olmo3.py +105 -0
  38. sglang/srt/configs/points_v15_chat.py +29 -0
  39. sglang/srt/configs/qwen3_next.py +11 -47
  40. sglang/srt/configs/qwen3_omni.py +613 -0
  41. sglang/srt/configs/qwen3_vl.py +0 -10
  42. sglang/srt/connector/remote_instance.py +1 -1
  43. sglang/srt/constrained/base_grammar_backend.py +5 -1
  44. sglang/srt/constrained/llguidance_backend.py +5 -0
  45. sglang/srt/constrained/outlines_backend.py +1 -1
  46. sglang/srt/constrained/reasoner_grammar_backend.py +9 -6
  47. sglang/srt/constrained/utils.py +12 -0
  48. sglang/srt/constrained/xgrammar_backend.py +20 -11
  49. sglang/srt/disaggregation/ascend/transfer_engine.py +1 -1
  50. sglang/srt/disaggregation/base/conn.py +17 -4
  51. sglang/srt/disaggregation/common/conn.py +4 -2
  52. sglang/srt/disaggregation/decode.py +123 -31
  53. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +1 -1
  54. sglang/srt/disaggregation/fake/conn.py +11 -3
  55. sglang/srt/disaggregation/mooncake/conn.py +157 -19
  56. sglang/srt/disaggregation/nixl/conn.py +69 -24
  57. sglang/srt/disaggregation/prefill.py +96 -270
  58. sglang/srt/distributed/device_communicators/all_reduce_utils.py +4 -4
  59. sglang/srt/distributed/device_communicators/custom_all_reduce.py +6 -6
  60. sglang/srt/distributed/device_communicators/pymscclpp.py +2 -2
  61. sglang/srt/distributed/device_communicators/pynccl.py +24 -12
  62. sglang/srt/distributed/device_communicators/pynccl_allocator.py +2 -2
  63. sglang/srt/distributed/device_communicators/symm_mem.py +1 -1
  64. sglang/srt/distributed/naive_distributed.py +5 -4
  65. sglang/srt/distributed/parallel_state.py +63 -19
  66. sglang/srt/elastic_ep/elastic_ep.py +74 -0
  67. sglang/srt/entrypoints/context.py +3 -2
  68. sglang/srt/entrypoints/engine.py +83 -80
  69. sglang/srt/entrypoints/grpc_server.py +430 -234
  70. sglang/srt/entrypoints/harmony_utils.py +2 -2
  71. sglang/srt/entrypoints/http_server.py +195 -102
  72. sglang/srt/entrypoints/http_server_engine.py +1 -7
  73. sglang/srt/entrypoints/openai/protocol.py +225 -37
  74. sglang/srt/entrypoints/openai/serving_base.py +49 -2
  75. sglang/srt/entrypoints/openai/serving_chat.py +29 -74
  76. sglang/srt/entrypoints/openai/serving_classify.py +204 -0
  77. sglang/srt/entrypoints/openai/serving_completions.py +15 -1
  78. sglang/srt/entrypoints/openai/serving_responses.py +5 -2
  79. sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
  80. sglang/srt/environ.py +58 -6
  81. sglang/srt/eplb/eplb_algorithms/__init__.py +18 -1
  82. sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -2
  83. sglang/srt/eplb/eplb_algorithms/elasticity_aware.py +87 -0
  84. sglang/srt/eplb/expert_distribution.py +33 -4
  85. sglang/srt/eplb/expert_location_dispatch.py +2 -2
  86. sglang/srt/eplb/expert_location_updater.py +2 -2
  87. sglang/srt/function_call/base_format_detector.py +17 -18
  88. sglang/srt/function_call/function_call_parser.py +20 -14
  89. sglang/srt/function_call/glm4_moe_detector.py +1 -5
  90. sglang/srt/function_call/gpt_oss_detector.py +1 -1
  91. sglang/srt/function_call/json_array_parser.py +0 -2
  92. sglang/srt/function_call/minimax_m2.py +367 -0
  93. sglang/srt/function_call/utils.py +2 -2
  94. sglang/srt/grpc/compile_proto.py +3 -3
  95. sglang/srt/{entrypoints → grpc}/grpc_request_manager.py +112 -52
  96. sglang/srt/grpc/health_servicer.py +189 -0
  97. sglang/srt/grpc/scheduler_launcher.py +181 -0
  98. sglang/srt/grpc/sglang_scheduler_pb2.py +78 -70
  99. sglang/srt/grpc/sglang_scheduler_pb2.pyi +66 -10
  100. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +89 -1
  101. sglang/srt/layers/activation.py +10 -1
  102. sglang/srt/layers/attention/aiter_backend.py +3 -3
  103. sglang/srt/layers/attention/ascend_backend.py +17 -1
  104. sglang/srt/layers/attention/attention_registry.py +43 -23
  105. sglang/srt/layers/attention/base_attn_backend.py +20 -1
  106. sglang/srt/layers/attention/double_sparsity_backend.py +2 -2
  107. sglang/srt/layers/attention/fla/chunk.py +0 -1
  108. sglang/srt/layers/attention/fla/chunk_o.py +1 -1
  109. sglang/srt/layers/attention/fla/index.py +0 -2
  110. sglang/srt/layers/attention/fla/layernorm_gated.py +50 -32
  111. sglang/srt/layers/attention/fla/utils.py +0 -3
  112. sglang/srt/layers/attention/fla/wy_fast.py +0 -2
  113. sglang/srt/layers/attention/flashattention_backend.py +24 -10
  114. sglang/srt/layers/attention/flashinfer_backend.py +258 -22
  115. sglang/srt/layers/attention/flashinfer_mla_backend.py +38 -28
  116. sglang/srt/layers/attention/flashmla_backend.py +2 -2
  117. sglang/srt/layers/attention/hybrid_attn_backend.py +1 -1
  118. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +165 -62
  119. sglang/srt/layers/attention/intel_amx_backend.py +1 -1
  120. sglang/srt/layers/attention/mamba/causal_conv1d.py +1 -1
  121. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +9 -5
  122. sglang/srt/layers/attention/mamba/mamba.py +189 -241
  123. sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
  124. sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
  125. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +0 -50
  126. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +0 -60
  127. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +0 -111
  128. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +0 -1
  129. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +0 -11
  130. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +1 -1
  131. sglang/srt/layers/attention/nsa/nsa_indexer.py +40 -83
  132. sglang/srt/layers/attention/nsa/triton_kernel.py +136 -0
  133. sglang/srt/layers/attention/nsa/utils.py +0 -1
  134. sglang/srt/layers/attention/nsa_backend.py +404 -90
  135. sglang/srt/layers/attention/triton_backend.py +208 -34
  136. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +2 -2
  137. sglang/srt/layers/attention/triton_ops/extend_attention.py +539 -44
  138. sglang/srt/layers/attention/trtllm_mha_backend.py +2 -2
  139. sglang/srt/layers/attention/trtllm_mla_backend.py +362 -43
  140. sglang/srt/layers/attention/utils.py +89 -7
  141. sglang/srt/layers/attention/vision.py +3 -3
  142. sglang/srt/layers/attention/xpu_backend.py +1028 -0
  143. sglang/srt/layers/communicator.py +12 -7
  144. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/compile_utils.py +5 -9
  145. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/configurer.py +4 -3
  146. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/entrypoint.py +3 -3
  147. sglang/srt/layers/dp_attention.py +17 -0
  148. sglang/srt/layers/layernorm.py +64 -19
  149. sglang/srt/layers/linear.py +9 -1
  150. sglang/srt/layers/logits_processor.py +152 -17
  151. sglang/srt/layers/modelopt_utils.py +11 -0
  152. sglang/srt/layers/moe/cutlass_moe.py +0 -2
  153. sglang/srt/layers/moe/cutlass_w4a8_moe.py +351 -21
  154. sglang/srt/layers/moe/ep_moe/kernels.py +229 -457
  155. sglang/srt/layers/moe/ep_moe/layer.py +154 -625
  156. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +1 -1
  157. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  158. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json +146 -0
  159. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +11 -3
  160. sglang/srt/layers/moe/fused_moe_triton/layer.py +79 -73
  161. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +25 -46
  162. sglang/srt/layers/moe/moe_runner/deep_gemm.py +569 -0
  163. sglang/srt/layers/moe/moe_runner/runner.py +6 -0
  164. sglang/srt/layers/moe/moe_runner/triton.py +3 -1
  165. sglang/srt/layers/moe/moe_runner/triton_kernels.py +194 -0
  166. sglang/srt/layers/moe/rocm_moe_utils.py +0 -1
  167. sglang/srt/layers/moe/router.py +51 -15
  168. sglang/srt/layers/moe/token_dispatcher/__init__.py +14 -4
  169. sglang/srt/layers/moe/token_dispatcher/base.py +12 -6
  170. sglang/srt/layers/moe/token_dispatcher/deepep.py +127 -110
  171. sglang/srt/layers/moe/token_dispatcher/mooncake.py +386 -0
  172. sglang/srt/layers/moe/token_dispatcher/standard.py +46 -0
  173. sglang/srt/layers/moe/topk.py +7 -6
  174. sglang/srt/layers/moe/utils.py +20 -5
  175. sglang/srt/layers/quantization/__init__.py +5 -58
  176. sglang/srt/layers/quantization/awq.py +183 -9
  177. sglang/srt/layers/quantization/awq_triton.py +29 -0
  178. sglang/srt/layers/quantization/base_config.py +27 -1
  179. sglang/srt/layers/quantization/compressed_tensors/__init__.py +7 -0
  180. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +20 -49
  181. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +421 -70
  182. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +3 -0
  183. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +4 -22
  184. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +339 -0
  185. sglang/srt/layers/quantization/fp8.py +152 -81
  186. sglang/srt/layers/quantization/fp8_kernel.py +55 -10
  187. sglang/srt/layers/quantization/fp8_utils.py +42 -14
  188. sglang/srt/layers/quantization/fpgemm_fp8.py +2 -3
  189. sglang/srt/layers/quantization/gguf.py +566 -0
  190. sglang/srt/layers/quantization/gptq.py +0 -1
  191. sglang/srt/layers/quantization/int8_kernel.py +18 -2
  192. sglang/srt/layers/quantization/marlin_utils.py +12 -0
  193. sglang/srt/layers/quantization/modelopt_quant.py +125 -100
  194. sglang/srt/layers/quantization/mxfp4.py +35 -68
  195. sglang/srt/layers/quantization/petit.py +1 -1
  196. sglang/srt/layers/quantization/quark/quark.py +3 -1
  197. sglang/srt/layers/quantization/quark/quark_moe.py +3 -3
  198. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +0 -7
  199. sglang/srt/layers/quantization/unquant.py +23 -48
  200. sglang/srt/layers/quantization/utils.py +0 -1
  201. sglang/srt/layers/quantization/w4afp8.py +87 -20
  202. sglang/srt/layers/quantization/w8a8_int8.py +30 -24
  203. sglang/srt/layers/radix_attention.py +62 -9
  204. sglang/srt/layers/rotary_embedding.py +686 -17
  205. sglang/srt/layers/sampler.py +47 -16
  206. sglang/srt/layers/sparse_pooler.py +98 -0
  207. sglang/srt/layers/utils.py +0 -1
  208. sglang/srt/layers/vocab_parallel_embedding.py +4 -1
  209. sglang/srt/lora/backend/triton_backend.py +0 -1
  210. sglang/srt/lora/eviction_policy.py +139 -0
  211. sglang/srt/lora/lora_manager.py +24 -9
  212. sglang/srt/lora/lora_registry.py +1 -1
  213. sglang/srt/lora/mem_pool.py +40 -16
  214. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +1 -1
  215. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +4 -2
  216. sglang/srt/managers/cache_controller.py +48 -17
  217. sglang/srt/managers/data_parallel_controller.py +146 -42
  218. sglang/srt/managers/detokenizer_manager.py +40 -13
  219. sglang/srt/managers/io_struct.py +69 -16
  220. sglang/srt/managers/mm_utils.py +20 -18
  221. sglang/srt/managers/multi_tokenizer_mixin.py +83 -82
  222. sglang/srt/managers/overlap_utils.py +96 -19
  223. sglang/srt/managers/schedule_batch.py +241 -511
  224. sglang/srt/managers/schedule_policy.py +15 -2
  225. sglang/srt/managers/scheduler.py +420 -514
  226. sglang/srt/managers/scheduler_metrics_mixin.py +73 -18
  227. sglang/srt/managers/scheduler_output_processor_mixin.py +317 -111
  228. sglang/srt/managers/scheduler_pp_mixin.py +341 -0
  229. sglang/srt/managers/scheduler_profiler_mixin.py +60 -14
  230. sglang/srt/managers/scheduler_runtime_checker_mixin.py +217 -0
  231. sglang/srt/managers/scheduler_update_weights_mixin.py +33 -14
  232. sglang/srt/managers/tokenizer_communicator_mixin.py +71 -55
  233. sglang/srt/managers/tokenizer_manager.py +375 -95
  234. sglang/srt/managers/tp_worker.py +212 -161
  235. sglang/srt/managers/utils.py +78 -2
  236. sglang/srt/mem_cache/allocator.py +7 -2
  237. sglang/srt/mem_cache/allocator_ascend.py +2 -2
  238. sglang/srt/mem_cache/base_prefix_cache.py +2 -2
  239. sglang/srt/mem_cache/chunk_cache.py +13 -2
  240. sglang/srt/mem_cache/common.py +480 -0
  241. sglang/srt/mem_cache/evict_policy.py +16 -1
  242. sglang/srt/mem_cache/hicache_storage.py +11 -2
  243. sglang/srt/mem_cache/hiradix_cache.py +16 -3
  244. sglang/srt/mem_cache/mamba_radix_cache.py +993 -0
  245. sglang/srt/mem_cache/memory_pool.py +517 -219
  246. sglang/srt/mem_cache/memory_pool_host.py +0 -1
  247. sglang/srt/mem_cache/multimodal_cache.py +0 -1
  248. sglang/srt/mem_cache/radix_cache.py +53 -19
  249. sglang/srt/mem_cache/radix_cache_cpp.py +19 -14
  250. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +8 -2
  251. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +1 -13
  252. sglang/srt/mem_cache/storage/backend_factory.py +2 -2
  253. sglang/srt/mem_cache/storage/eic/eic_storage.py +5 -6
  254. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +0 -1
  255. sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +3 -2
  256. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +9 -3
  257. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +5 -3
  258. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +101 -17
  259. sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +38 -9
  260. sglang/srt/mem_cache/storage/nixl/nixl_utils.py +1 -1
  261. sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +17 -2
  262. sglang/srt/mem_cache/swa_radix_cache.py +92 -26
  263. sglang/srt/metrics/collector.py +31 -0
  264. sglang/srt/metrics/func_timer.py +1 -1
  265. sglang/srt/model_executor/cuda_graph_runner.py +43 -5
  266. sglang/srt/model_executor/forward_batch_info.py +71 -25
  267. sglang/srt/model_executor/model_runner.py +362 -270
  268. sglang/srt/model_executor/npu_graph_runner.py +2 -3
  269. sglang/srt/model_executor/piecewise_cuda_graph_runner.py +549 -0
  270. sglang/srt/model_loader/__init__.py +1 -1
  271. sglang/srt/model_loader/loader.py +424 -27
  272. sglang/srt/model_loader/utils.py +0 -1
  273. sglang/srt/model_loader/weight_utils.py +47 -28
  274. sglang/srt/models/apertus.py +2 -3
  275. sglang/srt/models/arcee.py +2 -2
  276. sglang/srt/models/bailing_moe.py +13 -52
  277. sglang/srt/models/bailing_moe_nextn.py +3 -4
  278. sglang/srt/models/bert.py +1 -1
  279. sglang/srt/models/deepseek_nextn.py +19 -3
  280. sglang/srt/models/deepseek_ocr.py +1516 -0
  281. sglang/srt/models/deepseek_v2.py +418 -140
  282. sglang/srt/models/dots_ocr.py +0 -2
  283. sglang/srt/models/dots_vlm.py +0 -1
  284. sglang/srt/models/dots_vlm_vit.py +1 -1
  285. sglang/srt/models/falcon_h1.py +13 -19
  286. sglang/srt/models/gemma3_mm.py +16 -0
  287. sglang/srt/models/gemma3n_mm.py +1 -2
  288. sglang/srt/models/glm4_moe.py +327 -382
  289. sglang/srt/models/glm4_moe_nextn.py +6 -16
  290. sglang/srt/models/glm4v.py +2 -1
  291. sglang/srt/models/glm4v_moe.py +32 -199
  292. sglang/srt/models/gpt_oss.py +5 -5
  293. sglang/srt/models/grok.py +10 -23
  294. sglang/srt/models/hunyuan.py +2 -7
  295. sglang/srt/models/interns1.py +0 -1
  296. sglang/srt/models/kimi_vl.py +1 -7
  297. sglang/srt/models/kimi_vl_moonvit.py +3 -1
  298. sglang/srt/models/llama.py +2 -2
  299. sglang/srt/models/llama_eagle3.py +1 -1
  300. sglang/srt/models/longcat_flash.py +5 -22
  301. sglang/srt/models/longcat_flash_nextn.py +3 -14
  302. sglang/srt/models/mimo.py +2 -13
  303. sglang/srt/models/mimo_mtp.py +1 -2
  304. sglang/srt/models/minicpmo.py +7 -5
  305. sglang/srt/models/minimax_m2.py +922 -0
  306. sglang/srt/models/mixtral.py +1 -4
  307. sglang/srt/models/mllama.py +1 -1
  308. sglang/srt/models/mllama4.py +13 -3
  309. sglang/srt/models/nemotron_h.py +511 -0
  310. sglang/srt/models/nvila.py +355 -0
  311. sglang/srt/models/nvila_lite.py +184 -0
  312. sglang/srt/models/olmo2.py +31 -4
  313. sglang/srt/models/opt.py +5 -5
  314. sglang/srt/models/phi.py +1 -1
  315. sglang/srt/models/phi4mm.py +1 -1
  316. sglang/srt/models/phimoe.py +0 -1
  317. sglang/srt/models/pixtral.py +0 -3
  318. sglang/srt/models/points_v15_chat.py +186 -0
  319. sglang/srt/models/qwen.py +0 -1
  320. sglang/srt/models/qwen2.py +22 -1
  321. sglang/srt/models/qwen2_5_vl.py +3 -3
  322. sglang/srt/models/qwen2_audio.py +2 -15
  323. sglang/srt/models/qwen2_moe.py +15 -12
  324. sglang/srt/models/qwen2_vl.py +5 -2
  325. sglang/srt/models/qwen3.py +34 -4
  326. sglang/srt/models/qwen3_moe.py +19 -37
  327. sglang/srt/models/qwen3_next.py +7 -12
  328. sglang/srt/models/qwen3_next_mtp.py +3 -4
  329. sglang/srt/models/qwen3_omni_moe.py +661 -0
  330. sglang/srt/models/qwen3_vl.py +37 -33
  331. sglang/srt/models/qwen3_vl_moe.py +57 -185
  332. sglang/srt/models/roberta.py +55 -3
  333. sglang/srt/models/sarashina2_vision.py +0 -1
  334. sglang/srt/models/step3_vl.py +3 -5
  335. sglang/srt/models/utils.py +11 -1
  336. sglang/srt/multimodal/processors/base_processor.py +7 -2
  337. sglang/srt/multimodal/processors/deepseek_ocr.py +37 -0
  338. sglang/srt/multimodal/processors/deepseek_vl_v2.py +0 -3
  339. sglang/srt/multimodal/processors/dots_vlm.py +0 -1
  340. sglang/srt/multimodal/processors/glm4v.py +2 -6
  341. sglang/srt/multimodal/processors/internvl.py +0 -2
  342. sglang/srt/multimodal/processors/janus_pro.py +0 -1
  343. sglang/srt/multimodal/processors/mllama4.py +0 -8
  344. sglang/srt/multimodal/processors/{vila.py → nvila.py} +32 -24
  345. sglang/srt/multimodal/processors/phi4mm.py +0 -1
  346. sglang/srt/multimodal/processors/points_v15_chat.py +52 -0
  347. sglang/srt/multimodal/processors/qwen_vl.py +75 -16
  348. sglang/srt/multimodal/processors/step3_vl.py +1 -1
  349. sglang/srt/parser/conversation.py +41 -0
  350. sglang/srt/parser/reasoning_parser.py +28 -2
  351. sglang/srt/sampling/custom_logit_processor.py +77 -2
  352. sglang/srt/sampling/sampling_batch_info.py +17 -22
  353. sglang/srt/sampling/sampling_params.py +70 -2
  354. sglang/srt/server_args.py +846 -163
  355. sglang/srt/server_args_config_parser.py +1 -1
  356. sglang/srt/single_batch_overlap.py +36 -31
  357. sglang/srt/speculative/base_spec_worker.py +34 -0
  358. sglang/srt/speculative/draft_utils.py +226 -0
  359. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +24 -7
  360. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +23 -2
  361. sglang/srt/speculative/eagle_info.py +57 -18
  362. sglang/srt/speculative/eagle_info_v2.py +458 -0
  363. sglang/srt/speculative/eagle_utils.py +138 -0
  364. sglang/srt/speculative/eagle_worker.py +83 -280
  365. sglang/srt/speculative/eagle_worker_v2.py +702 -0
  366. sglang/srt/speculative/{ngram_utils.py → ngram_info.py} +14 -9
  367. sglang/srt/speculative/ngram_worker.py +12 -11
  368. sglang/srt/speculative/spec_info.py +2 -0
  369. sglang/srt/speculative/spec_utils.py +38 -3
  370. sglang/srt/speculative/standalone_worker.py +4 -14
  371. sglang/srt/tokenizer/tiktoken_tokenizer.py +2 -2
  372. sglang/srt/two_batch_overlap.py +28 -14
  373. sglang/srt/utils/__init__.py +1 -1
  374. sglang/srt/{bench_utils.py → utils/bench_utils.py} +4 -2
  375. sglang/srt/utils/common.py +272 -82
  376. sglang/srt/utils/hf_transformers_utils.py +44 -17
  377. sglang/srt/{host_shared_memory.py → utils/host_shared_memory.py} +0 -1
  378. sglang/srt/{offloader.py → utils/offloader.py} +4 -4
  379. sglang/srt/utils/profile_merger.py +199 -0
  380. sglang/test/attention/test_flashattn_backend.py +1 -1
  381. sglang/test/attention/test_flashattn_mla_backend.py +0 -1
  382. sglang/test/attention/test_prefix_chunk_info.py +0 -2
  383. sglang/test/attention/test_trtllm_mla_backend.py +221 -53
  384. sglang/test/few_shot_gsm8k_engine.py +2 -4
  385. sglang/test/kit_matched_stop.py +157 -0
  386. sglang/test/longbench_v2/__init__.py +1 -0
  387. sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
  388. sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
  389. sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
  390. sglang/test/run_eval.py +41 -0
  391. sglang/test/runners.py +2 -0
  392. sglang/test/send_one.py +42 -7
  393. sglang/test/simple_eval_common.py +3 -0
  394. sglang/test/simple_eval_gpqa.py +0 -1
  395. sglang/test/simple_eval_humaneval.py +0 -3
  396. sglang/test/simple_eval_longbench_v2.py +344 -0
  397. sglang/test/test_block_fp8.py +1 -2
  398. sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -1
  399. sglang/test/test_cutlass_moe.py +1 -2
  400. sglang/test/test_cutlass_w4a8_moe.py +10 -20
  401. sglang/test/test_deterministic.py +463 -107
  402. sglang/test/test_deterministic_utils.py +74 -0
  403. sglang/test/test_disaggregation_utils.py +81 -0
  404. sglang/test/test_marlin_moe.py +0 -1
  405. sglang/test/test_utils.py +85 -20
  406. sglang/version.py +1 -1
  407. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/METADATA +48 -35
  408. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/RECORD +414 -350
  409. sglang/srt/layers/attention/mamba/mamba_utils.py +0 -81
  410. sglang/srt/managers/tp_worker_overlap_thread.py +0 -311
  411. sglang/srt/models/vila.py +0 -306
  412. sglang/srt/speculative/build_eagle_tree.py +0 -427
  413. sglang/test/test_block_fp8_ep.py +0 -358
  414. /sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/__init__.py +0 -0
  415. /sglang/srt/{aio_rwlock.py → utils/aio_rwlock.py} +0 -0
  416. /sglang/srt/{torch_memory_saver_adapter.py → utils/torch_memory_saver_adapter.py} +0 -0
  417. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/WHEEL +0 -0
  418. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/licenses/LICENSE +0 -0
  419. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/top_level.txt +0 -0
@@ -12,7 +12,6 @@
12
12
  # limitations under the License.
13
13
  # ==============================================================================
14
14
  """Common utilities."""
15
-
16
15
  from __future__ import annotations
17
16
 
18
17
  import argparse
@@ -43,6 +42,7 @@ import tempfile
43
42
  import threading
44
43
  import time
45
44
  import traceback
45
+ import types
46
46
  import uuid
47
47
  import warnings
48
48
  from collections import OrderedDict, defaultdict
@@ -63,6 +63,7 @@ from typing import (
63
63
  List,
64
64
  Optional,
65
65
  Protocol,
66
+ Sequence,
66
67
  Set,
67
68
  Tuple,
68
69
  TypeVar,
@@ -70,6 +71,7 @@ from typing import (
70
71
  )
71
72
 
72
73
  import numpy as np
74
+ import orjson
73
75
  import psutil
74
76
  import pybase64
75
77
  import requests
@@ -88,6 +90,7 @@ from torch.profiler import ProfilerActivity, profile, record_function
88
90
  from torch.utils._contextlib import _DecoratorContextManager
89
91
  from typing_extensions import Literal
90
92
 
93
+ from sglang.srt.environ import envs
91
94
  from sglang.srt.metrics.func_timer import enable_func_timer
92
95
 
93
96
  logger = logging.getLogger(__name__)
@@ -131,6 +134,7 @@ def is_xpu() -> bool:
131
134
  return hasattr(torch, "xpu") and torch.xpu.is_available()
132
135
 
133
136
 
137
+ @lru_cache(maxsize=1)
134
138
  def is_npu() -> bool:
135
139
  return hasattr(torch, "npu") and torch.npu.is_available()
136
140
 
@@ -162,6 +166,20 @@ def _check(cc_major):
162
166
  ) >= (12, 3)
163
167
 
164
168
 
169
+ @contextmanager
170
+ def device_context(device: torch.device):
171
+ if device.type == "cpu" and is_cpu():
172
+ with torch.device("cpu"):
173
+ yield
174
+ else:
175
+ module = torch.get_device_module(device)
176
+ if module is not None:
177
+ with module.device(device.index):
178
+ yield
179
+ else:
180
+ raise ValueError(f"Unknown device module: {device}")
181
+
182
+
165
183
  is_ampere_with_cuda_12_3 = lambda: _check(8)
166
184
  is_hopper_with_cuda_12_3 = lambda: _check(9)
167
185
 
@@ -173,6 +191,15 @@ def is_blackwell():
173
191
  return torch.cuda.get_device_capability()[0] == 10
174
192
 
175
193
 
194
+ @lru_cache(maxsize=1)
195
+ def is_sm120_supported(device=None) -> bool:
196
+ if not is_cuda_alike():
197
+ return False
198
+ return (torch.cuda.get_device_capability(device)[0] == 12) and (
199
+ torch.version.cuda >= "12.8"
200
+ )
201
+
202
+
176
203
  @lru_cache(maxsize=1)
177
204
  def is_sm100_supported(device=None) -> bool:
178
205
  if not is_cuda_alike():
@@ -228,7 +255,7 @@ def support_triton(backend: str) -> bool:
228
255
 
229
256
 
230
257
  try:
231
- import sgl_kernel
258
+ import sgl_kernel # noqa: F401
232
259
 
233
260
  is_intel_amx_backend_available = hasattr(
234
261
  torch.ops.sgl_kernel, "convert_weight_packed"
@@ -253,6 +280,14 @@ def use_intel_amx_backend(layer):
253
280
  return getattr(layer, "use_intel_amx_backend", False)
254
281
 
255
282
 
283
+ def xpu_has_xmx_support():
284
+ # TODO: update with XPU capalibity query
285
+ if is_xpu():
286
+ # currently only PVC/LNL/BMG supports F64, so we only support these now
287
+ return torch.xpu.get_device_properties().has_fp64
288
+ return False
289
+
290
+
256
291
  def is_flashinfer_available():
257
292
  """
258
293
  Check whether flashinfer is available.
@@ -263,6 +298,17 @@ def is_flashinfer_available():
263
298
  return importlib.util.find_spec("flashinfer") is not None and is_cuda()
264
299
 
265
300
 
301
+ def is_nvidia_cublas_cu12_version_ge_12_9():
302
+ """
303
+ temporary fix for issue #11272
304
+ """
305
+ try:
306
+ installed_version = version("nvidia-cublas-cu12")
307
+ except PackageNotFoundError:
308
+ return False
309
+ return pkg_version.parse(installed_version) >= pkg_version.parse("12.9")
310
+
311
+
266
312
  def random_uuid() -> str:
267
313
  return str(uuid.uuid4().hex)
268
314
 
@@ -409,7 +455,15 @@ def get_available_gpu_memory(
409
455
 
410
456
  if empty_cache:
411
457
  torch.cuda.empty_cache()
412
- free_gpu_memory, _ = torch.cuda.mem_get_info(gpu_id)
458
+ SHARED_SYSMEM_DEVICE_MEM_SMS = (87, 110, 121) # Orin, Thor, Spark
459
+ if get_device_sm() in SHARED_SYSMEM_DEVICE_MEM_SMS:
460
+ # On these devices, which use sysmem as device mem, torch.cuda.mem_get_info()
461
+ # only reports "free" memory, which can be lower than what is actually
462
+ # available due to not including cache memory. So we use the system available
463
+ # memory metric instead.
464
+ free_gpu_memory = psutil.virtual_memory().available
465
+ else:
466
+ free_gpu_memory, _ = torch.cuda.mem_get_info(gpu_id)
413
467
 
414
468
  elif device == "xpu":
415
469
  num_gpus = torch.xpu.device_count()
@@ -453,6 +507,8 @@ def get_available_gpu_memory(
453
507
  f"WARNING: current device is not {gpu_id}, but {torch.npu.current_device()}, ",
454
508
  "which may cause useless memory allocation for torch NPU context.",
455
509
  )
510
+ if empty_cache:
511
+ torch.npu.empty_cache()
456
512
  free_gpu_memory, total_gpu_memory = torch.npu.mem_get_info()
457
513
 
458
514
  if distributed:
@@ -481,13 +537,13 @@ def make_layers(
481
537
  pp_size: Optional[int] = None,
482
538
  prefix: str = "",
483
539
  return_tuple: bool = False,
484
- offloader_kwargs: Dict[str, Any] = {},
540
+ offloader_kwargs: Optional[Dict[str, Any]] = None,
485
541
  ) -> Tuple[torch.nn.Module, int, int]:
486
542
  """Make a list of layers with the given layer function"""
487
543
  # circula imports
488
544
  from sglang.srt.distributed import get_pp_indices
489
545
  from sglang.srt.layers.utils import PPMissingLayer
490
- from sglang.srt.offloader import get_offloader
546
+ from sglang.srt.utils.offloader import get_offloader
491
547
 
492
548
  assert not pp_size or num_hidden_layers >= pp_size
493
549
  start_layer, end_layer = (
@@ -506,7 +562,7 @@ def make_layers(
506
562
  layer_fn(idx=idx, prefix=add_prefix(idx, prefix))
507
563
  for idx in range(start_layer, end_layer)
508
564
  ),
509
- **offloader_kwargs,
565
+ **(offloader_kwargs or {}),
510
566
  )
511
567
  + [
512
568
  PPMissingLayer(return_tuple=return_tuple)
@@ -518,6 +574,24 @@ def make_layers(
518
574
  return modules, start_layer, end_layer
519
575
 
520
576
 
577
+ def make_layers_non_pp(
578
+ num_hidden_layers: int,
579
+ layer_fn: LayerFn,
580
+ prefix: str = "",
581
+ ) -> torch.nn.ModuleList:
582
+ from sglang.srt.utils.offloader import get_offloader
583
+
584
+ layers = torch.nn.ModuleList(
585
+ get_offloader().wrap_modules(
586
+ (
587
+ layer_fn(idx=idx, prefix=add_prefix(idx, prefix))
588
+ for idx in range(num_hidden_layers)
589
+ )
590
+ )
591
+ )
592
+ return layers
593
+
594
+
521
595
  cmo_stream = None
522
596
 
523
597
 
@@ -811,9 +885,9 @@ def get_image_bytes(image_file: Union[str, bytes]):
811
885
  return f.read()
812
886
  elif image_file.startswith("data:"):
813
887
  image_file = image_file.split(",")[1]
814
- return pybase64.b64decode(image_file)
888
+ return pybase64.b64decode(image_file, validate=True)
815
889
  elif isinstance(image_file, str):
816
- return pybase64.b64decode(image_file)
890
+ return pybase64.b64decode(image_file, validate=True)
817
891
  else:
818
892
  raise NotImplementedError(f"Invalid image: {image_file}")
819
893
 
@@ -850,7 +924,7 @@ def load_video(video_file: Union[str, bytes], use_gpu: bool = True):
850
924
  vr = VideoReader(tmp_file.name, ctx=ctx)
851
925
  elif video_file.startswith("data:"):
852
926
  _, encoded = video_file.split(",", 1)
853
- video_bytes = pybase64.b64decode(encoded)
927
+ video_bytes = pybase64.b64decode(encoded, validate=True)
854
928
  tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
855
929
  tmp_file.write(video_bytes)
856
930
  tmp_file.close()
@@ -858,7 +932,7 @@ def load_video(video_file: Union[str, bytes], use_gpu: bool = True):
858
932
  elif os.path.isfile(video_file):
859
933
  vr = VideoReader(video_file, ctx=ctx)
860
934
  else:
861
- video_bytes = pybase64.b64decode(video_file)
935
+ video_bytes = pybase64.b64decode(video_file, validate=True)
862
936
  tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
863
937
  tmp_file.write(video_bytes)
864
938
  tmp_file.close()
@@ -992,32 +1066,6 @@ def monkey_patch_p2p_access_check():
992
1066
  setattr(CustomAllreduce, "__del__", lambda *args, **kwargs: None)
993
1067
 
994
1068
 
995
- def monkey_patch_vllm_gguf_config():
996
- try:
997
- from vllm.model_executor.layers.quantization.gguf import (
998
- GGUFConfig,
999
- GGUFEmbeddingMethod,
1000
- GGUFLinearMethod,
1001
- )
1002
- except ImportError:
1003
- return
1004
-
1005
- from sglang.srt.layers.linear import LinearBase
1006
- from sglang.srt.layers.vocab_parallel_embedding import VocabParallelEmbedding
1007
-
1008
- def get_quant_method_with_embedding_replaced(
1009
- self, layer: torch.nn.Module, prefix: str
1010
- ) -> Optional["QuantizeMethodBase"]:
1011
- if isinstance(layer, LinearBase):
1012
- return GGUFLinearMethod(self)
1013
- elif isinstance(layer, VocabParallelEmbedding):
1014
- # patch to own VocabParallelEmbedding
1015
- return GGUFEmbeddingMethod(self)
1016
- return None
1017
-
1018
- setattr(GGUFConfig, "get_quant_method", get_quant_method_with_embedding_replaced)
1019
-
1020
-
1021
1069
  def set_ulimit(target_soft_limit=65535):
1022
1070
  # number of open files
1023
1071
  resource_type = resource.RLIMIT_NOFILE
@@ -1054,9 +1102,9 @@ def add_api_key_middleware(app, api_key: str):
1054
1102
  async def authentication(request, call_next):
1055
1103
  if request.method == "OPTIONS":
1056
1104
  return await call_next(request)
1057
- if request.url.path.startswith("/health"):
1058
- return await call_next(request)
1059
- if request.url.path.startswith("/metrics"):
1105
+ if request.url.path.startswith("/health") or request.url.path.startswith(
1106
+ "/metrics"
1107
+ ):
1060
1108
  return await call_next(request)
1061
1109
  if request.headers.get("Authorization") != "Bearer " + api_key:
1062
1110
  return ORJSONResponse(content={"error": "Unauthorized"}, status_code=401)
@@ -1083,7 +1131,7 @@ def configure_logger(server_args, prefix: str = ""):
1083
1131
  f"{SGLANG_LOGGING_CONFIG_PATH} but it does not exist!"
1084
1132
  )
1085
1133
  with open(SGLANG_LOGGING_CONFIG_PATH, encoding="utf-8") as file:
1086
- custom_config = json.loads(file.read())
1134
+ custom_config = orjson.loads(file.read())
1087
1135
  logging.config.dictConfig(custom_config)
1088
1136
  return
1089
1137
  format = f"[%(asctime)s{prefix}] %(message)s"
@@ -1262,8 +1310,46 @@ def pytorch_profile(name, func, *args, data_size=-1):
1262
1310
 
1263
1311
 
1264
1312
  def get_zmq_socket(
1265
- context: zmq.Context, socket_type: zmq.SocketType, endpoint: str, bind: bool
1266
- ) -> zmq.Socket:
1313
+ context: zmq.Context,
1314
+ socket_type: zmq.SocketType,
1315
+ endpoint: Optional[str] = None,
1316
+ bind: bool = True,
1317
+ ) -> Union[zmq.Socket, Tuple[int, zmq.Socket]]:
1318
+ """Create and configure a ZeroMQ socket.
1319
+
1320
+ Args:
1321
+ context: ZeroMQ context to create the socket from.
1322
+ socket_type: Type of ZeroMQ socket to create.
1323
+ endpoint: Optional endpoint to bind/connect to. If None, binds to a random TCP port.
1324
+ bind: Whether to bind (True) or connect (False) to the endpoint. Ignored if endpoint is None.
1325
+
1326
+ Returns:
1327
+ If endpoint is None: Tuple of (port, socket) where port is the randomly assigned TCP port.
1328
+ If endpoint is provided: The configured ZeroMQ socket.
1329
+ """
1330
+ socket = context.socket(socket_type)
1331
+
1332
+ if endpoint is None:
1333
+ # Bind to random TCP port
1334
+ config_socket(socket, socket_type)
1335
+ port = socket.bind_to_random_port("tcp://*")
1336
+ return port, socket
1337
+ else:
1338
+ # Handle IPv6 if endpoint contains brackets
1339
+ if endpoint.find("[") != -1:
1340
+ socket.setsockopt(zmq.IPV6, 1)
1341
+
1342
+ config_socket(socket, socket_type)
1343
+
1344
+ if bind:
1345
+ socket.bind(endpoint)
1346
+ else:
1347
+ socket.connect(endpoint)
1348
+
1349
+ return socket
1350
+
1351
+
1352
+ def config_socket(socket, socket_type: zmq.SocketType):
1267
1353
  mem = psutil.virtual_memory()
1268
1354
  total_mem = mem.total / 1024**3
1269
1355
  available_mem = mem.available / 1024**3
@@ -1272,10 +1358,6 @@ def get_zmq_socket(
1272
1358
  else:
1273
1359
  buf_size = -1
1274
1360
 
1275
- socket = context.socket(socket_type)
1276
- if endpoint.find("[") != -1:
1277
- socket.setsockopt(zmq.IPV6, 1)
1278
-
1279
1361
  def set_send_opt():
1280
1362
  socket.setsockopt(zmq.SNDHWM, 0)
1281
1363
  socket.setsockopt(zmq.SNDBUF, buf_size)
@@ -1288,19 +1370,12 @@ def get_zmq_socket(
1288
1370
  set_send_opt()
1289
1371
  elif socket_type == zmq.PULL:
1290
1372
  set_recv_opt()
1291
- elif socket_type == zmq.DEALER:
1373
+ elif socket_type in [zmq.DEALER, zmq.REQ, zmq.REP]:
1292
1374
  set_send_opt()
1293
1375
  set_recv_opt()
1294
1376
  else:
1295
1377
  raise ValueError(f"Unsupported socket type: {socket_type}")
1296
1378
 
1297
- if bind:
1298
- socket.bind(endpoint)
1299
- else:
1300
- socket.connect(endpoint)
1301
-
1302
- return socket
1303
-
1304
1379
 
1305
1380
  def dump_to_file(dirpath, name, value):
1306
1381
  from sglang.srt.distributed import get_tensor_model_parallel_rank
@@ -1500,7 +1575,7 @@ def get_hpu_memory_capacity():
1500
1575
 
1501
1576
  def get_npu_memory_capacity():
1502
1577
  try:
1503
- import torch_npu
1578
+ import torch_npu # noqa: F401
1504
1579
 
1505
1580
  return torch.npu.mem_get_info()[1] // 1024 // 1024 # unit: MB
1506
1581
  except ImportError as e:
@@ -1521,13 +1596,18 @@ def get_cpu_memory_capacity():
1521
1596
  for numa_id in range(n_numa_node):
1522
1597
  file_meminfo = f"node{numa_id}/meminfo"
1523
1598
  with open(os.path.join(file_prefix, file_meminfo), "r") as f:
1524
- # 1st line contains 'MemTotal'
1525
- line = f.read().split("\n")[0]
1526
- numa_mem_list.append(int(line.split()[3]))
1599
+ # MemTotal info is at the 1st line
1600
+ line = f.readline()
1601
+ # Expected format: "Node 0 MemTotal: 100000000 kB"
1602
+ parts = line.split()
1603
+ if len(parts) >= 4 and parts[2] == "MemTotal:":
1604
+ numa_mem_list.append(int(parts[3]))
1605
+ else:
1606
+ raise ValueError(f"Unexpected format in {file_meminfo}: {line}")
1527
1607
  # Retrieved value in KB, need MB
1528
1608
  numa_mem = float(min(numa_mem_list) // 1024)
1529
1609
  return numa_mem
1530
- except FileNotFoundError:
1610
+ except (FileNotFoundError, ValueError, IndexError):
1531
1611
  numa_mem = psutil.virtual_memory().total / n_numa_node
1532
1612
  # Retrieved value in Byte, need MB
1533
1613
  return float(numa_mem // (1 << 20))
@@ -1687,7 +1767,7 @@ def get_device(device_id: Optional[int] = None) -> str:
1687
1767
 
1688
1768
  if is_habana_available():
1689
1769
  try:
1690
- import habana_frameworks.torch.hpu
1770
+ import habana_frameworks.torch.hpu # noqa: F401
1691
1771
 
1692
1772
  if torch.hpu.is_available():
1693
1773
  if device_id == None:
@@ -1717,7 +1797,7 @@ def get_device_count() -> int:
1717
1797
 
1718
1798
  if is_habana_available():
1719
1799
  try:
1720
- import habana_frameworks.torch.hpu
1800
+ import habana_frameworks.torch.hpu # noqa: F401
1721
1801
 
1722
1802
  if torch.hpu.is_available():
1723
1803
  return torch.hpu.device_count()
@@ -1860,7 +1940,9 @@ def direct_register_custom_op(
1860
1940
  if fake_impl is not None:
1861
1941
  my_lib._register_fake(op_name, fake_impl)
1862
1942
  except RuntimeError as error:
1863
- if "Tried to register an operator" in str(e) and "multiple times" in str(e):
1943
+ if "Tried to register an operator" in str(error) and "multiple times" in str(
1944
+ error
1945
+ ):
1864
1946
  # Silently ignore duplicate registration errors
1865
1947
  # This can happen in multi-engine scenarios
1866
1948
  pass
@@ -1873,6 +1955,7 @@ def direct_register_custom_op(
1873
1955
 
1874
1956
 
1875
1957
  def set_gpu_proc_affinity(
1958
+ pp_size: int,
1876
1959
  tp_size: int,
1877
1960
  nnodes: int,
1878
1961
  gpu_id: int,
@@ -1881,7 +1964,8 @@ def set_gpu_proc_affinity(
1881
1964
  pid = os.getpid()
1882
1965
  p = psutil.Process(pid)
1883
1966
 
1884
- tp_size_per_node = tp_size // nnodes
1967
+ nnodes_per_tp_group = max(nnodes // pp_size, 1)
1968
+ tp_size_per_node = tp_size // nnodes_per_tp_group
1885
1969
 
1886
1970
  # total physical cores
1887
1971
  total_pcores = psutil.cpu_count(logical=False)
@@ -2012,7 +2096,78 @@ class MultiprocessingSerializer:
2012
2096
  # Decode base64 string to bytes
2013
2097
  data = pybase64.b64decode(data, validate=True)
2014
2098
 
2015
- return ForkingPickler.loads(data)
2099
+ return SafeUnpickler(io.BytesIO(data)).load()
2100
+
2101
+
2102
class SafeUnpickler(pickle.Unpickler):
    """Unpickler that only resolves globals from allowlisted module prefixes.

    ``find_class`` is consulted by the unpickler for every global a pickle
    stream references. A small denylist is checked first, then the module is
    matched against ``ALLOWED_MODULE_PREFIXES``; anything else is rejected
    with ``RuntimeError``.

    NOTE(review): the allowlist is broad (all of ``builtins``, ``functools``,
    ``operator``, ``torch`` ...). Callables such as ``builtins.getattr`` or
    ``builtins.__import__`` are still resolvable, so this should be treated as
    hardening rather than a sandbox; do not feed it data from untrusted peers.
    """

    ALLOWED_MODULE_PREFIXES = {
        # --- Python types ---
        "builtins.",
        "collections.",
        "copyreg.",
        "functools.",
        "itertools.",
        "operator.",
        "types.",
        "weakref.",
        # --- PyTorch types ---
        "torch.",
        "torch._tensor.",
        "torch.storage.",
        "torch.nn.parameter.",
        "torch.autograd.function.",
        # --- torch distributed ---
        "torch.distributed.",
        "torch.distributed._shard.",
        "torch.distributed._composable.",
        "torch._C._distributed_c10d.",
        "torch._C._distributed_fsdp.",
        "torch.distributed.optim.",
        # --- multiprocessing ---
        "multiprocessing.resource_sharer.",
        "multiprocessing.reduction.",
        "pickletools.",
        # --- PEFT / LoRA ---
        "peft.",
        "transformers.",
        "huggingface_hub.",
        # --- SGLang & Unittest ---
        "sglang.srt.weight_sync.tensor_bucket.",
        "sglang.srt.model_executor.model_runner.",
        "sglang.srt.layers.",
        "sglang.srt.utils.",
    }

    DENY_CLASSES = {
        ("builtins", "eval"),
        ("builtins", "exec"),
        ("builtins", "compile"),
        ("os", "system"),
        ("subprocess", "Popen"),
        ("subprocess", "run"),
        ("codecs", "decode"),
        ("types", "CodeType"),
        ("types", "FunctionType"),
    }

    def find_class(self, module, name):
        """Resolve ``module.name`` if permitted, otherwise raise ``RuntimeError``.

        Args:
            module: Module name recorded in the pickle stream.
            name: Global name recorded in the pickle stream. With pickle
                protocol 4+ this may be a dotted attribute path.

        Returns:
            The object resolved by ``pickle.Unpickler.find_class``.

        Raises:
            RuntimeError: If the target is denylisted or its module is not
                covered by ``ALLOWED_MODULE_PREFIXES``.
        """
        # Protocol 4+ allows dotted names (e.g. "eval.__call__"), which would
        # slip past an exact-match deny check, so also test the first component.
        root_name = name.split(".", 1)[0]
        # Block deterministic attacks
        if (module, name) in self.DENY_CLASSES or (
            module,
            root_name,
        ) in self.DENY_CLASSES:
            raise RuntimeError(
                f"Blocked unsafe class loading ({module}.{name}), "
                f"to prevent exploitation of CVE-2025-10164"
            )
        # Allowlist of safe-to-load modules. The trailing "." makes
        # "torch" match the "torch." prefix without matching "torchfoo".
        if (module + ".").startswith(tuple(self.ALLOWED_MODULE_PREFIXES)):
            return super().find_class(module, name)

        # Block everything else. (Potential attack surface)
        raise RuntimeError(
            f"Blocked unsafe class loading ({module}.{name}), "
            f"to prevent exploitation of CVE-2025-10164"
        )
2016
2171
 
2017
2172
 
2018
2173
  def debug_timing(func):
@@ -2164,6 +2319,11 @@ def launch_dummy_health_check_server(host, port, enable_metrics):
2164
2319
 
2165
2320
  app = FastAPI()
2166
2321
 
2322
+ @app.get("/ping")
2323
+ async def ping():
2324
+ """Could be used by the checkpoint-engine update script to confirm the server is up."""
2325
+ return Response(status_code=200)
2326
+
2167
2327
  @app.get("/health")
2168
2328
  async def health():
2169
2329
  """Check the health of the http server."""
@@ -2286,6 +2446,8 @@ def retry(
2286
2446
  try:
2287
2447
  return fn()
2288
2448
  except Exception as e:
2449
+ traceback.print_exc()
2450
+
2289
2451
  if try_index >= max_retry:
2290
2452
  raise Exception(f"retry() exceed maximum number of retries.")
2291
2453
 
@@ -2299,11 +2461,30 @@ def retry(
2299
2461
  logger.warning(
2300
2462
  f"retry() failed once ({try_index}th try, maximum {max_retry} retries). Will delay {delay:.2f}s and retry. Error: {e}"
2301
2463
  )
2302
- traceback.print_exc()
2303
2464
 
2304
2465
  time.sleep(delay)
2305
2466
 
2306
2467
 
2468
def has_hf_quant_config(model_path: str) -> bool:
    """Report whether ``hf_quant_config.json`` is available for a model.

    The local filesystem is checked first; if the file is not found there,
    ``model_path`` is passed to ``HfApi.file_exists`` as a repository id.

    Args:
        model_path: Path to the model, can be local path or remote URL.

    Returns:
        True if hf_quant_config.json exists, False otherwise. Any error during
        the Hub lookup (including a missing ``huggingface_hub`` package) is
        treated as "not found".
    """
    local_candidate = os.path.join(model_path, "hf_quant_config.json")
    if os.path.exists(local_candidate):
        return True

    try:
        from huggingface_hub import HfApi

        return HfApi().file_exists(model_path, "hf_quant_config.json")
    except Exception:
        # Best-effort probe: failures are reported as absence.
        return False
2486
+
2487
+
2307
2488
  def flatten_nested_list(nested_list):
2308
2489
  if isinstance(nested_list, list):
2309
2490
  return [
@@ -2439,17 +2620,12 @@ def get_local_ip_auto(fallback: str = None) -> str:
2439
2620
  raise ValueError("Can not get local ip")
2440
2621
 
2441
2622
 
2442
- def is_page_size_one(server_args):
2443
- return server_args.page_size == 1
2444
-
2445
-
2446
2623
  # TODO(hebiao064): Accelerate FA3 Spec Decode with topk > 1.
2447
2624
  # TODO(hebiao064): Improve the acc rate for FA3 Spec Decode with topk == 1 and page_size > 1.
2448
2625
  def is_no_spec_infer_or_topk_one(server_args):
2449
2626
  return server_args.speculative_eagle_topk is None or (
2450
- server_args.speculative_eagle_topk is not None
2451
- and server_args.speculative_eagle_topk == 1
2452
- and is_page_size_one(server_args)
2627
+ server_args.speculative_eagle_topk == 1
2628
+ and (server_args.page_size == 1 or server_args.page_size is None)
2453
2629
  )
2454
2630
 
2455
2631
 
@@ -2461,6 +2637,7 @@ def is_fa3_default_architecture(hf_config):
2461
2637
  "Qwen2ForCausalLM",
2462
2638
  "Llama4ForConditionalGeneration",
2463
2639
  "LlamaForCausalLM",
2640
+ "Olmo2ForCausalLM",
2464
2641
  "Gemma2ForCausalLM",
2465
2642
  "Gemma3ForConditionalGeneration",
2466
2643
  "Qwen3ForCausalLM",
@@ -2494,9 +2671,9 @@ def log_info_on_rank0(logger, msg):
2494
2671
 
2495
2672
  def load_json_config(data: str):
2496
2673
  try:
2497
- return json.loads(data)
2674
+ return orjson.loads(data)
2498
2675
  except JSONDecodeError:
2499
- return json.loads(Path(data).read_text())
2676
+ return orjson.loads(Path(data).read_text())
2500
2677
 
2501
2678
 
2502
2679
  def dispose_tensor(x: torch.Tensor):
@@ -2863,7 +3040,7 @@ def get_cpu_ids_by_node():
2863
3040
  def is_shm_available(dtype, world_size, local_size):
2864
3041
  return (
2865
3042
  cpu_has_amx_support()
2866
- and dtype in [torch.bfloat16, torch.float]
3043
+ and dtype in [torch.bfloat16, torch.float16, torch.float]
2867
3044
  and world_size >= 1
2868
3045
  and world_size == local_size
2869
3046
  )
@@ -2914,10 +3091,6 @@ def lru_cache_frozenset(maxsize=128):
2914
3091
  return decorator
2915
3092
 
2916
3093
 
2917
- def get_origin_rid(rid):
2918
- return rid.split("_", 1)[1] if "_" in rid else rid
2919
-
2920
-
2921
3094
  def apply_module_patch(target_module, target_function, wrappers):
2922
3095
  original_module, original_function = parse_module_path(
2923
3096
  target_module, target_function, False
@@ -3205,7 +3378,7 @@ def numa_bind_to_node(node: int):
3205
3378
 
3206
3379
  def json_list_type(value):
3207
3380
  try:
3208
- return json.loads(value)
3381
+ return orjson.loads(value)
3209
3382
  except json.JSONDecodeError:
3210
3383
  raise argparse.ArgumentTypeError(
3211
3384
  f"Invalid JSON list: {value}. Please provide a valid JSON list."
@@ -3213,7 +3386,12 @@ def json_list_type(value):
3213
3386
 
3214
3387
 
3215
3388
  @contextmanager
3216
- def temp_set_cuda_visible_devices(gpu_id: int):
3389
+ def maybe_reindex_device_id(gpu_id: int):
3390
+
3391
+ if envs.SGLANG_ONE_VISIBLE_DEVICE_PER_PROCESS.get() is False or not is_cuda_alike():
3392
+ yield gpu_id
3393
+ return
3394
+
3217
3395
  original_cuda_visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES")
3218
3396
  if original_cuda_visible_devices:
3219
3397
  cuda_visible_devices = original_cuda_visible_devices.split(",")
@@ -3222,7 +3400,11 @@ def temp_set_cuda_visible_devices(gpu_id: int):
3222
3400
 
3223
3401
  str_gpu_id = cuda_visible_devices[gpu_id] if cuda_visible_devices else str(gpu_id)
3224
3402
  os.environ["CUDA_VISIBLE_DEVICES"] = str_gpu_id
3225
- yield
3403
+
3404
+ logger.debug(f"Set CUDA_VISIBLE_DEVICES to {str_gpu_id}")
3405
+
3406
+ yield 0
3407
+
3226
3408
  if original_cuda_visible_devices:
3227
3409
  os.environ["CUDA_VISIBLE_DEVICES"] = original_cuda_visible_devices
3228
3410
  else:
@@ -3383,3 +3565,11 @@ def cached_triton_kernel(key_fn=None):
3383
3565
  return CachedKernel(fn, key_fn)
3384
3566
 
3385
3567
  return decorator
3568
+
3569
+
3570
+ # Copy from: https://github.com/deepseek-ai/DeepGEMM/blob/main/deep_gemm/utils.py
3571
def calc_diff(x, y):
    """Return ``1 - 2 * sum(x * y) / sum(x * x + y * y)`` computed in float64.

    Both inputs are converted with ``.double()`` before any arithmetic. The
    result is 0 when ``x`` and ``y`` are identical and grows as they diverge.
    No guard is applied for an all-zero denominator.
    """
    x64 = x.double()
    y64 = y.double()
    denominator = (x64 * x64 + y64 * y64).sum()
    similarity = 2 * (x64 * y64).sum() / denominator
    return 1 - similarity