sglang 0.5.3rc2__py3-none-any.whl → 0.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (408)
  1. sglang/bench_one_batch.py +47 -28
  2. sglang/bench_one_batch_server.py +41 -25
  3. sglang/bench_serving.py +330 -156
  4. sglang/check_env.py +1 -1
  5. sglang/compile_deep_gemm.py +6 -2
  6. sglang/global_config.py +1 -25
  7. sglang/lang/api.py +6 -0
  8. sglang/lang/interpreter.py +1 -0
  9. sglang/lang/ir.py +13 -0
  10. sglang/launch_server.py +8 -15
  11. sglang/profiler.py +18 -1
  12. sglang/srt/_custom_ops.py +1 -1
  13. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +4 -6
  14. sglang/srt/checkpoint_engine/checkpoint_engine_worker.py +142 -0
  15. sglang/srt/compilation/backend.py +437 -0
  16. sglang/srt/compilation/compilation_config.py +20 -0
  17. sglang/srt/compilation/compilation_counter.py +47 -0
  18. sglang/srt/compilation/compile.py +210 -0
  19. sglang/srt/compilation/compiler_interface.py +503 -0
  20. sglang/srt/compilation/cuda_piecewise_backend.py +228 -0
  21. sglang/srt/compilation/fix_functionalization.py +134 -0
  22. sglang/srt/compilation/fx_utils.py +83 -0
  23. sglang/srt/compilation/inductor_pass.py +140 -0
  24. sglang/srt/compilation/pass_manager.py +66 -0
  25. sglang/srt/compilation/piecewise_context_manager.py +40 -0
  26. sglang/srt/compilation/weak_ref_tensor_jit.py +16 -0
  27. sglang/srt/configs/__init__.py +4 -0
  28. sglang/srt/configs/deepseek_ocr.py +262 -0
  29. sglang/srt/configs/deepseekvl2.py +194 -96
  30. sglang/srt/configs/dots_vlm.py +2 -7
  31. sglang/srt/configs/falcon_h1.py +13 -64
  32. sglang/srt/configs/load_config.py +25 -2
  33. sglang/srt/configs/mamba_utils.py +117 -0
  34. sglang/srt/configs/model_config.py +134 -23
  35. sglang/srt/configs/modelopt_config.py +30 -0
  36. sglang/srt/configs/nemotron_h.py +286 -0
  37. sglang/srt/configs/olmo3.py +105 -0
  38. sglang/srt/configs/points_v15_chat.py +29 -0
  39. sglang/srt/configs/qwen3_next.py +11 -47
  40. sglang/srt/configs/qwen3_omni.py +613 -0
  41. sglang/srt/configs/qwen3_vl.py +0 -10
  42. sglang/srt/connector/remote_instance.py +1 -1
  43. sglang/srt/constrained/base_grammar_backend.py +5 -1
  44. sglang/srt/constrained/llguidance_backend.py +5 -0
  45. sglang/srt/constrained/outlines_backend.py +1 -1
  46. sglang/srt/constrained/reasoner_grammar_backend.py +9 -6
  47. sglang/srt/constrained/utils.py +12 -0
  48. sglang/srt/constrained/xgrammar_backend.py +20 -11
  49. sglang/srt/disaggregation/ascend/transfer_engine.py +1 -1
  50. sglang/srt/disaggregation/base/conn.py +17 -4
  51. sglang/srt/disaggregation/common/conn.py +4 -2
  52. sglang/srt/disaggregation/decode.py +123 -31
  53. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +1 -1
  54. sglang/srt/disaggregation/fake/conn.py +11 -3
  55. sglang/srt/disaggregation/mooncake/conn.py +157 -19
  56. sglang/srt/disaggregation/nixl/conn.py +69 -24
  57. sglang/srt/disaggregation/prefill.py +96 -270
  58. sglang/srt/distributed/device_communicators/all_reduce_utils.py +4 -4
  59. sglang/srt/distributed/device_communicators/custom_all_reduce.py +6 -6
  60. sglang/srt/distributed/device_communicators/pymscclpp.py +2 -2
  61. sglang/srt/distributed/device_communicators/pynccl.py +24 -12
  62. sglang/srt/distributed/device_communicators/pynccl_allocator.py +2 -2
  63. sglang/srt/distributed/device_communicators/symm_mem.py +1 -1
  64. sglang/srt/distributed/naive_distributed.py +5 -4
  65. sglang/srt/distributed/parallel_state.py +70 -19
  66. sglang/srt/elastic_ep/elastic_ep.py +74 -0
  67. sglang/srt/entrypoints/context.py +3 -2
  68. sglang/srt/entrypoints/engine.py +66 -66
  69. sglang/srt/entrypoints/grpc_server.py +431 -234
  70. sglang/srt/entrypoints/harmony_utils.py +2 -2
  71. sglang/srt/entrypoints/http_server.py +120 -8
  72. sglang/srt/entrypoints/http_server_engine.py +1 -7
  73. sglang/srt/entrypoints/openai/protocol.py +225 -37
  74. sglang/srt/entrypoints/openai/serving_base.py +49 -2
  75. sglang/srt/entrypoints/openai/serving_chat.py +29 -74
  76. sglang/srt/entrypoints/openai/serving_classify.py +204 -0
  77. sglang/srt/entrypoints/openai/serving_completions.py +15 -1
  78. sglang/srt/entrypoints/openai/serving_responses.py +5 -2
  79. sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
  80. sglang/srt/environ.py +42 -4
  81. sglang/srt/eplb/eplb_algorithms/__init__.py +18 -1
  82. sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -2
  83. sglang/srt/eplb/eplb_algorithms/elasticity_aware.py +87 -0
  84. sglang/srt/eplb/expert_distribution.py +3 -4
  85. sglang/srt/eplb/expert_location_dispatch.py +2 -2
  86. sglang/srt/eplb/expert_location_updater.py +2 -2
  87. sglang/srt/function_call/base_format_detector.py +17 -18
  88. sglang/srt/function_call/function_call_parser.py +18 -14
  89. sglang/srt/function_call/glm4_moe_detector.py +1 -5
  90. sglang/srt/function_call/gpt_oss_detector.py +1 -1
  91. sglang/srt/function_call/json_array_parser.py +0 -2
  92. sglang/srt/function_call/utils.py +2 -2
  93. sglang/srt/grpc/compile_proto.py +3 -3
  94. sglang/srt/{entrypoints → grpc}/grpc_request_manager.py +112 -52
  95. sglang/srt/grpc/health_servicer.py +189 -0
  96. sglang/srt/grpc/scheduler_launcher.py +181 -0
  97. sglang/srt/grpc/sglang_scheduler_pb2.py +78 -70
  98. sglang/srt/grpc/sglang_scheduler_pb2.pyi +66 -10
  99. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +89 -1
  100. sglang/srt/layers/activation.py +4 -1
  101. sglang/srt/layers/attention/aiter_backend.py +3 -3
  102. sglang/srt/layers/attention/ascend_backend.py +17 -1
  103. sglang/srt/layers/attention/attention_registry.py +43 -23
  104. sglang/srt/layers/attention/base_attn_backend.py +20 -1
  105. sglang/srt/layers/attention/double_sparsity_backend.py +2 -2
  106. sglang/srt/layers/attention/fla/chunk.py +0 -1
  107. sglang/srt/layers/attention/fla/chunk_o.py +1 -1
  108. sglang/srt/layers/attention/fla/index.py +0 -2
  109. sglang/srt/layers/attention/fla/layernorm_gated.py +50 -32
  110. sglang/srt/layers/attention/fla/utils.py +0 -3
  111. sglang/srt/layers/attention/fla/wy_fast.py +0 -2
  112. sglang/srt/layers/attention/flashattention_backend.py +12 -8
  113. sglang/srt/layers/attention/flashinfer_backend.py +248 -21
  114. sglang/srt/layers/attention/flashinfer_mla_backend.py +20 -18
  115. sglang/srt/layers/attention/flashmla_backend.py +2 -2
  116. sglang/srt/layers/attention/hybrid_attn_backend.py +1 -1
  117. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +165 -62
  118. sglang/srt/layers/attention/intel_amx_backend.py +1 -1
  119. sglang/srt/layers/attention/mamba/causal_conv1d.py +1 -1
  120. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +9 -5
  121. sglang/srt/layers/attention/mamba/mamba.py +189 -241
  122. sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
  123. sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
  124. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +0 -50
  125. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +0 -60
  126. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +0 -111
  127. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +0 -1
  128. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +0 -11
  129. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +1 -1
  130. sglang/srt/layers/attention/nsa/nsa_indexer.py +40 -83
  131. sglang/srt/layers/attention/nsa/triton_kernel.py +136 -0
  132. sglang/srt/layers/attention/nsa/utils.py +0 -1
  133. sglang/srt/layers/attention/nsa_backend.py +404 -90
  134. sglang/srt/layers/attention/triton_backend.py +208 -34
  135. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +2 -2
  136. sglang/srt/layers/attention/triton_ops/extend_attention.py +539 -44
  137. sglang/srt/layers/attention/trtllm_mha_backend.py +2 -2
  138. sglang/srt/layers/attention/trtllm_mla_backend.py +361 -30
  139. sglang/srt/layers/attention/utils.py +11 -7
  140. sglang/srt/layers/attention/vision.py +3 -3
  141. sglang/srt/layers/attention/xpu_backend.py +1028 -0
  142. sglang/srt/layers/communicator.py +11 -7
  143. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/compile_utils.py +4 -8
  144. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/configurer.py +4 -3
  145. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/entrypoint.py +3 -3
  146. sglang/srt/layers/dp_attention.py +17 -0
  147. sglang/srt/layers/layernorm.py +45 -15
  148. sglang/srt/layers/linear.py +9 -1
  149. sglang/srt/layers/logits_processor.py +147 -17
  150. sglang/srt/layers/modelopt_utils.py +11 -0
  151. sglang/srt/layers/moe/cutlass_moe.py +0 -2
  152. sglang/srt/layers/moe/cutlass_w4a8_moe.py +213 -21
  153. sglang/srt/layers/moe/ep_moe/kernels.py +35 -457
  154. sglang/srt/layers/moe/ep_moe/layer.py +119 -397
  155. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +1 -1
  156. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  157. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json +146 -0
  158. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +11 -3
  159. sglang/srt/layers/moe/fused_moe_triton/layer.py +76 -70
  160. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +18 -42
  161. sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
  162. sglang/srt/layers/moe/moe_runner/runner.py +3 -0
  163. sglang/srt/layers/moe/moe_runner/triton.py +3 -1
  164. sglang/srt/layers/moe/rocm_moe_utils.py +0 -1
  165. sglang/srt/layers/moe/router.py +51 -15
  166. sglang/srt/layers/moe/token_dispatcher/__init__.py +10 -0
  167. sglang/srt/layers/moe/token_dispatcher/base.py +1 -1
  168. sglang/srt/layers/moe/token_dispatcher/deepep.py +110 -97
  169. sglang/srt/layers/moe/token_dispatcher/mooncake.py +386 -0
  170. sglang/srt/layers/moe/token_dispatcher/standard.py +46 -0
  171. sglang/srt/layers/moe/topk.py +3 -2
  172. sglang/srt/layers/moe/utils.py +17 -1
  173. sglang/srt/layers/quantization/__init__.py +2 -53
  174. sglang/srt/layers/quantization/awq.py +183 -6
  175. sglang/srt/layers/quantization/awq_triton.py +29 -0
  176. sglang/srt/layers/quantization/base_config.py +20 -1
  177. sglang/srt/layers/quantization/compressed_tensors/__init__.py +7 -0
  178. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +20 -49
  179. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +421 -70
  180. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +3 -0
  181. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +4 -22
  182. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +339 -0
  183. sglang/srt/layers/quantization/fp8.py +84 -18
  184. sglang/srt/layers/quantization/fp8_kernel.py +55 -10
  185. sglang/srt/layers/quantization/fp8_utils.py +42 -14
  186. sglang/srt/layers/quantization/fpgemm_fp8.py +2 -3
  187. sglang/srt/layers/quantization/gptq.py +0 -1
  188. sglang/srt/layers/quantization/int8_kernel.py +18 -2
  189. sglang/srt/layers/quantization/marlin_utils.py +12 -0
  190. sglang/srt/layers/quantization/modelopt_quant.py +125 -100
  191. sglang/srt/layers/quantization/mxfp4.py +5 -30
  192. sglang/srt/layers/quantization/petit.py +1 -1
  193. sglang/srt/layers/quantization/quark/quark.py +3 -1
  194. sglang/srt/layers/quantization/quark/quark_moe.py +3 -3
  195. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +0 -7
  196. sglang/srt/layers/quantization/unquant.py +1 -4
  197. sglang/srt/layers/quantization/utils.py +0 -1
  198. sglang/srt/layers/quantization/w4afp8.py +51 -20
  199. sglang/srt/layers/quantization/w8a8_int8.py +30 -24
  200. sglang/srt/layers/radix_attention.py +59 -9
  201. sglang/srt/layers/rotary_embedding.py +673 -16
  202. sglang/srt/layers/sampler.py +36 -16
  203. sglang/srt/layers/sparse_pooler.py +98 -0
  204. sglang/srt/layers/utils.py +0 -1
  205. sglang/srt/layers/vocab_parallel_embedding.py +4 -1
  206. sglang/srt/lora/backend/triton_backend.py +0 -1
  207. sglang/srt/lora/eviction_policy.py +139 -0
  208. sglang/srt/lora/lora_manager.py +24 -9
  209. sglang/srt/lora/lora_registry.py +1 -1
  210. sglang/srt/lora/mem_pool.py +40 -16
  211. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +1 -1
  212. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +4 -2
  213. sglang/srt/managers/cache_controller.py +48 -17
  214. sglang/srt/managers/data_parallel_controller.py +146 -42
  215. sglang/srt/managers/detokenizer_manager.py +40 -13
  216. sglang/srt/managers/io_struct.py +66 -16
  217. sglang/srt/managers/mm_utils.py +20 -18
  218. sglang/srt/managers/multi_tokenizer_mixin.py +66 -81
  219. sglang/srt/managers/overlap_utils.py +96 -19
  220. sglang/srt/managers/schedule_batch.py +241 -511
  221. sglang/srt/managers/schedule_policy.py +15 -2
  222. sglang/srt/managers/scheduler.py +399 -499
  223. sglang/srt/managers/scheduler_metrics_mixin.py +55 -8
  224. sglang/srt/managers/scheduler_output_processor_mixin.py +317 -111
  225. sglang/srt/managers/scheduler_pp_mixin.py +341 -0
  226. sglang/srt/managers/scheduler_profiler_mixin.py +57 -10
  227. sglang/srt/managers/scheduler_runtime_checker_mixin.py +217 -0
  228. sglang/srt/managers/scheduler_update_weights_mixin.py +33 -14
  229. sglang/srt/managers/tokenizer_communicator_mixin.py +71 -55
  230. sglang/srt/managers/tokenizer_manager.py +378 -90
  231. sglang/srt/managers/tp_worker.py +212 -161
  232. sglang/srt/managers/utils.py +78 -2
  233. sglang/srt/mem_cache/allocator.py +7 -2
  234. sglang/srt/mem_cache/allocator_ascend.py +2 -2
  235. sglang/srt/mem_cache/base_prefix_cache.py +2 -2
  236. sglang/srt/mem_cache/chunk_cache.py +13 -2
  237. sglang/srt/mem_cache/common.py +480 -0
  238. sglang/srt/mem_cache/evict_policy.py +16 -1
  239. sglang/srt/mem_cache/hicache_storage.py +4 -1
  240. sglang/srt/mem_cache/hiradix_cache.py +16 -3
  241. sglang/srt/mem_cache/mamba_radix_cache.py +993 -0
  242. sglang/srt/mem_cache/memory_pool.py +435 -219
  243. sglang/srt/mem_cache/memory_pool_host.py +0 -1
  244. sglang/srt/mem_cache/multimodal_cache.py +0 -1
  245. sglang/srt/mem_cache/radix_cache.py +53 -19
  246. sglang/srt/mem_cache/radix_cache_cpp.py +19 -14
  247. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +8 -2
  248. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +1 -13
  249. sglang/srt/mem_cache/storage/backend_factory.py +2 -2
  250. sglang/srt/mem_cache/storage/eic/eic_storage.py +5 -6
  251. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +0 -1
  252. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +9 -3
  253. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +5 -3
  254. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +101 -17
  255. sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +38 -9
  256. sglang/srt/mem_cache/storage/nixl/nixl_utils.py +1 -1
  257. sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +17 -2
  258. sglang/srt/mem_cache/swa_radix_cache.py +92 -26
  259. sglang/srt/metrics/collector.py +31 -0
  260. sglang/srt/metrics/func_timer.py +1 -1
  261. sglang/srt/model_executor/cuda_graph_runner.py +43 -5
  262. sglang/srt/model_executor/forward_batch_info.py +28 -23
  263. sglang/srt/model_executor/model_runner.py +379 -139
  264. sglang/srt/model_executor/npu_graph_runner.py +2 -3
  265. sglang/srt/model_executor/piecewise_cuda_graph_runner.py +539 -0
  266. sglang/srt/model_loader/__init__.py +1 -1
  267. sglang/srt/model_loader/loader.py +424 -27
  268. sglang/srt/model_loader/utils.py +0 -1
  269. sglang/srt/model_loader/weight_utils.py +47 -28
  270. sglang/srt/models/apertus.py +2 -3
  271. sglang/srt/models/arcee.py +2 -2
  272. sglang/srt/models/bailing_moe.py +13 -52
  273. sglang/srt/models/bailing_moe_nextn.py +3 -4
  274. sglang/srt/models/bert.py +1 -1
  275. sglang/srt/models/deepseek_nextn.py +19 -3
  276. sglang/srt/models/deepseek_ocr.py +1516 -0
  277. sglang/srt/models/deepseek_v2.py +273 -98
  278. sglang/srt/models/dots_ocr.py +0 -2
  279. sglang/srt/models/dots_vlm.py +0 -1
  280. sglang/srt/models/dots_vlm_vit.py +1 -1
  281. sglang/srt/models/falcon_h1.py +13 -19
  282. sglang/srt/models/gemma3_mm.py +16 -0
  283. sglang/srt/models/gemma3n_mm.py +1 -2
  284. sglang/srt/models/glm4_moe.py +14 -37
  285. sglang/srt/models/glm4_moe_nextn.py +2 -2
  286. sglang/srt/models/glm4v.py +2 -1
  287. sglang/srt/models/glm4v_moe.py +5 -5
  288. sglang/srt/models/gpt_oss.py +5 -5
  289. sglang/srt/models/grok.py +10 -23
  290. sglang/srt/models/hunyuan.py +2 -7
  291. sglang/srt/models/interns1.py +0 -1
  292. sglang/srt/models/kimi_vl.py +1 -7
  293. sglang/srt/models/kimi_vl_moonvit.py +3 -1
  294. sglang/srt/models/llama.py +2 -2
  295. sglang/srt/models/llama_eagle3.py +1 -1
  296. sglang/srt/models/longcat_flash.py +5 -22
  297. sglang/srt/models/longcat_flash_nextn.py +3 -14
  298. sglang/srt/models/mimo.py +2 -13
  299. sglang/srt/models/mimo_mtp.py +1 -2
  300. sglang/srt/models/minicpmo.py +7 -5
  301. sglang/srt/models/mixtral.py +1 -4
  302. sglang/srt/models/mllama.py +1 -1
  303. sglang/srt/models/mllama4.py +13 -3
  304. sglang/srt/models/nemotron_h.py +511 -0
  305. sglang/srt/models/olmo2.py +31 -4
  306. sglang/srt/models/opt.py +5 -5
  307. sglang/srt/models/phi.py +1 -1
  308. sglang/srt/models/phi4mm.py +1 -1
  309. sglang/srt/models/phimoe.py +0 -1
  310. sglang/srt/models/pixtral.py +0 -3
  311. sglang/srt/models/points_v15_chat.py +186 -0
  312. sglang/srt/models/qwen.py +0 -1
  313. sglang/srt/models/qwen2_5_vl.py +3 -3
  314. sglang/srt/models/qwen2_audio.py +2 -15
  315. sglang/srt/models/qwen2_moe.py +15 -12
  316. sglang/srt/models/qwen2_vl.py +5 -2
  317. sglang/srt/models/qwen3_moe.py +19 -35
  318. sglang/srt/models/qwen3_next.py +7 -12
  319. sglang/srt/models/qwen3_next_mtp.py +3 -4
  320. sglang/srt/models/qwen3_omni_moe.py +661 -0
  321. sglang/srt/models/qwen3_vl.py +37 -33
  322. sglang/srt/models/qwen3_vl_moe.py +57 -185
  323. sglang/srt/models/roberta.py +55 -3
  324. sglang/srt/models/sarashina2_vision.py +0 -1
  325. sglang/srt/models/step3_vl.py +3 -5
  326. sglang/srt/models/utils.py +11 -1
  327. sglang/srt/multimodal/processors/base_processor.py +6 -2
  328. sglang/srt/multimodal/processors/deepseek_ocr.py +37 -0
  329. sglang/srt/multimodal/processors/deepseek_vl_v2.py +0 -3
  330. sglang/srt/multimodal/processors/dots_vlm.py +0 -1
  331. sglang/srt/multimodal/processors/glm4v.py +1 -5
  332. sglang/srt/multimodal/processors/internvl.py +0 -2
  333. sglang/srt/multimodal/processors/janus_pro.py +0 -1
  334. sglang/srt/multimodal/processors/mllama4.py +0 -8
  335. sglang/srt/multimodal/processors/phi4mm.py +0 -1
  336. sglang/srt/multimodal/processors/points_v15_chat.py +52 -0
  337. sglang/srt/multimodal/processors/qwen_vl.py +75 -16
  338. sglang/srt/multimodal/processors/step3_vl.py +1 -1
  339. sglang/srt/parser/conversation.py +41 -0
  340. sglang/srt/parser/reasoning_parser.py +0 -1
  341. sglang/srt/sampling/custom_logit_processor.py +77 -2
  342. sglang/srt/sampling/sampling_batch_info.py +17 -22
  343. sglang/srt/sampling/sampling_params.py +70 -2
  344. sglang/srt/server_args.py +577 -73
  345. sglang/srt/server_args_config_parser.py +1 -1
  346. sglang/srt/single_batch_overlap.py +38 -28
  347. sglang/srt/speculative/base_spec_worker.py +34 -0
  348. sglang/srt/speculative/draft_utils.py +226 -0
  349. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +24 -7
  350. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +23 -2
  351. sglang/srt/speculative/eagle_info.py +57 -18
  352. sglang/srt/speculative/eagle_info_v2.py +458 -0
  353. sglang/srt/speculative/eagle_utils.py +138 -0
  354. sglang/srt/speculative/eagle_worker.py +83 -280
  355. sglang/srt/speculative/eagle_worker_v2.py +702 -0
  356. sglang/srt/speculative/{ngram_utils.py → ngram_info.py} +14 -9
  357. sglang/srt/speculative/ngram_worker.py +12 -11
  358. sglang/srt/speculative/spec_info.py +2 -0
  359. sglang/srt/speculative/spec_utils.py +38 -3
  360. sglang/srt/speculative/standalone_worker.py +4 -14
  361. sglang/srt/tokenizer/tiktoken_tokenizer.py +2 -2
  362. sglang/srt/two_batch_overlap.py +28 -14
  363. sglang/srt/utils/__init__.py +1 -1
  364. sglang/srt/{bench_utils.py → utils/bench_utils.py} +4 -2
  365. sglang/srt/utils/common.py +192 -47
  366. sglang/srt/utils/hf_transformers_utils.py +40 -17
  367. sglang/srt/{host_shared_memory.py → utils/host_shared_memory.py} +0 -1
  368. sglang/srt/{offloader.py → utils/offloader.py} +4 -4
  369. sglang/srt/utils/profile_merger.py +199 -0
  370. sglang/test/attention/test_flashattn_backend.py +1 -1
  371. sglang/test/attention/test_flashattn_mla_backend.py +0 -1
  372. sglang/test/attention/test_prefix_chunk_info.py +0 -2
  373. sglang/test/attention/test_trtllm_mla_backend.py +221 -53
  374. sglang/test/few_shot_gsm8k_engine.py +2 -4
  375. sglang/test/kit_matched_stop.py +157 -0
  376. sglang/test/longbench_v2/__init__.py +1 -0
  377. sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
  378. sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
  379. sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
  380. sglang/test/run_eval.py +41 -0
  381. sglang/test/runners.py +2 -0
  382. sglang/test/send_one.py +42 -7
  383. sglang/test/simple_eval_common.py +3 -0
  384. sglang/test/simple_eval_gpqa.py +0 -1
  385. sglang/test/simple_eval_humaneval.py +0 -3
  386. sglang/test/simple_eval_longbench_v2.py +344 -0
  387. sglang/test/test_block_fp8.py +1 -2
  388. sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -1
  389. sglang/test/test_cutlass_moe.py +1 -2
  390. sglang/test/test_cutlass_w4a8_moe.py +10 -20
  391. sglang/test/test_deterministic.py +232 -99
  392. sglang/test/test_deterministic_utils.py +73 -0
  393. sglang/test/test_disaggregation_utils.py +81 -0
  394. sglang/test/test_marlin_moe.py +0 -1
  395. sglang/test/test_utils.py +85 -20
  396. sglang/version.py +1 -1
  397. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/METADATA +45 -33
  398. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/RECORD +404 -345
  399. sglang/srt/layers/attention/mamba/mamba_utils.py +0 -81
  400. sglang/srt/managers/tp_worker_overlap_thread.py +0 -311
  401. sglang/srt/speculative/build_eagle_tree.py +0 -427
  402. sglang/test/test_block_fp8_ep.py +0 -358
  403. /sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/__init__.py +0 -0
  404. /sglang/srt/{aio_rwlock.py → utils/aio_rwlock.py} +0 -0
  405. /sglang/srt/{torch_memory_saver_adapter.py → utils/torch_memory_saver_adapter.py} +0 -0
  406. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/WHEEL +0 -0
  407. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/licenses/LICENSE +0 -0
  408. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/top_level.txt +0 -0
@@ -12,7 +12,6 @@
12
12
  # limitations under the License.
13
13
  # ==============================================================================
14
14
  """Common utilities."""
15
-
16
15
  from __future__ import annotations
17
16
 
18
17
  import argparse
@@ -43,6 +42,7 @@ import tempfile
43
42
  import threading
44
43
  import time
45
44
  import traceback
45
+ import types
46
46
  import uuid
47
47
  import warnings
48
48
  from collections import OrderedDict, defaultdict
@@ -56,6 +56,7 @@ from json import JSONDecodeError
56
56
  from multiprocessing.reduction import ForkingPickler
57
57
  from pathlib import Path
58
58
  from typing import (
59
+ TYPE_CHECKING,
59
60
  Any,
60
61
  Callable,
61
62
  Dict,
@@ -63,6 +64,7 @@ from typing import (
63
64
  List,
64
65
  Optional,
65
66
  Protocol,
67
+ Sequence,
66
68
  Set,
67
69
  Tuple,
68
70
  TypeVar,
@@ -70,6 +72,7 @@ from typing import (
70
72
  )
71
73
 
72
74
  import numpy as np
75
+ import orjson
73
76
  import psutil
74
77
  import pybase64
75
78
  import requests
@@ -88,8 +91,12 @@ from torch.profiler import ProfilerActivity, profile, record_function
88
91
  from torch.utils._contextlib import _DecoratorContextManager
89
92
  from typing_extensions import Literal
90
93
 
94
+ from sglang.srt.environ import envs
91
95
  from sglang.srt.metrics.func_timer import enable_func_timer
92
96
 
97
+ if TYPE_CHECKING:
98
+ from sglang.srt.layers.quantization.base_config import QuantizeMethodBase
99
+
93
100
  logger = logging.getLogger(__name__)
94
101
 
95
102
  show_time_cost = False
@@ -162,6 +169,20 @@ def _check(cc_major):
162
169
  ) >= (12, 3)
163
170
 
164
171
 
172
+ @contextmanager
173
+ def device_context(device: torch.device):
174
+ if device.type == "cpu" and is_cpu():
175
+ with torch.device("cpu"):
176
+ yield
177
+ else:
178
+ module = torch.get_device_module(device)
179
+ if module is not None:
180
+ with module.device(device.index):
181
+ yield
182
+ else:
183
+ raise ValueError(f"Unknown device module: {device}")
184
+
185
+
165
186
  is_ampere_with_cuda_12_3 = lambda: _check(8)
166
187
  is_hopper_with_cuda_12_3 = lambda: _check(9)
167
188
 
@@ -173,6 +194,15 @@ def is_blackwell():
173
194
  return torch.cuda.get_device_capability()[0] == 10
174
195
 
175
196
 
197
+ @lru_cache(maxsize=1)
198
+ def is_sm120_supported(device=None) -> bool:
199
+ if not is_cuda_alike():
200
+ return False
201
+ return (torch.cuda.get_device_capability(device)[0] == 12) and (
202
+ torch.version.cuda >= "12.8"
203
+ )
204
+
205
+
176
206
  @lru_cache(maxsize=1)
177
207
  def is_sm100_supported(device=None) -> bool:
178
208
  if not is_cuda_alike():
@@ -228,7 +258,7 @@ def support_triton(backend: str) -> bool:
228
258
 
229
259
 
230
260
  try:
231
- import sgl_kernel
261
+ import sgl_kernel # noqa: F401
232
262
 
233
263
  is_intel_amx_backend_available = hasattr(
234
264
  torch.ops.sgl_kernel, "convert_weight_packed"
@@ -253,6 +283,14 @@ def use_intel_amx_backend(layer):
253
283
  return getattr(layer, "use_intel_amx_backend", False)
254
284
 
255
285
 
286
+ def xpu_has_xmx_support():
287
+ # TODO: update with XPU capalibity query
288
+ if is_xpu():
289
+ # currently only PVC/LNL/BMG supports F64, so we only support these now
290
+ return torch.xpu.get_device_properties().has_fp64
291
+ return False
292
+
293
+
256
294
  def is_flashinfer_available():
257
295
  """
258
296
  Check whether flashinfer is available.
@@ -263,6 +301,17 @@ def is_flashinfer_available():
263
301
  return importlib.util.find_spec("flashinfer") is not None and is_cuda()
264
302
 
265
303
 
304
+ def is_nvidia_cublas_cu12_version_ge_12_9():
305
+ """
306
+ temporary fix for issue #11272
307
+ """
308
+ try:
309
+ installed_version = version("nvidia-cublas-cu12")
310
+ except PackageNotFoundError:
311
+ return False
312
+ return pkg_version.parse(installed_version) >= pkg_version.parse("12.9")
313
+
314
+
266
315
  def random_uuid() -> str:
267
316
  return str(uuid.uuid4().hex)
268
317
 
@@ -409,7 +458,15 @@ def get_available_gpu_memory(
409
458
 
410
459
  if empty_cache:
411
460
  torch.cuda.empty_cache()
412
- free_gpu_memory, _ = torch.cuda.mem_get_info(gpu_id)
461
+ SHARED_SYSMEM_DEVICE_MEM_SMS = (87, 110, 121) # Orin, Thor, Spark
462
+ if get_device_sm() in SHARED_SYSMEM_DEVICE_MEM_SMS:
463
+ # On these devices, which use sysmem as device mem, torch.cuda.mem_get_info()
464
+ # only reports "free" memory, which can be lower than what is actually
465
+ # available due to not including cache memory. So we use the system available
466
+ # memory metric instead.
467
+ free_gpu_memory = psutil.virtual_memory().available
468
+ else:
469
+ free_gpu_memory, _ = torch.cuda.mem_get_info(gpu_id)
413
470
 
414
471
  elif device == "xpu":
415
472
  num_gpus = torch.xpu.device_count()
@@ -453,6 +510,8 @@ def get_available_gpu_memory(
453
510
  f"WARNING: current device is not {gpu_id}, but {torch.npu.current_device()}, ",
454
511
  "which may cause useless memory allocation for torch NPU context.",
455
512
  )
513
+ if empty_cache:
514
+ torch.npu.empty_cache()
456
515
  free_gpu_memory, total_gpu_memory = torch.npu.mem_get_info()
457
516
 
458
517
  if distributed:
@@ -481,13 +540,13 @@ def make_layers(
481
540
  pp_size: Optional[int] = None,
482
541
  prefix: str = "",
483
542
  return_tuple: bool = False,
484
- offloader_kwargs: Dict[str, Any] = {},
543
+ offloader_kwargs: Optional[Dict[str, Any]] = None,
485
544
  ) -> Tuple[torch.nn.Module, int, int]:
486
545
  """Make a list of layers with the given layer function"""
487
546
  # circula imports
488
547
  from sglang.srt.distributed import get_pp_indices
489
548
  from sglang.srt.layers.utils import PPMissingLayer
490
- from sglang.srt.offloader import get_offloader
549
+ from sglang.srt.utils.offloader import get_offloader
491
550
 
492
551
  assert not pp_size or num_hidden_layers >= pp_size
493
552
  start_layer, end_layer = (
@@ -506,7 +565,7 @@ def make_layers(
506
565
  layer_fn(idx=idx, prefix=add_prefix(idx, prefix))
507
566
  for idx in range(start_layer, end_layer)
508
567
  ),
509
- **offloader_kwargs,
568
+ **(offloader_kwargs or {}),
510
569
  )
511
570
  + [
512
571
  PPMissingLayer(return_tuple=return_tuple)
@@ -518,6 +577,24 @@ def make_layers(
518
577
  return modules, start_layer, end_layer
519
578
 
520
579
 
580
+ def make_layers_non_pp(
581
+ num_hidden_layers: int,
582
+ layer_fn: LayerFn,
583
+ prefix: str = "",
584
+ ) -> torch.nn.ModuleList:
585
+ from sglang.srt.utils.offloader import get_offloader
586
+
587
+ layers = torch.nn.ModuleList(
588
+ get_offloader().wrap_modules(
589
+ (
590
+ layer_fn(idx=idx, prefix=add_prefix(idx, prefix))
591
+ for idx in range(num_hidden_layers)
592
+ )
593
+ )
594
+ )
595
+ return layers
596
+
597
+
521
598
  cmo_stream = None
522
599
 
523
600
 
@@ -811,9 +888,9 @@ def get_image_bytes(image_file: Union[str, bytes]):
811
888
  return f.read()
812
889
  elif image_file.startswith("data:"):
813
890
  image_file = image_file.split(",")[1]
814
- return pybase64.b64decode(image_file)
891
+ return pybase64.b64decode(image_file, validate=True)
815
892
  elif isinstance(image_file, str):
816
- return pybase64.b64decode(image_file)
893
+ return pybase64.b64decode(image_file, validate=True)
817
894
  else:
818
895
  raise NotImplementedError(f"Invalid image: {image_file}")
819
896
 
@@ -850,7 +927,7 @@ def load_video(video_file: Union[str, bytes], use_gpu: bool = True):
850
927
  vr = VideoReader(tmp_file.name, ctx=ctx)
851
928
  elif video_file.startswith("data:"):
852
929
  _, encoded = video_file.split(",", 1)
853
- video_bytes = pybase64.b64decode(encoded)
930
+ video_bytes = pybase64.b64decode(encoded, validate=True)
854
931
  tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
855
932
  tmp_file.write(video_bytes)
856
933
  tmp_file.close()
@@ -858,7 +935,7 @@ def load_video(video_file: Union[str, bytes], use_gpu: bool = True):
858
935
  elif os.path.isfile(video_file):
859
936
  vr = VideoReader(video_file, ctx=ctx)
860
937
  else:
861
- video_bytes = pybase64.b64decode(video_file)
938
+ video_bytes = pybase64.b64decode(video_file, validate=True)
862
939
  tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
863
940
  tmp_file.write(video_bytes)
864
941
  tmp_file.close()
@@ -1007,7 +1084,7 @@ def monkey_patch_vllm_gguf_config():
1007
1084
 
1008
1085
  def get_quant_method_with_embedding_replaced(
1009
1086
  self, layer: torch.nn.Module, prefix: str
1010
- ) -> Optional["QuantizeMethodBase"]:
1087
+ ) -> Optional[QuantizeMethodBase]:
1011
1088
  if isinstance(layer, LinearBase):
1012
1089
  return GGUFLinearMethod(self)
1013
1090
  elif isinstance(layer, VocabParallelEmbedding):
@@ -1083,7 +1160,7 @@ def configure_logger(server_args, prefix: str = ""):
1083
1160
  f"{SGLANG_LOGGING_CONFIG_PATH} but it does not exist!"
1084
1161
  )
1085
1162
  with open(SGLANG_LOGGING_CONFIG_PATH, encoding="utf-8") as file:
1086
- custom_config = json.loads(file.read())
1163
+ custom_config = orjson.loads(file.read())
1087
1164
  logging.config.dictConfig(custom_config)
1088
1165
  return
1089
1166
  format = f"[%(asctime)s{prefix}] %(message)s"
@@ -1262,8 +1339,46 @@ def pytorch_profile(name, func, *args, data_size=-1):
1262
1339
 
1263
1340
 
1264
1341
  def get_zmq_socket(
1265
- context: zmq.Context, socket_type: zmq.SocketType, endpoint: str, bind: bool
1266
- ) -> zmq.Socket:
1342
+ context: zmq.Context,
1343
+ socket_type: zmq.SocketType,
1344
+ endpoint: Optional[str] = None,
1345
+ bind: bool = True,
1346
+ ) -> Union[zmq.Socket, Tuple[int, zmq.Socket]]:
1347
+ """Create and configure a ZeroMQ socket.
1348
+
1349
+ Args:
1350
+ context: ZeroMQ context to create the socket from.
1351
+ socket_type: Type of ZeroMQ socket to create.
1352
+ endpoint: Optional endpoint to bind/connect to. If None, binds to a random TCP port.
1353
+ bind: Whether to bind (True) or connect (False) to the endpoint. Ignored if endpoint is None.
1354
+
1355
+ Returns:
1356
+ If endpoint is None: Tuple of (port, socket) where port is the randomly assigned TCP port.
1357
+ If endpoint is provided: The configured ZeroMQ socket.
1358
+ """
1359
+ socket = context.socket(socket_type)
1360
+
1361
+ if endpoint is None:
1362
+ # Bind to random TCP port
1363
+ config_socket(socket, socket_type)
1364
+ port = socket.bind_to_random_port("tcp://*")
1365
+ return port, socket
1366
+ else:
1367
+ # Handle IPv6 if endpoint contains brackets
1368
+ if endpoint.find("[") != -1:
1369
+ socket.setsockopt(zmq.IPV6, 1)
1370
+
1371
+ config_socket(socket, socket_type)
1372
+
1373
+ if bind:
1374
+ socket.bind(endpoint)
1375
+ else:
1376
+ socket.connect(endpoint)
1377
+
1378
+ return socket
1379
+
1380
+
1381
+ def config_socket(socket, socket_type: zmq.SocketType):
1267
1382
  mem = psutil.virtual_memory()
1268
1383
  total_mem = mem.total / 1024**3
1269
1384
  available_mem = mem.available / 1024**3
@@ -1272,10 +1387,6 @@ def get_zmq_socket(
1272
1387
  else:
1273
1388
  buf_size = -1
1274
1389
 
1275
- socket = context.socket(socket_type)
1276
- if endpoint.find("[") != -1:
1277
- socket.setsockopt(zmq.IPV6, 1)
1278
-
1279
1390
  def set_send_opt():
1280
1391
  socket.setsockopt(zmq.SNDHWM, 0)
1281
1392
  socket.setsockopt(zmq.SNDBUF, buf_size)
@@ -1288,19 +1399,12 @@ def get_zmq_socket(
1288
1399
  set_send_opt()
1289
1400
  elif socket_type == zmq.PULL:
1290
1401
  set_recv_opt()
1291
- elif socket_type == zmq.DEALER:
1402
+ elif socket_type in [zmq.DEALER, zmq.REQ, zmq.REP]:
1292
1403
  set_send_opt()
1293
1404
  set_recv_opt()
1294
1405
  else:
1295
1406
  raise ValueError(f"Unsupported socket type: {socket_type}")
1296
1407
 
1297
- if bind:
1298
- socket.bind(endpoint)
1299
- else:
1300
- socket.connect(endpoint)
1301
-
1302
- return socket
1303
-
1304
1408
 
1305
1409
  def dump_to_file(dirpath, name, value):
1306
1410
  from sglang.srt.distributed import get_tensor_model_parallel_rank
@@ -1500,7 +1604,7 @@ def get_hpu_memory_capacity():
1500
1604
 
1501
1605
  def get_npu_memory_capacity():
1502
1606
  try:
1503
- import torch_npu
1607
+ import torch_npu # noqa: F401
1504
1608
 
1505
1609
  return torch.npu.mem_get_info()[1] // 1024 // 1024 # unit: MB
1506
1610
  except ImportError as e:
@@ -1521,13 +1625,18 @@ def get_cpu_memory_capacity():
1521
1625
  for numa_id in range(n_numa_node):
1522
1626
  file_meminfo = f"node{numa_id}/meminfo"
1523
1627
  with open(os.path.join(file_prefix, file_meminfo), "r") as f:
1524
- # 1st line contains 'MemTotal'
1525
- line = f.read().split("\n")[0]
1526
- numa_mem_list.append(int(line.split()[3]))
1628
+ # MemTotal info is at the 1st line
1629
+ line = f.readline()
1630
+ # Expected format: "Node 0 MemTotal: 100000000 kB"
1631
+ parts = line.split()
1632
+ if len(parts) >= 4 and parts[2] == "MemTotal:":
1633
+ numa_mem_list.append(int(parts[3]))
1634
+ else:
1635
+ raise ValueError(f"Unexpected format in {file_meminfo}: {line}")
1527
1636
  # Retrieved value in KB, need MB
1528
1637
  numa_mem = float(min(numa_mem_list) // 1024)
1529
1638
  return numa_mem
1530
- except FileNotFoundError:
1639
+ except (FileNotFoundError, ValueError, IndexError):
1531
1640
  numa_mem = psutil.virtual_memory().total / n_numa_node
1532
1641
  # Retrieved value in Byte, need MB
1533
1642
  return float(numa_mem // (1 << 20))
@@ -1687,7 +1796,7 @@ def get_device(device_id: Optional[int] = None) -> str:
1687
1796
 
1688
1797
  if is_habana_available():
1689
1798
  try:
1690
- import habana_frameworks.torch.hpu
1799
+ import habana_frameworks.torch.hpu # noqa: F401
1691
1800
 
1692
1801
  if torch.hpu.is_available():
1693
1802
  if device_id == None:
@@ -1717,7 +1826,7 @@ def get_device_count() -> int:
1717
1826
 
1718
1827
  if is_habana_available():
1719
1828
  try:
1720
- import habana_frameworks.torch.hpu
1829
+ import habana_frameworks.torch.hpu # noqa: F401
1721
1830
 
1722
1831
  if torch.hpu.is_available():
1723
1832
  return torch.hpu.device_count()
@@ -1860,7 +1969,9 @@ def direct_register_custom_op(
1860
1969
  if fake_impl is not None:
1861
1970
  my_lib._register_fake(op_name, fake_impl)
1862
1971
  except RuntimeError as error:
1863
- if "Tried to register an operator" in str(e) and "multiple times" in str(e):
1972
+ if "Tried to register an operator" in str(error) and "multiple times" in str(
1973
+ error
1974
+ ):
1864
1975
  # Silently ignore duplicate registration errors
1865
1976
  # This can happen in multi-engine scenarios
1866
1977
  pass
@@ -1873,6 +1984,7 @@ def direct_register_custom_op(
1873
1984
 
1874
1985
 
1875
1986
  def set_gpu_proc_affinity(
1987
+ pp_size: int,
1876
1988
  tp_size: int,
1877
1989
  nnodes: int,
1878
1990
  gpu_id: int,
@@ -1881,7 +1993,8 @@ def set_gpu_proc_affinity(
1881
1993
  pid = os.getpid()
1882
1994
  p = psutil.Process(pid)
1883
1995
 
1884
- tp_size_per_node = tp_size // nnodes
1996
+ nnodes_per_tp_group = max(nnodes // pp_size, 1)
1997
+ tp_size_per_node = tp_size // nnodes_per_tp_group
1885
1998
 
1886
1999
  # total physical cores
1887
2000
  total_pcores = psutil.cpu_count(logical=False)
@@ -1993,7 +2106,7 @@ class MultiprocessingSerializer:
1993
2106
 
1994
2107
  if output_str:
1995
2108
  # Convert bytes to base64-encoded string
1996
- output = pybase64.b64encode(output).decode("utf-8")
2109
+ pybase64.b64encode(output).decode("utf-8")
1997
2110
 
1998
2111
  return output
1999
2112
 
@@ -2164,6 +2277,11 @@ def launch_dummy_health_check_server(host, port, enable_metrics):
2164
2277
 
2165
2278
  app = FastAPI()
2166
2279
 
2280
+ @app.get("/ping")
2281
+ async def ping():
2282
+ """Could be used by the checkpoint-engine update script to confirm the server is up."""
2283
+ return Response(status_code=200)
2284
+
2167
2285
  @app.get("/health")
2168
2286
  async def health():
2169
2287
  """Check the health of the http server."""
@@ -2286,6 +2404,8 @@ def retry(
2286
2404
  try:
2287
2405
  return fn()
2288
2406
  except Exception as e:
2407
+ traceback.print_exc()
2408
+
2289
2409
  if try_index >= max_retry:
2290
2410
  raise Exception(f"retry() exceed maximum number of retries.")
2291
2411
 
@@ -2299,11 +2419,30 @@ def retry(
2299
2419
  logger.warning(
2300
2420
  f"retry() failed once ({try_index}th try, maximum {max_retry} retries). Will delay {delay:.2f}s and retry. Error: {e}"
2301
2421
  )
2302
- traceback.print_exc()
2303
2422
 
2304
2423
  time.sleep(delay)
2305
2424
 
2306
2425
 
2426
+ def has_hf_quant_config(model_path: str) -> bool:
2427
+ """Check if the model path contains hf_quant_config.json file.
2428
+
2429
+ Args:
2430
+ model_path: Path to the model, can be local path or remote URL.
2431
+
2432
+ Returns:
2433
+ True if hf_quant_config.json exists, False otherwise.
2434
+ """
2435
+ if os.path.exists(os.path.join(model_path, "hf_quant_config.json")):
2436
+ return True
2437
+ try:
2438
+ from huggingface_hub import HfApi
2439
+
2440
+ hf_api = HfApi()
2441
+ return hf_api.file_exists(model_path, "hf_quant_config.json")
2442
+ except Exception:
2443
+ return False
2444
+
2445
+
2307
2446
  def flatten_nested_list(nested_list):
2308
2447
  if isinstance(nested_list, list):
2309
2448
  return [
@@ -2461,6 +2600,7 @@ def is_fa3_default_architecture(hf_config):
2461
2600
  "Qwen2ForCausalLM",
2462
2601
  "Llama4ForConditionalGeneration",
2463
2602
  "LlamaForCausalLM",
2603
+ "Olmo2ForCausalLM",
2464
2604
  "Gemma2ForCausalLM",
2465
2605
  "Gemma3ForConditionalGeneration",
2466
2606
  "Qwen3ForCausalLM",
@@ -2494,9 +2634,9 @@ def log_info_on_rank0(logger, msg):
2494
2634
 
2495
2635
  def load_json_config(data: str):
2496
2636
  try:
2497
- return json.loads(data)
2637
+ return orjson.loads(data)
2498
2638
  except JSONDecodeError:
2499
- return json.loads(Path(data).read_text())
2639
+ return orjson.loads(Path(data).read_text())
2500
2640
 
2501
2641
 
2502
2642
  def dispose_tensor(x: torch.Tensor):
@@ -2863,7 +3003,7 @@ def get_cpu_ids_by_node():
2863
3003
  def is_shm_available(dtype, world_size, local_size):
2864
3004
  return (
2865
3005
  cpu_has_amx_support()
2866
- and dtype in [torch.bfloat16, torch.float]
3006
+ and dtype in [torch.bfloat16, torch.float16, torch.float]
2867
3007
  and world_size >= 1
2868
3008
  and world_size == local_size
2869
3009
  )
@@ -2914,10 +3054,6 @@ def lru_cache_frozenset(maxsize=128):
2914
3054
  return decorator
2915
3055
 
2916
3056
 
2917
- def get_origin_rid(rid):
2918
- return rid.split("_", 1)[1] if "_" in rid else rid
2919
-
2920
-
2921
3057
  def apply_module_patch(target_module, target_function, wrappers):
2922
3058
  original_module, original_function = parse_module_path(
2923
3059
  target_module, target_function, False
@@ -3205,7 +3341,7 @@ def numa_bind_to_node(node: int):
3205
3341
 
3206
3342
  def json_list_type(value):
3207
3343
  try:
3208
- return json.loads(value)
3344
+ return orjson.loads(value)
3209
3345
  except json.JSONDecodeError:
3210
3346
  raise argparse.ArgumentTypeError(
3211
3347
  f"Invalid JSON list: {value}. Please provide a valid JSON list."
@@ -3213,7 +3349,12 @@ def json_list_type(value):
3213
3349
 
3214
3350
 
3215
3351
  @contextmanager
3216
- def temp_set_cuda_visible_devices(gpu_id: int):
3352
+ def maybe_reindex_device_id(gpu_id: int):
3353
+
3354
+ if envs.SGLANG_ONE_VISIBLE_DEVICE_PER_PROCESS.get() is False or not is_cuda_alike():
3355
+ yield gpu_id
3356
+ return
3357
+
3217
3358
  original_cuda_visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES")
3218
3359
  if original_cuda_visible_devices:
3219
3360
  cuda_visible_devices = original_cuda_visible_devices.split(",")
@@ -3222,7 +3363,11 @@ def temp_set_cuda_visible_devices(gpu_id: int):
3222
3363
 
3223
3364
  str_gpu_id = cuda_visible_devices[gpu_id] if cuda_visible_devices else str(gpu_id)
3224
3365
  os.environ["CUDA_VISIBLE_DEVICES"] = str_gpu_id
3225
- yield
3366
+
3367
+ logger.debug(f"Set CUDA_VISIBLE_DEVICES to {str_gpu_id}")
3368
+
3369
+ yield 0
3370
+
3226
3371
  if original_cuda_visible_devices:
3227
3372
  os.environ["CUDA_VISIBLE_DEVICES"] = original_cuda_visible_devices
3228
3373
  else:
@@ -16,9 +16,10 @@
16
16
  import contextlib
17
17
  import json
18
18
  import os
19
+ import tempfile
19
20
  import warnings
20
21
  from pathlib import Path
21
- from typing import Any, Dict, Optional, Type, Union
22
+ from typing import Any, Dict, List, Optional, Type, Union
22
23
 
23
24
  import torch
24
25
  from huggingface_hub import snapshot_download
@@ -45,27 +46,37 @@ from sglang.srt.configs import (
45
46
  KimiVLConfig,
46
47
  LongcatFlashConfig,
47
48
  MultiModalityConfig,
49
+ NemotronHConfig,
50
+ Olmo3Config,
48
51
  Qwen3NextConfig,
49
52
  Step3VLConfig,
50
53
  )
54
+ from sglang.srt.configs.deepseek_ocr import DeepseekVLV2Config
51
55
  from sglang.srt.configs.internvl import InternVLChatConfig
52
56
  from sglang.srt.connector import create_remote_connector
53
57
  from sglang.srt.utils import is_remote_url, logger, lru_cache_frozenset
54
58
 
55
- _CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = {
56
- ChatGLMConfig.model_type: ChatGLMConfig,
57
- DbrxConfig.model_type: DbrxConfig,
58
- ExaoneConfig.model_type: ExaoneConfig,
59
- DeepseekVL2Config.model_type: DeepseekVL2Config,
60
- MultiModalityConfig.model_type: MultiModalityConfig,
61
- KimiVLConfig.model_type: KimiVLConfig,
62
- InternVLChatConfig.model_type: InternVLChatConfig,
63
- Step3VLConfig.model_type: Step3VLConfig,
64
- LongcatFlashConfig.model_type: LongcatFlashConfig,
65
- Qwen3NextConfig.model_type: Qwen3NextConfig,
66
- FalconH1Config.model_type: FalconH1Config,
67
- DotsVLMConfig.model_type: DotsVLMConfig,
68
- DotsOCRConfig.model_type: DotsOCRConfig,
59
+ _CONFIG_REGISTRY: List[Type[PretrainedConfig]] = [
60
+ ChatGLMConfig,
61
+ DbrxConfig,
62
+ ExaoneConfig,
63
+ DeepseekVL2Config,
64
+ MultiModalityConfig,
65
+ KimiVLConfig,
66
+ InternVLChatConfig,
67
+ Step3VLConfig,
68
+ LongcatFlashConfig,
69
+ Olmo3Config,
70
+ Qwen3NextConfig,
71
+ FalconH1Config,
72
+ DotsVLMConfig,
73
+ DotsOCRConfig,
74
+ NemotronHConfig,
75
+ DeepseekVLV2Config,
76
+ ]
77
+
78
+ _CONFIG_REGISTRY = {
79
+ config_cls.model_type: config_cls for config_cls in _CONFIG_REGISTRY
69
80
  }
70
81
 
71
82
  for name, cls in _CONFIG_REGISTRY.items():
@@ -106,6 +117,12 @@ def get_hf_text_config(config: PretrainedConfig):
106
117
  # if transformers config doesn't align with this assumption.
107
118
  assert hasattr(config.text_config, "num_attention_heads")
108
119
  return config.text_config
120
+
121
+ if hasattr(config, "llm_config"):
122
+ # PointsV1.5 Chat Model
123
+ assert hasattr(config.llm_config, "num_attention_heads")
124
+ return config.llm_config
125
+
109
126
  if hasattr(config, "language_config"):
110
127
  return config.language_config
111
128
  if hasattr(config, "thinker_config"):
@@ -143,7 +160,7 @@ def _load_deepseek_v32_model(
143
160
  config_json["architectures"] = ["DeepseekV3ForCausalLM"]
144
161
  config_json["model_type"] = "deepseek_v3"
145
162
 
146
- tmp_path = os.path.join(local_path, "_tmp_config_folder")
163
+ tmp_path = os.path.join(tempfile.gettempdir(), "_tmp_config_folder")
147
164
  os.makedirs(tmp_path, exist_ok=True)
148
165
 
149
166
  unique_path = os.path.join(tmp_path, f"deepseek_v32_{os.getpid()}")
@@ -180,6 +197,11 @@ def get_config(
180
197
  config = AutoConfig.from_pretrained(
181
198
  model, trust_remote_code=trust_remote_code, revision=revision, **kwargs
182
199
  )
200
+ if "deepseek-ai/DeepSeek-OCR" in model:
201
+ config.model_type = "deepseek-ocr"
202
+ # Due to an unknown reason, Hugging Face’s AutoConfig mistakenly recognizes the configuration of deepseek-ocr as deepseekvl2.
203
+ # This is a temporary workaround and will require further optimization.
204
+
183
205
  except ValueError as e:
184
206
  if not "deepseek_v32" in str(e):
185
207
  raise e
@@ -202,7 +224,8 @@ def get_config(
202
224
  "intermediate_size": 4304,
203
225
  "model_type": "siglip_vision_model",
204
226
  "num_attention_heads": 16,
205
- "num_hidden_layers": 26, # Model is originally 27-layer, we only need the first 26 layers for feature extraction.
227
+ "num_hidden_layers": 26,
228
+ # Model is originally 27-layer, we only need the first 26 layers for feature extraction.
206
229
  "patch_size": 14,
207
230
  }
208
231
  config.vision_config = SiglipVisionConfig(**vision_config)
@@ -1,5 +1,4 @@
1
1
  import logging
2
- import os
3
2
  from dataclasses import dataclass
4
3
  from multiprocessing import shared_memory
5
4
  from pathlib import Path
@@ -11,14 +11,14 @@ from sglang.srt.distributed.naive_distributed import (
11
11
  get_naive_distributed,
12
12
  set_naive_distributed,
13
13
  )
14
- from sglang.srt.host_shared_memory import (
14
+ from sglang.srt.layers.parameter import ModelWeightParameter
15
+ from sglang.srt.server_args import ServerArgs
16
+ from sglang.srt.utils import MultiprocessingSerializer, is_pin_memory_available
17
+ from sglang.srt.utils.host_shared_memory import (
15
18
  HostSharedMemoryManager,
16
19
  get_host_shared_memory_manager,
17
20
  set_host_shared_memory_manager,
18
21
  )
19
- from sglang.srt.layers.parameter import ModelWeightParameter
20
- from sglang.srt.server_args import ServerArgs
21
- from sglang.srt.utils import MultiprocessingSerializer, is_pin_memory_available
22
22
 
23
23
  logger = logging.getLogger(__name__)
24
24