sglang 0.5.3rc2__py3-none-any.whl → 0.5.4.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (419)
  1. sglang/bench_one_batch.py +47 -28
  2. sglang/bench_one_batch_server.py +41 -25
  3. sglang/bench_serving.py +378 -160
  4. sglang/check_env.py +1 -1
  5. sglang/compile_deep_gemm.py +6 -2
  6. sglang/global_config.py +1 -25
  7. sglang/lang/api.py +6 -0
  8. sglang/lang/interpreter.py +1 -0
  9. sglang/lang/ir.py +13 -0
  10. sglang/launch_server.py +10 -15
  11. sglang/profiler.py +18 -1
  12. sglang/srt/_custom_ops.py +1 -1
  13. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +105 -10
  14. sglang/srt/checkpoint_engine/checkpoint_engine_worker.py +142 -0
  15. sglang/srt/compilation/backend.py +437 -0
  16. sglang/srt/compilation/compilation_config.py +20 -0
  17. sglang/srt/compilation/compilation_counter.py +47 -0
  18. sglang/srt/compilation/compile.py +210 -0
  19. sglang/srt/compilation/compiler_interface.py +503 -0
  20. sglang/srt/compilation/cuda_piecewise_backend.py +228 -0
  21. sglang/srt/compilation/fix_functionalization.py +134 -0
  22. sglang/srt/compilation/fx_utils.py +83 -0
  23. sglang/srt/compilation/inductor_pass.py +140 -0
  24. sglang/srt/compilation/pass_manager.py +66 -0
  25. sglang/srt/compilation/piecewise_context_manager.py +40 -0
  26. sglang/srt/compilation/weak_ref_tensor_jit.py +16 -0
  27. sglang/srt/configs/__init__.py +4 -0
  28. sglang/srt/configs/deepseek_ocr.py +262 -0
  29. sglang/srt/configs/deepseekvl2.py +194 -96
  30. sglang/srt/configs/dots_vlm.py +2 -7
  31. sglang/srt/configs/falcon_h1.py +13 -64
  32. sglang/srt/configs/load_config.py +25 -2
  33. sglang/srt/configs/mamba_utils.py +117 -0
  34. sglang/srt/configs/model_config.py +136 -25
  35. sglang/srt/configs/modelopt_config.py +30 -0
  36. sglang/srt/configs/nemotron_h.py +286 -0
  37. sglang/srt/configs/olmo3.py +105 -0
  38. sglang/srt/configs/points_v15_chat.py +29 -0
  39. sglang/srt/configs/qwen3_next.py +11 -47
  40. sglang/srt/configs/qwen3_omni.py +613 -0
  41. sglang/srt/configs/qwen3_vl.py +0 -10
  42. sglang/srt/connector/remote_instance.py +1 -1
  43. sglang/srt/constrained/base_grammar_backend.py +5 -1
  44. sglang/srt/constrained/llguidance_backend.py +5 -0
  45. sglang/srt/constrained/outlines_backend.py +1 -1
  46. sglang/srt/constrained/reasoner_grammar_backend.py +9 -6
  47. sglang/srt/constrained/utils.py +12 -0
  48. sglang/srt/constrained/xgrammar_backend.py +20 -11
  49. sglang/srt/disaggregation/ascend/transfer_engine.py +1 -1
  50. sglang/srt/disaggregation/base/conn.py +17 -4
  51. sglang/srt/disaggregation/common/conn.py +4 -2
  52. sglang/srt/disaggregation/decode.py +123 -31
  53. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +1 -1
  54. sglang/srt/disaggregation/fake/conn.py +11 -3
  55. sglang/srt/disaggregation/mooncake/conn.py +157 -19
  56. sglang/srt/disaggregation/nixl/conn.py +69 -24
  57. sglang/srt/disaggregation/prefill.py +96 -270
  58. sglang/srt/distributed/device_communicators/all_reduce_utils.py +4 -4
  59. sglang/srt/distributed/device_communicators/custom_all_reduce.py +6 -6
  60. sglang/srt/distributed/device_communicators/pymscclpp.py +2 -2
  61. sglang/srt/distributed/device_communicators/pynccl.py +24 -12
  62. sglang/srt/distributed/device_communicators/pynccl_allocator.py +2 -2
  63. sglang/srt/distributed/device_communicators/symm_mem.py +1 -1
  64. sglang/srt/distributed/naive_distributed.py +5 -4
  65. sglang/srt/distributed/parallel_state.py +63 -19
  66. sglang/srt/elastic_ep/elastic_ep.py +74 -0
  67. sglang/srt/entrypoints/context.py +3 -2
  68. sglang/srt/entrypoints/engine.py +83 -80
  69. sglang/srt/entrypoints/grpc_server.py +430 -234
  70. sglang/srt/entrypoints/harmony_utils.py +2 -2
  71. sglang/srt/entrypoints/http_server.py +195 -102
  72. sglang/srt/entrypoints/http_server_engine.py +1 -7
  73. sglang/srt/entrypoints/openai/protocol.py +225 -37
  74. sglang/srt/entrypoints/openai/serving_base.py +49 -2
  75. sglang/srt/entrypoints/openai/serving_chat.py +29 -74
  76. sglang/srt/entrypoints/openai/serving_classify.py +204 -0
  77. sglang/srt/entrypoints/openai/serving_completions.py +15 -1
  78. sglang/srt/entrypoints/openai/serving_responses.py +5 -2
  79. sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
  80. sglang/srt/environ.py +58 -6
  81. sglang/srt/eplb/eplb_algorithms/__init__.py +18 -1
  82. sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -2
  83. sglang/srt/eplb/eplb_algorithms/elasticity_aware.py +87 -0
  84. sglang/srt/eplb/expert_distribution.py +33 -4
  85. sglang/srt/eplb/expert_location_dispatch.py +2 -2
  86. sglang/srt/eplb/expert_location_updater.py +2 -2
  87. sglang/srt/function_call/base_format_detector.py +17 -18
  88. sglang/srt/function_call/function_call_parser.py +20 -14
  89. sglang/srt/function_call/glm4_moe_detector.py +1 -5
  90. sglang/srt/function_call/gpt_oss_detector.py +1 -1
  91. sglang/srt/function_call/json_array_parser.py +0 -2
  92. sglang/srt/function_call/minimax_m2.py +367 -0
  93. sglang/srt/function_call/utils.py +2 -2
  94. sglang/srt/grpc/compile_proto.py +3 -3
  95. sglang/srt/{entrypoints → grpc}/grpc_request_manager.py +112 -52
  96. sglang/srt/grpc/health_servicer.py +189 -0
  97. sglang/srt/grpc/scheduler_launcher.py +181 -0
  98. sglang/srt/grpc/sglang_scheduler_pb2.py +78 -70
  99. sglang/srt/grpc/sglang_scheduler_pb2.pyi +66 -10
  100. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +89 -1
  101. sglang/srt/layers/activation.py +10 -1
  102. sglang/srt/layers/attention/aiter_backend.py +3 -3
  103. sglang/srt/layers/attention/ascend_backend.py +17 -1
  104. sglang/srt/layers/attention/attention_registry.py +43 -23
  105. sglang/srt/layers/attention/base_attn_backend.py +20 -1
  106. sglang/srt/layers/attention/double_sparsity_backend.py +2 -2
  107. sglang/srt/layers/attention/fla/chunk.py +0 -1
  108. sglang/srt/layers/attention/fla/chunk_o.py +1 -1
  109. sglang/srt/layers/attention/fla/index.py +0 -2
  110. sglang/srt/layers/attention/fla/layernorm_gated.py +50 -32
  111. sglang/srt/layers/attention/fla/utils.py +0 -3
  112. sglang/srt/layers/attention/fla/wy_fast.py +0 -2
  113. sglang/srt/layers/attention/flashattention_backend.py +24 -10
  114. sglang/srt/layers/attention/flashinfer_backend.py +258 -22
  115. sglang/srt/layers/attention/flashinfer_mla_backend.py +38 -28
  116. sglang/srt/layers/attention/flashmla_backend.py +2 -2
  117. sglang/srt/layers/attention/hybrid_attn_backend.py +1 -1
  118. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +165 -62
  119. sglang/srt/layers/attention/intel_amx_backend.py +1 -1
  120. sglang/srt/layers/attention/mamba/causal_conv1d.py +1 -1
  121. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +9 -5
  122. sglang/srt/layers/attention/mamba/mamba.py +189 -241
  123. sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
  124. sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
  125. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +0 -50
  126. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +0 -60
  127. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +0 -111
  128. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +0 -1
  129. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +0 -11
  130. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +1 -1
  131. sglang/srt/layers/attention/nsa/nsa_indexer.py +40 -83
  132. sglang/srt/layers/attention/nsa/triton_kernel.py +136 -0
  133. sglang/srt/layers/attention/nsa/utils.py +0 -1
  134. sglang/srt/layers/attention/nsa_backend.py +404 -90
  135. sglang/srt/layers/attention/triton_backend.py +208 -34
  136. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +2 -2
  137. sglang/srt/layers/attention/triton_ops/extend_attention.py +539 -44
  138. sglang/srt/layers/attention/trtllm_mha_backend.py +2 -2
  139. sglang/srt/layers/attention/trtllm_mla_backend.py +362 -43
  140. sglang/srt/layers/attention/utils.py +89 -7
  141. sglang/srt/layers/attention/vision.py +3 -3
  142. sglang/srt/layers/attention/xpu_backend.py +1028 -0
  143. sglang/srt/layers/communicator.py +12 -7
  144. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/compile_utils.py +5 -9
  145. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/configurer.py +4 -3
  146. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/entrypoint.py +3 -3
  147. sglang/srt/layers/dp_attention.py +17 -0
  148. sglang/srt/layers/layernorm.py +64 -19
  149. sglang/srt/layers/linear.py +9 -1
  150. sglang/srt/layers/logits_processor.py +152 -17
  151. sglang/srt/layers/modelopt_utils.py +11 -0
  152. sglang/srt/layers/moe/cutlass_moe.py +0 -2
  153. sglang/srt/layers/moe/cutlass_w4a8_moe.py +351 -21
  154. sglang/srt/layers/moe/ep_moe/kernels.py +229 -457
  155. sglang/srt/layers/moe/ep_moe/layer.py +154 -625
  156. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +1 -1
  157. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  158. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json +146 -0
  159. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +11 -3
  160. sglang/srt/layers/moe/fused_moe_triton/layer.py +79 -73
  161. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +25 -46
  162. sglang/srt/layers/moe/moe_runner/deep_gemm.py +569 -0
  163. sglang/srt/layers/moe/moe_runner/runner.py +6 -0
  164. sglang/srt/layers/moe/moe_runner/triton.py +3 -1
  165. sglang/srt/layers/moe/moe_runner/triton_kernels.py +194 -0
  166. sglang/srt/layers/moe/rocm_moe_utils.py +0 -1
  167. sglang/srt/layers/moe/router.py +51 -15
  168. sglang/srt/layers/moe/token_dispatcher/__init__.py +14 -4
  169. sglang/srt/layers/moe/token_dispatcher/base.py +12 -6
  170. sglang/srt/layers/moe/token_dispatcher/deepep.py +127 -110
  171. sglang/srt/layers/moe/token_dispatcher/mooncake.py +386 -0
  172. sglang/srt/layers/moe/token_dispatcher/standard.py +46 -0
  173. sglang/srt/layers/moe/topk.py +7 -6
  174. sglang/srt/layers/moe/utils.py +20 -5
  175. sglang/srt/layers/quantization/__init__.py +5 -58
  176. sglang/srt/layers/quantization/awq.py +183 -9
  177. sglang/srt/layers/quantization/awq_triton.py +29 -0
  178. sglang/srt/layers/quantization/base_config.py +27 -1
  179. sglang/srt/layers/quantization/compressed_tensors/__init__.py +7 -0
  180. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +20 -49
  181. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +421 -70
  182. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +3 -0
  183. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +4 -22
  184. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +339 -0
  185. sglang/srt/layers/quantization/fp8.py +152 -81
  186. sglang/srt/layers/quantization/fp8_kernel.py +55 -10
  187. sglang/srt/layers/quantization/fp8_utils.py +42 -14
  188. sglang/srt/layers/quantization/fpgemm_fp8.py +2 -3
  189. sglang/srt/layers/quantization/gguf.py +566 -0
  190. sglang/srt/layers/quantization/gptq.py +0 -1
  191. sglang/srt/layers/quantization/int8_kernel.py +18 -2
  192. sglang/srt/layers/quantization/marlin_utils.py +12 -0
  193. sglang/srt/layers/quantization/modelopt_quant.py +125 -100
  194. sglang/srt/layers/quantization/mxfp4.py +35 -68
  195. sglang/srt/layers/quantization/petit.py +1 -1
  196. sglang/srt/layers/quantization/quark/quark.py +3 -1
  197. sglang/srt/layers/quantization/quark/quark_moe.py +3 -3
  198. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +0 -7
  199. sglang/srt/layers/quantization/unquant.py +23 -48
  200. sglang/srt/layers/quantization/utils.py +0 -1
  201. sglang/srt/layers/quantization/w4afp8.py +87 -20
  202. sglang/srt/layers/quantization/w8a8_int8.py +30 -24
  203. sglang/srt/layers/radix_attention.py +62 -9
  204. sglang/srt/layers/rotary_embedding.py +686 -17
  205. sglang/srt/layers/sampler.py +47 -16
  206. sglang/srt/layers/sparse_pooler.py +98 -0
  207. sglang/srt/layers/utils.py +0 -1
  208. sglang/srt/layers/vocab_parallel_embedding.py +4 -1
  209. sglang/srt/lora/backend/triton_backend.py +0 -1
  210. sglang/srt/lora/eviction_policy.py +139 -0
  211. sglang/srt/lora/lora_manager.py +24 -9
  212. sglang/srt/lora/lora_registry.py +1 -1
  213. sglang/srt/lora/mem_pool.py +40 -16
  214. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +1 -1
  215. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +4 -2
  216. sglang/srt/managers/cache_controller.py +48 -17
  217. sglang/srt/managers/data_parallel_controller.py +146 -42
  218. sglang/srt/managers/detokenizer_manager.py +40 -13
  219. sglang/srt/managers/io_struct.py +69 -16
  220. sglang/srt/managers/mm_utils.py +20 -18
  221. sglang/srt/managers/multi_tokenizer_mixin.py +83 -82
  222. sglang/srt/managers/overlap_utils.py +96 -19
  223. sglang/srt/managers/schedule_batch.py +241 -511
  224. sglang/srt/managers/schedule_policy.py +15 -2
  225. sglang/srt/managers/scheduler.py +420 -514
  226. sglang/srt/managers/scheduler_metrics_mixin.py +73 -18
  227. sglang/srt/managers/scheduler_output_processor_mixin.py +317 -111
  228. sglang/srt/managers/scheduler_pp_mixin.py +341 -0
  229. sglang/srt/managers/scheduler_profiler_mixin.py +60 -14
  230. sglang/srt/managers/scheduler_runtime_checker_mixin.py +217 -0
  231. sglang/srt/managers/scheduler_update_weights_mixin.py +33 -14
  232. sglang/srt/managers/tokenizer_communicator_mixin.py +71 -55
  233. sglang/srt/managers/tokenizer_manager.py +375 -95
  234. sglang/srt/managers/tp_worker.py +212 -161
  235. sglang/srt/managers/utils.py +78 -2
  236. sglang/srt/mem_cache/allocator.py +7 -2
  237. sglang/srt/mem_cache/allocator_ascend.py +2 -2
  238. sglang/srt/mem_cache/base_prefix_cache.py +2 -2
  239. sglang/srt/mem_cache/chunk_cache.py +13 -2
  240. sglang/srt/mem_cache/common.py +480 -0
  241. sglang/srt/mem_cache/evict_policy.py +16 -1
  242. sglang/srt/mem_cache/hicache_storage.py +11 -2
  243. sglang/srt/mem_cache/hiradix_cache.py +16 -3
  244. sglang/srt/mem_cache/mamba_radix_cache.py +993 -0
  245. sglang/srt/mem_cache/memory_pool.py +517 -219
  246. sglang/srt/mem_cache/memory_pool_host.py +0 -1
  247. sglang/srt/mem_cache/multimodal_cache.py +0 -1
  248. sglang/srt/mem_cache/radix_cache.py +53 -19
  249. sglang/srt/mem_cache/radix_cache_cpp.py +19 -14
  250. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +8 -2
  251. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +1 -13
  252. sglang/srt/mem_cache/storage/backend_factory.py +2 -2
  253. sglang/srt/mem_cache/storage/eic/eic_storage.py +5 -6
  254. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +0 -1
  255. sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +3 -2
  256. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +9 -3
  257. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +5 -3
  258. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +101 -17
  259. sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +38 -9
  260. sglang/srt/mem_cache/storage/nixl/nixl_utils.py +1 -1
  261. sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +17 -2
  262. sglang/srt/mem_cache/swa_radix_cache.py +92 -26
  263. sglang/srt/metrics/collector.py +31 -0
  264. sglang/srt/metrics/func_timer.py +1 -1
  265. sglang/srt/model_executor/cuda_graph_runner.py +43 -5
  266. sglang/srt/model_executor/forward_batch_info.py +71 -25
  267. sglang/srt/model_executor/model_runner.py +362 -270
  268. sglang/srt/model_executor/npu_graph_runner.py +2 -3
  269. sglang/srt/model_executor/piecewise_cuda_graph_runner.py +549 -0
  270. sglang/srt/model_loader/__init__.py +1 -1
  271. sglang/srt/model_loader/loader.py +424 -27
  272. sglang/srt/model_loader/utils.py +0 -1
  273. sglang/srt/model_loader/weight_utils.py +47 -28
  274. sglang/srt/models/apertus.py +2 -3
  275. sglang/srt/models/arcee.py +2 -2
  276. sglang/srt/models/bailing_moe.py +13 -52
  277. sglang/srt/models/bailing_moe_nextn.py +3 -4
  278. sglang/srt/models/bert.py +1 -1
  279. sglang/srt/models/deepseek_nextn.py +19 -3
  280. sglang/srt/models/deepseek_ocr.py +1516 -0
  281. sglang/srt/models/deepseek_v2.py +418 -140
  282. sglang/srt/models/dots_ocr.py +0 -2
  283. sglang/srt/models/dots_vlm.py +0 -1
  284. sglang/srt/models/dots_vlm_vit.py +1 -1
  285. sglang/srt/models/falcon_h1.py +13 -19
  286. sglang/srt/models/gemma3_mm.py +16 -0
  287. sglang/srt/models/gemma3n_mm.py +1 -2
  288. sglang/srt/models/glm4_moe.py +327 -382
  289. sglang/srt/models/glm4_moe_nextn.py +6 -16
  290. sglang/srt/models/glm4v.py +2 -1
  291. sglang/srt/models/glm4v_moe.py +32 -199
  292. sglang/srt/models/gpt_oss.py +5 -5
  293. sglang/srt/models/grok.py +10 -23
  294. sglang/srt/models/hunyuan.py +2 -7
  295. sglang/srt/models/interns1.py +0 -1
  296. sglang/srt/models/kimi_vl.py +1 -7
  297. sglang/srt/models/kimi_vl_moonvit.py +3 -1
  298. sglang/srt/models/llama.py +2 -2
  299. sglang/srt/models/llama_eagle3.py +1 -1
  300. sglang/srt/models/longcat_flash.py +5 -22
  301. sglang/srt/models/longcat_flash_nextn.py +3 -14
  302. sglang/srt/models/mimo.py +2 -13
  303. sglang/srt/models/mimo_mtp.py +1 -2
  304. sglang/srt/models/minicpmo.py +7 -5
  305. sglang/srt/models/minimax_m2.py +922 -0
  306. sglang/srt/models/mixtral.py +1 -4
  307. sglang/srt/models/mllama.py +1 -1
  308. sglang/srt/models/mllama4.py +13 -3
  309. sglang/srt/models/nemotron_h.py +511 -0
  310. sglang/srt/models/nvila.py +355 -0
  311. sglang/srt/models/nvila_lite.py +184 -0
  312. sglang/srt/models/olmo2.py +31 -4
  313. sglang/srt/models/opt.py +5 -5
  314. sglang/srt/models/phi.py +1 -1
  315. sglang/srt/models/phi4mm.py +1 -1
  316. sglang/srt/models/phimoe.py +0 -1
  317. sglang/srt/models/pixtral.py +0 -3
  318. sglang/srt/models/points_v15_chat.py +186 -0
  319. sglang/srt/models/qwen.py +0 -1
  320. sglang/srt/models/qwen2.py +22 -1
  321. sglang/srt/models/qwen2_5_vl.py +3 -3
  322. sglang/srt/models/qwen2_audio.py +2 -15
  323. sglang/srt/models/qwen2_moe.py +15 -12
  324. sglang/srt/models/qwen2_vl.py +5 -2
  325. sglang/srt/models/qwen3.py +34 -4
  326. sglang/srt/models/qwen3_moe.py +19 -37
  327. sglang/srt/models/qwen3_next.py +7 -12
  328. sglang/srt/models/qwen3_next_mtp.py +3 -4
  329. sglang/srt/models/qwen3_omni_moe.py +661 -0
  330. sglang/srt/models/qwen3_vl.py +37 -33
  331. sglang/srt/models/qwen3_vl_moe.py +57 -185
  332. sglang/srt/models/roberta.py +55 -3
  333. sglang/srt/models/sarashina2_vision.py +0 -1
  334. sglang/srt/models/step3_vl.py +3 -5
  335. sglang/srt/models/utils.py +11 -1
  336. sglang/srt/multimodal/processors/base_processor.py +7 -2
  337. sglang/srt/multimodal/processors/deepseek_ocr.py +37 -0
  338. sglang/srt/multimodal/processors/deepseek_vl_v2.py +0 -3
  339. sglang/srt/multimodal/processors/dots_vlm.py +0 -1
  340. sglang/srt/multimodal/processors/glm4v.py +2 -6
  341. sglang/srt/multimodal/processors/internvl.py +0 -2
  342. sglang/srt/multimodal/processors/janus_pro.py +0 -1
  343. sglang/srt/multimodal/processors/mllama4.py +0 -8
  344. sglang/srt/multimodal/processors/{vila.py → nvila.py} +32 -24
  345. sglang/srt/multimodal/processors/phi4mm.py +0 -1
  346. sglang/srt/multimodal/processors/points_v15_chat.py +52 -0
  347. sglang/srt/multimodal/processors/qwen_vl.py +75 -16
  348. sglang/srt/multimodal/processors/step3_vl.py +1 -1
  349. sglang/srt/parser/conversation.py +41 -0
  350. sglang/srt/parser/reasoning_parser.py +28 -2
  351. sglang/srt/sampling/custom_logit_processor.py +77 -2
  352. sglang/srt/sampling/sampling_batch_info.py +17 -22
  353. sglang/srt/sampling/sampling_params.py +70 -2
  354. sglang/srt/server_args.py +846 -163
  355. sglang/srt/server_args_config_parser.py +1 -1
  356. sglang/srt/single_batch_overlap.py +36 -31
  357. sglang/srt/speculative/base_spec_worker.py +34 -0
  358. sglang/srt/speculative/draft_utils.py +226 -0
  359. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +24 -7
  360. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +23 -2
  361. sglang/srt/speculative/eagle_info.py +57 -18
  362. sglang/srt/speculative/eagle_info_v2.py +458 -0
  363. sglang/srt/speculative/eagle_utils.py +138 -0
  364. sglang/srt/speculative/eagle_worker.py +83 -280
  365. sglang/srt/speculative/eagle_worker_v2.py +702 -0
  366. sglang/srt/speculative/{ngram_utils.py → ngram_info.py} +14 -9
  367. sglang/srt/speculative/ngram_worker.py +12 -11
  368. sglang/srt/speculative/spec_info.py +2 -0
  369. sglang/srt/speculative/spec_utils.py +38 -3
  370. sglang/srt/speculative/standalone_worker.py +4 -14
  371. sglang/srt/tokenizer/tiktoken_tokenizer.py +2 -2
  372. sglang/srt/two_batch_overlap.py +28 -14
  373. sglang/srt/utils/__init__.py +1 -1
  374. sglang/srt/{bench_utils.py → utils/bench_utils.py} +4 -2
  375. sglang/srt/utils/common.py +272 -82
  376. sglang/srt/utils/hf_transformers_utils.py +44 -17
  377. sglang/srt/{host_shared_memory.py → utils/host_shared_memory.py} +0 -1
  378. sglang/srt/{offloader.py → utils/offloader.py} +4 -4
  379. sglang/srt/utils/profile_merger.py +199 -0
  380. sglang/test/attention/test_flashattn_backend.py +1 -1
  381. sglang/test/attention/test_flashattn_mla_backend.py +0 -1
  382. sglang/test/attention/test_prefix_chunk_info.py +0 -2
  383. sglang/test/attention/test_trtllm_mla_backend.py +221 -53
  384. sglang/test/few_shot_gsm8k_engine.py +2 -4
  385. sglang/test/kit_matched_stop.py +157 -0
  386. sglang/test/longbench_v2/__init__.py +1 -0
  387. sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
  388. sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
  389. sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
  390. sglang/test/run_eval.py +41 -0
  391. sglang/test/runners.py +2 -0
  392. sglang/test/send_one.py +42 -7
  393. sglang/test/simple_eval_common.py +3 -0
  394. sglang/test/simple_eval_gpqa.py +0 -1
  395. sglang/test/simple_eval_humaneval.py +0 -3
  396. sglang/test/simple_eval_longbench_v2.py +344 -0
  397. sglang/test/test_block_fp8.py +1 -2
  398. sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -1
  399. sglang/test/test_cutlass_moe.py +1 -2
  400. sglang/test/test_cutlass_w4a8_moe.py +10 -20
  401. sglang/test/test_deterministic.py +463 -107
  402. sglang/test/test_deterministic_utils.py +74 -0
  403. sglang/test/test_disaggregation_utils.py +81 -0
  404. sglang/test/test_marlin_moe.py +0 -1
  405. sglang/test/test_utils.py +85 -20
  406. sglang/version.py +1 -1
  407. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/METADATA +48 -35
  408. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/RECORD +414 -350
  409. sglang/srt/layers/attention/mamba/mamba_utils.py +0 -81
  410. sglang/srt/managers/tp_worker_overlap_thread.py +0 -311
  411. sglang/srt/models/vila.py +0 -306
  412. sglang/srt/speculative/build_eagle_tree.py +0 -427
  413. sglang/test/test_block_fp8_ep.py +0 -358
  414. /sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/__init__.py +0 -0
  415. /sglang/srt/{aio_rwlock.py → utils/aio_rwlock.py} +0 -0
  416. /sglang/srt/{torch_memory_saver_adapter.py → utils/torch_memory_saver_adapter.py} +0 -0
  417. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/WHEEL +0 -0
  418. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/licenses/LICENSE +0 -0
  419. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/top_level.txt +0 -0
sglang/srt/configs/mamba_utils.py (new file)
@@ -0,0 +1,117 @@
+# Copyright 2025 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Common config utils for mamba2 - NemotronH, FalconH1, Qwen3Next, etc."""
+
+import os
+from dataclasses import dataclass, field
+
+import numpy as np
+import torch
+
+from sglang.srt.distributed.utils import divide
+
+
+def extra_groups_for_head_shards(ngroups: int, tp_size: int):
+    """Compute the increase in group numbers to account for
+    replication in order to accompany the head shards."""
+
+    # in the case ngoups % tp_size == 0, this will be zero
+    if ngroups % tp_size == 0:
+        return 0
+
+    # for n_groups == 1, this is exactly tp_size - n_groups
+    return tp_size - ngroups
+
+
+@dataclass(kw_only=True, frozen=True)
+class Mamba2StateShape:
+    conv: tuple[int, int]
+    temporal: tuple[int, int, int]
+
+    intermediate_size: int
+    conv_dim: int
+    ssm_state_size: int
+    num_heads: int
+    head_dim: int
+    state_size: int
+    conv_kernel: int
+
+    @staticmethod
+    def create(
+        *,
+        tp_world_size: int,
+        intermediate_size: int,
+        n_groups: int,
+        num_heads: int,
+        head_dim: int,
+        state_size: int,
+        conv_kernel: int,
+    ) -> "Mamba2StateShape":
+        # if n_groups is not divisible by world_size, need to extend the shards
+        # to ensure all groups needed by a head is sharded along with it
+        if n_groups % tp_world_size != 0:
+            extra_groups = extra_groups_for_head_shards(n_groups, tp_world_size)
+            n_groups += extra_groups
+        # heads and n_groups are TP-ed
+        conv_dim = intermediate_size + 2 * n_groups * state_size
+
+        # contiguous along 'dim' axis
+        conv_state_shape = divide(conv_dim, tp_world_size), conv_kernel - 1
+
+        # These are not TP-ed as they depend on A, dt_bias, D
+        # - they are typically small
+        #   e.g., QWen3-Next: (32, 128, 128)
+        temporal_state_shape = (divide(num_heads, tp_world_size), head_dim, state_size)
+        return Mamba2StateShape(
+            conv=conv_state_shape,
+            temporal=temporal_state_shape,
+            intermediate_size=intermediate_size,
+            conv_dim=conv_dim,
+            ssm_state_size=state_size,
+            num_heads=num_heads,
+            head_dim=head_dim,
+            state_size=state_size,
+            conv_kernel=conv_kernel,
+        )
+
+
+@dataclass(kw_only=True, frozen=True)
+class Mamba2StateDType:
+    conv: torch.dtype
+    temporal: torch.dtype
+
+
+CONV_DTYPE = torch.bfloat16
+
+
+def mamba2_state_dtype() -> Mamba2StateDType:
+    dtype_map = {
+        "float32": torch.float32,
+        "bfloat16": torch.bfloat16,
+    }
+    ssm_dtype = dtype_map[os.environ["SGLANG_MAMBA_SSM_DTYPE"]]
+    return Mamba2StateDType(conv=CONV_DTYPE, temporal=ssm_dtype)
+
+
+@dataclass(kw_only=True, frozen=True)
+class Mamba2CacheParams:
+    shape: Mamba2StateShape
+    dtype: Mamba2StateDType = field(default_factory=mamba2_state_dtype)
+    layers: list[int]
+
+    @property
+    def mamba_cache_per_req(self) -> int:
+        return (
+            int(np.prod(self.shape.conv)) * self.dtype.conv.itemsize
+            + int(np.prod(self.shape.temporal)) * self.dtype.temporal.itemsize
+        ) * len(self.layers)
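The new mamba_cache_per_req property gives the per-request byte cost of the mamba state: the conv state size times its itemsize plus the temporal (SSM) state size times its itemsize, summed over the hybrid layers. The conv state is always bfloat16 (CONV_DTYPE), while the SSM state dtype is read from SGLANG_MAMBA_SSM_DTYPE. Below is a minimal standalone sketch of that arithmetic; every concrete number (TP degree, group count, conv kernel, layer count) is a hypothetical illustration, not a value taken from this diff.

# Standalone sketch of the mamba_cache_per_req arithmetic above.
# All concrete numbers below are hypothetical, chosen only for illustration.
import numpy as np

tp_world_size = 1                                  # assumed TP degree
num_heads, head_dim, state_size = 32, 128, 128     # per the "(32, 128, 128)" comment
conv_kernel, n_groups = 4, 16                      # assumed values
intermediate_size = num_heads * head_dim

conv_dim = intermediate_size + 2 * n_groups * state_size
conv_shape = (conv_dim // tp_world_size, conv_kernel - 1)            # bfloat16 -> 2 bytes
temporal_shape = (num_heads // tp_world_size, head_dim, state_size)  # bfloat16 -> 2 bytes

bytes_per_layer = 2 * int(np.prod(conv_shape)) + 2 * int(np.prod(temporal_shape))
num_mamba_layers = 24                              # assumed layer count
print(f"~{bytes_per_layer * num_mamba_layers / 2**20:.1f} MiB of mamba state per request")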
sglang/srt/configs/model_config.py
@@ -17,7 +17,7 @@ import logging
 import math
 import os
 from enum import Enum, IntEnum, auto
-from typing import List, Optional, Set, Union
+from typing import Any, List, Optional, Set, Union
 
 import torch
 from transformers import PretrainedConfig
@@ -53,7 +53,11 @@ def is_deepseek_nsa(config: PretrainedConfig) -> bool:
     return (
         config.architectures is not None
        and config.architectures[0]
-        in ["DeepseekV3ForCausalLM", "DeepseekV32ForCausalLM"]
+        in [
+            "DeepseekV3ForCausalLM",
+            "DeepseekV32ForCausalLM",
+            "DeepseekV3ForCausalLMNextN",
+        ]
         and getattr(config, "index_topk", None) is not None
     )
 
@@ -87,8 +91,12 @@ class ModelConfig:
         quantization: Optional[str] = None,
         override_config_file: Optional[str] = None,
         is_draft_model: bool = False,
-        hybrid_kvcache_ratio: Optional[float] = None,
+        hybrid_kvcache_ratio: Optional[
+            float
+        ] = None,  # TODO: remove this, it is not a model config
         model_impl: Union[str, ModelImpl] = ModelImpl.AUTO,
+        sampling_defaults: str = "openai",
+        quantize_and_serve: bool = False,
     ) -> None:
         # Parse args
         self.model_path = model_path
@@ -96,6 +104,11 @@ class ModelConfig:
         self.quantization = quantization
         self.is_draft_model = is_draft_model
         self.model_impl = model_impl
+        self.sampling_defaults = sampling_defaults
+        self.quantize_and_serve = quantize_and_serve
+
+        # Validate quantize_and_serve configuration
+        self._validate_quantize_and_serve_config()
 
         # Get hf config
         self._maybe_pull_model_tokenizer_from_remote()
@@ -211,6 +224,8 @@ class ModelConfig:
             quantization=server_args.quantization,
             hybrid_kvcache_ratio=server_args.hybrid_kvcache_ratio,
             model_impl=server_args.model_impl,
+            sampling_defaults=server_args.sampling_defaults,
+            quantize_and_serve=server_args.quantize_and_serve,
             **kwargs,
         )
 
@@ -477,31 +492,32 @@ class ModelConfig:
         # example: https://huggingface.co/nvidia/Llama-3.1-8B-Instruct-FP8/tree/main
         # example: https://huggingface.co/Barrrrry/DeepSeek-R1-W4AFP8/tree/main
         is_local = os.path.exists(self.model_path)
-        modelopt_quant_config = {"quant_method": "modelopt"}
         if not is_local:
             import huggingface_hub
 
             try:
-                from huggingface_hub import HfApi
+                from huggingface_hub import HfApi, hf_hub_download
 
                 hf_api = HfApi()
-
-                def check_hf_quant_config():
-                    return hf_api.file_exists(
-                        self.model_path, "hf_quant_config.json"
-                    )
-
                 # Retry HF API call up to 3 times
                 file_exists = retry(
-                    check_hf_quant_config,
+                    lambda: hf_api.file_exists(
+                        self.model_path, "hf_quant_config.json"
+                    ),
                     max_retry=2,
                     initial_delay=1.0,
                     max_delay=5.0,
                 )
-
                 if file_exists:
-                    quant_cfg = modelopt_quant_config
-
+                    # Download and parse the quantization config for remote models
+                    quant_config_file = hf_hub_download(
+                        repo_id=self.model_path,
+                        filename="hf_quant_config.json",
+                        revision=self.revision,
+                    )
+                    with open(quant_config_file) as f:
+                        quant_config_dict = json.load(f)
+                    quant_cfg = self._parse_modelopt_quant_config(quant_config_dict)
             except huggingface_hub.errors.OfflineModeIsEnabled:
                 logger.warning(
                     "Offline mode is enabled, skipping hf_quant_config.json check"
@@ -510,21 +526,79 @@ class ModelConfig:
                 logger.warning(
                     f"Failed to check hf_quant_config.json: {self.model_path} {e}"
                 )
-
         elif os.path.exists(os.path.join(self.model_path, "hf_quant_config.json")):
             quant_config_file = os.path.join(
                 self.model_path, "hf_quant_config.json"
             )
             with open(quant_config_file) as f:
                 quant_config_dict = json.load(f)
-            json_quant_configs = quant_config_dict["quantization"]
-            quant_algo = json_quant_configs.get("quant_algo", None)
-            if quant_algo == "MIXED_PRECISION":
-                quant_cfg = {"quant_method": "w4afp8"}
-            else:
-                quant_cfg = modelopt_quant_config
+            quant_cfg = self._parse_modelopt_quant_config(quant_config_dict)
         return quant_cfg
 
+    def _parse_modelopt_quant_config(self, quant_config_dict: dict) -> Optional[dict]:
+        """Parse ModelOpt quantization config and return the appropriate quant_method."""
+        json_quant_configs = quant_config_dict["quantization"]
+        quant_algo = json_quant_configs.get("quant_algo", None)
+
+        if quant_algo == "MIXED_PRECISION":
+            return {"quant_method": "w4afp8"}
+        elif quant_algo and ("FP4" in quant_algo or "NVFP4" in quant_algo):
+            return {"quant_method": "modelopt_fp4"}
+        elif quant_algo and "FP8" in quant_algo:
+            return {"quant_method": "modelopt_fp8"}
+        else:
+            return None
+
+    def _is_already_quantized(self) -> bool:
+        """Check if the model is already quantized based on config files."""
+        # Check for HuggingFace quantization config
+        from sglang.srt.utils import has_hf_quant_config
+
+        return has_hf_quant_config(self.model_path)
+
+    def _get_modelopt_quant_type(self) -> str:
+        """Extract ModelOpt quantization type from unified quantization flag."""
+        if self.quantization == "modelopt_fp8":
+            return "fp8"
+        elif self.quantization == "modelopt_fp4":
+            return "nvfp4"
+        elif self.quantization == "modelopt":
+            # Auto-detect from model config
+            quant_cfg = self._parse_quant_hf_config()
+            if quant_cfg:
+                quant_method = quant_cfg.get("quant_method", "").lower()
+                if "fp4" in quant_method:
+                    return "fp4"
+                elif "fp8" in quant_method:
+                    return "fp8"
+            # Default to fp8 if can't detect
+            return "fp8"
+        else:
+            return "fp8"  # Default fallback
+
+    def _validate_quantize_and_serve_config(self):
+        """Validate quantize_and_serve configuration."""
+        if not self.quantize_and_serve:
+            return
+
+        # Check if ModelOpt quantization is specified
+        modelopt_quantization_specified = self.quantization in [
+            "modelopt",
+            "modelopt_fp8",
+            "modelopt_fp4",
+        ]
+
+        if not modelopt_quantization_specified:
+            raise ValueError("quantize_and_serve requires ModelOpt quantization")
+
+        # quantize_and_serve is disabled due to compatibility issues
+        raise NotImplementedError(
+            "quantize_and_serve functionality is currently disabled due to compatibility issues. "
+            "Please use the separate quantize-then-deploy workflow instead. "
+            "Step 1: Quantize and export model. "
+            "Step 2: Deploy the exported model."
+        )
+
     # adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/config.py
     def _verify_quantization(self) -> None:
         supported_quantization = [*QUANTIZATION_METHODS]
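Net effect of the refactor above: hf_quant_config.json is parsed (locally, or after an hf_hub_download for remote repos) and mapped to a concrete quantization method, modelopt_fp8, modelopt_fp4, or w4afp8, instead of the old catch-all "modelopt". A standalone sketch of that mapping, exercised with made-up payloads that mirror the hf_quant_config.json schema:

# Standalone sketch of the quant_algo -> quant_method mapping added above;
# the example payloads are hypothetical, mirroring the hf_quant_config.json schema.
def parse_modelopt_quant_config(quant_config_dict: dict):
    quant_algo = quant_config_dict["quantization"].get("quant_algo")
    if quant_algo == "MIXED_PRECISION":
        return {"quant_method": "w4afp8"}
    if quant_algo and ("FP4" in quant_algo or "NVFP4" in quant_algo):
        return {"quant_method": "modelopt_fp4"}
    if quant_algo and "FP8" in quant_algo:
        return {"quant_method": "modelopt_fp8"}
    return None


assert parse_modelopt_quant_config({"quantization": {"quant_algo": "FP8"}}) == {
    "quant_method": "modelopt_fp8"
}
assert parse_modelopt_quant_config({"quantization": {"quant_algo": "NVFP4"}}) == {
    "quant_method": "modelopt_fp4"
}
assert parse_modelopt_quant_config({"quantization": {"quant_algo": "MIXED_PRECISION"}}) == {
    "quant_method": "w4afp8"
}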
@@ -543,7 +617,8 @@ class ModelConfig:
         optimized_quantization_methods = [
             "fp8",
             "marlin",
-            "modelopt",
+            "modelopt_fp8",
+            "modelopt_fp4",
             "gptq_marlin_24",
             "gptq_marlin",
             "awq_marlin",
@@ -657,6 +732,38 @@ class ModelConfig:
             eos_ids = eos_ids | generation_eos_ids
         return eos_ids
 
+    def get_default_sampling_params(self) -> dict[str, Any]:
+        """
+        Get default sampling parameters from the model's generation config.
+
+        This method returns non-default sampling parameters from the model's
+        generation_config.json when sampling_defaults is set to "model".
+
+        Returns:
+            A dictionary containing the non-default sampling parameters.
+        """
+        if self.sampling_defaults != "model":
+            return {}
+
+        if self.hf_generation_config is None:
+            return {}
+
+        config = self.hf_generation_config.to_dict()
+
+        available_params = [
+            "repetition_penalty",
+            "temperature",
+            "top_k",
+            "top_p",
+            "min_p",
+        ]
+
+        default_sampling_params = {
+            p: config.get(p) for p in available_params if config.get(p) is not None
+        }
+
+        return default_sampling_params
+
     def _maybe_pull_model_tokenizer_from_remote(self) -> None:
         """
         Pull the model config files to a temporary
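The companion server argument sampling_defaults controls whether these values are applied: with the default "openai" the method returns an empty dict, and with "model" the non-null entries of generation_config.json become the request defaults. A small sketch of the filtering step, using a hypothetical generation config:

# Sketch of the filtering done by get_default_sampling_params; the
# generation_config payload here is hypothetical, not from a real checkpoint.
generation_config = {"do_sample": True, "temperature": 0.6, "top_p": 0.95, "top_k": 20}
available_params = ["repetition_penalty", "temperature", "top_k", "top_p", "min_p"]

default_sampling_params = {
    p: generation_config.get(p)
    for p in available_params
    if generation_config.get(p) is not None
}
print(default_sampling_params)  # {'temperature': 0.6, 'top_k': 20, 'top_p': 0.95}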
@@ -698,7 +805,7 @@ def _get_and_verify_dtype(
 ) -> torch.dtype:
     # NOTE: getattr(config, "torch_dtype", torch.float32) is not correct
     # because config.torch_dtype can be None.
-    config_dtype = getattr(config, "torch_dtype", None)
+    config_dtype = getattr(config, "dtype", None)
     if isinstance(config_dtype, str):
         config_dtype = _STR_DTYPE_TO_TORCH_DTYPE.get(config_dtype, None)
     if config_dtype is None:
@@ -802,15 +909,19 @@ multimodal_model_archs = [
     "Qwen2_5_VLForConditionalGeneration",
     "Qwen3VLForConditionalGeneration",
     "Qwen3VLMoeForConditionalGeneration",
+    "Qwen3OmniMoeForConditionalGeneration",
     "KimiVLForConditionalGeneration",
     "InternVLChatModel",
     "InternS1ForConditionalGeneration",
     "Phi4MMForCausalLM",
-    "VILAForConditionalGeneration",
     "Step3VLForConditionalGeneration",
+    "POINTSV15ChatModel",
     "DotsVLMForCausalLM",
     "DotsOCRForCausalLM",
     "Sarashina2VisionForCausalLM",
+    "NVILAForConditionalGeneration",
+    "NVILALiteForConditionalGeneration",
+    "DeepseekOCRForCausalLM",
 ]
 
 
sglang/srt/configs/modelopt_config.py (new file)
@@ -0,0 +1,30 @@
+# Configuration for NVIDIA ModelOpt quantization integration
+from dataclasses import dataclass
+from typing import Optional
+
+
+@dataclass
+class ModelOptConfig:
+    """Configuration for NVIDIA ModelOpt quantization operations.
+
+    This configuration class holds parameters for ModelOpt quantization,
+    checkpoint management, and model export operations.
+
+    Args:
+        quant: Quantization method/type (e.g., "fp8", "fp4")
+        checkpoint_restore_path: Path to restore ModelOpt checkpoint from
+        checkpoint_save_path: Path to save ModelOpt checkpoint to
+        export_path: Path to export quantized model in HuggingFace format
+        quantize_and_serve: Whether to quantize and serve in one step
+    """
+
+    quant: Optional[str] = None
+    checkpoint_restore_path: Optional[str] = None
+    checkpoint_save_path: Optional[str] = None
+    export_path: Optional[str] = None
+    quantize_and_serve: bool = False
+
+    def __post_init__(self):
+        """Validate configuration after initialization."""
+        # Add any validation logic if needed
+        pass
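For reference, a construction sketch for the new dataclass; the import path follows the file location listed above, and the checkpoint/export paths are placeholders, not values from this diff:

# Hypothetical usage of the new ModelOptConfig; the paths below are placeholders.
from sglang.srt.configs.modelopt_config import ModelOptConfig

cfg = ModelOptConfig(
    quant="fp8",
    checkpoint_restore_path=None,
    checkpoint_save_path="/tmp/modelopt_ckpt",
    export_path="/tmp/modelopt_export",
    quantize_and_serve=False,  # ModelConfig currently rejects True with NotImplementedError
)
print(cfg.quant, cfg.export_path)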