sglang 0.5.3rc0__py3-none-any.whl → 0.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (482) hide show
  1. sglang/bench_one_batch.py +54 -37
  2. sglang/bench_one_batch_server.py +340 -34
  3. sglang/bench_serving.py +340 -159
  4. sglang/check_env.py +1 -1
  5. sglang/compile_deep_gemm.py +6 -2
  6. sglang/global_config.py +1 -25
  7. sglang/lang/api.py +6 -0
  8. sglang/lang/backend/runtime_endpoint.py +1 -1
  9. sglang/lang/interpreter.py +1 -0
  10. sglang/lang/ir.py +13 -0
  11. sglang/launch_server.py +9 -2
  12. sglang/profiler.py +20 -3
  13. sglang/srt/_custom_ops.py +1 -1
  14. sglang/srt/batch_invariant_ops/__init__.py +27 -0
  15. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +547 -0
  16. sglang/srt/checkpoint_engine/checkpoint_engine_worker.py +142 -0
  17. sglang/srt/compilation/backend.py +437 -0
  18. sglang/srt/compilation/compilation_config.py +20 -0
  19. sglang/srt/compilation/compilation_counter.py +47 -0
  20. sglang/srt/compilation/compile.py +210 -0
  21. sglang/srt/compilation/compiler_interface.py +503 -0
  22. sglang/srt/compilation/cuda_piecewise_backend.py +228 -0
  23. sglang/srt/compilation/fix_functionalization.py +134 -0
  24. sglang/srt/compilation/fx_utils.py +83 -0
  25. sglang/srt/compilation/inductor_pass.py +140 -0
  26. sglang/srt/compilation/pass_manager.py +66 -0
  27. sglang/srt/compilation/piecewise_context_manager.py +40 -0
  28. sglang/srt/compilation/weak_ref_tensor_jit.py +16 -0
  29. sglang/srt/configs/__init__.py +8 -0
  30. sglang/srt/configs/deepseek_ocr.py +262 -0
  31. sglang/srt/configs/deepseekvl2.py +194 -96
  32. sglang/srt/configs/dots_ocr.py +64 -0
  33. sglang/srt/configs/dots_vlm.py +2 -7
  34. sglang/srt/configs/falcon_h1.py +309 -0
  35. sglang/srt/configs/load_config.py +33 -2
  36. sglang/srt/configs/mamba_utils.py +117 -0
  37. sglang/srt/configs/model_config.py +284 -118
  38. sglang/srt/configs/modelopt_config.py +30 -0
  39. sglang/srt/configs/nemotron_h.py +286 -0
  40. sglang/srt/configs/olmo3.py +105 -0
  41. sglang/srt/configs/points_v15_chat.py +29 -0
  42. sglang/srt/configs/qwen3_next.py +11 -47
  43. sglang/srt/configs/qwen3_omni.py +613 -0
  44. sglang/srt/configs/qwen3_vl.py +576 -0
  45. sglang/srt/connector/remote_instance.py +1 -1
  46. sglang/srt/constrained/base_grammar_backend.py +6 -1
  47. sglang/srt/constrained/llguidance_backend.py +5 -0
  48. sglang/srt/constrained/outlines_backend.py +1 -1
  49. sglang/srt/constrained/outlines_jump_forward.py +1 -1
  50. sglang/srt/constrained/reasoner_grammar_backend.py +9 -6
  51. sglang/srt/constrained/utils.py +12 -0
  52. sglang/srt/constrained/xgrammar_backend.py +26 -15
  53. sglang/srt/debug_utils/dumper.py +10 -3
  54. sglang/srt/disaggregation/ascend/conn.py +2 -2
  55. sglang/srt/disaggregation/ascend/transfer_engine.py +48 -10
  56. sglang/srt/disaggregation/base/conn.py +17 -4
  57. sglang/srt/disaggregation/common/conn.py +268 -98
  58. sglang/srt/disaggregation/decode.py +172 -39
  59. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
  60. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +25 -16
  61. sglang/srt/disaggregation/fake/conn.py +11 -3
  62. sglang/srt/disaggregation/mooncake/conn.py +203 -555
  63. sglang/srt/disaggregation/nixl/conn.py +217 -63
  64. sglang/srt/disaggregation/prefill.py +113 -270
  65. sglang/srt/disaggregation/utils.py +36 -5
  66. sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
  67. sglang/srt/distributed/device_communicators/custom_all_reduce.py +6 -6
  68. sglang/srt/distributed/device_communicators/pymscclpp.py +2 -2
  69. sglang/srt/distributed/device_communicators/pynccl.py +24 -12
  70. sglang/srt/distributed/device_communicators/pynccl_allocator.py +2 -2
  71. sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
  72. sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
  73. sglang/srt/distributed/naive_distributed.py +5 -4
  74. sglang/srt/distributed/parallel_state.py +203 -97
  75. sglang/srt/elastic_ep/elastic_ep.py +74 -0
  76. sglang/srt/entrypoints/context.py +3 -2
  77. sglang/srt/entrypoints/engine.py +85 -65
  78. sglang/srt/entrypoints/grpc_server.py +632 -305
  79. sglang/srt/entrypoints/harmony_utils.py +2 -2
  80. sglang/srt/entrypoints/http_server.py +169 -17
  81. sglang/srt/entrypoints/http_server_engine.py +1 -7
  82. sglang/srt/entrypoints/openai/protocol.py +327 -34
  83. sglang/srt/entrypoints/openai/serving_base.py +74 -8
  84. sglang/srt/entrypoints/openai/serving_chat.py +202 -118
  85. sglang/srt/entrypoints/openai/serving_classify.py +204 -0
  86. sglang/srt/entrypoints/openai/serving_completions.py +20 -4
  87. sglang/srt/entrypoints/openai/serving_embedding.py +1 -0
  88. sglang/srt/entrypoints/openai/serving_responses.py +47 -2
  89. sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
  90. sglang/srt/environ.py +323 -0
  91. sglang/srt/eplb/eplb_algorithms/__init__.py +18 -1
  92. sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -2
  93. sglang/srt/eplb/eplb_algorithms/elasticity_aware.py +87 -0
  94. sglang/srt/eplb/expert_distribution.py +3 -4
  95. sglang/srt/eplb/expert_location.py +30 -5
  96. sglang/srt/eplb/expert_location_dispatch.py +2 -2
  97. sglang/srt/eplb/expert_location_updater.py +2 -2
  98. sglang/srt/function_call/base_format_detector.py +17 -18
  99. sglang/srt/function_call/function_call_parser.py +21 -16
  100. sglang/srt/function_call/glm4_moe_detector.py +4 -8
  101. sglang/srt/function_call/gpt_oss_detector.py +24 -1
  102. sglang/srt/function_call/json_array_parser.py +61 -0
  103. sglang/srt/function_call/kimik2_detector.py +17 -4
  104. sglang/srt/function_call/utils.py +98 -7
  105. sglang/srt/grpc/compile_proto.py +245 -0
  106. sglang/srt/grpc/grpc_request_manager.py +915 -0
  107. sglang/srt/grpc/health_servicer.py +189 -0
  108. sglang/srt/grpc/scheduler_launcher.py +181 -0
  109. sglang/srt/grpc/sglang_scheduler_pb2.py +81 -68
  110. sglang/srt/grpc/sglang_scheduler_pb2.pyi +124 -61
  111. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +92 -1
  112. sglang/srt/layers/activation.py +11 -7
  113. sglang/srt/layers/attention/aiter_backend.py +17 -18
  114. sglang/srt/layers/attention/ascend_backend.py +125 -10
  115. sglang/srt/layers/attention/attention_registry.py +226 -0
  116. sglang/srt/layers/attention/base_attn_backend.py +32 -4
  117. sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
  118. sglang/srt/layers/attention/double_sparsity_backend.py +2 -2
  119. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
  120. sglang/srt/layers/attention/fla/chunk.py +0 -1
  121. sglang/srt/layers/attention/fla/chunk_o.py +1 -1
  122. sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +2 -2
  123. sglang/srt/layers/attention/fla/fused_recurrent.py +4 -4
  124. sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +2 -2
  125. sglang/srt/layers/attention/fla/index.py +0 -2
  126. sglang/srt/layers/attention/fla/layernorm_gated.py +50 -32
  127. sglang/srt/layers/attention/fla/utils.py +0 -3
  128. sglang/srt/layers/attention/fla/wy_fast.py +0 -2
  129. sglang/srt/layers/attention/flashattention_backend.py +52 -15
  130. sglang/srt/layers/attention/flashinfer_backend.py +357 -212
  131. sglang/srt/layers/attention/flashinfer_mla_backend.py +31 -33
  132. sglang/srt/layers/attention/flashmla_backend.py +9 -7
  133. sglang/srt/layers/attention/hybrid_attn_backend.py +12 -4
  134. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +236 -133
  135. sglang/srt/layers/attention/intel_amx_backend.py +1 -1
  136. sglang/srt/layers/attention/mamba/causal_conv1d.py +2 -1
  137. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +24 -103
  138. sglang/srt/layers/attention/mamba/mamba.py +514 -1
  139. sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
  140. sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
  141. sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
  142. sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
  143. sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
  144. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +214 -0
  145. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +562 -0
  146. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +646 -0
  147. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +261 -0
  148. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +264 -0
  149. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
  150. sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
  151. sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
  152. sglang/srt/layers/attention/nsa/nsa_indexer.py +718 -0
  153. sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
  154. sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
  155. sglang/srt/layers/attention/nsa/transform_index.py +144 -0
  156. sglang/srt/layers/attention/nsa/triton_kernel.py +136 -0
  157. sglang/srt/layers/attention/nsa/utils.py +23 -0
  158. sglang/srt/layers/attention/nsa_backend.py +1201 -0
  159. sglang/srt/layers/attention/tbo_backend.py +6 -6
  160. sglang/srt/layers/attention/torch_flex_backend.py +325 -0
  161. sglang/srt/layers/attention/triton_backend.py +249 -42
  162. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +2 -2
  163. sglang/srt/layers/attention/triton_ops/extend_attention.py +539 -44
  164. sglang/srt/layers/attention/trtllm_mha_backend.py +7 -9
  165. sglang/srt/layers/attention/trtllm_mla_backend.py +523 -48
  166. sglang/srt/layers/attention/utils.py +11 -7
  167. sglang/srt/layers/attention/vision.py +61 -3
  168. sglang/srt/layers/attention/wave_backend.py +4 -4
  169. sglang/srt/layers/attention/xpu_backend.py +1028 -0
  170. sglang/srt/layers/communicator.py +19 -7
  171. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/compile_utils.py +4 -8
  172. sglang/srt/layers/deep_gemm_wrapper/configurer.py +25 -0
  173. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/entrypoint.py +3 -3
  174. sglang/srt/layers/dp_attention.py +28 -1
  175. sglang/srt/layers/elementwise.py +3 -1
  176. sglang/srt/layers/layernorm.py +47 -15
  177. sglang/srt/layers/linear.py +30 -5
  178. sglang/srt/layers/logits_processor.py +161 -18
  179. sglang/srt/layers/modelopt_utils.py +11 -0
  180. sglang/srt/layers/moe/cutlass_moe.py +0 -2
  181. sglang/srt/layers/moe/cutlass_w4a8_moe.py +213 -21
  182. sglang/srt/layers/moe/ep_moe/kernels.py +36 -458
  183. sglang/srt/layers/moe/ep_moe/layer.py +243 -448
  184. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +52 -25
  185. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  186. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json +146 -0
  187. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  188. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  189. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
  190. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +17 -5
  191. sglang/srt/layers/moe/fused_moe_triton/layer.py +86 -81
  192. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +18 -42
  193. sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
  194. sglang/srt/layers/moe/moe_runner/runner.py +3 -0
  195. sglang/srt/layers/moe/moe_runner/triton.py +3 -1
  196. sglang/srt/layers/moe/rocm_moe_utils.py +0 -1
  197. sglang/srt/layers/moe/router.py +51 -15
  198. sglang/srt/layers/moe/token_dispatcher/__init__.py +10 -0
  199. sglang/srt/layers/moe/token_dispatcher/base.py +1 -1
  200. sglang/srt/layers/moe/token_dispatcher/deepep.py +177 -106
  201. sglang/srt/layers/moe/token_dispatcher/mooncake.py +386 -0
  202. sglang/srt/layers/moe/token_dispatcher/standard.py +46 -0
  203. sglang/srt/layers/moe/topk.py +3 -2
  204. sglang/srt/layers/moe/utils.py +27 -1
  205. sglang/srt/layers/parameter.py +23 -6
  206. sglang/srt/layers/quantization/__init__.py +2 -53
  207. sglang/srt/layers/quantization/awq.py +183 -6
  208. sglang/srt/layers/quantization/awq_triton.py +29 -0
  209. sglang/srt/layers/quantization/base_config.py +20 -1
  210. sglang/srt/layers/quantization/compressed_tensors/__init__.py +7 -0
  211. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +21 -49
  212. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +421 -70
  213. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +5 -0
  214. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +4 -22
  215. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
  216. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +339 -0
  217. sglang/srt/layers/quantization/fp8.py +86 -20
  218. sglang/srt/layers/quantization/fp8_kernel.py +55 -10
  219. sglang/srt/layers/quantization/fp8_utils.py +43 -15
  220. sglang/srt/layers/quantization/fpgemm_fp8.py +2 -3
  221. sglang/srt/layers/quantization/gptq.py +0 -1
  222. sglang/srt/layers/quantization/int8_kernel.py +18 -2
  223. sglang/srt/layers/quantization/marlin_utils.py +12 -0
  224. sglang/srt/layers/quantization/modelopt_quant.py +141 -81
  225. sglang/srt/layers/quantization/mxfp4.py +17 -34
  226. sglang/srt/layers/quantization/petit.py +1 -1
  227. sglang/srt/layers/quantization/quark/quark.py +3 -1
  228. sglang/srt/layers/quantization/quark/quark_moe.py +18 -5
  229. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +0 -7
  230. sglang/srt/layers/quantization/unquant.py +1 -4
  231. sglang/srt/layers/quantization/utils.py +0 -1
  232. sglang/srt/layers/quantization/w4afp8.py +51 -24
  233. sglang/srt/layers/quantization/w8a8_int8.py +45 -27
  234. sglang/srt/layers/radix_attention.py +59 -9
  235. sglang/srt/layers/rotary_embedding.py +750 -46
  236. sglang/srt/layers/sampler.py +84 -16
  237. sglang/srt/layers/sparse_pooler.py +98 -0
  238. sglang/srt/layers/utils.py +23 -1
  239. sglang/srt/layers/vocab_parallel_embedding.py +4 -1
  240. sglang/srt/lora/backend/base_backend.py +3 -3
  241. sglang/srt/lora/backend/chunked_backend.py +348 -0
  242. sglang/srt/lora/backend/triton_backend.py +9 -4
  243. sglang/srt/lora/eviction_policy.py +139 -0
  244. sglang/srt/lora/lora.py +7 -5
  245. sglang/srt/lora/lora_manager.py +33 -7
  246. sglang/srt/lora/lora_registry.py +1 -1
  247. sglang/srt/lora/mem_pool.py +41 -17
  248. sglang/srt/lora/triton_ops/__init__.py +4 -0
  249. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
  250. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +176 -0
  251. sglang/srt/lora/utils.py +7 -5
  252. sglang/srt/managers/cache_controller.py +83 -152
  253. sglang/srt/managers/data_parallel_controller.py +156 -87
  254. sglang/srt/managers/detokenizer_manager.py +51 -24
  255. sglang/srt/managers/io_struct.py +223 -129
  256. sglang/srt/managers/mm_utils.py +49 -10
  257. sglang/srt/managers/multi_tokenizer_mixin.py +83 -98
  258. sglang/srt/managers/multimodal_processor.py +1 -2
  259. sglang/srt/managers/overlap_utils.py +130 -0
  260. sglang/srt/managers/schedule_batch.py +340 -529
  261. sglang/srt/managers/schedule_policy.py +158 -18
  262. sglang/srt/managers/scheduler.py +665 -620
  263. sglang/srt/managers/scheduler_input_blocker.py +1 -1
  264. sglang/srt/managers/scheduler_metrics_mixin.py +150 -131
  265. sglang/srt/managers/scheduler_output_processor_mixin.py +337 -122
  266. sglang/srt/managers/scheduler_pp_mixin.py +341 -0
  267. sglang/srt/managers/scheduler_profiler_mixin.py +62 -15
  268. sglang/srt/managers/scheduler_runtime_checker_mixin.py +217 -0
  269. sglang/srt/managers/scheduler_update_weights_mixin.py +40 -14
  270. sglang/srt/managers/tokenizer_communicator_mixin.py +141 -19
  271. sglang/srt/managers/tokenizer_manager.py +462 -226
  272. sglang/srt/managers/tp_worker.py +217 -156
  273. sglang/srt/managers/utils.py +79 -47
  274. sglang/srt/mem_cache/allocator.py +21 -22
  275. sglang/srt/mem_cache/allocator_ascend.py +42 -28
  276. sglang/srt/mem_cache/base_prefix_cache.py +3 -3
  277. sglang/srt/mem_cache/chunk_cache.py +20 -2
  278. sglang/srt/mem_cache/common.py +480 -0
  279. sglang/srt/mem_cache/evict_policy.py +38 -0
  280. sglang/srt/mem_cache/hicache_storage.py +44 -2
  281. sglang/srt/mem_cache/hiradix_cache.py +134 -34
  282. sglang/srt/mem_cache/mamba_radix_cache.py +993 -0
  283. sglang/srt/mem_cache/memory_pool.py +602 -208
  284. sglang/srt/mem_cache/memory_pool_host.py +134 -183
  285. sglang/srt/mem_cache/multimodal_cache.py +0 -1
  286. sglang/srt/mem_cache/radix_cache.py +263 -78
  287. sglang/srt/mem_cache/radix_cache_cpp.py +29 -21
  288. sglang/srt/mem_cache/storage/__init__.py +10 -0
  289. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +157 -0
  290. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +97 -0
  291. sglang/srt/mem_cache/storage/backend_factory.py +223 -0
  292. sglang/srt/mem_cache/storage/eic/eic_storage.py +777 -0
  293. sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
  294. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +0 -1
  295. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +180 -59
  296. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +15 -9
  297. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +217 -26
  298. sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +38 -9
  299. sglang/srt/mem_cache/storage/nixl/nixl_utils.py +1 -1
  300. sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +17 -2
  301. sglang/srt/mem_cache/swa_radix_cache.py +115 -58
  302. sglang/srt/metrics/collector.py +113 -120
  303. sglang/srt/metrics/func_timer.py +3 -8
  304. sglang/srt/metrics/utils.py +8 -1
  305. sglang/srt/model_executor/cpu_graph_runner.py +2 -2
  306. sglang/srt/model_executor/cuda_graph_runner.py +81 -36
  307. sglang/srt/model_executor/forward_batch_info.py +40 -50
  308. sglang/srt/model_executor/model_runner.py +507 -319
  309. sglang/srt/model_executor/npu_graph_runner.py +11 -5
  310. sglang/srt/model_executor/piecewise_cuda_graph_runner.py +539 -0
  311. sglang/srt/model_loader/__init__.py +1 -1
  312. sglang/srt/model_loader/loader.py +438 -37
  313. sglang/srt/model_loader/utils.py +0 -1
  314. sglang/srt/model_loader/weight_utils.py +200 -27
  315. sglang/srt/models/apertus.py +2 -3
  316. sglang/srt/models/arcee.py +2 -2
  317. sglang/srt/models/bailing_moe.py +40 -56
  318. sglang/srt/models/bailing_moe_nextn.py +3 -4
  319. sglang/srt/models/bert.py +1 -1
  320. sglang/srt/models/deepseek_nextn.py +25 -4
  321. sglang/srt/models/deepseek_ocr.py +1516 -0
  322. sglang/srt/models/deepseek_v2.py +793 -235
  323. sglang/srt/models/dots_ocr.py +171 -0
  324. sglang/srt/models/dots_vlm.py +0 -1
  325. sglang/srt/models/dots_vlm_vit.py +1 -1
  326. sglang/srt/models/falcon_h1.py +570 -0
  327. sglang/srt/models/gemma3_causal.py +0 -2
  328. sglang/srt/models/gemma3_mm.py +17 -1
  329. sglang/srt/models/gemma3n_mm.py +2 -3
  330. sglang/srt/models/glm4_moe.py +17 -40
  331. sglang/srt/models/glm4_moe_nextn.py +4 -4
  332. sglang/srt/models/glm4v.py +3 -2
  333. sglang/srt/models/glm4v_moe.py +6 -6
  334. sglang/srt/models/gpt_oss.py +12 -35
  335. sglang/srt/models/grok.py +10 -23
  336. sglang/srt/models/hunyuan.py +2 -7
  337. sglang/srt/models/interns1.py +0 -1
  338. sglang/srt/models/kimi_vl.py +1 -7
  339. sglang/srt/models/kimi_vl_moonvit.py +4 -2
  340. sglang/srt/models/llama.py +6 -2
  341. sglang/srt/models/llama_eagle3.py +1 -1
  342. sglang/srt/models/longcat_flash.py +6 -23
  343. sglang/srt/models/longcat_flash_nextn.py +4 -15
  344. sglang/srt/models/mimo.py +2 -13
  345. sglang/srt/models/mimo_mtp.py +1 -2
  346. sglang/srt/models/minicpmo.py +7 -5
  347. sglang/srt/models/mixtral.py +1 -4
  348. sglang/srt/models/mllama.py +1 -1
  349. sglang/srt/models/mllama4.py +27 -6
  350. sglang/srt/models/nemotron_h.py +511 -0
  351. sglang/srt/models/olmo2.py +31 -4
  352. sglang/srt/models/opt.py +5 -5
  353. sglang/srt/models/phi.py +1 -1
  354. sglang/srt/models/phi4mm.py +1 -1
  355. sglang/srt/models/phimoe.py +0 -1
  356. sglang/srt/models/pixtral.py +0 -3
  357. sglang/srt/models/points_v15_chat.py +186 -0
  358. sglang/srt/models/qwen.py +0 -1
  359. sglang/srt/models/qwen2.py +0 -7
  360. sglang/srt/models/qwen2_5_vl.py +5 -5
  361. sglang/srt/models/qwen2_audio.py +2 -15
  362. sglang/srt/models/qwen2_moe.py +70 -4
  363. sglang/srt/models/qwen2_vl.py +6 -3
  364. sglang/srt/models/qwen3.py +18 -3
  365. sglang/srt/models/qwen3_moe.py +50 -38
  366. sglang/srt/models/qwen3_next.py +43 -21
  367. sglang/srt/models/qwen3_next_mtp.py +3 -4
  368. sglang/srt/models/qwen3_omni_moe.py +661 -0
  369. sglang/srt/models/qwen3_vl.py +791 -0
  370. sglang/srt/models/qwen3_vl_moe.py +343 -0
  371. sglang/srt/models/registry.py +15 -3
  372. sglang/srt/models/roberta.py +55 -3
  373. sglang/srt/models/sarashina2_vision.py +268 -0
  374. sglang/srt/models/solar.py +505 -0
  375. sglang/srt/models/starcoder2.py +357 -0
  376. sglang/srt/models/step3_vl.py +3 -5
  377. sglang/srt/models/torch_native_llama.py +9 -2
  378. sglang/srt/models/utils.py +61 -0
  379. sglang/srt/multimodal/processors/base_processor.py +21 -9
  380. sglang/srt/multimodal/processors/deepseek_ocr.py +37 -0
  381. sglang/srt/multimodal/processors/deepseek_vl_v2.py +0 -3
  382. sglang/srt/multimodal/processors/dots_vlm.py +2 -4
  383. sglang/srt/multimodal/processors/glm4v.py +1 -5
  384. sglang/srt/multimodal/processors/internvl.py +20 -10
  385. sglang/srt/multimodal/processors/janus_pro.py +0 -1
  386. sglang/srt/multimodal/processors/mllama4.py +0 -8
  387. sglang/srt/multimodal/processors/phi4mm.py +0 -1
  388. sglang/srt/multimodal/processors/points_v15_chat.py +52 -0
  389. sglang/srt/multimodal/processors/qwen_vl.py +83 -17
  390. sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
  391. sglang/srt/multimodal/processors/step3_vl.py +1 -1
  392. sglang/srt/parser/conversation.py +41 -0
  393. sglang/srt/parser/jinja_template_utils.py +6 -0
  394. sglang/srt/parser/reasoning_parser.py +0 -1
  395. sglang/srt/sampling/custom_logit_processor.py +77 -2
  396. sglang/srt/sampling/sampling_batch_info.py +36 -23
  397. sglang/srt/sampling/sampling_params.py +75 -0
  398. sglang/srt/server_args.py +1300 -338
  399. sglang/srt/server_args_config_parser.py +146 -0
  400. sglang/srt/single_batch_overlap.py +161 -0
  401. sglang/srt/speculative/base_spec_worker.py +34 -0
  402. sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
  403. sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
  404. sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
  405. sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
  406. sglang/srt/speculative/cpp_ngram/param.h +125 -0
  407. sglang/srt/speculative/cpp_ngram/queue.h +71 -0
  408. sglang/srt/speculative/draft_utils.py +226 -0
  409. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +26 -8
  410. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +26 -3
  411. sglang/srt/speculative/eagle_info.py +786 -0
  412. sglang/srt/speculative/eagle_info_v2.py +458 -0
  413. sglang/srt/speculative/eagle_utils.py +113 -1270
  414. sglang/srt/speculative/eagle_worker.py +120 -285
  415. sglang/srt/speculative/eagle_worker_v2.py +702 -0
  416. sglang/srt/speculative/ngram_info.py +433 -0
  417. sglang/srt/speculative/ngram_worker.py +246 -0
  418. sglang/srt/speculative/spec_info.py +49 -0
  419. sglang/srt/speculative/spec_utils.py +641 -0
  420. sglang/srt/speculative/standalone_worker.py +4 -14
  421. sglang/srt/tokenizer/tiktoken_tokenizer.py +2 -2
  422. sglang/srt/tracing/trace.py +32 -6
  423. sglang/srt/two_batch_overlap.py +35 -18
  424. sglang/srt/utils/__init__.py +2 -0
  425. sglang/srt/{bench_utils.py → utils/bench_utils.py} +4 -2
  426. sglang/srt/{utils.py → utils/common.py} +583 -113
  427. sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +86 -19
  428. sglang/srt/{host_shared_memory.py → utils/host_shared_memory.py} +0 -1
  429. sglang/srt/{offloader.py → utils/offloader.py} +4 -4
  430. sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
  431. sglang/srt/utils/profile_merger.py +199 -0
  432. sglang/srt/utils/rpd_utils.py +452 -0
  433. sglang/srt/utils/slow_rank_detector.py +71 -0
  434. sglang/srt/{torch_memory_saver_adapter.py → utils/torch_memory_saver_adapter.py} +5 -7
  435. sglang/srt/warmup.py +8 -4
  436. sglang/srt/weight_sync/utils.py +1 -1
  437. sglang/test/attention/test_flashattn_backend.py +1 -1
  438. sglang/test/attention/test_flashattn_mla_backend.py +0 -1
  439. sglang/test/attention/test_prefix_chunk_info.py +0 -2
  440. sglang/test/attention/test_trtllm_mla_backend.py +221 -53
  441. sglang/test/few_shot_gsm8k_engine.py +2 -4
  442. sglang/test/get_logits_ut.py +57 -0
  443. sglang/test/kit_matched_stop.py +157 -0
  444. sglang/test/longbench_v2/__init__.py +1 -0
  445. sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
  446. sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
  447. sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
  448. sglang/test/run_eval.py +120 -11
  449. sglang/test/runners.py +3 -1
  450. sglang/test/send_one.py +42 -7
  451. sglang/test/simple_eval_common.py +8 -2
  452. sglang/test/simple_eval_gpqa.py +0 -1
  453. sglang/test/simple_eval_humaneval.py +0 -3
  454. sglang/test/simple_eval_longbench_v2.py +344 -0
  455. sglang/test/simple_eval_mmmu_vlm.py +441 -0
  456. sglang/test/test_block_fp8.py +3 -4
  457. sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -1
  458. sglang/test/test_cutlass_moe.py +1 -2
  459. sglang/test/test_cutlass_w4a8_moe.py +10 -20
  460. sglang/test/test_deterministic.py +430 -0
  461. sglang/test/test_deterministic_utils.py +73 -0
  462. sglang/test/test_disaggregation_utils.py +93 -1
  463. sglang/test/test_marlin_moe.py +0 -1
  464. sglang/test/test_programs.py +1 -1
  465. sglang/test/test_utils.py +432 -16
  466. sglang/utils.py +10 -1
  467. sglang/version.py +1 -1
  468. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/METADATA +64 -43
  469. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/RECORD +476 -346
  470. sglang/srt/entrypoints/grpc_request_manager.py +0 -580
  471. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +0 -32
  472. sglang/srt/managers/tp_worker_overlap_thread.py +0 -319
  473. sglang/srt/mem_cache/lora_radix_cache.py +0 -421
  474. sglang/srt/speculative/build_eagle_tree.py +0 -427
  475. sglang/test/test_block_fp8_ep.py +0 -358
  476. /sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/__init__.py +0 -0
  477. /sglang/srt/{remote_instance_weight_loader_utils.py → model_loader/remote_instance_weight_loader_utils.py} +0 -0
  478. /sglang/srt/{aio_rwlock.py → utils/aio_rwlock.py} +0 -0
  479. /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
  480. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/WHEEL +0 -0
  481. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/licenses/LICENSE +0 -0
  482. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/top_level.txt +0 -0
@@ -17,21 +17,22 @@ import logging
17
17
  import math
18
18
  import os
19
19
  from enum import Enum, IntEnum, auto
20
- from typing import List, Optional, Set, Union
20
+ from typing import Any, List, Optional, Set, Union
21
21
 
22
22
  import torch
23
23
  from transformers import PretrainedConfig
24
24
 
25
- from sglang.srt.hf_transformers_utils import (
25
+ from sglang.srt.environ import envs
26
+ from sglang.srt.layers.quantization import QUANTIZATION_METHODS
27
+ from sglang.srt.server_args import ServerArgs
28
+ from sglang.srt.utils import is_hip, retry
29
+ from sglang.srt.utils.hf_transformers_utils import (
26
30
  get_config,
27
31
  get_context_length,
28
32
  get_generation_config,
29
33
  get_hf_text_config,
30
34
  get_sparse_attention_config,
31
35
  )
32
- from sglang.srt.layers.quantization import QUANTIZATION_METHODS
33
- from sglang.srt.server_args import ServerArgs
34
- from sglang.srt.utils import get_bool_env_var, is_hip
35
36
  from sglang.utils import is_in_ci
36
37
 
37
38
  logger = logging.getLogger(__name__)
@@ -48,6 +49,34 @@ class ModelImpl(str, Enum):
48
49
  TRANSFORMERS = "transformers"
49
50
 
50
51
 
52
+ def is_deepseek_nsa(config: PretrainedConfig) -> bool:
53
+ return (
54
+ config.architectures is not None
55
+ and config.architectures[0]
56
+ in [
57
+ "DeepseekV3ForCausalLM",
58
+ "DeepseekV32ForCausalLM",
59
+ "DeepseekV3ForCausalLMNextN",
60
+ ]
61
+ and getattr(config, "index_topk", None) is not None
62
+ )
63
+
64
+
65
+ def get_nsa_index_head_dim(config: PretrainedConfig) -> int:
66
+ assert is_deepseek_nsa(config)
67
+ return config.index_head_dim
68
+
69
+
70
+ def get_nsa_index_topk(config: PretrainedConfig) -> int:
71
+ assert is_deepseek_nsa(config)
72
+ return config.index_topk
73
+
74
+
75
+ def get_nsa_index_n_heads(config: PretrainedConfig) -> int:
76
+ assert is_deepseek_nsa(config)
77
+ return config.index_n_heads
78
+
79
+
51
80
  class ModelConfig:
52
81
  def __init__(
53
82
  self,
@@ -62,37 +91,31 @@ class ModelConfig:
62
91
  quantization: Optional[str] = None,
63
92
  override_config_file: Optional[str] = None,
64
93
  is_draft_model: bool = False,
65
- hybrid_kvcache_ratio: Optional[float] = None,
94
+ hybrid_kvcache_ratio: Optional[
95
+ float
96
+ ] = None, # TODO: remove this, it is not a model config
66
97
  model_impl: Union[str, ModelImpl] = ModelImpl.AUTO,
67
- tp_rank: Optional[int] = None,
68
- remote_instance_weight_loader_seed_instance_ip: Optional[str] = None,
69
- remote_instance_weight_loader_seed_instance_service_port: Optional[int] = None,
70
- remote_instance_weight_loader_send_weights_group_ports: Optional[
71
- List[int]
72
- ] = None,
98
+ sampling_defaults: str = "openai",
99
+ quantize_and_serve: bool = False,
73
100
  ) -> None:
74
101
  # Parse args
75
102
  self.model_path = model_path
76
103
  self.revision = revision
77
104
  self.quantization = quantization
105
+ self.is_draft_model = is_draft_model
78
106
  self.model_impl = model_impl
79
- self.tp_rank = tp_rank
80
- self.remote_instance_weight_loader_seed_instance_ip = (
81
- remote_instance_weight_loader_seed_instance_ip
82
- )
83
- self.remote_instance_weight_loader_seed_instance_service_port = (
84
- remote_instance_weight_loader_seed_instance_service_port
85
- )
86
- self.remote_instance_weight_loader_send_weights_group_ports = (
87
- remote_instance_weight_loader_send_weights_group_ports
88
- )
107
+ self.sampling_defaults = sampling_defaults
108
+ self.quantize_and_serve = quantize_and_serve
89
109
 
90
- self.maybe_pull_model_tokenizer_from_remote()
110
+ # Validate quantize_and_serve configuration
111
+ self._validate_quantize_and_serve_config()
112
+
113
+ # Get hf config
114
+ self._maybe_pull_model_tokenizer_from_remote()
91
115
  self.model_override_args = json.loads(model_override_args)
92
116
  kwargs = {}
93
117
  if override_config_file and override_config_file.strip():
94
118
  kwargs["_configuration_file"] = override_config_file.strip()
95
-
96
119
  self.hf_config = get_config(
97
120
  self.model_path,
98
121
  trust_remote_code=trust_remote_code,
@@ -100,7 +123,7 @@ class ModelConfig:
100
123
  model_override_args=self.model_override_args,
101
124
  **kwargs,
102
125
  )
103
-
126
+ self.hf_text_config = get_hf_text_config(self.hf_config)
104
127
  self.hf_generation_config = get_generation_config(
105
128
  self.model_path,
106
129
  trust_remote_code=trust_remote_code,
@@ -108,7 +131,25 @@ class ModelConfig:
108
131
  **kwargs,
109
132
  )
110
133
 
111
- self.hf_text_config = get_hf_text_config(self.hf_config)
134
+ # Set enable_multimodal
135
+ if enable_multimodal is None:
136
+ mm_disabled_models = [
137
+ "Gemma3ForConditionalGeneration",
138
+ "Llama4ForConditionalGeneration",
139
+ "Step3VLForConditionalGeneration",
140
+ ]
141
+ if self.hf_config.architectures[0] in mm_disabled_models:
142
+ enable_multimodal = False
143
+ logger.info(
144
+ f"Multimodal is disabled for {self.hf_config.model_type}. To enable it, set --enable-multimodal."
145
+ )
146
+ else:
147
+ enable_multimodal = True
148
+
149
+ # Config draft model
150
+ self._config_draft_model()
151
+
152
+ # Check model type
112
153
  self.attention_chunk_size = getattr(
113
154
  self.hf_text_config, "attention_chunk_size", None
114
155
  )
@@ -124,20 +165,72 @@ class ModelConfig:
124
165
  self.hf_config.architectures, self.hf_text_config.num_hidden_layers
125
166
  )
126
167
  )
168
+ self.is_generation = is_generation_model(
169
+ self.hf_config.architectures, is_embedding
170
+ )
171
+ self.is_multimodal = enable_multimodal and is_multimodal_model(
172
+ self.hf_config.architectures
173
+ )
174
+ self.is_multimodal_gen = enable_multimodal and is_multimodal_gen_model(
175
+ self.hf_config.architectures
176
+ )
177
+ self.is_image_gen = enable_multimodal and is_image_gen_model(
178
+ self.hf_config.architectures
179
+ )
180
+ self.is_audio_model = enable_multimodal and is_audio_model(
181
+ self.hf_config.architectures
182
+ )
183
+ self.is_multimodal_chunked_prefill_supported = (
184
+ enable_multimodal
185
+ and is_multimodal_chunked_prefill_supported(self.hf_config.architectures)
186
+ )
187
+ self.is_encoder_decoder = is_encoder_decoder_model(self.hf_config.architectures)
188
+ self.dtype = _get_and_verify_dtype(self.hf_text_config, dtype)
127
189
 
128
- if enable_multimodal is None:
129
- mm_disabled_models = [
130
- "Gemma3ForConditionalGeneration",
131
- "Llama4ForConditionalGeneration",
132
- "Step3VLForConditionalGeneration",
133
- ]
134
- if self.hf_config.architectures[0] in mm_disabled_models:
135
- enable_multimodal = False
136
- logger.info(
137
- f"Multimodal is disabled for {self.hf_config.model_type}. To enable it, set --enable-multimodal."
138
- )
139
- else:
140
- enable_multimodal = True
190
+ # Derive context length and model shapes
191
+ self._derive_context_length(context_length)
192
+ self._derive_model_shapes()
193
+
194
+ # Verify quantization
195
+ self._verify_quantization()
196
+
197
+ # Verify dual-chunk attention config
198
+ self._verify_dual_chunk_attention_config()
199
+
200
+ # Cache attributes
201
+ self.hf_eos_token_id = self._get_hf_eos_token_id()
202
+
203
+ # multimodal
204
+ self.image_token_id = getattr(
205
+ self.hf_config, "image_token_id", None
206
+ ) or getattr(self.hf_config, "image_token_index", None)
207
+
208
+ @staticmethod
209
+ def from_server_args(
210
+ server_args: ServerArgs,
211
+ model_path: str = None,
212
+ model_revision: str = None,
213
+ **kwargs,
214
+ ):
215
+ return ModelConfig(
216
+ model_path=model_path or server_args.model_path,
217
+ trust_remote_code=server_args.trust_remote_code,
218
+ revision=model_revision or server_args.revision,
219
+ context_length=server_args.context_length,
220
+ model_override_args=server_args.json_model_override_args,
221
+ is_embedding=server_args.is_embedding,
222
+ enable_multimodal=server_args.enable_multimodal,
223
+ dtype=server_args.dtype,
224
+ quantization=server_args.quantization,
225
+ hybrid_kvcache_ratio=server_args.hybrid_kvcache_ratio,
226
+ model_impl=server_args.model_impl,
227
+ sampling_defaults=server_args.sampling_defaults,
228
+ quantize_and_serve=server_args.quantize_and_serve,
229
+ **kwargs,
230
+ )
231
+
232
+ def _config_draft_model(self):
233
+ is_draft_model = self.is_draft_model
141
234
 
142
235
  if (
143
236
  is_draft_model
@@ -172,31 +265,10 @@ class ModelConfig:
172
265
  self.hf_config.architectures[0] = "Qwen3NextForCausalLMMTP"
173
266
  self.hf_config.num_nextn_predict_layers = 1
174
267
 
175
- # Check model type
176
- self.is_generation = is_generation_model(
177
- self.hf_config.architectures, is_embedding
178
- )
179
- self.is_multimodal = enable_multimodal and is_multimodal_model(
180
- self.hf_config.architectures
181
- )
182
- self.is_multimodal_gen = enable_multimodal and is_multimodal_gen_model(
183
- self.hf_config.architectures
184
- )
185
- self.is_image_gen = enable_multimodal and is_image_gen_model(
186
- self.hf_config.architectures
187
- )
188
- self.is_audio_model = enable_multimodal and is_audio_model(
189
- self.hf_config.architectures
190
- )
191
- self.is_multimodal_chunked_prefill_supported = (
192
- enable_multimodal
193
- and is_multimodal_chunked_prefill_supported(self.hf_config.architectures)
194
- )
195
- self.is_encoder_decoder = is_encoder_decoder_model(self.hf_config.architectures)
196
- self.dtype = _get_and_verify_dtype(self.hf_text_config, dtype)
197
-
198
- # Derive context length
268
+ def _derive_context_length(self, context_length: int):
269
+ is_draft_model = self.is_draft_model
199
270
  derived_context_len = get_context_length(self.hf_text_config)
271
+
200
272
  if context_length is not None:
201
273
  if context_length > derived_context_len:
202
274
  reason = "Target model's" if is_draft_model else "User-specified"
@@ -205,11 +277,16 @@ class ModelConfig:
205
277
  f"This may lead to incorrect model outputs or CUDA errors. Note that the derived context_length may differ from max_position_embeddings in the model's config."
206
278
  )
207
279
  if (
208
- get_bool_env_var("SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN")
280
+ envs.SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN.get()
209
281
  or is_in_ci() # FIXME: fix this special case
210
282
  ):
211
283
  logger.warning(msg)
212
284
  self.context_len = context_length
285
+ if is_draft_model:
286
+ self.hf_text_config.max_position_embeddings = context_length
287
+ logger.warning(
288
+ f"Overriding the draft model's max_position_embeddings to {context_length}."
289
+ )
213
290
  else:
214
291
  raise ValueError(
215
292
  f"{msg} To allow overriding this maximum, set the env var SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN=1"
@@ -219,6 +296,10 @@ class ModelConfig:
219
296
  else:
220
297
  self.context_len = derived_context_len
221
298
 
299
+ # Transfer context_len to HuggingFace config so models can access it
300
+ self.hf_config.context_len = self.context_len
301
+
302
+ def _derive_model_shapes(self):
222
303
  # Unify the config keys for hf_text_config
223
304
  self.head_dim = getattr(
224
305
  self.hf_text_config,
@@ -229,6 +310,7 @@ class ModelConfig:
229
310
  # FIXME: temporary special judge for MLA architecture
230
311
  if (
231
312
  "DeepseekV2ForCausalLM" in self.hf_config.architectures
313
+ or "DeepseekV32ForCausalLM" in self.hf_config.architectures
232
314
  or "DeepseekV3ForCausalLM" in self.hf_config.architectures
233
315
  or "DeepseekV3ForCausalLMNextN" in self.hf_config.architectures
234
316
  or "LongcatFlashForCausalLM" in self.hf_config.architectures
@@ -241,6 +323,11 @@ class ModelConfig:
241
323
  self.qk_nope_head_dim = self.hf_config.qk_nope_head_dim
242
324
  self.qk_rope_head_dim = self.hf_config.qk_rope_head_dim
243
325
  self.v_head_dim = self.hf_config.v_head_dim
326
+ self.index_head_dim = (
327
+ get_nsa_index_head_dim(self.hf_config)
328
+ if is_deepseek_nsa(self.hf_config)
329
+ else None
330
+ )
244
331
 
245
332
  # Handle rope scaling with yarn
246
333
  self.scaling = 1 / math.sqrt(self.qk_nope_head_dim + self.qk_rope_head_dim)
@@ -313,45 +400,6 @@ class ModelConfig:
313
400
  )
314
401
  self.vocab_size = self.hf_text_config.vocab_size
315
402
 
316
- # Verify quantization
317
- self._verify_quantization()
318
-
319
- # Verify dual-chunk attention config
320
- self._verify_dual_chunk_attention_config()
321
-
322
- # Cache attributes
323
- self.hf_eos_token_id = self.get_hf_eos_token_id()
324
-
325
- # multimodal
326
- self.image_token_id = getattr(
327
- self.hf_config, "image_token_id", None
328
- ) or getattr(self.hf_config, "image_token_index", None)
329
-
330
- @staticmethod
331
- def from_server_args(
332
- server_args: ServerArgs,
333
- model_path: str = None,
334
- model_revision: str = None,
335
- **kwargs,
336
- ):
337
- return ModelConfig(
338
- model_path=model_path or server_args.model_path,
339
- trust_remote_code=server_args.trust_remote_code,
340
- revision=model_revision or server_args.revision,
341
- context_length=server_args.context_length,
342
- model_override_args=server_args.json_model_override_args,
343
- is_embedding=server_args.is_embedding,
344
- enable_multimodal=server_args.enable_multimodal,
345
- dtype=server_args.dtype,
346
- quantization=server_args.quantization,
347
- hybrid_kvcache_ratio=server_args.hybrid_kvcache_ratio,
348
- model_impl=server_args.model_impl,
349
- remote_instance_weight_loader_seed_instance_ip=server_args.remote_instance_weight_loader_seed_instance_ip,
350
- remote_instance_weight_loader_seed_instance_service_port=server_args.remote_instance_weight_loader_seed_instance_service_port,
351
- remote_instance_weight_loader_send_weights_group_ports=server_args.remote_instance_weight_loader_send_weights_group_ports,
352
- **kwargs,
353
- )
354
-
355
403
  def get_total_num_attention_heads(self) -> int:
356
404
  return self.num_attention_heads
357
405
 
@@ -444,36 +492,114 @@ class ModelConfig:
444
492
  # example: https://huggingface.co/nvidia/Llama-3.1-8B-Instruct-FP8/tree/main
445
493
  # example: https://huggingface.co/Barrrrry/DeepSeek-R1-W4AFP8/tree/main
446
494
  is_local = os.path.exists(self.model_path)
447
- modelopt_quant_config = {"quant_method": "modelopt"}
448
495
  if not is_local:
449
496
  import huggingface_hub
450
497
 
451
498
  try:
452
- from huggingface_hub import HfApi
499
+ from huggingface_hub import HfApi, hf_hub_download
453
500
 
454
501
  hf_api = HfApi()
455
- if hf_api.file_exists(self.model_path, "hf_quant_config.json"):
456
- quant_cfg = modelopt_quant_config
502
+ # Retry HF API call up to 3 times
503
+ file_exists = retry(
504
+ lambda: hf_api.file_exists(
505
+ self.model_path, "hf_quant_config.json"
506
+ ),
507
+ max_retry=2,
508
+ initial_delay=1.0,
509
+ max_delay=5.0,
510
+ )
511
+ if file_exists:
512
+ # Download and parse the quantization config for remote models
513
+ quant_config_file = hf_hub_download(
514
+ repo_id=self.model_path,
515
+ filename="hf_quant_config.json",
516
+ revision=self.revision,
517
+ )
518
+ with open(quant_config_file) as f:
519
+ quant_config_dict = json.load(f)
520
+ quant_cfg = self._parse_modelopt_quant_config(quant_config_dict)
457
521
  except huggingface_hub.errors.OfflineModeIsEnabled:
458
522
  logger.warning(
459
523
  "Offline mode is enabled, skipping hf_quant_config.json check"
460
524
  )
461
- pass
462
-
525
+ except Exception as e:
526
+ logger.warning(
527
+ f"Failed to check hf_quant_config.json: {self.model_path} {e}"
528
+ )
463
529
  elif os.path.exists(os.path.join(self.model_path, "hf_quant_config.json")):
464
530
  quant_config_file = os.path.join(
465
531
  self.model_path, "hf_quant_config.json"
466
532
  )
467
533
  with open(quant_config_file) as f:
468
534
  quant_config_dict = json.load(f)
469
- json_quant_configs = quant_config_dict["quantization"]
470
- quant_algo = json_quant_configs.get("quant_algo", None)
471
- if quant_algo == "MIXED_PRECISION":
472
- quant_cfg = {"quant_method": "w4afp8"}
473
- else:
474
- quant_cfg = modelopt_quant_config
535
+ quant_cfg = self._parse_modelopt_quant_config(quant_config_dict)
475
536
  return quant_cfg
476
537
 
538
+ def _parse_modelopt_quant_config(self, quant_config_dict: dict) -> dict:
539
+ """Parse ModelOpt quantization config and return the appropriate quant_method."""
540
+ json_quant_configs = quant_config_dict["quantization"]
541
+ quant_algo = json_quant_configs.get("quant_algo", None)
542
+
543
+ if quant_algo == "MIXED_PRECISION":
544
+ return {"quant_method": "w4afp8"}
545
+ elif quant_algo and ("FP4" in quant_algo or "NVFP4" in quant_algo):
546
+ return {"quant_method": "modelopt_fp4"}
547
+ elif quant_algo and "FP8" in quant_algo:
548
+ return {"quant_method": "modelopt_fp8"}
549
+ else:
550
+ # Default to FP8 for backward compatibility
551
+ return {"quant_method": "modelopt_fp8"}
552
+
553
+ def _is_already_quantized(self) -> bool:
554
+ """Check if the model is already quantized based on config files."""
555
+ # Check for HuggingFace quantization config
556
+ from sglang.srt.utils import has_hf_quant_config
557
+
558
+ return has_hf_quant_config(self.model_path)
559
+
560
+ def _get_modelopt_quant_type(self) -> str:
561
+ """Extract ModelOpt quantization type from unified quantization flag."""
562
+ if self.quantization == "modelopt_fp8":
563
+ return "fp8"
564
+ elif self.quantization == "modelopt_fp4":
565
+ return "nvfp4"
566
+ elif self.quantization == "modelopt":
567
+ # Auto-detect from model config
568
+ quant_cfg = self._parse_quant_hf_config()
569
+ if quant_cfg:
570
+ quant_method = quant_cfg.get("quant_method", "").lower()
571
+ if "fp4" in quant_method:
572
+ return "fp4"
573
+ elif "fp8" in quant_method:
574
+ return "fp8"
575
+ # Default to fp8 if can't detect
576
+ return "fp8"
577
+ else:
578
+ return "fp8" # Default fallback
579
+
580
+ def _validate_quantize_and_serve_config(self):
581
+ """Validate quantize_and_serve configuration."""
582
+ if not self.quantize_and_serve:
583
+ return
584
+
585
+ # Check if ModelOpt quantization is specified
586
+ modelopt_quantization_specified = self.quantization in [
587
+ "modelopt",
588
+ "modelopt_fp8",
589
+ "modelopt_fp4",
590
+ ]
591
+
592
+ if not modelopt_quantization_specified:
593
+ raise ValueError("quantize_and_serve requires ModelOpt quantization")
594
+
595
+ # quantize_and_serve is disabled due to compatibility issues
596
+ raise NotImplementedError(
597
+ "quantize_and_serve functionality is currently disabled due to compatibility issues. "
598
+ "Please use the separate quantize-then-deploy workflow instead. "
599
+ "Step 1: Quantize and export model. "
600
+ "Step 2: Deploy the exported model."
601
+ )
602
+
477
603
  # adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/config.py
478
604
  def _verify_quantization(self) -> None:
479
605
  supported_quantization = [*QUANTIZATION_METHODS]
@@ -492,7 +618,8 @@ class ModelConfig:
492
618
  optimized_quantization_methods = [
493
619
  "fp8",
494
620
  "marlin",
495
- "modelopt",
621
+ "modelopt_fp8",
622
+ "modelopt_fp4",
496
623
  "gptq_marlin_24",
497
624
  "gptq_marlin",
498
625
  "awq_marlin",
@@ -586,7 +713,7 @@ class ModelConfig:
586
713
  "sparse_attention_enabled"
587
714
  ] = True
588
715
 
589
- def get_hf_eos_token_id(self) -> Optional[Set[int]]:
716
+ def _get_hf_eos_token_id(self) -> Optional[Set[int]]:
590
717
  eos_ids = getattr(self.hf_config, "eos_token_id", None)
591
718
  if eos_ids is not None:
592
719
  # it can be either int or list of int
@@ -606,7 +733,39 @@ class ModelConfig:
606
733
  eos_ids = eos_ids | generation_eos_ids
607
734
  return eos_ids
608
735
 
609
- def maybe_pull_model_tokenizer_from_remote(self) -> None:
736
+ def get_default_sampling_params(self) -> dict[str, Any]:
737
+ """
738
+ Get default sampling parameters from the model's generation config.
739
+
740
+ This method returns non-default sampling parameters from the model's
741
+ generation_config.json when sampling_defaults is set to "model".
742
+
743
+ Returns:
744
+ A dictionary containing the non-default sampling parameters.
745
+ """
746
+ if self.sampling_defaults != "model":
747
+ return {}
748
+
749
+ if self.hf_generation_config is None:
750
+ return {}
751
+
752
+ config = self.hf_generation_config.to_dict()
753
+
754
+ available_params = [
755
+ "repetition_penalty",
756
+ "temperature",
757
+ "top_k",
758
+ "top_p",
759
+ "min_p",
760
+ ]
761
+
762
+ default_sampling_params = {
763
+ p: config.get(p) for p in available_params if config.get(p) is not None
764
+ }
765
+
766
+ return default_sampling_params
767
+
768
+ def _maybe_pull_model_tokenizer_from_remote(self) -> None:
610
769
  """
611
770
  Pull the model config files to a temporary
612
771
  directory in case of remote.
@@ -749,13 +908,20 @@ multimodal_model_archs = [
749
908
  "Qwen2AudioForConditionalGeneration",
750
909
  "Qwen2VLForConditionalGeneration",
751
910
  "Qwen2_5_VLForConditionalGeneration",
911
+ "Qwen3VLForConditionalGeneration",
912
+ "Qwen3VLMoeForConditionalGeneration",
913
+ "Qwen3OmniMoeForConditionalGeneration",
752
914
  "KimiVLForConditionalGeneration",
753
915
  "InternVLChatModel",
754
916
  "InternS1ForConditionalGeneration",
755
917
  "Phi4MMForCausalLM",
756
918
  "VILAForConditionalGeneration",
757
919
  "Step3VLForConditionalGeneration",
920
+ "POINTSV15ChatModel",
758
921
  "DotsVLMForCausalLM",
922
+ "DotsOCRForCausalLM",
923
+ "Sarashina2VisionForCausalLM",
924
+ "DeepseekOCRForCausalLM",
759
925
  ]
760
926
 
761
927
 
@@ -0,0 +1,30 @@
1
+ # Configuration for NVIDIA ModelOpt quantization integration
2
+ from dataclasses import dataclass
3
+ from typing import Optional
4
+
5
+
6
+ @dataclass
7
+ class ModelOptConfig:
8
+ """Configuration for NVIDIA ModelOpt quantization operations.
9
+
10
+ This configuration class holds parameters for ModelOpt quantization,
11
+ checkpoint management, and model export operations.
12
+
13
+ Args:
14
+ quant: Quantization method/type (e.g., "fp8", "fp4")
15
+ checkpoint_restore_path: Path to restore ModelOpt checkpoint from
16
+ checkpoint_save_path: Path to save ModelOpt checkpoint to
17
+ export_path: Path to export quantized model in HuggingFace format
18
+ quantize_and_serve: Whether to quantize and serve in one step
19
+ """
20
+
21
+ quant: Optional[str] = None
22
+ checkpoint_restore_path: Optional[str] = None
23
+ checkpoint_save_path: Optional[str] = None
24
+ export_path: Optional[str] = None
25
+ quantize_and_serve: bool = False
26
+
27
+ def __post_init__(self):
28
+ """Validate configuration after initialization."""
29
+ # Add any validation logic if needed
30
+ pass