sglang 0.5.2rc2__py3-none-any.whl → 0.5.3.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (396)
  1. sglang/bench_one_batch.py +7 -11
  2. sglang/bench_one_batch_server.py +330 -31
  3. sglang/bench_serving.py +474 -142
  4. sglang/compile_deep_gemm.py +3 -0
  5. sglang/global_config.py +2 -2
  6. sglang/lang/backend/runtime_endpoint.py +1 -1
  7. sglang/profiler.py +2 -2
  8. sglang/srt/batch_invariant_ops/__init__.py +27 -0
  9. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +549 -0
  10. sglang/srt/configs/__init__.py +10 -0
  11. sglang/srt/configs/device_config.py +3 -1
  12. sglang/srt/configs/dots_ocr.py +64 -0
  13. sglang/srt/configs/dots_vlm.py +139 -0
  14. sglang/srt/configs/falcon_h1.py +314 -0
  15. sglang/srt/configs/load_config.py +9 -0
  16. sglang/srt/configs/mamba_utils.py +117 -0
  17. sglang/srt/configs/model_config.py +228 -92
  18. sglang/srt/configs/nemotron_h.py +286 -0
  19. sglang/srt/configs/qwen3_next.py +294 -0
  20. sglang/srt/configs/qwen3_vl.py +586 -0
  21. sglang/srt/connector/__init__.py +8 -1
  22. sglang/srt/connector/remote_instance.py +82 -0
  23. sglang/srt/constrained/base_grammar_backend.py +49 -12
  24. sglang/srt/constrained/llguidance_backend.py +0 -1
  25. sglang/srt/constrained/outlines_backend.py +0 -1
  26. sglang/srt/constrained/outlines_jump_forward.py +1 -1
  27. sglang/srt/constrained/xgrammar_backend.py +30 -9
  28. sglang/srt/custom_op.py +11 -1
  29. sglang/srt/debug_utils/dump_comparator.py +81 -44
  30. sglang/srt/debug_utils/dump_loader.py +97 -0
  31. sglang/srt/debug_utils/dumper.py +21 -6
  32. sglang/srt/debug_utils/text_comparator.py +73 -11
  33. sglang/srt/disaggregation/ascend/conn.py +2 -2
  34. sglang/srt/disaggregation/ascend/transfer_engine.py +47 -9
  35. sglang/srt/disaggregation/base/conn.py +1 -1
  36. sglang/srt/disaggregation/common/conn.py +279 -108
  37. sglang/srt/disaggregation/decode.py +78 -37
  38. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
  39. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +29 -17
  40. sglang/srt/disaggregation/fake/conn.py +1 -1
  41. sglang/srt/disaggregation/mini_lb.py +6 -445
  42. sglang/srt/disaggregation/mooncake/conn.py +55 -537
  43. sglang/srt/disaggregation/nixl/conn.py +373 -68
  44. sglang/srt/disaggregation/prefill.py +53 -49
  45. sglang/srt/disaggregation/utils.py +40 -54
  46. sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
  47. sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
  48. sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
  49. sglang/srt/distributed/parallel_state.py +156 -80
  50. sglang/srt/entrypoints/engine.py +59 -18
  51. sglang/srt/entrypoints/grpc_request_manager.py +842 -0
  52. sglang/srt/entrypoints/grpc_server.py +950 -0
  53. sglang/srt/entrypoints/http_server.py +179 -60
  54. sglang/srt/entrypoints/openai/protocol.py +265 -29
  55. sglang/srt/entrypoints/openai/serving_base.py +65 -3
  56. sglang/srt/entrypoints/openai/serving_chat.py +213 -122
  57. sglang/srt/entrypoints/openai/serving_completions.py +14 -3
  58. sglang/srt/entrypoints/openai/serving_embedding.py +9 -3
  59. sglang/srt/entrypoints/openai/serving_rerank.py +3 -1
  60. sglang/srt/entrypoints/openai/serving_responses.py +48 -3
  61. sglang/srt/entrypoints/openai/serving_score.py +1 -0
  62. sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
  63. sglang/srt/environ.py +289 -0
  64. sglang/srt/eplb/eplb_manager.py +2 -2
  65. sglang/srt/eplb/expert_distribution.py +26 -13
  66. sglang/srt/eplb/expert_location.py +38 -8
  67. sglang/srt/eplb/expert_location_updater.py +1 -1
  68. sglang/srt/function_call/base_format_detector.py +3 -6
  69. sglang/srt/function_call/ebnf_composer.py +11 -9
  70. sglang/srt/function_call/function_call_parser.py +17 -8
  71. sglang/srt/function_call/glm4_moe_detector.py +4 -4
  72. sglang/srt/function_call/gpt_oss_detector.py +23 -0
  73. sglang/srt/function_call/json_array_parser.py +63 -0
  74. sglang/srt/function_call/kimik2_detector.py +17 -4
  75. sglang/srt/function_call/qwen3_coder_detector.py +1 -1
  76. sglang/srt/function_call/utils.py +96 -5
  77. sglang/srt/grpc/__init__.py +1 -0
  78. sglang/srt/grpc/compile_proto.py +245 -0
  79. sglang/srt/grpc/sglang_scheduler_pb2.py +119 -0
  80. sglang/srt/grpc/sglang_scheduler_pb2.pyi +492 -0
  81. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +327 -0
  82. sglang/srt/layers/activation.py +143 -9
  83. sglang/srt/layers/attention/aiter_backend.py +14 -15
  84. sglang/srt/layers/attention/ascend_backend.py +115 -9
  85. sglang/srt/layers/attention/attention_registry.py +215 -0
  86. sglang/srt/layers/attention/base_attn_backend.py +12 -3
  87. sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
  88. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
  89. sglang/srt/layers/attention/fla/chunk.py +242 -0
  90. sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
  91. sglang/srt/layers/attention/fla/chunk_o.py +178 -0
  92. sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
  93. sglang/srt/layers/attention/fla/cumsum.py +300 -0
  94. sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
  95. sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
  96. sglang/srt/layers/attention/fla/index.py +37 -0
  97. sglang/srt/layers/attention/fla/l2norm.py +150 -0
  98. sglang/srt/layers/attention/fla/layernorm_gated.py +343 -0
  99. sglang/srt/layers/attention/fla/op.py +66 -0
  100. sglang/srt/layers/attention/fla/solve_tril.py +465 -0
  101. sglang/srt/layers/attention/fla/utils.py +331 -0
  102. sglang/srt/layers/attention/fla/wy_fast.py +158 -0
  103. sglang/srt/layers/attention/flashattention_backend.py +40 -8
  104. sglang/srt/layers/attention/flashinfer_backend.py +341 -204
  105. sglang/srt/layers/attention/flashinfer_mla_backend.py +28 -28
  106. sglang/srt/layers/attention/flashmla_backend.py +7 -5
  107. sglang/srt/layers/attention/hybrid_attn_backend.py +68 -53
  108. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +708 -0
  109. sglang/srt/layers/attention/intel_amx_backend.py +3 -0
  110. sglang/srt/layers/attention/mamba/causal_conv1d.py +129 -0
  111. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +974 -0
  112. sglang/srt/layers/attention/mamba/mamba.py +577 -0
  113. sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
  114. sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
  115. sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
  116. sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
  117. sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
  118. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +214 -0
  119. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +562 -0
  120. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +646 -0
  121. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +262 -0
  122. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +264 -0
  123. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
  124. sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
  125. sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
  126. sglang/srt/layers/attention/nsa/nsa_indexer.py +761 -0
  127. sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
  128. sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
  129. sglang/srt/layers/attention/nsa/transform_index.py +144 -0
  130. sglang/srt/layers/attention/nsa/utils.py +24 -0
  131. sglang/srt/layers/attention/nsa_backend.py +887 -0
  132. sglang/srt/layers/attention/tbo_backend.py +6 -6
  133. sglang/srt/layers/attention/torch_flex_backend.py +325 -0
  134. sglang/srt/layers/attention/torch_native_backend.py +12 -6
  135. sglang/srt/layers/attention/triton_backend.py +57 -7
  136. sglang/srt/layers/attention/trtllm_mha_backend.py +5 -7
  137. sglang/srt/layers/attention/trtllm_mla_backend.py +276 -39
  138. sglang/srt/layers/attention/vision.py +58 -0
  139. sglang/srt/layers/attention/wave_backend.py +4 -4
  140. sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
  141. sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
  142. sglang/srt/layers/communicator.py +8 -0
  143. sglang/srt/layers/dp_attention.py +41 -2
  144. sglang/srt/layers/elementwise.py +3 -1
  145. sglang/srt/layers/layernorm.py +34 -15
  146. sglang/srt/layers/linear.py +55 -7
  147. sglang/srt/layers/logits_processor.py +180 -18
  148. sglang/srt/layers/modelopt_utils.py +11 -0
  149. sglang/srt/layers/moe/__init__.py +2 -1
  150. sglang/srt/layers/moe/cutlass_w4a8_moe.py +21 -24
  151. sglang/srt/layers/moe/ep_moe/kernels.py +33 -454
  152. sglang/srt/layers/moe/ep_moe/layer.py +248 -333
  153. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +183 -0
  154. sglang/srt/layers/moe/fused_moe_native.py +5 -3
  155. sglang/srt/layers/moe/fused_moe_triton/configs/{triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json } +35 -35
  156. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +146 -0
  157. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  158. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +146 -0
  159. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  160. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
  161. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
  162. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  163. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
  164. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
  165. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
  166. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  167. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
  168. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
  169. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +7 -3
  170. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +23 -20
  171. sglang/srt/layers/moe/fused_moe_triton/layer.py +68 -72
  172. sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
  173. sglang/srt/layers/moe/moe_runner/base.py +274 -1
  174. sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
  175. sglang/srt/layers/moe/moe_runner/runner.py +83 -0
  176. sglang/srt/layers/moe/moe_runner/triton.py +448 -0
  177. sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
  178. sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
  179. sglang/srt/layers/moe/token_dispatcher/deepep.py +118 -56
  180. sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
  181. sglang/srt/layers/moe/topk.py +30 -9
  182. sglang/srt/layers/moe/utils.py +29 -7
  183. sglang/srt/layers/parameter.py +23 -6
  184. sglang/srt/layers/quantization/__init__.py +1 -1
  185. sglang/srt/layers/quantization/awq.py +19 -7
  186. sglang/srt/layers/quantization/base_config.py +11 -6
  187. sglang/srt/layers/quantization/blockwise_int8.py +38 -27
  188. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
  189. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
  190. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
  191. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
  192. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
  193. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -10
  194. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
  195. sglang/srt/layers/quantization/fp8.py +155 -60
  196. sglang/srt/layers/quantization/fp8_utils.py +51 -32
  197. sglang/srt/layers/quantization/gptq.py +25 -17
  198. sglang/srt/layers/quantization/modelopt_quant.py +191 -56
  199. sglang/srt/layers/quantization/moe_wna16.py +21 -18
  200. sglang/srt/layers/quantization/mxfp4.py +74 -42
  201. sglang/srt/layers/quantization/quark/quark.py +3 -1
  202. sglang/srt/layers/quantization/quark/quark_moe.py +48 -30
  203. sglang/srt/layers/quantization/unquant.py +135 -47
  204. sglang/srt/layers/quantization/w4afp8.py +28 -33
  205. sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
  206. sglang/srt/layers/quantization/w8a8_int8.py +91 -41
  207. sglang/srt/layers/rotary_embedding.py +78 -31
  208. sglang/srt/layers/sampler.py +213 -21
  209. sglang/srt/layers/utils.py +23 -0
  210. sglang/srt/lora/backend/base_backend.py +50 -8
  211. sglang/srt/lora/backend/chunked_backend.py +348 -0
  212. sglang/srt/lora/backend/triton_backend.py +99 -5
  213. sglang/srt/lora/layers.py +32 -0
  214. sglang/srt/lora/lora.py +8 -3
  215. sglang/srt/lora/lora_manager.py +44 -118
  216. sglang/srt/lora/mem_pool.py +25 -11
  217. sglang/srt/lora/triton_ops/__init__.py +4 -0
  218. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
  219. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +174 -0
  220. sglang/srt/lora/utils.py +22 -11
  221. sglang/srt/managers/async_dynamic_batch_tokenizer.py +170 -0
  222. sglang/srt/managers/cache_controller.py +199 -301
  223. sglang/srt/managers/data_parallel_controller.py +115 -80
  224. sglang/srt/managers/detokenizer_manager.py +19 -15
  225. sglang/srt/managers/disagg_service.py +46 -0
  226. sglang/srt/managers/io_struct.py +340 -109
  227. sglang/srt/managers/mm_utils.py +44 -6
  228. sglang/srt/managers/multi_tokenizer_mixin.py +357 -407
  229. sglang/srt/managers/multimodal_processor.py +1 -2
  230. sglang/srt/managers/overlap_utils.py +55 -0
  231. sglang/srt/managers/schedule_batch.py +343 -212
  232. sglang/srt/managers/schedule_policy.py +145 -18
  233. sglang/srt/managers/scheduler.py +653 -273
  234. sglang/srt/managers/scheduler_input_blocker.py +1 -1
  235. sglang/srt/managers/scheduler_metrics_mixin.py +99 -126
  236. sglang/srt/managers/scheduler_output_processor_mixin.py +255 -108
  237. sglang/srt/managers/scheduler_profiler_mixin.py +6 -6
  238. sglang/srt/managers/scheduler_update_weights_mixin.py +7 -0
  239. sglang/srt/managers/tokenizer_communicator_mixin.py +675 -0
  240. sglang/srt/managers/tokenizer_manager.py +579 -674
  241. sglang/srt/managers/tp_worker.py +96 -26
  242. sglang/srt/managers/utils.py +1 -45
  243. sglang/srt/mem_cache/allocator.py +21 -22
  244. sglang/srt/mem_cache/allocator_ascend.py +41 -27
  245. sglang/srt/mem_cache/base_prefix_cache.py +1 -1
  246. sglang/srt/mem_cache/chunk_cache.py +9 -2
  247. sglang/srt/mem_cache/evict_policy.py +23 -0
  248. sglang/srt/mem_cache/hicache_storage.py +43 -24
  249. sglang/srt/mem_cache/hiradix_cache.py +222 -75
  250. sglang/srt/mem_cache/memory_pool.py +651 -80
  251. sglang/srt/mem_cache/memory_pool_host.py +239 -228
  252. sglang/srt/mem_cache/radix_cache.py +227 -73
  253. sglang/srt/mem_cache/radix_cache_cpp.py +11 -8
  254. sglang/srt/mem_cache/storage/__init__.py +10 -0
  255. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +151 -0
  256. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +109 -0
  257. sglang/srt/mem_cache/storage/backend_factory.py +223 -0
  258. sglang/srt/mem_cache/storage/eic/eic_storage.py +778 -0
  259. sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
  260. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
  261. sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
  262. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +259 -62
  263. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +284 -0
  264. sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
  265. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +166 -17
  266. sglang/srt/mem_cache/swa_radix_cache.py +93 -48
  267. sglang/srt/metrics/collector.py +511 -132
  268. sglang/srt/metrics/func_timer.py +2 -7
  269. sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
  270. sglang/srt/metrics/utils.py +8 -1
  271. sglang/srt/model_executor/cpu_graph_runner.py +640 -0
  272. sglang/srt/model_executor/cuda_graph_runner.py +52 -37
  273. sglang/srt/model_executor/forward_batch_info.py +74 -46
  274. sglang/srt/model_executor/model_runner.py +455 -176
  275. sglang/srt/model_executor/npu_graph_runner.py +12 -5
  276. sglang/srt/model_loader/__init__.py +10 -4
  277. sglang/srt/model_loader/loader.py +319 -10
  278. sglang/srt/model_loader/remote_instance_weight_loader_utils.py +69 -0
  279. sglang/srt/model_loader/weight_utils.py +161 -3
  280. sglang/srt/models/apertus.py +686 -0
  281. sglang/srt/models/bailing_moe.py +820 -217
  282. sglang/srt/models/bailing_moe_nextn.py +168 -0
  283. sglang/srt/models/deepseek_nextn.py +6 -1
  284. sglang/srt/models/deepseek_v2.py +607 -130
  285. sglang/srt/models/dots_ocr.py +173 -0
  286. sglang/srt/models/dots_vlm.py +174 -0
  287. sglang/srt/models/dots_vlm_vit.py +337 -0
  288. sglang/srt/models/ernie4.py +1 -1
  289. sglang/srt/models/falcon_h1.py +578 -0
  290. sglang/srt/models/gemma3_causal.py +0 -2
  291. sglang/srt/models/gemma3_mm.py +17 -1
  292. sglang/srt/models/gemma3n_mm.py +2 -2
  293. sglang/srt/models/glm4_moe.py +4 -4
  294. sglang/srt/models/glm4_moe_nextn.py +2 -2
  295. sglang/srt/models/glm4v.py +5 -3
  296. sglang/srt/models/glm4v_moe.py +4 -1
  297. sglang/srt/models/gpt_oss.py +8 -31
  298. sglang/srt/models/grok.py +5 -13
  299. sglang/srt/models/kimi_vl_moonvit.py +2 -2
  300. sglang/srt/models/llama.py +4 -0
  301. sglang/srt/models/llama4.py +9 -0
  302. sglang/srt/models/llama_eagle3.py +13 -0
  303. sglang/srt/models/longcat_flash.py +3 -3
  304. sglang/srt/models/longcat_flash_nextn.py +1 -1
  305. sglang/srt/models/mixtral.py +1 -3
  306. sglang/srt/models/mllama4.py +50 -4
  307. sglang/srt/models/nemotron_h.py +514 -0
  308. sglang/srt/models/opt.py +637 -0
  309. sglang/srt/models/qwen2_5_vl.py +29 -5
  310. sglang/srt/models/qwen2_audio.py +1 -1
  311. sglang/srt/models/qwen2_moe.py +120 -13
  312. sglang/srt/models/qwen2_vl.py +1 -1
  313. sglang/srt/models/qwen3.py +18 -3
  314. sglang/srt/models/qwen3_moe.py +32 -4
  315. sglang/srt/models/qwen3_next.py +1069 -0
  316. sglang/srt/models/qwen3_next_mtp.py +112 -0
  317. sglang/srt/models/qwen3_vl.py +787 -0
  318. sglang/srt/models/qwen3_vl_moe.py +471 -0
  319. sglang/srt/models/registry.py +15 -3
  320. sglang/srt/models/sarashina2_vision.py +269 -0
  321. sglang/srt/models/solar.py +505 -0
  322. sglang/srt/models/starcoder2.py +357 -0
  323. sglang/srt/models/step3_vl.py +1 -1
  324. sglang/srt/models/torch_native_llama.py +9 -2
  325. sglang/srt/models/utils.py +55 -0
  326. sglang/srt/multimodal/processors/base_processor.py +15 -7
  327. sglang/srt/multimodal/processors/dots_vlm.py +98 -0
  328. sglang/srt/multimodal/processors/glm4v.py +9 -9
  329. sglang/srt/multimodal/processors/internvl.py +153 -129
  330. sglang/srt/multimodal/processors/qwen_vl.py +23 -6
  331. sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
  332. sglang/srt/offloader.py +27 -3
  333. sglang/srt/parser/jinja_template_utils.py +6 -0
  334. sglang/srt/sampling/sampling_batch_info.py +49 -26
  335. sglang/srt/sampling/sampling_params.py +7 -0
  336. sglang/srt/server_args.py +1051 -285
  337. sglang/srt/server_args_config_parser.py +146 -0
  338. sglang/srt/single_batch_overlap.py +151 -0
  339. sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
  340. sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
  341. sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
  342. sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
  343. sglang/srt/speculative/cpp_ngram/param.h +125 -0
  344. sglang/srt/speculative/cpp_ngram/queue.h +71 -0
  345. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +7 -1
  346. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +13 -2
  347. sglang/srt/speculative/{eagle_utils.py → eagle_info.py} +207 -757
  348. sglang/srt/speculative/eagle_worker.py +98 -29
  349. sglang/srt/speculative/ngram_info.py +428 -0
  350. sglang/srt/speculative/ngram_worker.py +246 -0
  351. sglang/srt/speculative/spec_info.py +52 -0
  352. sglang/srt/speculative/spec_utils.py +605 -0
  353. sglang/srt/speculative/standalone_worker.py +109 -0
  354. sglang/srt/torch_memory_saver_adapter.py +5 -7
  355. sglang/srt/tracing/trace.py +578 -0
  356. sglang/srt/two_batch_overlap.py +9 -5
  357. sglang/srt/utils/__init__.py +2 -0
  358. sglang/srt/{utils.py → utils/common.py} +451 -77
  359. sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +55 -5
  360. sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
  361. sglang/srt/utils/rpd_utils.py +452 -0
  362. sglang/srt/utils/slow_rank_detector.py +71 -0
  363. sglang/srt/warmup.py +8 -4
  364. sglang/srt/weight_sync/utils.py +2 -2
  365. sglang/test/attention/test_trtllm_mla_backend.py +169 -5
  366. sglang/test/get_logits_ut.py +57 -0
  367. sglang/test/longbench_v2/__init__.py +1 -0
  368. sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
  369. sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
  370. sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
  371. sglang/test/run_eval.py +119 -11
  372. sglang/test/runners.py +5 -1
  373. sglang/test/simple_eval_common.py +5 -2
  374. sglang/test/simple_eval_longbench_v2.py +332 -0
  375. sglang/test/simple_eval_mmmu_vlm.py +441 -0
  376. sglang/test/test_block_fp8.py +2 -2
  377. sglang/test/test_cutlass_moe.py +24 -6
  378. sglang/test/test_cutlass_w4a8_moe.py +9 -19
  379. sglang/test/test_deterministic.py +313 -0
  380. sglang/test/test_deterministic_utils.py +81 -0
  381. sglang/test/test_disaggregation_utils.py +140 -0
  382. sglang/test/test_fp4_moe.py +370 -1
  383. sglang/test/test_programs.py +1 -1
  384. sglang/test/test_utils.py +407 -8
  385. sglang/utils.py +21 -1
  386. sglang/version.py +1 -1
  387. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/METADATA +69 -124
  388. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/RECORD +392 -251
  389. sglang/srt/disaggregation/launch_lb.py +0 -118
  390. sglang/srt/managers/tp_worker_overlap_thread.py +0 -296
  391. sglang/srt/mem_cache/lora_radix_cache.py +0 -421
  392. sglang/test/test_block_fp8_ep.py +0 -358
  393. /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
  394. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/WHEEL +0 -0
  395. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/licenses/LICENSE +0 -0
  396. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/top_level.txt +0 -0
@@ -17,21 +17,22 @@ import logging
17
17
  import math
18
18
  import os
19
19
  from enum import Enum, IntEnum, auto
20
- from typing import List, Optional, Set, Union
20
+ from typing import Any, Dict, List, Optional, Set, Union
21
21
 
22
22
  import torch
23
23
  from transformers import PretrainedConfig
24
24
 
25
- from sglang.srt.hf_transformers_utils import (
25
+ from sglang.srt.environ import envs
26
+ from sglang.srt.layers.quantization import QUANTIZATION_METHODS
27
+ from sglang.srt.server_args import ServerArgs
28
+ from sglang.srt.utils import is_hip, retry
29
+ from sglang.srt.utils.hf_transformers_utils import (
26
30
  get_config,
27
31
  get_context_length,
28
32
  get_generation_config,
29
33
  get_hf_text_config,
30
34
  get_sparse_attention_config,
31
35
  )
32
- from sglang.srt.layers.quantization import QUANTIZATION_METHODS
33
- from sglang.srt.server_args import ServerArgs
34
- from sglang.srt.utils import get_bool_env_var, is_hip
35
36
  from sglang.utils import is_in_ci
36
37
 
37
38
  logger = logging.getLogger(__name__)
@@ -48,6 +49,30 @@ class ModelImpl(str, Enum):
48
49
  TRANSFORMERS = "transformers"
49
50
 
50
51
 
52
+ def is_deepseek_nsa(config: PretrainedConfig) -> bool:
53
+ return (
54
+ config.architectures is not None
55
+ and config.architectures[0]
56
+ in ["DeepseekV3ForCausalLM", "DeepseekV32ForCausalLM"]
57
+ and getattr(config, "index_topk", None) is not None
58
+ )
59
+
60
+
61
+ def get_nsa_index_head_dim(config: PretrainedConfig) -> int:
62
+ assert is_deepseek_nsa(config)
63
+ return config.index_head_dim
64
+
65
+
66
+ def get_nsa_index_topk(config: PretrainedConfig) -> int:
67
+ assert is_deepseek_nsa(config)
68
+ return config.index_topk
69
+
70
+
71
+ def get_nsa_index_n_heads(config: PretrainedConfig) -> int:
72
+ assert is_deepseek_nsa(config)
73
+ return config.index_n_heads
74
+
75
+
51
76
  class ModelConfig:
52
77
  def __init__(
53
78
  self,
@@ -60,23 +85,28 @@ class ModelConfig:
60
85
  enable_multimodal: Optional[bool] = None,
61
86
  dtype: str = "auto",
62
87
  quantization: Optional[str] = None,
88
+ modelopt_quant: Optional[Union[str, Dict]] = None,
63
89
  override_config_file: Optional[str] = None,
64
90
  is_draft_model: bool = False,
65
91
  hybrid_kvcache_ratio: Optional[float] = None,
66
92
  model_impl: Union[str, ModelImpl] = ModelImpl.AUTO,
93
+ sampling_defaults: str = "openai",
67
94
  ) -> None:
68
95
  # Parse args
69
96
  self.model_path = model_path
70
97
  self.revision = revision
71
98
  self.quantization = quantization
99
+ self.modelopt_quant = modelopt_quant
100
+ self.is_draft_model = is_draft_model
72
101
  self.model_impl = model_impl
102
+ self.sampling_defaults = sampling_defaults
73
103
 
74
- self.maybe_pull_model_tokenizer_from_remote()
104
+ # Get hf config
105
+ self._maybe_pull_model_tokenizer_from_remote()
75
106
  self.model_override_args = json.loads(model_override_args)
76
107
  kwargs = {}
77
108
  if override_config_file and override_config_file.strip():
78
109
  kwargs["_configuration_file"] = override_config_file.strip()
79
-
80
110
  self.hf_config = get_config(
81
111
  self.model_path,
82
112
  trust_remote_code=trust_remote_code,
@@ -84,7 +114,7 @@ class ModelConfig:
84
114
  model_override_args=self.model_override_args,
85
115
  **kwargs,
86
116
  )
87
-
117
+ self.hf_text_config = get_hf_text_config(self.hf_config)
88
118
  self.hf_generation_config = get_generation_config(
89
119
  self.model_path,
90
120
  trust_remote_code=trust_remote_code,
@@ -92,7 +122,25 @@ class ModelConfig:
92
122
  **kwargs,
93
123
  )
94
124
 
95
- self.hf_text_config = get_hf_text_config(self.hf_config)
125
+ # Set enable_multimodal
126
+ if enable_multimodal is None:
127
+ mm_disabled_models = [
128
+ "Gemma3ForConditionalGeneration",
129
+ "Llama4ForConditionalGeneration",
130
+ "Step3VLForConditionalGeneration",
131
+ ]
132
+ if self.hf_config.architectures[0] in mm_disabled_models:
133
+ enable_multimodal = False
134
+ logger.info(
135
+ f"Multimodal is disabled for {self.hf_config.model_type}. To enable it, set --enable-multimodal."
136
+ )
137
+ else:
138
+ enable_multimodal = True
139
+
140
+ # Config draft model
141
+ self._config_draft_model()
142
+
143
+ # Check model type
96
144
  self.attention_chunk_size = getattr(
97
145
  self.hf_text_config, "attention_chunk_size", None
98
146
  )
@@ -108,20 +156,72 @@ class ModelConfig:
108
156
  self.hf_config.architectures, self.hf_text_config.num_hidden_layers
109
157
  )
110
158
  )
159
+ self.is_generation = is_generation_model(
160
+ self.hf_config.architectures, is_embedding
161
+ )
162
+ self.is_multimodal = enable_multimodal and is_multimodal_model(
163
+ self.hf_config.architectures
164
+ )
165
+ self.is_multimodal_gen = enable_multimodal and is_multimodal_gen_model(
166
+ self.hf_config.architectures
167
+ )
168
+ self.is_image_gen = enable_multimodal and is_image_gen_model(
169
+ self.hf_config.architectures
170
+ )
171
+ self.is_audio_model = enable_multimodal and is_audio_model(
172
+ self.hf_config.architectures
173
+ )
174
+ self.is_multimodal_chunked_prefill_supported = (
175
+ enable_multimodal
176
+ and is_multimodal_chunked_prefill_supported(self.hf_config.architectures)
177
+ )
178
+ self.is_encoder_decoder = is_encoder_decoder_model(self.hf_config.architectures)
179
+ self.dtype = _get_and_verify_dtype(self.hf_text_config, dtype)
111
180
 
112
- if enable_multimodal is None:
113
- mm_disabled_models = [
114
- "Gemma3ForConditionalGeneration",
115
- "Llama4ForConditionalGeneration",
116
- "Step3VLForConditionalGeneration",
117
- ]
118
- if self.hf_config.architectures[0] in mm_disabled_models:
119
- enable_multimodal = False
120
- logger.info(
121
- f"Multimodal is disabled for {self.hf_config.model_type}. To enable it, set --enable-multimodal."
122
- )
123
- else:
124
- enable_multimodal = True
181
+ # Derive context length and model shapes
182
+ self._derive_context_length(context_length)
183
+ self._derive_model_shapes()
184
+
185
+ # Verify quantization
186
+ self._verify_quantization()
187
+
188
+ # Verify dual-chunk attention config
189
+ self._verify_dual_chunk_attention_config()
190
+
191
+ # Cache attributes
192
+ self.hf_eos_token_id = self._get_hf_eos_token_id()
193
+
194
+ # multimodal
195
+ self.image_token_id = getattr(
196
+ self.hf_config, "image_token_id", None
197
+ ) or getattr(self.hf_config, "image_token_index", None)
198
+
199
+ @staticmethod
200
+ def from_server_args(
201
+ server_args: ServerArgs,
202
+ model_path: str = None,
203
+ model_revision: str = None,
204
+ **kwargs,
205
+ ):
206
+ return ModelConfig(
207
+ model_path=model_path or server_args.model_path,
208
+ trust_remote_code=server_args.trust_remote_code,
209
+ revision=model_revision or server_args.revision,
210
+ context_length=server_args.context_length,
211
+ model_override_args=server_args.json_model_override_args,
212
+ is_embedding=server_args.is_embedding,
213
+ enable_multimodal=server_args.enable_multimodal,
214
+ dtype=server_args.dtype,
215
+ quantization=server_args.quantization,
216
+ modelopt_quant=server_args.modelopt_quant,
217
+ hybrid_kvcache_ratio=server_args.hybrid_kvcache_ratio,
218
+ model_impl=server_args.model_impl,
219
+ sampling_defaults=server_args.sampling_defaults,
220
+ **kwargs,
221
+ )
222
+
223
+ def _config_draft_model(self):
224
+ is_draft_model = self.is_draft_model
125
225
 
126
226
  if (
127
227
  is_draft_model
@@ -141,37 +241,25 @@ class ModelConfig:
141
241
 
142
242
  if is_draft_model and self.hf_config.architectures[0] == "MiMoForCausalLM":
143
243
  self.hf_config.architectures[0] = "MiMoMTP"
244
+ if is_draft_model and self.hf_config.architectures[0] in [
245
+ "BailingMoeV2ForCausalLM",
246
+ "BailingMoeForCausalLM",
247
+ ]:
248
+ self.hf_config.architectures[0] = "BailingMoeForCausalLMNextN"
144
249
  if (
145
250
  is_draft_model
146
251
  and self.hf_config.architectures[0] == "Ernie4_5_MoeForCausalLM"
147
252
  ):
148
253
  self.hf_config.architectures[0] = "Ernie4_5_MoeForCausalLMMTP"
149
254
 
150
- # Check model type
151
- self.is_generation = is_generation_model(
152
- self.hf_config.architectures, is_embedding
153
- )
154
- self.is_multimodal = enable_multimodal and is_multimodal_model(
155
- self.hf_config.architectures
156
- )
157
- self.is_multimodal_gen = enable_multimodal and is_multimodal_gen_model(
158
- self.hf_config.architectures
159
- )
160
- self.is_image_gen = enable_multimodal and is_image_gen_model(
161
- self.hf_config.architectures
162
- )
163
- self.is_audio_model = enable_multimodal and is_audio_model(
164
- self.hf_config.architectures
165
- )
166
- self.is_multimodal_chunked_prefill_supported = (
167
- enable_multimodal
168
- and is_multimodal_chunked_prefill_supported(self.hf_config.architectures)
169
- )
170
- self.is_encoder_decoder = is_encoder_decoder_model(self.hf_config.architectures)
171
- self.dtype = _get_and_verify_dtype(self.hf_text_config, dtype)
255
+ if is_draft_model and self.hf_config.architectures[0] == "Qwen3NextForCausalLM":
256
+ self.hf_config.architectures[0] = "Qwen3NextForCausalLMMTP"
257
+ self.hf_config.num_nextn_predict_layers = 1
172
258
 
173
- # Derive context length
259
+ def _derive_context_length(self, context_length: int):
260
+ is_draft_model = self.is_draft_model
174
261
  derived_context_len = get_context_length(self.hf_text_config)
262
+
175
263
  if context_length is not None:
176
264
  if context_length > derived_context_len:
177
265
  reason = "Target model's" if is_draft_model else "User-specified"
@@ -180,11 +268,16 @@ class ModelConfig:
180
268
  f"This may lead to incorrect model outputs or CUDA errors. Note that the derived context_length may differ from max_position_embeddings in the model's config."
181
269
  )
182
270
  if (
183
- get_bool_env_var("SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN")
271
+ envs.SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN.get()
184
272
  or is_in_ci() # FIXME: fix this special case
185
273
  ):
186
274
  logger.warning(msg)
187
275
  self.context_len = context_length
276
+ if is_draft_model:
277
+ self.hf_text_config.max_position_embeddings = context_length
278
+ logger.warning(
279
+ f"Overriding the draft model's max_position_embeddings to {context_length}."
280
+ )
188
281
  else:
189
282
  raise ValueError(
190
283
  f"{msg} To allow overriding this maximum, set the env var SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN=1"
@@ -194,6 +287,10 @@ class ModelConfig:
194
287
  else:
195
288
  self.context_len = derived_context_len
196
289
 
290
+ # Transfer context_len to HuggingFace config so models can access it
291
+ self.hf_config.context_len = self.context_len
292
+
293
+ def _derive_model_shapes(self):
197
294
  # Unify the config keys for hf_text_config
198
295
  self.head_dim = getattr(
199
296
  self.hf_text_config,
@@ -204,10 +301,12 @@ class ModelConfig:
204
301
  # FIXME: temporary special judge for MLA architecture
205
302
  if (
206
303
  "DeepseekV2ForCausalLM" in self.hf_config.architectures
304
+ or "DeepseekV32ForCausalLM" in self.hf_config.architectures
207
305
  or "DeepseekV3ForCausalLM" in self.hf_config.architectures
208
306
  or "DeepseekV3ForCausalLMNextN" in self.hf_config.architectures
209
307
  or "LongcatFlashForCausalLM" in self.hf_config.architectures
210
308
  or "LongcatFlashForCausalLMNextN" in self.hf_config.architectures
309
+ or "DotsVLMForCausalLM" in self.hf_config.architectures
211
310
  ):
212
311
  self.head_dim = 256
213
312
  self.attention_arch = AttentionArch.MLA
@@ -215,6 +314,11 @@ class ModelConfig:
215
314
  self.qk_nope_head_dim = self.hf_config.qk_nope_head_dim
216
315
  self.qk_rope_head_dim = self.hf_config.qk_rope_head_dim
217
316
  self.v_head_dim = self.hf_config.v_head_dim
317
+ self.index_head_dim = (
318
+ get_nsa_index_head_dim(self.hf_config)
319
+ if is_deepseek_nsa(self.hf_config)
320
+ else None
321
+ )
218
322
 
219
323
  # Handle rope scaling with yarn
220
324
  self.scaling = 1 / math.sqrt(self.qk_nope_head_dim + self.qk_rope_head_dim)
@@ -287,37 +391,6 @@ class ModelConfig:
287
391
  )
288
392
  self.vocab_size = self.hf_text_config.vocab_size
289
393
 
290
- # Verify quantization
291
- self._verify_quantization()
292
-
293
- # Verify dual-chunk attention config
294
- self._verify_dual_chunk_attention_config()
295
-
296
- # Cache attributes
297
- self.hf_eos_token_id = self.get_hf_eos_token_id()
298
-
299
- # multimodal
300
- self.image_token_id = getattr(
301
- self.hf_config, "image_token_id", None
302
- ) or getattr(self.hf_config, "image_token_index", None)
303
-
304
- @staticmethod
305
- def from_server_args(server_args: ServerArgs, model_path: str = None, **kwargs):
306
- return ModelConfig(
307
- model_path=model_path or server_args.model_path,
308
- trust_remote_code=server_args.trust_remote_code,
309
- revision=server_args.revision,
310
- context_length=server_args.context_length,
311
- model_override_args=server_args.json_model_override_args,
312
- is_embedding=server_args.is_embedding,
313
- enable_multimodal=server_args.enable_multimodal,
314
- dtype=server_args.dtype,
315
- quantization=server_args.quantization,
316
- hybrid_kvcache_ratio=server_args.hybrid_kvcache_ratio,
317
- model_impl=server_args.model_impl,
318
- **kwargs,
319
- )
320
-
321
394
  def get_total_num_attention_heads(self) -> int:
322
395
  return self.num_attention_heads
323
396
 
@@ -410,27 +483,52 @@ class ModelConfig:
410
483
  # example: https://huggingface.co/nvidia/Llama-3.1-8B-Instruct-FP8/tree/main
411
484
  # example: https://huggingface.co/Barrrrry/DeepSeek-R1-W4AFP8/tree/main
412
485
  is_local = os.path.exists(self.model_path)
413
- modelopt_quant_config = {"quant_method": "modelopt"}
414
486
  if not is_local:
415
- from huggingface_hub import HfApi
416
-
417
- hf_api = HfApi()
418
- if hf_api.file_exists(self.model_path, "hf_quant_config.json"):
419
- quant_cfg = modelopt_quant_config
487
+ import huggingface_hub
488
+
489
+ try:
490
+ from huggingface_hub import HfApi, hf_hub_download
491
+
492
+ hf_api = HfApi()
493
+ if hf_api.file_exists(self.model_path, "hf_quant_config.json"):
494
+ # Download and parse the quantization config for remote models
495
+ quant_config_file = hf_hub_download(
496
+ repo_id=self.model_path,
497
+ filename="hf_quant_config.json",
498
+ revision=self.revision,
499
+ )
500
+ with open(quant_config_file) as f:
501
+ quant_config_dict = json.load(f)
502
+ quant_cfg = self._parse_modelopt_quant_config(quant_config_dict)
503
+ except huggingface_hub.errors.OfflineModeIsEnabled:
504
+ logger.warning(
505
+ "Offline mode is enabled, skipping hf_quant_config.json check"
506
+ )
507
+ pass
420
508
  elif os.path.exists(os.path.join(self.model_path, "hf_quant_config.json")):
421
509
  quant_config_file = os.path.join(
422
510
  self.model_path, "hf_quant_config.json"
423
511
  )
424
512
  with open(quant_config_file) as f:
425
513
  quant_config_dict = json.load(f)
426
- json_quant_configs = quant_config_dict["quantization"]
427
- quant_algo = json_quant_configs.get("quant_algo", None)
428
- if quant_algo == "MIXED_PRECISION":
429
- quant_cfg = {"quant_method": "w4afp8"}
430
- else:
431
- quant_cfg = modelopt_quant_config
514
+ quant_cfg = self._parse_modelopt_quant_config(quant_config_dict)
432
515
  return quant_cfg
433
516
 
517
+ def _parse_modelopt_quant_config(self, quant_config_dict: dict) -> dict:
518
+ """Parse ModelOpt quantization config and return the appropriate quant_method."""
519
+ json_quant_configs = quant_config_dict["quantization"]
520
+ quant_algo = json_quant_configs.get("quant_algo", None)
521
+
522
+ if quant_algo == "MIXED_PRECISION":
523
+ return {"quant_method": "w4afp8"}
524
+ elif quant_algo and ("FP4" in quant_algo or "NVFP4" in quant_algo):
525
+ return {"quant_method": "modelopt_fp4"}
526
+ elif quant_algo and "FP8" in quant_algo:
527
+ return {"quant_method": "modelopt_fp8"}
528
+ else:
529
+ # Default to FP8 for backward compatibility
530
+ return {"quant_method": "modelopt_fp8"}
531
+
434
532
  # adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/config.py
435
533
  def _verify_quantization(self) -> None:
436
534
  supported_quantization = [*QUANTIZATION_METHODS]
@@ -449,7 +547,8 @@ class ModelConfig:
449
547
  optimized_quantization_methods = [
450
548
  "fp8",
451
549
  "marlin",
452
- "modelopt",
550
+ "modelopt_fp8",
551
+ "modelopt_fp4",
453
552
  "gptq_marlin_24",
454
553
  "gptq_marlin",
455
554
  "awq_marlin",
@@ -543,7 +642,7 @@ class ModelConfig:
543
642
  "sparse_attention_enabled"
544
643
  ] = True
545
644
 
546
- def get_hf_eos_token_id(self) -> Optional[Set[int]]:
645
+ def _get_hf_eos_token_id(self) -> Optional[Set[int]]:
547
646
  eos_ids = getattr(self.hf_config, "eos_token_id", None)
548
647
  if eos_ids is not None:
549
648
  # it can be either int or list of int
@@ -563,7 +662,39 @@ class ModelConfig:
563
662
  eos_ids = eos_ids | generation_eos_ids
564
663
  return eos_ids
565
664
 
566
- def maybe_pull_model_tokenizer_from_remote(self) -> None:
665
+ def get_default_sampling_params(self) -> dict[str, Any]:
666
+ """
667
+ Get default sampling parameters from the model's generation config.
668
+
669
+ This method returns non-default sampling parameters from the model's
670
+ generation_config.json when sampling_defaults is set to "model".
671
+
672
+ Returns:
673
+ A dictionary containing the non-default sampling parameters.
674
+ """
675
+ if self.sampling_defaults != "model":
676
+ return {}
677
+
678
+ if self.hf_generation_config is None:
679
+ return {}
680
+
681
+ config = self.hf_generation_config.to_dict()
682
+
683
+ available_params = [
684
+ "repetition_penalty",
685
+ "temperature",
686
+ "top_k",
687
+ "top_p",
688
+ "min_p",
689
+ ]
690
+
691
+ default_sampling_params = {
692
+ p: config.get(p) for p in available_params if config.get(p) is not None
693
+ }
694
+
695
+ return default_sampling_params
696
+
697
+ def _maybe_pull_model_tokenizer_from_remote(self) -> None:
567
698
  """
568
699
  Pull the model config files to a temporary
569
700
  directory in case of remote.
@@ -706,12 +837,17 @@ multimodal_model_archs = [
706
837
  "Qwen2AudioForConditionalGeneration",
707
838
  "Qwen2VLForConditionalGeneration",
708
839
  "Qwen2_5_VLForConditionalGeneration",
840
+ "Qwen3VLForConditionalGeneration",
841
+ "Qwen3VLMoeForConditionalGeneration",
709
842
  "KimiVLForConditionalGeneration",
710
843
  "InternVLChatModel",
711
844
  "InternS1ForConditionalGeneration",
712
845
  "Phi4MMForCausalLM",
713
846
  "VILAForConditionalGeneration",
714
847
  "Step3VLForConditionalGeneration",
848
+ "DotsVLMForCausalLM",
849
+ "DotsOCRForCausalLM",
850
+ "Sarashina2VisionForCausalLM",
715
851
  ]
716
852
 
717
853