sglang 0.5.3rc0__py3-none-any.whl → 0.5.3rc2__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (282)
  1. sglang/bench_one_batch.py +7 -9
  2. sglang/bench_one_batch_server.py +321 -31
  3. sglang/bench_serving.py +10 -3
  4. sglang/global_config.py +2 -2
  5. sglang/lang/backend/runtime_endpoint.py +1 -1
  6. sglang/launch_server.py +14 -0
  7. sglang/profiler.py +2 -2
  8. sglang/srt/batch_invariant_ops/__init__.py +27 -0
  9. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +549 -0
  10. sglang/srt/configs/__init__.py +4 -0
  11. sglang/srt/configs/dots_ocr.py +64 -0
  12. sglang/srt/configs/falcon_h1.py +360 -0
  13. sglang/srt/configs/load_config.py +8 -0
  14. sglang/srt/configs/model_config.py +160 -105
  15. sglang/srt/configs/qwen3_vl.py +586 -0
  16. sglang/srt/constrained/base_grammar_backend.py +1 -0
  17. sglang/srt/constrained/outlines_jump_forward.py +1 -1
  18. sglang/srt/constrained/xgrammar_backend.py +6 -4
  19. sglang/srt/debug_utils/dumper.py +10 -3
  20. sglang/srt/disaggregation/ascend/conn.py +2 -2
  21. sglang/srt/disaggregation/ascend/transfer_engine.py +47 -9
  22. sglang/srt/disaggregation/common/conn.py +266 -98
  23. sglang/srt/disaggregation/decode.py +50 -9
  24. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
  25. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +25 -16
  26. sglang/srt/disaggregation/mooncake/conn.py +51 -541
  27. sglang/srt/disaggregation/nixl/conn.py +148 -39
  28. sglang/srt/disaggregation/prefill.py +31 -14
  29. sglang/srt/disaggregation/utils.py +36 -5
  30. sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
  31. sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
  32. sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
  33. sglang/srt/distributed/parallel_state.py +135 -80
  34. sglang/srt/entrypoints/engine.py +23 -3
  35. sglang/srt/entrypoints/grpc_request_manager.py +330 -55
  36. sglang/srt/entrypoints/grpc_server.py +232 -102
  37. sglang/srt/entrypoints/http_server.py +49 -9
  38. sglang/srt/entrypoints/openai/protocol.py +110 -5
  39. sglang/srt/entrypoints/openai/serving_base.py +25 -6
  40. sglang/srt/entrypoints/openai/serving_chat.py +178 -49
  41. sglang/srt/entrypoints/openai/serving_completions.py +5 -3
  42. sglang/srt/entrypoints/openai/serving_embedding.py +1 -0
  43. sglang/srt/entrypoints/openai/serving_responses.py +42 -0
  44. sglang/srt/environ.py +285 -0
  45. sglang/srt/eplb/expert_location.py +30 -5
  46. sglang/srt/function_call/function_call_parser.py +3 -2
  47. sglang/srt/function_call/glm4_moe_detector.py +3 -3
  48. sglang/srt/function_call/gpt_oss_detector.py +23 -0
  49. sglang/srt/function_call/json_array_parser.py +63 -0
  50. sglang/srt/function_call/kimik2_detector.py +17 -4
  51. sglang/srt/function_call/utils.py +96 -5
  52. sglang/srt/grpc/compile_proto.py +245 -0
  53. sglang/srt/grpc/sglang_scheduler_pb2.py +73 -68
  54. sglang/srt/grpc/sglang_scheduler_pb2.pyi +60 -53
  55. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +3 -0
  56. sglang/srt/layers/activation.py +7 -6
  57. sglang/srt/layers/attention/aiter_backend.py +14 -15
  58. sglang/srt/layers/attention/ascend_backend.py +108 -9
  59. sglang/srt/layers/attention/attention_registry.py +206 -0
  60. sglang/srt/layers/attention/base_attn_backend.py +12 -3
  61. sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
  62. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
  63. sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +2 -2
  64. sglang/srt/layers/attention/fla/fused_recurrent.py +4 -4
  65. sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +2 -2
  66. sglang/srt/layers/attention/flashattention_backend.py +41 -8
  67. sglang/srt/layers/attention/flashinfer_backend.py +112 -194
  68. sglang/srt/layers/attention/flashinfer_mla_backend.py +11 -15
  69. sglang/srt/layers/attention/flashmla_backend.py +7 -5
  70. sglang/srt/layers/attention/hybrid_attn_backend.py +11 -3
  71. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +72 -72
  72. sglang/srt/layers/attention/mamba/causal_conv1d.py +1 -0
  73. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +15 -98
  74. sglang/srt/layers/attention/mamba/mamba.py +566 -1
  75. sglang/srt/layers/attention/mamba/mamba_utils.py +81 -0
  76. sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
  77. sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
  78. sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
  79. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +264 -0
  80. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +622 -0
  81. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +757 -0
  82. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +262 -0
  83. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +275 -0
  84. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
  85. sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
  86. sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
  87. sglang/srt/layers/attention/nsa/nsa_indexer.py +761 -0
  88. sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
  89. sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
  90. sglang/srt/layers/attention/nsa/transform_index.py +144 -0
  91. sglang/srt/layers/attention/nsa/utils.py +24 -0
  92. sglang/srt/layers/attention/nsa_backend.py +887 -0
  93. sglang/srt/layers/attention/tbo_backend.py +6 -6
  94. sglang/srt/layers/attention/torch_flex_backend.py +325 -0
  95. sglang/srt/layers/attention/triton_backend.py +42 -9
  96. sglang/srt/layers/attention/trtllm_mha_backend.py +5 -7
  97. sglang/srt/layers/attention/trtllm_mla_backend.py +178 -34
  98. sglang/srt/layers/attention/vision.py +58 -0
  99. sglang/srt/layers/attention/wave_backend.py +4 -4
  100. sglang/srt/layers/communicator.py +8 -0
  101. sglang/srt/layers/dp_attention.py +11 -1
  102. sglang/srt/layers/elementwise.py +3 -1
  103. sglang/srt/layers/layernorm.py +2 -0
  104. sglang/srt/layers/linear.py +21 -4
  105. sglang/srt/layers/logits_processor.py +15 -2
  106. sglang/srt/layers/moe/ep_moe/kernels.py +1 -1
  107. sglang/srt/layers/moe/ep_moe/layer.py +147 -74
  108. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +52 -25
  109. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  110. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  111. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
  112. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +6 -2
  113. sglang/srt/layers/moe/fused_moe_triton/layer.py +11 -12
  114. sglang/srt/layers/moe/token_dispatcher/deepep.py +77 -19
  115. sglang/srt/layers/moe/utils.py +10 -0
  116. sglang/srt/layers/parameter.py +23 -6
  117. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
  118. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
  119. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
  120. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -10
  121. sglang/srt/layers/quantization/fp8.py +2 -2
  122. sglang/srt/layers/quantization/fp8_utils.py +1 -1
  123. sglang/srt/layers/quantization/modelopt_quant.py +44 -9
  124. sglang/srt/layers/quantization/mxfp4.py +12 -4
  125. sglang/srt/layers/quantization/quark/quark_moe.py +16 -3
  126. sglang/srt/layers/quantization/w4afp8.py +0 -4
  127. sglang/srt/layers/quantization/w8a8_int8.py +15 -3
  128. sglang/srt/layers/rotary_embedding.py +78 -31
  129. sglang/srt/layers/sampler.py +52 -4
  130. sglang/srt/layers/utils.py +23 -0
  131. sglang/srt/lora/backend/base_backend.py +3 -3
  132. sglang/srt/lora/backend/chunked_backend.py +348 -0
  133. sglang/srt/lora/backend/triton_backend.py +10 -4
  134. sglang/srt/lora/lora.py +7 -5
  135. sglang/srt/lora/lora_manager.py +17 -6
  136. sglang/srt/lora/mem_pool.py +1 -1
  137. sglang/srt/lora/triton_ops/__init__.py +4 -0
  138. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
  139. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +174 -0
  140. sglang/srt/lora/utils.py +7 -5
  141. sglang/srt/managers/cache_controller.py +42 -142
  142. sglang/srt/managers/data_parallel_controller.py +11 -46
  143. sglang/srt/managers/detokenizer_manager.py +11 -11
  144. sglang/srt/managers/io_struct.py +162 -118
  145. sglang/srt/managers/mm_utils.py +43 -6
  146. sglang/srt/managers/multi_tokenizer_mixin.py +17 -17
  147. sglang/srt/managers/multimodal_processor.py +1 -2
  148. sglang/srt/managers/overlap_utils.py +53 -0
  149. sglang/srt/managers/schedule_batch.py +167 -86
  150. sglang/srt/managers/schedule_policy.py +143 -16
  151. sglang/srt/managers/scheduler.py +359 -214
  152. sglang/srt/managers/scheduler_input_blocker.py +1 -1
  153. sglang/srt/managers/scheduler_metrics_mixin.py +98 -126
  154. sglang/srt/managers/scheduler_output_processor_mixin.py +21 -12
  155. sglang/srt/managers/scheduler_profiler_mixin.py +5 -5
  156. sglang/srt/managers/scheduler_update_weights_mixin.py +7 -0
  157. sglang/srt/managers/tokenizer_communicator_mixin.py +111 -5
  158. sglang/srt/managers/tokenizer_manager.py +84 -136
  159. sglang/srt/managers/tp_worker.py +39 -29
  160. sglang/srt/managers/tp_worker_overlap_thread.py +33 -41
  161. sglang/srt/managers/utils.py +1 -45
  162. sglang/srt/mem_cache/allocator.py +14 -20
  163. sglang/srt/mem_cache/allocator_ascend.py +41 -27
  164. sglang/srt/mem_cache/base_prefix_cache.py +1 -1
  165. sglang/srt/mem_cache/chunk_cache.py +8 -1
  166. sglang/srt/mem_cache/evict_policy.py +23 -0
  167. sglang/srt/mem_cache/hicache_storage.py +40 -1
  168. sglang/srt/mem_cache/hiradix_cache.py +119 -32
  169. sglang/srt/mem_cache/memory_pool.py +188 -10
  170. sglang/srt/mem_cache/memory_pool_host.py +134 -182
  171. sglang/srt/mem_cache/radix_cache.py +222 -71
  172. sglang/srt/mem_cache/radix_cache_cpp.py +11 -8
  173. sglang/srt/mem_cache/storage/__init__.py +10 -0
  174. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +151 -0
  175. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +109 -0
  176. sglang/srt/mem_cache/storage/backend_factory.py +223 -0
  177. sglang/srt/mem_cache/storage/eic/eic_storage.py +778 -0
  178. sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
  179. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +173 -58
  180. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +10 -6
  181. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +117 -10
  182. sglang/srt/mem_cache/swa_radix_cache.py +25 -34
  183. sglang/srt/metrics/collector.py +82 -120
  184. sglang/srt/metrics/func_timer.py +2 -7
  185. sglang/srt/metrics/utils.py +8 -1
  186. sglang/srt/model_executor/cpu_graph_runner.py +2 -2
  187. sglang/srt/model_executor/cuda_graph_runner.py +39 -32
  188. sglang/srt/model_executor/forward_batch_info.py +23 -38
  189. sglang/srt/model_executor/model_runner.py +131 -183
  190. sglang/srt/model_executor/npu_graph_runner.py +12 -5
  191. sglang/srt/model_loader/loader.py +14 -10
  192. sglang/srt/model_loader/weight_utils.py +156 -2
  193. sglang/srt/models/bailing_moe.py +27 -4
  194. sglang/srt/models/deepseek_nextn.py +6 -1
  195. sglang/srt/models/deepseek_v2.py +536 -153
  196. sglang/srt/models/dots_ocr.py +173 -0
  197. sglang/srt/models/falcon_h1.py +576 -0
  198. sglang/srt/models/gemma3_causal.py +0 -2
  199. sglang/srt/models/gemma3_mm.py +1 -1
  200. sglang/srt/models/gemma3n_mm.py +1 -1
  201. sglang/srt/models/glm4_moe.py +3 -3
  202. sglang/srt/models/glm4_moe_nextn.py +2 -2
  203. sglang/srt/models/glm4v.py +1 -1
  204. sglang/srt/models/glm4v_moe.py +1 -1
  205. sglang/srt/models/gpt_oss.py +7 -30
  206. sglang/srt/models/kimi_vl_moonvit.py +2 -2
  207. sglang/srt/models/llama.py +4 -0
  208. sglang/srt/models/longcat_flash.py +1 -1
  209. sglang/srt/models/longcat_flash_nextn.py +1 -1
  210. sglang/srt/models/mllama4.py +15 -4
  211. sglang/srt/models/qwen2.py +0 -7
  212. sglang/srt/models/qwen2_5_vl.py +2 -2
  213. sglang/srt/models/qwen2_audio.py +1 -1
  214. sglang/srt/models/qwen2_moe.py +64 -1
  215. sglang/srt/models/qwen2_vl.py +1 -1
  216. sglang/srt/models/qwen3.py +18 -3
  217. sglang/srt/models/qwen3_moe.py +31 -3
  218. sglang/srt/models/qwen3_next.py +36 -9
  219. sglang/srt/models/qwen3_vl.py +787 -0
  220. sglang/srt/models/qwen3_vl_moe.py +471 -0
  221. sglang/srt/models/registry.py +15 -3
  222. sglang/srt/models/sarashina2_vision.py +269 -0
  223. sglang/srt/models/solar.py +505 -0
  224. sglang/srt/models/starcoder2.py +357 -0
  225. sglang/srt/models/torch_native_llama.py +9 -2
  226. sglang/srt/models/utils.py +51 -0
  227. sglang/srt/multimodal/processors/base_processor.py +15 -7
  228. sglang/srt/multimodal/processors/dots_vlm.py +2 -3
  229. sglang/srt/multimodal/processors/internvl.py +20 -8
  230. sglang/srt/multimodal/processors/qwen_vl.py +8 -1
  231. sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
  232. sglang/srt/parser/jinja_template_utils.py +6 -0
  233. sglang/srt/sampling/sampling_batch_info.py +20 -2
  234. sglang/srt/sampling/sampling_params.py +7 -0
  235. sglang/srt/server_args.py +753 -295
  236. sglang/srt/server_args_config_parser.py +146 -0
  237. sglang/srt/single_batch_overlap.py +151 -0
  238. sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
  239. sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
  240. sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
  241. sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
  242. sglang/srt/speculative/cpp_ngram/param.h +125 -0
  243. sglang/srt/speculative/cpp_ngram/queue.h +71 -0
  244. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +2 -1
  245. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +3 -1
  246. sglang/srt/speculative/{eagle_utils.py → eagle_info.py} +207 -755
  247. sglang/srt/speculative/eagle_worker.py +57 -25
  248. sglang/srt/speculative/ngram_utils.py +428 -0
  249. sglang/srt/speculative/ngram_worker.py +245 -0
  250. sglang/srt/speculative/spec_info.py +47 -0
  251. sglang/srt/speculative/spec_utils.py +606 -0
  252. sglang/srt/torch_memory_saver_adapter.py +5 -7
  253. sglang/srt/tracing/trace.py +32 -6
  254. sglang/srt/two_batch_overlap.py +8 -5
  255. sglang/srt/utils/__init__.py +2 -0
  256. sglang/srt/{utils.py → utils/common.py} +399 -74
  257. sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +49 -5
  258. sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
  259. sglang/srt/utils/rpd_utils.py +452 -0
  260. sglang/srt/utils/slow_rank_detector.py +71 -0
  261. sglang/srt/warmup.py +8 -4
  262. sglang/srt/weight_sync/utils.py +1 -1
  263. sglang/test/get_logits_ut.py +57 -0
  264. sglang/test/run_eval.py +79 -11
  265. sglang/test/runners.py +1 -1
  266. sglang/test/simple_eval_common.py +5 -2
  267. sglang/test/simple_eval_mmmu_vlm.py +441 -0
  268. sglang/test/test_block_fp8.py +2 -2
  269. sglang/test/test_deterministic.py +297 -0
  270. sglang/test/test_disaggregation_utils.py +12 -1
  271. sglang/test/test_programs.py +1 -1
  272. sglang/test/test_utils.py +355 -4
  273. sglang/utils.py +10 -1
  274. sglang/version.py +1 -1
  275. {sglang-0.5.3rc0.dist-info → sglang-0.5.3rc2.dist-info}/METADATA +34 -25
  276. {sglang-0.5.3rc0.dist-info → sglang-0.5.3rc2.dist-info}/RECORD +281 -210
  277. sglang/srt/mem_cache/lora_radix_cache.py +0 -421
  278. /sglang/srt/{remote_instance_weight_loader_utils.py → model_loader/remote_instance_weight_loader_utils.py} +0 -0
  279. /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
  280. {sglang-0.5.3rc0.dist-info → sglang-0.5.3rc2.dist-info}/WHEEL +0 -0
  281. {sglang-0.5.3rc0.dist-info → sglang-0.5.3rc2.dist-info}/licenses/LICENSE +0 -0
  282. {sglang-0.5.3rc0.dist-info → sglang-0.5.3rc2.dist-info}/top_level.txt +0 -0
@@ -22,16 +22,17 @@ from typing import List, Optional, Set, Union
22
22
  import torch
23
23
  from transformers import PretrainedConfig
24
24
 
25
- from sglang.srt.hf_transformers_utils import (
25
+ from sglang.srt.environ import envs
26
+ from sglang.srt.layers.quantization import QUANTIZATION_METHODS
27
+ from sglang.srt.server_args import ServerArgs
28
+ from sglang.srt.utils import is_hip, retry
29
+ from sglang.srt.utils.hf_transformers_utils import (
26
30
  get_config,
27
31
  get_context_length,
28
32
  get_generation_config,
29
33
  get_hf_text_config,
30
34
  get_sparse_attention_config,
31
35
  )
32
- from sglang.srt.layers.quantization import QUANTIZATION_METHODS
33
- from sglang.srt.server_args import ServerArgs
34
- from sglang.srt.utils import get_bool_env_var, is_hip
35
36
  from sglang.utils import is_in_ci
36
37
 
37
38
  logger = logging.getLogger(__name__)
@@ -48,6 +49,30 @@ class ModelImpl(str, Enum):
48
49
  TRANSFORMERS = "transformers"
49
50
 
50
51
 
52
+ def is_deepseek_nsa(config: PretrainedConfig) -> bool:
53
+ return (
54
+ config.architectures is not None
55
+ and config.architectures[0]
56
+ in ["DeepseekV3ForCausalLM", "DeepseekV32ForCausalLM"]
57
+ and getattr(config, "index_topk", None) is not None
58
+ )
59
+
60
+
61
+ def get_nsa_index_head_dim(config: PretrainedConfig) -> int:
62
+ assert is_deepseek_nsa(config)
63
+ return config.index_head_dim
64
+
65
+
66
+ def get_nsa_index_topk(config: PretrainedConfig) -> int:
67
+ assert is_deepseek_nsa(config)
68
+ return config.index_topk
69
+
70
+
71
+ def get_nsa_index_n_heads(config: PretrainedConfig) -> int:
72
+ assert is_deepseek_nsa(config)
73
+ return config.index_n_heads
74
+
75
+
51
76
  class ModelConfig:
52
77
  def __init__(
53
78
  self,
@@ -64,35 +89,20 @@ class ModelConfig:
64
89
  is_draft_model: bool = False,
65
90
  hybrid_kvcache_ratio: Optional[float] = None,
66
91
  model_impl: Union[str, ModelImpl] = ModelImpl.AUTO,
67
- tp_rank: Optional[int] = None,
68
- remote_instance_weight_loader_seed_instance_ip: Optional[str] = None,
69
- remote_instance_weight_loader_seed_instance_service_port: Optional[int] = None,
70
- remote_instance_weight_loader_send_weights_group_ports: Optional[
71
- List[int]
72
- ] = None,
73
92
  ) -> None:
74
93
  # Parse args
75
94
  self.model_path = model_path
76
95
  self.revision = revision
77
96
  self.quantization = quantization
97
+ self.is_draft_model = is_draft_model
78
98
  self.model_impl = model_impl
79
- self.tp_rank = tp_rank
80
- self.remote_instance_weight_loader_seed_instance_ip = (
81
- remote_instance_weight_loader_seed_instance_ip
82
- )
83
- self.remote_instance_weight_loader_seed_instance_service_port = (
84
- remote_instance_weight_loader_seed_instance_service_port
85
- )
86
- self.remote_instance_weight_loader_send_weights_group_ports = (
87
- remote_instance_weight_loader_send_weights_group_ports
88
- )
89
99
 
90
- self.maybe_pull_model_tokenizer_from_remote()
100
+ # Get hf config
101
+ self._maybe_pull_model_tokenizer_from_remote()
91
102
  self.model_override_args = json.loads(model_override_args)
92
103
  kwargs = {}
93
104
  if override_config_file and override_config_file.strip():
94
105
  kwargs["_configuration_file"] = override_config_file.strip()
95
-
96
106
  self.hf_config = get_config(
97
107
  self.model_path,
98
108
  trust_remote_code=trust_remote_code,
@@ -100,7 +110,7 @@ class ModelConfig:
100
110
  model_override_args=self.model_override_args,
101
111
  **kwargs,
102
112
  )
103
-
113
+ self.hf_text_config = get_hf_text_config(self.hf_config)
104
114
  self.hf_generation_config = get_generation_config(
105
115
  self.model_path,
106
116
  trust_remote_code=trust_remote_code,
@@ -108,7 +118,25 @@ class ModelConfig:
108
118
  **kwargs,
109
119
  )
110
120
 
111
- self.hf_text_config = get_hf_text_config(self.hf_config)
121
+ # Set enable_multimodal
122
+ if enable_multimodal is None:
123
+ mm_disabled_models = [
124
+ "Gemma3ForConditionalGeneration",
125
+ "Llama4ForConditionalGeneration",
126
+ "Step3VLForConditionalGeneration",
127
+ ]
128
+ if self.hf_config.architectures[0] in mm_disabled_models:
129
+ enable_multimodal = False
130
+ logger.info(
131
+ f"Multimodal is disabled for {self.hf_config.model_type}. To enable it, set --enable-multimodal."
132
+ )
133
+ else:
134
+ enable_multimodal = True
135
+
136
+ # Config draft model
137
+ self._config_draft_model()
138
+
139
+ # Check model type
112
140
  self.attention_chunk_size = getattr(
113
141
  self.hf_text_config, "attention_chunk_size", None
114
142
  )
@@ -124,20 +152,70 @@ class ModelConfig:
124
152
  self.hf_config.architectures, self.hf_text_config.num_hidden_layers
125
153
  )
126
154
  )
155
+ self.is_generation = is_generation_model(
156
+ self.hf_config.architectures, is_embedding
157
+ )
158
+ self.is_multimodal = enable_multimodal and is_multimodal_model(
159
+ self.hf_config.architectures
160
+ )
161
+ self.is_multimodal_gen = enable_multimodal and is_multimodal_gen_model(
162
+ self.hf_config.architectures
163
+ )
164
+ self.is_image_gen = enable_multimodal and is_image_gen_model(
165
+ self.hf_config.architectures
166
+ )
167
+ self.is_audio_model = enable_multimodal and is_audio_model(
168
+ self.hf_config.architectures
169
+ )
170
+ self.is_multimodal_chunked_prefill_supported = (
171
+ enable_multimodal
172
+ and is_multimodal_chunked_prefill_supported(self.hf_config.architectures)
173
+ )
174
+ self.is_encoder_decoder = is_encoder_decoder_model(self.hf_config.architectures)
175
+ self.dtype = _get_and_verify_dtype(self.hf_text_config, dtype)
127
176
 
128
- if enable_multimodal is None:
129
- mm_disabled_models = [
130
- "Gemma3ForConditionalGeneration",
131
- "Llama4ForConditionalGeneration",
132
- "Step3VLForConditionalGeneration",
133
- ]
134
- if self.hf_config.architectures[0] in mm_disabled_models:
135
- enable_multimodal = False
136
- logger.info(
137
- f"Multimodal is disabled for {self.hf_config.model_type}. To enable it, set --enable-multimodal."
138
- )
139
- else:
140
- enable_multimodal = True
177
+ # Derive context length and model shapes
178
+ self._derive_context_length(context_length)
179
+ self._derive_model_shapes()
180
+
181
+ # Verify quantization
182
+ self._verify_quantization()
183
+
184
+ # Verify dual-chunk attention config
185
+ self._verify_dual_chunk_attention_config()
186
+
187
+ # Cache attributes
188
+ self.hf_eos_token_id = self._get_hf_eos_token_id()
189
+
190
+ # multimodal
191
+ self.image_token_id = getattr(
192
+ self.hf_config, "image_token_id", None
193
+ ) or getattr(self.hf_config, "image_token_index", None)
194
+
195
+ @staticmethod
196
+ def from_server_args(
197
+ server_args: ServerArgs,
198
+ model_path: str = None,
199
+ model_revision: str = None,
200
+ **kwargs,
201
+ ):
202
+ return ModelConfig(
203
+ model_path=model_path or server_args.model_path,
204
+ trust_remote_code=server_args.trust_remote_code,
205
+ revision=model_revision or server_args.revision,
206
+ context_length=server_args.context_length,
207
+ model_override_args=server_args.json_model_override_args,
208
+ is_embedding=server_args.is_embedding,
209
+ enable_multimodal=server_args.enable_multimodal,
210
+ dtype=server_args.dtype,
211
+ quantization=server_args.quantization,
212
+ hybrid_kvcache_ratio=server_args.hybrid_kvcache_ratio,
213
+ model_impl=server_args.model_impl,
214
+ **kwargs,
215
+ )
216
+
217
+ def _config_draft_model(self):
218
+ is_draft_model = self.is_draft_model
141
219
 
142
220
  if (
143
221
  is_draft_model
@@ -172,31 +250,10 @@ class ModelConfig:
172
250
  self.hf_config.architectures[0] = "Qwen3NextForCausalLMMTP"
173
251
  self.hf_config.num_nextn_predict_layers = 1
174
252
 
175
- # Check model type
176
- self.is_generation = is_generation_model(
177
- self.hf_config.architectures, is_embedding
178
- )
179
- self.is_multimodal = enable_multimodal and is_multimodal_model(
180
- self.hf_config.architectures
181
- )
182
- self.is_multimodal_gen = enable_multimodal and is_multimodal_gen_model(
183
- self.hf_config.architectures
184
- )
185
- self.is_image_gen = enable_multimodal and is_image_gen_model(
186
- self.hf_config.architectures
187
- )
188
- self.is_audio_model = enable_multimodal and is_audio_model(
189
- self.hf_config.architectures
190
- )
191
- self.is_multimodal_chunked_prefill_supported = (
192
- enable_multimodal
193
- and is_multimodal_chunked_prefill_supported(self.hf_config.architectures)
194
- )
195
- self.is_encoder_decoder = is_encoder_decoder_model(self.hf_config.architectures)
196
- self.dtype = _get_and_verify_dtype(self.hf_text_config, dtype)
197
-
198
- # Derive context length
253
+ def _derive_context_length(self, context_length: int):
254
+ is_draft_model = self.is_draft_model
199
255
  derived_context_len = get_context_length(self.hf_text_config)
256
+
200
257
  if context_length is not None:
201
258
  if context_length > derived_context_len:
202
259
  reason = "Target model's" if is_draft_model else "User-specified"
@@ -205,11 +262,16 @@ class ModelConfig:
205
262
  f"This may lead to incorrect model outputs or CUDA errors. Note that the derived context_length may differ from max_position_embeddings in the model's config."
206
263
  )
207
264
  if (
208
- get_bool_env_var("SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN")
265
+ envs.SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN.get()
209
266
  or is_in_ci() # FIXME: fix this special case
210
267
  ):
211
268
  logger.warning(msg)
212
269
  self.context_len = context_length
270
+ if is_draft_model:
271
+ self.hf_text_config.max_position_embeddings = context_length
272
+ logger.warning(
273
+ f"Overriding the draft model's max_position_embeddings to {context_length}."
274
+ )
213
275
  else:
214
276
  raise ValueError(
215
277
  f"{msg} To allow overriding this maximum, set the env var SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN=1"
@@ -219,6 +281,10 @@ class ModelConfig:
219
281
  else:
220
282
  self.context_len = derived_context_len
221
283
 
284
+ # Transfer context_len to HuggingFace config so models can access it
285
+ self.hf_config.context_len = self.context_len
286
+
287
+ def _derive_model_shapes(self):
222
288
  # Unify the config keys for hf_text_config
223
289
  self.head_dim = getattr(
224
290
  self.hf_text_config,
@@ -229,6 +295,7 @@ class ModelConfig:
229
295
  # FIXME: temporary special judge for MLA architecture
230
296
  if (
231
297
  "DeepseekV2ForCausalLM" in self.hf_config.architectures
298
+ or "DeepseekV32ForCausalLM" in self.hf_config.architectures
232
299
  or "DeepseekV3ForCausalLM" in self.hf_config.architectures
233
300
  or "DeepseekV3ForCausalLMNextN" in self.hf_config.architectures
234
301
  or "LongcatFlashForCausalLM" in self.hf_config.architectures
@@ -241,6 +308,11 @@ class ModelConfig:
241
308
  self.qk_nope_head_dim = self.hf_config.qk_nope_head_dim
242
309
  self.qk_rope_head_dim = self.hf_config.qk_rope_head_dim
243
310
  self.v_head_dim = self.hf_config.v_head_dim
311
+ self.index_head_dim = (
312
+ get_nsa_index_head_dim(self.hf_config)
313
+ if is_deepseek_nsa(self.hf_config)
314
+ else None
315
+ )
244
316
 
245
317
  # Handle rope scaling with yarn
246
318
  self.scaling = 1 / math.sqrt(self.qk_nope_head_dim + self.qk_rope_head_dim)
@@ -313,45 +385,6 @@ class ModelConfig:
313
385
  )
314
386
  self.vocab_size = self.hf_text_config.vocab_size
315
387
 
316
- # Verify quantization
317
- self._verify_quantization()
318
-
319
- # Verify dual-chunk attention config
320
- self._verify_dual_chunk_attention_config()
321
-
322
- # Cache attributes
323
- self.hf_eos_token_id = self.get_hf_eos_token_id()
324
-
325
- # multimodal
326
- self.image_token_id = getattr(
327
- self.hf_config, "image_token_id", None
328
- ) or getattr(self.hf_config, "image_token_index", None)
329
-
330
- @staticmethod
331
- def from_server_args(
332
- server_args: ServerArgs,
333
- model_path: str = None,
334
- model_revision: str = None,
335
- **kwargs,
336
- ):
337
- return ModelConfig(
338
- model_path=model_path or server_args.model_path,
339
- trust_remote_code=server_args.trust_remote_code,
340
- revision=model_revision or server_args.revision,
341
- context_length=server_args.context_length,
342
- model_override_args=server_args.json_model_override_args,
343
- is_embedding=server_args.is_embedding,
344
- enable_multimodal=server_args.enable_multimodal,
345
- dtype=server_args.dtype,
346
- quantization=server_args.quantization,
347
- hybrid_kvcache_ratio=server_args.hybrid_kvcache_ratio,
348
- model_impl=server_args.model_impl,
349
- remote_instance_weight_loader_seed_instance_ip=server_args.remote_instance_weight_loader_seed_instance_ip,
350
- remote_instance_weight_loader_seed_instance_service_port=server_args.remote_instance_weight_loader_seed_instance_service_port,
351
- remote_instance_weight_loader_send_weights_group_ports=server_args.remote_instance_weight_loader_send_weights_group_ports,
352
- **kwargs,
353
- )
354
-
355
388
  def get_total_num_attention_heads(self) -> int:
356
389
  return self.num_attention_heads
357
390
 
@@ -452,13 +485,31 @@ class ModelConfig:
452
485
  from huggingface_hub import HfApi
453
486
 
454
487
  hf_api = HfApi()
455
- if hf_api.file_exists(self.model_path, "hf_quant_config.json"):
488
+
489
+ def check_hf_quant_config():
490
+ return hf_api.file_exists(
491
+ self.model_path, "hf_quant_config.json"
492
+ )
493
+
494
+ # Retry HF API call up to 3 times
495
+ file_exists = retry(
496
+ check_hf_quant_config,
497
+ max_retry=2,
498
+ initial_delay=1.0,
499
+ max_delay=5.0,
500
+ )
501
+
502
+ if file_exists:
456
503
  quant_cfg = modelopt_quant_config
504
+
457
505
  except huggingface_hub.errors.OfflineModeIsEnabled:
458
506
  logger.warning(
459
507
  "Offline mode is enabled, skipping hf_quant_config.json check"
460
508
  )
461
- pass
509
+ except Exception as e:
510
+ logger.warning(
511
+ f"Failed to check hf_quant_config.json: {self.model_path} {e}"
512
+ )
462
513
 
463
514
  elif os.path.exists(os.path.join(self.model_path, "hf_quant_config.json")):
464
515
  quant_config_file = os.path.join(
@@ -586,7 +637,7 @@ class ModelConfig:
586
637
  "sparse_attention_enabled"
587
638
  ] = True
588
639
 
589
- def get_hf_eos_token_id(self) -> Optional[Set[int]]:
640
+ def _get_hf_eos_token_id(self) -> Optional[Set[int]]:
590
641
  eos_ids = getattr(self.hf_config, "eos_token_id", None)
591
642
  if eos_ids is not None:
592
643
  # it can be either int or list of int
@@ -606,7 +657,7 @@ class ModelConfig:
606
657
  eos_ids = eos_ids | generation_eos_ids
607
658
  return eos_ids
608
659
 
609
- def maybe_pull_model_tokenizer_from_remote(self) -> None:
660
+ def _maybe_pull_model_tokenizer_from_remote(self) -> None:
610
661
  """
611
662
  Pull the model config files to a temporary
612
663
  directory in case of remote.
@@ -749,6 +800,8 @@ multimodal_model_archs = [
749
800
  "Qwen2AudioForConditionalGeneration",
750
801
  "Qwen2VLForConditionalGeneration",
751
802
  "Qwen2_5_VLForConditionalGeneration",
803
+ "Qwen3VLForConditionalGeneration",
804
+ "Qwen3VLMoeForConditionalGeneration",
752
805
  "KimiVLForConditionalGeneration",
753
806
  "InternVLChatModel",
754
807
  "InternS1ForConditionalGeneration",
@@ -756,6 +809,8 @@ multimodal_model_archs = [
756
809
  "VILAForConditionalGeneration",
757
810
  "Step3VLForConditionalGeneration",
758
811
  "DotsVLMForCausalLM",
812
+ "DotsOCRForCausalLM",
813
+ "Sarashina2VisionForCausalLM",
759
814
  ]
760
815
 
761
816