sglang 0.5.3rc0__py3-none-any.whl → 0.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (482)
  1. sglang/bench_one_batch.py +54 -37
  2. sglang/bench_one_batch_server.py +340 -34
  3. sglang/bench_serving.py +340 -159
  4. sglang/check_env.py +1 -1
  5. sglang/compile_deep_gemm.py +6 -2
  6. sglang/global_config.py +1 -25
  7. sglang/lang/api.py +6 -0
  8. sglang/lang/backend/runtime_endpoint.py +1 -1
  9. sglang/lang/interpreter.py +1 -0
  10. sglang/lang/ir.py +13 -0
  11. sglang/launch_server.py +9 -2
  12. sglang/profiler.py +20 -3
  13. sglang/srt/_custom_ops.py +1 -1
  14. sglang/srt/batch_invariant_ops/__init__.py +27 -0
  15. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +547 -0
  16. sglang/srt/checkpoint_engine/checkpoint_engine_worker.py +142 -0
  17. sglang/srt/compilation/backend.py +437 -0
  18. sglang/srt/compilation/compilation_config.py +20 -0
  19. sglang/srt/compilation/compilation_counter.py +47 -0
  20. sglang/srt/compilation/compile.py +210 -0
  21. sglang/srt/compilation/compiler_interface.py +503 -0
  22. sglang/srt/compilation/cuda_piecewise_backend.py +228 -0
  23. sglang/srt/compilation/fix_functionalization.py +134 -0
  24. sglang/srt/compilation/fx_utils.py +83 -0
  25. sglang/srt/compilation/inductor_pass.py +140 -0
  26. sglang/srt/compilation/pass_manager.py +66 -0
  27. sglang/srt/compilation/piecewise_context_manager.py +40 -0
  28. sglang/srt/compilation/weak_ref_tensor_jit.py +16 -0
  29. sglang/srt/configs/__init__.py +8 -0
  30. sglang/srt/configs/deepseek_ocr.py +262 -0
  31. sglang/srt/configs/deepseekvl2.py +194 -96
  32. sglang/srt/configs/dots_ocr.py +64 -0
  33. sglang/srt/configs/dots_vlm.py +2 -7
  34. sglang/srt/configs/falcon_h1.py +309 -0
  35. sglang/srt/configs/load_config.py +33 -2
  36. sglang/srt/configs/mamba_utils.py +117 -0
  37. sglang/srt/configs/model_config.py +284 -118
  38. sglang/srt/configs/modelopt_config.py +30 -0
  39. sglang/srt/configs/nemotron_h.py +286 -0
  40. sglang/srt/configs/olmo3.py +105 -0
  41. sglang/srt/configs/points_v15_chat.py +29 -0
  42. sglang/srt/configs/qwen3_next.py +11 -47
  43. sglang/srt/configs/qwen3_omni.py +613 -0
  44. sglang/srt/configs/qwen3_vl.py +576 -0
  45. sglang/srt/connector/remote_instance.py +1 -1
  46. sglang/srt/constrained/base_grammar_backend.py +6 -1
  47. sglang/srt/constrained/llguidance_backend.py +5 -0
  48. sglang/srt/constrained/outlines_backend.py +1 -1
  49. sglang/srt/constrained/outlines_jump_forward.py +1 -1
  50. sglang/srt/constrained/reasoner_grammar_backend.py +9 -6
  51. sglang/srt/constrained/utils.py +12 -0
  52. sglang/srt/constrained/xgrammar_backend.py +26 -15
  53. sglang/srt/debug_utils/dumper.py +10 -3
  54. sglang/srt/disaggregation/ascend/conn.py +2 -2
  55. sglang/srt/disaggregation/ascend/transfer_engine.py +48 -10
  56. sglang/srt/disaggregation/base/conn.py +17 -4
  57. sglang/srt/disaggregation/common/conn.py +268 -98
  58. sglang/srt/disaggregation/decode.py +172 -39
  59. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
  60. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +25 -16
  61. sglang/srt/disaggregation/fake/conn.py +11 -3
  62. sglang/srt/disaggregation/mooncake/conn.py +203 -555
  63. sglang/srt/disaggregation/nixl/conn.py +217 -63
  64. sglang/srt/disaggregation/prefill.py +113 -270
  65. sglang/srt/disaggregation/utils.py +36 -5
  66. sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
  67. sglang/srt/distributed/device_communicators/custom_all_reduce.py +6 -6
  68. sglang/srt/distributed/device_communicators/pymscclpp.py +2 -2
  69. sglang/srt/distributed/device_communicators/pynccl.py +24 -12
  70. sglang/srt/distributed/device_communicators/pynccl_allocator.py +2 -2
  71. sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
  72. sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
  73. sglang/srt/distributed/naive_distributed.py +5 -4
  74. sglang/srt/distributed/parallel_state.py +203 -97
  75. sglang/srt/elastic_ep/elastic_ep.py +74 -0
  76. sglang/srt/entrypoints/context.py +3 -2
  77. sglang/srt/entrypoints/engine.py +85 -65
  78. sglang/srt/entrypoints/grpc_server.py +632 -305
  79. sglang/srt/entrypoints/harmony_utils.py +2 -2
  80. sglang/srt/entrypoints/http_server.py +169 -17
  81. sglang/srt/entrypoints/http_server_engine.py +1 -7
  82. sglang/srt/entrypoints/openai/protocol.py +327 -34
  83. sglang/srt/entrypoints/openai/serving_base.py +74 -8
  84. sglang/srt/entrypoints/openai/serving_chat.py +202 -118
  85. sglang/srt/entrypoints/openai/serving_classify.py +204 -0
  86. sglang/srt/entrypoints/openai/serving_completions.py +20 -4
  87. sglang/srt/entrypoints/openai/serving_embedding.py +1 -0
  88. sglang/srt/entrypoints/openai/serving_responses.py +47 -2
  89. sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
  90. sglang/srt/environ.py +323 -0
  91. sglang/srt/eplb/eplb_algorithms/__init__.py +18 -1
  92. sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -2
  93. sglang/srt/eplb/eplb_algorithms/elasticity_aware.py +87 -0
  94. sglang/srt/eplb/expert_distribution.py +3 -4
  95. sglang/srt/eplb/expert_location.py +30 -5
  96. sglang/srt/eplb/expert_location_dispatch.py +2 -2
  97. sglang/srt/eplb/expert_location_updater.py +2 -2
  98. sglang/srt/function_call/base_format_detector.py +17 -18
  99. sglang/srt/function_call/function_call_parser.py +21 -16
  100. sglang/srt/function_call/glm4_moe_detector.py +4 -8
  101. sglang/srt/function_call/gpt_oss_detector.py +24 -1
  102. sglang/srt/function_call/json_array_parser.py +61 -0
  103. sglang/srt/function_call/kimik2_detector.py +17 -4
  104. sglang/srt/function_call/utils.py +98 -7
  105. sglang/srt/grpc/compile_proto.py +245 -0
  106. sglang/srt/grpc/grpc_request_manager.py +915 -0
  107. sglang/srt/grpc/health_servicer.py +189 -0
  108. sglang/srt/grpc/scheduler_launcher.py +181 -0
  109. sglang/srt/grpc/sglang_scheduler_pb2.py +81 -68
  110. sglang/srt/grpc/sglang_scheduler_pb2.pyi +124 -61
  111. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +92 -1
  112. sglang/srt/layers/activation.py +11 -7
  113. sglang/srt/layers/attention/aiter_backend.py +17 -18
  114. sglang/srt/layers/attention/ascend_backend.py +125 -10
  115. sglang/srt/layers/attention/attention_registry.py +226 -0
  116. sglang/srt/layers/attention/base_attn_backend.py +32 -4
  117. sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
  118. sglang/srt/layers/attention/double_sparsity_backend.py +2 -2
  119. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
  120. sglang/srt/layers/attention/fla/chunk.py +0 -1
  121. sglang/srt/layers/attention/fla/chunk_o.py +1 -1
  122. sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +2 -2
  123. sglang/srt/layers/attention/fla/fused_recurrent.py +4 -4
  124. sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +2 -2
  125. sglang/srt/layers/attention/fla/index.py +0 -2
  126. sglang/srt/layers/attention/fla/layernorm_gated.py +50 -32
  127. sglang/srt/layers/attention/fla/utils.py +0 -3
  128. sglang/srt/layers/attention/fla/wy_fast.py +0 -2
  129. sglang/srt/layers/attention/flashattention_backend.py +52 -15
  130. sglang/srt/layers/attention/flashinfer_backend.py +357 -212
  131. sglang/srt/layers/attention/flashinfer_mla_backend.py +31 -33
  132. sglang/srt/layers/attention/flashmla_backend.py +9 -7
  133. sglang/srt/layers/attention/hybrid_attn_backend.py +12 -4
  134. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +236 -133
  135. sglang/srt/layers/attention/intel_amx_backend.py +1 -1
  136. sglang/srt/layers/attention/mamba/causal_conv1d.py +2 -1
  137. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +24 -103
  138. sglang/srt/layers/attention/mamba/mamba.py +514 -1
  139. sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
  140. sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
  141. sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
  142. sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
  143. sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
  144. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +214 -0
  145. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +562 -0
  146. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +646 -0
  147. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +261 -0
  148. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +264 -0
  149. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
  150. sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
  151. sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
  152. sglang/srt/layers/attention/nsa/nsa_indexer.py +718 -0
  153. sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
  154. sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
  155. sglang/srt/layers/attention/nsa/transform_index.py +144 -0
  156. sglang/srt/layers/attention/nsa/triton_kernel.py +136 -0
  157. sglang/srt/layers/attention/nsa/utils.py +23 -0
  158. sglang/srt/layers/attention/nsa_backend.py +1201 -0
  159. sglang/srt/layers/attention/tbo_backend.py +6 -6
  160. sglang/srt/layers/attention/torch_flex_backend.py +325 -0
  161. sglang/srt/layers/attention/triton_backend.py +249 -42
  162. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +2 -2
  163. sglang/srt/layers/attention/triton_ops/extend_attention.py +539 -44
  164. sglang/srt/layers/attention/trtllm_mha_backend.py +7 -9
  165. sglang/srt/layers/attention/trtllm_mla_backend.py +523 -48
  166. sglang/srt/layers/attention/utils.py +11 -7
  167. sglang/srt/layers/attention/vision.py +61 -3
  168. sglang/srt/layers/attention/wave_backend.py +4 -4
  169. sglang/srt/layers/attention/xpu_backend.py +1028 -0
  170. sglang/srt/layers/communicator.py +19 -7
  171. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/compile_utils.py +4 -8
  172. sglang/srt/layers/deep_gemm_wrapper/configurer.py +25 -0
  173. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/entrypoint.py +3 -3
  174. sglang/srt/layers/dp_attention.py +28 -1
  175. sglang/srt/layers/elementwise.py +3 -1
  176. sglang/srt/layers/layernorm.py +47 -15
  177. sglang/srt/layers/linear.py +30 -5
  178. sglang/srt/layers/logits_processor.py +161 -18
  179. sglang/srt/layers/modelopt_utils.py +11 -0
  180. sglang/srt/layers/moe/cutlass_moe.py +0 -2
  181. sglang/srt/layers/moe/cutlass_w4a8_moe.py +213 -21
  182. sglang/srt/layers/moe/ep_moe/kernels.py +36 -458
  183. sglang/srt/layers/moe/ep_moe/layer.py +243 -448
  184. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +52 -25
  185. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  186. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json +146 -0
  187. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  188. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  189. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
  190. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +17 -5
  191. sglang/srt/layers/moe/fused_moe_triton/layer.py +86 -81
  192. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +18 -42
  193. sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
  194. sglang/srt/layers/moe/moe_runner/runner.py +3 -0
  195. sglang/srt/layers/moe/moe_runner/triton.py +3 -1
  196. sglang/srt/layers/moe/rocm_moe_utils.py +0 -1
  197. sglang/srt/layers/moe/router.py +51 -15
  198. sglang/srt/layers/moe/token_dispatcher/__init__.py +10 -0
  199. sglang/srt/layers/moe/token_dispatcher/base.py +1 -1
  200. sglang/srt/layers/moe/token_dispatcher/deepep.py +177 -106
  201. sglang/srt/layers/moe/token_dispatcher/mooncake.py +386 -0
  202. sglang/srt/layers/moe/token_dispatcher/standard.py +46 -0
  203. sglang/srt/layers/moe/topk.py +3 -2
  204. sglang/srt/layers/moe/utils.py +27 -1
  205. sglang/srt/layers/parameter.py +23 -6
  206. sglang/srt/layers/quantization/__init__.py +2 -53
  207. sglang/srt/layers/quantization/awq.py +183 -6
  208. sglang/srt/layers/quantization/awq_triton.py +29 -0
  209. sglang/srt/layers/quantization/base_config.py +20 -1
  210. sglang/srt/layers/quantization/compressed_tensors/__init__.py +7 -0
  211. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +21 -49
  212. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +421 -70
  213. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +5 -0
  214. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +4 -22
  215. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
  216. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +339 -0
  217. sglang/srt/layers/quantization/fp8.py +86 -20
  218. sglang/srt/layers/quantization/fp8_kernel.py +55 -10
  219. sglang/srt/layers/quantization/fp8_utils.py +43 -15
  220. sglang/srt/layers/quantization/fpgemm_fp8.py +2 -3
  221. sglang/srt/layers/quantization/gptq.py +0 -1
  222. sglang/srt/layers/quantization/int8_kernel.py +18 -2
  223. sglang/srt/layers/quantization/marlin_utils.py +12 -0
  224. sglang/srt/layers/quantization/modelopt_quant.py +141 -81
  225. sglang/srt/layers/quantization/mxfp4.py +17 -34
  226. sglang/srt/layers/quantization/petit.py +1 -1
  227. sglang/srt/layers/quantization/quark/quark.py +3 -1
  228. sglang/srt/layers/quantization/quark/quark_moe.py +18 -5
  229. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +0 -7
  230. sglang/srt/layers/quantization/unquant.py +1 -4
  231. sglang/srt/layers/quantization/utils.py +0 -1
  232. sglang/srt/layers/quantization/w4afp8.py +51 -24
  233. sglang/srt/layers/quantization/w8a8_int8.py +45 -27
  234. sglang/srt/layers/radix_attention.py +59 -9
  235. sglang/srt/layers/rotary_embedding.py +750 -46
  236. sglang/srt/layers/sampler.py +84 -16
  237. sglang/srt/layers/sparse_pooler.py +98 -0
  238. sglang/srt/layers/utils.py +23 -1
  239. sglang/srt/layers/vocab_parallel_embedding.py +4 -1
  240. sglang/srt/lora/backend/base_backend.py +3 -3
  241. sglang/srt/lora/backend/chunked_backend.py +348 -0
  242. sglang/srt/lora/backend/triton_backend.py +9 -4
  243. sglang/srt/lora/eviction_policy.py +139 -0
  244. sglang/srt/lora/lora.py +7 -5
  245. sglang/srt/lora/lora_manager.py +33 -7
  246. sglang/srt/lora/lora_registry.py +1 -1
  247. sglang/srt/lora/mem_pool.py +41 -17
  248. sglang/srt/lora/triton_ops/__init__.py +4 -0
  249. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
  250. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +176 -0
  251. sglang/srt/lora/utils.py +7 -5
  252. sglang/srt/managers/cache_controller.py +83 -152
  253. sglang/srt/managers/data_parallel_controller.py +156 -87
  254. sglang/srt/managers/detokenizer_manager.py +51 -24
  255. sglang/srt/managers/io_struct.py +223 -129
  256. sglang/srt/managers/mm_utils.py +49 -10
  257. sglang/srt/managers/multi_tokenizer_mixin.py +83 -98
  258. sglang/srt/managers/multimodal_processor.py +1 -2
  259. sglang/srt/managers/overlap_utils.py +130 -0
  260. sglang/srt/managers/schedule_batch.py +340 -529
  261. sglang/srt/managers/schedule_policy.py +158 -18
  262. sglang/srt/managers/scheduler.py +665 -620
  263. sglang/srt/managers/scheduler_input_blocker.py +1 -1
  264. sglang/srt/managers/scheduler_metrics_mixin.py +150 -131
  265. sglang/srt/managers/scheduler_output_processor_mixin.py +337 -122
  266. sglang/srt/managers/scheduler_pp_mixin.py +341 -0
  267. sglang/srt/managers/scheduler_profiler_mixin.py +62 -15
  268. sglang/srt/managers/scheduler_runtime_checker_mixin.py +217 -0
  269. sglang/srt/managers/scheduler_update_weights_mixin.py +40 -14
  270. sglang/srt/managers/tokenizer_communicator_mixin.py +141 -19
  271. sglang/srt/managers/tokenizer_manager.py +462 -226
  272. sglang/srt/managers/tp_worker.py +217 -156
  273. sglang/srt/managers/utils.py +79 -47
  274. sglang/srt/mem_cache/allocator.py +21 -22
  275. sglang/srt/mem_cache/allocator_ascend.py +42 -28
  276. sglang/srt/mem_cache/base_prefix_cache.py +3 -3
  277. sglang/srt/mem_cache/chunk_cache.py +20 -2
  278. sglang/srt/mem_cache/common.py +480 -0
  279. sglang/srt/mem_cache/evict_policy.py +38 -0
  280. sglang/srt/mem_cache/hicache_storage.py +44 -2
  281. sglang/srt/mem_cache/hiradix_cache.py +134 -34
  282. sglang/srt/mem_cache/mamba_radix_cache.py +993 -0
  283. sglang/srt/mem_cache/memory_pool.py +602 -208
  284. sglang/srt/mem_cache/memory_pool_host.py +134 -183
  285. sglang/srt/mem_cache/multimodal_cache.py +0 -1
  286. sglang/srt/mem_cache/radix_cache.py +263 -78
  287. sglang/srt/mem_cache/radix_cache_cpp.py +29 -21
  288. sglang/srt/mem_cache/storage/__init__.py +10 -0
  289. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +157 -0
  290. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +97 -0
  291. sglang/srt/mem_cache/storage/backend_factory.py +223 -0
  292. sglang/srt/mem_cache/storage/eic/eic_storage.py +777 -0
  293. sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
  294. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +0 -1
  295. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +180 -59
  296. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +15 -9
  297. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +217 -26
  298. sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +38 -9
  299. sglang/srt/mem_cache/storage/nixl/nixl_utils.py +1 -1
  300. sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +17 -2
  301. sglang/srt/mem_cache/swa_radix_cache.py +115 -58
  302. sglang/srt/metrics/collector.py +113 -120
  303. sglang/srt/metrics/func_timer.py +3 -8
  304. sglang/srt/metrics/utils.py +8 -1
  305. sglang/srt/model_executor/cpu_graph_runner.py +2 -2
  306. sglang/srt/model_executor/cuda_graph_runner.py +81 -36
  307. sglang/srt/model_executor/forward_batch_info.py +40 -50
  308. sglang/srt/model_executor/model_runner.py +507 -319
  309. sglang/srt/model_executor/npu_graph_runner.py +11 -5
  310. sglang/srt/model_executor/piecewise_cuda_graph_runner.py +539 -0
  311. sglang/srt/model_loader/__init__.py +1 -1
  312. sglang/srt/model_loader/loader.py +438 -37
  313. sglang/srt/model_loader/utils.py +0 -1
  314. sglang/srt/model_loader/weight_utils.py +200 -27
  315. sglang/srt/models/apertus.py +2 -3
  316. sglang/srt/models/arcee.py +2 -2
  317. sglang/srt/models/bailing_moe.py +40 -56
  318. sglang/srt/models/bailing_moe_nextn.py +3 -4
  319. sglang/srt/models/bert.py +1 -1
  320. sglang/srt/models/deepseek_nextn.py +25 -4
  321. sglang/srt/models/deepseek_ocr.py +1516 -0
  322. sglang/srt/models/deepseek_v2.py +793 -235
  323. sglang/srt/models/dots_ocr.py +171 -0
  324. sglang/srt/models/dots_vlm.py +0 -1
  325. sglang/srt/models/dots_vlm_vit.py +1 -1
  326. sglang/srt/models/falcon_h1.py +570 -0
  327. sglang/srt/models/gemma3_causal.py +0 -2
  328. sglang/srt/models/gemma3_mm.py +17 -1
  329. sglang/srt/models/gemma3n_mm.py +2 -3
  330. sglang/srt/models/glm4_moe.py +17 -40
  331. sglang/srt/models/glm4_moe_nextn.py +4 -4
  332. sglang/srt/models/glm4v.py +3 -2
  333. sglang/srt/models/glm4v_moe.py +6 -6
  334. sglang/srt/models/gpt_oss.py +12 -35
  335. sglang/srt/models/grok.py +10 -23
  336. sglang/srt/models/hunyuan.py +2 -7
  337. sglang/srt/models/interns1.py +0 -1
  338. sglang/srt/models/kimi_vl.py +1 -7
  339. sglang/srt/models/kimi_vl_moonvit.py +4 -2
  340. sglang/srt/models/llama.py +6 -2
  341. sglang/srt/models/llama_eagle3.py +1 -1
  342. sglang/srt/models/longcat_flash.py +6 -23
  343. sglang/srt/models/longcat_flash_nextn.py +4 -15
  344. sglang/srt/models/mimo.py +2 -13
  345. sglang/srt/models/mimo_mtp.py +1 -2
  346. sglang/srt/models/minicpmo.py +7 -5
  347. sglang/srt/models/mixtral.py +1 -4
  348. sglang/srt/models/mllama.py +1 -1
  349. sglang/srt/models/mllama4.py +27 -6
  350. sglang/srt/models/nemotron_h.py +511 -0
  351. sglang/srt/models/olmo2.py +31 -4
  352. sglang/srt/models/opt.py +5 -5
  353. sglang/srt/models/phi.py +1 -1
  354. sglang/srt/models/phi4mm.py +1 -1
  355. sglang/srt/models/phimoe.py +0 -1
  356. sglang/srt/models/pixtral.py +0 -3
  357. sglang/srt/models/points_v15_chat.py +186 -0
  358. sglang/srt/models/qwen.py +0 -1
  359. sglang/srt/models/qwen2.py +0 -7
  360. sglang/srt/models/qwen2_5_vl.py +5 -5
  361. sglang/srt/models/qwen2_audio.py +2 -15
  362. sglang/srt/models/qwen2_moe.py +70 -4
  363. sglang/srt/models/qwen2_vl.py +6 -3
  364. sglang/srt/models/qwen3.py +18 -3
  365. sglang/srt/models/qwen3_moe.py +50 -38
  366. sglang/srt/models/qwen3_next.py +43 -21
  367. sglang/srt/models/qwen3_next_mtp.py +3 -4
  368. sglang/srt/models/qwen3_omni_moe.py +661 -0
  369. sglang/srt/models/qwen3_vl.py +791 -0
  370. sglang/srt/models/qwen3_vl_moe.py +343 -0
  371. sglang/srt/models/registry.py +15 -3
  372. sglang/srt/models/roberta.py +55 -3
  373. sglang/srt/models/sarashina2_vision.py +268 -0
  374. sglang/srt/models/solar.py +505 -0
  375. sglang/srt/models/starcoder2.py +357 -0
  376. sglang/srt/models/step3_vl.py +3 -5
  377. sglang/srt/models/torch_native_llama.py +9 -2
  378. sglang/srt/models/utils.py +61 -0
  379. sglang/srt/multimodal/processors/base_processor.py +21 -9
  380. sglang/srt/multimodal/processors/deepseek_ocr.py +37 -0
  381. sglang/srt/multimodal/processors/deepseek_vl_v2.py +0 -3
  382. sglang/srt/multimodal/processors/dots_vlm.py +2 -4
  383. sglang/srt/multimodal/processors/glm4v.py +1 -5
  384. sglang/srt/multimodal/processors/internvl.py +20 -10
  385. sglang/srt/multimodal/processors/janus_pro.py +0 -1
  386. sglang/srt/multimodal/processors/mllama4.py +0 -8
  387. sglang/srt/multimodal/processors/phi4mm.py +0 -1
  388. sglang/srt/multimodal/processors/points_v15_chat.py +52 -0
  389. sglang/srt/multimodal/processors/qwen_vl.py +83 -17
  390. sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
  391. sglang/srt/multimodal/processors/step3_vl.py +1 -1
  392. sglang/srt/parser/conversation.py +41 -0
  393. sglang/srt/parser/jinja_template_utils.py +6 -0
  394. sglang/srt/parser/reasoning_parser.py +0 -1
  395. sglang/srt/sampling/custom_logit_processor.py +77 -2
  396. sglang/srt/sampling/sampling_batch_info.py +36 -23
  397. sglang/srt/sampling/sampling_params.py +75 -0
  398. sglang/srt/server_args.py +1300 -338
  399. sglang/srt/server_args_config_parser.py +146 -0
  400. sglang/srt/single_batch_overlap.py +161 -0
  401. sglang/srt/speculative/base_spec_worker.py +34 -0
  402. sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
  403. sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
  404. sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
  405. sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
  406. sglang/srt/speculative/cpp_ngram/param.h +125 -0
  407. sglang/srt/speculative/cpp_ngram/queue.h +71 -0
  408. sglang/srt/speculative/draft_utils.py +226 -0
  409. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +26 -8
  410. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +26 -3
  411. sglang/srt/speculative/eagle_info.py +786 -0
  412. sglang/srt/speculative/eagle_info_v2.py +458 -0
  413. sglang/srt/speculative/eagle_utils.py +113 -1270
  414. sglang/srt/speculative/eagle_worker.py +120 -285
  415. sglang/srt/speculative/eagle_worker_v2.py +702 -0
  416. sglang/srt/speculative/ngram_info.py +433 -0
  417. sglang/srt/speculative/ngram_worker.py +246 -0
  418. sglang/srt/speculative/spec_info.py +49 -0
  419. sglang/srt/speculative/spec_utils.py +641 -0
  420. sglang/srt/speculative/standalone_worker.py +4 -14
  421. sglang/srt/tokenizer/tiktoken_tokenizer.py +2 -2
  422. sglang/srt/tracing/trace.py +32 -6
  423. sglang/srt/two_batch_overlap.py +35 -18
  424. sglang/srt/utils/__init__.py +2 -0
  425. sglang/srt/{bench_utils.py → utils/bench_utils.py} +4 -2
  426. sglang/srt/{utils.py → utils/common.py} +583 -113
  427. sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +86 -19
  428. sglang/srt/{host_shared_memory.py → utils/host_shared_memory.py} +0 -1
  429. sglang/srt/{offloader.py → utils/offloader.py} +4 -4
  430. sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
  431. sglang/srt/utils/profile_merger.py +199 -0
  432. sglang/srt/utils/rpd_utils.py +452 -0
  433. sglang/srt/utils/slow_rank_detector.py +71 -0
  434. sglang/srt/{torch_memory_saver_adapter.py → utils/torch_memory_saver_adapter.py} +5 -7
  435. sglang/srt/warmup.py +8 -4
  436. sglang/srt/weight_sync/utils.py +1 -1
  437. sglang/test/attention/test_flashattn_backend.py +1 -1
  438. sglang/test/attention/test_flashattn_mla_backend.py +0 -1
  439. sglang/test/attention/test_prefix_chunk_info.py +0 -2
  440. sglang/test/attention/test_trtllm_mla_backend.py +221 -53
  441. sglang/test/few_shot_gsm8k_engine.py +2 -4
  442. sglang/test/get_logits_ut.py +57 -0
  443. sglang/test/kit_matched_stop.py +157 -0
  444. sglang/test/longbench_v2/__init__.py +1 -0
  445. sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
  446. sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
  447. sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
  448. sglang/test/run_eval.py +120 -11
  449. sglang/test/runners.py +3 -1
  450. sglang/test/send_one.py +42 -7
  451. sglang/test/simple_eval_common.py +8 -2
  452. sglang/test/simple_eval_gpqa.py +0 -1
  453. sglang/test/simple_eval_humaneval.py +0 -3
  454. sglang/test/simple_eval_longbench_v2.py +344 -0
  455. sglang/test/simple_eval_mmmu_vlm.py +441 -0
  456. sglang/test/test_block_fp8.py +3 -4
  457. sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -1
  458. sglang/test/test_cutlass_moe.py +1 -2
  459. sglang/test/test_cutlass_w4a8_moe.py +10 -20
  460. sglang/test/test_deterministic.py +430 -0
  461. sglang/test/test_deterministic_utils.py +73 -0
  462. sglang/test/test_disaggregation_utils.py +93 -1
  463. sglang/test/test_marlin_moe.py +0 -1
  464. sglang/test/test_programs.py +1 -1
  465. sglang/test/test_utils.py +432 -16
  466. sglang/utils.py +10 -1
  467. sglang/version.py +1 -1
  468. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/METADATA +64 -43
  469. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/RECORD +476 -346
  470. sglang/srt/entrypoints/grpc_request_manager.py +0 -580
  471. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +0 -32
  472. sglang/srt/managers/tp_worker_overlap_thread.py +0 -319
  473. sglang/srt/mem_cache/lora_radix_cache.py +0 -421
  474. sglang/srt/speculative/build_eagle_tree.py +0 -427
  475. sglang/test/test_block_fp8_ep.py +0 -358
  476. /sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/__init__.py +0 -0
  477. /sglang/srt/{remote_instance_weight_loader_utils.py → model_loader/remote_instance_weight_loader_utils.py} +0 -0
  478. /sglang/srt/{aio_rwlock.py → utils/aio_rwlock.py} +0 -0
  479. /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
  480. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/WHEEL +0 -0
  481. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/licenses/LICENSE +0 -0
  482. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/top_level.txt +0 -0
sglang/srt/models/longcat_flash.py CHANGED
@@ -32,14 +32,10 @@
32
32
 
33
33
  import concurrent.futures
34
34
  import logging
35
- import os
36
- from enum import IntEnum, auto
37
- from typing import Any, Dict, Iterable, Optional, Tuple, Union
35
+ from typing import Iterable, Optional, Tuple
38
36
 
39
37
  import torch
40
- import torch.nn.functional as F
41
38
  from torch import nn
42
- from tqdm import tqdm
43
39
 
44
40
  from sglang.srt.configs import LongcatFlashConfig
45
41
  from sglang.srt.distributed import (
@@ -48,9 +44,8 @@ from sglang.srt.distributed import (
48
44
  )
49
45
  from sglang.srt.eplb.expert_distribution import get_global_expert_distribution_recorder
50
46
  from sglang.srt.eplb.expert_location import ModelConfigForExpertLocation
51
- from sglang.srt.eplb.expert_location_dispatch import ExpertLocationDispatchInfo
47
+ from sglang.srt.layers import deep_gemm_wrapper
52
48
  from sglang.srt.layers.activation import SiluAndMul
53
- from sglang.srt.layers.amx_utils import PackWeightMethod
54
49
  from sglang.srt.layers.communicator import LayerCommunicator, LayerScatterModes
55
50
  from sglang.srt.layers.dp_attention import (
56
51
  get_attention_tp_rank,
@@ -68,7 +63,6 @@ from sglang.srt.layers.moe.ep_moe.kernels import zero_experts_compute_triton
68
63
  from sglang.srt.layers.moe.ep_moe.layer import DeepEPMoE, get_moe_impl_class
69
64
  from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoE
70
65
  from sglang.srt.layers.moe.topk import StandardTopKOutput, TopK
71
- from sglang.srt.layers.quantization import deep_gemm_wrapper
72
66
  from sglang.srt.layers.quantization.base_config import QuantizationConfig
73
67
  from sglang.srt.layers.quantization.fp8_kernel import is_fp8_fnuz
74
68
  from sglang.srt.layers.quantization.fp8_utils import (
@@ -85,26 +79,21 @@ from sglang.srt.layers.vocab_parallel_embedding import (
85
79
  ParallelLMHead,
86
80
  VocabParallelEmbedding,
87
81
  )
88
- from sglang.srt.managers.schedule_batch import global_server_args_dict
89
82
  from sglang.srt.model_executor.forward_batch_info import ForwardBatch
90
83
  from sglang.srt.model_loader.weight_utils import default_weight_loader
91
84
  from sglang.srt.models.deepseek_v2 import DeepseekV2AttentionMLA
85
+ from sglang.srt.server_args import get_global_server_args
92
86
  from sglang.srt.utils import (
93
87
  BumpAllocator,
94
- LazyValue,
95
88
  add_prefix,
96
89
  bind_or_assign,
97
90
  cpu_has_amx_support,
98
91
  get_bool_env_var,
99
92
  get_device_sm,
100
- get_int_env_var,
101
93
  is_cpu,
102
94
  is_cuda,
103
- is_flashinfer_available,
104
95
  is_hip,
105
- is_non_idle_and_non_empty,
106
96
  is_npu,
107
- is_sm100_supported,
108
97
  )
109
98
 
110
99
  _is_hip = is_hip()
@@ -117,13 +106,7 @@ _is_cpu = is_cpu()
117
106
  _device_sm = get_device_sm()
118
107
 
119
108
  if _is_cuda:
120
- from sgl_kernel import (
121
- awq_dequantize,
122
- bmm_fp8,
123
- dsv3_fused_a_gemm,
124
- dsv3_router_gemm,
125
- merge_state_v2,
126
- )
109
+ from sgl_kernel import awq_dequantize
127
110
  elif _is_cpu and _is_cpu_amx_available:
128
111
  pass
129
112
  elif _is_hip:
@@ -131,7 +114,7 @@ elif _is_hip:
131
114
  awq_dequantize_triton as awq_dequantize,
132
115
  )
133
116
  else:
134
- from vllm._custom_ops import awq_dequantize
117
+ pass
135
118
 
136
119
  logger = logging.getLogger(__name__)
137
120
 
@@ -595,7 +578,7 @@ class LongcatFlashForCausalLM(nn.Module):
595
578
  config.hidden_size,
596
579
  quant_config=quant_config,
597
580
  prefix=add_prefix("lm_head", prefix),
598
- use_attn_tp_group=global_server_args_dict["enable_dp_lm_head"],
581
+ use_attn_tp_group=get_global_server_args().enable_dp_lm_head,
599
582
  )
600
583
  self.logits_processor = LogitsProcessor(config)
601
584
 
sglang/srt/models/longcat_flash_nextn.py CHANGED
@@ -32,17 +32,14 @@
32
32
 
33
33
  import concurrent.futures
34
34
  import logging
35
- import os
36
- from enum import IntEnum, auto
37
- from typing import Any, Dict, Iterable, Optional, Tuple, Union
35
+ from typing import Iterable, Optional, Tuple
38
36
 
39
37
  import torch
40
- import torch.nn.functional as F
41
38
  from torch import nn
42
- from tqdm import tqdm
43
39
 
44
40
  from sglang.srt.configs import LongcatFlashConfig
45
41
  from sglang.srt.eplb.expert_distribution import get_global_expert_distribution_recorder
42
+ from sglang.srt.layers import deep_gemm_wrapper
46
43
  from sglang.srt.layers.communicator import LayerCommunicator, LayerScatterModes
47
44
  from sglang.srt.layers.dp_attention import (
48
45
  get_attention_tp_rank,
@@ -52,7 +49,6 @@ from sglang.srt.layers.dp_attention import (
52
49
  from sglang.srt.layers.layernorm import RMSNorm
53
50
  from sglang.srt.layers.linear import ReplicatedLinear
54
51
  from sglang.srt.layers.logits_processor import LogitsProcessor
55
- from sglang.srt.layers.quantization import deep_gemm_wrapper
56
52
  from sglang.srt.layers.quantization.base_config import QuantizationConfig
57
53
  from sglang.srt.layers.quantization.fp8_kernel import is_fp8_fnuz
58
54
  from sglang.srt.layers.quantization.fp8_utils import (
@@ -75,7 +71,6 @@ from sglang.srt.models.deepseek_v2 import DeepseekV2AttentionMLA
75
71
  from sglang.srt.models.longcat_flash import LongcatFlashForCausalLM, LongcatFlashMLP
76
72
  from sglang.srt.utils import (
77
73
  BumpAllocator,
78
- LazyValue,
79
74
  add_prefix,
80
75
  bind_or_assign,
81
76
  cpu_has_amx_support,
@@ -97,13 +92,7 @@ _is_cpu = is_cpu()
97
92
  _device_sm = get_device_sm()
98
93
 
99
94
  if _is_cuda:
100
- from sgl_kernel import (
101
- awq_dequantize,
102
- bmm_fp8,
103
- dsv3_fused_a_gemm,
104
- dsv3_router_gemm,
105
- merge_state_v2,
106
- )
95
+ from sgl_kernel import awq_dequantize
107
96
  elif _is_cpu and _is_cpu_amx_available:
108
97
  pass
109
98
  elif _is_hip:
@@ -111,7 +100,7 @@ elif _is_hip:
111
100
  awq_dequantize_triton as awq_dequantize,
112
101
  )
113
102
  else:
114
- from vllm._custom_ops import awq_dequantize
103
+ pass
115
104
 
116
105
 
117
106
  logger = logging.getLogger(__name__)
sglang/srt/models/mimo.py CHANGED
@@ -1,28 +1,17 @@
1
1
  # Adapted from qwen2.py
2
2
 
3
- from functools import partial
4
- from typing import Any, Dict, Iterable, Optional, Tuple
3
+ from typing import Iterable, Optional, Tuple
5
4
 
6
5
  import torch
7
6
  from torch import nn
8
7
 
9
- from sglang.srt.distributed import (
10
- get_tensor_model_parallel_rank,
11
- get_tensor_model_parallel_world_size,
12
- split_tensor_along_last_dim,
13
- tensor_model_parallel_all_gather,
14
- )
15
- from sglang.srt.layers.layernorm import RMSNorm
16
- from sglang.srt.layers.linear import QKVParallelLinear, RowParallelLinear
17
8
  from sglang.srt.layers.logits_processor import LogitsProcessor
18
9
  from sglang.srt.layers.pooler import Pooler, PoolingType
19
10
  from sglang.srt.layers.quantization.base_config import QuantizationConfig
20
- from sglang.srt.layers.radix_attention import RadixAttention
21
- from sglang.srt.layers.rotary_embedding import get_rope
22
11
  from sglang.srt.layers.vocab_parallel_embedding import ParallelLMHead
23
12
  from sglang.srt.model_executor.forward_batch_info import ForwardBatch
24
13
  from sglang.srt.model_loader.weight_utils import default_weight_loader
25
- from sglang.srt.models.qwen2 import Qwen2DecoderLayer, Qwen2MLP, Qwen2Model
14
+ from sglang.srt.models.qwen2 import Qwen2DecoderLayer, Qwen2Model
26
15
  from sglang.srt.utils import add_prefix
27
16
 
28
17
  MiMoConfig = None
@@ -1,7 +1,6 @@
1
1
  # Adapted from https://github.com/vllm-project/vllm/pull/17433/files and deepseek_nextn.py
2
2
 
3
- from functools import partial
4
- from typing import Any, Dict, Iterable, Optional, Tuple
3
+ from typing import Iterable, Optional, Tuple
5
4
 
6
5
  import torch
7
6
  from torch import nn
@@ -43,7 +43,6 @@ from sglang.srt.managers.mm_utils import (
43
43
  general_mm_embed_routine,
44
44
  )
45
45
  from sglang.srt.managers.schedule_batch import (
46
- Modality,
47
46
  MultimodalDataItem,
48
47
  MultimodalInputs,
49
48
  flatten_nested_list,
@@ -59,8 +58,6 @@ from sglang.srt.utils import logger
59
58
  try:
60
59
  from transformers import LogitsWarper
61
60
  from vector_quantize_pytorch import GroupedResidualFSQ
62
- from vocos import Vocos
63
- from vocos.pretrained import instantiate_class
64
61
 
65
62
  _tts_deps = True
66
63
  except:
@@ -795,8 +792,10 @@ class ConditionalChatTTS(PreTrainedModel):
795
792
  force_no_stop=False,
796
793
  min_new_token=10,
797
794
  max_new_token=50,
798
- logits_warpers: List[LogitsWarper] = [],
799
- logits_processors: List[CustomRepetitionPenaltyLogitsProcessorRepeat] = [],
795
+ logits_warpers: Optional[List[LogitsWarper]] = None,
796
+ logits_processors: Optional[
797
+ List[CustomRepetitionPenaltyLogitsProcessorRepeat]
798
+ ] = None,
800
799
  show_tqdm=False,
801
800
  ):
802
801
  """Generate audio codes in streaming setting or non-streaming setting.
@@ -825,6 +824,9 @@ class ConditionalChatTTS(PreTrainedModel):
825
824
  assert input_ids.shape[0] == 1
826
825
  assert past_key_values is not None
827
826
 
827
+ logits_warpers = logits_warpers or []
828
+ logits_processors = logits_processors or []
829
+
828
830
  # fix: this should not be `input_ids.shape[1]`
829
831
  # start_idx = input_ids.shape[1]
830
832
  start_idx = (
@@ -24,7 +24,6 @@ from torch import nn
24
24
  from transformers import MixtralConfig
25
25
 
26
26
  from sglang.srt.distributed import (
27
- get_moe_expert_parallel_world_size,
28
27
  get_pp_group,
29
28
  get_tensor_model_parallel_world_size,
30
29
  tensor_model_parallel_all_reduce,
@@ -36,7 +35,6 @@ from sglang.srt.layers.linear import (
36
35
  RowParallelLinear,
37
36
  )
38
37
  from sglang.srt.layers.logits_processor import LogitsProcessor
39
- from sglang.srt.layers.moe.ep_moe.layer import EPMoE
40
38
  from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
41
39
  from sglang.srt.layers.moe.topk import TopK
42
40
  from sglang.srt.layers.quantization.base_config import QuantizationConfig
@@ -94,8 +92,7 @@ class MixtralMoE(nn.Module):
94
92
  renormalize=True,
95
93
  )
96
94
 
97
- MoEImpl = EPMoE if get_moe_expert_parallel_world_size() > 1 else FusedMoE
98
- self.experts = MoEImpl(
95
+ self.experts = FusedMoE(
99
96
  num_experts=num_experts,
100
97
  top_k=top_k,
101
98
  layer_id=layer_id,
@@ -901,7 +901,7 @@ class MllamaForConditionalGeneration(nn.Module):
901
901
  img = pixel_values[0, j]
902
902
  num_tiles = img.shape[0]
903
903
  batched_images[i, j, :num_tiles] = img
904
- batched_ar_ids[i, j] = mm_input.mm_items[0].aspect_ratio_id[0, j]
904
+ batched_ar_ids[i, j] = mm_input.mm_items[0].aspect_ratio_ids[0, j]
905
905
 
906
906
  batched_ar_mask[i, j, :num_tiles] = mm_input.mm_items[
907
907
  0
@@ -2,6 +2,7 @@ import json as json_lib
2
2
  import logging
3
3
  import math
4
4
  import os
5
+ import re
5
6
  from collections.abc import Iterable
6
7
  from typing import List, Optional, Set, Tuple
7
8
 
@@ -30,9 +31,9 @@ from sglang.srt.managers.schedule_batch import (
30
31
  Modality,
31
32
  MultimodalDataItem,
32
33
  MultimodalInputs,
33
- global_server_args_dict,
34
34
  )
35
35
  from sglang.srt.model_executor.forward_batch_info import ForwardBatch
36
+ from sglang.srt.server_args import get_global_server_args
36
37
  from sglang.srt.utils import is_cpu
37
38
 
38
39
  _is_cpu = is_cpu()
@@ -291,7 +292,7 @@ class Llama4UnfoldConvolution(nn.Module):
291
292
 
292
293
  def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
293
294
  hidden_states = self.unfold(hidden_states)
294
- hidden_states = hidden_states.permute(0, 2, 1)
295
+ hidden_states = hidden_states.permute(0, 2, 1).contiguous()
295
296
  hidden_states, _ = self.linear(hidden_states)
296
297
  return hidden_states
297
298
 
@@ -422,6 +423,11 @@ class Llama4ForConditionalGeneration(nn.Module):
422
423
  "gate_up_proj": ["gate_proj", "up_proj"],
423
424
  }
424
425
 
426
+ # Pattern to match language model layers only (skip vision_model and multi_modal_projector)
427
+ lora_pattern = re.compile(
428
+ r"^language_model\.model\.layers\.(\d+)\.(?:self_attn|mlp)\.(?:qkv_proj|o_proj|down_proj|gate_up_proj)"
429
+ )
430
+
425
431
  def __init__(
426
432
  self,
427
433
  config: Llama4Config,
@@ -442,13 +448,24 @@ class Llama4ForConditionalGeneration(nn.Module):
442
448
  )
443
449
 
444
450
  self.has_vision = (
445
- self.has_vision_weights and global_server_args_dict["enable_multimodal"]
451
+ self.has_vision_weights and get_global_server_args().enable_multimodal
446
452
  )
447
453
 
448
454
  if self.has_vision:
455
+ # TODO: make this more general
456
+ ignore_quant_layers = getattr(config, "quantization_config", {}).get(
457
+ "ignore", {}
458
+ )
459
+ if (
460
+ "model.layers.vision_model*" in ignore_quant_layers
461
+ and "model.layers.multi_modal_projector*" in ignore_quant_layers
462
+ ):
463
+ vision_quant_config = None
464
+ else:
465
+ vision_quant_config = quant_config
449
466
  self.vision_model = Llama4VisionModel(
450
467
  config.vision_config,
451
- quant_config=quant_config,
468
+ quant_config=vision_quant_config,
452
469
  prefix=add_prefix("vision_model", prefix),
453
470
  )
454
471
 
@@ -544,6 +561,10 @@ class Llama4ForConditionalGeneration(nn.Module):
544
561
 
545
562
  return projected_vision_flat
546
563
 
564
+ def should_apply_lora(self, module_name: str) -> bool:
565
+ """Skip vision model and multi_modal_projector for LoRA."""
566
+ return bool(self.lora_pattern.match(module_name))
567
+
547
568
  def forward(
548
569
  self,
549
570
  input_ids: torch.Tensor,
@@ -560,7 +581,7 @@ class Llama4ForConditionalGeneration(nn.Module):
560
581
  forward_batch=forward_batch,
561
582
  language_model=self.language_model,
562
583
  data_embedding_funcs={
563
- Modality.IMAGE: self.get_image_feature,
584
+ Modality.IMAGE: image_embedding_func,
564
585
  },
565
586
  positions=positions,
566
587
  )
@@ -689,7 +710,7 @@ class Llama4ForConditionalGeneration(nn.Module):
689
710
  """Handle scale parameter remapping. Returns True if handled."""
690
711
  if "scale" in name and "expert" not in name:
691
712
  remapped_name = maybe_remap_kv_scale_name(name, params_dict)
692
- return remapped_name is None
713
+ return remapped_name != name
693
714
  return False
694
715
 
695
716
  def _handle_stacked_params(