sglang 0.5.3rc0__py3-none-any.whl → 0.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (482)
  1. sglang/bench_one_batch.py +54 -37
  2. sglang/bench_one_batch_server.py +340 -34
  3. sglang/bench_serving.py +340 -159
  4. sglang/check_env.py +1 -1
  5. sglang/compile_deep_gemm.py +6 -2
  6. sglang/global_config.py +1 -25
  7. sglang/lang/api.py +6 -0
  8. sglang/lang/backend/runtime_endpoint.py +1 -1
  9. sglang/lang/interpreter.py +1 -0
  10. sglang/lang/ir.py +13 -0
  11. sglang/launch_server.py +9 -2
  12. sglang/profiler.py +20 -3
  13. sglang/srt/_custom_ops.py +1 -1
  14. sglang/srt/batch_invariant_ops/__init__.py +27 -0
  15. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +547 -0
  16. sglang/srt/checkpoint_engine/checkpoint_engine_worker.py +142 -0
  17. sglang/srt/compilation/backend.py +437 -0
  18. sglang/srt/compilation/compilation_config.py +20 -0
  19. sglang/srt/compilation/compilation_counter.py +47 -0
  20. sglang/srt/compilation/compile.py +210 -0
  21. sglang/srt/compilation/compiler_interface.py +503 -0
  22. sglang/srt/compilation/cuda_piecewise_backend.py +228 -0
  23. sglang/srt/compilation/fix_functionalization.py +134 -0
  24. sglang/srt/compilation/fx_utils.py +83 -0
  25. sglang/srt/compilation/inductor_pass.py +140 -0
  26. sglang/srt/compilation/pass_manager.py +66 -0
  27. sglang/srt/compilation/piecewise_context_manager.py +40 -0
  28. sglang/srt/compilation/weak_ref_tensor_jit.py +16 -0
  29. sglang/srt/configs/__init__.py +8 -0
  30. sglang/srt/configs/deepseek_ocr.py +262 -0
  31. sglang/srt/configs/deepseekvl2.py +194 -96
  32. sglang/srt/configs/dots_ocr.py +64 -0
  33. sglang/srt/configs/dots_vlm.py +2 -7
  34. sglang/srt/configs/falcon_h1.py +309 -0
  35. sglang/srt/configs/load_config.py +33 -2
  36. sglang/srt/configs/mamba_utils.py +117 -0
  37. sglang/srt/configs/model_config.py +284 -118
  38. sglang/srt/configs/modelopt_config.py +30 -0
  39. sglang/srt/configs/nemotron_h.py +286 -0
  40. sglang/srt/configs/olmo3.py +105 -0
  41. sglang/srt/configs/points_v15_chat.py +29 -0
  42. sglang/srt/configs/qwen3_next.py +11 -47
  43. sglang/srt/configs/qwen3_omni.py +613 -0
  44. sglang/srt/configs/qwen3_vl.py +576 -0
  45. sglang/srt/connector/remote_instance.py +1 -1
  46. sglang/srt/constrained/base_grammar_backend.py +6 -1
  47. sglang/srt/constrained/llguidance_backend.py +5 -0
  48. sglang/srt/constrained/outlines_backend.py +1 -1
  49. sglang/srt/constrained/outlines_jump_forward.py +1 -1
  50. sglang/srt/constrained/reasoner_grammar_backend.py +9 -6
  51. sglang/srt/constrained/utils.py +12 -0
  52. sglang/srt/constrained/xgrammar_backend.py +26 -15
  53. sglang/srt/debug_utils/dumper.py +10 -3
  54. sglang/srt/disaggregation/ascend/conn.py +2 -2
  55. sglang/srt/disaggregation/ascend/transfer_engine.py +48 -10
  56. sglang/srt/disaggregation/base/conn.py +17 -4
  57. sglang/srt/disaggregation/common/conn.py +268 -98
  58. sglang/srt/disaggregation/decode.py +172 -39
  59. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
  60. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +25 -16
  61. sglang/srt/disaggregation/fake/conn.py +11 -3
  62. sglang/srt/disaggregation/mooncake/conn.py +203 -555
  63. sglang/srt/disaggregation/nixl/conn.py +217 -63
  64. sglang/srt/disaggregation/prefill.py +113 -270
  65. sglang/srt/disaggregation/utils.py +36 -5
  66. sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
  67. sglang/srt/distributed/device_communicators/custom_all_reduce.py +6 -6
  68. sglang/srt/distributed/device_communicators/pymscclpp.py +2 -2
  69. sglang/srt/distributed/device_communicators/pynccl.py +24 -12
  70. sglang/srt/distributed/device_communicators/pynccl_allocator.py +2 -2
  71. sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
  72. sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
  73. sglang/srt/distributed/naive_distributed.py +5 -4
  74. sglang/srt/distributed/parallel_state.py +203 -97
  75. sglang/srt/elastic_ep/elastic_ep.py +74 -0
  76. sglang/srt/entrypoints/context.py +3 -2
  77. sglang/srt/entrypoints/engine.py +85 -65
  78. sglang/srt/entrypoints/grpc_server.py +632 -305
  79. sglang/srt/entrypoints/harmony_utils.py +2 -2
  80. sglang/srt/entrypoints/http_server.py +169 -17
  81. sglang/srt/entrypoints/http_server_engine.py +1 -7
  82. sglang/srt/entrypoints/openai/protocol.py +327 -34
  83. sglang/srt/entrypoints/openai/serving_base.py +74 -8
  84. sglang/srt/entrypoints/openai/serving_chat.py +202 -118
  85. sglang/srt/entrypoints/openai/serving_classify.py +204 -0
  86. sglang/srt/entrypoints/openai/serving_completions.py +20 -4
  87. sglang/srt/entrypoints/openai/serving_embedding.py +1 -0
  88. sglang/srt/entrypoints/openai/serving_responses.py +47 -2
  89. sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
  90. sglang/srt/environ.py +323 -0
  91. sglang/srt/eplb/eplb_algorithms/__init__.py +18 -1
  92. sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -2
  93. sglang/srt/eplb/eplb_algorithms/elasticity_aware.py +87 -0
  94. sglang/srt/eplb/expert_distribution.py +3 -4
  95. sglang/srt/eplb/expert_location.py +30 -5
  96. sglang/srt/eplb/expert_location_dispatch.py +2 -2
  97. sglang/srt/eplb/expert_location_updater.py +2 -2
  98. sglang/srt/function_call/base_format_detector.py +17 -18
  99. sglang/srt/function_call/function_call_parser.py +21 -16
  100. sglang/srt/function_call/glm4_moe_detector.py +4 -8
  101. sglang/srt/function_call/gpt_oss_detector.py +24 -1
  102. sglang/srt/function_call/json_array_parser.py +61 -0
  103. sglang/srt/function_call/kimik2_detector.py +17 -4
  104. sglang/srt/function_call/utils.py +98 -7
  105. sglang/srt/grpc/compile_proto.py +245 -0
  106. sglang/srt/grpc/grpc_request_manager.py +915 -0
  107. sglang/srt/grpc/health_servicer.py +189 -0
  108. sglang/srt/grpc/scheduler_launcher.py +181 -0
  109. sglang/srt/grpc/sglang_scheduler_pb2.py +81 -68
  110. sglang/srt/grpc/sglang_scheduler_pb2.pyi +124 -61
  111. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +92 -1
  112. sglang/srt/layers/activation.py +11 -7
  113. sglang/srt/layers/attention/aiter_backend.py +17 -18
  114. sglang/srt/layers/attention/ascend_backend.py +125 -10
  115. sglang/srt/layers/attention/attention_registry.py +226 -0
  116. sglang/srt/layers/attention/base_attn_backend.py +32 -4
  117. sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
  118. sglang/srt/layers/attention/double_sparsity_backend.py +2 -2
  119. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
  120. sglang/srt/layers/attention/fla/chunk.py +0 -1
  121. sglang/srt/layers/attention/fla/chunk_o.py +1 -1
  122. sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +2 -2
  123. sglang/srt/layers/attention/fla/fused_recurrent.py +4 -4
  124. sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +2 -2
  125. sglang/srt/layers/attention/fla/index.py +0 -2
  126. sglang/srt/layers/attention/fla/layernorm_gated.py +50 -32
  127. sglang/srt/layers/attention/fla/utils.py +0 -3
  128. sglang/srt/layers/attention/fla/wy_fast.py +0 -2
  129. sglang/srt/layers/attention/flashattention_backend.py +52 -15
  130. sglang/srt/layers/attention/flashinfer_backend.py +357 -212
  131. sglang/srt/layers/attention/flashinfer_mla_backend.py +31 -33
  132. sglang/srt/layers/attention/flashmla_backend.py +9 -7
  133. sglang/srt/layers/attention/hybrid_attn_backend.py +12 -4
  134. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +236 -133
  135. sglang/srt/layers/attention/intel_amx_backend.py +1 -1
  136. sglang/srt/layers/attention/mamba/causal_conv1d.py +2 -1
  137. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +24 -103
  138. sglang/srt/layers/attention/mamba/mamba.py +514 -1
  139. sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
  140. sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
  141. sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
  142. sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
  143. sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
  144. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +214 -0
  145. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +562 -0
  146. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +646 -0
  147. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +261 -0
  148. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +264 -0
  149. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
  150. sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
  151. sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
  152. sglang/srt/layers/attention/nsa/nsa_indexer.py +718 -0
  153. sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
  154. sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
  155. sglang/srt/layers/attention/nsa/transform_index.py +144 -0
  156. sglang/srt/layers/attention/nsa/triton_kernel.py +136 -0
  157. sglang/srt/layers/attention/nsa/utils.py +23 -0
  158. sglang/srt/layers/attention/nsa_backend.py +1201 -0
  159. sglang/srt/layers/attention/tbo_backend.py +6 -6
  160. sglang/srt/layers/attention/torch_flex_backend.py +325 -0
  161. sglang/srt/layers/attention/triton_backend.py +249 -42
  162. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +2 -2
  163. sglang/srt/layers/attention/triton_ops/extend_attention.py +539 -44
  164. sglang/srt/layers/attention/trtllm_mha_backend.py +7 -9
  165. sglang/srt/layers/attention/trtllm_mla_backend.py +523 -48
  166. sglang/srt/layers/attention/utils.py +11 -7
  167. sglang/srt/layers/attention/vision.py +61 -3
  168. sglang/srt/layers/attention/wave_backend.py +4 -4
  169. sglang/srt/layers/attention/xpu_backend.py +1028 -0
  170. sglang/srt/layers/communicator.py +19 -7
  171. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/compile_utils.py +4 -8
  172. sglang/srt/layers/deep_gemm_wrapper/configurer.py +25 -0
  173. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/entrypoint.py +3 -3
  174. sglang/srt/layers/dp_attention.py +28 -1
  175. sglang/srt/layers/elementwise.py +3 -1
  176. sglang/srt/layers/layernorm.py +47 -15
  177. sglang/srt/layers/linear.py +30 -5
  178. sglang/srt/layers/logits_processor.py +161 -18
  179. sglang/srt/layers/modelopt_utils.py +11 -0
  180. sglang/srt/layers/moe/cutlass_moe.py +0 -2
  181. sglang/srt/layers/moe/cutlass_w4a8_moe.py +213 -21
  182. sglang/srt/layers/moe/ep_moe/kernels.py +36 -458
  183. sglang/srt/layers/moe/ep_moe/layer.py +243 -448
  184. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +52 -25
  185. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  186. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json +146 -0
  187. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  188. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  189. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
  190. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +17 -5
  191. sglang/srt/layers/moe/fused_moe_triton/layer.py +86 -81
  192. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +18 -42
  193. sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
  194. sglang/srt/layers/moe/moe_runner/runner.py +3 -0
  195. sglang/srt/layers/moe/moe_runner/triton.py +3 -1
  196. sglang/srt/layers/moe/rocm_moe_utils.py +0 -1
  197. sglang/srt/layers/moe/router.py +51 -15
  198. sglang/srt/layers/moe/token_dispatcher/__init__.py +10 -0
  199. sglang/srt/layers/moe/token_dispatcher/base.py +1 -1
  200. sglang/srt/layers/moe/token_dispatcher/deepep.py +177 -106
  201. sglang/srt/layers/moe/token_dispatcher/mooncake.py +386 -0
  202. sglang/srt/layers/moe/token_dispatcher/standard.py +46 -0
  203. sglang/srt/layers/moe/topk.py +3 -2
  204. sglang/srt/layers/moe/utils.py +27 -1
  205. sglang/srt/layers/parameter.py +23 -6
  206. sglang/srt/layers/quantization/__init__.py +2 -53
  207. sglang/srt/layers/quantization/awq.py +183 -6
  208. sglang/srt/layers/quantization/awq_triton.py +29 -0
  209. sglang/srt/layers/quantization/base_config.py +20 -1
  210. sglang/srt/layers/quantization/compressed_tensors/__init__.py +7 -0
  211. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +21 -49
  212. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +421 -70
  213. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +5 -0
  214. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +4 -22
  215. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
  216. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +339 -0
  217. sglang/srt/layers/quantization/fp8.py +86 -20
  218. sglang/srt/layers/quantization/fp8_kernel.py +55 -10
  219. sglang/srt/layers/quantization/fp8_utils.py +43 -15
  220. sglang/srt/layers/quantization/fpgemm_fp8.py +2 -3
  221. sglang/srt/layers/quantization/gptq.py +0 -1
  222. sglang/srt/layers/quantization/int8_kernel.py +18 -2
  223. sglang/srt/layers/quantization/marlin_utils.py +12 -0
  224. sglang/srt/layers/quantization/modelopt_quant.py +141 -81
  225. sglang/srt/layers/quantization/mxfp4.py +17 -34
  226. sglang/srt/layers/quantization/petit.py +1 -1
  227. sglang/srt/layers/quantization/quark/quark.py +3 -1
  228. sglang/srt/layers/quantization/quark/quark_moe.py +18 -5
  229. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +0 -7
  230. sglang/srt/layers/quantization/unquant.py +1 -4
  231. sglang/srt/layers/quantization/utils.py +0 -1
  232. sglang/srt/layers/quantization/w4afp8.py +51 -24
  233. sglang/srt/layers/quantization/w8a8_int8.py +45 -27
  234. sglang/srt/layers/radix_attention.py +59 -9
  235. sglang/srt/layers/rotary_embedding.py +750 -46
  236. sglang/srt/layers/sampler.py +84 -16
  237. sglang/srt/layers/sparse_pooler.py +98 -0
  238. sglang/srt/layers/utils.py +23 -1
  239. sglang/srt/layers/vocab_parallel_embedding.py +4 -1
  240. sglang/srt/lora/backend/base_backend.py +3 -3
  241. sglang/srt/lora/backend/chunked_backend.py +348 -0
  242. sglang/srt/lora/backend/triton_backend.py +9 -4
  243. sglang/srt/lora/eviction_policy.py +139 -0
  244. sglang/srt/lora/lora.py +7 -5
  245. sglang/srt/lora/lora_manager.py +33 -7
  246. sglang/srt/lora/lora_registry.py +1 -1
  247. sglang/srt/lora/mem_pool.py +41 -17
  248. sglang/srt/lora/triton_ops/__init__.py +4 -0
  249. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
  250. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +176 -0
  251. sglang/srt/lora/utils.py +7 -5
  252. sglang/srt/managers/cache_controller.py +83 -152
  253. sglang/srt/managers/data_parallel_controller.py +156 -87
  254. sglang/srt/managers/detokenizer_manager.py +51 -24
  255. sglang/srt/managers/io_struct.py +223 -129
  256. sglang/srt/managers/mm_utils.py +49 -10
  257. sglang/srt/managers/multi_tokenizer_mixin.py +83 -98
  258. sglang/srt/managers/multimodal_processor.py +1 -2
  259. sglang/srt/managers/overlap_utils.py +130 -0
  260. sglang/srt/managers/schedule_batch.py +340 -529
  261. sglang/srt/managers/schedule_policy.py +158 -18
  262. sglang/srt/managers/scheduler.py +665 -620
  263. sglang/srt/managers/scheduler_input_blocker.py +1 -1
  264. sglang/srt/managers/scheduler_metrics_mixin.py +150 -131
  265. sglang/srt/managers/scheduler_output_processor_mixin.py +337 -122
  266. sglang/srt/managers/scheduler_pp_mixin.py +341 -0
  267. sglang/srt/managers/scheduler_profiler_mixin.py +62 -15
  268. sglang/srt/managers/scheduler_runtime_checker_mixin.py +217 -0
  269. sglang/srt/managers/scheduler_update_weights_mixin.py +40 -14
  270. sglang/srt/managers/tokenizer_communicator_mixin.py +141 -19
  271. sglang/srt/managers/tokenizer_manager.py +462 -226
  272. sglang/srt/managers/tp_worker.py +217 -156
  273. sglang/srt/managers/utils.py +79 -47
  274. sglang/srt/mem_cache/allocator.py +21 -22
  275. sglang/srt/mem_cache/allocator_ascend.py +42 -28
  276. sglang/srt/mem_cache/base_prefix_cache.py +3 -3
  277. sglang/srt/mem_cache/chunk_cache.py +20 -2
  278. sglang/srt/mem_cache/common.py +480 -0
  279. sglang/srt/mem_cache/evict_policy.py +38 -0
  280. sglang/srt/mem_cache/hicache_storage.py +44 -2
  281. sglang/srt/mem_cache/hiradix_cache.py +134 -34
  282. sglang/srt/mem_cache/mamba_radix_cache.py +993 -0
  283. sglang/srt/mem_cache/memory_pool.py +602 -208
  284. sglang/srt/mem_cache/memory_pool_host.py +134 -183
  285. sglang/srt/mem_cache/multimodal_cache.py +0 -1
  286. sglang/srt/mem_cache/radix_cache.py +263 -78
  287. sglang/srt/mem_cache/radix_cache_cpp.py +29 -21
  288. sglang/srt/mem_cache/storage/__init__.py +10 -0
  289. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +157 -0
  290. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +97 -0
  291. sglang/srt/mem_cache/storage/backend_factory.py +223 -0
  292. sglang/srt/mem_cache/storage/eic/eic_storage.py +777 -0
  293. sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
  294. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +0 -1
  295. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +180 -59
  296. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +15 -9
  297. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +217 -26
  298. sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +38 -9
  299. sglang/srt/mem_cache/storage/nixl/nixl_utils.py +1 -1
  300. sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +17 -2
  301. sglang/srt/mem_cache/swa_radix_cache.py +115 -58
  302. sglang/srt/metrics/collector.py +113 -120
  303. sglang/srt/metrics/func_timer.py +3 -8
  304. sglang/srt/metrics/utils.py +8 -1
  305. sglang/srt/model_executor/cpu_graph_runner.py +2 -2
  306. sglang/srt/model_executor/cuda_graph_runner.py +81 -36
  307. sglang/srt/model_executor/forward_batch_info.py +40 -50
  308. sglang/srt/model_executor/model_runner.py +507 -319
  309. sglang/srt/model_executor/npu_graph_runner.py +11 -5
  310. sglang/srt/model_executor/piecewise_cuda_graph_runner.py +539 -0
  311. sglang/srt/model_loader/__init__.py +1 -1
  312. sglang/srt/model_loader/loader.py +438 -37
  313. sglang/srt/model_loader/utils.py +0 -1
  314. sglang/srt/model_loader/weight_utils.py +200 -27
  315. sglang/srt/models/apertus.py +2 -3
  316. sglang/srt/models/arcee.py +2 -2
  317. sglang/srt/models/bailing_moe.py +40 -56
  318. sglang/srt/models/bailing_moe_nextn.py +3 -4
  319. sglang/srt/models/bert.py +1 -1
  320. sglang/srt/models/deepseek_nextn.py +25 -4
  321. sglang/srt/models/deepseek_ocr.py +1516 -0
  322. sglang/srt/models/deepseek_v2.py +793 -235
  323. sglang/srt/models/dots_ocr.py +171 -0
  324. sglang/srt/models/dots_vlm.py +0 -1
  325. sglang/srt/models/dots_vlm_vit.py +1 -1
  326. sglang/srt/models/falcon_h1.py +570 -0
  327. sglang/srt/models/gemma3_causal.py +0 -2
  328. sglang/srt/models/gemma3_mm.py +17 -1
  329. sglang/srt/models/gemma3n_mm.py +2 -3
  330. sglang/srt/models/glm4_moe.py +17 -40
  331. sglang/srt/models/glm4_moe_nextn.py +4 -4
  332. sglang/srt/models/glm4v.py +3 -2
  333. sglang/srt/models/glm4v_moe.py +6 -6
  334. sglang/srt/models/gpt_oss.py +12 -35
  335. sglang/srt/models/grok.py +10 -23
  336. sglang/srt/models/hunyuan.py +2 -7
  337. sglang/srt/models/interns1.py +0 -1
  338. sglang/srt/models/kimi_vl.py +1 -7
  339. sglang/srt/models/kimi_vl_moonvit.py +4 -2
  340. sglang/srt/models/llama.py +6 -2
  341. sglang/srt/models/llama_eagle3.py +1 -1
  342. sglang/srt/models/longcat_flash.py +6 -23
  343. sglang/srt/models/longcat_flash_nextn.py +4 -15
  344. sglang/srt/models/mimo.py +2 -13
  345. sglang/srt/models/mimo_mtp.py +1 -2
  346. sglang/srt/models/minicpmo.py +7 -5
  347. sglang/srt/models/mixtral.py +1 -4
  348. sglang/srt/models/mllama.py +1 -1
  349. sglang/srt/models/mllama4.py +27 -6
  350. sglang/srt/models/nemotron_h.py +511 -0
  351. sglang/srt/models/olmo2.py +31 -4
  352. sglang/srt/models/opt.py +5 -5
  353. sglang/srt/models/phi.py +1 -1
  354. sglang/srt/models/phi4mm.py +1 -1
  355. sglang/srt/models/phimoe.py +0 -1
  356. sglang/srt/models/pixtral.py +0 -3
  357. sglang/srt/models/points_v15_chat.py +186 -0
  358. sglang/srt/models/qwen.py +0 -1
  359. sglang/srt/models/qwen2.py +0 -7
  360. sglang/srt/models/qwen2_5_vl.py +5 -5
  361. sglang/srt/models/qwen2_audio.py +2 -15
  362. sglang/srt/models/qwen2_moe.py +70 -4
  363. sglang/srt/models/qwen2_vl.py +6 -3
  364. sglang/srt/models/qwen3.py +18 -3
  365. sglang/srt/models/qwen3_moe.py +50 -38
  366. sglang/srt/models/qwen3_next.py +43 -21
  367. sglang/srt/models/qwen3_next_mtp.py +3 -4
  368. sglang/srt/models/qwen3_omni_moe.py +661 -0
  369. sglang/srt/models/qwen3_vl.py +791 -0
  370. sglang/srt/models/qwen3_vl_moe.py +343 -0
  371. sglang/srt/models/registry.py +15 -3
  372. sglang/srt/models/roberta.py +55 -3
  373. sglang/srt/models/sarashina2_vision.py +268 -0
  374. sglang/srt/models/solar.py +505 -0
  375. sglang/srt/models/starcoder2.py +357 -0
  376. sglang/srt/models/step3_vl.py +3 -5
  377. sglang/srt/models/torch_native_llama.py +9 -2
  378. sglang/srt/models/utils.py +61 -0
  379. sglang/srt/multimodal/processors/base_processor.py +21 -9
  380. sglang/srt/multimodal/processors/deepseek_ocr.py +37 -0
  381. sglang/srt/multimodal/processors/deepseek_vl_v2.py +0 -3
  382. sglang/srt/multimodal/processors/dots_vlm.py +2 -4
  383. sglang/srt/multimodal/processors/glm4v.py +1 -5
  384. sglang/srt/multimodal/processors/internvl.py +20 -10
  385. sglang/srt/multimodal/processors/janus_pro.py +0 -1
  386. sglang/srt/multimodal/processors/mllama4.py +0 -8
  387. sglang/srt/multimodal/processors/phi4mm.py +0 -1
  388. sglang/srt/multimodal/processors/points_v15_chat.py +52 -0
  389. sglang/srt/multimodal/processors/qwen_vl.py +83 -17
  390. sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
  391. sglang/srt/multimodal/processors/step3_vl.py +1 -1
  392. sglang/srt/parser/conversation.py +41 -0
  393. sglang/srt/parser/jinja_template_utils.py +6 -0
  394. sglang/srt/parser/reasoning_parser.py +0 -1
  395. sglang/srt/sampling/custom_logit_processor.py +77 -2
  396. sglang/srt/sampling/sampling_batch_info.py +36 -23
  397. sglang/srt/sampling/sampling_params.py +75 -0
  398. sglang/srt/server_args.py +1300 -338
  399. sglang/srt/server_args_config_parser.py +146 -0
  400. sglang/srt/single_batch_overlap.py +161 -0
  401. sglang/srt/speculative/base_spec_worker.py +34 -0
  402. sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
  403. sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
  404. sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
  405. sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
  406. sglang/srt/speculative/cpp_ngram/param.h +125 -0
  407. sglang/srt/speculative/cpp_ngram/queue.h +71 -0
  408. sglang/srt/speculative/draft_utils.py +226 -0
  409. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +26 -8
  410. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +26 -3
  411. sglang/srt/speculative/eagle_info.py +786 -0
  412. sglang/srt/speculative/eagle_info_v2.py +458 -0
  413. sglang/srt/speculative/eagle_utils.py +113 -1270
  414. sglang/srt/speculative/eagle_worker.py +120 -285
  415. sglang/srt/speculative/eagle_worker_v2.py +702 -0
  416. sglang/srt/speculative/ngram_info.py +433 -0
  417. sglang/srt/speculative/ngram_worker.py +246 -0
  418. sglang/srt/speculative/spec_info.py +49 -0
  419. sglang/srt/speculative/spec_utils.py +641 -0
  420. sglang/srt/speculative/standalone_worker.py +4 -14
  421. sglang/srt/tokenizer/tiktoken_tokenizer.py +2 -2
  422. sglang/srt/tracing/trace.py +32 -6
  423. sglang/srt/two_batch_overlap.py +35 -18
  424. sglang/srt/utils/__init__.py +2 -0
  425. sglang/srt/{bench_utils.py → utils/bench_utils.py} +4 -2
  426. sglang/srt/{utils.py → utils/common.py} +583 -113
  427. sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +86 -19
  428. sglang/srt/{host_shared_memory.py → utils/host_shared_memory.py} +0 -1
  429. sglang/srt/{offloader.py → utils/offloader.py} +4 -4
  430. sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
  431. sglang/srt/utils/profile_merger.py +199 -0
  432. sglang/srt/utils/rpd_utils.py +452 -0
  433. sglang/srt/utils/slow_rank_detector.py +71 -0
  434. sglang/srt/{torch_memory_saver_adapter.py → utils/torch_memory_saver_adapter.py} +5 -7
  435. sglang/srt/warmup.py +8 -4
  436. sglang/srt/weight_sync/utils.py +1 -1
  437. sglang/test/attention/test_flashattn_backend.py +1 -1
  438. sglang/test/attention/test_flashattn_mla_backend.py +0 -1
  439. sglang/test/attention/test_prefix_chunk_info.py +0 -2
  440. sglang/test/attention/test_trtllm_mla_backend.py +221 -53
  441. sglang/test/few_shot_gsm8k_engine.py +2 -4
  442. sglang/test/get_logits_ut.py +57 -0
  443. sglang/test/kit_matched_stop.py +157 -0
  444. sglang/test/longbench_v2/__init__.py +1 -0
  445. sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
  446. sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
  447. sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
  448. sglang/test/run_eval.py +120 -11
  449. sglang/test/runners.py +3 -1
  450. sglang/test/send_one.py +42 -7
  451. sglang/test/simple_eval_common.py +8 -2
  452. sglang/test/simple_eval_gpqa.py +0 -1
  453. sglang/test/simple_eval_humaneval.py +0 -3
  454. sglang/test/simple_eval_longbench_v2.py +344 -0
  455. sglang/test/simple_eval_mmmu_vlm.py +441 -0
  456. sglang/test/test_block_fp8.py +3 -4
  457. sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -1
  458. sglang/test/test_cutlass_moe.py +1 -2
  459. sglang/test/test_cutlass_w4a8_moe.py +10 -20
  460. sglang/test/test_deterministic.py +430 -0
  461. sglang/test/test_deterministic_utils.py +73 -0
  462. sglang/test/test_disaggregation_utils.py +93 -1
  463. sglang/test/test_marlin_moe.py +0 -1
  464. sglang/test/test_programs.py +1 -1
  465. sglang/test/test_utils.py +432 -16
  466. sglang/utils.py +10 -1
  467. sglang/version.py +1 -1
  468. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/METADATA +64 -43
  469. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/RECORD +476 -346
  470. sglang/srt/entrypoints/grpc_request_manager.py +0 -580
  471. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +0 -32
  472. sglang/srt/managers/tp_worker_overlap_thread.py +0 -319
  473. sglang/srt/mem_cache/lora_radix_cache.py +0 -421
  474. sglang/srt/speculative/build_eagle_tree.py +0 -427
  475. sglang/test/test_block_fp8_ep.py +0 -358
  476. /sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/__init__.py +0 -0
  477. /sglang/srt/{remote_instance_weight_loader_utils.py → model_loader/remote_instance_weight_loader_utils.py} +0 -0
  478. /sglang/srt/{aio_rwlock.py → utils/aio_rwlock.py} +0 -0
  479. /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
  480. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/WHEEL +0 -0
  481. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/licenses/LICENSE +0 -0
  482. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/top_level.txt +0 -0
@@ -12,7 +12,7 @@
12
12
  # limitations under the License.
13
13
  # ==============================================================================
14
14
 
15
- """Inference-only GLM-4.5 model compatible with HuggingFace weights"""
15
+ """Inference-only GLM-4.5, GLM-4.6 model compatible with HuggingFace weights"""
16
16
 
17
17
  import logging
18
18
  from typing import Any, Dict, Iterable, Optional, Tuple
@@ -27,7 +27,6 @@ from sglang.srt.distributed import (
27
27
  get_pp_group,
28
28
  get_tensor_model_parallel_rank,
29
29
  get_tensor_model_parallel_world_size,
30
- parallel_state,
31
30
  tensor_model_parallel_all_reduce,
32
31
  )
33
32
  from sglang.srt.layers.activation import SiluAndMul
@@ -44,30 +43,23 @@ from sglang.srt.layers.dp_attention import (
44
43
  )
45
44
  from sglang.srt.layers.layernorm import RMSNorm
46
45
  from sglang.srt.layers.linear import (
47
- ColumnParallelLinear,
48
46
  MergedColumnParallelLinear,
49
47
  QKVParallelLinear,
50
- ReplicatedLinear,
51
48
  RowParallelLinear,
52
49
  )
53
50
  from sglang.srt.layers.logits_processor import LogitsProcessor
54
- from sglang.srt.layers.moe import get_deepep_mode, get_moe_a2a_backend
51
+ from sglang.srt.layers.moe import get_moe_a2a_backend
55
52
  from sglang.srt.layers.moe.ep_moe.layer import get_moe_impl_class
56
53
  from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoE
57
54
  from sglang.srt.layers.moe.topk import TopK
58
55
  from sglang.srt.layers.quantization.base_config import QuantizationConfig
59
- from sglang.srt.layers.quantization.fp8_kernel import (
60
- is_fp8_fnuz,
61
- per_tensor_quant_mla_fp8,
62
- per_token_group_quant_mla_deep_gemm_masked_fp8,
63
- )
56
+ from sglang.srt.layers.quantization.fp8_kernel import is_fp8_fnuz
64
57
  from sglang.srt.layers.radix_attention import RadixAttention
65
58
  from sglang.srt.layers.rotary_embedding import get_rope
66
59
  from sglang.srt.layers.vocab_parallel_embedding import (
67
60
  ParallelLMHead,
68
61
  VocabParallelEmbedding,
69
62
  )
70
- from sglang.srt.managers.schedule_batch import global_server_args_dict
71
63
  from sglang.srt.model_executor.cuda_graph_runner import get_is_capture_mode
72
64
  from sglang.srt.model_executor.forward_batch_info import ForwardBatch
73
65
  from sglang.srt.model_loader.weight_utils import default_weight_loader
@@ -77,21 +69,17 @@ from sglang.srt.models.deepseek_v2 import (
77
69
  DeepseekV2Model,
78
70
  DeepseekV2MoE,
79
71
  )
80
- from sglang.srt.two_batch_overlap import MaybeTboDeepEPDispatcher
72
+ from sglang.srt.server_args import get_global_server_args
81
73
  from sglang.srt.utils import (
82
74
  BumpAllocator,
83
75
  LazyValue,
84
76
  add_prefix,
85
- bind_or_assign,
86
77
  cpu_has_amx_support,
87
78
  get_bool_env_var,
88
79
  get_device_sm,
89
- get_int_env_var,
90
80
  is_cpu,
91
81
  is_cuda,
92
- is_flashinfer_available,
93
82
  is_hip,
94
- is_non_idle_and_non_empty,
95
83
  log_info_on_rank0,
96
84
  use_intel_amx_backend,
97
85
  )
@@ -395,7 +383,7 @@ class Glm4MoeSparseMoeBlock(DeepseekV2MoE):
395
383
  self.n_shared_experts = config.n_shared_experts
396
384
  self.num_fused_shared_experts = (
397
385
  0
398
- if global_server_args_dict["disable_shared_experts_fusion"]
386
+ if get_global_server_args().disable_shared_experts_fusion
399
387
  else config.n_shared_experts
400
388
  )
401
389
  self.config = config
@@ -432,7 +420,7 @@ class Glm4MoeSparseMoeBlock(DeepseekV2MoE):
432
420
  self.experts = get_moe_impl_class(quant_config)(
433
421
  num_experts=config.n_routed_experts
434
422
  + self.num_fused_shared_experts
435
- + global_server_args_dict["ep_num_redundant_experts"],
423
+ + get_global_server_args().ep_num_redundant_experts,
436
424
  num_fused_shared_experts=self.num_fused_shared_experts,
437
425
  top_k=config.num_experts_per_tok + self.num_fused_shared_experts,
438
426
  hidden_size=config.hidden_size,
@@ -471,12 +459,12 @@ class Glm4MoeSparseMoeBlock(DeepseekV2MoE):
471
459
 
472
460
  self.top_k = config.num_experts_per_tok
473
461
 
474
- if get_moe_a2a_backend().is_deepep():
462
+ if get_moe_a2a_backend().is_deepep() or get_moe_a2a_backend().is_mooncake():
475
463
  # TODO: we will support tp < ep in the future
476
464
  self.ep_size = get_moe_expert_parallel_world_size()
477
465
  self.num_experts = (
478
466
  config.n_routed_experts
479
- + global_server_args_dict["ep_num_redundant_experts"]
467
+ + get_global_server_args().ep_num_redundant_experts
480
468
  )
481
469
  self.renormalize = config.norm_topk_prob
482
470
  self.topk_group = config.topk_group
@@ -487,20 +475,9 @@ class Glm4MoeSparseMoeBlock(DeepseekV2MoE):
487
475
  else None
488
476
  )
489
477
 
490
- self.deepep_dispatcher = MaybeTboDeepEPDispatcher(
491
- group=parallel_state.get_tp_group().device_group,
492
- router_topk=self.top_k,
493
- permute_fusion=True,
494
- num_experts=self.num_experts,
495
- num_local_experts=config.n_routed_experts // self.tp_size,
496
- hidden_size=config.hidden_size,
497
- params_dtype=config.torch_dtype,
498
- deepep_mode=get_deepep_mode(),
499
- async_finish=True,
500
- return_recv_hook=True,
501
- )
502
-
503
- self._enable_deepep_moe = get_moe_a2a_backend().is_deepep()
478
+ self._enable_a2a_moe = (
479
+ get_moe_a2a_backend().is_deepep() or get_moe_a2a_backend().is_mooncake()
480
+ )
504
481
 
505
482
  def forward_normal_dual_stream(
506
483
  self,
@@ -664,7 +641,7 @@ class Glm4MoeDecoderLayer(DeepseekV2DecoderLayer):
664
641
  layer_scatter_modes=self.layer_scatter_modes,
665
642
  input_layernorm=self.input_layernorm,
666
643
  post_attention_layernorm=self.post_attention_layernorm,
667
- allow_reduce_scatter=True,
644
+ allow_reduce_scatter=False,
668
645
  )
669
646
 
670
647
  def forward(
@@ -758,7 +735,7 @@ class Glm4MoeForCausalLM(DeepseekV2ForCausalLM):
758
735
  config.hidden_size,
759
736
  quant_config=quant_config,
760
737
  prefix=add_prefix("lm_head", prefix),
761
- use_attn_tp_group=global_server_args_dict["enable_dp_lm_head"],
738
+ use_attn_tp_group=get_global_server_args().enable_dp_lm_head,
762
739
  )
763
740
  self.logits_processor = LogitsProcessor(config)
764
741
 
@@ -774,7 +751,7 @@ class Glm4MoeForCausalLM(DeepseekV2ForCausalLM):
774
751
  self, architecture: str = "Glm4MoeForCausalLM"
775
752
  ):
776
753
  self.num_fused_shared_experts = 0
777
- if global_server_args_dict["disable_shared_experts_fusion"]:
754
+ if get_global_server_args().disable_shared_experts_fusion:
778
755
  return
779
756
 
780
757
  # Only Deepseek V3/R1 can use shared experts fusion optimization now.
@@ -785,12 +762,12 @@ class Glm4MoeForCausalLM(DeepseekV2ForCausalLM):
785
762
  or self.config.architectures[0] != architecture
786
763
  or self.config.n_shared_experts != 1
787
764
  ):
788
- disable_reason = "Only GLM-4.5 on NV-platform with capability >= 80 can use shared experts fusion optimization."
765
+ disable_reason = "Only GLM-4.5 or GLM-4.6 on NV-platform with capability >= 80 can use shared experts fusion optimization."
789
766
  elif get_moe_expert_parallel_world_size() > 1:
790
- disable_reason = "Deepseek and GLM-4.5 can not use shared experts fusion optimization under expert parallelism."
767
+ disable_reason = "Deepseek and GLM-4.5 or GLM-4.6 can not use shared experts fusion optimization under expert parallelism."
791
768
 
792
769
  if disable_reason is not None:
793
- global_server_args_dict["disable_shared_experts_fusion"] = True
770
+ get_global_server_args().disable_shared_experts_fusion = True
794
771
  self.num_fused_shared_experts = 0
795
772
  log_info_on_rank0(
796
773
  logger,
@@ -12,7 +12,7 @@
12
12
  # limitations under the License.
13
13
  # ==============================================================================
14
14
 
15
- """Inference-only GLM-4.5 NextN Speculative Decoding."""
15
+ """Inference-only GLM-4.5, GLM-4.6 NextN Speculative Decoding."""
16
16
  import logging
17
17
  from typing import Iterable, Optional, Tuple
18
18
 
@@ -30,9 +30,9 @@ from sglang.srt.layers.vocab_parallel_embedding import (
30
30
  ParallelLMHead,
31
31
  VocabParallelEmbedding,
32
32
  )
33
- from sglang.srt.managers.schedule_batch import global_server_args_dict
34
33
  from sglang.srt.model_executor.forward_batch_info import ForwardBatch
35
34
  from sglang.srt.models.glm4_moe import Glm4MoeDecoderLayer, Glm4MoeForCausalLM
35
+ from sglang.srt.server_args import get_global_server_args
36
36
  from sglang.srt.utils import BumpAllocator, add_prefix
37
37
 
38
38
  logger = logging.getLogger(__name__)
@@ -48,7 +48,7 @@ class Glm4MoeModelNextN(nn.Module):
48
48
  super().__init__()
49
49
  if quant_config is not None and quant_config.get_name() == "modelopt_fp4":
50
50
  logger.warning(
51
- "Overriding Glm4MoeForCausalLMNextN quant config for modelopt_fp4 GLM-4.5 model."
51
+ "Overriding Glm4MoeForCausalLMNextN quant config for modelopt_fp4 GLM-4.5 / GLM-4.6 model."
52
52
  )
53
53
  quant_config = None
54
54
 
@@ -145,7 +145,7 @@ class Glm4MoeForCausalLMNextN(Glm4MoeForCausalLM):
145
145
  config.hidden_size,
146
146
  quant_config=quant_config,
147
147
  prefix=add_prefix("model.shared_head.head", prefix),
148
- use_attn_tp_group=global_server_args_dict["enable_dp_lm_head"],
148
+ use_attn_tp_group=get_global_server_args().enable_dp_lm_head,
149
149
  )
150
150
  self.logits_processor = LogitsProcessor(config)
151
151
 
@@ -7,9 +7,9 @@ import torch.nn as nn
7
7
  import torch.nn.functional as F
8
8
  from transformers.models.glm4v.configuration_glm4v import Glm4vConfig, Glm4vVisionConfig
9
9
 
10
- from sglang.srt.hf_transformers_utils import get_processor
11
10
  from sglang.srt.layers.activation import SiluAndMul
12
11
  from sglang.srt.layers.attention import vision_utils
12
+ from sglang.srt.layers.dp_attention import get_attention_tp_size
13
13
  from sglang.srt.layers.layernorm import RMSNorm
14
14
  from sglang.srt.layers.linear import (
15
15
  ColumnParallelLinear,
@@ -28,6 +28,7 @@ from sglang.srt.models.qwen2_5_vl import (
28
28
  Qwen2_5_VLForConditionalGeneration,
29
29
  )
30
30
  from sglang.srt.utils import add_prefix
31
+ from sglang.srt.utils.hf_transformers_utils import get_processor
31
32
 
32
33
  logger = logging.getLogger(__name__)
33
34
 
@@ -434,7 +435,7 @@ class Glm4vVisionModel(nn.Module):
434
435
  cu_seqlens = torch.repeat_interleave(
435
436
  grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]
436
437
  ).cumsum(dim=0, dtype=torch.int32)
437
- cu_seqlens = F.pad(cu_seqlens, (1, 0), "constant", 0)
438
+ cu_seqlens = torch.cat([cu_seqlens.new_zeros(1), cu_seqlens])
438
439
 
439
440
  seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()
440
441
  x = self.embeddings(
@@ -10,18 +10,18 @@ from sglang.srt.distributed import (
10
10
  get_moe_expert_parallel_world_size,
11
11
  get_tensor_model_parallel_world_size,
12
12
  )
13
- from sglang.srt.hf_transformers_utils import get_processor
14
13
  from sglang.srt.layers.attention import vision_utils
15
14
  from sglang.srt.layers.logits_processor import LogitsProcessor
16
15
  from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
17
16
  from sglang.srt.layers.pooler import Pooler, PoolingType
18
17
  from sglang.srt.layers.quantization.base_config import QuantizationConfig
19
18
  from sglang.srt.layers.vocab_parallel_embedding import ParallelLMHead
20
- from sglang.srt.managers.schedule_batch import global_server_args_dict
21
19
  from sglang.srt.model_loader.weight_utils import default_weight_loader
22
20
  from sglang.srt.models.glm4_moe import Glm4MoeModel
23
21
  from sglang.srt.models.glm4v import Glm4vForConditionalGeneration, Glm4vVisionModel
22
+ from sglang.srt.server_args import get_global_server_args
24
23
  from sglang.srt.utils import add_prefix, is_cuda, log_info_on_rank0
24
+ from sglang.srt.utils.hf_transformers_utils import get_processor
25
25
 
26
26
  _is_cuda = is_cuda()
27
27
 
@@ -47,7 +47,7 @@ class Glm4vMoeForConditionalGeneration(Glm4vForConditionalGeneration):
47
47
  self.determine_num_fused_shared_experts("Glm4MoeForCausalLM")
48
48
  self.num_fused_shared_experts = (
49
49
  0
50
- if global_server_args_dict["disable_shared_experts_fusion"]
50
+ if get_global_server_args().disable_shared_experts_fusion
51
51
  else config.n_shared_experts
52
52
  )
53
53
 
@@ -68,7 +68,7 @@ class Glm4vMoeForConditionalGeneration(Glm4vForConditionalGeneration):
68
68
  config.hidden_size,
69
69
  quant_config=quant_config,
70
70
  prefix=add_prefix("lm_head", prefix),
71
- use_attn_tp_group=global_server_args_dict["enable_dp_lm_head"],
71
+ use_attn_tp_group=get_global_server_args().enable_dp_lm_head,
72
72
  )
73
73
  self.logits_processor = LogitsProcessor(config)
74
74
  self.pooler = Pooler(pooling_type=PoolingType.LAST, normalize=True)
@@ -81,7 +81,7 @@ class Glm4vMoeForConditionalGeneration(Glm4vForConditionalGeneration):
81
81
  self, architecture: str = "Glm4MoeForCausalLM"
82
82
  ):
83
83
  self.num_fused_shared_experts = 0
84
- if global_server_args_dict["disable_shared_experts_fusion"]:
84
+ if get_global_server_args().disable_shared_experts_fusion:
85
85
  return
86
86
 
87
87
  # Only Deepseek V3/R1 can use shared experts fusion optimization now.
@@ -97,7 +97,7 @@ class Glm4vMoeForConditionalGeneration(Glm4vForConditionalGeneration):
97
97
  disable_reason = "Deepseek and GLM-4.5 can not use shared experts fusion optimization under expert parallelism."
98
98
 
99
99
  if disable_reason is not None:
100
- global_server_args_dict["disable_shared_experts_fusion"] = True
100
+ get_global_server_args().disable_shared_experts_fusion = True
101
101
  self.num_fused_shared_experts = 0
102
102
  log_info_on_rank0(
103
103
  logger,
@@ -63,9 +63,13 @@ from sglang.srt.layers.vocab_parallel_embedding import (
63
63
  ParallelLMHead,
64
64
  VocabParallelEmbedding,
65
65
  )
66
- from sglang.srt.managers.schedule_batch import global_server_args_dict
67
66
  from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors
68
67
  from sglang.srt.model_loader.weight_utils import default_weight_loader
68
+ from sglang.srt.models.utils import (
69
+ create_fused_set_kv_buffer_arg,
70
+ enable_fused_set_kv_buffer,
71
+ )
72
+ from sglang.srt.server_args import get_global_server_args
69
73
  from sglang.srt.utils import (
70
74
  LazyValue,
71
75
  add_prefix,
@@ -81,7 +85,7 @@ _is_sm100_supported = is_cuda() and is_sm100_supported()
81
85
 
82
86
 
83
87
  if _is_cuda:
84
- from sgl_kernel import FusedSetKVBufferArg
88
+ from sgl_kernel import FusedSetKVBufferArg # noqa: F401
85
89
 
86
90
 
87
91
  class GptOssConfig(PretrainedConfig):
@@ -134,7 +138,7 @@ class GptOssSparseMoeBlock(nn.Module):
134
138
  }
135
139
  self.experts = experts_type(
136
140
  num_experts=config.num_local_experts
137
- + global_server_args_dict["ep_num_redundant_experts"],
141
+ + get_global_server_args().ep_num_redundant_experts,
138
142
  top_k=config.num_experts_per_tok,
139
143
  layer_id=layer_id,
140
144
  hidden_size=config.hidden_size,
@@ -193,33 +197,6 @@ class GptOssSparseMoeBlock(nn.Module):
193
197
  return ans
194
198
 
195
199
 
196
- def _enable_fused_set_kv_buffer(forward_batch: ForwardBatch):
197
- """Enable fused set_kv_buffer only on CUDA with bfloat16 KV cache."""
198
- return _is_cuda and forward_batch.token_to_kv_pool.dtype == torch.bfloat16
199
-
200
-
201
- # TODO maybe move to a model-common utils
202
- def _create_fused_set_kv_buffer_arg(
203
- value: torch.Tensor,
204
- layer: RadixAttention,
205
- forward_batch: ForwardBatch,
206
- ):
207
- layer_id = layer.layer_id
208
- token_to_kv_pool = forward_batch.token_to_kv_pool
209
-
210
- k_buffer = token_to_kv_pool.get_key_buffer(layer_id)
211
- v_buffer = token_to_kv_pool.get_value_buffer(layer_id)
212
-
213
- return FusedSetKVBufferArg(
214
- value=value,
215
- k_buffer=k_buffer.view(k_buffer.shape[0], -1),
216
- v_buffer=v_buffer.view(v_buffer.shape[0], -1),
217
- k_scale=layer.k_scale,
218
- v_scale=layer.v_scale,
219
- cache_loc=forward_batch.out_cache_loc,
220
- )
221
-
222
-
223
200
  class GptOssAttention(nn.Module):
224
201
  def __init__(
225
202
  self,
@@ -282,7 +259,7 @@ class GptOssAttention(nn.Module):
282
259
 
283
260
  # Choose dtype of sinks based on attention backend: trtllm_mha requires float32,
284
261
  # others can use bfloat16
285
- attn_backend = global_server_args_dict.get("attention_backend")
262
+ attn_backend = get_global_server_args().attention_backend
286
263
  sinks_dtype = torch.float32 if attn_backend == "trtllm_mha" else torch.bfloat16
287
264
  self.sinks = nn.Parameter(
288
265
  torch.empty(self.num_heads, dtype=sinks_dtype), requires_grad=False
@@ -337,12 +314,12 @@ class GptOssAttention(nn.Module):
337
314
  q,
338
315
  k,
339
316
  fused_set_kv_buffer_arg=(
340
- _create_fused_set_kv_buffer_arg(
317
+ create_fused_set_kv_buffer_arg(
341
318
  value=v,
342
319
  layer=self.attn,
343
320
  forward_batch=forward_batch,
344
321
  )
345
- if _enable_fused_set_kv_buffer(forward_batch)
322
+ if enable_fused_set_kv_buffer(forward_batch)
346
323
  else None
347
324
  ),
348
325
  )
@@ -356,7 +333,7 @@ class GptOssAttention(nn.Module):
356
333
  attn_output = self.attn(
357
334
  *inner_state,
358
335
  sinks=self.sinks,
359
- save_kv_cache=not _enable_fused_set_kv_buffer(forward_batch),
336
+ save_kv_cache=not enable_fused_set_kv_buffer(forward_batch),
360
337
  )
361
338
  output, _ = self.o_proj(attn_output)
362
339
  return output
@@ -614,7 +591,7 @@ class GptOssForCausalLM(nn.Module):
614
591
  config.hidden_size,
615
592
  # quant_config=quant_config,
616
593
  prefix=add_prefix("lm_head", prefix),
617
- use_attn_tp_group=global_server_args_dict["enable_dp_lm_head"],
594
+ use_attn_tp_group=get_global_server_args().enable_dp_lm_head,
618
595
  )
619
596
  self.logits_processor = LogitsProcessor(config)
620
597
  self.capture_aux_hidden_states = False
sglang/srt/models/grok.py CHANGED
@@ -28,7 +28,6 @@ from torch import nn
28
28
  from transformers import PretrainedConfig
29
29
 
30
30
  from sglang.srt.distributed import (
31
- get_moe_expert_parallel_world_size,
32
31
  get_tensor_model_parallel_rank,
33
32
  get_tensor_model_parallel_world_size,
34
33
  tensor_model_parallel_all_gather,
@@ -36,7 +35,6 @@ from sglang.srt.distributed import (
36
35
  )
37
36
  from sglang.srt.layers.activation import GeluAndMul
38
37
  from sglang.srt.layers.elementwise import (
39
- experts_combine_triton,
40
38
  fused_dual_residual_rmsnorm,
41
39
  fused_rmsnorm,
42
40
  gelu_and_mul_triton,
@@ -49,7 +47,6 @@ from sglang.srt.layers.linear import (
49
47
  RowParallelLinear,
50
48
  )
51
49
  from sglang.srt.layers.logits_processor import LogitsProcessor
52
- from sglang.srt.layers.moe.ep_moe.layer import EPMoE
53
50
  from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
54
51
  from sglang.srt.layers.moe.router import fused_moe_router_shim
55
52
  from sglang.srt.layers.moe.topk import TopK
@@ -65,10 +62,10 @@ from sglang.srt.layers.vocab_parallel_embedding import (
65
62
  ParallelLMHead,
66
63
  VocabParallelEmbedding,
67
64
  )
68
- from sglang.srt.managers.schedule_batch import global_server_args_dict
69
65
  from sglang.srt.model_executor.forward_batch_info import ForwardBatch
70
66
  from sglang.srt.model_loader.loader import DefaultModelLoader
71
67
  from sglang.srt.model_loader.weight_utils import default_weight_loader
68
+ from sglang.srt.server_args import get_global_server_args
72
69
  from sglang.srt.utils import add_prefix, dispose_tensor, dump_to_file
73
70
 
74
71
  logger = logging.getLogger(__name__)
@@ -76,9 +73,6 @@ logger = logging.getLogger(__name__)
76
73
 
77
74
  # Dump tensors for debugging
78
75
  debug_tensor_dump_output_folder = None
79
- debug_tensor_dump_prefill_only = False
80
- # Skip all the other tensor dumps, only dump the target logits
81
- debug_tensor_dump_only_target_logprobs = False
82
76
  debug_tensor_dump_inject = False
83
77
  debug_tensor_dump_layers = None
84
78
  debug_tensor_dump_test = False
@@ -176,17 +170,7 @@ class Grok1MoE(nn.Module):
176
170
  custom_routing_function=custom_routing_function,
177
171
  )
178
172
 
179
- kwargs = {}
180
- if get_moe_expert_parallel_world_size() > 1:
181
- MoEImpl = EPMoE
182
- else:
183
- MoEImpl = FusedMoE
184
- kwargs["reduce_results"] = reduce_results
185
- kwargs["use_presharded_weights"] = use_presharded_weights
186
- kwargs["inplace"] = inplace
187
- kwargs["no_combine"] = no_combine
188
-
189
- self.experts = MoEImpl(
173
+ self.experts = FusedMoE(
190
174
  num_experts=num_experts,
191
175
  top_k=top_k,
192
176
  layer_id=layer_id,
@@ -195,7 +179,10 @@ class Grok1MoE(nn.Module):
195
179
  params_dtype=params_dtype,
196
180
  quant_config=quant_config,
197
181
  activation="gelu",
198
- **kwargs,
182
+ reduce_results=reduce_results,
183
+ use_presharded_weights=use_presharded_weights,
184
+ inplace=inplace,
185
+ no_combine=no_combine,
199
186
  )
200
187
 
201
188
  def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
@@ -877,10 +864,10 @@ class Grok1ForCausalLM(nn.Module):
877
864
 
878
865
  # Dump tensors for debugging
879
866
  global debug_tensor_dump_output_folder, debug_tensor_dump_inject
880
- debug_tensor_dump_output_folder = global_server_args_dict[
881
- "debug_tensor_dump_output_folder"
882
- ]
883
- debug_tensor_dump_inject = global_server_args_dict["debug_tensor_dump_inject"]
867
+ debug_tensor_dump_output_folder = (
868
+ get_global_server_args().debug_tensor_dump_output_folder
869
+ )
870
+ debug_tensor_dump_inject = get_global_server_args().debug_tensor_dump_inject
884
871
  warnings.filterwarnings("ignore", category=FutureWarning)
885
872
 
886
873
  if get_tensor_model_parallel_rank() == 0:
@@ -12,18 +12,14 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
  """Inference-only HunYuan model compatible with HuggingFace weights."""
15
- import logging
16
15
  import re
17
- from dataclasses import dataclass
18
- from enum import Enum, auto
19
- from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
16
+ from typing import Any, Dict, Iterable, Optional, Tuple
20
17
 
21
18
  import torch
22
19
  from torch import nn
23
20
  from transformers import PretrainedConfig
24
21
 
25
22
  from sglang.srt.distributed import (
26
- get_pp_group,
27
23
  get_tensor_model_parallel_rank,
28
24
  get_tensor_model_parallel_world_size,
29
25
  tensor_model_parallel_all_reduce,
@@ -46,7 +42,6 @@ from sglang.srt.layers.radix_attention import RadixAttention
46
42
  from sglang.srt.layers.rotary_embedding import get_rope
47
43
  from sglang.srt.layers.sampler import Sampler
48
44
  from sglang.srt.layers.vocab_parallel_embedding import (
49
- DEFAULT_VOCAB_PADDING_SIZE,
50
45
  ParallelLMHead,
51
46
  VocabParallelEmbedding,
52
47
  )
@@ -56,7 +51,7 @@ from sglang.srt.model_loader.weight_utils import (
56
51
  kv_cache_scales_loader,
57
52
  maybe_remap_kv_scale_name,
58
53
  )
59
- from sglang.srt.utils import add_prefix, is_hip
54
+ from sglang.srt.utils import is_hip
60
55
 
61
56
  expert_distribution_recorder = ExpertDistributionRecorder()
62
57
 
@@ -5,7 +5,6 @@ from torch import nn
5
5
  from transformers import PretrainedConfig
6
6
 
7
7
  from sglang.srt.layers.attention import vision_utils
8
- from sglang.srt.layers.moe.ep_moe.layer import get_moe_impl_class
9
8
  from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoE
10
9
  from sglang.srt.layers.quantization.base_config import QuantizationConfig
11
10
  from sglang.srt.managers.mm_utils import (
@@ -43,10 +43,8 @@
43
43
 
44
44
  import copy
45
45
  import logging
46
- import math
47
- from collections.abc import Mapping
48
46
  from dataclasses import dataclass
49
- from typing import Any, Iterable, List, Optional, Tuple
47
+ from typing import Iterable, List, Optional, Tuple
50
48
 
51
49
  import torch
52
50
  from torch import nn
@@ -56,10 +54,6 @@ from sglang.srt.configs import KimiVLConfig
56
54
  from sglang.srt.configs.deepseekvl2 import DeepseekV2Config
57
55
  from sglang.srt.configs.kimi_vl import KimiVLConfig
58
56
  from sglang.srt.configs.kimi_vl_moonvit import MoonViTConfig
59
- from sglang.srt.distributed import (
60
- get_tensor_model_parallel_rank,
61
- get_tensor_model_parallel_world_size,
62
- )
63
57
  from sglang.srt.layers.activation import QuickGELU
64
58
  from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
65
59
  from sglang.srt.layers.quantization.base_config import QuantizationConfig
@@ -49,7 +49,7 @@ from typing import List, Optional, Sequence, Tuple, Union
49
49
  import torch
50
50
  import torch.nn as nn
51
51
  import torch.nn.functional as F
52
- from transformers.activations import ACT2FN, PytorchGELUTanh
52
+ from transformers.activations import ACT2FN
53
53
  from transformers.modeling_utils import PreTrainedModel
54
54
 
55
55
  try:
@@ -596,6 +596,8 @@ class MoonVitPretrainedModel(PreTrainedModel):
596
596
  _supports_sdpa = True
597
597
 
598
598
  def __init__(self, config: MoonViTConfig, *inputs, **kwargs):
599
+ from transformers.activations import GELUTanh
600
+
599
601
  super().__init__(config, *inputs, **kwargs)
600
602
  config = deepcopy(config)
601
603
  self.merge_kernel_size = config.merge_kernel_size
@@ -614,7 +616,7 @@ class MoonVitPretrainedModel(PreTrainedModel):
614
616
  "num_heads": config.num_attention_heads,
615
617
  "hidden_dim": config.hidden_size,
616
618
  "mlp_dim": config.intermediate_size,
617
- "activation": PytorchGELUTanh(),
619
+ "activation": GELUTanh(),
618
620
  "attn_bias": True,
619
621
  "attn_implementation": config._attn_implementation,
620
622
  },
@@ -45,13 +45,13 @@ from sglang.srt.layers.vocab_parallel_embedding import (
45
45
  ParallelLMHead,
46
46
  VocabParallelEmbedding,
47
47
  )
48
- from sglang.srt.managers.schedule_batch import global_server_args_dict
49
48
  from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors
50
49
  from sglang.srt.model_loader.weight_utils import (
51
50
  default_weight_loader,
52
51
  kv_cache_scales_loader,
53
52
  maybe_remap_kv_scale_name,
54
53
  )
54
+ from sglang.srt.server_args import get_global_server_args
55
55
  from sglang.srt.utils import add_prefix, make_layers
56
56
  from sglang.utils import get_exception_traceback
57
57
 
@@ -385,6 +385,10 @@ class LlamaModel(nn.Module):
385
385
  "Self attention has no KV cache scaling " "factor attribute!"
386
386
  )
387
387
 
388
+ def get_input_embeddings(self) -> nn.Embedding:
389
+ """Get input embeddings from the model."""
390
+ return self.embed_tokens
391
+
388
392
 
389
393
  class LlamaForCausalLM(nn.Module):
390
394
  # BitandBytes specific attributes
@@ -429,7 +433,7 @@ class LlamaForCausalLM(nn.Module):
429
433
  config.hidden_size,
430
434
  quant_config=quant_config,
431
435
  prefix=add_prefix("lm_head", prefix),
432
- use_attn_tp_group=global_server_args_dict["enable_dp_lm_head"],
436
+ use_attn_tp_group=get_global_server_args().enable_dp_lm_head,
433
437
  )
434
438
  self.logits_processor = LogitsProcessor(config)
435
439
  self.pooler = Pooler(pooling_type=PoolingType.LAST, normalize=True)
@@ -27,7 +27,7 @@ from transformers import LlamaConfig
27
27
 
28
28
  from sglang.srt.distributed import get_pp_group
29
29
  from sglang.srt.layers.layernorm import RMSNorm
30
- from sglang.srt.layers.linear import QKVParallelLinear, RowParallelLinear
30
+ from sglang.srt.layers.linear import QKVParallelLinear
31
31
  from sglang.srt.layers.logits_processor import LogitsProcessor
32
32
  from sglang.srt.layers.quantization.base_config import QuantizationConfig
33
33
  from sglang.srt.layers.vocab_parallel_embedding import (