sglang 0.5.3rc2__py3-none-any.whl → 0.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (408)
  1. sglang/bench_one_batch.py +47 -28
  2. sglang/bench_one_batch_server.py +41 -25
  3. sglang/bench_serving.py +330 -156
  4. sglang/check_env.py +1 -1
  5. sglang/compile_deep_gemm.py +6 -2
  6. sglang/global_config.py +1 -25
  7. sglang/lang/api.py +6 -0
  8. sglang/lang/interpreter.py +1 -0
  9. sglang/lang/ir.py +13 -0
  10. sglang/launch_server.py +8 -15
  11. sglang/profiler.py +18 -1
  12. sglang/srt/_custom_ops.py +1 -1
  13. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +4 -6
  14. sglang/srt/checkpoint_engine/checkpoint_engine_worker.py +142 -0
  15. sglang/srt/compilation/backend.py +437 -0
  16. sglang/srt/compilation/compilation_config.py +20 -0
  17. sglang/srt/compilation/compilation_counter.py +47 -0
  18. sglang/srt/compilation/compile.py +210 -0
  19. sglang/srt/compilation/compiler_interface.py +503 -0
  20. sglang/srt/compilation/cuda_piecewise_backend.py +228 -0
  21. sglang/srt/compilation/fix_functionalization.py +134 -0
  22. sglang/srt/compilation/fx_utils.py +83 -0
  23. sglang/srt/compilation/inductor_pass.py +140 -0
  24. sglang/srt/compilation/pass_manager.py +66 -0
  25. sglang/srt/compilation/piecewise_context_manager.py +40 -0
  26. sglang/srt/compilation/weak_ref_tensor_jit.py +16 -0
  27. sglang/srt/configs/__init__.py +4 -0
  28. sglang/srt/configs/deepseek_ocr.py +262 -0
  29. sglang/srt/configs/deepseekvl2.py +194 -96
  30. sglang/srt/configs/dots_vlm.py +2 -7
  31. sglang/srt/configs/falcon_h1.py +13 -64
  32. sglang/srt/configs/load_config.py +25 -2
  33. sglang/srt/configs/mamba_utils.py +117 -0
  34. sglang/srt/configs/model_config.py +134 -23
  35. sglang/srt/configs/modelopt_config.py +30 -0
  36. sglang/srt/configs/nemotron_h.py +286 -0
  37. sglang/srt/configs/olmo3.py +105 -0
  38. sglang/srt/configs/points_v15_chat.py +29 -0
  39. sglang/srt/configs/qwen3_next.py +11 -47
  40. sglang/srt/configs/qwen3_omni.py +613 -0
  41. sglang/srt/configs/qwen3_vl.py +0 -10
  42. sglang/srt/connector/remote_instance.py +1 -1
  43. sglang/srt/constrained/base_grammar_backend.py +5 -1
  44. sglang/srt/constrained/llguidance_backend.py +5 -0
  45. sglang/srt/constrained/outlines_backend.py +1 -1
  46. sglang/srt/constrained/reasoner_grammar_backend.py +9 -6
  47. sglang/srt/constrained/utils.py +12 -0
  48. sglang/srt/constrained/xgrammar_backend.py +20 -11
  49. sglang/srt/disaggregation/ascend/transfer_engine.py +1 -1
  50. sglang/srt/disaggregation/base/conn.py +17 -4
  51. sglang/srt/disaggregation/common/conn.py +4 -2
  52. sglang/srt/disaggregation/decode.py +123 -31
  53. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +1 -1
  54. sglang/srt/disaggregation/fake/conn.py +11 -3
  55. sglang/srt/disaggregation/mooncake/conn.py +157 -19
  56. sglang/srt/disaggregation/nixl/conn.py +69 -24
  57. sglang/srt/disaggregation/prefill.py +96 -270
  58. sglang/srt/distributed/device_communicators/all_reduce_utils.py +4 -4
  59. sglang/srt/distributed/device_communicators/custom_all_reduce.py +6 -6
  60. sglang/srt/distributed/device_communicators/pymscclpp.py +2 -2
  61. sglang/srt/distributed/device_communicators/pynccl.py +24 -12
  62. sglang/srt/distributed/device_communicators/pynccl_allocator.py +2 -2
  63. sglang/srt/distributed/device_communicators/symm_mem.py +1 -1
  64. sglang/srt/distributed/naive_distributed.py +5 -4
  65. sglang/srt/distributed/parallel_state.py +70 -19
  66. sglang/srt/elastic_ep/elastic_ep.py +74 -0
  67. sglang/srt/entrypoints/context.py +3 -2
  68. sglang/srt/entrypoints/engine.py +66 -66
  69. sglang/srt/entrypoints/grpc_server.py +431 -234
  70. sglang/srt/entrypoints/harmony_utils.py +2 -2
  71. sglang/srt/entrypoints/http_server.py +120 -8
  72. sglang/srt/entrypoints/http_server_engine.py +1 -7
  73. sglang/srt/entrypoints/openai/protocol.py +225 -37
  74. sglang/srt/entrypoints/openai/serving_base.py +49 -2
  75. sglang/srt/entrypoints/openai/serving_chat.py +29 -74
  76. sglang/srt/entrypoints/openai/serving_classify.py +204 -0
  77. sglang/srt/entrypoints/openai/serving_completions.py +15 -1
  78. sglang/srt/entrypoints/openai/serving_responses.py +5 -2
  79. sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
  80. sglang/srt/environ.py +42 -4
  81. sglang/srt/eplb/eplb_algorithms/__init__.py +18 -1
  82. sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -2
  83. sglang/srt/eplb/eplb_algorithms/elasticity_aware.py +87 -0
  84. sglang/srt/eplb/expert_distribution.py +3 -4
  85. sglang/srt/eplb/expert_location_dispatch.py +2 -2
  86. sglang/srt/eplb/expert_location_updater.py +2 -2
  87. sglang/srt/function_call/base_format_detector.py +17 -18
  88. sglang/srt/function_call/function_call_parser.py +18 -14
  89. sglang/srt/function_call/glm4_moe_detector.py +1 -5
  90. sglang/srt/function_call/gpt_oss_detector.py +1 -1
  91. sglang/srt/function_call/json_array_parser.py +0 -2
  92. sglang/srt/function_call/utils.py +2 -2
  93. sglang/srt/grpc/compile_proto.py +3 -3
  94. sglang/srt/{entrypoints → grpc}/grpc_request_manager.py +112 -52
  95. sglang/srt/grpc/health_servicer.py +189 -0
  96. sglang/srt/grpc/scheduler_launcher.py +181 -0
  97. sglang/srt/grpc/sglang_scheduler_pb2.py +78 -70
  98. sglang/srt/grpc/sglang_scheduler_pb2.pyi +66 -10
  99. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +89 -1
  100. sglang/srt/layers/activation.py +4 -1
  101. sglang/srt/layers/attention/aiter_backend.py +3 -3
  102. sglang/srt/layers/attention/ascend_backend.py +17 -1
  103. sglang/srt/layers/attention/attention_registry.py +43 -23
  104. sglang/srt/layers/attention/base_attn_backend.py +20 -1
  105. sglang/srt/layers/attention/double_sparsity_backend.py +2 -2
  106. sglang/srt/layers/attention/fla/chunk.py +0 -1
  107. sglang/srt/layers/attention/fla/chunk_o.py +1 -1
  108. sglang/srt/layers/attention/fla/index.py +0 -2
  109. sglang/srt/layers/attention/fla/layernorm_gated.py +50 -32
  110. sglang/srt/layers/attention/fla/utils.py +0 -3
  111. sglang/srt/layers/attention/fla/wy_fast.py +0 -2
  112. sglang/srt/layers/attention/flashattention_backend.py +12 -8
  113. sglang/srt/layers/attention/flashinfer_backend.py +248 -21
  114. sglang/srt/layers/attention/flashinfer_mla_backend.py +20 -18
  115. sglang/srt/layers/attention/flashmla_backend.py +2 -2
  116. sglang/srt/layers/attention/hybrid_attn_backend.py +1 -1
  117. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +165 -62
  118. sglang/srt/layers/attention/intel_amx_backend.py +1 -1
  119. sglang/srt/layers/attention/mamba/causal_conv1d.py +1 -1
  120. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +9 -5
  121. sglang/srt/layers/attention/mamba/mamba.py +189 -241
  122. sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
  123. sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
  124. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +0 -50
  125. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +0 -60
  126. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +0 -111
  127. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +0 -1
  128. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +0 -11
  129. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +1 -1
  130. sglang/srt/layers/attention/nsa/nsa_indexer.py +40 -83
  131. sglang/srt/layers/attention/nsa/triton_kernel.py +136 -0
  132. sglang/srt/layers/attention/nsa/utils.py +0 -1
  133. sglang/srt/layers/attention/nsa_backend.py +404 -90
  134. sglang/srt/layers/attention/triton_backend.py +208 -34
  135. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +2 -2
  136. sglang/srt/layers/attention/triton_ops/extend_attention.py +539 -44
  137. sglang/srt/layers/attention/trtllm_mha_backend.py +2 -2
  138. sglang/srt/layers/attention/trtllm_mla_backend.py +361 -30
  139. sglang/srt/layers/attention/utils.py +11 -7
  140. sglang/srt/layers/attention/vision.py +3 -3
  141. sglang/srt/layers/attention/xpu_backend.py +1028 -0
  142. sglang/srt/layers/communicator.py +11 -7
  143. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/compile_utils.py +4 -8
  144. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/configurer.py +4 -3
  145. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/entrypoint.py +3 -3
  146. sglang/srt/layers/dp_attention.py +17 -0
  147. sglang/srt/layers/layernorm.py +45 -15
  148. sglang/srt/layers/linear.py +9 -1
  149. sglang/srt/layers/logits_processor.py +147 -17
  150. sglang/srt/layers/modelopt_utils.py +11 -0
  151. sglang/srt/layers/moe/cutlass_moe.py +0 -2
  152. sglang/srt/layers/moe/cutlass_w4a8_moe.py +213 -21
  153. sglang/srt/layers/moe/ep_moe/kernels.py +35 -457
  154. sglang/srt/layers/moe/ep_moe/layer.py +119 -397
  155. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +1 -1
  156. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  157. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json +146 -0
  158. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +11 -3
  159. sglang/srt/layers/moe/fused_moe_triton/layer.py +76 -70
  160. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +18 -42
  161. sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
  162. sglang/srt/layers/moe/moe_runner/runner.py +3 -0
  163. sglang/srt/layers/moe/moe_runner/triton.py +3 -1
  164. sglang/srt/layers/moe/rocm_moe_utils.py +0 -1
  165. sglang/srt/layers/moe/router.py +51 -15
  166. sglang/srt/layers/moe/token_dispatcher/__init__.py +10 -0
  167. sglang/srt/layers/moe/token_dispatcher/base.py +1 -1
  168. sglang/srt/layers/moe/token_dispatcher/deepep.py +110 -97
  169. sglang/srt/layers/moe/token_dispatcher/mooncake.py +386 -0
  170. sglang/srt/layers/moe/token_dispatcher/standard.py +46 -0
  171. sglang/srt/layers/moe/topk.py +3 -2
  172. sglang/srt/layers/moe/utils.py +17 -1
  173. sglang/srt/layers/quantization/__init__.py +2 -53
  174. sglang/srt/layers/quantization/awq.py +183 -6
  175. sglang/srt/layers/quantization/awq_triton.py +29 -0
  176. sglang/srt/layers/quantization/base_config.py +20 -1
  177. sglang/srt/layers/quantization/compressed_tensors/__init__.py +7 -0
  178. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +20 -49
  179. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +421 -70
  180. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +3 -0
  181. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +4 -22
  182. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +339 -0
  183. sglang/srt/layers/quantization/fp8.py +84 -18
  184. sglang/srt/layers/quantization/fp8_kernel.py +55 -10
  185. sglang/srt/layers/quantization/fp8_utils.py +42 -14
  186. sglang/srt/layers/quantization/fpgemm_fp8.py +2 -3
  187. sglang/srt/layers/quantization/gptq.py +0 -1
  188. sglang/srt/layers/quantization/int8_kernel.py +18 -2
  189. sglang/srt/layers/quantization/marlin_utils.py +12 -0
  190. sglang/srt/layers/quantization/modelopt_quant.py +125 -100
  191. sglang/srt/layers/quantization/mxfp4.py +5 -30
  192. sglang/srt/layers/quantization/petit.py +1 -1
  193. sglang/srt/layers/quantization/quark/quark.py +3 -1
  194. sglang/srt/layers/quantization/quark/quark_moe.py +3 -3
  195. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +0 -7
  196. sglang/srt/layers/quantization/unquant.py +1 -4
  197. sglang/srt/layers/quantization/utils.py +0 -1
  198. sglang/srt/layers/quantization/w4afp8.py +51 -20
  199. sglang/srt/layers/quantization/w8a8_int8.py +30 -24
  200. sglang/srt/layers/radix_attention.py +59 -9
  201. sglang/srt/layers/rotary_embedding.py +673 -16
  202. sglang/srt/layers/sampler.py +36 -16
  203. sglang/srt/layers/sparse_pooler.py +98 -0
  204. sglang/srt/layers/utils.py +0 -1
  205. sglang/srt/layers/vocab_parallel_embedding.py +4 -1
  206. sglang/srt/lora/backend/triton_backend.py +0 -1
  207. sglang/srt/lora/eviction_policy.py +139 -0
  208. sglang/srt/lora/lora_manager.py +24 -9
  209. sglang/srt/lora/lora_registry.py +1 -1
  210. sglang/srt/lora/mem_pool.py +40 -16
  211. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +1 -1
  212. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +4 -2
  213. sglang/srt/managers/cache_controller.py +48 -17
  214. sglang/srt/managers/data_parallel_controller.py +146 -42
  215. sglang/srt/managers/detokenizer_manager.py +40 -13
  216. sglang/srt/managers/io_struct.py +66 -16
  217. sglang/srt/managers/mm_utils.py +20 -18
  218. sglang/srt/managers/multi_tokenizer_mixin.py +66 -81
  219. sglang/srt/managers/overlap_utils.py +96 -19
  220. sglang/srt/managers/schedule_batch.py +241 -511
  221. sglang/srt/managers/schedule_policy.py +15 -2
  222. sglang/srt/managers/scheduler.py +399 -499
  223. sglang/srt/managers/scheduler_metrics_mixin.py +55 -8
  224. sglang/srt/managers/scheduler_output_processor_mixin.py +317 -111
  225. sglang/srt/managers/scheduler_pp_mixin.py +341 -0
  226. sglang/srt/managers/scheduler_profiler_mixin.py +57 -10
  227. sglang/srt/managers/scheduler_runtime_checker_mixin.py +217 -0
  228. sglang/srt/managers/scheduler_update_weights_mixin.py +33 -14
  229. sglang/srt/managers/tokenizer_communicator_mixin.py +71 -55
  230. sglang/srt/managers/tokenizer_manager.py +378 -90
  231. sglang/srt/managers/tp_worker.py +212 -161
  232. sglang/srt/managers/utils.py +78 -2
  233. sglang/srt/mem_cache/allocator.py +7 -2
  234. sglang/srt/mem_cache/allocator_ascend.py +2 -2
  235. sglang/srt/mem_cache/base_prefix_cache.py +2 -2
  236. sglang/srt/mem_cache/chunk_cache.py +13 -2
  237. sglang/srt/mem_cache/common.py +480 -0
  238. sglang/srt/mem_cache/evict_policy.py +16 -1
  239. sglang/srt/mem_cache/hicache_storage.py +4 -1
  240. sglang/srt/mem_cache/hiradix_cache.py +16 -3
  241. sglang/srt/mem_cache/mamba_radix_cache.py +993 -0
  242. sglang/srt/mem_cache/memory_pool.py +435 -219
  243. sglang/srt/mem_cache/memory_pool_host.py +0 -1
  244. sglang/srt/mem_cache/multimodal_cache.py +0 -1
  245. sglang/srt/mem_cache/radix_cache.py +53 -19
  246. sglang/srt/mem_cache/radix_cache_cpp.py +19 -14
  247. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +8 -2
  248. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +1 -13
  249. sglang/srt/mem_cache/storage/backend_factory.py +2 -2
  250. sglang/srt/mem_cache/storage/eic/eic_storage.py +5 -6
  251. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +0 -1
  252. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +9 -3
  253. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +5 -3
  254. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +101 -17
  255. sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +38 -9
  256. sglang/srt/mem_cache/storage/nixl/nixl_utils.py +1 -1
  257. sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +17 -2
  258. sglang/srt/mem_cache/swa_radix_cache.py +92 -26
  259. sglang/srt/metrics/collector.py +31 -0
  260. sglang/srt/metrics/func_timer.py +1 -1
  261. sglang/srt/model_executor/cuda_graph_runner.py +43 -5
  262. sglang/srt/model_executor/forward_batch_info.py +28 -23
  263. sglang/srt/model_executor/model_runner.py +379 -139
  264. sglang/srt/model_executor/npu_graph_runner.py +2 -3
  265. sglang/srt/model_executor/piecewise_cuda_graph_runner.py +539 -0
  266. sglang/srt/model_loader/__init__.py +1 -1
  267. sglang/srt/model_loader/loader.py +424 -27
  268. sglang/srt/model_loader/utils.py +0 -1
  269. sglang/srt/model_loader/weight_utils.py +47 -28
  270. sglang/srt/models/apertus.py +2 -3
  271. sglang/srt/models/arcee.py +2 -2
  272. sglang/srt/models/bailing_moe.py +13 -52
  273. sglang/srt/models/bailing_moe_nextn.py +3 -4
  274. sglang/srt/models/bert.py +1 -1
  275. sglang/srt/models/deepseek_nextn.py +19 -3
  276. sglang/srt/models/deepseek_ocr.py +1516 -0
  277. sglang/srt/models/deepseek_v2.py +273 -98
  278. sglang/srt/models/dots_ocr.py +0 -2
  279. sglang/srt/models/dots_vlm.py +0 -1
  280. sglang/srt/models/dots_vlm_vit.py +1 -1
  281. sglang/srt/models/falcon_h1.py +13 -19
  282. sglang/srt/models/gemma3_mm.py +16 -0
  283. sglang/srt/models/gemma3n_mm.py +1 -2
  284. sglang/srt/models/glm4_moe.py +14 -37
  285. sglang/srt/models/glm4_moe_nextn.py +2 -2
  286. sglang/srt/models/glm4v.py +2 -1
  287. sglang/srt/models/glm4v_moe.py +5 -5
  288. sglang/srt/models/gpt_oss.py +5 -5
  289. sglang/srt/models/grok.py +10 -23
  290. sglang/srt/models/hunyuan.py +2 -7
  291. sglang/srt/models/interns1.py +0 -1
  292. sglang/srt/models/kimi_vl.py +1 -7
  293. sglang/srt/models/kimi_vl_moonvit.py +3 -1
  294. sglang/srt/models/llama.py +2 -2
  295. sglang/srt/models/llama_eagle3.py +1 -1
  296. sglang/srt/models/longcat_flash.py +5 -22
  297. sglang/srt/models/longcat_flash_nextn.py +3 -14
  298. sglang/srt/models/mimo.py +2 -13
  299. sglang/srt/models/mimo_mtp.py +1 -2
  300. sglang/srt/models/minicpmo.py +7 -5
  301. sglang/srt/models/mixtral.py +1 -4
  302. sglang/srt/models/mllama.py +1 -1
  303. sglang/srt/models/mllama4.py +13 -3
  304. sglang/srt/models/nemotron_h.py +511 -0
  305. sglang/srt/models/olmo2.py +31 -4
  306. sglang/srt/models/opt.py +5 -5
  307. sglang/srt/models/phi.py +1 -1
  308. sglang/srt/models/phi4mm.py +1 -1
  309. sglang/srt/models/phimoe.py +0 -1
  310. sglang/srt/models/pixtral.py +0 -3
  311. sglang/srt/models/points_v15_chat.py +186 -0
  312. sglang/srt/models/qwen.py +0 -1
  313. sglang/srt/models/qwen2_5_vl.py +3 -3
  314. sglang/srt/models/qwen2_audio.py +2 -15
  315. sglang/srt/models/qwen2_moe.py +15 -12
  316. sglang/srt/models/qwen2_vl.py +5 -2
  317. sglang/srt/models/qwen3_moe.py +19 -35
  318. sglang/srt/models/qwen3_next.py +7 -12
  319. sglang/srt/models/qwen3_next_mtp.py +3 -4
  320. sglang/srt/models/qwen3_omni_moe.py +661 -0
  321. sglang/srt/models/qwen3_vl.py +37 -33
  322. sglang/srt/models/qwen3_vl_moe.py +57 -185
  323. sglang/srt/models/roberta.py +55 -3
  324. sglang/srt/models/sarashina2_vision.py +0 -1
  325. sglang/srt/models/step3_vl.py +3 -5
  326. sglang/srt/models/utils.py +11 -1
  327. sglang/srt/multimodal/processors/base_processor.py +6 -2
  328. sglang/srt/multimodal/processors/deepseek_ocr.py +37 -0
  329. sglang/srt/multimodal/processors/deepseek_vl_v2.py +0 -3
  330. sglang/srt/multimodal/processors/dots_vlm.py +0 -1
  331. sglang/srt/multimodal/processors/glm4v.py +1 -5
  332. sglang/srt/multimodal/processors/internvl.py +0 -2
  333. sglang/srt/multimodal/processors/janus_pro.py +0 -1
  334. sglang/srt/multimodal/processors/mllama4.py +0 -8
  335. sglang/srt/multimodal/processors/phi4mm.py +0 -1
  336. sglang/srt/multimodal/processors/points_v15_chat.py +52 -0
  337. sglang/srt/multimodal/processors/qwen_vl.py +75 -16
  338. sglang/srt/multimodal/processors/step3_vl.py +1 -1
  339. sglang/srt/parser/conversation.py +41 -0
  340. sglang/srt/parser/reasoning_parser.py +0 -1
  341. sglang/srt/sampling/custom_logit_processor.py +77 -2
  342. sglang/srt/sampling/sampling_batch_info.py +17 -22
  343. sglang/srt/sampling/sampling_params.py +70 -2
  344. sglang/srt/server_args.py +577 -73
  345. sglang/srt/server_args_config_parser.py +1 -1
  346. sglang/srt/single_batch_overlap.py +38 -28
  347. sglang/srt/speculative/base_spec_worker.py +34 -0
  348. sglang/srt/speculative/draft_utils.py +226 -0
  349. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +24 -7
  350. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +23 -2
  351. sglang/srt/speculative/eagle_info.py +57 -18
  352. sglang/srt/speculative/eagle_info_v2.py +458 -0
  353. sglang/srt/speculative/eagle_utils.py +138 -0
  354. sglang/srt/speculative/eagle_worker.py +83 -280
  355. sglang/srt/speculative/eagle_worker_v2.py +702 -0
  356. sglang/srt/speculative/{ngram_utils.py → ngram_info.py} +14 -9
  357. sglang/srt/speculative/ngram_worker.py +12 -11
  358. sglang/srt/speculative/spec_info.py +2 -0
  359. sglang/srt/speculative/spec_utils.py +38 -3
  360. sglang/srt/speculative/standalone_worker.py +4 -14
  361. sglang/srt/tokenizer/tiktoken_tokenizer.py +2 -2
  362. sglang/srt/two_batch_overlap.py +28 -14
  363. sglang/srt/utils/__init__.py +1 -1
  364. sglang/srt/{bench_utils.py → utils/bench_utils.py} +4 -2
  365. sglang/srt/utils/common.py +192 -47
  366. sglang/srt/utils/hf_transformers_utils.py +40 -17
  367. sglang/srt/{host_shared_memory.py → utils/host_shared_memory.py} +0 -1
  368. sglang/srt/{offloader.py → utils/offloader.py} +4 -4
  369. sglang/srt/utils/profile_merger.py +199 -0
  370. sglang/test/attention/test_flashattn_backend.py +1 -1
  371. sglang/test/attention/test_flashattn_mla_backend.py +0 -1
  372. sglang/test/attention/test_prefix_chunk_info.py +0 -2
  373. sglang/test/attention/test_trtllm_mla_backend.py +221 -53
  374. sglang/test/few_shot_gsm8k_engine.py +2 -4
  375. sglang/test/kit_matched_stop.py +157 -0
  376. sglang/test/longbench_v2/__init__.py +1 -0
  377. sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
  378. sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
  379. sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
  380. sglang/test/run_eval.py +41 -0
  381. sglang/test/runners.py +2 -0
  382. sglang/test/send_one.py +42 -7
  383. sglang/test/simple_eval_common.py +3 -0
  384. sglang/test/simple_eval_gpqa.py +0 -1
  385. sglang/test/simple_eval_humaneval.py +0 -3
  386. sglang/test/simple_eval_longbench_v2.py +344 -0
  387. sglang/test/test_block_fp8.py +1 -2
  388. sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -1
  389. sglang/test/test_cutlass_moe.py +1 -2
  390. sglang/test/test_cutlass_w4a8_moe.py +10 -20
  391. sglang/test/test_deterministic.py +232 -99
  392. sglang/test/test_deterministic_utils.py +73 -0
  393. sglang/test/test_disaggregation_utils.py +81 -0
  394. sglang/test/test_marlin_moe.py +0 -1
  395. sglang/test/test_utils.py +85 -20
  396. sglang/version.py +1 -1
  397. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/METADATA +45 -33
  398. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/RECORD +404 -345
  399. sglang/srt/layers/attention/mamba/mamba_utils.py +0 -81
  400. sglang/srt/managers/tp_worker_overlap_thread.py +0 -311
  401. sglang/srt/speculative/build_eagle_tree.py +0 -427
  402. sglang/test/test_block_fp8_ep.py +0 -358
  403. /sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/__init__.py +0 -0
  404. /sglang/srt/{aio_rwlock.py → utils/aio_rwlock.py} +0 -0
  405. /sglang/srt/{torch_memory_saver_adapter.py → utils/torch_memory_saver_adapter.py} +0 -0
  406. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/WHEEL +0 -0
  407. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/licenses/LICENSE +0 -0
  408. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/top_level.txt +0 -0
@@ -6,7 +6,6 @@ from typing import Iterable, List, Optional, Tuple
6
6
 
7
7
  import torch
8
8
  import torch.nn as nn
9
- from transformers.activations import ACT2FN
10
9
 
11
10
  from sglang.srt.configs import DotsOCRConfig
12
11
  from sglang.srt.layers.logits_processor import LogitsProcessor
@@ -22,7 +21,6 @@ from sglang.srt.model_loader.weight_utils import default_weight_loader
22
21
  from sglang.srt.models.dots_vlm_vit import DotsVisionTransformer
23
22
  from sglang.srt.models.qwen2 import Qwen2ForCausalLM
24
23
  from sglang.srt.utils import add_prefix
25
- from sglang.srt.utils.hf_transformers_utils import get_processor
26
24
 
27
25
  logger = logging.getLogger(__name__)
28
26
 
@@ -23,7 +23,6 @@ import torch
23
23
  from torch import nn
24
24
 
25
25
  from sglang.srt.configs.dots_vlm import DotsVLMConfig
26
- from sglang.srt.distributed import parallel_state
27
26
  from sglang.srt.layers.quantization.base_config import QuantizationConfig
28
27
  from sglang.srt.managers.mm_utils import (
29
28
  MultiModalityDataPaddingPatternMultimodalTokens,
@@ -323,7 +323,7 @@ class DotsVisionTransformer(PreTrainedModel):
323
323
  dim=0,
324
324
  dtype=grid_thw.dtype if torch.jit.is_tracing() else torch.int32,
325
325
  )
326
- cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0)
326
+ cu_seqlens = torch.cat([cu_seqlens.new_zeros(1), cu_seqlens])
327
327
 
328
328
  for blk in self.blocks:
329
329
  hidden_states = blk(
@@ -1,4 +1,3 @@
1
- import enum
2
1
  import logging
3
2
  from typing import Any, Iterable, List, Optional, Set, Tuple
4
3
 
@@ -8,6 +7,10 @@ from torch import nn
8
7
  from sglang.srt.configs.falcon_h1 import FalconH1Config
9
8
  from sglang.srt.distributed import get_pp_group, get_tensor_model_parallel_world_size
10
9
  from sglang.srt.layers.activation import SiluAndMul
10
+ from sglang.srt.layers.attention.hybrid_linear_attn_backend import (
11
+ HybridLinearAttnBackend,
12
+ Mamba2AttnBackend,
13
+ )
11
14
  from sglang.srt.layers.attention.mamba.mamba import MambaMixer2
12
15
  from sglang.srt.layers.communicator import LayerCommunicator, LayerScatterModes
13
16
  from sglang.srt.layers.dp_attention import (
@@ -29,9 +32,9 @@ from sglang.srt.layers.vocab_parallel_embedding import (
29
32
  ParallelLMHead,
30
33
  VocabParallelEmbedding,
31
34
  )
32
- from sglang.srt.managers.schedule_batch import global_server_args_dict
33
35
  from sglang.srt.model_executor.forward_batch_info import ForwardBatch
34
36
  from sglang.srt.model_loader.weight_utils import default_weight_loader
37
+ from sglang.srt.server_args import get_global_server_args
35
38
  from sglang.srt.utils import add_prefix, is_cuda, make_layers
36
39
 
37
40
  logger = logging.getLogger(__name__)
@@ -184,18 +187,12 @@ class FalconH1HybridAttentionDecoderLayer(nn.Module):
184
187
  )
185
188
 
186
189
  self.mamba = MambaMixer2(
190
+ cache_params=config.mamba2_cache_params,
187
191
  hidden_size=config.hidden_size,
188
- ssm_state_size=config.mamba_d_state,
189
- conv_kernel_size=config.mamba_d_conv,
190
- intermediate_size=self.d_ssm,
191
192
  use_conv_bias=config.mamba_conv_bias,
192
193
  use_bias=config.mamba_proj_bias,
193
194
  n_groups=config.mamba_n_groups,
194
- num_heads=config.mamba_n_heads,
195
- layer_id=layer_id,
196
- head_dim=config.mamba_d_head,
197
195
  rms_norm_eps=config.rms_norm_eps,
198
- chunk_size=config.mamba_chunk_size,
199
196
  activation=config.hidden_act,
200
197
  use_rms_norm=config.mamba_rms_norm,
201
198
  prefix=f"{prefix}.mixer",
@@ -339,12 +336,16 @@ class FalconH1HybridAttentionDecoderLayer(nn.Module):
339
336
  )
340
337
  attention_hidden_states = attention_hidden_states * self.attn_out_multiplier
341
338
 
339
+ attn_backend = forward_batch.attn_backend
340
+ assert isinstance(attn_backend, HybridLinearAttnBackend)
341
+ assert isinstance(attn_backend.linear_attn_backend, Mamba2AttnBackend)
342
342
  # Mamba block
343
343
  mamba_hidden_states = torch.empty_like(hidden_states)
344
- self.mamba(
344
+ attn_backend.linear_attn_backend.forward(
345
+ self.mamba,
345
346
  hidden_states * self.ssm_in_multiplier,
346
347
  mamba_hidden_states,
347
- forward_batch=forward_batch,
348
+ layer_id=self.layer_id,
348
349
  mup_vector=self.mup_vector,
349
350
  )
350
351
  mamba_hidden_states = mamba_hidden_states * self.ssm_out_multiplier
@@ -448,13 +449,6 @@ class FalconH1Model(nn.Module):
448
449
  return hidden_states
449
450
 
450
451
 
451
- class HybridLayerType(enum.Enum):
452
- full_attention = "attention"
453
- swa_attention = "swa_attention"
454
- linear_attention = "linear_attention"
455
- mamba2 = "mamba"
456
-
457
-
458
452
  class FalconH1ForCausalLM(nn.Module):
459
453
  fall_back_to_pt_during_load = False
460
454
 
@@ -481,7 +475,7 @@ class FalconH1ForCausalLM(nn.Module):
481
475
  quant_config=quant_config,
482
476
  org_num_embeddings=config.vocab_size,
483
477
  prefix=add_prefix("lm_head", prefix),
484
- use_attn_tp_group=global_server_args_dict["enable_dp_lm_head"],
478
+ use_attn_tp_group=get_global_server_args().enable_dp_lm_head,
485
479
  )
486
480
  self.lm_head = self.lm_head.float()
487
481
  self.lm_head_multiplier = config.lm_head_multiplier
@@ -16,6 +16,7 @@
16
16
  # https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/gemma3_mm.py
17
17
 
18
18
  import logging
19
+ import re
19
20
  from functools import lru_cache
20
21
  from typing import Dict, Iterable, List, Optional, Set, Tuple, TypedDict
21
22
 
@@ -154,6 +155,10 @@ class Gemma3ForConditionalGeneration(PreTrainedModel):
154
155
  embedding_modules = {}
155
156
  embedding_padding_modules = []
156
157
  supports_lora = True
158
+ # Pattern to match language model layers only (skip vision_tower and multi_modal_projector)
159
+ lora_pattern = re.compile(
160
+ r"^language_model\.model\.layers\.(\d+)\.(?:self_attn|mlp)\.(?:qkv_proj|o_proj|down_proj|gate_up_proj)"
161
+ )
157
162
 
158
163
  def __init__(
159
164
  self,
@@ -165,6 +170,13 @@ class Gemma3ForConditionalGeneration(PreTrainedModel):
165
170
  self.config = config
166
171
  self.quant_config = quant_config
167
172
 
173
+ # For LoRA compatibility: expose text_config attributes at top level
174
+ # This allows LoRA code to work without special multimodal handling
175
+ if not hasattr(config, "num_hidden_layers"):
176
+ config.num_hidden_layers = config.text_config.num_hidden_layers
177
+ if not hasattr(config, "hidden_size"):
178
+ config.hidden_size = config.text_config.hidden_size
179
+
168
180
  self.vision_tower = SiglipVisionModel(
169
181
  config=config.vision_config,
170
182
  quant_config=quant_config,
@@ -380,6 +392,10 @@ class Gemma3ForConditionalGeneration(PreTrainedModel):
380
392
 
381
393
  return hs
382
394
 
395
+ def should_apply_lora(self, module_name: str) -> bool:
396
+ """Skip vision tower and multi_modal_projector for LoRA."""
397
+ return bool(self.lora_pattern.match(module_name))
398
+
383
399
  def tie_weights(self):
384
400
  return self.language_model.tie_weights()
385
401
 
@@ -14,8 +14,7 @@ from transformers import (
14
14
  )
15
15
  from transformers.models.auto.modeling_auto import AutoModel
16
16
 
17
- from sglang.srt.layers.layernorm import RMSNorm
18
- from sglang.srt.layers.linear import ColumnParallelLinear, RowParallelLinear
17
+ from sglang.srt.layers.linear import RowParallelLinear
19
18
  from sglang.srt.layers.logits_processor import LogitsProcessor
20
19
  from sglang.srt.layers.quantization.base_config import QuantizationConfig
21
20
  from sglang.srt.layers.vocab_parallel_embedding import VocabParallelEmbedding
@@ -27,7 +27,6 @@ from sglang.srt.distributed import (
27
27
  get_pp_group,
28
28
  get_tensor_model_parallel_rank,
29
29
  get_tensor_model_parallel_world_size,
30
- parallel_state,
31
30
  tensor_model_parallel_all_reduce,
32
31
  )
33
32
  from sglang.srt.layers.activation import SiluAndMul
@@ -44,30 +43,23 @@ from sglang.srt.layers.dp_attention import (
44
43
  )
45
44
  from sglang.srt.layers.layernorm import RMSNorm
46
45
  from sglang.srt.layers.linear import (
47
- ColumnParallelLinear,
48
46
  MergedColumnParallelLinear,
49
47
  QKVParallelLinear,
50
- ReplicatedLinear,
51
48
  RowParallelLinear,
52
49
  )
53
50
  from sglang.srt.layers.logits_processor import LogitsProcessor
54
- from sglang.srt.layers.moe import get_deepep_mode, get_moe_a2a_backend
51
+ from sglang.srt.layers.moe import get_moe_a2a_backend
55
52
  from sglang.srt.layers.moe.ep_moe.layer import get_moe_impl_class
56
53
  from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoE
57
54
  from sglang.srt.layers.moe.topk import TopK
58
55
  from sglang.srt.layers.quantization.base_config import QuantizationConfig
59
- from sglang.srt.layers.quantization.fp8_kernel import (
60
- is_fp8_fnuz,
61
- per_tensor_quant_mla_fp8,
62
- per_token_group_quant_mla_deep_gemm_masked_fp8,
63
- )
56
+ from sglang.srt.layers.quantization.fp8_kernel import is_fp8_fnuz
64
57
  from sglang.srt.layers.radix_attention import RadixAttention
65
58
  from sglang.srt.layers.rotary_embedding import get_rope
66
59
  from sglang.srt.layers.vocab_parallel_embedding import (
67
60
  ParallelLMHead,
68
61
  VocabParallelEmbedding,
69
62
  )
70
- from sglang.srt.managers.schedule_batch import global_server_args_dict
71
63
  from sglang.srt.model_executor.cuda_graph_runner import get_is_capture_mode
72
64
  from sglang.srt.model_executor.forward_batch_info import ForwardBatch
73
65
  from sglang.srt.model_loader.weight_utils import default_weight_loader
@@ -77,21 +69,17 @@ from sglang.srt.models.deepseek_v2 import (
77
69
  DeepseekV2Model,
78
70
  DeepseekV2MoE,
79
71
  )
80
- from sglang.srt.two_batch_overlap import MaybeTboDeepEPDispatcher
72
+ from sglang.srt.server_args import get_global_server_args
81
73
  from sglang.srt.utils import (
82
74
  BumpAllocator,
83
75
  LazyValue,
84
76
  add_prefix,
85
- bind_or_assign,
86
77
  cpu_has_amx_support,
87
78
  get_bool_env_var,
88
79
  get_device_sm,
89
- get_int_env_var,
90
80
  is_cpu,
91
81
  is_cuda,
92
- is_flashinfer_available,
93
82
  is_hip,
94
- is_non_idle_and_non_empty,
95
83
  log_info_on_rank0,
96
84
  use_intel_amx_backend,
97
85
  )
@@ -395,7 +383,7 @@ class Glm4MoeSparseMoeBlock(DeepseekV2MoE):
395
383
  self.n_shared_experts = config.n_shared_experts
396
384
  self.num_fused_shared_experts = (
397
385
  0
398
- if global_server_args_dict["disable_shared_experts_fusion"]
386
+ if get_global_server_args().disable_shared_experts_fusion
399
387
  else config.n_shared_experts
400
388
  )
401
389
  self.config = config
@@ -432,7 +420,7 @@ class Glm4MoeSparseMoeBlock(DeepseekV2MoE):
432
420
  self.experts = get_moe_impl_class(quant_config)(
433
421
  num_experts=config.n_routed_experts
434
422
  + self.num_fused_shared_experts
435
- + global_server_args_dict["ep_num_redundant_experts"],
423
+ + get_global_server_args().ep_num_redundant_experts,
436
424
  num_fused_shared_experts=self.num_fused_shared_experts,
437
425
  top_k=config.num_experts_per_tok + self.num_fused_shared_experts,
438
426
  hidden_size=config.hidden_size,
@@ -471,12 +459,12 @@ class Glm4MoeSparseMoeBlock(DeepseekV2MoE):
471
459
 
472
460
  self.top_k = config.num_experts_per_tok
473
461
 
474
- if get_moe_a2a_backend().is_deepep():
462
+ if get_moe_a2a_backend().is_deepep() or get_moe_a2a_backend().is_mooncake():
475
463
  # TODO: we will support tp < ep in the future
476
464
  self.ep_size = get_moe_expert_parallel_world_size()
477
465
  self.num_experts = (
478
466
  config.n_routed_experts
479
- + global_server_args_dict["ep_num_redundant_experts"]
467
+ + get_global_server_args().ep_num_redundant_experts
480
468
  )
481
469
  self.renormalize = config.norm_topk_prob
482
470
  self.topk_group = config.topk_group
@@ -487,20 +475,9 @@ class Glm4MoeSparseMoeBlock(DeepseekV2MoE):
487
475
  else None
488
476
  )
489
477
 
490
- self.deepep_dispatcher = MaybeTboDeepEPDispatcher(
491
- group=parallel_state.get_tp_group().device_group,
492
- router_topk=self.top_k,
493
- permute_fusion=True,
494
- num_experts=self.num_experts,
495
- num_local_experts=config.n_routed_experts // self.tp_size,
496
- hidden_size=config.hidden_size,
497
- params_dtype=config.torch_dtype,
498
- deepep_mode=get_deepep_mode(),
499
- async_finish=True,
500
- return_recv_hook=True,
501
- )
502
-
503
- self._enable_deepep_moe = get_moe_a2a_backend().is_deepep()
478
+ self._enable_a2a_moe = (
479
+ get_moe_a2a_backend().is_deepep() or get_moe_a2a_backend().is_mooncake()
480
+ )
504
481
 
505
482
  def forward_normal_dual_stream(
506
483
  self,
@@ -664,7 +641,7 @@ class Glm4MoeDecoderLayer(DeepseekV2DecoderLayer):
664
641
  layer_scatter_modes=self.layer_scatter_modes,
665
642
  input_layernorm=self.input_layernorm,
666
643
  post_attention_layernorm=self.post_attention_layernorm,
667
- allow_reduce_scatter=True,
644
+ allow_reduce_scatter=False,
668
645
  )
669
646
 
670
647
  def forward(
@@ -758,7 +735,7 @@ class Glm4MoeForCausalLM(DeepseekV2ForCausalLM):
758
735
  config.hidden_size,
759
736
  quant_config=quant_config,
760
737
  prefix=add_prefix("lm_head", prefix),
761
- use_attn_tp_group=global_server_args_dict["enable_dp_lm_head"],
738
+ use_attn_tp_group=get_global_server_args().enable_dp_lm_head,
762
739
  )
763
740
  self.logits_processor = LogitsProcessor(config)
764
741
 
@@ -774,7 +751,7 @@ class Glm4MoeForCausalLM(DeepseekV2ForCausalLM):
774
751
  self, architecture: str = "Glm4MoeForCausalLM"
775
752
  ):
776
753
  self.num_fused_shared_experts = 0
777
- if global_server_args_dict["disable_shared_experts_fusion"]:
754
+ if get_global_server_args().disable_shared_experts_fusion:
778
755
  return
779
756
 
780
757
  # Only Deepseek V3/R1 can use shared experts fusion optimization now.
@@ -790,7 +767,7 @@ class Glm4MoeForCausalLM(DeepseekV2ForCausalLM):
790
767
  disable_reason = "Deepseek and GLM-4.5 or GLM-4.6 can not use shared experts fusion optimization under expert parallelism."
791
768
 
792
769
  if disable_reason is not None:
793
- global_server_args_dict["disable_shared_experts_fusion"] = True
770
+ get_global_server_args().disable_shared_experts_fusion = True
794
771
  self.num_fused_shared_experts = 0
795
772
  log_info_on_rank0(
796
773
  logger,
@@ -30,9 +30,9 @@ from sglang.srt.layers.vocab_parallel_embedding import (
30
30
  ParallelLMHead,
31
31
  VocabParallelEmbedding,
32
32
  )
33
- from sglang.srt.managers.schedule_batch import global_server_args_dict
34
33
  from sglang.srt.model_executor.forward_batch_info import ForwardBatch
35
34
  from sglang.srt.models.glm4_moe import Glm4MoeDecoderLayer, Glm4MoeForCausalLM
35
+ from sglang.srt.server_args import get_global_server_args
36
36
  from sglang.srt.utils import BumpAllocator, add_prefix
37
37
 
38
38
  logger = logging.getLogger(__name__)
@@ -145,7 +145,7 @@ class Glm4MoeForCausalLMNextN(Glm4MoeForCausalLM):
145
145
  config.hidden_size,
146
146
  quant_config=quant_config,
147
147
  prefix=add_prefix("model.shared_head.head", prefix),
148
- use_attn_tp_group=global_server_args_dict["enable_dp_lm_head"],
148
+ use_attn_tp_group=get_global_server_args().enable_dp_lm_head,
149
149
  )
150
150
  self.logits_processor = LogitsProcessor(config)
151
151
 
@@ -9,6 +9,7 @@ from transformers.models.glm4v.configuration_glm4v import Glm4vConfig, Glm4vVisi
9
9
 
10
10
  from sglang.srt.layers.activation import SiluAndMul
11
11
  from sglang.srt.layers.attention import vision_utils
12
+ from sglang.srt.layers.dp_attention import get_attention_tp_size
12
13
  from sglang.srt.layers.layernorm import RMSNorm
13
14
  from sglang.srt.layers.linear import (
14
15
  ColumnParallelLinear,
@@ -434,7 +435,7 @@ class Glm4vVisionModel(nn.Module):
434
435
  cu_seqlens = torch.repeat_interleave(
435
436
  grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]
436
437
  ).cumsum(dim=0, dtype=torch.int32)
437
- cu_seqlens = F.pad(cu_seqlens, (1, 0), "constant", 0)
438
+ cu_seqlens = torch.cat([cu_seqlens.new_zeros(1), cu_seqlens])
438
439
 
439
440
  seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()
440
441
  x = self.embeddings(
@@ -16,10 +16,10 @@ from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
16
16
  from sglang.srt.layers.pooler import Pooler, PoolingType
17
17
  from sglang.srt.layers.quantization.base_config import QuantizationConfig
18
18
  from sglang.srt.layers.vocab_parallel_embedding import ParallelLMHead
19
- from sglang.srt.managers.schedule_batch import global_server_args_dict
20
19
  from sglang.srt.model_loader.weight_utils import default_weight_loader
21
20
  from sglang.srt.models.glm4_moe import Glm4MoeModel
22
21
  from sglang.srt.models.glm4v import Glm4vForConditionalGeneration, Glm4vVisionModel
22
+ from sglang.srt.server_args import get_global_server_args
23
23
  from sglang.srt.utils import add_prefix, is_cuda, log_info_on_rank0
24
24
  from sglang.srt.utils.hf_transformers_utils import get_processor
25
25
 
@@ -47,7 +47,7 @@ class Glm4vMoeForConditionalGeneration(Glm4vForConditionalGeneration):
47
47
  self.determine_num_fused_shared_experts("Glm4MoeForCausalLM")
48
48
  self.num_fused_shared_experts = (
49
49
  0
50
- if global_server_args_dict["disable_shared_experts_fusion"]
50
+ if get_global_server_args().disable_shared_experts_fusion
51
51
  else config.n_shared_experts
52
52
  )
53
53
 
@@ -68,7 +68,7 @@ class Glm4vMoeForConditionalGeneration(Glm4vForConditionalGeneration):
68
68
  config.hidden_size,
69
69
  quant_config=quant_config,
70
70
  prefix=add_prefix("lm_head", prefix),
71
- use_attn_tp_group=global_server_args_dict["enable_dp_lm_head"],
71
+ use_attn_tp_group=get_global_server_args().enable_dp_lm_head,
72
72
  )
73
73
  self.logits_processor = LogitsProcessor(config)
74
74
  self.pooler = Pooler(pooling_type=PoolingType.LAST, normalize=True)
@@ -81,7 +81,7 @@ class Glm4vMoeForConditionalGeneration(Glm4vForConditionalGeneration):
81
81
  self, architecture: str = "Glm4MoeForCausalLM"
82
82
  ):
83
83
  self.num_fused_shared_experts = 0
84
- if global_server_args_dict["disable_shared_experts_fusion"]:
84
+ if get_global_server_args().disable_shared_experts_fusion:
85
85
  return
86
86
 
87
87
  # Only Deepseek V3/R1 can use shared experts fusion optimization now.
@@ -97,7 +97,7 @@ class Glm4vMoeForConditionalGeneration(Glm4vForConditionalGeneration):
97
97
  disable_reason = "Deepseek and GLM-4.5 can not use shared experts fusion optimization under expert parallelism."
98
98
 
99
99
  if disable_reason is not None:
100
- global_server_args_dict["disable_shared_experts_fusion"] = True
100
+ get_global_server_args().disable_shared_experts_fusion = True
101
101
  self.num_fused_shared_experts = 0
102
102
  log_info_on_rank0(
103
103
  logger,
@@ -63,13 +63,13 @@ from sglang.srt.layers.vocab_parallel_embedding import (
63
63
  ParallelLMHead,
64
64
  VocabParallelEmbedding,
65
65
  )
66
- from sglang.srt.managers.schedule_batch import global_server_args_dict
67
66
  from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors
68
67
  from sglang.srt.model_loader.weight_utils import default_weight_loader
69
68
  from sglang.srt.models.utils import (
70
69
  create_fused_set_kv_buffer_arg,
71
70
  enable_fused_set_kv_buffer,
72
71
  )
72
+ from sglang.srt.server_args import get_global_server_args
73
73
  from sglang.srt.utils import (
74
74
  LazyValue,
75
75
  add_prefix,
@@ -85,7 +85,7 @@ _is_sm100_supported = is_cuda() and is_sm100_supported()
85
85
 
86
86
 
87
87
  if _is_cuda:
88
- from sgl_kernel import FusedSetKVBufferArg
88
+ from sgl_kernel import FusedSetKVBufferArg # noqa: F401
89
89
 
90
90
 
91
91
  class GptOssConfig(PretrainedConfig):
@@ -138,7 +138,7 @@ class GptOssSparseMoeBlock(nn.Module):
138
138
  }
139
139
  self.experts = experts_type(
140
140
  num_experts=config.num_local_experts
141
- + global_server_args_dict["ep_num_redundant_experts"],
141
+ + get_global_server_args().ep_num_redundant_experts,
142
142
  top_k=config.num_experts_per_tok,
143
143
  layer_id=layer_id,
144
144
  hidden_size=config.hidden_size,
@@ -259,7 +259,7 @@ class GptOssAttention(nn.Module):
259
259
 
260
260
  # Choose dtype of sinks based on attention backend: trtllm_mha requires float32,
261
261
  # others can use bfloat16
262
- attn_backend = global_server_args_dict.get("attention_backend")
262
+ attn_backend = get_global_server_args().attention_backend
263
263
  sinks_dtype = torch.float32 if attn_backend == "trtllm_mha" else torch.bfloat16
264
264
  self.sinks = nn.Parameter(
265
265
  torch.empty(self.num_heads, dtype=sinks_dtype), requires_grad=False
@@ -591,7 +591,7 @@ class GptOssForCausalLM(nn.Module):
591
591
  config.hidden_size,
592
592
  # quant_config=quant_config,
593
593
  prefix=add_prefix("lm_head", prefix),
594
- use_attn_tp_group=global_server_args_dict["enable_dp_lm_head"],
594
+ use_attn_tp_group=get_global_server_args().enable_dp_lm_head,
595
595
  )
596
596
  self.logits_processor = LogitsProcessor(config)
597
597
  self.capture_aux_hidden_states = False
sglang/srt/models/grok.py CHANGED
@@ -28,7 +28,6 @@ from torch import nn
28
28
  from transformers import PretrainedConfig
29
29
 
30
30
  from sglang.srt.distributed import (
31
- get_moe_expert_parallel_world_size,
32
31
  get_tensor_model_parallel_rank,
33
32
  get_tensor_model_parallel_world_size,
34
33
  tensor_model_parallel_all_gather,
@@ -36,7 +35,6 @@ from sglang.srt.distributed import (
36
35
  )
37
36
  from sglang.srt.layers.activation import GeluAndMul
38
37
  from sglang.srt.layers.elementwise import (
39
- experts_combine_triton,
40
38
  fused_dual_residual_rmsnorm,
41
39
  fused_rmsnorm,
42
40
  gelu_and_mul_triton,
@@ -49,7 +47,6 @@ from sglang.srt.layers.linear import (
49
47
  RowParallelLinear,
50
48
  )
51
49
  from sglang.srt.layers.logits_processor import LogitsProcessor
52
- from sglang.srt.layers.moe.ep_moe.layer import EPMoE
53
50
  from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
54
51
  from sglang.srt.layers.moe.router import fused_moe_router_shim
55
52
  from sglang.srt.layers.moe.topk import TopK
@@ -65,10 +62,10 @@ from sglang.srt.layers.vocab_parallel_embedding import (
65
62
  ParallelLMHead,
66
63
  VocabParallelEmbedding,
67
64
  )
68
- from sglang.srt.managers.schedule_batch import global_server_args_dict
69
65
  from sglang.srt.model_executor.forward_batch_info import ForwardBatch
70
66
  from sglang.srt.model_loader.loader import DefaultModelLoader
71
67
  from sglang.srt.model_loader.weight_utils import default_weight_loader
68
+ from sglang.srt.server_args import get_global_server_args
72
69
  from sglang.srt.utils import add_prefix, dispose_tensor, dump_to_file
73
70
 
74
71
  logger = logging.getLogger(__name__)
@@ -76,9 +73,6 @@ logger = logging.getLogger(__name__)
76
73
 
77
74
  # Dump tensors for debugging
78
75
  debug_tensor_dump_output_folder = None
79
- debug_tensor_dump_prefill_only = False
80
- # Skip all the other tensor dumps, only dump the target logits
81
- debug_tensor_dump_only_target_logprobs = False
82
76
  debug_tensor_dump_inject = False
83
77
  debug_tensor_dump_layers = None
84
78
  debug_tensor_dump_test = False
@@ -176,17 +170,7 @@ class Grok1MoE(nn.Module):
176
170
  custom_routing_function=custom_routing_function,
177
171
  )
178
172
 
179
- kwargs = {}
180
- if get_moe_expert_parallel_world_size() > 1:
181
- MoEImpl = EPMoE
182
- else:
183
- MoEImpl = FusedMoE
184
- kwargs["reduce_results"] = reduce_results
185
- kwargs["use_presharded_weights"] = use_presharded_weights
186
- kwargs["inplace"] = inplace
187
- kwargs["no_combine"] = no_combine
188
-
189
- self.experts = MoEImpl(
173
+ self.experts = FusedMoE(
190
174
  num_experts=num_experts,
191
175
  top_k=top_k,
192
176
  layer_id=layer_id,
@@ -195,7 +179,10 @@ class Grok1MoE(nn.Module):
195
179
  params_dtype=params_dtype,
196
180
  quant_config=quant_config,
197
181
  activation="gelu",
198
- **kwargs,
182
+ reduce_results=reduce_results,
183
+ use_presharded_weights=use_presharded_weights,
184
+ inplace=inplace,
185
+ no_combine=no_combine,
199
186
  )
200
187
 
201
188
  def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
@@ -877,10 +864,10 @@ class Grok1ForCausalLM(nn.Module):
877
864
 
878
865
  # Dump tensors for debugging
879
866
  global debug_tensor_dump_output_folder, debug_tensor_dump_inject
880
- debug_tensor_dump_output_folder = global_server_args_dict[
881
- "debug_tensor_dump_output_folder"
882
- ]
883
- debug_tensor_dump_inject = global_server_args_dict["debug_tensor_dump_inject"]
867
+ debug_tensor_dump_output_folder = (
868
+ get_global_server_args().debug_tensor_dump_output_folder
869
+ )
870
+ debug_tensor_dump_inject = get_global_server_args().debug_tensor_dump_inject
884
871
  warnings.filterwarnings("ignore", category=FutureWarning)
885
872
 
886
873
  if get_tensor_model_parallel_rank() == 0:
@@ -12,18 +12,14 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
  """Inference-only HunYuan model compatible with HuggingFace weights."""
15
- import logging
16
15
  import re
17
- from dataclasses import dataclass
18
- from enum import Enum, auto
19
- from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
16
+ from typing import Any, Dict, Iterable, Optional, Tuple
20
17
 
21
18
  import torch
22
19
  from torch import nn
23
20
  from transformers import PretrainedConfig
24
21
 
25
22
  from sglang.srt.distributed import (
26
- get_pp_group,
27
23
  get_tensor_model_parallel_rank,
28
24
  get_tensor_model_parallel_world_size,
29
25
  tensor_model_parallel_all_reduce,
@@ -46,7 +42,6 @@ from sglang.srt.layers.radix_attention import RadixAttention
46
42
  from sglang.srt.layers.rotary_embedding import get_rope
47
43
  from sglang.srt.layers.sampler import Sampler
48
44
  from sglang.srt.layers.vocab_parallel_embedding import (
49
- DEFAULT_VOCAB_PADDING_SIZE,
50
45
  ParallelLMHead,
51
46
  VocabParallelEmbedding,
52
47
  )
@@ -56,7 +51,7 @@ from sglang.srt.model_loader.weight_utils import (
56
51
  kv_cache_scales_loader,
57
52
  maybe_remap_kv_scale_name,
58
53
  )
59
- from sglang.srt.utils import add_prefix, is_hip
54
+ from sglang.srt.utils import is_hip
60
55
 
61
56
  expert_distribution_recorder = ExpertDistributionRecorder()
62
57
 
@@ -5,7 +5,6 @@ from torch import nn
5
5
  from transformers import PretrainedConfig
6
6
 
7
7
  from sglang.srt.layers.attention import vision_utils
8
- from sglang.srt.layers.moe.ep_moe.layer import get_moe_impl_class
9
8
  from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoE
10
9
  from sglang.srt.layers.quantization.base_config import QuantizationConfig
11
10
  from sglang.srt.managers.mm_utils import (
@@ -43,10 +43,8 @@
43
43
 
44
44
  import copy
45
45
  import logging
46
- import math
47
- from collections.abc import Mapping
48
46
  from dataclasses import dataclass
49
- from typing import Any, Iterable, List, Optional, Tuple
47
+ from typing import Iterable, List, Optional, Tuple
50
48
 
51
49
  import torch
52
50
  from torch import nn
@@ -56,10 +54,6 @@ from sglang.srt.configs import KimiVLConfig
56
54
  from sglang.srt.configs.deepseekvl2 import DeepseekV2Config
57
55
  from sglang.srt.configs.kimi_vl import KimiVLConfig
58
56
  from sglang.srt.configs.kimi_vl_moonvit import MoonViTConfig
59
- from sglang.srt.distributed import (
60
- get_tensor_model_parallel_rank,
61
- get_tensor_model_parallel_world_size,
62
- )
63
57
  from sglang.srt.layers.activation import QuickGELU
64
58
  from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
65
59
  from sglang.srt.layers.quantization.base_config import QuantizationConfig
@@ -49,7 +49,7 @@ from typing import List, Optional, Sequence, Tuple, Union
49
49
  import torch
50
50
  import torch.nn as nn
51
51
  import torch.nn.functional as F
52
- from transformers.activations import ACT2FN, GELUTanh
52
+ from transformers.activations import ACT2FN
53
53
  from transformers.modeling_utils import PreTrainedModel
54
54
 
55
55
  try:
@@ -596,6 +596,8 @@ class MoonVitPretrainedModel(PreTrainedModel):
596
596
  _supports_sdpa = True
597
597
 
598
598
  def __init__(self, config: MoonViTConfig, *inputs, **kwargs):
599
+ from transformers.activations import GELUTanh
600
+
599
601
  super().__init__(config, *inputs, **kwargs)
600
602
  config = deepcopy(config)
601
603
  self.merge_kernel_size = config.merge_kernel_size
@@ -45,13 +45,13 @@ from sglang.srt.layers.vocab_parallel_embedding import (
45
45
  ParallelLMHead,
46
46
  VocabParallelEmbedding,
47
47
  )
48
- from sglang.srt.managers.schedule_batch import global_server_args_dict
49
48
  from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors
50
49
  from sglang.srt.model_loader.weight_utils import (
51
50
  default_weight_loader,
52
51
  kv_cache_scales_loader,
53
52
  maybe_remap_kv_scale_name,
54
53
  )
54
+ from sglang.srt.server_args import get_global_server_args
55
55
  from sglang.srt.utils import add_prefix, make_layers
56
56
  from sglang.utils import get_exception_traceback
57
57
 
@@ -433,7 +433,7 @@ class LlamaForCausalLM(nn.Module):
433
433
  config.hidden_size,
434
434
  quant_config=quant_config,
435
435
  prefix=add_prefix("lm_head", prefix),
436
- use_attn_tp_group=global_server_args_dict["enable_dp_lm_head"],
436
+ use_attn_tp_group=get_global_server_args().enable_dp_lm_head,
437
437
  )
438
438
  self.logits_processor = LogitsProcessor(config)
439
439
  self.pooler = Pooler(pooling_type=PoolingType.LAST, normalize=True)