sglang 0.5.3rc2__py3-none-any.whl → 0.5.4.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (419)
  1. sglang/bench_one_batch.py +47 -28
  2. sglang/bench_one_batch_server.py +41 -25
  3. sglang/bench_serving.py +378 -160
  4. sglang/check_env.py +1 -1
  5. sglang/compile_deep_gemm.py +6 -2
  6. sglang/global_config.py +1 -25
  7. sglang/lang/api.py +6 -0
  8. sglang/lang/interpreter.py +1 -0
  9. sglang/lang/ir.py +13 -0
  10. sglang/launch_server.py +10 -15
  11. sglang/profiler.py +18 -1
  12. sglang/srt/_custom_ops.py +1 -1
  13. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +105 -10
  14. sglang/srt/checkpoint_engine/checkpoint_engine_worker.py +142 -0
  15. sglang/srt/compilation/backend.py +437 -0
  16. sglang/srt/compilation/compilation_config.py +20 -0
  17. sglang/srt/compilation/compilation_counter.py +47 -0
  18. sglang/srt/compilation/compile.py +210 -0
  19. sglang/srt/compilation/compiler_interface.py +503 -0
  20. sglang/srt/compilation/cuda_piecewise_backend.py +228 -0
  21. sglang/srt/compilation/fix_functionalization.py +134 -0
  22. sglang/srt/compilation/fx_utils.py +83 -0
  23. sglang/srt/compilation/inductor_pass.py +140 -0
  24. sglang/srt/compilation/pass_manager.py +66 -0
  25. sglang/srt/compilation/piecewise_context_manager.py +40 -0
  26. sglang/srt/compilation/weak_ref_tensor_jit.py +16 -0
  27. sglang/srt/configs/__init__.py +4 -0
  28. sglang/srt/configs/deepseek_ocr.py +262 -0
  29. sglang/srt/configs/deepseekvl2.py +194 -96
  30. sglang/srt/configs/dots_vlm.py +2 -7
  31. sglang/srt/configs/falcon_h1.py +13 -64
  32. sglang/srt/configs/load_config.py +25 -2
  33. sglang/srt/configs/mamba_utils.py +117 -0
  34. sglang/srt/configs/model_config.py +136 -25
  35. sglang/srt/configs/modelopt_config.py +30 -0
  36. sglang/srt/configs/nemotron_h.py +286 -0
  37. sglang/srt/configs/olmo3.py +105 -0
  38. sglang/srt/configs/points_v15_chat.py +29 -0
  39. sglang/srt/configs/qwen3_next.py +11 -47
  40. sglang/srt/configs/qwen3_omni.py +613 -0
  41. sglang/srt/configs/qwen3_vl.py +0 -10
  42. sglang/srt/connector/remote_instance.py +1 -1
  43. sglang/srt/constrained/base_grammar_backend.py +5 -1
  44. sglang/srt/constrained/llguidance_backend.py +5 -0
  45. sglang/srt/constrained/outlines_backend.py +1 -1
  46. sglang/srt/constrained/reasoner_grammar_backend.py +9 -6
  47. sglang/srt/constrained/utils.py +12 -0
  48. sglang/srt/constrained/xgrammar_backend.py +20 -11
  49. sglang/srt/disaggregation/ascend/transfer_engine.py +1 -1
  50. sglang/srt/disaggregation/base/conn.py +17 -4
  51. sglang/srt/disaggregation/common/conn.py +4 -2
  52. sglang/srt/disaggregation/decode.py +123 -31
  53. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +1 -1
  54. sglang/srt/disaggregation/fake/conn.py +11 -3
  55. sglang/srt/disaggregation/mooncake/conn.py +157 -19
  56. sglang/srt/disaggregation/nixl/conn.py +69 -24
  57. sglang/srt/disaggregation/prefill.py +96 -270
  58. sglang/srt/distributed/device_communicators/all_reduce_utils.py +4 -4
  59. sglang/srt/distributed/device_communicators/custom_all_reduce.py +6 -6
  60. sglang/srt/distributed/device_communicators/pymscclpp.py +2 -2
  61. sglang/srt/distributed/device_communicators/pynccl.py +24 -12
  62. sglang/srt/distributed/device_communicators/pynccl_allocator.py +2 -2
  63. sglang/srt/distributed/device_communicators/symm_mem.py +1 -1
  64. sglang/srt/distributed/naive_distributed.py +5 -4
  65. sglang/srt/distributed/parallel_state.py +63 -19
  66. sglang/srt/elastic_ep/elastic_ep.py +74 -0
  67. sglang/srt/entrypoints/context.py +3 -2
  68. sglang/srt/entrypoints/engine.py +83 -80
  69. sglang/srt/entrypoints/grpc_server.py +430 -234
  70. sglang/srt/entrypoints/harmony_utils.py +2 -2
  71. sglang/srt/entrypoints/http_server.py +195 -102
  72. sglang/srt/entrypoints/http_server_engine.py +1 -7
  73. sglang/srt/entrypoints/openai/protocol.py +225 -37
  74. sglang/srt/entrypoints/openai/serving_base.py +49 -2
  75. sglang/srt/entrypoints/openai/serving_chat.py +29 -74
  76. sglang/srt/entrypoints/openai/serving_classify.py +204 -0
  77. sglang/srt/entrypoints/openai/serving_completions.py +15 -1
  78. sglang/srt/entrypoints/openai/serving_responses.py +5 -2
  79. sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
  80. sglang/srt/environ.py +58 -6
  81. sglang/srt/eplb/eplb_algorithms/__init__.py +18 -1
  82. sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -2
  83. sglang/srt/eplb/eplb_algorithms/elasticity_aware.py +87 -0
  84. sglang/srt/eplb/expert_distribution.py +33 -4
  85. sglang/srt/eplb/expert_location_dispatch.py +2 -2
  86. sglang/srt/eplb/expert_location_updater.py +2 -2
  87. sglang/srt/function_call/base_format_detector.py +17 -18
  88. sglang/srt/function_call/function_call_parser.py +20 -14
  89. sglang/srt/function_call/glm4_moe_detector.py +1 -5
  90. sglang/srt/function_call/gpt_oss_detector.py +1 -1
  91. sglang/srt/function_call/json_array_parser.py +0 -2
  92. sglang/srt/function_call/minimax_m2.py +367 -0
  93. sglang/srt/function_call/utils.py +2 -2
  94. sglang/srt/grpc/compile_proto.py +3 -3
  95. sglang/srt/{entrypoints → grpc}/grpc_request_manager.py +112 -52
  96. sglang/srt/grpc/health_servicer.py +189 -0
  97. sglang/srt/grpc/scheduler_launcher.py +181 -0
  98. sglang/srt/grpc/sglang_scheduler_pb2.py +78 -70
  99. sglang/srt/grpc/sglang_scheduler_pb2.pyi +66 -10
  100. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +89 -1
  101. sglang/srt/layers/activation.py +10 -1
  102. sglang/srt/layers/attention/aiter_backend.py +3 -3
  103. sglang/srt/layers/attention/ascend_backend.py +17 -1
  104. sglang/srt/layers/attention/attention_registry.py +43 -23
  105. sglang/srt/layers/attention/base_attn_backend.py +20 -1
  106. sglang/srt/layers/attention/double_sparsity_backend.py +2 -2
  107. sglang/srt/layers/attention/fla/chunk.py +0 -1
  108. sglang/srt/layers/attention/fla/chunk_o.py +1 -1
  109. sglang/srt/layers/attention/fla/index.py +0 -2
  110. sglang/srt/layers/attention/fla/layernorm_gated.py +50 -32
  111. sglang/srt/layers/attention/fla/utils.py +0 -3
  112. sglang/srt/layers/attention/fla/wy_fast.py +0 -2
  113. sglang/srt/layers/attention/flashattention_backend.py +24 -10
  114. sglang/srt/layers/attention/flashinfer_backend.py +258 -22
  115. sglang/srt/layers/attention/flashinfer_mla_backend.py +38 -28
  116. sglang/srt/layers/attention/flashmla_backend.py +2 -2
  117. sglang/srt/layers/attention/hybrid_attn_backend.py +1 -1
  118. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +165 -62
  119. sglang/srt/layers/attention/intel_amx_backend.py +1 -1
  120. sglang/srt/layers/attention/mamba/causal_conv1d.py +1 -1
  121. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +9 -5
  122. sglang/srt/layers/attention/mamba/mamba.py +189 -241
  123. sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
  124. sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
  125. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +0 -50
  126. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +0 -60
  127. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +0 -111
  128. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +0 -1
  129. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +0 -11
  130. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +1 -1
  131. sglang/srt/layers/attention/nsa/nsa_indexer.py +40 -83
  132. sglang/srt/layers/attention/nsa/triton_kernel.py +136 -0
  133. sglang/srt/layers/attention/nsa/utils.py +0 -1
  134. sglang/srt/layers/attention/nsa_backend.py +404 -90
  135. sglang/srt/layers/attention/triton_backend.py +208 -34
  136. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +2 -2
  137. sglang/srt/layers/attention/triton_ops/extend_attention.py +539 -44
  138. sglang/srt/layers/attention/trtllm_mha_backend.py +2 -2
  139. sglang/srt/layers/attention/trtllm_mla_backend.py +362 -43
  140. sglang/srt/layers/attention/utils.py +89 -7
  141. sglang/srt/layers/attention/vision.py +3 -3
  142. sglang/srt/layers/attention/xpu_backend.py +1028 -0
  143. sglang/srt/layers/communicator.py +12 -7
  144. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/compile_utils.py +5 -9
  145. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/configurer.py +4 -3
  146. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/entrypoint.py +3 -3
  147. sglang/srt/layers/dp_attention.py +17 -0
  148. sglang/srt/layers/layernorm.py +64 -19
  149. sglang/srt/layers/linear.py +9 -1
  150. sglang/srt/layers/logits_processor.py +152 -17
  151. sglang/srt/layers/modelopt_utils.py +11 -0
  152. sglang/srt/layers/moe/cutlass_moe.py +0 -2
  153. sglang/srt/layers/moe/cutlass_w4a8_moe.py +351 -21
  154. sglang/srt/layers/moe/ep_moe/kernels.py +229 -457
  155. sglang/srt/layers/moe/ep_moe/layer.py +154 -625
  156. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +1 -1
  157. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  158. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json +146 -0
  159. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +11 -3
  160. sglang/srt/layers/moe/fused_moe_triton/layer.py +79 -73
  161. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +25 -46
  162. sglang/srt/layers/moe/moe_runner/deep_gemm.py +569 -0
  163. sglang/srt/layers/moe/moe_runner/runner.py +6 -0
  164. sglang/srt/layers/moe/moe_runner/triton.py +3 -1
  165. sglang/srt/layers/moe/moe_runner/triton_kernels.py +194 -0
  166. sglang/srt/layers/moe/rocm_moe_utils.py +0 -1
  167. sglang/srt/layers/moe/router.py +51 -15
  168. sglang/srt/layers/moe/token_dispatcher/__init__.py +14 -4
  169. sglang/srt/layers/moe/token_dispatcher/base.py +12 -6
  170. sglang/srt/layers/moe/token_dispatcher/deepep.py +127 -110
  171. sglang/srt/layers/moe/token_dispatcher/mooncake.py +386 -0
  172. sglang/srt/layers/moe/token_dispatcher/standard.py +46 -0
  173. sglang/srt/layers/moe/topk.py +7 -6
  174. sglang/srt/layers/moe/utils.py +20 -5
  175. sglang/srt/layers/quantization/__init__.py +5 -58
  176. sglang/srt/layers/quantization/awq.py +183 -9
  177. sglang/srt/layers/quantization/awq_triton.py +29 -0
  178. sglang/srt/layers/quantization/base_config.py +27 -1
  179. sglang/srt/layers/quantization/compressed_tensors/__init__.py +7 -0
  180. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +20 -49
  181. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +421 -70
  182. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +3 -0
  183. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +4 -22
  184. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +339 -0
  185. sglang/srt/layers/quantization/fp8.py +152 -81
  186. sglang/srt/layers/quantization/fp8_kernel.py +55 -10
  187. sglang/srt/layers/quantization/fp8_utils.py +42 -14
  188. sglang/srt/layers/quantization/fpgemm_fp8.py +2 -3
  189. sglang/srt/layers/quantization/gguf.py +566 -0
  190. sglang/srt/layers/quantization/gptq.py +0 -1
  191. sglang/srt/layers/quantization/int8_kernel.py +18 -2
  192. sglang/srt/layers/quantization/marlin_utils.py +12 -0
  193. sglang/srt/layers/quantization/modelopt_quant.py +125 -100
  194. sglang/srt/layers/quantization/mxfp4.py +35 -68
  195. sglang/srt/layers/quantization/petit.py +1 -1
  196. sglang/srt/layers/quantization/quark/quark.py +3 -1
  197. sglang/srt/layers/quantization/quark/quark_moe.py +3 -3
  198. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +0 -7
  199. sglang/srt/layers/quantization/unquant.py +23 -48
  200. sglang/srt/layers/quantization/utils.py +0 -1
  201. sglang/srt/layers/quantization/w4afp8.py +87 -20
  202. sglang/srt/layers/quantization/w8a8_int8.py +30 -24
  203. sglang/srt/layers/radix_attention.py +62 -9
  204. sglang/srt/layers/rotary_embedding.py +686 -17
  205. sglang/srt/layers/sampler.py +47 -16
  206. sglang/srt/layers/sparse_pooler.py +98 -0
  207. sglang/srt/layers/utils.py +0 -1
  208. sglang/srt/layers/vocab_parallel_embedding.py +4 -1
  209. sglang/srt/lora/backend/triton_backend.py +0 -1
  210. sglang/srt/lora/eviction_policy.py +139 -0
  211. sglang/srt/lora/lora_manager.py +24 -9
  212. sglang/srt/lora/lora_registry.py +1 -1
  213. sglang/srt/lora/mem_pool.py +40 -16
  214. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +1 -1
  215. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +4 -2
  216. sglang/srt/managers/cache_controller.py +48 -17
  217. sglang/srt/managers/data_parallel_controller.py +146 -42
  218. sglang/srt/managers/detokenizer_manager.py +40 -13
  219. sglang/srt/managers/io_struct.py +69 -16
  220. sglang/srt/managers/mm_utils.py +20 -18
  221. sglang/srt/managers/multi_tokenizer_mixin.py +83 -82
  222. sglang/srt/managers/overlap_utils.py +96 -19
  223. sglang/srt/managers/schedule_batch.py +241 -511
  224. sglang/srt/managers/schedule_policy.py +15 -2
  225. sglang/srt/managers/scheduler.py +420 -514
  226. sglang/srt/managers/scheduler_metrics_mixin.py +73 -18
  227. sglang/srt/managers/scheduler_output_processor_mixin.py +317 -111
  228. sglang/srt/managers/scheduler_pp_mixin.py +341 -0
  229. sglang/srt/managers/scheduler_profiler_mixin.py +60 -14
  230. sglang/srt/managers/scheduler_runtime_checker_mixin.py +217 -0
  231. sglang/srt/managers/scheduler_update_weights_mixin.py +33 -14
  232. sglang/srt/managers/tokenizer_communicator_mixin.py +71 -55
  233. sglang/srt/managers/tokenizer_manager.py +375 -95
  234. sglang/srt/managers/tp_worker.py +212 -161
  235. sglang/srt/managers/utils.py +78 -2
  236. sglang/srt/mem_cache/allocator.py +7 -2
  237. sglang/srt/mem_cache/allocator_ascend.py +2 -2
  238. sglang/srt/mem_cache/base_prefix_cache.py +2 -2
  239. sglang/srt/mem_cache/chunk_cache.py +13 -2
  240. sglang/srt/mem_cache/common.py +480 -0
  241. sglang/srt/mem_cache/evict_policy.py +16 -1
  242. sglang/srt/mem_cache/hicache_storage.py +11 -2
  243. sglang/srt/mem_cache/hiradix_cache.py +16 -3
  244. sglang/srt/mem_cache/mamba_radix_cache.py +993 -0
  245. sglang/srt/mem_cache/memory_pool.py +517 -219
  246. sglang/srt/mem_cache/memory_pool_host.py +0 -1
  247. sglang/srt/mem_cache/multimodal_cache.py +0 -1
  248. sglang/srt/mem_cache/radix_cache.py +53 -19
  249. sglang/srt/mem_cache/radix_cache_cpp.py +19 -14
  250. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +8 -2
  251. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +1 -13
  252. sglang/srt/mem_cache/storage/backend_factory.py +2 -2
  253. sglang/srt/mem_cache/storage/eic/eic_storage.py +5 -6
  254. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +0 -1
  255. sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +3 -2
  256. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +9 -3
  257. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +5 -3
  258. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +101 -17
  259. sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +38 -9
  260. sglang/srt/mem_cache/storage/nixl/nixl_utils.py +1 -1
  261. sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +17 -2
  262. sglang/srt/mem_cache/swa_radix_cache.py +92 -26
  263. sglang/srt/metrics/collector.py +31 -0
  264. sglang/srt/metrics/func_timer.py +1 -1
  265. sglang/srt/model_executor/cuda_graph_runner.py +43 -5
  266. sglang/srt/model_executor/forward_batch_info.py +71 -25
  267. sglang/srt/model_executor/model_runner.py +362 -270
  268. sglang/srt/model_executor/npu_graph_runner.py +2 -3
  269. sglang/srt/model_executor/piecewise_cuda_graph_runner.py +549 -0
  270. sglang/srt/model_loader/__init__.py +1 -1
  271. sglang/srt/model_loader/loader.py +424 -27
  272. sglang/srt/model_loader/utils.py +0 -1
  273. sglang/srt/model_loader/weight_utils.py +47 -28
  274. sglang/srt/models/apertus.py +2 -3
  275. sglang/srt/models/arcee.py +2 -2
  276. sglang/srt/models/bailing_moe.py +13 -52
  277. sglang/srt/models/bailing_moe_nextn.py +3 -4
  278. sglang/srt/models/bert.py +1 -1
  279. sglang/srt/models/deepseek_nextn.py +19 -3
  280. sglang/srt/models/deepseek_ocr.py +1516 -0
  281. sglang/srt/models/deepseek_v2.py +418 -140
  282. sglang/srt/models/dots_ocr.py +0 -2
  283. sglang/srt/models/dots_vlm.py +0 -1
  284. sglang/srt/models/dots_vlm_vit.py +1 -1
  285. sglang/srt/models/falcon_h1.py +13 -19
  286. sglang/srt/models/gemma3_mm.py +16 -0
  287. sglang/srt/models/gemma3n_mm.py +1 -2
  288. sglang/srt/models/glm4_moe.py +327 -382
  289. sglang/srt/models/glm4_moe_nextn.py +6 -16
  290. sglang/srt/models/glm4v.py +2 -1
  291. sglang/srt/models/glm4v_moe.py +32 -199
  292. sglang/srt/models/gpt_oss.py +5 -5
  293. sglang/srt/models/grok.py +10 -23
  294. sglang/srt/models/hunyuan.py +2 -7
  295. sglang/srt/models/interns1.py +0 -1
  296. sglang/srt/models/kimi_vl.py +1 -7
  297. sglang/srt/models/kimi_vl_moonvit.py +3 -1
  298. sglang/srt/models/llama.py +2 -2
  299. sglang/srt/models/llama_eagle3.py +1 -1
  300. sglang/srt/models/longcat_flash.py +5 -22
  301. sglang/srt/models/longcat_flash_nextn.py +3 -14
  302. sglang/srt/models/mimo.py +2 -13
  303. sglang/srt/models/mimo_mtp.py +1 -2
  304. sglang/srt/models/minicpmo.py +7 -5
  305. sglang/srt/models/minimax_m2.py +922 -0
  306. sglang/srt/models/mixtral.py +1 -4
  307. sglang/srt/models/mllama.py +1 -1
  308. sglang/srt/models/mllama4.py +13 -3
  309. sglang/srt/models/nemotron_h.py +511 -0
  310. sglang/srt/models/nvila.py +355 -0
  311. sglang/srt/models/nvila_lite.py +184 -0
  312. sglang/srt/models/olmo2.py +31 -4
  313. sglang/srt/models/opt.py +5 -5
  314. sglang/srt/models/phi.py +1 -1
  315. sglang/srt/models/phi4mm.py +1 -1
  316. sglang/srt/models/phimoe.py +0 -1
  317. sglang/srt/models/pixtral.py +0 -3
  318. sglang/srt/models/points_v15_chat.py +186 -0
  319. sglang/srt/models/qwen.py +0 -1
  320. sglang/srt/models/qwen2.py +22 -1
  321. sglang/srt/models/qwen2_5_vl.py +3 -3
  322. sglang/srt/models/qwen2_audio.py +2 -15
  323. sglang/srt/models/qwen2_moe.py +15 -12
  324. sglang/srt/models/qwen2_vl.py +5 -2
  325. sglang/srt/models/qwen3.py +34 -4
  326. sglang/srt/models/qwen3_moe.py +19 -37
  327. sglang/srt/models/qwen3_next.py +7 -12
  328. sglang/srt/models/qwen3_next_mtp.py +3 -4
  329. sglang/srt/models/qwen3_omni_moe.py +661 -0
  330. sglang/srt/models/qwen3_vl.py +37 -33
  331. sglang/srt/models/qwen3_vl_moe.py +57 -185
  332. sglang/srt/models/roberta.py +55 -3
  333. sglang/srt/models/sarashina2_vision.py +0 -1
  334. sglang/srt/models/step3_vl.py +3 -5
  335. sglang/srt/models/utils.py +11 -1
  336. sglang/srt/multimodal/processors/base_processor.py +7 -2
  337. sglang/srt/multimodal/processors/deepseek_ocr.py +37 -0
  338. sglang/srt/multimodal/processors/deepseek_vl_v2.py +0 -3
  339. sglang/srt/multimodal/processors/dots_vlm.py +0 -1
  340. sglang/srt/multimodal/processors/glm4v.py +2 -6
  341. sglang/srt/multimodal/processors/internvl.py +0 -2
  342. sglang/srt/multimodal/processors/janus_pro.py +0 -1
  343. sglang/srt/multimodal/processors/mllama4.py +0 -8
  344. sglang/srt/multimodal/processors/{vila.py → nvila.py} +32 -24
  345. sglang/srt/multimodal/processors/phi4mm.py +0 -1
  346. sglang/srt/multimodal/processors/points_v15_chat.py +52 -0
  347. sglang/srt/multimodal/processors/qwen_vl.py +75 -16
  348. sglang/srt/multimodal/processors/step3_vl.py +1 -1
  349. sglang/srt/parser/conversation.py +41 -0
  350. sglang/srt/parser/reasoning_parser.py +28 -2
  351. sglang/srt/sampling/custom_logit_processor.py +77 -2
  352. sglang/srt/sampling/sampling_batch_info.py +17 -22
  353. sglang/srt/sampling/sampling_params.py +70 -2
  354. sglang/srt/server_args.py +846 -163
  355. sglang/srt/server_args_config_parser.py +1 -1
  356. sglang/srt/single_batch_overlap.py +36 -31
  357. sglang/srt/speculative/base_spec_worker.py +34 -0
  358. sglang/srt/speculative/draft_utils.py +226 -0
  359. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +24 -7
  360. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +23 -2
  361. sglang/srt/speculative/eagle_info.py +57 -18
  362. sglang/srt/speculative/eagle_info_v2.py +458 -0
  363. sglang/srt/speculative/eagle_utils.py +138 -0
  364. sglang/srt/speculative/eagle_worker.py +83 -280
  365. sglang/srt/speculative/eagle_worker_v2.py +702 -0
  366. sglang/srt/speculative/{ngram_utils.py → ngram_info.py} +14 -9
  367. sglang/srt/speculative/ngram_worker.py +12 -11
  368. sglang/srt/speculative/spec_info.py +2 -0
  369. sglang/srt/speculative/spec_utils.py +38 -3
  370. sglang/srt/speculative/standalone_worker.py +4 -14
  371. sglang/srt/tokenizer/tiktoken_tokenizer.py +2 -2
  372. sglang/srt/two_batch_overlap.py +28 -14
  373. sglang/srt/utils/__init__.py +1 -1
  374. sglang/srt/{bench_utils.py → utils/bench_utils.py} +4 -2
  375. sglang/srt/utils/common.py +272 -82
  376. sglang/srt/utils/hf_transformers_utils.py +44 -17
  377. sglang/srt/{host_shared_memory.py → utils/host_shared_memory.py} +0 -1
  378. sglang/srt/{offloader.py → utils/offloader.py} +4 -4
  379. sglang/srt/utils/profile_merger.py +199 -0
  380. sglang/test/attention/test_flashattn_backend.py +1 -1
  381. sglang/test/attention/test_flashattn_mla_backend.py +0 -1
  382. sglang/test/attention/test_prefix_chunk_info.py +0 -2
  383. sglang/test/attention/test_trtllm_mla_backend.py +221 -53
  384. sglang/test/few_shot_gsm8k_engine.py +2 -4
  385. sglang/test/kit_matched_stop.py +157 -0
  386. sglang/test/longbench_v2/__init__.py +1 -0
  387. sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
  388. sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
  389. sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
  390. sglang/test/run_eval.py +41 -0
  391. sglang/test/runners.py +2 -0
  392. sglang/test/send_one.py +42 -7
  393. sglang/test/simple_eval_common.py +3 -0
  394. sglang/test/simple_eval_gpqa.py +0 -1
  395. sglang/test/simple_eval_humaneval.py +0 -3
  396. sglang/test/simple_eval_longbench_v2.py +344 -0
  397. sglang/test/test_block_fp8.py +1 -2
  398. sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -1
  399. sglang/test/test_cutlass_moe.py +1 -2
  400. sglang/test/test_cutlass_w4a8_moe.py +10 -20
  401. sglang/test/test_deterministic.py +463 -107
  402. sglang/test/test_deterministic_utils.py +74 -0
  403. sglang/test/test_disaggregation_utils.py +81 -0
  404. sglang/test/test_marlin_moe.py +0 -1
  405. sglang/test/test_utils.py +85 -20
  406. sglang/version.py +1 -1
  407. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/METADATA +48 -35
  408. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/RECORD +414 -350
  409. sglang/srt/layers/attention/mamba/mamba_utils.py +0 -81
  410. sglang/srt/managers/tp_worker_overlap_thread.py +0 -311
  411. sglang/srt/models/vila.py +0 -306
  412. sglang/srt/speculative/build_eagle_tree.py +0 -427
  413. sglang/test/test_block_fp8_ep.py +0 -358
  414. /sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/__init__.py +0 -0
  415. /sglang/srt/{aio_rwlock.py → utils/aio_rwlock.py} +0 -0
  416. /sglang/srt/{torch_memory_saver_adapter.py → utils/torch_memory_saver_adapter.py} +0 -0
  417. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/WHEEL +0 -0
  418. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/licenses/LICENSE +0 -0
  419. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/top_level.txt +0 -0
sglang/srt/configs/nemotron_h.py
@@ -0,0 +1,286 @@
+# Copyright 2025 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/transformers_utils/configs/nemotron_h.py
+
+"""NemotronH model configuration"""
+
+import regex as re
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+
+from sglang.srt.configs.mamba_utils import Mamba2CacheParams, Mamba2StateShape
+from sglang.srt.layers.dp_attention import get_attention_tp_size
+
+logger = logging.get_logger(__name__)
+
+MAMBA = "M"
+ATTENTION = "*"
+MLP = "-"
+
+
+class NemotronHConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a
+    [`NemotronHModel`]. It is used to instantiate a NemotronH model according
+    to the specified arguments, defining the model architecture. Instantiating
+    a configuration with the defaults will yield a similar configuration to
+    that of the NemotronH-v0.1 model.
+    Args:
+        vocab_size (`int`, *optional*, defaults to 131072):
+            Vocabulary size of the NemotronH model. Defines the number of
+            different tokens that can be represented by the `inputs_ids`
+            passed when calling [`NemotronHModel`]
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether the model's input and output word embeddings should be
+            tied. Note that this is only relevant if the model has an output
+            word embedding layer.
+        hidden_size (`int`, *optional*, defaults to 4096):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 21504):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 52):
+            Number of hidden layers in the Transformer encoder.
+        hybrid_override_pattern (`str`, *optional*, defaults to
+            `"M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M-"`):
+            The pattern of the hybrid model. The pattern is a string of
+            characters where each character represents
+            M: Mamba2, *: Attention, -: MLP
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            Number of attention heads for each attention layer in the
+            Transformer encoder.
+        attention_head_dim (`int`, *optional*, defaults to 128):
+            Dimension of each attention head.
+        num_key_value_heads (`int`, *optional*, defaults to 8):
+            This is the number of key_value heads that should be used to
+            implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use
+            Multi Head Attention (MHA), if `num_key_value_heads=1` the model
+            will use Multi Query Attention (MQA) otherwise GQA is used.
+        mlp_hidden_act (`str`, *optional*, defaults to "relu2"):
+            The non-linear activation function in the MLP layers.
+        attention_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use bias in attention layers.
+        mlp_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use bias in MLP layers.
+        use_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use bias in the model.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for
+            initializing all weight matrices.
+        layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
+            The epsilon used by the layer normalization layers.
+        residual_in_fp32 (`bool`, *optional*, defaults to `False`):
+            Whether or not residuals should be in `float32`. If set to `False`
+            residuals will keep the same `dtype` as the rest of the model.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values
+            attentions (not used by all models). Only relevant if
+            `config.is_decoder=True`.
+        num_logits_to_keep (`int` or `None`, *optional*, defaults to 1):
+            Number of prompt logits to calculate during generation. If `None`,
+            all logits will be calculated. If an integer value, only last
+            `num_logits_to_keep` logits will be calculated.
+        pad_token_id (`int`, *optional*, defaults to 0):
+            The id of the padding token.
+        bos_token_id (`int`, *optional*, defaults to 1):
+            The id of the "beginning-of-sequence" token.
+        eos_token_id (`int`, *optional*, defaults to 2):
+            The id of the "end-of-sequence" token.
+        sliding_window (`int`, *optional*, defaults to None):
+            Sliding window attention window size.
+        max_position_embeddings (`int`, *optional*, defaults to 4096):
+            The maximum sequence length that this model might ever be used
+            with.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        hidden_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the hidden states.
+        use_mamba_kernels (`bool`, *optional*, defaults to `True`):
+            Flag indicating whether or not to use the fast mamba kernels.
+            These are available only if `mamba-ssm` and `causal-conv1d`
+            are installed, and the mamba modules are running on a CUDA device.
+        ssm_state_size (`int`, *optional*, defaults to 128):
+            The dimension of the mamba state space latents.
+        mamba_num_heads (`int`, *optional*, defaults to 128):
+            Number of heads in Mamba layers.
+        mamba_n_groups (`int`, *optional*, defaults to 8):
+            Number of groups in Mamba layers.
+        mamba_head_dim (`int`, *optional*, defaults to 64):
+            Dimension of each Mamba head.
+        mamba_d_conv (`int`, *optional*, defaults to 4):
+            The size of the mamba convolution kernel.
+        mamba_expand (`int`, *optional*, defaults to 2):
+            Expanding factor used to determine the mamba intermediate size.
+        mamba_hidden_act (`str`, *optional*, defaults to "silu"):
+            The non-linear activation function in the Mamba layers.
+        mamba_dt_min (`float`, *optional*, defaults to 0.001):
+            Minimum value for the time step in Mamba.
+        mamba_dt_max (`float`, *optional*, defaults to 0.1):
+            Maximum value for the time step in Mamba.
+        mamba_dt_limit (`tuple`, *optional*, defaults to (0.0, float("inf"))):
+            Limits for the time step in Mamba.
+        mamba_dt_init_floor (`float`, *optional*, defaults to 1e-4):
+            Floor value for time step initialization in Mamba.
+        mamba_conv_bias (`bool`, *optional*, defaults to `True`):
+            Whether to use bias in the convolution layer of the mamba mixer
+            block.
+        mamba_proj_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use bias in the input and output projections of the
+            mamba mixer block.
+        mamba_chunk_size (`int`, *optional*, defaults to 256):
+            Size of chunks for Mamba processing.
+        rescale_prenorm_residual (`bool`, *optional*, defaults to `True`):
+            Whether to rescale the pre-normalization residual connections.
+    """
+
+    model_type = "nemotron_h"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size=131072,
+        tie_word_embeddings=False,
+        hidden_size=4096,
+        intermediate_size=21504,
+        num_hidden_layers=52,
+        hybrid_override_pattern="M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M-",
+        num_attention_heads=32,
+        head_dim=128,
+        num_key_value_heads=8,  # nemo: num_query_groups
+        mlp_hidden_act="relu2",
+        attention_bias=False,
+        mlp_bias=False,
+        use_bias=False,
+        initializer_range=0.02,  # nemo: init_method_std
+        layer_norm_epsilon=1e-5,  # nemo: layernorm_epsilon
+        residual_in_fp32=False,  # Megatron Core default value
+        use_cache=True,
+        num_logits_to_keep=1,
+        pad_token_id=0,
+        bos_token_id=1,
+        eos_token_id=2,
+        sliding_window=None,
+        max_position_embeddings=4096,
+        attention_dropout=0.0,
+        hidden_dropout=0.0,  # * ADDED
+        use_mamba_kernels=True,
+        ssm_state_size=128,  # mamba_state_size
+        mamba_num_heads=128,
+        mamba_n_groups=8,  # nemo: mamba_ssm_ngroups = num_heads
+        mamba_head_dim=64,
+        mamba_d_conv=4,
+        mamba_expand=2,
+        mamba_hidden_act="silu",
+        mamba_dt_min=0.001,
+        mamba_dt_max=0.1,
+        mamba_dt_limit=(0.0, float("inf")),
+        mamba_dt_init_floor=1e-4,
+        mamba_conv_bias=True,
+        mamba_proj_bias=False,
+        mamba_chunk_size=256,
+        rescale_prenorm_residual=True,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.tie_word_embeddings = tie_word_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.hybrid_override_pattern = hybrid_override_pattern
+        self.num_attention_heads = num_attention_heads
+        self.head_dim = head_dim
+        self.sliding_window = sliding_window
+        self.max_position_embeddings = max_position_embeddings
+        self.attention_dropout = attention_dropout
+        self.hidden_dropout = hidden_dropout
+
+        # Validate hybrid_override_pattern
+        # M: Mamba2, *: Attention, -: MLP
+        assert len(self.hybrid_override_pattern) == self.num_hidden_layers, (
+            "hybrid_override_pattern must have same length as " "num_hidden_layers"
+        )
+        assert re.match(r"^[*-M]+$", self.hybrid_override_pattern), (
+            "hybrid_override_pattern must only contain characters " "'M', '*', or '-'"
+        )
+
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.mlp_hidden_act = mlp_hidden_act
+        self.attention_bias = attention_bias
+        self.mlp_bias = mlp_bias
+        self.use_bias = use_bias
+        self.initializer_range = initializer_range
+        self.layer_norm_epsilon = layer_norm_epsilon
+        self.residual_in_fp32 = residual_in_fp32
+
+        self.use_cache = use_cache
+        self.num_logits_to_keep = num_logits_to_keep
+
+        self.use_mamba_kernels = use_mamba_kernels
+        self.mamba_n_groups = mamba_n_groups
+        self.mamba_head_dim = mamba_head_dim
+        self.ssm_state_size = ssm_state_size
+        self.mamba_num_heads = mamba_num_heads
+        self.conv_kernel = mamba_d_conv
+        self.expand = mamba_expand
+        self.mamba_hidden_act = mamba_hidden_act
+        self.time_step_min = mamba_dt_min
+        self.time_step_max = mamba_dt_max
+        self.time_step_limit = mamba_dt_limit
+        self.time_step_floor = mamba_dt_init_floor
+        self.use_conv_bias = mamba_conv_bias
+        self.mamba_proj_bias = mamba_proj_bias
+        self.mamba_chunk_size = mamba_chunk_size
+        self.rescale_prenorm_residual = rescale_prenorm_residual
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+    @property
+    def mamba_layer_ids(self):
+        return [
+            i
+            for i in range(self.num_hidden_layers)
+            if self.hybrid_override_pattern[i] == MAMBA
+        ]
+
+    @property
+    def full_attention_layer_ids(self):
+        return [
+            i
+            for i in range(self.num_hidden_layers)
+            if self.hybrid_override_pattern[i] == ATTENTION
+        ]
+
+    @property
+    def mamba2_cache_params(self) -> Mamba2CacheParams:
+        shape = Mamba2StateShape.create(
+            tp_world_size=get_attention_tp_size(),
+            intermediate_size=self.mamba_num_heads * self.mamba_head_dim,
+            n_groups=self.n_groups,
+            num_heads=self.mamba_num_heads,
+            head_dim=self.mamba_head_dim,
+            state_size=self.ssm_state_size,
+            conv_kernel=self.conv_kernel,
+        )
+
+        return Mamba2CacheParams(shape=shape, layers=self.mamba_layer_ids)
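For orientation, a minimal sketch (not part of the diff) of how the new config's hybrid_override_pattern resolves to layer ids; the short 8-layer pattern below is hypothetical and chosen only for illustration:

from sglang.srt.configs.nemotron_h import NemotronHConfig

# Hypothetical pattern: "M" = Mamba2 block, "*" = attention, "-" = MLP.
cfg = NemotronHConfig(num_hidden_layers=8, hybrid_override_pattern="M-M*M-M*")

print(cfg.mamba_layer_ids)           # [0, 2, 4, 6]
print(cfg.full_attention_layer_ids)  # [3, 7]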
sglang/srt/configs/olmo3.py
@@ -0,0 +1,105 @@
+# coding=utf-8
+# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Olmo3 model configuration"""
+
+import enum
+
+from transformers.configuration_utils import PretrainedConfig
+from transformers.modeling_rope_utils import rope_config_validation
+from transformers.utils import logging
+
+logger = logging.get_logger(__name__)
+
+
+class Olmo3LayerType(enum.Enum):
+    full_attention = "full_attention"
+    sliding_attention = "sliding_attention"
+
+
+class Olmo3Config(PretrainedConfig):
+
+    model_type = "olmo3"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size=50304,
+        hidden_size=4096,
+        intermediate_size=11008,
+        num_hidden_layers=32,
+        num_attention_heads=32,
+        num_key_value_heads=None,
+        hidden_act="silu",
+        max_position_embeddings=2048,
+        initializer_range=0.02,
+        use_cache=True,
+        pad_token_id=1,
+        bos_token_id=None,
+        eos_token_id=50279,
+        tie_word_embeddings=False,
+        rope_theta=10000.0,
+        rope_scaling=None,
+        attention_bias=False,
+        attention_dropout=0.0,
+        rms_norm_eps=1e-5,
+        sliding_window=4096,
+        layer_types=None,
+        **kwargs,
+    ):
+        # This model uses Olmo3ForCausalLM in transformers but Olmo2ForCausalLM
+        # in sglang.
+        if "architectures" not in kwargs:
+            kwargs["architectures"] = ["Olmo2ForCausalLM"]
+        elif "Olmo3ForCausalLM" in kwargs["architectures"]:
+            kwargs["architectures"].remove("Olmo3ForCausalLM")
+            kwargs["architectures"].append("Olmo2ForCausalLM")
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        rope_config_validation(self)
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+
+        self.rms_norm_eps = rms_norm_eps
+
+        self.sliding_window = sliding_window
+        self.layer_types = layer_types
+        if self.layer_types is None:
+            self.layer_types = [
+                "sliding_attention" if (i + 1) % 4 != 0 else "full_attention"
+                for i in range(self.num_hidden_layers)
+            ]
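A small usage sketch (assumed, not part of the diff) of the default layer layout this config produces when layer_types is not given: every fourth layer uses full attention, the rest use sliding-window attention, and the architectures entry is remapped to Olmo2ForCausalLM for sglang:

from sglang.srt.configs.olmo3 import Olmo3Config

# Hypothetical 8-layer model, all other arguments left at their defaults.
cfg = Olmo3Config(num_hidden_layers=8)

print(cfg.architectures)  # ['Olmo2ForCausalLM']
print(cfg.layer_types)
# ['sliding_attention', 'sliding_attention', 'sliding_attention', 'full_attention',
#  'sliding_attention', 'sliding_attention', 'sliding_attention', 'full_attention']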
sglang/srt/configs/points_v15_chat.py
@@ -0,0 +1,29 @@
+from typing import Optional, Union
+
+from transformers import PretrainedConfig, Qwen2Config
+from transformers.models.qwen2_vl.configuration_qwen2_vl import Qwen2VLVisionConfig
+
+
+class POINTSV15ChatConfig(PretrainedConfig):
+    model_type = "pointsv1.5_chat"
+
+    def __init__(
+        self,
+        vision_config: Optional[Union[dict, Qwen2VLVisionConfig]] = None,
+        llm_config: Optional[Union[dict, Qwen2Config]] = None,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        if vision_config is None:
+            vision_config = Qwen2VLVisionConfig()
+        elif isinstance(vision_config, dict):
+            vision_config = Qwen2VLVisionConfig(**vision_config)
+        self.vision_config = vision_config
+
+        if llm_config is None:
+            llm_config = Qwen2Config()
+        elif isinstance(llm_config, dict):
+            llm_config = Qwen2Config(**llm_config)
+
+        self.llm_config = llm_config
+        self.hidden_size = self.llm_config.hidden_size
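As a rough usage sketch (the nested dict values are hypothetical, standing in for a checkpoint's config.json), the new config wraps its sub-configs and mirrors the LLM hidden size:

from sglang.srt.configs.points_v15_chat import POINTSV15ChatConfig

cfg = POINTSV15ChatConfig(
    vision_config={"depth": 32, "embed_dim": 1280},
    llm_config={"hidden_size": 2048, "num_hidden_layers": 24},
)

print(type(cfg.vision_config).__name__)  # Qwen2VLVisionConfig
print(cfg.hidden_size)                   # 2048, taken from the LLM sub-config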
sglang/srt/configs/qwen3_next.py
@@ -15,26 +15,20 @@
 """Qwen3Hybrid model configuration"""
 
 import enum
-import os
 
-import numpy as np
-import torch
 from transformers.configuration_utils import PretrainedConfig
 from transformers.modeling_rope_utils import rope_config_validation
 from transformers.utils import logging
 
-from sglang.srt.distributed.utils import divide
+from sglang.srt.configs.mamba_utils import Mamba2CacheParams, Mamba2StateShape
 from sglang.srt.layers.dp_attention import get_attention_tp_size
 
 logger = logging.get_logger(__name__)
 
 
-# NOTE: HybridLayerType
 class HybridLayerType(enum.Enum):
     full_attention = "attention"
-    swa_attention = "swa_attention"
     linear_attention = "linear_attention"
-    mamba2 = "mamba"
 
 
 class Qwen3NextConfig(PretrainedConfig):
@@ -282,45 +276,15 @@ class Qwen3NextConfig(PretrainedConfig):
         ]
 
     @property
-    def hybrid_gdn_params(self):
-        world_size = get_attention_tp_size()
-        conv_dim = (
-            self.linear_key_head_dim * self.linear_num_key_heads * 2
-            + self.linear_value_head_dim * self.linear_num_value_heads
+    def mamba2_cache_params(self) -> Mamba2CacheParams:
+        shape = Mamba2StateShape.create(
+            tp_world_size=get_attention_tp_size(),
+            intermediate_size=self.linear_value_head_dim * self.linear_num_value_heads,
+            n_groups=self.linear_num_key_heads,
+            num_heads=self.linear_num_value_heads,
+            head_dim=self.linear_value_head_dim,
+            state_size=self.linear_key_head_dim,
+            conv_kernel=self.linear_conv_kernel_dim,
         )
-        conv_state_shape = (
-            divide(conv_dim, world_size),
-            self.linear_conv_kernel_dim - 1,
-        )
-
-        temporal_state_shape = (
-            divide(self.linear_num_value_heads, world_size),
-            self.linear_key_head_dim,
-            self.linear_value_head_dim,
-        )
-        conv_dtype = torch.bfloat16
-        dtype_map = {
-            "float32": torch.float32,
-            "bfloat16": torch.bfloat16,
-        }
-        ssm_dtype = dtype_map[os.environ["SGLANG_MAMBA_SSM_DTYPE"]]
-        mamba_layers = self.linear_layer_ids
-        return (
-            conv_state_shape,
-            temporal_state_shape,
-            conv_dtype,
-            ssm_dtype,
-            mamba_layers,
-        )
-
-    @property
-    def mamba_cache_per_req(self):
-        conv_state_shape, temporal_state_shape, conv_dtype, ssm_dtype, mamba_layers = (
-            self.hybrid_gdn_params
-        )
-        mamba_layers_len = len(mamba_layers)
 
-        return (
-            int(np.prod(conv_state_shape)) * conv_dtype.itemsize
-            + int(np.prod(temporal_state_shape)) * ssm_dtype.itemsize
-        ) * mamba_layers_len
+        return Mamba2CacheParams(shape=shape, layers=self.linear_layer_ids)
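The removed hybrid_gdn_params / mamba_cache_per_req logic now goes through the shared Mamba2StateShape / Mamba2CacheParams helpers from sglang.srt.configs.mamba_utils, the same helpers the new NemotronH config uses above. For reference, a standalone sketch of what the deleted per-request cache-size computation did, using only the attributes the old code read; the function name is hypothetical and integer division stands in for the old divide() helper:

import os

import numpy as np
import torch


def legacy_mamba_cache_per_req(cfg, world_size: int) -> int:
    # Conv state: 2x key-head dims plus value-head dims, sharded over attention TP ranks.
    conv_dim = (
        cfg.linear_key_head_dim * cfg.linear_num_key_heads * 2
        + cfg.linear_value_head_dim * cfg.linear_num_value_heads
    )
    conv_state_shape = (conv_dim // world_size, cfg.linear_conv_kernel_dim - 1)
    # Temporal (SSM) state: one (key_dim x value_dim) slab per local value head.
    temporal_state_shape = (
        cfg.linear_num_value_heads // world_size,
        cfg.linear_key_head_dim,
        cfg.linear_value_head_dim,
    )
    conv_dtype = torch.bfloat16
    ssm_dtype = {"float32": torch.float32, "bfloat16": torch.bfloat16}[
        os.environ.get("SGLANG_MAMBA_SSM_DTYPE", "bfloat16")
    ]
    per_layer = (
        int(np.prod(conv_state_shape)) * conv_dtype.itemsize
        + int(np.prod(temporal_state_shape)) * ssm_dtype.itemsize
    )
    return per_layer * len(cfg.linear_layer_ids)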