sglang 0.5.3rc2__py3-none-any.whl → 0.5.4.post1__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (419)
  1. sglang/bench_one_batch.py +47 -28
  2. sglang/bench_one_batch_server.py +41 -25
  3. sglang/bench_serving.py +378 -160
  4. sglang/check_env.py +1 -1
  5. sglang/compile_deep_gemm.py +6 -2
  6. sglang/global_config.py +1 -25
  7. sglang/lang/api.py +6 -0
  8. sglang/lang/interpreter.py +1 -0
  9. sglang/lang/ir.py +13 -0
  10. sglang/launch_server.py +10 -15
  11. sglang/profiler.py +18 -1
  12. sglang/srt/_custom_ops.py +1 -1
  13. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +105 -10
  14. sglang/srt/checkpoint_engine/checkpoint_engine_worker.py +142 -0
  15. sglang/srt/compilation/backend.py +437 -0
  16. sglang/srt/compilation/compilation_config.py +20 -0
  17. sglang/srt/compilation/compilation_counter.py +47 -0
  18. sglang/srt/compilation/compile.py +210 -0
  19. sglang/srt/compilation/compiler_interface.py +503 -0
  20. sglang/srt/compilation/cuda_piecewise_backend.py +228 -0
  21. sglang/srt/compilation/fix_functionalization.py +134 -0
  22. sglang/srt/compilation/fx_utils.py +83 -0
  23. sglang/srt/compilation/inductor_pass.py +140 -0
  24. sglang/srt/compilation/pass_manager.py +66 -0
  25. sglang/srt/compilation/piecewise_context_manager.py +40 -0
  26. sglang/srt/compilation/weak_ref_tensor_jit.py +16 -0
  27. sglang/srt/configs/__init__.py +4 -0
  28. sglang/srt/configs/deepseek_ocr.py +262 -0
  29. sglang/srt/configs/deepseekvl2.py +194 -96
  30. sglang/srt/configs/dots_vlm.py +2 -7
  31. sglang/srt/configs/falcon_h1.py +13 -64
  32. sglang/srt/configs/load_config.py +25 -2
  33. sglang/srt/configs/mamba_utils.py +117 -0
  34. sglang/srt/configs/model_config.py +136 -25
  35. sglang/srt/configs/modelopt_config.py +30 -0
  36. sglang/srt/configs/nemotron_h.py +286 -0
  37. sglang/srt/configs/olmo3.py +105 -0
  38. sglang/srt/configs/points_v15_chat.py +29 -0
  39. sglang/srt/configs/qwen3_next.py +11 -47
  40. sglang/srt/configs/qwen3_omni.py +613 -0
  41. sglang/srt/configs/qwen3_vl.py +0 -10
  42. sglang/srt/connector/remote_instance.py +1 -1
  43. sglang/srt/constrained/base_grammar_backend.py +5 -1
  44. sglang/srt/constrained/llguidance_backend.py +5 -0
  45. sglang/srt/constrained/outlines_backend.py +1 -1
  46. sglang/srt/constrained/reasoner_grammar_backend.py +9 -6
  47. sglang/srt/constrained/utils.py +12 -0
  48. sglang/srt/constrained/xgrammar_backend.py +20 -11
  49. sglang/srt/disaggregation/ascend/transfer_engine.py +1 -1
  50. sglang/srt/disaggregation/base/conn.py +17 -4
  51. sglang/srt/disaggregation/common/conn.py +4 -2
  52. sglang/srt/disaggregation/decode.py +123 -31
  53. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +1 -1
  54. sglang/srt/disaggregation/fake/conn.py +11 -3
  55. sglang/srt/disaggregation/mooncake/conn.py +157 -19
  56. sglang/srt/disaggregation/nixl/conn.py +69 -24
  57. sglang/srt/disaggregation/prefill.py +96 -270
  58. sglang/srt/distributed/device_communicators/all_reduce_utils.py +4 -4
  59. sglang/srt/distributed/device_communicators/custom_all_reduce.py +6 -6
  60. sglang/srt/distributed/device_communicators/pymscclpp.py +2 -2
  61. sglang/srt/distributed/device_communicators/pynccl.py +24 -12
  62. sglang/srt/distributed/device_communicators/pynccl_allocator.py +2 -2
  63. sglang/srt/distributed/device_communicators/symm_mem.py +1 -1
  64. sglang/srt/distributed/naive_distributed.py +5 -4
  65. sglang/srt/distributed/parallel_state.py +63 -19
  66. sglang/srt/elastic_ep/elastic_ep.py +74 -0
  67. sglang/srt/entrypoints/context.py +3 -2
  68. sglang/srt/entrypoints/engine.py +83 -80
  69. sglang/srt/entrypoints/grpc_server.py +430 -234
  70. sglang/srt/entrypoints/harmony_utils.py +2 -2
  71. sglang/srt/entrypoints/http_server.py +195 -102
  72. sglang/srt/entrypoints/http_server_engine.py +1 -7
  73. sglang/srt/entrypoints/openai/protocol.py +225 -37
  74. sglang/srt/entrypoints/openai/serving_base.py +49 -2
  75. sglang/srt/entrypoints/openai/serving_chat.py +29 -74
  76. sglang/srt/entrypoints/openai/serving_classify.py +204 -0
  77. sglang/srt/entrypoints/openai/serving_completions.py +15 -1
  78. sglang/srt/entrypoints/openai/serving_responses.py +5 -2
  79. sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
  80. sglang/srt/environ.py +58 -6
  81. sglang/srt/eplb/eplb_algorithms/__init__.py +18 -1
  82. sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -2
  83. sglang/srt/eplb/eplb_algorithms/elasticity_aware.py +87 -0
  84. sglang/srt/eplb/expert_distribution.py +33 -4
  85. sglang/srt/eplb/expert_location_dispatch.py +2 -2
  86. sglang/srt/eplb/expert_location_updater.py +2 -2
  87. sglang/srt/function_call/base_format_detector.py +17 -18
  88. sglang/srt/function_call/function_call_parser.py +20 -14
  89. sglang/srt/function_call/glm4_moe_detector.py +1 -5
  90. sglang/srt/function_call/gpt_oss_detector.py +1 -1
  91. sglang/srt/function_call/json_array_parser.py +0 -2
  92. sglang/srt/function_call/minimax_m2.py +367 -0
  93. sglang/srt/function_call/utils.py +2 -2
  94. sglang/srt/grpc/compile_proto.py +3 -3
  95. sglang/srt/{entrypoints → grpc}/grpc_request_manager.py +112 -52
  96. sglang/srt/grpc/health_servicer.py +189 -0
  97. sglang/srt/grpc/scheduler_launcher.py +181 -0
  98. sglang/srt/grpc/sglang_scheduler_pb2.py +78 -70
  99. sglang/srt/grpc/sglang_scheduler_pb2.pyi +66 -10
  100. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +89 -1
  101. sglang/srt/layers/activation.py +10 -1
  102. sglang/srt/layers/attention/aiter_backend.py +3 -3
  103. sglang/srt/layers/attention/ascend_backend.py +17 -1
  104. sglang/srt/layers/attention/attention_registry.py +43 -23
  105. sglang/srt/layers/attention/base_attn_backend.py +20 -1
  106. sglang/srt/layers/attention/double_sparsity_backend.py +2 -2
  107. sglang/srt/layers/attention/fla/chunk.py +0 -1
  108. sglang/srt/layers/attention/fla/chunk_o.py +1 -1
  109. sglang/srt/layers/attention/fla/index.py +0 -2
  110. sglang/srt/layers/attention/fla/layernorm_gated.py +50 -32
  111. sglang/srt/layers/attention/fla/utils.py +0 -3
  112. sglang/srt/layers/attention/fla/wy_fast.py +0 -2
  113. sglang/srt/layers/attention/flashattention_backend.py +24 -10
  114. sglang/srt/layers/attention/flashinfer_backend.py +258 -22
  115. sglang/srt/layers/attention/flashinfer_mla_backend.py +38 -28
  116. sglang/srt/layers/attention/flashmla_backend.py +2 -2
  117. sglang/srt/layers/attention/hybrid_attn_backend.py +1 -1
  118. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +165 -62
  119. sglang/srt/layers/attention/intel_amx_backend.py +1 -1
  120. sglang/srt/layers/attention/mamba/causal_conv1d.py +1 -1
  121. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +9 -5
  122. sglang/srt/layers/attention/mamba/mamba.py +189 -241
  123. sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
  124. sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
  125. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +0 -50
  126. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +0 -60
  127. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +0 -111
  128. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +0 -1
  129. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +0 -11
  130. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +1 -1
  131. sglang/srt/layers/attention/nsa/nsa_indexer.py +40 -83
  132. sglang/srt/layers/attention/nsa/triton_kernel.py +136 -0
  133. sglang/srt/layers/attention/nsa/utils.py +0 -1
  134. sglang/srt/layers/attention/nsa_backend.py +404 -90
  135. sglang/srt/layers/attention/triton_backend.py +208 -34
  136. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +2 -2
  137. sglang/srt/layers/attention/triton_ops/extend_attention.py +539 -44
  138. sglang/srt/layers/attention/trtllm_mha_backend.py +2 -2
  139. sglang/srt/layers/attention/trtllm_mla_backend.py +362 -43
  140. sglang/srt/layers/attention/utils.py +89 -7
  141. sglang/srt/layers/attention/vision.py +3 -3
  142. sglang/srt/layers/attention/xpu_backend.py +1028 -0
  143. sglang/srt/layers/communicator.py +12 -7
  144. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/compile_utils.py +5 -9
  145. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/configurer.py +4 -3
  146. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/entrypoint.py +3 -3
  147. sglang/srt/layers/dp_attention.py +17 -0
  148. sglang/srt/layers/layernorm.py +64 -19
  149. sglang/srt/layers/linear.py +9 -1
  150. sglang/srt/layers/logits_processor.py +152 -17
  151. sglang/srt/layers/modelopt_utils.py +11 -0
  152. sglang/srt/layers/moe/cutlass_moe.py +0 -2
  153. sglang/srt/layers/moe/cutlass_w4a8_moe.py +351 -21
  154. sglang/srt/layers/moe/ep_moe/kernels.py +229 -457
  155. sglang/srt/layers/moe/ep_moe/layer.py +154 -625
  156. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +1 -1
  157. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  158. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json +146 -0
  159. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +11 -3
  160. sglang/srt/layers/moe/fused_moe_triton/layer.py +79 -73
  161. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +25 -46
  162. sglang/srt/layers/moe/moe_runner/deep_gemm.py +569 -0
  163. sglang/srt/layers/moe/moe_runner/runner.py +6 -0
  164. sglang/srt/layers/moe/moe_runner/triton.py +3 -1
  165. sglang/srt/layers/moe/moe_runner/triton_kernels.py +194 -0
  166. sglang/srt/layers/moe/rocm_moe_utils.py +0 -1
  167. sglang/srt/layers/moe/router.py +51 -15
  168. sglang/srt/layers/moe/token_dispatcher/__init__.py +14 -4
  169. sglang/srt/layers/moe/token_dispatcher/base.py +12 -6
  170. sglang/srt/layers/moe/token_dispatcher/deepep.py +127 -110
  171. sglang/srt/layers/moe/token_dispatcher/mooncake.py +386 -0
  172. sglang/srt/layers/moe/token_dispatcher/standard.py +46 -0
  173. sglang/srt/layers/moe/topk.py +7 -6
  174. sglang/srt/layers/moe/utils.py +20 -5
  175. sglang/srt/layers/quantization/__init__.py +5 -58
  176. sglang/srt/layers/quantization/awq.py +183 -9
  177. sglang/srt/layers/quantization/awq_triton.py +29 -0
  178. sglang/srt/layers/quantization/base_config.py +27 -1
  179. sglang/srt/layers/quantization/compressed_tensors/__init__.py +7 -0
  180. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +20 -49
  181. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +421 -70
  182. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +3 -0
  183. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +4 -22
  184. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +339 -0
  185. sglang/srt/layers/quantization/fp8.py +152 -81
  186. sglang/srt/layers/quantization/fp8_kernel.py +55 -10
  187. sglang/srt/layers/quantization/fp8_utils.py +42 -14
  188. sglang/srt/layers/quantization/fpgemm_fp8.py +2 -3
  189. sglang/srt/layers/quantization/gguf.py +566 -0
  190. sglang/srt/layers/quantization/gptq.py +0 -1
  191. sglang/srt/layers/quantization/int8_kernel.py +18 -2
  192. sglang/srt/layers/quantization/marlin_utils.py +12 -0
  193. sglang/srt/layers/quantization/modelopt_quant.py +125 -100
  194. sglang/srt/layers/quantization/mxfp4.py +35 -68
  195. sglang/srt/layers/quantization/petit.py +1 -1
  196. sglang/srt/layers/quantization/quark/quark.py +3 -1
  197. sglang/srt/layers/quantization/quark/quark_moe.py +3 -3
  198. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +0 -7
  199. sglang/srt/layers/quantization/unquant.py +23 -48
  200. sglang/srt/layers/quantization/utils.py +0 -1
  201. sglang/srt/layers/quantization/w4afp8.py +87 -20
  202. sglang/srt/layers/quantization/w8a8_int8.py +30 -24
  203. sglang/srt/layers/radix_attention.py +62 -9
  204. sglang/srt/layers/rotary_embedding.py +686 -17
  205. sglang/srt/layers/sampler.py +47 -16
  206. sglang/srt/layers/sparse_pooler.py +98 -0
  207. sglang/srt/layers/utils.py +0 -1
  208. sglang/srt/layers/vocab_parallel_embedding.py +4 -1
  209. sglang/srt/lora/backend/triton_backend.py +0 -1
  210. sglang/srt/lora/eviction_policy.py +139 -0
  211. sglang/srt/lora/lora_manager.py +24 -9
  212. sglang/srt/lora/lora_registry.py +1 -1
  213. sglang/srt/lora/mem_pool.py +40 -16
  214. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +1 -1
  215. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +4 -2
  216. sglang/srt/managers/cache_controller.py +48 -17
  217. sglang/srt/managers/data_parallel_controller.py +146 -42
  218. sglang/srt/managers/detokenizer_manager.py +40 -13
  219. sglang/srt/managers/io_struct.py +69 -16
  220. sglang/srt/managers/mm_utils.py +20 -18
  221. sglang/srt/managers/multi_tokenizer_mixin.py +83 -82
  222. sglang/srt/managers/overlap_utils.py +96 -19
  223. sglang/srt/managers/schedule_batch.py +241 -511
  224. sglang/srt/managers/schedule_policy.py +15 -2
  225. sglang/srt/managers/scheduler.py +420 -514
  226. sglang/srt/managers/scheduler_metrics_mixin.py +73 -18
  227. sglang/srt/managers/scheduler_output_processor_mixin.py +317 -111
  228. sglang/srt/managers/scheduler_pp_mixin.py +341 -0
  229. sglang/srt/managers/scheduler_profiler_mixin.py +60 -14
  230. sglang/srt/managers/scheduler_runtime_checker_mixin.py +217 -0
  231. sglang/srt/managers/scheduler_update_weights_mixin.py +33 -14
  232. sglang/srt/managers/tokenizer_communicator_mixin.py +71 -55
  233. sglang/srt/managers/tokenizer_manager.py +375 -95
  234. sglang/srt/managers/tp_worker.py +212 -161
  235. sglang/srt/managers/utils.py +78 -2
  236. sglang/srt/mem_cache/allocator.py +7 -2
  237. sglang/srt/mem_cache/allocator_ascend.py +2 -2
  238. sglang/srt/mem_cache/base_prefix_cache.py +2 -2
  239. sglang/srt/mem_cache/chunk_cache.py +13 -2
  240. sglang/srt/mem_cache/common.py +480 -0
  241. sglang/srt/mem_cache/evict_policy.py +16 -1
  242. sglang/srt/mem_cache/hicache_storage.py +11 -2
  243. sglang/srt/mem_cache/hiradix_cache.py +16 -3
  244. sglang/srt/mem_cache/mamba_radix_cache.py +993 -0
  245. sglang/srt/mem_cache/memory_pool.py +517 -219
  246. sglang/srt/mem_cache/memory_pool_host.py +0 -1
  247. sglang/srt/mem_cache/multimodal_cache.py +0 -1
  248. sglang/srt/mem_cache/radix_cache.py +53 -19
  249. sglang/srt/mem_cache/radix_cache_cpp.py +19 -14
  250. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +8 -2
  251. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +1 -13
  252. sglang/srt/mem_cache/storage/backend_factory.py +2 -2
  253. sglang/srt/mem_cache/storage/eic/eic_storage.py +5 -6
  254. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +0 -1
  255. sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +3 -2
  256. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +9 -3
  257. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +5 -3
  258. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +101 -17
  259. sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +38 -9
  260. sglang/srt/mem_cache/storage/nixl/nixl_utils.py +1 -1
  261. sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +17 -2
  262. sglang/srt/mem_cache/swa_radix_cache.py +92 -26
  263. sglang/srt/metrics/collector.py +31 -0
  264. sglang/srt/metrics/func_timer.py +1 -1
  265. sglang/srt/model_executor/cuda_graph_runner.py +43 -5
  266. sglang/srt/model_executor/forward_batch_info.py +71 -25
  267. sglang/srt/model_executor/model_runner.py +362 -270
  268. sglang/srt/model_executor/npu_graph_runner.py +2 -3
  269. sglang/srt/model_executor/piecewise_cuda_graph_runner.py +549 -0
  270. sglang/srt/model_loader/__init__.py +1 -1
  271. sglang/srt/model_loader/loader.py +424 -27
  272. sglang/srt/model_loader/utils.py +0 -1
  273. sglang/srt/model_loader/weight_utils.py +47 -28
  274. sglang/srt/models/apertus.py +2 -3
  275. sglang/srt/models/arcee.py +2 -2
  276. sglang/srt/models/bailing_moe.py +13 -52
  277. sglang/srt/models/bailing_moe_nextn.py +3 -4
  278. sglang/srt/models/bert.py +1 -1
  279. sglang/srt/models/deepseek_nextn.py +19 -3
  280. sglang/srt/models/deepseek_ocr.py +1516 -0
  281. sglang/srt/models/deepseek_v2.py +418 -140
  282. sglang/srt/models/dots_ocr.py +0 -2
  283. sglang/srt/models/dots_vlm.py +0 -1
  284. sglang/srt/models/dots_vlm_vit.py +1 -1
  285. sglang/srt/models/falcon_h1.py +13 -19
  286. sglang/srt/models/gemma3_mm.py +16 -0
  287. sglang/srt/models/gemma3n_mm.py +1 -2
  288. sglang/srt/models/glm4_moe.py +327 -382
  289. sglang/srt/models/glm4_moe_nextn.py +6 -16
  290. sglang/srt/models/glm4v.py +2 -1
  291. sglang/srt/models/glm4v_moe.py +32 -199
  292. sglang/srt/models/gpt_oss.py +5 -5
  293. sglang/srt/models/grok.py +10 -23
  294. sglang/srt/models/hunyuan.py +2 -7
  295. sglang/srt/models/interns1.py +0 -1
  296. sglang/srt/models/kimi_vl.py +1 -7
  297. sglang/srt/models/kimi_vl_moonvit.py +3 -1
  298. sglang/srt/models/llama.py +2 -2
  299. sglang/srt/models/llama_eagle3.py +1 -1
  300. sglang/srt/models/longcat_flash.py +5 -22
  301. sglang/srt/models/longcat_flash_nextn.py +3 -14
  302. sglang/srt/models/mimo.py +2 -13
  303. sglang/srt/models/mimo_mtp.py +1 -2
  304. sglang/srt/models/minicpmo.py +7 -5
  305. sglang/srt/models/minimax_m2.py +922 -0
  306. sglang/srt/models/mixtral.py +1 -4
  307. sglang/srt/models/mllama.py +1 -1
  308. sglang/srt/models/mllama4.py +13 -3
  309. sglang/srt/models/nemotron_h.py +511 -0
  310. sglang/srt/models/nvila.py +355 -0
  311. sglang/srt/models/nvila_lite.py +184 -0
  312. sglang/srt/models/olmo2.py +31 -4
  313. sglang/srt/models/opt.py +5 -5
  314. sglang/srt/models/phi.py +1 -1
  315. sglang/srt/models/phi4mm.py +1 -1
  316. sglang/srt/models/phimoe.py +0 -1
  317. sglang/srt/models/pixtral.py +0 -3
  318. sglang/srt/models/points_v15_chat.py +186 -0
  319. sglang/srt/models/qwen.py +0 -1
  320. sglang/srt/models/qwen2.py +22 -1
  321. sglang/srt/models/qwen2_5_vl.py +3 -3
  322. sglang/srt/models/qwen2_audio.py +2 -15
  323. sglang/srt/models/qwen2_moe.py +15 -12
  324. sglang/srt/models/qwen2_vl.py +5 -2
  325. sglang/srt/models/qwen3.py +34 -4
  326. sglang/srt/models/qwen3_moe.py +19 -37
  327. sglang/srt/models/qwen3_next.py +7 -12
  328. sglang/srt/models/qwen3_next_mtp.py +3 -4
  329. sglang/srt/models/qwen3_omni_moe.py +661 -0
  330. sglang/srt/models/qwen3_vl.py +37 -33
  331. sglang/srt/models/qwen3_vl_moe.py +57 -185
  332. sglang/srt/models/roberta.py +55 -3
  333. sglang/srt/models/sarashina2_vision.py +0 -1
  334. sglang/srt/models/step3_vl.py +3 -5
  335. sglang/srt/models/utils.py +11 -1
  336. sglang/srt/multimodal/processors/base_processor.py +7 -2
  337. sglang/srt/multimodal/processors/deepseek_ocr.py +37 -0
  338. sglang/srt/multimodal/processors/deepseek_vl_v2.py +0 -3
  339. sglang/srt/multimodal/processors/dots_vlm.py +0 -1
  340. sglang/srt/multimodal/processors/glm4v.py +2 -6
  341. sglang/srt/multimodal/processors/internvl.py +0 -2
  342. sglang/srt/multimodal/processors/janus_pro.py +0 -1
  343. sglang/srt/multimodal/processors/mllama4.py +0 -8
  344. sglang/srt/multimodal/processors/{vila.py → nvila.py} +32 -24
  345. sglang/srt/multimodal/processors/phi4mm.py +0 -1
  346. sglang/srt/multimodal/processors/points_v15_chat.py +52 -0
  347. sglang/srt/multimodal/processors/qwen_vl.py +75 -16
  348. sglang/srt/multimodal/processors/step3_vl.py +1 -1
  349. sglang/srt/parser/conversation.py +41 -0
  350. sglang/srt/parser/reasoning_parser.py +28 -2
  351. sglang/srt/sampling/custom_logit_processor.py +77 -2
  352. sglang/srt/sampling/sampling_batch_info.py +17 -22
  353. sglang/srt/sampling/sampling_params.py +70 -2
  354. sglang/srt/server_args.py +846 -163
  355. sglang/srt/server_args_config_parser.py +1 -1
  356. sglang/srt/single_batch_overlap.py +36 -31
  357. sglang/srt/speculative/base_spec_worker.py +34 -0
  358. sglang/srt/speculative/draft_utils.py +226 -0
  359. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +24 -7
  360. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +23 -2
  361. sglang/srt/speculative/eagle_info.py +57 -18
  362. sglang/srt/speculative/eagle_info_v2.py +458 -0
  363. sglang/srt/speculative/eagle_utils.py +138 -0
  364. sglang/srt/speculative/eagle_worker.py +83 -280
  365. sglang/srt/speculative/eagle_worker_v2.py +702 -0
  366. sglang/srt/speculative/{ngram_utils.py → ngram_info.py} +14 -9
  367. sglang/srt/speculative/ngram_worker.py +12 -11
  368. sglang/srt/speculative/spec_info.py +2 -0
  369. sglang/srt/speculative/spec_utils.py +38 -3
  370. sglang/srt/speculative/standalone_worker.py +4 -14
  371. sglang/srt/tokenizer/tiktoken_tokenizer.py +2 -2
  372. sglang/srt/two_batch_overlap.py +28 -14
  373. sglang/srt/utils/__init__.py +1 -1
  374. sglang/srt/{bench_utils.py → utils/bench_utils.py} +4 -2
  375. sglang/srt/utils/common.py +272 -82
  376. sglang/srt/utils/hf_transformers_utils.py +44 -17
  377. sglang/srt/{host_shared_memory.py → utils/host_shared_memory.py} +0 -1
  378. sglang/srt/{offloader.py → utils/offloader.py} +4 -4
  379. sglang/srt/utils/profile_merger.py +199 -0
  380. sglang/test/attention/test_flashattn_backend.py +1 -1
  381. sglang/test/attention/test_flashattn_mla_backend.py +0 -1
  382. sglang/test/attention/test_prefix_chunk_info.py +0 -2
  383. sglang/test/attention/test_trtllm_mla_backend.py +221 -53
  384. sglang/test/few_shot_gsm8k_engine.py +2 -4
  385. sglang/test/kit_matched_stop.py +157 -0
  386. sglang/test/longbench_v2/__init__.py +1 -0
  387. sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
  388. sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
  389. sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
  390. sglang/test/run_eval.py +41 -0
  391. sglang/test/runners.py +2 -0
  392. sglang/test/send_one.py +42 -7
  393. sglang/test/simple_eval_common.py +3 -0
  394. sglang/test/simple_eval_gpqa.py +0 -1
  395. sglang/test/simple_eval_humaneval.py +0 -3
  396. sglang/test/simple_eval_longbench_v2.py +344 -0
  397. sglang/test/test_block_fp8.py +1 -2
  398. sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -1
  399. sglang/test/test_cutlass_moe.py +1 -2
  400. sglang/test/test_cutlass_w4a8_moe.py +10 -20
  401. sglang/test/test_deterministic.py +463 -107
  402. sglang/test/test_deterministic_utils.py +74 -0
  403. sglang/test/test_disaggregation_utils.py +81 -0
  404. sglang/test/test_marlin_moe.py +0 -1
  405. sglang/test/test_utils.py +85 -20
  406. sglang/version.py +1 -1
  407. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/METADATA +48 -35
  408. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/RECORD +414 -350
  409. sglang/srt/layers/attention/mamba/mamba_utils.py +0 -81
  410. sglang/srt/managers/tp_worker_overlap_thread.py +0 -311
  411. sglang/srt/models/vila.py +0 -306
  412. sglang/srt/speculative/build_eagle_tree.py +0 -427
  413. sglang/test/test_block_fp8_ep.py +0 -358
  414. /sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/__init__.py +0 -0
  415. /sglang/srt/{aio_rwlock.py → utils/aio_rwlock.py} +0 -0
  416. /sglang/srt/{torch_memory_saver_adapter.py → utils/torch_memory_saver_adapter.py} +0 -0
  417. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/WHEEL +0 -0
  418. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/licenses/LICENSE +0 -0
  419. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/top_level.txt +0 -0
@@ -16,9 +16,10 @@
16
16
  import contextlib
17
17
  import json
18
18
  import os
19
+ import tempfile
19
20
  import warnings
20
21
  from pathlib import Path
21
- from typing import Any, Dict, Optional, Type, Union
22
+ from typing import Any, Dict, List, Optional, Type, Union
22
23
 
23
24
  import torch
24
25
  from huggingface_hub import snapshot_download
@@ -45,27 +46,37 @@ from sglang.srt.configs import (
45
46
  KimiVLConfig,
46
47
  LongcatFlashConfig,
47
48
  MultiModalityConfig,
49
+ NemotronHConfig,
50
+ Olmo3Config,
48
51
  Qwen3NextConfig,
49
52
  Step3VLConfig,
50
53
  )
54
+ from sglang.srt.configs.deepseek_ocr import DeepseekVLV2Config
51
55
  from sglang.srt.configs.internvl import InternVLChatConfig
52
56
  from sglang.srt.connector import create_remote_connector
53
57
  from sglang.srt.utils import is_remote_url, logger, lru_cache_frozenset
54
58
 
55
- _CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = {
56
- ChatGLMConfig.model_type: ChatGLMConfig,
57
- DbrxConfig.model_type: DbrxConfig,
58
- ExaoneConfig.model_type: ExaoneConfig,
59
- DeepseekVL2Config.model_type: DeepseekVL2Config,
60
- MultiModalityConfig.model_type: MultiModalityConfig,
61
- KimiVLConfig.model_type: KimiVLConfig,
62
- InternVLChatConfig.model_type: InternVLChatConfig,
63
- Step3VLConfig.model_type: Step3VLConfig,
64
- LongcatFlashConfig.model_type: LongcatFlashConfig,
65
- Qwen3NextConfig.model_type: Qwen3NextConfig,
66
- FalconH1Config.model_type: FalconH1Config,
67
- DotsVLMConfig.model_type: DotsVLMConfig,
68
- DotsOCRConfig.model_type: DotsOCRConfig,
59
+ _CONFIG_REGISTRY: List[Type[PretrainedConfig]] = [
60
+ ChatGLMConfig,
61
+ DbrxConfig,
62
+ ExaoneConfig,
63
+ DeepseekVL2Config,
64
+ MultiModalityConfig,
65
+ KimiVLConfig,
66
+ InternVLChatConfig,
67
+ Step3VLConfig,
68
+ LongcatFlashConfig,
69
+ Olmo3Config,
70
+ Qwen3NextConfig,
71
+ FalconH1Config,
72
+ DotsVLMConfig,
73
+ DotsOCRConfig,
74
+ NemotronHConfig,
75
+ DeepseekVLV2Config,
76
+ ]
77
+
78
+ _CONFIG_REGISTRY = {
79
+ config_cls.model_type: config_cls for config_cls in _CONFIG_REGISTRY
69
80
  }
70
81
 
71
82
  for name, cls in _CONFIG_REGISTRY.items():
@@ -106,6 +117,12 @@ def get_hf_text_config(config: PretrainedConfig):
106
117
  # if transformers config doesn't align with this assumption.
107
118
  assert hasattr(config.text_config, "num_attention_heads")
108
119
  return config.text_config
120
+
121
+ if hasattr(config, "llm_config"):
122
+ # PointsV1.5 Chat Model
123
+ assert hasattr(config.llm_config, "num_attention_heads")
124
+ return config.llm_config
125
+
109
126
  if hasattr(config, "language_config"):
110
127
  return config.language_config
111
128
  if hasattr(config, "thinker_config"):
@@ -143,7 +160,7 @@ def _load_deepseek_v32_model(
143
160
  config_json["architectures"] = ["DeepseekV3ForCausalLM"]
144
161
  config_json["model_type"] = "deepseek_v3"
145
162
 
146
- tmp_path = os.path.join(local_path, "_tmp_config_folder")
163
+ tmp_path = os.path.join(tempfile.gettempdir(), "_tmp_config_folder")
147
164
  os.makedirs(tmp_path, exist_ok=True)
148
165
 
149
166
  unique_path = os.path.join(tmp_path, f"deepseek_v32_{os.getpid()}")
@@ -180,6 +197,15 @@ def get_config(
180
197
  config = AutoConfig.from_pretrained(
181
198
  model, trust_remote_code=trust_remote_code, revision=revision, **kwargs
182
199
  )
200
+ if (
201
+ getattr(config, "auto_map", None) is not None
202
+ and config.auto_map.get("AutoModel")
203
+ == "modeling_deepseekocr.DeepseekOCRForCausalLM"
204
+ ):
205
+ config.model_type = "deepseek-ocr"
206
+ # TODO: Remove this workaround when AutoConfig correctly identifies deepseek-ocr.
207
+ # Hugging Face's AutoConfig currently misidentifies it as deepseekvl2.
208
+
183
209
  except ValueError as e:
184
210
  if not "deepseek_v32" in str(e):
185
211
  raise e
@@ -202,7 +228,8 @@ def get_config(
202
228
  "intermediate_size": 4304,
203
229
  "model_type": "siglip_vision_model",
204
230
  "num_attention_heads": 16,
205
- "num_hidden_layers": 26, # Model is originally 27-layer, we only need the first 26 layers for feature extraction.
231
+ "num_hidden_layers": 26,
232
+ # Model is originally 27-layer, we only need the first 26 layers for feature extraction.
206
233
  "patch_size": 14,
207
234
  }
208
235
  config.vision_config = SiglipVisionConfig(**vision_config)
@@ -1,5 +1,4 @@
1
1
  import logging
2
- import os
3
2
  from dataclasses import dataclass
4
3
  from multiprocessing import shared_memory
5
4
  from pathlib import Path
@@ -11,14 +11,14 @@ from sglang.srt.distributed.naive_distributed import (
11
11
  get_naive_distributed,
12
12
  set_naive_distributed,
13
13
  )
14
- from sglang.srt.host_shared_memory import (
14
+ from sglang.srt.layers.parameter import ModelWeightParameter
15
+ from sglang.srt.server_args import ServerArgs
16
+ from sglang.srt.utils import MultiprocessingSerializer, is_pin_memory_available
17
+ from sglang.srt.utils.host_shared_memory import (
15
18
  HostSharedMemoryManager,
16
19
  get_host_shared_memory_manager,
17
20
  set_host_shared_memory_manager,
18
21
  )
19
- from sglang.srt.layers.parameter import ModelWeightParameter
20
- from sglang.srt.server_args import ServerArgs
21
- from sglang.srt.utils import MultiprocessingSerializer, is_pin_memory_available
22
22
 
23
23
  logger = logging.getLogger(__name__)
24
24
 
@@ -0,0 +1,199 @@
1
+ """Merge Chrome trace files from multiple ranks (TP, DP, PP, EP) into a single trace."""
2
+
3
+ import glob
4
+ import gzip
5
+ import json
6
+ import logging
7
+ import os
8
+ import re
9
+ from typing import Any, Dict, List, Optional, Tuple
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
class ProfileMerger:
    """Merge per-rank Chrome trace files (TP/DP/PP/EP) into a single trace.

    Source traces are gzip-compressed JSON files located in ``output_dir``
    whose names start with ``profile_id`` and contain a ``TP-<n>`` rank tag
    (optionally followed by ``DP-<n>``, ``PP-<n>``, ``EP-<n>``). The merged
    result is written to ``merged-<profile_id>.trace.json.gz`` in the same
    directory.
    """

    def __init__(self, output_dir: str, profile_id: str):
        """Record where traces live and configure ordering constants.

        Args:
            output_dir: Directory containing the per-rank trace files; the
                merged trace is written here as well.
            profile_id: Filename prefix shared by all traces of one run.
        """
        self.output_dir = output_dir
        self.profile_id = profile_id
        self.merged_trace_path = os.path.join(
            output_dir, f"merged-{profile_id}.trace.json.gz"
        )

        # Rank types in the order they are parsed and shown in PID labels.
        self.rank_types = ["tp", "dp", "pp", "ep"]

        # Sort index multipliers: DP (highest) > EP > PP > TP (lowest).
        # They are added to the original pid to build the new sort_index.
        self.sort_index_multipliers = {
            "dp_rank": 100_000_000,
            "ep_rank": 1_000_000,
            "pp_rank": 10_000,
            "tp_rank": 100,
        }

        # Only "process_sort_index" events whose pid is below this value get
        # their sort_index rewritten.
        self.pid_sort_index_threshold = 1000

    def merge_chrome_traces(self) -> str:
        """Merge Chrome traces from all ranks into a single trace.

        Files are processed in (dp, ep, pp, tp) rank order. ``traceEvents``
        and ``deviceProperties`` are concatenated across files; for every
        other top-level key the value from the first file that has it wins.

        Returns:
            Path to merged trace file.

        Raises:
            ValueError: If no trace files found.
        """
        trace_files = self._discover_trace_files()
        if not trace_files:
            raise ValueError(f"No trace files found for profile_id: {self.profile_id}")

        logger.info(f"Found {len(trace_files)} trace files to merge")

        merged_trace: Dict[str, Any] = {"traceEvents": []}
        all_device_properties: List[Any] = []

        for trace_file in sorted(trace_files, key=self._get_rank_sort_key):
            rank_info = self._extract_rank_info(trace_file)
            logger.info(f"Processing {trace_file} with rank info: {rank_info}")

            output = self._handle_file(trace_file, rank_info)

            merged_trace["traceEvents"].extend(output["traceEvents"])

            # Device properties are accumulated from every rank instead of
            # being treated as first-wins metadata.
            device_properties = output.pop("deviceProperties", None)
            if device_properties:
                all_device_properties.extend(device_properties)

            for key, value in output.items():
                if key != "traceEvents" and key not in merged_trace:
                    merged_trace[key] = value

        if all_device_properties:
            merged_trace["deviceProperties"] = all_device_properties

        with gzip.open(self.merged_trace_path, "wb") as f:
            f.write(json.dumps(merged_trace).encode("utf-8"))

        logger.info(f"Merged profile saved to: {self.merged_trace_path}")
        logger.info(f"Total events merged: {len(merged_trace['traceEvents'])}")

        return self.merged_trace_path

    def _discover_trace_files(self) -> List[str]:
        """Return the sorted per-rank trace files that belong to ``profile_id``.

        The directory and the profile id are glob-escaped so that characters
        such as ``[``, ``]``, ``*`` or ``?`` in either of them are matched
        literally. The previously merged output is excluded, and only files
        whose *name* (not directory) carries a ``TP-`` tag are kept.
        """
        search_pattern = os.path.join(
            glob.escape(self.output_dir),
            f"{glob.escape(self.profile_id)}*.trace.json.gz",
        )
        merged_suffix = f"merged-{self.profile_id}.trace.json.gz"

        trace_files = set()
        for path in glob.glob(search_pattern):
            name = os.path.basename(path)
            if name.endswith(merged_suffix):
                continue
            if "TP-" not in name:
                continue
            trace_files.add(path)

        # Sorted so that callers see a deterministic order.
        return sorted(trace_files)

    def _extract_rank_info(self, filename: str) -> Dict[str, int]:
        """Extract rank info (TP/DP/PP/EP) from filename.

        Returns a dict with ``<type>_rank`` keys for each tag found. The
        leading ``profile_id`` is ignored and a tag must not be preceded by a
        letter, so an id like ``STEP-7`` is not mistaken for ``EP-7``.
        """
        basename = os.path.basename(filename)
        if self.profile_id and basename.startswith(self.profile_id):
            basename = basename[len(self.profile_id) :]

        rank_info: Dict[str, int] = {}
        for rank_type in self.rank_types:
            match = re.search(rf"(?<![A-Za-z]){rank_type.upper()}-(\d+)", basename)
            if match:
                rank_info[f"{rank_type}_rank"] = int(match.group(1))

        return rank_info

    def _create_rank_label(self, rank_info: Dict[str, int]) -> str:
        """Build a label such as ``[TP00-DP01]``; ``[Unknown]`` if no ranks."""
        parts = [
            f"{rank_type.upper()}{rank_info[f'{rank_type}_rank']:02d}"
            for rank_type in self.rank_types
            if f"{rank_type}_rank" in rank_info
        ]
        return f"[{'-'.join(parts)}]" if parts else "[Unknown]"

    def _handle_file(self, path: str, rank_info: Dict[str, int]) -> Dict[str, Any]:
        """Load one gzip trace and return it with rank-labelled events.

        Best effort: any failure is logged and an empty trace is returned so
        that one bad file does not abort the whole merge.
        """
        logger.info(f"Processing file: {path}")

        try:
            with gzip.open(path, "rt", encoding="utf-8") as f:
                trace = json.load(f)

            # The Chrome trace format also allows a bare JSON array of events.
            if isinstance(trace, list):
                trace = {"traceEvents": trace}

            output = {
                key: value for key, value in trace.items() if key != "traceEvents"
            }
            output["traceEvents"] = self._process_events(
                trace.get("traceEvents", []), rank_info
            )
            return output

        except Exception as e:
            logger.error(f"Failed to process trace file {path}: {e}")
            return {"traceEvents": []}

    def _process_events(
        self, events: List[Dict], rank_info: Dict[str, int]
    ) -> List[Dict]:
        """Process events: update sort_index and add rank labels to PIDs.

        Events are modified in place and the same list is returned. Events
        without a ``pid`` key are left untouched, and a missing ``args``
        mapping on a ``process_sort_index`` event is created on demand.
        """
        rank_label = self._create_rank_label(rank_info)

        for event in events:
            if "pid" not in event:
                # Nothing to relabel; keep the event as-is.
                continue

            if event.get("name") == "process_sort_index":
                pid = self._maybe_cast_int(event["pid"])
                if pid is not None and pid < self.pid_sort_index_threshold:
                    args = event.get("args")
                    if not isinstance(args, dict):
                        args = {}
                        event["args"] = args
                    args["sort_index"] = self._calculate_sort_index(rank_info, pid)

            event["pid"] = f"{rank_label} {event['pid']}"

        return events

    def _calculate_sort_index(self, rank_info: Dict[str, int], pid: int) -> int:
        """Return ``pid`` plus each rank multiplied by its configured weight."""
        sort_index = pid
        for rank_key, multiplier in self.sort_index_multipliers.items():
            sort_index += rank_info.get(rank_key, 0) * multiplier
        return sort_index

    def _get_rank_sort_key(self, path: str) -> Tuple[int, int, int, int]:
        """Return the (dp, ep, pp, tp) ranks of ``path``; missing ranks are 0."""
        rank_info = self._extract_rank_info(path)
        return tuple(
            rank_info.get(f"{rank_type}_rank", 0)
            for rank_type in ["dp", "ep", "pp", "tp"]
        )

    def _maybe_cast_int(self, x) -> Optional[int]:
        """Return ``int(x)``, or ``None`` when ``x`` cannot be converted."""
        try:
            return int(x)
        except (ValueError, TypeError):
            return None

    def get_merge_summary(self) -> Dict[str, Any]:
        """Describe the merged trace, or return ``{"error": ...}`` on failure.

        The summary reports the merged file path, the number of merged
        events, the discovered source files, the profile id, and the number
        of accumulated device property entries.
        """
        if not os.path.exists(self.merged_trace_path):
            return {"error": "Merged trace file not found"}

        try:
            # Same explicit encoding as the reader in _handle_file.
            with gzip.open(self.merged_trace_path, "rt", encoding="utf-8") as f:
                merged_data = json.load(f)

            trace_files = self._discover_trace_files()

            return {
                "merged_file": self.merged_trace_path,
                "total_events": len(merged_data.get("traceEvents", [])),
                "total_files": len(trace_files),
                "source_files": [os.path.basename(f) for f in trace_files],
                "profile_id": self.profile_id,
                "device_properties_count": len(merged_data.get("deviceProperties", [])),
            }
        except Exception as e:
            return {"error": f"Failed to read merged trace: {str(e)}"}
@@ -66,7 +66,7 @@ class MockModelRunner:
66
66
  enable_memory_saver=False,
67
67
  )
68
68
  # Required by torch native backend
69
- self.server_args = ServerArgs(model_path="fake_model_path")
69
+ self.server_args = ServerArgs(model_path="dummy")
70
70
 
71
71
 
72
72
  @unittest.skipIf(not torch.cuda.is_available(), "Test requires CUDA")
@@ -4,7 +4,6 @@ import torch
4
4
 
5
5
  from sglang.srt.configs.model_config import AttentionArch
6
6
  from sglang.srt.layers.attention.flashattention_backend import FlashAttentionBackend
7
- from sglang.srt.layers.attention.torch_native_backend import TorchNativeAttnBackend
8
7
  from sglang.srt.layers.radix_attention import RadixAttention
9
8
  from sglang.srt.mem_cache.memory_pool import MLATokenToKVPool
10
9
  from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
@@ -2,8 +2,6 @@ import unittest
2
2
 
3
3
  import torch
4
4
 
5
- from sglang.srt.layers.attention.flashattention_backend import FlashAttentionBackend
6
- from sglang.srt.layers.radix_attention import RadixAttention
7
5
  from sglang.srt.mem_cache.memory_pool import MLATokenToKVPool
8
6
  from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
9
7
  from sglang.test.test_utils import CustomTestCase