sglang 0.5.3rc2__py3-none-any.whl → 0.5.4.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (419)
  1. sglang/bench_one_batch.py +47 -28
  2. sglang/bench_one_batch_server.py +41 -25
  3. sglang/bench_serving.py +378 -160
  4. sglang/check_env.py +1 -1
  5. sglang/compile_deep_gemm.py +6 -2
  6. sglang/global_config.py +1 -25
  7. sglang/lang/api.py +6 -0
  8. sglang/lang/interpreter.py +1 -0
  9. sglang/lang/ir.py +13 -0
  10. sglang/launch_server.py +10 -15
  11. sglang/profiler.py +18 -1
  12. sglang/srt/_custom_ops.py +1 -1
  13. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +105 -10
  14. sglang/srt/checkpoint_engine/checkpoint_engine_worker.py +142 -0
  15. sglang/srt/compilation/backend.py +437 -0
  16. sglang/srt/compilation/compilation_config.py +20 -0
  17. sglang/srt/compilation/compilation_counter.py +47 -0
  18. sglang/srt/compilation/compile.py +210 -0
  19. sglang/srt/compilation/compiler_interface.py +503 -0
  20. sglang/srt/compilation/cuda_piecewise_backend.py +228 -0
  21. sglang/srt/compilation/fix_functionalization.py +134 -0
  22. sglang/srt/compilation/fx_utils.py +83 -0
  23. sglang/srt/compilation/inductor_pass.py +140 -0
  24. sglang/srt/compilation/pass_manager.py +66 -0
  25. sglang/srt/compilation/piecewise_context_manager.py +40 -0
  26. sglang/srt/compilation/weak_ref_tensor_jit.py +16 -0
  27. sglang/srt/configs/__init__.py +4 -0
  28. sglang/srt/configs/deepseek_ocr.py +262 -0
  29. sglang/srt/configs/deepseekvl2.py +194 -96
  30. sglang/srt/configs/dots_vlm.py +2 -7
  31. sglang/srt/configs/falcon_h1.py +13 -64
  32. sglang/srt/configs/load_config.py +25 -2
  33. sglang/srt/configs/mamba_utils.py +117 -0
  34. sglang/srt/configs/model_config.py +136 -25
  35. sglang/srt/configs/modelopt_config.py +30 -0
  36. sglang/srt/configs/nemotron_h.py +286 -0
  37. sglang/srt/configs/olmo3.py +105 -0
  38. sglang/srt/configs/points_v15_chat.py +29 -0
  39. sglang/srt/configs/qwen3_next.py +11 -47
  40. sglang/srt/configs/qwen3_omni.py +613 -0
  41. sglang/srt/configs/qwen3_vl.py +0 -10
  42. sglang/srt/connector/remote_instance.py +1 -1
  43. sglang/srt/constrained/base_grammar_backend.py +5 -1
  44. sglang/srt/constrained/llguidance_backend.py +5 -0
  45. sglang/srt/constrained/outlines_backend.py +1 -1
  46. sglang/srt/constrained/reasoner_grammar_backend.py +9 -6
  47. sglang/srt/constrained/utils.py +12 -0
  48. sglang/srt/constrained/xgrammar_backend.py +20 -11
  49. sglang/srt/disaggregation/ascend/transfer_engine.py +1 -1
  50. sglang/srt/disaggregation/base/conn.py +17 -4
  51. sglang/srt/disaggregation/common/conn.py +4 -2
  52. sglang/srt/disaggregation/decode.py +123 -31
  53. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +1 -1
  54. sglang/srt/disaggregation/fake/conn.py +11 -3
  55. sglang/srt/disaggregation/mooncake/conn.py +157 -19
  56. sglang/srt/disaggregation/nixl/conn.py +69 -24
  57. sglang/srt/disaggregation/prefill.py +96 -270
  58. sglang/srt/distributed/device_communicators/all_reduce_utils.py +4 -4
  59. sglang/srt/distributed/device_communicators/custom_all_reduce.py +6 -6
  60. sglang/srt/distributed/device_communicators/pymscclpp.py +2 -2
  61. sglang/srt/distributed/device_communicators/pynccl.py +24 -12
  62. sglang/srt/distributed/device_communicators/pynccl_allocator.py +2 -2
  63. sglang/srt/distributed/device_communicators/symm_mem.py +1 -1
  64. sglang/srt/distributed/naive_distributed.py +5 -4
  65. sglang/srt/distributed/parallel_state.py +63 -19
  66. sglang/srt/elastic_ep/elastic_ep.py +74 -0
  67. sglang/srt/entrypoints/context.py +3 -2
  68. sglang/srt/entrypoints/engine.py +83 -80
  69. sglang/srt/entrypoints/grpc_server.py +430 -234
  70. sglang/srt/entrypoints/harmony_utils.py +2 -2
  71. sglang/srt/entrypoints/http_server.py +195 -102
  72. sglang/srt/entrypoints/http_server_engine.py +1 -7
  73. sglang/srt/entrypoints/openai/protocol.py +225 -37
  74. sglang/srt/entrypoints/openai/serving_base.py +49 -2
  75. sglang/srt/entrypoints/openai/serving_chat.py +29 -74
  76. sglang/srt/entrypoints/openai/serving_classify.py +204 -0
  77. sglang/srt/entrypoints/openai/serving_completions.py +15 -1
  78. sglang/srt/entrypoints/openai/serving_responses.py +5 -2
  79. sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
  80. sglang/srt/environ.py +58 -6
  81. sglang/srt/eplb/eplb_algorithms/__init__.py +18 -1
  82. sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -2
  83. sglang/srt/eplb/eplb_algorithms/elasticity_aware.py +87 -0
  84. sglang/srt/eplb/expert_distribution.py +33 -4
  85. sglang/srt/eplb/expert_location_dispatch.py +2 -2
  86. sglang/srt/eplb/expert_location_updater.py +2 -2
  87. sglang/srt/function_call/base_format_detector.py +17 -18
  88. sglang/srt/function_call/function_call_parser.py +20 -14
  89. sglang/srt/function_call/glm4_moe_detector.py +1 -5
  90. sglang/srt/function_call/gpt_oss_detector.py +1 -1
  91. sglang/srt/function_call/json_array_parser.py +0 -2
  92. sglang/srt/function_call/minimax_m2.py +367 -0
  93. sglang/srt/function_call/utils.py +2 -2
  94. sglang/srt/grpc/compile_proto.py +3 -3
  95. sglang/srt/{entrypoints → grpc}/grpc_request_manager.py +112 -52
  96. sglang/srt/grpc/health_servicer.py +189 -0
  97. sglang/srt/grpc/scheduler_launcher.py +181 -0
  98. sglang/srt/grpc/sglang_scheduler_pb2.py +78 -70
  99. sglang/srt/grpc/sglang_scheduler_pb2.pyi +66 -10
  100. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +89 -1
  101. sglang/srt/layers/activation.py +10 -1
  102. sglang/srt/layers/attention/aiter_backend.py +3 -3
  103. sglang/srt/layers/attention/ascend_backend.py +17 -1
  104. sglang/srt/layers/attention/attention_registry.py +43 -23
  105. sglang/srt/layers/attention/base_attn_backend.py +20 -1
  106. sglang/srt/layers/attention/double_sparsity_backend.py +2 -2
  107. sglang/srt/layers/attention/fla/chunk.py +0 -1
  108. sglang/srt/layers/attention/fla/chunk_o.py +1 -1
  109. sglang/srt/layers/attention/fla/index.py +0 -2
  110. sglang/srt/layers/attention/fla/layernorm_gated.py +50 -32
  111. sglang/srt/layers/attention/fla/utils.py +0 -3
  112. sglang/srt/layers/attention/fla/wy_fast.py +0 -2
  113. sglang/srt/layers/attention/flashattention_backend.py +24 -10
  114. sglang/srt/layers/attention/flashinfer_backend.py +258 -22
  115. sglang/srt/layers/attention/flashinfer_mla_backend.py +38 -28
  116. sglang/srt/layers/attention/flashmla_backend.py +2 -2
  117. sglang/srt/layers/attention/hybrid_attn_backend.py +1 -1
  118. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +165 -62
  119. sglang/srt/layers/attention/intel_amx_backend.py +1 -1
  120. sglang/srt/layers/attention/mamba/causal_conv1d.py +1 -1
  121. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +9 -5
  122. sglang/srt/layers/attention/mamba/mamba.py +189 -241
  123. sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
  124. sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
  125. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +0 -50
  126. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +0 -60
  127. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +0 -111
  128. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +0 -1
  129. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +0 -11
  130. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +1 -1
  131. sglang/srt/layers/attention/nsa/nsa_indexer.py +40 -83
  132. sglang/srt/layers/attention/nsa/triton_kernel.py +136 -0
  133. sglang/srt/layers/attention/nsa/utils.py +0 -1
  134. sglang/srt/layers/attention/nsa_backend.py +404 -90
  135. sglang/srt/layers/attention/triton_backend.py +208 -34
  136. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +2 -2
  137. sglang/srt/layers/attention/triton_ops/extend_attention.py +539 -44
  138. sglang/srt/layers/attention/trtllm_mha_backend.py +2 -2
  139. sglang/srt/layers/attention/trtllm_mla_backend.py +362 -43
  140. sglang/srt/layers/attention/utils.py +89 -7
  141. sglang/srt/layers/attention/vision.py +3 -3
  142. sglang/srt/layers/attention/xpu_backend.py +1028 -0
  143. sglang/srt/layers/communicator.py +12 -7
  144. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/compile_utils.py +5 -9
  145. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/configurer.py +4 -3
  146. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/entrypoint.py +3 -3
  147. sglang/srt/layers/dp_attention.py +17 -0
  148. sglang/srt/layers/layernorm.py +64 -19
  149. sglang/srt/layers/linear.py +9 -1
  150. sglang/srt/layers/logits_processor.py +152 -17
  151. sglang/srt/layers/modelopt_utils.py +11 -0
  152. sglang/srt/layers/moe/cutlass_moe.py +0 -2
  153. sglang/srt/layers/moe/cutlass_w4a8_moe.py +351 -21
  154. sglang/srt/layers/moe/ep_moe/kernels.py +229 -457
  155. sglang/srt/layers/moe/ep_moe/layer.py +154 -625
  156. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +1 -1
  157. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  158. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json +146 -0
  159. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +11 -3
  160. sglang/srt/layers/moe/fused_moe_triton/layer.py +79 -73
  161. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +25 -46
  162. sglang/srt/layers/moe/moe_runner/deep_gemm.py +569 -0
  163. sglang/srt/layers/moe/moe_runner/runner.py +6 -0
  164. sglang/srt/layers/moe/moe_runner/triton.py +3 -1
  165. sglang/srt/layers/moe/moe_runner/triton_kernels.py +194 -0
  166. sglang/srt/layers/moe/rocm_moe_utils.py +0 -1
  167. sglang/srt/layers/moe/router.py +51 -15
  168. sglang/srt/layers/moe/token_dispatcher/__init__.py +14 -4
  169. sglang/srt/layers/moe/token_dispatcher/base.py +12 -6
  170. sglang/srt/layers/moe/token_dispatcher/deepep.py +127 -110
  171. sglang/srt/layers/moe/token_dispatcher/mooncake.py +386 -0
  172. sglang/srt/layers/moe/token_dispatcher/standard.py +46 -0
  173. sglang/srt/layers/moe/topk.py +7 -6
  174. sglang/srt/layers/moe/utils.py +20 -5
  175. sglang/srt/layers/quantization/__init__.py +5 -58
  176. sglang/srt/layers/quantization/awq.py +183 -9
  177. sglang/srt/layers/quantization/awq_triton.py +29 -0
  178. sglang/srt/layers/quantization/base_config.py +27 -1
  179. sglang/srt/layers/quantization/compressed_tensors/__init__.py +7 -0
  180. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +20 -49
  181. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +421 -70
  182. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +3 -0
  183. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +4 -22
  184. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +339 -0
  185. sglang/srt/layers/quantization/fp8.py +152 -81
  186. sglang/srt/layers/quantization/fp8_kernel.py +55 -10
  187. sglang/srt/layers/quantization/fp8_utils.py +42 -14
  188. sglang/srt/layers/quantization/fpgemm_fp8.py +2 -3
  189. sglang/srt/layers/quantization/gguf.py +566 -0
  190. sglang/srt/layers/quantization/gptq.py +0 -1
  191. sglang/srt/layers/quantization/int8_kernel.py +18 -2
  192. sglang/srt/layers/quantization/marlin_utils.py +12 -0
  193. sglang/srt/layers/quantization/modelopt_quant.py +125 -100
  194. sglang/srt/layers/quantization/mxfp4.py +35 -68
  195. sglang/srt/layers/quantization/petit.py +1 -1
  196. sglang/srt/layers/quantization/quark/quark.py +3 -1
  197. sglang/srt/layers/quantization/quark/quark_moe.py +3 -3
  198. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +0 -7
  199. sglang/srt/layers/quantization/unquant.py +23 -48
  200. sglang/srt/layers/quantization/utils.py +0 -1
  201. sglang/srt/layers/quantization/w4afp8.py +87 -20
  202. sglang/srt/layers/quantization/w8a8_int8.py +30 -24
  203. sglang/srt/layers/radix_attention.py +62 -9
  204. sglang/srt/layers/rotary_embedding.py +686 -17
  205. sglang/srt/layers/sampler.py +47 -16
  206. sglang/srt/layers/sparse_pooler.py +98 -0
  207. sglang/srt/layers/utils.py +0 -1
  208. sglang/srt/layers/vocab_parallel_embedding.py +4 -1
  209. sglang/srt/lora/backend/triton_backend.py +0 -1
  210. sglang/srt/lora/eviction_policy.py +139 -0
  211. sglang/srt/lora/lora_manager.py +24 -9
  212. sglang/srt/lora/lora_registry.py +1 -1
  213. sglang/srt/lora/mem_pool.py +40 -16
  214. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +1 -1
  215. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +4 -2
  216. sglang/srt/managers/cache_controller.py +48 -17
  217. sglang/srt/managers/data_parallel_controller.py +146 -42
  218. sglang/srt/managers/detokenizer_manager.py +40 -13
  219. sglang/srt/managers/io_struct.py +69 -16
  220. sglang/srt/managers/mm_utils.py +20 -18
  221. sglang/srt/managers/multi_tokenizer_mixin.py +83 -82
  222. sglang/srt/managers/overlap_utils.py +96 -19
  223. sglang/srt/managers/schedule_batch.py +241 -511
  224. sglang/srt/managers/schedule_policy.py +15 -2
  225. sglang/srt/managers/scheduler.py +420 -514
  226. sglang/srt/managers/scheduler_metrics_mixin.py +73 -18
  227. sglang/srt/managers/scheduler_output_processor_mixin.py +317 -111
  228. sglang/srt/managers/scheduler_pp_mixin.py +341 -0
  229. sglang/srt/managers/scheduler_profiler_mixin.py +60 -14
  230. sglang/srt/managers/scheduler_runtime_checker_mixin.py +217 -0
  231. sglang/srt/managers/scheduler_update_weights_mixin.py +33 -14
  232. sglang/srt/managers/tokenizer_communicator_mixin.py +71 -55
  233. sglang/srt/managers/tokenizer_manager.py +375 -95
  234. sglang/srt/managers/tp_worker.py +212 -161
  235. sglang/srt/managers/utils.py +78 -2
  236. sglang/srt/mem_cache/allocator.py +7 -2
  237. sglang/srt/mem_cache/allocator_ascend.py +2 -2
  238. sglang/srt/mem_cache/base_prefix_cache.py +2 -2
  239. sglang/srt/mem_cache/chunk_cache.py +13 -2
  240. sglang/srt/mem_cache/common.py +480 -0
  241. sglang/srt/mem_cache/evict_policy.py +16 -1
  242. sglang/srt/mem_cache/hicache_storage.py +11 -2
  243. sglang/srt/mem_cache/hiradix_cache.py +16 -3
  244. sglang/srt/mem_cache/mamba_radix_cache.py +993 -0
  245. sglang/srt/mem_cache/memory_pool.py +517 -219
  246. sglang/srt/mem_cache/memory_pool_host.py +0 -1
  247. sglang/srt/mem_cache/multimodal_cache.py +0 -1
  248. sglang/srt/mem_cache/radix_cache.py +53 -19
  249. sglang/srt/mem_cache/radix_cache_cpp.py +19 -14
  250. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +8 -2
  251. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +1 -13
  252. sglang/srt/mem_cache/storage/backend_factory.py +2 -2
  253. sglang/srt/mem_cache/storage/eic/eic_storage.py +5 -6
  254. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +0 -1
  255. sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +3 -2
  256. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +9 -3
  257. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +5 -3
  258. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +101 -17
  259. sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +38 -9
  260. sglang/srt/mem_cache/storage/nixl/nixl_utils.py +1 -1
  261. sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +17 -2
  262. sglang/srt/mem_cache/swa_radix_cache.py +92 -26
  263. sglang/srt/metrics/collector.py +31 -0
  264. sglang/srt/metrics/func_timer.py +1 -1
  265. sglang/srt/model_executor/cuda_graph_runner.py +43 -5
  266. sglang/srt/model_executor/forward_batch_info.py +71 -25
  267. sglang/srt/model_executor/model_runner.py +362 -270
  268. sglang/srt/model_executor/npu_graph_runner.py +2 -3
  269. sglang/srt/model_executor/piecewise_cuda_graph_runner.py +549 -0
  270. sglang/srt/model_loader/__init__.py +1 -1
  271. sglang/srt/model_loader/loader.py +424 -27
  272. sglang/srt/model_loader/utils.py +0 -1
  273. sglang/srt/model_loader/weight_utils.py +47 -28
  274. sglang/srt/models/apertus.py +2 -3
  275. sglang/srt/models/arcee.py +2 -2
  276. sglang/srt/models/bailing_moe.py +13 -52
  277. sglang/srt/models/bailing_moe_nextn.py +3 -4
  278. sglang/srt/models/bert.py +1 -1
  279. sglang/srt/models/deepseek_nextn.py +19 -3
  280. sglang/srt/models/deepseek_ocr.py +1516 -0
  281. sglang/srt/models/deepseek_v2.py +418 -140
  282. sglang/srt/models/dots_ocr.py +0 -2
  283. sglang/srt/models/dots_vlm.py +0 -1
  284. sglang/srt/models/dots_vlm_vit.py +1 -1
  285. sglang/srt/models/falcon_h1.py +13 -19
  286. sglang/srt/models/gemma3_mm.py +16 -0
  287. sglang/srt/models/gemma3n_mm.py +1 -2
  288. sglang/srt/models/glm4_moe.py +327 -382
  289. sglang/srt/models/glm4_moe_nextn.py +6 -16
  290. sglang/srt/models/glm4v.py +2 -1
  291. sglang/srt/models/glm4v_moe.py +32 -199
  292. sglang/srt/models/gpt_oss.py +5 -5
  293. sglang/srt/models/grok.py +10 -23
  294. sglang/srt/models/hunyuan.py +2 -7
  295. sglang/srt/models/interns1.py +0 -1
  296. sglang/srt/models/kimi_vl.py +1 -7
  297. sglang/srt/models/kimi_vl_moonvit.py +3 -1
  298. sglang/srt/models/llama.py +2 -2
  299. sglang/srt/models/llama_eagle3.py +1 -1
  300. sglang/srt/models/longcat_flash.py +5 -22
  301. sglang/srt/models/longcat_flash_nextn.py +3 -14
  302. sglang/srt/models/mimo.py +2 -13
  303. sglang/srt/models/mimo_mtp.py +1 -2
  304. sglang/srt/models/minicpmo.py +7 -5
  305. sglang/srt/models/minimax_m2.py +922 -0
  306. sglang/srt/models/mixtral.py +1 -4
  307. sglang/srt/models/mllama.py +1 -1
  308. sglang/srt/models/mllama4.py +13 -3
  309. sglang/srt/models/nemotron_h.py +511 -0
  310. sglang/srt/models/nvila.py +355 -0
  311. sglang/srt/models/nvila_lite.py +184 -0
  312. sglang/srt/models/olmo2.py +31 -4
  313. sglang/srt/models/opt.py +5 -5
  314. sglang/srt/models/phi.py +1 -1
  315. sglang/srt/models/phi4mm.py +1 -1
  316. sglang/srt/models/phimoe.py +0 -1
  317. sglang/srt/models/pixtral.py +0 -3
  318. sglang/srt/models/points_v15_chat.py +186 -0
  319. sglang/srt/models/qwen.py +0 -1
  320. sglang/srt/models/qwen2.py +22 -1
  321. sglang/srt/models/qwen2_5_vl.py +3 -3
  322. sglang/srt/models/qwen2_audio.py +2 -15
  323. sglang/srt/models/qwen2_moe.py +15 -12
  324. sglang/srt/models/qwen2_vl.py +5 -2
  325. sglang/srt/models/qwen3.py +34 -4
  326. sglang/srt/models/qwen3_moe.py +19 -37
  327. sglang/srt/models/qwen3_next.py +7 -12
  328. sglang/srt/models/qwen3_next_mtp.py +3 -4
  329. sglang/srt/models/qwen3_omni_moe.py +661 -0
  330. sglang/srt/models/qwen3_vl.py +37 -33
  331. sglang/srt/models/qwen3_vl_moe.py +57 -185
  332. sglang/srt/models/roberta.py +55 -3
  333. sglang/srt/models/sarashina2_vision.py +0 -1
  334. sglang/srt/models/step3_vl.py +3 -5
  335. sglang/srt/models/utils.py +11 -1
  336. sglang/srt/multimodal/processors/base_processor.py +7 -2
  337. sglang/srt/multimodal/processors/deepseek_ocr.py +37 -0
  338. sglang/srt/multimodal/processors/deepseek_vl_v2.py +0 -3
  339. sglang/srt/multimodal/processors/dots_vlm.py +0 -1
  340. sglang/srt/multimodal/processors/glm4v.py +2 -6
  341. sglang/srt/multimodal/processors/internvl.py +0 -2
  342. sglang/srt/multimodal/processors/janus_pro.py +0 -1
  343. sglang/srt/multimodal/processors/mllama4.py +0 -8
  344. sglang/srt/multimodal/processors/{vila.py → nvila.py} +32 -24
  345. sglang/srt/multimodal/processors/phi4mm.py +0 -1
  346. sglang/srt/multimodal/processors/points_v15_chat.py +52 -0
  347. sglang/srt/multimodal/processors/qwen_vl.py +75 -16
  348. sglang/srt/multimodal/processors/step3_vl.py +1 -1
  349. sglang/srt/parser/conversation.py +41 -0
  350. sglang/srt/parser/reasoning_parser.py +28 -2
  351. sglang/srt/sampling/custom_logit_processor.py +77 -2
  352. sglang/srt/sampling/sampling_batch_info.py +17 -22
  353. sglang/srt/sampling/sampling_params.py +70 -2
  354. sglang/srt/server_args.py +846 -163
  355. sglang/srt/server_args_config_parser.py +1 -1
  356. sglang/srt/single_batch_overlap.py +36 -31
  357. sglang/srt/speculative/base_spec_worker.py +34 -0
  358. sglang/srt/speculative/draft_utils.py +226 -0
  359. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +24 -7
  360. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +23 -2
  361. sglang/srt/speculative/eagle_info.py +57 -18
  362. sglang/srt/speculative/eagle_info_v2.py +458 -0
  363. sglang/srt/speculative/eagle_utils.py +138 -0
  364. sglang/srt/speculative/eagle_worker.py +83 -280
  365. sglang/srt/speculative/eagle_worker_v2.py +702 -0
  366. sglang/srt/speculative/{ngram_utils.py → ngram_info.py} +14 -9
  367. sglang/srt/speculative/ngram_worker.py +12 -11
  368. sglang/srt/speculative/spec_info.py +2 -0
  369. sglang/srt/speculative/spec_utils.py +38 -3
  370. sglang/srt/speculative/standalone_worker.py +4 -14
  371. sglang/srt/tokenizer/tiktoken_tokenizer.py +2 -2
  372. sglang/srt/two_batch_overlap.py +28 -14
  373. sglang/srt/utils/__init__.py +1 -1
  374. sglang/srt/{bench_utils.py → utils/bench_utils.py} +4 -2
  375. sglang/srt/utils/common.py +272 -82
  376. sglang/srt/utils/hf_transformers_utils.py +44 -17
  377. sglang/srt/{host_shared_memory.py → utils/host_shared_memory.py} +0 -1
  378. sglang/srt/{offloader.py → utils/offloader.py} +4 -4
  379. sglang/srt/utils/profile_merger.py +199 -0
  380. sglang/test/attention/test_flashattn_backend.py +1 -1
  381. sglang/test/attention/test_flashattn_mla_backend.py +0 -1
  382. sglang/test/attention/test_prefix_chunk_info.py +0 -2
  383. sglang/test/attention/test_trtllm_mla_backend.py +221 -53
  384. sglang/test/few_shot_gsm8k_engine.py +2 -4
  385. sglang/test/kit_matched_stop.py +157 -0
  386. sglang/test/longbench_v2/__init__.py +1 -0
  387. sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
  388. sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
  389. sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
  390. sglang/test/run_eval.py +41 -0
  391. sglang/test/runners.py +2 -0
  392. sglang/test/send_one.py +42 -7
  393. sglang/test/simple_eval_common.py +3 -0
  394. sglang/test/simple_eval_gpqa.py +0 -1
  395. sglang/test/simple_eval_humaneval.py +0 -3
  396. sglang/test/simple_eval_longbench_v2.py +344 -0
  397. sglang/test/test_block_fp8.py +1 -2
  398. sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -1
  399. sglang/test/test_cutlass_moe.py +1 -2
  400. sglang/test/test_cutlass_w4a8_moe.py +10 -20
  401. sglang/test/test_deterministic.py +463 -107
  402. sglang/test/test_deterministic_utils.py +74 -0
  403. sglang/test/test_disaggregation_utils.py +81 -0
  404. sglang/test/test_marlin_moe.py +0 -1
  405. sglang/test/test_utils.py +85 -20
  406. sglang/version.py +1 -1
  407. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/METADATA +48 -35
  408. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/RECORD +414 -350
  409. sglang/srt/layers/attention/mamba/mamba_utils.py +0 -81
  410. sglang/srt/managers/tp_worker_overlap_thread.py +0 -311
  411. sglang/srt/models/vila.py +0 -306
  412. sglang/srt/speculative/build_eagle_tree.py +0 -427
  413. sglang/test/test_block_fp8_ep.py +0 -358
  414. /sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/__init__.py +0 -0
  415. /sglang/srt/{aio_rwlock.py → utils/aio_rwlock.py} +0 -0
  416. /sglang/srt/{torch_memory_saver_adapter.py → utils/torch_memory_saver_adapter.py} +0 -0
  417. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/WHEEL +0 -0
  418. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/licenses/LICENSE +0 -0
  419. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/top_level.txt +0 -0
sglang/srt/entrypoints/openai/serving_classify.py ADDED
@@ -0,0 +1,204 @@
+from __future__ import annotations
+
+import logging
+import time
+import uuid
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
+
+import torch
+import torch.nn.functional as F
+from fastapi import Request
+from fastapi.responses import ORJSONResponse
+
+from sglang.srt.entrypoints.openai.protocol import (
+    ClassifyRequest,
+    ClassifyResponse,
+    ErrorResponse,
+)
+from sglang.srt.entrypoints.openai.serving_base import OpenAIServingBase
+from sglang.srt.managers.io_struct import EmbeddingReqInput
+
+if TYPE_CHECKING:
+    from sglang.srt.managers.template_manager import TemplateManager
+    from sglang.srt.managers.tokenizer_manager import TokenizerManager
+
+logger = logging.getLogger(__name__)
+
+
+class OpenAIServingClassify(OpenAIServingBase):
+    """Handler for v1/classify requests"""
+
+    def __init__(
+        self,
+        tokenizer_manager: TokenizerManager,
+        template_manager: TemplateManager,
+    ):
+        super().__init__(tokenizer_manager)
+        self.template_manager = template_manager
+        self.id2label = self._get_id2label_mapping()
+        self.model_name = (
+            self.tokenizer_manager.served_model_name
+            if self.tokenizer_manager.served_model_name
+            else self.tokenizer_manager.server_args.model_path
+        )
+        if not self.id2label:
+            raise ValueError("id2label mapping is missing")
+
+    def _request_id_prefix(self) -> str:
+        return "classify-"
+
+    def _convert_to_internal_request(
+        self,
+        request: ClassifyRequest,
+        raw_request: Request = None,
+    ) -> tuple[EmbeddingReqInput, ClassifyRequest]:
+        """Convert OpenAI embedding request to internal format"""
+        prompt = request.input
+
+        if isinstance(prompt, str):
+            # Single string input
+            prompt_kwargs = {"text": prompt}
+        elif isinstance(prompt, list):
+            if len(prompt) > 0 and isinstance(prompt[0], str):
+                prompt_kwargs = {"text": prompt}
+            else:
+                # List of integers (token IDs) or empty list
+                prompt_kwargs = {"input_ids": prompt}
+        else:
+            # Other types (should not happen but handle gracefully)
+            prompt_kwargs = {"input_ids": prompt}
+
+        adapted_request = EmbeddingReqInput(
+            **prompt_kwargs,
+            rid=request.rid,
+            priority=request.priority,
+        )
+
+        return adapted_request, request
+
+    def _validate_request(self, request: ClassifyRequest) -> Optional[str]:
+        """Validate that the input is not empty or whitespace only."""
+        if not (input := request.input):
+            return "Input cannot be empty"
+
+        # Handle single string
+        if isinstance(input, str):
+            if not input.strip():
+                return "Input cannot be empty or whitespace only"
+            return None
+
+        # Handle list inputs
+        if isinstance(input, list):
+            # Check first element to determine type
+            first_item = input[0]
+
+            if isinstance(first_item, str):
+                # List of strings
+                for i, item in enumerate(input):
+                    if not isinstance(item, str):
+                        return f"All items in input list must be strings"
+                    if not item.strip():
+                        return f"Input at index {i} cannot be empty or whitespace only"
+            elif isinstance(first_item, int):
+                # List of integers (token IDs)
+                for i, item in enumerate(input):
+                    if not isinstance(item, int):
+                        return f"All items in input list must be integers"
+                    if item < 0:
+                        return f"Token ID at index {i} must be non-negative"
+        return None
+
+    def _get_id2label_mapping(self) -> Optional[Dict[int, str]]:
+        """Get id2label mapping from model config."""
+        try:
+            hf_config = self.tokenizer_manager.model_config.hf_config
+            # Check for id2label in hf_config
+            if hf_config.id2label:
+                return hf_config.id2label
+            # Check for num_labels and create default mapping if needed
+            if hasattr(hf_config, "num_labels") and hf_config.num_labels:
+                num_labels = hf_config.num_labels
+                # Create default mapping: {0: "LABEL_0", 1: "LABEL_1", ...}
+                return {i: f"LABEL_{i}" for i in range(num_labels)}
+
+        except Exception as e:
+            logger.warning(f"Failed to get id2label mapping: {e}")
+
+        return None
+
+    async def _handle_non_streaming_request(
+        self,
+        adapted_request: EmbeddingReqInput,
+        request: ClassifyRequest,
+        raw_request: Request,
+    ) -> Union[ClassifyResponse, ErrorResponse, ORJSONResponse]:
+        """Handle non-streaming classification request."""
+        # Generate request ID
+
+        try:
+            ret = await self.tokenizer_manager.generate_request(
+                adapted_request, raw_request
+            ).__anext__()
+        except ValueError as e:
+            return self.create_error_response(str(e))
+
+        if not isinstance(ret, list):
+            ret = [ret]
+
+        response = self._build_classify_response(ret)
+        return response
+
+    def _build_classify_response(self, ret: List[Dict[str, Any]]) -> ClassifyResponse:
+        request_id = f"{self._request_id_prefix()}{uuid.uuid4().hex}"
+        created_time = int(time.time())
+        classify_objects = []
+        prompt_tokens = 0
+        total_latency = 0.0
+
+        for i, item in enumerate(ret):
+            embedding = item.get("embedding", [])
+            meta_info = item.get("meta_info", {})
+
+            prompt_tokens += meta_info.get("prompt_tokens", 0)
+            total_latency += meta_info.get("e2e_latency", 0.0)
+
+            if embedding:
+                try:
+                    embedding_tensor = torch.tensor(embedding, dtype=torch.float32)
+                    probs = F.softmax(embedding_tensor, dim=0).tolist()
+
+                    predicted_class = torch.argmax(embedding_tensor).item()
+
+                    label = self.id2label[predicted_class]
+
+                except Exception as e:
+                    logger.error(f"Error processing embedding for item {i}: {e}")
+                    probs = [1.0]
+                    label = "Default"
+            else:
+                probs = [1.0]
+                label = "Default"
+
+            classify_obj = {
+                "index": i,
+                "label": label,
+                "probs": probs,
+                "num_classes": len(probs),
+            }
+            classify_objects.append(classify_obj)
+
+        response = {
+            "id": request_id,
+            "object": "list",
+            "created": created_time,
+            "model": self.model_name,
+            "data": classify_objects,
+            "usage": {
+                "prompt_tokens": prompt_tokens,
+                "total_tokens": prompt_tokens,
+                "completion_tokens": 0,
+                "prompt_tokens_details": None,
+            },
+        }
+
+        return ClassifyResponse(**response)
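The handler above routes classification through the embedding path, then softmaxes the pooled logits per item. As a usage sketch only (not part of the diff; the port, model name, and input are illustrative placeholders):

    # Hypothetical client call against the new /v1/classify route.
    import requests

    resp = requests.post(
        "http://localhost:30000/v1/classify",
        json={"model": "my-classifier", "input": "This movie was great!"},
    )
    data = resp.json()["data"][0]
    # "label" comes from the model's id2label mapping; "probs" is the softmax
    # over the pooled logits, as computed in _build_classify_response above.
    print(data["label"], data["probs"])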
sglang/srt/entrypoints/openai/serving_completions.py CHANGED
@@ -93,6 +93,17 @@ class OpenAIServingCompletion(OpenAIServingBase):
         # Extract custom labels from raw request headers
         custom_labels = self.extract_custom_labels(raw_request)
 
+        # Resolve LoRA adapter from model parameter or explicit lora_path
+        lora_path = self._resolve_lora_path(request.model, request.lora_path)
+        if lora_path:
+            first_adapter = (
+                lora_path
+                if isinstance(lora_path, str)
+                else next((a for a in lora_path if a), None)
+            )
+            if first_adapter:
+                self._validate_lora_enabled(first_adapter)
+
         adapted_request = GenerateReqInput(
             **prompt_kwargs,
             sampling_params=sampling_params,
@@ -101,7 +112,7 @@ class OpenAIServingCompletion(OpenAIServingBase):
             logprob_start_len=logprob_start_len,
             return_text_in_logprobs=True,
             stream=request.stream,
-            lora_path=request.lora_path,
+            lora_path=lora_path,
             bootstrap_host=request.bootstrap_host,
             bootstrap_port=request.bootstrap_port,
             bootstrap_room=request.bootstrap_room,
@@ -110,6 +121,7 @@ class OpenAIServingCompletion(OpenAIServingBase):
             extra_key=self._compute_extra_key(request),
             priority=request.priority,
             custom_labels=custom_labels,
+            custom_logit_processor=request.custom_logit_processor,
         )
 
         return adapted_request, request
@@ -123,6 +135,7 @@ class OpenAIServingCompletion(OpenAIServingBase):
             "min_new_tokens": request.min_tokens,
             "stop": request.stop,
             "stop_token_ids": request.stop_token_ids,
+            "stop_regex": request.stop_regex,
             "top_p": request.top_p,
             "top_k": request.top_k,
             "min_p": request.min_p,
@@ -137,6 +150,7 @@ class OpenAIServingCompletion(OpenAIServingBase):
             "ignore_eos": request.ignore_eos,
             "skip_special_tokens": request.skip_special_tokens,
             "logit_bias": request.logit_bias,
+            "custom_params": request.custom_params,
         }
 
         # Handle response_format constraints
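These hunks thread new per-request fields through the completions path: LoRA adapters are now resolved from the model parameter or an explicit lora_path, custom_logit_processor is forwarded on the GenerateReqInput, and stop_regex / custom_params join the sampling parameters. A hedged sketch of a request exercising stop_regex (the server URL and pattern are illustrative, not from the diff):

    # Hypothetical completions request using the new stop_regex sampling field.
    import requests

    resp = requests.post(
        "http://localhost:30000/v1/completions",
        json={
            "model": "default",
            "prompt": "List three fruits:",
            "max_tokens": 64,
            # Generation stops once the output matches this pattern.
            "stop_regex": r"\n\n",
        },
    )
    print(resp.json()["choices"][0]["text"])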
sglang/srt/entrypoints/openai/serving_responses.py CHANGED
@@ -14,6 +14,7 @@ from typing import TYPE_CHECKING, Any, AsyncGenerator, AsyncIterator, Optional,
 
 import jinja2
 import openai.types.responses as openai_responses_types
+import orjson
 from fastapi import Request
 from fastapi.responses import ORJSONResponse
 from openai.types.responses import (
@@ -778,7 +779,9 @@ class OpenAIServingResponses(OpenAIServingChat):
         # Update the status to "cancelled"
         response.status = "cancelled"
 
-        # Abort the request
+        # The response_id is the same as the rid used when submitting the request
+        self.tokenizer_manager.abort_request(rid=response_id)
+
         if task := self.background_tasks.get(response_id):
             task.cancel()
             try:
@@ -1061,7 +1064,7 @@ class OpenAIServingResponses(OpenAIServingChat):
         ):
             function_name = previous_item.recipient[len("browser.") :]
             action = None
-            parsed_args = json.loads(previous_item.content[0].text)
+            parsed_args = orjson.loads(previous_item.content[0].text)
             if function_name == "search":
                 action = openai_responses_types.response_function_web_search.ActionSearch(
                     type="search",
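The orjson.loads swap is a drop-in replacement for json.loads on this path: both parse a JSON string into the same Python objects, with orjson trading a Rust implementation for lower latency. A minimal equivalence check (illustrative, not from the diff):

    import json
    import orjson

    payload = '{"query": "weather in Paris", "topn": 3}'
    assert orjson.loads(payload) == json.loads(payload)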
sglang/srt/entrypoints/openai/serving_tokenize.py ADDED
@@ -0,0 +1,144 @@
+import logging
+from http import HTTPStatus
+from typing import List, Union
+
+from fastapi import Request
+
+from sglang.srt.entrypoints.openai.protocol import (
+    DetokenizeRequest,
+    DetokenizeResponse,
+    ErrorResponse,
+    TokenizeRequest,
+    TokenizeResponse,
+)
+from sglang.srt.entrypoints.openai.serving_base import OpenAIServingBase
+
+logger = logging.getLogger(__name__)
+
+
+class OpenAIServingTokenize(OpenAIServingBase):
+    """Handler for /v1/tokenize requests"""
+
+    def _request_id_prefix(self) -> str:
+        return "tok-"
+
+    def _convert_to_internal_request(
+        self, request: TokenizeRequest, raw_request: Request
+    ) -> tuple[TokenizeRequest, TokenizeRequest]:
+        return request, request
+
+    async def _handle_non_streaming_request(
+        self,
+        adapted_request: TokenizeRequest,
+        request: TokenizeRequest,
+        raw_request: Request,
+    ) -> Union[TokenizeResponse, ErrorResponse]:
+        try:
+            tokenizer = self.tokenizer_manager.tokenizer
+            max_model_len = getattr(tokenizer, "model_max_length", -1)
+
+            if isinstance(request.prompt, str):
+                token_ids = tokenizer.encode(
+                    request.prompt,
+                    add_special_tokens=request.add_special_tokens,
+                )
+                tokens = token_ids
+                count = len(token_ids)
+            elif isinstance(request.prompt, list):
+                token_ids_list = [
+                    tokenizer.encode(
+                        text, add_special_tokens=request.add_special_tokens
+                    )
+                    for text in request.prompt
+                ]
+                tokens = token_ids_list
+                count = [len(ids) for ids in token_ids_list]
+            else:
+                return self.create_error_response(
+                    f"Invalid prompt type: {type(request.prompt)}. Expected str or List[str]."
+                )
+
+            return TokenizeResponse(
+                tokens=tokens, count=count, max_model_len=max_model_len
+            )
+        except Exception as e:
+            logger.error("Error during tokenization", exc_info=True)
+            return self.create_error_response(
+                f"Internal server error during tokenization: {e}",
+                err_type="InternalServerError",
+                status_code=HTTPStatus.INTERNAL_SERVER_ERROR,
+            )
+
+
+class OpenAIServingDetokenize(OpenAIServingBase):
+    """Handler for /v1/detokenize requests"""
+
+    def _request_id_prefix(self) -> str:
+        return "detok-"
+
+    def _convert_to_internal_request(
+        self, request: DetokenizeRequest, raw_request: Request
+    ) -> tuple[DetokenizeRequest, DetokenizeRequest]:
+        return request, request
+
+    async def _handle_non_streaming_request(
+        self,
+        adapted_request: DetokenizeRequest,
+        request: DetokenizeRequest,
+        raw_request: Request,
+    ) -> Union[DetokenizeResponse, ErrorResponse]:
+        try:
+            tokenizer = self.tokenizer_manager.tokenizer
+
+            if (
+                isinstance(request.tokens, list)
+                and request.tokens
+                and isinstance(request.tokens[0], int)
+            ):
+                if not all(isinstance(t, int) for t in request.tokens):
+                    return self.create_error_response(
+                        "Invalid input: 'tokens' must be a list of integers."
+                    )
+                tokens_to_decode = [int(t) for t in request.tokens]
+                text = tokenizer.decode(
+                    tokens_to_decode, skip_special_tokens=request.skip_special_tokens
+                )
+                text_out: Union[str, List[str]] = text
+            elif (
+                isinstance(request.tokens, list)
+                and request.tokens
+                and isinstance(request.tokens[0], list)
+            ):
+                texts: List[str] = []
+                for token_list in request.tokens:
+                    if not all(isinstance(t, int) for t in token_list):
+                        return self.create_error_response(
+                            f"Invalid input: Sublist in 'tokens' must contain only integers. Found: {token_list}"
+                        )
+                    decoded_text = tokenizer.decode(
+                        [int(t) for t in token_list],
+                        skip_special_tokens=request.skip_special_tokens,
+                    )
+                    texts.append(decoded_text)
+                text_out = texts
+            elif isinstance(request.tokens, list) and not request.tokens:
+                text_out = ""
+            else:
+                return self.create_error_response(
+                    f"Invalid tokens type: {type(request.tokens)}. Expected List[int] or List[List[int]]."
+                )
+
+            return DetokenizeResponse(text=text_out)
+        except Exception as e:
+            logger.error("Error during detokenization", exc_info=True)
+            if "decode" in str(e).lower():
+                return self.create_error_response(
+                    f"Error decoding tokens: {e}. Input tokens might be invalid for the model.",
+                    err_type="DecodeError",
+                    status_code=HTTPStatus.BAD_REQUEST,
+                )
+            return self.create_error_response(
+                f"Internal server error during detokenization: {e}",
+                err_type="InternalServerError",
+                status_code=HTTPStatus.INTERNAL_SERVER_ERROR,
+            )
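Together the two handlers expose explicit /v1/tokenize and /v1/detokenize routes backed by the tokenizer_manager's tokenizer. A round-trip usage sketch (assuming a running server; field names follow the request models imported above, everything else is a placeholder):

    # Hypothetical round trip through the new tokenize/detokenize routes.
    import requests

    base = "http://localhost:30000"
    tok = requests.post(
        f"{base}/v1/tokenize",
        json={"model": "default", "prompt": "Hello world", "add_special_tokens": False},
    ).json()
    detok = requests.post(
        f"{base}/v1/detokenize",
        json={"model": "default", "tokens": tok["tokens"]},
    ).json()
    print(tok["tokens"], "->", detok["text"])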
sglang/srt/environ.py CHANGED
@@ -111,25 +111,45 @@ class Envs:
     # Model & File Download
     SGLANG_USE_MODELSCOPE = EnvBool(False)
 
+    # Logging Options
+    SGLANG_LOG_GC = EnvBool(False)
+    SGLANG_LOG_FORWARD_ITERS = EnvBool(False)
+    SGLANG_DISABLE_REQUEST_LOGGING = EnvBool(False)
+
     # Test & Debug
     SGLANG_IS_IN_CI = EnvBool(False)
-    SGLANG_AMD_CI = EnvBool(False)
-    SGLANG_TEST_RETRACT = EnvBool(False)
+    SGLANG_IS_IN_CI_AMD = EnvBool(False)
     SGLANG_SET_CPU_AFFINITY = EnvBool(False)
     SGLANG_PROFILE_WITH_STACK = EnvBool(True)
     SGLANG_RECORD_STEP_TIME = EnvBool(False)
-    SGLANG_GC_LOG = EnvBool(False)
     SGLANG_FORCE_SHUTDOWN = EnvBool(False)
     SGLANG_DEBUG_MEMORY_POOL = EnvBool(False)
     SGLANG_TEST_REQUEST_TIME_STATS = EnvBool(False)
     SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK = EnvBool(False)
-    SGLANG_DISABLE_REQUEST_LOGGING = EnvBool(False)
     SGLANG_SIMULATE_ACC_LEN = EnvFloat(-1)
     SGLANG_SIMULATE_ACC_METHOD = EnvStr("multinomial")
     SGLANG_TORCH_PROFILER_DIR = EnvStr("/tmp")
 
+    # Scheduler: memory leak test
+    SGLANG_TEST_RETRACT = EnvBool(False)
+    SGLANG_TEST_RETRACT_INTERVAL = EnvInt(3)
+    SGLANG_ENABLE_RUNTIME_MEM_LEAK_CHECK = EnvBool(False)
+
+    # Scheduler: new token ratio hyperparameters
+    SGLANG_INIT_NEW_TOKEN_RATIO = EnvFloat(0.7)
+    SGLANG_MIN_NEW_TOKEN_RATIO_FACTOR = EnvFloat(0.14)
+    SGLANG_NEW_TOKEN_RATIO_DECAY_STEPS = EnvInt(600)
+    SGLANG_RETRACT_DECODE_STEPS = EnvInt(20)
+
+    # Scheduler: others
+    SGLANG_EMPTY_CACHE_INTERVAL = EnvFloat(-1)  # in seconds. Set if you observe high memory accumulation over a long serving period.
+    # Test: pd-disaggregation
+    SGLANG_TEST_PD_DISAGG_BACKEND = EnvStr("mooncake")
+    SGLANG_TEST_PD_DISAGG_DEVICES = EnvStr(None)
+
     # Model Parallel
     SGLANG_USE_MESSAGE_QUEUE_BROADCASTER = EnvBool(True)
+    SGLANG_ONE_VISIBLE_DEVICE_PER_PROCESS = EnvBool(False)
 
     # Constrained Decoding
     SGLANG_DISABLE_OUTLINES_DISK_CACHE = EnvBool(True)
@@ -145,6 +165,7 @@ class Envs:
     # AMD & ROCm
     SGLANG_USE_AITER = EnvBool(False)
     SGLANG_ROCM_FUSED_DECODE_MLA = EnvBool(False)
+    SGLANG_ROCM_DISABLE_LINEARQUANT = EnvBool(False)
 
     # Quantization
     SGLANG_INT4_WEIGHT = EnvBool(False)
@@ -155,6 +176,7 @@ class Envs:
     # Flashinfer
     SGLANG_IS_FLASHINFER_AVAILABLE = EnvBool(True)
     SGLANG_ENABLE_FLASHINFER_GEMM = EnvBool(False)
+    SGLANG_FLASHINFER_WORKSPACE_SIZE = EnvInt(384 * 1024 * 1024)
 
     # Triton
     SGLANG_TRITON_DECODE_ATTN_STATIC_KV_SPLITS = EnvBool(False)
@@ -167,6 +189,7 @@ class Envs:
     SGLANG_EXPERT_LOCATION_UPDATER_CANARY = EnvBool(False)
     SGLANG_EXPERT_LOCATION_UPDATER_LOG_METRICS = EnvBool(False)
     SGLANG_LOG_EXPERT_LOCATION_METADATA = EnvBool(False)
+    SGLANG_EXPERT_DISTRIBUTION_RECORDER_DIR = EnvStr("/tmp")
 
     # TBO
     SGLANG_TBO_DEBUG = EnvBool(False)
@@ -183,12 +206,12 @@ class Envs:
     # sgl-kernel
     SGLANG_SKIP_SGL_KERNEL_VERSION_CHECK = EnvBool(False)
 
-    # vLLM dependencies
+    # vLLM dependencies (TODO: they have been deprecated, we can remove them safely)
     USE_VLLM_CUSTOM_ALLREDUCE = EnvBool(False)
     USE_VLLM_CUTLASS_W8A8_FP8_KERNEL = EnvBool(False)
 
     USE_TRITON_W8A8_FP8_KERNEL = EnvBool(False)
-    RETURN_ORIGINAL_LOGPROB = EnvBool(False)
+    SGLANG_RETURN_ORIGINAL_LOGPROB = EnvBool(False)
     SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN = EnvBool(False)
     SGLANG_MOE_PADDING = EnvBool(False)
     SGLANG_CUTLASS_MOE = EnvBool(False)
@@ -207,13 +230,42 @@ class Envs:
     SGLANG_TRITON_PREFILL_TRUNCATION_ALIGN_SIZE = EnvInt(4096)
     SGLANG_TRITON_DECODE_SPLIT_TILE_SIZE = EnvInt(256)
 
+    # Overlap Spec V2
+    SGLANG_ENABLE_SPEC_V2 = EnvBool(False)
+    SGLANG_ENABLE_OVERLAP_PLAN_STREAM = EnvBool(False)
+
+    # VLM
+    SGLANG_IMAGE_MAX_PIXELS = EnvInt(16384 * 28 * 28)
+    SGLANG_RESIZE_RESAMPLE = EnvStr("")
+
+    # Ktransformers
+    SGLANG_KT_MOE_NUM_GPU_EXPERTS = EnvInt(None)
+    SGLANG_KT_MOE_CPUINFER = EnvInt(None)
+    SGLANG_KT_THREADPOOL_COUNT = EnvInt(None)
+    SGLANG_KT_MOE_AMX_WEIGHT_PATH = EnvStr(None)
+    SGLANG_KT_AMX_METHOD = EnvStr(None)
+    SGLANG_KT_MOE_CHUNKED_PREFILL_SIZE = EnvInt(None)
+
+    # Sparse Embeddings
+    SGLANG_EMBEDDINGS_SPARSE_HEAD = EnvStr(None)
+
     # fmt: on
 
 
 envs = Envs()
 
 
+def _print_deprecated_env(new_name: str, old_name: str):
+    if old_name in os.environ:
+        warnings.warn(
+            f"Environment variable {old_name} will be deprecated, please use {new_name} instead"
+        )
+        os.environ[new_name] = os.environ[old_name]
+
+
 def _convert_SGL_to_SGLANG():
+    _print_deprecated_env("SGLANG_LOG_GC", "SGLANG_GC_LOG")
+
     for key, value in os.environ.items():
         if key.startswith("SGL_"):
             new_key = key.replace("SGL_", "SGLANG_", 1)
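The new _print_deprecated_env shim keeps renamed variables backward compatible: if the old name is set, it warns and mirrors the value onto the new name before Envs is read. A behavioral sketch under the assumption that _convert_SGL_to_SGLANG() still runs when the module is imported:

    # Sketch: legacy SGLANG_GC_LOG still takes effect after the rename to SGLANG_LOG_GC.
    import os

    os.environ["SGLANG_GC_LOG"] = "1"  # old name
    # After sglang.srt.environ is imported, _print_deprecated_env("SGLANG_LOG_GC",
    # "SGLANG_GC_LOG") has run, so os.environ["SGLANG_LOG_GC"] == "1" and a
    # deprecation warning has been emitted.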
sglang/srt/eplb/eplb_algorithms/__init__.py CHANGED
@@ -3,7 +3,8 @@ from typing import Optional
 
 import torch
 
-from sglang.srt.eplb.eplb_algorithms import deepseek, deepseek_vec
+from sglang.srt.elastic_ep.elastic_ep import ElasticEPStateManager
+from sglang.srt.eplb.eplb_algorithms import deepseek, deepseek_vec, elasticity_aware
 
 
 class EplbAlgorithm(Enum):
@@ -11,6 +12,7 @@ class EplbAlgorithm(Enum):
     deepseek_hierarchical = auto()
     deepseek_vec = auto()
     deepseek_vec_hierarchical = auto()
+    elasticity_aware = auto()
     # TODO may have more algorithm later
 
 
@@ -45,6 +47,21 @@ def rebalance_experts(
         enable_hierarchical=algorithm == EplbAlgorithm.deepseek_vec_hierarchical,
     )
 
+    if algorithm == EplbAlgorithm.elasticity_aware:
+        return elasticity_aware.rebalance_experts(
+            weight=tokens_per_expert.sum(dim=0),
+            num_replicas=num_physical_experts,
+            num_groups=num_groups,
+            num_nodes=num_nodes,
+            num_gpus=num_physical_experts // num_local_physical_experts,
+            enable_hierarchical=True,
+            active_ranks=(
+                ElasticEPStateManager.instance().active_ranks
+                if ElasticEPStateManager.instance() is not None
+                else ElasticEPStateManager.healthy_rank_state()
+            ),
+        )
+
     raise NotImplementedError
 
 
sglang/srt/eplb/eplb_algorithms/deepseek.py CHANGED
@@ -3,8 +3,6 @@ from typing import Tuple
 
 import torch
 
-from sglang.srt.utils import get_bool_env_var
-
 
 def balanced_packing(
     weight: torch.Tensor, num_packs: int
sglang/srt/eplb/eplb_algorithms/elasticity_aware.py ADDED
@@ -0,0 +1,87 @@
+from typing import Tuple
+
+import torch
+
+from sglang.srt.eplb.eplb_algorithms.deepseek import rebalance_experts_hierarchical
+
+
+def rebalance_experts(
+    weight: torch.Tensor,
+    num_replicas: int,
+    num_groups: int,
+    num_nodes: int,
+    num_gpus: int,
+    enable_hierarchical: bool,
+    active_ranks: torch.Tensor,
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    """
+    Entry point for expert-parallelism load balancer.
+
+    Parameters:
+        weight: [layers, num_logical_experts], the load statistics for all logical experts
+        num_replicas: number of physical experts, must be a multiple of `num_gpus`
+        num_groups: number of expert groups
+        num_nodes: number of server nodes, where the intra-node network (e.g., NVLink) is faster
+        num_gpus: number of GPUs, must be a multiple of `num_nodes`
+
+    Returns:
+        physical_to_logical_map: [layers, num_replicas], the expert index of each replica
+        logical_to_physical_map: [layers, num_logical_experts, X], the replica indices for each expert
+        expert_count: [layers, num_logical_experts], number of physical replicas for each logical expert
+    """
+
+    num_layers, num_logical_experts = weight.shape
+    weight = weight.float().cpu()
+    num_active_ranks = active_ranks.sum().item()
+    num_local_experts = num_replicas // num_gpus
+    if num_active_ranks < num_gpus:
+        # Must fall back to global load-balance policy
+        # and fix some params
+        phy2log, phyrank, logcnt = rebalance_experts_hierarchical(
+            weight,
+            num_local_experts * num_active_ranks,
+            1,
+            1,
+            num_active_ranks,
+        )
+    elif enable_hierarchical:
+        # use hierarchical load-balance policy
+        phy2log, phyrank, logcnt = rebalance_experts_hierarchical(
+            weight, num_replicas, num_groups, num_nodes, num_gpus
+        )
+    else:
+        # use global load-balance policy
+        phy2log, phyrank, logcnt = rebalance_experts_hierarchical(
+            weight, num_replicas, 1, 1, num_gpus
+        )
+    maxlogcnt = logcnt.max().item()
+    log2phy: torch.Tensor = torch.full(
+        (num_layers, num_logical_experts, maxlogcnt),
+        -1,
+        dtype=torch.int64,
+        device=logcnt.device,
+    )
+    log2phy.view(num_layers, -1).scatter_(
+        -1,
+        phy2log * maxlogcnt + phyrank,
+        torch.arange(
+            num_local_experts * num_active_ranks,
+            dtype=torch.int64,
+            device=log2phy.device,
+        ).expand(num_layers, -1),
+    )
+    if num_active_ranks < num_gpus:
+        phy2log_slices = list(
+            phy2log.view(num_layers, num_active_ranks, -1).unbind(dim=1)
+        )
+        active_ranks_list = active_ranks.tolist()
+        for idx, active_rank in enumerate(active_ranks_list):
+            if not active_rank:
+                phy2log_slices.insert(idx, torch.zeros_like(phy2log_slices[0]))
+                log2phy = torch.where(
+                    log2phy >= idx * num_local_experts,
+                    log2phy + num_local_experts,
+                    log2phy,
+                )
+        phy2log = torch.stack(phy2log_slices, dim=1).contiguous().view(num_layers, -1)
+    return phy2log, log2phy, logcnt
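To make the elastic fallback path concrete, a toy invocation under stated assumptions (2 layers, 4 logical experts, 8 replicas over 4 GPUs, rank 3 inactive; shapes follow the docstring above, and the load numbers are illustrative only):

    import torch
    from sglang.srt.eplb.eplb_algorithms.elasticity_aware import rebalance_experts

    weight = torch.randint(1, 100, (2, 4))            # [layers, num_logical_experts]
    active = torch.tensor([True, True, True, False])  # one dead rank
    phy2log, log2phy, logcnt = rebalance_experts(
        weight,
        num_replicas=8,
        num_groups=1,
        num_nodes=1,
        num_gpus=4,
        enable_hierarchical=True,
        active_ranks=active,
    )
    # With 3 of 4 ranks active, only 3 * (8 // 4) = 6 replica slots are balanced;
    # the dead rank's slice is zero-filled and log2phy indices skip past it.
    print(phy2log.shape, log2phy.shape, logcnt.shape)  # (2, 8), (2, 4, X), (2, 4)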