sglang 0.5.2rc1__py3-none-any.whl → 0.5.3rc0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (265)
  1. sglang/bench_one_batch_server.py +10 -1
  2. sglang/bench_serving.py +257 -29
  3. sglang/lang/interpreter.py +1 -1
  4. sglang/srt/configs/__init__.py +4 -0
  5. sglang/srt/configs/device_config.py +3 -1
  6. sglang/srt/configs/dots_vlm.py +139 -0
  7. sglang/srt/configs/internvl.py +6 -0
  8. sglang/srt/configs/load_config.py +1 -0
  9. sglang/srt/configs/model_config.py +50 -6
  10. sglang/srt/configs/qwen3_next.py +326 -0
  11. sglang/srt/connector/__init__.py +8 -1
  12. sglang/srt/connector/remote_instance.py +82 -0
  13. sglang/srt/constrained/base_grammar_backend.py +48 -12
  14. sglang/srt/constrained/llguidance_backend.py +0 -1
  15. sglang/srt/constrained/outlines_backend.py +0 -1
  16. sglang/srt/constrained/xgrammar_backend.py +28 -9
  17. sglang/srt/custom_op.py +11 -1
  18. sglang/srt/debug_utils/dump_comparator.py +81 -44
  19. sglang/srt/debug_utils/dump_loader.py +97 -0
  20. sglang/srt/debug_utils/dumper.py +11 -3
  21. sglang/srt/debug_utils/text_comparator.py +73 -11
  22. sglang/srt/disaggregation/base/conn.py +1 -1
  23. sglang/srt/disaggregation/common/conn.py +15 -12
  24. sglang/srt/disaggregation/decode.py +21 -10
  25. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +4 -1
  26. sglang/srt/disaggregation/fake/conn.py +1 -1
  27. sglang/srt/disaggregation/mini_lb.py +6 -445
  28. sglang/srt/disaggregation/mooncake/conn.py +18 -10
  29. sglang/srt/disaggregation/nixl/conn.py +180 -16
  30. sglang/srt/disaggregation/prefill.py +5 -3
  31. sglang/srt/disaggregation/utils.py +5 -50
  32. sglang/srt/distributed/parallel_state.py +67 -43
  33. sglang/srt/entrypoints/engine.py +38 -17
  34. sglang/srt/entrypoints/grpc_request_manager.py +580 -0
  35. sglang/srt/entrypoints/grpc_server.py +680 -0
  36. sglang/srt/entrypoints/http_server.py +88 -53
  37. sglang/srt/entrypoints/openai/protocol.py +7 -4
  38. sglang/srt/entrypoints/openai/serving_base.py +46 -3
  39. sglang/srt/entrypoints/openai/serving_chat.py +39 -19
  40. sglang/srt/entrypoints/openai/serving_completions.py +15 -4
  41. sglang/srt/entrypoints/openai/serving_embedding.py +9 -4
  42. sglang/srt/entrypoints/openai/serving_rerank.py +3 -1
  43. sglang/srt/entrypoints/openai/serving_responses.py +7 -4
  44. sglang/srt/entrypoints/openai/serving_score.py +1 -0
  45. sglang/srt/eplb/eplb_manager.py +2 -2
  46. sglang/srt/eplb/expert_distribution.py +26 -13
  47. sglang/srt/eplb/expert_location.py +8 -3
  48. sglang/srt/eplb/expert_location_updater.py +1 -1
  49. sglang/srt/function_call/base_format_detector.py +3 -6
  50. sglang/srt/function_call/ebnf_composer.py +11 -9
  51. sglang/srt/function_call/function_call_parser.py +6 -0
  52. sglang/srt/function_call/glm4_moe_detector.py +1 -1
  53. sglang/srt/function_call/gpt_oss_detector.py +1 -1
  54. sglang/srt/function_call/qwen3_coder_detector.py +1 -1
  55. sglang/srt/grpc/__init__.py +1 -0
  56. sglang/srt/grpc/sglang_scheduler_pb2.py +106 -0
  57. sglang/srt/grpc/sglang_scheduler_pb2.pyi +427 -0
  58. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +236 -0
  59. sglang/srt/hf_transformers_utils.py +4 -0
  60. sglang/srt/layers/activation.py +142 -9
  61. sglang/srt/layers/attention/aiter_backend.py +93 -68
  62. sglang/srt/layers/attention/ascend_backend.py +11 -4
  63. sglang/srt/layers/attention/fla/chunk.py +242 -0
  64. sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
  65. sglang/srt/layers/attention/fla/chunk_o.py +178 -0
  66. sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
  67. sglang/srt/layers/attention/fla/cumsum.py +300 -0
  68. sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
  69. sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
  70. sglang/srt/layers/attention/fla/index.py +37 -0
  71. sglang/srt/layers/attention/fla/l2norm.py +150 -0
  72. sglang/srt/layers/attention/fla/layernorm_gated.py +326 -0
  73. sglang/srt/layers/attention/fla/op.py +66 -0
  74. sglang/srt/layers/attention/fla/solve_tril.py +465 -0
  75. sglang/srt/layers/attention/fla/utils.py +331 -0
  76. sglang/srt/layers/attention/fla/wy_fast.py +158 -0
  77. sglang/srt/layers/attention/flashinfer_backend.py +6 -4
  78. sglang/srt/layers/attention/flashinfer_mla_backend.py +16 -12
  79. sglang/srt/layers/attention/hybrid_attn_backend.py +57 -50
  80. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +602 -0
  81. sglang/srt/layers/attention/intel_amx_backend.py +3 -0
  82. sglang/srt/layers/attention/mamba/causal_conv1d.py +128 -0
  83. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +1052 -0
  84. sglang/srt/layers/attention/mamba/mamba.py +64 -0
  85. sglang/srt/layers/attention/torch_native_backend.py +12 -6
  86. sglang/srt/layers/attention/triton_backend.py +18 -1
  87. sglang/srt/layers/attention/trtllm_mla_backend.py +124 -31
  88. sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
  89. sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
  90. sglang/srt/layers/communicator.py +45 -7
  91. sglang/srt/layers/dp_attention.py +30 -1
  92. sglang/srt/layers/layernorm.py +32 -15
  93. sglang/srt/layers/linear.py +34 -3
  94. sglang/srt/layers/logits_processor.py +29 -10
  95. sglang/srt/layers/moe/__init__.py +2 -1
  96. sglang/srt/layers/moe/cutlass_w4a8_moe.py +3 -3
  97. sglang/srt/layers/moe/ep_moe/kernels.py +1 -1
  98. sglang/srt/layers/moe/ep_moe/layer.py +182 -62
  99. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +156 -0
  100. sglang/srt/layers/moe/fused_moe_native.py +5 -3
  101. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  102. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +146 -0
  103. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +146 -0
  104. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/{E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +29 -29
  105. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  106. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
  107. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
  108. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
  109. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
  110. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  111. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
  112. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
  113. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +1 -1
  114. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +23 -20
  115. sglang/srt/layers/moe/fused_moe_triton/layer.py +61 -59
  116. sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
  117. sglang/srt/layers/moe/moe_runner/base.py +274 -1
  118. sglang/srt/layers/moe/moe_runner/runner.py +80 -0
  119. sglang/srt/layers/moe/moe_runner/triton.py +448 -0
  120. sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
  121. sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
  122. sglang/srt/layers/moe/token_dispatcher/deepep.py +43 -39
  123. sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
  124. sglang/srt/layers/moe/topk.py +30 -9
  125. sglang/srt/layers/moe/utils.py +12 -7
  126. sglang/srt/layers/quantization/awq.py +19 -7
  127. sglang/srt/layers/quantization/base_config.py +11 -6
  128. sglang/srt/layers/quantization/blockwise_int8.py +38 -27
  129. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
  130. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
  131. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
  132. sglang/srt/layers/quantization/fp8.py +76 -47
  133. sglang/srt/layers/quantization/fp8_utils.py +50 -31
  134. sglang/srt/layers/quantization/gptq.py +25 -17
  135. sglang/srt/layers/quantization/modelopt_quant.py +182 -49
  136. sglang/srt/layers/quantization/moe_wna16.py +21 -18
  137. sglang/srt/layers/quantization/mxfp4.py +68 -41
  138. sglang/srt/layers/quantization/quark/quark_moe.py +32 -27
  139. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +49 -30
  140. sglang/srt/layers/quantization/quark/utils.py +97 -0
  141. sglang/srt/layers/quantization/rocm_mxfp4_utils.py +13 -0
  142. sglang/srt/layers/quantization/unquant.py +135 -47
  143. sglang/srt/layers/quantization/w4afp8.py +30 -17
  144. sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
  145. sglang/srt/layers/quantization/w8a8_int8.py +76 -38
  146. sglang/srt/layers/rocm_linear_utils.py +44 -0
  147. sglang/srt/layers/rotary_embedding.py +0 -18
  148. sglang/srt/layers/sampler.py +162 -18
  149. sglang/srt/lora/backend/base_backend.py +50 -8
  150. sglang/srt/lora/backend/triton_backend.py +90 -2
  151. sglang/srt/lora/layers.py +32 -0
  152. sglang/srt/lora/lora.py +4 -1
  153. sglang/srt/lora/lora_manager.py +35 -112
  154. sglang/srt/lora/mem_pool.py +24 -10
  155. sglang/srt/lora/utils.py +18 -9
  156. sglang/srt/managers/async_dynamic_batch_tokenizer.py +170 -0
  157. sglang/srt/managers/cache_controller.py +200 -199
  158. sglang/srt/managers/data_parallel_controller.py +105 -35
  159. sglang/srt/managers/detokenizer_manager.py +8 -4
  160. sglang/srt/managers/disagg_service.py +46 -0
  161. sglang/srt/managers/io_struct.py +199 -12
  162. sglang/srt/managers/mm_utils.py +1 -0
  163. sglang/srt/managers/multi_tokenizer_mixin.py +351 -397
  164. sglang/srt/managers/schedule_batch.py +77 -56
  165. sglang/srt/managers/schedule_policy.py +4 -3
  166. sglang/srt/managers/scheduler.py +191 -139
  167. sglang/srt/managers/scheduler_metrics_mixin.py +116 -9
  168. sglang/srt/managers/scheduler_output_processor_mixin.py +55 -11
  169. sglang/srt/managers/scheduler_profiler_mixin.py +1 -1
  170. sglang/srt/managers/template_manager.py +3 -3
  171. sglang/srt/managers/tokenizer_communicator_mixin.py +569 -0
  172. sglang/srt/managers/tokenizer_manager.py +260 -519
  173. sglang/srt/managers/tp_worker.py +53 -4
  174. sglang/srt/managers/tp_worker_overlap_thread.py +42 -19
  175. sglang/srt/mem_cache/allocator.py +1 -1
  176. sglang/srt/mem_cache/hicache_storage.py +18 -33
  177. sglang/srt/mem_cache/hiradix_cache.py +108 -48
  178. sglang/srt/mem_cache/memory_pool.py +347 -48
  179. sglang/srt/mem_cache/memory_pool_host.py +121 -57
  180. sglang/srt/mem_cache/radix_cache.py +0 -2
  181. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
  182. sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
  183. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +95 -5
  184. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +280 -0
  185. sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
  186. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +81 -20
  187. sglang/srt/mem_cache/storage/mooncake_store/test_mooncake_store.py +161 -0
  188. sglang/srt/mem_cache/swa_radix_cache.py +0 -2
  189. sglang/srt/metrics/collector.py +502 -77
  190. sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
  191. sglang/srt/metrics/utils.py +48 -0
  192. sglang/srt/model_executor/cpu_graph_runner.py +640 -0
  193. sglang/srt/model_executor/cuda_graph_runner.py +13 -5
  194. sglang/srt/model_executor/forward_batch_info.py +75 -19
  195. sglang/srt/model_executor/model_runner.py +357 -30
  196. sglang/srt/model_loader/__init__.py +9 -3
  197. sglang/srt/model_loader/loader.py +128 -4
  198. sglang/srt/model_loader/weight_utils.py +2 -1
  199. sglang/srt/models/apertus.py +686 -0
  200. sglang/srt/models/bailing_moe.py +798 -218
  201. sglang/srt/models/bailing_moe_nextn.py +168 -0
  202. sglang/srt/models/deepseek_v2.py +346 -48
  203. sglang/srt/models/dots_vlm.py +174 -0
  204. sglang/srt/models/dots_vlm_vit.py +337 -0
  205. sglang/srt/models/ernie4.py +1 -1
  206. sglang/srt/models/gemma3n_mm.py +1 -1
  207. sglang/srt/models/glm4_moe.py +11 -2
  208. sglang/srt/models/glm4v.py +4 -2
  209. sglang/srt/models/glm4v_moe.py +3 -0
  210. sglang/srt/models/gpt_oss.py +1 -1
  211. sglang/srt/models/internvl.py +28 -0
  212. sglang/srt/models/llama4.py +9 -0
  213. sglang/srt/models/llama_eagle3.py +13 -0
  214. sglang/srt/models/longcat_flash.py +2 -2
  215. sglang/srt/models/minicpmv.py +165 -3
  216. sglang/srt/models/mllama4.py +25 -0
  217. sglang/srt/models/opt.py +637 -0
  218. sglang/srt/models/qwen2.py +7 -0
  219. sglang/srt/models/qwen2_5_vl.py +27 -3
  220. sglang/srt/models/qwen2_moe.py +60 -13
  221. sglang/srt/models/qwen3.py +8 -2
  222. sglang/srt/models/qwen3_moe.py +40 -9
  223. sglang/srt/models/qwen3_next.py +1042 -0
  224. sglang/srt/models/qwen3_next_mtp.py +112 -0
  225. sglang/srt/models/step3_vl.py +1 -1
  226. sglang/srt/models/torch_native_llama.py +1 -1
  227. sglang/srt/multimodal/processors/dots_vlm.py +99 -0
  228. sglang/srt/multimodal/processors/glm4v.py +9 -9
  229. sglang/srt/multimodal/processors/internvl.py +141 -129
  230. sglang/srt/multimodal/processors/qwen_vl.py +15 -5
  231. sglang/srt/offloader.py +27 -3
  232. sglang/srt/{reasoning_parser.py → parser/reasoning_parser.py} +1 -1
  233. sglang/srt/remote_instance_weight_loader_utils.py +69 -0
  234. sglang/srt/sampling/sampling_batch_info.py +18 -15
  235. sglang/srt/server_args.py +355 -37
  236. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +5 -0
  237. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +10 -1
  238. sglang/srt/speculative/eagle_utils.py +0 -2
  239. sglang/srt/speculative/eagle_worker.py +197 -112
  240. sglang/srt/speculative/spec_info.py +5 -0
  241. sglang/srt/speculative/standalone_worker.py +109 -0
  242. sglang/srt/tracing/trace.py +552 -0
  243. sglang/srt/utils.py +46 -3
  244. sglang/srt/weight_sync/utils.py +1 -1
  245. sglang/test/attention/test_trtllm_mla_backend.py +169 -5
  246. sglang/test/few_shot_gsm8k.py +1 -0
  247. sglang/test/runners.py +4 -0
  248. sglang/test/test_cutlass_moe.py +24 -6
  249. sglang/test/test_disaggregation_utils.py +66 -0
  250. sglang/test/test_fp4_moe.py +370 -1
  251. sglang/test/test_utils.py +28 -1
  252. sglang/utils.py +12 -0
  253. sglang/version.py +1 -1
  254. {sglang-0.5.2rc1.dist-info → sglang-0.5.3rc0.dist-info}/METADATA +59 -123
  255. {sglang-0.5.2rc1.dist-info → sglang-0.5.3rc0.dist-info}/RECORD +263 -200
  256. sglang/srt/disaggregation/launch_lb.py +0 -118
  257. sglang/srt/mem_cache/storage/mooncake_store/unit_test.py +0 -40
  258. /sglang/srt/{model_parallel.py → layers/model_parallel.py} +0 -0
  259. /sglang/srt/{code_completion_parser.py → parser/code_completion_parser.py} +0 -0
  260. /sglang/srt/{conversation.py → parser/conversation.py} +0 -0
  261. /sglang/srt/{harmony_parser.py → parser/harmony_parser.py} +0 -0
  262. /sglang/srt/{jinja_template_utils.py → parser/jinja_template_utils.py} +0 -0
  263. {sglang-0.5.2rc1.dist-info → sglang-0.5.3rc0.dist-info}/WHEEL +0 -0
  264. {sglang-0.5.2rc1.dist-info → sglang-0.5.3rc0.dist-info}/licenses/LICENSE +0 -0
  265. {sglang-0.5.2rc1.dist-info → sglang-0.5.3rc0.dist-info}/top_level.txt +0 -0
sglang/srt/server_args.py CHANGED
@@ -19,14 +19,16 @@ import json
 import logging
 import os
 import random
+import socket
 import sys
 import tempfile
 from typing import List, Literal, Optional, Union
 
+from sglang.srt.connector import ConnectorType
 from sglang.srt.function_call.function_call_parser import FunctionCallParser
 from sglang.srt.hf_transformers_utils import check_gguf_file, get_config
 from sglang.srt.lora.lora_registry import LoRARef
-from sglang.srt.reasoning_parser import ReasoningParser
+from sglang.srt.parser.reasoning_parser import ReasoningParser
 from sglang.srt.utils import (
     LORA_TARGET_ALL_MODULES,
     SUPPORTED_LORA_TARGET_MODULES,
@@ -36,14 +38,18 @@ from sglang.srt.utils import (
     is_cuda,
     is_flashinfer_available,
     is_hip,
+    is_npu,
     is_port_available,
     is_remote_url,
     is_sm90_supported,
     is_sm100_supported,
     is_triton_kernels_available,
     is_valid_ipv6_address,
+    json_list_type,
     nullable_str,
+    parse_connector_type,
 )
+from sglang.utils import is_in_ci
 
 logger = logging.getLogger(__name__)
 
@@ -60,6 +66,7 @@ LOAD_FORMAT_CHOICES = [
     "bitsandbytes",
     "layered",
     "remote",
+    "remote_instance",
 ]
 
 QUANTIZATION_CHOICES = [
@@ -94,6 +101,7 @@ ATTENTION_BACKEND_CHOICES = [
     "trtllm_mla",
     "trtllm_mha",
     "dual_chunk_flash_attn",
+    "hybrid_linear_attn",
     # AMD specific
     "aiter",
     "wave",
@@ -104,6 +112,8 @@ ATTENTION_BACKEND_CHOICES = [
 
 DISAGG_TRANSFER_BACKEND_CHOICES = ["mooncake", "nixl", "ascend", "fake"]
 
+GRAMMAR_BACKEND_CHOICES = ["xgrammar", "outlines", "llguidance", "none"]
+
 
 # Allow external code to add more choices
 def add_load_format_choices(choices):
@@ -122,6 +132,10 @@ def add_disagg_transfer_backend_choices(choices):
     DISAGG_TRANSFER_BACKEND_CHOICES.extend(choices)
 
 
+def add_grammar_backend_choices(choices):
+    GRAMMAR_BACKEND_CHOICES.extend(choices)
+
+
 @dataclasses.dataclass
 class ServerArgs:
     # Model and tokenizer
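
The hunk above adds `GRAMMAR_BACKEND_CHOICES` and a registration hook alongside the existing `add_load_format_choices`/`add_disagg_transfer_backend_choices` hooks, so external code can extend the `--grammar-backend` choices before CLI parsing. A minimal sketch of how a plugin might use it ("my_grammar" is an illustrative name, not a backend that ships with sglang):

    import argparse

    from sglang.srt.server_args import ServerArgs, add_grammar_backend_choices

    # Register the extra choice before the parser is built; after this,
    # `--grammar-backend my_grammar` passes argparse validation.
    add_grammar_backend_choices(["my_grammar"])

    parser = argparse.ArgumentParser()
    ServerArgs.add_cli_args(parser)
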
@@ -191,14 +205,20 @@ class ServerArgs:
     show_time_cost: bool = False
     enable_metrics: bool = False
     enable_metrics_for_all_schedulers: bool = False
+    tokenizer_metrics_custom_labels_header: str = "x-customer-labels"
+    tokenizer_metrics_allowed_customer_labels: Optional[List[str]] = None
     bucket_time_to_first_token: Optional[List[float]] = None
     bucket_inter_token_latency: Optional[List[float]] = None
     bucket_e2e_request_latency: Optional[List[float]] = None
     collect_tokens_histogram: bool = False
+    prompt_tokens_buckets: Optional[List[str]] = None
+    generation_tokens_buckets: Optional[List[str]] = None
     decode_log_interval: int = 40
     enable_request_time_stats_logging: bool = False
     kv_events_config: Optional[str] = None
     gc_warning_threshold_secs: float = 0.0
+    enable_trace: bool = False
+    oltp_traces_endpoint: str = "localhost:4317"
 
     # API related
     api_key: Optional[str] = None
@@ -215,6 +235,9 @@ class ServerArgs:
     # Data parallelism
     dp_size: int = 1
     load_balance_method: str = "round_robin"
+    load_watch_interval: float = 0.1
+    # FIXME: remove this after dp rank scheduling is fully supported with PD-Disaggregation
+    prefill_round_robin_balance: bool = False
 
     # Multi-node distributed serving
     dist_init_addr: Optional[str] = None
@@ -247,12 +270,14 @@ class ServerArgs:
     # Speculative decoding
     speculative_algorithm: Optional[str] = None
     speculative_draft_model_path: Optional[str] = None
+    speculative_draft_model_revision: Optional[str] = None
     speculative_num_steps: Optional[int] = None
     speculative_eagle_topk: Optional[int] = None
     speculative_num_draft_tokens: Optional[int] = None
     speculative_accept_threshold_single: float = 1.0
     speculative_accept_threshold_acc: float = 1.0
     speculative_token_map: Optional[str] = None
+    speculative_attention_mode: str = "prefill"
 
     # Expert parallelism
     ep_size: int = 1
@@ -294,6 +319,8 @@ class ServerArgs:
     hicache_storage_backend: Optional[str] = None
     hicache_storage_prefetch_policy: str = "best_effort"
     hicache_storage_backend_extra_config: Optional[str] = None
+    # LMCache
+    enable_lmcache: bool = False
 
     # Double Sparsity
     enable_double_sparsity: bool = False
@@ -338,6 +365,7 @@ class ServerArgs:
     enable_p2p_check: bool = False
     triton_attention_reduce_in_fp32: bool = False
     triton_attention_num_kv_splits: int = 8
+    triton_attention_split_tile_size: Optional[int] = None
     num_continuous_decode_steps: int = 1
     delete_ckpt_after_loading: bool = False
     enable_memory_saver: bool = False
@@ -349,6 +377,12 @@ class ServerArgs:
     disable_fast_image_processor: bool = False
     enable_return_hidden_states: bool = False
     scheduler_recv_interval: int = 1
+    numa_node: Optional[List[int]] = None
+
+    # Dynamic batch tokenizer
+    enable_dynamic_batch_tokenizer: bool = False
+    dynamic_batch_tokenizer_batch_size: int = 32
+    dynamic_batch_tokenizer_batch_timeout: float = 0.002
 
     # Debug tensor dumps
     debug_tensor_dump_output_folder: Optional[str] = None
@@ -357,7 +391,7 @@ class ServerArgs:
     debug_tensor_dump_prefill_only: bool = False
 
     # PD disaggregation: can be "null" (not disaggregated), "prefill" (prefill-only), or "decode" (decode-only)
-    disaggregation_mode: str = "null"
+    disaggregation_mode: Literal["null", "prefill", "decode"] = "null"
     disaggregation_transfer_backend: str = "mooncake"
     disaggregation_bootstrap_port: int = 8998
     disaggregation_decode_tp: Optional[int] = None
@@ -365,20 +399,32 @@ class ServerArgs:
     disaggregation_prefill_pp: Optional[int] = 1
     disaggregation_ib_device: Optional[str] = None
     num_reserved_decode_tokens: int = 512  # used for decode kv cache offload in PD
-    pdlb_url: Optional[str] = None
+
+    # FIXME: hack to reduce ITL when decode bs is small
+    disaggregation_decode_polling_interval: int = 1
 
     # For model weight update
     custom_weight_loader: Optional[List[str]] = None
     weight_loader_disable_mmap: bool = False
 
+    # Remote instance weight loading
+    remote_instance_weight_loader_seed_instance_ip: Optional[str] = None
+    remote_instance_weight_loader_seed_instance_service_port: Optional[int] = None
+    remote_instance_weight_loader_send_weights_group_ports: Optional[List[int]] = None
+
     # For PD-Multiplexing
     enable_pdmux: bool = False
     sm_group_num: int = 3
 
+    # Mamba cache
+    max_mamba_cache_size: Optional[int] = None
+    mamba_ssm_dtype: str = "float32"
+
     # Deprecated arguments
     enable_ep_moe: bool = False
     enable_deepep_moe: bool = False
     enable_flashinfer_cutlass_moe: bool = False
+    enable_flashinfer_cutedsl_moe: bool = False
     enable_flashinfer_trtllm_moe: bool = False
     enable_triton_kernel_moe: bool = False
     enable_flashinfer_mxfp4_moe: bool = False
@@ -400,6 +446,11 @@ class ServerArgs:
             print_deprecated_warning(
                 "NOTE: --enable-triton-kernel-moe is deprecated. Please set `--moe-runner-backend` to 'triton_kernel' instead."
             )
+        if self.enable_flashinfer_cutedsl_moe:
+            self.moe_runner_backend = "flashinfer_cutedsl"
+            print_deprecated_warning(
+                "NOTE: --enable-flashinfer-cutedsl-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_cutedsl' instead."
+            )
         if self.enable_flashinfer_cutlass_moe:
             self.moe_runner_backend = "flashinfer_cutlass"
             print_deprecated_warning(
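
Per the handling above, the deprecated boolean MoE flags are rewritten onto `moe_runner_backend` in `__post_init__`, so the old and new spellings launch the same configuration (the model path below is a placeholder):

    # Deprecated spelling (still accepted; prints a deprecation note):
    #   python -m sglang.launch_server --model-path <model> --enable-flashinfer-cutedsl-moe
    # Preferred spelling:
    #   python -m sglang.launch_server --model-path <model> --moe-runner-backend flashinfer_cutedsl
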
@@ -419,6 +470,7 @@ class ServerArgs:
         # Set missing default values
         if self.tokenizer_path is None:
             self.tokenizer_path = self.model_path
+
         if self.served_model_name is None:
             self.served_model_name = self.model_path
         if self.device is None:
@@ -462,9 +514,14 @@ class ServerArgs:
             # B200, MI300. (chunked_prefill_size 16k, cuda_graph_max_bs 512)
             reserved_mem = 32 * 1024
 
+            # draft model and larger cuda graph buffers
             if self.speculative_algorithm is not None:
-                # draft model and larger cuda graph buffers
-                reserved_mem += 2 * 1024
+                if self.speculative_algorithm == "STANDALONE":
+                    # Standalone speculative decoding needs more memory than other speculative
+                    # decoding algorithms since the draft model is typically larger.
+                    reserved_mem += 6 * 1024
+                else:
+                    reserved_mem += 2 * 1024
             if self.enable_dp_attention:
                 reserved_mem += 4 * 1024
@@ -507,7 +564,8 @@ class ServerArgs:
             self.sampling_backend = "pytorch"
 
         # Model-specific adjustments
-        self.model_specific_adjustments()
+        if parse_connector_type(self.model_path) != ConnectorType.INSTANCE:
+            self.model_specific_adjustments()
 
         # Set kernel backends
         if self.device == "cpu":
@@ -526,7 +584,7 @@ class ServerArgs:
             )
             self.disable_cuda_graph = True
 
-        if self.attention_backend == "ascend":
+        if is_npu() and self.attention_backend in ["ascend", "hybrid_linear_attn"]:
             logger.warning(
                 "At this moment Ascend attention backend only supports a page_size of 128, change page_size to 128."
             )
@@ -606,12 +664,13 @@ class ServerArgs:
         if self.grammar_backend is None:
             self.grammar_backend = "xgrammar"
 
+        if self.dp_size == 1:
+            self.enable_dp_attention = False
+            self.enable_dp_lm_head = False
+
         # Data parallelism attention
         if self.enable_dp_attention:
             self.schedule_conservativeness = self.schedule_conservativeness * 0.3
-            assert (
-                self.dp_size > 1
-            ), "Please set a dp-size > 1. You can use 1 < dp-size <= tp-size "
             assert self.tp_size % self.dp_size == 0
             self.chunked_prefill_size = self.chunked_prefill_size // self.dp_size
             logger.warning(
@@ -634,11 +693,13 @@ class ServerArgs:
             ], "The expert parallel size must be 1 or the same as the tensor parallel size"
 
         if self.moe_runner_backend == "flashinfer_trtllm":
-            if not self.disable_shared_experts_fusion:
-                self.disable_shared_experts_fusion = True
-                logger.warning(
-                    "FlashInfer TRTLLM MoE is enabled. --disable-shared-experts-fusion is automatically set."
-                )
+            assert (
+                self.quantization == "modelopt_fp4" or self.quantization == "fp8"
+            ), "modelopt_fp4 quantization is required for Flashinfer TRTLLM MoE"
+            self.disable_shared_experts_fusion = True
+            logger.warning(
+                "FlashInfer TRTLLM MoE is enabled. --disable-shared-experts-fusion is automatically set."
+            )
 
         # DeepEP MoE
         if self.moe_a2a_backend == "deepep":
@@ -688,12 +749,24 @@ class ServerArgs:
             self.hicache_io_backend = "kernel"
             self.hicache_mem_layout = "page_first"
 
+        if self.hicache_mem_layout == "page_first_direct":
+            if self.hicache_io_backend != "direct":
+                self.hicache_io_backend = "direct"
+                logger.warning(
+                    "Page first direct layout only support direct io backend"
+                )
+
         # Speculative Decoding
         if self.speculative_algorithm == "NEXTN":
             # NEXTN shares the same implementation of EAGLE
             self.speculative_algorithm = "EAGLE"
 
-        if self.speculative_algorithm in ("EAGLE", "EAGLE3"):
+        if self.speculative_algorithm in ("EAGLE", "EAGLE3", "STANDALONE"):
+            if self.speculative_algorithm == "STANDALONE":
+                # TODO: support dp attention for standalone speculative decoding
+                assert (
+                    self.enable_dp_attention is False
+                ), "Currently standalone speculative decoding does not support dp attention."
             if self.max_running_requests is None:
                 self.max_running_requests = 48
             self.disable_overlap_schedule = True
@@ -709,7 +782,12 @@ class ServerArgs:
             )
 
             model_arch = self.get_hf_config().architectures[0]
-            if model_arch in ["DeepseekV3ForCausalLM", "Glm4MoeForCausalLM"]:
+            if model_arch in [
+                "DeepseekV3ForCausalLM",
+                "Glm4MoeForCausalLM",
+                "BailingMoeForCausalLM",
+                "BailingMoeV2ForCausalLM",
+            ]:
                 # Auto set draft_model_path DeepSeek-V3/R1
                 if self.speculative_draft_model_path is None:
                     self.speculative_draft_model_path = self.model_path
@@ -768,12 +846,19 @@ class ServerArgs:
         ) and check_gguf_file(self.model_path):
             self.quantization = self.load_format = "gguf"
 
-        # Model loading
         if is_remote_url(self.model_path):
             self.load_format = "remote"
         if self.custom_weight_loader is None:
             self.custom_weight_loader = []
 
+        if self.load_format == "remote_instance":
+            if (
+                self.remote_instance_weight_loader_seed_instance_ip is None
+                or self.remote_instance_weight_loader_seed_instance_service_port is None
+                or self.remote_instance_weight_loader_send_weights_group_ports is None
+            ):
+                self.load_format = "auto"
+
         # PD disaggregation
         if self.disaggregation_mode == "decode":
             assert (
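
The fallback above means `--load-format remote_instance` only sticks when all three seed-instance flags are provided; otherwise `__post_init__` silently resets it to "auto". A sketch of a launch configuration that satisfies the check (the model path, address, and ports are placeholder values):

    from sglang.srt.server_args import ServerArgs

    args = ServerArgs(
        model_path="meta-llama/Llama-3.1-8B-Instruct",  # placeholder
        load_format="remote_instance",
        # All three must be set, or load_format falls back to "auto":
        remote_instance_weight_loader_seed_instance_ip="10.0.0.1",
        remote_instance_weight_loader_seed_instance_service_port=30000,
        remote_instance_weight_loader_send_weights_group_ports=[31000, 31001],
    )
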
@@ -785,6 +870,13 @@ class ServerArgs:
             self.disable_radix_cache = True
             logger.warning("KV cache is forced as chunk cache for decode server")
+
+            if self.dp_size > 1 and not is_in_ci():
+                assert self.prefill_round_robin_balance, (
+                    "Prefill round robin balance is required when dp size > 1. "
+                    "Please make sure that the prefill instance is launched with `--load-balance-method round_robin`"
+                    " and `--prefill-round-robin-balance` is set for decode server."
+                )
         elif self.disaggregation_mode == "prefill":
             if self.disaggregation_decode_tp is None:
                 self.disaggregation_decode_tp = self.tp_size
@@ -797,10 +889,19 @@ class ServerArgs:
             self.disable_cuda_graph = True
             logger.warning("Cuda graph is disabled for prefill server")
 
+        # Validation: prevent both tokenizer batching features from being enabled
+        if self.enable_tokenizer_batch_encode and self.enable_dynamic_batch_tokenizer:
+            raise ValueError(
+                "Cannot enable both --enable-tokenizer-batch-encode and --enable-dynamic-batch-tokenizer. "
+                "Please choose one tokenizer batching approach."
+            )
+
         # Propagate env vars
         os.environ["SGLANG_ENABLE_TORCH_COMPILE"] = (
             "1" if self.enable_torch_compile else "0"
         )
+        os.environ["SGLANG_MAMBA_SSM_DTYPE"] = self.mamba_ssm_dtype
+
         # Set env var before grammar backends init
         os.environ["SGLANG_DISABLE_OUTLINES_DISK_CACHE"] = (
             "1" if self.disable_outlines_disk_cache else "0"
@@ -812,6 +913,14 @@ class ServerArgs:
                 "and cannot be used at the same time. Please use only one of them."
             )
 
+        if (
+            not self.tokenizer_metrics_custom_labels_header
+            and self.tokenizer_metrics_allowed_customer_labels
+        ):
+            raise ValueError(
+                "Please set --tokenizer-metrics-custom-labels-header when setting --tokenizer-metrics-allowed-customer-labels."
+            )
+
     @staticmethod
     def add_cli_args(parser: argparse.ArgumentParser):
         # Model and tokenizer
@@ -823,16 +932,28 @@ class ServerArgs:
             required=True,
         )
         parser.add_argument(
-            "--tokenizer-path",
+            "--remote-instance-weight-loader-seed-instance-ip",
             type=str,
-            default=ServerArgs.tokenizer_path,
-            help="The path of the tokenizer.",
+            default=ServerArgs.remote_instance_weight_loader_seed_instance_ip,
+            help="The ip of the seed instance for loading weights from remote instance.",
         )
         parser.add_argument(
-            "--tokenizer-worker-num",
+            "--remote-instance-weight-loader-seed-instance-service-port",
             type=int,
-            default=ServerArgs.tokenizer_worker_num,
-            help="The worker num of the tokenizer manager.",
+            default=ServerArgs.remote_instance_weight_loader_seed_instance_service_port,
+            help="The service port of the seed instance for loading weights from remote instance.",
+        )
+        parser.add_argument(
+            "--remote-instance-weight-loader-send-weights-group-ports",
+            type=json_list_type,
+            default=ServerArgs.remote_instance_weight_loader_send_weights_group_ports,
+            help="The communication group ports for loading weights from remote instance.",
+        )
+        parser.add_argument(
+            "--tokenizer-path",
+            type=str,
+            default=ServerArgs.tokenizer_path,
+            help="The path of the tokenizer.",
         )
         parser.add_argument(
             "--tokenizer-mode",
@@ -843,6 +964,12 @@ class ServerArgs:
             "tokenizer if available, and 'slow' will "
             "always use the slow tokenizer.",
         )
+        parser.add_argument(
+            "--tokenizer-worker-num",
+            type=int,
+            default=ServerArgs.tokenizer_worker_num,
+            help="The worker num of the tokenizer manager.",
+        )
         parser.add_argument(
             "--skip-tokenizer-init",
             action="store_true",
@@ -1033,7 +1160,7 @@ class ServerArgs:
             "--schedule-policy",
             type=str,
             default=ServerArgs.schedule_policy,
-            choices=["lpm", "random", "fcfs", "dfs-weight", "lof"],
+            choices=["lpm", "random", "fcfs", "dfs-weight", "lof", "priority"],
             help="The scheduling policy of the requests.",
         )
         parser.add_argument(
@@ -1207,6 +1334,21 @@ class ServerArgs:
             "to record request metrics separately. This is especially useful when dp_attention is enabled, as "
             "otherwise all metrics appear to come from TP 0.",
         )
+        parser.add_argument(
+            "--tokenizer-metrics-custom-labels-header",
+            type=str,
+            default=ServerArgs.tokenizer_metrics_custom_labels_header,
+            help="Specify the HTTP header for passing customer labels for tokenizer metrics.",
+        )
+        parser.add_argument(
+            "--tokenizer-metrics-allowed-customer-labels",
+            type=str,
+            nargs="+",
+            default=ServerArgs.tokenizer_metrics_allowed_customer_labels,
+            help="The customer labels allowed for tokenizer metrics. The labels are specified via a dict in "
+            "'--tokenizer-metrics-custom-labels-header' field in HTTP requests, e.g., {'label1': 'value1', 'label2': "
+            "'value2'} is allowed if '--tokenizer-metrics-allowed-labels label1 label2' is set.",
+        )
         parser.add_argument(
             "--bucket-time-to-first-token",
             type=float,
1234
1376
  default=ServerArgs.collect_tokens_histogram,
1235
1377
  help="Collect prompt/generation tokens histogram.",
1236
1378
  )
1379
+ bucket_rule = (
1380
+ "Supports 3 rule types: 'default' uses predefined buckets; 'tse <middle> <base> <count>' "
1381
+ "generates two sides exponential distributed buckets (e.g., 'tse 1000 2 8' generates buckets "
1382
+ "[984.0, 992.0, 996.0, 998.0, 1000.0, 1002.0, 1004.0, 1008.0, 1016.0]).); 'customer <value1> "
1383
+ "<value2> ...' uses custom bucket values (e.g., 'customer 10 50 100 500')."
1384
+ )
1385
+ parser.add_argument(
1386
+ "--prompt-tokens-buckets",
1387
+ type=str,
1388
+ nargs="+",
1389
+ default=ServerArgs.prompt_tokens_buckets,
1390
+ help=f"The buckets rule of prompt tokens. {bucket_rule}",
1391
+ )
1392
+ parser.add_argument(
1393
+ "--generation-tokens-buckets",
1394
+ type=str,
1395
+ nargs="+",
1396
+ default=ServerArgs.generation_tokens_buckets,
1397
+ help=f"The buckets rule for generation tokens histogram. {bucket_rule}",
1398
+ )
1237
1399
  parser.add_argument(
1238
1400
  "--gc-warning-threshold-secs",
1239
1401
  type=float,
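
The 'tse' rule builds symmetric, exponentially spaced buckets around a midpoint. A sketch that reproduces the example from the help text above ('tse 1000 2 8'); the construction is inferred from that example rather than taken from the metrics code:

    def tse_buckets(middle: float, base: float, count: int) -> list:
        # Offsets base**1 .. base**(count // 2), mirrored on both sides of the
        # middle value, plus the middle itself: count + 1 buckets in total.
        offsets = [base**i for i in range(1, count // 2 + 1)]
        return sorted(
            [middle - o for o in offsets] + [middle] + [middle + o for o in offsets]
        )

    assert tse_buckets(1000.0, 2.0, 8) == [
        984.0, 992.0, 996.0, 998.0, 1000.0, 1002.0, 1004.0, 1008.0, 1016.0
    ]
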
@@ -1258,6 +1420,17 @@ class ServerArgs:
             default=None,
             help="Config in json format for NVIDIA dynamo KV event publishing. Publishing will be enabled if this flag is used.",
         )
+        parser.add_argument(
+            "--enable-trace",
+            action="store_true",
+            help="Enable opentelemetry trace",
+        )
+        parser.add_argument(
+            "--oltp-traces-endpoint",
+            type=str,
+            default="localhost:4317",
+            help="Config opentelemetry collector endpoint if --enable-trace is set. format: <ip>:<port>",
+        )
 
         # API related
         parser.add_argument(
@@ -1342,6 +1515,18 @@ class ServerArgs:
                 "minimum_tokens",
             ],
         )
+        parser.add_argument(
+            "--load-watch-interval",
+            type=float,
+            default=ServerArgs.load_watch_interval,
+            help="The interval of load watching in seconds.",
+        )
+        parser.add_argument(
+            "--prefill-round-robin-balance",
+            default=ServerArgs.prefill_round_robin_balance,
+            action="store_true",
+            help="Prefill is round robin balanced. This is used to promise decode server can get the correct dp rank.",
+        )
 
         # Multi-node distributed serving
         parser.add_argument(
@@ -1452,7 +1637,7 @@ class ServerArgs:
         parser.add_argument(
             "--grammar-backend",
             type=str,
-            choices=["xgrammar", "outlines", "llguidance", "none"],
+            choices=GRAMMAR_BACKEND_CHOICES,
             default=ServerArgs.grammar_backend,
             help="Choose the backend for grammar-guided decoding.",
         )
@@ -1468,14 +1653,23 @@ class ServerArgs:
         parser.add_argument(
             "--speculative-algorithm",
             type=str,
-            choices=["EAGLE", "EAGLE3", "NEXTN"],
+            choices=["EAGLE", "EAGLE3", "NEXTN", "STANDALONE"],
             help="Speculative algorithm.",
         )
         parser.add_argument(
             "--speculative-draft-model-path",
+            "--speculative-draft-model",
             type=str,
             help="The path of the draft model weights. This can be a local folder or a Hugging Face repo ID.",
         )
+        parser.add_argument(
+            "--speculative-draft-model-revision",
+            type=str,
+            default=None,
+            help="The specific draft model version to use. It can be a branch "
+            "name, a tag name, or a commit id. If unspecified, will use "
+            "the default version.",
+        )
         parser.add_argument(
             "--speculative-num-steps",
             type=int,
@@ -1512,6 +1706,13 @@ class ServerArgs:
             help="The path of the draft model's small vocab table.",
             default=ServerArgs.speculative_token_map,
         )
+        parser.add_argument(
+            "--speculative-attention-mode",
+            type=str,
+            choices=["prefill", "decode"],
+            help="Attention backend for speculative decoding operations (both target verify and draft extend). Can be one of 'prefill' (default) or 'decode'.",
+            default=ServerArgs.speculative_attention_mode,
+        )
 
         # Expert parallelism
         parser.add_argument(
@@ -1539,6 +1740,7 @@ class ServerArgs:
                 "flashinfer_trtllm",
                 "flashinfer_cutlass",
                 "flashinfer_mxfp4",
+                "flashinfer_cutedsl",
             ],
             default=ServerArgs.moe_runner_backend,
             help="Choose the runner backend for MoE.",
@@ -1546,7 +1748,7 @@ class ServerArgs:
         parser.add_argument(
             "--flashinfer-mxfp4-moe-precision",
             type=str,
-            choices=["mxfp4", "bf16"],
+            choices=["default", "bf16"],
            default=ServerArgs.flashinfer_mxfp4_moe_precision,
             help="Choose the computation precision of flashinfer mxfp4 moe",
         )
@@ -1639,6 +1841,21 @@ class ServerArgs:
             help="TP size for MoE dense MLP layers. This flag is useful when, with large TP size, there are errors caused by weights in MLP layers having dimension smaller than the min dimension GEMM supports.",
         )
 
+        # Mamba Cache
+        parser.add_argument(
+            "--max-mamba-cache-size",
+            type=int,
+            default=ServerArgs.max_mamba_cache_size,
+            help="The maximum size of the mamba cache.",
+        )
+        parser.add_argument(
+            "--mamba-ssm-dtype",
+            type=str,
+            default=ServerArgs.mamba_ssm_dtype,
+            choices=["float32", "bfloat16"],
+            help="The data type of the SSM states in mamba cache.",
+        )
+
         # Hierarchical cache
         parser.add_argument(
             "--enable-hierarchical-cache",
@@ -1674,7 +1891,7 @@ class ServerArgs:
         parser.add_argument(
             "--hicache-mem-layout",
             type=str,
-            choices=["layer_first", "page_first"],
+            choices=["layer_first", "page_first", "page_first_direct"],
             default=ServerArgs.hicache_mem_layout,
             help="The layout of host memory pool for hierarchical cache.",
         )
@@ -1698,6 +1915,12 @@ class ServerArgs:
             default=ServerArgs.hicache_storage_backend_extra_config,
             help="A dictionary in JSON string format containing extra configuration for the storage backend.",
         )
+        # LMCache
+        parser.add_argument(
+            "--enable-lmcache",
+            action="store_true",
+            help="Using LMCache as an alternative hierarchical cache solution",
+        )
 
         # Double Sparsity
         parser.add_argument(
@@ -1911,6 +2134,12 @@ class ServerArgs:
             default=ServerArgs.triton_attention_num_kv_splits,
             help="The number of KV splits in flash decoding Triton kernel. Larger value is better in longer context scenarios. The default value is 8.",
         )
+        parser.add_argument(
+            "--triton-attention-split-tile-size",
+            type=int,
+            default=ServerArgs.triton_attention_split_tile_size,
+            help="The size of split KV tile in flash decoding Triton kernel. Used for deterministic inference.",
+        )
         parser.add_argument(
             "--num-continuous-decode-steps",
             type=int,
@@ -1970,6 +2199,12 @@ class ServerArgs:
             default=ServerArgs.scheduler_recv_interval,
             help="The interval to poll requests in scheduler. Can be set to >1 to reduce the overhead of this.",
         )
+        parser.add_argument(
+            "--numa-node",
+            type=int,
+            nargs="+",
+            help="Sets the numa node for the subprocesses. i-th element corresponds to i-th subprocess.",
+        )
 
         # Debug tensor dumps
         parser.add_argument(
@@ -1995,12 +2230,29 @@ class ServerArgs:
             action="store_true",
             help="Only dump the tensors for prefill requests (i.e. batch size > 1).",
         )
+        parser.add_argument(
+            "--enable-dynamic-batch-tokenizer",
+            action="store_true",
+            help="Enable async dynamic batch tokenizer for improved performance when multiple requests arrive concurrently.",
+        )
+        parser.add_argument(
+            "--dynamic-batch-tokenizer-batch-size",
+            type=int,
+            default=ServerArgs.dynamic_batch_tokenizer_batch_size,
+            help="[Only used if --enable-dynamic-batch-tokenizer is set] Maximum batch size for dynamic batch tokenizer.",
+        )
+        parser.add_argument(
+            "--dynamic-batch-tokenizer-batch-timeout",
+            type=float,
+            default=ServerArgs.dynamic_batch_tokenizer_batch_timeout,
+            help="[Only used if --enable-dynamic-batch-tokenizer is set] Timeout in seconds for batching tokenization requests.",
+        )
 
         # PD disaggregation
         parser.add_argument(
             "--disaggregation-mode",
             type=str,
-            default="null",
+            default=ServerArgs.disaggregation_mode,
             choices=["null", "prefill", "decode"],
             help='Only used for PD disaggregation. "prefill" for prefill-only server, and "decode" for decode-only server. If not specified, it is not PD disaggregated',
         )
@@ -2050,10 +2302,10 @@ class ServerArgs:
             help="Number of decode tokens that will have memory reserved when adding new request to the running batch.",
         )
         parser.add_argument(
-            "--pdlb-url",
-            type=str,
-            default=None,
-            help="The URL of the PD disaggregation load balancer. If set, the prefill/decode server will register with the load balancer.",
+            "--disaggregation-decode-polling-interval",
+            type=int,
+            default=ServerArgs.disaggregation_decode_polling_interval,
+            help="The interval to poll requests in decode server. Can be set to >1 to reduce the overhead of this.",
         )
 
         # Custom weight loader
@@ -2100,6 +2352,11 @@ class ServerArgs:
             action="store_true",
             help="(Deprecated) Enable FlashInfer CUTLASS MoE backend for modelopt_fp4 quant on Blackwell. Supports MoE-EP",
         )
+        parser.add_argument(
+            "--enable-flashinfer-cutedsl-moe",
+            action="store_true",
+            help="(Deprecated) Enable FlashInfer CuteDSL MoE backend for modelopt_fp4 quant on Blackwell. Supports MoE-EP",
+        )
         parser.add_argument(
             "--enable-flashinfer-trtllm-moe",
             action="store_true",
@@ -2122,6 +2379,7 @@ class ServerArgs:
         args.pp_size = args.pipeline_parallel_size
         args.dp_size = args.data_parallel_size
         args.ep_size = args.expert_parallel_size
+
         attrs = [attr.name for attr in dataclasses.fields(cls)]
         return cls(**{attr: getattr(args, attr) for attr in attrs})
@@ -2178,13 +2436,20 @@ class ServerArgs:
 
         # Check chunked prefill
         # Skip validation if chunked prefill is disabled (i.e., size <= 0).
-        if self.chunked_prefill_size > 0:
+        # Skip validation if disaggregation mode is decode.
+        if self.chunked_prefill_size > 0 and self.disaggregation_mode != "decode":
             assert (
                 self.chunked_prefill_size % self.page_size == 0
             ), "chunked_prefill_size must be divisible by page_size"
 
         # Check multi tokenizer
         assert self.tokenizer_worker_num > 0, "Tokenizer worker num must >= 1"
+        self.validate_buckets_rule(
+            "--prompt-tokens-buckets", self.prompt_tokens_buckets
+        )
+        self.validate_buckets_rule(
+            "--generation-tokens-buckets", self.generation_tokens_buckets
+        )
 
     def check_lora_server_args(self):
         assert self.max_loras_per_batch > 0, "max_loras_per_batch must be positive"
@@ -2277,6 +2542,54 @@ class ServerArgs:
                 f"decode_tp={decode_tp}, prefill_tp={prefill_tp}"
             )
 
+    def validate_buckets_rule(self, arg_name: str, buckets_rule: List[str]):
+        if not buckets_rule:
+            return
+
+        assert len(buckets_rule) > 0, f"{arg_name} cannot be empty list"
+        rule = buckets_rule[0]
+        assert rule in [
+            "tse",
+            "default",
+            "customer",
+        ], f"Unsupported {arg_name} rule type: '{rule}'. Must be one of: 'tse', 'default', 'customer'"
+
+        if rule == "tse":
+            assert (
+                len(buckets_rule) == 4
+            ), f"{arg_name} TSE rule requires exactly 4 parameters: ['tse', middle, base, count], got {len(buckets_rule)}"
+            try:
+                middle = float(buckets_rule[1])
+                base = float(buckets_rule[2])
+                count = int(buckets_rule[3])
+            except (ValueError, IndexError):
+                assert (
+                    False
+                ), f"{arg_name} TSE rule parameters must be: ['tse', <float:middle>, <float:base>, <int:count>]"
+            assert base > 1, f"{arg_name} TSE base must be larger than 1, got: {base}"
+            assert count > 0, f"{arg_name} TSE count must be positive, got: {count}"
+            assert middle > 0, f"{arg_name} TSE middle must be positive, got: {middle}"
+
+        elif rule == "default":
+            assert (
+                len(buckets_rule) == 1
+            ), f"{arg_name} default rule should only have one parameter: ['default'], got {len(buckets_rule)}"
+
+        elif rule == "customer":
+            assert (
+                len(buckets_rule) >= 2
+            ), f"{arg_name} customer rule requires at least one bucket value: ['customer', value1, ...]"
+            try:
+                bucket_values = [float(x) for x in buckets_rule[1:]]
+            except ValueError:
+                assert False, f"{arg_name} customer rule bucket values must be numeric"
+            assert len(set(bucket_values)) == len(
+                bucket_values
+            ), f"{arg_name} customer rule bucket values should not contain duplicates"
+            assert all(
+                val >= 0 for val in bucket_values
+            ), f"{arg_name} customer rule bucket values should be non-negative"
+
     def model_specific_adjustments(self):
         hf_config = self.get_hf_config()
         model_arch = hf_config.architectures[0]
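
`validate_buckets_rule` (added above) does not read `self`, so its behavior is easy to check in isolation; `args` below is assumed to be an already-constructed `ServerArgs`:

    args.validate_buckets_rule("--prompt-tokens-buckets", ["default"])                # ok
    args.validate_buckets_rule("--prompt-tokens-buckets", ["tse", "1000", "2", "8"])  # ok
    args.validate_buckets_rule("--prompt-tokens-buckets", ["customer", "10", "50"])   # ok
    args.validate_buckets_rule("--prompt-tokens-buckets", ["tse", "1000", "1", "8"])  # AssertionError: base must be > 1
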
@@ -2336,7 +2649,8 @@ class ServerArgs:
             assert self.attention_backend in {
                 "fa3",
                 "aiter",
-            }, "fa3 or aiter is required for Llama4 model"
+                "triton",
+            }, "fa3, aiter, or triton is required for Llama4 model"
         elif model_arch in [
             "Gemma2ForCausalLM",
             "Gemma3ForCausalLM",
@@ -2535,7 +2849,9 @@ def auto_choose_speculative_params(self: ServerArgs):
     """
     hf_config = self.get_hf_config()
     arch = hf_config.architectures[0]
-
+    if self.speculative_algorithm == "STANDALONE":
+        # The default value for standalone speculative decoding
+        return (3, 1, 4)
     if arch in ["LlamaForCausalLM"]:
         # The default value for llama
         return (5, 4, 8)
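
Judging by the parameter names used elsewhere in this file, the returned triple maps onto (speculative_num_steps, speculative_eagle_topk, speculative_num_draft_tokens), so STANDALONE defaults to 3 draft steps with top-1 branching and 4 draft tokens.
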
@@ -2543,6 +2859,8 @@ def auto_choose_speculative_params(self: ServerArgs):
         "DeepseekV3ForCausalLM",
         "DeepseekV2ForCausalLM",
         "GptOssForCausalLM",
+        "BailingMoeForCausalLM",
+        "BailingMoeV2ForCausalLM",
     ]:
         # The default value for deepseek and gpt-oss
         return (3, 1, 4)