sglang 0.5.1.post2__py3-none-any.whl → 0.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (256)
  1. sglang/bench_one_batch.py +3 -0
  2. sglang/bench_one_batch_server.py +89 -54
  3. sglang/bench_serving.py +437 -40
  4. sglang/lang/interpreter.py +1 -1
  5. sglang/profiler.py +0 -1
  6. sglang/srt/configs/__init__.py +4 -0
  7. sglang/srt/configs/internvl.py +6 -0
  8. sglang/srt/configs/longcat_flash.py +104 -0
  9. sglang/srt/configs/model_config.py +37 -7
  10. sglang/srt/configs/qwen3_next.py +326 -0
  11. sglang/srt/connector/__init__.py +1 -1
  12. sglang/srt/connector/base_connector.py +1 -2
  13. sglang/srt/connector/redis.py +2 -2
  14. sglang/srt/connector/serde/__init__.py +1 -1
  15. sglang/srt/connector/serde/safe_serde.py +4 -3
  16. sglang/srt/custom_op.py +11 -1
  17. sglang/srt/debug_utils/dump_comparator.py +81 -44
  18. sglang/srt/debug_utils/dump_loader.py +97 -0
  19. sglang/srt/debug_utils/dumper.py +11 -3
  20. sglang/srt/debug_utils/text_comparator.py +73 -11
  21. sglang/srt/disaggregation/ascend/conn.py +75 -0
  22. sglang/srt/disaggregation/base/conn.py +1 -1
  23. sglang/srt/disaggregation/common/conn.py +15 -12
  24. sglang/srt/disaggregation/decode.py +6 -4
  25. sglang/srt/disaggregation/fake/conn.py +1 -1
  26. sglang/srt/disaggregation/mini_lb.py +6 -420
  27. sglang/srt/disaggregation/mooncake/conn.py +18 -10
  28. sglang/srt/disaggregation/nixl/conn.py +180 -16
  29. sglang/srt/disaggregation/prefill.py +6 -4
  30. sglang/srt/disaggregation/utils.py +5 -50
  31. sglang/srt/distributed/parallel_state.py +94 -58
  32. sglang/srt/entrypoints/engine.py +34 -14
  33. sglang/srt/entrypoints/http_server.py +172 -47
  34. sglang/srt/entrypoints/openai/protocol.py +90 -27
  35. sglang/srt/entrypoints/openai/serving_base.py +6 -2
  36. sglang/srt/entrypoints/openai/serving_chat.py +82 -26
  37. sglang/srt/entrypoints/openai/serving_completions.py +25 -4
  38. sglang/srt/entrypoints/openai/serving_embedding.py +8 -4
  39. sglang/srt/entrypoints/openai/serving_responses.py +7 -4
  40. sglang/srt/eplb/eplb_manager.py +28 -4
  41. sglang/srt/eplb/expert_distribution.py +55 -15
  42. sglang/srt/eplb/expert_location.py +8 -3
  43. sglang/srt/eplb/expert_location_updater.py +1 -1
  44. sglang/srt/function_call/deepseekv31_detector.py +222 -0
  45. sglang/srt/function_call/ebnf_composer.py +11 -9
  46. sglang/srt/function_call/function_call_parser.py +2 -0
  47. sglang/srt/function_call/glm4_moe_detector.py +1 -1
  48. sglang/srt/function_call/gpt_oss_detector.py +144 -256
  49. sglang/srt/function_call/qwen3_coder_detector.py +1 -1
  50. sglang/srt/hf_transformers_utils.py +28 -7
  51. sglang/srt/layers/activation.py +44 -9
  52. sglang/srt/layers/attention/aiter_backend.py +93 -68
  53. sglang/srt/layers/attention/ascend_backend.py +381 -136
  54. sglang/srt/layers/attention/fla/chunk.py +242 -0
  55. sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
  56. sglang/srt/layers/attention/fla/chunk_o.py +178 -0
  57. sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
  58. sglang/srt/layers/attention/fla/cumsum.py +300 -0
  59. sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
  60. sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
  61. sglang/srt/layers/attention/fla/index.py +37 -0
  62. sglang/srt/layers/attention/fla/l2norm.py +150 -0
  63. sglang/srt/layers/attention/fla/layernorm_gated.py +326 -0
  64. sglang/srt/layers/attention/fla/op.py +66 -0
  65. sglang/srt/layers/attention/fla/solve_tril.py +465 -0
  66. sglang/srt/layers/attention/fla/utils.py +331 -0
  67. sglang/srt/layers/attention/fla/wy_fast.py +158 -0
  68. sglang/srt/layers/attention/flashattention_backend.py +241 -7
  69. sglang/srt/layers/attention/flashinfer_backend.py +11 -6
  70. sglang/srt/layers/attention/flashinfer_mla_backend.py +21 -14
  71. sglang/srt/layers/attention/hybrid_attn_backend.py +47 -8
  72. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +584 -0
  73. sglang/srt/layers/attention/intel_amx_backend.py +3 -0
  74. sglang/srt/layers/attention/mamba/causal_conv1d.py +128 -0
  75. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +1052 -0
  76. sglang/srt/layers/attention/mamba/mamba.py +64 -0
  77. sglang/srt/layers/attention/torch_native_backend.py +12 -6
  78. sglang/srt/layers/attention/trtllm_mla_backend.py +126 -36
  79. sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
  80. sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
  81. sglang/srt/layers/communicator.py +45 -8
  82. sglang/srt/layers/layernorm.py +54 -12
  83. sglang/srt/layers/logits_processor.py +10 -3
  84. sglang/srt/layers/moe/__init__.py +2 -1
  85. sglang/srt/layers/moe/cutlass_moe.py +0 -8
  86. sglang/srt/layers/moe/cutlass_w4a8_moe.py +4 -12
  87. sglang/srt/layers/moe/ep_moe/kernels.py +74 -0
  88. sglang/srt/layers/moe/ep_moe/layer.py +111 -56
  89. sglang/srt/layers/moe/fused_moe_native.py +5 -3
  90. sglang/srt/layers/moe/fused_moe_triton/__init__.py +5 -3
  91. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  92. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=352,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +146 -0
  93. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/{E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +29 -29
  94. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=64,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  95. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  96. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
  97. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
  98. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  99. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +9 -1049
  100. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +212 -0
  101. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +799 -0
  102. sglang/srt/layers/moe/fused_moe_triton/layer.py +56 -45
  103. sglang/srt/layers/moe/fused_moe_triton/moe_align_block_size.py +87 -0
  104. sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
  105. sglang/srt/layers/moe/moe_runner/base.py +274 -1
  106. sglang/srt/layers/moe/moe_runner/runner.py +80 -0
  107. sglang/srt/layers/moe/moe_runner/triton.py +448 -0
  108. sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
  109. sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
  110. sglang/srt/layers/moe/token_dispatcher/deepep.py +41 -38
  111. sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
  112. sglang/srt/layers/moe/topk.py +43 -12
  113. sglang/srt/layers/moe/utils.py +6 -5
  114. sglang/srt/layers/quantization/awq.py +19 -7
  115. sglang/srt/layers/quantization/base_config.py +11 -6
  116. sglang/srt/layers/quantization/blockwise_int8.py +38 -27
  117. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
  118. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
  119. sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py +141 -235
  120. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +5 -10
  121. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +31 -22
  122. sglang/srt/layers/quantization/fp8.py +78 -48
  123. sglang/srt/layers/quantization/fp8_kernel.py +2 -2
  124. sglang/srt/layers/quantization/fp8_utils.py +45 -31
  125. sglang/srt/layers/quantization/gptq.py +25 -17
  126. sglang/srt/layers/quantization/modelopt_quant.py +107 -40
  127. sglang/srt/layers/quantization/moe_wna16.py +21 -18
  128. sglang/srt/layers/quantization/mxfp4.py +93 -68
  129. sglang/srt/layers/quantization/mxfp4_tensor.py +3 -1
  130. sglang/srt/layers/quantization/quark/quark_moe.py +32 -27
  131. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +49 -30
  132. sglang/srt/layers/quantization/quark/utils.py +97 -0
  133. sglang/srt/layers/quantization/rocm_mxfp4_utils.py +13 -0
  134. sglang/srt/layers/quantization/unquant.py +135 -47
  135. sglang/srt/layers/quantization/utils.py +13 -0
  136. sglang/srt/layers/quantization/w4afp8.py +60 -42
  137. sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
  138. sglang/srt/layers/quantization/w8a8_int8.py +83 -41
  139. sglang/srt/layers/rocm_linear_utils.py +44 -0
  140. sglang/srt/layers/rotary_embedding.py +28 -19
  141. sglang/srt/layers/sampler.py +29 -5
  142. sglang/srt/layers/utils.py +0 -14
  143. sglang/srt/lora/backend/base_backend.py +50 -8
  144. sglang/srt/lora/backend/triton_backend.py +90 -2
  145. sglang/srt/lora/layers.py +32 -0
  146. sglang/srt/lora/lora.py +4 -1
  147. sglang/srt/lora/lora_manager.py +35 -112
  148. sglang/srt/lora/mem_pool.py +24 -10
  149. sglang/srt/lora/utils.py +18 -9
  150. sglang/srt/managers/cache_controller.py +396 -365
  151. sglang/srt/managers/data_parallel_controller.py +30 -15
  152. sglang/srt/managers/detokenizer_manager.py +18 -2
  153. sglang/srt/managers/disagg_service.py +46 -0
  154. sglang/srt/managers/io_struct.py +190 -11
  155. sglang/srt/managers/mm_utils.py +6 -1
  156. sglang/srt/managers/multi_tokenizer_mixin.py +579 -0
  157. sglang/srt/managers/schedule_batch.py +27 -44
  158. sglang/srt/managers/schedule_policy.py +4 -3
  159. sglang/srt/managers/scheduler.py +148 -122
  160. sglang/srt/managers/scheduler_metrics_mixin.py +114 -8
  161. sglang/srt/managers/scheduler_output_processor_mixin.py +29 -19
  162. sglang/srt/managers/scheduler_profiler_mixin.py +1 -1
  163. sglang/srt/managers/scheduler_update_weights_mixin.py +8 -1
  164. sglang/srt/managers/template_manager.py +3 -3
  165. sglang/srt/managers/tokenizer_communicator_mixin.py +491 -0
  166. sglang/srt/managers/tokenizer_manager.py +77 -480
  167. sglang/srt/managers/tp_worker.py +16 -4
  168. sglang/srt/managers/tp_worker_overlap_thread.py +8 -10
  169. sglang/srt/mem_cache/allocator.py +1 -1
  170. sglang/srt/mem_cache/chunk_cache.py +1 -1
  171. sglang/srt/mem_cache/hicache_storage.py +53 -40
  172. sglang/srt/mem_cache/hiradix_cache.py +196 -104
  173. sglang/srt/mem_cache/lora_radix_cache.py +1 -1
  174. sglang/srt/mem_cache/memory_pool.py +395 -53
  175. sglang/srt/mem_cache/memory_pool_host.py +27 -19
  176. sglang/srt/mem_cache/radix_cache.py +6 -6
  177. sglang/srt/mem_cache/radix_cache_cpp.py +1 -1
  178. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
  179. sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
  180. sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +61 -34
  181. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +152 -23
  182. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +280 -0
  183. sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
  184. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +154 -95
  185. sglang/srt/mem_cache/storage/mooncake_store/test_mooncake_store.py +161 -0
  186. sglang/srt/mem_cache/swa_radix_cache.py +1 -3
  187. sglang/srt/metrics/collector.py +484 -63
  188. sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
  189. sglang/srt/metrics/utils.py +48 -0
  190. sglang/srt/model_executor/cpu_graph_runner.py +640 -0
  191. sglang/srt/model_executor/cuda_graph_runner.py +13 -5
  192. sglang/srt/model_executor/forward_batch_info.py +72 -18
  193. sglang/srt/model_executor/model_runner.py +190 -32
  194. sglang/srt/model_loader/__init__.py +9 -3
  195. sglang/srt/model_loader/loader.py +33 -28
  196. sglang/srt/model_loader/utils.py +12 -0
  197. sglang/srt/model_loader/weight_utils.py +2 -1
  198. sglang/srt/models/deepseek_v2.py +323 -53
  199. sglang/srt/models/gemma3n_mm.py +1 -1
  200. sglang/srt/models/glm4_moe.py +10 -1
  201. sglang/srt/models/glm4v.py +4 -2
  202. sglang/srt/models/gpt_oss.py +7 -19
  203. sglang/srt/models/internvl.py +28 -0
  204. sglang/srt/models/llama4.py +9 -0
  205. sglang/srt/models/llama_eagle3.py +17 -0
  206. sglang/srt/models/longcat_flash.py +1026 -0
  207. sglang/srt/models/longcat_flash_nextn.py +699 -0
  208. sglang/srt/models/minicpmv.py +165 -3
  209. sglang/srt/models/mllama4.py +25 -0
  210. sglang/srt/models/opt.py +637 -0
  211. sglang/srt/models/qwen2.py +33 -3
  212. sglang/srt/models/qwen2_5_vl.py +91 -42
  213. sglang/srt/models/qwen2_moe.py +79 -14
  214. sglang/srt/models/qwen3.py +8 -2
  215. sglang/srt/models/qwen3_moe.py +39 -8
  216. sglang/srt/models/qwen3_next.py +1039 -0
  217. sglang/srt/models/qwen3_next_mtp.py +109 -0
  218. sglang/srt/models/torch_native_llama.py +1 -1
  219. sglang/srt/models/transformers.py +1 -1
  220. sglang/srt/multimodal/processors/base_processor.py +4 -2
  221. sglang/srt/multimodal/processors/glm4v.py +9 -9
  222. sglang/srt/multimodal/processors/internvl.py +141 -129
  223. sglang/srt/{conversation.py → parser/conversation.py} +38 -5
  224. sglang/srt/parser/harmony_parser.py +588 -0
  225. sglang/srt/parser/reasoning_parser.py +309 -0
  226. sglang/srt/sampling/penaltylib/orchestrator.py +14 -2
  227. sglang/srt/sampling/sampling_batch_info.py +18 -15
  228. sglang/srt/server_args.py +307 -80
  229. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +5 -0
  230. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +10 -1
  231. sglang/srt/speculative/eagle_worker.py +216 -120
  232. sglang/srt/speculative/spec_info.py +5 -0
  233. sglang/srt/speculative/standalone_worker.py +109 -0
  234. sglang/srt/tokenizer/tiktoken_tokenizer.py +6 -1
  235. sglang/srt/utils.py +96 -7
  236. sglang/srt/weight_sync/utils.py +1 -1
  237. sglang/test/attention/test_trtllm_mla_backend.py +181 -8
  238. sglang/test/few_shot_gsm8k.py +1 -0
  239. sglang/test/runners.py +4 -0
  240. sglang/test/test_cutlass_moe.py +24 -6
  241. sglang/test/test_cutlass_w4a8_moe.py +24 -9
  242. sglang/test/test_disaggregation_utils.py +66 -0
  243. sglang/test/test_utils.py +25 -1
  244. sglang/utils.py +5 -0
  245. sglang/version.py +1 -1
  246. {sglang-0.5.1.post2.dist-info → sglang-0.5.2.dist-info}/METADATA +13 -10
  247. {sglang-0.5.1.post2.dist-info → sglang-0.5.2.dist-info}/RECORD +253 -201
  248. sglang/srt/disaggregation/launch_lb.py +0 -131
  249. sglang/srt/mem_cache/storage/mooncake_store/unit_test.py +0 -40
  250. sglang/srt/reasoning_parser.py +0 -553
  251. /sglang/srt/{model_parallel.py → layers/model_parallel.py} +0 -0
  252. /sglang/srt/{code_completion_parser.py → parser/code_completion_parser.py} +0 -0
  253. /sglang/srt/{jinja_template_utils.py → parser/jinja_template_utils.py} +0 -0
  254. {sglang-0.5.1.post2.dist-info → sglang-0.5.2.dist-info}/WHEEL +0 -0
  255. {sglang-0.5.1.post2.dist-info → sglang-0.5.2.dist-info}/licenses/LICENSE +0 -0
  256. {sglang-0.5.1.post2.dist-info → sglang-0.5.2.dist-info}/top_level.txt +0 -0
sglang/srt/server_args.py CHANGED
@@ -25,9 +25,8 @@ from typing import List, Literal, Optional, Union

 from sglang.srt.function_call.function_call_parser import FunctionCallParser
 from sglang.srt.hf_transformers_utils import check_gguf_file, get_config
-from sglang.srt.layers.utils import is_sm90_supported, is_sm100_supported
 from sglang.srt.lora.lora_registry import LoRARef
-from sglang.srt.reasoning_parser import ReasoningParser
+from sglang.srt.parser.reasoning_parser import ReasoningParser
 from sglang.srt.utils import (
     LORA_TARGET_ALL_MODULES,
     SUPPORTED_LORA_TARGET_MODULES,
@@ -39,20 +38,105 @@ from sglang.srt.utils import (
     is_hip,
     is_port_available,
     is_remote_url,
+    is_sm90_supported,
+    is_sm100_supported,
     is_triton_kernels_available,
     is_valid_ipv6_address,
     nullable_str,
 )
+from sglang.utils import is_in_ci

 logger = logging.getLogger(__name__)


+# Define constants
+LOAD_FORMAT_CHOICES = [
+    "auto",
+    "pt",
+    "safetensors",
+    "npcache",
+    "dummy",
+    "sharded_state",
+    "gguf",
+    "bitsandbytes",
+    "layered",
+    "remote",
+]
+
+QUANTIZATION_CHOICES = [
+    "awq",
+    "fp8",
+    "gptq",
+    "marlin",
+    "gptq_marlin",
+    "awq_marlin",
+    "bitsandbytes",
+    "gguf",
+    "modelopt",
+    "modelopt_fp4",
+    "petit_nvfp4",
+    "w8a8_int8",
+    "w8a8_fp8",
+    "moe_wna16",
+    "qoq",
+    "w4afp8",
+    "mxfp4",
+]
+
+ATTENTION_BACKEND_CHOICES = [
+    # Common
+    "triton",
+    "torch_native",
+    # NVIDIA specific
+    "cutlass_mla",
+    "fa3",
+    "flashinfer",
+    "flashmla",
+    "trtllm_mla",
+    "trtllm_mha",
+    "dual_chunk_flash_attn",
+    "hybrid_linear_attn",
+    # AMD specific
+    "aiter",
+    "wave",
+    # Other platforms
+    "intel_amx",
+    "ascend",
+]
+
+DISAGG_TRANSFER_BACKEND_CHOICES = ["mooncake", "nixl", "ascend", "fake"]
+
+GRAMMAR_BACKEND_CHOICES = ["xgrammar", "outlines", "llguidance", "none"]
+
+
+# Allow external code to add more choices
+def add_load_format_choices(choices):
+    LOAD_FORMAT_CHOICES.extend(choices)
+
+
+def add_quantization_method_choices(choices):
+    QUANTIZATION_CHOICES.extend(choices)
+
+
+def add_attention_backend_choices(choices):
+    ATTENTION_BACKEND_CHOICES.extend(choices)
+
+
+def add_disagg_transfer_backend_choices(choices):
+    DISAGG_TRANSFER_BACKEND_CHOICES.extend(choices)
+
+
+def add_grammar_backend_choices(choices):
+    GRAMMAR_BACKEND_CHOICES.extend(choices)
+
+
 @dataclasses.dataclass
 class ServerArgs:
     # Model and tokenizer
     model_path: str
     tokenizer_path: Optional[str] = None
     tokenizer_mode: str = "auto"
+    tokenizer_worker_num: int = 1
     skip_tokenizer_init: bool = False
     load_format: str = "auto"
     model_loader_extra_config: str = "{}"
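
Note on the new *_CHOICES constants: they replace the literal choices=[...] lists that were previously duplicated inside the add_argument calls, and the add_*_choices helpers give out-of-tree integrations a hook to extend them. Since the helpers mutate the module-level lists in place, registration should happen before arguments are parsed. A minimal sketch (the backend name is hypothetical, purely for illustration):

    # Register an out-of-tree attention backend so argparse accepts it
    # for --attention-backend. "my_plugin_backend" is a made-up name.
    from sglang.srt.server_args import (
        ATTENTION_BACKEND_CHOICES,
        add_attention_backend_choices,
    )

    add_attention_backend_choices(["my_plugin_backend"])
    assert "my_plugin_backend" in ATTENTION_BACKEND_CHOICES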
@@ -119,6 +203,8 @@ class ServerArgs:
     bucket_inter_token_latency: Optional[List[float]] = None
     bucket_e2e_request_latency: Optional[List[float]] = None
     collect_tokens_histogram: bool = False
+    prompt_tokens_buckets: Optional[List[str]] = None
+    generation_tokens_buckets: Optional[List[str]] = None
     decode_log_interval: int = 40
     enable_request_time_stats_logging: bool = False
     kv_events_config: Optional[str] = None
@@ -139,6 +225,8 @@ class ServerArgs:
     # Data parallelism
     dp_size: int = 1
     load_balance_method: str = "round_robin"
+    # FIXME: remove this after dp rank scheduling is fully supported with PD-Disaggregation
+    prefill_round_robin_balance: bool = False

     # Multi-node distributed serving
     dist_init_addr: Optional[str] = None
@@ -171,12 +259,14 @@ class ServerArgs:
     # Speculative decoding
     speculative_algorithm: Optional[str] = None
     speculative_draft_model_path: Optional[str] = None
+    speculative_draft_model_revision: Optional[str] = None
     speculative_num_steps: Optional[int] = None
     speculative_eagle_topk: Optional[int] = None
     speculative_num_draft_tokens: Optional[int] = None
     speculative_accept_threshold_single: float = 1.0
     speculative_accept_threshold_acc: float = 1.0
     speculative_token_map: Optional[str] = None
+    speculative_attention_mode: str = "prefill"

     # Expert parallelism
     ep_size: int = 1
@@ -199,6 +289,7 @@ class ServerArgs:
     eplb_algorithm: str = "auto"
     eplb_rebalance_num_iterations: int = 1000
     eplb_rebalance_layers_per_chunk: Optional[int] = None
+    eplb_min_rebalancing_utilization_threshold: float = 1.0
     expert_distribution_recorder_mode: Optional[
         Literal["stat", "stat_approx", "per_pass", "per_token"]
     ] = None
@@ -211,11 +302,14 @@ class ServerArgs:
     enable_hierarchical_cache: bool = False
     hicache_ratio: float = 2.0
     hicache_size: int = 0
-    hicache_write_policy: str = "write_through_selective"
+    hicache_write_policy: str = "write_through"
     hicache_io_backend: str = "kernel"
     hicache_mem_layout: str = "layer_first"
     hicache_storage_backend: Optional[str] = None
     hicache_storage_prefetch_policy: str = "best_effort"
+    hicache_storage_backend_extra_config: Optional[str] = None
+    # LMCache
+    enable_lmcache: bool = False

     # Double Sparsity
     enable_double_sparsity: bool = False
@@ -271,6 +365,7 @@ class ServerArgs:
     disable_fast_image_processor: bool = False
     enable_return_hidden_states: bool = False
     scheduler_recv_interval: int = 1
+    numa_node: Optional[List[int]] = None

     # Debug tensor dumps
     debug_tensor_dump_output_folder: Optional[str] = None
@@ -287,7 +382,6 @@ class ServerArgs:
     disaggregation_prefill_pp: Optional[int] = 1
     disaggregation_ib_device: Optional[str] = None
     num_reserved_decode_tokens: int = 512  # used for decode kv cache offload in PD
-    pdlb_url: Optional[str] = None

     # For model weight update
     custom_weight_loader: Optional[List[str]] = None
@@ -297,6 +391,10 @@ class ServerArgs:
     enable_pdmux: bool = False
     sm_group_num: int = 3

+    # Mamba cache
+    max_mamba_cache_size: Optional[int] = None
+    mamba_ssm_dtype: str = "float32"
+
     # Deprecated arguments
     enable_ep_moe: bool = False
     enable_deepep_moe: bool = False
@@ -384,9 +482,14 @@ class ServerArgs:
         # B200, MI300. (chunked_prefill_size 16k, cuda_graph_max_bs 512)
         reserved_mem = 32 * 1024

+        # draft model and larger cuda graph buffers
         if self.speculative_algorithm is not None:
-            # draft model and larger cuda graph buffers
-            reserved_mem += 2 * 1024
+            if self.speculative_algorithm == "STANDALONE":
+                # Standalone speculative decoding needs more memory than other speculative
+                # decoding algorithms since the draft model is typically larger.
+                reserved_mem += 6 * 1024
+            else:
+                reserved_mem += 2 * 1024
         if self.enable_dp_attention:
             reserved_mem += 4 * 1024

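The reservation arithmetic above condenses to the following restatement (assuming the values are in MB, as the surrounding GPU-memory comments suggest):

    # Restatement of the reserved-memory branch above (values assumed to be MB).
    def reserved_mem_mb(speculative_algorithm, enable_dp_attention):
        reserved = 32 * 1024  # base reservation for large-memory GPUs
        if speculative_algorithm is not None:
            # STANDALONE drafts with a full model, so it reserves more.
            reserved += 6 * 1024 if speculative_algorithm == "STANDALONE" else 2 * 1024
        if enable_dp_attention:
            reserved += 4 * 1024
        return reserved

    assert reserved_mem_mb("STANDALONE", False) == 38 * 1024
    assert reserved_mem_mb("EAGLE", True) == 38 * 1024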
@@ -528,12 +631,12 @@ class ServerArgs:
         if self.grammar_backend is None:
             self.grammar_backend = "xgrammar"

+        if self.dp_size == 1:
+            self.enable_dp_attention = False
+
         # Data parallelism attention
         if self.enable_dp_attention:
             self.schedule_conservativeness = self.schedule_conservativeness * 0.3
-            assert (
-                self.dp_size > 1
-            ), "Please set a dp-size > 1. You can use 1 < dp-size <= tp-size "
             assert self.tp_size % self.dp_size == 0
             self.chunked_prefill_size = self.chunked_prefill_size // self.dp_size
             logger.warning(
@@ -556,11 +659,13 @@ class ServerArgs:
         ], "The expert parallel size must be 1 or the same as the tensor parallel size"

         if self.moe_runner_backend == "flashinfer_trtllm":
-            if not self.disable_shared_experts_fusion:
-                self.disable_shared_experts_fusion = True
-                logger.warning(
-                    "FlashInfer TRTLLM MoE is enabled. --disable-shared-experts-fusion is automatically set."
-                )
+            assert (
+                self.quantization == "modelopt_fp4" or self.quantization == "fp8"
+            ), "modelopt_fp4 quantization is required for Flashinfer TRTLLM MoE"
+            self.disable_shared_experts_fusion = True
+            logger.warning(
+                "FlashInfer TRTLLM MoE is enabled. --disable-shared-experts-fusion is automatically set."
+            )

         # DeepEP MoE
         if self.moe_a2a_backend == "deepep":
@@ -615,7 +720,12 @@ class ServerArgs:
             # NEXTN shares the same implementation of EAGLE
             self.speculative_algorithm = "EAGLE"

-        if self.speculative_algorithm in ("EAGLE", "EAGLE3"):
+        if self.speculative_algorithm in ("EAGLE", "EAGLE3", "STANDALONE"):
+            if self.speculative_algorithm == "STANDALONE":
+                # TODO: support dp attention for standalone speculative decoding
+                assert (
+                    self.enable_dp_attention is False
+                ), "Currently standalone speculative decoding does not support dp attention."
             if self.max_running_requests is None:
                 self.max_running_requests = 48
             self.disable_overlap_schedule = True
@@ -671,6 +781,15 @@ class ServerArgs:
             )
             self.speculative_num_draft_tokens = self.speculative_num_steps + 1

+        if (
+            self.speculative_eagle_topk > 1
+            and self.page_size > 1
+            and self.attention_backend != "flashinfer"
+        ):
+            raise ValueError(
+                "speculative_eagle_topk > 1 with page_size > 1 is unstable and produces incorrect results for paged attention backends. This combination is only supported for the 'flashinfer' backend."
+            )
+
         # The token generated from the verify step is counted.
         # If sepculative_num_steps >= speculative_num_draft_tokens, the additional tokens will definitely be discarded.
         # assert self.speculative_num_steps < self.speculative_num_draft_tokens
@@ -698,6 +817,13 @@ class ServerArgs:

             self.disable_radix_cache = True
             logger.warning("KV cache is forced as chunk cache for decode server")
+
+            if self.dp_size > 1 and not is_in_ci():
+                assert self.prefill_round_robin_balance, (
+                    "Prefill round robin balance is required when dp size > 1. "
+                    "Please make sure that the prefill instance is launched with `--load-balance-method round_robin`"
+                    " and `--prefill-round-robin-balance` is set for decode server."
+                )
         elif self.disaggregation_mode == "prefill":
             if self.disaggregation_decode_tp is None:
                 self.disaggregation_decode_tp = self.tp_size
@@ -714,6 +840,8 @@ class ServerArgs:
         os.environ["SGLANG_ENABLE_TORCH_COMPILE"] = (
            "1" if self.enable_torch_compile else "0"
        )
+        os.environ["SGLANG_MAMBA_SSM_DTYPE"] = self.mamba_ssm_dtype
+
         # Set env var before grammar backends init
         os.environ["SGLANG_DISABLE_OUTLINES_DISK_CACHE"] = (
             "1" if self.disable_outlines_disk_cache else "0"
@@ -750,6 +878,12 @@ class ServerArgs:
             "tokenizer if available, and 'slow' will "
             "always use the slow tokenizer.",
         )
+        parser.add_argument(
+            "--tokenizer-worker-num",
+            type=int,
+            default=ServerArgs.tokenizer_worker_num,
+            help="The worker num of the tokenizer manager.",
+        )
         parser.add_argument(
             "--skip-tokenizer-init",
             action="store_true",
@@ -759,18 +893,7 @@ class ServerArgs:
             "--load-format",
             type=str,
             default=ServerArgs.load_format,
-            choices=[
-                "auto",
-                "pt",
-                "safetensors",
-                "npcache",
-                "dummy",
-                "sharded_state",
-                "gguf",
-                "bitsandbytes",
-                "layered",
-                "remote",
-            ],
+            choices=LOAD_FORMAT_CHOICES,
             help="The format of the model weights to load. "
             '"auto" will try to load the weights in the safetensors format '
             "and fall back to the pytorch bin format if safetensors format "
@@ -889,25 +1012,7 @@ class ServerArgs:
             "--quantization",
             type=str,
             default=ServerArgs.quantization,
-            choices=[
-                "awq",
-                "fp8",
-                "gptq",
-                "marlin",
-                "gptq_marlin",
-                "awq_marlin",
-                "bitsandbytes",
-                "gguf",
-                "modelopt",
-                "modelopt_fp4",
-                "petit_nvfp4",
-                "w8a8_int8",
-                "w8a8_fp8",
-                "moe_wna16",
-                "qoq",
-                "w4afp8",
-                "mxfp4",
-            ],
+            choices=QUANTIZATION_CHOICES,
             help="The quantization method.",
         )
         parser.add_argument(
@@ -1170,6 +1275,26 @@ class ServerArgs:
             default=ServerArgs.collect_tokens_histogram,
             help="Collect prompt/generation tokens histogram.",
         )
+        bucket_rule = (
+            "Supports 3 rule types: 'default' uses predefined buckets; 'tse <middle> <base> <count>' "
+            "generates two sides exponential distributed buckets (e.g., 'tse 1000 2 8' generates buckets "
+            "[984.0, 992.0, 996.0, 998.0, 1000.0, 1002.0, 1004.0, 1008.0, 1016.0]).); 'customer <value1> "
+            "<value2> ...' uses custom bucket values (e.g., 'customer 10 50 100 500')."
+        )
+        parser.add_argument(
+            "--prompt-tokens-buckets",
+            type=str,
+            nargs="+",
+            default=ServerArgs.prompt_tokens_buckets,
+            help=f"The buckets rule of prompt tokens. {bucket_rule}",
+        )
+        parser.add_argument(
+            "--generation-tokens-buckets",
+            type=str,
+            nargs="+",
+            default=ServerArgs.generation_tokens_buckets,
+            help=f"The buckets rule for generation tokens histogram. {bucket_rule}",
+        )
         parser.add_argument(
             "--gc-warning-threshold-secs",
             type=float,
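Because both new flags use nargs="+", the rule name and its parameters arrive as one list of strings, which is what validate_buckets_rule (added further down) consumes. A self-contained reproduction of that parsing behavior:

    import argparse

    # Minimal reproduction of the new flags' parsing: nargs="+" keeps the
    # rule name and its parameters together as a list of strings.
    parser = argparse.ArgumentParser()
    parser.add_argument("--prompt-tokens-buckets", type=str, nargs="+", default=None)
    args = parser.parse_args(["--prompt-tokens-buckets", "tse", "1000", "2", "8"])
    assert args.prompt_tokens_buckets == ["tse", "1000", "2", "8"]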
@@ -1278,6 +1403,12 @@ class ServerArgs:
                 "minimum_tokens",
             ],
         )
+        parser.add_argument(
+            "--prefill-round-robin-balance",
+            default=ServerArgs.prefill_round_robin_balance,
+            action="store_true",
+            help="Prefill is round robin balanced. This is used to promise decode server can get the correct dp rank.",
+        )

         # Multi-node distributed serving
         parser.add_argument(
@@ -1357,43 +1488,24 @@ class ServerArgs:
         )

         # Kernel backend
-        ATTN_BACKENDS = [
-            # Common
-            "triton",
-            "torch_native",
-            # NVIDIA specific
-            "cutlass_mla",
-            "fa3",
-            "flashinfer",
-            "flashmla",
-            "trtllm_mla",
-            "trtllm_mha",
-            "dual_chunk_flash_attn",
-            # AMD specific
-            "aiter",
-            "wave",
-            # Other platforms
-            "intel_amx",
-            "ascend",
-        ]
         parser.add_argument(
             "--attention-backend",
             type=str,
-            choices=ATTN_BACKENDS,
+            choices=ATTENTION_BACKEND_CHOICES,
             default=ServerArgs.attention_backend,
             help="Choose the kernels for attention layers.",
         )
         parser.add_argument(
             "--prefill-attention-backend",
             type=str,
-            choices=ATTN_BACKENDS,
+            choices=ATTENTION_BACKEND_CHOICES,
             default=ServerArgs.prefill_attention_backend,
             help="Choose the kernels for prefill attention layers (have priority over --attention-backend).",
         )
         parser.add_argument(
             "--decode-attention-backend",
             type=str,
-            choices=ATTN_BACKENDS,
+            choices=ATTENTION_BACKEND_CHOICES,
             default=ServerArgs.decode_attention_backend,
             help="Choose the kernels for decode attention layers (have priority over --attention-backend).",
         )
@@ -1407,7 +1519,7 @@ class ServerArgs:
         parser.add_argument(
             "--grammar-backend",
             type=str,
-            choices=["xgrammar", "outlines", "llguidance", "none"],
+            choices=GRAMMAR_BACKEND_CHOICES,
             default=ServerArgs.grammar_backend,
             help="Choose the backend for grammar-guided decoding.",
         )
@@ -1423,14 +1535,23 @@ class ServerArgs:
         parser.add_argument(
             "--speculative-algorithm",
             type=str,
-            choices=["EAGLE", "EAGLE3", "NEXTN"],
+            choices=["EAGLE", "EAGLE3", "NEXTN", "STANDALONE"],
             help="Speculative algorithm.",
         )
         parser.add_argument(
             "--speculative-draft-model-path",
+            "--speculative-draft-model",
             type=str,
             help="The path of the draft model weights. This can be a local folder or a Hugging Face repo ID.",
         )
+        parser.add_argument(
+            "--speculative-draft-model-revision",
+            type=str,
+            default=None,
+            help="The specific draft model version to use. It can be a branch "
+            "name, a tag name, or a commit id. If unspecified, will use "
+            "the default version.",
+        )
         parser.add_argument(
             "--speculative-num-steps",
             type=int,
@@ -1467,6 +1588,13 @@ class ServerArgs:
             help="The path of the draft model's small vocab table.",
             default=ServerArgs.speculative_token_map,
         )
+        parser.add_argument(
+            "--speculative-attention-mode",
+            type=str,
+            choices=["prefill", "decode"],
+            help="Attention backend for speculative decoding operations (both target verify and draft extend). Can be one of 'prefill' (default) or 'decode'.",
+            default=ServerArgs.speculative_attention_mode,
+        )

         # Expert parallelism
         parser.add_argument(
@@ -1501,7 +1629,7 @@ class ServerArgs:
         parser.add_argument(
             "--flashinfer-mxfp4-moe-precision",
             type=str,
-            choices=["mxfp4", "bf16"],
+            choices=["default", "bf16"],
             default=ServerArgs.flashinfer_mxfp4_moe_precision,
             help="Choose the computation precision of flashinfer mxfp4 moe",
         )
@@ -1558,6 +1686,12 @@ class ServerArgs:
             default=ServerArgs.eplb_rebalance_layers_per_chunk,
             help="Number of layers to rebalance per forward pass.",
         )
+        parser.add_argument(
+            "--eplb-min-rebalancing-utilization-threshold",
+            type=float,
+            default=ServerArgs.eplb_min_rebalancing_utilization_threshold,
+            help="Minimum threshold for GPU average utilization to trigger EPLB rebalancing. Must be in the range [0.0, 1.0].",
+        )
         parser.add_argument(
             "--expert-distribution-recorder-mode",
             type=str,
@@ -1588,6 +1722,21 @@ class ServerArgs:
             help="TP size for MoE dense MLP layers. This flag is useful when, with large TP size, there are errors caused by weights in MLP layers having dimension smaller than the min dimension GEMM supports.",
         )

+        # Mamba Cache
+        parser.add_argument(
+            "--max-mamba-cache-size",
+            type=int,
+            default=ServerArgs.max_mamba_cache_size,
+            help="The maximum size of the mamba cache.",
+        )
+        parser.add_argument(
+            "--mamba-ssm-dtype",
+            type=str,
+            default=ServerArgs.mamba_ssm_dtype,
+            choices=["float32", "bfloat16"],
+            help="The data type of the SSM states in mamba cache.",
+        )
+
         # Hierarchical cache
         parser.add_argument(
             "--enable-hierarchical-cache",
@@ -1641,6 +1790,18 @@ class ServerArgs:
             default=ServerArgs.hicache_storage_prefetch_policy,
             help="Control when prefetching from the storage backend should stop.",
         )
+        parser.add_argument(
+            "--hicache-storage-backend-extra-config",
+            type=str,
+            default=ServerArgs.hicache_storage_backend_extra_config,
+            help="A dictionary in JSON string format containing extra configuration for the storage backend.",
+        )
+        # LMCache
+        parser.add_argument(
+            "--enable-lmcache",
+            action="store_true",
+            help="Using LMCache as an alternative hierarchical cache solution",
+        )

         # Double Sparsity
         parser.add_argument(
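The extra-config flag takes a JSON object serialized as a string. A usage sketch; the keys shown are hypothetical, not documented backend options:

    import json

    # Hypothetical extra-config payload for a storage backend; the keys are
    # illustrative only.
    extra_config = json.dumps({"endpoint": "127.0.0.1:9100", "timeout_s": 5})
    # On the command line this would be passed as:
    #   --hicache-storage-backend-extra-config '{"endpoint": "127.0.0.1:9100", "timeout_s": 5}'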
@@ -1913,6 +2074,12 @@ class ServerArgs:
             default=ServerArgs.scheduler_recv_interval,
             help="The interval to poll requests in scheduler. Can be set to >1 to reduce the overhead of this.",
         )
+        parser.add_argument(
+            "--numa-node",
+            type=int,
+            nargs="+",
+            help="Sets the numa node for the subprocesses. i-th element corresponds to i-th subprocess.",
+        )

         # Debug tensor dumps
         parser.add_argument(
@@ -1951,7 +2118,7 @@ class ServerArgs:
             "--disaggregation-transfer-backend",
             type=str,
             default=ServerArgs.disaggregation_transfer_backend,
-            choices=["mooncake", "nixl", "ascend"],
+            choices=DISAGG_TRANSFER_BACKEND_CHOICES,
             help="The backend for disaggregation transfer. Default is mooncake.",
         )
         parser.add_argument(
@@ -1992,12 +2159,6 @@ class ServerArgs:
             default=ServerArgs.num_reserved_decode_tokens,
             help="Number of decode tokens that will have memory reserved when adding new request to the running batch.",
         )
-        parser.add_argument(
-            "--pdlb-url",
-            type=str,
-            default=None,
-            help="The URL of the PD disaggregation load balancer. If set, the prefill/decode server will register with the load balancer.",
-        )

         # Custom weight loader
         parser.add_argument(
@@ -2126,6 +2287,15 @@ class ServerArgs:
             self.chunked_prefill_size % self.page_size == 0
         ), "chunked_prefill_size must be divisible by page_size"

+        # Check multi tokenizer
+        assert self.tokenizer_worker_num > 0, "Tokenizer worker num must >= 1"
+        self.validate_buckets_rule(
+            "--prompt-tokens-buckets", self.prompt_tokens_buckets
+        )
+        self.validate_buckets_rule(
+            "--generation-tokens-buckets", self.generation_tokens_buckets
+        )
+
     def check_lora_server_args(self):
         assert self.max_loras_per_batch > 0, "max_loras_per_batch must be positive"

@@ -2217,6 +2387,54 @@ class ServerArgs:
             f"decode_tp={decode_tp}, prefill_tp={prefill_tp}"
         )

+    def validate_buckets_rule(self, arg_name: str, buckets_rule: List[str]):
+        if not buckets_rule:
+            return
+
+        assert len(buckets_rule) > 0, f"{arg_name} cannot be empty list"
+        rule = buckets_rule[0]
+        assert rule in [
+            "tse",
+            "default",
+            "customer",
+        ], f"Unsupported {arg_name} rule type: '{rule}'. Must be one of: 'tse', 'default', 'customer'"
+
+        if rule == "tse":
+            assert (
+                len(buckets_rule) == 4
+            ), f"{arg_name} TSE rule requires exactly 4 parameters: ['tse', middle, base, count], got {len(buckets_rule)}"
+            try:
+                middle = float(buckets_rule[1])
+                base = float(buckets_rule[2])
+                count = int(buckets_rule[3])
+            except (ValueError, IndexError):
+                assert (
+                    False
+                ), f"{arg_name} TSE rule parameters must be: ['tse', <float:middle>, <float:base>, <int:count>]"
+            assert base > 1, f"{arg_name} TSE base must be larger than 1, got: {base}"
+            assert count > 0, f"{arg_name} TSE count must be positive, got: {count}"
+            assert middle > 0, f"{arg_name} TSE middle must be positive, got: {middle}"
+
+        elif rule == "default":
+            assert (
+                len(buckets_rule) == 1
+            ), f"{arg_name} default rule should only have one parameter: ['default'], got {len(buckets_rule)}"
+
+        elif rule == "customer":
+            assert (
+                len(buckets_rule) >= 2
+            ), f"{arg_name} customer rule requires at least one bucket value: ['customer', value1, ...]"
+            try:
+                bucket_values = [float(x) for x in buckets_rule[1:]]
+            except ValueError:
+                assert False, f"{arg_name} customer rule bucket values must be numeric"
+            assert len(set(bucket_values)) == len(
+                bucket_values
+            ), f"{arg_name} customer rule bucket values should not contain duplicates"
+            assert all(
+                val >= 0 for val in bucket_values
+            ), f"{arg_name} customer rule bucket values should be non-negative"
+
     def model_specific_adjustments(self):
         hf_config = self.get_hf_config()
         model_arch = hf_config.architectures[0]
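validate_buckets_rule only checks types and ranges; the bucket construction itself lives elsewhere (plausibly the new sglang/srt/metrics/utils.py in the file list above). The 'tse 1000 2 8' example in the help text is consistent with the following reconstruction, offered as a sketch rather than the shipped implementation:

    # Reconstructed two-sided-exponential buckets: count // 2 offsets
    # (base**1 .. base**(count // 2)) mirrored around `middle`, plus the
    # middle itself. Sketch only; not the code shipped in sglang.
    def tse_buckets(middle: float, base: float, count: int) -> list[float]:
        offsets = [base**i for i in range(1, count // 2 + 1)]
        return sorted(
            [middle - o for o in offsets] + [middle] + [middle + o for o in offsets]
        )

    # Reproduces the help-text example for 'tse 1000 2 8'.
    assert tse_buckets(1000.0, 2.0, 8) == [
        984.0, 992.0, 996.0, 998.0, 1000.0, 1002.0, 1004.0, 1008.0, 1016.0,
    ]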
@@ -2271,11 +2489,13 @@ class ServerArgs:
         if is_mxfp4_quant_format:
             # use bf16 for mxfp4 triton kernels
             self.dtype = "bfloat16"
+
         elif "Llama4" in model_arch:
             assert self.attention_backend in {
                 "fa3",
                 "aiter",
-            }, "fa3 or aiter is required for Llama4 model"
+                "triton",
+            }, "fa3, aiter, or triton is required for Llama4 model"
         elif model_arch in [
             "Gemma2ForCausalLM",
             "Gemma3ForCausalLM",
@@ -2368,6 +2588,9 @@ class PortArgs:
     # The ipc filename for Scheduler to send metrics
     metrics_ipc_name: str

+    # The ipc filename for Tokenizer and worker tokenizer
+    tokenizer_worker_ipc_name: Optional[str]
+
     @staticmethod
     def init_new(server_args, dp_rank: Optional[int] = None) -> "PortArgs":
         if server_args.nccl_port is None:
@@ -2391,6 +2614,7 @@ class PortArgs:
                 nccl_port=nccl_port,
                 rpc_ipc_name=f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}",
                 metrics_ipc_name=f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}",
+                tokenizer_worker_ipc_name=None,
             )
         else:
             # DP attention. Use TCP + port to handle both single-node and multi-node.
@@ -2424,6 +2648,7 @@ class PortArgs:
                 nccl_port=nccl_port,
                 rpc_ipc_name=f"tcp://{dist_init_host}:{rpc_port}",
                 metrics_ipc_name=f"tcp://{dist_init_host}:{metrics_ipc_name}",
+                tokenizer_worker_ipc_name=None,
             )


@@ -2469,7 +2694,9 @@ def auto_choose_speculative_params(self: ServerArgs):
     """
     hf_config = self.get_hf_config()
     arch = hf_config.architectures[0]
-
+    if self.speculative_algorithm == "STANDALONE":
+        # The default value for standalone speculative decoding
+        return (3, 1, 4)
     if arch in ["LlamaForCausalLM"]:
         # The default value for llama
         return (5, 4, 8)
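
The 3-tuple returned here feeds the speculative fields defined earlier in ServerArgs; assuming the (steps, topk, draft_tokens) order implied by those field names (an inference from this diff, not stated in it), the STANDALONE default reads as:

    # Assumed mapping of the STANDALONE default (3, 1, 4) onto ServerArgs fields.
    speculative_num_steps = 3         # draft autoregressive steps per round
    speculative_eagle_topk = 1        # draft branching factor
    speculative_num_draft_tokens = 4  # tokens per verify pass (= steps + 1 here)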