sglang 0.5.1.post3__py3-none-any.whl → 0.5.2__py3-none-any.whl

This diff compares the contents of two package versions as publicly released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in that registry.
Files changed (245)
  1. sglang/bench_one_batch.py +3 -0
  2. sglang/bench_one_batch_server.py +10 -1
  3. sglang/bench_serving.py +251 -26
  4. sglang/lang/interpreter.py +1 -1
  5. sglang/srt/configs/__init__.py +4 -0
  6. sglang/srt/configs/internvl.py +6 -0
  7. sglang/srt/configs/longcat_flash.py +104 -0
  8. sglang/srt/configs/model_config.py +37 -7
  9. sglang/srt/configs/qwen3_next.py +326 -0
  10. sglang/srt/connector/__init__.py +1 -1
  11. sglang/srt/connector/base_connector.py +1 -2
  12. sglang/srt/connector/redis.py +2 -2
  13. sglang/srt/connector/serde/__init__.py +1 -1
  14. sglang/srt/connector/serde/safe_serde.py +4 -3
  15. sglang/srt/custom_op.py +11 -1
  16. sglang/srt/debug_utils/dump_comparator.py +81 -44
  17. sglang/srt/debug_utils/dump_loader.py +97 -0
  18. sglang/srt/debug_utils/dumper.py +11 -3
  19. sglang/srt/debug_utils/text_comparator.py +73 -11
  20. sglang/srt/disaggregation/ascend/conn.py +75 -0
  21. sglang/srt/disaggregation/base/conn.py +1 -1
  22. sglang/srt/disaggregation/common/conn.py +15 -12
  23. sglang/srt/disaggregation/decode.py +6 -4
  24. sglang/srt/disaggregation/fake/conn.py +1 -1
  25. sglang/srt/disaggregation/mini_lb.py +6 -420
  26. sglang/srt/disaggregation/mooncake/conn.py +18 -10
  27. sglang/srt/disaggregation/nixl/conn.py +180 -16
  28. sglang/srt/disaggregation/prefill.py +6 -4
  29. sglang/srt/disaggregation/utils.py +5 -50
  30. sglang/srt/distributed/parallel_state.py +94 -58
  31. sglang/srt/entrypoints/engine.py +34 -14
  32. sglang/srt/entrypoints/http_server.py +172 -47
  33. sglang/srt/entrypoints/openai/protocol.py +63 -3
  34. sglang/srt/entrypoints/openai/serving_base.py +6 -2
  35. sglang/srt/entrypoints/openai/serving_chat.py +34 -19
  36. sglang/srt/entrypoints/openai/serving_completions.py +10 -4
  37. sglang/srt/entrypoints/openai/serving_embedding.py +8 -4
  38. sglang/srt/entrypoints/openai/serving_responses.py +7 -4
  39. sglang/srt/eplb/eplb_manager.py +28 -4
  40. sglang/srt/eplb/expert_distribution.py +55 -15
  41. sglang/srt/eplb/expert_location.py +8 -3
  42. sglang/srt/eplb/expert_location_updater.py +1 -1
  43. sglang/srt/function_call/ebnf_composer.py +11 -9
  44. sglang/srt/function_call/glm4_moe_detector.py +1 -1
  45. sglang/srt/function_call/gpt_oss_detector.py +1 -1
  46. sglang/srt/function_call/qwen3_coder_detector.py +1 -1
  47. sglang/srt/hf_transformers_utils.py +12 -0
  48. sglang/srt/layers/activation.py +44 -9
  49. sglang/srt/layers/attention/aiter_backend.py +93 -68
  50. sglang/srt/layers/attention/ascend_backend.py +250 -112
  51. sglang/srt/layers/attention/fla/chunk.py +242 -0
  52. sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
  53. sglang/srt/layers/attention/fla/chunk_o.py +178 -0
  54. sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
  55. sglang/srt/layers/attention/fla/cumsum.py +300 -0
  56. sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
  57. sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
  58. sglang/srt/layers/attention/fla/index.py +37 -0
  59. sglang/srt/layers/attention/fla/l2norm.py +150 -0
  60. sglang/srt/layers/attention/fla/layernorm_gated.py +326 -0
  61. sglang/srt/layers/attention/fla/op.py +66 -0
  62. sglang/srt/layers/attention/fla/solve_tril.py +465 -0
  63. sglang/srt/layers/attention/fla/utils.py +331 -0
  64. sglang/srt/layers/attention/fla/wy_fast.py +158 -0
  65. sglang/srt/layers/attention/flashinfer_backend.py +6 -4
  66. sglang/srt/layers/attention/flashinfer_mla_backend.py +16 -12
  67. sglang/srt/layers/attention/hybrid_attn_backend.py +47 -8
  68. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +584 -0
  69. sglang/srt/layers/attention/intel_amx_backend.py +3 -0
  70. sglang/srt/layers/attention/mamba/causal_conv1d.py +128 -0
  71. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +1052 -0
  72. sglang/srt/layers/attention/mamba/mamba.py +64 -0
  73. sglang/srt/layers/attention/torch_native_backend.py +12 -6
  74. sglang/srt/layers/attention/trtllm_mla_backend.py +126 -36
  75. sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
  76. sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
  77. sglang/srt/layers/communicator.py +45 -7
  78. sglang/srt/layers/layernorm.py +54 -12
  79. sglang/srt/layers/logits_processor.py +10 -3
  80. sglang/srt/layers/moe/__init__.py +2 -1
  81. sglang/srt/layers/moe/cutlass_w4a8_moe.py +4 -12
  82. sglang/srt/layers/moe/ep_moe/kernels.py +74 -0
  83. sglang/srt/layers/moe/ep_moe/layer.py +110 -49
  84. sglang/srt/layers/moe/fused_moe_native.py +5 -3
  85. sglang/srt/layers/moe/fused_moe_triton/__init__.py +5 -3
  86. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  87. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=352,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +146 -0
  88. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/{E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json } +29 -29
  89. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  90. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
  91. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
  92. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  93. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +9 -1049
  94. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +212 -0
  95. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +799 -0
  96. sglang/srt/layers/moe/fused_moe_triton/layer.py +56 -45
  97. sglang/srt/layers/moe/fused_moe_triton/moe_align_block_size.py +87 -0
  98. sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
  99. sglang/srt/layers/moe/moe_runner/base.py +274 -1
  100. sglang/srt/layers/moe/moe_runner/runner.py +80 -0
  101. sglang/srt/layers/moe/moe_runner/triton.py +448 -0
  102. sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
  103. sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
  104. sglang/srt/layers/moe/token_dispatcher/deepep.py +41 -38
  105. sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
  106. sglang/srt/layers/moe/topk.py +43 -12
  107. sglang/srt/layers/moe/utils.py +6 -5
  108. sglang/srt/layers/quantization/awq.py +19 -7
  109. sglang/srt/layers/quantization/base_config.py +11 -6
  110. sglang/srt/layers/quantization/blockwise_int8.py +38 -27
  111. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
  112. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
  113. sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py +9 -1
  114. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +0 -3
  115. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
  116. sglang/srt/layers/quantization/fp8.py +76 -47
  117. sglang/srt/layers/quantization/fp8_utils.py +43 -29
  118. sglang/srt/layers/quantization/gptq.py +25 -17
  119. sglang/srt/layers/quantization/modelopt_quant.py +107 -40
  120. sglang/srt/layers/quantization/moe_wna16.py +21 -18
  121. sglang/srt/layers/quantization/mxfp4.py +77 -45
  122. sglang/srt/layers/quantization/quark/quark_moe.py +32 -27
  123. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +49 -30
  124. sglang/srt/layers/quantization/quark/utils.py +97 -0
  125. sglang/srt/layers/quantization/rocm_mxfp4_utils.py +13 -0
  126. sglang/srt/layers/quantization/unquant.py +135 -47
  127. sglang/srt/layers/quantization/utils.py +13 -0
  128. sglang/srt/layers/quantization/w4afp8.py +60 -42
  129. sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
  130. sglang/srt/layers/quantization/w8a8_int8.py +83 -41
  131. sglang/srt/layers/rocm_linear_utils.py +44 -0
  132. sglang/srt/layers/rotary_embedding.py +28 -19
  133. sglang/srt/layers/sampler.py +29 -5
  134. sglang/srt/lora/backend/base_backend.py +50 -8
  135. sglang/srt/lora/backend/triton_backend.py +90 -2
  136. sglang/srt/lora/layers.py +32 -0
  137. sglang/srt/lora/lora.py +4 -1
  138. sglang/srt/lora/lora_manager.py +35 -112
  139. sglang/srt/lora/mem_pool.py +24 -10
  140. sglang/srt/lora/utils.py +18 -9
  141. sglang/srt/managers/cache_controller.py +242 -278
  142. sglang/srt/managers/data_parallel_controller.py +30 -15
  143. sglang/srt/managers/detokenizer_manager.py +13 -2
  144. sglang/srt/managers/disagg_service.py +46 -0
  145. sglang/srt/managers/io_struct.py +160 -11
  146. sglang/srt/managers/mm_utils.py +6 -1
  147. sglang/srt/managers/multi_tokenizer_mixin.py +579 -0
  148. sglang/srt/managers/schedule_batch.py +27 -44
  149. sglang/srt/managers/schedule_policy.py +4 -3
  150. sglang/srt/managers/scheduler.py +90 -115
  151. sglang/srt/managers/scheduler_metrics_mixin.py +114 -8
  152. sglang/srt/managers/scheduler_output_processor_mixin.py +29 -19
  153. sglang/srt/managers/scheduler_profiler_mixin.py +1 -1
  154. sglang/srt/managers/scheduler_update_weights_mixin.py +8 -1
  155. sglang/srt/managers/template_manager.py +3 -3
  156. sglang/srt/managers/tokenizer_communicator_mixin.py +491 -0
  157. sglang/srt/managers/tokenizer_manager.py +41 -477
  158. sglang/srt/managers/tp_worker.py +16 -4
  159. sglang/srt/managers/tp_worker_overlap_thread.py +8 -10
  160. sglang/srt/mem_cache/allocator.py +1 -1
  161. sglang/srt/mem_cache/chunk_cache.py +1 -1
  162. sglang/srt/mem_cache/hicache_storage.py +24 -22
  163. sglang/srt/mem_cache/hiradix_cache.py +184 -101
  164. sglang/srt/mem_cache/lora_radix_cache.py +1 -1
  165. sglang/srt/mem_cache/memory_pool.py +324 -41
  166. sglang/srt/mem_cache/memory_pool_host.py +25 -18
  167. sglang/srt/mem_cache/radix_cache.py +5 -6
  168. sglang/srt/mem_cache/radix_cache_cpp.py +1 -1
  169. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
  170. sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
  171. sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +61 -34
  172. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +149 -12
  173. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +280 -0
  174. sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
  175. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +74 -19
  176. sglang/srt/mem_cache/storage/mooncake_store/test_mooncake_store.py +161 -0
  177. sglang/srt/mem_cache/swa_radix_cache.py +1 -3
  178. sglang/srt/metrics/collector.py +484 -63
  179. sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
  180. sglang/srt/metrics/utils.py +48 -0
  181. sglang/srt/model_executor/cpu_graph_runner.py +640 -0
  182. sglang/srt/model_executor/cuda_graph_runner.py +13 -5
  183. sglang/srt/model_executor/forward_batch_info.py +72 -18
  184. sglang/srt/model_executor/model_runner.py +189 -31
  185. sglang/srt/model_loader/__init__.py +9 -3
  186. sglang/srt/model_loader/loader.py +33 -28
  187. sglang/srt/model_loader/utils.py +12 -0
  188. sglang/srt/model_loader/weight_utils.py +2 -1
  189. sglang/srt/models/deepseek_v2.py +311 -50
  190. sglang/srt/models/gemma3n_mm.py +1 -1
  191. sglang/srt/models/glm4_moe.py +10 -1
  192. sglang/srt/models/glm4v.py +4 -2
  193. sglang/srt/models/gpt_oss.py +5 -18
  194. sglang/srt/models/internvl.py +28 -0
  195. sglang/srt/models/llama4.py +9 -0
  196. sglang/srt/models/llama_eagle3.py +17 -0
  197. sglang/srt/models/longcat_flash.py +1026 -0
  198. sglang/srt/models/longcat_flash_nextn.py +699 -0
  199. sglang/srt/models/minicpmv.py +165 -3
  200. sglang/srt/models/mllama4.py +25 -0
  201. sglang/srt/models/opt.py +637 -0
  202. sglang/srt/models/qwen2.py +33 -3
  203. sglang/srt/models/qwen2_5_vl.py +90 -42
  204. sglang/srt/models/qwen2_moe.py +79 -14
  205. sglang/srt/models/qwen3.py +8 -2
  206. sglang/srt/models/qwen3_moe.py +39 -8
  207. sglang/srt/models/qwen3_next.py +1039 -0
  208. sglang/srt/models/qwen3_next_mtp.py +109 -0
  209. sglang/srt/models/torch_native_llama.py +1 -1
  210. sglang/srt/models/transformers.py +1 -1
  211. sglang/srt/multimodal/processors/base_processor.py +4 -2
  212. sglang/srt/multimodal/processors/glm4v.py +9 -9
  213. sglang/srt/multimodal/processors/internvl.py +141 -129
  214. sglang/srt/{reasoning_parser.py → parser/reasoning_parser.py} +1 -1
  215. sglang/srt/sampling/penaltylib/orchestrator.py +14 -2
  216. sglang/srt/sampling/sampling_batch_info.py +18 -15
  217. sglang/srt/server_args.py +297 -79
  218. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +5 -0
  219. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +10 -1
  220. sglang/srt/speculative/eagle_worker.py +216 -120
  221. sglang/srt/speculative/spec_info.py +5 -0
  222. sglang/srt/speculative/standalone_worker.py +109 -0
  223. sglang/srt/utils.py +37 -2
  224. sglang/srt/weight_sync/utils.py +1 -1
  225. sglang/test/attention/test_trtllm_mla_backend.py +181 -8
  226. sglang/test/few_shot_gsm8k.py +1 -0
  227. sglang/test/runners.py +4 -0
  228. sglang/test/test_cutlass_moe.py +24 -6
  229. sglang/test/test_cutlass_w4a8_moe.py +24 -9
  230. sglang/test/test_disaggregation_utils.py +66 -0
  231. sglang/test/test_utils.py +25 -1
  232. sglang/utils.py +5 -0
  233. sglang/version.py +1 -1
  234. {sglang-0.5.1.post3.dist-info → sglang-0.5.2.dist-info}/METADATA +11 -9
  235. {sglang-0.5.1.post3.dist-info → sglang-0.5.2.dist-info}/RECORD +243 -194
  236. sglang/srt/disaggregation/launch_lb.py +0 -131
  237. sglang/srt/mem_cache/storage/mooncake_store/unit_test.py +0 -40
  238. /sglang/srt/{model_parallel.py → layers/model_parallel.py} +0 -0
  239. /sglang/srt/{code_completion_parser.py → parser/code_completion_parser.py} +0 -0
  240. /sglang/srt/{conversation.py → parser/conversation.py} +0 -0
  241. /sglang/srt/{harmony_parser.py → parser/harmony_parser.py} +0 -0
  242. /sglang/srt/{jinja_template_utils.py → parser/jinja_template_utils.py} +0 -0
  243. {sglang-0.5.1.post3.dist-info → sglang-0.5.2.dist-info}/WHEEL +0 -0
  244. {sglang-0.5.1.post3.dist-info → sglang-0.5.2.dist-info}/licenses/LICENSE +0 -0
  245. {sglang-0.5.1.post3.dist-info → sglang-0.5.2.dist-info}/top_level.txt +0 -0
sglang/srt/server_args.py CHANGED
@@ -26,7 +26,7 @@ from typing import List, Literal, Optional, Union
 from sglang.srt.function_call.function_call_parser import FunctionCallParser
 from sglang.srt.hf_transformers_utils import check_gguf_file, get_config
 from sglang.srt.lora.lora_registry import LoRARef
-from sglang.srt.reasoning_parser import ReasoningParser
+from sglang.srt.parser.reasoning_parser import ReasoningParser
 from sglang.srt.utils import (
     LORA_TARGET_ALL_MODULES,
     SUPPORTED_LORA_TARGET_MODULES,
@@ -44,16 +44,99 @@ from sglang.srt.utils import (
     is_valid_ipv6_address,
     nullable_str,
 )
+from sglang.utils import is_in_ci
 
 logger = logging.getLogger(__name__)
 
 
+# Define constants
+LOAD_FORMAT_CHOICES = [
+    "auto",
+    "pt",
+    "safetensors",
+    "npcache",
+    "dummy",
+    "sharded_state",
+    "gguf",
+    "bitsandbytes",
+    "layered",
+    "remote",
+]
+
+QUANTIZATION_CHOICES = [
+    "awq",
+    "fp8",
+    "gptq",
+    "marlin",
+    "gptq_marlin",
+    "awq_marlin",
+    "bitsandbytes",
+    "gguf",
+    "modelopt",
+    "modelopt_fp4",
+    "petit_nvfp4",
+    "w8a8_int8",
+    "w8a8_fp8",
+    "moe_wna16",
+    "qoq",
+    "w4afp8",
+    "mxfp4",
+]
+
+ATTENTION_BACKEND_CHOICES = [
+    # Common
+    "triton",
+    "torch_native",
+    # NVIDIA specific
+    "cutlass_mla",
+    "fa3",
+    "flashinfer",
+    "flashmla",
+    "trtllm_mla",
+    "trtllm_mha",
+    "dual_chunk_flash_attn",
+    "hybrid_linear_attn",
+    # AMD specific
+    "aiter",
+    "wave",
+    # Other platforms
+    "intel_amx",
+    "ascend",
+]
+
+DISAGG_TRANSFER_BACKEND_CHOICES = ["mooncake", "nixl", "ascend", "fake"]
+
+GRAMMAR_BACKEND_CHOICES = ["xgrammar", "outlines", "llguidance", "none"]
+
+
+# Allow external code to add more choices
+def add_load_format_choices(choices):
+    LOAD_FORMAT_CHOICES.extend(choices)
+
+
+def add_quantization_method_choices(choices):
+    QUANTIZATION_CHOICES.extend(choices)
+
+
+def add_attention_backend_choices(choices):
+    ATTENTION_BACKEND_CHOICES.extend(choices)
+
+
+def add_disagg_transfer_backend_choices(choices):
+    DISAGG_TRANSFER_BACKEND_CHOICES.extend(choices)
+
+
+def add_grammar_backend_choices(choices):
+    GRAMMAR_BACKEND_CHOICES.extend(choices)
+
+
 @dataclasses.dataclass
 class ServerArgs:
     # Model and tokenizer
     model_path: str
     tokenizer_path: Optional[str] = None
     tokenizer_mode: str = "auto"
+    tokenizer_worker_num: int = 1
     skip_tokenizer_init: bool = False
     load_format: str = "auto"
     model_loader_extra_config: str = "{}"
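
The choice lists that previously lived inline in the argparse setup are now module-level constants paired with add_*_choices() hooks, so external integrations can register extra CLI values before the parser is built. A minimal sketch of how a plugin might use this (the backend name here is hypothetical):

    # Hypothetical plugin: accept an extra --attention-backend value.
    # This must run before ServerArgs.add_cli_args() builds the parser,
    # since the choices lists are read at parser-construction time.
    from sglang.srt import server_args

    server_args.add_attention_backend_choices(["my_custom_backend"])
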
@@ -120,6 +203,8 @@ class ServerArgs:
     bucket_inter_token_latency: Optional[List[float]] = None
     bucket_e2e_request_latency: Optional[List[float]] = None
     collect_tokens_histogram: bool = False
+    prompt_tokens_buckets: Optional[List[str]] = None
+    generation_tokens_buckets: Optional[List[str]] = None
     decode_log_interval: int = 40
     enable_request_time_stats_logging: bool = False
     kv_events_config: Optional[str] = None
@@ -140,6 +225,8 @@ class ServerArgs:
     # Data parallelism
     dp_size: int = 1
     load_balance_method: str = "round_robin"
+    # FIXME: remove this after dp rank scheduling is fully supported with PD-Disaggregation
+    prefill_round_robin_balance: bool = False
 
     # Multi-node distributed serving
     dist_init_addr: Optional[str] = None
@@ -172,12 +259,14 @@ class ServerArgs:
     # Speculative decoding
     speculative_algorithm: Optional[str] = None
     speculative_draft_model_path: Optional[str] = None
+    speculative_draft_model_revision: Optional[str] = None
     speculative_num_steps: Optional[int] = None
     speculative_eagle_topk: Optional[int] = None
     speculative_num_draft_tokens: Optional[int] = None
     speculative_accept_threshold_single: float = 1.0
     speculative_accept_threshold_acc: float = 1.0
     speculative_token_map: Optional[str] = None
+    speculative_attention_mode: str = "prefill"
 
     # Expert parallelism
     ep_size: int = 1
@@ -200,6 +289,7 @@ class ServerArgs:
     eplb_algorithm: str = "auto"
     eplb_rebalance_num_iterations: int = 1000
     eplb_rebalance_layers_per_chunk: Optional[int] = None
+    eplb_min_rebalancing_utilization_threshold: float = 1.0
     expert_distribution_recorder_mode: Optional[
         Literal["stat", "stat_approx", "per_pass", "per_token"]
     ] = None
@@ -212,12 +302,14 @@ class ServerArgs:
     enable_hierarchical_cache: bool = False
     hicache_ratio: float = 2.0
     hicache_size: int = 0
-    hicache_write_policy: str = "write_through_selective"
+    hicache_write_policy: str = "write_through"
     hicache_io_backend: str = "kernel"
     hicache_mem_layout: str = "layer_first"
     hicache_storage_backend: Optional[str] = None
     hicache_storage_prefetch_policy: str = "best_effort"
     hicache_storage_backend_extra_config: Optional[str] = None
+    # LMCache
+    enable_lmcache: bool = False
 
     # Double Sparsity
     enable_double_sparsity: bool = False
@@ -273,6 +365,7 @@ class ServerArgs:
     disable_fast_image_processor: bool = False
     enable_return_hidden_states: bool = False
     scheduler_recv_interval: int = 1
+    numa_node: Optional[List[int]] = None
 
     # Debug tensor dumps
     debug_tensor_dump_output_folder: Optional[str] = None
@@ -289,7 +382,6 @@ class ServerArgs:
     disaggregation_prefill_pp: Optional[int] = 1
     disaggregation_ib_device: Optional[str] = None
     num_reserved_decode_tokens: int = 512  # used for decode kv cache offload in PD
-    pdlb_url: Optional[str] = None
 
     # For model weight update
     custom_weight_loader: Optional[List[str]] = None
@@ -299,6 +391,10 @@ class ServerArgs:
     enable_pdmux: bool = False
     sm_group_num: int = 3
 
+    # Mamba cache
+    max_mamba_cache_size: Optional[int] = None
+    mamba_ssm_dtype: str = "float32"
+
     # Deprecated arguments
     enable_ep_moe: bool = False
     enable_deepep_moe: bool = False
@@ -386,9 +482,14 @@ class ServerArgs:
         # B200, MI300. (chunked_prefill_size 16k, cuda_graph_max_bs 512)
         reserved_mem = 32 * 1024
 
+        # draft model and larger cuda graph buffers
         if self.speculative_algorithm is not None:
-            # draft model and larger cuda graph buffers
-            reserved_mem += 2 * 1024
+            if self.speculative_algorithm == "STANDALONE":
+                # Standalone speculative decoding needs more memory than other speculative
+                # decoding algorithms since the draft model is typically larger.
+                reserved_mem += 6 * 1024
+            else:
+                reserved_mem += 2 * 1024
         if self.enable_dp_attention:
             reserved_mem += 4 * 1024
 
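As a worked example of the new branch, assuming the constants are MiB (consistent with the 32 * 1024 base), a STANDALONE run with DP attention enabled reserves:

    reserved_mem = 32 * 1024   # 32 GiB base (B200 / MI300 class)
    reserved_mem += 6 * 1024   # STANDALONE draft model, vs. +2 GiB for EAGLE-style draft
    reserved_mem += 4 * 1024   # DP attention buffers
    assert reserved_mem == 42 * 1024
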
@@ -530,12 +631,12 @@ class ServerArgs:
         if self.grammar_backend is None:
             self.grammar_backend = "xgrammar"
 
+        if self.dp_size == 1:
+            self.enable_dp_attention = False
+
         # Data parallelism attention
         if self.enable_dp_attention:
             self.schedule_conservativeness = self.schedule_conservativeness * 0.3
-            assert (
-                self.dp_size > 1
-            ), "Please set a dp-size > 1. You can use 1 < dp-size <= tp-size "
             assert self.tp_size % self.dp_size == 0
             self.chunked_prefill_size = self.chunked_prefill_size // self.dp_size
             logger.warning(
@@ -558,11 +659,13 @@ class ServerArgs:
         ], "The expert parallel size must be 1 or the same as the tensor parallel size"
 
         if self.moe_runner_backend == "flashinfer_trtllm":
-            if not self.disable_shared_experts_fusion:
-                self.disable_shared_experts_fusion = True
-                logger.warning(
-                    "FlashInfer TRTLLM MoE is enabled. --disable-shared-experts-fusion is automatically set."
-                )
+            assert (
+                self.quantization == "modelopt_fp4" or self.quantization == "fp8"
+            ), "modelopt_fp4 quantization is required for Flashinfer TRTLLM MoE"
+            self.disable_shared_experts_fusion = True
+            logger.warning(
+                "FlashInfer TRTLLM MoE is enabled. --disable-shared-experts-fusion is automatically set."
+            )
 
         # DeepEP MoE
         if self.moe_a2a_backend == "deepep":
@@ -617,7 +720,12 @@ class ServerArgs:
             # NEXTN shares the same implementation of EAGLE
             self.speculative_algorithm = "EAGLE"
 
-        if self.speculative_algorithm in ("EAGLE", "EAGLE3"):
+        if self.speculative_algorithm in ("EAGLE", "EAGLE3", "STANDALONE"):
+            if self.speculative_algorithm == "STANDALONE":
+                # TODO: support dp attention for standalone speculative decoding
+                assert (
+                    self.enable_dp_attention is False
+                ), "Currently standalone speculative decoding does not support dp attention."
             if self.max_running_requests is None:
                 self.max_running_requests = 48
             self.disable_overlap_schedule = True
@@ -673,6 +781,15 @@ class ServerArgs:
                 )
                 self.speculative_num_draft_tokens = self.speculative_num_steps + 1
 
+            if (
+                self.speculative_eagle_topk > 1
+                and self.page_size > 1
+                and self.attention_backend != "flashinfer"
+            ):
+                raise ValueError(
+                    "speculative_eagle_topk > 1 with page_size > 1 is unstable and produces incorrect results for paged attention backends. This combination is only supported for the 'flashinfer' backend."
+                )
+
             # The token generated from the verify step is counted.
             # If sepculative_num_steps >= speculative_num_draft_tokens, the additional tokens will definitely be discarded.
             # assert self.speculative_num_steps < self.speculative_num_draft_tokens
@@ -700,6 +817,13 @@ class ServerArgs:
 
             self.disable_radix_cache = True
             logger.warning("KV cache is forced as chunk cache for decode server")
+
+            if self.dp_size > 1 and not is_in_ci():
+                assert self.prefill_round_robin_balance, (
+                    "Prefill round robin balance is required when dp size > 1. "
+                    "Please make sure that the prefill instance is launched with `--load-balance-method round_robin`"
+                    " and `--prefill-round-robin-balance` is set for decode server."
+                )
         elif self.disaggregation_mode == "prefill":
             if self.disaggregation_decode_tp is None:
                 self.disaggregation_decode_tp = self.tp_size
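
The new assertion encodes a pairing contract between the two sides of a PD-disaggregation deployment. An illustrative flag pairing for dp_size > 1 (a sketch; the model path and other required flags are omitted):

    # Prefill instance: requests must be balanced round-robin across dp ranks.
    prefill_flags = [
        "--disaggregation-mode", "prefill",
        "--dp-size", "2",
        "--load-balance-method", "round_robin",
    ]

    # Decode instance: declares that prefill is round-robin balanced so it can
    # derive the correct dp rank. Without this, the assertion fires outside CI
    # (is_in_ci() suppresses the check in test environments).
    decode_flags = [
        "--disaggregation-mode", "decode",
        "--dp-size", "2",
        "--prefill-round-robin-balance",
    ]
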
@@ -716,6 +840,8 @@ class ServerArgs:
         os.environ["SGLANG_ENABLE_TORCH_COMPILE"] = (
             "1" if self.enable_torch_compile else "0"
         )
+        os.environ["SGLANG_MAMBA_SSM_DTYPE"] = self.mamba_ssm_dtype
+
         # Set env var before grammar backends init
         os.environ["SGLANG_DISABLE_OUTLINES_DISK_CACHE"] = (
             "1" if self.disable_outlines_disk_cache else "0"
@@ -752,6 +878,12 @@ class ServerArgs:
             "tokenizer if available, and 'slow' will "
             "always use the slow tokenizer.",
         )
+        parser.add_argument(
+            "--tokenizer-worker-num",
+            type=int,
+            default=ServerArgs.tokenizer_worker_num,
+            help="The worker num of the tokenizer manager.",
+        )
         parser.add_argument(
             "--skip-tokenizer-init",
             action="store_true",
@@ -761,18 +893,7 @@ class ServerArgs:
             "--load-format",
             type=str,
             default=ServerArgs.load_format,
-            choices=[
-                "auto",
-                "pt",
-                "safetensors",
-                "npcache",
-                "dummy",
-                "sharded_state",
-                "gguf",
-                "bitsandbytes",
-                "layered",
-                "remote",
-            ],
+            choices=LOAD_FORMAT_CHOICES,
             help="The format of the model weights to load. "
             '"auto" will try to load the weights in the safetensors format '
             "and fall back to the pytorch bin format if safetensors format "
@@ -891,25 +1012,7 @@ class ServerArgs:
             "--quantization",
             type=str,
             default=ServerArgs.quantization,
-            choices=[
-                "awq",
-                "fp8",
-                "gptq",
-                "marlin",
-                "gptq_marlin",
-                "awq_marlin",
-                "bitsandbytes",
-                "gguf",
-                "modelopt",
-                "modelopt_fp4",
-                "petit_nvfp4",
-                "w8a8_int8",
-                "w8a8_fp8",
-                "moe_wna16",
-                "qoq",
-                "w4afp8",
-                "mxfp4",
-            ],
+            choices=QUANTIZATION_CHOICES,
             help="The quantization method.",
         )
         parser.add_argument(
@@ -1172,6 +1275,26 @@ class ServerArgs:
             default=ServerArgs.collect_tokens_histogram,
             help="Collect prompt/generation tokens histogram.",
         )
+        bucket_rule = (
+            "Supports 3 rule types: 'default' uses predefined buckets; 'tse <middle> <base> <count>' "
+            "generates two sides exponential distributed buckets (e.g., 'tse 1000 2 8' generates buckets "
+            "[984.0, 992.0, 996.0, 998.0, 1000.0, 1002.0, 1004.0, 1008.0, 1016.0]).); 'customer <value1> "
+            "<value2> ...' uses custom bucket values (e.g., 'customer 10 50 100 500')."
+        )
+        parser.add_argument(
+            "--prompt-tokens-buckets",
+            type=str,
+            nargs="+",
+            default=ServerArgs.prompt_tokens_buckets,
+            help=f"The buckets rule of prompt tokens. {bucket_rule}",
+        )
+        parser.add_argument(
+            "--generation-tokens-buckets",
+            type=str,
+            nargs="+",
+            default=ServerArgs.generation_tokens_buckets,
+            help=f"The buckets rule for generation tokens histogram. {bucket_rule}",
+        )
         parser.add_argument(
             "--gc-warning-threshold-secs",
             type=float,
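
The bracketed list in the help text follows directly from the 'tse' parameters. The sketch below is reverse-engineered from that single example ('tse 1000 2 8'); the actual generator lives in the metrics code and may differ in details:

    # 'tse <middle> <base> <count>': count/2 exponentially spaced offsets on
    # each side of middle, plus middle itself (count + 1 buckets in total).
    def tse_buckets(middle: float, base: float, count: int) -> list:
        offsets = [base**i for i in range(1, count // 2 + 1)]
        return sorted(
            [middle - o for o in offsets] + [middle] + [middle + o for o in offsets]
        )

    assert tse_buckets(1000.0, 2.0, 8) == [
        984.0, 992.0, 996.0, 998.0, 1000.0, 1002.0, 1004.0, 1008.0, 1016.0,
    ]
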
@@ -1280,6 +1403,12 @@ class ServerArgs:
                 "minimum_tokens",
             ],
         )
+        parser.add_argument(
+            "--prefill-round-robin-balance",
+            default=ServerArgs.prefill_round_robin_balance,
+            action="store_true",
+            help="Prefill is round robin balanced. This is used to promise decode server can get the correct dp rank.",
+        )
 
         # Multi-node distributed serving
         parser.add_argument(
@@ -1359,43 +1488,24 @@ class ServerArgs:
         )
 
         # Kernel backend
-        ATTN_BACKENDS = [
-            # Common
-            "triton",
-            "torch_native",
-            # NVIDIA specific
-            "cutlass_mla",
-            "fa3",
-            "flashinfer",
-            "flashmla",
-            "trtllm_mla",
-            "trtllm_mha",
-            "dual_chunk_flash_attn",
-            # AMD specific
-            "aiter",
-            "wave",
-            # Other platforms
-            "intel_amx",
-            "ascend",
-        ]
         parser.add_argument(
             "--attention-backend",
             type=str,
-            choices=ATTN_BACKENDS,
+            choices=ATTENTION_BACKEND_CHOICES,
             default=ServerArgs.attention_backend,
             help="Choose the kernels for attention layers.",
         )
         parser.add_argument(
             "--prefill-attention-backend",
             type=str,
-            choices=ATTN_BACKENDS,
+            choices=ATTENTION_BACKEND_CHOICES,
            default=ServerArgs.prefill_attention_backend,
             help="Choose the kernels for prefill attention layers (have priority over --attention-backend).",
         )
         parser.add_argument(
             "--decode-attention-backend",
             type=str,
-            choices=ATTN_BACKENDS,
+            choices=ATTENTION_BACKEND_CHOICES,
             default=ServerArgs.decode_attention_backend,
             help="Choose the kernels for decode attention layers (have priority over --attention-backend).",
         )
@@ -1409,7 +1519,7 @@ class ServerArgs:
         parser.add_argument(
             "--grammar-backend",
             type=str,
-            choices=["xgrammar", "outlines", "llguidance", "none"],
+            choices=GRAMMAR_BACKEND_CHOICES,
             default=ServerArgs.grammar_backend,
             help="Choose the backend for grammar-guided decoding.",
         )
@@ -1425,14 +1535,23 @@ class ServerArgs:
         parser.add_argument(
             "--speculative-algorithm",
             type=str,
-            choices=["EAGLE", "EAGLE3", "NEXTN"],
+            choices=["EAGLE", "EAGLE3", "NEXTN", "STANDALONE"],
             help="Speculative algorithm.",
         )
         parser.add_argument(
             "--speculative-draft-model-path",
+            "--speculative-draft-model",
             type=str,
             help="The path of the draft model weights. This can be a local folder or a Hugging Face repo ID.",
         )
+        parser.add_argument(
+            "--speculative-draft-model-revision",
+            type=str,
+            default=None,
+            help="The specific draft model version to use. It can be a branch "
+            "name, a tag name, or a commit id. If unspecified, will use "
+            "the default version.",
+        )
         parser.add_argument(
             "--speculative-num-steps",
             type=int,
@@ -1469,6 +1588,13 @@ class ServerArgs:
             help="The path of the draft model's small vocab table.",
             default=ServerArgs.speculative_token_map,
         )
+        parser.add_argument(
+            "--speculative-attention-mode",
+            type=str,
+            choices=["prefill", "decode"],
+            help="Attention backend for speculative decoding operations (both target verify and draft extend). Can be one of 'prefill' (default) or 'decode'.",
+            default=ServerArgs.speculative_attention_mode,
+        )
 
         # Expert parallelism
         parser.add_argument(
@@ -1503,7 +1629,7 @@ class ServerArgs:
         parser.add_argument(
             "--flashinfer-mxfp4-moe-precision",
             type=str,
-            choices=["mxfp4", "bf16"],
+            choices=["default", "bf16"],
             default=ServerArgs.flashinfer_mxfp4_moe_precision,
             help="Choose the computation precision of flashinfer mxfp4 moe",
         )
@@ -1560,6 +1686,12 @@ class ServerArgs:
             default=ServerArgs.eplb_rebalance_layers_per_chunk,
             help="Number of layers to rebalance per forward pass.",
         )
+        parser.add_argument(
+            "--eplb-min-rebalancing-utilization-threshold",
+            type=float,
+            default=ServerArgs.eplb_min_rebalancing_utilization_threshold,
+            help="Minimum threshold for GPU average utilization to trigger EPLB rebalancing. Must be in the range [0.0, 1.0].",
+        )
         parser.add_argument(
             "--expert-distribution-recorder-mode",
             type=str,
@@ -1590,6 +1722,21 @@ class ServerArgs:
             help="TP size for MoE dense MLP layers. This flag is useful when, with large TP size, there are errors caused by weights in MLP layers having dimension smaller than the min dimension GEMM supports.",
         )
 
+        # Mamba Cache
+        parser.add_argument(
+            "--max-mamba-cache-size",
+            type=int,
+            default=ServerArgs.max_mamba_cache_size,
+            help="The maximum size of the mamba cache.",
+        )
+        parser.add_argument(
+            "--mamba-ssm-dtype",
+            type=str,
+            default=ServerArgs.mamba_ssm_dtype,
+            choices=["float32", "bfloat16"],
+            help="The data type of the SSM states in mamba cache.",
+        )
+
         # Hierarchical cache
         parser.add_argument(
             "--enable-hierarchical-cache",
@@ -1649,6 +1796,12 @@ class ServerArgs:
             default=ServerArgs.hicache_storage_backend_extra_config,
             help="A dictionary in JSON string format containing extra configuration for the storage backend.",
         )
+        # LMCache
+        parser.add_argument(
+            "--enable-lmcache",
+            action="store_true",
+            help="Using LMCache as an alternative hierarchical cache solution",
+        )
 
         # Double Sparsity
         parser.add_argument(
@@ -1921,6 +2074,12 @@ class ServerArgs:
             default=ServerArgs.scheduler_recv_interval,
             help="The interval to poll requests in scheduler. Can be set to >1 to reduce the overhead of this.",
         )
+        parser.add_argument(
+            "--numa-node",
+            type=int,
+            nargs="+",
+            help="Sets the numa node for the subprocesses. i-th element corresponds to i-th subprocess.",
+        )
 
         # Debug tensor dumps
         parser.add_argument(
@@ -1959,7 +2118,7 @@ class ServerArgs:
             "--disaggregation-transfer-backend",
             type=str,
             default=ServerArgs.disaggregation_transfer_backend,
-            choices=["mooncake", "nixl", "ascend"],
+            choices=DISAGG_TRANSFER_BACKEND_CHOICES,
             help="The backend for disaggregation transfer. Default is mooncake.",
         )
         parser.add_argument(
@@ -2000,12 +2159,6 @@ class ServerArgs:
             default=ServerArgs.num_reserved_decode_tokens,
             help="Number of decode tokens that will have memory reserved when adding new request to the running batch.",
         )
-        parser.add_argument(
-            "--pdlb-url",
-            type=str,
-            default=None,
-            help="The URL of the PD disaggregation load balancer. If set, the prefill/decode server will register with the load balancer.",
-        )
 
         # Custom weight loader
         parser.add_argument(
@@ -2134,6 +2287,15 @@ class ServerArgs:
             self.chunked_prefill_size % self.page_size == 0
         ), "chunked_prefill_size must be divisible by page_size"
 
+        # Check multi tokenizer
+        assert self.tokenizer_worker_num > 0, "Tokenizer worker num must >= 1"
+        self.validate_buckets_rule(
+            "--prompt-tokens-buckets", self.prompt_tokens_buckets
+        )
+        self.validate_buckets_rule(
+            "--generation-tokens-buckets", self.generation_tokens_buckets
+        )
+
     def check_lora_server_args(self):
         assert self.max_loras_per_batch > 0, "max_loras_per_batch must be positive"
 
@@ -2225,6 +2387,54 @@ class ServerArgs:
                 f"decode_tp={decode_tp}, prefill_tp={prefill_tp}"
             )
 
+    def validate_buckets_rule(self, arg_name: str, buckets_rule: List[str]):
+        if not buckets_rule:
+            return
+
+        assert len(buckets_rule) > 0, f"{arg_name} cannot be empty list"
+        rule = buckets_rule[0]
+        assert rule in [
+            "tse",
+            "default",
+            "customer",
+        ], f"Unsupported {arg_name} rule type: '{rule}'. Must be one of: 'tse', 'default', 'customer'"
+
+        if rule == "tse":
+            assert (
+                len(buckets_rule) == 4
+            ), f"{arg_name} TSE rule requires exactly 4 parameters: ['tse', middle, base, count], got {len(buckets_rule)}"
+            try:
+                middle = float(buckets_rule[1])
+                base = float(buckets_rule[2])
+                count = int(buckets_rule[3])
+            except (ValueError, IndexError):
+                assert (
+                    False
+                ), f"{arg_name} TSE rule parameters must be: ['tse', <float:middle>, <float:base>, <int:count>]"
+            assert base > 1, f"{arg_name} TSE base must be larger than 1, got: {base}"
+            assert count > 0, f"{arg_name} TSE count must be positive, got: {count}"
+            assert middle > 0, f"{arg_name} TSE middle must be positive, got: {middle}"
+
+        elif rule == "default":
+            assert (
+                len(buckets_rule) == 1
+            ), f"{arg_name} default rule should only have one parameter: ['default'], got {len(buckets_rule)}"
+
+        elif rule == "customer":
+            assert (
+                len(buckets_rule) >= 2
+            ), f"{arg_name} customer rule requires at least one bucket value: ['customer', value1, ...]"
+            try:
+                bucket_values = [float(x) for x in buckets_rule[1:]]
+            except ValueError:
+                assert False, f"{arg_name} customer rule bucket values must be numeric"
+            assert len(set(bucket_values)) == len(
+                bucket_values
+            ), f"{arg_name} customer rule bucket values should not contain duplicates"
+            assert all(
+                val >= 0 for val in bucket_values
+            ), f"{arg_name} customer rule bucket values should be non-negative"
+
     def model_specific_adjustments(self):
         hf_config = self.get_hf_config()
         model_arch = hf_config.architectures[0]
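
For reference, the shapes the validator accepts and rejects (a sketch; args stands for any constructed ServerArgs instance):

    args.validate_buckets_rule("--prompt-tokens-buckets", ["default"])                # ok
    args.validate_buckets_rule("--prompt-tokens-buckets", ["tse", "1000", "2", "8"])  # ok
    args.validate_buckets_rule("--prompt-tokens-buckets", ["customer", "10", "50"])   # ok
    args.validate_buckets_rule("--prompt-tokens-buckets", ["tse", "1000", "1", "8"])  # AssertionError: base must be > 1
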
@@ -2284,7 +2494,8 @@ class ServerArgs:
             assert self.attention_backend in {
                 "fa3",
                 "aiter",
-            }, "fa3 or aiter is required for Llama4 model"
+                "triton",
+            }, "fa3, aiter, or triton is required for Llama4 model"
         elif model_arch in [
             "Gemma2ForCausalLM",
             "Gemma3ForCausalLM",
@@ -2377,6 +2588,9 @@ class PortArgs:
     # The ipc filename for Scheduler to send metrics
     metrics_ipc_name: str
 
+    # The ipc filename for Tokenizer and worker tokenizer
+    tokenizer_worker_ipc_name: Optional[str]
+
     @staticmethod
     def init_new(server_args, dp_rank: Optional[int] = None) -> "PortArgs":
         if server_args.nccl_port is None:
@@ -2400,6 +2614,7 @@ class PortArgs:
                 nccl_port=nccl_port,
                 rpc_ipc_name=f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}",
                 metrics_ipc_name=f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}",
+                tokenizer_worker_ipc_name=None,
             )
         else:
             # DP attention. Use TCP + port to handle both single-node and multi-node.
@@ -2433,6 +2648,7 @@ class PortArgs:
                 nccl_port=nccl_port,
                 rpc_ipc_name=f"tcp://{dist_init_host}:{rpc_port}",
                 metrics_ipc_name=f"tcp://{dist_init_host}:{metrics_ipc_name}",
+                tokenizer_worker_ipc_name=None,
             )
 
 
@@ -2478,7 +2694,9 @@ def auto_choose_speculative_params(self: ServerArgs):
     """
     hf_config = self.get_hf_config()
     arch = hf_config.architectures[0]
-
+    if self.speculative_algorithm == "STANDALONE":
+        # The default value for standalone speculative decoding
+        return (3, 1, 4)
     if arch in ["LlamaForCausalLM"]:
         # The default value for llama
         return (5, 4, 8)
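
Matching the order of the corresponding ServerArgs fields, the returned tuple appears to unpack as (speculative_num_steps, speculative_eagle_topk, speculative_num_draft_tokens); a hedged reading of the defaults:

    # Assumed field order: (num_steps, eagle_topk, num_draft_tokens).
    num_steps, eagle_topk, num_draft_tokens = (3, 1, 4)  # STANDALONE default
    # Compare (5, 4, 8) for LlamaForCausalLM targets in the hunk above.
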