sglang 0.3.5.post2__py3-none-any.whl → 0.3.6.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (118)
  1. sglang/__init__.py +2 -2
  2. sglang/api.py +2 -2
  3. sglang/bench_latency.py +1 -553
  4. sglang/bench_offline_throughput.py +48 -20
  5. sglang/bench_one_batch.py +472 -0
  6. sglang/{bench_server_latency.py → bench_one_batch_server.py} +3 -3
  7. sglang/bench_serving.py +125 -6
  8. sglang/check_env.py +3 -6
  9. sglang/lang/backend/base_backend.py +1 -1
  10. sglang/lang/backend/runtime_endpoint.py +2 -2
  11. sglang/srt/configs/model_config.py +13 -14
  12. sglang/srt/constrained/__init__.py +13 -14
  13. sglang/srt/constrained/base_grammar_backend.py +13 -15
  14. sglang/srt/constrained/outlines_backend.py +28 -17
  15. sglang/srt/constrained/outlines_jump_forward.py +13 -15
  16. sglang/srt/constrained/xgrammar_backend.py +47 -58
  17. sglang/srt/conversation.py +13 -15
  18. sglang/srt/hf_transformers_utils.py +13 -15
  19. sglang/srt/layers/activation.py +16 -13
  20. sglang/srt/layers/attention/flashinfer_backend.py +106 -54
  21. sglang/srt/layers/attention/triton_backend.py +9 -7
  22. sglang/srt/layers/attention/triton_ops/decode_attention.py +51 -55
  23. sglang/srt/layers/attention/triton_ops/extend_attention.py +16 -16
  24. sglang/srt/layers/attention/triton_ops/prefill_attention.py +13 -15
  25. sglang/srt/layers/custom_op_util.py +25 -0
  26. sglang/srt/layers/fused_moe_grok/__init__.py +1 -0
  27. sglang/srt/layers/{fused_moe → fused_moe_grok}/fused_moe.py +11 -4
  28. sglang/srt/layers/{fused_moe → fused_moe_grok}/layer.py +4 -9
  29. sglang/srt/layers/{fused_moe/patch.py → fused_moe_patch.py} +5 -0
  30. sglang/srt/layers/fused_moe_triton/__init__.py +44 -0
  31. sglang/srt/layers/fused_moe_triton/fused_moe.py +861 -0
  32. sglang/srt/layers/fused_moe_triton/layer.py +633 -0
  33. sglang/srt/layers/layernorm.py +17 -15
  34. sglang/srt/layers/logits_processor.py +23 -25
  35. sglang/srt/layers/quantization/__init__.py +77 -17
  36. sglang/srt/layers/radix_attention.py +13 -15
  37. sglang/srt/layers/rotary_embedding.py +13 -13
  38. sglang/srt/layers/sampler.py +4 -8
  39. sglang/srt/layers/torchao_utils.py +2 -0
  40. sglang/srt/lora/lora.py +13 -14
  41. sglang/srt/lora/lora_config.py +13 -14
  42. sglang/srt/lora/lora_manager.py +22 -24
  43. sglang/srt/managers/data_parallel_controller.py +98 -27
  44. sglang/srt/managers/detokenizer_manager.py +13 -15
  45. sglang/srt/managers/io_struct.py +63 -21
  46. sglang/srt/managers/schedule_batch.py +154 -59
  47. sglang/srt/managers/schedule_policy.py +18 -16
  48. sglang/srt/managers/scheduler.py +278 -109
  49. sglang/srt/managers/session_controller.py +61 -0
  50. sglang/srt/managers/tokenizer_manager.py +63 -18
  51. sglang/srt/managers/tp_worker.py +25 -16
  52. sglang/srt/managers/tp_worker_overlap_thread.py +62 -67
  53. sglang/srt/metrics/collector.py +13 -15
  54. sglang/srt/metrics/func_timer.py +13 -15
  55. sglang/srt/mm_utils.py +13 -14
  56. sglang/srt/model_executor/cuda_graph_runner.py +63 -25
  57. sglang/srt/model_executor/forward_batch_info.py +128 -32
  58. sglang/srt/model_executor/model_runner.py +132 -64
  59. sglang/srt/model_parallel.py +98 -0
  60. sglang/srt/models/chatglm.py +15 -16
  61. sglang/srt/models/commandr.py +15 -16
  62. sglang/srt/models/dbrx.py +15 -16
  63. sglang/srt/models/deepseek.py +15 -15
  64. sglang/srt/models/deepseek_v2.py +162 -59
  65. sglang/srt/models/exaone.py +14 -15
  66. sglang/srt/models/gemma.py +14 -14
  67. sglang/srt/models/gemma2.py +31 -25
  68. sglang/srt/models/gemma2_reward.py +13 -14
  69. sglang/srt/models/gpt_bigcode.py +14 -14
  70. sglang/srt/models/grok.py +15 -15
  71. sglang/srt/models/internlm2.py +13 -15
  72. sglang/srt/models/internlm2_reward.py +13 -14
  73. sglang/srt/models/llama.py +21 -21
  74. sglang/srt/models/llama_classification.py +13 -14
  75. sglang/srt/models/llama_reward.py +13 -14
  76. sglang/srt/models/llava.py +14 -16
  77. sglang/srt/models/llavavid.py +14 -16
  78. sglang/srt/models/minicpm.py +13 -15
  79. sglang/srt/models/minicpm3.py +13 -15
  80. sglang/srt/models/mistral.py +13 -15
  81. sglang/srt/models/mixtral.py +15 -15
  82. sglang/srt/models/mixtral_quant.py +14 -14
  83. sglang/srt/models/olmo.py +22 -20
  84. sglang/srt/models/olmoe.py +23 -20
  85. sglang/srt/models/phi3_small.py +447 -0
  86. sglang/srt/models/qwen.py +14 -14
  87. sglang/srt/models/qwen2.py +22 -19
  88. sglang/srt/models/qwen2_moe.py +17 -18
  89. sglang/srt/models/qwen2_vl.py +13 -6
  90. sglang/srt/models/stablelm.py +18 -16
  91. sglang/srt/models/torch_native_llama.py +107 -93
  92. sglang/srt/models/xverse.py +13 -14
  93. sglang/srt/models/xverse_moe.py +15 -16
  94. sglang/srt/models/yivl.py +13 -15
  95. sglang/srt/openai_api/adapter.py +19 -17
  96. sglang/srt/openai_api/protocol.py +14 -16
  97. sglang/srt/sampling/penaltylib/orchestrator.py +49 -79
  98. sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +3 -8
  99. sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +3 -9
  100. sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +3 -8
  101. sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +3 -8
  102. sglang/srt/sampling/sampling_batch_info.py +61 -57
  103. sglang/srt/sampling/sampling_params.py +14 -16
  104. sglang/srt/server.py +86 -35
  105. sglang/srt/server_args.py +96 -80
  106. sglang/srt/utils.py +266 -68
  107. sglang/test/few_shot_gsm8k.py +8 -4
  108. sglang/test/runners.py +38 -20
  109. sglang/test/srt/sampling/penaltylib/utils.py +23 -21
  110. sglang/test/test_utils.py +31 -20
  111. sglang/version.py +1 -1
  112. {sglang-0.3.5.post2.dist-info → sglang-0.3.6.post1.dist-info}/LICENSE +1 -1
  113. {sglang-0.3.5.post2.dist-info → sglang-0.3.6.post1.dist-info}/METADATA +66 -57
  114. sglang-0.3.6.post1.dist-info/RECORD +164 -0
  115. {sglang-0.3.5.post2.dist-info → sglang-0.3.6.post1.dist-info}/WHEEL +1 -1
  116. sglang/srt/layers/fused_moe/__init__.py +0 -1
  117. sglang-0.3.5.post2.dist-info/RECORD +0 -156
  118. {sglang-0.3.5.post2.dist-info → sglang-0.3.6.post1.dist-info}/top_level.txt +0 -0
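Several modules were renamed in this release (for example `sglang/bench_latency.py` → `sglang/bench_one_batch.py`, and `sglang/srt/layers/fused_moe/` split into `fused_moe_grok` and the new `fused_moe_triton`). A minimal, hedged sketch of how downstream code might tolerate both layouts during an upgrade; the chosen submodule (`fused_moe`) is taken from the rename entries above, everything else is illustrative:

try:
    # 0.3.6.post1 layout (see renames #27 and #31 in the file list above)
    from sglang.srt.layers.fused_moe_triton import fused_moe
except ImportError:
    # 0.3.5.post2 layout (package removed by entry #116)
    from sglang.srt.layers.fused_moe import fused_moe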
sglang/srt/server.py CHANGED
@@ -1,18 +1,16 @@
- """
- Copyright 2023-2024 SGLang Team
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
- """
-
+ # Copyright 2023-2024 SGLang Team
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ # ==============================================================================
  """
  The entry point of inference server.
  SRT = SGLang Runtime.
@@ -50,8 +48,10 @@ from sglang.srt.managers.data_parallel_controller import (
  )
  from sglang.srt.managers.detokenizer_manager import run_detokenizer_process
  from sglang.srt.managers.io_struct import (
+     CloseSessionReqInput,
      EmbeddingReqInput,
      GenerateReqInput,
+     OpenSessionReqInput,
      UpdateWeightReqInput,
  )
  from sglang.srt.managers.scheduler import run_scheduler_process
@@ -102,6 +102,7 @@ app.add_middleware(
  )

  tokenizer_manager: TokenizerManager = None
+ _max_total_num_tokens = None

  ##### Native API endpoints #####

@@ -145,10 +146,15 @@ async def get_model_info():
      return result


- @app.get("/get_server_args")
- async def get_server_args():
-     """Get the server arguments."""
-     return dataclasses.asdict(tokenizer_manager.server_args)
+ @app.get("/get_server_info")
+ async def get_server_info():
+     try:
+         return await _get_server_info()
+
+     except Exception as e:
+         return ORJSONResponse(
+             {"error": {"message": str(e)}}, status_code=HTTPStatus.BAD_REQUEST
+         )


  @app.post("/flush_cache")
@@ -184,19 +190,6 @@ async def stop_profile():
      )


- @app.api_route("/get_memory_pool_size", methods=["GET", "POST"])
- async def get_memory_pool_size():
-     """Get the memory pool size in number of tokens"""
-     try:
-         ret = await tokenizer_manager.get_memory_pool_size()
-
-         return ret
-     except Exception as e:
-         return ORJSONResponse(
-             {"error": {"message": str(e)}}, status_code=HTTPStatus.BAD_REQUEST
-         )
-
-
  @app.post("/update_weights")
  @time_func_latency
  async def update_weights(obj: UpdateWeightReqInput, request: Request):
@@ -215,6 +208,30 @@ async def update_weights(obj: UpdateWeightReqInput, request: Request):
          )


+ @app.api_route("/open_session", methods=["GET", "POST"])
+ async def open_session(obj: OpenSessionReqInput, request: Request):
+     """Open a session, and return its unique session id."""
+     try:
+         session_id = await tokenizer_manager.open_session(obj, request)
+         return session_id
+     except Exception as e:
+         return ORJSONResponse(
+             {"error": {"message": str(e)}}, status_code=HTTPStatus.BAD_REQUEST
+         )
+
+
+ @app.api_route("/close_session", methods=["GET", "POST"])
+ async def close_session(obj: CloseSessionReqInput, request: Request):
+     """Close the session"""
+     try:
+         await tokenizer_manager.close_session(obj, request)
+         return Response(status_code=200)
+     except Exception as e:
+         return ORJSONResponse(
+             {"error": {"message": str(e)}}, status_code=HTTPStatus.BAD_REQUEST
+         )
+
+
  @time_func_latency
  async def generate_request(obj: GenerateReqInput, request: Request):
      """Handle a generate request."""
@@ -366,6 +383,7 @@ def launch_engine(
      """

      global tokenizer_manager
+     global _max_total_num_tokens

      # Configure global environment
      configure_logger(server_args)
@@ -392,7 +410,7 @@ def launch_engine(
          )
          for tp_rank in tp_rank_range:
              reader, writer = mp.Pipe(duplex=False)
-             gpu_id = tp_rank % tp_size_per_node
+             gpu_id = server_args.base_gpu_id + tp_rank % tp_size_per_node
              proc = mp.Process(
                  target=run_scheduler_process,
                  args=(server_args, port_args, gpu_id, tp_rank, None, writer),
@@ -431,9 +449,20 @@ def launch_engine(
      if server_args.chat_template:
          load_chat_template_for_openai_api(tokenizer_manager, server_args.chat_template)

-     # Wait for model to finish loading
+     # Wait for model to finish loading & get max token nums
+     scheduler_info = []
      for i in range(len(scheduler_pipe_readers)):
-         scheduler_pipe_readers[i].recv()
+         data = scheduler_pipe_readers[i].recv()
+
+         if data["status"] != "ready":
+             self.shutdown()
+             raise RuntimeError(
+                 "Initialization failed. Please see the error messages above."
+             )
+         scheduler_info.append(data)
+
+     # Assume all schedulers have same max_total_num_tokens
+     _max_total_num_tokens = scheduler_info[0]["max_total_num_tokens"]


  def launch_server(
@@ -494,6 +523,14 @@ def launch_server(
          t.join()


+ async def _get_server_info():
+     return {
+         **dataclasses.asdict(tokenizer_manager.server_args),  # server args
+         "memory_pool_size": await tokenizer_manager.get_memory_pool_size(),  # memory pool size
+         "max_total_num_tokens": _max_total_num_tokens,  # max total num tokens
+     }
+
+
  def _set_envs_and_config(server_args: ServerArgs):
      # Set global environments
      os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
@@ -735,6 +772,17 @@ class Runtime:
          response = requests.post(self.url + "/encode", json=json_data)
          return json.dumps(response.json())

+     async def get_server_info(self):
+         async with aiohttp.ClientSession() as session:
+             async with session.get(f"{self.url}/get_server_info") as response:
+                 if response.status == 200:
+                     return await response.json()
+                 else:
+                     error_data = await response.json()
+                     raise RuntimeError(
+                         f"Failed to get server info. {error_data['error']['message']}"
+                     )
+
      def __del__(self):
          self.shutdown()

@@ -884,3 +932,6 @@ class Engine:
          # get the current event loop
          loop = asyncio.get_event_loop()
          return loop.run_until_complete(encode_request(obj, None))
+
+     async def get_server_info(self):
+         return await _get_server_info()
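
Note on the consolidated endpoint: `/get_server_args` and `/get_memory_pool_size` are replaced by a single `/get_server_info` route that returns the server args plus `memory_pool_size` and `max_total_num_tokens`. A minimal client sketch, assuming a server is already running locally; the address and port below are only an example (30000 is the usual default), and `requests` is used purely for illustration:

import requests

# Query the endpoint added in 0.3.6.post1.
resp = requests.get("http://127.0.0.1:30000/get_server_info")
resp.raise_for_status()
info = resp.json()

# The payload merges the ServerArgs dict with two extra keys (see _get_server_info above).
print(info["max_total_num_tokens"])  # reported by each scheduler at startup
print(info["memory_pool_size"])      # from tokenizer_manager.get_memory_pool_size()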
sglang/srt/server_args.py CHANGED
@@ -1,18 +1,16 @@
- """
- Copyright 2023-2024 SGLang Team
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
- """
-
+ # Copyright 2023-2024 SGLang Team
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ # ==============================================================================
  """The arguments of the server."""

  import argparse
@@ -23,8 +21,10 @@ import tempfile
  from typing import List, Optional

  from sglang.srt.utils import (
-     get_gpu_memory_capacity,
+     get_amdgpu_memory_capacity,
+     get_nvgpu_memory_capacity,
      is_flashinfer_available,
+     is_hip,
      is_ipv6,
      is_port_available,
  )
@@ -62,6 +62,7 @@ class ServerArgs:
      max_prefill_tokens: int = 16384
      schedule_policy: str = "lpm"
      schedule_conservativeness: float = 1.0
+     cpu_offload_gb: int = 0

      # Other runtime options
      tp_size: int = 1
@@ -70,6 +71,7 @@ class ServerArgs:
      constrained_json_whitespace_pattern: Optional[str] = None
      watchdog_timeout: float = 300
      download_dir: Optional[str] = None
+     base_gpu_id: int = 0

      # Logging
      log_level: str = "info"
@@ -114,8 +116,6 @@ class ServerArgs:
      grammar_backend: Optional[str] = "outlines"

      # Optimization/debug options
-     disable_flashinfer: bool = False
-     disable_flashinfer_sampling: bool = False
      disable_radix_cache: bool = False
      disable_jump_forward: bool = False
      disable_cuda_graph: bool = False
@@ -123,14 +123,14 @@ class ServerArgs:
      disable_disk_cache: bool = False
      disable_custom_all_reduce: bool = False
      disable_mla: bool = False
-     disable_penalizer: bool = False
-     disable_nan_detection: bool = False
-     enable_overlap_schedule: bool = False
+     disable_overlap_schedule: bool = False
      enable_mixed_chunk: bool = False
+     enable_dp_attention: bool = False
      enable_torch_compile: bool = False
      torch_compile_max_bs: int = 32
      cuda_graph_max_bs: int = 160
      torchao_config: str = ""
+     enable_nan_detection: bool = False
      enable_p2p_check: bool = False
      triton_attention_reduce_in_fp32: bool = False
      num_continuous_decode_steps: int = 1
@@ -156,7 +156,7 @@ class ServerArgs:
              if self.tp_size >= 16:
                  self.mem_fraction_static = 0.79
              elif self.tp_size >= 8:
-                 self.mem_fraction_static = 0.83
+                 self.mem_fraction_static = 0.82
              elif self.tp_size >= 4:
                  self.mem_fraction_static = 0.85
              elif self.tp_size >= 2:
@@ -165,59 +165,39 @@ class ServerArgs:
                  self.mem_fraction_static = 0.88

          # Adjust for GPUs with small memory capacities
-         gpu_mem = get_gpu_memory_capacity()
+         if is_hip():
+             gpu_mem = get_amdgpu_memory_capacity()
+         else:
+             gpu_mem = get_nvgpu_memory_capacity()
          if gpu_mem < 25000:
-             logger.warning(
-                 "Automatically adjust --chunked-prefill-size for small GPUs."
-             )
              self.chunked_prefill_size //= 4  # make it 2048
              self.cuda_graph_max_bs = 4
+             logger.info("Automatically adjust --chunked-prefill-size for small GPUs.")

-         # Deprecation warnings
-         if self.disable_flashinfer:
-             logger.warning(
-                 "The option '--disable-flashinfer' will be deprecated in the next release. "
-                 "Please use '--attention-backend triton' instead."
-             )
-             self.attention_backend = "triton"
-         if self.disable_flashinfer_sampling:
-             logger.warning(
-                 "The option '--disable-flashinfer-sampling' will be deprecated in the next release. "
-                 "Please use '--sampling-backend pytorch' instead. "
-             )
-             self.sampling_backend = "pytorch"
-
+         # Choose kernel backends
          if not is_flashinfer_available():
              self.attention_backend = "triton"
              self.sampling_backend = "pytorch"

-         # Default kernel backends
          if self.attention_backend is None:
              self.attention_backend = "flashinfer"
-
          if self.sampling_backend is None:
              self.sampling_backend = "flashinfer"

-         if self.enable_overlap_schedule:
-             logger.warning(
-                 "Overlap scheduler mode is enabled. This is an experimental feature. "
-                 "Sampling penalizer (e.g., frequency and repetition penalty), constrained decoding (e.g., regex, JSON), "
-                 "and embedding APIs are not supported and will lead to wrong results. "
-                 "The NaN detection is also disabled."
-             )
-             self.disable_penalizer = True
-             self.disable_nan_detection = True
-
-         # Model-specific patches
-         if "Alibaba-NLP/gte-Qwen2-1.5B-instruct" == self.model_path:
+         # Others
+         if self.enable_dp_attention:
+             self.dp_size = self.tp_size
+             self.chunked_prefill_size = self.chunked_prefill_size // 2
+             self.cuda_graph_max_bs = min(self.cuda_graph_max_bs, 96)
+             self.schedule_conservativeness = self.schedule_conservativeness * 0.3
+             self.disable_overlap_schedule = True
              logger.info(
-                 "Not sure why, the tokenizer will add an additional token at the end of the prompt when trust_remote_mode=True"
+                 f"DP attention is enabled. The chunked prefill size is adjusted to {self.chunked_prefill_size} to avoid MoE kernel issues. "
+                 f"The CUDA graph max batch size is adjusted to {self.cuda_graph_max_bs}. "
+                 f"The schedule conservativeness is adjusted to {self.schedule_conservativeness}. "
+                 "Data parallel size is adjusted to be the same as tensor parallel size. "
+                 "Overlap schedule is disabled."
              )
-             self.trust_remote_code = False
-
-         if "gemma-2" in self.model_path.lower():
-             logger.info("When using sliding window in gemma-2, turn on flashinfer.")
-             self.attention_backend = "flashinfer"

      @staticmethod
      def add_cli_args(parser: argparse.ArgumentParser):
@@ -321,7 +301,7 @@ class ServerArgs:
              "--device",
              type=str,
              default="cuda",
-             choices=["cuda", "xpu"],
+             choices=["cuda", "xpu", "hpu"],
              help="The device type.",
          )
          parser.add_argument(
@@ -388,6 +368,13 @@ class ServerArgs:
              help="How conservative the schedule policy is. A larger value means more conservative scheduling. Use a larger value if you see requests being retracted frequently.",
          )

+         parser.add_argument(
+             "--cpu-offload-gb",
+             type=int,
+             default=ServerArgs.cpu_offload_gb,
+             help="How many GBs of RAM to reserve for CPU offloading",
+         )
+
          # Other runtime options
          parser.add_argument(
              "--tensor-parallel-size",
@@ -426,6 +413,12 @@ class ServerArgs:
              default=ServerArgs.download_dir,
              help="Model download directory.",
          )
+         parser.add_argument(
+             "--base-gpu-id",
+             type=int,
+             default=ServerArgs.base_gpu_id,
+             help="The base GPU ID to start allocating GPUs from. Useful when running multiple instances on the same machine.",
+         )

          # Logging
          parser.add_argument(
@@ -599,16 +592,6 @@ class ServerArgs:
          )

          # Optimization/debug options
-         parser.add_argument(
-             "--disable-flashinfer",
-             action="store_true",
-             help="Disable flashinfer attention kernels. This option will be deprecated in the next release. Please use '--attention-backend triton' instead.",
-         )
-         parser.add_argument(
-             "--disable-flashinfer-sampling",
-             action="store_true",
-             help="Disable flashinfer sampling kernels. This option will be deprecated in the next release. Please use '--sampling-backend pytorch' instead.",
-         )
          parser.add_argument(
              "--disable-radix-cache",
              action="store_true",
@@ -644,26 +627,26 @@ class ServerArgs:
              action="store_true",
              help="Disable Multi-head Latent Attention (MLA) for DeepSeek-V2.",
          )
-         parser.add_argument(
-             "--disable-penalizer",
-             action="store_true",
-             help="Disable the logit penalizers (e.g., frequency and repetition penalty) for better performance if they are not used in any requests.",
-         )
          parser.add_argument(
              "--disable-nan-detection",
              action="store_true",
              help="Disable the NaN detection for better performance.",
          )
          parser.add_argument(
-             "--enable-overlap-schedule",
+             "--disable-overlap-schedule",
              action="store_true",
-             help="Overlap the CPU scheduler with GPU model worker. Experimental feature.",
+             help="Disable the overlap scheduler, which overlaps the CPU scheduler with GPU model worker.",
          )
          parser.add_argument(
              "--enable-mixed-chunk",
              action="store_true",
              help="Enabling mixing prefill and decode in a batch when using chunked prefill.",
          )
+         parser.add_argument(
+             "--enable-dp-attention",
+             action="store_true",
+             help="Enabling data parallelism for attention and tensor parallelism for FFN. The dp size should be equal to the tp size. Currently only DeepSeek-V2 is supported.",
+         )
          parser.add_argument(
              "--enable-torch-compile",
              action="store_true",
@@ -685,7 +668,12 @@ class ServerArgs:
              "--torchao-config",
              type=str,
              default=ServerArgs.torchao_config,
-             help="Optimize the model with torchao. Experimental feature. Current choices are: int8dq, int8wo, int4wo-<group_size>, fp8wo",
+             help="Optimize the model with torchao. Experimental feature. Current choices are: int8dq, int8wo, int4wo-<group_size>, fp8wo, fp8dq-per_tensor, fp8dq-per_row",
+         )
+         parser.add_argument(
+             "--enable-nan-detection",
+             action="store_true",
+             help="Enable the NaN detection for debugging purposes.",
          )
          parser.add_argument(
              "--enable-p2p-check",
@@ -712,6 +700,23 @@ class ServerArgs:
              help="Delete the model checkpoint after loading the model.",
          )

+         # Deprecated arguments
+         parser.add_argument(
+             "--enable-overlap-schedule",
+             action=DeprecatedAction,
+             help="'--enable-overlap-schedule' is deprecated. It is enabled by default now. Please drop this argument.",
+         )
+         parser.add_argument(
+             "--disable-flashinfer",
+             action=DeprecatedAction,
+             help="'--disable-flashinfer' is deprecated. Please use '--attention-backend triton' instead.",
+         )
+         parser.add_argument(
+             "--disable-flashinfer-sampling",
+             action=DeprecatedAction,
+             help="'--disable-flashinfer-sampling' is deprecated. Please use '--sampling-backend pytroch' instead.",
+         )
+
      @classmethod
      def from_cli_args(cls, args: argparse.Namespace):
          args.tp_size = args.tensor_parallel_size
@@ -738,6 +743,7 @@ class ServerArgs:
              and (self.lora_paths is None or self.disable_cuda_graph)
              and (self.lora_paths is None or self.disable_radix_cache)
          ), "compatibility of lora and cuda graph and radix attention is in progress"
+         assert self.base_gpu_id >= 0, "base_gpu_id must be non-negative"

          if isinstance(self.lora_paths, list):
              lora_paths = self.lora_paths
@@ -782,7 +788,7 @@ class PortArgs:

      @staticmethod
      def init_new(server_args) -> "PortArgs":
-         port = server_args.port + 42
+         port = server_args.port + random.randint(100, 1000)
          while True:
              if is_port_available(port):
                  break
@@ -805,3 +811,13 @@ class LoRAPathAction(argparse.Action):
                  getattr(namespace, self.dest)[name] = path
              else:
                  getattr(namespace, self.dest)[lora_path] = lora_path
+
+
+ class DeprecatedAction(argparse.Action):
+     def __init__(self, option_strings, dest, nargs=0, **kwargs):
+         super(DeprecatedAction, self).__init__(
+             option_strings, dest, nargs=nargs, **kwargs
+         )
+
+     def __call__(self, parser, namespace, values, option_string=None):
+         raise ValueError(self.help)
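
For reference, a minimal sketch of how the new options above might be set; everything here is illustrative (the model path, GPU index, and sizes are examples, not recommendations), and instantiating ServerArgs runs __post_init__, which probes the local GPU, so this is meant for a machine with a supported accelerator. The equivalent CLI flags come straight from add_cli_args above.

from sglang.srt.server_args import ServerArgs

# Illustrative CLI equivalent:
#   python -m sglang.launch_server --model-path deepseek-ai/DeepSeek-V2-Lite \
#       --tensor-parallel-size 2 --enable-dp-attention --base-gpu-id 4 --cpu-offload-gb 8
args = ServerArgs(
    model_path="deepseek-ai/DeepSeek-V2-Lite",  # example model; --enable-dp-attention currently targets DeepSeek-V2
    tp_size=2,
    enable_dp_attention=True,  # __post_init__ sets dp_size = tp_size, halves chunked_prefill_size,
                               # caps cuda_graph_max_bs at 96, and disables the overlap schedule
    base_gpu_id=4,             # schedulers allocate devices starting at GPU 4 (must be >= 0)
    cpu_offload_gb=8,          # GBs of RAM reserved for CPU offloading
)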