sglang 0.3.5.post2__py3-none-any.whl → 0.3.6.post1__py3-none-any.whl
This diff compares two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- sglang/__init__.py +2 -2
- sglang/api.py +2 -2
- sglang/bench_latency.py +1 -553
- sglang/bench_offline_throughput.py +48 -20
- sglang/bench_one_batch.py +472 -0
- sglang/{bench_server_latency.py → bench_one_batch_server.py} +3 -3
- sglang/bench_serving.py +125 -6
- sglang/check_env.py +3 -6
- sglang/lang/backend/base_backend.py +1 -1
- sglang/lang/backend/runtime_endpoint.py +2 -2
- sglang/srt/configs/model_config.py +13 -14
- sglang/srt/constrained/__init__.py +13 -14
- sglang/srt/constrained/base_grammar_backend.py +13 -15
- sglang/srt/constrained/outlines_backend.py +28 -17
- sglang/srt/constrained/outlines_jump_forward.py +13 -15
- sglang/srt/constrained/xgrammar_backend.py +47 -58
- sglang/srt/conversation.py +13 -15
- sglang/srt/hf_transformers_utils.py +13 -15
- sglang/srt/layers/activation.py +16 -13
- sglang/srt/layers/attention/flashinfer_backend.py +106 -54
- sglang/srt/layers/attention/triton_backend.py +9 -7
- sglang/srt/layers/attention/triton_ops/decode_attention.py +51 -55
- sglang/srt/layers/attention/triton_ops/extend_attention.py +16 -16
- sglang/srt/layers/attention/triton_ops/prefill_attention.py +13 -15
- sglang/srt/layers/custom_op_util.py +25 -0
- sglang/srt/layers/fused_moe_grok/__init__.py +1 -0
- sglang/srt/layers/{fused_moe → fused_moe_grok}/fused_moe.py +11 -4
- sglang/srt/layers/{fused_moe → fused_moe_grok}/layer.py +4 -9
- sglang/srt/layers/{fused_moe/patch.py → fused_moe_patch.py} +5 -0
- sglang/srt/layers/fused_moe_triton/__init__.py +44 -0
- sglang/srt/layers/fused_moe_triton/fused_moe.py +861 -0
- sglang/srt/layers/fused_moe_triton/layer.py +633 -0
- sglang/srt/layers/layernorm.py +17 -15
- sglang/srt/layers/logits_processor.py +23 -25
- sglang/srt/layers/quantization/__init__.py +77 -17
- sglang/srt/layers/radix_attention.py +13 -15
- sglang/srt/layers/rotary_embedding.py +13 -13
- sglang/srt/layers/sampler.py +4 -8
- sglang/srt/layers/torchao_utils.py +2 -0
- sglang/srt/lora/lora.py +13 -14
- sglang/srt/lora/lora_config.py +13 -14
- sglang/srt/lora/lora_manager.py +22 -24
- sglang/srt/managers/data_parallel_controller.py +98 -27
- sglang/srt/managers/detokenizer_manager.py +13 -15
- sglang/srt/managers/io_struct.py +63 -21
- sglang/srt/managers/schedule_batch.py +154 -59
- sglang/srt/managers/schedule_policy.py +18 -16
- sglang/srt/managers/scheduler.py +278 -109
- sglang/srt/managers/session_controller.py +61 -0
- sglang/srt/managers/tokenizer_manager.py +63 -18
- sglang/srt/managers/tp_worker.py +25 -16
- sglang/srt/managers/tp_worker_overlap_thread.py +62 -67
- sglang/srt/metrics/collector.py +13 -15
- sglang/srt/metrics/func_timer.py +13 -15
- sglang/srt/mm_utils.py +13 -14
- sglang/srt/model_executor/cuda_graph_runner.py +63 -25
- sglang/srt/model_executor/forward_batch_info.py +128 -32
- sglang/srt/model_executor/model_runner.py +132 -64
- sglang/srt/model_parallel.py +98 -0
- sglang/srt/models/chatglm.py +15 -16
- sglang/srt/models/commandr.py +15 -16
- sglang/srt/models/dbrx.py +15 -16
- sglang/srt/models/deepseek.py +15 -15
- sglang/srt/models/deepseek_v2.py +162 -59
- sglang/srt/models/exaone.py +14 -15
- sglang/srt/models/gemma.py +14 -14
- sglang/srt/models/gemma2.py +31 -25
- sglang/srt/models/gemma2_reward.py +13 -14
- sglang/srt/models/gpt_bigcode.py +14 -14
- sglang/srt/models/grok.py +15 -15
- sglang/srt/models/internlm2.py +13 -15
- sglang/srt/models/internlm2_reward.py +13 -14
- sglang/srt/models/llama.py +21 -21
- sglang/srt/models/llama_classification.py +13 -14
- sglang/srt/models/llama_reward.py +13 -14
- sglang/srt/models/llava.py +14 -16
- sglang/srt/models/llavavid.py +14 -16
- sglang/srt/models/minicpm.py +13 -15
- sglang/srt/models/minicpm3.py +13 -15
- sglang/srt/models/mistral.py +13 -15
- sglang/srt/models/mixtral.py +15 -15
- sglang/srt/models/mixtral_quant.py +14 -14
- sglang/srt/models/olmo.py +22 -20
- sglang/srt/models/olmoe.py +23 -20
- sglang/srt/models/phi3_small.py +447 -0
- sglang/srt/models/qwen.py +14 -14
- sglang/srt/models/qwen2.py +22 -19
- sglang/srt/models/qwen2_moe.py +17 -18
- sglang/srt/models/qwen2_vl.py +13 -6
- sglang/srt/models/stablelm.py +18 -16
- sglang/srt/models/torch_native_llama.py +107 -93
- sglang/srt/models/xverse.py +13 -14
- sglang/srt/models/xverse_moe.py +15 -16
- sglang/srt/models/yivl.py +13 -15
- sglang/srt/openai_api/adapter.py +19 -17
- sglang/srt/openai_api/protocol.py +14 -16
- sglang/srt/sampling/penaltylib/orchestrator.py +49 -79
- sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +3 -8
- sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +3 -9
- sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +3 -8
- sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +3 -8
- sglang/srt/sampling/sampling_batch_info.py +61 -57
- sglang/srt/sampling/sampling_params.py +14 -16
- sglang/srt/server.py +86 -35
- sglang/srt/server_args.py +96 -80
- sglang/srt/utils.py +266 -68
- sglang/test/few_shot_gsm8k.py +8 -4
- sglang/test/runners.py +38 -20
- sglang/test/srt/sampling/penaltylib/utils.py +23 -21
- sglang/test/test_utils.py +31 -20
- sglang/version.py +1 -1
- {sglang-0.3.5.post2.dist-info → sglang-0.3.6.post1.dist-info}/LICENSE +1 -1
- {sglang-0.3.5.post2.dist-info → sglang-0.3.6.post1.dist-info}/METADATA +66 -57
- sglang-0.3.6.post1.dist-info/RECORD +164 -0
- {sglang-0.3.5.post2.dist-info → sglang-0.3.6.post1.dist-info}/WHEEL +1 -1
- sglang/srt/layers/fused_moe/__init__.py +0 -1
- sglang-0.3.5.post2.dist-info/RECORD +0 -156
- {sglang-0.3.5.post2.dist-info → sglang-0.3.6.post1.dist-info}/top_level.txt +0 -0
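Beyond in-place edits, several modules moved: `sglang/bench_latency.py` is superseded by `sglang/bench_one_batch.py`, `bench_server_latency.py` becomes `bench_one_batch_server.py`, and the old `sglang/srt/layers/fused_moe` package is replaced by `fused_moe_grok` plus a new `fused_moe_triton` package. A hedged sketch of what that means for downstream imports; the module paths come from the file list above, but the symbols those modules export are not part of this diff, so adjust to your own usage:

# Illustrative only: package paths are taken from the 0.3.6.post1 file list above.
# 0.3.5.post2 shipped sglang/srt/layers/fused_moe/ (now removed), so imports like
#     from sglang.srt.layers.fused_moe import ...
# no longer resolve. The 0.3.6.post1 wheel instead contains:
import sglang.srt.layers.fused_moe_grok as fused_moe_grok        # renamed from fused_moe
import sglang.srt.layers.fused_moe_triton as fused_moe_triton    # new Triton fused-MoE package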
sglang/srt/server.py
CHANGED
@@ -1,18 +1,16 @@
-[old lines 1-15 removed: the previous docstring-style license header; its text is not shown in this diff view]
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 """
 The entry point of inference server.
 SRT = SGLang Runtime.
@@ -50,8 +48,10 @@ from sglang.srt.managers.data_parallel_controller import (
 )
 from sglang.srt.managers.detokenizer_manager import run_detokenizer_process
 from sglang.srt.managers.io_struct import (
+    CloseSessionReqInput,
     EmbeddingReqInput,
     GenerateReqInput,
+    OpenSessionReqInput,
     UpdateWeightReqInput,
 )
 from sglang.srt.managers.scheduler import run_scheduler_process
@@ -102,6 +102,7 @@ app.add_middleware(
 )

 tokenizer_manager: TokenizerManager = None
+_max_total_num_tokens = None

 ##### Native API endpoints #####

@@ -145,10 +146,15 @@ async def get_model_info():
     return result


-@app.get("/
-async def
-
-
+@app.get("/get_server_info")
+async def get_server_info():
+    try:
+        return await _get_server_info()
+
+    except Exception as e:
+        return ORJSONResponse(
+            {"error": {"message": str(e)}}, status_code=HTTPStatus.BAD_REQUEST
+        )


 @app.post("/flush_cache")
@@ -184,19 +190,6 @@ async def stop_profile():
     )


-@app.api_route("/get_memory_pool_size", methods=["GET", "POST"])
-async def get_memory_pool_size():
-    """Get the memory pool size in number of tokens"""
-    try:
-        ret = await tokenizer_manager.get_memory_pool_size()
-
-        return ret
-    except Exception as e:
-        return ORJSONResponse(
-            {"error": {"message": str(e)}}, status_code=HTTPStatus.BAD_REQUEST
-        )
-
-
 @app.post("/update_weights")
 @time_func_latency
 async def update_weights(obj: UpdateWeightReqInput, request: Request):
@@ -215,6 +208,30 @@ async def update_weights(obj: UpdateWeightReqInput, request: Request):
         )


+@app.api_route("/open_session", methods=["GET", "POST"])
+async def open_session(obj: OpenSessionReqInput, request: Request):
+    """Open a session, and return its unique session id."""
+    try:
+        session_id = await tokenizer_manager.open_session(obj, request)
+        return session_id
+    except Exception as e:
+        return ORJSONResponse(
+            {"error": {"message": str(e)}}, status_code=HTTPStatus.BAD_REQUEST
+        )
+
+
+@app.api_route("/close_session", methods=["GET", "POST"])
+async def close_session(obj: CloseSessionReqInput, request: Request):
+    """Close the session"""
+    try:
+        await tokenizer_manager.close_session(obj, request)
+        return Response(status_code=200)
+    except Exception as e:
+        return ORJSONResponse(
+            {"error": {"message": str(e)}}, status_code=HTTPStatus.BAD_REQUEST
+        )
+
+
 @time_func_latency
 async def generate_request(obj: GenerateReqInput, request: Request):
     """Handle a generate request."""
@@ -366,6 +383,7 @@ def launch_engine(
     """

     global tokenizer_manager
+    global _max_total_num_tokens

     # Configure global environment
     configure_logger(server_args)
@@ -392,7 +410,7 @@
         )
         for tp_rank in tp_rank_range:
             reader, writer = mp.Pipe(duplex=False)
-            gpu_id = tp_rank % tp_size_per_node
+            gpu_id = server_args.base_gpu_id + tp_rank % tp_size_per_node
             proc = mp.Process(
                 target=run_scheduler_process,
                 args=(server_args, port_args, gpu_id, tp_rank, None, writer),
@@ -431,9 +449,20 @@
     if server_args.chat_template:
         load_chat_template_for_openai_api(tokenizer_manager, server_args.chat_template)

-    # Wait for model to finish loading
+    # Wait for model to finish loading & get max token nums
+    scheduler_info = []
     for i in range(len(scheduler_pipe_readers)):
-        scheduler_pipe_readers[i].recv()
+        data = scheduler_pipe_readers[i].recv()
+
+        if data["status"] != "ready":
+            self.shutdown()
+            raise RuntimeError(
+                "Initialization failed. Please see the error messages above."
+            )
+        scheduler_info.append(data)
+
+    # Assume all schedulers have same max_total_num_tokens
+    _max_total_num_tokens = scheduler_info[0]["max_total_num_tokens"]


 def launch_server(
@@ -494,6 +523,14 @@ def launch_server(
         t.join()


+async def _get_server_info():
+    return {
+        **dataclasses.asdict(tokenizer_manager.server_args),  # server args
+        "memory_pool_size": await tokenizer_manager.get_memory_pool_size(),  # memory pool size
+        "max_total_num_tokens": _max_total_num_tokens,  # max total num tokens
+    }
+
+
 def _set_envs_and_config(server_args: ServerArgs):
     # Set global environments
     os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
@@ -735,6 +772,17 @@ class Runtime:
         response = requests.post(self.url + "/encode", json=json_data)
         return json.dumps(response.json())

+    async def get_server_info(self):
+        async with aiohttp.ClientSession() as session:
+            async with session.get(f"{self.url}/get_server_info") as response:
+                if response.status == 200:
+                    return await response.json()
+                else:
+                    error_data = await response.json()
+                    raise RuntimeError(
+                        f"Failed to get server info. {error_data['error']['message']}"
+                    )
+
     def __del__(self):
         self.shutdown()

@@ -884,3 +932,6 @@ class Engine:
         # get the current event loop
         loop = asyncio.get_event_loop()
         return loop.run_until_complete(encode_request(obj, None))
+
+    async def get_server_info(self):
+        return await _get_server_info()
sglang/srt/server_args.py
CHANGED
@@ -1,18 +1,16 @@
-[old lines 1-15 removed: the previous docstring-style license header; its text is not shown in this diff view]
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 """The arguments of the server."""

 import argparse
@@ -23,8 +21,10 @@ import tempfile
 from typing import List, Optional

 from sglang.srt.utils import (
-
+    get_amdgpu_memory_capacity,
+    get_nvgpu_memory_capacity,
     is_flashinfer_available,
+    is_hip,
     is_ipv6,
     is_port_available,
 )
@@ -62,6 +62,7 @@ class ServerArgs:
     max_prefill_tokens: int = 16384
     schedule_policy: str = "lpm"
     schedule_conservativeness: float = 1.0
+    cpu_offload_gb: int = 0

     # Other runtime options
     tp_size: int = 1
@@ -70,6 +71,7 @@ class ServerArgs:
     constrained_json_whitespace_pattern: Optional[str] = None
     watchdog_timeout: float = 300
     download_dir: Optional[str] = None
+    base_gpu_id: int = 0

     # Logging
     log_level: str = "info"
@@ -114,8 +116,6 @@ class ServerArgs:
     grammar_backend: Optional[str] = "outlines"

     # Optimization/debug options
-    disable_flashinfer: bool = False
-    disable_flashinfer_sampling: bool = False
     disable_radix_cache: bool = False
     disable_jump_forward: bool = False
     disable_cuda_graph: bool = False
@@ -123,14 +123,14 @@ class ServerArgs:
     disable_disk_cache: bool = False
     disable_custom_all_reduce: bool = False
     disable_mla: bool = False
-
-    disable_nan_detection: bool = False
-    enable_overlap_schedule: bool = False
+    disable_overlap_schedule: bool = False
     enable_mixed_chunk: bool = False
+    enable_dp_attention: bool = False
     enable_torch_compile: bool = False
     torch_compile_max_bs: int = 32
     cuda_graph_max_bs: int = 160
     torchao_config: str = ""
+    enable_nan_detection: bool = False
     enable_p2p_check: bool = False
     triton_attention_reduce_in_fp32: bool = False
     num_continuous_decode_steps: int = 1
@@ -156,7 +156,7 @@ class ServerArgs:
         if self.tp_size >= 16:
             self.mem_fraction_static = 0.79
         elif self.tp_size >= 8:
-            self.mem_fraction_static = 0.
+            self.mem_fraction_static = 0.82
         elif self.tp_size >= 4:
             self.mem_fraction_static = 0.85
         elif self.tp_size >= 2:
@@ -165,59 +165,39 @@ class ServerArgs:
             self.mem_fraction_static = 0.88

         # Adjust for GPUs with small memory capacities
-
+        if is_hip():
+            gpu_mem = get_amdgpu_memory_capacity()
+        else:
+            gpu_mem = get_nvgpu_memory_capacity()
         if gpu_mem < 25000:
-            logger.warning(
-                "Automatically adjust --chunked-prefill-size for small GPUs."
-            )
             self.chunked_prefill_size //= 4  # make it 2048
             self.cuda_graph_max_bs = 4
+            logger.info("Automatically adjust --chunked-prefill-size for small GPUs.")

-        #
-        if self.disable_flashinfer:
-            logger.warning(
-                "The option '--disable-flashinfer' will be deprecated in the next release. "
-                "Please use '--attention-backend triton' instead."
-            )
-            self.attention_backend = "triton"
-        if self.disable_flashinfer_sampling:
-            logger.warning(
-                "The option '--disable-flashinfer-sampling' will be deprecated in the next release. "
-                "Please use '--sampling-backend pytorch' instead. "
-            )
-            self.sampling_backend = "pytorch"
-
+        # Choose kernel backends
         if not is_flashinfer_available():
             self.attention_backend = "triton"
             self.sampling_backend = "pytorch"

-        # Default kernel backends
         if self.attention_backend is None:
             self.attention_backend = "flashinfer"
-
         if self.sampling_backend is None:
             self.sampling_backend = "flashinfer"

-[old lines 201-207 removed: content not shown in this diff view]
-            self.disable_penalizer = True
-            self.disable_nan_detection = True
-
-        # Model-specific patches
-        if "Alibaba-NLP/gte-Qwen2-1.5B-instruct" == self.model_path:
+        # Others
+        if self.enable_dp_attention:
+            self.dp_size = self.tp_size
+            self.chunked_prefill_size = self.chunked_prefill_size // 2
+            self.cuda_graph_max_bs = min(self.cuda_graph_max_bs, 96)
+            self.schedule_conservativeness = self.schedule_conservativeness * 0.3
+            self.disable_overlap_schedule = True
             logger.info(
-                "
+                f"DP attention is enabled. The chunked prefill size is adjusted to {self.chunked_prefill_size} to avoid MoE kernel issues. "
+                f"The CUDA graph max batch size is adjusted to {self.cuda_graph_max_bs}. "
+                f"The schedule conservativeness is adjusted to {self.schedule_conservativeness}. "
+                "Data parallel size is adjusted to be the same as tensor parallel size. "
+                "Overlap schedule is disabled."
             )
-            self.trust_remote_code = False
-
-        if "gemma-2" in self.model_path.lower():
-            logger.info("When using sliding window in gemma-2, turn on flashinfer.")
-            self.attention_backend = "flashinfer"

     @staticmethod
     def add_cli_args(parser: argparse.ArgumentParser):
@@ -321,7 +301,7 @@ class ServerArgs:
             "--device",
             type=str,
             default="cuda",
-            choices=["cuda", "xpu"],
+            choices=["cuda", "xpu", "hpu"],
             help="The device type.",
         )
         parser.add_argument(
@@ -388,6 +368,13 @@ class ServerArgs:
             help="How conservative the schedule policy is. A larger value means more conservative scheduling. Use a larger value if you see requests being retracted frequently.",
         )

+        parser.add_argument(
+            "--cpu-offload-gb",
+            type=int,
+            default=ServerArgs.cpu_offload_gb,
+            help="How many GBs of RAM to reserve for CPU offloading",
+        )
+
         # Other runtime options
         parser.add_argument(
             "--tensor-parallel-size",
@@ -426,6 +413,12 @@ class ServerArgs:
             default=ServerArgs.download_dir,
             help="Model download directory.",
         )
+        parser.add_argument(
+            "--base-gpu-id",
+            type=int,
+            default=ServerArgs.base_gpu_id,
+            help="The base GPU ID to start allocating GPUs from. Useful when running multiple instances on the same machine.",
+        )

         # Logging
         parser.add_argument(
@@ -599,16 +592,6 @@ class ServerArgs:
         )

         # Optimization/debug options
-        parser.add_argument(
-            "--disable-flashinfer",
-            action="store_true",
-            help="Disable flashinfer attention kernels. This option will be deprecated in the next release. Please use '--attention-backend triton' instead.",
-        )
-        parser.add_argument(
-            "--disable-flashinfer-sampling",
-            action="store_true",
-            help="Disable flashinfer sampling kernels. This option will be deprecated in the next release. Please use '--sampling-backend pytorch' instead.",
-        )
         parser.add_argument(
             "--disable-radix-cache",
             action="store_true",
@@ -644,26 +627,26 @@ class ServerArgs:
             action="store_true",
             help="Disable Multi-head Latent Attention (MLA) for DeepSeek-V2.",
         )
-        parser.add_argument(
-            "--disable-penalizer",
-            action="store_true",
-            help="Disable the logit penalizers (e.g., frequency and repetition penalty) for better performance if they are not used in any requests.",
-        )
         parser.add_argument(
             "--disable-nan-detection",
             action="store_true",
             help="Disable the NaN detection for better performance.",
         )
         parser.add_argument(
-            "--
+            "--disable-overlap-schedule",
             action="store_true",
-            help="
+            help="Disable the overlap scheduler, which overlaps the CPU scheduler with GPU model worker.",
         )
         parser.add_argument(
             "--enable-mixed-chunk",
             action="store_true",
             help="Enabling mixing prefill and decode in a batch when using chunked prefill.",
         )
+        parser.add_argument(
+            "--enable-dp-attention",
+            action="store_true",
+            help="Enabling data parallelism for attention and tensor parallelism for FFN. The dp size should be equal to the tp size. Currently only DeepSeek-V2 is supported.",
+        )
         parser.add_argument(
             "--enable-torch-compile",
             action="store_true",
@@ -685,7 +668,12 @@ class ServerArgs:
             "--torchao-config",
             type=str,
             default=ServerArgs.torchao_config,
-            help="Optimize the model with torchao. Experimental feature. Current choices are: int8dq, int8wo, int4wo-<group_size>, fp8wo",
+            help="Optimize the model with torchao. Experimental feature. Current choices are: int8dq, int8wo, int4wo-<group_size>, fp8wo, fp8dq-per_tensor, fp8dq-per_row",
+        )
+        parser.add_argument(
+            "--enable-nan-detection",
+            action="store_true",
+            help="Enable the NaN detection for debugging purposes.",
         )
         parser.add_argument(
             "--enable-p2p-check",
@@ -712,6 +700,23 @@ class ServerArgs:
             help="Delete the model checkpoint after loading the model.",
         )

+        # Deprecated arguments
+        parser.add_argument(
+            "--enable-overlap-schedule",
+            action=DeprecatedAction,
+            help="'--enable-overlap-schedule' is deprecated. It is enabled by default now. Please drop this argument.",
+        )
+        parser.add_argument(
+            "--disable-flashinfer",
+            action=DeprecatedAction,
+            help="'--disable-flashinfer' is deprecated. Please use '--attention-backend triton' instead.",
+        )
+        parser.add_argument(
+            "--disable-flashinfer-sampling",
+            action=DeprecatedAction,
+            help="'--disable-flashinfer-sampling' is deprecated. Please use '--sampling-backend pytroch' instead.",
+        )
+
     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
         args.tp_size = args.tensor_parallel_size
@@ -738,6 +743,7 @@ class ServerArgs:
             and (self.lora_paths is None or self.disable_cuda_graph)
             and (self.lora_paths is None or self.disable_radix_cache)
         ), "compatibility of lora and cuda graph and radix attention is in progress"
+        assert self.base_gpu_id >= 0, "base_gpu_id must be non-negative"

         if isinstance(self.lora_paths, list):
             lora_paths = self.lora_paths
@@ -782,7 +788,7 @@ class PortArgs:

     @staticmethod
     def init_new(server_args) -> "PortArgs":
-        port = server_args.port +
+        port = server_args.port + random.randint(100, 1000)
         while True:
             if is_port_available(port):
                 break
@@ -805,3 +811,13 @@ class LoRAPathAction(argparse.Action):
                 getattr(namespace, self.dest)[name] = path
             else:
                 getattr(namespace, self.dest)[lora_path] = lora_path
+
+
+class DeprecatedAction(argparse.Action):
+    def __init__(self, option_strings, dest, nargs=0, **kwargs):
+        super(DeprecatedAction, self).__init__(
+            option_strings, dest, nargs=nargs, **kwargs
+        )
+
+    def __call__(self, parser, namespace, values, option_string=None):
+        raise ValueError(self.help)