sglang 0.3.6__py3-none-any.whl → 0.3.6.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +2 -2
- sglang/api.py +2 -2
- sglang/bench_one_batch.py +2 -4
- sglang/bench_serving.py +75 -26
- sglang/lang/backend/base_backend.py +1 -1
- sglang/lang/backend/runtime_endpoint.py +2 -2
- sglang/srt/configs/model_config.py +13 -14
- sglang/srt/constrained/__init__.py +13 -14
- sglang/srt/constrained/base_grammar_backend.py +13 -15
- sglang/srt/constrained/outlines_backend.py +13 -15
- sglang/srt/constrained/outlines_jump_forward.py +13 -15
- sglang/srt/constrained/xgrammar_backend.py +38 -57
- sglang/srt/conversation.py +13 -15
- sglang/srt/hf_transformers_utils.py +13 -15
- sglang/srt/layers/activation.py +13 -13
- sglang/srt/layers/attention/flashinfer_backend.py +13 -6
- sglang/srt/layers/attention/triton_ops/decode_attention.py +51 -55
- sglang/srt/layers/attention/triton_ops/extend_attention.py +16 -16
- sglang/srt/layers/attention/triton_ops/prefill_attention.py +13 -15
- sglang/srt/layers/custom_op_util.py +13 -14
- sglang/srt/layers/fused_moe_grok/__init__.py +1 -0
- sglang/srt/layers/{fused_moe → fused_moe_grok}/layer.py +4 -9
- sglang/srt/layers/{fused_moe/patch.py → fused_moe_patch.py} +5 -0
- sglang/srt/layers/fused_moe_triton/__init__.py +44 -0
- sglang/srt/layers/fused_moe_triton/fused_moe.py +861 -0
- sglang/srt/layers/fused_moe_triton/layer.py +633 -0
- sglang/srt/layers/layernorm.py +13 -15
- sglang/srt/layers/logits_processor.py +13 -15
- sglang/srt/layers/quantization/__init__.py +77 -17
- sglang/srt/layers/radix_attention.py +13 -15
- sglang/srt/layers/rotary_embedding.py +13 -13
- sglang/srt/lora/lora.py +13 -14
- sglang/srt/lora/lora_config.py +13 -14
- sglang/srt/lora/lora_manager.py +22 -24
- sglang/srt/managers/data_parallel_controller.py +25 -19
- sglang/srt/managers/detokenizer_manager.py +13 -16
- sglang/srt/managers/io_struct.py +43 -28
- sglang/srt/managers/schedule_batch.py +55 -26
- sglang/srt/managers/schedule_policy.py +13 -15
- sglang/srt/managers/scheduler.py +89 -70
- sglang/srt/managers/session_controller.py +14 -15
- sglang/srt/managers/tokenizer_manager.py +29 -22
- sglang/srt/managers/tp_worker.py +13 -15
- sglang/srt/managers/tp_worker_overlap_thread.py +13 -15
- sglang/srt/metrics/collector.py +13 -15
- sglang/srt/metrics/func_timer.py +13 -15
- sglang/srt/mm_utils.py +13 -14
- sglang/srt/model_executor/cuda_graph_runner.py +20 -19
- sglang/srt/model_executor/forward_batch_info.py +19 -17
- sglang/srt/model_executor/model_runner.py +42 -30
- sglang/srt/models/chatglm.py +15 -16
- sglang/srt/models/commandr.py +15 -16
- sglang/srt/models/dbrx.py +15 -16
- sglang/srt/models/deepseek.py +15 -15
- sglang/srt/models/deepseek_v2.py +15 -15
- sglang/srt/models/exaone.py +14 -15
- sglang/srt/models/gemma.py +14 -14
- sglang/srt/models/gemma2.py +24 -19
- sglang/srt/models/gemma2_reward.py +13 -14
- sglang/srt/models/gpt_bigcode.py +14 -14
- sglang/srt/models/grok.py +15 -15
- sglang/srt/models/internlm2.py +13 -15
- sglang/srt/models/internlm2_reward.py +13 -14
- sglang/srt/models/llama.py +21 -21
- sglang/srt/models/llama_classification.py +13 -14
- sglang/srt/models/llama_reward.py +13 -14
- sglang/srt/models/llava.py +13 -15
- sglang/srt/models/llavavid.py +13 -15
- sglang/srt/models/minicpm.py +13 -15
- sglang/srt/models/minicpm3.py +13 -15
- sglang/srt/models/mistral.py +13 -15
- sglang/srt/models/mixtral.py +15 -15
- sglang/srt/models/mixtral_quant.py +14 -14
- sglang/srt/models/olmo.py +21 -19
- sglang/srt/models/olmoe.py +23 -20
- sglang/srt/models/qwen.py +14 -14
- sglang/srt/models/qwen2.py +22 -19
- sglang/srt/models/qwen2_moe.py +17 -18
- sglang/srt/models/stablelm.py +18 -16
- sglang/srt/models/torch_native_llama.py +15 -17
- sglang/srt/models/xverse.py +13 -14
- sglang/srt/models/xverse_moe.py +15 -16
- sglang/srt/models/yivl.py +13 -15
- sglang/srt/openai_api/adapter.py +13 -15
- sglang/srt/openai_api/protocol.py +13 -15
- sglang/srt/sampling/sampling_batch_info.py +4 -1
- sglang/srt/sampling/sampling_params.py +13 -15
- sglang/srt/server.py +59 -34
- sglang/srt/server_args.py +22 -22
- sglang/srt/utils.py +196 -17
- sglang/test/few_shot_gsm8k.py +8 -4
- sglang/test/runners.py +13 -14
- sglang/test/test_utils.py +1 -1
- sglang/version.py +1 -1
- {sglang-0.3.6.dist-info → sglang-0.3.6.post1.dist-info}/LICENSE +1 -1
- {sglang-0.3.6.dist-info → sglang-0.3.6.post1.dist-info}/METADATA +24 -15
- sglang-0.3.6.post1.dist-info/RECORD +164 -0
- sglang/srt/layers/fused_moe/__init__.py +0 -1
- sglang-0.3.6.dist-info/RECORD +0 -161
- sglang/srt/layers/{fused_moe → fused_moe_grok}/fused_moe.py +0 -0
- {sglang-0.3.6.dist-info → sglang-0.3.6.post1.dist-info}/WHEEL +0 -0
- {sglang-0.3.6.dist-info → sglang-0.3.6.post1.dist-info}/top_level.txt +0 -0
sglang/srt/models/xverse.py
CHANGED
@@ -1,17 +1,16 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
"""
|
1
|
+
# Copyright 2023-2024 SGLang Team
|
2
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
3
|
+
# you may not use this file except in compliance with the License.
|
4
|
+
# You may obtain a copy of the License at
|
5
|
+
#
|
6
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
7
|
+
#
|
8
|
+
# Unless required by applicable law or agreed to in writing, software
|
9
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
10
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
11
|
+
# See the License for the specific language governing permissions and
|
12
|
+
# limitations under the License.
|
13
|
+
# ==============================================================================
|
15
14
|
|
16
15
|
# Adapted from
|
17
16
|
# https://github.com/vllm-project/vllm/blob/c7f2cf2b7f67bce5842fedfdba508440fe257375/vllm/model_executor/models/xverse.py#L1
|
sglang/srt/models/xverse_moe.py
CHANGED
@@ -1,19 +1,18 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
"""
|
15
|
-
|
1
|
+
# Copyright 2023-2024 SGLang Team
|
2
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
3
|
+
# you may not use this file except in compliance with the License.
|
4
|
+
# You may obtain a copy of the License at
|
5
|
+
#
|
6
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
7
|
+
#
|
8
|
+
# Unless required by applicable law or agreed to in writing, software
|
9
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
10
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
11
|
+
# See the License for the specific language governing permissions and
|
12
|
+
# limitations under the License.
|
13
|
+
# ==============================================================================
|
16
14
|
"""Inference-only XVERSE MoE model."""
|
15
|
+
|
17
16
|
from typing import Any, Dict, Iterable, Optional, Tuple
|
18
17
|
|
19
18
|
import torch
|
@@ -25,7 +24,6 @@ from vllm.distributed import (
|
|
25
24
|
tensor_model_parallel_all_reduce,
|
26
25
|
)
|
27
26
|
from vllm.model_executor.layers.activation import SiluAndMul
|
28
|
-
from vllm.model_executor.layers.fused_moe import fused_moe
|
29
27
|
from vllm.model_executor.layers.layernorm import RMSNorm
|
30
28
|
from vllm.model_executor.layers.linear import (
|
31
29
|
MergedColumnParallelLinear,
|
@@ -36,6 +34,7 @@ from vllm.model_executor.layers.linear import (
|
|
36
34
|
from vllm.model_executor.layers.rotary_embedding import get_rope
|
37
35
|
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
|
38
36
|
|
37
|
+
from sglang.srt.layers.fused_moe_triton import fused_moe
|
39
38
|
from sglang.srt.layers.logits_processor import LogitsProcessor
|
40
39
|
from sglang.srt.layers.quantization.base_config import QuantizationConfig
|
41
40
|
from sglang.srt.layers.radix_attention import RadixAttention
|
sglang/srt/models/yivl.py
CHANGED
@@ -1,18 +1,16 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
"""
|
15
|
-
|
1
|
+
# Copyright 2023-2024 SGLang Team
|
2
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
3
|
+
# you may not use this file except in compliance with the License.
|
4
|
+
# You may obtain a copy of the License at
|
5
|
+
#
|
6
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
7
|
+
#
|
8
|
+
# Unless required by applicable law or agreed to in writing, software
|
9
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
10
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
11
|
+
# See the License for the specific language governing permissions and
|
12
|
+
# limitations under the License.
|
13
|
+
# ==============================================================================
|
16
14
|
"""Inference-only Yi-VL model."""
|
17
15
|
|
18
16
|
from typing import Iterable, Optional, Tuple
|
sglang/srt/openai_api/adapter.py
CHANGED
@@ -1,18 +1,16 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
"""
|
15
|
-
|
1
|
+
# Copyright 2023-2024 SGLang Team
|
2
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
3
|
+
# you may not use this file except in compliance with the License.
|
4
|
+
# You may obtain a copy of the License at
|
5
|
+
#
|
6
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
7
|
+
#
|
8
|
+
# Unless required by applicable law or agreed to in writing, software
|
9
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
10
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
11
|
+
# See the License for the specific language governing permissions and
|
12
|
+
# limitations under the License.
|
13
|
+
# ==============================================================================
|
16
14
|
"""Conversion between OpenAI APIs and native SRT APIs"""
|
17
15
|
|
18
16
|
import asyncio
|
sglang/srt/openai_api/protocol.py
CHANGED
@@ -1,18 +1,16 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
"""
|
15
|
-
|
1
|
+
# Copyright 2023-2024 SGLang Team
|
2
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
3
|
+
# you may not use this file except in compliance with the License.
|
4
|
+
# You may obtain a copy of the License at
|
5
|
+
#
|
6
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
7
|
+
#
|
8
|
+
# Unless required by applicable law or agreed to in writing, software
|
9
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
10
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
11
|
+
# See the License for the specific language governing permissions and
|
12
|
+
# limitations under the License.
|
13
|
+
# ==============================================================================
|
16
14
|
"""Pydantic models for OpenAI API protocol"""
|
17
15
|
|
18
16
|
import time
|
sglang/srt/sampling/sampling_batch_info.py
CHANGED
@@ -170,7 +170,10 @@ class SamplingBatchInfo:
|
|
170
170
|
|
171
171
|
for i, grammar in enumerate(self.grammars):
|
172
172
|
if grammar is not None:
|
173
|
-
|
173
|
+
try:
|
174
|
+
grammar.fill_vocab_mask(self.vocab_mask, i)
|
175
|
+
except RuntimeError:
|
176
|
+
continue
|
174
177
|
|
175
178
|
def filter_batch(self, unfinished_indices: List[int], new_indices: torch.Tensor):
|
176
179
|
self.penalizer_orchestrator.filter(unfinished_indices, new_indices)
|
sglang/srt/sampling/sampling_params.py
CHANGED
@@ -1,18 +1,16 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
"""
|
15
|
-
|
1
|
+
# Copyright 2023-2024 SGLang Team
|
2
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
3
|
+
# you may not use this file except in compliance with the License.
|
4
|
+
# You may obtain a copy of the License at
|
5
|
+
#
|
6
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
7
|
+
#
|
8
|
+
# Unless required by applicable law or agreed to in writing, software
|
9
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
10
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
11
|
+
# See the License for the specific language governing permissions and
|
12
|
+
# limitations under the License.
|
13
|
+
# ==============================================================================
|
16
14
|
"""Sampling parameters for text generation."""
|
17
15
|
|
18
16
|
from typing import List, Optional, Union
|
sglang/srt/server.py
CHANGED
@@ -1,18 +1,16 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
"""
|
15
|
-
|
1
|
+
# Copyright 2023-2024 SGLang Team
|
2
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
3
|
+
# you may not use this file except in compliance with the License.
|
4
|
+
# You may obtain a copy of the License at
|
5
|
+
#
|
6
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
7
|
+
#
|
8
|
+
# Unless required by applicable law or agreed to in writing, software
|
9
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
10
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
11
|
+
# See the License for the specific language governing permissions and
|
12
|
+
# limitations under the License.
|
13
|
+
# ==============================================================================
|
16
14
|
"""
|
17
15
|
The entry point of inference server.
|
18
16
|
SRT = SGLang Runtime.
|
@@ -104,6 +102,7 @@ app.add_middleware(
|
|
104
102
|
)
|
105
103
|
|
106
104
|
tokenizer_manager: TokenizerManager = None
|
105
|
+
_max_total_num_tokens = None
|
107
106
|
|
108
107
|
##### Native API endpoints #####
|
109
108
|
|
@@ -147,10 +146,15 @@ async def get_model_info():
|
|
147
146
|
return result
|
148
147
|
|
149
148
|
|
150
|
-
@app.get("/
|
151
|
-
async def
|
152
|
-
|
153
|
-
|
149
|
+
@app.get("/get_server_info")
|
150
|
+
async def get_server_info():
|
151
|
+
try:
|
152
|
+
return await _get_server_info()
|
153
|
+
|
154
|
+
except Exception as e:
|
155
|
+
return ORJSONResponse(
|
156
|
+
{"error": {"message": str(e)}}, status_code=HTTPStatus.BAD_REQUEST
|
157
|
+
)
|
154
158
|
|
155
159
|
|
156
160
|
@app.post("/flush_cache")
|
@@ -186,19 +190,6 @@ async def stop_profile():
|
|
186
190
|
)
|
187
191
|
|
188
192
|
|
189
|
-
@app.api_route("/get_memory_pool_size", methods=["GET", "POST"])
|
190
|
-
async def get_memory_pool_size():
|
191
|
-
"""Get the memory pool size in number of tokens"""
|
192
|
-
try:
|
193
|
-
ret = await tokenizer_manager.get_memory_pool_size()
|
194
|
-
|
195
|
-
return ret
|
196
|
-
except Exception as e:
|
197
|
-
return ORJSONResponse(
|
198
|
-
{"error": {"message": str(e)}}, status_code=HTTPStatus.BAD_REQUEST
|
199
|
-
)
|
200
|
-
|
201
|
-
|
202
193
|
@app.post("/update_weights")
|
203
194
|
@time_func_latency
|
204
195
|
async def update_weights(obj: UpdateWeightReqInput, request: Request):
|
@@ -392,6 +383,7 @@ def launch_engine(
|
|
392
383
|
"""
|
393
384
|
|
394
385
|
global tokenizer_manager
|
386
|
+
global _max_total_num_tokens
|
395
387
|
|
396
388
|
# Configure global environment
|
397
389
|
configure_logger(server_args)
|
@@ -457,9 +449,20 @@ def launch_engine(
|
|
457
449
|
if server_args.chat_template:
|
458
450
|
load_chat_template_for_openai_api(tokenizer_manager, server_args.chat_template)
|
459
451
|
|
460
|
-
# Wait for model to finish loading
|
452
|
+
# Wait for model to finish loading & get max token nums
|
453
|
+
scheduler_info = []
|
461
454
|
for i in range(len(scheduler_pipe_readers)):
|
462
|
-
scheduler_pipe_readers[i].recv()
|
455
|
+
data = scheduler_pipe_readers[i].recv()
|
456
|
+
|
457
|
+
if data["status"] != "ready":
|
458
|
+
self.shutdown()
|
459
|
+
raise RuntimeError(
|
460
|
+
"Initialization failed. Please see the error messages above."
|
461
|
+
)
|
462
|
+
scheduler_info.append(data)
|
463
|
+
|
464
|
+
# Assume all schedulers have same max_total_num_tokens
|
465
|
+
_max_total_num_tokens = scheduler_info[0]["max_total_num_tokens"]
|
463
466
|
|
464
467
|
|
465
468
|
def launch_server(
|
@@ -520,6 +523,14 @@ def launch_server(
|
|
520
523
|
t.join()
|
521
524
|
|
522
525
|
|
526
|
+
async def _get_server_info():
|
527
|
+
return {
|
528
|
+
**dataclasses.asdict(tokenizer_manager.server_args), # server args
|
529
|
+
"memory_pool_size": await tokenizer_manager.get_memory_pool_size(), # memory pool size
|
530
|
+
"max_total_num_tokens": _max_total_num_tokens, # max total num tokens
|
531
|
+
}
|
532
|
+
|
533
|
+
|
523
534
|
def _set_envs_and_config(server_args: ServerArgs):
|
524
535
|
# Set global environments
|
525
536
|
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
|
@@ -761,6 +772,17 @@ class Runtime:
|
|
761
772
|
response = requests.post(self.url + "/encode", json=json_data)
|
762
773
|
return json.dumps(response.json())
|
763
774
|
|
775
|
+
async def get_server_info(self):
|
776
|
+
async with aiohttp.ClientSession() as session:
|
777
|
+
async with session.get(f"{self.url}/get_server_info") as response:
|
778
|
+
if response.status == 200:
|
779
|
+
return await response.json()
|
780
|
+
else:
|
781
|
+
error_data = await response.json()
|
782
|
+
raise RuntimeError(
|
783
|
+
f"Failed to get server info. {error_data['error']['message']}"
|
784
|
+
)
|
785
|
+
|
764
786
|
def __del__(self):
|
765
787
|
self.shutdown()
|
766
788
|
|
@@ -910,3 +932,6 @@ class Engine:
|
|
910
932
|
# get the current event loop
|
911
933
|
loop = asyncio.get_event_loop()
|
912
934
|
return loop.run_until_complete(encode_request(obj, None))
|
935
|
+
|
936
|
+
async def get_server_info(self):
|
937
|
+
return await _get_server_info()
|
sglang/srt/server_args.py
CHANGED
@@ -1,18 +1,16 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
"""
|
15
|
-
|
1
|
+
# Copyright 2023-2024 SGLang Team
|
2
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
3
|
+
# you may not use this file except in compliance with the License.
|
4
|
+
# You may obtain a copy of the License at
|
5
|
+
#
|
6
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
7
|
+
#
|
8
|
+
# Unless required by applicable law or agreed to in writing, software
|
9
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
10
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
11
|
+
# See the License for the specific language governing permissions and
|
12
|
+
# limitations under the License.
|
13
|
+
# ==============================================================================
|
16
14
|
"""The arguments of the server."""
|
17
15
|
|
18
16
|
import argparse
|
@@ -64,6 +62,7 @@ class ServerArgs:
|
|
64
62
|
max_prefill_tokens: int = 16384
|
65
63
|
schedule_policy: str = "lpm"
|
66
64
|
schedule_conservativeness: float = 1.0
|
65
|
+
cpu_offload_gb: int = 0
|
67
66
|
|
68
67
|
# Other runtime options
|
69
68
|
tp_size: int = 1
|
@@ -200,12 +199,6 @@ class ServerArgs:
|
|
200
199
|
"Overlap schedule is disabled."
|
201
200
|
)
|
202
201
|
|
203
|
-
if self.enable_mixed_chunk:
|
204
|
-
logger.info(
|
205
|
-
"Overlap schedule is disabled because mixed-style chunked prefill is enabled."
|
206
|
-
)
|
207
|
-
self.disable_overlap_schedule = True
|
208
|
-
|
209
202
|
@staticmethod
|
210
203
|
def add_cli_args(parser: argparse.ArgumentParser):
|
211
204
|
# Model and port args
|
@@ -308,7 +301,7 @@ class ServerArgs:
|
|
308
301
|
"--device",
|
309
302
|
type=str,
|
310
303
|
default="cuda",
|
311
|
-
choices=["cuda", "xpu"],
|
304
|
+
choices=["cuda", "xpu", "hpu"],
|
312
305
|
help="The device type.",
|
313
306
|
)
|
314
307
|
parser.add_argument(
|
@@ -375,6 +368,13 @@ class ServerArgs:
|
|
375
368
|
help="How conservative the schedule policy is. A larger value means more conservative scheduling. Use a larger value if you see requests being retracted frequently.",
|
376
369
|
)
|
377
370
|
|
371
|
+
parser.add_argument(
|
372
|
+
"--cpu-offload-gb",
|
373
|
+
type=int,
|
374
|
+
default=ServerArgs.cpu_offload_gb,
|
375
|
+
help="How many GBs of RAM to reserve for CPU offloading",
|
376
|
+
)
|
377
|
+
|
378
378
|
# Other runtime options
|
379
379
|
parser.add_argument(
|
380
380
|
"--tensor-parallel-size",
|