sglang-0.2.5-py3-none-any.whl → sglang-0.2.7-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +33 -26
- sglang/api.py +9 -1
- sglang/bench_latency.py +2 -2
- sglang/bench_serving.py +10 -1
- sglang/check_env.py +1 -1
- sglang/lang/backend/litellm.py +1 -1
- sglang/lang/backend/openai.py +1 -1
- sglang/lang/backend/runtime_endpoint.py +4 -4
- sglang/lang/interpreter.py +24 -9
- sglang/lang/ir.py +1 -1
- sglang/srt/constrained/__init__.py +15 -0
- sglang/srt/constrained/base_cache.py +15 -0
- sglang/srt/constrained/fsm_cache.py +36 -1
- sglang/srt/constrained/jump_forward.py +15 -0
- sglang/srt/conversation.py +26 -0
- sglang/srt/hf_transformers_utils.py +18 -1
- sglang/srt/layers/context_flashattention_nopad.py +15 -0
- sglang/srt/layers/extend_attention.py +15 -0
- sglang/srt/layers/fused_moe.py +15 -0
- sglang/srt/layers/linear.py +15 -0
- sglang/srt/layers/logits_processor.py +109 -72
- sglang/srt/layers/quantization/__init__.py +15 -0
- sglang/srt/layers/quantization/fp8.py +15 -0
- sglang/srt/layers/radix_attention.py +21 -3
- sglang/srt/layers/token_attention.py +16 -1
- sglang/srt/managers/{controller/manager_multi.py → controller_multi.py} +17 -2
- sglang/srt/managers/{controller/manager_single.py → controller_single.py} +17 -2
- sglang/srt/managers/detokenizer_manager.py +16 -1
- sglang/srt/managers/io_struct.py +38 -5
- sglang/srt/managers/{controller/schedule_heuristic.py → policy_scheduler.py} +37 -22
- sglang/srt/managers/{controller/infer_batch.py → schedule_batch.py} +85 -25
- sglang/srt/managers/tokenizer_manager.py +99 -57
- sglang/srt/managers/{controller/tp_worker.py → tp_worker.py} +177 -81
- sglang/srt/mem_cache/flush_cache.py +33 -0
- sglang/srt/{memory_pool.py → mem_cache/memory_pool.py} +16 -1
- sglang/srt/{managers/controller → mem_cache}/radix_cache.py +15 -0
- sglang/srt/mm_utils.py +15 -0
- sglang/srt/model_config.py +20 -0
- sglang/srt/{managers/controller → model_executor}/cuda_graph_runner.py +42 -18
- sglang/srt/{managers/controller → model_executor}/model_runner.py +51 -16
- sglang/srt/model_loader/model_loader.py +15 -0
- sglang/srt/model_loader/utils.py +16 -1
- sglang/srt/models/chatglm.py +16 -1
- sglang/srt/models/commandr.py +16 -1
- sglang/srt/models/dbrx.py +16 -1
- sglang/srt/models/deepseek.py +16 -1
- sglang/srt/models/deepseek_v2.py +532 -0
- sglang/srt/models/gemma.py +16 -1
- sglang/srt/models/gemma2.py +16 -1
- sglang/srt/models/gpt_bigcode.py +16 -1
- sglang/srt/models/grok.py +16 -1
- sglang/srt/models/internlm2.py +16 -1
- sglang/srt/models/llama2.py +16 -1
- sglang/srt/models/llama_classification.py +19 -4
- sglang/srt/models/llava.py +17 -2
- sglang/srt/models/llavavid.py +17 -2
- sglang/srt/models/minicpm.py +16 -1
- sglang/srt/models/mistral.py +15 -0
- sglang/srt/models/mixtral.py +16 -1
- sglang/srt/models/mixtral_quant.py +16 -1
- sglang/srt/models/qwen.py +16 -1
- sglang/srt/models/qwen2.py +16 -1
- sglang/srt/models/qwen2_moe.py +16 -1
- sglang/srt/models/stablelm.py +16 -1
- sglang/srt/models/yivl.py +15 -0
- sglang/srt/openai_api/adapter.py +545 -160
- sglang/srt/openai_api/protocol.py +65 -1
- sglang/srt/sampling_params.py +20 -4
- sglang/srt/server.py +90 -37
- sglang/srt/server_args.py +76 -17
- sglang/srt/utils.py +15 -0
- sglang/test/test_programs.py +5 -1
- sglang/utils.py +22 -0
- sglang/version.py +1 -1
- {sglang-0.2.5.dist-info → sglang-0.2.7.dist-info}/METADATA +40 -12
- sglang-0.2.7.dist-info/RECORD +93 -0
- {sglang-0.2.5.dist-info → sglang-0.2.7.dist-info}/WHEEL +1 -1
- sglang/srt/flush_cache.py +0 -18
- sglang-0.2.5.dist-info/RECORD +0 -92
- {sglang-0.2.5.dist-info → sglang-0.2.7.dist-info}/LICENSE +0 -0
- {sglang-0.2.5.dist-info → sglang-0.2.7.dist-info}/top_level.txt +0 -0
sglang/srt/openai_api/protocol.py
CHANGED
@@ -1,3 +1,18 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
 """Pydantic models for OpenAI API protocol"""
 
 import time
@@ -45,6 +60,55 @@ class UsageInfo(BaseModel):
     completion_tokens: Optional[int] = 0
 
 
+class FileRequest(BaseModel):
+    # https://platform.openai.com/docs/api-reference/files/create
+    file: bytes  # The File object (not file name) to be uploaded
+    purpose: str = (
+        "batch"  # The intended purpose of the uploaded file, default is "batch"
+    )
+
+
+class FileResponse(BaseModel):
+    id: str
+    object: str = "file"
+    bytes: int
+    created_at: int
+    filename: str
+    purpose: str
+
+
+class BatchRequest(BaseModel):
+    input_file_id: (
+        str  # The ID of an uploaded file that contains requests for the new batch
+    )
+    endpoint: str  # The endpoint to be used for all requests in the batch
+    completion_window: str  # The time frame within which the batch should be processed
+    metadata: Optional[dict] = None  # Optional custom metadata for the batch
+
+
+class BatchResponse(BaseModel):
+    id: str
+    object: str = "batch"
+    endpoint: str
+    errors: Optional[dict] = None
+    input_file_id: str
+    completion_window: str
+    status: str = "validating"
+    output_file_id: Optional[str] = None
+    error_file_id: Optional[str] = None
+    created_at: int
+    in_progress_at: Optional[int] = None
+    expires_at: Optional[int] = None
+    finalizing_at: Optional[int] = None
+    completed_at: Optional[int] = None
+    failed_at: Optional[int] = None
+    expired_at: Optional[int] = None
+    cancelling_at: Optional[int] = None
+    cancelled_at: Optional[int] = None
+    request_counts: dict = {"total": 0, "completed": 0, "failed": 0}
+    metadata: Optional[dict] = None
+
+
 class CompletionRequest(BaseModel):
     # Ordered by official OpenAI API documentation
     # https://platform.openai.com/docs/api-reference/completions/create
@@ -152,7 +216,7 @@ class ChatCompletionRequest(BaseModel):
     logit_bias: Optional[Dict[str, float]] = None
     logprobs: Optional[bool] = False
     top_logprobs: Optional[int] = None
-    max_tokens: Optional[int] =
+    max_tokens: Optional[int] = None
     n: Optional[int] = 1
     presence_penalty: Optional[float] = 0.0
     response_format: Optional[ResponseFormat] = None
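For orientation, a minimal sketch of how the new file/batch models compose. Field names come from the diff above; the IDs, timestamps, and model values are illustrative placeholders, not values the API guarantees.

    # Sketch: composing the new models from sglang.srt.openai_api.protocol.
    # IDs and timestamps below are hypothetical placeholders.
    import time

    from sglang.srt.openai_api.protocol import BatchRequest, BatchResponse, FileResponse

    uploaded = FileResponse(
        id="file-abc123",
        bytes=512,
        created_at=int(time.time()),
        filename="requests.jsonl",
        purpose="batch",
    )

    req = BatchRequest(
        input_file_id=uploaded.id,
        endpoint="/v1/chat/completions",
        completion_window="24h",
    )

    batch = BatchResponse(
        id="batch-xyz789",
        endpoint=req.endpoint,
        input_file_id=req.input_file_id,
        completion_window=req.completion_window,
        created_at=int(time.time()),
    )
    assert batch.status == "validating"  # the default initial state in the model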
sglang/srt/sampling_params.py
CHANGED
@@ -1,3 +1,18 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
 """Sampling parameters for text generation."""
 
 from typing import List, Optional, Union
@@ -65,10 +80,11 @@ class SamplingParams:
             raise ValueError(
                 "presence_penalty must be in [-2, 2], got " f"{self.presence_penalty}."
             )
-        if self.max_new_tokens < 0:
-            raise ValueError(
-                f"max_new_tokens must be at least 0, got {self.max_new_tokens}."
-            )
+        if self.max_new_tokens is not None:
+            if self.max_new_tokens < 0:
+                raise ValueError(
+                    f"max_new_tokens must be at least 0, got {self.max_new_tokens}."
+                )
 
     def normalize(self, tokenizer):
         # Process stop strings
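The practical effect: max_new_tokens may now be None (matching the new max_tokens: Optional[int] = None default in the OpenAI protocol above) without tripping the validator. A standalone sketch of the relaxed check; names outside the diff are illustrative.

    # Sketch of the relaxed validation above, reproduced standalone.
    from typing import Optional

    def check_max_new_tokens(max_new_tokens: Optional[int]) -> None:
        # Previously the comparison ran unconditionally, so passing None
        # would raise a TypeError; now it range-checks only given values.
        if max_new_tokens is not None:
            if max_new_tokens < 0:
                raise ValueError(
                    f"max_new_tokens must be at least 0, got {max_new_tokens}."
                )

    check_max_new_tokens(None)  # now accepted: no explicit cap requested
    check_max_new_tokens(128)   # still range-checked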
sglang/srt/server.py
CHANGED
@@ -1,3 +1,18 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
 """
 The entry point of inference server.
 SRT = SGLang Runtime.
@@ -23,17 +38,17 @@ import psutil
 import requests
 import uvicorn
 import uvloop
-from fastapi import FastAPI, Request
+from fastapi import FastAPI, File, Form, Request, UploadFile
 from fastapi.responses import JSONResponse, Response, StreamingResponse
 
 from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
 from sglang.srt.constrained import disable_cache
 from sglang.srt.hf_transformers_utils import get_tokenizer
-from sglang.srt.managers.controller.manager_multi import (
+from sglang.srt.managers.controller_multi import (
     start_controller_process as start_controller_process_multi,
 )
-from sglang.srt.managers.controller.manager_single import launch_tp_servers
-from sglang.srt.managers.controller.manager_single import (
+from sglang.srt.managers.controller_single import launch_tp_servers
+from sglang.srt.managers.controller_single import (
     start_controller_process as start_controller_process_single,
 )
 from sglang.srt.managers.detokenizer_manager import start_detokenizer_process
@@ -41,8 +56,13 @@ from sglang.srt.managers.io_struct import GenerateReqInput
 from sglang.srt.managers.tokenizer_manager import TokenizerManager
 from sglang.srt.openai_api.adapter import (
     load_chat_template_for_openai_api,
+    v1_batches,
     v1_chat_completions,
     v1_completions,
+    v1_files_create,
+    v1_retrieve_batch,
+    v1_retrieve_file,
+    v1_retrieve_file_content,
 )
 from sglang.srt.openai_api.protocol import ModelCard, ModelList
 from sglang.srt.server_args import PortArgs, ServerArgs
@@ -65,9 +85,6 @@ asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
 app = FastAPI()
 tokenizer_manager = None
 
-# Put some args for easily access
-global_server_args_dict = {}
-
 
 @app.get("/health")
 async def health() -> Response:
@@ -140,6 +157,35 @@ async def openai_v1_chat_completions(raw_request: Request):
     return await v1_chat_completions(tokenizer_manager, raw_request)
 
 
+@app.post("/v1/files")
+async def openai_v1_files(file: UploadFile = File(...), purpose: str = Form("batch")):
+    return await v1_files_create(
+        file, purpose, tokenizer_manager.server_args.file_storage_pth
+    )
+
+
+@app.post("/v1/batches")
+async def openai_v1_batches(raw_request: Request):
+    return await v1_batches(tokenizer_manager, raw_request)
+
+
+@app.get("/v1/batches/{batch_id}")
+async def retrieve_batch(batch_id: str):
+    return await v1_retrieve_batch(batch_id)
+
+
+@app.get("/v1/files/{file_id}")
+async def retrieve_file(file_id: str):
+    # https://platform.openai.com/docs/api-reference/files/retrieve
+    return await v1_retrieve_file(file_id)
+
+
+@app.get("/v1/files/{file_id}/content")
+async def retrieve_file_content(file_id: str):
+    # https://platform.openai.com/docs/api-reference/files/retrieve-contents
+    return await v1_retrieve_file_content(file_id)
+
+
 @app.get("/v1/models")
 def available_models():
     """Show available models."""
@@ -150,14 +196,6 @@ def available_models():
     return ModelList(data=model_cards)
 
 
-def _set_global_server_args(server_args: ServerArgs):
-    global global_server_args_dict
-    global_server_args_dict = {
-        "disable_flashinfer": server_args.disable_flashinfer,
-        "attention_reduce_in_fp32": server_args.attention_reduce_in_fp32,
-    }
-
-
 def _set_torch_compile_config():
     # The following configurations are for torch compile optimizations
     import torch._dynamo.config
@@ -171,11 +209,46 @@ def _set_torch_compile_config():
     torch._dynamo.config.accumulated_cache_size_limit = 256
 
 
+def set_envs_and_config(server_args: ServerArgs):
+    # Set global environments
+    os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
+    os.environ["NCCL_CUMEM_ENABLE"] = "0"
+    os.environ["NCCL_NVLS_ENABLE"] = "0"
+    os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1"
+
+    # Set ulimit
+    set_ulimit()
+
+    # Enable show time cost for debugging
+    if server_args.show_time_cost:
+        enable_show_time_cost()
+
+    # Disable disk cache
+    if server_args.disable_disk_cache:
+        disable_cache()
+
+    # Fix triton bugs
+    if server_args.tp_size * server_args.dp_size > 1:
+        # FIXME: remove this after https://github.com/triton-lang/triton/pull/4295 is used as a dependency.
+        maybe_set_triton_cache_manager()
+
+    # Set torch compile config
+    if server_args.enable_torch_compile:
+        _set_torch_compile_config()
+
+    # Set global chat template
+    if server_args.chat_template:
+        # TODO: replace this with huggingface transformers template
+        load_chat_template_for_openai_api(server_args.chat_template)
+
+
 def launch_server(
     server_args: ServerArgs,
     model_overide_args: Optional[dict] = None,
     pipe_finish_writer: Optional[mp.connection.Connection] = None,
 ):
+    server_args.check_server_args()
+
     """Launch an HTTP server."""
     global tokenizer_manager
 
@@ -184,34 +257,16 @@
         format="%(message)s",
     )
 
-    # Set global environments
-    os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
-    os.environ["NCCL_CUMEM_ENABLE"] = "0"
-    os.environ["NCCL_NVLS_ENABLE"] = "0"
-    os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1"
-    set_ulimit()
-    if server_args.show_time_cost:
-        enable_show_time_cost()
-    if server_args.disable_disk_cache:
-        disable_cache()
     if not server_args.disable_flashinfer:
         assert_pkg_version(
             "flashinfer",
-            "0.1.
+            "0.1.2",
             "Please uninstall the old version and "
             "reinstall the latest version by following the instructions "
             "at https://docs.flashinfer.ai/installation.html.",
         )
-    if server_args.tp_size * server_args.dp_size > 1:
-        # FIXME: remove this after https://github.com/triton-lang/triton/pull/4295 is used as a dependency.
-        maybe_set_triton_cache_manager()
-    if server_args.chat_template:
-        # TODO: replace this with huggingface transformers template
-        load_chat_template_for_openai_api(server_args.chat_template)
-    if server_args.enable_torch_compile:
-        _set_torch_compile_config()
 
-
+    set_envs_and_config(server_args)
 
     # Allocate ports
     server_args.port, server_args.additional_ports = allocate_init_ports(
@@ -230,8 +285,6 @@
 
     # Handle multi-node tensor parallelism
     if server_args.nnodes > 1:
-        assert server_args.dp_size == 1, "Multi-node dp is not supported."
-
         if server_args.node_rank != 0:
             tp_size_local = server_args.tp_size // server_args.nnodes
             gpu_ids = [
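Together with the protocol models above, the new routes support an upload-then-batch workflow. A sketch against a locally running server; the port and the .jsonl request format follow the OpenAI batch convention and are assumptions here, not something this diff pins down.

    # Sketch: exercising the new /v1/files and /v1/batches routes.
    import json

    import requests

    base = "http://localhost:30000"  # assumed local server address

    # 1) Upload a .jsonl file of requests; purpose defaults to "batch".
    line = {
        "custom_id": "req-1",
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {"model": "default", "messages": [{"role": "user", "content": "Hi"}]},
    }
    uploaded = requests.post(
        f"{base}/v1/files",
        files={"file": ("requests.jsonl", json.dumps(line).encode())},
        data={"purpose": "batch"},
    ).json()

    # 2) Create a batch over the uploaded file, then poll its status.
    batch = requests.post(
        f"{base}/v1/batches",
        json={
            "input_file_id": uploaded["id"],
            "endpoint": "/v1/chat/completions",
            "completion_window": "24h",
        },
    ).json()
    status = requests.get(f"{base}/v1/batches/{batch['id']}").json()

    # 3) When finished, results live in a retrievable output file.
    if status.get("output_file_id"):
        out = requests.get(f"{base}/v1/files/{status['output_file_id']}/content")
        print(out.text)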
sglang/srt/server_args.py
CHANGED
@@ -1,3 +1,18 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
 """The arguments of the server."""
 
 import argparse
@@ -28,7 +43,8 @@ class ServerArgs:
     mem_fraction_static: Optional[float] = None
     max_prefill_tokens: Optional[int] = None
     max_running_requests: Optional[int] = None
-    schedule_heuristic: str = "lpm"
+    max_num_reqs: Optional[int] = None
+    schedule_policy: str = "lpm"
     schedule_conservativeness: float = 1.0
 
     # Other runtime options
@@ -44,20 +60,25 @@ class ServerArgs:
 
     # Other
     api_key: str = ""
+    file_storage_pth: str = "SGlang_storage"
 
     # Data parallelism
     dp_size: int = 1
     load_balance_method: str = "round_robin"
 
+    # Chunked Prefill
+    chunked_prefill_size: Optional[int] = None
+
     # Optimization/debug options
     disable_flashinfer: bool = False
+    disable_flashinfer_sampling: bool = False
     disable_radix_cache: bool = False
     disable_regex_jump_forward: bool = False
     disable_cuda_graph: bool = False
     disable_disk_cache: bool = False
     enable_torch_compile: bool = False
-    attention_reduce_in_fp32: bool = False
     enable_p2p_check: bool = False
+    attention_reduce_in_fp32: bool = False
     efficient_weight_load: bool = False
 
     # Distributed args
|
|
70
91
|
self.tokenizer_path = self.model_path
|
71
92
|
if self.mem_fraction_static is None:
|
72
93
|
if self.tp_size >= 16:
|
73
|
-
self.mem_fraction_static = 0.
|
94
|
+
self.mem_fraction_static = 0.79
|
74
95
|
elif self.tp_size >= 8:
|
75
|
-
self.mem_fraction_static = 0.
|
96
|
+
self.mem_fraction_static = 0.83
|
76
97
|
elif self.tp_size >= 4:
|
77
|
-
self.mem_fraction_static = 0.
|
98
|
+
self.mem_fraction_static = 0.85
|
78
99
|
elif self.tp_size >= 2:
|
79
|
-
self.mem_fraction_static = 0.
|
100
|
+
self.mem_fraction_static = 0.87
|
80
101
|
else:
|
81
|
-
self.mem_fraction_static = 0.
|
102
|
+
self.mem_fraction_static = 0.88
|
82
103
|
if isinstance(self.additional_ports, int):
|
83
104
|
self.additional_ports = [self.additional_ports]
|
84
105
|
elif self.additional_ports is None:
|
@@ -174,6 +195,7 @@ class ServerArgs:
                 "gptq",
                 "marlin",
                 "gptq_marlin",
+                "awq_marlin",
                 "squeezellm",
                 "bitsandbytes",
             ],
@@ -204,11 +226,17 @@ class ServerArgs:
             help="The maximum number of running requests.",
         )
         parser.add_argument(
-            "--
+            "--max-num-reqs",
+            type=int,
+            default=ServerArgs.max_num_reqs,
+            help="The maximum number of requests to serve in the memory pool. If the model have a large context length, you may need to decrease this value to avoid out-of-memory errors.",
+        )
+        parser.add_argument(
+            "--schedule-policy",
             type=str,
-            default=ServerArgs.
+            default=ServerArgs.schedule_policy,
             choices=["lpm", "random", "fcfs", "dfs-weight"],
-            help="The scheduling
+            help="The scheduling policy of the requests.",
         )
         parser.add_argument(
             "--schedule-conservativeness",
@@ -262,6 +290,12 @@ class ServerArgs:
             default=ServerArgs.api_key,
             help="Set API key of the server.",
         )
+        parser.add_argument(
+            "--file-storage-pth",
+            type=str,
+            default=ServerArgs.file_storage_pth,
+            help="The path of the file storage in backend.",
+        )
 
         # Data parallelism
         parser.add_argument(
@@ -288,15 +322,28 @@ class ServerArgs:
             help="The nccl init address of multi-node server.",
         )
         parser.add_argument(
-            "--nnodes", type=int, default=
+            "--nnodes", type=int, default=ServerArgs.nnodes, help="The number of nodes."
         )
         parser.add_argument("--node-rank", type=int, help="The node rank.")
 
+        # Chunked prefill
+        parser.add_argument(
+            "--chunked-prefill-size",
+            type=int,
+            default=ServerArgs.chunked_prefill_size,
+            help="The size of the chunked prefill.",
+        )
+
         # Optimization/debug options
         parser.add_argument(
             "--disable-flashinfer",
             action="store_true",
-            help="Disable flashinfer
+            help="Disable flashinfer attention kernels.",
+        )
+        parser.add_argument(
+            "--disable-flashinfer-sampling",
+            action="store_true",
+            help="Disable flashinfer sampling kernels.",
         )
         parser.add_argument(
             "--disable-radix-cache",
@@ -324,15 +371,15 @@ class ServerArgs:
             help="Optimize the model with torch.compile, experimental feature.",
         )
         parser.add_argument(
-            "--
+            "--enable-p2p-check",
             action="store_true",
-            help="
-            "This only affects Triton attention kernels",
+            help="Enable P2P check for GPU access, otherwise the p2p access is allowed by default.",
         )
         parser.add_argument(
-            "--
+            "--attention-reduce-in-fp32",
             action="store_true",
-            help="
+            help="Cast the intermidiate attention results to fp32 to avoid possible crashes related to fp16."
+            "This only affects Triton attention kernels",
         )
         parser.add_argument(
             "--efficient-weight-load",
@@ -357,6 +404,18 @@ class ServerArgs:
             f"disable_disk_cache={self.disable_disk_cache}, "
         )
 
+    def check_server_args(self):
+        assert (
+            self.tp_size % self.nnodes == 0
+        ), "tp_size must be divisible by number of nodes"
+        assert not (
+            self.dp_size > 1 and self.node_rank is not None
+        ), "multi-node data parallel is not supported"
+
+        assert not (
+            self.chunked_prefill_size is not None and self.disable_radix_cache
+        ), "chunked prefill is not supported with radix cache disabled currently"
+
 
 @dataclasses.dataclass
 class PortArgs:
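check_server_args now runs at the top of launch_server (see the server.py hunk earlier), so bad flag combinations fail before any processes start. A sketch of what it enforces, using the dataclass directly; the model path is a placeholder.

    # Sketch: the launch-time validation added in check_server_args above.
    from sglang.srt.server_args import ServerArgs

    ok = ServerArgs(
        model_path="dummy/model",      # placeholder
        tp_size=4,
        nnodes=2,                      # 4 % 2 == 0 passes the divisibility assert
        schedule_policy="lpm",         # replaces the old schedule_heuristic field
        chunked_prefill_size=4096,     # new option; needs radix cache enabled
    )
    ok.check_server_args()

    bad = ServerArgs(
        model_path="dummy/model",
        chunked_prefill_size=4096,
        disable_radix_cache=True,      # conflicts with chunked prefill
    )
    try:
        bad.check_server_args()
    except AssertionError as e:
        print(e)  # chunked prefill needs the radix cache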
sglang/srt/utils.py
CHANGED
@@ -1,3 +1,18 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
 """Common utilities."""
 
 import base64
sglang/test/test_programs.py
CHANGED
@@ -118,7 +118,11 @@ def test_decode_json_regex():
     s += "}"
 
     ret = decode_json.run()
-    js_obj = json.loads(ret["json_output"])
+    try:
+        js_obj = json.loads(ret["json_output"])
+    except json.decoder.JSONDecodeError:
+        print(ret["json_output"])
+        raise
     assert isinstance(js_obj["name"], str)
     assert isinstance(js_obj["population"], int)
 
sglang/utils.py
CHANGED
@@ -1,6 +1,7 @@
 """Common utilities."""
 
 import base64
+import importlib
 import json
 import logging
 import signal
@@ -261,3 +262,24 @@ def graceful_registry(sub_module_name):
         logger.info(f"{sub_module_name} recive sigterm")
 
     signal.signal(signal.SIGTERM, graceful_shutdown)
+
+
+class LazyImport:
+    def __init__(self, module_name, class_name):
+        self.module_name = module_name
+        self.class_name = class_name
+        self._module = None
+
+    def _load(self):
+        if self._module is None:
+            module = importlib.import_module(self.module_name)
+            self._module = getattr(module, self.class_name)
+        return self._module
+
+    def __getattr__(self, name):
+        module = self._load()
+        return getattr(module, name)
+
+    def __call__(self, *args, **kwargs):
+        module = self._load()
+        return module(*args, **kwargs)
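LazyImport defers the underlying import until first attribute access or call, which keeps import sglang cheap when a backend is never used. A usage sketch; the target class exists in this package, but any module-path/class-name pair works, and the model name is illustrative.

    # Sketch: the real import happens inside _load(), not at definition time.
    from sglang.utils import LazyImport

    OpenAI = LazyImport("sglang.lang.backend.openai", "OpenAI")

    # Nothing has been imported from sglang.lang.backend.openai yet.
    name = OpenAI.class_name          # plain instance attribute, no load triggered
    backend = OpenAI("gpt-4o-mini")   # __call__ -> _load() -> import + construct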
sglang/version.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.2.5"
+__version__ = "0.2.7"