sglang 0.2.6__py3-none-any.whl → 0.2.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +33 -26
- sglang/api.py +9 -1
- sglang/bench_latency.py +2 -2
- sglang/bench_serving.py +10 -1
- sglang/check_env.py +1 -1
- sglang/lang/backend/litellm.py +1 -1
- sglang/lang/backend/openai.py +1 -1
- sglang/lang/interpreter.py +21 -5
- sglang/lang/ir.py +1 -2
- sglang/srt/constrained/__init__.py +15 -0
- sglang/srt/constrained/{base_cache.py → base_tool_cache.py} +17 -2
- sglang/srt/constrained/fsm_cache.py +17 -2
- sglang/srt/constrained/jump_forward.py +17 -2
- sglang/srt/conversation.py +26 -0
- sglang/srt/hf_transformers_utils.py +15 -0
- sglang/srt/layers/context_flashattention_nopad.py +15 -0
- sglang/srt/layers/extend_attention.py +15 -0
- sglang/srt/layers/fused_moe.py +15 -0
- sglang/srt/layers/linear.py +15 -0
- sglang/srt/layers/logits_processor.py +41 -13
- sglang/srt/layers/quantization/__init__.py +15 -0
- sglang/srt/layers/quantization/fp8.py +15 -0
- sglang/srt/layers/radix_attention.py +17 -2
- sglang/srt/layers/token_attention.py +16 -1
- sglang/srt/managers/{controller/manager_multi.py → controller_multi.py} +17 -2
- sglang/srt/managers/{controller/manager_single.py → controller_single.py} +17 -2
- sglang/srt/managers/detokenizer_manager.py +16 -1
- sglang/srt/managers/io_struct.py +36 -3
- sglang/srt/managers/{controller/schedule_heuristic.py → policy_scheduler.py} +37 -22
- sglang/srt/managers/{controller/infer_batch.py → schedule_batch.py} +60 -21
- sglang/srt/managers/tokenizer_manager.py +39 -16
- sglang/srt/managers/{controller/tp_worker.py → tp_worker.py} +159 -46
- sglang/srt/mem_cache/base_cache.py +43 -0
- sglang/srt/mem_cache/chunk_cache.py +60 -0
- sglang/srt/mem_cache/flush_cache.py +33 -0
- sglang/srt/{memory_pool.py → mem_cache/memory_pool.py} +16 -1
- sglang/srt/{managers/controller → mem_cache}/radix_cache.py +20 -2
- sglang/srt/mm_utils.py +15 -0
- sglang/srt/model_config.py +15 -0
- sglang/srt/{managers/controller → model_executor}/cuda_graph_runner.py +16 -1
- sglang/srt/{managers/controller → model_executor}/model_runner.py +49 -14
- sglang/srt/model_loader/model_loader.py +15 -0
- sglang/srt/model_loader/utils.py +16 -1
- sglang/srt/models/chatglm.py +16 -1
- sglang/srt/models/commandr.py +16 -1
- sglang/srt/models/dbrx.py +16 -1
- sglang/srt/models/deepseek.py +16 -1
- sglang/srt/models/deepseek_v2.py +16 -1
- sglang/srt/models/gemma.py +16 -1
- sglang/srt/models/gemma2.py +16 -1
- sglang/srt/models/gpt_bigcode.py +16 -1
- sglang/srt/models/grok.py +16 -1
- sglang/srt/models/internlm2.py +16 -1
- sglang/srt/models/llama2.py +21 -22
- sglang/srt/models/llama_classification.py +16 -1
- sglang/srt/models/llava.py +17 -2
- sglang/srt/models/llavavid.py +17 -2
- sglang/srt/models/minicpm.py +16 -1
- sglang/srt/models/mistral.py +15 -0
- sglang/srt/models/mixtral.py +16 -1
- sglang/srt/models/mixtral_quant.py +16 -1
- sglang/srt/models/qwen.py +16 -1
- sglang/srt/models/qwen2.py +16 -1
- sglang/srt/models/qwen2_moe.py +16 -1
- sglang/srt/models/stablelm.py +16 -1
- sglang/srt/models/yivl.py +15 -0
- sglang/srt/openai_api/adapter.py +569 -131
- sglang/srt/openai_api/protocol.py +84 -2
- sglang/srt/sampling_params.py +15 -0
- sglang/srt/server.py +92 -23
- sglang/srt/server_args.py +52 -11
- sglang/srt/utils.py +15 -0
- sglang/test/test_programs.py +9 -6
- sglang/utils.py +22 -0
- sglang/version.py +1 -1
- {sglang-0.2.6.dist-info → sglang-0.2.8.dist-info}/METADATA +33 -7
- sglang-0.2.8.dist-info/RECORD +95 -0
- {sglang-0.2.6.dist-info → sglang-0.2.8.dist-info}/WHEEL +1 -1
- sglang/srt/flush_cache.py +0 -18
- sglang-0.2.6.dist-info/RECORD +0 -93
- {sglang-0.2.6.dist-info → sglang-0.2.8.dist-info}/LICENSE +0 -0
- {sglang-0.2.6.dist-info → sglang-0.2.8.dist-info}/top_level.txt +0 -0
@@ -1,3 +1,18 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
 """Pydantic models for OpenAI API protocol"""
 
 import time
@@ -39,12 +54,79 @@ class LogProbs(BaseModel):
     top_logprobs: List[Optional[Dict[str, float]]] = Field(default_factory=list)
 
 
+class TopLogprob(BaseModel):
+    token: str
+    bytes: List[int]
+    logprob: float
+
+
+class ChatCompletionTokenLogprob(BaseModel):
+    token: str
+    bytes: List[int]
+    logprob: float
+    top_logprobs: List[TopLogprob]
+
+
+class ChoiceLogprobs(BaseModel):
+    # build for v1/chat/completions response
+    content: List[ChatCompletionTokenLogprob]
+
+
 class UsageInfo(BaseModel):
     prompt_tokens: int = 0
     total_tokens: int = 0
     completion_tokens: Optional[int] = 0
 
 
+class FileRequest(BaseModel):
+    # https://platform.openai.com/docs/api-reference/files/create
+    file: bytes  # The File object (not file name) to be uploaded
+    purpose: str = (
+        "batch"  # The intended purpose of the uploaded file, default is "batch"
+    )
+
+
+class FileResponse(BaseModel):
+    id: str
+    object: str = "file"
+    bytes: int
+    created_at: int
+    filename: str
+    purpose: str
+
+
+class BatchRequest(BaseModel):
+    input_file_id: (
+        str  # The ID of an uploaded file that contains requests for the new batch
+    )
+    endpoint: str  # The endpoint to be used for all requests in the batch
+    completion_window: str  # The time frame within which the batch should be processed
+    metadata: Optional[dict] = None  # Optional custom metadata for the batch
+
+
+class BatchResponse(BaseModel):
+    id: str
+    object: str = "batch"
+    endpoint: str
+    errors: Optional[dict] = None
+    input_file_id: str
+    completion_window: str
+    status: str = "validating"
+    output_file_id: Optional[str] = None
+    error_file_id: Optional[str] = None
+    created_at: int
+    in_progress_at: Optional[int] = None
+    expires_at: Optional[int] = None
+    finalizing_at: Optional[int] = None
+    completed_at: Optional[int] = None
+    failed_at: Optional[int] = None
+    expired_at: Optional[int] = None
+    cancelling_at: Optional[int] = None
+    cancelled_at: Optional[int] = None
+    request_counts: dict = {"total": 0, "completed": 0, "failed": 0}
+    metadata: Optional[dict] = None
+
+
 class CompletionRequest(BaseModel):
     # Ordered by official OpenAI API documentation
     # https://platform.openai.com/docs/api-reference/completions/create
@@ -175,8 +257,8 @@ class ChatMessage(BaseModel):
 class ChatCompletionResponseChoice(BaseModel):
     index: int
     message: ChatMessage
-    logprobs: Optional[LogProbs] = None
-    finish_reason:
+    logprobs: Optional[Union[LogProbs, ChoiceLogprobs]] = None
+    finish_reason: str
 
 
 class ChatCompletionResponse(BaseModel):
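For orientation, the sketch below (an illustration, not part of the diff) shows how the new logprob models nest; it assumes sglang 0.2.8 and pydantic are installed, and the field values are made up.

```python
# Minimal sketch of the new chat-completion logprob models added above.
# The token strings and logprob values are placeholders.
from sglang.srt.openai_api.protocol import (
    ChatCompletionTokenLogprob,
    ChoiceLogprobs,
    TopLogprob,
)

token_entry = ChatCompletionTokenLogprob(
    token="Paris",
    bytes=list("Paris".encode("utf-8")),
    logprob=-0.012,
    top_logprobs=[
        TopLogprob(token="Paris", bytes=list("Paris".encode("utf-8")), logprob=-0.012)
    ],
)

# ChoiceLogprobs is what a v1/chat/completions response choice can now carry
# in its `logprobs` field (alongside the older LogProbs shape).
choice_logprobs = ChoiceLogprobs(content=[token_entry])
print(choice_logprobs)
```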
sglang/srt/sampling_params.py
CHANGED
@@ -1,3 +1,18 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
 """Sampling parameters for text generation."""
 
 from typing import List, Optional, Union
sglang/srt/server.py
CHANGED
@@ -1,3 +1,18 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
 """
 The entry point of inference server.
 SRT = SGLang Runtime.
@@ -23,17 +38,17 @@ import psutil
 import requests
 import uvicorn
 import uvloop
-from fastapi import FastAPI, Request
+from fastapi import FastAPI, File, Form, Request, UploadFile
 from fastapi.responses import JSONResponse, Response, StreamingResponse
 
 from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
 from sglang.srt.constrained import disable_cache
 from sglang.srt.hf_transformers_utils import get_tokenizer
-from sglang.srt.managers.
+from sglang.srt.managers.controller_multi import (
     start_controller_process as start_controller_process_multi,
 )
-from sglang.srt.managers.
-from sglang.srt.managers.
+from sglang.srt.managers.controller_single import launch_tp_servers
+from sglang.srt.managers.controller_single import (
     start_controller_process as start_controller_process_single,
 )
 from sglang.srt.managers.detokenizer_manager import start_detokenizer_process
@@ -41,8 +56,13 @@ from sglang.srt.managers.io_struct import GenerateReqInput
 from sglang.srt.managers.tokenizer_manager import TokenizerManager
 from sglang.srt.openai_api.adapter import (
     load_chat_template_for_openai_api,
+    v1_batches,
     v1_chat_completions,
     v1_completions,
+    v1_files_create,
+    v1_retrieve_batch,
+    v1_retrieve_file,
+    v1_retrieve_file_content,
 )
 from sglang.srt.openai_api.protocol import ModelCard, ModelList
 from sglang.srt.server_args import PortArgs, ServerArgs
@@ -137,6 +157,35 @@ async def openai_v1_chat_completions(raw_request: Request):
     return await v1_chat_completions(tokenizer_manager, raw_request)
 
 
+@app.post("/v1/files")
+async def openai_v1_files(file: UploadFile = File(...), purpose: str = Form("batch")):
+    return await v1_files_create(
+        file, purpose, tokenizer_manager.server_args.file_storage_pth
+    )
+
+
+@app.post("/v1/batches")
+async def openai_v1_batches(raw_request: Request):
+    return await v1_batches(tokenizer_manager, raw_request)
+
+
+@app.get("/v1/batches/{batch_id}")
+async def retrieve_batch(batch_id: str):
+    return await v1_retrieve_batch(batch_id)
+
+
+@app.get("/v1/files/{file_id}")
+async def retrieve_file(file_id: str):
+    # https://platform.openai.com/docs/api-reference/files/retrieve
+    return await v1_retrieve_file(file_id)
+
+
+@app.get("/v1/files/{file_id}/content")
+async def retrieve_file_content(file_id: str):
+    # https://platform.openai.com/docs/api-reference/files/retrieve-contents
+    return await v1_retrieve_file_content(file_id)
+
+
 @app.get("/v1/models")
 def available_models():
     """Show available models."""
@@ -160,6 +209,39 @@ def _set_torch_compile_config():
     torch._dynamo.config.accumulated_cache_size_limit = 256
 
 
+def set_envs_and_config(server_args: ServerArgs):
+    # Set global environments
+    os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
+    os.environ["NCCL_CUMEM_ENABLE"] = "0"
+    os.environ["NCCL_NVLS_ENABLE"] = "0"
+    os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1"
+
+    # Set ulimit
+    set_ulimit()
+
+    # Enable show time cost for debugging
+    if server_args.show_time_cost:
+        enable_show_time_cost()
+
+    # Disable disk cache
+    if server_args.disable_disk_cache:
+        disable_cache()
+
+    # Fix triton bugs
+    if server_args.tp_size * server_args.dp_size > 1:
+        # FIXME: remove this after https://github.com/triton-lang/triton/pull/4295 is used as a dependency.
+        maybe_set_triton_cache_manager()
+
+    # Set torch compile config
+    if server_args.enable_torch_compile:
+        _set_torch_compile_config()
+
+    # Set global chat template
+    if server_args.chat_template:
+        # TODO: replace this with huggingface transformers template
+        load_chat_template_for_openai_api(server_args.chat_template)
+
+
 def launch_server(
     server_args: ServerArgs,
     model_overide_args: Optional[dict] = None,
@@ -175,32 +257,16 @@ def launch_server(
         format="%(message)s",
     )
 
-    # Set global environments
-    os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
-    os.environ["NCCL_CUMEM_ENABLE"] = "0"
-    os.environ["NCCL_NVLS_ENABLE"] = "0"
-    os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1"
-    set_ulimit()
-    if server_args.show_time_cost:
-        enable_show_time_cost()
-    if server_args.disable_disk_cache:
-        disable_cache()
     if not server_args.disable_flashinfer:
         assert_pkg_version(
             "flashinfer",
-            "0.1.
+            "0.1.3",
             "Please uninstall the old version and "
             "reinstall the latest version by following the instructions "
             "at https://docs.flashinfer.ai/installation.html.",
         )
-
-
-    maybe_set_triton_cache_manager()
-    if server_args.chat_template:
-        # TODO: replace this with huggingface transformers template
-        load_chat_template_for_openai_api(server_args.chat_template)
-    if server_args.enable_torch_compile:
-        _set_torch_compile_config()
+
+    set_envs_and_config(server_args)
 
     # Allocate ports
     server_args.port, server_args.additional_ports = allocate_init_ports(
@@ -413,6 +479,9 @@ class Runtime:
             parent.wait(timeout=5)
         self.pid = None
 
+    def cache_prefix(self, prefix: str):
+        self.endpoint.cache_prefix(prefix)
+
     def get_tokenizer(self):
         return get_tokenizer(
             self.server_args.tokenizer_path,
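The new routes mirror the OpenAI Files and Batches APIs. As an illustration only (not taken from the package), the sketch below drives them with the `openai` Python SDK; the base URL matches the README's default port, while the model name, file contents, and the JSONL request format are assumptions based on the OpenAI batch format.

```python
# Hypothetical client-side sketch for the /v1/files and /v1/batches routes added above.
import json

from openai import OpenAI

client = OpenAI(base_url="http://localhost:30000/v1", api_key="EMPTY")

# Each line of the input file is one request (OpenAI batch-style JSONL; assumed format).
with open("batch_input.jsonl", "w") as f:
    f.write(json.dumps({
        "custom_id": "req-1",
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {"model": "default", "messages": [{"role": "user", "content": "Hello"}]},
    }) + "\n")

# Upload the file (stored under file_storage_pth on the server side),
# then create and poll a batch job against the chat-completions endpoint.
uploaded = client.files.create(file=open("batch_input.jsonl", "rb"), purpose="batch")
batch = client.batches.create(
    input_file_id=uploaded.id,
    endpoint="/v1/chat/completions",
    completion_window="24h",
)
print(client.batches.retrieve(batch.id).status)
```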
sglang/srt/server_args.py
CHANGED
@@ -1,3 +1,18 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
 """The arguments of the server."""
 
 import argparse
@@ -29,7 +44,8 @@ class ServerArgs:
     max_prefill_tokens: Optional[int] = None
     max_running_requests: Optional[int] = None
     max_num_reqs: Optional[int] = None
-
+    max_total_tokens: Optional[int] = None
+    schedule_policy: str = "lpm"
     schedule_conservativeness: float = 1.0
 
     # Other runtime options
@@ -45,11 +61,15 @@ class ServerArgs:
 
     # Other
     api_key: str = ""
+    file_storage_pth: str = "SGlang_storage"
 
     # Data parallelism
     dp_size: int = 1
     load_balance_method: str = "round_robin"
 
+    # Chunked Prefill
+    chunked_prefill_size: Optional[int] = None
+
     # Optimization/debug options
     disable_flashinfer: bool = False
     disable_flashinfer_sampling: bool = False
@@ -72,15 +92,15 @@ class ServerArgs:
             self.tokenizer_path = self.model_path
         if self.mem_fraction_static is None:
             if self.tp_size >= 16:
-                self.mem_fraction_static = 0.
+                self.mem_fraction_static = 0.79
             elif self.tp_size >= 8:
-                self.mem_fraction_static = 0.
+                self.mem_fraction_static = 0.83
             elif self.tp_size >= 4:
-                self.mem_fraction_static = 0.
+                self.mem_fraction_static = 0.85
             elif self.tp_size >= 2:
-                self.mem_fraction_static = 0.
+                self.mem_fraction_static = 0.87
             else:
-                self.mem_fraction_static = 0.
+                self.mem_fraction_static = 0.88
         if isinstance(self.additional_ports, int):
             self.additional_ports = [self.additional_ports]
         elif self.additional_ports is None:
@@ -176,6 +196,7 @@ class ServerArgs:
             "gptq",
             "marlin",
             "gptq_marlin",
+            "awq_marlin",
             "squeezellm",
             "bitsandbytes",
         ],
@@ -208,15 +229,21 @@ class ServerArgs:
         parser.add_argument(
             "--max-num-reqs",
             type=int,
-            default=
+            default=ServerArgs.max_num_reqs,
             help="The maximum number of requests to serve in the memory pool. If the model have a large context length, you may need to decrease this value to avoid out-of-memory errors.",
         )
         parser.add_argument(
-            "--
+            "--max-total-tokens",
+            type=int,
+            default=ServerArgs.max_total_tokens,
+            help="The maximum number of tokens in the memory pool. If not specified, it will be automatically calculated based on the memory usage fraction. This option is typically used for development and debugging purposes.",
+        )
+        parser.add_argument(
+            "--schedule-policy",
             type=str,
-            default=ServerArgs.
+            default=ServerArgs.schedule_policy,
             choices=["lpm", "random", "fcfs", "dfs-weight"],
-            help="The scheduling
+            help="The scheduling policy of the requests.",
         )
         parser.add_argument(
             "--schedule-conservativeness",
@@ -270,6 +297,12 @@ class ServerArgs:
             default=ServerArgs.api_key,
             help="Set API key of the server.",
         )
+        parser.add_argument(
+            "--file-storage-pth",
+            type=str,
+            default=ServerArgs.file_storage_pth,
+            help="The path of the file storage in backend.",
+        )
 
         # Data parallelism
         parser.add_argument(
@@ -296,10 +329,18 @@ class ServerArgs:
             help="The nccl init address of multi-node server.",
         )
         parser.add_argument(
-            "--nnodes", type=int, default=
+            "--nnodes", type=int, default=ServerArgs.nnodes, help="The number of nodes."
        )
         parser.add_argument("--node-rank", type=int, help="The node rank.")
 
+        # Chunked prefill
+        parser.add_argument(
+            "--chunked-prefill-size",
+            type=int,
+            default=ServerArgs.chunked_prefill_size,
+            help="The size of the chunked prefill.",
+        )
+
         # Optimization/debug options
         parser.add_argument(
             "--disable-flashinfer",
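As an illustration of the new options (not part of the diff), the sketch below constructs `ServerArgs` directly; the model path is a placeholder and the chosen values are arbitrary. The same options are exposed on the CLI as `--max-total-tokens`, `--schedule-policy`, `--chunked-prefill-size`, and `--file-storage-pth`.

```python
# Illustrative only: showing the ServerArgs fields introduced in 0.2.8.
from sglang.srt.server_args import ServerArgs

args = ServerArgs(
    model_path="meta-llama/Meta-Llama-3-8B-Instruct",  # placeholder model
    schedule_policy="lpm",             # "lpm", "random", "fcfs", or "dfs-weight"
    max_total_tokens=None,             # auto-derived from mem_fraction_static when unset
    chunked_prefill_size=4096,         # new: chunked prefill size (arbitrary value here)
    file_storage_pth="SGlang_storage", # new: where /v1/files uploads are stored
)
# mem_fraction_static is filled in automatically based on tp_size (see the hunk above).
print(args.mem_fraction_static)
```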
sglang/srt/utils.py
CHANGED
@@ -1,3 +1,18 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
 """Common utilities."""
 
 import base64
sglang/test/test_programs.py
CHANGED
@@ -113,15 +113,14 @@ def test_decode_json_regex():
         s += ' "population": ' + sgl.gen(regex=REGEX_INT + ",") + "\n"
         s += ' "area": ' + sgl.gen(regex=REGEX_INT + ",") + "\n"
         s += ' "latitude": ' + sgl.gen(regex=REGEX_FLOAT + ",") + "\n"
-        s += ' "country": ' + sgl.gen(regex=REGEX_STRING
-        s += ' "timezone": ' + sgl.gen(regex=REGEX_STRING) + "\n"
+        s += ' "country": ' + sgl.gen(regex=REGEX_STRING) + "\n"
         s += "}"
 
-    ret = decode_json.run()
+    ret = decode_json.run(temperature=0.0)
     try:
         js_obj = json.loads(ret["json_output"])
     except json.decoder.JSONDecodeError:
-        print(ret["json_output"])
+        print("JSONDecodeError", ret["json_output"])
         raise
     assert isinstance(js_obj["name"], str)
     assert isinstance(js_obj["population"], int)
@@ -141,8 +140,12 @@ def test_decode_json():
         s += ' "timezone": ' + sgl.gen(dtype=str) + "\n"
         s += "}"
 
-    ret = decode_json.run()
-
+    ret = decode_json.run(max_new_tokens=64)
+    try:
+        js_obj = json.loads(ret["json_output"])
+    except json.decoder.JSONDecodeError:
+        print("JSONDecodeError", ret["json_output"])
+        raise
     assert isinstance(js_obj["name"], str)
     assert isinstance(js_obj["population"], int)
 
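The updated tests pass per-call sampling overrides to `.run()`. A minimal standalone sketch of that pattern follows (illustrative, not from the package); the backend URL is a placeholder for a running sglang server.

```python
# Sketch of overriding sampling parameters per call via .run().
import sglang as sgl

sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000"))

@sgl.function
def describe_city(s):
    s += "Describe Paris in one sentence.\n"
    s += sgl.gen("description", stop="\n")

# Greedy decoding and a hard cap on new tokens, as in the updated tests.
ret = describe_city.run(temperature=0.0, max_new_tokens=64)
print(ret["description"])
```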
sglang/utils.py
CHANGED
@@ -1,6 +1,7 @@
 """Common utilities."""
 
 import base64
+import importlib
 import json
 import logging
 import signal
@@ -261,3 +262,24 @@ def graceful_registry(sub_module_name):
         logger.info(f"{sub_module_name} recive sigterm")
 
     signal.signal(signal.SIGTERM, graceful_shutdown)
+
+
+class LazyImport:
+    def __init__(self, module_name, class_name):
+        self.module_name = module_name
+        self.class_name = class_name
+        self._module = None
+
+    def _load(self):
+        if self._module is None:
+            module = importlib.import_module(self.module_name)
+            self._module = getattr(module, self.class_name)
+        return self._module
+
+    def __getattr__(self, name):
+        module = self._load()
+        return getattr(module, name)
+
+    def __call__(self, *args, **kwargs):
+        module = self._load()
+        return module(*args, **kwargs)
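The new `LazyImport` helper defers the actual import until the first attribute access or call. A minimal usage sketch (illustrative; `json`/`JSONDecoder` stand in for a heavyweight optional dependency):

```python
# The target module is only imported when the proxy is first used.
from sglang.utils import LazyImport

JSONDecoder = LazyImport("json", "JSONDecoder")  # nothing imported yet
decoder = JSONDecoder()                          # triggers importlib.import_module("json")
print(decoder.decode('{"ok": true}'))
```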
sglang/version.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.2.6"
+__version__ = "0.2.8"
{sglang-0.2.6.dist-info → sglang-0.2.8.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.2.6
+Version: 0.2.8
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                                  Version 2.0, January 2004
@@ -245,6 +245,13 @@ Requires-Dist: outlines >=0.0.44 ; extra == 'srt'
 
 <div align="center">
 <img src="https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" alt="logo" width="400"></img>
+
+[](https://pypi.org/project/sglang)
+
+[](https://github.com/sgl-project/sglang/tree/main/LICENSE)
+[](https://github.com/sgl-project/sglang/issues)
+[](https://github.com/sgl-project/sglang/issues)
+
 </div>
 
 --------------------------------------------------------------------------------
@@ -292,7 +299,8 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
 
 ### Method 2: From source
 ```
-
+# Use the stable v0.2.8 branch
+git clone -b v0.2.8 https://github.com/sgl-project/sglang.git
 cd sglang
 
 pip install --upgrade pip
@@ -304,7 +312,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
 
 ### Method 3: Using docker
 The docker images are available on Docker Hub as [lmsysorg/sglang](https://hub.docker.com/r/lmsysorg/sglang/tags), built from [Dockerfile](docker).
-
+Replace `<secret>` below with your huggingface hub [token](https://huggingface.co/docs/hub/en/security-tokens).
 
 ```bash
 docker run --gpus all \
@@ -341,7 +349,7 @@ curl http://localhost:30000/generate \
 }
 }'
 ```
-Learn more about the argument format [here](docs/sampling_params.md).
+Learn more about the argument format [here](docs/en/sampling_params.md).
 
 ### OpenAI Compatible API
 In addition, the server supports OpenAI-compatible APIs.
@@ -388,7 +396,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 ```
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --mem-fraction-static 0.7
 ```
-- See [hyperparameter_tuning.md](docs/hyperparameter_tuning.md) on tuning hyperparameters for better performance.
+- See [hyperparameter_tuning.md](docs/en/hyperparameter_tuning.md) on tuning hyperparameters for better performance.
 - Add `--nnodes 2` to run tensor parallelism on multiple nodes. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port.
 ```
 # Node 0
@@ -397,7 +405,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 # Node 1
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-0:50000 --nnodes 2 --node-rank 1
 ```
-- If the model does not have a template in the Hugging Face tokenizer, you can specify a [custom chat template](docs/custom_chat_template.md).
+- If the model does not have a template in the Hugging Face tokenizer, you can specify a [custom chat template](docs/en/custom_chat_template.md).
 - To enable fp8 quantization, you can add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
 - To enable experimental torch.compile support, you can add `--enable-torch-compile`. It accelerates small models on small batch sizes.
@@ -440,7 +448,7 @@ GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/
 - InternLM 2
 - Mistral NeMo
 
-Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/model_support.md).
+Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/en/model_support.md).
 
 ### Benchmark Performance
 
@@ -671,6 +679,24 @@ for out in state.text_iter():
     print(out, end="", flush=True)
 ```
 
+#### Roles
+
+Use `sgl.system`, `sgl.user` and `sgl.assistant` to set roles when using Chat models. You can also define more complex role prompts using begin and end tokens.
+
+```python
+@sgl.function
+def chat_example(s):
+    s += sgl.system("You are a helpful assistant.")
+    # Same as: s += s.system("You are a helpful assistant.")
+
+    with s.user():
+        s += "Question: What is the capital of France?"
+
+    s += sgl.assistant_begin()
+    s += "Answer: " + sgl.gen(max_tokens=100, stop="\n")
+    s += sgl.assistant_end()
+```
+
 #### Tips and Implementation Details
 - The `choices` argument in `sgl.gen` is implemented by computing the [token-length normalized log probabilities](https://blog.eleuther.ai/multiple-choice-normalization/) of all choices and selecting the one with the highest probability.
 - The `regex` argument in `sgl.gen` is implemented through autoregressive decoding with logit bias masking, according to the constraints set by the regex. It is compatible with `temperature=0` and `temperature != 0`.
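The two bullets above describe how constrained generation is implemented; the sketch below (illustrative, not from the README) shows the corresponding `choices` and `regex` arguments in use. It assumes a local sglang server; the prompt text is arbitrary.

```python
# Sketch of constrained generation with `choices` and `regex`.
import sglang as sgl

sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000"))

@sgl.function
def constrained(s):
    s += "Is Paris the capital of France? "
    # Selected via token-length normalized log probabilities over the choices.
    s += sgl.gen("answer", choices=["yes", "no"])
    s += "\nA three-digit number: "
    # Decoding is constrained by logit bias masking derived from the regex.
    s += sgl.gen("number", regex=r"[0-9]{3}")

state = constrained.run(temperature=0)
print(state["answer"], state["number"])
```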
|