sglang 0.4.0__py3-none-any.whl → 0.4.0.post2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +1 -1
- sglang/bench_offline_throughput.py +18 -6
- sglang/bench_one_batch.py +13 -0
- sglang/bench_serving.py +8 -1
- sglang/check_env.py +140 -48
- sglang/lang/backend/runtime_endpoint.py +1 -0
- sglang/lang/chat_template.py +32 -0
- sglang/llama3_eval.py +316 -0
- sglang/srt/constrained/outlines_backend.py +5 -0
- sglang/srt/constrained/xgrammar_backend.py +9 -6
- sglang/srt/layers/attention/__init__.py +5 -2
- sglang/srt/layers/attention/double_sparsity_backend.py +22 -8
- sglang/srt/layers/attention/flashinfer_backend.py +22 -5
- sglang/srt/layers/attention/torch_native_backend.py +22 -8
- sglang/srt/layers/attention/triton_backend.py +38 -33
- sglang/srt/layers/attention/triton_ops/decode_attention.py +305 -350
- sglang/srt/layers/attention/triton_ops/extend_attention.py +3 -0
- sglang/srt/layers/ep_moe/__init__.py +0 -0
- sglang/srt/layers/ep_moe/kernels.py +349 -0
- sglang/srt/layers/ep_moe/layer.py +665 -0
- sglang/srt/layers/fused_moe_triton/fused_moe.py +64 -21
- sglang/srt/layers/fused_moe_triton/layer.py +1 -1
- sglang/srt/layers/logits_processor.py +133 -95
- sglang/srt/layers/quantization/__init__.py +2 -47
- sglang/srt/layers/quantization/fp8.py +607 -0
- sglang/srt/layers/quantization/fp8_utils.py +27 -0
- sglang/srt/layers/radix_attention.py +11 -2
- sglang/srt/layers/sampler.py +29 -5
- sglang/srt/layers/torchao_utils.py +58 -45
- sglang/srt/managers/detokenizer_manager.py +37 -17
- sglang/srt/managers/io_struct.py +39 -10
- sglang/srt/managers/schedule_batch.py +39 -24
- sglang/srt/managers/schedule_policy.py +64 -5
- sglang/srt/managers/scheduler.py +236 -197
- sglang/srt/managers/tokenizer_manager.py +99 -58
- sglang/srt/managers/tp_worker_overlap_thread.py +7 -5
- sglang/srt/mem_cache/base_prefix_cache.py +2 -2
- sglang/srt/mem_cache/chunk_cache.py +2 -2
- sglang/srt/mem_cache/memory_pool.py +5 -1
- sglang/srt/mem_cache/radix_cache.py +12 -2
- sglang/srt/model_executor/cuda_graph_runner.py +39 -11
- sglang/srt/model_executor/model_runner.py +24 -9
- sglang/srt/model_parallel.py +67 -10
- sglang/srt/models/commandr.py +2 -2
- sglang/srt/models/deepseek_v2.py +87 -7
- sglang/srt/models/gemma2.py +34 -0
- sglang/srt/models/gemma2_reward.py +0 -1
- sglang/srt/models/granite.py +517 -0
- sglang/srt/models/grok.py +72 -13
- sglang/srt/models/llama.py +22 -5
- sglang/srt/models/llama_classification.py +11 -23
- sglang/srt/models/llama_reward.py +0 -2
- sglang/srt/models/llava.py +37 -14
- sglang/srt/models/mixtral.py +12 -9
- sglang/srt/models/phi3_small.py +0 -5
- sglang/srt/models/qwen2.py +20 -0
- sglang/srt/models/qwen2_moe.py +0 -5
- sglang/srt/models/torch_native_llama.py +0 -5
- sglang/srt/openai_api/adapter.py +4 -0
- sglang/srt/openai_api/protocol.py +9 -4
- sglang/srt/sampling/sampling_batch_info.py +9 -8
- sglang/srt/server.py +4 -4
- sglang/srt/server_args.py +62 -13
- sglang/srt/utils.py +57 -10
- sglang/test/test_utils.py +3 -2
- sglang/utils.py +10 -3
- sglang/version.py +1 -1
- {sglang-0.4.0.dist-info → sglang-0.4.0.post2.dist-info}/METADATA +15 -9
- {sglang-0.4.0.dist-info → sglang-0.4.0.post2.dist-info}/RECORD +72 -65
- {sglang-0.4.0.dist-info → sglang-0.4.0.post2.dist-info}/LICENSE +0 -0
- {sglang-0.4.0.dist-info → sglang-0.4.0.post2.dist-info}/WHEEL +0 -0
- {sglang-0.4.0.dist-info → sglang-0.4.0.post2.dist-info}/top_level.txt +0 -0
sglang/srt/openai_api/adapter.py
CHANGED
@@ -510,6 +510,8 @@ def v1_generate_request(
             "stop": request.stop,
             "stop_token_ids": request.stop_token_ids,
             "top_p": request.top_p,
+            "top_k": request.top_k,
+            "min_p": request.min_p,
             "presence_penalty": request.presence_penalty,
             "frequency_penalty": request.frequency_penalty,
             "repetition_penalty": request.repetition_penalty,
@@ -926,6 +928,8 @@ def v1_chat_generate_request(
             "stop": stop,
             "stop_token_ids": request.stop_token_ids,
             "top_p": request.top_p,
+            "top_k": request.top_k,
+            "min_p": request.min_p,
             "presence_penalty": request.presence_penalty,
             "frequency_penalty": request.frequency_penalty,
             "repetition_penalty": request.repetition_penalty,
sglang/srt/openai_api/protocol.py
CHANGED
@@ -166,17 +166,19 @@ class CompletionRequest(BaseModel):
     temperature: float = 1.0
     top_p: float = 1.0
     user: Optional[str] = None
-    lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None
 
     # Extra parameters for SRT backend only and will be ignored by OpenAI models.
-
-
+    top_k: int = -1
+    min_p: float = 0.0
     min_tokens: int = 0
+    regex: Optional[str] = None
+    json_schema: Optional[str] = None
     repetition_penalty: float = 1.0
     stop_token_ids: Optional[List[int]] = None
     no_stop_trim: bool = False
     ignore_eos: bool = False
     skip_special_tokens: bool = True
+    lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None
 
 
 class CompletionResponseChoice(BaseModel):
@@ -276,13 +278,16 @@ class ChatCompletionRequest(BaseModel):
     user: Optional[str] = None
 
     # Extra parameters for SRT backend only and will be ignored by OpenAI models.
-
+    top_k: int = -1
+    min_p: float = 0.0
     min_tokens: int = 0
+    regex: Optional[str] = None
     repetition_penalty: float = 1.0
     stop_token_ids: Optional[List[int]] = None
     no_stop_trim: bool = False
     ignore_eos: bool = False
     skip_special_tokens: bool = True
+    lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None
 
 
 class ChatMessage(BaseModel):
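Since `CompletionRequest` and `ChatCompletionRequest` now expose `top_k` and `min_p`, and `v1_generate_request` / `v1_chat_generate_request` forward them into the sampling parameters, clients can set these knobs directly in the body of the OpenAI-compatible endpoint. A minimal sketch using `requests`; the port, model name, and values are placeholders, not anything prescribed by the diff:

```python
import requests

# Placeholder endpoint and model for a locally launched SGLang server.
resp = requests.post(
    "http://localhost:30000/v1/chat/completions",
    json={
        "model": "meta-llama/Llama-3.1-8B-Instruct",
        "messages": [{"role": "user", "content": "Name three prime numbers."}],
        "temperature": 0.7,
        # SRT-only extras added in 0.4.0.post2; plain OpenAI servers ignore them.
        "top_k": 20,
        "min_p": 0.05,
    },
    timeout=60,
)
print(resp.json()["choices"][0]["message"]["content"])
```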
sglang/srt/sampling/sampling_batch_info.py
CHANGED
@@ -158,22 +158,23 @@ class SamplingBatchInfo:
             return
 
         # find a grammar from the list
-
+        first_grammar = next(grammar for grammar in self.grammars if grammar)
 
         # maybe we can reuse the existing mask?
-        self.vocab_mask =
+        self.vocab_mask = first_grammar.allocate_vocab_mask(
             vocab_size=self.vocab_size,
             batch_size=len(self.temperatures),
             device=self.device,
         )
-        self.apply_mask =
+        self.apply_mask = first_grammar.apply_vocab_mask  # force to use static method
 
+        # Apply the mask
         for i, grammar in enumerate(self.grammars):
-            if grammar
-
-
-
-
+            if grammar and not grammar.finished:
+                grammar.fill_vocab_mask(self.vocab_mask, i)
+
+        # Move the mask to the device if needed
+        self.vocab_mask = first_grammar.move_vocab_mask(self.vocab_mask, self.device)
 
     def filter_batch(self, unfinished_indices: List[int], new_indices: torch.Tensor):
         self.penalizer_orchestrator.filter(unfinished_indices, new_indices)
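The rewritten mask-building code in `SamplingBatchInfo` follows an allocate → fill → move → apply flow driven by the first non-empty grammar in the batch. Below is a self-contained toy sketch of that flow; `ToyGrammar` is invented for illustration and only mimics the interface the diff relies on (the real grammar objects come from the Outlines/XGrammar backends under `sglang/srt/constrained/`):

```python
import torch

class ToyGrammar:
    """Illustrative stand-in exposing the vocab-mask interface used above."""

    finished = False

    @staticmethod
    def allocate_vocab_mask(vocab_size: int, batch_size: int, device) -> torch.Tensor:
        return torch.zeros(batch_size, vocab_size, dtype=torch.bool, device=device)

    def fill_vocab_mask(self, vocab_mask: torch.Tensor, idx: int) -> None:
        vocab_mask[idx, ::2] = True  # pretend every even token id is disallowed

    @staticmethod
    def move_vocab_mask(vocab_mask: torch.Tensor, device) -> torch.Tensor:
        return vocab_mask.to(device, non_blocking=True)

    @staticmethod
    def apply_vocab_mask(logits: torch.Tensor, vocab_mask: torch.Tensor) -> None:
        logits.masked_fill_(vocab_mask, float("-inf"))

grammars = [ToyGrammar(), None, ToyGrammar()]
first_grammar = next(g for g in grammars if g)
vocab_mask = first_grammar.allocate_vocab_mask(
    vocab_size=8, batch_size=len(grammars), device="cpu"
)
for i, g in enumerate(grammars):
    if g and not g.finished:
        g.fill_vocab_mask(vocab_mask, i)
vocab_mask = first_grammar.move_vocab_mask(vocab_mask, "cpu")

logits = torch.randn(len(grammars), 8)
first_grammar.apply_vocab_mask(logits, vocab_mask)  # rows 0 and 2 are constrained
print(logits)
```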
sglang/srt/server.py
CHANGED
@@ -196,7 +196,7 @@ async def stop_profile_async():
 @app.post("/update_weights_from_disk")
 @time_func_latency
 async def update_weights_from_disk(obj: UpdateWeightFromDiskReqInput, request: Request):
-    """Update the weights from disk
+    """Update the weights from disk in-place without re-launching the server."""
     success, message = await tokenizer_manager.update_weights_from_disk(obj, request)
     content = {"success": success, "message": message}
     if success:
@@ -329,7 +329,7 @@ async def encode_request(obj: EmbeddingReqInput, request: Request):
     )
 
 
-@app.api_route("/
+@app.api_route("/classify", methods=["POST", "PUT"])
 @time_func_latency
 async def classify_request(obj: EmbeddingReqInput, request: Request):
     """Handle a reward model request. Now the arguments and return values are the same as embedding models."""
@@ -462,8 +462,8 @@ def launch_engine(
     if server_args.node_rank >= 1:
         # For other nodes, they do not need to run tokenizer or detokenizer,
         # so they can just wait here.
-
-
+        for proc in scheduler_procs:
+            proc.join()
     else:
         # Launch the data parallel controller
         reader, writer = mp.Pipe(duplex=False)
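With the reward/classification route now registered as `/classify` (accepting POST or PUT) and reusing `EmbeddingReqInput`, it can be called like an embedding endpoint. A rough sketch; the port and the `text` payload field are assumptions about a local deployment, not part of the diff:

```python
import requests

# Placeholder for a locally launched SGLang server hosting a reward model.
resp = requests.post(
    "http://localhost:30000/classify",
    json={"text": "SGLang is a fast serving framework."},
    timeout=60,
)
print(resp.status_code, resp.json())
```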
sglang/srt/server_args.py
CHANGED
@@ -20,9 +20,12 @@ import random
 import tempfile
 from typing import List, Optional
 
+import torch
+
 from sglang.srt.hf_transformers_utils import check_gguf_file
 from sglang.srt.utils import (
     get_amdgpu_memory_capacity,
+    get_hpu_memory_capacity,
     get_nvgpu_memory_capacity,
     is_flashinfer_available,
     is_hip,
@@ -91,6 +94,8 @@ class ServerArgs:
     # Data parallelism
     dp_size: int = 1
     load_balance_method: str = "round_robin"
+    # Expert parallelism
+    ep_size: int = 1
 
     # Multi-node distributed serving
     dist_init_addr: Optional[str] = None
@@ -128,6 +133,7 @@ class ServerArgs:
     disable_overlap_schedule: bool = False
     enable_mixed_chunk: bool = False
     enable_dp_attention: bool = False
+    enable_ep_moe: bool = False
     enable_torch_compile: bool = False
     torch_compile_max_bs: int = 32
     cuda_graph_max_bs: Optional[int] = None
@@ -135,6 +141,7 @@ class ServerArgs:
     enable_nan_detection: bool = False
     enable_p2p_check: bool = False
     triton_attention_reduce_in_fp32: bool = False
+    triton_attention_num_kv_splits: int = 8
     num_continuous_decode_steps: int = 1
     delete_ckpt_after_loading: bool = False
 
@@ -151,8 +158,13 @@ class ServerArgs:
 
         if is_hip():
             gpu_mem = get_amdgpu_memory_capacity()
-
+        elif torch.cuda.is_available():
             gpu_mem = get_nvgpu_memory_capacity()
+        elif self.device == "hpu":
+            gpu_mem = get_hpu_memory_capacity()
+        else:
+            # GPU memory is not known yet or no GPU is available.
+            gpu_mem = None
 
         # Set mem fraction static, which depends on the tensor parallelism size
         if self.mem_fraction_static is None:
@@ -169,19 +181,27 @@ class ServerArgs:
 
         # Set chunked prefill size, which depends on the gpu memory capacity
         if self.chunked_prefill_size is None:
-            if gpu_mem < 25_000:
+            if gpu_mem is not None and gpu_mem < 25_000:
                 self.chunked_prefill_size = 2048
             else:
                 self.chunked_prefill_size = 8192
 
         # Set cuda graph max batch size
         if self.cuda_graph_max_bs is None:
-
-
+            # Based on detailed statistics, when serving TP1/TP2 models on lower-end GPUs with HBM<25G, you can either disable cuda graph or set `cuda_graph_max_bs` to a very small value to reduce the memory overhead of creating cuda graphs, with almost no impact on performance. However, when serving models with TP4 or TP8, we need to enable cuda graph to maintain high performance. In this case, we can set `cuda_graph_max_bs` to 80 (half of the default value 160) to reduce the memory overhead of creating cuda graphs. Looking at the logs from TP4 serving of qwen2-72b, a value of 80 is sufficient and can reduce the memory overhead of creating cuda graphs on lower-end GPUs compared to the original 160, avoiding OOM issues.
+            if gpu_mem is not None and gpu_mem < 25_000:
+                if self.tp_size < 4:
+                    self.cuda_graph_max_bs = 8
+                else:
+                    self.cuda_graph_max_bs = 80
             else:
                 self.cuda_graph_max_bs = 160
 
         # Choose kernel backends
+        if self.device == "hpu":
+            self.attention_backend = "torch_native"
+            self.sampling_backend = "pytorch"
+
         if self.attention_backend is None:
             self.attention_backend = (
                 "flashinfer" if is_flashinfer_available() else "triton"
@@ -201,16 +221,20 @@ class ServerArgs:
         if self.enable_dp_attention:
             self.dp_size = self.tp_size
             self.chunked_prefill_size = self.chunked_prefill_size // 2
-            self.cuda_graph_max_bs = min(self.cuda_graph_max_bs, 96)
             self.schedule_conservativeness = self.schedule_conservativeness * 0.3
             self.disable_overlap_schedule = True
             logger.warning(
                 f"DP attention is enabled. The chunked prefill size is adjusted to {self.chunked_prefill_size} to avoid MoE kernel issues. "
-                f"The CUDA graph max batch size is adjusted to {self.cuda_graph_max_bs}. "
                 f"The schedule conservativeness is adjusted to {self.schedule_conservativeness}. "
                 "Data parallel size is adjusted to be the same as tensor parallel size. "
                 "Overlap scheduler is disabled."
             )
+        # Expert parallelism
+        if self.enable_ep_moe:
+            self.ep_size = self.tp_size
+            logger.info(
+                f"EP MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
+            )
 
         # GGUF
         if (
@@ -257,7 +281,15 @@ class ServerArgs:
             "--load-format",
             type=str,
             default=ServerArgs.load_format,
-            choices=[
+            choices=[
+                "auto",
+                "pt",
+                "safetensors",
+                "npcache",
+                "dummy",
+                "gguf",
+                "bitsandbytes",
+            ],
             help="The format of the model weights to load. "
             '"auto" will try to load the weights in the safetensors format '
             "and fall back to the pytorch bin format if safetensors format "
@@ -268,7 +300,9 @@ class ServerArgs:
             "a numpy cache to speed up the loading. "
             '"dummy" will initialize the weights with random values, '
             "which is mainly for profiling."
-            '"gguf" will load the weights in the gguf format. '
+            '"gguf" will load the weights in the gguf format. '
+            '"bitsandbytes" will load the weights using bitsandbytes '
+            "quantization.",
         )
         parser.add_argument(
             "--trust-remote-code",
@@ -521,6 +555,14 @@ class ServerArgs:
                 "shortest_queue",
             ],
         )
+        # Expert parallelism
+        parser.add_argument(
+            "--expert-parallel-size",
+            "--ep-size",
+            type=int,
+            default=ServerArgs.ep_size,
+            help="The expert parallelism size.",
+        )
 
         # Multi-node distributed serving
         parser.add_argument(
@@ -656,11 +698,6 @@ class ServerArgs:
             action="store_true",
             help="Disable Multi-head Latent Attention (MLA) for DeepSeek-V2.",
         )
-        parser.add_argument(
-            "--disable-nan-detection",
-            action="store_true",
-            help="Disable the NaN detection for better performance.",
-        )
         parser.add_argument(
             "--disable-overlap-schedule",
             action="store_true",
@@ -676,6 +713,11 @@ class ServerArgs:
             action="store_true",
             help="Enabling data parallelism for attention and tensor parallelism for FFN. The dp size should be equal to the tp size. Currently only DeepSeek-V2 is supported.",
         )
+        parser.add_argument(
+            "--enable-ep-moe",
+            action="store_true",
+            help="Enabling expert parallelism for moe. The ep size is equal to the tp size.",
+        )
         parser.add_argument(
             "--enable-torch-compile",
             action="store_true",
@@ -715,6 +757,12 @@ class ServerArgs:
             help="Cast the intermidiate attention results to fp32 to avoid possible crashes related to fp16."
             "This only affects Triton attention kernels.",
         )
+        parser.add_argument(
+            "--triton-attention-num-kv-splits",
+            type=int,
+            default=ServerArgs.triton_attention_num_kv_splits,
+            help="The number of KV splits in flash decoding Triton kernel. Larger value is better in longer context scenarios. The default value is 8.",
+        )
         parser.add_argument(
             "--num-continuous-decode-steps",
             type=int,
@@ -755,6 +803,7 @@ class ServerArgs:
     def from_cli_args(cls, args: argparse.Namespace):
         args.tp_size = args.tensor_parallel_size
         args.dp_size = args.data_parallel_size
+        args.ep_size = args.expert_parallel_size
         attrs = [attr.name for attr in dataclasses.fields(cls)]
         return cls(**{attr: getattr(args, attr) for attr in attrs})
 
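The memory probing in the post-init logic now tolerates hosts where capacity cannot be determined (`gpu_mem = None`), and the CUDA-graph batch-size default depends on both memory and TP size. The sketch below restates that branching as a standalone helper for clarity; it assumes `gpu_mem` is in MB, matching what the capacity helpers appear to report:

```python
from typing import Optional, Tuple

def pick_defaults(gpu_mem: Optional[float], tp_size: int) -> Tuple[int, int]:
    """Restate the new defaulting rules: (chunked_prefill_size, cuda_graph_max_bs)."""
    chunked_prefill_size = 2048 if gpu_mem is not None and gpu_mem < 25_000 else 8192
    if gpu_mem is not None and gpu_mem < 25_000:
        # Low-HBM GPUs: shrink CUDA graphs to save memory; keep 80 for TP>=4,
        # where graphs matter more for throughput.
        cuda_graph_max_bs = 8 if tp_size < 4 else 80
    else:
        cuda_graph_max_bs = 160
    return chunked_prefill_size, cuda_graph_max_bs

print(pick_defaults(gpu_mem=24_000, tp_size=2))  # (2048, 8)
print(pick_defaults(gpu_mem=None, tp_size=8))    # (8192, 160)
```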
sglang/srt/utils.py
CHANGED
@@ -92,7 +92,7 @@ def is_flashinfer_available():
     """
     if not get_bool_env_var("SGLANG_IS_FLASHINFER_AVAILABLE", default="true"):
         return False
-    return torch.cuda.is_available() and
+    return torch.cuda.is_available() and torch.version.cuda
 
 
 def is_ipv6(address):
@@ -169,7 +169,7 @@ def calculate_time(show=False, min_cost_ms=0.0):
     return wrapper
 
 
-def get_available_gpu_memory(device, gpu_id, distributed=False):
+def get_available_gpu_memory(device, gpu_id, distributed=False, empty_cache=True):
     """
     Get available memory for cuda:gpu_id device.
     When distributed is True, the available memory is the minimum available memory of all GPUs.
@@ -184,7 +184,8 @@ def get_available_gpu_memory(device, gpu_id, distributed=False):
                 "which may cause useless memory allocation for torch CUDA context.",
             )
 
-
+        if empty_cache:
+            torch.cuda.empty_cache()
         free_gpu_memory, _ = torch.cuda.mem_get_info(gpu_id)
 
     elif device == "xpu":
@@ -196,11 +197,25 @@ def get_available_gpu_memory(device, gpu_id, distributed=False):
                 f"WARNING: current device is not {gpu_id}, but {torch.xpu.current_device()}, ",
                 "which may cause useless memory allocation for torch XPU context.",
             )
-
+
+        if empty_cache:
+            torch.xpu.empty_cache()
         used_memory = torch.xpu.memory_allocated()
         total_gpu_memory = torch.xpu.get_device_properties(gpu_id).total_memory
         free_gpu_memory = total_gpu_memory - used_memory
 
+    elif device == "hpu":
+        num_gpus = torch.hpu.device_count()
+        assert gpu_id < num_gpus
+
+        if torch.hpu.current_device() != gpu_id:
+            print(
+                f"WARNING: current device is not {gpu_id}, but {torch.hpu.current_device()}, ",
+                "which may cause useless memory allocation for torch HPU context.",
+            )
+
+        free_gpu_memory, total_gpu_memory = torch.hpu.mem_get_info()
+
     if distributed:
         tensor = torch.tensor(free_gpu_memory, dtype=torch.float32).to(
             torch.device(device, gpu_id)
@@ -939,6 +954,37 @@ def get_nvgpu_memory_capacity():
         )
 
 
+def get_hpu_memory_capacity():
+    try:
+        # Run hl-smi and capture the output
+        result = subprocess.run(
+            ["hl-smi --query | grep 'Total'"],
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            shell=True,
+            text=True,
+        )
+
+        if result.returncode != 0:
+            raise RuntimeError(f"hl-smi error: {result.stderr.strip()}")
+
+        # Parse the output to extract memory values in MiB
+        memory_values = [
+            float(mem.split(" ")[-2]) for mem in result.stdout.strip().split("\n")
+        ]
+
+        if not memory_values:
+            raise ValueError("No GPU memory values found.")
+
+        # Return the minimum memory value
+        return min(memory_values)
+
+    except FileNotFoundError:
+        raise RuntimeError(
+            "hl-smi not found. Ensure Habana drivers are installed and accessible."
+        )
+
+
 # Copy from pytorch and OpenRLHF to allow creating multiple main groups.
 # https://github.com/pytorch/pytorch/blob/main/torch/distributed/distributed_c10d.py
 # https://github.com/OpenRLHF/OpenRLHF/blob/main/openrlhf/utils/distributed_util.py
@@ -1025,9 +1071,6 @@ def get_device_name(device_id: int = 0) -> str:
     if hasattr(torch, "cuda") and torch.cuda.is_available():
         return torch.cuda.get_device_name(device_id)
 
-    if hasattr(torch, "hip") and torch.hip.is_available():
-        return torch.hip.get_device_name(device_id)
-
     if hasattr(torch, "xpu") and torch.xpu.is_available():
         return torch.xpu.get_device_name(device_id)
 
@@ -1040,9 +1083,6 @@ def get_device_capability(device_id: int = 0) -> Tuple[int, int]:
     if hasattr(torch, "cuda") and torch.cuda.is_available():
         major, minor = torch.cuda.get_device_capability(device_id)
 
-    if hasattr(torch, "hip") and torch.hip.is_available():
-        major, minor = torch.cuda.get_device_capability(device_id)
-
     if hasattr(torch, "xpu") and torch.xpu.is_available():
         major, minor, *_ = torch.xpu.get_device_capability(device_id)["version"].split(
             "."
@@ -1062,6 +1102,13 @@ def get_device_capability(device_id: int = 0) -> Tuple[int, int]:
     return major, minor
 
 
+def get_compiler_backend() -> str:
+    if hasattr(torch, "hpu") and torch.hpu.is_available():
+        return "hpu_backend"
+
+    return "inductor"
+
+
 sglang_lib = Library("sglang", "FRAGMENT")  # noqa
 
 
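The new `get_compiler_backend()` helper centralizes the choice of `torch.compile` backend ("hpu_backend" on Gaudi, "inductor" everywhere else). A minimal usage sketch; the toy module is only for illustration:

```python
import torch
from sglang.srt.utils import get_compiler_backend

toy = torch.nn.Sequential(torch.nn.Linear(16, 16), torch.nn.ReLU())
compiled = torch.compile(toy, backend=get_compiler_backend())
print(compiled(torch.randn(2, 16)).shape)  # torch.Size([2, 16])
```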
sglang/test/test_utils.py
CHANGED
@@ -568,6 +568,7 @@ def run_bench_serving(
         disable_tqdm=False,
         disable_stream=disable_stream,
         disable_ignore_eos=False,
+        return_logprob=False,
         lora_name=None,
         extra_request_body=None,
         profile=None,
@@ -719,13 +720,13 @@ def run_and_check_memory_leak(
 
     # Clean up everything
     kill_process_tree(process.pid)
-    kill_process_tree(process.pid)
     stdout.close()
     stderr.close()
     if os.path.exists(STDOUT_FILENAME):
         os.remove(STDOUT_FILENAME)
     if os.path.exists(STDERR_FILENAME):
         os.remove(STDERR_FILENAME)
+    kill_process_tree(process.pid)
     t.join()
 
     # Assert success
@@ -733,7 +734,7 @@ def run_and_check_memory_leak(
     has_leak = False
     has_abort = False
     for line in output_lines:
-        if "
+        if "Uvicorn running" in line:
             has_new_server = True
         if "leak" in line:
             has_leak = True
sglang/utils.py
CHANGED
@@ -1,4 +1,4 @@
-"""Common utilities
+"""Common utilities"""
 
 import base64
 import gc
@@ -79,7 +79,14 @@ class HttpResponse:
         return self.resp.status
 
 
-def http_request(
+def http_request(
+    url,
+    json=None,
+    stream=False,
+    api_key=None,
+    verify=None,
+    method: Optional[str] = None,
+):
     """A faster version of requests.post with low-level urllib API."""
     headers = {"Content-Type": "application/json; charset=utf-8"}
 
@@ -90,7 +97,7 @@ def http_request(url, json=None, stream=False, api_key=None, verify=None):
     if stream:
         return requests.post(url, json=json, stream=True, headers=headers)
     else:
-        req = urllib.request.Request(url, headers=headers)
+        req = urllib.request.Request(url, headers=headers, method=method)
         if json is None:
             data = None
         else:
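Because `http_request` now accepts a `method` argument that is passed straight to `urllib.request.Request`, the non-streaming path can issue GET (or PUT) calls as well as POST. A small sketch; the URL targets a hypothetical local server, and reading `.resp.status` assumes the wrapper stores the raw urllib response in `.resp`, as the `HttpResponse` property shown above suggests:

```python
from sglang.utils import http_request

# GET a read-only endpoint of a locally running server (placeholder URL).
resp = http_request("http://localhost:30000/get_model_info", method="GET")
print(resp.resp.status)  # 200 on success
```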
sglang/version.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.4.0"
+__version__ = "0.4.0.post2"
{sglang-0.4.0.dist-info → sglang-0.4.0.post2.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.4.0
+Version: 0.4.0.post2
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
 Version 2.0, January 2004
@@ -215,6 +215,7 @@ Requires-Dist: requests
 Requires-Dist: tqdm
 Requires-Dist: numpy
 Requires-Dist: IPython
+Requires-Dist: setproctitle
 Provides-Extra: runtime-common
 Requires-Dist: aiohttp; extra == "runtime-common"
 Requires-Dist: decord; extra == "runtime-common"
@@ -232,16 +233,17 @@ Requires-Dist: psutil; extra == "runtime-common"
 Requires-Dist: pydantic; extra == "runtime-common"
 Requires-Dist: python-multipart; extra == "runtime-common"
 Requires-Dist: pyzmq>=25.1.2; extra == "runtime-common"
-Requires-Dist: torchao; extra == "runtime-common"
+Requires-Dist: torchao>=0.7.0; extra == "runtime-common"
+Requires-Dist: gemlite; extra == "runtime-common"
 Requires-Dist: uvicorn; extra == "runtime-common"
 Requires-Dist: uvloop; extra == "runtime-common"
-Requires-Dist: xgrammar>=0.1.
+Requires-Dist: xgrammar>=0.1.6; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
 Requires-Dist: torch; extra == "srt"
-Requires-Dist: vllm
+Requires-Dist: vllm<=0.6.4.post1,>=0.6.3.post1; extra == "srt"
 Requires-Dist: cuda-python; extra == "srt"
-Requires-Dist: flashinfer
+Requires-Dist: flashinfer==0.1.6; extra == "srt"
 Provides-Extra: srt-hip
 Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
 Requires-Dist: torch; extra == "srt-hip"
@@ -311,10 +313,14 @@ Requires-Dist: sglang[test]; extra == "dev-hpu"
 
 --------------------------------------------------------------------------------
 
-| [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/)
-
+| [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/)
+| [**Documentation**](https://sgl-project.github.io/)
+| [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2tmmp6flg-89dOlJW2TjnBrTRk1I_~GA)
+| [**Join Bi-Weekly Development Meeting**](https://docs.google.com/document/d/1xEow4eIM152xNcRxqZz9VEcOiTQo8-CEuuQ5qTmkt-E/edit?usp=sharing)
+| [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |
 
 ## News
+- [2024/12] 🔥 SGLang v0.4: Zero-Overhead Batch Scheduler, Cache-Aware Load Balancer, Faster Structured Outputs ([blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)).
 - [2024/10] 🔥 The First SGLang Online Meetup ([slides](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#the-first-sglang-online-meetup)).
 - [2024/09] SGLang v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
 - [2024/07] Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
@@ -346,13 +352,13 @@ The core features include:
 - [Frontend: Structured Generation Language (SGLang)](https://sgl-project.github.io/frontend/frontend.html)
 
 ## Benchmark And Performance
-Learn more in our release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/), [v0.3 blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)
+Learn more in our release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/), [v0.3 blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/), [v0.4 blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)
 
 ## Roadmap
 [Development Roadmap (2024 Q4)](https://github.com/sgl-project/sglang/issues/1487)
 
 ## Adoption and Sponsorship
-The project is supported by (alphabetically): AMD, Baseten, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, NVIDIA, RunPod, Stanford, UC Berkeley, xAI and 01.AI.
+The project is supported by (alphabetically): AMD, Baseten, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, Meituan, NVIDIA, RunPod, Stanford, UC Berkeley, xAI and 01.AI.
 
 ## Acknowledgment and Citation
 We learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).