sglang 0.3.4.post1__py3-none-any.whl → 0.3.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/api.py +1 -1
- sglang/bench_latency.py +3 -3
- sglang/bench_server_latency.py +2 -3
- sglang/bench_serving.py +92 -0
- sglang/global_config.py +9 -3
- sglang/lang/chat_template.py +50 -25
- sglang/lang/interpreter.py +9 -1
- sglang/lang/ir.py +11 -2
- sglang/launch_server.py +1 -1
- sglang/srt/configs/model_config.py +76 -15
- sglang/srt/constrained/__init__.py +18 -0
- sglang/srt/constrained/bnf_cache.py +61 -0
- sglang/srt/constrained/fsm_cache.py +10 -3
- sglang/srt/constrained/grammar.py +190 -0
- sglang/srt/hf_transformers_utils.py +20 -5
- sglang/srt/layers/attention/flashinfer_backend.py +5 -5
- sglang/srt/layers/attention/triton_ops/decode_attention.py +110 -30
- sglang/srt/layers/attention/triton_ops/prefill_attention.py +1 -1
- sglang/srt/layers/fused_moe/fused_moe.py +4 -3
- sglang/srt/layers/fused_moe/layer.py +28 -0
- sglang/srt/layers/logits_processor.py +5 -5
- sglang/srt/layers/quantization/base_config.py +16 -1
- sglang/srt/layers/rotary_embedding.py +15 -48
- sglang/srt/layers/sampler.py +51 -39
- sglang/srt/layers/vocab_parallel_embedding.py +486 -0
- sglang/srt/managers/data_parallel_controller.py +8 -7
- sglang/srt/managers/detokenizer_manager.py +11 -9
- sglang/srt/managers/image_processor.py +4 -3
- sglang/srt/managers/io_struct.py +80 -78
- sglang/srt/managers/schedule_batch.py +46 -52
- sglang/srt/managers/schedule_policy.py +24 -13
- sglang/srt/managers/scheduler.py +145 -82
- sglang/srt/managers/tokenizer_manager.py +236 -334
- sglang/srt/managers/tp_worker.py +5 -5
- sglang/srt/managers/tp_worker_overlap_thread.py +58 -21
- sglang/srt/mem_cache/flush_cache.py +1 -1
- sglang/srt/mem_cache/memory_pool.py +10 -3
- sglang/srt/model_executor/cuda_graph_runner.py +34 -23
- sglang/srt/model_executor/forward_batch_info.py +6 -9
- sglang/srt/model_executor/model_runner.py +10 -19
- sglang/srt/models/baichuan.py +4 -4
- sglang/srt/models/chatglm.py +4 -4
- sglang/srt/models/commandr.py +1 -1
- sglang/srt/models/dbrx.py +5 -5
- sglang/srt/models/deepseek.py +4 -4
- sglang/srt/models/deepseek_v2.py +4 -4
- sglang/srt/models/exaone.py +4 -4
- sglang/srt/models/gemma.py +1 -1
- sglang/srt/models/gemma2.py +1 -1
- sglang/srt/models/gpt2.py +287 -0
- sglang/srt/models/gpt_bigcode.py +1 -1
- sglang/srt/models/grok.py +4 -4
- sglang/srt/models/internlm2.py +4 -4
- sglang/srt/models/llama.py +15 -7
- sglang/srt/models/llama_embedding.py +2 -10
- sglang/srt/models/llama_reward.py +5 -0
- sglang/srt/models/minicpm.py +4 -4
- sglang/srt/models/minicpm3.py +4 -4
- sglang/srt/models/mixtral.py +7 -5
- sglang/srt/models/mixtral_quant.py +4 -4
- sglang/srt/models/mllama.py +5 -5
- sglang/srt/models/olmo.py +4 -4
- sglang/srt/models/olmoe.py +4 -4
- sglang/srt/models/qwen.py +4 -4
- sglang/srt/models/qwen2.py +4 -4
- sglang/srt/models/qwen2_moe.py +4 -4
- sglang/srt/models/qwen2_vl.py +4 -8
- sglang/srt/models/stablelm.py +4 -4
- sglang/srt/models/torch_native_llama.py +4 -4
- sglang/srt/models/xverse.py +4 -4
- sglang/srt/models/xverse_moe.py +4 -4
- sglang/srt/openai_api/adapter.py +52 -66
- sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +6 -3
- sglang/srt/sampling/sampling_batch_info.py +7 -13
- sglang/srt/sampling/sampling_params.py +5 -7
- sglang/srt/server.py +41 -33
- sglang/srt/server_args.py +34 -5
- sglang/srt/utils.py +40 -56
- sglang/test/run_eval.py +2 -0
- sglang/test/runners.py +2 -1
- sglang/test/srt/sampling/penaltylib/utils.py +1 -0
- sglang/test/test_utils.py +151 -6
- sglang/utils.py +62 -1
- sglang/version.py +1 -1
- sglang-0.3.5.dist-info/METADATA +344 -0
- sglang-0.3.5.dist-info/RECORD +152 -0
- {sglang-0.3.4.post1.dist-info → sglang-0.3.5.dist-info}/WHEEL +1 -1
- sglang-0.3.4.post1.dist-info/METADATA +0 -900
- sglang-0.3.4.post1.dist-info/RECORD +0 -148
- {sglang-0.3.4.post1.dist-info → sglang-0.3.5.dist-info}/LICENSE +0 -0
- {sglang-0.3.4.post1.dist-info → sglang-0.3.5.dist-info}/top_level.txt +0 -0
sglang/srt/server_args.py
CHANGED
@@ -63,6 +63,7 @@ class ServerArgs:
     stream_interval: int = 1
     random_seed: Optional[int] = None
     constrained_json_whitespace_pattern: Optional[str] = None
+    decode_log_interval: int = 40

     # Logging
     log_level: str = "info"
@@ -74,6 +75,7 @@ class ServerArgs:
     api_key: Optional[str] = None
     file_storage_pth: str = "SGLang_storage"
     enable_cache_report: bool = False
+    watchdog_timeout: float = 600

     # Data parallelism
     dp_size: int = 1
@@ -102,6 +104,7 @@ class ServerArgs:
     # Kernel backend
     attention_backend: Optional[str] = None
     sampling_backend: Optional[str] = None
+    grammar_backend: Optional[str] = "outlines"

     # Optimization/debug options
     disable_flashinfer: bool = False
@@ -118,7 +121,8 @@ class ServerArgs:
     enable_overlap_schedule: bool = False
     enable_mixed_chunk: bool = False
     enable_torch_compile: bool = False
-
+    torch_compile_max_bs: int = 32
+    cuda_graph_max_bs: int = 160
     torchao_config: str = ""
     enable_p2p_check: bool = False
     triton_attention_reduce_in_fp32: bool = False
@@ -427,6 +431,18 @@ class ServerArgs:
             action="store_true",
             help="Return number of cached tokens in usage.prompt_tokens_details for each openai request.",
         )
+        parser.add_argument(
+            "--watchdog-timeout",
+            type=float,
+            default=ServerArgs.watchdog_timeout,
+            help="Set watchdog timeout in seconds. If a forward batch takes longer than this, the server will crash to prevent hanging.",
+        )
+        parser.add_argument(
+            "--decode-log-interval",
+            type=int,
+            default=ServerArgs.decode_log_interval,
+            help="The log interval of decode batch",
+        )

         # Data parallelism
         parser.add_argument(
@@ -537,6 +553,13 @@ class ServerArgs:
             default=ServerArgs.sampling_backend,
             help="Choose the kernels for sampling layers.",
         )
+        parser.add_argument(
+            "--grammar-backend",
+            type=str,
+            choices=["xgrammar", "outlines"],
+            default=ServerArgs.grammar_backend,
+            help="Choose the backend for constrained decoding.",
+        )

         # Optimization/debug options
         parser.add_argument(
@@ -611,11 +634,17 @@ class ServerArgs:
             help="Optimize the model with torch.compile. Experimental feature.",
         )
         parser.add_argument(
-            "--
+            "--torch-compile-max-bs",
             type=int,
-            default=ServerArgs.
+            default=ServerArgs.torch_compile_max_bs,
             help="Set the maximum batch size when using torch compile.",
         )
+        parser.add_argument(
+            "--cuda-graph-max-bs",
+            type=int,
+            default=ServerArgs.cuda_graph_max_bs,
+            help="Set the maximum batch size for cuda graph.",
+        )
         parser.add_argument(
             "--torchao-config",
             type=str,
@@ -712,11 +741,11 @@ class PortArgs:

     @staticmethod
     def init_new(server_args) -> "PortArgs":
-        port = server_args.port +
+        port = server_args.port + 42
         while True:
             if is_port_available(port):
                 break
-            port +=
+            port += 42

         return PortArgs(
             tokenizer_ipc_name=tempfile.NamedTemporaryFile(delete=False).name,
sglang/srt/utils.py
CHANGED
@@ -35,6 +35,7 @@ import psutil
 import requests
 import torch
 import torch.distributed as dist
+import zmq
 from fastapi.responses import ORJSONResponse
 from packaging import version as pkg_version
 from torch import nn
@@ -203,56 +204,6 @@ def is_port_available(port):
     return False


-def is_multimodal_model(model_architectures):
-    if (
-        "LlavaLlamaForCausalLM" in model_architectures
-        or "LlavaQwenForCausalLM" in model_architectures
-        or "LlavaMistralForCausalLM" in model_architectures
-        or "LlavaVidForCausalLM" in model_architectures
-        or "MllamaForConditionalGeneration" in model_architectures
-        or "Qwen2VLForConditionalGeneration" in model_architectures
-    ):
-        return True
-    else:
-        return False
-
-
-def is_attention_free_model(model_architectures):
-    return False
-
-
-def model_has_inner_state(model_architectures):
-    return False
-
-
-def is_embedding_model(model_architectures):
-    if (
-        "LlamaEmbeddingModel" in model_architectures
-        or "MistralModel" in model_architectures
-        or "LlamaForSequenceClassification" in model_architectures
-        or "LlamaForSequenceClassificationWithNormal_Weights" in model_architectures
-    ):
-        return True
-    else:
-        return False
-
-
-def is_generation_model(model_architectures, is_embedding: bool = False):
-    # We have two ways to determine whether a model is a generative model.
-    # 1. Check the model architectue
-    # 2. check the `is_embedding` server args
-
-    if (
-        "LlamaEmbeddingModel" in model_architectures
-        or "MistralModel" in model_architectures
-        or "LlamaForSequenceClassification" in model_architectures
-        or "LlamaForSequenceClassificationWithNormal_Weights" in model_architectures
-    ):
-        return False
-    else:
-        return not is_embedding
-
-
 def decode_video_base64(video_base64):
     from PIL import Image

@@ -397,17 +348,26 @@ def kill_parent_process():
     """Kill the parent process and all children of the parent process."""
     current_process = psutil.Process()
     parent_process = current_process.parent()
-    kill_child_process(
+    kill_child_process(
+        parent_process.pid, include_self=True, skip_pid=current_process.pid
+    )
+    try:
+        current_process.kill()
+    except psutil.NoSuchProcess:
+        pass


-def kill_child_process(pid,
+def kill_child_process(pid=None, include_self=False, skip_pid=None):
     """Kill the process and all its children process."""
+    if pid is None:
+        pid = os.getpid()
+
     try:
-
+        itself = psutil.Process(pid)
     except psutil.NoSuchProcess:
         return

-    children =
+    children = itself.children(recursive=True)
     for child in children:
         if child.pid == skip_pid:
             continue
@@ -416,9 +376,9 @@ def kill_child_process(pid, including_parent=True, skip_pid=None):
         except psutil.NoSuchProcess:
             pass

-    if
+    if include_self:
         try:
-
+            itself.kill()
         except psutil.NoSuchProcess:
             pass

@@ -720,3 +680,27 @@ def first_rank_print(*args, **kwargs):
         print(*args, **kwargs)
     else:
         pass
+
+
+def get_zmq_socket(context: zmq.Context, socket_type: zmq.SocketType, endpoint: str):
+    mem = psutil.virtual_memory()
+    total_mem = mem.total / 1024**3
+    available_mem = mem.available / 1024**3
+    if total_mem > 32 and available_mem > 16:
+        buf_size = int(0.5 * 1024**3)
+    else:
+        buf_size = -1
+
+    socket = context.socket(socket_type)
+    if socket_type == zmq.PUSH:
+        socket.setsockopt(zmq.SNDHWM, 0)
+        socket.setsockopt(zmq.SNDBUF, buf_size)
+        socket.connect(f"ipc://{endpoint}")
+    elif socket_type == zmq.PULL:
+        socket.setsockopt(zmq.RCVHWM, 0)
+        socket.setsockopt(zmq.RCVBUF, buf_size)
+        socket.bind(f"ipc://{endpoint}")
+    else:
+        raise ValueError(f"Unsupported socket type: {socket_type}")
+
+    return socket
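
A small sketch of how the new get_zmq_socket helper and the reworked kill_child_process signature might be used together. The IPC endpoint path below is made up for illustration; the socket options reflect the code above.

    import os
    import tempfile

    import zmq

    from sglang.srt.utils import get_zmq_socket, kill_child_process

    ctx = zmq.Context()
    ipc_path = os.path.join(tempfile.gettempdir(), "sglang_demo_ipc")  # illustrative endpoint

    receiver = get_zmq_socket(ctx, zmq.PULL, ipc_path)  # binds, RCVHWM set to 0
    sender = get_zmq_socket(ctx, zmq.PUSH, ipc_path)    # connects, SNDHWM set to 0

    sender.send_pyobj({"msg": "hello"})
    print(receiver.recv_pyobj())

    # The old `including_parent` keyword is gone: `include_self` now decides whether the
    # target process itself is killed, and pid=None falls back to the current process.
    kill_child_process(pid=None, include_self=False)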
sglang/test/run_eval.py
CHANGED
@@ -67,6 +67,7 @@ def run_eval(args):
         model=args.model,
         max_tokens=2048,
         base_url=base_url,
+        temperature=getattr(args, "temperature", 0.0),
     )

     # Run eval
@@ -119,6 +120,7 @@ if __name__ == "__main__":
     parser.add_argument("--eval-name", type=str, default="mmlu")
     parser.add_argument("--num-examples", type=int)
     parser.add_argument("--num-threads", type=int, default=512)
+    parser.add_argument("--temperature", type=float, default=0.0)
     args = parser.parse_args()

     run_eval(args)
sglang/test/runners.py
CHANGED
@@ -273,6 +273,7 @@ class SRTRunner:
             disable_cuda_graph=disable_cuda_graph,
             disable_radix_cache=disable_radix_cache,
         )
+        self.tokenizer = get_tokenizer(model_path)

     def forward(
         self,
@@ -366,7 +367,7 @@ class SRTRunner:
                 return ModelOutput(embed_logits=logits)
             else:
                 scores = [x["embedding"][0] for x in response]
-                return ModelOutput(scores=
+                return ModelOutput(scores=scores)

     def __enter__(self):
         return self
sglang/test/test_utils.py
CHANGED
@@ -3,9 +3,11 @@
 import argparse
 import asyncio
 import os
+import random
 import subprocess
 import threading
 import time
+from concurrent.futures import ThreadPoolExecutor
 from functools import partial
 from types import SimpleNamespace
 from typing import Callable, List, Optional
@@ -20,6 +22,7 @@ from sglang.global_config import global_config
 from sglang.lang.backend.openai import OpenAI
 from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
 from sglang.srt.utils import kill_child_process
+from sglang.test.run_eval import run_eval
 from sglang.utils import get_exception_traceback

 DEFAULT_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/Meta-Llama-3.1-8B-FP8"
@@ -400,7 +403,7 @@ def popen_launch_server(
     api_key: Optional[str] = None,
     other_args: tuple = (),
     env: Optional[dict] = None,
-    return_stdout_stderr:
+    return_stdout_stderr: Optional[tuple] = None,
 ):
     _, host, port = base_url.split(":")
     host = host[2:]
@@ -423,8 +426,8 @@ def popen_launch_server(
     if return_stdout_stderr:
         process = subprocess.Popen(
             command,
-            stdout=
-            stderr=
+            stdout=return_stdout_stderr[0],
+            stderr=return_stdout_stderr[1],
             env=env,
             text=True,
         )
@@ -493,7 +496,7 @@ def run_unittest_files(files: List[str], timeout_per_file: float):
             )
             assert ret_code == 0
         except TimeoutError:
-            kill_child_process(process.pid)
+            kill_child_process(process.pid, include_self=True)
             time.sleep(5)
             print(
                 f"\nTimeout after {timeout_per_file} seconds when running {filename}\n",
@@ -561,7 +564,7 @@ def run_bench_serving(
     try:
         res = run_benchmark(args)
     finally:
-        kill_child_process(process.pid)
+        kill_child_process(process.pid, include_self=True)

     assert res["completed"] == num_prompts
     return res
@@ -594,7 +597,7 @@ def run_bench_latency(model, other_args):
         lastline = output.split("\n")[-3]
         output_throughput = float(lastline.split(" ")[-2])
     finally:
-        kill_child_process(process.pid)
+        kill_child_process(process.pid, include_self=True)

     return output_throughput

@@ -631,3 +634,145 @@ def calculate_rouge_l(output_strs_list1, output_strs_list2):
         rouge_l_scores.append(fmeasure)

     return rouge_l_scores
+
+
+STDOUT_FILENAME = "stdout.txt"
+STDERR_FILENAME = "stderr.txt"
+
+
+def read_output(output_lines):
+    """Print the output in real time with another thread."""
+    while not os.path.exists(STDERR_FILENAME):
+        time.sleep(1)
+
+    pt = 0
+    while pt >= 0:
+        if pt > 0 and not os.path.exists(STDERR_FILENAME):
+            break
+        lines = open(STDERR_FILENAME).readlines()
+        for line in lines[pt:]:
+            print(line, end="", flush=True)
+            output_lines.append(line)
+            pt += 1
+        time.sleep(0.1)
+
+
+def run_and_check_memory_leak(
+    workload_func,
+    disable_radix_cache,
+    enable_mixed_chunk,
+    enable_overlap,
+    chunked_prefill_size,
+):
+    other_args = ["--chunked-prefill-size", str(chunked_prefill_size)]
+    if disable_radix_cache:
+        other_args += ["--disable-radix-cache"]
+    if enable_mixed_chunk:
+        other_args += ["--enable-mixed-chunk"]
+    if enable_overlap:
+        other_args += ["--enable-overlap-scheduler"]
+
+    model = DEFAULT_MODEL_NAME_FOR_TEST
+    port = random.randint(4000, 5000)
+    base_url = f"http://127.0.0.1:{port}"
+
+    # Create files and launch the server
+    stdout = open(STDOUT_FILENAME, "w")
+    stderr = open(STDERR_FILENAME, "w")
+    process = popen_launch_server(
+        model,
+        base_url,
+        timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+        other_args=other_args,
+        return_stdout_stderr=(stdout, stderr),
+    )
+
+    # Launch a thread to stream the output
+    output_lines = []
+    t = threading.Thread(target=read_output, args=(output_lines,))
+    t.start()
+
+    # Run the workload
+    workload_func(base_url, model)
+
+    # Clean up everything
+    kill_child_process(process.pid, include_self=True)
+    kill_child_process(process.pid, include_self=True)
+    stdout.close()
+    stderr.close()
+    if os.path.exists(STDOUT_FILENAME):
+        os.remove(STDOUT_FILENAME)
+    if os.path.exists(STDERR_FILENAME):
+        os.remove(STDERR_FILENAME)
+    t.join()
+
+    # Assert success
+    has_new_server = False
+    has_leak = False
+    for line in output_lines:
+        if "The server is fired" in line:
+            has_new_server = True
+        if "leak" in line:
+            has_leak = True
+
+    assert has_new_server
+    assert not has_leak
+
+
+def run_mmlu_test(
+    disable_radix_cache=False,
+    enable_mixed_chunk=False,
+    enable_overlap=False,
+    chunked_prefill_size=32,
+):
+    def workload_func(base_url, model):
+        # Run the eval
+        args = SimpleNamespace(
+            base_url=base_url,
+            model=model,
+            eval_name="mmlu",
+            num_examples=128,
+            num_threads=128,
+        )
+
+        try:
+            metrics = run_eval(args)
+            print(f"{metrics=}")
+            assert metrics["score"] >= 0.65
+        finally:
+            pass
+
+    run_and_check_memory_leak(workload_func, disable_radix_cache, enable_mixed_chunk, enable_overlap, chunked_prefill_size)
+
+
+def run_mulit_request_test(
+    disable_radix_cache=False,
+    enable_mixed_chunk=False,
+    enable_overlap=False,
+    chunked_prefill_size=32,
+):
+
+    def workload_func(base_url, model):
+        def run_one(_):
+            prompt = """
+            System: You are a helpful assistant.
+            User: What is the capital of France?
+            Assistant: The capital of France is
+            """
+
+            response = requests.post(
+                f"{base_url}/generate",
+                json={
+                    "text": prompt,
+                    "sampling_params": {
+                        "temperature": 0,
+                        "max_new_tokens": 8,
+                    },
+                },
+            )
+            ret = response.json()
+
+        with ThreadPoolExecutor(2) as executor:
+            list(executor.map(run_one, list(range(4))))
+
+    run_and_check_memory_leak(workload_func, disable_radix_cache, enable_mixed_chunk, enable_overlap, chunked_prefill_size)
sglang/utils.py
CHANGED
@@ -1,12 +1,15 @@
 """Common utilities."""

 import base64
+import gc
 import importlib
 import json
 import logging
 import os
 import signal
+import subprocess
 import sys
+import time
 import traceback
 import urllib.request
 from concurrent.futures import ThreadPoolExecutor
@@ -16,6 +19,7 @@ from typing import Optional, Union

 import numpy as np
 import requests
+from IPython.display import HTML, display
 from tqdm import tqdm

 logger = logging.getLogger(__name__)
@@ -151,7 +155,7 @@ def encode_video_base64(video_path: str, num_frames: int = 16):
     frame_indices = np.linspace(0, total_frames - 1, num_frames, dtype=int)

     frames = []
-    for
+    for _ in range(total_frames):
         ret, frame = cap.read()
         if ret:
             frames.append(frame)
@@ -294,3 +298,60 @@ def download_and_cache_file(url: str, filename: Optional[str] = None):
             bar.update(len(chunk))

     return filename
+
+
+def execute_shell_command(command: str) -> subprocess.Popen:
+    """
+    Execute a shell command and return the process handle
+
+    Args:
+        command: Shell command as a string (can include \\ line continuations)
+    Returns:
+        subprocess.Popen: Process handle
+    """
+    # Replace \ newline with space and split
+    command = command.replace("\\\n", " ").replace("\\", " ")
+    parts = command.split()
+
+    return subprocess.Popen(parts, text=True, stderr=subprocess.STDOUT)
+
+
+def wait_for_server(base_url: str, timeout: int = None) -> None:
+    """Wait for the server to be ready by polling the /v1/models endpoint.
+
+    Args:
+        base_url: The base URL of the server
+        timeout: Maximum time to wait in seconds. None means wait forever.
+    """
+    start_time = time.time()
+    while True:
+        try:
+            response = requests.get(
+                f"{base_url}/v1/models",
+                headers={"Authorization": "Bearer None"},
+            )
+            if response.status_code == 200:
+                time.sleep(5)
+                print_highlight(
+                    """\n
+                    NOTE: Typically, the server runs in a separate terminal.
+                    In this notebook, we run the server and notebook code together, so their outputs are combined.
+                    To improve clarity, the server logs are displayed in the original black color, while the notebook outputs are highlighted in blue.
+                    """
+                )
+                break
+
+            if timeout and time.time() - start_time > timeout:
+                raise TimeoutError("Server did not become ready within timeout period")
+        except requests.exceptions.RequestException:
+            time.sleep(1)
+
+
+def terminate_process(process):
+    from sglang.srt.utils import kill_child_process
+
+    kill_child_process(process.pid, include_self=True)
+
+
+def print_highlight(html_content: str):
+    html_content = str(html_content).replace("\n", "<br>")
+    display(HTML(f"<strong style='color: #00008B;'>{html_content}</strong>"))
sglang/version.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.3.4.post1"
+__version__ = "0.3.5"