sglang 0.2.13__py3-none-any.whl → 0.2.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/api.py +6 -0
- sglang/bench_latency.py +7 -3
- sglang/bench_serving.py +50 -26
- sglang/check_env.py +15 -0
- sglang/lang/chat_template.py +10 -5
- sglang/lang/compiler.py +4 -0
- sglang/lang/interpreter.py +1 -0
- sglang/lang/ir.py +9 -0
- sglang/launch_server.py +8 -1
- sglang/srt/conversation.py +50 -1
- sglang/srt/hf_transformers_utils.py +22 -23
- sglang/srt/layers/activation.py +24 -1
- sglang/srt/layers/decode_attention.py +338 -50
- sglang/srt/layers/fused_moe/layer.py +2 -2
- sglang/srt/layers/layernorm.py +3 -0
- sglang/srt/layers/logits_processor.py +60 -23
- sglang/srt/layers/radix_attention.py +3 -4
- sglang/srt/layers/sampler.py +154 -0
- sglang/srt/managers/controller_multi.py +2 -8
- sglang/srt/managers/controller_single.py +7 -10
- sglang/srt/managers/detokenizer_manager.py +20 -9
- sglang/srt/managers/io_struct.py +44 -11
- sglang/srt/managers/policy_scheduler.py +5 -2
- sglang/srt/managers/schedule_batch.py +52 -167
- sglang/srt/managers/tokenizer_manager.py +192 -83
- sglang/srt/managers/tp_worker.py +130 -43
- sglang/srt/mem_cache/memory_pool.py +82 -8
- sglang/srt/mm_utils.py +79 -7
- sglang/srt/model_executor/cuda_graph_runner.py +49 -11
- sglang/srt/model_executor/forward_batch_info.py +59 -27
- sglang/srt/model_executor/model_runner.py +210 -61
- sglang/srt/models/chatglm.py +4 -12
- sglang/srt/models/commandr.py +5 -1
- sglang/srt/models/dbrx.py +5 -1
- sglang/srt/models/deepseek.py +5 -1
- sglang/srt/models/deepseek_v2.py +5 -1
- sglang/srt/models/gemma.py +5 -1
- sglang/srt/models/gemma2.py +15 -7
- sglang/srt/models/gpt_bigcode.py +5 -1
- sglang/srt/models/grok.py +16 -2
- sglang/srt/models/internlm2.py +5 -1
- sglang/srt/models/llama2.py +7 -3
- sglang/srt/models/llama_classification.py +2 -2
- sglang/srt/models/llama_embedding.py +4 -0
- sglang/srt/models/llava.py +176 -59
- sglang/srt/models/minicpm.py +5 -1
- sglang/srt/models/mixtral.py +5 -1
- sglang/srt/models/mixtral_quant.py +5 -1
- sglang/srt/models/qwen.py +5 -2
- sglang/srt/models/qwen2.py +13 -3
- sglang/srt/models/qwen2_moe.py +5 -14
- sglang/srt/models/stablelm.py +5 -1
- sglang/srt/openai_api/adapter.py +117 -37
- sglang/srt/sampling/sampling_batch_info.py +209 -0
- sglang/srt/{sampling_params.py → sampling/sampling_params.py} +18 -0
- sglang/srt/server.py +84 -56
- sglang/srt/server_args.py +43 -15
- sglang/srt/utils.py +26 -16
- sglang/test/runners.py +23 -31
- sglang/test/simple_eval_common.py +9 -10
- sglang/test/simple_eval_gpqa.py +2 -1
- sglang/test/simple_eval_humaneval.py +2 -2
- sglang/test/simple_eval_math.py +2 -1
- sglang/test/simple_eval_mmlu.py +2 -1
- sglang/test/test_activation.py +55 -0
- sglang/test/test_utils.py +36 -53
- sglang/version.py +1 -1
- {sglang-0.2.13.dist-info → sglang-0.2.14.dist-info}/METADATA +92 -25
- sglang-0.2.14.dist-info/RECORD +114 -0
- {sglang-0.2.13.dist-info → sglang-0.2.14.dist-info}/WHEEL +1 -1
- sglang/launch_server_llavavid.py +0 -29
- sglang-0.2.13.dist-info/RECORD +0 -112
- {sglang-0.2.13.dist-info → sglang-0.2.14.dist-info}/LICENSE +0 -0
- {sglang-0.2.13.dist-info → sglang-0.2.14.dist-info}/top_level.txt +0 -0
sglang/srt/utils.py
CHANGED
@@ -224,13 +224,18 @@ def is_multimodal_model(model):
     raise ValueError("unrecognized type")


-def is_generation_model(model_architectures):
+def is_generation_model(model_architectures, is_embedding: bool = False):
+    # We have two ways to determine whether a model is a generative model.
+    # 1. Check the model architectue
+    # 2. check the `is_embedding` server args
+
     if (
         "LlamaEmbeddingModel" in model_architectures
         or "MistralModel" in model_architectures
     ):
         return False
-    return True
+    else:
+        return not is_embedding


 def decode_video_base64(video_base64):
@@ -347,7 +352,7 @@ def suppress_other_loggers():
         logging.WARN
     )
     logging.getLogger("vllm.selector").setLevel(logging.WARN)
-    logging.getLogger("vllm.utils").setLevel(logging.WARN)
+    logging.getLogger("vllm.utils").setLevel(logging.ERROR)


 def assert_pkg_version(pkg: str, min_version: str, message: str):
@@ -369,14 +374,11 @@ def kill_parent_process():
     """Kill the parent process and all children of the parent process."""
     current_process = psutil.Process()
     parent_process = current_process.parent()
-    children = parent_process.children(recursive=True)
-    for child in children:
-        if child.pid != current_process.pid:
-            os.kill(child.pid, 9)
-    os.kill(parent_process.pid, 9)
+    kill_child_process(parent_process.pid, skip_pid=current_process.pid)


-def kill_child_process(pid, including_parent=True):
+def kill_child_process(pid, including_parent=True, skip_pid=None):
+    """Kill the process and all its children process."""
     try:
         parent = psutil.Process(pid)
     except psutil.NoSuchProcess:
@@ -384,6 +386,8 @@ def kill_child_process(pid, including_parent=True):

     children = parent.children(recursive=True)
     for child in children:
+        if child.pid == skip_pid:
+            continue
         try:
             child.kill()
         except psutil.NoSuchProcess:
@@ -452,10 +456,6 @@ def monkey_patch_vllm_dummy_weight_loader():
         quant_method = getattr(module, "quant_method", None)
         if quant_method is not None:
             quant_method.process_weights_after_loading(module)
-        # FIXME: Remove this after Mixtral is updated
-        # to use quant_method.
-        if hasattr(module, "process_weights_after_loading"):
-            module.process_weights_after_loading()

     # NOTE(woosuk): For accurate performance evaluation, we assign
     # random values to the weights.
@@ -692,7 +692,7 @@ def monkey_patch_vllm_qvk_linear_loader():
     setattr(QKVParallelLinear, "weight_loader", weight_loader_srt)


-def add_api_key_middleware(app, api_key):
+def add_api_key_middleware(app, api_key: str):
     @app.middleware("http")
     async def authentication(request, call_next):
         if request.method == "OPTIONS":
@@ -704,7 +704,7 @@ def add_api_key_middleware(app, api_key):
         return await call_next(request)


-def prepare_model(model_path):
+def prepare_model(model_path: str):
     if "SGLANG_USE_MODELSCOPE" in os.environ:
         if not os.path.exists(model_path):
             from modelscope import snapshot_download
@@ -713,7 +713,7 @@ def prepare_model(model_path):
     return model_path


-def prepare_tokenizer(tokenizer_path):
+def prepare_tokenizer(tokenizer_path: str):
     if "SGLANG_USE_MODELSCOPE" in os.environ:
         if not os.path.exists(tokenizer_path):
             from modelscope import snapshot_download
@@ -722,3 +722,13 @@ def prepare_tokenizer(tokenizer_path):
             tokenizer_path, ignore_patterns=["*.bin", "*.safetensors"]
         )
     return tokenizer_path
+
+
+def configure_logger(server_args, prefix: str = ""):
+    format = f"[%(asctime)s{prefix}] %(message)s"
+    logging.basicConfig(
+        level=getattr(logging, server_args.log_level.upper()),
+        format=format,
+        datefmt="%H:%M:%S",
+        force=True,
+    )
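The net effect of the process-cleanup changes above is that kill_parent_process no longer walks and kills children inline; it delegates to kill_child_process, which gained a skip_pid argument so the caller can spare its own process. A minimal sketch of how the two helpers compose after this patch, assuming only psutil; error handling and the rest of the module are abridged, so treat it as illustrative rather than the exact implementation:

# Illustrative sketch (not the verbatim sglang code): how the patched helpers compose.
import psutil

def kill_child_process(pid, including_parent=True, skip_pid=None):
    """Kill process `pid` and all of its children, optionally sparing one pid."""
    try:
        parent = psutil.Process(pid)
    except psutil.NoSuchProcess:
        return

    for child in parent.children(recursive=True):
        if child.pid == skip_pid:  # e.g. the process performing the cleanup
            continue
        try:
            child.kill()
        except psutil.NoSuchProcess:
            pass

    if including_parent:
        try:
            parent.kill()
        except psutil.NoSuchProcess:
            pass

def kill_parent_process():
    """Kill the parent process and its other children, but not the current process."""
    current = psutil.Process()
    kill_child_process(current.parent().pid, skip_pid=current.pid)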
sglang/test/runners.py
CHANGED
@@ -14,7 +14,7 @@ limitations under the License.
 """

 import json
-import multiprocessing
+import multiprocessing as mp
 import os
 from dataclasses import dataclass
 from typing import List, Union
@@ -24,15 +24,15 @@
 from transformers import AutoModelForCausalLM, AutoTokenizer

 from sglang.srt.server import Runtime
-from sglang.
+from sglang.test.test_utils import DEFAULT_PORT_FOR_SRT_TEST_RUNNER

 DEFAULT_PROMPTS = [
     # the output of gemma-2-2b from SRT is unstable on the commented prompt
     # "The capital of France is",
+    "Apple is red. Banana is Yellow. " * 800 + "Apple is",
     "The capital of the United Kindom is",
     "Today is a sunny day and I like",
     "AI is a field of computer science focused on",
-    "Apple is red. Banana is Yellow. " * 800 + "Apple is",
 ]

 dirpath = os.path.dirname(__file__)
@@ -63,44 +63,37 @@ class HFRunner:
     def __init__(
         self,
         model_path,
-        torch_dtype
-
+        torch_dtype,
+        is_generation,
     ):
-        self.
-
+        self.is_generation = is_generation
+
+        self.in_queue = mp.Queue()
+        self.out_queue = mp.Queue()

-        self.model_proc = multiprocessing.Process(
+        self.model_proc = mp.Process(
             target=self.start_model_process,
             args=(
                 self.in_queue,
                 self.out_queue,
                 model_path,
                 torch_dtype,
-                is_generation_model,
             ),
         )
         self.model_proc.start()

-    def start_model_process(
-        self, in_queue, out_queue, model_path, torch_dtype, is_generation_model
-    ):
+    def start_model_process(self, in_queue, out_queue, model_path, torch_dtype):
         self.tokenizer = AutoTokenizer.from_pretrained(
             model_path,
             torch_dtype=torch_dtype,
-            trust_remote_code=True,
         )

-        self.is_generation_model = (
-            is_generation_model(model_path)
-            if is_generation_model is None
-            else is_generation_model
-        )
-        if self.is_generation_model:
+        if self.is_generation:
             self.model = AutoModelForCausalLM.from_pretrained(
                 model_path,
                 torch_dtype=torch_dtype,
+                trust_remote_code=False,
                 low_cpu_mem_usage=True,
-                trust_remote_code=True,
             ).cuda()
         else:
             from sentence_transformers import SentenceTransformer
@@ -113,7 +106,7 @@ class HFRunner:
         while True:
             prompts, max_new_tokens = in_queue.get()
             if prompts is not None:
-                if self.is_generation_model:
+                if self.is_generation:
                     output_strs = []
                     prefill_logprobs = []
                     for p in prompts:
@@ -176,22 +169,20 @@ class SRTRunner:
     def __init__(
         self,
         model_path,
+        torch_dtype,
+        is_generation,
         tp_size=1,
-
-        is_generation_model=None,
-        port=5157,
+        port=DEFAULT_PORT_FOR_SRT_TEST_RUNNER,
     ):
-        self.is_generation_model = (
-            is_generation_model(model_path)
-            if is_generation_model is None
-            else is_generation_model
-        )
+        self.is_generation = is_generation
         self.runtime = Runtime(
             model_path=model_path,
             tp_size=tp_size,
             dtype=get_dtype_str(torch_dtype),
             port=port,
-            mem_fraction_static=0.
+            mem_fraction_static=0.69,
+            trust_remote_code=False,
+            is_embedding=not self.is_generation,
         )

     def forward(
@@ -199,7 +190,7 @@ class SRTRunner:
         prompts: Union[List[str], List[torch.Tensor]] = DEFAULT_PROMPTS,
         max_new_tokens=8,
     ):
-        if self.is_generation_model:
+        if self.is_generation:
             # the return value contains logprobs from prefill
             output_strs = []
             top_input_logprobs = []
@@ -209,6 +200,7 @@ class SRTRunner:
                 prompt,
                 sampling_params=sampling_params,
                 return_logprob=True,
+                logprob_start_len=0,
                 top_logprobs_num=NUM_TOP_LOGPROBS,
             )
             response = json.loads(response)
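With the signature change above, both runners now take torch_dtype and is_generation explicitly instead of inferring them from the model path, and SRTRunner forwards is_embedding=not is_generation to the Runtime. A hypothetical invocation based only on the signatures shown in this diff; the model name reuses DEFAULT_MODEL_NAME_FOR_TEST from test_utils, and the prompt and token budget are arbitrary:

# Hypothetical usage; arguments mirror the new SRTRunner.__init__ shown above.
import torch

from sglang.test.runners import SRTRunner

runner = SRTRunner(
    "meta-llama/Meta-Llama-3.1-8B-Instruct",
    torch_dtype=torch.float16,
    is_generation=True,  # embedding models would pass False, which sets is_embedding on the Runtime
)
outputs = runner.forward(
    ["Today is a sunny day and I like"],  # one of the DEFAULT_PROMPTS
    max_new_tokens=8,
)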
sglang/test/simple_eval_common.py
CHANGED
@@ -1,13 +1,12 @@
 # Adapted from https://github.com/openai/simple-evals/

-import base64
 import os
 import resource
 import time
 from collections import defaultdict
 from dataclasses import dataclass, field
 from multiprocessing.pool import ThreadPool
-from typing import Any, Dict, List, Tuple
+from typing import Any, Dict, List, Optional, Tuple

 import httpx
 import jinja2
@@ -44,8 +43,8 @@ class EvalResult:
     Result of running an evaluation (usually consisting of many samples)
     """

-    score: float
-    metrics: Dict[str, float]
+    score: Optional[float]  # top-line metric
+    metrics: Optional[Dict[str, float]]  # other metrics
     htmls: List[str]  # strings of valid HTML
     convos: List[MessageList]  # sampled conversations

@@ -56,10 +55,10 @@ class SingleEvalResult:
     Result of evaluating a single sample
     """

-    score: float
+    score: Optional[float]
     metrics: Dict[str, float] = field(default_factory=dict)
-    html: str
-    convo: MessageList
+    html: Optional[str] = None
+    convo: Optional[MessageList] = None  # sampled conversation


 class Eval:
@@ -89,8 +88,8 @@ class ChatCompletionSampler(SamplerBase):
     def __init__(
         self,
         base_url: str = None,
-        model: str
-        system_message: str
+        model: Optional[str] = None,
+        system_message: Optional[str] = None,
         temperature: float = 0.0,
         max_tokens: int = 2048,
     ):
@@ -272,7 +271,7 @@ def _compute_stat(values: list, stat: str):
 def aggregate_results(
     single_eval_results: List[SingleEvalResult],
     default_stats: Tuple[str] = ("mean", "std"),
-    name2stats: Dict[str, Tuple[str]]
+    name2stats: Optional[Dict[str, Tuple[str]]] = None,
 ) -> EvalResult:
     """
     Aggregate results from multiple evaluations into a single EvalResult.
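Since score, metrics, html and convo are now Optional, a per-sample result can be constructed from a score alone and the remaining fields filled in later by the eval harness. Illustrative only, using the dataclasses as declared above:

# Illustrative: html and convo default to None after this change.
from sglang.test.simple_eval_common import SingleEvalResult

res = SingleEvalResult(score=1.0, metrics={"answer_chars": 120.0})
assert res.html is None and res.convo is None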
sglang/test/simple_eval_gpqa.py
CHANGED
@@ -8,6 +8,7 @@ https://arxiv.org/abs/2311.12022

 import random
 import re
+from typing import Optional

 import pandas

@@ -28,7 +29,7 @@ class GPQAEval(Eval):
     def __init__(
         self,
         filename: str,
-        num_examples: int,
+        num_examples: Optional[int],
         num_threads: int,
         n_repeats: int = 1,
     ):
sglang/test/simple_eval_humaneval.py
CHANGED
@@ -9,7 +9,7 @@ https://arxiv.org/abs/2107.03374 https://github.com/openai/human-eval/
 import random
 import re
 from concurrent.futures import ThreadPoolExecutor, as_completed
-from typing import Dict, List
+from typing import Dict, List, Optional

 import tqdm

@@ -61,7 +61,7 @@ def evaluate_functional_correctness(
 class HumanEval(Eval):
     def __init__(
         self,
-        num_examples: int,
+        num_examples: Optional[int],
         num_threads: int,
         num_samples_per_task: int = 5,
         ks_passes: List[int] = [1, 2, 5],
sglang/test/simple_eval_math.py
CHANGED
@@ -8,6 +8,7 @@ https://arxiv.org/abs/2103.03874

 import random
 import re
+from typing import Optional

 import pandas

@@ -36,7 +37,7 @@ class MathEval(Eval):
         self,
         filename: str,
         equality_checker: SamplerBase,
-        num_examples: int,
+        num_examples: Optional[int],
         num_threads: int,
     ):
         df = pandas.read_csv(filename)
sglang/test/simple_eval_mmlu.py
CHANGED
@@ -8,6 +8,7 @@ https://arxiv.org/abs/2009.03300

 import random
 import re
+from typing import Optional

 import pandas

@@ -84,7 +85,7 @@ subject2category = {


 class MMLUEval(Eval):
-    def __init__(self, filename: str, num_examples: int, num_threads: int):
+    def __init__(self, filename: str, num_examples: Optional[int], num_threads: int):
         df = pandas.read_csv(filename)
         examples = [row.to_dict() for _, row in df.iterrows()]
         if num_examples:
sglang/test/test_activation.py
ADDED
@@ -0,0 +1,55 @@
+import itertools
+import unittest
+
+import torch
+
+from sglang.srt.layers.activation import GeluAndMul
+
+
+class TestGeluAndMul(unittest.TestCase):
+    DTYPES = [torch.half, torch.bfloat16]
+    NUM_TOKENS = [7, 83, 2048]
+    D = [512, 4096, 5120, 13824]
+    SEEDS = [0]
+
+    @classmethod
+    def setUpClass(cls):
+        if not torch.cuda.is_available():
+            raise unittest.SkipTest("CUDA is not available")
+        torch.set_default_device("cuda")
+
+    def _run_gelu_and_mul_test(self, num_tokens, d, dtype, seed):
+        torch.manual_seed(seed)
+
+        layer = GeluAndMul().to(dtype=dtype)
+        x = torch.randn(num_tokens, 2 * d, dtype=dtype)
+
+        with torch.inference_mode():
+            ref_out = layer.forward_native(x)
+            out = layer.forward_cuda(x)
+
+        if dtype == torch.bfloat16:
+            atol = rtol = 1e-2
+        else:
+            atol = rtol = 1e-3
+
+        self.assertTrue(torch.allclose(out, ref_out, atol=atol, rtol=rtol))
+
+    def test_gelu_and_mul(self):
+        for params in itertools.product(
+            self.NUM_TOKENS,
+            self.D,
+            self.DTYPES,
+            self.SEEDS,
+        ):
+            with self.subTest(
+                num_tokens=params[0],
+                d=params[1],
+                dtype=params[2],
+                seed=params[3],
+            ):
+                self._run_gelu_and_mul_test(*params)
+
+
+if __name__ == "__main__":
+    unittest.main(verbosity=2)
sglang/test/test_utils.py
CHANGED
@@ -2,11 +2,10 @@

 import argparse
 import asyncio
-import
+import os
 import subprocess
 import threading
 import time
-import unittest
 from functools import partial
 from typing import Callable, List, Optional

@@ -18,14 +17,19 @@ import torch.nn.functional as F
 from sglang.global_config import global_config
 from sglang.lang.backend.openai import OpenAI
 from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
+from sglang.srt.utils import kill_child_process
 from sglang.utils import get_exception_traceback

 DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Meta-Llama-3.1-8B-Instruct"
 DEFAULT_MOE_MODEL_NAME_FOR_TEST = "mistralai/Mixtral-8x7B-Instruct-v0.1"
-
-
-
-
+DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH = 600
+
+if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
+    DEFAULT_PORT_FOR_SRT_TEST_RUNNER = 5157
+    DEFAULT_URL_FOR_TEST = "http://127.0.0.1:6157"
+else:
+    DEFAULT_PORT_FOR_SRT_TEST_RUNNER = 1157
+    DEFAULT_URL_FOR_TEST = "http://127.0.0.1:2157"


 def call_generate_lightllm(prompt, temperature, max_tokens, stop=None, url=None):
@@ -104,31 +108,8 @@ def call_generate_srt_raw(prompt, temperature, max_tokens, stop=None, url=None):
     return pred


-def call_generate_ginfer(prompt, temperature, max_tokens, stop=None, url=None):
-
-    from ginfer import sampler_pb2, sampler_pb2_grpc
-
-    sampler_channel = grpc.insecure_channel(url.replace("http://", ""))
-    sampler = sampler_pb2_grpc.SamplerStub(sampler_channel)
-
-    if stop is None:
-        stop_strings = None
-    else:
-        stop_strings = [stop]
-
-    sample_request = sampler_pb2.SampleTextRequest(
-        prompt=prompt,
-        settings=sampler_pb2.SampleSettings(
-            max_len=max_tokens,
-            rng_seed=0,
-            temperature=max(temperature, 1e-7),
-            nucleus_p=1,
-            stop_strings=stop_strings,
-        ),
-    )
-    stream = sampler.SampleText(sample_request)
-    response = "".join([x.text for x in stream])
-    return response
+def call_generate_gserver(prompt, temperature, max_tokens, stop=None, url=None):
+    raise NotImplementedError()


 def call_generate_guidance(
@@ -271,7 +252,7 @@ def add_common_other_args_and_parse(parser: argparse.ArgumentParser):
             "vllm",
             "outlines",
             "lightllm",
-            "ginfer",
+            "gserver",
             "guidance",
             "lmql",
             "srt-raw",
@@ -292,7 +273,7 @@ def add_common_other_args_and_parse(parser: argparse.ArgumentParser):
         "lightllm": 22000,
         "lmql": 23000,
         "srt-raw": 30000,
-        "ginfer": 9988,
+        "gserver": 9988,
     }
     args.port = default_port.get(args.backend, None)
     return args
@@ -328,8 +309,8 @@ def _get_call_generate(args: argparse.Namespace):
         return partial(call_generate_vllm, url=f"{args.host}:{args.port}/generate")
     elif args.backend == "srt-raw":
         return partial(call_generate_srt_raw, url=f"{args.host}:{args.port}/generate")
-    elif args.backend == "ginfer":
-        return partial(call_generate_ginfer, url=f"{args.host}:{args.port}")
+    elif args.backend == "gserver":
+        return partial(call_generate_gserver, url=f"{args.host}:{args.port}")
     elif args.backend == "outlines":
         return partial(call_generate_outlines, url=f"{args.host}:{args.port}/generate")
     elif args.backend == "guidance":
@@ -480,34 +461,36 @@ def run_unittest_files(files: List[str], timeout_per_file: float):
     success = True

     for filename in files:
+        global process

-        def
-
-
-
-
-
-
-
-            p.join()
+        def run_one_file(filename):
+            filename = os.path.join(os.getcwd(), filename)
+            print(f"\n\nRun:\npython3 {filename}\n\n", flush=True)
+            process = subprocess.Popen(
+                ["python3", filename], stdout=None, stderr=None, env=os.environ
+            )
+            process.wait()
+            return process.returncode

         try:
-            run_with_timeout(
-
-
-
+            ret_code = run_with_timeout(
+                run_one_file, args=(filename,), timeout=timeout_per_file
+            )
+            assert ret_code == 0
         except TimeoutError:
-
+            kill_child_process(process.pid)
             time.sleep(5)
             print(
-                f"\nTimeout after {timeout_per_file} seconds when running {filename}\n"
+                f"\nTimeout after {timeout_per_file} seconds when running {filename}\n",
+                flush=True,
             )
-
+            success = False
+            break

     if success:
-        print(f"Success. Time elapsed: {time.time() - tic:.2f}s")
+        print(f"Success. Time elapsed: {time.time() - tic:.2f}s", flush=True)
     else:
-        print(f"Fail. Time elapsed: {time.time() - tic:.2f}s")
+        print(f"Fail. Time elapsed: {time.time() - tic:.2f}s", flush=True)

     return 0 if success else -1
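Two behavioral points stand out in the hunks above: the default ports now depend on the SGLANG_IS_IN_CI environment variable (resolved at import time), and each unittest file runs in its own python3 subprocess that is killed via kill_child_process on timeout. A hedged sketch of how a caller might drive run_unittest_files with these defaults; the file list and timeout value here are hypothetical:

# Hypothetical driver; only run_unittest_files and the CI flag come from the diff above.
import os

# The CI flag must be set before importing test_utils, since the default ports
# are chosen at import time (5157/6157 in CI, 1157/2157 otherwise).
os.environ.setdefault("SGLANG_IS_IN_CI", "false")

from sglang.test.test_utils import run_unittest_files

# Each file is launched as `python3 <file>` in its own subprocess and killed
# via kill_child_process if it exceeds the per-file timeout.
exit_code = run_unittest_files(["test_activation.py"], timeout_per_file=600)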
sglang/version.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.2.13"
+__version__ = "0.2.14"