sglang 0.4.5.post3__py3-none-any.whl → 0.4.6.post1__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
- sglang/bench_one_batch.py +19 -3
- sglang/bench_serving.py +8 -9
- sglang/compile_deep_gemm.py +45 -4
- sglang/srt/code_completion_parser.py +1 -1
- sglang/srt/configs/deepseekvl2.py +1 -1
- sglang/srt/configs/model_config.py +9 -3
- sglang/srt/constrained/llguidance_backend.py +78 -61
- sglang/srt/conversation.py +34 -1
- sglang/srt/disaggregation/decode.py +67 -13
- sglang/srt/disaggregation/fake/__init__.py +1 -0
- sglang/srt/disaggregation/fake/conn.py +88 -0
- sglang/srt/disaggregation/mini_lb.py +45 -8
- sglang/srt/disaggregation/mooncake/conn.py +198 -31
- sglang/srt/disaggregation/prefill.py +36 -12
- sglang/srt/disaggregation/utils.py +16 -2
- sglang/srt/entrypoints/engine.py +9 -0
- sglang/srt/entrypoints/http_server.py +35 -4
- sglang/srt/function_call_parser.py +77 -5
- sglang/srt/layers/attention/base_attn_backend.py +3 -0
- sglang/srt/layers/attention/cutlass_mla_backend.py +278 -0
- sglang/srt/layers/attention/flashattention_backend.py +28 -10
- sglang/srt/layers/attention/flashmla_backend.py +8 -11
- sglang/srt/layers/attention/utils.py +1 -1
- sglang/srt/layers/attention/vision.py +2 -0
- sglang/srt/layers/layernorm.py +38 -16
- sglang/srt/layers/logits_processor.py +2 -2
- sglang/srt/layers/moe/fused_moe_native.py +2 -4
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=192,device_name=NVIDIA_H20.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=192,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=384,device_name=NVIDIA_H20.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=384,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_H20.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=96,device_name=NVIDIA_H20.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +41 -41
- sglang/srt/layers/moe/fused_moe_triton/configs/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +20 -17
- sglang/srt/layers/moe/fused_moe_triton/layer.py +15 -17
- sglang/srt/layers/pooler.py +6 -0
- sglang/srt/layers/quantization/awq.py +5 -1
- sglang/srt/layers/quantization/deep_gemm.py +17 -10
- sglang/srt/layers/quantization/fp8.py +20 -22
- sglang/srt/layers/quantization/fp8_utils.py +2 -2
- sglang/srt/layers/quantization/int8_kernel.py +32 -1
- sglang/srt/layers/radix_attention.py +13 -3
- sglang/srt/layers/rotary_embedding.py +170 -126
- sglang/srt/managers/data_parallel_controller.py +10 -3
- sglang/srt/managers/io_struct.py +7 -0
- sglang/srt/managers/mm_utils.py +85 -28
- sglang/srt/managers/multimodal_processors/base_processor.py +14 -1
- sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +9 -2
- sglang/srt/managers/multimodal_processors/gemma3.py +2 -5
- sglang/srt/managers/multimodal_processors/janus_pro.py +2 -2
- sglang/srt/managers/multimodal_processors/minicpm.py +4 -3
- sglang/srt/managers/multimodal_processors/qwen_vl.py +38 -13
- sglang/srt/managers/schedule_batch.py +38 -12
- sglang/srt/managers/scheduler.py +41 -28
- sglang/srt/managers/scheduler_output_processor_mixin.py +25 -9
- sglang/srt/managers/tokenizer_manager.py +5 -1
- sglang/srt/managers/tp_worker.py +3 -3
- sglang/srt/managers/tp_worker_overlap_thread.py +9 -4
- sglang/srt/mem_cache/memory_pool.py +87 -0
- sglang/srt/model_executor/cuda_graph_runner.py +4 -3
- sglang/srt/model_executor/forward_batch_info.py +51 -95
- sglang/srt/model_executor/model_runner.py +19 -25
- sglang/srt/models/deepseek.py +12 -2
- sglang/srt/models/deepseek_nextn.py +101 -6
- sglang/srt/models/deepseek_v2.py +144 -70
- sglang/srt/models/deepseek_vl2.py +9 -4
- sglang/srt/models/gemma3_causal.py +1 -1
- sglang/srt/models/llama4.py +0 -1
- sglang/srt/models/minicpmo.py +5 -1
- sglang/srt/models/mllama4.py +2 -2
- sglang/srt/models/qwen2_5_vl.py +3 -6
- sglang/srt/models/qwen2_vl.py +3 -7
- sglang/srt/models/roberta.py +178 -0
- sglang/srt/openai_api/adapter.py +50 -11
- sglang/srt/openai_api/protocol.py +2 -0
- sglang/srt/reasoning_parser.py +25 -1
- sglang/srt/server_args.py +31 -24
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +3 -3
- sglang/srt/torch_memory_saver_adapter.py +10 -1
- sglang/srt/utils.py +5 -1
- sglang/test/runners.py +6 -13
- sglang/test/send_one.py +84 -28
- sglang/test/test_utils.py +74 -18
- sglang/version.py +1 -1
- {sglang-0.4.5.post3.dist-info → sglang-0.4.6.post1.dist-info}/METADATA +5 -6
- {sglang-0.4.5.post3.dist-info → sglang-0.4.6.post1.dist-info}/RECORD +97 -80
- {sglang-0.4.5.post3.dist-info → sglang-0.4.6.post1.dist-info}/WHEEL +1 -1
- {sglang-0.4.5.post3.dist-info → sglang-0.4.6.post1.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.5.post3.dist-info → sglang-0.4.6.post1.dist-info}/top_level.txt +0 -0
sglang/test/send_one.py
CHANGED
@@ -6,11 +6,56 @@ python3 -m sglang.test.send_one
 """
 
 import argparse
+import dataclasses
 import json
 
 import requests
 
 
+@dataclasses.dataclass
+class BenchArgs:
+    host: str = "localhost"
+    port: int = 30000
+    batch_size: int = 1
+    temperature: float = 0.0
+    max_new_tokens: int = 512
+    frequency_penalty: float = 0.0
+    presence_penalty: float = 0.0
+    json: bool = False
+    return_logprob: bool = False
+    prompt: str = (
+        "Human: Give me a fully functional FastAPI server. Show the python code.\n\nAssistant:"
+    )
+    image: bool = False
+    stream: bool = False
+
+    @staticmethod
+    def add_cli_args(parser: argparse.ArgumentParser):
+        parser.add_argument("--host", type=str, default=BenchArgs.host)
+        parser.add_argument("--port", type=int, default=BenchArgs.port)
+        parser.add_argument("--batch-size", type=int, default=BenchArgs.batch_size)
+        parser.add_argument("--temperature", type=float, default=BenchArgs.temperature)
+        parser.add_argument(
+            "--max-new-tokens", type=int, default=BenchArgs.max_new_tokens
+        )
+        parser.add_argument(
+            "--frequency-penalty", type=float, default=BenchArgs.frequency_penalty
+        )
+        parser.add_argument(
+            "--presence-penalty", type=float, default=BenchArgs.presence_penalty
+        )
+        parser.add_argument("--json", action="store_true")
+        parser.add_argument("--return-logprob", action="store_true")
+        parser.add_argument("--prompt", type=str, default=BenchArgs.prompt)
+        parser.add_argument("--image", action="store_true")
+        parser.add_argument("--stream", action="store_true")
+
+    @classmethod
+    def from_cli_args(cls, args: argparse.Namespace):
+        attrs = [attr.name for attr in dataclasses.fields(cls)]
+        return cls(**{attr: getattr(args, attr) for attr in attrs})
+
+
 def send_one_prompt(args):
     if args.image:
         args.prompt = (
@@ -20,20 +65,42 @@ def send_one_prompt(args):
     else:
         image_data = None
 
-    response = requests.post(
-        "http://localhost:30000/generate",
-        json={
-            "text": args.prompt,
-            "image_data": image_data,
-            "sampling_params": {
-                "temperature": args.temperature,
-                "max_new_tokens": args.max_new_tokens,
-                "frequency_penalty": args.frequency_penalty,
-                "presence_penalty": args.presence_penalty,
-            },
-            "return_logprob": args.return_logprob,
-            "stream": args.stream,
+    prompt = args.prompt
+
+    if args.json:
+        prompt = (
+            "Human: What is the capital of France and how is that city like. "
+            "Give me 3 trivial information about that city. "
+            "Write in a format of json.\nAssistant:"
+        )
+        json_schema = "$$ANY$$"
+        json_schema = (
+            '{"type": "object", "properties": {"population": {"type": "integer"}}}'
+        )
+    else:
+        json_schema = None
+
+    if args.batch_size > 1:
+        prompt = [prompt] * args.batch_size
+
+    json_data = {
+        "text": prompt,
+        "image_data": image_data,
+        "sampling_params": {
+            "temperature": args.temperature,
+            "max_new_tokens": args.max_new_tokens,
+            "frequency_penalty": args.frequency_penalty,
+            "presence_penalty": args.presence_penalty,
+            "json_schema": json_schema,
+            "stop": ["Question", "Assistant:", "<|separator|>", "<|eos|>"],
         },
+        "return_logprob": args.return_logprob,
+        "stream": args.stream,
+    }
+
+    response = requests.post(
+        f"http://{args.host}:{args.port}/generate",
+        json=json_data,
         stream=args.stream,
     )
 
@@ -47,6 +114,9 @@ def send_one_prompt(args):
     else:
         ret = response.json()
 
+    if args.batch_size > 1:
+        ret = ret[0]
+
     latency = ret["meta_info"]["e2e_latency"]
 
     if "spec_verify_ct" in ret["meta_info"]:
@@ -68,21 +138,7 @@ def send_one_prompt(args):
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument("--temperature", type=float, default=0.0)
-    parser.add_argument("--max-new-tokens", type=int, default=512)
-    parser.add_argument("--frequency-penalty", type=float, default=0.0)
-    parser.add_argument("--presence-penalty", type=float, default=0.0)
-    parser.add_argument("--return-logprob", action="store_true")
-    parser.add_argument(
-        "--prompt",
-        type=str,
-        default="Human: Give me a fully functional FastAPI server. Show the python code.\n\nAssistant:",
-    )
-    parser.add_argument(
-        "--image",
-        action="store_true",
-    )
-    parser.add_argument("--stream", action="store_true")
+    BenchArgs.add_cli_args(parser)
    args = parser.parse_args()
 
     send_one_prompt(args)
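The new BenchArgs dataclass replaces the loose argparse flags, so the script can also be driven programmatically. A minimal sketch, assuming a server started by this version is already listening; the flag values below are illustrative, not part of the diff:

    import argparse

    from sglang.test.send_one import BenchArgs, send_one_prompt

    # Build the same parser the __main__ block now builds via add_cli_args.
    parser = argparse.ArgumentParser()
    BenchArgs.add_cli_args(parser)

    # Illustrative values: a 4-prompt batch with JSON-constrained decoding.
    args = parser.parse_args(["--batch-size", "4", "--json", "--port", "30000"])
    send_one_prompt(args)

BenchArgs.from_cli_args(args) converts the parsed namespace back into the dataclass, mirroring the from_cli_args pattern that sglang's ServerArgs uses.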
sglang/test/test_utils.py
CHANGED
@@ -8,7 +8,6 @@ import random
 import subprocess
 import threading
 import time
-import traceback
 import unittest
 from concurrent.futures import ThreadPoolExecutor
 from dataclasses import dataclass
@@ -34,27 +33,44 @@ from sglang.srt.utils import (
 from sglang.test.run_eval import run_eval
 from sglang.utils import get_exception_traceback
 
-
-DEFAULT_FP8_MODEL_NAME_FOR_ACCURACY_TEST = "neuralmagic/Meta-Llama-3-8B-Instruct-FP8"
-DEFAULT_FP8_MODEL_NAME_FOR_DYNAMIC_QUANT_ACCURACY_TEST = (
-    "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8-dynamic"
-)
-DEFAULT_FP8_MODEL_NAME_FOR_MODELOPT_QUANT_ACCURACY_TEST = (
-    "nvidia/Llama-3.1-8B-Instruct-FP8"
-)
-
+# General test models
 DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.1-8B-Instruct"
 DEFAULT_SMALL_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.2-1B-Instruct"
 DEFAULT_MOE_MODEL_NAME_FOR_TEST = "mistralai/Mixtral-8x7B-Instruct-v0.1"
 DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST = "Qwen/Qwen1.5-MoE-A2.7B"
-
+
+# MLA test models
 DEFAULT_MLA_MODEL_NAME_FOR_TEST = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"
 DEFAULT_MLA_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8"
+DEFAULT_MODEL_NAME_FOR_TEST_MLA = "lmsys/sglang-ci-dsv3-test"
+DEFAULT_MODEL_NAME_FOR_TEST_MLA_NEXTN = "lmsys/sglang-ci-dsv3-test-NextN"
+
+# FP8 models
+DEFAULT_MODEL_NAME_FOR_TEST_FP8 = "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"
+DEFAULT_MODEL_NAME_FOR_ACCURACY_TEST_FP8 = "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"
+DEFAULT_MODEL_NAME_FOR_DYNAMIC_QUANT_ACCURACY_TEST_FP8 = (
+    "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8-dynamic"
+)
+DEFAULT_MODEL_NAME_FOR_MODELOPT_QUANT_ACCURACY_TEST_FP8 = (
+    "nvidia/Llama-3.1-8B-Instruct-FP8"
+)
+
+# EAGLE
+DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST = "meta-llama/Llama-2-7b-chat-hf"
+DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST = "lmsys/sglang-EAGLE-llama2-chat-7B"
+DEFAULT_MODEL_NAME_FOR_TEST_EAGLE3 = "jamesliu1/sglang-EAGLE3-Llama-3.1-Instruct-8B"
+
+# Other use cases
+DEFAULT_MODEL_NAME_FOR_TEST_LOCAL_ATTENTION = (
+    "meta-llama/Llama-4-Scout-17B-16E-Instruct"
+)
+DEFAULT_SMALL_EMBEDDING_MODEL_NAME_FOR_TEST = "Alibaba-NLP/gte-Qwen2-1.5B-instruct"
 DEFAULT_REASONING_MODEL_NAME_FOR_TEST = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
 DEFAULT_AWQ_MOE_MODEL_NAME_FOR_TEST = (
     "hugging-quants/Mixtral-8x7B-Instruct-v0.1-AWQ-INT4"
 )
-
+
+# Nightly tests
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1 = "meta-llama/Llama-3.1-8B-Instruct,mistralai/Mistral-7B-Instruct-v0.3,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct,google/gemma-2-27b-it"
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2 = "meta-llama/Llama-3.1-70B-Instruct,mistralai/Mixtral-8x7B-Instruct-v0.1,Qwen/Qwen2-57B-A14B-Instruct"
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1 = "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8,neuralmagic/Mistral-7B-Instruct-v0.3-FP8,neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8,neuralmagic/gemma-2-2b-it-FP8"
@@ -63,12 +79,11 @@ DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_QUANT_TP1 = "hugging-quants/Meta-Llama-3.1-8
 DEFAULT_SMALL_MODEL_NAME_FOR_TEST_QWEN = "Qwen/Qwen2.5-1.5B-Instruct"
 DEFAULT_SMALL_VLM_MODEL_NAME = "Qwen/Qwen2-VL-2B"
 
-DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST = "meta-llama/Llama-2-7b-chat-hf"
-DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST = "lmsys/sglang-EAGLE-llama2-chat-7B"
-
 DEFAULT_IMAGE_URL = "https://github.com/sgl-project/sglang/blob/main/test/lang/example_image.png?raw=true"
 DEFAULT_VIDEO_URL = "https://raw.githubusercontent.com/EvolvingLMMs-Lab/sglang/dev/onevision_local/assets/jobs.mp4"
 
+DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH = 1000
+
 
 def is_in_ci():
     """Return whether it is in CI runner."""
@@ -494,7 +509,7 @@ def run_unittest_files(files: List[TestFile], timeout_per_file: float):
     tic = time.time()
     success = True
 
-    for file in files:
+    for i, file in enumerate(files):
         filename, estimated_time = file.name, file.estimated_time
         process = None
 
@@ -502,7 +517,10 @@ def run_unittest_files(files: List[TestFile], timeout_per_file: float):
         nonlocal process
 
         filename = os.path.join(os.getcwd(), filename)
-        print(f".\n.\nBegin:\npython3 {filename}\n.\n.\n", flush=True)
+        print(
+            f".\n.\nBegin ({i}/{len(files) - 1}):\npython3 {filename}\n.\n.\n",
+            flush=True,
+        )
         tic = time.time()
 
         process = subprocess.Popen(
@@ -512,7 +530,7 @@ def run_unittest_files(files: List[TestFile], timeout_per_file: float):
         elapsed = time.time() - tic
 
         print(
-            f".\n.\nEnd:\n{filename=}, {elapsed=:.0f}, {estimated_time=}\n.\n.\n",
+            f".\n.\nEnd ({i}/{len(files) - 1}):\n{filename=}, {elapsed=:.0f}, {estimated_time=}\n.\n.\n",
             flush=True,
         )
         return process.returncode
@@ -714,6 +732,44 @@ def run_bench_one_batch(model, other_args):
     return output_throughput
 
 
+def run_bench_offline_throughput(model, other_args):
+    command = [
+        "python3",
+        "-m",
+        "sglang.bench_offline_throughput",
+        "--num-prompts",
+        "1",
+        "--dataset-name",
+        "random",
+        "--random-input-len",
+        "256",
+        "--random-output-len",
+        "256",
+        "--model-path",
+        model,
+        *[str(x) for x in other_args],
+    ]
+
+    print(f"{command=}")
+    process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+
+    try:
+        stdout, stderr = process.communicate()
+        output = stdout.decode()
+        error = stderr.decode()
+        print(f"Output: {output}", flush=True)
+        print(f"Error: {error}", flush=True)
+
+        output_throughput = -1
+        for line in output.split("\n"):
+            if "Last generation throughput (tok/s):" in line:
+                output_throughput = float(line.split(":")[-1])
+    finally:
+        kill_process_tree(process.pid)
+
+    return output_throughput
+
+
 def lcs(X, Y):
     m = len(X)
     n = len(Y)
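For context, the new run_bench_offline_throughput helper follows the same subprocess-and-scrape pattern as run_bench_one_batch above. A minimal usage sketch, assuming a machine that can actually load the model (the empty other_args is illustrative):

    from sglang.test.test_utils import (
        DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
        run_bench_offline_throughput,
    )

    # Runs `python3 -m sglang.bench_offline_throughput` in a subprocess and
    # scrapes "Last generation throughput (tok/s):" from its stdout.
    throughput = run_bench_offline_throughput(DEFAULT_SMALL_MODEL_NAME_FOR_TEST, [])
    assert throughput > 0, "marker line not found in benchmark output"

Note that the helper returns -1 when the marker line never appears, so callers should treat non-positive values as a failed run.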
sglang/version.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.4.5.post3"
+__version__ = "0.4.6.post1"
{sglang-0.4.5.post3.dist-info → sglang-0.4.6.post1.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sglang
-Version: 0.4.5.post3
+Version: 0.4.6.post1
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
 Version 2.0, January 2004
@@ -225,7 +225,7 @@ Requires-Dist: fastapi; extra == "runtime-common"
 Requires-Dist: hf_transfer; extra == "runtime-common"
 Requires-Dist: huggingface_hub; extra == "runtime-common"
 Requires-Dist: interegular; extra == "runtime-common"
-Requires-Dist: llguidance
+Requires-Dist: llguidance<0.8.0,>=0.7.11; extra == "runtime-common"
 Requires-Dist: modelscope; extra == "runtime-common"
 Requires-Dist: ninja; extra == "runtime-common"
 Requires-Dist: orjson; extra == "runtime-common"
@@ -242,11 +242,10 @@ Requires-Dist: torchao>=0.7.0; extra == "runtime-common"
 Requires-Dist: transformers==4.51.1; extra == "runtime-common"
 Requires-Dist: uvicorn; extra == "runtime-common"
 Requires-Dist: uvloop; extra == "runtime-common"
-Requires-Dist: compressed-tensors; extra == "runtime-common"
 Requires-Dist: xgrammar==0.1.17; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
-Requires-Dist: sgl-kernel==0.0
+Requires-Dist: sgl-kernel==0.1.0; extra == "srt"
 Requires-Dist: flashinfer_python==0.2.3; extra == "srt"
 Requires-Dist: torch==2.6.0; extra == "srt"
 Requires-Dist: torchvision==0.21.0; extra == "srt"
@@ -409,5 +408,5 @@ It is supported by the following institutions: AMD, Atlas Cloud, Baseten, Cursor
 
 For enterprises interested in adopting or deploying SGLang at scale, including technical consulting, sponsorship opportunities, or partnership inquiries, please contact us at contact@sglang.ai.
 
-## Acknowledgment
-We learned the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
+## Acknowledgment
+We learned the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
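The tightened pins above (llguidance, sgl-kernel) can be sanity-checked against an installed environment. A minimal sketch using importlib.metadata plus the third-party packaging library; the check itself is not part of sglang:

    from importlib.metadata import version

    from packaging.specifiers import SpecifierSet

    # Specifiers copied from the METADATA diff above.
    assert version("sglang") == "0.4.6.post1"
    assert version("sgl-kernel") == "0.1.0"
    assert SpecifierSet("<0.8.0,>=0.7.11").contains(version("llguidance"))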