sglang 0.4.1.post7__py3-none-any.whl → 0.4.2__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- sglang/bench_offline_throughput.py +17 -11
- sglang/bench_one_batch.py +14 -6
- sglang/bench_serving.py +47 -44
- sglang/lang/chat_template.py +31 -0
- sglang/srt/configs/load_config.py +1 -0
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +5 -2
- sglang/srt/entrypoints/engine.py +5 -2
- sglang/srt/entrypoints/http_server.py +24 -0
- sglang/srt/function_call_parser.py +494 -0
- sglang/srt/layers/activation.py +5 -5
- sglang/srt/layers/dp_attention.py +3 -1
- sglang/srt/layers/layernorm.py +5 -5
- sglang/srt/layers/linear.py +24 -9
- sglang/srt/layers/logits_processor.py +1 -1
- sglang/srt/layers/moe/ep_moe/layer.py +20 -12
- sglang/srt/layers/moe/fused_moe_native.py +17 -3
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +18 -1
- sglang/srt/layers/moe/fused_moe_triton/layer.py +9 -0
- sglang/srt/layers/parameter.py +16 -7
- sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/fp8.py +4 -1
- sglang/srt/layers/rotary_embedding.py +6 -1
- sglang/srt/layers/sampler.py +28 -8
- sglang/srt/layers/torchao_utils.py +12 -6
- sglang/srt/managers/detokenizer_manager.py +1 -0
- sglang/srt/managers/io_struct.py +36 -5
- sglang/srt/managers/schedule_batch.py +31 -25
- sglang/srt/managers/scheduler.py +61 -35
- sglang/srt/managers/tokenizer_manager.py +4 -0
- sglang/srt/model_executor/cuda_graph_runner.py +23 -25
- sglang/srt/model_executor/forward_batch_info.py +5 -7
- sglang/srt/model_executor/model_runner.py +7 -4
- sglang/srt/model_loader/loader.py +75 -0
- sglang/srt/model_loader/weight_utils.py +91 -5
- sglang/srt/models/commandr.py +14 -2
- sglang/srt/models/dbrx.py +9 -1
- sglang/srt/models/deepseek_v2.py +3 -3
- sglang/srt/models/gemma2.py +9 -1
- sglang/srt/models/grok.py +1 -0
- sglang/srt/models/minicpm3.py +3 -3
- sglang/srt/models/torch_native_llama.py +17 -4
- sglang/srt/openai_api/adapter.py +139 -37
- sglang/srt/openai_api/protocol.py +5 -4
- sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +11 -14
- sglang/srt/sampling/sampling_batch_info.py +4 -14
- sglang/srt/server.py +2 -2
- sglang/srt/server_args.py +20 -1
- sglang/srt/speculative/eagle_utils.py +37 -15
- sglang/srt/speculative/eagle_worker.py +11 -13
- sglang/srt/utils.py +62 -65
- sglang/test/test_programs.py +1 -0
- sglang/test/test_utils.py +81 -22
- sglang/version.py +1 -1
- {sglang-0.4.1.post7.dist-info → sglang-0.4.2.dist-info}/METADATA +7 -7
- {sglang-0.4.1.post7.dist-info → sglang-0.4.2.dist-info}/RECORD +67 -56
- {sglang-0.4.1.post7.dist-info → sglang-0.4.2.dist-info}/LICENSE +0 -0
- {sglang-0.4.1.post7.dist-info → sglang-0.4.2.dist-info}/WHEEL +0 -0
- {sglang-0.4.1.post7.dist-info → sglang-0.4.2.dist-info}/top_level.txt +0 -0
sglang/srt/utils.py
CHANGED
@@ -14,6 +14,7 @@
 """Common utilities."""
 
 import base64
+import ctypes
 import dataclasses
 import io
 import ipaddress
@@ -29,6 +30,7 @@ import shutil
 import signal
 import socket
 import subprocess
+import sys
 import tempfile
 import time
 import warnings
@@ -59,7 +61,6 @@ from triton.runtime.cache import (
     default_dump_dir,
     default_override_dir,
 )
-from uvicorn.config import LOGGING_CONFIG
 
 logger = logging.getLogger(__name__)
 
@@ -73,7 +74,7 @@ def is_hip() -> bool:
 
 
 def is_cuda():
-    return hasattr(torch, "cuda") and torch.cuda
+    return hasattr(torch, "cuda") and torch.version.cuda is not None
 
 
 def is_cuda_alike():
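The revised is_cuda() check keys off torch.version.cuda, which is a version string on CUDA builds of PyTorch but None on ROCm builds, even though the torch.cuda module also exists there. A quick illustrative check (not part of the diff):

```python
import torch

# On a CUDA wheel: torch.version.cuda is a string such as "12.1" and torch.version.hip is None.
# On a ROCm wheel: torch.version.cuda is None and torch.version.hip is set,
# while the torch.cuda module remains importable in both cases.
print(torch.version.cuda, torch.version.hip)
```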
@@ -773,7 +774,7 @@ def get_zmq_socket(
 
 
 def dump_to_file(dirpath, name, value):
-    from
+    from sglang.srt.distributed import get_tensor_model_parallel_rank
 
     if get_tensor_model_parallel_rank() != 0:
         return
@@ -1242,68 +1243,6 @@ def dataclass_to_string_truncated(data, max_length=2048):
     return str(data)
 
 
-TOOLS_TAG_LIST = ["<|plugin|>", "<function=", "<tool_call>", "<|python_tag|>"]
-
-
-def parse_tool_response(text, tools, **kwargs):
-    """Parse model response containing tool information.
-
-    Args:
-        text(str): model response in string format
-        tools(List): tools from user request
-    """
-    if "<|plugin|>" in text:  # internlm2
-        text, action = text.split("<|action_start|><|plugin|>")
-        action = action.split("<|action_end|>".strip())[0]
-        action = action[action.find("{") :]
-        action = json.loads(action)
-        name, parameters = action["name"], json.dumps(
-            action.get("parameters", action.get("arguments", {})), ensure_ascii=False
-        )
-        call_info_list = [(name, parameters)]
-    elif "<function=" in text:  # llama3.1
-        action, _ = text.split("</function>")
-        parameters = action[action.find("{") :]
-        name = action.split("<function=")[1].split(">{")[0]
-        call_info_list = [(name, parameters)]
-    elif "<tool_call>" in text and "</tool_call>" in text:  # qwen2.5
-        # get tool_call in text
-        pattern = r"<tool_call>(.*?)</tool_call>"
-        match_result_list = re.findall(pattern, text, re.DOTALL)
-        call_info_list = []
-        for match_result in match_result_list:
-            action = json.loads(match_result)
-            call_info_list.append(
-                (action["name"], json.dumps(action["arguments"], ensure_ascii=False))
-            )
-        # get text outside of tags
-        if not text.startswith("<tool_call>"):
-            text = text[: text.find("<tool_call>")]
-        elif not text.endswith("</tool_call>"):
-            text = text[text.rfind("</tool_call>") + len("</tool_call>") :]
-        else:
-            text = ""
-    elif "<|python_tag|>" in text:  # llama3.2
-        _, action = text.split("<|python_tag|>")
-        action = json.loads(action)
-        name, parameters = action["name"], json.dumps(
-            action.get("parameters", action.get("arguments", {})), ensure_ascii=False
-        )
-        call_info_list = [(name, parameters)]
-    else:
-        raise RuntimeError(f"Unexpected model response: {text}")
-
-    call_info_list = [
-        (
-            [tool.function.name for tool in tools].index(call_info[0]),
-            call_info[0],
-            call_info[1],
-        )
-        for call_info in call_info_list
-    ]
-    return text, call_info_list
-
-
 def permute_weight(x: torch.Tensor) -> torch.Tensor:
     b_ = x.shape[0]
     n_ = x.shape[1]
@@ -1366,7 +1305,33 @@ def nullable_str(val: str):
     return val
 
 
+def pyspy_dump_schedulers():
+    """py-spy dump on all scheduler in a local node."""
+    try:
+        pid = psutil.Process().pid
+        # Command to run py-spy with the PID
+        cmd = f"py-spy dump --pid {pid}"
+        result = subprocess.run(
+            cmd, shell=True, capture_output=True, text=True, check=True
+        )
+        logger.info(f"Profile for PID {pid}:\n{result.stdout}")
+    except subprocess.CalledProcessError as e:
+        logger.info(f"Failed to profile PID {pid}. Error: {e.stderr}")
+
+
+def kill_itself_when_parent_died():
+    if sys.platform == "linux":
+        # sigkill this process when parent worker manager dies
+        PR_SET_PDEATHSIG = 1
+        libc = ctypes.CDLL("libc.so.6")
+        libc.prctl(PR_SET_PDEATHSIG, signal.SIGKILL)
+    else:
+        logger.warninig("kill_itself_when_parent_died is only supported in linux.")
+
+
 def set_uvicorn_logging_configs():
+    from uvicorn.config import LOGGING_CONFIG
+
     LOGGING_CONFIG["formatters"]["default"][
         "fmt"
     ] = "[%(asctime)s] %(levelprefix)s %(message)s"
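Two of the helpers added above are process-management utilities: pyspy_dump_schedulers() shells out to `py-spy dump` for the current process, and kill_itself_when_parent_died() uses the Linux PR_SET_PDEATHSIG prctl so a child process is SIGKILLed when its parent exits. A minimal sketch of how a worker subprocess might wire them up (the entry-point name and the SIGUSR1 binding are assumptions for illustration, not taken from this diff):

```python
import signal
import time

from sglang.srt.utils import kill_itself_when_parent_died, pyspy_dump_schedulers


def scheduler_entry_point():
    # Hypothetical worker entry point. Ensure this process dies with its parent
    # (Linux only; elsewhere the helper just logs a warning).
    kill_itself_when_parent_died()

    # On SIGUSR1, dump this process's Python stacks via `py-spy dump --pid <pid>`,
    # which helps debug a hung scheduler without killing it.
    signal.signal(signal.SIGUSR1, lambda signum, frame: pyspy_dump_schedulers())

    while True:  # stand-in for the real scheduling loop
        time.sleep(1)
```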
@@ -1442,3 +1407,35 @@ def is_valid_ipv6_address(address: str) -> bool:
         return True
     except ValueError:
         return False
+
+
+def rank0_print(msg: str):
+    from sglang.srt.distributed import get_tensor_model_parallel_rank
+
+    if get_tensor_model_parallel_rank() == 0:
+        print(msg, flush=True)
+
+
+def launch_dummy_health_check_server(host, port):
+    import uvicorn
+    from fastapi import FastAPI, Response
+
+    app = FastAPI()
+
+    @app.get("/health")
+    async def health():
+        """Check the health of the http server."""
+        return Response(status_code=200)
+
+    @app.get("/health_generate")
+    async def health_generate():
+        """Check the health of the http server."""
+        return Response(status_code=200)
+
+    uvicorn.run(
+        app,
+        host=host,
+        port=port,
+        timeout_keep_alive=5,
+        loop="uvloop",
+    )
sglang/test/test_programs.py
CHANGED
@@ -535,6 +535,7 @@ def test_hellaswag_select():
 
     # Compute accuracy
     accuracy_gen = np.mean(np.array(preds_gen) == np.array(labels))
+    print(f"{accuracy=}, {accuracy_gen=}")
     assert np.abs(accuracy_gen - accuracy) < 0.05
     assert np.abs(latency_gen - latency) < 1
 
sglang/test/test_utils.py
CHANGED
@@ -34,7 +34,7 @@ DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST = "Qwen/Qwen1.5-MoE-A2.7B"
 DEFAULT_SMALL_EMBEDDING_MODEL_NAME_FOR_TEST = "Alibaba-NLP/gte-Qwen2-1.5B-instruct"
 DEFAULT_MLA_MODEL_NAME_FOR_TEST = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"
 DEFAULT_MLA_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8"
-DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH =
+DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH = 1000
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1 = "meta-llama/Llama-3.1-8B-Instruct,mistralai/Mistral-7B-Instruct-v0.3,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct,google/gemma-2-27b-it"
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2 = "meta-llama/Llama-3.1-70B-Instruct,mistralai/Mixtral-8x7B-Instruct-v0.1,Qwen/Qwen2-57B-A14B-Instruct"
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1 = "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8,neuralmagic/Mistral-7B-Instruct-v0.3-FP8,neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8,neuralmagic/gemma-2-2b-it-FP8"
@@ -42,6 +42,9 @@ DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2 = "neuralmagic/Meta-Llama-3.1-70B-In
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_QUANT_TP1 = "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4,hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4"
 DEFAULT_SMALL_MODEL_NAME_FOR_TEST_QWEN = "Qwen/Qwen2.5-1.5B-Instruct"
 
+DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST = "meta-llama/Llama-2-7b-chat-hf"
+DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST = "lmzheng/sglang-EAGLE-llama2-chat-7B"
+
 
 def is_in_ci():
     """Return whether it is in CI runner."""
@@ -132,10 +135,6 @@ def call_generate_srt_raw(prompt, temperature, max_tokens, stop=None, url=None):
     return pred
 
 
-def call_generate_gserver(prompt, temperature, max_tokens, stop=None, url=None):
-    raise NotImplementedError()
-
-
 def call_generate_guidance(
     prompt, temperature, max_tokens, stop=None, n=1, regex=None, model=None
 ):
@@ -527,6 +526,48 @@ def get_similarities(vec1, vec2):
     return F.cosine_similarity(torch.tensor(vec1), torch.tensor(vec2), dim=0)
 
 
+def get_benchmark_args(
+    base_url="",
+    dataset_name="",
+    dataset_path="",
+    tokenizer="",
+    num_prompts=500,
+    random_input_len=4096,
+    random_output_len=2048,
+    request_rate=float("inf"),
+    disable_stream=False,
+    disable_ignore_eos=False,
+):
+    return SimpleNamespace(
+        backend="sglang",
+        base_url=base_url,
+        host=None,
+        port=None,
+        dataset_name=dataset_name,
+        dataset_path=dataset_path,
+        model=None,
+        tokenizer=tokenizer,
+        num_prompts=num_prompts,
+        sharegpt_output_len=None,
+        sharegpt_context_len=None,
+        random_input_len=random_input_len,
+        random_output_len=random_output_len,
+        random_range_ratio=0.0,
+        request_rate=request_rate,
+        multi=None,
+        output_file=None,
+        disable_tqdm=False,
+        disable_stream=disable_stream,
+        return_logprob=False,
+        seed=0,
+        disable_ignore_eos=disable_ignore_eos,
+        extra_request_body=None,
+        apply_chat_template=False,
+        profile=None,
+        lora_name=None,
+    )
+
+
 def run_bench_serving(
     model,
     num_prompts,
@@ -538,6 +579,7 @@ def run_bench_serving(
     random_input_len=4096,
     random_output_len=2048,
     disable_stream=False,
+    disable_ignore_eos=False,
     need_warmup=False,
 ):
     # Launch the server
@@ -550,32 +592,17 @@
     )
 
     # Run benchmark
-    args =
-        backend="sglang",
+    args = get_benchmark_args(
         base_url=base_url,
-        host=None,
-        port=None,
         dataset_name=dataset_name,
         dataset_path=dataset_path,
-        model=None,
         tokenizer=tokenizer,
         num_prompts=num_prompts,
-        sharegpt_output_len=None,
-        sharegpt_context_len=None,
         random_input_len=random_input_len,
         random_output_len=random_output_len,
-        random_range_ratio=0.0,
         request_rate=request_rate,
-        multi=None,
-        seed=0,
-        output_file=None,
-        disable_tqdm=False,
         disable_stream=disable_stream,
-        disable_ignore_eos=
-        return_logprob=False,
-        lora_name=None,
-        extra_request_body=None,
-        profile=None,
+        disable_ignore_eos=disable_ignore_eos,
     )
 
     try:
@@ -591,6 +618,38 @@
     return res
 
 
+def run_bench_serving_multi(
+    model,
+    base_url,
+    other_server_args,
+    benchmark_args,
+    need_warmup=False,
+):
+    # Launch the server
+    process = popen_launch_server(
+        model,
+        base_url,
+        timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+        other_args=other_server_args,
+    )
+
+    # run benchmark for all
+    res_l = []
+    try:
+        for args in benchmark_args:
+            if need_warmup:
+                warmup_args = copy.deepcopy(args)
+                warmup_args.num_prompts = 16
+                run_benchmark(warmup_args)
+
+            res = run_benchmark(args)
+            res_l.append((args, res))
+    finally:
+        kill_process_tree(process.pid)
+
+    return res_l
+
+
 def run_bench_one_batch(model, other_args):
     command = [
         "python3",
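get_benchmark_args() and run_bench_serving_multi() together let a test launch one server and sweep several benchmark configurations against it, returning (args, result) pairs. A sketch of that usage, with an illustrative URL, dataset, and request rates (assumptions, not taken from this diff):

```python
from sglang.test.test_utils import (
    DEFAULT_SMALL_MODEL_NAME_FOR_TEST_QWEN,
    get_benchmark_args,
    run_bench_serving_multi,
)

base_url = "http://127.0.0.1:30000"

# Several benchmark configurations that differ only in request rate.
benchmark_args = [
    get_benchmark_args(
        base_url=base_url,
        dataset_name="random",
        num_prompts=200,
        request_rate=rate,
    )
    for rate in (4, 8, float("inf"))
]

# Launch one server, run every configuration against it, and collect (args, result) pairs.
results = run_bench_serving_multi(
    model=DEFAULT_SMALL_MODEL_NAME_FOR_TEST_QWEN,
    base_url=base_url,
    other_server_args=[],
    benchmark_args=benchmark_args,
    need_warmup=True,
)
```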
sglang/version.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.4.1.post7"
+__version__ = "0.4.2"
{sglang-0.4.1.post7.dist-info → sglang-0.4.2.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: sglang
-Version: 0.4.1.post7
+Version: 0.4.2
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
 Version 2.0, January 2004
@@ -240,7 +240,7 @@ Requires-Dist: xgrammar>=0.1.10; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
 Requires-Dist: cuda-python; extra == "srt"
-Requires-Dist: sgl-kernel>=0.0.
+Requires-Dist: sgl-kernel>=0.0.3; extra == "srt"
 Requires-Dist: torch; extra == "srt"
 Requires-Dist: vllm==0.6.4.post1; extra == "srt"
 Requires-Dist: flashinfer==0.1.6; extra == "srt"
@@ -333,16 +333,16 @@ Requires-Dist: sglang[test]; extra == "dev-cpu"
 | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |
 
 ## News
-- [
-- [2024/
-- [2024/09]
-- [2024/07] Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
+- [2025/01] 🔥 SGLang provides day one support for DeepSeek V3/R1 models on NVIDIA and AMD GPUs with DeekSeek-specific optimizations. ([instructions](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3), [AMD blog](https://www.amd.com/en/developer/resources/technical-articles/amd-instinct-gpus-power-deepseek-v3-revolutionizing-ai-development-with-sglang.html))
+- [2024/12] 🔥 v0.4 Release: Zero-Overhead Batch Scheduler, Cache-Aware Load Balancer, Faster Structured Outputs ([blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)).
+- [2024/09] v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
+- [2024/07] v0.2 Release: Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
 
 <details>
 <summary>More</summary>
 
+- [2024/10] The First SGLang Online Meetup ([slides](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#the-first-sglang-online-meetup)).
 - [2024/02] SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
-- [2024/04] SGLang is used by the official **LLaVA-NeXT (video)** release ([blog](https://llava-vl.github.io/blog/2024-04-30-llava-next-video/)).
 - [2024/01] SGLang provides up to **5x faster inference** with RadixAttention ([blog](https://lmsys.org/blog/2024-01-17-sglang/)).
 - [2024/01] SGLang powers the serving of the official **LLaVA v1.6** release demo ([usage](https://github.com/haotian-liu/LLaVA?tab=readme-ov-file#demo)).
 