sglang 0.4.6.post3__py3-none-any.whl → 0.4.6.post4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_offline_throughput.py +4 -2
- sglang/bench_one_batch.py +2 -2
- sglang/bench_one_batch_server.py +143 -15
- sglang/bench_serving.py +9 -7
- sglang/compile_deep_gemm.py +1 -1
- sglang/eval/loogle_eval.py +157 -0
- sglang/lang/chat_template.py +78 -78
- sglang/lang/tracer.py +1 -1
- sglang/srt/code_completion_parser.py +1 -1
- sglang/srt/configs/deepseekvl2.py +2 -2
- sglang/srt/configs/model_config.py +1 -0
- sglang/srt/constrained/base_grammar_backend.py +55 -72
- sglang/srt/constrained/llguidance_backend.py +25 -21
- sglang/srt/constrained/outlines_backend.py +27 -26
- sglang/srt/constrained/reasoner_grammar_backend.py +22 -33
- sglang/srt/constrained/xgrammar_backend.py +69 -43
- sglang/srt/conversation.py +48 -43
- sglang/srt/disaggregation/base/conn.py +1 -0
- sglang/srt/disaggregation/decode.py +7 -2
- sglang/srt/disaggregation/fake/conn.py +1 -1
- sglang/srt/disaggregation/mooncake/conn.py +227 -120
- sglang/srt/disaggregation/nixl/conn.py +1 -0
- sglang/srt/disaggregation/prefill.py +7 -4
- sglang/srt/disaggregation/utils.py +7 -1
- sglang/srt/entrypoints/engine.py +17 -2
- sglang/srt/entrypoints/http_server.py +17 -2
- sglang/srt/function_call_parser.py +2 -2
- sglang/srt/layers/attention/flashattention_backend.py +1 -1
- sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +1 -1
- sglang/srt/layers/attention/utils.py +4 -2
- sglang/srt/layers/dp_attention.py +71 -21
- sglang/srt/layers/layernorm.py +1 -1
- sglang/srt/layers/logits_processor.py +46 -11
- sglang/srt/layers/moe/ep_moe/kernels.py +1 -1
- sglang/srt/layers/moe/ep_moe/layer.py +1 -1
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +1 -1
- sglang/srt/layers/moe/topk.py +1 -1
- sglang/srt/layers/quantization/__init__.py +1 -1
- sglang/srt/layers/quantization/blockwise_int8.py +2 -2
- sglang/srt/layers/quantization/deep_gemm.py +72 -71
- sglang/srt/layers/quantization/fp8.py +2 -2
- sglang/srt/layers/quantization/fp8_kernel.py +3 -3
- sglang/srt/layers/quantization/int8_kernel.py +2 -2
- sglang/srt/layers/sampler.py +0 -4
- sglang/srt/layers/vocab_parallel_embedding.py +18 -7
- sglang/srt/lora/lora_manager.py +1 -1
- sglang/srt/lora/mem_pool.py +4 -4
- sglang/srt/lora/triton_ops/gate_up_lora_b.py +1 -1
- sglang/srt/lora/triton_ops/qkv_lora_b.py +1 -1
- sglang/srt/lora/triton_ops/sgemm_lora_a.py +1 -1
- sglang/srt/lora/triton_ops/sgemm_lora_b.py +1 -1
- sglang/srt/lora/utils.py +1 -1
- sglang/srt/managers/data_parallel_controller.py +3 -3
- sglang/srt/managers/detokenizer_manager.py +21 -8
- sglang/srt/managers/io_struct.py +3 -1
- sglang/srt/managers/mm_utils.py +1 -1
- sglang/srt/managers/multimodal_processors/llava.py +46 -0
- sglang/srt/managers/multimodal_processors/pixtral.py +127 -0
- sglang/srt/managers/schedule_batch.py +76 -24
- sglang/srt/managers/schedule_policy.py +0 -3
- sglang/srt/managers/scheduler.py +113 -88
- sglang/srt/managers/scheduler_output_processor_mixin.py +124 -55
- sglang/srt/managers/tokenizer_manager.py +133 -34
- sglang/srt/managers/tp_worker.py +12 -9
- sglang/srt/managers/tp_worker_overlap_thread.py +22 -11
- sglang/srt/mem_cache/memory_pool.py +2 -0
- sglang/srt/metrics/collector.py +312 -37
- sglang/srt/model_executor/cuda_graph_runner.py +10 -11
- sglang/srt/model_executor/forward_batch_info.py +1 -1
- sglang/srt/model_executor/model_runner.py +19 -14
- sglang/srt/models/deepseek_janus_pro.py +2 -2
- sglang/srt/models/deepseek_v2.py +23 -20
- sglang/srt/models/llama.py +2 -0
- sglang/srt/models/llama4.py +5 -6
- sglang/srt/models/llava.py +248 -5
- sglang/srt/models/mixtral.py +98 -34
- sglang/srt/models/pixtral.py +467 -0
- sglang/srt/models/roberta.py +1 -1
- sglang/srt/models/torch_native_llama.py +1 -1
- sglang/srt/openai_api/adapter.py +30 -4
- sglang/srt/openai_api/protocol.py +0 -8
- sglang/srt/reasoning_parser.py +3 -3
- sglang/srt/sampling/custom_logit_processor.py +18 -3
- sglang/srt/sampling/sampling_batch_info.py +4 -56
- sglang/srt/sampling/sampling_params.py +2 -2
- sglang/srt/server_args.py +34 -4
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +3 -3
- sglang/srt/speculative/eagle_utils.py +7 -7
- sglang/srt/speculative/eagle_worker.py +22 -19
- sglang/srt/utils.py +6 -5
- sglang/test/few_shot_gsm8k.py +2 -2
- sglang/test/few_shot_gsm8k_engine.py +2 -2
- sglang/test/run_eval.py +2 -2
- sglang/test/runners.py +8 -1
- sglang/test/send_one.py +13 -3
- sglang/test/simple_eval_common.py +1 -1
- sglang/test/simple_eval_humaneval.py +1 -1
- sglang/test/test_programs.py +5 -5
- sglang/test/test_utils.py +89 -14
- sglang/utils.py +1 -1
- sglang/version.py +1 -1
- {sglang-0.4.6.post3.dist-info → sglang-0.4.6.post4.dist-info}/METADATA +6 -5
- {sglang-0.4.6.post3.dist-info → sglang-0.4.6.post4.dist-info}/RECORD +107 -104
- /sglang/{llama3_eval.py → eval/llama3_eval.py} +0 -0
- {sglang-0.4.6.post3.dist-info → sglang-0.4.6.post4.dist-info}/WHEEL +0 -0
- {sglang-0.4.6.post3.dist-info → sglang-0.4.6.post4.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.post3.dist-info → sglang-0.4.6.post4.dist-info}/top_level.txt +0 -0
sglang/test/run_eval.py
CHANGED
@@ -71,9 +71,9 @@ def run_eval(args):
     )
 
     # Run eval
-    tic = time.time()
+    tic = time.perf_counter()
     result = eval_obj(sampler)
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
 
     # Dump reports
     metrics = result.metrics | {"score": result.score}
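This release swaps `time.time()` for `time.perf_counter()` throughout the test and benchmark code. `perf_counter()` is a monotonic, high-resolution clock that is unaffected by system clock adjustments (NTP steps, manual changes), which makes it the safer choice for measuring elapsed latency. A minimal sketch of the pattern; `do_work` is a stand-in for the benchmarked call:

```python
import time

def do_work():
    # stand-in for the benchmarked call, e.g. eval_obj(sampler) above
    time.sleep(0.1)

tic = time.perf_counter()            # monotonic start timestamp
do_work()
latency = time.perf_counter() - tic  # elapsed seconds, immune to wall-clock changes
print(f"latency: {latency:.3f}s")
```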
sglang/test/runners.py
CHANGED
@@ -19,7 +19,9 @@ from typing import List, Optional, Tuple, Union
 
 import torch
 import torch.nn.functional as F
+import transformers
 from transformers import (
+    AutoConfig,
     AutoModel,
     AutoModelForCausalLM,
     AutoModelForVision2Seq,
@@ -211,7 +213,12 @@ class HFRunner:
 
         # Load the model and tokenizer
         if self.model_type == "generation":
-            self.base_model = AutoModelForCausalLM.from_pretrained(
+            config = AutoConfig.from_pretrained(model_path)
+            if model_archs := getattr(config, "architectures"):
+                model_cls = getattr(transformers, model_archs[0])
+            else:
+                model_cls = AutoModelForCausalLM
+            self.base_model = model_cls.from_pretrained(
                 model_path,
                 torch_dtype=torch_dtype,
                 trust_remote_code=self.trust_remote_code,
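The HFRunner change resolves the concrete model class from `config.architectures` instead of always loading through `AutoModelForCausalLM`, so a checkpoint that declares a specific architecture is loaded with that class. A standalone sketch of the same lookup; the model id is a placeholder:

```python
import transformers
from transformers import AutoConfig, AutoModelForCausalLM

model_path = "gpt2"  # placeholder model id for illustration

config = AutoConfig.from_pretrained(model_path)
# config.architectures is e.g. ["GPT2LMHeadModel"]; fall back to the Auto class if absent
if model_archs := getattr(config, "architectures", None):
    model_cls = getattr(transformers, model_archs[0])
else:
    model_cls = AutoModelForCausalLM
model = model_cls.from_pretrained(model_path)
print(type(model).__name__)
```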
sglang/test/send_one.py
CHANGED
@@ -27,6 +27,7 @@ class BenchArgs:
         "Human: Give me a fully functional FastAPI server. Show the python code.\n\nAssistant:"
     )
     image: bool = False
+    many_images: bool = False
     stream: bool = False
 
     @staticmethod
@@ -48,6 +49,7 @@ class BenchArgs:
         parser.add_argument("--return-logprob", action="store_true")
         parser.add_argument("--prompt", type=str, default=BenchArgs.prompt)
         parser.add_argument("--image", action="store_true")
+        parser.add_argument("--many-images", action="store_true")
        parser.add_argument("--stream", action="store_true")
 
     @classmethod
@@ -62,6 +64,17 @@ def send_one_prompt(args):
             "Human: Describe this image in a very short sentence.\n\nAssistant:"
         )
         image_data = "https://raw.githubusercontent.com/sgl-project/sglang/main/test/lang/example_image.png"
+    elif args.many_images:
+        args.prompt = (
+            "Human: I have one reference image and many images."
+            "Describe their relationship in a very short sentence.\n\nAssistant:"
+        )
+        image_data = [
+            "https://raw.githubusercontent.com/sgl-project/sglang/main/test/lang/example_image.png",
+            "https://raw.githubusercontent.com/sgl-project/sglang/main/test/lang/example_image.png",
+            "https://raw.githubusercontent.com/sgl-project/sglang/main/test/lang/example_image.png",
+            "https://raw.githubusercontent.com/sgl-project/sglang/main/test/lang/example_image.png",
+        ]
     else:
         image_data = None
 
@@ -74,9 +87,6 @@ def send_one_prompt(args):
             "Write in a format of json.\nAssistant:"
         )
         json_schema = "$$ANY$$"
-        json_schema = (
-            '{"type": "object", "properties": {"population": {"type": "integer"}}}'
-        )
     else:
         json_schema = None
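The new `--many-images` mode exercises multi-image inputs by sending the same example image four times. A hedged sketch of the equivalent raw request against a running server; the port and payload shape are assumptions based on sglang's documented native `/generate` API:

```python
import requests

# Rough equivalent of what `--many-images` sends: image_data may be a list of URLs.
url = "http://localhost:30000/generate"  # assumes a locally launched sglang server
payload = {
    "text": (
        "Human: I have one reference image and many images."
        "Describe their relationship in a very short sentence.\n\nAssistant:"
    ),
    "image_data": [
        "https://raw.githubusercontent.com/sgl-project/sglang/main/test/lang/example_image.png",
    ] * 4,
    "sampling_params": {"max_new_tokens": 64},
}
print(requests.post(url, json=payload).json())
```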
sglang/test/simple_eval_common.py
CHANGED
@@ -140,7 +140,7 @@ class ChatCompletionSampler(SamplerBase):
                     max_tokens=self.max_tokens,
                 )
                 return response.choices[0].message.content
-            # NOTE: BadRequestError is triggered once for MMMU, please uncomment if you are
+            # NOTE: BadRequestError is triggered once for MMMU, please uncomment if you are rerunning MMMU
             except openai.BadRequestError as e:
                 print("Bad Request Error", e)
                 return ""
sglang/test/simple_eval_humaneval.py
CHANGED
@@ -121,7 +121,7 @@ class HumanEval(Eval):
                 convo=convo,
                 metrics={
                     f"pass@{k}": estimate_pass_at_k([total], [correct], k)
-                    # this will be
+                    # this will be aggregated so no need of .mean()
                     for k in self._ks_passes
                     if total >= k
                 },
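For context, the `estimate_pass_at_k` seen in this hunk is the standard unbiased pass@k estimator from the HumanEval paper: with n samples per problem and c of them correct, pass@k = 1 - C(n-c, k) / C(n, k). A minimal sketch of that computation; it mirrors the well-known reference implementation, not necessarily sglang's exact code:

```python
import numpy as np

def estimate_pass_at_k(num_samples, num_correct, k):
    """Unbiased pass@k: 1 - C(n-c, k) / C(n, k), computed per problem."""
    def estimator(n, c):
        if n - c < k:  # every size-k subset must contain a correct sample
            return 1.0
        # product form avoids computing large binomial coefficients directly
        return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))
    return np.array([estimator(n, c) for n, c in zip(num_samples, num_correct)])

print(estimate_pass_at_k([10], [3], k=1))  # [0.3]
```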
sglang/test/test_programs.py
CHANGED
@@ -370,7 +370,7 @@ def test_dtype_gen():
     @sgl.function
     def dtype_gen(s):
         s += "Q: What is the full name of DNS?\n"
-        s += "A: The full
+        s += "A: The full names is " + sgl.gen("str_res", dtype=str, stop="\n") + "\n"
         s += "Q: Which year was DNS invented?\n"
         s += "A: " + sgl.gen("int_res", dtype=int) + "\n"
         s += "Q: What is the value of pi?\n"
@@ -503,7 +503,7 @@ def test_hellaswag_select():
     #####################################
 
     # Run requests
-    tic = time.time()
+    tic = time.perf_counter()
     rets = few_shot_hellaswag.run_batch(
         arguments,
         temperature=0,
@@ -514,13 +514,13 @@ def test_hellaswag_select():
     preds = []
     for i, ret in enumerate(rets):
         preds.append(choices[i].index(ret["answer"]))
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
 
     # Compute accuracy
     accuracy = np.mean(np.array(preds) == np.array(labels))
 
     # Test generator style of run_batch
-    tic = time.time()
+    tic = time.perf_counter()
     rets = few_shot_hellaswag.run_batch(
         arguments,
         temperature=0,
@@ -531,7 +531,7 @@ def test_hellaswag_select():
     preds_gen = []
     for i, ret in enumerate(rets):
         preds_gen.append(choices[i].index(ret["answer"]))
-    latency_gen = time.time() - tic
+    latency_gen = time.perf_counter() - tic
 
     # Compute accuracy
     accuracy_gen = np.mean(np.array(preds_gen) == np.array(labels))
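The `dtype_gen` program above relies on `sgl.gen`'s `dtype` argument to constrain the decoded value to a Python type. A hedged usage sketch of the same pattern, assuming a server is already running at the placeholder endpoint:

```python
import sglang as sgl

@sgl.function
def typed_qa(s):
    s += "Q: Which year was DNS invented?\n"
    s += "A: " + sgl.gen("int_res", dtype=int) + "\n"  # constrain the answer to an integer

# assumes an sglang server at this placeholder endpoint
sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000"))
state = typed_qa.run()
print(state["int_res"])
```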
sglang/test/test_utils.py
CHANGED
@@ -395,12 +395,12 @@ def popen_launch_server(
     other_args: list[str] = (),
     env: Optional[dict] = None,
     return_stdout_stderr: Optional[tuple] = None,
-    pd_seperated: bool = False,
+    pd_separated: bool = False,
 ):
     _, host, port = base_url.split(":")
     host = host[2:]
 
-    if pd_seperated:
+    if pd_separated:
         command = "sglang.launch_pd_server"
     else:
         command = "sglang.launch_server"
@@ -414,7 +414,7 @@ def popen_launch_server(
         *[str(x) for x in other_args],
     ]
 
-    if pd_seperated:
+    if pd_separated:
         command.extend(
             [
                 "--lb-host",
@@ -449,9 +449,9 @@ def popen_launch_server(
     else:
         process = subprocess.Popen(command, stdout=None, stderr=None, env=env)
 
-    start_time = time.time()
+    start_time = time.perf_counter()
     with requests.Session() as session:
-        while time.time() - start_time < timeout:
+        while time.perf_counter() - start_time < timeout:
             try:
                 headers = {
                     "Content-Type": "application/json; charset=utf-8",
@@ -478,6 +478,81 @@ def popen_launch_server(
     raise TimeoutError("Server failed to start within the timeout period.")
 
 
+def popen_launch_pd_server(
+    model: str,
+    base_url: str,
+    timeout: float,
+    api_key: Optional[str] = None,
+    other_args: list[str] = (),
+    env: Optional[dict] = None,
+    return_stdout_stderr: Optional[tuple] = None,
+):
+    _, host, port = base_url.split(":")
+    host = host[2:]
+
+    command = "sglang.launch_server"
+
+    command = [
+        "python3",
+        "-m",
+        command,
+        "--model-path",
+        model,
+        *[str(x) for x in other_args],
+    ]
+
+    command.extend(
+        [
+            "--host",
+            host,
+            "--port",
+            port,
+        ]
+    )
+
+    if api_key:
+        command += ["--api-key", api_key]
+
+    print(f"command={' '.join(command)}")
+
+    if return_stdout_stderr:
+        process = subprocess.Popen(
+            command,
+            stdout=return_stdout_stderr[0],
+            stderr=return_stdout_stderr[1],
+            env=env,
+            text=True,
+        )
+    else:
+        process = subprocess.Popen(command, stdout=None, stderr=None, env=env)
+
+    start_time = time.time()
+    with requests.Session() as session:
+        while time.time() - start_time < timeout:
+            try:
+                headers = {
+                    "Content-Type": "application/json; charset=utf-8",
+                    "Authorization": f"Bearer {api_key}",
+                }
+                response = session.get(
+                    f"{base_url}/health",
+                    headers=headers,
+                )
+                if response.status_code == 200:
+                    return process
+            except requests.RequestException:
+                pass
+
+            return_code = process.poll()
+            if return_code is not None:
+                raise Exception(f"Server unexpectedly exits ({return_code=}).")
+
+            time.sleep(10)
+
+    kill_process_tree(process.pid)
+    raise TimeoutError("Server failed to start within the timeout period.")
+
+
 def run_with_timeout(
     func: Callable,
     args: tuple = (),
@@ -509,7 +584,7 @@ class TestFile:
 
 
 def run_unittest_files(files: List[TestFile], timeout_per_file: float):
-    tic = time.time()
+    tic = time.perf_counter()
     success = True
 
     for i, file in enumerate(files):
@@ -524,13 +599,13 @@ def run_unittest_files(files: List[TestFile], timeout_per_file: float):
             f".\n.\nBegin ({i}/{len(files) - 1}):\npython3 {filename}\n.\n.\n",
             flush=True,
         )
-        tic = time.time()
+        tic = time.perf_counter()
 
         process = subprocess.Popen(
             ["python3", filename], stdout=None, stderr=None, env=os.environ
         )
         process.wait()
-        elapsed = time.time() - tic
+        elapsed = time.perf_counter() - tic
 
         print(
             f".\n.\nEnd ({i}/{len(files) - 1}):\n{filename=}, {elapsed=:.0f}, {estimated_time=}\n.\n.\n",
@@ -556,9 +631,9 @@ def run_unittest_files(files: List[TestFile], timeout_per_file: float):
             break
 
     if success:
-        print(f"Success. Time elapsed: {time.time() - tic:.2f}s", flush=True)
+        print(f"Success. Time elapsed: {time.perf_counter() - tic:.2f}s", flush=True)
     else:
-        print(f"Fail. Time elapsed: {time.time() - tic:.2f}s", flush=True)
+        print(f"Fail. Time elapsed: {time.perf_counter() - tic:.2f}s", flush=True)
 
     return 0 if success else -1
 
@@ -581,7 +656,7 @@ def get_benchmark_args(
     disable_stream=False,
     disable_ignore_eos=False,
     seed: int = 0,
-    pd_seperated: bool = False,
+    pd_separated: bool = False,
 ):
     return SimpleNamespace(
         backend="sglang",
@@ -611,7 +686,7 @@ def get_benchmark_args(
         profile=None,
         lora_name=None,
         prompt_suffix="",
-        pd_seperated=pd_seperated,
+        pd_separated=pd_separated,
     )
 
 
@@ -675,7 +750,7 @@ def run_bench_serving_multi(
     other_server_args,
     benchmark_args,
     need_warmup=False,
-    pd_seperated=False,
+    pd_separated=False,
 ):
     # Launch the server
     process = popen_launch_server(
@@ -683,7 +758,7 @@ def run_bench_serving_multi(
         base_url,
         timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
         other_args=other_server_args,
-        pd_seperated=pd_seperated,
+        pd_separated=pd_separated,
     )
 
     # run benchmark for all
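The new `popen_launch_pd_server` reuses the same readiness loop as `popen_launch_server`: poll `/health` with the API key as a bearer token until the server answers 200, bail out if the child process dies, and time out otherwise. That loop in isolation, as a trimmed sketch with illustrative names:

```python
import time

import requests

def wait_until_healthy(base_url: str, timeout: float, api_key: str = "") -> bool:
    """Poll {base_url}/health until it returns 200 or `timeout` seconds elapse."""
    headers = {"Authorization": f"Bearer {api_key}"}
    start = time.perf_counter()
    while time.perf_counter() - start < timeout:
        try:
            resp = requests.get(f"{base_url}/health", headers=headers, timeout=5)
            if resp.status_code == 200:
                return True
        except requests.RequestException:
            pass  # server not up yet; keep polling
        time.sleep(10)
    return False
```

Note that the added helper itself still times its loop with `time.time()`, unlike `popen_launch_server`, which this release migrated to `time.perf_counter()`.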
sglang/utils.py
CHANGED
@@ -278,7 +278,7 @@ def graceful_registry(sub_module_name: str):
             f"{sub_module_name} Received signal to shutdown. Performing graceful shutdown..."
         )
         if signum == signal.SIGTERM:
-            logger.info(f"{sub_module_name} recive sigterm")
+            logger.info(f"{sub_module_name} receive sigterm")
 
     signal.signal(signal.SIGTERM, graceful_shutdown)
 
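`graceful_registry` installs a SIGTERM handler that logs before shutting down. A minimal standalone sketch of the same registration pattern; the logger name and the idle loop are illustrative:

```python
import logging
import signal
import time

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("worker")

def graceful_shutdown(signum, frame):
    # log, then exit cleanly instead of dying mid-request
    logger.info("worker received signal to shutdown; performing graceful shutdown...")
    if signum == signal.SIGTERM:
        logger.info("worker received SIGTERM")
    raise SystemExit(0)

signal.signal(signal.SIGTERM, graceful_shutdown)

# `kill -TERM <pid>` now runs the handler instead of killing the process abruptly
while True:
    time.sleep(1)
```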
sglang/version.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.4.6.post3"
+__version__ = "0.4.6.post4"
{sglang-0.4.6.post3.dist-info → sglang-0.4.6.post4.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sglang
-Version: 0.4.6.post3
+Version: 0.4.6.post4
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                        Version 2.0, January 2004
@@ -247,7 +247,7 @@ Requires-Dist: xgrammar==0.1.19; extra == "runtime-common"
 Requires-Dist: blobfile==3.0.0; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
-Requires-Dist: sgl-kernel==0.1.
+Requires-Dist: sgl-kernel==0.1.2.post1; extra == "srt"
 Requires-Dist: flashinfer_python==0.2.5; extra == "srt"
 Requires-Dist: torch==2.6.0; extra == "srt"
 Requires-Dist: torchvision==0.21.0; extra == "srt"
@@ -301,6 +301,7 @@ Requires-Dist: sglang[srt]; extra == "all"
 Requires-Dist: sglang[openai]; extra == "all"
 Requires-Dist: sglang[anthropic]; extra == "all"
 Requires-Dist: sglang[litellm]; extra == "all"
+Requires-Dist: sglang[torch_memory_saver]; extra == "all"
 Provides-Extra: all-hip
 Requires-Dist: sglang[srt_hip]; extra == "all-hip"
 Requires-Dist: sglang[openai]; extra == "all-hip"
@@ -368,16 +369,16 @@ Dynamic: license-file
 - [2025/05] 🔥 Deploying DeepSeek with PD Disaggregation and Large-scale Expert Parallelism on 96 H100 GPUs ([blog](https://lmsys.org/blog/2025-05-05-large-scale-ep/)).
 - [2025/03] Supercharge DeepSeek-R1 Inference on AMD Instinct MI300X ([AMD blog](https://rocm.blogs.amd.com/artificial-intelligence/DeepSeekR1-Part2/README.html))
 - [2025/03] SGLang Joins PyTorch Ecosystem: Efficient LLM Serving Engine ([PyTorch blog](https://pytorch.org/blog/sglang-joins-pytorch/))
-- [2025/02] Unlock DeepSeek-R1 Inference Performance on AMD Instinct™ MI300X GPU ([AMD blog](https://rocm.blogs.amd.com/artificial-intelligence/DeepSeekR1_Perf/README.html))
 - [2025/01] 🔥 SGLang provides day one support for DeepSeek V3/R1 models on NVIDIA and AMD GPUs with DeepSeek-specific optimizations. ([instructions](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3), [AMD blog](https://www.amd.com/en/developer/resources/technical-articles/amd-instinct-gpus-power-deepseek-v3-revolutionizing-ai-development-with-sglang.html), [10+ other companies](https://x.com/lmsysorg/status/1887262321636221412))
 - [2024/12] 🔥 v0.4 Release: Zero-Overhead Batch Scheduler, Cache-Aware Load Balancer, Faster Structured Outputs ([blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)).
-- [2024/09] v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
 - [2024/07] v0.2 Release: Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
 
 <details>
 <summary>More</summary>
 
+- [2025/02] Unlock DeepSeek-R1 Inference Performance on AMD Instinct™ MI300X GPU ([AMD blog](https://rocm.blogs.amd.com/artificial-intelligence/DeepSeekR1_Perf/README.html))
 - [2024/10] The First SGLang Online Meetup ([slides](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#the-first-sglang-online-meetup)).
+- [2024/09] v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
 - [2024/02] SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
 - [2024/01] SGLang provides up to **5x faster inference** with RadixAttention ([blog](https://lmsys.org/blog/2024-01-17-sglang/)).
 - [2024/01] SGLang powers the serving of the official **LLaVA v1.6** release demo ([usage](https://github.com/haotian-liu/LLaVA?tab=readme-ov-file#demo)).
@@ -409,7 +410,7 @@ Learn more in the release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s
 
 ## Adoption and Sponsorship
 The project has been deployed to large-scale production, generating trillions of tokens every day.
-It is supported by the following institutions: AMD, Atlas Cloud, Baseten, Cursor, DataCrunch, Etched, Google Cloud, Hyperbolic, Iflytek, Jam & Tea Studios, LinkedIn, LMSYS, Meituan, Nebius, Novita AI, NVIDIA, Oracle, RunPod, Stanford, UC Berkeley, UCLA, xAI, and 01.AI.
+It is supported by the following institutions: AMD, Atlas Cloud, Baseten, Cursor, DataCrunch, Etched, Google Cloud, Hyperbolic, Iflytek, InnoMatrix, Jam & Tea Studios, LinkedIn, LMSYS, Meituan, Nebius, Novita AI, NVIDIA, Oracle, RunPod, Stanford, UC Berkeley, UCLA, xAI, and 01.AI.
 
 <img src="https://raw.githubusercontent.com/sgl-project/sgl-learning-materials/main/slides/adoption.png" alt="logo" width="800" margin="10px"></img>