sglang 0.4.6.post2__py3-none-any.whl → 0.4.6.post4__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects each version exactly as it appears in its public registry.
- sglang/bench_offline_throughput.py +4 -2
- sglang/bench_one_batch.py +3 -13
- sglang/bench_one_batch_server.py +143 -15
- sglang/bench_serving.py +158 -8
- sglang/compile_deep_gemm.py +1 -1
- sglang/eval/loogle_eval.py +157 -0
- sglang/lang/chat_template.py +119 -75
- sglang/lang/tracer.py +1 -1
- sglang/srt/code_completion_parser.py +1 -1
- sglang/srt/configs/deepseekvl2.py +5 -2
- sglang/srt/configs/device_config.py +1 -1
- sglang/srt/configs/internvl.py +696 -0
- sglang/srt/configs/janus_pro.py +3 -0
- sglang/srt/configs/model_config.py +18 -0
- sglang/srt/constrained/base_grammar_backend.py +55 -72
- sglang/srt/constrained/llguidance_backend.py +25 -21
- sglang/srt/constrained/outlines_backend.py +27 -26
- sglang/srt/constrained/reasoner_grammar_backend.py +22 -33
- sglang/srt/constrained/xgrammar_backend.py +71 -53
- sglang/srt/conversation.py +78 -46
- sglang/srt/disaggregation/base/conn.py +1 -0
- sglang/srt/disaggregation/decode.py +11 -3
- sglang/srt/disaggregation/fake/conn.py +1 -1
- sglang/srt/disaggregation/mini_lb.py +74 -23
- sglang/srt/disaggregation/mooncake/conn.py +236 -138
- sglang/srt/disaggregation/nixl/conn.py +242 -71
- sglang/srt/disaggregation/prefill.py +7 -4
- sglang/srt/disaggregation/utils.py +51 -2
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +1 -8
- sglang/srt/distributed/device_communicators/npu_communicator.py +39 -0
- sglang/srt/distributed/device_communicators/pynccl.py +2 -1
- sglang/srt/distributed/device_communicators/shm_broadcast.py +2 -1
- sglang/srt/distributed/parallel_state.py +22 -1
- sglang/srt/entrypoints/engine.py +31 -4
- sglang/srt/entrypoints/http_server.py +45 -3
- sglang/srt/entrypoints/verl_engine.py +3 -2
- sglang/srt/function_call_parser.py +2 -2
- sglang/srt/hf_transformers_utils.py +20 -1
- sglang/srt/layers/attention/flashattention_backend.py +147 -51
- sglang/srt/layers/attention/flashinfer_backend.py +23 -13
- sglang/srt/layers/attention/flashinfer_mla_backend.py +62 -15
- sglang/srt/layers/attention/merge_state.py +46 -0
- sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +1 -1
- sglang/srt/layers/attention/triton_ops/merge_state.py +96 -0
- sglang/srt/layers/attention/utils.py +4 -2
- sglang/srt/layers/attention/vision.py +290 -163
- sglang/srt/layers/dp_attention.py +71 -21
- sglang/srt/layers/layernorm.py +1 -1
- sglang/srt/layers/logits_processor.py +46 -11
- sglang/srt/layers/moe/ep_moe/kernels.py +343 -8
- sglang/srt/layers/moe/ep_moe/layer.py +121 -2
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +97 -54
- sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
- sglang/srt/layers/moe/topk.py +1 -1
- sglang/srt/layers/quantization/__init__.py +1 -1
- sglang/srt/layers/quantization/blockwise_int8.py +2 -2
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +2 -4
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +2 -1
- sglang/srt/layers/quantization/deep_gemm.py +77 -71
- sglang/srt/layers/quantization/fp8.py +110 -97
- sglang/srt/layers/quantization/fp8_kernel.py +81 -62
- sglang/srt/layers/quantization/fp8_utils.py +71 -23
- sglang/srt/layers/quantization/int8_kernel.py +2 -2
- sglang/srt/layers/quantization/kv_cache.py +3 -10
- sglang/srt/layers/quantization/utils.py +0 -5
- sglang/srt/layers/quantization/w8a8_fp8.py +8 -10
- sglang/srt/layers/sampler.py +0 -4
- sglang/srt/layers/vocab_parallel_embedding.py +18 -7
- sglang/srt/lora/lora_manager.py +11 -14
- sglang/srt/lora/mem_pool.py +4 -4
- sglang/srt/lora/triton_ops/gate_up_lora_b.py +1 -1
- sglang/srt/lora/triton_ops/qkv_lora_b.py +1 -1
- sglang/srt/lora/triton_ops/sgemm_lora_a.py +1 -1
- sglang/srt/lora/triton_ops/sgemm_lora_b.py +1 -1
- sglang/srt/lora/utils.py +1 -1
- sglang/srt/managers/cache_controller.py +115 -119
- sglang/srt/managers/data_parallel_controller.py +3 -3
- sglang/srt/managers/detokenizer_manager.py +21 -8
- sglang/srt/managers/io_struct.py +13 -1
- sglang/srt/managers/mm_utils.py +1 -1
- sglang/srt/managers/multimodal_processors/base_processor.py +5 -0
- sglang/srt/managers/multimodal_processors/internvl.py +232 -0
- sglang/srt/managers/multimodal_processors/llava.py +46 -0
- sglang/srt/managers/multimodal_processors/pixtral.py +127 -0
- sglang/srt/managers/schedule_batch.py +93 -23
- sglang/srt/managers/schedule_policy.py +11 -8
- sglang/srt/managers/scheduler.py +140 -100
- sglang/srt/managers/scheduler_output_processor_mixin.py +124 -55
- sglang/srt/managers/tokenizer_manager.py +157 -47
- sglang/srt/managers/tp_worker.py +21 -21
- sglang/srt/managers/tp_worker_overlap_thread.py +22 -11
- sglang/srt/mem_cache/chunk_cache.py +2 -0
- sglang/srt/mem_cache/memory_pool.py +4 -2
- sglang/srt/metrics/collector.py +312 -37
- sglang/srt/model_executor/cuda_graph_runner.py +10 -11
- sglang/srt/model_executor/forward_batch_info.py +1 -1
- sglang/srt/model_executor/model_runner.py +57 -41
- sglang/srt/model_loader/loader.py +18 -11
- sglang/srt/models/clip.py +4 -4
- sglang/srt/models/deepseek_janus_pro.py +3 -3
- sglang/srt/models/deepseek_nextn.py +1 -20
- sglang/srt/models/deepseek_v2.py +77 -39
- sglang/srt/models/gemma3_mm.py +1 -1
- sglang/srt/models/internlm2.py +3 -0
- sglang/srt/models/internvl.py +670 -0
- sglang/srt/models/llama.py +3 -1
- sglang/srt/models/llama4.py +58 -13
- sglang/srt/models/llava.py +248 -5
- sglang/srt/models/minicpmv.py +1 -1
- sglang/srt/models/mixtral.py +98 -34
- sglang/srt/models/mllama.py +1 -1
- sglang/srt/models/phi3_small.py +16 -2
- sglang/srt/models/pixtral.py +467 -0
- sglang/srt/models/qwen2_5_vl.py +8 -4
- sglang/srt/models/qwen2_vl.py +4 -4
- sglang/srt/models/roberta.py +1 -1
- sglang/srt/models/torch_native_llama.py +1 -1
- sglang/srt/models/xiaomi_mimo.py +171 -0
- sglang/srt/openai_api/adapter.py +52 -42
- sglang/srt/openai_api/protocol.py +20 -16
- sglang/srt/reasoning_parser.py +1 -1
- sglang/srt/sampling/custom_logit_processor.py +18 -3
- sglang/srt/sampling/sampling_batch_info.py +2 -2
- sglang/srt/sampling/sampling_params.py +2 -0
- sglang/srt/server_args.py +64 -10
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +3 -3
- sglang/srt/speculative/eagle_utils.py +7 -7
- sglang/srt/speculative/eagle_worker.py +22 -19
- sglang/srt/utils.py +41 -6
- sglang/test/few_shot_gsm8k.py +2 -2
- sglang/test/few_shot_gsm8k_engine.py +2 -2
- sglang/test/run_eval.py +2 -2
- sglang/test/runners.py +8 -1
- sglang/test/send_one.py +13 -3
- sglang/test/simple_eval_common.py +1 -1
- sglang/test/simple_eval_humaneval.py +1 -1
- sglang/test/test_block_fp8.py +2 -2
- sglang/test/test_deepep_utils.py +219 -0
- sglang/test/test_programs.py +5 -5
- sglang/test/test_utils.py +92 -15
- sglang/utils.py +1 -1
- sglang/version.py +1 -1
- {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post4.dist-info}/METADATA +18 -9
- {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post4.dist-info}/RECORD +150 -137
- {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post4.dist-info}/WHEEL +1 -1
- /sglang/{llama3_eval.py → eval/llama3_eval.py} +0 -0
- {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post4.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post4.dist-info}/top_level.txt +0 -0
sglang/test/test_programs.py
CHANGED

@@ -370,7 +370,7 @@ def test_dtype_gen():
     @sgl.function
     def dtype_gen(s):
         s += "Q: What is the full name of DNS?\n"
-        s += "A: The full
+        s += "A: The full names is " + sgl.gen("str_res", dtype=str, stop="\n") + "\n"
         s += "Q: Which year was DNS invented?\n"
         s += "A: " + sgl.gen("int_res", dtype=int) + "\n"
         s += "Q: What is the value of pi?\n"
@@ -503,7 +503,7 @@ def test_hellaswag_select():
     #####################################

     # Run requests
-    tic = time.time()
+    tic = time.perf_counter()
     rets = few_shot_hellaswag.run_batch(
         arguments,
         temperature=0,
@@ -514,13 +514,13 @@ def test_hellaswag_select():
     preds = []
     for i, ret in enumerate(rets):
         preds.append(choices[i].index(ret["answer"]))
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic

     # Compute accuracy
     accuracy = np.mean(np.array(preds) == np.array(labels))

     # Test generator style of run_batch
-    tic = time.time()
+    tic = time.perf_counter()
     rets = few_shot_hellaswag.run_batch(
         arguments,
         temperature=0,
@@ -531,7 +531,7 @@ def test_hellaswag_select():
     preds_gen = []
     for i, ret in enumerate(rets):
         preds_gen.append(choices[i].index(ret["answer"]))
-    latency_gen = time.time() - tic
+    latency_gen = time.perf_counter() - tic

     # Compute accuracy
     accuracy_gen = np.mean(np.array(preds_gen) == np.array(labels))
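The hunks above replace `time.time()` with `time.perf_counter()` for latency measurement. A minimal, self-contained sketch of the rationale (illustrative only, not part of the diff): `perf_counter()` is a monotonic, high-resolution clock, so elapsed-time math is unaffected by wall-clock adjustments such as NTP corrections.

```python
import time

# Illustrative comparison (not from the diff): both calls measure elapsed
# time, but perf_counter() is monotonic and high-resolution, while time()
# follows the wall clock and can jump if the system clock is adjusted.
tic = time.perf_counter()
time.sleep(0.1)  # stand-in for the benchmarked work, e.g. run_batch(...)
latency = time.perf_counter() - tic
print(f"elapsed: {latency:.4f}s")
```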
sglang/test/test_utils.py
CHANGED

@@ -66,6 +66,7 @@ DEFAULT_MODEL_NAME_FOR_TEST_LOCAL_ATTENTION = (
 )
 DEFAULT_SMALL_EMBEDDING_MODEL_NAME_FOR_TEST = "Alibaba-NLP/gte-Qwen2-1.5B-instruct"
 DEFAULT_REASONING_MODEL_NAME_FOR_TEST = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
+DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST = "deepseek-ai/DeepSeek-V3-0324"
 DEFAULT_AWQ_MOE_MODEL_NAME_FOR_TEST = (
     "hugging-quants/Mixtral-8x7B-Instruct-v0.1-AWQ-INT4"
 )
@@ -78,7 +79,8 @@ DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1 = "neuralmagic/Meta-Llama-3.1-8B-Ins
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2 = "neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8,neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8,neuralmagic/Qwen2-72B-Instruct-FP8,neuralmagic/Qwen2-57B-A14B-Instruct-FP8,neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8"
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_QUANT_TP1 = "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4,hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4,hugging-quants/Mixtral-8x7B-Instruct-v0.1-AWQ-INT4"
 DEFAULT_SMALL_MODEL_NAME_FOR_TEST_QWEN = "Qwen/Qwen2.5-1.5B-Instruct"
-
+DEFAULT_SMALL_VLM_MODEL_NAME_FOR_TEST = "Qwen/Qwen2.5-VL-3B-Instruct"
+DEFAULT_VLM_CHAT_TEMPLATE_FOR_TEST = "qwen2-vl"

 DEFAULT_IMAGE_URL = "https://github.com/sgl-project/sglang/blob/main/test/lang/example_image.png?raw=true"
 DEFAULT_VIDEO_URL = "https://raw.githubusercontent.com/EvolvingLMMs-Lab/sglang/dev/onevision_local/assets/jobs.mp4"
@@ -393,12 +395,12 @@ def popen_launch_server(
     other_args: list[str] = (),
     env: Optional[dict] = None,
     return_stdout_stderr: Optional[tuple] = None,
-    pd_seperated: bool = False,
+    pd_separated: bool = False,
 ):
     _, host, port = base_url.split(":")
     host = host[2:]

-    if pd_seperated:
+    if pd_separated:
         command = "sglang.launch_pd_server"
     else:
         command = "sglang.launch_server"
@@ -412,7 +414,7 @@ def popen_launch_server(
         *[str(x) for x in other_args],
     ]

-    if pd_seperated:
+    if pd_separated:
         command.extend(
             [
                 "--lb-host",
@@ -447,9 +449,9 @@ def popen_launch_server(
     else:
         process = subprocess.Popen(command, stdout=None, stderr=None, env=env)

-    start_time = time.time()
+    start_time = time.perf_counter()
     with requests.Session() as session:
-        while time.time() - start_time < timeout:
+        while time.perf_counter() - start_time < timeout:
             try:
                 headers = {
                     "Content-Type": "application/json; charset=utf-8",
@@ -476,6 +478,81 @@ def popen_launch_server(
     raise TimeoutError("Server failed to start within the timeout period.")


+def popen_launch_pd_server(
+    model: str,
+    base_url: str,
+    timeout: float,
+    api_key: Optional[str] = None,
+    other_args: list[str] = (),
+    env: Optional[dict] = None,
+    return_stdout_stderr: Optional[tuple] = None,
+):
+    _, host, port = base_url.split(":")
+    host = host[2:]
+
+    command = "sglang.launch_server"
+
+    command = [
+        "python3",
+        "-m",
+        command,
+        "--model-path",
+        model,
+        *[str(x) for x in other_args],
+    ]
+
+    command.extend(
+        [
+            "--host",
+            host,
+            "--port",
+            port,
+        ]
+    )
+
+    if api_key:
+        command += ["--api-key", api_key]
+
+    print(f"command={' '.join(command)}")
+
+    if return_stdout_stderr:
+        process = subprocess.Popen(
+            command,
+            stdout=return_stdout_stderr[0],
+            stderr=return_stdout_stderr[1],
+            env=env,
+            text=True,
+        )
+    else:
+        process = subprocess.Popen(command, stdout=None, stderr=None, env=env)
+
+    start_time = time.time()
+    with requests.Session() as session:
+        while time.time() - start_time < timeout:
+            try:
+                headers = {
+                    "Content-Type": "application/json; charset=utf-8",
+                    "Authorization": f"Bearer {api_key}",
+                }
+                response = session.get(
+                    f"{base_url}/health",
+                    headers=headers,
+                )
+                if response.status_code == 200:
+                    return process
+            except requests.RequestException:
+                pass
+
+            return_code = process.poll()
+            if return_code is not None:
+                raise Exception(f"Server unexpectedly exits ({return_code=}).")
+
+            time.sleep(10)
+
+    kill_process_tree(process.pid)
+    raise TimeoutError("Server failed to start within the timeout period.")
+
+
 def run_with_timeout(
     func: Callable,
     args: tuple = (),
@@ -507,7 +584,7 @@ class TestFile:


 def run_unittest_files(files: List[TestFile], timeout_per_file: float):
-    tic = time.time()
+    tic = time.perf_counter()
     success = True

     for i, file in enumerate(files):
@@ -522,13 +599,13 @@ def run_unittest_files(files: List[TestFile], timeout_per_file: float):
             f".\n.\nBegin ({i}/{len(files) - 1}):\npython3 {filename}\n.\n.\n",
             flush=True,
         )
-        tic = time.time()
+        tic = time.perf_counter()

         process = subprocess.Popen(
             ["python3", filename], stdout=None, stderr=None, env=os.environ
         )
         process.wait()
-        elapsed = time.time() - tic
+        elapsed = time.perf_counter() - tic

         print(
             f".\n.\nEnd ({i}/{len(files) - 1}):\n{filename=}, {elapsed=:.0f}, {estimated_time=}\n.\n.\n",
@@ -554,9 +631,9 @@ def run_unittest_files(files: List[TestFile], timeout_per_file: float):
             break

     if success:
-        print(f"Success. Time elapsed: {time.time() - tic:.2f}s", flush=True)
+        print(f"Success. Time elapsed: {time.perf_counter() - tic:.2f}s", flush=True)
     else:
-        print(f"Fail. Time elapsed: {time.time() - tic:.2f}s", flush=True)
+        print(f"Fail. Time elapsed: {time.perf_counter() - tic:.2f}s", flush=True)

     return 0 if success else -1

@@ -579,7 +656,7 @@ def get_benchmark_args(
     disable_stream=False,
     disable_ignore_eos=False,
     seed: int = 0,
-    pd_seperated: bool = False,
+    pd_separated: bool = False,
 ):
     return SimpleNamespace(
         backend="sglang",
@@ -609,7 +686,7 @@ def get_benchmark_args(
         profile=None,
         lora_name=None,
         prompt_suffix="",
-        pd_seperated=pd_seperated,
+        pd_separated=pd_separated,
     )


@@ -673,7 +750,7 @@ def run_bench_serving_multi(
     other_server_args,
     benchmark_args,
     need_warmup=False,
-    pd_seperated=False,
+    pd_separated=False,
 ):
     # Launch the server
     process = popen_launch_server(
@@ -681,7 +758,7 @@ def run_bench_serving_multi(
         base_url,
         timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
         other_args=other_server_args,
-        pd_seperated=pd_seperated,
+        pd_separated=pd_separated,
     )

     # run benchmark for all
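A usage sketch for the renamed pd_separated flag (hedged: the model path and URL below are placeholders, and the call shape is inferred from the hunks above, mirroring run_bench_serving_multi). With pd_separated=True, popen_launch_server runs python3 -m sglang.launch_pd_server instead of sglang.launch_server, extends the command with load-balancer flags such as --lb-host, and then polls /health until the server answers or the timeout expires.

```python
from sglang.test.test_utils import (
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
    popen_launch_server,
)

# Hypothetical example; the model path and base URL are placeholders.
process = popen_launch_server(
    "meta-llama/Llama-3.1-8B-Instruct",  # placeholder model path
    "http://127.0.0.1:30000",            # placeholder base URL
    timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
    pd_separated=True,  # launches sglang.launch_pd_server under the hood
)
try:
    ...  # run benchmarks against the prefill/decode-separated deployment
finally:
    process.terminate()  # clean up the launched server process
```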
sglang/utils.py
CHANGED

@@ -278,7 +278,7 @@ def graceful_registry(sub_module_name: str):
             f"{sub_module_name} Received signal to shutdown. Performing graceful shutdown..."
         )
         if signum == signal.SIGTERM:
-            logger.info(f"{sub_module_name} recive sigterm")
+            logger.info(f"{sub_module_name} receive sigterm")

     signal.signal(signal.SIGTERM, graceful_shutdown)
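Assembled into runnable form for context (the imports and logger setup are assumed, not shown in the hunk), the handler that graceful_registry registers looks like this:

```python
import logging
import signal

logger = logging.getLogger(__name__)


def graceful_registry(sub_module_name: str):
    # Register a SIGTERM handler so the subprocess logs and shuts down cleanly.
    def graceful_shutdown(signum, frame):
        logger.info(
            f"{sub_module_name} Received signal to shutdown. "
            "Performing graceful shutdown..."
        )
        if signum == signal.SIGTERM:
            logger.info(f"{sub_module_name} receive sigterm")

    signal.signal(signal.SIGTERM, graceful_shutdown)
```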
sglang/version.py
CHANGED

@@ -1 +1 @@
-__version__ = "0.4.6.post2"
+__version__ = "0.4.6.post4"
{sglang-0.4.6.post2.dist-info → sglang-0.4.6.post4.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sglang
-Version: 0.4.6.post2
+Version: 0.4.6.post4
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                                  Version 2.0, January 2004
@@ -230,6 +230,7 @@ Requires-Dist: modelscope; extra == "runtime-common"
 Requires-Dist: ninja; extra == "runtime-common"
 Requires-Dist: orjson; extra == "runtime-common"
 Requires-Dist: packaging; extra == "runtime-common"
+Requires-Dist: partial_json_parser; extra == "runtime-common"
 Requires-Dist: pillow; extra == "runtime-common"
 Requires-Dist: prometheus-client>=0.20.0; extra == "runtime-common"
 Requires-Dist: psutil; extra == "runtime-common"
@@ -242,17 +243,16 @@ Requires-Dist: torchao>=0.9.0; extra == "runtime-common"
 Requires-Dist: transformers==4.51.1; extra == "runtime-common"
 Requires-Dist: uvicorn; extra == "runtime-common"
 Requires-Dist: uvloop; extra == "runtime-common"
-Requires-Dist: xgrammar==0.1.
+Requires-Dist: xgrammar==0.1.19; extra == "runtime-common"
 Requires-Dist: blobfile==3.0.0; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
-Requires-Dist: sgl-kernel==0.1.
+Requires-Dist: sgl-kernel==0.1.2.post1; extra == "srt"
 Requires-Dist: flashinfer_python==0.2.5; extra == "srt"
 Requires-Dist: torch==2.6.0; extra == "srt"
 Requires-Dist: torchvision==0.21.0; extra == "srt"
 Requires-Dist: cuda-python; extra == "srt"
 Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "srt"
-Requires-Dist: partial_json_parser; extra == "srt"
 Requires-Dist: einops; extra == "srt"
 Provides-Extra: blackwell
 Requires-Dist: sglang[runtime_common]; extra == "blackwell"
@@ -261,7 +261,6 @@ Requires-Dist: torch; extra == "blackwell"
 Requires-Dist: torchvision; extra == "blackwell"
 Requires-Dist: cuda-python; extra == "blackwell"
 Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "blackwell"
-Requires-Dist: partial_json_parser; extra == "blackwell"
 Requires-Dist: einops; extra == "blackwell"
 Provides-Extra: srt-hip
 Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
@@ -278,6 +277,9 @@ Provides-Extra: srt-cpu
 Requires-Dist: sglang[runtime_common]; extra == "srt-cpu"
 Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "srt-cpu"
 Requires-Dist: torch; extra == "srt-cpu"
+Provides-Extra: srt-npu
+Requires-Dist: sglang[runtime_common]; extra == "srt-npu"
+Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "srt-npu"
 Provides-Extra: openai
 Requires-Dist: openai>=1.0; extra == "openai"
 Requires-Dist: tiktoken; extra == "openai"
@@ -299,6 +301,7 @@ Requires-Dist: sglang[srt]; extra == "all"
 Requires-Dist: sglang[openai]; extra == "all"
 Requires-Dist: sglang[anthropic]; extra == "all"
 Requires-Dist: sglang[litellm]; extra == "all"
+Requires-Dist: sglang[torch_memory_saver]; extra == "all"
 Provides-Extra: all-hip
 Requires-Dist: sglang[srt_hip]; extra == "all-hip"
 Requires-Dist: sglang[openai]; extra == "all-hip"
@@ -319,6 +322,11 @@ Requires-Dist: sglang[srt_cpu]; extra == "all-cpu"
 Requires-Dist: sglang[openai]; extra == "all-cpu"
 Requires-Dist: sglang[anthropic]; extra == "all-cpu"
 Requires-Dist: sglang[litellm]; extra == "all-cpu"
+Provides-Extra: all-npu
+Requires-Dist: sglang[srt_npu]; extra == "all-npu"
+Requires-Dist: sglang[openai]; extra == "all-npu"
+Requires-Dist: sglang[anthropic]; extra == "all-npu"
+Requires-Dist: sglang[litellm]; extra == "all-npu"
 Provides-Extra: dev
 Requires-Dist: sglang[all]; extra == "dev"
 Requires-Dist: sglang[test]; extra == "dev"
@@ -358,18 +366,19 @@ Dynamic: license-file
 | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |

 ## News
+- [2025/05] 🔥 Deploying DeepSeek with PD Disaggregation and Large-scale Expert Parallelism on 96 H100 GPUs ([blog](https://lmsys.org/blog/2025-05-05-large-scale-ep/)).
 - [2025/03] Supercharge DeepSeek-R1 Inference on AMD Instinct MI300X ([AMD blog](https://rocm.blogs.amd.com/artificial-intelligence/DeepSeekR1-Part2/README.html))
 - [2025/03] SGLang Joins PyTorch Ecosystem: Efficient LLM Serving Engine ([PyTorch blog](https://pytorch.org/blog/sglang-joins-pytorch/))
-- [2025/02] Unlock DeepSeek-R1 Inference Performance on AMD Instinct™ MI300X GPU ([AMD blog](https://rocm.blogs.amd.com/artificial-intelligence/DeepSeekR1_Perf/README.html))
 - [2025/01] 🔥 SGLang provides day one support for DeepSeek V3/R1 models on NVIDIA and AMD GPUs with DeepSeek-specific optimizations. ([instructions](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3), [AMD blog](https://www.amd.com/en/developer/resources/technical-articles/amd-instinct-gpus-power-deepseek-v3-revolutionizing-ai-development-with-sglang.html), [10+ other companies](https://x.com/lmsysorg/status/1887262321636221412))
 - [2024/12] 🔥 v0.4 Release: Zero-Overhead Batch Scheduler, Cache-Aware Load Balancer, Faster Structured Outputs ([blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)).
-- [2024/09] v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
 - [2024/07] v0.2 Release: Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).

 <details>
 <summary>More</summary>

+- [2025/02] Unlock DeepSeek-R1 Inference Performance on AMD Instinct™ MI300X GPU ([AMD blog](https://rocm.blogs.amd.com/artificial-intelligence/DeepSeekR1_Perf/README.html))
 - [2024/10] The First SGLang Online Meetup ([slides](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#the-first-sglang-online-meetup)).
+- [2024/09] v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
 - [2024/02] SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
 - [2024/01] SGLang provides up to **5x faster inference** with RadixAttention ([blog](https://lmsys.org/blog/2024-01-17-sglang/)).
 - [2024/01] SGLang powers the serving of the official **LLaVA v1.6** release demo ([usage](https://github.com/haotian-liu/LLaVA?tab=readme-ov-file#demo)).
@@ -383,7 +392,7 @@ The core features include:

 - **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, zero-overhead CPU scheduler, continuous batching, token attention (paged attention), speculative decoding, tensor parallelism, chunked prefill, structured outputs, quantization (FP8/INT4/AWQ/GPTQ), and multi-lora batching.
 - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
-- **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral,
+- **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, Qwen, DeepSeek, LLaVA, etc.), embedding models (e5-mistral, gte, mcdse) and reward models (Skywork), with easy extensibility for integrating new models.
 - **Active Community**: SGLang is open-source and backed by an active community with industry adoption.

 ## Getting Started
@@ -401,7 +410,7 @@ Learn more in the release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s

 ## Adoption and Sponsorship
 The project has been deployed to large-scale production, generating trillions of tokens every day.
-It is supported by the following institutions: AMD, Atlas Cloud, Baseten, Cursor, DataCrunch, Etched, Hyperbolic, Iflytek, Jam & Tea Studios, LinkedIn, LMSYS, Meituan, Nebius, Novita AI, NVIDIA, Oracle, RunPod, Stanford, UC Berkeley, UCLA, xAI, and 01.AI.
+It is supported by the following institutions: AMD, Atlas Cloud, Baseten, Cursor, DataCrunch, Etched, Google Cloud, Hyperbolic, Iflytek, InnoMatrix, Jam & Tea Studios, LinkedIn, LMSYS, Meituan, Nebius, Novita AI, NVIDIA, Oracle, RunPod, Stanford, UC Berkeley, UCLA, xAI, and 01.AI.

 <img src="https://raw.githubusercontent.com/sgl-project/sgl-learning-materials/main/slides/adoption.png" alt="logo" width="800" margin="10px"></img>
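To verify the dependency moves after upgrading (partial_json_parser consolidated under runtime-common, the new srt-npu and all-npu extras), a sketch using only the standard library; the exact strings depend on the installed wheel:

```python
from importlib.metadata import requires

# List the installed wheel's Requires-Dist entries and group them by extra,
# e.g. to confirm partial_json_parser now rides with runtime-common and the
# new srt-npu / all-npu extras are present.
reqs = requires("sglang") or []
for extra in ("runtime-common", "srt-npu", "all-npu"):
    matches = [r.split(";")[0].strip() for r in reqs if f'extra == "{extra}"' in r]
    print(f"{extra}: {len(matches)} requirements")
```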