sglang 0.4.8__py3-none-any.whl → 0.4.9__py3-none-any.whl
This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
- sglang/bench_one_batch_server.py +17 -2
- sglang/bench_serving.py +168 -22
- sglang/srt/configs/internvl.py +4 -2
- sglang/srt/configs/janus_pro.py +1 -1
- sglang/srt/configs/model_config.py +49 -0
- sglang/srt/configs/update_config.py +119 -0
- sglang/srt/conversation.py +35 -0
- sglang/srt/custom_op.py +7 -1
- sglang/srt/disaggregation/base/conn.py +2 -0
- sglang/srt/disaggregation/decode.py +22 -6
- sglang/srt/disaggregation/mooncake/conn.py +289 -48
- sglang/srt/disaggregation/mooncake/transfer_engine.py +31 -1
- sglang/srt/disaggregation/nixl/conn.py +100 -52
- sglang/srt/disaggregation/prefill.py +5 -4
- sglang/srt/disaggregation/utils.py +13 -12
- sglang/srt/distributed/parallel_state.py +44 -17
- sglang/srt/entrypoints/EngineBase.py +8 -0
- sglang/srt/entrypoints/engine.py +45 -9
- sglang/srt/entrypoints/http_server.py +111 -24
- sglang/srt/entrypoints/openai/protocol.py +51 -6
- sglang/srt/entrypoints/openai/serving_chat.py +52 -76
- sglang/srt/entrypoints/openai/serving_completions.py +1 -0
- sglang/srt/entrypoints/openai/serving_embedding.py +1 -0
- sglang/srt/eplb/__init__.py +0 -0
- sglang/srt/{managers → eplb}/eplb_algorithms/__init__.py +1 -1
- sglang/srt/{managers → eplb}/eplb_manager.py +2 -4
- sglang/srt/{eplb_simulator → eplb/eplb_simulator}/reader.py +1 -1
- sglang/srt/{managers → eplb}/expert_distribution.py +18 -1
- sglang/srt/{managers → eplb}/expert_location.py +1 -1
- sglang/srt/{managers → eplb}/expert_location_dispatch.py +1 -1
- sglang/srt/{model_executor → eplb}/expert_location_updater.py +17 -1
- sglang/srt/hf_transformers_utils.py +2 -1
- sglang/srt/layers/activation.py +7 -0
- sglang/srt/layers/amx_utils.py +86 -0
- sglang/srt/layers/attention/ascend_backend.py +219 -0
- sglang/srt/layers/attention/flashattention_backend.py +56 -23
- sglang/srt/layers/attention/tbo_backend.py +37 -9
- sglang/srt/layers/communicator.py +18 -2
- sglang/srt/layers/dp_attention.py +9 -3
- sglang/srt/layers/elementwise.py +76 -12
- sglang/srt/layers/flashinfer_comm_fusion.py +202 -0
- sglang/srt/layers/layernorm.py +41 -0
- sglang/srt/layers/linear.py +99 -12
- sglang/srt/layers/logits_processor.py +15 -6
- sglang/srt/layers/moe/ep_moe/kernels.py +23 -8
- sglang/srt/layers/moe/ep_moe/layer.py +115 -25
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +42 -19
- sglang/srt/layers/moe/fused_moe_native.py +7 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +8 -4
- sglang/srt/layers/moe/fused_moe_triton/layer.py +129 -10
- sglang/srt/layers/moe/router.py +60 -22
- sglang/srt/layers/moe/topk.py +36 -28
- sglang/srt/layers/parameter.py +67 -7
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +1 -1
- sglang/srt/layers/quantization/fp8.py +44 -0
- sglang/srt/layers/quantization/fp8_kernel.py +1 -1
- sglang/srt/layers/quantization/fp8_utils.py +6 -6
- sglang/srt/layers/quantization/gptq.py +5 -1
- sglang/srt/layers/quantization/moe_wna16.py +1 -1
- sglang/srt/layers/quantization/quant_utils.py +166 -0
- sglang/srt/layers/quantization/w8a8_int8.py +52 -1
- sglang/srt/layers/rotary_embedding.py +105 -13
- sglang/srt/layers/vocab_parallel_embedding.py +19 -2
- sglang/srt/lora/lora.py +4 -5
- sglang/srt/lora/lora_manager.py +73 -20
- sglang/srt/managers/configure_logging.py +1 -1
- sglang/srt/managers/io_struct.py +60 -15
- sglang/srt/managers/mm_utils.py +73 -59
- sglang/srt/managers/multimodal_processor.py +2 -6
- sglang/srt/managers/multimodal_processors/qwen_audio.py +94 -0
- sglang/srt/managers/schedule_batch.py +80 -79
- sglang/srt/managers/scheduler.py +153 -63
- sglang/srt/managers/scheduler_output_processor_mixin.py +8 -2
- sglang/srt/managers/session_controller.py +12 -3
- sglang/srt/managers/tokenizer_manager.py +314 -103
- sglang/srt/managers/tp_worker.py +13 -1
- sglang/srt/managers/tp_worker_overlap_thread.py +8 -0
- sglang/srt/mem_cache/allocator.py +290 -0
- sglang/srt/mem_cache/chunk_cache.py +34 -2
- sglang/srt/mem_cache/memory_pool.py +289 -3
- sglang/srt/mem_cache/multimodal_cache.py +3 -0
- sglang/srt/model_executor/cuda_graph_runner.py +3 -2
- sglang/srt/model_executor/forward_batch_info.py +17 -4
- sglang/srt/model_executor/model_runner.py +302 -58
- sglang/srt/model_loader/loader.py +86 -10
- sglang/srt/model_loader/weight_utils.py +160 -3
- sglang/srt/models/deepseek_nextn.py +5 -4
- sglang/srt/models/deepseek_v2.py +305 -26
- sglang/srt/models/deepseek_vl2.py +3 -5
- sglang/srt/models/gemma3_causal.py +1 -2
- sglang/srt/models/gemma3n_audio.py +949 -0
- sglang/srt/models/gemma3n_causal.py +1010 -0
- sglang/srt/models/gemma3n_mm.py +495 -0
- sglang/srt/models/hunyuan.py +771 -0
- sglang/srt/models/kimi_vl.py +1 -2
- sglang/srt/models/llama.py +10 -4
- sglang/srt/models/llama4.py +32 -45
- sglang/srt/models/llama_eagle3.py +61 -11
- sglang/srt/models/llava.py +5 -5
- sglang/srt/models/minicpmo.py +2 -2
- sglang/srt/models/mistral.py +1 -1
- sglang/srt/models/mllama4.py +43 -11
- sglang/srt/models/phi4mm.py +1 -3
- sglang/srt/models/pixtral.py +3 -7
- sglang/srt/models/qwen2.py +31 -3
- sglang/srt/models/qwen2_5_vl.py +1 -3
- sglang/srt/models/qwen2_audio.py +200 -0
- sglang/srt/models/qwen2_moe.py +32 -6
- sglang/srt/models/qwen2_vl.py +1 -4
- sglang/srt/models/qwen3.py +94 -25
- sglang/srt/models/qwen3_moe.py +68 -21
- sglang/srt/models/vila.py +3 -8
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/base_processor.py +150 -133
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/clip.py +2 -13
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/deepseek_vl_v2.py +4 -11
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/gemma3.py +3 -10
- sglang/srt/multimodal/processors/gemma3n.py +82 -0
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/internvl.py +3 -10
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/janus_pro.py +3 -9
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/kimi_vl.py +6 -13
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/llava.py +2 -10
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/minicpm.py +5 -12
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/mlama.py +2 -14
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/mllama4.py +3 -6
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/phi4mm.py +4 -14
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/pixtral.py +3 -9
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/qwen_vl.py +8 -14
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/vila.py +13 -31
- sglang/srt/operations_strategy.py +6 -2
- sglang/srt/reasoning_parser.py +26 -0
- sglang/srt/sampling/sampling_batch_info.py +39 -1
- sglang/srt/server_args.py +85 -24
- sglang/srt/speculative/build_eagle_tree.py +57 -18
- sglang/srt/speculative/eagle_worker.py +6 -4
- sglang/srt/two_batch_overlap.py +204 -28
- sglang/srt/utils.py +369 -138
- sglang/srt/warmup.py +12 -3
- sglang/test/runners.py +10 -1
- sglang/test/test_utils.py +15 -3
- sglang/version.py +1 -1
- {sglang-0.4.8.dist-info → sglang-0.4.9.dist-info}/METADATA +9 -6
- {sglang-0.4.8.dist-info → sglang-0.4.9.dist-info}/RECORD +149 -137
- sglang/math_utils.py +0 -8
- /sglang/srt/{managers → eplb}/eplb_algorithms/deepseek.py +0 -0
- /sglang/srt/{managers → eplb}/eplb_algorithms/deepseek_vec.py +0 -0
- /sglang/srt/{eplb_simulator → eplb/eplb_simulator}/__init__.py +0 -0
- /sglang/srt/{mm_utils.py → multimodal/mm_utils.py} +0 -0
- {sglang-0.4.8.dist-info → sglang-0.4.9.dist-info}/WHEEL +0 -0
- {sglang-0.4.8.dist-info → sglang-0.4.9.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.8.dist-info → sglang-0.4.9.dist-info}/top_level.txt +0 -0
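The most sweeping structural change in 0.4.9 is the relocation of the EPLB modules and the multimodal processors out of sglang.srt.managers into dedicated sglang.srt.eplb and sglang.srt.multimodal.processors packages. A minimal sketch of the corresponding import-path update, based only on the renames listed above (which modules a downstream project actually imports will vary):

```python
# Module locations implied by the rename list above (sglang 0.4.8 -> 0.4.9).
# Old (0.4.8):
#   from sglang.srt.managers import eplb_manager, expert_distribution
#   from sglang.srt.managers.multimodal_processors import base_processor
# New (0.4.9):
from sglang.srt.eplb import eplb_manager, expert_distribution, expert_location
from sglang.srt.multimodal.processors import base_processor
```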
sglang/test/runners.py
CHANGED
@@ -503,6 +503,8 @@ class SRTRunner:
         disable_overlap_schedule: bool = False,
         disable_custom_all_reduce: bool = False,
         torchao_config: Optional[str] = None,
+        cuda_graph_max_bs: int = 4,
+        sleep_on_idle=False,
     ):
         self.model_type = model_type
         self.is_generation = model_type == "generation"
@@ -538,8 +540,9 @@ class SRTRunner:
             tokenizer_path=tokenizer_path,
             enable_ep_moe=enable_ep_moe,
             disable_overlap_schedule=disable_overlap_schedule,
-            cuda_graph_max_bs=
+            cuda_graph_max_bs=cuda_graph_max_bs,
             disable_custom_all_reduce=disable_custom_all_reduce,
+            sleep_on_idle=sleep_on_idle,
             **spec_kwargs,
         )
 
@@ -550,6 +553,12 @@ class SRTRunner:
         else:
             self.tokenizer = None
 
+    def load_lora_adapter(self, lora_name: str, lora_path: str):
+        return self.engine.load_lora_adapter(lora_name, lora_path)
+
+    def unload_lora_adapter(self, lora_name: str):
+        return self.engine.unload_lora_adapter(lora_name)
+
     def forward(
         self,
         prompts: Union[
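In sglang/test/runners.py, SRTRunner gains two constructor knobs that are forwarded to the engine (cuda_graph_max_bs and sleep_on_idle) plus thin wrappers for dynamic LoRA adapter management. A minimal usage sketch, assuming a generation model and a LoRA-enabled engine; the model name, adapter name, and adapter path are placeholders:

```python
import torch

from sglang.test.runners import SRTRunner

# Placeholder model; any base model supported by the SRT engine.
runner = SRTRunner(
    model_path="meta-llama/Llama-3.1-8B-Instruct",
    torch_dtype=torch.float16,
    model_type="generation",
    cuda_graph_max_bs=4,    # new in 0.4.9: forwarded to the engine instead of a fixed value
    sleep_on_idle=False,    # new in 0.4.9: forwarded to the engine
)

# New in 0.4.9: LoRA adapters can be loaded/unloaded at runtime via the underlying engine.
runner.load_lora_adapter(lora_name="demo_adapter", lora_path="/path/to/adapter")
outputs = runner.forward(prompts=["Hello, world"])
runner.unload_lora_adapter(lora_name="demo_adapter")
```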
sglang/test/test_utils.py
CHANGED
@@ -5,6 +5,7 @@ import copy
 import logging
 import os
 import random
+import re
 import subprocess
 import threading
 import time
@@ -840,12 +841,23 @@ def run_bench_one_batch(model, other_args):
         print(f"Output: {output}", flush=True)
         print(f"Error: {error}", flush=True)
 
-
-
+        # Return prefill_latency, decode_throughput, decode_latency
+        prefill_line = output.split("\n")[-9]
+        decode_line = output.split("\n")[-3]
+        pattern = (
+            r"latency: (?P<latency>\d+\.\d+).*?throughput:\s*(?P<throughput>\d+\.\d+)"
+        )
+        match = re.search(pattern, prefill_line)
+        if match:
+            prefill_latency = float(match.group("latency"))
+        match = re.search(pattern, decode_line)
+        if match:
+            decode_latency = float(match.group("latency"))
+            decode_throughput = float(match.group("throughput"))
     finally:
         kill_process_tree(process.pid)
 
-    return
+    return prefill_latency, decode_throughput, decode_latency
 
 
 def run_bench_offline_throughput(model, other_args):
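In sglang/test/test_utils.py, run_bench_one_batch now scrapes the prefill and decode metrics from the benchmark's stdout with a named-group regex and returns them instead of returning nothing. A standalone check of that pattern against made-up sample lines (the real bench_one_batch output format is not shown in this diff):

```python
import re

# Same pattern as in the diff above.
pattern = r"latency: (?P<latency>\d+\.\d+).*?throughput:\s*(?P<throughput>\d+\.\d+)"

# Hypothetical stand-ins for the prefill/decode summary lines in the benchmark output.
prefill_line = "Prefill. latency: 0.123 s, throughput: 8300.55 token/s"
decode_line = "Decode.  median latency: 0.010 s, median throughput: 99.00 token/s"

m = re.search(pattern, prefill_line)
assert m is not None and float(m.group("latency")) == 0.123

m = re.search(pattern, decode_line)
assert m is not None and float(m.group("throughput")) == 99.00
```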
sglang/version.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.4.8"
+__version__ = "0.4.9"
{sglang-0.4.8.dist-info → sglang-0.4.9.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sglang
-Version: 0.4.8
+Version: 0.4.9
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
          Version 2.0, January 2004
@@ -219,6 +219,7 @@ Requires-Dist: IPython
 Requires-Dist: setproctitle
 Provides-Extra: runtime-common
 Requires-Dist: blobfile==3.0.0; extra == "runtime-common"
+Requires-Dist: build; extra == "runtime-common"
 Requires-Dist: compressed-tensors; extra == "runtime-common"
 Requires-Dist: datasets; extra == "runtime-common"
 Requires-Dist: fastapi; extra == "runtime-common"
@@ -243,19 +244,20 @@ Requires-Dist: pyzmq>=25.1.2; extra == "runtime-common"
 Requires-Dist: soundfile==0.13.1; extra == "runtime-common"
 Requires-Dist: scipy; extra == "runtime-common"
 Requires-Dist: torchao==0.9.0; extra == "runtime-common"
-Requires-Dist: transformers==4.
+Requires-Dist: transformers==4.53.0; extra == "runtime-common"
+Requires-Dist: timm==1.0.16; extra == "runtime-common"
 Requires-Dist: uvicorn; extra == "runtime-common"
 Requires-Dist: uvloop; extra == "runtime-common"
 Requires-Dist: xgrammar==0.1.19; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
-Requires-Dist: sgl-kernel==0.
+Requires-Dist: sgl-kernel==0.2.4; extra == "srt"
 Requires-Dist: torch==2.7.1; extra == "srt"
 Requires-Dist: torchaudio==2.7.1; extra == "srt"
 Requires-Dist: torchvision==0.22.1; extra == "srt"
 Requires-Dist: cuda-python; extra == "srt"
 Requires-Dist: einops; extra == "srt"
-Requires-Dist: flashinfer_python==0.2.
+Requires-Dist: flashinfer_python==0.2.7.post1; extra == "srt"
 Provides-Extra: blackwell
 Requires-Dist: sglang[runtime_common]; extra == "blackwell"
 Requires-Dist: sgl-kernel; extra == "blackwell"
@@ -264,7 +266,7 @@ Requires-Dist: torchaudio==2.7.1; extra == "blackwell"
 Requires-Dist: torchvision==0.22.1; extra == "blackwell"
 Requires-Dist: cuda-python; extra == "blackwell"
 Requires-Dist: einops; extra == "blackwell"
-Requires-Dist: flashinfer_python==0.2.
+Requires-Dist: flashinfer_python==0.2.7.post1; extra == "blackwell"
 Provides-Extra: srt-hip
 Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
 Requires-Dist: torch; extra == "srt-hip"
@@ -295,7 +297,6 @@ Requires-Dist: jsonlines; extra == "test"
 Requires-Dist: matplotlib; extra == "test"
 Requires-Dist: pandas; extra == "test"
 Requires-Dist: peft; extra == "test"
-Requires-Dist: timm; extra == "test"
 Requires-Dist: sentence_transformers; extra == "test"
 Provides-Extra: all
 Requires-Dist: sglang[srt]; extra == "all"
@@ -373,6 +374,8 @@ Dynamic: license-file
 | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |
 
 ## News
+- [2025/06] 🔥 SGLang, the high-performance serving infrastructure powering trillions of tokens daily, has been awarded the third batch of the Open Source AI Grant by a16z ([a16z blog](https://a16z.com/advancing-open-source-ai-through-benchmarks-and-bold-experimentation/)).
+- [2025/06] 🔥 Deploying DeepSeek on GB200 NVL72 with PD and Large Scale EP (Part I): 2.7x Higher Decoding Throughput ([blog](https://lmsys.org/blog/2025-06-16-gb200-part-1/)).
 - [2025/05] 🔥 Deploying DeepSeek with PD Disaggregation and Large-scale Expert Parallelism on 96 H100 GPUs ([blog](https://lmsys.org/blog/2025-05-05-large-scale-ep/)).
 - [2025/03] Supercharge DeepSeek-R1 Inference on AMD Instinct MI300X ([AMD blog](https://rocm.blogs.amd.com/artificial-intelligence/DeepSeekR1-Part2/README.html))
 - [2025/03] SGLang Joins PyTorch Ecosystem: Efficient LLM Serving Engine ([PyTorch blog](https://pytorch.org/blog/sglang-joins-pytorch/))