sglang 0.4.8__py3-none-any.whl → 0.4.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (150)
  1. sglang/bench_one_batch_server.py +17 -2
  2. sglang/bench_serving.py +168 -22
  3. sglang/srt/configs/internvl.py +4 -2
  4. sglang/srt/configs/janus_pro.py +1 -1
  5. sglang/srt/configs/model_config.py +49 -0
  6. sglang/srt/configs/update_config.py +119 -0
  7. sglang/srt/conversation.py +35 -0
  8. sglang/srt/custom_op.py +7 -1
  9. sglang/srt/disaggregation/base/conn.py +2 -0
  10. sglang/srt/disaggregation/decode.py +22 -6
  11. sglang/srt/disaggregation/mooncake/conn.py +289 -48
  12. sglang/srt/disaggregation/mooncake/transfer_engine.py +31 -1
  13. sglang/srt/disaggregation/nixl/conn.py +100 -52
  14. sglang/srt/disaggregation/prefill.py +5 -4
  15. sglang/srt/disaggregation/utils.py +13 -12
  16. sglang/srt/distributed/parallel_state.py +44 -17
  17. sglang/srt/entrypoints/EngineBase.py +8 -0
  18. sglang/srt/entrypoints/engine.py +45 -9
  19. sglang/srt/entrypoints/http_server.py +111 -24
  20. sglang/srt/entrypoints/openai/protocol.py +51 -6
  21. sglang/srt/entrypoints/openai/serving_chat.py +52 -76
  22. sglang/srt/entrypoints/openai/serving_completions.py +1 -0
  23. sglang/srt/entrypoints/openai/serving_embedding.py +1 -0
  24. sglang/srt/eplb/__init__.py +0 -0
  25. sglang/srt/{managers → eplb}/eplb_algorithms/__init__.py +1 -1
  26. sglang/srt/{managers → eplb}/eplb_manager.py +2 -4
  27. sglang/srt/{eplb_simulator → eplb/eplb_simulator}/reader.py +1 -1
  28. sglang/srt/{managers → eplb}/expert_distribution.py +18 -1
  29. sglang/srt/{managers → eplb}/expert_location.py +1 -1
  30. sglang/srt/{managers → eplb}/expert_location_dispatch.py +1 -1
  31. sglang/srt/{model_executor → eplb}/expert_location_updater.py +17 -1
  32. sglang/srt/hf_transformers_utils.py +2 -1
  33. sglang/srt/layers/activation.py +7 -0
  34. sglang/srt/layers/amx_utils.py +86 -0
  35. sglang/srt/layers/attention/ascend_backend.py +219 -0
  36. sglang/srt/layers/attention/flashattention_backend.py +56 -23
  37. sglang/srt/layers/attention/tbo_backend.py +37 -9
  38. sglang/srt/layers/communicator.py +18 -2
  39. sglang/srt/layers/dp_attention.py +9 -3
  40. sglang/srt/layers/elementwise.py +76 -12
  41. sglang/srt/layers/flashinfer_comm_fusion.py +202 -0
  42. sglang/srt/layers/layernorm.py +41 -0
  43. sglang/srt/layers/linear.py +99 -12
  44. sglang/srt/layers/logits_processor.py +15 -6
  45. sglang/srt/layers/moe/ep_moe/kernels.py +23 -8
  46. sglang/srt/layers/moe/ep_moe/layer.py +115 -25
  47. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +42 -19
  48. sglang/srt/layers/moe/fused_moe_native.py +7 -0
  49. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +8 -4
  50. sglang/srt/layers/moe/fused_moe_triton/layer.py +129 -10
  51. sglang/srt/layers/moe/router.py +60 -22
  52. sglang/srt/layers/moe/topk.py +36 -28
  53. sglang/srt/layers/parameter.py +67 -7
  54. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +1 -1
  55. sglang/srt/layers/quantization/fp8.py +44 -0
  56. sglang/srt/layers/quantization/fp8_kernel.py +1 -1
  57. sglang/srt/layers/quantization/fp8_utils.py +6 -6
  58. sglang/srt/layers/quantization/gptq.py +5 -1
  59. sglang/srt/layers/quantization/moe_wna16.py +1 -1
  60. sglang/srt/layers/quantization/quant_utils.py +166 -0
  61. sglang/srt/layers/quantization/w8a8_int8.py +52 -1
  62. sglang/srt/layers/rotary_embedding.py +105 -13
  63. sglang/srt/layers/vocab_parallel_embedding.py +19 -2
  64. sglang/srt/lora/lora.py +4 -5
  65. sglang/srt/lora/lora_manager.py +73 -20
  66. sglang/srt/managers/configure_logging.py +1 -1
  67. sglang/srt/managers/io_struct.py +60 -15
  68. sglang/srt/managers/mm_utils.py +73 -59
  69. sglang/srt/managers/multimodal_processor.py +2 -6
  70. sglang/srt/managers/multimodal_processors/qwen_audio.py +94 -0
  71. sglang/srt/managers/schedule_batch.py +80 -79
  72. sglang/srt/managers/scheduler.py +153 -63
  73. sglang/srt/managers/scheduler_output_processor_mixin.py +8 -2
  74. sglang/srt/managers/session_controller.py +12 -3
  75. sglang/srt/managers/tokenizer_manager.py +314 -103
  76. sglang/srt/managers/tp_worker.py +13 -1
  77. sglang/srt/managers/tp_worker_overlap_thread.py +8 -0
  78. sglang/srt/mem_cache/allocator.py +290 -0
  79. sglang/srt/mem_cache/chunk_cache.py +34 -2
  80. sglang/srt/mem_cache/memory_pool.py +289 -3
  81. sglang/srt/mem_cache/multimodal_cache.py +3 -0
  82. sglang/srt/model_executor/cuda_graph_runner.py +3 -2
  83. sglang/srt/model_executor/forward_batch_info.py +17 -4
  84. sglang/srt/model_executor/model_runner.py +302 -58
  85. sglang/srt/model_loader/loader.py +86 -10
  86. sglang/srt/model_loader/weight_utils.py +160 -3
  87. sglang/srt/models/deepseek_nextn.py +5 -4
  88. sglang/srt/models/deepseek_v2.py +305 -26
  89. sglang/srt/models/deepseek_vl2.py +3 -5
  90. sglang/srt/models/gemma3_causal.py +1 -2
  91. sglang/srt/models/gemma3n_audio.py +949 -0
  92. sglang/srt/models/gemma3n_causal.py +1010 -0
  93. sglang/srt/models/gemma3n_mm.py +495 -0
  94. sglang/srt/models/hunyuan.py +771 -0
  95. sglang/srt/models/kimi_vl.py +1 -2
  96. sglang/srt/models/llama.py +10 -4
  97. sglang/srt/models/llama4.py +32 -45
  98. sglang/srt/models/llama_eagle3.py +61 -11
  99. sglang/srt/models/llava.py +5 -5
  100. sglang/srt/models/minicpmo.py +2 -2
  101. sglang/srt/models/mistral.py +1 -1
  102. sglang/srt/models/mllama4.py +43 -11
  103. sglang/srt/models/phi4mm.py +1 -3
  104. sglang/srt/models/pixtral.py +3 -7
  105. sglang/srt/models/qwen2.py +31 -3
  106. sglang/srt/models/qwen2_5_vl.py +1 -3
  107. sglang/srt/models/qwen2_audio.py +200 -0
  108. sglang/srt/models/qwen2_moe.py +32 -6
  109. sglang/srt/models/qwen2_vl.py +1 -4
  110. sglang/srt/models/qwen3.py +94 -25
  111. sglang/srt/models/qwen3_moe.py +68 -21
  112. sglang/srt/models/vila.py +3 -8
  113. sglang/srt/{managers/multimodal_processors → multimodal/processors}/base_processor.py +150 -133
  114. sglang/srt/{managers/multimodal_processors → multimodal/processors}/clip.py +2 -13
  115. sglang/srt/{managers/multimodal_processors → multimodal/processors}/deepseek_vl_v2.py +4 -11
  116. sglang/srt/{managers/multimodal_processors → multimodal/processors}/gemma3.py +3 -10
  117. sglang/srt/multimodal/processors/gemma3n.py +82 -0
  118. sglang/srt/{managers/multimodal_processors → multimodal/processors}/internvl.py +3 -10
  119. sglang/srt/{managers/multimodal_processors → multimodal/processors}/janus_pro.py +3 -9
  120. sglang/srt/{managers/multimodal_processors → multimodal/processors}/kimi_vl.py +6 -13
  121. sglang/srt/{managers/multimodal_processors → multimodal/processors}/llava.py +2 -10
  122. sglang/srt/{managers/multimodal_processors → multimodal/processors}/minicpm.py +5 -12
  123. sglang/srt/{managers/multimodal_processors → multimodal/processors}/mlama.py +2 -14
  124. sglang/srt/{managers/multimodal_processors → multimodal/processors}/mllama4.py +3 -6
  125. sglang/srt/{managers/multimodal_processors → multimodal/processors}/phi4mm.py +4 -14
  126. sglang/srt/{managers/multimodal_processors → multimodal/processors}/pixtral.py +3 -9
  127. sglang/srt/{managers/multimodal_processors → multimodal/processors}/qwen_vl.py +8 -14
  128. sglang/srt/{managers/multimodal_processors → multimodal/processors}/vila.py +13 -31
  129. sglang/srt/operations_strategy.py +6 -2
  130. sglang/srt/reasoning_parser.py +26 -0
  131. sglang/srt/sampling/sampling_batch_info.py +39 -1
  132. sglang/srt/server_args.py +85 -24
  133. sglang/srt/speculative/build_eagle_tree.py +57 -18
  134. sglang/srt/speculative/eagle_worker.py +6 -4
  135. sglang/srt/two_batch_overlap.py +204 -28
  136. sglang/srt/utils.py +369 -138
  137. sglang/srt/warmup.py +12 -3
  138. sglang/test/runners.py +10 -1
  139. sglang/test/test_utils.py +15 -3
  140. sglang/version.py +1 -1
  141. {sglang-0.4.8.dist-info → sglang-0.4.9.dist-info}/METADATA +9 -6
  142. {sglang-0.4.8.dist-info → sglang-0.4.9.dist-info}/RECORD +149 -137
  143. sglang/math_utils.py +0 -8
  144. /sglang/srt/{managers → eplb}/eplb_algorithms/deepseek.py +0 -0
  145. /sglang/srt/{managers → eplb}/eplb_algorithms/deepseek_vec.py +0 -0
  146. /sglang/srt/{eplb_simulator → eplb/eplb_simulator}/__init__.py +0 -0
  147. /sglang/srt/{mm_utils.py → multimodal/mm_utils.py} +0 -0
  148. {sglang-0.4.8.dist-info → sglang-0.4.9.dist-info}/WHEEL +0 -0
  149. {sglang-0.4.8.dist-info → sglang-0.4.9.dist-info}/licenses/LICENSE +0 -0
  150. {sglang-0.4.8.dist-info → sglang-0.4.9.dist-info}/top_level.txt +0 -0
sglang/test/runners.py CHANGED
@@ -503,6 +503,8 @@ class SRTRunner:
503
503
  disable_overlap_schedule: bool = False,
504
504
  disable_custom_all_reduce: bool = False,
505
505
  torchao_config: Optional[str] = None,
506
+ cuda_graph_max_bs: int = 4,
507
+ sleep_on_idle=False,
506
508
  ):
507
509
  self.model_type = model_type
508
510
  self.is_generation = model_type == "generation"
@@ -538,8 +540,9 @@ class SRTRunner:
538
540
  tokenizer_path=tokenizer_path,
539
541
  enable_ep_moe=enable_ep_moe,
540
542
  disable_overlap_schedule=disable_overlap_schedule,
541
- cuda_graph_max_bs=4,
543
+ cuda_graph_max_bs=cuda_graph_max_bs,
542
544
  disable_custom_all_reduce=disable_custom_all_reduce,
545
+ sleep_on_idle=sleep_on_idle,
543
546
  **spec_kwargs,
544
547
  )
545
548
 
@@ -550,6 +553,12 @@ class SRTRunner:
550
553
  else:
551
554
  self.tokenizer = None
552
555
 
556
+ def load_lora_adapter(self, lora_name: str, lora_path: str):
557
+ return self.engine.load_lora_adapter(lora_name, lora_path)
558
+
559
+ def unload_lora_adapter(self, lora_name: str):
560
+ return self.engine.unload_lora_adapter(lora_name)
561
+
553
562
  def forward(
554
563
  self,
555
564
  prompts: Union[
sglang/test/test_utils.py CHANGED
@@ -5,6 +5,7 @@ import copy
5
5
  import logging
6
6
  import os
7
7
  import random
8
+ import re
8
9
  import subprocess
9
10
  import threading
10
11
  import time
@@ -840,12 +841,23 @@ def run_bench_one_batch(model, other_args):
840
841
  print(f"Output: {output}", flush=True)
841
842
  print(f"Error: {error}", flush=True)
842
843
 
843
- lastline = output.split("\n")[-3]
844
- output_throughput = float(lastline.split(" ")[-2])
844
+ # Return prefill_latency, decode_throughput, decode_latency
845
+ prefill_line = output.split("\n")[-9]
846
+ decode_line = output.split("\n")[-3]
847
+ pattern = (
848
+ r"latency: (?P<latency>\d+\.\d+).*?throughput:\s*(?P<throughput>\d+\.\d+)"
849
+ )
850
+ match = re.search(pattern, prefill_line)
851
+ if match:
852
+ prefill_latency = float(match.group("latency"))
853
+ match = re.search(pattern, decode_line)
854
+ if match:
855
+ decode_latency = float(match.group("latency"))
856
+ decode_throughput = float(match.group("throughput"))
845
857
  finally:
846
858
  kill_process_tree(process.pid)
847
859
 
848
- return output_throughput
860
+ return prefill_latency, decode_throughput, decode_latency
849
861
 
850
862
 
851
863
  def run_bench_offline_throughput(model, other_args):
sglang/version.py CHANGED
@@ -1 +1 @@
1
- __version__ = "0.4.8"
1
+ __version__ = "0.4.9"
{sglang-0.4.8.dist-info → sglang-0.4.9.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sglang
3
- Version: 0.4.8
3
+ Version: 0.4.9
4
4
  Summary: SGLang is yet another fast serving framework for large language models and vision language models.
5
5
  License: Apache License
6
6
  Version 2.0, January 2004
@@ -219,6 +219,7 @@ Requires-Dist: IPython
219
219
  Requires-Dist: setproctitle
220
220
  Provides-Extra: runtime-common
221
221
  Requires-Dist: blobfile==3.0.0; extra == "runtime-common"
222
+ Requires-Dist: build; extra == "runtime-common"
222
223
  Requires-Dist: compressed-tensors; extra == "runtime-common"
223
224
  Requires-Dist: datasets; extra == "runtime-common"
224
225
  Requires-Dist: fastapi; extra == "runtime-common"
@@ -243,19 +244,20 @@ Requires-Dist: pyzmq>=25.1.2; extra == "runtime-common"
243
244
  Requires-Dist: soundfile==0.13.1; extra == "runtime-common"
244
245
  Requires-Dist: scipy; extra == "runtime-common"
245
246
  Requires-Dist: torchao==0.9.0; extra == "runtime-common"
246
- Requires-Dist: transformers==4.52.3; extra == "runtime-common"
247
+ Requires-Dist: transformers==4.53.0; extra == "runtime-common"
248
+ Requires-Dist: timm==1.0.16; extra == "runtime-common"
247
249
  Requires-Dist: uvicorn; extra == "runtime-common"
248
250
  Requires-Dist: uvloop; extra == "runtime-common"
249
251
  Requires-Dist: xgrammar==0.1.19; extra == "runtime-common"
250
252
  Provides-Extra: srt
251
253
  Requires-Dist: sglang[runtime_common]; extra == "srt"
252
- Requires-Dist: sgl-kernel==0.1.9; extra == "srt"
254
+ Requires-Dist: sgl-kernel==0.2.4; extra == "srt"
253
255
  Requires-Dist: torch==2.7.1; extra == "srt"
254
256
  Requires-Dist: torchaudio==2.7.1; extra == "srt"
255
257
  Requires-Dist: torchvision==0.22.1; extra == "srt"
256
258
  Requires-Dist: cuda-python; extra == "srt"
257
259
  Requires-Dist: einops; extra == "srt"
258
- Requires-Dist: flashinfer_python==0.2.6.post1; extra == "srt"
260
+ Requires-Dist: flashinfer_python==0.2.7.post1; extra == "srt"
259
261
  Provides-Extra: blackwell
260
262
  Requires-Dist: sglang[runtime_common]; extra == "blackwell"
261
263
  Requires-Dist: sgl-kernel; extra == "blackwell"
@@ -264,7 +266,7 @@ Requires-Dist: torchaudio==2.7.1; extra == "blackwell"
264
266
  Requires-Dist: torchvision==0.22.1; extra == "blackwell"
265
267
  Requires-Dist: cuda-python; extra == "blackwell"
266
268
  Requires-Dist: einops; extra == "blackwell"
267
- Requires-Dist: flashinfer_python==0.2.6.post1; extra == "blackwell"
269
+ Requires-Dist: flashinfer_python==0.2.7.post1; extra == "blackwell"
268
270
  Provides-Extra: srt-hip
269
271
  Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
270
272
  Requires-Dist: torch; extra == "srt-hip"
@@ -295,7 +297,6 @@ Requires-Dist: jsonlines; extra == "test"
295
297
  Requires-Dist: matplotlib; extra == "test"
296
298
  Requires-Dist: pandas; extra == "test"
297
299
  Requires-Dist: peft; extra == "test"
298
- Requires-Dist: timm; extra == "test"
299
300
  Requires-Dist: sentence_transformers; extra == "test"
300
301
  Provides-Extra: all
301
302
  Requires-Dist: sglang[srt]; extra == "all"
@@ -373,6 +374,8 @@ Dynamic: license-file
373
374
  | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |
374
375
 
375
376
  ## News
377
+ - [2025/06] 🔥 SGLang, the high-performance serving infrastructure powering trillions of tokens daily, has been awarded the third batch of the Open Source AI Grant by a16z ([a16z blog](https://a16z.com/advancing-open-source-ai-through-benchmarks-and-bold-experimentation/)).
378
+ - [2025/06] 🔥 Deploying DeepSeek on GB200 NVL72 with PD and Large Scale EP (Part I): 2.7x Higher Decoding Throughput ([blog](https://lmsys.org/blog/2025-06-16-gb200-part-1/)).
376
379
  - [2025/05] 🔥 Deploying DeepSeek with PD Disaggregation and Large-scale Expert Parallelism on 96 H100 GPUs ([blog](https://lmsys.org/blog/2025-05-05-large-scale-ep/)).
377
380
  - [2025/03] Supercharge DeepSeek-R1 Inference on AMD Instinct MI300X ([AMD blog](https://rocm.blogs.amd.com/artificial-intelligence/DeepSeekR1-Part2/README.html))
378
381
  - [2025/03] SGLang Joins PyTorch Ecosystem: Efficient LLM Serving Engine ([PyTorch blog](https://pytorch.org/blog/sglang-joins-pytorch/))