sglang 0.4.10.post2__py3-none-any.whl → 0.5.0rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (175)
  1. sglang/__init__.py +8 -3
  2. sglang/bench_one_batch.py +119 -17
  3. sglang/lang/chat_template.py +18 -0
  4. sglang/srt/bench_utils.py +137 -0
  5. sglang/srt/configs/model_config.py +42 -7
  6. sglang/srt/conversation.py +9 -5
  7. sglang/srt/disaggregation/base/conn.py +5 -2
  8. sglang/srt/disaggregation/decode.py +14 -4
  9. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +3 -0
  10. sglang/srt/disaggregation/mooncake/conn.py +286 -160
  11. sglang/srt/disaggregation/mooncake/transfer_engine.py +29 -0
  12. sglang/srt/disaggregation/prefill.py +2 -0
  13. sglang/srt/distributed/parallel_state.py +15 -11
  14. sglang/srt/entrypoints/context.py +227 -0
  15. sglang/srt/entrypoints/engine.py +15 -9
  16. sglang/srt/entrypoints/harmony_utils.py +372 -0
  17. sglang/srt/entrypoints/http_server.py +74 -4
  18. sglang/srt/entrypoints/openai/protocol.py +218 -1
  19. sglang/srt/entrypoints/openai/serving_chat.py +41 -11
  20. sglang/srt/entrypoints/openai/serving_responses.py +1273 -0
  21. sglang/srt/entrypoints/openai/tool_server.py +175 -0
  22. sglang/srt/entrypoints/tool.py +87 -0
  23. sglang/srt/eplb/expert_location.py +5 -1
  24. sglang/srt/function_call/ebnf_composer.py +1 -0
  25. sglang/srt/function_call/function_call_parser.py +2 -0
  26. sglang/srt/function_call/glm4_moe_detector.py +1 -1
  27. sglang/srt/function_call/gpt_oss_detector.py +331 -0
  28. sglang/srt/function_call/kimik2_detector.py +3 -3
  29. sglang/srt/function_call/qwen3_coder_detector.py +219 -9
  30. sglang/srt/hf_transformers_utils.py +30 -3
  31. sglang/srt/jinja_template_utils.py +14 -1
  32. sglang/srt/layers/attention/aiter_backend.py +375 -115
  33. sglang/srt/layers/attention/ascend_backend.py +3 -0
  34. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1700 -0
  35. sglang/srt/layers/attention/flashattention_backend.py +18 -0
  36. sglang/srt/layers/attention/flashinfer_backend.py +52 -13
  37. sglang/srt/layers/attention/hybrid_attn_backend.py +1 -1
  38. sglang/srt/layers/attention/triton_backend.py +85 -14
  39. sglang/srt/layers/attention/triton_ops/decode_attention.py +17 -0
  40. sglang/srt/layers/attention/triton_ops/extend_attention.py +143 -98
  41. sglang/srt/layers/attention/trtllm_mha_backend.py +332 -0
  42. sglang/srt/layers/attention/trtllm_mla_backend.py +119 -22
  43. sglang/srt/layers/attention/vision.py +22 -6
  44. sglang/srt/layers/attention/wave_backend.py +627 -0
  45. sglang/srt/layers/attention/wave_ops/decode_attention.py +186 -0
  46. sglang/srt/layers/attention/wave_ops/extend_attention.py +149 -0
  47. sglang/srt/layers/attention/wave_ops/prefill_attention.py +79 -0
  48. sglang/srt/layers/communicator.py +29 -14
  49. sglang/srt/layers/dp_attention.py +12 -0
  50. sglang/srt/layers/flashinfer_comm_fusion.py +4 -4
  51. sglang/srt/layers/linear.py +3 -7
  52. sglang/srt/layers/moe/cutlass_moe.py +12 -3
  53. sglang/srt/layers/moe/cutlass_w4a8_moe.py +4 -5
  54. sglang/srt/layers/moe/ep_moe/kernels.py +43 -0
  55. sglang/srt/layers/moe/ep_moe/layer.py +135 -73
  56. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  57. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=384,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  58. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +101 -12
  59. sglang/srt/layers/moe/fused_moe_triton/layer.py +412 -33
  60. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +188 -3
  61. sglang/srt/layers/moe/token_dispatcher/deepep.py +61 -24
  62. sglang/srt/layers/moe/topk.py +16 -4
  63. sglang/srt/layers/moe/utils.py +16 -0
  64. sglang/srt/layers/quantization/__init__.py +27 -3
  65. sglang/srt/layers/quantization/fp4.py +557 -0
  66. sglang/srt/layers/quantization/fp8.py +3 -6
  67. sglang/srt/layers/quantization/fp8_kernel.py +277 -0
  68. sglang/srt/layers/quantization/fp8_utils.py +51 -10
  69. sglang/srt/layers/quantization/modelopt_quant.py +258 -68
  70. sglang/srt/layers/quantization/mxfp4.py +654 -0
  71. sglang/srt/layers/quantization/mxfp4_tensor.py +133 -0
  72. sglang/srt/layers/quantization/quark/schemes/__init__.py +6 -0
  73. sglang/srt/layers/quantization/quark/schemes/quark_scheme.py +55 -0
  74. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +118 -0
  75. sglang/srt/layers/quantization/quark/utils.py +107 -0
  76. sglang/srt/layers/quantization/unquant.py +60 -6
  77. sglang/srt/layers/quantization/w4afp8.py +21 -12
  78. sglang/srt/layers/quantization/w8a8_int8.py +48 -34
  79. sglang/srt/layers/rotary_embedding.py +506 -3
  80. sglang/srt/layers/utils.py +9 -0
  81. sglang/srt/layers/vocab_parallel_embedding.py +8 -3
  82. sglang/srt/lora/backend/base_backend.py +3 -23
  83. sglang/srt/lora/layers.py +60 -114
  84. sglang/srt/lora/lora.py +17 -62
  85. sglang/srt/lora/lora_manager.py +82 -62
  86. sglang/srt/lora/lora_registry.py +23 -11
  87. sglang/srt/lora/mem_pool.py +63 -68
  88. sglang/srt/lora/triton_ops/qkv_lora_b.py +1 -1
  89. sglang/srt/lora/utils.py +25 -58
  90. sglang/srt/managers/cache_controller.py +75 -58
  91. sglang/srt/managers/detokenizer_manager.py +1 -1
  92. sglang/srt/managers/io_struct.py +20 -8
  93. sglang/srt/managers/mm_utils.py +6 -13
  94. sglang/srt/managers/multimodal_processor.py +1 -1
  95. sglang/srt/managers/schedule_batch.py +61 -25
  96. sglang/srt/managers/schedule_policy.py +6 -6
  97. sglang/srt/managers/scheduler.py +41 -19
  98. sglang/srt/managers/scheduler_output_processor_mixin.py +1 -2
  99. sglang/srt/managers/scheduler_profiler_mixin.py +28 -8
  100. sglang/srt/managers/scheduler_recv_skipper.py +37 -0
  101. sglang/srt/managers/scheduler_update_weights_mixin.py +6 -0
  102. sglang/srt/managers/template_manager.py +35 -1
  103. sglang/srt/managers/tokenizer_manager.py +47 -30
  104. sglang/srt/managers/tp_worker.py +3 -0
  105. sglang/srt/managers/tp_worker_overlap_thread.py +3 -0
  106. sglang/srt/mem_cache/allocator.py +61 -87
  107. sglang/srt/mem_cache/hicache_storage.py +1 -1
  108. sglang/srt/mem_cache/hiradix_cache.py +80 -22
  109. sglang/srt/mem_cache/lora_radix_cache.py +421 -0
  110. sglang/srt/mem_cache/memory_pool_host.py +34 -36
  111. sglang/srt/mem_cache/multimodal_cache.py +33 -13
  112. sglang/srt/mem_cache/radix_cache.py +2 -5
  113. sglang/srt/mem_cache/storage/hf3fs/client_hf3fs.py +2 -2
  114. sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +443 -0
  115. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +139 -67
  116. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +6 -9
  117. sglang/srt/model_executor/cuda_graph_runner.py +29 -9
  118. sglang/srt/model_executor/forward_batch_info.py +61 -19
  119. sglang/srt/model_executor/model_runner.py +148 -37
  120. sglang/srt/model_loader/loader.py +18 -6
  121. sglang/srt/model_loader/weight_utils.py +10 -0
  122. sglang/srt/models/bailing_moe.py +425 -0
  123. sglang/srt/models/deepseek_v2.py +137 -59
  124. sglang/srt/models/ernie4.py +426 -0
  125. sglang/srt/models/ernie4_eagle.py +203 -0
  126. sglang/srt/models/gemma2.py +0 -34
  127. sglang/srt/models/gemma3n_mm.py +38 -0
  128. sglang/srt/models/glm4.py +6 -0
  129. sglang/srt/models/glm4_moe.py +28 -16
  130. sglang/srt/models/glm4v.py +589 -0
  131. sglang/srt/models/glm4v_moe.py +400 -0
  132. sglang/srt/models/gpt_oss.py +1251 -0
  133. sglang/srt/models/granite.py +0 -25
  134. sglang/srt/models/llama.py +0 -25
  135. sglang/srt/models/llama4.py +1 -1
  136. sglang/srt/models/qwen2.py +6 -0
  137. sglang/srt/models/qwen2_5_vl.py +7 -3
  138. sglang/srt/models/qwen2_audio.py +10 -9
  139. sglang/srt/models/qwen2_moe.py +6 -0
  140. sglang/srt/models/qwen3.py +0 -24
  141. sglang/srt/models/qwen3_moe.py +32 -6
  142. sglang/srt/models/registry.py +1 -1
  143. sglang/srt/models/step3_vl.py +9 -0
  144. sglang/srt/models/torch_native_llama.py +0 -24
  145. sglang/srt/models/transformers.py +2 -5
  146. sglang/srt/multimodal/processors/base_processor.py +23 -13
  147. sglang/srt/multimodal/processors/glm4v.py +132 -0
  148. sglang/srt/multimodal/processors/qwen_audio.py +4 -2
  149. sglang/srt/multimodal/processors/step3_vl.py +3 -1
  150. sglang/srt/reasoning_parser.py +332 -37
  151. sglang/srt/server_args.py +186 -75
  152. sglang/srt/speculative/eagle_worker.py +16 -0
  153. sglang/srt/two_batch_overlap.py +169 -9
  154. sglang/srt/utils.py +41 -5
  155. sglang/srt/weight_sync/tensor_bucket.py +106 -0
  156. sglang/test/attention/test_trtllm_mla_backend.py +186 -36
  157. sglang/test/doc_patch.py +59 -0
  158. sglang/test/few_shot_gsm8k.py +1 -1
  159. sglang/test/few_shot_gsm8k_engine.py +1 -1
  160. sglang/test/run_eval.py +4 -1
  161. sglang/test/runners.py +2 -2
  162. sglang/test/simple_eval_common.py +6 -0
  163. sglang/test/simple_eval_gpqa.py +2 -0
  164. sglang/test/test_fp4_moe.py +118 -36
  165. sglang/test/test_utils.py +1 -1
  166. sglang/utils.py +1 -1
  167. sglang/version.py +1 -1
  168. {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/METADATA +36 -38
  169. {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/RECORD +174 -141
  170. sglang/srt/lora/backend/flashinfer_backend.py +0 -131
  171. /sglang/{api.py → lang/api.py} +0 -0
  172. /sglang/{lang/backend → srt/layers/quantization/quark}/__init__.py +0 -0
  173. {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/WHEEL +0 -0
  174. {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/licenses/LICENSE +0 -0
  175. {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/top_level.txt +0 -0
sglang/srt/layers/quantization/w8a8_int8.py
@@ -3,7 +3,18 @@ from __future__ import annotations
 import importlib
 import sys
 from types import MappingProxyType
-from typing import TYPE_CHECKING, Any, Dict, List, Mapping, Optional, Tuple, Union, cast
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Callable,
+    Dict,
+    List,
+    Mapping,
+    Optional,
+    Tuple,
+    Union,
+    cast,
+)
 
 import torch
 from torch.nn.parameter import Parameter
@@ -79,22 +90,16 @@ def npu_wrapper_rmsnorm_forward(func):
     ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
         if not x.is_contiguous():
             x = x.contiguous()
-        original_dtype = x.dtype
-        x = x.to(torch.float32)
         if residual is not None:
-            x = x + residual.to(torch.float32)
-            residual = x.to(original_dtype)
-
-        x = (
-            torch_npu.npu_rms_norm(
-                x, self.weight.to(torch.float32), self.variance_epsilon
-            )[0]
-            + self.bias
-        )
+            out, _, residual_out = torch_npu.npu_add_rms_norm(
+                residual, x, self.weight.data, self.variance_epsilon
+            )
+            out = out + self.bias
+            return out.to(x.dtype), residual_out
 
-        if residual is None:
-            return x.to(original_dtype)
-        return x.to(original_dtype), residual
+        out = torch_npu.npu_rms_norm(x, self.weight.data, self.variance_epsilon)[0]
+        out = out + self.bias
+        return out.to(x.dtype)
 
     return _rmsnorm_forward_oot
 
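The hunk above replaces the hand-rolled float32 RMSNorm path with the fused torch_npu.npu_add_rms_norm kernel when a residual is present, keeping only the trailing bias add outside the kernel. Below is a minimal eager-mode sketch of the arithmetic the fused call is expected to perform, assuming standard RMSNorm semantics; the function name and its internal precision handling are illustrative, not sglang or torch_npu code.

import torch

def add_rms_norm_reference(
    x: torch.Tensor,
    residual: torch.Tensor,
    weight: torch.Tensor,
    bias: torch.Tensor,
    eps: float,
):
    # Fold the residual in first; this sum is also returned so the next
    # layer can reuse it as its residual input (the diff's residual_out).
    h = x.float() + residual.float()
    # Standard RMSNorm over the last dimension, then scale and bias.
    inv_rms = torch.rsqrt(h.pow(2).mean(dim=-1, keepdim=True) + eps)
    out = h * inv_rms * weight.float() + bias.float()
    return out.to(x.dtype), h.to(x.dtype)

x = torch.randn(2, 8, dtype=torch.float16)
residual = torch.randn(2, 8, dtype=torch.float16)
out, residual_out = add_rms_norm_reference(
    x, residual, torch.ones(8), torch.zeros(8), eps=1e-6
)

The explicit float32 upcasts from the old code are dropped on the assumption that the fused kernel handles precision internally; the second return value mirrors residual_out in the diff.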
@@ -250,17 +255,23 @@ class W8A8Int8Config(QuantizationConfig):
 
         if _is_npu:
             if isinstance(layer, LinearBase):
+                key = "model"
+                if "vision_model" in prefix:
+                    key = "vision_model"
+                elif "visual" in prefix:
+                    key = "visual"
+                packed_modules_mapping_subset = self.packed_modules_mapping.get(key, {})
                 prefix_in_quant_config = prefix
                 proj_name = prefix.split(".")[-1]
-                if proj_name in self.packed_modules_mapping:
+                if proj_name in packed_modules_mapping_subset:
                     prefix_in_quant_config = prefix.replace(
-                        proj_name, self.packed_modules_mapping[proj_name][0]
+                        proj_name, packed_modules_mapping_subset[proj_name][0]
                     )
                 self.is_dynamic = (
                     self.quant_description[prefix_in_quant_config + ".weight"]
                     == "W8A8_DYNAMIC"
                 )
-                if self.is_layer_skipped(prefix, self.packed_modules_mapping):
+                if self.is_layer_skipped(prefix, packed_modules_mapping_subset):
                     return UnquantizedLinearMethod()
                 return (
                     NPU_W8A8DynamicLinearMethod(self)
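With this change the NPU W8A8 config no longer treats packed_modules_mapping as a flat dict: it is keyed by sub-model ("model", "vision_model", "visual"), and the layer prefix selects which subset is used to translate a fused projection name back to the per-weight entry stored in the quant description. A hedged sketch of that lookup follows; the mapping contents are illustrative, not copied from sglang.

# Hypothetical nested mapping shaped like packed_modules_mapping after this change.
packed_modules_mapping = {
    "model": {
        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
        "gate_up_proj": ["gate_proj", "up_proj"],
    },
    "visual": {"qkv_proj": ["q_proj", "k_proj", "v_proj"]},
}

def resolve_quant_prefix(prefix: str) -> str:
    # Pick the sub-model mapping the same way the diff does: by substring
    # match on the layer prefix, falling back to the language-model mapping.
    key = "model"
    if "vision_model" in prefix:
        key = "vision_model"
    elif "visual" in prefix:
        key = "visual"
    subset = packed_modules_mapping.get(key, {})
    proj_name = prefix.split(".")[-1]
    if proj_name in subset:
        # The quant description stores per-weight entries under the first
        # member of the packed group, so look the fused layer up under it.
        return prefix.replace(proj_name, subset[proj_name][0])
    return prefix

print(resolve_quant_prefix("visual.blocks.0.attn.qkv_proj"))  # visual.blocks.0.attn.q_proj

This keeps vision-tower layers from being resolved against the language-model mapping.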
@@ -571,8 +582,10 @@ class NPU_W8A8LinearMethodImpl:
         layer: torch.nn.Module,
         x: torch.Tensor,
         bias: Optional[torch.Tensor] = None,
-        tp_rank: Optional[int] = 0,
     ) -> torch.Tensor:
+        # To prevent import loops
+        from sglang.srt.layers.linear import RowParallelLinear
+
         original_dtype = x.dtype
         if original_dtype != torch.int8:
             x = torch_npu.npu_quantize(
@@ -583,8 +596,12 @@ class NPU_W8A8LinearMethodImpl:
                 -1,
                 True,
             )
-
-        quant_bias = layer.quant_bias if tp_rank == 0 else None
+        # Only fuse bias add into GEMM for rank 0 (this ensures that
+        # bias will not get added more than once in Attention TP>1 case)
+        if isinstance(layer, RowParallelLinear) and layer.tp_rank > 0:
+            quant_bias = None
+        else:
+            quant_bias = layer.quant_bias
         return torch_npu.npu_quant_matmul(
             x,
             layer.weight,
@@ -651,13 +668,21 @@ class NPU_W8A8LinearMethodMTImpl:
         layer: torch.nn.Module,
         x: torch.Tensor,
         bias: Optional[torch.Tensor] = None,
-        tp_rank: Optional[int] = 0,
     ) -> torch.Tensor:
+        # To prevent import loops
+        from sglang.srt.layers.linear import RowParallelLinear
+
         original_dtype = x.dtype
         if original_dtype != torch.int8:
             x = quant_per_tensor(x, layer.input_scale, layer.input_offset)
 
-        quant_bias = layer.quant_bias if tp_rank == 0 else None
+        # Only fuse bias add into GEMM for rank 0 (this ensures that
+        # bias will not get added more than once in Attention TP>1 case)
+        if isinstance(layer, RowParallelLinear) and layer.tp_rank > 0:
+            quant_bias = None
+        else:
+            quant_bias = layer.quant_bias
+
         return ops.quant_matmul(
             x=x, weight=layer.weight, deq_scale=layer.deq_scale, deq_bias=quant_bias
         )
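Both NPU linear implementations now derive the bias decision from the layer itself instead of a tp_rank argument threaded in by the caller: for a RowParallelLinear, only rank 0 folds quant_bias into the GEMM, because every rank computes a partial product over its input shard and the partials are summed by an all-reduce, so a bias added on every rank would appear in the output tp_size times. A toy numeric sketch of that invariant (plain PyTorch, no NPU ops; the manual sharding below merely stands in for tensor parallelism):

import torch

torch.manual_seed(0)
x = torch.randn(4, 8)
w = torch.randn(8, 3)
bias = torch.randn(3)
tp_size = 2

# Row-parallel layout: shard the reduction dimension across "ranks".
x_shards = x.chunk(tp_size, dim=1)
w_shards = w.chunk(tp_size, dim=0)

# Correct: only rank 0 folds the bias into its partial GEMM.
partials = [
    x_shards[r] @ w_shards[r] + (bias if r == 0 else 0) for r in range(tp_size)
]
out = sum(partials)  # stands in for the all-reduce
assert torch.allclose(out, x @ w + bias, atol=1e-5)

# Wrong: if every rank added it, the sum would contain tp_size copies of bias.
partials_bad = [x_shards[r] @ w_shards[r] + bias for r in range(tp_size)]
assert torch.allclose(sum(partials_bad), x @ w + tp_size * bias, atol=1e-5)

Checking isinstance(layer, RowParallelLinear) inside apply() is what lets the later hunks drop the tp_rank plumbing from the outer NPU_W8A8LinearMethod and NPU_W8A8DynamicLinearMethod wrappers.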
@@ -737,11 +762,6 @@ class NPU_W8A8LinearMethod(LinearMethodBase):
         x: torch.Tensor,
         bias: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
-        from sglang.srt.layers.linear import RowParallelLinear
-
-        if isinstance(layer, RowParallelLinear):
-            tp_rank = get_tensor_model_parallel_rank()
-            return self.quant_method.apply(layer, x, bias, tp_rank)
         return self.quant_method.apply(layer, x, bias)
 
 
@@ -780,7 +800,6 @@ class NPU_W8A8DynamicLinearMethodImpl:
         tp_rank: Optional[int] = 0,
     ) -> torch.Tensor:
         original_dtype = x.dtype
-        # use ATB quantize
         quant_out, dynamic_scale = torch_npu.npu_dynamic_quant(x)
         return torch_npu.npu_quant_matmul(
             quant_out,
@@ -863,11 +882,6 @@ class NPU_W8A8DynamicLinearMethod(LinearMethodBase):
         x: torch.Tensor,
         bias: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
-        from sglang.srt.layers.linear import RowParallelLinear
-
-        if isinstance(layer, RowParallelLinear):
-            tp_rank = get_tensor_model_parallel_rank()
-            return self.quant_method.apply(layer, x, bias, tp_rank)
         return self.quant_method.apply(layer, x, bias)
 
 