sglang 0.5.1.post2__py3-none-any.whl → 0.5.2rc0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (107)
  1. sglang/bench_one_batch.py +3 -0
  2. sglang/bench_one_batch_server.py +79 -53
  3. sglang/bench_serving.py +186 -14
  4. sglang/profiler.py +0 -1
  5. sglang/srt/configs/__init__.py +2 -0
  6. sglang/srt/configs/longcat_flash.py +104 -0
  7. sglang/srt/configs/model_config.py +12 -0
  8. sglang/srt/connector/__init__.py +1 -1
  9. sglang/srt/connector/base_connector.py +1 -2
  10. sglang/srt/connector/redis.py +2 -2
  11. sglang/srt/connector/serde/__init__.py +1 -1
  12. sglang/srt/connector/serde/safe_serde.py +4 -3
  13. sglang/srt/conversation.py +38 -5
  14. sglang/srt/disaggregation/ascend/conn.py +75 -0
  15. sglang/srt/disaggregation/launch_lb.py +0 -13
  16. sglang/srt/disaggregation/mini_lb.py +33 -8
  17. sglang/srt/disaggregation/prefill.py +1 -1
  18. sglang/srt/distributed/parallel_state.py +24 -14
  19. sglang/srt/entrypoints/engine.py +19 -12
  20. sglang/srt/entrypoints/http_server.py +174 -34
  21. sglang/srt/entrypoints/openai/protocol.py +87 -24
  22. sglang/srt/entrypoints/openai/serving_chat.py +50 -9
  23. sglang/srt/entrypoints/openai/serving_completions.py +15 -0
  24. sglang/srt/eplb/eplb_manager.py +26 -2
  25. sglang/srt/eplb/expert_distribution.py +29 -2
  26. sglang/srt/function_call/deepseekv31_detector.py +222 -0
  27. sglang/srt/function_call/function_call_parser.py +2 -0
  28. sglang/srt/function_call/gpt_oss_detector.py +144 -256
  29. sglang/srt/harmony_parser.py +588 -0
  30. sglang/srt/hf_transformers_utils.py +26 -7
  31. sglang/srt/layers/activation.py +12 -0
  32. sglang/srt/layers/attention/ascend_backend.py +374 -136
  33. sglang/srt/layers/attention/flashattention_backend.py +241 -7
  34. sglang/srt/layers/attention/flashinfer_backend.py +5 -2
  35. sglang/srt/layers/attention/flashinfer_mla_backend.py +5 -2
  36. sglang/srt/layers/attention/hybrid_attn_backend.py +53 -21
  37. sglang/srt/layers/attention/trtllm_mla_backend.py +25 -10
  38. sglang/srt/layers/communicator.py +1 -2
  39. sglang/srt/layers/layernorm.py +28 -3
  40. sglang/srt/layers/linear.py +3 -2
  41. sglang/srt/layers/logits_processor.py +1 -1
  42. sglang/srt/layers/moe/cutlass_moe.py +0 -8
  43. sglang/srt/layers/moe/ep_moe/kernels.py +74 -0
  44. sglang/srt/layers/moe/ep_moe/layer.py +13 -13
  45. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=352,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +146 -0
  46. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=64,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  47. sglang/srt/layers/moe/topk.py +35 -12
  48. sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py +133 -235
  49. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +5 -10
  50. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +5 -23
  51. sglang/srt/layers/quantization/fp8.py +2 -1
  52. sglang/srt/layers/quantization/fp8_kernel.py +2 -2
  53. sglang/srt/layers/quantization/fp8_utils.py +2 -2
  54. sglang/srt/layers/quantization/modelopt_quant.py +7 -0
  55. sglang/srt/layers/quantization/mxfp4.py +25 -27
  56. sglang/srt/layers/quantization/mxfp4_tensor.py +3 -1
  57. sglang/srt/layers/quantization/utils.py +13 -0
  58. sglang/srt/layers/quantization/w8a8_int8.py +7 -3
  59. sglang/srt/layers/rotary_embedding.py +28 -1
  60. sglang/srt/layers/sampler.py +29 -5
  61. sglang/srt/layers/utils.py +0 -14
  62. sglang/srt/managers/cache_controller.py +237 -204
  63. sglang/srt/managers/detokenizer_manager.py +48 -2
  64. sglang/srt/managers/io_struct.py +57 -0
  65. sglang/srt/managers/mm_utils.py +5 -1
  66. sglang/srt/managers/multi_tokenizer_mixin.py +591 -0
  67. sglang/srt/managers/scheduler.py +94 -9
  68. sglang/srt/managers/scheduler_output_processor_mixin.py +20 -18
  69. sglang/srt/managers/scheduler_update_weights_mixin.py +8 -1
  70. sglang/srt/managers/tokenizer_manager.py +122 -42
  71. sglang/srt/mem_cache/chunk_cache.py +1 -1
  72. sglang/srt/mem_cache/hicache_storage.py +51 -23
  73. sglang/srt/mem_cache/hiradix_cache.py +87 -71
  74. sglang/srt/mem_cache/lora_radix_cache.py +1 -1
  75. sglang/srt/mem_cache/memory_pool.py +77 -14
  76. sglang/srt/mem_cache/memory_pool_host.py +4 -5
  77. sglang/srt/mem_cache/radix_cache.py +6 -4
  78. sglang/srt/mem_cache/radix_cache_cpp.py +1 -1
  79. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +38 -20
  80. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +87 -82
  81. sglang/srt/mem_cache/swa_radix_cache.py +1 -1
  82. sglang/srt/model_executor/model_runner.py +6 -5
  83. sglang/srt/model_loader/loader.py +15 -24
  84. sglang/srt/model_loader/utils.py +12 -0
  85. sglang/srt/models/deepseek_v2.py +38 -13
  86. sglang/srt/models/gpt_oss.py +2 -15
  87. sglang/srt/models/llama_eagle3.py +4 -0
  88. sglang/srt/models/longcat_flash.py +1015 -0
  89. sglang/srt/models/longcat_flash_nextn.py +691 -0
  90. sglang/srt/models/qwen2.py +26 -3
  91. sglang/srt/models/qwen2_5_vl.py +66 -41
  92. sglang/srt/models/qwen2_moe.py +22 -2
  93. sglang/srt/models/transformers.py +1 -1
  94. sglang/srt/multimodal/processors/base_processor.py +4 -2
  95. sglang/srt/reasoning_parser.py +56 -300
  96. sglang/srt/sampling/penaltylib/orchestrator.py +14 -2
  97. sglang/srt/server_args.py +122 -56
  98. sglang/srt/speculative/eagle_worker.py +28 -8
  99. sglang/srt/tokenizer/tiktoken_tokenizer.py +6 -1
  100. sglang/srt/utils.py +73 -5
  101. sglang/test/attention/test_trtllm_mla_backend.py +12 -3
  102. sglang/version.py +1 -1
  103. {sglang-0.5.1.post2.dist-info → sglang-0.5.2rc0.dist-info}/METADATA +7 -6
  104. {sglang-0.5.1.post2.dist-info → sglang-0.5.2rc0.dist-info}/RECORD +107 -99
  105. {sglang-0.5.1.post2.dist-info → sglang-0.5.2rc0.dist-info}/WHEEL +0 -0
  106. {sglang-0.5.1.post2.dist-info → sglang-0.5.2rc0.dist-info}/licenses/LICENSE +0 -0
  107. {sglang-0.5.1.post2.dist-info → sglang-0.5.2rc0.dist-info}/top_level.txt +0 -0
sglang/srt/layers/quantization/mxfp4_tensor.py
@@ -13,6 +13,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from typing import Optional
+
 import torch
 
 
@@ -24,7 +26,7 @@ class MXFP4QuantizeUtil:
     E2M1_bounds = torch.tensor([0.25, 0.75, 1.25, 1.75, 2.5, 3.5, 5])
 
     @classmethod
-    def quantize(cls, input: torch.Tensor, block_size: int | None) -> tuple:
+    def quantize(cls, input: torch.Tensor, block_size: Optional[int]) -> tuple:
         """Converting a tensor to a quantized format based on MXFP4 quantization. Only E4M3 is supported.
         Args:
             input (torch.Tensor): The input tensor to be quantized.
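Note: the annotation swap above (together with the new typing import) trades PEP 604 syntax for Optional[...], presumably to keep the module importable on older interpreters: `int | None` in a signature is evaluated when the function is defined and raises a TypeError on Python 3.8/3.9 unless annotations are deferred. A minimal illustration (the stub name below is hypothetical, not from the diff):

from typing import Optional

def quantize_stub(block_size: Optional[int]) -> None:  # valid on Python 3.8+
    ...

# def quantize_stub(block_size: int | None) -> None:   # needs Python >= 3.10
#     ...                                               # (unless `from __future__ import annotations` is used)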
sglang/srt/layers/quantization/utils.py
@@ -77,6 +77,19 @@ def is_layer_skipped(
         )
     else:
         is_skipped = prefix in ignored_layers
+        if "gate_up_proj" in prefix:
+            prefix_gate = prefix.replace("gate_up_proj", "gate_proj")
+            prefix_up = prefix.replace("gate_up_proj", "up_proj")
+            if prefix_gate in ignored_layers and prefix_up in ignored_layers:
+                is_skipped = True
+        elif "experts" in prefix:
+            is_skipped = any(
+                [
+                    prefix in layer_name
+                    for layer_name in ignored_layers
+                    if "experts" in layer_name
+                ]
+            )
 
     assert is_skipped is not None
     return is_skipped
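Note: the branches added above cover two naming mismatches between ignore lists in checkpoints and sglang's runtime module names: a fused gate_up_proj module is skipped only when both unfused halves are ignored, and an expert prefix is skipped when it matches any ignored expert layer. A standalone sketch of the same matching rules, with hypothetical layer names (the helper below is illustrative, not part of sglang):

from typing import List

def is_fused_layer_skipped(prefix: str, ignored_layers: List[str]) -> bool:
    # Fused gate/up projection: require both unfused names to be ignored.
    if "gate_up_proj" in prefix:
        gate = prefix.replace("gate_up_proj", "gate_proj")
        up = prefix.replace("gate_up_proj", "up_proj")
        return gate in ignored_layers and up in ignored_layers
    # Expert modules: substring match against ignored expert layer names.
    if "experts" in prefix:
        return any(prefix in name for name in ignored_layers if "experts" in name)
    return prefix in ignored_layers

# Example: a config that lists the unfused projections individually.
ignored = ["model.layers.0.mlp.gate_proj", "model.layers.0.mlp.up_proj"]
assert is_fused_layer_skipped("model.layers.0.mlp.gate_up_proj", ignored)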
sglang/srt/layers/quantization/w8a8_int8.py
@@ -551,7 +551,7 @@ class NPU_W8A8LinearMethodImpl:
     def get_pertensor_param(params_dtype: torch.dtype) -> Dict[str, Any]:
         params_dict = {}
         params_dict["input_scale"] = torch.empty(1, dtype=params_dtype)
-        params_dict["input_offset"] = torch.empty(1, dtype=torch.int8)
+        params_dict["input_offset"] = torch.empty(1, dtype=params_dtype)
         return params_dict
 
     @staticmethod
@@ -582,11 +582,11 @@ class NPU_W8A8LinearMethodImpl:
         if original_dtype != torch.int8:
             x = torch_npu.npu_quantize(
                 x,
-                layer.aclnn_input_scale,
+                layer.aclnn_input_scale_reciprocal,
                 layer.aclnn_input_offset,
                 torch.qint8,
                 -1,
-                True,
+                False,
             )
         # Only fuse bias add into GEMM for rank 0 (this ensures that
         # bias will not get added more than once in Attention TP>1 case)
@@ -608,6 +608,10 @@ class NPU_W8A8LinearMethodImpl:
             layer.input_scale.data.repeat(expanding_factor).to(device="npu"),
             requires_grad=False,
         )
+        layer.aclnn_input_scale_reciprocal = 1 / torch.nn.Parameter(
+            layer.input_scale.data.repeat(expanding_factor).to(device="npu"),
+            requires_grad=False,
+        )
+
         layer.aclnn_input_offset = torch.nn.Parameter(
             layer.input_offset.data.repeat(expanding_factor).to(device="npu"),
             requires_grad=False,
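Note: the quantize call above now receives a precomputed reciprocal scale, and the final positional flag of torch_npu.npu_quantize is flipped from True to False; the flag appears to switch the kernel between dividing by the scale and multiplying by it (an assumption, not stated in the diff). The arithmetic being relied on is just the equivalence below, sketched in plain PyTorch:

import torch

x = torch.randn(4, 8)
scale = torch.tensor(0.02)

# Dividing by the scale ...
q_div = torch.clamp(torch.round(x / scale), -128, 127).to(torch.int8)
# ... gives the same int8 values as multiplying by the precomputed reciprocal.
q_mul = torch.clamp(torch.round(x * (1.0 / scale)), -128, 127).to(torch.int8)
assert torch.equal(q_div, q_mul)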
sglang/srt/layers/rotary_embedding.py
@@ -1876,7 +1876,7 @@ def rotate_half(x):
     return torch.cat((-x2, x1), dim=-1)
 
 
-def apply_rotary_pos_emb(
+def apply_rotary_pos_emb_native(
     q: torch.Tensor,
     k: torch.Tensor,
     cos: torch.Tensor,
@@ -1899,6 +1899,33 @@ def apply_rotary_pos_emb(
     return q_embed, k_embed
 
 
+def apply_rotary_pos_emb_npu(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    cos: torch.Tensor,
+    sin: torch.Tensor,
+    unsqueeze_dim=1,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    if q.shape[1] != 128:
+        return apply_rotary_pos_emb_native(q, k, cos, sin, unsqueeze_dim)
+    cos = cos.unsqueeze(unsqueeze_dim)
+    cos = torch.transpose(cos, 1, 2)
+    sin = sin.unsqueeze(unsqueeze_dim)
+    sin = torch.transpose(sin, 1, 2)
+    q = torch.transpose(q, 1, 2)
+    k = torch.transpose(k, 1, 2)
+    q_embed, k_embed = torch_npu.npu_apply_rotary_pos_emb(q, k, cos, sin)
+    q_embed = torch.transpose(q_embed, 1, 2)
+    k_embed = torch.transpose(k_embed, 1, 2)
+    return q_embed, k_embed
+
+
+if _is_npu:
+    apply_rotary_pos_emb = apply_rotary_pos_emb_npu
+else:
+    apply_rotary_pos_emb = apply_rotary_pos_emb_native
+
+
 def get_rope_cpu(
     head_size: int,
     rotary_dim: int,
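Note: on Ascend NPUs the module-level apply_rotary_pos_emb now dispatches to torch_npu.npu_apply_rotary_pos_emb, transposing into the layout that kernel expects and falling back to the native implementation whenever q.shape[1] != 128. For reference, the native path is the usual rotate-half formulation of RoPE; a simplified sketch (shapes and broadcasting elided, not the exact sglang code):

import torch

def rotate_half(x):
    # Split the last dimension in two and rotate: (x1, x2) -> (-x2, x1).
    x1, x2 = x.chunk(2, dim=-1)
    return torch.cat((-x2, x1), dim=-1)

def rope_reference(q, k, cos, sin, unsqueeze_dim=1):
    # cos/sin broadcast against q and k after the unsqueeze.
    cos = cos.unsqueeze(unsqueeze_dim)
    sin = sin.unsqueeze(unsqueeze_dim)
    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed, k_embed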
sglang/srt/layers/sampler.py
@@ -27,6 +27,7 @@ if is_cuda():
 logger = logging.getLogger(__name__)
 
 SYNC_TOKEN_IDS_ACROSS_TP = get_bool_env_var("SYNC_TOKEN_IDS_ACROSS_TP")
+RETURN_ORIGINAL_LOGPROB = get_bool_env_var("RETURN_ORIGINAL_LOGPROB")
 
 
 class Sampler(nn.Module):
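Note: RETURN_ORIGINAL_LOGPROB is a new boolean environment flag; when enabled, the sampler reports logprobs computed from the original logits rather than from the temperature-scaled, post-processed distribution (see the two hunks below). Because it is read once through get_bool_env_var at import time, it must be present in the environment before sglang is imported. A hedged usage sketch (assumption: get_bool_env_var treats "true"/"1" as enabled, as elsewhere in sglang):

import os

# Set before importing or launching sglang in this process.
os.environ["RETURN_ORIGINAL_LOGPROB"] = "true"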
@@ -77,7 +78,12 @@ class Sampler(nn.Module):
             batch_next_token_ids = torch.argmax(logits, -1)
             if return_logprob:
                 logprobs = torch.nn.functional.log_softmax(logits, dim=-1)
+
         else:
+            # Post process original logits. if temperatures are all 1.0, no need to rescale
+            if return_logprob and RETURN_ORIGINAL_LOGPROB:
+                logprobs = torch.softmax(logits, dim=-1)
+
             # Post process logits
             logits.div_(sampling_info.temperatures)
             logits[:] = torch.softmax(logits, dim=-1)
@@ -116,7 +122,12 @@
 
             if return_logprob:
                 # clamp to avoid -inf
-                logprobs = torch.log(probs).clamp(min=torch.finfo(probs.dtype).min)
+                if RETURN_ORIGINAL_LOGPROB:
+                    logprobs = torch.log(logprobs).clamp(
+                        min=torch.finfo(logprobs.dtype).min
+                    )
+                else:
+                    logprobs = torch.log(probs).clamp(min=torch.finfo(probs.dtype).min)
 
         # Attach logprobs to logits_output (in-place modification)
         if return_logprob:
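Note: the clamp keeps the logprob tensor finite when a probability underflows to exactly zero; torch.log(0.) is -inf, and an -inf entry would propagate through any later reduction. A small illustration (not from the diff):

import torch

probs = torch.tensor([0.0, 0.5, 0.5])
print(torch.log(probs))  # tensor([  -inf, -0.6931, -0.6931])
print(torch.log(probs).clamp(min=torch.finfo(probs.dtype).min))
# the -inf entry becomes the most negative finite float32 (about -3.4e38)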
@@ -201,7 +212,10 @@ def top_p_normalize_probs_torch(
     return torch.zeros_like(probs_sort).scatter_(-1, probs_idx, probs_sort)
 
 
-def get_top_logprobs(logprobs: torch.Tensor, top_logprobs_nums: List[int]):
+def get_top_logprobs(
+    logprobs: torch.Tensor,
+    top_logprobs_nums: List[int],
+):
     max_k = max(top_logprobs_nums)
     ret = logprobs.topk(max_k, dim=1)
     values = ret.values.tolist()
@@ -212,10 +226,17 @@ def get_top_logprobs(logprobs: torch.Tensor, top_logprobs_nums: List[int]):
     for i, k in enumerate(top_logprobs_nums):
         output_top_logprobs_val.append(values[i][:k])
         output_top_logprobs_idx.append(indices[i][:k])
-    return output_top_logprobs_val, output_top_logprobs_idx
+
+    return (
+        output_top_logprobs_val,
+        output_top_logprobs_idx,
+    )
 
 
-def get_token_ids_logprobs(logprobs: torch.Tensor, token_ids_logprobs: List[List[int]]):
+def get_token_ids_logprobs(
+    logprobs: torch.Tensor,
+    token_ids_logprobs: List[List[int]],
+):
     output_token_ids_logprobs_val = []
     output_token_ids_logprobs_idx = []
     for i, token_ids in enumerate(token_ids_logprobs):
@@ -226,7 +247,10 @@ def get_token_ids_logprobs(logprobs: torch.Tensor, token_ids_logprobs: List[List
         output_token_ids_logprobs_val.append([])
         output_token_ids_logprobs_idx.append([])
 
-    return output_token_ids_logprobs_val, output_token_ids_logprobs_idx
+    return (
+        output_token_ids_logprobs_val,
+        output_token_ids_logprobs_idx,
+    )
 
 
 def apply_custom_logit_processor(
sglang/srt/layers/utils.py
@@ -34,17 +34,3 @@ class PPMissingLayer(torch.nn.Identity):
         """
         input = args[0] if args else next(iter(kwargs.values()))
         return (input,) if self.return_tuple else input
-
-
-@lru_cache(maxsize=1)
-def is_sm100_supported(device=None) -> bool:
-    return (torch.cuda.get_device_capability(device)[0] == 10) and (
-        torch.version.cuda >= "12.8"
-    )
-
-
-@lru_cache(maxsize=1)
-def is_sm90_supported(device=None) -> bool:
-    return (torch.cuda.get_device_capability(device)[0] == 9) and (
-        torch.version.cuda >= "12.3"
-    )