sglang 0.5.1.post3__py3-none-any.whl → 0.5.2rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93)
  1. sglang/bench_one_batch.py +3 -0
  2. sglang/srt/configs/__init__.py +2 -0
  3. sglang/srt/configs/longcat_flash.py +104 -0
  4. sglang/srt/configs/model_config.py +14 -1
  5. sglang/srt/connector/__init__.py +1 -1
  6. sglang/srt/connector/base_connector.py +1 -2
  7. sglang/srt/connector/redis.py +2 -2
  8. sglang/srt/connector/serde/__init__.py +1 -1
  9. sglang/srt/connector/serde/safe_serde.py +4 -3
  10. sglang/srt/disaggregation/ascend/conn.py +75 -0
  11. sglang/srt/disaggregation/launch_lb.py +0 -13
  12. sglang/srt/disaggregation/mini_lb.py +33 -8
  13. sglang/srt/disaggregation/prefill.py +1 -1
  14. sglang/srt/distributed/parallel_state.py +27 -15
  15. sglang/srt/entrypoints/engine.py +19 -12
  16. sglang/srt/entrypoints/http_server.py +174 -34
  17. sglang/srt/entrypoints/openai/protocol.py +60 -0
  18. sglang/srt/eplb/eplb_manager.py +26 -2
  19. sglang/srt/eplb/expert_distribution.py +29 -2
  20. sglang/srt/hf_transformers_utils.py +10 -0
  21. sglang/srt/layers/activation.py +12 -0
  22. sglang/srt/layers/attention/ascend_backend.py +240 -109
  23. sglang/srt/layers/attention/hybrid_attn_backend.py +53 -21
  24. sglang/srt/layers/attention/trtllm_mla_backend.py +25 -10
  25. sglang/srt/layers/layernorm.py +28 -3
  26. sglang/srt/layers/linear.py +3 -2
  27. sglang/srt/layers/logits_processor.py +1 -1
  28. sglang/srt/layers/moe/cutlass_w4a8_moe.py +1 -9
  29. sglang/srt/layers/moe/ep_moe/kernels.py +74 -0
  30. sglang/srt/layers/moe/ep_moe/layer.py +14 -13
  31. sglang/srt/layers/moe/fused_moe_triton/__init__.py +5 -3
  32. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=352,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +146 -0
  33. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -1048
  34. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +212 -0
  35. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +796 -0
  36. sglang/srt/layers/moe/fused_moe_triton/layer.py +5 -2
  37. sglang/srt/layers/moe/fused_moe_triton/moe_align_block_size.py +87 -0
  38. sglang/srt/layers/moe/topk.py +35 -12
  39. sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py +9 -1
  40. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +0 -3
  41. sglang/srt/layers/quantization/modelopt_quant.py +7 -0
  42. sglang/srt/layers/quantization/mxfp4.py +9 -4
  43. sglang/srt/layers/quantization/utils.py +13 -0
  44. sglang/srt/layers/quantization/w4afp8.py +30 -25
  45. sglang/srt/layers/quantization/w8a8_int8.py +7 -3
  46. sglang/srt/layers/rotary_embedding.py +28 -1
  47. sglang/srt/layers/sampler.py +29 -5
  48. sglang/srt/managers/cache_controller.py +62 -96
  49. sglang/srt/managers/detokenizer_manager.py +9 -2
  50. sglang/srt/managers/io_struct.py +27 -0
  51. sglang/srt/managers/mm_utils.py +5 -1
  52. sglang/srt/managers/multi_tokenizer_mixin.py +629 -0
  53. sglang/srt/managers/scheduler.py +39 -2
  54. sglang/srt/managers/scheduler_output_processor_mixin.py +20 -18
  55. sglang/srt/managers/scheduler_update_weights_mixin.py +8 -1
  56. sglang/srt/managers/tokenizer_manager.py +86 -39
  57. sglang/srt/mem_cache/chunk_cache.py +1 -1
  58. sglang/srt/mem_cache/hicache_storage.py +20 -3
  59. sglang/srt/mem_cache/hiradix_cache.py +94 -71
  60. sglang/srt/mem_cache/lora_radix_cache.py +1 -1
  61. sglang/srt/mem_cache/memory_pool.py +4 -0
  62. sglang/srt/mem_cache/memory_pool_host.py +4 -4
  63. sglang/srt/mem_cache/radix_cache.py +5 -4
  64. sglang/srt/mem_cache/radix_cache_cpp.py +1 -1
  65. sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +61 -34
  66. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +56 -9
  67. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +2 -1
  68. sglang/srt/mem_cache/swa_radix_cache.py +1 -1
  69. sglang/srt/model_executor/model_runner.py +5 -4
  70. sglang/srt/model_loader/loader.py +15 -24
  71. sglang/srt/model_loader/utils.py +12 -0
  72. sglang/srt/models/deepseek_v2.py +31 -10
  73. sglang/srt/models/gpt_oss.py +5 -18
  74. sglang/srt/models/llama_eagle3.py +4 -0
  75. sglang/srt/models/longcat_flash.py +1026 -0
  76. sglang/srt/models/longcat_flash_nextn.py +699 -0
  77. sglang/srt/models/qwen2.py +26 -3
  78. sglang/srt/models/qwen2_5_vl.py +65 -41
  79. sglang/srt/models/qwen2_moe.py +22 -2
  80. sglang/srt/models/transformers.py +1 -1
  81. sglang/srt/multimodal/processors/base_processor.py +4 -2
  82. sglang/srt/sampling/penaltylib/orchestrator.py +14 -2
  83. sglang/srt/server_args.py +112 -55
  84. sglang/srt/speculative/eagle_worker.py +28 -8
  85. sglang/srt/utils.py +4 -0
  86. sglang/test/attention/test_trtllm_mla_backend.py +12 -3
  87. sglang/test/test_cutlass_w4a8_moe.py +24 -9
  88. sglang/version.py +1 -1
  89. {sglang-0.5.1.post3.dist-info → sglang-0.5.2rc1.dist-info}/METADATA +5 -5
  90. {sglang-0.5.1.post3.dist-info → sglang-0.5.2rc1.dist-info}/RECORD +93 -85
  91. {sglang-0.5.1.post3.dist-info → sglang-0.5.2rc1.dist-info}/WHEEL +0 -0
  92. {sglang-0.5.1.post3.dist-info → sglang-0.5.2rc1.dist-info}/licenses/LICENSE +0 -0
  93. {sglang-0.5.1.post3.dist-info → sglang-0.5.2rc1.dist-info}/top_level.txt +0 -0
sglang/srt/speculative/eagle_worker.py CHANGED
@@ -46,6 +46,7 @@ from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
 from sglang.srt.utils import (
     empty_context,
     get_available_gpu_memory,
+    get_bool_env_var,
     is_cuda,
     next_power_of_2,
 )
@@ -54,6 +55,7 @@ if is_cuda():
     from sgl_kernel import segment_packbits
 
 logger = logging.getLogger(__name__)
+RETURN_ORIGINAL_LOGPROB = get_bool_env_var("RETURN_ORIGINAL_LOGPROB")
 
 
 @contextmanager
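Note: the new RETURN_ORIGINAL_LOGPROB flag is read once at import time, so it must be set before the server starts. A minimal sketch of the assumed get_bool_env_var semantics (the real helper lives in sglang.srt.utils; treat the exact accepted values as an assumption):

    import os

    def get_bool_env_var(name: str, default: str = "false") -> bool:
        # Assumed semantics: "true"/"1" (case-insensitive) enable the flag.
        return os.getenv(name, default).lower() in ("true", "1")

    # Launching with RETURN_ORIGINAL_LOGPROB=1 flips the module-level flag.
    RETURN_ORIGINAL_LOGPROB = get_bool_env_var("RETURN_ORIGINAL_LOGPROB")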
@@ -137,8 +139,15 @@ class EAGLEWorker(TpModelWorker):
         embed, head = self.target_worker.model_runner.model.get_embed_and_head()
 
         if self.speculative_algorithm.is_eagle3():
-            # EAGLE3 models don't share lm_head
-            self.draft_model_runner.model.set_embed(embed)
+            # most cases EAGLE3 models don't share lm_head
+            # but some models (e.g. nvidia/gpt-oss-120b-Eagle3) shares
+            if (
+                hasattr(self.draft_model_runner.model, "load_lm_head_from_target")
+                and self.draft_model_runner.model.load_lm_head_from_target
+            ):
+                self.draft_model_runner.model.set_embed_and_head(embed, head)
+            else:
+                self.draft_model_runner.model.set_embed(embed)
 
         # grab hot token ids
         if self.draft_model_runner.model.hot_token_id is not None:
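The worker now keys off an opt-in attribute on the draft model. A hypothetical draft-model stub showing the protocol (class name and sizes invented for illustration):

    import torch
    from torch import nn

    class TinyEagle3Draft(nn.Module):
        # When True, EAGLEWorker calls set_embed_and_head() so the draft
        # reuses the target's lm_head (e.g. nvidia/gpt-oss-120b-Eagle3).
        load_lm_head_from_target = True

        def __init__(self, hidden: int = 64, vocab: int = 128):
            super().__init__()
            self.embed = nn.Embedding(vocab, hidden)
            self.head = nn.Linear(hidden, vocab, bias=False)

        def set_embed_and_head(self, embed: torch.Tensor, head: torch.Tensor):
            # Adopt the target model's embedding and lm_head weights.
            self.embed.weight = nn.Parameter(embed, requires_grad=False)
            self.head.weight = nn.Parameter(head, requires_grad=False)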
@@ -781,15 +790,20 @@ class EAGLEWorker(TpModelWorker):
         token_ids_logprobs = batch.token_ids_logprobs
         accepted_indices = res.accepted_indices
         assert len(accepted_indices) == len(logits_output.next_token_logits)
+
         temperatures = batch.sampling_info.temperatures
         num_draft_tokens = batch.spec_info.draft_token_num
         # acceptance indices are the indices in a "flattened" batch.
         # dividing it to num_draft_tokens will yield the actual batch index.
         temperatures = temperatures[accepted_indices // num_draft_tokens]
-
-        logprobs = torch.nn.functional.log_softmax(
-            logits_output.next_token_logits / temperatures, dim=-1
-        )
+        if RETURN_ORIGINAL_LOGPROB:
+            logprobs = torch.nn.functional.log_softmax(
+                logits_output.next_token_logits, dim=-1
+            )
+        else:
+            logprobs = torch.nn.functional.log_softmax(
+                logits_output.next_token_logits / temperatures, dim=-1
+            )
         batch_next_token_ids = res.verified_id
         num_tokens_per_req = [accept + 1 for accept in res.accept_length_per_req_cpu]
 
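Two things happen in this hunk: flattened acceptance indices are mapped back to batch rows, and the log-softmax can now be computed on unscaled logits. A worked sketch with invented sizes:

    import torch

    # Map flattened acceptance indices back to batch rows.
    num_draft_tokens = 4
    accepted_indices = torch.tensor([0, 1, 5, 9])  # indices into the flat batch
    rows = accepted_indices // num_draft_tokens    # -> tensor([0, 0, 1, 2])

    # The two logprob flavors (vocab size invented).
    logits = torch.randn(4, 32)
    temperatures = torch.full((4, 1), 0.7)
    logprobs_sampling = torch.log_softmax(logits / temperatures, dim=-1)  # default
    logprobs_original = torch.log_softmax(logits, dim=-1)  # RETURN_ORIGINAL_LOGPROB=1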
@@ -806,13 +820,19 @@
             (
                 logits_output.next_token_top_logprobs_val,
                 logits_output.next_token_top_logprobs_idx,
-            ) = get_top_logprobs(logprobs, top_logprobs_nums_repeat_interleaved)
+            ) = get_top_logprobs(
+                logprobs,
+                top_logprobs_nums_repeat_interleaved,
+            )
 
         if any(x is not None for x in token_ids_logprobs):
             (
                 logits_output.next_token_token_ids_logprobs_val,
                 logits_output.next_token_token_ids_logprobs_idx,
-            ) = get_token_ids_logprobs(logprobs, token_ids_logprobs_repeat_interleaved)
+            ) = get_token_ids_logprobs(
+                logprobs,
+                token_ids_logprobs_repeat_interleaved,
+            )
 
         logits_output.next_token_logprobs = logprobs[
             torch.arange(len(batch_next_token_ids), device=batch.sampling_info.device),
sglang/srt/utils.py CHANGED
@@ -2787,6 +2787,10 @@ def lru_cache_frozenset(maxsize=128):
     return decorator
 
 
+def get_origin_rid(rid):
+    return rid.split("_", 1)[1] if "_" in rid else rid
+
+
 def apply_module_patch(target_module, target_function, wrappers):
     original_module, original_function = parse_module_path(
         target_module, target_function, False
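get_origin_rid strips one leading "<prefix>_" segment from a request id; reading it as the worker tag added by this release's new multi-tokenizer path is an assumption (the diff only shows the helper). Its behavior, restated runnably:

    def get_origin_rid(rid: str) -> str:
        # Drop everything up to and including the first "_", if present.
        return rid.split("_", 1)[1] if "_" in rid else rid

    assert get_origin_rid("0_req-abc") == "req-abc"   # prefixed id
    assert get_origin_rid("req-abc") == "req-abc"     # unprefixed id unchanged
    assert get_origin_rid("0_req_abc") == "req_abc"   # only the first "_" splits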
sglang/test/attention/test_trtllm_mla_backend.py CHANGED
@@ -208,6 +208,15 @@ class MockModelRunner:
         self.kv_cache_dtype = config["kv_cache_dtype"]
         self.page_size = config["page_size"]
 
+        # Server args stub - needed by attention backends
+        self.server_args = type(
+            "ServerArgs",
+            (),
+            {
+                "enable_dp_attention": False,  # Default value for testing
+            },
+        )
+
         # Model-config stub with MLA attributes
         self.model_config = type(
             "ModelConfig",
@@ -833,7 +842,7 @@ class TestTRTLLMMLA(CustomTestCase):
 
         # Test workspace properties
         self.assertEqual(metadata.workspace.device.type, "cuda")
-        self.assertEqual(metadata.workspace.dtype, torch.int8)
+        self.assertEqual(metadata.workspace.dtype, torch.uint8)
         self.assertGreater(
             metadata.workspace.numel(), 0, "Workspace should have non-zero size"
         )
@@ -993,8 +1002,8 @@
         )
 
         # Verify CUDA graph buffers are allocated
-        self.assertIsNotNone(backend.cuda_graph_kv_indices)
-        self.assertIsNotNone(backend.cuda_graph_workspace)
+        self.assertIsNotNone(backend.decode_cuda_graph_kv_indices)
+        self.assertIsNotNone(backend.decode_cuda_graph_workspace)
 
         # Test capture metadata
         seq_lens = torch.full(
sglang/test/test_cutlass_w4a8_moe.py CHANGED
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 
-from typing import Optional
+from typing import Literal, Optional
 
 import pytest
 import torch
@@ -25,7 +25,7 @@ def pack_int4_values_to_int8(int4_values_interleaved: torch.Tensor) -> torch.Tensor:
     return packed_tensor.to(torch.int8)
 
 
-def pack_interleave(num_experts, ref_weight, ref_scale):
+def pack_interleave(num_experts, ref_weight, ref_scale, alignment=4):
     n, k = ref_weight.shape[1], ref_weight.shape[2]
 
     weight = pack_int4_values_to_int8(ref_weight.cpu()).cuda()
@@ -33,11 +33,16 @@ def pack_interleave(num_experts, ref_weight, ref_scale):
     w_q = w_q.contiguous()
 
     scale_interleaved = ref_scale.reshape(
-        ref_scale.shape[0], ref_scale.shape[1], (ref_scale.shape[2] // 4), 4
+        ref_scale.shape[0],
+        ref_scale.shape[1],
+        (ref_scale.shape[2] // alignment),
+        alignment,
     )  # [E, N, K/4, 4]
     scale_interleaved = scale_interleaved.permute(0, 2, 1, 3)  # [E, K/4, N, 4]
     scale_interleaved = scale_interleaved.reshape(
-        ref_scale.shape[0], ref_scale.shape[2] // 4, ref_scale.shape[1] * 4
+        ref_scale.shape[0],
+        ref_scale.shape[2] // alignment,
+        ref_scale.shape[1] * alignment,
     )  # [E, K/4, N*4]
     w_scale = scale_interleaved.contiguous()
 
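The reshape → permute → reshape sequence groups scales into blocks of `alignment` along K and interleaves them across N, matching the shape comments in the diff. A shape walk with small invented sizes:

    import torch

    E, N, K_G = 2, 4, 8        # experts, out-features, K // group_size (invented)
    alignment = 4
    scale = torch.arange(E * N * K_G, dtype=torch.float32).reshape(E, N, K_G)

    s = scale.reshape(E, N, K_G // alignment, alignment)  # [E, N, K/4, 4]
    s = s.permute(0, 2, 1, 3)                             # [E, K/4, N, 4]
    s = s.reshape(E, K_G // alignment, N * alignment).contiguous()
    print(s.shape)  # torch.Size([2, 2, 16])  -> [E, K/4, N*4]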
@@ -48,12 +53,17 @@
 @pytest.mark.parametrize("N", [2048])
 @pytest.mark.parametrize("K", [7168])
 @pytest.mark.parametrize("E", [256])
-@pytest.mark.parametrize("ep_size", [8])
+@pytest.mark.parametrize("tp_size", [8])
+@pytest.mark.parametrize("use_ep_moe", [True, False])
 @pytest.mark.parametrize("topk", [8])
 @pytest.mark.parametrize("group_size", [128])
 @pytest.mark.parametrize("dtype", [torch.bfloat16])
-def test_cutlass_w4a8_moe(M, N, K, E, ep_size, topk, group_size, dtype):
-    local_e = E // ep_size
+def test_cutlass_w4a8_moe(M, N, K, E, tp_size, use_ep_moe, topk, group_size, dtype):
+    if use_ep_moe:
+        local_e = E // tp_size
+    else:  # tp mode
+        local_e = E
+        N = N // tp_size
 
     debug = False
     if debug:
@@ -87,7 +97,10 @@ def test_cutlass_w4a8_moe(M, N, K, E, ep_size, topk, group_size, dtype):
     )
 
     w1_q, w1_scale = pack_interleave(local_e, ref_weight_1, scale_1)
-    w2_q, w2_scale = pack_interleave(local_e, ref_weight_2, scale_2)
+    if use_ep_moe:
+        w2_q, w2_scale = pack_interleave(local_e, ref_weight_2, scale_2)
+    else:
+        w2_q, w2_scale = pack_interleave(local_e, ref_weight_2, scale_2, 1)
 
     device = "cuda"
     a_strides1 = torch.full((local_e, 3), K, device=device, dtype=torch.int64)
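The use_ep_moe switch selects between two shardings of the expert weights across tp_size ranks: expert parallelism keeps full-width weights on a slice of the experts, while tensor parallelism keeps all experts at reduced width. The arithmetic under the test's parameters:

    E, N, tp_size = 256, 2048, 8

    # Expert-parallel (use_ep_moe=True): shard the expert dimension.
    local_e_ep, n_ep = E // tp_size, N      # 32 experts per rank, full N=2048

    # Tensor-parallel (use_ep_moe=False): shard the feature dimension.
    local_e_tp, n_tp = E, N // tp_size      # all 256 experts, N=256 per rank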
@@ -265,7 +278,9 @@ def ref(
 
     gate, fc1 = fc1.chunk(2, dim=-1)
     fc1 = fc1 * torch.nn.functional.silu(gate)
-    act = (fc1 / pre_quant_scale_2.float()).to(torch.float8_e4m3fn)
+    act = torch.clamp((fc1 / pre_quant_scale_2.float()), -448.0, 448.0).to(
+        torch.float8_e4m3fn
+    )
     act = act.to(dtype)
 
     w2 = ref_weight_2[e_idx]
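The reference path now clamps before casting: float8_e4m3fn has no infinities and its largest finite magnitude is 448, so bounding the scaled activations keeps the conversion well-defined. A minimal sketch of the same quantization step:

    import torch

    FP8_MAX = torch.finfo(torch.float8_e4m3fn).max  # 448.0

    def to_fp8_e4m3(x: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
        # Clamp into the representable range before the cast, as ref() now does.
        return torch.clamp(x / scale.float(), -FP8_MAX, FP8_MAX).to(torch.float8_e4m3fn)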
sglang/version.py CHANGED
@@ -1 +1 @@
-__version__ = "0.5.1.post3"
+__version__ = "0.5.2rc1"
{sglang-0.5.1.post3.dist-info → sglang-0.5.2rc1.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sglang
-Version: 0.5.1.post3
+Version: 0.5.2rc1
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                            Version 2.0, January 2004
@@ -251,18 +251,18 @@ Requires-Dist: scipy; extra == "runtime-common"
 Requires-Dist: timm==1.0.16; extra == "runtime-common"
 Requires-Dist: tiktoken; extra == "runtime-common"
 Requires-Dist: torchao==0.9.0; extra == "runtime-common"
-Requires-Dist: transformers==4.55.2; extra == "runtime-common"
+Requires-Dist: transformers==4.56.0; extra == "runtime-common"
 Requires-Dist: uvicorn; extra == "runtime-common"
 Requires-Dist: uvloop; extra == "runtime-common"
 Requires-Dist: xgrammar==0.1.23; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
-Requires-Dist: sgl-kernel==0.3.7; extra == "srt"
+Requires-Dist: sgl-kernel==0.3.8; extra == "srt"
 Requires-Dist: torch==2.8.0; extra == "srt"
 Requires-Dist: torchaudio==2.8.0; extra == "srt"
 Requires-Dist: torchvision; extra == "srt"
 Requires-Dist: cuda-python; extra == "srt"
-Requires-Dist: flashinfer_python==0.2.14.post1; extra == "srt"
+Requires-Dist: flashinfer_python==0.3.0; extra == "srt"
 Provides-Extra: blackwell
 Requires-Dist: sglang[runtime_common]; extra == "blackwell"
 Requires-Dist: sgl-kernel; extra == "blackwell"
@@ -270,7 +270,7 @@ Requires-Dist: torch==2.8.0; extra == "blackwell"
 Requires-Dist: torchaudio==2.8.0; extra == "blackwell"
 Requires-Dist: torchvision; extra == "blackwell"
 Requires-Dist: cuda-python; extra == "blackwell"
-Requires-Dist: flashinfer_python==0.2.14.post1; extra == "blackwell"
+Requires-Dist: flashinfer_python==0.3.0; extra == "blackwell"
 Provides-Extra: srt-hip
 Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
 Requires-Dist: torch; extra == "srt-hip"