sglang 0.5.2rc0__py3-none-any.whl → 0.5.2rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32):
  1. sglang/srt/configs/model_config.py +2 -1
  2. sglang/srt/distributed/parallel_state.py +3 -1
  3. sglang/srt/entrypoints/engine.py +1 -1
  4. sglang/srt/layers/moe/cutlass_w4a8_moe.py +1 -9
  5. sglang/srt/layers/moe/ep_moe/layer.py +2 -7
  6. sglang/srt/layers/moe/fused_moe_triton/__init__.py +5 -3
  7. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -1048
  8. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +212 -0
  9. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +796 -0
  10. sglang/srt/layers/moe/fused_moe_triton/layer.py +5 -2
  11. sglang/srt/layers/moe/fused_moe_triton/moe_align_block_size.py +87 -0
  12. sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py +8 -0
  13. sglang/srt/layers/quantization/w4afp8.py +30 -25
  14. sglang/srt/managers/detokenizer_manager.py +0 -34
  15. sglang/srt/managers/multi_tokenizer_mixin.py +44 -6
  16. sglang/srt/managers/scheduler.py +3 -0
  17. sglang/srt/mem_cache/hiradix_cache.py +19 -3
  18. sglang/srt/mem_cache/memory_pool_host.py +2 -0
  19. sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +61 -34
  20. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +27 -6
  21. sglang/srt/models/deepseek_v2.py +5 -0
  22. sglang/srt/models/gpt_oss.py +5 -4
  23. sglang/srt/models/longcat_flash.py +26 -15
  24. sglang/srt/models/longcat_flash_nextn.py +23 -15
  25. sglang/srt/utils.py +0 -10
  26. sglang/test/test_cutlass_w4a8_moe.py +24 -9
  27. sglang/version.py +1 -1
  28. {sglang-0.5.2rc0.dist-info → sglang-0.5.2rc1.dist-info}/METADATA +2 -2
  29. {sglang-0.5.2rc0.dist-info → sglang-0.5.2rc1.dist-info}/RECORD +32 -29
  30. {sglang-0.5.2rc0.dist-info → sglang-0.5.2rc1.dist-info}/WHEEL +0 -0
  31. {sglang-0.5.2rc0.dist-info → sglang-0.5.2rc1.dist-info}/licenses/LICENSE +0 -0
  32. {sglang-0.5.2rc0.dist-info → sglang-0.5.2rc1.dist-info}/top_level.txt +0 -0
@@ -405,9 +405,10 @@ class ModelConfig:
405
405
  # compressed-tensors uses a "compression_config" key
406
406
  quant_cfg = getattr(self.hf_config, "compression_config", None)
407
407
  if quant_cfg is None:
408
- # check if is modelopt model -- modelopt doesn't have corresponding field
408
+ # check if is modelopt or mixed-precision model -- Both of them don't have corresponding field
409
409
  # in hf `config.json` but has a standalone `hf_quant_config.json` in the root directory
410
410
  # example: https://huggingface.co/nvidia/Llama-3.1-8B-Instruct-FP8/tree/main
411
+ # example: https://huggingface.co/Barrrrry/DeepSeek-R1-W4AFP8/tree/main
411
412
  is_local = os.path.exists(self.model_path)
412
413
  modelopt_quant_config = {"quant_method": "modelopt"}
413
414
  if not is_local:
@@ -43,6 +43,7 @@ from sglang.srt.utils import (
43
43
  direct_register_custom_op,
44
44
  get_bool_env_var,
45
45
  get_int_env_var,
46
+ is_cpu,
46
47
  is_cuda_alike,
47
48
  is_hip,
48
49
  is_npu,
@@ -51,6 +52,7 @@ from sglang.srt.utils import (
51
52
  )
52
53
 
53
54
  _is_npu = is_npu()
55
+ _is_cpu = is_cpu()
54
56
 
55
57
  IS_ONE_DEVICE_PER_PROCESS = get_bool_env_var("SGLANG_ONE_DEVICE_PER_PROCESS")
56
58
 
@@ -1643,7 +1645,7 @@ def cleanup_dist_env_and_memory(shutdown_ray: bool = False):
1643
1645
 
1644
1646
  ray.shutdown()
1645
1647
  gc.collect()
1646
- if not current_platform.is_cpu():
1648
+ if not _is_cpu:
1647
1649
  if hasattr(torch, "cuda") and torch.cuda.is_available():
1648
1650
  torch.cuda.empty_cache()
1649
1651
  if hasattr(torch._C, "_host_emptyCache"):
@@ -681,7 +681,7 @@ def _set_envs_and_config(server_args: ServerArgs):
681
681
  if _is_cuda and not get_bool_env_var("SGLANG_SKIP_SGL_KERNEL_VERSION_CHECK"):
682
682
  assert_pkg_version(
683
683
  "sgl-kernel",
684
- "0.3.7.post1",
684
+ "0.3.8",
685
685
  "Please reinstall the latest version with `pip install sgl-kernel --force-reinstall`",
686
686
  )
687
687
 
@@ -91,18 +91,10 @@ def cutlass_w4a8_moe(
91
91
  assert w1_q.shape[0] == w2_q.shape[0], "Expert number mismatch"
92
92
  assert w1_q.shape[0] == w1_scale.shape[0], "w1 scales expert number mismatch"
93
93
  assert w1_q.shape[0] == w2_scale.shape[0], "w2 scales expert number mismatch"
94
- assert (
95
- w1_scale.shape[1] == w1_q.shape[2] * 2 / 512
96
- and w1_scale.shape[2] == w1_q.shape[1] * 4
97
- ), "W1 scale shape mismatch"
98
- assert (
99
- w2_scale.shape[1] == w2_q.shape[2] * 2 / 512
100
- and w2_scale.shape[2] == w2_q.shape[1] * 4
101
- ), "W2 scale shape mismatch"
102
94
 
103
95
  assert a_strides1.shape[0] == w1_q.shape[0], "A Strides 1 expert number mismatch"
104
96
  assert b_strides1.shape[0] == w1_q.shape[0], "B Strides 1 expert number mismatch"
105
- assert a_strides2.shape[0] == w2_q.shape[0], "A Strides 2 expert number mismatch"
97
+ assert a_strides2.shape[0] == w2_q.shape[0], "A Strides 2 expert number mismatch"
106
98
  assert b_strides2.shape[0] == w2_q.shape[0], "B Strides 2 expert number mismatch"
107
99
  num_experts = w1_q.size(0)
108
100
  m = a.size(0)
@@ -114,9 +114,6 @@ class EPMoE(FusedMoE):
114
114
  with_bias=with_bias,
115
115
  )
116
116
 
117
- self.start_expert_id = self.moe_ep_rank * self.num_local_experts
118
- self.end_expert_id = self.start_expert_id + self.num_local_experts - 1
119
-
120
117
  self.intermediate_size = intermediate_size
121
118
 
122
119
  if isinstance(quant_config, Fp8Config):
@@ -232,7 +229,7 @@ class EPMoE(FusedMoE):
232
229
  (
233
230
  _cast_to_e8m0_with_rounding_up(gateup_input_scale)
234
231
  if deep_gemm_wrapper.DEEPGEMM_SCALE_UE8M0
235
- else deep_gemm_wrapper.get_col_major_tma_aligned_tensor(
232
+ else deep_gemm_wrapper.get_mn_major_tma_aligned_tensor(
236
233
  gateup_input_scale
237
234
  )
238
235
  ),
@@ -289,9 +286,7 @@ class EPMoE(FusedMoE):
289
286
  (
290
287
  down_input_scale
291
288
  if deep_gemm_wrapper.DEEPGEMM_SCALE_UE8M0
292
- else deep_gemm_wrapper.get_col_major_tma_aligned_tensor(
293
- down_input_scale
294
- )
289
+ else deep_gemm_wrapper.get_mn_major_tma_aligned_tensor(down_input_scale)
295
290
  ),
296
291
  )
297
292
  down_output = torch.empty(
@@ -1,16 +1,18 @@
1
1
  from contextlib import contextmanager
2
2
  from typing import Any, Dict, Optional
3
3
 
4
- from sglang.srt.layers.moe.fused_moe_triton.fused_moe import (
5
- fused_experts,
4
+ from sglang.srt.layers.moe.fused_moe_triton.fused_moe import fused_experts
5
+ from sglang.srt.layers.moe.fused_moe_triton.fused_moe_triton_config import (
6
6
  get_config_file_name,
7
- moe_align_block_size,
8
7
  try_get_optimal_moe_config,
9
8
  )
10
9
  from sglang.srt.layers.moe.fused_moe_triton.layer import (
11
10
  FusedMoE,
12
11
  FusedMoeWeightScaleSupported,
13
12
  )
13
+ from sglang.srt.layers.moe.fused_moe_triton.moe_align_block_size import (
14
+ moe_align_block_size,
15
+ )
14
16
 
15
17
  _config: Optional[Dict[str, Any]] = None
16
18