sglang 0.5.2rc0__py3-none-any.whl → 0.5.2rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/srt/configs/model_config.py +2 -1
- sglang/srt/distributed/parallel_state.py +3 -1
- sglang/srt/entrypoints/engine.py +1 -1
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +1 -9
- sglang/srt/layers/moe/ep_moe/layer.py +2 -7
- sglang/srt/layers/moe/fused_moe_triton/__init__.py +5 -3
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -1048
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +212 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +796 -0
- sglang/srt/layers/moe/fused_moe_triton/layer.py +5 -2
- sglang/srt/layers/moe/fused_moe_triton/moe_align_block_size.py +87 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py +8 -0
- sglang/srt/layers/quantization/w4afp8.py +30 -25
- sglang/srt/managers/detokenizer_manager.py +0 -34
- sglang/srt/managers/multi_tokenizer_mixin.py +44 -6
- sglang/srt/managers/scheduler.py +3 -0
- sglang/srt/mem_cache/hiradix_cache.py +19 -3
- sglang/srt/mem_cache/memory_pool_host.py +2 -0
- sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +61 -34
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +27 -6
- sglang/srt/models/deepseek_v2.py +5 -0
- sglang/srt/models/gpt_oss.py +5 -4
- sglang/srt/models/longcat_flash.py +26 -15
- sglang/srt/models/longcat_flash_nextn.py +23 -15
- sglang/srt/utils.py +0 -10
- sglang/test/test_cutlass_w4a8_moe.py +24 -9
- sglang/version.py +1 -1
- {sglang-0.5.2rc0.dist-info → sglang-0.5.2rc1.dist-info}/METADATA +2 -2
- {sglang-0.5.2rc0.dist-info → sglang-0.5.2rc1.dist-info}/RECORD +32 -29
- {sglang-0.5.2rc0.dist-info → sglang-0.5.2rc1.dist-info}/WHEEL +0 -0
- {sglang-0.5.2rc0.dist-info → sglang-0.5.2rc1.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.2rc0.dist-info → sglang-0.5.2rc1.dist-info}/top_level.txt +0 -0
sglang/srt/configs/model_config.py
CHANGED

@@ -405,9 +405,10 @@ class ModelConfig:
         # compressed-tensors uses a "compression_config" key
         quant_cfg = getattr(self.hf_config, "compression_config", None)
         if quant_cfg is None:
-            # check if is modelopt model --
+            # check if is modelopt or mixed-precision model -- Both of them don't have corresponding field
             # in hf `config.json` but has a standalone `hf_quant_config.json` in the root directory
             # example: https://huggingface.co/nvidia/Llama-3.1-8B-Instruct-FP8/tree/main
+            # example: https://huggingface.co/Barrrrry/DeepSeek-R1-W4AFP8/tree/main
             is_local = os.path.exists(self.model_path)
             modelopt_quant_config = {"quant_method": "modelopt"}
             if not is_local:
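The new comments describe checkpoints (ModelOpt exports and mixed-precision repos such as the DeepSeek-R1-W4AFP8 example) whose quantization settings live in a standalone `hf_quant_config.json` rather than in `config.json`. A minimal sketch of that fallback for a local checkout; the helper name and key handling here are illustrative, not SGLang's actual logic:

```python
import json
import os
from typing import Any, Dict, Optional


def load_standalone_quant_config(model_path: str) -> Optional[Dict[str, Any]]:
    """Illustrative fallback: read hf_quant_config.json from the model root
    when config.json carries no quantization section (local checkout only)."""
    candidate = os.path.join(model_path, "hf_quant_config.json")
    if not os.path.exists(candidate):
        return None
    with open(candidate) as f:
        return json.load(f)
```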
sglang/srt/distributed/parallel_state.py
CHANGED

@@ -43,6 +43,7 @@ from sglang.srt.utils import (
     direct_register_custom_op,
     get_bool_env_var,
     get_int_env_var,
+    is_cpu,
     is_cuda_alike,
     is_hip,
     is_npu,
@@ -51,6 +52,7 @@ from sglang.srt.utils import (
 )
 
 _is_npu = is_npu()
+_is_cpu = is_cpu()
 
 IS_ONE_DEVICE_PER_PROCESS = get_bool_env_var("SGLANG_ONE_DEVICE_PER_PROCESS")
 
@@ -1643,7 +1645,7 @@ def cleanup_dist_env_and_memory(shutdown_ray: bool = False):
 
         ray.shutdown()
     gc.collect()
-    if not
+    if not _is_cpu:
         if hasattr(torch, "cuda") and torch.cuda.is_available():
             torch.cuda.empty_cache()
             if hasattr(torch._C, "_host_emptyCache"):
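Taken together, the three hunks above import `is_cpu`, cache it in a module-level `_is_cpu` flag, and use it to skip the CUDA allocator cleanup on CPU-only builds. A minimal standalone sketch of that guard (an illustrative helper, not the actual `cleanup_dist_env_and_memory`):

```python
import gc

import torch

from sglang.srt.utils import is_cpu

_is_cpu = is_cpu()


def release_cached_memory() -> None:
    # Illustrative helper mirroring the guard in the hunk above.
    gc.collect()
    if not _is_cpu:
        # Only touch the CUDA allocator when a GPU backend is present.
        if hasattr(torch, "cuda") and torch.cuda.is_available():
            torch.cuda.empty_cache()
```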
sglang/srt/entrypoints/engine.py
CHANGED
@@ -681,7 +681,7 @@ def _set_envs_and_config(server_args: ServerArgs):
     if _is_cuda and not get_bool_env_var("SGLANG_SKIP_SGL_KERNEL_VERSION_CHECK"):
         assert_pkg_version(
             "sgl-kernel",
-            "0.3.
+            "0.3.8",
             "Please reinstall the latest version with `pip install sgl-kernel --force-reinstall`",
         )
 
sglang/srt/layers/moe/cutlass_w4a8_moe.py
CHANGED

@@ -91,18 +91,10 @@ def cutlass_w4a8_moe(
     assert w1_q.shape[0] == w2_q.shape[0], "Expert number mismatch"
     assert w1_q.shape[0] == w1_scale.shape[0], "w1 scales expert number mismatch"
     assert w1_q.shape[0] == w2_scale.shape[0], "w2 scales expert number mismatch"
-    assert (
-        w1_scale.shape[1] == w1_q.shape[2] * 2 / 512
-        and w1_scale.shape[2] == w1_q.shape[1] * 4
-    ), "W1 scale shape mismatch"
-    assert (
-        w2_scale.shape[1] == w2_q.shape[2] * 2 / 512
-        and w2_scale.shape[2] == w2_q.shape[1] * 4
-    ), "W2 scale shape mismatch"
 
     assert a_strides1.shape[0] == w1_q.shape[0], "A Strides 1 expert number mismatch"
     assert b_strides1.shape[0] == w1_q.shape[0], "B Strides 1 expert number mismatch"
-    assert a_strides2.shape[0] == w2_q.shape[0], "A Strides 2 expert number
+    assert a_strides2.shape[0] == w2_q.shape[0], "A Strides 2 expert number mismatch"
     assert b_strides2.shape[0] == w2_q.shape[0], "B Strides 2 expert number mismatch"
     num_experts = w1_q.size(0)
     m = a.size(0)
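The deleted assertions tied each scale tensor's shape to the packed int4 weight shape: `scale.shape[1] == w_q.shape[2] * 2 / 512` and `scale.shape[2] == w_q.shape[1] * 4`. A small worked example of that relation, with made-up sizes purely to make the arithmetic concrete:

```python
# Hypothetical sizes, chosen only to illustrate the removed shape checks.
E, N, K_packed = 8, 512, 3584            # w1_q: (num_experts, N, K // 2), int4-packed

scale_dim1 = K_packed * 2 // 512         # 3584 * 2 // 512 = 14 scale groups along K
scale_dim2 = N * 4                       # 512 * 4 = 2048

expected_w1_scale_shape = (E, scale_dim1, scale_dim2)
print(expected_w1_scale_shape)           # (8, 14, 2048)
```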
sglang/srt/layers/moe/ep_moe/layer.py
CHANGED

@@ -114,9 +114,6 @@ class EPMoE(FusedMoE):
             with_bias=with_bias,
         )
 
-        self.start_expert_id = self.moe_ep_rank * self.num_local_experts
-        self.end_expert_id = self.start_expert_id + self.num_local_experts - 1
-
         self.intermediate_size = intermediate_size
 
         if isinstance(quant_config, Fp8Config):
@@ -232,7 +229,7 @@ class EPMoE(FusedMoE):
             (
                 _cast_to_e8m0_with_rounding_up(gateup_input_scale)
                 if deep_gemm_wrapper.DEEPGEMM_SCALE_UE8M0
-                else deep_gemm_wrapper.
+                else deep_gemm_wrapper.get_mn_major_tma_aligned_tensor(
                     gateup_input_scale
                 )
             ),
@@ -289,9 +286,7 @@ class EPMoE(FusedMoE):
             (
                 down_input_scale
                 if deep_gemm_wrapper.DEEPGEMM_SCALE_UE8M0
-                else deep_gemm_wrapper.
-                    down_input_scale
-                )
+                else deep_gemm_wrapper.get_mn_major_tma_aligned_tensor(down_input_scale)
             ),
         )
         down_output = torch.empty(
sglang/srt/layers/moe/fused_moe_triton/__init__.py
CHANGED

@@ -1,16 +1,18 @@
 from contextlib import contextmanager
 from typing import Any, Dict, Optional
 
-from sglang.srt.layers.moe.fused_moe_triton.fused_moe import
-
+from sglang.srt.layers.moe.fused_moe_triton.fused_moe import fused_experts
+from sglang.srt.layers.moe.fused_moe_triton.fused_moe_triton_config import (
     get_config_file_name,
-    moe_align_block_size,
     try_get_optimal_moe_config,
 )
 from sglang.srt.layers.moe.fused_moe_triton.layer import (
     FusedMoE,
     FusedMoeWeightScaleSupported,
 )
+from sglang.srt.layers.moe.fused_moe_triton.moe_align_block_size import (
+    moe_align_block_size,
+)
 
 _config: Optional[Dict[str, Any]] = None
 
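Because `__init__.py` re-imports the relocated helpers, package-level imports keep resolving even though `fused_moe.py` was split into the new `fused_moe_triton_config`, `fused_moe_triton_kernels`, and `moe_align_block_size` modules. A quick sanity check using only the names visible in the diff:

```python
# Package-level imports still resolve after the split, because __init__.py
# re-imports these names from their new submodules.
from sglang.srt.layers.moe.fused_moe_triton import (
    FusedMoE,
    fused_experts,
    get_config_file_name,
    moe_align_block_size,
    try_get_optimal_moe_config,
)
```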