sglang 0.5.0rc2__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Files changed (180)
  1. sglang/bench_one_batch.py +0 -6
  2. sglang/bench_one_batch_server.py +7 -2
  3. sglang/bench_serving.py +3 -3
  4. sglang/eval/llama3_eval.py +0 -1
  5. sglang/srt/configs/model_config.py +24 -9
  6. sglang/srt/configs/update_config.py +40 -5
  7. sglang/srt/constrained/xgrammar_backend.py +23 -11
  8. sglang/srt/conversation.py +2 -15
  9. sglang/srt/disaggregation/ascend/conn.py +1 -3
  10. sglang/srt/disaggregation/base/conn.py +1 -0
  11. sglang/srt/disaggregation/decode.py +1 -1
  12. sglang/srt/disaggregation/launch_lb.py +7 -1
  13. sglang/srt/disaggregation/mini_lb.py +11 -5
  14. sglang/srt/disaggregation/mooncake/conn.py +141 -47
  15. sglang/srt/disaggregation/prefill.py +261 -5
  16. sglang/srt/disaggregation/utils.py +2 -1
  17. sglang/srt/distributed/device_communicators/custom_all_reduce.py +1 -1
  18. sglang/srt/distributed/device_communicators/pynccl.py +68 -18
  19. sglang/srt/distributed/device_communicators/pynccl_wrapper.py +52 -0
  20. sglang/srt/distributed/naive_distributed.py +112 -0
  21. sglang/srt/distributed/parallel_state.py +90 -4
  22. sglang/srt/entrypoints/context.py +20 -1
  23. sglang/srt/entrypoints/engine.py +27 -2
  24. sglang/srt/entrypoints/http_server.py +12 -0
  25. sglang/srt/entrypoints/openai/protocol.py +2 -2
  26. sglang/srt/entrypoints/openai/serving_chat.py +22 -6
  27. sglang/srt/entrypoints/openai/serving_completions.py +9 -1
  28. sglang/srt/entrypoints/openai/serving_responses.py +2 -2
  29. sglang/srt/eplb/expert_distribution.py +2 -3
  30. sglang/srt/function_call/deepseekv3_detector.py +1 -1
  31. sglang/srt/hf_transformers_utils.py +24 -0
  32. sglang/srt/host_shared_memory.py +83 -0
  33. sglang/srt/layers/attention/ascend_backend.py +132 -22
  34. sglang/srt/layers/attention/flashattention_backend.py +24 -17
  35. sglang/srt/layers/attention/flashinfer_backend.py +11 -3
  36. sglang/srt/layers/attention/flashinfer_mla_backend.py +226 -76
  37. sglang/srt/layers/attention/triton_backend.py +85 -46
  38. sglang/srt/layers/attention/triton_ops/decode_attention.py +33 -2
  39. sglang/srt/layers/attention/triton_ops/extend_attention.py +32 -2
  40. sglang/srt/layers/attention/trtllm_mha_backend.py +390 -30
  41. sglang/srt/layers/attention/trtllm_mla_backend.py +39 -16
  42. sglang/srt/layers/attention/utils.py +94 -15
  43. sglang/srt/layers/attention/vision.py +40 -13
  44. sglang/srt/layers/attention/vision_utils.py +65 -0
  45. sglang/srt/layers/communicator.py +51 -3
  46. sglang/srt/layers/dp_attention.py +23 -4
  47. sglang/srt/layers/elementwise.py +94 -0
  48. sglang/srt/layers/flashinfer_comm_fusion.py +29 -1
  49. sglang/srt/layers/layernorm.py +8 -1
  50. sglang/srt/layers/linear.py +24 -0
  51. sglang/srt/layers/logits_processor.py +5 -1
  52. sglang/srt/layers/moe/__init__.py +31 -0
  53. sglang/srt/layers/moe/ep_moe/layer.py +37 -33
  54. sglang/srt/layers/moe/fused_moe_native.py +14 -25
  55. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  56. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=352,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Max-Q_Workstation_Edition,dtype=fp8_w8a8.json +146 -0
  57. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=704,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +146 -0
  58. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=161,N=384,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Max-Q_Workstation_Edition,dtype=fp8_w8a8.json +146 -0
  59. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +69 -76
  60. sglang/srt/layers/moe/fused_moe_triton/layer.py +66 -123
  61. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +20 -18
  62. sglang/srt/layers/moe/moe_runner/__init__.py +3 -0
  63. sglang/srt/layers/moe/moe_runner/base.py +13 -0
  64. sglang/srt/layers/moe/rocm_moe_utils.py +141 -0
  65. sglang/srt/layers/moe/router.py +15 -9
  66. sglang/srt/layers/moe/token_dispatcher/__init__.py +6 -0
  67. sglang/srt/layers/moe/token_dispatcher/base_dispatcher.py +55 -14
  68. sglang/srt/layers/moe/token_dispatcher/deepep.py +11 -21
  69. sglang/srt/layers/moe/token_dispatcher/standard.py +1 -1
  70. sglang/srt/layers/moe/topk.py +167 -83
  71. sglang/srt/layers/moe/utils.py +159 -18
  72. sglang/srt/layers/quantization/__init__.py +13 -14
  73. sglang/srt/layers/quantization/awq.py +7 -7
  74. sglang/srt/layers/quantization/base_config.py +2 -6
  75. sglang/srt/layers/quantization/blockwise_int8.py +4 -12
  76. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +72 -28
  77. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -1
  78. sglang/srt/layers/quantization/fp8.py +127 -119
  79. sglang/srt/layers/quantization/fp8_kernel.py +195 -24
  80. sglang/srt/layers/quantization/fp8_utils.py +34 -9
  81. sglang/srt/layers/quantization/fpgemm_fp8.py +203 -0
  82. sglang/srt/layers/quantization/gptq.py +5 -4
  83. sglang/srt/layers/quantization/marlin_utils.py +11 -3
  84. sglang/srt/layers/quantization/marlin_utils_fp8.py +352 -0
  85. sglang/srt/layers/quantization/modelopt_quant.py +165 -68
  86. sglang/srt/layers/quantization/moe_wna16.py +10 -15
  87. sglang/srt/layers/quantization/mxfp4.py +206 -37
  88. sglang/srt/layers/quantization/quark/quark.py +390 -0
  89. sglang/srt/layers/quantization/quark/quark_moe.py +197 -0
  90. sglang/srt/layers/quantization/unquant.py +34 -70
  91. sglang/srt/layers/quantization/utils.py +25 -0
  92. sglang/srt/layers/quantization/w4afp8.py +7 -8
  93. sglang/srt/layers/quantization/w8a8_fp8.py +5 -13
  94. sglang/srt/layers/quantization/w8a8_int8.py +5 -13
  95. sglang/srt/layers/radix_attention.py +6 -0
  96. sglang/srt/layers/rotary_embedding.py +1 -0
  97. sglang/srt/lora/lora_manager.py +21 -22
  98. sglang/srt/lora/lora_registry.py +3 -3
  99. sglang/srt/lora/mem_pool.py +26 -24
  100. sglang/srt/lora/utils.py +10 -12
  101. sglang/srt/managers/cache_controller.py +76 -18
  102. sglang/srt/managers/detokenizer_manager.py +10 -2
  103. sglang/srt/managers/io_struct.py +9 -0
  104. sglang/srt/managers/mm_utils.py +1 -1
  105. sglang/srt/managers/schedule_batch.py +4 -9
  106. sglang/srt/managers/scheduler.py +25 -16
  107. sglang/srt/managers/session_controller.py +1 -1
  108. sglang/srt/managers/template_manager.py +7 -5
  109. sglang/srt/managers/tokenizer_manager.py +60 -21
  110. sglang/srt/managers/tp_worker.py +1 -0
  111. sglang/srt/managers/utils.py +59 -1
  112. sglang/srt/mem_cache/allocator.py +7 -5
  113. sglang/srt/mem_cache/allocator_ascend.py +0 -11
  114. sglang/srt/mem_cache/hicache_storage.py +14 -4
  115. sglang/srt/mem_cache/memory_pool.py +3 -3
  116. sglang/srt/mem_cache/memory_pool_host.py +35 -2
  117. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +56 -12
  118. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +8 -4
  119. sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +153 -59
  120. sglang/srt/mem_cache/storage/nixl/nixl_utils.py +19 -53
  121. sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +46 -7
  122. sglang/srt/model_executor/cuda_graph_runner.py +25 -12
  123. sglang/srt/model_executor/forward_batch_info.py +4 -1
  124. sglang/srt/model_executor/model_runner.py +43 -32
  125. sglang/srt/model_executor/npu_graph_runner.py +94 -0
  126. sglang/srt/model_loader/loader.py +24 -6
  127. sglang/srt/models/dbrx.py +12 -6
  128. sglang/srt/models/deepseek.py +2 -1
  129. sglang/srt/models/deepseek_nextn.py +3 -1
  130. sglang/srt/models/deepseek_v2.py +224 -223
  131. sglang/srt/models/ernie4.py +2 -2
  132. sglang/srt/models/glm4_moe.py +25 -63
  133. sglang/srt/models/glm4v.py +52 -1
  134. sglang/srt/models/glm4v_moe.py +8 -11
  135. sglang/srt/models/gpt_oss.py +34 -74
  136. sglang/srt/models/granitemoe.py +0 -1
  137. sglang/srt/models/grok.py +376 -48
  138. sglang/srt/models/interns1.py +12 -47
  139. sglang/srt/models/internvl.py +6 -51
  140. sglang/srt/models/llama4.py +0 -2
  141. sglang/srt/models/minicpm3.py +0 -1
  142. sglang/srt/models/mixtral.py +0 -2
  143. sglang/srt/models/nemotron_nas.py +435 -0
  144. sglang/srt/models/olmoe.py +0 -1
  145. sglang/srt/models/phi4mm.py +3 -21
  146. sglang/srt/models/qwen2_5_vl.py +2 -0
  147. sglang/srt/models/qwen2_moe.py +3 -18
  148. sglang/srt/models/qwen3.py +2 -2
  149. sglang/srt/models/qwen3_classification.py +7 -1
  150. sglang/srt/models/qwen3_moe.py +9 -38
  151. sglang/srt/models/step3_vl.py +2 -1
  152. sglang/srt/models/xverse_moe.py +11 -5
  153. sglang/srt/multimodal/processors/base_processor.py +3 -3
  154. sglang/srt/multimodal/processors/internvl.py +7 -2
  155. sglang/srt/multimodal/processors/llava.py +11 -7
  156. sglang/srt/offloader.py +433 -0
  157. sglang/srt/operations.py +6 -1
  158. sglang/srt/reasoning_parser.py +4 -3
  159. sglang/srt/server_args.py +237 -104
  160. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +1 -0
  161. sglang/srt/speculative/eagle_utils.py +36 -13
  162. sglang/srt/speculative/eagle_worker.py +56 -3
  163. sglang/srt/tokenizer/tiktoken_tokenizer.py +161 -0
  164. sglang/srt/two_batch_overlap.py +16 -11
  165. sglang/srt/utils.py +68 -70
  166. sglang/test/runners.py +8 -5
  167. sglang/test/test_block_fp8.py +5 -6
  168. sglang/test/test_block_fp8_ep.py +13 -19
  169. sglang/test/test_cutlass_moe.py +4 -6
  170. sglang/test/test_cutlass_w4a8_moe.py +4 -3
  171. sglang/test/test_fp4_moe.py +4 -3
  172. sglang/test/test_utils.py +7 -0
  173. sglang/utils.py +0 -1
  174. sglang/version.py +1 -1
  175. {sglang-0.5.0rc2.dist-info → sglang-0.5.1.dist-info}/METADATA +7 -7
  176. {sglang-0.5.0rc2.dist-info → sglang-0.5.1.dist-info}/RECORD +179 -161
  177. sglang/srt/layers/quantization/fp4.py +0 -557
  178. {sglang-0.5.0rc2.dist-info → sglang-0.5.1.dist-info}/WHEEL +0 -0
  179. {sglang-0.5.0rc2.dist-info → sglang-0.5.1.dist-info}/licenses/LICENSE +0 -0
  180. {sglang-0.5.0rc2.dist-info → sglang-0.5.1.dist-info}/top_level.txt +0 -0
sglang/srt/layers/quantization/fp4.py (deleted)
@@ -1,557 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-
-from __future__ import annotations
-
-import fnmatch
-import logging
-from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union, cast
-
-import aiter
-import torch
-import torch.nn.functional as F
-from aiter import ActivationType, QuantType, dtypes
-from aiter.fused_moe import fused_moe
-from aiter.fused_moe_bf16_asm import asm_moe, ck_moe_2stages
-from aiter.ops.gemm_op_a4w4 import gemm_a4w4
-from aiter.ops.quant import get_torch_quant
-from aiter.ops.shuffle import shuffle_weight
-from aiter.ops.triton.gemm_afp4wfp4 import gemm_afp4wfp4
-from aiter.ops.triton.quant import dynamic_mxfp4_quant
-from aiter.utility.fp4_utils import e8m0_shuffle
-from torch.nn import Module
-
-from sglang.srt.layers.linear import LinearBase, UnquantizedLinearMethod
-from sglang.srt.layers.parameter import ModelWeightParameter
-from sglang.srt.layers.quantization.base_config import (
-    FusedMoEMethodBase,
-    LinearMethodBase,
-    QuantizationConfig,
-    QuantizeMethodBase,
-)
-from sglang.srt.layers.quantization.kv_cache import BaseKVCacheMethod
-from sglang.srt.layers.quantization.quark.schemes import QuarkScheme, QuarkW4A4MXFP4
-from sglang.srt.layers.quantization.quark.utils import deep_compare, should_ignore_layer
-from sglang.srt.layers.radix_attention import RadixAttention
-from sglang.srt.utils import (
-    get_bool_env_var,
-    get_device_capability,
-    log_info_on_rank0,
-    mxfp_supported,
-    set_weight_attrs,
-)
-
-if TYPE_CHECKING:
-    from sglang.srt.layers.moe.topk import TopKOutput
-
-logger = logging.getLogger(__name__)
-
-use_dynamic_mxfp4_linear = get_bool_env_var("SGLANG_USE_DYNAMIC_MXFP4_linear")
-
-OCP_MX_BLOCK_SIZE = 32
-
-
-class Mxfp4Config(QuantizationConfig):
-
-    def __init__(self, ignored_layers: Optional[list[str]] = None):
-        super().__init__()
-        self.ignored_layers = ignored_layers
-
-    @classmethod
-    def from_config(cls, config):
-        return cls()
-
-    @classmethod
-    def get_min_capability(cls) -> int:
-        return 80
-
-    @classmethod
-    def get_name(cls) -> QuantizationMethods:
-        return "mxfp4"
-
-    @classmethod
-    def get_supported_act_dtypes(cls) -> list[torch.dtype]:
-        return [torch.bfloat16]
-
-    @classmethod
-    def get_config_filenames(cls) -> list[str]:
-        return []
-
-    def get_quant_method(
-        self, layer: torch.nn.Module, prefix: str
-    ) -> Optional["QuantizeMethodBase"]:
-        from vllm.attention.layer import Attention # Avoid circular import
-
-        if isinstance(layer, LinearBase):
-            if self.ignored_layers and is_layer_skipped(
-                prefix=prefix,
-                ignored_layers=self.ignored_layers,
-                fused_mapping=self.packed_modules_mapping,
-            ):
-                return UnquantizedLinearMethod()
-            raise NotImplementedError("Mxfp4 linear layer is not implemented")
-        elif isinstance(layer, FusedMoE):
-            return Mxfp4MoEMethod(layer.moe_config)
-        elif isinstance(layer, Attention):
-            raise NotImplementedError("Mxfp4 attention layer is not implemented")
-        return None
-
-
-class MxFp4LinearMethod(LinearMethodBase):
-
-    def __init__(self, quantization_config: MxFp4Config):
-        self.quantization_config = quantization_config
-
-    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
-        return
-        # if self.quantization_config.is_checkpoint_fp4_serialized:
-        #     layer.scheme.process_weights_after_loading(layer)
-        # else:
-        #     #w, w_scales = dynamic_mxfp4_quant(layer.weight.data)
-        #     ##log_info_on_rank0(logger, f"w.shape: {w.shape}")
-
-        #     #wshuffle = w#shuffle_weight(w, layout=(16, 16))
-        #     #w_scales_shuffle = w_scales#e8m0_shuffle(w_scales).view(dtypes.fp8_e8m0)
-
-        #     quant_func = aiter.get_triton_quant(aiter.QuantType.per_1x32)
-
-        #     w, w_scales_shuffle = quant_func(layer.weight.data, shuffle=True)
-
-        #     wshuffle = shuffle_weight(w, layout=(16, 16))
-
-        #     layer.weight = torch.nn.Parameter(wshuffle,
-        #                                       requires_grad=False)
-        #     layer.weight_scale = torch.nn.Parameter(w_scales_shuffle,
-        #                                             requires_grad=False)
-
-    def create_weights(
-        self,
-        layer: torch.nn.Module,
-        input_size_per_partition: int,
-        output_partition_sizes: list[int],
-        input_size: int,
-        output_size: int,
-        params_dtype: torch.dtype,
-        **extra_weight_attrs,
-    ):
-        """
-        Use the CompressedTensorsScheme associated with each layer to create
-        the necessary parameters for the layer. See LinearMethodBase for param
-        details
-        """
-        weight_loader = extra_weight_attrs.get("weight_loader")
-
-        if self.quantization_config.is_checkpoint_fp4_serialized:
-            layer.scheme.create_weights(
-                layer=layer,
-                input_size=input_size,
-                input_size_per_partition=input_size_per_partition,
-                output_partition_sizes=output_partition_sizes,
-                output_size=output_size,
-                params_dtype=params_dtype,
-                weight_loader=weight_loader,
-            )
-        else:
-            output_size_per_partition = sum(output_partition_sizes)
-            layer.logical_widths = output_partition_sizes
-            layer.input_size_per_partition = input_size_per_partition
-            layer.output_size_per_partition = output_size_per_partition
-            layer.orig_dtype = params_dtype
-
-            weight_dtype = params_dtype
-
-            weight = ModelWeightParameter(
-                data=torch.empty(
-                    output_size_per_partition,
-                    input_size_per_partition,
-                    dtype=weight_dtype,
-                ),
-                input_dim=1,
-                output_dim=0,
-                weight_loader=weight_loader,
-            )
-
-            layer.register_parameter("weight", weight)
-            layer.register_parameter("weight_scale", None)
-
-    def apply(
-        self,
-        layer: torch.nn.Module,
-        x: torch.Tensor,
-        bias: Optional[torch.Tensor] = None,
-    ):
-        """
-        Use the output of create_weights and the CompressedTensorsScheme
-        associated with the layer to apply the forward pass with the
-        layer input. See LinearMethodBase for param details
-
-        """
-        if self.quantization_config.is_checkpoint_fp4_serialized:
-            scheme = layer.scheme
-            if scheme is None:
-                raise ValueError("A scheme must be defined for each layer")
-            return scheme.apply_weights(layer, x, bias=bias)
-        else:
-            out_dtype = x.dtype
-
-            # ck or asm implement
-            # M = x.shape[0]
-            # N = layer.weight.shape[0]
-
-            # quant_func = aiter.get_triton_quant(aiter.QuantType.per_1x32)
-
-            # x, x_scales_shuffle = quant_func(x, shuffle=True)
-
-            # y = torch.zeros((M + 255) // 256 * 256, N, device=x.device, dtype=out_dtype)
-
-            # out = gemm_a4w4(x, layer.weight.data, x_scales_shuffle, layer.weight_scale.data, y, bias=bias)
-
-            # return out[:M]
-
-            # triton implement
-            x_q, x_s = dynamic_mxfp4_quant(x)
-            y = torch.empty(
-                x_q.shape[0], layer.weight.shape[0], device=x_q.device, dtype=out_dtype
-            )
-
-            out = gemm_afp4wfp4(
-                x_q, layer.weight, x_s, layer.weight_scale, out_dtype, y
-            )
-
-            return out
-
-
-class MxFp4MoEMethod:
-    def __new__(cls, *args, **kwargs):
-        if not hasattr(cls, "_initialized"):
-            original_init = cls.__init__
-            new_cls = type(
-                cls.__name__,
-                (FusedMoEMethodBase,),
-                {
-                    "__init__": original_init,
-                    **{k: v for k, v in cls.__dict__.items() if k != "__dict__"},
-                },
-            )
-            obj = super(new_cls, new_cls).__new__(new_cls)
-            obj.__init__(*args, **kwargs)
-            return obj
-        return super().__new__(cls)
-
-    @staticmethod
-    def get_moe_method(
-        quant_config: "MxFp4Config", # type: ignore # noqa E501 # noqa F821
-        module: torch.nn.Module,
-        layer_name: str,
-    ) -> "MxFp4MoEMethod":
-
-        if quant_config.is_checkpoint_fp4_serialized:
-            layer_quant_config = quant_config._find_matched_config(layer_name, module)
-
-            if layer_quant_config.get("output_tensors") or layer_quant_config.get(
-                "bias"
-            ):
-                raise NotImplementedError(
-                    "Currently, Quark models with "
-                    "output_tensors and bias "
-                    "quantized are not supported"
-                )
-            weight_config = layer_quant_config.get("weight")
-            input_config = layer_quant_config.get("input_tensors")
-
-            if quant_config._is_mx_fp4(weight_config, input_config):
-                return W4A4MXFp4MoEStaticMethod(weight_config, input_config)
-            else:
-                raise RuntimeError("Unsupported FusedMoe scheme")
-        else:
-            return W4A4MXFp4MoEDynamicMethod(quant_config)
-
-
-class W4A4MXFp4MoEDynamicMethod(MxFp4MoEMethod):
-    def __init__(self, quant_config):
-        self.quant_config = quant_config
-
-    def create_weights(
-        self,
-        layer: torch.nn.Module,
-        num_experts: int,
-        hidden_size: int,
-        intermediate_size_per_partition: int,
-        params_dtype: torch.dtype,
-        **extra_weight_attrs,
-    ):
-
-        from sglang.srt.layers.moe.fused_moe_triton import FusedMoeWeightScaleSupported
-
-        w13_weight = torch.nn.Parameter(
-            torch.empty(
-                num_experts,
-                2 * intermediate_size_per_partition,
-                hidden_size,
-                dtype=params_dtype,
-            ),
-            requires_grad=False,
-        )
-        w2_weight = torch.nn.Parameter(
-            torch.empty(
-                num_experts,
-                hidden_size,
-                intermediate_size_per_partition,
-                dtype=params_dtype,
-            ),
-            requires_grad=False,
-        )
-
-        layer.register_parameter("w13_weight", w13_weight)
-        set_weight_attrs(w13_weight, extra_weight_attrs)
-
-        layer.register_parameter("w2_weight", w2_weight)
-        set_weight_attrs(w2_weight, extra_weight_attrs)
-
-        # Allocate 2 scales for w1 and w3 respectively.
-        # They will be combined to a single scale after weight loading.
-        w13_weight_scale = torch.nn.Parameter(
-            torch.ones(num_experts, 2, dtype=torch.float32), requires_grad=False
-        )
-        w2_weight_scale = torch.nn.Parameter(
-            torch.ones(num_experts, dtype=torch.float32), requires_grad=False
-        )
-        layer.register_parameter("w13_weight_scale", w13_weight_scale)
-        layer.register_parameter("w2_weight_scale", w2_weight_scale)
-
-        # Add the quantization method used (per tensor/grouped/channel)
-        # to ensure the weight scales are loaded in properly
-        extra_weight_attrs.update(
-            {"quant_method": FusedMoeWeightScaleSupported.TENSOR.value}
-        )
-
-        layer.w13_input_scale = None
-        layer.w2_input_scale = None
-
-    def mxfp4_quantize(self, w):
-        w_shape = w.shape
-        w_need_reshape = True if w.dim() != 2 else False
-
-        if w_need_reshape:
-            w_last_dim_size = w_shape[-1]
-            w = w.view(-1, w_last_dim_size)
-
-        # log_info_on_rank0(logger, f"[Pre-quant] w.shape: {w.shape}")
-        w, mx_scales = dynamic_mxfp4_quant(w)
-        # log_info_on_rank0(logger, f"[Post-quant] w.shape: {w.shape} mx_scales.shape: {mx_scales.shape}")
-
-        if w_need_reshape:
-            w_new_shape = w_shape[:-1] + (w.shape[-1],)
-            w = w.view(w_new_shape)
-
-        # log_info_on_rank0(logger, f"[re-shape] w.shape: {w.shape} mx_scales.shape: {mx_scales.shape}")
-
-        mx_scales = e8m0_shuffle(mx_scales)
-
-        return w, mx_scales
-
-    def process_weights_after_loading(self, layer: Module) -> None:
-        w13, w13_mx_scales = self.mxfp4_quantize(layer.w13_weight.data)
-        w2, w2_mx_scales = self.mxfp4_quantize(layer.w2_weight.data)
-
-        layer.w13_weight = torch.nn.Parameter(w13, requires_grad=False)
-        layer.w13_weight_scale = torch.nn.Parameter(w13_mx_scales, requires_grad=False)
-
-        layer.w2_weight = torch.nn.Parameter(w2, requires_grad=False)
-        layer.w2_weight_scale = torch.nn.Parameter(w2_mx_scales, requires_grad=False)
-
-    def apply(
-        self,
-        layer: torch.nn.Module,
-        x: torch.Tensor,
-        topk_output: TopKOutput,
-        *,
-        activation: str = "silu",
-        apply_router_weight_on_input: bool = False,
-        inplace: bool = True,
-        no_combine: bool = False,
-        routed_scaling_factor: Optional[float] = None,
-    ) -> torch.Tensor:
-        topk_weights, topk_ids, _ = topk_output
-
-        return fused_moe(
-            x,
-            layer.w13_weight,
-            layer.w2_weight,
-            topk_weights,
-            topk_ids,
-            quant_type=QuantType.per_1x32,
-            w1_scale=layer.w13_weight_scale,
-            w2_scale=layer.w2_weight_scale,
-            activation=(
-                ActivationType.Silu if activation == "silu" else ActivationType.Gelu
-            ),
-            doweight_stage1=False,
-        )
-
-
-class W4A4MXFp4MoEStaticMethod(MxFp4MoEMethod):
-
-    def __init__(self, weight_config: dict[str, Any], input_config: dict[str, Any]):
-        self.weight_quant = weight_config
-        self.input_quant = input_config
-
-        weight_qscheme = self.weight_quant.get("qscheme")
-        input_qscheme = self.input_quant.get("qscheme")
-        if not (weight_qscheme == "per_group" and input_qscheme == "per_group"):
-            raise ValueError(
-                "For MX(FP4) Fused MoE layers, only per-group scales "
-                "for weights and activations are supported. Found "
-                f"{weight_qscheme=}, {input_qscheme=}"
-            ) # noqa E501
-
-        self.static_input_scales = not self.input_quant.get("is_dynamic")
-
-    def create_weights(
-        self,
-        layer: torch.nn.Module,
-        num_experts: int,
-        hidden_size: int,
-        intermediate_size_per_partition: int,
-        params_dtype: torch.dtype,
-        **extra_weight_attrs,
-    ):
-
-        from sglang.srt.layers.moe.fused_moe_triton import FusedMoeWeightScaleSupported
-
-        # Add the quantization method used (per tensor/grouped/channel)
-        # to ensure the weight scales are loaded in properly
-        extra_weight_attrs.update(
-            {"quant_method": FusedMoeWeightScaleSupported.BLOCK.value}
-        )
-
-        params_dtype = torch.uint8
-
-        # WEIGHTS
-        w13_weight = torch.nn.Parameter(
-            torch.empty(
-                num_experts,
-                2 * intermediate_size_per_partition,
-                hidden_size // 2,
-                dtype=params_dtype,
-            ),
-            requires_grad=False,
-        )
-        layer.register_parameter("w13_weight", w13_weight)
-
-        set_weight_attrs(w13_weight, extra_weight_attrs)
-
-        w2_weight = torch.nn.Parameter(
-            torch.empty(
-                num_experts,
-                hidden_size,
-                intermediate_size_per_partition // 2,
-                dtype=params_dtype,
-            ),
-            requires_grad=False,
-        )
-        layer.register_parameter("w2_weight", w2_weight)
-
-        set_weight_attrs(w2_weight, extra_weight_attrs)
-
-        # WEIGHT_SCALES
-        w13_weight_scale = torch.nn.Parameter(
-            torch.ones(
-                num_experts,
-                2 * intermediate_size_per_partition,
-                hidden_size // OCP_MX_BLOCK_SIZE,
-                dtype=params_dtype,
-            ),
-            requires_grad=False,
-        )
-        w2_weight_scale = torch.nn.Parameter(
-            torch.ones(
-                num_experts,
-                hidden_size,
-                intermediate_size_per_partition // OCP_MX_BLOCK_SIZE,
-                dtype=params_dtype,
-            ),
-            requires_grad=False,
-        )
-        set_weight_attrs(w2_weight_scale, extra_weight_attrs)
-        set_weight_attrs(w13_weight_scale, extra_weight_attrs)
-
-        layer.register_parameter("w13_weight_scale", w13_weight_scale)
-        layer.register_parameter("w2_weight_scale", w2_weight_scale)
-
-    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
-        float_dtype = torch.get_default_dtype()
-
-        # Pre-shuffle weight scales
-        s0, s1, _ = layer.w13_weight_scale.shape
-        w13_weight_scale = layer.w13_weight_scale.view(s0 * s1, -1)
-        w13_weight_scale = e8m0_shuffle(w13_weight_scale)
-        layer.w13_weight_scale.data = w13_weight_scale.view(s0, s1, -1)
-
-        s0, s1, _ = layer.w2_weight_scale.shape
-        w2_weight_scale = layer.w2_weight_scale.view(s0 * s1, -1)
-        w2_weight_scale = e8m0_shuffle(w2_weight_scale)
-        layer.w2_weight_scale.data = w2_weight_scale.view(s0, s1, -1)
-
-    def apply(
-        self,
-        layer: torch.nn.Module,
-        x: torch.Tensor,
-        topk_output: TopKOutput,
-        *,
-        activation: str = "silu",
-        apply_router_weight_on_input: bool = False,
-        inplace: bool = True,
-        no_combine: bool = False,
-        routed_scaling_factor: Optional[float] = None,
-    ) -> torch.Tensor:
-        topk_weights, topk_ids, _ = topk_output
-
-        return fused_moe(
-            x,
-            layer.w13_weight,
-            layer.w2_weight,
-            topk_weights,
-            topk_ids,
-            quant_type=QuantType.per_1x32,
-            w1_scale=layer.w13_weight_scale,
-            w2_scale=layer.w2_weight_scale,
-            activation=(
-                ActivationType.Silu if activation == "silu" else ActivationType.Gelu
-            ),
-            doweight_stage1=False,
-        )
-
-
-class MxFp4KVCacheMethod(BaseKVCacheMethod):
-    """
-    Supports loading kv-cache scaling factors from quark checkpoints.
-    """
-
-    def __init__(self, quant_config: MxFp4Config):
-        self.validate_kv_cache_config(quant_config.kv_cache_config)
-        super().__init__(quant_config)
-
-    @staticmethod
-    def validate_kv_cache_config(kv_cache_config: Optional[dict[str, Any]]):
-        """
-        Validator for the kv cache configuration. Useful for controlling the
-        kv cache quantization schemes, that are being supported in vLLM
-        :param kv_cache_config: the quark kv cache scheme
-        """
-        if kv_cache_config is None:
-            return
-
-        dtype = kv_cache_config.get("dtype")
-        if dtype != "fp8_e4m3":
-            raise NotImplementedError(
-                "Currently supported kv cache quantization is "
-                f"dtype=fp8_e4m3, however received {dtype}"
-            )
-
-        qscheme = kv_cache_config.get("qscheme")
-        if qscheme != "per_tensor":
-            raise NotImplementedError(
-                "Only support per-tensor scaling factor "
-                "for quark KV cache. "
-                f"Expected qscheme: per_tensor, found qscheme: {qscheme}"
-            )