PyPI - vllm-cpu-amxbf16 - Versions diffs - 0.11.2.post2__cp310-cp310-manylinux_2_17_x86_64.whl - Mend

vllm-cpu-amxbf16 0.11.2.post2__cp310-cp310-manylinux_2_17_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (1536) hide show

vllm/model_executor/layers/quantization/schema.py ADDED Viewed

@@ -0,0 +1,90 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+This file contains the Pydantic schemas for various quantization-related
+parameters. When a relevant quantization technique is specified, these
+parameters are loaded in the form of a JSON alongside the model weights
+and augment the model with additional information needed for use of that
+technique. The format of this JSON should be specified by one or more
+schemas contained here.
+For example, when the KV cache is quantized to FP8-E4M3 (currently only
+possible on ROCm), the model can be optionally augmented with KV cache
+scaling factors.
+"""
+from pydantic import BaseModel, ConfigDict, ValidationInfo, model_validator
+class KVCacheQuantSchema(BaseModel):
+    dtype: str
+    # Each key is a TP rank. Each value is a dictionary mapping a TP rank's
+    # layer indices to their per-tensor KV cache scaling factor.
+    # TODO: Consider pulling this and its validation methods out into its
+    # own schema class (tricky as its members are variable)
+    scaling_factor: dict[int, dict[int, float]]
+    @model_validator(mode="after")
+    def check_is_fp8(self) -> "KVCacheQuantSchema":
+        assert self.dtype == "float8_e4m3fn", (
+            "Loaded scaling factors intended for KV cache dtype = "
+            f"{self.dtype} rather than float8_e4m3fn!"
+        )
+        return self
+    @model_validator(mode="after")
+    def check_tp_ranks(self, info: ValidationInfo) -> "KVCacheQuantSchema":
+        context = info.context
+        if context:
+            tp_size = context["tp_size"]
+            num_hidden_layers = context["num_hidden_layers"]
+            assert len(self.scaling_factor) == tp_size, (
+                f"Loaded dictionary has TP size {len(self.scaling_factor)} "
+                f"but LLM engine is currently running with TP size {tp_size}."
+            )
+            for tp_rank, layer_maps in self.scaling_factor.items():
+                assert len(layer_maps) == num_hidden_layers, (
+                    f"KV cache scales map for TP rank {tp_rank} is malformed. "
+                    f"Expected {num_hidden_layers} layers, got "
+                    f"{len(layer_maps)}."
+                )
+            for i in range(tp_size):
+                assert i in self.scaling_factor, (
+                    f"KV cache scales map for TP rank {i} not found."
+                )
+        return self
+    @model_validator(mode="after")
+    def check_current_rank(self, info: ValidationInfo) -> "KVCacheQuantSchema":
+        context = info.context
+        if context:
+            tp_rank = context["tp_rank"]
+            num_hidden_layers = context["num_hidden_layers"]
+            layer_scales_map = self.scaling_factor[tp_rank]
+            for i in range(num_hidden_layers):
+                assert i in layer_scales_map, (
+                    f"Could not find KV cache scales for layer {i} in "
+                    f"TP rank {tp_rank}."
+                )
+        return self
+class QuantParamSchema(BaseModel):
+    # TODO: Generalize and extend with more fields
+    # (e.g. weights/activations params) once functionality is enabled
+    model_config = ConfigDict(protected_namespaces=())
+    model_type: str | None
+    kv_cache: KVCacheQuantSchema
+    @model_validator(mode="after")
+    def check_model_type(self, info: ValidationInfo) -> "QuantParamSchema":
+        context = info.context
+        if context:
+            model_type = context.get("model_type", None)
+            if model_type is not None:
+                assert model_type == self.model_type, (
+                    f"Model type is {model_type} but loaded "
+                    f"scaling factors belonging to different "
+                    f"model type {self.model_type}!"
+                )
+        return self

vllm/model_executor/layers/quantization/torchao.py ADDED Viewed

@@ -0,0 +1,380 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import importlib
+import json
+import types
+from importlib.util import find_spec
+from typing import Any, Optional
+import regex as re
+import torch
+import torch.nn.functional as F
+from packaging import version
+from torch.nn.parameter import Parameter
+from vllm.logger import init_logger
+from vllm.model_executor.layers.linear import (
+    LinearBase,
+    LinearMethodBase,
+    UnquantizedLinearMethod,
+)
+from vllm.model_executor.layers.quantization import QuantizationMethods
+from vllm.model_executor.layers.quantization.base_config import (
+    QuantizationConfig,
+    QuantizeMethodBase,
+)
+from vllm.model_executor.utils import set_weight_attrs
+logger = init_logger(__name__)
+def _bond_method_to_cls(func, obj):
+    if hasattr(func, "__self__") or not callable(func):
+        # If the function is already bound to an instance, return it as is
+        return func
+    else:
+        return types.MethodType(func, obj)
+def _get_weight_attrs(param):
+    # record attributes attached to the weight, so we can
+    # recover later
+    recorded_weight_attr = {}
+    for key in param.__dict__:
+        if hasattr(param, key):
+            attr = getattr(param, key)
+            if not callable(attr):
+                recorded_weight_attr[key] = attr
+            elif hasattr(attr, "__self__") and param is attr.__self__:
+                # if attr is a bonded method for an instance, and
+                # attr.__self__ points to the instance (param)
+                # we'll record the underlying function object
+                recorded_weight_attr[key] = attr.__func__
+            else:
+                recorded_weight_attr[key] = attr
+    return recorded_weight_attr
+def _restore_weight_attrs(param, recorded_weight_attr):
+    for attr_name, attr in recorded_weight_attr.items():
+        if not hasattr(param, attr_name):
+            setattr(param, attr_name, _bond_method_to_cls(attr, param))
+def torchao_version_at_least(torchao_version: str) -> bool:
+    if find_spec("torchao"):
+        try:
+            if version.parse(importlib.metadata.version("torchao")) >= version.parse(
+                torchao_version
+            ):
+                return True
+        except (ImportError, version.InvalidVersion):
+            return False
+    return False
+def should_skip(prefix: str, skip_modules: list[str]) -> bool:
+    """
+    Robust skipping logic:
+    should_skip("model.model.layers.1.q_proj",
+                ["model.model.layers.1.q_proj"])  # True
+    should_skip("model.model.layers.10.o_proj", ["o_proj"])  -> True
+    should_skip("visual.model.layers.1.q_proj", ["visual"])   -> True
+    should_skip("model.model.layers.1.q_proj", ["layers.1"])  -> True
+    should_skip("model.model.layers.11.q_proj", ["layers.1"]) -> False
+    """
+    for s in skip_modules:
+        if prefix == s:
+            return True
+        if f".{s}." in f".{prefix}.":
+            return True
+    return False
+if torchao_version_at_least("0.15.0"):
+    from torchao.prototype.tensor_conversion.api import (
+        convert_to_packed_tensor_based_on_current_hardware,
+    )
+else:
+    convert_to_packed_tensor_based_on_current_hardware = lambda t: t
+class TorchAOConfig(QuantizationConfig):
+    """Config class for torchao."""
+    def __init__(
+        self,
+        torchao_config,
+        skip_modules: list[str] | None = None,
+        is_checkpoint_torchao_serialized: bool = False,
+    ) -> None:
+        """
+        # TorchAO quantization relies on tensor subclasses. In order,
+        # to enable proper caching this needs standalone compile
+        if is_torch_equal_or_newer("2.8.0.dev"):
+            os.environ["VLLM_TEST_STANDALONE_COMPILE"] = "1"
+            logger.info(
+                "Using TorchAO: Setting VLLM_TEST_STANDALONE_COMPILE=1")
+        # TODO: remove after the torch dependency is updated to 2.8
+        if is_torch_equal_or_newer(
+                "2.7.0") and not is_torch_equal_or_newer("2.8.0.dev"):
+            os.environ["VLLM_DISABLE_COMPILE_CACHE"] = "1"
+            logger.info("Using TorchAO: Setting VLLM_DISABLE_COMPILE_CACHE=1")
+        """
+        super().__init__()
+        self.torchao_config = torchao_config
+        self.skip_modules = skip_modules or []
+        self.is_checkpoint_torchao_serialized = is_checkpoint_torchao_serialized
+    def __repr__(self) -> str:
+        return (
+            f"TorchAOConfig({self.torchao_config=}, {self.skip_modules=}, "
+            f"{self.is_checkpoint_torchao_serialized=})"
+        )
+    def get_name(self) -> QuantizationMethods:
+        return "torchao"
+    def get_supported_act_dtypes(self) -> list[torch.dtype]:
+        return [torch.float32, torch.float16, torch.bfloat16]
+    @classmethod
+    def get_min_capability(cls) -> int:
+        return 75
+    @staticmethod
+    def get_config_filenames() -> list[str]:
+        """torchao doesn't require additional config files, we use
+        `config.json` from huggingface: `model_config.hf_config`
+        """
+        return []
+    @classmethod
+    def from_config(cls, config: dict[str, Any]) -> "TorchAOConfig":
+        """Create the quant config from an hf model config"""
+        try:
+            from torchao.core.config import config_from_dict
+        except ImportError as err:
+            raise ImportError(
+                "Please install torchao>=0.10.0 via "
+                "`pip install torchao>=0.10.0` to use torchao quantization."
+            ) from err
+        quant_method = cls.get_from_keys_or(config, ["quant_method"], None)
+        is_checkpoint_torchao_serialized = (
+            quant_method is not None and "torchao" in quant_method
+        )
+        hf_config = cls.get_from_keys_or(config, ["quant_type"], None)
+        assert hf_config is not None, "quant_type must be specified"
+        assert len(hf_config) == 1 and "default" in hf_config, (
+            "Expected only one key 'default' in quant_type dictionary"
+        )
+        quant_type = hf_config["default"]
+        ao_config = config_from_dict(quant_type)
+        # Adds skipped modules defined in "modules_to_not_convert"
+        skip_modules = config.get("modules_to_not_convert", []) or []
+        # Adds skipped modules defined in "module_fqn_to_config"
+        _data = quant_type.get("_data", {})
+        if not isinstance(_data, dict):
+            _data = {}
+        module_fqn = _data.get("module_fqn_to_config", {})
+        if not isinstance(module_fqn, dict):
+            module_fqn = {}
+        for layer, layer_cfg in module_fqn.items():
+            if layer_cfg is None:
+                skip_modules.append(layer)
+        return cls(ao_config, skip_modules, is_checkpoint_torchao_serialized)
+    @classmethod
+    def from_config_file(cls, config_file: str) -> "TorchAOConfig":
+        """Initialize class from a config file. Example:
+        ```
+        config = Float8DynamicActivationFloat8WeightConfig(granularity=PerRow())
+        fn = "torchao_config.json"
+        with open(fn, "w") as f:
+            f.write(json.dumps(config_to_dict(config)))
+        ```
+        """
+        with open(config_file) as f:
+            f.seek(0)
+            f_read = f.read()
+            config_dict = json.loads(f_read)
+        hf_config = {"quant_type": {"default": config_dict}}
+        return cls.from_config(hf_config)
+    @classmethod
+    def from_config_dict_json(cls, config_dict_json: str) -> "TorchAOConfig":
+        """Iniitalize class from a config_dict json string, got from
+        torchao_config_object = some AOBaseConfig object
+        json.dumps(config_to_dict(torchao_config_object))
+        """
+        config_dict = json.loads(config_dict_json)
+        hf_config = {"quant_type": {"default": config_dict}}
+        return cls.from_config(hf_config)
+    def get_quant_method(
+        self, layer: torch.nn.Module, prefix: str
+    ) -> Optional["QuantizeMethodBase"]:
+        if not isinstance(layer, LinearBase):
+            return None
+        from torchao.quantization import ModuleFqnToConfig
+        if should_skip(prefix, self.skip_modules):
+            return UnquantizedLinearMethod()
+        module_fqn = prefix
+        if isinstance(self.torchao_config, ModuleFqnToConfig):
+            module_fqn_to_config = self.torchao_config.module_fqn_to_config
+            c = None
+            if module_fqn in module_fqn_to_config:
+                assert not module_fqn.startswith("re:"), (
+                    "module fqn should not start with"
+                    "`re:`, which is used for specifying regex"
+                )
+                c = module_fqn_to_config[module_fqn]
+            else:
+                for maybe_module_fqn_pattern in module_fqn_to_config:
+                    if not maybe_module_fqn_pattern.startswith("re:"):
+                        continue
+                    elif re.fullmatch(maybe_module_fqn_pattern[3:], module_fqn):
+                        # we'll apply the config for first fully matched pattern
+                        c = module_fqn_to_config[maybe_module_fqn_pattern]
+                        break
+                else:
+                    # fallback to use default if no module specific
+                    # config is provided
+                    c = module_fqn_to_config.get("_default", None)
+            if c is not None:
+                current_torchao_config = TorchAOConfig(
+                    c, self.skip_modules, self.is_checkpoint_torchao_serialized
+                )
+                return TorchAOLinearMethod(current_torchao_config)
+            else:
+                return UnquantizedLinearMethod()
+        return TorchAOLinearMethod(self)
+    def get_scaled_act_names(self) -> list[str]:
+        return []
+def torchao_quantize_param_data(
+    param: torch.Tensor, torchao_config: Any
+) -> torch.nn.Parameter:
+    """Quantize a Tensor with torchao quantization specified by torchao_config
+    Args:
+        param: weight parameter of the linear module
+        torchao_config: type of quantization and their arguments we want to
+            use to quantize the Tensor
+    """
+    from torchao.core.config import AOBaseConfig
+    from torchao.quantization import quantize_
+    assert isinstance(torchao_config, AOBaseConfig), f"{torchao_config}"
+    """
+    Avoid real weight allocation for faster load, since we will
+    end up setting it to param.
+    """
+    with torch.device("meta"):
+        # linear can't be top level module since quantize_ is inplace
+        # while some of our configs need to do module swap, and only non-top
+        # level modules support module swap
+        dummy_linear = torch.nn.Sequential(
+            torch.nn.Linear(param.shape[1], param.shape[0], bias=False)
+        )
+    dummy_linear[0].weight = param
+    quantize_(dummy_linear, torchao_config)
+    return dummy_linear[0].weight
+class TorchAOLinearMethod(LinearMethodBase):
+    """Linear method for torchao.
+    Args:
+        quant_config: The ``TorchAOConfig`` holding the torchao quantization
+            config and whether the checkpoint is already torchao-serialized.
+    """
+    def __init__(self, quant_config: TorchAOConfig):
+        self.quant_config = quant_config
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        input_size_per_partition: int,
+        output_partition_sizes: list[int],
+        input_size: int,
+        output_size: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        """Register an uninitialized ``weight`` parameter on ``layer``.
+        The weight has shape
+        ``(sum(output_partition_sizes), input_size_per_partition)``.
+        """
+        weight = Parameter(
+            torch.empty(
+                sum(output_partition_sizes),
+                input_size_per_partition,
+                dtype=params_dtype,
+            ),
+            requires_grad=False,
+        )
+        # For torchao-serialized checkpoints the placeholder is quantized up
+        # front - presumably so it matches the layout of the tensors loaded
+        # into it; TODO confirm against the weight loader.
+        if self.quant_config.is_checkpoint_torchao_serialized:
+            weight = torchao_quantize_param_data(
+                weight, self.quant_config.torchao_config
+            )
+        set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0})
+        layer.register_parameter("weight", weight)
+        set_weight_attrs(weight, extra_weight_attrs)
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        """Compute ``F.linear(x, layer.weight, bias)``."""
+        return F.linear(x, layer.weight, bias)
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        """Finalize ``layer.weight`` once checkpoint loading is done.
+        Pre-serialized checkpoints only get the packed-tensor conversion;
+        otherwise the weight is quantized here first. Attributes attached to
+        the old weight are carried over to the new parameter in both cases.
+        """
+        if self.quant_config.is_checkpoint_torchao_serialized:
+            if not hasattr(layer, "weight"):
+                return
+            # record attributes attached to the weight, so we can
+            # recover later
+            recorded_weight_attr = _get_weight_attrs(layer.weight)
+            layer.weight = Parameter(
+                convert_to_packed_tensor_based_on_current_hardware(layer.weight),
+                requires_grad=layer.weight.requires_grad,
+            )
+            _restore_weight_attrs(layer.weight, recorded_weight_attr)
+            return
+        # online quantize the weight if the checkpoint is not already
+        # quantized by torchao
+        recorded_weight_attr = _get_weight_attrs(layer.weight)
+        weight = torchao_quantize_param_data(
+            layer.weight, self.quant_config.torchao_config
+        )
+        # Second positional argument of Parameter is ``requires_grad``.
+        weight = torch.nn.Parameter(
+            convert_to_packed_tensor_based_on_current_hardware(weight),
+            weight.requires_grad,
+        )
+        _restore_weight_attrs(weight, recorded_weight_attr)
+        layer.register_parameter("weight", weight)

vllm/model_executor/layers/quantization/tpu_int8.py ADDED Viewed

@@ -0,0 +1,139 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from typing import Any, Optional
+import torch
+from torch.nn import Module
+from torch.nn.parameter import Parameter
+from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase
+from vllm.model_executor.layers.quantization import (
+    QuantizationConfig,
+    QuantizationMethods,
+)
+from vllm.model_executor.parameter import ModelWeightParameter
+ACTIVATION_SCHEMES = ["none", "dynamic"]  # values accepted by Int8TpuConfig
+class Int8TpuConfig(QuantizationConfig):
+    """Int8 Quantization Config class for TPU Backend."""
+    def __init__(
+        self,
+        activation_scheme: str = "none",
+    ) -> None:
+        super().__init__()
+        if activation_scheme not in ACTIVATION_SCHEMES:
+            raise ValueError(f"Unsupported activation scheme {activation_scheme}")
+        self.activation_scheme = activation_scheme
+    def get_name(self) -> QuantizationMethods:
+        return "tpu_int8"
+    def get_supported_act_dtypes(self) -> list[torch.dtype]:
+        return [torch.float16, torch.bfloat16]
+    @classmethod
+    def get_min_capability(cls) -> int:
+        raise NotImplementedError("This function should not be called with TPU Backend")
+    @staticmethod
+    def get_config_filenames() -> list[str]:
+        return []
+    @classmethod
+    def from_config(cls, config: dict[str, Any]) -> "Int8TpuConfig":
+        activation_scheme = cls.get_from_keys(config, ["activation_scheme"])
+        return cls(activation_scheme=activation_scheme)
+    def get_quant_method(
+        self, layer: Module, prefix: str
+    ) -> Optional["TPUInt8LinearMethod"]:
+        if isinstance(layer, LinearBase):
+            return TPUInt8LinearMethod(self)
+        return None
+class TPUInt8LinearMethod(LinearMethodBase):
+    """Int8 Linear method for TPU Quant."""
+    def __init__(self, quant_config: Int8TpuConfig):
+        self.quant_config = quant_config
+        self.quantize_activation = False
+        if self.quant_config.activation_scheme == "dynamic":
+            self.quantize_activation = True
+    def create_weights(
+        self,
+        layer: Module,
+        input_size_per_partition: int,
+        output_partition_sizes: list[int],
+        input_size: int,
+        output_size: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        weight_loader = extra_weight_attrs.get("weight_loader")
+        weight = ModelWeightParameter(
+            data=torch.empty(
+                sum(output_partition_sizes),
+                input_size_per_partition,
+                dtype=params_dtype,
+            ),
+            input_dim=1,
+            output_dim=0,
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("weight", weight)
+    def _quantize_weight(
+        self, weight: torch.Tensor
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        weight_dtype = weight.dtype
+        weight = weight.cpu().to(torch.float32)
+        n_bit = 8
+        eps = 1e-5
+        max_int = 2 ** (n_bit - 1) - 1
+        min_int = -(2 ** (n_bit - 1))
+        max_val = weight.abs().amax(dim=-1, keepdim=True)
+        max_val = max_val.clamp(min=eps)
+        qscale = max_val / max_int
+        qweight = torch.clamp(
+            torch.round(weight * (1.0 / qscale)), min_int, max_int
+        ).to(torch.int8)
+        qscale = qscale.squeeze().to(weight_dtype)
+        return qweight, qscale
+    def process_weights_after_loading(self, layer: Module) -> None:
+        layer.weight = Parameter(layer.weight.data, requires_grad=False)
+        device = layer.weight.device
+        qweight, qscale = self._quantize_weight(layer.weight)
+        qweight = qweight.to(device)
+        qscale = qscale.to(device)
+        layer.weight = Parameter(qweight, requires_grad=False)
+        layer.scale = Parameter(qscale, requires_grad=False)
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        try:
+            import torch_xla.experimental.custom_kernel  # noqa: F401
+        except ImportError as err:
+            raise ImportError(
+                "Please install torch_xla by following the instructions at "
+                "https://docs.vllm.ai/en/latest/getting_started/tpu-installation.html "  # noqa: E501
+                "to run vLLM on TPU."
+            ) from err
+        weight = layer.weight
+        scale = layer.scale
+        out = torch.ops.xla.quantized_matmul_int8(
+            x, weight, scale, quantize_activation=self.quantize_activation
+        )
+        if bias is not None:
+            out = out + bias
+        return out

vllm/model_executor/layers/quantization/utils/__init__.py ADDED Viewed

@@ -0,0 +1,6 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from .layer_utils import replace_parameter, update_tensor_inplace
+__all__ = ["update_tensor_inplace", "replace_parameter"]  # re-exported from .layer_utils

vllm/model_executor/layers/quantization/utils/allspark_utils.py ADDED Viewed

@@ -0,0 +1,67 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import torch
+from vllm.platforms import current_platform
+from vllm.scalar_type import ScalarType, scalar_types
+# NOTE(review): not referenced in the visible code; presumably an M-dimension
+# threshold for switching to cuBLAS on Ampere - confirm against callers.
+ALLSPARK_AMPERE_M_CUBLAS_THRESHOLD = 1024
+# Weight quant types accepted by check_allspark_supported_dtype_shape.
+ALLSPARK_SUPPORTED_QUANT_TYPES = [scalar_types.uint8b128]
+# output_size_per_partition must be a multiple of this on Ampere.
+ALLSPARK_AMPERE_N_ALIGN = 16
+# input_size_per_partition must be a multiple of this on Ampere.
+ALLSPARK_AMPERE_K_ALIGN = 16
+def check_allspark_supported_dtype_shape(
+    input_size_per_partition: int,
+    output_size_per_partition: int,
+    group_size: int,
+    weight_dtype: ScalarType,
+    act_dtype: torch.dtype,
+):
+    capability_tuple = current_platform.get_device_capability()
+    device_capability = -1 if capability_tuple is None else capability_tuple.to_int()
+    # For Ampere GPU
+    if device_capability >= 80 and device_capability < 90:
+        if group_size != -1:
+            return (
+                False,
+                "For Ampere GPU, AllSpark does not support group_size "
+                f"= {group_size}. Only group_size = -1 are supported.",
+            )
+        if weight_dtype not in ALLSPARK_SUPPORTED_QUANT_TYPES:
+            return (
+                False,
+                "For Ampere GPU, AllSpark does not support "
+                f"quant type ({weight_dtype}). Only quant type "
+                f"({ALLSPARK_SUPPORTED_QUANT_TYPES}) are supported.",
+            )
+        if (
+            input_size_per_partition % ALLSPARK_AMPERE_K_ALIGN != 0
+            or output_size_per_partition % ALLSPARK_AMPERE_N_ALIGN != 0
+        ):
+            return (
+                False,
+                "AllSpark needs input_size_per_partition % "
+                f"{ALLSPARK_AMPERE_K_ALIGN} = 0 and "
+                f"output_size_per_partition % {ALLSPARK_AMPERE_N_ALIGN} = 0 "
+                "for Ampere GPU optimized kernels.",
+            )
+        if act_dtype != torch.float16 and act_dtype != torch.bfloat16:
+            return (
+                False,
+                "AllSpark only supports act_dtype = float16 or bfloat16,"
+                f"for Ampere GPU, but got act_dtype = {act_dtype}.",
+            )
+    else:
+        return (
+            False,
+            "AllSpark currently does not support "
+            f"device_capability = {device_capability}.",
+        )
+    return True, None