bigdl-core-npu 2.6.0b20250114__cp311-cp311-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (234) hide show
  1. bigdl-core-npu/__init__.py +0 -0
  2. bigdl-core-npu/include/common.h +96 -0
  3. bigdl-core-npu/include/npu_llm.h +74 -0
  4. bigdl-core-npu/npu_llm.dll +0 -0
  5. bigdl-core-npu/npu_llm.lib +0 -0
  6. bigdl_core_npu-2.6.0b20250114.dist-info/METADATA +44 -0
  7. bigdl_core_npu-2.6.0b20250114.dist-info/RECORD +234 -0
  8. bigdl_core_npu-2.6.0b20250114.dist-info/WHEEL +5 -0
  9. bigdl_core_npu-2.6.0b20250114.dist-info/top_level.txt +2 -0
  10. intel_npu_acceleration_library/__init__.py +24 -0
  11. intel_npu_acceleration_library/_version.py +6 -0
  12. intel_npu_acceleration_library/backend/__init__.py +37 -0
  13. intel_npu_acceleration_library/backend/base.py +250 -0
  14. intel_npu_acceleration_library/backend/bindings.py +383 -0
  15. intel_npu_acceleration_library/backend/compression.py +24 -0
  16. intel_npu_acceleration_library/backend/convolution.py +58 -0
  17. intel_npu_acceleration_library/backend/factory.py +1161 -0
  18. intel_npu_acceleration_library/backend/linear.py +60 -0
  19. intel_npu_acceleration_library/backend/matmul.py +59 -0
  20. intel_npu_acceleration_library/backend/mlp.py +58 -0
  21. intel_npu_acceleration_library/backend/ops.py +142 -0
  22. intel_npu_acceleration_library/backend/qlinear.py +75 -0
  23. intel_npu_acceleration_library/backend/qmatmul.py +66 -0
  24. intel_npu_acceleration_library/backend/runtime.py +215 -0
  25. intel_npu_acceleration_library/backend/sdpa.py +107 -0
  26. intel_npu_acceleration_library/backend/tensor.py +1120 -0
  27. intel_npu_acceleration_library/backend/utils.py +70 -0
  28. intel_npu_acceleration_library/compiler.py +194 -0
  29. intel_npu_acceleration_library/device.py +230 -0
  30. intel_npu_acceleration_library/dtypes.py +155 -0
  31. intel_npu_acceleration_library/external/openvino/__init__.py +72 -0
  32. intel_npu_acceleration_library/external/openvino/_offline_transformations/__init__.py +21 -0
  33. intel_npu_acceleration_library/external/openvino/_pyopenvino.cp310-win_amd64.pyd +0 -0
  34. intel_npu_acceleration_library/external/openvino/_pyopenvino.cp311-win_amd64.pyd +0 -0
  35. intel_npu_acceleration_library/external/openvino/_pyopenvino.cp312-win_amd64.pyd +0 -0
  36. intel_npu_acceleration_library/external/openvino/_pyopenvino.cp38-win_amd64.pyd +0 -0
  37. intel_npu_acceleration_library/external/openvino/_pyopenvino.cp39-win_amd64.pyd +0 -0
  38. intel_npu_acceleration_library/external/openvino/experimental/__init__.py +14 -0
  39. intel_npu_acceleration_library/external/openvino/frontend/__init__.py +34 -0
  40. intel_npu_acceleration_library/external/openvino/frontend/frontend.py +44 -0
  41. intel_npu_acceleration_library/external/openvino/frontend/jax/__init__.py +15 -0
  42. intel_npu_acceleration_library/external/openvino/frontend/jax/jaxpr_decoder.py +293 -0
  43. intel_npu_acceleration_library/external/openvino/frontend/jax/passes.py +65 -0
  44. intel_npu_acceleration_library/external/openvino/frontend/jax/utils.py +182 -0
  45. intel_npu_acceleration_library/external/openvino/frontend/onnx/__init__.py +15 -0
  46. intel_npu_acceleration_library/external/openvino/frontend/onnx/py_onnx_frontend.cp310-win_amd64.pyd +0 -0
  47. intel_npu_acceleration_library/external/openvino/frontend/onnx/py_onnx_frontend.cp311-win_amd64.pyd +0 -0
  48. intel_npu_acceleration_library/external/openvino/frontend/onnx/py_onnx_frontend.cp312-win_amd64.pyd +0 -0
  49. intel_npu_acceleration_library/external/openvino/frontend/onnx/py_onnx_frontend.cp38-win_amd64.pyd +0 -0
  50. intel_npu_acceleration_library/external/openvino/frontend/onnx/py_onnx_frontend.cp39-win_amd64.pyd +0 -0
  51. intel_npu_acceleration_library/external/openvino/frontend/paddle/__init__.py +15 -0
  52. intel_npu_acceleration_library/external/openvino/frontend/paddle/py_paddle_frontend.cp310-win_amd64.pyd +0 -0
  53. intel_npu_acceleration_library/external/openvino/frontend/paddle/py_paddle_frontend.cp311-win_amd64.pyd +0 -0
  54. intel_npu_acceleration_library/external/openvino/frontend/paddle/py_paddle_frontend.cp312-win_amd64.pyd +0 -0
  55. intel_npu_acceleration_library/external/openvino/frontend/paddle/py_paddle_frontend.cp38-win_amd64.pyd +0 -0
  56. intel_npu_acceleration_library/external/openvino/frontend/paddle/py_paddle_frontend.cp39-win_amd64.pyd +0 -0
  57. intel_npu_acceleration_library/external/openvino/frontend/pytorch/__init__.py +19 -0
  58. intel_npu_acceleration_library/external/openvino/frontend/pytorch/fx_decoder.py +370 -0
  59. intel_npu_acceleration_library/external/openvino/frontend/pytorch/gptq.py +180 -0
  60. intel_npu_acceleration_library/external/openvino/frontend/pytorch/module_extension.py +39 -0
  61. intel_npu_acceleration_library/external/openvino/frontend/pytorch/patch_model.py +118 -0
  62. intel_npu_acceleration_library/external/openvino/frontend/pytorch/py_pytorch_frontend.cp310-win_amd64.pyd +0 -0
  63. intel_npu_acceleration_library/external/openvino/frontend/pytorch/py_pytorch_frontend.cp311-win_amd64.pyd +0 -0
  64. intel_npu_acceleration_library/external/openvino/frontend/pytorch/py_pytorch_frontend.cp312-win_amd64.pyd +0 -0
  65. intel_npu_acceleration_library/external/openvino/frontend/pytorch/py_pytorch_frontend.cp38-win_amd64.pyd +0 -0
  66. intel_npu_acceleration_library/external/openvino/frontend/pytorch/py_pytorch_frontend.cp39-win_amd64.pyd +0 -0
  67. intel_npu_acceleration_library/external/openvino/frontend/pytorch/torchdynamo/backend.py +131 -0
  68. intel_npu_acceleration_library/external/openvino/frontend/pytorch/torchdynamo/backend_utils.py +85 -0
  69. intel_npu_acceleration_library/external/openvino/frontend/pytorch/torchdynamo/compile.py +141 -0
  70. intel_npu_acceleration_library/external/openvino/frontend/pytorch/torchdynamo/decompositions.py +116 -0
  71. intel_npu_acceleration_library/external/openvino/frontend/pytorch/torchdynamo/execute.py +189 -0
  72. intel_npu_acceleration_library/external/openvino/frontend/pytorch/torchdynamo/op_support.py +290 -0
  73. intel_npu_acceleration_library/external/openvino/frontend/pytorch/torchdynamo/partition.py +126 -0
  74. intel_npu_acceleration_library/external/openvino/frontend/pytorch/ts_decoder.py +568 -0
  75. intel_npu_acceleration_library/external/openvino/frontend/pytorch/utils.py +258 -0
  76. intel_npu_acceleration_library/external/openvino/frontend/tensorflow/__init__.py +16 -0
  77. intel_npu_acceleration_library/external/openvino/frontend/tensorflow/graph_iterator.py +116 -0
  78. intel_npu_acceleration_library/external/openvino/frontend/tensorflow/node_decoder.py +219 -0
  79. intel_npu_acceleration_library/external/openvino/frontend/tensorflow/py_tensorflow_frontend.cp310-win_amd64.pyd +0 -0
  80. intel_npu_acceleration_library/external/openvino/frontend/tensorflow/py_tensorflow_frontend.cp311-win_amd64.pyd +0 -0
  81. intel_npu_acceleration_library/external/openvino/frontend/tensorflow/py_tensorflow_frontend.cp312-win_amd64.pyd +0 -0
  82. intel_npu_acceleration_library/external/openvino/frontend/tensorflow/py_tensorflow_frontend.cp38-win_amd64.pyd +0 -0
  83. intel_npu_acceleration_library/external/openvino/frontend/tensorflow/py_tensorflow_frontend.cp39-win_amd64.pyd +0 -0
  84. intel_npu_acceleration_library/external/openvino/frontend/tensorflow/utils.py +481 -0
  85. intel_npu_acceleration_library/external/openvino/helpers/__init__.py +6 -0
  86. intel_npu_acceleration_library/external/openvino/helpers/packing.py +87 -0
  87. intel_npu_acceleration_library/external/openvino/preprocess/README.md +60 -0
  88. intel_npu_acceleration_library/external/openvino/preprocess/__init__.py +28 -0
  89. intel_npu_acceleration_library/external/openvino/preprocess/torchvision/__init__.py +15 -0
  90. intel_npu_acceleration_library/external/openvino/preprocess/torchvision/preprocess_converter.py +47 -0
  91. intel_npu_acceleration_library/external/openvino/preprocess/torchvision/requirements.txt +5 -0
  92. intel_npu_acceleration_library/external/openvino/preprocess/torchvision/torchvision_preprocessing.py +347 -0
  93. intel_npu_acceleration_library/external/openvino/properties/__init__.py +22 -0
  94. intel_npu_acceleration_library/external/openvino/properties/_properties.py +55 -0
  95. intel_npu_acceleration_library/external/openvino/properties/device/__init__.py +14 -0
  96. intel_npu_acceleration_library/external/openvino/properties/hint/__init__.py +15 -0
  97. intel_npu_acceleration_library/external/openvino/properties/intel_auto/__init__.py +12 -0
  98. intel_npu_acceleration_library/external/openvino/properties/intel_cpu/__init__.py +8 -0
  99. intel_npu_acceleration_library/external/openvino/properties/intel_gpu/__init__.py +12 -0
  100. intel_npu_acceleration_library/external/openvino/properties/intel_gpu/hint/__init__.py +11 -0
  101. intel_npu_acceleration_library/external/openvino/properties/log/__init__.py +11 -0
  102. intel_npu_acceleration_library/external/openvino/properties/streams/__init__.py +11 -0
  103. intel_npu_acceleration_library/external/openvino/runtime/__init__.py +85 -0
  104. intel_npu_acceleration_library/external/openvino/runtime/exceptions.py +17 -0
  105. intel_npu_acceleration_library/external/openvino/runtime/ie_api.py +631 -0
  106. intel_npu_acceleration_library/external/openvino/runtime/op/__init__.py +19 -0
  107. intel_npu_acceleration_library/external/openvino/runtime/op/util/__init__.py +22 -0
  108. intel_npu_acceleration_library/external/openvino/runtime/opset1/__init__.py +112 -0
  109. intel_npu_acceleration_library/external/openvino/runtime/opset1/ops.py +3068 -0
  110. intel_npu_acceleration_library/external/openvino/runtime/opset10/__init__.py +179 -0
  111. intel_npu_acceleration_library/external/openvino/runtime/opset10/ops.py +173 -0
  112. intel_npu_acceleration_library/external/openvino/runtime/opset11/__init__.py +179 -0
  113. intel_npu_acceleration_library/external/openvino/runtime/opset11/ops.py +107 -0
  114. intel_npu_acceleration_library/external/openvino/runtime/opset12/__init__.py +180 -0
  115. intel_npu_acceleration_library/external/openvino/runtime/opset12/ops.py +120 -0
  116. intel_npu_acceleration_library/external/openvino/runtime/opset13/__init__.py +188 -0
  117. intel_npu_acceleration_library/external/openvino/runtime/opset13/ops.py +398 -0
  118. intel_npu_acceleration_library/external/openvino/runtime/opset14/__init__.py +190 -0
  119. intel_npu_acceleration_library/external/openvino/runtime/opset14/ops.py +171 -0
  120. intel_npu_acceleration_library/external/openvino/runtime/opset15/__init__.py +17 -0
  121. intel_npu_acceleration_library/external/openvino/runtime/opset15/ops.py +276 -0
  122. intel_npu_acceleration_library/external/openvino/runtime/opset2/__init__.py +118 -0
  123. intel_npu_acceleration_library/external/openvino/runtime/opset2/ops.py +216 -0
  124. intel_npu_acceleration_library/external/openvino/runtime/opset3/__init__.py +134 -0
  125. intel_npu_acceleration_library/external/openvino/runtime/opset3/ops.py +638 -0
  126. intel_npu_acceleration_library/external/openvino/runtime/opset4/__init__.py +145 -0
  127. intel_npu_acceleration_library/external/openvino/runtime/opset4/ops.py +464 -0
  128. intel_npu_acceleration_library/external/openvino/runtime/opset5/__init__.py +152 -0
  129. intel_npu_acceleration_library/external/openvino/runtime/opset5/ops.py +372 -0
  130. intel_npu_acceleration_library/external/openvino/runtime/opset6/__init__.py +154 -0
  131. intel_npu_acceleration_library/external/openvino/runtime/opset6/ops.py +215 -0
  132. intel_npu_acceleration_library/external/openvino/runtime/opset7/__init__.py +158 -0
  133. intel_npu_acceleration_library/external/openvino/runtime/opset7/ops.py +169 -0
  134. intel_npu_acceleration_library/external/openvino/runtime/opset8/__init__.py +169 -0
  135. intel_npu_acceleration_library/external/openvino/runtime/opset8/ops.py +787 -0
  136. intel_npu_acceleration_library/external/openvino/runtime/opset9/__init__.py +175 -0
  137. intel_npu_acceleration_library/external/openvino/runtime/opset9/ops.py +341 -0
  138. intel_npu_acceleration_library/external/openvino/runtime/opset_utils.py +22 -0
  139. intel_npu_acceleration_library/external/openvino/runtime/passes/__init__.py +19 -0
  140. intel_npu_acceleration_library/external/openvino/runtime/passes/graph_rewrite.py +33 -0
  141. intel_npu_acceleration_library/external/openvino/runtime/passes/manager.py +26 -0
  142. intel_npu_acceleration_library/external/openvino/runtime/properties/__init__.py +40 -0
  143. intel_npu_acceleration_library/external/openvino/runtime/properties/hint/__init__.py +25 -0
  144. intel_npu_acceleration_library/external/openvino/runtime/utils/__init__.py +7 -0
  145. intel_npu_acceleration_library/external/openvino/runtime/utils/broadcasting.py +44 -0
  146. intel_npu_acceleration_library/external/openvino/runtime/utils/data_helpers/__init__.py +8 -0
  147. intel_npu_acceleration_library/external/openvino/runtime/utils/data_helpers/data_dispatcher.py +447 -0
  148. intel_npu_acceleration_library/external/openvino/runtime/utils/data_helpers/wrappers.py +148 -0
  149. intel_npu_acceleration_library/external/openvino/runtime/utils/decorators.py +156 -0
  150. intel_npu_acceleration_library/external/openvino/runtime/utils/input_validation.py +133 -0
  151. intel_npu_acceleration_library/external/openvino/runtime/utils/node_factory.py +127 -0
  152. intel_npu_acceleration_library/external/openvino/runtime/utils/reduction.py +25 -0
  153. intel_npu_acceleration_library/external/openvino/runtime/utils/types.py +175 -0
  154. intel_npu_acceleration_library/external/openvino/tools/__init__.py +4 -0
  155. intel_npu_acceleration_library/external/openvino/tools/benchmark/__init__.py +3 -0
  156. intel_npu_acceleration_library/external/openvino/tools/benchmark/benchmark.py +186 -0
  157. intel_npu_acceleration_library/external/openvino/tools/benchmark/main.py +695 -0
  158. intel_npu_acceleration_library/external/openvino/tools/benchmark/parameters.py +199 -0
  159. intel_npu_acceleration_library/external/openvino/tools/benchmark/utils/__init__.py +3 -0
  160. intel_npu_acceleration_library/external/openvino/tools/benchmark/utils/constants.py +26 -0
  161. intel_npu_acceleration_library/external/openvino/tools/benchmark/utils/inputs_filling.py +482 -0
  162. intel_npu_acceleration_library/external/openvino/tools/benchmark/utils/logging.py +8 -0
  163. intel_npu_acceleration_library/external/openvino/tools/benchmark/utils/statistics_report.py +296 -0
  164. intel_npu_acceleration_library/external/openvino/tools/benchmark/utils/utils.py +836 -0
  165. intel_npu_acceleration_library/external/openvino/tools/ovc/__init__.py +20 -0
  166. intel_npu_acceleration_library/external/openvino/tools/ovc/__main__.py +10 -0
  167. intel_npu_acceleration_library/external/openvino/tools/ovc/cli_parser.py +633 -0
  168. intel_npu_acceleration_library/external/openvino/tools/ovc/convert.py +102 -0
  169. intel_npu_acceleration_library/external/openvino/tools/ovc/convert_data_type.py +82 -0
  170. intel_npu_acceleration_library/external/openvino/tools/ovc/convert_impl.py +550 -0
  171. intel_npu_acceleration_library/external/openvino/tools/ovc/environment_setup_utils.py +50 -0
  172. intel_npu_acceleration_library/external/openvino/tools/ovc/error.py +49 -0
  173. intel_npu_acceleration_library/external/openvino/tools/ovc/get_ov_update_message.py +16 -0
  174. intel_npu_acceleration_library/external/openvino/tools/ovc/help.py +45 -0
  175. intel_npu_acceleration_library/external/openvino/tools/ovc/logger.py +91 -0
  176. intel_npu_acceleration_library/external/openvino/tools/ovc/main.py +40 -0
  177. intel_npu_acceleration_library/external/openvino/tools/ovc/moc_frontend/__init__.py +2 -0
  178. intel_npu_acceleration_library/external/openvino/tools/ovc/moc_frontend/analysis.py +46 -0
  179. intel_npu_acceleration_library/external/openvino/tools/ovc/moc_frontend/check_config.py +57 -0
  180. intel_npu_acceleration_library/external/openvino/tools/ovc/moc_frontend/extractor.py +447 -0
  181. intel_npu_acceleration_library/external/openvino/tools/ovc/moc_frontend/jax_frontend_utils.py +19 -0
  182. intel_npu_acceleration_library/external/openvino/tools/ovc/moc_frontend/layout_utils.py +73 -0
  183. intel_npu_acceleration_library/external/openvino/tools/ovc/moc_frontend/moc_emit_ir.py +32 -0
  184. intel_npu_acceleration_library/external/openvino/tools/ovc/moc_frontend/offline_transformations.py +107 -0
  185. intel_npu_acceleration_library/external/openvino/tools/ovc/moc_frontend/paddle_frontend_utils.py +83 -0
  186. intel_npu_acceleration_library/external/openvino/tools/ovc/moc_frontend/pipeline.py +298 -0
  187. intel_npu_acceleration_library/external/openvino/tools/ovc/moc_frontend/preprocessing.py +220 -0
  188. intel_npu_acceleration_library/external/openvino/tools/ovc/moc_frontend/pytorch_frontend_utils.py +214 -0
  189. intel_npu_acceleration_library/external/openvino/tools/ovc/moc_frontend/shape_utils.py +109 -0
  190. intel_npu_acceleration_library/external/openvino/tools/ovc/moc_frontend/type_utils.py +82 -0
  191. intel_npu_acceleration_library/external/openvino/tools/ovc/ovc.py +13 -0
  192. intel_npu_acceleration_library/external/openvino/tools/ovc/telemetry_params.py +6 -0
  193. intel_npu_acceleration_library/external/openvino/tools/ovc/telemetry_stub.py +28 -0
  194. intel_npu_acceleration_library/external/openvino/tools/ovc/telemetry_utils.py +118 -0
  195. intel_npu_acceleration_library/external/openvino/tools/ovc/utils.py +196 -0
  196. intel_npu_acceleration_library/external/openvino/tools/ovc/version.py +80 -0
  197. intel_npu_acceleration_library/external/openvino/torch/__init__.py +5 -0
  198. intel_npu_acceleration_library/external/openvino/utils.py +115 -0
  199. intel_npu_acceleration_library/functional/__init__.py +8 -0
  200. intel_npu_acceleration_library/functional/scaled_dot_product_attention.py +47 -0
  201. intel_npu_acceleration_library/lib/Release/cache.json +113732 -0
  202. intel_npu_acceleration_library/lib/Release/intel_npu_acceleration_library.dll +0 -0
  203. intel_npu_acceleration_library/lib/Release/openvino.dll +0 -0
  204. intel_npu_acceleration_library/lib/Release/openvino_auto_batch_plugin.dll +0 -0
  205. intel_npu_acceleration_library/lib/Release/openvino_auto_plugin.dll +0 -0
  206. intel_npu_acceleration_library/lib/Release/openvino_c.dll +0 -0
  207. intel_npu_acceleration_library/lib/Release/openvino_hetero_plugin.dll +0 -0
  208. intel_npu_acceleration_library/lib/Release/openvino_intel_cpu_plugin.dll +0 -0
  209. intel_npu_acceleration_library/lib/Release/openvino_intel_gpu_plugin.dll +0 -0
  210. intel_npu_acceleration_library/lib/Release/openvino_intel_npu_plugin.dll +0 -0
  211. intel_npu_acceleration_library/lib/Release/openvino_ir_frontend.dll +0 -0
  212. intel_npu_acceleration_library/lib/Release/openvino_onnx_frontend.dll +0 -0
  213. intel_npu_acceleration_library/lib/Release/openvino_paddle_frontend.dll +0 -0
  214. intel_npu_acceleration_library/lib/Release/openvino_pytorch_frontend.dll +0 -0
  215. intel_npu_acceleration_library/lib/Release/openvino_tensorflow_frontend.dll +0 -0
  216. intel_npu_acceleration_library/lib/Release/openvino_tensorflow_lite_frontend.dll +0 -0
  217. intel_npu_acceleration_library/lib/Release/tbb12.dll +0 -0
  218. intel_npu_acceleration_library/lib/Release/tbb12_debug.dll +0 -0
  219. intel_npu_acceleration_library/lib/Release/tbbbind_2_5.dll +0 -0
  220. intel_npu_acceleration_library/lib/Release/tbbbind_2_5_debug.dll +0 -0
  221. intel_npu_acceleration_library/lib/Release/tbbmalloc.dll +0 -0
  222. intel_npu_acceleration_library/lib/Release/tbbmalloc_debug.dll +0 -0
  223. intel_npu_acceleration_library/lib/Release/tbbmalloc_proxy.dll +0 -0
  224. intel_npu_acceleration_library/lib/Release/tbbmalloc_proxy_debug.dll +0 -0
  225. intel_npu_acceleration_library/modelling.py +150 -0
  226. intel_npu_acceleration_library/nn/__init__.py +20 -0
  227. intel_npu_acceleration_library/nn/autograd.py +68 -0
  228. intel_npu_acceleration_library/nn/conv.py +257 -0
  229. intel_npu_acceleration_library/nn/functional.py +1207 -0
  230. intel_npu_acceleration_library/nn/linear.py +162 -0
  231. intel_npu_acceleration_library/nn/llm.py +417 -0
  232. intel_npu_acceleration_library/nn/module.py +393 -0
  233. intel_npu_acceleration_library/optimizations.py +157 -0
  234. intel_npu_acceleration_library/quantization.py +174 -0
@@ -0,0 +1,60 @@
1
#
# Copyright © 2024 Intel Corporation
# SPDX-License-Identifier: Apache 2.0
#

from intel_npu_acceleration_library.backend.factory import NNFactory
import numpy as np


class Linear(NNFactory):
    """Linear class, computing a matrix matrix multiplication with weights prefetching."""

    def __init__(
        self,
        inC: int,
        outC: int,
        batch: int,
        profile: bool = False,
        device: str = "NPU",
    ):
        """Initialize the Linear class.

        Args:
            inC (int): input channels
            outC (int): output channels
            batch (int): batch
            profile (bool): Enable/Disable profiling. Defaults to False.
            device (str): Target device, default to "NPU".
        """
        super().__init__(profile, device)
        self.inC, self.outC = inC, outC
        self.batch = batch
        # Build the compute graph: a (batch, inC) input through a bias-less
        # linear layer, then compile it for the target device.
        input = self.parameter((self.batch, self.inC))
        _ = self.linear(input, outC, inC, bias=False)
        self.compile()

    def run(self, X: np.ndarray, W: np.ndarray, op_id: str) -> np.ndarray:
        """Run the layer: X * W^T.

        Args:
            X (np.ndarray): lhs operator, expected shape (batch, inC)
            W (np.ndarray): rhs operator, expected shape (outC, inC)
            op_id (str): operation id

        Raises:
            RuntimeError: Input or weight tensor shape mismatch

        Returns:
            np.ndarray: result
        """
        if not (X.shape[0] == self.batch and X.shape[1] == self.inC):
            raise RuntimeError(
                f"Input shape {X.shape} different from expected one {(self.batch, self.inC)}"
            )
        # BUG FIX: the original re-checked X here, so an ill-shaped weight
        # tensor slipped through while the message blamed the weights.
        # Validate W against the (outC, inC) layout the graph was built with.
        if not (W.shape[0] == self.outC and W.shape[1] == self.inC):
            raise RuntimeError(
                f"Weight shape {W.shape} different from expected one {(self.outC, self.inC)}"
            )

        return super().run(X, W, op_id=op_id)
@@ -0,0 +1,59 @@
1
#
# Copyright © 2024 Intel Corporation
# SPDX-License-Identifier: Apache 2.0
#

from intel_npu_acceleration_library.backend.factory import NNFactory
import numpy as np


class MatMul(NNFactory):
    """MatMul class, computing a matrix matrix multiplication."""

    def __init__(
        self,
        inC: int,
        outC: int,
        batch: int,
        profile: bool = False,
        device: str = "NPU",
    ):
        """Initialize the MatMul class.

        Args:
            inC (int): input channels
            outC (int): output channels
            batch (int): batch
            profile (bool): Enable/Disable profiling. Defaults to False.
            device (str): Target device, default to "NPU".
        """
        super().__init__(profile, device)
        self.inC, self.outC = inC, outC
        self.batch = batch
        # Build the compute graph: a (batch, inC) input through a bias-less
        # linear layer (i.e. a plain matmul with the weight), then compile.
        input = self.parameter((self.batch, self.inC))
        _ = self.linear(input, outC, inC, bias=False)
        self.compile()

    def run(self, X: np.ndarray, W: np.ndarray) -> np.ndarray:
        """Run the layer: X * W^T.

        Args:
            X (np.ndarray): lhs operator, expected shape (batch, inC)
            W (np.ndarray): rhs operator, expected shape (outC, inC)

        Raises:
            RuntimeError: Input or weight tensor shape mismatch

        Returns:
            np.ndarray: result
        """
        if not (X.shape[0] == self.batch and X.shape[1] == self.inC):
            raise RuntimeError(
                f"Input shape {X.shape} different from expected one {(self.batch, self.inC)}"
            )
        # BUG FIX: the original re-checked X here, so an ill-shaped weight
        # tensor slipped through while the message blamed the weights.
        # Validate W against the (outC, inC) layout the graph was built with.
        if not (W.shape[0] == self.outC and W.shape[1] == self.inC):
            raise RuntimeError(
                f"Weight shape {W.shape} different from expected one {(self.outC, self.inC)}"
            )

        return super().run(X, W)
@@ -0,0 +1,58 @@
1
#
# Copyright © 2024 Intel Corporation
# SPDX-License-Identifier: Apache 2.0
#

from intel_npu_acceleration_library.backend.factory import NNFactory
from typing import Optional, Sequence


class MLP(NNFactory):
    """MLP class: builds a two-projection perceptron graph (up-projection, activation, down-projection) on the NPU."""

    def __init__(
        self,
        input_shape: Sequence[int],
        intermediate_size: int,
        activation: str = "swiglu",
        bias: Optional[bool] = False,
        profile: bool = False,
        device: str = "NPU",
        **additional_args
    ):
        """Initialize the MLP class.

        Args:
            input_shape (Sequence[int]): input shape channels, unpacked as (batch, hidden_size)
            intermediate_size (int): intermediate_size
            activation (str): activation function to use; must name a method of
                NNFactory (e.g. "swiglu", "clamp", "elu", "grn", "relu", ...)
            bias (Optional[bool], optional): Enable/Disable bias. Defaults to False.
            profile (bool): Enable/Disable profiling. Defaults to False.
            device (str): Target device, default to "NPU".
            additional_args: additional arguments forwarded to the activation:
                "min"/"max" for clamp, "alpha" for elu (default 1.0),
                "grn_bias" for grn.
        """
        super().__init__(profile, device)
        self.intermediate_size = intermediate_size
        self.batch, self.hidden_size = input_shape
        input = self.parameter((self.batch, self.hidden_size))

        # Up-projection: hidden_size -> intermediate_size.
        mm1 = self.linear(input, self.intermediate_size, self.hidden_size, bias=bias)

        if activation == "swiglu":
            # swiglu adds a second (gate) projection: swish(mm1) * mm2.
            mm2 = self.linear(input, self.intermediate_size, self.hidden_size, bias=bias)  # type: ignore[attr-defined]
            mm1 = self.eltwise_mul(self.swish(mm1), mm2)  # type: ignore[attr-defined]
        elif activation == "clamp":
            atc_fn = getattr(self, activation)
            mm1 = atc_fn(mm1, additional_args.get("min"), additional_args.get("max"))
        elif activation == "elu":
            atc_fn = getattr(self, activation)
            mm1 = atc_fn(mm1, additional_args.get("alpha", 1.0))
        elif activation == "grn":
            atc_fn = getattr(self, activation)
            mm1 = atc_fn(mm1, additional_args.get("grn_bias"))
        else:
            # Every other activation is looked up by name and takes no extra args.
            atc_fn = getattr(self, activation)
            mm1 = atc_fn(mm1)

        # Down-projection back to hidden_size, then compile the whole graph.
        _ = self.linear(mm1, self.hidden_size, self.intermediate_size, bias=bias)
        self.compile()
@@ -0,0 +1,142 @@
1
#
# Copyright © 2024 Intel Corporation
# SPDX-License-Identifier: Apache 2.0
#

from dataclasses import dataclass
from functools import lru_cache
from typing import List, Any, Sequence
import ctypes


@dataclass(frozen=True)
class SupportedOp:
    """A class for supported runtime OPs in the NPU.

    Attrs:
        name (str): Operation name
        inputs (int): Number of inputs
        parameters (Sequence[Any]): Optional parameters type.
    """

    name: str
    inputs: int
    parameters: Sequence[Any] = ()


@lru_cache(maxsize=None)
def get_supported_ops() -> List[SupportedOp]:
    """Generate a list of supported operations.

    Cached: the table is built once and the same list object is returned on
    every subsequent call.

    Returns:
        List[SupportedOp]: list of supported NPU operations
    """
    supported_ops = [
        SupportedOp(name="result", inputs=1),
        SupportedOp(name="matmul", inputs=2, parameters=[ctypes.c_bool, ctypes.c_bool]),
        SupportedOp(name="eltwise_add", inputs=2),
        SupportedOp(name="eltwise_mul", inputs=2),
        SupportedOp(name="eltwise_div", inputs=2),
        SupportedOp(name="abs_act", inputs=1),
        SupportedOp(name="acos_act", inputs=1),
        SupportedOp(name="asin_act", inputs=1),
        SupportedOp(name="atan_act", inputs=1),
        SupportedOp(name="ceiling", inputs=1),
        SupportedOp(
            name="clamp", inputs=1, parameters=[ctypes.c_float, ctypes.c_float]
        ),
        SupportedOp(name="cos_act", inputs=1),
        SupportedOp(name="cosh_act", inputs=1),
        SupportedOp(name="erf_act", inputs=1),
        SupportedOp(name="elu", inputs=1, parameters=[ctypes.c_float]),
        SupportedOp(name="exp_act", inputs=1),
        SupportedOp(name="floor_act", inputs=1),
        SupportedOp(name="grn", inputs=1, parameters=[ctypes.c_float]),
        SupportedOp(name="gelu", inputs=1),
        SupportedOp(name="gelu_erf", inputs=1),
        SupportedOp(name="log_act", inputs=1),
        SupportedOp(name="negative", inputs=1),
        SupportedOp(name="relu", inputs=1),
        SupportedOp(name="sigmoid", inputs=1),
        SupportedOp(name="sign", inputs=1),
        SupportedOp(name="sin_act", inputs=1),
        SupportedOp(name="sinh_act", inputs=1),
        SupportedOp(name="sqrt_act", inputs=1),
        SupportedOp(name="tan_act", inputs=1),
        SupportedOp(name="tanh_act", inputs=1),
        SupportedOp(name="acosh_act", inputs=1),
        SupportedOp(name="asinh_act", inputs=1),
        SupportedOp(name="atanh_act", inputs=1),
        SupportedOp(name="hswish", inputs=1),
        SupportedOp(name="mish", inputs=1),
        SupportedOp(name="softplus", inputs=1),
        SupportedOp(name="hsigmoid", inputs=1),
        SupportedOp(name="round_act", inputs=1),
        SupportedOp(name="softsign", inputs=1),
        SupportedOp(name="softmax", inputs=1, parameters=[ctypes.c_int]),
        SupportedOp(name="swish", inputs=1),
        SupportedOp(name="convert_to_fp16", inputs=1),
        SupportedOp(name="convert_to_fp32", inputs=1),
        SupportedOp(name="convert_to_int32", inputs=1),
        SupportedOp(
            name="scaled_dot_product_attention",
            inputs=4,
            parameters=[ctypes.c_bool],
        ),
        SupportedOp(
            name="scaled_dot_product_attention_simple",
            inputs=3,
            parameters=[ctypes.c_bool],
        ),
        SupportedOp(
            name="normL2",
            inputs=2,
            parameters=[ctypes.c_float],
        ),
        SupportedOp(
            name="gather",
            inputs=3,
            parameters=[ctypes.c_int],
        ),
        SupportedOp(name="reshape", inputs=2, parameters=[ctypes.c_bool, ctypes.c_int]),
        SupportedOp(name="transpose", inputs=2),
        SupportedOp(name="squeeze", inputs=1),
        SupportedOp(name="unsqueeze", inputs=2),
        SupportedOp(
            name="concat",
            inputs=2,
            parameters=[ctypes.c_int64],
        ),
        SupportedOp(
            name="reduce_max",
            inputs=2,
            parameters=[ctypes.c_bool],
        ),
        SupportedOp(
            name="reduce_mean",
            inputs=2,
            parameters=[ctypes.c_bool],
        ),
        SupportedOp(
            name="reduce_min",
            inputs=2,
            parameters=[ctypes.c_bool],
        ),
        SupportedOp(
            name="reduce_prod",
            inputs=2,
            parameters=[ctypes.c_bool],
        ),
        SupportedOp(
            name="reduce_sum",
            inputs=2,
            parameters=[ctypes.c_bool],
        ),
        SupportedOp(name="adaptive_avg_pool", inputs=2),
        SupportedOp(name="adaptive_max_pool", inputs=2),
        SupportedOp(name="power", inputs=2),
        SupportedOp(name="broadcast", inputs=2),
        SupportedOp(name="log_softmax", inputs=1, parameters=[ctypes.c_int64]),
        SupportedOp(name="rotate_half", inputs=1),
    ]
    return supported_ops
@@ -0,0 +1,75 @@
1
+ #
2
+ # Copyright © 2024 Intel Corporation
3
+ # SPDX-License-Identifier: Apache 2.0
4
+ #
5
+
6
+ from intel_npu_acceleration_library.backend.factory import NNFactory
7
+ import numpy as np
8
+
9
+
10
class QLinear(NNFactory):
    """Quantized Linear class, computing a matrix matrix multiplication with weights prefetching."""

    def __init__(
        self,
        inC: int,
        outC: int,
        batch: int,
        profile: bool = False,
        device: str = "NPU",
        dtype: np.dtype = np.int8,
        asym: bool = False,
    ):
        """Initialize the QLinear class.

        Args:
            inC (int): input channels
            outC (int): output channels
            batch (int): batch
            profile (bool): Enable/Disable profiling. Defaults to False.
            device (str): Target device, default to "NPU".
            dtype (np.dtype): weights datatype. Defaults to np.int8.
            asym (bool): use asymmetric quantization (weights come with a
                zero point, passed to `run`). Defaults to False.
        """
        super().__init__(profile, device)
        self.inC, self.outC = inC, outC
        self.batch = batch
        self.asym = asym

        input = self.parameter((self.batch, self.inC))
        _ = self.linear(input, outC, inC, bias=False, wt_dtype=dtype, asym=asym)
        self.compile()

    def run(
        self,
        X: np.ndarray,
        W: np.ndarray,
        scale: np.ndarray,
        zero: np.ndarray = None,
        op_id: str = None,
    ) -> np.ndarray:
        """Run the layer: $X * (W * S)^T$ .

        Args:
            X (np.ndarray): activation
            W (np.ndarray): quantized weights
            scale (np.ndarray): quantization scale
            zero (np.ndarray): quantization zero point, used when asym=True.
                Defaults to None.
            op_id (str): operation id

        Raises:
            RuntimeError: Input, weights or scale shape mismatch

        Returns:
            np.ndarray: result
        """
        if not (X.shape[0] == self.batch and X.shape[1] == self.inC):
            raise RuntimeError(
                f"Input shape {X.shape} different from expected one {(self.batch, self.inC)}"
            )
        # Bugfix: validate the weight tensor itself — the original code
        # re-checked X here, so malformed weights were never caught.
        # uint8 (int4) weights are packed two values per byte, hence half
        # the columns.
        expected_w_cols = self.inC // 2 if W.dtype == np.uint8 else self.inC
        if W.shape != (self.outC, expected_w_cols):
            raise RuntimeError(
                f"Weight shape {W.shape} different from expected one {(self.outC, self.inC)}"
            )
        # Bugfix: validate the scale tensor (the original re-checked X and
        # printed W's shape in the message).
        if scale.shape != (self.outC, 1):
            raise RuntimeError(
                f"Scale shape {scale.shape} different from expected one {(self.outC, 1)}"
            )
        if not self.asym:
            return super().run(X, (W, scale), op_id=op_id)
        else:
            return super().run(X, (W, scale, zero), op_id=op_id)
@@ -0,0 +1,66 @@
1
+ #
2
+ # Copyright © 2024 Intel Corporation
3
+ # SPDX-License-Identifier: Apache 2.0
4
+ #
5
+
6
+ from intel_npu_acceleration_library.backend.factory import NNFactory
7
+ import numpy as np
8
+
9
+
10
class QMatMul(NNFactory):
    """Quantized Linear class, computing a matrix matrix multiplication."""

    def __init__(
        self,
        inC: int,
        outC: int,
        batch: int,
        profile: bool = False,
        device: str = "NPU",
        dtype: np.dtype = np.int8,
    ):
        """Initialize the QMatmul class.

        Args:
            inC (int): input channels
            outC (int): output channels
            batch (int): batch
            profile (bool): Enable/Disable profiling. Defaults to False.
            device (str): Target device, default to "NPU".
            dtype (np.dtype): weights datatype. Defaults to np.int8.
        """
        super().__init__(profile, device)
        self.inC, self.outC = inC, outC
        self.batch = batch
        input = self.parameter((self.batch, self.inC))
        _ = self.linear(input, outC, inC, bias=False, wt_dtype=dtype)
        self.compile()

    def run(self, X: np.ndarray, W: np.ndarray, scale: np.ndarray) -> np.ndarray:
        """Run the layer: X * (W * S)^T.

        Args:
            X (np.ndarray): activation
            W (np.ndarray): quantized weights
            scale (np.ndarray): quantization scale

        Raises:
            RuntimeError: Input, weights or scale shape mismatch

        Returns:
            np.ndarray: result
        """
        if not (X.shape[0] == self.batch and X.shape[1] == self.inC):
            raise RuntimeError(
                f"Input shape {X.shape} different from expected one {(self.batch, self.inC)}"
            )
        # Bugfix: validate the weight tensor itself — the original code
        # re-checked X here, so malformed weights were never caught.
        # uint8 (int4) weights are packed two values per byte, hence half
        # the columns.
        expected_w_cols = self.inC // 2 if W.dtype == np.uint8 else self.inC
        if W.shape != (self.outC, expected_w_cols):
            raise RuntimeError(
                f"Weight shape {W.shape} different from expected one {(self.outC, self.inC)}"
            )
        # Bugfix: validate the scale tensor (the original re-checked X and
        # printed W's shape in the message).
        if scale.shape != (self.outC, 1):
            raise RuntimeError(
                f"Scale shape {scale.shape} different from expected one {(self.outC, 1)}"
            )

        return super().run(X, (W, scale))
@@ -0,0 +1,215 @@
1
+ #
2
+ # Copyright © 2024 Intel Corporation
3
+ # SPDX-License-Identifier: Apache 2.0
4
+ #
5
+
6
+ from intel_npu_acceleration_library.backend import Linear, QLinear
7
+ from intel_npu_acceleration_library.backend import MatMul, QMatMul
8
+ from intel_npu_acceleration_library.backend import NNFactory
9
+ from torch.profiler import record_function
10
+ from typing import Optional, Any, List, Dict, Deque, Union
11
+ from functools import partial
12
+ from collections import deque
13
+ import numpy as np
14
+ import torch
15
+
16
# Module-level cache of compiled NPU models, keyed by an operation
# shape/dtype signature string; each entry is a deque of model replicas
# rotated round-robin by run_matmul / run_factory.
_model_cache: Dict[str, Deque[NNFactory]] = {}
17
+
18
+
19
def clear_cache():
    """Drop every cached compiled model, forcing recompilation on next use."""
    global _model_cache
    _model_cache = {}
23
+
24
+
25
@torch.no_grad()
def run_matmul(
    x: torch.Tensor,
    weights: torch.Tensor,
    scale: Optional[torch.Tensor] = None,
    zero: Optional[torch.Tensor] = None,
    op_id: Optional[str] = None,
) -> torch.Tensor:
    """Run a matmul operation. Depending on the datatype of the weights it runs a float or quantized operation.

    Args:
        x (torch.Tensor): Activation tensor. Its dtype must be torch.float16
        weights (torch.Tensor): Weights tensor. Its dtype can be torch.float16 or torch.int8
        scale (Optional[torch.Tensor], optional): Quantization scale. If weights.dtype == torch.int8 then it must be set. Defaults to None.
        zero (Optional[torch.Tensor], optional): Quantization zero for asym_int4. If weights.dtype == torch.uint8 and use asym_int4 then it must be set and op_id must be provided. Defaults to None.
        op_id (Optional[str], optional): Operation ID. Defaults to None.

    Raises:
        RuntimeError: Unsupported weights datatype, missing scale for quantized
            weights, asymmetric quantization without op_id, unsupported
            activation dtype, or input rank < 2.

    Returns:
        torch.Tensor: result
    """
    global _model_cache

    outC, inC = weights.shape[-2:]

    if weights.dtype == torch.uint8:
        # Int4 weights are compressed two values per byte, so the real
        # input channel count is twice the stored width.
        inC *= 2

    # Set tensors as contiguous in memory
    x = set_contiguous(x)
    weights = set_contiguous(weights)
    if len(weights.shape) > 2:
        weights = weights.view([-1, weights.shape[-1]])

    if weights.dtype.is_floating_point:
        op_class = Linear if op_id is not None else MatMul
        create_op = op_class
        op_args = [weights.numpy()]
    elif weights.dtype in (torch.int8, torch.uint8):
        # Single check here — the original duplicated this test with two
        # different error messages; we keep the more informative one.
        if scale is None:
            raise RuntimeError(
                f"Quantized matmul (weights dtype == {weights.dtype}) requires scale (scale = {scale})"
            )
        np_dtype = np.int8 if weights.dtype == torch.int8 else np.uint8
        if op_id is not None:
            op_class = QLinear
            create_op = partial(op_class, dtype=np_dtype, asym=(zero is not None))
        else:
            # Bugfix: QMatMul.__init__ has no `asym` parameter — the original
            # unconditionally passed asym=..., so every quantized matmul with
            # op_id=None raised TypeError. Reject asym explicitly instead.
            if zero is not None:
                raise RuntimeError(
                    "Asymmetric quantized matmul requires an op_id (QLinear backend)"
                )
            op_class = QMatMul
            create_op = partial(op_class, dtype=np_dtype)
        if zero is None:
            op_args = [weights.numpy(), scale.numpy()]
        else:
            op_args = [weights.numpy(), scale.numpy(), zero.numpy()]
    else:
        raise RuntimeError(f"Unsupported dtype for weights {weights.dtype}")

    if not x.dtype.is_floating_point:
        raise RuntimeError(f"Unsupported dtype for activation {x.dtype}")

    # Use or not op_id depending on the class used
    op_kwargs = {"op_id": op_id} if op_id else {}

    original_input_shape = x.shape
    expected_output_shape = list(original_input_shape[:-1]) + [outC]

    if not (len(x.shape) >= 2):
        # Typo fix: "must me" -> "must be"
        raise RuntimeError(f"Input shape {x.shape} must be >= 2")

    # Reshape input
    input_dtype = x.dtype
    x = x.to(torch.float16) if input_dtype != torch.float16 else x
    if len(x.shape) > 2 or x.shape[-1] != inC:
        x = x.view([-1, inC])
    x_np = x.numpy()

    batch = x_np.shape[0]

    # Bugfix: the cache key now includes the weight dtype and the asym flag.
    # Previously int8 vs uint8 (and sym vs asym) models with identical
    # dimensions collided on the same key and reused the wrong compiled model.
    key = (
        f"{op_class.__name__}_{batch}_{inC}_x_{outC}_{inC}_{x_np.dtype}"
        f"_{weights.dtype}_{zero is not None}"
    )
    models = _model_cache.get(key, None)

    if models is None:
        _model_cache[key] = deque([create_op(inC, outC, batch)])
    elif len(models) < 1:
        _model_cache[key].append(create_op(inC, outC, batch))
    else:
        # Round-robin over the available replicas
        _model_cache[key].rotate(1)

    # Get the model
    model = _model_cache[key][0]

    profiling_name = "matvec" if batch == 1 else "matmul"
    with record_function(f"npu_{profiling_name}_{key}"):
        ret = model.run(x_np, *op_args, **op_kwargs)

    return adapt_output_tensor(ret, expected_output_shape, input_dtype)
124
+
125
+
126
def adapt_output_tensor(
    output: np.ndarray, original_shape: torch.Size, input_dtype: torch.dtype
) -> torch.Tensor:
    """Adapt the output tensor to the original shape and dtype.

    Wraps the backend's numpy result in a torch tensor, reshapes it to
    ``original_shape`` when needed, and casts it back to ``input_dtype``.

    Args:
        output (np.ndarray): output tensor
        original_shape (torch.Size): original shape
        input_dtype (torch.dtype): input dtype

    Returns:
        torch.Tensor: output tensor
    """
    tensor = torch.from_numpy(output)
    if tensor.shape != original_shape:
        tensor = tensor.view(original_shape)
    # Copy is mandatory: the backend may reuse the same underlying buffer.
    return tensor.to(input_dtype, copy=True)
144
+
145
+
146
def set_contiguous(tensor: torch.Tensor) -> torch.Tensor:
    """Set tensor to be contiguous in memory.

    Args:
        tensor (torch.Tensor): input tensor

    Returns:
        torch.Tensor: output, contiguous tensor
    """
    # Already-contiguous tensors are returned unchanged (no copy).
    return tensor if tensor.is_contiguous() else tensor.contiguous()
158
+
159
+
160
@torch.no_grad()
def run_factory(
    x: Union[torch.Tensor, List[torch.Tensor]],
    weights: List[torch.Tensor],
    backend_cls: Any,
    op_id: Optional[str] = None,
    replica: int = 1,
) -> torch.Tensor:
    """Run a factory operation. Depending on the datatype of the weights it runs a float or quantized operation.

    Args:
        x (Union[torch.Tensor, List[torch.Tensor]]): Activation tensor(s). Its dtype must be torch.float16
        weights (List[torch.Tensor]): Weights tensor(s). Their dtype can be torch.float16 or torch.int8
        backend_cls (Any): Backend class to run (a functools.partial, accessed via `.func`)
        op_id (Optional[str], optional): Operation ID. Defaults to None.
        replica (int, optional): Number of model replicas to pre-compile for this
            signature and rotate between calls. Defaults to 1.

    Returns:
        torch.Tensor: result
    """
    global _model_cache

    # Use or not op_id depending on the class used
    op_kwargs = {"op_id": op_id} if op_id else {}

    if not isinstance(x, (list, tuple)):
        x = [x]

    # Cast activations and weights to contiguous fp16 numpy arrays
    input_dtype = x[0].dtype
    x_np = [set_contiguous(elem).to(torch.float16).numpy() for elem in x]
    op_args = [set_contiguous(w).to(torch.float16).numpy() for w in weights]

    # Cache key: backend name plus the shape/dtype of every input and weight
    shape_dtype_signature = "_".join(
        ["_".join(str(dim) for dim in t.shape) + f"_{t.dtype}" for t in x_np + op_args]
    )
    key = f"{backend_cls.func.__name__}_{shape_dtype_signature}"
    models = _model_cache.get(key, None)

    input_shapes = [elem.shape for elem in x_np]
    if models is None:
        # First sight of this signature: compile `replica` models up front
        _model_cache[key] = deque([backend_cls(*input_shapes) for _ in range(replica)])
    elif len(models) < 1:
        _model_cache[key].append(backend_cls(*input_shapes))
    else:
        # Round-robin over the available replicas
        _model_cache[key].rotate(1)

    # Get the model
    model = _model_cache[key][0]

    with record_function(f"npu_factory_mul_{key}"):
        ret = model.run(*x_np, *op_args, **op_kwargs)

    if isinstance(ret, list):
        return [adapt_output_tensor(r, r.shape, input_dtype) for r in ret]

    return adapt_output_tensor(ret, ret.shape, input_dtype)