PyPI - bigdl-core-npu - Versions diffs - 2.6.0b20250114__cp310-cp310-win_amd64.whl - Mend

bigdl-core-npu 2.6.0b20250114__cp310-cp310-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (234) hide show

intel_npu_acceleration_library/backend/base.py ADDED Viewed

@@ -0,0 +1,250 @@
+#
+# Copyright © 2024 Intel Corporation
+# SPDX-License-Identifier: Apache 2.0
+#
+from typing import Optional, List, Union, Any, Dict, Tuple, Iterable
+from intel_npu_acceleration_library.backend.bindings import lib as backend_lib
+import numpy as np
+import intel_npu_acceleration_library
+import ctypes
+import os
+def adapt_weight(w: np.ndarray) -> Tuple[np.ndarray, Tuple[int, ...]]:
+    """Adapt the weights to run on the NPU.
+    Two-dimensional arrays are returned untouched; arrays of any other rank
+    are reshaped into a single row of shape ``(1, N)``.
+    Args:
+        w (np.ndarray): weights array
+    Returns:
+        Tuple[np.ndarray, Tuple[int, ...]]: The adapted array and its shape
+    """
+    if len(w.shape) == 2:
+        return w, w.shape
+    # 1-D and higher-rank inputs get the same treatment: collapse to one row.
+    w_adapted = w.reshape((1, -1))
+    return w_adapted, w_adapted.shape
+class BaseNPUBackend:
+    """A base class that represents an abstract Matrix-Matrix operation on the NPU."""
+    def __init__(self, profile: Optional[bool] = False) -> None:
+        """Initialize class profiling.
+        Args:
+            profile (Optional[bool], optional): Enable/Disable NPU profiling. Defaults to False.
+        """
+        if profile:
+            # Profiling is configured through process-wide environment variables.
+            os.environ["NPU_PRINT_PROFILING"] = "JSON"
+            os.environ["NPU_PROFILING_OUTPUT_FILE"] = "profiling.json"
+            os.environ["NPU_PROFILING_VERBOSITY"] = "HIGH"
+        # Native factory handle; stays None until a subclass creates one.
+        self._mm: Any = None
+    def __del__(self):
+        """Deallocate and free the class from the library."""
+        # ``_mm`` is missing when ``__init__`` never ran and is still None when
+        # no native factory was created: in both cases there is nothing to free.
+        if getattr(self, "_mm", None) is None:
+            return
+        # Presumably guards against the module globals having been torn down
+        # already when the finalizer runs at interpreter exit.
+        if intel_npu_acceleration_library and hasattr(
+            backend_lib, "destroyNNFactory"
+        ):
+            backend_lib.destroyNNFactory(self._mm)
+    def save(self, path: str, compress_to_fp16: bool = True):
+        """Save the Openvino model.
+        Args:
+            path (str): the model save path
+            compress_to_fp16 (bool): whether to compress floating point weights to FP16 (default: True).
+        """
+        backend_lib.saveModel(self._mm, ctypes.c_char_p(path.encode()), compress_to_fp16)
+    def saveCompiledModel(self, path: str):
+        """Save the compiled model.
+        Args:
+            path (str): the compiled model save path
+        """
+        backend_lib.saveCompiledModel(self._mm, ctypes.c_char_p(path.encode()))
+    def serialize(self, xml_path: str, bin_path: str):
+        """Serialize the Openvino model.
+        Args:
+            xml_path (str): the model save xml path
+            bin_path (str): the model save bin path
+        """
+        backend_lib.serializeModel(self._mm, ctypes.c_char_p(xml_path.encode()), ctypes.c_char_p(bin_path.encode()))
+class BaseNPUBackendWithPrefetch(BaseNPUBackend):
+    """A base class that represents an abstract Matrix-Matrix operation on the NPU.
+    Linear type classes employ an algorithm to optimize weights prefetching
+    """
+    def __init__(self, profile: bool):
+        """Initialize class.
+        Args:
+            profile (bool): Enable/Disable NPU profiling.
+        """
+        super().__init__(profile)
+        # Hashes in insertion order; prefetchWeights walks this list cyclically.
+        self.wt_order: List[str] = []
+        # Operation hash -> native parameters handle.
+        self.wt_map: Dict[str, ctypes._Pointer] = {}
+        # Hash of the parameter set most recently handed to the factory.
+        self.loaded: Optional[str] = None
+    def load_wt_fn(self, offset, module, parameters, verify_size=False):
+        """Hand a parameter set to the native factory via ``setNNFactoryWeights``.
+        Args:
+            offset: forwarded unchanged to the native call (declared as a C int in the bindings)
+            module: the NPU backend module
+            parameters: the weights parameter class
+            verify_size: forwarded unchanged to the native call. Defaults to False.
+        """
+        backend_lib.setNNFactoryWeights(module, offset, parameters, verify_size)
+    def _add_quantized_parameter(self, param, weight) -> None:
+        """Append one quantized ``(data, scale)`` or ``(data, scale, zero)`` entry to ``param``."""
+        if len(weight) == 2:
+            # data and scale
+            data, scale = weight
+            zero = None
+        elif len(weight) == 3:
+            # for asym int4
+            data, scale, zero = weight
+        else:
+            # Without this branch ``data`` would be unbound below.
+            raise ValueError(
+                f"Quantized weights must be (data, scale) or (data, scale, zero), got {len(weight)} elements"
+            )
+        if data.dtype not in [np.int8, np.uint8]:
+            raise RuntimeError(
+                "Quantized weights needs to be in int8 or uint8 format"
+            )
+        adapted_weights, shape = adapt_weight(data)
+        adapted_weights_scale, shape_scale = adapt_weight(scale)
+        if scale.dtype == np.float16:
+            # Mixed precision matmul
+            if data.dtype == np.int8:
+                backend_lib.addIntParameter(
+                    param,
+                    adapted_weights,
+                    adapted_weights_scale,
+                    *shape,
+                    *shape_scale,
+                )
+            elif data.dtype == np.uint8 and zero is not None:
+                # asym_int4
+                adapted_weights_zero, shape_zero = adapt_weight(zero)
+                backend_lib.addAsymInt4Parameter(
+                    param,
+                    adapted_weights,
+                    adapted_weights_scale,
+                    adapted_weights_zero,
+                    *shape,
+                    *shape_scale,
+                    *shape_zero,
+                )
+            else:
+                # sym_int4
+                backend_lib.addInt4Parameter(
+                    param,
+                    adapted_weights,
+                    adapted_weights_scale,
+                    *shape,
+                    *shape_scale,
+                )
+        elif scale.dtype == np.float32:
+            # FP16 matmul with CPU conversion
+            backend_lib.addIntParameterConversion(
+                param,
+                adapted_weights,
+                adapted_weights_scale,
+                *shape,
+            )
+        else:
+            raise ValueError(f"Invalid dtype for scale: {scale.dtype}")
+    def _fill_parameters(self, param, weights) -> None:
+        """Append every entry of ``weights`` to the native parameters object ``param``."""
+        if isinstance(weights, (list, tuple)):
+            for weight in weights:
+                if isinstance(weight, (list, tuple)):
+                    self._add_quantized_parameter(param, weight)
+                    continue
+                adapted_weights, shape = adapt_weight(weight)
+                if weight.dtype == np.uint8:
+                    backend_lib.addInt4WeightParameter(
+                        param,
+                        adapted_weights,
+                        *shape,
+                    )
+                else:
+                    backend_lib.addFloatParameter(param, adapted_weights, *shape)
+        elif isinstance(weights, np.ndarray):
+            adapted_weights, shape = adapt_weight(weights)
+            backend_lib.addFloatParameter(param, adapted_weights, *shape)
+        # Any other container type leaves ``param`` empty, as before.
+    def create_parameters(
+        self, weights: Iterable[Union[np.ndarray, Tuple[np.ndarray, ...]]]
+    ) -> ctypes._Pointer:
+        """Create an operation parameter from a list of weights.
+        Each entry is either a plain array or a ``(data, scale)`` /
+        ``(data, scale, zero)`` tuple for quantized weights.
+        Args:
+            weights (Iterable[Union[np.ndarray, Tuple[np.ndarray, ...]]]): Operation parameters
+        Raises:
+            RuntimeError: Quantized weights needs to be in int8 or uint8 format
+            ValueError: Invalid dtype for scale, or a quantized tuple that does not have 2 or 3 elements
+        Returns:
+            ctypes._Pointer: an instance to the Parameters object
+        """
+        param = backend_lib.createParameters()
+        try:
+            self._fill_parameters(param, weights)
+        except Exception:
+            # Do not leak the native object when validation fails part-way.
+            backend_lib.destroyParameters(param)
+            raise
+        return param
+    def add_to_map(
+        self, wt_hash: str, weights: Iterable[Union[np.ndarray, Tuple[np.ndarray, ...]]]
+    ):
+        """Add an operation parameters to the operation hash:parameter map.
+        Args:
+            wt_hash (str): operation hash
+            weights (Iterable[Union[np.ndarray, Tuple[np.ndarray, ...]]]): Operation parameters
+        """
+        self.wt_map[wt_hash] = self.create_parameters(weights)
+        self.wt_order.append(wt_hash)
+    def setWeights(
+        self, offset: int, wt_hash: Optional[str], *args: Union[np.ndarray, Tuple[np.ndarray, ...]],
+        verify_size: bool = False
+    ) -> bool:
+        """Set the operation weights in the NPU.
+        Args:
+            offset (int): forwarded to ``load_wt_fn``
+            wt_hash (str): operation hash. If set to None force the load of the weights
+            args (Union[np.ndarray, Tuple[np.ndarray, ...]]): Variable length weights list. Can be a np array or a tuple of weight, scale in case of quantized tensors
+            verify_size (bool): forwarded to ``load_wt_fn``. Defaults to False.
+        Returns:
+            bool: Return True if the op parameters are already in the op map
+        """
+        if wt_hash is None:
+            # NOTE(review): this parameters object is not stored in ``wt_map``, so
+            # ``__del__`` never passes it to destroyParameters - confirm who frees it.
+            self.load_wt_fn(offset, self._mm, self.create_parameters(args), verify_size=verify_size)
+            return False
+        in_wt_map = wt_hash in self.wt_map
+        if wt_hash != self.loaded:
+            if not in_wt_map:
+                self.add_to_map(wt_hash, args)
+            self.load_wt_fn(offset, self._mm, self.wt_map[wt_hash], verify_size=verify_size)
+            self.loaded = wt_hash
+        return in_wt_map
+    def prefetchWeights(self, offset, verify_size: bool = False):
+        """Prefetch next operation weights.
+        The next entry is the one following ``self.loaded`` in ``wt_order``,
+        wrapping around at the end of the list.
+        Args:
+            offset: forwarded to ``load_wt_fn``
+            verify_size (bool): forwarded to ``load_wt_fn``. Defaults to False.
+        Raises:
+            ValueError: if ``self.loaded`` is not in ``wt_order`` (e.g. nothing was loaded through ``setWeights`` with a hash yet)
+        """
+        next_wt_idx = (self.wt_order.index(self.loaded) + 1) % len(self.wt_order)
+        wt_hash = self.wt_order[next_wt_idx]
+        # With a single registered entry the "next" one is already loaded.
+        if wt_hash != self.loaded:
+            self.load_wt_fn(offset, self._mm, self.wt_map[wt_hash], verify_size=verify_size)
+            self.loaded = wt_hash
+    def __del__(self):
+        """Deallocate and free the class from the library."""
+        super(BaseNPUBackendWithPrefetch, self).__del__()
+        # ``wt_map`` does not exist if ``__init__`` raised before creating it.
+        wt_map = getattr(self, "wt_map", None)
+        if not wt_map:
+            return
+        if intel_npu_acceleration_library and hasattr(
+            backend_lib, "destroyParameters"
+        ):
+            for par in wt_map.values():
+                backend_lib.destroyParameters(par)
+        # Drop the handles so a second finalizer call cannot free them twice.
+        wt_map.clear()

intel_npu_acceleration_library/backend/bindings.py ADDED Viewed

@@ -0,0 +1,383 @@
+#
+# Copyright © 2024 Intel Corporation
+# SPDX-License-Identifier: Apache 2.0
+#
+from intel_npu_acceleration_library.backend.ops import get_supported_ops
+import numpy as np
+import warnings
+import ctypes
+import sys
+import os
+# Pointer type used as the argument/return type for native object handles
+# in the bindings declared in this module.
+handler = ctypes.POINTER(ctypes.c_char)
+# numpy array argument types: each one only accepts a C-contiguous array of
+# the given dtype and number of dimensions.
+c_fp16_array = np.ctypeslib.ndpointer(dtype=np.float16, ndim=2, flags="C_CONTIGUOUS")
+c_fp32_array = np.ctypeslib.ndpointer(dtype=np.float32, ndim=2, flags="C_CONTIGUOUS")
+c_i8_array = np.ctypeslib.ndpointer(dtype=np.int8, ndim=2, flags="C_CONTIGUOUS")
+c_u8_array = np.ctypeslib.ndpointer(dtype=np.uint8, ndim=2, flags="C_CONTIGUOUS")
+# One-dimensional uint32 array, used by the bindings for shape-like arguments.
+c_u32_array = np.ctypeslib.ndpointer(dtype=np.uint32, ndim=1, flags="C_CONTIGUOUS")
+def load_library() -> ctypes.CDLL:
+    """Locate and load the Intel® NPU Acceleration Library native runtime.
+    The shared library is looked up relative to this file: ``../lib/Release``
+    on Windows and ``../lib`` on Linux.
+    Raises:
+        RuntimeError: if the platform is neither Windows nor Linux
+    Returns:
+        ctypes.CDLL: The loaded dynamic library
+    """
+    backend_dir = os.path.dirname(os.path.abspath(__file__))
+    openvino_preloaded = "openvino" in sys.modules
+    if openvino_preloaded:
+        warnings.warn(
+            "OpenVINO library is already loaded. It might interfere with NPU acceleration library if it uses an old version.",
+            stacklevel=2,
+        )
+    # Give the bundled "external" directory priority on the import path.
+    sys.path.insert(0, os.path.join(backend_dir, "..", "external"))
+    if sys.platform == "win32":
+        release_dir = os.path.join(backend_dir, "..", "lib", "Release")
+        os.environ["OPENVINO_LIB_PATHS"] = release_dir
+        os.add_dll_directory(os.path.abspath(release_dir))
+        # Load DLL into memory.
+        dll_file = os.path.join(release_dir, "intel_npu_acceleration_library.dll")
+        return ctypes.WinDLL(dll_file)
+    if sys.platform == "linux":
+        lib_dir = os.path.join(backend_dir, "..", "lib")
+        sys.path.append(lib_dir)
+        # In Linux it is required to explicitly load openvino lib
+        ctypes.CDLL(os.path.join(lib_dir, "libopenvino.so"))
+        so_file = os.path.join(lib_dir, "libintel_npu_acceleration_library.so")
+        return ctypes.CDLL(so_file)
+    raise RuntimeError(
+        f"Platform {sys.platform} is not supported for intel-npu-acceleration-library library"
+    )
+def init_common(lib: ctypes.CDLL):
+    """Declare the ctypes signatures of the runtime functions shared by all backends.
+    Args:
+        lib (ctypes.CDLL): Intel® NPU Acceleration Library runtime library
+    """
+    c_str = ctypes.c_char_p
+    c_ptr = ctypes.c_void_p
+    c_i32 = ctypes.c_int
+    # Model persistence
+    lib.saveModel.argtypes = [handler, c_str, ctypes.c_bool]
+    lib.saveCompiledModel.argtypes = [handler, c_str]
+    lib.serializeModel.argtypes = [handler, c_str, c_str]
+    # Input activations, then output activations: same signature
+    for setter in (lib.set_activation, lib.set_output):
+        setter.argtypes = [handler, c_ptr, c_i32]
+    # Run a linear layer
+    run_fn = lib.run
+    run_fn.argtypes = [handler]
+    run_fn.restype = ctypes.c_float
+    run_decoders_fn = lib.run_decoders
+    run_decoders_fn.argtypes = [ctypes.POINTER(handler), ctypes.POINTER(c_ptr), c_i32, c_i32]
+    run_decoders_fn.restype = ctypes.c_float
+    # Common destructor
+    lib.destroyNNFactory.argtypes = [handler]
+    # Device queries
+    lib.isNPUAvailable.restype = ctypes.c_bool
+    lib.getNPUDriverVersion.restype = ctypes.c_int32
+    lib.compressToI4.argtypes = [c_i8_array, c_u8_array, c_i32]
+    # Remote tensors
+    to_npu_fn = lib.to_npu
+    to_npu_fn.argtypes = [c_i32, c_u32_array, c_str, c_ptr]
+    to_npu_fn.restype = handler
+    tensor_data_fn = lib.remote_tensor_data
+    tensor_data_fn.argtypes = [handler]
+    tensor_data_fn.restype = c_ptr
+    lib.del_remote_tensor.argtypes = [handler]
+def init_network_factory(lib: ctypes.CDLL):
+    """Declare the ctypes signatures of the network factory functions.
+    Args:
+        lib (ctypes.CDLL): Intel® NPU Acceleration Library runtime library
+    """
+    i32 = ctypes.c_int
+    flag = ctypes.c_bool
+    text = ctypes.c_char_p
+    # A C int followed by a uint32 array; the pair recurs in several signatures.
+    int_and_u32 = [i32, c_u32_array]
+    def bind(name, argtypes, restype=handler):
+        # ``restype=None`` means "do not assign a restype at all".
+        fn = getattr(lib, name)
+        fn.argtypes = argtypes
+        if restype is not None:
+            fn.restype = restype
+    bind("createNNFactory", [text, flag])
+    bind("setNNFactoryWeights", [handler, i32, handler, flag], restype=None)
+    bind("op_shape_size", [handler, i32], i32)
+    bind("op_shape", [handler, i32, i32], i32)
+    bind("op_dtype", [handler, i32], i32)
+    bind("op_output_size", [handler], i32)
+    bind("parameter", [handler, i32, c_u32_array, text])
+    bind("to", [handler, handler, text])
+    bind("constant", [handler, i32, c_u32_array, text, ctypes.c_void_p])
+    bind("slice", [handler] * 5 + int_and_u32 * 2)
+    bind("simple_slice", [handler] * 5)
+    bind("compile", [handler, i32])
+    bind("get_output_tensor_shape_size", [handler, i32], i32)
+    bind("get_output_tensor_shape", [handler, i32, i32], i32)
+    bind("linear", [handler, handler, i32, i32, flag, text, text, flag, flag])
+    bind("convolution", [handler] * 4 + int_and_u32 * 4 + [i32, text])
+    bind("avg_pooling", [handler] * 2 + int_and_u32 * 4 + [flag, i32, i32])
+    bind("max_pooling", [handler] * 2 + int_and_u32 * 4 + [i32, i32])
+    bind(
+        "multi_concat",
+        [handler, ctypes.POINTER(handler), ctypes.c_uint64, ctypes.c_int64],
+    )
+    bind("variadic_split", [handler, handler, i32, c_u32_array, i32])
+    # The two dq_split_linear variants share one signature.
+    for split_name in ("dq_split_linear", "dq_split_linear_prefill"):
+        bind(
+            split_name,
+            [handler, handler, i32, i32, i32, flag, text, text, flag, flag],
+        )
+    bind("gw_linear_prefill", [handler, handler, i32, i32, i32, flag, text, text, flag])
+    # Generic ops: one leading handle, one handle per input, then the op's own parameters.
+    for op in get_supported_ops():
+        bind(op.name, [handler] * (op.inputs + 1) + list(op.parameters))
+def init_parameters(lib: ctypes.CDLL):
+    """Declare the ctypes signatures of the parameters-object functions.
+    Args:
+        lib (ctypes.CDLL): Intel® NPU Acceleration Library runtime library
+    """
+    i32 = ctypes.c_int
+    create_fn = lib.createParameters
+    create_fn.argtypes = []
+    create_fn.restype = handler
+    # Only argtypes are declared for the remaining functions.
+    signatures = (
+        ("destroyParameters", [handler]),
+        ("addFloatParameter", [handler, c_fp16_array] + [i32] * 2),
+        ("addIntParameter", [handler, c_i8_array, c_fp16_array] + [i32] * 4),
+        ("addInt4Parameter", [handler, c_u8_array, c_fp16_array] + [i32] * 4),
+        (
+            "addAsymInt4Parameter",
+            [handler, c_u8_array, c_fp16_array, c_fp16_array] + [i32] * 6,
+        ),
+        ("addIntParameterConversion", [handler, c_i8_array, c_fp32_array] + [i32] * 2),
+        ("addInt4WeightParameter", [handler, c_u8_array] + [i32] * 2),
+    )
+    for fn_name, fn_argtypes in signatures:
+        getattr(lib, fn_name).argtypes = fn_argtypes
+def initialize_bindings() -> ctypes.CDLL:
+    """Load the native runtime library and declare every c++ <-> python binding on it.
+    Returns:
+        ctypes.CDLL: the loaded library with its bindings initialized
+    """
+    runtime = load_library()
+    # Same order as before: common, network factory, parameters.
+    for declare in (init_common, init_network_factory, init_parameters):
+        declare(runtime)
+    return runtime
+# Load the runtime and declare its bindings when this module is imported;
+# other modules import this object as ``backend_lib``.
+lib = initialize_bindings()

intel_npu_acceleration_library/backend/compression.py ADDED Viewed

@@ -0,0 +1,24 @@
+#
+# Copyright © 2024 Intel Corporation
+# SPDX-License-Identifier: Apache 2.0
+#
+from intel_npu_acceleration_library.backend.bindings import lib as backend_lib
+import numpy as np
+def compress_to_i4(weights: np.ndarray) -> np.ndarray:
+    """Compress an int8 array to int4 through the native ``compressToI4`` call.
+    The output keeps the row count of the input and has half as many columns
+    (integer division).
+    Args:
+        weights (np.ndarray): input array
+    Returns:
+        np.ndarray: compressed array
+    """
+    n_rows, n_cols = weights.shape[0], weights.shape[1]
+    # NOTE(review): an odd column count is truncated by the integer division
+    # while the native call is still given the full element count - confirm
+    # that callers only pass an even number of columns.
+    packed = np.zeros((n_rows, n_cols // 2), dtype=np.uint8)
+    n_elements = np.prod(weights.shape)
+    backend_lib.compressToI4(weights, packed, n_elements)
+    return packed

intel_npu_acceleration_library/backend/convolution.py ADDED Viewed

@@ -0,0 +1,58 @@
+#
+# Copyright © 2024 Intel Corporation
+# SPDX-License-Identifier: Apache 2.0
+#
+from intel_npu_acceleration_library.backend.factory import NNFactory
+from typing import Sequence, Union
+import numpy as np
+class Convolution(NNFactory):
+    """Convolution class, building and compiling a single convolution operation."""
+    def __init__(
+        self,
+        input_shape: Sequence[int],
+        weights_shape: Sequence[int],
+        bias: bool = False,
+        strides: Union[int, Sequence[int]] = 1,
+        padding: Union[int, Sequence[int]] = 0,
+        dilation: Union[int, Sequence[int]] = 1,
+        groups: int = 1,
+        profile: bool = False,
+        device: str = "NPU",
+    ):
+        """Initialize the Convolution class.
+        Args:
+            input_shape (Sequence[int]): input shape
+            weights_shape (Sequence[int]): weights shape
+            bias (bool): Enable/Disable bias. Defaults to False.
+            strides (Union[int, Sequence[int]], optional): Strides. Defaults to 1.
+            padding (Union[int, Sequence[int]], optional): Padding. Defaults to 0.
+            dilation (Union[int, Sequence[int]], optional): Dilation. Defaults to 1.
+            groups (int, optional): Groups. Defaults to 1.
+            profile (Optional[bool], optional): Enable/Disable profiling. Defaults to False.
+            device (str): Target device, default to "NPU".
+        """
+        super().__init__(profile, device)
+        input_node = self.parameter(input_shape)
+        weights = self.parameter(weights_shape)
+        # ``bias is not None`` alone is always true for a bool, which made
+        # ``bias=False`` still create a bias input. None and False now both
+        # disable it; any other value (True, or a legacy non-bool argument)
+        # enables it, without evaluating the truth value of array-like objects.
+        if bias is not None and bias is not False:
+            bias_node = self.parameter((1, weights_shape[0], 1, 1))
+        else:
+            bias_node = None
+        _ = self.convolution(
+            input_node,
+            weights,
+            bias=bias_node,
+            strides=strides,
+            padding=padding,
+            dilation=dilation,
+            groups=groups,
+            act_dtype=np.float16,
+        )
+        self.compile()