bigdl-core-npu 2.5.0-cp311-cp311-win_amd64.whl → 2.6.0-cp311-cp311-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (146)
  1. bigdl-core-npu/__init__.py +0 -0
  2. bigdl-core-npu/common.lib +0 -0
  3. bigdl-core-npu/ggml.dll +0 -0
  4. bigdl-core-npu/ggml.lib +0 -0
  5. bigdl-core-npu/include/llamacpp/arg.h +77 -0
  6. bigdl-core-npu/include/llamacpp/common.h +563 -0
  7. bigdl-core-npu/include/llamacpp/ggml-alloc.h +76 -0
  8. bigdl-core-npu/include/llamacpp/ggml-backend.h +241 -0
  9. bigdl-core-npu/include/llamacpp/ggml.h +2679 -0
  10. bigdl-core-npu/include/llamacpp/llama.h +1234 -0
  11. bigdl-core-npu/include/llamacpp/log.h +92 -0
  12. bigdl-core-npu/include/npu/npu_common.h +119 -0
  13. bigdl-core-npu/include/npu/npu_llm.h +77 -0
  14. bigdl-core-npu/llama-cli-npu.exe +0 -0
  15. bigdl-core-npu/llama.dll +0 -0
  16. bigdl-core-npu/llama.lib +0 -0
  17. bigdl-core-npu/llm-cli.exe +0 -0
  18. bigdl-core-npu/npu_llm.dll +0 -0
  19. bigdl-core-npu/npu_llm.lib +0 -0
  20. bigdl-core-npu/zlib1.dll +0 -0
  21. bigdl_core_npu-2.6.0.data/scripts/init-llama-cpp.bat +29 -0
  22. {bigdl_core_npu-2.5.0.dist-info → bigdl_core_npu-2.6.0.dist-info}/METADATA +12 -3
  23. {bigdl_core_npu-2.5.0.dist-info → bigdl_core_npu-2.6.0.dist-info}/RECORD +146 -96
  24. {bigdl_core_npu-2.5.0.dist-info → bigdl_core_npu-2.6.0.dist-info}/WHEEL +1 -1
  25. {bigdl_core_npu-2.5.0.dist-info → bigdl_core_npu-2.6.0.dist-info}/top_level.txt +1 -0
  26. intel_npu_acceleration_library/_version.py +1 -1
  27. intel_npu_acceleration_library/backend/base.py +39 -4
  28. intel_npu_acceleration_library/backend/bindings.py +109 -5
  29. intel_npu_acceleration_library/backend/factory.py +264 -47
  30. intel_npu_acceleration_library/backend/ops.py +2 -1
  31. intel_npu_acceleration_library/backend/qlinear.py +8 -4
  32. intel_npu_acceleration_library/backend/runtime.py +7 -2
  33. intel_npu_acceleration_library/backend/tensor.py +73 -3
  34. intel_npu_acceleration_library/bigdl-core-npu/cache.json +113732 -0
  35. intel_npu_acceleration_library/bigdl-core-npu/openvino.dll +0 -0
  36. intel_npu_acceleration_library/bigdl-core-npu/openvino_auto_batch_plugin.dll +0 -0
  37. intel_npu_acceleration_library/bigdl-core-npu/openvino_auto_plugin.dll +0 -0
  38. intel_npu_acceleration_library/bigdl-core-npu/openvino_c.dll +0 -0
  39. intel_npu_acceleration_library/bigdl-core-npu/openvino_hetero_plugin.dll +0 -0
  40. intel_npu_acceleration_library/bigdl-core-npu/openvino_intel_cpu_plugin.dll +0 -0
  41. intel_npu_acceleration_library/bigdl-core-npu/openvino_intel_gpu_plugin.dll +0 -0
  42. intel_npu_acceleration_library/bigdl-core-npu/openvino_intel_npu_plugin.dll +0 -0
  43. intel_npu_acceleration_library/bigdl-core-npu/openvino_ir_frontend.dll +0 -0
  44. intel_npu_acceleration_library/bigdl-core-npu/openvino_onnx_frontend.dll +0 -0
  45. intel_npu_acceleration_library/bigdl-core-npu/openvino_paddle_frontend.dll +0 -0
  46. intel_npu_acceleration_library/bigdl-core-npu/openvino_pytorch_frontend.dll +0 -0
  47. intel_npu_acceleration_library/bigdl-core-npu/openvino_tensorflow_frontend.dll +0 -0
  48. intel_npu_acceleration_library/bigdl-core-npu/openvino_tensorflow_lite_frontend.dll +0 -0
  49. intel_npu_acceleration_library/bigdl-core-npu/tbb12.dll +0 -0
  50. intel_npu_acceleration_library/bigdl-core-npu/tbb12_debug.dll +0 -0
  51. intel_npu_acceleration_library/bigdl-core-npu/tbbbind_2_5.dll +0 -0
  52. intel_npu_acceleration_library/bigdl-core-npu/tbbbind_2_5_debug.dll +0 -0
  53. intel_npu_acceleration_library/bigdl-core-npu/tbbmalloc.dll +0 -0
  54. intel_npu_acceleration_library/bigdl-core-npu/tbbmalloc_debug.dll +0 -0
  55. intel_npu_acceleration_library/bigdl-core-npu/tbbmalloc_proxy.dll +0 -0
  56. intel_npu_acceleration_library/bigdl-core-npu/tbbmalloc_proxy_debug.dll +0 -0
  57. intel_npu_acceleration_library/device.py +2 -2
  58. intel_npu_acceleration_library/dtypes.py +34 -1
  59. intel_npu_acceleration_library/external/openvino/__init__.py +1 -0
  60. intel_npu_acceleration_library/external/openvino/_offline_transformations/__init__.py +1 -0
  61. intel_npu_acceleration_library/external/openvino/_pyopenvino.cp310-win_amd64.pyd +0 -0
  62. intel_npu_acceleration_library/external/openvino/_pyopenvino.cp311-win_amd64.pyd +0 -0
  63. intel_npu_acceleration_library/external/openvino/_pyopenvino.cp312-win_amd64.pyd +0 -0
  64. intel_npu_acceleration_library/external/openvino/_pyopenvino.cp38-win_amd64.pyd +0 -0
  65. intel_npu_acceleration_library/external/openvino/_pyopenvino.cp39-win_amd64.pyd +0 -0
  66. intel_npu_acceleration_library/external/openvino/experimental/__init__.py +14 -0
  67. intel_npu_acceleration_library/external/openvino/frontend/jax/__init__.py +15 -0
  68. intel_npu_acceleration_library/external/openvino/frontend/jax/jaxpr_decoder.py +293 -0
  69. intel_npu_acceleration_library/external/openvino/frontend/jax/passes.py +65 -0
  70. intel_npu_acceleration_library/external/openvino/frontend/jax/utils.py +182 -0
  71. intel_npu_acceleration_library/external/openvino/frontend/onnx/py_onnx_frontend.cp310-win_amd64.pyd +0 -0
  72. intel_npu_acceleration_library/external/openvino/frontend/onnx/py_onnx_frontend.cp311-win_amd64.pyd +0 -0
  73. intel_npu_acceleration_library/external/openvino/frontend/onnx/py_onnx_frontend.cp312-win_amd64.pyd +0 -0
  74. intel_npu_acceleration_library/external/openvino/frontend/onnx/py_onnx_frontend.cp38-win_amd64.pyd +0 -0
  75. intel_npu_acceleration_library/external/openvino/frontend/onnx/py_onnx_frontend.cp39-win_amd64.pyd +0 -0
  76. intel_npu_acceleration_library/external/openvino/frontend/paddle/py_paddle_frontend.cp310-win_amd64.pyd +0 -0
  77. intel_npu_acceleration_library/external/openvino/frontend/paddle/py_paddle_frontend.cp311-win_amd64.pyd +0 -0
  78. intel_npu_acceleration_library/external/openvino/frontend/paddle/py_paddle_frontend.cp312-win_amd64.pyd +0 -0
  79. intel_npu_acceleration_library/external/openvino/frontend/paddle/py_paddle_frontend.cp38-win_amd64.pyd +0 -0
  80. intel_npu_acceleration_library/external/openvino/frontend/paddle/py_paddle_frontend.cp39-win_amd64.pyd +0 -0
  81. intel_npu_acceleration_library/external/openvino/frontend/pytorch/fx_decoder.py +37 -19
  82. intel_npu_acceleration_library/external/openvino/frontend/pytorch/gptq.py +47 -6
  83. intel_npu_acceleration_library/external/openvino/frontend/pytorch/patch_model.py +28 -8
  84. intel_npu_acceleration_library/external/openvino/frontend/pytorch/py_pytorch_frontend.cp310-win_amd64.pyd +0 -0
  85. intel_npu_acceleration_library/external/openvino/frontend/pytorch/py_pytorch_frontend.cp311-win_amd64.pyd +0 -0
  86. intel_npu_acceleration_library/external/openvino/frontend/pytorch/py_pytorch_frontend.cp312-win_amd64.pyd +0 -0
  87. intel_npu_acceleration_library/external/openvino/frontend/pytorch/py_pytorch_frontend.cp38-win_amd64.pyd +0 -0
  88. intel_npu_acceleration_library/external/openvino/frontend/pytorch/py_pytorch_frontend.cp39-win_amd64.pyd +0 -0
  89. intel_npu_acceleration_library/external/openvino/frontend/pytorch/torchdynamo/backend.py +17 -5
  90. intel_npu_acceleration_library/external/openvino/frontend/pytorch/torchdynamo/op_support.py +1 -0
  91. intel_npu_acceleration_library/external/openvino/frontend/pytorch/torchdynamo/partition.py +55 -47
  92. intel_npu_acceleration_library/external/openvino/frontend/pytorch/ts_decoder.py +95 -63
  93. intel_npu_acceleration_library/external/openvino/frontend/pytorch/utils.py +12 -10
  94. intel_npu_acceleration_library/external/openvino/frontend/tensorflow/py_tensorflow_frontend.cp310-win_amd64.pyd +0 -0
  95. intel_npu_acceleration_library/external/openvino/frontend/tensorflow/py_tensorflow_frontend.cp311-win_amd64.pyd +0 -0
  96. intel_npu_acceleration_library/external/openvino/frontend/tensorflow/py_tensorflow_frontend.cp312-win_amd64.pyd +0 -0
  97. intel_npu_acceleration_library/external/openvino/frontend/tensorflow/py_tensorflow_frontend.cp38-win_amd64.pyd +0 -0
  98. intel_npu_acceleration_library/external/openvino/frontend/tensorflow/py_tensorflow_frontend.cp39-win_amd64.pyd +0 -0
  99. intel_npu_acceleration_library/external/openvino/frontend/tensorflow/utils.py +31 -10
  100. intel_npu_acceleration_library/external/openvino/helpers/packing.py +4 -4
  101. intel_npu_acceleration_library/external/openvino/preprocess/__init__.py +2 -0
  102. intel_npu_acceleration_library/external/openvino/preprocess/torchvision/requirements.txt +1 -0
  103. intel_npu_acceleration_library/external/openvino/properties/__init__.py +1 -0
  104. intel_npu_acceleration_library/external/openvino/runtime/ie_api.py +1 -1
  105. intel_npu_acceleration_library/external/openvino/runtime/op/__init__.py +1 -0
  106. intel_npu_acceleration_library/external/openvino/runtime/opset1/ops.py +2 -1
  107. intel_npu_acceleration_library/external/openvino/runtime/opset13/ops.py +5 -6
  108. intel_npu_acceleration_library/external/openvino/runtime/opset15/__init__.py +7 -0
  109. intel_npu_acceleration_library/external/openvino/runtime/opset15/ops.py +193 -2
  110. intel_npu_acceleration_library/external/openvino/runtime/opset6/ops.py +69 -43
  111. intel_npu_acceleration_library/external/openvino/runtime/opset8/ops.py +4 -0
  112. intel_npu_acceleration_library/external/openvino/runtime/properties/__init__.py +2 -0
  113. intel_npu_acceleration_library/external/openvino/runtime/utils/data_helpers/data_dispatcher.py +21 -3
  114. intel_npu_acceleration_library/external/openvino/runtime/utils/decorators.py +88 -2
  115. intel_npu_acceleration_library/external/openvino/tools/benchmark/utils/inputs_filling.py +9 -9
  116. intel_npu_acceleration_library/external/openvino/tools/ovc/convert_impl.py +16 -2
  117. intel_npu_acceleration_library/external/openvino/tools/ovc/main.py +5 -0
  118. intel_npu_acceleration_library/external/openvino/tools/ovc/moc_frontend/jax_frontend_utils.py +19 -0
  119. intel_npu_acceleration_library/external/openvino/tools/ovc/moc_frontend/pipeline.py +68 -16
  120. intel_npu_acceleration_library/external/openvino/tools/ovc/moc_frontend/pytorch_frontend_utils.py +69 -60
  121. intel_npu_acceleration_library/external/openvino/tools/ovc/utils.py +90 -3
  122. intel_npu_acceleration_library/external/openvino/utils.py +17 -0
  123. intel_npu_acceleration_library/lib/Release/intel_npu_acceleration_library.dll +0 -0
  124. intel_npu_acceleration_library/lib/Release/openvino.dll +0 -0
  125. intel_npu_acceleration_library/lib/Release/openvino_auto_batch_plugin.dll +0 -0
  126. intel_npu_acceleration_library/lib/Release/openvino_auto_plugin.dll +0 -0
  127. intel_npu_acceleration_library/lib/Release/openvino_c.dll +0 -0
  128. intel_npu_acceleration_library/lib/Release/openvino_hetero_plugin.dll +0 -0
  129. intel_npu_acceleration_library/lib/Release/openvino_intel_cpu_plugin.dll +0 -0
  130. intel_npu_acceleration_library/lib/Release/openvino_intel_gpu_plugin.dll +0 -0
  131. intel_npu_acceleration_library/lib/Release/openvino_intel_npu_plugin.dll +0 -0
  132. intel_npu_acceleration_library/lib/Release/openvino_ir_frontend.dll +0 -0
  133. intel_npu_acceleration_library/lib/Release/openvino_onnx_frontend.dll +0 -0
  134. intel_npu_acceleration_library/lib/Release/openvino_paddle_frontend.dll +0 -0
  135. intel_npu_acceleration_library/lib/Release/openvino_pytorch_frontend.dll +0 -0
  136. intel_npu_acceleration_library/lib/Release/openvino_tensorflow_frontend.dll +0 -0
  137. intel_npu_acceleration_library/lib/Release/openvino_tensorflow_lite_frontend.dll +0 -0
  138. intel_npu_acceleration_library/lib/Release/tbb12.dll +0 -0
  139. intel_npu_acceleration_library/lib/Release/tbb12_debug.dll +0 -0
  140. intel_npu_acceleration_library/lib/Release/tbbbind_2_5.dll +0 -0
  141. intel_npu_acceleration_library/lib/Release/tbbbind_2_5_debug.dll +0 -0
  142. intel_npu_acceleration_library/lib/Release/tbbmalloc.dll +0 -0
  143. intel_npu_acceleration_library/lib/Release/tbbmalloc_debug.dll +0 -0
  144. intel_npu_acceleration_library/lib/Release/tbbmalloc_proxy.dll +0 -0
  145. intel_npu_acceleration_library/lib/Release/tbbmalloc_proxy_debug.dll +0 -0
  146. intel_npu_acceleration_library/nn/module.py +17 -17
intel_npu_acceleration_library/backend/bindings.py

@@ -67,8 +67,9 @@ def init_common(lib: ctypes.CDLL):
     Args:
         lib (ctypes.CDLL): Intel® NPU Acceleration Library runtime library
     """
-    lib.saveModel.argtypes = [handler, ctypes.c_char_p]
+    lib.saveModel.argtypes = [handler, ctypes.c_char_p, ctypes.c_bool]
     lib.saveCompiledModel.argtypes = [handler, ctypes.c_char_p]
+    lib.serializeModel.argtypes = [handler, ctypes.c_char_p, ctypes.c_char_p]
 
     # Set input activations
     lib.set_activation.argtypes = [handler, ctypes.c_void_p, ctypes.c_int]
@@ -91,6 +92,16 @@ def init_common(lib: ctypes.CDLL):
 
     lib.compressToI4.argtypes = [c_i8_array, c_u8_array, ctypes.c_int]
 
+    # Remote tensors
+    lib.to_npu.argtypes = [ctypes.c_int, c_u32_array, ctypes.c_char_p, ctypes.c_void_p]
+    lib.to_npu.restype = handler
+
+    lib.remote_tensor_data.argtypes = [handler]
+    lib.remote_tensor_data.restype = ctypes.c_void_p
+
+    lib.del_remote_tensor.argtypes = [handler]
+
+
 
 def init_network_factory(lib: ctypes.CDLL):
     """Initialize Network factory bindings.
@@ -106,15 +117,18 @@ def init_network_factory(lib: ctypes.CDLL):
 
     lib.setNNFactoryWeights.argtypes = [handler, ctypes.c_int, handler, ctypes.c_bool]
 
-    lib.op_shape_size.argtypes = [handler]
+    lib.op_shape_size.argtypes = [handler, ctypes.c_int]
     lib.op_shape_size.restype = ctypes.c_int
 
-    lib.op_shape.argtypes = [handler, ctypes.c_int]
+    lib.op_shape.argtypes = [handler, ctypes.c_int, ctypes.c_int]
     lib.op_shape.restype = ctypes.c_int
 
-    lib.op_dtype.argtypes = [handler]
+    lib.op_dtype.argtypes = [handler, ctypes.c_int]
     lib.op_dtype.restype = ctypes.c_int
 
+    lib.op_output_size.argtypes = [handler]
+    lib.op_output_size.restype = ctypes.c_int
+
     lib.parameter.argtypes = [handler, ctypes.c_int, c_u32_array, ctypes.c_char_p]
     lib.parameter.restype = handler
 
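`op_shape_size`, `op_shape`, and `op_dtype` now take an output index, and the new `op_output_size` reports how many outputs a node has, which is what lets multi-output ops such as `variadic_split` (registered below) be inspected per output. A hedged sketch of the query pattern, assuming `node` is a handler returned by one of these builders:

```python
from intel_npu_acceleration_library.backend.bindings import lib as backend_lib

def describe_outputs(node) -> None:
    """Print the shape of every output of a (possibly multi-output) node."""
    for out_idx in range(backend_lib.op_output_size(node)):
        rank = backend_lib.op_shape_size(node, out_idx)
        dims = [backend_lib.op_shape(node, d, out_idx) for d in range(rank)]
        print(f"output {out_idx}: shape={tuple(dims)}")
```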
@@ -143,7 +157,16 @@ def init_network_factory(lib: ctypes.CDLL):
     ]
     lib.slice.restype = handler
 
-    lib.compile.argtypes = [handler]
+    lib.simple_slice.argtypes = [
+        handler,
+        handler,
+        handler,
+        handler,
+        handler
+    ]
+    lib.simple_slice.restype = handler
+
+    lib.compile.argtypes = [handler, ctypes.c_int]
     lib.compile.restype = handler
 
     lib.get_output_tensor_shape_size.argtypes = [handler, ctypes.c_int]
@@ -160,6 +183,8 @@ def init_network_factory(lib: ctypes.CDLL):
         ctypes.c_bool,
         ctypes.c_char_p,
         ctypes.c_char_p,
+        ctypes.c_bool,
+        ctypes.c_bool,
     ]
     lib.linear.restype = handler
 
@@ -214,6 +239,65 @@ def init_network_factory(lib: ctypes.CDLL):
     ]
     lib.max_pooling.restype = handler
 
+
+    lib.multi_concat.argtypes = [
+        handler,
+        ctypes.POINTER(handler),
+        ctypes.c_uint64,
+        ctypes.c_int64,
+    ]
+    lib.multi_concat.restype = handler
+
+    lib.variadic_split.argtypes = [
+        handler,
+        handler,
+        ctypes.c_int,
+        c_u32_array,
+        ctypes.c_int,
+    ]
+    lib.variadic_split.restype = handler
+
+    lib.dq_split_linear.argtypes = [
+        handler,
+        handler,
+        ctypes.c_int,
+        ctypes.c_int,
+        ctypes.c_int,
+        ctypes.c_bool,
+        ctypes.c_char_p,
+        ctypes.c_char_p,
+        ctypes.c_bool,
+        ctypes.c_bool,
+    ]
+    lib.dq_split_linear.restype = handler
+
+    lib.dq_split_linear_prefill.argtypes = [
+        handler,
+        handler,
+        ctypes.c_int,
+        ctypes.c_int,
+        ctypes.c_int,
+        ctypes.c_bool,
+        ctypes.c_char_p,
+        ctypes.c_char_p,
+        ctypes.c_bool,
+        ctypes.c_bool,
+    ]
+    lib.dq_split_linear_prefill.restype = handler
+
+    lib.gw_linear_prefill.argtypes = [
+        handler,
+        handler,
+        ctypes.c_int,
+        ctypes.c_int,
+        ctypes.c_int,
+        ctypes.c_bool,
+        ctypes.c_char_p,
+        ctypes.c_char_p,
+        ctypes.c_bool,
+    ]
+    lib.gw_linear_prefill.restype = handler
+
     for op in get_supported_ops():
         fn = getattr(lib, op.name)
         fn.argtypes = [handler] * (op.inputs + 1) + list(op.parameters)
@@ -252,6 +336,19 @@ def init_parameters(lib: ctypes.CDLL):
         ctypes.c_int,
     ]
 
+    lib.addAsymInt4Parameter.argtypes = [
+        handler,
+        c_u8_array,
+        c_fp16_array,
+        c_fp16_array,
+        ctypes.c_int,
+        ctypes.c_int,
+        ctypes.c_int,
+        ctypes.c_int,
+        ctypes.c_int,
+        ctypes.c_int,
+    ]
+
     lib.addIntParameterConversion.argtypes = [
         handler,
         c_i8_array,
@@ -260,6 +357,13 @@ def init_parameters(lib: ctypes.CDLL):
         ctypes.c_int,
     ]
 
+    lib.addInt4WeightParameter.argtypes = [
+        handler,
+        c_u8_array,
+        ctypes.c_int,
+        ctypes.c_int,
+    ]
+
 
 def initialize_bindings() -> ctypes.CDLL:
     """Load the Intel® NPU Acceleration Library runtime library, and initialize all c++ <-> python bindings.
intel_npu_acceleration_library/backend/factory.py

@@ -7,7 +7,7 @@ from intel_npu_acceleration_library.backend.base import BaseNPUBackendWithPrefetch
 from intel_npu_acceleration_library.backend.ops import get_supported_ops
 from intel_npu_acceleration_library.backend.bindings import lib as backend_lib
 from intel_npu_acceleration_library.backend.tensor import Tensor
-from intel_npu_acceleration_library.dtypes import int4, bfloat16
+from intel_npu_acceleration_library.dtypes import int4, bfloat16, get_backend_dtype
 from typing import Optional, Tuple, Any, Union, Sequence, TypeVar, Callable, cast, List
 from functools import partial
 import numpy.typing as npt
@@ -71,17 +71,99 @@ class NNFactory(BaseNPUBackendWithPrefetch):
                 Tensor: Tensor object
             """
             # Convert Tensor objects to their underlying node
-            args = tuple(arg.node if isinstance(arg, Tensor) else arg for arg in args)
             kwargs = {
                 k: v.node if isinstance(v, Tensor) else v for k, v in kwargs.items()
             }
 
+            if fn.__qualname__ == 'NNFactory.reshape':
+                output_idx = args[0].output_idx
+                kwargs["output_idx"] = output_idx
+            args = tuple(arg.node if isinstance(arg, Tensor) else arg for arg in args)
+
+
             input_nodes = [arg for arg in args if isinstance(arg, ctypes._Pointer)] + [
                 v for v in kwargs.values() if isinstance(v, ctypes._Pointer)
             ]
             # Call the function
             node = fn(self, *args, **kwargs)
 
+            output_len = backend_lib.op_output_size(node)
+
+            # remove input nodes from output_nodes
+            self.output_nodes = [
+                node for node in self.output_nodes if node not in input_nodes
+            ]
+            # add output node to output_nodes
+            if fn.__name__ != "constant":
+                self.output_nodes.append(node)
+
+            # Wrap the node in a Tensor object
+            if output_len == 1:
+                return Tensor(factory=self, node=node, output_idx=0)
+            else:
+                output_tensor_list = []
+                for i in range(output_len):
+                    output_tensor_list.append(Tensor(factory=self, node=node, output_idx=i))
+                return output_tensor_list
+
+        return cast(F, wrapper)
+
+    def return_tensor_for_list_inputs(fn: F) -> F:  # type: ignore
+        """Wrap the output of a function in a Tensor object.
+
+        This new wrapper adds support for list-of-Tensor inputs.
+
+        Args:
+            fn (function): Function
+
+        Returns:
+            function: A function that wraps the output in a Tensor object
+        """
+
+        def wrapper(self, *args: Any, **kwargs: Any) -> Tensor:
+            """Wrap the output of a function in a Tensor object.
+
+            Args:
+                args (Any): Variable length argument list
+                kwargs (Any): Arbitrary keyword arguments
+
+            Returns:
+                Tensor: Tensor object
+            """
+            # Convert Tensor objects to their underlying node
+            # args = tuple(arg.node if isinstance(arg, Tensor) else arg for arg in args)
+            new_args = []
+            for arg in args:
+                if isinstance(arg, Tensor):
+                    new_args.append(arg.node)
+                elif isinstance(arg, (tuple, list)):
+                    # for item in arg:
+                    for i in range(len(arg)):
+                        if isinstance(arg[i], Tensor):
+                            arg[i] = arg[i].node
+                    new_args.append(arg)
+                else:
+                    new_args.append(arg)
+            args = tuple(new_args)
+            kwargs = {
+                k: v.node if isinstance(v, Tensor) else v for k, v in kwargs.items()
+            }
+
+            # input_nodes = [arg for arg in args if isinstance(arg, ctypes._Pointer)] + [
+            #     v for v in kwargs.values() if isinstance(v, ctypes._Pointer)
+            # ]
+            input_nodes = []
+            for arg in args:
+                if isinstance(arg, ctypes._Pointer):
+                    input_nodes.append(arg)
+                elif isinstance(arg, (tuple, list)):
+                    for item in arg:
+                        if isinstance(item, ctypes._Pointer):
+                            input_nodes.append(item)
+            input_nodes += [v for v in kwargs.values() if isinstance(v, ctypes._Pointer)]
+
+            # Call the function
+            node = fn(self, *args, **kwargs)
+
             # remove input nodes from output_nodes
             self.output_nodes = [
                 node for node in self.output_nodes if node not in input_nodes
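The reworked `return_tensor` wrapper now asks `op_output_size` how many outputs the freshly built node has: single-output ops still come back as one `Tensor` (with `output_idx=0`), while multi-output ops come back as a list of `Tensor` views over the same node. A sketch using the `variadic_split` builder added later in this diff; shapes are illustrative:

```python
import numpy as np

from intel_npu_acceleration_library.backend import NNFactory

factory = NNFactory()
x = factory.parameter((1, 96), dtype=np.float16)   # a single Tensor

# variadic_split produces three outputs, so the wrapper returns a list of
# three Tensors sharing one node, distinguished by output_idx 0, 1, 2.
parts = factory.variadic_split(x, axis=1, split_lengths=[32, 32, 32])
```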
@@ -115,34 +197,10 @@ class NNFactory(BaseNPUBackendWithPrefetch):
         Args:
             dtype: numpy dtype
 
-        Raises:
-            RuntimeError: Unsupported datatype
-
         Returns:
             ctypes.c_char_p: string representation of the dtype
         """
-        if dtype in [np.int8, torch.int8]:
-            str_dtype = "int8"
-        elif dtype == np.uint8 or dtype == int4:
-            # u8 represents packed i4 dtypes
-            str_dtype = "int4"
-        elif dtype in [np.int16, torch.int16]:
-            str_dtype = "int16"
-        elif dtype in [np.int32, torch.int32]:
-            str_dtype = "int32"
-        elif dtype in [np.int64, torch.int64]:
-            str_dtype = "int64"
-        elif dtype in [np.float16, torch.float16]:
-            str_dtype = "float16"
-        elif dtype in [np.float32, torch.float32]:
-            str_dtype = "float32"
-        elif dtype in [np.float64, torch.float64]:
-            str_dtype = "float64"
-        elif dtype in [bfloat16, torch.bfloat16]:
-            str_dtype = "bfloat16"
-        else:
-            raise RuntimeError(f"DType is not supported {dtype}")
-        return ctypes.c_char_p(str_dtype.encode())
+        return get_backend_dtype(dtype)
 
     @return_tensor
     def parameter(
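The inline dtype ladder is gone: the mapping now lives in `intel_npu_acceleration_library.dtypes.get_backend_dtype` (hence the new import in the first factory.py hunk). Assuming the mapping itself is unchanged, the contract stays a `ctypes.c_char_p` naming the backend dtype:

```python
import numpy as np

from intel_npu_acceleration_library.dtypes import get_backend_dtype

assert get_backend_dtype(np.float16).value == b"float16"
assert get_backend_dtype(np.int8).value == b"int8"
```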
@@ -319,6 +377,8 @@ class NNFactory(BaseNPUBackendWithPrefetch):
         bias: Optional[bool] = False,
         act_dtype: npt.DTypeLike = np.float16,
         wt_dtype: npt.DTypeLike = np.float16,
+        scale_factor: bool = True,
+        asym: bool = False,
     ) -> ctypes._Pointer:
         """Generate a linear layer.
 
@@ -341,11 +401,52 @@ class NNFactory(BaseNPUBackendWithPrefetch):
             bias,
             self.get_backend_dtype(act_dtype),
             self.get_backend_dtype(wt_dtype),
+            scale_factor,
+            asym
         )
+
+    @return_tensor
+    def dq_split_linear(
+        self, input_node: ctypes._Pointer, n_splits: int,
+        outout_channels: int, input_channels: int, bias: bool = False,
+        act_dtype: npt.DTypeLike = np.float16,
+        wt_dtype: npt.DTypeLike = np.float16,
+        scale_factor: bool = True,
+        is_prefill: bool = False,
+        use_dq: bool = True,
+        asym: bool = False,
+    ) -> ctypes._Pointer:
+        """Generate a dynamic-quantization split linear layer.
+
+        Args:
+            input_node (ctypes._Pointer): layer input node
+            n_splits (int): number of parts the linear layer is split into
+            output_channels (int): number of output channels
+            input_channels (int): number of input channels
+            bias (bool, optional): enable/disable bias. Defaults to False.
+            act_dtype (npt.DTypeLike, optional): activation dtype. Defaults to np.float16.
+            wt_dtype (npt.DTypeLike, optional): weight dtype. Defaults to np.float16.
+            scale_factor (bool, optional): enable/disable mul scale factor. Defaults to True.
+            is_prefill (bool, optional): enable/disable prefill linear optimization. Defaults to False.
+
+        Returns:
+            ctypes._Pointer: output node
+        """
+        if is_prefill:
+            func = backend_lib.dq_split_linear_prefill if use_dq else backend_lib.gw_linear_prefill
+        else:
+            func = backend_lib.dq_split_linear
+        return func(self._mm, input_node, n_splits,
+                    input_channels, outout_channels, bias,
+                    self.get_backend_dtype(act_dtype),
+                    self.get_backend_dtype(wt_dtype),
+                    scale_factor, asym)
 
     @return_tensor
     def reshape(
-        self, input_node: ctypes._Pointer, shape: Sequence[int]
+        self, input_node: ctypes._Pointer, shape: Sequence[int],
+        special_zero: bool = True,
+        output_idx: int = 0
     ) -> ctypes._Pointer:
         """Generate a reshape layer.
 
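A hedged sketch of calling the new builder: `n_splits` controls how many groups the weight matrix is divided into, while `is_prefill` and `use_dq` select between the three C entry points registered in bindings.py above (`dq_split_linear`, `dq_split_linear_prefill`, `gw_linear_prefill`). Sizes and dtypes are illustrative; `outout_channels` is spelled as in the shipped signature:

```python
import numpy as np

from intel_npu_acceleration_library.backend import NNFactory

factory = NNFactory()
x = factory.parameter((1, 4096), dtype=np.float16)
y = factory.dq_split_linear(
    x, n_splits=4, outout_channels=11008, input_channels=4096,
    bias=False, act_dtype=np.float16,
    wt_dtype=np.uint8,        # uint8 stands in for packed int4 weights
    scale_factor=True, is_prefill=False, use_dq=True, asym=False,
)
```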
@@ -357,7 +458,8 @@ class NNFactory(BaseNPUBackendWithPrefetch):
             ctypes._Pointer: output node
         """
         shape_node = self.constant(shape).node  # type: ignore
-        return backend_lib.reshape(self._mm, input_node, shape_node)
+        return backend_lib.reshape(self._mm, input_node, shape_node,
+                                   special_zero, output_idx)
 
     @return_tensor
     def broadcast(
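`special_zero` and `output_idx` are forwarded to the C `reshape` binding, whose `SupportedOp` entry in ops.py gains matching `c_bool`/`c_int` parameters below. Under OpenVINO Reshape semantics, which this appears to follow, `special_zero=True` makes a 0 in the target shape copy the corresponding input dimension, and `output_idx` selects which output of a multi-output producer feeds the reshape. An illustrative sketch:

```python
import numpy as np

from intel_npu_acceleration_library.backend import NNFactory

factory = NNFactory()
x = factory.parameter((2, 64, 16), dtype=np.float16)
# 0 keeps dim 0 (= 2); -1 infers the remainder -> expected shape (2, 1024)
y = factory.reshape(x, [0, -1], special_zero=True)
```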
@@ -453,6 +555,46 @@ class NNFactory(BaseNPUBackendWithPrefetch):
             end_mask_ptr.size,
             end_mask_ptr,
         )
+
+    @return_tensor
+    def simple_slice(
+        self,
+        input_node: ctypes._Pointer,
+        begin: Sequence[int],
+        end: Sequence[int],
+        step: Optional[Sequence[int]] = None,
+    ) -> ctypes._Pointer:
+        """Generate a slice layer.
+
+        Args:
+            input_node (ctypes._Pointer): layer input node
+            begin (Sequence[int]): begin
+            end (Sequence[int]): end
+            step (Optional[Sequence[int]]): step
+
+        Raises:
+            ValueError: begin and end must have the same length
+
+        Returns:
+            ctypes._Pointer: output node
+        """
+        if len(begin) != len(end):
+            raise ValueError("begin and end must have the same length")
+
+        if step is None:
+            step = [1] * len(begin)
+
+        begin = self.constant(begin).node  # type: ignore
+        end = self.constant(end).node  # type: ignore
+        step = self.constant(step).node  # type: ignore
+
+        return backend_lib.simple_slice(
+            self._mm,
+            input_node,
+            begin,
+            end,
+            step
+        )
 
     @return_tensor
     def concat(
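`simple_slice` wraps begin/end/step as constant nodes and hands five handlers to the C binding registered above, a lighter-weight alternative to the masked `slice` op. Illustrative use:

```python
import numpy as np

from intel_npu_acceleration_library.backend import NNFactory

factory = NNFactory()
x = factory.parameter((1, 128, 64), dtype=np.float16)
# Keep the first 64 rows of axis 1; step defaults to ones.
y = factory.simple_slice(x, begin=[0, 0, 0], end=[1, 64, 64])
```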
@@ -469,11 +611,32 @@ class NNFactory(BaseNPUBackendWithPrefetch):
             ctypes._Pointer: output node
         """
         if axis < 0:
-            shape_size = backend_lib.op_shape_size(input_node_1)
+            shape_size = backend_lib.op_shape_size(input_node_1, 0)
             axis = (axis + shape_size) % shape_size
         axis = np.int64(axis)
         return backend_lib.concat(self._mm, input_node_1, input_node_2, axis)
 
+    @return_tensor_for_list_inputs
+    def sequence_concat(
+        self, input_nodes: List[ctypes._Pointer], axis: int
+    ) -> ctypes._Pointer:
+        """Generate a concatenation layer.
+
+        Args:
+            input_nodes (List[ctypes._Pointer]): sequence of layer input nodes
+            axis (int): axis
+
+        Returns:
+            ctypes._Pointer: output node
+        """
+        if axis < 0:
+            shape_size = backend_lib.op_shape_size(input_nodes[0], 0)
+            axis = (axis + shape_size) % shape_size
+        axis = np.int64(axis)
+
+        input_ptr = (ctypes.POINTER(ctypes.c_char) * len(input_nodes))(*input_nodes)
+        return backend_lib.multi_concat(self._mm, input_ptr, len(input_nodes), axis)
+
     @return_tensor
     def reduce_max(
         self,
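Because `sequence_concat` is decorated with the new `return_tensor_for_list_inputs`, a plain Python list of `Tensor`s can be handed straight through to the variadic `multi_concat` binding. Illustrative use:

```python
import numpy as np

from intel_npu_acceleration_library.backend import NNFactory

factory = NNFactory()
chunks = [factory.parameter((1, 32), dtype=np.float16) for _ in range(3)]
merged = factory.sequence_concat(chunks, axis=-1)   # expected shape (1, 96)
```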
@@ -492,7 +655,7 @@ class NNFactory(BaseNPUBackendWithPrefetch):
             ctypes._Pointer: output node
         """
         if reduction_axes is None:
-            shape_size = backend_lib.op_shape_size(input_node)
+            shape_size = backend_lib.op_shape_size(input_node, 0)
             reduction_axes = list(range(shape_size - 1, -1, -1))
         axis_node = self.constant(reduction_axes).node  # type: ignore
         return backend_lib.reduce_max(self._mm, input_node, axis_node, keep_dims)
@@ -515,7 +678,7 @@ class NNFactory(BaseNPUBackendWithPrefetch):
             ctypes._Pointer: output node
         """
         if reduction_axes is None:
-            shape_size = backend_lib.op_shape_size(input_node)
+            shape_size = backend_lib.op_shape_size(input_node, 0)
             reduction_axes = list(range(shape_size - 1, -1, -1))
         axis_node = self.constant(reduction_axes).node  # type: ignore
         return backend_lib.reduce_mean(self._mm, input_node, axis_node, keep_dims)
@@ -538,7 +701,7 @@ class NNFactory(BaseNPUBackendWithPrefetch):
             ctypes._Pointer: output node
         """
         if reduction_axes is None:
-            shape_size = backend_lib.op_shape_size(input_node)
+            shape_size = backend_lib.op_shape_size(input_node, 0)
             reduction_axes = list(range(shape_size - 1, -1, -1))
         axis_node = self.constant(reduction_axes).node  # type: ignore
         return backend_lib.reduce_min(self._mm, input_node, axis_node, keep_dims)
@@ -561,7 +724,7 @@ class NNFactory(BaseNPUBackendWithPrefetch):
             ctypes._Pointer: output node
         """
         if reduction_axes is None:
-            shape_size = backend_lib.op_shape_size(input_node)
+            shape_size = backend_lib.op_shape_size(input_node, 0)
             reduction_axes = list(range(shape_size - 1, -1, -1))
         axis_node = self.constant(reduction_axes).node  # type: ignore
         return backend_lib.reduce_prod(self._mm, input_node, axis_node, keep_dims)
@@ -584,7 +747,7 @@ class NNFactory(BaseNPUBackendWithPrefetch):
             ctypes._Pointer: output node
         """
         if reduction_axes is None:
-            shape_size = backend_lib.op_shape_size(input_node)
+            shape_size = backend_lib.op_shape_size(input_node, 0)
             reduction_axes = list(range(shape_size - 1, -1, -1))
         axis_node = self.constant(reduction_axes).node  # type: ignore
         return backend_lib.reduce_sum(self._mm, input_node, axis_node, keep_dims)
@@ -604,7 +767,7 @@ class NNFactory(BaseNPUBackendWithPrefetch):
             ctypes._Pointer: output node
         """
         if axis < 0:
-            shape_size = backend_lib.op_shape_size(input_node)
+            shape_size = backend_lib.op_shape_size(input_node, 0)
             axis = (axis + shape_size) % shape_size
         axis_node = self.constant(axis).node  # type: ignore
         return backend_lib.normL2(self._mm, input_node, axis_node, eps)
@@ -627,14 +790,14 @@ class NNFactory(BaseNPUBackendWithPrefetch):
         Returns:
             ctypes._Pointer: output node
         """
-        input_shape_size = backend_lib.op_shape_size(input_node)
+        input_shape_size = backend_lib.op_shape_size(input_node, 0)
         input_shape = [
-            backend_lib.op_shape(input_node, i) for i in range(input_shape_size)
+            backend_lib.op_shape(input_node, i, 0) for i in range(input_shape_size)
         ]
         if isinstance(exponent, ctypes._Pointer):
-            exponent_shape_size = backend_lib.op_shape_size(input_node)
+            exponent_shape_size = backend_lib.op_shape_size(input_node, 0)
             exponent_shape = [
-                backend_lib.op_shape(exponent, i) for i in range(exponent_shape_size)
+                backend_lib.op_shape(exponent, i, 0) for i in range(exponent_shape_size)
             ]
         else:
             exponent_shape = list(exponent.shape)
@@ -643,6 +806,39 @@ class NNFactory(BaseNPUBackendWithPrefetch):
         # raise ValueError("Input tensor shapes are not equal")
 
         return backend_lib.power(self._mm, input_node, exponent)
+
+    @return_tensor
+    def variadic_split(
+        self,
+        input: ctypes._Pointer,
+        axis: int,
+        split_lengths: Sequence[int],
+    ) -> ctypes._Pointer:
+        """Generate a variadic split layer.
+
+        Args:
+            input (ctypes._Pointer): layer input node
+            axis (int): split axis
+            split_lengths (Sequence[int]): A list containing the sizes of each output tensor
+                along the split "axis". Size of "split_lengths" should be equal to the number of
+                outputs. The sum of split_lengths must match data.shape[axis].
+
+        Returns:
+            ctypes._Pointer: output node
+        """
+
+        split_lens_ptr = np.array(split_lengths, dtype=np.uint32)
+
+        return backend_lib.variadic_split(
+            self._mm,
+            input,
+            axis,
+            split_lens_ptr,
+            split_lens_ptr.size,
+        )
 
     @return_tensor
     def avg_pooling(
@@ -777,7 +973,28 @@ class NNFactory(BaseNPUBackendWithPrefetch):
             auto_pad,  # auto_pad
         )
 
-    def get_tensor_shape(self, node):
+    @return_tensor
+    def scaled_dot_product_attention(
+        self, query: ctypes._Pointer, key: ctypes._Pointer,
+        value: ctypes._Pointer, attn_mask: ctypes._Pointer,
+        is_causal: bool
+    ) -> ctypes._Pointer:
+        """Construct a ScaledDotProductAttention operation.
+
+        Args:
+            query (ctypes._Pointer): query
+            key (ctypes._Pointer): key
+            value (ctypes._Pointer): value
+            attn_mask (ctypes._Pointer): attention mask
+            is_causal (bool): causal/not causal
+
+        Returns:
+            ctypes._Pointer: output node
+        """
+        return backend_lib.scaled_dot_product_attention(self._mm,
+                                                        query, key,
+                                                        value, attn_mask,
+                                                        is_causal)
+
+    def get_tensor_shape(self, node, output_idx=0):
         """Get tensor shape.
 
         Args:
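A sketch of building an attention block with the new method; the (batch, heads, sequence, head_dim) layout and fp16 dtype are assumptions for illustration:

```python
import numpy as np

from intel_npu_acceleration_library.backend import NNFactory

factory = NNFactory()
q = factory.parameter((1, 32, 1, 128), dtype=np.float16)      # decode-step query
k = factory.parameter((1, 32, 1024, 128), dtype=np.float16)   # cached keys
v = factory.parameter((1, 32, 1024, 128), dtype=np.float16)   # cached values
mask = factory.parameter((1, 1, 1, 1024), dtype=np.float16)   # additive mask
attn = factory.scaled_dot_product_attention(q, k, v, mask, is_causal=False)
```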
@@ -786,10 +1003,10 @@ class NNFactory(BaseNPUBackendWithPrefetch):
         Returns:
             tuple[int]: tensor shape
         """
-        size = backend_lib.op_shape_size(node)
-        return tuple([backend_lib.op_shape(node, idx) for idx in range(size)])
+        size = backend_lib.op_shape_size(node, output_idx)
+        return tuple([backend_lib.op_shape(node, idx, output_idx) for idx in range(size)])
 
-    def get_tensor_dtype(self, node):
+    def get_tensor_dtype(self, node, output_idx=0):
         """Get tensor dtype.
 
         Args:
@@ -801,7 +1018,7 @@ class NNFactory(BaseNPUBackendWithPrefetch):
         Returns:
             str: tensor dtype
         """
-        dtype_int = backend_lib.op_dtype(node)
+        dtype_int = backend_lib.op_dtype(node, output_idx)
 
         if dtype_int == 2:
             return np.bool
@@ -826,7 +1043,7 @@ class NNFactory(BaseNPUBackendWithPrefetch):
         else:
             raise RuntimeError("Unsupported dtype")
 
-    def compile(self):
+    def compile(self, npu_dpu_groups=4):
         """Finalize and compile a model."""
         self.out = []
         self.torch_out = []
@@ -834,7 +1051,7 @@ class NNFactory(BaseNPUBackendWithPrefetch):
             backend_lib.result(self._mm, node)
 
         # Compile the model
-        backend_lib.compile(self._mm)
+        backend_lib.compile(self._mm, npu_dpu_groups)
 
         for idx, node in enumerate(self.output_nodes):
             output_shape = self.get_tensor_shape(node)
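`compile` now threads an `npu_dpu_groups` hint (default 4) through to the C `compile` entry point, whose signature gained the matching `ctypes.c_int` in bindings.py above. A minimal build-and-compile sketch with illustrative sizes:

```python
import numpy as np

from intel_npu_acceleration_library.backend import NNFactory

factory = NNFactory()
x = factory.parameter((1, 256), dtype=np.float16)
_ = factory.linear(x, 256, 256, bias=False)
factory.compile(npu_dpu_groups=4)   # pass a different value to tune DPU grouping
```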
intel_npu_acceleration_library/backend/ops.py

@@ -98,7 +98,7 @@ def get_supported_ops() -> List[SupportedOp]:
             inputs=3,
             parameters=[ctypes.c_int],
         ),
-        SupportedOp(name="reshape", inputs=2),
+        SupportedOp(name="reshape", inputs=2, parameters=[ctypes.c_bool, ctypes.c_int]),
         SupportedOp(name="transpose", inputs=2),
         SupportedOp(name="squeeze", inputs=1),
        SupportedOp(name="unsqueeze", inputs=2),
@@ -137,5 +137,6 @@ def get_supported_ops() -> List[SupportedOp]:
         SupportedOp(name="power", inputs=2),
         SupportedOp(name="broadcast", inputs=2),
         SupportedOp(name="log_softmax", inputs=1, parameters=[ctypes.c_int64]),
+        SupportedOp(name="rotate_half", inputs=1),
     ]
     return supported_ops
intel_npu_acceleration_library/backend/qlinear.py

@@ -18,6 +18,7 @@ class QLinear(NNFactory):
         profile: bool = False,
         device: str = "NPU",
         dtype: np.dtype = np.int8,
+        asym: bool = False
     ):
         """Initialize the QLinear class.
 
@@ -33,13 +34,14 @@ class QLinear(NNFactory):
         super().__init__(profile, device)
         self.inC, self.outC = inC, outC
         self.batch = batch
+        self.asym = asym
 
         input = self.parameter((self.batch, self.inC))
-        _ = self.linear(input, outC, inC, bias=False, wt_dtype=dtype)
+        _ = self.linear(input, outC, inC, bias=False, wt_dtype=dtype, asym=asym)
         self.compile()
 
     def run(
-        self, X: np.ndarray, W: np.ndarray, scale: np.ndarray, op_id: str
+        self, X: np.ndarray, W: np.ndarray, scale: np.ndarray, zero: np.ndarray = None, op_id: str = None
     ) -> np.ndarray:
         """Run the layer: $X * (W * S)^T$ .
 
@@ -67,5 +69,7 @@ class QLinear(NNFactory):
             raise RuntimeError(
                 f"Scale shape {W.shape} different from expected one {(self.outC, 1)}"
             )
-
-        return super().run(X, (W, scale), op_id=op_id)
+        if not self.asym:
+            return super().run(X, (W, scale), op_id=op_id)
+        else:
+            return super().run(X, (W, scale, zero), op_id=op_id)
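With `asym=True`, `run` forwards a third weight component, the zero point, alongside the packed weights and scale. A hedged end-to-end sketch; the shapes follow the checks in `run` (weights `(outC, inC)`, scale `(outC, 1)`), while the zero-point layout and dtypes are assumptions consistent with the asymmetric int4 bindings above:

```python
import numpy as np

from intel_npu_acceleration_library.backend import QLinear

inC, outC, batch = 128, 128, 1
layer = QLinear(inC, outC, batch, dtype=np.int8, asym=True)

X = np.random.rand(batch, inC).astype(np.float16)
W = np.random.randint(-128, 127, (outC, inC), dtype=np.int8)
scale = np.random.rand(outC, 1).astype(np.float16)
zero = np.zeros((outC, 1), dtype=np.float16)   # assumed zero-point layout

out = layer.run(X, W, scale, zero=zero, op_id="layer0")
```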