bigdl-core-npu 2.6.0b20241203__cp310-cp310-win_amd64.whl → 2.6.0b20241206__cp310-cp310-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bigdl-core-npu/include/common.h +1 -0
- bigdl-core-npu/npu_llm.dll +0 -0
- {bigdl_core_npu-2.6.0b20241203.dist-info → bigdl_core_npu-2.6.0b20241206.dist-info}/METADATA +1 -1
- {bigdl_core_npu-2.6.0b20241203.dist-info → bigdl_core_npu-2.6.0b20241206.dist-info}/RECORD +13 -13
- intel_npu_acceleration_library/_version.py +1 -1
- intel_npu_acceleration_library/backend/base.py +19 -1
- intel_npu_acceleration_library/backend/bindings.py +16 -0
- intel_npu_acceleration_library/backend/factory.py +6 -3
- intel_npu_acceleration_library/backend/qlinear.py +8 -4
- intel_npu_acceleration_library/backend/runtime.py +7 -2
- intel_npu_acceleration_library/lib/Release/intel_npu_acceleration_library.dll +0 -0
- {bigdl_core_npu-2.6.0b20241203.dist-info → bigdl_core_npu-2.6.0b20241206.dist-info}/WHEEL +0 -0
- {bigdl_core_npu-2.6.0b20241203.dist-info → bigdl_core_npu-2.6.0b20241206.dist-info}/top_level.txt +0 -0
bigdl-core-npu/include/common.h
CHANGED

bigdl-core-npu/npu_llm.dll
CHANGED
Binary file

{bigdl_core_npu-2.6.0b20241203.dist-info → bigdl_core_npu-2.6.0b20241206.dist-info}/RECORD
RENAMED
@@ -1,10 +1,10 @@
 bigdl-core-npu/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-bigdl-core-npu/npu_llm.dll,sha256=
+bigdl-core-npu/npu_llm.dll,sha256=viVJrn2M4UOoBncCLrsj78HlS00-PXBOYnpfF2tag2c,3918336
 bigdl-core-npu/npu_llm.lib,sha256=V8WB7fKSKDwBopflxjFk0460SyoAS3LNbkYgFJrgqMQ,43846
-bigdl-core-npu/include/common.h,sha256=
+bigdl-core-npu/include/common.h,sha256=rQ7aSNAthGaImWtvs2ZetTZoZHN8iayiuc02M5QP-Pw,2666
 bigdl-core-npu/include/npu_llm.h,sha256=dS7_Esxw9Nxz3x07zTei2GEhTgxhS5IGBnyOVf3OS0k,2171
 intel_npu_acceleration_library/__init__.py,sha256=ZKTIhGMDjF7P6pF-yX8KWcSXbeHWRk24AO_orsa18f8,536
-intel_npu_acceleration_library/_version.py,sha256=
+intel_npu_acceleration_library/_version.py,sha256=n-zEmitSgrCJTtvRD4v5hIEhQtBI4j9eS-kJZTKXXyc,112
 intel_npu_acceleration_library/compiler.py,sha256=3IdgqjamSC8MLexDBJypIeZRiWIcTFnvQSU1LPXUr7Y,6225
 intel_npu_acceleration_library/device.py,sha256=9bn8eVXJa5cXIqgfLsQAdkMVtVUQABb8z0-mQik5jRg,7424
 intel_npu_acceleration_library/dtypes.py,sha256=gdd06Wsc9zIZFHlauUEx4xcK9WGTn1Mu6GkuYDJeA-E,4683
@@ -12,18 +12,18 @@ intel_npu_acceleration_library/modelling.py,sha256=vSiQOWGJ0l6wGV7zWQtZEkHpnMQIM
 intel_npu_acceleration_library/optimizations.py,sha256=9NY8QoDFbs2LY12jbx6As8g2v0oInX4YzvkjnqViA70,5469
 intel_npu_acceleration_library/quantization.py,sha256=6N_04h1KX6TNbw-ceANV0Pmk4_lQ2Y9C7Pwn5x-zQzo,5566
 intel_npu_acceleration_library/backend/__init__.py,sha256=2NP6Ypr1dGUNXmLGW5GD9xrh0U9KJgqxTd_c7su1RUY,857
-intel_npu_acceleration_library/backend/base.py,sha256=
-intel_npu_acceleration_library/backend/bindings.py,sha256=
+intel_npu_acceleration_library/backend/base.py,sha256=0EXHZTMrelebJ6HOSe74zE1mhy9tghXrkYnRQDLzwk4,9492
+intel_npu_acceleration_library/backend/bindings.py,sha256=mu7EJ60X1cAFa1y17yA4r3n5lwFBSzsq5u-6Nj-OaJ0,10352
 intel_npu_acceleration_library/backend/compression.py,sha256=Avz_zm2s_ELy5peVQ8zFGn8njBfh9nEGR16mflotBic,630
 intel_npu_acceleration_library/backend/convolution.py,sha256=cN3k78X3Y4Cbf7er-MFq0sJ4OwIvquj8PajpdEDmCo4,2018
-intel_npu_acceleration_library/backend/factory.py,sha256=
+intel_npu_acceleration_library/backend/factory.py,sha256=RuVxbG-jHRhkkUU2q2kITVov3r-gBb6SGslWS-eL-Sk,41035
 intel_npu_acceleration_library/backend/linear.py,sha256=RiLUh5FOSxRWHB5kYx7mOPOOrS_vxIeBJ5t3yC6wOiQ,1908
 intel_npu_acceleration_library/backend/matmul.py,sha256=mfGi73-mIbUcXp4kyvCGW0Y9kb4Xp1ppbGNpdJFohuA,1819
 intel_npu_acceleration_library/backend/mlp.py,sha256=BuKVwSI726v3nHQQvtMBbXyWxRTq-WoLZtTxeSeWaaY,2330
 intel_npu_acceleration_library/backend/ops.py,sha256=3yS-f-VPErHFt_oWZrgplNmSWnRrjm_wdxXNsqEBN7M,5070
-intel_npu_acceleration_library/backend/qlinear.py,sha256=
+intel_npu_acceleration_library/backend/qlinear.py,sha256=4k0QGq8beo20BgXSmKFlCo6XaI3ZqAMtJR4_VCmv0rU,2565
 intel_npu_acceleration_library/backend/qmatmul.py,sha256=pJkFJaBxZk3Oh5w_f6ywRNeGOfloEOCj0mCGnvim9Ew,2250
-intel_npu_acceleration_library/backend/runtime.py,sha256=
+intel_npu_acceleration_library/backend/runtime.py,sha256=anHg8F2mfTYm9aqli_AOibXW-klWg1A7hEAJiQa_Azk,7557
 intel_npu_acceleration_library/backend/sdpa.py,sha256=HNlL9jEA9OH3KnZqOkLcaKwt8tfCe5apUQxlWw0UhlA,3818
 intel_npu_acceleration_library/backend/tensor.py,sha256=swTymMVcXLPFXOlo6b_H3VGO1xf76Fz30RKyJPrVhPY,35468
 intel_npu_acceleration_library/backend/utils.py,sha256=WJ2agtqYxBuDd21ngE55io9VX-MOGg-AjnE63UHpCiU,2174
@@ -198,7 +198,7 @@ intel_npu_acceleration_library/external/openvino/torch/__init__.py,sha256=RXLzsf
 intel_npu_acceleration_library/functional/__init__.py,sha256=WWKwKOh6Sgovv7mKctA872TbLP98Pg5m5-MREvUmlAA,204
 intel_npu_acceleration_library/functional/scaled_dot_product_attention.py,sha256=yGUcg4tDQOLuUnP1g74cl-ec8TRr2SuAMcNLlN6qLvE,1620
 intel_npu_acceleration_library/lib/Release/cache.json,sha256=CyrSqZUWo0Ec4_7ydOiuKIC0Gm8AybrGdozUqUuHxBw,8840377
-intel_npu_acceleration_library/lib/Release/intel_npu_acceleration_library.dll,sha256=
+intel_npu_acceleration_library/lib/Release/intel_npu_acceleration_library.dll,sha256=o1V9NO0l3hYO1sjwCWuinyalVBQmHMXbWmOB6XLlvOo,340480
 intel_npu_acceleration_library/lib/Release/openvino.dll,sha256=m7M119p3JBq2YYJJ2zzCaBDz6XivKK3nNykb8L1cvDU,13244768
 intel_npu_acceleration_library/lib/Release/openvino_auto_batch_plugin.dll,sha256=2v_I9P3Qo0St1bQZMEZscnFOUVvgZQQ0HvQlG3HtTd0,203104
 intel_npu_acceleration_library/lib/Release/openvino_auto_plugin.dll,sha256=e3Aj9CDRHN30dBEdPSk7OCWe52tWfhI4xeXgyFjuDHg,475488
@@ -228,7 +228,7 @@ intel_npu_acceleration_library/nn/functional.py,sha256=UfAKBc0u6RtyaMo14ldH2GpEn
 intel_npu_acceleration_library/nn/linear.py,sha256=Q06SoGQeLaI86nA_ky2GnFC6H2Fw1zyMDILKnpYC2eo,5739
 intel_npu_acceleration_library/nn/llm.py,sha256=P6dz36Yf6BHtzWcftaghC6QaMI_WeRfQwrCbO7fD6hk,15002
 intel_npu_acceleration_library/nn/module.py,sha256=EYxoTq6I_YgBDgTF76GPDxHrT8SupOTDGMzQaomBeq8,12667
-bigdl_core_npu-2.6.
-bigdl_core_npu-2.6.
-bigdl_core_npu-2.6.
-bigdl_core_npu-2.6.
+bigdl_core_npu-2.6.0b20241206.dist-info/METADATA,sha256=VIzHA4u_YDpz5WLHKj_A4JFaayA9j6KscxxJ0gyJhbI,1541
+bigdl_core_npu-2.6.0b20241206.dist-info/WHEEL,sha256=tcd-HDpskugT8GYYKyyid0lOlzoZtZdWwcrj5ormtfo,101
+bigdl_core_npu-2.6.0b20241206.dist-info/top_level.txt,sha256=iMQZlTsFPJjlD-Y0MqZEP_9ifI0LlbNCJIOTaMoGMjk,46
+bigdl_core_npu-2.6.0b20241206.dist-info/RECORD,,
intel_npu_acceleration_library/backend/base.py
CHANGED
@@ -116,7 +116,12 @@ class BaseNPUBackendWithPrefetch(BaseNPUBackend):
         for weight in weights:
             if isinstance(weight, (list, tuple)):
                 # int8: data and scale
-                data, scale = weight
+                if len(weight) == 2:
+                    data, scale = weight
+                    zero = None
+                elif len(weight) == 3:
+                    # for asym int4
+                    data, scale, zero = weight
                 if data.dtype not in [np.int8, np.uint8]:
                     raise RuntimeError(
                         "Quantized weights needs to be in int8 or uint8 format"
@@ -133,7 +138,20 @@ class BaseNPUBackendWithPrefetch(BaseNPUBackend):
                     *shape,
                     *shape_scale,
                 )
+                elif data.dtype == np.uint8 and zero is not None:
+                    # asym_int4
+                    adapted_weights_zero, shape_zero = adapt_weight(zero)
+                    backend_lib.addAsymInt4Parameter(
+                        param,
+                        adapted_weights,
+                        adapted_weights_scale,
+                        adapted_weights_zero,
+                        *shape,
+                        *shape_scale,
+                        *shape_zero
+                    )
                 else:
+                    # sym_int4
                     backend_lib.addInt4Parameter(
                         param,
                         adapted_weights,
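Note: taken together, these two base.py hunks let a quantized weight arrive either as a 2-tuple (data, scale) for symmetric quantization or as a 3-tuple (data, scale, zero) for asymmetric int4. A minimal numpy sketch of quantizers that produce those tuple layouts; illustrative only, the function names are hypothetical, and the package may pack two 4-bit codes per uint8 byte rather than one as here:

import numpy as np

def quantize_sym_int8(w: np.ndarray):
    # Symmetric: per-output-channel fp16 scale, signed int8 codes, no zero point.
    scale = np.abs(w).max(axis=1, keepdims=True) / 127.0
    data = np.clip(np.round(w / scale), -127, 127).astype(np.int8)
    return data, scale.astype(np.float16)  # unpacked by the len(weight) == 2 branch

def quantize_asym_uint4(w: np.ndarray):
    # Asymmetric: per-channel scale and zero point, unsigned 4-bit codes in [0, 15].
    lo = w.min(axis=1, keepdims=True)
    hi = w.max(axis=1, keepdims=True)
    scale = (hi - lo) / 15.0
    zero = np.round(-lo / scale)
    data = np.clip(np.round(w / scale) + zero, 0, 15).astype(np.uint8)
    return data, scale.astype(np.float16), zero.astype(np.float16)  # len(weight) == 3 branch

w = np.random.randn(4, 8).astype(np.float32)
for weight in (quantize_sym_int8(w), quantize_asym_uint4(w)):
    print(len(weight), weight[0].dtype)  # "2 int8", then "3 uint8"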
intel_npu_acceleration_library/backend/bindings.py
CHANGED
@@ -183,6 +183,7 @@ def init_network_factory(lib: ctypes.CDLL):
         ctypes.c_char_p,
         ctypes.c_char_p,
         ctypes.c_bool,
+        ctypes.c_bool,
     ]
     lib.linear.restype = handler
 
@@ -265,6 +266,7 @@ def init_network_factory(lib: ctypes.CDLL):
         ctypes.c_char_p,
         ctypes.c_char_p,
         ctypes.c_bool,
+        ctypes.c_bool,
     ]
     lib.dq_split_linear.restype = handler
 
@@ -278,6 +280,7 @@ def init_network_factory(lib: ctypes.CDLL):
         ctypes.c_char_p,
         ctypes.c_char_p,
         ctypes.c_bool,
+        ctypes.c_bool,
     ]
     lib.dq_split_linear_prefill.restype = handler
 
@@ -332,6 +335,19 @@ def init_parameters(lib: ctypes.CDLL):
         ctypes.c_int,
     ]
 
+    lib.addAsymInt4Parameter.argtypes = [
+        handler,
+        c_u8_array,
+        c_fp16_array,
+        c_fp16_array,
+        ctypes.c_int,
+        ctypes.c_int,
+        ctypes.c_int,
+        ctypes.c_int,
+        ctypes.c_int,
+        ctypes.c_int,
+    ]
+
     lib.addIntParameterConversion.argtypes = [
         handler,
         c_i8_array,
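Note: the new addAsymInt4Parameter entry point takes a network handle, one uint8 data array, fp16 scale and zero-point arrays, and then three 2-D shapes (six c_int values). A hedged sketch of how such a binding is typically declared with numpy.ctypeslib; c_u8_array, c_fp16_array, and handler below are assumed stand-ins for the module's own helper types, which this diff does not show:

import ctypes
import numpy as np
from numpy.ctypeslib import ndpointer

c_u8_array = ndpointer(dtype=np.uint8, flags=["CONTIGUOUS"])      # assumption
c_fp16_array = ndpointer(dtype=np.float16, flags=["CONTIGUOUS"])  # assumption
handler = ctypes.POINTER(ctypes.c_char)                           # assumed opaque handle

def declare_add_asym_int4(lib: ctypes.CDLL) -> None:
    # Mirrors the argtypes block added above: handle, data, scale, zero,
    # then rows/cols for each of the three arrays.
    lib.addAsymInt4Parameter.argtypes = [
        handler,
        c_u8_array,    # quantized weight data
        c_fp16_array,  # per-channel scales
        c_fp16_array,  # per-channel zero points
        ctypes.c_int, ctypes.c_int,  # weight shape
        ctypes.c_int, ctypes.c_int,  # scale shape
        ctypes.c_int, ctypes.c_int,  # zero-point shape
    ]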
intel_npu_acceleration_library/backend/factory.py
CHANGED
@@ -378,6 +378,7 @@ class NNFactory(BaseNPUBackendWithPrefetch):
         act_dtype: npt.DTypeLike = np.float16,
         wt_dtype: npt.DTypeLike = np.float16,
         scale_factor: bool = True,
+        asym: bool=False,
     ) -> ctypes._Pointer:
         """Generate a linear layer.
 
@@ -400,7 +401,8 @@ class NNFactory(BaseNPUBackendWithPrefetch):
             bias,
             self.get_backend_dtype(act_dtype),
             self.get_backend_dtype(wt_dtype),
-            scale_factor
+            scale_factor,
+            asym
         )
 
     @return_tensor
@@ -412,6 +414,7 @@ class NNFactory(BaseNPUBackendWithPrefetch):
         scale_factor: bool = True,
         is_prefill: bool = False,
         use_dq: bool = True,
+        asym: bool = False,
     ) -> ctypes._Pointer:
         """Generate a linear layer for dynamic quantization linear layer.
 
@@ -424,7 +427,7 @@ class NNFactory(BaseNPUBackendWithPrefetch):
             act_dtype (npt.DTypeLike, optional): activation dtype. Defaults to np.float16.
             wt_dtype (npt.DTypeLike, optional): weight dtype. Defaults to np.float16.
             scale_factor (bool, optional): enable/disable mul scale factor. Default to True,
-            is_prefill (bool, optional): enable/disable prefill linear optimization. Default to
+            is_prefill (bool, optional): enable/disable prefill linear optimization. Default to False.
 
         Returns:
             ctypes._Pointer: output node
@@ -437,7 +440,7 @@ class NNFactory(BaseNPUBackendWithPrefetch):
             input_channels, outout_channels, bias,
             self.get_backend_dtype(act_dtype),
             self.get_backend_dtype(wt_dtype),
-            scale_factor)
+            scale_factor, asym)
 
     @return_tensor
     def reshape(
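Note: the factory.py changes thread one extra trailing boolean (asym) through linear and the dq_split_linear variants down to the native entry points. A hedged usage sketch; the shapes are illustrative, the import path is assumed from the package layout, and actually building the graph requires an Intel NPU with the bundled native library:

import numpy as np
from intel_npu_acceleration_library.backend import NNFactory

factory = NNFactory()             # defaults: profile=False, device="NPU"
x = factory.parameter((1, 4096))  # activation placeholder
# uint8 weights plus asym=True select the asymmetric int4 path
y = factory.linear(x, 11008, 4096, bias=False, wt_dtype=np.uint8, asym=True)
factory.compile()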
intel_npu_acceleration_library/backend/qlinear.py
CHANGED
@@ -18,6 +18,7 @@ class QLinear(NNFactory):
         profile: bool = False,
         device: str = "NPU",
         dtype: np.dtype = np.int8,
+        asym: bool = False
     ):
         """Initialize the QLinear class.
 
@@ -33,13 +34,14 @@ class QLinear(NNFactory):
         super().__init__(profile, device)
         self.inC, self.outC = inC, outC
         self.batch = batch
+        self.asym = asym
 
         input = self.parameter((self.batch, self.inC))
-        _ = self.linear(input, outC, inC, bias=False, wt_dtype=dtype)
+        _ = self.linear(input, outC, inC, bias=False, wt_dtype=dtype, asym=asym)
         self.compile()
 
     def run(
-        self, X: np.ndarray, W: np.ndarray, scale: np.ndarray, op_id: str
+        self, X: np.ndarray, W: np.ndarray, scale: np.ndarray, zero: np.ndarray=None, op_id: str=None
     ) -> np.ndarray:
         """Run the layer: $X * (W * S)^T$ .
 
@@ -67,5 +69,7 @@ class QLinear(NNFactory):
             raise RuntimeError(
                 f"Scale shape {W.shape} different from expected one {(self.outC, 1)}"
             )
-
-        return super().run(X, (W, scale), op_id=op_id)
+        if not self.asym:
+            return super().run(X, (W, scale), op_id=op_id)
+        else:
+            return super().run(X, (W, scale, zero), op_id=op_id)
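Note: with asym=True the layer expects a zero point at run time and forwards (W, scale, zero) to the base backend. A hedged usage sketch; shapes are illustrative (real uint8 weights come from the package's quantizer and may be nibble-packed) and execution requires NPU hardware:

import numpy as np
from intel_npu_acceleration_library.backend import QLinear

inC, outC, batch = 128, 128, 1
layer = QLinear(inC, outC, batch, dtype=np.uint8, asym=True)

X = np.random.rand(batch, inC).astype(np.float16)
W = np.random.randint(0, 16, (outC, inC), dtype=np.uint8)
scale = np.random.rand(outC, 1).astype(np.float16)
zero = np.random.rand(outC, 1).astype(np.float16)

out = layer.run(X, W, scale, zero=zero, op_id="0")  # asym path: (W, scale, zero)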
intel_npu_acceleration_library/backend/runtime.py
CHANGED
@@ -27,6 +27,7 @@ def run_matmul(
     x: torch.Tensor,
     weights: torch.Tensor,
     scale: Optional[torch.Tensor] = None,
+    zero: Optional[torch.Tensor] = None,
     op_id: Optional[str] = None,
 ) -> torch.Tensor:
     """Run a matmul operation. Depending on the datatype of the weights it runs a float or quantized operation.
@@ -35,6 +36,7 @@ def run_matmul(
         x (torch.Tensor): Activation tensor. Its dtype must be torch.float16
         weights (torch.Tensor): Weights tensor. Its dtype can be torch.float16 or torch.int8
         scale (Optional[torch.Tensor], optional): Quantization scale. If weights.dtype == torch.int8 then it must be set. Defaults to None.
+        zero (Optional[torch.Tensor], optional): Quantization zero for asym_int4. If weights.dtype == torch.uint8 and use asym_int4 then it must be set and asym Defaults to None.
         op_id (Optional[str], optional): Operation ID. Defaults to None.
 
     Raises:
@@ -68,12 +70,15 @@ def run_matmul(
         op_class = QLinear if op_id is not None else QMatMul
         op_class_name = op_class.__name__
         np_dtype = np.int8 if weights.dtype == torch.int8 else np.uint8
-        create_op = partial(op_class, dtype=np_dtype)
+        create_op = partial(op_class, dtype=np_dtype, asym=(zero is not None))
         if scale is None:
             raise RuntimeError(
                 f"Quantized matmul (weights dtype == {weights.dtype}) requires scale (scale = {scale})"
             )
-        op_args = [weights.numpy(), scale.numpy()]
+        if zero is None:
+            op_args = [weights.numpy(), scale.numpy()]
+        else:
+            op_args = [weights.numpy(), scale.numpy(), zero.numpy()]
     else:
         raise RuntimeError(f"Unsupported dtype for weights {weights.dtype}")
 
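Note: at the torch level, supplying a zero tensor is what turns on the asymmetric path (asym=(zero is not None)) and extends op_args with zero.numpy(). A hedged usage sketch with illustrative shapes; running it requires NPU hardware:

import torch
from intel_npu_acceleration_library.backend.runtime import run_matmul

x = torch.rand(1, 128, dtype=torch.float16)
w = torch.randint(0, 16, (128, 128), dtype=torch.uint8)  # asym int4 codes
scale = torch.rand(128, 1, dtype=torch.float16)
zero = torch.rand(128, 1, dtype=torch.float16)

# zero omitted -> op_args = [W, scale]; zero given -> [W, scale, zero] with asym=True
y = run_matmul(x, w, scale=scale, zero=zero, op_id="blk0.qkv")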
intel_npu_acceleration_library/lib/Release/intel_npu_acceleration_library.dll
CHANGED
Binary file

{bigdl_core_npu-2.6.0b20241203.dist-info → bigdl_core_npu-2.6.0b20241206.dist-info}/WHEEL
RENAMED
File without changes

{bigdl_core_npu-2.6.0b20241203.dist-info → bigdl_core_npu-2.6.0b20241206.dist-info}/top_level.txt
RENAMED
File without changes