compressed-tensors 0.12.3a20251030.tar.gz → 0.12.3a20251110.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (163)
  1. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/.github/workflows/test-check.yaml +1 -1
  2. {compressed_tensors-0.12.3a20251030/src/compressed_tensors.egg-info → compressed_tensors-0.12.3a20251110}/PKG-INFO +1 -1
  3. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/compressors/quantized_compressors/base.py +8 -1
  4. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/compressors/quantized_compressors/fp4_quantized.py +3 -2
  5. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/quantization/lifecycle/forward.py +6 -9
  6. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/quantization/lifecycle/initialize.py +11 -17
  7. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/quantization/quant_args.py +73 -8
  8. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/quantization/quant_config.py +0 -1
  9. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/quantization/quant_scheme.py +7 -0
  10. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/quantization/utils/helpers.py +45 -43
  11. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/version.py +1 -1
  12. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110/src/compressed_tensors.egg-info}/PKG-INFO +1 -1
  13. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors.egg-info/SOURCES.txt +0 -5
  14. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/tests/test_compressors/model_compressors/test_model_compressor.py +13 -2
  15. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/tests/test_quantization/lifecycle/test_apply.py +40 -7
  16. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/tests/test_quantization/lifecycle/test_dynamic_lifecycle.py +1 -1
  17. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/tests/test_quantization/lifecycle/test_initialize.py +15 -3
  18. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/tests/test_quantization/lifecycle/test_lifecycle.py +1 -1
  19. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/tests/test_quantization/lifecycle/test_static_lifecycle.py +5 -0
  20. compressed_tensors-0.12.3a20251030/.github/workflows/build-test.yml +0 -57
  21. compressed_tensors-0.12.3a20251030/.github/workflows/build.yml +0 -134
  22. compressed_tensors-0.12.3a20251030/.github/workflows/post-release-nightly-build.yml +0 -15
  23. compressed_tensors-0.12.3a20251030/.github/workflows/test.yml +0 -187
  24. compressed_tensors-0.12.3a20251030/.github/workflows/trigger-all.yml +0 -45
  25. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/.github/.gitkeep +0 -0
  26. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/.github/actions/test/action.yml +0 -0
  27. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/.github/scripts/step-status +0 -0
  28. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/.github/workflows/quality-check.yaml +0 -0
  29. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/.gitignore +0 -0
  30. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/LICENSE +0 -0
  31. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/Makefile +0 -0
  32. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/README.md +0 -0
  33. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/examples/bit_packing/ex_quantize_and_pack.py +0 -0
  34. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/examples/bit_packing/int4_config.json +0 -0
  35. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/examples/bitmask_compression.ipynb +0 -0
  36. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/examples/llama_1.1b/ex_config_quantization.py +0 -0
  37. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/examples/llama_1.1b/ex_llmcompressor_quantization.py +0 -0
  38. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/examples/llama_1.1b/example_quant_config.json +0 -0
  39. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/examples/llama_1.1b/example_quant_recipe.yaml +0 -0
  40. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/examples/quantize_and_pack_int4.ipynb +0 -0
  41. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/pyproject.toml +0 -0
  42. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/setup.cfg +0 -0
  43. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/setup.py +0 -0
  44. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/src/__init__.py +0 -0
  45. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/README.md +0 -0
  46. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/__init__.py +0 -0
  47. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/base.py +0 -0
  48. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/compressors/__init__.py +0 -0
  49. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/compressors/base.py +0 -0
  50. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/compressors/helpers.py +0 -0
  51. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/compressors/model_compressors/__init__.py +0 -0
  52. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/compressors/model_compressors/model_compressor.py +0 -0
  53. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/compressors/quantized_compressors/__init__.py +0 -0
  54. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/compressors/quantized_compressors/naive_quantized.py +0 -0
  55. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/compressors/quantized_compressors/pack_quantized.py +0 -0
  56. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/compressors/sparse_compressors/__init__.py +0 -0
  57. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/compressors/sparse_compressors/base.py +0 -0
  58. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/compressors/sparse_compressors/dense.py +0 -0
  59. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/compressors/sparse_compressors/sparse_24_bitmask.py +0 -0
  60. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/compressors/sparse_compressors/sparse_bitmask.py +0 -0
  61. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/compressors/sparse_quantized_compressors/__init__.py +0 -0
  62. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/compressors/sparse_quantized_compressors/marlin_24.py +0 -0
  63. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/config/__init__.py +0 -0
  64. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/config/base.py +0 -0
  65. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/config/dense.py +0 -0
  66. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/config/format.py +0 -0
  67. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/config/sparse_24_bitmask.py +0 -0
  68. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/config/sparse_bitmask.py +0 -0
  69. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/linear/__init__.py +0 -0
  70. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/linear/compressed_linear.py +0 -0
  71. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/logger.py +0 -0
  72. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/modeling/__init__.py +0 -0
  73. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/modeling/attention.py +0 -0
  74. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/modeling/kvcache.py +0 -0
  75. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/quantization/__init__.py +0 -0
  76. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/quantization/lifecycle/__init__.py +0 -0
  77. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/quantization/lifecycle/apply.py +0 -0
  78. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/quantization/lifecycle/compressed.py +0 -0
  79. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/quantization/lifecycle/helpers.py +0 -0
  80. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/quantization/quant_metadata.py +0 -0
  81. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/quantization/utils/__init__.py +0 -0
  82. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/quantization/utils/mxfp4_utils.py +0 -0
  83. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/registry/__init__.py +0 -0
  84. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/registry/registry.py +0 -0
  85. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/transform/__init__.py +0 -0
  86. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/transform/apply.py +0 -0
  87. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/transform/factory/__init__.py +0 -0
  88. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/transform/factory/base.py +0 -0
  89. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/transform/factory/hadamard.py +0 -0
  90. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/transform/factory/matrix_multiply.py +0 -0
  91. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/transform/factory/random_hadamard.py +0 -0
  92. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/transform/transform_args.py +0 -0
  93. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/transform/transform_config.py +0 -0
  94. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/transform/transform_scheme.py +0 -0
  95. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/transform/utils/__init__.py +0 -0
  96. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/transform/utils/hadamard.py +0 -0
  97. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/transform/utils/hadamards.safetensors +0 -0
  98. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/transform/utils/matrix.py +0 -0
  99. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/utils/__init__.py +0 -0
  100. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/utils/helpers.py +0 -0
  101. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/utils/internal.py +0 -0
  102. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/utils/match.py +0 -0
  103. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/utils/offload.py +0 -0
  104. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/utils/permutations_24.py +0 -0
  105. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/utils/safetensors_load.py +0 -0
  106. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/utils/semi_structured_conversions.py +0 -0
  107. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/utils/type.py +0 -0
  108. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors.egg-info/dependency_links.txt +0 -0
  109. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors.egg-info/requires.txt +0 -0
  110. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors.egg-info/top_level.txt +0 -0
  111. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/tests/__init__.py +0 -0
  112. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/tests/conftest.py +0 -0
  113. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/tests/mock_observer.py +0 -0
  114. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/tests/test_compressors/__init__.py +0 -0
  115. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/tests/test_compressors/model_compressors/__init__.py +0 -0
  116. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/tests/test_compressors/quantized_compressors/__init__.py +0 -0
  117. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/tests/test_compressors/quantized_compressors/test_fp4_quant.py +0 -0
  118. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/tests/test_compressors/quantized_compressors/test_fp8_quant.py +0 -0
  119. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/tests/test_compressors/quantized_compressors/test_int_quant.py +0 -0
  120. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/tests/test_compressors/quantized_compressors/test_pack_quant.py +0 -0
  121. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/tests/test_compressors/quantized_compressors/test_packed_asym_decompression.py +0 -0
  122. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/tests/test_compressors/sparse_compressors/__init__.py +0 -0
  123. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/tests/test_compressors/sparse_compressors/test_bitmask.py +0 -0
  124. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/tests/test_compressors/sparse_compressors/test_sparse_24_bitmask.py +0 -0
  125. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/tests/test_compressors/sparse_quantized_compressors/__init__.py +0 -0
  126. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/tests/test_compressors/sparse_quantized_compressors/test_marlin_24.py +0 -0
  127. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/tests/test_configs/__init__.py +0 -0
  128. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/tests/test_configs/test_base.py +0 -0
  129. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/tests/test_configs/test_infer_quant.py +0 -0
  130. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/tests/test_examples/test_bitmask_compression_ipynb.py +0 -0
  131. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/tests/test_linear/__init__.py +0 -0
  132. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/tests/test_linear/test_compressed_linear.py +0 -0
  133. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/tests/test_modeling/test_attention_and_cache.py +0 -0
  134. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/tests/test_quantization/__init__.py +0 -0
  135. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/tests/test_quantization/lifecycle/__init__.py +0 -0
  136. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/tests/test_quantization/lifecycle/conftest.py +0 -0
  137. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/tests/test_quantization/lifecycle/test_enabled.py +0 -0
  138. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/tests/test_quantization/lifecycle/test_forward.py +0 -0
  139. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/tests/test_quantization/test_configs/__init__.py +0 -0
  140. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/tests/test_quantization/test_configs/test_bit_depths.py +0 -0
  141. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/tests/test_quantization/test_configs/test_strategies.py +0 -0
  142. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/tests/test_quantization/test_quant_args.py +0 -0
  143. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/tests/test_quantization/test_quant_config.py +0 -0
  144. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/tests/test_quantization/test_quant_scheme.py +0 -0
  145. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/tests/test_quantization/test_utils/test_helpers.py +0 -0
  146. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/tests/test_quantization/test_utils/test_mxfp4_utils.py +0 -0
  147. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/tests/test_registry.py +0 -0
  148. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/tests/test_transform/conftest.py +0 -0
  149. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/tests/test_transform/factory/test_correctness.py +0 -0
  150. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/tests/test_transform/factory/test_memory.py +0 -0
  151. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/tests/test_transform/factory/test_serialization.py +0 -0
  152. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/tests/test_transform/test_transform_args.py +0 -0
  153. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/tests/test_transform/test_transform_config.py +0 -0
  154. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/tests/test_transform/test_transform_scheme.py +0 -0
  155. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/tests/test_transform/utils/test_hadamard.py +0 -0
  156. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/tests/test_utils/__init__.py +0 -0
  157. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/tests/test_utils/test_helpers.py +0 -0
  158. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/tests/test_utils/test_match.py +0 -0
  159. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/tests/test_utils/test_offload.py +0 -0
  160. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/tests/test_utils/test_safetensors_load.py +0 -0
  161. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/tests/test_utils/test_type.py +0 -0
  162. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/tests/testing_utils.py +0 -0
  163. {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251110}/utils/copyright.py +0 -0
.github/workflows/test-check.yaml
@@ -12,7 +12,7 @@ on:
 
 jobs:
   python-tests:
-    runs-on: ubuntu-22.04
+    runs-on: ibm-wdc-k8s-vllm-h100-solo
     env:
       HF_TOKEN: ${{ secrets.HF_RED_HAT_READ_ONLY }}
     steps:
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: compressed-tensors
-Version: 0.12.3a20251030
+Version: 0.12.3a20251110
 Summary: Library for utilization of compressed safetensors of neural network models
 Home-page: https://github.com/vllm-project/compressed-tensors
 Author: Neuralmagic, Inc.
src/compressed_tensors/compressors/quantized_compressors/base.py
@@ -90,7 +90,6 @@ class BaseQuantizationCompressor(BaseCompressor):
         desc = "Compressing with quantization"
         for name in tqdm(uncompressed_names, desc=desc, disable=(not show_progress)):
             value = model_state[name]
-
             # compress weights
             if name.endswith("weight"):
                 prefix = name.removesuffix("weight")
@@ -129,10 +128,18 @@ class BaseQuantizationCompressor(BaseCompressor):
             if name.endswith("zero_point") and self._skip_zp(name, names_to_scheme):
                 continue
 
+            if name.endswith("weight_scale") and self._skip_scale():
+                continue
+
             compressed_dict[name] = value.to(compression_device)
 
         return compressed_dict
 
+    def _skip_scale(self):
+        from compressed_tensors.compressors import NVFP4PackedCompressor
+
+        return isinstance(self, NVFP4PackedCompressor)
+
     def _skip_zp(
         self, name: str, names_to_scheme: Dict[str, QuantizationScheme]
     ) -> bool:
src/compressed_tensors/compressors/quantized_compressors/fp4_quantized.py
@@ -26,7 +26,7 @@ from compressed_tensors.quantization.lifecycle.forward import dequantize, quanti
 from torch import Tensor
 
 
-__all__ = ["pack_fp4_to_uint8", "unpack_fp4_from_uint8"]
+__all__ = ["pack_fp4_to_uint8", "unpack_fp4_from_uint8", "NVFP4PackedCompressor"]
 
 FLOAT_TO_E2M1 = [
     0.0,
@@ -103,6 +103,7 @@ class NVFP4PackedCompressor(BaseQuantizationCompressor):
         if device is not None:
             weight_packed = weight_packed.to(device)
         compressed_dict["weight_packed"] = weight_packed
+        compressed_dict["weight_scale"] = scale.to(quantization_args.scale_dtype)
         return compressed_dict
 
     def decompress_weight(
@@ -111,8 +112,8 @@ class NVFP4PackedCompressor(BaseQuantizationCompressor):
         quantization_args: Optional[QuantizationArgs] = None,
     ) -> torch.Tensor:
         weight = compressed_data["weight_packed"]
-        scale = compressed_data["weight_scale"]
         global_scale = compressed_data["weight_global_scale"]
+        scale = compressed_data["weight_scale"]
         m, n = weight.shape
         # TODO: use a user provided dequant dtype
         unpacked = unpack_fp4_from_uint8(weight, m, n * 2)
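
Editor's note: a quick, editorial sketch of the FP4 packing round trip that `weight_packed` relies on, using only the helpers exported above. The call shape follows the `unpack_fp4_from_uint8(weight, m, n * 2)` usage in `decompress_weight`; the sample values and the exact return dtype are assumptions, not package fixtures.

    import torch
    from compressed_tensors.compressors.quantized_compressors.fp4_quantized import (
        pack_fp4_to_uint8,
        unpack_fp4_from_uint8,
    )

    w = torch.tensor([[0.0, 0.5, -1.0, 6.0]])       # values already on the E2M1 grid
    packed = pack_fp4_to_uint8(w)                    # two 4-bit codes per uint8 byte
    restored = unpack_fp4_from_uint8(packed, 1, 4)   # m rows, n unpacked columns
    print(packed.shape, restored.to(w.dtype))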
src/compressed_tensors/quantization/lifecycle/forward.py
@@ -21,7 +21,7 @@ from compressed_tensors.quantization.quant_args import (
     DynamicType,
     QuantizationArgs,
     QuantizationStrategy,
-    round_to_quantized_type,
+    round_to_quantized_type_args,
 )
 from compressed_tensors.quantization.quant_config import QuantizationStatus
 from compressed_tensors.quantization.quant_scheme import QuantizationScheme
@@ -466,20 +466,17 @@ def _quantize(
     # if a global scale is optionally provided, use it
     # to further scale the local `scale` parameter
     if global_scale is not None:
-        scale = scale.to(global_scale.dtype) / global_scale
+        scale = scale / global_scale
 
     scaled = x / scale
 
     if zero_point is not None:
         scaled += zero_point.to(x.dtype)
 
-    # clamp first because cast isn't guaranteed to be saturated (ie for fp8)
-    clamped_value = torch.clamp(
-        scaled,
-        q_min,
-        q_max,
+    # clamp and round
+    quantized_value = round_to_quantized_type_args(
+        tensor=scaled, args=args, min=q_min, max=q_max
     )
-    quantized_value = round_to_quantized_type(clamped_value, args)
 
     if dtype is not None:
         quantized_value = quantized_value.to(dtype)
@@ -499,7 +496,7 @@ def _dequantize(
     # if a global scale is optionally provided, use it
    # to further scale the local `scale` parameter
     if global_scale is not None:
-        scale = scale.to(global_scale.dtype) / global_scale
+        scale = scale / global_scale
 
     dequant_value = x_q.to(scale.dtype)
 
src/compressed_tensors/quantization/lifecycle/initialize.py
@@ -24,7 +24,6 @@ from compressed_tensors.modeling import (
     QuantizedKVCache,
 )
 from compressed_tensors.quantization import (
-    FP8_E4M3_DATA,
     ActivationOrdering,
     DynamicType,
     QuantizationArgs,
@@ -36,7 +35,7 @@ from compressed_tensors.quantization import (
 from compressed_tensors.quantization.lifecycle.forward import (
     wrap_module_forward_quantized,
 )
-from compressed_tensors.quantization.utils import is_fp4, strategy_cdiv
+from compressed_tensors.quantization.utils import strategy_cdiv
 from compressed_tensors.utils import (
     disable_hf_hook,
     get_execution_device,
@@ -250,20 +249,13 @@ def initialize_qparams(
 
     # 2. Identify quantization scale and zp dtype
     scale_dtype = observed_dtype
-
-    if is_fp4(quantization_args=quantization_args):
-        scale_dtype = zp_dtype = FP8_E4M3_DATA.dtype
-    else:
-        # TODO: consider erroring out in the future as if the dtype if not one of these,
-        # there is likely bug
-        if scale_dtype not in [
-            torch.float16,
-            torch.bfloat16,
-            torch.float32,
-            torch.float64,
-        ]:
-            scale_dtype = torch.bfloat16
-        zp_dtype = quantization_args.pytorch_dtype()
+    if scale_dtype not in [
+        torch.float16,
+        torch.bfloat16,
+        torch.float32,
+        torch.float64,
+    ]:
+        scale_dtype = torch.float16
 
     # 3. Initializes scale/zp for the module
     init_scale = Parameter(
@@ -274,7 +266,9 @@ def initialize_qparams(
 
     if force_zero_point or not quantization_args.symmetric:
         init_zero_point = Parameter(
-            torch.zeros(expected_shape, device=device, dtype=zp_dtype),
+            torch.zeros(
+                expected_shape, device=device, dtype=quantization_args.zp_dtype
+            ),
             requires_grad=False,
         )
         register_offload_parameter(module, f"{base_name}_zero_point", init_zero_point)
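
Editor's note: a minimal sketch (not package code) of the scale-dtype fallback now used in `initialize_qparams`. Observed dtypes outside the four listed float types fall back to `torch.float16` (previously `torch.bfloat16`), and the FP4 special case is gone because FP8 scale and zero-point dtypes now arrive through `quantization_args.scale_dtype` / `zp_dtype`.

    import torch

    def pick_scale_dtype(observed_dtype: torch.dtype) -> torch.dtype:
        # mirrors the new branch: keep standard float dtypes, otherwise fall back
        if observed_dtype in (torch.float16, torch.bfloat16, torch.float32, torch.float64):
            return observed_dtype
        return torch.float16

    print(pick_scale_dtype(torch.bfloat16))       # torch.bfloat16 (kept as-is)
    print(pick_scale_dtype(torch.float8_e4m3fn))  # torch.float16 (fallback)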
src/compressed_tensors/quantization/quant_args.py
@@ -19,7 +19,15 @@ from typing import Any, Dict, List, Optional, Union
 import torch
 from compressed_tensors.utils import Aliasable
 from compressed_tensors.utils.helpers import deprecated
-from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
+from compressed_tensors.utils.type import TorchDtype
+from pydantic import (
+    BaseModel,
+    ConfigDict,
+    Field,
+    field_serializer,
+    field_validator,
+    model_validator,
+)
 
 
 __all__ = [
@@ -30,7 +38,8 @@ __all__ = [
     "QuantizationType",
     "QuantizationStrategy",
     "QuantizationArgs",
-    "round_to_quantized_type",
+    "round_to_quantized_type_args",
+    "round_to_quantized_type_dtype",
     "ActivationOrdering",
     "DynamicType",
 ]
@@ -174,6 +183,8 @@ class QuantizationArgs(BaseModel, use_enum_values=True):
     block_structure: Optional[List[int]] = None
     dynamic: Union[DynamicType, bool] = False
     actorder: Union[ActivationOrdering, bool, None] = None
+    scale_dtype: Optional[TorchDtype] = None
+    zp_dtype: Optional[TorchDtype] = None
     observer: Optional[str] = Field(
         default=None,
         description=(
@@ -189,6 +200,12 @@ class QuantizationArgs(BaseModel, use_enum_values=True):
         ),
     )
 
+    @field_serializer("zp_dtype")
+    def serialize_dtype(self, dtype: torch.dtype):
+        if self.symmetric:
+            return None
+        return str(dtype)
+
     @field_validator("type", mode="before")
     def validate_type(cls, value) -> QuantizationType:
         if isinstance(value, str):
@@ -266,6 +283,7 @@ class QuantizationArgs(BaseModel, use_enum_values=True):
         dynamic = model.dynamic
         observer = model.observer
         dynamic = model.dynamic
+        zp_dtype = model.zp_dtype
 
         # infer strategy
         if strategy is None:
@@ -353,9 +371,16 @@ class QuantizationArgs(BaseModel, use_enum_values=True):
             # default to minmax for non-dynamic cases
            observer = "minmax"
 
+        if zp_dtype is None:
+            if model.num_bits == 4 and model.type == QuantizationType.FLOAT:
+                zp_dtype = FP8_E4M3_DATA.dtype
+            else:
+                zp_dtype = model.pytorch_dtype()
+
         # write back modified values
         model.strategy = strategy
         model.observer = observer
+        model.zp_dtype = zp_dtype
         return model
 
     def pytorch_dtype(self) -> torch.dtype:
@@ -381,18 +406,56 @@ class QuantizationArgs(BaseModel, use_enum_values=True):
     model_config = ConfigDict(extra="forbid")
 
 
-def round_to_quantized_type(
-    tensor: torch.Tensor, args: QuantizationArgs
+def round_to_quantized_type_dtype(
+    tensor: torch.Tensor,
+    dtype: torch.dtype,
+    cast_to_original_dtype: Optional[bool] = True,
 ) -> torch.Tensor:
     """
-    Rounds each element of the input tensor to the nearest quantized representation,
-    keeping to original dtype
+    Rounds an input tensor to the nearest quantized representation given a dtype.
+    The original dtype is kept post-rounding.
 
     :param tensor: tensor to round
-    :param args: QuantizationArgs to pull appropriate dtype from
+    :param dtype: dtype to use for rounding
+    :param cast_to_original_dtype: whether or not we cast the rounded tensor to
+        the original dtype
     :return: rounded tensor
     """
     original_dtype = tensor.dtype
+    if torch.is_floating_point(torch.tensor([], dtype=dtype)):
+        finfo = torch.finfo(dtype)
+        rounded = torch.clamp(tensor, finfo.min, finfo.max).to(dtype)
+    else:
+        iinfo = torch.iinfo(dtype)
+        rounded = torch.round(torch.clamp(tensor, iinfo.min, iinfo.max)).to(dtype)
+
+    if cast_to_original_dtype:
+        return rounded.to(original_dtype)
+    return rounded
+
+
+def round_to_quantized_type_args(
+    tensor: torch.Tensor,
+    args: QuantizationArgs,
+    min: torch.Tensor,
+    max: torch.Tensor,
+    cast_to_original_dtype: Optional[bool] = True,
+) -> torch.Tensor:
+    """
+    Rounds an input tensor to the nearest quantized representation given
+    qunatization args. The original dtype is kept post-rounding.
+
+    :param tensor: tensor to round
+    :param args: quantization args to use for rounding
+    :param min: min value to use for clamping
+    :param max: max value to use for clamping
+    :param cast_to_original_dtype: whether or not we cast the rounded tensor to
+        the original dtype
+    :return: rounded tensor
+    """
+
+    original_dtype = tensor.dtype
+    tensor = torch.clamp(tensor, min, max)
     if args.type == QuantizationType.FLOAT:
         if args.num_bits == 8:
             rounded = tensor.to(FP8_E4M3_DATA.dtype)
@@ -405,4 +468,6 @@ def round_to_quantized_type(
     else:
         raise ValueError(f"Invalid quantization type {args.type}")
 
-    return rounded.to(original_dtype)
+    if cast_to_original_dtype:
+        return rounded.to(original_dtype)
+    return rounded
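
Editor's note: a minimal usage sketch of the two helpers that replace `round_to_quantized_type`, based only on the signatures added above; the sample tensor and the int8 case are illustrative, not from the package tests.

    import torch
    from compressed_tensors.quantization.quant_args import (
        QuantizationArgs,
        round_to_quantized_type_args,
        round_to_quantized_type_dtype,
    )

    x = torch.tensor([-3.7, 0.2, 150.0])

    # dtype-based rounding: clamp to the dtype's range, round/cast, then (by default)
    # cast back to the original dtype
    print(round_to_quantized_type_dtype(x, dtype=torch.int8))  # tensor([-4., 0., 127.])
    print(round_to_quantized_type_dtype(x, dtype=torch.int8, cast_to_original_dtype=False).dtype)

    # args-based rounding: clamp to the provided q_min/q_max first, then round according
    # to the quantization type carried by QuantizationArgs
    args = QuantizationArgs(num_bits=8, type="int", symmetric=True)
    q_min, q_max = torch.tensor(-128.0), torch.tensor(127.0)
    print(round_to_quantized_type_args(x, args=args, min=q_min, max=q_max))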
src/compressed_tensors/quantization/quant_config.py
@@ -11,7 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 from collections import defaultdict
 from enum import Enum
 from typing import Annotated, Any, Dict, List, Optional, Set, Union
src/compressed_tensors/quantization/quant_scheme.py
@@ -18,6 +18,7 @@ from typing import List, Optional
 
 from compressed_tensors.config import CompressionFormat
 from compressed_tensors.quantization.quant_args import (
+    FP8_E4M3_DATA,
     DynamicType,
     QuantizationArgs,
     QuantizationStrategy,
@@ -160,6 +161,8 @@ NVFP4A16 = dict(
         symmetric=True,
         dynamic=False,
         group_size=16,
+        scale_dtype=FP8_E4M3_DATA.dtype,
+        zp_dtype=FP8_E4M3_DATA.dtype,
     )
 )
 
@@ -173,6 +176,8 @@ NVFP4 = dict(
         dynamic=False,
         group_size=16,
         observer="static_minmax",
+        scale_dtype=FP8_E4M3_DATA.dtype,
+        zp_dtype=FP8_E4M3_DATA.dtype,
     ),
     input_activations=QuantizationArgs(
         num_bits=4,
@@ -182,6 +187,8 @@ NVFP4 = dict(
         dynamic=DynamicType.LOCAL,
         group_size=16,
         observer="static_minmax",
+        scale_dtype=FP8_E4M3_DATA.dtype,
+        zp_dtype=FP8_E4M3_DATA.dtype,
    ),
 )
 
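
Editor's note: a small editorial sketch of what these preset additions imply. `FP8_E4M3_DATA.dtype` is torch's FP8 E4M3 dtype, and, per the `quant_args.py` validator above, a 4-bit float `QuantizationArgs` resolves `zp_dtype` to that same FP8 dtype even when it is not passed explicitly; the NVFP4 presets simply pin both dtypes up front.

    import torch
    from compressed_tensors.quantization import FP8_E4M3_DATA, QuantizationArgs

    print(FP8_E4M3_DATA.dtype)  # torch.float8_e4m3fn

    # pinned explicitly, as NVFP4 / NVFP4A16 now do
    pinned = QuantizationArgs(
        num_bits=4,
        type="float",
        symmetric=True,
        group_size=16,
        scale_dtype=FP8_E4M3_DATA.dtype,
        zp_dtype=FP8_E4M3_DATA.dtype,
    )

    # left unset: the model validator falls back to FP8 for 4-bit float args
    inferred = QuantizationArgs(num_bits=4, type="float", symmetric=True, group_size=16)
    print(inferred.zp_dtype == FP8_E4M3_DATA.dtype)  # True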
src/compressed_tensors/quantization/utils/helpers.py
@@ -24,6 +24,7 @@ from compressed_tensors.quantization.quant_args import (
     QuantizationArgs,
     QuantizationStrategy,
     QuantizationType,
+    round_to_quantized_type_dtype,
 )
 from compressed_tensors.quantization.quant_scheme import QuantizationScheme
 from compressed_tensors.utils import deprecated
@@ -46,7 +47,6 @@ __all__ = [
     "calculate_range",
     "calculate_qparams",
     "generate_gparam",
-    "is_fp4",
     "strategy_cdiv",
 ]
 
@@ -57,13 +57,6 @@ KV_CACHE_TARGETS = ["re:.*self_attn$"]
 _LOGGER: logging.Logger = logging.getLogger(__name__)
 
 
-def is_fp4(quantization_args: QuantizationArgs):
-    return (
-        quantization_args.num_bits == 4
-        and quantization_args.type == QuantizationType.FLOAT
-    )
-
-
 def calculate_qparams(
     min_vals: Tensor,
     max_vals: Tensor,
@@ -92,52 +85,50 @@ def calculate_qparams(
     bit_min, bit_max = calculate_range(quantization_args, device)
     bit_range = bit_max - bit_min
 
-    if is_fp4(quantization_args=quantization_args):
-        zp_dtype = FP8_E4M3_DATA.dtype
-    else:
-        zp_dtype = quantization_args.pytorch_dtype()
-
+    # 1. Generate scale and zero-point
     if quantization_args.symmetric:
         max_val_pos = torch.max(torch.abs(min_vals), torch.abs(max_vals))
-
-        if is_fp4(quantization_args=quantization_args) and global_scale is not None:
-            # Conditionally scale the generated local scale by a global_scale
-            scales = global_scale * (max_val_pos / FP4_E2M1_DATA.max)
-            scales = torch.clamp(scales, max=FP8_E4M3_DATA.max, min=FP8_E4M3_DATA.min)
-            scales = scales.to(FP8_E4M3_DATA.dtype)
-
-        else:
-            scales = max_val_pos / (float(bit_range) / 2)
-
-        # TODO: in the case of MoEs, the global_scale may also be 0/need to be clamped
-        if scales.dtype == FP8_E4M3_DATA.dtype:
-            # torch.clamp not supported for FP8
-            # use the next largest fp8 value from 0
-            scales = torch.where(
-                scales == 0,
-                torch.tensor(0.125, dtype=FP8_E4M3_DATA.dtype, device=device),
-                scales,
-            )
-        else:
-            scales = torch.clamp(scales, min=torch.finfo(torch.float32).eps)
-
+        scales = max_val_pos / (float(bit_range) / 2)
         zero_points = torch.zeros(scales.shape, device=device, dtype=min_vals.dtype)
     else:
-        if is_fp4(quantization_args=quantization_args):
+        if (
+            quantization_args.num_bits == 4
+            and quantization_args.type == QuantizationType.FLOAT
+        ):
             raise NotImplementedError(
                 "Asymmetric Quantization is not supported for FP4"
            )
-
         scales = (max_vals - min_vals) / float(bit_range)
-        scales = torch.clamp(scales, min=torch.finfo(torch.float32).eps)
         zero_points = bit_min - (min_vals / scales)
         zero_points = torch.clamp(zero_points, bit_min, bit_max)
 
-    # match zero-points to quantized type
-    # if casting to int, use round instead of truncate
-    if quantization_args.type == QuantizationType.INT:
-        zero_points = torch.round(zero_points)
-    zero_points = zero_points.to(zp_dtype)
+    # 2. Conditionally scale the generated local scale by a global_scale
+    if global_scale is not None:
+        scales = global_scale * scales
+
+    # 3. Conditionally round the scale to the quantized dtype, if scale_dtype is set
+    if quantization_args.scale_dtype is not None:
+        scales = round_to_quantized_type_dtype(
+            scales, dtype=quantization_args.scale_dtype
+        )
+
+    # 4. Update any 0s with small values to
+    # prevent div by 0
+    eps = _get_dtype_eps(
+        dtype=quantization_args.scale_dtype
+        if quantization_args.scale_dtype is not None
+        else scales.dtype
+    )
+    scales = torch.where(
+        scales == 0,
+        torch.tensor(eps, dtype=scales.dtype, device=device),
+        scales,
+    )
+
+    # 5. Round the zp to zp_dtype
+    zero_points = round_to_quantized_type_dtype(
+        zero_points, dtype=quantization_args.zp_dtype, cast_to_original_dtype=False
+    )
 
     if scales.ndim == 0:
         scales = scales.reshape(1)
@@ -455,3 +446,14 @@ def strategy_cdiv(
         logger.bind(log_once=True).warning(message)
 
     return dividend
+
+
+def _get_dtype_eps(dtype: torch.dtype) -> float:
+    if dtype == FP8_E4M3_DATA.dtype:
+        return 0.125
+    elif dtype == FP4_E2M1_DATA.dtype:
+        return 0.25
+    elif torch.is_floating_point(torch.tensor([], dtype=dtype)):
+        return torch.finfo(dtype).eps
+    else:
+        return 1
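
Editor's note: a minimal sketch (editorial, not package code) of the zero-scale guard added above. The epsilon used to replace zero scales depends on the scale dtype: 0.125 for FP8 E4M3, 0.25 for FP4 E2M1, `torch.finfo(dtype).eps` for other float dtypes, and 1 otherwise. The simplified table below omits the FP4 branch, since that dtype representation is library-specific.

    import torch

    def dtype_eps(dtype: torch.dtype) -> float:
        # illustrative reimplementation of the eps lookup, FP4 branch omitted
        if dtype == torch.float8_e4m3fn:          # FP8_E4M3_DATA.dtype
            return 0.125
        if torch.is_floating_point(torch.tensor([], dtype=dtype)):
            return torch.finfo(dtype).eps
        return 1

    scales = torch.tensor([0.0, 0.03, 2.0])
    eps = dtype_eps(scales.dtype)
    scales = torch.where(scales == 0, torch.tensor(eps, dtype=scales.dtype), scales)
    print(scales)  # zero entries replaced so later division by the scale is safe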
src/compressed_tensors/version.py
@@ -17,5 +17,5 @@ __version__: str
 __version_tuple__: VERSION_TUPLE
 version_tuple: VERSION_TUPLE
 
-__version__ = version = '0.12.3.a20251030'
+__version__ = version = '0.12.3.a20251110'
 __version_tuple__ = version_tuple = (0, 12, 3)
src/compressed_tensors.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: compressed-tensors
-Version: 0.12.3a20251030
+Version: 0.12.3a20251110
 Summary: Library for utilization of compressed safetensors of neural network models
 Home-page: https://github.com/vllm-project/compressed-tensors
 Author: Neuralmagic, Inc.
src/compressed_tensors.egg-info/SOURCES.txt
@@ -8,13 +8,8 @@ setup.py
 .github/.gitkeep
 .github/actions/test/action.yml
 .github/scripts/step-status
-.github/workflows/build-test.yml
-.github/workflows/build.yml
-.github/workflows/post-release-nightly-build.yml
 .github/workflows/quality-check.yaml
 .github/workflows/test-check.yaml
-.github/workflows/test.yml
-.github/workflows/trigger-all.yml
 examples/bitmask_compression.ipynb
 examples/quantize_and_pack_int4.ipynb
 examples/bit_packing/ex_quantize_and_pack.py
tests/test_compressors/model_compressors/test_model_compressor.py
@@ -22,6 +22,7 @@ import torch.nn as nn
 from compressed_tensors.compressors import ModelCompressor
 from compressed_tensors.config import CompressionFormat, SparsityCompressionConfig
 from compressed_tensors.quantization import (
+    FP8_E4M3_DATA,
     QuantizationArgs,
     QuantizationConfig,
     QuantizationScheme,
@@ -425,8 +426,18 @@ def test_multiple_quant_compressors():
         format=CompressionFormat.float_quantized.value,
     )
 
-    input_activations = QuantizationArgs(num_bits=4, type="float")
-    weights = QuantizationArgs(num_bits=4, type="float")
+    input_activations = QuantizationArgs(
+        num_bits=4,
+        type="float",
+        scale_dtype=FP8_E4M3_DATA.dtype,
+        zp_dtype=FP8_E4M3_DATA.dtype,
+    )
+    weights = QuantizationArgs(
+        num_bits=4,
+        type="float",
+        scale_dtype=FP8_E4M3_DATA.dtype,
+        zp_dtype=FP8_E4M3_DATA.dtype,
+    )
 
     scheme_nvfp4 = QuantizationScheme(
         targets=["Linear"],
tests/test_quantization/lifecycle/test_apply.py
@@ -22,6 +22,7 @@ import torch
 from compressed_tensors.config import CompressionFormat
 from compressed_tensors.quantization import (
     DEFAULT_QUANTIZATION_METHOD,
+    FP8_E4M3_DATA,
     QuantizationArgs,
     QuantizationConfig,
     QuantizationScheme,
@@ -153,7 +154,11 @@ def test_apply_quantization_config_tinyllama():
             "linear": QuantizationScheme(
                 targets=["Linear"],
                 input_activations=QuantizationArgs(
-                    num_bits=8, type="float", strategy="tensor"
+                    num_bits=8,
+                    type="float",
+                    strategy="tensor",
+                    scale_dtype=FP8_E4M3_DATA.dtype,
+                    zp_dtype=torch.float,
                 ),
             )
         }
@@ -163,7 +168,11 @@ def test_apply_quantization_config_tinyllama():
             "linear": QuantizationScheme(
                 targets=["Linear"],
                 input_activations=QuantizationArgs(
-                    num_bits=8, type="float", strategy="tensor"
+                    num_bits=8,
+                    type="float",
+                    strategy="tensor",
+                    scale_dtype=FP8_E4M3_DATA.dtype,
+                    zp_dtype=torch.float,
                 ),
            )
        },
@@ -176,7 +185,11 @@ def test_apply_quantization_config_tinyllama():
         QuantizationConfig(
             config_groups={},
             kv_cache_scheme=QuantizationArgs(
-                num_bits=8, type="float", strategy="tensor"
+                num_bits=8,
+                type="float",
+                strategy="tensor",
+                scale_dtype=FP8_E4M3_DATA.dtype,
+                zp_dtype=torch.float,
             ),
         ),
         QuantizationConfig(
@@ -184,12 +197,20 @@ def test_apply_quantization_config_tinyllama():
             "attention": QuantizationScheme(
                 targets=["LlamaAttention"],
                 input_activations=QuantizationArgs(
-                    num_bits=8, type="float", strategy="tensor"
+                    num_bits=8,
+                    type="float",
+                    strategy="tensor",
+                    scale_dtype=FP8_E4M3_DATA.dtype,
+                    zp_dtype=torch.float,
                 ),
             )
         },
         kv_cache_scheme=QuantizationArgs(
-            num_bits=8, type="float", strategy="tensor"
+            num_bits=8,
+            type="float",
+            strategy="tensor",
+            scale_dtype=FP8_E4M3_DATA.dtype,
+            zp_dtype=torch.float,
         ),
     ),
 ],
@@ -448,7 +469,13 @@ def test_apply_kv_cache():
     with init_empty_weights():
         model = AutoModelForCausalLM.from_pretrained("nm-testing/llama2.c-stories15M")
 
-    args = QuantizationArgs(num_bits=8, type="float", strategy="tensor")
+    args = QuantizationArgs(
+        num_bits=8,
+        type="float",
+        strategy="tensor",
+        scale_dtype=FP8_E4M3_DATA.dtype,
+        zp_dtype=torch.float,
+    )
     config = QuantizationConfig(config_groups={}, kv_cache_scheme=args)
 
     apply_quantization_config(model, config)
@@ -468,7 +495,13 @@ def test_apply_attention():
 
     scheme = QuantizationScheme(
         targets=["LlamaAttention"],
-        input_activations=QuantizationArgs(num_bits=8, type="float", strategy="tensor"),
+        input_activations=QuantizationArgs(
+            num_bits=8,
+            type="float",
+            strategy="tensor",
+            scale_dtype=FP8_E4M3_DATA.dtype,
+            zp_dtype=torch.float,
+        ),
     )
     config = QuantizationConfig(config_groups={"attention": scheme})
 
tests/test_quantization/lifecycle/test_dynamic_lifecycle.py
@@ -79,7 +79,7 @@ def _test_layer_dynamic_quantization_status(
 def get_tinyllama_model():
     return AutoModelForCausalLM.from_pretrained(
         "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T",
-        torch_dtype="auto",
+        torch_dtype=torch.bfloat16,
     )
 