compressed-tensors 0.4.0__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package from one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.
- compressed_tensors/base.py +1 -0
- compressed_tensors/compressors/__init__.py +5 -1
- compressed_tensors/compressors/base.py +1 -1
- compressed_tensors/compressors/dense.py +1 -1
- compressed_tensors/compressors/marlin_24.py +11 -10
- compressed_tensors/compressors/model_compressor.py +33 -12
- compressed_tensors/compressors/{int_quantized.py → naive_quantized.py} +33 -15
- compressed_tensors/compressors/pack_quantized.py +58 -51
- compressed_tensors/compressors/sparse_bitmask.py +1 -1
- compressed_tensors/config/base.py +2 -0
- compressed_tensors/quantization/lifecycle/__init__.py +1 -0
- compressed_tensors/quantization/lifecycle/apply.py +161 -39
- compressed_tensors/quantization/lifecycle/calibration.py +20 -1
- compressed_tensors/quantization/lifecycle/forward.py +70 -25
- compressed_tensors/quantization/lifecycle/helpers.py +53 -0
- compressed_tensors/quantization/lifecycle/initialize.py +30 -1
- compressed_tensors/quantization/observers/base.py +39 -0
- compressed_tensors/quantization/observers/helpers.py +64 -11
- compressed_tensors/quantization/quant_args.py +45 -1
- compressed_tensors/quantization/quant_config.py +35 -2
- compressed_tensors/quantization/quant_scheme.py +105 -4
- compressed_tensors/quantization/utils/helpers.py +67 -1
- compressed_tensors/utils/__init__.py +4 -0
- compressed_tensors/utils/helpers.py +31 -2
- compressed_tensors/utils/offload.py +104 -0
- compressed_tensors/version.py +1 -1
- {compressed_tensors-0.4.0.dist-info → compressed_tensors-0.5.0.dist-info}/METADATA +2 -1
- compressed_tensors-0.5.0.dist-info/RECORD +48 -0
- {compressed_tensors-0.4.0.dist-info → compressed_tensors-0.5.0.dist-info}/WHEEL +1 -1
- compressed_tensors/compressors/utils/__init__.py +0 -19
- compressed_tensors/compressors/utils/helpers.py +0 -43
- compressed_tensors-0.4.0.dist-info/RECORD +0 -48
- /compressed_tensors/{compressors/utils → utils}/permutations_24.py +0 -0
- /compressed_tensors/{compressors/utils → utils}/semi_structured_conversions.py +0 -0
- {compressed_tensors-0.4.0.dist-info → compressed_tensors-0.5.0.dist-info}/LICENSE +0 -0
- {compressed_tensors-0.4.0.dist-info → compressed_tensors-0.5.0.dist-info}/top_level.txt +0 -0
compressed_tensors/quantization/lifecycle/apply.py

```diff
@@ -15,7 +15,9 @@
 import logging
 import re
 from collections import OrderedDict
-from typing import Dict, Iterable, Optional
+from typing import Dict, Iterable, List, Optional
+from typing import OrderedDict as OrderedDictType
+from typing import Union

 import torch
 from compressed_tensors.quantization.lifecycle.calibration import (
@@ -28,15 +30,20 @@ from compressed_tensors.quantization.lifecycle.frozen import freeze_module_quant
 from compressed_tensors.quantization.lifecycle.initialize import (
     initialize_module_for_quantization,
 )
+from compressed_tensors.quantization.quant_args import QuantizationArgs
 from compressed_tensors.quantization.quant_config import (
     QuantizationConfig,
     QuantizationStatus,
 )
+from compressed_tensors.quantization.quant_scheme import QuantizationScheme
 from compressed_tensors.quantization.utils import (
+    KV_CACHE_TARGETS,
     infer_quantization_status,
+    is_kv_cache_quant_scheme,
     iter_named_leaf_modules,
 )
 from compressed_tensors.utils.helpers import fix_fsdp_module_name
+from compressed_tensors.utils.offload import update_parameter_data
 from compressed_tensors.utils.safetensors_load import get_safetensors_folder
 from torch.nn import Module

@@ -45,7 +52,7 @@ __all__ = [
     "load_pretrained_quantization",
     "apply_quantization_config",
     "apply_quantization_status",
-    "
+    "find_name_or_class_matches",
 ]

 from compressed_tensors.quantization.utils.helpers import is_module_quantized
@@ -96,7 +103,7 @@ def load_pretrained_quantization(model: Module, model_name_or_path: str):
            )


-def apply_quantization_config(model: Module, config: QuantizationConfig):
+def apply_quantization_config(model: Module, config: QuantizationConfig) -> Dict:
     """
     Initializes the model for quantization in-place based on the given config

@@ -106,6 +113,8 @@ def apply_quantization_config(model: Module, config: QuantizationConfig):
     # build mapping of targets to schemes for easier matching
     # use ordered dict to preserve target ordering in config
     target_to_scheme = OrderedDict()
+    config = process_quantization_config(config)
+    names_to_scheme = OrderedDict()
     for scheme in config.config_groups.values():
         for target in scheme.targets:
             target_to_scheme[target] = scheme
@@ -116,13 +125,16 @@ def apply_quantization_config(model: Module, config: QuantizationConfig):
     for name, submodule in iter_named_leaf_modules(model):
         # potentially fix module name to remove FSDP wrapper prefix
         name = fix_fsdp_module_name(name)
-        if
+        if find_name_or_class_matches(name, submodule, config.ignore):
             ignored_submodules.append(name)
             continue  # layer matches ignore list, continue
-
-        if
+        targets = find_name_or_class_matches(name, submodule, target_to_scheme)
+        if targets:
             # target matched - add layer and scheme to target list
-            submodule.quantization_scheme =
+            submodule.quantization_scheme = _scheme_from_targets(
+                target_to_scheme, targets, name
+            )
+            names_to_scheme[name] = submodule.quantization_scheme.weights

     if config.ignore is not None and ignored_submodules is not None:
         if set(config.ignore) - set(ignored_submodules):
@@ -132,7 +144,42 @@ def apply_quantization_config(model: Module, config: QuantizationConfig):
                f"{set(config.ignore) - set(ignored_submodules)}"
            )
     # apply current quantization status across all targeted layers
+
     apply_quantization_status(model, config.quantization_status)
+    return names_to_scheme
+
+
+def process_quantization_config(config: QuantizationConfig) -> QuantizationConfig:
+    """
+    Preprocess the raw QuantizationConfig
+
+    :param config: the raw QuantizationConfig
+    :return: the processed QuantizationConfig
+    """
+    if config.kv_cache_scheme is not None:
+        config = process_kv_cache_config(config)
+
+    return config
+
+
+def process_kv_cache_config(
+    config: QuantizationConfig, targets: Union[List[str], str] = KV_CACHE_TARGETS
+) -> QuantizationConfig:
+    """
+    Reformulate the `config.kv_cache` as a `config_group`
+    and add it to the set of existing `config.groups`
+
+    :param config: the QuantizationConfig
+    :return: the QuantizationConfig with additional "kv_cache" group
+    """
+    kv_cache_dict = config.kv_cache_scheme.model_dump()
+    kv_cache_scheme = QuantizationScheme(
+        output_activations=QuantizationArgs(**kv_cache_dict),
+        targets=targets,
+    )
+    kv_cache_group = dict(kv_cache=kv_cache_scheme)
+    config.config_groups.update(kv_cache_group)
+    return config


 def apply_quantization_status(model: Module, status: QuantizationStatus):
@@ -148,7 +195,14 @@ def apply_quantization_status(model: Module, status: QuantizationStatus):
         model.apply(initialize_module_for_quantization)

     if current_status < status >= QuantizationStatus.CALIBRATION > current_status:
-
+        # only quantize weights up front when our end goal state is calibration,
+        # weight quantization parameters are already loaded for frozen/compressed
+        quantize_weights_upfront = status == QuantizationStatus.CALIBRATION
+        model.apply(
+            lambda module: set_module_for_calibration(
+                module, quantize_weights_upfront=quantize_weights_upfront
+            )
+        )
     if current_status < status >= QuantizationStatus.FROZEN > current_status:
         model.apply(freeze_module_quantization)

@@ -156,36 +210,45 @@ def apply_quantization_status(model: Module, status: QuantizationStatus):
         model.apply(compress_quantized_weights)


-def
+def find_name_or_class_matches(
     name: str, module: Module, targets: Iterable[str], check_contains: bool = False
-) ->
-
-
-
+) -> List[str]:
+    """
+    Returns all targets that match the given name or the class name.
+    Returns empty list otherwise.
+    The order of the output `matches` list matters.
+    The entries are sorted in the following order:
+    1. matches on exact strings
+    2. matches on regex patterns
+    3. matches on module names
+    """
+    targets = sorted(targets, key=lambda x: ("re:" in x, x))
     if isinstance(targets, Iterable):
-
+        matches = _find_matches(name, targets) + _find_matches(
            module.__class__.__name__, targets, check_contains
        )
+        matches = [match for match in matches if match is not None]
+        return matches


-def
+def _find_matches(
     value: str, targets: Iterable[str], check_contains: bool = False
-) ->
-    # returns
+) -> List[str]:
+    # returns all the targets that match value either
     # exactly or as a regex after 're:'. if check_contains is set to True,
     # additionally checks if the target string is contained with value.
-
+    matches = []
     for target in targets:
         if target.startswith("re:"):
             pattern = target[3:]
             if re.match(pattern, value):
-
+                matches.append(target)
         elif check_contains:
             if target.lower() in value.lower():
-
+                matches.append(target)
         elif target == value:
-
-    return
+            matches.append(target)
+    return matches


 def _infer_status(model: Module) -> Optional[QuantizationStatus]:
@@ -210,20 +273,79 @@ def _load_quant_args_from_state_dict(
     """
     scale_name = f"{base_name}_scale"
     zp_name = f"{base_name}_zero_point"
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+    state_dict_scale = state_dict.get(f"{module_name}.{scale_name}", None)
+    state_dict_zp = state_dict.get(f"{module_name}.{zp_name}", None)
+
+    if state_dict_scale is not None:
+        # module is quantized
+        update_parameter_data(module, state_dict_scale, scale_name)
+        if state_dict_zp is None:
+            # fill in zero point for symmetric quantization
+            state_dict_zp = torch.zeros_like(state_dict_scale, device="cpu")
+        update_parameter_data(module, state_dict_zp, zp_name)
+
+
+def _scheme_from_targets(
+    target_to_scheme: OrderedDictType[str, QuantizationScheme],
+    targets: List[str],
+    name: str,
+) -> QuantizationScheme:
+    if len(targets) == 1:
+        # if `targets` iterable contains a single element
+        # use it as the key
+        return target_to_scheme[targets[0]]
+
+    # otherwise, we need to merge QuantizationSchemes corresponding
+    # to multiple targets. This is most likely because `name` module
+    # is being target both as an ordinary quantization target, as well
+    # as kv cache quantization target
+    schemes_to_merge = [target_to_scheme[target] for target in targets]
+    return _merge_schemes(schemes_to_merge, name)
+
+
+def _merge_schemes(
+    schemes_to_merge: List[QuantizationScheme], name: str
+) -> QuantizationScheme:
+
+    kv_cache_quantization_scheme = [
+        scheme for scheme in schemes_to_merge if is_kv_cache_quant_scheme(scheme)
+    ]
+    if not kv_cache_quantization_scheme:
+        # if the schemes_to_merge do not contain any
+        # kv cache QuantizationScheme
+        # return the first scheme (the prioritized one,
+        # since the order of schemes_to_merge matters)
+        return schemes_to_merge[0]
+    else:
+        # fetch the kv cache QuantizationScheme and the highest
+        # priority non-kv cache QuantizationScheme and merge them
+        kv_cache_quantization_scheme = kv_cache_quantization_scheme[0]
+        quantization_scheme = [
+            scheme
+            for scheme in schemes_to_merge
+            if not is_kv_cache_quant_scheme(scheme)
+        ][0]
+        schemes_to_merge = [kv_cache_quantization_scheme, quantization_scheme]
+    merged_scheme = {}
+    for scheme in schemes_to_merge:
+        scheme_dict = {
+            k: v for k, v in scheme.model_dump().items() if v is not None
+        }
+        # when merging multiple schemes, the final target will be
+        # the `name` argument - hence erase the original targets
+        del scheme_dict["targets"]
+        # make sure that schemes do not "clash" with each other
+        overlapping_keys = set(merged_scheme.keys()) & set(scheme_dict.keys())
+        if overlapping_keys:
+            raise ValueError(
+                f"The module: {name} is being modified by two clashing "
+                f"quantization schemes, that jointly try to override "
+                f"properties: {overlapping_keys}. Fix the quantization config "
+                "so that it is not ambiguous."
+            )
+        merged_scheme.update(scheme_dict)
+
+    merged_scheme.update(targets=[name])
+
+    return QuantizationScheme(**merged_scheme)
```
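For illustration, the matching behavior introduced by `find_name_or_class_matches`/`_find_matches` can be reproduced standalone. The sketch below re-implements the helper logic shown in the hunk above (it is not an import of the library function), and the layer name and target list are hypothetical:

```python
import re
from typing import Iterable, List


def _find_matches(value: str, targets: Iterable[str], check_contains: bool = False) -> List[str]:
    # mirror of the diff above: collect every target that matches `value`,
    # either exactly or as a regex after the "re:" prefix
    matches = []
    for target in targets:
        if target.startswith("re:"):
            if re.match(target[3:], value):
                matches.append(target)
        elif check_contains:
            if target.lower() in value.lower():
                matches.append(target)
        elif target == value:
            matches.append(target)
    return matches


# hypothetical module name and targets; "re:" entries sort after exact names,
# so an exact string match takes the first position in the result
targets = ["re:.*q_proj", "model.layers.0.self_attn.q_proj"]
targets = sorted(targets, key=lambda x: ("re:" in x, x))
print(_find_matches("model.layers.0.self_attn.q_proj", targets))
# ['model.layers.0.self_attn.q_proj', 're:.*q_proj']
```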
compressed_tensors/quantization/lifecycle/calibration.py

```diff
@@ -16,6 +16,7 @@
 import logging

 from compressed_tensors.quantization.quant_config import QuantizationStatus
+from compressed_tensors.utils import is_module_offloaded, update_parameter_data
 from torch.nn import Module


@@ -27,7 +28,7 @@ __all__ = [
 _LOGGER = logging.getLogger(__name__)


-def set_module_for_calibration(module: Module):
+def set_module_for_calibration(module: Module, quantize_weights_upfront: bool = True):
     """
     marks a layer as ready for calibration which activates observers
     to update scales and zero points on each forward pass
@@ -35,6 +36,8 @@ def set_module_for_calibration(module: Module):
     apply to full model with `model.apply(set_module_for_calibration)`

     :param module: module to set for calibration
+    :param quantize_weights_upfront: whether to automatically run weight quantization at the
+        start of calibration
     """
     if not getattr(module, "quantization_scheme", None):
         # no quantization scheme nothing to do
@@ -48,4 +51,20 @@ def set_module_for_calibration(module: Module):
            "to re-calibrate a frozen module"
        )

+    if quantize_weights_upfront and module.quantization_scheme.weights is not None:
+        # set weight scale and zero_point up front, calibration data doesn't affect it
+        observer = module.weight_observer
+
+        offloaded = False
+        if is_module_offloaded(module):
+            module._hf_hook.pre_forward(module)
+            offloaded = True
+
+        scale, zero_point = observer(module.weight)
+        update_parameter_data(module, scale, "weight_scale")
+        update_parameter_data(module, zero_point, "weight_zero_point")
+
+        if offloaded:
+            module._hf_hook.post_forward(module, None)
+
     module.quantization_status = QuantizationStatus.CALIBRATION
```
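As a hedged usage sketch (not a verified end-to-end script): the import paths and function names below come from the hunks above, while `model`, `config`, and `calibration_dataloader` are placeholders assumed to already exist. Moving a model into the CALIBRATION status now also quantizes weights up front via `set_module_for_calibration(..., quantize_weights_upfront=True)`.

```python
from compressed_tensors.quantization.lifecycle.apply import (
    apply_quantization_config,
    apply_quantization_status,
)
from compressed_tensors.quantization.quant_config import QuantizationStatus

# apply_quantization_config now returns a mapping of module names to weight
# quantization args (see the apply.py hunk above)
names_to_scheme = apply_quantization_config(model, config)

# entering CALIBRATION sets weight scales/zero points immediately and turns
# on observers for activations
apply_quantization_status(model, QuantizationStatus.CALIBRATION)

for batch in calibration_dataloader:  # placeholder calibration data
    model(**batch)

# freeze observers once calibration statistics have been collected
apply_quantization_status(model, QuantizationStatus.FROZEN)
```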
compressed_tensors/quantization/lifecycle/forward.py

```diff
@@ -17,12 +17,15 @@ from math import ceil
 from typing import Optional

 import torch
+from compressed_tensors.quantization.observers.helpers import calculate_range
 from compressed_tensors.quantization.quant_args import (
     QuantizationArgs,
     QuantizationStrategy,
+    round_to_quantized_type,
 )
 from compressed_tensors.quantization.quant_config import QuantizationStatus
 from compressed_tensors.quantization.quant_scheme import QuantizationScheme
+from compressed_tensors.utils import update_parameter_data
 from torch.nn import Module


@@ -80,8 +83,9 @@ def quantize(
 def dequantize(
     x_q: torch.Tensor,
     scale: torch.Tensor,
-    zero_point: torch.Tensor,
+    zero_point: torch.Tensor = None,
     args: QuantizationArgs = None,
+    dtype: Optional[torch.dtype] = None,
 ) -> torch.Tensor:
     """
     Dequantize a quantized input tensor x_q based on the strategy specified in args. If
@@ -91,6 +95,7 @@ def dequantize(
     :param scale: scale tensor
     :param zero_point: zero point tensor
     :param args: quantization args used to quantize x_q
+    :param dtype: optional dtype to cast the dequantized output to
     :return: dequantized float tensor
     """
     if args is None:
@@ -107,8 +112,12 @@ def dequantize(
     else:
         raise ValueError(
             f"Could not infer a quantization strategy from scale with {scale.ndim} "
-            "dimmensions. Expected 0
+            "dimmensions. Expected 0 or 2 dimmensions."
         )
+
+    if dtype is None:
+        dtype = scale.dtype
+
     return _process_quantization(
         x=x_q,
         scale=scale,
@@ -116,6 +125,7 @@
         args=args,
         do_quantize=False,
         do_dequantize=True,
+        dtype=dtype,
     )


@@ -159,19 +169,13 @@ def _process_quantization(
     do_quantize: bool = True,
     do_dequantize: bool = True,
 ) -> torch.Tensor:
-
-    q_max =
-    q_min = torch.tensor(-bit_range / 2, device=x.device)
+
+    q_min, q_max = calculate_range(args, x.device)
     group_size = args.group_size

     if args.strategy == QuantizationStrategy.GROUP:
-
-
-            # if dequantizing a quantized type infer the output type from the scale
-            output = torch.zeros_like(x, dtype=scale.dtype)
-        else:
-            output_dtype = dtype if dtype is not None else x.dtype
-            output = torch.zeros_like(x, dtype=output_dtype)
+        output_dtype = dtype if dtype is not None else x.dtype
+        output = torch.zeros_like(x).to(output_dtype)

         # TODO: vectorize the for loop
         # TODO: fix genetric assumption about the tensor size for computing group
@@ -181,7 +185,7 @@
         while scale.ndim < 2:
             # pad scale and zero point dims for slicing
             scale = scale.unsqueeze(1)
-            zero_point = zero_point.unsqueeze(1)
+            zero_point = zero_point.unsqueeze(1) if zero_point is not None else None

         columns = x.shape[1]
         if columns >= group_size:
@@ -194,12 +198,18 @@
                # scale.shape should be [nchan, ndim]
                # sc.shape should be [nchan, 1] after unsqueeze
                sc = scale[:, i].view(-1, 1)
-                zp = zero_point[:, i].view(-1, 1)
+                zp = zero_point[:, i].view(-1, 1) if zero_point is not None else None

                idx = i * group_size
                if do_quantize:
                    output[:, idx : (idx + group_size)] = _quantize(
-                        x[:, idx : (idx + group_size)],
+                        x[:, idx : (idx + group_size)],
+                        sc,
+                        zp,
+                        q_min,
+                        q_max,
+                        args,
+                        dtype=dtype,
                    )
                if do_dequantize:
                    input = (
@@ -211,7 +221,15 @@

     else:  # covers channel, token and tensor strategies
         if do_quantize:
-            output = _quantize(
+            output = _quantize(
+                x,
+                scale,
+                zero_point,
+                q_min,
+                q_max,
+                args,
+                dtype=dtype,
+            )
         if do_dequantize:
             output = _dequantize(output if do_quantize else x, scale, zero_point)

@@ -228,6 +246,11 @@ def wrap_module_forward_quantized(module: Module, scheme: QuantizationScheme):

     @wraps(forward_func_orig)  # ensures docstring, names, etc are propagated
     def wrapped_forward(self, *args, **kwargs):
+        if not getattr(module, "quantization_enabled", True):
+            # quantization is disabled on forward passes, return baseline
+            # forward call
+            return forward_func_orig.__get__(module, module.__class__)(*args, **kwargs)
+
         input_ = args[0]

         if scheme.input_activations is not None:
@@ -276,6 +299,11 @@ def maybe_calibrate_or_quantize(
     }:
         return value

+    if value.numel() == 0:
+        # if the tensor is empty,
+        # skip quantization
+        return value
+
     if args.dynamic:
         # dynamic quantization - get scale and zero point directly from observer
         observer = getattr(module, f"{base_name}_observer")
@@ -285,16 +313,19 @@
         scale = getattr(module, f"{base_name}_scale")
         zero_point = getattr(module, f"{base_name}_zero_point")

-    if
+    if (
+        module.quantization_status == QuantizationStatus.CALIBRATION
+        and base_name != "weight"
+    ):
         # calibration mode - get new quant params from observer
         observer = getattr(module, f"{base_name}_observer")

         updated_scale, updated_zero_point = observer(value)

         # update scale and zero point
-
-
-
+        update_parameter_data(module, updated_scale, f"{base_name}_scale")
+        update_parameter_data(module, updated_zero_point, f"{base_name}_zero_point")
+
     return fake_quantize(value, scale, zero_point, args)


@@ -305,14 +336,18 @@ def _quantize(
     zero_point: torch.Tensor,
     q_min: torch.Tensor,
     q_max: torch.Tensor,
+    args: QuantizationArgs,
     dtype: Optional[torch.dtype] = None,
 ) -> torch.Tensor:
-
-
+
+    scaled = x / scale + zero_point.to(x.dtype)
+    # clamp first because cast isn't guaranteed to be saturated (ie for fp8)
+    clamped_value = torch.clamp(
+        scaled,
         q_min,
         q_max,
     )
-
+    quantized_value = round_to_quantized_type(clamped_value, args)
     if dtype is not None:
         quantized_value = quantized_value.to(dtype)

@@ -323,6 +358,16 @@ def _quantize(
 def _dequantize(
     x_q: torch.Tensor,
     scale: torch.Tensor,
-    zero_point: torch.Tensor,
+    zero_point: torch.Tensor = None,
+    dtype: Optional[torch.dtype] = None,
 ) -> torch.Tensor:
-
+
+    dequant_value = x_q
+    if zero_point is not None:
+        dequant_value = dequant_value - zero_point.to(scale.dtype)
+    dequant_value = dequant_value.to(scale.dtype) * scale
+
+    if dtype is not None:
+        dequant_value = dequant_value.to(dtype)
+
+    return dequant_value
```
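The arithmetic in `_quantize`/`_dequantize` above reduces to `clamp(x / scale + zero_point)` followed by rounding on the way in, and `(x_q - zero_point) * scale` on the way out, with the zero point now optional for symmetric schemes. Below is a minimal standalone round-trip sketch in plain torch (these are toy functions, not the library's, and they assume an 8-bit integer range):

```python
import torch


def toy_quantize(x, scale, zero_point=None, q_min=-128, q_max=127):
    # mirror of _quantize above: scale, shift, clamp, then round onto the
    # quantized integer grid
    zp = zero_point if zero_point is not None else torch.zeros_like(scale)
    scaled = x / scale + zp
    return torch.round(torch.clamp(scaled, q_min, q_max))


def toy_dequantize(x_q, scale, zero_point=None):
    # mirror of _dequantize above: subtract the (optional) zero point and
    # rescale back to the floating point domain
    dq = x_q
    if zero_point is not None:
        dq = dq - zero_point
    return dq * scale


x = torch.tensor([0.05, -0.30, 0.72])
scale = torch.tensor(0.01)
x_q = toy_quantize(x, scale)        # tensor([  5., -30.,  72.])
x_hat = toy_dequantize(x_q, scale)  # tensor([ 0.0500, -0.3000,  0.7200])
print(x_q, x_hat)
```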
compressed_tensors/quantization/lifecycle/helpers.py (new file)

```diff
@@ -0,0 +1,53 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Miscelaneous helpers for the quantization lifecycle
+"""
+
+
+from torch.nn import Module
+
+
+__all__ = [
+    "update_layer_weight_quant_params",
+    "enable_quantization",
+    "disable_quantization",
+]
+
+
+def update_layer_weight_quant_params(layer: Module):
+    weight = getattr(layer, "weight", None)
+    scale = getattr(layer, "weight_scale", None)
+    zero_point = getattr(layer, "weight_zero_point", None)
+    observer = getattr(layer, "weight_observer", None)
+
+    if weight is None or observer is None or scale is None or zero_point is None:
+        # scale, zp, or observer not calibratable or weight not available
+        return
+
+    updated_scale, updated_zero_point = observer(weight)
+
+    # update scale and zero point
+    device = next(layer.parameters()).device
+    scale.data = updated_scale.to(device)
+    zero_point.data = updated_zero_point.to(device)
+
+
+def enable_quantization(module: Module):
+    module.quantization_enabled = True
+
+
+def disable_quantization(module: Module):
+    module.quantization_enabled = False
```
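The wrapped forward pass added in forward.py checks a `quantization_enabled` attribute, which these new helpers toggle. A hedged sketch of how they could be applied model-wide, assuming the module path shown in the new file and an already-initialized `model` and `sample_input` (both placeholders):

```python
from compressed_tensors.quantization.lifecycle.helpers import (
    disable_quantization,
    enable_quantization,
)

# temporarily bypass the quantized forward wrapper on every submodule,
# e.g. to compare against the unquantized baseline
model.apply(disable_quantization)
baseline_output = model(sample_input)  # placeholder input

# re-enable fake quantization afterwards
model.apply(enable_quantization)
quantized_output = model(sample_input)
```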
compressed_tensors/quantization/lifecycle/initialize.py

```diff
@@ -17,6 +17,8 @@ import logging
 from typing import Optional

 import torch
+from accelerate.hooks import add_hook_to_module, remove_hook_from_module
+from accelerate.utils import PrefixedDataset
 from compressed_tensors.quantization.lifecycle.forward import (
     wrap_module_forward_quantized,
 )
@@ -26,6 +28,7 @@ from compressed_tensors.quantization.quant_args import (
 )
 from compressed_tensors.quantization.quant_config import QuantizationStatus
 from compressed_tensors.quantization.quant_scheme import QuantizationScheme
+from compressed_tensors.utils import get_execution_device, is_module_offloaded
 from torch.nn import Module, Parameter


@@ -81,9 +84,32 @@ def initialize_module_for_quantization(
     module.quantization_scheme = scheme
     module.quantization_status = QuantizationStatus.INITIALIZED

+    offloaded = False
+    if is_module_offloaded(module):
+        offloaded = True
+        hook = module._hf_hook
+        prefix_dict = module._hf_hook.weights_map
+        new_prefix = {}
+
+        # recreate the prefix dict (since it is immutable)
+        # and add quantization parameters
+        for key, data in module.named_parameters():
+            if key not in prefix_dict:
+                new_prefix[f"{prefix_dict.prefix}{key}"] = data
+            else:
+                new_prefix[f"{prefix_dict.prefix}{key}"] = prefix_dict[key]
+        new_prefix_dict = PrefixedDataset(new_prefix, prefix_dict.prefix)
+        remove_hook_from_module(module)
+
     # wrap forward call of module to perform quantized actions based on calltime status
     wrap_module_forward_quantized(module, scheme)

+    if offloaded:
+        # we need to re-add the hook for offloading now that we've wrapped forward
+        add_hook_to_module(module, hook)
+        if prefix_dict is not None:
+            module._hf_hook.weights_map = new_prefix_dict
+

 def _initialize_scale_zero_point_observer(
     module: Module,
@@ -99,6 +125,8 @@ def _initialize_scale_zero_point_observer(
         return  # no need to register a scale and zero point for a dynamic observer

     device = next(module.parameters()).device
+    if is_module_offloaded(module):
+        device = get_execution_device(module)

     # infer expected scale/zero point shape
     expected_shape = 1  # per tensor
@@ -120,8 +148,9 @@ def _initialize_scale_zero_point_observer(
     )
     module.register_parameter(f"{base_name}_scale", init_scale)

+    zp_dtype = quantization_args.pytorch_dtype()
     init_zero_point = Parameter(
-        torch.empty(expected_shape, device=device, dtype=
+        torch.empty(expected_shape, device=device, dtype=zp_dtype),
         requires_grad=False,
     )
     module.register_parameter(f"{base_name}_zero_point", init_zero_point)
```