compressed-tensors 0.4.0__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- compressed_tensors/base.py +1 -0
- compressed_tensors/compressors/__init__.py +5 -1
- compressed_tensors/compressors/base.py +200 -8
- compressed_tensors/compressors/dense.py +1 -1
- compressed_tensors/compressors/marlin_24.py +11 -10
- compressed_tensors/compressors/model_compressor.py +101 -13
- compressed_tensors/compressors/naive_quantized.py +140 -0
- compressed_tensors/compressors/pack_quantized.py +128 -132
- compressed_tensors/compressors/sparse_bitmask.py +1 -1
- compressed_tensors/config/base.py +8 -1
- compressed_tensors/{compressors/utils → linear}/__init__.py +0 -6
- compressed_tensors/linear/compressed_linear.py +87 -0
- compressed_tensors/quantization/lifecycle/__init__.py +1 -0
- compressed_tensors/quantization/lifecycle/apply.py +204 -44
- compressed_tensors/quantization/lifecycle/calibration.py +22 -2
- compressed_tensors/quantization/lifecycle/compressed.py +3 -1
- compressed_tensors/quantization/lifecycle/forward.py +139 -61
- compressed_tensors/quantization/lifecycle/helpers.py +80 -0
- compressed_tensors/quantization/lifecycle/initialize.py +77 -13
- compressed_tensors/quantization/observers/__init__.py +1 -0
- compressed_tensors/quantization/observers/base.py +93 -14
- compressed_tensors/quantization/observers/helpers.py +64 -11
- compressed_tensors/quantization/observers/min_max.py +8 -0
- compressed_tensors/quantization/observers/mse.py +162 -0
- compressed_tensors/quantization/quant_args.py +139 -23
- compressed_tensors/quantization/quant_config.py +35 -2
- compressed_tensors/quantization/quant_scheme.py +112 -13
- compressed_tensors/quantization/utils/helpers.py +68 -2
- compressed_tensors/utils/__init__.py +5 -0
- compressed_tensors/utils/helpers.py +44 -2
- compressed_tensors/utils/offload.py +116 -0
- compressed_tensors/utils/permute.py +70 -0
- compressed_tensors/utils/safetensors_load.py +2 -0
- compressed_tensors/{compressors/utils → utils}/semi_structured_conversions.py +1 -0
- compressed_tensors/version.py +1 -1
- {compressed_tensors-0.4.0.dist-info → compressed_tensors-0.6.0.dist-info}/METADATA +35 -22
- compressed_tensors-0.6.0.dist-info/RECORD +52 -0
- {compressed_tensors-0.4.0.dist-info → compressed_tensors-0.6.0.dist-info}/WHEEL +1 -1
- compressed_tensors/compressors/int_quantized.py +0 -126
- compressed_tensors/compressors/utils/helpers.py +0 -43
- compressed_tensors-0.4.0.dist-info/RECORD +0 -48
- /compressed_tensors/{compressors/utils → utils}/permutations_24.py +0 -0
- {compressed_tensors-0.4.0.dist-info → compressed_tensors-0.6.0.dist-info}/LICENSE +0 -0
- {compressed_tensors-0.4.0.dist-info → compressed_tensors-0.6.0.dist-info}/top_level.txt +0 -0
compressed_tensors/quantization/quant_config.py CHANGED

@@ -16,6 +16,7 @@ from enum import Enum
 from typing import Dict, List, Optional, Union
 
 from compressed_tensors.config import CompressionFormat
+from compressed_tensors.quantization.quant_args import QuantizationArgs
 from compressed_tensors.quantization.quant_scheme import (
     QuantizationScheme,
     preset_name_to_scheme,
@@ -25,6 +26,7 @@ from compressed_tensors.quantization.utils import (
     is_module_quantized,
     iter_named_leaf_modules,
     module_type,
+    parse_out_kv_cache_args,
 )
 from pydantic import BaseModel, Field
 from torch.nn import Module
@@ -117,7 +119,18 @@ class QuantizationConfig(BaseModel):
         other quantization configs
     :param format: specifies how the quantized model is stored on disk
     :quantization_status: specifies the current status of all quantized layers. It is
-        assumed all layers are in the same state.
+        assumed all layers are in the same state.
+    :param kv_cache_scheme: optional QuantizationArgs, that specify the
+        quantization of the kv cache. If None, kv cache is not quantized.
+        When applying kv cache quantization to transformer AutoModelForCausalLM,
+        the kv_cache_scheme gets converted into a QuantizationScheme that:
+            - targets the `q_proj` and `k_proj` modules of the model. The outputs
+              of those modules are the keys and values that might be cached
+            - quantizes the outputs of the aformentioned layers, so that
+              keys and values are compressed before storing them in the cache
+        There is an explicit assumption that the model contains modules with
+        `k_proj` and `v_proj` in their names. If this is not the case
+        and kv_cache_scheme != None, the quantization of kv cache will fail
     :global_compression_ratio: optional informational config to report the model
         compression ratio acheived by the quantization config
     :ignore: optional list of layers to ignore from config_groups. Layers in this list
@@ -126,6 +139,7 @@ class QuantizationConfig(BaseModel):
 
     config_groups: Dict[str, Union[QuantizationScheme, List[str]]]
     quant_method: str = DEFAULT_QUANTIZATION_METHOD
+    kv_cache_scheme: Optional[QuantizationArgs] = None
     format: str = DEFAULT_QUANTIZATION_FORMAT
     quantization_status: QuantizationStatus = QuantizationStatus.INITIALIZED
     global_compression_ratio: Optional[float] = None
@@ -154,7 +168,7 @@ class QuantizationConfig(BaseModel):
     ) -> Optional["QuantizationConfig"]:
         """
         Converts a model into its associated QuantizationConfig based on the
-        QuantizationScheme attached to each
+        QuantizationScheme attached to each quantized module
 
         :param model: model to calculate quantization scheme of
         :return: filled out QuantizationScheme for the input model
@@ -195,6 +209,13 @@ class QuantizationConfig(BaseModel):
         # else we leave it off the ignore list, doesn't fall under any of the
         # existing quantization schemes so it won't be quantized
 
+        kv_cache_args, quant_scheme_to_layers = parse_out_kv_cache_args(
+            quant_scheme_to_layers
+        )
+        kv_cache_scheme = (
+            kv_cache_args.model_dump() if kv_cache_args is not None else kv_cache_args
+        )
+
         config_groups = {}
         for idx, scheme in enumerate(quant_scheme_to_layers):
             group_name = "group_" + str(idx)
@@ -213,7 +234,19 @@ class QuantizationConfig(BaseModel):
         return QuantizationConfig(
             config_groups=config_groups,
             quantization_status=quantization_status,
+            kv_cache_scheme=kv_cache_scheme,
             global_compression_ratio=compression_ratio,
             format=format,
             ignore=consolidated_ignore,
         )
+
+    def requires_calibration_data(self):
+        for _, scheme in self.config_groups.items():
+            if scheme.input_activations is not None:
+                if not scheme.input_activations.dynamic:
+                    return True
+            if scheme.output_activations is not None:
+                if not scheme.output_activations.dynamic:
+                    return True
+
+        return False
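The new kv_cache_scheme field and requires_calibration_data() helper can be exercised roughly as below. This is an illustrative sketch, not code from the package docs; it assumes QuantizationArgs, QuantizationScheme, and QuantizationConfig remain importable from compressed_tensors.quantization as in earlier releases:

    from compressed_tensors.quantization import (
        QuantizationArgs,
        QuantizationConfig,
        QuantizationScheme,
    )

    # one group quantizing Linear weights to int8, plus an int8 kv cache scheme
    config = QuantizationConfig(
        config_groups={
            "group_0": QuantizationScheme(
                targets=["Linear"],
                weights=QuantizationArgs(num_bits=8, symmetric=True),
            )
        },
        kv_cache_scheme=QuantizationArgs(num_bits=8, symmetric=True),
    )

    # only static (non-dynamic) activation quantization needs calibration data,
    # so a weight-only group reports False
    print(config.requires_calibration_data())  # False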
compressed_tensors/quantization/quant_scheme.py CHANGED

@@ -15,7 +15,11 @@
 from copy import deepcopy
 from typing import List, Optional
 
-from compressed_tensors.quantization.quant_args import QuantizationArgs
+from compressed_tensors.quantization.quant_args import (
+    QuantizationArgs,
+    QuantizationStrategy,
+    QuantizationType,
+)
 from pydantic import BaseModel
 
 
@@ -53,15 +57,9 @@ class QuantizationScheme(BaseModel):
             # default to quantizing all Linear layers
             targets = ["Linear"]
 
-        # default
-
-
-
-        # default to 8 bit integer asymmetric quantization
-        input_activations = QuantizationArgs(num_bits=8, symmetric=True)
-
-        # Do not quantize the output activations
-        # by default
+        # by default, activations and weights are left unquantized
+        weights = None
+        input_activations = None
         output_activations = None
 
         return cls(
@@ -107,13 +105,114 @@ def is_preset_scheme(name: str) -> bool:
     return name.upper() in PRESET_SCHEMES
 
 
+UNQUANTIZED = dict()
+
+# 8 bit integer weights and 8 bit activations quantization
 W8A8 = dict(
-    weights=QuantizationArgs(
+    weights=QuantizationArgs(
+        num_bits=8,
+        type=QuantizationType.INT,
+        strategy=QuantizationStrategy.CHANNEL,
+        symmetric=True,
+        dynamic=False,
+    ),
+    input_activations=QuantizationArgs(
+        num_bits=8,
+        type=QuantizationType.INT,
+        strategy=QuantizationStrategy.TOKEN,
+        symmetric=True,
+        dynamic=True,
+    ),
 )
 
-
+# 8 bit integer weights only quantization
+W8A16 = dict(
+    weights=QuantizationArgs(
+        num_bits=8,
+        type=QuantizationType.INT,
+        strategy=QuantizationStrategy.CHANNEL,
+        symmetric=True,
+        dynamic=False,
+    ),
+)
+
+# 4 bit integer weights only quantization
+W4A16 = dict(
+    weights=QuantizationArgs(
+        num_bits=4,
+        type=QuantizationType.INT,
+        strategy=QuantizationStrategy.GROUP,
+        group_size=128,
+        symmetric=True,
+        dynamic=False,
+    ),
+)
+
+# 4 bit integer weights and 8 bit activations quantization
+W4A8 = dict(
+    weights=QuantizationArgs(
+        num_bits=4,
+        type=QuantizationType.INT,
+        group_size=128,
+        strategy=QuantizationStrategy.GROUP,
+        symmetric=True,
+        dynamic=False,
+    ),
+    input_activations=QuantizationArgs(
+        num_bits=8,
+        type=QuantizationType.INT,
+        strategy=QuantizationStrategy.TOKEN,
+        symmetric=True,
+        dynamic=True,
+    ),
+)
+
+# FP8 weights and FP8 activations quantization
+FP8 = dict(
+    weights=QuantizationArgs(
+        num_bits=8,
+        type=QuantizationType.FLOAT,
+        strategy=QuantizationStrategy.TENSOR,
+        symmetric=True,
+        dynamic=False,
+    ),
+    input_activations=QuantizationArgs(
+        num_bits=8,
+        type=QuantizationType.FLOAT,
+        strategy=QuantizationStrategy.TENSOR,
+        symmetric=True,
+        dynamic=False,
+    ),
+)
+
+# FP8 weights and FP8 dynamic activations quantization
+FP8_DYNAMIC = dict(
+    weights=QuantizationArgs(
+        num_bits=8,
+        type=QuantizationType.FLOAT,
+        strategy=QuantizationStrategy.CHANNEL,
+        symmetric=True,
+        dynamic=False,
+    ),
+    input_activations=QuantizationArgs(
+        num_bits=8,
+        type=QuantizationType.FLOAT,
+        strategy=QuantizationStrategy.TOKEN,
+        symmetric=True,
+        dynamic=True,
+    ),
+)
 
 PRESET_SCHEMES = {
-
+    # Unquantized (no-op)
+    "UNQUANTIZED": UNQUANTIZED,
+    # Integer weight only schemes
+    "W8A16": W8A16,
     "W4A16": W4A16,
+    # Integer weight and activation schemes
+    "W8A8": W8A8,
+    "W4A8": W4A8,
+    # Float weight and activation schemes
+    "FP8": FP8,
+    "FP8_DYNAMIC": FP8_DYNAMIC,
 }
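The new presets register under the names shown in PRESET_SCHEMES, so they can be resolved by name. A minimal sketch, not taken from the package docs, assuming preset_name_to_scheme keeps its existing (name, targets) signature:

    from compressed_tensors.quantization.quant_scheme import (
        is_preset_scheme,
        preset_name_to_scheme,
    )

    assert is_preset_scheme("FP8_DYNAMIC")

    # expand the FP8_DYNAMIC preset into a concrete scheme for all Linear layers
    scheme = preset_name_to_scheme("FP8_DYNAMIC", targets=["Linear"])
    print(scheme.weights.strategy)           # channel-wise FP8 weights
    print(scheme.input_activations.dynamic)  # dynamic per-token activations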
compressed_tensors/quantization/utils/helpers.py CHANGED

@@ -13,10 +13,13 @@
 # limitations under the License.
 
 import logging
-
+import re
+from typing import List, Optional, Tuple
 
 import torch
 from compressed_tensors.quantization.observers.base import Observer
+from compressed_tensors.quantization.quant_args import QuantizationArgs
+from compressed_tensors.quantization.quant_scheme import QuantizationScheme
 from torch.nn import Module
 from tqdm import tqdm
 
@@ -30,8 +33,12 @@ __all__ = [
     "calculate_compression_ratio",
     "get_torch_bit_depth",
     "can_quantize",
+    "parse_out_kv_cache_args",
+    "KV_CACHE_TARGETS",
+    "is_kv_cache_quant_scheme",
 ]
 
+KV_CACHE_TARGETS = ["re:.*k_proj", "re:.*v_proj"]
 _LOGGER: logging.Logger = logging.getLogger(__name__)
 
 
@@ -174,7 +181,7 @@ def calculate_compression_ratio(model: Module) -> float:
         for parameter in model.parameters():
             uncompressed_bits = get_torch_bit_depth(parameter)
             compressed_bits = uncompressed_bits
-            if is_module_quantized(submodule):
+            if is_module_quantized(submodule) and submodule.quantization_scheme.weights:
                 compressed_bits = submodule.quantization_scheme.weights.num_bits
 
             num_weights = parameter.numel()
@@ -182,3 +189,62 @@ def calculate_compression_ratio(model: Module) -> float:
         total_uncompressed += uncompressed_bits * num_weights
 
     return total_uncompressed / total_compressed
+
+
+def is_kv_cache_quant_scheme(scheme: QuantizationScheme) -> bool:
+    """
+    Check whether the QuantizationScheme targets the kv cache.
+    It does if all the following criteria are met:
+    - the scheme targets either exactly match the KV_CACHE_TARGETS
+        or the match KV_CACHE_TARGETS regex pattern
+    - the scheme quantizes output_activations (we want to quantize the
+        outputs from the KV_CACHE_TARGETS, as their correspond to the
+        keys and values that are to be saved in the cache)
+
+    :param scheme: The QuantizationScheme to investigate
+    :return: boolean flag
+    """
+    if len(scheme.targets) == 1:
+        # match on the KV_CACHE_TARGETS regex pattern
+        # if there is only one target
+        is_match_targets = any(
+            [re.match(pattern[3:], scheme.targets[0]) for pattern in KV_CACHE_TARGETS]
+        )
+    else:
+        # match on the exact KV_CACHE_TARGETS
+        # if there are multiple targets
+        is_match_targets = set(KV_CACHE_TARGETS) == set(scheme.targets)
+
+    is_match_output_activations = scheme.output_activations is not None
+    return is_match_targets and is_match_output_activations
+
+
+def parse_out_kv_cache_args(
+    quant_scheme_to_layers: List[QuantizationScheme],
+) -> Tuple[Optional[QuantizationArgs], List[QuantizationScheme]]:
+    """
+    If possible, parse out the kv cache specific QuantizationArgs
+    from the list of the QuantizationSchemes. If no kv cache
+    specific QuantizationArgs available, this function acts
+    as an identity function
+
+    :param quant_scheme_to_layers: list of QuantizationSchemes
+    :return: kv_cache_args (optional) and the (remaining or original)
+        list of the QuantizationSchemes
+    """
+    kv_cache_quant_scheme_to_layers = [
+        scheme for scheme in quant_scheme_to_layers if is_kv_cache_quant_scheme(scheme)
+    ]
+    quant_scheme_to_layers = [
+        scheme
+        for scheme in quant_scheme_to_layers
+        if not is_kv_cache_quant_scheme(scheme)
+    ]
+
+    if kv_cache_quant_scheme_to_layers:
+        kv_cache_quant_scheme_to_layers = kv_cache_quant_scheme_to_layers[0]
+        kv_cache_args = kv_cache_quant_scheme_to_layers.output_activations
+    else:
+        kv_cache_args = None
+
+    return kv_cache_args, quant_scheme_to_layers
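parse_out_kv_cache_args splits a kv cache scheme (one that targets k_proj/v_proj and quantizes output activations) out of a list of schemes and returns its QuantizationArgs separately. A hypothetical example, assuming the helper is re-exported from compressed_tensors.quantization.utils:

    from compressed_tensors.quantization.quant_args import QuantizationArgs
    from compressed_tensors.quantization.quant_scheme import QuantizationScheme
    from compressed_tensors.quantization.utils import parse_out_kv_cache_args

    kv_scheme = QuantizationScheme(
        targets=["re:.*k_proj", "re:.*v_proj"],
        output_activations=QuantizationArgs(num_bits=8, symmetric=True),
    )
    linear_scheme = QuantizationScheme(
        targets=["Linear"],
        weights=QuantizationArgs(num_bits=8, symmetric=True),
    )

    kv_cache_args, remaining = parse_out_kv_cache_args([kv_scheme, linear_scheme])
    # kv_cache_args holds the int8 output-activation args of the kv cache scheme;
    # remaining contains only the Linear scheme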
compressed_tensors/utils/helpers.py CHANGED

@@ -12,13 +12,18 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-
 from typing import Optional
 
+import torch
 from transformers import AutoConfig
 
 
-__all__ = [
+__all__ = [
+    "infer_compressor_from_model_config",
+    "fix_fsdp_module_name",
+    "tensor_follows_mask_structure",
+    "replace_module",
+]
 
 FSDP_WRAPPER_NAME = "_fsdp_wrapped_module"
 
@@ -61,3 +66,40 @@ def fix_fsdp_module_name(name: str) -> str:
     return name.replace(FSDP_WRAPPER_NAME + ".", "").replace(
         "." + FSDP_WRAPPER_NAME, ""
     )
+
+
+def tensor_follows_mask_structure(tensor, mask: str = "2:4") -> bool:
+    """
+    :param tensor: tensor to check
+    :param mask: mask structure to check for, in the format "n:m"
+    :return: True if the tensor follows the mask structure, False otherwise.
+        Note, some weights can incidentally be zero, so we check for
+        atleast n zeros in each chunk of size m
+    """
+
+    n, m = tuple(map(int, mask.split(":")))
+    # Reshape the tensor into chunks of size m
+    tensor = tensor.view(-1, m)
+
+    # Count the number of zeros in each chunk
+    zero_counts = (tensor == 0).sum(dim=1)
+
+    # Check if the number of zeros in each chunk atleast n
+    # Greater than sign is needed as some weights can incidentally
+    # be zero
+    if not torch.all(zero_counts >= n).item():
+        raise ValueError()
+
+    return True
+
+
+def replace_module(model: torch.nn.Module, name: str, new_module: torch.nn.Module):
+    if "." in name:
+        parent_name = name.rsplit(".", 1)[0]
+        child_name = name[len(parent_name) + 1 :]
+        parent = model.get_submodule(parent_name)
+    else:
+        parent_name = ""
+        parent = model
+        child_name = name
+    setattr(parent, child_name, new_module)
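For orientation, the two new helpers can be used as below. This is a sketch rather than documented usage; note that tensor_follows_mask_structure raises a ValueError instead of returning False when the structure is violated:

    import torch

    from compressed_tensors.utils.helpers import (
        replace_module,
        tensor_follows_mask_structure,
    )

    # 2:4 structure: at least two zeros in every chunk of four values
    dense = torch.tensor([[1.0, 0.0, 0.0, 2.0], [0.0, 3.0, 4.0, 0.0]])
    print(tensor_follows_mask_structure(dense, mask="2:4"))  # True

    # swap a submodule out by its (dotted) name
    model = torch.nn.Sequential(torch.nn.Linear(4, 4))
    replace_module(model, "0", torch.nn.Identity())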
compressed_tensors/utils/offload.py ADDED

@@ -0,0 +1,116 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+from torch.nn import Module
+
+
+__all__ = [
+    "is_module_offloaded",
+    "get_execution_device",
+    "get_offloaded_device",
+    "update_prefix_dict",
+    "update_parameter_data",
+]
+
+
+def is_module_offloaded(module: Module) -> bool:
+    """
+    :param module: layer to check
+    :return: True if layer is offloaded from GPU, False otherwise
+    """
+    return hasattr(module, "_hf_hook") and module._hf_hook.offload
+
+
+def get_execution_device(module: Module) -> torch.device:
+    """
+    :param module: layer to check
+    :return: device layer is loaded onto during forward pass
+    """
+    if is_module_offloaded(module):
+        return module._hf_hook.execution_device
+    device = next(module.parameters()).device
+
+    # offload only gets set for leaf modules, fallback to checking for device type
+    if device.type == "meta":
+        return module._hf_hook.execution_device
+
+    return device
+
+
+def get_offloaded_device(module: Module) -> torch.device:
+    """
+    :param module: layer to check
+    :return: device layer is offloaded to onto after forward pass
+    """
+    if is_module_offloaded(module):
+        first_key = list(module._hf_hook.weights_map.keys())[0]
+        prefix_dataset = module._hf_hook.weights_map.dataset
+        return prefix_dataset[first_key].device
+    return next(module.parameters()).device
+
+
+def update_prefix_dict(module: Module, key: str, data: torch.Tensor):
+    """
+    Updates the offloaded state dict for a given module. Parameter named key is replaced
+    by data. This is neccesary because parameter updates for offloaded modules do not
+    persist automatically between loads. This function only affects the offloaded
+    state dict and not the current state of the loaded module.
+
+    :param module: layer containing the parameter to update
+    :param key: name of parameter to update
+    :param data: tensor to update parameter with in the offloaded state dict
+    """
+    if not is_module_offloaded(module):
+        raise ValueError("Prefix dict is only applicable to offloaded modules")
+    prefix_dict = module._hf_hook.weights_map
+    prefix_dict.dataset[f"{prefix_dict.prefix}{key}"] = data
+
+
+def update_parameter_data(
+    module: Module, new_param_data: torch.Tensor, param_name: str
+):
+    """
+    Updates the paramter value named param_name for a given module. This function
+    updates both the current loaded module state and the offloaded state dict if
+    the module is offloaded. This is neccesary because parameter updates for offloaded
+    modules do not persist automatically between loads.
+
+    :param module: layer containing the parameter to update
+    :param new_param_data: tensor to update parameter with
+    :param param_name: name of layer parameter to update
+    """
+    if not hasattr(module, param_name):
+        return
+
+    device = next(module.parameters()).device
+
+    offloaded = False
+    if is_module_offloaded(module):
+        offload_device = get_offloaded_device(module)
+        offloaded = True
+
+    parameter = getattr(module, param_name, None)
+    if parameter is None:
+        raise ValueError("Attempted to update uninitialized parameter")
+
+    dtype = parameter.dtype
+    parameter.data = new_param_data.to(device).to(dtype)
+
+    if offloaded:
+        prefix_dict = module._hf_hook.weights_map.dataset
+        prefix = module._hf_hook.weights_map.prefix
+        prefix_dict[f"{prefix}{param_name}"] = new_param_data.to(offload_device).to(
+            dtype
+        )
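The offload utilities wrap the _hf_hook attributes that accelerate attaches to offloaded modules. A small sketch on a plain (non-offloaded) module, which exercises the trivial path; the hook-based branches only apply when accelerate offloading is active:

    import torch
    from torch.nn import Linear

    from compressed_tensors.utils.offload import (
        get_execution_device,
        is_module_offloaded,
        update_parameter_data,
    )

    layer = Linear(16, 16)
    print(is_module_offloaded(layer))   # False: no accelerate hook attached
    print(get_execution_device(layer))  # cpu

    # overwrite the weight in place, keeping the module's device and dtype
    update_parameter_data(layer, torch.zeros(16, 16), "weight")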
compressed_tensors/utils/permute.py ADDED

@@ -0,0 +1,70 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Set, Tuple
+
+import torch
+
+
+__all__ = ["safe_permute"]
+
+
+# these datatypes are missing implementations required for standard permutation
+_EXPERIMENTAL_DTYPES: Set[Tuple[torch.dtype, torch.device]] = set()
+
+
+def safe_permute(value: torch.Tensor, perm: torch.Tensor, dim: int = 0) -> torch.Tensor:
+    """
+    Perform out-of-place permutation without using torch.Tensor.index_put_,
+    whose implementation is missing for datatypes such as `torch.float8_e4m3fn`
+
+    :param value: tensor to permute
+    :param perm: permutation map
+    :param dim: dimension along which to apply permutation
+    :return: permuted value
+    """
+    dtype_tuple = (value.dtype, value.device)
+
+    if dtype_tuple in _EXPERIMENTAL_DTYPES:
+        return _fallback_permute(value, perm, dim)
+
+    try:
+        return value[tuple([slice(None)] * dim + [perm])]
+    except RuntimeError:
+        # Mark dtype as experimental if advanced indexing fails
+        _EXPERIMENTAL_DTYPES.add(dtype_tuple)
+        return _fallback_permute(value, perm, dim)
+
+
+def _fallback_permute(
+    value: torch.Tensor, perm: torch.Tensor, dim: int
+) -> torch.Tensor:
+    """
+    Fallback permutation method for experimental dtypes.
+
+    :param value: tensor to permute
+    :param perm: permutation map
+    :param dim: dimension along which to apply permutation
+    :return: permuted value
+    """
+    value_ret = value.clone()  # cannot use zeros_like b/c of missing impl.
+    orig_slices = [slice(None)] * (dim + 1)
+    perm_slices = [slice(None)] * (dim + 1)
+
+    for index, perm_index in enumerate(perm):
+        orig_slices[dim] = index
+        perm_slices[dim] = perm_index
+        value_ret[tuple(orig_slices)] = value[tuple(perm_slices)]
+
+    return value_ret
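safe_permute behaves like ordinary advanced indexing for common dtypes and only switches to the element-wise fallback when indexing raises for a given dtype/device pair. An illustrative call:

    import torch

    from compressed_tensors.utils.permute import safe_permute

    perm = torch.tensor([2, 0, 1])
    value = torch.arange(12, dtype=torch.float32).reshape(3, 4)

    # reorders rows; for dtypes without advanced-indexing support
    # (e.g. torch.float8_e4m3fn) the slower copy-based fallback is used
    print(safe_permute(value, perm, dim=0))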
compressed_tensors/version.py CHANGED