compressed-tensors 0.10.2a20250611__tar.gz → 0.10.2a20250613__tar.gz

This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the package contents as they appear in the public registry.
Files changed (145)
  1. {compressed_tensors-0.10.2a20250611/src/compressed_tensors.egg-info → compressed_tensors-0.10.2a20250613}/PKG-INFO +1 -1
  2. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/setup.py +1 -0
  3. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/src/compressed_tensors/transform/factory/hadamard.py +1 -1
  4. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/src/compressed_tensors/transform/factory/random_hadamard.py +1 -1
  5. compressed_tensors-0.10.2a20250613/src/compressed_tensors/transform/utils/hadamard.py +160 -0
  6. compressed_tensors-0.10.2a20250613/src/compressed_tensors/transform/utils/hadamards.safetensors +0 -0
  7. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/src/compressed_tensors/utils/offload.py +85 -50
  8. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/src/compressed_tensors/version.py +1 -1
  9. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613/src/compressed_tensors.egg-info}/PKG-INFO +1 -1
  10. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/src/compressed_tensors.egg-info/SOURCES.txt +1 -0
  11. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/tests/test_transform/factory/test_correctness.py +2 -2
  12. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/tests/test_transform/factory/test_memory.py +2 -2
  13. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/tests/test_transform/utils/test_hadamard.py +38 -32
  14. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/tests/test_utils/test_offload.py +66 -12
  15. compressed_tensors-0.10.2a20250611/src/compressed_tensors/transform/utils/hadamard.py +0 -161
  16. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/.github/.gitkeep +0 -0
  17. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/.github/actions/test/action.yml +0 -0
  18. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/.github/scripts/step-status +0 -0
  19. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/.github/workflows/build-test.yml +0 -0
  20. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/.github/workflows/build.yml +0 -0
  21. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/.github/workflows/report.yml +0 -0
  22. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/.github/workflows/test-check.yaml +0 -0
  23. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/.github/workflows/test.yml +0 -0
  24. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/.github/workflows/trigger-all.yml +0 -0
  25. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/.github/workflows/upload.yml +0 -0
  26. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/.gitignore +0 -0
  27. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/LICENSE +0 -0
  28. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/Makefile +0 -0
  29. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/README.md +0 -0
  30. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/examples/bit_packing/ex_quantize_and_pack.py +0 -0
  31. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/examples/bit_packing/int4_config.json +0 -0
  32. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/examples/bitmask_compression.ipynb +0 -0
  33. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/examples/llama_1.1b/ex_config_quantization.py +0 -0
  34. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/examples/llama_1.1b/ex_llmcompressor_quantization.py +0 -0
  35. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/examples/llama_1.1b/example_quant_config.json +0 -0
  36. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/examples/llama_1.1b/example_quant_recipe.yaml +0 -0
  37. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/examples/quantize_and_pack_int4.ipynb +0 -0
  38. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/pyproject.toml +0 -0
  39. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/setup.cfg +0 -0
  40. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/src/__init__.py +0 -0
  41. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/src/compressed_tensors/README.md +0 -0
  42. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/src/compressed_tensors/__init__.py +0 -0
  43. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/src/compressed_tensors/base.py +0 -0
  44. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/src/compressed_tensors/compressors/__init__.py +0 -0
  45. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/src/compressed_tensors/compressors/base.py +0 -0
  46. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/src/compressed_tensors/compressors/helpers.py +0 -0
  47. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/src/compressed_tensors/compressors/model_compressors/__init__.py +0 -0
  48. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/src/compressed_tensors/compressors/model_compressors/model_compressor.py +0 -0
  49. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/src/compressed_tensors/compressors/quantized_compressors/__init__.py +0 -0
  50. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/src/compressed_tensors/compressors/quantized_compressors/base.py +0 -0
  51. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/src/compressed_tensors/compressors/quantized_compressors/naive_quantized.py +0 -0
  52. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/src/compressed_tensors/compressors/quantized_compressors/nvfp4_quantized.py +0 -0
  53. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/src/compressed_tensors/compressors/quantized_compressors/pack_quantized.py +0 -0
  54. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/src/compressed_tensors/compressors/sparse_compressors/__init__.py +0 -0
  55. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/src/compressed_tensors/compressors/sparse_compressors/base.py +0 -0
  56. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/src/compressed_tensors/compressors/sparse_compressors/dense.py +0 -0
  57. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/src/compressed_tensors/compressors/sparse_compressors/sparse_24_bitmask.py +0 -0
  58. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/src/compressed_tensors/compressors/sparse_compressors/sparse_bitmask.py +0 -0
  59. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/src/compressed_tensors/compressors/sparse_quantized_compressors/__init__.py +0 -0
  60. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/src/compressed_tensors/compressors/sparse_quantized_compressors/marlin_24.py +0 -0
  61. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/src/compressed_tensors/config/__init__.py +0 -0
  62. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/src/compressed_tensors/config/base.py +0 -0
  63. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/src/compressed_tensors/config/dense.py +0 -0
  64. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/src/compressed_tensors/config/sparse_24_bitmask.py +0 -0
  65. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/src/compressed_tensors/config/sparse_bitmask.py +0 -0
  66. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/src/compressed_tensors/linear/__init__.py +0 -0
  67. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/src/compressed_tensors/linear/compressed_linear.py +0 -0
  68. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/src/compressed_tensors/quantization/__init__.py +0 -0
  69. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/src/compressed_tensors/quantization/lifecycle/__init__.py +0 -0
  70. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/src/compressed_tensors/quantization/lifecycle/apply.py +0 -0
  71. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/src/compressed_tensors/quantization/lifecycle/compressed.py +0 -0
  72. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/src/compressed_tensors/quantization/lifecycle/forward.py +0 -0
  73. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/src/compressed_tensors/quantization/lifecycle/helpers.py +0 -0
  74. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/src/compressed_tensors/quantization/lifecycle/initialize.py +0 -0
  75. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/src/compressed_tensors/quantization/quant_args.py +0 -0
  76. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/src/compressed_tensors/quantization/quant_config.py +0 -0
  77. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/src/compressed_tensors/quantization/quant_scheme.py +0 -0
  78. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/src/compressed_tensors/quantization/utils/__init__.py +0 -0
  79. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/src/compressed_tensors/quantization/utils/helpers.py +0 -0
  80. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/src/compressed_tensors/registry/__init__.py +0 -0
  81. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/src/compressed_tensors/registry/registry.py +0 -0
  82. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/src/compressed_tensors/transform/__init__.py +0 -0
  83. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/src/compressed_tensors/transform/factory/__init__.py +0 -0
  84. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/src/compressed_tensors/transform/factory/base.py +0 -0
  85. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/src/compressed_tensors/transform/factory/matrix_multiply.py +0 -0
  86. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/src/compressed_tensors/transform/transform_args.py +0 -0
  87. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/src/compressed_tensors/transform/transform_config.py +0 -0
  88. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/src/compressed_tensors/transform/transform_scheme.py +0 -0
  89. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/src/compressed_tensors/transform/utils/__init__.py +0 -0
  90. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/src/compressed_tensors/transform/utils/utils.py +0 -0
  91. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/src/compressed_tensors/utils/__init__.py +0 -0
  92. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/src/compressed_tensors/utils/helpers.py +0 -0
  93. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/src/compressed_tensors/utils/permutations_24.py +0 -0
  94. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/src/compressed_tensors/utils/permute.py +0 -0
  95. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/src/compressed_tensors/utils/safetensors_load.py +0 -0
  96. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/src/compressed_tensors/utils/semi_structured_conversions.py +0 -0
  97. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/src/compressed_tensors.egg-info/dependency_links.txt +0 -0
  98. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/src/compressed_tensors.egg-info/requires.txt +0 -0
  99. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/src/compressed_tensors.egg-info/top_level.txt +0 -0
  100. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/tests/__init__.py +0 -0
  101. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/tests/conftest.py +0 -0
  102. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/tests/test_compressors/__init__.py +0 -0
  103. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/tests/test_compressors/model_compressors/__init__.py +0 -0
  104. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/tests/test_compressors/model_compressors/test_model_compressor.py +0 -0
  105. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/tests/test_compressors/quantized_compressors/__init__.py +0 -0
  106. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/tests/test_compressors/quantized_compressors/test_fp8_quant.py +0 -0
  107. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/tests/test_compressors/quantized_compressors/test_int_quant.py +0 -0
  108. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/tests/test_compressors/quantized_compressors/test_nvfp4_quant.py +0 -0
  109. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/tests/test_compressors/quantized_compressors/test_pack_quant.py +0 -0
  110. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/tests/test_compressors/sparse_compressors/__init__.py +0 -0
  111. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/tests/test_compressors/sparse_compressors/test_bitmask.py +0 -0
  112. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/tests/test_compressors/sparse_compressors/test_sparse_24_bitmask.py +0 -0
  113. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/tests/test_compressors/sparse_quantized_compressors/__init__.py +0 -0
  114. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/tests/test_compressors/sparse_quantized_compressors/test_marlin_24.py +0 -0
  115. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/tests/test_configs/__init__.py +0 -0
  116. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/tests/test_configs/test_base.py +0 -0
  117. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/tests/test_examples/test_bitmask_compression_ipynb.py +0 -0
  118. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/tests/test_linear/__init__.py +0 -0
  119. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/tests/test_linear/test_compressed_linear.py +0 -0
  120. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/tests/test_quantization/__init__.py +0 -0
  121. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/tests/test_quantization/lifecycle/__init__.py +0 -0
  122. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/tests/test_quantization/lifecycle/conftest.py +0 -0
  123. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/tests/test_quantization/lifecycle/test_apply.py +0 -0
  124. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/tests/test_quantization/lifecycle/test_dynamic_lifecycle.py +0 -0
  125. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/tests/test_quantization/lifecycle/test_enabled.py +0 -0
  126. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/tests/test_quantization/lifecycle/test_forward.py +0 -0
  127. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/tests/test_quantization/lifecycle/test_helpers.py +0 -0
  128. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/tests/test_quantization/lifecycle/test_initialize.py +0 -0
  129. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/tests/test_quantization/lifecycle/test_lifecycle.py +0 -0
  130. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/tests/test_quantization/test_configs/__init__.py +0 -0
  131. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/tests/test_quantization/test_configs/test_bit_depths.py +0 -0
  132. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/tests/test_quantization/test_configs/test_strategies.py +0 -0
  133. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/tests/test_quantization/test_quant_args.py +0 -0
  134. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/tests/test_quantization/test_quant_config.py +0 -0
  135. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/tests/test_quantization/test_quant_scheme.py +0 -0
  136. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/tests/test_quantization/test_utils/test_helpers.py +0 -0
  137. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/tests/test_registry.py +0 -0
  138. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/tests/test_transform/test_transform_args.py +0 -0
  139. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/tests/test_transform/test_transform_config.py +0 -0
  140. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/tests/test_transform/test_transform_scheme.py +0 -0
  141. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/tests/test_utils/__init__.py +0 -0
  142. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/tests/test_utils/test_helpers.py +0 -0
  143. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/tests/test_utils/test_safetensors_load.py +0 -0
  144. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/tests/testing_utils.py +0 -0
  145. {compressed_tensors-0.10.2a20250611 → compressed_tensors-0.10.2a20250613}/utils/copyright.py +0 -0
PKG-INFO:
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: compressed-tensors
- Version: 0.10.2a20250611
+ Version: 0.10.2a20250613
  Summary: Library for utilization of compressed safetensors of neural network models
  Home-page: https://github.com/neuralmagic/compressed-tensors
  Author: Neuralmagic, Inc.
setup.py:
@@ -113,5 +113,6 @@ setup(
      extras_require=_setup_extras(),
      install_requires=_setup_install_requires(),
      package_dir={"": "src"},
+     package_data={"": ["transform/utils/hadamards.safetensors"]},
      packages=_setup_packages(),
  )
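The new `package_data` entry ships `hadamards.safetensors` inside the wheel so the known Hadamard matrices are available at runtime. A minimal sanity check, assuming the package is installed from this build:

from pathlib import Path

import compressed_tensors.transform.utils.hadamard as hadamard_utils

# the bundled data file should resolve next to the installed module
data_file = Path(hadamard_utils.__file__).parent / "hadamards.safetensors"
assert data_file.exists(), "hadamards.safetensors was not packaged"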
src/compressed_tensors/transform/factory/hadamard.py:
@@ -59,7 +59,7 @@ class HadamardFactory(TransformFactory):
          return HadamardTransform(weight, args)

      def _create_weight(self, size: int, dtype: dtype, device: device) -> Parameter:
-         data = deterministic_hadamard_matrix(size)
+         data = deterministic_hadamard_matrix(size, dtype, device)
          data = data.to(dtype=dtype, device=device)
          return Parameter(data, requires_grad=self.scheme.requires_grad)

src/compressed_tensors/transform/factory/random_hadamard.py:
@@ -29,6 +29,6 @@ class RandomHadamardFactory(HadamardFactory):
      """

      def _create_weight(self, size: int, dtype: dtype, device: device) -> Parameter:
-         data = random_hadamard_matrix(size, self.generator)
+         data = random_hadamard_matrix(size, dtype, device, self.generator)
          data = data.to(dtype=dtype, device=device)
          return Parameter(data, requires_grad=self.scheme.requires_grad)
compressed_tensors-0.10.2a20250613/src/compressed_tensors/transform/utils/hadamard.py (new file):
@@ -0,0 +1,160 @@
+ # Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #    http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing,
+ # software distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import math
+ from pathlib import Path
+ from typing import Optional
+
+ import torch
+ from safetensors import safe_open
+
+
+ REPO_PATH = Path(__file__).parent / "hadamards.safetensors"
+
+
+ __all__ = ["random_hadamard_matrix", "deterministic_hadamard_matrix", "is_pow2"]
+
+
+ # note that hadamard matrix multiplication can be accelerated using a library such as
+ # https://github.com/Dao-AILab/fast-hadamard-transform/tree/master
+
+
+ def deterministic_hadamard_matrix(
+     size: int,
+     dtype: torch.dtype = torch.bfloat16,
+     device: torch.device = torch.device("cpu"),
+ ) -> torch.Tensor:
+     """
+     Construct an n-by-n Hadamard matrix, using Sylvester's construction.
+     `n` must be a power of 2.
+
+     Adapted from https://github.com/scipy/scipy/blob/v1.15.2/scipy/linalg/_special_matrices.py  # noqa: E501
+
+     :param size: order of the matrix, must be a power of 2
+     :param dtype: data type of matrix
+     :param device: device to construct matrix on
+     :return: hadamard matrix of size `size`
+     """
+     if size <= 0:
+         raise ValueError("Cannot construct deterministic hadamard of size <= 0")
+
+     log2 = int(math.log2(size))
+     if size != 2**log2:
+         raise ValueError("Cannot construct deterministic hadamard of size != 2^n")
+
+     H = torch.tensor([[1]], dtype=dtype, device=device)
+
+     # Sylvester's construction
+     for _ in range(log2):
+         H = torch.vstack((torch.hstack((H, H)), torch.hstack((H, -H))))
+
+     return H / math.sqrt(size)
+
+
+ def random_hadamard_matrix(
+     size: int,
+     dtype: torch.dtype = torch.bfloat16,
+     device: torch.device = torch.device("cpu"),
+     gen: Optional[torch.Generator] = None,
+ ) -> torch.Tensor:
+     """
+     Produces a randomly generated Hadamard matrix. Differs from
+     `deterministic_hadamard_matrix` in that this function supports non powers of 2
+     and randomization using a seeded generator
+
+     Adapted from https://github.com/facebookresearch/SpinQuant/blob/main/utils/hadamard_utils.py  # noqa: E501
+     Known matrices were retrieved from N. J. A. Sloane's Library of Hadamard Matrices http://www.neilsloane.com/hadamard/  # noqa: E501
+
+     :param size: the dimension of the hadamard matrix
+     :param dtype: data type of matrix
+     :param device: device to construct matrix on
+     :param gen: optional generator for random values
+     :return: randomly generated hadamard matrix
+     """
+     Q = torch.randint(low=0, high=2, size=(size,), generator=gen, dtype=dtype)  # cpu
+     Q = Q.to(device=device)
+     Q = Q * 2 - 1
+     Q = torch.diag(Q)
+     return _matmul_hadU(Q) / math.sqrt(size)
+
+
+ def is_pow2(n: int) -> bool:
+     """
+     Check if a number is a power of 2
+
+     :param n: number to check
+     :return: True iff `n` is a power of 2
+     """
+     return n > 0 and (n & (n - 1) == 0)
+
+
+ def _fetch_hadamard_divisor(
+     n: int,
+     dtype: torch.dtype,
+     device: torch.device = torch.device("cpu"),
+     file_path: str = REPO_PATH,
+ ) -> Optional[torch.Tensor]:
+     """
+     Fetch a known hadamard matrix from the given file path. The returned matrix will
+     be of size `k` such that `n / k` is a power of two. Return None if no such
+     matrix exists.
+
+     Note: This function reopens the safetensors file every time it is called.
+     This is technically inefficient, but a very small runtime cost and simpler
+     than forcing callers to manage the file open context
+
+     :param n: size of known hadamard matrix
+     :return: a known hadamard matrix of size `n` if one exists, else None
+     """
+     with safe_open(file_path, framework="pt", device=str(device)) as file:
+         divisors = sorted((int(key) for key in file.keys()), reverse=True)
+         for divisor in divisors:
+             if n % divisor == 0 and is_pow2(n // divisor):
+                 return file.get_tensor(str(divisor)).to(dtype=dtype)
+
+     return None
+
+
+ def _matmul_hadU(X: torch.Tensor) -> torch.Tensor:
+     size = X.size(0)
+     dtype = X.dtype
+     device = X.device
+
+     # Check if we have the determined hadamard matrix
+     hadK = _fetch_hadamard_divisor(size, dtype, device=device)
+     if hadK is None:
+         raise ValueError(f"Cannot construct random hadamard matrix of size {size}")
+     K = hadK.size(0)
+
+     # Reshape diag matrix with randomized -1/+1
+     input = X.clone().view(-1, size, 1)
+     output = input.clone()
+     while input.shape[1] > K:
+         input = input.view(input.shape[0], input.shape[1] // 2, 2, input.shape[2])
+         output = output.view(input.shape)
+         output[:, :, 0, :] = input[:, :, 0, :] + input[:, :, 1, :]
+         output[:, :, 1, :] = input[:, :, 0, :] - input[:, :, 1, :]
+         output = output.view(input.shape[0], input.shape[1], -1)
+         (input, output) = (output, input)
+     assert input.shape[1] == K
+     del output
+
+     # Do not explicitly repeat - OOM
+     # input = torch.bmm(
+     #     hadK.repeat(len(input), 1, 1).to(input.device).to(input.dtype), input)
+     # Use bcast instead
+     input = hadK.view(1, K, K).to(input) @ input
+
+     # normalize
+     return input.view(X.shape)
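A brief usage sketch of the new signatures (sizes are borrowed from the updated tests below; float32 on CPU is chosen here for illustration and is not the library's bfloat16 default):

import torch
from compressed_tensors.transform.utils.hadamard import (
    deterministic_hadamard_matrix,
    random_hadamard_matrix,
)

# power-of-two size via Sylvester's construction, built directly at dtype/device
h = deterministic_hadamard_matrix(4096, dtype=torch.float32, device=torch.device("cpu"))
assert torch.allclose(h @ h.T, torch.eye(4096), atol=1e-3)

# non-power-of-two size, factored through a known divisor in hadamards.safetensors
# (3584 = 28 * 128); a seeded generator makes the result reproducible
gen = torch.Generator().manual_seed(42)
r = random_hadamard_matrix(3584, dtype=torch.float32, device=torch.device("cpu"), gen=gen)
assert torch.allclose(r @ r.T, torch.eye(3584), atol=1e-3)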
src/compressed_tensors/utils/offload.py:
@@ -14,27 +14,30 @@
  """
  Utilities associated with offloading functionality provided by `accelerate`.

- | ----------------------------------------------------------------------------------------------------- | # noqa: E501
- | Operation | Without offloading support | With offloading support | # noqa: E501
- | --------- | -------------------------------------- | ------------------------------------------------ | # noqa: E501
- | Add | module.register_parameter(name, param) | register_offload_parameter(module, name, param) | # noqa: E501
- | Check | N/A | has_offloaded_params(module) | # noqa: E501
- | Onload | N/A | with align_module_device(module) | # noqa: E501
- | Update | module.name.data.copy_(new_data) | update_offload_parameter(module, name, new_data) | # noqa: E501
- | Delete | del module.name | delete_offload_parameter(module, name) | # noqa: E501
- | ----------------------------------------------------------------------------------------------------- | # noqa: E501
+ | ------------------------------------------------------------------------------------------------------ | # noqa: E501
+ | Operation | Without offloading support | With offloading support | # noqa: E501
+ | ---------- | -------------------------------------- | ------------------------------------------------ | # noqa: E501
+ | Add | module.register_parameter(name, param) | register_offload_parameter(module, name, param) | # noqa: E501
+ | Check | N/A | has_offloaded_params(module) | # noqa: E501
+ | Onload | N/A | with align_module_device(module) | # noqa: E501
+ | Update | module.name.data.copy_(new_data) | update_offload_parameter(module, name, new_data) | # noqa: E501
+ | Delete | del module.name | delete_offload_parameter(module, name) | # noqa: E501
+ | Add Module | module.register_module(name, child) | register_offload_module(name, child) | # noqa: E501
+ | Del Module | del module.name | delete_offload_module(module, name) | # noqa: E501
+ | ------------------------------------------------------------------------------------------------------ | # noqa: E501
  """

  import contextlib
  import warnings
  from functools import wraps
- from typing import Any, Callable, Dict, Iterable, List, Literal, Optional, Union
+ from operator import attrgetter
+ from typing import Any, Callable, Dict, Iterable, Literal, Optional, Tuple, Union

  import torch
+ from compressed_tensors.utils import patch_attr


  try:
-     from accelerate import dispatch_model
      from accelerate.hooks import (
          AlignDevicesHook,
          add_hook_to_module,
@@ -45,10 +48,12 @@ try:
      from accelerate.utils import (
          OffloadedWeightsLoader,
          PrefixedDataset,
+         find_tied_parameters,
          set_module_tensor_to_device,
      )

      _has_accelerate = True
+
  except ImportError:
      _has_accelerate = False
      AlignDevicesHook = None
@@ -58,8 +63,8 @@ except ImportError:
      PrefixedDataset = None
      set_module_tensor_to_device = None
      named_module_tensors = None
-     dispatch_model = None
      attach_align_device_hook = None
+     find_tied_parameters = None


  __all__ = [
@@ -78,14 +83,14 @@ __all__ = [
      "align_module_device",
      "register_offload_module",
      "delete_offload_module",
-     "force_cpu_offload",
+     "offloaded_dispatch",
+     "disable_offloading",
  ]


  def check_accelerate(fallback: Any):
      def decorator(func: Callable[[Any], Any]):
          if not _has_accelerate:
-
              if fallback == "error":

                  @wraps(func)
@@ -211,7 +216,7 @@ def register_offload_parameter(
  def update_offload_parameter(
      module: torch.nn.Module,
      name: str,
-     data: Optional[torch.Tensor],
+     data: torch.Tensor,
      offload_device: Optional[Union[torch.device, Literal["disk"]]] = None,
  ):
      """
@@ -224,7 +229,7 @@ def update_offload_parameter(
      :param offload_device: device on which weight will be offloaded to. If None is
          provided, then infer device from parameters on module
      """
-     param = getattr(module, name)
+     param: torch.nn.Parameter = getattr(module, name)
      if param.data.shape != data.shape:
          warnings.warn(
              f"Shape of parameter being updated {param.data.shape} does not match shape "
@@ -232,7 +237,7 @@ def update_offload_parameter(
          )

      # copy data into onloaded parameter if applicable
-     if param.device != torch.device("meta"):
+     if param.device != torch.device("meta") and data is not param.data:
          param.data.copy_(data)

      # update offload dict
@@ -479,46 +484,76 @@ def delete_offload_module(base: torch.nn.Module, name: str):


  @check_accelerate(fallback="error")
- def force_cpu_offload(
-     module: torch.nn.Module, execution_device: torch.device
+ def offloaded_dispatch(
+     module: torch.nn.Module,
+     execution_device: torch.device,
+     offload_device: Union[torch.device, Literal["disk"]] = torch.device("cpu"),
  ) -> torch.nn.Module:
      """
-     Force cpu offloading a module, primarily used for testing
+     Unlike `dispatch_model`, this function forces a module (and its submodules) to
+     offload all parameters and replace them with meta tensors, utilizing the
+     `AlignDevicesHook` to control onloading and offloading.

      :param module: module containing parameters to offload
-     :param execution_device: execution device submodules
-     :return: module with hooks to perform cpu offloading
-     """
-     # edge case: there is a bug in `dispatch_model` which causes
-     # the function to only work if the model contains submodules
-     if next(module.children(), None) is None:
-         attach_align_device_hook(
-             module,
-             execution_device=execution_device,
-             offload=True,
-             weights_map=module.state_dict(),
-             tied_params_map={},
-         )
-         return module
-
-     device_map = {}
-
-     def collect_device_map(name: List[str], module: torch.nn.Module):
-         if next(module.parameters(recurse=False), None) is not None:
-             device_map[".".join(name)] = "cpu"
-             return
+     :param execution_device: device that modules will onload and execute on
+     :param offload_device: device that module parameters will offload to
+     :return: module with offloading device hooks
+     """
+     if offload_device == "disk":
+         raise NotImplementedError("Disk offloading is not currently supported")
+
+     # create weights map
+     state_dict = module.state_dict()
+     state_dict = {key: val.to(offload_device) for key, val in state_dict.items()}
+     weights_map = OffloadedWeightsLoader(state_dict=state_dict, device=offload_device)
+
+     # create tied params map
+     tied_params = find_tied_parameters(module)
+     tied_params_map = {}
+     for group in tied_params:
+         for param_name in group:
+             data_ptr = attrgetter(param_name)(module).data_ptr()
+             tied_params_map[data_ptr] = {}
+
+     # recursively attaches hooks to all submodules
+     attach_align_device_hook(
+         module,
+         execution_device=execution_device,
+         offload=True,
+         weights_map=weights_map,
+         tied_params_map=tied_params_map,
+     )
+     return module

-     else:
-         for submodule_name, submodule in module.named_children():
-             name.append(submodule_name)
-             collect_device_map(name, submodule)
-             name.pop()

-     collect_device_map([], module)
+ @contextlib.contextmanager
+ def disable_offloading():
+     """
+     Keep modules onloaded and disable offloading until this context exits.
+     Affects modules which have been hooked with accelerate's `AlignDevicesHook`
+     """
+     original_pre_forward = AlignDevicesHook.pre_forward
+     onloaded_modules: Dict[torch.nn.Module, Tuple[AlignDevicesHook, bool]] = dict()
+
+     # onload once and disable any future onloading/offloading steps
+     def keep_onload_pre_forward(self: AlignDevicesHook, module, *args, **kwargs):
+         ret = original_pre_forward(self, module, *args, **kwargs)
+         if module not in onloaded_modules:
+             onloaded_modules[module] = (self, self.offload)
+             self.offload = False
+         return ret
+
+     # use the patched pre_forward function within the context
+     with patch_attr(AlignDevicesHook, "pre_forward", keep_onload_pre_forward):
+         yield

-     return dispatch_model(
-         module, device_map, main_device=execution_device, force_hooks=True
-     )
+     # manually offload all modules that were onloaded
+     # update any parameters which may have changed
+     for module, (hook, offload) in onloaded_modules.items():
+         hook.offload = offload
+         for name, param in module.named_parameters():
+             update_offload_parameter(module, name, param.data)
+         hook.post_forward(module, None)


  """ Upstreamed Functions """
src/compressed_tensors/version.py:
@@ -17,5 +17,5 @@ __version__: str
  __version_tuple__: VERSION_TUPLE
  version_tuple: VERSION_TUPLE

- __version__ = version = '0.10.2.a20250611'
+ __version__ = version = '0.10.2.a20250613'
  __version_tuple__ = version_tuple = (0, 10, 2)
src/compressed_tensors.egg-info/PKG-INFO:
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: compressed-tensors
- Version: 0.10.2a20250611
+ Version: 0.10.2a20250613
  Summary: Library for utilization of compressed safetensors of neural network models
  Home-page: https://github.com/neuralmagic/compressed-tensors
  Author: Neuralmagic, Inc.
src/compressed_tensors.egg-info/SOURCES.txt:
@@ -82,6 +82,7 @@ src/compressed_tensors/transform/factory/matrix_multiply.py
  src/compressed_tensors/transform/factory/random_hadamard.py
  src/compressed_tensors/transform/utils/__init__.py
  src/compressed_tensors/transform/utils/hadamard.py
+ src/compressed_tensors/transform/utils/hadamards.safetensors
  src/compressed_tensors/transform/utils/utils.py
  src/compressed_tensors/utils/__init__.py
  src/compressed_tensors/utils/helpers.py
tests/test_transform/factory/test_correctness.py:
@@ -19,7 +19,7 @@ from compressed_tensors.transform import (
      TransformFactory,
      TransformScheme,
  )
- from compressed_tensors.utils import align_modules, force_cpu_offload
+ from compressed_tensors.utils import offloaded_dispatch
  from tests.testing_utils import requires_accelerate, requires_gpu


@@ -75,7 +75,7 @@ def test_correctness_model(scheme, offload=False):
      # load model
      model = TransformableModel(2, 4, 8, 16, 32, 64)
      if offload:
-         model = force_cpu_offload(model, torch.device("cuda"))
+         model = offloaded_dispatch(model, torch.device("cuda"))

      # create factory
      scheme.apply = [
tests/test_transform/factory/test_memory.py:
@@ -22,7 +22,7 @@ from compressed_tensors.transform import (
      TransformFactory,
      TransformScheme,
  )
- from compressed_tensors.utils import align_modules, force_cpu_offload
+ from compressed_tensors.utils import align_modules, offloaded_dispatch
  from tests.testing_utils import requires_accelerate, requires_gpu


@@ -58,7 +58,7 @@ def test_memory_sharing(scheme, offload=False):
      # load model (maybe with offloading)
      model = TransformableModel(2, 2, 4, 4, 8, 8)
      if offload:
-         force_cpu_offload(model, torch.device("cuda"))
+         offloaded_dispatch(model, torch.device("cuda"))

      # add transforms to model
      factory.apply_to_model(model)
tests/test_transform/utils/test_hadamard.py:
@@ -13,46 +13,48 @@
  # limitations under the License.


- import numpy
  import pytest
  import torch
  from compressed_tensors.transform.utils.hadamard import (
-     _get_had12,
-     _get_had20,
      deterministic_hadamard_matrix,
+     is_pow2,
      random_hadamard_matrix,
  )
+ from tests.testing_utils import requires_gpu


- @pytest.mark.parametrize(
-     "had_func",
-     [
-         _get_had12,
-         _get_had20,
-     ],
- )
- def test_packed_hadamard_compliant(had_func):
-     had_matrix = had_func()
-     size = had_matrix.size(0)
-     # HH.T == nI
-     product = had_matrix @ had_matrix.T
-     assert torch.equal(product, size * torch.eye(size))
+ _sizes_to_test = [
+     768,  # gpt2 small
+     1024,  # gpt2 medium
+     1280,  # qwen_2_5_vl vision
+     1600,  # gpt2 xl
+     2048,  # gpt3 small
+     3584,  # qwen_2_5_vl
+     3840,  # qwen_2_5_vl vision qkv
+     4096,  # llama3
+     7168,  # deepseek_v3
+     14336,  # llama3 intermediate
+     18432,  # deepseek_v3 intermediate
+     18944,  # qwen_2_5_vl intermediate
+ ]
+ _atol = 1e-1  # bfloat16 is low precision for large matrices


- @pytest.mark.parametrize(
-     "size",
-     [4096, 2048],
- )
+ @requires_gpu
+ @pytest.mark.parametrize("size", _sizes_to_test)
  def test_random_hadamard_matrix_compliant(size):
-     had_matrix = random_hadamard_matrix(size)
-     product = torch.round(had_matrix @ had_matrix.T)
-     assert torch.equal(product, torch.eye(size))
+     # (H / sqrt(n))(H.T / sqrt(n)) == I
+     matrix = random_hadamard_matrix(size, device="cuda")
+     product = matrix @ matrix.T
+     eye = torch.eye(size, dtype=product.dtype, device="cuda")
+     assert torch.allclose(product, eye, atol=_atol)


  def test_random_hadamard_generator():
+     # check that generation is deterministic with a seed
      generator = torch.Generator().manual_seed(42)
-     one = random_hadamard_matrix(2048, generator)
-     two = random_hadamard_matrix(2048, generator)
+     one = random_hadamard_matrix(2048, gen=generator)
+     two = random_hadamard_matrix(2048, gen=generator)

      one_true = torch.tensor(
          [
@@ -73,12 +75,16 @@ def test_random_hadamard_generator():
      assert torch.all(two[:3, :3].sign() == two_true.sign())


- @pytest.mark.parametrize(
-     "size",
-     [1024],
- )
+ @requires_gpu
+ @pytest.mark.parametrize("size", _sizes_to_test)
  def test_deterministic_hadamard_compliant(size):
-     had_matrix = deterministic_hadamard_matrix(size)
+     if not is_pow2(size):
+         with pytest.raises(ValueError):
+             matrix = deterministic_hadamard_matrix(size, device="cuda")
+         return
+
      # (H / sqrt(n))(H.T / sqrt(n)) == I
-     product = had_matrix @ had_matrix.T
-     assert numpy.array_equal(product, numpy.eye(size))
+     matrix = deterministic_hadamard_matrix(size, device="cuda")
+     product = matrix @ matrix.T
+     eye = torch.eye(size, dtype=product.dtype, device="cuda")
+     assert torch.allclose(product, eye, atol=_atol)
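The GPU tests above validate at the bfloat16 default with a loose tolerance; as a sketch, the same properties can be spot-checked without a GPU at float32 and a tighter tolerance (non-power-of-two sizes taken from `_sizes_to_test`, assuming their divisors ship in hadamards.safetensors):

import pytest
import torch
from compressed_tensors.transform.utils.hadamard import (
    deterministic_hadamard_matrix,
    is_pow2,
    random_hadamard_matrix,
)

# non-power-of-two sizes resolve through the bundled known matrices
for size in (768, 1280, 3584):
    assert not is_pow2(size)
    m = random_hadamard_matrix(size, dtype=torch.float32, device=torch.device("cpu"))
    assert torch.allclose(m @ m.T, torch.eye(size), atol=1e-3)

# ...and are rejected by the Sylvester-only constructor
with pytest.raises(ValueError):
    deterministic_hadamard_matrix(768)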