compressed-tensors 0.13.1a20260115__tar.gz → 0.13.1a20260123__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (173)
  1. compressed_tensors-0.13.1a20260123/.github/mergify.yml +64 -0
  2. compressed_tensors-0.13.1a20260123/.github/workflows/stale.yml +44 -0
  3. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/Makefile +2 -2
  4. {compressed_tensors-0.13.1a20260115/src/compressed_tensors.egg-info → compressed_tensors-0.13.1a20260123}/PKG-INFO +2 -2
  5. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/setup.py +1 -1
  6. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/src/compressed_tensors/__init__.py +10 -1
  7. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/src/compressed_tensors/compressors/sparse_quantized_compressors/marlin_24.py +7 -0
  8. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/src/compressed_tensors/config/format.py +7 -0
  9. compressed_tensors-0.13.1a20260123/src/compressed_tensors/offload/__init__.py +197 -0
  10. compressed_tensors-0.13.1a20260123/src/compressed_tensors/offload/cache/__init__.py +17 -0
  11. compressed_tensors-0.13.1a20260123/src/compressed_tensors/offload/cache/base.py +231 -0
  12. compressed_tensors-0.13.1a20260123/src/compressed_tensors/offload/cache/cpu.py +43 -0
  13. compressed_tensors-0.13.1a20260123/src/compressed_tensors/offload/cache/device.py +48 -0
  14. compressed_tensors-0.13.1a20260123/src/compressed_tensors/offload/dispatch.py +228 -0
  15. compressed_tensors-0.13.1a20260123/src/compressed_tensors/offload/module.py +103 -0
  16. compressed_tensors-0.13.1a20260123/src/compressed_tensors/offload/utils.py +158 -0
  17. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/src/compressed_tensors/quantization/lifecycle/forward.py +8 -10
  18. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/src/compressed_tensors/quantization/quant_metadata.py +24 -1
  19. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/src/compressed_tensors/quantization/utils/mxfp4_utils.py +1 -1
  20. compressed_tensors-0.13.1a20260123/src/compressed_tensors/utils/binary_search.py +52 -0
  21. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/src/compressed_tensors/utils/offload.py +10 -1
  22. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/src/compressed_tensors/version.py +1 -1
  23. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123/src/compressed_tensors.egg-info}/PKG-INFO +2 -2
  24. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/src/compressed_tensors.egg-info/SOURCES.txt +15 -0
  25. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/src/compressed_tensors.egg-info/requires.txt +1 -1
  26. compressed_tensors-0.13.1a20260123/tests/test_offload/cache/test_cpu.py +138 -0
  27. compressed_tensors-0.13.1a20260123/tests/test_offload/test_dispatch.py +215 -0
  28. compressed_tensors-0.13.1a20260123/tests/test_offload/test_interface.py +174 -0
  29. compressed_tensors-0.13.1a20260123/tests/test_offload/test_module.py +213 -0
  30. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/tests/test_quantization/test_utils/test_mxfp4_utils.py +1 -1
  31. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/.github/.gitkeep +0 -0
  32. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/.github/actions/test/action.yml +0 -0
  33. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/.github/scripts/step-status +0 -0
  34. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/.github/workflows/quality-check.yaml +0 -0
  35. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/.github/workflows/test-check.yaml +0 -0
  36. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/.gitignore +0 -0
  37. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/LICENSE +0 -0
  38. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/README.md +0 -0
  39. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/examples/bit_packing/ex_quantize_and_pack.py +0 -0
  40. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/examples/bit_packing/int4_config.json +0 -0
  41. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/examples/bitmask_compression.ipynb +0 -0
  42. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/examples/llama_1.1b/ex_config_quantization.py +0 -0
  43. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/examples/llama_1.1b/ex_llmcompressor_quantization.py +0 -0
  44. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/examples/llama_1.1b/example_quant_config.json +0 -0
  45. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/examples/llama_1.1b/example_quant_recipe.yaml +0 -0
  46. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/examples/quantize_and_pack_int4.ipynb +0 -0
  47. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/pyproject.toml +0 -0
  48. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/setup.cfg +0 -0
  49. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/src/__init__.py +0 -0
  50. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/src/compressed_tensors/README.md +0 -0
  51. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/src/compressed_tensors/base.py +0 -0
  52. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/src/compressed_tensors/compressors/__init__.py +0 -0
  53. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/src/compressed_tensors/compressors/base.py +0 -0
  54. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/src/compressed_tensors/compressors/helpers.py +0 -0
  55. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/src/compressed_tensors/compressors/model_compressors/__init__.py +0 -0
  56. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/src/compressed_tensors/compressors/model_compressors/model_compressor.py +0 -0
  57. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/src/compressed_tensors/compressors/quantized_compressors/__init__.py +0 -0
  58. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/src/compressed_tensors/compressors/quantized_compressors/base.py +0 -0
  59. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/src/compressed_tensors/compressors/quantized_compressors/fp4_quantized.py +0 -0
  60. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/src/compressed_tensors/compressors/quantized_compressors/naive_quantized.py +0 -0
  61. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/src/compressed_tensors/compressors/quantized_compressors/pack_quantized.py +0 -0
  62. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/src/compressed_tensors/compressors/sparse_compressors/__init__.py +0 -0
  63. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/src/compressed_tensors/compressors/sparse_compressors/base.py +0 -0
  64. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/src/compressed_tensors/compressors/sparse_compressors/dense.py +0 -0
  65. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/src/compressed_tensors/compressors/sparse_compressors/sparse_24_bitmask.py +0 -0
  66. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/src/compressed_tensors/compressors/sparse_compressors/sparse_bitmask.py +0 -0
  67. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/src/compressed_tensors/compressors/sparse_quantized_compressors/__init__.py +0 -0
  68. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/src/compressed_tensors/config/__init__.py +0 -0
  69. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/src/compressed_tensors/config/base.py +0 -0
  70. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/src/compressed_tensors/config/dense.py +0 -0
  71. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/src/compressed_tensors/config/sparse_24_bitmask.py +0 -0
  72. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/src/compressed_tensors/config/sparse_bitmask.py +0 -0
  73. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/src/compressed_tensors/linear/__init__.py +0 -0
  74. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/src/compressed_tensors/linear/compressed_linear.py +0 -0
  75. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/src/compressed_tensors/logger.py +0 -0
  76. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/src/compressed_tensors/modeling/__init__.py +0 -0
  77. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/src/compressed_tensors/modeling/attention.py +0 -0
  78. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/src/compressed_tensors/modeling/kvcache.py +0 -0
  79. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/src/compressed_tensors/quantization/__init__.py +0 -0
  80. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/src/compressed_tensors/quantization/lifecycle/__init__.py +0 -0
  81. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/src/compressed_tensors/quantization/lifecycle/apply.py +0 -0
  82. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/src/compressed_tensors/quantization/lifecycle/compressed.py +0 -0
  83. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/src/compressed_tensors/quantization/lifecycle/helpers.py +0 -0
  84. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/src/compressed_tensors/quantization/lifecycle/initialize.py +0 -0
  85. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/src/compressed_tensors/quantization/quant_args.py +0 -0
  86. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/src/compressed_tensors/quantization/quant_config.py +0 -0
  87. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/src/compressed_tensors/quantization/quant_scheme.py +0 -0
  88. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/src/compressed_tensors/quantization/utils/__init__.py +0 -0
  89. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/src/compressed_tensors/quantization/utils/helpers.py +0 -0
  90. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/src/compressed_tensors/registry/__init__.py +0 -0
  91. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/src/compressed_tensors/registry/registry.py +0 -0
  92. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/src/compressed_tensors/transform/__init__.py +0 -0
  93. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/src/compressed_tensors/transform/apply.py +0 -0
  94. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/src/compressed_tensors/transform/factory/__init__.py +0 -0
  95. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/src/compressed_tensors/transform/factory/base.py +0 -0
  96. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/src/compressed_tensors/transform/factory/hadamard.py +0 -0
  97. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/src/compressed_tensors/transform/factory/matrix_multiply.py +0 -0
  98. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/src/compressed_tensors/transform/factory/random_hadamard.py +0 -0
  99. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/src/compressed_tensors/transform/transform_args.py +0 -0
  100. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/src/compressed_tensors/transform/transform_config.py +0 -0
  101. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/src/compressed_tensors/transform/transform_scheme.py +0 -0
  102. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/src/compressed_tensors/transform/utils/__init__.py +0 -0
  103. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/src/compressed_tensors/transform/utils/hadamard.py +0 -0
  104. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/src/compressed_tensors/transform/utils/hadamards.safetensors +0 -0
  105. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/src/compressed_tensors/transform/utils/matrix.py +0 -0
  106. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/src/compressed_tensors/utils/__init__.py +0 -0
  107. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/src/compressed_tensors/utils/helpers.py +0 -0
  108. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/src/compressed_tensors/utils/internal.py +0 -0
  109. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/src/compressed_tensors/utils/match.py +0 -0
  110. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/src/compressed_tensors/utils/permutations_24.py +0 -0
  111. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/src/compressed_tensors/utils/safetensors_load.py +0 -0
  112. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/src/compressed_tensors/utils/semi_structured_conversions.py +0 -0
  113. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/src/compressed_tensors/utils/type.py +0 -0
  114. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/src/compressed_tensors.egg-info/dependency_links.txt +0 -0
  115. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/src/compressed_tensors.egg-info/top_level.txt +0 -0
  116. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/tests/__init__.py +0 -0
  117. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/tests/conftest.py +0 -0
  118. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/tests/mock_observer.py +0 -0
  119. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/tests/test_compressors/__init__.py +0 -0
  120. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/tests/test_compressors/model_compressors/__init__.py +0 -0
  121. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/tests/test_compressors/model_compressors/test_model_compressor.py +0 -0
  122. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/tests/test_compressors/quantized_compressors/__init__.py +0 -0
  123. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/tests/test_compressors/quantized_compressors/test_fp4_quant.py +0 -0
  124. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/tests/test_compressors/quantized_compressors/test_fp8_quant.py +0 -0
  125. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/tests/test_compressors/quantized_compressors/test_int_quant.py +0 -0
  126. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/tests/test_compressors/quantized_compressors/test_pack_quant.py +0 -0
  127. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/tests/test_compressors/quantized_compressors/test_packed_asym_decompression.py +0 -0
  128. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/tests/test_compressors/sparse_compressors/__init__.py +0 -0
  129. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/tests/test_compressors/sparse_compressors/test_bitmask.py +0 -0
  130. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/tests/test_compressors/sparse_compressors/test_sparse_24_bitmask.py +0 -0
  131. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/tests/test_compressors/sparse_quantized_compressors/__init__.py +0 -0
  132. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/tests/test_compressors/sparse_quantized_compressors/test_marlin_24.py +0 -0
  133. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/tests/test_configs/__init__.py +0 -0
  134. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/tests/test_configs/test_base.py +0 -0
  135. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/tests/test_configs/test_infer_quant.py +0 -0
  136. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/tests/test_examples/test_bitmask_compression_ipynb.py +0 -0
  137. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/tests/test_linear/__init__.py +0 -0
  138. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/tests/test_linear/test_compressed_linear.py +0 -0
  139. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/tests/test_modeling/test_attention_and_cache.py +0 -0
  140. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/tests/test_quantization/__init__.py +0 -0
  141. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/tests/test_quantization/lifecycle/__init__.py +0 -0
  142. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/tests/test_quantization/lifecycle/conftest.py +0 -0
  143. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/tests/test_quantization/lifecycle/test_apply.py +0 -0
  144. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/tests/test_quantization/lifecycle/test_dynamic_lifecycle.py +0 -0
  145. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/tests/test_quantization/lifecycle/test_enabled.py +0 -0
  146. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/tests/test_quantization/lifecycle/test_forward.py +0 -0
  147. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/tests/test_quantization/lifecycle/test_initialize.py +0 -0
  148. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/tests/test_quantization/lifecycle/test_lifecycle.py +0 -0
  149. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/tests/test_quantization/lifecycle/test_static_lifecycle.py +0 -0
  150. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/tests/test_quantization/test_configs/__init__.py +0 -0
  151. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/tests/test_quantization/test_configs/test_bit_depths.py +0 -0
  152. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/tests/test_quantization/test_configs/test_strategies.py +0 -0
  153. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/tests/test_quantization/test_quant_args.py +0 -0
  154. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/tests/test_quantization/test_quant_config.py +0 -0
  155. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/tests/test_quantization/test_quant_scheme.py +0 -0
  156. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/tests/test_quantization/test_utils/test_helpers.py +0 -0
  157. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/tests/test_registry.py +0 -0
  158. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/tests/test_transform/conftest.py +0 -0
  159. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/tests/test_transform/factory/test_correctness.py +0 -0
  160. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/tests/test_transform/factory/test_memory.py +0 -0
  161. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/tests/test_transform/factory/test_serialization.py +0 -0
  162. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/tests/test_transform/test_transform_args.py +0 -0
  163. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/tests/test_transform/test_transform_config.py +0 -0
  164. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/tests/test_transform/test_transform_scheme.py +0 -0
  165. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/tests/test_transform/utils/test_hadamard.py +0 -0
  166. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/tests/test_utils/__init__.py +0 -0
  167. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/tests/test_utils/test_helpers.py +0 -0
  168. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/tests/test_utils/test_match.py +0 -0
  169. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/tests/test_utils/test_offload.py +0 -0
  170. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/tests/test_utils/test_safetensors_load.py +0 -0
  171. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/tests/test_utils/test_type.py +0 -0
  172. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/tests/testing_utils.py +0 -0
  173. {compressed_tensors-0.13.1a20260115 → compressed_tensors-0.13.1a20260123}/utils/copyright.py +0 -0
@@ -0,0 +1,64 @@
+ pull_request_rules:
+ - name: label-documentation
+   description: Automatically apply documentation label
+   conditions:
+     - label != stale
+     - -closed
+     - or:
+       - files~=^[^/]+\.md$
+       - files~=^docs/
+       - files~=^examples/
+   actions:
+     label:
+       add:
+         - documentation
+
+ - name: ping author on conflicts and add 'needs-rebase' label
+   conditions:
+     - label != stale
+     - conflict
+     - -closed
+   actions:
+     label:
+       add:
+         - needs-rebase
+     comment:
+       message: |
+         This pull request has merge conflicts that must be resolved before it can be
+         merged. Please rebase the PR, @{{author}}.
+
+         https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/syncing-a-fork
+
+ - name: remove 'needs-rebase' label when conflict is resolved
+   conditions:
+     - -conflict
+     - -closed
+   actions:
+     label:
+       remove:
+         - needs-rebase
+
+ - name: add quality-failed label
+   conditions:
+     - label != stale
+     - check-failure = quality-check
+     - -closed
+   actions:
+     label:
+       add:
+         - quality-failed
+     comment:
+       message: |
+         The quality checks have failed. Please run `make style` and `make quality` under
+         the root directory to adddress the lint failures. You will need to install the
+         dev optional install to get the required linting packages.
+
+ - name: remove quality-failed label
+   conditions:
+     - label != stale
+     - -check-failure = quality-check
+     - -closed
+   actions:
+     label:
+       remove:
+         - quality-failed
@@ -0,0 +1,44 @@
+ name: 'Close inactive PRs'
+
+ on:
+   schedule:
+     - cron: '0 17 * * *'
+
+ jobs:
+   close-pull-requests:
+     if: github.repository == 'vllm-project/compressed-tensors'
+     permissions:
+       issues: write
+       pull-requests: write
+       actions: write
+     runs-on: ubuntu-latest
+     steps:
+       - uses: actions/stale@997185467fa4f803885201cee163a9f38240193d
+         with:
+           operations-per-run: 1000
+           exempt-draft-pr: true
+           exempt-issue-labels: 'keep-open'
+           exempt-pr-labels: 'keep-open'
+
+           days-before-issue-stale: 90
+           days-before-issue-close: 30
+           stale-issue-label: 'stale'
+           stale-issue-message: >
+             This issue has been automatically marked as stale because it has not
+             had any activity within 90 days. It will be automatically closed if no
+             further activity occurs within 30 days. Leave a comment if
+             you feel this issue should remain open. Thank you!
+           close-issue-message: >
+             This issue has been automatically closed due to inactivity. Please
+             feel free to reopen if you feel it is still relevant. Thank you!
+
+           days-before-pr-stale: 90
+           days-before-pr-close: 30
+           stale-pr-label: 'stale'
+           stale-pr-message: >
+             This pull request has been automatically marked as stale because it
+             has not had any activity within 90 days. It will be automatically
+             closed if no further activity occurs within 30 days.
+           close-pr-message: >
+             This pull request has been automatically closed due to inactivity.
+             Please feel free to reopen if you intend to continue working on it.
@@ -8,7 +8,7 @@ quality:
  @echo "Running copyright checks";
  python utils/copyright.py quality $(PYCHECKGLOBS)
  @echo "Running python quality checks";
- black --check $(PYCHECKDIRS);
+ black --target-version py310 --check $(PYCHECKDIRS);
  isort --check-only $(PYCHECKDIRS);
  flake8 $(PYCHECKDIRS);

@@ -17,7 +17,7 @@ style:
  @echo "Running copyright style";
  python utils/copyright.py style $(PYCHECKGLOBS)
  @echo "Running python styling";
- black $(PYCHECKDIRS);
+ black --target-version py310 $(PYCHECKDIRS);
  isort $(PYCHECKDIRS);

  # run tests for the repo
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: compressed-tensors
- Version: 0.13.1a20260115
+ Version: 0.13.1a20260123
  Summary: Library for utilization of compressed safetensors of neural network models
  Home-page: https://github.com/vllm-project/compressed-tensors
  Author: Neuralmagic, Inc.
@@ -8,7 +8,7 @@ Author-email: support@neuralmagic.com
  License: Apache 2.0
  Description-Content-Type: text/markdown
  License-File: LICENSE
- Requires-Dist: torch>=1.7.0
+ Requires-Dist: torch<=2.9.1,>=1.7.0
  Requires-Dist: transformers
  Requires-Dist: pydantic>=2.0
  Requires-Dist: loguru
@@ -88,7 +88,7 @@ def _setup_packages() -> List:
      )

  def _setup_install_requires() -> List:
-     return ["torch>=1.7.0", "transformers", "pydantic>=2.0", "loguru"]
+     return ["torch>=1.7.0,<=2.9.1", "transformers", "pydantic>=2.0", "loguru"]

  def _setup_extras() -> Dict:
      return {
@@ -20,5 +20,14 @@ from .base import *
  from .compressors import *
  from .config import *
  from .quantization import QuantizationConfig, QuantizationStatus
- from .utils import *
+
+ # avoid resolving compressed_tensors.offload as compressed_tensors.utils.offload
+ from .utils.offload import *
+ from .utils.helpers import *
+ from .utils.internal import *
+ from .utils.match import *
+ from .utils.permutations_24 import *
+ from .utils.safetensors_load import *
+ from .utils.semi_structured_conversions import *
+ from .utils.type import *
  from .version import *
@@ -13,6 +13,7 @@
  # limitations under the License.

  import logging
+ import warnings
  from typing import Dict, Generator, Tuple

  import numpy as np
@@ -138,6 +139,12 @@ class Marlin24Compressor(BaseCompressor):
          :param show_progress: whether to show tqdm progress
          :return: compressed state dict
          """
+         warnings.warn(
+             "The marlin24 format is deprecated and will be removed in a "
+             "future release. vLLM no longer supports marlin24 models.",
+             DeprecationWarning,
+             stacklevel=2,
+         )
          self.validate_quant_compatability(names_to_scheme)

          compressed_dict = {}
@@ -12,6 +12,7 @@
  # See the License for the specific language governing permissions and
  # limitations under the License.

+ import warnings
  from typing import List, Optional

  import torch
@@ -68,6 +69,12 @@ def _get_quant_compression_format(
      ):
          # marlin24 kernel only applicable for channel/group quantization
          # Note: vLLM may only support group quant for marlin24
+         warnings.warn(
+             "The marlin24 format is deprecated and will be removed in a "
+             "future release. vLLM no longer supports marlin24 models.",
+             DeprecationWarning,
+             stacklevel=2,
+         )
          return CompressionFormat.marlin_24
      return CompressionFormat.pack_quantized

@@ -0,0 +1,197 @@
+ # Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #    http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing,
+ # software distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import contextlib
+ from typing import Iterable, Optional
+
+ import torch
+ from compressed_tensors.offload.cache import OffloadCache
+ from compressed_tensors.offload.dispatch import (  # noqa: F401
+     dispatch_model,
+     offload_model,
+     remove_dispatch,
+ )
+ from compressed_tensors.offload.module import offload_module, unwrap_offload_forward
+ from compressed_tensors.offload.utils import get_module_device, move_module_tensor
+ from compressed_tensors.utils.helpers import patch_attr
+
+
+ __all__ = [
+     # dispatch models
+     "offload_model",
+     "dispatch_model",
+     "remove_dispatch",
+     # control movement
+     "disable_onloading",
+     "disable_offloading",
+     # manipulate parameters
+     "update_offload_parameter",
+     "get_execution_device",
+     "get_offloaded_device",
+     "register_offload_module",
+     # manipulate forward
+     "unwrap_offload_forward",
+     # backwards compatibility: should be deprecated
+     "align_modules",
+     "align_module_device",
+ ]
+
+
+ @contextlib.contextmanager
+ def disable_offloading():
+     """
+     When offloading is disabled, onloaded tensors remain onloaded in memory until exit
+
+     ```
+     with OffloadCache.disable_offloading():
+         ... = cache["weight"]
+         ... = cache["weight"] # cache hit
+         ... = cache["weight"] # cache hit
+
+         # upon exit, all onloaded weights are released
+     ```
+     """
+     with OffloadCache.disable_offloading():
+         yield
+
+
+ @contextlib.contextmanager
+ def disable_onloading():
+     """
+     When onloading is disabled, tensors are not offloaded on access, and assignments do
+     not trigger offloading. This is mostly used to disable device movement for debugging
+
+     ```
+     with OffloadCache.disable_onloading():
+         tensor = ...
+         cache["weight"] = tensor # assignments do not trigger onloading
+         cache["weight"] is tensor # tensor remains offloaded
+     ```
+     """
+     with OffloadCache.disable_onloading():
+         yield
+
+
+ def update_offload_parameter(module: torch.nn.Module, name: str, data: torch.Tensor):
+     """
+     Update the data of an existing parameter and its offload dict. Supports both
+     parameters of offloaded modules and non-offloaded modules
+
+     :param module: module containing the parameter to update
+     :param name: name of module parameter to update
+     :param data: tensor to update parameter with
+     """
+     if isinstance(module._parameters, OffloadCache):
+         with module._parameters.disable_onloading():
+             value = getattr(module, name)
+             value.copy_(module._parameters.offload(data))
+             setattr(module, name, value)
+
+     else:
+         getattr(module, name).copy_(data)
+
+
+ def get_execution_device(module: torch.nn.Module) -> torch.device | str:
+     """
+     Get the device which inputs should be moved to before module execution.
+
+     :param module: module to check, may be offloaded
+     :return: onload device of module
+     """
+     if isinstance(module._parameters, OffloadCache):
+         return module._parameters.onload_device
+
+     else:
+         return get_module_device(module)
+
+
+ def get_offloaded_device(module: torch.nn.Module) -> torch.device:
+     """
+     :param module: module to check
+     :return: device module is offloaded to onto after forward pass
+     """
+     with disable_onloading():
+         return get_module_device(module)
+
+
+ def register_offload_module(base: torch.nn.Module, name: str, module: torch.nn.Module):
+     """
+     Register a submodule with offloading if the parent module is offloaded
+
+     :param base: module to attach submodule to
+     :param name: name of submodule
+     :param module: submodule to attach
+     """
+     cache = base._parameters
+     if isinstance(cache, OffloadCache):
+         offload_module(
+             module, cache.onload_device, cache.offload_device, no_split=False
+         )
+
+     base.register_module(name, module)
+
+
+ """ Implemented for backwards compatibility """
+
+
+ @contextlib.contextmanager
+ def align_modules(
+     modules: torch.nn.Module | Iterable[torch.nn.Module],
+     execution_device: Optional[torch.device] = None,
+ ):
+     """
+     Context manager for onloading modules to a device, and disabling onload and offload
+     attempts triggered by forward calls. Used for sequential onloading of layers
+
+     :param modules: `torch.nn.Module` or iterable of `torch.nn.Module`s to onload
+     :param execution_device: device to onload to
+     """
+     with contextlib.ExitStack() as stack:
+         for module in modules:
+             stack.enter_context(align_module_device(module, execution_device))
+         yield
+
+
+ @contextlib.contextmanager
+ def align_module_device(
+     module: torch.nn.Module, execution_device: Optional[torch.device] = None
+ ):
+     """
+     Context manager that moves a module's parameters to the specified execution device.
+
+     :param module: Module with parameters to align
+     :param execution_device: If provided, overrides the module's execution device
+         within the context. Otherwise, use hook execution device or pass
+     """
+
+     if isinstance(module._parameters, OffloadCache):
+         assert isinstance(module._buffers, OffloadCache)
+         with module._parameters.disable_offloading():
+             with patch_attr(
+                 module._parameters, "onload_device", execution_device
+             ), patch_attr(module._buffers, "onload_device", execution_device):
+                 yield
+
+     else:
+         original_device = {}
+         for name, param in module.named_parameters(recurse=False):
+             original_device[name] = param.device
+             move_module_tensor(module, name, execution_device)
+
+         try:
+             yield
+         finally:
+             for name, param in module.named_parameters(recurse=False):
+                 device = original_device[name]
+                 move_module_tensor(module, name, device)
@@ -0,0 +1,17 @@
+ # Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #    http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing,
+ # software distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ # flake8: noqa
+
+ from .base import OffloadCache
+ from .cpu import CPUCache
@@ -0,0 +1,231 @@
+ # Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #    http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing,
+ # software distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import contextlib
+ from abc import ABC, abstractmethod
+ from collections.abc import MutableMapping
+ from typing import ClassVar, Literal, Optional
+
+ import torch
+ import torch.distributed as dist
+
+
+ class OffloadCache(MutableMapping, ABC):
+     """
+     Base class for offload caches. Subclasses must implement `offload` and `onload`.
+     Instances have similar behavior to dicts, except that tensors are offloaded when
+     assigned and onloaded when accessed.
+
+     Typical usage:
+     ```
+     module._parameters = cache_cls.from_mapping(module._parameters, onload_device)
+     tensor = ...
+     module._parameters["name"] = tensor # tensor is offloaded
+     onloaded_tensor = module._parameters["name"] # tensor is onloaded
+     ```
+
+     This class implements two contexts for more fine-grained control of device movement:
+     `OffloadCache.disable_offloading` and `OffloadCache.disable_onloading`. For more
+     info, see `compressed_tensors.offload::(disable_offloading|disable_onloading)`
+     """
+
+     onload_device: torch.device | str
+     offload_device: Optional[torch.device | str]
+
+     # global flags for disabling
+     offloading_disabled: ClassVar[bool] = False
+     onloading_disabled: ClassVar[bool] = False
+
+     # names -> offloaded tensors (populated from _parameters or _buffers)
+     offloaded_values: dict[str, torch.Tensor]
+
+     # offloaded tensors -> onloaded tensors (only when offloading is disabled)
+     keep_onloaded_values: ClassVar[dict[torch.Tensor, torch.Tensor]] = dict()
+
+     @classmethod
+     def cls_from_device(
+         cls,
+         device: Optional[torch.device | str | Literal["disk"]] = None,
+     ) -> type["OffloadCache"]:
+         """
+         Get the subclass which implements offloading for the given `offload_device`.
+         Use `torch.distributed` to detect if the environment is distributed
+
+         :param device: offload device used to find subclass
+         :return: subclass of `OffloadCache`
+         """
+         from compressed_tensors.offload.cache.cpu import CPUCache
+         from compressed_tensors.offload.cache.device import DeviceCache
+
+         device_type = torch.device(device).type if device != "disk" else "disk"
+         distributed = dist.is_available() and dist.is_initialized()
+
+         match (device_type, distributed):
+             case ("cpu", False):
+                 return CPUCache
+             case ("cuda", False):
+                 return DeviceCache
+             case _:
+                 raise NotImplementedError(
+                     f"Offload of type {device} and "
+                     f"distributed={distributed} has not been implemented"
+                 )
+
+     @classmethod
+     def from_mapping(
+         cls,
+         mapping: MutableMapping[str, torch.Tensor | None],
+         onload_device: torch.device | str,
+     ):
+         """
+         Initialize an instance from a given mapping, typically `Module._parameters` or
+         `Module._buffers`. Mapping values will be offloaded
+
+         :param mapping: mapping used to populate cache
+         :param onload_device: device which tensors will be onloaded to
+         """
+         instance = cls(onload_device=onload_device)
+         instance.offloaded_values = {
+             name: instance.offload(tensor) for name, tensor in mapping.items()
+         }
+
+         return instance
+
+     def __init__(self, onload_device: torch.device | str):
+         super().__init__()
+         self.onload_device = onload_device
+         self.offloaded_values = dict()
+
+     @abstractmethod
+     def onload(self, offloaded: torch.Tensor | None) -> torch.Tensor:
+         """
+         Given an offloaded tensor, returns that tensor after onloading
+
+         :param offloaded: offloaded tensor
+         :return: onloaded tensor
+         """
+         raise NotImplementedError()
+
+     @abstractmethod
+     def offload(self, tensor: torch.Tensor | None) -> torch.Tensor:
+         """
+         Given a tensor, returns that tensor after offloading
+
+         :param tensor: tensor to offload
+         :return: offloaded tensor
+         """
+         raise NotImplementedError()
+
+     def __getitem__(self, key: str) -> torch.Tensor:
+         """
+         Onload a tensor
+
+         If called within the `disable_offloading` context, a strong reference of the
+         onloaded tensor is kept so that future accesses will not require device movement
+
+         :param key: name of tensor to access
+         :return: onloaded tensor
+         """
+         offloaded = self.offloaded_values[key]
+
+         # when onloading is disabled, offloaded tensors can be accessed directly
+         if offloaded is None or self.onloading_disabled:
+             return offloaded
+
+         # check for cache hit
+         if offloaded in self.keep_onloaded_values:
+             return self.keep_onloaded_values[offloaded]
+
+         # onload value
+         onloaded = self.onload(offloaded)
+
+         # when offloading is disabled, populate cache
+         if self.offloading_disabled:
+             self.keep_onloaded_values[offloaded] = onloaded
+
+         return onloaded
+
+     def __setitem__(self, key: str, value: torch.Tensor | None):
+         """
+         Offload a tensor and add it to the cache.
+
+         If called within the `disable_onloading` context, the tensor is not offloaded
+         and is instead assigned directly
+
+         :param key: name of tensor
+         :param value: tensor value to offload
+         """
+         if key in self:
+             del self[key]
+
+         # when onloading is disabled, parameters can be access and assigned directly
+         if self.onloading_disabled:
+             self.offloaded_values[key] = value
+             return
+
+         self.offloaded_values[key] = self.offload(value)
+
+     def __delitem__(self, key: str):
+         """
+         Remove the offloaded tensor associated with `key`. Any references to its
+         onloaded tensors held by this class are invalidated.
+
+         :param key: name of tensor to invalidate
+         """
+         offloaded = self.offloaded_values[key]
+         del self.offloaded_values[key]
+
+         # remove strong ref
+         if offloaded in self.keep_onloaded_values:
+             del self.keep_onloaded_values[offloaded]
+
+     def __contains__(self, key) -> bool:
+         return key in self.offloaded_values
+
+     def __iter__(self):
+         return iter(self.offloaded_values)
+
+     def __len__(self):
+         return len(self.offloaded_values)
+
+     @classmethod
+     @contextlib.contextmanager
+     def disable_offloading(cls):
+         """
+         Context to disable all offloading for offloaded modules which share this cache.
+         After a weight has been fetched once, that onloaded value is cached and
+         subsequent fetches will leverage the cache, reducing device movement
+         """
+         if not OffloadCache.offloading_disabled:
+             OffloadCache.offloading_disabled = True
+             yield
+             OffloadCache.offloading_disabled = False
+             OffloadCache.keep_onloaded_values.clear()
+         else:
+             yield
+
+     @classmethod
+     @contextlib.contextmanager
+     def disable_onloading(cls):
+         """
+         Context to disable all onloading for offloaded modules which share this cache.
+         This is mostly used for debugging purposes, and allows the caller to directly
+         inspect offloaded tensors and directly assign offloaded tensors without copying
+         """
+         if not OffloadCache.onloading_disabled:
+             OffloadCache.onloading_disabled = True
+             yield
+             OffloadCache.onloading_disabled = False
+         else:
+             yield
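For readers skimming the new `offload` subpackage, the following is a minimal, illustrative sketch of the dict-like contract that `OffloadCache` defines above: tensors are offloaded when assigned, onloaded when accessed, and `disable_offloading` keeps onloaded values cached across repeated accesses. The `ToyCache` subclass, device choices, and variable names are assumptions for illustration only; the package ships its own `CPUCache` and `DeviceCache` implementations.

```python
import torch
from compressed_tensors.offload.cache import OffloadCache


class ToyCache(OffloadCache):
    """Illustrative (hypothetical) subclass: offload to CPU, onload to the configured device."""

    offload_device = torch.device("cpu")

    def offload(self, tensor):
        # store assigned tensors on the offload device (None passes through)
        return tensor.to(self.offload_device) if tensor is not None else None

    def onload(self, offloaded):
        # return accessed tensors on the onload device
        return offloaded.to(self.onload_device)


linear = torch.nn.Linear(4, 4)
linear._parameters = ToyCache.from_mapping(linear._parameters, onload_device="cpu")

w1 = linear._parameters["weight"]  # onloaded on access
with OffloadCache.disable_offloading():
    a = linear._parameters["weight"]
    b = linear._parameters["weight"]  # cache hit: the same onloaded tensor is reused
    assert a is b
```

The same pattern appears to underpin `offload_model` and `dispatch_model` in `offload/dispatch.py`, which swap a module's `_parameters` and `_buffers` mappings for a cache subclass chosen via `OffloadCache.cls_from_device`.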