PyPI - compressed-tensors - Versions diffs - 0.12.3a20251013__tar.gz → 0.12.3a20251023__tar.gz - Mend

compressed-tensors 0.12.3a20251013__tar.gz → 0.12.3a20251023__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (162) hide show

{compressed_tensors-0.12.3a20251013 → compressed_tensors-0.12.3a20251023}/.github/workflows/build-test.yml RENAMED Viewed

@@ -55,27 +55,3 @@ jobs:
             whl: ${{ needs.BUILD.outputs.whl }}
             code_coverage: ${{ matrix.test_config.code_coverage || false }}
         secrets: inherit
-    UPLOAD:
-        needs: [TEST]
-        uses: ./.github/workflows/upload.yml
-        with:
-            label: gcp-k8s-util
-            timeout: 40
-            run_id: ${{ github.run_id }}
-            push_to_pypi: ${{ inputs.push_to_pypi }}
-        secrets: inherit
-    REPORT:
-        needs: [BUILD, TEST]
-        if: success() || failure()
-        uses: ./.github/workflows/report.yml
-        with:
-            label: rh-reporter
-            timeout: 40
-            run_id: ${{ github.run_id }}
-            run_name: compressed-tensors
-            wheel: ${{ needs.BUILD.outputs.whl }}
-            wf_category: ${{ inputs.wf_category }}
-            gitref: ${{ inputs.gitref }}
-        secrets: inherit

{compressed_tensors-0.12.3a20251013 → compressed_tensors-0.12.3a20251023}/.github/workflows/build.yml RENAMED Viewed

@@ -86,9 +86,9 @@ jobs:
               id: auth
               uses: google-github-actions/auth@v2.1.3
               with:
-                  project_id: ${{ secrets.GCP_PROJECT }}
-                  workload_identity_provider: ${{ secrets.GCP_WORKLOAD_IDENTITY_PROVIDER }}
-                  service_account: ${{ secrets.GCP_GHA_SA }}
+                  project_id: ${{ secrets.GCP_VLLM_PROJECT }}
+                  workload_identity_provider: ${{ secrets.GCP_VLLM_PROJECT_WORKLOAD_IDENTITY_PROVIDER }}
+                  service_account: ${{ secrets.GCP_VLLM_PROJECT_GHA_SA }}
             - name: 'Set up Cloud SDK'
               uses: 'google-github-actions/setup-gcloud@v2'
@@ -97,8 +97,8 @@ jobs:
             - name: copy whl and source distribution
               run: |
-                  gcloud storage cp dist/${{ steps.build.outputs.whlname }} ${{ secrets.GCP_BUILD_ML_ASSETS2 }}/${{ github.run_id }}/${{ steps.build.outputs.whlname }}
-                  gcloud storage cp dist/${{ steps.build.outputs.tarname }} ${{ secrets.GCP_BUILD_ML_ASSETS2 }}/${{ github.run_id }}/${{ steps.build.outputs.tarname }}
+                  gcloud storage cp dist/${{ steps.build.outputs.whlname }} ${{ secrets.GCP_VLLM_PROJECT_BUILD_ASSETS }}/${{ github.run_id }}/${{ steps.build.outputs.whlname }}
+                  gcloud storage cp dist/${{ steps.build.outputs.tarname }} ${{ secrets.GCP_VLLM_PROJECT_BUILD_ASSETS }}/${{ github.run_id }}/${{ steps.build.outputs.tarname }}
             - name: upload whl
               uses: actions/upload-artifact@v4

{compressed_tensors-0.12.3a20251013 → compressed_tensors-0.12.3a20251023}/.github/workflows/test-check.yaml RENAMED Viewed

@@ -12,10 +12,9 @@ on:
 jobs:
   python-tests:
-    runs-on: k8s-util
+    runs-on: ubuntu-22.04
     env:
-        HF_HOME: /model-cache
-        HF_TOKEN: ${{ secrets.NM_HF_TOKEN_READ_ONLY }}
+        HF_TOKEN: ${{ secrets.HF_RED_HAT_READ_ONLY }}
     steps:
         - uses: actions/setup-python@v5
           with:
@@ -32,5 +31,20 @@ jobs:
           run: pip3 install --upgrade pip setuptools
         - name: "⚙️ Install dependencies"
           run: pip3 install .[dev,accelerate]
+        - name: clean up
+          run: |
+            echo "cleaning up disk space as GHA runner has limited disk size."
+            python3 -m pip cache purge
+            sudo rm -rf /usr/local/.ghcup
+            sudo rm -rf /opt/hostedtoolcache/CodeQL
+            sudo rm -rf /usr/local/lib/android/sdk/ndk
+            sudo rm -rf /usr/share/dotnet
+            sudo rm -rf /opt/ghc
+            sudo rm -rf /usr/local/share/boost
+            if [[ "$(cat /etc/issue)" =~ Ubuntu ]]; then
+              sudo apt-get clean
+            fi
+            df -h
+          shell: bash
         - name: "🔬 Running tests"
           run: make test

{compressed_tensors-0.12.3a20251013 → compressed_tensors-0.12.3a20251023}/.github/workflows/test.yml RENAMED Viewed

@@ -72,8 +72,7 @@ jobs:
             id-token: 'write'
             pages: 'write'
         env:
-            HF_HOME: /model-cache
-            HF_TOKEN: ${{ secrets.NM_HF_TOKEN_READ_ONLY }}
+            HF_TOKEN: ${{ secrets.HF_RED_HAT_READ_ONLY }}
         environment:
             name: github-pages
             url: ${{ steps.coverage.outputs.page_url }}
@@ -123,9 +122,9 @@ jobs:
               id: auth
               uses: google-github-actions/auth@v2.1.3
               with:
-                  project_id: ${{ secrets.GCP_PROJECT }}
-                  workload_identity_provider: ${{ secrets.GCP_WORKLOAD_IDENTITY_PROVIDER }}
-                  service_account: ${{ secrets.GCP_GHA_SA }}
+                  project_id: ${{ secrets.GCP_VLLM_PROJECT }}
+                  workload_identity_provider: ${{ secrets.GCP_VLLM_PROJECT_WORKLOAD_IDENTITY_PROVIDER }}
+                  service_account: ${{ secrets.GCP_VLLM_PROJECT_GHA_SA }}
             - name: 'Set up Cloud SDK'
               uses: 'google-github-actions/setup-gcloud@v2'
@@ -136,7 +135,7 @@ jobs:
               if: ${{ inputs.run_id != '' }}
               uses: neuralmagic/nm-actions/actions/gcp-download-assets@v1.1.0
               with:
-                  bucket_source: ${{ secrets.GCP_BUILD_ML_ASSETS2 }}
+                  bucket_source: ${{ secrets.GCP_VLLM_PROJECT_BUILD_ASSETS }}
                   run_id: ${{ inputs.run_id }}
             - name: run tests
@@ -165,7 +164,7 @@ jobs:
             - name: copy results to GCP
               run: |
-                  gcloud storage cp test-results/report.xml ${{ secrets.GCP_BUILD_ML_ASSETS2 }}/${{ github.run_id }}/test-results/report-${{ inputs.test_label }}.xml
+                  gcloud storage cp test-results/report.xml ${{ secrets.GCP_VLLM_PROJECT_BUILD_ASSETS }}/${{ github.run_id }}/test-results/report-${{ inputs.test_label }}.xml
             - name: upload results
               uses: actions/upload-artifact@v4

{compressed_tensors-0.12.3a20251013 → compressed_tensors-0.12.3a20251023}/.github/workflows/trigger-all.yml RENAMED Viewed

@@ -11,10 +11,6 @@ on:
                 description: "workflow category, must be 'NIGHTLY' or 'RELEASE' (default: NIGHTLY)"
                 type: string
                 default: NIGHTLY
-            push_to_pypi:
-                description: "when set and tests pass, then '.whl' & '.tar.gz' will be pushed to public pypi"
-                type: boolean
-                default: false
             gitref:
                 description: "git commit hash or tag name"
                 type: string
@@ -29,10 +25,6 @@ on:
                     - NIGHTLY
                     - RELEASE
                 default: NIGHTLY
-            push_to_pypi:
-                description: "when set and tests pass, then '.whl' & '.tar.gz' will be pushed to public pypi"
-                type: boolean
-                default: false
             gitref:
                 description: "git commit hash or tag name"
                 type: string
@@ -46,9 +38,8 @@ jobs:
         with:
             wf_category: ${{ inputs.wf_category || 'NIGHTLY' }}
             gitref: ${{ inputs.gitref || 'main' }}
-            push_to_pypi: ${{ (github.event.schedule == '30 0 * * *') || inputs.push_to_pypi || false }}
-            test_configs: '[{"python":"3.11.4","label":"k8s-util","timeout":"40","code_coverage":true},
-                            {"python":"3.10.12","label":"k8s-util","timeout":"40"},
-                            {"python":"3.13","label":"k8s-h100-solo","timeout":"40"},
-                            {"python":"3.12.6","label":"k8s-a100-duo","timeout":"40"}]'
+            test_configs: '[{"python":"3.11.4","label":"ubuntu-latest","timeout":"40","code_coverage":true},
+                            {"python":"3.10.12","label":"ubuntu-latest","timeout":"40"},
+                            {"python":"3.13","label":"ubuntu-24.04","timeout":"40"},
+                            {"python":"3.12.6","label":"ubuntu-22.04","timeout":"40"}]'
         secrets: inherit

{compressed_tensors-0.12.3a20251013 → compressed_tensors-0.12.3a20251023}/PKG-INFO RENAMED Viewed

@@ -1,8 +1,8 @@
 Metadata-Version: 2.4
 Name: compressed-tensors
-Version: 0.12.3a20251013
+Version: 0.12.3a20251023
 Summary: Library for utilization of compressed safetensors of neural network models
-Home-page: https://github.com/neuralmagic/compressed-tensors
+Home-page: https://github.com/vllm-project/compressed-tensors
 Author: Neuralmagic, Inc.
 Author-email: support@neuralmagic.com
 License: Apache 2.0
@@ -71,7 +71,7 @@ pip install --pre compressed-tensors
 ### From Source
 ```bash
-git clone https://github.com/neuralmagic/compressed-tensors
+git clone https://github.com/vllm-project/compressed-tensors
 cd compressed-tensors
 pip install -e .
 ```
@@ -112,7 +112,7 @@ We can apply bitmask compression to a whole model. For more detailed example see
 from compressed_tensors import save_compressed_model, load_compressed, BitmaskConfig
 from transformers import AutoModelForCausalLM
-model_name = "neuralmagic/llama2.c-stories110M-pruned50"
+model_name = "RedHatAI/llama2.c-stories110M-pruned50"
 model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto")
 original_state_dict = model.state_dict()
@@ -126,7 +126,7 @@ save_compressed_model(model, "compressed_model.safetensors", compression_format=
 state_dict = dict(load_compressed("compressed_model.safetensors", compression_config))
 ```
-For more in-depth tutorial on bitmask compression, refer to the [notebook](https://github.com/neuralmagic/compressed-tensors/blob/d707c5b84bc3fef164aebdcd97cb6eaa571982f8/examples/bitmask_compression.ipynb).
+For more in-depth tutorial on bitmask compression, refer to the [notebook](https://github.com/vllm-project/compressed-tensors/blob/d707c5b84bc3fef164aebdcd97cb6eaa571982f8/examples/bitmask_compression.ipynb).
 ## Saving a Compressed Model with PTQ

{compressed_tensors-0.12.3a20251013 → compressed_tensors-0.12.3a20251023}/README.md RENAMED Viewed

@@ -37,7 +37,7 @@ pip install --pre compressed-tensors
 ### From Source
 ```bash
-git clone https://github.com/neuralmagic/compressed-tensors
+git clone https://github.com/vllm-project/compressed-tensors
 cd compressed-tensors
 pip install -e .
 ```
@@ -78,7 +78,7 @@ We can apply bitmask compression to a whole model. For more detailed example see
 from compressed_tensors import save_compressed_model, load_compressed, BitmaskConfig
 from transformers import AutoModelForCausalLM
-model_name = "neuralmagic/llama2.c-stories110M-pruned50"
+model_name = "RedHatAI/llama2.c-stories110M-pruned50"
 model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto")
 original_state_dict = model.state_dict()
@@ -92,7 +92,7 @@ save_compressed_model(model, "compressed_model.safetensors", compression_format=
 state_dict = dict(load_compressed("compressed_model.safetensors", compression_config))
 ```
-For more in-depth tutorial on bitmask compression, refer to the [notebook](https://github.com/neuralmagic/compressed-tensors/blob/d707c5b84bc3fef164aebdcd97cb6eaa571982f8/examples/bitmask_compression.ipynb).
+For more in-depth tutorial on bitmask compression, refer to the [notebook](https://github.com/vllm-project/compressed-tensors/blob/d707c5b84bc3fef164aebdcd97cb6eaa571982f8/examples/bitmask_compression.ipynb).
 ## Saving a Compressed Model with PTQ

{compressed_tensors-0.12.3a20251013 → compressed_tensors-0.12.3a20251023}/setup.py RENAMED Viewed

@@ -109,7 +109,7 @@ setup(
     description="Library for utilization of compressed safetensors of neural network models",
     long_description=_setup_long_description()[0],
     long_description_content_type=_setup_long_description()[1],
-    url="https://github.com/neuralmagic/compressed-tensors",
+    url="https://github.com/vllm-project/compressed-tensors",
     extras_require=_setup_extras(),
     install_requires=_setup_install_requires(),
     package_dir={"": "src"},

{compressed_tensors-0.12.3a20251013 → compressed_tensors-0.12.3a20251023}/src/compressed_tensors/compressors/quantized_compressors/__init__.py RENAMED Viewed

@@ -14,6 +14,6 @@
 # flake8: noqa
 from .base import *
+from .fp4_quantized import *
 from .naive_quantized import *
-from .nvfp4_quantized import *
 from .pack_quantized import *

compressed_tensors-0.12.3a20251013/src/compressed_tensors/compressors/quantized_compressors/nvfp4_quantized.py → compressed_tensors-0.12.3a20251023/src/compressed_tensors/compressors/quantized_compressors/fp4_quantized.py RENAMED Viewed

@@ -123,6 +123,15 @@ class NVFP4PackedCompressor(BaseQuantizationCompressor):
         return decompressed_weight
+@BaseCompressor.register(name=CompressionFormat.mxfp4_pack_quantized.value)
+class MXFP4PackedCompressor(NVFP4PackedCompressor):
+    """
+    Alias for mxfp4 quantized models
+    """
+    pass
 @torch.compile(fullgraph=True, dynamic=True)
 def pack_fp4_to_uint8(x: torch.Tensor) -> torch.Tensor:
     """

{compressed_tensors-0.12.3a20251013 → compressed_tensors-0.12.3a20251023}/src/compressed_tensors/compressors/sparse_compressors/sparse_24_bitmask.py RENAMED Viewed

@@ -19,7 +19,7 @@ import torch
 from compressed_tensors.compressors.base import BaseCompressor
 from compressed_tensors.compressors.sparse_compressors.base import BaseSparseCompressor
 from compressed_tensors.config import CompressionFormat, SparsityStructure
-from compressed_tensors.quantization import FP8_DTYPE
+from compressed_tensors.quantization import FP8_E4M3_DATA
 from compressed_tensors.utils import merge_names, pack_bitmasks, unpack_bitmasks
 from torch import Tensor
@@ -189,11 +189,11 @@ def sparse24_bitmask_compress(
     bytemasks = get_24_bytemasks(tensor=tensor)
-    if tensor.dtype == FP8_DTYPE:
+    if tensor.dtype == FP8_E4M3_DATA.dtype:
         # acces raw bytes of the tensor
         tensor_view = tensor.view(torch.int8)
         values = tensor_view[bytemasks]
-        values = values.view(FP8_DTYPE)
+        values = values.view(FP8_E4M3_DATA.dtype)
     else:
         values = tensor[bytemasks]
@@ -241,7 +241,7 @@ def get_24_bytemasks(tensor):
                         multiple of 4.
     """
     original_dtype = tensor.dtype
-    if tensor.dtype == FP8_DTYPE:
+    if tensor.dtype == FP8_E4M3_DATA.dtype:
         tensor = tensor.view(torch.int8)
     original_shape = tensor.shape
     num_elements = tensor.numel()

{compressed_tensors-0.12.3a20251013 → compressed_tensors-0.12.3a20251023}/src/compressed_tensors/compressors/sparse_compressors/sparse_bitmask.py RENAMED Viewed

@@ -18,7 +18,7 @@ import torch
 from compressed_tensors.compressors.base import BaseCompressor
 from compressed_tensors.compressors.sparse_compressors.base import BaseSparseCompressor
 from compressed_tensors.config import CompressionFormat
-from compressed_tensors.quantization import FP8_DTYPE
+from compressed_tensors.quantization import FP8_E4M3_DATA
 from compressed_tensors.utils import merge_names, pack_bitmasks, unpack_bitmasks
 from torch import Tensor
@@ -138,11 +138,11 @@ def bitmask_compress(tensor: Tensor) -> Tuple[Tensor, Tensor, Tensor]:
     bytemasks = tensor != 0
     row_counts = bytemasks.sum(dim=-1)
     row_offsets = torch.cumsum(row_counts, 0) - row_counts
-    if tensor.dtype == FP8_DTYPE:
+    if tensor.dtype == FP8_E4M3_DATA.dtype:
         # acces raw bytes of the tensor
         tensor_view = tensor.view(torch.int8)
         values = tensor_view[bytemasks]
-        values = values.view(FP8_DTYPE)
+        values = values.view(FP8_E4M3_DATA.dtype)
     else:
         values = tensor[bytemasks]
     bitmasks_packed = pack_bitmasks(bytemasks)

{compressed_tensors-0.12.3a20251013 → compressed_tensors-0.12.3a20251023}/src/compressed_tensors/config/base.py RENAMED Viewed

@@ -34,6 +34,7 @@ class CompressionFormat(Enum):
     marlin_24 = "marlin-24"
     mixed_precision = "mixed-precision"
     nvfp4_pack_quantized = "nvfp4-pack-quantized"
+    mxfp4_pack_quantized = "mxfp4-pack-quantized"
 @unique

compressed_tensors-0.12.3a20251023/src/compressed_tensors/modeling/__init__.py ADDED Viewed

@@ -0,0 +1,18 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# flake8: noqa
+# isort: off
+from .kvcache import *
+from .attention import *

compressed_tensors-0.12.3a20251023/src/compressed_tensors/modeling/attention.py ADDED Viewed

@@ -0,0 +1,147 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import inspect
+from typing import Callable, Optional
+from compressed_tensors.modeling.kvcache import initialize_hooked_kv_cache
+from compressed_tensors.quantization.lifecycle.forward import forward_quantize
+from compressed_tensors.utils import getattr_chain
+from compressed_tensors.utils.internal import InternalModule
+from torch import Tensor
+from torch.nn import Module
+from torch.utils.hooks import RemovableHandle
+from transformers import PretrainedConfig, PreTrainedModel
+from transformers.masking_utils import ALL_MASK_ATTENTION_FUNCTIONS
+from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS
+__all__ = [
+    "QuantizedAttentionImpl",
+    "initialize_hooked_attention",
+    "register_query_hook",
+    "IMPL_ATTR",
+]
+IMPL_ATTR = "impl"
+HOOKED_ATTENTION_NAME = "ct_hooked_attention"
+class QuantizedAttentionImpl(InternalModule):
+    """
+    QuantizedAttentionImpl module which wraps the functionality of the original
+    attention implementation. Unlike the original attention function, this
+    implementation is a `torch.nn.Module` which can be hooked to trigger
+    transforms and calibration hooks.
+    This module works by being registered as a submodule to attention modules via
+    `initialize_hooked_attention`, registering a new attention implementation function
+    which calls this module, then setting the model attention implementation to the new
+    function. After triggering hooks and quantization, this module calls the original
+    attention implementation function.
+    """
+    _original_impl = "eager"
+    def __init__(self, config: PretrainedConfig):
+        super().__init__()
+        self.config = config
+    def forward(
+        self,
+        module: Module,
+        query: Tensor,
+        key: Tensor,
+        value: Tensor,
+        *args,
+        **kwargs,
+    ):
+        # quantization
+        quant_args_attr = "quantization_scheme.input_activations"
+        quant_args = getattr_chain(module, quant_args_attr, None)
+        quant_enabled = getattr(module, "quantization_enabled", True)
+        if quant_args is not None and quant_enabled:
+            query = forward_quantize(module, query, "q", quant_args)
+        # original attention
+        return ALL_ATTENTION_FUNCTIONS[QuantizedAttentionImpl._original_impl](
+            module,
+            query,
+            key,
+            value,
+            *args,
+            **kwargs,
+        )
+# ----- initialize ----- #
+def _hooked_attention(module: Module, *args, **kwargs):
+    assert hasattr(module, IMPL_ATTR), (
+        f"Using {HOOKED_ATTENTION_NAME} attention implementation, "
+        f"but attention module does not have {IMPL_ATTR} submodule."
+    )
+    return getattr(module, IMPL_ATTR)(module, *args, **kwargs)
+def initialize_hooked_attention(model: PreTrainedModel, module: Module):
+    """
+    Initialize `QuantizedAttentionImpl` and `QuantizedKVCache` instances
+    attached to attention. Assumes that only one model is hooked at a time.
+    :param model: parent model of attention module
+    :param module: attention module to initialize with
+    """
+    if not hasattr(module, IMPL_ATTR):
+        module.register_module(IMPL_ATTR, QuantizedAttentionImpl(model.config))
+    if model.config._attn_implementation != HOOKED_ATTENTION_NAME:
+        QuantizedAttentionImpl._original_impl = model.config._attn_implementation
+        original_mask = ALL_MASK_ATTENTION_FUNCTIONS[model.config._attn_implementation]
+        ALL_ATTENTION_FUNCTIONS.register(HOOKED_ATTENTION_NAME, _hooked_attention)
+        ALL_MASK_ATTENTION_FUNCTIONS.register(HOOKED_ATTENTION_NAME, original_mask)
+        model.set_attn_implementation(HOOKED_ATTENTION_NAME)
+        assert model.config._attn_implementation == HOOKED_ATTENTION_NAME
+    initialize_hooked_kv_cache(model, module)
+# ----- hooks ----- #
+def register_query_hook(
+    module: Module, hook: Callable[[Module, Tensor], Optional[Tensor]]
+) -> RemovableHandle:
+    """
+    Register a hook which takes post-rope query states as an argument and
+    returns the modified query states or `None`
+    :param module: attention module to add hook to
+    :param hook: query hook function
+    """
+    impl: QuantizedAttentionImpl = getattr(module, IMPL_ATTR)
+    def _hook(impl: QuantizedAttentionImpl, args, kwargs):
+        bound = inspect.signature(impl.forward).bind(*args, **kwargs)
+        value = hook(module, bound.arguments["query"])
+        if value is not None:
+            bound.arguments["query"] = value
+        return bound.args, bound.kwargs
+    return impl.register_forward_pre_hook(_hook, with_kwargs=True)

compressed-tensors 0.12.3a20251013__tar.gz → 0.12.3a20251023__tar.gz

compressed-tensors 0.12.3a20251013__tar.gz → 0.12.3a20251023__tar.gz