PyPI - compressed-tensors - Versions diffs - 0.12.3a20251008__tar.gz → 0.12.3a20251010__tar.gz - Mend

compressed-tensors 0.12.3a20251008__tar.gz → 0.12.3a20251010__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (158) hide show

{compressed_tensors-0.12.3a20251008 → compressed_tensors-0.12.3a20251010}/.github/workflows/test.yml RENAMED Viewed

@@ -82,7 +82,7 @@ jobs:
             - name: set python
               id: set_python
-              uses: actions/setup-python@v5
+              uses: actions/setup-python@v6
               with:
                   python-version: ${{ inputs.python }}

{compressed_tensors-0.12.3a20251008 → compressed_tensors-0.12.3a20251010}/.github/workflows/trigger-all.yml RENAMED Viewed

@@ -49,6 +49,6 @@ jobs:
             push_to_pypi: ${{ (github.event.schedule == '30 0 * * *') || inputs.push_to_pypi || false }}
             test_configs: '[{"python":"3.11.4","label":"k8s-util","timeout":"40","code_coverage":true},
                             {"python":"3.10.12","label":"k8s-util","timeout":"40"},
-                            {"python":"3.9.17","label":"k8s-h100-solo","timeout":"40"},
+                            {"python":"3.13","label":"k8s-h100-solo","timeout":"40"},
                             {"python":"3.12.6","label":"k8s-a100-duo","timeout":"40"}]'
         secrets: inherit

{compressed_tensors-0.12.3a20251008 → compressed_tensors-0.12.3a20251010}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: compressed-tensors
-Version: 0.12.3a20251008
+Version: 0.12.3a20251010
 Summary: Library for utilization of compressed safetensors of neural network models
 Home-page: https://github.com/neuralmagic/compressed-tensors
 Author: Neuralmagic, Inc.

{compressed_tensors-0.12.3a20251008 → compressed_tensors-0.12.3a20251010}/src/compressed_tensors/base.py RENAMED Viewed

@@ -20,6 +20,3 @@ TRANSFORM_CONFIG_NAME = "transform_config"
 # required fields
 COMPRESSION_VERSION_NAME = "version"
 QUANTIZATION_METHOD_NAME = "quant_method"
-# auxillary configs
-KV_CACHE_SCHEME_NAME = "kv_cache_scheme"

{compressed_tensors-0.12.3a20251008 → compressed_tensors-0.12.3a20251010}/src/compressed_tensors/quantization/lifecycle/forward.py RENAMED Viewed

@@ -330,7 +330,7 @@ def _process_quantization(
             inv_perm = torch.argsort(perm)
             output = output.index_select(-1, inv_perm)
-    else:  # covers channel, token and tensor strategies
+    else:  # covers tensor, channel, token, and attn_head strategies
         if do_quantize:
             output = _quantize(
                 x=x,

{compressed_tensors-0.12.3a20251008 → compressed_tensors-0.12.3a20251010}/src/compressed_tensors/quantization/lifecycle/initialize.py RENAMED Viewed

@@ -14,7 +14,7 @@
 import logging
-from typing import Optional, Tuple
+from typing import Optional, Tuple, Union
 import torch
 from compressed_tensors.quantization import (
@@ -152,7 +152,7 @@ def initialize_qparams(
     module: Module,
     base_name: str,
     quantization_args: QuantizationArgs,
-    observed_shape: Tuple[int],
+    observed_shape: Tuple[Union[int, None]],
     observed_dtype: torch.dtype,
     force_zero_point: bool = True,
 ):
@@ -199,7 +199,7 @@ def initialize_qparams(
         expected_shape = (1,)
     elif strategy == QuantizationStrategy.TOKEN:
-        expected_shape = (1, 1)
+        raise ValueError("Cannot perform static token quantization")
     elif strategy == QuantizationStrategy.CHANNEL:
         if len(observed_shape) < 2:
@@ -234,6 +234,13 @@ def initialize_qparams(
         num_cols = strategy_cdiv(observed_shape[-1], block_structure[-1], strategy)
         expected_shape = (num_rows, num_cols)
+    elif strategy == QuantizationStrategy.ATTN_HEAD:
+        # (batch_size, num_attention_heads, seq_len, head_dim)
+        if len(observed_shape) < 3:
+            raise ValueError("Attention quant requires at least 3 observed dimensions")
+        expected_shape = (observed_shape[-3], 1, 1)
     else:
         assert False, f"Unknown strategy {strategy}"

{compressed_tensors-0.12.3a20251008 → compressed_tensors-0.12.3a20251010}/src/compressed_tensors/quantization/quant_args.py RENAMED Viewed

@@ -101,6 +101,7 @@ class QuantizationStrategy(str, Enum):
     BLOCK = "block"
     TOKEN = "token"
     TENSOR_GROUP = "tensor_group"
+    ATTN_HEAD = "attn_head"
 class DynamicType(str, Enum):
@@ -263,6 +264,7 @@ class QuantizationArgs(BaseModel, use_enum_values=True):
         actorder = model.actorder
         dynamic = model.dynamic
         observer = model.observer
+        dynamic = model.dynamic
         # infer strategy
         if strategy is None:
@@ -278,6 +280,12 @@ class QuantizationArgs(BaseModel, use_enum_values=True):
                     "strategy='group' and group_size = -1 for 'channel'"
                 )
+        # validate token strategy
+        if strategy == QuantizationStrategy.TOKEN and not dynamic:
+            raise ValueError(
+                "Cannot perform static token quantization, please use `dynamic=True`"
+            )
         # validate group strategy
         if strategy == QuantizationStrategy.GROUP:
             if group_size is None or group_size <= 0:

{compressed_tensors-0.12.3a20251008 → compressed_tensors-0.12.3a20251010}/src/compressed_tensors/quantization/quant_scheme.py RENAMED Viewed

@@ -65,6 +65,7 @@ class QuantizationScheme(BaseModel):
                 QuantizationStrategy.TENSOR,
                 QuantizationStrategy.GROUP,
                 QuantizationStrategy.TENSOR_GROUP,
+                QuantizationStrategy.ATTN_HEAD,
             ):
                 if (
                     inputs.strategy == QuantizationStrategy.GROUP

{compressed_tensors-0.12.3a20251008 → compressed_tensors-0.12.3a20251010}/src/compressed_tensors/version.py RENAMED Viewed

@@ -17,5 +17,5 @@ __version__: str
 __version_tuple__: VERSION_TUPLE
 version_tuple: VERSION_TUPLE
-__version__ = version = '0.12.3.a20251008'
+__version__ = version = '0.12.3.a20251010'
 __version_tuple__ = version_tuple = (0, 12, 3)

{compressed_tensors-0.12.3a20251008 → compressed_tensors-0.12.3a20251010}/src/compressed_tensors.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: compressed-tensors
-Version: 0.12.3a20251008
+Version: 0.12.3a20251010
 Summary: Library for utilization of compressed safetensors of neural network models
 Home-page: https://github.com/neuralmagic/compressed-tensors
 Author: Neuralmagic, Inc.

{compressed_tensors-0.12.3a20251008 → compressed_tensors-0.12.3a20251010}/src/compressed_tensors.egg-info/SOURCES.txt RENAMED Viewed

@@ -101,6 +101,7 @@ src/compressed_tensors/utils/semi_structured_conversions.py
 src/compressed_tensors/utils/type.py
 tests/__init__.py
 tests/conftest.py
+tests/mock_observer.py
 tests/test_registry.py
 tests/testing_utils.py
 tests/test_compressors/__init__.py
@@ -134,6 +135,7 @@ tests/test_quantization/lifecycle/test_enabled.py
 tests/test_quantization/lifecycle/test_forward.py
 tests/test_quantization/lifecycle/test_initialize.py
 tests/test_quantization/lifecycle/test_lifecycle.py
+tests/test_quantization/lifecycle/test_static_lifecycle.py
 tests/test_quantization/test_configs/__init__.py
 tests/test_quantization/test_configs/test_bit_depths.py
 tests/test_quantization/test_configs/test_strategies.py

{compressed_tensors-0.12.3a20251008 → compressed_tensors-0.12.3a20251010}/tests/conftest.py RENAMED Viewed

@@ -29,27 +29,6 @@ def _get_dim(dim: int, value: torch.Tensor):
     return reduce_dims
-@pytest.fixture
-def mock_per_token_calibration():
-    def update_scale_zp(module: torch.nn.Module, base_name: str, value: torch.Tensor):
-        quantization_scheme = getattr(module, "quantization_scheme", None)
-        if not quantization_scheme:
-            # no quantization scheme nothing to do
-            return
-        arg_name = "weights" if base_name == "weight" else f"{base_name}_activations"
-        args = getattr(quantization_scheme, arg_name, None)
-        dim = _get_dim({0, 1}, value)
-        min_val = torch.amin(value, dim=dim, keepdims=True)
-        max_val = torch.amax(value, dim=dim, keepdims=True)
-        scale, zp = calculate_qparams(min_val, max_val, args)
-        update_parameter_data(module, scale, f"{base_name}_scale")
-        update_parameter_data(module, zp, f"{base_name}_zero_point")
-    return update_scale_zp
 @pytest.fixture
 def mock_per_group_calibration():
     def update_scale_zp(

compressed_tensors-0.12.3a20251010/tests/mock_observer.py ADDED Viewed

@@ -0,0 +1,173 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Tuple
+from weakref import ref
+import torch
+from compressed_tensors.quantization import QuantizationArgs, QuantizationStrategy
+from compressed_tensors.quantization.utils import (
+    calculate_qparams,
+    generate_gparam,
+    strategy_cdiv,
+)
+class MockMinMaxObserver(torch.nn.Module):
+    def __init__(self, base_name: str, args: QuantizationArgs, module: torch.nn.Module):
+        super().__init__()
+        self.parent = ref(module)
+        self.base_name = base_name
+        self.args = args
+        # used for testing
+        self.min_vals = None
+        self.max_vals = None
+    def get_min_max(self, observed: torch.Tensor):
+        min_vals = torch.amin(observed, dim=(0, -1))
+        max_vals = torch.amax(observed, dim=(0, -1))
+        return min_vals, max_vals
+    def forward(self, observed: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+        observed = flatten_for_quantization(observed, self.base_name, self.args)
+        self.min_vals, self.max_vals = self.get_min_max(observed)
+        scales, zero_points = calculate_qparams(
+            min_vals=self.min_vals,
+            max_vals=self.max_vals,
+            quantization_args=self.args,
+            global_scale=getattr(self.parent(), f"{self.base_name}_global_scale", None),
+        )
+        return scales, zero_points
+    def get_global_scale(self, observed: torch.Tensor):
+        observed = observed.reshape((1, 1, -1))  # per tensor reshape
+        min_vals, max_vals = self.get_min_max(observed)
+        global_scale = generate_gparam(min_vals, max_vals)
+        return global_scale
+def flatten_for_quantization(
+    value: torch.Tensor, base_name: str, args: QuantizationArgs
+) -> torch.Tensor:
+    if base_name == "weight":
+        return flatten_weight_for_quantization(value, args)
+    elif base_name in ("input", "output"):
+        return flatten_activation_for_quantization(value, args)
+    elif base_name in ("q", "k", "v"):
+        return flatten_attention_for_quantization(value, args)
+    else:
+        raise ValueError(f"Unknown quantization base name: {base_name}")
+def flatten_weight_for_quantization(value: torch.Tensor, args: QuantizationArgs):
+    # value.shape = (num_rows, num_cols)
+    if args.strategy == QuantizationStrategy.TENSOR:
+        # (1, 1, num_weight_elems)
+        return value.reshape((1, 1, -1))
+    if args.strategy == QuantizationStrategy.TOKEN:
+        raise ValueError("Token quantization cannot be applied to weights")
+    if args.strategy == QuantizationStrategy.CHANNEL:
+        # (1, num_rows, 1, num_cols)
+        return value.unsqueeze(-2).unsqueeze(0)
+    if args.strategy in (QuantizationStrategy.GROUP, QuantizationStrategy.TENSOR_GROUP):
+        # (1, num_rows, num_groups, group_size)
+        return value.unflatten(-1, (-1, args.group_size)).unsqueeze(0)
+    if args.strategy == QuantizationStrategy.BLOCK:
+        # (1, num_block_rows, num_block_cols, block_width * block_height)
+        block_height, block_width = args.block_structure
+        num_rows, num_cols = value.shape
+        num_block_rows = strategy_cdiv(num_rows, block_height, args.strategy)
+        num_block_cols = strategy_cdiv(num_cols, block_width, args.strategy)
+        return (
+            value.reshape(
+                num_block_rows,
+                block_height,
+                num_block_cols,
+                block_width,
+            )
+            .transpose(1, 2)
+            .flatten(-2, -1)
+            .unsqueeze(0)
+        )
+    if args.strategy == QuantizationStrategy.ATTN_HEAD:
+        raise ValueError("attention head quantization cannot be applied to weights")
+    assert False, f"Unknown strategy {args.strategy}"
+def flatten_activation_for_quantization(value: torch.Tensor, args: QuantizationArgs):
+    # value.shape = (batch_size, seq_len, hidden_dim)
+    if args.strategy == QuantizationStrategy.TENSOR:
+        # (batch_size * seq_len, 1, hidden_dim)
+        return value.reshape((-1, 1, value.size(-1)))
+    if args.strategy == QuantizationStrategy.TOKEN:
+        # (batch_size, seq_len, hidden_dim)
+        # warning: token quantization uses `compute_dynamic_scales_and_zp`
+        return value.flatten(2, -1)
+    if args.strategy == QuantizationStrategy.CHANNEL:
+        raise ValueError("Channel quantization cannot be applied to activations")
+    if args.strategy in (QuantizationStrategy.GROUP, QuantizationStrategy.TENSOR_GROUP):
+        # (batch_size * seq_len, num_groups, group_size)
+        # warning: group activation quantization uses compute_dynamic_scales_and_zp
+        return value.flatten(0, 1).unflatten(-1, (-1, args.group_size))
+    if args.strategy == QuantizationStrategy.BLOCK:
+        raise ValueError("Block quantization cannot be applied to activations")
+    if args.strategy == QuantizationStrategy.ATTN_HEAD:
+        raise ValueError("attention head quantization cannot be applied to linear acts")
+    assert False, f"Unknown strategy {args.strategy}"
+def flatten_attention_for_quantization(value: torch.Tensor, args: QuantizationArgs):
+    # value.shape = (batch_size, num_heads, seq_len, head_dim)
+    if args.strategy == QuantizationStrategy.TENSOR:
+        # (batch_size * seq_len, 1, num_heads * head_dim)
+        return value.transpose(1, 2).flatten(0, 1).flatten(-2, -1).unsqueeze(-2)
+    if args.strategy == QuantizationStrategy.TOKEN:
+        raise ValueError("Token quantization cannot be applied to attention")
+    if args.strategy == QuantizationStrategy.CHANNEL:
+        raise ValueError("Channel quantization cannot be applied to attention")
+    if args.strategy in (QuantizationStrategy.GROUP, QuantizationStrategy.TENSOR_GROUP):
+        raise ValueError("Group quantization cannot be applied to attention")
+    if args.strategy == QuantizationStrategy.BLOCK:
+        raise ValueError("Block quantization cannot be applied to attention")
+    if args.strategy == QuantizationStrategy.ATTN_HEAD:
+        # (batch_size * seq_len, num_heads, 1, 1, head_dim)
+        return value.transpose(1, 2).flatten(0, 1).unsqueeze(-2).unsqueeze(-2)
+    assert False, f"Unknown strategy {args.strategy}"

{compressed_tensors-0.12.3a20251008 → compressed_tensors-0.12.3a20251010}/tests/test_quantization/lifecycle/test_initialize.py RENAMED Viewed

@@ -176,10 +176,6 @@ def test_initialize_module_for_quantization_offloaded(
             QuantizationArgs(strategy="block", block_structure=[2, 4]),
             None,
         ),
-        (
-            QuantizationArgs(strategy="token"),
-            QuantizationArgs(strategy="token"),
-        ),
     ],
 )
 def test_initialize_quantization_parameters(weights, input_activations):
@@ -238,9 +234,6 @@ def test_initialize_quantization_parameters(weights, input_activations):
                 # For activations or when block_structure is None
                 expected_shape = (1,)
-        elif args.strategy == QuantizationStrategy.TOKEN:
-            expected_shape = (1, 1)
         if not args.dynamic:
             assert getattr(layer, f"{q_param_name}_scale").shape == expected_shape
             assert getattr(layer, f"{q_param_name}_zero_point").shape == expected_shape

compressed-tensors 0.12.3a20251008__tar.gz → 0.12.3a20251010__tar.gz

compressed-tensors 0.12.3a20251008__tar.gz → 0.12.3a20251010__tar.gz