keras-nightly 3.12.0.dev2025100503__py3-none-any.whl → 3.14.0.dev2026011604__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- keras/__init__.py +1 -0
- keras/_tf_keras/keras/__init__.py +1 -0
- keras/_tf_keras/keras/callbacks/__init__.py +3 -0
- keras/_tf_keras/keras/distillation/__init__.py +16 -0
- keras/_tf_keras/keras/distribution/__init__.py +3 -0
- keras/_tf_keras/keras/dtype_policies/__init__.py +3 -0
- keras/_tf_keras/keras/layers/__init__.py +21 -0
- keras/_tf_keras/keras/ops/__init__.py +13 -0
- keras/_tf_keras/keras/ops/image/__init__.py +1 -0
- keras/_tf_keras/keras/ops/linalg/__init__.py +1 -0
- keras/_tf_keras/keras/ops/nn/__init__.py +3 -0
- keras/_tf_keras/keras/ops/numpy/__init__.py +9 -0
- keras/_tf_keras/keras/quantizers/__init__.py +13 -0
- keras/callbacks/__init__.py +3 -0
- keras/distillation/__init__.py +16 -0
- keras/distribution/__init__.py +3 -0
- keras/dtype_policies/__init__.py +3 -0
- keras/layers/__init__.py +21 -0
- keras/ops/__init__.py +13 -0
- keras/ops/image/__init__.py +1 -0
- keras/ops/linalg/__init__.py +1 -0
- keras/ops/nn/__init__.py +3 -0
- keras/ops/numpy/__init__.py +9 -0
- keras/quantizers/__init__.py +13 -0
- keras/src/applications/imagenet_utils.py +4 -1
- keras/src/backend/common/backend_utils.py +30 -6
- keras/src/backend/common/name_scope.py +2 -1
- keras/src/backend/common/variables.py +30 -15
- keras/src/backend/jax/core.py +92 -3
- keras/src/backend/jax/distribution_lib.py +16 -2
- keras/src/backend/jax/linalg.py +4 -0
- keras/src/backend/jax/nn.py +509 -29
- keras/src/backend/jax/numpy.py +59 -8
- keras/src/backend/jax/trainer.py +14 -2
- keras/src/backend/numpy/linalg.py +4 -0
- keras/src/backend/numpy/nn.py +311 -1
- keras/src/backend/numpy/numpy.py +65 -2
- keras/src/backend/openvino/__init__.py +1 -0
- keras/src/backend/openvino/core.py +2 -23
- keras/src/backend/openvino/linalg.py +4 -0
- keras/src/backend/openvino/nn.py +271 -20
- keras/src/backend/openvino/numpy.py +943 -189
- keras/src/backend/tensorflow/layer.py +43 -9
- keras/src/backend/tensorflow/linalg.py +24 -0
- keras/src/backend/tensorflow/nn.py +545 -1
- keras/src/backend/tensorflow/numpy.py +250 -50
- keras/src/backend/torch/core.py +3 -1
- keras/src/backend/torch/linalg.py +4 -0
- keras/src/backend/torch/nn.py +125 -0
- keras/src/backend/torch/numpy.py +80 -2
- keras/src/callbacks/__init__.py +1 -0
- keras/src/callbacks/model_checkpoint.py +5 -0
- keras/src/callbacks/orbax_checkpoint.py +332 -0
- keras/src/callbacks/terminate_on_nan.py +54 -5
- keras/src/datasets/cifar10.py +5 -0
- keras/src/distillation/__init__.py +1 -0
- keras/src/distillation/distillation_loss.py +390 -0
- keras/src/distillation/distiller.py +598 -0
- keras/src/distribution/distribution_lib.py +14 -0
- keras/src/dtype_policies/__init__.py +2 -0
- keras/src/dtype_policies/dtype_policy.py +90 -1
- keras/src/export/__init__.py +2 -0
- keras/src/export/export_utils.py +39 -2
- keras/src/export/litert.py +248 -0
- keras/src/export/openvino.py +1 -1
- keras/src/export/tf2onnx_lib.py +3 -0
- keras/src/layers/__init__.py +13 -0
- keras/src/layers/activations/softmax.py +9 -4
- keras/src/layers/attention/multi_head_attention.py +4 -1
- keras/src/layers/core/dense.py +241 -111
- keras/src/layers/core/einsum_dense.py +316 -131
- keras/src/layers/core/embedding.py +84 -94
- keras/src/layers/core/input_layer.py +1 -0
- keras/src/layers/core/reversible_embedding.py +399 -0
- keras/src/layers/input_spec.py +17 -17
- keras/src/layers/layer.py +45 -15
- keras/src/layers/merging/dot.py +4 -1
- keras/src/layers/pooling/adaptive_average_pooling1d.py +65 -0
- keras/src/layers/pooling/adaptive_average_pooling2d.py +62 -0
- keras/src/layers/pooling/adaptive_average_pooling3d.py +63 -0
- keras/src/layers/pooling/adaptive_max_pooling1d.py +65 -0
- keras/src/layers/pooling/adaptive_max_pooling2d.py +62 -0
- keras/src/layers/pooling/adaptive_max_pooling3d.py +63 -0
- keras/src/layers/pooling/base_adaptive_pooling.py +63 -0
- keras/src/layers/preprocessing/discretization.py +6 -5
- keras/src/layers/preprocessing/feature_space.py +8 -4
- keras/src/layers/preprocessing/image_preprocessing/aug_mix.py +2 -2
- keras/src/layers/preprocessing/image_preprocessing/random_contrast.py +3 -3
- keras/src/layers/preprocessing/image_preprocessing/resizing.py +10 -0
- keras/src/layers/preprocessing/index_lookup.py +19 -1
- keras/src/layers/preprocessing/normalization.py +14 -1
- keras/src/layers/regularization/dropout.py +43 -1
- keras/src/layers/rnn/rnn.py +19 -0
- keras/src/losses/loss.py +1 -1
- keras/src/losses/losses.py +24 -0
- keras/src/metrics/confusion_metrics.py +7 -6
- keras/src/models/cloning.py +4 -0
- keras/src/models/functional.py +11 -3
- keras/src/models/model.py +172 -34
- keras/src/ops/image.py +257 -20
- keras/src/ops/linalg.py +93 -0
- keras/src/ops/nn.py +258 -0
- keras/src/ops/numpy.py +569 -36
- keras/src/optimizers/muon.py +65 -31
- keras/src/optimizers/schedules/learning_rate_schedule.py +4 -3
- keras/src/quantizers/__init__.py +14 -1
- keras/src/quantizers/awq.py +361 -0
- keras/src/quantizers/awq_config.py +140 -0
- keras/src/quantizers/awq_core.py +217 -0
- keras/src/quantizers/gptq.py +2 -8
- keras/src/quantizers/gptq_config.py +36 -1
- keras/src/quantizers/gptq_core.py +65 -79
- keras/src/quantizers/quantization_config.py +246 -0
- keras/src/quantizers/quantizers.py +127 -61
- keras/src/quantizers/utils.py +23 -0
- keras/src/random/seed_generator.py +6 -4
- keras/src/saving/file_editor.py +81 -6
- keras/src/saving/orbax_util.py +26 -0
- keras/src/saving/saving_api.py +37 -14
- keras/src/saving/saving_lib.py +1 -1
- keras/src/testing/__init__.py +1 -0
- keras/src/testing/test_case.py +45 -5
- keras/src/utils/backend_utils.py +31 -4
- keras/src/utils/dataset_utils.py +234 -35
- keras/src/utils/file_utils.py +49 -11
- keras/src/utils/image_utils.py +14 -2
- keras/src/utils/jax_layer.py +244 -55
- keras/src/utils/module_utils.py +29 -0
- keras/src/utils/progbar.py +10 -2
- keras/src/utils/rng_utils.py +9 -1
- keras/src/utils/tracking.py +5 -5
- keras/src/version.py +1 -1
- {keras_nightly-3.12.0.dev2025100503.dist-info → keras_nightly-3.14.0.dev2026011604.dist-info}/METADATA +16 -6
- {keras_nightly-3.12.0.dev2025100503.dist-info → keras_nightly-3.14.0.dev2026011604.dist-info}/RECORD +136 -115
- {keras_nightly-3.12.0.dev2025100503.dist-info → keras_nightly-3.14.0.dev2026011604.dist-info}/WHEEL +0 -0
- {keras_nightly-3.12.0.dev2025100503.dist-info → keras_nightly-3.14.0.dev2026011604.dist-info}/top_level.txt +0 -0
```diff
--- a/keras/src/layers/core/einsum_dense.py
+++ b/keras/src/layers/core/einsum_dense.py
@@ -6,6 +6,7 @@ import ml_dtypes
 import numpy as np

 from keras.src import activations
+from keras.src import backend
 from keras.src import constraints
 from keras.src import dtype_policies
 from keras.src import initializers
@@ -15,7 +16,9 @@ from keras.src import regularizers
 from keras.src.api_export import keras_export
 from keras.src.layers.input_spec import InputSpec
 from keras.src.layers.layer import Layer
+from keras.src.quantizers.quantization_config import QuantizationConfig
 from keras.src.quantizers.quantizers import dequantize_with_sz_map
+from keras.src.saving import serialization_lib


 @keras_export("keras.layers.EinsumDense")
@@ -134,6 +137,7 @@ class EinsumDense(Layer):
         lora_rank=None,
         lora_alpha=None,
         gptq_unpacked_column_size=None,
+        quantization_config=None,
         **kwargs,
     ):
         super().__init__(**kwargs)
@@ -154,6 +158,7 @@ class EinsumDense(Layer):
         self.lora_alpha = lora_alpha if lora_alpha is not None else lora_rank
         self.lora_enabled = False
         self.gptq_unpacked_column_size = gptq_unpacked_column_size
+        self.quantization_config = quantization_config

     def build(self, input_shape):
         shape_data = _analyze_einsum_string(
@@ -169,12 +174,13 @@ class EinsumDense(Layer):
             self.quantized_build(
                 kernel_shape,
                 mode=self.quantization_mode,
+                config=self.quantization_config,
             )
         # Skip creating a duplicate kernel variable when the layer is already
         # quantized to int8 or int4, because `quantized_build` has created the
         # appropriate kernel variable. For other modes (e.g., float8 or no
         # quantization), we still need the floating-point kernel.
-        if self.quantization_mode not in ("int8", "int4", "gptq"):
+        if self.quantization_mode not in ("int8", "int4", "gptq", "awq"):
             # If the layer is quantized to int8, `self._kernel` will be added
             # in `self._int8_build`. Therefore, we skip it here.
             self._kernel = self.add_weight(
@@ -213,15 +219,17 @@ class EinsumDense(Layer):

         mode = self.quantization_mode
         is_gptq = mode == "gptq"
+        is_awq = mode == "awq"
         is_int4 = mode == "int4"
-
+        gptq_calibrated = bool(getattr(self, "is_gptq_calibrated", False))
+        awq_calibrated = bool(getattr(self, "is_awq_calibrated", False))
         gptq_bits = (
             gptq_core.get_weight_bits_for_layer(self, None) if is_gptq else None
         )

         # Decide the source tensor first (packed vs already-quantized vs plain
         # kernel)
-        if is_gptq and
+        if is_gptq and gptq_calibrated and gptq_bits != 4:
             # calibrated GPTQ, not 4-bit, no unpacking needed
             kernel = self.quantized_kernel
         else:
@@ -235,13 +243,21 @@ class EinsumDense(Layer):
                 self._orig_length_along_pack_axis,
                 self._int4_pack_axis,
             )
-        elif is_gptq and
+        elif is_gptq and gptq_calibrated and gptq_bits == 4:
             kernel = quantizers.unpack_int4(
                 self.quantized_kernel,
                 orig_len=self.gptq_unpacked_column_size,
                 axis=0,
                 dtype="uint8",
             )
+        elif is_awq and awq_calibrated:
+            # AWQ always uses 4-bit quantization
+            kernel = quantizers.unpack_int4(
+                self.quantized_kernel,
+                orig_len=self.awq_unpacked_column_size,
+                axis=0,
+                dtype="uint8",
+            )

         # Apply LoRA if enabled
         if self.lora_enabled:
@@ -326,25 +342,25 @@ class EinsumDense(Layer):
         if not self.built:
             return
         mode = self.quantization_mode
-        if mode not in self.quantization_variable_spec:
+        if mode not in self.variable_serialization_spec:
             raise self._quantization_mode_error(mode)

         # Kernel plus optional merged LoRA-aware scale (returns (kernel, None)
         # for None/gptq)
         kernel_value, merged_kernel_scale = self._get_kernel_with_merged_lora()
-
-
-
-
-
-
-
-            if name == "kernel_scale" and mode in ("int4", "int8"):
+        idx = 0
+        for name in self.variable_serialization_spec[mode]:
+            if name == "kernel":
+                store[str(idx)] = kernel_value
+            elif name == "bias" and self.bias is None:
+                continue
+            elif name == "kernel_scale" and mode in ("int4", "int8"):
                 # For int4/int8, the merged LoRA scale (if any) comes from
                 # `_get_kernel_with_merged_lora()`
-                store[
+                store[str(idx)] = merged_kernel_scale
             else:
-                store[
+                store[str(idx)] = getattr(self, name)
+            idx += 1

     def load_own_variables(self, store):
         if not self.lora_enabled:
@@ -353,39 +369,22 @@ class EinsumDense(Layer):
         if not self.built:
             return
         mode = self.quantization_mode
-        if mode not in self.quantization_variable_spec:
+        if mode not in self.variable_serialization_spec:
             raise self._quantization_mode_error(mode)

-        #
-
-
-
-        # Load the variables using the name as the key.
-        if mode != "gptq":
-            self._kernel.assign(store["kernel"])
-        if self.bias is not None:
-            self.bias.assign(store["bias"])
-        for name in self.quantization_variable_spec[mode]:
-            getattr(self, name).assign(store[name])
-        if self.lora_enabled:
-            self.lora_kernel_a.assign(ops.zeros(self.lora_kernel_a.shape))
-            self.lora_kernel_b.assign(ops.zeros(self.lora_kernel_b.shape))
+        # A saved GPTQ/AWQ quantized model will always be calibrated.
+        self.is_gptq_calibrated = mode == "gptq"
+        self.is_awq_calibrated = mode == "awq"

-
-
-
-
-
-
-
-
-
-            targets.extend(
-                getattr(self, name)
-                for name in self.quantization_variable_spec[mode]
-            )
-        for i, variable in enumerate(targets):
-            variable.assign(store[str(i)])
+        idx = 0
+        for name in self.variable_serialization_spec[mode]:
+            if name == "kernel":
+                self._kernel.assign(store[str(idx)])
+            elif name == "bias" and self.bias is None:
+                continue
+            else:
+                getattr(self, name).assign(store[str(idx)])
+            idx += 1
         if self.lora_enabled:
             self.lora_kernel_a.assign(ops.zeros(self.lora_kernel_a.shape))
             self.lora_kernel_b.assign(ops.zeros(self.lora_kernel_b.shape))
@@ -410,6 +409,9 @@ class EinsumDense(Layer):
             ),
             "kernel_constraint": constraints.serialize(self.kernel_constraint),
             "bias_constraint": constraints.serialize(self.bias_constraint),
+            "quantization_config": serialization_lib.serialize_keras_object(
+                self.quantization_config
+            ),
         }
         if self.lora_rank:
             config["lora_rank"] = self.lora_rank
@@ -418,53 +420,42 @@ class EinsumDense(Layer):
         config["gptq_unpacked_column_size"] = self.gptq_unpacked_column_size
         return {**base_config, **config}

-
-
-
-
-
-
-                "and thus it doesn't have any variables. "
-                f"However the weights file lists {len(store.keys())} "
-                "variables for this layer.\n"
-                "In most cases, this error indicates that either:\n\n"
-                "1. The layer is owned by a parent layer that "
-                "implements a `build()` method, but calling the "
-                "parent's `build()` method did NOT create the state of "
-                f"the child layer '{self.name}'. A `build()` method "
-                "must create ALL state for the layer, including "
-                "the state of any children layers.\n\n"
-                "2. You need to implement "
-                "the `def build_from_config(self, config)` method "
-                f"on layer '{self.name}', to specify how to rebuild "
-                "it during loading. "
-                "In this case, you might also want to implement the "
-                "method that generates the build config at saving time, "
-                "`def get_build_config(self)`. "
-                "The method `build_from_config()` is meant "
-                "to create the state "
-                "of the layer (i.e. its variables) upon deserialization.",
-            )
-            raise ValueError(
-                f"Layer '{self.name}' expected {len(all_vars)} variables, "
-                "but received "
-                f"{len(store.keys())} variables during loading. "
-                f"Expected: {[v.name for v in all_vars]}"
+    @classmethod
+    def from_config(cls, config):
+        config = config.copy()
+        config["quantization_config"] = (
+            serialization_lib.deserialize_keras_object(
+                config.get("quantization_config", None)
             )
+        )
+        return super().from_config(config)

     @property
-    def
-        """Returns a dict mapping quantization modes to variable names.
+    def variable_serialization_spec(self):
+        """Returns a dict mapping quantization modes to variable names in order.

         This spec is used by `save_own_variables` and `load_own_variables` to
-        determine
-        mode.
+        determine the correct ordering of variables during serialization for
+        each quantization mode. `None` means no quantization.
         """
         return {
-            None: [
-
-
+            None: [
+                "kernel",
+                "bias",
+            ],
+            "int8": [
+                "kernel",
+                "bias",
+                "kernel_scale",
+            ],
+            "int4": [
+                "kernel",
+                "bias",
+                "kernel_scale",
+            ],
             "float8": [
+                "kernel",
+                "bias",
                 "inputs_scale",
                 "inputs_amax_history",
                 "kernel_scale",
@@ -473,31 +464,48 @@ class EinsumDense(Layer):
                 "outputs_grad_amax_history",
             ],
             "gptq": [
+                "bias",
+                "quantized_kernel",
+                "kernel_scale",
+                "kernel_zero",
+                "g_idx",
+            ],
+            "awq": [
+                "bias",
                 "quantized_kernel",
                 "kernel_scale",
                 "kernel_zero",
+                "awq_scales",
                 "g_idx",
             ],
         }

     def quantized_build(self, kernel_shape, mode, config=None):
         if mode == "int8":
-            self._int8_build(kernel_shape)
+            self._int8_build(kernel_shape, config)
         elif mode == "int4":
-            self._int4_build(kernel_shape)
+            self._int4_build(kernel_shape, config)
         elif mode == "float8":
             self._float8_build()
         elif mode == "gptq":
             self._gptq_build(kernel_shape, config)
+        elif mode == "awq":
+            self._awq_build(kernel_shape, config)
         else:
             raise self._quantization_mode_error(mode)
         self._is_quantized = True

-    def _int8_build(self, kernel_shape):
+    def _int8_build(self, kernel_shape, config=None):
         self._set_quantization_info()
-        self.inputs_quantizer =
-
+        self.inputs_quantizer = (
+            QuantizationConfig.activation_quantizer_or_default(
+                config,
+                quantizers.AbsMaxQuantizer(),
+            )
         )
+        # If the config provided a default AbsMaxQuantizer, we need to
+        # override the axis to match the equation's reduction axes.
+        self.quantization_axis = tuple(self._input_reduced_axes)
         self._kernel = self.add_weight(
             name="kernel",
             shape=kernel_shape,
```
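The `variable_serialization_spec` property introduced above drives both `save_own_variables` and `load_own_variables`: variables are stored under stringified positional keys (`"0"`, `"1"`, ...) in the order the spec lists for the active quantization mode, and a missing bias is skipped without consuming an index. A minimal stand-in sketch of that ordering rule (plain Python; the spec/variable/store objects here are placeholders, not the Keras classes):

```python
def save_in_spec_order(spec, variables):
    """Map each present variable to a stringified index key, in spec order."""
    store = {}
    idx = 0
    for name in spec:
        value = variables.get(name)
        if name == "bias" and value is None:
            # Optional bias: skipped entirely, no index is consumed.
            continue
        store[str(idx)] = value
        idx += 1
    return store


# Example: an int8-quantized layer without a bias.
spec_int8 = ["kernel", "bias", "kernel_scale"]
store = save_in_spec_order(
    spec_int8, {"kernel": "W_q", "bias": None, "kernel_scale": "s"}
)
assert store == {"0": "W_q", "1": "s"}
```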
```diff
@@ -535,12 +543,7 @@ class EinsumDense(Layer):
             columns = kernel_shape[1]
         elif len(kernel_shape) == 3:
             shape = list(self.original_kernel_shape)
-
-                d_model_dim_index = shape.index(max(shape))
-            except ValueError:
-                raise TypeError(
-                    f"Could not determine hidden dimension from shape {shape}"
-                )
+            d_model_dim_index = shape.index(max(shape))

             if d_model_dim_index == 0:  # QKV projection case
                 in_features, heads, head_dim = shape
@@ -566,8 +569,7 @@ class EinsumDense(Layer):
         # For 4-bit weights, we pack two values per byte.
         kernel_columns = (columns + 1) // 2 if weight_bits == 4 else columns

-
-        self._set_quantization_info()
+        self._set_quantization_info()

         self.quantized_kernel = self.add_weight(
             name="kernel",
@@ -635,7 +637,128 @@ class EinsumDense(Layer):
         y = self.activation(y)
         return y

-    def
+    def _awq_build(self, kernel_shape, config):
+        """Build variables for AWQ quantization.
+
+        AWQ uses 4-bit quantization with per-channel AWQ scales that protect
+        salient weights based on activation magnitudes.
+        """
+        from keras.src.quantizers import awq_core
+
+        # Ensures the forward pass uses the original high-precision kernel
+        # until calibration has been performed.
+        self.is_awq_calibrated = False
+
+        self.original_kernel_shape = kernel_shape
+        if len(kernel_shape) == 2:
+            rows = kernel_shape[0]
+            columns = kernel_shape[1]
+        elif len(kernel_shape) == 3:
+            shape = list(self.original_kernel_shape)
+            d_model_dim_index = shape.index(max(shape))
+
+            if d_model_dim_index == 0:  # QKV projection case
+                in_features, heads, head_dim = shape
+                rows, columns = (
+                    in_features,
+                    heads * head_dim,
+                )
+            elif d_model_dim_index in [1, 2]:  # Attention Output case
+                heads, head_dim, out_features = shape
+                rows, columns = (
+                    heads * head_dim,
+                    out_features,
+                )
+            else:
+                raise ValueError("Could not determine row/column split.")
+        else:
+            raise ValueError("AWQ quantization only supports 2D or 3D kernels.")
+
+        group_size = awq_core.get_group_size_for_layer(self, config)
+        num_groups = 1 if group_size == -1 else math.ceil(rows / group_size)
+
+        self.awq_unpacked_column_size = columns
+
+        # For 4-bit weights, we pack two values per byte.
+        kernel_columns = (columns + 1) // 2
+
+        self._set_quantization_info()
+
+        self.quantized_kernel = self.add_weight(
+            name="kernel",
+            shape=(kernel_columns, rows),
+            initializer="zeros",
+            dtype="uint8",
+            trainable=False,
+        )
+
+        self.kernel_scale = self.add_weight(
+            name="kernel_scale",
+            shape=(columns, num_groups),
+            initializer="ones",
+            trainable=False,
+        )
+        self.kernel_zero = self.add_weight(
+            name="zero_point",
+            shape=(columns, num_groups),
+            initializer="zeros",
+            dtype="uint8",
+            trainable=False,
+        )
+
+        # Per-channel AWQ scales from activation magnitudes
+        self.awq_scales = self.add_weight(
+            name="awq_scales",
+            shape=(rows,),
+            initializer="ones",
+            trainable=False,
+        )
+
+        self.g_idx = self.add_weight(
+            name="g_idx",
+            shape=(rows,),
+            initializer="zeros",
+            dtype="float32",
+            trainable=False,
+        )
+
+    def _awq_call(self, inputs, training=False):
+        """Forward pass for AWQ quantized layer."""
+        if not self.is_awq_calibrated:
+            W = self._kernel
+        else:
+            # Unpack 4-bit weights
+            W = quantizers.unpack_int4(
+                self.quantized_kernel,
+                orig_len=self.awq_unpacked_column_size,
+                axis=0,
+                dtype="uint8",
+            )
+            # Dequantize using scale/zero maps
+            W = dequantize_with_sz_map(
+                W,
+                self.kernel_scale,
+                self.kernel_zero,
+                self.g_idx,
+            )
+            W = ops.transpose(W)
+
+            # Apply AWQ scales by dividing to restore original magnitude
+            # (We multiplied by scales before quantization, so divide to undo)
+            # awq_scales has shape [input_dim], W has shape [input_dim, out_dim]
+            # Expand dims for proper broadcasting.
+            W = ops.divide(W, ops.expand_dims(self.awq_scales, -1))
+
+            W = ops.reshape(W, self.original_kernel_shape)
+
+        y = ops.einsum(self.equation, inputs, W)
+        if self.bias is not None:
+            y = ops.add(y, self.bias)
+        if self.activation is not None:
+            y = self.activation(y)
+        return y
+
+    def _int4_build(self, kernel_shape, config=None):
         """Build variables for int4 quantization.

         The packed int4 kernel stores two int4 values within a single int8
```
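In `_awq_call` above, the layer unpacks the 4-bit kernel, dequantizes it with the per-group scale/zero map, and then divides by the per-input-channel `awq_scales`, undoing the scaling applied before quantization to protect salient weights. A rough NumPy stand-in for that round trip (per-tensor uniform 4-bit quantization with made-up shapes, unlike the grouped scheme in the diff):

```python
import numpy as np

rng = np.random.default_rng(0)
W = rng.normal(size=(6, 4)).astype("float32")  # [input_dim, output_dim]
awq_scales = np.array([1.0, 2.0, 0.5, 1.5, 1.0, 4.0], dtype="float32")

# Scale salient input channels up before quantizing, as AWQ does.
W_scaled = W * awq_scales[:, None]

# Coarse 4-bit uniform quantization, per tensor, just for illustration.
lo, hi = W_scaled.min(), W_scaled.max()
step = (hi - lo) / 15.0
W_q = np.clip(np.round((W_scaled - lo) / step), 0, 15).astype(np.uint8)

# Dequantize, then divide by the AWQ scales to restore the original
# magnitudes, mirroring ops.divide(W, ops.expand_dims(self.awq_scales, -1)).
W_dq = W_q.astype("float32") * step + lo
W_restored = W_dq / awq_scales[:, None]

print(float(np.abs(W_restored - W).max()))  # small quantization error
```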
```diff
@@ -647,9 +770,15 @@ class EinsumDense(Layer):
         self._set_quantization_info()

         # Quantizer for the inputs (per the reduced axes)
-        self.inputs_quantizer =
-
+        self.inputs_quantizer = (
+            QuantizationConfig.activation_quantizer_or_default(
+                config,
+                quantizers.AbsMaxQuantizer(),
+            )
         )
+        # If the config provided a default AbsMaxQuantizer, we need to
+        # override the axis to match the equation's reduction axes.
+        self.quantization_axis = tuple(self._input_reduced_axes)

         # Choose the axis to perform int4 packing - use the first reduced axis
         # for the kernel (analogous to the input dimension of a Dense layer).
@@ -771,13 +900,36 @@ class EinsumDense(Layer):
                )
                return (inputs_grad, None, None)

-
-
-
-
-
-
-
+            if self.inputs_quantizer:
+                inputs, inputs_scale = self.inputs_quantizer(
+                    inputs, axis=self.quantization_axis
+                )
+                # Align `inputs_scale` axes with the output
+                # for correct broadcasting
+                inputs_scale = self._adjust_scale_for_quant(
+                    inputs_scale, "input"
+                )
+                x = ops.einsum(self.equation, inputs, kernel)
+                # De-scale outputs
+                x = ops.cast(x, self.compute_dtype)
+                x = ops.divide(x, ops.multiply(inputs_scale, kernel_scale))
+            else:
+                # Weight-only quantization: dequantize kernel and use float
+                # einsum. This is a workaround for PyTorch's einsum which
+                # doesn't support mixed-precision inputs (float input,
+                # int8 kernel).
+                if backend.backend() == "torch":
+                    kernel_scale = self._adjust_scale_for_dequant(kernel_scale)
+                    float_kernel = ops.divide(
+                        ops.cast(kernel, dtype=self.compute_dtype),
+                        kernel_scale,
+                    )
+                    x = ops.einsum(self.equation, inputs, float_kernel)
+                else:
+                    x = ops.einsum(self.equation, inputs, kernel)
+                # De-scale outputs
+                x = ops.cast(x, self.compute_dtype)
+                x = ops.divide(x, kernel_scale)
            return x, grad_fn

        x = einsum_with_inputs_gradient(
@@ -847,17 +999,38 @@ class EinsumDense(Layer):
                return (inputs_grad, None, None)

            # Quantize inputs per `self.inputs_quantizer`.
-
-
-
-
-
-
-
-
-
-
-
+            if self.inputs_quantizer:
+                inputs_q, inputs_scale = self.inputs_quantizer(
+                    inputs, axis=self.quantization_axis
+                )
+                # Align `inputs_scale` axes with the output
+                # for correct broadcasting
+                inputs_scale = self._adjust_scale_for_quant(
+                    inputs_scale, "input"
+                )
+                x = ops.einsum(self.equation, inputs_q, unpacked_kernel)
+                # De-scale outputs
+                x = ops.cast(x, self.compute_dtype)
+                x = ops.divide(x, ops.multiply(inputs_scale, kernel_scale))
+            else:
+                # Weight-only quantization: dequantize kernel and use float
+                # einsum. This is a workaround for PyTorch's einsum which
+                # doesn't support mixed-precision inputs (float input,
+                # int4 kernel).
+                if backend.backend() == "torch":
+                    # Align `kernel_scale` to the same layout as
+                    # `unpacked_kernel`.
+                    kernel_scale = self._adjust_scale_for_dequant(kernel_scale)
+                    float_kernel = ops.divide(
+                        ops.cast(unpacked_kernel, dtype=self.compute_dtype),
+                        kernel_scale,
+                    )
+                    x = ops.einsum(self.equation, inputs, float_kernel)
+                else:
+                    x = ops.einsum(self.equation, inputs, unpacked_kernel)
+                # De-scale outputs
+                x = ops.cast(x, self.compute_dtype)
+                x = ops.divide(x, kernel_scale)
            return x, grad_fn

        x = einsum_with_inputs_gradient(
```
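Both quantized call paths above now branch on whether an input quantizer is configured: with one, the inputs are quantized as well and the einsum output is divided by the product of the input and kernel scales; without one (weight-only quantization), the kernel is dequantized to float first, which per the comments in the diff is also the workaround used on the torch backend because its einsum does not accept mixed float/int operands. A NumPy stand-in for the two branches, with made-up shapes and a single per-tensor scale:

```python
import numpy as np

equation = "ab,bc->ac"  # stand-in einsum equation
inputs = np.random.rand(2, 3).astype("float32")
kernel_scale = np.float32(127.0)
kernel_q = np.random.randint(-127, 128, size=(3, 4), dtype=np.int8)

# Weight-only branch: dequantize the kernel, then a plain float einsum.
x_weight_only = np.einsum(
    equation, inputs, kernel_q.astype("float32") / kernel_scale
)

# Activation-quantized branch: quantize the inputs too, einsum on the
# quantized values, then de-scale by both scales at once.
inputs_scale = np.float32(127.0) / np.abs(inputs).max()
inputs_q = np.round(inputs * inputs_scale)
x_full = np.einsum(equation, inputs_q, kernel_q.astype("float32"))
x_full = x_full / (inputs_scale * kernel_scale)

print(float(np.abs(x_full - x_weight_only).max()))  # equal up to rounding
```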
```diff
@@ -971,30 +1144,40 @@ class EinsumDense(Layer):
             x = self.activation(x)
         return x

-    def quantize(self, mode, type_check=True, config=None):
+    def quantize(self, mode=None, type_check=True, config=None):
         # Prevent quantization of the subclasses
         if type_check and (type(self) is not EinsumDense):
             raise self._not_implemented_error(self.quantize)

+        self.quantization_config = config
+
         kernel_shape = self._kernel.shape
-        if mode in ("int8", "int4", "gptq"):
+        if mode in ("int8", "int4", "gptq", "awq"):
             self._set_quantization_info()

         if mode == "int8":
             # Quantize `self._kernel` to int8 and compute corresponding scale
-
-            self.
+            weight_quantizer = QuantizationConfig.weight_quantizer_or_default(
+                self.quantization_config,
+                quantizers.AbsMaxQuantizer(axis=self._kernel_reduced_axes),
+            )
+            kernel_value, kernel_scale = weight_quantizer(
+                self._kernel, to_numpy=True
             )
             kernel_scale = self._adjust_scale_for_quant(kernel_scale, "kernel")
             del self._kernel
         elif mode == "int4":
             # Quantize to int4 values (stored in int8 dtype, range [-8, 7])
-
-            self.
-
-
-
-
+            weight_quantizer = QuantizationConfig.weight_quantizer_or_default(
+                self.quantization_config,
+                quantizers.AbsMaxQuantizer(
+                    axis=self._kernel_reduced_axes,
+                    value_range=(-8, 7),
+                    output_dtype="int8",
+                ),
+            )
+            kernel_value_int4, kernel_scale = weight_quantizer(
+                self._kernel, to_numpy=True
             )
             kernel_scale = self._adjust_scale_for_quant(kernel_scale, "kernel")

```
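`quantize()` above now records the optional `config` on the layer and asks `QuantizationConfig` for a weight quantizer (and, in the build methods, an activation quantizer), falling back to `AbsMaxQuantizer` defaults when no config is given. A short usage sketch of the default int8 path on a standalone layer; only the long-standing `quantize("int8")` entry point is exercised here, and the new config argument is left at its default:

```python
import numpy as np
import keras

# Build a small EinsumDense layer and quantize it post-training with the
# default AbsMax quantizers (config=None falls back to these per the diff).
layer = keras.layers.EinsumDense("ab,bc->ac", output_shape=4, bias_axes="c")
layer.build((None, 8))
layer.quantize("int8")

y = layer(np.random.rand(2, 8).astype("float32"))
print(y.shape)  # (2, 4)
```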
```diff
@@ -1005,7 +1188,7 @@ class EinsumDense(Layer):
             )
             kernel_value = packed_kernel_value
             del self._kernel
-        self.quantized_build(kernel_shape, mode,
+        self.quantized_build(kernel_shape, mode, self.quantization_config)

         # Assign values to the newly created variables.
         if mode in ("int8", "int4"):
@@ -1016,7 +1199,9 @@ class EinsumDense(Layer):
         if self.dtype_policy.quantization_mode is None:
             policy_name = mode
             if mode == "gptq":
-                policy_name =
+                policy_name = self.quantization_config.dtype_policy_string()
+            elif mode == "awq":
+                policy_name = self.quantization_config.dtype_policy_string()
             policy = dtype_policies.get(
                 f"{policy_name}_from_{self.dtype_policy.name}"
             )
@@ -1080,7 +1265,7 @@ class EinsumDense(Layer):
         This is `None` if the layer is not quantized.
         """
         # If not a quantized layer, return the full-precision kernel directly.
-        if self.dtype_policy.quantization_mode in (None, "gptq"):
+        if self.dtype_policy.quantization_mode in (None, "gptq", "awq"):
             return self.kernel, None

         # If quantized but LoRA is not enabled, return the original quantized
```
|